
Cost Optimization Strategies for Azure OpenAI Deployments

Azure OpenAI costs can grow quickly without deliberate management. Three levers make the biggest difference: trimming token usage, caching repeated requests, and routing each task to the cheapest model that meets its quality bar.

Token Usage Optimization

Implement intelligent prompt management to reduce token consumption:

from dataclasses import dataclass
from typing import Optional
import tiktoken

@dataclass
class TokenBudget:
    max_prompt_tokens: int
    max_completion_tokens: int
    warning_threshold: float = 0.8

class TokenOptimizer:
    def __init__(self, model: str = "gpt-4"):
        self.encoding = tiktoken.encoding_for_model(model)

    def count_tokens(self, text: str) -> int:
        """Count tokens in text."""
        return len(self.encoding.encode(text))

    def truncate_to_budget(self, text: str, max_tokens: int) -> str:
        """Truncate text to fit within token budget."""
        tokens = self.encoding.encode(text)
        if len(tokens) <= max_tokens:
            return text

        truncated_tokens = tokens[:max_tokens]
        return self.encoding.decode(truncated_tokens)

    def optimize_prompt(
        self,
        system_prompt: str,
        user_content: str,
        context: Optional[str],
        budget: TokenBudget
    ) -> dict:
        """Optimize prompt components to fit budget."""

        system_tokens = self.count_tokens(system_prompt)
        user_tokens = self.count_tokens(user_content)

        # Budget left for optional context after the system and user messages
        remaining_budget = budget.max_prompt_tokens - system_tokens - user_tokens

        optimized_context = ""
        if context and remaining_budget > 100:
            # Keep a small margin below the budget for message formatting overhead
            optimized_context = self.truncate_to_budget(context, remaining_budget - 50)

        total_tokens = system_tokens + user_tokens + self.count_tokens(optimized_context)

        return {
            "system_prompt": system_prompt,
            "user_content": user_content,
            "context": optimized_context,
            "total_prompt_tokens": total_tokens,
            "budget_utilization": total_tokens / budget.max_prompt_tokens
        }
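
A rough usage sketch follows; the budget figures and prompt strings are purely illustrative:

optimizer = TokenOptimizer(model="gpt-4")
budget = TokenBudget(max_prompt_tokens=6000, max_completion_tokens=1000)

# Stand-in for a long retrieved document
long_document_text = "Quarterly revenue grew 4% against a flat forecast. " * 500

result = optimizer.optimize_prompt(
    system_prompt="You are a concise assistant that summarizes reports.",
    user_content="Summarize the report in five bullet points.",
    context=long_document_text,
    budget=budget,
)

if result["budget_utilization"] > budget.warning_threshold:
    print(f"Prompt uses {result['budget_utilization']:.0%} of its token budget")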

Implementing Caching

Cache repeated requests to avoid redundant API calls:

import hashlib
import json
from datetime import timedelta
from typing import Optional

class ResponseCache:
    def __init__(self, redis_client, default_ttl_hours: int = 24):
        self.redis = redis_client
        self.default_ttl = timedelta(hours=default_ttl_hours)

    def _generate_key(self, prompt: str, model: str, temperature: float) -> str:
        """Generate cache key from request parameters."""
        content = json.dumps({
            "prompt": prompt,
            "model": model,
            "temperature": temperature
        }, sort_keys=True)
        return f"ai_cache:{hashlib.sha256(content.encode()).hexdigest()}"

    def get_cached_response(self, prompt: str, model: str, temperature: float) -> Optional[dict]:
        """Retrieve cached response if available."""
        key = self._generate_key(prompt, model, temperature)
        cached = self.redis.get(key)

        if cached:
            return json.loads(cached)
        return None

    def cache_response(
        self,
        prompt: str,
        model: str,
        temperature: float,
        response: dict,
        ttl: Optional[timedelta] = None
    ):
        """Cache response for future requests."""
        key = self._generate_key(prompt, model, temperature)
        ttl = ttl or self.default_ttl

        self.redis.setex(key, ttl, json.dumps(response))
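
Wiring the cache around a completion call might look like the sketch below; the redis.Redis connection details and the call_model callable are assumptions, and any client works as long as its response is JSON-serializable:

import redis

cache = ResponseCache(redis.Redis(host="localhost", port=6379), default_ttl_hours=24)

def cached_completion(prompt: str, model: str, temperature: float, call_model) -> dict:
    """Return a cached response if one exists, otherwise call the model and store the result."""
    cached = cache.get_cached_response(prompt, model, temperature)
    if cached:
        return cached

    response = call_model(prompt, model, temperature)  # your Azure OpenAI call goes here
    cache.cache_response(prompt, model, temperature, response)
    return response

Caching pays off most for deterministic requests (temperature of 0), where repeated prompts should return identical answers anyway.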

Model Selection Strategy

Choose the right model for each task:

class ModelSelector:
    # Approximate pay-as-you-go prices in USD per 1,000 tokens
    MODEL_COSTS = {
        "gpt-4": {"input": 0.03, "output": 0.06},
        "gpt-4-turbo": {"input": 0.01, "output": 0.03},
        "gpt-3.5-turbo": {"input": 0.0005, "output": 0.0015}
    }

    def select_model(self, task_complexity: str, quality_requirement: str) -> str:
        """Select optimal model based on requirements."""
        if quality_requirement == "high" and task_complexity in ["complex", "reasoning"]:
            return "gpt-4"
        elif task_complexity in ["simple", "classification"]:
            return "gpt-3.5-turbo"
        return "gpt-4-turbo"

Balance cost and quality by routing requests to appropriate models based on task requirements.
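
As a back-of-the-envelope check, the prices above are per 1,000 tokens, so a request costs roughly input_tokens / 1000 times the input price plus output_tokens / 1000 times the output price. The estimate_cost helper below is a small illustrative addition built on the MODEL_COSTS table:

def estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float:
    """Rough request cost in USD using the per-1K-token prices above."""
    rates = ModelSelector.MODEL_COSTS[model]
    return (input_tokens / 1000) * rates["input"] + (output_tokens / 1000) * rates["output"]

# Example: a 2,000-token prompt with a 500-token answer
print(estimate_cost("gpt-4", 2000, 500))          # ~0.09 USD
print(estimate_cost("gpt-3.5-turbo", 2000, 500))  # ~0.00175 USD

Routing simple, high-volume requests to a cheaper model is often where most of the savings come from.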

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.