Skip to content
Back to Blog
1 min read

Cost Optimization Strategies for Azure OpenAI Deployments

This post shares practical, production-minded guidance for reducing Azure OpenAI spend, covering three strategies: token usage optimization, response caching, and cost-aware model selection.

Token Usage Optimization

Implement intelligent prompt management to reduce token consumption:

from dataclasses import dataclass
from typing import Optional
import tiktoken

@dataclass
class TokenBudget:
    """Token limits that describe the budget for a single request."""

    # Upper bound on prompt tokens; used as the budget when fitting a prompt
    # and as the denominator of the reported budget utilization.
    max_prompt_tokens: int
    # Upper bound on completion tokens. Not referenced by the code shown in
    # this section.
    max_completion_tokens: int
    # Defaults to 0.8. Not referenced by the code shown in this section;
    # presumably the utilization fraction at which callers should warn --
    # TODO confirm against callers.
    warning_threshold: float = 0.8

class TokenOptimizer:
    """Count, truncate and budget prompt text using a tiktoken encoding.

    The encoding is looked up once per instance from the model name and
    reused by every method.
    """

    # Optional context is only included when more than this many tokens
    # remain after the system prompt and user content are accounted for.
    MIN_CONTEXT_BUDGET = 100
    # Tokens held back from the remaining budget when truncating context.
    CONTEXT_SAFETY_MARGIN = 50

    def __init__(self, model: str = "gpt-4"):
        """Look up the tiktoken encoding for ``model``.

        Any error raised by ``tiktoken.encoding_for_model`` for a model name
        it does not recognise is propagated unchanged.
        """
        self.encoding = tiktoken.encoding_for_model(model)

    def count_tokens(self, text: str) -> int:
        """Return the number of tokens ``text`` encodes to."""
        return len(self.encoding.encode(text))

    def truncate_to_budget(self, text: str, max_tokens: int) -> str:
        """Return ``text`` cut down to at most ``max_tokens`` tokens.

        Args:
            text: The text to fit.
            max_tokens: Maximum number of tokens to keep. Zero or a negative
                value yields an empty string.

        Returns:
            ``text`` unchanged when it already fits, otherwise the decoded
            first ``max_tokens`` tokens.
        """
        # A negative limit would otherwise slice from the end of the token
        # list (tokens[:-k]) and keep most of the text instead of none.
        if max_tokens <= 0:
            return ""

        tokens = self.encoding.encode(text)
        if len(tokens) <= max_tokens:
            return text

        return self.encoding.decode(tokens[:max_tokens])

    def optimize_prompt(
        self,
        system_prompt: str,
        user_content: str,
        context: Optional[str],
        budget: "TokenBudget"
    ) -> dict:
        """Fit the optional context into what is left of the prompt budget.

        The system prompt and user content are never shortened. Context is
        included only when more than ``MIN_CONTEXT_BUDGET`` tokens remain,
        and is truncated to the remainder minus ``CONTEXT_SAFETY_MARGIN``.

        Args:
            system_prompt: System message text, kept as-is.
            user_content: User message text, kept as-is.
            context: Optional extra context; ``None`` or empty means none.
            budget: Object whose ``max_prompt_tokens`` is the prompt budget.

        Returns:
            A dict with keys ``system_prompt``, ``user_content``, ``context``
            (possibly truncated or empty), ``total_prompt_tokens`` and
            ``budget_utilization`` (total / budget; may exceed 1.0 when the
            system prompt and user content alone are over budget).
        """
        system_tokens = self.count_tokens(system_prompt)
        user_tokens = self.count_tokens(user_content)

        remaining_budget = budget.max_prompt_tokens - system_tokens - user_tokens

        optimized_context = ""
        if context and remaining_budget > self.MIN_CONTEXT_BUDGET:
            optimized_context = self.truncate_to_budget(
                context, remaining_budget - self.CONTEXT_SAFETY_MARGIN
            )

        # Re-count the context after truncation rather than assuming the
        # decoded text encodes back to exactly the requested length.
        total_tokens = system_tokens + user_tokens + self.count_tokens(optimized_context)

        # Guard the division: a zero (or negative) budget is reported as
        # infinitely over-used when any tokens are present, else as unused.
        if budget.max_prompt_tokens > 0:
            utilization = total_tokens / budget.max_prompt_tokens
        else:
            utilization = float("inf") if total_tokens else 0.0

        return {
            "system_prompt": system_prompt,
            "user_content": user_content,
            "context": optimized_context,
            "total_prompt_tokens": total_tokens,
            "budget_utilization": utilization
        }

Implementing Caching

Cache repeated requests to avoid redundant API calls:

import hashlib
import json
from datetime import datetime, timedelta

class ResponseCache:
    """Cache model responses in Redis, keyed by the request parameters.

    ``redis_client`` must provide ``get(key)`` and ``setex(key, ttl, value)``.
    Responses are stored as JSON text.
    """

    def __init__(self, redis_client, default_ttl_hours: int = 24):
        """Store the client and the TTL used when a call supplies none."""
        self.redis = redis_client
        self.default_ttl = timedelta(hours=default_ttl_hours)

    def _generate_key(self, prompt: str, model: str, temperature: float) -> str:
        """Build the cache key for a request.

        The key is ``ai_cache:`` followed by the SHA-256 hex digest of the
        JSON-serialised prompt, model and temperature.
        """
        # JSON renders 0 as "0" and 0.0 as "0.0", so an integer temperature
        # would otherwise hash to a different key than the equal float and
        # miss an entry that is logically the same request.
        if isinstance(temperature, int):
            temperature = float(temperature)

        content = json.dumps({
            "prompt": prompt,
            "model": model,
            "temperature": temperature
        }, sort_keys=True)
        return f"ai_cache:{hashlib.sha256(content.encode()).hexdigest()}"

    def get_cached_response(self, prompt: str, model: str, temperature: float) -> Optional[dict]:
        """Return the cached response for the request, or ``None`` on a miss.

        A stored value that cannot be decoded as JSON is treated as a miss,
        so a corrupt entry degrades to a fresh API call instead of an error.
        """
        key = self._generate_key(prompt, model, temperature)
        cached = self.redis.get(key)

        if not cached:
            return None

        try:
            return json.loads(cached)
        except (json.JSONDecodeError, UnicodeDecodeError):
            return None

    def cache_response(
        self,
        prompt: str,
        model: str,
        temperature: float,
        response: dict,
        ttl: Optional[timedelta] = None
    ):
        """Store ``response`` under the request's key with an expiry.

        Args:
            prompt: Prompt text of the request.
            model: Model name of the request.
            temperature: Sampling temperature of the request.
            response: JSON-serialisable response to store.
            ttl: Expiry for this entry; ``None`` (or a zero-length
                timedelta) falls back to the instance default.
        """
        key = self._generate_key(prompt, model, temperature)
        ttl = ttl or self.default_ttl

        self.redis.setex(key, ttl, json.dumps(response))

Model Selection Strategy

Choose the right model for each task:

class ModelSelector:
    """Route a request to a model name based on complexity and quality."""

    # Per-model "input" and "output" cost figures. Units are not stated
    # here; presumably USD per 1K tokens -- TODO confirm.
    MODEL_COSTS = {
        "gpt-4": {"input": 0.03, "output": 0.06},
        "gpt-4-turbo": {"input": 0.01, "output": 0.03},
        "gpt-3.5-turbo": {"input": 0.0005, "output": 0.0015},
    }

    def select_model(self, task_complexity: str, quality_requirement: str) -> str:
        """Return the model name to use for the given task profile.

        High-quality complex or reasoning tasks go to "gpt-4"; simple or
        classification tasks go to "gpt-3.5-turbo"; everything else falls
        through to "gpt-4-turbo".
        """
        wants_top_tier = (
            quality_requirement == "high"
            and task_complexity in ("complex", "reasoning")
        )
        if wants_top_tier:
            return "gpt-4"

        if task_complexity in ("simple", "classification"):
            return "gpt-3.5-turbo"

        return "gpt-4-turbo"

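Balance cost and quality by routing requests to appropriate models based on task requirements.

Takeaways

Enforce a token budget on every prompt so optional context is trimmed before it reaches the API. Cache responses for repeated requests so identical calls are not billed twice. Route each request to the cheapest model that meets its quality requirement, and reserve the most expensive model for complex, high-quality work.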

Michael John Peña

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.