Cost Optimization for LLMs: Strategies That Work
LLM costs can spiral quickly at scale. Here are proven strategies to reduce costs without sacrificing quality.
Cost Reduction Strategies
1. Prompt Optimization
import re

def optimize_prompt(prompt: str, context: str) -> str:
    """Reduce prompt size while preserving meaning."""
    # Strip filler politeness words that add tokens but no signal
    prompt = re.sub(r'\b(please|kindly|could you)\b', '', prompt, flags=re.I)
    # Truncate context to the most relevant portion
    max_context_chars = 4000
    if len(context) > max_context_chars:
        context = context[:max_context_chars] + "..."
    # Use concise formatting
    return f"{prompt}\n\nContext: {context}"

# Before: 2000 tokens
# After: 800 tokens
# Savings: 60%
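To sanity-check savings like these on your own prompts, count tokens before and after. A minimal sketch using the tiktoken library (assumes tiktoken is installed; the sample strings are illustrative):

import tiktoken

enc = tiktoken.encoding_for_model("gpt-4")  # pick the encoding for your target model

def token_count(text: str) -> int:
    return len(enc.encode(text))

raw = "Could you please kindly summarize the following report for me?"  # illustrative
print(token_count(raw), "->", token_count(optimize_prompt(raw, "report text...")))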
2. Response Caching
import hashlib
import time

class LLMCache:
    def __init__(self, ttl_hours: int = 24):
        self.cache = {}
        self.ttl = ttl_hours * 3600  # TTL in seconds

    def get_or_call(self, prompt: str, llm_func) -> str:
        # Hash the prompt so identical queries map to the same key
        # (md5 is fine for cache keying; this is not a security context)
        cache_key = hashlib.md5(prompt.encode()).hexdigest()
        if cache_key in self.cache:
            cached = self.cache[cache_key]
            if time.time() - cached["time"] < self.ttl:
                return cached["response"]
        response = llm_func(prompt)
        self.cache[cache_key] = {"response": response, "time": time.time()}
        return response
# Cache hit rate of 30% = 30% cost reduction
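In practice you wrap whatever client you already call. The call_model function here is a hypothetical stand-in, not a real API:

cache = LLMCache(ttl_hours=24)

def call_model(prompt: str) -> str:
    return f"(model response to: {prompt})"  # stand-in for a real LLM API call

first = cache.get_or_call("What is our refund policy?", call_model)   # pays for one call
repeat = cache.get_or_call("What is our refund policy?", call_model)  # served from cache, no API cost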
3. Model Tiering
def select_model_by_task(task_complexity: str) -> tuple[str, float]:
    """Return (model name, approximate price per 1K tokens) for a task tier."""
    tiers = {
        "simple": ("gpt-3.5-turbo", 0.002),
        "medium": ("gpt-4-turbo", 0.03),
        "complex": ("gpt-4", 0.06),
    }
    # Default to the medium tier for unknown complexity labels
    return tiers.get(task_complexity, tiers["medium"])

# Example traffic mix: 60% simple, 30% medium, 10% complex
# Blended cost: 0.6 * 0.002 + 0.3 * 0.03 + 0.1 * 0.06 ≈ $0.016/1K tokens
# vs. all GPT-4: $0.06/1K tokens
# Savings: ~73%
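How requests get labeled simple, medium, or complex is application-specific. One cheap heuristic, sketched here with hypothetical thresholds and keywords, is to route on prompt length and reasoning cues:

def classify_complexity(prompt: str) -> str:
    # Hypothetical heuristic: reasoning cues are complex, short lookups are simple
    if any(kw in prompt.lower() for kw in ("prove", "analyze", "step by step")):
        return "complex"
    if len(prompt) < 200:
        return "simple"
    return "medium"

user_prompt = "What is the capital of France?"  # illustrative
model, price_per_1k = select_model_by_task(classify_complexity(user_prompt))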
4. Batch Processing
import asyncio

async def batch_process(prompts: list[str], batch_size: int = 20) -> list[str]:
    """Process prompts in batches for efficiency."""
    results = []
    for i in range(0, len(prompts), batch_size):
        batch = prompts[i:i + batch_size]
        # process_single is your async per-prompt LLM call
        batch_results = await asyncio.gather(*[
            process_single(p) for p in batch
        ])
        results.extend(batch_results)
    return results
# Reduces overhead and enables better rate limit management
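If your provider caps concurrent requests rather than batch size, a semaphore gives the same protection without fixed batch boundaries. A sketch, again assuming an async process_single coroutine:

import asyncio

async def bounded_process(prompts: list[str], max_concurrent: int = 20) -> list[str]:
    sem = asyncio.Semaphore(max_concurrent)

    async def guarded(p: str) -> str:
        async with sem:  # at most max_concurrent calls in flight
            return await process_single(p)

    return await asyncio.gather(*(guarded(p) for p in prompts))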
5. Token Budget Management
from datetime import date

class TokenBudget:
    def __init__(self, daily_budget: int = 1_000_000):
        self.daily_budget = daily_budget
        self.used_today = 0
        self._day = date.today()  # track the day so the counter resets at rollover

    def _maybe_reset(self):
        if date.today() != self._day:
            self._day, self.used_today = date.today(), 0

    def can_process(self, estimated_tokens: int) -> bool:
        self._maybe_reset()
        return self.used_today + estimated_tokens <= self.daily_budget

    def record_usage(self, tokens: int):
        self.used_today += tokens

    def get_remaining(self) -> int:
        self._maybe_reset()
        return self.daily_budget - self.used_today
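Wired into a request path, the budget acts as a gate. The call_model helper is hypothetical, and the token estimate uses a rough chars-per-token heuristic:

budget = TokenBudget(daily_budget=1_000_000)

def handle_request(prompt: str) -> str:
    estimated = len(prompt) // 4  # rough heuristic: ~4 characters per token
    if not budget.can_process(estimated):
        return "Daily token budget exhausted; request rejected."
    response = call_model(prompt)  # hypothetical LLM client wrapper
    budget.record_usage(estimated)
    return response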
Cost Monitoring
def calculate_daily_cost(usage_log: list[dict]) -> dict:
    # Prices in USD per 1K tokens; verify against current provider pricing
    pricing = {
        "gpt-4-turbo": {"input": 0.01, "output": 0.03},
        "gpt-3.5-turbo": {"input": 0.0005, "output": 0.0015},
    }
    costs = {}
    for entry in usage_log:
        model = entry["model"]
        # Unknown models fall back to gpt-4-turbo pricing (conservative)
        p = pricing.get(model, pricing["gpt-4-turbo"])
        cost = (
            entry["input_tokens"] / 1000 * p["input"] +
            entry["output_tokens"] / 1000 * p["output"]
        )
        costs[model] = costs.get(model, 0) + cost
    return costs
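Fed from a usage log, this supports simple threshold alerts. The log entries and the $50 threshold below are illustrative:

usage_log = [
    {"model": "gpt-4-turbo", "input_tokens": 12_000, "output_tokens": 4_000},
    {"model": "gpt-3.5-turbo", "input_tokens": 50_000, "output_tokens": 20_000},
]

costs = calculate_daily_cost(usage_log)
total = sum(costs.values())
if total > 50.0:  # illustrative daily alert threshold, in dollars
    print(f"ALERT: daily spend ${total:.2f} exceeds budget")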
Best Practices Summary
- Cache aggressively: identical queries don't need re-processing
- Tier your models: match model to task complexity
- Optimize prompts: shorter prompts mean lower costs
- Set budgets: prevent runaway spending
- Monitor continuously: track cost per query and per model
Conclusion
Cost optimization is essential for running AI sustainably at scale. Applied in combination, these strategies can reduce costs by 50-80% while maintaining output quality.