
AI Cost Optimization: Managing LLM Expenses

AI costs can spiral quickly. Here’s how to optimize LLM expenses without sacrificing quality.

AI Cost Optimization Strategies

from openai import AzureOpenAI
from dataclasses import dataclass
from typing import Dict, List
import tiktoken

@dataclass
class CostMetrics:
    input_tokens: int
    output_tokens: int
    model: str
    cost_usd: float

class AICostOptimizer:
    def __init__(self, openai_client: AzureOpenAI):
        self.openai = openai_client
        self.encoder = tiktoken.encoding_for_model("gpt-4o")

        # Cost per 1K tokens (as of 2025)
        self.pricing = {
            "gpt-4o": {"input": 0.005, "output": 0.015},
            "gpt-4o-mini": {"input": 0.00015, "output": 0.0006},
            "gpt-4": {"input": 0.03, "output": 0.06},
            "text-embedding-3-small": {"input": 0.00002, "output": 0},
            "text-embedding-3-large": {"input": 0.00013, "output": 0}
        }

    def estimate_cost(self, prompt: str, model: str, expected_output_tokens: int = 500) -> float:
        """Estimate cost before making API call."""
        input_tokens = len(self.encoder.encode(prompt))
        pricing = self.pricing.get(model, self.pricing["gpt-4o"])

        input_cost = (input_tokens / 1000) * pricing["input"]
        output_cost = (expected_output_tokens / 1000) * pricing["output"]

        return input_cost + output_cost

    def optimize_prompt(self, prompt: str, max_tokens: int = 1000) -> str:
        """Compress prompt while preserving meaning."""
        current_tokens = len(self.encoder.encode(prompt))

        if current_tokens <= max_tokens:
            return prompt

        # Strategies for compression
        # 1. Remove redundant whitespace
        prompt = " ".join(prompt.split())

        # 2. Use abbreviations for common phrases
        abbreviations = {
            "for example": "e.g.",
            "that is": "i.e.",
            "and so on": "etc."
        }
        for full, abbrev in abbreviations.items():
            prompt = prompt.replace(full, abbrev)

        # 3. If still too long, use LLM to compress
        if len(self.encoder.encode(prompt)) > max_tokens:
            prompt = self.llm_compress(prompt, max_tokens)

        return prompt
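
    def llm_compress(self, prompt: str, max_tokens: int) -> str:
        # llm_compress is called in optimize_prompt above but was not defined in
        # the original post; this is a minimal sketch that asks a cheap model to
        # rewrite the prompt within the token budget. "gpt-4o-mini" is assumed to
        # be the Azure deployment name.
        response = self.openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{
                "role": "system",
                "content": f"Rewrite the user's text in roughly {max_tokens} tokens or fewer, keeping all instructions and key details."
            }, {
                "role": "user",
                "content": prompt
            }],
            max_tokens=max_tokens
        )
        return response.choices[0].message.content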

    def smart_model_selection(self, task: str, complexity: float) -> str:
        """Select the cheapest model likely to handle the task."""
        if complexity < 0.3:
            return "gpt-4o-mini"  # roughly 25-30x cheaper per token than gpt-4o (see pricing above)
        elif complexity < 0.7:
            return "gpt-4o-mini"  # try mini first; escalate if quality falls short (see route_with_fallback below)
        else:
            return "gpt-4o"

    def batch_requests(self, prompts: List[str], model: str) -> List[str]:
        """Batch multiple requests to reduce overhead."""
        # Combine prompts with delimiters
        combined = "\n---\n".join([f"[{i}] {p}" for i, p in enumerate(prompts)])

        response = self.openai.chat.completions.create(
            model=model,
            messages=[{
                "role": "system",
                "content": "Answer each numbered question. Separate answers with ---"
            }, {
                "role": "user",
                "content": combined
            }]
        )

        # Parse responses
        responses = response.choices[0].message.content.split("---")
        return [r.strip() for r in responses]

    def track_costs(self, response, model: str) -> CostMetrics:
        """Track actual costs from response."""
        input_tokens = response.usage.prompt_tokens
        output_tokens = response.usage.completion_tokens
        pricing = self.pricing.get(model, self.pricing["gpt-4o"])

        cost = (input_tokens / 1000) * pricing["input"] + \
               (output_tokens / 1000) * pricing["output"]

        return CostMetrics(
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            model=model,
            cost_usd=cost
        )
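
To make the flow concrete, here is a hypothetical end-to-end usage sketch. The endpoint, API key, API version, and deployment name are placeholders for your own Azure OpenAI resource; with Azure, the model argument passed to the API is the deployment name.

# Hypothetical usage; endpoint, key, api_version, and deployment names are placeholders.
client = AzureOpenAI(
    azure_endpoint="https://YOUR-RESOURCE.openai.azure.com",
    api_key="YOUR-API-KEY",
    api_version="2024-06-01"
)
optimizer = AICostOptimizer(client)

prompt = "Summarize the attached incident report for an executive audience..."
prompt = optimizer.optimize_prompt(prompt, max_tokens=800)

model = optimizer.smart_model_selection(prompt, complexity=0.4)
print(f"Estimated cost: ${optimizer.estimate_cost(prompt, model):.4f}")

response = client.chat.completions.create(
    model=model,  # on Azure this is your deployment name
    messages=[{"role": "user", "content": prompt}],
    max_tokens=500  # output length limit
)
metrics = optimizer.track_costs(response, model)
print(f"Actual cost: ${metrics.cost_usd:.4f} ({metrics.input_tokens} in / {metrics.output_tokens} out)")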

Cost optimization strategies summary:

| Strategy | Savings | Implementation Effort |
|----------|---------|----------------------|
| Model routing | 50-80% | Medium |
| Prompt caching | 30-50% | Low |
| Prompt compression | 20-40% | Low |
| Batching | 10-20% | Medium |
| Output length limits | 20-30% | Low |
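
Two strategies in the table are not covered by the class above: prompt caching and output length limits. Below is a minimal, hypothetical sketch of both, using a simple in-process dictionary keyed by a hash of the prompt (not Azure's server-side prompt caching); cached_chat is an illustrative helper, not part of the class.

import hashlib
from typing import Dict
from openai import AzureOpenAI

_response_cache: Dict[str, str] = {}

def cached_chat(client: AzureOpenAI, model: str, prompt: str, max_output_tokens: int = 300) -> str:
    """Serve repeated prompts from a local cache; cap output length on misses."""
    key = hashlib.sha256(f"{model}:{prompt}".encode()).hexdigest()
    if key in _response_cache:
        return _response_cache[key]  # cache hit: no API call, zero marginal cost

    response = client.chat.completions.create(
        model=model,  # Azure deployment name
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_output_tokens  # output length limit bounds per-call spend
    )
    answer = response.choices[0].message.content
    _response_cache[key] = answer
    return answer

For repeated prompts (FAQ answers, classification with a fixed instruction block), a cache hit costs nothing, and the max_tokens cap bounds worst-case spend on misses.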

Strategic cost optimization can reduce AI expenses by 50-80% while maintaining quality.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.