6 min read
Token Budgeting: Cost-Effective LLM Applications
Token usage directly impacts cost and latency in LLM applications. Today, I will cover strategies for effective token budgeting.
Understanding Token Costs
# Azure OpenAI pricing (example - check current pricing)
pricing = {
    "gpt-4": {"input": 0.03, "output": 0.06},  # per 1K tokens
    "gpt-4-32k": {"input": 0.06, "output": 0.12},
    "gpt-4-turbo": {"input": 0.01, "output": 0.03},
    "gpt-35-turbo": {"input": 0.0015, "output": 0.002}
}

def estimate_cost(
    input_tokens: int,
    output_tokens: int,
    model: str = "gpt-4"
) -> float:
    """Estimate cost for a request"""
    rates = pricing.get(model, pricing["gpt-4"])
    return (input_tokens / 1000 * rates["input"] +
            output_tokens / 1000 * rates["output"])

# Example: 2000 input tokens, 500 output tokens with GPT-4
cost = estimate_cost(2000, 500, "gpt-4")
print(f"Estimated cost: ${cost:.4f}")  # ~$0.09
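The snippets below also rely on count_tokens and count_messages_tokens helpers, which I won't rebuild here. If you don't already have them, a minimal sketch using tiktoken looks like this; the cl100k_base encoding and the simplified per-message handling are assumptions to adapt to your models:

import tiktoken

# Minimal token-counting helpers assumed by the rest of this post (sketch only)
_ENCODING = tiktoken.get_encoding("cl100k_base")  # encoding used by the GPT-3.5/GPT-4 families

def count_tokens(text: str) -> int:
    """Count tokens in a plain string."""
    return len(_ENCODING.encode(text))

def count_messages_tokens(messages: list[dict]) -> int:
    """Rough count for chat messages; ignores the small per-message overhead."""
    return sum(count_tokens(m.get("content") or "") for m in messages)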
Token Budget Manager
from dataclasses import dataclass
from datetime import datetime, timedelta
import threading

@dataclass
class UsageRecord:
    timestamp: datetime
    input_tokens: int
    output_tokens: int
    model: str
    cost: float

class TokenBudgetManager:
    """Manage token budgets across requests"""

    def __init__(self, hourly_limit: int = 100000, daily_limit: int = 1000000):
        self.hourly_limit = hourly_limit
        self.daily_limit = daily_limit
        self.usage_log: list[UsageRecord] = []
        self.lock = threading.Lock()

    def can_proceed(self, estimated_tokens: int) -> tuple[bool, str]:
        """Check if request is within budget"""
        with self.lock:
            hourly_usage = self._get_usage(hours=1)
            daily_usage = self._get_usage(hours=24)
            if hourly_usage + estimated_tokens > self.hourly_limit:
                return False, f"Hourly limit reached ({hourly_usage}/{self.hourly_limit})"
            if daily_usage + estimated_tokens > self.daily_limit:
                return False, f"Daily limit reached ({daily_usage}/{self.daily_limit})"
            return True, "OK"

    def record_usage(self, input_tokens: int, output_tokens: int, model: str):
        """Record token usage"""
        with self.lock:
            cost = estimate_cost(input_tokens, output_tokens, model)
            record = UsageRecord(
                timestamp=datetime.utcnow(),
                input_tokens=input_tokens,
                output_tokens=output_tokens,
                model=model,
                cost=cost
            )
            self.usage_log.append(record)
            # Cleanup old records
            cutoff = datetime.utcnow() - timedelta(days=7)
            self.usage_log = [r for r in self.usage_log if r.timestamp > cutoff]

    def _get_usage(self, hours: int) -> int:
        """Get total token usage for time period"""
        cutoff = datetime.utcnow() - timedelta(hours=hours)
        return sum(
            r.input_tokens + r.output_tokens
            for r in self.usage_log
            if r.timestamp > cutoff
        )

    def get_stats(self) -> dict:
        """Get usage statistics"""
        with self.lock:
            now = datetime.utcnow()
            hourly = [r for r in self.usage_log if r.timestamp > now - timedelta(hours=1)]
            daily = [r for r in self.usage_log if r.timestamp > now - timedelta(hours=24)]
            hourly_tokens = sum(r.input_tokens + r.output_tokens for r in hourly)
            daily_tokens = sum(r.input_tokens + r.output_tokens for r in daily)
            return {
                "hourly_tokens": hourly_tokens,
                "hourly_cost": sum(r.cost for r in hourly),
                "hourly_remaining": self.hourly_limit - hourly_tokens,
                "daily_tokens": daily_tokens,
                "daily_cost": sum(r.cost for r in daily),
                "daily_remaining": self.daily_limit - daily_tokens
            }
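Here is a quick sketch of the manager in use; the limits and token counts are made up for illustration:

budget = TokenBudgetManager(hourly_limit=50_000, daily_limit=500_000)

ok, reason = budget.can_proceed(estimated_tokens=2500)
if ok:
    # ...call the model, then record the actual usage reported by the API...
    budget.record_usage(input_tokens=2000, output_tokens=500, model="gpt-4")
else:
    print(f"Deferring request: {reason}")

print(budget.get_stats())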
Model Selection Based on Budget
class ModelSelector:
    """Select appropriate model based on task and budget"""

    def __init__(self, budget_manager: TokenBudgetManager):
        self.budget = budget_manager
        self.model_capabilities = {
            "gpt-4": {"reasoning": 10, "creativity": 9, "speed": 5, "cost": 10},
            "gpt-4-turbo": {"reasoning": 9, "creativity": 9, "speed": 7, "cost": 7},
            "gpt-35-turbo": {"reasoning": 7, "creativity": 7, "speed": 9, "cost": 2}
        }

    def select_model(
        self,
        task_type: str,
        estimated_tokens: int,
        quality_required: str = "medium"
    ) -> str:
        """Select best model for task within budget"""
        # Check budget
        can_proceed, reason = self.budget.can_proceed(estimated_tokens)
        if not can_proceed:
            # Force cheapest model if over budget
            return "gpt-35-turbo"

        # Task-based selection
        task_requirements = {
            "simple_qa": {"reasoning": 5, "model": "gpt-35-turbo"},
            "summarization": {"reasoning": 6, "model": "gpt-35-turbo"},
            "analysis": {"reasoning": 8, "model": "gpt-4-turbo"},
            "complex_reasoning": {"reasoning": 9, "model": "gpt-4"},
            "code_generation": {"reasoning": 8, "model": "gpt-4-turbo"},
            "creative_writing": {"creativity": 8, "model": "gpt-4"}
        }

        # Quality adjustment
        quality_multiplier = {"low": 0.7, "medium": 1.0, "high": 1.3}.get(quality_required, 1.0)
        req = task_requirements.get(task_type, {"reasoning": 7, "model": "gpt-4-turbo"})
        reasoning_score = req.get("reasoning", 7) * quality_multiplier

        # If high quality required or complex task, use better model
        if quality_required == "high" or reasoning_score > 8:
            return "gpt-4"
        elif reasoning_score > 6:
            return "gpt-4-turbo"
        else:
            return "gpt-35-turbo"
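Continuing the same example, a few illustrative calls (assuming the budget checks pass):

selector = ModelSelector(budget)

print(selector.select_model("simple_qa", estimated_tokens=800))    # gpt-35-turbo
print(selector.select_model("analysis", estimated_tokens=3000))    # gpt-4-turbo
print(selector.select_model("analysis", estimated_tokens=3000, quality_required="high"))  # gpt-4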
Request Optimization
import hashlib
import re

class RequestOptimizer:
    """Optimize requests to minimize token usage"""

    def __init__(self, client):
        self.client = client

    def optimize_prompt(self, prompt: str, max_tokens: int | None = None) -> str:
        """Compress prompt while preserving meaning"""
        # Remove unnecessary whitespace
        prompt = re.sub(r'\s+', ' ', prompt).strip()
        # Remove common filler words if over budget
        if max_tokens and count_tokens(prompt) > max_tokens:
            fillers = ["please", "kindly", "basically", "actually", "really", "very"]
            for filler in fillers:
                prompt = re.sub(rf'\b{filler}\b\s*', '', prompt, flags=re.IGNORECASE)
        return prompt

    def batch_requests(self, prompts: list[str], batch_size: int = 5) -> list[str]:
        """Batch similar requests to reduce per-call overhead"""
        batches = []
        for start in range(0, len(prompts), batch_size):
            chunk = prompts[start:start + batch_size]
            batched_prompt = "Process each of the following requests:\n\n"
            for i, prompt in enumerate(chunk, 1):
                batched_prompt += f"Request {i}:\n{prompt}\n\n"
            batched_prompt += "Provide numbered responses for each request."
            batches.append(batched_prompt)
        return batches

    def cache_key(self, prompt: str) -> str:
        """Stable cache key for a prompt"""
        return hashlib.md5(prompt.encode()).hexdigest()

    def use_caching(self, prompt: str, cache: dict) -> tuple[str | None, bool]:
        """Check cache before making an API call"""
        key = self.cache_key(prompt)
        if key in cache:
            return cache[key], True  # Cache hit
        return None, False  # Cache miss

    def estimate_and_optimize(
        self,
        prompt: str,
        context: list,
        budget_tokens: int
    ) -> dict:
        """Optimize request to fit budget"""
        prompt_tokens = count_tokens(prompt)
        context_tokens = count_messages_tokens(context)
        total = prompt_tokens + context_tokens
        optimizations = []
        optimized_prompt = prompt
        if total > budget_tokens:
            # Strategy 1: Compress prompt
            optimized_prompt = self.optimize_prompt(prompt, budget_tokens // 2)
            optimizations.append("prompt_compressed")
            # Strategy 2: Reduce context
            if count_tokens(optimized_prompt) + context_tokens > budget_tokens:
                # Keep only the most recent context
                context = context[-5:]  # Last 5 messages
                optimizations.append("context_reduced")
        return {
            "prompt": optimized_prompt,
            "context": context,
            "estimated_tokens": count_tokens(optimized_prompt) + count_messages_tokens(context),
            "optimizations": optimizations
        }
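A quick sketch of the optimizer on its own, with made-up inputs; it relies on the count_tokens helper from earlier:

optimizer = RequestOptimizer(client=None)  # the client is not used by these helpers

compressed = optimizer.optimize_prompt(
    "Please   kindly summarize this really long document   for me", max_tokens=5
)
batches = optimizer.batch_requests(
    ["Translate 'hello' to French", "Translate 'goodbye' to German", "Translate 'thanks' to Spanish"],
    batch_size=2
)
print(compressed)
print(len(batches))  # 2 batched prompts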
Integration Example
class CostAwareChatService:
    """Chat service with cost controls"""

    def __init__(self, client, budget_manager, model_selector, optimizer):
        self.client = client
        self.budget = budget_manager
        self.selector = model_selector
        self.optimizer = optimizer
        self.cache = {}

    def chat(
        self,
        user_message: str,
        conversation: list,
        task_type: str = "simple_qa",
        quality: str = "medium"
    ) -> dict:
        """Process chat with cost optimization"""
        # Check cache
        cached, hit = self.optimizer.use_caching(user_message, self.cache)
        if hit:
            return {"response": cached, "cached": True, "cost": 0}

        # Estimate tokens
        estimated = count_tokens(user_message) + count_messages_tokens(conversation) + 500

        # Check budget
        can_proceed, reason = self.budget.can_proceed(estimated)
        if not can_proceed:
            return {"error": reason, "cost": 0}

        # Select model
        model = self.selector.select_model(task_type, estimated, quality)

        # Optimize request
        optimized = self.optimizer.estimate_and_optimize(
            user_message,
            conversation,
            budget_tokens=8000 if model == "gpt-4" else 4000
        )

        # Make request
        messages = optimized["context"] + [{"role": "user", "content": optimized["prompt"]}]
        response = self.client.chat.completions.create(
            model=model,
            messages=messages,
            max_tokens=500
        )

        # Record usage
        usage = response.usage
        self.budget.record_usage(usage.prompt_tokens, usage.completion_tokens, model)
        result = response.choices[0].message.content

        # Cache result
        self.cache[self.optimizer.cache_key(user_message)] = result

        return {
            "response": result,
            "model": model,
            "tokens": {"input": usage.prompt_tokens, "output": usage.completion_tokens},
            "cost": estimate_cost(usage.prompt_tokens, usage.completion_tokens, model),
            "optimizations": optimized["optimizations"],
            "cached": False
        }
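Finally, a hypothetical wiring example. It assumes the openai v1 SDK against an Azure resource whose deployment names match the model names used above; the endpoint, key, and API version are placeholders to replace with your own:

import os
from openai import AzureOpenAI

# Hypothetical setup - endpoint, key, API version, and deployment names must match your Azure resource
client = AzureOpenAI(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version="2024-02-01"
)

budget = TokenBudgetManager(hourly_limit=100_000, daily_limit=1_000_000)
selector = ModelSelector(budget)
optimizer = RequestOptimizer(client)
service = CostAwareChatService(client, budget, selector, optimizer)

result = service.chat(
    "Summarize our refund policy in two sentences.",
    conversation=[],
    task_type="summarization"
)
print(result.get("response"))
print(f"Cost: ${result.get('cost', 0):.4f}")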
Effective token budgeting keeps costs under control. Tomorrow, I will cover caching strategies for LLM applications.