Caching Strategies for LLM Cost Reduction
Caching LLM responses can reduce API costs by 30-70% on workloads with repetitive or paraphrased queries. Here are effective caching strategies for production systems.
Exact Match Caching
import hashlib
import json

import redis


class ExactMatchCache:
    def __init__(self, redis_client: redis.Redis, ttl_hours: int = 24):
        self.redis = redis_client
        self.ttl = ttl_hours * 3600

    def _hash_prompt(self, prompt: str) -> str:
        return hashlib.sha256(prompt.encode()).hexdigest()

    def get(self, prompt: str) -> str | None:
        key = f"llm:exact:{self._hash_prompt(prompt)}"
        cached = self.redis.get(key)
        return json.loads(cached) if cached else None

    def set(self, prompt: str, response: str):
        key = f"llm:exact:{self._hash_prompt(prompt)}"
        self.redis.setex(key, self.ttl, json.dumps(response))

    def get_or_call(self, prompt: str, llm_func) -> tuple[str, bool]:
        cached = self.get(prompt)
        if cached:
            return cached, True  # Cache hit
        response = llm_func(prompt)
        self.set(prompt, response)
        return response, False  # Cache miss
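
As a quick usage sketch (not from the original example), the cache can wrap any callable that takes a prompt and returns a string; call_llm and the local Redis connection below are placeholders.

def call_llm(prompt: str) -> str:
    # Stand-in for a real LLM API call.
    return f"Response to: {prompt}"

exact_cache = ExactMatchCache(redis.Redis(host="localhost", port=6379), ttl_hours=24)

response, was_cached = exact_cache.get_or_call("Summarize our refund policy.", call_llm)
print(was_cached)  # False: first call goes to the LLM and populates the cache
response, was_cached = exact_cache.get_or_call("Summarize our refund policy.", call_llm)
print(was_cached)  # True: the identical prompt is served from Redis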
Semantic Caching
import numpy as np
from typing import Optional


class SemanticCache:
    def __init__(self, embedding_func, similarity_threshold: float = 0.95):
        self.embed = embedding_func
        self.threshold = similarity_threshold
        self.cache: list[dict] = []

    def _cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> float:
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

    def get(self, prompt: str) -> Optional[str]:
        query_embedding = self.embed(prompt)
        for entry in self.cache:
            similarity = self._cosine_similarity(query_embedding, entry["embedding"])
            if similarity >= self.threshold:
                return entry["response"]
        return None

    def set(self, prompt: str, response: str):
        embedding = self.embed(prompt)
        self.cache.append({
            "prompt": prompt,
            "embedding": embedding,
            "response": response,
        })
# Similar prompts return cached responses
# "How do I reset my password?" ≈ "How can I change my password?"
Hybrid Caching Strategy
class HybridCache:
    def __init__(self, exact_cache: ExactMatchCache, semantic_cache: SemanticCache):
        self.exact = exact_cache
        self.semantic = semantic_cache

    def get(self, prompt: str) -> tuple[str | None, str]:
        # Try exact match first (faster)
        exact_result = self.exact.get(prompt)
        if exact_result:
            return exact_result, "exact"
        # Try semantic match
        semantic_result = self.semantic.get(prompt)
        if semantic_result:
            return semantic_result, "semantic"
        return None, "miss"

    def set(self, prompt: str, response: str):
        self.exact.set(prompt, response)
        self.semantic.set(prompt, response)
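
Wiring the two layers together might look like the sketch below; it reuses the exact_cache, semantic_cache, and call_llm placeholders from the earlier sketches.

hybrid = HybridCache(exact_cache=exact_cache, semantic_cache=semantic_cache)

prompt = "How do I reset my password?"
response, source = hybrid.get(prompt)
if response is None:
    response = call_llm(prompt)  # only pay for the LLM call on a true miss
    hybrid.set(prompt, response)
print(source)  # "exact", "semantic", or "miss"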
Cache Analytics
class CacheAnalytics:
    def __init__(self):
        self.hits = {"exact": 0, "semantic": 0}
        self.misses = 0

    def record(self, cache_type: str):
        if cache_type == "miss":
            self.misses += 1
        else:
            self.hits[cache_type] += 1

    def get_stats(self) -> dict:
        total = sum(self.hits.values()) + self.misses
        return {
            "hit_rate": sum(self.hits.values()) / total if total > 0 else 0,
            "exact_hits": self.hits["exact"],
            "semantic_hits": self.hits["semantic"],
            "misses": self.misses,
            "estimated_savings_percent": sum(self.hits.values()) / total * 100 if total > 0 else 0,
        }
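
One way to thread the analytics through the hybrid lookup, continuing the placeholder names from the sketches above:

analytics = CacheAnalytics()

def cached_completion(prompt: str) -> str:
    response, source = hybrid.get(prompt)
    analytics.record(source)  # "exact", "semantic", or "miss"
    if response is None:
        response = call_llm(prompt)
        hybrid.set(prompt, response)
    return response

cached_completion("What is your refund policy?")
cached_completion("What is your refund policy?")  # second call is an exact hit
print(analytics.get_stats())  # hit_rate, per-layer hits, and estimated savings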
Best Practices
- Start with exact match - simple to deploy and catches repeated identical prompts
- Add semantic caching - absorbs paraphrased versions of the same question
- Set an appropriate TTL - balance response freshness against savings
- Monitor hit rates - tune the similarity threshold based on real traffic
- Cache selectively - time-sensitive or user-specific responses should bypass the cache (see the sketch after this list)
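
A minimal sketch of the "cache selectively" guard, assuming a placeholder list of time-sensitive markers and a caller-supplied personalization flag:

TIME_SENSITIVE_MARKERS = ("today", "latest", "current price", "right now")

def should_cache(prompt: str, is_personalized: bool) -> bool:
    # Skip caching for personalized or time-sensitive prompts; cache everything else.
    if is_personalized:
        return False
    lowered = prompt.lower()
    return not any(marker in lowered for marker in TIME_SENSITIVE_MARKERS)

print(should_cache("What is the latest USD/EUR rate right now?", is_personalized=False))  # False
print(should_cache("Explain how TLS certificates work.", is_personalized=False))          # True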
Conclusion
Caching is the fastest path to LLM cost reduction. Implement exact match caching first, then add semantic caching for additional savings.