Caching Strategies for LLM Cost Reduction
Caching LLM responses can reduce API costs by 30-70% on workloads with repetitive or paraphrased queries. Here are effective caching strategies for production systems.
Exact Match Caching
import hashlib
import json

import redis


class ExactMatchCache:
    def __init__(self, redis_client: redis.Redis, ttl_hours: int = 24):
        self.redis = redis_client
        self.ttl = ttl_hours * 3600

    def _hash_prompt(self, prompt: str) -> str:
        return hashlib.sha256(prompt.encode()).hexdigest()

    def get(self, prompt: str) -> str | None:
        key = f"llm:exact:{self._hash_prompt(prompt)}"
        cached = self.redis.get(key)
        return json.loads(cached) if cached else None

    def set(self, prompt: str, response: str):
        key = f"llm:exact:{self._hash_prompt(prompt)}"
        self.redis.setex(key, self.ttl, json.dumps(response))

    def get_or_call(self, prompt: str, llm_func) -> tuple[str, bool]:
        cached = self.get(prompt)
        if cached:
            return cached, True  # Cache hit
        response = llm_func(prompt)
        self.set(prompt, response)
        return response, False  # Cache miss
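
As a quick usage sketch (not from the original example), the cache can wrap any callable that takes a prompt and returns a string; call_llm and the local Redis connection below are placeholders.

def call_llm(prompt: str) -> str:
    # Stand-in for a real LLM API call.
    return f"Response to: {prompt}"

exact_cache = ExactMatchCache(redis.Redis(host="localhost", port=6379), ttl_hours=24)

response, was_cached = exact_cache.get_or_call("Summarize our refund policy.", call_llm)
print(was_cached)  # False: first call goes to the LLM and populates the cache
response, was_cached = exact_cache.get_or_call("Summarize our refund policy.", call_llm)
print(was_cached)  # True: the identical prompt is served from Redis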
Semantic Caching
import numpy as np
from typing import Optional


class SemanticCache:
    def __init__(self, embedding_func, similarity_threshold: float = 0.95):
        self.embed = embedding_func
        self.threshold = similarity_threshold
        self.cache: list[dict] = []

    def _cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> float:
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

    def get(self, prompt: str) -> Optional[str]:
        query_embedding = self.embed(prompt)
        for entry in self.cache:
            similarity = self._cosine_similarity(query_embedding, entry["embedding"])
            if similarity >= self.threshold:
                return entry["response"]
        return None

    def set(self, prompt: str, response: str):
        embedding = self.embed(prompt)
        self.cache.append({
            "prompt": prompt,
            "embedding": embedding,
            "response": response,
        })
# Similar prompts return cached responses
# "How do I reset my password?" ≈ "How can I change my password?"
Hybrid Caching Strategy
class HybridCache:
    def __init__(self, exact_cache: ExactMatchCache, semantic_cache: SemanticCache):
        self.exact = exact_cache
        self.semantic = semantic_cache

    def get(self, prompt: str) -> tuple[str | None, str]:
        # Try exact match first (faster)
        exact_result = self.exact.get(prompt)
        if exact_result:
            return exact_result, "exact"
        # Try semantic match
        semantic_result = self.semantic.get(prompt)
        if semantic_result:
            return semantic_result, "semantic"
        return None, "miss"

    def set(self, prompt: str, response: str):
        self.exact.set(prompt, response)
        self.semantic.set(prompt, response)
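
Wiring the two layers together might look like the sketch below; it reuses the exact_cache, semantic_cache, and call_llm placeholders from the earlier sketches.

hybrid = HybridCache(exact_cache=exact_cache, semantic_cache=semantic_cache)

prompt = "How do I reset my password?"
response, source = hybrid.get(prompt)
if response is None:
    response = call_llm(prompt)  # only pay for the LLM call on a true miss
    hybrid.set(prompt, response)
print(source)  # "exact", "semantic", or "miss"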
Cache Analytics
class CacheAnalytics:
    def __init__(self):
        self.hits = {"exact": 0, "semantic": 0}
        self.misses = 0

    def record(self, cache_type: str):
        if cache_type == "miss":
            self.misses += 1
        else:
            self.hits[cache_type] += 1

    def get_stats(self) -> dict:
        total = sum(self.hits.values()) + self.misses
        return {
            "hit_rate": sum(self.hits.values()) / total if total > 0 else 0,
            "exact_hits": self.hits["exact"],
            "semantic_hits": self.hits["semantic"],
            "misses": self.misses,
            "estimated_savings_percent": sum(self.hits.values()) / total * 100 if total > 0 else 0,
        }
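
One way to thread the analytics through the hybrid lookup, continuing the placeholder names from the sketches above:

analytics = CacheAnalytics()

def cached_completion(prompt: str) -> str:
    response, source = hybrid.get(prompt)
    analytics.record(source)  # "exact", "semantic", or "miss"
    if response is None:
        response = call_llm(prompt)
        hybrid.set(prompt, response)
    return response

cached_completion("What is your refund policy?")
cached_completion("What is your refund policy?")  # second call is an exact hit
print(analytics.get_stats())  # hit_rate, per-layer hits, and estimated savings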
Best Practices
- Start with exact match - simple to deploy and catches repeated identical prompts
- Add semantic caching - absorbs paraphrased versions of the same question
- Set an appropriate TTL - balance response freshness against savings
- Monitor hit rates - tune the similarity threshold based on real traffic
- Cache selectively - time-sensitive or user-specific responses should bypass the cache (see the sketch after this list)
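
A minimal sketch of the "cache selectively" guard, assuming a placeholder list of time-sensitive markers and a caller-supplied personalization flag:

TIME_SENSITIVE_MARKERS = ("today", "latest", "current price", "right now")

def should_cache(prompt: str, is_personalized: bool) -> bool:
    # Skip caching for personalized or time-sensitive prompts; cache everything else.
    if is_personalized:
        return False
    lowered = prompt.lower()
    return not any(marker in lowered for marker in TIME_SENSITIVE_MARKERS)

print(should_cache("What is the latest USD/EUR rate right now?", is_personalized=False))  # False
print(should_cache("Explain how TLS certificates work.", is_personalized=False))          # True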
Conclusion
Caching is the fastest path to LLM cost reduction. Implement exact match caching first, then add semantic caching for additional savings.