Prompt Caching Strategies: Reducing Latency and Cost
Prompt caching can dramatically reduce both latency and cost for AI applications. Below are two implementations: an exact-match cache backed by Redis, and a semantic cache that matches queries by embedding similarity.
Prompt Caching Implementation
from openai import AsyncAzureOpenAI
import hashlib
import redis
import json


class PromptCache:
    """Exact-match response cache for deterministic chat completions, backed by Redis."""

    def __init__(self, redis_client: redis.Redis, ttl_seconds: int = 3600):
        # Note: a synchronous Redis client blocks the event loop; redis.asyncio is a drop-in alternative
        self.redis = redis_client
        self.ttl = ttl_seconds

    def _cache_key(self, messages: list, model: str, temperature: float) -> str:
        """Generate a deterministic cache key from the full request."""
        content = json.dumps({
            "messages": messages,
            "model": model,
            "temperature": temperature
        }, sort_keys=True)
        return f"prompt:{hashlib.sha256(content.encode()).hexdigest()}"

    async def get_or_compute(
        self,
        openai_client: AsyncAzureOpenAI,
        messages: list,
        model: str = "gpt-4o",
        temperature: float = 0
    ) -> str:
        """Return the cached response if one exists, otherwise call the model and cache the result."""
        # Only cache deterministic requests; sampled outputs vary between calls
        if temperature > 0:
            return await self._compute(openai_client, messages, model, temperature)

        key = self._cache_key(messages, model, temperature)
        cached = self.redis.get(key)
        if cached:
            return json.loads(cached)

        response = await self._compute(openai_client, messages, model, temperature)
        self.redis.setex(key, self.ttl, json.dumps(response))
        return response

    async def _compute(self, client, messages, model, temperature) -> str:
        response = await client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature
        )
        return response.choices[0].message.content
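Wiring this up takes only a few lines. The sketch below is a minimal usage example, assuming a local Redis instance and the async Azure OpenAI client from the openai package; the endpoint, API key, and API version are placeholders for your own deployment.

import asyncio
import redis
from openai import AsyncAzureOpenAI

async def main():
    # Placeholder endpoint, key, and API version; substitute your own deployment details
    client = AsyncAzureOpenAI(
        azure_endpoint="https://your-resource.openai.azure.com",
        api_key="your-api-key",
        api_version="2024-06-01",
    )
    cache = PromptCache(redis.Redis(host="localhost", port=6379))

    messages = [{"role": "user", "content": "Summarize our refund policy in one sentence."}]

    # First call hits the model; the identical second call is served from Redis
    first = await cache.get_or_compute(client, messages, model="gpt-4o", temperature=0)
    second = await cache.get_or_compute(client, messages, model="gpt-4o", temperature=0)
    assert first == second

asyncio.run(main())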
import numpy as np


class SemanticCache:
    """Cache keyed on semantic similarity of queries, not exact string match."""

    def __init__(self, openai_client: AsyncAzureOpenAI, similarity_threshold: float = 0.95):
        self.openai = openai_client
        self.threshold = similarity_threshold
        self.cache = []  # list of (embedding, query, response) tuples

    async def get_embedding(self, text: str) -> list:
        response = await self.openai.embeddings.create(
            input=text,
            model="text-embedding-3-small"
        )
        return response.data[0].embedding

    async def get_or_compute(self, query: str, compute_fn) -> str:
        """Return the response of a semantically similar cached query, or compute a new one."""
        query_embedding = await self.get_embedding(query)

        # Linear scan is fine for small caches; use a vector index for larger ones
        for cached_emb, cached_query, cached_response in self.cache:
            if self.cosine_similarity(query_embedding, cached_emb) >= self.threshold:
                return cached_response

        response = await compute_fn(query)
        self.cache.append((query_embedding, query, response))
        return response

    def cosine_similarity(self, a: list, b: list) -> float:
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
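The semantic cache is used the same way, except it takes a fallback function to call on a miss. The demo below is a sketch that reuses the AsyncAzureOpenAI client from the previous example; answer is a hypothetical helper, and the query strings are illustrative.

async def demo(client: AsyncAzureOpenAI):
    async def answer(query: str) -> str:
        # Fallback used on a cache miss: call the chat model directly
        response = await client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": query}],
            temperature=0,
        )
        return response.choices[0].message.content

    semantic_cache = SemanticCache(client)

    # A paraphrased follow-up can be answered from cache when its embedding
    # falls within the 0.95 similarity threshold of the first query
    first = await semantic_cache.get_or_compute("What is your refund policy?", answer)
    second = await semantic_cache.get_or_compute("How do refunds work?", answer)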
For applications with repetitive queries, smart caching can reduce AI costs by 30-50% while also eliminating model latency entirely on cache hits.
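How much you save depends almost entirely on the hit rate. A rough back-of-the-envelope estimate, using illustrative per-request costs rather than real pricing, looks like this:

def estimated_savings(requests: int, hit_rate: float, cost_per_call: float,
                      embedding_cost: float = 0.0) -> float:
    """Every cache hit avoids one model call; a semantic cache also pays a small
    embedding cost on every request."""
    saved = requests * hit_rate * cost_per_call
    overhead = requests * embedding_cost
    return saved - overhead

# Example: 100k requests/month, 40% hit rate, $0.01 per completion,
# $0.0001 per embedding lookup (illustrative numbers only)
print(estimated_savings(100_000, 0.40, 0.01, 0.0001))  # -> 390.0, i.e. roughly $390/month saved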