
Implementing Semantic Caching for LLM Applications

Semantic caching goes beyond exact-match caching by returning cached responses for semantically similar queries. Because many user queries are rephrasings of earlier ones, this can dramatically reduce LLM API costs while maintaining response quality. Here’s how to implement semantic caching with Redis and embeddings.

Architecture Overview

The cache stores each query's embedding alongside its response. On lookup, it embeds the incoming query and returns the cached response of any entry whose cosine similarity clears the threshold:

import redis
import numpy as np
from openai import AsyncAzureOpenAI
import json
import hashlib

class SemanticCache:
    def __init__(
        self,
        openai_client: AsyncAzureOpenAI,
        redis_client: redis.Redis,
        similarity_threshold: float = 0.95
    ):
        self.llm = openai_client
        self.redis = redis_client
        self.threshold = similarity_threshold
        self.embedding_model = "text-embedding-3-small"

    async def get_embedding(self, text: str) -> list[float]:
        response = await self.llm.embeddings.create(
            model=self.embedding_model,
            input=text
        )
        return response.data[0].embedding

    def cosine_similarity(self, a: list[float], b: list[float]) -> float:
        a, b = np.array(a), np.array(b)
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

    async def get_cached_response(self, query: str) -> str | None:
        query_embedding = await self.get_embedding(query)

        # Linear scan over cached embeddings; SCAN avoids blocking Redis
        # the way KEYS would (see Production Considerations for scaling)
        for key in self.redis.scan_iter("cache:embedding:*"):
            cached_data = json.loads(self.redis.get(key))
            similarity = self.cosine_similarity(
                query_embedding,
                cached_data["embedding"]
            )

            if similarity >= self.threshold:
                # Update hit count for analytics
                self.redis.hincrby("cache:stats", "hits", 1)
                return cached_data["response"]

        self.redis.hincrby("cache:stats", "misses", 1)
        return None

    async def cache_response(self, query: str, response: str):
        embedding = await self.get_embedding(query)
        cache_key = f"cache:embedding:{hashlib.md5(query.encode()).hexdigest()}"

        self.redis.setex(
            cache_key,
            3600 * 24,  # 24 hour TTL
            json.dumps({
                "query": query,
                "embedding": embedding,
                "response": response
            })
        )
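
Constructing the cache requires an async Azure OpenAI client and a Redis connection. A minimal setup sketch, assuming your endpoint and key are in environment variables (the variable names and API version here are placeholders for your own deployment):

import os
import redis
from openai import AsyncAzureOpenAI

# Placeholder configuration -- adjust endpoint, key, and API version to your deployment
openai_client = AsyncAzureOpenAI(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version="2024-06-01",
)

redis_client = redis.Redis(host="localhost", port=6379, db=0)

cache = SemanticCache(openai_client, redis_client, similarity_threshold=0.95)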

Integration with LLM Calls

Wrap your LLM calls so they check the cache first. Add a cached_completion method to SemanticCache:

async def cached_completion(self, query: str, **kwargs) -> str:
    # Check cache
    cached = await self.get_cached_response(query)
    if cached:
        return cached

    # Call LLM
    response = await self.llm.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": query}],
        **kwargs
    )

    result = response.choices[0].message.content

    # Cache for future use
    await self.cache_response(query, result)

    return result
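
Calling it then looks like this (a sketch, run inside your application's event loop; the example queries are illustrative):

import asyncio

async def main():
    # First call misses the cache and hits the model; a semantically
    # similar follow-up should be served from Redis
    answer = await cache.cached_completion("How do I reset my password?")
    similar = await cache.cached_completion("What are the steps to reset my password?")
    print(answer == similar)

asyncio.run(main())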

Production Considerations

The linear scan in get_cached_response is fine for small caches, but at scale use Redis Vector Search so the similarity lookup runs server-side against a vector index instead of iterating keys in Python. Tune the similarity threshold to your use case: higher values return only very close matches, while lower values increase the cache hit rate at the risk of serving responses to queries that are only loosely related.
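
A minimal sketch of the vector-index approach with redis-py, assuming the RediSearch module is available and 1536-dimensional embeddings from text-embedding-3-small; the index name, field names, and helper function are illustrative:

import numpy as np
from redis.commands.search.field import TextField, VectorField
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.search.query import Query

# One-time index creation over hashes stored under the cache: prefix
redis_client.ft("cache_idx").create_index(
    [
        TextField("response"),
        VectorField(
            "embedding",
            "FLAT",  # exact search; consider "HNSW" for approximate search on large caches
            {"TYPE": "FLOAT32", "DIM": 1536, "DISTANCE_METRIC": "COSINE"},
        ),
    ],
    definition=IndexDefinition(prefix=["cache:"], index_type=IndexType.HASH),
)

def vector_lookup(query_embedding: list[float], k: int = 1):
    # KNN query: returns the k nearest cached entries with their cosine distance
    q = (
        Query(f"*=>[KNN {k} @embedding $vec AS score]")
        .sort_by("score")
        .return_fields("response", "score")
        .dialect(2)
    )
    vec = np.array(query_embedding, dtype=np.float32).tobytes()
    return redis_client.ft("cache_idx").search(q, query_params={"vec": vec}).docs

Note that this stores each entry as a Redis hash with the embedding as raw FLOAT32 bytes, rather than the JSON strings used earlier, and RediSearch reports cosine distance (lower is closer) rather than similarity, so the threshold check inverts accordingly.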

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.