RAG Patterns That Actually Work in Production

Everyone’s building RAG systems. Most of them are terrible. Let me show you patterns that actually work when real users hit your system.

The Naive RAG (Don’t Do This)

def naive_rag(question):
    """Answer ``question`` with a bare retrieve-then-generate pipeline.

    Embeds the question, asks the vector store for its top 5 results, joins
    their contents into a single prompt, and returns the LLM's output. There
    is no filtering, ranking, citation tracking, or fallback of any kind.
    """
    query_vector = get_embedding(question)
    hits = vector_store.search(query_vector, top_k=5)

    # One document body per line, in the order the store returned them.
    passages = "\n".join(hit.content for hit in hits)

    return llm.generate(f"Context: {passages}\n\nQuestion: {question}")

This works in demos. It fails in production because:

- No relevance filtering
- No context ranking
- No handling of multi-intent queries
- No citation tracking
- No fallback for poor retrieval

Pattern 1: Hybrid Search

Vector search alone misses exact matches. Keyword search alone misses semantic similarity. Combine them.

def hybrid_search(query: str, top_k: int = 5) -> List[Document]:
    """Retrieve documents for ``query`` by fusing vector and BM25 rankings.

    Each retriever is asked for ``top_k * 2`` candidates so that fusion has
    more than ``top_k`` items to choose from. The two rankings are merged
    with weighted reciprocal rank fusion (0.7 vector / 0.3 keyword).

    Args:
        query: The user's search text.
        top_k: Maximum number of documents to return.

    Returns:
        Up to ``top_k`` document objects, best fused score first.
    """
    # Vector search
    embedding = embed_model.encode(query)
    vector_results = list(vector_store.search(embedding, top_k=top_k * 2))

    # Keyword search (BM25)
    keyword_results = list(bm25_index.search(query, top_k=top_k * 2))

    # Reciprocal Rank Fusion
    fused = reciprocal_rank_fusion(
        [vector_results, keyword_results],
        weights=[0.7, 0.3],  # Favor vector search
    )

    # reciprocal_rank_fusion yields (doc_id, score) pairs rather than
    # documents, so map every id back to the object it came from. setdefault
    # keeps the first occurrence, i.e. the vector-store copy of a duplicate.
    docs_by_id = {}
    for doc in (*vector_results, *keyword_results):
        docs_by_id.setdefault(doc.id, doc)

    return [docs_by_id[doc_id] for doc_id, _score in fused[:top_k]]

def reciprocal_rank_fusion(result_lists, weights=None, k=60):
    """Fuse several ranked result lists with weighted Reciprocal Rank Fusion.

    Every document earns ``weight / (k + rank)`` from each list it appears
    in, where ``rank`` is its 1-based position in that list. Contributions
    are summed per ``doc.id``.

    Args:
        result_lists: Ranked lists (best first) of objects exposing ``.id``.
        weights: One multiplier per result list. ``None`` weights every list
            equally with 1.0.
        k: Smoothing constant added to the rank; 60 is the common choice.

    Returns:
        A list of ``(doc_id, score)`` tuples sorted by descending score.
        Ties keep first-seen order because the sort is stable.

    Raises:
        ValueError: If ``weights`` is given and its length differs from the
            number of result lists (``zip`` would otherwise silently drop
            the unmatched lists).
    """
    result_lists = list(result_lists)
    if weights is None:
        weights = [1.0] * len(result_lists)
    else:
        weights = list(weights)
        if len(weights) != len(result_lists):
            raise ValueError(
                f"expected {len(result_lists)} weights, got {len(weights)}"
            )

    scores = {}
    for results, weight in zip(result_lists, weights):
        # Ranks start at 1 so the top hit scores weight / (k + 1), matching
        # the standard RRF formula.
        for rank, doc in enumerate(results, start=1):
            scores[doc.id] = scores.get(doc.id, 0) + weight / (k + rank)

    return sorted(scores.items(), key=lambda item: item[1], reverse=True)

Pattern 2: Query Decomposition

Complex, multi-part questions need to be broken down into simpler sub-questions that can each be answered on their own.

async def decompose_and_answer(question: str) -> str:
    """Answer ``question``, splitting it into sub-questions unless it is simple.

    A question assessed as ``"simple"`` goes straight to ``simple_rag``.
    Anything else is decomposed, each sub-question is answered concurrently
    with ``simple_rag``, and the LLM is asked to synthesize one final answer
    from the formatted sub-answers.
    """
    # Guard clause: simple questions skip decomposition entirely.
    if await assess_complexity(question) == "simple":
        return await simple_rag(question)

    parts = await decompose_question(question)

    # Fan out all sub-questions at once; gather returns results in input order.
    partial_answers = await asyncio.gather(*(simple_rag(part) for part in parts))

    synthesis_prompt = f"""
    Original question: {question}
    
    Sub-answers:
    {format_sub_answers(parts, partial_answers)}
    
    Provide a comprehensive answer to the original question.
    """

    final_answer = await llm.generate(synthesis_prompt)
    return final_answer

Pattern 3: Re-ranking

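Your first-pass retrieval isn’t optimal: it is tuned to find candidates quickly, not to order them precisely. Re-rank the candidates with a cross-encoder, which scores each query–document pair directly.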

from sentence_transformers import CrossEncoder

class ReRanker:
    """Re-orders retrieved documents using a cross-encoder relevance model."""

    def __init__(self, model_name: str = 'cross-encoder/ms-marco-MiniLM-L-12-v2'):
        """Load the cross-encoder used for scoring.

        Args:
            model_name: Model identifier passed to ``CrossEncoder``. Defaults
                to the checkpoint this class previously hard-coded.
        """
        self.model = CrossEncoder(model_name)

    def rerank(self, query: str, documents: List[Document], top_k: int = 5):
        """Return the ``top_k`` documents the model scores highest for ``query``.

        Args:
            query: The search text each document is scored against.
            documents: Candidate documents; each must expose ``.content``.
            top_k: Maximum number of documents to return.

        Returns:
            Up to ``top_k`` documents, highest score first. An empty input
            yields an empty list without calling the model.
        """
        if not documents:
            # Nothing to score; avoid handing the model an empty batch.
            return []

        # Score each doc against query
        pairs = [[query, doc.content] for doc in documents]
        scores = self.model.predict(pairs)

        # Sort on the score alone so documents themselves are never compared.
        ranked = sorted(
            zip(documents, scores), key=lambda pair: pair[1], reverse=True
        )

        return [doc for doc, _score in ranked[:top_k]]

Pattern 4: Contextual Compression

Don’t send entire documents. Extract relevant sentences.

def compress_context(query: str, documents: List[Document], top_n: int = 3) -> str:
    """Reduce each document to the sentences most similar to ``query``.

    Each document's content is split into sentences, every sentence is
    embedded and compared to the query embedding by cosine similarity, and
    the ``top_n`` best sentences are kept in their original order.

    Args:
        query: The text used to judge sentence relevance.
        documents: Documents to compress; each must expose ``.content``.
        top_n: Number of sentences to keep per document (at least 1).

    Returns:
        One compressed chunk per document that produced at least one
        sentence, joined by blank lines. Empty input returns ``""``.

    Raises:
        ValueError: If ``top_n`` is less than 1.
    """
    if top_n < 1:
        # A slice of [-0:] would keep every sentence, so reject it explicitly.
        raise ValueError("top_n must be at least 1")
    if not documents:
        return ""

    # The query embedding is the same for every document, so compute it once
    # instead of once per loop iteration.
    query_embedding = embed_model.encode([query])[0]

    compressed_chunks = []
    for doc in documents:
        # Split into sentences
        sentences = sent_tokenize(doc.content)
        if not sentences:
            # Empty content: nothing to embed or rank.
            continue

        # Score each sentence
        sentence_embeddings = embed_model.encode(sentences)
        similarities = cosine_similarity([query_embedding], sentence_embeddings)[0]

        # argsort is ascending, so the last top_n indices are the best
        # matches; sorting the indices restores reading order.
        top_indices = np.argsort(similarities)[-top_n:]
        relevant_sentences = [sentences[i] for i in sorted(top_indices)]

        compressed_chunks.append(" ".join(relevant_sentences))

    return "\n\n".join(compressed_chunks)

Pattern 5: Citation Tracking

Users need to verify your answers. Track sources.

def rag_with_citations(question: str) -> Dict:
    """Answer ``question`` and report which sources the answer can cite.

    Retrieves 5 candidates with ``hybrid_search``, keeps the 3 best after
    re-ranking, numbers them ``[1]``..``[n]`` in the prompt, and asks the LLM
    to cite by those numbers.

    Returns:
        A dict with ``"answer"`` (the LLM output) and ``"sources"``, a list
        with one entry per document holding its 1-based ``id``, ``title``
        (``"Untitled"`` when missing), ``url``, and a 200-character excerpt.
    """
    candidates = hybrid_search(question, top_k=5)
    docs = reranker.rerank(question, candidates, top_k=3)

    # Number passages from 1 so the markers line up with the source ids below.
    context = "\n\n".join(
        f"[{number}] {doc.content}" for number, doc in enumerate(docs, start=1)
    )

    prompt = f"""
    Answer the question using the provided context. 
    Cite your sources using [1], [2], [3] notation.
    
    Context:
    {context}
    
    Question: {question}
    
    Answer with citations:
    """

    answer = llm.generate(prompt)

    sources = []
    for number, doc in enumerate(docs, start=1):
        sources.append({
            "id": number,
            "title": doc.metadata.get("title", "Untitled"),
            "url": doc.metadata.get("url"),
            "excerpt": doc.content[:200],
        })

    return {"answer": answer, "sources": sources}

Pattern 6: Fallback Handling

Sometimes retrieval fails. Handle it gracefully.

async def robust_rag(question: str, min_relevance: float = 0.7) -> str:
    """Answer ``question``, retrying and then backing off when retrieval is poor.

    Retrieval counts as poor when it returns no documents or when the mean
    ``doc.score`` is below ``min_relevance``. On poor retrieval the query is
    reformulated and searched once more; if that is still poor, the LLM is
    given a message acknowledging the limitation instead of the documents.

    Args:
        question: The user's question.
        min_relevance: Minimum acceptable mean document score.

    Returns:
        The generated answer, or the generated limitation message.
    """
    def retrieval_is_poor(docs) -> bool:
        # An empty result is the worst case. Without this check np.mean([])
        # is NaN, and NaN < threshold is False, so the fallback would be
        # skipped exactly when nothing was retrieved.
        # NOTE(review): assumes each document exposes a numeric ``.score`` on
        # a scale comparable to min_relevance - confirm; raw reciprocal rank
        # fusion scores are far below 0.7.
        return not docs or np.mean([doc.score for doc in docs]) < min_relevance

    # hybrid_search is a plain (non-async) function, so it is called
    # directly; awaiting its list result would raise TypeError.
    docs = hybrid_search(question)

    if retrieval_is_poor(docs):  # Poor retrieval
        # Try query reformulation
        reformulated = await reformulate_query(question)
        docs = hybrid_search(reformulated)

        if retrieval_is_poor(docs):
            # Still poor, acknowledge limitation
            return await llm.generate(f"""
            I don't have enough reliable information to answer: {question}
            
            I searched our knowledge base but couldn't find sufficiently relevant information.
            Could you rephrase your question or provide more context?
            """)

    # Proceed with normal RAG
    return await generate_answer(question, docs)

Pattern 7: Chunk Optimization

How you chunk matters more than you think.

class SmartChunker:
    """Splits documents into chunks along section boundaries.

    Relies on ``detect_sections`` and ``split_with_overlap``, which are not
    defined in this class as shown and must be supplied elsewhere (for
    example by a subclass).
    """

    def __init__(self, chunk_size=512, overlap=50):
        """Store the chunking parameters.

        Args:
            chunk_size: Target chunk length, compared against ``len(section)``.
            overlap: Overlap setting; stored here and presumably consumed by
                ``split_with_overlap`` (not shown).
        """
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk_document(self, document: str, metadata: dict) -> List[Chunk]:
        """Chunk ``document``, keeping small sections whole.

        Sections shorter than 1.5x ``chunk_size`` become a single chunk;
        longer ones are split by ``split_with_overlap``.

        Args:
            document: Full document text.
            metadata: Base metadata copied onto every chunk; never mutated.

        Returns:
            Chunks in document order. Every chunk carries its own metadata
            dict with a ``"section"`` key holding the first 50 characters of
            the section it came from.
        """
        chunks = []
        for section in self.detect_sections(document):
            # Tag every chunk with its section, including chunks produced by
            # splitting a large section, so metadata is uniform.
            section_metadata = {**metadata, "section": section[:50]}

            if len(section) < self.chunk_size * 1.5:
                # Keep small sections whole rather than breaking them unnaturally
                pieces = [section]
            else:
                # Split large sections with overlap
                pieces = self.split_with_overlap(section)

            # A fresh dict per chunk so editing one chunk's metadata cannot
            # leak into its siblings or into the caller's dict.
            chunks.extend(
                Chunk(content=piece, metadata=dict(section_metadata))
                for piece in pieces
            )

        return chunks

What Actually Matters

After building a dozen RAG systems, here’s what moves the needle:

1. Retrieval quality matters far more than model quality - GPT-4 with bad retrieval performs worse than GPT-3.5 with good retrieval
2. Chunk strategy matters - Spend time on this. Bad chunks = bad results
3. Hybrid search wins - Especially for domain-specific queries
4. Re-ranking is worth it - 20% better relevance for minimal added latency
5. User feedback loops - Track when users aren’t satisfied. Fix those queries

Production Checklist

Don’t build RAG systems with tutorial-level patterns. These are the patterns that actually work when users depend on your system.