RAG Architecture Maturity Model: From Basic to Production-Grade
Retrieval Augmented Generation (RAG) has become the standard pattern for grounding LLMs with enterprise data. But there’s a massive gap between basic RAG demos and production systems. This post presents a maturity model for RAG architectures.
RAG Maturity Levels
Level 1: Basic RAG
The “hello world” of RAG:
# Level 1: Basic RAG
from openai import OpenAI
import chromadb
client = OpenAI()
chroma = chromadb.Client()
collection = chroma.create_collection("documents")
def basic_rag(query: str) -> str:
# 1. Embed the query
query_embedding = client.embeddings.create(
model="text-embedding-ada-002",
input=query
).data[0].embedding
# 2. Retrieve similar documents
results = collection.query(
query_embeddings=[query_embedding],
n_results=5
)
# 3. Generate response with context
context = "\n".join(results["documents"][0])
response = client.chat.completions.create(
model="gpt-4-turbo",
messages=[
{"role": "system", "content": "Answer based on the provided context."},
{"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"}
]
)
return response.choices[0].message.content
Characteristics:
- Single embedding model
- Basic vector similarity search
- Fixed chunk size (see the ingestion sketch below)
- No relevance filtering
- No source citation
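The Level 1 snippet assumes the collection is already populated. Below is a minimal ingestion sketch for the same setup; the fixed-size character chunker and its default sizes are illustrative choices, not part of the original example.

# Ingestion sketch for Level 1 (chunk sizes are arbitrary defaults)
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 100) -> list[str]:
    """Split text into fixed-size character chunks with a small overlap."""
    chunks = []
    start = 0
    while start < len(text):
        chunks.append(text[start:start + chunk_size])
        start += chunk_size - overlap
    return chunks

def ingest_document(doc_id: str, text: str) -> None:
    chunks = chunk_text(text)
    # Embed chunks with the same model used for queries in basic_rag
    embeddings = [
        client.embeddings.create(
            model="text-embedding-ada-002",
            input=chunk
        ).data[0].embedding
        for chunk in chunks
    ]
    collection.add(
        ids=[f"{doc_id}-{i}" for i in range(len(chunks))],
        documents=chunks,
        embeddings=embeddings
    )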
Level 2: Hybrid RAG
Combining vector and keyword search:
# Level 2: Hybrid RAG with Azure AI Search
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizedQuery
class HybridRAG:
def __init__(self, search_client: SearchClient, openai_client):
self.search = search_client
self.openai = openai_client
def retrieve(self, query: str, top_k: int = 10) -> list[dict]:
# Get query embedding
embedding = self.openai.embeddings.create(
model="text-embedding-ada-002",
input=query
).data[0].embedding
# Hybrid search: vector + keyword
vector_query = VectorizedQuery(
vector=embedding,
k_nearest_neighbors=top_k,
fields="content_vector"
)
results = self.search.search(
search_text=query, # Keyword search
vector_queries=[vector_query], # Vector search
select=["title", "content", "source", "chunk_id"],
top=top_k
)
return [
{
"content": r["content"],
"source": r["source"],
"score": r["@search.score"]
}
for r in results
]
def generate(self, query: str, contexts: list[dict]) -> str:
context_text = "\n\n".join([
f"[Source: {c['source']}]\n{c['content']}"
for c in contexts
])
response = self.openai.chat.completions.create(
model="gpt-4-turbo",
messages=[
{
"role": "system",
"content": """Answer based on the provided context.
Always cite your sources using [Source: X] format.
If the context doesn't contain the answer, say so."""
},
{
"role": "user",
"content": f"Context:\n{context_text}\n\nQuestion: {query}"
}
]
)
return response.choices[0].message.content
Improvements:
- Keyword + vector search
- Source citation
- Explicit handling of missing information
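Wiring the class up takes only a few lines. In the sketch below, the endpoint and key environment variables, the index name, and the sample question are placeholders; the index is assumed to expose the title, content, content_vector, source, and chunk_id fields used above.

# Wiring up HybridRAG (endpoint, key, index name, and question are placeholders)
import os
from openai import OpenAI
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient

search_client = SearchClient(
    endpoint=os.environ["AZURE_SEARCH_ENDPOINT"],
    index_name="documents",
    credential=AzureKeyCredential(os.environ["AZURE_SEARCH_KEY"])
)
rag = HybridRAG(search_client, OpenAI())

question = "What is our refund policy?"
contexts = rag.retrieve(question, top_k=10)
print(rag.generate(question, contexts))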
Level 3: Semantic RAG
Adding semantic ranking and relevance filtering:
# Level 3: Semantic RAG with reranking
class SemanticRAG(HybridRAG):
def __init__(self, search_client, openai_client, reranker_threshold: float = 0.5):
super().__init__(search_client, openai_client)
self.reranker_threshold = reranker_threshold
def retrieve(self, query: str, top_k: int = 10) -> list[dict]:
embedding = self.openai.embeddings.create(
model="text-embedding-ada-002",
input=query
).data[0].embedding
vector_query = VectorizedQuery(
vector=embedding,
k_nearest_neighbors=top_k * 2, # Over-retrieve for reranking
fields="content_vector"
)
results = self.search.search(
search_text=query,
vector_queries=[vector_query],
query_type="semantic", # Enable semantic ranking
semantic_configuration_name="my-semantic-config",
select=["title", "content", "source", "chunk_id"],
top=top_k * 2
)
# Filter by reranker score
filtered = []
for r in results:
reranker_score = r.get("@search.reranker_score", 0)
if reranker_score >= self.reranker_threshold:
filtered.append({
"content": r["content"],
"source": r["source"],
"score": reranker_score
})
# Return top_k after filtering
return sorted(filtered, key=lambda x: x["score"], reverse=True)[:top_k]
Improvements:
- Semantic reranking
- Relevance threshold filtering
- Better precision
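A practical note on tuning: Azure AI Search reranker scores fall on a 0-to-4 scale, so the threshold is on that scale too. The snippet below reuses the clients from the Level 2 sketch; the 1.5 threshold and the sample question are illustrative, and the right cutoff depends on your corpus.

# Using SemanticRAG (reuses search_client from the Level 2 example)
semantic_rag = SemanticRAG(search_client, OpenAI(), reranker_threshold=1.5)

contexts = semantic_rag.retrieve("How do I rotate our API keys?")
if not contexts:
    # Everything fell below the threshold; better to admit it than to
    # generate an answer from weak context.
    print("No sufficiently relevant context found.")
else:
    print(semantic_rag.generate("How do I rotate our API keys?", contexts))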
Level 4: Agentic RAG
Query understanding and multi-step retrieval:
# Level 4: Agentic RAG with query decomposition
from dataclasses import dataclass
from enum import Enum
import json  # needed to parse the structured decomposition response
class QueryIntent(Enum):
FACTUAL = "factual"
COMPARISON = "comparison"
PROCEDURAL = "procedural"
ANALYTICAL = "analytical"
@dataclass
class DecomposedQuery:
original: str
intent: QueryIntent
sub_queries: list[str]
required_sources: list[str]
class AgenticRAG:
def __init__(self, search_client, openai_client):
self.search = search_client
self.openai = openai_client
async def process(self, query: str) -> str:
# Step 1: Analyze and decompose query
decomposed = await self._decompose_query(query)
# Step 2: Retrieve for each sub-query
all_contexts = []
for sub_query in decomposed.sub_queries:
contexts = await self._retrieve(sub_query, decomposed.required_sources)
all_contexts.extend(contexts)
# Step 3: Deduplicate and rank contexts
unique_contexts = self._deduplicate(all_contexts)
# Step 4: Generate response based on intent
return await self._generate(query, unique_contexts, decomposed.intent)
async def _decompose_query(self, query: str) -> DecomposedQuery:
"""Use LLM to understand and decompose query."""
response = self.openai.chat.completions.create(
model="gpt-4-turbo",
response_format={"type": "json_object"},
messages=[
{
"role": "system",
"content": """Analyze the query and return JSON:
{
"intent": "factual|comparison|procedural|analytical",
"sub_queries": ["list of specific questions to answer"],
"required_sources": ["types of documents needed"]
}"""
},
{"role": "user", "content": query}
]
)
result = json.loads(response.choices[0].message.content)
return DecomposedQuery(
original=query,
intent=QueryIntent(result["intent"]),
sub_queries=result["sub_queries"],
required_sources=result["required_sources"]
)
async def _retrieve(self, query: str, source_types: list[str]) -> list[dict]:
"""Retrieve with source filtering."""
embedding = self.openai.embeddings.create(
model="text-embedding-ada-002",
input=query
).data[0].embedding
# Build source filter
source_filter = " or ".join([f"source_type eq '{s}'" for s in source_types])
vector_query = VectorizedQuery(
vector=embedding,
k_nearest_neighbors=10,
fields="content_vector"
)
results = self.search.search(
search_text=query,
vector_queries=[vector_query],
filter=source_filter if source_types else None,
query_type="semantic",
semantic_configuration_name="my-semantic-config",
top=10
)
return [{"content": r["content"], "source": r["source"]} for r in results]
def _deduplicate(self, contexts: list[dict]) -> list[dict]:
"""Remove duplicate contexts."""
seen = set()
unique = []
for ctx in contexts:
content_hash = hash(ctx["content"][:200])
if content_hash not in seen:
seen.add(content_hash)
unique.append(ctx)
return unique
async def _generate(
self,
query: str,
contexts: list[dict],
intent: QueryIntent
) -> str:
"""Generate response tailored to intent."""
intent_instructions = {
QueryIntent.FACTUAL: "Provide a direct, factual answer with citations.",
QueryIntent.COMPARISON: "Structure your response as a comparison, highlighting similarities and differences.",
QueryIntent.PROCEDURAL: "Provide step-by-step instructions.",
QueryIntent.ANALYTICAL: "Provide analysis with reasoning and evidence."
}
context_text = "\n\n".join([
f"[{c['source']}]: {c['content']}"
for c in contexts
])
response = self.openai.chat.completions.create(
model="gpt-4-turbo",
messages=[
{
"role": "system",
"content": f"{intent_instructions[intent]} Always cite sources."
},
{
"role": "user",
"content": f"Context:\n{context_text}\n\nQuestion: {query}"
}
]
)
return response.choices[0].message.content
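The agentic pipeline is async end to end, so it needs an event loop to run. Here is a minimal driver, again reusing the search client wired up earlier; the comparison question is made up to exercise the decomposition path (a COMPARISON intent with one sub-query per region).

# Driving AgenticRAG (the question is illustrative)
import asyncio

async def main() -> None:
    agent = AgenticRAG(search_client, OpenAI())
    answer = await agent.process(
        "How does our EU data retention policy differ from the US policy?"
    )
    print(answer)

asyncio.run(main())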
Level 5: Production RAG
Full production system with evaluation and monitoring:
# Level 5: Production RAG with full observability
import asyncio  # needed for the fire-and-forget evaluation task
import uuid
from dataclasses import dataclass
from datetime import datetime
@dataclass
class RAGResponse:
query_id: str
query: str
response: str
contexts: list[dict]
latency_ms: float
tokens_used: int
model: str
timestamp: datetime
@dataclass
class RAGEvaluation:
query_id: str
relevance_score: float
groundedness_score: float
answer_quality_score: float
citations_valid: bool
class ProductionRAG:
def __init__(
self,
search_client,
openai_client,
cache,
metrics_collector,
evaluator
):
self.search = search_client
self.openai = openai_client
self.cache = cache
self.metrics = metrics_collector
self.evaluator = evaluator
async def query(self, query: str, user_id: str) -> RAGResponse:
query_id = str(uuid.uuid4())
start_time = datetime.utcnow()
# Check cache
cached = await self.cache.get(query)
if cached:
self.metrics.record("cache_hit", 1)
return cached
# Retrieve
contexts = await self._retrieve(query)
# Generate
response_text, tokens = await self._generate(query, contexts)
# Build response
latency = (datetime.utcnow() - start_time).total_seconds() * 1000
response = RAGResponse(
query_id=query_id,
query=query,
response=response_text,
contexts=contexts,
latency_ms=latency,
tokens_used=tokens,
model="gpt-4-turbo",
timestamp=datetime.utcnow()
)
# Record metrics
self.metrics.record("query_latency", latency)
self.metrics.record("tokens_used", tokens)
self.metrics.record("contexts_retrieved", len(contexts))
# Cache response
await self.cache.set(query, response, ttl=3600)
# Async evaluation
asyncio.create_task(self._evaluate(response))
return response
async def _evaluate(self, response: RAGResponse):
"""Evaluate response quality asynchronously."""
evaluation = await self.evaluator.evaluate(
query=response.query,
response=response.response,
contexts=response.contexts
)
# Record evaluation metrics
self.metrics.record("relevance_score", evaluation.relevance_score)
self.metrics.record("groundedness_score", evaluation.groundedness_score)
# Alert on low quality
if evaluation.groundedness_score < 0.7:
await self._alert_low_quality(response, evaluation)
async def _retrieve(self, query: str) -> list[dict]:
"""Production retrieval with fallback."""
try:
return await self._hybrid_retrieve(query)
except Exception as e:
self.metrics.record("retrieval_error", 1)
# Fallback to keyword-only
return await self._keyword_retrieve(query)
async def _generate(self, query: str, contexts: list[dict]) -> tuple[str, int]:
"""Generate with retry and fallback."""
try:
return await self._generate_gpt4(query, contexts)
except Exception as e:
self.metrics.record("generation_fallback", 1)
return await self._generate_gpt35(query, contexts)
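ProductionRAG takes the evaluator as a constructor dependency but never defines it. One way to fill that slot is an LLM-as-judge scorer; the interface below (an async evaluate(query, response, contexts) returning a RAGEvaluation) is inferred from how ProductionRAG calls it, and the judging prompt and 0-1 scale are a sketch rather than a fixed recipe.

# Sketch of an LLM-as-judge evaluator that fits ProductionRAG's evaluator slot
import json

class LLMEvaluator:
    def __init__(self, openai_client):
        self.openai = openai_client

    async def evaluate(self, query: str, response: str, contexts: list[dict]) -> RAGEvaluation:
        context_text = "\n\n".join(c["content"] for c in contexts)
        judge = self.openai.chat.completions.create(
            model="gpt-4-turbo",
            response_format={"type": "json_object"},
            messages=[
                {
                    "role": "system",
                    "content": (
                        "Score the answer on a 0-1 scale and return JSON: "
                        '{"relevance": float, "groundedness": float, '
                        '"answer_quality": float, "citations_valid": bool}. '
                        "Groundedness means every claim is supported by the context."
                    )
                },
                {
                    "role": "user",
                    "content": f"Context:\n{context_text}\n\nQuestion: {query}\n\nAnswer: {response}"
                }
            ]
        )
        scores = json.loads(judge.choices[0].message.content)
        return RAGEvaluation(
            query_id="",  # the caller tracks query_id; it is not passed to evaluate()
            relevance_score=scores["relevance"],
            groundedness_score=scores["groundedness"],
            answer_quality_score=scores["answer_quality"],
            citations_valid=scores["citations_valid"]
        )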
Maturity Assessment Checklist
| Level | Capability | Have It? |
|---|---|---|
| 1 | Basic vector search | |
| 1 | Single embedding model | |
| 2 | Hybrid search (vector + keyword) | |
| 2 | Source citations | |
| 3 | Semantic reranking | |
| 3 | Relevance filtering | |
| 4 | Query decomposition | |
| 4 | Multi-step retrieval | |
| 4 | Intent-aware generation | |
| 5 | Response caching | |
| 5 | Quality evaluation | |
| 5 | Comprehensive monitoring | |
| 5 | Fallback strategies | |
Conclusion
Most organizations start at Level 1-2 and need to reach Level 4-5 for production. Focus on:
- Hybrid search - Vector-only misses keyword matches
- Semantic reranking - Critical for precision
- Query understanding - Complex queries need decomposition
- Evaluation - You can’t improve what you don’t measure
- Observability - Monitor everything in production
The journey from demo to production RAG is significant, but each level delivers measurable improvements in answer quality.