Semantic Router: Intent-Based LLM Routing
Traditional routing uses rules and keywords. Semantic routing understands intent. By using embeddings to match user queries to predefined intents, we can route to the most appropriate model or handler without brittle keyword matching.
What is Semantic Routing?
Semantic routing compares the meaning of a user’s query against a set of example utterances. When the query is semantically similar to examples for a particular route, we trigger that route.
from dataclasses import dataclass
import numpy as np
from typing import Optional
@dataclass
class Route:
    """A routing target: example utterances plus the model/prompt to use on match."""
    name: str              # unique route identifier, used as the embedding-dict key
    utterances: list[str]  # example phrasings of this intent
    model: str             # model/deployment matched queries are routed to
    system_prompt: str     # system prompt applied when this route fires
    threshold: float = 0.75  # minimum cosine similarity required to match
class SemanticRouter:
    """Route queries to the most semantically similar `Route`.

    Each route is summarized by the centroid (mean) of its utterance
    embeddings; a query is routed to the route whose centroid has the
    highest cosine similarity, provided it clears that route's threshold.
    """

    def __init__(self, embedding_model: str = "text-embedding-3-small"):
        self.embedding_model = embedding_model
        self.routes: "list[Route]" = []
        # Route name -> centroid embedding of its example utterances.
        self.route_embeddings: dict[str, np.ndarray] = {}
        # Lazily created Azure OpenAI client, reused across embedding calls.
        self._client = None

    def add_route(self, route: "Route"):
        """Add a route and precompute the centroid of its utterance embeddings.

        Raises:
            ValueError: if the route has no utterances — the mean of an
                empty set would be NaN and the route could never match.
        """
        if not route.utterances:
            raise ValueError(f"Route '{route.name}' has no utterances")
        self.routes.append(route)
        # Compute embeddings for all utterances
        embeddings = [
            self._get_embedding(utterance)
            for utterance in route.utterances
        ]
        # Store average embedding as route centroid
        self.route_embeddings[route.name] = np.mean(embeddings, axis=0)

    def route(self, query: str) -> "Optional[tuple[Route, float]]":
        """Return (best_route, similarity), or None if no route clears its threshold."""
        query_embedding = self._get_embedding(query)
        best_route = None
        best_similarity = 0.0
        for route in self.routes:
            route_embedding = self.route_embeddings[route.name]
            similarity = self._cosine_similarity(query_embedding, route_embedding)
            # A route only competes once it clears its own threshold, so a
            # high-scoring-but-below-threshold route never wins.
            if similarity > best_similarity and similarity >= route.threshold:
                best_similarity = similarity
                best_route = route
        if best_route:
            return best_route, best_similarity
        return None

    def _get_embedding(self, text: str) -> np.ndarray:
        """Get embedding for text using Azure OpenAI (one client, reused)."""
        from openai import AzureOpenAI
        if self._client is None:
            # Construct the client once instead of per embedding call.
            self._client = AzureOpenAI()
        response = self._client.embeddings.create(
            model=self.embedding_model,
            input=text
        )
        return np.array(response.data[0].embedding)

    def _cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> float:
        """Cosine similarity; returns 0.0 for zero-norm vectors instead of NaN."""
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        if denom == 0.0:
            return 0.0
        return float(np.dot(a, b) / denom)
# Define routes
# Each route pairs example utterances with the model and system prompt to
# use when a query matches that intent.
code_route = Route(
    name="code_assistance",
    utterances=[
        "Help me write a function",
        "Debug this code",
        "Review my implementation",
        "How do I implement this algorithm",
        "Fix this error in my code",
        "Optimize this function",
    ],
    model="claude-3.5-sonnet",
    system_prompt="You are a senior software engineer. Provide clean, efficient code with explanations."
)

data_route = Route(
    name="data_analysis",
    utterances=[
        "Analyze this dataset",
        "Create a visualization",
        "What insights can you find",
        "Run some statistics on this data",
        "Build a dashboard query",
        "Write SQL for this report",
    ],
    model="gpt-4o",
    system_prompt="You are a data analyst. Focus on actionable insights and clear visualizations."
)

# General chat uses the cheapest model and a relaxed threshold, since its
# utterances are short and vary widely in phrasing.
general_route = Route(
    name="general_chat",
    utterances=[
        "Hello",
        "How are you",
        "Tell me about yourself",
        "What can you do",
        "Thanks for your help",
    ],
    model="gpt-4o-mini",
    system_prompt="You are a helpful assistant. Be concise and friendly.",
    threshold=0.6  # Lower threshold for general chat
)
# Usage
router = SemanticRouter()
router.add_route(code_route)
router.add_route(data_route)
router.add_route(general_route)

# Route queries
queries = [
    "I need help fixing a bug in my Python script",
    "Can you create a chart showing sales trends",
    "Hey there!",
]

# Each query either lands on a route (with its similarity score) or falls
# through when no route clears its threshold.
for query in queries:
    result = router.route(query)
    if result:
        route, similarity = result
        print(f"'{query}' -> {route.name} ({similarity:.2f})")
    else:
        print(f"'{query}' -> No matching route")
Advanced Semantic Router
Add support for per-utterance matching, multi-route (top-k) matching, and embedding caching:
import json
from typing import Any, Callable
@dataclass
class AdvancedRoute:
name: str
utterances: list[str]
model: str
system_prompt: str
threshold: float = 0.75
preprocessor: Callable[[str], str] = None
postprocessor: Callable[[str], any] = None
metadata: dict = None
class AdvancedSemanticRouter:
    """Semantic router that keeps every utterance embedding per route.

    Matching uses the maximum similarity between the query and any single
    utterance (rather than a centroid), which copes better with routes
    whose examples span several sub-topics. All embeddings are cached so
    repeated utterances/queries are only embedded once.
    """

    def __init__(self):
        self.routes: "list[AdvancedRoute]" = []
        # Route name -> one embedding per utterance (not a centroid).
        self.route_embeddings: dict[str, list[np.ndarray]] = {}
        # Text -> embedding, shared by utterances and queries.
        self.embedding_cache: dict[str, np.ndarray] = {}
        # Lazily created Azure OpenAI client, reused across embedding calls.
        self._client = None

    def add_route(self, route: "AdvancedRoute"):
        """Add a route and embed its utterances (cache-aware).

        Raises:
            ValueError: if the route has no utterances — it could never
                match, and `route()` would crash on `max([])`.
        """
        if not route.utterances:
            raise ValueError(f"Route '{route.name}' has no utterances")
        self.routes.append(route)
        # _get_embedding already consults and fills the cache, so the
        # cache lookup is not duplicated here.
        self.route_embeddings[route.name] = [
            self._get_embedding(utterance) for utterance in route.utterances
        ]

    def route(
        self,
        query: str,
        return_scores: bool = False
    ) -> "tuple[AdvancedRoute, float] | dict | None":
        """Route using max similarity to any utterance.

        Returns the per-route score dict when `return_scores` is True,
        (route, similarity) for the best match otherwise, or None when no
        route clears its threshold.
        """
        query_embedding = self._get_embedding(query)
        scores = {}
        for route in self.routes:
            # Max over utterances: one strong example is enough to match.
            similarities = [
                self._cosine_similarity(query_embedding, emb)
                for emb in self.route_embeddings[route.name]
            ]
            max_similarity = max(similarities)
            if max_similarity >= route.threshold:
                scores[route.name] = {
                    "route": route,
                    "similarity": max_similarity,
                    "all_similarities": similarities
                }
        if return_scores:
            return scores
        if not scores:
            return None
        # Return best match
        best = max(scores.values(), key=lambda s: s["similarity"])
        return best["route"], best["similarity"]

    def route_multi(
        self,
        query: str,
        top_k: int = 3
    ) -> "list[tuple[AdvancedRoute, float]]":
        """Return the top-k routes by max-utterance similarity (thresholds ignored)."""
        query_embedding = self._get_embedding(query)
        all_scores = []
        for route in self.routes:
            similarities = [
                self._cosine_similarity(query_embedding, emb)
                for emb in self.route_embeddings[route.name]
            ]
            all_scores.append((route, max(similarities)))
        # Sort by similarity and return top-k
        all_scores.sort(key=lambda pair: pair[1], reverse=True)
        return all_scores[:top_k]

    def _get_embedding(self, text: str) -> np.ndarray:
        """Embed text via Azure OpenAI, caching results and reusing one client."""
        if text in self.embedding_cache:
            return self.embedding_cache[text]
        from openai import AzureOpenAI
        if self._client is None:
            # Construct the client once instead of per embedding call.
            self._client = AzureOpenAI()
        response = self._client.embeddings.create(
            model="text-embedding-3-small",
            input=text
        )
        embedding = np.array(response.data[0].embedding)
        self.embedding_cache[text] = embedding
        return embedding

    def _cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> float:
        """Cosine similarity; returns 0.0 for zero-norm vectors instead of NaN."""
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        if denom == 0.0:
            return 0.0
        return float(np.dot(a, b) / denom)

    def save_embeddings(self, path: str):
        """Persist route embeddings as JSON: route name -> list of vectors."""
        data = {
            route_name: [emb.tolist() for emb in embeddings]
            for route_name, embeddings in self.route_embeddings.items()
        }
        with open(path, 'w') as f:
            json.dump(data, f)

    def load_embeddings(self, path: str):
        """Load pre-computed embeddings saved by `save_embeddings`.

        NOTE(review): this restores only `route_embeddings`; the `routes`
        list (thresholds, prompts, ...) must still be re-registered.
        """
        with open(path, 'r') as f:
            data = json.load(f)
        for route_name, embeddings in data.items():
            self.route_embeddings[route_name] = [
                np.array(emb) for emb in embeddings
            ]
Hybrid Router
Combine semantic routing with other routing strategies:
class HybridSemanticRouter:
    """Layer keyword overrides, semantic matching, context, and a fallback.

    Resolution order: keyword override -> semantic match -> conversation
    context -> fallback route; raises if nothing applies.
    """

    def __init__(self):
        self.semantic_router = AdvancedSemanticRouter()
        # Lowercased keyword -> route that bypasses semantic matching.
        self.keyword_overrides: "dict[str, AdvancedRoute]" = {}
        # Default route when nothing matches; None until set_fallback().
        self.fallback_route: "AdvancedRoute | None" = None

    def add_keyword_override(self, keyword: str, route: "AdvancedRoute"):
        """Add keyword that bypasses semantic matching.

        Matching is a case-insensitive substring test, so "urgent" also
        fires on e.g. "urgently".
        """
        self.keyword_overrides[keyword.lower()] = route

    def set_fallback(self, route: "AdvancedRoute"):
        """Set default route used when nothing else matches."""
        self.fallback_route = route

    def route(
        self,
        query: str,
        context: dict = None
    ) -> "tuple[AdvancedRoute, str, float]":
        """
        Route using hybrid approach.

        Returns (route, routing_method, confidence).

        Raises:
            ValueError: if nothing matches and no fallback is configured.
        """
        query_lower = query.lower()
        # 1. Keyword overrides always win; confidence 1.0 by definition.
        for keyword, override in self.keyword_overrides.items():
            if keyword in query_lower:
                return override, "keyword", 1.0
        # 2. Semantic routing, with the real similarity as confidence.
        result = self.semantic_router.route(query)
        if result:
            matched, similarity = result
            return matched, "semantic", similarity
        # 3. Context-based routing uses a fixed, heuristic confidence.
        if context:
            context_route = self._route_by_context(context)
            if context_route:
                return context_route, "context", 0.7
        # 4. Fallback.
        if self.fallback_route:
            return self.fallback_route, "fallback", 0.5
        raise ValueError("No route found and no fallback configured")

    def _route_by_context(self, context: dict) -> "AdvancedRoute | None":
        """Keep a coding conversation on the code route; None otherwise."""
        # If previous messages were about code, continue with code route
        if context.get("last_route") == "code_assistance":
            for registered in self.semantic_router.routes:
                if registered.name == "code_assistance":
                    return registered
        return None
# Usage
hybrid = HybridSemanticRouter()

# Add routes
hybrid.semantic_router.add_route(code_route)
hybrid.semantic_router.add_route(data_route)
hybrid.semantic_router.add_route(general_route)

# Add keyword overrides for urgent routing
urgent_route = AdvancedRoute(
    name="urgent_support",
    utterances=["This is urgent", "Emergency", "Critical issue"],
    model="gpt-4o",
    system_prompt="You are handling an urgent support request. Be swift and thorough."
)
hybrid.add_keyword_override("urgent", urgent_route)
hybrid.add_keyword_override("emergency", urgent_route)

# Set fallback
hybrid.set_fallback(general_route)

# Route queries
# Keyword overrides are matched on the lowercased query, so "URGENT" fires
# the "urgent" override before any semantic matching happens.
route, method, confidence = hybrid.route("URGENT: my production database is down")
print(f"Routed via {method} to {route.name} (confidence: {confidence})")
Route Training and Optimization
Improve routes based on feedback:
from datetime import datetime
from collections import defaultdict
class TrainableSemanticRouter:
    """Wrap AdvancedSemanticRouter with feedback logging and threshold tuning."""

    def __init__(self):
        self.router = AdvancedSemanticRouter()
        # Chronological log of routing decisions awaiting/holding feedback.
        self.feedback_log: list[dict] = []
        # Route name -> counters accumulated from user feedback.
        self.route_performance: dict[str, dict] = defaultdict(
            lambda: {"correct": 0, "incorrect": 0, "missed": 0}
        )

    def route_and_log(
        self,
        query: str,
        session_id: str
    ) -> "tuple[AdvancedRoute, float] | None":
        """Route `query` and log the decision so feedback can be attached later.

        Returns the router's (route, similarity) result, or None if the
        query matched nothing (the miss is still logged).
        """
        # Function-scope import keeps this sample self-contained.
        from datetime import timezone
        result = self.router.route(query)
        log_entry = {
            "session_id": session_id,
            "query": query,
            # Timezone-aware UTC: datetime.utcnow() is deprecated.
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "routed_to": result[0].name if result else None,
            "confidence": result[1] if result else 0,
            "feedback": None
        }
        self.feedback_log.append(log_entry)
        return result

    def record_feedback(
        self,
        session_id: str,
        was_correct: bool,
        correct_route: str = None
    ):
        """Record user feedback on the latest pending decision for a session."""
        # Search newest-first for the most recent un-reviewed entry.
        for entry in reversed(self.feedback_log):
            if entry["session_id"] == session_id and entry["feedback"] is None:
                entry["feedback"] = {
                    "correct": was_correct,
                    "intended_route": correct_route
                }
                routed = entry["routed_to"]
                # Guard: unrouted queries have routed_to=None and must not
                # create a phantom `None` route in the metrics.
                if routed is not None:
                    if was_correct:
                        self.route_performance[routed]["correct"] += 1
                    else:
                        self.route_performance[routed]["incorrect"] += 1
                if not was_correct and correct_route:
                    self.route_performance[correct_route]["missed"] += 1
                break

    def get_optimization_suggestions(self) -> list[str]:
        """Analyze feedback and suggest improvements.

        Routes with fewer than 10 feedback events are skipped to avoid
        reacting to noise.
        """
        suggestions = []
        for route_name, stats in self.route_performance.items():
            total = stats["correct"] + stats["incorrect"]
            if total < 10:
                continue
            accuracy = stats["correct"] / total
            if accuracy < 0.8:
                suggestions.append(
                    f"Route '{route_name}' has {accuracy:.0%} accuracy. "
                    f"Consider adding more example utterances."
                )
            if stats["missed"] > 5:
                suggestions.append(
                    f"Route '{route_name}' was intended {stats['missed']} times "
                    f"but not matched. Review threshold or add examples."
                )
        return suggestions

    def auto_adjust_thresholds(self):
        """Nudge per-route thresholds based on accumulated feedback.

        Low accuracy raises the threshold by 0.05 (cap 0.9) to be more
        selective; a high miss rate lowers it by 0.05 (floor 0.5). Routes
        with fewer than 20 feedback events are left untouched.
        """
        for route in self.router.routes:
            stats = self.route_performance[route.name]
            total = stats["correct"] + stats["incorrect"]
            if total < 20:
                continue
            accuracy = stats["correct"] / total
            # If accuracy is low, raise threshold (be more selective)
            if accuracy < 0.7:
                route.threshold = min(0.9, route.threshold + 0.05)
                print(f"Raised threshold for {route.name} to {route.threshold}")
            # If many misses, lower threshold (be more inclusive)
            elif stats["missed"] / max(total, 1) > 0.2:
                route.threshold = max(0.5, route.threshold - 0.05)
                print(f"Lowered threshold for {route.name} to {route.threshold}")

    def export_misrouted_queries(self) -> list[dict]:
        """Export queries that were routed incorrectly for manual review."""
        misrouted = [
            entry for entry in self.feedback_log
            if entry["feedback"] and not entry["feedback"]["correct"]
        ]
        return misrouted
Integration with Azure AI Search
Use Azure AI Search for scalable semantic routing:
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
SearchIndex, SearchField, VectorSearch,
HnswAlgorithmConfiguration, VectorSearchProfile
)
from azure.core.credentials import AzureKeyCredential
class AzureSemanticRouter:
    """Semantic router backed by an Azure AI Search vector index.

    One document per utterance is indexed with its embedding, so matching
    scales beyond what fits in process memory.
    """

    def __init__(
        self,
        endpoint: str,
        key: str,
        index_name: str = "routes"
    ):
        self.credential = AzureKeyCredential(key)
        self.index_client = SearchIndexClient(endpoint, self.credential)
        self.search_client = SearchClient(endpoint, index_name, self.credential)
        self.index_name = index_name
        # Lazily created Azure OpenAI client, reused across embedding calls.
        self._client = None

    def setup_index(self):
        """Create (or update) the search index that stores one document per utterance."""
        fields = [
            SearchField(name="id", type="Edm.String", key=True),
            SearchField(name="route_name", type="Edm.String", filterable=True),
            SearchField(name="utterance", type="Edm.String", searchable=True),
            SearchField(
                name="embedding",
                type="Collection(Edm.Single)",
                # 1536 matches text-embedding-3-small's output dimension.
                vector_search_dimensions=1536,
                vector_search_profile_name="vector-profile"
            ),
            SearchField(name="model", type="Edm.String"),
            SearchField(name="system_prompt", type="Edm.String"),
            SearchField(name="threshold", type="Edm.Double"),
        ]
        vector_search = VectorSearch(
            algorithms=[
                HnswAlgorithmConfiguration(name="hnsw-config")
            ],
            profiles=[
                VectorSearchProfile(
                    name="vector-profile",
                    algorithm_configuration_name="hnsw-config"
                )
            ]
        )
        index = SearchIndex(
            name=self.index_name,
            fields=fields,
            vector_search=vector_search
        )
        self.index_client.create_or_update_index(index)

    def add_route(self, route: "AdvancedRoute"):
        """Index one document per utterance, embedding included."""
        documents = []
        for i, utterance in enumerate(route.utterances):
            embedding = self._get_embedding(utterance)
            documents.append({
                "id": f"{route.name}_{i}",
                "route_name": route.name,
                "utterance": utterance,
                "embedding": embedding.tolist(),
                "model": route.model,
                "system_prompt": route.system_prompt,
                "threshold": route.threshold,
            })
        self.search_client.upload_documents(documents)

    def route(self, query: str) -> "tuple[str, str, float] | None":
        """Return (route_name, model, score) for the best hit, or None.

        NOTE(review): `@search.score` is the search service's ranking
        score, not a raw cosine similarity — confirm it is on the same
        scale as the cosine-style `threshold` stored on each document
        before relying on this comparison.
        """
        query_embedding = self._get_embedding(query)
        results = self.search_client.search(
            search_text=None,
            vector_queries=[{
                "vector": query_embedding.tolist(),
                "k_nearest_neighbors": 5,
                "fields": "embedding"
            }],
            select=["route_name", "model", "system_prompt", "threshold"]
        )
        best_result = None
        best_score = 0
        for result in results:
            if result["@search.score"] > best_score:
                # Only hits clearing their own stored threshold may win.
                if result["@search.score"] >= result["threshold"]:
                    best_score = result["@search.score"]
                    best_result = result
        if best_result:
            return best_result["route_name"], best_result["model"], best_score
        return None

    def _get_embedding(self, text: str) -> np.ndarray:
        """Embed text with Azure OpenAI, reusing one client across calls."""
        from openai import AzureOpenAI
        if self._client is None:
            self._client = AzureOpenAI()
        response = self._client.embeddings.create(
            model="text-embedding-3-small",
            input=text
        )
        return np.array(response.data[0].embedding)
Best Practices
- Use diverse utterances: Cover different phrasings of the same intent
- Set appropriate thresholds: Balance precision and recall
- Monitor and iterate: Track routing accuracy and improve
- Cache embeddings: Avoid recomputing for known utterances
- Have fallbacks: Handle unmatched queries gracefully
Conclusion
Semantic routing brings intelligence to LLM orchestration. Instead of brittle keyword rules, you match meaning. This results in more natural interactions and better routing accuracy.
Start with clear intent definitions, diverse examples, and iterate based on real usage patterns.