3 min read
LLM Routing: Intelligent Model Selection for Cost and Quality
Not every query needs GPT-4. Smart routing sends simple queries to cheaper models while reserving powerful (expensive) models for complex tasks.
The Routing Problem
Query: "What's 2+2?" → GPT-3.5 ($0.0005/1K) ✓
Query: "Explain quantum physics" → GPT-4 Turbo ($0.01/1K) ✓
Query: "Debug this complex code" → GPT-4 ($0.03/1K) ✓
Router Implementation
from enum import Enum
import re
class ModelTier(Enum):
    """Model tiers ordered by capability and cost; values are the API model names."""
    SIMPLE = "gpt-3.5-turbo"   # cheapest tier for trivial lookups
    STANDARD = "gpt-4-turbo"   # default tier for most work
    PREMIUM = "gpt-4"          # most capable, most expensive
class QueryRouter:
    """Rule-based router that maps a query string to a ModelTier.

    Heuristics, checked in priority order: high-complexity vocabulary,
    code content, query length, then short lookup-style questions.
    """

    def __init__(self):
        # Keyword buckets used as complexity signals (substring matches).
        self.complexity_keywords = {
            "high": ["analyze", "synthesize", "compare", "evaluate", "debug", "architect"],
            "medium": ["explain", "summarize", "write", "create", "implement"],
            "low": ["what is", "define", "list", "when", "where", "who"]
        }

    def route(self, query: str) -> ModelTier:
        """Route query to appropriate model tier."""
        lowered = query.lower()
        n_words = len(query.split())
        keywords = self.complexity_keywords
        # High-complexity vocabulary wins immediately.
        for marker in keywords["high"]:
            if marker in lowered:
                return ModelTier.PREMIUM
        # Code content usually needs a stronger model.
        if "```" in query or self._has_code_indicators(lowered):
            return ModelTier.STANDARD
        # Very long queries also get the standard tier.
        if n_words > 200:
            return ModelTier.STANDARD
        # Short lookup-style questions can go to the cheapest tier.
        if n_words < 20:
            for marker in keywords["low"]:
                if marker in lowered:
                    return ModelTier.SIMPLE
        return ModelTier.STANDARD  # Default

    def _has_code_indicators(self, text: str) -> bool:
        """Return True when *text* contains common code-like substrings."""
        for pattern in ("function", "def ", "class ", "import ", "const ", "var "):
            if pattern in text:
                return True
        return False
ML-Based Router
from sklearn.ensemble import RandomForestClassifier
import numpy as np
class MLRouter:
    """Learned router that classifies a query into an optimal model name.

    Returns a safe default model until ``train`` has been called.
    """

    def __init__(self):
        # NOTE(review): untuned RandomForest; assumes sklearn is importable here.
        self.model = RandomForestClassifier()
        self.is_trained = False

    def extract_features(self, query: str) -> np.ndarray:
        """Extract a (1, 8) numeric feature row from *query* for classification."""
        lowered = query.lower()  # hoisted: was computed three times
        return np.array([
            len(query),
            len(query.split()),
            query.count("?"),
            int("code" in lowered),
            int("analyze" in lowered),
            int("explain" in lowered),
            int(any(c.isupper() for c in query)),
            query.count("\n")
        ]).reshape(1, -1)

    def train(self, queries: list[str], optimal_models: list[str]):
        """Train router on historical (query, best-model-name) pairs."""
        X = np.vstack([self.extract_features(q) for q in queries])
        self.model.fit(X, optimal_models)
        self.is_trained = True

    def route(self, query: str) -> str:
        """Return the predicted model name, or the default before training."""
        if not self.is_trained:
            return "gpt-4-turbo"  # Default
        features = self.extract_features(query)
        return self.model.predict(features)[0]
Cost Savings Analysis
def analyze_routing_savings(queries: list[str], router: QueryRouter):
    """Calculate cost savings from routing vs. always using the premium tier.

    Args:
        queries: Query strings to route.
        router: Router whose ``route`` returns a ModelTier.

    Returns:
        Dict with baseline cost, routed cost, absolute savings, and the
        savings percentage (0.0 for an empty query list instead of raising
        ZeroDivisionError, which the original did).
    """
    # Per-1K-token prices; NOTE(review): assumed to mirror provider pricing.
    pricing = {
        ModelTier.SIMPLE: 0.0005,
        ModelTier.STANDARD: 0.01,
        ModelTier.PREMIUM: 0.03
    }
    # Baseline: every query sent to the premium model.
    no_routing_cost = len(queries) * pricing[ModelTier.PREMIUM]
    with_routing_cost = sum(pricing[router.route(q)] for q in queries)
    savings = no_routing_cost - with_routing_cost
    return {
        "without_routing": no_routing_cost,
        "with_routing": with_routing_cost,
        "savings": savings,
        # Guard the empty-input case: original crashed on queries == [].
        "savings_percent": (savings / no_routing_cost * 100) if no_routing_cost else 0.0
    }
Best Practices
- Start with rules - Simple heuristics work well
- Collect feedback - Track when routing was wrong
- Train ML router - Use feedback to improve
- Monitor quality - Ensure cheaper models perform adequately
- A/B test - Validate routing decisions
Conclusion
LLM routing can reduce costs by 50-70% without sacrificing quality. Start with rule-based routing and evolve to ML-based as you collect data.