# LLM Routing Strategies for Production Systems
When multiple LLMs are available, the question becomes: which one handles each request? Routing strategies answer that question, and getting them right has a significant impact on cost, latency, and quality.
## The Routing Decision
Every request needs a routing decision based on:
- Task complexity: Simple vs. complex
- Latency requirements: Real-time vs. batch
- Cost constraints: Budget limits
- Quality requirements: Acceptable error rate
- Context size: Token requirements
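One way to make these inputs concrete is a small request descriptor that a router consumes; the `RoutingContext` used by the hybrid router later in this article plays exactly this role. A minimal sketch (the `RoutingRequest` name and fields are illustrative):

```python
from dataclasses import dataclass

@dataclass
class RoutingRequest:
    """Illustrative bundle of the inputs a routing decision needs."""
    task_type: str               # task category, e.g. "code", "classification"
    input_tokens: int            # context size
    max_latency_ms: int          # real-time vs. batch tolerance
    max_cost_per_request: float  # budget ceiling
    max_error_rate: float        # acceptable quality floor
```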
## Static Routing
The simplest approach: fixed rules based on task type.
```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class RoutingRule:
    name: str
    model: str
    conditions: dict


class StaticRouter:
    def __init__(self):
        self.rules = [
            RoutingRule(
                name="code_tasks",
                model="claude-3.5-sonnet",
                conditions={"task_type": "code"}
            ),
            RoutingRule(
                name="classification",
                model="gpt-4o-mini",
                conditions={"task_type": "classification"}
            ),
            RoutingRule(
                name="long_context",
                model="claude-3.5-sonnet",
                conditions={"min_tokens": 50000}
            ),
            RoutingRule(
                name="default",
                model="gpt-4o",
                conditions={}
            ),
        ]

    def route(
        self,
        task_type: str,
        input_tokens: int,
        metadata: Optional[dict] = None
    ) -> str:
        for rule in self.rules:
            if self._matches(rule, task_type, input_tokens, metadata):
                return rule.model
        # Defensive fallback; unreachable while the catch-all "default" rule exists
        return "gpt-4o"

    def _matches(
        self,
        rule: RoutingRule,
        task_type: str,
        input_tokens: int,
        metadata: Optional[dict]
    ) -> bool:
        conditions = rule.conditions
        if "task_type" in conditions and conditions["task_type"] != task_type:
            return False
        if "min_tokens" in conditions and input_tokens < conditions["min_tokens"]:
            return False
        if "max_tokens" in conditions and input_tokens > conditions["max_tokens"]:
            return False
        return True


# Usage
router = StaticRouter()
model = router.route(task_type="code", input_tokens=5000)
print(f"Routing to: {model}")
```
**Pros:** simple, predictable, easy to debug.
**Cons:** doesn't adapt, requires manual tuning.
## Dynamic Routing
Adjust routes based on real-time conditions:
```python
from collections import defaultdict
from datetime import datetime, timedelta
import threading


class DynamicRouter:
    def __init__(self):
        self.model_stats = defaultdict(lambda: {
            "success_count": 0,
            "error_count": 0,
            "total_latency_ms": 0,
            "last_error": None
        })
        self.lock = threading.Lock()
        self.circuit_breakers = {}

    def route(
        self,
        candidates: list[str],
        optimize_for: str = "balanced"
    ) -> str:
        available = [m for m in candidates if not self._is_circuit_open(m)]
        if not available:
            # All circuits open, try the first candidate anyway
            available = candidates[:1]

        if optimize_for == "latency":
            return self._select_fastest(available)
        elif optimize_for == "reliability":
            return self._select_most_reliable(available)
        elif optimize_for == "cost":
            return self._select_cheapest(available)
        else:  # balanced
            return self._select_balanced(available)

    def _select_fastest(self, candidates: list[str]) -> str:
        def avg_latency(model):
            stats = self.model_stats[model]
            # Latency is only recorded for successful calls, so average
            # over successes rather than all attempts
            if stats["success_count"] == 0:
                return float("inf")
            return stats["total_latency_ms"] / stats["success_count"]
        return min(candidates, key=avg_latency)

    def _select_most_reliable(self, candidates: list[str]) -> str:
        def success_rate(model):
            stats = self.model_stats[model]
            total = stats["success_count"] + stats["error_count"]
            if total == 0:
                return 0.5  # Unknown, assume average
            return stats["success_count"] / total
        return max(candidates, key=success_rate)

    def _select_cheapest(self, candidates: list[str]) -> str:
        # Illustrative prices; keep real pricing in configuration
        costs = {
            "gpt-4o": 0.005,
            "claude-3.5-sonnet": 0.003,
            "gpt-4o-mini": 0.00015,
        }
        return min(candidates, key=lambda m: costs.get(m, 0.01))

    def _select_balanced(self, candidates: list[str]) -> str:
        def score(model):
            stats = self.model_stats[model]
            total = stats["success_count"] + stats["error_count"]
            if total < 10 or stats["success_count"] == 0:
                return 0.5  # Not enough data
            success_rate = stats["success_count"] / total
            avg_latency = stats["total_latency_ms"] / stats["success_count"]
            # Normalize latency (assume 2000ms is max acceptable)
            latency_score = max(0, 1 - (avg_latency / 2000))
            return (success_rate * 0.6) + (latency_score * 0.4)
        return max(candidates, key=score)

    def _is_circuit_open(self, model: str) -> bool:
        if model not in self.circuit_breakers:
            return False
        cb = self.circuit_breakers[model]
        if cb["state"] == "open":
            # Check if the cooldown has passed
            if datetime.utcnow() > cb["retry_after"]:
                cb["state"] = "half-open"
                return False
            return True
        return False

    def record_result(
        self,
        model: str,
        success: bool,
        latency_ms: float
    ):
        with self.lock:
            stats = self.model_stats[model]
            if success:
                stats["success_count"] += 1
                stats["total_latency_ms"] += latency_ms
                # Close the circuit breaker after a successful probe
                if model in self.circuit_breakers:
                    if self.circuit_breakers[model]["state"] == "half-open":
                        del self.circuit_breakers[model]
            else:
                stats["error_count"] += 1
                stats["last_error"] = datetime.utcnow()
                # Check whether the circuit should open
                total = stats["success_count"] + stats["error_count"]
                if total >= 10:
                    error_rate = stats["error_count"] / total
                    if error_rate > 0.5:  # >50% errors
                        self.circuit_breakers[model] = {
                            "state": "open",
                            "retry_after": datetime.utcnow() + timedelta(minutes=5)
                        }


# Usage
router = DynamicRouter()

# Route a request
model = router.route(
    candidates=["gpt-4o", "claude-3.5-sonnet"],
    optimize_for="balanced"
)

# After completion, record the result
router.record_result(model, success=True, latency_ms=450)
```
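In practice, the route/record pair is easiest to drive through a small wrapper that times each call and feeds the outcome back to the router. A minimal sketch, assuming a caller-supplied `call_model(model, prompt)` client function (hypothetical; substitute your own client code):

```python
import time

def call_with_routing(router: DynamicRouter, candidates: list[str], prompt: str):
    """Route, call, and record the outcome so future routing decisions improve."""
    model = router.route(candidates, optimize_for="balanced")
    start = time.monotonic()
    try:
        response = call_model(model, prompt)  # hypothetical client call
        router.record_result(model, success=True,
                             latency_ms=(time.monotonic() - start) * 1000)
        return model, response
    except Exception:
        router.record_result(model, success=False,
                             latency_ms=(time.monotonic() - start) * 1000)
        raise
```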
## Content-Based Routing
Analyze the request content to choose the best model:
````python
import re
from typing import Callable


class ContentRouter:
    def __init__(self):
        self.analyzers: list[tuple[Callable[[str], bool], str]] = [
            (self._is_code_heavy, "claude-3.5-sonnet"),
            (self._is_math_heavy, "gpt-4o"),
            (self._is_simple_query, "gpt-4o-mini"),
            (self._has_image, "gpt-4o"),
        ]
        self.default_model = "gpt-4o"

    def route(self, content: str, has_images: bool = False) -> str:
        if has_images:
            return "gpt-4o"  # Vision capable
        for analyzer, model in self.analyzers:
            if analyzer(content):
                return model
        return self.default_model

    def _is_code_heavy(self, content: str) -> bool:
        # Check for code blocks or programming keywords
        code_patterns = [
            r"```\w*\n",        # Code blocks
            r"def \w+\(",       # Python functions
            r"function \w+\(",  # JavaScript functions
            r"class \w+",       # Class definitions
            r"import \w+",      # Import statements
        ]
        matches = sum(1 for p in code_patterns if re.search(p, content))
        return matches >= 2

    def _is_math_heavy(self, content: str) -> bool:
        math_patterns = [
            r"\d+\s*[\+\-\*\/\^]\s*\d+",  # Arithmetic
            r"equation|formula|calculate|solve",
            r"integral|derivative|matrix",
            r"\$.*\$",  # LaTeX
        ]
        matches = sum(1 for p in math_patterns if re.search(p, content, re.I))
        return matches >= 2

    def _is_simple_query(self, content: str) -> bool:
        # Short queries without complexity indicators
        if len(content) > 500:
            return False
        complex_indicators = [
            "explain in detail",
            "analyze",
            "compare and contrast",
            "step by step",
            "comprehensive",
        ]
        return not any(ind in content.lower() for ind in complex_indicators)

    def _has_image(self, content: str) -> bool:
        # Would check for image attachments in a real implementation
        return False


# Usage
router = ContentRouter()

code_prompt = """
```python
def process_data(df):
    # Need help optimizing this
    return df.groupby('category').agg({'value': 'sum'})
```
Review this code for performance issues.
"""
model = router.route(code_prompt)
print(f"Code task routed to: {model}")  # claude-3.5-sonnet

simple_prompt = "What is the capital of France?"
model = router.route(simple_prompt)
print(f"Simple query routed to: {model}")  # gpt-4o-mini
````
## Hybrid Routing
Combine multiple strategies:
```python
from dataclasses import dataclass
from enum import Enum


class RoutingStrategy(Enum):
    STATIC = "static"
    DYNAMIC = "dynamic"
    CONTENT = "content"
    COST = "cost"
    LATENCY = "latency"


@dataclass
class RoutingContext:
    content: str
    task_type: str
    input_tokens: int
    has_images: bool
    max_latency_ms: int
    max_cost: float
    priority: str  # "quality", "speed", "cost"


class HybridRouter:
    def __init__(self):
        self.static_router = StaticRouter()
        self.dynamic_router = DynamicRouter()
        self.content_router = ContentRouter()

    def route(self, context: RoutingContext) -> str:
        candidates = self._get_candidates(context)

        # Apply filters
        candidates = self._filter_by_cost(candidates, context.max_cost)
        candidates = self._filter_by_latency(candidates, context.max_latency_ms)
        candidates = self._filter_by_capability(candidates, context)

        if not candidates:
            raise ValueError("No suitable model found for constraints")

        # Final selection based on priority
        if context.priority == "quality":
            return self._select_highest_quality(candidates)
        elif context.priority == "speed":
            return self.dynamic_router.route(candidates, optimize_for="latency")
        elif context.priority == "cost":
            return self.dynamic_router.route(candidates, optimize_for="cost")
        return self.dynamic_router.route(candidates, optimize_for="balanced")

    def _get_candidates(self, context: RoutingContext) -> list[str]:
        # Start with the content-based suggestion
        content_suggestion = self.content_router.route(
            context.content,
            context.has_images
        )
        # Get the static routing suggestion
        static_suggestion = self.static_router.route(
            context.task_type,
            context.input_tokens
        )
        # Build the candidate list, prioritizing suggestions
        all_models = ["gpt-4o", "claude-3.5-sonnet", "gpt-4o-mini"]
        candidates = []
        if content_suggestion:
            candidates.append(content_suggestion)
        if static_suggestion and static_suggestion not in candidates:
            candidates.append(static_suggestion)
        for model in all_models:
            if model not in candidates:
                candidates.append(model)
        return candidates

    def _filter_by_cost(
        self,
        candidates: list[str],
        max_cost: float
    ) -> list[str]:
        costs = {
            "gpt-4o": 0.005,
            "claude-3.5-sonnet": 0.003,
            "gpt-4o-mini": 0.00015,
        }
        return [m for m in candidates if costs.get(m, 0.01) <= max_cost]

    def _filter_by_latency(
        self,
        candidates: list[str],
        max_latency_ms: int
    ) -> list[str]:
        expected_latency = {
            "gpt-4o": 800,
            "claude-3.5-sonnet": 600,
            "gpt-4o-mini": 300,
        }
        return [m for m in candidates if expected_latency.get(m, 1000) <= max_latency_ms]

    def _filter_by_capability(
        self,
        candidates: list[str],
        context: RoutingContext
    ) -> list[str]:
        if context.has_images:
            vision_models = ["gpt-4o", "claude-3.5-sonnet"]
            return [m for m in candidates if m in vision_models]
        if context.input_tokens > 100000:
            long_context_models = ["claude-3.5-sonnet", "gpt-4o"]
            return [m for m in candidates if m in long_context_models]
        return candidates

    def _select_highest_quality(self, candidates: list[str]) -> str:
        quality_ranking = ["claude-3.5-sonnet", "gpt-4o", "gpt-4o-mini"]
        for model in quality_ranking:
            if model in candidates:
                return model
        return candidates[0]


# Usage
router = HybridRouter()

context = RoutingContext(
    content="Review this Python code for security issues...",
    task_type="code_review",
    input_tokens=2000,
    has_images=False,
    max_latency_ms=2000,
    max_cost=0.01,
    priority="quality"
)
model = router.route(context)
print(f"Selected model: {model}")
```
## A/B Testing Routes
Test routing strategies systematically:
```python
import hashlib
from datetime import datetime
from typing import Optional


class ABTestRouter:
    def __init__(self):
        self.experiments = {
            "code_model_test": {
                "control": {"model": "gpt-4o", "weight": 0.5},
                "treatment": {"model": "claude-3.5-sonnet", "weight": 0.5},
                "task_types": ["code_review", "code_generation"],
            }
        }
        self.results = []

    def route_with_experiment(
        self,
        task_type: str,
        user_id: str
    ) -> tuple[str, Optional[str], Optional[str]]:  # model, experiment_name, variant
        # Find an applicable experiment
        for exp_name, exp_config in self.experiments.items():
            if task_type in exp_config.get("task_types", []):
                variant = self._assign_variant(user_id, exp_name, exp_config)
                model = exp_config[variant]["model"]
                return model, exp_name, variant
        return "gpt-4o", None, None

    def _assign_variant(
        self,
        user_id: str,
        experiment_name: str,
        config: dict
    ) -> str:
        # Deterministic assignment based on user_id
        hash_input = f"{user_id}:{experiment_name}"
        hash_value = int(hashlib.md5(hash_input.encode()).hexdigest(), 16)
        normalized = (hash_value % 1000) / 1000
        control_weight = config["control"]["weight"]
        return "control" if normalized < control_weight else "treatment"

    def record_result(
        self,
        experiment_name: str,
        variant: str,
        success: bool,
        latency_ms: float,
        quality_score: Optional[float] = None
    ):
        self.results.append({
            "timestamp": datetime.utcnow().isoformat(),
            "experiment": experiment_name,
            "variant": variant,
            "success": success,
            "latency_ms": latency_ms,
            "quality_score": quality_score
        })

    def get_experiment_stats(self, experiment_name: str) -> dict:
        exp_results = [r for r in self.results if r["experiment"] == experiment_name]
        stats = {}
        for variant in ["control", "treatment"]:
            variant_results = [r for r in exp_results if r["variant"] == variant]
            if variant_results:
                # Average quality only over results that actually have a score
                scored = [r["quality_score"] for r in variant_results
                          if r["quality_score"] is not None]
                stats[variant] = {
                    "count": len(variant_results),
                    "success_rate": sum(1 for r in variant_results if r["success"]) / len(variant_results),
                    "avg_latency": sum(r["latency_ms"] for r in variant_results) / len(variant_results),
                    "avg_quality": sum(scored) / len(scored) if scored else None
                }
        return stats
```
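A usage sketch tying the pieces together (user IDs and scores are illustrative):

```python
# Usage
router = ABTestRouter()

model, experiment, variant = router.route_with_experiment(
    task_type="code_review",
    user_id="user-123"
)

# After the request completes, attribute the outcome to the variant
if experiment is not None:
    router.record_result(
        experiment_name=experiment,
        variant=variant,
        success=True,
        latency_ms=620,
        quality_score=0.9
    )

print(router.get_experiment_stats("code_model_test"))
```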
## Best Practices
- Start simple: Begin with static routing, add complexity as needed
- Measure everything: You can’t optimize what you don’t measure
- Use circuit breakers: Protect against cascading failures
- Test routing changes: A/B test before full rollout
- Document routing logic: Make decisions traceable; see the logging sketch below
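For the last two points, emitting one structured record per routing decision is usually enough to start. A minimal sketch using the standard library (field names are illustrative):

```python
import json
import logging
from datetime import datetime, timezone

logger = logging.getLogger("routing")

def log_routing_decision(
    model: str,
    strategy: str,
    candidates: list[str],
    reason: str,
) -> None:
    """Emit one structured record per routing decision for later analysis."""
    logger.info(json.dumps({
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "model": model,
        "strategy": strategy,
        "candidates": candidates,
        "reason": reason,  # e.g. "matched rule 'code_tasks'"
    }))
```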
## Conclusion
Good routing is the difference between an efficient multi-model system and a chaotic one. Start with clear routing rules, measure outcomes, and iterate based on data.
The goal is maximizing value: right model, right task, right cost.