Skip to content
Back to Blog
1 min read

Model Routing Patterns: Smart Query Distribution

I wrote “Model Routing Patterns: Smart Query Distribution” to share practical, production-minded guidance on this topic.

Intelligent Model Routing

from azure.ai.openai import AzureOpenAI
from dataclasses import dataclass
from typing import Callable
import numpy as np

@dataclass
class ModelConfig:
    """Static description of one model that ModelRouter can choose from."""

    # Registry key in ModelRouter.models; also passed as the ``model``
    # argument when the query is executed.
    name: str
    # Not read by any code visible in this file -- presumably the deployment
    # URL for the model; TODO confirm.
    endpoint: str
    # Compared against the "max_cost_per_1k" constraint and used to pick the
    # cheapest suitable candidate.
    cost_per_1k_tokens: float
    # Compared against the "max_latency_ms" constraint.
    avg_latency_ms: float
    # Highest estimated query complexity this model is considered suitable for.
    max_complexity: float  # 0-1 scale

class ModelRouter:
    """Pick the cheapest registered model able to handle each query.

    Models are registered as ``ModelConfig`` objects. For every query the
    router estimates a 0-1 complexity score, filters the registry by
    complexity, latency and cost, and selects the cheapest survivor.

    NOTE(review): ``execute`` awaits ``chat.completions.create`` on the
    client, so the client passed in must expose an awaitable API -- confirm
    that an async client is actually supplied.
    """

    def __init__(self, openai_client: "AzureOpenAI"):
        self.openai = openai_client
        self.models = {}  # model name -> ModelConfig
        self.classifier = QueryClassifier(openai_client)

    def register_model(self, config: "ModelConfig"):
        """Register a model for routing (replaces any model with the same name)."""
        self.models[config.name] = config

    async def route(self, query: str, constraints: "dict | None" = None) -> str:
        """Return the name of the model that should serve ``query``.

        Args:
            query: The user query to route.
            constraints: Optional limits. Recognised keys are
                ``"max_latency_ms"`` and ``"max_cost_per_1k"``; a missing
                key means "no limit".

        Returns:
            The cheapest model satisfying complexity, latency and cost. When
            no model satisfies all three, the model with the highest
            ``max_complexity`` is returned regardless of the constraints.

        Raises:
            ValueError: If no model has been registered.
        """
        # Fail fast with a clear message instead of the opaque
        # "max() arg is an empty sequence", and before paying for a
        # classification call that could not be used anyway.
        if not self.models:
            raise ValueError("No models registered; call register_model() first")

        complexity = await self.classifier.estimate_complexity(query)

        constraints = constraints or {}
        max_latency = constraints.get("max_latency_ms", float("inf"))
        max_cost = constraints.get("max_cost_per_1k", float("inf"))

        candidates = [
            config
            for config in self.models.values()
            if config.max_complexity >= complexity
            and config.avg_latency_ms <= max_latency
            and config.cost_per_1k_tokens <= max_cost
        ]

        if not candidates:
            # Nothing fits: fall back to the most capable model.
            return max(self.models.values(), key=lambda m: m.max_complexity).name

        # Cheapest suitable model; ties keep registration order.
        return min(candidates, key=lambda m: m.cost_per_1k_tokens).name

    def estimate_cost(self, response, model_config: "ModelConfig") -> float:
        """Estimate the cost of one completion from its reported token usage.

        Args:
            response: Completion response. Assumes the OpenAI chat-completion
                shape where ``response.usage.total_tokens`` holds the token
                count -- TODO confirm for the client in use.
            model_config: Configuration of the model that served the request.

        Returns:
            ``total_tokens / 1000 * cost_per_1k_tokens``, or ``0.0`` when the
            response carries no usage information.
        """
        usage = getattr(response, "usage", None)
        total_tokens = getattr(usage, "total_tokens", None) or 0
        return total_tokens / 1000 * model_config.cost_per_1k_tokens

    async def execute(self, query: str, constraints: "dict | None" = None) -> dict:
        """Route ``query`` to a model, run it, and report the outcome.

        Args:
            query: The user query, sent as a single user message.
            constraints: Optional routing limits, see ``route``.

        Returns:
            A dict with the keys ``"response"`` (message text),
            ``"model_used"`` (model name) and ``"estimated_cost"``.

        Raises:
            ValueError: If no model has been registered.
        """
        model_name = await self.route(query, constraints)
        model_config = self.models[model_name]

        response = await self.openai.chat.completions.create(
            model=model_name,
            messages=[{"role": "user", "content": query}],
        )

        return {
            "response": response.choices[0].message.content,
            "model_used": model_name,
            "estimated_cost": self.estimate_cost(response, model_config),
        }


class QueryClassifier:
    """Score how complex a query is on a 0-1 scale.

    Cheap text heuristics run first; the LLM is consulted only when the
    heuristic score falls in the ambiguous 0.3-0.7 band.
    """

    def __init__(self, openai_client: "AzureOpenAI"):
        self.openai = openai_client

    async def estimate_complexity(self, query: str) -> float:
        """Estimate query complexity (0-1).

        Args:
            query: The user query to score.

        Returns:
            The heuristic score when it is clearly low (< 0.3) or clearly
            high (> 0.7); otherwise the LLM's score. If the LLM reply cannot
            be parsed, the heuristic score is returned instead.
        """
        heuristic_score = self.heuristic_complexity(query)

        if heuristic_score < 0.3 or heuristic_score > 0.7:
            return heuristic_score

        try:
            return await self.llm_complexity(query)
        except ValueError:
            # A chatty or empty LLM reply must not break routing; the
            # heuristic estimate is still a usable answer.
            return heuristic_score

    def heuristic_complexity(self, query: str) -> float:
        """Fast heuristic-based complexity estimation.

        Adds up to 0.3 for length (one point per 1000 characters), 0.2 for
        reasoning keywords, 0.2 for code indicators and 0.15 for multi-step
        connectives, capped at 1.0.
        """
        lowered = query.lower()
        score = min(len(query) / 1000, 0.3)

        # Substring match, so "how" also matches inside words such as "show".
        complex_patterns = ["why", "how", "explain", "compare", "analyze"]
        if any(p in lowered for p in complex_patterns):
            score += 0.2

        # Code indicators are matched case-sensitively on the raw query.
        if "```" in query or "def " in query or "function" in query:
            score += 0.2

        # Multi-step indicators
        if " and " in lowered or " then " in lowered:
            score += 0.15

        return min(score, 1.0)

    async def llm_complexity(self, query: str) -> float:
        """Use the LLM for complexity estimation.

        Only the first 500 characters of the query are sent. The first
        number found in the reply is used and clamped to the 0-1 range, so
        replies such as "Complexity: 0.8." are accepted.

        Raises:
            ValueError: If the reply is empty or contains no number.
        """
        import re  # local import keeps this block independent of file-level imports

        response = await self.openai.chat.completions.create(
            model="gpt-4o-mini",  # Use cheap model for classification
            messages=[{
                "role": "user",
                "content": f"Rate complexity 0-1: {query[:500]}"
            }]
        )
        content = response.choices[0].message.content
        match = re.search(r"-?\d*\.?\d+", content or "")
        if match is None:
            raise ValueError(
                f"Could not parse a complexity score from LLM reply: {content!r}"
            )
        return min(max(float(match.group()), 0.0), 1.0)

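Smart routing can reduce AI costs by 50-70% while maintaining quality.

## Takeaways

Score each query with cheap heuristics first and call an LLM classifier only for the ambiguous middle band. Then route to the cheapest registered model that meets the query's complexity, latency, and cost limits, falling back to the most capable model when nothing fits. As a next step, register your own models with measured cost and latency figures and tune the complexity thresholds against real traffic.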

Michael John Peña

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.