Multi-LLM Strategies: Building Resilient AI Applications
Relying on a single LLM provider creates a single point of failure: one outage, price change, or capability gap affects your whole application. Multi-LLM strategies improve resilience, cost efficiency, and capability coverage. Here’s how to implement them.
Why Multi-LLM?
- Resilience: Failover when one provider has outages
- Cost optimization: Route to cheaper models when possible
- Capability matching: Use best model for each task
- Vendor independence: Avoid lock-in
Architecture Patterns
Pattern 1: Fallback Chain
class FallbackLLMClient:
    def __init__(self, clients: list[dict]):
        self.clients = clients  # Ordered by preference

    async def complete(self, prompt: str, **kwargs) -> tuple[str, str]:
        """Try each client in order until one succeeds."""
        last_error = None
        for config in self.clients:
            try:
                client = config["client"]
                model = config["model"]
                response = await self._call_client(client, model, prompt, **kwargs)
                return response, config["name"]
            except Exception as e:
                last_error = e
                print(f"Failed {config['name']}: {e}, trying next...")
                continue
        raise Exception(f"All LLM providers failed. Last error: {last_error}")

    async def _call_client(self, client, model, prompt, **kwargs):
        # Implementation varies by client type
        pass

# Usage
clients = [
    {"name": "azure-openai", "client": azure_client, "model": "gpt-4"},
    {"name": "openai-direct", "client": openai_client, "model": "gpt-4"},
    {"name": "anthropic", "client": anthropic_client, "model": "claude-3-opus"},
]

llm = FallbackLLMClient(clients)
response, provider = await llm.complete("Analyze this data...")
print(f"Response from {provider}")
Pattern 2: Load Balancing
import random
from collections import defaultdict

class LoadBalancedLLM:
    def __init__(self, endpoints: list[dict]):
        self.endpoints = endpoints
        self.weights = [e.get("weight", 1) for e in endpoints]
        self.health = {e["name"]: True for e in endpoints}
        self.error_counts = defaultdict(int)

    def select_endpoint(self) -> dict:
        """Select an endpoint based on weights and health."""
        healthy = [
            (e, w) for e, w in zip(self.endpoints, self.weights)
            if self.health[e["name"]]
        ]
        if not healthy:
            # All unhealthy: reset and try anyway
            self.health = {e["name"]: True for e in self.endpoints}
            healthy = list(zip(self.endpoints, self.weights))
        endpoints, weights = zip(*healthy)
        return random.choices(endpoints, weights=weights)[0]

    async def complete(self, prompt: str, retries: int = 3, **kwargs) -> str:
        endpoint = self.select_endpoint()
        try:
            response = await self._call(endpoint, prompt, **kwargs)
            self.error_counts[endpoint["name"]] = 0
            return response
        except Exception:
            self.error_counts[endpoint["name"]] += 1
            # Mark unhealthy after 3 consecutive errors
            if self.error_counts[endpoint["name"]] >= 3:
                self.health[endpoint["name"]] = False
            if retries <= 0:
                raise
            # Retry, most likely with a different endpoint
            return await self.complete(prompt, retries=retries - 1, **kwargs)

    async def _call(self, endpoint: dict, prompt: str, **kwargs) -> str:
        # Implementation varies by endpoint, as in the fallback example
        ...
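Usage mirrors the fallback client; the endpoint names, client objects, and weights below are purely illustrative:

# Illustrative: send roughly 75% of traffic to the higher-weighted endpoint
llm = LoadBalancedLLM([
    {"name": "azure-eastus", "client": azure_east, "model": "gpt-4", "weight": 3},
    {"name": "azure-westus", "client": azure_west, "model": "gpt-4", "weight": 1},
])
response = await llm.complete("Summarize this report...")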
Pattern 3: Task-Based Routing
class TaskRouter:
    def __init__(self):
        self.routes = {
            "code": {"provider": "openai", "model": "gpt-4-turbo"},
            "analysis": {"provider": "anthropic", "model": "claude-3-opus"},
            "simple_qa": {"provider": "openai", "model": "gpt-3.5-turbo"},
            "long_document": {"provider": "anthropic", "model": "claude-3-opus"},
            "creative": {"provider": "openai", "model": "gpt-4-turbo"},
            "default": {"provider": "openai", "model": "gpt-4-turbo"},
        }

    def classify_task(self, prompt: str, context_length: int) -> str:
        """Classify the task type from prompt characteristics."""
        prompt_lower = prompt.lower()
        if context_length > 50000:
            return "long_document"
        if any(kw in prompt_lower for kw in ["code", "function", "implement", "debug"]):
            return "code"
        if any(kw in prompt_lower for kw in ["analyze", "compare", "evaluate"]):
            return "analysis"
        if any(kw in prompt_lower for kw in ["write a story", "creative", "imagine"]):
            return "creative"
        if len(prompt) < 100:
            return "simple_qa"
        return "default"

    def route(self, prompt: str, context_length: int = 0) -> dict:
        task_type = self.classify_task(prompt, context_length)
        return self.routes[task_type]
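Routing is then a single call: a short question lands on the cheap model, while a large context is sent to the long-document route.

router = TaskRouter()

router.route("What is the capital of France?")
# {"provider": "openai", "model": "gpt-3.5-turbo"}   (simple_qa)

router.route("Summarize the key risks in this filing", context_length=120_000)
# {"provider": "anthropic", "model": "claude-3-opus"}   (long_document)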
Cost Optimization
class CostAwareLLM:
    # Prices in USD per 1K tokens
    PRICING = {
        "gpt-4-turbo": {"input": 0.01, "output": 0.03},
        "gpt-3.5-turbo": {"input": 0.0005, "output": 0.0015},
        "claude-3-opus": {"input": 0.015, "output": 0.075},
        "claude-3-sonnet": {"input": 0.003, "output": 0.015},
    }

    def __init__(self, budget_per_day: float = 100.0):
        self.budget = budget_per_day
        self.spent_today = 0.0

    def select_model(self, estimated_tokens: int, quality_required: str) -> str:
        """Select a model based on budget and quality needs."""
        remaining_budget = self.budget - self.spent_today
        if quality_required == "high":
            preferred = ["gpt-4-turbo", "claude-3-opus"]
        else:
            preferred = ["gpt-3.5-turbo", "claude-3-sonnet", "gpt-4-turbo"]
        for model in preferred:
            estimated_cost = self._estimate_cost(model, estimated_tokens)
            if estimated_cost < remaining_budget * 0.1:  # Don't spend more than 10% of what's left on one call
                return model
        return preferred[-1]  # Nothing cleared the threshold; fall back to the last option

    def _estimate_cost(self, model: str, tokens: int) -> float:
        # Rough estimate: average of input and output price applied to all tokens
        pricing = self.PRICING.get(model, self.PRICING["gpt-4-turbo"])
        return (tokens / 1000) * (pricing["input"] + pricing["output"]) / 2
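A quick usage sketch. Note that nothing above updates `spent_today`, so the caller has to fold the actual cost back into the running total after each request:

selector = CostAwareLLM(budget_per_day=50.0)

model = selector.select_model(estimated_tokens=4_000, quality_required="standard")
# With the full $50 remaining, gpt-3.5-turbo's ~$0.004 estimate clears the 10% cap
print(model)  # gpt-3.5-turbo

# After the call completes, record what it actually cost
selector.spent_today += selector._estimate_cost(model, 4_000)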
Monitoring
from collections import defaultdict
from dataclasses import dataclass
from datetime import datetime, timedelta

@dataclass
class LLMMetric:
    timestamp: datetime
    provider: str
    model: str
    latency_ms: float
    tokens_in: int
    tokens_out: int
    cost: float
    success: bool

class MultiLLMMonitor:
    def __init__(self):
        self.metrics: list[LLMMetric] = []

    def record(self, metric: LLMMetric):
        self.metrics.append(metric)

    def get_summary(self, hours: int = 24) -> dict:
        cutoff = datetime.utcnow() - timedelta(hours=hours)
        recent = [m for m in self.metrics if m.timestamp > cutoff]
        by_provider = defaultdict(list)
        for m in recent:
            by_provider[m.provider].append(m)
        return {
            provider: {
                "calls": len(metrics),
                "success_rate": sum(m.success for m in metrics) / len(metrics),
                "avg_latency": sum(m.latency_ms for m in metrics) / len(metrics),
                "total_cost": sum(m.cost for m in metrics),
            }
            for provider, metrics in by_provider.items()
        }
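Recording a metric after each call and pulling a rolling summary then looks like this (the numbers are illustrative):

monitor = MultiLLMMonitor()
monitor.record(LLMMetric(
    timestamp=datetime.utcnow(),
    provider="anthropic",
    model="claude-3-opus",
    latency_ms=842.0,
    tokens_in=1200,
    tokens_out=350,
    cost=0.044,
    success=True,
))

monitor.get_summary(hours=24)
# {'anthropic': {'calls': 1, 'success_rate': 1.0, 'avg_latency': 842.0, 'total_cost': 0.044}}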
Best Practices
- Start with fallback - Simple and effective
- Add routing - As you understand usage patterns
- Monitor everything - Latency, cost, success rate
- Test failover - Regularly verify fallback works
- Abstract the interface - Keep application code provider-agnostic (see the sketch after this list)
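One lightweight way to keep that abstraction honest is a small protocol that the load-balancing and routing wrappers above can implement (the fallback client would need a thin adapter, since it also returns the provider name). This is a sketch of the idea, not a prescribed interface:

from typing import Protocol

class CompletionClient(Protocol):
    """The only surface application code should depend on."""
    async def complete(self, prompt: str, **kwargs) -> str: ...

async def summarize(llm: CompletionClient, text: str) -> str:
    # Works the same whether llm is load-balanced, routed, or a single provider
    return await llm.complete(f"Summarize the following text:\n\n{text}")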
Conclusion
Multi-LLM strategies transform AI applications from fragile to resilient. Start with basic fallback, evolve to smart routing, and continuously optimize based on monitoring data.