1 min read
Continuous Evaluation: Monitoring AI Quality in Production
I wrote “Continuous Evaluation: Monitoring AI Quality in Production” to share practical, production-minded guidance on this topic.
Continuous Evaluation System
from dataclasses import dataclass
from typing import Dict, List, Optional
from datetime import datetime
import asyncio
@dataclass
class EvaluationMetric:
    """Configuration for a single quality metric tracked by the evaluator."""

    # Key under which the metric's score is stored and looked up.
    name: str
    # Minimum acceptable score; a score strictly below this fails the metric.
    threshold: float
    # Multiplier applied to the metric's score in the weighted overall score.
    weight: float
class ContinuousEvaluator:
    """Sample production interactions, score them, and alert on low quality.

    Each sampled interaction is scored against every configured metric, a
    weighted overall score is computed, and the result is stored and checked
    for alert conditions.

    NOTE(review): ``AlertHandler`` and the ``evaluate_relevancy``,
    ``evaluate_faithfulness``, ``evaluate_coherence``, ``evaluate_safety``,
    ``store_evaluation``, ``get_recent_evaluations`` and ``calculate_trend``
    methods are not defined in this excerpt; presumably they are provided
    elsewhere (e.g. by a subclass) -- confirm before use.
    """

    # Overall weighted score below which a "critical" alert is sent.
    CRITICAL_OVERALL_THRESHOLD = 0.6

    def __init__(self, config: Dict):
        """Set up the metric list, sampling rate and alert handler.

        Args:
            config: Settings mapping. ``"sample_rate"`` is the fraction of
                interactions to evaluate and defaults to ``0.1``.
        """
        self.metrics = [
            EvaluationMetric("relevancy", threshold=0.7, weight=0.3),
            EvaluationMetric("faithfulness", threshold=0.8, weight=0.3),
            EvaluationMetric("coherence", threshold=0.7, weight=0.2),
            EvaluationMetric("safety", threshold=0.95, weight=0.2),
        ]
        self.sample_rate = config.get("sample_rate", 0.1)
        self.alert_handler = AlertHandler()

    async def evaluate_sample(self, interaction: Dict) -> Optional[Dict]:
        """Evaluate an interaction if it is selected by random sampling.

        Args:
            interaction: Mapping that must contain ``"id"`` plus whatever keys
                the individual metrics read (see ``compute_metric``).

        Returns:
            ``None`` when the interaction is not sampled; otherwise a dict
            with ``timestamp``, ``interaction_id``, ``scores``,
            ``overall_score`` and ``passed``.
        """
        import random

        # random() is in [0.0, 1.0): with ">=" a rate of 0 never samples and
        # a rate of 1 always samples.
        if random.random() >= self.sample_rate:
            return None

        scores = {}
        for metric in self.metrics:
            scores[metric.name] = await self.compute_metric(
                metric.name, interaction
            )

        # Weighted sum of the per-metric scores.
        overall = sum(scores[m.name] * m.weight for m in self.metrics)

        result = {
            # NOTE(review): naive local time; consider a timezone-aware
            # timestamp if evaluations are compared across hosts.
            "timestamp": datetime.now(),
            "interaction_id": interaction["id"],
            "scores": scores,
            "overall_score": overall,
            "passed": self.check_thresholds(scores),
        }

        # Store for trending, then check alert conditions.
        await self.store_evaluation(result)
        await self.check_alerts(result)
        return result

    async def compute_metric(self, metric: str, interaction: Dict) -> float:
        """Compute one quality metric for an interaction.

        Args:
            metric: One of ``"relevancy"``, ``"faithfulness"``,
                ``"coherence"`` or ``"safety"``.
            interaction: Mapping with ``"response"``; ``"question"`` is
                required for relevancy and ``"context"`` is optional
                (defaults to an empty list) for faithfulness.

        Returns:
            The score returned by the matching ``evaluate_*`` method.

        Raises:
            ValueError: If ``metric`` is not a known metric name.
        """
        if metric == "relevancy":
            return await self.evaluate_relevancy(
                interaction["question"],
                interaction["response"],
            )
        if metric == "faithfulness":
            return await self.evaluate_faithfulness(
                interaction["response"],
                interaction.get("context", []),
            )
        if metric == "coherence":
            return await self.evaluate_coherence(interaction["response"])
        if metric == "safety":
            return await self.evaluate_safety(interaction["response"])
        # Fail loudly instead of returning None, which would only surface
        # later as a TypeError in the weighted sum or threshold comparison.
        raise ValueError(f"Unknown metric: {metric!r}")

    def check_thresholds(self, scores: Dict) -> bool:
        """Return True only if every configured metric meets its threshold.

        A metric missing from ``scores`` is treated as a score of 0.
        """
        return all(
            scores.get(metric.name, 0) >= metric.threshold
            for metric in self.metrics
        )

    async def check_alerts(self, result: Dict):
        """Send alerts for failing metrics and a critically low overall score.

        Sends one ``"warning"`` alert per metric below its threshold, and one
        ``"critical"`` alert when ``result["overall_score"]`` is below
        ``CRITICAL_OVERALL_THRESHOLD``.

        Args:
            result: Evaluation dict with ``"scores"`` and ``"overall_score"``.
        """
        # Individual metric alerts. A missing score counts as 0, matching
        # check_thresholds, so a result marked as failed always alerts.
        for metric in self.metrics:
            if result["scores"].get(metric.name, 0) < metric.threshold:
                await self.alert_handler.send(
                    severity="warning",
                    message=f"Quality metric {metric.name} below threshold",
                    details=result,
                )

        # Overall score alert.
        if result["overall_score"] < self.CRITICAL_OVERALL_THRESHOLD:
            await self.alert_handler.send(
                severity="critical",
                message="Overall quality score critically low",
                details=result,
            )

    async def get_trends(self, hours: int = 24) -> Dict:
        """Summarise recent evaluations.

        Args:
            hours: Size of the look-back window passed to
                ``get_recent_evaluations``.

        Returns:
            Dict with ``total_evaluated``, ``pass_rate``, per-metric
            ``avg_scores`` and ``trend``. When there are no evaluations in
            the window, ``pass_rate`` and every average are ``0.0``; check
            ``total_evaluated`` to tell "no data" apart from "all failed".
        """
        evaluations = await self.get_recent_evaluations(hours)
        total = len(evaluations)

        if total:
            pass_rate = sum(e["passed"] for e in evaluations) / total
            avg_scores = {
                metric.name: sum(e["scores"][metric.name] for e in evaluations)
                / total
                for metric in self.metrics
            }
        else:
            # Avoid dividing by zero on an empty window.
            pass_rate = 0.0
            avg_scores = {metric.name: 0.0 for metric in self.metrics}

        return {
            "total_evaluated": total,
            "pass_rate": pass_rate,
            "avg_scores": avg_scores,
            "trend": self.calculate_trend(evaluations),
        }
Continuous evaluation maintains AI quality standards throughout the system lifecycle.

## Takeaways

Add a concise, personal takeaway and recommended next steps here.