Continuous Evaluation: Monitoring AI Quality in Production
Continuous evaluation monitors AI quality in production so that degradation is detected as soon as it starts, not after users notice. Here’s how to implement it.
Continuous Evaluation System
```python
from dataclasses import dataclass
from typing import Dict, List, Optional
from datetime import datetime
import asyncio
import random

import numpy as np


@dataclass
class EvaluationMetric:
    name: str
    threshold: float
    weight: float


class ContinuousEvaluator:
    """Samples live interactions and scores them against weighted quality metrics.

    The per-metric scorers (evaluate_relevancy, evaluate_faithfulness,
    evaluate_coherence, evaluate_safety), the storage hooks (store_evaluation,
    get_recent_evaluations), calculate_trend, and AlertHandler are expected to
    be supplied by the surrounding application.
    """

    def __init__(self, config: Dict):
        self.metrics = [
            EvaluationMetric("relevancy", threshold=0.7, weight=0.3),
            EvaluationMetric("faithfulness", threshold=0.8, weight=0.3),
            EvaluationMetric("coherence", threshold=0.7, weight=0.2),
            EvaluationMetric("safety", threshold=0.95, weight=0.2),
        ]
        self.sample_rate = config.get("sample_rate", 0.1)
        self.alert_handler = AlertHandler()

    async def evaluate_sample(self, interaction: Dict) -> Optional[Dict]:
        """Evaluate a sampled interaction."""
        # Only score a fraction of live traffic to keep evaluation cost bounded.
        if random.random() > self.sample_rate:
            return None

        scores = {}
        for metric in self.metrics:
            score = await self.compute_metric(metric.name, interaction)
            scores[metric.name] = score

        # Weighted average across all metrics
        overall = sum(scores[m.name] * m.weight for m in self.metrics)

        result = {
            "timestamp": datetime.now(),
            "interaction_id": interaction["id"],
            "scores": scores,
            "overall_score": overall,
            "passed": self.check_thresholds(scores),
        }

        # Store for trending
        await self.store_evaluation(result)

        # Check for alerts
        await self.check_alerts(result)

        return result

    async def compute_metric(self, metric: str, interaction: Dict) -> float:
        """Compute a specific quality metric."""
        if metric == "relevancy":
            return await self.evaluate_relevancy(
                interaction["question"],
                interaction["response"],
            )
        elif metric == "faithfulness":
            return await self.evaluate_faithfulness(
                interaction["response"],
                interaction.get("context", []),
            )
        elif metric == "coherence":
            return await self.evaluate_coherence(interaction["response"])
        elif metric == "safety":
            return await self.evaluate_safety(interaction["response"])
        raise ValueError(f"Unknown metric: {metric}")

    def check_thresholds(self, scores: Dict) -> bool:
        """Check that every metric meets its threshold."""
        for metric in self.metrics:
            if scores.get(metric.name, 0) < metric.threshold:
                return False
        return True

    async def check_alerts(self, result: Dict):
        """Check for alerting conditions."""
        # Per-metric alerts
        for metric in self.metrics:
            if result["scores"].get(metric.name, 1) < metric.threshold:
                await self.alert_handler.send(
                    severity="warning",
                    message=f"Quality metric {metric.name} below threshold",
                    details=result,
                )

        # Overall score alert
        if result["overall_score"] < 0.6:
            await self.alert_handler.send(
                severity="critical",
                message="Overall quality score critically low",
                details=result,
            )

    async def get_trends(self, hours: int = 24) -> Dict:
        """Get quality trends over the recent time window."""
        evaluations = await self.get_recent_evaluations(hours)
        if not evaluations:
            return {"total_evaluated": 0}

        return {
            "total_evaluated": len(evaluations),
            "pass_rate": sum(e["passed"] for e in evaluations) / len(evaluations),
            "avg_scores": {
                metric.name: np.mean([e["scores"][metric.name] for e in evaluations])
                for metric in self.metrics
            },
            "trend": self.calculate_trend(evaluations),
        }
```
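The class above leans on scoring and storage hooks it doesn’t define. The sketch below is one hypothetical way to wire it up for a local run, assuming it lives in the same module as `ContinuousEvaluator`: the `AlertHandler` just prints, the `evaluate_*` scorers return fixed placeholder values (in practice they would call an LLM judge, embedding similarity, or a safety classifier), and evaluations are kept in an in-memory list. Everything named here beyond `ContinuousEvaluator` itself is illustrative.

```python
from typing import Dict, List
import asyncio


class AlertHandler:
    """Minimal alert sink; a real deployment would page on-call or post to a channel."""

    async def send(self, severity: str, message: str, details: Dict):
        print(f"[{severity.upper()}] {message}")


class DemoEvaluator(ContinuousEvaluator):
    """Fills in the hooks ContinuousEvaluator expects with simple stand-ins."""

    def __init__(self, config: Dict):
        super().__init__(config)
        self._evaluations: List[Dict] = []  # in-memory store; use a database in production

    # Placeholder scorers -- swap in an LLM judge, embedding similarity, or a safety classifier.
    async def evaluate_relevancy(self, question: str, response: str) -> float:
        return 0.85

    async def evaluate_faithfulness(self, response: str, context: List[str]) -> float:
        return 0.90

    async def evaluate_coherence(self, response: str) -> float:
        return 0.80

    async def evaluate_safety(self, response: str) -> float:
        return 0.99

    async def store_evaluation(self, result: Dict) -> None:
        self._evaluations.append(result)

    async def get_recent_evaluations(self, hours: int) -> List[Dict]:
        return self._evaluations  # a real store would filter by timestamp

    def calculate_trend(self, evaluations: List[Dict]) -> str:
        return "stable"  # e.g. compare the first and second halves of the window


async def main():
    evaluator = DemoEvaluator({"sample_rate": 1.0})  # sample everything for the demo
    await evaluator.evaluate_sample({
        "id": "req-123",
        "question": "What is continuous evaluation?",
        "response": "It samples live traffic and scores it against quality metrics.",
        "context": ["Continuous evaluation monitors AI quality in production."],
    })
    print(await evaluator.get_trends(hours=24))


asyncio.run(main())
```

In a real service, `evaluate_sample` would typically be scheduled off the request path (for example with `asyncio.create_task`) so scoring latency never adds to user-facing latency, and `get_trends` would feed a dashboard or a scheduled report rather than a `print`.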
Continuous evaluation maintains AI quality standards throughout the system lifecycle.