
Continuous Evaluation: Monitoring AI Quality in Production

Continuous evaluation catches AI quality regressions in production by scoring a sample of live traffic against a fixed set of metrics. Here’s how to implement it.

Continuous Evaluation System

The evaluator below scores a sample of production interactions on four weighted metrics, alerts when any score drops below its threshold, and aggregates results into trends.

from dataclasses import dataclass
from datetime import datetime
from typing import Dict, Optional
import random

import numpy as np

@dataclass
class EvaluationMetric:
    """A quality metric with a pass threshold and its weight in the overall score."""
    name: str
    threshold: float
    weight: float

class ContinuousEvaluator:
    def __init__(self, config: Dict):
        # Weights sum to 1.0 so the overall score stays on a 0-1 scale
        self.metrics = [
            EvaluationMetric("relevancy", threshold=0.7, weight=0.3),
            EvaluationMetric("faithfulness", threshold=0.8, weight=0.3),
            EvaluationMetric("coherence", threshold=0.7, weight=0.2),
            EvaluationMetric("safety", threshold=0.95, weight=0.2)
        ]
        # Fraction of production traffic to evaluate (default: 10%)
        self.sample_rate = config.get("sample_rate", 0.1)
        self.alert_handler = AlertHandler()  # alerting client; a stub is sketched below

    async def evaluate_sample(self, interaction: Dict) -> Optional[Dict]:
        """Evaluate a sampled interaction."""
        # Sample a fraction of traffic to keep evaluation cost bounded
        if random.random() > self.sample_rate:
            return None

        scores = {}
        for metric in self.metrics:
            score = await self.compute_metric(metric.name, interaction)
            scores[metric.name] = score

        # Calculate overall score
        overall = sum(
            scores[m.name] * m.weight
            for m in self.metrics
        )

        result = {
            "timestamp": datetime.now(),
            "interaction_id": interaction["id"],
            "scores": scores,
            "overall_score": overall,
            "passed": self.check_thresholds(scores)
        }

        # Store for trending
        await self.store_evaluation(result)

        # Check for alerts
        await self.check_alerts(result)

        return result

    async def compute_metric(self, metric: str, interaction: Dict) -> float:
        """Compute specific quality metric."""
        if metric == "relevancy":
            return await self.evaluate_relevancy(
                interaction["question"],
                interaction["response"]
            )
        elif metric == "faithfulness":
            return await self.evaluate_faithfulness(
                interaction["response"],
                interaction.get("context", [])
            )
        elif metric == "coherence":
            return await self.evaluate_coherence(interaction["response"])
        elif metric == "safety":
            return await self.evaluate_safety(interaction["response"])

    def check_thresholds(self, scores: Dict) -> bool:
        """Check if all metrics meet thresholds."""
        for metric in self.metrics:
            if scores.get(metric.name, 0) < metric.threshold:
                return False
        return True

    async def check_alerts(self, result: Dict):
        """Check for alerting conditions."""
        # Individual metric alerts
        for metric in self.metrics:
            if result["scores"].get(metric.name, 1) < metric.threshold:
                await self.alert_handler.send(
                    severity="warning",
                    message=f"Quality metric {metric.name} below threshold",
                    details=result
                )

        # Overall score alert
        if result["overall_score"] < 0.6:
            await self.alert_handler.send(
                severity="critical",
                message="Overall quality score critically low",
                details=result
            )

    async def get_trends(self, hours: int = 24) -> Dict:
        """Get quality trends over time."""
        evaluations = await self.get_recent_evaluations(hours)
        if not evaluations:
            return {"total_evaluated": 0}

        return {
            "total_evaluated": len(evaluations),
            "pass_rate": sum(e["passed"] for e in evaluations) / len(evaluations),
            "avg_scores": {
                metric.name: np.mean([e["scores"][metric.name] for e in evaluations])
                for metric in self.metrics
            },
            "trend": self.calculate_trend(evaluations)
        }
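
Wiring It Up

The class above leaves the individual scorers, result storage, and AlertHandler unimplemented. Below is a minimal sketch of how it could be wired up end to end, assuming the ContinuousEvaluator definition above is in scope; the logging AlertHandler, the DemoEvaluator subclass, and the random scores are illustrative placeholders, not part of the design. In practice the scorers would call an LLM judge or embedding model, and results would land in a metrics store.

import asyncio
import random

class AlertHandler:
    """Stand-in for the alerting client referenced above: it just logs alerts."""
    async def send(self, severity: str, message: str, details: dict):
        print(f"[{severity.upper()}] {message} (interaction={details.get('interaction_id')})")

class DemoEvaluator(ContinuousEvaluator):
    """Fills in the scorer and storage hooks with trivial stand-ins."""
    def __init__(self, config: dict):
        super().__init__(config)
        self._history = []  # in-memory store; use a real metrics DB in production

    # Replace these with LLM-as-judge or embedding-based scorers
    async def evaluate_relevancy(self, question, response): return random.uniform(0.6, 1.0)
    async def evaluate_faithfulness(self, response, context): return random.uniform(0.6, 1.0)
    async def evaluate_coherence(self, response): return random.uniform(0.6, 1.0)
    async def evaluate_safety(self, response): return random.uniform(0.9, 1.0)

    async def store_evaluation(self, result): self._history.append(result)
    async def get_recent_evaluations(self, hours): return self._history
    def calculate_trend(self, evaluations): return "flat"

async def main():
    evaluator = DemoEvaluator({"sample_rate": 1.0})  # evaluate every request for the demo
    for i in range(5):
        await evaluator.evaluate_sample({
            "id": f"req-{i}",
            "question": "What is continuous evaluation?",
            "response": "It monitors AI quality on live traffic.",
            "context": ["retrieved passage"],
        })
    print(await evaluator.get_trends(hours=24))

asyncio.run(main())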

With sampled scoring, threshold alerts, and trend tracking in place, quality regressions surface early and standards hold throughout the system’s lifecycle.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.