
Continuous Evaluation: Monitoring AI Quality in Production

Continuous evaluation catches AI quality regressions in production by scoring a sample of live traffic against a fixed set of metrics. Here’s how to implement it.

Continuous Evaluation System

The evaluator below scores a sample of production interactions on four weighted metrics, alerts when any score drops below its threshold, and aggregates results into trends.

from dataclasses import dataclass
from datetime import datetime
from typing import Dict, Optional
import random

import numpy as np

@dataclass
class EvaluationMetric:
    """A quality metric with a pass threshold and its weight in the overall score."""
    name: str
    threshold: float
    weight: float

class ContinuousEvaluator:
    def __init__(self, config: Dict):
        # Weights sum to 1.0 so the overall score stays on a 0-1 scale
        self.metrics = [
            EvaluationMetric("relevancy", threshold=0.7, weight=0.3),
            EvaluationMetric("faithfulness", threshold=0.8, weight=0.3),
            EvaluationMetric("coherence", threshold=0.7, weight=0.2),
            EvaluationMetric("safety", threshold=0.95, weight=0.2)
        ]
        # Fraction of production traffic to evaluate (default: 10%)
        self.sample_rate = config.get("sample_rate", 0.1)
        self.alert_handler = AlertHandler()  # alerting client; a stub is sketched below

    async def evaluate_sample(self, interaction: Dict) -> Optional[Dict]:
        """Evaluate a sampled interaction."""
        # Sample a fraction of traffic to keep evaluation cost bounded
        if random.random() > self.sample_rate:
            return None

        scores = {}
        for metric in self.metrics:
            score = await self.compute_metric(metric.name, interaction)
            scores[metric.name] = score

        # Calculate overall score
        overall = sum(
            scores[m.name] * m.weight
            for m in self.metrics
        )

        result = {
            "timestamp": datetime.now(),
            "interaction_id": interaction["id"],
            "scores": scores,
            "overall_score": overall,
            "passed": self.check_thresholds(scores)
        }

        # Store for trending
        await self.store_evaluation(result)

        # Check for alerts
        await self.check_alerts(result)

        return result

    async def compute_metric(self, metric: str, interaction: Dict) -> float:
        """Compute specific quality metric."""
        if metric == "relevancy":
            return await self.evaluate_relevancy(
                interaction["question"],
                interaction["response"]
            )
        elif metric == "faithfulness":
            return await self.evaluate_faithfulness(
                interaction["response"],
                interaction.get("context", [])
            )
        elif metric == "coherence":
            return await self.evaluate_coherence(interaction["response"])
        elif metric == "safety":
            return await self.evaluate_safety(interaction["response"])

    def check_thresholds(self, scores: Dict) -> bool:
        """Check if all metrics meet thresholds."""
        for metric in self.metrics:
            if scores.get(metric.name, 0) < metric.threshold:
                return False
        return True

    async def check_alerts(self, result: Dict):
        """Check for alerting conditions."""
        # Individual metric alerts
        for metric in self.metrics:
            if result["scores"].get(metric.name, 1) < metric.threshold:
                await self.alert_handler.send(
                    severity="warning",
                    message=f"Quality metric {metric.name} below threshold",
                    details=result
                )

        # Overall score alert
        if result["overall_score"] < 0.6:
            await self.alert_handler.send(
                severity="critical",
                message="Overall quality score critically low",
                details=result
            )

    async def get_trends(self, hours: int = 24) -> Dict:
        """Get quality trends over time."""
        evaluations = await self.get_recent_evaluations(hours)
        if not evaluations:
            return {"total_evaluated": 0}

        return {
            "total_evaluated": len(evaluations),
            "pass_rate": sum(e["passed"] for e in evaluations) / len(evaluations),
            "avg_scores": {
                metric.name: np.mean([e["scores"][metric.name] for e in evaluations])
                for metric in self.metrics
            },
            "trend": self.calculate_trend(evaluations)
        }
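
Wiring It Up

The class above leaves the individual scorers, result storage, and AlertHandler unimplemented. Below is a minimal sketch of how it could be wired up end to end, assuming the ContinuousEvaluator definition above is in scope; the logging AlertHandler, the DemoEvaluator subclass, and the random scores are illustrative placeholders, not part of the design. In practice the scorers would call an LLM judge or embedding model, and results would land in a metrics store.

import asyncio
import random

class AlertHandler:
    """Stand-in for the alerting client referenced above: it just logs alerts."""
    async def send(self, severity: str, message: str, details: dict):
        print(f"[{severity.upper()}] {message} (interaction={details.get('interaction_id')})")

class DemoEvaluator(ContinuousEvaluator):
    """Fills in the scorer and storage hooks with trivial stand-ins."""
    def __init__(self, config: dict):
        super().__init__(config)
        self._history = []  # in-memory store; use a real metrics DB in production

    # Replace these with LLM-as-judge or embedding-based scorers
    async def evaluate_relevancy(self, question, response): return random.uniform(0.6, 1.0)
    async def evaluate_faithfulness(self, response, context): return random.uniform(0.6, 1.0)
    async def evaluate_coherence(self, response): return random.uniform(0.6, 1.0)
    async def evaluate_safety(self, response): return random.uniform(0.9, 1.0)

    async def store_evaluation(self, result): self._history.append(result)
    async def get_recent_evaluations(self, hours): return self._history
    def calculate_trend(self, evaluations): return "flat"

async def main():
    evaluator = DemoEvaluator({"sample_rate": 1.0})  # evaluate every request for the demo
    for i in range(5):
        await evaluator.evaluate_sample({
            "id": f"req-{i}",
            "question": "What is continuous evaluation?",
            "response": "It monitors AI quality on live traffic.",
            "context": ["retrieved passage"],
        })
    print(await evaluator.get_trends(hours=24))

asyncio.run(main())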

With sampled scoring, threshold alerts, and trend tracking in place, quality regressions surface early and standards hold throughout the system’s lifecycle.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.