Skip to content
Back to Blog
1 min read

Continuous Evaluation: Monitoring AI Quality in Production

This post shares practical, production-minded guidance on continuous evaluation: sampling live interactions, scoring them against quality metrics, alerting when scores fall below thresholds, and tracking quality trends over time.

Continuous Evaluation System

import asyncio
import random
from dataclasses import dataclass
from datetime import datetime
from statistics import fmean
from typing import Dict, List, Optional

@dataclass
class EvaluationMetric:
    """A named quality metric with its pass threshold and its weight in the overall score."""
    # Metric identifier; ContinuousEvaluator also uses it as the key in a result's "scores" dict.
    name: str
    # Minimum acceptable score: a score strictly below this value fails the metric.
    threshold: float
    # Multiplier applied to this metric's score when the weighted overall score is summed.
    weight: float

class ContinuousEvaluator:
    """Score a random sample of interactions on several quality metrics and alert on failures.

    The class references several hooks that are not defined in this snippet and
    must be supplied elsewhere (for example by a subclass): the async scorers
    ``evaluate_relevancy``, ``evaluate_faithfulness``, ``evaluate_coherence`` and
    ``evaluate_safety``; the async persistence hooks ``store_evaluation`` and
    ``get_recent_evaluations``; and ``calculate_trend``. ``AlertHandler`` is
    likewise defined outside this snippet.
    """

    # Weighted overall score below which a "critical" alert is sent.
    CRITICAL_OVERALL_SCORE = 0.6

    def __init__(self, config: Dict):
        """Set up the metric list, the sampling rate and the alert handler.

        Args:
            config: Settings mapping. Only ``"sample_rate"`` is read; it
                defaults to ``0.1`` when absent.
        """
        self.metrics = [
            EvaluationMetric("relevancy", threshold=0.7, weight=0.3),
            EvaluationMetric("faithfulness", threshold=0.8, weight=0.3),
            EvaluationMetric("coherence", threshold=0.7, weight=0.2),
            EvaluationMetric("safety", threshold=0.95, weight=0.2),
        ]
        self.sample_rate = config.get("sample_rate", 0.1)
        self.alert_handler = AlertHandler()

    async def evaluate_sample(self, interaction: Dict) -> Optional[Dict]:
        """Evaluate ``interaction`` if it is selected by random sampling.

        Args:
            interaction: Mapping with an ``"id"`` key plus whatever keys
                ``compute_metric`` reads for each configured metric.

        Returns:
            ``None`` when the interaction is not sampled; otherwise a dict with
            ``"timestamp"``, ``"interaction_id"``, ``"scores"``,
            ``"overall_score"`` and ``"passed"``.

        Raises:
            ValueError: If a configured metric name has no scorer.
        """
        # random.random() lies in [0.0, 1.0), so ">=" makes sample_rate=0 skip
        # every interaction and sample_rate=1 evaluate every interaction.
        if random.random() >= self.sample_rate:
            return None

        scores = {}
        for metric in self.metrics:
            scores[metric.name] = await self.compute_metric(metric.name, interaction)

        # Weighted sum of the per-metric scores.
        overall = sum(scores[m.name] * m.weight for m in self.metrics)

        result = {
            # NOTE(review): naive local time; confirm whether consumers expect UTC.
            "timestamp": datetime.now(),
            "interaction_id": interaction["id"],
            "scores": scores,
            "overall_score": overall,
            "passed": self.check_thresholds(scores),
        }

        # Persist first so the evaluation is recorded before alerts are sent.
        await self.store_evaluation(result)
        await self.check_alerts(result)

        return result

    async def compute_metric(self, metric: str, interaction: Dict) -> float:
        """Compute one quality metric for ``interaction``.

        Args:
            metric: One of ``"relevancy"``, ``"faithfulness"``, ``"coherence"``
                or ``"safety"``.
            interaction: Mapping with ``"response"``; ``"question"`` is also
                required for relevancy, and ``"context"`` is optional for
                faithfulness (defaults to an empty list).

        Returns:
            The value returned by the matching ``evaluate_*`` scorer.

        Raises:
            ValueError: If ``metric`` is not a known metric name. Failing here
                avoids returning ``None``, which would break the weighted sum
                later.
        """
        if metric == "relevancy":
            return await self.evaluate_relevancy(
                interaction["question"],
                interaction["response"],
            )
        if metric == "faithfulness":
            return await self.evaluate_faithfulness(
                interaction["response"],
                interaction.get("context", []),
            )
        if metric == "coherence":
            return await self.evaluate_coherence(interaction["response"])
        if metric == "safety":
            return await self.evaluate_safety(interaction["response"])
        raise ValueError(f"Unknown evaluation metric: {metric!r}")

    def _failing_metrics(self, scores: Dict) -> List["EvaluationMetric"]:
        """Return the configured metrics whose score is missing or below threshold."""
        # A missing score counts as 0 so that pass/fail and alerting agree.
        return [m for m in self.metrics if scores.get(m.name, 0) < m.threshold]

    def check_thresholds(self, scores: Dict) -> bool:
        """Return ``True`` only if every configured metric meets its threshold.

        Args:
            scores: Mapping of metric name to score; a missing metric counts
                as a score of 0.
        """
        return not self._failing_metrics(scores)

    async def check_alerts(self, result: Dict):
        """Send alerts for failing metrics and for a critically low overall score.

        Sends one ``"warning"`` alert per metric whose score is missing or
        below its threshold, then one ``"critical"`` alert if the overall score
        is below ``CRITICAL_OVERALL_SCORE``.

        Args:
            result: Evaluation dict with ``"scores"`` and ``"overall_score"``.
        """
        for metric in self._failing_metrics(result["scores"]):
            await self.alert_handler.send(
                severity="warning",
                message=f"Quality metric {metric.name} below threshold",
                details=result,
            )

        if result["overall_score"] < self.CRITICAL_OVERALL_SCORE:
            await self.alert_handler.send(
                severity="critical",
                message="Overall quality score critically low",
                details=result,
            )

    async def get_trends(self, hours: int = 24) -> Dict:
        """Summarise the evaluations returned for the last ``hours`` hours.

        Args:
            hours: Size of the look-back window passed to
                ``get_recent_evaluations``.

        Returns:
            Dict with ``"total_evaluated"``, ``"pass_rate"``, ``"avg_scores"``
            (metric name to mean score) and ``"trend"``. When there are no
            evaluations, the count is 0 and ``"pass_rate"``, every average and
            ``"trend"`` are ``None``.
        """
        evaluations = await self.get_recent_evaluations(hours)

        # No data: report that explicitly instead of dividing by zero.
        if not evaluations:
            return {
                "total_evaluated": 0,
                "pass_rate": None,
                "avg_scores": {metric.name: None for metric in self.metrics},
                "trend": None,
            }

        total = len(evaluations)
        return {
            "total_evaluated": total,
            "pass_rate": sum(e["passed"] for e in evaluations) / total,
            "avg_scores": {
                metric.name: fmean(e["scores"][metric.name] for e in evaluations)
                for metric in self.metrics
            },
            "trend": self.calculate_trend(evaluations),
        }

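Continuous evaluation maintains AI quality standards throughout the system lifecycle.

## Takeaways

- Sample a fraction of production traffic rather than scoring every interaction, so evaluation cost stays bounded.
- Score each sampled interaction on several metrics (relevancy, faithfulness, coherence, safety), each with its own threshold and weight.
- Alert on individual metrics that fall below their thresholds, and separately on a critically low overall score.
- Store every evaluation so that pass rates and average scores can be tracked as trends over time.

As a next step, tune the sample rate, thresholds and weights against your own traffic before relying on the alerts.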

Michael John Peña

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.