Skip to content
Back to Blog
1 min read

Model Evaluation Techniques: Beyond Accuracy Metrics

I wrote “Model Evaluation Techniques: Beyond Accuracy Metrics” to share practical, production-minded guidance on this topic.

Comprehensive Evaluation Framework

Build an evaluation framework that captures multiple quality dimensions:

from dataclasses import dataclass
from typing import Dict, List
import numpy as np
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix
)

@dataclass
class EvaluationResult:
    """Record of a single metric compared against its pass/fail threshold."""
    # Identifier of the metric (e.g. "accuracy", "f1").
    metric_name: str
    # Measured value of the metric.
    value: float
    # Minimum value the metric is compared against.
    threshold: float
    # Outcome of the comparison between value and threshold.
    passed: bool
    # Free-form extra information attached to the result.
    details: dict

class ModelEvaluator:
    """Evaluate a fitted classifier against a held-out test set.

    Predictions (and class probabilities, when the model exposes
    ``predict_proba``) are computed once at construction time and reused by
    the evaluation methods. Results are collected in ``self.results``.
    """

    def __init__(self, model, test_data, test_labels):
        """Store the test set and cache the model's outputs on it.

        Args:
            model: Object with a ``predict`` method and, optionally, a
                ``predict_proba`` method.
            test_data: Features passed to ``model.predict``.
            test_labels: Ground-truth labels aligned with ``test_data``.
        """
        self.model = model
        self.X_test = test_data
        self.y_test = test_labels
        self.predictions = model.predict(test_data)
        # Probabilities are optional: not every model exposes predict_proba.
        self.probabilities = model.predict_proba(test_data) if hasattr(model, 'predict_proba') else None
        self.results: List[EvaluationResult] = []

    def evaluate_classification_metrics(self, thresholds: dict) -> List[EvaluationResult]:
        """Evaluate standard classification metrics.

        Computes accuracy plus weighted precision, recall and F1. ROC AUC is
        added when probabilities are available and the test labels contain
        exactly two distinct values.

        Args:
            thresholds: Mapping of metric name to the minimum acceptable
                value. Metrics without an entry use a threshold of 0.0.

        Returns:
            ``self.results``, the evaluator's accumulated result list. Calling
            this method again replaces the earlier entries for the same
            metrics instead of appending duplicates.
        """
        metrics = {
            "accuracy": accuracy_score(self.y_test, self.predictions),
            "precision": precision_score(self.y_test, self.predictions, average='weighted'),
            "recall": recall_score(self.y_test, self.predictions, average='weighted'),
            "f1": f1_score(self.y_test, self.predictions, average='weighted')
        }

        if self.probabilities is not None and len(np.unique(self.y_test)) == 2:
            # Column 1 is used as the score of the positive class.
            metrics["auc_roc"] = roc_auc_score(self.y_test, self.probabilities[:, 1])

        fresh_results = []
        for metric_name, value in metrics.items():
            threshold = thresholds.get(metric_name, 0.0)
            fresh_results.append(EvaluationResult(
                metric_name=metric_name,
                # Coerce numpy scalars so the record holds plain float/bool.
                value=float(value),
                threshold=threshold,
                passed=bool(value >= threshold),
                details={}
            ))

        # Update in place (callers may hold a reference to the list) and drop
        # stale entries for the metrics that were just recomputed.
        self.results[:] = [
            result for result in self.results
            if result.metric_name not in metrics
        ] + fresh_results

        return self.results

Evaluating Fairness Across Groups

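Test model behavior across demographic groups by adding the following method to ModelEvaluator (it relies on the evaluator's stored test labels and predictions):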

def evaluate_fairness(
    self,
    sensitive_feature: np.ndarray,
    groups: list,
    max_accuracy_disparity: float = 0.1,
) -> Dict:
    """Evaluate model fairness across demographic groups.

    For every group that has at least one test sample, computes the accuracy,
    the mean prediction ("positive_rate") and the sample count, then reports
    the spread of accuracy and positive rate across those groups.

    Args:
        sensitive_feature: Per-sample group membership, aligned with
            ``self.y_test`` and ``self.predictions``.
        groups: Group values to evaluate. Groups with no samples are skipped.
        max_accuracy_disparity: Accuracy spread at or above which the
            fairness check fails. Defaults to 0.1.

    Returns:
        Dict with keys ``group_metrics``, ``accuracy_disparity``,
        ``demographic_parity_diff`` and ``fairness_passed``. When no group has
        any samples, the disparities are 0.0 and ``fairness_passed`` is False,
        because nothing was measured.
    """
    # Accept lists/Series as well as arrays so boolean masking always works.
    sensitive = np.asarray(sensitive_feature)
    y_true = np.asarray(self.y_test)
    y_pred = np.asarray(self.predictions)

    group_metrics = {}

    for group in groups:
        mask = sensitive == group
        sample_size = int(mask.sum())
        if sample_size == 0:
            continue

        group_predictions = y_pred[mask]
        group_metrics[group] = {
            "accuracy": float(accuracy_score(y_true[mask], group_predictions)),
            # NOTE(review): the mean is a positive rate only for 0/1 labels.
            "positive_rate": float(group_predictions.mean()),
            "sample_size": sample_size
        }

    if not group_metrics:
        # Nothing was evaluated; max()/min() on an empty list would raise.
        return {
            "group_metrics": group_metrics,
            "accuracy_disparity": 0.0,
            "demographic_parity_diff": 0.0,
            "fairness_passed": False
        }

    accuracies = [m["accuracy"] for m in group_metrics.values()]
    positive_rates = [m["positive_rate"] for m in group_metrics.values()]
    accuracy_disparity = max(accuracies) - min(accuracies)

    return {
        "group_metrics": group_metrics,
        "accuracy_disparity": accuracy_disparity,
        "demographic_parity_diff": max(positive_rates) - min(positive_rates),
        "fairness_passed": bool(accuracy_disparity < max_accuracy_disparity)
    }

Business Impact Evaluation

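Always connect model metrics to business outcomes. A model with lower accuracy but better performance on high-value segments may deliver more business value than a higher-accuracy model that underperforms on critical use cases.

Takeaways

Accuracy alone is not enough to judge a model. Check each metric against an explicit threshold, compare behavior across demographic groups, and tie the results back to the business outcomes the model is meant to improve. As a next step, add these checks to your evaluation pipeline so that every candidate model is measured the same way before release.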

Michael John Peña

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.