Model Evaluation Techniques: Beyond Accuracy Metrics
Evaluating machine learning models requires looking beyond simple accuracy metrics. Production models need evaluation across multiple dimensions including performance, fairness, robustness, and business impact.
Comprehensive Evaluation Framework
Build an evaluation framework that captures multiple quality dimensions:
from dataclasses import dataclass
from typing import Dict, List

import numpy as np
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix
)


@dataclass
class EvaluationResult:
    metric_name: str
    value: float
    threshold: float
    passed: bool
    details: dict


class ModelEvaluator:
    def __init__(self, model, test_data, test_labels):
        self.model = model
        self.X_test = test_data
        self.y_test = test_labels
        self.predictions = model.predict(test_data)
        self.probabilities = model.predict_proba(test_data) if hasattr(model, 'predict_proba') else None
        self.results: List[EvaluationResult] = []

    def evaluate_classification_metrics(self, thresholds: dict) -> List[EvaluationResult]:
        """Evaluate standard classification metrics."""
        metrics = {
            "accuracy": accuracy_score(self.y_test, self.predictions),
            "precision": precision_score(self.y_test, self.predictions, average='weighted'),
            "recall": recall_score(self.y_test, self.predictions, average='weighted'),
            "f1": f1_score(self.y_test, self.predictions, average='weighted')
        }
        if self.probabilities is not None and len(np.unique(self.y_test)) == 2:
            metrics["auc_roc"] = roc_auc_score(self.y_test, self.probabilities[:, 1])
        for metric_name, value in metrics.items():
            if value is not None:
                threshold = thresholds.get(metric_name, 0.0)
                self.results.append(EvaluationResult(
                    metric_name=metric_name,
                    value=value,
                    threshold=threshold,
                    passed=value >= threshold,
                    details={}
                ))
        return self.results
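To show how the pieces fit together, here is a minimal usage sketch. The toy dataset, model, and threshold values below are placeholders chosen for illustration, not part of the framework itself:

# Hypothetical usage: fit a simple classifier and check it against per-metric thresholds.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

evaluator = ModelEvaluator(model, X_test, y_test)
results = evaluator.evaluate_classification_metrics(
    thresholds={"accuracy": 0.85, "precision": 0.80, "recall": 0.80, "f1": 0.80, "auc_roc": 0.90}
)
for r in results:
    print(f"{r.metric_name}: {r.value:.3f} (threshold {r.threshold}, {'pass' if r.passed else 'fail'})")

Each metric fails or passes independently, which makes it easy to gate a deployment on the full set rather than on accuracy alone.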
Evaluating Fairness Across Groups
Add a method to ModelEvaluator that tests model behavior across demographic groups:
    def evaluate_fairness(self, sensitive_feature: np.ndarray, groups: list) -> Dict:
        """Evaluate model fairness across demographic groups."""
        group_metrics = {}
        for group in groups:
            mask = sensitive_feature == group
            if mask.sum() == 0:
                continue
            group_metrics[group] = {
                "accuracy": accuracy_score(self.y_test[mask], self.predictions[mask]),
                "positive_rate": self.predictions[mask].mean(),
                "sample_size": mask.sum()
            }
        accuracies = [m["accuracy"] for m in group_metrics.values()]
        positive_rates = [m["positive_rate"] for m in group_metrics.values()]
        return {
            "group_metrics": group_metrics,
            "accuracy_disparity": max(accuracies) - min(accuracies),
            "demographic_parity_diff": max(positive_rates) - min(positive_rates),
            "fairness_passed": (max(accuracies) - min(accuracies)) < 0.1
        }
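For instance, with a group label aligned to the test set, the fairness check could be invoked as follows. The randomly assigned groups here are purely hypothetical; in practice this would be a real demographic column:

# Hypothetical usage: a synthetic two-group attribute stands in for a real demographic column.
rng = np.random.default_rng(0)
sensitive_feature = rng.choice(["group_a", "group_b"], size=len(y_test))

fairness_report = evaluator.evaluate_fairness(sensitive_feature, groups=["group_a", "group_b"])
print(f"Accuracy disparity: {fairness_report['accuracy_disparity']:.3f}")
print(f"Demographic parity diff: {fairness_report['demographic_parity_diff']:.3f}")
print(f"Fairness passed: {fairness_report['fairness_passed']}")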
Business Impact Evaluation
Always connect model metrics to business outcomes. A model with lower accuracy but better performance on high-value segments may deliver more business value than a higher-accuracy model that underperforms on critical use cases.
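One way to make this concrete is to weight correct and incorrect predictions by a per-example business value and break the result down by segment. The sketch below assumes each test row carries such a value (for example, expected revenue); the row_values and segments arrays are hypothetical inputs, and "value capture rate" is an illustrative metric rather than a standard one:

# Sketch: value-weighted evaluation. Assumes each test row has a business value
# (hypothetical row_values array); correct predictions capture that value, errors forfeit it.
def evaluate_business_impact(y_true, y_pred, row_values, segments):
    """Return captured value overall and per segment (illustrative, not a standard metric)."""
    correct = (y_true == y_pred)
    report = {
        "total_value": row_values.sum(),
        "captured_value": row_values[correct].sum(),
        "segments": {}
    }
    for segment in np.unique(segments):
        mask = segments == segment
        report["segments"][segment] = {
            "accuracy": correct[mask].mean(),
            "value_capture_rate": row_values[mask & correct].sum() / row_values[mask].sum()
        }
    return report

A segment-level report like this often surfaces the trade-off described above: a model can look worse on aggregate accuracy while capturing more value where it matters most.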