Responsible AI: Implementing Fairness Metrics in ML Pipelines
Building responsible AI systems requires measuring and mitigating bias throughout the ML lifecycle. Fairness metrics help identify disparate impact across demographic groups before models reach production.
Understanding Fairness Metrics
Different fairness definitions suit different contexts. Demographic parity compares selection rates across groups, equalized odds compares true and false positive rates, and calibration asks whether predicted scores carry the same meaning for every group.
import numpy as np
from sklearn.metrics import confusion_matrix
from typing import Dict, List
from dataclasses import dataclass


@dataclass
class FairnessReport:
    demographic_parity_difference: float
    equalized_odds_difference: float
    calibration_difference: float
    group_metrics: Dict[str, Dict]


class FairnessEvaluator:
    def __init__(self, y_true: np.ndarray, y_pred: np.ndarray,
                 sensitive_feature: np.ndarray):
        self.y_true = y_true
        self.y_pred = y_pred
        self.sensitive_feature = sensitive_feature
        self.groups = np.unique(sensitive_feature)

    def demographic_parity(self) -> Dict[str, float]:
        """Calculate the selection rate for each group."""
        rates = {}
        for group in self.groups:
            mask = self.sensitive_feature == group
            rates[str(group)] = float(np.mean(self.y_pred[mask]))
        return rates

    def equalized_odds(self) -> Dict[str, Dict[str, float]]:
        """Calculate TPR and FPR for each group."""
        metrics = {}
        for group in self.groups:
            mask = self.sensitive_feature == group
            # labels=[0, 1] keeps the matrix 2x2 even when a group
            # contains only one class, so the unpacking below is safe.
            tn, fp, fn, tp = confusion_matrix(
                self.y_true[mask],
                self.y_pred[mask],
                labels=[0, 1]
            ).ravel()
            tpr = tp / (tp + fn) if (tp + fn) > 0 else 0.0
            fpr = fp / (fp + tn) if (fp + tn) > 0 else 0.0
            metrics[str(group)] = {"tpr": tpr, "fpr": fpr}
        return metrics

    def generate_report(self) -> FairnessReport:
        """Generate a comprehensive fairness report."""
        dp = self.demographic_parity()
        eo = self.equalized_odds()

        dp_values = list(dp.values())
        dp_diff = max(dp_values) - min(dp_values)

        tpr_values = [m["tpr"] for m in eo.values()]
        fpr_values = [m["fpr"] for m in eo.values()]
        eo_diff = max(
            max(tpr_values) - min(tpr_values),
            max(fpr_values) - min(fpr_values)
        )

        return FairnessReport(
            demographic_parity_difference=dp_diff,
            equalized_odds_difference=eo_diff,
            calibration_difference=0.0,  # Simplified: requires predicted probabilities
            group_metrics={
                "demographic_parity": dp,
                "equalized_odds": eo
            }
        )
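To make the usage concrete, here is a minimal run on a small synthetic dataset; the arrays and the group labels "A" and "B" are invented purely for illustration.
# Hypothetical example data: binary labels and predictions, two groups
y_true = np.array([1, 0, 1, 1, 0, 1, 0, 0, 1, 0])
y_pred = np.array([1, 0, 1, 0, 0, 1, 1, 0, 0, 0])
groups = np.array(["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"])

evaluator = FairnessEvaluator(y_true, y_pred, groups)
report = evaluator.generate_report()

print(f"Demographic parity difference: {report.demographic_parity_difference:.3f}")
print(f"Equalized odds difference: {report.equalized_odds_difference:.3f}")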
Setting Thresholds
The four-fifths rule from US employment law suggests that the selection rate for any group should be at least 80% of the rate for the most-favored group. For equalized odds, differences in TPR or FPR greater than 0.1 typically warrant investigation. The check below uses a simple absolute-difference threshold of 0.2 as a stand-in for the ratio test.
def check_fairness_thresholds(report: FairnessReport) -> List[str]:
    """Check if fairness metrics exceed acceptable thresholds."""
    warnings = []
    if report.demographic_parity_difference > 0.2:
        warnings.append(f"Demographic parity violation: {report.demographic_parity_difference:.3f}")
    if report.equalized_odds_difference > 0.1:
        warnings.append(f"Equalized odds violation: {report.equalized_odds_difference:.3f}")
    return warnings
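The four-fifths rule itself is a ratio test rather than a difference test. A minimal sketch of that check, reusing the selection rates from the evaluator above (the helper name disparate_impact_ratio is a choice made here, not a standard API), could look like this:
def disparate_impact_ratio(selection_rates: Dict[str, float]) -> float:
    """Ratio of the lowest group selection rate to the highest."""
    rates = list(selection_rates.values())
    highest = max(rates)
    if highest == 0:
        return 1.0  # No group is selected at all; treat as no disparity
    return min(rates) / highest

ratio = disparate_impact_ratio(evaluator.demographic_parity())
if ratio < 0.8:
    print(f"Four-fifths rule violation: ratio = {ratio:.2f}")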
Integrate fairness evaluation into your CI/CD pipeline to catch bias issues before deployment. Responsible AI is not a one-time check but an ongoing commitment.
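One way to wire this in is a fairness gate in the test suite. The sketch below assumes a pytest-style test and a hypothetical load_validation_data helper that returns labels, predictions, and the sensitive feature for the candidate model; it fails the build whenever any threshold is exceeded.
# test_fairness.py -- hypothetical CI gate
def test_model_meets_fairness_thresholds():
    # load_validation_data is an assumed project-specific helper
    y_true, y_pred, sensitive_feature = load_validation_data()
    report = FairnessEvaluator(y_true, y_pred, sensitive_feature).generate_report()
    violations = check_fairness_thresholds(report)
    assert not violations, "Fairness thresholds exceeded: " + "; ".join(violations)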