Back to Blog
4 min read

Error Analysis for Machine Learning Models

Error analysis helps identify where and why your model fails. The Error Analysis component in Azure ML’s Responsible AI Dashboard provides powerful tools for understanding model errors.

Understanding Error Analysis

Error analysis identifies:

  • Cohorts with high error rates
  • Root causes of model failures
  • Patterns in misclassifications
  • Data quality issues

Setting Up Error Analysis

from raiwidgets import ErrorAnalysisDashboard
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

# Load and prepare data
df = pd.read_csv("customer_data.csv")
X = df.drop("churn", axis=1)  # features: every column except the target
y = df["churn"]  # target: churn label

# NOTE(review): no random_state or stratify is passed, so the split is not
# reproducible and class balance in the test set is not guaranteed — confirm
# this is intended for a tutorial snippet.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train model
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)

# Launch Error Analysis Dashboard
# global_explanation is None, so no precomputed explanation is supplied —
# presumably the dashboard computes what it needs from model + dataset; verify
# against the raiwidgets ErrorAnalysisDashboard API for the installed version.
ErrorAnalysisDashboard(
    global_explanation=None,
    model=model,
    dataset=X_test,
    true_y=y_test,
    categorical_features=["contract_type", "payment_method"],
    features=X_test.columns.tolist()
)

Error Tree Analysis

from erroranalysis import ModelAnalyzer
from erroranalysis._internal.constants import ModelTask

# Create analyzer
# NOTE(review): ModelTask is imported from erroranalysis._internal.constants —
# a private module path that may break across package versions; verify.
analyzer = ModelAnalyzer(
    model=model,
    dataset=X_test,
    true_y=y_test,
    feature_names=X_test.columns.tolist(),
    categorical_features=["contract_type", "payment_method"],
    model_task=ModelTask.CLASSIFICATION
)

# Build error tree
# Tree-growth limits: depth at most 4, at most 31 leaves, and every leaf
# keeps at least 20 samples so reported cohort error rates stay meaningful.
tree = analyzer.compute_error_tree(
    max_depth=4,
    num_leaves=31,
    min_child_samples=20
)

# Analyze tree nodes
def print_tree(node, depth=0):
    """Recursively dump the error tree to stdout, two spaces per level.

    Leaves show their error rate and sample count; internal nodes show the
    split feature and threshold, followed by the left then right subtree.
    """
    pad = depth * "  "
    if not node.is_leaf:
        print(f"{pad}Split on {node.feature} at {node.threshold}")
        for child in (node.left, node.right):
            print_tree(child, depth + 1)
    else:
        print(f"{pad}Leaf: {node.error_rate:.2%} error, {node.size} samples")

print_tree(tree.root)

Identifying Error Cohorts

import numpy as np
from sklearn.metrics import classification_report

class ErrorCohortAnalyzer:
    """Slice a fitted model's errors into per-feature cohorts.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X : pandas DataFrame of features
    y : pandas Series of true labels, index-aligned with ``X``
    """

    def __init__(self, model, X, y):
        self.model = model
        self.X = X
        self.y = y
        self.predictions = model.predict(X)
        # Boolean mask, aligned with X/y, marking misclassified rows.
        self.errors = self.predictions != y

    def find_high_error_cohorts(self, features, bins=10):
        """Find cohorts with high error rates.

        Categorical features yield one cohort per distinct value; numerical
        features are cut into ``bins`` percentile-based bins (roughly equal
        sample counts per bin). Returns a list of cohort dicts sorted by
        descending error rate.
        """
        cohorts = []

        for feature in features:
            if self.X[feature].dtype in ['object', 'category']:
                # Categorical feature: one cohort per observed value.
                for value in self.X[feature].unique():
                    mask = self.X[feature] == value
                    cohorts.append({
                        'feature': feature,
                        'condition': f"{feature} == {value}",
                        'error_rate': self.errors[mask].mean(),
                        'size': mask.sum(),
                        'mask': mask
                    })
            else:
                # Numerical feature: percentile bins.
                percentiles = np.percentile(self.X[feature], np.linspace(0, 100, bins + 1))
                last = len(percentiles) - 2
                for i in range(len(percentiles) - 1):
                    low, high = percentiles[i], percentiles[i + 1]
                    if i == last:
                        # BUG FIX: the last bin must be right-inclusive,
                        # otherwise rows holding the feature's maximum value
                        # fall into no bin and are silently dropped.
                        mask = (self.X[feature] >= low) & (self.X[feature] <= high)
                        condition = f"{feature} in [{low:.2f}, {high:.2f}]"
                    else:
                        mask = (self.X[feature] >= low) & (self.X[feature] < high)
                        condition = f"{feature} in [{low:.2f}, {high:.2f})"
                    size = mask.sum()
                    if size > 0:
                        cohorts.append({
                            'feature': feature,
                            'condition': condition,
                            'error_rate': self.errors[mask].mean(),
                            'size': size,
                            'mask': mask
                        })

        # Worst cohorts first.
        cohorts.sort(key=lambda x: x['error_rate'], reverse=True)
        return cohorts

    def analyze_cohort(self, mask):
        """Detailed analysis of the cohort selected by boolean ``mask``.

        Returns a dict with the cohort size, its error rate, and a full
        sklearn classification report (as a nested dict).
        """
        y_true = self.y[mask]
        y_pred = self.predictions[mask]

        return {
            'size': mask.sum(),
            'error_rate': (y_pred != y_true).mean(),
            'report': classification_report(y_true, y_pred, output_dict=True)
        }

# Usage
# Build the cohort analyzer on the held-out test split and report the five
# cohorts with the highest misclassification rates.
analyzer = ErrorCohortAnalyzer(model, X_test, y_test)
high_error_cohorts = analyzer.find_high_error_cohorts(X_test.columns)

print("Top 5 High-Error Cohorts:")
for cohort in high_error_cohorts[:5]:
    print(f"  {cohort['condition']}: {cohort['error_rate']:.2%} error ({cohort['size']} samples)")

Confusion Matrix Analysis

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

def analyze_confusion_by_cohort(model, X, y, feature, value):
    """Plot the overall confusion matrix next to one cohort's.

    The cohort is ``X[feature] == value`` for object (string) columns and
    ``X[feature] >= value`` for other dtypes. The side-by-side figure is
    saved to ``confusion_comparison.png``.
    """
    # NOTE(review): numerical features use a ">= value" threshold while
    # categorical features use equality — confirm this asymmetry is intended.
    mask = X[feature] == value if X[feature].dtype == 'object' else X[feature] >= value

    y_true = y[mask]
    y_pred = model.predict(X[mask])

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # Overall confusion matrix (all rows) for reference.
    ConfusionMatrixDisplay.from_predictions(y, model.predict(X), ax=axes[0])
    axes[0].set_title("Overall")

    # Cohort confusion matrix.
    ConfusionMatrixDisplay.from_predictions(y_true, y_pred, ax=axes[1])
    axes[1].set_title(f"Cohort: {feature} = {value}")

    plt.tight_layout()
    plt.savefig("confusion_comparison.png")
    # Close the figure so repeated calls don't accumulate open figures
    # (the original also computed confusion_matrix into an unused local).
    plt.close(fig)

analyze_confusion_by_cohort(model, X_test, y_test, "contract_type", "month-to-month")

Error Pattern Detection

class ErrorPatternDetector:
    """Locate characteristic error patterns in a fitted classifier's output."""

    def __init__(self, model, X, y):
        self.model = model
        self.X = X
        self.y = y
        self.predictions = model.predict(X)
        self.probabilities = model.predict_proba(X)

    def _error_table(self, selector):
        """Build a DataFrame of misclassified rows whose top-class
        probability passes ``selector`` (a predicate over the prob array)."""
        wrong = self.predictions != self.y
        top_prob = np.max(self.probabilities, axis=1)
        idx = np.where(wrong & selector(top_prob))[0]
        return pd.DataFrame({
            'index': idx,
            'true_label': self.y.iloc[idx].values,
            'predicted': self.predictions[idx],
            'confidence': top_prob[idx]
        })

    def find_confident_errors(self, threshold=0.9):
        """Find high-confidence incorrect predictions"""
        table = self._error_table(lambda p: p >= threshold)
        # Most confidently-wrong predictions first.
        return table.sort_values('confidence', ascending=False)

    def find_boundary_errors(self, threshold=0.55):
        """Find errors near decision boundary"""
        return self._error_table(lambda p: p <= threshold)

# Usage
# Scan the test split for two complementary failure modes: errors the model
# was very sure about, and errors sitting close to the decision boundary.
detector = ErrorPatternDetector(model, X_test, y_test)

confident_errors = detector.find_confident_errors(threshold=0.9)
print(f"Found {len(confident_errors)} high-confidence errors")

boundary_errors = detector.find_boundary_errors(threshold=0.55)
print(f"Found {len(boundary_errors)} boundary errors")

Actionable Insights

def generate_error_report(analyzer, columns=None):
    """Generate an actionable error-analysis report as a Markdown string.

    Parameters
    ----------
    analyzer : object exposing ``find_high_error_cohorts(columns)``
        (e.g. an ErrorCohortAnalyzer instance).
    columns : iterable of feature names to analyze. Defaults to the columns
        of the module-level ``X_test`` for backward compatibility with the
        original global-dependent behavior.
    """
    report = []

    # Preserve the original behavior (reading the global X_test) when the
    # caller does not supply columns explicitly.
    if columns is None:
        columns = X_test.columns

    # Find worst cohorts
    cohorts = analyzer.find_high_error_cohorts(columns)

    report.append("## High-Error Cohorts\n")
    for cohort in cohorts[:5]:
        if cohort['error_rate'] > 0.3:  # 30% error threshold
            report.append(f"- **{cohort['condition']}**: {cohort['error_rate']:.1%} error rate")
            report.append(f"  - Recommendation: Collect more training data for this segment")

    # Add model improvement suggestions
    report.append("\n## Recommendations\n")
    report.append("1. Review feature engineering for high-error cohorts")
    report.append("2. Consider separate models for distinct data segments")
    report.append("3. Investigate data quality issues in error-prone regions")

    return "\n".join(report)

print(generate_error_report(analyzer))

Error analysis reveals the weaknesses in your model and guides targeted improvements.

Michael John Peña

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.