
A/B Testing for ML Models in Production

A/B testing ML models determines which model version performs better on real business metrics. Where a canary deployment validates stability, an A/B test measures model effectiveness.

A/B Testing vs Canary Deployment

| Aspect        | A/B Testing           | Canary Deployment    |
|---------------|-----------------------|----------------------|
| Goal          | Compare effectiveness | Validate stability   |
| Duration      | Weeks                 | Hours to days        |
| Metrics       | Business KPIs         | Error rates, latency |
| Traffic split | Fixed (e.g., 50/50)   | Gradual increase     |
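
The traffic row is the most visible operational difference. Below is a minimal sketch of the two strategies side by side; it assumes the ml_client and ab-test-endpoint created in the next section, and the stage duration is a placeholder.

import time

# A/B test: pin a fixed split for the whole experiment
endpoint = ml_client.online_endpoints.get("ab-test-endpoint")
endpoint.traffic = {"model-a": 50, "model-b": 50}
ml_client.online_endpoints.begin_create_or_update(endpoint).result()

# Canary: ramp the new deployment in stages, checking health between steps
for share in (10, 25, 50, 100):
    endpoint.traffic = {"model-a": 100 - share, "model-b": share}
    ml_client.online_endpoints.begin_create_or_update(endpoint).result()
    time.sleep(3600)  # placeholder: in practice, watch error rates and latency before the next step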

Setting Up A/B Testing

from azure.ai.ml import MLClient
from azure.ai.ml.entities import ManagedOnlineEndpoint, ManagedOnlineDeployment
from azure.identity import DefaultAzureCredential

ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id="your-subscription",
    resource_group_name="your-rg",
    workspace_name="your-workspace"
)

# Create the endpoint that will host both deployments
endpoint = ManagedOnlineEndpoint(name="ab-test-endpoint", auth_mode="key")
ml_client.online_endpoints.begin_create_or_update(endpoint).result()

# Deploy Model A (control)
model_a_deployment = ManagedOnlineDeployment(
    name="model-a",
    endpoint_name="ab-test-endpoint",
    model="azureml:recommendation-model:1",
    instance_type="Standard_DS2_v2",
    instance_count=2
)

# Deploy Model B (treatment)
model_b_deployment = ManagedOnlineDeployment(
    name="model-b",
    endpoint_name="ab-test-endpoint",
    model="azureml:recommendation-model:2",
    instance_type="Standard_DS2_v2",
    instance_count=2
)

ml_client.online_deployments.begin_create_or_update(model_a_deployment).result()
ml_client.online_deployments.begin_create_or_update(model_b_deployment).result()

# Set 50/50 traffic split
endpoint = ml_client.online_endpoints.get("ab-test-endpoint")
endpoint.traffic = {"model-a": 50, "model-b": 50}
ml_client.online_endpoints.begin_create_or_update(endpoint).result()

User Assignment and Tracking

import hashlib
from datetime import datetime
from typing import Tuple

class ABTestAssigner:
    """Consistently assign users to experiment variants"""

    def __init__(self, experiment_name: str, variants: dict):
        """
        variants: {"model-a": 50, "model-b": 50} - percentages
        """
        self.experiment_name = experiment_name
        self.variants = variants
        self._build_ranges()

    def _build_ranges(self):
        """Build percentage ranges for variant assignment"""
        self.ranges = []
        cumulative = 0
        for variant, percentage in self.variants.items():
            self.ranges.append((cumulative, cumulative + percentage, variant))
            cumulative += percentage

    def assign(self, user_id: str) -> str:
        """Assign user to a variant deterministically"""
        # Create hash of user_id + experiment_name for consistent assignment
        hash_input = f"{user_id}:{self.experiment_name}"
        hash_value = int(hashlib.md5(hash_input.encode()).hexdigest(), 16)
        bucket = hash_value % 100

        for low, high, variant in self.ranges:
            if low <= bucket < high:
                return variant

        return self.ranges[0][2]  # Default to first variant

    def get_assignment_with_tracking(self, user_id: str) -> Tuple[str, dict]:
        """Get assignment with tracking metadata"""
        variant = self.assign(user_id)
        tracking_data = {
            "experiment_name": self.experiment_name,
            "user_id": user_id,
            "variant": variant,
            "assignment_time": datetime.utcnow().isoformat()
        }
        return variant, tracking_data

# Usage
assigner = ABTestAssigner("recommendation-v2", {"model-a": 50, "model-b": 50})

# Same user always gets same variant
print(assigner.assign("user123"))  # Always returns same variant
print(assigner.assign("user123"))  # Same result
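
Because assignment is a pure function of the user ID and experiment name, the split can be sanity-checked offline before any traffic is routed. A quick illustrative check over synthetic user IDs should land close to the configured percentages:

from collections import Counter

# Simulate 100,000 users and confirm the observed split is roughly 50/50
counts = Counter(assigner.assign(f"user{i}") for i in range(100_000))
for variant, count in counts.items():
    print(f"{variant}: {count / 1000:.1f}%")  # expect values near 50.0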

Calling the Right Model

import requests
import json
from datetime import datetime

class ABTestClient:
    def __init__(self, endpoint_uri, api_key, assigner):
        self.endpoint_uri = endpoint_uri
        self.api_key = api_key
        self.assigner = assigner

    def predict(self, user_id: str, features: dict) -> dict:
        """Get prediction from assigned model variant"""
        variant, tracking = self.assigner.get_assignment_with_tracking(user_id)

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
            "azureml-model-deployment": variant  # Route to specific deployment
        }

        response = requests.post(
            self.endpoint_uri,
            data=json.dumps(features),
            headers=headers
        )

        result = response.json()
        result["_ab_test"] = tracking

        # Log for analysis
        self._log_prediction(tracking, features, result)

        return result

    def _log_prediction(self, tracking, features, result):
        """Log prediction for later analysis"""
        log_entry = {
            **tracking,
            "features": features,
            "prediction": result.get("predictions"),
            "timestamp": datetime.utcnow().isoformat()
        }
        # Send to logging service (Event Hub, Application Insights, etc.)
        print(f"Logged: {log_entry}")

# Usage
client = ABTestClient(endpoint_uri, api_key, assigner)
result = client.predict("user123", {"product_history": [1, 2, 3]})
print(f"Variant: {result['_ab_test']['variant']}")
print(f"Predictions: {result['predictions']}")

Tracking Business Metrics

from azure.eventhub import EventHubProducerClient, EventData
from datetime import datetime
import json

class ABTestMetricsTracker:
    def __init__(self, eventhub_connection_string, eventhub_name):
        self.producer = EventHubProducerClient.from_connection_string(
            eventhub_connection_string,
            eventhub_name=eventhub_name
        )

    def track_event(self, user_id: str, event_type: str, event_data: dict):
        """Track a business event for A/B analysis"""
        event = {
            "user_id": user_id,
            "event_type": event_type,
            "event_data": event_data,
            "timestamp": datetime.utcnow().isoformat()
        }

        event_data_batch = self.producer.create_batch()
        event_data_batch.add(EventData(json.dumps(event)))
        self.producer.send_batch(event_data_batch)

    def track_conversion(self, user_id: str, variant: str, conversion_value: float):
        """Track a conversion event"""
        self.track_event(user_id, "conversion", {
            "variant": variant,
            "value": conversion_value
        })

    def track_click(self, user_id: str, variant: str, item_id: str, position: int):
        """Track a click event"""
        self.track_event(user_id, "click", {
            "variant": variant,
            "item_id": item_id,
            "position": position
        })

# Usage
tracker = ABTestMetricsTracker(connection_string, "ab-test-events")

# When user clicks a recommendation
tracker.track_click("user123", "model-b", "product-456", position=2)

# When user converts
tracker.track_conversion("user123", "model-b", 99.99)
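
Before analysis, these raw events have to be rolled up into one outcome per exposed user. A minimal pandas sketch, assuming the Event Hub events have already been flattened into a DataFrame (events_df) with user_id, event_type, and variant columns (the variant lives inside event_data in the payload above):

import pandas as pd

def conversions_by_variant(events: pd.DataFrame) -> dict:
    """Return {variant: [0/1 conversion outcome per exposed user]}."""
    # First variant seen per user (assignment is sticky, so this is their variant)
    exposed = events.groupby("user_id")["variant"].first()
    converted_users = set(events.loc[events["event_type"] == "conversion", "user_id"])

    outcomes = {}
    for variant in exposed.unique():
        users = exposed[exposed == variant].index
        outcomes[variant] = [1 if u in converted_users else 0 for u in users]
    return outcomes

# outcomes = conversions_by_variant(events_df)
# analyzer = ABTestAnalyzer(outcomes["model-a"], outcomes["model-b"])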

Statistical Analysis

import scipy.stats as stats
import numpy as np
from typing import Dict

class ABTestAnalyzer:
    def __init__(self, control_data: list, treatment_data: list):
        self.control = np.array(control_data)
        self.treatment = np.array(treatment_data)

    def calculate_statistics(self) -> Dict:
        """Calculate basic statistics for both groups"""
        return {
            "control": {
                "n": len(self.control),
                "mean": np.mean(self.control),
                "std": np.std(self.control),
                "sum": np.sum(self.control)
            },
            "treatment": {
                "n": len(self.treatment),
                "mean": np.mean(self.treatment),
                "std": np.std(self.treatment),
                "sum": np.sum(self.treatment)
            }
        }

    def t_test(self) -> Dict:
        """Perform Welch's two-sample t-test (does not assume equal variances)"""
        statistic, p_value = stats.ttest_ind(self.control, self.treatment, equal_var=False)

        return {
            "statistic": statistic,
            "p_value": p_value,
            "significant_at_95": p_value < 0.05,
            "significant_at_99": p_value < 0.01
        }

    def calculate_lift(self) -> float:
        """Calculate lift (relative improvement)"""
        control_mean = np.mean(self.control)
        treatment_mean = np.mean(self.treatment)

        if control_mean == 0:
            return float('inf')

        return (treatment_mean - control_mean) / control_mean * 100

    def confidence_interval(self, confidence=0.95) -> Dict:
        """Calculate confidence interval for the difference"""
        diff = np.mean(self.treatment) - np.mean(self.control)
        se = np.sqrt(
            np.var(self.treatment, ddof=1) / len(self.treatment) +
            np.var(self.control, ddof=1) / len(self.control)
        )

        z = stats.norm.ppf((1 + confidence) / 2)
        ci_low = diff - z * se
        ci_high = diff + z * se

        return {
            "difference": diff,
            "ci_low": ci_low,
            "ci_high": ci_high,
            "confidence": confidence
        }

    def full_analysis(self) -> Dict:
        """Run full A/B test analysis"""
        stats_result = self.calculate_statistics()
        t_test_result = self.t_test()
        lift = self.calculate_lift()
        ci = self.confidence_interval()

        return {
            "statistics": stats_result,
            "t_test": t_test_result,
            "lift_percent": lift,
            "confidence_interval": ci,
            "recommendation": self._get_recommendation(t_test_result, lift)
        }

    def _get_recommendation(self, t_test_result, lift):
        """Generate recommendation based on results"""
        if not t_test_result["significant_at_95"]:
            return "CONTINUE - Not yet statistically significant"
        elif lift > 0:
            return f"ADOPT TREATMENT - {lift:.1f}% improvement (p < 0.05)"
        else:
            return f"KEEP CONTROL - Treatment is {abs(lift):.1f}% worse (p < 0.05)"

# Example analysis
control_conversions = [0, 1, 0, 0, 1, 0, 1, 1, 0, 0]  # Model A
treatment_conversions = [1, 1, 0, 1, 1, 0, 1, 1, 1, 0]  # Model B

analyzer = ABTestAnalyzer(control_conversions, treatment_conversions)
results = analyzer.full_analysis()

print(f"Lift: {results['lift_percent']:.1f}%")
print(f"P-value: {results['t_test']['p_value']:.4f}")
print(f"Recommendation: {results['recommendation']}")

A/B testing provides statistical evidence for model selection based on actual business outcomes.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.