
A/B Testing AI Features: Data-Driven AI Improvements

A/B testing AI features requires special consideration: the systems are non-deterministic, so identical inputs can produce different outputs and noisier metrics. Here’s how to do it right.
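Before comparing two variants, it helps to see how much a single variant's metric moves on its own. Here is a minimal sketch of that idea; generate_response and score are hypothetical, application-specific callables, not part of the framework below.

import numpy as np

def measure_variant_noise(generate_response, score, config, n_runs=20):
    """Run one variant repeatedly and report how much the metric varies on its own."""
    scores = [score(generate_response(config)) for _ in range(n_runs)]
    return {"mean": float(np.mean(scores)), "std": float(np.std(scores, ddof=1))}

If the standard deviation is large relative to the lift you hope to detect, plan for correspondingly more samples in the experiment.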

AI A/B Testing Framework

from collections import defaultdict
from dataclasses import dataclass
import hashlib

from scipy import stats
import numpy as np

@dataclass
class AIExperiment:
    name: str
    control_config: dict
    treatment_config: dict
    allocation_percentage: float = 0.5
    min_samples: int = 1000

class ResultsStore:
    """Minimal in-memory metrics store; swap for a real database in production."""

    def __init__(self):
        self._data = defaultdict(list)

    def log(self, experiment_name: str, variant: str, metric_name: str, value: float):
        self._data[(experiment_name, variant, metric_name)].append(value)

    def get(self, experiment_name: str, variant: str, metric_name: str) -> list:
        return self._data[(experiment_name, variant, metric_name)]

class AIExperimentFramework:
    def __init__(self):
        self.experiments = {}
        self.results_store = ResultsStore()

    def create_experiment(self, experiment: AIExperiment):
        """Create new A/B experiment."""
        self.experiments[experiment.name] = experiment

    def get_variant(self, experiment_name: str, user_id: str) -> str:
        """Deterministic variant assignment based on user ID."""
        experiment = self.experiments[experiment_name]

        # Consistent hashing for stable assignment
        hash_input = f"{experiment_name}:{user_id}"
        hash_value = int(hashlib.md5(hash_input.encode()).hexdigest(), 16)
        bucket = (hash_value % 100) / 100

        if bucket < experiment.allocation_percentage:
            return "treatment"
        return "control"

    def get_config(self, experiment_name: str, user_id: str) -> dict:
        """Get configuration for user's variant."""
        experiment = self.experiments[experiment_name]
        variant = self.get_variant(experiment_name, user_id)

        if variant == "treatment":
            return experiment.treatment_config
        return experiment.control_config

    def log_metric(self, experiment_name: str, user_id: str, metric_name: str, value: float):
        """Log metric for analysis."""
        variant = self.get_variant(experiment_name, user_id)
        self.results_store.log(experiment_name, variant, metric_name, value)

    def analyze_experiment(self, experiment_name: str, metric_name: str) -> dict:
        """Analyze experiment results with statistical significance."""
        experiment = self.experiments[experiment_name]
        control_data = self.results_store.get(experiment_name, "control", metric_name)
        treatment_data = self.results_store.get(experiment_name, "treatment", metric_name)

        # Welch's t-test: don't assume the two variants have equal variance
        t_stat, p_value = stats.ttest_ind(control_data, treatment_data, equal_var=False)

        control_mean = np.mean(control_data)
        treatment_mean = np.mean(treatment_data)

        return {
            "control_mean": control_mean,
            "treatment_mean": treatment_mean,
            "lift": (treatment_mean - control_mean) / control_mean,
            "p_value": p_value,
            "significant": p_value < 0.05,
            "control_n": len(control_data),
            "treatment_n": len(treatment_data),
            # Flag results drawn from fewer samples than the experiment requires
            "sufficient_samples": (
                len(control_data) >= experiment.min_samples
                and len(treatment_data) >= experiment.min_samples
            ),
        }

# Example: Testing different prompts
framework = AIExperimentFramework()

experiment = AIExperiment(
    name="prompt_v2_test",
    control_config={"prompt_version": "v1", "temperature": 0.7},
    treatment_config={"prompt_version": "v2", "temperature": 0.7},
    allocation_percentage=0.5
)
framework.create_experiment(experiment)

# In application code: user_id, generate_response, and rating come from your app
config = framework.get_config("prompt_v2_test", user_id)
response = generate_response(config)
framework.log_metric("prompt_v2_test", user_id, "user_satisfaction", rating)

Rigorous experimentation enables confident AI improvements based on real user impact.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.