
AI Testing Frameworks: Comprehensive Testing for AI Systems

AI systems produce non-deterministic, open-ended outputs, so exact-match assertions are not enough on their own. Here's how to implement a framework that checks functional correctness, consistency, robustness, and boundary behavior.

AI Testing Framework

import numpy as np
import pytest
from dataclasses import dataclass
from typing import Dict, List, Optional

@dataclass
class TestCase:
    """A single scenario: an input plus the behavior the AI is expected to show."""
    id: str
    input: str
    expected_behavior: str
    context: Optional[str] = None

class AITestFramework:
    def __init__(self, ai_client, evaluator):
        # ai_client: exposes async generate(prompt, context=None) -> str.
        # evaluator: exposes async evaluate(input, output, expected) -> dict
        #            with "score" and "feedback" keys (e.g. an LLM-as-judge).
        self.ai = ai_client
        self.evaluator = evaluator

    async def run_test_suite(self, test_cases: List[TestCase]) -> Dict:
        """Run complete test suite."""
        results = {
            "passed": 0,
            "failed": 0,
            "details": []
        }

        for case in test_cases:
            result = await self.run_test(case)
            results["details"].append(result)
            if result["passed"]:
                results["passed"] += 1
            else:
                results["failed"] += 1

        results["pass_rate"] = results["passed"] / len(test_cases)
        return results

    async def run_test(self, case: TestCase) -> Dict:
        """Run single test case."""
        # Get AI response
        response = await self.ai.generate(case.input, case.context)

        # Evaluate response
        evaluation = await self.evaluator.evaluate(
            input=case.input,
            output=response,
            expected=case.expected_behavior
        )

        return {
            "test_id": case.id,
            "input": case.input,
            "output": response,
            "expected": case.expected_behavior,
            "passed": evaluation["score"] >= 0.7,
            "score": evaluation["score"],
            "feedback": evaluation["feedback"]
        }

    async def test_consistency(self, input: str, num_runs: int = 5) -> Dict:
        """Test response consistency."""
        responses = []
        for _ in range(num_runs):
            response = await self.ai.generate(input)
            responses.append(response)

        # Calculate consistency metrics
        embeddings = await self.get_embeddings(responses)
        similarities = self.pairwise_similarities(embeddings)

        return {
            "responses": responses,
            "avg_similarity": np.mean(similarities),
            "min_similarity": np.min(similarities),
            "consistent": np.mean(similarities) > 0.85
        }
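
    async def get_embeddings(self, texts: List[str]) -> np.ndarray:
        # Helper assumed by test_consistency. This sketch assumes the injected
        # ai_client also exposes an async embed(text) method returning a vector;
        # that method is an assumption, not part of the original interface.
        vectors = [await self.ai.embed(text) for text in texts]
        return np.array(vectors)

    def pairwise_similarities(self, embeddings: np.ndarray) -> List[float]:
        # Cosine similarity for every unordered pair of response embeddings.
        normed = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
        return [
            float(np.dot(normed[i], normed[j]))
            for i in range(len(normed))
            for j in range(i + 1, len(normed))
        ]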

    async def test_robustness(self, input: str, perturbations: List[str]) -> Dict:
        """Test robustness to input variations."""
        base_response = await self.ai.generate(input)
        results = []

        for perturbed in perturbations:
            perturbed_response = await self.ai.generate(perturbed)
            similarity = await self.semantic_similarity(base_response, perturbed_response)
            results.append({
                "perturbation": perturbed,
                "similarity": similarity,
                "robust": similarity > 0.8
            })

        return {
            "base_input": input,
            "base_response": base_response,
            "perturbation_results": results,
            "robustness_score": np.mean([r["similarity"] for r in results])
        }
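
    async def semantic_similarity(self, text_a: str, text_b: str) -> float:
        # Helper assumed by test_robustness: embed both texts (using the same
        # assumed embed() call as get_embeddings) and compare with cosine similarity.
        embeddings = await self.get_embeddings([text_a, text_b])
        return self.pairwise_similarities(embeddings)[0]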

    async def test_boundaries(self, boundary_cases: List[Dict]) -> Dict:
        """Test behavior at boundaries."""
        results = []
        for case in boundary_cases:
            response = await self.ai.generate(case["input"])
            expected_behavior = case["expected"]
            actual_behavior = self.classify_behavior(response)

            results.append({
                "case": case["description"],
                "expected": expected_behavior,
                "actual": actual_behavior,
                "passed": actual_behavior == expected_behavior
            })

        return {
            "boundary_tests": results,
            "pass_rate": sum(r["passed"] for r in results) / len(results)
        }
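
    def classify_behavior(self, response: str) -> str:
        # Placeholder for test_boundaries. The keyword check and the
        # "refuse"/"answer" labels are illustrative assumptions only; in
        # practice this is usually an LLM-as-judge call or a task-specific
        # classifier aligned with the labels used in your boundary_cases.
        refusal_markers = ("i can't", "i cannot", "i'm unable", "i am not able")
        if any(marker in response.lower() for marker in refusal_markers):
            return "refuse"
        return "answer"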

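The sketch below shows one way to drive the framework from pytest. StubAIClient and StubEvaluator are hypothetical stand-ins invented for this example (in a real suite, ai_client and evaluator would wrap your model and an LLM-as-judge or metric-based scorer), and the async test assumes the pytest-asyncio plugin is installed.

import pytest

class StubAIClient:
    # Hypothetical client: echoes the prompt so the example runs offline.
    async def generate(self, prompt, context=None):
        return f"Stubbed answer to: {prompt}"

class StubEvaluator:
    # Hypothetical evaluator: full score when the expected phrase appears.
    async def evaluate(self, input, output, expected):
        score = 1.0 if expected.lower() in output.lower() else 0.0
        return {"score": score, "feedback": "substring check only"}

@pytest.mark.asyncio
async def test_suite_passes():
    framework = AITestFramework(StubAIClient(), StubEvaluator())
    cases = [TestCase(id="greet", input="Say hello", expected_behavior="hello")]
    results = await framework.run_test_suite(cases)
    assert results["pass_rate"] == 1.0
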
Combining functional, consistency, robustness, and boundary tests gives you measurable evidence that an AI system behaves reliably across diverse scenarios, rather than a handful of anecdotal spot checks.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.