Skip to content
Back to Blog
1 min read

AI Testing Frameworks: Comprehensive Testing for AI Systems

I wrote “AI Testing Frameworks: Comprehensive Testing for AI Systems” to share practical, production-minded guidance on this topic.

AI Testing Framework

from dataclasses import dataclass
from typing import Dict, List, Optional

import numpy as np
import pytest

@dataclass
class TestCase:
    """One prompt together with the behavior the AI response should exhibit."""

    # The class name starts with "Test", so pytest would otherwise try to
    # collect it as a test class and emit a collection warning. This is a
    # plain class attribute (no annotation), not a dataclass field.
    __test__ = False

    id: str  # identifier reported back as "test_id" in a test result
    input: str  # prompt passed to the AI client's generate()
    expected_behavior: str  # description handed to the evaluator as `expected`
    context: Optional[str] = None  # optional second argument to generate()

class AITestFramework:
    """Harness that exercises an AI client and scores its responses.

    The framework relies on two collaborators:

    * ``ai_client`` -- must expose an awaitable ``generate(input[, context])``.
    * ``evaluator`` -- must expose an awaitable
      ``evaluate(input=..., output=..., expected=...)`` that returns a mapping
      with ``"score"`` and ``"feedback"`` keys.

    NOTE(review): ``get_embeddings``, ``pairwise_similarities``,
    ``semantic_similarity`` and ``classify_behavior`` are called below but are
    not defined in this class; presumably a subclass or mixin supplies them.
    ``get_embeddings`` and ``semantic_similarity`` are awaited, the other two
    are called synchronously.
    """

    # Decision thresholds are class attributes so a subclass or an individual
    # instance can tune them without editing the methods.
    PASS_THRESHOLD = 0.7  # minimum evaluator score for a test case to pass
    CONSISTENCY_THRESHOLD = 0.85  # average similarity must exceed this
    ROBUSTNESS_THRESHOLD = 0.8  # per-perturbation similarity must exceed this

    def __init__(self, ai_client, evaluator):
        """Store the AI client and the evaluator used by every test method."""
        self.ai = ai_client
        self.evaluator = evaluator

    @staticmethod
    def _pass_rate(passed: int, total: int) -> float:
        """Return ``passed / total``, or 0.0 when nothing was run."""
        return passed / total if total else 0.0

    async def run_test_suite(self, test_cases: List["TestCase"]) -> Dict:
        """Run every test case in order and aggregate the outcomes.

        Args:
            test_cases: Cases to run; they are awaited one at a time.

        Returns:
            A dict with ``"passed"`` and ``"failed"`` counts, the per-case
            ``"details"`` list and the overall ``"pass_rate"``. An empty suite
            yields a pass rate of 0.0 instead of dividing by zero.
        """
        details = [await self.run_test(case) for case in test_cases]
        passed = sum(1 for detail in details if detail["passed"])

        return {
            "passed": passed,
            "failed": len(details) - passed,
            "details": details,
            "pass_rate": self._pass_rate(passed, len(details)),
        }

    async def run_test(self, case: "TestCase") -> Dict:
        """Generate a response for one case and have the evaluator score it.

        Args:
            case: The case to run; its ``input`` and ``context`` go to the AI
                client and its ``expected_behavior`` goes to the evaluator.

        Returns:
            A dict describing the case, the response, the evaluator's score
            and feedback, and whether the score reached ``PASS_THRESHOLD``.
        """
        response = await self.ai.generate(case.input, case.context)

        evaluation = await self.evaluator.evaluate(
            input=case.input,
            output=response,
            expected=case.expected_behavior,
        )
        score = evaluation["score"]

        return {
            "test_id": case.id,
            "input": case.input,
            "output": response,
            "expected": case.expected_behavior,
            "passed": score >= self.PASS_THRESHOLD,
            "score": score,
            "feedback": evaluation["feedback"],
        }

    async def test_consistency(self, input: str, num_runs: int = 5) -> Dict:
        """Send the same input repeatedly and measure how similar the replies are.

        Args:
            input: Prompt sent unchanged on every run.
            num_runs: Number of responses to collect; must be at least 2.

        Returns:
            A dict with the collected ``"responses"``, the average and minimum
            similarity, and ``"consistent"`` (average above
            ``CONSISTENCY_THRESHOLD``).

        Raises:
            ValueError: If ``num_runs`` is below 2, since a pairwise
                comparison needs at least two responses.
        """
        # Fail before spending any AI calls on a run that cannot be scored.
        if num_runs < 2:
            raise ValueError(
                "num_runs must be at least 2 to compare responses pairwise"
            )

        responses = [await self.ai.generate(input) for _ in range(num_runs)]

        embeddings = await self.get_embeddings(responses)
        similarities = self.pairwise_similarities(embeddings)

        # Compute the mean once; plain float/bool keep the result dict free
        # of NumPy scalar types.
        avg_similarity = float(np.mean(similarities))

        return {
            "responses": responses,
            "avg_similarity": avg_similarity,
            "min_similarity": float(np.min(similarities)),
            "consistent": avg_similarity > self.CONSISTENCY_THRESHOLD,
        }

    async def test_robustness(self, input: str, perturbations: List[str]) -> Dict:
        """Compare the response to ``input`` with responses to its variations.

        Args:
            input: Baseline prompt.
            perturbations: Variations of the prompt to compare against it.

        Returns:
            A dict with the baseline input and response, one entry per
            perturbation (similarity and whether it exceeds
            ``ROBUSTNESS_THRESHOLD``), and ``"robustness_score"``, the mean
            similarity. With no perturbations the score is NaN.
        """
        base_response = await self.ai.generate(input)
        results = []

        for perturbed in perturbations:
            perturbed_response = await self.ai.generate(perturbed)
            similarity = await self.semantic_similarity(
                base_response, perturbed_response
            )
            results.append({
                "perturbation": perturbed,
                "similarity": similarity,
                "robust": similarity > self.ROBUSTNESS_THRESHOLD,
            })

        similarities = [result["similarity"] for result in results]
        # np.mean([]) would also give NaN but emits a RuntimeWarning; return
        # the undefined score explicitly instead.
        if similarities:
            robustness_score = float(np.mean(similarities))
        else:
            robustness_score = float("nan")

        return {
            "base_input": input,
            "base_response": base_response,
            "perturbation_results": results,
            "robustness_score": robustness_score,
        }

    async def test_boundaries(self, boundary_cases: List[Dict]) -> Dict:
        """Check that classified behavior matches expectations on boundary cases.

        Args:
            boundary_cases: Dicts with ``"input"``, ``"expected"`` and
                ``"description"`` keys.

        Returns:
            A dict with the per-case ``"boundary_tests"`` list and the
            ``"pass_rate"``. An empty list yields a pass rate of 0.0 instead
            of dividing by zero.
        """
        results = []
        for case in boundary_cases:
            response = await self.ai.generate(case["input"])
            expected_behavior = case["expected"]
            actual_behavior = self.classify_behavior(response)

            results.append({
                "case": case["description"],
                "expected": expected_behavior,
                "actual": actual_behavior,
                "passed": actual_behavior == expected_behavior,
            })

        passed = sum(1 for result in results if result["passed"])
        return {
            "boundary_tests": results,
            "pass_rate": self._pass_rate(passed, len(results)),
        }

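Comprehensive AI testing ensures reliable behavior across diverse scenarios.

## Takeaways

- Score each response with an evaluator and track the suite's pass rate instead of asserting on exact output strings.
- Send the same prompt several times and compare the responses to measure consistency.
- Perturb the input and probe boundary cases to confirm that behavior stays stable.

Next step: run the suite automatically whenever a prompt or model changes, so regressions surface early.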

Michael John Peña

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.