Testing AI Applications: Strategies for LLM-Based Systems

Testing AI applications presents unique challenges because LLM outputs are probabilistic: the same prompt can produce different responses across runs. Effective strategies combine traditional software testing with AI-specific evaluation approaches.

Property-Based Testing for AI

Test invariant properties rather than exact outputs:

import json
import re
from dataclasses import dataclass
from typing import Callable

@dataclass
class AITestCase:
    input_text: str
    expected_properties: list[Callable[[str], bool]]
    category: str

class AIPropertyTester:
    def __init__(self, ai_client, deployment: str):
        self.client = ai_client
        self.deployment = deployment

    def test_response_properties(self, test_case: AITestCase) -> dict:
        """Test that response satisfies expected properties."""

        response = self.client.chat.completions.create(
            model=self.deployment,
            messages=[{"role": "user", "content": test_case.input_text}]
        )

        output = response.choices[0].message.content
        results = {}

        for i, prop in enumerate(test_case.expected_properties):
            try:
                results[f"property_{i}"] = {
                    "passed": prop(output),
                    "output_sample": output[:200]
                }
            except Exception as e:
                results[f"property_{i}"] = {"passed": False, "error": str(e)}

        return {
            "input": test_case.input_text,
            "category": test_case.category,
            "all_passed": all(r["passed"] for r in results.values()),
            "property_results": results
        }

# Property definitions
def contains_no_pii(text: str) -> bool:
    """Check that the response contains no PII patterns."""
    pii_patterns = [
        r'\b\d{3}-\d{2}-\d{4}\b',  # SSN
        r'\b\d{16}\b',  # Credit card (unformatted 16-digit run)
    ]
    return not any(re.search(p, text) for p in pii_patterns)

def is_valid_json(text: str) -> bool:
    """Check that the response is valid JSON."""
    try:
        json.loads(text)
        return True
    except json.JSONDecodeError:
        return False

def response_length_reasonable(text: str, max_length: int = 2000) -> bool:
    """Check that response length is reasonable."""
    return len(text) <= max_length
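
A minimal usage sketch under stated assumptions: the OpenAI client and the "gpt-4" deployment name are placeholders for whatever your stack uses, and the Hypothesis fuzz test makes live API calls, so its example count is capped.

from functools import partial

from hypothesis import given, settings, strategies as st
from openai import OpenAI  # assumed client; any OpenAI-compatible SDK works

client = OpenAI()  # reads OPENAI_API_KEY from the environment
tester = AIPropertyTester(client, deployment="gpt-4")

def test_structured_output_properties():
    """Single case: the response must be valid JSON, PII-free, and short."""
    case = AITestCase(
        input_text="Return a JSON object with keys 'name' and 'role' for a sample user.",
        expected_properties=[
            is_valid_json,
            contains_no_pii,
            partial(response_length_reasonable, max_length=500),
        ],
        category="structured_output",
    )
    report = tester.test_response_properties(case)
    assert report["all_passed"], report["property_results"]

@settings(max_examples=10, deadline=None)  # cap live API calls; latency varies
@given(user_input=st.text(min_size=1, max_size=200))
def test_no_pii_for_arbitrary_inputs(user_input):
    """Fuzz the prompt: no generated input should elicit PII in the response."""
    case = AITestCase(
        input_text=user_input,
        expected_properties=[contains_no_pii],
        category="fuzz",
    )
    assert tester.test_response_properties(case)["all_passed"]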

Regression Testing with Golden Datasets

Maintain test datasets for consistent evaluation:

import json
import difflib
from pathlib import Path

class GoldenDatasetTester:
    def __init__(self, golden_path: str, ai_client, deployment: str = "gpt-4",
                 similarity_threshold: float = 0.8):
        self.golden_path = Path(golden_path)
        self.client = ai_client
        self.deployment = deployment
        self.threshold = similarity_threshold

    def load_golden_cases(self) -> list[dict]:
        """Load golden test cases."""
        with open(self.golden_path) as f:
            return json.load(f)

    def run_regression_tests(self) -> dict:
        """Run all regression tests against golden dataset."""

        cases = self.load_golden_cases()
        results = []

        for case in cases:
            response = self.client.chat.completions.create(
                model=self.deployment,
                messages=case["messages"],
                temperature=0.1  # low temperature reduces run-to-run variance
            )

            actual_output = response.choices[0].message.content
            expected_output = case["expected_output"]

            similarity = self._calculate_similarity(actual_output, expected_output)

            results.append({
                "case_id": case["id"],
                "passed": similarity >= self.threshold,
                "similarity": similarity,
                "actual": actual_output[:500],
                "expected": expected_output[:500]
            })

        return {
            "total_cases": len(results),
            "passed": sum(1 for r in results if r["passed"]),
            "failed": sum(1 for r in results if not r["passed"]),
            "details": results
        }

    def _calculate_similarity(self, text1: str, text2: str) -> float:
        """Lexical similarity via SequenceMatcher; swap in embedding-based
        cosine similarity when semantic equivalence matters."""
        return difflib.SequenceMatcher(None, text1.lower(), text2.lower()).ratio()
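
The loader implies a schema: each golden case carries an "id", the chat "messages" to replay, and the "expected_output" to compare against. A hypothetical sample file, generated from Python so the shape stays in sync with the tester:

import json
from pathlib import Path

# Hypothetical golden cases matching the fields GoldenDatasetTester reads.
sample_cases = [
    {
        "id": "refund-policy-001",
        "messages": [
            {"role": "system", "content": "Answer using the company FAQ."},
            {"role": "user", "content": "How long do refunds take?"},
        ],
        "expected_output": "Refunds are processed within 5-7 business days.",
    }
]
Path("golden_cases.json").write_text(json.dumps(sample_cases, indent=2))

Re-run the suite after every prompt or model change; a drop in the passed count flags a regression before it reaches users.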

Integration Testing

Test AI components within the full application context, including error handling, timeout behavior, and integration with downstream systems.
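
A sketch of what that can look like, using a mocked client so the failure path runs deterministically; the AIService wrapper and its fallback message are hypothetical stand-ins for your application code.

from unittest.mock import MagicMock

class AIService:
    """Hypothetical wrapper under test: calls the LLM, degrades gracefully."""
    def __init__(self, client, deployment: str = "gpt-4"):
        self.client = client
        self.deployment = deployment

    def answer(self, question: str) -> str:
        try:
            response = self.client.chat.completions.create(
                model=self.deployment,
                messages=[{"role": "user", "content": question}],
                timeout=10,  # fail fast instead of hanging downstream callers
            )
            return response.choices[0].message.content
        except Exception:
            # Real code would log and narrow the exception types.
            return "Sorry, I can't answer that right now."

def test_timeout_returns_fallback():
    """A timed-out LLM call should produce the fallback, not an exception."""
    client = MagicMock()
    client.chat.completions.create.side_effect = TimeoutError("simulated timeout")
    assert AIService(client).answer("What is RAG?") == "Sorry, I can't answer that right now."

def test_success_path_passes_through():
    """On success, the service returns the model's message content."""
    client = MagicMock()
    client.chat.completions.create.return_value.choices = [
        MagicMock(message=MagicMock(content="An answer"))
    ]
    assert AIService(client).answer("What is RAG?") == "An answer"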

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.