Seed Parameter: Achieving Reproducible LLM Outputs

Reproducibility has long been one of the biggest challenges in LLM applications. The new seed parameter in GPT-4 Turbo makes outputs largely deterministic, enabling consistent testing and debugging.

Understanding the Seed Parameter

When you provide a seed value, the model attempts to return the same response for identical requests:

from openai import OpenAI

client = OpenAI()

def generate_with_seed(prompt: str, seed: int) -> tuple[str, str]:
    """Generate response with seed for reproducibility."""
    response = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=[
            {"role": "user", "content": prompt}
        ],
        seed=seed,
        temperature=0  # Combine with low temperature for best results
    )

    return (
        response.choices[0].message.content,
        response.system_fingerprint  # Track model version
    )

# Generate with same seed multiple times
prompt = "Write a haiku about programming"
seed = 42

result1, fingerprint1 = generate_with_seed(prompt, seed)
result2, fingerprint2 = generate_with_seed(prompt, seed)

print(f"Response 1: {result1}")
print(f"Response 2: {result2}")
print(f"Fingerprints match: {fingerprint1 == fingerprint2}")
print(f"Responses match: {result1 == result2}")

System Fingerprint

The system_fingerprint helps track model versions:

def track_model_consistency(prompt: str, seed: int, expected_fingerprint: str | None = None):
    """Track model consistency across calls."""
    response = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=[{"role": "user", "content": prompt}],
        seed=seed
    )

    current_fingerprint = response.system_fingerprint

    if expected_fingerprint and current_fingerprint != expected_fingerprint:
        print(f"Warning: Model version changed!")
        print(f"Expected: {expected_fingerprint}")
        print(f"Current: {current_fingerprint}")
        return None, current_fingerprint

    return response.choices[0].message.content, current_fingerprint

# Store fingerprint for consistency tracking
_, initial_fingerprint = track_model_consistency("Hello", 42)
print(f"Initial fingerprint: {initial_fingerprint}")

# Later calls can verify consistency
result, _ = track_model_consistency("Hello", 42, initial_fingerprint)

Use Cases for Reproducibility

1. Testing and Validation

import unittest

class TestLLMOutputs(unittest.TestCase):
    def setUp(self):
        self.client = OpenAI()
        self.seed = 12345

    def generate(self, prompt: str) -> str:
        response = self.client.chat.completions.create(
            model="gpt-4-1106-preview",
            messages=[{"role": "user", "content": prompt}],
            seed=self.seed,
            temperature=0
        )
        return response.choices[0].message.content

    def test_sentiment_analysis(self):
        """Test that sentiment analysis is consistent."""
        result = self.generate(
            "Classify the sentiment of 'I love this product!' as positive, negative, or neutral. Reply with one word."
        )
        self.assertEqual(result.strip().lower(), "positive")

    def test_entity_extraction(self):
        """Test entity extraction consistency."""
        result = self.generate(
            "Extract the company name from: 'Microsoft announced new AI features'. Reply with just the company name."
        )
        self.assertEqual(result.strip(), "Microsoft")

if __name__ == "__main__":
    unittest.main()

2. A/B Testing Prompts

def compare_prompts(prompts: list, test_inputs: list, seed: int) -> dict:
    """Compare different prompts with consistent outputs."""
    results = {i: [] for i in range(len(prompts))}

    for test_input in test_inputs:
        for idx, prompt_template in enumerate(prompts):
            prompt = prompt_template.format(input=test_input)

            response = client.chat.completions.create(
                model="gpt-4-1106-preview",
                messages=[{"role": "user", "content": prompt}],
                seed=seed,
                temperature=0
            )

            results[idx].append({
                "input": test_input,
                "output": response.choices[0].message.content,
                "fingerprint": response.system_fingerprint
            })

    return results

# Compare two prompt strategies
prompts = [
    "Summarize this text in one sentence: {input}",
    "You are an expert summarizer. Create a concise one-sentence summary: {input}"
]

test_inputs = [
    "The quick brown fox jumps over the lazy dog. This is a classic pangram.",
    "Machine learning is transforming industries worldwide."
]

comparison = compare_prompts(prompts, test_inputs, seed=42)
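
To compare the two strategies, walk the returned dictionary. The display loop below is purely illustrative and only assumes the comparison structure built by compare_prompts above:

# Illustrative only: print each prompt strategy's output per test input
for idx, runs in comparison.items():
    print(f"Prompt strategy {idx}:")
    for run in runs:
        print(f"  {run['input'][:40]!r} -> {run['output'][:60]!r}")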

3. Debugging and Logging

import logging
from datetime import datetime

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class ReproducibleLLMClient:
    def __init__(self, default_seed: int | None = None):
        self.client = OpenAI()
        self.default_seed = default_seed
        self.call_log = []

    def generate(self, prompt: str, seed: int | None = None) -> str:
        """Generate with full logging for reproducibility."""
        # Use an explicit None check so a seed of 0 is not silently replaced
        effective_seed = seed if seed is not None else self.default_seed

        response = self.client.chat.completions.create(
            model="gpt-4-1106-preview",
            messages=[{"role": "user", "content": prompt}],
            seed=effective_seed,
            temperature=0
        )

        log_entry = {
            "timestamp": datetime.now().isoformat(),
            "prompt": prompt,
            "seed": effective_seed,
            "fingerprint": response.system_fingerprint,
            "response": response.choices[0].message.content,
            "usage": {
                "prompt_tokens": response.usage.prompt_tokens,
                "completion_tokens": response.usage.completion_tokens
            }
        }

        self.call_log.append(log_entry)
        logger.info(f"Generated response with seed {effective_seed}, fingerprint {response.system_fingerprint}")

        return response.choices[0].message.content

    def replay(self, log_entry: dict) -> bool:
        """Replay a logged call and verify consistency."""
        new_response = self.generate(log_entry["prompt"], log_entry["seed"])
        return new_response == log_entry["response"]

# Usage
llm = ReproducibleLLMClient(default_seed=42)
result = llm.generate("What is 2 + 2?")

# Later, verify reproducibility
is_consistent = llm.replay(llm.call_log[0])
print(f"Output is reproducible: {is_consistent}")

Best Practices

  1. Always use temperature=0 with seeds for maximum consistency
  2. Store system fingerprints to detect model updates (a check is sketched after the config below)
  3. Document the seeds used in production for debugging
  4. Version your prompts alongside seeds

# Configuration management for reproducible LLM calls
LLM_CONFIG = {
    "version": "1.0.0",
    "model": "gpt-4-1106-preview",
    "seeds": {
        "classification": 42,
        "summarization": 123,
        "extraction": 456
    },
    "expected_fingerprints": {
        # Update these when model changes are detected
    }
}
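
Tying these practices together, here is a minimal sketch of a config-driven call. The call_with_config helper is an assumption for illustration, not part of the OpenAI SDK; it reuses the client and logger from the earlier examples, reads the task's seed from LLM_CONFIG, pins temperature to 0, and warns when the system fingerprint drifts from the recorded value:

# Minimal sketch (assumed helper, not part of the OpenAI SDK): pull the seed
# for a task from LLM_CONFIG and warn when the system fingerprint drifts.
def call_with_config(task: str, prompt: str) -> str:
    response = client.chat.completions.create(
        model=LLM_CONFIG["model"],
        messages=[{"role": "user", "content": prompt}],
        seed=LLM_CONFIG["seeds"][task],
        temperature=0
    )

    expected = LLM_CONFIG["expected_fingerprints"].get(task)
    if expected and response.system_fingerprint != expected:
        logger.warning(
            "Fingerprint drift for %s: expected %s, got %s",
            task, expected, response.system_fingerprint
        )

    return response.choices[0].message.content

# Example: a classification call pinned to its documented seed
label = call_with_config("classification", "Classify the sentiment of 'Great job!' in one word.")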

Conclusion

The seed parameter transforms LLM development by enabling reproducibility, even though determinism remains best-effort and tied to the system fingerprint. That reproducibility is crucial for testing, debugging, and building reliable production systems. Tomorrow, we’ll explore the enhanced function calling capabilities in GPT-4 Turbo!

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.