5 min read
LLM Benchmarking: Understanding Model Performance
Understanding LLM benchmarks is essential for making informed model selection decisions. This guide covers the major benchmarks and how to interpret their results.
Major LLM Benchmarks
from dataclasses import dataclass
from enum import Enum
from typing import List, Optional

class BenchmarkCategory(Enum):
    REASONING = "reasoning"
    KNOWLEDGE = "knowledge"
    CODING = "coding"
    MATH = "math"
    LANGUAGE = "language"

@dataclass
class Benchmark:
    name: str
    category: BenchmarkCategory
    description: str
    metric: str
    human_baseline: Optional[float]  # None when no reliable human baseline exists

benchmarks = [
    Benchmark(
        name="MMLU",
        category=BenchmarkCategory.KNOWLEDGE,
        description="Massive Multitask Language Understanding - 57 subjects",
        metric="accuracy",
        human_baseline=0.898
    ),
    Benchmark(
        name="HumanEval",
        category=BenchmarkCategory.CODING,
        description="Python coding problems with unit tests",
        metric="pass@1",
        human_baseline=None  # Varies by developer
    ),
    Benchmark(
        name="GSM8K",
        category=BenchmarkCategory.MATH,
        description="Grade school math word problems",
        metric="accuracy",
        human_baseline=0.95
    ),
    Benchmark(
        name="HellaSwag",
        category=BenchmarkCategory.REASONING,
        description="Commonsense reasoning completion",
        metric="accuracy",
        human_baseline=0.954
    ),
    Benchmark(
        name="TruthfulQA",
        category=BenchmarkCategory.KNOWLEDGE,
        description="Questions testing truthfulness",
        metric="MC1 accuracy",
        human_baseline=0.94
    ),
    Benchmark(
        name="ARC-Challenge",
        category=BenchmarkCategory.REASONING,
        description="AI2 Reasoning Challenge - hard science questions",
        metric="accuracy",
        human_baseline=None
    ),
    Benchmark(
        name="MATH",
        category=BenchmarkCategory.MATH,
        description="Competition mathematics problems",
        metric="accuracy",
        human_baseline=0.90  # Expert mathematicians
    )
]
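Because each Benchmark carries a BenchmarkCategory, the catalog is easy to slice by skill area. A minimal sketch, using only the benchmarks list defined above:

from collections import defaultdict

# Group the benchmark catalog by category for a quick overview
by_category = defaultdict(list)
for b in benchmarks:
    by_category[b.category].append(b.name)

for category, names in by_category.items():
    print(f"{category.value}: {', '.join(names)}")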
Model Performance Database
from typing import Optional

@dataclass
class ModelScore:
    model_name: str
    benchmark: str
    score: float
    date: str
    notes: Optional[str] = None

# March 2024 benchmark data
model_scores = [
    # Claude 3 Opus
    ModelScore("Claude 3 Opus", "MMLU", 0.868, "2024-03-04"),
    ModelScore("Claude 3 Opus", "HumanEval", 0.849, "2024-03-04"),
    ModelScore("Claude 3 Opus", "GSM8K", 0.950, "2024-03-04"),
    ModelScore("Claude 3 Opus", "HellaSwag", 0.953, "2024-03-04"),
    # GPT-4 Turbo
    ModelScore("GPT-4 Turbo", "MMLU", 0.864, "2024-01-25"),
    ModelScore("GPT-4 Turbo", "HumanEval", 0.670, "2024-01-25"),
    ModelScore("GPT-4 Turbo", "GSM8K", 0.920, "2024-01-25"),
    ModelScore("GPT-4 Turbo", "HellaSwag", 0.950, "2024-01-25"),
    # Llama 2 70B
    ModelScore("Llama 2 70B", "MMLU", 0.689, "2023-07-18"),
    ModelScore("Llama 2 70B", "HumanEval", 0.298, "2023-07-18"),
    ModelScore("Llama 2 70B", "GSM8K", 0.568, "2023-07-18"),
    # Mistral Large
    ModelScore("Mistral Large", "MMLU", 0.812, "2024-02-26"),
    ModelScore("Mistral Large", "HumanEval", 0.451, "2024-02-26"),
    ModelScore("Mistral Large", "GSM8K", 0.810, "2024-02-26"),
]
def compare_models(benchmark: str) -> List[tuple]:
    """Compare all models on a specific benchmark"""
    scores = [s for s in model_scores if s.benchmark == benchmark]
    scores.sort(key=lambda x: x.score, reverse=True)
    return [(s.model_name, s.score) for s in scores]

# Compare on MMLU
print("MMLU Rankings:")
for model, score in compare_models("MMLU"):
    print(f"  {model}: {score:.1%}")
Running Your Own Benchmarks
from typing import Callable, List

import anthropic
from openai import OpenAI

class BenchmarkRunner:
    def __init__(self):
        self.anthropic = anthropic.Anthropic()
        self.openai = OpenAI()

    def run_mmlu_sample(
        self,
        model: str,
        provider: str,
        questions: List[dict]
    ) -> float:
        """Run MMLU-style questions"""
        correct = 0
        for q in questions:
            prompt = f"""Answer the following multiple choice question.
Reply with only the letter (A, B, C, or D).

Question: {q['question']}
A) {q['choices'][0]}
B) {q['choices'][1]}
C) {q['choices'][2]}
D) {q['choices'][3]}

Answer:"""
            if provider == "anthropic":
                response = self.anthropic.messages.create(
                    model=model,
                    max_tokens=1,
                    messages=[{"role": "user", "content": prompt}]
                )
                answer = response.content[0].text.strip().upper()
            else:
                response = self.openai.chat.completions.create(
                    model=model,
                    max_tokens=1,
                    messages=[{"role": "user", "content": prompt}]
                )
                answer = response.choices[0].message.content.strip().upper()

            correct_letter = chr(65 + q['answer'])  # 0->A, 1->B, etc.
            if answer == correct_letter:
                correct += 1

        return correct / len(questions)

    def run_coding_benchmark(
        self,
        model: str,
        provider: str,
        problems: List[dict],
        executor: Callable
    ) -> float:
        """Run HumanEval-style coding problems"""
        passed = 0
        for problem in problems:
            prompt = f"""Complete the following Python function.
Return only the function implementation, no explanations.

{problem['prompt']}"""
            if provider == "anthropic":
                response = self.anthropic.messages.create(
                    model=model,
                    max_tokens=500,
                    messages=[{"role": "user", "content": prompt}]
                )
                code = response.content[0].text
            else:
                response = self.openai.chat.completions.create(
                    model=model,
                    max_tokens=500,
                    messages=[{"role": "user", "content": prompt}]
                )
                code = response.choices[0].message.content

            # Execute tests
            try:
                result = executor(code, problem['test_cases'])
                if result:
                    passed += 1
            except Exception:
                pass  # Test failed

        return passed / len(problems)
# Example usage
runner = BenchmarkRunner()

sample_questions = [
    {
        "question": "What is the capital of France?",
        "choices": ["London", "Berlin", "Paris", "Madrid"],
        "answer": 2  # Paris
    },
    # Add more questions...
]

score = runner.run_mmlu_sample(
    model="claude-3-opus-20240229",
    provider="anthropic",
    questions=sample_questions
)
print(f"MMLU Sample Score: {score:.1%}")
Visualizing Benchmark Results
import matplotlib.pyplot as plt
import numpy as np

def plot_model_comparison(scores: List[ModelScore]):
    """Create radar chart comparing models"""
    models = list(set(s.model_name for s in scores))
    benchmarks = list(set(s.benchmark for s in scores))

    # Create score matrix
    score_matrix = {}
    for model in models:
        score_matrix[model] = {}
        for benchmark in benchmarks:
            matching = [s for s in scores
                        if s.model_name == model and s.benchmark == benchmark]
            score_matrix[model][benchmark] = matching[0].score if matching else 0

    # Plot
    angles = np.linspace(0, 2 * np.pi, len(benchmarks), endpoint=False).tolist()
    angles += angles[:1]  # Close the plot

    fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(polar=True))
    for model in models:
        values = [score_matrix[model].get(b, 0) for b in benchmarks]
        values += values[:1]
        ax.plot(angles, values, label=model, linewidth=2)
        ax.fill(angles, values, alpha=0.1)

    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(benchmarks)
    ax.set_ylim(0, 1)
    ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0))
    ax.set_title("Model Benchmark Comparison")
    plt.tight_layout()
    plt.savefig("benchmark_comparison.png")
    plt.show()

# Generate comparison
plot_model_comparison(model_scores)
Key Insights
- No single benchmark tells the whole story - Use multiple benchmarks
- Domain-specific evaluation is crucial - Test on your actual use case (see the sketch after this list)
- Benchmark scores can be gamed - Be skeptical of outlier results
- Human baselines matter - Compare against human performance
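Public benchmarks rarely look like your production traffic, so a small in-house eval usually tells you more. The sketch below reuses the MMLU-style runner from earlier with hypothetical domain questions; the questions are placeholders, not drawn from any published benchmark:

# Hypothetical domain-specific eval: reuse the MMLU-style runner with your own questions
domain_questions = [
    {
        "question": "Which HTTP status code indicates a rate-limited request?",
        "choices": ["400", "401", "429", "503"],
        "answer": 2  # 429
    },
    # ...add questions drawn from your own support tickets, docs, or logs
]

domain_score = runner.run_mmlu_sample(
    model="claude-3-opus-20240229",
    provider="anthropic",
    questions=domain_questions
)
print(f"Domain eval score: {domain_score:.1%}")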
Conclusion
Benchmarks provide useful signals but shouldn’t be the only factor in model selection. Always validate with your specific use cases and data.