LLM Benchmarking: Evaluating Models for Your Use Case
Public benchmarks like MMLU and HumanEval provide useful comparisons, but the real question is: how will the model perform on YOUR tasks? Let’s build a comprehensive evaluation framework.
Understanding Standard Benchmarks
from dataclasses import dataclass
from enum import Enum
from typing import Dict, List

class BenchmarkCategory(Enum):
    KNOWLEDGE = "World Knowledge"
    REASONING = "Reasoning & Logic"
    CODE = "Code Generation"
    MATH = "Mathematical Ability"
    LANGUAGE = "Language Understanding"
    SAFETY = "Safety & Ethics"

@dataclass
class StandardBenchmark:
    name: str
    category: BenchmarkCategory
    description: str
    metric: str
    what_it_measures: str
    limitations: List[str]

standard_benchmarks = {
    "MMLU": StandardBenchmark(
        name="MMLU",
        category=BenchmarkCategory.KNOWLEDGE,
        description="Massive Multitask Language Understanding",
        metric="Accuracy (%)",
        what_it_measures="Knowledge across 57 subjects from STEM to humanities",
        limitations=[
            "Multiple-choice format may not reflect real-world use",
            "Memorization can inflate scores",
            "Doesn't test reasoning depth"
        ]
    ),
    "HumanEval": StandardBenchmark(
        name="HumanEval",
        category=BenchmarkCategory.CODE,
        description="Code generation benchmark",
        metric="pass@k",
        what_it_measures="Ability to generate working Python functions",
        limitations=[
            "Only Python",
            "Relatively simple problems",
            "Doesn't test debugging or refactoring"
        ]
    ),
    "GSM8K": StandardBenchmark(
        name="GSM8K",
        category=BenchmarkCategory.MATH,
        description="Grade School Math 8K",
        metric="Accuracy (%)",
        what_it_measures="Multi-step mathematical reasoning",
        limitations=[
            "Only grade-school level",
            "Word problems may follow predictable patterns",
            "Chain-of-thought prompting inflates scores"
        ]
    ),
    "TruthfulQA": StandardBenchmark(
        name="TruthfulQA",
        category=BenchmarkCategory.SAFETY,
        description="Truthfulness evaluation",
        metric="MC1/MC2 Accuracy",
        what_it_measures="Resistance to common misconceptions",
        limitations=[
            "Limited question set",
            "May not cover domain-specific misinformation",
            "Binary truth assessment is oversimplified"
        ]
    )
}
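Before building anything custom, it helps to know exactly what each public score does and does not tell you. As a quick reference, the small helper below (a sketch; the function name is ours, not from any library) prints the registry defined above:

def print_benchmark_reference(benchmarks: Dict[str, StandardBenchmark]) -> None:
    # Print each benchmark's category, metric, and known blind spots.
    for bench in benchmarks.values():
        print(f"{bench.name} [{bench.category.value}] - metric: {bench.metric}")
        print(f"  Measures: {bench.what_it_measures}")
        for limitation in bench.limitations:
            print(f"  Limitation: {limitation}")

print_benchmark_reference(standard_benchmarks)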
Building a Custom Evaluation Framework
import json
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Any, Dict, List

@dataclass
class EvalExample:
    input_text: str
    expected_output: str
    category: str
    difficulty: str = "medium"
    metadata: Dict = field(default_factory=dict)

@dataclass
class EvalResult:
    example_id: int
    input_text: str
    expected: str
    actual: str
    score: float
    latency_ms: float
    tokens_used: int
    metadata: Dict = field(default_factory=dict)
class Evaluator(ABC):
    @abstractmethod
    def score(self, expected: str, actual: str, context: Dict = None) -> float:
        pass

class ExactMatchEvaluator(Evaluator):
    """Binary score: 1.0 only if the normalized strings match exactly."""
    def score(self, expected: str, actual: str, context: Dict = None) -> float:
        return 1.0 if expected.strip().lower() == actual.strip().lower() else 0.0

class ContainsEvaluator(Evaluator):
    """Binary score: 1.0 if the expected answer appears anywhere in the response."""
    def score(self, expected: str, actual: str, context: Dict = None) -> float:
        return 1.0 if expected.lower() in actual.lower() else 0.0
class SemanticSimilarityEvaluator(Evaluator):
    def __init__(self, embedding_model):
        self.embedding_model = embedding_model

    def score(self, expected: str, actual: str, context: Dict = None) -> float:
        # Use embedding similarity
        emb1 = self.embedding_model.encode(expected)
        emb2 = self.embedding_model.encode(actual)
        # Cosine similarity
        from numpy import dot
        from numpy.linalg import norm
        return float(dot(emb1, emb2) / (norm(emb1) * norm(emb2)))
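
# Note: SemanticSimilarityEvaluator only assumes an object with an .encode() method
# that returns a vector. One way to provide that (a sketch, not the only option) is
# the sentence-transformers package; the model name below is just an example choice.
from sentence_transformers import SentenceTransformer  # pip install sentence-transformers

semantic_evaluator = SemanticSimilarityEvaluator(SentenceTransformer("all-MiniLM-L6-v2"))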
class LLMJudgeEvaluator(Evaluator):
    def __init__(self, judge_model):
        self.judge_model = judge_model

    def score(self, expected: str, actual: str, context: Dict = None) -> float:
        prompt = f"""Rate the quality of the response on a scale of 0-10.

Expected answer: {expected}
Actual response: {actual}

Consider:
- Accuracy of information
- Completeness
- Relevance to the question

Score (0-10):"""
        result = self.judge_model.generate(prompt)
        try:
            score = float(result.strip().split()[0]) / 10
            return min(max(score, 0), 1)  # Clamp to [0, 1]
        except (ValueError, IndexError):
            return 0.5  # Neutral default if the judge output can't be parsed
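
# Variant (sketch): asking the judge for JSON makes the score easier to parse
# than grabbing the first token of free-form text.
class JSONJudgeEvaluator(Evaluator):
    def __init__(self, judge_model):
        self.judge_model = judge_model

    def score(self, expected: str, actual: str, context: Dict = None) -> float:
        prompt = (
            'Rate the response from 0-10 and answer ONLY with JSON like {"score": 7}.\n'
            f"Expected answer: {expected}\n"
            f"Actual response: {actual}"
        )
        result = self.judge_model.generate(prompt)
        try:
            return min(max(float(json.loads(result)["score"]) / 10, 0), 1)
        except (ValueError, KeyError, TypeError):
            return 0.5  # Neutral default when the judge doesn't return valid JSON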
class EvaluationPipeline:
    def __init__(
        self,
        model: Any,  # Your LLM interface
        evaluator: Evaluator,
        examples: List[EvalExample]
    ):
        self.model = model
        self.evaluator = evaluator
        self.examples = examples
        self.results: List[EvalResult] = []

    def run(self, max_tokens: int = 500) -> Dict:
        """Run evaluation on all examples."""
        for i, example in enumerate(self.examples):
            start_time = time.time()

            # Generate response
            response = self.model.generate(
                example.input_text,
                max_tokens=max_tokens
            )
            latency = (time.time() - start_time) * 1000

            # Score response
            score = self.evaluator.score(
                example.expected_output,
                response,
                {"category": example.category}
            )

            self.results.append(EvalResult(
                example_id=i,
                input_text=example.input_text,
                expected=example.expected_output,
                actual=response,
                score=score,
                latency_ms=latency,
                tokens_used=len(response.split()),  # Approximate: whitespace word count, not model tokens
                metadata={"category": example.category, "difficulty": example.difficulty}
            ))

        return self.summarize()

    def summarize(self) -> Dict:
        """Summarize evaluation results."""
        scores = [r.score for r in self.results]
        latencies = [r.latency_ms for r in self.results]

        # Group scores by category
        by_category = {}
        for r in self.results:
            cat = r.metadata.get("category", "unknown")
            if cat not in by_category:
                by_category[cat] = []
            by_category[cat].append(r.score)

        return {
            "total_examples": len(self.results),
            "overall_score": sum(scores) / len(scores),
            "score_std": self._std(scores),
            "avg_latency_ms": sum(latencies) / len(latencies),
            "p95_latency_ms": sorted(latencies)[int(len(latencies) * 0.95)],
            "by_category": {
                cat: sum(cat_scores) / len(cat_scores)
                for cat, cat_scores in by_category.items()
            }
        }

    def _std(self, values: List[float]) -> float:
        mean = sum(values) / len(values)
        variance = sum((x - mean) ** 2 for x in values) / len(values)
        return variance ** 0.5
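Before pointing the pipeline at a real model, it's worth a quick smoke test with a stub. The snippet below is a minimal sketch: EchoModel is a made-up stand-in that exposes the same generate interface the pipeline expects.

class EchoModel:
    """Stub model that returns a canned answer, just to exercise the pipeline."""
    def generate(self, prompt: str, max_tokens: int = 500) -> str:
        return "Paris"

smoke_examples = [
    EvalExample(
        input_text="What is the capital of France?",
        expected_output="Paris",
        category="geography",
        difficulty="easy"
    )
]
print(EvaluationPipeline(EchoModel(), ExactMatchEvaluator(), smoke_examples).run())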
Creating Domain-Specific Test Sets
def create_customer_support_eval_set() -> List[EvalExample]:
    """Create an evaluation set for a customer support use case."""
    return [
        EvalExample(
            input_text="What is your return policy for electronics?",
            expected_output="Our return policy for electronics allows returns within 30 days with original packaging and receipt. Items must be in working condition.",
            category="policy",
            difficulty="easy"
        ),
        EvalExample(
            input_text="My order #12345 hasn't arrived yet. It's been 2 weeks.",
            expected_output="I apologize for the delay. Let me look up order #12345 and provide you with tracking information and estimated delivery date.",
            category="order_inquiry",
            difficulty="medium"
        ),
        EvalExample(
            input_text="I want to speak to a manager right now!",
            expected_output="I understand your frustration. I'd like to try to help resolve your issue first. Could you tell me what happened? If needed, I can escalate to a supervisor.",
            category="escalation",
            difficulty="hard"
        ),
        EvalExample(
            input_text="Can I use multiple discount codes?",
            expected_output="Only one discount code can be applied per order. If you have multiple codes, I recommend using the one with the highest value.",
            category="policy",
            difficulty="easy"
        )
    ]
def create_code_review_eval_set() -> List[EvalExample]:
    """Create an evaluation set for a code review use case."""
    return [
        EvalExample(
            input_text="""Review this code:

def calculate_total(items):
    total = 0
    for item in items:
        total = total + item['price'] * item['quantity']
    return total""",
            expected_output="The code is functional but could be improved: 1) Use the += operator, 2) Consider using sum() with a generator expression, 3) Add type hints, 4) Handle a potential KeyError if price/quantity is missing.",
            category="code_quality",
            difficulty="medium"
        )
    ]
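One of the best practices later in this post is versioning your eval sets. A lightweight way to do that, sketched below with a hypothetical file name, is to serialize examples to JSONL and keep the files in source control:

from dataclasses import asdict

def save_eval_set(examples: List[EvalExample], path: str) -> None:
    # One EvalExample per line, so diffs stay readable in version control.
    with open(path, "w", encoding="utf-8") as f:
        for example in examples:
            f.write(json.dumps(asdict(example)) + "\n")

def load_eval_set(path: str) -> List[EvalExample]:
    with open(path, encoding="utf-8") as f:
        return [EvalExample(**json.loads(line)) for line in f if line.strip()]

save_eval_set(create_customer_support_eval_set(), "customer_support_v1.jsonl")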
Running Model Comparisons
def compare_models(
    models: Dict[str, Any],
    eval_set: List[EvalExample],
    evaluator: Evaluator
) -> Dict[str, Dict]:
    """Compare multiple models on the same evaluation set."""
    results = {}

    for model_name, model in models.items():
        print(f"Evaluating {model_name}...")
        pipeline = EvaluationPipeline(model, evaluator, eval_set)
        results[model_name] = pipeline.run()

    # Create comparison table
    comparison = {
        "models": list(results.keys()),
        "overall_scores": [r["overall_score"] for r in results.values()],
        "avg_latencies": [r["avg_latency_ms"] for r in results.values()]
    }

    return {
        "detailed_results": results,
        "comparison": comparison
    }
# Example usage
"""
models = {
    "gpt-4-turbo": gpt4_client,
    "gpt-35-turbo": gpt35_client,
    "llama-2-70b": llama_client,
    "mixtral-8x7b": mixtral_client
}

eval_set = create_customer_support_eval_set()
evaluator = LLMJudgeEvaluator(judge_model)
results = compare_models(models, eval_set, evaluator)
"""
Best Practices
- Use multiple evaluators - Combine exact match, semantic similarity, and LLM-as-judge (a composite sketch follows this list)
- Test edge cases - Include adversarial and boundary examples
- Version your eval sets - Track changes over time
- Blind evaluation - Remove model names during human review
- Continuously update - Add examples from production failures
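For the first bullet, a weighted blend is one simple way to combine evaluators. The sketch below assumes the evaluator classes defined earlier; the weights are purely illustrative.

class CompositeEvaluator(Evaluator):
    def __init__(self, weighted_evaluators: List[tuple]):
        # List of (evaluator, weight) pairs; weights are normalized at scoring time.
        self.weighted_evaluators = weighted_evaluators

    def score(self, expected: str, actual: str, context: Dict = None) -> float:
        total_weight = sum(weight for _, weight in self.weighted_evaluators)
        return sum(
            evaluator.score(expected, actual, context) * weight
            for evaluator, weight in self.weighted_evaluators
        ) / total_weight

# Example weighting (illustrative): lean mostly on the LLM judge.
# composite = CompositeEvaluator([
#     (ExactMatchEvaluator(), 0.2),
#     (ContainsEvaluator(), 0.3),
#     (LLMJudgeEvaluator(judge_model), 0.5),
# ])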
Tomorrow, we’ll explore Azure OpenAI updates and the Assistants API on Azure!