MMLU Scores Explained: What They Really Mean
MMLU (Massive Multitask Language Understanding) is one of the most cited LLM benchmarks. Understanding what it measures and its limitations is crucial for interpreting model comparisons.
What is MMLU?
MMLU tests knowledge across 57 subjects organized into four categories: STEM, Humanities, Social Sciences, and Other. The listing below covers a representative subset:
from dataclasses import dataclass
from typing import Dict, List

@dataclass
class MMLUSubject:
    name: str
    category: str
    num_questions: int

# Partial listing of MMLU subjects (the full benchmark covers 57)
MMLU_SUBJECTS = [
    # STEM
    MMLUSubject("abstract_algebra", "STEM", 100),
    MMLUSubject("anatomy", "STEM", 135),
    MMLUSubject("astronomy", "STEM", 152),
    MMLUSubject("college_biology", "STEM", 144),
    MMLUSubject("college_chemistry", "STEM", 100),
    MMLUSubject("college_computer_science", "STEM", 100),
    MMLUSubject("college_mathematics", "STEM", 100),
    MMLUSubject("college_physics", "STEM", 102),
    MMLUSubject("computer_security", "STEM", 100),
    MMLUSubject("electrical_engineering", "STEM", 145),
    MMLUSubject("machine_learning", "STEM", 112),
    # Humanities
    MMLUSubject("formal_logic", "Humanities", 126),
    MMLUSubject("high_school_european_history", "Humanities", 165),
    MMLUSubject("high_school_us_history", "Humanities", 204),
    MMLUSubject("high_school_world_history", "Humanities", 237),
    MMLUSubject("international_law", "Humanities", 121),
    MMLUSubject("jurisprudence", "Humanities", 108),
    MMLUSubject("logical_fallacies", "Humanities", 163),
    MMLUSubject("moral_disputes", "Humanities", 346),
    MMLUSubject("moral_scenarios", "Humanities", 895),
    MMLUSubject("philosophy", "Humanities", 311),
    MMLUSubject("prehistory", "Humanities", 324),
    MMLUSubject("professional_law", "Humanities", 1534),
    MMLUSubject("world_religions", "Humanities", 171),
    # Social Sciences
    MMLUSubject("econometrics", "Social Sciences", 114),
    MMLUSubject("high_school_geography", "Social Sciences", 198),
    MMLUSubject("high_school_government_and_politics", "Social Sciences", 193),
    MMLUSubject("high_school_macroeconomics", "Social Sciences", 390),
    MMLUSubject("high_school_microeconomics", "Social Sciences", 238),
    MMLUSubject("high_school_psychology", "Social Sciences", 545),
    MMLUSubject("human_sexuality", "Social Sciences", 131),
    MMLUSubject("professional_psychology", "Social Sciences", 612),
    MMLUSubject("public_relations", "Social Sciences", 110),
    MMLUSubject("security_studies", "Social Sciences", 245),
    MMLUSubject("sociology", "Social Sciences", 201),
    MMLUSubject("us_foreign_policy", "Social Sciences", 110),
    # Other
    MMLUSubject("business_ethics", "Other", 100),
    MMLUSubject("clinical_knowledge", "Other", 265),
    MMLUSubject("college_medicine", "Other", 173),
    MMLUSubject("global_facts", "Other", 100),
    MMLUSubject("management", "Other", 103),
    MMLUSubject("marketing", "Other", 234),
    MMLUSubject("medical_genetics", "Other", 100),
    MMLUSubject("miscellaneous", "Other", 783),
    MMLUSubject("nutrition", "Other", 306),
    MMLUSubject("professional_accounting", "Other", 282),
    MMLUSubject("professional_medicine", "Other", 272),
    MMLUSubject("virology", "Other", 166),
]

def get_category_stats():
    """Print subject and question counts by category."""
    categories: Dict[str, List[MMLUSubject]] = {}
    for subject in MMLU_SUBJECTS:
        if subject.category not in categories:
            categories[subject.category] = []
        categories[subject.category].append(subject)
    for category, subjects in categories.items():
        total_questions = sum(s.num_questions for s in subjects)
        print(f"{category}: {len(subjects)} subjects, {total_questions} questions")

get_category_stats()
MMLU Question Format
def format_mmlu_question(question: dict) -> str:
    """Format an MMLU question as a multiple-choice prompt."""
    formatted = f"""Question: {question['question']}
A) {question['choices'][0]}
B) {question['choices'][1]}
C) {question['choices'][2]}
D) {question['choices'][3]}
Answer:"""
    return formatted

# Example MMLU-style question
example_question = {
    "question": "What is the time complexity of binary search?",
    "choices": ["O(1)", "O(n)", "O(log n)", "O(n log n)"],
    "answer": 2,  # index into choices: C) O(log n)
}

print(format_mmlu_question(example_question))
Running MMLU Evaluation
import anthropic
from typing import List, Optional, Tuple

class MMLUEvaluator:
    def __init__(self):
        self.client = anthropic.Anthropic()

    def evaluate_subject(
        self,
        model: str,
        questions: List[dict],
        few_shot_examples: Optional[List[dict]] = None
    ) -> Tuple[float, List[dict]]:
        """Evaluate a model on one MMLU subject; returns (accuracy, per-question results)."""
        results = []
        correct = 0
        for q in questions:
            # Build the prompt, prepending few-shot examples if provided
            prompt = ""
            if few_shot_examples:
                for ex in few_shot_examples:
                    prompt += format_mmlu_question(ex)
                    answer_letter = chr(65 + ex['answer'])
                    prompt += f" {answer_letter}\n\n"
            prompt += format_mmlu_question(q)
            # Ask for a single token: the answer letter
            response = self.client.messages.create(
                model=model,
                max_tokens=1,
                messages=[{"role": "user", "content": prompt}]
            )
            # Keep only the first character in case the model emits e.g. "A)"
            predicted = response.content[0].text.strip().upper()[:1]
            expected = chr(65 + q['answer'])
            is_correct = predicted == expected
            if is_correct:
                correct += 1
            results.append({
                "question": q['question'],
                "predicted": predicted,
                "expected": expected,
                "correct": is_correct
            })
        accuracy = correct / len(questions)
        return accuracy, results

    def evaluate_full_mmlu(
        self,
        model: str,
        dataset: dict,
        num_few_shot: int = 5
    ) -> dict:
        """Evaluate on the full MMLU dataset."""
        category_scores = {}
        subject_scores = {}
        for subject_name, data in dataset.items():
            # Hold out the first questions as few-shot examples.
            # (The official MMLU setup uses a separate dev split of 5
            # questions per subject for this purpose.)
            few_shot = data['questions'][:num_few_shot]
            test_questions = data['questions'][num_few_shot:]
            accuracy, _ = self.evaluate_subject(
                model=model,
                questions=test_questions,
                few_shot_examples=few_shot
            )
            subject_scores[subject_name] = accuracy
            # Aggregate by category
            category = data['category']
            if category not in category_scores:
                category_scores[category] = []
            category_scores[category].append(accuracy)
        # Calculate averages
        results = {
            "subject_scores": subject_scores,
            "category_averages": {
                cat: sum(scores) / len(scores)
                for cat, scores in category_scores.items()
            },
            "overall_average": sum(subject_scores.values()) / len(subject_scores)
        }
        return results

# Usage
evaluator = MMLUEvaluator()
# results = evaluator.evaluate_full_mmlu("claude-3-opus-20240229", mmlu_dataset)
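evaluate_full_mmlu expects a dict mapping each subject name to its category and question list. One way to build that structure is from the cais/mmlu dataset on Hugging Face; the sketch below is a hedged example, and the field names ('question', 'choices', 'answer') are an assumption to verify against the copy you download:

from datasets import load_dataset  # pip install datasets

def load_mmlu_subject(subject: str, category: str) -> dict:
    """Load one subject's test split into the shape evaluate_full_mmlu expects."""
    # Assumption: cais/mmlu exposes per-subject configs with a "test" split
    rows = load_dataset("cais/mmlu", subject, split="test")
    return {
        "category": category,
        "questions": [
            {"question": r["question"], "choices": r["choices"], "answer": r["answer"]}
            for r in rows
        ],
    }

# mmlu_dataset = {s.name: load_mmlu_subject(s.name, s.category) for s in MMLU_SUBJECTS}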
Interpreting MMLU Scores
def interpret_mmlu_score(score: float) -> str:
    """Interpret an MMLU score in context."""
    if score >= 0.90:
        level = "Expert level"
        context = "Approaching human expert performance"
    elif score >= 0.80:
        level = "Strong performance"
        context = "College graduate level"
    elif score >= 0.70:
        level = "Good performance"
        context = "Undergraduate level"
    elif score >= 0.60:
        level = "Moderate performance"
        context = "High school advanced level"
    elif score >= 0.50:
        level = "Basic performance"
        context = "High school level"
    else:
        level = "Below baseline"
        context = "Random guessing scores 25%"
    return f"{level} ({score:.1%}): {context}"

# Reported 5-shot MMLU scores (approximate, as published by the vendors)
model_mmlu_scores = {
    "Claude 3 Opus": 0.868,
    "GPT-4": 0.864,
    "Claude 3 Sonnet": 0.790,
    "Llama 2 70B": 0.689,
    "Mistral Large": 0.812,
}

for model, score in model_mmlu_scores.items():
    print(f"{model}: {interpret_mmlu_score(score)}")
MMLU Limitations
# Key limitations to consider
limitations = {
    "multiple_choice_bias": """
    MMLU uses a multiple-choice format, which may not reflect
    real-world task performance where open-ended responses
    are required.
    """,
    "memorization_risk": """
    Popular benchmarks like MMLU may be partially memorized
    by models trained on large internet corpora.
    """,
    "static_knowledge": """
    MMLU tests factual knowledge that may become outdated.
    It doesn't measure reasoning ability directly.
    """,
    "english_only": """
    MMLU is English-only and may not reflect multilingual
    capabilities.
    """,
    "answer_distribution": """
    An uneven answer distribution across questions could allow
    gaming through pattern recognition.
    """
}

def print_limitations():
    print("MMLU Benchmark Limitations:\n")
    for limitation, description in limitations.items():
        print(f"- {limitation.replace('_', ' ').title()}")
        print(f"  {description.strip()}\n")

print_limitations()
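The answer-distribution concern is easy to check on any question set you have. A minimal sketch, assuming questions are dicts with an integer 'answer' index as in the examples above:

from collections import Counter

def answer_letter_distribution(questions: list) -> Counter:
    """Count how often each letter (A-D) is the correct answer."""
    return Counter(chr(65 + q["answer"]) for q in questions)

# Tiny demo with the single example_question defined earlier;
# in practice, run this over a full subject's question list.
print(answer_letter_distribution([example_question]))  # Counter({'C': 1})

A heavily skewed count would suggest a subject could be partially gamed by always guessing the most common letter.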
Best Practices for Using MMLU
- Use alongside other benchmarks - MMLU alone gives an incomplete picture of capability
- Compare few-shot and zero-shot results - Scores can differ significantly between the two setups (see the sketch after this list)
- Look at category breakdowns - A single overall score can hide weaknesses in specific areas
- Test on your own domain - MMLU may not predict performance on your actual tasks
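As a concrete example of the few-shot point, the evaluator defined earlier already supports both setups. A minimal sketch comparing them on one subject (the model name, subject key, and mmlu_dataset variable are placeholders carried over from the usage comments above):

evaluator = MMLUEvaluator()
subject = mmlu_dataset["college_computer_science"]  # placeholder dataset from earlier
dev, test = subject["questions"][:5], subject["questions"][5:]

# Zero-shot: no solved examples in the prompt
zero_shot_acc, _ = evaluator.evaluate_subject(
    model="claude-3-opus-20240229",
    questions=test,
)

# 5-shot: prepend five solved examples to every prompt
few_shot_acc, _ = evaluator.evaluate_subject(
    model="claude-3-opus-20240229",
    questions=test,
    few_shot_examples=dev,
)

print(f"Zero-shot: {zero_shot_acc:.1%}, 5-shot: {few_shot_acc:.1%}")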
Conclusion
MMLU provides a useful signal for general knowledge capabilities, but it should be one of many factors in model evaluation. Always validate with domain-specific testing for production use cases.