Task-Specific Evaluation for LLMs: Beyond Generic Benchmarks

Generic benchmarks like MMLU and HumanEval don’t predict performance on your specific use cases. This guide covers how to design and implement task-specific evaluation pipelines.

Why Task-Specific Evaluation?

A model that tops a public leaderboard can still fail on your domain's inputs, output formats, and edge cases. The first step is to write down the tasks you actually ship, the metrics that matter for each, and how many examples you need before you trust the result:

from dataclasses import dataclass
from typing import List, Dict, Callable, Any
from enum import Enum

class TaskType(Enum):
    CLASSIFICATION = "classification"
    EXTRACTION = "extraction"
    GENERATION = "generation"
    SUMMARIZATION = "summarization"
    TRANSLATION = "translation"
    QA = "question_answering"
    CHAT = "conversational"

@dataclass
class EvaluationTask:
    name: str
    task_type: TaskType
    description: str
    metrics: List[str]
    sample_size: int

# Define your specific tasks
my_tasks = [
    EvaluationTask(
        name="customer_intent_classification",
        task_type=TaskType.CLASSIFICATION,
        description="Classify customer messages into intent categories",
        metrics=["accuracy", "f1_macro", "confusion_matrix"],
        sample_size=500
    ),
    EvaluationTask(
        name="contract_clause_extraction",
        task_type=TaskType.EXTRACTION,
        description="Extract key clauses from legal contracts",
        metrics=["precision", "recall", "f1", "exact_match"],
        sample_size=200
    ),
    EvaluationTask(
        name="technical_documentation",
        task_type=TaskType.GENERATION,
        description="Generate API documentation from code",
        metrics=["bleu", "rouge_l", "human_rating"],
        sample_size=100
    )
]

Building a Custom Evaluation Framework

Every evaluator follows the same three steps: prepare a prompt, parse the model's response, and score it against ground truth. An abstract base class captures that contract and handles the model calls and aggregation:

import json
from abc import ABC, abstractmethod
from typing import Optional
import anthropic

class TaskEvaluator(ABC):
    """Base class for task-specific evaluators"""

    def __init__(self, task: EvaluationTask):
        self.task = task
        self.results = []

    @abstractmethod
    def prepare_prompt(self, example: dict) -> str:
        """Prepare the prompt for the model"""
        pass

    @abstractmethod
    def parse_response(self, response: str) -> Any:
        """Parse the model's response"""
        pass

    @abstractmethod
    def score(self, prediction: Any, ground_truth: Any) -> Dict[str, float]:
        """Score the prediction against ground truth"""
        pass

    def evaluate(
        self,
        model: str,
        examples: List[dict],
        client: anthropic.Anthropic
    ) -> Dict[str, float]:
        """Run evaluation on all examples"""

        for example in examples:
            prompt = self.prepare_prompt(example)

            response = client.messages.create(
                model=model,
                max_tokens=1000,
                messages=[{"role": "user", "content": prompt}]
            )

            prediction = self.parse_response(response.content[0].text)
            scores = self.score(prediction, example['ground_truth'])

            self.results.append({
                "example_id": example.get('id'),
                "prediction": prediction,
                "ground_truth": example['ground_truth'],
                "scores": scores
            })

        return self.aggregate_scores()

    def aggregate_scores(self) -> Dict[str, float]:
        """Aggregate scores across all examples"""
        if not self.results:
            return {}

        metric_sums = {}
        for result in self.results:
            for metric, value in result['scores'].items():
                metric_sums[metric] = metric_sums.get(metric, 0) + value

        return {
            metric: total / len(self.results)
            for metric, total in metric_sums.items()
        }

Classification Evaluator

For classification tasks, the evaluator constrains the model to a fixed label set and aggregates results with standard scikit-learn metrics:

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import numpy as np

class ClassificationEvaluator(TaskEvaluator):
    def __init__(self, task: EvaluationTask, categories: List[str]):
        super().__init__(task)
        self.categories = categories

    def prepare_prompt(self, example: dict) -> str:
        categories_str = ", ".join(self.categories)
        return f"""Classify the following text into one of these categories: {categories_str}

Text: {example['text']}

Respond with only the category name, nothing else."""

    def parse_response(self, response: str) -> str:
        response = response.strip().lower()
        # Find best matching category
        for cat in self.categories:
            if cat.lower() in response:
                return cat
        return response  # Return as-is if no match

    def score(self, prediction: str, ground_truth: str) -> Dict[str, float]:
        return {
            "correct": 1.0 if prediction.lower() == ground_truth.lower() else 0.0
        }

    def aggregate_scores(self) -> Dict[str, float]:
        predictions = [r['prediction'].lower() for r in self.results]
        ground_truths = [r['ground_truth'].lower() for r in self.results]

        return {
            "accuracy": accuracy_score(ground_truths, predictions),
            "f1_macro": f1_score(ground_truths, predictions, average='macro', zero_division=0),
            "f1_weighted": f1_score(ground_truths, predictions, average='weighted', zero_division=0)
        }

# Usage
intent_evaluator = ClassificationEvaluator(
    task=my_tasks[0],
    categories=["billing", "technical", "sales", "general"]
)
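
The evaluate() method reads specific fields from each example, so the dataset records need to match. A minimal sketch of what the classification dataset might look like (the ids, texts, and labels here are invented for illustration):

# Hypothetical records for the classification task; 'text' feeds prepare_prompt(),
# while 'id' and 'ground_truth' are read by evaluate() above.
intent_examples = [
    {"id": "msg-001", "text": "I was charged twice this month", "ground_truth": "billing"},
    {"id": "msg-002", "text": "The app crashes when I upload a file", "ground_truth": "technical"},
    {"id": "msg-003", "text": "Do you offer an enterprise plan?", "ground_truth": "sales"},
]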

Extraction Evaluator

For extraction tasks, the model returns JSON that is scored field by field against the expected schema:

from typing import Set

class ExtractionEvaluator(TaskEvaluator):
    def __init__(self, task: EvaluationTask, extraction_schema: dict):
        super().__init__(task)
        self.schema = extraction_schema

    def prepare_prompt(self, example: dict) -> str:
        schema_str = json.dumps(self.schema, indent=2)
        return f"""Extract information from the following document according to this schema:

{schema_str}

Document:
{example['document']}

Return a JSON object with the extracted information."""

    def parse_response(self, response: str) -> dict:
        try:
            # Extract JSON from response
            start = response.find('{')
            end = response.rfind('}') + 1
            if start >= 0 and end > start:
                return json.loads(response[start:end])
        except json.JSONDecodeError:
            pass
        return {}

    def score(self, prediction: dict, ground_truth: dict) -> Dict[str, float]:
        # Calculate field-level metrics
        all_fields = set(ground_truth.keys())
        correct_fields = 0
        total_fields = len(all_fields)

        for field in all_fields:
            pred_value = prediction.get(field, "")
            true_value = ground_truth.get(field, "")

            if str(pred_value).strip().lower() == str(true_value).strip().lower():
                correct_fields += 1

        precision = correct_fields / len(prediction) if prediction else 0
        recall = correct_fields / total_fields if total_fields else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

        return {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "exact_match": 1.0 if prediction == ground_truth else 0.0
        }

# Usage
contract_schema = {
    "parties": "List of party names",
    "effective_date": "Contract start date",
    "term_length": "Duration of contract",
    "payment_terms": "Payment conditions",
    "termination_clause": "Conditions for termination"
}

contract_evaluator = ExtractionEvaluator(
    task=my_tasks[1],
    extraction_schema=contract_schema
)
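
For extraction, each record's ground_truth is a dict keyed by the schema fields, which is what the field-level scoring above compares against. A hypothetical record (values invented for illustration):

# Hypothetical record for the extraction task; 'document' feeds prepare_prompt()
# and 'ground_truth' mirrors the keys of contract_schema.
contract_examples = [
    {
        "id": "contract-001",
        "document": "This Services Agreement is entered into by Acme Pty Ltd and Initech LLC ...",
        "ground_truth": {
            "parties": "Acme Pty Ltd; Initech LLC",
            "effective_date": "2024-01-01",
            "term_length": "12 months",
            "payment_terms": "Net 30",
            "termination_clause": "Either party may terminate with 30 days written notice"
        }
    }
]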

Generation Evaluator with LLM Judge

Free-form generation has no single correct output, so this evaluator asks a second model to rate each generation against a reference on a set of criteria:

class GenerationEvaluator(TaskEvaluator):
    def __init__(self, task: EvaluationTask, criteria: List[str]):
        super().__init__(task)
        self.criteria = criteria
        self.judge_client = anthropic.Anthropic()

    def prepare_prompt(self, example: dict) -> str:
        return f"""Generate technical documentation for the following code:

```python
{example['code']}
```

Include:
- Function description
- Parameters with types
- Return value
- Usage example"""

    def parse_response(self, response: str) -> str:
        return response.strip()

    def score(self, prediction: str, ground_truth: str) -> Dict[str, float]:
        # Use an LLM as the judge: rate the generation on each criterion
        scores = {}

        for criterion in self.criteria:
            judge_prompt = f"""Rate the following generated documentation on {criterion}.

Reference (ideal): {ground_truth}

Generated: {prediction}

Score from 1-5 where:
1 = Very poor
2 = Poor
3 = Acceptable
4 = Good
5 = Excellent

Respond with only the number."""

            response = self.judge_client.messages.create(
                model="claude-3-sonnet-20240229",
                max_tokens=10,
                messages=[{"role": "user", "content": judge_prompt}]
            )

            try:
                score = int(response.content[0].text.strip())
                scores[criterion] = score / 5.0
            except ValueError:
                scores[criterion] = 0.5

        return scores

# Usage
doc_evaluator = GenerationEvaluator(
    task=my_tasks[2],
    criteria=["accuracy", "completeness", "clarity", "code_quality"]
)


Running the Full Evaluation Suite

With the evaluators defined, a suite runs each one against each candidate model and renders a simple report:

class EvaluationSuite:
    def __init__(self, evaluators: List[TaskEvaluator]):
        self.evaluators = evaluators
        self.client = anthropic.Anthropic()

    def run(
        self,
        models: List[str],
        datasets: Dict[str, List[dict]]
    ) -> Dict[str, Dict[str, Dict[str, float]]]:
        """Run all evaluators on all models"""

        results = {}

        for model in models:
            results[model] = {}

            for evaluator in self.evaluators:
                task_name = evaluator.task.name
                examples = datasets.get(task_name, [])

                if examples:
                    scores = evaluator.evaluate(model, examples, self.client)
                    results[model][task_name] = scores

        return results

    def generate_report(self, results: Dict) -> str:
        report = "# Task-Specific Evaluation Report\n\n"

        for model, tasks in results.items():
            report += f"## {model}\n\n"
            for task, scores in tasks.items():
                report += f"### {task}\n"
                for metric, value in scores.items():
                    report += f"- {metric}: {value:.3f}\n"
                report += "\n"

        return report

# Run evaluation
suite = EvaluationSuite([
    intent_evaluator,
    contract_evaluator,
    doc_evaluator
])

# results = suite.run(
#     models=["claude-3-opus-20240229", "claude-3-sonnet-20240229"],
#     datasets=my_datasets
# )
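
The run assumes my_datasets maps each task name to a list of records like the ones sketched earlier (the technical_documentation records would also carry the 'code' field the generation prompt reads). A hypothetical wiring, kept commented out like the run above since it depends on your own data:

# my_datasets = {
#     "customer_intent_classification": intent_examples,
#     "contract_clause_extraction": contract_examples,
#     "technical_documentation": doc_examples,
# }
# report = suite.generate_report(results)
# with open("task_eval_report.md", "w") as f:
#     f.write(report)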

Conclusion

Task-specific evaluation provides actionable insights that generic benchmarks cannot. Invest in building evaluation datasets that reflect your actual use cases and iterate on both your prompts and evaluation criteria.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.