Task-Specific Evaluation for LLMs: Beyond Generic Benchmarks

Generic benchmarks like MMLU and HumanEval don’t predict performance on your specific use cases. This guide covers how to design and implement task-specific evaluation pipelines.

Why Task-Specific Evaluation?

A model that tops a public leaderboard can still fail on your domain's inputs, output formats, and edge cases. The first step is to write down the tasks you actually ship, the metrics that matter for each, and how many examples you need before you trust the result:

from dataclasses import dataclass
from typing import List, Dict, Callable, Any
from enum import Enum

class TaskType(Enum):
    CLASSIFICATION = "classification"
    EXTRACTION = "extraction"
    GENERATION = "generation"
    SUMMARIZATION = "summarization"
    TRANSLATION = "translation"
    QA = "question_answering"
    CHAT = "conversational"

@dataclass
class EvaluationTask:
    name: str
    task_type: TaskType
    description: str
    metrics: List[str]
    sample_size: int

# Define your specific tasks
my_tasks = [
    EvaluationTask(
        name="customer_intent_classification",
        task_type=TaskType.CLASSIFICATION,
        description="Classify customer messages into intent categories",
        metrics=["accuracy", "f1_macro", "confusion_matrix"],
        sample_size=500
    ),
    EvaluationTask(
        name="contract_clause_extraction",
        task_type=TaskType.EXTRACTION,
        description="Extract key clauses from legal contracts",
        metrics=["precision", "recall", "f1", "exact_match"],
        sample_size=200
    ),
    EvaluationTask(
        name="technical_documentation",
        task_type=TaskType.GENERATION,
        description="Generate API documentation from code",
        metrics=["bleu", "rouge_l", "human_rating"],
        sample_size=100
    )
]

Building a Custom Evaluation Framework

Every evaluator follows the same three steps: prepare a prompt, parse the model's response, and score it against ground truth. An abstract base class captures that contract and handles the model calls and aggregation:

import json
from abc import ABC, abstractmethod
from typing import Optional
import anthropic

class TaskEvaluator(ABC):
    """Base class for task-specific evaluators"""

    def __init__(self, task: EvaluationTask):
        self.task = task
        self.results = []

    @abstractmethod
    def prepare_prompt(self, example: dict) -> str:
        """Prepare the prompt for the model"""
        pass

    @abstractmethod
    def parse_response(self, response: str) -> Any:
        """Parse the model's response"""
        pass

    @abstractmethod
    def score(self, prediction: Any, ground_truth: Any) -> Dict[str, float]:
        """Score the prediction against ground truth"""
        pass

    def evaluate(
        self,
        model: str,
        examples: List[dict],
        client: anthropic.Anthropic
    ) -> Dict[str, float]:
        """Run evaluation on all examples"""

        for example in examples:
            prompt = self.prepare_prompt(example)

            response = client.messages.create(
                model=model,
                max_tokens=1000,
                messages=[{"role": "user", "content": prompt}]
            )

            prediction = self.parse_response(response.content[0].text)
            scores = self.score(prediction, example['ground_truth'])

            self.results.append({
                "example_id": example.get('id'),
                "prediction": prediction,
                "ground_truth": example['ground_truth'],
                "scores": scores
            })

        return self.aggregate_scores()

    def aggregate_scores(self) -> Dict[str, float]:
        """Aggregate scores across all examples"""
        if not self.results:
            return {}

        metric_sums = {}
        for result in self.results:
            for metric, value in result['scores'].items():
                metric_sums[metric] = metric_sums.get(metric, 0) + value

        return {
            metric: total / len(self.results)
            for metric, total in metric_sums.items()
        }

Classification Evaluator

For classification tasks, the evaluator constrains the model to a fixed label set and aggregates results with standard scikit-learn metrics:

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import numpy as np

class ClassificationEvaluator(TaskEvaluator):
    def __init__(self, task: EvaluationTask, categories: List[str]):
        super().__init__(task)
        self.categories = categories

    def prepare_prompt(self, example: dict) -> str:
        categories_str = ", ".join(self.categories)
        return f"""Classify the following text into one of these categories: {categories_str}

Text: {example['text']}

Respond with only the category name, nothing else."""

    def parse_response(self, response: str) -> str:
        response = response.strip().lower()
        # Find best matching category
        for cat in self.categories:
            if cat.lower() in response:
                return cat
        return response  # Return as-is if no match

    def score(self, prediction: str, ground_truth: str) -> Dict[str, float]:
        return {
            "correct": 1.0 if prediction.lower() == ground_truth.lower() else 0.0
        }

    def aggregate_scores(self) -> Dict[str, float]:
        predictions = [r['prediction'].lower() for r in self.results]
        ground_truths = [r['ground_truth'].lower() for r in self.results]

        return {
            "accuracy": accuracy_score(ground_truths, predictions),
            "f1_macro": f1_score(ground_truths, predictions, average='macro', zero_division=0),
            "f1_weighted": f1_score(ground_truths, predictions, average='weighted', zero_division=0)
        }

# Usage
intent_evaluator = ClassificationEvaluator(
    task=my_tasks[0],
    categories=["billing", "technical", "sales", "general"]
)
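
The evaluate() method reads specific fields from each example, so the dataset records need to match. A minimal sketch of what the classification dataset might look like (the ids, texts, and labels here are invented for illustration):

# Hypothetical records for the classification task; 'text' feeds prepare_prompt(),
# while 'id' and 'ground_truth' are read by evaluate() above.
intent_examples = [
    {"id": "msg-001", "text": "I was charged twice this month", "ground_truth": "billing"},
    {"id": "msg-002", "text": "The app crashes when I upload a file", "ground_truth": "technical"},
    {"id": "msg-003", "text": "Do you offer an enterprise plan?", "ground_truth": "sales"},
]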

Extraction Evaluator

For extraction tasks, the model returns JSON that is scored field by field against the expected schema:

from typing import Set

class ExtractionEvaluator(TaskEvaluator):
    def __init__(self, task: EvaluationTask, extraction_schema: dict):
        super().__init__(task)
        self.schema = extraction_schema

    def prepare_prompt(self, example: dict) -> str:
        schema_str = json.dumps(self.schema, indent=2)
        return f"""Extract information from the following document according to this schema:

{schema_str}

Document:
{example['document']}

Return a JSON object with the extracted information."""

    def parse_response(self, response: str) -> dict:
        try:
            # Extract JSON from response
            start = response.find('{')
            end = response.rfind('}') + 1
            if start >= 0 and end > start:
                return json.loads(response[start:end])
        except json.JSONDecodeError:
            pass
        return {}

    def score(self, prediction: dict, ground_truth: dict) -> Dict[str, float]:
        # Calculate field-level metrics
        all_fields = set(ground_truth.keys())
        correct_fields = 0
        total_fields = len(all_fields)

        for field in all_fields:
            pred_value = prediction.get(field, "")
            true_value = ground_truth.get(field, "")

            if str(pred_value).strip().lower() == str(true_value).strip().lower():
                correct_fields += 1

        precision = correct_fields / len(prediction) if prediction else 0
        recall = correct_fields / total_fields if total_fields else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

        return {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "exact_match": 1.0 if prediction == ground_truth else 0.0
        }

# Usage
contract_schema = {
    "parties": "List of party names",
    "effective_date": "Contract start date",
    "term_length": "Duration of contract",
    "payment_terms": "Payment conditions",
    "termination_clause": "Conditions for termination"
}

contract_evaluator = ExtractionEvaluator(
    task=my_tasks[1],
    extraction_schema=contract_schema
)
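
For extraction, each record's ground_truth is a dict keyed by the schema fields, which is what the field-level scoring above compares against. A hypothetical record (values invented for illustration):

# Hypothetical record for the extraction task; 'document' feeds prepare_prompt()
# and 'ground_truth' mirrors the keys of contract_schema.
contract_examples = [
    {
        "id": "contract-001",
        "document": "This Services Agreement is entered into by Acme Pty Ltd and Initech LLC ...",
        "ground_truth": {
            "parties": "Acme Pty Ltd; Initech LLC",
            "effective_date": "2024-01-01",
            "term_length": "12 months",
            "payment_terms": "Net 30",
            "termination_clause": "Either party may terminate with 30 days written notice"
        }
    }
]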

Generation Evaluator with LLM Judge

Free-form generation has no single correct output, so this evaluator asks a second model to rate each generation against a reference on a set of criteria:

class GenerationEvaluator(TaskEvaluator):
    def __init__(self, task: EvaluationTask, criteria: List[str]):
        super().__init__(task)
        self.criteria = criteria
        self.judge_client = anthropic.Anthropic()

    def prepare_prompt(self, example: dict) -> str:
        return f"""Generate technical documentation for the following code:

```python
{example['code']}
```

Include:
- Function description
- Parameters with types
- Return value
- Usage example"""

    def parse_response(self, response: str) -> str:
        return response.strip()

    def score(self, prediction: str, ground_truth: str) -> Dict[str, float]:
        # Use an LLM as the judge: rate the generation on each criterion
        scores = {}

        for criterion in self.criteria:
            judge_prompt = f"""Rate the following generated documentation on {criterion}.

Reference (ideal): {ground_truth}

Generated: {prediction}

Score from 1-5 where:
1 = Very poor
2 = Poor
3 = Acceptable
4 = Good
5 = Excellent

Respond with only the number."""

            response = self.judge_client.messages.create(
                model="claude-3-sonnet-20240229",
                max_tokens=10,
                messages=[{"role": "user", "content": judge_prompt}]
            )

            try:
                score = int(response.content[0].text.strip())
                scores[criterion] = score / 5.0
            except ValueError:
                scores[criterion] = 0.5

        return scores

# Usage
doc_evaluator = GenerationEvaluator(
    task=my_tasks[2],
    criteria=["accuracy", "completeness", "clarity", "code_quality"]
)


Running the Full Evaluation Suite

With the evaluators defined, a suite runs each one against each candidate model and renders a simple report:

class EvaluationSuite:
    def __init__(self, evaluators: List[TaskEvaluator]):
        self.evaluators = evaluators
        self.client = anthropic.Anthropic()

    def run(
        self,
        models: List[str],
        datasets: Dict[str, List[dict]]
    ) -> Dict[str, Dict[str, Dict[str, float]]]:
        """Run all evaluators on all models"""

        results = {}

        for model in models:
            results[model] = {}

            for evaluator in self.evaluators:
                task_name = evaluator.task.name
                examples = datasets.get(task_name, [])

                if examples:
                    scores = evaluator.evaluate(model, examples, self.client)
                    results[model][task_name] = scores

        return results

    def generate_report(self, results: Dict) -> str:
        report = "# Task-Specific Evaluation Report\n\n"

        for model, tasks in results.items():
            report += f"## {model}\n\n"
            for task, scores in tasks.items():
                report += f"### {task}\n"
                for metric, value in scores.items():
                    report += f"- {metric}: {value:.3f}\n"
                report += "\n"

        return report

# Run evaluation
suite = EvaluationSuite([
    intent_evaluator,
    contract_evaluator,
    doc_evaluator
])

# results = suite.run(
#     models=["claude-3-opus-20240229", "claude-3-sonnet-20240229"],
#     datasets=my_datasets
# )
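
The run assumes my_datasets maps each task name to a list of records like the ones sketched earlier (the technical_documentation records would also carry the 'code' field the generation prompt reads). A hypothetical wiring, kept commented out like the run above since it depends on your own data:

# my_datasets = {
#     "customer_intent_classification": intent_examples,
#     "contract_clause_extraction": contract_examples,
#     "technical_documentation": doc_examples,
# }
# report = suite.generate_report(results)
# with open("task_eval_report.md", "w") as f:
#     f.write(report)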

Conclusion

Task-specific evaluation provides actionable insights that generic benchmarks cannot. Invest in building evaluation datasets that reflect your actual use cases and iterate on both your prompts and evaluation criteria.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.