Generation Metrics for RAG: Measuring Answer Quality
While retrieval metrics measure which documents are found, generation metrics evaluate the quality of the synthesized answer. This guide covers metrics specific to evaluating RAG answer generation.
Categories of Generation Metrics
from enum import Enum
from dataclasses import dataclass
from typing import List, Dict, Optional

class MetricCategory(Enum):
    LEXICAL = "lexical"            # Word overlap-based
    SEMANTIC = "semantic"          # Meaning-based
    FAITHFULNESS = "faithfulness"  # Grounded in context
    FLUENCY = "fluency"            # Language quality
    RELEVANCE = "relevance"        # Answers the question

@dataclass
class GenerationMetric:
    name: str
    category: MetricCategory
    description: str
    requires_reference: bool
    requires_context: bool
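The requires_reference and requires_context flags matter in practice: reference-free metrics can run on live traffic, while reference-based ones need a labeled evaluation set. Below is a minimal sketch of how the dataclass might be used to pick which metrics can run; the registry entries and the metrics_runnable helper are illustrative additions, not a standard API.

# Illustrative registry built on the dataclass above (entries are examples, not exhaustive)
METRIC_REGISTRY: Dict[str, GenerationMetric] = {
    "rouge_l": GenerationMetric(
        name="ROUGE-L",
        category=MetricCategory.LEXICAL,
        description="Longest-common-subsequence overlap with a reference answer",
        requires_reference=True,
        requires_context=False,
    ),
    "groundedness": GenerationMetric(
        name="Groundedness",
        category=MetricCategory.FAITHFULNESS,
        description="Whether the answer's claims are supported by the retrieved context",
        requires_reference=False,
        requires_context=True,
    ),
}

def metrics_runnable(has_reference: bool, has_context: bool) -> List[str]:
    """Return the metric keys that can be computed given the available inputs."""
    return [
        key for key, m in METRIC_REGISTRY.items()
        if (not m.requires_reference or has_reference)
        and (not m.requires_context or has_context)
    ]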
Lexical Metrics
from collections import Counter
import numpy as np

class LexicalMetrics:
    """Word-overlap based metrics"""

    @staticmethod
    def rouge_n(hypothesis: str, reference: str, n: int = 1) -> Dict[str, float]:
        """
        ROUGE-N: N-gram overlap between hypothesis and reference
        Returns precision, recall, and F1
        """
        def get_ngrams(text: str, n: int) -> Counter:
            tokens = text.lower().split()
            return Counter(tuple(tokens[i:i+n]) for i in range(len(tokens) - n + 1))

        hyp_ngrams = get_ngrams(hypothesis, n)
        ref_ngrams = get_ngrams(reference, n)

        overlap = sum((hyp_ngrams & ref_ngrams).values())
        hyp_total = sum(hyp_ngrams.values())
        ref_total = sum(ref_ngrams.values())

        precision = overlap / hyp_total if hyp_total > 0 else 0
        recall = overlap / ref_total if ref_total > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        return {"precision": precision, "recall": recall, "f1": f1}

    @staticmethod
    def rouge_l(hypothesis: str, reference: str) -> Dict[str, float]:
        """
        ROUGE-L: Longest Common Subsequence
        Better at capturing sentence-level structure
        """
        def lcs_length(x: List[str], y: List[str]) -> int:
            m, n = len(x), len(y)
            dp = [[0] * (n + 1) for _ in range(m + 1)]
            for i in range(1, m + 1):
                for j in range(1, n + 1):
                    if x[i-1] == y[j-1]:
                        dp[i][j] = dp[i-1][j-1] + 1
                    else:
                        dp[i][j] = max(dp[i-1][j], dp[i][j-1])
            return dp[m][n]

        hyp_tokens = hypothesis.lower().split()
        ref_tokens = reference.lower().split()

        lcs = lcs_length(hyp_tokens, ref_tokens)
        precision = lcs / len(hyp_tokens) if hyp_tokens else 0
        recall = lcs / len(ref_tokens) if ref_tokens else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        return {"precision": precision, "recall": recall, "f1": f1}

    @staticmethod
    def bleu(hypothesis: str, reference: str, max_n: int = 4) -> float:
        """
        BLEU: Bilingual Evaluation Understudy
        Geometric mean of n-gram precisions with brevity penalty
        """
        def get_ngrams(text: str, n: int) -> Counter:
            tokens = text.lower().split()
            return Counter(tuple(tokens[i:i+n]) for i in range(len(tokens) - n + 1))

        hyp_tokens = hypothesis.lower().split()
        ref_tokens = reference.lower().split()

        # Brevity penalty: penalize hypotheses shorter than the reference
        if len(hyp_tokens) <= len(ref_tokens):
            bp = np.exp(1 - len(ref_tokens) / len(hyp_tokens)) if hyp_tokens else 0
        else:
            bp = 1.0

        # N-gram precisions for n = 1..max_n
        precisions = []
        for n in range(1, max_n + 1):
            hyp_ngrams = get_ngrams(hypothesis, n)
            ref_ngrams = get_ngrams(reference, n)
            overlap = sum((hyp_ngrams & ref_ngrams).values())
            total = sum(hyp_ngrams.values())
            precision = overlap / total if total > 0 else 0
            precisions.append(precision)

        # Geometric mean (zero if any precision is zero -- no smoothing)
        if all(p > 0 for p in precisions):
            geo_mean = np.exp(np.mean(np.log(precisions)))
        else:
            geo_mean = 0
        return bp * geo_mean

    @staticmethod
    def meteor(hypothesis: str, reference: str) -> float:
        """
        Simplified METEOR-like scoring
        Unigram matching with a recall-weighted harmonic mean
        """
        # Simplified version: no stemming or synonym matching, just unigram overlap
        hyp_words = set(hypothesis.lower().split())
        ref_words = set(reference.lower().split())

        matches = len(hyp_words & ref_words)
        precision = matches / len(hyp_words) if hyp_words else 0
        recall = matches / len(ref_words) if ref_words else 0

        # METEOR uses a recall-weighted harmonic mean (alpha = 0.9)
        alpha = 0.9
        if precision + recall > 0:
            fmean = (precision * recall) / (alpha * precision + (1 - alpha) * recall)
        else:
            fmean = 0
        return fmean
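A quick usage sketch of the lexical metrics (the example sentences are made up for illustration):

hypothesis = "the cat sat on the mat"
reference = "the cat lay on the mat"

print(LexicalMetrics.rouge_n(hypothesis, reference, n=1))  # high unigram overlap
print(LexicalMetrics.rouge_n(hypothesis, reference, n=2))  # lower bigram overlap
print(LexicalMetrics.rouge_l(hypothesis, reference))       # LCS-based precision/recall/F1
print(LexicalMetrics.bleu(hypothesis, reference))          # 0.0 here: no 4-gram match, so the geometric mean collapses
print(LexicalMetrics.meteor(hypothesis, reference))        # recall-weighted unigram F-mean

Because these scores depend only on surface tokens, a correct paraphrase such as "the cat rested on the rug" would score poorly, which is exactly the gap the semantic metrics below are meant to close.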
Semantic Metrics
import anthropic

class SemanticMetrics:
    """Meaning-based metrics using LLMs"""

    def __init__(self):
        self.client = anthropic.Anthropic()

    def semantic_similarity(self, text1: str, text2: str) -> float:
        """
        Semantic similarity between two texts using an LLM
        Returns a score from 0 (different) to 1 (same meaning)
        """
        prompt = f"""Rate the semantic similarity between these two texts.
Consider meaning, not exact wording.

Text 1: {text1}
Text 2: {text2}

Score from 0 to 1:
- 0.0: Completely different meanings
- 0.5: Related but different
- 1.0: Same meaning

Respond with only the number."""

        response = self.client.messages.create(
            model="claude-3-sonnet-20240229",
            max_tokens=10,
            messages=[{"role": "user", "content": prompt}]
        )
        try:
            return float(response.content[0].text.strip())
        except (ValueError, IndexError):
            return 0.5

    def entailment_score(self, premise: str, hypothesis: str) -> float:
        """
        Check whether the premise entails the hypothesis
        Used to verify if a generated answer follows from the context
        """
        prompt = f"""Determine if the premise entails the hypothesis.
Entailment means the hypothesis logically follows from the premise.

Premise: {premise}
Hypothesis: {hypothesis}

Score:
- 1.0: Definitely entails
- 0.5: Neutral/unclear
- 0.0: Contradicts

Respond with only the number."""

        response = self.client.messages.create(
            model="claude-3-sonnet-20240229",
            max_tokens=10,
            messages=[{"role": "user", "content": prompt}]
        )
        try:
            return float(response.content[0].text.strip())
        except (ValueError, IndexError):
            return 0.5

    def bertscore_approximation(
        self,
        hypothesis: str,
        reference: str
    ) -> Dict[str, float]:
        """
        BERTScore approximation using an LLM
        Real BERTScore uses BERT embeddings; this approximates it semantically
        """
        prompt = f"""Evaluate the semantic overlap between these texts.
Consider meaning similarity at the phrase level.

Reference: {reference}
Generated: {hypothesis}

Rate precision (how much of the generated text is supported by the reference), recall (how much of the reference is covered by the generated text), and F1.

Format: P: X.XX, R: X.XX, F1: X.XX"""

        response = self.client.messages.create(
            model="claude-3-sonnet-20240229",
            max_tokens=50,
            messages=[{"role": "user", "content": prompt}]
        )
        text = response.content[0].text
        try:
            # Parse "P: X.XX, R: X.XX, F1: X.XX"
            parts = text.split(",")
            precision = float(parts[0].split(":")[1].strip())
            recall = float(parts[1].split(":")[1].strip())
            f1 = float(parts[2].split(":")[1].strip())
            return {"precision": precision, "recall": recall, "f1": f1}
        except (ValueError, IndexError):
            return {"precision": 0.5, "recall": 0.5, "f1": 0.5}
RAG-Specific Generation Metrics
class RAGGenerationMetrics:
    """Metrics specific to RAG answer generation"""

    def __init__(self):
        self.client = anthropic.Anthropic()

    def groundedness(self, answer: str, context: str) -> float:
        """
        Groundedness: Is the answer grounded in the context?
        Key metric for RAG - penalizes hallucination
        """
        prompt = f"""Evaluate how grounded this answer is in the given context.
A grounded answer only contains information that can be found or inferred from the context.

Context:
{context}

Answer:
{answer}

Score:
- 1.0: Fully grounded, all information from context
- 0.5: Partially grounded, some unsupported claims
- 0.0: Not grounded, mostly fabricated

Respond with only the number."""

        response = self.client.messages.create(
            model="claude-3-sonnet-20240229",
            max_tokens=10,
            messages=[{"role": "user", "content": prompt}]
        )
        try:
            return float(response.content[0].text.strip())
        except (ValueError, IndexError):
            return 0.5

    def completeness(
        self,
        answer: str,
        question: str,
        context: str
    ) -> float:
        """
        Completeness: Does the answer cover all relevant information?
        """
        prompt = f"""Evaluate the completeness of this answer.
Consider what information is available in the context to answer the question.

Question: {question}

Context:
{context}

Answer:
{answer}

Score:
- 1.0: Completely answers the question using available info
- 0.5: Partially complete
- 0.0: Very incomplete

Respond with only the number."""

        response = self.client.messages.create(
            model="claude-3-sonnet-20240229",
            max_tokens=10,
            messages=[{"role": "user", "content": prompt}]
        )
        try:
            return float(response.content[0].text.strip())
        except (ValueError, IndexError):
            return 0.5

    def coherence(self, answer: str) -> float:
        """
        Coherence: Is the answer well-structured and logical?
        """
        prompt = f"""Evaluate the coherence of this answer.
Consider logical flow, structure, and clarity.

Answer:
{answer}

Score:
- 1.0: Very coherent and well-structured
- 0.5: Somewhat coherent
- 0.0: Incoherent

Respond with only the number."""

        response = self.client.messages.create(
            model="claude-3-sonnet-20240229",
            max_tokens=10,
            messages=[{"role": "user", "content": prompt}]
        )
        try:
            return float(response.content[0].text.strip())
        except (ValueError, IndexError):
            return 0.5

    def evaluate(
        self,
        questions: List[str],
        answers: List[str],
        contexts: List[str],
        references: Optional[List[str]] = None
    ) -> Dict[str, float]:
        """Run comprehensive generation evaluation"""
        lexical = LexicalMetrics()

        metrics = {
            "groundedness": [],
            "completeness": [],
            "coherence": [],
            "rouge_l_f1": [],
            "bleu": []
        }

        for i, (q, a, c) in enumerate(zip(questions, answers, contexts)):
            # Reference-free, LLM-judged metrics
            metrics["groundedness"].append(self.groundedness(a, c))
            metrics["completeness"].append(self.completeness(a, q, c))
            metrics["coherence"].append(self.coherence(a))
            # Reference-based lexical metrics, only when gold answers exist
            if references:
                metrics["rouge_l_f1"].append(lexical.rouge_l(a, references[i])["f1"])
                metrics["bleu"].append(lexical.bleu(a, references[i]))

        # Average each metric; drop metrics that collected no scores
        return {k: np.mean(v) for k, v in metrics.items() if v}
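Putting it together, evaluate() returns one averaged score per metric, and the reference-based entries (ROUGE-L, BLEU) appear only when gold answers are supplied. A minimal end-to-end sketch with made-up example data:

evaluator = RAGGenerationMetrics()

questions = ["What is the refund window?"]
contexts = ["Our policy allows refunds within 30 days of purchase with a receipt."]
answers = ["You can get a refund within 30 days if you have a receipt."]
references = ["Refunds are available for 30 days after purchase with proof of receipt."]

scores = evaluator.evaluate(questions, answers, contexts, references)
for name, value in scores.items():
    print(f"{name}: {value:.2f}")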
Conclusion
Generation metrics for RAG must evaluate both the quality of the output and its faithfulness to the retrieved documents. Lexical metrics are cheap, fast, and reproducible but blind to paraphrase; LLM-based judges capture meaning, groundedness, and completeness at the cost of latency and some variance. Combining the two gives the most comprehensive assessment.