
RAGAS Framework: Automated RAG Evaluation

RAGAS (Retrieval Augmented Generation Assessment) is an open-source framework for evaluating RAG pipelines. This guide covers how to set up the library, run an evaluation, and implement the core metrics from scratch.

What is RAGAS?

RAGAS provides a set of metrics specifically designed for RAG evaluation:

from dataclasses import dataclass

@dataclass
class RAGASMetrics:
    """Core RAGAS metrics"""

    # Generation metrics
    faithfulness: float  # Is the answer grounded in context?
    answer_relevancy: float  # Does the answer address the question?

    # Retrieval metrics
    context_precision: float  # Are retrieved docs relevant?
    context_recall: float  # Are all needed docs retrieved?

    # Overall
    answer_correctness: float  # Is the answer factually correct?

Installing and Setting Up RAGAS

pip install ragas langchain openai

# Basic RAGAS setup
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)
from datasets import Dataset

# Prepare your evaluation data
eval_data = {
    "question": [
        "What is Azure OpenAI?",
        "How do you deploy a model in Azure ML?"
    ],
    "answer": [
        "Azure OpenAI is a cloud service that provides access to OpenAI's models.",
        "You can deploy models using managed online endpoints or batch endpoints."
    ],
    "contexts": [
        ["Azure OpenAI Service provides REST API access to OpenAI's models including GPT-4."],
        ["Azure ML supports managed online endpoints for real-time inference and batch endpoints for batch scoring."]
    ],
    "ground_truth": [
        "Azure OpenAI is Microsoft's cloud service providing access to OpenAI's language models.",
        "Models can be deployed via managed endpoints, Kubernetes, or batch endpoints."
    ]
}

# Convert to Dataset
dataset = Dataset.from_dict(eval_data)

# Run evaluation
result = evaluate(
    dataset,
    metrics=[
        faithfulness,
        answer_relevancy,
        context_precision,
        context_recall,
    ]
)

print(result)
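
The result object behaves like a dict of aggregate scores. Recent ragas releases also expose a to_pandas() helper for per-sample inspection; the sketch below assumes that method is available and that the column names mirror the dataset keys and metric names above (they can vary across versions).

# Per-sample scores as a DataFrame (assumes the installed ragas version
# provides result.to_pandas(); column names may differ across releases)
df = result.to_pandas()
print(df.head())

# Illustrative quality gate: surface questions with low faithfulness
print(df[df["faithfulness"] < 0.7])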

Implementing RAGAS Metrics from Scratch

import anthropic
import numpy as np
from typing import Dict, List

class RAGASEvaluator:
    """RAGAS-style evaluation using Claude"""

    def __init__(self):
        self.client = anthropic.Anthropic()

    def faithfulness(
        self,
        answer: str,
        contexts: List[str]
    ) -> float:
        """
        Faithfulness measures if the answer is grounded in the context.
        Steps:
        1. Extract claims from the answer
        2. Verify each claim against the context
        3. Return ratio of supported claims
        """

        # Step 1: Extract claims
        claims_prompt = f"""Extract all factual claims from this answer as a numbered list.
Each claim should be a single, verifiable statement.

Answer: {answer}

List each claim on a new line, numbered."""

        claims_response = self.client.messages.create(
            model="claude-3-sonnet-20240229",
            max_tokens=500,
            messages=[{"role": "user", "content": claims_prompt}]
        )

        claims_text = claims_response.content[0].text
        claims = [line.strip() for line in claims_text.split('\n')
                  if line.strip() and line.strip()[0].isdigit()]

        if not claims:
            return 1.0  # No claims to verify

        # Step 2: Verify each claim
        context_combined = "\n\n".join(contexts)
        supported = 0

        for claim in claims:
            verify_prompt = f"""Given the following context, determine if this claim is supported.

Context:
{context_combined}

Claim: {claim}

Is this claim supported by the context? Answer only 'yes' or 'no'."""

            verify_response = self.client.messages.create(
                model="claude-3-sonnet-20240229",
                max_tokens=10,
                messages=[{"role": "user", "content": verify_prompt}]
            )

            if 'yes' in verify_response.content[0].text.lower():
                supported += 1

        return supported / len(claims)

    def answer_relevancy(
        self,
        question: str,
        answer: str
    ) -> float:
        """
        Answer relevancy measures if the answer addresses the question.
        Uses reverse generation: generate questions from the answer
        and measure similarity to original question.
        """

        # Generate questions from the answer
        gen_prompt = f"""Based on this answer, generate 3 questions that this answer would address.

Answer: {answer}

List 3 questions, one per line."""

        gen_response = self.client.messages.create(
            model="claude-3-sonnet-20240229",
            max_tokens=300,
            messages=[{"role": "user", "content": gen_prompt}]
        )

        generated_questions = [
            line.strip() for line in gen_response.content[0].text.split('\n')
            if line.strip() and '?' in line
        ]

        if not generated_questions:
            return 0.5

        # Measure similarity of generated questions to original
        similarities = []
        for gen_q in generated_questions:
            sim_prompt = f"""Rate the semantic similarity between these two questions.
0 = completely different topics
1 = identical meaning

Question 1: {question}
Question 2: {gen_q}

Respond with only a number between 0 and 1."""

            sim_response = self.client.messages.create(
                model="claude-3-sonnet-20240229",
                max_tokens=10,
                messages=[{"role": "user", "content": sim_prompt}]
            )

            try:
                similarity = float(sim_response.content[0].text.strip())
                similarities.append(similarity)
            except ValueError:
                # Fall back to a neutral score if the reply isn't a plain number
                similarities.append(0.5)

        return np.mean(similarities)

    def context_precision(
        self,
        question: str,
        contexts: List[str],
        ground_truth: str
    ) -> float:
        """
        Context precision measures if retrieved contexts are relevant.
        For each context, check if it's needed to answer the question.
        """

        if not contexts:
            return 0.0

        relevant_count = 0

        for context in contexts:
            relevance_prompt = f"""Determine if this context is relevant and useful for answering the question.

Question: {question}

Context: {context}

Is this context relevant for answering the question? Answer 'yes' or 'no'."""

            response = self.client.messages.create(
                model="claude-3-sonnet-20240229",
                max_tokens=10,
                messages=[{"role": "user", "content": relevance_prompt}]
            )

            if 'yes' in response.content[0].text.lower():
                relevant_count += 1

        return relevant_count / len(contexts)

    def context_recall(
        self,
        ground_truth: str,
        contexts: List[str]
    ) -> float:
        """
        Context recall measures if all information needed for ground truth
        is present in the retrieved contexts.
        """

        # Extract statements from ground truth
        extract_prompt = f"""Extract key factual statements from this answer.
List each statement on a new line.

Answer: {ground_truth}"""

        extract_response = self.client.messages.create(
            model="claude-3-sonnet-20240229",
            max_tokens=300,
            messages=[{"role": "user", "content": extract_prompt}]
        )

        statements = [
            line.strip() for line in extract_response.content[0].text.split('\n')
            if line.strip()
        ]

        if not statements:
            return 1.0

        context_combined = "\n\n".join(contexts)
        attributed = 0

        for statement in statements:
            check_prompt = f"""Can this statement be attributed to the given context?

Context:
{context_combined}

Statement: {statement}

Can this be attributed to the context? Answer 'yes' or 'no'."""

            check_response = self.client.messages.create(
                model="claude-3-sonnet-20240229",
                max_tokens=10,
                messages=[{"role": "user", "content": check_prompt}]
            )

            if 'yes' in check_response.content[0].text.lower():
                attributed += 1

        return attributed / len(statements)

    def evaluate(
        self,
        questions: List[str],
        answers: List[str],
        contexts: List[List[str]],
        ground_truths: List[str]
    ) -> Dict[str, float]:
        """Run full RAGAS evaluation"""

        faithfulness_scores = []
        relevancy_scores = []
        precision_scores = []
        recall_scores = []

        for q, a, c, gt in zip(questions, answers, contexts, ground_truths):
            faithfulness_scores.append(self.faithfulness(a, c))
            relevancy_scores.append(self.answer_relevancy(q, a))
            precision_scores.append(self.context_precision(q, c, gt))
            recall_scores.append(self.context_recall(gt, c))

        return {
            "faithfulness": np.mean(faithfulness_scores),
            "answer_relevancy": np.mean(relevancy_scores),
            "context_precision": np.mean(precision_scores),
            "context_recall": np.mean(recall_scores)
        }

Running RAGAS Evaluation

# Example usage
evaluator = RAGASEvaluator()

# Your RAG system outputs
questions = [
    "What is Azure Machine Learning?",
    "How do you create a Kubernetes cluster in Azure?"
]

answers = [
    "Azure Machine Learning is a cloud-based platform for building and deploying ML models.",
    "You can create an AKS cluster using Azure CLI, Portal, or ARM templates."
]

contexts = [
    ["Azure ML is Microsoft's cloud service for machine learning workflows."],
    ["Azure Kubernetes Service (AKS) can be deployed via CLI: az aks create..."]
]

ground_truths = [
    "Azure ML is a cloud platform for training, deploying, and managing ML models.",
    "AKS clusters can be created through Azure Portal, CLI, PowerShell, or ARM templates."
]

# Run evaluation
results = evaluator.evaluate(questions, answers, contexts, ground_truths)

print("RAGAS Evaluation Results:")
for metric, score in results.items():
    print(f"  {metric}: {score:.3f}")

Best Practices

  1. Use diverse test cases: Cover different question types and complexity
  2. Include edge cases: Test with no context, wrong context, etc.
  3. Version your evaluations: Track metrics over time so regressions are visible (see the sketch after this list)
  4. Combine with human evaluation: RAGAS supplements but doesn’t replace human judgment
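
To make point 3 concrete, here is a minimal sketch of one way to version evaluation runs. The helper name, file layout, and fields are illustrative assumptions, not part of RAGAS.

import json
from datetime import datetime, timezone
from pathlib import Path

def log_evaluation_run(results: dict, run_name: str, log_path: str = "ragas_history.jsonl") -> None:
    """Append one evaluation run to a JSONL history file (hypothetical helper)."""
    record = {
        "run_name": run_name,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        # Cast numpy floats so the record is JSON-serializable
        "metrics": {k: float(v) for k, v in results.items()},
    }
    with Path(log_path).open("a") as f:
        f.write(json.dumps(record) + "\n")

# Log the results dict produced by RAGASEvaluator.evaluate above
log_evaluation_run(results, run_name="baseline-retriever-v1")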

Conclusion

RAGAS provides a standardized approach to RAG evaluation. Whether you use the library directly or implement the metrics yourself, consistent evaluation is key to improving RAG systems.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.