
Re-ranking Search Results for Better RAG Performance

Initial retrieval is fast but imprecise. Re-ranking takes the top results and uses more sophisticated methods to improve ordering. This two-stage approach balances speed and quality for production RAG systems.

Why Re-ranking?

# Stage 1: Fast retrieval (100ms)
# - Retrieve top 50-100 candidates
# - Uses efficient vector/keyword search
# - Good recall but lower precision

# Stage 2: Re-ranking (500ms)
# - Re-score top 10-20 candidates
# - Uses expensive but accurate models
# - High precision for final results

# Result: Best of both worlds
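
Putting the two stages together looks roughly like this. A minimal sketch; the retriever object and its search method are placeholders for whatever first-stage search you already have:

from typing import Dict, List

def two_stage_search(
    query: str,
    retriever,
    reranker,
    candidates: int = 50,
    final_k: int = 10
) -> List[Dict]:
    """Fast first-stage retrieval followed by precise re-ranking."""
    # Stage 1: cast a wide net with cheap vector/keyword search
    initial = retriever.search(query, top_k=candidates)
    # Stage 2: re-order the shortlist with a more expensive model
    return reranker.rerank(query, initial, top_k=final_k)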

Cross-Encoder Re-ranking

Cross-encoders directly score query-document pairs:

from sentence_transformers import CrossEncoder
from typing import List, Dict, Tuple

class CrossEncoderReranker:
    """Re-rank using a cross-encoder model."""

    def __init__(self, model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"):
        self.model = CrossEncoder(model_name)

    def rerank(
        self,
        query: str,
        documents: List[Dict],
        text_field: str = "content",
        top_k: int = 10
    ) -> List[Dict]:
        """Re-rank documents using cross-encoder."""
        if not documents:
            return []

        # Create query-document pairs
        pairs = [(query, doc[text_field]) for doc in documents]

        # Score all pairs
        scores = self.model.predict(pairs)

        # Combine with documents
        scored_docs = list(zip(documents, scores))
        scored_docs.sort(key=lambda x: x[1], reverse=True)

        # Return top k with scores
        return [
            {**doc, "rerank_score": float(score)}
            for doc, score in scored_docs[:top_k]
        ]

# Usage
reranker = CrossEncoderReranker()
initial_results = retriever.search("Azure Functions scaling", top_k=50)
reranked = reranker.rerank("Azure Functions scaling", initial_results, top_k=10)

LLM-Based Re-ranking

Use an LLM to judge relevance:

from openai import AzureOpenAI
from typing import List, Dict

class LLMReranker:
    """Re-rank using LLM relevance judgments."""

    def __init__(self, deployment: str = "gpt-35-turbo"):
        # The v1 OpenAI SDK replaces openai.ChatCompletion; the Azure client
        # reads AZURE_OPENAI_API_KEY and AZURE_OPENAI_ENDPOINT from the environment
        self.client = AzureOpenAI(api_version="2024-02-01")
        self.deployment = deployment

    def score_document(
        self,
        query: str,
        document: str
    ) -> float:
        """Score a single document's relevance."""
        prompt = f"""Rate the relevance of this document to the query on a scale of 0-10.

Query: {query}
Document: {document[:1000]}

Return only a number from 0-10:"""

        response = self.client.chat.completions.create(
            model=self.deployment,
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=5
        )

        try:
            score = float(response.choices[0].message.content.strip())
            return min(max(score, 0), 10) / 10  # Normalize to 0-1
        except ValueError:
            # The model returned something unparseable; fall back to neutral
            return 0.5

    def batch_rerank(
        self,
        query: str,
        documents: List[Dict],
        text_field: str = "content",
        top_k: int = 10
    ) -> List[Dict]:
        """Re-rank documents with batch LLM call."""
        if not documents:
            return []

        # Create numbered document list
        doc_list = "\n".join([
            f"{i+1}. {doc[text_field][:200]}..."
            for i, doc in enumerate(documents[:20])  # Limit to 20
        ])

        prompt = f"""Given the query: "{query}"

Rank these documents by relevance (most relevant first).
Return only the document numbers separated by commas.

Documents:
{doc_list}

Ranking (e.g., "3,1,5,2,4"):"""

        response = self.client.chat.completions.create(
            model=self.deployment,
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=100
        )

        # Parse ranking
        try:
            ranking_str = response.choices[0].message.content.strip()
            ranking = [int(x.strip()) - 1 for x in ranking_str.split(",")]

            reranked = []
            for rank, idx in enumerate(ranking):
                if 0 <= idx < len(documents):
                    doc = documents[idx].copy()
                    doc["rerank_position"] = rank + 1
                    reranked.append(doc)

            return reranked[:top_k]
        except ValueError:
            # Fall back to the original ordering if the ranking can't be parsed
            return documents[:top_k]

    def pairwise_rerank(
        self,
        query: str,
        documents: List[Dict],
        text_field: str = "content"
    ) -> List[Dict]:
        """Re-rank using pairwise comparisons."""
        # Use tournament-style comparison
        n = len(documents)
        wins = [0] * n

        for i in range(n):
            for j in range(i + 1, n):
                winner = self._compare_pair(
                    query,
                    documents[i][text_field],
                    documents[j][text_field]
                )
                if winner == 1:
                    wins[i] += 1
                else:
                    wins[j] += 1

        # Sort by wins
        indexed = list(enumerate(documents))
        indexed.sort(key=lambda x: wins[x[0]], reverse=True)

        return [doc for _, doc in indexed]

    def _compare_pair(
        self,
        query: str,
        doc1: str,
        doc2: str
    ) -> int:
        """Compare two documents, return 1 or 2 for winner."""
        prompt = f"""Query: {query}

Document A: {doc1[:500]}
Document B: {doc2[:500]}

Which document is more relevant to the query? Reply with only 'A' or 'B':"""

        response = self.client.chat.completions.create(
            model=self.deployment,
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=5
        )

        answer = response.choices[0].message.content.strip().upper()
        return 1 if answer.startswith("A") else 2
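
A quick usage sketch, reusing the initial_results from the cross-encoder example above:

llm_reranker = LLMReranker(deployment="gpt-35-turbo")
top_docs = llm_reranker.batch_rerank("Azure Functions scaling", initial_results, top_k=5)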

Cohere Re-ranking

Using Cohere’s specialized rerank API:

import cohere
from typing import List, Dict

class CohereReranker:
    """Re-rank using Cohere's rerank API."""

    def __init__(self, api_key: str, model: str = "rerank-english-v2.0"):
        self.client = cohere.Client(api_key)
        self.model = model

    def rerank(
        self,
        query: str,
        documents: List[Dict],
        text_field: str = "content",
        top_k: int = 10
    ) -> List[Dict]:
        """Re-rank using Cohere."""
        if not documents:
            return []

        texts = [doc[text_field] for doc in documents]

        response = self.client.rerank(
            query=query,
            documents=texts,
            model=self.model,
            top_n=top_k
        )

        reranked = []
        for result in response.results:
            doc = documents[result.index].copy()
            doc["rerank_score"] = result.relevance_score
            reranked.append(doc)

        return reranked
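
Usage follows the same pattern; the API key below is a placeholder:

cohere_reranker = CohereReranker(api_key="YOUR_COHERE_API_KEY")
reranked = cohere_reranker.rerank("Azure Functions scaling", initial_results, top_k=10)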

Multi-Stage Re-ranking Pipeline

class MultiStageReranker:
    """Multi-stage re-ranking pipeline."""

    def __init__(
        self,
        cross_encoder_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2",
        llm_deployment: str = "gpt-35-turbo"
    ):
        self.cross_encoder = CrossEncoderReranker(cross_encoder_model)
        self.llm_reranker = LLMReranker(llm_deployment)

    def rerank(
        self,
        query: str,
        documents: List[Dict],
        text_field: str = "content",
        final_k: int = 5
    ) -> List[Dict]:
        """Multi-stage re-ranking."""
        if len(documents) <= final_k:
            return documents

        # Stage 1: Cross-encoder (fast, handles many docs)
        stage1_k = min(20, len(documents))
        stage1_results = self.cross_encoder.rerank(
            query, documents, text_field, top_k=stage1_k
        )

        # Stage 2: LLM (slower, more accurate, fewer docs)
        final_results = self.llm_reranker.batch_rerank(
            query, stage1_results, text_field, top_k=final_k
        )

        return final_results
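
Wiring it up, again with the earlier retrieval results:

pipeline = MultiStageReranker()
final_docs = pipeline.rerank("Azure Functions scaling", initial_results, final_k=5)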

Combining Initial Scores with Re-rank Scores

class ScoreFusionReranker:
    """Combine initial and re-rank scores."""

    def __init__(
        self,
        reranker,
        initial_weight: float = 0.3,
        rerank_weight: float = 0.7
    ):
        self.reranker = reranker
        self.initial_weight = initial_weight
        self.rerank_weight = rerank_weight

    def rerank(
        self,
        query: str,
        documents: List[Dict],
        initial_score_field: str = "score",
        text_field: str = "content",
        top_k: int = 10
    ) -> List[Dict]:
        """Re-rank and combine scores."""
        # Get re-rank scores
        reranked = self.reranker.rerank(query, documents, text_field)

        # Normalize initial scores
        initial_scores = [doc.get(initial_score_field, 0) for doc in documents]
        max_initial = max(initial_scores) if initial_scores else 1
        min_initial = min(initial_scores) if initial_scores else 0
        range_initial = max_initial - min_initial or 1

        # Normalize re-rank scores
        rerank_scores = [doc.get("rerank_score", 0) for doc in reranked]
        max_rerank = max(rerank_scores) if rerank_scores else 1
        min_rerank = min(rerank_scores) if rerank_scores else 0
        range_rerank = max_rerank - min_rerank or 1

        # Combine scores
        for doc in reranked:
            norm_initial = (doc.get(initial_score_field, 0) - min_initial) / range_initial
            norm_rerank = (doc.get("rerank_score", 0) - min_rerank) / range_rerank

            doc["combined_score"] = (
                self.initial_weight * norm_initial +
                self.rerank_weight * norm_rerank
            )

        # Sort by combined score
        reranked.sort(key=lambda x: x.get("combined_score", 0), reverse=True)
        return reranked[:top_k]
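
For example, fusing cross-encoder scores with the retriever's original scores:

fusion = ScoreFusionReranker(
    CrossEncoderReranker(),
    initial_weight=0.3,
    rerank_weight=0.7
)
results = fusion.rerank("Azure Functions scaling", initial_results, top_k=10)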

Best Practices

  1. Retrieve more, re-rank fewer: Get top 50-100, re-rank top 10-20
  2. Use specialized models: Cross-encoders outperform bi-encoders for re-ranking
  3. Consider latency: LLM re-ranking adds 500ms+ per call
  4. Cache when possible: Same query-doc pairs get same scores (see the caching sketch after this list)
  5. Combine scores carefully: Initial scores still provide signal
  6. Test on your data: Re-ranking effectiveness varies by domain
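
A minimal caching sketch for point 4, assuming scores are deterministic for a given model (this wrapper is illustrative, not part of any library):

from functools import lru_cache

class CachedCrossEncoderReranker(CrossEncoderReranker):
    """Cache per-pair scores so repeated query-document pairs cost nothing."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Bounded cache keyed on (query, document) text
        self._score_pair = lru_cache(maxsize=10_000)(self._score_pair_uncached)

    def _score_pair_uncached(self, query: str, text: str) -> float:
        return float(self.model.predict([(query, text)])[0])

    def rerank(self, query, documents, text_field="content", top_k=10):
        scored = [
            {**doc, "rerank_score": self._score_pair(query, doc[text_field])}
            for doc in documents
        ]
        scored.sort(key=lambda d: d["rerank_score"], reverse=True)
        return scored[:top_k]

Note that per-pair caching gives up batched inference; if cache hits are rare, the batched rerank shown earlier will be faster.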

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.