Reranking Strategies: Improving RAG Precision

Reranking is often the difference between mediocre and excellent RAG results. Here’s how to implement it.

Reranking Implementation

import json
import os

from openai import AsyncAzureOpenAI
from sentence_transformers import CrossEncoder

class RerankerPipeline:
    def __init__(self, openai_client: AsyncAzureOpenAI):
        self.openai = openai_client
        # The cross-encoder scores each (query, document) pair jointly, which is
        # more precise than bi-encoder similarity but slower per document.
        self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')

    def cross_encoder_rerank(self, query: str, documents: list[str], top_k: int = 5) -> list:
        """Rerank using cross-encoder model."""
        pairs = [[query, doc] for doc in documents]
        scores = self.cross_encoder.predict(pairs)

        # Sort by score
        ranked = sorted(zip(documents, scores), key=lambda x: x[1], reverse=True)
        return [doc for doc, score in ranked[:top_k]]

    async def llm_rerank(self, query: str, documents: list[str], top_k: int = 5) -> list:
        """Rerank using LLM for complex queries."""
        docs_text = "\n\n".join([f"[{i}] {doc}" for i, doc in enumerate(documents)])

        response = await self.openai.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "system",
                "content": """Rank these documents by relevance to the query.
                Return a JSON array of document indices in order of relevance.
                Consider semantic relevance, not just keyword matching."""
            }, {
                "role": "user",
                "content": f"Query: {query}\n\nDocuments:\n{docs_text}"
            }],
            response_format={"type": "json_object"}
        )

        ranking = json.loads(response.choices[0].message.content)["ranking"]
        return [documents[i] for i in ranking[:top_k]]

    def cohere_rerank(self, query: str, documents: list[str], top_k: int = 5) -> list:
        """Use Cohere's reranking API (synchronous client)."""
        import cohere
        co = cohere.Client(api_key=os.environ["COHERE_API_KEY"])

        results = co.rerank(
            query=query,
            documents=documents,
            top_n=top_k,
            model="rerank-english-v3.0"
        )
        return [documents[r.index] for r in results.results]

    def reciprocal_rank_fusion(self, rankings: list[list], k: int = 60) -> list:
        """Combine multiple rankings using RRF."""
        # Each document scores 1 / (k + rank) in every ranking it appears in;
        # k=60 is the constant from the original RRF paper and damps the
        # advantage of top-ranked positions.
        scores = {}
        for ranking in rankings:
            for rank, doc in enumerate(ranking):
                if doc not in scores:
                    scores[doc] = 0
                scores[doc] += 1 / (k + rank + 1)

        return sorted(scores.keys(), key=lambda x: scores[x], reverse=True)
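
To tie the pieces together, here is a minimal usage sketch. It assumes the class above is in scope, the Azure OpenAI endpoint, key, and API version shown are placeholders for your own settings, and retrieved_docs holds candidates from your first-stage retriever; the query and documents are purely illustrative.

import asyncio

from openai import AsyncAzureOpenAI

async def main():
    # Placeholder credentials -- replace with your own Azure OpenAI settings.
    client = AsyncAzureOpenAI(
        azure_endpoint="https://<your-resource>.openai.azure.com",
        api_key="<your-key>",
        api_version="2024-06-01",
    )
    pipeline = RerankerPipeline(client)

    query = "How does reciprocal rank fusion combine rankings?"
    retrieved_docs = [
        "RRF sums 1/(k + rank) across rankers for each document.",
        "Cross-encoders score query-document pairs jointly.",
        "Vector search retrieves candidates by embedding similarity.",
    ]

    # Fast path: cross-encoder reranking only.
    cross_ranked = pipeline.cross_encoder_rerank(query, retrieved_docs, top_k=3)

    # Slower, higher-quality path: LLM reranking, then fuse the two orderings.
    llm_ranked = await pipeline.llm_rerank(query, retrieved_docs, top_k=3)
    fused = pipeline.reciprocal_rank_fusion([cross_ranked, llm_ranked])
    print(fused[:2])

asyncio.run(main())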

In practice, cross-encoder reranking typically improves RAG precision by 10-20% over embedding-similarity retrieval alone.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.