Hybrid Search: Combining Vector and Keyword Search
Neither vector search nor keyword search is perfect on its own. Vector search understands meaning but can miss exact matches. Keyword search finds exact terms but misses synonyms. Hybrid search combines both for better results.
Why Hybrid Search?
# Vector search weakness: Exact terms
query = "error code 0x80070005"
# Vector search might return general "error handling" docs
# instead of specific error code documentation
# Keyword search weakness: Synonyms
query = "fix computer freezing"
# Keyword search misses "resolve system hang" or "troubleshoot PC lock up"
# Hybrid combines strengths of both
Basic Hybrid Search Implementation
from typing import Dict, List, Optional
import numpy as np
from rank_bm25 import BM25Okapi
class HybridSearchEngine:
"""Combine vector and keyword search."""
def __init__(
self,
embedding_model,
vector_weight: float = 0.5
):
self.embedding_model = embedding_model
self.vector_weight = vector_weight
self.keyword_weight = 1 - vector_weight
self.documents: List[Dict] = []
self.embeddings: List[List[float]] = []
self.bm25 = None
def _tokenize(self, text: str) -> List[str]:
"""Simple tokenization."""
return text.lower().split()
def add_documents(self, documents: List[Dict], text_field: str = "content"):
"""Add documents to both indexes."""
self.documents = documents
# Build vector index
texts = [doc[text_field] for doc in documents]
self.embeddings = [
self.embedding_model.embed(text)
for text in texts
]
# Build BM25 index
tokenized = [self._tokenize(text) for text in texts]
self.bm25 = BM25Okapi(tokenized)
def _vector_search(self, query: str, top_k: int) -> List[tuple]:
"""Perform vector similarity search."""
query_embedding = self.embedding_model.embed(query)
scores = []
for i, doc_emb in enumerate(self.embeddings):
similarity = np.dot(query_embedding, doc_emb) / (
np.linalg.norm(query_embedding) * np.linalg.norm(doc_emb)
)
scores.append((i, similarity))
scores.sort(key=lambda x: x[1], reverse=True)
return scores[:top_k]
def _keyword_search(self, query: str, top_k: int) -> List[tuple]:
"""Perform BM25 keyword search."""
tokenized_query = self._tokenize(query)
scores = self.bm25.get_scores(tokenized_query)
indexed_scores = [(i, score) for i, score in enumerate(scores)]
indexed_scores.sort(key=lambda x: x[1], reverse=True)
return indexed_scores[:top_k]
def _normalize_scores(self, scores: List[tuple]) -> Dict[int, float]:
"""Normalize scores to 0-1 range."""
if not scores:
return {}
max_score = max(s for _, s in scores)
min_score = min(s for _, s in scores)
range_score = max_score - min_score
if range_score == 0:
return {idx: 1.0 for idx, _ in scores}
return {
idx: (score - min_score) / range_score
for idx, score in scores
}
def search(
self,
query: str,
top_k: int = 10,
        vector_weight: Optional[float] = None
) -> List[Dict]:
"""Perform hybrid search."""
v_weight = vector_weight if vector_weight is not None else self.vector_weight
k_weight = 1 - v_weight
# Get results from both methods
vector_results = self._vector_search(query, top_k * 2)
keyword_results = self._keyword_search(query, top_k * 2)
# Normalize scores
vector_scores = self._normalize_scores(vector_results)
keyword_scores = self._normalize_scores(keyword_results)
# Combine scores
combined_scores = {}
all_indices = set(vector_scores.keys()) | set(keyword_scores.keys())
for idx in all_indices:
v_score = vector_scores.get(idx, 0)
k_score = keyword_scores.get(idx, 0)
combined_scores[idx] = v_weight * v_score + k_weight * k_score
# Sort and return
sorted_results = sorted(
combined_scores.items(),
key=lambda x: x[1],
reverse=True
)
return [
{
**self.documents[idx],
"score": score,
"vector_score": vector_scores.get(idx, 0),
"keyword_score": keyword_scores.get(idx, 0)
}
for idx, score in sorted_results[:top_k]
]
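To see the engine end to end, here is a minimal usage sketch. The FakeEmbedder below is a hypothetical stand-in (hash-based pseudo-vectors, not a real model) so the snippet runs without any external service; in practice you would pass your actual embedding client.
import hashlib

class FakeEmbedder:
    """Hypothetical demo embedder: deterministic pseudo-vectors, no real semantics."""
    def embed(self, text: str) -> List[float]:
        digest = hashlib.sha256(text.encode()).digest()
        return [b / 255 for b in digest]  # 32-dim pseudo-embedding

engine = HybridSearchEngine(FakeEmbedder(), vector_weight=0.5)
engine.add_documents([
    {"id": "1", "content": "How to resolve a system hang on Windows"},
    {"id": "2", "content": "Understanding error code 0x80070005"},
])
for hit in engine.search("fix computer freezing", top_k=2):
    print(hit["id"], round(hit["score"], 3))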
Reciprocal Rank Fusion (RRF)
A more robust way to combine rankings is Reciprocal Rank Fusion: each document receives 1/(k + rank) from every result list it appears in, and these contributions are summed. Because RRF looks only at ranks, it is insensitive to how each engine scales its raw scores:
class RRFHybridSearch(HybridSearchEngine):
    """Hybrid search using Reciprocal Rank Fusion."""
    def __init__(self, embedding_model, k: int = 60):
        # Reuses indexing and the _vector_search/_keyword_search helpers
        # defined on HybridSearchEngine
        super().__init__(embedding_model)
        self.k = k  # RRF constant, typically 60
def _rrf_score(self, rank: int) -> float:
"""Calculate RRF score for a rank."""
return 1 / (self.k + rank)
def search(self, query: str, top_k: int = 10) -> List[Dict]:
"""Search using RRF to combine results."""
# Get rankings from both methods
vector_results = self._vector_search(query, top_k * 2)
keyword_results = self._keyword_search(query, top_k * 2)
# Calculate RRF scores
rrf_scores = {}
for rank, (idx, _) in enumerate(vector_results, 1):
rrf_scores[idx] = rrf_scores.get(idx, 0) + self._rrf_score(rank)
for rank, (idx, _) in enumerate(keyword_results, 1):
rrf_scores[idx] = rrf_scores.get(idx, 0) + self._rrf_score(rank)
# Sort by combined RRF score
sorted_results = sorted(
rrf_scores.items(),
key=lambda x: x[1],
reverse=True
)
return [
{**self.documents[idx], "rrf_score": score}
for idx, score in sorted_results[:top_k]
]
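For intuition on why rank fusion is robust: with k = 60, a document ranked 1st by vector search and 3rd by keyword search scores 1/61 + 1/63 ≈ 0.0323, while a document ranked 2nd in only one list scores 1/62 ≈ 0.0161, so agreement between the two rankers wins without any score normalization. Because the class inherits from HybridSearchEngine, usage is the same (continuing the earlier sketch):
rrf = RRFHybridSearch(FakeEmbedder(), k=60)
rrf.add_documents(engine.documents)  # reuse the documents indexed earlier
for hit in rrf.search("fix computer freezing", top_k=2):
    print(hit["id"], round(hit["rrf_score"], 4))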
Dynamic Weight Adjustment
Adjust weights based on query characteristics:
class AdaptiveHybridSearch(HybridSearchEngine):
    """Hybrid search with adaptive weighting, built on HybridSearchEngine."""
def _analyze_query(self, query: str) -> Dict:
"""Analyze query to determine optimal weights."""
analysis = {
"has_quotes": '"' in query,
"has_special_terms": any(c in query for c in ['#', '@', '/', '\\']),
"is_question": query.strip().endswith('?'),
"word_count": len(query.split()),
"has_numbers": any(c.isdigit() for c in query)
}
return analysis
def _determine_weights(self, query: str) -> tuple:
"""Determine vector/keyword weights based on query."""
analysis = self._analyze_query(query)
# Start with balanced weights
vector_weight = 0.5
keyword_weight = 0.5
# Exact phrase search - boost keyword
if analysis["has_quotes"]:
keyword_weight += 0.3
vector_weight -= 0.3
# Technical terms, codes - boost keyword
if analysis["has_special_terms"] or analysis["has_numbers"]:
keyword_weight += 0.2
vector_weight -= 0.2
# Natural language questions - boost vector
if analysis["is_question"] and analysis["word_count"] > 5:
vector_weight += 0.2
keyword_weight -= 0.2
# Normalize
total = vector_weight + keyword_weight
return vector_weight / total, keyword_weight / total
    def search(self, query: str, top_k: int = 10) -> List[Dict]:
        """Search with adaptive weighting."""
        v_weight, k_weight = self._determine_weights(query)
        # Log weights for debugging
        print(f"Query: '{query}' - Vector: {v_weight:.2f}, Keyword: {k_weight:.2f}")
        # Delegate to the parent's weighted hybrid search; it derives the
        # keyword weight as 1 - vector_weight, which matches k_weight here
        return super().search(query, top_k=top_k, vector_weight=v_weight)
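The heuristic weights are easy to sanity-check by hand; the numbers below follow directly from _determine_weights (the queries are made up for illustration):
adaptive = AdaptiveHybridSearch(FakeEmbedder())
print(adaptive._determine_weights('"access denied" 0x80070005'))
# ≈ (0.0, 1.0): quotes plus digits push all weight to keyword search
print(adaptive._determine_weights("why does my laptop keep freezing at night?"))
# ≈ (0.7, 0.3): a long natural-language question favors vector search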
Integration with Azure AI Search (formerly Azure Cognitive Search)
from typing import Dict, List, Optional
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizedQuery
class AzureHybridSearch:
    """Hybrid search using Azure AI Search (formerly Azure Cognitive Search)."""
def __init__(
self,
endpoint: str,
key: str,
index_name: str,
embedding_model
):
self.search_client = SearchClient(
endpoint=endpoint,
index_name=index_name,
credential=AzureKeyCredential(key)
)
self.embedding_model = embedding_model
def search(
self,
query: str,
top_k: int = 10,
vector_fields: str = "contentVector",
        filter_expr: Optional[str] = None
) -> List[Dict]:
"""Perform hybrid search."""
query_embedding = self.embedding_model.embed(query)
        # VectorizedQuery is the stable API in azure-search-documents >= 11.4;
        # earlier previews used a Vector class with a `vectors=` parameter
        vector_query = VectorizedQuery(
            vector=query_embedding,
            k_nearest_neighbors=top_k,
            fields=vector_fields
        )
        results = self.search_client.search(
            search_text=query,               # keyword (BM25) component
            vector_queries=[vector_query],   # vector component
filter=filter_expr,
top=top_k,
select=["id", "title", "content", "category"]
)
return [dict(r) for r in results]
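Wiring it up might look like the sketch below. The endpoint, key, and index name are placeholders, and it assumes azure-search-documents >= 11.4 with an index that has a contentVector field; the filter uses Azure's OData syntax.
searcher = AzureHybridSearch(
    endpoint="https://<your-service>.search.windows.net",  # placeholder
    key="<query-key>",                                     # placeholder
    index_name="docs-index",                               # hypothetical index
    embedding_model=FakeEmbedder()  # swap in your real embedding model
)
hits = searcher.search(
    "fix computer freezing",
    top_k=5,
    filter_expr="category eq 'troubleshooting'"  # OData filter
)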
Evaluation
class HybridSearchEvaluator:
"""Evaluate hybrid search performance."""
def evaluate(
self,
search_engine,
test_queries: List[str],
relevance_judgments: Dict[str, List[str]], # query -> list of relevant doc ids
        weight_configs: tuple = (0.0, 0.25, 0.5, 0.75, 1.0)  # immutable default
) -> Dict:
"""Evaluate different weight configurations."""
results = {}
for vector_weight in weight_configs:
mrr_sum = 0
ndcg_sum = 0
for query in test_queries:
relevant_ids = set(relevance_judgments.get(query, []))
search_results = search_engine.search(
query,
vector_weight=vector_weight
)
# Calculate MRR
for rank, result in enumerate(search_results, 1):
if result["id"] in relevant_ids:
mrr_sum += 1 / rank
break
# Calculate NDCG@10
dcg = sum(
1 / np.log2(rank + 1)
for rank, r in enumerate(search_results[:10], 1)
if r["id"] in relevant_ids
)
ideal_dcg = sum(
1 / np.log2(rank + 1)
for rank in range(1, min(len(relevant_ids), 10) + 1)
)
ndcg_sum += dcg / ideal_dcg if ideal_dcg > 0 else 0
n = len(test_queries)
results[vector_weight] = {
"mrr": mrr_sum / n,
"ndcg@10": ndcg_sum / n
}
return results
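Running the sweep looks like this, assuming a small hand-labeled test set (the queries and document ids are hypothetical, reusing the in-memory engine from earlier):
evaluator = HybridSearchEvaluator()
judgments = {
    "fix computer freezing": ["1"],
    "error code 0x80070005": ["2"],
}
report = evaluator.evaluate(engine, list(judgments), judgments)
for weight, metrics in report.items():
    print(f"vector_weight={weight}: "
          f"MRR={metrics['mrr']:.3f}, NDCG@10={metrics['ndcg@10']:.3f}")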
Best Practices
- Start balanced: 50/50 is often a good default
- Tune on your data: Optimal weights are domain-specific
- Consider query type: Adjust weights dynamically
- Use RRF for robustness: Less sensitive to score scaling
- Evaluate thoroughly: Test with realistic queries