Improved Vector Search in Azure AI Search: Deep Dive
Vector search is the foundation of modern AI applications. Azure AI Search has made significant improvements to its vector capabilities. Let’s dive deep into what’s changed and how to leverage these improvements.
Vector Search Fundamentals
Vector search finds similar items by comparing their mathematical representations (embeddings). The key components:
- Embeddings: Dense vectors representing semantic meaning
- Index structure: How vectors are organized for fast retrieval
- Distance metric: How similarity is calculated (see the sketch after this list)
- Approximate nearest neighbor (ANN): Trade-off between speed and accuracy
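To make the distance metric concrete, here is a minimal sketch of cosine similarity, the default choice for most embedding models (plain Python, no SDK required):

import math

def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Cosine similarity: 1.0 = same direction, 0.0 = orthogonal."""
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    return dot / (norm_a * norm_b)

print(cosine_similarity([0.2, 0.8, 0.1], [0.25, 0.75, 0.05]))  # ~0.995: very similar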
HNSW Algorithm Deep Dive
Azure AI Search uses HNSW (Hierarchical Navigable Small World) for approximate nearest neighbor search:
Level 2: [A]-------------[B]
          |               |
Level 1: [A]-----[C]-----[B]-----[D]
          |       |       |       |
Level 0: [A]-[E]-[C]-[F]-[B]-[G]-[D]-[H]
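Search enters at the sparse top layer, greedily hops toward the query's nearest neighbor, drops down a level, and repeats. Here is a toy, runnable sketch of that greedy descent over the exact graph in the diagram, using 1-D stand-in "embeddings" (purely illustrative, not the service's implementation):

# Toy layered adjacency matching the diagram above
layers = {
    2: {"A": ["B"], "B": ["A"]},
    1: {"A": ["C"], "C": ["A", "B"], "B": ["C", "D"], "D": ["B"]},
    0: {"A": ["E"], "E": ["A", "C"], "C": ["E", "F"], "F": ["C", "B"],
        "B": ["F", "G"], "G": ["B", "D"], "D": ["G", "H"], "H": ["D"]},
}
# 1-D coordinates standing in for embeddings, so distance is easy to see
coords = {"A": 0.0, "E": 1.0, "C": 2.0, "F": 3.0, "B": 4.0, "G": 5.0, "D": 6.0, "H": 7.0}

def greedy_hnsw_search(query: float, entry: str = "A") -> str:
    """Greedy descent: hop toward the query on each layer, then drop down."""
    current = entry
    for layer in (2, 1, 0):
        improved = True
        while improved:
            improved = False
            for neighbor in layers[layer][current]:
                if abs(coords[neighbor] - query) < abs(coords[current] - query):
                    current, improved = neighbor, True
    return current

print(greedy_hnsw_search(5.2))  # "G": the node closest to the query

The real algorithm keeps a beam of efSearch candidates at level 0 instead of a single greedy walker, which is exactly why raising ef_search improves recall.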
Key Parameters:
from azure.search.documents.indexes.models import (
    HnswAlgorithmConfiguration, HnswParameters
)

# Understanding each parameter
hnsw_config = HnswAlgorithmConfiguration(
    name="optimized",
    parameters=HnswParameters(
        # m: Number of bi-directional links per node
        # Higher = better recall, more memory, slower indexing
        # Range: 4-10, Default: 4
        m=6,
        # ef_construction: Size of the dynamic candidate list during index building
        # Higher = better index quality, slower indexing
        # Range: 100-1000, Default: 400
        ef_construction=600,
        # ef_search: Size of the dynamic candidate list during search
        # Higher = better recall, slower search
        # Range: 100-1000, Default: 500
        ef_search=500,
        # metric: Distance function
        # Options: cosine, euclidean, dotProduct
        metric="cosine"
    )
)
Choosing the Right Configuration
High Recall Configuration (when accuracy matters most):
high_recall = HnswAlgorithmConfiguration(
    name="high-recall",
    parameters=HnswParameters(
        m=8,
        ef_construction=800,
        ef_search=800,
        metric="cosine"
    )
)
# Use case: Legal document search, medical records
# Trade-off: 20-30% slower queries; graph memory roughly doubles (links scale with m)
Balanced Configuration (recommended for most use cases):
balanced = HnswAlgorithmConfiguration(
    name="balanced",
    parameters=HnswParameters(
        m=4,
        ef_construction=400,
        ef_search=500,
        metric="cosine"
    )
)
# Use case: General RAG applications, chatbots
High Speed Configuration (when latency is critical):
high_speed = HnswAlgorithmConfiguration(
    name="high-speed",
    parameters=HnswParameters(
        m=4,
        ef_construction=200,
        ef_search=200,
        metric="cosine"
    )
)
# Use case: Real-time autocomplete, high-volume APIs
# Trade-off: 5-10% lower recall
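One practical pattern (a sketch, not a service requirement; the profile names are illustrative) is to register all three configurations side by side, so different fields or experiments can point at different profiles without rewriting the index definition:

from azure.search.documents.indexes.models import VectorSearch, VectorSearchProfile

vector_search = VectorSearch(
    algorithms=[high_recall, balanced, high_speed],
    profiles=[
        VectorSearchProfile(name="recall-profile", algorithm_configuration_name="high-recall"),
        VectorSearchProfile(name="default-profile", algorithm_configuration_name="balanced"),
        VectorSearchProfile(name="speed-profile", algorithm_configuration_name="high-speed"),
    ],
)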
Multi-Vector Indexing
Index multiple embeddings per document for different search scenarios:
from azure.search.documents.indexes.models import (
    SearchIndex, SearchField, SearchFieldDataType,
    VectorSearch, VectorSearchProfile,
    HnswAlgorithmConfiguration, HnswParameters
)

index = SearchIndex(
    name="multi-vector-index",
    fields=[
        SearchField(name="id", type=SearchFieldDataType.String, key=True),
        SearchField(name="title", type=SearchFieldDataType.String),
        SearchField(name="content", type=SearchFieldDataType.String),
        # Title embedding - for title-focused searches
        SearchField(
            name="title_vector",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,  # vector fields must be searchable
            vector_search_dimensions=1536,
            vector_search_profile_name="title-profile"
        ),
        # Content embedding - for content searches
        SearchField(
            name="content_vector",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            vector_search_dimensions=1536,
            vector_search_profile_name="content-profile"
        ),
        # Question embedding - trained on questions about the content
        SearchField(
            name="question_vector",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            vector_search_dimensions=1536,
            vector_search_profile_name="question-profile"
        )
    ],
    vector_search=VectorSearch(
        algorithms=[
            HnswAlgorithmConfiguration(name="hnsw-high", parameters=HnswParameters(m=6, ef_search=600)),
            HnswAlgorithmConfiguration(name="hnsw-fast", parameters=HnswParameters(m=4, ef_search=300))
        ],
        profiles=[
            VectorSearchProfile(name="title-profile", algorithm_configuration_name="hnsw-fast"),
            VectorSearchProfile(name="content-profile", algorithm_configuration_name="hnsw-high"),
            VectorSearchProfile(name="question-profile", algorithm_configuration_name="hnsw-high")
        ]
    )
)
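Pushing this definition to the service is one call away. A minimal sketch, where the endpoint and key environment variable names are placeholders you would adapt:

import os
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient

index_client = SearchIndexClient(
    endpoint=os.environ["AZURE_SEARCH_ENDPOINT"],  # placeholder env var names
    credential=AzureKeyCredential(os.environ["AZURE_SEARCH_ADMIN_KEY"]),
)
index_client.create_or_update_index(index)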
Weighted Multi-Vector Search
from azure.search.documents.models import VectorizedQuery

def multi_vector_search(query: str, query_type: str = "general"):
    embedding = generate_embedding(query)
    # Weight vector fields based on query type
    if query_type == "question":
        weights = {"question": 0.6, "content": 0.3, "title": 0.1}
    elif query_type == "title_lookup":
        weights = {"title": 0.7, "content": 0.3, "question": 0.0}
    else:
        weights = {"content": 0.5, "question": 0.3, "title": 0.2}
    # One weighted query per field with a non-zero weight
    vector_queries = [
        VectorizedQuery(
            vector=embedding,
            k_nearest_neighbors=50,
            fields=f"{field}_vector",
            weight=weight
        )
        for field, weight in weights.items()
        if weight > 0
    ]
    results = client.search(
        search_text=None,
        vector_queries=vector_queries,
        top=10
    )
    return list(results)
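For example, assuming the client and embedding helper above are in scope:

docs = multi_vector_search("How do I rotate my storage keys?", query_type="question")
for doc in docs[:3]:
    print(doc["id"], round(doc["@search.score"], 4))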
Filtering with Vector Search
Combine filters with vector search efficiently:
# Pre-filtering (the default): the filter is applied before graph traversal,
# so all k results are guaranteed to satisfy it - better recall on the filtered set.
# Post-filtering: the filter is applied after ANN search - faster, but it can
# return fewer than k results when the filter is selective.
def filtered_vector_search(
    query_embedding: list[float],
    category: str | None = None,
    date_from: str | None = None,
    min_score: float | None = None
):
    # Build the OData filter expression
    filters = []
    if category:
        filters.append(f"category eq '{category}'")
    if date_from:
        filters.append(f"date ge {date_from}")
    filter_str = " and ".join(filters) if filters else None
    vector_query = VectorizedQuery(
        vector=query_embedding,
        k_nearest_neighbors=100,  # Over-fetch when filtering
        fields="content_vector"
    )
    results = client.search(
        search_text=None,
        vector_queries=[vector_query],
        filter=filter_str,
        top=20
    )
    # Post-filter by score if needed
    if min_score is not None:
        return [r for r in results if r["@search.score"] >= min_score]
    return list(results)
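The filter mode itself is controllable per query. A sketch of making the choice explicit with the SDK's VectorFilterMode (the filter expression here is illustrative):

from azure.search.documents.models import VectorFilterMode

results = client.search(
    search_text=None,
    vector_queries=[vector_query],
    filter="category eq 'documentation'",
    vector_filter_mode=VectorFilterMode.PRE_FILTER,  # default; POST_FILTER is faster but may return < k results
    top=20,
)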
Monitoring Vector Search Performance
import time
from dataclasses import dataclass

@dataclass
class SearchMetrics:
    query_time_ms: float
    result_count: int
    top_score: float
    avg_score: float
    filter_applied: bool

def measure_search(query_func, *args, **kwargs) -> tuple[list, SearchMetrics]:
    """Measure search performance."""
    start = time.perf_counter()
    results = list(query_func(*args, **kwargs))
    elapsed = (time.perf_counter() - start) * 1000
    scores = [r.get("@search.score", 0) for r in results]
    metrics = SearchMetrics(
        query_time_ms=elapsed,
        result_count=len(results),
        top_score=max(scores) if scores else 0,
        avg_score=sum(scores) / len(scores) if scores else 0,
        filter_applied="filter" in kwargs
    )
    return results, metrics

# Usage
results, metrics = measure_search(
    client.search,
    search_text=None,
    vector_queries=[vector_query],
    top=10
)
print(f"Query time: {metrics.query_time_ms:.2f}ms")
print(f"Results: {metrics.result_count}")
print(f"Top score: {metrics.top_score:.4f}")
Optimizing Index Performance
Batch Indexing:
from azure.search.documents import SearchClient
from concurrent.futures import ThreadPoolExecutor

def batch_index_documents(documents: list[dict], batch_size: int = 1000):
    """Index documents in parallel batches.

    The service caps each request at 1,000 documents; with large vector
    payloads, a smaller batch_size helps stay under request size limits.
    """
    def index_batch(batch):
        return client.upload_documents(documents=batch)

    # Split into batches
    batches = [
        documents[i:i + batch_size]
        for i in range(0, len(documents), batch_size)
    ]
    # Index in parallel
    with ThreadPoolExecutor(max_workers=4) as executor:
        batch_results = list(executor.map(index_batch, batches))
    # Summarize: upload_documents returns a list of per-document IndexingResults
    flat = [r for batch in batch_results for r in batch]
    succeeded = sum(1 for r in flat if r.succeeded)
    return {"succeeded": succeeded, "failed": len(flat) - succeeded}
Index Warm-up:
def warm_up_index(sample_queries: list[str]):
    """Warm up the index with representative queries."""
    for query in sample_queries:
        embedding = generate_embedding(query)
        vector_query = VectorizedQuery(
            vector=embedding,
            k_nearest_neighbors=10,
            fields="content_vector"
        )
        # Run the query and drain the results; only the side effect matters
        list(client.search(search_text=None, vector_queries=[vector_query], top=1))
    print(f"Warmed up with {len(sample_queries)} queries")
Best Practices
- Choose embedding model carefully: Dimensions affect storage and speed (see the back-of-envelope after this list)
- Tune HNSW for your workload: Test different configurations
- Use multi-vector for different search types: Title, content, questions
- Over-fetch when filtering: Request more than you need
- Monitor performance: Track latency and recall
- Batch index updates: Don’t index one document at a time
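On the first point, a quick back-of-envelope for raw vector storage (float32, excluding HNSW graph overhead; the three-vectors-per-document figure matches the multi-vector index above):

dims, docs, vectors_per_doc = 1536, 1_000_000, 3
raw_bytes = dims * 4 * docs * vectors_per_doc        # float32 = 4 bytes per dimension
print(f"{raw_bytes / 2**30:.1f} GiB of raw vectors")  # ~17.2 GiB before graph overhead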
Conclusion
Vector search in Azure AI Search has matured significantly. Understanding HNSW parameters, multi-vector strategies, and performance optimization allows you to build highly effective search applications.
Start with balanced settings, measure your specific workload, and tune from there.