3 min read
Azure AI Search Index Projections: Chunking Documents at Index Time
Index projections enable you to create one-to-many relationships during indexing, transforming parent documents into multiple child chunks. This is essential for effective RAG implementations.
The Problem
Large documents need to be chunked for RAG, but traditional indexing creates one index document per source document:
Source Document (10 pages) → One Index Document
↓
Query retrieves entire 10-page document
(Too much context for LLM)
The Solution: Index Projections
Source Document (10 pages) → Skillset (Chunking) → Multiple Index Documents
↓
Query retrieves relevant chunks
(Right-sized for LLM)
Configuration
Skillset with Chunking
from azure.search.documents.indexes.models import (
SearchIndexerSkillset,
SplitSkill,
AzureOpenAIEmbeddingSkill
)
# Skillset: splits each source document into overlapping chunks, then embeds
# each chunk. Skill outputs land under /document/chunks/* in the enrichment
# tree, where the index projections (configured on the indexer below) read them.
skillset = SearchIndexerSkillset(
    name="chunking-skillset",
    skills=[
        SplitSkill(
            name="chunk-documents",
            description="Split documents into chunks",
            # "pages" mode splits on sentence boundaries up to the length cap.
            # NOTE(review): SplitSkill lengths are measured in characters, not
            # tokens — confirm 2000 characters is the intended chunk size.
            text_split_mode="pages",
            maximum_page_length=2000,
            # Overlap carried between consecutive chunks so information at a
            # chunk boundary is not lost.
            page_overlap_length=200,
            inputs=[
                {"name": "text", "source": "/document/content"}
            ],
            outputs=[
                # Each split becomes one element of /document/chunks/*.
                {"name": "textItems", "targetName": "chunks"}
            ]
        ),
        AzureOpenAIEmbeddingSkill(
            name="embed-chunks",
            resource_uri="https://your-openai.openai.azure.com",
            deployment_id="text-embedding-ada-002",
            model_name="text-embedding-ada-002",
            inputs=[
                # The /* context makes this skill run once per chunk.
                {"name": "text", "source": "/document/chunks/*"}
            ],
            outputs=[
                # Embedding stored as /document/chunks/*/vector, later mapped
                # into the index's "vector" field.
                {"name": "embedding", "targetName": "vector"}
            ]
        )
    ]
)
Indexer with Projections
# Indexer wiring the blob data source to the chunks index through the skillset.
# NOTE(review): in current service/SDK versions index projections are normally
# declared on the *skillset* (SearchIndexerIndexProjection) rather than in the
# indexer's configuration bag — verify this shape against the API version in use.
indexer = SearchIndexer(
    name="document-indexer",
    data_source_name="blob-source",
    target_index_name="chunks-index",
    skillset_name="chunking-skillset",
    parameters=SearchIndexerParameters(
        configuration={
            "indexProjections": {
                "selectors": [
                    {
                        # One index document is emitted per element matched by
                        # sourceContext — i.e. one per chunk.
                        "targetIndexName": "chunks-index",
                        # Field in the target index that links each chunk back
                        # to its parent document.
                        "parentKeyFieldName": "parent_id",
                        "sourceContext": "/document/chunks/*",
                        "mappings": [
                            {"name": "chunk_id", "source": "/document/chunks/*/id"},
                            {"name": "content", "source": "/document/chunks/*/content"},
                            {"name": "vector", "source": "/document/chunks/*/vector"},
                            # Ordinal position of the chunk within its parent;
                            # enables adjacent-chunk retrieval at query time.
                            {"name": "chunk_index", "source": "/document/chunks/*/ordinal"},
                            # Parent-level metadata copied onto every chunk for
                            # provenance/tracing.
                            {"name": "source_file", "source": "/document/metadata_storage_name"},
                            {"name": "source_path", "source": "/document/metadata_storage_path"}
                        ]
                    }
                ],
                "parameters": {
                    # NOTE(review): documented projectionMode values are
                    # "skipIndexingParentDocuments" / "includeIndexingParentDocuments";
                    # confirm "generatedKeyAsKeyField" against the target API version.
                    "projectionMode": "generatedKeyAsKeyField"
                }
            }
        }
    )
)
Target Index Schema
# Target index: one document per chunk (not per source file). chunk_id is the
# projection-generated key; parent_id is filterable so all chunks of a single
# source document can be fetched together.
index = SearchIndex(
    name="chunks-index",
    fields=[
        SearchField(name="chunk_id", type="Edm.String", key=True),
        SearchField(name="parent_id", type="Edm.String", filterable=True),
        SearchField(name="content", type="Edm.String", searchable=True),
        SearchField(
            name="vector",
            type="Collection(Edm.Single)",
            # 1536 matches text-embedding-ada-002's output dimensionality.
            vector_search_dimensions=1536,
            vector_search_profile_name="vector-profile"
        ),
        # Sortable so neighboring chunks can be retrieved in document order.
        SearchField(name="chunk_index", type="Edm.Int32", sortable=True),
        SearchField(name="source_file", type="Edm.String", filterable=True),
        SearchField(name="source_path", type="Edm.String")
    ],
    # vector_search_config (algorithm + the "vector-profile" profile) is
    # assumed to be defined elsewhere in the file — not visible here.
    vector_search=vector_search_config
)
Query Patterns
Search Chunks
def search_chunks(query: str, query_vector: list, top_k: int = 5):
    """Run a hybrid (full-text + vector) search and return the top chunks.

    Args:
        query: Full-text search string.
        query_vector: Embedding of the query, matched against the "vector" field.
        top_k: Number of chunks to retrieve.

    Returns:
        List of result documents carrying chunk content and provenance fields.
    """
    vector_query = VectorizedQuery(vector=query_vector, k=top_k, fields="vector")
    wanted_fields = ["chunk_id", "content", "parent_id", "chunk_index", "source_file"]
    response = search_client.search(
        search_text=query,
        vector_queries=[vector_query],
        select=wanted_fields,
        top=top_k,
    )
    return [doc for doc in response]
Retrieve Context with Adjacent Chunks
def get_context_with_neighbors(chunk_id: str, parent_id: str, window: int = 1):
    """Return a chunk's content merged with its neighbors, in document order.

    Args:
        chunk_id: Key of the target chunk in the chunks index.
        parent_id: Key of the parent document the chunk belongs to.
        window: Number of chunks to include on each side of the target.

    Returns:
        The concatenated content of the target chunk and its neighbors,
        joined with blank lines, in ascending chunk_index order.
    """
    # Look up the target chunk's ordinal position within its parent document.
    target = search_client.get_document(chunk_id)
    target_index = target["chunk_index"]
    # FIX: escape single quotes so a parent_id containing ' cannot terminate
    # the OData string literal below (filter breakage / injection). OData
    # escapes ' inside a string literal by doubling it.
    safe_parent_id = parent_id.replace("'", "''")
    # Fetch the target chunk plus up to `window` chunks on each side.
    results = search_client.search(
        search_text="*",
        filter=(
            f"parent_id eq '{safe_parent_id}' "
            f"and chunk_index ge {target_index - window} "
            f"and chunk_index le {target_index + window}"
        ),
        order_by=["chunk_index"],
        select=["chunk_id", "content", "chunk_index"]
    )
    # order_by already sorts server-side; sort again defensively so the join
    # below never depends on iterator paging order.
    chunks = sorted(results, key=lambda x: x["chunk_index"])
    return "\n\n".join(c["content"] for c in chunks)
Group Results by Document
def search_and_group_by_document(query: str, query_vector: list, top_docs: int = 3,
                                 chunk_pool: int = 20):
    """Search chunks, then group the hits by their parent document.

    Over-retrieves `chunk_pool` chunks, buckets them by parent_id, ranks each
    parent by its best-scoring chunk, and returns the top documents.

    Args:
        query: Full-text search string.
        query_vector: Embedding of the query, matched against "vector".
        top_docs: Number of parent documents to return.
        chunk_pool: How many chunks to over-retrieve before grouping
            (generalizes the previously hard-coded 20).

    Returns:
        List of dicts ({"source_file", "chunks", "max_score"}), sorted by
        max_score descending, truncated to top_docs.
    """
    results = search_client.search(
        search_text=query,
        vector_queries=[
            VectorizedQuery(vector=query_vector, k=chunk_pool, fields="vector")
        ],
        # FIX: "@search.score" removed from select — it is not an index field
        # (select accepts only retrievable fields); the score is present on
        # every hit automatically and is still read below.
        select=["chunk_id", "content", "parent_id", "source_file"],
        top=chunk_pool,
    )
    # Bucket chunks by parent document, tracking the best chunk score seen.
    docs = {}
    for hit in results:
        entry = docs.setdefault(hit["parent_id"], {
            "source_file": hit["source_file"],
            "chunks": [],
            "max_score": 0,
        })
        entry["chunks"].append(hit)
        entry["max_score"] = max(entry["max_score"], hit["@search.score"])
    # Rank parents by their best chunk and keep the top N.
    sorted_docs = sorted(docs.values(), key=lambda d: d["max_score"], reverse=True)
    return sorted_docs[:top_docs]
Best Practices
- Chunk size matters - roughly 500-2000 tokens per chunk typically works well (note that SplitSkill's maximum_page_length is measured in characters, not tokens)
- Add overlap - Prevents information loss at boundaries
- Preserve metadata - Keep parent_id and source info for tracing
- Index chunk position - Enables retrieving adjacent chunks
- Test chunk sizes - Evaluate retrieval quality with your data
Conclusion
Index projections transform RAG architectures by enabling proper document chunking at index time. The result is more relevant retrievals and better LLM responses. Use them for any RAG system dealing with multi-page documents.