1 min read
Document Chunking Strategies for RAG Systems
This post shares practical, production-minded guidance on splitting documents into chunks for retrieval-augmented generation (RAG) systems.
Why Chunking Matters
# Problem: Documents are too long for embedding models
# OpenAI embeddings: max 8191 tokens
# Optimal for retrieval: 100-500 tokens
# Bad: Entire document as one chunk
# - Too much noise in embedding
# - Retrieved context too broad
# Bad: Chunks that are too small
# - Loses context
# - More storage and computation
# Good: Right-sized chunks
# - Coherent semantic units
# - Enough context for understanding
Strategy 1: Fixed-Size Chunking
Simple but effective for uniform content:
from typing import List
import tiktoken
class FixedSizeChunker:
    """Split text into windows of a fixed token count, with optional overlap.

    Text is tokenised with the ``tiktoken`` encoding that belongs to
    ``model``; each chunk is the decoded text of one token window.

    Args:
        chunk_size: Maximum number of tokens per chunk. Must be positive.
        chunk_overlap: Number of tokens shared by consecutive chunks. Must be
            smaller than ``chunk_size`` so that every window moves forward.
        model: Model name handed to ``tiktoken.encoding_for_model``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``chunk_overlap`` is
            not smaller than ``chunk_size``.
    """

    def __init__(
        self,
        chunk_size: int = 500,
        chunk_overlap: int = 50,
        model: str = "gpt-3.5-turbo"
    ):
        self._validate(chunk_size, chunk_overlap)
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.encoding = tiktoken.encoding_for_model(model)

    @staticmethod
    def _validate(chunk_size: int, chunk_overlap: int) -> None:
        """Reject size/overlap combinations for which the window cannot advance."""
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if chunk_overlap >= chunk_size:
            raise ValueError("chunk_overlap must be smaller than chunk_size")

    def count_tokens(self, text: str) -> int:
        """Return the number of tokens the configured encoding yields for *text*."""
        return len(self.encoding.encode(text))

    def chunk(self, text: str) -> List[str]:
        """Split *text* into fixed-size token chunks with overlap.

        Returns:
            The decoded chunks in document order; an empty list for text that
            encodes to no tokens.

        Raises:
            ValueError: If the instance attributes were changed to a
                combination that would never advance.
        """
        # Re-check here because the attributes are public and may have been
        # reassigned after construction.
        self._validate(self.chunk_size, self.chunk_overlap)

        tokens = self.encoding.encode(text)
        step = self.chunk_size - self.chunk_overlap
        chunks = []
        start = 0
        while start < len(tokens):
            end = start + self.chunk_size
            chunks.append(self.encoding.decode(tokens[start:end]))
            # Stop once a window reaches the end; otherwise the next window
            # would contain only the overlap that was already emitted.
            if end >= len(tokens):
                break
            start += step
        return chunks
# Usage: build a chunker with 500-token windows that overlap by 50 tokens and
# split a document with it.
# NOTE(review): `long_document` is not defined in this snippet — it is assumed
# to be a string supplied by the caller.
chunker = FixedSizeChunker(chunk_size=500, chunk_overlap=50)
chunks = chunker.chunk(long_document)
Strategy 2: Sentence-Based Chunking
Respect sentence boundaries:
import re
from typing import List
class SentenceChunker:
    """Group whole sentences into chunks bounded by a word budget.

    Sentences are detected by whitespace that follows ``.``, ``!`` or ``?``.
    Sizes are measured in whitespace-separated words. ``min_chunk_size`` is
    stored on the instance but is not consulted by :meth:`chunk`.
    """

    def __init__(
        self,
        max_chunk_size: int = 500,
        min_chunk_size: int = 100
    ):
        self.sentence_pattern = re.compile(r'(?<=[.!?])\s+')
        self.min_chunk_size = min_chunk_size
        self.max_chunk_size = max_chunk_size

    def split_sentences(self, text: str) -> List[str]:
        """Return the non-empty, stripped sentences found in *text*."""
        stripped = (piece.strip() for piece in self.sentence_pattern.split(text))
        return [piece for piece in stripped if piece]

    def chunk(self, text: str) -> List[str]:
        """Pack consecutive sentences into space-joined chunks.

        A sentence that would push the running word count above
        ``max_chunk_size`` starts a new chunk; a single sentence longer than
        the budget therefore becomes a chunk of its own.
        """
        grouped = []
        pending = []
        pending_words = 0

        for sentence in self.split_sentences(text):
            word_count = len(sentence.split())

            # Still within budget: keep accumulating.
            if pending_words + word_count <= self.max_chunk_size:
                pending.append(sentence)
                pending_words += word_count
                continue

            # Budget exceeded: emit what we have and restart with this sentence.
            if pending:
                grouped.append(' '.join(pending))
            pending, pending_words = [sentence], word_count

        if pending:
            grouped.append(' '.join(pending))
        return grouped
Strategy 3: Paragraph-Based Chunking
Natural document structure:
class ParagraphChunker:
    """Chunk text along blank-line paragraph boundaries.

    Args:
        max_chunk_size: Maximum length, in characters, of a merged chunk.
        combine_short: When true, consecutive paragraphs are merged as long
            as the merged text stays within ``max_chunk_size``; when false,
            every paragraph is returned as its own chunk.
        min_paragraph_length: Stored on the instance; not consulted by
            :meth:`chunk`.
    """

    def __init__(
        self,
        max_chunk_size: int = 1000,
        combine_short: bool = True,
        min_paragraph_length: int = 50
    ):
        self.max_chunk_size = max_chunk_size
        self.combine_short = combine_short
        self.min_paragraph_length = min_paragraph_length

    def chunk(self, text: str) -> List[str]:
        """Split *text* by paragraphs, optionally combining short ones.

        Returns:
            Stripped, non-empty chunks in document order. A paragraph longer
            than ``max_chunk_size`` is never split; it is emitted alone.
        """
        separator = '\n\n'
        paragraphs = [p.strip() for p in text.split(separator) if p.strip()]

        if not self.combine_short:
            return paragraphs

        chunks = []
        current_chunk = []
        current_length = 0  # length of separator.join(current_chunk)

        for para in paragraphs:
            para_length = len(para)
            # Length the chunk would have after joining, separator included,
            # so the emitted text can never exceed max_chunk_size by the
            # width of the joins.
            if current_chunk:
                joined_length = current_length + len(separator) + para_length
            else:
                joined_length = para_length

            if current_chunk and joined_length > self.max_chunk_size:
                chunks.append(separator.join(current_chunk))
                current_chunk = [para]
                current_length = para_length
            else:
                # Either the paragraph fits, or the chunk is empty and an
                # oversized paragraph has to stand on its own.
                current_chunk.append(para)
                current_length = joined_length

        if current_chunk:
            chunks.append(separator.join(current_chunk))

        return chunks
Strategy 4: Semantic Chunking
Use embeddings to find natural break points:
import numpy as np
from typing import List, Tuple
class SemanticChunker:
    """Chunk text at points where adjacent sentences stop being similar.

    Args:
        embedding_model: Object exposing ``embed(text)`` that returns a
            numeric vector for a sentence.
        similarity_threshold: A break is placed before a sentence whose
            cosine similarity to the previous sentence is below this value.
        min_chunk_size: Minimum chunk length in characters. Runs shorter than
            this are merged with neighbouring text instead of being emitted.
    """

    def __init__(
        self,
        embedding_model,
        similarity_threshold: float = 0.5,
        min_chunk_size: int = 100
    ):
        self.embedding_model = embedding_model
        self.similarity_threshold = similarity_threshold
        self.min_chunk_size = min_chunk_size

    def cosine_similarity(self, a: List[float], b: List[float]) -> float:
        """Return the cosine similarity of *a* and *b*.

        A zero-length vector has no direction, so the similarity is reported
        as ``0.0`` rather than dividing by zero.
        """
        a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
        denominator = np.linalg.norm(a) * np.linalg.norm(b)
        if denominator == 0:
            return 0.0
        return float(np.dot(a, b) / denominator)

    def find_break_points(
        self,
        sentences: List[str]
    ) -> List[int]:
        """Return the indices of sentences that should start a new chunk.

        Index ``i`` is reported when the similarity between sentence ``i - 1``
        and sentence ``i`` is below ``similarity_threshold``.
        """
        if len(sentences) < 2:
            return []

        # One embedding call per sentence.
        embeddings = [self.embedding_model.embed(s) for s in sentences]

        return [
            i
            for i in range(1, len(embeddings))
            if self.cosine_similarity(embeddings[i - 1], embeddings[i])
            < self.similarity_threshold
        ]

    def chunk(self, text: str) -> List[str]:
        """Split *text* into chunks at semantic break points.

        No sentence is discarded: a run shorter than ``min_chunk_size`` keeps
        accumulating until it is long enough, and a short tail is appended to
        the previous chunk (or returned alone when it is the only text).
        """
        sentences = re.split(r'(?<=[.!?])\s+', text)
        sentences = [s.strip() for s in sentences if s.strip()]

        break_points = self.find_break_points(sentences)

        chunks = []
        start = 0
        for bp in break_points:
            candidate = ' '.join(sentences[start:bp])
            if len(candidate) >= self.min_chunk_size:
                chunks.append(candidate)
                start = bp
            # Otherwise leave `start` untouched so the short run is merged
            # with the sentences that follow instead of being lost.

        if start < len(sentences):
            tail = ' '.join(sentences[start:])
            if len(tail) >= self.min_chunk_size or not chunks:
                chunks.append(tail)
            else:
                # Too short to stand alone: attach it to the previous chunk.
                chunks[-1] = chunks[-1] + ' ' + tail

        return chunks
Strategy 5: Recursive Chunking
Hierarchical splitting with multiple separators:
class RecursiveChunker:
    """Recursively split text, trying coarse separators before fine ones.

    Args:
        chunk_size: Target maximum chunk length in characters.
        chunk_overlap: Stored on the instance; not applied by :meth:`chunk`.
        separators: Separators ordered from coarsest to finest. An empty
            string means "split into single characters". Defaults to
            paragraphs, lines, sentences, clauses, words, characters.
    """

    def __init__(
        self,
        chunk_size: int = 500,
        chunk_overlap: int = 50,
        separators: List[str] = None
    ):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.separators = separators or [
            "\n\n",  # Paragraphs
            "\n",    # Lines
            ". ",    # Sentences
            ", ",    # Clauses
            " ",     # Words
            ""       # Characters
        ]

    def _split_text(self, text: str, separator: str) -> List[str]:
        """Split *text* on *separator*; an empty separator yields characters."""
        if separator:
            return text.split(separator)
        return list(text)

    def _merge_splits(
        self,
        splits: List[str],
        separator: str
    ) -> List[str]:
        """Greedily re-join *splits* with *separator* within ``chunk_size``.

        The separator is counted for every join, so a merged chunk only
        exceeds ``chunk_size`` when a single split is itself larger.
        """
        chunks = []
        current_chunk = []
        current_length = 0  # length of separator.join(current_chunk)

        for split in splits:
            split_length = len(split)
            if current_chunk:
                joined_length = current_length + len(separator) + split_length
            else:
                joined_length = split_length

            if current_chunk and joined_length > self.chunk_size:
                chunks.append(separator.join(current_chunk))
                current_chunk = [split]
                current_length = split_length
            else:
                current_chunk.append(split)
                current_length = joined_length

        if current_chunk:
            chunks.append(separator.join(current_chunk))

        return chunks

    def chunk(self, text: str, separators: List[str] = None) -> List[str]:
        """Recursively chunk *text*.

        Args:
            text: Text to split.
            separators: Separators still available at this recursion level;
                ``None`` means "use the instance defaults".

        Returns:
            Chunks in document order. Pieces produced by finer separators are
            re-joined using the current level's separator.
        """
        # Only fall back to the defaults when nothing was passed. An empty
        # list means the separators are exhausted; treating it as "use the
        # defaults" would restart the recursion and never terminate.
        if separators is None:
            separators = self.separators

        if not separators:
            return [text]

        separator = separators[0]
        splits = self._split_text(text, separator)

        final_chunks = []
        for split in splits:
            if len(split) > self.chunk_size:
                # Too large for this separator: retry with the finer ones.
                final_chunks.extend(self.chunk(split, separators[1:]))
            else:
                final_chunks.append(split)

        return self._merge_splits(final_chunks, separator)
Strategy 6: Document-Aware Chunking
Respect document structure (headers, sections):
import re
from dataclasses import dataclass
@dataclass
class DocumentSection:
    """One header-delimited section of a parsed document."""

    title: str  # header text; "Introduction" for text before the first header
    level: int  # number of leading '#' characters; 0 for pre-header text
    content: str  # stripped body text of the section
    parent: str = ""  # title of the latest header exactly one level up, if any


class DocumentAwareChunker:
    """Chunk Markdown-style text along its ``#`` header structure.

    Args:
        max_chunk_size: Maximum section length, in characters, that is kept
            as a single chunk; longer sections are split into parts.
    """

    def __init__(self, max_chunk_size: int = 1000):
        self.max_chunk_size = max_chunk_size
        self.header_pattern = re.compile(r'^(#{1,6})\s+(.+)$', re.MULTILINE)

    def _append_section(
        self,
        sections: List[DocumentSection],
        title: str,
        level: int,
        lines: List[str],
        headers: dict
    ) -> None:
        """Append a section built from *lines* unless its body is blank."""
        content = '\n'.join(lines).strip()
        if content:
            sections.append(DocumentSection(
                title=title,
                level=level,
                content=content,
                parent=headers.get(level - 1, "")
            ))

    def parse_structure(self, text: str) -> List[DocumentSection]:
        """Parse *text* into sections, one per header that has body text.

        Text before the first header is reported as a level-0 section titled
        "Introduction". Headers without body text produce no section.
        """
        sections = []
        current_headers = {}  # header level -> most recent title at that level
        current_content = []
        current_title = "Introduction"
        current_level = 0

        for line in text.split('\n'):
            header_match = self.header_pattern.match(line)
            if not header_match:
                current_content.append(line)
                continue

            # A header closes the section collected so far.
            self._append_section(
                sections, current_title, current_level,
                current_content, current_headers
            )

            level = len(header_match.group(1))
            title = header_match.group(2)
            # Headers deeper than the new one belong to an earlier branch;
            # drop them so they cannot be reported as a later parent.
            for stale in [lvl for lvl in current_headers if lvl > level]:
                del current_headers[stale]
            current_headers[level] = title
            current_title = title
            current_level = level
            current_content = []

        # Close the trailing section.
        self._append_section(
            sections, current_title, current_level,
            current_content, current_headers
        )
        return sections

    def _split_section(self, content: str) -> List[str]:
        """Split oversized section text into pieces within ``max_chunk_size``.

        Paragraphs (blank-line separated) are packed greedily; a paragraph
        that is itself too long is cut into fixed-width character windows.
        """
        limit = max(self.max_chunk_size, 1)  # guard against a zero-width window
        separator = '\n\n'
        pieces = []
        current = ""

        for para in content.split(separator):
            para = para.strip()
            if not para:
                continue

            while len(para) > limit:
                if current:
                    pieces.append(current)
                    current = ""
                pieces.append(para[:limit])
                para = para[limit:]
            if not para:
                continue

            candidate = f"{current}{separator}{para}" if current else para
            if len(candidate) > limit:
                pieces.append(current)
                current = para
            else:
                current = candidate

        if current:
            pieces.append(current)
        return pieces

    def chunk(self, text: str) -> List[dict]:
        """Chunk *text* and attach document-structure metadata.

        Returns:
            A list of dicts with ``"content"`` and ``"metadata"`` keys; the
            metadata holds the section ``"title"``, ``"level"`` and
            ``"parent"``. Parts of a split section get the title suffix
            ``" (part N)"``.
        """
        chunks = []

        for section in self.parse_structure(text):
            if len(section.content) <= self.max_chunk_size:
                parts = [(section.title, section.content)]
            else:
                parts = [
                    (f"{section.title} (part {i+1})", sub)
                    for i, sub in enumerate(self._split_section(section.content))
                ]

            for title, content in parts:
                chunks.append({
                    "content": content,
                    "metadata": {
                        "title": title,
                        "level": section.level,
                        "parent": section.parent
                    }
                })

        return chunks
Choosing a Strategy
# Rows of (content kind, strategy name, when to use it, illustrative params).
_GUIDE_ROWS = (
    (
        "uniform_content",
        "Fixed-Size",
        "Content is uniform (e.g., logs, records)",
        {"chunk_size": 500, "overlap": 50},
    ),
    (
        "prose_text",
        "Sentence-Based",
        "Natural language prose",
        {"max_chunk_size": 500},
    ),
    (
        "structured_docs",
        "Document-Aware",
        "Documents with headers/sections",
        {"respect_structure": True},
    ),
    (
        "varied_content",
        "Recursive",
        "Mixed content types",
        {"separators": ["\n\n", "\n", ". "]},
    ),
    (
        "high_quality",
        "Semantic",
        "Quality is critical, latency acceptable",
        {"similarity_threshold": 0.5},
    ),
)

# Quick-reference lookup: content kind -> recommended strategy, the situation
# it suits, and example parameter values.
CHUNKING_GUIDE = {
    content_kind: {"strategy": strategy, "when": when, "params": params}
    for content_kind, strategy, when, params in _GUIDE_ROWS
}
Best Practices
- Include metadata: Track source, position, and structure
- Use overlap: Prevent losing context at boundaries
- Test empirically: Measure retrieval quality with different sizes
- Consider query type: Adjust based on typical queries
- Preserve context: Include headers/titles in chunks
Resources
- LangChain Text Splitters
- Chunking Strategies Comparison

## Takeaways

Add a concise, personal takeaway and recommended next steps here.