Document Chunking Strategies for RAG Systems
Chunking is the process of splitting documents into smaller pieces for embedding and retrieval. The quality of your chunks directly impacts RAG performance. Let’s explore different chunking strategies.
Why Chunking Matters
# Problem: Documents are often too long for embedding models
# OpenAI embeddings: max 8191 input tokens
# Typical sweet spot for retrieval: 100-500 tokens per chunk
# Bad: Entire document as one chunk
# - Too much noise in embedding
# - Retrieved context too broad
# Bad: Chunks that are too small
# - Lose surrounding context
# - More storage and computation
# Good: Right-sized chunks
# - Coherent semantic units
# - Enough context for understanding
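To make the problem concrete, here is a quick token count check. This is a minimal sketch: the file name is illustrative, and it assumes tiktoken's cl100k_base encoding (used by recent OpenAI embedding models).

import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")

document = open("annual_report.txt").read()  # hypothetical source document
n_tokens = len(encoding.encode(document))
print(f"{n_tokens} tokens")  # anything approaching 8191 has to be chunked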
Strategy 1: Fixed-Size Chunking
Simple but effective for uniform content:
from typing import List

import tiktoken

class FixedSizeChunker:
    """Chunk by fixed token count."""

    def __init__(
        self,
        chunk_size: int = 500,
        chunk_overlap: int = 50,
        model: str = "gpt-3.5-turbo"
    ):
        if chunk_overlap >= chunk_size:
            raise ValueError("chunk_overlap must be smaller than chunk_size")
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.encoding = tiktoken.encoding_for_model(model)

    def count_tokens(self, text: str) -> int:
        return len(self.encoding.encode(text))

    def chunk(self, text: str) -> List[str]:
        """Split text into fixed-size chunks with overlap."""
        tokens = self.encoding.encode(text)
        chunks = []
        start = 0
        while start < len(tokens):
            end = start + self.chunk_size
            chunks.append(self.encoding.decode(tokens[start:end]))
            if end >= len(tokens):
                break  # last chunk reached; avoid a trailing overlap-only duplicate
            start = end - self.chunk_overlap
        return chunks
# Usage
chunker = FixedSizeChunker(chunk_size=500, chunk_overlap=50)
chunks = chunker.chunk(long_document)
Strategy 2: Sentence-Based Chunking
Respect sentence boundaries:
import re
from typing import List

class SentenceChunker:
    """Chunk by sentences, respecting boundaries."""

    def __init__(
        self,
        max_chunk_size: int = 500,
        min_chunk_size: int = 100
    ):
        # Sizes here are measured in words, not tokens
        self.max_chunk_size = max_chunk_size
        self.min_chunk_size = min_chunk_size
        self.sentence_pattern = re.compile(r'(?<=[.!?])\s+')

    def split_sentences(self, text: str) -> List[str]:
        """Split text into sentences."""
        sentences = self.sentence_pattern.split(text)
        return [s.strip() for s in sentences if s.strip()]

    def chunk(self, text: str) -> List[str]:
        """Group sentences into chunks."""
        sentences = self.split_sentences(text)
        chunks = []
        current_chunk = []
        current_length = 0
        for sentence in sentences:
            sentence_length = len(sentence.split())
            if current_chunk and current_length + sentence_length > self.max_chunk_size:
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_length = sentence_length
            else:
                current_chunk.append(sentence)
                current_length += sentence_length
        if current_chunk:
            final = ' '.join(current_chunk)
            # Fold a too-short final chunk into the previous one
            if chunks and current_length < self.min_chunk_size:
                chunks[-1] += ' ' + final
            else:
                chunks.append(final)
        return chunks
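A quick usage sketch (article_text stands in for any prose string you want to index):

chunker = SentenceChunker(max_chunk_size=200, min_chunk_size=40)
chunks = chunker.chunk(article_text)  # article_text: illustrative placeholder
for i, c in enumerate(chunks):
    print(i, len(c.split()), "words")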
Strategy 3: Paragraph-Based Chunking
Natural document structure:
from typing import List

class ParagraphChunker:
    """Chunk by paragraphs."""

    def __init__(
        self,
        max_chunk_size: int = 1000,
        combine_short: bool = True
    ):
        # Sizes here are measured in characters
        self.max_chunk_size = max_chunk_size
        self.combine_short = combine_short

    def chunk(self, text: str) -> List[str]:
        """Split by paragraphs, optionally combining short ones."""
        paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
        if not self.combine_short:
            return paragraphs
        chunks = []
        current_chunk = []
        current_length = 0
        for para in paragraphs:
            para_length = len(para)
            if para_length > self.max_chunk_size:
                # An oversized paragraph becomes its own chunk
                if current_chunk:
                    chunks.append('\n\n'.join(current_chunk))
                    current_chunk = []
                    current_length = 0
                chunks.append(para)
            elif current_length + para_length > self.max_chunk_size:
                # Current chunk is full; start a new one
                chunks.append('\n\n'.join(current_chunk))
                current_chunk = [para]
                current_length = para_length
            else:
                current_chunk.append(para)
                current_length += para_length
        if current_chunk:
            chunks.append('\n\n'.join(current_chunk))
        return chunks
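And in use (report_text is an illustrative placeholder for any text with blank-line paragraph breaks):

chunker = ParagraphChunker(max_chunk_size=1000)
chunks = chunker.chunk(report_text)  # report_text: illustrative placeholder
print(len(chunks), "chunks; largest is", max(len(c) for c in chunks), "chars")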
Strategy 4: Semantic Chunking
Use embeddings to find natural break points:
import re
from typing import List

import numpy as np

class SemanticChunker:
    """Chunk based on semantic similarity."""

    def __init__(
        self,
        embedding_model,
        similarity_threshold: float = 0.5,
        min_chunk_size: int = 100
    ):
        # embedding_model must expose embed(text) -> List[float]
        self.embedding_model = embedding_model
        self.similarity_threshold = similarity_threshold
        self.min_chunk_size = min_chunk_size  # in characters

    def cosine_similarity(self, a: List[float], b: List[float]) -> float:
        a, b = np.array(a), np.array(b)
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

    def find_break_points(self, sentences: List[str]) -> List[int]:
        """Find indices where adjacent sentences diverge semantically."""
        if len(sentences) < 2:
            return []
        # Get embeddings for all sentences
        embeddings = [self.embedding_model.embed(s) for s in sentences]
        # Flag points where similarity between neighbors drops
        break_points = []
        for i in range(1, len(embeddings)):
            sim = self.cosine_similarity(embeddings[i - 1], embeddings[i])
            if sim < self.similarity_threshold:
                break_points.append(i)
        return break_points

    def chunk(self, text: str) -> List[str]:
        """Chunk based on semantic breaks."""
        # Split into sentences first
        sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
        break_points = self.find_break_points(sentences)
        chunks = []
        start = 0
        for bp in break_points:
            chunk = ' '.join(sentences[start:bp])
            # Only cut here if the chunk is big enough; otherwise keep
            # accumulating so that no sentences are silently dropped
            if len(chunk) >= self.min_chunk_size:
                chunks.append(chunk)
                start = bp
        # Add the final chunk
        if start < len(sentences):
            chunk = ' '.join(sentences[start:])
            if len(chunk) >= self.min_chunk_size or not chunks:
                chunks.append(chunk)
            else:
                # Fold a too-short tail into the previous chunk
                chunks[-1] += ' ' + chunk
        return chunks
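SemanticChunker accepts any object with an embed() method. One possible adapter uses the sentence-transformers package; the model name and wrapper below are illustrative, not part of the chunker itself.

from sentence_transformers import SentenceTransformer

class STEmbedder:
    """Adapter exposing the embed() interface SemanticChunker expects."""
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def embed(self, text: str):
        return self.model.encode(text).tolist()

chunker = SemanticChunker(STEmbedder(), similarity_threshold=0.5)
chunks = chunker.chunk(essay_text)  # essay_text: illustrative placeholder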
Strategy 5: Recursive Chunking
Hierarchical splitting with multiple separators:
from typing import List, Optional

class RecursiveChunker:
    """Recursively split using multiple separators."""

    def __init__(
        self,
        chunk_size: int = 500,
        chunk_overlap: int = 50,
        separators: Optional[List[str]] = None
    ):
        self.chunk_size = chunk_size  # in characters
        self.chunk_overlap = chunk_overlap  # kept for API parity; not applied in this simplified version
        self.separators = separators or [
            "\n\n",  # Paragraphs
            "\n",    # Lines
            ". ",    # Sentences
            ", ",    # Clauses
            " ",     # Words
            ""       # Characters
        ]

    def _split_text(self, text: str, separator: str) -> List[str]:
        if separator:
            return text.split(separator)
        return list(text)

    def _merge_splits(self, splits: List[str], separator: str) -> List[str]:
        """Merge splits back together, respecting chunk size."""
        chunks = []
        current_chunk = []
        current_length = 0
        for split in splits:
            split_length = len(split)
            if current_chunk and current_length + split_length > self.chunk_size:
                chunks.append(separator.join(current_chunk))
                current_chunk = [split]
                current_length = split_length
            else:
                current_chunk.append(split)
                current_length += split_length + len(separator)
        if current_chunk:
            chunks.append(separator.join(current_chunk))
        return chunks

    def chunk(self, text: str, separators: Optional[List[str]] = None) -> List[str]:
        """Recursively chunk text, falling back to finer separators."""
        if separators is None:
            separators = self.separators
        if not separators:
            return [text]
        separator = separators[0]
        splits = self._split_text(text, separator)
        # Recursively break down any split that is still too large
        final_chunks = []
        for split in splits:
            if len(split) > self.chunk_size:
                final_chunks.extend(self.chunk(split, separators[1:]))
            else:
                final_chunks.append(split)
        # Merge small pieces back up to the target size
        return self._merge_splits(final_chunks, separator)
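In use (mixed_text is an illustrative placeholder for content with varied structure):

chunker = RecursiveChunker(chunk_size=500)
for c in chunker.chunk(mixed_text):  # mixed_text: illustrative placeholder
    print(len(c), repr(c[:60]))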
Strategy 6: Document-Aware Chunking
Respect document structure (headers, sections):
import re
from dataclasses import dataclass
from typing import Dict, List

@dataclass
class DocumentSection:
    title: str
    level: int
    content: str
    parent: str = ""

class DocumentAwareChunker:
    """Chunk respecting document structure."""

    def __init__(self, max_chunk_size: int = 1000):
        self.max_chunk_size = max_chunk_size  # in characters
        self.header_pattern = re.compile(r'^(#{1,6})\s+(.+)$')

    def parse_structure(self, text: str) -> List[DocumentSection]:
        """Parse a markdown document into sections."""
        sections = []
        current_headers: Dict[int, str] = {}
        current_content = []
        current_title = "Introduction"
        current_level = 0
        for line in text.split('\n'):
            header_match = self.header_pattern.match(line)
            if header_match:
                # Save the previous section before starting a new one
                if current_content:
                    content = '\n'.join(current_content).strip()
                    if content:
                        sections.append(DocumentSection(
                            title=current_title,
                            level=current_level,
                            content=content,
                            parent=current_headers.get(current_level - 1, "")
                        ))
                # Start new section
                level = len(header_match.group(1))
                title = header_match.group(2)
                current_headers[level] = title
                current_title = title
                current_level = level
                current_content = []
            else:
                current_content.append(line)
        # Save the last section
        if current_content:
            content = '\n'.join(current_content).strip()
            if content:
                sections.append(DocumentSection(
                    title=current_title,
                    level=current_level,
                    content=content,
                    parent=current_headers.get(current_level - 1, "")
                ))
        return sections

    def _split_section(self, content: str) -> List[str]:
        """Split an oversized section on paragraph boundaries (one simple approach)."""
        chunks = []
        current = ""
        for para in content.split('\n\n'):
            if current and len(current) + len(para) + 2 > self.max_chunk_size:
                chunks.append(current)
                current = para
            else:
                current = f"{current}\n\n{para}" if current else para
        if current:
            chunks.append(current)
        return chunks

    def chunk(self, text: str) -> List[dict]:
        """Chunk with document structure metadata."""
        sections = self.parse_structure(text)
        chunks = []
        for section in sections:
            if len(section.content) <= self.max_chunk_size:
                chunks.append({
                    "content": section.content,
                    "metadata": {
                        "title": section.title,
                        "level": section.level,
                        "parent": section.parent
                    }
                })
            else:
                # Split large sections on paragraph boundaries
                for i, sub in enumerate(self._split_section(section.content)):
                    chunks.append({
                        "content": sub,
                        "metadata": {
                            "title": f"{section.title} (part {i + 1})",
                            "level": section.level,
                            "parent": section.parent
                        }
                    })
        return chunks
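Usage on a small markdown document (the sample is illustrative):

doc = """# Guide

Intro paragraph.

## Setup

Install the tools and configure paths.
"""

chunker = DocumentAwareChunker(max_chunk_size=1000)
for c in chunker.chunk(doc):
    print(c["metadata"]["title"], "->", c["content"][:40])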
Choosing a Strategy
CHUNKING_GUIDE = {
    "uniform_content": {
        "strategy": "Fixed-Size",
        "when": "Content is uniform (e.g., logs, records)",
        "params": {"chunk_size": 500, "overlap": 50}
    },
    "prose_text": {
        "strategy": "Sentence-Based",
        "when": "Natural language prose",
        "params": {"max_chunk_size": 500}
    },
    "structured_docs": {
        "strategy": "Document-Aware",
        "when": "Documents with headers/sections",
        "params": {"respect_structure": True}
    },
    "varied_content": {
        "strategy": "Recursive",
        "when": "Mixed content types",
        "params": {"separators": ["\n\n", "\n", ". "]}
    },
    "high_quality": {
        "strategy": "Semantic",
        "when": "Quality is critical, latency acceptable",
        "params": {"similarity_threshold": 0.5}
    }
}
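A tiny dispatcher over the guide, as a sketch; how you detect the content type is up to you, and SemanticChunker is omitted because it needs an embedding model.

def make_chunker(content_type: str):
    """Map a content type from CHUNKING_GUIDE to a configured chunker."""
    if content_type == "uniform_content":
        return FixedSizeChunker(chunk_size=500, chunk_overlap=50)
    if content_type == "prose_text":
        return SentenceChunker(max_chunk_size=500)
    if content_type == "structured_docs":
        return DocumentAwareChunker()
    if content_type == "varied_content":
        return RecursiveChunker(separators=["\n\n", "\n", ". "])
    raise ValueError(f"No chunker configured for {content_type!r}")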
Best Practices
- Include metadata: Track source, position, and structure (see the sketch after this list)
- Use overlap: Prevent losing context at boundaries
- Test empirically: Measure retrieval quality with different sizes
- Consider query type: Adjust based on typical queries
- Preserve context: Include headers/titles in chunks
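As a concrete version of the first point, here is one way to attach metadata while chunking. This is a sketch: the source_id field and the wrapper function are illustrative, and it works with any of the string-returning chunkers above.

def chunk_with_metadata(chunker, text: str, source_id: str) -> list:
    """Wrap any chunker's output with source and position metadata."""
    records = []
    position = 0
    for i, chunk in enumerate(chunker.chunk(text)):
        records.append({
            "content": chunk,
            "metadata": {
                "source": source_id,
                "chunk_index": i,
                "char_start": position  # approximate when chunks overlap
            }
        })
        position += len(chunk)
    return records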