Skip to content
Back to Blog
1 min read

Retrieval-Augmented Generation: Chunking Strategies for Better Results

I wrote “Retrieval-Augmented Generation: Chunking Strategies for Better Results” to share practical, production-minded guidance on this topic.

Chunking Fundamentals

from dataclasses import dataclass
from typing import List, Optional
import re

@dataclass
class Chunk:
    """A piece of a source document plus the data needed to trace it back."""

    # Chunk content as handed to the embedder / retriever.
    text: str
    # Free-form attributes; DocumentChunker stores "doc_id" and "chunk_num".
    metadata: dict
    # Character offsets into the source text; end_index is exclusive.
    start_index: int
    end_index: int
    # Identifier built from the document id and the chunk's ordinal.
    chunk_id: str


class DocumentChunker:
    """Split document text into retrieval-sized chunks.

    Attributes:
        chunk_size: Target maximum number of characters per chunk.
        chunk_overlap: Number of characters shared between consecutive
            fixed-size chunks. Only ``fixed_size_chunks`` uses it.
    """

    def __init__(self, chunk_size: int = 512, chunk_overlap: int = 50):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def fixed_size_chunks(self, text: str, doc_id: str) -> List[Chunk]:
        """Cut ``text`` into overlapping windows of at most ``chunk_size``.

        A window that would end inside a word is shortened to the previous
        space, newline or tab. If the window holds no such character the cut
        is made at ``chunk_size`` anyway so the scan always advances.

        Args:
            text: Source document text.
            doc_id: Identifier used in chunk metadata and chunk ids.

        Returns:
            Chunks in document order. Each chunk's text is stripped, and
            ``text[chunk.start_index:chunk.end_index] == chunk.text``.

        Raises:
            ValueError: If ``chunk_size`` is not positive.
        """
        if self.chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")

        overlap = max(self.chunk_overlap, 0)
        text_len = len(text)
        chunks: List[Chunk] = []
        start = 0

        while start < text_len:
            end = min(start + self.chunk_size, text_len)

            # Avoid cutting words: back up to the nearest break character.
            if end < text_len:
                cut = end
                while cut > start and text[cut] not in " \n\t":
                    cut -= 1
                # cut == start means the window has no break; hard-cut at end.
                if cut > start:
                    end = cut

            window = text[start:end]
            body = window.strip()
            if body:
                # Report offsets of the stripped text, not of the raw window.
                body_start = start + len(window) - len(window.lstrip())
                chunks.append(self._make_chunk(
                    body, doc_id, len(chunks),
                    body_start, body_start + len(body),
                ))

            if end >= text_len:
                # The tail is already covered; another pass would only emit
                # a fragment fully contained in the chunk just produced.
                break

            # Step back by the overlap, but never to or before the current
            # start: that would repeat the same window forever.
            next_start = end - overlap
            start = next_start if next_start > start else end

        return chunks

    def semantic_chunks(self, text: str, doc_id: str) -> List[Chunk]:
        """Chunk on paragraph boundaries, packing paragraphs up to the limit.

        Paragraphs (separated by blank lines) are joined with a blank line
        until adding another would exceed ``chunk_size``. A paragraph that is
        longer than ``chunk_size`` on its own is split into sentence groups.

        Args:
            text: Source document text.
            doc_id: Identifier used in chunk metadata and chunk ids.

        Returns:
            Chunks in document order. ``start_index`` / ``end_index`` span the
            first to the last character the chunk covers in ``text``; the
            chunk text itself uses normalized separators, so it can be shorter
            than that span.
        """
        separator_len = len("\n\n")
        chunks: List[Chunk] = []
        pending: List[tuple] = []  # (paragraph, start, end) awaiting a flush
        pending_len = 0            # length of the pending paragraphs once joined

        for para, para_start in self._locate_paragraphs(text):
            para_len = len(para)

            # A single oversized paragraph is split by sentences.
            if para_len > self.chunk_size:
                if pending:
                    chunks.append(
                        self._join_paragraphs(pending, doc_id, len(chunks))
                    )
                    pending, pending_len = [], 0
                chunks.extend(
                    self._split_paragraph(para, para_start, doc_id, len(chunks))
                )
                continue

            # Budget includes the blank-line separator added when joining.
            joined_len = para_len
            if pending:
                joined_len += pending_len + separator_len

            if pending and joined_len > self.chunk_size:
                chunks.append(self._join_paragraphs(pending, doc_id, len(chunks)))
                pending, joined_len = [], para_len

            pending.append((para, para_start, para_start + para_len))
            pending_len = joined_len

        # Flush remaining paragraphs.
        if pending:
            chunks.append(self._join_paragraphs(pending, doc_id, len(chunks)))

        return chunks

    def _make_chunk(self, text: str, doc_id: str, chunk_num: int,
                    start: int, end: int) -> Chunk:
        """Build a Chunk with the standard metadata and id layout."""
        return Chunk(
            text=text,
            metadata={"doc_id": doc_id, "chunk_num": chunk_num},
            start_index=start,
            end_index=end,
            chunk_id=f"{doc_id}_chunk_{chunk_num}",
        )

    def _locate_paragraphs(self, text: str) -> List[tuple]:
        """Return ``(paragraph, start_offset)`` for each non-empty paragraph."""
        boundaries = [m.span() for m in re.finditer(r'\n\s*\n', text)]
        boundaries.append((len(text), len(text)))  # sentinel for the last one

        located = []
        cursor = 0
        for sep_start, sep_end in boundaries:
            raw = text[cursor:sep_start]
            para = raw.strip()
            if para:
                located.append((para, cursor + len(raw) - len(raw.lstrip())))
            cursor = sep_end
        return located

    def _join_paragraphs(self, pending: List[tuple], doc_id: str,
                         chunk_num: int) -> Chunk:
        """Merge pending ``(paragraph, start, end)`` entries into one chunk."""
        joined = "\n\n".join(entry[0] for entry in pending)
        return self._make_chunk(
            joined, doc_id, chunk_num, pending[0][1], pending[-1][2]
        )

    def _split_paragraph(self, para: str, para_start: int, doc_id: str,
                         first_chunk_num: int) -> List[Chunk]:
        """Split an oversized paragraph into sentence-group chunks."""
        sentences = [s for s in self._split_sentences(para) if s]

        # Sentences are consecutive substrings of ``para``, so a forward scan
        # recovers each one's exact position.
        spans = []
        cursor = 0
        for sentence in sentences:
            begin = para.find(sentence, cursor)
            cursor = begin + len(sentence)
            spans.append((begin, cursor))

        chunks: List[Chunk] = []
        first = 0
        for group in self._pack_sentences(sentences, self.chunk_size):
            last = first + len(group) - 1
            chunks.append(self._make_chunk(
                " ".join(group), doc_id, first_chunk_num + len(chunks),
                para_start + spans[first][0], para_start + spans[last][1],
            ))
            first = last + 1
        return chunks

    def _split_sentences(self, text: str) -> List[str]:
        """Split text into sentences after '.', '!' or '?' plus whitespace."""
        return re.split(r'(?<=[.!?])\s+', text)

    def _pack_sentences(self, sentences: List[str],
                        max_length: int) -> List[List[str]]:
        """Greedily pack sentences into lists whose joined length fits.

        The budget counts the single space inserted between sentences. A
        sentence longer than ``max_length`` is kept whole in its own list.
        """
        groups: List[List[str]] = []
        current: List[str] = []
        current_len = 0

        for sentence in sentences:
            added = len(sentence) + (1 if current else 0)
            if current and current_len + added > max_length:
                groups.append(current)
                current, current_len = [sentence], len(sentence)
            else:
                current.append(sentence)
                current_len += added

        if current:
            groups.append(current)
        return groups

    def _group_sentences(self, sentences: List[str], max_length: int) -> List[str]:
        """Group sentences into space-joined strings of at most max_length.

        A single sentence longer than ``max_length`` is returned unsplit.
        """
        return [" ".join(group)
                for group in self._pack_sentences(sentences, max_length)]

Contextual Chunking with Headers

def hierarchical_chunks(self, text: str, doc_id: str) -> List[Chunk]:
    """Split markdown text by headers, prefixing each chunk with its header path.

    Level 1-3 markdown headers (``#``, ``##``, ``###``) delimit sections. Each
    non-empty section body becomes one chunk whose text starts with a
    ``[Context: A > B]`` line naming the headers it sits under.

    Args:
        text: Markdown source text.
        doc_id: Identifier used in chunk metadata and chunk ids.

    Returns:
        Chunks in document order. ``start_index`` / ``end_index`` locate the
        stripped section body in ``text``; they do not include the context
        prefix that is added to the chunk text.
    """
    chunks = []
    header_stack = []  # (level, title) pairs for the headers currently open
    offset = 0         # position of the current piece within ``text``

    # With one capturing group, re.split returns body text at even indices
    # and the matched header lines at odd indices; the pieces concatenate
    # back to ``text``. [ \t]+ keeps a header on a single line.
    sections = re.split(r'(^#{1,3}[ \t]+.+$)', text, flags=re.MULTILINE)

    for i, section in enumerate(sections):
        section_start = offset
        offset += len(section)

        if i % 2 == 1:
            title = section.lstrip('#')
            level = len(section) - len(title)

            # Close every header at the same or a deeper level, so siblings
            # replace each other even when a level was skipped.
            while header_stack and header_stack[-1][0] >= level:
                header_stack.pop()
            header_stack.append((level, title.strip()))
            continue

        body = section.strip()
        if not body:
            continue

        current_headers = [title for _, title in header_stack]
        header_context = " > ".join(current_headers)
        body_start = section_start + len(section) - len(section.lstrip())

        chunks.append(Chunk(
            text=f"[Context: {header_context}]\n\n{body}",
            metadata={
                "doc_id": doc_id,
                "headers": current_headers
            },
            start_index=body_start,
            end_index=body_start + len(body),
            chunk_id=f"{doc_id}_section_{len(chunks)}"
        ))

    return chunks

The best chunking strategy depends on your document types. Technical documentation benefits from header-aware chunking. Conversational content works well with semantic paragraph chunking. Experiment and measure retrieval quality to find the optimal approach.

## Takeaways

Add a concise, personal takeaway and recommended next steps here.

Michael John Peña

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.