
Recursive Summarization for Long Documents

When a document exceeds even GPT-4’s 32K context window, recursive summarization becomes essential. The pattern handles arbitrarily long documents through iterative compression: chunk, summarize, combine, and repeat until the result fits the target length.
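
The examples below assume an async chat-completion client (any wrapper returning an object with a .content attribute) and a TokenCounter helper. A minimal sketch of the counter, using tiktoken, might look like this; treat the default model name as an assumption:

import tiktoken

class TokenCounter:
    """Minimal token counter (a sketch; adjust the model to match yours)."""

    def __init__(self, model: str = "gpt-4"):
        self.encoding = tiktoken.encoding_for_model(model)

    def count(self, text: str) -> int:
        return len(self.encoding.encode(text))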

The Recursive Pattern

from dataclasses import dataclass
import asyncio

@dataclass
class RecursiveConfig:
    chunk_size: int = 4000           # words per chunk (rough token proxy)
    chunk_overlap: int = 200         # words of overlap between chunks
    summary_ratio: float = 0.2       # target compression ratio
    max_iterations: int = 5          # prevent runaway recursion
    min_final_tokens: int = 500      # stop once below this

class RecursiveSummarizer:
    """Recursively summarize documents of any length."""

    def __init__(self, client):
        self.client = client
        self.counter = TokenCounter()

    async def summarize(
        self,
        document: str,
        config: RecursiveConfig | None = None
    ) -> dict:
        """Recursively summarize until target length reached."""
        config = config or RecursiveConfig()

        current_text = document
        iteration = 0
        history = []

        while iteration < config.max_iterations:
            current_tokens = self.counter.count(current_text)

            # Check if we're done
            if current_tokens <= config.min_final_tokens:
                break

            # Small enough for a single pass (leaves headroom in an 8K context)
            if current_tokens <= 7000:
                summary = await self._single_pass_summary(current_text, config)
                history.append({
                    "iteration": iteration,
                    "input_tokens": current_tokens,
                    "output_tokens": self.counter.count(summary)
                })
                current_text = summary
                break

            # Need chunked processing
            summary = await self._chunked_summary(current_text, config)
            history.append({
                "iteration": iteration,
                "input_tokens": current_tokens,
                "output_tokens": self.counter.count(summary)
            })

            current_text = summary
            iteration += 1

        return {
            "summary": current_text,
            "iterations": len(history),
            "compression_history": history,
            "original_tokens": self.counter.count(document),
            "final_tokens": self.counter.count(current_text)
        }

    async def _single_pass_summary(
        self,
        text: str,
        config: RecursiveConfig
    ) -> str:
        """Single-pass summarization."""
        target_words = int(len(text.split()) * config.summary_ratio)

        prompt = f"""Summarize this text in approximately {target_words} words.
Preserve key information, main arguments, and important details.

Text:
{text}"""

        response = await self.client.chat_completion(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}]
        )

        return response.content

    async def _chunked_summary(
        self,
        text: str,
        config: RecursiveConfig
    ) -> str:
        """Chunk and summarize."""
        chunks = self._create_chunks(text, config)

        # Summarize chunks in parallel
        tasks = [
            self._summarize_chunk(chunk, i, len(chunks), config)
            for i, chunk in enumerate(chunks)
        ]
        summaries = await asyncio.gather(*tasks)

        # Combine summaries
        combined = "\n\n".join(summaries)
        return combined

    def _create_chunks(
        self,
        text: str,
        config: RecursiveConfig
    ) -> list[str]:
        """Create overlapping chunks."""
        words = text.split()
        chunks = []
        step = config.chunk_size - config.chunk_overlap

        for i in range(0, len(words), step):
            chunk = ' '.join(words[i:i + config.chunk_size])
            chunks.append(chunk)
            if i + config.chunk_size >= len(words):
                break

        return chunks

    async def _summarize_chunk(
        self,
        chunk: str,
        index: int,
        total: int,
        config: RecursiveConfig
    ) -> str:
        """Summarize a single chunk."""
        target_words = int(len(chunk.split()) * config.summary_ratio)

        prompt = f"""Summarize this section ({index + 1} of {total}) in ~{target_words} words.
Preserve important information and context for later combination.

Section:
{chunk}"""

        response = await self.client.chat_completion(
            model="gpt-35-turbo",  # Use cheaper model for chunks
            messages=[{"role": "user", "content": prompt}]
        )

        return response.content
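
Usage is straightforward; here client and long_document are placeholders for your own async client wrapper and input text:

async def main():
    summarizer = RecursiveSummarizer(client)
    result = await summarizer.summarize(
        long_document,
        config=RecursiveConfig(summary_ratio=0.15, min_final_tokens=400)
    )
    print(f"{result['original_tokens']} -> {result['final_tokens']} tokens "
          f"in {result['iterations']} iteration(s)")

asyncio.run(main())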

Tree-based Summarization

For better quality on very long documents:

class TreeSummarizer:
    """Tree-structured summarization for improved coherence."""

    def __init__(self, client, branch_factor: int = 4):
        self.client = client
        self.branch_factor = branch_factor
        self.counter = TokenCounter()

    async def summarize(self, document: str) -> dict:
        """Build summary tree from leaves to root."""

        # Create leaf summaries
        chunks = self._split_document(document)
        leaf_summaries = await self._summarize_leaves(chunks)

        # Build tree upward
        current_level = leaf_summaries
        levels = [current_level]

        while len(current_level) > 1:
            next_level = await self._summarize_level(current_level)
            levels.append(next_level)
            current_level = next_level

        return {
            "summary": current_level[0] if current_level else "",
            "tree_depth": len(levels),
            "leaf_count": len(leaf_summaries)
        }
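
    def _split_document(self, document: str, max_tokens: int = 3000) -> list[str]:
        """Split at paragraph boundaries (a simple sketch; max_tokens is an assumption)."""
        paragraphs = document.split('\n\n')
        chunks, current, current_tokens = [], [], 0
        for para in paragraphs:
            para_tokens = self.counter.count(para)
            # Start a new chunk once the current one would exceed the budget
            if current and current_tokens + para_tokens > max_tokens:
                chunks.append('\n\n'.join(current))
                current, current_tokens = [para], para_tokens
            else:
                current.append(para)
                current_tokens += para_tokens
        if current:
            chunks.append('\n\n'.join(current))
        return chunks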

    async def _summarize_leaves(self, chunks: list[str]) -> list[str]:
        """Summarize leaf chunks in parallel."""
        tasks = [self._summarize_leaf(chunk, i) for i, chunk in enumerate(chunks)]
        return await asyncio.gather(*tasks)

    async def _summarize_leaf(self, chunk: str, index: int) -> str:
        """Summarize a leaf chunk."""
        prompt = f"""Create a concise summary of this text section.
Capture: main points, key facts, important details.

Section {index + 1}:
{chunk}"""

        response = await self.client.chat_completion(
            model="gpt-35-turbo",
            messages=[{"role": "user", "content": prompt}]
        )
        return response.content

    async def _summarize_level(self, summaries: list[str]) -> list[str]:
        """Combine summaries into next level."""
        next_level = []

        for i in range(0, len(summaries), self.branch_factor):
            group = summaries[i:i + self.branch_factor]
            combined = await self._combine_summaries(group)
            next_level.append(combined)

        return next_level

    async def _combine_summaries(self, summaries: list[str]) -> str:
        """Combine multiple summaries into one."""
        combined_text = "\n\n---\n\n".join(summaries)

        prompt = f"""Synthesize these summaries into a unified, coherent summary.
Eliminate redundancy, preserve all unique information.

Summaries:
{combined_text}"""

        response = await self.client.chat_completion(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}]
        )
        return response.content
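
To see why this helps, consider the shape of the tree: with branch_factor = 4, 64 leaf summaries collapse to 16, then 4, then 1, so the root is only log₄(64) = 3 combine passes from the leaves. Every prompt at every level contains at most four sibling summaries, which keeps each call small and coherent no matter how long the original document is.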

Extractive-Abstractive Hybrid

Combine extraction with abstraction for better faithfulness:

class HybridSummarizer:
    """Hybrid extractive-abstractive summarization."""

    def __init__(self, client):
        self.client = client

    async def summarize(
        self,
        document: str,
        extract_ratio: float = 0.3,
        final_ratio: float = 0.1
    ) -> dict:
        """Two-phase summarization."""

        # Phase 1: Extract important sentences
        extracted = await self._extract_key_sentences(document, extract_ratio)

        # Phase 2: Abstractive rewrite
        abstract = await self._abstract_rewrite(extracted, document, final_ratio)

        return {
            "extracted_sentences": extracted,
            "final_summary": abstract,
            "method": "hybrid"
        }

    async def _extract_key_sentences(
        self,
        document: str,
        ratio: float
    ) -> str:
        """Extract key sentences."""
        # Rough sentence count: periods as a proxy for sentence boundaries
        target_sentences = max(5, int(document.count('.') * ratio))

        prompt = f"""Extract the {target_sentences} most important sentences from this document.
Return only the extracted sentences, maintaining original wording.
Select sentences that:
- Capture main ideas
- Contain key facts
- Represent different parts of the document

Document:
{document[:15000]}"""

        response = await self.client.chat_completion(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}]
        )
        return response.content

    async def _abstract_rewrite(
        self,
        extracted: str,
        original: str,
        ratio: float
    ) -> str:
        """Rewrite extracted content coherently."""
        target_words = int(len(original.split()) * ratio)

        prompt = f"""Rewrite these extracted sentences into a coherent {target_words}-word summary.
Improve flow and readability while preserving all information.

Extracted content:
{extracted}"""

        response = await self.client.chat_completion(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}]
        )
        return response.content
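
To make the ratios concrete: on a 10,000-word document, extract_ratio=0.3 targets roughly 30% of the sentences, and final_ratio=0.1 rewrites them into a summary of about 1,000 words. Because every claim in the rewrite should trace back to an extracted sentence, the result is easier to verify against the source than a purely abstractive summary.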

Quality Preservation Techniques

class QualityPreservingSummarizer:
    """Summarize with quality checks at each stage."""

    def __init__(self, client):
        self.client = client

    async def summarize_with_verification(
        self,
        document: str,
        target_length: int = 500
    ) -> dict:
        """Summarize with verification pass."""

        # Generate summary
        summary = await self._generate_summary(document, target_length)

        # Verify against original
        verification = await self._verify_summary(document, summary)

        # Correct if needed
        if not verification["faithful"]:
            summary = await self._correct_summary(document, summary, verification["issues"])

        return {
            "summary": summary,
            "verification": verification,
            "corrected": not verification["faithful"]
        }
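
    async def _generate_summary(
        self,
        document: str,
        target_length: int
    ) -> str:
        """Generation step (a sketch: one single-pass prompt; any summarizer above works here)."""
        prompt = f"""Summarize this document in approximately {target_length} words.
Preserve key facts, main arguments, and important details.

Document:
{document[:15000]}"""

        response = await self.client.chat_completion(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}]
        )
        return response.content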

    async def _verify_summary(
        self,
        document: str,
        summary: str
    ) -> dict:
        """Verify summary against document."""
        prompt = f"""Verify this summary against the original document.

Original (excerpt):
{document[:8000]}

Summary:
{summary}

Check:
1. Are all claims in the summary supported by the document?
2. Is any important information missing?
3. Are there any factual errors?

Return JSON:
{{"faithful": true/false, "issues": ["issue1", ...], "missing": ["missing info", ...]}}"""

        response = await self.client.chat_completion(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0
        )

        import json
        try:
            return json.loads(response.content)
        except json.JSONDecodeError:
            # Fail open: treat an unparseable verdict as faithful
            return {"faithful": True, "issues": [], "missing": []}

    async def _correct_summary(
        self,
        document: str,
        summary: str,
        issues: list[str]
    ) -> str:
        """Correct summary based on issues."""
        issues_str = "\n".join(f"- {issue}" for issue in issues)

        prompt = f"""Correct this summary to fix the identified issues.

Original document (excerpt):
{document[:6000]}

Current summary:
{summary}

Issues to fix:
{issues_str}

Provide corrected summary."""

        response = await self.client.chat_completion(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}]
        )
        return response.content

Performance Optimization

class OptimizedRecursiveSummarizer:
    """Optimized recursive summarization with caching and parallelism."""

    def __init__(self, client, cache=None):
        self.client = client
        self.cache = cache
        self.counter = TokenCounter()

    async def summarize(
        self,
        document: str,
        cache_key: str | None = None
    ) -> dict:
        """Summarize with caching and parallel processing."""

        # Check cache
        if self.cache and cache_key:
            cached = await self.cache.get(cache_key)
            if cached:
                return {"summary": cached, "cached": True}

        # Process
        chunks = self._smart_chunk(document)

        # Parallel summarization with rate limiting
        semaphore = asyncio.Semaphore(5)  # Max 5 concurrent requests

        async def bounded_summarize(chunk, index):
            async with semaphore:
                return await self._summarize_chunk(chunk, index, len(chunks))

        summaries = await asyncio.gather(*[
            bounded_summarize(chunk, i) for i, chunk in enumerate(chunks)
        ])

        # Combine
        final = await self._hierarchical_combine(summaries)

        # Cache result
        if self.cache and cache_key:
            await self.cache.set(cache_key, final)

        return {"summary": final, "cached": False}

    def _smart_chunk(self, document: str) -> list[str]:
        """Chunk at natural boundaries."""
        # Try to split at paragraph boundaries
        paragraphs = document.split('\n\n')

        chunks = []
        current_chunk = []
        current_tokens = 0
        max_tokens = 4000

        for para in paragraphs:
            para_tokens = self.counter.count(para)

            if current_tokens + para_tokens > max_tokens and current_chunk:
                chunks.append('\n\n'.join(current_chunk))
                current_chunk = [para]
                current_tokens = para_tokens
            else:
                current_chunk.append(para)
                current_tokens += para_tokens

        if current_chunk:
            chunks.append('\n\n'.join(current_chunk))

        return chunks
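
    async def _summarize_chunk(self, chunk: str, index: int, total: int) -> str:
        # Per-chunk summary (a sketch mirroring the earlier chunk prompt)
        prompt = (f"Summarize this section ({index + 1} of {total}) concisely, "
                  f"preserving key facts and context.\n\nSection:\n{chunk}")
        response = await self.client.chat_completion(
            model="gpt-35-turbo",
            messages=[{"role": "user", "content": prompt}]
        )
        return response.content

    async def _hierarchical_combine(
        self,
        summaries: list[str],
        group_size: int = 4
    ) -> str:
        # Tree-style combination: merge group_size summaries at a time until one remains
        level = summaries
        while len(level) > 1:
            groups = [level[i:i + group_size] for i in range(0, len(level), group_size)]
            level = await asyncio.gather(*[self._combine_group(g) for g in groups])
        return level[0] if level else ""

    async def _combine_group(self, group: list[str]) -> str:
        # Merge a handful of summaries into one (assumed helper)
        joined = "\n\n---\n\n".join(group)
        prompt = f"Synthesize these summaries into one coherent summary:\n\n{joined}"
        response = await self.client.chat_completion(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}]
        )
        return response.content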

Recursive summarization makes documents of any length processable: each pass trades length for density, and verification at each stage keeps the final summary faithful to the source.

Michael John Pena

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.