7 min read
Document Summarization Patterns with GPT-4
GPT-4’s longer context and improved comprehension make it excellent for document summarization. Here are production-ready patterns for different summarization needs.
Summarization Approaches
Different documents need different approaches:
from enum import Enum
from dataclasses import dataclass
class SummaryType(Enum):
    """Summarization styles a caller can request."""

    # Pull key sentences
    EXTRACTIVE = "extractive"
    # Rewrite in new words
    ABSTRACTIVE = "abstractive"
    BULLET_POINTS = "bullet_points"
    # High-level for leadership
    EXECUTIVE = "executive"
    # Preserve technical details
    TECHNICAL = "technical"
    # Specific sections
    STRUCTURED = "structured"
@dataclass
class SummaryConfig:
    """Options controlling how a document is summarized.

    ``summary_type`` may be passed either as a ``SummaryType`` member or as
    its string value (for example ``"executive"``, as it would appear in a
    JSON/YAML config); string values are converted on construction.

    Raises:
        ValueError: If ``summary_type`` is not a ``SummaryType`` and does not
            match the value of any member.
    """

    # Which summarization style to produce.
    summary_type: SummaryType
    # Desired summary length, in words.
    target_length: int
    # Intended readership of the summary.
    audience: str
    # NOTE(review): the two flags below are not read by any code visible in
    # this file; confirm where they are consumed.
    preserve_quotes: bool = False
    include_metadata: bool = True

    def __post_init__(self):
        # Normalize plain values up front so later lookups keyed by
        # SummaryType members (and ``.value`` accesses) cannot fail late.
        if not isinstance(self.summary_type, SummaryType):
            self.summary_type = SummaryType(self.summary_type)
class DocumentSummarizer:
    """Multi-strategy document summarization.

    Documents of at most 25000 tokens are summarized with a single model
    call; larger documents are handed to ``MapReduceSummarizer``.
    """

    def __init__(self, client):
        """Store the chat client and create the token counter.

        Args:
            client: Object exposing an awaitable
                ``chat_completion(model=..., messages=...)`` whose result has
                a ``content`` attribute.
        """
        self.client = client
        self.counter = TokenCounter()

    async def summarize(
        self,
        document: str,
        config: SummaryConfig
    ) -> dict:
        """Summarize document based on configuration.

        Args:
            document: Full text to summarize.
            config: Summary style, target length and audience.

        Returns:
            A dict with at least ``"summary"``, ``"type"`` and ``"method"``
            keys.
        """
        doc_tokens = self.counter.count(document)
        # Select strategy based on document size
        if doc_tokens <= 25000:
            return await self._direct_summarize(document, config)
        return await self._chunked_summarize(document, config)

    async def _direct_summarize(
        self,
        document: str,
        config: SummaryConfig
    ) -> dict:
        """Direct summarization for documents that fit in context.

        Returns:
            ``{"summary": ..., "type": ..., "method": "direct"}``.
        """
        # Integer division yields 0 for short targets; never ask the model
        # for zero sentences / zero bullet points.
        sentence_count = max(1, config.target_length // 20)
        bullet_count = max(1, config.target_length // 15)

        prompts = {
            SummaryType.EXTRACTIVE: f"""Extract the {sentence_count} most important sentences from this document.
Present them in order of importance, maintaining original wording.""",
            SummaryType.ABSTRACTIVE: f"""Write a {config.target_length}-word summary of this document.
Rephrase in clear, concise language while preserving all key information.""",
            SummaryType.BULLET_POINTS: f"""Summarize this document in {bullet_count} bullet points.
Each point should be a complete, standalone insight.""",
            SummaryType.EXECUTIVE: f"""Write a {config.target_length}-word executive summary.
Focus on: key findings, business impact, and recommended actions.
Use language appropriate for {config.audience}.""",
            SummaryType.TECHNICAL: f"""Write a {config.target_length}-word technical summary.
Preserve: technical terms, numbers, specifications, and methodology details.""",
            SummaryType.STRUCTURED: """Summarize in this structure:
## Overview (2-3 sentences)
## Key Points (5-7 bullets)
## Details (brief elaboration on each point)
## Conclusion (1-2 sentences)"""
        }
        prompt = f"""{prompts[config.summary_type]}
Document:
{document}"""
        # Documents above 7000 tokens are sent to the 32k-context model.
        model = "gpt-4-32k" if self.counter.count(document) > 7000 else "gpt-4"
        response = await self.client.chat_completion(
            model=model,
            messages=[
                {"role": "system", "content": f"You are creating summaries for {config.audience}."},
                {"role": "user", "content": prompt}
            ]
        )
        return {
            "summary": response.content,
            "type": config.summary_type.value,
            "method": "direct"
        }

    async def _chunked_summarize(
        self,
        document: str,
        config: SummaryConfig
    ) -> dict:
        """Summarize a document that is too large for a single call.

        Delegates to ``MapReduceSummarizer`` using ``config.target_length``
        as the final length. The map-reduce prompts are generic, so the
        style-specific wording used by ``_direct_summarize`` and
        ``config.audience`` are not applied on this path.

        Returns:
            ``{"summary": ..., "type": ..., "method": "map_reduce",
            "chunks_processed": ...}``.
        """
        reducer = MapReduceSummarizer(self.client)
        result = await reducer.summarize(document, final_length=config.target_length)
        return {
            "summary": result["summary"],
            "type": config.summary_type.value,
            "method": result["method"],
            "chunks_processed": result["chunks_processed"]
        }
Map-Reduce Summarization
For documents exceeding context limits:
class MapReduceSummarizer:
    """Map-reduce pattern for large document summarization.

    The document is split into fixed-size groups of words, each group is
    summarized on its own (map), and the group summaries are merged into a
    single final summary (reduce). When more than ten chunk summaries are
    produced they are first merged in groups of three until at most five
    remain.
    """

    def __init__(self, client, chunk_size: int = 5000):
        """Store the client and chunking configuration.

        Args:
            client: Object exposing an awaitable
                ``chat_completion(model=..., messages=...)`` whose result has
                a ``content`` attribute.
            chunk_size: Number of whitespace-separated words per chunk.
                NOTE(review): this is a word count, not a token count;
                confirm the default fits the chunk model's context window.

        Raises:
            ValueError: If ``chunk_size`` is not positive.
        """
        # A zero step crashes range() and a negative one silently yields no
        # chunks, so reject both before any work is done.
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive number of words")
        self.client = client
        self.chunk_size = chunk_size
        self.counter = TokenCounter()

    async def summarize(
        self,
        document: str,
        final_length: int = 500
    ) -> dict:
        """Summarize large document using map-reduce.

        Args:
            document: Full text to summarize.
            final_length: Requested length of the final summary, in words.

        Returns:
            ``{"summary": ..., "chunks_processed": ..., "method": "map_reduce"}``.
            An empty or whitespace-only document yields an empty summary
            without calling the model.
        """
        # Split into chunks
        chunks = self._split_document(document)
        if not chunks:
            # Nothing to summarize; avoid asking the model to combine nothing.
            return {
                "summary": "",
                "chunks_processed": 0,
                "method": "map_reduce"
            }

        # Map phase: summarize each chunk, in document order
        total = len(chunks)
        chunk_summaries = [
            await self._summarize_chunk(chunk, number, total)
            for number, chunk in enumerate(chunks, start=1)
        ]

        # Reduce phase: combine summaries
        if len(chunk_summaries) > 10:
            # Recursive reduction for very large documents
            combined = await self._recursive_reduce(chunk_summaries, final_length)
        else:
            combined = await self._reduce_summaries(chunk_summaries, final_length)
        return {
            "summary": combined,
            "chunks_processed": total,
            "method": "map_reduce"
        }

    def _split_document(self, document: str) -> list[str]:
        """Split document into chunks of ``self.chunk_size`` words.

        Words are whitespace-separated; original whitespace is replaced by
        single spaces inside each chunk.
        """
        words = document.split()
        step = self.chunk_size
        return [
            ' '.join(words[start:start + step])
            for start in range(0, len(words), step)
        ]

    async def _summarize_chunk(
        self,
        chunk: str,
        chunk_num: int,
        total_chunks: int
    ) -> str:
        """Summarize a single chunk.

        Args:
            chunk: Text of the section.
            chunk_num: 1-based position of the section, used in the prompt.
            total_chunks: Total number of sections, used in the prompt.
        """
        prompt = f"""Summarize this document section ({chunk_num} of {total_chunks}).
Extract key information, main points, and important details.
Keep summary to ~200 words.
Section:
{chunk}"""
        response = await self.client.chat_completion(
            model="gpt-35-turbo",  # Use cheaper model for chunks
            messages=[{"role": "user", "content": prompt}]
        )
        return response.content

    async def _reduce_summaries(
        self,
        summaries: list[str],
        final_length: int
    ) -> str:
        """Combine chunk summaries into final summary.

        Args:
            summaries: Section summaries, in document order.
            final_length: Requested length of the combined summary, in words.
        """
        all_summaries = "\n\n---\n\n".join([
            f"Section {i+1}:\n{s}" for i, s in enumerate(summaries)
        ])
        prompt = f"""Combine these section summaries into a coherent {final_length}-word final summary.
Section Summaries:
{all_summaries}
Create a unified summary that:
1. Synthesizes information across all sections
2. Eliminates redundancy
3. Maintains logical flow
4. Preserves key details"""
        response = await self.client.chat_completion(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}]
        )
        return response.content

    async def _recursive_reduce(
        self,
        summaries: list[str],
        final_length: int
    ) -> str:
        """Recursively reduce many summaries.

        Merges the summaries in groups of three (300 words each) until at
        most five remain, then produces the final summary from those.
        """
        while len(summaries) > 5:
            # Combine groups of three
            merged = []
            for start in range(0, len(summaries), 3):
                group = summaries[start:start + 3]
                if len(group) == 1:
                    # A lone leftover has nothing to be combined with;
                    # carry it forward instead of spending a model call.
                    merged.append(group[0])
                else:
                    merged.append(await self._reduce_summaries(group, 300))
            summaries = merged
        return await self._reduce_summaries(summaries, final_length)
Hierarchical Summarization
For structured documents:
class HierarchicalSummarizer:
    """Summarize preserving document structure.

    The model is first asked for the document's outline; the text of every
    outlined section is then located in the document, summarized on its own,
    and the section summaries are condensed into an overview.
    """

    def __init__(self, client):
        """Store the chat client.

        Args:
            client: Object exposing an awaitable ``chat_completion(...)``
                whose result has a ``content`` attribute.
        """
        self.client = client

    async def summarize_hierarchical(
        self,
        document: str,
        depth: int = 2
    ) -> dict:
        """Create hierarchical summary preserving structure.

        Args:
            document: Full text to summarize.
            depth: Forwarded to ``_summarize_section``, which currently does
                not use it.

        Returns:
            ``{"overview": str, "sections": {title: {...}}, "structure": dict}``.
            Sections sharing a title overwrite one another in ``"sections"``.
        """
        # Extract document structure
        structure = await self._extract_structure(document)

        # Summarize at each level. The structure prompt does not ask the
        # model for section text, so it is recovered from the document.
        summaries = {}
        for title, content in self._section_contents(document, structure):
            summaries[title] = await self._summarize_section(title, content, depth)

        # Create overall summary
        overall = await self._create_overview(structure, summaries)
        return {
            "overview": overall,
            "sections": summaries,
            "structure": structure
        }

    @staticmethod
    def _section_contents(document: str, structure: dict) -> list[tuple[str, str]]:
        """Pair each section title with its text taken from ``document``.

        A section's text runs from where its ``"start"`` marker is found to
        where the next located section begins (or to the end of the
        document). A section that already carries a ``"content"`` entry uses
        it as is; a section whose marker cannot be found is skipped. When no
        section can be resolved, the whole document is returned as a single
        section so that a summary is still produced.
        """
        sections = [s for s in structure.get("sections", []) if isinstance(s, dict)]

        # First pass: locate every start marker, scanning forward only so
        # that offsets stay in document order.
        offsets = []
        cursor = 0
        for section in sections:
            # The prompt's example marker ends in "..." - drop such trailing
            # dots/ellipsis before searching.
            marker = str(section.get("start") or "").strip().rstrip(".\u2026").strip()
            position = document.find(marker, cursor) if marker else -1
            offsets.append(position)
            if position != -1:
                cursor = position + len(marker)

        # Second pass: slice between consecutive located offsets.
        located = []
        for index, section in enumerate(sections):
            title = str(section.get("title") or f"Section {index + 1}")
            if "content" in section:
                located.append((title, str(section["content"])))
                continue
            begin = offsets[index]
            if begin == -1:
                continue
            end = next(
                (pos for pos in offsets[index + 1:] if pos != -1),
                len(document)
            )
            located.append((title, document[begin:end].strip()))

        if not located:
            located.append((str(structure.get("title") or "Document"), document))
        return located

    async def _extract_structure(self, document: str) -> dict:
        """Extract document structure.

        Only the first 8000 characters of the document are sent to the model.

        Returns:
            The parsed JSON object, or ``{"title": "Document", "sections": []}``
            when the reply is not a JSON object with a ``"sections"`` list.
        """
        prompt = f"""Analyze the structure of this document.
Document:
{document[:8000]}
Return JSON:
{{
"title": "document title",
"sections": [
{{"title": "section title", "level": 1, "start": "first few words...", "topics": ["topic1"]}}
]
}}"""
        response = await self.client.chat_completion(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0
        )
        import json
        fallback = {"title": "Document", "sections": []}
        try:
            structure = json.loads(response.content)
        except (ValueError, TypeError):
            # Malformed JSON (JSONDecodeError is a ValueError) or a
            # non-string reply: degrade to an empty outline.
            return fallback
        if not isinstance(structure, dict) or not isinstance(structure.get("sections"), list):
            return fallback
        return structure

    async def _summarize_section(
        self,
        title: str,
        content: str,
        depth: int
    ) -> dict:
        """Summarize a section with optional subsections.

        Args:
            title: Section heading, included in the prompt.
            content: Section text to summarize.
            depth: Accepted for interface compatibility; not used.

        Returns:
            ``{"title": title, "summary": <model reply>}``.
        """
        prompt = f"""Summarize this section: {title}
Content:
{content}
Provide:
1. Main summary (2-3 sentences)
2. Key points (3-5 bullets)
3. Notable details (if any)"""
        response = await self.client.chat_completion(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}]
        )
        return {"title": title, "summary": response.content}

    async def _create_overview(self, structure: dict, summaries: dict) -> str:
        """Condense the section summaries into a document-level overview.

        Args:
            structure: Outline returned by ``_extract_structure``; only its
                ``"title"`` is used.
            summaries: Mapping of section title to the dict returned by
                ``_summarize_section``.

        Returns:
            The model's overview text, or ``""`` when there are no section
            summaries to build it from.
        """
        if not summaries:
            return ""
        doc_title = structure.get("title") or "Document"
        section_text = "\n\n".join(
            f"## {title}\n{info['summary']}" for title, info in summaries.items()
        )
        prompt = f"""Write an overview of the document titled: {doc_title}
Section Summaries:
{section_text}
Provide a 3-5 sentence overview that ties the sections together."""
        response = await self.client.chat_completion(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}]
        )
        return response.content
Comparative Summarization
Summarize multiple documents together:
class ComparativeSummarizer:
    """Summarize and compare multiple documents."""

    def __init__(self, client=None):
        """Store the chat client used for every model call.

        Args:
            client: Object exposing an awaitable
                ``chat_completion(model=..., messages=...)`` whose result has
                a ``content`` attribute. Optional so that argument-less
                construction keeps working; it must be set before any
                summarization method is awaited.
        """
        self.client = client

    async def compare_and_summarize(
        self,
        documents: list[dict],  # [{"title": "...", "content": "..."}]
        focus: str = None
    ) -> dict:
        """Create comparative summary of multiple documents.

        Args:
            documents: Dicts with ``"title"`` and ``"content"`` keys.
            focus: Optional aspect the comparison should concentrate on.

        Returns:
            ``{"individual_summaries": [{"title", "summary"}, ...],
            "comparison": {"analysis": str, "documents_compared": int}}``.
            With no documents, no model call is made and the analysis is
            an empty string.
        """
        if not documents:
            return {
                "individual_summaries": [],
                "comparison": {"analysis": "", "documents_compared": 0}
            }

        # Summarize each document
        summaries = [
            {
                "title": doc["title"],
                "summary": await self._summarize_for_comparison(doc["content"])
            }
            for doc in documents
        ]

        # Create comparison
        comparison = await self._create_comparison(summaries, focus)
        return {
            "individual_summaries": summaries,
            "comparison": comparison
        }

    async def _summarize_for_comparison(self, content: str) -> str:
        """Summarize one document in a form suited to side-by-side comparison."""
        prompt = f"""Summarize this document so it can be compared with other documents.
Capture: main thesis, key claims, supporting evidence, and conclusions.
Keep summary to ~200 words.
Document:
{content}"""
        response = await self.client.chat_completion(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}]
        )
        return response.content

    async def _create_comparison(
        self,
        summaries: list[dict],
        focus: str
    ) -> dict:
        """Create comparison across documents.

        Args:
            summaries: Dicts with ``"title"`` and ``"summary"`` keys.
            focus: Aspect to concentrate on; a falsy value adds no focus
                instruction to the prompt.

        Returns:
            ``{"analysis": <model reply>, "documents_compared": len(summaries)}``.
        """
        summaries_text = "\n\n".join([
            f"Document: {s['title']}\n{s['summary']}"
            for s in summaries
        ])
        focus_instruction = f"\nFocus on: {focus}" if focus else ""
        prompt = f"""Compare these document summaries.
{summaries_text}
{focus_instruction}
Provide:
1. Common themes across all documents
2. Key differences
3. Contradictions or conflicts
4. Unique insights from each
5. Synthesis - what can we conclude from all documents together?"""
        response = await self.client.chat_completion(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}]
        )
        return {
            "analysis": response.content,
            "documents_compared": len(summaries)
        }
Progressive Summarization
Generate summaries of the same document at several levels of detail:
class ProgressiveSummarizer:
    """Generate summaries at multiple detail levels."""

    def __init__(self, client=None):
        """Store the chat client used for every model call.

        Args:
            client: Object exposing an awaitable
                ``chat_completion(model=..., messages=...)`` whose result has
                a ``content`` attribute. Optional so that argument-less
                construction keeps working; it must be set before
                ``multi_level_summary`` is awaited.
        """
        self.client = client

    async def multi_level_summary(
        self,
        document: str
    ) -> dict:
        """Generate summaries at different detail levels.

        Only the first 25000 characters of ``document`` are sent to the
        model. One request is made per level, in the order listed below.

        Returns:
            Mapping of level name (``"one_line"``, ``"brief"``,
            ``"standard"``, ``"detailed"``, ``"comprehensive"``) to the
            model's reply for that level.
        """
        levels = {
            "one_line": "Summarize in exactly one sentence.",
            "brief": "Summarize in 50 words.",
            "standard": "Summarize in 150 words.",
            "detailed": "Summarize in 400 words, preserving key details.",
            "comprehensive": "Create a comprehensive 800-word summary with all important information."
        }
        # The excerpt is identical for every level; slice it once.
        excerpt = document[:25000]

        summaries = {}
        for level, instruction in levels.items():
            prompt = f"""{instruction}
Document:
{excerpt}"""
            response = await self.client.chat_completion(
                model="gpt-4",
                messages=[{"role": "user", "content": prompt}]
            )
            summaries[level] = response.content
        return summaries
Quality Metrics
class SummaryEvaluator:
    """Evaluate summary quality."""

    def __init__(self, client=None):
        """Store the chat client used for the evaluation call.

        Args:
            client: Object exposing an awaitable
                ``chat_completion(model=..., messages=..., temperature=...)``
                whose result has a ``content`` attribute. Optional so that
                argument-less construction keeps working; it must be set
                before ``evaluate`` is awaited.
        """
        self.client = client

    async def evaluate(
        self,
        document: str,
        summary: str
    ) -> dict:
        """Evaluate summary against original document.

        Only the first 5000 characters of ``document`` are shown to the
        model, so the ratings reflect that excerpt.

        Args:
            document: Original text the summary was produced from.
            summary: Summary to be rated.

        Returns:
            ``{"evaluation": <model reply>}``. The prompt requests JSON, but
            the reply is returned as raw text and is not parsed here.
        """
        prompt = f"""Evaluate this summary against the original document.
Original Document (excerpt):
{document[:5000]}
Summary:
{summary}
Rate (1-10) and explain:
1. Coverage: Does it cover main points?
2. Accuracy: Is the information correct?
3. Coherence: Is it well-organized and readable?
4. Conciseness: Is it appropriately condensed?
5. Faithfulness: Does it avoid adding information not in the original?
Return JSON with scores and explanations."""
        response = await self.client.chat_completion(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0  # deterministic scoring
        )
        return {"evaluation": response.content}
Effective summarization adapts to document type, size, and audience needs. These patterns provide a foundation for production-ready summarization systems.