1 min read
Reliable Data Extraction with LLMs: Patterns and Practices
I wrote “Reliable Data Extraction with LLMs: Patterns and Practices” to share practical, production-minded guidance on this topic.
The Extraction Challenge
from openai import OpenAI
from pydantic import BaseModel, Field
from typing import List, Optional
from enum import Enum
# Module-level client used by the extract_* functions in this file.
# NOTE(review): constructed with no arguments, so credentials/configuration
# presumably come from the library's defaults (e.g. environment) — confirm.
client = OpenAI()
# Define what we want to extract
# Contact details attached to a person or resume.
# Every field is optional and defaults to None when the text does not give it.
class ContactInfo(BaseModel):
    email: Optional[str] = None
    phone: Optional[str] = None
    address: Optional[str] = None
# An organisation mentioned in the text.
# Only the name is required; industry and website default to None.
class Company(BaseModel):
    name: str

    industry: Optional[str] = None
    website: Optional[str] = None
# A person mentioned in the text.
# Only the name is required; title, employer and contact details are
# optional nested values that default to None.
class Person(BaseModel):
    name: str

    title: Optional[str] = None
    company: Optional[Company] = None
    contact: Optional[ContactInfo] = None
# Top-level result of a general entity extraction.
class ExtractionResult(BaseModel):
    # Structured entities.
    people: List[Person]
    companies: List[Company]

    # Dates and monetary values are kept as plain strings.
    dates: List[str]
    monetary_values: List[str]

    # Required score, constrained to the closed interval [0, 1].
    confidence: float = Field(ge=0, le=1)
def extract_entities(text: str) -> ExtractionResult:
    """Extract structured entities from text.

    Sends ``text`` to the chat-completions ``parse`` helper with
    ``ExtractionResult`` as the response format and returns the parsed
    result.

    Args:
        text: Source text to extract entities from.

    Returns:
        The parsed ``ExtractionResult``.

    Raises:
        ValueError: If the response carries no parsed result (for example
            because the model refused the request).
    """
    # Same prompt text as before, spelled as adjacent literals so the code
    # can be indented without adding whitespace to the prompt.
    system_prompt = (
        "\n"
        "Extract all entities from the text. Be thorough but only include\n"
        "information that is explicitly stated or strongly implied.\n"
        "Set confidence based on clarity of the source text.\n"
    )
    response = client.beta.chat.completions.parse(
        model="gpt-4o-2024-08-06",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text},
        ],
        response_format=ExtractionResult,
    )
    message = response.choices[0].message
    if message.parsed is None:
        # Fail loudly here instead of handing callers a None that breaks
        # later on the first attribute access.
        refusal = getattr(message, "refusal", None)
        raise ValueError(
            f"Entity extraction returned no parsed result: "
            f"{refusal or 'no refusal message provided'}"
        )
    return message.parsed
Multi-Pass Extraction
from typing import Dict, Any
import json
class MultiPassExtractor:
    """Use multiple passes for higher quality extraction.

    The three passes (extract, validate, fill gaps) are each one chat
    completion in JSON mode. The decoded object from one pass is passed to
    the next together with the original source text.
    """

    # JSON mode requires the conversation itself to ask for JSON; without
    # this instruction the request is rejected unless the source text
    # happens to mention JSON. It is appended to every system prompt.
    _JSON_INSTRUCTION = "Respond with a single valid JSON object."

    def __init__(self):
        """Create the OpenAI client shared by all passes."""
        self.client = OpenAI()

    def extract(self, text: str) -> Dict[str, Any]:
        """Run all three passes over ``text`` and return the final object.

        Args:
            text: Source text to extract from.

        Returns:
            The JSON object produced by the gap-filling pass.

        Raises:
            ValueError: If any pass returns no content or content that is
                not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
        """
        # Pass 1: Initial extraction
        initial = self._initial_extraction(text)
        # Pass 2: Validate and enrich
        validated = self._validate_extraction(text, initial)
        # Pass 3: Fill gaps
        return self._fill_gaps(text, validated)

    def _initial_extraction(self, text: str) -> dict:
        """First pass: broad extraction from the raw text."""
        return self._complete_json(
            "Extract all entities, facts, and relationships from the text.",
            text,
        )

    def _validate_extraction(self, text: str, extraction: dict) -> dict:
        """Second pass: have the model review ``extraction`` against ``text``."""
        system_prompt = (
            "\n"
            "Review this extraction against the source text.\n"
            "Mark each field as 'verified', 'uncertain', or 'incorrect'.\n"
            "Fix any errors and add confidence scores.\n"
        )
        return self._complete_json(
            system_prompt,
            f"Source: {text}\n\nExtraction: {json.dumps(extraction)}",
        )

    def _fill_gaps(self, text: str, validated: dict) -> dict:
        """Third pass: ask the model to add information missed so far."""
        system_prompt = (
            "\n"
            "Look for any information in the source that wasn't captured.\n"
            "Add missing fields but don't hallucinate - only add what's there.\n"
        )
        return self._complete_json(
            system_prompt,
            f"Source: {text}\n\nCurrent extraction: {json.dumps(validated)}",
        )

    @classmethod
    def _build_messages(cls, system_prompt: str, user_content: str) -> List[Dict[str, str]]:
        """Build the system/user message pair for one JSON-mode request."""
        return [
            {
                "role": "system",
                "content": f"{system_prompt.rstrip()}\n{cls._JSON_INSTRUCTION}",
            },
            {"role": "user", "content": user_content},
        ]

    def _complete_json(self, system_prompt: str, user_content: str) -> dict:
        """Send one JSON-mode chat completion and decode its reply.

        Raises:
            ValueError: If the reply has no content or is not valid JSON.
        """
        response = self.client.chat.completions.create(
            model="gpt-4o-2024-08-06",
            messages=self._build_messages(system_prompt, user_content),
            response_format={"type": "json_object"},
        )
        content = response.choices[0].message.content
        if content is None:
            # json.loads(None) would raise a TypeError; raise the same
            # exception family as a malformed reply instead.
            raise ValueError("Model returned no content to decode as JSON")
        return json.loads(content)
Domain-Specific Extraction
# Invoice extraction
# One row of an invoice. All four fields are required.
class LineItem(BaseModel):
    description: str

    # Numeric columns of the row.
    quantity: float
    unit_price: float
    total: float
# Structured representation of a single invoice.
class Invoice(BaseModel):
    # Identification; the date is stored as a string.
    invoice_number: str
    date: str

    # Parties on the invoice.
    vendor: Company
    customer: Company

    # Itemised rows and the amounts derived from them.
    line_items: List[LineItem]
    subtotal: float
    tax: float
    total: float

    # Optional payment details; None when not present.
    payment_terms: Optional[str] = None
    due_date: Optional[str] = None
def extract_invoice(text: str) -> Invoice:
    """Extract invoice data with domain-specific handling.

    The system prompt asks the model for numeric monetary values, ISO
    dates and arithmetically consistent totals. Those are instructions to
    the model only; this function does not re-check them.

    Args:
        text: Raw invoice text.

    Returns:
        The parsed ``Invoice``.

    Raises:
        ValueError: If the response carries no parsed result (for example
            because the model refused the request).
    """
    # Same prompt text as before, spelled as adjacent literals so the code
    # can be indented without adding whitespace to the prompt.
    system_prompt = (
        "\n"
        "Extract invoice information. Ensure:\n"
        "- All monetary values are numbers (not strings)\n"
        "- Dates are in ISO format (YYYY-MM-DD)\n"
        "- Line item totals = quantity * unit_price\n"
        "- Subtotal = sum of line item totals\n"
        "- Total = subtotal + tax\n"
    )
    response = client.beta.chat.completions.parse(
        model="gpt-4o-2024-08-06",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text},
        ],
        response_format=Invoice,
    )
    message = response.choices[0].message
    if message.parsed is None:
        # Fail loudly here instead of handing callers a None that breaks
        # later on the first attribute access.
        refusal = getattr(message, "refusal", None)
        raise ValueError(
            f"Invoice extraction returned no parsed result: "
            f"{refusal or 'no refusal message provided'}"
        )
    return message.parsed
# Resume extraction
# One education entry on a resume.
class Education(BaseModel):
    # Required descriptive fields.
    institution: str
    degree: str
    field: str

    # Optional details; None when the resume does not state them.
    graduation_year: Optional[int] = None
    gpa: Optional[float] = None
# One work-experience entry on a resume.
class Experience(BaseModel):
    company: str
    title: str

    # Dates are kept as strings; end_date defaults to None.
    start_date: str
    end_date: Optional[str] = None

    description: str
    achievements: List[str]
# Structured representation of a resume.
class Resume(BaseModel):
    # Candidate identity.
    name: str
    contact: ContactInfo
    summary: Optional[str] = None

    # History sections.
    experience: List[Experience]
    education: List[Education]

    # Skills are required; certifications default to an empty list.
    skills: List[str]
    certifications: List[str] = []
def extract_resume(text: str) -> Resume:
    """Extract resume information.

    Args:
        text: Raw resume text.

    Returns:
        The parsed ``Resume``.

    Raises:
        ValueError: If the response carries no parsed result (for example
            because the model refused the request).
    """
    # Same prompt text as before, spelled as adjacent literals so the code
    # can be indented without adding whitespace to the prompt.
    system_prompt = (
        "\n"
        "Extract resume information. Order experience by date (most recent first).\n"
        "Parse skills into individual items. Identify certifications specifically.\n"
    )
    response = client.beta.chat.completions.parse(
        model="gpt-4o-2024-08-06",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text},
        ],
        response_format=Resume,
    )
    message = response.choices[0].message
    if message.parsed is None:
        # Fail loudly here instead of handing callers a None that breaks
        # later on the first attribute access.
        refusal = getattr(message, "refusal", None)
        raise ValueError(
            f"Resume extraction returned no parsed result: "
            f"{refusal or 'no refusal message provided'}"
        )
    return message.parsed
Handling Ambiguity
# A single extracted value the model was not sure about.
class AmbiguousValue(BaseModel):
    # Best-guess value and the other candidate readings (empty by default).
    value: str
    alternatives: List[str] = []

    # Required score (no range constraint is declared) and the context
    # string for the value.
    confidence: float
    context: str
# Extraction result that separates certain, uncertain and missing fields.
class ExtractionWithUncertainty(BaseModel):
    # NOTE(review): a free-form Dict[str, Any] may not be representable
    # under strict structured-output schemas (objects need a closed set of
    # properties) — confirm this field is actually populated in practice.
    certain_fields: Dict[str, Any]

    uncertain_fields: List[AmbiguousValue]
    missing_fields: List[str]

    # Free-text notes from the model about the extraction.
    extraction_notes: str
def extract_with_uncertainty(text: str, schema_description: str) -> ExtractionWithUncertainty:
    """Extract data while explicitly tracking uncertainty.

    Args:
        text: Source text to extract from.
        schema_description: Free-text description of the fields to look
            for; it is interpolated into the system prompt.

    Returns:
        The parsed ``ExtractionWithUncertainty``.

    Raises:
        ValueError: If the response carries no parsed result (for example
            because the model refused the request).
    """
    # Same prompt text as before, spelled as adjacent literals so the code
    # can be indented without adding whitespace to the prompt.
    system_prompt = (
        "\n"
        f"Extract information according to this schema: {schema_description}\n"
        "Be explicit about uncertainty:\n"
        "- Put clear, unambiguous values in 'certain_fields'\n"
        "- Put ambiguous values in 'uncertain_fields' with alternatives\n"
        "- List fields you couldn't find in 'missing_fields'\n"
        "- Add notes about extraction challenges\n"
    )
    response = client.beta.chat.completions.parse(
        model="gpt-4o-2024-08-06",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text},
        ],
        response_format=ExtractionWithUncertainty,
    )
    message = response.choices[0].message
    if message.parsed is None:
        # Fail loudly here instead of handing callers a None that breaks
        # later on the first attribute access.
        refusal = getattr(message, "refusal", None)
        raise ValueError(
            f"Uncertainty-aware extraction returned no parsed result: "
            f"{refusal or 'no refusal message provided'}"
        )
    return message.parsed
# Usage: runs a live extraction at import time and prints the uncertain values.
result = extract_with_uncertainty(
    text="John (or maybe Jon) Smith, Senior Dev at Acme (might be ACME Corp)",
    schema_description="Person with name, title, company",
)

# Access uncertain data explicitly: one line for the value and its
# confidence, one line for the alternatives.
for uncertain in result.uncertain_fields:
    print(f"{uncertain.value} (confidence: {uncertain.confidence})")
    print(f" Alternatives: {uncertain.alternatives}")
Validation and Correction
from typing import Tuple
class ExtractionValidator:
    """Validate and correct extracted data.

    Validation combines an LLM check of the extraction against its source
    text with a local arithmetic consistency check. Any reported issues
    are sent back to the model to produce a corrected extraction.
    """

    def __init__(self):
        """Create the OpenAI client used for the LLM-backed checks."""
        self.client = OpenAI()

    def validate_and_correct(self, extraction: BaseModel,
                             source_text: str) -> Tuple[BaseModel, List[str]]:
        """Validate extraction against source and return corrections.

        Args:
            extraction: The model instance to validate.
            source_text: The text the extraction was produced from.

        Returns:
            ``(extraction, [])`` when no issue is found, otherwise
            ``(corrected_extraction, issues)``.

        Raises:
            ValueError: If a model reply is empty, is not valid JSON, or
                carries no parsed correction.
        """
        # Convert to dict for analysis
        data = extraction.model_dump()

        # Hallucination issues come first, then internal-consistency issues.
        corrections = [
            *self._check_hallucinations(data, source_text),
            *self._check_consistency(data),
        ]
        if not corrections:
            return extraction, []

        corrected = self._apply_corrections(extraction, corrections, source_text)
        return corrected, corrections

    def _check_hallucinations(self, data: dict, source: str) -> List[str]:
        """Check if extracted data is supported by source.

        Returns:
            One ``"field_name: issue description"`` string per value the
            model flags as unsupported; empty when nothing is flagged.

        Raises:
            ValueError: If the reply is empty or not valid JSON.
        """
        # JSON mode returns an object, so the prompt must name the key this
        # method reads and must explicitly ask for JSON.
        system_prompt = (
            "\n"
            "Check if each value in the extraction is supported by the source.\n"
            "Return a JSON object with a single key \"hallucinations\" whose value is\n"
            "a list of the values that appear to be hallucinated (empty if none).\n"
            "Format: {\"hallucinations\": [\"field_name: issue description\"]}\n"
        )
        response = self.client.chat.completions.create(
            model="gpt-4o-2024-08-06",
            messages=[
                {"role": "system", "content": system_prompt},
                {
                    "role": "user",
                    "content": f"Source: {source}\n\nExtraction: {json.dumps(data)}",
                },
            ],
            response_format={"type": "json_object"},
        )
        content = response.choices[0].message.content
        if content is None:
            # Treating an empty reply as "no hallucinations" would hide a failure.
            raise ValueError("Hallucination check returned no content")
        result = json.loads(content)
        found = result.get("hallucinations") if isinstance(result, dict) else None
        return self._normalize_issues(found)

    @staticmethod
    def _normalize_issues(raw: object) -> List[str]:
        """Coerce a model-supplied issue payload into a list of strings."""
        if raw is None:
            return []
        if isinstance(raw, str):
            return [raw] if raw else []
        if isinstance(raw, (list, tuple)):
            # Later steps join these with newlines, so every item must be a str.
            return [item if isinstance(item, str) else str(item) for item in raw]
        return [str(raw)]

    def _check_consistency(self, data: dict) -> List[str]:
        """Check internal consistency of extracted data.

        Currently verifies that the line-item totals add up to the
        subtotal (within 0.01). Data without both values is skipped.
        """
        issues: List[str] = []
        line_items = data.get('line_items')
        subtotal = data.get('subtotal')
        # A missing or None value means there is nothing to compare.
        if line_items is None or subtotal is None:
            return issues

        # A line item without a total contributes nothing to the sum.
        calculated = sum((item.get('total') or 0) for item in line_items)
        if abs(calculated - subtotal) > 0.01:
            issues.append(f"subtotal: Calculated {calculated}, extracted {subtotal}")
        return issues

    def _apply_corrections(self, extraction: BaseModel,
                           corrections: List[str],
                           source_text: Optional[str] = None) -> BaseModel:
        """Apply corrections to extraction.

        Args:
            extraction: The model instance to correct.
            corrections: Issue descriptions, one per line in the prompt.
            source_text: Optional source text; when given it is included
                so the model can correct values against it.

        Returns:
            A new instance of ``type(extraction)`` parsed from the reply.

        Raises:
            ValueError: If the response carries no parsed result.
        """
        issue_list = "\n".join(corrections)
        user_content = extraction.model_dump_json()
        if source_text is not None:
            user_content = f"Source: {source_text}\n\nExtraction: {user_content}"

        # Re-extract with corrections guidance
        response = self.client.beta.chat.completions.parse(
            model="gpt-4o-2024-08-06",
            messages=[
                {
                    "role": "system",
                    "content": f"\nFix these issues in the extraction:\n{issue_list}\n",
                },
                {"role": "user", "content": user_content},
            ],
            response_format=type(extraction),
        )
        message = response.choices[0].message
        if message.parsed is None:
            refusal = getattr(message, "refusal", None)
            raise ValueError(
                f"Correction pass returned no parsed result: "
                f"{refusal or 'no refusal message provided'}"
            )
        return message.parsed
Reliable extraction requires careful schema design, multi-pass validation, and explicit handling of uncertainty. These patterns help you build extraction systems you can trust in production.

## Takeaways

Add a concise, personal takeaway and recommended next steps here.