
Reliable Data Extraction with LLMs: Patterns and Practices

Extracting structured data from unstructured text is one of the most valuable applications of LLMs. Let’s explore how to do it reliably in production.

The Extraction Challenge

Start with a schema, not a prompt. Pydantic models plus OpenAI's structured outputs let you declare exactly what you want back, and the SDK parses and validates the response against those types:

from openai import OpenAI
from pydantic import BaseModel, Field
from typing import List, Optional
from enum import Enum

client = OpenAI()

# Define what we want to extract
class ContactInfo(BaseModel):
    email: Optional[str] = None
    phone: Optional[str] = None
    address: Optional[str] = None

class Company(BaseModel):
    name: str
    industry: Optional[str] = None
    website: Optional[str] = None

class Person(BaseModel):
    name: str
    title: Optional[str] = None
    company: Optional[Company] = None
    contact: Optional[ContactInfo] = None

class ExtractionResult(BaseModel):
    people: List[Person]
    companies: List[Company]
    dates: List[str]
    monetary_values: List[str]
    # ge/le constraints aren't supported by strict structured outputs,
    # so the 0-1 range is conveyed via the description and the prompt
    confidence: float = Field(description="Confidence in the extraction, from 0 to 1")

def extract_entities(text: str) -> ExtractionResult:
    """Extract structured entities from text"""

    response = client.beta.chat.completions.parse(
        model="gpt-4o-2024-08-06",
        messages=[
            {
                "role": "system",
                "content": """
                Extract all entities from the text. Be thorough but only include
                information that is explicitly stated or strongly implied.
                Set confidence between 0 and 1 based on clarity of the source text.
                """
            },
            {
                "role": "user",
                "content": text
            }
        ],
        response_format=ExtractionResult
    )

    return response.choices[0].message.parsed
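
A minimal usage sketch (the sample text is invented for illustration):

text = """
Jane Doe, CTO of Acme Corp (https://acme.example), announced a
$2M partnership on 2024-03-15. Reach her at jane@acme.example.
"""

result = extract_entities(text)
for person in result.people:
    print(person.name, person.title)
print(f"Confidence: {result.confidence:.2f}")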

Multi-Pass Extraction

A single call is often enough for short, clean text, but longer or messier documents benefit from splitting the work into separate extract, validate, and gap-fill passes:

from typing import Dict, Any
import json

class MultiPassExtractor:
    """
    Use multiple passes for higher quality extraction
    """

    def __init__(self):
        self.client = OpenAI()

    def extract(self, text: str) -> Dict[str, Any]:
        # Pass 1: Initial extraction
        initial = self._initial_extraction(text)

        # Pass 2: Validate and enrich
        validated = self._validate_extraction(text, initial)

        # Pass 3: Fill gaps
        final = self._fill_gaps(text, validated)

        return final

    def _initial_extraction(self, text: str) -> dict:
        """First pass: broad extraction"""

        response = self.client.chat.completions.create(
            model="gpt-4o-2024-08-06",
            messages=[
                {
                    "role": "system",
                    "content": "Extract all entities, facts, and relationships from the text."
                },
                {"role": "user", "content": text}
            ],
            response_format={"type": "json_object"}
        )

        return json.loads(response.choices[0].message.content)

    def _validate_extraction(self, text: str, extraction: dict) -> dict:
        """Second pass: validate extracted data"""

        response = self.client.chat.completions.create(
            model="gpt-4o-2024-08-06",
            messages=[
                {
                    "role": "system",
                    "content": """
                    Review this extraction against the source text.
                    Mark each field as 'verified', 'uncertain', or 'incorrect'.
                    Fix any errors and add confidence scores.
                    """
                },
                {
                    "role": "user",
                    "content": f"Source: {text}\n\nExtraction: {json.dumps(extraction)}"
                }
            ],
            response_format={"type": "json_object"}
        )

        return json.loads(response.choices[0].message.content)

    def _fill_gaps(self, text: str, validated: dict) -> dict:
        """Third pass: fill in missing information"""

        response = self.client.chat.completions.create(
            model="gpt-4o-2024-08-06",
            messages=[
                {
                    "role": "system",
                    "content": """
                    Look for any information in the source that wasn't captured.
                    Add missing fields but don't hallucinate - only add what's there.
                    """
                },
                {
                    "role": "user",
                    "content": f"Source: {text}\n\nCurrent extraction: {json.dumps(validated)}"
                }
            ],
            response_format={"type": "json_object"}
        )

        return json.loads(response.choices[0].message.content)
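
Each pass is an extra model call, so this roughly triples cost and latency per document. A quick usage sketch, with document_text standing in for your source:

extractor = MultiPassExtractor()
extraction = extractor.extract(document_text)
print(json.dumps(extraction, indent=2))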

Domain-Specific Extraction

Generic entity extraction only goes so far. For documents like invoices and resumes, a schema that mirrors the domain, plus prompt rules for its invariants, gives the model far less room to drift:

# Invoice extraction
class LineItem(BaseModel):
    description: str
    quantity: float
    unit_price: float
    total: float

class Invoice(BaseModel):
    invoice_number: str
    date: str
    vendor: Company
    customer: Company
    line_items: List[LineItem]
    subtotal: float
    tax: float
    total: float
    payment_terms: Optional[str] = None
    due_date: Optional[str] = None

def extract_invoice(text: str) -> Invoice:
    """Extract invoice data with domain-specific handling"""

    response = client.beta.chat.completions.parse(
        model="gpt-4o-2024-08-06",
        messages=[
            {
                "role": "system",
                "content": """
                Extract invoice information. Ensure:
                - All monetary values are numbers (not strings)
                - Dates are in ISO format (YYYY-MM-DD)
                - Line item totals = quantity * unit_price
                - Subtotal = sum of line item totals
                - Total = subtotal + tax
                """
            },
            {"role": "user", "content": text}
        ],
        response_format=Invoice
    )

    return response.choices[0].message.parsed
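
Structured outputs guarantee the shape, not the arithmetic. Since the prompt's rules (line totals, subtotal, total) are deterministic, it's worth re-checking them in code after parsing; a minimal sketch (the helper name and tolerance are my own additions):

def check_invoice_math(invoice: Invoice, tolerance: float = 0.01) -> List[str]:
    """Return a list of arithmetic issues; empty means the numbers line up"""
    issues = []
    for item in invoice.line_items:
        if abs(item.quantity * item.unit_price - item.total) > tolerance:
            issues.append(f"Line '{item.description}': {item.quantity} x {item.unit_price} != {item.total}")
    if abs(sum(i.total for i in invoice.line_items) - invoice.subtotal) > tolerance:
        issues.append(f"Subtotal {invoice.subtotal} != sum of line item totals")
    if abs(invoice.subtotal + invoice.tax - invoice.total) > tolerance:
        issues.append(f"Total {invoice.total} != subtotal + tax")
    return issues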

# Resume extraction
class Education(BaseModel):
    institution: str
    degree: str
    field: str
    graduation_year: Optional[int] = None
    gpa: Optional[float] = None

class Experience(BaseModel):
    company: str
    title: str
    start_date: str
    end_date: Optional[str] = None
    description: str
    achievements: List[str]

class Resume(BaseModel):
    name: str
    contact: ContactInfo
    summary: Optional[str] = None
    experience: List[Experience]
    education: List[Education]
    skills: List[str]
    certifications: List[str] = []

def extract_resume(text: str) -> Resume:
    """Extract resume information"""

    response = client.beta.chat.completions.parse(
        model="gpt-4o-2024-08-06",
        messages=[
            {
                "role": "system",
                "content": """
                Extract resume information. Order experience by date (most recent first).
                Parse skills into individual items. Identify certifications specifically.
                """
            },
            {"role": "user", "content": text}
        ],
        response_format=Resume
    )

    return response.choices[0].message.parsed
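
The ordering instruction usually works, but it's a request, not a guarantee. If ordering matters downstream, enforce it after parsing; a sketch assuming start_date strings sort chronologically (e.g. ISO dates), with resume_text as a placeholder for your source:

resume = extract_resume(resume_text)

# Enforce "most recent first" in code rather than trusting the prompt
resume.experience.sort(key=lambda e: e.start_date, reverse=True)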

Handling Ambiguity

Source text is often ambiguous, and silently picking one interpretation hides risk. It's better to make the model surface uncertainty as data:

class AmbiguousValue(BaseModel):
    value: str
    alternatives: List[str] = []
    confidence: float
    context: str

class CertainValue(BaseModel):
    field: str
    value: str

class ExtractionWithUncertainty(BaseModel):
    # Strict structured outputs don't support open-ended Dict[str, Any]
    # fields, so certain values are modeled as explicit field/value pairs
    certain_fields: List[CertainValue]
    uncertain_fields: List[AmbiguousValue]
    missing_fields: List[str]
    extraction_notes: str

def extract_with_uncertainty(text: str, schema_description: str) -> ExtractionWithUncertainty:
    """
    Extract data while explicitly tracking uncertainty
    """

    response = client.beta.chat.completions.parse(
        model="gpt-4o-2024-08-06",
        messages=[
            {
                "role": "system",
                "content": f"""
                Extract information according to this schema: {schema_description}

                Be explicit about uncertainty:
                - Put clear, unambiguous values in 'certain_fields'
                - Put ambiguous values in 'uncertain_fields' with alternatives
                - List fields you couldn't find in 'missing_fields'
                - Add notes about extraction challenges
                """
            },
            {"role": "user", "content": text}
        ],
        response_format=ExtractionWithUncertainty
    )

    return response.choices[0].message.parsed

# Usage
result = extract_with_uncertainty(
    "John (or maybe Jon) Smith, Senior Dev at Acme (might be ACME Corp)",
    "Person with name, title, company"
)

# Access uncertain data explicitly
for uncertain in result.uncertain_fields:
    print(f"{uncertain.value} (confidence: {uncertain.confidence})")
    print(f"  Alternatives: {uncertain.alternatives}")

Validation and Correction

Extraction is only half the job. Pair model-based hallucination checks with deterministic consistency rules, then feed any issues back into a corrective re-extraction:

from typing import Tuple

class ExtractionValidator:
    """Validate and correct extracted data"""

    def __init__(self):
        self.client = OpenAI()

    def validate_and_correct(self, extraction: BaseModel,
                            source_text: str) -> Tuple[BaseModel, List[str]]:
        """
        Validate extraction against source and return corrections
        """
        corrections = []

        # Convert to dict for analysis
        data = extraction.model_dump()

        # Check for hallucinations
        hallucination_check = self._check_hallucinations(data, source_text)
        if hallucination_check:
            corrections.extend(hallucination_check)

        # Check internal consistency
        consistency_check = self._check_consistency(data)
        if consistency_check:
            corrections.extend(consistency_check)

        # Apply corrections
        if corrections:
            corrected = self._apply_corrections(extraction, corrections)
            return corrected, corrections

        return extraction, []

    def _check_hallucinations(self, data: dict, source: str) -> List[str]:
        """Check if extracted data is supported by source"""

        response = self.client.chat.completions.create(
            model="gpt-4o-2024-08-06",
            messages=[
                {
                    "role": "system",
                    "content": """
                    Check if each value in the extraction is supported by the source.
                    Return a list of values that appear to be hallucinated.
                    Format: ["field_name: issue description"]
                    """
                },
                {
                    "role": "user",
                    "content": f"Source: {source}\n\nExtraction: {json.dumps(data)}"
                }
            ],
            response_format={"type": "json_object"}
        )

        result = json.loads(response.choices[0].message.content)
        return result.get("hallucinations", [])

    def _check_consistency(self, data: dict) -> List[str]:
        """Check internal consistency of extracted data"""
        issues = []

        # Example: Check if totals add up
        if 'line_items' in data and 'subtotal' in data:
            calculated = sum(item.get('total', 0) for item in data['line_items'])
            if abs(calculated - data['subtotal']) > 0.01:
                issues.append(f"subtotal: Calculated {calculated}, extracted {data['subtotal']}")

        return issues

    def _apply_corrections(self, extraction: BaseModel,
                          corrections: List[str]) -> BaseModel:
        """Apply corrections to extraction"""
        # Re-extract with corrections guidance
        response = self.client.beta.chat.completions.parse(
            model="gpt-4o-2024-08-06",
            messages=[
                {
                    "role": "system",
                    "content": f"""
                    Fix these issues in the extraction:
                    {chr(10).join(corrections)}
                    """
                },
                {
                    "role": "user",
                    "content": extraction.model_dump_json()
                }
            ],
            response_format=type(extraction)
        )

        return response.choices[0].message.parsed
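
Putting the pieces together, a sketch with invoice_text standing in for your source document:

validator = ExtractionValidator()

invoice = extract_invoice(invoice_text)
corrected, corrections = validator.validate_and_correct(invoice, invoice_text)

if corrections:
    print("Corrections applied:")
    for correction in corrections:
        print(f"  - {correction}")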

Reliable extraction requires careful schema design, multi-pass validation, and explicit handling of uncertainty. These patterns help you build extraction systems you can trust in production.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.