Document Understanding with AI: From Images to Insights
Modern document understanding combines OCR, layout analysis, and LLMs for intelligent document processing.
Azure Document Intelligence + GPT-4
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from openai import AzureOpenAI

credential = AzureKeyCredential("...")
doc_client = DocumentIntelligenceClient(endpoint="...", credential=credential)
openai_client = AzureOpenAI(...)

def process_document(document_path: str, query: str) -> dict:
    """Process document with Document Intelligence and GPT-4."""
    # Extract with Document Intelligence (general layout model)
    with open(document_path, "rb") as f:
        poller = doc_client.begin_analyze_document("prebuilt-layout", f)
    result = poller.result()

    # Structure extracted content; paragraphs are reported on the result,
    # not on individual pages
    extracted_text = "\n".join(
        para.content for para in (result.paragraphs or [])
    )

    # Analyze with GPT-4
    response = openai_client.chat.completions.create(
        model="gpt-4-turbo",
        messages=[
            {"role": "system", "content": "You analyze documents and answer questions."},
            {"role": "user", "content": f"Document:\n{extracted_text}\n\nQuestion: {query}"},
        ],
    )

    return {
        "extracted_text": extracted_text,
        "answer": response.choices[0].message.content,
        # Row/column counts only; full cell extraction is covered below
        "tables": [
            {"rows": t.row_count, "columns": t.column_count}
            for t in (result.tables or [])
        ],
    }
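A minimal usage sketch, assuming the clients above are configured with real endpoints and keys; the file name and question are purely illustrative:

answers = process_document("invoice.pdf", "What is the total amount due?")
print(answers["answer"])
print(f"Tables found: {len(answers['tables'])}")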
Layout-Aware Processing
def process_with_layout(document_path: str) -> dict:
    """Process document preserving layout information."""
    with open(document_path, "rb") as f:
        poller = doc_client.begin_analyze_document("prebuilt-layout", f)
    result = poller.result()

    structured_content = {
        "pages": [],
        "tables": [],
        "key_value_pairs": []
    }

    for page in result.pages:
        page_content = {
            "page_number": page.page_number,
            "lines": []
        }
        for line in page.lines or []:
            page_content["lines"].append({
                "text": line.content,
                # Bounding polygon of the line, in page coordinates
                "bounding_box": line.polygon
            })
        structured_content["pages"].append(page_content)

    # Table summaries; full cell extraction is shown in the next section
    for table in result.tables or []:
        structured_content["tables"].append({
            "rows": table.row_count,
            "columns": table.column_count
        })

    # Key-value pairs are only present when that add-on feature is requested
    for kv in result.key_value_pairs or []:
        structured_content["key_value_pairs"].append({
            "key": kv.key.content,
            "value": kv.value.content if kv.value else None
        })

    return structured_content
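The bounding polygons make it possible to keep spatial context when handing text to an LLM. Below is a hedged sketch that sorts each page's lines top-to-bottom and prefixes them with page markers; it assumes each polygon is a flat list of coordinates [x1, y1, x2, y2, ...], so adjust the sort key if your coordinate format differs:

def layout_to_prompt_text(structured_content: dict) -> str:
    """Render layout output as page-tagged text for an LLM prompt."""
    parts = []
    for page in structured_content["pages"]:
        parts.append(f"--- Page {page['page_number']} ---")
        # Sort lines by the top y-coordinate (second polygon value)
        lines = sorted(
            page["lines"],
            key=lambda l: l["bounding_box"][1] if l["bounding_box"] else 0,
        )
        parts.extend(line["text"] for line in lines)
    return "\n".join(parts)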
Table Extraction
def extract_tables(document_path: str) -> list[dict]:
    """Extract tables from a document."""
    with open(document_path, "rb") as f:
        poller = doc_client.begin_analyze_document("prebuilt-layout", f)
    result = poller.result()

    tables = []
    for table in result.tables or []:
        table_data = {
            "rows": table.row_count,
            "columns": table.column_count,
            "cells": []
        }
        for cell in table.cells:
            table_data["cells"].append({
                "row": cell.row_index,
                "column": cell.column_index,
                "content": cell.content
            })
        tables.append(table_data)
    return tables
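Extracted cells are easier for an LLM to reason over as a Markdown grid than as a raw cell list. A minimal conversion sketch, assuming the dictionaries produced by extract_tables above and treating the first row as a header:

def table_to_markdown(table_data: dict) -> str:
    """Render one extracted table as a Markdown grid for an LLM prompt."""
    rows, cols = table_data["rows"], table_data["columns"]
    grid = [["" for _ in range(cols)] for _ in range(rows)]
    for cell in table_data["cells"]:
        grid[cell["row"]][cell["column"]] = cell["content"].replace("\n", " ")

    lines = ["| " + " | ".join(grid[0]) + " |",
             "| " + " | ".join("---" for _ in range(cols)) + " |"]
    for row in grid[1:]:
        lines.append("| " + " | ".join(row) + " |")
    return "\n".join(lines)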
Best Practices
- Choose the right model - use prebuilt-invoice or prebuilt-receipt for those document types and prebuilt-layout for general documents
- Handle multi-page documents - iterate over result.pages and keep page numbers so answers can point back to their source
- Preserve layout - use bounding polygons for spatial understanding, as in the layout example above
- Combine with an LLM - use GPT-4 to reason over the extracted content, as in the first example
- Validate extractions - check per-field confidence scores before trusting values (see the sketch after this list)
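As a sketch of that last point, the helper below flags low-confidence fields from a prebuilt-invoice analysis so they can be routed to human review; the 0.8 threshold is an illustrative assumption, not a recommended value:

def low_confidence_fields(result, threshold: float = 0.8) -> list[str]:
    """Return names of extracted fields whose confidence falls below the threshold."""
    flagged = []
    for document in result.documents or []:
        for name, field in (document.fields or {}).items():
            if field.confidence is not None and field.confidence < threshold:
                flagged.append(f"{name} ({field.confidence:.2f})")
    return flagged

# Example: route flagged documents to human review
# poller = doc_client.begin_analyze_document("prebuilt-invoice", f)
# flagged = low_confidence_fields(poller.result())
# if flagged:
#     print("Needs review:", flagged)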
Conclusion
Document understanding pipelines combine specialized extraction with LLM reasoning for powerful document automation.