1 min read
Intelligent Document Processing with Azure Form Recognizer
I wrote “Intelligent Document Processing with Azure Form Recognizer” to share practical, production-minded guidance on this topic.
Form Recognizer Capabilities
- Pre-built models: Invoices, receipts, IDs, business cards
- Layout analysis: Tables, text, selection marks
- Custom models: Train on your specific documents
- Composed models: Combine multiple models
Setting Up
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
import os

# Read the endpoint and key from the environment (the same variable names the
# Azure Functions example in this article uses) so real credentials never have
# to be committed. The placeholder values only apply when the variables are
# unset, which keeps the snippet copy-paste compatible.
endpoint = os.environ.get(
    "FORM_RECOGNIZER_ENDPOINT",
    "https://your-form-recognizer.cognitiveservices.azure.com/",
)
key = os.environ.get("FORM_RECOGNIZER_KEY", "your-api-key")

# Module-level analysis client shared by the extraction helpers.
client = DocumentAnalysisClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(key),
)
Pre-built Invoice Model
from dataclasses import dataclass
from typing import Optional
from decimal import Decimal
@dataclass
class ExtractedInvoice:
    """Flat record holding the invoice fields this article extracts.

    Each optional field may be ``None`` when no value was available for it.
    """

    # Header information.
    vendor_name: Optional[str]
    invoice_number: Optional[str]

    # Dates are carried as strings.
    invoice_date: Optional[str]
    due_date: Optional[str]

    # Monetary totals.
    total_amount: Optional[Decimal]
    tax_amount: Optional[Decimal]

    # One entry per invoice line.
    line_items: list
def _field_value(fields, name):
    """Return ``fields[name].value``, or ``None`` when the field is absent."""
    field = fields.get(name)
    return field.value if field is not None else None


def _amount(value):
    """Return the numeric part of a monetary value.

    NOTE(review): the SDK documents currency fields as objects exposing an
    ``amount`` attribute rather than bare numbers - confirm against the
    installed version. Plain numbers and ``None`` pass through unchanged.
    """
    return getattr(value, "amount", value)


def _as_decimal(value) -> Optional[Decimal]:
    """Convert a monetary value to ``Decimal``, keeping ``None`` as ``None``."""
    amount = _amount(value)
    if amount is None or isinstance(amount, Decimal):
        return amount
    # Go through str() so a float such as 110.1 does not pick up binary noise.
    return Decimal(str(amount))


def _as_text(value) -> Optional[str]:
    """Stringify a value, keeping ``None`` as ``None`` (not the text "None")."""
    return None if value is None else str(value)


def extract_invoice(file_path: str) -> ExtractedInvoice:
    """Extract data from an invoice document.

    Args:
        file_path: Path of the invoice file to send to the
            ``prebuilt-invoice`` model.

    Returns:
        The fields of the first document in the analysis result. Totals are
        ``Decimal`` values, dates are strings, and the ``unit_price`` and
        ``amount`` of each line item are reduced to their numeric part.

    Raises:
        ValueError: If the analysis result contains no documents. The
            original fell off the end of the loop and returned ``None``
            despite the declared return type.
    """
    with open(file_path, "rb") as f:
        poller = client.begin_analyze_document("prebuilt-invoice", f)
        result = poller.result()

    if not result.documents:
        raise ValueError(f"No invoice was recognized in {file_path!r}")

    # Only the first document is used, matching the original early return.
    fields = result.documents[0].fields

    # Extract line items
    items_field = fields.get("Items")
    raw_items = items_field.value if items_field is not None and items_field.value else []
    line_items = []
    for item in raw_items:
        item_fields = item.value or {}
        line_items.append({
            "description": _field_value(item_fields, "Description"),
            "quantity": _field_value(item_fields, "Quantity"),
            "unit_price": _amount(_field_value(item_fields, "UnitPrice")),
            "amount": _amount(_field_value(item_fields, "Amount")),
        })

    return ExtractedInvoice(
        vendor_name=_field_value(fields, "VendorName"),
        invoice_number=_field_value(fields, "InvoiceId"),
        invoice_date=_as_text(_field_value(fields, "InvoiceDate")),
        due_date=_as_text(_field_value(fields, "DueDate")),
        total_amount=_as_decimal(_field_value(fields, "InvoiceTotal")),
        tax_amount=_as_decimal(_field_value(fields, "TotalTax")),
        line_items=line_items,
    )
# Usage: analyze a local invoice and print a short summary of it.
invoice = extract_invoice("invoice.pdf")
summary_lines = (
    f"Vendor: {invoice.vendor_name}",
    f"Invoice #: {invoice.invoice_number}",
    f"Total: ${invoice.total_amount}",
    f"Line Items: {len(invoice.line_items)}",
)
for summary_line in summary_lines:
    print(summary_line)
Layout Analysis for Tables
def extract_tables(file_path: str) -> list[list[list[str]]]:
    """Extract tables from a document.

    Args:
        file_path: Path of the document to send to the ``prebuilt-layout``
            model.

    Returns:
        One grid per table in the result, each ``row_count`` rows by
        ``column_count`` columns, holding the text of every reported cell.
        Positions that no cell reports hold ``""``, so every entry is a
        ``str`` as the return annotation promises. The original pre-filled
        the grid with ``None``.
    """
    with open(file_path, "rb") as f:
        poller = client.begin_analyze_document("prebuilt-layout", f)
        result = poller.result()

    tables = []
    for table in result.tables:
        # Pre-size the grid because cells are placed by index, not in order.
        grid = [[""] * table.column_count for _ in range(table.row_count)]
        for cell in table.cells:
            grid[cell.row_index][cell.column_index] = cell.content
        tables.append(grid)
    return tables
# Usage: print every extracted table, one row per line, cells joined by " | ".
tables = extract_tables("report.pdf")
for i, table in enumerate(tables):
    print(f"\nTable {i + 1}:")
    for row in table:
        print(" | ".join(map(str, row)))
Building a Custom Model
When pre-built models don’t fit your documents:
from azure.ai.formrecognizer import DocumentModelAdministrationClient
# Administration client, used below to build custom models. It reuses the
# endpoint and key configured for the analysis client.
admin_client = DocumentModelAdministrationClient(endpoint, AzureKeyCredential(key))
def train_custom_model(
    training_data_url: str,
    model_id: str,
    description: str,
    build_mode: str = "template",
):
    """Train a custom model on labeled documents.

    Args:
        training_data_url: URL of the blob container holding the labeled
            training documents.
        model_id: Identifier to give the new model.
        description: Free-text description stored with the model.
        build_mode: Build technique passed to the service. Defaults to
            ``"template"`` (the value that used to be hard-coded); pass
            ``"neural"`` for varied documents.

    Returns:
        The model object produced by the build operation.
    """
    # Start training and block until the build operation finishes.
    poller = admin_client.begin_build_document_model(
        build_mode=build_mode,
        blob_container_url=training_data_url,
        model_id=model_id,
        description=description,
    )
    model = poller.result()

    print(f"Model ID: {model.model_id}")
    print(f"Created: {model.created_on}")
    # Guard against a missing doc_types mapping so reporting cannot crash
    # after the build itself has succeeded.
    print(f"Doc types: {list(model.doc_types or {})}")
    return model
# Train on your labeled documents in blob storage
# training_url = "https://yourstorage.blob.core.windows.net/training-data?SAS_TOKEN"
# train_custom_model(training_url, "my-custom-model", "Custom purchase orders")
Document Processing Pipeline
import os
from azure.storage.blob import BlobServiceClient
from azure.servicebus import ServiceBusClient, ServiceBusMessage
import json
from enum import Enum
class DocumentType(Enum):
    """Document categories the processing pipeline distinguishes.

    Each value is the string used in pipeline output and queue names.
    """

    INVOICE = "invoice"
    RECEIPT = "receipt"
    PURCHASE_ORDER = "purchase_order"
    # Fallback when no keyword matches.
    UNKNOWN = "unknown"
class DocumentProcessor:
    """Classify documents, extract their content, and route the result to a queue.

    Classification is a keyword heuristic over the text returned by the
    ``prebuilt-layout`` model. Invoices and receipts are then analyzed with
    their dedicated pre-built model; everything else keeps the layout result.
    """

    def __init__(
        self,
        form_recognizer_client: DocumentAnalysisClient,
        blob_client: BlobServiceClient,
        servicebus_client: ServiceBusClient
    ):
        """Store the service clients.

        Args:
            form_recognizer_client: Client used for every analysis call.
            blob_client: Stored on the instance; no method of this class
                uses it.
            servicebus_client: Client used to send routed results.
        """
        self.fr_client = form_recognizer_client
        self.blob_client = blob_client
        self.sb_client = servicebus_client

    def _analyze(self, model_id: str, file_path: str):
        """Run ``model_id`` over the file at ``file_path`` and return the result."""
        with open(file_path, "rb") as f:
            poller = self.fr_client.begin_analyze_document(model_id, f)
            return poller.result()

    def _classify_layout(self, result) -> DocumentType:
        """Pick a document type from keywords in a layout analysis result."""
        text_lower = " ".join(
            line.content for page in result.pages for line in page.lines
        ).lower()

        # Checked in this order: the first keyword found wins.
        if "invoice" in text_lower:
            return DocumentType.INVOICE
        if "receipt" in text_lower:
            return DocumentType.RECEIPT
        if "purchase order" in text_lower or "p.o." in text_lower:
            return DocumentType.PURCHASE_ORDER
        return DocumentType.UNKNOWN

    def classify_document(self, file_path: str) -> DocumentType:
        """Classify document type based on content.

        Args:
            file_path: Path of the document to analyze with
                ``prebuilt-layout``.

        Returns:
            The first matching type, or ``DocumentType.UNKNOWN``.
        """
        return self._classify_layout(self._analyze("prebuilt-layout", file_path))

    def process_document(self, file_path: str) -> dict:
        """Process document based on its type.

        Args:
            file_path: Path of the document to process.

        Returns:
            A dict with ``document_type``, ``confidence`` (of the first
            document in the result, or ``None``), ``fields`` and ``tables``.
        """
        layout_result = self._analyze("prebuilt-layout", file_path)
        doc_type = self._classify_layout(layout_result)

        if doc_type == DocumentType.INVOICE:
            result = self._analyze("prebuilt-invoice", file_path)
        elif doc_type == DocumentType.RECEIPT:
            result = self._analyze("prebuilt-receipt", file_path)
        else:
            # Classification already ran "prebuilt-layout" on this file, so
            # reuse that result instead of paying for the same call twice.
            result = layout_result

        return {
            "document_type": doc_type.value,
            "confidence": result.documents[0].confidence if result.documents else None,
            "fields": self._extract_fields(result),
            "tables": self._extract_tables(result)
        }

    def _extract_fields(self, result) -> dict:
        """Extract fields from analysis result.

        Fields whose value is ``None`` are skipped; values are stringified so
        the output can be JSON-encoded. When several documents define the same
        field name, the last one wins.
        """
        fields = {}
        for document in result.documents or []:
            for name, field in document.fields.items():
                if field.value is not None:
                    fields[name] = {
                        "value": str(field.value),
                        "confidence": field.confidence
                    }
        return fields

    def _extract_tables(self, result) -> list:
        """Extract tables from analysis result.

        Returns one dict per table with its dimensions and a flat list of
        cells, each carrying its row index, column index and content.
        """
        return [
            {
                "rows": table.row_count,
                "columns": table.column_count,
                "cells": [
                    {
                        "row": cell.row_index,
                        "column": cell.column_index,
                        "content": cell.content
                    }
                    for cell in table.cells
                ]
            }
            for table in result.tables or []
        ]

    def process_and_route(self, file_path: str, source: str):
        """Process document and route to appropriate queue.

        Args:
            file_path: Path of the document to process.
            source: Caller-supplied label recorded under ``"source"``.

        Returns:
            The processing result, including the added ``source`` and
            ``file_name`` entries, after it has been sent to the queue.
        """
        result = self.process_document(file_path)

        # Add metadata
        result["source"] = source
        result["file_name"] = os.path.basename(file_path)

        # Route to appropriate queue based on document type
        queue_name = f"documents-{result['document_type']}"
        with self.sb_client.get_queue_sender(queue_name) as sender:
            message = ServiceBusMessage(json.dumps(result))
            sender.send_messages(message)
        return result
Batch Processing with Azure Functions
import azure.functions as func
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
import json
import os
app = func.FunctionApp()


@app.blob_trigger(
    arg_name="blob",
    path="incoming-documents/{name}",
    connection="AzureWebJobsStorage"
)
@app.queue_output(
    arg_name="output",
    queue_name="processed-documents",
    connection="AzureWebJobsStorage"
)
def process_incoming_document(blob: func.InputStream, output: func.Out[str]):
    """Process documents uploaded to blob storage.

    Analyzes the uploaded blob with the ``prebuilt-invoice`` model and writes
    a JSON summary (file name plus, per document, its type, confidence and
    non-empty fields as strings) to the output queue.

    Args:
        blob: The blob that fired the trigger.
        output: Binding that receives the JSON message.
    """
    client = DocumentAnalysisClient(
        endpoint=os.environ["FORM_RECOGNIZER_ENDPOINT"],
        credential=AzureKeyCredential(os.environ["FORM_RECOGNIZER_KEY"])
    )

    # Analyze document. Pass the blob's bytes rather than the trigger stream:
    # bytes is a documented input type for the SDK, whereas the stream is
    # presumably not rewindable, so a retried request could not resend it.
    # NOTE(review): confirm the stream's seek behavior for your runtime.
    poller = client.begin_analyze_document("prebuilt-invoice", blob.read())
    result = poller.result()

    # Extract data
    extracted = {
        "file_name": blob.name,
        "documents": [
            {
                "doc_type": document.doc_type,
                "confidence": document.confidence,
                "fields": {
                    name: str(field.value)
                    for name, field in document.fields.items()
                    if field.value is not None
                },
            }
            for document in result.documents or []
        ],
    }

    # Send to output queue
    output.set(json.dumps(extracted))
Validation and Confidence Scoring
@dataclass
class ValidationResult:
    """Outcome of validating an extracted invoice."""

    # True when no issues were recorded.
    is_valid: bool
    # Coarse score set by the validator: 0.9 when valid, 0.5 otherwise.
    confidence_score: float
    # One human-readable message per problem found.
    issues: list[str]


def _money(value):
    """Coerce a monetary value to ``Decimal``; missing or empty counts as zero.

    Accepts plain numbers, ``Decimal`` values, and objects that expose the
    number through an ``amount`` attribute.
    """
    from decimal import Decimal

    amount = getattr(value, "amount", value)
    if not amount:
        return Decimal(0)
    if isinstance(amount, Decimal):
        return amount
    # Go through str() so floats such as 0.07 convert to the value they print.
    return Decimal(str(amount))


def validate_extracted_invoice(invoice: ExtractedInvoice) -> ValidationResult:
    """Validate extracted invoice data.

    Checks that the vendor name, invoice number and total are present, that
    the line-item amounts add up to the invoice total (within 0.01), and that
    the invoice date parses as ``YYYY-MM-DD`` and is not in the future.

    Args:
        invoice: The extracted invoice to check.

    Returns:
        A ``ValidationResult`` listing every issue found. The confidence
        score is 0.9 when there are none and 0.5 otherwise.
    """
    from datetime import datetime
    from decimal import Decimal

    issues = []

    # Required fields
    if not invoice.vendor_name:
        issues.append("Missing vendor name")
    if not invoice.invoice_number:
        issues.append("Missing invoice number")
    # A total of zero is a real value, so test for absence explicitly.
    has_total = invoice.total_amount is not None
    if not has_total:
        issues.append("Missing total amount")

    # Line item validation, done in Decimal so mixed float/Decimal inputs and
    # tolerance-boundary values compare exactly.
    # NOTE(review): line amounts are compared to the invoice total as-is; if
    # your line amounts exclude tax, add invoice.tax_amount - confirm.
    if invoice.line_items:
        line_total = sum(
            (_money(item.get("amount")) for item in invoice.line_items),
            Decimal(0),
        )
        if has_total and abs(line_total - _money(invoice.total_amount)) > Decimal("0.01"):
            issues.append(f"Line items total ({line_total}) doesn't match invoice total ({invoice.total_amount})")

    # Date validation
    if invoice.invoice_date:
        try:
            date = datetime.strptime(invoice.invoice_date, "%Y-%m-%d")
        except (ValueError, TypeError):
            # Only parsing failures are reported; anything else propagates.
            issues.append("Invalid invoice date format")
        else:
            if date > datetime.now():
                issues.append("Invoice date is in the future")

    return ValidationResult(
        is_valid=not issues,
        confidence_score=0.5 if issues else 0.9,
        issues=issues
    )
Integration with ERP Systems
class ERPIntegration:
    """Post extracted invoices to an ERP system over HTTP."""

    def __init__(self, erp_api_url: str, api_key: str, timeout: float = 30.0):
        """Store the connection settings.

        Args:
            erp_api_url: Base URL of the ERP API.
            api_key: Token sent as a ``Bearer`` authorization header.
            timeout: Seconds to wait for the ERP to respond. Without a
                timeout a stalled server would block the caller forever.
        """
        self.api_url = erp_api_url
        self.api_key = api_key
        self.timeout = timeout

    def create_invoice_entry(self, invoice: ExtractedInvoice) -> dict:
        """Create invoice entry in ERP system.

        Args:
            invoice: The extracted invoice to post. Missing totals are sent
                as ``0``.

        Returns:
            The decoded JSON body of the ERP's response.

        Raises:
            requests.HTTPError: If the ERP answers with an error status, so
                a rejected invoice is not mistaken for a created one.
        """
        # This method calls ``requests.post``, but no import of ``requests``
        # appears anywhere in this file, so bring it into scope here.
        import requests

        payload = {
            "vendor": invoice.vendor_name,
            "invoice_number": invoice.invoice_number,
            "invoice_date": invoice.invoice_date,
            "due_date": invoice.due_date,
            "total_amount": float(invoice.total_amount) if invoice.total_amount else 0,
            "tax_amount": float(invoice.tax_amount) if invoice.tax_amount else 0,
            "line_items": [
                {
                    "description": item.get("description"),
                    "quantity": item.get("quantity"),
                    "unit_price": item.get("unit_price"),
                    "amount": item.get("amount")
                }
                for item in invoice.line_items
            ],
            "status": "pending_approval"
        }

        response = requests.post(
            # Tolerate a base URL configured with a trailing slash.
            f"{self.api_url.rstrip('/')}/invoices",
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json"
            },
            json=payload,
            timeout=self.timeout
        )
        response.raise_for_status()
        return response.json()
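Azure Form Recognizer transforms manual document processing into automated workflows. The combination of pre-built and custom models handles most document types, while confidence scores enable human-in-the-loop validation for edge cases.

## Takeaways

- Start with the pre-built models (invoices, receipts, IDs, business cards) and train a custom model only when they don't fit your documents.
- Validate every extraction, and send low-confidence or inconsistent results to a person for review rather than straight into downstream systems.
- Next steps: run the invoice example against a few of your own documents, then wire the blob-triggered function and queue routing into a small end-to-end pipeline.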