5 min read
Azure Form Recognizer Prebuilt Models: Quick Document Processing
Azure Form Recognizer prebuilt models provide instant document intelligence without training. They’re optimized for common document types and ready for production use.
Available Prebuilt Models
# One row per prebuilt model: (model id, short description, typical use cases).
_PREBUILT_MODEL_ROWS = (
    (
        "prebuilt-read",
        "OCR - extract text and layout from any document",
        ("General text extraction", "Scanned documents"),
    ),
    (
        "prebuilt-layout",
        "Extract text, tables, and structure",
        ("Table extraction", "Document structure analysis"),
    ),
    (
        "prebuilt-document",
        "General document with key-value pairs",
        ("Forms", "Questionnaires"),
    ),
    (
        "prebuilt-invoice",
        "Extract invoice fields",
        ("Accounts payable", "Invoice processing"),
    ),
    (
        "prebuilt-receipt",
        "Extract receipt data",
        ("Expense management", "Retail"),
    ),
    (
        "prebuilt-idDocument",
        "Extract ID document fields",
        ("Identity verification", "KYC"),
    ),
    (
        "prebuilt-businessCard",
        "Extract business card info",
        ("Contact management", "CRM"),
    ),
    (
        "prebuilt-healthInsuranceCard.us",
        "US health insurance cards",
        ("Healthcare intake", "Insurance verification"),
    ),
    (
        "prebuilt-tax.us.w2",
        "US W-2 tax forms",
        ("Tax processing", "Payroll"),
    ),
)

# Lookup table: model id -> {"description": str, "use_cases": list of str}.
PREBUILT_MODELS = {
    model_id: {"description": summary, "use_cases": list(use_cases)}
    for model_id, summary, use_cases in _PREBUILT_MODEL_ROWS
}
Invoice Processing
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
# Placeholder connection settings: substitute your own resource endpoint and key.
_ENDPOINT = "https://your-resource.cognitiveservices.azure.com/"
_API_KEY = "your-key"

# Shared module-level client used by the process_* helpers in this article.
client = DocumentAnalysisClient(
    endpoint=_ENDPOINT,
    credential=AzureKeyCredential(_API_KEY),
)
def process_invoice(file_path: str) -> dict:
    """Analyze an invoice file with the "prebuilt-invoice" model.

    Args:
        file_path: Path of the invoice file; it is opened in binary mode.

    Returns:
        A dict with "vendor", "customer", "details" and "amounts" sections
        plus an "items" list. Every value is read through get_field_value,
        so fields the service did not return come back as None. When the
        result holds several documents the last one wins; when it holds
        none, an empty dict is returned.
    """
    # Output section -> {output key -> invoice field name}.
    header_layout = {
        "vendor": {
            "name": "VendorName",
            "address": "VendorAddress",
            "tax_id": "VendorTaxId",
        },
        "customer": {
            "name": "CustomerName",
            "address": "CustomerAddress",
            "id": "CustomerId",
        },
        "details": {
            "invoice_id": "InvoiceId",
            "invoice_date": "InvoiceDate",
            "due_date": "DueDate",
            "purchase_order": "PurchaseOrder",
        },
        "amounts": {
            "subtotal": "SubTotal",
            "tax": "TotalTax",
            "total": "InvoiceTotal",
            "amount_due": "AmountDue",
            "previous_balance": "PreviousUnpaidBalance",
        },
    }
    # Output key -> field name inside each entry of the "Items" field.
    item_layout = {
        "description": "Description",
        "quantity": "Quantity",
        "unit": "Unit",
        "unit_price": "UnitPrice",
        "amount": "Amount",
        "product_code": "ProductCode",
    }

    with open(file_path, "rb") as stream:
        analysis = client.begin_analyze_document("prebuilt-invoice", stream).result()

    invoice = {}
    for document in analysis.documents:
        for section, mapping in header_layout.items():
            invoice[section] = {
                key: get_field_value(document.fields, field_name)
                for key, field_name in mapping.items()
            }

        # Line items: an absent or empty "Items" field yields an empty list.
        items_field = document.fields.get("Items")
        rows = items_field.value if items_field and items_field.value else []
        invoice["items"] = [
            {
                key: get_field_value(row.value, field_name)
                for key, field_name in item_layout.items()
            }
            for row in rows
        ]

    return invoice
def get_field_value(fields: dict, name: str):
    """Return the ``value`` of the field called *name*, or None if unavailable.

    Args:
        fields: Mapping of field name to a field object exposing ``.value``.
            May be None or empty (for example a line item whose own value
            was not extracted); None is returned in that case.
        name: Name of the field to look up.

    Returns:
        ``fields[name].value`` when the field is present and truthy,
        otherwise None.
    """
    # Guard first: calling .get on a missing mapping would raise AttributeError.
    if not fields:
        return None
    field = fields.get(name)
    return field.value if field else None
# Usage
invoice_data = process_invoice("invoice.pdf")
if invoice_data:
    print(f"Invoice: {invoice_data['details']['invoice_id']}")
    print(f"Total: {invoice_data['amounts']['total']}")
else:
    # process_invoice returns {} when the service detected no document,
    # so indexing into it here would raise KeyError.
    print("No invoice detected in invoice.pdf")
Receipt Processing
def process_receipt(file_path: str) -> dict:
    """Analyze a receipt file with the "prebuilt-receipt" model.

    Args:
        file_path: Path of the receipt file; it is opened in binary mode.

    Returns:
        A dict with "merchant", "transaction" and "totals" sections plus an
        "items" list of {"name", "quantity", "price"} dicts. Fields the
        service did not return are None. When the result holds several
        documents the last one wins; when it holds none, an empty dict is
        returned.
    """
    with open(file_path, "rb") as f:
        poller = client.begin_analyze_document("prebuilt-receipt", f)
        result = poller.result()

    receipt = {}
    for doc in result.documents:
        fields = doc.fields
        receipt = {
            "merchant": {
                "name": get_field_value(fields, "MerchantName"),
                "address": get_field_value(fields, "MerchantAddress"),
                "phone": get_field_value(fields, "MerchantPhoneNumber"),
            },
            "transaction": {
                "date": get_field_value(fields, "TransactionDate"),
                "time": get_field_value(fields, "TransactionTime"),
            },
            "totals": {
                "subtotal": get_field_value(fields, "Subtotal"),
                "tax": get_field_value(fields, "TotalTax"),
                "tip": get_field_value(fields, "Tip"),
                "total": get_field_value(fields, "Total"),
            },
            "items": [],
        }

        items = fields.get("Items")
        if items and items.value:
            for item in items.value:
                # An item whose own value is missing is treated as having no fields.
                item_fields = item.value or {}
                # The receipt model used by DocumentAnalysisClient names the
                # item text "Description"; "Name" is the older schema's field
                # and is kept only as a fallback.
                item_name = get_field_value(item_fields, "Description")
                if item_name is None:
                    item_name = get_field_value(item_fields, "Name")
                receipt["items"].append({
                    "name": item_name,
                    "quantity": get_field_value(item_fields, "Quantity"),
                    "price": get_field_value(item_fields, "TotalPrice"),
                })

    return receipt
ID Document Processing
def process_id_document(file_path: str) -> dict:
    """Analyze an ID document (license, passport, etc.) with "prebuilt-idDocument".

    Args:
        file_path: Path of the ID document file; it is opened in binary mode.

    Returns:
        A dict with "document_type", "personal" and "document" sections.
        Driver licenses additionally get a "license" section. Fields the
        service did not return are None. When the result holds several
        documents the last one wins; when it holds none, an empty dict is
        returned.
    """
    with open(file_path, "rb") as f:
        poller = client.begin_analyze_document("prebuilt-idDocument", f)
        result = poller.result()

    id_data = {}
    for doc in result.documents:
        doc_type = doc.doc_type  # e.g., "idDocument.driverLicense"
        fields = doc.fields
        id_data = {
            "document_type": doc_type,
            "personal": {
                "first_name": get_field_value(fields, "FirstName"),
                "last_name": get_field_value(fields, "LastName"),
                "date_of_birth": get_field_value(fields, "DateOfBirth"),
                "sex": get_field_value(fields, "Sex"),
                "address": get_field_value(fields, "Address"),
            },
            "document": {
                "number": get_field_value(fields, "DocumentNumber"),
                "date_of_issue": get_field_value(fields, "DateOfIssue"),
                "date_of_expiration": get_field_value(fields, "DateOfExpiration"),
                "region": get_field_value(fields, "Region"),
                "country": get_field_value(fields, "CountryRegion"),
            },
        }

        # Additional fields for driver's license
        if doc_type and "driverLicense" in doc_type:
            id_data["license"] = {
                # Per the prebuilt ID model's field schema, the license class
                # is "VehicleClassifications". "DocumentDiscriminator" is a
                # card identifier, so it is reported under its own key.
                "class": get_field_value(fields, "VehicleClassifications"),
                "discriminator": get_field_value(fields, "DocumentDiscriminator"),
                "endorsements": get_field_value(fields, "Endorsements"),
                "restrictions": get_field_value(fields, "Restrictions"),
            }

    return id_data
Batch Processing
import asyncio
from concurrent.futures import ThreadPoolExecutor
class BatchProcessor:
    """Run one analysis model over many files using a thread pool."""

    def __init__(self, client, model_id: str, max_workers: int = 5):
        """Store the analysis client, the model id and the pool size."""
        self.client = client
        self.model_id = model_id
        self.max_workers = max_workers

    def process_single(self, file_path: str) -> dict:
        """Analyze one file and summarize every document found in it.

        Returns a dict with the input "file" and a "documents" list; each
        entry carries the document "type", its "confidence" and a "fields"
        mapping of field name -> {"value", "confidence"}.
        """
        with open(file_path, "rb") as stream:
            analysis = self.client.begin_analyze_document(self.model_id, stream).result()

        summaries = []
        for document in analysis.documents:
            field_summary = {}
            for field_name, field in document.fields.items():
                field_summary[field_name] = {
                    "value": field.value,
                    "confidence": field.confidence,
                }
            summaries.append({
                "type": document.doc_type,
                "confidence": document.confidence,
                "fields": field_summary,
            })

        return {"file": file_path, "documents": summaries}

    def process_batch(self, file_paths: list) -> list:
        """Analyze several files in parallel.

        Returns one entry per input path, in input order: the
        process_single result on success, or {"file", "error"} when
        processing that path raised.
        """
        outcomes = []
        with ThreadPoolExecutor(max_workers=self.max_workers) as pool:
            # Submit everything up front so the work overlaps, then collect
            # in submission order.
            pending = [
                (path, pool.submit(self.process_single, path))
                for path in file_paths
            ]
            for path, task in pending:
                try:
                    outcomes.append(task.result())
                except Exception as exc:
                    # One bad file must not abort the rest of the batch.
                    outcomes.append({"file": path, "error": str(exc)})
        return outcomes
# Usage: run the invoice model over a few files and report each outcome.
invoice_files = ["invoice1.pdf", "invoice2.pdf", "invoice3.pdf"]
processor = BatchProcessor(client, "prebuilt-invoice")
results = processor.process_batch(invoice_files)

for result in results:
    if "error" not in result:
        print(f"Processed {result['file']}: {len(result['documents'])} documents")
        continue
    print(f"Error processing {result['file']}: {result['error']}")
Integration Example
class DocumentProcessingPipeline:
    """Detect a document's type from its text, then extract fields with the matching model."""

    # Ordered keyword rules: the first model whose keywords appear in the
    # text wins, so more specific document types come first.
    _KEYWORD_RULES = (
        ("prebuilt-invoice", ("invoice", "bill to", "invoice number")),
        ("prebuilt-receipt", ("receipt", "subtotal", "thank you")),
        ("prebuilt-idDocument", ("driver license", "passport", "date of birth")),
    )
    _FALLBACK_MODEL = "prebuilt-document"

    def __init__(self, endpoint: str, key: str):
        """Create the analysis client for the given endpoint and API key."""
        self.client = DocumentAnalysisClient(
            endpoint=endpoint,
            credential=AzureKeyCredential(key)
        )

    def detect_and_process(self, file_path: str) -> dict:
        """Detect document type and process accordingly.

        Args:
            file_path: Path of the file to analyze; opened in binary mode.

        Returns:
            {"detected_model": model id used, "result": field name -> value}.
        """
        # First pass with the general document model to get the text.
        with open(file_path, "rb") as f:
            poller = self.client.begin_analyze_document(self._FALLBACK_MODEL, f)
            initial_result = poller.result()

        # Determine best model based on content
        model_id = self._detect_document_type(initial_result)

        if model_id == self._FALLBACK_MODEL:
            # The first pass already ran this model; calling the service
            # again would only repeat the same analysis.
            result = initial_result
        else:
            # Re-process with the specific model
            with open(file_path, "rb") as f:
                poller = self.client.begin_analyze_document(model_id, f)
                result = poller.result()

        return {
            "detected_model": model_id,
            "result": self._extract_fields(result)
        }

    def _detect_document_type(self, result) -> str:
        """Pick a model id by searching the lower-cased page text for keywords.

        Returns "prebuilt-document" when no keyword matches or there is no text.
        """
        # Join once instead of growing a string line by line.
        text = " ".join(
            line.content.lower()
            for page in (result.pages or [])
            for line in (page.lines or [])
        )

        for model_id, keywords in self._KEYWORD_RULES:
            if any(keyword in text for keyword in keywords):
                return model_id
        return self._FALLBACK_MODEL

    def _extract_fields(self, result) -> dict:
        """Flatten every document's fields into one name -> value dict.

        When several documents share a field name, the later document wins.
        """
        fields = {}
        for doc in (result.documents or []):
            for name, field in doc.fields.items():
                fields[name] = field.value
        return fields
Best Practices
- Choose the right model: Use specific models for better accuracy
- Handle confidence scores: Filter low-confidence extractions
- Validate extracted data: Apply business rules
- Process asynchronously: Run analyses concurrently (for example with a thread pool, as in the batch example) for better throughput
- Cache results: Avoid re-processing same documents