7 min read
Intelligent Document Processing with Azure Form Recognizer
Azure Form Recognizer (now part of Azure AI Document Intelligence) is an AI-powered document analysis service. It extracts text, key-value pairs, tables, and document structure, automating data entry and document processing workflows.
Setting Up Form Recognizer
# Create Form Recognizer resource
# --kind FormRecognizer selects the document-analysis service on this account.
# --sku S0 is the paid tier; the endpoint and keys are created in --resource-group
# and hosted in the --location region.
az cognitiveservices account create \
--name myformrecognizer \
--resource-group myResourceGroup \
--kind FormRecognizer \
--sku S0 \
--location eastus
Pre-built Models
Form Recognizer includes pre-built models for common document types.
Invoice Processing
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
import json
class InvoiceProcessor:
    """Extract structured invoice data using the prebuilt-invoice model.

    Wraps DocumentAnalysisClient and converts every analyzed document into a
    plain dict whose scalar entries are {"value": ..., "confidence": ...}.
    """

    # Output key -> prebuilt-invoice field name.
    _FIELD_MAP = (
        ("vendor_name", "VendorName"),
        ("vendor_address", "VendorAddress"),
        ("customer_name", "CustomerName"),
        ("customer_address", "CustomerAddress"),
        ("invoice_id", "InvoiceId"),
        ("invoice_date", "InvoiceDate"),
        ("due_date", "DueDate"),
        ("purchase_order", "PurchaseOrder"),
        ("subtotal", "SubTotal"),
        ("tax", "TotalTax"),
        ("total", "InvoiceTotal"),
        ("amount_due", "AmountDue"),
    )

    def __init__(self, endpoint, key):
        """endpoint: Cognitive Services endpoint URL; key: its API key."""
        self.client = DocumentAnalysisClient(
            endpoint=endpoint,
            credential=AzureKeyCredential(key)
        )

    def analyze_invoice(self, document_url):
        """Analyze an invoice reachable at a public or SAS URL.

        Returns a list of invoice dicts, one per document detected.
        """
        poller = self.client.begin_analyze_document_from_url(
            "prebuilt-invoice",
            document_url
        )
        return self._process_result(poller.result())

    def analyze_invoice_stream(self, document_stream):
        """Analyze an invoice from an open binary file stream."""
        poller = self.client.begin_analyze_document(
            "prebuilt-invoice",
            document_stream
        )
        return self._process_result(poller.result())

    def _process_result(self, result):
        # Shared post-processing for the URL and stream entry points.
        # Bug fix: analyze_invoice_stream called this helper, but it was
        # never defined (NameError at runtime); analyze_invoice duplicated
        # the extraction logic inline instead.
        invoices = []
        for invoice in result.documents:
            invoice_data = {
                key: self._get_field_value(invoice.fields.get(field_name))
                for key, field_name in self._FIELD_MAP
            }
            invoice_data["items"] = self._extract_line_items(invoice.fields.get("Items"))
            invoices.append(invoice_data)
        return invoices

    def _get_field_value(self, field):
        # Missing fields map to None; present fields keep their confidence
        # score so callers can filter out low-confidence extractions.
        if field is None:
            return None
        return {
            "value": field.value,
            "confidence": field.confidence
        }

    def _extract_line_items(self, items_field):
        # Items is a list field; each entry's .value is a dict of sub-fields.
        if items_field is None:
            return []
        items = []
        for item in items_field.value:
            item_data = {}
            for field_name, out_key in (
                ("Description", "description"),
                ("Quantity", "quantity"),
                ("UnitPrice", "unit_price"),
                ("Amount", "amount"),
            ):
                if field_name in item.value:
                    item_data[out_key] = item.value[field_name].value
            items.append(item_data)
        return items
# Usage
processor = InvoiceProcessor(
    "https://your-resource.cognitiveservices.azure.com",
    "your-key"
)

# Analyze from URL; take the first (and usually only) invoice in the result.
invoice = processor.analyze_invoice(
    "https://example.com/invoice.pdf"
)[0]

# Scalar fields are {value, confidence} dicts produced by the processor.
print(f"Vendor: {invoice['vendor_name']['value']}")
print(f"Invoice #: {invoice['invoice_id']['value']}")
print(f"Total: ${invoice['total']['value']}")
print("Line items:")
for line_item in invoice['items']:
    print(f" - {line_item.get('description')}: ${line_item.get('amount')}")
Receipt Processing
def analyze_receipt(self, receipt_url):
"""Analyze receipt using pre-built model."""
poller = self.client.begin_analyze_document_from_url(
"prebuilt-receipt",
receipt_url
)
result = poller.result()
receipts = []
for receipt in result.documents:
receipt_data = {
"merchant_name": self._get_field_value(receipt.fields.get("MerchantName")),
"merchant_address": self._get_field_value(receipt.fields.get("MerchantAddress")),
"merchant_phone": self._get_field_value(receipt.fields.get("MerchantPhoneNumber")),
"transaction_date": self._get_field_value(receipt.fields.get("TransactionDate")),
"transaction_time": self._get_field_value(receipt.fields.get("TransactionTime")),
"subtotal": self._get_field_value(receipt.fields.get("Subtotal")),
"tax": self._get_field_value(receipt.fields.get("TotalTax")),
"tip": self._get_field_value(receipt.fields.get("Tip")),
"total": self._get_field_value(receipt.fields.get("Total")),
"items": self._extract_receipt_items(receipt.fields.get("Items"))
}
receipts.append(receipt_data)
return receipts
Business Card Processing
def analyze_business_card(self, card_url):
    """Extract contact information from a business card image/PDF at a URL.

    Returns a list of contact dicts; each entry holds plain lists of values
    (empty lists for fields the model did not find).
    """
    poller = self.client.begin_analyze_document_from_url(
        "prebuilt-businessCard",
        card_url
    )
    result = poller.result()
    contacts = []
    for card in result.documents:
        def field_values(name):
            # Bug fix: the original used card.fields.get(name, {}).value,
            # which raises AttributeError for any missing field because a
            # plain dict has no .value attribute. Also guard a present
            # field whose value is None.
            field = card.fields.get(name)
            if field is None or field.value is None:
                return []
            return [entry.value for entry in field.value]

        contacts.append({
            "names": field_values("ContactNames"),
            "job_titles": field_values("JobTitles"),
            "companies": field_values("CompanyNames"),
            "emails": field_values("Emails"),
            "phones": field_values("MobilePhones") + field_values("WorkPhones"),
            "addresses": field_values("Addresses"),
            "websites": field_values("Websites")
        })
    return contacts
ID Document Processing
def analyze_id_document(self, id_url):
    """Extract fields from identity documents (passport, driver's license)
    with the prebuilt-idDocument model; returns a list of dicts."""
    poller = self.client.begin_analyze_document_from_url(
        "prebuilt-idDocument",
        id_url
    )
    result = poller.result()

    # Output key -> prebuilt-idDocument field name.
    field_map = (
        ("first_name", "FirstName"),
        ("last_name", "LastName"),
        ("date_of_birth", "DateOfBirth"),
        ("date_of_expiration", "DateOfExpiration"),
        ("document_number", "DocumentNumber"),
        ("address", "Address"),
        ("country_region", "CountryRegion"),
        ("sex", "Sex"),
    )

    documents = []
    for id_doc in result.documents:
        extracted = {"document_type": id_doc.doc_type}
        for out_key, field_name in field_map:
            extracted[out_key] = self._get_field_value(id_doc.fields.get(field_name))
        documents.append(extracted)
    return documents
Custom Models
Train custom models for your specific document types.
from azure.ai.formrecognizer import DocumentModelAdministrationClient
import time
class CustomModelTrainer:
    """Manage custom Form Recognizer models: build, compose, analyze,
    list, and delete."""

    def __init__(self, endpoint, key):
        # One credential shared by the admin (training) and analysis clients.
        credential = AzureKeyCredential(key)
        self.admin_client = DocumentModelAdministrationClient(
            endpoint=endpoint,
            credential=credential
        )
        self.analysis_client = DocumentAnalysisClient(
            endpoint=endpoint,
            credential=credential
        )

    def train_model(self, training_data_url, model_id, description=None):
        """Build a custom model from labeled documents in blob storage and
        print a summary of the resulting field schema."""
        poller = self.admin_client.begin_build_document_model(
            "neural",  # or "template"
            blob_container_url=training_data_url,
            model_id=model_id,
            description=description
        )
        model = poller.result()

        print(f"Model ID: {model.model_id}")
        print(f"Description: {model.description}")
        print(f"Created: {model.created_on}")
        print("Document types:")
        for doc_type_name, doc_type in model.doc_types.items():
            print(f" {doc_type_name}:")
            for schema_name, schema in doc_type.field_schema.items():
                print(f" - {schema_name}: {schema['type']}")
        return model

    def compose_models(self, model_ids, composed_model_id, description=None):
        """Merge several trained models into one composed model."""
        poller = self.admin_client.begin_compose_document_model(
            model_ids,
            model_id=composed_model_id,
            description=description
        )
        return poller.result()

    def analyze_with_custom_model(self, model_id, document_url):
        """Run a custom model against a document URL; flatten each document's
        fields into {value, confidence, value_type} dicts."""
        result = self.analysis_client.begin_analyze_document_from_url(
            model_id,
            document_url
        ).result()

        return [
            {
                "doc_type": doc.doc_type,
                "confidence": doc.confidence,
                "fields": {
                    name: {
                        "value": field.value,
                        "confidence": field.confidence,
                        "value_type": field.value_type
                    }
                    for name, field in doc.fields.items()
                }
            }
            for doc in result.documents
        ]

    def list_models(self):
        """Return id/description/creation time for every model on the resource."""
        summaries = []
        for model in self.admin_client.list_document_models():
            summaries.append({
                "model_id": model.model_id,
                "description": model.description,
                "created_on": model.created_on
            })
        return summaries

    def delete_model(self, model_id):
        """Permanently remove a custom model from the resource."""
        self.admin_client.delete_document_model(model_id)
# Train custom model
trainer = CustomModelTrainer(
    "https://your-resource.cognitiveservices.azure.com",
    "your-key"
)

# Training data lives in blob storage (labeled documents plus OCR JSON files);
# the container URL must carry a SAS token granting read/list access.
model = trainer.train_model(
    training_data_url="https://storage.blob.core.windows.net/training-data?sv=...",
    model_id="purchase-order-v1",
    description="Custom model for purchase orders"
)

# Use the trained model on a new document
results = trainer.analyze_with_custom_model(
    "purchase-order-v1",
    "https://example.com/po.pdf"
)
Layout Analysis
def analyze_layout(self, document_url):
    """Extract text, tables, and structure with the prebuilt-layout model.

    Returns {"pages": [...], "tables": [...], "paragraphs": [...]}.
    """
    poller = self.client.begin_analyze_document_from_url(
        "prebuilt-layout",
        document_url
    )
    result = poller.result()

    # Per-page text content: lines with their polygons, words with confidence.
    pages = []
    for page in result.pages:
        pages.append({
            "page_number": page.page_number,
            "width": page.width,
            "height": page.height,
            "unit": page.unit,
            "lines": [
                {"content": text_line.content, "bounding_box": text_line.polygon}
                for text_line in page.lines
            ],
            "words": [
                {"content": word.content, "confidence": word.confidence}
                for word in page.words
            ],
        })

    # Tables as flat cell lists carrying grid coordinates and spans.
    tables = []
    for table in result.tables:
        cells = [
            {
                "content": cell.content,
                "row_index": cell.row_index,
                "column_index": cell.column_index,
                "row_span": cell.row_span,
                "column_span": cell.column_span,
                "is_header": cell.kind == "columnHeader",
            }
            for cell in table.cells
        ]
        tables.append({
            "row_count": table.row_count,
            "column_count": table.column_count,
            "cells": cells,
        })

    # Paragraph roles include title, sectionHeading, pageHeader, etc.
    paragraphs = [
        {"content": paragraph.content, "role": paragraph.role}
        for paragraph in result.paragraphs
    ]

    return {"pages": pages, "tables": tables, "paragraphs": paragraphs}
def extract_table_as_dataframe(table_data):
    """Convert an extracted layout table into a pandas DataFrame.

    The first table row is used as the header; cells missing from the
    extraction result stay None. Bug fix: an empty table previously raised
    IndexError on rows[0]; it now returns an empty DataFrame.
    """
    import pandas as pd

    n_rows = table_data["row_count"]
    n_cols = table_data["column_count"]

    # Guard the empty-table edge case before indexing the header row.
    if n_rows == 0 or n_cols == 0:
        return pd.DataFrame()

    # Build a dense grid, then place each extracted cell by its coordinates.
    grid = [[None] * n_cols for _ in range(n_rows)]
    for cell in table_data["cells"]:
        grid[cell["row_index"]][cell["column_index"]] = cell["content"]

    return pd.DataFrame(grid[1:], columns=grid[0])
Batch Processing
import asyncio
from concurrent.futures import ThreadPoolExecutor
async def process_documents_batch(processor, document_urls, model_id="prebuilt-invoice"):
    """Process multiple documents concurrently, five at a time.

    Bug fix: the original coroutine never awaited anything — both
    begin_analyze_document_from_url and poller.result() are blocking calls,
    so asyncio.gather processed the documents strictly sequentially while
    blocking the event loop. The blocking SDK calls now run in the default
    thread-pool executor, so a batch genuinely overlaps.

    Returns analysis results in the same order as document_urls.
    """
    results = []

    def analyze(url):
        # Synchronous SDK call: start the analysis and wait for its result.
        poller = processor.client.begin_analyze_document_from_url(model_id, url)
        return poller.result()

    async def process_one(url):
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, analyze, url)

    # Cap in-flight requests at 5 to respect service throttling limits.
    batch_size = 5
    for start in range(0, len(document_urls), batch_size):
        batch = document_urls[start:start + batch_size]
        batch_results = await asyncio.gather(*(process_one(url) for url in batch))
        results.extend(batch_results)
    return results
# Usage
async def main():
    invoice_urls = [
        "https://storage.blob.core.windows.net/invoices/inv001.pdf",
        "https://storage.blob.core.windows.net/invoices/inv002.pdf",
        "https://storage.blob.core.windows.net/invoices/inv003.pdf",
    ]
    processor = InvoiceProcessor("endpoint", "key")
    analyses = await process_documents_batch(processor, invoice_urls)
    # Report the InvoiceTotal field of the first document in each result.
    for number, analysis in enumerate(analyses, start=1):
        total_field = analysis.documents[0].fields.get('InvoiceTotal')
        print(f"Invoice {number}: Total = {total_field}")

asyncio.run(main())
Integration Example: Invoice Processing Pipeline
from azure.storage.blob import BlobServiceClient
import json
class InvoicePipeline:
    """End-to-end pipeline: read invoice files from one blob container, run
    Form Recognizer on each, and write the extracted JSON to another."""

    def __init__(self, form_recognizer_endpoint, form_recognizer_key,
                 storage_connection_string):
        self.processor = InvoiceProcessor(form_recognizer_endpoint, form_recognizer_key)
        self.blob_service = BlobServiceClient.from_connection_string(storage_connection_string)

    def process_new_invoices(self, input_container, output_container):
        """Process every supported document in input_container.

        Writes "<blob name>.json" to output_container for each success and
        returns a list of per-file status dicts; failures are recorded
        instead of aborting the rest of the batch.
        """
        input_client = self.blob_service.get_container_client(input_container)
        output_client = self.blob_service.get_container_client(output_container)
        processed = []
        for blob in input_client.list_blobs():
            # Skip anything that isn't a supported document format.
            if not blob.name.lower().endswith(('.pdf', '.jpg', '.png')):
                continue
            print(f"Processing: {blob.name}")
            # Form Recognizer needs a readable URL, so attach a short-lived SAS.
            blob_url = self._get_blob_url_with_sas(input_container, blob.name)
            try:
                invoices = self.processor.analyze_invoice(blob_url)
                # default=str handles dates and other non-JSON-native values.
                result_json = json.dumps(invoices, default=str, indent=2)
                output_client.upload_blob(f"{blob.name}.json", result_json, overwrite=True)
                processed.append({
                    "file": blob.name,
                    "status": "success",
                    "data": invoices
                })
            except Exception as e:
                # Record the failure and continue with the remaining blobs.
                processed.append({
                    "file": blob.name,
                    "status": "error",
                    "error": str(e)
                })
        return processed

    def _get_blob_url_with_sas(self, container, blob_name):
        """Return a read-only URL for the blob, valid for one hour."""
        from datetime import datetime, timedelta
        from azure.storage.blob import generate_blob_sas, BlobSasPermissions
        sas_token = generate_blob_sas(
            self.blob_service.account_name,
            container,
            blob_name,
            # Bug fix: generate_blob_sas cannot sign a token without a key;
            # the original omitted account_key entirely, which raises
            # ValueError at runtime. The shared key is available because the
            # client was built from a connection string.
            account_key=self.blob_service.credential.account_key,
            permission=BlobSasPermissions(read=True),
            expiry=datetime.utcnow() + timedelta(hours=1)
        )
        # NOTE(review): assumes BlobServiceClient.url ends with a trailing
        # slash — confirm for the SDK version in use.
        return f"{self.blob_service.url}{container}/{blob_name}?{sas_token}"
Conclusion
Azure Form Recognizer transforms document processing:
- Pre-built models for invoices, receipts, IDs, and business cards
- Custom models for domain-specific documents
- Layout analysis for complex document structures
- Table extraction for structured data
- Batch processing for high-volume scenarios
It eliminates manual data entry and enables intelligent document workflows.