Skip to content
Back to Blog
1 min read

Intelligent Document Processing with Azure Form Recognizer

Form Recognizer is where I’ve seen the most immediate ROI from Cognitive Services in enterprise settings: accounts payable teams manually keying invoice data from PDFs, insurance claims arriving as handwritten forms, and medical referrals with structured but non-standard layouts. Form Recognizer’s prebuilt models—invoices, receipts, ID documents, business cards, tax forms—are production-ready for common formats without any training. The custom models let you train on your specific form layout with as few as five labelled examples. The per-field confidence scores are honest and actionable; you route low-confidence extractions to human review rather than letting bad data into the system.

Setting Up Form Recognizer

# Create a Form Recognizer resource: a Cognitive Services account of kind
# FormRecognizer on the S0 SKU in the eastus region.
# "myformrecognizer" and "myResourceGroup" are example names.
az cognitiveservices account create \
    --name myformrecognizer \
    --resource-group myResourceGroup \
    --kind FormRecognizer \
    --sku S0 \
    --location eastus

Pre-built Models

Form Recognizer includes pre-built models for common document types.

Invoice Processing

from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
import json

class InvoiceProcessor:
    """Extract invoice data with the ``prebuilt-invoice`` model.

    Both entry points (URL and stream) return a list with one dictionary per
    document in the analysis result. Scalar entries are either ``None`` (field
    absent) or ``{"value": ..., "confidence": ...}``; ``"items"`` is a list of
    line-item dictionaries.
    """

    # (output key, analyzed field name) pairs, in the order they appear in
    # each returned invoice dictionary.
    _INVOICE_FIELDS = (
        ("vendor_name", "VendorName"),
        ("vendor_address", "VendorAddress"),
        ("customer_name", "CustomerName"),
        ("customer_address", "CustomerAddress"),
        ("invoice_id", "InvoiceId"),
        ("invoice_date", "InvoiceDate"),
        ("due_date", "DueDate"),
        ("purchase_order", "PurchaseOrder"),
        ("subtotal", "SubTotal"),
        ("tax", "TotalTax"),
        ("total", "InvoiceTotal"),
        ("amount_due", "AmountDue"),
    )

    # (output key, line-item field name) pairs copied from each line item.
    _LINE_ITEM_FIELDS = (
        ("description", "Description"),
        ("quantity", "Quantity"),
        ("unit_price", "UnitPrice"),
        ("amount", "Amount"),
    )

    def __init__(self, endpoint, key):
        """Create the analysis client from a resource endpoint and API key."""
        self.client = DocumentAnalysisClient(
            endpoint=endpoint,
            credential=AzureKeyCredential(key)
        )

    def analyze_invoice(self, document_url):
        """Analyze the invoice at ``document_url`` using the pre-built model.

        Returns:
            A list of invoice dictionaries, one per analyzed document.
        """
        poller = self.client.begin_analyze_document_from_url(
            "prebuilt-invoice",
            document_url
        )
        return self._process_result(poller.result())

    def analyze_invoice_stream(self, document_stream):
        """Analyze an invoice supplied as a file stream.

        Returns:
            A list of invoice dictionaries, in the same shape as
            ``analyze_invoice``.
        """
        poller = self.client.begin_analyze_document(
            "prebuilt-invoice",
            document_stream
        )
        return self._process_result(poller.result())

    def _process_result(self, result):
        """Convert an analysis result into a list of invoice dictionaries."""
        invoices = []
        for invoice in result.documents:
            fields = invoice.fields
            invoice_data = {
                key: self._get_field_value(fields.get(name))
                for key, name in self._INVOICE_FIELDS
            }
            invoice_data["items"] = self._extract_line_items(fields.get("Items"))
            invoices.append(invoice_data)

        return invoices

    def _get_field_value(self, field):
        """Return ``{"value", "confidence"}`` for a field, or ``None`` if absent."""
        if field is None:
            return None
        return {
            "value": field.value,
            "confidence": field.confidence
        }

    def _extract_line_items(self, items_field):
        """Flatten the ``Items`` field into a list of plain dictionaries.

        Only the sub-fields present on a line item are included in its
        dictionary. A missing field, or one whose value is empty, yields ``[]``.
        """
        if items_field is None or not items_field.value:
            return []

        items = []
        for item in items_field.value:
            # Treat a line item without a value as having no sub-fields
            # instead of failing on the membership test.
            sub_fields = item.value or {}
            items.append({
                key: sub_fields[name].value
                for key, name in self._LINE_ITEM_FIELDS
                if name in sub_fields
            })

        return items


# Usage
# The endpoint and key below are example values.
processor = InvoiceProcessor(
    "https://your-resource.cognitiveservices.azure.com",
    "your-key"
)

# Analyze from URL
# analyze_invoice returns a list; [0] takes the first invoice and raises
# IndexError if the list is empty.
invoice = processor.analyze_invoice(
    "https://example.com/invoice.pdf"
)[0]

# Each scalar entry is either None or a {"value", "confidence"} dict, so these
# lookups assume the vendor, invoice id and total fields were all present.
print(f"Vendor: {invoice['vendor_name']['value']}")
print(f"Invoice #: {invoice['invoice_id']['value']}")
print(f"Total: ${invoice['total']['value']}")
print(f"Line items:")
for item in invoice['items']:
    # Line-item dicts only carry the keys that were present, hence .get().
    print(f"  - {item.get('description')}: ${item.get('amount')}")

Receipt Processing

def analyze_receipt(self, receipt_url):
    """Run the ``prebuilt-receipt`` model on a receipt URL.

    Returns a list with one dictionary per analyzed document. Scalar entries
    are produced by ``self._get_field_value`` and ``"items"`` by
    ``self._extract_receipt_items`` (that helper is not defined in this
    snippet and must exist on the class).
    """
    # (output key, analyzed field name) pairs, in output order.
    scalar_fields = (
        ("merchant_name", "MerchantName"),
        ("merchant_address", "MerchantAddress"),
        ("merchant_phone", "MerchantPhoneNumber"),
        ("transaction_date", "TransactionDate"),
        ("transaction_time", "TransactionTime"),
        ("subtotal", "Subtotal"),
        ("tax", "TotalTax"),
        ("tip", "Tip"),
        ("total", "Total"),
    )

    def summarize(document):
        fields = document.fields
        summary = {
            key: self._get_field_value(fields.get(name))
            for key, name in scalar_fields
        }
        summary["items"] = self._extract_receipt_items(fields.get("Items"))
        return summary

    analysis = self.client.begin_analyze_document_from_url(
        "prebuilt-receipt",
        receipt_url
    ).result()

    return [summarize(document) for document in analysis.documents]

Business Card Processing

def analyze_business_card(self, card_url):
    """Extract contact information from a business card.

    Runs the ``prebuilt-businessCard`` model on ``card_url`` and returns one
    dictionary per analyzed document. Every entry is a list of plain values;
    a field that is missing from the card, or present without a value, yields
    an empty list. ``"phones"`` holds mobile numbers followed by work numbers.
    """
    def values_of(fields, name):
        # A missing field comes back as None from .get(); both that and a
        # field whose value is None must produce an empty list.
        field = fields.get(name)
        entries = getattr(field, "value", None) or []
        return [entry.value for entry in entries]

    poller = self.client.begin_analyze_document_from_url(
        "prebuilt-businessCard",
        card_url
    )
    result = poller.result()

    contacts = []
    for card in result.documents:
        fields = card.fields or {}
        contacts.append({
            "names": values_of(fields, "ContactNames"),
            "job_titles": values_of(fields, "JobTitles"),
            "companies": values_of(fields, "CompanyNames"),
            "emails": values_of(fields, "Emails"),
            "phones": values_of(fields, "MobilePhones") + values_of(fields, "WorkPhones"),
            "addresses": values_of(fields, "Addresses"),
            "websites": values_of(fields, "Websites"),
        })

    return contacts

ID Document Processing

def analyze_id_document(self, id_url):
    """Run the ``prebuilt-idDocument`` model on an ID document URL.

    Returns a list with one dictionary per analyzed document. Each dictionary
    starts with ``"document_type"`` (the document's ``doc_type``), followed by
    the identity fields as returned by ``self._get_field_value``.
    """
    # (output key, analyzed field name) pairs, in output order.
    identity_fields = (
        ("first_name", "FirstName"),
        ("last_name", "LastName"),
        ("date_of_birth", "DateOfBirth"),
        ("date_of_expiration", "DateOfExpiration"),
        ("document_number", "DocumentNumber"),
        ("address", "Address"),
        ("country_region", "CountryRegion"),
        ("sex", "Sex"),
    )

    analysis = self.client.begin_analyze_document_from_url(
        "prebuilt-idDocument",
        id_url
    ).result()

    extracted = []
    for id_doc in analysis.documents:
        record = {"document_type": id_doc.doc_type}
        for key, name in identity_fields:
            record[key] = self._get_field_value(id_doc.fields.get(name))
        extracted.append(record)
    return extracted

Custom Models

Train custom models for your specific document types.

from azure.ai.formrecognizer import DocumentModelAdministrationClient
import time

class CustomModelTrainer:
    """Build, compose, list, delete and use custom document models.

    Holds two clients created from the same endpoint and key: an
    administration client for model management and an analysis client for
    running documents through a model.
    """

    def __init__(self, endpoint, key):
        """Create the administration and analysis clients."""
        self.admin_client = DocumentModelAdministrationClient(
            endpoint=endpoint,
            credential=AzureKeyCredential(key)
        )
        self.analysis_client = DocumentAnalysisClient(
            endpoint=endpoint,
            credential=AzureKeyCredential(key)
        )

    def train_model(self, training_data_url, model_id, description=None, *,
                    build_mode="neural"):
        """Train a custom model on labeled data and print a summary of it.

        Args:
            training_data_url: URL of the blob container holding the training
                data.
            model_id: Identifier to give the new model.
            description: Optional description stored with the model.
            build_mode: Build mode passed to the service, ``"neural"``
                (default) or ``"template"``.

        Returns:
            The model object produced by the build operation.
        """
        poller = self.admin_client.begin_build_document_model(
            build_mode,
            blob_container_url=training_data_url,
            model_id=model_id,
            description=description
        )

        # Blocks until the build operation has finished.
        model = poller.result()
        print(f"Model ID: {model.model_id}")
        print(f"Description: {model.description}")
        print(f"Created: {model.created_on}")

        print("Document types:")
        for name, doc_type in model.doc_types.items():
            print(f"  {name}:")
            for field_name, field in doc_type.field_schema.items():
                print(f"    - {field_name}: {field['type']}")

        return model

    def compose_models(self, model_ids, composed_model_id, description=None):
        """Combine multiple models into one and return the composed model."""
        poller = self.admin_client.begin_compose_document_model(
            model_ids,
            model_id=composed_model_id,
            description=description
        )
        return poller.result()

    def analyze_with_custom_model(self, model_id, document_url):
        """Analyze the document at ``document_url`` with the given model.

        Returns:
            A list with one dictionary per analyzed document, holding its
            ``doc_type``, ``confidence`` and a ``fields`` mapping of field
            name to ``{"value", "confidence", "value_type"}``.
        """
        poller = self.analysis_client.begin_analyze_document_from_url(
            model_id,
            document_url
        )
        result = poller.result()

        documents = []
        for doc in result.documents:
            documents.append({
                "doc_type": doc.doc_type,
                "confidence": doc.confidence,
                "fields": {
                    name: {
                        "value": field.value,
                        "confidence": field.confidence,
                        "value_type": field.value_type
                    }
                    for name, field in doc.fields.items()
                }
            })

        return documents

    def list_models(self):
        """Return id, description and creation time of every listed model."""
        models = self.admin_client.list_document_models()
        return [
            {
                "model_id": m.model_id,
                "description": m.description,
                "created_on": m.created_on
            }
            for m in models
        ]

    def delete_model(self, model_id):
        """Delete a custom model."""
        self.admin_client.delete_document_model(model_id)


# Train custom model
# The endpoint and key below are example values.
trainer = CustomModelTrainer(
    "https://your-resource.cognitiveservices.azure.com",
    "your-key"
)

# Training data in blob storage with OCR JSON files
# train_model waits for the build to finish, prints a summary of the model
# and returns it. The "?sv=..." suffix is an elided query string.
model = trainer.train_model(
    training_data_url="https://storage.blob.core.windows.net/training-data?sv=...",
    model_id="purchase-order-v1",
    description="Custom model for purchase orders"
)

# Use custom model
# The first argument is the same model id that was passed to train_model above.
results = trainer.analyze_with_custom_model(
    "purchase-order-v1",
    "https://example.com/po.pdf"
)

Layout Analysis

def analyze_layout(self, document_url):
    """Run the ``prebuilt-layout`` model and return its structure as dicts.

    The returned dictionary has three lists:

    * ``"pages"``: per-page number, size, unit, lines (content and polygon,
      stored under ``"bounding_box"``) and words (content and confidence).
    * ``"tables"``: row and column counts plus one entry per cell.
    * ``"paragraphs"``: content and role of each paragraph.
    """
    def describe_page(page):
        return {
            "page_number": page.page_number,
            "width": page.width,
            "height": page.height,
            "unit": page.unit,
            "lines": [
                {"content": line.content, "bounding_box": line.polygon}
                for line in page.lines
            ],
            "words": [
                {"content": word.content, "confidence": word.confidence}
                for word in page.words
            ],
        }

    def describe_cell(cell):
        return {
            "content": cell.content,
            "row_index": cell.row_index,
            "column_index": cell.column_index,
            "row_span": cell.row_span,
            "column_span": cell.column_span,
            # Only column headers are flagged as headers.
            "is_header": cell.kind == "columnHeader",
        }

    def describe_table(table):
        return {
            "row_count": table.row_count,
            "column_count": table.column_count,
            "cells": [describe_cell(cell) for cell in table.cells],
        }

    layout = self.client.begin_analyze_document_from_url(
        "prebuilt-layout",
        document_url
    ).result()

    return {
        "pages": [describe_page(page) for page in layout.pages],
        "tables": [describe_table(table) for table in layout.tables],
        "paragraphs": [
            # role is e.g. title, sectionHeading, pageHeader
            {"content": paragraph.content, "role": paragraph.role}
            for paragraph in layout.paragraphs
        ],
    }


def extract_table_as_dataframe(table_data):
    """Convert an extracted table to a pandas DataFrame.

    Args:
        table_data: Dictionary with ``"row_count"``, ``"column_count"`` and
            ``"cells"``, where each cell has ``"row_index"``,
            ``"column_index"`` and ``"content"``.

    Returns:
        A DataFrame whose column labels are the contents of the first table
        row and whose data are the remaining rows. Grid positions that no
        cell fills are ``None``. A table with no rows yields an empty
        DataFrame.
    """
    import pandas as pd

    row_count = table_data["row_count"]
    if row_count == 0:
        # Without a first row there is nothing to use as the header.
        return pd.DataFrame()

    # Create empty grid
    rows = [[None] * table_data["column_count"] for _ in range(row_count)]

    # Fill in cells
    for cell in table_data["cells"]:
        rows[cell["row_index"]][cell["column_index"]] = cell["content"]

    # The first row becomes the header; the rest are data rows.
    header, *body = rows
    return pd.DataFrame(body, columns=header)

Batch Processing

import asyncio
from concurrent.futures import ThreadPoolExecutor

async def process_documents_batch(processor, document_urls, model_id="prebuilt-invoice",
                                  batch_size=5):
    """Process multiple documents concurrently.

    ``processor.client`` is called through its blocking API, so each analysis
    runs in a worker thread; awaiting the blocking call directly inside a
    coroutine would make the batch run one document at a time.

    Args:
        processor: Object whose ``client`` exposes
            ``begin_analyze_document_from_url``.
        document_urls: URLs of the documents to analyze.
        model_id: Model to run on every document.
        batch_size: Maximum number of documents analyzed at the same time.

    Returns:
        The analysis results, in the same order as ``document_urls``.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    def analyze(url):
        # Runs in a worker thread: start the operation and wait for it.
        # NOTE(review): assumes the client may be shared between threads;
        # confirm against the SDK documentation.
        poller = processor.client.begin_analyze_document_from_url(model_id, url)
        return poller.result()

    urls = list(document_urls)
    loop = asyncio.get_running_loop()
    results = []

    with ThreadPoolExecutor(max_workers=batch_size) as executor:
        # One batch at a time; gather keeps the results in submission order.
        for start in range(0, len(urls), batch_size):
            batch = urls[start:start + batch_size]
            pending = [loop.run_in_executor(executor, analyze, url) for url in batch]
            results.extend(await asyncio.gather(*pending))

    return results


# Usage
async def main():
    """Analyze three sample invoice URLs as a batch and print one line each."""
    urls = [
        "https://storage.blob.core.windows.net/invoices/inv001.pdf",
        "https://storage.blob.core.windows.net/invoices/inv002.pdf",
        "https://storage.blob.core.windows.net/invoices/inv003.pdf",
    ]

    # NOTE(review): "endpoint" and "key" look like placeholders for real
    # resource values; replace before running.
    processor = InvoiceProcessor("endpoint", "key")
    # model_id is left at its default, "prebuilt-invoice".
    results = await process_documents_batch(processor, urls)

    for i, result in enumerate(results):
        # Uses the first document of each result (IndexError if there is none)
        # and prints whatever fields.get('InvoiceTotal') returns, None included.
        print(f"Invoice {i + 1}: Total = {result.documents[0].fields.get('InvoiceTotal')}")

# Runs at module level, so this executes whenever the file is run or imported.
asyncio.run(main())

Integration Example: Invoice Processing Pipeline

from azure.storage.blob import BlobServiceClient
import json

class InvoicePipeline:
    """Analyze invoices stored in a blob container and save the results as JSON."""

    # Blob name suffixes (matched case-insensitively) that are treated as invoices.
    _INVOICE_EXTENSIONS = ('.pdf', '.jpg', '.png')

    def __init__(self, form_recognizer_endpoint, form_recognizer_key,
                 storage_connection_string):
        """Create the invoice processor and the blob service client."""
        self.processor = InvoiceProcessor(form_recognizer_endpoint, form_recognizer_key)
        self.blob_service = BlobServiceClient.from_connection_string(storage_connection_string)

    def process_new_invoices(self, input_container, output_container):
        """Process new invoices from blob storage.

        Every blob in ``input_container`` whose name ends in one of the
        invoice extensions is analyzed, and the result is uploaded to
        ``output_container`` as ``<blob name>.json``, overwriting any existing
        blob of that name.

        Returns:
            One entry per processed blob: ``{"file", "status": "success",
            "data"}`` on success or ``{"file", "status": "error", "error"}``
            when analysis or upload raised.
        """
        input_client = self.blob_service.get_container_client(input_container)
        output_client = self.blob_service.get_container_client(output_container)

        processed = []

        for blob in input_client.list_blobs():
            if not blob.name.lower().endswith(self._INVOICE_EXTENSIONS):
                continue

            print(f"Processing: {blob.name}")

            # Get blob URL with SAS token
            blob_url = self._get_blob_url_with_sas(input_container, blob.name)

            # Deliberately broad: one failing invoice is recorded and must not
            # stop the remaining blobs from being processed.
            try:
                invoices = self.processor.analyze_invoice(blob_url)

                # default=str stringifies any value json cannot serialize itself.
                result_json = json.dumps(invoices, default=str, indent=2)
                output_blob_name = f"{blob.name}.json"
                output_client.upload_blob(output_blob_name, result_json, overwrite=True)

                processed.append({
                    "file": blob.name,
                    "status": "success",
                    "data": invoices
                })
            except Exception as e:
                processed.append({
                    "file": blob.name,
                    "status": "error",
                    "error": str(e)
                })

        return processed

    def _get_blob_url_with_sas(self, container, blob_name):
        """Return the blob's URL with a read-only SAS token valid for one hour.

        The token is signed with the storage account key.
        NOTE(review): this assumes the connection string contained an account
        key, so that ``self.blob_service.credential.account_key`` exists;
        confirm for connection strings that authenticate differently.
        """
        from datetime import datetime, timedelta, timezone
        from azure.storage.blob import generate_blob_sas, BlobSasPermissions

        sas_token = generate_blob_sas(
            self.blob_service.account_name,
            container,
            blob_name,
            # A SAS cannot be generated without a signing key.
            account_key=self.blob_service.credential.account_key,
            permission=BlobSasPermissions(read=True),
            expiry=datetime.now(timezone.utc) + timedelta(hours=1)
        )

        # Let the SDK build the blob URL so container and blob names are
        # URL-encoded instead of being concatenated verbatim.
        blob_url = self.blob_service.get_blob_client(container, blob_name).url
        return f"{blob_url}?{sas_token}"

Conclusion

Azure Form Recognizer transforms document processing:

  • Pre-built models for invoices, receipts, IDs, and business cards
  • Custom models for domain-specific documents
  • Layout analysis for complex document structures
  • Table extraction for structured data
  • Batch processing for high-volume scenarios

It eliminates manual data entry and enables intelligent document workflows.

Michael John Pena

Michael John Pena

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.