March 8, 2023 1 min read

Intelligent Document Processing with Azure Form Recognizer

Azure AI Form Recognizer Document Processing OCR

Paper documents still dominate many business processes. Azure Form Recognizer transforms unstructured documents into structured data, automating data entry and enabling intelligent workflows. Here’s how to build a document processing pipeline.

Form Recognizer Capabilities

Pre-built models: Invoices, receipts, IDs, business cards
Layout analysis: Tables, text, selection marks
Custom models: Train on your specific documents
Composed models: Combine multiple models

Setting Up

from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential

# Endpoint URL and API key of the Form Recognizer resource. The literal
# values below look like placeholders ("your-form-recognizer", "your-api-key")
# and are presumably meant to be replaced with real ones.
endpoint = "https://your-form-recognizer.cognitiveservices.azure.com/"
key = "your-api-key"

# Module-level analysis client, authenticated with the key above. The
# extraction functions further down call it through this global name.
client = DocumentAnalysisClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(key)
)

Pre-built Invoice Model

from dataclasses import dataclass
from typing import Optional
from decimal import Decimal

@dataclass
class ExtractedInvoice:
    """Flat record of the fields read from one analyzed invoice.

    Instances are built by ``extract_invoice``. Every scalar field is
    optional and is ``None`` when the corresponding field was not found in
    the analysis result.
    """

    # Filled from the "VendorName" field.
    vendor_name: Optional[str]
    # Filled from the "InvoiceId" field.
    invoice_number: Optional[str]
    # Dates are stored as text: extract_invoice applies str() to the
    # "InvoiceDate" / "DueDate" field values.
    invoice_date: Optional[str]
    due_date: Optional[str]
    # Filled from the "InvoiceTotal" and "TotalTax" fields.
    total_amount: Optional[Decimal]
    tax_amount: Optional[Decimal]
    # One dict per invoice line with the keys "description", "quantity",
    # "unit_price" and "amount".
    line_items: list

def _invoice_field_value(fields, name):
    """Return the ``value`` of field *name*, or ``None`` when it is absent."""
    field = fields.get(name)
    return field.value if field is not None else None


def _currency_number(value):
    """Return the numeric part of an extracted amount.

    Currency fields may arrive as an object that carries the number in an
    ``amount`` attribute instead of as a bare number. Bare numbers and
    ``None`` pass through unchanged.
    """
    return getattr(value, "amount", value)


def _currency_decimal(value):
    """Convert an extracted amount to ``Decimal``; ``None`` stays ``None``."""
    number = _currency_number(value)
    if number is None:
        return None
    if isinstance(number, Decimal):
        return number
    # Go through str() so a float such as 0.1 becomes Decimal("0.1") rather
    # than its full binary expansion.
    return Decimal(str(number))


def _date_text(fields, name):
    """Return the field's date as text, or ``None`` when there is no value."""
    value = _invoice_field_value(fields, name)
    # Guard against str(None), which would yield the literal text "None".
    return str(value) if value is not None else None


def extract_invoice(file_path: str) -> ExtractedInvoice:
    """Extract data from an invoice document.

    The file is analyzed with the "prebuilt-invoice" model through the
    module-level ``client``. Only the first document in the result is used.

    Args:
        file_path: Path of the invoice file to analyze.

    Returns:
        An ``ExtractedInvoice``. Fields missing from the analysis result are
        ``None``. When the result contains no document at all, every scalar
        field is ``None`` and ``line_items`` is empty. Invoice-level amounts
        are ``Decimal``; line-item quantities, prices and amounts are plain
        numbers so the items remain JSON-serialisable.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(file_path, "rb") as f:
        poller = client.begin_analyze_document("prebuilt-invoice", f)

    result = poller.result()

    # An empty mapping lets every lookup below fall through to None.
    fields = result.documents[0].fields if result.documents else {}

    line_items = []
    for item in _invoice_field_value(fields, "Items") or []:
        item_fields = item.value or {}
        line_items.append({
            "description": _invoice_field_value(item_fields, "Description"),
            "quantity": _invoice_field_value(item_fields, "Quantity"),
            "unit_price": _currency_number(
                _invoice_field_value(item_fields, "UnitPrice")
            ),
            "amount": _currency_number(
                _invoice_field_value(item_fields, "Amount")
            ),
        })

    return ExtractedInvoice(
        vendor_name=_invoice_field_value(fields, "VendorName"),
        invoice_number=_invoice_field_value(fields, "InvoiceId"),
        invoice_date=_date_text(fields, "InvoiceDate"),
        due_date=_date_text(fields, "DueDate"),
        total_amount=_currency_decimal(
            _invoice_field_value(fields, "InvoiceTotal")
        ),
        tax_amount=_currency_decimal(_invoice_field_value(fields, "TotalTax")),
        line_items=line_items,
    )

# Usage
# Analyze a local file named "invoice.pdf" and print a short summary of
# what was extracted.
invoice = extract_invoice("invoice.pdf")
print(f"Vendor: {invoice.vendor_name}")
print(f"Invoice #: {invoice.invoice_number}")
# The "$" is a literal prefix; no currency information is read here.
print(f"Total: ${invoice.total_amount}")
print(f"Line Items: {len(invoice.line_items)}")

Layout Analysis for Tables

def extract_tables(file_path: str) -> list[list[list[str]]]:
    """Extract tables from a document.

    The file is analyzed with the "prebuilt-layout" model through the
    module-level ``client``.

    Args:
        file_path: Path of the document to analyze.

    Returns:
        One entry per detected table. Each table is a list of rows and each
        row a list of cell texts, indexed ``[row][column]``. Positions for
        which the result reports no cell hold an empty string, so every
        entry is a ``str`` as the return type promises.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(file_path, "rb") as f:
        poller = client.begin_analyze_document("prebuilt-layout", f)

    result = poller.result()

    tables = []
    for table in result.tables or []:
        # Pre-size the grid because cells are placed by index, not in order.
        grid = [[""] * table.column_count for _ in range(table.row_count)]

        for cell in table.cells:
            grid[cell.row_index][cell.column_index] = cell.content

        tables.append(grid)

    return tables

# Usage
# Extract every table from a local file named "report.pdf" and print each
# one under a 1-based heading.
tables = extract_tables("report.pdf")
for i, table in enumerate(tables):
    print(f"\nTable {i + 1}:")
    for row in table:
        # str() is applied to each cell so that any non-string entry can
        # still be joined.
        print(" | ".join(str(cell) for cell in row))

Building a Custom Model

When pre-built models don’t fit your documents:

from azure.ai.formrecognizer import DocumentModelAdministrationClient

# Module-level administration client used by train_custom_model below. It is
# built from the same `endpoint` and `key` globals as the analysis client.
admin_client = DocumentModelAdministrationClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(key)
)

def train_custom_model(
    training_data_url: str,
    model_id: str,
    description: str
):
    """Build a custom model from labeled documents and report the outcome.

    The build is started through the module-level ``admin_client`` in
    "template" mode, and this call blocks until the operation finishes.

    Args:
        training_data_url: URL of the blob container with the training data.
        model_id: Identifier to give the new model.
        description: Free-text description stored with the model.

    Returns:
        The model object produced by the finished build operation. Its ID,
        creation time and document type names are also printed.
    """
    build_arguments = {
        "build_mode": "template",  # or "neural" for varied documents
        "blob_container_url": training_data_url,
        "model_id": model_id,
        "description": description,
    }

    # Kick off the build and wait for it in a single expression.
    trained = admin_client.begin_build_document_model(**build_arguments).result()

    print(f"Model ID: {trained.model_id}")
    print(f"Created: {trained.created_on}")
    print(f"Doc types: {list(trained.doc_types.keys())}")

    return trained

# Train on your labeled documents in blob storage
# training_url = "https://yourstorage.blob.core.windows.net/training-data?SAS_TOKEN"
# train_custom_model(training_url, "my-custom-model", "Custom purchase orders")

Document Processing Pipeline

import os
from azure.storage.blob import BlobServiceClient
from azure.servicebus import ServiceBusClient, ServiceBusMessage
import json
from enum import Enum

class DocumentType(Enum):
    """Document categories assigned by ``DocumentProcessor.classify_document``.

    The string values are used verbatim: they are reported as
    "document_type" in processing results and form the suffix of the
    routing queue name ("documents-<value>").
    """

    INVOICE = "invoice"
    RECEIPT = "receipt"
    PURCHASE_ORDER = "purchase_order"
    # Returned when none of the classification keywords is found.
    UNKNOWN = "unknown"

class DocumentProcessor:
    """Classify, analyze and route documents.

    A document is first classified by keyword from a layout analysis, then
    analyzed with the model that suits its type, and finally published as a
    JSON message on a Service Bus queue named after the type.
    """

    # Model used for classification and as the fallback for document types
    # that have no specialised pre-built model.
    _LAYOUT_MODEL = "prebuilt-layout"

    def __init__(
        self,
        form_recognizer_client: DocumentAnalysisClient,
        blob_client: BlobServiceClient,
        servicebus_client: ServiceBusClient
    ):
        """Store the service clients used by the processing methods.

        Args:
            form_recognizer_client: Client used to analyze documents.
            blob_client: Blob storage client. It is stored but not used by
                any method of this class.
            servicebus_client: Client used to publish routing messages.
        """
        self.fr_client = form_recognizer_client
        self.blob_client = blob_client
        self.sb_client = servicebus_client

    def _analyze(self, file_path: str, model_id: str):
        """Analyze the file with *model_id* and return the finished result."""
        with open(file_path, "rb") as f:
            poller = self.fr_client.begin_analyze_document(model_id, f)
        return poller.result()

    def _classify_result(self, result) -> DocumentType:
        """Derive the document type from the text of a layout result."""
        # Simple classification based on keywords
        text = " ".join(
            line.content
            for page in result.pages or []
            for line in page.lines or []
        )
        text_lower = text.lower()

        # Order matters: the first keyword found wins.
        if "invoice" in text_lower:
            return DocumentType.INVOICE
        if "receipt" in text_lower:
            return DocumentType.RECEIPT
        if "purchase order" in text_lower or "p.o." in text_lower:
            return DocumentType.PURCHASE_ORDER
        return DocumentType.UNKNOWN

    def classify_document(self, file_path: str) -> DocumentType:
        """Classify document type based on content.

        Args:
            file_path: Path of the document to classify.

        Returns:
            The first matching type among invoice, receipt and purchase
            order, or ``DocumentType.UNKNOWN`` when no keyword is found.
        """
        return self._classify_result(
            self._analyze(file_path, self._LAYOUT_MODEL)
        )

    def process_document(self, file_path: str) -> dict:
        """Process document based on its type.

        Args:
            file_path: Path of the document to process.

        Returns:
            A dict with the keys "document_type" (the enum value),
            "confidence" (of the first analyzed document, or ``None`` when
            the result holds no document), "fields" and "tables".
        """
        layout_result = self._analyze(file_path, self._LAYOUT_MODEL)
        doc_type = self._classify_result(layout_result)

        specialised_models = {
            DocumentType.INVOICE: "prebuilt-invoice",
            DocumentType.RECEIPT: "prebuilt-receipt",
        }
        model_id = specialised_models.get(doc_type)
        if model_id is not None:
            result = self._analyze(file_path, model_id)
        else:
            # The classification pass already ran the layout model, so its
            # result is reused instead of analyzing the file a second time.
            result = layout_result

        return {
            "document_type": doc_type.value,
            "confidence": result.documents[0].confidence if result.documents else None,
            "fields": self._extract_fields(result),
            "tables": self._extract_tables(result)
        }

    def _extract_fields(self, result) -> dict:
        """Extract fields from analysis result.

        Returns:
            A mapping of field name to ``{"value": str, "confidence": ...}``
            for every field whose value is not ``None``. When several
            documents share a field name, the last one wins.
        """
        fields = {}
        for document in result.documents or []:
            for name, field in document.fields.items():
                if field.value is not None:
                    fields[name] = {
                        # Stringified so the result can be dumped as JSON.
                        "value": str(field.value),
                        "confidence": field.confidence
                    }
        return fields

    def _extract_tables(self, result) -> list:
        """Extract tables from analysis result.

        Returns:
            One dict per table with its row count, column count and a flat
            list of cells, each given as row index, column index and content.
        """
        tables = []
        for table in result.tables or []:
            tables.append({
                "rows": table.row_count,
                "columns": table.column_count,
                "cells": [
                    {
                        "row": cell.row_index,
                        "column": cell.column_index,
                        "content": cell.content
                    }
                    for cell in table.cells
                ]
            })
        return tables

    def process_and_route(self, file_path: str, source: str):
        """Process document and route to appropriate queue.

        Args:
            file_path: Path of the document to process.
            source: Caller-supplied label stored in the message as "source".

        Returns:
            The processing result that was sent, including the added
            "source" and "file_name" entries.
        """
        result = self.process_document(file_path)

        # Add metadata
        result["source"] = source
        result["file_name"] = os.path.basename(file_path)

        # Route to appropriate queue based on document type
        queue_name = f"documents-{result['document_type']}"

        with self.sb_client.get_queue_sender(queue_name) as sender:
            message = ServiceBusMessage(json.dumps(result))
            sender.send_messages(message)

        return result

Batch Processing with Azure Functions

import azure.functions as func
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
import json
import os

app = func.FunctionApp()

@app.blob_trigger(
    arg_name="blob",
    path="incoming-documents/{name}",
    connection="AzureWebJobsStorage"
)
@app.queue_output(
    arg_name="output",
    queue_name="processed-documents",
    connection="AzureWebJobsStorage"
)
def process_incoming_document(blob: func.InputStream, output: func.Out[str]):
    """Process documents uploaded to blob storage.

    Runs for blobs under "incoming-documents/", analyzes each with the
    "prebuilt-invoice" model and writes the extracted data as a JSON string
    to the "processed-documents" queue.

    Args:
        blob: The uploaded blob that triggered the function.
        output: Queue output binding that receives the JSON message.

    Raises:
        KeyError: If FORM_RECOGNIZER_ENDPOINT or FORM_RECOGNIZER_KEY is not
            set in the environment.
    """

    client = DocumentAnalysisClient(
        endpoint=os.environ["FORM_RECOGNIZER_ENDPOINT"],
        credential=AzureKeyCredential(os.environ["FORM_RECOGNIZER_KEY"])
    )

    # Analyze document
    # Pass the raw bytes rather than the trigger stream: the analysis call
    # is documented for bytes or a binary file stream, and the Functions
    # input stream is not a regular file object.
    document_bytes = blob.read()
    poller = client.begin_analyze_document("prebuilt-invoice", document_bytes)
    result = poller.result()

    # Extract data
    extracted = {
        "file_name": blob.name,
        "documents": []
    }

    for document in result.documents or []:
        extracted["documents"].append({
            "doc_type": document.doc_type,
            "confidence": document.confidence,
            # Values are stringified so the payload is JSON-serialisable;
            # fields without a value are left out.
            "fields": {
                name: str(field.value)
                for name, field in document.fields.items()
                if field.value is not None
            }
        })

    # Send to output queue
    output.set(json.dumps(extracted))

Validation and Confidence Scoring

@dataclass
class ValidationResult:
    """Outcome of validating an extracted invoice."""

    # True when no issue was found.
    is_valid: bool
    # 0.9 when the invoice is valid, 0.5 otherwise.
    confidence_score: float
    # Human-readable description of each problem, in detection order.
    issues: list[str]


def _money(value):
    """Convert an extracted amount to ``Decimal``; ``None`` if unusable.

    Accepts ``Decimal``, ``int``, ``float``, numeric strings, and objects
    that carry the number in an ``amount`` attribute.
    """
    from decimal import Decimal, InvalidOperation

    number = getattr(value, "amount", value)
    if number is None:
        return None
    if isinstance(number, Decimal):
        return number
    try:
        # Go through str() so 0.1 becomes Decimal("0.1"), not its full
        # binary expansion.
        return Decimal(str(number))
    except (InvalidOperation, ValueError, TypeError):
        return None


def validate_extracted_invoice(invoice: "ExtractedInvoice") -> ValidationResult:
    """Validate extracted invoice data.

    Checks performed:

    * vendor name, invoice number and total amount are present (a total of
      zero counts as present);
    * when line items exist, their summed amounts match the invoice total
      or the total minus tax, within one cent; line items without an amount
      count as zero;
    * the invoice date, if present, is in ``YYYY-MM-DD`` form and is not in
      the future.

    Args:
        invoice: The extracted invoice to check.

    Returns:
        A ``ValidationResult`` listing every issue found.
    """
    from datetime import datetime
    from decimal import Decimal

    issues = []
    total = _money(invoice.total_amount)

    # Required fields
    if not invoice.vendor_name:
        issues.append("Missing vendor name")
    if not invoice.invoice_number:
        issues.append("Missing invoice number")
    if total is None:
        issues.append("Missing total amount")

    # Line item validation
    if invoice.line_items and total is not None:
        zero = Decimal(0)
        line_total = sum(
            (_money(item.get("amount")) or zero for item in invoice.line_items),
            zero,
        )
        tax = _money(invoice.tax_amount) or zero
        tolerance = Decimal("0.01")
        # Line amounts are normally pre-tax while the invoice total includes
        # tax, so a match against either figure is accepted.
        targets = (total, total - tax)
        if all(abs(line_total - target) > tolerance for target in targets):
            issues.append(f"Line items total ({line_total}) doesn't match invoice total ({invoice.total_amount})")

    # Date validation
    if invoice.invoice_date:
        try:
            parsed = datetime.strptime(invoice.invoice_date, "%Y-%m-%d")
        except (ValueError, TypeError):
            issues.append("Invalid invoice date format")
        else:
            if parsed > datetime.now():
                issues.append("Invoice date is in the future")

    return ValidationResult(
        is_valid=not issues,
        confidence_score=0.9 if not issues else 0.5,
        issues=issues
    )

Integration with ERP Systems

class ERPIntegration:
    """Minimal HTTP client that posts extracted invoices to an ERP API."""

    def __init__(self, erp_api_url: str, api_key: str):
        """Store the API base URL and the bearer token used for requests.

        Args:
            erp_api_url: Base URL of the ERP API.
            api_key: Token sent in the ``Authorization: Bearer`` header.
        """
        self.api_url = erp_api_url
        self.api_key = api_key

    def create_invoice_entry(self, invoice: ExtractedInvoice) -> dict:
        """Create invoice entry in ERP system.

        Posts the invoice as JSON to ``<api_url>/invoices`` with the status
        "pending_approval". Missing total and tax amounts are sent as 0.

        Args:
            invoice: The extracted invoice to submit.

        Returns:
            The decoded JSON body of the ERP response.

        Raises:
            requests.HTTPError: If the ERP answers with an error status.
            requests.RequestException: On connection problems or timeout.
        """
        # Imported here because the method depends on `requests` and no
        # module-level import of it is present.
        import requests

        payload = {
            "vendor": invoice.vendor_name,
            "invoice_number": invoice.invoice_number,
            "invoice_date": invoice.invoice_date,
            "due_date": invoice.due_date,
            "total_amount": float(invoice.total_amount) if invoice.total_amount else 0,
            "tax_amount": float(invoice.tax_amount) if invoice.tax_amount else 0,
            "line_items": [
                {
                    "description": item.get("description"),
                    "quantity": item.get("quantity"),
                    "unit_price": item.get("unit_price"),
                    "amount": item.get("amount")
                }
                for item in invoice.line_items
            ],
            "status": "pending_approval"
        }

        response = requests.post(
            # Strip a trailing slash so the path never becomes "//invoices".
            f"{self.api_url.rstrip('/')}/invoices",
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json"
            },
            json=payload,
            # Without a timeout a stalled server would block the caller
            # indefinitely.
            timeout=30
        )
        # Surface 4xx/5xx answers instead of returning an error body as if
        # the entry had been created.
        response.raise_for_status()

        return response.json()

Azure Form Recognizer transforms manual document processing into automated workflows. The combination of pre-built and custom models handles most document types, while confidence scores enable human-in-the-loop validation for edge cases.