Back to Blog
6 min read

Building Document Intelligence Solutions with Azure Form Recognizer

Introduction

Azure Form Recognizer (now part of Azure AI Document Intelligence) enables automated extraction of text, key-value pairs, tables, and structures from documents. This is essential for automating document-heavy business processes.

Form Recognizer Capabilities

Pre-built Models

  • Invoice: Extract vendor, dates, line items, totals
  • Receipt: Extract merchant, transaction details
  • ID Document: Extract identity information
  • Business Card: Extract contact information
  • W-2 Tax Forms: Extract tax document data

Custom Models

  • Train on your specific document types
  • Layout-based or neural training options

Getting Started

Installation

pip install azure-ai-formrecognizer

Basic Setup

import os
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential

# Initialize client
# Endpoint and key come from environment variables so credentials are never
# hard-coded into source.
endpoint = os.getenv("AZURE_FORM_RECOGNIZER_ENDPOINT")
key = os.getenv("AZURE_FORM_RECOGNIZER_KEY")

# NOTE(review): os.getenv returns None when a variable is unset —
# AzureKeyCredential(None) would fail here; consider validating both values
# before constructing the client.
client = DocumentAnalysisClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(key)
)

Analyzing Documents

General Document Analysis

def analyze_document(document_url: str) -> dict:
    """Analyze a document with the prebuilt-document model and extract content.

    Args:
        document_url: URL of the document, reachable by the service.

    Returns:
        dict with keys:
            content: full extracted text,
            pages: per-page size metadata and line text,
            tables: table dimensions and cell contents,
            key_value_pairs: detected key/value pairs with confidence.

    Raises:
        azure.core.exceptions.HttpResponseError: if the service rejects
            the request (bad URL, unsupported format, auth failure, ...).
    """
    # begin_* starts a long-running operation; result() blocks until the
    # service finishes the analysis.
    poller = client.begin_analyze_document_from_url(
        "prebuilt-document",
        document_url
    )
    result = poller.result()

    analysis = {
        "content": result.content,
        "pages": [],
        "tables": [],
        "key_value_pairs": []
    }

    # FIX: the SDK reports absent sections as None rather than an empty
    # list, so every collection is guarded with `or []` before iterating
    # (the original raised TypeError on documents with no tables or no
    # key-value pairs).
    for page in result.pages or []:
        analysis["pages"].append({
            "page_number": page.page_number,
            "width": page.width,
            "height": page.height,
            "lines": [line.content for line in page.lines] if page.lines else []
        })

    for table in result.tables or []:
        analysis["tables"].append({
            "row_count": table.row_count,
            "column_count": table.column_count,
            "cells": [
                {
                    "row": cell.row_index,
                    "column": cell.column_index,
                    "content": cell.content,
                    "is_header": cell.kind == "columnHeader"
                }
                for cell in table.cells
            ]
        })

    for kv_pair in result.key_value_pairs or []:
        # Either side of a pair can be missing (e.g. a label with no
        # detected value); keep only complete pairs.
        if kv_pair.key and kv_pair.value:
            analysis["key_value_pairs"].append({
                "key": kv_pair.key.content,
                "value": kv_pair.value.content,
                "confidence": kv_pair.confidence
            })

    return analysis

# Usage
# NOTE: performs a live service call; the URL must point to a document the
# Form Recognizer endpoint can fetch.
result = analyze_document("https://example.com/document.pdf")
print(f"Found {len(result['key_value_pairs'])} key-value pairs")
print(f"Found {len(result['tables'])} tables")

Invoice Processing

from dataclasses import dataclass
from typing import List, Optional
from datetime import date

@dataclass
class LineItem:
    """A single line item extracted from an invoice's Items field.

    Every field is Optional because Form Recognizer only returns values
    it could extract from the document.
    """
    description: Optional[str]
    quantity: Optional[float]
    unit_price: Optional[float]
    amount: Optional[float]

@dataclass
class Invoice:
    """Structured invoice data extracted by the prebuilt-invoice model.

    All scalar fields are Optional: the service returns only the fields
    it detected in the source document.
    """
    vendor_name: Optional[str]
    vendor_address: Optional[str]
    invoice_id: Optional[str]
    invoice_date: Optional[date]
    due_date: Optional[date]
    subtotal: Optional[float]
    tax: Optional[float]
    total: Optional[float]
    # One entry per detected line item; empty list when none were found.
    line_items: List[LineItem]

def process_invoice(invoice_url: str) -> Invoice:
    """Process an invoice and extract structured data.

    Args:
        invoice_url: URL of the invoice document (PDF or image).

    Returns:
        Invoice populated with whatever fields the service detected;
        undetected fields are None.

    Raises:
        ValueError: if the service found no invoice in the document.
    """
    poller = client.begin_analyze_document_from_url(
        "prebuilt-invoice",
        invoice_url
    )
    result = poller.result()

    if not result.documents:
        raise ValueError("No invoice found in document")

    invoice_doc = result.documents[0]
    fields = invoice_doc.fields

    def get_value(field_name: str, source=None, default=None):
        """Return a field's plain value from `source` (top-level invoice
        fields by default), or `default` when the field is missing.

        Compares against None rather than truthiness so legitimate zero
        values (e.g. quantity 0) are not dropped.
        """
        field = (fields if source is None else source).get(field_name)
        if field is not None and field.value is not None:
            return field.value
        return default

    def get_currency_value(field_name: str, source=None):
        """Return the numeric amount of a currency-typed field, or None."""
        field = (fields if source is None else source).get(field_name)
        if field is not None and field.value is not None:
            return field.value.amount
        return None

    # Extract line items.
    # BUG FIX: the original called get_value("Description", item_fields),
    # passing the item's sub-fields as the *default* argument — so every
    # lookup hit the top-level invoice fields and line items came back
    # empty. The helpers now take an explicit source mapping.
    line_items = []
    items_field = fields.get("Items")
    if items_field and items_field.value:
        for item in items_field.value:
            item_fields = item.value or {}
            line_items.append(LineItem(
                description=get_value("Description", item_fields),
                quantity=get_value("Quantity", item_fields),
                unit_price=get_currency_value("UnitPrice", item_fields),
                amount=get_currency_value("Amount", item_fields)
            ))

    return Invoice(
        vendor_name=get_value("VendorName"),
        vendor_address=str(get_value("VendorAddress")) if get_value("VendorAddress") else None,
        invoice_id=get_value("InvoiceId"),
        invoice_date=get_value("InvoiceDate"),
        due_date=get_value("DueDate"),
        subtotal=get_currency_value("SubTotal"),
        tax=get_currency_value("TotalTax"),
        total=get_currency_value("InvoiceTotal"),
        line_items=line_items
    )

# Usage
# NOTE: live service call — requires the client configured above.
invoice = process_invoice("https://example.com/invoice.pdf")
print(f"Invoice from: {invoice.vendor_name}")
print(f"Total: ${invoice.total}")
print(f"Line items: {len(invoice.line_items)}")

Receipt Processing

@dataclass
class Receipt:
    """Structured receipt data extracted by the prebuilt-receipt model.

    All scalar fields are Optional — the service returns only fields it
    detected. `items` holds one dict of sub-field values per line item.
    """
    merchant_name: Optional[str]
    merchant_address: Optional[str]
    transaction_date: Optional[date]
    transaction_time: Optional[str]
    items: List[dict]
    subtotal: Optional[float]
    tax: Optional[float]
    total: Optional[float]

def process_receipt(receipt_url: str) -> Receipt:
    """Process a receipt and extract transaction data.

    Args:
        receipt_url: URL of the receipt document (image or PDF).

    Returns:
        Receipt populated with whatever fields the service detected;
        undetected fields are None.

    Raises:
        ValueError: if the service found no receipt in the document.
    """
    poller = client.begin_analyze_document_from_url(
        "prebuilt-receipt",
        receipt_url
    )
    result = poller.result()

    if not result.documents:
        raise ValueError("No receipt found")

    receipt_doc = result.documents[0]
    fields = receipt_doc.fields

    def get_field_value(name: str):
        """Return a top-level field's value, or None when absent."""
        field = fields.get(name)
        if field is not None:
            return field.value
        return None

    # Extract items: each item's value is a mapping of sub-fields
    # (Description, Quantity, TotalPrice, ...).
    items = []
    items_field = fields.get("Items")
    if items_field and items_field.value:
        for item in items_field.value:
            item_dict = {}
            if item.value:
                for key, val in item.value.items():
                    # BUG FIX: compare against None instead of truthiness
                    # so legitimate zero values (quantity 0, price 0.00)
                    # are kept rather than silently dropped.
                    if val is not None and val.value is not None:
                        item_dict[key] = val.value
            items.append(item_dict)

    return Receipt(
        merchant_name=get_field_value("MerchantName"),
        merchant_address=str(get_field_value("MerchantAddress")) if get_field_value("MerchantAddress") else None,
        transaction_date=get_field_value("TransactionDate"),
        transaction_time=get_field_value("TransactionTime"),
        items=items,
        subtotal=get_field_value("Subtotal"),
        tax=get_field_value("TotalTax"),
        total=get_field_value("Total")
    )

# Usage
# NOTE: live service call — requires the client configured above.
receipt = process_receipt("https://example.com/receipt.jpg")
print(f"Store: {receipt.merchant_name}")
print(f"Total: ${receipt.total}")

Building a Document Processing Pipeline

from enum import Enum
from typing import Union
import asyncio

class DocumentType(Enum):
    """Coarse document categories the processing pipeline routes on."""
    INVOICE = "invoice"
    RECEIPT = "receipt"
    GENERAL = "general"
    # Reserved; classify_document currently never returns UNKNOWN.
    UNKNOWN = "unknown"

class DocumentProcessor:
    """Routes documents to the appropriate extraction model by content."""

    def __init__(self, client: DocumentAnalysisClient):
        self.client = client

    def classify_document(self, document_url: str) -> DocumentType:
        """Classify document type based on content keywords.

        Runs a prebuilt-document analysis and inspects the raw text.
        Invoice keywords are checked first because invoices frequently
        contain receipt-like words such as "total".
        """
        poller = self.client.begin_analyze_document_from_url(
            "prebuilt-document",
            document_url
        )
        result = poller.result()

        # Guard against a None content attribute on empty results.
        content_lower = (result.content or "").lower()

        if any(word in content_lower for word in ["invoice", "bill to", "invoice number"]):
            return DocumentType.INVOICE
        elif any(word in content_lower for word in ["receipt", "total", "thank you for"]):
            return DocumentType.RECEIPT
        else:
            return DocumentType.GENERAL

    def process(self, document_url: str) -> Union[Invoice, Receipt, dict]:
        """Process a document with the extractor matching its type.

        NOTE: classification itself costs one prebuilt-document analysis,
        so typed documents are analyzed twice in total.
        """
        doc_type = self.classify_document(document_url)

        if doc_type == DocumentType.INVOICE:
            return process_invoice(document_url)
        elif doc_type == DocumentType.RECEIPT:
            return process_receipt(document_url)
        else:
            return analyze_document(document_url)

    async def process_batch(self, document_urls: List[str]) -> List[dict]:
        """Process multiple documents concurrently.

        BUG FIX: the original coroutine awaited nothing and processed the
        URLs strictly sequentially despite its docstring. Each blocking
        `process` call is now offloaded to a worker thread and the batch
        is driven by asyncio.gather, so documents run in parallel. Result
        order matches the input order, and per-document failures are
        captured as error entries instead of aborting the batch.
        """
        async def _process_one(url: str) -> dict:
            try:
                data = await asyncio.to_thread(self.process, url)
                return {"url": url, "status": "success", "data": data}
            except Exception as e:
                return {"url": url, "status": "error", "error": str(e)}

        return list(await asyncio.gather(*(_process_one(u) for u in document_urls)))

# Usage
processor = DocumentProcessor(client)

# Single document — classification followed by type-specific extraction.
result = processor.process("https://example.com/document.pdf")

# Batch processing — asyncio.run drives the async batch entry point.
urls = [
    "https://example.com/invoice1.pdf",
    "https://example.com/receipt1.jpg",
    "https://example.com/contract.pdf"
]
results = asyncio.run(processor.process_batch(urls))

Combining with Azure OpenAI

Enhance extracted data with GPT-4 for additional insights:

from langchain_openai import AzureChatOpenAI

def analyze_invoice_with_ai(invoice: Invoice) -> dict:
    """Use GPT-4 to provide insights on invoice data.

    Flattens the structured invoice fields into a text summary, sends an
    analysis prompt to the Azure OpenAI deployment, and returns both the
    original invoice and the model's free-text assessment.
    """
    # Azure OpenAI connection settings come from the environment, matching
    # the Form Recognizer client setup elsewhere in this module.
    model_settings = {
        "azure_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT"),
        "api_key": os.getenv("AZURE_OPENAI_KEY"),
        "deployment_name": "gpt-4",
    }
    chat_model = AzureChatOpenAI(**model_settings)

    summary = f"""
    Invoice Analysis Request:
    - Vendor: {invoice.vendor_name}
    - Invoice ID: {invoice.invoice_id}
    - Date: {invoice.invoice_date}
    - Due Date: {invoice.due_date}
    - Subtotal: ${invoice.subtotal}
    - Tax: ${invoice.tax}
    - Total: ${invoice.total}
    - Line Items: {len(invoice.line_items)}
    """

    request = f"""Analyze this invoice and provide:
1. Payment urgency (based on due date)
2. Any potential issues or anomalies
3. Category suggestions for expense tracking
4. Budget allocation recommendations

{summary}"""

    reply = chat_model.invoke(request)

    return {
        "invoice": invoice,
        "ai_analysis": reply.content,
    }

# Usage
# NOTE: makes two remote calls — Form Recognizer, then Azure OpenAI.
invoice = process_invoice("https://example.com/invoice.pdf")
analysis = analyze_invoice_with_ai(invoice)
print(analysis["ai_analysis"])

Best Practices

Error Handling

from azure.core.exceptions import HttpResponseError

def safe_process_document(document_url: str) -> dict:
    """Process document with comprehensive error handling.

    Returns a {"status": "success", "data": ...} dict on success, or a
    {"status": "error", "error": ...} dict describing the failure —
    callers never see an exception.
    """
    # Friendly messages for the HTTP status codes we expect to see.
    known_errors = {
        400: "Invalid document format",
        404: "Document not found",
    }

    try:
        poller = client.begin_analyze_document_from_url(
            "prebuilt-document",
            document_url
        )
        return {"status": "success", "data": poller.result()}

    except HttpResponseError as e:
        # Unrecognized status codes fall back to the raw error text.
        return {"status": "error", "error": known_errors.get(e.status_code, str(e))}

    except Exception as e:
        # Deliberate catch-all: this wrapper is a best-effort boundary.
        return {"status": "error", "error": f"Unexpected error: {str(e)}"}

Cost Optimization

class CostOptimizedProcessor:
    """Wraps document analysis with per-session page and cost accounting."""

    def __init__(self, client):
        self.client = client
        # Running total of pages analyzed during this session.
        self.page_count = 0
        # Approximate service cost per analyzed page (USD).
        self.cost_per_page = 0.01

    def process_with_tracking(self, document_url: str) -> dict:
        """Analyze a document and report page/cost statistics alongside it."""
        operation = self.client.begin_analyze_document_from_url(
            "prebuilt-document",
            document_url
        )
        analysis = operation.result()

        pages_in_doc = len(analysis.pages)
        self.page_count += pages_in_doc

        return {
            "result": analysis,
            "pages_processed": pages_in_doc,
            "estimated_cost": pages_in_doc * self.cost_per_page,
            "total_pages_session": self.page_count
        }

    def get_session_cost(self) -> float:
        """Total estimated spend for every page processed so far."""
        return self.page_count * self.cost_per_page

Conclusion

Azure Form Recognizer provides powerful document intelligence capabilities for automating document processing workflows. Combined with GPT-4 for additional analysis, you can build sophisticated document understanding solutions that transform how organizations handle paperwork.

Resources

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.