Back to Blog
5 min read

Azure AI Document Intelligence: Extracting Data from Documents

Azure AI Document Intelligence (formerly Form Recognizer) uses AI to extract text, key-value pairs, tables, and structures from documents. It’s essential for automating document processing workflows.

Getting Started

# Install: pip install azure-ai-formrecognizer
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential

# Initialize client
# NOTE: replace the placeholder endpoint/key with your resource's values.
# Prefer loading the key from an environment variable or Azure Key Vault
# rather than hard-coding it in source.
endpoint = "https://your-resource.cognitiveservices.azure.com/"
key = "your-api-key"

client = DocumentAnalysisClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(key)
)

Prebuilt Models

Use prebuilt models for common document types:

# Read model - OCR and layout
def extract_text(file_path: str) -> str:
    """Extract plain text from any document using the prebuilt read model.

    Args:
        file_path: Path to a local document (PDF, image, etc.).

    Returns:
        All recognized lines, one per line, newline-terminated; an empty
        string when the service returns no lines.
    """
    with open(file_path, "rb") as f:
        poller = client.begin_analyze_document("prebuilt-read", f)
    result = poller.result()

    # str.join is linear overall; the original `text += line + "\n"` loop
    # is quadratic in the number of lines.
    lines = [
        line.content
        for page in result.pages
        for line in page.lines
    ]
    # Trailing newline preserves the original line-per-line output format.
    return ("\n".join(lines) + "\n") if lines else ""

# Invoice model
def extract_invoice(file_path: str) -> dict:
    """Extract vendor, customer, dates, total, and line items from an invoice.

    Returns:
        Field values for the LAST document in the analysis result (matching
        the original behavior), or an empty dict when no documents were
        recognized. Missing fields are returned as None.
    """
    with open(file_path, "rb") as f:
        poller = client.begin_analyze_document("prebuilt-invoice", f)
    result = poller.result()

    def value_of(fields, name):
        # `fields.get(name, {}).value` raises AttributeError for a missing
        # field (a plain dict has no .value attribute); return None instead.
        field = fields.get(name)
        return field.value if field is not None else None

    invoice_data = {}
    for doc in result.documents:
        invoice_data = {
            "vendor_name": value_of(doc.fields, "VendorName"),
            "customer_name": value_of(doc.fields, "CustomerName"),
            "invoice_date": value_of(doc.fields, "InvoiceDate"),
            "due_date": value_of(doc.fields, "DueDate"),
            "total": value_of(doc.fields, "InvoiceTotal"),
            "items": [
                {
                    "description": value_of(item.value, "Description"),
                    "quantity": value_of(item.value, "Quantity"),
                    "unit_price": value_of(item.value, "UnitPrice"),
                    "amount": value_of(item.value, "Amount"),
                }
                for item in value_of(doc.fields, "Items") or []
            ],
        }

    return invoice_data

# Receipt model
def extract_receipt(file_path: str) -> dict:
    """Extract merchant, date, total, and line items from a receipt.

    Returns:
        Field values for the FIRST recognized document, or None when the
        service found no documents. Missing fields are returned as None.
    """
    with open(file_path, "rb") as f:
        poller = client.begin_analyze_document("prebuilt-receipt", f)
    result = poller.result()

    def value_of(fields, name):
        # `fields.get(name, {}).value` raises AttributeError for a missing
        # field (a plain dict has no .value attribute); return None instead.
        field = fields.get(name)
        return field.value if field is not None else None

    for doc in result.documents:
        return {
            "merchant_name": value_of(doc.fields, "MerchantName"),
            "transaction_date": value_of(doc.fields, "TransactionDate"),
            "total": value_of(doc.fields, "Total"),
            "items": [
                {
                    "name": value_of(item.value, "Name"),
                    "price": value_of(item.value, "TotalPrice"),
                }
                for item in value_of(doc.fields, "Items") or []
            ],
        }
    # No documents recognized (matches the original's implicit None).
    return None

# ID document model
def extract_id(file_path: str) -> dict:
    """Extract identity fields from an ID document (license, passport, etc.).

    Returns:
        Field values for the FIRST recognized document, or None when the
        service found no documents. Missing fields are returned as None.
    """
    with open(file_path, "rb") as f:
        poller = client.begin_analyze_document("prebuilt-idDocument", f)
    result = poller.result()

    def value_of(fields, name):
        # `fields.get(name, {}).value` raises AttributeError for a missing
        # field (a plain dict has no .value attribute); return None instead.
        field = fields.get(name)
        return field.value if field is not None else None

    for doc in result.documents:
        return {
            "first_name": value_of(doc.fields, "FirstName"),
            "last_name": value_of(doc.fields, "LastName"),
            "document_number": value_of(doc.fields, "DocumentNumber"),
            "date_of_birth": value_of(doc.fields, "DateOfBirth"),
            "expiration_date": value_of(doc.fields, "DateOfExpiration"),
        }
    # No documents recognized (matches the original's implicit None).
    return None

Layout Analysis

Extract document structure:

def analyze_layout(file_path: str) -> dict:
    """Analyze document structure with the prebuilt layout model.

    Returns:
        A dict with "pages" (dimensions, line text, selection-mark counts),
        "tables" (cell grid metadata), and "paragraphs" (content + role).
    """
    with open(file_path, "rb") as handle:
        poller = client.begin_analyze_document("prebuilt-layout", handle)
    result = poller.result()

    pages = [
        {
            "page_number": pg.page_number,
            "width": pg.width,
            "height": pg.height,
            "lines": [ln.content for ln in pg.lines],
            "selection_marks": len(pg.selection_marks or []),
        }
        for pg in result.pages
    ]

    tables = []
    for tbl in result.tables:
        cells = [
            {
                "row": c.row_index,
                "column": c.column_index,
                "content": c.content,
                # Only column headers are flagged; row headers stay False.
                "is_header": c.kind == "columnHeader",
            }
            for c in tbl.cells
        ]
        tables.append({
            "row_count": tbl.row_count,
            "column_count": tbl.column_count,
            "cells": cells,
        })

    paragraphs = [
        {"content": para.content, "role": para.role}
        for para in result.paragraphs
    ]

    return {"pages": pages, "tables": tables, "paragraphs": paragraphs}

Table Extraction

Extract and process tables:

import pandas as pd

def extract_tables_to_dataframes(file_path: str) -> list:
    """Extract every table in a document as a pandas DataFrame.

    A table whose first row consists entirely of column-header cells has
    that row promoted to the DataFrame's column labels.

    Args:
        file_path: Path to a local document.

    Returns:
        A list of DataFrames, one per detected table.
    """
    with open(file_path, "rb") as f:
        poller = client.begin_analyze_document("prebuilt-layout", f)
    result = poller.result()

    dataframes = []

    for table in result.tables:
        # Dense grid; positions the service did not return stay None.
        grid = [[None] * table.column_count for _ in range(table.row_count)]
        for cell in table.cells:
            grid[cell.row_index][cell.column_index] = cell.content

        if not grid:
            continue

        first_row_cells = [c for c in table.cells if c.row_index == 0]
        # all() over an empty sequence is vacuously True, so the original
        # treated a table with no row-0 cells as having a header row;
        # require at least one cell before promoting the row to headers.
        has_header = bool(first_row_cells) and all(
            c.kind == "columnHeader" for c in first_row_cells
        )

        if has_header:
            dataframes.append(pd.DataFrame(grid[1:], columns=grid[0]))
        else:
            dataframes.append(pd.DataFrame(grid))

    return dataframes

# Usage
# NOTE: requires the module-level `client` to point at a live Azure
# resource; this call uploads the PDF to the service for analysis.
tables = extract_tables_to_dataframes("financial_report.pdf")
for i, df in enumerate(tables):
    print(f"Table {i+1}:")
    print(df.to_string())

Custom Models

Train models for your specific documents:

from azure.ai.formrecognizer import DocumentModelAdministrationClient

# Admin client for training
# Reuses the same endpoint/key as the analysis client; the administration
# client exposes model build/management operations rather than analysis.
admin_client = DocumentModelAdministrationClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(key)
)

# Train custom model
def train_custom_model(
    training_data_url: str,
    model_id: str,
    description: str = "",
    build_mode: str = "template",
) -> str:
    """Build a custom document model and block until training completes.

    Args:
        training_data_url: SAS URL of the blob container holding the
            labeled training documents.
        model_id: Identifier to assign to the new model.
        description: Optional human-readable description.
        build_mode: "template" for fixed-layout forms, "neural" for
            varied layouts. Added because begin_build_document_model in
            azure-ai-formrecognizer 3.3+ requires build_mode as its first
            positional argument; the original call omitted it.

    Returns:
        The id of the newly built model.
    """
    poller = admin_client.begin_build_document_model(
        build_mode,
        blob_container_url=training_data_url,
        model_id=model_id,
        description=description,
    )

    model = poller.result()
    return model.model_id

# Use custom model
def extract_with_custom_model(file_path: str, model_id: str) -> dict:
    """Run a trained custom model and collect field values with confidences.

    Returns:
        Mapping of field name -> {"value", "confidence"}. When the result
        contains several documents, later documents overwrite fields of
        the same name from earlier ones.
    """
    with open(file_path, "rb") as document:
        analysis = client.begin_analyze_document(model_id, document)
    outcome = analysis.result()

    extracted: dict = {}
    for doc_result in outcome.documents:
        extracted.update({
            name: {"value": fld.value, "confidence": fld.confidence}
            for name, fld in doc_result.fields.items()
        })

    return extracted

Integration with RAG

Use Document Intelligence to feed RAG systems:

class DocumentProcessor:
    """Process documents for RAG ingestion.

    Runs the prebuilt layout model and splits the result into paragraph
    and table chunks suitable for embedding into a vector store.
    """

    def __init__(self, endpoint: str, key: str):
        """Create an analysis client for the given Azure resource."""
        self.client = DocumentAnalysisClient(
            endpoint=endpoint,
            credential=AzureKeyCredential(key)
        )

    def process_for_rag(self, file_path: str) -> list:
        """Analyze a document and return a list of chunk dicts.

        Each chunk carries "type", "content", "role", and "page" keys;
        "page" is None when the service returns no bounding region.
        """
        with open(file_path, "rb") as f:
            poller = self.client.begin_analyze_document("prebuilt-layout", f)
        result = poller.result()

        chunks = []

        # Paragraph chunks keep the role (title, sectionHeading, ...) the
        # service assigned so downstream retrieval can weight them.
        for para in result.paragraphs:
            chunks.append({
                "type": "paragraph",
                "content": para.content,
                "role": para.role,
                "page": para.bounding_regions[0].page_number if para.bounding_regions else None
            })

        # Tables are flattened to pipe-delimited text so they embed well.
        # (The original used enumerate() here but never used the index.)
        for table in result.tables:
            chunks.append({
                "type": "table",
                "content": self._table_to_text(table),
                "role": "table",
                "page": table.bounding_regions[0].page_number if table.bounding_regions else None
            })

        return chunks

    def _table_to_text(self, table) -> str:
        """Render a table as pipe-delimited lines with a header separator.

        Cells the service did not return are rendered as empty strings;
        the original rendered them as the literal string "None". The two
        identical join branches of the original are also collapsed.
        """
        grid = [[None] * table.column_count for _ in range(table.row_count)]
        for cell in table.cells:
            grid[cell.row_index][cell.column_index] = cell.content

        lines = []
        for i, row in enumerate(grid):
            lines.append(" | ".join("" if c is None else str(c) for c in row))
            if i == 0:
                # Separator under the first (assumed header) row.
                lines.append("-" * 50)

        return "\n".join(lines)

# Usage
processor = DocumentProcessor(endpoint, key)
chunks = processor.process_for_rag("contract.pdf")

# Add to vector store
# NOTE: `get_embedding` and `vector_store` are placeholders for your own
# embedding function and vector database client; they are not defined here.
for chunk in chunks:
    embedding = get_embedding(chunk["content"])
    vector_store.add(chunk["content"], embedding, chunk)

Best Practices

  1. Choose the right model: Prebuilt for common docs, custom for specific formats
  2. Handle confidence scores: Filter low-confidence extractions
  3. Process asynchronously: Use polling for large documents
  4. Validate extracted data: Implement business rules validation
  5. Monitor costs: Track pages processed

Resources

Michael John Peña

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.