Back to Blog
2 min read

Azure Document Intelligence: Extracting Structured Data from Documents

Azure Document Intelligence extracts structured data from forms, invoices, receipts, and other documents. Here’s how to call it from Python.

Document Intelligence Integration

from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult
from azure.core.credentials import AzureKeyCredential

class DocumentProcessor:
    """Convenience wrapper around an Azure Document Intelligence client.

    Each public ``async`` method submits the document bytes to one model
    (``prebuilt-layout``, ``prebuilt-invoice`` or ``prebuilt-receipt``, or a
    caller-supplied model id) and converts the result into plain ``dict`` /
    ``list`` structures.

    The wrapped client is the synchronous one, so the blocking
    submit-and-wait call is run in a worker thread to keep the event loop
    responsive while the analysis is in progress.
    """

    def __init__(self, endpoint: str, key: str):
        """Create the underlying client from an endpoint URL and an API key."""
        self.client = DocumentIntelligenceClient(
            endpoint=endpoint,
            credential=AzureKeyCredential(key),
        )

    # ------------------------------------------------------------------
    # Public analysis entry points
    # ------------------------------------------------------------------

    async def analyze_document(
        self,
        document_bytes: bytes,
        model_id: str = "prebuilt-layout",
        *,
        content_type: str = "application/pdf",
    ) -> dict:
        """Analyze a document with the given model and summarise the result.

        Args:
            document_bytes: Raw bytes of the document.
            model_id: Model to run; defaults to ``"prebuilt-layout"``.
            content_type: Content type forwarded to the client. Defaults to
                ``"application/pdf"`` (the value that used to be hard-coded).

        Returns:
            The dictionary produced by :meth:`parse_result`.
        """
        result = await self._run_analysis(
            model_id, document_bytes, content_type=content_type
        )
        return self.parse_result(result)

    async def extract_invoice(self, invoice_bytes: bytes) -> dict:
        """Extract header fields and line items from an invoice.

        Returns:
            A dict with ``vendor_name``, ``invoice_number``, ``invoice_date``,
            ``due_date``, ``total`` and ``items``; an empty dict when the
            service reports no documents.
        """
        result = await self._run_analysis("prebuilt-invoice", invoice_bytes)

        if not result.documents:
            return {}

        # Only the first detected document is reported.
        invoice = result.documents[0]
        return {
            "vendor_name": self.get_field_value(invoice, "VendorName"),
            "invoice_number": self.get_field_value(invoice, "InvoiceId"),
            "invoice_date": self.get_field_value(invoice, "InvoiceDate"),
            "due_date": self.get_field_value(invoice, "DueDate"),
            "total": self.get_field_value(invoice, "InvoiceTotal"),
            "items": self.extract_line_items(invoice),
        }

    async def extract_receipt(self, receipt_bytes: bytes) -> dict:
        """Extract merchant, date, total and purchased items from a receipt.

        Returns:
            A dict with ``merchant_name``, ``transaction_date``, ``total`` and
            ``items``; an empty dict when the service reports no documents.
        """
        result = await self._run_analysis("prebuilt-receipt", receipt_bytes)

        if not result.documents:
            return {}

        # Only the first detected document is reported.
        receipt = result.documents[0]
        return {
            "merchant_name": self.get_field_value(receipt, "MerchantName"),
            "transaction_date": self.get_field_value(receipt, "TransactionDate"),
            "total": self.get_field_value(receipt, "Total"),
            "items": self.extract_items(receipt),
        }

    async def extract_tables(self, document_bytes: bytes) -> list[dict]:
        """Extract every table found by the ``prebuilt-layout`` model.

        Returns:
            One dict per table with ``rows``, ``columns`` and a ``cells`` list
            (``row``, ``column``, ``content``, ``is_header``). Empty when the
            result carries no tables.
        """
        result = await self._run_analysis("prebuilt-layout", document_bytes)
        return self._tables_to_dicts(result.tables)

    # ------------------------------------------------------------------
    # Result conversion
    # ------------------------------------------------------------------

    def parse_result(self, result: "AnalyzeResult") -> dict:
        """Convert an analysis result into a plain dictionary.

        Returns:
            ``{"content": ..., "page_count": ..., "tables": ...}`` where
            ``tables`` has the same shape as :meth:`extract_tables` output.
        """
        return {
            "content": result.content,
            "page_count": len(result.pages or []),
            "tables": self._tables_to_dicts(result.tables),
        }

    def get_field_value(self, document, field_name: str):
        """Return the text content of a named field, or ``None`` if absent.

        ``document`` may be an analyzed document (fields under ``.fields``) or
        an object-valued field such as a line item (fields under
        ``.value_object``, or ``.value`` in the older SDK shape).
        """
        field = self._named_fields(document).get(field_name)
        if field is None:
            return None
        if hasattr(field, "content"):
            return field.content
        # Fallback for field objects that only expose a parsed ``value``.
        return getattr(field, "value", None)

    def extract_line_items(self, invoice) -> list[dict]:
        """Extract line items (description, quantity, unit price, amount)
        from an invoice document's ``Items`` field."""
        return self._collect_items(
            invoice,
            {
                "description": "Description",
                "quantity": "Quantity",
                "unit_price": "UnitPrice",
                "amount": "Amount",
            },
        )

    def extract_items(self, receipt) -> list[dict]:
        """Extract purchased items (description, quantity, price, total
        price) from a receipt document's ``Items`` field."""
        return self._collect_items(
            receipt,
            {
                "description": "Description",
                "quantity": "Quantity",
                "price": "Price",
                "total_price": "TotalPrice",
            },
        )

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    async def _run_analysis(self, model_id: str, document_bytes: bytes, **kwargs):
        """Submit the document and wait for the result off the event loop."""
        import asyncio  # function-scope import keeps the class self-contained

        def _submit_and_wait():
            poller = self.client.begin_analyze_document(
                model_id, document_bytes, **kwargs
            )
            return poller.result()

        # The client is synchronous; waiting inline would stall every other
        # coroutine for the whole duration of the analysis.
        return await asyncio.to_thread(_submit_and_wait)

    @staticmethod
    def _tables_to_dicts(tables) -> list[dict]:
        """Convert table objects to plain dicts; ``None`` yields ``[]``."""
        return [
            {
                "rows": table.row_count,
                "columns": table.column_count,
                "cells": [
                    {
                        "row": cell.row_index,
                        "column": cell.column_index,
                        "content": cell.content,
                        "is_header": cell.kind == "columnHeader",
                    }
                    for cell in table.cells
                ],
            }
            for table in tables or []
        ]

    @staticmethod
    def _named_fields(container) -> dict:
        """Return the name -> field mapping carried by ``container``.

        Looks at ``fields``, then ``value_object``, then ``value`` and returns
        the first one that behaves like a mapping; ``{}`` if none does.
        """
        for attr in ("fields", "value_object", "value"):
            candidate = getattr(container, attr, None)
            if candidate is not None and hasattr(candidate, "get"):
                return candidate
        return {}

    @staticmethod
    def _array_items(field) -> list:
        """Return the elements of an array-valued field, or ``[]``.

        Looks at ``value_array`` first, then ``value`` (older SDK shape).
        """
        for attr in ("value_array", "value"):
            candidate = getattr(field, attr, None)
            if isinstance(candidate, (list, tuple)):
                return list(candidate)
        return []

    def _collect_items(self, document, columns: dict) -> list[dict]:
        """Build one dict per entry of the ``Items`` field.

        ``columns`` maps output keys to the service field names to read from
        each item.
        """
        items_field = self._named_fields(document).get("Items")
        return [
            {
                out_key: self.get_field_value(item, name)
                for out_key, name in columns.items()
            }
            for item in self._array_items(items_field)
        ]

Document Intelligence automates data extraction from complex business documents.

Michael John Peña

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.