Skip to content
Back to Blog
1 min read

Azure Document Intelligence: Extracting Structured Data from Documents

I wrote “Azure Document Intelligence: Extracting Structured Data from Documents” to share practical, production-minded guidance on this topic.

Document Intelligence Integration

import asyncio

from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult
from azure.core.credentials import AzureKeyCredential

class DocumentProcessor:
    """Extract plain-dict data from documents via Azure Document Intelligence.

    Each public ``async`` method submits raw document bytes to the service
    with a given model and converts the response into built-in Python types.
    The wrapped client is synchronous, so the blocking request/poll cycle is
    run in a worker thread to avoid stalling the event loop.
    """

    # Output key -> service field name for each entry of an invoice's "Items".
    _INVOICE_ITEM_FIELDS = {
        "description": "Description",
        "quantity": "Quantity",
        "unit_price": "UnitPrice",
        "amount": "Amount",
    }

    # Output key -> service field name for each entry of a receipt's "Items".
    # NOTE(review): names follow the prebuilt-receipt field schema; confirm
    # against the API version in use.
    _RECEIPT_ITEM_FIELDS = {
        "description": "Description",
        "quantity": "Quantity",
        "price": "Price",
        "total_price": "TotalPrice",
    }

    def __init__(self, endpoint: str, key: str):
        """Create the underlying client from an endpoint URL and an API key."""
        self.client = DocumentIntelligenceClient(
            endpoint=endpoint,
            credential=AzureKeyCredential(key),
        )

    def _run_analysis(self, model_id: str, document_bytes: bytes, content_type: str):
        """Start an analysis and block until the service returns its result."""
        poller = self.client.begin_analyze_document(
            model_id,
            document_bytes,
            content_type=content_type,
        )
        return poller.result()

    async def _analyze(
        self,
        model_id: str,
        document_bytes: bytes,
        content_type: str = "application/octet-stream",
    ):
        """Run ``_run_analysis`` in a worker thread and await its result.

        NOTE(review): the explicit binary content type is sent because the
        body is raw bytes rather than a JSON request; confirm the default
        against the installed SDK version.
        """
        return await asyncio.to_thread(
            self._run_analysis, model_id, document_bytes, content_type
        )

    async def analyze_document(
        self,
        document_bytes: bytes,
        model_id: str = "prebuilt-layout",
        content_type: str = "application/pdf",
    ) -> dict:
        """Analyze a document with the given model and summarize the result.

        Args:
            document_bytes: Raw bytes of the document.
            model_id: Identifier of the model to run.
            content_type: MIME type sent with the bytes. Defaults to PDF;
                pass another type (for example ``image/jpeg``) for other
                inputs.

        Returns:
            The dict produced by :meth:`parse_result`.
        """
        result = await self._analyze(model_id, document_bytes, content_type)
        return self.parse_result(result)

    async def extract_invoice(self, invoice_bytes: bytes) -> dict:
        """Extract header fields and line items using ``prebuilt-invoice``.

        Returns:
            A dict with ``vendor_name``, ``invoice_number``, ``invoice_date``,
            ``due_date``, ``total`` and ``items``; an empty dict when the
            result contains no documents.
        """
        result = await self._analyze("prebuilt-invoice", invoice_bytes)
        if not result.documents:
            return {}

        # Only the first recognized document is reported.
        invoice = result.documents[0]
        return {
            "vendor_name": self.get_field_value(invoice, "VendorName"),
            "invoice_number": self.get_field_value(invoice, "InvoiceId"),
            "invoice_date": self.get_field_value(invoice, "InvoiceDate"),
            "due_date": self.get_field_value(invoice, "DueDate"),
            "total": self.get_field_value(invoice, "InvoiceTotal"),
            "items": self.extract_line_items(invoice),
        }

    async def extract_receipt(self, receipt_bytes: bytes) -> dict:
        """Extract merchant, date, total and items using ``prebuilt-receipt``.

        Returns:
            A dict with ``merchant_name``, ``transaction_date``, ``total`` and
            ``items``; an empty dict when the result contains no documents.
        """
        result = await self._analyze("prebuilt-receipt", receipt_bytes)
        if not result.documents:
            return {}

        # Only the first recognized document is reported.
        receipt = result.documents[0]
        return {
            "merchant_name": self.get_field_value(receipt, "MerchantName"),
            "transaction_date": self.get_field_value(receipt, "TransactionDate"),
            "total": self.get_field_value(receipt, "Total"),
            "items": self.extract_items(receipt),
        }

    async def extract_tables(self, document_bytes: bytes) -> list[dict]:
        """Extract every table found by ``prebuilt-layout``.

        Returns:
            One dict per table with ``rows``, ``columns`` and ``cells``; an
            empty list when the result has no tables.
        """
        result = await self._analyze("prebuilt-layout", document_bytes)
        return self._tables_to_dicts(result.tables)

    def parse_result(self, result) -> dict:
        """Convert an analysis result into a dict of built-in types.

        Returns:
            A dict with the result's ``content``, the number of entries in
            its ``pages`` (``page_count``) and its ``tables`` in the same
            shape as :meth:`extract_tables`. Missing attributes are treated
            as absent rather than raising.
        """
        pages = getattr(result, "pages", None) or []
        return {
            "content": getattr(result, "content", None),
            "page_count": len(pages),
            "tables": self._tables_to_dicts(getattr(result, "tables", None)),
        }

    @staticmethod
    def _tables_to_dicts(tables) -> list[dict]:
        """Convert table objects to dicts; ``None`` yields an empty list."""
        converted = []
        for table in tables or []:
            cells = [
                {
                    "row": cell.row_index,
                    "column": cell.column_index,
                    "content": cell.content,
                    "is_header": cell.kind == "columnHeader",
                }
                for cell in (table.cells or [])
            ]
            converted.append(
                {
                    "rows": table.row_count,
                    "columns": table.column_count,
                    "cells": cells,
                }
            )
        return converted

    @staticmethod
    def _field_map(container):
        """Return the name -> field mapping carried by ``container``.

        A document exposes it as ``fields``; an object-valued field (such as
        one line item) exposes it as ``value_object``. ``value`` is accepted
        as a fallback for field objects that use that attribute name.
        """
        for attr in ("fields", "value_object", "value"):
            candidate = getattr(container, attr, None)
            if candidate is not None and hasattr(candidate, "get"):
                return candidate
        return {}

    @staticmethod
    def _array_items(field):
        """Return the elements of an array-valued field, or an empty list."""
        for attr in ("value_array", "value"):
            candidate = getattr(field, attr, None)
            if candidate:
                return candidate
        return []

    def _collect_items(self, document, key_map: dict) -> list[dict]:
        """Build one dict per entry of ``document``'s "Items" field.

        ``key_map`` maps each output key to the service field name whose
        value is read from the entry.
        """
        items_field = self._field_map(document).get("Items")
        if items_field is None:
            return []

        collected = []
        for entry in self._array_items(items_field):
            collected.append(
                {
                    out_key: self.get_field_value(entry, source_name)
                    for out_key, source_name in key_map.items()
                }
            )
        return collected

    def get_field_value(self, document, field_name: str):
        """Return the text of a named field, or ``None`` when unavailable.

        Args:
            document: A document, or an object-valued field, to look in.
            field_name: Name of the field to read.

        Returns:
            The field's ``content`` when it has that attribute, otherwise its
            ``value`` attribute if present; ``None`` when the field (or the
            whole field mapping) is missing.
        """
        field = self._field_map(document).get(field_name)
        if field is None:
            return None
        if hasattr(field, "content"):
            return field.content
        return getattr(field, "value", None)

    def extract_line_items(self, invoice) -> list[dict]:
        """Extract line items from an invoice document.

        Returns:
            One dict per item with ``description``, ``quantity``,
            ``unit_price`` and ``amount``; an empty list when the invoice has
            no "Items" field or it is empty.
        """
        return self._collect_items(invoice, self._INVOICE_ITEM_FIELDS)

    def extract_items(self, receipt) -> list[dict]:
        """Extract purchased items from a receipt document.

        Returns:
            One dict per item with ``description``, ``quantity``, ``price``
            and ``total_price``; an empty list when the receipt has no
            "Items" field or it is empty.
        """
        return self._collect_items(receipt, self._RECEIPT_ITEM_FIELDS)

Document Intelligence automates data extraction from complex business documents.

## Takeaways

Add a concise, personal takeaway and recommended next steps here.

Michael John Peña

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.