Back to Blog
5 min read

Custom Models in Azure Form Recognizer for Domain-Specific Documents

When prebuilt models don’t fit your specific document types, Azure Form Recognizer lets you train custom models. Let’s explore how to create and use custom models for domain-specific document extraction.

Custom Model Types

# Template models - for fixed layout documents
# Neural models - for varying layouts, better generalization

from azure.ai.formrecognizer import DocumentModelAdministrationClient
from azure.core.credentials import AzureKeyCredential

# Administration client: used to train, compose, copy, and delete custom models.
# NOTE(review): endpoint and key are hard-coded placeholders for illustration —
# in real code load them from configuration or a secret store.
admin_client = DocumentModelAdministrationClient(
    endpoint="https://your-resource.cognitiveservices.azure.com/",
    credential=AzureKeyCredential("your-key")
)

Preparing Training Data

# Training data structure in Azure Blob Storage:
# /training-data/
#   /document1.pdf
#   /document1.pdf.labels.json
#   /document1.pdf.ocr.json  (optional, for pre-labeled)
#   /document2.pdf
#   /document2.pdf.labels.json
#   ...

def _labeled_value(label: str, page: int, text: str, box: list) -> dict:
    """Build one label entry in the Form Recognizer labels.json format."""
    return {
        "label": label,
        "key": None,
        "value": [
            {
                "page": page,
                "text": text,
                "boundingBoxes": [box],
            }
        ],
    }


# Label file format (document1.pdf.labels.json):
LABEL_EXAMPLE = {
    "document": "document1.pdf",
    "labels": [
        _labeled_value("CustomerName", 1, "Contoso Ltd",
                       [100, 200, 300, 200, 300, 220, 100, 220]),
        _labeled_value("InvoiceDate", 1, "2023-02-21",
                       [400, 200, 500, 200, 500, 220, 400, 220]),
    ],
}

Training a Custom Model

import time

def train_custom_model(
    admin_client,
    training_data_url: str,
    model_id: str,
    description: str = "",
    build_mode: str = "template"  # or "neural"
) -> str:
    """Train a custom document model and block until training finishes.

    Args:
        admin_client: DocumentModelAdministrationClient used to start the build.
        training_data_url: SAS URL of the blob container holding the labeled
            training documents.
        model_id: Identifier to assign to the newly trained model.
        description: Optional human-readable description for the model.
        build_mode: "template" (fixed layouts) or "neural" (varying layouts).

    Returns:
        The model ID of the trained model.

    Raises:
        ValueError: If build_mode is not a supported mode.
    """
    # Fail fast on a typo'd mode instead of waiting for the service to reject it.
    if build_mode not in ("template", "neural"):
        raise ValueError(
            f"build_mode must be 'template' or 'neural', got {build_mode!r}"
        )

    # Start training; returns a long-running-operation poller.
    poller = admin_client.begin_build_document_model(
        build_mode=build_mode,
        blob_container_url=training_data_url,
        model_id=model_id,
        description=description
    )

    print(f"Training model {model_id}...")

    # Block until the service finishes building the model.
    model = poller.result()

    print("Model trained successfully!")
    print(f"  Model ID: {model.model_id}")
    print(f"  Created: {model.created_on}")
    print(f"  Doc types: {list(model.doc_types.keys())}")

    return model.model_id

# Train template model (fixed layouts).
# NOTE(review): these calls run synchronously at import time and require a
# valid SAS URL with read/list access to the labeled training container.
template_model_id = train_custom_model(
    admin_client,
    training_data_url="https://storage.blob.core.windows.net/training?sas=...",
    model_id="my-invoice-model-v1",
    description="Custom invoice model for Contoso",
    build_mode="template"
)

# Train neural model (varying layouts) — generalizes better across layout
# variations than template mode, at the cost of longer training time.
neural_model_id = train_custom_model(
    admin_client,
    training_data_url="https://storage.blob.core.windows.net/training?sas=...",
    model_id="my-contract-model-v1",
    description="Custom contract model with neural extraction",
    build_mode="neural"
)

Using Custom Models

from azure.ai.formrecognizer import DocumentAnalysisClient

# Analysis client: used at inference time to run trained models over documents.
# NOTE(review): endpoint/key are placeholders — load from config or a secret
# store in real code.
analysis_client = DocumentAnalysisClient(
    endpoint="https://your-resource.cognitiveservices.azure.com/",
    credential=AzureKeyCredential("your-key")
)

def analyze_with_custom_model(
    file_path: str,
    model_id: str
) -> dict:
    """Run a custom model over a local document and collect its fields.

    Returns a mapping of field name -> {"value", "confidence", "content"}.
    Uses the module-level ``analysis_client``.
    """
    # Submit the document; the stream is consumed during the begin call, so the
    # file can be closed before polling for the result.
    with open(file_path, "rb") as doc_stream:
        poller = analysis_client.begin_analyze_document(
            model_id=model_id,
            document=doc_stream
        )

    analysis = poller.result()

    extracted = {}
    for document in analysis.documents:
        print(f"Document type: {document.doc_type}")
        print(f"Confidence: {document.confidence:.2%}")

        for name, field in document.fields.items():
            extracted[name] = {
                "value": field.value,
                "confidence": field.confidence,
                "content": field.content
            }
            print(f"  {name}: {field.value} ({field.confidence:.2%})")

    return extracted

# Use the model.
# NOTE(review): requires "new_invoice.pdf" to exist locally and the model
# "my-invoice-model-v1" to already be trained on this resource.
data = analyze_with_custom_model("new_invoice.pdf", "my-invoice-model-v1")

Composed Models

Combine multiple models for different document types:

def create_composed_model(
    admin_client,
    model_ids: list,
    composed_model_id: str,
    description: str = ""
) -> str:
    """Create a composed model that routes documents to component models.

    Args:
        admin_client: DocumentModelAdministrationClient used for composition.
        model_ids: IDs of the trained component models (must be non-empty).
        composed_model_id: ID to assign to the composed model.
        description: Optional human-readable description.

    Returns:
        The composed model's ID.

    Raises:
        ValueError: If model_ids is empty.
    """
    # Fail fast locally instead of round-tripping an obviously invalid request.
    if not model_ids:
        raise ValueError("model_ids must contain at least one model ID")

    poller = admin_client.begin_compose_document_model(
        component_model_ids=model_ids,
        model_id=composed_model_id,
        description=description
    )

    # Block until the service finishes composing.
    model = poller.result()

    print(f"Composed model created: {model.model_id}")
    print(f"Component models: {len(model_ids)}")

    return model.model_id

# Compose models for different document types.
# NOTE(review): these component IDs differ from the models trained earlier in
# this file ("my-invoice-model-v1", "my-contract-model-v1") — confirm that
# models with these exact IDs actually exist on the resource.
composed_id = create_composed_model(
    admin_client,
    model_ids=["invoice-model-v1", "receipt-model-v1", "contract-model-v1"],
    composed_model_id="all-documents-v1",
    description="Handles invoices, receipts, and contracts"
)

# The composed model automatically routes each document to the best-matching
# component model at analysis time.

Model Management

class ModelManager:
    """Manage custom document models on a Form Recognizer resource."""

    def __init__(self, admin_client):
        # admin_client: DocumentModelAdministrationClient for the source resource.
        self.client = admin_client

    def list_models(self) -> list:
        """Return a summary (id, description, created) for every custom model."""
        return [
            {
                "id": m.model_id,
                "description": m.description,
                "created": m.created_on
            }
            for m in self.client.list_document_models()
        ]

    def get_model_info(self, model_id: str) -> dict:
        """Return detailed info for one model, including per-field confidence."""
        model = self.client.get_document_model(model_id)

        return {
            "id": model.model_id,
            "description": model.description,
            "created": model.created_on,
            "doc_types": {
                name: {
                    "fields": list(doc_type.field_schema.keys()),
                    "field_confidence": doc_type.field_confidence
                }
                for name, doc_type in model.doc_types.items()
            }
        }

    def delete_model(self, model_id: str):
        """Delete a model from the resource."""
        self.client.delete_document_model(model_id)
        print(f"Deleted model: {model_id}")

    def copy_model(
        self,
        model_id: str,
        target_resource_id: str,
        target_region: str,
        target_key: str = "target-key"
    ) -> str:
        """Copy a model to another Form Recognizer resource.

        Args:
            model_id: ID of the model to copy.
            target_resource_id: Name of the destination resource (used to build
                its endpoint URL).
            target_region: Destination region.
                NOTE(review): currently unused — kept for interface
                compatibility; confirm whether the endpoint should include it.
            target_key: API key of the destination resource. Previously this
                was hard-coded inside the method; pass the real key explicitly.

        Returns:
            The model ID created on the target resource.
        """
        # Ask the *target* resource for authorization to receive the copy.
        target_client = DocumentModelAdministrationClient(
            endpoint=f"https://{target_resource_id}.cognitiveservices.azure.com/",
            credential=AzureKeyCredential(target_key)
        )

        auth = target_client.get_copy_authorization(
            model_id=f"{model_id}-copy",
            description=f"Copy of {model_id}"
        )

        # Copy from the source resource into the authorized target slot.
        poller = self.client.begin_copy_document_model_to(
            model_id=model_id,
            target=auth
        )

        return poller.result().model_id

Evaluation and Improvement

class ModelEvaluator:
    """Evaluate a custom model against labeled test documents."""

    def __init__(self, analysis_client, model_id: str):
        # analysis_client: DocumentAnalysisClient used to run the model.
        self.client = analysis_client
        self.model_id = model_id

    def evaluate_on_test_set(
        self,
        test_files: list,
        ground_truth: dict
    ) -> dict:
        """Run the model over *test_files* and score fields against *ground_truth*.

        ground_truth maps file path -> {field name: expected value}.
        Returns totals, per-field accuracy, and confidence statistics.
        """
        report = {
            "total": len(test_files),
            "field_accuracy": {},
            "avg_confidence": []
        }

        for path in test_files:
            analysis = self._analyze(path)
            truth = ground_truth.get(path, {})

            for document in analysis.documents:
                report["avg_confidence"].append(document.confidence)
                self._score_fields(document.fields, truth,
                                   report["field_accuracy"])

        # Convert raw correct/total counts into per-field accuracy ratios.
        for stats in report["field_accuracy"].values():
            stats["accuracy"] = (
                stats["correct"] / stats["total"] if stats["total"] > 0 else 0
            )

        confidences = report["avg_confidence"]
        report["overall_confidence"] = (
            sum(confidences) / len(confidences) if confidences else 0
        )

        return report

    def _analyze(self, path: str):
        """Submit one local file to the model and wait for the result."""
        with open(path, "rb") as stream:
            poller = self.client.begin_analyze_document(
                model_id=self.model_id,
                document=stream
            )
            return poller.result()

    def _score_fields(self, fields, truth, accuracy):
        """Tally correct/total counts for each extracted field into *accuracy*."""
        for name, field in fields.items():
            stats = accuracy.setdefault(name, {"correct": 0, "total": 0})
            stats["total"] += 1
            if field.value == truth.get(name):
                stats["correct"] += 1

# Usage.
# NOTE(review): requires the listed test PDFs to exist locally and the model
# "my-invoice-model-v1" to already be trained on the resource.
evaluator = ModelEvaluator(analysis_client, "my-invoice-model-v1")
metrics = evaluator.evaluate_on_test_set(
    test_files=["test1.pdf", "test2.pdf"],
    ground_truth={
        "test1.pdf": {"CustomerName": "Contoso", "Amount": "100.00"},
        "test2.pdf": {"CustomerName": "Fabrikam", "Amount": "250.00"}
    }
)
print(f"Overall confidence: {metrics['overall_confidence']:.2%}")

Best Practices

  1. Use 5+ training samples: More samples improve accuracy
  2. Include edge cases: Train on variations
  3. Validate with held-out data: Don’t overfit
  4. Version your models: Track improvements
  5. Monitor in production: Track confidence scores
  6. Retrain periodically: Document formats evolve

Resources

Michael John Peña

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.