8 min read
Custom Named Entity Recognition with Azure AI Language
Introduction
While Azure AI Language provides pre-built named entity recognition for common entity types, many domains require extraction of specialized entities unique to their business. Custom NER in Azure AI Language enables training models to recognize domain-specific entities like product names, medical terms, legal clauses, or any custom category.
Custom NER Architecture
Entity Schema Design
from dataclasses import dataclass, field
from typing import List, Dict, Optional
from enum import Enum
class EntityType(Enum):
    """Kinds of entity definitions supported by the schema builder."""

    SIMPLE = "simple"              # flat, standalone entity category
    COMPOSITE = "composite"        # entity built from sub-entity components
    HIERARCHICAL = "hierarchical"  # NOTE(review): declared but not produced by NERSchemaBuilder below
@dataclass
class EntityDefinition:
    """Definition of one entity category in a custom NER schema."""

    name: str         # entity category name, e.g. "Party"
    description: str  # human-readable description of the category
    entity_type: EntityType = EntityType.SIMPLE
    examples: List[str] = field(default_factory=list)  # seed example strings for the category
    subentities: List['EntityDefinition'] = field(default_factory=list)  # components of a composite entity
@dataclass
class EntityAnnotation:
    """A single labeled entity span inside a document."""

    text: str          # exact surface text of the entity
    entity_type: str   # entity category name the span is labeled with
    start_offset: int  # character offset where the span starts (inclusive)
    end_offset: int    # character offset where the span ends (exclusive, regex match.end() convention)
@dataclass
class AnnotatedDocument:
    """A document's text together with its entity annotations."""

    text: str
    annotations: List[EntityAnnotation]
    document_id: Optional[str] = None  # assigned by the annotation tool, e.g. "doc_0" / "auto_1"
class NERSchemaBuilder:
    """Fluent builder for a custom NER entity schema.

    Collects ``EntityDefinition`` objects; supports validation and export
    for documentation. All ``add_*`` methods return ``self`` for chaining.
    """

    # Minimum number of seed examples expected for a non-composite entity.
    MIN_EXAMPLES = 3

    def __init__(self, project_name: str):
        """Start an empty schema for ``project_name``."""
        self.project_name = project_name
        self.entities: List['EntityDefinition'] = []

    def add_entity(
        self,
        name: str,
        description: str,
        examples: Optional[List[str]] = None,
    ) -> 'NERSchemaBuilder':
        """Add a simple entity type.

        Args:
            examples: Optional seed example strings; defaults to an empty list.
        """
        self.entities.append(EntityDefinition(
            name=name,
            description=description,
            entity_type=EntityType.SIMPLE,
            examples=examples or [],
        ))
        return self

    def add_composite_entity(
        self,
        name: str,
        description: str,
        components: List[str],
    ) -> 'NERSchemaBuilder':
        """Add a composite entity made of the given component entities."""
        subentities = [
            EntityDefinition(name=comp, description=f"Component of {name}")
            for comp in components
        ]
        self.entities.append(EntityDefinition(
            name=name,
            description=description,
            entity_type=EntityType.COMPOSITE,
            subentities=subentities,
        ))
        return self

    def validate_schema(self) -> Dict:
        """Validate the schema.

        Returns:
            Dict with ``valid`` (bool), ``issues`` (list of messages) and
            ``entity_count``.
        """
        issues = []
        if not self.entities:
            issues.append("At least one entity type is required")
        # Flag entity names declared more than once.
        names = [e.name for e in self.entities]
        duplicates = {n for n in names if names.count(n) > 1}
        if duplicates:
            issues.append(f"Duplicate entity names: {duplicates}")
        for entity in self.entities:
            # Composite entities are described by their components, not by
            # examples, so the example minimum only applies to entities
            # without subentities.
            if not entity.subentities and len(entity.examples) < self.MIN_EXAMPLES:
                issues.append(f"Entity '{entity.name}' needs more examples (minimum 3)")
        return {
            "valid": not issues,
            "issues": issues,
            "entity_count": len(self.entities),
        }

    def export_schema(self) -> Dict:
        """Export the schema as a plain dict for documentation."""
        return {
            "project_name": self.project_name,
            "entities": [
                {
                    "name": e.name,
                    "description": e.description,
                    "type": e.entity_type.value,
                    "examples": e.examples,
                }
                for e in self.entities
            ],
        }
# Example: Legal document NER — build the schema with fluent chaining
legal_schema = (
    NERSchemaBuilder("LegalDocumentNER")
    .add_entity(
        "Party",
        "Legal party in a contract",
        ["Acme Corporation", "John Smith", "XYZ Holdings LLC"],
    )
    .add_entity(
        "ContractDate",
        "Date mentioned in contract",
        ["January 1, 2023", "effective date", "termination date"],
    )
    .add_entity(
        "MonetaryAmount",
        "Money amounts in contract",
        ["$10,000", "one million dollars", "USD 5,000.00"],
    )
    .add_entity(
        "LegalClause",
        "Standard legal clause types",
        ["indemnification", "confidentiality", "force majeure"],
    )
)
validation = legal_schema.validate_schema()
print(f"Schema valid: {validation['valid']}")
Training Data Annotation
import json
import re
class NERAnnotationTool:
    """Create, manage, and export NER annotations for a given schema.

    Supports manual annotation (caller names the entity spans by text),
    regex-based auto-annotation, and export to the Azure Language Studio
    labels format.
    """

    def __init__(self, schema: 'NERSchemaBuilder'):
        """Bind the tool to a schema; entity names are used to validate labels."""
        self.schema = schema
        self.documents: List['AnnotatedDocument'] = []
        self.entity_names = [e.name for e in schema.entities]

    def create_annotation(
        self,
        text: str,
        entity_type: str,
        entity_text: str,
        search_start: int = 0,
    ) -> 'EntityAnnotation':
        """Create an annotation by locating ``entity_text`` inside ``text``.

        Args:
            search_start: Character offset at which to begin searching
                (default 0). Pass a later offset to annotate repeated
                occurrences of the same text.

        Raises:
            ValueError: If the entity type is not in the schema, or the
                entity text does not occur at or after ``search_start``.
        """
        if entity_type not in self.entity_names:
            raise ValueError(f"Unknown entity type: {entity_type}")
        match = re.search(re.escape(entity_text), text[search_start:])
        if not match:
            raise ValueError(f"Entity text '{entity_text}' not found in document")
        start = search_start + match.start()
        return EntityAnnotation(
            text=entity_text,
            entity_type=entity_type,
            start_offset=start,
            # The pattern is an escaped literal, so the match length equals
            # len(entity_text).
            end_offset=start + len(entity_text),
        )

    def annotate_document(
        self,
        text: str,
        entities: List[Dict],
    ) -> 'AnnotatedDocument':
        """Annotate a document with multiple entities.

        Repeated mentions of the same entity text are mapped to successive
        occurrences in the document instead of all pointing at the first one.
        """
        annotations = []
        next_search: Dict[str, int] = {}  # entity text -> offset to resume searching at
        for entity in entities:
            entity_text = entity["text"]
            annotation = self.create_annotation(
                text,
                entity["type"],
                entity_text,
                search_start=next_search.get(entity_text, 0),
            )
            next_search[entity_text] = annotation.end_offset
            annotations.append(annotation)
        doc = AnnotatedDocument(
            text=text,
            annotations=annotations,
            document_id=f"doc_{len(self.documents)}",
        )
        self.documents.append(doc)
        return doc

    def auto_annotate(
        self,
        text: str,
        patterns: Dict[str, List[str]],
    ) -> 'AnnotatedDocument':
        """Automatically annotate using case-insensitive regex patterns.

        Args:
            patterns: Mapping of entity type -> list of regex patterns.
        """
        annotations = []
        for entity_type, pattern_list in patterns.items():
            for pattern in pattern_list:
                for match in re.finditer(pattern, text, re.IGNORECASE):
                    annotations.append(EntityAnnotation(
                        text=match.group(),
                        entity_type=entity_type,
                        start_offset=match.start(),
                        end_offset=match.end(),
                    ))
        # Different patterns may produce overlapping matches; keep the longest.
        annotations = self._remove_overlaps(annotations)
        doc = AnnotatedDocument(
            text=text,
            annotations=annotations,
            document_id=f"auto_{len(self.documents)}",
        )
        self.documents.append(doc)
        return doc

    def _remove_overlaps(
        self, annotations: List['EntityAnnotation']
    ) -> List['EntityAnnotation']:
        """Drop overlapping annotations, preferring the longest span.

        Sorting by (start, -end) puts the longest span first at equal
        starts; an overlapping later span replaces the last kept one only
        when it is strictly longer.
        """
        if not annotations:
            return []
        sorted_anns = sorted(annotations, key=lambda a: (a.start_offset, -a.end_offset))
        kept = [sorted_anns[0]]
        for ann in sorted_anns[1:]:
            last = kept[-1]
            if ann.start_offset >= last.end_offset:
                kept.append(ann)  # no overlap with the last kept span
            elif (ann.end_offset - ann.start_offset) > (last.end_offset - last.start_offset):
                kept[-1] = ann  # overlapping but longer: replace
        return kept

    def export_azure_format(self, output_dir: str) -> Dict:
        """Export documents and labels in Azure Language Studio format.

        Writes one .txt file per document under ``<output_dir>/documents``
        plus a ``labels.json`` manifest, and returns summary statistics.
        """
        from pathlib import Path
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)
        docs_path = output_path / "documents"
        docs_path.mkdir(exist_ok=True)
        export_data = {
            "projectFileVersion": "2022-05-01",
            # NOTE(review): offsets below are Python code-point indices;
            # Utf16CodeUnit differs for non-BMP characters — confirm inputs
            # are BMP-only or convert offsets before training.
            "stringIndexType": "Utf16CodeUnit",
            "metadata": {
                "projectKind": "CustomEntityRecognition",
                "projectName": self.schema.project_name,
                "language": "en-us"
            },
            "assets": {
                "projectKind": "CustomEntityRecognition",
                "entities": [
                    {"category": e.name}
                    for e in self.schema.entities
                ],
                "documents": []
            }
        }
        for i, doc in enumerate(self.documents):
            # Write the raw document text alongside the manifest.
            doc_filename = f"doc_{i}.txt"
            with open(docs_path / doc_filename, 'w', encoding='utf-8') as f:
                f.write(doc.text)
            doc_entry = {
                "location": doc_filename,
                "language": "en-us",
                "entities": [
                    {
                        "category": ann.entity_type,
                        "offset": ann.start_offset,
                        "length": ann.end_offset - ann.start_offset
                    }
                    for ann in doc.annotations
                ]
            }
            export_data["assets"]["documents"].append(doc_entry)
        with open(output_path / "labels.json", 'w', encoding='utf-8') as f:
            json.dump(export_data, f, indent=2)
        return {
            "output_dir": str(output_path),
            "document_count": len(self.documents),
            "total_annotations": sum(len(d.annotations) for d in self.documents)
        }
# Usage
annotator = NERAnnotationTool(legal_schema)

# Manual annotation
contract_text = (
    "This Agreement is entered into between Acme Corporation and John Smith, "
    "effective January 1, 2023, for the amount of $10,000."
)
manual_entities = [
    {"type": "Party", "text": "Acme Corporation"},
    {"type": "Party", "text": "John Smith"},
    {"type": "ContractDate", "text": "January 1, 2023"},
    {"type": "MonetaryAmount", "text": "$10,000"},
]
doc = annotator.annotate_document(contract_text, manual_entities)

# Auto-annotation with patterns
patterns = {
    "MonetaryAmount": [
        r'\$[\d,]+(?:\.\d{2})?',
        r'\d+\s+(?:dollars?|USD)',
    ],
    "ContractDate": [
        r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}\b'
    ],
}
auto_doc = annotator.auto_annotate(
    "The payment of $5,000.00 is due by March 15, 2024.",
    patterns,
)

# Export
result = annotator.export_azure_format("./ner_export")
print(f"Exported {result['total_annotations']} annotations")
Using Custom NER Model
import os
from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential
class CustomNERExtractor:
    """Extract entities from documents using a deployed custom NER model."""

    def __init__(self, project_name: str, deployment_name: str):
        """Create a Text Analytics client from environment configuration.

        Raises:
            ValueError: If AZURE_LANGUAGE_ENDPOINT or AZURE_LANGUAGE_KEY is
                unset — fail fast instead of passing None to the SDK.
        """
        endpoint = os.getenv("AZURE_LANGUAGE_ENDPOINT")
        key = os.getenv("AZURE_LANGUAGE_KEY")
        if not endpoint or not key:
            raise ValueError(
                "AZURE_LANGUAGE_ENDPOINT and AZURE_LANGUAGE_KEY environment "
                "variables must be set"
            )
        self.client = TextAnalyticsClient(
            endpoint=endpoint,
            credential=AzureKeyCredential(key),
        )
        self.project_name = project_name
        self.deployment_name = deployment_name

    def extract_entities(self, documents: List[str]) -> List[Dict]:
        """Extract custom entities from documents.

        Returns:
            One dict per document: ``{"entities": [...]}`` on success or
            ``{"error": message}`` on per-document failure.
        """
        poller = self.client.begin_recognize_custom_entities(
            documents,
            project_name=self.project_name,
            deployment_name=self.deployment_name,
        )
        extractions = []
        for doc_result in poller.result():
            if doc_result.is_error:
                extractions.append({"error": doc_result.error.message})
                continue
            extractions.append({
                "entities": [
                    {
                        "text": entity.text,
                        "category": entity.category,
                        "confidence": entity.confidence_score,
                        "offset": entity.offset,
                        "length": entity.length,
                    }
                    for entity in doc_result.entities
                ]
            })
        return extractions

    def extract_and_group(self, documents: List[str]) -> List[Dict]:
        """Extract entities and group them by category per document."""
        grouped = []
        for extraction in self.extract_entities(documents):
            if "error" in extraction:
                grouped.append(extraction)
                continue
            by_category: Dict[str, List[Dict]] = {}
            for entity in extraction["entities"]:
                by_category.setdefault(entity["category"], []).append({
                    "text": entity["text"],
                    "confidence": entity["confidence"],
                })
            grouped.append({"entities_by_category": by_category})
        return grouped

    def extract_with_context(
        self,
        documents: List[str],
        context_window: int = 50,
    ) -> List[Dict]:
        """Extract entities with surrounding context.

        Args:
            context_window: Maximum number of characters to include on each
                side of the entity span (clamped to document bounds).
        """
        extractions = self.extract_entities(documents)
        with_context = []
        for doc, extraction in zip(documents, extractions):
            if "error" in extraction:
                with_context.append(extraction)
                continue
            entities_with_context = []
            for entity in extraction["entities"]:
                start = max(0, entity["offset"] - context_window)
                end = min(len(doc), entity["offset"] + entity["length"] + context_window)
                entities_with_context.append({
                    **entity,  # shallow copy: the input entity dict is not mutated
                    "context": doc[start:end],
                    "context_start": start,
                    "context_end": end,
                })
            with_context.append({"entities": entities_with_context})
        return with_context
# Usage
extractor = CustomNERExtractor(
    project_name="LegalDocumentNER",
    deployment_name="production",
)

legal_texts = [
    "This Services Agreement between TechCorp Inc. and StartupXYZ, dated April 1, 2023, establishes payment terms of $50,000 quarterly.",
    "The confidentiality clause requires both parties to maintain strict non-disclosure for 5 years.",
]

# Basic extraction
results = extractor.extract_entities(legal_texts)
for doc_number, extraction in enumerate(results, start=1):
    print(f"\nDocument {doc_number}:")
    for entity in extraction.get("entities", []):
        print(f" {entity['category']}: '{entity['text']}' ({entity['confidence']:.2%})")

# Grouped extraction
grouped = extractor.extract_and_group(legal_texts)
for doc_number, extraction in enumerate(grouped, start=1):
    print(f"\nDocument {doc_number} by category:")
    for category, entities in extraction.get("entities_by_category", {}).items():
        print(f" {category}: {[e['text'] for e in entities]}")
Entity Validation and Post-Processing
from typing import Callable
class EntityPostProcessor:
    """Validate and normalize entities extracted by a NER model.

    Per-entity-type validator and normalizer callables are registered up
    front; ``process`` then filters out invalid entities and attaches a
    canonical ``normalized_text`` field.
    """

    def __init__(self):
        # entity type -> predicate deciding whether an extracted string is valid
        self.validators: Dict[str, Callable] = {}
        # entity type -> function producing a canonical form of the text
        self.normalizers: Dict[str, Callable] = {}

    def register_validator(self, entity_type: str, validator: Callable[[str], bool]):
        """Register validation function for entity type."""
        self.validators[entity_type] = validator

    def register_normalizer(self, entity_type: str, normalizer: Callable[[str], str]):
        """Register normalization function for entity type."""
        self.normalizers[entity_type] = normalizer

    def process(self, extractions: List[Dict]) -> List[Dict]:
        """Filter invalid entities and attach normalized text.

        Error entries are passed through untouched. Entity dicts are
        shallow-copied before modification so the caller's input data is
        never mutated.
        """
        processed = []
        for extraction in extractions:
            if "error" in extraction:
                processed.append(extraction)
                continue
            valid_entities = []
            for entity in extraction.get("entities", []):
                entity_type = entity["category"]
                validator = self.validators.get(entity_type)
                if validator is not None and not validator(entity["text"]):
                    continue  # drop entities that fail validation
                entity = dict(entity)  # copy: don't mutate the caller's dict
                normalizer = self.normalizers.get(entity_type)
                if normalizer is not None:
                    entity["normalized_text"] = normalizer(entity["text"])
                valid_entities.append(entity)
            processed.append({"entities": valid_entities})
        return processed
# Example validators and normalizers
def validate_monetary_amount(text: str) -> bool:
    """Return True when ``text`` looks like a monetary amount.

    Accepts either "$1,234.56"-style figures or "<digits> dollars/USD"
    phrases (case-insensitive); the whole string must match.
    """
    import re
    money_pattern = re.compile(
        r'^\$?[\d,]+(?:\.\d{2})?$|^\d+\s+(?:dollars?|USD)$',
        re.IGNORECASE,
    )
    return money_pattern.match(text) is not None
def normalize_monetary_amount(text: str) -> str:
    """Normalize a monetary amount to the standard "$1,234.00" format.

    Uses the first numeric group found in ``text``; returns the input
    unchanged when no digits are present.
    """
    import re
    match = re.search(r'[\d,]+(?:\.\d{2})?', text)
    if match is None:
        return text
    amount = float(match.group().replace(',', ''))
    return f"${amount:,.2f}"
def normalize_date(text: str) -> str:
    """Normalize a date string to ISO format (YYYY-MM-DD).

    Tries a fixed list of common formats in order; returns the input
    unchanged when none of them match. (The previous version wrapped the
    loop in a blanket ``except Exception: pass`` that could silently hide
    real bugs; only the expected ``ValueError`` is caught now.)
    """
    from datetime import datetime
    for fmt in ("%B %d, %Y", "%m/%d/%Y", "%d-%m-%Y"):
        try:
            return datetime.strptime(text, fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue  # try the next candidate format
    return text
# Usage
processor = EntityPostProcessor()
processor.register_validator("MonetaryAmount", validate_monetary_amount)
for entity_type, normalizer in [
    ("MonetaryAmount", normalize_monetary_amount),
    ("ContractDate", normalize_date),
]:
    processor.register_normalizer(entity_type, normalizer)
processed_results = processor.process(results)
Conclusion
Custom NER in Azure AI Language enables extraction of domain-specific entities tailored to your business needs. By carefully designing entity schemas, creating quality training annotations, and implementing proper validation and post-processing, you can build accurate entity extraction systems for specialized domains like legal, healthcare, finance, or any custom use case.