September 20, 2021 1 min read

Named Entity Recognition with Azure Cognitive Services

Azure Cognitive Services NER NLP Information Extraction

Named Entity Recognition (NER) identifies and classifies named entities in text into predefined categories such as persons, organizations, locations, dates, and more. It’s fundamental for information extraction and building knowledge graphs.

Entity Categories

Azure NER recognizes these entity types:

Person: Names of people
Organization: Companies, institutions
Location: Places, addresses, geopolitical entities
DateTime: Dates, times, durations
Quantity: Numbers, percentages, measurements
Email, URL, PhoneNumber: Contact information
Product: Named products
Event: Named events

Basic Entity Recognition

from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential

def create_client(key: str, endpoint: str) -> TextAnalyticsClient:
    """Build a ``TextAnalyticsClient`` that authenticates with an API key.

    Args:
        key: Key string wrapped in an ``AzureKeyCredential``.
        endpoint: Endpoint passed through to the client.

    Returns:
        The configured ``TextAnalyticsClient``.
    """
    credential = AzureKeyCredential(key)
    return TextAnalyticsClient(endpoint=endpoint, credential=credential)

def recognize_entities(client: TextAnalyticsClient, documents: list) -> list:
    """Recognize named entities in a batch of documents.

    Args:
        client: Client whose ``recognize_entities`` method is called.
        documents: Texts to analyze, one entry per document.

    Returns:
        One dict per result, in the order the client returns them:
        ``{"entities": [...]}`` on success, where every entity dict carries
        ``text``, ``category``, ``subcategory``, ``confidence``, ``offset``
        and ``length``; or ``{"error": message}`` when that document failed.
        An empty ``documents`` yields ``[]`` without calling the client.
    """
    # Nothing to analyze: skip the request rather than sending an empty
    # batch, which the SDK is expected to reject.
    if not documents:
        return []

    recognized = []
    for doc in client.recognize_entities(documents):
        if doc.is_error:
            recognized.append({"error": doc.error.message})
            continue
        recognized.append({
            "entities": [
                {
                    "text": entity.text,
                    "category": entity.category,
                    "subcategory": entity.subcategory,
                    "confidence": entity.confidence_score,
                    "offset": entity.offset,
                    "length": entity.length,
                }
                for entity in doc.entities
            ]
        })

    return recognized

# Replace the placeholders with your own resource key and endpoint.
client = create_client("your-key", "your-endpoint")

# Extract entities from news article
news_article = """
Microsoft CEO Satya Nadella announced on Tuesday that the company
will invest $20 billion in cybersecurity over the next five years.
The announcement was made at the company's headquarters in Redmond,
Washington. This follows Microsoft's acquisition of Nuance Communications
for $19.7 billion earlier this year.
"""

result = recognize_entities(client, [news_article])

# A failed document is reported as {"error": ...} and has no "entities"
# key, so check for it before indexing.
if "error" in result[0]:
    print(f"Entity recognition failed: {result[0]['error']}")
else:
    print("Entities found:")
    for entity in result[0]["entities"]:
        print(f"  {entity['text']}")
        print(f"    Type: {entity['category']}", end="")
        if entity['subcategory']:
            print(f" / {entity['subcategory']}", end="")
        print(f" (confidence: {entity['confidence']:.2f})")

Entity Linking to Wikipedia

def link_entities(client: TextAnalyticsClient, documents: list) -> list:
    """Link entities mentioned in a batch of documents to knowledge-base entries.

    Args:
        client: Client whose ``recognize_linked_entities`` method is called.
        documents: Texts to analyze, one entry per document.

    Returns:
        One dict per result, in the order the client returns them:
        ``{"entities": [...]}`` on success, where every entity dict carries
        ``name``, ``wikipedia_url``, ``data_source``, ``wikipedia_id`` and a
        ``matches`` list of ``{"text", "confidence", "offset"}`` dicts; or
        ``{"error": message}`` when that document failed.
        An empty ``documents`` yields ``[]`` without calling the client.
    """
    # Nothing to analyze: skip the request rather than sending an empty
    # batch, which the SDK is expected to reject.
    if not documents:
        return []

    linked = []
    for doc in client.recognize_linked_entities(documents):
        if doc.is_error:
            linked.append({"error": doc.error.message})
            continue
        linked.append({
            "entities": [
                {
                    "name": entity.name,
                    "wikipedia_url": entity.url,
                    "data_source": entity.data_source,
                    "wikipedia_id": entity.data_source_entity_id,
                    "matches": [
                        {
                            "text": match.text,
                            "confidence": match.confidence_score,
                            "offset": match.offset,
                        }
                        for match in entity.matches
                    ],
                }
                for entity in doc.entities
            ]
        })

    return linked

# Link entities to knowledge base
text = """
Albert Einstein developed the theory of relativity while working
in Bern, Switzerland. He later joined Princeton University in
New Jersey, where he continued his research in physics.
"""

result = link_entities(client, [text])

# A failed document is reported as {"error": ...} and has no "entities"
# key, so check for it before indexing.
if "error" in result[0]:
    print(f"Entity linking failed: {result[0]['error']}")
else:
    print("Linked Entities:")
    for entity in result[0]["entities"]:
        print(f"\n{entity['name']}")
        print(f"  Wikipedia: {entity['wikipedia_url']}")
        for match in entity['matches']:
            print(f"  Mention: '{match['text']}' (confidence: {match['confidence']:.2f})")

Building an Information Extractor

from collections import defaultdict

class InformationExtractor:
    """Turn free text into structured facts using a Text Analytics client.

    The client is expected to provide ``recognize_entities`` and
    ``recognize_linked_entities``; each call is made with a one-document
    batch containing the text being analyzed.
    """

    # Maps an entity category to the ``info`` key collecting its texts.
    # "Quantity" is handled separately because it is stored as a dict.
    _TEXT_CATEGORIES = {
        "Person": "people",
        "Organization": "organizations",
        "Location": "locations",
        "DateTime": "dates",
        "Product": "products",
        "Event": "events",
    }

    def __init__(self, client: TextAnalyticsClient):
        self.client = client

    def extract_structured_info(self, text: str) -> dict:
        """Extract structured information from text.

        Args:
            text: The document to analyze.

        Returns:
            A dict with the list-valued keys ``people``, ``organizations``,
            ``locations``, ``dates``, ``products`` and ``events`` (unique
            entity texts in first-seen order), ``quantities`` (a list of
            ``{"value", "type"}`` dicts, not deduplicated) and
            ``wikipedia_links`` (linked-entity name -> URL). A category whose
            analysis failed is simply left empty.
        """
        ner_results = self.client.recognize_entities([text])
        linked_results = self.client.recognize_linked_entities([text])

        info = {
            "people": [],
            "organizations": [],
            "locations": [],
            "dates": [],
            "quantities": [],
            "products": [],
            "events": [],
            "wikipedia_links": {},
        }

        # Process NER results
        if not ner_results[0].is_error:
            for entity in ner_results[0].entities:
                if entity.category == "Quantity":
                    info["quantities"].append({
                        "value": entity.text,
                        "type": entity.subcategory,
                    })
                    continue
                key = self._TEXT_CATEGORIES.get(entity.category)
                if key is not None:  # categories we do not track are skipped
                    info[key].append(entity.text)

        # Add Wikipedia links
        if not linked_results[0].is_error:
            for entity in linked_results[0].entities:
                info["wikipedia_links"][entity.name] = entity.url

        # Deduplicate. dict.fromkeys keeps first-seen order, so the output is
        # stable between runs (a set would shuffle it).
        for key in self._TEXT_CATEGORIES.values():
            info[key] = list(dict.fromkeys(info[key]))

        return info

    def extract_relationships(self, text: str) -> list:
        """Extract basic person/organization relationships by co-occurrence.

        A person and an organization are related when both are mentioned in
        the same sentence. Sentences are delimited by a period followed by
        whitespace or the end of the text, so decimals such as "$1.5" do not
        split a sentence. This remains a simple heuristic: abbreviations
        like "Inc." still end a sentence.

        Args:
            text: The document to analyze.

        Returns:
            A list of ``{"person", "organization", "context"}`` dicts, where
            ``context`` is the stripped sentence (without its final period).
            Each person/organization pair appears at most once per sentence.
            Returns ``[]`` when entity recognition failed.
        """
        ner_results = self.client.recognize_entities([text])

        if ner_results[0].is_error:
            return []

        entities = ner_results[0].entities

        relationships = []
        for start, end in self._sentence_spans(text):
            # Place each mention by its offset instead of substring search,
            # so a repeated name does not count once per mention.
            # NOTE(review): assumes entity.offset indexes ``text`` in Python
            # string positions (the SDK's default index type) - confirm if
            # string_index_type is overridden on the client.
            in_sentence = [e for e in entities if start <= e.offset < end]

            # dict.fromkeys collapses repeated mentions but keeps their order.
            persons = dict.fromkeys(
                e.text for e in in_sentence if e.category == "Person"
            )
            orgs = dict.fromkeys(
                e.text for e in in_sentence if e.category == "Organization"
            )

            context = text[start:end].strip()
            for person in persons:
                for org in orgs:
                    relationships.append({
                        "person": person,
                        "organization": org,
                        "context": context,
                    })

        return relationships

    @staticmethod
    def _sentence_spans(text: str):
        """Yield ``(start, end)`` index pairs for each sentence in ``text``.

        ``end`` is the index of the terminating period (exclusive), or
        ``len(text)`` for trailing text that has no final period.
        """
        start = 0
        last = len(text) - 1
        for idx, char in enumerate(text):
            # Only a period followed by whitespace or end-of-text ends a sentence.
            if char == "." and (idx == last or text[idx + 1].isspace()):
                yield start, idx
                start = idx + 1
        if start < len(text):
            yield start, len(text)

# Sample news text used to demonstrate the extractor
news = """
Elon Musk, CEO of Tesla and SpaceX, announced on March 15th that Tesla
will accept Bitcoin as payment for vehicles. The announcement caused
Bitcoin to surge 5% to over $55,000. Tesla had previously invested
$1.5 billion in Bitcoin in February. The company is headquartered in
Austin, Texas, after relocating from Palo Alto, California.
"""

extractor = InformationExtractor(client)

# Structured facts grouped by entity category
info = extractor.extract_structured_info(news)
print("Extracted Information:")
print(f"  People: {info['people']}")
print(f"  Organizations: {info['organizations']}")
print(f"  Locations: {info['locations']}")
print(f"  Dates: {info['dates']}")
print(f"  Quantities: {info['quantities']}")

# Person/organization pairs that share a sentence
relationships = extractor.extract_relationships(news)
print("\nRelationships:")
for rel in relationships:
    print(f"  {rel['person']} <-> {rel['organization']}")

PII Detection

def detect_pii(client: TextAnalyticsClient, documents: list) -> list:
    """Detect personally identifiable information in a batch of documents.

    Args:
        client: Client whose ``recognize_pii_entities`` method is called.
        documents: Texts to analyze, one entry per document.

    Returns:
        One dict per result, in the order the client returns them:
        ``{"pii_entities": [...], "redacted_text": ...}`` on success, where
        every entity dict carries ``text``, ``category``, ``subcategory`` and
        ``confidence``; or ``{"error": message}`` when that document failed.
        An empty ``documents`` yields ``[]`` without calling the client.
    """
    # Nothing to analyze: skip the request rather than sending an empty
    # batch, which the SDK is expected to reject.
    if not documents:
        return []

    detected = []
    for doc in client.recognize_pii_entities(documents):
        if doc.is_error:
            detected.append({"error": doc.error.message})
            continue
        detected.append({
            "pii_entities": [
                {
                    "text": entity.text,
                    "category": entity.category,
                    "subcategory": entity.subcategory,
                    "confidence": entity.confidence_score,
                }
                for entity in doc.entities
            ],
            "redacted_text": doc.redacted_text,
        })

    return detected

# Detect PII in customer data
customer_message = """
Hi, my name is John Smith and I'd like to update my account.
My email is john.smith@email.com and my phone number is 555-123-4567.
My credit card ending in 4242 was charged incorrectly.
Please send correspondence to 123 Main Street, Seattle, WA 98101.
"""

result = detect_pii(client, [customer_message])

# A failed document is reported as {"error": ...} and has neither
# "pii_entities" nor "redacted_text", so check for it before indexing.
if "error" in result[0]:
    print(f"PII detection failed: {result[0]['error']}")
else:
    print("PII Detected:")
    for pii in result[0]["pii_entities"]:
        print(f"  {pii['text']} -> {pii['category']}")

    print("\nRedacted Text:")
    print(result[0]["redacted_text"])

Building a Document Processor

class DocumentProcessor:
    """Run several Text Analytics analyses on one document and summarize them.

    The client is expected to provide ``recognize_entities``,
    ``recognize_linked_entities``, ``extract_key_phrases`` and
    ``analyze_sentiment``.
    """

    # (entity category, summary label) pairs, in the order they are reported.
    _SUMMARY_CATEGORIES = (
        ("Person", "People mentioned"),
        ("Organization", "Organizations"),
        ("Location", "Locations"),
    )

    def __init__(self, client: TextAnalyticsClient):
        self.client = client

    def process_document(self, text: str) -> dict:
        """Full NLP processing of a document.

        Args:
            text: The document to analyze.

        Returns:
            A dict with ``entities`` (category -> list of entity texts in
            order of appearance, duplicates kept), ``linked_entities`` (list
            of ``{"name", "url"}`` dicts), ``key_phrases`` and ``sentiment``.
            An analysis that failed leaves its entry at the empty default
            (``{}``, ``[]``, ``[]`` and ``None`` respectively).
        """
        # Each analysis is its own request with a one-document batch.
        entities = self.client.recognize_entities([text])
        linked = self.client.recognize_linked_entities([text])
        key_phrases = self.client.extract_key_phrases([text])
        sentiment = self.client.analyze_sentiment([text])

        result = {
            "entities": {},
            "linked_entities": [],
            "key_phrases": [],
            "sentiment": None,
        }

        # Process entities by category
        if not entities[0].is_error:
            for entity in entities[0].entities:
                result["entities"].setdefault(entity.category, []).append(entity.text)

        # Process linked entities
        if not linked[0].is_error:
            result["linked_entities"] = [
                {"name": e.name, "url": e.url}
                for e in linked[0].entities
            ]

        # Process key phrases
        if not key_phrases[0].is_error:
            result["key_phrases"] = key_phrases[0].key_phrases

        # Process sentiment
        if not sentiment[0].is_error:
            result["sentiment"] = sentiment[0].sentiment

        return result

    def generate_summary(self, text: str) -> str:
        """Generate a structured, line-per-topic summary of a document.

        Args:
            text: The document to analyze.

        Returns:
            Newline-joined lines: the sentiment first, then people,
            organizations and locations (each only when present, listed once
            in first-mention order), then up to five key phrases.
        """
        processed = self.process_document(text)

        summary = [f"Sentiment: {processed['sentiment']}"]

        for category, label in self._SUMMARY_CATEGORIES:
            names = processed['entities'].get(category)
            if names:
                # dict.fromkeys removes repeats but keeps first-mention
                # order, so the summary is identical on every run.
                summary.append(f"{label}: {', '.join(dict.fromkeys(names))}")

        if processed['key_phrases']:
            summary.append(f"Key topics: {', '.join(processed['key_phrases'][:5])}")

        return "\n".join(summary)

# Sample article to summarize
article = """
Amazon announced today that Andy Jassy will become CEO, replacing
Jeff Bezos who founded the company in 1994 in Seattle. Under Bezos'
leadership, Amazon grew from an online bookstore to a trillion-dollar
technology giant. Jassy previously led Amazon Web Services (AWS),
which has become the company's most profitable division.
"""

# Run the full pipeline and print the resulting summary
processor = DocumentProcessor(client)
summary = processor.generate_summary(article)
print(summary)

Best Practices

Confidence Thresholds: Filter low-confidence entities
Context Matters: The same text can be a different entity type depending on its context
Batch Processing: Process multiple documents together
Entity Linking: Use for disambiguation
PII Handling: Always detect and protect sensitive data
Domain Customization: Consider custom entity types for specific domains

Named Entity Recognition is essential for extracting structured information from unstructured text, enabling applications from search enhancement to automated data entry.