Text Analytics with Azure Cognitive Services

Azure Text Analytics provides advanced natural language processing capabilities including sentiment analysis, key phrase extraction, named entity recognition, and language detection. It enables applications to understand and analyze unstructured text at scale.

Available Features

  • Sentiment Analysis: Determine positive, negative, or neutral sentiment
  • Key Phrase Extraction: Identify main talking points
  • Named Entity Recognition (NER): Extract people, places, organizations
  • Entity Linking: Connect entities to Wikipedia
  • Language Detection: Identify the language of text
  • Healthcare NLP: Extract medical entities and relations (a short sketch appears after Language Detection)

Setting Up the Client

from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential

def get_text_analytics_client(key: str, endpoint: str) -> TextAnalyticsClient:
    """Create a Text Analytics client."""
    credential = AzureKeyCredential(key)
    return TextAnalyticsClient(endpoint=endpoint, credential=credential)

client = get_text_analytics_client(
    "your-key",
    "https://your-resource.cognitiveservices.azure.com/"
)
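
Hardcoding the key is fine for a quick test, but real applications should pull credentials from configuration. Here is a minimal sketch that reads them from environment variables instead; the variable names AZURE_LANGUAGE_KEY and AZURE_LANGUAGE_ENDPOINT are assumptions, so match them to wherever you store secrets:

import os

# Assumed variable names -- set these in your shell or app settings
key = os.environ["AZURE_LANGUAGE_KEY"]
endpoint = os.environ["AZURE_LANGUAGE_ENDPOINT"]

client = get_text_analytics_client(key, endpoint)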

Sentiment Analysis

def analyze_sentiment(client: TextAnalyticsClient, documents: list) -> list:
    """Analyze sentiment of documents."""

    results = client.analyze_sentiment(
        documents,
        show_opinion_mining=True
    )

    analyzed = []
    for doc in results:
        if not doc.is_error:
            analyzed.append({
                "id": doc.id,
                "sentiment": doc.sentiment,
                "confidence_scores": {
                    "positive": doc.confidence_scores.positive,
                    "neutral": doc.confidence_scores.neutral,
                    "negative": doc.confidence_scores.negative
                },
                "sentences": [
                    {
                        "text": sentence.text,
                        "sentiment": sentence.sentiment,
                        "opinions": [
                            {
                                "target": opinion.target.text,
                                "sentiment": opinion.target.sentiment,
                                "assessments": [
                                    {"text": a.text, "sentiment": a.sentiment}
                                    for a in opinion.assessments
                                ]
                            }
                            for opinion in sentence.mined_opinions
                        ]
                    }
                    for sentence in doc.sentences
                ]
            })
        else:
            analyzed.append({"id": doc.id, "error": doc.error.message})

    return analyzed

# Analyze product reviews
reviews = [
    {"id": "1", "text": "The hotel room was amazing with a great view! However, the WiFi was slow."},
    {"id": "2", "text": "Terrible experience. The food was cold and the staff was rude."},
    {"id": "3", "text": "Decent product for the price. Nothing special but it works."}
]

results = analyze_sentiment(client, [r["text"] for r in reviews])

for i, result in enumerate(results):
    if "error" in result:
        print(f"\nReview {i+1}: error - {result['error']}")
        continue

    print(f"\nReview {i+1}: {result['sentiment']}")
    print(f"Confidence: pos={result['confidence_scores']['positive']:.2f}, "
          f"neg={result['confidence_scores']['negative']:.2f}")

    for sentence in result.get('sentences', []):
        for opinion in sentence.get('opinions', []):
            print(f"  Target: {opinion['target']} ({opinion['sentiment']})")
            for assessment in opinion['assessments']:
                print(f"    - {assessment['text']}: {assessment['sentiment']}")

Key Phrase Extraction

def extract_key_phrases(client: TextAnalyticsClient, documents: list) -> list:
    """Extract key phrases from documents."""

    results = client.extract_key_phrases(documents)

    extracted = []
    for doc in results:
        if not doc.is_error:
            extracted.append({
                "id": doc.id,
                "key_phrases": doc.key_phrases
            })
        else:
            extracted.append({"id": doc.id, "error": doc.error.message})

    return extracted

# Extract key phrases from articles
articles = [
    """
    Microsoft Azure provides a comprehensive set of cloud services
    including computing, analytics, storage, and networking. Users can
    pick and choose from these services to develop and scale new applications,
    or run existing applications in the public cloud.
    """,
    """
    Machine learning is a subset of artificial intelligence that enables
    systems to learn and improve from experience without being explicitly
    programmed. Deep learning is a subset of machine learning that uses
    neural networks with many layers.
    """
]

results = extract_key_phrases(client, articles)

for i, result in enumerate(results):
    print(f"\nArticle {i+1} key phrases:")
    for phrase in result.get('key_phrases', []):
        print(f"  - {phrase}")

Named Entity Recognition

def recognize_entities(client: TextAnalyticsClient, documents: list) -> list:
    """Recognize named entities in documents."""

    results = client.recognize_entities(documents)

    recognized = []
    for doc in results:
        if not doc.is_error:
            recognized.append({
                "id": doc.id,
                "entities": [
                    {
                        "text": entity.text,
                        "category": entity.category,
                        "subcategory": entity.subcategory,
                        "confidence": entity.confidence_score
                    }
                    for entity in doc.entities
                ]
            })
        else:
            recognized.append({"id": doc.id, "error": doc.error.message})

    return recognized

# Recognize entities in news article
news_article = """
Microsoft CEO Satya Nadella announced today that the company will invest
$20 billion in cybersecurity over the next five years. The announcement
was made at the headquarters in Redmond, Washington. This follows the
recent acquisition of Nuance Communications for $19.7 billion in April 2021.
"""

results = recognize_entities(client, [news_article])

for result in results:
    print("Entities found:")
    for entity in result.get('entities', []):
        print(f"  {entity['text']}")
        print(f"    Category: {entity['category']}")
        if entity['subcategory']:
            print(f"    Subcategory: {entity['subcategory']}")
        print(f"    Confidence: {entity['confidence']:.2f}")
        print()

Entity Linking

def link_entities(client: TextAnalyticsClient, documents: list) -> list:
    """Link entities to Wikipedia articles."""

    results = client.recognize_linked_entities(documents)

    linked = []
    for doc in results:
        if not doc.is_error:
            linked.append({
                "id": doc.id,
                "entities": [
                    {
                        "name": entity.name,
                        "url": entity.url,
                        "data_source": entity.data_source,
                        "matches": [
                            {
                                "text": match.text,
                                "confidence": match.confidence_score
                            }
                            for match in entity.matches
                        ]
                    }
                    for entity in doc.entities
                ]
            })
        else:
            linked.append({"id": doc.id, "error": doc.error.message})

    return linked

# Link entities
text = "Albert Einstein developed the theory of relativity while working at the patent office in Bern, Switzerland."

results = link_entities(client, [text])

for result in results:
    for entity in result.get('entities', []):
        print(f"{entity['name']}")
        print(f"  Wikipedia: {entity['url']}")
        print()

Language Detection

def detect_language(client: TextAnalyticsClient, documents: list) -> list:
    """Detect the language of documents."""

    results = client.detect_language(documents)

    detected = []
    for doc in results:
        if not doc.is_error:
            detected.append({
                "id": doc.id,
                "language": doc.primary_language.name,
                "iso_code": doc.primary_language.iso6391_name,
                "confidence": doc.primary_language.confidence_score
            })
        else:
            detected.append({"id": doc.id, "error": doc.error.message})

    return detected

# Detect languages
texts = [
    "Hello, how are you doing today?",
    "Bonjour, comment allez-vous?",
    "Hallo, wie geht es Ihnen?",
    "Hola, como estas?",
    "Ciao, come stai?"
]

results = detect_language(client, texts)

for i, result in enumerate(results):
    if "error" in result:
        continue
    print(f"'{texts[i]}'")
    print(f"  Language: {result['language']} ({result['iso_code']})")
    print(f"  Confidence: {result['confidence']:.2f}")
    print()
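
Healthcare Entity Extraction

The Healthcare NLP feature listed earlier runs as a long-running operation rather than a simple synchronous call. A minimal sketch using the SDK's begin_analyze_healthcare_entities poller; the prescription text is purely illustrative:

def extract_health_entities(client: TextAnalyticsClient, documents: list) -> list:
    """Extract medical entities via the long-running healthcare operation."""

    poller = client.begin_analyze_healthcare_entities(documents)

    extracted = []
    for doc in poller.result():
        if not doc.is_error:
            extracted.append({
                "id": doc.id,
                "entities": [
                    {
                        "text": entity.text,
                        "category": entity.category,
                        "confidence": entity.confidence_score
                    }
                    for entity in doc.entities
                ]
            })
        else:
            extracted.append({"id": doc.id, "error": doc.error.message})

    return extracted

# Illustrative input; each result also exposes relations via entity_relations
results = extract_health_entities(client, ["The patient was prescribed 100mg of ibuprofen twice daily."])
print(results)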

Batch Processing

from azure.ai.textanalytics import (
    RecognizeEntitiesAction,
    ExtractKeyPhrasesAction,
    AnalyzeSentimentAction
)

def analyze_batch(client: TextAnalyticsClient, documents: list) -> dict:
    """Perform multiple analyses in a single request."""

    poller = client.begin_analyze_actions(
        documents,
        actions=[
            RecognizeEntitiesAction(),
            ExtractKeyPhrasesAction(),
            AnalyzeSentimentAction()
        ]
    )

    results = {
        "entities": [],
        "key_phrases": [],
        "sentiment": []
    }

    # poller.result() yields one list of action results per input document,
    # in the same order the actions were requested
    for doc_results in poller.result():
        for result in doc_results:
            if result.is_error:
                continue
            if result.kind == "EntityRecognition":
                results["entities"].append({
                    "id": result.id,
                    "entities": [e.text for e in result.entities]
                })
            elif result.kind == "KeyPhraseExtraction":
                results["key_phrases"].append({
                    "id": result.id,
                    "phrases": result.key_phrases
                })
            elif result.kind == "SentimentAnalysis":
                results["sentiment"].append({
                    "id": result.id,
                    "sentiment": result.sentiment
                })

    return results

# Batch analysis
documents = [
    "Microsoft announced new Azure AI features in Seattle.",
    "The quarterly earnings exceeded analyst expectations.",
    "Customer satisfaction scores improved significantly this quarter."
]

results = analyze_batch(client, documents)
print(results)

Building a Content Analyzer Service

from flask import Flask, request, jsonify

app = Flask(__name__)
client = get_text_analytics_client("your-key", "your-endpoint")

@app.route('/analyze', methods=['POST'])
def analyze():
    """Comprehensive text analysis endpoint."""

    data = request.get_json(silent=True) or {}
    text = data.get('text', '')
    if not text:
        return jsonify({"error": "'text' is required"}), 400

    analyses = data.get('analyses', ['sentiment', 'entities', 'key_phrases'])

    results = {}

    if 'sentiment' in analyses:
        sentiment = client.analyze_sentiment([text])
        results['sentiment'] = sentiment[0].sentiment if not sentiment[0].is_error else None

    if 'entities' in analyses:
        entities = client.recognize_entities([text])
        results['entities'] = [
            {"text": e.text, "category": e.category}
            for e in entities[0].entities
        ] if not entities[0].is_error else []

    if 'key_phrases' in analyses:
        phrases = client.extract_key_phrases([text])
        results['key_phrases'] = phrases[0].key_phrases if not phrases[0].is_error else []

    if 'language' in analyses:
        lang = client.detect_language([text])
        results['language'] = lang[0].primary_language.name if not lang[0].is_error else None

    return jsonify(results)

if __name__ == '__main__':
    app.run(port=5000)
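
With the service running, a quick way to exercise the endpoint from Python; the payload below is illustrative:

import requests

response = requests.post(
    "http://localhost:5000/analyze",
    json={
        "text": "The new dashboard is fast and intuitive.",
        "analyses": ["sentiment", "key_phrases"]
    }
)
print(response.json())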

Best Practices

  1. Batch Requests: Process multiple documents together for efficiency
  2. Error Handling: Always check for errors in responses
  3. Language Hints: Provide language hints when known
  4. Document Limits: Stay within 5,120 characters per document
  5. Rate Limiting: Implement retry logic for throttling (see the sketch after this list)
  6. Caching: Cache results for frequently analyzed content
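
A minimal sketch of two of these practices: retrying with exponential backoff when throttled, plus a naive splitter for the per-document character limit. The helper names and backoff schedule are assumptions, not SDK features:

import time

from azure.core.exceptions import HttpResponseError

def analyze_with_retry(client: TextAnalyticsClient, documents: list, max_attempts: int = 3) -> list:
    """Analyze sentiment, backing off exponentially on HTTP 429 throttling."""
    for attempt in range(max_attempts):
        try:
            # A language hint skips detection when the input language is known
            return client.analyze_sentiment(documents, language="en")
        except HttpResponseError as error:
            if error.status_code == 429 and attempt < max_attempts - 1:
                time.sleep(2 ** attempt)  # waits 1s, 2s, 4s, ...
            else:
                raise

def chunk_text(text: str, limit: int = 5120) -> list:
    """Naively split long text so each piece fits the per-document limit."""
    return [text[i:i + limit] for i in range(0, len(text), limit)]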

Text Analytics enables rich understanding of unstructured text, powering applications from customer feedback analysis to content recommendation systems.

Michael John Pena

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.