Azure AI Services Updates: Vision, Speech, and Language Capabilities

Introduction

Azure AI Services (formerly Cognitive Services) continues to evolve with new capabilities and improvements. This post surveys the current state of Azure's AI services for vision, speech, and language processing, with Python examples for each.

Azure AI Vision

Current Capabilities

Azure AI Vision provides comprehensive image analysis:

from azure.ai.vision.imageanalysis import ImageAnalysisClient
from azure.ai.vision.imageanalysis.models import VisualFeatures
from azure.core.credentials import AzureKeyCredential
import os

client = ImageAnalysisClient(
    endpoint=os.getenv("AZURE_VISION_ENDPOINT"),
    credential=AzureKeyCredential(os.getenv("AZURE_VISION_KEY"))
)

def analyze_image(image_url: str) -> dict:
    """Comprehensive image analysis."""
    result = client.analyze_from_url(
        image_url=image_url,
        visual_features=[
            VisualFeatures.CAPTION,
            VisualFeatures.DENSE_CAPTIONS,
            VisualFeatures.OBJECTS,
            VisualFeatures.TAGS,
            VisualFeatures.PEOPLE,
            VisualFeatures.SMART_CROPS,
            VisualFeatures.READ
        ],
        gender_neutral_caption=True
    )

    analysis = {
        "caption": result.caption.text if result.caption else None,
        "confidence": result.caption.confidence if result.caption else None,
        "objects": [],
        "tags": [],
        "text": []
    }

    if result.objects:
        for obj in result.objects.list:
            analysis["objects"].append({
                "name": obj.tags[0].name if obj.tags else "unknown",
                "confidence": obj.tags[0].confidence if obj.tags else 0
            })

    if result.tags:
        analysis["tags"] = [
            {"name": tag.name, "confidence": tag.confidence}
            for tag in result.tags.list
        ]

    if result.read and result.read.blocks:
        for block in result.read.blocks:
            for line in block.lines:
                analysis["text"].append(line.text)

    return analysis

# Usage
result = analyze_image("https://example.com/image.jpg")
print(f"Caption: {result['caption']}")
print(f"Objects: {[o['name'] for o in result['objects']]}")

Custom Vision for Specialized Models

from azure.cognitiveservices.vision.customvision.training import CustomVisionTrainingClient
from azure.cognitiveservices.vision.customvision.prediction import CustomVisionPredictionClient
from msrest.authentication import ApiKeyCredentials

class CustomVisionService:
    def __init__(self, training_endpoint: str, training_key: str,
                 prediction_endpoint: str, prediction_key: str):
        credentials = ApiKeyCredentials(in_headers={"Training-key": training_key})
        self.trainer = CustomVisionTrainingClient(training_endpoint, credentials)

        pred_credentials = ApiKeyCredentials(in_headers={"Prediction-key": prediction_key})
        self.predictor = CustomVisionPredictionClient(prediction_endpoint, pred_credentials)

    def create_project(self, name: str, domain_type: str = "Classification") -> str:
        """Create a new Custom Vision project."""
        domains = self.trainer.get_domains()

        # Match on the domain's type ("Classification" or "ObjectDetection")
        # and prefer exportable (compact) domains; fall back to the first domain
        domain = next(
            (d for d in domains if d.type == domain_type and d.exportable),
            domains[0]
        )

        project = self.trainer.create_project(name, domain_id=domain.id)
        return project.id

    def predict(self, project_id: str, iteration_name: str, image_url: str) -> list:
        """Get predictions for an image."""
        # classify_image_url targets classification projects; object detection
        # projects use detect_image_url instead
        results = self.predictor.classify_image_url(
            project_id,
            iteration_name,
            image_url
        )

        return [
            {"tag": p.tag_name, "probability": p.probability}
            for p in results.predictions
        ]

# Usage
service = CustomVisionService(
    os.getenv("CV_TRAINING_ENDPOINT"),
    os.getenv("CV_TRAINING_KEY"),
    os.getenv("CV_PREDICTION_ENDPOINT"),
    os.getenv("CV_PREDICTION_KEY")
)
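
With the service in place, a typical flow is: create a project, upload and tag training images, train and publish an iteration (often done in the Custom Vision portal), then classify new images against the published iteration. The project, iteration, and image names below are hypothetical:

project_id = service.create_project("defect-detector")

# After training and publishing an iteration named "Iteration1":
predictions = service.predict(project_id, "Iteration1", "https://example.com/part.jpg")
for p in predictions:
    print(f"{p['tag']}: {p['probability']:.2%}")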

Azure AI Speech

Speech to Text

import time

import azure.cognitiveservices.speech as speechsdk

class SpeechService:
    def __init__(self):
        self.speech_config = speechsdk.SpeechConfig(
            subscription=os.getenv("AZURE_SPEECH_KEY"),
            region=os.getenv("AZURE_SPEECH_REGION")
        )

    def transcribe_audio(self, audio_file: str) -> dict:
        """Transcribe a single utterance from an audio file.

        recognize_once stops at the first pause; use transcribe_continuous
        for longer recordings.
        """
        audio_config = speechsdk.audio.AudioConfig(filename=audio_file)

        recognizer = speechsdk.SpeechRecognizer(
            speech_config=self.speech_config,
            audio_config=audio_config
        )

        result = recognizer.recognize_once()

        if result.reason == speechsdk.ResultReason.RecognizedSpeech:
            # recognize_once does not expose a per-result confidence score
            return {
                "success": True,
                "text": result.text
            }
        elif result.reason == speechsdk.ResultReason.NoMatch:
            return {
                "success": False,
                "error": "No speech could be recognized"
            }
        else:
            return {
                "success": False,
                "error": f"Recognition failed: {result.reason}"
            }

    def transcribe_continuous(self, audio_file: str, callback) -> list:
        """Continuous transcription with callbacks."""
        audio_config = speechsdk.audio.AudioConfig(filename=audio_file)

        recognizer = speechsdk.SpeechRecognizer(
            speech_config=self.speech_config,
            audio_config=audio_config
        )

        results = []
        done = False

        def handle_result(evt):
            if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
                results.append(evt.result.text)
                callback(evt.result.text)

        def handle_stop(evt):
            nonlocal done
            done = True

        recognizer.recognized.connect(handle_result)
        recognizer.session_stopped.connect(handle_stop)
        recognizer.canceled.connect(handle_stop)

        recognizer.start_continuous_recognition()

        # Poll until the session stops or is canceled
        while not done:
            time.sleep(0.5)

        recognizer.stop_continuous_recognition()
        return results

# Usage
speech = SpeechService()
result = speech.transcribe_audio("meeting_recording.wav")
print(result["text"])

Text to Speech

def text_to_speech(text: str, output_file: str, voice: str = "en-US-JennyNeural") -> bool:
    """Convert text to speech."""
    speech_config = speechsdk.SpeechConfig(
        subscription=os.getenv("AZURE_SPEECH_KEY"),
        region=os.getenv("AZURE_SPEECH_REGION")
    )

    speech_config.speech_synthesis_voice_name = voice

    # Synthesis output requires AudioOutputConfig; AudioConfig is for audio input
    audio_config = speechsdk.audio.AudioOutputConfig(filename=output_file)

    synthesizer = speechsdk.SpeechSynthesizer(
        speech_config=speech_config,
        audio_config=audio_config
    )

    result = synthesizer.speak_text_async(text).get()

    return result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted

# Usage
success = text_to_speech(
    "Hello, welcome to Azure AI Services.",
    "output.wav",
    "en-US-GuyNeural"
)
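
For finer control over voice, rate, and pauses, the synthesizer also accepts SSML via speak_ssml_async. A minimal sketch, assuming the same environment variables as above:

def ssml_to_speech(ssml: str, output_file: str) -> bool:
    """Synthesize SSML markup to an audio file."""
    speech_config = speechsdk.SpeechConfig(
        subscription=os.getenv("AZURE_SPEECH_KEY"),
        region=os.getenv("AZURE_SPEECH_REGION")
    )
    audio_config = speechsdk.audio.AudioOutputConfig(filename=output_file)

    synthesizer = speechsdk.SpeechSynthesizer(
        speech_config=speech_config,
        audio_config=audio_config
    )

    result = synthesizer.speak_ssml_async(ssml).get()
    return result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted

ssml = """
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
  <voice name="en-US-JennyNeural">
    <prosody rate="-10%">Welcome to Azure AI Services.</prosody>
  </voice>
</speak>
"""
ssml_to_speech(ssml, "output_ssml.wav")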

Azure AI Language

Text Analytics

from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential

class LanguageService:
    def __init__(self):
        self.client = TextAnalyticsClient(
            endpoint=os.getenv("AZURE_LANGUAGE_ENDPOINT"),
            credential=AzureKeyCredential(os.getenv("AZURE_LANGUAGE_KEY"))
        )

    def analyze_sentiment(self, texts: list) -> list:
        """Analyze sentiment of texts."""
        results = self.client.analyze_sentiment(texts)

        return [
            {
                "text": texts[i][:50] + "...",
                "sentiment": result.sentiment,
                "confidence": {
                    "positive": result.confidence_scores.positive,
                    "neutral": result.confidence_scores.neutral,
                    "negative": result.confidence_scores.negative
                }
            }
            for i, result in enumerate(results)
            if not result.is_error
        ]

    def extract_entities(self, texts: list) -> list:
        """Extract named entities from texts."""
        results = self.client.recognize_entities(texts)

        all_entities = []
        for result in results:
            if not result.is_error:
                entities = [
                    {
                        "text": entity.text,
                        "category": entity.category,
                        "subcategory": entity.subcategory,
                        "confidence": entity.confidence_score
                    }
                    for entity in result.entities
                ]
                all_entities.append(entities)

        return all_entities

    def extract_key_phrases(self, texts: list) -> list:
        """Extract key phrases from texts."""
        results = self.client.extract_key_phrases(texts)

        return [
            result.key_phrases
            for result in results
            if not result.is_error
        ]

    def detect_language(self, texts: list) -> list:
        """Detect language of texts."""
        results = self.client.detect_language(texts)

        return [
            {
                "language": result.primary_language.name,
                "iso_code": result.primary_language.iso6391_name,
                "confidence": result.primary_language.confidence_score
            }
            for result in results
            if not result.is_error
        ]

# Usage
language = LanguageService()

texts = [
    "I love using Azure AI Services! The capabilities are amazing.",
    "The service was slow and the support was unhelpful."
]

# Sentiment analysis
sentiments = language.analyze_sentiment(texts)
for s in sentiments:
    print(f"Sentiment: {s['sentiment']}")

# Entity extraction
entities = language.extract_entities(texts)
for e in entities:
    print(f"Entities: {[ent['text'] for ent in e]}")

# Key phrases
phrases = language.extract_key_phrases(texts)
for p in phrases:
    print(f"Key phrases: {p}")

Question Answering

from azure.ai.language.questionanswering import QuestionAnsweringClient
from azure.core.credentials import AzureKeyCredential

class QAService:
    def __init__(self, project_name: str, deployment_name: str):
        self.client = QuestionAnsweringClient(
            endpoint=os.getenv("AZURE_LANGUAGE_ENDPOINT"),
            credential=AzureKeyCredential(os.getenv("AZURE_LANGUAGE_KEY"))
        )
        self.project = project_name
        self.deployment = deployment_name

    def answer_question(self, question: str) -> dict:
        """Get answer from knowledge base."""
        result = self.client.get_answers(
            question=question,
            project_name=self.project,
            deployment_name=self.deployment
        )

        if result.answers:
            top_answer = result.answers[0]
            return {
                "answer": top_answer.answer,
                "confidence": top_answer.confidence,
                "source": top_answer.source
            }

        return {
            "answer": "No answer found",
            "confidence": 0,
            "source": None
        }

# Usage
qa = QAService("my-kb-project", "production")
result = qa.answer_question("What are Azure Cognitive Services?")
print(f"Answer: {result['answer']}")

Combining Services

Multimodal Analysis Pipeline

class MultimodalAnalyzer:
    def __init__(self):
        self.vision = ImageAnalysisClient(
            endpoint=os.getenv("AZURE_VISION_ENDPOINT"),
            credential=AzureKeyCredential(os.getenv("AZURE_VISION_KEY"))
        )
        self.language = LanguageService()

    def analyze_image_with_context(self, image_url: str, context: str) -> dict:
        """Analyze image and combine with language analysis."""
        # Get image analysis
        image_result = self.vision.analyze_from_url(
            image_url=image_url,
            visual_features=[
                VisualFeatures.CAPTION,
                VisualFeatures.TAGS,
                VisualFeatures.READ
            ]
        )

        # Extract text from image
        extracted_text = []
        if image_result.read and image_result.read.blocks:
            for block in image_result.read.blocks:
                for line in block.lines:
                    extracted_text.append(line.text)

        # Analyze extracted text
        text_to_analyze = " ".join(extracted_text) if extracted_text else context

        sentiment = self.language.analyze_sentiment([text_to_analyze])
        entities = self.language.extract_entities([text_to_analyze])

        return {
            "image_caption": image_result.caption.text if image_result.caption else None,
            "image_tags": [t.name for t in image_result.tags.list] if image_result.tags else [],
            "extracted_text": extracted_text,
            "sentiment": sentiment[0] if sentiment else None,
            "entities": entities[0] if entities else []
        }

# Usage
analyzer = MultimodalAnalyzer()
result = analyzer.analyze_image_with_context(
    "https://example.com/document.jpg",
    "Business document analysis"
)

Conclusion

Azure AI Services provides a comprehensive suite of capabilities for building intelligent applications. The integration between vision, speech, and language services enables powerful multimodal solutions. As these services continue to evolve, we can expect even more sophisticated capabilities for enterprise AI applications.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.