
Speech-to-Text Improvements in Azure AI: Accuracy and Performance

Introduction

Azure AI Speech’s speech-to-text capabilities have seen significant improvements in accuracy, especially for challenging scenarios like noisy environments, accented speech, and domain-specific vocabulary. This post explores these improvements and how to leverage them effectively.

Improved Accuracy Features

Noise Handling and Enhancement
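
Much of the noise robustness now happens service-side in the updated models, so on the client the main levers are segmentation and silence timeouts. The class below tunes those, optionally enables audio logging for diagnostics, and pulls confidence scores and N-best alternatives out of the detailed JSON result.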

import json
import os

import azure.cognitiveservices.speech as speechsdk

class EnhancedSpeechRecognizer:
    def __init__(self):
        self.speech_config = speechsdk.SpeechConfig(
            subscription=os.getenv("AZURE_SPEECH_KEY"),
            region=os.getenv("AZURE_SPEECH_REGION")
        )

        # Tune segmentation so phrases aren't cut short in noisy audio
        self.speech_config.set_property(
            speechsdk.PropertyId.Speech_SegmentationSilenceTimeoutMs,
            "2000"  # 2 seconds silence detection
        )

    def recognize_with_noise_suppression(
        self,
        audio_file: str,
        enable_audio_logging: bool = False
    ) -> dict:
        """Recognize speech with enhanced noise handling"""

        # Configure for noisy audio
        self.speech_config.set_property(
            speechsdk.PropertyId.SpeechServiceConnection_EndSilenceTimeoutMs,
            "1500"
        )

        if enable_audio_logging:
            self.speech_config.enable_audio_logging()

        audio_config = speechsdk.AudioConfig(filename=audio_file)
        recognizer = speechsdk.SpeechRecognizer(
            speech_config=self.speech_config,
            audio_config=audio_config
        )

        result = recognizer.recognize_once()

        return {
            "text": result.text if result.reason == speechsdk.ResultReason.RecognizedSpeech else "",
            "reason": str(result.reason),
            "confidence": self._extract_confidence(result)
        }

    def _extract_confidence(self, result) -> float:
        """Extract confidence score from result"""
        try:
            json_result = result.properties.get(
                speechsdk.PropertyId.SpeechServiceResponse_JsonResult
            )
            if json_result:
                data = json.loads(json_result)
                if "NBest" in data and len(data["NBest"]) > 0:
                    return data["NBest"][0].get("Confidence", 0.0)
        except Exception:
            pass
        return 0.0

    def recognize_with_detailed_results(self, audio_file: str) -> dict:
        """Get detailed recognition results including alternatives"""
        self.speech_config.output_format = speechsdk.OutputFormat.Detailed

        audio_config = speechsdk.AudioConfig(filename=audio_file)
        recognizer = speechsdk.SpeechRecognizer(
            speech_config=self.speech_config,
            audio_config=audio_config
        )

        result = recognizer.recognize_once()

        detailed = {
            "best_text": result.text,
            "alternatives": [],
            "word_timings": []
        }

        if result.reason == speechsdk.ResultReason.RecognizedSpeech:
            json_result = result.properties.get(
                speechsdk.PropertyId.SpeechServiceResponse_JsonResult
            )
            if json_result:
                data = json.loads(json_result)

                # Get N-best alternatives
                if "NBest" in data:
                    for alt in data["NBest"]:
                        detailed["alternatives"].append({
                            "text": alt.get("Display", ""),
                            "confidence": alt.get("Confidence", 0),
                            "lexical": alt.get("Lexical", "")
                        })

                        # Get word timings from first alternative
                        if "Words" in alt and not detailed["word_timings"]:
                            for word in alt["Words"]:
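                                # Offset/Duration are in 100-nanosecond ticks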
                                detailed["word_timings"].append({
                                    "word": word.get("Word", ""),
                                    "offset": word.get("Offset", 0),
                                    "duration": word.get("Duration", 0)
                                })

        return detailed

# Usage
recognizer = EnhancedSpeechRecognizer()

# Basic recognition with noise suppression
result = recognizer.recognize_with_noise_suppression("noisy_audio.wav")
print(f"Text: {result['text']}")
print(f"Confidence: {result['confidence']:.2%}")

# Detailed results
detailed = recognizer.recognize_with_detailed_results("audio.wav")
print(f"Best: {detailed['best_text']}")
print("Alternatives:")
for alt in detailed["alternatives"][:3]:
    print(f"  {alt['text']} ({alt['confidence']:.2%})")
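
Note that recognize_once returns after the first recognized utterance, so it suits short clips. For longer files, continuous recognition is the usual pattern. Here's a minimal sketch reusing the same speech config (recognize_continuous and long_recording.wav are illustrative names, not SDK ones):

import time

def recognize_continuous(speech_config, audio_file: str) -> list:
    """Collect every recognized segment from a longer audio file."""
    audio_config = speechsdk.AudioConfig(filename=audio_file)
    recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config,
        audio_config=audio_config
    )

    segments = []
    done = False

    def on_recognized(evt):
        if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
            segments.append(evt.result.text)

    def on_stopped(evt):
        nonlocal done
        done = True

    # Stop polling once the session ends (end of file) or is canceled
    recognizer.recognized.connect(on_recognized)
    recognizer.session_stopped.connect(on_stopped)
    recognizer.canceled.connect(on_stopped)

    recognizer.start_continuous_recognition()
    while not done:
        time.sleep(0.5)
    recognizer.stop_continuous_recognition()

    return segments

segments = recognize_continuous(recognizer.speech_config, "long_recording.wav")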

Custom Speech Models for Domain Accuracy
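
Phrase lists and custom models are the main levers for domain-specific vocabulary. The helper below sketches how to prepare training data and phrase/pronunciation files; the dict payloads are simplified for illustration rather than the exact Custom Speech REST contract.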

class CustomSpeechTrainer:
    """Train custom speech models for improved domain accuracy"""

    def __init__(self):
        self.speech_config = speechsdk.SpeechConfig(
            subscription=os.getenv("AZURE_SPEECH_KEY"),
            region=os.getenv("AZURE_SPEECH_REGION")
        )
        # Custom Speech API endpoint would be different
        self.custom_speech_endpoint = os.getenv("CUSTOM_SPEECH_ENDPOINT")

    def prepare_training_data(
        self,
        transcripts: list,
        audio_files: list
    ) -> dict:
        """Prepare data format for custom speech training

        Args:
            transcripts: List of text transcripts
            audio_files: List of corresponding audio file paths
        """
        # Training data format
        training_data = []
        for transcript, audio in zip(transcripts, audio_files):
            training_data.append({
                "audio_url": audio,
                "transcript": transcript,
                "locale": "en-US"
            })

        return {
            "data_type": "acoustic",
            "training_data": training_data
        }

    def prepare_language_data(self, sentences: list, phrases: list) -> dict:
        """Prepare language model training data

        Args:
            sentences: Domain-specific sentences
            phrases: Important phrases and terms
        """
        return {
            "data_type": "language",
            "sentences": sentences,
            "phrases": phrases,
            "locale": "en-US"
        }

    def create_phrase_list_file(
        self,
        phrases: list,
        output_path: str
    ):
        """Create phrase list file for upload"""
        with open(output_path, "w", encoding="utf-8") as f:
            for phrase in phrases:
                f.write(f"{phrase}\n")

    def create_pronunciation_file(
        self,
        pronunciations: list,
        output_path: str
    ):
        """Create custom pronunciation file

        Args:
            pronunciations: List of {"word": "...", "pronunciation": "..."}
        """
        with open(output_path, "w", encoding="utf-8") as f:
            for item in pronunciations:
                f.write(f"{item['word']}\t{item['pronunciation']}\n")

# Example usage for medical domain
trainer = CustomSpeechTrainer()

# Medical phrases for better recognition
medical_phrases = [
    "myocardial infarction",
    "electrocardiogram",
    "systolic blood pressure",
    "diastolic blood pressure",
    "magnetic resonance imaging",
    "computed tomography",
    "prothrombin time",
    "international normalized ratio"
]

trainer.create_phrase_list_file(medical_phrases, "medical_phrases.txt")

# Custom pronunciations: display form mapped to its spoken form
pronunciations = [
    {"word": "COVID-19", "pronunciation": "covid nineteen"},
    {"word": "mRNA", "pronunciation": "em are en ay"},
    {"word": "HIPAA", "pronunciation": "hip uh"}
]

trainer.create_pronunciation_file(pronunciations, "pronunciations.txt")
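
Custom model training takes time; for an immediate lift you can also attach a phrase list to a recognizer at runtime. A short sketch using the SDK's PhraseListGrammar (recognizer_with_phrase_list and dictation.wav are illustrative names):

def recognizer_with_phrase_list(speech_config, audio_file: str, phrases: list):
    """Bias recognition toward known terms without retraining a model."""
    audio_config = speechsdk.AudioConfig(filename=audio_file)
    recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config,
        audio_config=audio_config
    )

    # Runtime phrase lists apply to this recognizer only; no training needed
    phrase_list = speechsdk.PhraseListGrammar.from_recognizer(recognizer)
    for phrase in phrases:
        phrase_list.addPhrase(phrase)

    return recognizer

# Bias a recognizer toward the medical vocabulary defined above
reco = recognizer_with_phrase_list(
    trainer.speech_config, "dictation.wav", medical_phrases
)
result = reco.recognize_once()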

Batch Transcription for Large Audio Files
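
For long recordings or many files at once, the batch transcription REST API (v3.1) is a better fit than the real-time SDK: submit audio URLs, poll for completion, then download the result files.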

import requests
import time
from typing import List, Optional

class BatchTranscriber:
    """Batch transcription for processing large audio files"""

    def __init__(self):
        self.api_key = os.getenv("AZURE_SPEECH_KEY")
        self.region = os.getenv("AZURE_SPEECH_REGION")
        self.base_url = f"https://{self.region}.api.cognitive.microsoft.com/speechtotext/v3.1"

    def create_transcription(
        self,
        audio_urls: List[str],
        locale: str = "en-US",
        model_id: Optional[str] = None,
        properties: dict = None
    ) -> str:
        """Create a batch transcription job

        Args:
            audio_urls: List of audio file URLs (must be accessible)
            locale: Language locale
            model_id: Custom model ID (optional)
            properties: Additional properties

        Returns:
            Transcription job ID
        """
        url = f"{self.base_url}/transcriptions"
        headers = {
            "Ocp-Apim-Subscription-Key": self.api_key,
            "Content-Type": "application/json"
        }

        body = {
            "contentUrls": audio_urls,
            "locale": locale,
            "displayName": f"Batch transcription {time.strftime('%Y%m%d-%H%M%S')}",
            "properties": properties or {
                "wordLevelTimestampsEnabled": True,
                "punctuationMode": "DictatedAndAutomatic",
                "profanityFilterMode": "Masked",
                "diarizationEnabled": True,
                "timeToLive": "PT12H"  # Results available for 12 hours
            }
        }

        if model_id:
            body["model"] = {"self": f"{self.base_url}/models/{model_id}"}

        response = requests.post(url, headers=headers, json=body)
        response.raise_for_status()

        data = response.json()
        return data["self"].split("/")[-1]

    def get_transcription_status(self, transcription_id: str) -> dict:
        """Get status of transcription job"""
        url = f"{self.base_url}/transcriptions/{transcription_id}"
        headers = {"Ocp-Apim-Subscription-Key": self.api_key}

        response = requests.get(url, headers=headers)
        response.raise_for_status()

        return response.json()

    def wait_for_completion(
        self,
        transcription_id: str,
        poll_interval: int = 30,
        timeout: int = 3600
    ) -> dict:
        """Wait for transcription to complete"""
        start_time = time.time()

        while True:
            status = self.get_transcription_status(transcription_id)

            if status["status"] == "Succeeded":
                return status
            elif status["status"] == "Failed":
                raise Exception(f"Transcription failed: {status}")

            if time.time() - start_time > timeout:
                raise TimeoutError("Transcription timed out")

            print(f"Status: {status['status']}, waiting...")
            time.sleep(poll_interval)

    def get_transcription_results(self, transcription_id: str) -> List[dict]:
        """Get transcription results"""
        url = f"{self.base_url}/transcriptions/{transcription_id}/files"
        headers = {"Ocp-Apim-Subscription-Key": self.api_key}

        response = requests.get(url, headers=headers)
        response.raise_for_status()

        files = response.json()["values"]
        results = []

        for file in files:
            if file["kind"] == "Transcription":
                # contentUrl is a pre-signed SAS URL; no auth header needed
                content_response = requests.get(file["links"]["contentUrl"])
                results.append(content_response.json())

        return results

    def transcribe_and_wait(
        self,
        audio_urls: List[str],
        locale: str = "en-US"
    ) -> List[dict]:
        """Convenience method to transcribe and wait for results"""
        job_id = self.create_transcription(audio_urls, locale)
        print(f"Created transcription job: {job_id}")

        self.wait_for_completion(job_id)
        return self.get_transcription_results(job_id)

# Usage
batch = BatchTranscriber()

# Transcribe multiple files
audio_urls = [
    "https://storage.blob.core.windows.net/audio/meeting1.wav",
    "https://storage.blob.core.windows.net/audio/meeting2.wav"
]

results = batch.transcribe_and_wait(audio_urls)

for result in results:
    print(f"Source: {result['source']}")
    # Speaker labels live on recognizedPhrases entries;
    # combinedRecognizedPhrases only carries per-channel merged text
    for phrase in result.get("recognizedPhrases", []):
        speaker = phrase.get("speaker", "unknown")
        text = phrase["nBest"][0]["display"]
        print(f"  Speaker {speaker}: {text}")
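
Jobs and their result files persist until the timeToLive expires, so once you've downloaded the output it's worth deleting the job explicitly. A small sketch against the same v3.1 endpoint (delete_transcription is a helper added here, not a method of the class above):

def delete_transcription(batch: BatchTranscriber, transcription_id: str):
    """Delete a completed transcription job and its stored results."""
    url = f"{batch.base_url}/transcriptions/{transcription_id}"
    headers = {"Ocp-Apim-Subscription-Key": batch.api_key}
    response = requests.delete(url, headers=headers)
    response.raise_for_status()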

Speaker Diarization
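
Diarization attaches a speaker label to each recognized segment. The SDK's ConversationTranscriber handles this in near real time: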

class SpeakerDiarization:
    """Identify different speakers in audio"""

    def __init__(self):
        self.speech_config = speechsdk.SpeechConfig(
            subscription=os.getenv("AZURE_SPEECH_KEY"),
            region=os.getenv("AZURE_SPEECH_REGION")
        )

    def transcribe_with_diarization(self, audio_file: str) -> List[dict]:
        """Transcribe audio with speaker identification; the service
        infers the number of speakers automatically."""
        audio_config = speechsdk.AudioConfig(filename=audio_file)

        # Create conversation transcriber
        transcriber = speechsdk.transcription.ConversationTranscriber(
            speech_config=self.speech_config,
            audio_config=audio_config
        )

        results = []
        done = False

        def handle_transcribed(evt):
            if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
                results.append({
                    "speaker_id": evt.result.speaker_id,
                    "text": evt.result.text,
                    "offset": evt.result.offset,
                    "duration": evt.result.duration
                })

        def handle_stopped(evt):
            nonlocal done
            done = True

        transcriber.transcribed.connect(handle_transcribed)
        transcriber.canceled.connect(handle_stopped)
        transcriber.session_stopped.connect(handle_stopped)

        transcriber.start_transcribing_async().get()

        while not done:
            time.sleep(0.5)

        transcriber.stop_transcribing_async().get()

        return results

    def format_conversation(self, results: List[dict]) -> str:
        """Format diarized results as conversation"""
        formatted = []
        current_speaker = None

        for segment in sorted(results, key=lambda x: x["offset"]):
            speaker = segment["speaker_id"]

            if speaker != current_speaker:
                current_speaker = speaker
                formatted.append(f"\n[Speaker {speaker}]")

            formatted.append(segment["text"])

        return " ".join(formatted)

# Usage
diarizer = SpeakerDiarization()

results = diarizer.transcribe_with_diarization("meeting.wav")
conversation = diarizer.format_conversation(results)
print(conversation)

Conclusion

The latest speech-to-text improvements in Azure AI provide significantly enhanced accuracy for real-world scenarios. By leveraging noise suppression, custom models, batch transcription, and speaker diarization, you can build robust transcription solutions for challenging use cases. The combination of real-time and batch processing options ensures flexibility for various application requirements.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.