
Building Speech-to-Text Applications with Azure Cognitive Services

The Azure Speech-to-Text service converts spoken audio into text with high accuracy. It supports real-time streaming, batch transcription, and customization for domain-specific vocabulary.

Speech Service Capabilities

  • Real-time transcription: Stream audio and receive text instantly
  • Batch transcription: Process audio files at scale
  • Custom models: Train for specific vocabulary and acoustics
  • Multi-language support: 100+ languages and variants (see the language-detection sketch after this list)
  • Speaker diarization: Identify who said what
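
For the multi-language case, the SDK can detect the spoken language from a short list of candidates. Here is a minimal sketch of at-start language detection; the key, region, candidate languages, and file name are placeholders:

import azure.cognitiveservices.speech as speechsdk

def transcribe_with_language_detection(audio_file: str):
    """Recognize a single utterance while auto-detecting the spoken language."""
    speech_config = speechsdk.SpeechConfig(
        subscription="your-speech-key",
        region="westus"
    )

    # Candidate languages for at-start detection (placeholders)
    auto_detect_config = speechsdk.languageconfig.AutoDetectSourceLanguageConfig(
        languages=["en-US", "es-ES", "fr-FR"]
    )

    audio_config = speechsdk.AudioConfig(filename=audio_file)
    recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config,
        auto_detect_source_language_config=auto_detect_config,
        audio_config=audio_config
    )

    result = recognizer.recognize_once()
    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        detected = speechsdk.AutoDetectSourceLanguageResult(result).language
        print(f"[{detected}] {result.text}")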

Basic Speech Recognition

import azure.cognitiveservices.speech as speechsdk

def transcribe_microphone():
    """Transcribe speech from microphone in real-time."""

    speech_config = speechsdk.SpeechConfig(
        subscription="your-speech-key",
        region="westus"
    )
    speech_config.speech_recognition_language = "en-US"

    # Use default microphone
    audio_config = speechsdk.AudioConfig(use_default_microphone=True)

    recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config,
        audio_config=audio_config
    )

    print("Speak into the microphone...")
    result = recognizer.recognize_once()

    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        print(f"Recognized: {result.text}")
    elif result.reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized")
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation = result.cancellation_details
        print(f"Canceled: {cancellation.reason}")

transcribe_microphone()
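
The subscription key and region above are placeholders; in a real application, read them from configuration rather than hard-coding them. A minimal variation using environment variables (the variable names are illustrative):

import os
import azure.cognitiveservices.speech as speechsdk

# Read credentials from the environment instead of hard-coding them
speech_config = speechsdk.SpeechConfig(
    subscription=os.environ["SPEECH_KEY"],
    region=os.environ.get("SPEECH_REGION", "westus")
)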

Continuous Recognition

import azure.cognitiveservices.speech as speechsdk
import threading

def continuous_recognition(audio_file: str):
    """Continuously recognize speech from an audio file."""

    speech_config = speechsdk.SpeechConfig(
        subscription="your-speech-key",
        region="westus"
    )

    audio_config = speechsdk.AudioConfig(filename=audio_file)

    recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config,
        audio_config=audio_config
    )

    done = threading.Event()
    all_results = []

    def handle_recognized(evt):
        """Handle recognized speech."""
        if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
            all_results.append({
                'text': evt.result.text,
                'offset': evt.result.offset,
                'duration': evt.result.duration
            })
            print(f"Recognized: {evt.result.text}")

    def handle_session_stopped(evt):
        """Handle session end."""
        print("Session stopped")
        done.set()

    def handle_canceled(evt):
        """Handle cancellation."""
        print(f"Canceled: {evt.reason}")
        done.set()

    # Connect callbacks
    recognizer.recognized.connect(handle_recognized)
    recognizer.session_stopped.connect(handle_session_stopped)
    recognizer.canceled.connect(handle_canceled)

    # Start continuous recognition
    recognizer.start_continuous_recognition()

    # Wait for completion
    done.wait()

    recognizer.stop_continuous_recognition()

    return all_results

# Transcribe audio file
results = continuous_recognition("meeting_recording.wav")
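
The offset and duration values collected above are reported by the SDK in ticks (100-nanosecond units), so convert them before display. A small helper sketch that renders a timestamped transcript from the results list:

def format_transcript(segments: list) -> str:
    """Render recognized segments as a timestamped transcript.

    Offsets and durations are in ticks (100-nanosecond units);
    divide by 10,000,000 to get seconds.
    """
    lines = []
    for segment in segments:
        start = segment['offset'] / 10_000_000
        end = start + segment['duration'] / 10_000_000
        lines.append(f"[{start:8.2f}s - {end:8.2f}s] {segment['text']}")
    return "\n".join(lines)

print(format_transcript(results))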

Real-time Streaming with WebSocket

import azure.cognitiveservices.speech as speechsdk
import json

class RealtimeTranscriber:
    def __init__(self, speech_key: str, region: str):
        self.speech_config = speechsdk.SpeechConfig(
            subscription=speech_key,
            region=region
        )
        self.speech_config.speech_recognition_language = "en-US"
        self.speech_config.enable_dictation()

        # Enable detailed results
        self.speech_config.output_format = speechsdk.OutputFormat.Detailed

    def create_push_stream(self):
        """Create a push audio stream for real-time audio."""
        stream_format = speechsdk.audio.AudioStreamFormat(
            samples_per_second=16000,
            bits_per_sample=16,
            channels=1
        )
        return speechsdk.audio.PushAudioInputStream(stream_format)

    def transcribe_stream(self, push_stream):
        """Transcribe audio from a push stream."""
        audio_config = speechsdk.AudioConfig(stream=push_stream)

        recognizer = speechsdk.SpeechRecognizer(
            speech_config=self.speech_config,
            audio_config=audio_config
        )

        # Handle interim results
        def handle_recognizing(evt):
            print(f"Interim: {evt.result.text}", end='\r')

        def handle_recognized(evt):
            if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
                print(f"\nFinal: {evt.result.text}")
                # Parse the detailed JSON result for the top hypothesis confidence
                detailed = json.loads(evt.result.json)
                best = detailed.get("NBest", [{}])[0]
                print(f"Confidence: {best.get('Confidence')}")

        recognizer.recognizing.connect(handle_recognizing)
        recognizer.recognized.connect(handle_recognized)

        recognizer.start_continuous_recognition()

        return recognizer

# Example: Feeding audio data
transcriber = RealtimeTranscriber("your-key", "westus")
push_stream = transcriber.create_push_stream()

# In a real app, you would feed audio chunks:
# push_stream.write(audio_chunk)
# push_stream.close() when done
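
To feed live microphone audio into the push stream, any capture library that yields 16 kHz, 16-bit mono PCM will do. A sketch using PyAudio with the RealtimeTranscriber above; the chunk size and capture length are arbitrary choices:

import pyaudio

def stream_microphone(transcriber: RealtimeTranscriber, seconds: int = 30):
    """Capture microphone audio with PyAudio and push it to the recognizer."""
    push_stream = transcriber.create_push_stream()
    recognizer = transcriber.transcribe_stream(push_stream)

    pa = pyaudio.PyAudio()
    mic = pa.open(format=pyaudio.paInt16, channels=1, rate=16000,
                  input=True, frames_per_buffer=1600)  # 100 ms chunks

    try:
        for _ in range(seconds * 10):
            push_stream.write(mic.read(1600))
    finally:
        mic.stop_stream()
        mic.close()
        pa.terminate()
        push_stream.close()  # signal end of audio
        recognizer.stop_continuous_recognition()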

Batch Transcription

import requests
import json
import time

class BatchTranscriber:
    def __init__(self, speech_key: str, region: str):
        self.base_url = f"https://{region}.api.cognitive.microsoft.com/speechtotext/v3.0"
        self.headers = {
            "Ocp-Apim-Subscription-Key": speech_key,
            "Content-Type": "application/json"
        }

    def submit_transcription(self, audio_urls: list, locale: str = "en-US") -> str:
        """Submit a batch transcription job."""

        payload = {
            "contentUrls": audio_urls,
            "locale": locale,
            "displayName": f"Batch transcription {time.time()}",
            "properties": {
                "wordLevelTimestampsEnabled": True,
                "punctuationMode": "DictatedAndAutomatic",
                "profanityFilterMode": "Masked",
                "diarizationEnabled": True,
                "timeToLive": "PT12H"
            }
        }

        response = requests.post(
            f"{self.base_url}/transcriptions",
            headers=self.headers,
            json=payload
        )

        if response.status_code == 201:
            transcription = response.json()
            return transcription["self"]
        else:
            raise Exception(f"Failed to submit: {response.text}")

    def get_transcription_status(self, transcription_url: str) -> dict:
        """Get the status of a transcription job."""
        response = requests.get(transcription_url, headers=self.headers)
        return response.json()

    def get_results(self, transcription_url: str) -> list:
        """Get transcription results when complete."""
        # Get files URL
        status = self.get_transcription_status(transcription_url)

        if status["status"] != "Succeeded":
            raise Exception(f"Transcription not complete: {status['status']}")

        # Get results files
        files_url = f"{transcription_url}/files"
        response = requests.get(files_url, headers=self.headers)
        files = response.json()

        results = []
        for file_info in files["values"]:
            if file_info["kind"] == "Transcription":
                content_url = file_info["links"]["contentUrl"]
                content = requests.get(content_url).json()
                results.append(content)

        return results

    def transcribe_and_wait(self, audio_urls: list, poll_interval: int = 30) -> list:
        """Submit transcription and wait for results."""
        transcription_url = self.submit_transcription(audio_urls)
        print(f"Transcription submitted: {transcription_url}")

        while True:
            status = self.get_transcription_status(transcription_url)
            print(f"Status: {status['status']}")

            if status["status"] == "Succeeded":
                return self.get_results(transcription_url)
            elif status["status"] == "Failed":
                raise Exception(f"Transcription failed: {status}")

            time.sleep(poll_interval)

# Usage
transcriber = BatchTranscriber("your-key", "westus")
results = transcriber.transcribe_and_wait([
    "https://storage.blob.core.windows.net/audio/meeting1.wav",
    "https://storage.blob.core.windows.net/audio/meeting2.wav"
])

for result in results:
    for segment in result["combinedRecognizedPhrases"]:
        print(segment["display"])

Custom Speech Models

# custom_speech.py
import requests

class CustomSpeechTrainer:
    def __init__(self, speech_key: str, region: str):
        self.base_url = f"https://{region}.api.cognitive.microsoft.com/speechtotext/v3.0"
        self.headers = {
            "Ocp-Apim-Subscription-Key": speech_key,
            "Content-Type": "application/json"
        }

    def create_dataset(self, name: str, audio_url: str, transcript_url: str) -> str:
        """Create a training dataset."""
        payload = {
            "displayName": name,
            "locale": "en-US",
            "kind": "Acoustic",
            "contentUrl": audio_url,
            "properties": {
                "transcriptionContentUrl": transcript_url
            }
        }

        response = requests.post(
            f"{self.base_url}/datasets",
            headers=self.headers,
            json=payload
        )

        return response.json()["self"]

    def train_model(self, name: str, dataset_url: str, base_model_url: str) -> str:
        """Train a custom acoustic model."""
        payload = {
            "displayName": name,
            "locale": "en-US",
            "datasets": [{"self": dataset_url}],
            "baseModel": {"self": base_model_url}
        }

        response = requests.post(
            f"{self.base_url}/models",
            headers=self.headers,
            json=payload
        )

        return response.json()["self"]

    def create_endpoint(self, name: str, model_url: str) -> str:
        """Deploy a custom model to an endpoint."""
        payload = {
            "displayName": name,
            "locale": "en-US",
            "model": {"self": model_url}
        }

        response = requests.post(
            f"{self.base_url}/endpoints",
            headers=self.headers,
            json=payload
        )

        return response.json()["self"]

Integration with Azure Functions

# function_app.py
import azure.functions as func
import azure.cognitiveservices.speech as speechsdk
import json
import os

app = func.FunctionApp()

@app.function_name(name="TranscribeAudio")
@app.route(route="transcribe", methods=["POST"])
@app.blob_input(arg_name="audioBlob",
                path="audio/{filename}",
                connection="AzureWebJobsStorage")
def transcribe_audio(req: func.HttpRequest, audioBlob: func.InputStream) -> func.HttpResponse:
    """HTTP-triggered function to transcribe audio."""

    speech_config = speechsdk.SpeechConfig(
        subscription=os.environ["SPEECH_KEY"],
        region=os.environ["SPEECH_REGION"]
    )

    # Write blob to temp file
    import tempfile
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        f.write(audioBlob.read())
        temp_path = f.name

    audio_config = speechsdk.AudioConfig(filename=temp_path)
    recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config,
        audio_config=audio_config
    )

    # Single-shot recognition returns after the first utterance;
    # use continuous or batch transcription for longer recordings
    result = recognizer.recognize_once()

    # Clean up
    os.remove(temp_path)

    return func.HttpResponse(
        json.dumps({"text": result.text}),
        mimetype="application/json"
    )

Best Practices

  1. Audio Quality: Use 16kHz, 16-bit, mono audio
  2. Noise Handling: Pre-process audio to reduce background noise
  3. Custom Models: Train for domain-specific terminology
  4. Error Handling: Implement retry logic for network issues (see the sketch after this list)
  5. Streaming: Use continuous recognition for long audio
  6. Batch Processing: Use batch API for large volumes
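
For the retry point above, a simple exponential-backoff wrapper around the REST calls goes a long way. A generic sketch; the status codes and delays are starting points, not recommendations:

import time
import requests

def with_retries(request_fn, max_attempts: int = 5, base_delay: float = 1.0):
    """Retry an HTTP call with exponential backoff on transient failures."""
    response = None
    for attempt in range(1, max_attempts + 1):
        try:
            response = request_fn()
            if response.status_code not in (429, 500, 502, 503, 504):
                return response
        except requests.RequestException as exc:
            if attempt == max_attempts:
                raise
            print(f"Attempt {attempt} failed: {exc}")
        time.sleep(base_delay * 2 ** (attempt - 1))
    return response

# Example: wrap a batch status poll
# status = with_retries(lambda: requests.get(transcription_url, headers=headers)).json()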

Azure Speech-to-Text enables powerful voice-driven applications with industry-leading accuracy and flexibility.

Michael John Pena

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.