
Azure AI Speech: Building Voice-Enabled AI Applications

Voice interfaces are becoming increasingly important for AI applications. Azure AI Speech provides comprehensive capabilities for speech recognition, synthesis, and real-time translation.
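The examples below use the Speech SDK for Python (pip install azure-cognitiveservices-speech). A minimal setup sketch, assuming your key and region live in environment variables named SPEECH_KEY and SPEECH_REGION:

import os
import azure.cognitiveservices.speech as speechsdk

# Both values come from your Speech resource in the Azure portal.
speech_config = speechsdk.SpeechConfig(
    subscription=os.environ["SPEECH_KEY"],
    region=os.environ["SPEECH_REGION"]
)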

Implementing Speech-to-Text

Build real-time speech recognition:

import azure.cognitiveservices.speech as speechsdk
from typing import Callable

class SpeechRecognizer:
    def __init__(self, speech_key: str, speech_region: str):
        self.speech_config = speechsdk.SpeechConfig(
            subscription=speech_key,
            region=speech_region
        )
        self.speech_config.speech_recognition_language = "en-US"

    def recognize_from_microphone(self, on_recognized: Callable[[str], None]) -> speechsdk.SpeechRecognizer:
        """Start continuous speech recognition from the default microphone.

        Returns the recognizer so the caller can keep it alive and call
        stop_continuous_recognition() when finished; otherwise the object
        is garbage-collected and recognition stops.
        """

        audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)
        recognizer = speechsdk.SpeechRecognizer(
            speech_config=self.speech_config,
            audio_config=audio_config
        )

        def handle_result(evt):
            # Fires once per finalized utterance; interim hypotheses arrive
            # on the separate recognizing event.
            if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
                on_recognized(evt.result.text)

        recognizer.recognized.connect(handle_result)
        recognizer.start_continuous_recognition()
        return recognizer

    def recognize_from_file(self, audio_file: str) -> str:
        """Recognize a single utterance from an audio file."""

        audio_config = speechsdk.audio.AudioConfig(filename=audio_file)
        recognizer = speechsdk.SpeechRecognizer(
            speech_config=self.speech_config,
            audio_config=audio_config
        )

        # recognize_once() stops at the first pause, so it suits short
        # clips; use continuous recognition for longer audio.
        result = recognizer.recognize_once()

        if result.reason == speechsdk.ResultReason.RecognizedSpeech:
            return result.text
        elif result.reason == speechsdk.ResultReason.NoMatch:
            return ""
        else:
            raise Exception(f"Speech recognition failed: {result.reason}")
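A quick usage sketch; the key, region, and file name here are placeholders:

# Hypothetical usage: print each finalized utterance until Enter is pressed.
recognizer = SpeechRecognizer(speech_key="<your-key>", speech_region="<your-region>")

session = recognizer.recognize_from_microphone(on_recognized=print)
input("Listening... press Enter to stop.\n")
session.stop_continuous_recognition()

# One-shot recognition from a short WAV file.
print(recognizer.recognize_from_file("question.wav"))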

Text-to-Speech with Neural Voices

Generate natural-sounding speech:

class SpeechSynthesizer:
    def __init__(self, speech_key: str, speech_region: str):
        self.speech_config = speechsdk.SpeechConfig(
            subscription=speech_key,
            region=speech_region
        )
        self.speech_config.speech_synthesis_voice_name = "en-US-JennyNeural"

    def synthesize_to_speaker(self, text: str) -> None:
        """Synthesize speech and play through speaker."""

        # With no audio_config, output plays through the default speaker.
        synthesizer = speechsdk.SpeechSynthesizer(speech_config=self.speech_config)
        result = synthesizer.speak_text_async(text).get()

        if result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
            raise Exception(f"Speech synthesis failed: {result.reason}")

    def synthesize_to_file(self, text: str, output_file: str) -> None:
        """Synthesize speech to audio file."""

        # Output targets use AudioOutputConfig; plain AudioConfig is input-only.
        audio_config = speechsdk.audio.AudioOutputConfig(filename=output_file)
        synthesizer = speechsdk.SpeechSynthesizer(
            speech_config=self.speech_config,
            audio_config=audio_config
        )

        result = synthesizer.speak_text_async(text).get()

        if result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
            raise Exception(f"Speech synthesis failed: {result.reason}")

    def synthesize_with_ssml(self, ssml: str, output_file: str) -> None:
        """Synthesize with SSML for advanced control."""

        # Output targets use AudioOutputConfig; plain AudioConfig is input-only.
        audio_config = speechsdk.audio.AudioOutputConfig(filename=output_file)
        synthesizer = speechsdk.SpeechSynthesizer(
            speech_config=self.speech_config,
            audio_config=audio_config
        )

        result = synthesizer.speak_ssml_async(ssml).get()

        if result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
            raise Exception(f"Speech synthesis failed: {result.reason}")
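SSML gives fine-grained control over voice, rate, pitch, and pauses. A sketch of a call, with illustrative markup values:

synthesizer = SpeechSynthesizer(speech_key="<your-key>", speech_region="<your-region>")

ssml = """
<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'>
  <voice name='en-US-JennyNeural'>
    <prosody rate='-10%' pitch='+2st'>Welcome back.</prosody>
    <break time='500ms'/>
    Your report is ready.
  </voice>
</speak>
"""

synthesizer.synthesize_with_ssml(ssml, "welcome.wav")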

Building a Voice-Enabled AI Assistant

Combine speech recognition with Azure OpenAI for conversational AI:

class VoiceAIAssistant:
    def __init__(self, recognizer: SpeechRecognizer, synthesizer: SpeechSynthesizer, openai_client):
        self.recognizer = recognizer
        self.synthesizer = synthesizer
        self.openai_client = openai_client

    def process_voice_query(self, audio_file: str) -> str:
        """Process a voice query and respond with synthesized speech."""

        text = self.recognizer.recognize_from_file(audio_file)
        if not text:
            return ""  # nothing was recognized, so there is nothing to answer

        response = self.openai_client.chat.completions.create(
            model="gpt-4",  # for Azure OpenAI this is your deployment name
            messages=[{"role": "user", "content": text}]
        )
        answer = response.choices[0].message.content
        self.synthesizer.synthesize_to_speaker(answer)
        return answer
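Wiring it together might look like the sketch below; the endpoint, API version, and key names are assumptions to replace with your own, and the model argument above should match your Azure OpenAI deployment name:

from openai import AzureOpenAI

# Hypothetical endpoint and API version; substitute your own resource values.
openai_client = AzureOpenAI(
    api_key="<your-openai-key>",
    api_version="2024-02-01",
    azure_endpoint="https://<your-resource>.openai.azure.com"
)

assistant = VoiceAIAssistant(
    recognizer=SpeechRecognizer("<speech-key>", "<speech-region>"),
    synthesizer=SpeechSynthesizer("<speech-key>", "<speech-region>"),
    openai_client=openai_client
)

print(assistant.process_voice_query("question.wav"))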

Voice interfaces make AI applications accessible to broader audiences and enable hands-free interaction scenarios.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.