
Azure Cognitive Services Speech - Voice-Enabled Applications

Azure Cognitive Services Speech provides speech-to-text, text-to-speech, speech translation, and speaker recognition. These services let developers build voice-enabled applications with natural language interaction.

Setting Up Speech Services

# Create Speech resource
az cognitiveservices account create \
    --name myspeechservice \
    --resource-group myResourceGroup \
    --kind SpeechServices \
    --sku S0 \
    --location eastus

# Get keys
az cognitiveservices account keys list \
    --name myspeechservice \
    --resource-group myResourceGroup
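
With the resource in place, install the SDK with pip install azure-cognitiveservices-speech. Rather than hard-coding the key, it can be read from the environment; this is a minimal sketch, and the variable names SPEECH_KEY and SPEECH_REGION are only a convention, not required by the SDK:

import os
import azure.cognitiveservices.speech as speechsdk

# Read credentials from environment variables (names are illustrative)
speech_config = speechsdk.SpeechConfig(
    subscription=os.environ["SPEECH_KEY"],
    region=os.environ["SPEECH_REGION"]
)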

Speech-to-Text

Python SDK

import azure.cognitiveservices.speech as speechsdk
import json

# Configure speech service
speech_config = speechsdk.SpeechConfig(
    subscription="your-subscription-key",
    region="eastus"
)

# Set recognition language
speech_config.speech_recognition_language = "en-US"

# Enable detailed output
speech_config.output_format = speechsdk.OutputFormat.Detailed

# Create recognizer with microphone
audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)
speech_recognizer = speechsdk.SpeechRecognizer(
    speech_config=speech_config,
    audio_config=audio_config
)

def recognize_once():
    """Single utterance recognition."""
    print("Say something...")
    result = speech_recognizer.recognize_once()

    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        print(f"Recognized: {result.text}")

        # Get detailed results
        detailed = json.loads(result.json)
        print(f"Confidence: {detailed['NBest'][0]['Confidence']}")

    elif result.reason == speechsdk.ResultReason.NoMatch:
        print(f"No speech recognized: {result.no_match_details}")

    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation = result.cancellation_details
        print(f"Canceled: {cancellation.reason}")


def continuous_recognition():
    """Continuous speech recognition."""
    done = False

    def recognized_handler(evt):
        if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
            print(f"RECOGNIZED: {evt.result.text}")

    def session_stopped_handler(evt):
        nonlocal done
        print("Session stopped")
        done = True

    # Connect callbacks
    speech_recognizer.recognized.connect(recognized_handler)
    speech_recognizer.session_stopped.connect(session_stopped_handler)
    speech_recognizer.canceled.connect(session_stopped_handler)

    # Start continuous recognition
    speech_recognizer.start_continuous_recognition()

    import time
    while not done:
        time.sleep(0.5)  # avoid busy-waiting while recognition runs

    speech_recognizer.stop_continuous_recognition()


# Recognition from audio file
def recognize_from_file(audio_file_path):
    """Recognize speech from audio file."""
    audio_config = speechsdk.audio.AudioConfig(filename=audio_file_path)
    recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config,
        audio_config=audio_config
    )

    result = recognizer.recognize_once()
    return result.text
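
A minimal way to try the helpers above (the WAV filename is only illustrative):

if __name__ == "__main__":
    recognize_once()                              # single utterance from the microphone
    # continuous_recognition()                    # keep listening until the session stops
    # print(recognize_from_file("sample.wav"))    # transcribe an audio file instead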

Real-Time Transcription with WebSocket

import asyncio
import websockets
import json

async def transcribe_stream():
    """Real-time streaming transcription."""
    speech_config = speechsdk.SpeechConfig(
        subscription="your-subscription-key",
        region="eastus"
    )

    # Configure for streaming
    push_stream = speechsdk.audio.PushAudioInputStream()
    audio_config = speechsdk.audio.AudioConfig(stream=push_stream)

    recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config,
        audio_config=audio_config
    )

    all_results = []

    def recognizing_handler(evt):
        """Handle partial results (interim)."""
        print(f"RECOGNIZING: {evt.result.text}")

    def recognized_handler(evt):
        """Handle final results."""
        if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
            all_results.append(evt.result.text)
            print(f"RECOGNIZED: {evt.result.text}")

    recognizer.recognizing.connect(recognizing_handler)
    recognizer.recognized.connect(recognized_handler)

    recognizer.start_continuous_recognition_async().get()

    # Simulate streaming audio (replace with actual audio source)
    async with websockets.connect("wss://your-audio-source") as ws:
        try:
            async for audio_chunk in ws:
                push_stream.write(audio_chunk)
        finally:
            # Signal end of stream so the recognizer can flush final results
            push_stream.close()

    recognizer.stop_continuous_recognition_async().get()
    return " ".join(all_results)
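
The push stream above uses the SDK's default input format (16 kHz, 16-bit, mono PCM). If the audio source delivers a different format, it can be declared when creating the stream; the sketch below assumes 8 kHz mono input:

# Declare a non-default input format for the push stream
stream_format = speechsdk.audio.AudioStreamFormat(
    samples_per_second=8000,
    bits_per_sample=16,
    channels=1
)
push_stream = speechsdk.audio.PushAudioInputStream(stream_format=stream_format)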

Text-to-Speech

import azure.cognitiveservices.speech as speechsdk

# Configure TTS
speech_config = speechsdk.SpeechConfig(
    subscription="your-subscription-key",
    region="eastus"
)

# Set voice
speech_config.speech_synthesis_voice_name = "en-US-JennyNeural"

def text_to_speech(text):
    """Convert text to speech."""
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)

    result = synthesizer.speak_text_async(text).get()

    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print("Speech synthesized successfully")
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation = result.cancellation_details
        print(f"Synthesis canceled: {cancellation.reason}")


def text_to_speech_with_ssml(ssml):
    """Use SSML for advanced control."""
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)
    result = synthesizer.speak_ssml_async(ssml).get()
    return result


# SSML example
ssml = """
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"
       xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="en-US">
    <voice name="en-US-JennyNeural">
        <mstts:express-as style="cheerful">
            Welcome to the Azure Speech Service demo!
        </mstts:express-as>
        <break time="500ms"/>
        <prosody rate="-10%" pitch="+5%">
            This is an example of SSML with prosody control.
        </prosody>
    </voice>
</speak>
"""

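Passing that markup through the helper above is then a one-liner:

# Speak the SSML document defined above
result = text_to_speech_with_ssml(ssml)
if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
    print("SSML synthesis complete")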

def text_to_audio_file(text, output_file):
    """Save synthesized speech to file."""
    audio_config = speechsdk.audio.AudioOutputConfig(filename=output_file)
    synthesizer = speechsdk.SpeechSynthesizer(
        speech_config=speech_config,
        audio_config=audio_config
    )

    result = synthesizer.speak_text_async(text).get()
    return result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted


# List available voices
def list_voices():
    """Get available voices."""
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)
    voices_result = synthesizer.get_voices_async().get()

    for voice in voices_result.voices:
        print(f"{voice.short_name} - {voice.local_name} ({voice.locale})")
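
By default the synthesizer emits uncompressed RIFF/WAV audio. The output format can be changed on the config before creating the synthesizer, for example to write MP3 with text_to_audio_file (the file name is illustrative):

# Request MP3 output instead of the default WAV
speech_config.set_speech_synthesis_output_format(
    speechsdk.SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3
)
text_to_audio_file("Hello from Azure Speech.", "greeting.mp3")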

Speech Translation

import azure.cognitiveservices.speech as speechsdk

def translate_speech():
    """Real-time speech translation."""
    translation_config = speechsdk.translation.SpeechTranslationConfig(
        subscription="your-subscription-key",
        region="eastus"
    )

    # Source language
    translation_config.speech_recognition_language = "en-US"

    # Target languages
    translation_config.add_target_language("es")
    translation_config.add_target_language("fr")
    translation_config.add_target_language("de")

    # Voice for speech output
    translation_config.voice_name = "es-ES-ElviraNeural"

    audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)
    translator = speechsdk.translation.TranslationRecognizer(
        translation_config=translation_config,
        audio_config=audio_config
    )

    def result_handler(evt):
        if evt.result.reason == speechsdk.ResultReason.TranslatedSpeech:
            print(f"Recognized: {evt.result.text}")
            for lang, translation in evt.result.translations.items():
                print(f"  -> {lang}: {translation}")

    translator.recognized.connect(result_handler)

    print("Speak in English...")
    translator.start_continuous_recognition()

    import time
    time.sleep(30)  # Listen for 30 seconds

    translator.stop_continuous_recognition()


def translate_with_synthesis():
    """Translation with voice output in target language."""
    translation_config = speechsdk.translation.SpeechTranslationConfig(
        subscription="your-subscription-key",
        region="eastus"
    )

    translation_config.speech_recognition_language = "en-US"
    translation_config.add_target_language("es")

    audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)

    translator = speechsdk.translation.TranslationRecognizer(
        translation_config=translation_config,
        audio_config=audio_config
    )

    def synthesize_translation(text, target_language):
        """Synthesize the translated text in a matching neural voice."""
        # Map target language codes to voice names (extend as needed)
        voices = {"es": "es-ES-ElviraNeural"}

        speech_config = speechsdk.SpeechConfig(
            subscription="your-subscription-key",
            region="eastus"
        )
        speech_config.speech_synthesis_voice_name = voices[target_language]

        synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)
        synthesizer.speak_text_async(text).get()

    def result_handler(evt):
        if evt.result.reason == speechsdk.ResultReason.TranslatedSpeech:
            spanish_translation = evt.result.translations.get("es", "")
            if spanish_translation:
                synthesize_translation(spanish_translation, "es")

    translator.recognized.connect(result_handler)

    print("Speak in English...")
    translator.start_continuous_recognition()

    import time
    time.sleep(30)  # Listen for 30 seconds

    translator.stop_continuous_recognition()
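
When translation_config.voice_name is set (as in translate_speech above), the service can also return synthesized audio for the translation directly through the recognizer's synthesizing event, avoiding the second SpeechSynthesizer call. A sketch of that wiring, assuming a TranslationRecognizer named translator:

audio_chunks = []

def synthesis_handler(evt):
    """Collect the audio synthesized for each translated phrase."""
    if evt.result.audio:
        audio_chunks.append(evt.result.audio)

translator.synthesizing.connect(synthesis_handler)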

Speaker Recognition

import azure.cognitiveservices.speech as speechsdk

def enroll_speaker():
    """Enroll a speaker for verification."""
    speech_config = speechsdk.SpeechConfig(
        subscription="your-subscription-key",
        region="eastus"
    )

    # Create a voice profile for text-independent verification
    profile_client = speechsdk.VoiceProfileClient(speech_config=speech_config)

    profile = profile_client.create_profile_async(
        speechsdk.VoiceProfileType.TextIndependentVerification,
        "en-US"
    ).get()

    print(f"Profile created: {profile.profile_id}")

    # Enroll with audio from the default microphone
    audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)

    print("Please speak for enrollment (about 20 seconds of speech is required)...")

    result = profile_client.enroll_profile_async(profile, audio_config).get()
    while result.reason == speechsdk.ResultReason.EnrollingVoiceProfile:
        # The profile needs more audio; capture another sample
        print("More audio needed, please continue speaking...")
        result = profile_client.enroll_profile_async(profile, audio_config).get()

    print("Enrollment complete!")
    return profile.profile_id


def verify_speaker(profile_id):
    """Verify a speaker against an enrolled profile."""
    speech_config = speechsdk.SpeechConfig(
        subscription="your-subscription-key",
        region="eastus"
    )

    profile = speechsdk.VoiceProfile(
        profile_id,
        speechsdk.VoiceProfileType.TextIndependentVerification
    )
    audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)

    recognizer = speechsdk.SpeakerRecognizer(
        speech_config=speech_config,
        audio_config=audio_config
    )
    model = speechsdk.SpeakerVerificationModel(profile)

    print("Please speak for verification...")
    result = recognizer.recognize_once_async(model).get()

    if result.reason == speechsdk.ResultReason.RecognizedSpeaker:
        print(f"Verified! Score: {result.score}")
        return True
    else:
        print(f"Verification failed: {result.reason}")
        return False


def identify_speaker(profile_ids):
    """Identify a speaker among multiple enrolled profiles."""
    speech_config = speechsdk.SpeechConfig(
        subscription="your-subscription-key",
        region="eastus"
    )

    # Identification requires profiles of type TextIndependentIdentification
    profiles = [
        speechsdk.VoiceProfile(pid, speechsdk.VoiceProfileType.TextIndependentIdentification)
        for pid in profile_ids
    ]
    audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)

    recognizer = speechsdk.SpeakerRecognizer(
        speech_config=speech_config,
        audio_config=audio_config
    )
    model = speechsdk.SpeakerIdentificationModel(profiles)

    print("Please speak for identification...")
    result = recognizer.recognize_once_async(model).get()

    if result.reason == speechsdk.ResultReason.RecognizedSpeakers:
        print(f"Identified profile: {result.profile_id}")
        print(f"Confidence: {result.score}")
        return result.profile_id
    else:
        print("Speaker not identified")
        return None
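
A typical flow enrolls a user once and verifies them on later sessions:

# Enroll a new user, then verify them on a later visit
profile_id = enroll_speaker()
is_match = verify_speaker(profile_id)
print("Access granted" if is_match else "Access denied")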

Integration Example: Voice Assistant

import azure.cognitiveservices.speech as speechsdk
from azure.core.credentials import AzureKeyCredential
from azure.ai.textanalytics import TextAnalyticsClient

class VoiceAssistant:
    def __init__(self, speech_key, speech_region, language_key, language_endpoint):
        self.speech_config = speechsdk.SpeechConfig(
            subscription=speech_key,
            region=speech_region
        )
        self.speech_config.speech_recognition_language = "en-US"
        self.speech_config.speech_synthesis_voice_name = "en-US-JennyNeural"

        self.text_analytics = TextAnalyticsClient(
            endpoint=language_endpoint,
            credential=AzureKeyCredential(language_key)
        )

    def listen(self):
        """Listen for user speech."""
        recognizer = speechsdk.SpeechRecognizer(speech_config=self.speech_config)
        result = recognizer.recognize_once()

        if result.reason == speechsdk.ResultReason.RecognizedSpeech:
            return result.text
        return None

    def speak(self, text):
        """Speak response."""
        synthesizer = speechsdk.SpeechSynthesizer(speech_config=self.speech_config)
        synthesizer.speak_text_async(text).get()

    def analyze_intent(self, text):
        """Analyze user intent."""
        # Simple keyword matching (replace with LUIS or CLU for production)
        text_lower = text.lower()

        if "weather" in text_lower:
            return "weather"
        elif "time" in text_lower:
            return "time"
        elif "reminder" in text_lower:
            return "reminder"
        else:
            return "unknown"

    def run(self):
        """Main assistant loop."""
        self.speak("Hello! How can I help you?")

        while True:
            user_input = self.listen()
            if user_input:
                print(f"You said: {user_input}")

                intent = self.analyze_intent(user_input)
                response = self.handle_intent(intent, user_input)
                self.speak(response)

                if "goodbye" in user_input.lower():
                    break

    def handle_intent(self, intent, text):
        """Handle detected intent."""
        if intent == "weather":
            return "The weather today is sunny with a high of 72 degrees."
        elif intent == "time":
            from datetime import datetime
            return f"The current time is {datetime.now().strftime('%I:%M %p')}"
        else:
            return "I'm sorry, I didn't understand that. Could you please repeat?"
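
Running the assistant only requires the resource keys and endpoint; the values below are placeholders:

if __name__ == "__main__":
    assistant = VoiceAssistant(
        speech_key="your-speech-key",
        speech_region="eastus",
        language_key="your-language-key",
        language_endpoint="https://your-language-resource.cognitiveservices.azure.com/"
    )
    assistant.run()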

Conclusion

Azure Speech Services enable rich voice experiences:

  • Speech-to-Text: Real-time transcription with high accuracy
  • Text-to-Speech: Natural-sounding neural voices
  • Translation: Real-time multi-language translation
  • Speaker Recognition: Voice-based identity verification

These capabilities power virtual assistants, accessibility features, and multilingual applications.

Michael John Pena

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.