
Azure AI Speech Updates: Enhanced Recognition and Synthesis

Introduction

Azure AI Speech services continue to advance with significant improvements in accuracy, language support, and real-time capabilities. This post explores the latest features and demonstrates how to leverage them in your applications.
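
The samples below use the Python Speech SDK (the azure-cognitiveservices-speech package on PyPI) and read the key and region from environment variables, which is the configuration the classes in this post expect. A minimal sketch for failing fast when that setup is missing:

import os
import azure.cognitiveservices.speech as speechsdk

def build_speech_config() -> speechsdk.SpeechConfig:
    """Create a SpeechConfig from the environment, raising early if credentials are missing."""
    key = os.getenv("AZURE_SPEECH_KEY")
    region = os.getenv("AZURE_SPEECH_REGION")
    if not key or not region:
        raise RuntimeError("Set AZURE_SPEECH_KEY and AZURE_SPEECH_REGION before running the samples")
    return speechsdk.SpeechConfig(subscription=key, region=region)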

Speech-to-Text Enhancements

Basic Speech Recognition

import os
import time
import azure.cognitiveservices.speech as speechsdk

class SpeechRecognizer:
    def __init__(self):
        self.speech_config = speechsdk.SpeechConfig(
            subscription=os.getenv("AZURE_SPEECH_KEY"),
            region=os.getenv("AZURE_SPEECH_REGION")
        )
        # Set recognition language
        self.speech_config.speech_recognition_language = "en-US"

    def recognize_from_microphone(self) -> str:
        """Recognize speech from microphone"""
        audio_config = speechsdk.AudioConfig(use_default_microphone=True)
        recognizer = speechsdk.SpeechRecognizer(
            speech_config=self.speech_config,
            audio_config=audio_config
        )

        result = recognizer.recognize_once()

        if result.reason == speechsdk.ResultReason.RecognizedSpeech:
            return result.text
        elif result.reason == speechsdk.ResultReason.NoMatch:
            return "No speech could be recognized"
        elif result.reason == speechsdk.ResultReason.Canceled:
            cancellation = result.cancellation_details
            return f"Recognition canceled: {cancellation.reason}"

    def recognize_from_file(self, audio_file: str) -> str:
        """Recognize speech from audio file"""
        audio_config = speechsdk.AudioConfig(filename=audio_file)
        recognizer = speechsdk.SpeechRecognizer(
            speech_config=self.speech_config,
            audio_config=audio_config
        )

        result = recognizer.recognize_once()

        if result.reason == speechsdk.ResultReason.RecognizedSpeech:
            return result.text
        else:
            return f"Recognition failed: {result.reason}"

    def recognize_continuous(self, audio_file: str, callback=None) -> list:
        """Continuous recognition for longer audio"""
        audio_config = speechsdk.AudioConfig(filename=audio_file)
        recognizer = speechsdk.SpeechRecognizer(
            speech_config=self.speech_config,
            audio_config=audio_config
        )

        results = []
        done = False

        def handle_result(evt):
            if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
                results.append({
                    "text": evt.result.text,
                    "offset": evt.result.offset,
                    "duration": evt.result.duration
                })
                if callback:
                    callback(evt.result.text)

        def stop_cb(evt):
            nonlocal done
            done = True

        recognizer.recognized.connect(handle_result)
        recognizer.session_stopped.connect(stop_cb)
        recognizer.canceled.connect(stop_cb)

        recognizer.start_continuous_recognition()
        while not done:
            time.sleep(0.5)  # avoid a busy-wait while recognition runs
        recognizer.stop_continuous_recognition()

        return results

# Usage
recognizer = SpeechRecognizer()

# From file
text = recognizer.recognize_from_file("audio.wav")
print(f"Recognized: {text}")

# Continuous recognition
def on_recognized(text):
    print(f">> {text}")

results = recognizer.recognize_continuous("long_audio.wav", callback=on_recognized)
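
The offset and duration fields on each result are reported in 100-nanosecond ticks. A small helper (a sketch, not part of the SDK) converts them into seconds for display:

def ticks_to_seconds(ticks: int) -> float:
    """Convert the SDK's 100-nanosecond tick values to seconds."""
    return ticks / 10_000_000

for segment in results:
    start = ticks_to_seconds(segment["offset"])
    length = ticks_to_seconds(segment["duration"])
    print(f"{start:.2f}s (+{length:.2f}s): {segment['text']}")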

Advanced Recognition with Phrase Lists

class EnhancedRecognizer:
    def __init__(self):
        self.speech_config = speechsdk.SpeechConfig(
            subscription=os.getenv("AZURE_SPEECH_KEY"),
            region=os.getenv("AZURE_SPEECH_REGION")
        )

    def recognize_with_phrases(
        self,
        audio_file: str,
        phrases: list
    ) -> str:
        """Recognize with phrase hints for better accuracy"""
        audio_config = speechsdk.AudioConfig(filename=audio_file)
        recognizer = speechsdk.SpeechRecognizer(
            speech_config=self.speech_config,
            audio_config=audio_config
        )

        # Add phrase list for better recognition
        phrase_list = speechsdk.PhraseListGrammar.from_recognizer(recognizer)
        for phrase in phrases:
            phrase_list.addPhrase(phrase)

        result = recognizer.recognize_once()
        return result.text if result.reason == speechsdk.ResultReason.RecognizedSpeech else ""

    def recognize_with_custom_model(
        self,
        audio_file: str,
        endpoint_id: str
    ) -> str:
        """Recognize using custom speech model"""
        self.speech_config.endpoint_id = endpoint_id

        audio_config = speechsdk.AudioConfig(filename=audio_file)
        recognizer = speechsdk.SpeechRecognizer(
            speech_config=self.speech_config,
            audio_config=audio_config
        )

        result = recognizer.recognize_once()
        return result.text if result.reason == speechsdk.ResultReason.RecognizedSpeech else ""

    def recognize_with_language_detection(self, audio_file: str) -> dict:
        """Recognize speech with automatic language detection"""
        # Configure auto language detection (at-start detection supports up to four candidate languages)
        auto_detect_config = speechsdk.languageconfig.AutoDetectSourceLanguageConfig(
            languages=["en-US", "es-ES", "fr-FR", "de-DE"]
        )

        audio_config = speechsdk.AudioConfig(filename=audio_file)
        recognizer = speechsdk.SpeechRecognizer(
            speech_config=self.speech_config,
            audio_config=audio_config,
            auto_detect_source_language_config=auto_detect_config
        )

        result = recognizer.recognize_once()

        if result.reason == speechsdk.ResultReason.RecognizedSpeech:
            detected_lang = speechsdk.AutoDetectSourceLanguageResult(result).language
            return {
                "text": result.text,
                "detected_language": detected_lang
            }
        return {"text": "", "detected_language": None}

# Usage
enhanced = EnhancedRecognizer()

# With phrase hints for technical terms
technical_phrases = [
    "Azure OpenAI",
    "Kubernetes",
    "microservices",
    "containerization",
    "CI/CD pipeline"
]
text = enhanced.recognize_with_phrases("meeting.wav", technical_phrases)

# With language detection
result = enhanced.recognize_with_language_detection("multilingual.wav")
print(f"Language: {result['detected_language']}, Text: {result['text']}")

Real-Time Transcription

import queue

class RealTimeTranscriber:
    def __init__(self):
        self.speech_config = speechsdk.SpeechConfig(
            subscription=os.getenv("AZURE_SPEECH_KEY"),
            region=os.getenv("AZURE_SPEECH_REGION")
        )
        # Interim hypotheses are delivered via the 'recognizing' event;
        # this property requests sentence boundary info in responses
        self.speech_config.set_property(
            speechsdk.PropertyId.SpeechServiceResponse_RequestSentenceBoundary,
            "true"
        )

        self.transcript_queue = queue.Queue()
        self.is_running = False

    def start_transcription(self):
        """Start real-time transcription from microphone"""
        audio_config = speechsdk.AudioConfig(use_default_microphone=True)
        self.recognizer = speechsdk.SpeechRecognizer(
            speech_config=self.speech_config,
            audio_config=audio_config
        )

        # Handle interim results
        def handle_recognizing(evt):
            self.transcript_queue.put({
                "type": "interim",
                "text": evt.result.text
            })

        # Handle final results
        def handle_recognized(evt):
            if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
                self.transcript_queue.put({
                    "type": "final",
                    "text": evt.result.text,
                    "offset": evt.result.offset,
                    "duration": evt.result.duration
                })

        def handle_canceled(evt):
            self.is_running = False

        self.recognizer.recognizing.connect(handle_recognizing)
        self.recognizer.recognized.connect(handle_recognized)
        self.recognizer.canceled.connect(handle_canceled)

        self.is_running = True
        self.recognizer.start_continuous_recognition()

    def stop_transcription(self):
        """Stop transcription"""
        if self.is_running:
            self.recognizer.stop_continuous_recognition()
            self.is_running = False

    def get_transcript(self, timeout: float = 0.1):
        """Get next transcript item from queue"""
        try:
            return self.transcript_queue.get(timeout=timeout)
        except queue.Empty:
            return None

# Usage example
transcriber = RealTimeTranscriber()
transcriber.start_transcription()

print("Listening... (Ctrl+C to stop)")
try:
    while transcriber.is_running:
        item = transcriber.get_transcript()
        if item:
            if item["type"] == "interim":
                print(f"\r[interim] {item['text']}", end="", flush=True)
            else:
                print(f"\n[final] {item['text']}")
except KeyboardInterrupt:
    transcriber.stop_transcription()
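
The transcriber above reads from the default microphone, but real-time recognition also works when audio arrives from another source such as a WebSocket or a telephony bridge. The sketch below assumes 16 kHz, 16-bit mono PCM chunks and pushes them into the recognizer through a push stream:

def transcribe_push_stream(speech_config, pcm_chunks):
    """Feed raw PCM chunks (16 kHz, 16-bit, mono) to a recognizer via a push stream."""
    stream_format = speechsdk.audio.AudioStreamFormat(
        samples_per_second=16000, bits_per_sample=16, channels=1
    )
    push_stream = speechsdk.audio.PushAudioInputStream(stream_format)
    audio_config = speechsdk.AudioConfig(stream=push_stream)
    recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config,
        audio_config=audio_config
    )

    recognizer.recognized.connect(lambda evt: print(f"[final] {evt.result.text}"))
    recognizer.start_continuous_recognition()

    for chunk in pcm_chunks:  # e.g. bytes received from a socket
        push_stream.write(chunk)
    push_stream.close()  # signal end of audio

    recognizer.stop_continuous_recognition()

# transcribe_push_stream(transcriber.speech_config, chunks_from_socket)  # hypothetical chunk source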

Text-to-Speech Updates

Neural Voice Synthesis

class SpeechSynthesizer:
    def __init__(self):
        self.speech_config = speechsdk.SpeechConfig(
            subscription=os.getenv("AZURE_SPEECH_KEY"),
            region=os.getenv("AZURE_SPEECH_REGION")
        )

    def list_voices(self, locale: str = "") -> list:
        """List available neural voices"""
        synthesizer = speechsdk.SpeechSynthesizer(
            speech_config=self.speech_config,
            audio_config=None
        )

        result = synthesizer.get_voices_async(locale).get()

        voices = []
        for voice in result.voices:
            voices.append({
                "name": voice.short_name,
                "display_name": voice.local_name,
                "locale": voice.locale,
                "gender": voice.gender.name,
                "voice_type": voice.voice_type.name,
                "styles": voice.style_list
            })

        return voices

    def synthesize_to_speaker(self, text: str, voice: str = "en-US-JennyNeural"):
        """Synthesize speech and play through speaker"""
        self.speech_config.speech_synthesis_voice_name = voice

        audio_config = speechsdk.audio.AudioOutputConfig(use_default_speaker=True)
        synthesizer = speechsdk.SpeechSynthesizer(
            speech_config=self.speech_config,
            audio_config=audio_config
        )

        result = synthesizer.speak_text_async(text).get()
        return result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted

    def synthesize_to_file(
        self,
        text: str,
        output_file: str,
        voice: str = "en-US-JennyNeural",
        format: str = "wav"
    ) -> bool:
        """Synthesize speech to audio file"""
        self.speech_config.speech_synthesis_voice_name = voice

        # Set output format
        if format == "mp3":
            self.speech_config.set_speech_synthesis_output_format(
                speechsdk.SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3
            )
        elif format == "wav":
            self.speech_config.set_speech_synthesis_output_format(
                speechsdk.SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm
            )

        audio_config = speechsdk.audio.AudioOutputConfig(filename=output_file)
        synthesizer = speechsdk.SpeechSynthesizer(
            speech_config=self.speech_config,
            audio_config=audio_config
        )

        result = synthesizer.speak_text_async(text).get()
        return result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted

    def synthesize_with_ssml(self, ssml: str, output_file: str = None):
        """Synthesize using SSML for advanced control"""
        if output_file:
            audio_config = speechsdk.audio.AudioOutputConfig(filename=output_file)
        else:
            audio_config = speechsdk.audio.AudioOutputConfig(use_default_speaker=True)

        synthesizer = speechsdk.SpeechSynthesizer(
            speech_config=self.speech_config,
            audio_config=audio_config
        )

        result = synthesizer.speak_ssml_async(ssml).get()
        return result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted

# Usage
synth = SpeechSynthesizer()

# List available voices
voices = synth.list_voices("en-US")
for v in voices[:5]:
    print(f"{v['name']}: {v['display_name']} ({v['gender']})")
    if v['styles']:
        print(f"  Styles: {', '.join(v['styles'])}")

# Synthesize with different voices
synth.synthesize_to_file(
    "Welcome to Azure AI Speech services.",
    "welcome.wav",
    voice="en-US-GuyNeural"
)
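
Passing audio_config=None keeps the synthesized audio in memory instead of writing it to a device or file, which is convenient when the bytes go straight into an HTTP response or a message queue. A sketch using the SpeechSynthesizer class defined above:

def synthesize_to_bytes(synth: SpeechSynthesizer, text: str, voice: str = "en-US-JennyNeural") -> bytes:
    """Synthesize speech and return the raw audio bytes instead of writing a file."""
    synth.speech_config.speech_synthesis_voice_name = voice
    synthesizer = speechsdk.SpeechSynthesizer(
        speech_config=synth.speech_config,
        audio_config=None  # keep the output in memory
    )
    result = synthesizer.speak_text_async(text).get()
    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        return result.audio_data
    raise RuntimeError(f"Synthesis failed: {result.reason}")

audio_bytes = synthesize_to_bytes(synth, "This audio never touches disk.")
print(f"Received {len(audio_bytes)} bytes of audio")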

SSML for Expressive Speech

class SSMLBuilder:
    def __init__(self, voice: str = "en-US-JennyNeural"):
        self.voice = voice
        self.content = []

    def add_text(self, text: str):
        """Add plain text"""
        self.content.append(text)
        return self

    def add_break(self, time: str = "500ms"):
        """Add a pause"""
        self.content.append(f'<break time="{time}"/>')
        return self

    def add_emphasis(self, text: str, level: str = "moderate"):
        """Add emphasized text (reduced, moderate, strong)"""
        self.content.append(f'<emphasis level="{level}">{text}</emphasis>')
        return self

    def add_prosody(
        self,
        text: str,
        rate: str = None,
        pitch: str = None,
        volume: str = None
    ):
        """Add text with prosody control"""
        attrs = []
        if rate:
            attrs.append(f'rate="{rate}"')
        if pitch:
            attrs.append(f'pitch="{pitch}"')
        if volume:
            attrs.append(f'volume="{volume}"')

        attr_str = " ".join(attrs)
        self.content.append(f'<prosody {attr_str}>{text}</prosody>')
        return self

    def add_say_as(self, text: str, interpret_as: str, format: str = None):
        """Add text with specific interpretation (date, time, number, etc.)"""
        if format:
            self.content.append(f'<say-as interpret-as="{interpret_as}" format="{format}">{text}</say-as>')
        else:
            self.content.append(f'<say-as interpret-as="{interpret_as}">{text}</say-as>')
        return self

    def add_style(self, text: str, style: str):
        """Add text with emotional style (voice must support it)"""
        self.content.append(f'<mstts:express-as style="{style}">{text}</mstts:express-as>')
        return self

    def build(self) -> str:
        """Build complete SSML document"""
        content_str = "".join(self.content)
        return f'''<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"
    xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="en-US">
    <voice name="{self.voice}">
        {content_str}
    </voice>
</speak>'''

# Usage
builder = SSMLBuilder("en-US-JennyNeural")
ssml = (builder
    .add_text("Welcome to the meeting.")
    .add_break("1s")
    .add_prosody("Today we have some exciting news!", rate="+10%", pitch="+5%")
    .add_break()
    .add_text("The meeting is scheduled for ")
    .add_say_as("2023-09-15", "date", "mdy")
    .add_text(" at ")
    .add_say_as("14:30", "time", "hms12")
    .add_break("500ms")
    .add_emphasis("Please don't be late!", "strong")
    .build()
)

synth.synthesize_with_ssml(ssml, "meeting_announcement.wav")
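
The builder's add_style helper wraps text in mstts:express-as, which only takes effect with voices that list that style (check the styles field returned by list_voices). A short example assuming en-US-JennyNeural, which supports styles such as cheerful:

styled = (SSMLBuilder("en-US-JennyNeural")
    .add_style("We just shipped the new release!", "cheerful")
    .add_break("500ms")
    .add_text("Full notes are in the changelog.")
    .build()
)
synth.synthesize_with_ssml(styled, "release_announcement.wav")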

Conclusion

Azure AI Speech services provide comprehensive capabilities for both speech recognition and synthesis. With continuous improvements in accuracy, language support, and real-time processing, these services enable powerful voice-enabled applications. The combination of standard recognition, phrase hints, and SSML-based synthesis gives developers fine-grained control over speech interactions.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.