
Audio AI: Speech Recognition, Synthesis, and Analysis for Data Applications

Audio AI enables voice interfaces, meeting transcription, and audio content analysis. For data professionals, this means new data sources and interaction patterns. Let’s explore audio AI capabilities on Azure.

Azure Speech Services Overview

Azure Speech Services
├── Speech-to-Text (Transcription)
│   ├── Real-time transcription
│   ├── Batch transcription
│   └── Custom speech models
├── Text-to-Speech (Synthesis)
│   ├── Neural voices
│   └── Custom voices
├── Speech Translation
├── Speaker Recognition
└── Pronunciation Assessment
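
The transcription and synthesis snippets below use the Speech SDK for Python; the batch example calls the REST API directly. Here is a minimal setup sketch, assuming your key and region live in environment variables named SPEECH_KEY and SPEECH_REGION (both names are arbitrary):

# pip install azure-cognitiveservices-speech
import os
import azure.cognitiveservices.speech as speechsdk

speech_key = os.environ["SPEECH_KEY"]    # assumed variable name
region = os.environ["SPEECH_REGION"]     # assumed variable name, e.g. "australiaeast"

speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=region)
speech_config.speech_recognition_language = "en-US"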

Speech-to-Text for Data Pipelines

Real-Time Transcription

import azure.cognitiveservices.speech as speechsdk

class RealtimeTranscriber:
    def __init__(self, speech_key: str, region: str):
        self.speech_config = speechsdk.SpeechConfig(
            subscription=speech_key,
            region=region
        )
        self.speech_config.speech_recognition_language = "en-US"

    def transcribe_from_microphone(self, callback=None):
        """Transcribe speech from microphone in real-time."""

        audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)
        recognizer = speechsdk.SpeechRecognizer(
            speech_config=self.speech_config,
            audio_config=audio_config
        )

        results = []

        def handle_result(evt):
            if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
                results.append({
                    "text": evt.result.text,
                    "offset": evt.result.offset,
                    "duration": evt.result.duration
                })
                if callback:
                    callback(evt.result.text)

        recognizer.recognized.connect(handle_result)

        # Start continuous recognition
        recognizer.start_continuous_recognition()

        return recognizer, results

    def transcribe_file(self, audio_file: str) -> dict:
        """Transcribe an audio file."""

        audio_config = speechsdk.audio.AudioConfig(filename=audio_file)
        recognizer = speechsdk.SpeechRecognizer(
            speech_config=self.speech_config,
            audio_config=audio_config
        )

        result = recognizer.recognize_once()

        if result.reason == speechsdk.ResultReason.RecognizedSpeech:
            return {"success": True, "text": result.text}
        elif result.reason == speechsdk.ResultReason.NoMatch:
            return {"success": False, "error": "No speech recognized"}
        else:
            return {"success": False, "error": str(result.reason)}

Batch Transcription for Large Files

import requests
import time
from datetime import datetime

class BatchTranscriber:
    def __init__(self, speech_key: str, region: str):
        self.speech_key = speech_key
        self.region = region
        self.base_url = f"https://{region}.api.cognitive.microsoft.com/speechtotext/v3.1"

    def transcribe_batch(
        self,
        audio_urls: list[str],
        language: str = "en-US"
    ) -> str:
        """Submit batch transcription job."""

        headers = {
            "Ocp-Apim-Subscription-Key": self.speech_key,
            "Content-Type": "application/json"
        }

        body = {
            "contentUrls": audio_urls,
            "locale": language,
            "displayName": f"Batch transcription {datetime.now().isoformat()}",
            "properties": {
                "wordLevelTimestampsEnabled": True,
                "diarizationEnabled": True,  # Speaker identification
                "punctuationMode": "DictatedAndAutomatic"
            }
        }

        response = requests.post(
            f"{self.base_url}/transcriptions",
            headers=headers,
            json=body
        )

        if response.status_code == 201:
            return response.json()["self"]  # URL to check status
        else:
            raise Exception(f"Failed to create transcription: {response.text}")

    def wait_for_completion(self, transcription_url: str, timeout_minutes: int = 60) -> dict:
        """Wait for batch transcription to complete."""

        headers = {"Ocp-Apim-Subscription-Key": self.speech_key}

        start_time = time.time()
        while time.time() - start_time < timeout_minutes * 60:
            response = requests.get(transcription_url, headers=headers)
            status = response.json()

            if status["status"] == "Succeeded":
                # Get the list of result files; download each file's
                # contentUrl to retrieve the actual transcript JSON
                files_url = status["links"]["files"]
                files_response = requests.get(files_url, headers=headers)
                return files_response.json()

            elif status["status"] == "Failed":
                raise Exception(f"Transcription failed: {status}")

            time.sleep(30)  # Check every 30 seconds

        raise TimeoutError("Transcription timed out")
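
Usage: submit URLs the service can reach (typically blob SAS URLs; the placeholders below stand in for real URLs), then poll for the result-file list. Each result file of kind Transcription points at a transcript JSON you can download:

batch = BatchTranscriber(speech_key="<your-speech-key>", region="australiaeast")

job_url = batch.transcribe_batch([
    "https://<account>.blob.core.windows.net/audio/call-001.wav?<sas-token>",
    "https://<account>.blob.core.windows.net/audio/call-002.wav?<sas-token>"
])

files = batch.wait_for_completion(job_url)
for f in files["values"]:
    if f["kind"] == "Transcription":
        print(f["links"]["contentUrl"])  # Download this URL for the transcript JSON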

Meeting Transcription with Speaker Diarization

class MeetingTranscriber:
    def __init__(self, speech_key: str, region: str):
        self.speech_config = speechsdk.SpeechConfig(
            subscription=speech_key,
            region=region
        )

    def transcribe_meeting(self, audio_file: str) -> list[dict]:
        """Transcribe a meeting with speaker identification."""

        audio_config = speechsdk.audio.AudioConfig(filename=audio_file)

        # Create conversation transcriber
        conversation_transcriber = speechsdk.transcription.ConversationTranscriber(
            speech_config=self.speech_config,
            audio_config=audio_config
        )

        transcription = []
        done = False

        def handle_transcribed(evt):
            if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
                transcription.append({
                    "speaker": evt.result.speaker_id,
                    "text": evt.result.text,
                    "offset_seconds": evt.result.offset / 10_000_000,  # Convert ticks to seconds
                    "duration_seconds": evt.result.duration / 10_000_000
                })

        def handle_stopped(evt):
            nonlocal done
            done = True

        conversation_transcriber.transcribed.connect(handle_transcribed)
        conversation_transcriber.session_stopped.connect(handle_stopped)
        conversation_transcriber.canceled.connect(handle_stopped)

        conversation_transcriber.start_transcribing_async().get()

        # Block until the session stops (end of file or cancellation)
        while not done:
            time.sleep(0.5)

        conversation_transcriber.stop_transcribing_async().get()

        return transcription

    def format_as_dialogue(self, transcription: list[dict]) -> str:
        """Format transcription as readable dialogue."""

        lines = []
        current_speaker = None

        for segment in transcription:
            if segment["speaker"] != current_speaker:
                current_speaker = segment["speaker"]
                lines.append(f"\n[{current_speaker}]:")

            lines.append(segment["text"])

        return " ".join(lines)

Text-to-Speech for Reports

class ReportNarrator:
    def __init__(self, speech_key: str, region: str):
        self.speech_config = speechsdk.SpeechConfig(
            subscription=speech_key,
            region=region
        )
        # Use a neural voice
        self.speech_config.speech_synthesis_voice_name = "en-US-JennyNeural"

    def narrate_report(self, text: str, output_file: str) -> bool:
        """Convert report text to audio file."""

        audio_config = speechsdk.audio.AudioOutputConfig(filename=output_file)
        synthesizer = speechsdk.SpeechSynthesizer(
            speech_config=self.speech_config,
            audio_config=audio_config
        )

        result = synthesizer.speak_text_async(text).get()

        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            return True
        else:
            return False

    def narrate_with_ssml(self, content: dict, output_file: str) -> bool:
        """Narrate with SSML for better control."""

        ssml = f"""
        <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
            <voice name="en-US-JennyNeural">
                <prosody rate="medium" pitch="medium">
                    <p>
                        <s><emphasis level="strong">{content['title']}</emphasis></s>
                        <break time="500ms"/>
                        <s>Report generated on {content['date']}</s>
                    </p>
                    <break time="1s"/>
                    <p>
                        <s>Key highlights:</s>
                        {"".join(f'<s>{highlight}</s><break time="300ms"/>' for highlight in content['highlights'])}
                    </p>
                    <break time="1s"/>
                    <p>
                        <s>In summary, {content['summary']}</s>
                    </p>
                </prosody>
            </voice>
        </speak>
        """

        audio_config = speechsdk.audio.AudioOutputConfig(filename=output_file)
        synthesizer = speechsdk.SpeechSynthesizer(
            speech_config=self.speech_config,
            audio_config=audio_config
        )

        result = synthesizer.speak_ssml_async(ssml).get()
        return result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted
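
A usage sketch with made-up report content:

narrator = ReportNarrator(speech_key="<your-speech-key>", region="australiaeast")

report = {
    "title": "Weekly Sales Summary",
    "date": "7 June 2024",
    "highlights": [
        "Revenue grew twelve percent week over week",
        "Churn stayed flat at two percent"
    ],
    "summary": "a strong week driven by the APAC region."
}

narrator.narrate_with_ssml(report, "weekly_summary.wav")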

Audio Analysis with AI

import json

class AudioAnalyzer:
    def __init__(self, transcriber: RealtimeTranscriber, llm_client):
        self.transcriber = transcriber
        self.llm = llm_client  # Any async chat-completion client; adapt the calls below to your SDK's API

    async def analyze_customer_call(self, audio_file: str) -> dict:
        """Analyze a customer support call."""

        # Transcribe
        transcription_result = self.transcriber.transcribe_file(audio_file)

        if not transcription_result["success"]:
            return {"error": transcription_result["error"]}

        transcript = transcription_result["text"]

        # Analyze with LLM
        response = await self.llm.chat.complete_async(
            deployment="gpt-4o",
            messages=[{
                "role": "user",
                "content": f"""Analyze this customer support call transcript:

                {transcript}

                Provide:
                1. Call summary
                2. Customer sentiment (positive/neutral/negative)
                3. Issue category
                4. Resolution status (resolved/escalated/pending)
                5. Key action items
                6. Quality score (1-10) for the support agent
                7. Improvement suggestions

                Return as JSON."""
            }]
        )

        # Assumes the model returns raw JSON; add fence-stripping or validation for production use
        return json.loads(response.choices[0].message.content)

    async def extract_meeting_insights(self, transcription: list[dict]) -> dict:
        """Extract actionable insights from meeting transcription."""

        transcript_text = "\n".join([
            f"{s['speaker']}: {s['text']}"
            for s in transcription
        ])

        response = await self.llm.chat.complete_async(
            deployment="gpt-4o",
            messages=[{
                "role": "user",
                "content": f"""Analyze this meeting transcript:

                {transcript_text}

                Extract:
                1. Meeting summary (2-3 sentences)
                2. Key decisions made
                3. Action items with owners
                4. Questions raised but not answered
                5. Follow-up meetings needed
                6. Topics that need more discussion

                Return as JSON."""
            }]
        )

        return json.loads(response.choices[0].message.content)
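
A usage sketch, continuing from the RealtimeTranscriber created earlier. Here llm_client stands in for whatever async chat-completion client you already use; the call shape inside the class is an assumption to adapt to your SDK:

import asyncio
import json

async def main():
    analyzer = AudioAnalyzer(transcriber=transcriber, llm_client=llm_client)

    call_analysis = await analyzer.analyze_customer_call("support_call_1234.wav")
    print(json.dumps(call_analysis, indent=2))

asyncio.run(main())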

Voice Interface for Data Queries

class VoiceDataAssistant:
    def __init__(self, transcriber, synthesizer, data_assistant):
        self.transcriber = transcriber
        self.synthesizer = synthesizer
        self.assistant = data_assistant  # Your existing data assistant

    async def process_voice_query(self) -> dict:
        """Process a single voice query and respond with voice."""

        # Listen for one query using one-shot recognition from the default microphone
        # (continuous recognition is not needed for a single question)
        audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)
        recognizer = speechsdk.SpeechRecognizer(
            speech_config=self.transcriber.speech_config,
            audio_config=audio_config
        )

        print("Listening... Speak your question.")
        result = recognizer.recognize_once()

        if result.reason == speechsdk.ResultReason.RecognizedSpeech:
            query = result.text
            print(f"You asked: {query}")

            # Process with data assistant
            response = await self.assistant.query(query)

            # Speak the response
            self.synthesizer.speak_text_async(response["answer"]).get()

            return response

        return {"error": "Could not understand speech"}

Best Practices

  1. Audio quality: Clean audio improves accuracy significantly
  2. Custom models: Train on domain-specific vocabulary
  3. Batch large files: Use batch API for cost efficiency
  4. Cache transcriptions: Store results to avoid re-processing (see the sketch after this list)
  5. Handle silence: Detect and handle empty audio gracefully
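
For point 4, a minimal caching sketch that keys stored transcripts by a hash of the audio bytes (the cache directory name is arbitrary):

import hashlib
import json
from pathlib import Path

CACHE_DIR = Path(".transcript_cache")
CACHE_DIR.mkdir(exist_ok=True)

def transcribe_with_cache(transcriber: RealtimeTranscriber, audio_file: str) -> dict:
    """Return a cached transcription if the exact same audio was processed before."""
    digest = hashlib.sha256(Path(audio_file).read_bytes()).hexdigest()
    cache_path = CACHE_DIR / f"{digest}.json"

    if cache_path.exists():
        return json.loads(cache_path.read_text())

    result = transcriber.transcribe_file(audio_file)
    if result["success"]:
        cache_path.write_text(json.dumps(result))
    return result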

Audio AI adds a new dimension to data applications. Start with transcription use cases and expand to voice interfaces as your needs evolve.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.