
Implementing Speaker Recognition with Azure Cognitive Services

Azure Speaker Recognition enables voice-based identity verification and identification: it can verify that a speaker is who they claim to be, or determine who is speaking from a group of enrolled voices.

Speaker Recognition Modes

  1. Speaker Verification: Verify a claimed identity (1:1 matching)
  2. Speaker Identification: Identify who is speaking (1:N matching)

Verification supports both of the following modes; identification is text-independent only:

  • Text-dependent: the user speaks a specific passphrase
  • Text-independent: the user speaks freely, with any content

Setting Up the Client

import azure.cognitiveservices.speech as speechsdk

class SpeakerRecognitionClient:
    def __init__(self, speech_key: str, region: str):
        self.speech_config = speechsdk.SpeechConfig(
            subscription=speech_key,
            region=region
        )

    def create_voice_profile(self, profile_type: str) -> str:
        """Create a new voice profile for enrollment."""

        if profile_type == "verification_text_dependent":
            profile_type_enum = speechsdk.VoiceProfileType.TextDependentVerification
        elif profile_type == "verification_text_independent":
            profile_type_enum = speechsdk.VoiceProfileType.TextIndependentVerification
        else:
            profile_type_enum = speechsdk.VoiceProfileType.TextIndependentIdentification

        client = speechsdk.VoiceProfileClient(
            speech_config=self.speech_config
        )

        profile = client.create_profile_async(
            profile_type_enum,
            "en-US"
        ).get()

        print(f"Created profile: {profile.profile_id}")
        return profile.profile_id
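
A quick usage sketch, assuming you substitute your own key and region. The returned profile ID is a service-generated GUID that you must persist for later enrollment and recognition:

# Hedged usage sketch for SpeakerRecognitionClient
client = SpeakerRecognitionClient("your-key", "westus")

# Persist these IDs; they are needed for enrollment and recognition later
verification_profile_id = client.create_voice_profile("verification_text_dependent")
identification_profile_id = client.create_voice_profile("identification")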

Text-Dependent Speaker Verification

class TextDependentVerification:
    def __init__(self, speech_key: str, region: str):
        self.speech_config = speechsdk.SpeechConfig(
            subscription=speech_key,
            region=region
        )
        self.client = speechsdk.VoiceProfileClient(
            speech_config=self.speech_config
        )

    def get_activation_phrases(self) -> list:
        """Get available activation phrases."""
        phrases = self.client.get_activation_phrases_async(
            speechsdk.VoiceProfileType.TextDependentVerification,
            "en-US"
        ).get()
        return phrases.phrases

    def enroll_speaker(self, profile_id: str, audio_file: str) -> dict:
        """Enroll a speaker with their voice sample."""

        profile = speechsdk.VoiceProfile(
            profile_id,
            speechsdk.VoiceProfileType.TextDependentVerification
        )

        audio_config = speechsdk.AudioConfig(filename=audio_file)

        result = self.client.enroll_profile_async(
            profile,
            audio_config
        ).get()

        return {
            "reason": str(result.reason),
            "enrollments_count": result.enrollments_count,
            "remaining_enrollments": result.remaining_enrollments_count
        }

    def verify_speaker(self, profile_id: str, audio_file: str) -> dict:
        """Verify a speaker against their enrolled profile."""

        model = speechsdk.SpeakerVerificationModel.from_profile(
            speechsdk.VoiceProfile(
                profile_id,
                speechsdk.VoiceProfileType.TextDependentVerification
            )
        )

        audio_config = speechsdk.AudioConfig(filename=audio_file)

        recognizer = speechsdk.SpeakerRecognizer(
            speech_config=self.speech_config,
            audio_config=audio_config
        )

        result = recognizer.recognize_once_async(model).get()

        return {
            "verified": result.reason == speechsdk.ResultReason.RecognizedSpeaker,
            "score": result.score,
            "profile_id": result.profile_id
        }

# Usage
verifier = TextDependentVerification("your-key", "westus")

# Get available phrases
phrases = verifier.get_activation_phrases()
print("Available phrases:", phrases[:3])

# Create a text-dependent verification profile (the service generates the ID)
profile = verifier.client.create_profile_async(
    speechsdk.VoiceProfileType.TextDependentVerification,
    "en-US"
).get()
profile_id = profile.profile_id

# Enroll (requires multiple samples)
for i in range(3):
    result = verifier.enroll_speaker(profile_id, f"enrollment_{i}.wav")
    print(f"Enrollment {i+1}: {result}")

# Verify
verification = verifier.verify_speaker(profile_id, "verification_sample.wav")
print(f"Verified: {verification['verified']}, Score: {verification['score']}")

Text-Independent Speaker Identification

class TextIndependentIdentification:
    def __init__(self, speech_key: str, region: str):
        self.speech_config = speechsdk.SpeechConfig(
            subscription=speech_key,
            region=region
        )
        self.client = speechsdk.VoiceProfileClient(
            speech_config=self.speech_config
        )

    def enroll_speaker(self, profile_id: str, audio_file: str) -> dict:
        """Enroll speaker with text-independent audio."""

        profile = speechsdk.VoiceProfile(
            profile_id,
            speechsdk.VoiceProfileType.TextIndependentIdentification
        )

        audio_config = speechsdk.AudioConfig(filename=audio_file)

        result = self.client.enroll_profile_async(
            profile,
            audio_config
        ).get()

        return {
            "reason": str(result.reason),
            "audio_length": result.audio_length
        }

    def identify_speaker(self, profile_ids: list, audio_file: str) -> dict:
        """Identify which enrolled speaker is in the audio."""

        profiles = [
            speechsdk.VoiceProfile(
                pid,
                speechsdk.VoiceProfileType.TextIndependentIdentification
            )
            for pid in profile_ids
        ]

        model = speechsdk.SpeakerIdentificationModel.from_profiles(profiles)

        audio_config = speechsdk.AudioConfig(filename=audio_file)

        recognizer = speechsdk.SpeakerRecognizer(
            speech_config=self.speech_config,
            audio_config=audio_config
        )

        result = recognizer.recognize_once_async(model).get()

        if result.reason == speechsdk.ResultReason.RecognizedSpeakers:
            return {
                "identified": True,
                "profile_id": result.profile_id,
                "score": result.score
            }
        else:
            return {
                "identified": False,
                "reason": str(result.reason)
            }

# Usage
identifier = TextIndependentIdentification("your-key", "westus")

# Enroll multiple speakers
speaker_profiles = {}
for speaker_name in ["alice", "bob", "charlie"]:
    # Create a profile for this speaker (the service generates the ID)
    profile = identifier.client.create_profile_async(
        speechsdk.VoiceProfileType.TextIndependentIdentification,
        "en-US"
    ).get()
    # Enroll with their audio
    identifier.enroll_speaker(profile.profile_id, f"{speaker_name}_enrollment.wav")
    speaker_profiles[profile.profile_id] = speaker_name

# Identify unknown speaker
result = identifier.identify_speaker(
    list(speaker_profiles.keys()),
    "unknown_speaker.wav"
)

if result["identified"]:
    speaker = speaker_profiles[result["profile_id"]]
    print(f"Identified speaker: {speaker} (score: {result['score']})")

Real-Time Speaker Diarization

import azure.cognitiveservices.speech as speechsdk
import time

def transcribe_with_diarization(audio_file: str) -> list:
    """Transcribe audio with speaker identification."""

    speech_config = speechsdk.SpeechConfig(
        subscription="your-key",
        region="westus"
    )

    audio_config = speechsdk.AudioConfig(filename=audio_file)

    # ConversationTranscriber performs speaker diarization automatically;
    # each transcribed result carries a speaker_id alongside the text

    transcriber = speechsdk.transcription.ConversationTranscriber(
        speech_config=speech_config,
        audio_config=audio_config
    )

    results = []
    done = False

    def handle_transcribed(evt):
        if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
            results.append({
                "speaker_id": evt.result.speaker_id,
                "text": evt.result.text,
                "offset": evt.result.offset,
                "duration": evt.result.duration
            })

    def handle_stopped(evt):
        nonlocal done
        done = True

    transcriber.transcribed.connect(handle_transcribed)
    transcriber.session_stopped.connect(handle_stopped)
    transcriber.canceled.connect(handle_stopped)

    transcriber.start_transcribing_async().get()

    while not done:
        time.sleep(0.5)

    transcriber.stop_transcribing_async().get()

    return results

# Transcribe meeting with speaker labels
results = transcribe_with_diarization("meeting.wav")

current_speaker = None
for segment in results:
    if segment['speaker_id'] != current_speaker:
        current_speaker = segment['speaker_id']
        print(f"\n[Speaker {current_speaker}]")
    print(segment['text'], end=' ')
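
The offset and duration fields are expressed in 100-nanosecond ticks, so a small post-processing pass turns the segments into per-speaker talk time:

# Summarize talk time per speaker (durations are in 100-ns ticks)
from collections import defaultdict

TICKS_PER_SECOND = 10_000_000

talk_time = defaultdict(float)
for segment in results:
    talk_time[segment["speaker_id"]] += segment["duration"] / TICKS_PER_SECOND

for speaker_id, seconds in sorted(talk_time.items(), key=lambda kv: -kv[1]):
    print(f"Speaker {speaker_id}: {seconds:.1f}s of speech")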

Building a Voice Authentication API

from flask import Flask, request, jsonify
import azure.cognitiveservices.speech as speechsdk
import os
import tempfile

app = Flask(__name__)

class VoiceAuthService:
    def __init__(self):
        self.speech_config = speechsdk.SpeechConfig(
            subscription=os.environ["SPEECH_KEY"],
            region=os.environ["SPEECH_REGION"]
        )
        self.client = speechsdk.VoiceProfileClient(
            speech_config=self.speech_config
        )
        self.profiles = {}  # In production, use a database

    def create_profile(self, user_id: str) -> str:
        profile = self.client.create_profile_async(
            speechsdk.VoiceProfileType.TextIndependentVerification,
            "en-US"
        ).get()
        self.profiles[user_id] = profile.profile_id
        return profile.profile_id

    def enroll(self, user_id: str, audio_data: bytes) -> dict:
        if user_id not in self.profiles:
            self.create_profile(user_id)

        # Save audio to temp file
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            f.write(audio_data)
            temp_path = f.name

        profile = speechsdk.VoiceProfile(
            self.profiles[user_id],
            speechsdk.VoiceProfileType.TextIndependentVerification
        )

        audio_config = speechsdk.AudioConfig(filename=temp_path)
        result = self.client.enroll_profile_async(profile, audio_config).get()

        os.remove(temp_path)

        return {
            "enrolled": result.reason == speechsdk.ResultReason.EnrolledVoiceProfile,
            "remaining_enrollments": result.remaining_enrollments_count
        }

    def verify(self, user_id: str, audio_data: bytes) -> dict:
        if user_id not in self.profiles:
            return {"verified": False, "error": "User not enrolled"}

        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            f.write(audio_data)
            temp_path = f.name

        model = speechsdk.SpeakerVerificationModel.from_profile(
            speechsdk.VoiceProfile(
                self.profiles[user_id],
                speechsdk.VoiceProfileType.TextIndependentVerification
            )
        )

        audio_config = speechsdk.AudioConfig(filename=temp_path)
        recognizer = speechsdk.SpeakerRecognizer(
            speech_config=self.speech_config,
            audio_config=audio_config
        )

        result = recognizer.recognize_once_async(model).get()
        os.remove(temp_path)

        return {
            "verified": result.reason == speechsdk.ResultReason.RecognizedSpeaker,
            "confidence": result.score
        }

auth_service = VoiceAuthService()

@app.route('/enroll', methods=['POST'])
def enroll():
    user_id = request.form.get('user_id')
    audio = request.files['audio'].read()
    result = auth_service.enroll(user_id, audio)
    return jsonify(result)

@app.route('/verify', methods=['POST'])
def verify():
    user_id = request.form.get('user_id')
    audio = request.files['audio'].read()
    result = auth_service.verify(user_id, audio)
    return jsonify(result)

if __name__ == '__main__':
    app.run(port=5000)
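
To exercise the endpoints, a sketch using the requests library (an extra dependency; the WAV file names and user ID are placeholders, and the server is assumed to be running locally):

# Hedged client sketch for the API above; file names and user ID are placeholders
import requests

BASE_URL = "http://localhost:5000"

with open("enrollment.wav", "rb") as f:
    response = requests.post(
        f"{BASE_URL}/enroll",
        data={"user_id": "alice"},
        files={"audio": f}
    )
print("Enroll:", response.json())

with open("verification.wav", "rb") as f:
    response = requests.post(
        f"{BASE_URL}/verify",
        data={"user_id": "alice"},
        files={"audio": f}
    )
print("Verify:", response.json())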

Best Practices

  1. Audio Quality: Use consistent recording conditions
  2. Enrollment Samples: Collect diverse samples for robustness
  3. Threshold Tuning: Adjust confidence thresholds per use case
  4. Privacy: Handle voice data with appropriate security
  5. Multi-Factor: Combine with other authentication methods
  6. Liveness Detection: Implement anti-spoofing measures

Speaker Recognition enables secure, frictionless authentication and personalized experiences based on voice identity.

Michael John Pena

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.