Back to Blog
7 min read

Meeting Summarization with Azure AI

Introduction

Meeting summarization combines speech recognition, speaker diarization, and language understanding to transform lengthy meetings into actionable summaries. Azure AI provides the building blocks to create comprehensive meeting intelligence solutions.

Meeting Summarization Pipeline

Architecture Overview

from dataclasses import dataclass, field
from typing import List, Optional, Dict
from datetime import datetime, timedelta
from enum import Enum

class MeetingSegmentType(Enum):
    """Categories a transcript segment can be classified into."""

    DISCUSSION = "discussion"      # general back-and-forth conversation
    DECISION = "decision"          # a conclusion the group committed to
    ACTION_ITEM = "action_item"    # a task assigned during the meeting
    QUESTION = "question"          # an open question needing follow-up
    PRESENTATION = "presentation"  # one speaker presenting material

@dataclass
class Speaker:
    """A meeting participant as reported by speaker diarization.

    ``name`` and ``role`` start out unset; they can be filled in later once
    the raw diarization id (e.g. ``"Guest-1"``) is mapped to a real person.
    """

    id: str                    # raw diarization id from the speech service
    name: Optional[str] = None # human-readable name, if known
    role: Optional[str] = None # organizational role, if known

@dataclass
class TranscriptSegment:
    """One contiguous utterance by a single speaker in the transcript."""

    speaker: Speaker                                   # who spoke
    text: str                                          # recognized speech
    start_time: float                                  # offset in seconds
    end_time: float                                    # offset in seconds
    segment_type: Optional[MeetingSegmentType] = None  # optional classification

@dataclass
class ActionItem:
    """A task extracted from the meeting, optionally assigned and dated."""

    description: str         # what needs to be done
    assignee: Optional[str]  # person responsible, or None if unassigned
    due_date: Optional[str]  # free-form due date text, or None
    priority: str = "medium" # "high" / "medium" / "low"

@dataclass
class Decision:
    """A decision reached during the meeting and who agreed to it."""

    description: str    # what was decided
    made_by: List[str]  # names of the people who made/agreed to it
    timestamp: float    # seconds into the meeting (0 when unknown)

@dataclass
class MeetingSummary:
    """Complete structured output of the summarization pipeline."""

    title: str                                   # meeting title
    date: datetime                               # when the summary was made
    duration_minutes: float                      # first-to-last segment span
    participants: List[Speaker]                  # unique speakers detected
    executive_summary: str                       # LLM-written overview
    key_topics: List[str]                        # main topics discussed
    decisions: List[Decision]                    # decisions reached
    action_items: List[ActionItem]               # tasks assigned
    transcript_segments: List[TranscriptSegment] # full diarized transcript
    questions_raised: List[str]                  # open questions
    next_steps: List[str]                        # recommended follow-ups

Speech-to-Text with Diarization

import os
import azure.cognitiveservices.speech as speechsdk
import time

class MeetingTranscriber:
    """Transcribes a meeting recording with speaker diarization via Azure Speech."""

    def __init__(self):
        # Credentials come from the environment; SpeechConfig raises if the
        # key/region are missing.
        self.speech_config = speechsdk.SpeechConfig(
            subscription=os.getenv("AZURE_SPEECH_KEY"),
            region=os.getenv("AZURE_SPEECH_REGION")
        )
        self.speech_config.speech_recognition_language = "en-US"
        # Ask the service to tag intermediate results with speaker ids too.
        self.speech_config.set_property(
            speechsdk.PropertyId.SpeechServiceResponse_DiarizeIntermediateResults,
            "true"
        )

    def transcribe_meeting(
        self,
        audio_file: str,
        expected_speakers: int = 2
    ) -> List[TranscriptSegment]:
        """Transcribe *audio_file* and return diarized transcript segments.

        Args:
            audio_file: Path to a WAV recording of the meeting.
            expected_speakers: Hint for callers; not passed to the SDK here.

        Returns:
            Segments in recognition order, each tagged with a raw speaker id.
        """
        audio_config = speechsdk.AudioConfig(filename=audio_file)

        conversation_transcriber = speechsdk.transcription.ConversationTranscriber(
            speech_config=self.speech_config,
            audio_config=audio_config
        )

        segments: List[TranscriptSegment] = []
        done = False

        def handle_transcribed(evt):
            # Collect only final recognitions; intermediate results are skipped.
            if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
                # SDK offsets/durations are in 100-nanosecond ticks.
                segment = TranscriptSegment(
                    speaker=Speaker(id=evt.result.speaker_id),
                    text=evt.result.text,
                    start_time=evt.result.offset / 10_000_000,  # ticks -> seconds
                    end_time=(evt.result.offset + evt.result.duration) / 10_000_000
                )
                segments.append(segment)
                print(f"[{segment.speaker.id}] {segment.text[:50]}...")

        def handle_canceled(evt):
            # Cancellation (error or end of stream) also ends the wait loop.
            nonlocal done
            done = True

        def handle_stopped(evt):
            nonlocal done
            done = True

        conversation_transcriber.transcribed.connect(handle_transcribed)
        conversation_transcriber.canceled.connect(handle_canceled)
        conversation_transcriber.session_stopped.connect(handle_stopped)

        # BUG FIX: start/stop_transcribing_async return futures; calling
        # .get() blocks until the operation actually completes instead of
        # racing past it (the original discarded both futures).
        conversation_transcriber.start_transcribing_async().get()

        # Poll until the session stops or is canceled.
        while not done:
            time.sleep(0.5)

        conversation_transcriber.stop_transcribing_async().get()

        return segments

    def identify_speakers(
        self,
        segments: List[TranscriptSegment],
        speaker_names: Optional[Dict[str, str]] = None
    ) -> List[TranscriptSegment]:
        """Attach human-readable names to segment speakers (mutates in place).

        When *speaker_names* is omitted or empty, deterministic defaults
        ("Speaker 1", "Speaker 2", ...) are assigned by sorted raw id.

        Args:
            segments: Transcript segments to annotate.
            speaker_names: Optional mapping of raw speaker id -> display name.

        Returns:
            The same list, with ``speaker.name`` filled in where possible.
        """
        if not speaker_names:
            # Generate stable default names from the sorted set of raw ids.
            unique_speakers = set(seg.speaker.id for seg in segments)
            speaker_names = {
                sid: f"Speaker {i+1}"
                for i, sid in enumerate(sorted(unique_speakers))
            }

        for segment in segments:
            if segment.speaker.id in speaker_names:
                segment.speaker.name = speaker_names[segment.speaker.id]

        return segments

# Usage
# Transcribe the recording, then map the raw diarization ids ("Guest-N")
# to real participant names; `segments` is reused by the summarizer below.
transcriber = MeetingTranscriber()
segments = transcriber.transcribe_meeting("meeting_recording.wav")
segments = transcriber.identify_speakers(segments, {
    "Guest-1": "John Smith",
    "Guest-2": "Jane Doe"
})

AI-Powered Summary Generation

from openai import AzureOpenAI
import json

class MeetingSummarizer:
    """Generate a structured MeetingSummary from a diarized transcript.

    Each extraction (executive summary, action items, decisions, topics,
    questions, next steps) is a separate chat-completion call so prompts
    stay focused and outputs remain easy to parse.
    """

    def __init__(self):
        # Credentials and endpoint come from the environment.
        self.client = AzureOpenAI(
            api_key=os.getenv("AZURE_OPENAI_API_KEY"),
            api_version="2023-09-01-preview",
            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
        )

    def generate_summary(
        self,
        segments: List[TranscriptSegment],
        meeting_title: str = "Team Meeting"
    ) -> MeetingSummary:
        """Run all extraction passes over *segments* and assemble the summary.

        Args:
            segments: Diarized transcript segments in chronological order.
            meeting_title: Title recorded on the resulting summary.

        Returns:
            A fully populated MeetingSummary.
        """
        transcript_text = self._format_transcript(segments)

        summary_response = self._generate_executive_summary(transcript_text)
        action_items = self._extract_action_items(transcript_text)
        decisions = self._extract_decisions(transcript_text)
        topics = self._extract_key_topics(transcript_text)
        questions = self._extract_questions(transcript_text)

        # Duration spans first segment start to last segment end, in minutes.
        if segments:
            duration = (segments[-1].end_time - segments[0].start_time) / 60
        else:
            duration = 0

        # Dict comprehension dedupes speakers by id (last occurrence wins).
        participants = list({seg.speaker.id: seg.speaker for seg in segments}.values())

        return MeetingSummary(
            title=meeting_title,
            date=datetime.now(),
            duration_minutes=duration,
            participants=participants,
            executive_summary=summary_response,
            key_topics=topics,
            decisions=decisions,
            action_items=action_items,
            transcript_segments=segments,
            questions_raised=questions,
            next_steps=self._extract_next_steps(transcript_text)
        )

    def _format_transcript(self, segments: List[TranscriptSegment]) -> str:
        """Render segments as '[MM:SS] Name: text' lines for the LLM."""
        formatted = []
        for seg in segments:
            speaker_name = seg.speaker.name or seg.speaker.id
            time_str = f"{int(seg.start_time // 60):02d}:{int(seg.start_time % 60):02d}"
            formatted.append(f"[{time_str}] {speaker_name}: {seg.text}")
        return "\n".join(formatted)

    @staticmethod
    def _parse_json_array(raw: str) -> list:
        """Best-effort parse of a model response expected to be a JSON array.

        BUG FIX: models frequently wrap JSON in Markdown code fences
        (```json ... ```), which made the original json.loads fail and
        silently return []. Strip fences first; return [] for anything
        that still isn't a JSON list.
        """
        text = raw.strip()
        if text.startswith("```"):
            # Drop the opening fence line (with optional language tag).
            text = text.split("\n", 1)[1] if "\n" in text else ""
            stripped = text.rstrip()
            if stripped.endswith("```"):
                text = stripped[:-3]
        try:
            data = json.loads(text)
        except json.JSONDecodeError:
            return []
        return data if isinstance(data, list) else []

    def _generate_executive_summary(self, transcript: str) -> str:
        """Generate a 2-3 paragraph executive summary of the transcript."""
        prompt = f"""Analyze this meeting transcript and provide a concise executive summary (2-3 paragraphs).
Focus on the main purpose, key discussions, and outcomes.

Transcript:
{transcript}

Executive Summary:"""

        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a professional meeting analyst."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=500,
            temperature=0.3  # low temperature for factual, stable summaries
        )

        return response.choices[0].message.content

    def _extract_action_items(self, transcript: str) -> List[ActionItem]:
        """Extract action items from the transcript as ActionItem objects."""
        prompt = f"""Extract all action items from this meeting transcript.
For each action item, identify:
- Description of the task
- Person assigned (if mentioned)
- Due date (if mentioned)
- Priority (high/medium/low based on urgency discussed)

Return as JSON array with fields: description, assignee, due_date, priority

Transcript:
{transcript}

Action Items JSON:"""

        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You extract action items from meetings. Return only valid JSON."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=1000,
            temperature=0.1
        )

        # BUG FIX: the original ActionItem(**item) raised TypeError on any
        # unexpected key and lost the whole list; build fields defensively.
        items: List[ActionItem] = []
        for item in self._parse_json_array(response.choices[0].message.content):
            if not isinstance(item, dict):
                continue
            items.append(ActionItem(
                description=item.get("description", ""),
                assignee=item.get("assignee"),
                due_date=item.get("due_date"),
                priority=item.get("priority") or "medium",
            ))
        return items

    def _extract_decisions(self, transcript: str) -> List[Decision]:
        """Extract decisions made during the meeting as Decision objects."""
        prompt = f"""Extract all decisions made during this meeting.
For each decision, identify:
- What was decided
- Who made or agreed to the decision

Return as JSON array with fields: description, made_by (array of names)

Transcript:
{transcript}

Decisions JSON:"""

        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You extract decisions from meetings. Return only valid JSON."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=500,
            temperature=0.1
        )

        # BUG FIX: d["description"] raised an uncaught KeyError when a field
        # was missing; use .get with sane defaults instead.
        decisions: List[Decision] = []
        for d in self._parse_json_array(response.choices[0].message.content):
            if not isinstance(d, dict):
                continue
            decisions.append(Decision(
                description=d.get("description", ""),
                made_by=d.get("made_by") or [],
                timestamp=0  # transcript position is not recoverable from the LLM output
            ))
        return decisions

    def _extract_key_topics(self, transcript: str) -> List[str]:
        """Extract the main topics discussed, one string per topic."""
        prompt = f"""List the main topics discussed in this meeting (5-10 bullet points).

Transcript:
{transcript}

Topics (one per line):"""

        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=300,
            temperature=0.3
        )

        topics = response.choices[0].message.content.strip().split("\n")
        # Strip leading bullet characters the model may emit.
        return [t.lstrip("- •").strip() for t in topics if t.strip()]

    def _extract_questions(self, transcript: str) -> List[str]:
        """Extract open questions raised during the meeting."""
        prompt = f"""List any important questions raised during this meeting that may need follow-up.

Transcript:
{transcript}

Questions (one per line):"""

        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=300,
            temperature=0.3
        )

        questions = response.choices[0].message.content.strip().split("\n")
        return [q.lstrip("- •?").strip() for q in questions if q.strip()]

    def _extract_next_steps(self, transcript: str) -> List[str]:
        """Extract recommended next steps or follow-up items."""
        prompt = f"""Based on this meeting, list the recommended next steps or follow-up items.

Transcript:
{transcript}

Next Steps (one per line):"""

        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=300,
            temperature=0.3
        )

        steps = response.choices[0].message.content.strip().split("\n")
        # Strip bullets and "1."-style numbering prefixes.
        return [s.lstrip("- •0123456789.").strip() for s in steps if s.strip()]

# Usage
# Build the full structured summary (exec summary, topics, decisions,
# action items, questions, next steps) from the diarized `segments` above.
summarizer = MeetingSummarizer()
summary = summarizer.generate_summary(segments, "Weekly Team Standup")

Report Generation

class MeetingReportGenerator:
    """Render a MeetingSummary as a Markdown or HTML report."""

    def generate_markdown_report(self, summary: MeetingSummary) -> str:
        """Return the full Markdown report for *summary*.

        Sections: header metadata, executive summary, topics, decisions,
        action items, follow-up questions, next steps, and full transcript.
        """
        report = f"""# {summary.title}

**Date:** {summary.date.strftime('%B %d, %Y')}
**Duration:** {summary.duration_minutes:.0f} minutes
**Participants:** {', '.join(p.name or p.id for p in summary.participants)}

## Executive Summary

{summary.executive_summary}

## Key Topics Discussed

{self._format_list(summary.key_topics)}

## Decisions Made

{self._format_decisions(summary.decisions)}

## Action Items

{self._format_action_items(summary.action_items)}

## Questions for Follow-up

{self._format_list(summary.questions_raised)}

## Next Steps

{self._format_list(summary.next_steps)}

---

## Full Transcript

{self._format_transcript(summary.transcript_segments)}
"""
        return report

    def _format_list(self, items: List[str]) -> str:
        """Render strings as a Markdown bullet list; placeholder when empty."""
        if not items:
            return "_No items_"
        return "\n".join(f"- {item}" for item in items)

    def _format_decisions(self, decisions: List[Decision]) -> str:
        """Render decisions as bullets with attribution ("Team" when unknown)."""
        if not decisions:
            return "_No decisions recorded_"
        lines = []
        for d in decisions:
            by = ", ".join(d.made_by) if d.made_by else "Team"
            lines.append(f"- **{d.description}** (by {by})")
        return "\n".join(lines)

    def _format_action_items(self, items: List[ActionItem]) -> str:
        """Render action items with a priority emoji, assignee, and due date."""
        if not items:
            return "_No action items_"
        lines = []
        for item in items:
            assignee = item.assignee or "Unassigned"
            due = f" (Due: {item.due_date})" if item.due_date else ""
            # Unknown priorities fall back to the neutral white circle.
            priority_emoji = {"high": "🔴", "medium": "🟡", "low": "🟢"}.get(item.priority, "⚪")
            lines.append(f"- {priority_emoji} **{assignee}**: {item.description}{due}")
        return "\n".join(lines)

    def _format_transcript(self, segments: List[TranscriptSegment]) -> str:
        """Render the transcript as bold '[MM:SS] Speaker:' Markdown lines."""
        lines = []
        for seg in segments:
            time_str = f"{int(seg.start_time // 60):02d}:{int(seg.start_time % 60):02d}"
            speaker = seg.speaker.name or seg.speaker.id
            lines.append(f"**[{time_str}] {speaker}:** {seg.text}")
        return "\n\n".join(lines)

    def generate_html_report(self, summary: MeetingSummary) -> str:
        """Return a standalone HTML page for *summary*.

        Requires the third-party `markdown` package (imported lazily so the
        Markdown path works without it).
        """
        import markdown
        import html as _html
        md_report = self.generate_markdown_report(summary)
        html_content = markdown.markdown(md_report, extensions=['tables', 'fenced_code'])

        # BUG FIX: escape the title — it is user-supplied text interpolated
        # into HTML, so raw '<' / '&' would break (or inject into) the page.
        safe_title = _html.escape(summary.title)
        return f"""<!DOCTYPE html>
<html>
<head>
    <title>{safe_title} - Meeting Summary</title>
    <style>
        body {{ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; }}
        h1 {{ color: #333; border-bottom: 2px solid #0078d4; padding-bottom: 10px; }}
        h2 {{ color: #0078d4; margin-top: 30px; }}
        ul {{ line-height: 1.8; }}
        .action-item {{ background: #f0f0f0; padding: 10px; margin: 10px 0; border-left: 4px solid #0078d4; }}
    </style>
</head>
<body>
{html_content}
</body>
</html>"""

# Usage
generator = MeetingReportGenerator()
md_report = generator.generate_markdown_report(summary)

# BUG FIX: write with an explicit encoding — the priority emoji in action
# items break on platforms whose default encoding isn't UTF-8 (e.g. cp1252).
with open("meeting_summary.md", "w", encoding="utf-8") as f:
    f.write(md_report)

Conclusion

Meeting summarization with Azure AI combines speech recognition, speaker identification, and language understanding to transform meetings into actionable documents. By implementing diarization, AI-powered extraction, and structured reporting, you can build comprehensive meeting intelligence solutions that save time and improve team productivity.

Michael John Peña

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.