Multi-Modal AI Applications: Beyond Text
Modern AI applications combine text, images, audio, and video for richer experiences. Here’s how to build them.
Multi-Modal Pipeline
The `MultiModalAgent` class below wraps an async Azure OpenAI client (`AsyncAzureOpenAI` from the `openai` package, since every call is awaited) plus a Speech SDK config, and covers four tasks: single-image Q&A, multi-page document extraction, audio transcription with analysis, and generation grounded in several images.

```python
import base64
import json

from openai import AsyncAzureOpenAI
from azure.cognitiveservices.speech import SpeechConfig


class MultiModalAgent:
    def __init__(self, openai_client: AsyncAzureOpenAI, speech_config: SpeechConfig):
        self.openai = openai_client
        self.speech = speech_config

    async def analyze_image(self, image_bytes: bytes, question: str) -> str:
        """Analyze an image and answer a question about it."""
        image_b64 = base64.b64encode(image_bytes).decode()

        response = await self.openai.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{image_b64}",
                            "detail": "high"
                        }
                    }
                ]
            }]
        )
        return response.choices[0].message.content

    async def analyze_document(self, document_images: list[bytes]) -> dict:
        """Extract structured data from document images."""
        extracted = []
        for image in document_images:
            result = await self.analyze_image(
                image,
                "Extract all text, tables, and key information from this document."
            )
            extracted.append(result)

        # Combine the per-page results and structure them
        # (structure_extraction is a separate helper, not shown here).
        combined = "\n\n".join(extracted)
        return await self.structure_extraction(combined)

    async def transcribe_and_analyze(self, audio_bytes: bytes) -> dict:
        """Transcribe audio and analyze its content."""
        # Transcribe with Whisper; pass a filename alongside the raw bytes
        # so the API can infer the audio format.
        transcription = await self.openai.audio.transcriptions.create(
            model="whisper-1",
            file=("audio.wav", audio_bytes)
        )

        # Analyze the transcription. The prompt must mention JSON when
        # response_format={"type": "json_object"} is used.
        analysis = await self.openai.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "system",
                "content": "Analyze this transcript. Return a JSON object with "
                           "key points, action items, and sentiment."
            }, {
                "role": "user",
                "content": transcription.text
            }],
            response_format={"type": "json_object"}
        )

        return {
            "transcript": transcription.text,
            "analysis": json.loads(analysis.choices[0].message.content)
        }

    async def generate_with_images(self, prompt: str, context_images: list[bytes]) -> str:
        """Generate a response using multiple images as context."""
        content = [{"type": "text", "text": prompt}]
        for img in context_images:
            img_b64 = base64.b64encode(img).decode()
            content.append({
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{img_b64}"}
            })

        response = await self.openai.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": content}]
        )
        return response.choices[0].message.content
```
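Wiring it up looks roughly like this. A minimal usage sketch, assuming a `gpt-4o` deployment, the `openai` and `azure-cognitiveservices-speech` packages, and illustrative environment-variable names and file paths (adjust these to your setup):

```python
import asyncio
import os

from openai import AsyncAzureOpenAI
from azure.cognitiveservices.speech import SpeechConfig


async def main():
    client = AsyncAzureOpenAI(
        azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],  # assumed env var name
        api_key=os.environ["AZURE_OPENAI_API_KEY"],          # assumed env var name
        api_version="2024-06-01",
    )
    speech = SpeechConfig(
        subscription=os.environ["SPEECH_KEY"],               # assumed env var name
        region=os.environ["SPEECH_REGION"],                  # assumed env var name
    )
    agent = MultiModalAgent(client, speech)

    # Ask a question about a chart image (hypothetical file path).
    with open("chart.png", "rb") as f:
        answer = await agent.analyze_image(f.read(), "What trend does this chart show?")
    print(answer)


asyncio.run(main())
```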
Multi-modal AI opens up applications from document processing to video analysis.
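Video, in particular, is largely a frame-sampling problem on top of the image path above. A rough sketch, assuming OpenCV (`opencv-python`) is installed and that sampling roughly one frame per second is enough for the task at hand:

```python
import cv2


def sample_frames(video_path: str, every_n_seconds: float = 1.0) -> list[bytes]:
    """Sample frames from a video and return them as PNG-encoded bytes."""
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
    step = max(int(fps * every_n_seconds), 1)

    frames, index = [], 0
    while True:
        ok, frame = cap.read()
        if not ok:
            break
        if index % step == 0:
            ok, buf = cv2.imencode(".png", frame)
            if ok:
                frames.append(buf.tobytes())
        index += 1

    cap.release()
    return frames


# frames = sample_frames("meeting.mp4")  # hypothetical file
# summary = await agent.generate_with_images(
#     "Summarize what happens in these frames.", frames[:10]
# )
```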