5 min read
Multimodal Conversations with GPT-4o
The real power of GPT-4o emerges when you combine modalities in a single conversation. With text, images, and audio sharing one context, the model can reason across all of them at once, enabling interactions that single-modality models couldn't support.
The Multimodal Conversation Flow
A typical multimodal conversation might look like:
User: [Shows image of error log] "What's wrong here?"
GPT-4o: [Analyzes image, identifies issue]
User: [Voice] "Can you help me fix it?"
GPT-4o: [Voice response with code suggestion]
User: [Shows updated screenshot] "Is this right now?"
GPT-4o: [Confirms or suggests corrections]
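Under the hood, each of those turns is just another entry in the same messages array. As a rough sketch (the file name and variable names are placeholders), the first turn of that flow maps onto the Chat Completions message format like this:

import base64

# Placeholder path for the user's error-log screenshot
with open("error_log.png", "rb") as f:
    error_log_b64 = base64.b64encode(f.read()).decode("utf-8")

messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What's wrong here?"},
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{error_log_b64}"},
            },
        ],
    },
]
# Each later turn (the voice transcript, the follow-up screenshot) is appended
# to this same list, so the model sees the full multimodal history on every request.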
Building a Multimodal Chat Interface
from dataclasses import dataclass
from typing import List, Optional, Literal
from enum import Enum
import base64


class MessageType(Enum):
    TEXT = "text"
    IMAGE = "image"
    AUDIO = "audio"


@dataclass
class MultimodalMessage:
    role: Literal["user", "assistant", "system"]
    content_type: MessageType
    content: str  # text, base64 image, or base64 audio


class MultimodalChat:
    def __init__(self, client, system_prompt: str = None):
        self.client = client
        self.messages = []
        if system_prompt:
            self.messages.append({
                "role": "system",
                "content": system_prompt
            })

    def add_text(self, text: str, role: str = "user"):
        self.messages.append({
            "role": role,
            "content": text
        })

    def add_image(self, image_path: str, text: str = ""):
        with open(image_path, "rb") as f:
            image_data = base64.b64encode(f.read()).decode("utf-8")

        content = []
        if text:
            content.append({"type": "text", "text": text})
        content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{image_data}"}
        })

        self.messages.append({
            "role": "user",
            "content": content
        })

    def chat(self, user_input=None) -> str:
        if user_input:
            self.add_text(user_input)

        response = self.client.chat.completions.create(
            model="gpt-4o",
            messages=self.messages,
            max_tokens=2000
        )

        assistant_message = response.choices[0].message.content
        self.messages.append({
            "role": "assistant",
            "content": assistant_message
        })
        return assistant_message
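A note on setup: the examples in this post assume `client` is an instance of the official openai Python SDK (v1+) client. A minimal sketch of wiring it up and using MultimodalChat directly might look like this (the image path and prompts are illustrative):

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

chat = MultimodalChat(client, system_prompt="You are a helpful visual assistant.")
chat.add_image("architecture.png", "What does this diagram show?")
print(chat.chat())
print(chat.chat("Which component looks like the bottleneck?"))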
Practical Example: Debug Assistant
class DebugAssistant:
    def __init__(self, client):
        self.chat = MultimodalChat(
            client,
            system_prompt="""You are an expert debugging assistant.
            Analyze error logs, screenshots, and code to help diagnose issues.
            Be concise and provide actionable solutions."""
        )

    def analyze_error(self, error_screenshot: str) -> str:
        self.chat.add_image(
            error_screenshot,
            "I'm seeing this error. What's wrong and how do I fix it?"
        )
        return self.chat.chat()

    def provide_context(self, code_snippet: str) -> str:
        return self.chat.chat(f"Here's the relevant code:\n```\n{code_snippet}\n```")

    def verify_fix(self, fixed_screenshot: str) -> str:
        self.chat.add_image(
            fixed_screenshot,
            "I made changes. Does this look correct now?"
        )
        return self.chat.chat()


# Usage
assistant = DebugAssistant(client)

# Step 1: Show the error
print(assistant.analyze_error("error_screen.png"))
# Output: "The error shows a NullReferenceException on line 42..."

# Step 2: Provide code context
print(assistant.provide_context("""
def process_data(items):
    for item in items:
        result = item.value * 2
    return result
"""))
# Output: "The issue is that 'result' is only set inside the loop..."

# Step 3: Verify the fix
print(assistant.verify_fix("fixed_screen.png"))
# Output: "Yes, the error is resolved. The code now correctly..."
Combining Audio and Vision
For real-time applications:
class MultimodalRealtimeClient {
  constructor(apiKey) {
    this.apiKey = apiKey;
    this.ws = null;
    this.imageContext = null;
  }

  async connect() {
    this.ws = new WebSocket(
      'wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview',
      ['realtime', `openai-insecure-api-key.${this.apiKey}`, 'openai-beta.realtime-v1']
    );

    this.ws.onopen = () => {
      this.configure();
    };
  }

  configure() {
    this.ws.send(JSON.stringify({
      type: 'session.update',
      session: {
        modalities: ['text', 'audio'],
        voice: 'alloy',
        instructions: `You are a multimodal assistant that can discuss
          images shared earlier in the conversation while responding
          with voice.`
      }
    }));
  }

  async sendImageForDiscussion(imageBase64, description) {
    // First, send the image via the REST API to establish context
    const response = await fetch('https://api.openai.com/v1/chat/completions', {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${this.apiKey}`,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({
        model: 'gpt-4o',
        messages: [{
          role: 'user',
          content: [
            { type: 'text', text: description },
            { type: 'image_url', image_url: { url: `data:image/png;base64,${imageBase64}` } }
          ]
        }]
      })
    });

    const data = await response.json();
    this.imageContext = data.choices[0].message.content;

    // Update the realtime session with the image context
    this.ws.send(JSON.stringify({
      type: 'session.update',
      session: {
        instructions: `Previous image analysis: ${this.imageContext}\n\n
          Use this context when the user asks about the image.`
      }
    }));
  }
}
Context Management Strategies
Managing context across modalities:
class ConversationContext:
    def __init__(self, max_images: int = 5, max_tokens: int = 100000):
        self.messages = []
        self.image_count = 0
        self.max_images = max_images
        self.max_tokens = max_tokens
        self.token_count = 0

    def add_message(self, message: dict, estimated_tokens: int):
        # Trim context until the new message fits; stop once only the
        # system message remains so we never loop forever
        while (self.token_count + estimated_tokens > self.max_tokens
               and len(self.messages) > 1):
            self._trim_oldest()

        self.messages.append(message)
        self.token_count += estimated_tokens

        # Count images
        if isinstance(message.get("content"), list):
            for item in message["content"]:
                if item.get("type") == "image_url":
                    self.image_count += 1

        # Trim old images if needed
        while self.image_count > self.max_images:
            self._remove_oldest_image()

    def _trim_oldest(self):
        if len(self.messages) > 1:  # Keep system message
            removed = self.messages.pop(1)
            # Estimate and subtract tokens
            self.token_count -= self._estimate_tokens(removed)

    def _remove_oldest_image(self):
        for msg in self.messages:
            if isinstance(msg.get("content"), list):
                for j, item in enumerate(msg["content"]):
                    if item.get("type") == "image_url":
                        msg["content"].pop(j)
                        self.image_count -= 1
                        return

    def _estimate_tokens(self, message) -> int:
        # Simplified estimation: roughly 4 characters per token for text
        if isinstance(message.get("content"), str):
            return len(message["content"]) // 4
        return 500  # Default for complex (multimodal) messages
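Here's a rough sketch of how ConversationContext might be wired in; the token counts are hand-waved estimates rather than real tokenizer output:

context = ConversationContext(max_images=3, max_tokens=8000)

context.add_message(
    {"role": "system", "content": "You are a debugging assistant."},
    estimated_tokens=12,
)
context.add_message(
    {"role": "user", "content": "Why does my build keep failing?"},
    estimated_tokens=10,
)

# context.messages can now be passed as the `messages` argument to
# client.chat.completions.create(...), with old turns and images already trimmed.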
Best Practices
- Maintain coherent context - Reference previous images/audio naturally
- Handle modality transitions - Smoothly switch between text and voice
- Optimize token usage - Summarize previous visual context when possible (see the sketch after this list)
- Provide clear instructions - Tell the model how to handle each modality
- Design for failure - Handle cases where one modality fails
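To make the token-usage point concrete, one option is to swap an old image message for a short text summary of what the model already said about it. This is only an illustrative helper, not part of any SDK:

def summarize_image_turn(messages: list, index: int, summary: str) -> None:
    """Replace an image-bearing message with a compact text summary.

    `summary` is typically the assistant's own analysis of that image
    from the following turn, so no visual information is silently lost.
    """
    messages[index] = {
        "role": "user",
        "content": f"[Image previously shared; summary: {summary}]",
    }

# Example (hypothetical): condense the image in the second message
# summarize_image_turn(chat.messages, 1, "Dashboard showed a 40% drop in signups")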
Real-World Application: Data Analytics Assistant
class DataAnalyticsAssistant:
    def __init__(self, client):
        self.client = client
        self.context = [
            {
                "role": "system",
                "content": "You are a data analyst. Analyze dashboards and provide actionable insights."
            }
        ]

    def analyze_dashboard(self, screenshot_path: str) -> str:
        """Analyze a dashboard screenshot and provide insights."""
        with open(screenshot_path, "rb") as f:
            image_data = base64.b64encode(f.read()).decode("utf-8")

        self.context.append({
            "role": "user",
            "content": [
                {"type": "text", "text": "Analyze this dashboard and highlight key insights."},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_data}"}}
            ]
        })

        response = self.client.chat.completions.create(
            model="gpt-4o",
            messages=self.context
        )

        analysis = response.choices[0].message.content
        # Keep the analysis in context so follow-up questions can reference it
        self.context.append({"role": "assistant", "content": analysis})
        return analysis

    def follow_up(self, question: str) -> str:
        """Ask follow-up questions about the analysis."""
        self.context.append({"role": "user", "content": question})

        response = self.client.chat.completions.create(
            model="gpt-4o",
            messages=self.context
        )

        answer = response.choices[0].message.content
        self.context.append({"role": "assistant", "content": answer})
        return answer
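A possible call sequence (the screenshot path and questions are illustrative):

analytics = DataAnalyticsAssistant(client)

print(analytics.analyze_dashboard("q3_dashboard.png"))
print(analytics.follow_up("Which of these metrics should we prioritize next quarter?"))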
What’s Next
Tomorrow I’ll cover Azure OpenAI GPT-4o deployment and configuration specifics.