5 min read
GPT-4o Real-Time Vision: Processing Images at Scale
GPT-4o’s vision capabilities are impressive, but what makes them practical for enterprise is the combination of quality, speed, and cost. Today I’m exploring real-time vision use cases.
Vision API Basics
import base64
import os

import httpx
from openai import AzureOpenAI
# Azure OpenAI client. Endpoint and key are read from the environment so
# no credentials are hard-coded in the snippet.
azure_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
azure_api_key = os.environ["AZURE_OPENAI_KEY"]
client = AzureOpenAI(
    api_version="2024-05-01-preview",
    azure_endpoint=azure_endpoint,
    api_key=azure_api_key,
)
def analyze_image(image_path, prompt):
    """Send a local image plus a text prompt to GPT-4o and return the reply text.

    The file is read from disk, base64-encoded, and embedded as a PNG data
    URL with detail="high" for fine-grained analysis.
    """
    with open(image_path, "rb") as image_file:
        encoded = base64.b64encode(image_file.read()).decode("utf-8")

    text_part = {"type": "text", "text": prompt}
    image_part = {
        "type": "image_url",
        "image_url": {
            "url": f"data:image/png;base64,{encoded}",
            "detail": "high",
        },
    }
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": [text_part, image_part]}],
        max_tokens=1000,
    )
    return response.choices[0].message.content
Image Detail Levels
GPT-4o supports three detail levels:
| Detail | Tokens | Use Case |
|---|---|---|
| low | 85 | Quick classification |
| high | 85 + 170/tile | Detailed analysis |
| auto | Varies | Let model decide |
def estimate_tokens(width, height, detail="auto"):
    """Estimate GPT-4o vision token cost for an image of the given pixel size.

    "low" detail is a flat 85 tokens. Otherwise (high/auto) the image is
    first scaled to fit a 2048px bounding box, then its short side is
    scaled down to at most 768px, and each 512px tile (rounded up) costs
    170 tokens on top of an 85-token base.
    """
    if detail == "low":
        return 85

    # Fit the long side within 2048px.
    longest = max(width, height)
    if longest > 2048:
        scale = 2048 / longest
        width, height = int(width * scale), int(height * scale)

    # Then cap the short side at 768px.
    shortest = min(width, height)
    if shortest > 768:
        scale = 768 / shortest
        width, height = int(width * scale), int(height * scale)

    # Ceiling division without math.ceil: -(-n // d).
    tiles = (-(-width // 512)) * (-(-height // 512))
    return 85 + 170 * tiles
Document Processing Pipeline
Processing invoices, receipts, and forms:
import asyncio
from dataclasses import dataclass
from typing import List
import json
@dataclass
class InvoiceData:
    """Structured invoice fields extracted from an image by GPT-4o."""
    vendor: str  # vendor/company name as printed on the invoice
    invoice_number: str  # e.g. "INV-XXX"
    date: str  # ISO format, "YYYY-MM-DD" (as requested in the prompt)
    total: float  # invoice grand total
    line_items: List[dict]  # each dict: description, quantity, unit_price, total
async def process_invoice(image_path: str) -> InvoiceData:
    """Extract structured invoice data from an image file via GPT-4o.

    The blocking OpenAI SDK call runs in a worker thread through
    asyncio.to_thread so the event loop stays responsive. The model's JSON
    reply is filtered down to the fields InvoiceData declares: LLM output is
    untrusted and may include extra keys, which would otherwise make
    InvoiceData(**data) raise TypeError.

    Raises:
        json.JSONDecodeError: if the model reply is not valid JSON.
        TypeError: if a required InvoiceData field is missing from the reply.
    """
    from dataclasses import fields  # local: module header only imports `dataclass`

    prompt = """Extract invoice data in JSON format:
{
"vendor": "company name",
"invoice_number": "INV-XXX",
"date": "YYYY-MM-DD",
"total": 0.00,
"line_items": [
{"description": "", "quantity": 0, "unit_price": 0.00, "total": 0.00}
]
}
Return only valid JSON."""
    with open(image_path, "rb") as f:
        image_data = base64.b64encode(f.read()).decode("utf-8")

    response = await asyncio.to_thread(
        client.chat.completions.create,
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{image_data}",
                            "detail": "high"
                        }
                    }
                ]
            }
        ],
        response_format={"type": "json_object"},
        max_tokens=1000
    )
    data = json.loads(response.choices[0].message.content)
    # Keep only declared fields so unexpected keys don't crash the
    # constructor; missing required fields still fail loudly.
    allowed = {f.name for f in fields(InvoiceData)}
    return InvoiceData(**{k: v for k, v in data.items() if k in allowed})
async def process_batch(image_paths: List[str]) -> List[InvoiceData]:
    """Run invoice extraction concurrently for every path in *image_paths*."""
    pending = (process_invoice(path) for path in image_paths)
    return await asyncio.gather(*pending)
Architecture Diagram Analysis
Analyzing technical diagrams:
def analyze_architecture(diagram_path: str) -> dict:
    """Review an architecture diagram with GPT-4o acting as a cloud architect.

    Returns the model's structured findings (components, data flows, single
    points of failure, scalability and security notes) parsed from its JSON
    reply.
    """
    prompt = """Analyze this architecture diagram and provide:
1. List of components/services identified
2. Data flow between components
3. Potential single points of failure
4. Scalability concerns
5. Security considerations
Format as structured JSON."""
    with open(diagram_path, "rb") as diagram_file:
        encoded = base64.b64encode(diagram_file.read()).decode("utf-8")

    system_message = {
        "role": "system",
        "content": "You are an expert cloud architect reviewing system designs."
    }
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": prompt},
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{encoded}",
                    "detail": "high"
                }
            }
        ]
    }
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[system_message, user_message],
        response_format={"type": "json_object"},
        max_tokens=2000
    )
    return json.loads(response.choices[0].message.content)
Chart and Graph Understanding
Extracting data from visualizations:
def extract_chart_data(chart_image: str) -> dict:
    """Pull structured data out of a chart/graph image using GPT-4o.

    *chart_image* is passed straight through as the image URL (a data URL
    or an https URL — note this function does not read a local file).
    Returns the parsed JSON reply.
    """
    prompt = """Analyze this chart/graph and extract:
1. Chart type (bar, line, pie, etc.)
2. Axis labels and units
3. Data series with approximate values
4. Key insights or trends
5. Any anomalies or notable patterns
Return as JSON with these sections."""
    user_content = [
        {"type": "text", "text": prompt},
        {"type": "image_url", "image_url": {"url": chart_image, "detail": "high"}}
    ]
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": user_content}],
        response_format={"type": "json_object"}
    )
    return json.loads(response.choices[0].message.content)
Multi-Image Analysis
Comparing multiple images:
def compare_images(image_paths: List[str], comparison_prompt: str) -> str:
    """Ask GPT-4o to compare several local images against a single prompt.

    Every image is attached at detail="low" to keep token cost down —
    comparisons rarely need pixel-level fidelity.
    """
    parts = [{"type": "text", "text": comparison_prompt}]
    for path in image_paths:
        with open(path, "rb") as handle:
            payload = base64.b64encode(handle.read()).decode("utf-8")
        parts.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:image/png;base64,{payload}",
                "detail": "low",
            },
        })
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": parts}],
        max_tokens=1500,
    )
    return response.choices[0].message.content
# Example: compare three monthly dashboard screenshots to surface
# month-over-month trends and regressions.
result = compare_images(
    ["dashboard_jan.png", "dashboard_feb.png", "dashboard_mar.png"],
    "Compare these three monthly dashboard screenshots. What trends do you see? Any concerning changes?"
)
Integration with Azure Blob Storage
Processing images stored in Azure:
from azure.storage.blob import BlobServiceClient
from azure.identity import DefaultAzureCredential
# `storage_account` was previously an undefined name (NameError at import
# time); resolve it from the environment so the snippet runs as written.
storage_account = os.environ["AZURE_STORAGE_ACCOUNT"]

credential = DefaultAzureCredential()
blob_service = BlobServiceClient(
    account_url=f"https://{storage_account}.blob.core.windows.net",
    credential=credential
)
async def process_blob_image(container: str, blob_name: str) -> dict:
    """Download an image blob from Azure Storage and ask GPT-4o to describe it.

    Both the blob download and the OpenAI request are blocking SDK calls, so
    they are pushed onto worker threads with asyncio.to_thread instead of
    running directly on the event loop (matching process_invoice). The MIME
    type is inferred case-insensitively from the blob extension; anything
    that is not PNG is labelled JPEG, as before.

    Returns:
        dict with the blob name and the model's textual analysis.
    """
    container_client = blob_service.get_container_client(container)
    blob_client = container_client.get_blob_client(blob_name)
    # download_blob().readall() is synchronous — run it off the event loop.
    blob_data = await asyncio.to_thread(
        lambda: blob_client.download_blob().readall()
    )
    image_data = base64.b64encode(blob_data).decode("utf-8")
    # Case-insensitive check so ".PNG" blobs are not misclassified as JPEG.
    mime_type = "image/png" if blob_name.lower().endswith(".png") else "image/jpeg"
    response = await asyncio.to_thread(
        client.chat.completions.create,
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe this image in detail."},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{image_data}"
                        }
                    }
                ]
            }
        ]
    )
    return {
        "blob": blob_name,
        "analysis": response.choices[0].message.content
    }
Best Practices
- Optimize image size - Resize before sending to reduce tokens
- Use appropriate detail level - Low for classification, high for extraction
- Batch requests - Process multiple images concurrently
- Cache results - Store analysis results to avoid reprocessing
- Handle errors gracefully - Images may fail validation
What’s Next
Tomorrow we’ll explore multimodal conversations combining text, voice, and vision in a single interaction.