GPT-4 Multimodal: Understanding Vision Capabilities

GPT-4’s vision capabilities are arguably the most transformative feature of this release. While image input is not yet publicly available, the demos and documentation already reveal powerful possibilities for enterprise applications.

What GPT-4 Vision Can Do

Based on OpenAI’s demos and technical report:

  • Understand diagrams: Flowcharts, architecture diagrams, ERDs
  • Analyze charts: Extract data from visualizations
  • Read documents: Context and layout understanding, beyond plain OCR
  • Interpret screenshots: UI analysis, error identification
  • Describe images: Detailed, contextual descriptions

Preparing for Vision APIs

While we wait for access, we can prepare our applications:

from dataclasses import dataclass
from enum import Enum
import base64
import mimetypes

class ImageSource(Enum):
    URL = "url"
    BASE64 = "base64"
    FILE = "file"

@dataclass
class ImageInput:
    source_type: ImageSource
    data: str  # URL, base64 string, or file path
    detail: str = "auto"  # low, high, or auto
    mime_type: str = "image/jpeg"  # used when building data URLs

    @classmethod
    def from_url(cls, url: str, detail: str = "auto") -> "ImageInput":
        return cls(ImageSource.URL, url, detail)

    @classmethod
    def from_file(cls, path: str, detail: str = "auto") -> "ImageInput":
        mime, _ = mimetypes.guess_type(path)
        with open(path, "rb") as f:
            data = base64.b64encode(f.read()).decode()
        return cls(ImageSource.BASE64, data, detail, mime or "image/jpeg")

    @classmethod
    def from_bytes(cls, data: bytes, detail: str = "auto") -> "ImageInput":
        encoded = base64.b64encode(data).decode()
        return cls(ImageSource.BASE64, encoded, detail)

    def to_api_format(self) -> dict:
        """Convert to OpenAI API format."""
        if self.source_type == ImageSource.URL:
            return {
                "type": "image_url",
                "image_url": {
                    "url": self.data,
                    "detail": self.detail
                }
            }
        else:
            # Base64 payloads travel as data URLs with the recorded MIME type
            return {
                "type": "image_url",
                "image_url": {
                    "url": f"data:{self.mime_type};base64,{self.data}",
                    "detail": self.detail
                }
            }

class VisionMessage:
    """Build multimodal messages with text and images."""

    def __init__(self, role: str = "user"):
        self.role = role
        self.content = []

    def add_text(self, text: str) -> "VisionMessage":
        self.content.append({"type": "text", "text": text})
        return self

    def add_image(self, image: ImageInput) -> "VisionMessage":
        self.content.append(image.to_api_format())
        return self

    def build(self) -> dict:
        return {"role": self.role, "content": self.content}

Use Case: Architecture Diagram Analysis

class ArchitectureAnalyzer:
    """Analyze architecture diagrams using GPT-4 Vision."""

    def __init__(self, client):
        self.client = client

    async def analyze_diagram(
        self,
        image: ImageInput,
        analysis_type: str = "general"
    ) -> dict:
        """Analyze an architecture diagram."""

        prompts = {
            "general": "Describe this architecture diagram in detail. Identify all components, their relationships, and data flows.",
            "security": "Analyze this architecture diagram for security concerns. Identify potential vulnerabilities, missing security controls, and compliance issues.",
            "scalability": "Evaluate this architecture for scalability. Identify bottlenecks, single points of failure, and suggest improvements.",
            "cost": "Analyze this Azure architecture for cost optimization. Identify potentially expensive components and suggest alternatives."
        }

        prompt = prompts.get(analysis_type, prompts["general"])

        message = (VisionMessage()
            .add_text(prompt)
            .add_image(image)
            .build())

        response = await self.client.chat_completion(
            model="gpt-4-vision-preview",
            messages=[message],
            max_tokens=2000
        )

        return {
            "analysis_type": analysis_type,
            "findings": response.content,
            "model": "gpt-4-vision"
        }

    async def compare_architectures(
        self,
        current: ImageInput,
        proposed: ImageInput
    ) -> str:
        """Compare two architecture diagrams."""

        message = (VisionMessage()
            .add_text("Compare these two architecture diagrams. The first is the current state, the second is proposed. Identify: 1) What's being added, 2) What's being removed, 3) What's changing, 4) Potential risks in the migration.")
            .add_image(current)
            .add_image(proposed)
            .build())

        response = await self.client.chat_completion(
            model="gpt-4-vision-preview",
            messages=[message],
            max_tokens=2000
        )

        return response.content
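
Wiring the analyzer into an app is straightforward. A hypothetical run, where client stands in for whatever async wrapper exposes the chat_completion method assumed above:

import asyncio

async def main(client):
    analyzer = ArchitectureAnalyzer(client)
    diagram = ImageInput.from_file("current_architecture.png", detail="high")

    result = await analyzer.analyze_diagram(diagram, analysis_type="security")
    print(result["findings"])

# asyncio.run(main(client)) once a client is configured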

Use Case: Dashboard Analysis

class DashboardAnalyzer:
    """Extract insights from dashboard screenshots."""

    def __init__(self, client):
        self.client = client

    async def analyze_dashboard(
        self,
        screenshot: ImageInput,
        context: str = ""
    ) -> dict:
        """Analyze a Power BI or similar dashboard."""

        prompt = f"""Analyze this dashboard screenshot.

Context: {context if context else "Business intelligence dashboard"}

Please provide:
1. Key metrics shown and their current values
2. Trends visible in any charts
3. Any anomalies or areas of concern
4. Actionable insights based on the data

Format your response as structured analysis."""

        message = (VisionMessage()
            .add_text(prompt)
            .add_image(screenshot)
            .build())

        response = await self.client.chat_completion(
            model="gpt-4-vision-preview",
            messages=[message]
        )

        return {
            "analysis": response.content,
            "source": "dashboard_screenshot"
        }

    async def extract_chart_data(self, chart_image: ImageInput) -> dict:
        """Extract approximate data from a chart."""

        prompt = """Extract the data from this chart as accurately as possible.

Return the data in this JSON format:
{
    "chart_type": "bar|line|pie|etc",
    "title": "chart title if visible",
    "x_axis": "label",
    "y_axis": "label",
    "data_points": [
        {"label": "x value", "value": number},
        ...
    ],
    "notes": "any relevant observations"
}

Be as accurate as possible with the numbers."""

        message = (VisionMessage()
            .add_text(prompt)
            .add_image(chart_image)
            .build())

        response = await self.client.chat_completion(
            model="gpt-4-vision-preview",
            messages=[message],
            temperature=0.1
        )

        import json
        try:
            return json.loads(response.content)
        except json.JSONDecodeError:
            # Model may wrap JSON in prose or code fences; return raw text
            return {"raw": response.content}

Use Case: Error Screenshot Analysis

class ErrorAnalyzer:
    """Analyze error screenshots for debugging."""

    def __init__(self, client):
        self.client = client

    async def diagnose_error(
        self,
        screenshot: ImageInput,
        application_context: str = ""
    ) -> dict:
        """Analyze an error screenshot and suggest solutions."""

        prompt = f"""Analyze this error screenshot.

Application context: {application_context}

Please provide:
1. Error identification - what error is shown
2. Likely cause - based on the error message and any visible context
3. Suggested solutions - step by step remediation
4. Related issues - other things to check

If this is an Azure-related error, include relevant Azure documentation links."""

        message = (VisionMessage()
            .add_text(prompt)
            .add_image(screenshot)
            .build())

        response = await self.client.chat_completion(
            model="gpt-4-vision-preview",
            messages=[message]
        )

        return {
            "diagnosis": response.content,
            "image_analyzed": True
        }

Use Case: Document Processing

class DocumentProcessor:
    """Process scanned documents with vision."""

    def __init__(self, client):
        self.client = client

    async def extract_table(self, document_image: ImageInput) -> str:
        """Extract table data from a document image as a JSON string."""

        prompt = """Extract all table data from this document image.

Return as JSON:
{
    "tables": [
        {
            "headers": ["col1", "col2", ...],
            "rows": [
                ["val1", "val2", ...],
                ...
            ]
        }
    ]
}

Preserve the structure and all values as accurately as possible."""

        message = (VisionMessage()
            .add_text(prompt)
            .add_image(document_image)
            .build())

        response = await self.client.chat_completion(
            model="gpt-4-vision-preview",
            messages=[message],
            temperature=0
        )

        return response.content

    async def analyze_form(self, form_image: ImageInput) -> str:
        """Extract field values from a form as a JSON string."""

        prompt = """Extract all filled fields from this form.

Return as JSON:
{
    "form_type": "detected form type",
    "fields": {
        "field_name": "field_value",
        ...
    },
    "signatures": true/false,
    "completeness": "complete|partial|empty"
}"""

        message = (VisionMessage()
            .add_text(prompt)
            .add_image(form_image)
            .build())

        response = await self.client.chat_completion(
            model="gpt-4-vision-preview",
            messages=[message],
            temperature=0
        )

        return response.content
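
Putting the pieces together, a scanned form could flow through like this (the file name and client are illustrative; run inside an async context):

processor = DocumentProcessor(client)
form_image = ImageInput.from_file("scanned_invoice.png", detail="high")

raw = await processor.analyze_form(form_image)
fields = parse_model_json(raw)  # tolerates fenced or prose-wrapped JSON
print(fields.get("completeness", "unknown"))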

Cost Considerations

Vision API pricing is expected to be higher than text-only:

  • Image processing adds significant compute cost
  • detail: "high" costs more than detail: "low"
  • Multiple images multiply costs

The rough estimator below uses assumed prices; update it once official pricing lands:

def estimate_vision_cost(
    images: list[ImageInput],
    text_tokens: int = 500
) -> float:
    """Estimate cost for vision request."""
    # Estimated pricing (will update when official)
    base_cost = text_tokens * 0.03 / 1000  # GPT-4 text cost

    image_cost = 0
    for img in images:
        if img.detail == "low":
            image_cost += 0.01  # Estimated
        else:
            image_cost += 0.05  # Estimated for high detail

    return base_cost + image_cost
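
Under these assumed rates, a request with one high-detail diagram, one low-detail thumbnail, and roughly 500 prompt tokens lands around $0.075:

images = [
    ImageInput.from_url("https://example.com/diagram.png", detail="high"),
    ImageInput.from_url("https://example.com/thumb.png", detail="low"),
]
print(f"${estimate_vision_cost(images):.3f}")  # $0.075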

Preparing Your Pipeline

from typing import Optional

class VisionPipeline:
    """Production-ready vision processing pipeline."""

    def __init__(self, config):
        self.config = config
        self.fallback_to_text = True

    async def process(
        self,
        image: ImageInput,
        task: str,
        fallback_text: Optional[str] = None
    ) -> dict:
        """Process image with fallback."""
        try:
            # Try vision API
            result = await self._process_vision(image, task)
            return {"success": True, "result": result, "method": "vision"}

        except Exception as e:
            if self.fallback_to_text and fallback_text:
                # Fall back to text description
                result = await self._process_text(fallback_text, task)
                return {"success": True, "result": result, "method": "text_fallback"}

            return {"success": False, "error": str(e)}

    async def _process_vision(self, image: ImageInput, task: str):
        # Placeholder: call the vision model here once access is granted.
        # Raising (rather than pass) keeps the fallback path exercised.
        raise NotImplementedError("Vision API access pending")

    async def _process_text(self, text: str, task: str):
        # Placeholder: call the text-only model with the fallback description
        raise NotImplementedError("Text fallback call not wired up yet")
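
A hypothetical invocation, inside an async context, once the two placeholder methods are wired up:

pipeline = VisionPipeline(config={})
result = await pipeline.process(
    image=ImageInput.from_file("dashboard.png"),
    task="summarize key metrics",
    fallback_text="Sales dashboard screenshot showing revenue and churn KPIs."
)
print(result["method"])  # "vision", or "text_fallback" if the vision call fails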

GPT-4 Vision opens new categories of applications. Start planning your use cases now so you’re ready when access becomes available.

Michael John Pena

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.