
Vision-Language Models: A Practical Guide for Data Applications

Vision-Language Models (VLMs) like GPT-4o and Gemini can understand images alongside text. For data professionals, this opens new possibilities: automated report analysis, dashboard interpretation, and document processing. Let’s explore practical applications.

Understanding Vision-Language Models

VLMs process images and text together, enabling:

  • Image description and analysis
  • Visual question answering
  • Document understanding
  • Chart and diagram interpretation
  • Multi-image reasoning

Basic Usage with GPT-4o

import base64
import os

from openai import AzureOpenAI

client = AzureOpenAI(
    api_key=os.environ["AZURE_OPENAI_KEY"],
    api_version="2024-06-01",
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"]
)

def encode_image(image_path: str) -> str:
    """Encode image to base64."""
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")

def analyze_image(image_path: str, prompt: str) -> str:
    """Analyze an image with a custom prompt."""
    base64_image = encode_image(image_path)

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{base64_image}",
                        "detail": "high"  # high, low, or auto
                    }
                }
            ]
        }],
        max_tokens=1000
    )

    return response.choices[0].message.content

# Example usage
result = analyze_image(
    "dashboard_screenshot.png",
    "Analyze this dashboard. What are the key metrics shown and any concerning trends?"
)

Dashboard and Report Analysis

import json

from openai import AsyncAzureOpenAI

class DashboardAnalyzer:
    def __init__(self, client: AsyncAzureOpenAI):
        self.client = client

    async def analyze_dashboard(self, image_path: str) -> dict:
        """Extract insights from a dashboard screenshot."""

        base64_image = encode_image(image_path)

        response = await self.client.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": """Analyze this dashboard and provide:
                        1. List of all metrics/KPIs visible with their values
                        2. Time period shown
                        3. Key trends (up, down, stable)
                        4. Any anomalies or concerning values
                        5. Recommended actions based on the data

                        Return as JSON:
                        {
                            "metrics": [{"name": "...", "value": "...", "trend": "up/down/stable"}],
                            "time_period": "...",
                            "key_insights": ["..."],
                            "anomalies": ["..."],
                            "recommendations": ["..."]
                        }"""
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{base64_image}"}
                    }
                ]
            }],
            response_format={"type": "json_object"}  # ensures valid JSON for json.loads
        )

        return json.loads(response.choices[0].message.content)

    async def compare_dashboards(self, image1_path: str, image2_path: str) -> dict:
        """Compare two dashboard snapshots."""

        base64_image1 = encode_image(image1_path)
        base64_image2 = encode_image(image2_path)

        response = await self.client.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": "Compare these two dashboard snapshots. What changed between them?"},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{base64_image1}"}
                    },
                    {"type": "text", "text": "First dashboard (before)"},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{base64_image2}"}
                    },
                    {"type": "text", "text": "Second dashboard (after). What are the key differences?"}
                ]
            }]
        )

        return {"comparison": response.choices[0].message.content}

Data Extraction from Charts

import pandas as pd

class ChartDataExtractor:
    def __init__(self, client: AsyncAzureOpenAI):
        self.client = client

    async def extract_data_from_chart(self, image_path: str) -> dict:
        """Extract numerical data from a chart image."""

        base64_image = encode_image(image_path)

        response = await self.client.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": """Extract the data from this chart as accurately as possible.

                        Return JSON with:
                        {
                            "chart_type": "bar/line/pie/scatter/etc",
                            "title": "chart title if visible",
                            "x_axis": {"label": "...", "values": [...]},
                            "y_axis": {"label": "...", "unit": "..."},
                            "series": [
                                {"name": "series name", "data": [{"x": ..., "y": ...}, ...]}
                            ],
                            "confidence": "high/medium/low",
                            "notes": "any caveats about the extraction"
                        }

                        Be as precise as possible with the values."""
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{base64_image}",
                            "detail": "high"
                        }
                    }
                ]
            }],
            response_format={"type": "json_object"}  # ensures valid JSON for json.loads
        )

        return json.loads(response.choices[0].message.content)

    async def chart_to_dataframe(self, image_path: str) -> pd.DataFrame:
        """Convert chart to pandas DataFrame."""

        data = await self.extract_data_from_chart(image_path)

        rows = []
        for series in data.get("series", []):
            for point in series.get("data", []):
                rows.append({
                    "series": series["name"],
                    "x": point["x"],
                    "y": point["y"]
                })

        return pd.DataFrame(rows)
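
Chart extraction is inherently approximate, so it is worth surfacing the model's self-reported confidence before trusting the numbers. A hedged usage sketch, with a placeholder chart path and the async client from the previous section:

import asyncio

async def demo():
    extractor = ChartDataExtractor(client)  # assumes an AsyncAzureOpenAI client
    data = await extractor.extract_data_from_chart("revenue_by_quarter.png")

    # Flag low-confidence extractions for manual review
    if data.get("confidence") != "high":
        print(f"Review suggested: {data.get('notes')}")

    # Build the DataFrame from the already-extracted data (avoids a second API call)
    df = pd.DataFrame([
        {"series": s["name"], "x": p["x"], "y": p["y"]}
        for s in data.get("series", [])
        for p in s.get("data", [])
    ])
    print(df.head())

asyncio.run(demo())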

Document Intelligence

class DocumentVisionAnalyzer:
    def __init__(self, client: AsyncAzureOpenAI):
        self.client = client

    async def analyze_invoice(self, image_path: str) -> dict:
        """Extract structured data from an invoice image."""

        base64_image = encode_image(image_path)

        response = await self.client.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": """Extract all information from this invoice:

                        Return JSON:
                        {
                            "vendor": {"name": "...", "address": "...", "tax_id": "..."},
                            "customer": {"name": "...", "address": "..."},
                            "invoice_number": "...",
                            "invoice_date": "...",
                            "due_date": "...",
                            "line_items": [
                                {"description": "...", "quantity": ..., "unit_price": ..., "total": ...}
                            ],
                            "subtotal": ...,
                            "tax": ...,
                            "total": ...,
                            "currency": "...",
                            "payment_terms": "..."
                        }"""
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{base64_image}"}
                    }
                ]
            }],
            response_format={"type": "json_object"}  # ensures valid JSON for json.loads
        )

        return json.loads(response.choices[0].message.content)

    async def analyze_technical_diagram(self, image_path: str) -> dict:
        """Analyze a technical or architectural diagram."""

        base64_image = encode_image(image_path)

        response = await self.client.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": """Analyze this technical diagram and extract:

                        1. What type of diagram is this?
                        2. All components/elements shown
                        3. Relationships/connections between components
                        4. Data flows if applicable
                        5. Technologies/products mentioned

                        Return structured JSON representing the diagram."""
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{base64_image}"}
                    }
                ]
            }],
            response_format={"type": "json_object"}  # ensures valid JSON for json.loads
        )

        return json.loads(response.choices[0].message.content)

Quality Control with Vision

class VisualQABot:
    def __init__(self, client: AsyncAzureOpenAI):
        self.client = client

    async def verify_report_output(
        self,
        report_image: str,
        expected_content: dict
    ) -> dict:
        """Verify a generated report matches expectations."""

        base64_image = encode_image(report_image)

        response = await self.client.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"""Verify this report matches the expected content:

                        Expected:
                        {json.dumps(expected_content, indent=2)}

                        Check the image and report:
                        1. Are all expected sections present?
                        2. Do the values match (approximately)?
                        3. Is the formatting correct?
                        4. Any missing or extra content?

                        Return JSON:
                        {{
                            "matches": true/false,
                            "discrepancies": ["..."],
                            "missing": ["..."],
                            "confidence": "high/medium/low"
                        }}"""
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{base64_image}"}
                    }
                ]
            }],
            response_format={"type": "json_object"}  # ensures valid JSON for json.loads
        )

        return json.loads(response.choices[0].message.content)

Best Practices

Image Preparation

import base64
import io

from PIL import Image

def optimize_image_for_vlm(image_path: str, max_size: int = 2048) -> str:
    """Optimize image for VLM processing."""

    img = Image.open(image_path)

    # Resize if too large
    if max(img.size) > max_size:
        ratio = max_size / max(img.size)
        new_size = (int(img.size[0] * ratio), int(img.size[1] * ratio))
        img = img.resize(new_size, Image.Resampling.LANCZOS)

    # Convert to RGB if necessary
    if img.mode != "RGB":
        img = img.convert("RGB")

    # Save to buffer
    buffer = io.BytesIO()
    img.save(buffer, format="PNG", optimize=True)

    return base64.b64encode(buffer.getvalue()).decode()
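
The returned string is already base64, so it can stand in for encode_image() in any of the earlier calls. A brief sketch reusing the client from the basic usage section; the path is a placeholder:

# Downscale first, then send
b64 = optimize_image_for_vlm("wide_report_page.png")

response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Summarize this report page."},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}}
        ]
    }],
    max_tokens=500
)
print(response.choices[0].message.content)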

Cost Optimization

def select_detail_level(image_path: str, task: str) -> str:
    """Select appropriate detail level for cost optimization."""

    # High detail: complex charts, small text, detailed diagrams
    # Low detail: simple images, overview analysis
    # Auto: let the model decide

    high_detail_tasks = [
        "extract_data",
        "read_text",
        "analyze_chart",
        "detailed_comparison"
    ]

    if task in high_detail_tasks:
        return "high"

    # Check image complexity
    img = Image.open(image_path)
    if img.size[0] > 1000 or img.size[1] > 1000:
        return "high"

    return "low"

Prompting Tips

  1. Be specific: Tell the model exactly what to look for
  2. Request structure: Ask for JSON output for parsing
  3. Set expectations: Mention confidence levels and caveats
  4. Use detail parameter: Control cost vs. accuracy tradeoff
  5. Batch when possible: Process multiple images in one call (see the sketch below)
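
A sketch of tip 5, assuming the encode_image() helper and client from the basic usage section; analyze_image_batch is a hypothetical helper name, not part of any SDK.

def analyze_image_batch(image_paths: list[str], prompt: str) -> str:
    """Send several images in one request to amortize per-call overhead."""
    content = [{"type": "text", "text": prompt}]
    for i, path in enumerate(image_paths, start=1):
        # Label each image so the model can reference them unambiguously
        content.append({"type": "text", "text": f"Image {i}:"})
        content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{encode_image(path)}"}
        })

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": content}],
        max_tokens=1000
    )
    return response.choices[0].message.content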

Vision-language models transform how we interact with visual data. Start with simple use cases like dashboard analysis and expand to more complex document processing workflows.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.