
Vision-Language Models in Production: Practical Implementation Guide

Vision-language models (VLMs) like GPT-4o can understand images and text together. Deploying them in production requires careful consideration of latency, cost, and reliability. Here’s a practical guide.

Production Architecture

from fastapi import FastAPI, UploadFile, HTTPException
from azure.ai.foundry import AIFoundryClient
from azure.identity import DefaultAzureCredential
import asyncio
import base64
import hashlib
import json
from io import BytesIO
from PIL import Image

app = FastAPI()

class VLMService:
    def __init__(self):
        self.ai_client = AIFoundryClient(
            project="vlm-production",
            credential=DefaultAzureCredential()
        )
        # ResponseCache and RateLimiter are application helpers,
        # not SDK classes (a minimal sketch of both follows the endpoint below)
        self.cache = ResponseCache()
        self.rate_limiter = RateLimiter(rpm=100, tpm=100000)

    async def analyze_image(
        self,
        image_data: bytes,
        prompt: str,
        detail: str = "auto"
    ) -> dict:
        # Check cache
        cache_key = self._cache_key(image_data, prompt)
        cached = await self.cache.get(cache_key)
        if cached:
            return cached

        # Rate limiting
        await self.rate_limiter.acquire()

        # Optimize image
        optimized = self._optimize_image(image_data, detail)

        # Call VLM
        response = await self.ai_client.chat.complete_async(
            deployment="gpt-4o",
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{optimized}",
                            "detail": detail
                        }
                    }
                ]
            }],
            max_tokens=1000
        )

        result = {
            "analysis": response.choices[0].message.content,
            "tokens_used": response.usage.total_tokens
        }

        # Cache result
        await self.cache.set(cache_key, result)

        return result

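    def _cache_key(self, image_data: bytes, prompt: str) -> str:
        # Assumed helper: derive a stable key from the image bytes and prompt
        digest = hashlib.sha256(image_data + prompt.encode()).hexdigest()
        return f"vlm:{digest}"
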
    def _optimize_image(self, image_data: bytes, detail: str) -> str:
        """Optimize image for VLM processing."""
        img = Image.open(BytesIO(image_data))

        # Resize based on detail level
        if detail == "low":
            max_size = 512
        elif detail == "high":
            max_size = 2048
        else:
            # Auto: a sensible middle ground for most images
            max_size = 1024

        if max(img.size) > max_size:
            ratio = max_size / max(img.size)
            new_size = (int(img.size[0] * ratio), int(img.size[1] * ratio))
            img = img.resize(new_size, Image.Resampling.LANCZOS)

        # Convert to RGB if needed
        if img.mode != "RGB":
            img = img.convert("RGB")

        # Compress
        buffer = BytesIO()
        img.save(buffer, format="JPEG", quality=85, optimize=True)

        return base64.b64encode(buffer.getvalue()).decode()

vlm_service = VLMService()

@app.post("/analyze")
async def analyze_image(
    file: UploadFile,
    prompt: str = "Describe this image"
):
    if file.content_type not in ["image/jpeg", "image/png", "image/webp"]:
        raise HTTPException(400, "Unsupported image format")

    image_data = await file.read()

    if len(image_data) > 20 * 1024 * 1024:  # 20MB limit
        raise HTTPException(400, "Image too large")

    result = await vlm_service.analyze_image(image_data, prompt)
    return result
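
The ResponseCache and RateLimiter used above are application helpers, not SDK classes. Here is a minimal in-process sketch of both; a production deployment would more likely back the cache with Redis and use a proper token-bucket limiter:

import time

class ResponseCache:
    """In-memory cache with TTL (sketch; swap for Redis in production)."""

    def __init__(self, ttl_seconds: int = 3600):
        self._store: dict[str, tuple[float, dict]] = {}
        self._ttl = ttl_seconds

    async def get(self, key: str) -> dict | None:
        entry = self._store.get(key)
        if entry is None:
            return None
        expires_at, value = entry
        if time.monotonic() > expires_at:
            del self._store[key]
            return None
        return value

    async def set(self, key: str, value: dict):
        self._store[key] = (time.monotonic() + self._ttl, value)

class RateLimiter:
    """Naive requests-per-minute limiter (token accounting omitted)."""

    def __init__(self, rpm: int, tpm: int):
        self.rpm = rpm
        self.tpm = tpm  # accepted for interface parity; not enforced here
        self._timestamps: list[float] = []
        self._lock = asyncio.Lock()

    async def acquire(self):
        async with self._lock:
            now = time.monotonic()
            # Keep only requests from the last 60 seconds
            self._timestamps = [t for t in self._timestamps if now - t < 60]
            if len(self._timestamps) >= self.rpm:
                # Wait until the oldest request ages out of the window
                await asyncio.sleep(60 - (now - self._timestamps[0]))
            self._timestamps.append(time.monotonic())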

Batch Processing

class VLMBatchProcessor:
    """Process multiple images efficiently."""

    def __init__(self, vlm_service: VLMService, max_concurrent: int = 10):
        self.vlm_service = vlm_service
        self.semaphore = asyncio.Semaphore(max_concurrent)

    async def process_batch(
        self,
        images: list[dict],
        prompt: str
    ) -> list[dict]:
        """Process batch of images concurrently."""

        async def process_one(image: dict):
            async with self.semaphore:
                try:
                    result = await self.vlm_service.analyze_image(
                        image["data"],
                        prompt
                    )
                    return {
                        "id": image["id"],
                        "success": True,
                        **result
                    }
                except Exception as e:
                    return {
                        "id": image["id"],
                        "success": False,
                        "error": str(e)
                    }

        tasks = [process_one(img) for img in images]
        return await asyncio.gather(*tasks)

    async def process_directory(
        self,
        directory: str,
        prompt: str,
        output_path: str
    ):
        """Process all images in a directory."""
        import os

        images = []
        for filename in os.listdir(directory):
            if filename.lower().endswith(('.jpg', '.jpeg', '.png')):
                path = os.path.join(directory, filename)
                with open(path, 'rb') as f:
                    images.append({
                        "id": filename,
                        "data": f.read()
                    })

        results = await self.process_batch(images, prompt)

        # Save results
        with open(output_path, 'w') as f:
            json.dump(results, f, indent=2)

        return results
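
To wire it up (a sketch; the directory and output path are placeholders):

async def main():
    processor = VLMBatchProcessor(vlm_service, max_concurrent=10)
    await processor.process_directory(
        directory="./images",
        prompt="Describe this image in one sentence",
        output_path="results.json"
    )

asyncio.run(main())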

Structured Output Extraction

class VLMExtractor:
    """Extract structured data from images."""

    def __init__(self, vlm_service: VLMService):
        self.vlm_service = vlm_service

    async def extract_invoice_data(self, image_data: bytes) -> dict:
        """Extract structured data from invoice image."""

        prompt = """Extract all data from this invoice image.
        Return valid JSON with this structure:
        {
            "vendor": {"name": "", "address": "", "tax_id": ""},
            "customer": {"name": "", "address": ""},
            "invoice_number": "",
            "invoice_date": "",
            "due_date": "",
            "line_items": [
                {"description": "", "quantity": 0, "unit_price": 0, "total": 0}
            ],
            "subtotal": 0,
            "tax": 0,
            "total": 0,
            "currency": ""
        }
        Return ONLY the JSON, no other text."""

        result = await self.vlm_service.analyze_image(
            image_data, prompt, detail="high"
        )

        # Parse JSON from response
        try:
            return json.loads(result["analysis"])
        except json.JSONDecodeError:
            # Try to extract JSON from response
            import re
            json_match = re.search(r'\{.*\}', result["analysis"], re.DOTALL)
            if json_match:
                return json.loads(json_match.group())
            raise ValueError("Could not parse structured data from response")

    async def extract_table_data(self, image_data: bytes) -> list[dict]:
        """Extract table data from image."""

        prompt = """Extract the table data from this image.
        Return as JSON array where each row is an object with column headers as keys.
        Example: [{"Column1": "value1", "Column2": "value2"}, ...]
        Return ONLY the JSON array."""

        result = await self.vlm_service.analyze_image(
            image_data, prompt, detail="high"
        )

        return json.loads(result["analysis"])

    async def extract_with_schema(
        self,
        image_data: bytes,
        schema: dict
    ) -> dict:
        """Extract data matching a specific schema."""

        prompt = f"""Extract data from this image matching this schema:
        {json.dumps(schema, indent=2)}

        Return valid JSON matching the schema exactly.
        Use null for missing values.
        Return ONLY the JSON."""

        result = await self.vlm_service.analyze_image(
            image_data, prompt, detail="high"
        )

        return json.loads(result["analysis"])
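
Parsed JSON is only half the battle: validating it catches missing or hallucinated fields before they reach downstream systems. A sketch using Pydantic v2 (the models here are assumptions shaped after the invoice prompt above):

from pydantic import BaseModel, ValidationError

class LineItem(BaseModel):
    description: str
    quantity: float
    unit_price: float
    total: float

class Invoice(BaseModel):
    invoice_number: str
    invoice_date: str
    line_items: list[LineItem]
    subtotal: float
    tax: float
    total: float
    currency: str

def validate_invoice(raw: dict) -> Invoice:
    """Raise a readable error if the VLM output doesn't match the schema."""
    try:
        return Invoice.model_validate(raw)
    except ValidationError as e:
        raise ValueError(f"Invoice extraction failed validation: {e}") from e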

Multi-Image Analysis

class MultiImageAnalyzer:
    """Analyze multiple images together."""

    def __init__(self, ai_client: AIFoundryClient):
        self.ai_client = ai_client

    async def compare_images(
        self,
        images: list[bytes],
        comparison_prompt: str
    ) -> dict:
        """Compare multiple images."""

        content = [{"type": "text", "text": comparison_prompt}]

        for i, image_data in enumerate(images):
            # _optimize is assumed to mirror VLMService._optimize_image
            optimized = self._optimize(image_data)
            content.append({
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{optimized}"}
            })
            content.append({
                "type": "text",
                "text": f"[Image {i+1}]"
            })

        response = await self.ai_client.chat.complete_async(
            deployment="gpt-4o",
            messages=[{"role": "user", "content": content}],
            max_tokens=2000
        )

        return {"comparison": response.choices[0].message.content}

    async def analyze_image_sequence(
        self,
        images: list[bytes],
        context: str
    ) -> dict:
        """Analyze a sequence of related images."""

        # Process each image
        analyses = []
        for img in images:
            # _analyze_single is assumed to run a single-image VLM call
            result = await self._analyze_single(img)
            analyses.append(result)

        # Synthesize
        synthesis_prompt = f"""Based on these {len(images)} image analyses:
        {json.dumps(analyses)}

        Context: {context}

        Provide a comprehensive synthesis identifying:
        1. Common themes
        2. Progression/changes
        3. Key insights
        4. Recommendations"""

        response = await self.ai_client.chat.complete_async(
            deployment="gpt-4o",
            messages=[{"role": "user", "content": synthesis_prompt}]
        )

        return {
            "individual_analyses": analyses,
            "synthesis": response.choices[0].message.content
        }
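
Usage is straightforward (a sketch; the file names are placeholders):

analyzer = MultiImageAnalyzer(vlm_service.ai_client)

async def compare():
    with open("before.jpg", "rb") as f1, open("after.jpg", "rb") as f2:
        return await analyzer.compare_images(
            [f1.read(), f2.read()],
            "What changed between these two photos? Be specific."
        )

result = asyncio.run(compare())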

Error Handling and Fallbacks

class RobustVLMService:
    """VLM service with comprehensive error handling."""

    def __init__(self, ai_client):
        self.ai_client = ai_client
        self.fallback_client = None  # Secondary provider

    async def analyze_with_retry(
        self,
        image_data: bytes,
        prompt: str,
        max_retries: int = 3
    ) -> dict:
        """Analyze with retry logic."""

        last_error = None

        for attempt in range(max_retries):
            try:
                return await self._analyze(image_data, prompt)
            except RateLimitError as e:
                last_error = e
                await asyncio.sleep(2 ** attempt)  # exponential backoff
            except ContentFilterError:
                return {
                    "error": "content_filtered",
                    "message": "Image was filtered by content policy"
                }
            except Exception as e:
                last_error = e
                if attempt < max_retries - 1:
                    await asyncio.sleep(1)

        # Try fallback provider before giving up
        if self.fallback_client:
            try:
                return await self._analyze_fallback(image_data, prompt)
            except Exception:
                pass

        raise last_error

    async def _analyze(self, image_data: bytes, prompt: str) -> dict:
        """Primary analysis (wire to your main provider here)."""
        raise NotImplementedError

    async def _analyze_fallback(self, image_data: bytes, prompt: str) -> dict:
        """Fallback to an alternative provider."""
        raise NotImplementedError

Cost Optimization

def select_detail_level(image_data: bytes, task: str) -> str:
    """Select optimal detail level based on task and image."""

    # Inspect image dimensions (a proxy for required detail)
    img = Image.open(BytesIO(image_data))
    width, height = img.size

    # Task-based selection
    high_detail_tasks = ["ocr", "table_extraction", "detailed_analysis"]
    low_detail_tasks = ["classification", "object_detection", "quick_summary"]

    if task in low_detail_tasks:
        return "low"
    if task in high_detail_tasks:
        return "high"

    # Size-based for auto tasks
    if width < 768 and height < 768:
        return "low"
    if width > 2048 or height > 2048:
        return "high"

    return "auto"

# Cost comparison
# Low detail: ~85 tokens per image
# High detail: ~765 tokens for 1024x1024, more for larger
# Auto: System decides based on image
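
As a rough sketch of the tile-based math behind those numbers (this follows the published formula for GPT-4o-class models; billing varies by provider, so treat it as an estimate):

import math

def estimate_image_tokens(width: int, height: int, detail: str = "high") -> int:
    """Rough token estimate for one image using GPT-4o-style tile math."""
    if detail == "low":
        return 85  # flat cost regardless of size

    # High detail: fit within 2048x2048, then scale the shortest side to 768
    scale = min(1.0, 2048 / max(width, height))
    width, height = width * scale, height * scale
    scale = 768 / min(width, height)
    if scale < 1.0:
        width, height = width * scale, height * scale

    # 85 base tokens plus 170 tokens per 512px tile
    tiles = math.ceil(width / 512) * math.ceil(height / 512)
    return 85 + 170 * tiles

# estimate_image_tokens(1024, 1024) -> 765, matching the figure above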

Best Practices

  1. Optimize images: Resize and compress before sending
  2. Choose detail wisely: Low detail for simple tasks
  3. Cache responses: VLM calls are expensive
  4. Batch when possible: Amortize latency
  5. Structured prompts: Get consistent output formats
  6. Handle failures: Images may be filtered or rejected

VLMs unlock powerful visual understanding capabilities. Deploy them thoughtfully with proper optimization and error handling.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.