
Vision-Language Models: Understanding Images with AI

Vision-language models combine visual understanding with language generation: a single model can describe an image, read the text in it, compare pictures, and answer questions about what it sees. Let's explore practical applications with Azure OpenAI's GPT-4o.

Vision-Language Applications

from openai import AsyncAzureOpenAI
import base64
import json

class VisionLanguageAgent:
    def __init__(self, openai_client: AsyncAzureOpenAI):
        self.openai = openai_client

    async def describe_image(self, image_bytes: bytes, detail_level: str = "high") -> str:
        """Generate detailed image description."""
        image_b64 = base64.b64encode(image_bytes).decode()

        response = await self.openai.chat.completions.create(
            model="gpt-4o",  # the Azure OpenAI deployment name of a vision-capable model
            messages=[{
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Describe this image in detail, including objects, people, text, colors, and composition."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{image_b64}",
                            "detail": detail_level
                        }
                    }
                ]
            }]
        )
        return response.choices[0].message.content

    async def extract_text_from_image(self, image_bytes: bytes) -> dict:
        """OCR with understanding - extract and structure text."""
        image_b64 = base64.b64encode(image_bytes).decode()

        response = await self.openai.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": """Extract all text from this image.
                        Return JSON with:
                        - raw_text: all text found
                        - structured: organized by sections/categories
                        - handwritten: boolean if handwriting detected"""
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{image_b64}"}
                    }
                ]
            }],
            response_format={"type": "json_object"}
        )
        return json.loads(response.choices[0].message.content)

    async def compare_images(self, image1: bytes, image2: bytes, comparison_type: str) -> str:
        """Compare two images and describe similarities and differences."""
        img1_b64 = base64.b64encode(image1).decode()
        img2_b64 = base64.b64encode(image2).decode()

        response = await self.openai.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": f"Compare these two images for {comparison_type}. List similarities and differences."},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img1_b64}"}},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img2_b64}"}}
                ]
            }]
        )
        return response.choices[0].message.content

    async def visual_qa(self, image_bytes: bytes, questions: list[str]) -> list[dict]:
        """Answer multiple questions about an image."""
        image_b64 = base64.b64encode(image_bytes).decode()
        results = []

        for question in questions:
            response = await self.openai.chat.completions.create(
                model="gpt-4o",
                messages=[{
                    "role": "user",
                    "content": [
                        {"type": "text", "text": question},
                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}}
                    ]
                }]
            )
            results.append({"question": question, "answer": response.choices[0].message.content})

        return results
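To wire this up, you need an async Azure OpenAI client pointed at a GPT-4o deployment. The sketch below is a minimal, hypothetical example: the environment variable names, API version, and invoice.png path are placeholders for your own setup.

import asyncio
import os

async def main():
    # Placeholder configuration: point these at your own Azure OpenAI resource.
    client = AsyncAzureOpenAI(
        azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
        api_key=os.environ["AZURE_OPENAI_API_KEY"],
        api_version="2024-06-01",
    )
    agent = VisionLanguageAgent(client)

    # Any PNG/JPEG bytes work; "invoice.png" is just an example file.
    with open("invoice.png", "rb") as f:
        image_bytes = f.read()

    print(await agent.describe_image(image_bytes))

    extracted = await agent.extract_text_from_image(image_bytes)
    print(extracted["raw_text"])  # keys follow the JSON schema requested in the prompt

asyncio.run(main())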

Vision-language models enable applications ranging from document understanding to visual inspection.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.