
Multimodal RAG: Retrieval Across Text, Images, and Documents

Multimodal RAG extends retrieval to images, tables, and complex documents. Here’s how to build it.

Multimodal RAG Pipeline
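
The pipeline below leans on Azure AI Document Intelligence to extract text, tables, and figures from source files, uses GPT-4o to turn figures into searchable descriptions, embeds everything into one vector store, and answers questions over the combined index.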

from openai import AsyncAzureOpenAI
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
import base64

class MultimodalRAG:
    def __init__(self, openai_client: AsyncAzureOpenAI, doc_client: DocumentIntelligenceClient):
        self.openai = openai_client
        self.doc_intelligence = doc_client
        self.vector_store = VectorStore()  # simple in-memory store, sketched below

    async def process_document(self, file_bytes: bytes, file_type: str) -> list[dict]:
        """Process document and extract multimodal content."""
        if file_type == "pdf":
            return await self.process_pdf(file_bytes)
        elif file_type in ["png", "jpg", "jpeg"]:
            return await self.process_image(file_bytes)
        else:
            return await self.process_text(file_bytes.decode())
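
    # Sketches of the image and plain-text branches referenced above; the chunk
    # size and page handling are simplifications, not a recommendation.
    async def process_image(self, image_bytes: bytes) -> list[dict]:
        """Describe a standalone image so it can be indexed alongside text."""
        description = await self.describe_figure(image_bytes)
        return [{"type": "image", "content": description, "page": 1}]

    async def process_text(self, text: str) -> list[dict]:
        """Naive fixed-size chunking for plain text (no overlap)."""
        size = 1000
        return [
            {"type": "text", "content": text[i:i + size], "page": None}
            for i in range(0, len(text), size)
        ]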

    async def process_pdf(self, pdf_bytes: bytes) -> list[dict]:
        """Extract text, tables, and images from PDF."""
        # Use Document Intelligence for extraction
        result = self.doc_intelligence.begin_analyze_document(
            "prebuilt-layout",
            AnalyzeDocumentRequest(bytes_source=pdf_bytes),
            output_content_format="markdown"
        ).result()

        chunks = []

        # Process text content
        for paragraph in result.paragraphs:
            chunks.append({
                "type": "text",
                "content": paragraph.content,
                "page": paragraph.bounding_regions[0].page_number
            })

        # Process tables
        for table in result.tables:
            table_md = self.table_to_markdown(table)
            chunks.append({
                "type": "table",
                "content": table_md,
                "page": table.bounding_regions[0].page_number
            })

        # Process figures/images
        # The layout result only reports each figure's bounding region, not its
        # pixels, so crop the image out of the source PDF before describing it.
        for figure in result.figures or []:
            if figure.bounding_regions:
                image_bytes = self.extract_figure_image(pdf_bytes, figure)
                description = await self.describe_figure(image_bytes)
                chunks.append({
                    "type": "image",
                    "content": description,
                    "page": figure.bounding_regions[0].page_number
                })

        return chunks
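
    # Sketch of the figure-cropping helper used above; assumes PyMuPDF (fitz)
    # is installed for rendering the page region to an image.
    def extract_figure_image(self, pdf_bytes: bytes, figure) -> bytes:
        """Crop a figure's bounding region out of the source PDF as PNG bytes."""
        import fitz  # PyMuPDF

        region = figure.bounding_regions[0]
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        page = doc[region.page_number - 1]

        # Document Intelligence reports PDF coordinates in inches;
        # PyMuPDF works in points (72 points per inch).
        xs = region.polygon[0::2]
        ys = region.polygon[1::2]
        clip = fitz.Rect(min(xs) * 72, min(ys) * 72, max(xs) * 72, max(ys) * 72)

        return page.get_pixmap(clip=clip, dpi=150).tobytes("png")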

    async def describe_figure(self, image_bytes: bytes) -> str:
        """Generate a description for an extracted figure image."""
        image_b64 = base64.b64encode(image_bytes).decode()

        response = await self.openai.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe this figure in detail, including any data or relationships shown."},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}}
                ]
            }]
        )
        return response.choices[0].message.content
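
    # Sketch of the table-to-markdown helper used in process_pdf; it relies on
    # the layout model's cell indices and flattens spanning cells naively.
    def table_to_markdown(self, table) -> str:
        """Render an extracted table as a markdown grid."""
        rows = [[""] * table.column_count for _ in range(table.row_count)]
        for cell in table.cells:
            rows[cell.row_index][cell.column_index] = (cell.content or "").replace("\n", " ")

        if not rows:
            return ""
        header, *body = rows
        lines = ["| " + " | ".join(header) + " |",
                 "| " + " | ".join("---" for _ in header) + " |"]
        lines += ["| " + " | ".join(r) + " |" for r in body]
        return "\n".join(lines)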

    async def embed_multimodal(self, chunks: list[dict]) -> list[dict]:
        """Generate embeddings for multimodal content."""
        for chunk in chunks:
            if chunk["type"] == "text":
                embedding = await self.get_text_embedding(chunk["content"])
            elif chunk["type"] == "table":
                # Embed table with summary
                summary = await self.summarize_table(chunk["content"])
                embedding = await self.get_text_embedding(summary)
            else:  # image
                embedding = await self.get_text_embedding(chunk["content"])

            chunk["embedding"] = embedding

        return chunks
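
    # Sketches of the embedding helpers referenced above; the deployment names
    # ("text-embedding-3-large", "gpt-4o") are assumptions, not requirements.
    async def get_text_embedding(self, text: str) -> list[float]:
        """Embed a piece of text with an Azure OpenAI embedding deployment."""
        response = await self.openai.embeddings.create(
            model="text-embedding-3-large",
            input=text
        )
        return response.data[0].embedding

    async def summarize_table(self, table_markdown: str) -> str:
        """Produce a short natural-language summary of a table for embedding."""
        response = await self.openai.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": "Summarize this table in a few sentences, naming its "
                           f"columns and key values:\n\n{table_markdown}"
            }]
        )
        return response.choices[0].message.content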

    async def query(self, question: str, include_images: bool = True) -> str:
        """Query multimodal knowledge base."""
        query_embedding = await self.get_text_embedding(question)
        results = self.vector_store.search(query_embedding, top_k=10)

        # Build multimodal context
        context = self.build_context(results, include_images)

        response = await self.openai.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "Answer based on the provided context."},
                {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}"}
            ]
        )
        return response.choices[0].message.content
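
    # Sketch of the context builder used by query(): a simple type/page header
    # followed by each chunk's content.
    def build_context(self, results: list[dict], include_images: bool) -> str:
        """Concatenate retrieved chunks into a single prompt context."""
        parts = []
        for chunk in results:
            if chunk["type"] == "image" and not include_images:
                continue
            parts.append(f"[{chunk['type']} | page {chunk.get('page', '?')}]\n{chunk['content']}")
        return "\n\n".join(parts)


# Minimal in-memory vector store so the class above runs end to end; swap in
# Azure AI Search or another vector database for anything beyond a demo.
class VectorStore:
    def __init__(self):
        self.chunks: list[dict] = []

    def add(self, chunks: list[dict]) -> None:
        self.chunks.extend(chunks)

    def search(self, query_embedding: list[float], top_k: int = 10) -> list[dict]:
        def cosine(a: list[float], b: list[float]) -> float:
            dot = sum(x * y for x, y in zip(a, b))
            norm = (sum(x * x for x in a) ** 0.5) * (sum(y * y for y in b) ** 0.5)
            return dot / norm if norm else 0.0

        ranked = sorted(self.chunks, key=lambda c: cosine(query_embedding, c["embedding"]), reverse=True)
        return ranked[:top_k]

Wiring it together looks roughly like this; the endpoints, keys, file name, and deployment names are placeholders for your own resources.

import asyncio
from openai import AsyncAzureOpenAI
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.core.credentials import AzureKeyCredential

async def main():
    openai_client = AsyncAzureOpenAI(
        azure_endpoint="https://<your-openai-resource>.openai.azure.com",
        api_key="<openai-key>",
        api_version="2024-06-01",
    )
    doc_client = DocumentIntelligenceClient(
        endpoint="https://<your-docintel-resource>.cognitiveservices.azure.com",
        credential=AzureKeyCredential("<docintel-key>"),
    )

    rag = MultimodalRAG(openai_client, doc_client)
    with open("report.pdf", "rb") as f:
        chunks = await rag.process_document(f.read(), "pdf")
    rag.vector_store.add(await rag.embed_multimodal(chunks))

    print(await rag.query("What trend does the revenue chart show?"))

asyncio.run(main())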

With tables summarized and figures described, multimodal RAG can answer questions that plain text retrieval would miss.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.