Skip to content
Back to Blog
1 min read

Multimodal RAG: Retrieval Across Text, Images, and Documents

I wrote “Multimodal RAG: Retrieval Across Text, Images, and Documents” to share practical, production-minded guidance on this topic.

Multimodal RAG Pipeline

import asyncio
import base64
from typing import Union

from azure.ai.openai import AzureOpenAI
from azure.ai.documentintelligence import DocumentIntelligenceClient

class MultimodalRAG:
    """Retrieval-augmented generation over text, tables, and figures.

    Documents are split into ``{"type", "content", "page"}`` chunks, every
    chunk is embedded through a text embedding of its (possibly generated)
    textual content, and questions are answered from the nearest chunks.

    This listing calls several collaborators it does not define:
    ``VectorStore``, ``process_image``, ``process_text``,
    ``table_to_markdown``, ``get_text_embedding``, ``summarize_table`` and
    ``build_context``. They must be supplied elsewhere (module, subclass or
    mixin) before the class is usable.

    NOTE(review): ``self.openai.chat.completions.create`` is awaited, so the
    client passed in presumably has to be an async OpenAI client rather than
    the synchronous one named in the annotation -- confirm against the caller.
    """

    # File types (lower-case, no leading dot) routed to image processing.
    _IMAGE_TYPES = frozenset({"png", "jpg", "jpeg"})

    def __init__(self, openai_client: AzureOpenAI, doc_client: DocumentIntelligenceClient):
        """Store the service clients and create the backing vector store.

        Args:
            openai_client: Client used for chat completions.
            doc_client: Document Intelligence client used for PDF layout
                extraction.
        """
        self.openai = openai_client
        self.doc_intelligence = doc_client
        self.vector_store = VectorStore()

    @staticmethod
    def _first_page(element) -> Union[int, None]:
        """Return the page number of the element's first bounding region.

        Returns ``None`` when the element carries no bounding regions, so a
        single region-less element cannot abort processing of the document.
        """
        regions = getattr(element, "bounding_regions", None)
        return regions[0].page_number if regions else None

    async def process_document(self, file_bytes: bytes, file_type: str) -> list[dict]:
        """Process a document and extract its multimodal content.

        Args:
            file_bytes: Raw bytes of the uploaded file.
            file_type: File type or extension. Matching is case-insensitive
                and tolerates surrounding whitespace and a leading dot, so
                ``"PDF"`` and ``".pdf"`` are both treated as ``"pdf"``.

        Returns:
            The list of chunks produced by the type-specific processor.

        Raises:
            UnicodeDecodeError: If the file falls through to the text branch
                and its bytes are not valid UTF-8.
        """
        # Normalise first: otherwise "PDF" or ".jpg" would fall through to the
        # text branch and binary data would be decoded as UTF-8.
        kind = file_type.strip().lower().lstrip(".")

        if kind == "pdf":
            return await self.process_pdf(file_bytes)
        if kind in self._IMAGE_TYPES:
            return await self.process_image(file_bytes)
        return await self.process_text(file_bytes.decode())

    async def process_pdf(self, pdf_bytes: bytes) -> list[dict]:
        """Extract text, tables, and figures from a PDF.

        Args:
            pdf_bytes: Raw bytes of the PDF.

        Returns:
            Chunks in the order paragraphs, tables, figures. Each chunk has
            ``"type"`` (``"text"``, ``"table"`` or ``"image"``), ``"content"``
            and ``"page"`` (``None`` when the element has no bounding region).
        """

        def _analyze():
            # Submitting the job and waiting on the poller are both blocking
            # calls, so they are kept together and run off the event loop.
            return self.doc_intelligence.begin_analyze_document(
                "prebuilt-layout",
                pdf_bytes,
                output_content_format="markdown",
            ).result()

        result = await asyncio.to_thread(_analyze)

        # Any of the collections may be None when the document has no such
        # elements; treat that the same as an empty list.
        chunks = [
            {
                "type": "text",
                "content": paragraph.content,
                "page": self._first_page(paragraph),
            }
            for paragraph in result.paragraphs or []
        ]

        for table in result.tables or []:
            chunks.append({
                "type": "table",
                "content": self.table_to_markdown(table),
                "page": self._first_page(table),
            })

        for figure in result.figures or []:
            # Figures without elements are skipped.
            if not figure.elements:
                continue
            chunks.append({
                "type": "image",
                "content": await self.describe_figure(figure),
                "page": self._first_page(figure),
            })

        return chunks

    async def describe_figure(self, figure) -> str:
        """Generate a textual description of an extracted figure.

        Args:
            figure: Figure object whose ``data`` attribute holds the image
                bytes.

        Returns:
            The model's description, or ``""`` if the model returned no
            content.
        """
        # NOTE(review): this relies on ``figure.data`` holding the raw image
        # bytes; confirm the figure objects produced by the layout analysis
        # actually expose that attribute (the image may need to be fetched or
        # cropped separately).
        image_b64 = base64.b64encode(figure.data).decode()

        response = await self.openai.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe this figure in detail, including any data or relationships shown."},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}}
                ]
            }]
        )
        # content can be None; the description is embedded later as text.
        return response.choices[0].message.content or ""

    async def embed_multimodal(self, chunks: list[dict]) -> list[dict]:
        """Attach an ``"embedding"`` to every chunk.

        Tables are embedded via a summary of their content; every other chunk
        type (text, image description) is embedded from its content directly.

        Args:
            chunks: Chunks with at least ``"type"`` and ``"content"`` keys.

        Returns:
            The same list, with each chunk dict mutated in place.
        """
        for chunk in chunks:
            text = chunk["content"]
            if chunk["type"] == "table":
                text = await self.summarize_table(text)
            chunk["embedding"] = await self.get_text_embedding(text)

        return chunks

    async def query(self, question: str, include_images: bool = True) -> str:
        """Answer a question from the multimodal knowledge base.

        Args:
            question: Natural-language question.
            include_images: Forwarded to ``build_context``.

        Returns:
            The model's answer, or ``""`` if the model returned no content.
        """
        query_embedding = await self.get_text_embedding(question)
        results = self.vector_store.search(query_embedding, top_k=10)

        # Build multimodal context
        context = self.build_context(results, include_images)

        response = await self.openai.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "Answer based on the provided context."},
                {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}"}
            ]
        )
        return response.choices[0].message.content or ""

Multimodal RAG unlocks intelligence from documents containing text, tables, and images.

## Takeaways

Add a concise, personal takeaway and recommended next steps here.

Michael John Peña

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.