Skip to content
Back to Blog
1 min read

Multimodal RAG: Combining Text, Images, and Tables for Enhanced Search

I wrote “Multimodal RAG: Combining Text, Images, and Tables for Enhanced Search” to share practical, production-minded guidance on this topic.

Processing Multimodal Documents

Use Azure AI Document Intelligence to extract the structure of complex documents, then use GPT-4o to describe the tables and figures it finds:

import asyncio
import base64
import inspect

from azure.ai.documentintelligence import DocumentIntelligenceClient
from openai import AzureOpenAI

class MultimodalProcessor:
    """Turn a PDF into search-ready descriptions of its tables and figures.

    Layout analysis is delegated to the Document Intelligence client
    ("prebuilt-layout" model, markdown output). Every table and figure found
    is then described in natural language so it can be embedded and searched
    like ordinary text.

    NOTE(review): ``_table_to_markdown``, ``_describe_table`` and
    ``_extract_figure_image`` are called by ``process_document`` but are not
    defined in this listing; they have to be provided (for example by a
    subclass) before the class is usable.
    """

    def __init__(
        self,
        openai_client: AzureOpenAI,
        doc_intel_client: DocumentIntelligenceClient,
        vision_model: str = "gpt-4o",
    ):
        """Store the service clients.

        Args:
            openai_client: Chat-completions client used to describe images.
                A synchronous or an asynchronous client both work.
            doc_intel_client: Synchronous client used for layout analysis.
            vision_model: Value sent as ``model`` in the chat-completions
                request (on Azure OpenAI this is the deployment name).
        """
        self.llm = openai_client
        self.doc_intel = doc_intel_client
        self.vision_model = vision_model

    def _analyze_layout(self, pdf_bytes: bytes):
        """Run layout analysis and block until the service returns a result."""
        poller = self.doc_intel.begin_analyze_document(
            "prebuilt-layout",
            pdf_bytes,
            output_content_format="markdown"
        )
        return poller.result()

    @staticmethod
    def _first_page(element):
        """Return the page number of the first bounding region, or None."""
        regions = getattr(element, "bounding_regions", None)
        return regions[0].page_number if regions else None

    async def process_document(self, pdf_bytes: bytes) -> dict:
        """Analyze a PDF and describe each table and figure it contains.

        Args:
            pdf_bytes: Raw bytes of the PDF document.

        Returns:
            A dict with three lists: ``"tables"`` (entries with ``content``,
            ``description``, ``page``), ``"figures"`` (entries with
            ``description``, ``caption``, ``page``) and ``"text_chunks"``.
            Nothing in this method populates ``"text_chunks"``; it is
            returned empty. ``page`` is ``None`` when an element carries no
            bounding region.
        """
        # The analysis call and poller.result() block until the service is
        # done, so run them in a worker thread to keep the event loop free.
        result = await asyncio.to_thread(self._analyze_layout, pdf_bytes)

        extracted_content = {
            "text_chunks": [],
            "tables": [],
            "figures": []
        }

        # Process tables (the collection may be None when none were detected)
        for table in result.tables or []:
            table_markdown = self._table_to_markdown(table)
            description = await self._describe_table(table_markdown)
            extracted_content["tables"].append({
                "content": table_markdown,
                "description": description,
                "page": self._first_page(table)
            })

        # Process figures/images (same None handling as for tables)
        for figure in result.figures or []:
            image_data = self._extract_figure_image(pdf_bytes, figure)
            description = await self._describe_image(image_data)
            extracted_content["figures"].append({
                "description": description,
                "caption": figure.caption.content if figure.caption else None,
                "page": self._first_page(figure)
            })

        return extracted_content

    async def _describe_image(self, image_base64: str) -> str:
        """Ask the vision model for a detailed description of one image.

        Args:
            image_base64: Base64-encoded image data; it is sent as a
                ``data:image/png;base64,...`` URL.

        Returns:
            The model's description, or ``""`` if the reply has no text.
        """
        create = self.llm.chat.completions.create
        # A synchronous client does its HTTP work inside create(), so call it
        # in a worker thread. An asynchronous client merely hands back an
        # awaitable there, which is then awaited on the event loop.
        response = await asyncio.to_thread(
            create,
            model=self.vision_model,
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe this image in detail for search indexing:"},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}
                ]
            }],
            max_tokens=500
        )
        if inspect.isawaitable(response):
            response = await response
        # message.content can be None; keep the declared str return type.
        return response.choices[0].message.content or ""

Indexing Multimodal Content

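Embed every content type as text (text chunks directly, tables and figures through their generated descriptions) so that all entries can be searched together: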

async def create_multimodal_index(self, processed_doc: dict) -> list[dict]:
    """Build one embedded index entry per text chunk, table and figure.

    Sections are handled in the order text chunks, tables, figures, and
    ``self.get_embedding`` is awaited once per item, one item at a time.

    Args:
        processed_doc: Dict with the keys ``"text_chunks"``, ``"tables"``
            and ``"figures"``, each holding a list of dicts.

    Returns:
        A list of entries. Each has a ``"type"`` and an ``"embedding"``;
        text entries also carry ``"content"``, table entries ``"content"``
        and ``"description"``, figure entries ``"description"``.
    """
    # (entry type, section key, field that gets embedded, fields copied over)
    layout = (
        ("text", "text_chunks", "content", ("content",)),
        ("table", "tables", "description", ("content", "description")),
        ("figure", "figures", "description", ("description",)),
    )

    entries = []
    for kind, section, embed_field, kept_fields in layout:
        for item in processed_doc[section]:
            vector = await self.get_embedding(item[embed_field])
            entry = {"type": kind}
            entry.update((field, item[field]) for field in kept_fields)
            entry["embedding"] = vector
            entries.append(entry)
    return entries

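Multimodal RAG ensures users find relevant information regardless of how it’s represented in the source documents.

Takeaways

Extract tables and figures alongside the text, have GPT-4o describe them, and embed those descriptions in the same index as the text chunks, so that a single query can surface any of the three.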

Michael John Peña

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.