1 min read
Multimodal RAG Advances: Images, Documents, and Beyond
I wrote “Multimodal RAG Advances: Images, Documents, and Beyond” to share practical, production-minded guidance on this topic.
Multimodal Embedding Strategies
import asyncio
import base64
import inspect
import json
import mimetypes
import uuid

from azure.ai.foundry import AIFoundryClient
class MultimodalEmbedder:
    """Generate embeddings for text, images, and document pages.

    Every model call goes through the injected ``ai_client``. The deployment
    names used below ("text-embedding-3-large", "multimodal-embedding",
    "gpt-4o") are passed to that client verbatim.
    """

    def __init__(self, ai_client: "AIFoundryClient"):
        self.ai_client = ai_client

    @staticmethod
    def _read_image_base64(image_path: str) -> str:
        """Return the bytes of ``image_path`` as a base64-encoded string."""
        with open(image_path, "rb") as f:
            return base64.b64encode(f.read()).decode()

    async def embed_text(self, text: str) -> list[float]:
        """Embed ``text`` and return the first embedding of the response."""
        response = await self.ai_client.embeddings.create_async(
            deployment="text-embedding-3-large",
            input=[text],
            dimensions=1024,
        )
        return response.data[0].embedding

    async def _describe_image(self, image_path: str) -> str:
        """Ask the chat deployment for a detailed description of an image.

        The image is sent inline as a base64 data URL, mirroring the chat
        call shape used for chart extraction elsewhere in this file.

        Args:
            image_path: Path of the image file to describe.

        Returns:
            The message content of the first choice in the chat response.
        """
        image_data = self._read_image_base64(image_path)
        # Fall back to PNG when the extension is unknown.
        mime_type = mimetypes.guess_type(image_path)[0] or "image/png"
        response = await self.ai_client.chat.complete_async(
            deployment="gpt-4o",
            messages=[{
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": (
                            "Describe this image in detail for search "
                            "indexing. Include any visible text, objects, "
                            "charts, diagrams, and layout."
                        ),
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{image_data}",
                        },
                    },
                ],
            }],
        )
        return response.choices[0].message.content

    async def embed_image(self, image_path: str) -> list[float]:
        """Embed an image by describing it and embedding the description."""
        description = await self._describe_image(image_path)
        return await self.embed_text(description)

    async def embed_image_native(self, image_path: str) -> list[float]:
        """Embed an image directly via the "multimodal-embedding" deployment.

        NOTE(review): no ``dimensions`` is requested here, so the vector
        length may differ from ``embed_text`` output -- confirm before
        combining the two.
        """
        image_data = self._read_image_base64(image_path)
        response = await self.ai_client.embeddings.create_async(
            deployment="multimodal-embedding",
            input=[{"type": "image", "data": image_data}],
        )
        return response.data[0].embedding

    async def embed_document_page(
        self,
        page_image: str,
        page_text: str,
        text_weight: float = 0.7,
        visual_weight: float = 0.3,
    ) -> dict:
        """Create text, visual, and combined embeddings for a document page.

        Args:
            page_image: Path of the rendered page image.
            page_text: Extracted page text; when empty, no text embedding is
                computed and ``text_embedding`` is ``None``.
            text_weight: Weight of the text embedding in the combination.
            visual_weight: Weight of the visual embedding in the combination.

        Returns:
            Dict with ``text_embedding``, ``visual_embedding`` and
            ``combined_embedding`` keys.
        """
        text_emb = await self.embed_text(page_text) if page_text else None
        # Description-based embedding, so it shares the text embedding size.
        visual_emb = await self.embed_image(page_image)
        return {
            "text_embedding": text_emb,
            "visual_embedding": visual_emb,
            "combined_embedding": self._combine_embeddings(
                text_emb,
                visual_emb,
                text_weight=text_weight,
                visual_weight=visual_weight,
            ),
        }

    def _combine_embeddings(
        self,
        text_emb: list,
        visual_emb: list,
        text_weight: float = 0.7,
        visual_weight: float = 0.3,
    ) -> list:
        """Return the weighted sum of a text and a visual embedding.

        If either embedding is ``None`` the other one is returned unchanged.

        Raises:
            ValueError: If both embeddings are given but differ in length.
        """
        if text_emb is None:
            return visual_emb
        if visual_emb is None:
            return text_emb
        # Guard explicitly: NumPy would silently broadcast a length-1 vector.
        if len(text_emb) != len(visual_emb):
            raise ValueError(
                "Cannot combine embeddings of different lengths: "
                f"{len(text_emb)} != {len(visual_emb)}"
            )
        import numpy as np

        combined = (
            np.asarray(text_emb, dtype=float) * text_weight
            + np.asarray(visual_emb, dtype=float) * visual_weight
        )
        return combined.tolist()
Late Fusion Multimodal Search
class LateFusionMultimodalSearch:
    """Search text and image indexes separately, then fuse the results.

    Args:
        text_index: Object exposing ``await search(embedding, top_k=...)``.
        image_index: Object with the same ``search`` interface.
        embedder: Object exposing ``await embed_text(query)``.
        image_score_weight: Multiplier applied to image scores before the
            merged list is ranked. Defaults to 0.9 (a slight discount).
    """

    def __init__(
        self,
        text_index,
        image_index,
        embedder,
        image_score_weight: float = 0.9,
    ):
        self.text_index = text_index
        self.image_index = image_index
        self.embedder = embedder
        self.image_score_weight = image_score_weight

    async def search(
        self,
        query: str,
        include_images: bool = True,
        top_k: int = 10,
    ) -> list[dict]:
        """Return the ``top_k`` best hits across both indexes.

        Each hit is a copy of the index result with two extra keys:
        ``content_type`` ("text" or "image") and ``fusion_score`` (the raw
        ``score`` for text, ``score * image_score_weight`` for images).

        Args:
            query: Natural-language query; embedded once and reused for both
                indexes.
            include_images: When false, the image index is not queried.
            top_k: Number of hits requested per index and returned overall.
                A non-positive value yields an empty list.
        """
        # A negative slice bound would otherwise return hits from the tail.
        if top_k <= 0:
            return []

        query_embedding = await self.embedder.embed_text(query)

        text_results = await self.text_index.search(query_embedding, top_k=top_k)
        image_results = []
        if include_images:
            image_results = await self.image_index.search(
                query_embedding, top_k=top_k
            )

        fused = [
            {**hit, "content_type": "text", "fusion_score": hit["score"]}
            for hit in text_results
        ]
        fused.extend(
            {
                **hit,
                "content_type": "image",
                "fusion_score": hit["score"] * self.image_score_weight,
            }
            for hit in image_results
        )

        # Stable sort: on ties, text hits stay ahead of image hits.
        fused.sort(key=lambda hit: hit["fusion_score"], reverse=True)
        return fused[:top_k]
Chart and Diagram Understanding
class ChartRAG:
    """RAG system specialized for charts and diagrams.

    Args:
        ai_client: Client used for the vision/chat calls.
        index_client: Search index exposing ``upload_documents(docs)`` and
            ``await search(embedding, filter=..., top_k=...)``.
        embedder: Object exposing ``await embed_text(text)``. When omitted,
            a ``MultimodalEmbedder`` bound to ``ai_client`` is created.
    """

    def __init__(self, ai_client: "AIFoundryClient", index_client, embedder=None):
        self.ai_client = ai_client
        self.index = index_client
        # index_chart/answer_chart_question read self.embedder, so it must
        # always be set here.
        self.embedder = (
            embedder if embedder is not None else MultimodalEmbedder(ai_client)
        )

    async def index_chart(self, chart_path: str, metadata: dict) -> str:
        """Extract, describe, embed, and upload one chart.

        Args:
            chart_path: Path of the chart image.
            metadata: Optional ``id`` for the index entry; a random UUID is
                generated when it is missing or empty.

        Returns:
            The id of the uploaded index document.
        """
        chart_data = await self._extract_chart_data(chart_path)
        description = await self._generate_description(chart_path, chart_data)

        # The model is not guaranteed to return every requested key, so
        # fall back to neutral defaults instead of raising KeyError.
        doc = {
            "id": metadata.get("id") or str(uuid.uuid4()),
            "type": "chart",
            "path": chart_path,
            "chart_type": chart_data.get("type", ""),
            "title": chart_data.get("title", ""),
            "data_summary": chart_data.get("summary", ""),
            "description": description,
            "extracted_data": json.dumps(chart_data.get("data", [])),
            "embedding": await self.embedder.embed_text(description),
        }

        # search() on this client is awaited, so upload_documents() may be a
        # coroutine as well; await it when it is, otherwise it already ran.
        upload = self.index.upload_documents([doc])
        if inspect.isawaitable(upload):
            await upload
        return doc["id"]

    async def _generate_description(self, chart_path: str, chart_data: dict) -> str:
        """Build searchable text from the extracted chart fields.

        Joins the non-empty title, chart type, axis labels, and summary as
        ``"Label: value"`` lines; falls back to ``chart_path`` when none of
        those fields is present.
        """
        labelled = (
            ("Title", chart_data.get("title")),
            ("Chart type", chart_data.get("type")),
            ("X axis", chart_data.get("x_axis")),
            ("Y axis", chart_data.get("y_axis")),
            ("Summary", chart_data.get("summary")),
        )
        lines = [f"{label}: {value}" for label, value in labelled if value]
        return "\n".join(lines) or chart_path

    @staticmethod
    def _parse_json_response(content: str) -> dict:
        """Parse a JSON object from model output, tolerating Markdown fences.

        Raises:
            ValueError: If the content is not valid JSON or not an object.
        """
        text = (content or "").strip()
        if text.startswith("```"):
            # Drop the opening fence line (``` or ```json) and closing fence.
            text = text.split("\n", 1)[1] if "\n" in text else ""
            text = text.rsplit("```", 1)[0]
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError as err:
            raise ValueError("Chart extraction did not return valid JSON") from err
        if not isinstance(parsed, dict):
            raise ValueError("Chart extraction did not return a JSON object")
        return parsed

    async def _extract_chart_data(self, chart_path: str) -> dict:
        """Ask the vision model for the chart's structured data.

        Returns:
            The parsed JSON object from the model response.

        Raises:
            ValueError: If the response cannot be parsed as a JSON object.
        """
        with open(chart_path, "rb") as f:
            image_data = base64.b64encode(f.read()).decode()
        # Fall back to PNG when the extension is unknown.
        mime_type = mimetypes.guess_type(chart_path)[0] or "image/png"

        response = await self.ai_client.chat.complete_async(
            deployment="gpt-4o",
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": """Extract all data from this chart.
Return JSON with:
- type: chart type
- title: chart title
- x_axis: x-axis label
- y_axis: y-axis label
- data: array of data points
- summary: brief text summary of key insights"""},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{image_data}",
                        },
                    },
                ],
            }],
        )
        return self._parse_json_response(response.choices[0].message.content)

    async def answer_chart_question(self, question: str) -> dict:
        """Answer a question using the three most relevant indexed charts.

        Returns:
            Dict with ``answer`` (model text), ``charts`` (image paths), and
            ``sources`` (index ids). All three keys are present even when no
            chart matches.
        """
        query_embedding = await self.embedder.embed_text(question)
        results = await self.index.search(
            query_embedding,
            filter="type eq 'chart'",
            top_k=3,
        )
        if not results:
            return {
                "answer": "No relevant charts found.",
                "charts": [],
                "sources": [],
            }

        context_parts = [
            f"Chart: {r['title']}\nData: {r['extracted_data']}\nSummary: {r['data_summary']}"
            for r in results
        ]
        chart_images = [r["path"] for r in results]

        response = await self.ai_client.chat.complete_async(
            deployment="gpt-4o",
            messages=[{
                "role": "system",
                "content": "Answer based on these charts:\n\n" + "\n\n".join(context_parts),
            }, {
                "role": "user",
                "content": question,
            }],
        )
        return {
            "answer": response.choices[0].message.content,
            "charts": chart_images,
            "sources": [r["id"] for r in results],
        }
Document Layout Understanding
class LayoutAwareRAG:
    """RAG that understands document layout and structure.

    NOTE(review): this class calls ``_extract_page_content``,
    ``_index_region`` and ``_table_to_markdown``, none of which is defined
    in this snippet; they must be supplied (e.g. by a subclass) before
    ``index_document`` or table handling can run.

    Args:
        doc_intelligence_client: Client exposing
            ``begin_analyze_document(model_id, stream)`` returning a poller
            with a blocking ``result()``.
        ai_client: Client for model calls.
        index_client: Search index exposing
            ``await search(embedding, filter=..., top_k=...)``.
        embedder: Object exposing ``await embed_text(text)``. When omitted,
            a ``MultimodalEmbedder`` bound to ``ai_client`` is created.
    """

    def __init__(self, doc_intelligence_client, ai_client, index_client, embedder=None):
        self.doc_client = doc_intelligence_client
        self.ai_client = ai_client
        self.index = index_client
        # search_with_layout reads self.embedder, so it must always be set.
        self.embedder = (
            embedder if embedder is not None else MultimodalEmbedder(ai_client)
        )

    def _analyze_layout(self, doc_path: str):
        """Run the blocking "prebuilt-layout" analysis and return its result."""
        with open(doc_path, "rb") as f:
            return self.doc_client.begin_analyze_document(
                "prebuilt-layout", f
            ).result()

    async def index_document(self, doc_path: str) -> list[str]:
        """Analyze a document's layout and index every region of every page.

        Args:
            doc_path: Path of the document to analyze.

        Returns:
            The ids returned by ``_index_region``, in page and region order.
        """
        # The poller's result() blocks until analysis finishes; run it in a
        # worker thread so the event loop stays responsive.
        result = await asyncio.to_thread(self._analyze_layout, doc_path)

        indexed_ids = []
        for page in result.pages:
            page_content = self._extract_page_content(page, result)
            for region in self._identify_regions(page_content):
                doc_id = await self._index_region(
                    region, doc_path, page.page_number
                )
                indexed_ids.append(doc_id)
        return indexed_ids

    def _identify_regions(self, page_content: dict) -> list[dict]:
        """Flatten page content into typed regions.

        Regions are emitted in a fixed order: headers, paragraphs, tables,
        figures. Missing categories are treated as empty.
        """
        regions = [
            {
                "type": "header",
                "content": header["text"],
                "level": header.get("level", 1),
            }
            for header in page_content.get("headers", [])
        ]
        regions.extend(
            {"type": "paragraph", "content": para["text"]}
            for para in page_content.get("paragraphs", [])
        )
        regions.extend(
            {
                "type": "table",
                "content": self._table_to_markdown(table),
                "headers": table.get("column_headers", []),
            }
            for table in page_content.get("tables", [])
        )
        regions.extend(
            {
                "type": "figure",
                "content": figure.get("caption", ""),
                "image_path": figure.get("image_path"),
            }
            for figure in page_content.get("figures", [])
        )
        return regions

    async def search_with_layout(
        self,
        query: str,
        region_types: list = None,
        top_k: int = 10,
    ) -> list[dict]:
        """Search the index, optionally restricted to some region types.

        Args:
            query: Natural-language query to embed and search with.
            region_types: Region ``type`` values to match (OR-combined).
                ``None`` or an empty list applies no filter.
            top_k: Maximum number of results requested from the index.

        Returns:
            Whatever the index's ``search`` returns.
        """
        query_embedding = await self.embedder.embed_text(query)

        filter_str = None
        if region_types:
            # OData string literals escape a single quote by doubling it;
            # without this a quote in a type name breaks (or alters) the
            # filter expression.
            clauses = []
            for region_type in region_types:
                escaped = str(region_type).replace("'", "''")
                clauses.append(f"type eq '{escaped}'")
            filter_str = " or ".join(clauses)

        return await self.index.search(
            query_embedding,
            filter=filter_str,
            top_k=top_k,
        )
Best Practices for Multimodal RAG
- Rich image descriptions: Generate detailed text descriptions for visual content
- Preserve structure: Maintain document layout information
- Multiple embeddings: Consider separate text and visual embeddings
- Chart data extraction: Store extracted data alongside visual embeddings
- Hybrid retrieval: Combine visual and textual search
- Quality filtering: Filter low-quality or irrelevant images
Multimodal RAG unlocks knowledge trapped in visual formats. Start with document-heavy use cases where diagrams and charts carry critical information.

## Takeaways

Add a concise, personal takeaway and recommended next steps here.