Back to Blog
3 min read

Azure Document Intelligence: Table Extraction and Analysis

Extracting tables from documents unlocks structured data trapped in PDFs and images. Azure Document Intelligence excels at identifying table boundaries, cell relationships, and complex merged cells.

Table Extraction Basics

from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult
from azure.core.credentials import AzureKeyCredential
import pandas as pd
import os

# Shared module-level client. Both settings are read from the environment at
# import time, so a missing variable raises KeyError as soon as this module
# is loaded (endpoint is looked up first, then the key).
_endpoint = os.environ["AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT"]
_credential = AzureKeyCredential(os.environ["AZURE_DOCUMENT_INTELLIGENCE_KEY"])

client = DocumentIntelligenceClient(endpoint=_endpoint, credential=_credential)

def extract_single_table(table) -> pd.DataFrame:
    """Convert one analyzed table into a DataFrame.

    The table is first laid out on a ``row_count`` x ``column_count`` grid.
    A merged cell has its content copied into every grid position it spans,
    so the grid has no holes where a span covered several rows or columns.
    The first grid row is used as the column labels and the remaining rows
    become the data.

    Args:
        table: A table object exposing ``row_count``, ``column_count``,
            ``cells`` (each with ``row_index``, ``column_index``,
            ``row_span``, ``column_span`` and ``content``) and
            ``bounding_regions``.

    Returns:
        A DataFrame with two entries in ``attrs``:

        * ``"confidence"``: mean of the per-cell ``confidence`` values, or
          ``None`` when no cell provides one.
        * ``"page_number"``: page of the table's first bounding region, or
          ``1`` when the table has no bounding regions.
    """
    rows = table.row_count
    cols = table.column_count
    cells = table.cells or []
    grid = [[None] * cols for _ in range(rows)]

    for cell in cells:
        # Spans are optional; a missing span means the cell covers one slot.
        row_span = cell.row_span or 1
        col_span = cell.column_span or 1

        # Clamp to the declared table size so an over-long span cannot
        # index outside the grid.
        row_stop = min(cell.row_index + row_span, rows)
        col_stop = min(cell.column_index + col_span, cols)
        for r in range(cell.row_index, row_stop):
            for c in range(cell.column_index, col_stop):
                grid[r][c] = cell.content

    df = pd.DataFrame(grid[1:], columns=grid[0] if grid else None)

    # NOTE(review): table cells may not expose a ``confidence`` attribute at
    # all (confirm against the installed SDK models). Read it defensively
    # and avoid dividing by zero for an empty table.
    scores = [
        score
        for score in (getattr(cell, "confidence", None) for cell in cells)
        if score is not None
    ]
    df.attrs["confidence"] = sum(scores) / len(scores) if scores else None
    df.attrs["page_number"] = (
        table.bounding_regions[0].page_number if table.bounding_regions else 1
    )
    return df


def extract_tables(document_url: str) -> list[pd.DataFrame]:
    """Extract all tables from a document as DataFrames.

    Runs the ``prebuilt-layout`` model on the document at ``document_url``
    using the module-level ``client`` and blocks until analysis finishes.

    Args:
        document_url: URL of the document to analyze.

    Returns:
        One DataFrame per detected table, in the order the service returned
        them. The list is empty when no tables were detected.
    """
    # NOTE(review): the keyword for the request payload differs between SDK
    # releases (``analyze_request`` vs ``body``) - confirm against the
    # installed azure-ai-documentintelligence version.
    poller = client.begin_analyze_document(
        model_id="prebuilt-layout",
        analyze_request={"urlSource": document_url}
    )
    result: AnalyzeResult = poller.result()

    # ``tables`` can be None when nothing was detected.
    return [extract_single_table(table) for table in result.tables or []]

Handling Complex Table Structures

from dataclasses import dataclass
from typing import List, Optional

@dataclass
class TableMetadata:
    """Structural summary of one extracted table.

    Attributes:
        page_number: Page of the table's first bounding region (1 when the
            table has no bounding regions).
        row_count: Number of rows reported for the table.
        column_count: Number of columns reported for the table.
        has_headers: Whether the first row looks like a header row.
        confidence: Mean per-cell confidence, or ``None`` when no cell
            provides a confidence value.
        bounding_box: Polygon of the table's first bounding region, or an
            empty list when it is unavailable.
    """

    page_number: int
    row_count: int
    column_count: int
    has_headers: bool
    confidence: Optional[float]
    bounding_box: List[float]


def analyze_table_structure(table) -> TableMetadata:
    """Analyze table structure for processing decisions.

    The first row is treated as a header row when it contains at least one
    cell and every cell in it is either marked ``"columnHeader"`` or has
    non-empty, all-uppercase content.

    Args:
        table: A table object exposing ``row_count``, ``column_count``,
            ``cells`` (each with ``row_index``, ``kind`` and ``content``)
            and ``bounding_regions``.

    Returns:
        A ``TableMetadata`` describing the table.
    """
    cells = table.cells or []
    first_row = [c for c in cells if c.row_index == 0]

    # all() is True for an empty iterable, so an empty first row must be
    # ruled out explicitly or a cell-less table would "have headers".
    has_headers = bool(first_row) and all(
        c.kind == "columnHeader" or bool(c.content and c.content.isupper())
        for c in first_row
    )

    # NOTE(review): table cells may not expose a ``confidence`` attribute at
    # all (confirm against the installed SDK models). Read it defensively
    # and avoid dividing by zero for an empty table.
    scores = [
        score
        for score in (getattr(c, "confidence", None) for c in cells)
        if score is not None
    ]
    confidence = sum(scores) / len(scores) if scores else None

    region = table.bounding_regions[0] if table.bounding_regions else None

    return TableMetadata(
        page_number=region.page_number if region else 1,
        row_count=table.row_count,
        column_count=table.column_count,
        has_headers=has_headers,
        confidence=confidence,
        bounding_box=(region.polygon or []) if region else [],
    )

def _polygon_y_values(polygon) -> list:
    """Return the y-coordinates of a polygon in either supported layout."""
    if not polygon:
        return []
    # Point-object polygons expose .x/.y; otherwise the polygon is a flat
    # [x1, y1, x2, y2, ...] sequence where every second value is a y.
    if hasattr(polygon[0], "y"):
        return [point.y for point in polygon]
    return list(polygon[1::2])


def _find_caption_candidates(table, paragraphs, max_distance) -> list:
    """Return contents of paragraphs ending just above the table, nearest first."""
    if not paragraphs or not table.bounding_regions:
        # Without a table position there is nothing to measure against.
        return []

    table_region = table.bounding_regions[0]
    table_ys = _polygon_y_values(table_region.polygon)
    if not table_ys:
        return []
    table_top = min(table_ys)

    candidates = []
    for para in paragraphs:
        if not para.bounding_regions:
            continue
        para_region = para.bounding_regions[0]
        # Coordinates are only comparable within one page.
        if para_region.page_number != table_region.page_number:
            continue
        para_ys = _polygon_y_values(para_region.polygon)
        if not para_ys:
            continue
        # Positive gap: the paragraph ends above the table's top edge.
        # Text that starts inside the table (e.g. cell content) yields a
        # negative gap and is skipped.
        gap = table_top - max(para_ys)
        if 0 <= gap < max_distance:
            candidates.append((gap, para.content))

    # Stable sort: equally distant paragraphs keep their document order.
    candidates.sort(key=lambda item: item[0])
    return [content for _, content in candidates]


def extract_tables_with_context(
    document_url: str, *, max_caption_distance: float = 50
) -> List[dict]:
    """Extract tables with surrounding context for better understanding.

    Runs the ``prebuilt-layout`` model on the document at ``document_url``
    using the module-level ``client``. For each table, looks for a paragraph
    on the same page whose bottom edge lies at or above the table's top edge
    and within ``max_caption_distance`` of it. The nearest such paragraph is
    reported as the caption.

    Relies on the module-level ``analyze_table_structure(table)`` and
    ``extract_single_table(table)`` helpers.

    Args:
        document_url: URL of the document to analyze.
        max_caption_distance: Largest allowed vertical gap between a
            paragraph's bottom edge and the table's top edge, in the same
            units as the polygon coordinates.
            NOTE(review): the coordinate unit presumably depends on the
            input type (e.g. inches for PDFs, pixels for images), so the
            default of 50 may be far too permissive for PDFs - confirm and
            tune per input type.

    Returns:
        One dict per table with keys ``"index"``, ``"metadata"``,
        ``"caption"`` (``None`` when no nearby paragraph was found) and
        ``"dataframe"``. The list is empty when no tables were detected.
    """
    poller = client.begin_analyze_document(
        model_id="prebuilt-layout",
        analyze_request={"urlSource": document_url}
    )
    result: AnalyzeResult = poller.result()

    paragraphs = result.paragraphs or []
    tables_with_context = []

    # ``tables`` can be None when nothing was detected.
    for i, table in enumerate(result.tables or []):
        metadata = analyze_table_structure(table)
        nearby_text = _find_caption_candidates(
            table, paragraphs, max_caption_distance
        )

        tables_with_context.append({
            "index": i,
            "metadata": metadata,
            "caption": nearby_text[0] if nearby_text else None,
            "dataframe": extract_single_table(table)
        })

    return tables_with_context

Table extraction transforms static reports into queryable data. Combine extracted tables with LLM analysis to answer questions about financial statements, research papers, and technical specifications.

Michael John Peña

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.