February 23, 2023 1 min read

Document Layout Analysis and Table Extraction with Azure AI

Azure Document Intelligence Tables Layout AI

Understanding document structure is crucial for accurate data extraction. Azure’s layout analysis capabilities extract text, tables, and structural elements from any document.

Layout Analysis Basics

from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential

# Module-level client used by the functions in this file that call
# ``client.begin_analyze_document``.
# NOTE: the endpoint and key below are placeholders; substitute the values of
# your own resource before running the examples.
client = DocumentAnalysisClient(
    endpoint="https://your-resource.cognitiveservices.azure.com/",
    credential=AzureKeyCredential("your-key")
)

def analyze_layout(file_path: str) -> dict:
    """Analyze a document with the ``prebuilt-layout`` model and summarize it.

    Args:
        file_path: Path of the document to analyze. The file is opened in
            binary mode and submitted through the module-level ``client``.

    Returns:
        A dict with four keys:

        * ``pages``: one dict per page holding ``page_number``, ``width``,
          ``height``, ``unit``, ``lines`` (content plus the line polygon,
          stored under the ``bounding_box`` key), ``words`` (content and
          confidence) and ``selection_marks`` (state and confidence).
        * ``tables``: one ``extract_table`` dict per detected table.
        * ``paragraphs``: content, role and the page number of every
          bounding region of each paragraph.
        * ``styles``: ``is_handwritten`` flag and confidence of each
          detected style.
    """
    with open(file_path, "rb") as f:
        poller = client.begin_analyze_document("prebuilt-layout", f)
    result = poller.result()

    layout = {
        "pages": [],
        "tables": [],
        "paragraphs": [],
        "styles": []
    }

    # Every collection is read through ``or []`` so that a missing (None)
    # collection is handled like an empty one.
    for page in result.pages or []:
        layout["pages"].append({
            "page_number": page.page_number,
            "width": page.width,
            "height": page.height,
            "unit": page.unit,
            "lines": [
                {"content": line.content, "bounding_box": line.polygon}
                for line in page.lines or []
            ],
            "words": [
                {"content": word.content, "confidence": word.confidence}
                for word in page.words or []
            ],
            "selection_marks": [
                {"state": mark.state, "confidence": mark.confidence}
                for mark in page.selection_marks or []
            ]
        })

    for table in result.tables or []:
        layout["tables"].append(extract_table(table))

    for para in result.paragraphs or []:
        layout["paragraphs"].append({
            "content": para.content,
            "role": para.role,
            "bounding_regions": [
                {"page": region.page_number}
                for region in para.bounding_regions or []
            ]
        })

    # The "styles" entry used to be declared but never filled in.
    for style in result.styles or []:
        layout["styles"].append({
            "is_handwritten": style.is_handwritten,
            "confidence": style.confidence
        })

    return layout

Table Extraction

import pandas as pd

def extract_table(table) -> dict:
    """Flatten a table object into plain dictionaries.

    Returns a dict with the table's ``row_count`` and ``column_count``, a
    ``cells`` list describing every cell (position, content, kind and spans)
    and a ``spans`` list that repeats position and span sizes for the cells
    covering more than one row or column.
    """
    cells = [
        {
            "row": entry.row_index,
            "column": entry.column_index,
            "content": entry.content,
            # e.g. "content", "rowHeader", "columnHeader"
            "kind": entry.kind,
            "row_span": entry.row_span,
            "column_span": entry.column_span,
        }
        for entry in table.cells
    ]

    # Only cells that stretch over several rows or columns count as spans.
    span_keys = ("row", "column", "row_span", "column_span")
    spans = [
        {key: info[key] for key in span_keys}
        for info in cells
        if info["row_span"] > 1 or info["column_span"] > 1
    ]

    return {
        "row_count": table.row_count,
        "column_count": table.column_count,
        "cells": cells,
        "spans": spans,
    }

def table_to_dataframe(table) -> pd.DataFrame:
    """Convert an extracted table to a pandas DataFrame.

    Each cell's content is written to every grid position the cell covers,
    so a cell spanning several rows or columns repeats its content in each
    of them. Positions no cell covers stay ``None``.

    If row 0 contains at least one ``columnHeader`` cell, that row becomes
    the column labels and is dropped from the data. Row-header cells do not
    trigger this: they label rows, so a table whose first row merely starts
    with a row header keeps all of its rows as data.

    Args:
        table: Object exposing ``row_count``, ``column_count`` and ``cells``;
            each cell exposes ``row_index``, ``column_index``, ``content``,
            ``kind``, ``row_span`` and ``column_span``.

    Returns:
        The table as a DataFrame (empty when the table has no rows).
    """
    n_rows = table.row_count
    n_cols = table.column_count
    grid = [[None] * n_cols for _ in range(n_rows)]

    for cell in table.cells:
        # A missing span counts as 1; the ends are clamped to the grid so a
        # span that overshoots the declared size cannot raise IndexError.
        row_end = min(cell.row_index + (cell.row_span or 1), n_rows)
        col_end = min(cell.column_index + (cell.column_span or 1), n_cols)
        for r in range(cell.row_index, row_end):
            for c in range(cell.column_index, col_end):
                grid[r][c] = cell.content

    has_header = any(
        cell.row_index == 0 and cell.kind == "columnHeader"
        for cell in table.cells
    )

    if has_header and grid:
        return pd.DataFrame(grid[1:], columns=grid[0])
    return pd.DataFrame(grid)

def extract_all_tables(file_path: str) -> list:
    """Run layout analysis on a file and return every table as a DataFrame.

    Each entry of the returned list is a dict with the table's position in
    the result (``table_index``), the page number of its first bounding
    region (``page``, ``None`` when the table has no bounding regions), its
    ``rows`` and ``columns`` counts and the converted ``dataframe``.
    """
    with open(file_path, "rb") as document:
        poller = client.begin_analyze_document("prebuilt-layout", document)
    analysis = poller.result()

    extracted = []
    for index, table in enumerate(analysis.tables):
        frame = table_to_dataframe(table)
        regions = table.bounding_regions
        first_page = regions[0].page_number if regions else None
        entry = {
            "table_index": index,
            "page": first_page,
            "rows": table.row_count,
            "columns": table.column_count,
            "dataframe": frame,
        }
        extracted.append(entry)

    return extracted

Structured Document Parsing

class StructuredDocumentParser:
    """Parse documents with known structure.

    The parser relies on the paragraph roles reported by the
    ``prebuilt-layout`` model (``title``, ``sectionHeading``, ...) to group
    paragraphs into sections.
    """

    def __init__(self, client):
        """Store the analysis client used by ``parse_report``."""
        self.client = client

    def parse_report(self, file_path: str) -> dict:
        """Parse a structured report.

        Args:
            file_path: Path of the document; opened in binary mode and
                analyzed with the ``prebuilt-layout`` model.

        Returns:
            A dict with:

            * ``title``: content of the last paragraph with role ``title``,
              or ``None`` if there is none.
            * ``sections``: list of ``{"heading", "content"}`` dicts, one per
              ``sectionHeading`` paragraph, in document order.
            * ``tables``: one DataFrame per detected table.
            * ``figures``: always an empty list (nothing populates it here).
            * ``introduction``: body paragraphs that precede the first
              section heading; the key is present only when such paragraphs
              exist.

            Paragraphs with any other role are ignored.
        """
        with open(file_path, "rb") as f:
            poller = self.client.begin_analyze_document("prebuilt-layout", f)
        result = poller.result()

        report = {
            "title": None,
            "sections": [],
            "tables": [],
            "figures": []
        }

        current_section = None

        for para in result.paragraphs:
            role = para.role

            if role == "title":
                report["title"] = para.content

            elif role == "sectionHeading":
                # A new heading closes the section that was being collected.
                if current_section:
                    report["sections"].append(current_section)
                current_section = {
                    "heading": para.content,
                    "content": []
                }

            elif role in ["paragraph", None]:
                if current_section:
                    current_section["content"].append(para.content)
                else:
                    # Content before first section
                    report.setdefault("introduction", []).append(para.content)

        # Flush the section that was still open at the end of the document.
        if current_section:
            report["sections"].append(current_section)

        for table in result.tables:
            report["tables"].append(table_to_dataframe(table))

        return report

    def extract_section(
        self,
        result,
        section_title: str
    ) -> dict:
        """Extract a specific section by title.

        The section starts at the first ``sectionHeading`` paragraph whose
        content contains ``section_title`` (case-insensitive substring match)
        and ends at the next ``sectionHeading`` of any kind, even one that
        also contains ``section_title``.

        Args:
            result: Analysis result exposing ``paragraphs``; each paragraph
                exposes ``role`` and ``content``.
            section_title: Text to look for in the section headings.

        Returns:
            ``{"title": section_title, "content": [...]}`` where ``content``
            lists the content of every non-heading paragraph inside the
            section (empty when no heading matches).
        """
        wanted = section_title.lower()
        in_section = False
        section_content = []

        for para in result.paragraphs:
            if para.role == "sectionHeading":
                if in_section:
                    # Any heading after the match ends the section; checking
                    # this first keeps a later heading that also contains
                    # the title from being merged into this section.
                    break
                if wanted in para.content.lower():
                    in_section = True
            elif in_section:
                section_content.append(para.content)

        return {
            "title": section_title,
            "content": section_content
        }

Complex Table Handling

class TableProcessor:
    """Handle complex table scenarios."""

    @staticmethod
    def merge_split_tables(tables: list, threshold: float = 50) -> list:
        """Merge tables that span across pages.

        Consecutive tables are merged when ``_should_merge`` accepts them.
        Each candidate is compared with the table immediately before it, so
        a table split over three or more pages collapses into one entry.

        Args:
            tables: Table objects in document order.
            threshold: Forwarded to ``_should_merge``, which currently
                ignores it.

        Returns:
            A list in which every run of merged tables is replaced by a
            single DataFrame, while tables that were not merged are returned
            unchanged. Inputs with fewer than two tables are returned as is.
        """
        if len(tables) < 2:
            return tables

        merged = []
        current = tables[0]    # entry being built: raw table or merged frame
        previous = tables[0]   # last raw table seen, used for the comparison

        for next_table in tables[1:]:
            if TableProcessor._should_merge(previous, next_table, threshold):
                current = TableProcessor._merge_tables(current, next_table)
            else:
                merged.append(current)
                current = next_table
            previous = next_table

        merged.append(current)
        return merged

    @staticmethod
    def _should_merge(table1, table2, threshold) -> bool:
        """Determine if tables should be merged.

        Two tables qualify when they have the same column count and, if both
        carry bounding regions, the first region of ``table2`` is on the page
        right after the first region of ``table1``. ``threshold`` is accepted
        for interface compatibility but not used.
        """
        if table1.column_count != table2.column_count:
            return False

        if table2.bounding_regions and table1.bounding_regions:
            page1 = table1.bounding_regions[0].page_number
            page2 = table2.bounding_regions[0].page_number
            if page2 - page1 != 1:
                return False

        return True

    @staticmethod
    def _as_dataframe(table) -> pd.DataFrame:
        """Return ``table`` unchanged if already a DataFrame, else convert it."""
        if isinstance(table, pd.DataFrame):
            return table
        return table_to_dataframe(table)

    @staticmethod
    def _merge_tables(table1, table2) -> pd.DataFrame:
        """Merge two tables by stacking the rows of the second under the first.

        Either argument may be a raw table or an already merged DataFrame.
        When the column labels differ but the column counts match (typically
        a continuation page without a repeated header row), the second
        table's columns are relabelled positionally so its rows line up under
        the first table's columns instead of producing NaN-filled columns.
        """
        df1 = TableProcessor._as_dataframe(table1)
        df2 = TableProcessor._as_dataframe(table2)

        same_labels = list(df1.columns) == list(df2.columns)
        if not same_labels and len(df1.columns) == len(df2.columns):
            df2 = df2.copy()  # do not relabel the caller's frame
            df2.columns = df1.columns

        return pd.concat([df1, df2], ignore_index=True)

    @staticmethod
    def extract_nested_headers(table) -> dict:
        """Extract multi-level headers from table.

        A row counts as a header level when it contains at least one
        ``columnHeader`` cell. Every cell of such a row is reported, in the
        order it appears in ``table.cells``.

        Returns:
            ``{"levels": [...], "mapping": {}}`` where ``levels`` holds, per
            header row (top to bottom), a list of
            ``{"column", "content", "span"}`` dicts. ``mapping`` is always
            empty; nothing populates it here.
        """
        header_rows = sorted(
            {cell.row_index for cell in table.cells if cell.kind == "columnHeader"}
        )

        # Group the cells of the header rows in a single pass over the table.
        cells_by_row = {row_idx: [] for row_idx in header_rows}
        for cell in table.cells:
            if cell.row_index in cells_by_row:
                cells_by_row[cell.row_index].append({
                    "column": cell.column_index,
                    "content": cell.content,
                    "span": cell.column_span
                })

        return {
            "levels": [cells_by_row[row_idx] for row_idx in header_rows],
            "mapping": {}
        }

Visualization

from PIL import Image, ImageDraw
import io

def visualize_layout(file_path: str, output_path: str):
    """Draw the detected text lines and tables onto the input image.

    The file is analyzed with the ``prebuilt-layout`` model, then opened as
    an image; line polygons are outlined in blue, table regions in green,
    and the annotated image is saved to ``output_path``.
    """
    with open(file_path, "rb") as source:
        poller = client.begin_analyze_document("prebuilt-layout", source)
    analysis = poller.result()

    def to_points(polygon):
        # Polygon vertices as (x, y) tuples, the form the drawing call takes.
        return [(vertex.x, vertex.y) for vertex in polygon]

    # The input is assumed to be an image; a PDF would have to be converted
    # to an image before it could be opened here.
    with Image.open(file_path) as canvas:
        pen = ImageDraw.Draw(canvas)

        # Text lines: blue outline
        for page in analysis.pages:
            for text_line in page.lines:
                if not text_line.polygon:
                    continue
                pen.polygon(to_points(text_line.polygon), outline="blue")

        # Tables: green outline
        for table in analysis.tables:
            if not table.bounding_regions:
                continue
            for region in table.bounding_regions:
                if not region.polygon:
                    continue
                pen.polygon(to_points(region.polygon), outline="green", width=2)

        canvas.save(output_path)

Best Practices

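Use the layout model for structure: Choose prebuilt-layout when you need the document's organization (paragraph roles, tables, selection marks), not just its text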
Handle multi-page tables: Merge tables split across pages
Validate table structure: Check row/column counts
Consider nested headers: Multi-level headers are common
Export to appropriate formats: CSV, Excel, DataFrame