
Unifying Transactional and Analytical Workloads in Fabric

Microsoft Fabric’s vision is to unify all data workloads in one platform. Let’s explore practical patterns for bringing transactional and analytical workloads together.

The Unified Architecture

"""
Unified Data Platform in Fabric:

Data Sources         Fabric Platform              Consumers
-----------         ---------------              ---------

[IoT Devices] --+
                |   +------------------+
[Web Apps]   ---+-->| Event Streams    |----+
                |   +------------------+    |
[Mobile]     ---+                           |
                                            v
                    +------------------+  +------------------+
                    | SQL Database     |  | Real-Time        |
                    | (Transactional)  |  | Intelligence     |
                    +------------------+  +------------------+
                             |                    |
                             |   Auto-Mirror      |
                             v                    v
                    +------------------+  +------------------+
                    |     OneLake      |<-| KQL Database     |
                    |  (Delta Format)  |  +------------------+
                    +------------------+
                              |
              +--------------+--------------+
              |              |              |
              v              v              v
     +------------+  +------------+  +------------+
     | Lakehouse  |  | Warehouse  |  | Semantic   |
     |            |  |            |  | Model      |
     +------------+  +------------+  +------------+
              |              |              |
              v              v              v
     +------------+  +------------+  +------------+
     | Notebooks  |  | SQL Editor |  | Power BI   |
     +------------+  +------------+  +------------+
"""

Cross-Workload Data Patterns

from dataclasses import dataclass
from typing import List

@dataclass
class DataFlow:
    """Defines a data flow across Fabric workloads"""
    name: str
    source: str
    transformations: List[str]
    destination: str
    latency_requirement: str  # "real-time", "near-real-time", "batch"

class UnifiedDataPlatform:
    """Orchestrate data across Fabric workloads"""

    def __init__(self, workspace_id: str):
        self.workspace_id = workspace_id
        self.flows: List[DataFlow] = []

    def add_flow(self, flow: DataFlow):
        self.flows.append(flow)

    def get_architecture_diagram(self) -> str:
        """Generate the architecture diagram as Mermaid markup"""
        lines = ["graph LR"]

        for flow in self.flows:
            # Reduce names to alphanumeric/underscore node IDs and quote the
            # labels so punctuation such as parentheses renders correctly
            source_id = "".join(c if c.isalnum() else "_" for c in flow.source)
            dest_id = "".join(c if c.isalnum() else "_" for c in flow.destination)

            lines.append(f'    {source_id}["{flow.source}"]')
            lines.append(f'    {dest_id}["{flow.destination}"]')
            lines.append(f"    {source_id} -->|{flow.latency_requirement}| {dest_id}")

        return "\n".join(lines)

# Define unified data flows
platform = UnifiedDataPlatform("my-workspace")

# Transactional to Analytical
platform.add_flow(DataFlow(
    name="Order Processing Analytics",
    source="SQL Database (Orders)",
    transformations=["Auto-mirror to Delta"],
    destination="Lakehouse (Order Facts)",
    latency_requirement="near-real-time"
))

# Real-time Events to Analytics
platform.add_flow(DataFlow(
    name="Event Stream Analytics",
    source="Event Stream (User Events)",
    transformations=["Real-time processing"],
    destination="KQL Database",
    latency_requirement="real-time"
))

# Analytics to Reporting
platform.add_flow(DataFlow(
    name="Business Intelligence",
    source="Lakehouse (Curated)",
    transformations=["Semantic modeling"],
    destination="Power BI Reports",
    latency_requirement="batch"
))
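
With the flows registered, you can print the generated Mermaid markup and paste it into any Mermaid renderer (a notebook markdown cell works well) to sanity-check the design before building anything:

# Render the Mermaid definition for the registered flows
print(platform.get_architecture_diagram())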

Implementing the Unified Pattern

# Step 1: Transactional Database Setup
transactional_sql = """
-- Core transactional tables
CREATE TABLE sales.orders (
    order_id BIGINT IDENTITY PRIMARY KEY,
    customer_id INT NOT NULL,
    order_date DATETIME2 DEFAULT SYSUTCDATETIME(),
    total_amount DECIMAL(12, 2),
    status VARCHAR(20) DEFAULT 'pending',
    updated_at DATETIME2 DEFAULT SYSUTCDATETIME()
);

CREATE TABLE sales.order_items (
    item_id BIGINT IDENTITY PRIMARY KEY,
    order_id BIGINT REFERENCES sales.orders(order_id),
    product_id INT NOT NULL,
    quantity INT NOT NULL,
    unit_price DECIMAL(10, 2) NOT NULL
);

-- Enable change tracking for mirroring
-- (change tracking must be turned on at the database level before the table level)
ALTER DATABASE CURRENT
SET CHANGE_TRACKING = ON (CHANGE_RETENTION = 2 DAYS, AUTO_CLEANUP = ON);

ALTER TABLE sales.orders ENABLE CHANGE_TRACKING;
ALTER TABLE sales.order_items ENABLE CHANGE_TRACKING;
"""

# Step 2: Event Stream Processing
event_stream_kql = """
// Real-time user events in KQL Database
.create table UserEvents (
    EventId: guid,
    UserId: string,
    EventType: string,
    Properties: dynamic,
    Timestamp: datetime
)

// Create materialized view for aggregation
.create materialized-view UserEventSummary on table UserEvents
{
    UserEvents
    | summarize
        EventCount = count(),
        LastEvent = max(Timestamp)
        by UserId, EventType, bin(Timestamp, 1h)
}
"""

# Step 3: Lakehouse Transformations
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

def create_unified_analytics():
    """Create unified analytics layer"""

    # Read mirrored transactional data
    orders_df = spark.read.format("delta").load(
        "abfss://workspace@onelake.dfs.fabric.microsoft.com/"
        "SalesDB.Database/Tables/sales/orders"
    )

    order_items_df = spark.read.format("delta").load(
        "abfss://workspace@onelake.dfs.fabric.microsoft.com/"
        "SalesDB.Database/Tables/sales/order_items"
    )

    # Read real-time event summaries (exported from KQL)
    user_events_df = spark.read.format("delta").load(
        "abfss://workspace@onelake.dfs.fabric.microsoft.com/"
        "EventsDB.Database/Tables/UserEventSummary"
    )

    # Pre-aggregate events per user so the later join does not inflate order metrics
    events_per_user = user_events_df.groupBy("UserId").agg(
        F.sum("EventCount").alias("total_events")
    )

    # Aggregate order metrics per customer
    order_metrics = orders_df.groupBy("customer_id").agg(
        F.sum("total_amount").alias("lifetime_value"),
        F.count("order_id").alias("total_orders"),
        F.max("order_date").alias("last_order")
    )

    # Create unified customer view (left join keeps customers with no events;
    # the cast aligns the INT customer_id with the string UserId from KQL)
    unified_customer = order_metrics.join(
        events_per_user,
        order_metrics.customer_id.cast("string") == events_per_user.UserId,
        "left"
    ).drop("UserId")

    # Save to curated layer
    spark.sql("CREATE SCHEMA IF NOT EXISTS curated")
    unified_customer.write.format("delta").mode("overwrite").saveAsTable(
        "curated.unified_customer_profile"
    )
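
In a Fabric notebook the function can be called directly, and the curated table spot-checked straight away:

# Run the transformation and spot-check the curated output
create_unified_analytics()
spark.table("curated.unified_customer_profile").show(5, truncate=False)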

# Step 4: Semantic Model
semantic_model_definition = {
    "name": "UnifiedSalesModel",
    "tables": [
        {
            "name": "Orders",
            "source": "curated.unified_customer_profile",
            "measures": [
                {"name": "Total Revenue", "expression": "SUM([lifetime_value])"},
                {"name": "Avg Order Value", "expression": "DIVIDE([Total Revenue], SUM([total_orders]))"}
            ]
        }
    ],
    "relationships": []
}
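
In the deployed semantic model these expressions become DAX measures. Fully qualified, they would look roughly like this (the table name is taken from the definition above):

# Hypothetical DAX equivalents of the measures defined above
dax_measures = """
Total Revenue = SUM ( 'CustomerProfile'[lifetime_value] )
Avg Order Value = DIVIDE ( [Total Revenue], SUM ( 'CustomerProfile'[total_orders] ) )
"""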

Cross-Database Queries

-- Query across transactional and analytical data
-- Using T-SQL with external tables

-- Create external data source for Lakehouse
CREATE EXTERNAL DATA SOURCE LakehouseData
WITH (
    LOCATION = 'abfss://workspace@onelake.dfs.fabric.microsoft.com/Lakehouse.Lakehouse'
);

-- Define the Delta file format referenced by the external table
CREATE EXTERNAL FILE FORMAT DeltaFormat
WITH (
    FORMAT_TYPE = DELTA
);

-- Create external table for curated data
CREATE EXTERNAL TABLE curated.customer_segments (
    customer_id INT,
    segment VARCHAR(50),
    score DECIMAL(5, 2)
)
WITH (
    LOCATION = '/Tables/customer_segments',
    DATA_SOURCE = LakehouseData,
    FILE_FORMAT = DeltaFormat
);

-- Join operational and analytical data
SELECT
    o.order_id,
    o.customer_id,
    c.segment,
    c.score as customer_score,
    o.total_amount,
    o.order_date
FROM sales.orders o
JOIN curated.customer_segments c ON o.customer_id = c.customer_id
WHERE o.order_date >= DATEADD(day, -7, GETUTCDATE())
    AND c.segment = 'high_value';
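
The same join can be run from an application or notebook over the SQL connection string. A minimal sketch with pyodbc; the server and database names are placeholders you would swap for your own endpoint details:

import pyodbc

# Placeholder connection details for the Fabric SQL endpoint
conn_str = (
    "Driver={ODBC Driver 18 for SQL Server};"
    "Server=<your-sql-endpoint>.datawarehouse.fabric.microsoft.com;"
    "Database=<your-database>;"
    "Authentication=ActiveDirectoryInteractive;"
    "Encrypt=yes;"
)

query = """
SELECT TOP 10 o.order_id, o.customer_id, c.segment, o.total_amount
FROM sales.orders o
JOIN curated.customer_segments c ON o.customer_id = c.customer_id
WHERE c.segment = 'high_value'
ORDER BY o.order_date DESC;
"""

with pyodbc.connect(conn_str) as conn:
    for row in conn.cursor().execute(query):
        print(row.order_id, row.segment, row.total_amount)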

Orchestrating Cross-Workload Pipelines

# Fabric Data Pipeline for unified processing
pipeline_definition = {
    "name": "UnifiedDataPipeline",
    "activities": [
        {
            "name": "SyncTransactionalData",
            "type": "Copy",
            "description": "Ensure mirroring is current",
            "dependsOn": [],
            "policy": {"timeout": "0.01:00:00"}
        },
        {
            "name": "ProcessRealTimeEvents",
            "type": "KQLCommand",
            "description": "Run KQL aggregations",
            "dependsOn": [],
            "command": ".refresh materialized-view UserEventSummary"
        },
        {
            "name": "CreateUnifiedView",
            "type": "Notebook",
            "description": "Build unified analytics",
            "dependsOn": ["SyncTransactionalData", "ProcessRealTimeEvents"],
            "notebook": "CreateUnifiedAnalytics"
        },
        {
            "name": "RefreshSemanticModel",
            "type": "SemanticModelRefresh",
            "description": "Refresh Power BI dataset",
            "dependsOn": ["CreateUnifiedView"],
            "dataset": "UnifiedSalesModel"
        }
    ],
    "schedule": {
        "frequency": "Hour",
        "interval": 1
    }
}
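
To trigger the pipeline outside its hourly schedule, the Fabric REST API's job scheduler endpoint can be called on demand. A minimal sketch; the workspace ID, pipeline item ID, and token are placeholders you supply, and the request shape follows the public Fabric REST API:

import requests

workspace_id = "<workspace-guid>"          # placeholder
pipeline_item_id = "<pipeline-item-guid>"  # placeholder
token = "<aad-access-token>"               # e.g. acquired via azure-identity

# Queue an on-demand run of the pipeline
response = requests.post(
    f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}"
    f"/items/{pipeline_item_id}/jobs/instances?jobType=Pipeline",
    headers={"Authorization": f"Bearer {token}"},
)
response.raise_for_status()  # 202 Accepted means the run was queued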

Benefits of Unified Architecture

UNIFIED_BENEFITS = {
    "data_freshness": {
        "before": "Hours to days for ETL",
        "after": "Seconds to minutes with auto-mirroring"
    },
    "complexity": {
        "before": "Multiple systems, ETL pipelines, data copies",
        "after": "Single platform, automatic sync, shared storage"
    },
    "governance": {
        "before": "Fragmented across systems",
        "after": "Unified with Purview integration"
    },
    "cost": {
        "before": "Multiple compute, storage, licensing",
        "after": "Consolidated capacity, single billing"
    },
    "agility": {
        "before": "Weeks to build new analytics",
        "after": "Hours with mirrored data access"
    }
}

Unifying transactional and analytical workloads in Fabric eliminates data silos, reduces latency, and simplifies architecture. Design with both workloads in mind from the start.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.