5 min read
Fabric Warehouse: Cloud-Native SQL Analytics
The Fabric Warehouse provides a cloud-native SQL analytics experience with full T-SQL support. Today we’ll explore its capabilities and how it differs from traditional data warehouses.
Fabric Warehouse Overview
# Characteristics of the Fabric Warehouse engine.
fabric_warehouse = dict(
    engine="Distributed SQL engine",
    storage="OneLake (Delta format)",
    language="T-SQL",
    distribution="Automatic (no manual configuration)",
    indexing="Automatic (no manual indexes)",
    scaling="Automatic within capacity",
)

# How the Warehouse differs from a Synapse Dedicated SQL Pool.
differences = dict(
    no_distribution_keys="Automatic data distribution",
    no_indexes="Automatic indexing/statistics",
    no_cluster_management="Serverless-like experience",
    no_pause_resume="Always available",
    delta_format="Uses Delta Lake, not columnstore",
)
Creating a Warehouse
-- In Fabric portal:
-- 1. Navigate to workspace
-- 2. Click "+ New" > "Warehouse"
-- 3. Name your warehouse
-- Warehouse naming convention:
-- {domain}_{purpose}_wh
-- Example: sales_analytics_wh, finance_reporting_wh
Creating Tables
-- Create a basic table
-- Customer dimension: one row per customer. Fabric Warehouse distributes and
-- indexes data automatically, so the DDL carries no DISTRIBUTION/INDEX clauses.
CREATE TABLE dim_customer (
    customer_key INT NOT NULL,          -- surrogate key (generated upstream; no IDENTITY support)
    customer_id NVARCHAR(50) NOT NULL,  -- business key from the source system
    customer_name NVARCHAR(200),
    email NVARCHAR(200),
    segment NVARCHAR(50),
    city NVARCHAR(100),
    state NVARCHAR(50),
    country NVARCHAR(50),
    created_date DATE,
    modified_date DATETIME2,
    -- NOT ENFORCED keys are metadata-only in Fabric Warehouse: they document
    -- the table grain and can help the optimizer, but uniqueness must be
    -- guaranteed by the loading process.
    CONSTRAINT dim_customer_customer_key_pk
        PRIMARY KEY NONCLUSTERED (customer_key) NOT ENFORCED
);
-- Create table with constraints
-- Sales fact table at order-line grain; *_key columns reference the dimensions.
CREATE TABLE fact_sales (
sales_key BIGINT NOT NULL,
order_id NVARCHAR(50) NOT NULL,
customer_key INT NOT NULL,
product_key INT NOT NULL,
-- date_key is an integer yyyyMMdd value (see the INSERT examples below)
date_key INT NOT NULL,
quantity INT NOT NULL,
unit_price DECIMAL(18, 2) NOT NULL,
-- discount expressed as a percentage (0-100), not a fraction
discount_percent DECIMAL(5, 2) DEFAULT 0,
-- NOTE(review): verify computed columns (and PERSISTED) are in the current
-- Fabric Warehouse T-SQL surface area -- they have been listed as unsupported;
-- if so, compute total_amount during the load instead.
total_amount AS (quantity * unit_price * (1 - discount_percent / 100)) PERSISTED,
-- UTC load timestamp, applied automatically when the column is omitted
created_at DATETIME2 DEFAULT GETUTCDATE()
);
-- Note: No IDENTITY columns in Fabric Warehouse yet
-- Generate keys using sequences or Spark
Loading Data
Using COPY Statement
-- COPY INTO is the fastest way to bulk-load data into a Warehouse table.
COPY INTO dim_customer
FROM 'abfss://workspace@onelake.dfs.fabric.microsoft.com/lakehouse/Tables/customers'
WITH (
FILE_TYPE = 'PARQUET'
);
-- Copy from CSV; FIRSTROW = 2 skips the header row.
COPY INTO staging_orders
FROM 'abfss://workspace@onelake.dfs.fabric.microsoft.com/lakehouse/Files/raw/*.csv'
WITH (
FILE_TYPE = 'CSV',
FIRSTROW = 2,
FIELDTERMINATOR = ',',
ROWTERMINATOR = '\n'
);
-- Copy with column mapping
-- NOTE(review): COLUMN_MAPPING is not among the documented COPY INTO options
-- (FILE_TYPE, CREDENTIAL, FIRSTROW, FIELDTERMINATOR, ...); confirm against the
-- COPY INTO reference -- mapping is normally positional against the column list.
COPY INTO dim_product (product_key, product_name, category)
FROM 'abfss://path/to/files/*.parquet'
WITH (
FILE_TYPE = 'PARQUET',
COLUMN_MAPPING = 'product_key=id, product_name=name, category=category_name'
);
Using INSERT…SELECT
-- Insert from another table.
-- (is_current implies an SCD2-style dim_customer with history columns not
-- shown in the DDL above; in production, list columns explicitly, not SELECT *.)
INSERT INTO dim_customer_current
SELECT * FROM dim_customer
WHERE is_current = 1;

-- Insert from Lakehouse (cross-database query).
-- The explicit column list is required here: fact_sales also has a computed
-- column (total_amount) and a defaulted column (created_at) that must not be
-- assigned, so an INSERT without a column list fails on column count.
INSERT INTO warehouse.dbo.fact_sales (
    sales_key, order_id, customer_key, product_key,
    date_key, quantity, unit_price, discount_percent
)
SELECT
    -- ROW_NUMBER only numbers this batch; for repeated loads generate
    -- surrogate keys from a sequence or in Spark to keep them unique.
    ROW_NUMBER() OVER (ORDER BY order_id) AS sales_key,
    order_id,
    customer_key,
    product_key,
    -- yyyyMMdd integer date key; CONVERT with style 112 is far cheaper than
    -- FORMAT(), which is .NET-based and slow in set-based queries.
    CONVERT(INT, CONVERT(CHAR(8), order_date, 112)) AS date_key,
    quantity,
    unit_price,
    discount_percent
FROM lakehouse.dbo.orders;
Querying Data
Basic Queries
-- Revenue and order KPIs per customer segment, 2023 onwards.
SELECT
    cust.segment,
    COUNT(DISTINCT fs.customer_key) AS unique_customers,
    COUNT(*) AS total_orders,
    SUM(fs.total_amount) AS total_revenue,
    AVG(fs.total_amount) AS avg_order_value
FROM fact_sales AS fs
INNER JOIN dim_customer AS cust
    ON fs.customer_key = cust.customer_key
WHERE fs.date_key >= 20230101
GROUP BY cust.segment
ORDER BY total_revenue DESC;

-- Per-customer running total plus a most-recent-first order ranking.
SELECT
    customer_key,
    order_id,
    total_amount,
    -- explicit ROWS frame: the default RANGE frame double-counts date_key ties
    SUM(total_amount) OVER (
        PARTITION BY customer_key
        ORDER BY date_key
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS running_total,
    ROW_NUMBER() OVER (
        PARTITION BY customer_key
        ORDER BY date_key DESC
    ) AS order_rank
FROM fact_sales;
CTEs and Subqueries
-- Common Table Expression: monthly revenue per segment.
WITH monthly_sales AS (
    SELECT
        d.year_month,
        c.segment,
        SUM(s.total_amount) AS monthly_revenue
    FROM fact_sales s
    INNER JOIN dim_date d ON s.date_key = d.date_key
    INNER JOIN dim_customer c ON s.customer_key = c.customer_key
    GROUP BY d.year_month, c.segment
),
-- Rank segments within each month by revenue.
segment_ranking AS (
    SELECT
        year_month,
        segment,
        monthly_revenue,
        -- RANK (not ROW_NUMBER) so tied segments share a position;
        -- aliased revenue_rank to avoid colliding with the RANK keyword.
        RANK() OVER (PARTITION BY year_month ORDER BY monthly_revenue DESC) AS revenue_rank
    FROM monthly_sales
)
-- Top 3 segments per month.
SELECT
    year_month,
    segment,
    monthly_revenue,
    revenue_rank
FROM segment_ranking
WHERE revenue_rank <= 3
ORDER BY year_month, revenue_rank;
Cross-Database Queries
-- Query Lakehouse tables from Warehouse
-- Cross-database queries use three-part names: <item>.<schema>.<table>.
SELECT
w.customer_key,
w.customer_name,
l.total_orders,
l.total_spent
FROM warehouse.dbo.dim_customer w
LEFT JOIN lakehouse.dbo.customer_metrics l
ON w.customer_id = l.customer_id;
-- Join across workspaces (using shortcuts)
-- NOTE(review): prefer an explicit column list over SELECT * in real code --
-- a schema change in either item silently changes the result shape.
SELECT *
FROM sales_warehouse.dbo.orders o
JOIN reference_lakehouse.dbo.products p
ON o.product_id = p.product_id;
Views and Stored Procedures
-- Create view
-- Customer 360: one row per customer with order count, lifetime value, and
-- recency. LEFT JOINs keep customers that have never ordered.
CREATE VIEW vw_customer_360 AS
SELECT
    c.customer_key,
    c.customer_name,
    c.segment,
    COUNT(s.sales_key) AS total_orders,                   -- 0 when there are no sales
    COALESCE(SUM(s.total_amount), 0) AS lifetime_value,   -- SUM over no rows is NULL; surface 0
    MAX(d.full_date) AS last_order_date,                  -- NULL when the customer never ordered
    -- GETUTCDATE() keeps the clock consistent with fact_sales.created_at,
    -- which also defaults to UTC.
    DATEDIFF(day, MAX(d.full_date), GETUTCDATE()) AS days_since_last_order
FROM dim_customer c
LEFT JOIN fact_sales s ON c.customer_key = s.customer_key
LEFT JOIN dim_date d ON s.date_key = d.date_key
GROUP BY c.customer_key, c.customer_name, c.segment;
-- Create stored procedure
-- Rebuilds customer_metrics from orders whose date_key falls in
-- [@start_date, @end_date] (inclusive yyyyMMdd integer keys).
-- NOTE(review): TRUNCATE + INSERT is not atomic -- readers can observe an
-- empty table mid-refresh; wrap in a transaction or swap tables if that matters.
CREATE PROCEDURE usp_refresh_customer_metrics
    @start_date DATE,
    @end_date DATE
AS
BEGIN
    -- Empty the target before recomputing.
    TRUNCATE TABLE customer_metrics;

    -- Recompute per-customer order metrics for the requested window.
    INSERT INTO customer_metrics
    SELECT
        customer_key,
        COUNT(*) AS order_count,
        SUM(total_amount) AS total_spent,
        AVG(total_amount) AS avg_order_value
    FROM fact_sales
    -- CONVERT with style 112 (yyyymmdd) is much cheaper than FORMAT(),
    -- which is .NET-based and notoriously slow in set-based queries.
    WHERE date_key BETWEEN
        CONVERT(INT, CONVERT(CHAR(8), @start_date, 112))
        AND CONVERT(INT, CONVERT(CHAR(8), @end_date, 112))
    GROUP BY customer_key;
END;
-- Execute stored procedure
EXEC usp_refresh_customer_metrics '2023-01-01', '2023-12-31';
Performance Considerations
-- Check query statistics with the Query Insights feature in the portal.

-- Performance best practices:
-- 1. Use appropriate data types
-- 2. Filter early in queries
-- 3. Avoid SELECT *
-- 4. Use CTEs for readability
-- 5. Leverage partitioning where possible

-- Example: optimized query pattern. The integer date_key range prunes the
-- fact table before the joins and aggregation run.
SELECT
    cust.segment,
    dt.year_month,
    SUM(f.total_amount) AS revenue
FROM fact_sales AS f
INNER JOIN dim_customer AS cust
    ON f.customer_key = cust.customer_key
INNER JOIN dim_date AS dt
    ON f.date_key = dt.date_key
WHERE f.date_key BETWEEN 20230101 AND 20231231
  AND cust.country = 'USA'
GROUP BY cust.segment, dt.year_month;
Security
-- Row-level security: each user sees only rows for their own region;
-- the 'admin' account sees everything.
-- The function is schema-qualified so it is created exactly where the
-- security policy's two-part reference (dbo.fn_security_predicate) expects it.
CREATE FUNCTION dbo.fn_security_predicate(@region NVARCHAR(50))
RETURNS TABLE
WITH SCHEMABINDING
AS
RETURN SELECT 1 AS result
WHERE @region = USER_NAME() OR USER_NAME() = 'admin';

-- NOTE(review): the fact_sales DDL shown earlier has no region column --
-- confirm the column exists before binding this policy.
CREATE SECURITY POLICY region_filter
ADD FILTER PREDICATE dbo.fn_security_predicate(region)
ON dbo.fact_sales
WITH (STATE = ON);  -- explicit: the filter is active immediately
-- Column-level security via views
-- Expose only non-sensitive columns; grant users SELECT on the view
-- rather than on fact_sales itself.
CREATE VIEW vw_sales_public AS
SELECT
order_id,
customer_key,
product_key,
quantity,
-- Exclude sensitive columns like discount, cost
total_amount
FROM fact_sales;
Tomorrow we’ll dive deeper into T-SQL patterns specific to Fabric.