March 9, 2021 1 min read

Azure Synapse Dedicated SQL Pools for Enterprise Data Warehousing

Azure Synapse SQL Data Warehouse Analytics

Azure Synapse dedicated SQL pools (formerly SQL Data Warehouse) provide enterprise-grade data warehousing with massively parallel processing (MPP) architecture. In this post, I’ll cover key concepts and best practices for building performant analytical solutions.

Understanding MPP Architecture

Dedicated SQL pools distribute data across 60 distributions for parallel processing:

Control node: Orchestrates queries and manages metadata
Compute nodes: Execute queries in parallel
Distributions: Data partitions spread across compute nodes

Creating a Dedicated SQL Pool

# Create Synapse workspace first
az synapse workspace create \
    --name mysynapseworkspace \
    --resource-group myResourceGroup \
    --storage-account mystorageaccount \
    --file-system synapse-fs \
    --sql-admin-login-user sqladmin \
    --sql-admin-login-password 'ComplexPassword123!' \
    --location eastus

# Create dedicated SQL pool
az synapse sql pool create \
    --name mydedicatedpool \
    --workspace-name mysynapseworkspace \
    --resource-group myResourceGroup \
    --performance-level DW100c

Table Distribution Strategies

Hash Distribution

Best for large fact tables with high cardinality columns:

-- Hash distributed fact table
CREATE TABLE dbo.FactSales
(
    SalesKey BIGINT NOT NULL,
    DateKey INT NOT NULL,
    ProductKey INT NOT NULL,
    CustomerKey INT NOT NULL,
    StoreKey INT NOT NULL,
    Quantity INT,
    UnitPrice DECIMAL(18,2),
    TotalAmount DECIMAL(18,2),
    DiscountAmount DECIMAL(18,2)
)
WITH
(
    DISTRIBUTION = HASH(CustomerKey),
    CLUSTERED COLUMNSTORE INDEX,
    PARTITION (DateKey RANGE RIGHT FOR VALUES
        (20210101, 20210201, 20210301, 20210401, 20210501, 20210601,
         20210701, 20210801, 20210901, 20211001, 20211101, 20211201))
);

Replicated Tables

Best for small dimension tables frequently joined:

-- Replicated dimension table
CREATE TABLE dbo.DimProduct
(
    ProductKey INT NOT NULL,
    ProductName NVARCHAR(100),
    Category NVARCHAR(50),
    SubCategory NVARCHAR(50),
    Brand NVARCHAR(50),
    Color NVARCHAR(20),
    Size NVARCHAR(10),
    UnitCost DECIMAL(18,2),
    UnitPrice DECIMAL(18,2)
)
WITH
(
    DISTRIBUTION = REPLICATE,
    CLUSTERED COLUMNSTORE INDEX
);

-- Customer dimension
CREATE TABLE dbo.DimCustomer
(
    CustomerKey INT NOT NULL,
    CustomerName NVARCHAR(100),
    Email NVARCHAR(255),
    Phone NVARCHAR(20),
    Address NVARCHAR(200),
    City NVARCHAR(50),
    State NVARCHAR(50),
    Country NVARCHAR(50),
    Segment NVARCHAR(20)
)
WITH
(
    DISTRIBUTION = REPLICATE,
    CLUSTERED COLUMNSTORE INDEX
);

Round-Robin Distribution

Best for staging tables or when no clear distribution key:

-- Staging table with round-robin
CREATE TABLE staging.SalesStaging
(
    TransactionId NVARCHAR(50),
    TransactionDate DATETIME,
    ProductId NVARCHAR(20),
    CustomerId NVARCHAR(20),
    Quantity INT,
    Amount DECIMAL(18,2)
)
WITH
(
    DISTRIBUTION = ROUND_ROBIN,
    HEAP
);

Loading Data with COPY Command

-- Load from Azure Data Lake using COPY
COPY INTO staging.SalesStaging
FROM 'https://mystorageaccount.dfs.core.windows.net/data/sales/*.parquet'
WITH (
    FILE_TYPE = 'PARQUET',
    CREDENTIAL = (IDENTITY = 'Managed Identity')
);

-- Load CSV files
COPY INTO staging.SalesStaging
FROM 'https://mystorageaccount.blob.core.windows.net/data/sales/'
WITH (
    FILE_TYPE = 'CSV',
    FIELDTERMINATOR = ',',
    ROWTERMINATOR = '\n',
    FIRSTROW = 2,
    CREDENTIAL = (IDENTITY = 'Shared Access Signature', SECRET = '<sas-token>')
);

-- Transform and load into fact table
INSERT INTO dbo.FactSales
SELECT
    ROW_NUMBER() OVER (ORDER BY s.TransactionDate) AS SalesKey,
    CONVERT(INT, FORMAT(s.TransactionDate, 'yyyyMMdd')) AS DateKey,
    COALESCE(p.ProductKey, -1) AS ProductKey,
    COALESCE(c.CustomerKey, -1) AS CustomerKey,
    -1 AS StoreKey,
    s.Quantity,
    s.Amount / NULLIF(s.Quantity, 0) AS UnitPrice,
    s.Amount AS TotalAmount,
    0 AS DiscountAmount
FROM staging.SalesStaging s
LEFT JOIN dbo.DimProduct p ON s.ProductId = p.ProductId
LEFT JOIN dbo.DimCustomer c ON s.CustomerId = c.CustomerId;

-- Clean up staging
TRUNCATE TABLE staging.SalesStaging;

Query Optimization

Statistics Management

-- Create statistics on key columns
CREATE STATISTICS stat_DateKey ON dbo.FactSales(DateKey);
CREATE STATISTICS stat_CustomerKey ON dbo.FactSales(CustomerKey);
CREATE STATISTICS stat_ProductKey ON dbo.FactSales(ProductKey);

-- Update statistics
UPDATE STATISTICS dbo.FactSales;

-- Auto-create statistics (recommended)
ALTER DATABASE myDatabase
SET AUTO_CREATE_STATISTICS ON;

Result Set Caching

-- Enable result set caching
ALTER DATABASE myDatabase
SET RESULT_SET_CACHING ON;

-- Check cache usage
SELECT
    request_id,
    command,
    result_set_cache
FROM sys.dm_pdw_exec_requests
WHERE result_set_cache = 1
ORDER BY start_time DESC;

Materialized Views

-- Create materialized view for common aggregations
CREATE MATERIALIZED VIEW dbo.vw_MonthlySalesByProduct
WITH (DISTRIBUTION = HASH(ProductKey))
AS
SELECT
    p.Category,
    p.SubCategory,
    p.ProductKey,
    p.ProductName,
    YEAR(d.DateKey) AS SalesYear,
    MONTH(d.DateKey) AS SalesMonth,
    SUM(f.Quantity) AS TotalQuantity,
    SUM(f.TotalAmount) AS TotalRevenue,
    COUNT_BIG(*) AS TransactionCount
FROM dbo.FactSales f
JOIN dbo.DimProduct p ON f.ProductKey = p.ProductKey
JOIN dbo.DimDate d ON f.DateKey = d.DateKey
GROUP BY
    p.Category, p.SubCategory, p.ProductKey, p.ProductName,
    YEAR(d.DateKey), MONTH(d.DateKey);

-- Query uses materialized view automatically
SELECT
    Category,
    SalesYear,
    SalesMonth,
    SUM(TotalRevenue) AS Revenue
FROM dbo.vw_MonthlySalesByProduct
WHERE SalesYear = 2021
GROUP BY Category, SalesYear, SalesMonth
ORDER BY Category, SalesYear, SalesMonth;

Workload Management

-- Create workload classifier for reporting users
CREATE WORKLOAD CLASSIFIER ReportingUsers
WITH (
    WORKLOAD_GROUP = 'LargeQueries',
    MEMBERNAME = 'ReportingRole',
    IMPORTANCE = Normal
);

-- Create workload group with resource allocation
CREATE WORKLOAD GROUP LargeQueries
WITH (
    MIN_PERCENTAGE_RESOURCE = 20,
    CAP_PERCENTAGE_RESOURCE = 40,
    REQUEST_MIN_RESOURCE_GRANT_PERCENT = 10,
    REQUEST_MAX_RESOURCE_GRANT_PERCENT = 25
);

-- Monitor workload
SELECT
    wg.name AS workload_group,
    wc.name AS workload_classifier,
    r.request_id,
    r.status,
    r.importance,
    r.start_time,
    r.total_elapsed_time
FROM sys.dm_pdw_exec_requests r
JOIN sys.workload_management_workload_classifiers wc
    ON r.classifier_name = wc.name
JOIN sys.workload_management_workload_groups wg
    ON wc.group_name = wg.name
ORDER BY r.start_time DESC;

Pause and Scale

# Pause to save costs (no compute charges when paused)
az synapse sql pool pause \
    --name mydedicatedpool \
    --workspace-name mysynapseworkspace \
    --resource-group myResourceGroup

# Resume when needed
az synapse sql pool resume \
    --name mydedicatedpool \
    --workspace-name mysynapseworkspace \
    --resource-group myResourceGroup

# Scale up for heavy workloads
az synapse sql pool update \
    --name mydedicatedpool \
    --workspace-name mysynapseworkspace \
    --resource-group myResourceGroup \
    --performance-level DW500c

# Scale down for cost optimization
az synapse sql pool update \
    --name mydedicatedpool \
    --workspace-name mysynapseworkspace \
    --resource-group myResourceGroup \
    --performance-level DW100c

Monitoring Queries

-- Find long-running queries
SELECT
    request_id,
    submit_time,
    start_time,
    end_time,
    total_elapsed_time / 1000 AS elapsed_seconds,
    status,
    command
FROM sys.dm_pdw_exec_requests
WHERE status = 'Running'
   OR total_elapsed_time > 60000
ORDER BY total_elapsed_time DESC;

-- Analyze query execution steps
SELECT
    request_id,
    step_index,
    operation_type,
    distribution_type,
    location_type,
    status,
    total_elapsed_time,
    row_count,
    command
FROM sys.dm_pdw_request_steps
WHERE request_id = '<request_id>'
ORDER BY step_index;

-- Check data movement
SELECT
    request_id,
    step_index,
    operation_type,
    total_elapsed_time,
    row_count
FROM sys.dm_pdw_request_steps
WHERE operation_type LIKE '%Move%'
ORDER BY total_elapsed_time DESC;

Conclusion

Azure Synapse dedicated SQL pools provide enterprise-grade data warehousing capabilities:

MPP architecture for massive scale and performance
Flexible distribution strategies for optimized joins
Advanced features like materialized views and result caching
Cost management through pause/resume and scaling

Key success factors:

Choose appropriate distribution keys based on join patterns
Use replicated tables for small dimensions
Leverage materialized views for common aggregations
Monitor and optimize data movement operations
Scale dynamically based on workload demands