
Introduction to Azure Synapse Analytics

Azure Synapse Analytics brings together data warehousing and big data analytics in a single, unified service. As organizations digitize more of their operations and data volumes keep growing, Synapse provides one platform for storing, preparing, and querying that data at scale. Here is an introduction to its core capabilities.

What is Azure Synapse?

Azure Synapse combines:

  • Dedicated SQL pools - MPP data warehouse (formerly SQL DW)
  • Serverless SQL pools - Query data lake without loading
  • Spark pools - Big data processing
  • Pipelines - Data integration (built on Data Factory)
  • Studio - Unified workspace

Creating a Synapse Workspace

# Create a storage account for the data lake
az storage account create \
    --name synapsedatalake2020 \
    --resource-group rg-analytics \
    --location australiaeast \
    --sku Standard_LRS \
    --kind StorageV2 \
    --enable-hierarchical-namespace true

# Create the Synapse workspace
az synapse workspace create \
    --name synapse-analytics-2020 \
    --resource-group rg-analytics \
    --storage-account synapsedatalake2020 \
    --file-system synapse-fs \
    --sql-admin-login-user sqladmin \
    --sql-admin-login-password 'YourSecurePassword123!' \
    --location australiaeast

Dedicated SQL Pool

Create a dedicated pool for data warehousing:

# Create a dedicated SQL pool
az synapse sql pool create \
    --name dwpool \
    --workspace-name synapse-analytics-2020 \
    --resource-group rg-analytics \
    --performance-level DW100c

Creating Tables

-- Create a dimension table
CREATE TABLE dbo.DimCustomer
(
    CustomerKey INT NOT NULL,
    CustomerName NVARCHAR(100) NOT NULL,
    Email NVARCHAR(255),
    Country NVARCHAR(50),
    CreatedDate DATE
)
WITH
(
    DISTRIBUTION = REPLICATE,
    CLUSTERED COLUMNSTORE INDEX
);

-- Create a fact table
CREATE TABLE dbo.FactSales
(
    SalesKey BIGINT NOT NULL,
    CustomerKey INT NOT NULL,
    ProductKey INT NOT NULL,
    OrderDate DATE NOT NULL,
    Quantity INT NOT NULL,
    UnitPrice DECIMAL(10, 2) NOT NULL,
    TotalAmount DECIMAL(10, 2) NOT NULL
)
WITH
(
    DISTRIBUTION = HASH(CustomerKey),
    CLUSTERED COLUMNSTORE INDEX,
    PARTITION (OrderDate RANGE RIGHT FOR VALUES
        ('2020-01-01', '2020-04-01', '2020-07-01', '2020-10-01'))
);
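
REPLICATE suits a small dimension that is joined frequently, while HASH(CustomerKey) spreads the larger fact table across the pool's 60 distributions. If you want to confirm how a table ended up distributed, the catalog views expose it; a quick check, assuming the two tables above:

-- Check the distribution policy of the new tables
SELECT t.name AS table_name, dp.distribution_policy_desc
FROM sys.tables AS t
JOIN sys.pdw_table_distribution_properties AS dp
    ON dp.object_id = t.object_id
WHERE t.name IN ('DimCustomer', 'FactSales');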

Loading Data with COPY

-- Load data from data lake
COPY INTO dbo.FactSales
FROM 'https://synapsedatalake2020.dfs.core.windows.net/raw/sales/*.parquet'
WITH (
    FILE_TYPE = 'PARQUET',
    CREDENTIAL = (IDENTITY = 'Managed Identity')
);

-- Load from CSV
COPY INTO dbo.DimCustomer
FROM 'https://synapsedatalake2020.dfs.core.windows.net/raw/customers/*.csv'
WITH (
    FILE_TYPE = 'CSV',
    FIRSTROW = 2,
    FIELDTERMINATOR = ',',
    CREDENTIAL = (IDENTITY = 'Managed Identity')
);
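
After a bulk load it helps to create statistics on the columns used in joins and filters, so the distributed query optimizer has accurate row estimates, and to check how evenly the rows landed across distributions. A minimal sketch (the statistics names are arbitrary):

-- Statistics on common join/filter columns
CREATE STATISTICS stat_FactSales_CustomerKey ON dbo.FactSales (CustomerKey);
CREATE STATISTICS stat_FactSales_OrderDate ON dbo.FactSales (OrderDate);

-- Row and space usage per distribution; large imbalances indicate skew
DBCC PDW_SHOWSPACEUSED('dbo.FactSales');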

Serverless SQL Pool

Query the data lake directly, without loading the data first:

-- Query Parquet files
SELECT
    CustomerName,
    SUM(TotalAmount) as TotalRevenue,
    COUNT(*) as OrderCount
FROM OPENROWSET(
    BULK 'https://synapsedatalake2020.dfs.core.windows.net/raw/sales/2020/*.parquet',
    FORMAT = 'PARQUET'
) AS sales
JOIN OPENROWSET(
    BULK 'https://synapsedatalake2020.dfs.core.windows.net/raw/customers/*.parquet',
    FORMAT = 'PARQUET'
) AS customers ON sales.CustomerKey = customers.CustomerKey
GROUP BY CustomerName
ORDER BY TotalRevenue DESC;
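
For ad-hoc queries like the one above, serverless SQL infers the schema from the Parquet files. On larger datasets it is usually better to declare the columns you need with a WITH clause, so only those columns are read and the types are explicit. A sketch against the same sales path:

-- Explicit schema: read only the columns the query needs
SELECT CustomerKey, SUM(TotalAmount) AS TotalRevenue
FROM OPENROWSET(
    BULK 'https://synapsedatalake2020.dfs.core.windows.net/raw/sales/2020/*.parquet',
    FORMAT = 'PARQUET'
) WITH (
    CustomerKey INT,
    TotalAmount DECIMAL(10, 2)
) AS sales
GROUP BY CustomerKey;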

-- Create external table for repeated queries
CREATE EXTERNAL DATA SOURCE DataLake
WITH (
    LOCATION = 'https://synapsedatalake2020.dfs.core.windows.net/raw'
);

CREATE EXTERNAL FILE FORMAT ParquetFormat
WITH (
    FORMAT_TYPE = PARQUET,
    DATA_COMPRESSION = 'org.apache.hadoop.io.compress.SnappyCodec'
);

CREATE EXTERNAL TABLE dbo.ExternalSales
WITH (
    LOCATION = 'sales/',
    DATA_SOURCE = DataLake,
    FILE_FORMAT = ParquetFormat
)
AS SELECT * FROM OPENROWSET(
    BULK 'https://synapsedatalake2020.dfs.core.windows.net/raw/sales/*.parquet',
    FORMAT = 'PARQUET'
) AS sales;
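
The CETAS statement both exports the result set to the data lake and registers dbo.ExternalSales, so later queries can treat it like an ordinary table:

-- The external table can now be queried like any other table
SELECT TOP 10 * FROM dbo.ExternalSales;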

Spark Pools

Create and use Spark for big data:

# Create a Spark pool
az synapse spark pool create \
    --name sparkpool \
    --workspace-name synapse-analytics-2020 \
    --resource-group rg-analytics \
    --spark-version 2.4 \
    --node-count 3 \
    --node-size Medium

PySpark Notebook

# Read data from data lake
df = spark.read.parquet("abfss://synapse-fs@synapsedatalake2020.dfs.core.windows.net/raw/sales/")

# Transform data
from pyspark.sql.functions import col, sum, count, avg

sales_summary = df \
    .groupBy("CustomerKey", "ProductKey") \
    .agg(
        sum("TotalAmount").alias("TotalRevenue"),
        count("*").alias("TransactionCount"),
        avg("Quantity").alias("AvgQuantity")
    )

# Write to data lake
sales_summary.write \
    .mode("overwrite") \
    .parquet("abfss://synapse-fs@synapsedatalake2020.dfs.core.windows.net/processed/sales_summary/")

# Write to the dedicated SQL pool using the Spark SQL DW connector
# (assumes jdbc_url holds the dedicated pool's JDBC connection string;
# Synapse notebooks also include a built-in dedicated SQL pool connector)
sales_summary.write \
    .format("com.databricks.spark.sqldw") \
    .option("url", jdbc_url) \
    .option("tempDir", "abfss://synapse-fs@synapsedatalake2020.dfs.core.windows.net/temp/") \
    .option("forwardSparkAzureStorageCredentials", "true") \
    .option("dbTable", "dbo.SalesSummary") \
    .mode("overwrite") \
    .save()
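
Because the summary is also written back to the data lake as Parquet, it is immediately queryable from the serverless SQL pool (assuming the caller has access to the storage account). For example, pointing OPENROWSET at the processed path used above:

-- Query the Spark output from the serverless SQL pool
SELECT TOP 10 *
FROM OPENROWSET(
    BULK 'https://synapsedatalake2020.dfs.core.windows.net/synapse-fs/processed/sales_summary/*.parquet',
    FORMAT = 'PARQUET'
) AS summary;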

Data Pipelines

Create pipelines for data movement:

{
    "name": "DailyDataIngestion",
    "properties": {
        "activities": [
            {
                "name": "CopyFromSource",
                "type": "Copy",
                "inputs": [
                    {
                        "referenceName": "SourceDataset",
                        "type": "DatasetReference"
                    }
                ],
                "outputs": [
                    {
                        "referenceName": "DataLakeDataset",
                        "type": "DatasetReference"
                    }
                ],
                "typeProperties": {
                    "source": {
                        "type": "SqlSource",
                        "sqlReaderQuery": "SELECT * FROM Sales WHERE ModifiedDate >= '@{pipeline().parameters.startDate}'"
                    },
                    "sink": {
                        "type": "ParquetSink"
                    }
                }
            },
            {
                "name": "TransformWithSpark",
                "type": "SynapseNotebook",
                "dependsOn": [
                    {
                        "activity": "CopyFromSource",
                        "dependencyConditions": ["Succeeded"]
                    }
                ],
                "typeProperties": {
                    "notebook": {
                        "referenceName": "TransformSalesData",
                        "type": "NotebookReference"
                    },
                    "parameters": {
                        "inputPath": {
                            "value": "@pipeline().parameters.dataPath",
                            "type": "string"
                        }
                    }
                }
            }
        ]
    }
}

Security

-- Column-level security
CREATE USER DataAnalyst FROM EXTERNAL PROVIDER;
GRANT SELECT ON dbo.FactSales(OrderDate, Quantity, ProductKey) TO DataAnalyst;
DENY SELECT ON dbo.FactSales(UnitPrice, TotalAmount) TO DataAnalyst;

-- Row-level security
CREATE FUNCTION dbo.fn_SecurityPredicate(@Country AS NVARCHAR(50))
RETURNS TABLE
WITH SCHEMABINDING
AS
RETURN SELECT 1 AS result
WHERE @Country = USER_NAME() OR USER_NAME() = 'admin';

CREATE SECURITY POLICY SalesFilter
ADD FILTER PREDICATE dbo.fn_SecurityPredicate(Country) ON dbo.DimCustomer;
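
Dedicated SQL pools also support dynamic data masking, which hides sensitive values from non-privileged users at query time without changing the stored data. A minimal sketch on the customer email column defined earlier:

-- Mask email addresses for non-privileged users; stored data is unchanged
ALTER TABLE dbo.DimCustomer
    ALTER COLUMN Email ADD MASKED WITH (FUNCTION = 'email()');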

Azure Synapse Analytics provides a unified platform for all your analytics needs, from data warehousing to big data processing.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.