February 1, 2022 1 min read

Azure Synapse Analytics Updates: What's New in 2022

azure synapse analytics data-engineering

Azure Synapse Analytics continues to evolve as the unified analytics platform. Let’s explore the latest updates and how to leverage new capabilities for your data workloads.

Synapse Link Enhancements

Synapse Link now supports more data sources with improved performance:

-- Create Synapse Link connection to Cosmos DB
CREATE DATABASE SCOPED CREDENTIAL CosmosDBCredential
WITH IDENTITY = 'Cosmos DB Account',
SECRET = 'your-cosmos-key';

CREATE EXTERNAL DATA SOURCE CosmosDBSource
WITH (
    LOCATION = 'https://yourcosmosaccount.documents.azure.com:443/',
    CREDENTIAL = CosmosDBCredential
);

-- Query Cosmos DB data directly
SELECT TOP 100 *
FROM OPENROWSET(
    PROVIDER = 'CosmosDB',
    CONNECTION = 'Account=yourcosmosaccount;Database=salesdb',
    OBJECT = 'orders',
    SERVER_CREDENTIAL = 'CosmosDBCredential'
) AS orders
WHERE orders.orderDate > '2022-01-01';

Serverless SQL Pool Improvements

-- New CETAS (Create External Table As Select) improvements
CREATE EXTERNAL TABLE [dbo].[SalesSummary]
WITH (
    LOCATION = 'curated/sales_summary/',
    DATA_SOURCE = DataLakeStorage,
    FILE_FORMAT = ParquetFormat
)
AS
SELECT
    YEAR(OrderDate) as Year,
    MONTH(OrderDate) as Month,
    ProductCategory,
    SUM(Amount) as TotalSales,
    COUNT(*) as OrderCount
FROM OPENROWSET(
    BULK 'raw/sales/**/*.parquet',
    DATA_SOURCE = 'DataLakeStorage',
    FORMAT = 'PARQUET'
) AS sales
GROUP BY
    YEAR(OrderDate),
    MONTH(OrderDate),
    ProductCategory;

-- Query with partition elimination
SELECT *
FROM [dbo].[SalesSummary]
WHERE Year = 2022 AND Month = 1;

Dedicated SQL Pool Updates

-- Result-set caching improvements
ALTER DATABASE [SynapseDB]
SET RESULT_SET_CACHING ON;

-- Materialized views with automatic refresh
CREATE MATERIALIZED VIEW [dbo].[DailySalesMV]
WITH (DISTRIBUTION = HASH(ProductID))
AS
SELECT
    CAST(OrderDate as DATE) as SaleDate,
    ProductID,
    SUM(Quantity) as TotalQuantity,
    SUM(Amount) as TotalAmount
FROM [dbo].[FactSales]
GROUP BY CAST(OrderDate as DATE), ProductID;

-- Check materialized view freshness
SELECT
    name,
    state_desc,
    is_switched_on,
    percentage_stale_row
FROM sys.pdw_materialized_view_mappings
WHERE name = 'DailySalesMV';

Spark Pool Enhancements

# New Spark 3.2 features in Synapse
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Configure Spark pool
spark = SparkSession.builder \
    .appName("SynapseSparkAnalytics") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .getOrCreate()

# Read from Data Lake with predicate pushdown
df = spark.read \
    .format("delta") \
    .load("abfss://data@storage.dfs.core.windows.net/silver/sales")

# New ANSI SQL mode for better compatibility
spark.conf.set("spark.sql.ansi.enabled", "true")

# Use Synapse Link connector for Cosmos DB
cosmosdf = spark.read \
    .format("cosmos.olap") \
    .option("spark.synapse.linkedService", "CosmosDBLink") \
    .option("spark.cosmos.container", "orders") \
    .load()

# Delta Lake integration improvements
df.write \
    .format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .partitionBy("Year", "Month") \
    .save("abfss://data@storage.dfs.core.windows.net/gold/sales_optimized")

Pipeline Improvements

{
  "name": "IncrementalLoadPipeline",
  "properties": {
    "activities": [
      {
        "name": "GetHighWatermark",
        "type": "Lookup",
        "policy": {
          "timeout": "7.00:00:00",
          "retry": 3
        },
        "typeProperties": {
          "source": {
            "type": "SqlPoolSource",
            "sqlReaderQuery": "SELECT MAX(ModifiedDate) as HighWatermark FROM control.Watermarks WHERE TableName = 'Sales'"
          },
          "dataset": {
            "referenceName": "ControlTable",
            "type": "DatasetReference"
          }
        }
      },
      {
        "name": "CopyIncrementalData",
        "type": "Copy",
        "dependsOn": [
          {
            "activity": "GetHighWatermark",
            "dependencyConditions": ["Succeeded"]
          }
        ],
        "typeProperties": {
          "source": {
            "type": "SqlSource",
            "sqlReaderQuery": {
              "value": "SELECT * FROM dbo.Sales WHERE ModifiedDate > '@{activity('GetHighWatermark').output.firstRow.HighWatermark}'",
              "type": "Expression"
            }
          },
          "sink": {
            "type": "ParquetSink",
            "storeSettings": {
              "type": "AzureBlobFSWriteSettings"
            }
          }
        }
      }
    ]
  }
}

Integrated Monitoring

// Synapse workspace diagnostics
SynapseSqlPoolDms
| where Category == "SqlRequests"
| where Duration > 60000  // Queries over 60 seconds
| project
    TimeGenerated,
    LoginName,
    QueryText = substring(Statement, 0, 200),
    Duration = Duration / 1000,
    ResourceClass
| order by Duration desc
| take 20

Azure Synapse Analytics in 2022 provides a more unified, performant, and feature-rich analytics platform.