Databricks SQL: Analytics Warehousing on the Lakehouse
Databricks SQL provides a serverless SQL analytics experience on top of your lakehouse. It combines the best of data warehousing with the flexibility of a data lake.
What is Databricks SQL?
Databricks SQL offers:
- SQL Warehouses (formerly SQL Endpoints): Elastic compute for SQL queries, available in classic, pro, and serverless flavors
- Query Editor: Web-based SQL IDE
- Dashboards: Built-in visualization and reporting
- Alerts: Automated monitoring and notifications
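To get a feel for the experience before diving into the management APIs, here is a minimal sketch of running a query against a SQL warehouse from Python using the open-source databricks-sql-connector package. The hostname, HTTP path, and token are placeholders you would replace with your own warehouse's connection details.

# Query a SQL warehouse from Python (pip install databricks-sql-connector)
import os

from databricks import sql

with sql.connect(
    server_hostname="adb-xxxx.azuredatabricks.net",        # placeholder workspace host
    http_path="/sql/1.0/warehouses/<warehouse-id>",         # placeholder warehouse HTTP path
    access_token=os.environ["DATABRICKS_TOKEN"],
) as connection:
    with connection.cursor() as cursor:
        cursor.execute("SELECT current_date() AS today")
        print(cursor.fetchall())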
Creating a SQL Warehouse
# Using Databricks REST API
import os
import requests

workspace_url = "https://adb-xxxx.azuredatabricks.net"
token = os.environ["DATABRICKS_TOKEN"]

headers = {
    "Authorization": f"Bearer {token}",
    "Content-Type": "application/json"
}

# Create SQL Warehouse
warehouse_config = {
    "name": "analytics-warehouse",
    "cluster_size": "Medium",
    "min_num_clusters": 1,
    "max_num_clusters": 4,
    "auto_stop_mins": 30,
    "enable_photon": True,
    "enable_serverless_compute": True,
    "warehouse_type": "PRO",
    "tags": {
        "custom_tags": [
            {"key": "department", "value": "analytics"},
            {"key": "cost_center", "value": "CC-123"}
        ]
    }
}

response = requests.post(
    f"{workspace_url}/api/2.0/sql/warehouses",
    headers=headers,
    json=warehouse_config
)
response.raise_for_status()
warehouse = response.json()
print(f"Warehouse created: {warehouse['id']}")
Warehouse Sizing
Choose the right size for your workload:
| Size | Workers | Use Case |
|---|---|---|
| 2X-Small | 1 | Development, testing |
| X-Small | 2 | Light ad-hoc queries |
| Small | 4 | Small team analytics |
| Medium | 8 | Standard analytics |
| Large | 16 | Heavy concurrent usage |
| X-Large | 32 | Enterprise workloads |
| 2X-Large | 64 | Large-scale analytics |
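Sizing is not a one-time decision: you can resize an existing warehouse as query profiles evolve. The sketch below assumes the warehouses /edit endpoint (the same one used in the cost-management section later in this post) and reuses the workspace_url and headers defined above; the warehouse ID and target size are placeholders.

# Resize an existing SQL warehouse (e.g. Small -> Medium) via the edit endpoint.
# Assumes workspace_url and headers from the create-warehouse example above.
def resize_warehouse(warehouse_id, cluster_size):
    """Change the cluster size of an existing SQL warehouse."""
    response = requests.post(
        f"{workspace_url}/api/2.0/sql/warehouses/{warehouse_id}/edit",
        headers=headers,
        json={"cluster_size": cluster_size}
    )
    response.raise_for_status()

# Example: scale up ahead of a heavy reporting window
# resize_warehouse(warehouse["id"], "Large")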
Optimizing Query Performance
Table Optimization
-- Optimize table layout for query patterns
OPTIMIZE production.sales.transactions
ZORDER BY (customer_id, transaction_date);
-- Analyze table statistics
ANALYZE TABLE production.sales.transactions
COMPUTE STATISTICS FOR ALL COLUMNS;
-- Check table statistics
DESCRIBE EXTENDED production.sales.transactions;
Query Optimization
-- Use predicate pushdown
SELECT * FROM production.sales.transactions
WHERE transaction_date >= '2022-01-01' -- Pushes filter to storage
AND region = 'US';
-- Avoid SELECT *
SELECT transaction_id, amount, transaction_date
FROM production.sales.transactions
WHERE region = 'US';
-- Use appropriate joins
-- Broadcast small tables
SELECT /*+ BROADCAST(dim) */ f.*, dim.category_name
FROM production.sales.transactions f
JOIN production.dim.product_categories dim
ON f.category_id = dim.category_id;
Query Profiles and Diagnostics
# Get query history for a warehouse
def get_query_history(warehouse_id):
    response = requests.get(
        f"{workspace_url}/api/2.0/sql/history/queries",
        headers=headers,
        params={
            "filter_by.warehouse_ids": [warehouse_id],
            "include_metrics": True,
            "max_results": 100
        }
    )
    response.raise_for_status()
    return response.json()

# Analyze slow queries (execution time above 30 seconds)
warehouse_id = warehouse["id"]
history = get_query_history(warehouse_id)
slow_queries = [
    q for q in history["res"]
    if q.get("metrics", {}).get("execution_time_ms", 0) > 30000
]

for query in slow_queries:
    print(f"Query ID: {query['query_id']}")
    print(f"Duration: {query['metrics']['execution_time_ms']}ms")
    print(f"Rows: {query['metrics']['rows_produced_count']}")
    print(f"SQL: {query['query_text'][:100]}...")
Building Dashboards
# Create a dashboard via the (preview) Databricks SQL dashboards API
dashboard_config = {
    "name": "Sales Performance",
    "tags": ["sales", "executive"],
    "widgets": []
}

response = requests.post(
    f"{workspace_url}/api/2.0/preview/sql/dashboards",
    headers=headers,
    json=dashboard_config
)
dashboard = response.json()
dashboard_id = dashboard['id']

# Add a visualization widget to the dashboard
visualization_config = {
    "dashboard_id": dashboard_id,
    "visualization": {
        "query_id": "query-123",
        "type": "CHART",
        "name": "Daily Revenue",
        "options": {
            "chartType": "line",
            "xAxis": {"column": "date"},
            "yAxis": [{"column": "revenue", "type": "sum"}]
        }
    },
    "width": 3,
    "position": {"col": 0, "row": 0}
}

requests.post(
    f"{workspace_url}/api/2.0/preview/sql/widgets",
    headers=headers,
    json=visualization_config
)
SQL Alerts
Set up automated alerts:
# Create an alert on an existing query
alert_config = {
    "name": "Low Inventory Alert",
    "query_id": "query-inventory-check",
    "options": {
        "column": "low_stock_count",
        "op": ">",
        "value": 10
    },
    "rearm": 300  # Seconds before the alert can re-trigger
}

response = requests.post(
    f"{workspace_url}/api/2.0/preview/sql/alerts",
    headers=headers,
    json=alert_config
)
alert = response.json()

# Set up a notification destination for the alert
notification_config = {
    "alert_id": alert['id'],
    "notification_destination_id": "email-dest-123"
}

requests.post(
    f"{workspace_url}/api/2.0/preview/sql/alert-subscriptions",
    headers=headers,
    json=notification_config
)
Connecting BI Tools
Power BI
# Get connection details for the warehouse
response = requests.get(
    f"{workspace_url}/api/2.0/sql/warehouses/{warehouse_id}",
    headers=headers
)
warehouse = response.json()

connection_info = {
    "server": warehouse['odbc_params']['hostname'],
    "port": warehouse['odbc_params']['port'],
    "http_path": warehouse['odbc_params']['path'],
    "catalog": "production",
    "schema": "analytics"
}

print("Power BI Connection:")
print(f"Server: {connection_info['server']}")
print(f"HTTP Path: {connection_info['http_path']}")
JDBC/ODBC Connection String
# JDBC
jdbc:spark://{server}:443/default;transportMode=http;ssl=1;
httpPath={http_path};AuthMech=3;UID=token;PWD={personal_access_token}
# ODBC
Driver=Simba Spark ODBC Driver;Host={server};Port=443;
HTTPPath={http_path};AuthMech=3;SSL=1;ThriftTransport=2;
UID=token;PWD={personal_access_token}
Query Federation
Query external data sources:
-- Create external connection
CREATE CONNECTION snowflake_conn
TYPE snowflake
OPTIONS (
host 'account.snowflakecomputing.com',
user 'service_account',
password secret('scope', 'snowflake-password')
);
-- Create foreign catalog
CREATE FOREIGN CATALOG snowflake_data
USING CONNECTION snowflake_conn
OPTIONS (database 'ANALYTICS');
-- Query federated data
SELECT
    c.customer_id,
    c.customer_name,
    o.total_orders
FROM snowflake_data.public.customers c
JOIN production.sales.order_summary o
    ON c.customer_id = o.customer_id;
Cost Management
Monitor and control costs:
def get_warehouse_usage(warehouse_id, days=30):
    """Get warehouse usage and estimated cost"""
    # Get billing usage
    response = requests.get(
        f"{workspace_url}/api/2.0/sql/warehouses/{warehouse_id}/usage",
        headers=headers,
        params={"days": days}
    )
    usage = response.json()

    # Calculate costs
    dbu_price = 0.55  # Example price per DBU
    total_dbus = sum(d['total_dbus'] for d in usage['usage'])
    estimated_cost = total_dbus * dbu_price

    return {
        "total_dbus": total_dbus,
        "estimated_cost": estimated_cost,
        "daily_average": total_dbus / days
    }

# Implement auto-scaling limits
def configure_auto_scaling(warehouse_id, min_clusters, max_clusters, auto_stop_mins):
    config = {
        "min_num_clusters": min_clusters,
        "max_num_clusters": max_clusters,
        "auto_stop_mins": auto_stop_mins
    }
    requests.post(
        f"{workspace_url}/api/2.0/sql/warehouses/{warehouse_id}/edit",
        headers=headers,
        json=config
    )
Query Caching
Leverage result caching:
-- Results are cached by default for identical queries
-- Cache TTL is based on underlying data changes (Delta Lake)
-- Bypass the result cache when you need freshly computed results
SET use_cached_result = false;

SELECT *
FROM production.sales.daily_summary
WHERE date = current_date();

-- To confirm whether a result was served from cache,
-- check the query profile in the query history UI
Parameterized Queries
-- Create parameterized query for reuse
-- Parameters are defined with {{ parameter_name }}
SELECT
region,
SUM(revenue) as total_revenue,
COUNT(DISTINCT customer_id) as unique_customers
FROM production.sales.transactions
WHERE transaction_date BETWEEN '{{ start_date }}' AND '{{ end_date }}'
AND region IN ({{ regions }})
GROUP BY region
ORDER BY total_revenue DESC;
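The {{ parameter_name }} syntax above is specific to the Databricks SQL editor, dashboards, and alerts. When running a similar query from Python with the databricks-sql-connector, you would typically use the connector's own parameter binding instead. The following is a minimal sketch assuming connector version 3+ with named :parameter markers and a connection object like the one shown at the start of this post.

# Run a parameterized query via databricks-sql-connector (assumes v3+ named parameters).
# `connection` is assumed to come from databricks.sql.connect(...) as shown earlier.
query = """
    SELECT
        region,
        SUM(revenue) AS total_revenue,
        COUNT(DISTINCT customer_id) AS unique_customers
    FROM production.sales.transactions
    WHERE transaction_date BETWEEN :start_date AND :end_date
    GROUP BY region
    ORDER BY total_revenue DESC
"""

with connection.cursor() as cursor:
    cursor.execute(query, {"start_date": "2022-01-01", "end_date": "2022-03-31"})
    for row in cursor.fetchall():
        print(row)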
Best Practices
Query Optimization Checklist
-- 1. Filter early and precisely
SELECT * FROM large_table WHERE partition_col = 'value';
-- 2. Use LIMIT during exploration
SELECT * FROM large_table LIMIT 1000;
-- 3. Avoid cross joins
-- Bad: SELECT * FROM t1, t2
-- Good: SELECT * FROM t1 JOIN t2 ON t1.key = t2.key
-- 4. Use CTEs for readability and optimization
WITH recent_orders AS (
SELECT * FROM orders WHERE order_date > dateadd(day, -30, current_date())
)
SELECT customer_id, COUNT(*)
FROM recent_orders
GROUP BY customer_id;
-- 5. Leverage materialized views for repeated queries
CREATE MATERIALIZED VIEW mv_daily_sales AS
SELECT date, SUM(revenue) as daily_revenue
FROM transactions
GROUP BY date;
Warehouse Configuration Tips
- Start small: Begin with smaller warehouses and scale up based on query profiles
- Use auto-stop: Set appropriate auto-stop (15-60 minutes) to control costs
- Enable Photon: Always enable for better performance (included in pricing)
- Set statement timeouts: Use the STATEMENT_TIMEOUT configuration parameter to stop runaway queries from impacting others
- Use warehouse types appropriately: Classic for batch, Pro/Serverless for interactive (a configuration sketch follows below)
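As a rough illustration, here is how those tips might translate into two create-warehouse payloads, using the same field names as the create example at the top of this post; the names, sizes, and limits are placeholders rather than recommendations for any specific workload.

# Interactive/BI warehouse: start small, Photon on, short auto-stop, serverless Pro
interactive_warehouse = {
    "name": "bi-interactive",
    "cluster_size": "Small",           # start small; scale up based on query profiles
    "min_num_clusters": 1,
    "max_num_clusters": 3,             # allow scale-out for concurrent users
    "auto_stop_mins": 20,              # within the 15-60 minute guidance
    "enable_photon": True,
    "enable_serverless_compute": True,
    "warehouse_type": "PRO"
}

# Scheduled batch warehouse: predictable load, classic type, aggressive auto-stop
batch_warehouse = {
    "name": "nightly-batch",
    "cluster_size": "Large",
    "min_num_clusters": 1,
    "max_num_clusters": 1,             # no autoscaling needed for a fixed schedule
    "auto_stop_mins": 15,
    "enable_photon": True,
    "warehouse_type": "CLASSIC"
}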
Conclusion
Databricks SQL brings traditional data warehouse capabilities to the lakehouse architecture. With serverless compute, Photon acceleration, and native Unity Catalog integration, it provides a modern SQL analytics experience without the complexity of managing separate systems.
Key advantages:
- No data movement between lake and warehouse
- Unified governance with Unity Catalog
- Automatic optimization with Photon
- Serverless scaling
- Rich ecosystem connectivity