Azure Databricks Cluster Configuration Best Practices
Proper cluster configuration is crucial for balancing performance and cost in Azure Databricks. Let’s explore how to configure clusters for different workload types and optimize their settings.
Cluster Types
All-Purpose Clusters
Interactive, shared clusters for development and exploration:
# Cluster configuration via API
cluster_config = {
    "cluster_name": "dev-cluster",
    "spark_version": "9.1.x-scala2.12",
    "node_type_id": "Standard_DS3_v2",
    "num_workers": 4,
    "autotermination_minutes": 60,
    "enable_elastic_disk": True,
    "spark_conf": {
        "spark.speculation": "true"
    }
}
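The dictionary above only defines the cluster; to actually create it, you post it to the Clusters API. A minimal sketch follows, assuming a placeholder workspace URL and personal access token that you would substitute with your own values:
import requests

DATABRICKS_HOST = "https://adb-1234567890123456.7.azuredatabricks.net"  # placeholder workspace URL
TOKEN = "<personal-access-token>"  # placeholder token

# POST the configuration to the Clusters API; the response includes the new cluster_id
response = requests.post(
    f"{DATABRICKS_HOST}/api/2.0/clusters/create",
    headers={"Authorization": f"Bearer {TOKEN}"},
    json=cluster_config,
)
response.raise_for_status()
print(response.json()["cluster_id"])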
Job Clusters
Ephemeral clusters for automated workloads:
{
  "new_cluster": {
    "spark_version": "9.1.x-scala2.12",
    "node_type_id": "Standard_DS3_v2",
    "num_workers": 8,
    "spark_conf": {
      "spark.databricks.cluster.profile": "serverless"
    }
  }
}
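In practice, this new_cluster block is embedded in a job definition rather than created on its own. The sketch below shows one way it could sit inside a Jobs API 2.1 payload; the job name, task key, and notebook path are illustrative placeholders:
# Illustrative Jobs API 2.1 payload wrapping the job cluster above
job_payload = {
    "name": "nightly-etl",  # placeholder job name
    "tasks": [
        {
            "task_key": "main",  # placeholder task key
            "notebook_task": {"notebook_path": "/Repos/etl/nightly"},  # placeholder path
            "new_cluster": {
                "spark_version": "9.1.x-scala2.12",
                "node_type_id": "Standard_DS3_v2",
                "num_workers": 8
            }
        }
    ]
}
# Submit with a POST to /api/2.1/jobs/create, as in the Clusters API example above.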
Node Selection
Worker Node Types
Choose based on workload characteristics:
# Memory-optimized for data-intensive workloads
memory_optimized = {
    "node_type_id": "Standard_E8s_v3",  # 8 cores, 64 GB RAM
    "num_workers": 4
}

# Compute-optimized for CPU-intensive workloads
compute_optimized = {
    "node_type_id": "Standard_F8s_v2",  # 8 cores, 16 GB RAM
    "num_workers": 8
}

# GPU-enabled for ML training
gpu_enabled = {
    "node_type_id": "Standard_NC6s_v3",  # 6 cores, 112 GB RAM, 1 GPU
    "num_workers": 2
}
Driver Node Configuration
# Use larger driver for complex workloads
cluster_config = {
    "driver_node_type_id": "Standard_DS4_v2",  # Larger driver
    "node_type_id": "Standard_DS3_v2",         # Regular workers
    "num_workers": 8
}
Autoscaling
Enable Autoscaling
autoscaling_config = {
    "cluster_name": "autoscaling-cluster",
    "spark_version": "9.1.x-scala2.12",
    "node_type_id": "Standard_DS3_v2",
    "autoscale": {
        "min_workers": 2,
        "max_workers": 20
    },
    "autotermination_minutes": 30
}
Optimized Autoscaling
# Enable optimized autoscaling for faster scaling
optimized_autoscaling = {
    "autoscale": {
        "min_workers": 2,
        "max_workers": 20
    },
    "spark_conf": {
        "spark.databricks.cluster.profile": "serverless",
        "spark.databricks.delta.preview.enabled": "true"
    },
    "cluster_source": "UI"
}
Spark Configuration
Memory Configuration
spark_memory_config = {
    "spark_conf": {
        "spark.executor.memory": "8g",
        "spark.driver.memory": "4g",
        "spark.executor.memoryOverhead": "2g",
        "spark.memory.fraction": "0.8",
        "spark.memory.storageFraction": "0.3"
    }
}
Performance Optimization
performance_config = {
    "spark_conf": {
        # Adaptive Query Execution
        "spark.sql.adaptive.enabled": "true",
        "spark.sql.adaptive.coalescePartitions.enabled": "true",
        "spark.sql.adaptive.skewJoin.enabled": "true",

        # Shuffle optimization
        "spark.sql.shuffle.partitions": "auto",
        "spark.databricks.io.cache.enabled": "true",

        # Delta optimization
        "spark.databricks.delta.optimizeWrite.enabled": "true",
        "spark.databricks.delta.autoCompact.enabled": "true"
    }
}
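Once the cluster is up, it is worth confirming from a notebook that these settings took effect. A quick check, assuming the built-in spark session:
# Print the effective values of the options configured above
for key in [
    "spark.sql.adaptive.enabled",
    "spark.databricks.delta.optimizeWrite.enabled",
    "spark.databricks.delta.autoCompact.enabled",
]:
    print(key, "=", spark.conf.get(key))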
Dynamic Allocation
dynamic_allocation = {
    "spark_conf": {
        "spark.dynamicAllocation.enabled": "true",
        "spark.dynamicAllocation.minExecutors": "2",
        "spark.dynamicAllocation.maxExecutors": "20",
        "spark.dynamicAllocation.executorIdleTimeout": "60s"
    }
}
Instance Pools
Pre-allocate instances for faster cluster startup:
# Create instance pool
pool_config = {
    "instance_pool_name": "data-engineering-pool",
    "node_type_id": "Standard_DS3_v2",
    "min_idle_instances": 2,
    "max_capacity": 20,
    "idle_instance_autotermination_minutes": 30,
    "preloaded_spark_versions": ["9.1.x-scala2.12"]
}

# Use pool in cluster
cluster_with_pool = {
    "cluster_name": "pooled-cluster",
    "instance_pool_id": "pool-123456",
    "num_workers": 4
}
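The instance_pool_id above is a placeholder. In a scripted setup, you would create the pool first and feed the returned id into the cluster definition; a sketch, reusing the host and token placeholders from the earlier API example:
import requests

# Create the pool and capture its id from the API response
pool_response = requests.post(
    f"{DATABRICKS_HOST}/api/2.0/instance-pools/create",
    headers={"Authorization": f"Bearer {TOKEN}"},
    json=pool_config,
)
pool_response.raise_for_status()
pool_id = pool_response.json()["instance_pool_id"]

# Reference the real pool id instead of the hard-coded placeholder
cluster_with_pool["instance_pool_id"] = pool_id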
Cluster Tags
Add tags for cost tracking and management:
tagged_cluster = {
    "cluster_name": "production-etl",
    "custom_tags": {
        "Environment": "Production",
        "Team": "DataEngineering",
        "CostCenter": "DE-001",
        "Project": "DataPlatform"
    }
}
Init Scripts
Run initialization scripts on cluster startup:
# Init script stored as a workspace file
init_script_config = {
    "init_scripts": [
        {
            "workspace": {
                "destination": "/Shared/init-scripts/install-packages.sh"
            }
        }
    ]
}

# Cluster-scoped init script stored on DBFS
cluster_init_config = {
    "init_scripts": [
        {
            "dbfs": {
                "destination": "dbfs:/databricks/init-scripts/my-init.sh"
            }
        }
    ]
}
Sample Init Script
#!/bin/bash
# /dbfs/databricks/init-scripts/install-libs.sh
# Install system packages
apt-get update
apt-get install -y libxml2-dev
# Install Python packages
/databricks/python/bin/pip install azure-identity==1.7.0
/databricks/python/bin/pip install pandas-profiling==3.1.0
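One way to get a script like this onto DBFS is from a notebook with dbutils. A small sketch, using the same path as the cluster-scoped example above and an abbreviated script body:
# Write the init script to DBFS; the final True overwrites any existing file
script = """#!/bin/bash
apt-get update
apt-get install -y libxml2-dev
/databricks/python/bin/pip install azure-identity==1.7.0
"""
dbutils.fs.put("dbfs:/databricks/init-scripts/my-init.sh", script, True)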
Environment Variables
cluster_with_env = {
    "cluster_name": "env-configured-cluster",
    "spark_env_vars": {
        "PYSPARK_PYTHON": "/databricks/python3/bin/python3",
        "ENVIRONMENT": "production",
        "LOG_LEVEL": "INFO"
    }
}
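Inside notebooks and jobs running on this cluster, the variables are read like any other environment variable:
import os

# spark_env_vars are exported to the driver and worker processes
environment = os.environ.get("ENVIRONMENT", "development")
log_level = os.environ.get("LOG_LEVEL", "INFO")
print(f"Running in {environment} with log level {log_level}")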
Monitoring Configuration
Enable Cluster Logging
logging_config = {
    "cluster_log_conf": {
        "dbfs": {
            "destination": "dbfs:/cluster-logs"
        }
    }
}
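Driver, executor, and init-script logs are delivered under that destination in per-cluster folders; from a notebook you can browse them with dbutils:
# List delivered log folders (one per cluster id)
display(dbutils.fs.ls("dbfs:/cluster-logs"))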
Ganglia Metrics
# Ganglia metrics are enabled by default
# Access via cluster UI -> Metrics tab
# Or configure to push to Azure Monitor
metrics_config = {
    "spark_conf": {
        "spark.metrics.namespace": "databricks",
        "spark.metrics.conf.*.sink.ganglia.class": "org.apache.spark.metrics.sink.GangliaSink"
    }
}
Security Configuration
Credential Passthrough
# Enable Azure AD credential passthrough
credential_passthrough = {
    "spark_conf": {
        "spark.databricks.passthrough.enabled": "true"
    },
    "azure_attributes": {
        "first_on_demand": 1,
        "availability": "SPOT_WITH_FALLBACK_AZURE",
        "spot_bid_max_price": -1
    }
}
Network Configuration
# Cluster in VNet
vnet_cluster = {
    "cluster_name": "vnet-cluster",
    "node_type_id": "Standard_DS3_v2",
    "num_workers": 4,
    # VNet is configured at workspace level
}
Terraform Configuration
resource "databricks_cluster" "production" {
cluster_name = "production-cluster"
spark_version = "9.1.x-scala2.12"
node_type_id = "Standard_DS3_v2"
autotermination_minutes = 30
autoscale {
min_workers = 2
max_workers = 10
}
spark_conf = {
"spark.sql.adaptive.enabled" = "true"
"spark.databricks.delta.optimizeWrite.enabled" = "true"
"spark.databricks.delta.autoCompact.enabled" = "true"
}
custom_tags = {
Environment = "Production"
Team = "DataEngineering"
}
library {
pypi {
package = "azure-identity==1.7.0"
}
}
}
Best Practices Summary
- Right-size nodes - Match node type to workload characteristics
- Use autoscaling - Enable for variable workloads
- Set autotermination - Prevent idle clusters from running
- Use instance pools - Faster startup for frequently used clusters
- Configure Spark properly - Enable AQE and Delta optimizations
- Tag everything - Enable cost tracking and management
- Use job clusters for production - Ephemeral, reproducible clusters
Conclusion
Proper cluster configuration balances performance, cost, and reliability. By understanding the various configuration options, you can create clusters optimized for your specific workloads.
Tomorrow, we’ll explore cluster policies for governance and standardization.