Back to Blog
5 min read

Databricks Cluster Policies for Governance and Cost Control

Databricks Cluster Policies for Governance and Cost Control

Cluster policies in Databricks enable administrators to control cluster configurations, enforce standards, and manage costs. Let’s explore how to implement effective cluster policies.

Understanding Cluster Policies

Cluster policies:

  • Define allowed cluster configurations
  • Restrict node types and sizes
  • Enforce tagging requirements
  • Set default values
  • Limit maximum cluster sizes

Creating Basic Policies

Simple Cost Control Policy

{
  "cluster_type": {
    "type": "fixed",
    "value": "all-purpose"
  },
  "autotermination_minutes": {
    "type": "range",
    "minValue": 10,
    "maxValue": 120,
    "defaultValue": 30
  },
  "node_type_id": {
    "type": "allowlist",
    "values": [
      "Standard_DS3_v2",
      "Standard_DS4_v2",
      "Standard_DS5_v2"
    ],
    "defaultValue": "Standard_DS3_v2"
  },
  "num_workers": {
    "type": "range",
    "minValue": 1,
    "maxValue": 10,
    "defaultValue": 2
  }
}

Autoscaling Policy

{
  "spark_version": {
    "type": "regex",
    "pattern": "9\\.[0-9]+\\.x-scala2\\.12"
  },
  "autoscale.min_workers": {
    "type": "range",
    "minValue": 1,
    "maxValue": 5,
    "defaultValue": 2
  },
  "autoscale.max_workers": {
    "type": "range",
    "minValue": 2,
    "maxValue": 20,
    "defaultValue": 10
  },
  "autotermination_minutes": {
    "type": "fixed",
    "value": 30
  }
}

Policy Attribute Types

Fixed Values

{
  "spark_version": {
    "type": "fixed",
    "value": "9.1.x-scala2.12",
    "hidden": true
  }
}

Range Constraints

{
  "num_workers": {
    "type": "range",
    "minValue": 1,
    "maxValue": 50,
    "defaultValue": 4
  }
}

Allowlist

{
  "node_type_id": {
    "type": "allowlist",
    "values": [
      "Standard_DS3_v2",
      "Standard_DS4_v2",
      "Standard_E4s_v3",
      "Standard_E8s_v3"
    ],
    "defaultValue": "Standard_DS3_v2"
  }
}

Blocklist

{
  "node_type_id": {
    "type": "blocklist",
    "values": [
      "Standard_NC6",
      "Standard_NC12",
      "Standard_NC24"
    ]
  }
}

Regex Pattern

{
  "spark_version": {
    "type": "regex",
    "pattern": "(9|10)\\.[0-9]+\\.x-.*"
  }
}

Enterprise Policy Examples

Development Team Policy

{
  "cluster_type": {
    "type": "fixed",
    "value": "all-purpose"
  },
  "spark_version": {
    "type": "regex",
    "pattern": "9\\.[0-9]+\\.x-scala2\\.12"
  },
  "node_type_id": {
    "type": "allowlist",
    "values": [
      "Standard_DS3_v2",
      "Standard_DS4_v2"
    ],
    "defaultValue": "Standard_DS3_v2"
  },
  "num_workers": {
    "type": "range",
    "minValue": 1,
    "maxValue": 4,
    "defaultValue": 2
  },
  "autotermination_minutes": {
    "type": "range",
    "minValue": 10,
    "maxValue": 60,
    "defaultValue": 30
  },
  "custom_tags.Environment": {
    "type": "fixed",
    "value": "Development"
  },
  "custom_tags.Team": {
    "type": "fixed",
    "value": "DataEngineering"
  }
}

Production ETL Policy

{
  "cluster_type": {
    "type": "fixed",
    "value": "job"
  },
  "spark_version": {
    "type": "fixed",
    "value": "9.1.x-scala2.12",
    "hidden": true
  },
  "node_type_id": {
    "type": "allowlist",
    "values": [
      "Standard_E8s_v3",
      "Standard_E16s_v3"
    ]
  },
  "autoscale.min_workers": {
    "type": "range",
    "minValue": 2,
    "maxValue": 10,
    "defaultValue": 4
  },
  "autoscale.max_workers": {
    "type": "range",
    "minValue": 4,
    "maxValue": 50,
    "defaultValue": 20
  },
  "custom_tags.Environment": {
    "type": "fixed",
    "value": "Production"
  },
  "spark_conf.spark.databricks.delta.optimizeWrite.enabled": {
    "type": "fixed",
    "value": "true",
    "hidden": true
  },
  "spark_conf.spark.databricks.delta.autoCompact.enabled": {
    "type": "fixed",
    "value": "true",
    "hidden": true
  }
}

Data Science Policy (with GPU)

{
  "cluster_type": {
    "type": "fixed",
    "value": "all-purpose"
  },
  "spark_version": {
    "type": "regex",
    "pattern": "9\\.[0-9]+\\.x-gpu-ml-scala2\\.12"
  },
  "node_type_id": {
    "type": "allowlist",
    "values": [
      "Standard_NC6s_v3",
      "Standard_NC12s_v3"
    ],
    "defaultValue": "Standard_NC6s_v3"
  },
  "num_workers": {
    "type": "range",
    "minValue": 1,
    "maxValue": 4,
    "defaultValue": 1
  },
  "autotermination_minutes": {
    "type": "fixed",
    "value": 60
  },
  "custom_tags.UseCase": {
    "type": "fixed",
    "value": "MachineLearning"
  }
}

Mandatory Tags Policy

{
  "custom_tags.CostCenter": {
    "type": "regex",
    "pattern": "[A-Z]{2}-[0-9]{3}",
    "isOptional": false
  },
  "custom_tags.Project": {
    "type": "unlimited",
    "isOptional": false
  },
  "custom_tags.Owner": {
    "type": "unlimited",
    "isOptional": false
  }
}

Instance Pool Policy

{
  "instance_pool_id": {
    "type": "fixed",
    "value": "pool-XXXXXXXXXXXXX",
    "hidden": true
  },
  "driver_instance_pool_id": {
    "type": "fixed",
    "value": "pool-XXXXXXXXXXXXX",
    "hidden": true
  },
  "autoscale.min_workers": {
    "type": "range",
    "minValue": 2,
    "maxValue": 10
  },
  "autoscale.max_workers": {
    "type": "range",
    "minValue": 4,
    "maxValue": 30
  }
}

Creating Policies via API

import requests
import json

# Create policy via REST API
def create_cluster_policy(workspace_url, token, policy_name, definition):
    """Create a cluster policy through the Databricks REST API.

    Args:
        workspace_url: Base URL of the workspace, e.g. ``https://adb-....net``.
        token: Personal access token used as a Bearer credential.
        policy_name: Display name for the new policy.
        definition: Policy definition as a plain dict; it is serialized to a
            JSON string because the API's ``definition`` field expects one.

    Returns:
        The decoded JSON response body from the create endpoint.
    """
    endpoint = f"{workspace_url}/api/2.0/policies/clusters/create"
    auth_headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json"
    }
    # The definition dict must be sent as an embedded JSON string,
    # not as a nested object.
    body = {
        "name": policy_name,
        "definition": json.dumps(definition)
    }
    reply = requests.post(endpoint, headers=auth_headers, json=body)
    return reply.json()

# Example usage
# Policy definition is a plain dict; create_cluster_policy JSON-encodes it
# before sending (the API's "definition" field takes a string).
policy_definition = {
    "node_type_id": {
        "type": "allowlist",
        "values": ["Standard_DS3_v2", "Standard_DS4_v2"]
    },
    "num_workers": {
        "type": "range",
        "minValue": 1,
        "maxValue": 10
    },
    "autotermination_minutes": {
        "type": "fixed",
        "value": 30
    }
}

# NOTE: placeholder workspace URL and token ("dapi123...") — substitute
# real values before running; this call hits the live API.
result = create_cluster_policy(
    "https://adb-123456789.0.azuredatabricks.net",
    "dapi123...",
    "Cost Controlled Policy",
    policy_definition
)

Terraform Configuration

# HCL equivalent of the "Development Team Policy" JSON shown earlier.
# jsonencode() keeps the definition as native HCL while the provider
# submits it as the JSON string the policies API expects.
resource "databricks_cluster_policy" "development" {
  name = "Development Policy"

  definition = jsonencode({
    # Only interactive (all-purpose) clusters may use this policy.
    "cluster_type" : {
      "type" : "fixed",
      "value" : "all-purpose"
    },
    # Limit node choice to two approved SKUs; default to the smaller one.
    "node_type_id" : {
      "type" : "allowlist",
      "values" : [
        "Standard_DS3_v2",
        "Standard_DS4_v2"
      ],
      "defaultValue" : "Standard_DS3_v2"
    },
    # Cap dev clusters at 4 workers to control spend.
    "num_workers" : {
      "type" : "range",
      "minValue" : 1,
      "maxValue" : 4,
      "defaultValue" : 2
    },
    # Fixed 30-minute auto-termination; users cannot change it.
    "autotermination_minutes" : {
      "type" : "fixed",
      "value" : 30
    },
    # Stamp every cluster with the environment tag for cost tracking.
    "custom_tags.Environment" : {
      "type" : "fixed",
      "value" : "Development"
    }
  })
}

# Assign policy to group
# Grant the "developers" group CAN_USE on the policy above so they can
# create clusters governed by it (CAN_MANAGE would additionally allow
# editing the policy — see the API permissions example further down).
resource "databricks_permissions" "policy_usage" {
  cluster_policy_id = databricks_cluster_policy.development.id

  access_control {
    group_name       = "developers"
    permission_level = "CAN_USE"
  }
}

Policy Assignment

Assign to Users/Groups

# Assign policy permissions
def set_policy_permissions(workspace_url, token, policy_id, permissions):
    """Set the access-control list of an existing cluster policy.

    Args:
        workspace_url: Base URL of the workspace.
        token: Personal access token used as a Bearer credential.
        policy_id: Identifier of the policy whose permissions are set.
        permissions: Payload containing an ``access_control_list`` entry,
            sent as-is in the PUT body.

    Returns:
        The decoded JSON response body.
    """
    endpoint = (
        f"{workspace_url}/api/2.0/permissions/cluster-policies/{policy_id}"
    )
    request_headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json"
    }
    reply = requests.put(endpoint, headers=request_headers, json=permissions)
    return reply.json()

# ACL payload for set_policy_permissions: engineers may create clusters
# under the policy, admins may also modify the policy itself.
# NOTE(review): this endpoint is called with PUT above, which appears to
# replace the full ACL — list every group that should retain access.
permissions = {
    "access_control_list": [
        {
            "group_name": "data-engineers",
            "permission_level": "CAN_USE"
        },
        {
            "group_name": "admins",
            "permission_level": "CAN_MANAGE"
        }
    ]
}

Monitoring Policy Compliance

# Check cluster compliance
def get_clusters_by_policy(workspace_url, token, policy_id):
    """Partition workspace clusters by whether they use the given policy.

    Args:
        workspace_url: Base URL of the workspace.
        token: Personal access token used as a Bearer credential.
        policy_id: Policy identifier to check clusters against.

    Returns:
        Dict with two lists of raw cluster descriptions as returned by
        the API: ``"compliant"`` (cluster's ``policy_id`` matches) and
        ``"non_compliant"`` (different policy, or no policy at all).
    """
    url = f"{workspace_url}/api/2.0/clusters/list"
    headers = {"Authorization": f"Bearer {token}"}

    # Explicit timeout so a network stall fails fast instead of hanging
    # the compliance check indefinitely (requests has no default timeout).
    response = requests.get(url, headers=headers, timeout=30)
    clusters = response.json().get("clusters", [])

    # Single pass over the cluster list instead of two comprehensions
    # scanning the same data.
    result = {"compliant": [], "non_compliant": []}
    for cluster in clusters:
        bucket = (
            "compliant"
            if cluster.get("policy_id") == policy_id
            else "non_compliant"
        )
        result[bucket].append(cluster)

    return result

Best Practices

  1. Start restrictive - Begin with tight policies and loosen as needed
  2. Use default values - Guide users to optimal configurations
  3. Hide unnecessary options - Reduce complexity for users
  4. Require tags - Enable cost tracking and governance
  5. Version control policies - Store definitions in source control
  6. Audit regularly - Review clusters against policies
  7. Document policies - Explain the rationale for restrictions

Conclusion

Cluster policies are essential for governing Databricks usage at scale. By implementing appropriate policies, you can control costs, enforce standards, and simplify the user experience while maintaining flexibility for different workload types.

Tomorrow, we’ll compare job clusters vs all-purpose clusters for different use cases.

Michael John Peña

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.