Back to Blog
5 min read

Auto-Scaling Best Practices in Azure

Proper auto-scaling balances performance and cost. Too aggressive and you waste money; too conservative and users suffer. Let’s explore how to get it right.

Auto-Scaling Fundamentals

Key Metrics to Scale On

# Recommended autoscale signals per Azure compute service.
# "primary" metrics are the usual first-line triggers; "secondary"
# metrics catch load patterns that CPU/memory alone can miss.
scaling_metrics:
  # Virtual machines / VM scale sets
  compute:
    primary:
      - CPU percentage
      - Memory percentage
    secondary:
      - Request count
      - Response time
      - Queue depth

  # Azure App Service plans
  app_service:
    primary:
      - CPU percentage
      - Memory percentage
    secondary:
      - HTTP queue length
      - Data in/out
      - Request count

  # Azure Kubernetes Service (pod-level via HPA)
  aks:
    primary:
      - CPU utilization
      - Memory utilization
    secondary:
      - Custom metrics (Prometheus)
      - External metrics (Azure Monitor)

VMSS Auto-Scaling

// Shared CPU scale-out/scale-in rules. Azure autoscale profiles do NOT
// inherit rules from other profiles — a profile with an empty rules array
// simply pins capacity and performs no metric-based scaling. Every profile
// that should react to load must therefore carry its own copy.
var cpuScaleRules = [
  // Scale out on sustained high CPU (5-minute average > 70%), +2 at a time.
  {
    metricTrigger: {
      metricName: 'Percentage CPU'
      metricResourceUri: vmss.id
      timeGrain: 'PT1M'
      statistic: 'Average'
      timeWindow: 'PT5M'
      timeAggregation: 'Average'
      operator: 'GreaterThan'
      threshold: 70
    }
    scaleAction: {
      direction: 'Increase'
      type: 'ChangeCount'
      value: '2'
      cooldown: 'PT5M'
    }
  }
  // Scale in on sustained low CPU (10-minute average < 30%), -1 at a time.
  // Longer window and cooldown than scale-out so brief lulls don't cause
  // flapping.
  {
    metricTrigger: {
      metricName: 'Percentage CPU'
      metricResourceUri: vmss.id
      timeGrain: 'PT1M'
      statistic: 'Average'
      timeWindow: 'PT10M'  // Longer window for scale-in
      timeAggregation: 'Average'
      operator: 'LessThan'
      threshold: 30
    }
    scaleAction: {
      direction: 'Decrease'
      type: 'ChangeCount'
      value: '1'
      cooldown: 'PT10M'  // Longer cooldown for scale-in
    }
  }
]

// Metric- plus schedule-based autoscaling for the VM scale set.
// Only one profile is active at a time: the recurrence profiles apply
// during their windows and the Default profile applies otherwise.
resource autoscaleSettings 'Microsoft.Insights/autoscalesettings@2022-10-01' = {
  name: 'vmss-autoscale'
  location: location
  properties: {
    name: 'vmss-autoscale'
    targetResourceUri: vmss.id
    enabled: true
    profiles: [
      {
        name: 'Default'
        capacity: {
          minimum: '2'
          maximum: '20'
          default: '4'
        }
        rules: cpuScaleRules
      }
      // Schedule-based profile for known weekday business-hours load.
      // Rules must be repeated here: empty rules would freeze capacity.
      {
        name: 'BusinessHours'
        capacity: {
          minimum: '4'
          maximum: '20'
          default: '6'
        }
        recurrence: {
          frequency: 'Week'
          schedule: {
            timeZone: 'Pacific Standard Time'
            days: ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
            hours: [8]
            minutes: [0]
          }
        }
        rules: cpuScaleRules
      }
      // Reduced-capacity profile outside business hours; still scales on CPU.
      {
        name: 'AfterHours'
        capacity: {
          minimum: '2'
          maximum: '10'
          default: '2'
        }
        recurrence: {
          frequency: 'Week'
          schedule: {
            timeZone: 'Pacific Standard Time'
            days: ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
            hours: [18]
            minutes: [0]
          }
        }
        rules: cpuScaleRules
      }
    ]
    // Notify the ops team (email + Slack webhook) on every scale event.
    notifications: [
      {
        operation: 'Scale'
        email: {
          sendToSubscriptionAdministrator: false
          customEmails: ['ops-team@company.com']
        }
        webhooks: [
          {
            serviceUri: 'https://hooks.slack.com/...'
          }
        ]
      }
    ]
  }
}

Kubernetes Auto-Scaling

Horizontal Pod Autoscaler

# HPA (autoscaling/v2) for the "api" Deployment: keeps 3-50 replicas,
# scaling on CPU, memory, and a Prometheus-fed requests-per-second metric.
# With multiple metrics, the HPA uses the highest replica count proposed
# by any of them.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: api-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: api
  minReplicas: 3
  maxReplicas: 50
  metrics:
    # CPU-based scaling: target 70% average utilization across pods
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70

    # Memory-based scaling: target 80% average utilization across pods
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80

    # Custom per-pod metric from Prometheus (via a metrics adapter):
    # target ~100 requests/second per pod on average
    - type: Pods
      pods:
        metric:
          name: http_requests_per_second
        target:
          type: AverageValue
          averageValue: 100

  behavior:
    # Scale up aggressively: per minute, either double the replica count
    # or add 4 pods — whichever is larger (selectPolicy: Max).
    scaleUp:
      stabilizationWindowSeconds: 60
      policies:
        - type: Percent
          value: 100
          periodSeconds: 60
        - type: Pods
          value: 4
          periodSeconds: 60
      selectPolicy: Max

    # Scale down conservatively: at most 10% of replicas per minute,
    # and only after recommendations have been stable for 5 minutes.
    scaleDown:
      stabilizationWindowSeconds: 300  # 5 minutes stabilization
      policies:
        - type: Percent
          value: 10
          periodSeconds: 60
      selectPolicy: Min

Cluster Autoscaler

// AKS cluster with three node pools: a small stable system pool, an
// autoscaled on-demand workload pool, and a scale-from-zero spot pool
// for interruptible burst work.
resource aks 'Microsoft.ContainerService/managedClusters@2022-07-01' = {
  name: 'aks-cluster'
  location: location
  properties: {
    // NOTE(review): example omits properties AKS normally requires
    // (e.g. dnsPrefix, identity) — confirm before deploying as-is.
    agentPoolProfiles: [
      // System pool: hosts cluster-critical pods; kept small (3-5 nodes).
      {
        name: 'system'
        count: 3
        vmSize: 'Standard_D4s_v3'
        mode: 'System'
        enableAutoScaling: true
        minCount: 3
        maxCount: 5
      }
      // General workload pool on regular-priority (on-demand) VMs.
      {
        name: 'workload'
        count: 5
        vmSize: 'Standard_D8s_v3'
        mode: 'User'
        enableAutoScaling: true
        minCount: 3
        maxCount: 20
        scaleSetPriority: 'Regular'
      }
      // Spot pool: starts at zero and scales up to 50 nodes on demand.
      // Evicted VMs are deleted; spotMaxPrice -1 means "pay up to the
      // current on-demand price" (never evicted for price reasons).
      {
        name: 'spot'
        count: 0
        vmSize: 'Standard_D8s_v3'
        mode: 'User'
        enableAutoScaling: true
        minCount: 0
        maxCount: 50
        scaleSetPriority: 'Spot'
        scaleSetEvictionPolicy: 'Delete'
        spotMaxPrice: -1
      }
    ]
    // Cluster-autoscaler tuning (values are strings per the AKS schema).
    autoScalerProfile: {
      balanceSimilarNodeGroups: 'true'
      scanInterval: '10s'
      scaleDownDelayAfterAdd: '10m'        // wait 10m after a scale-up before considering scale-down
      scaleDownDelayAfterDelete: '10s'
      scaleDownUnneededTime: '10m'         // node must be underutilized this long before removal
      scaleDownUtilizationThreshold: '0.5' // node counts as underutilized below 50%
    }
  }
}

KEDA for Event-Driven Scaling

# KEDA ScaledObject: scales the queue-processor workload based on Azure
# Service Bus queue depth, including scale-to-zero when the queue is empty.
apiVersion: keda.sh/v1alpha1
kind: ScaledObject
metadata:
  name: queue-processor-scaler
spec:
  scaleTargetRef:
    name: queue-processor  # target workload (Deployment by default)
  pollingInterval: 15      # check queue length every 15 seconds
  cooldownPeriod: 300      # wait 5 minutes of inactivity before scaling to zero
  minReplicaCount: 0  # Scale to zero when no messages
  maxReplicaCount: 100
  triggers:
    - type: azure-servicebus
      metadata:
        queueName: orders
        messageCount: "10"  # 1 pod per 10 messages
        connectionFromEnv: SERVICEBUS_CONNECTION  # env var on the target pods holding the connection string

App Service Auto-Scaling

// Programmatic auto-scaling configuration
using Azure.ResourceManager.AppService;
using Azure.ResourceManager.Monitor;

/// <summary>
/// Creates or updates an autoscale setting named "{planName}-autoscale"
/// for an App Service plan: 2-10 instances, +1 when the 5-minute average
/// CPU exceeds 70%, -1 when the 10-minute average CPU drops below 30%.
/// </summary>
/// <param name="resourceGroupName">Resource group containing the plan.</param>
/// <param name="planName">App Service plan name; used to derive the autoscale setting name.</param>
public async Task ConfigureAutoScaling(string resourceGroupName, string planName)
{
    // NOTE(review): appServicePlanId and _monitorClient are presumably
    // members of the enclosing class — confirm they are initialized
    // before this method runs.
    var autoscaleSettings = new AutoscaleSettingData(
        location: AzureLocation.WestUS2,
        profiles: new[]
        {
            new AutoscaleProfile(
                name: "Default",
                // Floor of 2, ceiling of 10, resting capacity of 2 instances.
                capacity: new ScaleCapacity(minimum: "2", maximum: "10", @default: "2"),
                rules: new[]
                {
                    // Scale out: 5-minute average CPU > 70% adds one instance,
                    // then waits 5 minutes before acting again.
                    new ScaleRule(
                        metricTrigger: new MetricTrigger(
                            metricName: "CpuPercentage",
                            metricResourceUri: appServicePlanId,
                            timeGrain: TimeSpan.FromMinutes(1),
                            statistic: MetricStatisticType.Average,
                            timeWindow: TimeSpan.FromMinutes(5),
                            timeAggregation: TimeAggregationType.Average,
                            @operator: ComparisonOperationType.GreaterThan,
                            threshold: 70
                        ),
                        scaleAction: new ScaleAction(
                            direction: ScaleDirection.Increase,
                            scaleType: ScaleType.ChangeCount,
                            value: "1",
                            cooldown: TimeSpan.FromMinutes(5)
                        )
                    ),
                    // Scale in: deliberately slower than scale-out — a longer
                    // 10-minute window and cooldown prevent flapping.
                    new ScaleRule(
                        metricTrigger: new MetricTrigger(
                            metricName: "CpuPercentage",
                            metricResourceUri: appServicePlanId,
                            timeGrain: TimeSpan.FromMinutes(1),
                            statistic: MetricStatisticType.Average,
                            timeWindow: TimeSpan.FromMinutes(10),
                            timeAggregation: TimeAggregationType.Average,
                            @operator: ComparisonOperationType.LessThan,
                            threshold: 30
                        ),
                        scaleAction: new ScaleAction(
                            direction: ScaleDirection.Decrease,
                            scaleType: ScaleType.ChangeCount,
                            value: "1",
                            cooldown: TimeSpan.FromMinutes(10)
                        )
                    )
                }
            )
        },
        isEnabled: true,
        targetResourceUri: appServicePlanId
    );

    // NOTE(review): this fluent call shape matches the older
    // Microsoft.Azure.Management.Monitor SDK; the Azure.ResourceManager.Monitor
    // package imported above uses a resource-collection pattern instead —
    // verify which SDK _monitorClient actually belongs to.
    await _monitorClient.AutoscaleSettings.CreateOrUpdateAsync(
        resourceGroupName,
        $"{planName}-autoscale",
        autoscaleSettings
    );
}

Best Practices

# Rules of thumb for tuning auto-scaling, keyed by practice name.
# Each entry pairs a human-readable description with its supporting
# parameters (windows, cooldowns, tool lists, rationale).
autoscaling_best_practices = dict(
    scale_out_quickly={
        "description": "Scale out faster than you scale in",
        "scale_out_window": "5 minutes",
        "scale_in_window": "10-15 minutes",
        "reason": "User experience matters more than cost during spikes",
    },
    use_multiple_metrics={
        "description": "Don't rely on single metric",
        "example": "CPU AND request count AND response time",
        "reason": "Different workloads stress different resources",
    },
    set_appropriate_cooldowns={
        "description": "Prevent scaling thrashing",
        "scale_out_cooldown": "5 minutes minimum",
        "scale_in_cooldown": "10 minutes minimum",
        "reason": "New instances need time to warm up and stabilize",
    },
    test_scaling_behavior={
        "description": "Load test your auto-scaling",
        "tools": ["Azure Load Testing", "k6", "Locust"],
        "verify": [
            "Scaling triggers correctly",
            "Application handles scaling",
            "No data loss",
        ],
    },
    monitor_and_alert={
        "description": "Track scaling events",
        "alerts": ["Max capacity reached", "Scaling failures", "Unusual patterns"],
    },
)

Common Mistakes

# Frequent autoscaling misconfigurations: the underlying problem, the
# symptom you observe in production, and the corresponding fix.
scaling_mistakes:
  too_aggressive_scale_in:
    problem: Instances removed too quickly
    symptom: Repeated scale-out/scale-in cycles
    fix: Increase scale-in cooldown and stabilization window

  wrong_metric:
    problem: Scaling on metric that doesn't reflect load
    symptom: Over/under provisioned during actual load
    fix: Use request-based or custom application metrics

  no_pre_warming:
    problem: Cold instances can't handle traffic
    symptom: Errors during scale-out
    fix: Pre-warm instances, use health checks with delay

  ignoring_startup_time:
    problem: Instances need time to start
    symptom: Scale-out doesn't help immediately
    fix: Scale earlier based on trends, use predictive scaling

Conclusion

Auto-scaling is powerful but requires careful tuning. Start with conservative settings, monitor behavior, and adjust based on real data. Combine metric-based scaling with schedule-based profiles for predictable patterns. Always test your scaling behavior under load before relying on it in production.

Resources

Michael John Peña

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.