Back to Blog
5 min read

Auto-Scaling Best Practices in Azure

Proper auto-scaling balances performance and cost. Too aggressive and you waste money; too conservative and users suffer. Let’s explore how to get it right.

Auto-Scaling Fundamentals

Key Metrics to Scale On

# Recommended autoscale signals per Azure compute service.
# "primary" metrics are the usual first-line triggers; "secondary"
# metrics catch load patterns that CPU/memory alone can miss.
scaling_metrics:
  # Virtual machines / VM scale sets
  compute:
    primary:
      - CPU percentage
      - Memory percentage
    secondary:
      - Request count
      - Response time
      - Queue depth

  # Azure App Service plans
  app_service:
    primary:
      - CPU percentage
      - Memory percentage
    secondary:
      - HTTP queue length
      - Data in/out
      - Request count

  # Azure Kubernetes Service (pod-level via HPA)
  aks:
    primary:
      - CPU utilization
      - Memory utilization
    secondary:
      - Custom metrics (Prometheus)
      - External metrics (Azure Monitor)

VMSS Auto-Scaling

// Shared CPU scale-out/scale-in rules. Azure autoscale profiles do NOT
// inherit rules from other profiles — a profile with an empty rules array
// simply pins capacity and performs no metric-based scaling. Every profile
// that should react to load must therefore carry its own copy.
var cpuScaleRules = [
  // Scale out on sustained high CPU (5-minute average > 70%), +2 at a time.
  {
    metricTrigger: {
      metricName: 'Percentage CPU'
      metricResourceUri: vmss.id
      timeGrain: 'PT1M'
      statistic: 'Average'
      timeWindow: 'PT5M'
      timeAggregation: 'Average'
      operator: 'GreaterThan'
      threshold: 70
    }
    scaleAction: {
      direction: 'Increase'
      type: 'ChangeCount'
      value: '2'
      cooldown: 'PT5M'
    }
  }
  // Scale in on sustained low CPU (10-minute average < 30%), -1 at a time.
  // Longer window and cooldown than scale-out so brief lulls don't cause
  // flapping.
  {
    metricTrigger: {
      metricName: 'Percentage CPU'
      metricResourceUri: vmss.id
      timeGrain: 'PT1M'
      statistic: 'Average'
      timeWindow: 'PT10M'  // Longer window for scale-in
      timeAggregation: 'Average'
      operator: 'LessThan'
      threshold: 30
    }
    scaleAction: {
      direction: 'Decrease'
      type: 'ChangeCount'
      value: '1'
      cooldown: 'PT10M'  // Longer cooldown for scale-in
    }
  }
]

// Metric- plus schedule-based autoscaling for the VM scale set.
// Only one profile is active at a time: the recurrence profiles apply
// during their windows and the Default profile applies otherwise.
resource autoscaleSettings 'Microsoft.Insights/autoscalesettings@2022-10-01' = {
  name: 'vmss-autoscale'
  location: location
  properties: {
    name: 'vmss-autoscale'
    targetResourceUri: vmss.id
    enabled: true
    profiles: [
      {
        name: 'Default'
        capacity: {
          minimum: '2'
          maximum: '20'
          default: '4'
        }
        rules: cpuScaleRules
      }
      // Schedule-based profile for known weekday business-hours load.
      // Rules must be repeated here: empty rules would freeze capacity.
      {
        name: 'BusinessHours'
        capacity: {
          minimum: '4'
          maximum: '20'
          default: '6'
        }
        recurrence: {
          frequency: 'Week'
          schedule: {
            timeZone: 'Pacific Standard Time'
            days: ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
            hours: [8]
            minutes: [0]
          }
        }
        rules: cpuScaleRules
      }
      // Reduced-capacity profile outside business hours; still scales on CPU.
      {
        name: 'AfterHours'
        capacity: {
          minimum: '2'
          maximum: '10'
          default: '2'
        }
        recurrence: {
          frequency: 'Week'
          schedule: {
            timeZone: 'Pacific Standard Time'
            days: ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
            hours: [18]
            minutes: [0]
          }
        }
        rules: cpuScaleRules
      }
    ]
    // Notify the ops team (email + Slack webhook) on every scale event.
    notifications: [
      {
        operation: 'Scale'
        email: {
          sendToSubscriptionAdministrator: false
          customEmails: ['ops-team@company.com']
        }
        webhooks: [
          {
            serviceUri: 'https://hooks.slack.com/...'
          }
        ]
      }
    ]
  }
}

Kubernetes Auto-Scaling

Horizontal Pod Autoscaler

# HPA (autoscaling/v2) for the "api" Deployment: keeps 3-50 replicas,
# scaling on CPU, memory, and a Prometheus-fed requests-per-second metric.
# With multiple metrics, the HPA uses the highest replica count proposed
# by any of them.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: api-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: api
  minReplicas: 3
  maxReplicas: 50
  metrics:
    # CPU-based scaling: target 70% average utilization across pods
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70

    # Memory-based scaling: target 80% average utilization across pods
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80

    # Custom per-pod metric from Prometheus (via a metrics adapter):
    # target ~100 requests/second per pod on average
    - type: Pods
      pods:
        metric:
          name: http_requests_per_second
        target:
          type: AverageValue
          averageValue: 100

  behavior:
    # Scale up aggressively: per minute, either double the replica count
    # or add 4 pods — whichever is larger (selectPolicy: Max).
    scaleUp:
      stabilizationWindowSeconds: 60
      policies:
        - type: Percent
          value: 100
          periodSeconds: 60
        - type: Pods
          value: 4
          periodSeconds: 60
      selectPolicy: Max

    # Scale down conservatively: at most 10% of replicas per minute,
    # and only after recommendations have been stable for 5 minutes.
    scaleDown:
      stabilizationWindowSeconds: 300  # 5 minutes stabilization
      policies:
        - type: Percent
          value: 10
          periodSeconds: 60
      selectPolicy: Min

Cluster Autoscaler

// AKS cluster with three node pools: a small stable system pool, an
// autoscaled on-demand workload pool, and a scale-from-zero spot pool
// for interruptible burst work.
resource aks 'Microsoft.ContainerService/managedClusters@2022-07-01' = {
  name: 'aks-cluster'
  location: location
  properties: {
    // NOTE(review): example omits properties AKS normally requires
    // (e.g. dnsPrefix, identity) — confirm before deploying as-is.
    agentPoolProfiles: [
      // System pool: hosts cluster-critical pods; kept small (3-5 nodes).
      {
        name: 'system'
        count: 3
        vmSize: 'Standard_D4s_v3'
        mode: 'System'
        enableAutoScaling: true
        minCount: 3
        maxCount: 5
      }
      // General workload pool on regular-priority (on-demand) VMs.
      {
        name: 'workload'
        count: 5
        vmSize: 'Standard_D8s_v3'
        mode: 'User'
        enableAutoScaling: true
        minCount: 3
        maxCount: 20
        scaleSetPriority: 'Regular'
      }
      // Spot pool: starts at zero and scales up to 50 nodes on demand.
      // Evicted VMs are deleted; spotMaxPrice -1 means "pay up to the
      // current on-demand price" (never evicted for price reasons).
      {
        name: 'spot'
        count: 0
        vmSize: 'Standard_D8s_v3'
        mode: 'User'
        enableAutoScaling: true
        minCount: 0
        maxCount: 50
        scaleSetPriority: 'Spot'
        scaleSetEvictionPolicy: 'Delete'
        spotMaxPrice: -1
      }
    ]
    // Cluster-autoscaler tuning (values are strings per the AKS schema).
    autoScalerProfile: {
      balanceSimilarNodeGroups: 'true'
      scanInterval: '10s'
      scaleDownDelayAfterAdd: '10m'        // wait 10m after a scale-up before considering scale-down
      scaleDownDelayAfterDelete: '10s'
      scaleDownUnneededTime: '10m'         // node must be underutilized this long before removal
      scaleDownUtilizationThreshold: '0.5' // node counts as underutilized below 50%
    }
  }
}

KEDA for Event-Driven Scaling

# KEDA ScaledObject: scales the queue-processor workload based on Azure
# Service Bus queue depth, including scale-to-zero when the queue is empty.
apiVersion: keda.sh/v1alpha1
kind: ScaledObject
metadata:
  name: queue-processor-scaler
spec:
  scaleTargetRef:
    name: queue-processor  # target workload (Deployment by default)
  pollingInterval: 15      # check queue length every 15 seconds
  cooldownPeriod: 300      # wait 5 minutes of inactivity before scaling to zero
  minReplicaCount: 0  # Scale to zero when no messages
  maxReplicaCount: 100
  triggers:
    - type: azure-servicebus
      metadata:
        queueName: orders
        messageCount: "10"  # 1 pod per 10 messages
        connectionFromEnv: SERVICEBUS_CONNECTION  # env var on the target pods holding the connection string

App Service Auto-Scaling

// Programmatic auto-scaling configuration
using Azure.ResourceManager.AppService;
using Azure.ResourceManager.Monitor;

/// <summary>
/// Creates or updates an autoscale setting named "{planName}-autoscale"
/// for an App Service plan: 2-10 instances, +1 when the 5-minute average
/// CPU exceeds 70%, -1 when the 10-minute average CPU drops below 30%.
/// </summary>
/// <param name="resourceGroupName">Resource group containing the plan.</param>
/// <param name="planName">App Service plan name; used to derive the autoscale setting name.</param>
public async Task ConfigureAutoScaling(string resourceGroupName, string planName)
{
    // NOTE(review): appServicePlanId and _monitorClient are presumably
    // members of the enclosing class — confirm they are initialized
    // before this method runs.
    var autoscaleSettings = new AutoscaleSettingData(
        location: AzureLocation.WestUS2,
        profiles: new[]
        {
            new AutoscaleProfile(
                name: "Default",
                // Floor of 2, ceiling of 10, resting capacity of 2 instances.
                capacity: new ScaleCapacity(minimum: "2", maximum: "10", @default: "2"),
                rules: new[]
                {
                    // Scale out: 5-minute average CPU > 70% adds one instance,
                    // then waits 5 minutes before acting again.
                    new ScaleRule(
                        metricTrigger: new MetricTrigger(
                            metricName: "CpuPercentage",
                            metricResourceUri: appServicePlanId,
                            timeGrain: TimeSpan.FromMinutes(1),
                            statistic: MetricStatisticType.Average,
                            timeWindow: TimeSpan.FromMinutes(5),
                            timeAggregation: TimeAggregationType.Average,
                            @operator: ComparisonOperationType.GreaterThan,
                            threshold: 70
                        ),
                        scaleAction: new ScaleAction(
                            direction: ScaleDirection.Increase,
                            scaleType: ScaleType.ChangeCount,
                            value: "1",
                            cooldown: TimeSpan.FromMinutes(5)
                        )
                    ),
                    // Scale in: deliberately slower than scale-out — a longer
                    // 10-minute window and cooldown prevent flapping.
                    new ScaleRule(
                        metricTrigger: new MetricTrigger(
                            metricName: "CpuPercentage",
                            metricResourceUri: appServicePlanId,
                            timeGrain: TimeSpan.FromMinutes(1),
                            statistic: MetricStatisticType.Average,
                            timeWindow: TimeSpan.FromMinutes(10),
                            timeAggregation: TimeAggregationType.Average,
                            @operator: ComparisonOperationType.LessThan,
                            threshold: 30
                        ),
                        scaleAction: new ScaleAction(
                            direction: ScaleDirection.Decrease,
                            scaleType: ScaleType.ChangeCount,
                            value: "1",
                            cooldown: TimeSpan.FromMinutes(10)
                        )
                    )
                }
            )
        },
        isEnabled: true,
        targetResourceUri: appServicePlanId
    );

    // NOTE(review): this fluent call shape matches the older
    // Microsoft.Azure.Management.Monitor SDK; the Azure.ResourceManager.Monitor
    // package imported above uses a resource-collection pattern instead —
    // verify which SDK _monitorClient actually belongs to.
    await _monitorClient.AutoscaleSettings.CreateOrUpdateAsync(
        resourceGroupName,
        $"{planName}-autoscale",
        autoscaleSettings
    );
}

Best Practices

# Rules of thumb for tuning auto-scaling, keyed by practice name.
# Each entry pairs a human-readable description with its supporting
# parameters (windows, cooldowns, tool lists, rationale).
autoscaling_best_practices = dict(
    scale_out_quickly={
        "description": "Scale out faster than you scale in",
        "scale_out_window": "5 minutes",
        "scale_in_window": "10-15 minutes",
        "reason": "User experience matters more than cost during spikes",
    },
    use_multiple_metrics={
        "description": "Don't rely on single metric",
        "example": "CPU AND request count AND response time",
        "reason": "Different workloads stress different resources",
    },
    set_appropriate_cooldowns={
        "description": "Prevent scaling thrashing",
        "scale_out_cooldown": "5 minutes minimum",
        "scale_in_cooldown": "10 minutes minimum",
        "reason": "New instances need time to warm up and stabilize",
    },
    test_scaling_behavior={
        "description": "Load test your auto-scaling",
        "tools": ["Azure Load Testing", "k6", "Locust"],
        "verify": [
            "Scaling triggers correctly",
            "Application handles scaling",
            "No data loss",
        ],
    },
    monitor_and_alert={
        "description": "Track scaling events",
        "alerts": ["Max capacity reached", "Scaling failures", "Unusual patterns"],
    },
)

Common Mistakes

# Frequent autoscaling misconfigurations: the underlying problem, the
# symptom you observe in production, and the corresponding fix.
scaling_mistakes:
  too_aggressive_scale_in:
    problem: Instances removed too quickly
    symptom: Repeated scale-out/scale-in cycles
    fix: Increase scale-in cooldown and stabilization window

  wrong_metric:
    problem: Scaling on metric that doesn't reflect load
    symptom: Over/under provisioned during actual load
    fix: Use request-based or custom application metrics

  no_pre_warming:
    problem: Cold instances can't handle traffic
    symptom: Errors during scale-out
    fix: Pre-warm instances, use health checks with delay

  ignoring_startup_time:
    problem: Instances need time to start
    symptom: Scale-out doesn't help immediately
    fix: Scale earlier based on trends, use predictive scaling

Conclusion

Auto-scaling is powerful but requires careful tuning. Start with conservative settings, monitor behavior, and adjust based on real data. Combine metric-based scaling with schedule-based profiles for predictable patterns. Always test your scaling behavior under load before relying on it in production.

Resources

Michael John Peña

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.