
Azure ML Compute: Choosing the Right Infrastructure

Azure Machine Learning provides several compute options for ML workloads, from single development VMs to auto-scaling GPU clusters and managed endpoints. Today we explore how to choose and configure the right compute for your needs.

Compute Options Overview

compute_types = {
    "compute_instance": {
        "use": "Development, notebooks",
        "scaling": "Single VM",
        "cost": "Pay while running"
    },
    "compute_cluster": {
        "use": "Training jobs",
        "scaling": "Auto-scale 0 to N nodes",
        "cost": "Pay per use"
    },
    "serverless_compute": {
        "use": "Quick experiments",
        "scaling": "Automatic",
        "cost": "Pay per job"
    },
    "kubernetes": {
        "use": "Production deployment",
        "scaling": "Kubernetes-managed",
        "cost": "AKS pricing"
    }
}

Creating Compute Clusters

from azure.ai.ml import MLClient
from azure.ai.ml.entities import AmlCompute
from azure.identity import DefaultAzureCredential

# Connect to workspace
ml_client = MLClient(
    DefaultAzureCredential(),
    subscription_id="your-subscription",
    resource_group_name="your-rg",
    workspace_name="your-workspace"
)

# Create GPU compute cluster
gpu_cluster = AmlCompute(
    name="gpu-cluster",
    type="amlcompute",
    size="Standard_NC6s_v3",  # T4 GPU
    min_instances=0,
    max_instances=4,
    idle_time_before_scale_down=300,
    tier="Dedicated"
)

ml_client.compute.begin_create_or_update(gpu_cluster).result()
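
A compute instance for development work follows the same pattern. Here's a minimal sketch; the instance name and VM size are illustrative, so swap in whatever fits your workload:

from azure.ai.ml.entities import ComputeInstance

# Single-VM compute instance for notebooks and interactive development
dev_instance = ComputeInstance(
    name="dev-instance",     # illustrative name
    size="Standard_DS3_v2"   # illustrative CPU size; pick a GPU size if you need one
)

ml_client.compute.begin_create_or_update(dev_instance).result()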

GPU VM Sizes

gpu_sizes = {
    # Entry-level
    "Standard_NC6s_v3": {
        "gpu": "1x V100 16GB",
        "vcpu": 6,
        "memory": "112 GB",
        "use": "Fine-tuning small models"
    },
    # Mid-range
    "Standard_NC12s_v3": {
        "gpu": "2x V100 16GB",
        "vcpu": 12,
        "memory": "224 GB",
        "use": "Training medium models"
    },
    # High-end
    "Standard_NC24ads_A100_v4": {
        "gpu": "1x A100 80GB",
        "vcpu": 24,
        "memory": "220 GB",
        "use": "Large model training"
    },
    "Standard_ND96asr_v4": {
        "gpu": "8x A100 40GB",
        "vcpu": 96,
        "memory": "900 GB",
        "use": "Distributed training"
    }
}
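
Not every size is available in every region, so it's worth checking what your workspace can actually provision. A rough sketch using the SDK's size listing; verify the call and the returned fields against your SDK version:

# List VM sizes available to the workspace and keep only GPU-equipped ones (sketch)
for size in ml_client.compute.list_sizes():
    if size.gpus:
        print(f"{size.name}: {size.gpus} GPU(s), {size.memory_gb} GB RAM")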

Submitting Training Jobs

from azure.ai.ml import command

# Define training job
job = command(
    code="./src",
    command="python train.py --model-name ${{inputs.model}} --epochs ${{inputs.epochs}}",
    inputs={
        "model": "meta-llama/Llama-2-7b-hf",
        "epochs": 3
    },
    environment="AzureML-pytorch-2.0-cuda11.7@latest",
    compute="gpu-cluster",
    instance_count=1,
    distribution={
        "type": "PyTorch",
        "process_count_per_instance": 1
    }
)

# Submit job
submitted_job = ml_client.jobs.create_or_update(job)
print(f"Job submitted: {submitted_job.name}")

# Monitor
ml_client.jobs.stream(submitted_job.name)
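
Once the job completes, you can pull its logs and outputs locally. A small sketch; the download path is arbitrary:

# Download the job's default outputs and logs after completion (sketch)
ml_client.jobs.download(name=submitted_job.name, download_path="./job-output")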

Distributed Training Configuration

from azure.ai.ml import command

# Multi-node distributed job on the GPU cluster (2 nodes x 1 V100 each)
distributed_job = command(
    code="./src",
    command="python train.py",  # the distribution config below launches the worker processes
    environment="AzureML-pytorch-2.0-cuda11.7@latest",
    compute="gpu-cluster",
    instance_count=2,  # 2 nodes
    distribution={
        "type": "PyTorch",
        "process_count_per_instance": 1  # Standard_NC6s_v3 has a single V100
    }
)

ml_client.jobs.create_or_update(distributed_job)
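
Inside train.py, each worker picks up the rank and world size that the PyTorch launcher exposes as environment variables. A minimal sketch of the per-process setup; the script structure is illustrative, not the exact train.py used above:

import os

import torch
import torch.distributed as dist

# Join the process group using the MASTER_ADDR/RANK/WORLD_SIZE env vars set by the launcher
dist.init_process_group(backend="nccl")

# Pin each process to its local GPU
local_rank = int(os.environ.get("LOCAL_RANK", 0))
torch.cuda.set_device(local_rank)

# Wrap your model so gradients sync across processes, e.g.:
# model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank])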

Cost Optimization

# Use low-priority (spot) capacity for interruptible training
spot_cluster = AmlCompute(
    name="spot-gpu-cluster",
    size="Standard_NC24ads_A100_v4",
    min_instances=0,
    max_instances=4,
    tier="LowPriority",  # spot-priced capacity that can be preempted
    idle_time_before_scale_down=120
)

ml_client.compute.begin_create_or_update(spot_cluster).result()

# Cost comparison (indicative pay-as-you-go list prices; actual rates vary by region and over time)
cost_comparison = {
    "Standard_NC6s_v3": {
        "dedicated": "$0.90/hour",
        "spot": "$0.27/hour",
        "savings": "70%"
    },
    "Standard_NC24ads_A100_v4": {
        "dedicated": "$3.67/hour",
        "spot": "$1.10/hour",
        "savings": "70%"
    }
}

Managed Online Endpoints

from azure.ai.ml.entities import (
    CodeConfiguration,
    ManagedOnlineEndpoint,
    ManagedOnlineDeployment,
    Model
)

# Create endpoint
endpoint = ManagedOnlineEndpoint(
    name="llm-endpoint",
    auth_mode="key"
)
ml_client.online_endpoints.begin_create_or_update(endpoint).result()

# Deploy model
deployment = ManagedOnlineDeployment(
    name="llm-deployment",
    endpoint_name="llm-endpoint",
    model=Model(path="./model"),
    instance_type="Standard_NC6s_v3",
    instance_count=1,
    environment="AzureML-pytorch-2.0-cuda11.7@latest",
    code_configuration=CodeConfiguration(
        code="./score",
        scoring_script="score.py"
    )
)
ml_client.online_deployments.begin_create_or_update(deployment).result()
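
A deployment doesn't receive requests until traffic is routed to it. Here's a sketch of allocating traffic and invoking the endpoint; the request file name is illustrative:

# Route 100% of traffic to the new deployment
endpoint.traffic = {"llm-deployment": 100}
ml_client.online_endpoints.begin_create_or_update(endpoint).result()

# Invoke the endpoint with a sample payload (file name is illustrative)
response = ml_client.online_endpoints.invoke(
    endpoint_name="llm-endpoint",
    deployment_name="llm-deployment",
    request_file="./sample-request.json"
)
print(response)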

Monitoring Compute

# Get compute status
compute = ml_client.compute.get("gpu-cluster")
print(f"State: {compute.provisioning_state}")
print(f"Current nodes: {compute.current_instance_count}")

# List running jobs
jobs = ml_client.jobs.list()
for job in jobs:
    if job.status == "Running":
        print(f"{job.name}: {job.status}")

Tomorrow we’ll explore spot instances for ML workloads.


Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.