August 19, 2022 1 min read

Hyperparameter Tuning with Azure ML Sweep Jobs

Azure Machine Learning Hyperparameter Tuning Sweep Optimization

Sweep jobs in Azure ML automate hyperparameter tuning by launching many training trials, each with a different combination of hyperparameter values, and comparing them on a chosen primary metric. This helps find optimal model configurations efficiently.

Basic Sweep Job

from azure.ai.ml import MLClient, command, Input
from azure.ai.ml.sweep import Choice, Uniform, LogUniform, BanditPolicy
from azure.identity import DefaultAzureCredential

# Workspace client used for the job and model calls in this walkthrough.
# Replace the three placeholder strings with your own identifiers.
ml_client = MLClient(
    DefaultAzureCredential(),
    subscription_id="your-subscription",
    resource_group_name="your-rg",
    workspace_name="your-workspace",
)

# Define training command with hyperparameters.
# The scalar inputs below are the values used for a plain (non-sweep) run;
# a sweep replaces them per trial with values drawn from the search space.
training_command = command(
    code="./src",
    command="""python train.py \
        --data ${{inputs.training_data}} \
        --learning-rate ${{inputs.learning_rate}} \
        --n-estimators ${{inputs.n_estimators}} \
        --max-depth ${{inputs.max_depth}} \
        --min-samples-split ${{inputs.min_samples_split}}""",
    inputs={
        # Input() defaults to type "uri_folder". This path is a single CSV
        # that train.py opens with pd.read_csv, so declare it as a file.
        "training_data": Input(
            type="uri_file",
            path="azureml://datastores/workspaceblobstore/paths/train.csv",
        ),
        "learning_rate": 0.1,    # Will be swept
        "n_estimators": 100,     # Will be swept
        "max_depth": 5,          # Will be swept
        "min_samples_split": 2,  # Will be swept
    },
    environment="AzureML-sklearn-1.0-ubuntu20.04-py38-cpu@latest",
    compute="cpu-cluster",
)

# Define search space.
# Calling the command with distributions returns a copy whose matching inputs
# are swept instead of fixed; this must happen before .sweep(). Each keyword
# has to name an input that the command string references.
command_for_sweep = training_command(
    # LogUniform draws exp(uniform(min_value, max_value)), so the bounds are
    # natural-log exponents: exp(-4) ~= 0.018 up to exp(-1) ~= 0.37.
    # For a 0.0001-0.1 range use min_value=math.log(1e-4) (~ -9.21) and
    # max_value=math.log(0.1) (~ -2.30) instead.
    learning_rate=LogUniform(min_value=-4, max_value=-1),
    n_estimators=Choice([50, 100, 200, 500]),
    max_depth=Choice([3, 5, 7, 10, 15]),
    min_samples_split=Choice([2, 5, 10, 20]),
)

# Configure sweep
sweep_job = command_for_sweep.sweep(
    compute="cpu-cluster",
    sampling_algorithm="random",
    primary_metric="accuracy",  # must equal the metric name train.py logs
    goal="maximize",
)

# sweep() does not take a `limits=` dict; limits are applied to the returned
# sweep job through set_limits().
sweep_job.set_limits(
    max_total_trials=50,
    max_concurrent_trials=10,
    timeout=3600,
)

# Submit sweep job
returned_job = ml_client.jobs.create_or_update(sweep_job)
print(f"Sweep job submitted: {returned_job.name}")

Training Script for Sweep

# src/train.py
import argparse
import pandas as pd
import mlflow
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def train(args):
    """Cross-validate and fit a random forest, logging everything to MLflow.

    Logs the four hyperparameters, the mean and standard deviation of 5-fold
    cross-validation accuracy, and the model fitted on the full dataset.

    Args:
        args: Parsed command-line namespace with ``data`` (path to a CSV that
            contains a ``target`` column), ``learning_rate``,
            ``n_estimators``, ``max_depth`` and ``min_samples_split``.

    Note:
        ``learning_rate`` is only logged; it is not passed to
        ``RandomForestClassifier``, so sweeping it does not change the model.
    """
    # The context manager ends the MLflow run even when loading or training
    # raises, instead of leaving the run open.
    with mlflow.start_run():
        # Log hyperparameters
        for name in ("learning_rate", "n_estimators", "max_depth", "min_samples_split"):
            mlflow.log_param(name, getattr(args, name))

        # Load data
        df = pd.read_csv(args.data)
        X = df.drop("target", axis=1)
        y = df["target"]

        # Train model
        model = RandomForestClassifier(
            n_estimators=args.n_estimators,
            max_depth=args.max_depth,
            min_samples_split=args.min_samples_split,
            random_state=42,
        )

        # Cross-validation
        scores = cross_val_score(model, X, y, cv=5)
        accuracy = scores.mean()

        # Log metrics; "accuracy" is the name the sweep uses as primary metric
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("accuracy_std", scores.std())

        # Train final model on all rows (cross_val_score does not fit `model`
        # itself) and store it under the "model" artifact path
        model.fit(X, y)
        mlflow.sklearn.log_model(model, "model")

if __name__ == "__main__":
    # Command-line entry point. The sweep passes every flag explicitly; the
    # defaults only matter when the script is run by hand with just --data.
    parser = argparse.ArgumentParser(
        description="Train a random forest and log accuracy to MLflow."
    )
    parser.add_argument("--data", required=True, help="Path to the training CSV")
    parser.add_argument(
        "--learning-rate", type=float, default=None,
        help="Logged only; not used by the random forest",
    )
    # Without defaults, omitted flags arrive as None, which
    # RandomForestClassifier rejects for n_estimators and min_samples_split.
    # The values below are scikit-learn's own defaults.
    parser.add_argument("--n-estimators", type=int, default=100)
    parser.add_argument(
        "--max-depth", type=int, default=None,
        help="Maximum tree depth; omit for unlimited depth",
    )
    parser.add_argument("--min-samples-split", type=int, default=2)
    args = parser.parse_args()

    train(args)

Sampling Algorithms

# SDK v2 names are *SamplingAlgorithm; the *ParameterSampling names belong to
# the v1 hyperdrive package and are not exported by azure.ai.ml.sweep.
from azure.ai.ml.sweep import (
    RandomSamplingAlgorithm,
    GridSamplingAlgorithm,
    BayesianSamplingAlgorithm,
)

# The string shorthands below select the algorithm with default settings;
# pass an instance of one of the classes above instead when you need to
# configure it.

# Random sampling
sweep_job_random = training_command.sweep(
    sampling_algorithm="random",
    primary_metric="accuracy",
    goal="maximize",
)

# Grid sampling (exhaustive); every swept parameter must be a Choice
sweep_job_grid = training_command.sweep(
    sampling_algorithm="grid",
    primary_metric="accuracy",
    goal="maximize",
)

# Bayesian optimization; supports Choice, Uniform and QUniform distributions
sweep_job_bayesian = training_command.sweep(
    sampling_algorithm="bayesian",
    primary_metric="accuracy",
    goal="maximize",
)

Search Space Types

from azure.ai.ml.sweep import (
    Choice,
    Uniform,
    LogUniform,
    Normal,
    LogNormal,
    QUniform,
    QLogUniform,
    QNormal,
    QLogNormal
)

# Sweep jobs have no set_inputs(); the search space is a dict keyed by
# parameter name. Each key must correspond to an input that the trial command
# references. The parameters here only illustrate the available distribution
# types. NOTE(review): assigning this replaces any search space already set
# on sweep_job - confirm that is intended before reusing the snippet.
sweep_job.search_space = {
    # Categorical
    "optimizer": Choice(["adam", "sgd", "rmsprop"]),

    # Continuous uniform
    "dropout": Uniform(min_value=0.1, max_value=0.5),

    # Log-uniform (good for learning rates). Bounds are natural-log
    # exponents: exp(-5) ~= 0.0067 up to exp(-1) ~= 0.37.
    "learning_rate": LogUniform(min_value=-5, max_value=-1),

    # Normal distribution
    "weight_decay": Normal(mu=0.01, sigma=0.005),

    # Quantized (discrete steps)
    "batch_size": QUniform(min_value=16, max_value=128, q=16),  # 16, 32, 48, ...

    # Integer choice
    "num_layers": Choice([1, 2, 3, 4, 5]),
}

Early Termination Policies

from azure.ai.ml.sweep import BanditPolicy, MedianStoppingPolicy, TruncationSelectionPolicy

# The three assignments below are alternatives: each one overwrites
# sweep_job.early_termination, so only the last policy assigned stays in
# effect. Keep the one you want.

# Option 1 - bandit: stop runs that fall behind the best one
sweep_job.early_termination = BanditPolicy(
    delay_evaluation=5,     # skip the first 5 intervals before checking
    evaluation_interval=1,  # then check at every interval
    slack_factor=0.1,       # allowed slack relative to the best run (10%)
)

# Option 2 - median stopping
sweep_job.early_termination = MedianStoppingPolicy(
    delay_evaluation=5,
    evaluation_interval=1,
)

# Option 3 - truncation selection: stop the worst performers
sweep_job.early_termination = TruncationSelectionPolicy(
    evaluation_interval=1,
    truncation_percentage=25,  # bottom 25% are terminated
)

Analyzing Sweep Results

# Get sweep job (re-fetch so status and properties reflect the finished sweep)
sweep_job = ml_client.jobs.get(returned_job.name)

# Get best trial; .get() yields None if the key is not present
best_trial = sweep_job.properties.get("best_child_run_id")
print(f"Best trial: {best_trial}")

# Get all child runs
child_jobs = ml_client.jobs.list(parent_job_name=sweep_job.name)

# One row per completed trial.
# NOTE(review): this reads the metric from job.properties and the sampled
# values from job.inputs - confirm the service populates both for child
# trials; missing keys come back as None.
results = [
    {
        "run_id": job.name,
        "accuracy": job.properties.get("accuracy"),
        "learning_rate": job.inputs.get("learning_rate"),
        "n_estimators": job.inputs.get("n_estimators"),
    }
    for job in child_jobs
    if job.status == "Completed"
]

# Create results dataframe. Passing the columns explicitly keeps the frame
# well-formed when no trial has completed; without them an empty frame has no
# "accuracy" column and sort_values raises KeyError.
result_columns = ["run_id", "accuracy", "learning_rate", "n_estimators"]
results_df = pd.DataFrame(results, columns=result_columns).sort_values(
    "accuracy", ascending=False
)
print(results_df.head(10))

Visualization

import matplotlib.pyplot as plt
import seaborn as sns

# Parallel coordinates plot
fig, ax = plt.subplots(figsize=(12, 6))
pd.plotting.parallel_coordinates(
    results_df,
    'accuracy',
    # Plot only the numeric hyperparameter columns; by default every column
    # other than the class column is drawn, including the string run_id.
    cols=['learning_rate', 'n_estimators'],
    colormap='viridis',
    ax=ax,  # draw on the axes created above rather than the implicit current axes
)
ax.set_title('Hyperparameter Space Exploration')
fig.savefig('sweep_results.png')

# Hyperparameter vs metric scatter plots
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

axes[0, 0].scatter(results_df['learning_rate'], results_df['accuracy'])
axes[0, 0].set_xlabel('Learning Rate')
axes[0, 0].set_ylabel('Accuracy')
axes[0, 0].set_xscale('log')  # learning rates span orders of magnitude

axes[0, 1].scatter(results_df['n_estimators'], results_df['accuracy'])
axes[0, 1].set_xlabel('N Estimators')
axes[0, 1].set_ylabel('Accuracy')

# ... more plots (the bottom row of the 2x2 grid is left empty here)

fig.tight_layout()
fig.savefig('hyperparameter_analysis.png')

Registering Best Model

from azure.ai.ml.entities import Model

# best_trial was read with properties.get(), so it is None when the sweep has
# not recorded a best child run. Stop here with a clear message instead of
# passing None to the job APIs below.
if not best_trial:
    raise RuntimeError(
        f"Sweep {sweep_job.name} has no best trial recorded; nothing to register"
    )

# Get best model from sweep
best_run = ml_client.jobs.get(best_trial)

# Download model
# NOTE(review): output_name="model" asks for a job output named "model",
# while train.py stores the model through mlflow.sklearn.log_model - confirm
# the files actually land under ./best_model/model before registering.
ml_client.jobs.download(
    name=best_trial,
    output_name="model",
    download_path="./best_model",
)

# Register model
model = Model(
    path="./best_model/model",
    name="optimized-classifier",
    description=f"Best model from sweep {sweep_job.name}",
    properties={
        # Property values are stored as strings; a missing metric becomes "None"
        "accuracy": str(best_run.properties.get("accuracy")),
        "sweep_job": sweep_job.name,
    },
)

ml_client.models.create_or_update(model)

Sweep jobs automate the tedious process of hyperparameter tuning, helping you find optimal configurations faster.