Skip to content
Back to Blog
1 min read

Hyperparameter Tuning with Azure ML Sweep Jobs

I wrote “Hyperparameter Tuning with Azure ML Sweep Jobs” to share practical, production-minded guidance on this topic.

Basic Sweep Job

from azure.ai.ml import MLClient, command, Input
from azure.ai.ml.sweep import Choice, Uniform, LogUniform, BanditPolicy
from azure.identity import DefaultAzureCredential

# Client handle for the Azure ML workspace; every later snippet goes through it.
# Replace the three placeholder strings with your own subscription,
# resource group and workspace names.
ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id="your-subscription",
    resource_group_name="your-rg",
    workspace_name="your-workspace"
)

# Define training command with hyperparameters.
# Each ${{inputs.<name>}} placeholder is filled from the `inputs` mapping below
# and forwarded to train.py as a command-line flag.
training_command = command(
    code="./src",
    command="""python train.py \
        --data ${{inputs.training_data}} \
        --learning-rate ${{inputs.learning_rate}} \
        --n-estimators ${{inputs.n_estimators}} \
        --max-depth ${{inputs.max_depth}} \
        --min-samples-split ${{inputs.min_samples_split}}""",
    inputs={
        # The path points at a single CSV that train.py hands to pd.read_csv,
        # so declare it as a file; an Input without `type` is a uri_folder.
        "training_data": Input(
            type="uri_file",
            path="azureml://datastores/workspaceblobstore/paths/train.csv",
        ),
        # Baseline values; the sweep search space overrides them per trial.
        "learning_rate": 0.1,
        "n_estimators": 100,
        "max_depth": 5,
        "min_samples_split": 2,
    },
    environment="AzureML-sklearn-1.0-ubuntu20.04-py38-cpu@latest",
    compute="cpu-cluster"
)

# Define search space
# The search space is attached by calling the command with sweep distributions
# in place of the literal input values; this must happen before .sweep().
import math

training_command_for_sweep = training_command(
    # LogUniform draws exp(uniform(min_value, max_value)), so the bounds are
    # natural logarithms of the desired range: 0.0001 to 0.1.
    learning_rate=LogUniform(min_value=math.log(1e-4), max_value=math.log(1e-1)),
    n_estimators=Choice(values=[50, 100, 200, 500]),
    max_depth=Choice(values=[3, 5, 7, 10, 15]),
    min_samples_split=Choice(values=[2, 5, 10, 20]),
)

# Configure sweep
# primary_metric must match the metric name the training script logs.
sweep_job = training_command_for_sweep.sweep(
    compute="cpu-cluster",
    sampling_algorithm="random",
    primary_metric="accuracy",
    goal="maximize",
)

# Limits are set on the sweep node rather than passed to sweep():
# at most 50 trials overall, 10 at a time, and a 3600-second budget
# for the whole sweep.
sweep_job.set_limits(
    max_total_trials=50,
    max_concurrent_trials=10,
    timeout=3600,
)

# Submit sweep job
# The returned job object carries the name used by the later snippets
# to look the sweep up again via ml_client.jobs.get(...).
returned_job = ml_client.jobs.create_or_update(sweep_job)
print(f"Sweep job submitted: {returned_job.name}")

Training Script for Sweep

# src/train.py
import argparse
import pandas as pd
import mlflow
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def train(args):
    """Cross-validate and fit a random forest, logging the trial to MLflow.

    Logs the hyperparameters, the mean and standard deviation of the 5-fold
    cross-validation score, and the model refit on the full dataset.

    Args:
        args: Parsed command-line namespace providing ``data`` (path to a CSV
            file containing a ``target`` column), ``learning_rate``,
            ``n_estimators``, ``max_depth`` and ``min_samples_split``.
    """
    # Using the run as a context manager closes it even when loading or
    # training raises, instead of leaving the run open until process exit.
    with mlflow.start_run():
        # Log hyperparameters
        # NOTE: learning_rate is recorded only; it is not passed to
        # RandomForestClassifier below, so it does not influence training.
        mlflow.log_params({
            "learning_rate": args.learning_rate,
            "n_estimators": args.n_estimators,
            "max_depth": args.max_depth,
            "min_samples_split": args.min_samples_split,
        })

        # Load data: every column except "target" is used as a feature.
        df = pd.read_csv(args.data)
        X = df.drop("target", axis=1)
        y = df["target"]

        # Train model (fixed seed so trials differ only by hyperparameters)
        model = RandomForestClassifier(
            n_estimators=args.n_estimators,
            max_depth=args.max_depth,
            min_samples_split=args.min_samples_split,
            random_state=42,
        )

        # Cross-validation
        scores = cross_val_score(model, X, y, cv=5)
        accuracy = scores.mean()

        # Log metrics; "accuracy" is the name a sweep must reference as its
        # primary metric.
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("accuracy_std", scores.std())

        # Train final model on all rows and store it under the "model"
        # artifact path of this run.
        model.fit(X, y)
        mlflow.sklearn.log_model(model, "model")

if __name__ == "__main__":
    # Hyperparameter flags default to the baseline values (0.1 / 100 / 5 / 2)
    # so the script also runs standalone; without defaults an omitted flag
    # would reach the classifier as None.
    parser = argparse.ArgumentParser(
        description="Train a random-forest classifier for one sweep trial."
    )
    parser.add_argument("--data", required=True,
                        help="Path to the training CSV (needs a 'target' column).")
    parser.add_argument("--learning-rate", type=float, default=0.1,
                        help="Logged with the trial; not used by the model.")
    parser.add_argument("--n-estimators", type=int, default=100,
                        help="Number of trees in the forest.")
    parser.add_argument("--max-depth", type=int, default=5,
                        help="Maximum depth of each tree.")
    parser.add_argument("--min-samples-split", type=int, default=2,
                        help="Minimum samples required to split a node.")
    args = parser.parse_args()

    train(args)

Sampling Algorithms

# SDK v2 names for the sampling algorithm classes. Passing one of these
# objects instead of the string form allows extra settings, e.g.
# RandomSamplingAlgorithm(seed=123) for a reproducible sweep.
from azure.ai.ml.sweep import (
    BayesianSamplingAlgorithm,
    GridSamplingAlgorithm,
    RandomSamplingAlgorithm,
)

# Random sampling: draws each trial independently from the search space.
sweep_job_random = training_command.sweep(
    sampling_algorithm="random",
    primary_metric="accuracy",
    goal="maximize"
)

# Grid sampling (exhaustive): tries every combination, so the search space
# must consist of discrete Choice values only.
sweep_job_grid = training_command.sweep(
    sampling_algorithm="grid",
    primary_metric="accuracy",
    goal="maximize"
)

# Bayesian optimization: picks new trials based on how earlier ones scored.
# It supports only Choice, Uniform and QUniform distributions.
sweep_job_bayesian = training_command.sweep(
    sampling_algorithm="bayesian",
    primary_metric="accuracy",
    goal="maximize"
)

Search Space Types

from azure.ai.ml.sweep import (
    Choice,
    Uniform,
    LogUniform,
    Normal,
    LogNormal,
    QUniform,
    QLogUniform,
    QNormal,
    QLogNormal
)

# Illustrative search space showing one example of each distribution family.
# NOTE(review): none of these input names except learning_rate are declared on
# the training command shown earlier, so this is a catalogue of distribution
# types rather than a runnable continuation of that sweep.
# NOTE(review): confirm that `set_inputs` is available on the sweep node; the
# Azure ML SDK v2 docs define the search space by calling the command with
# distribution-valued inputs before `.sweep()`.
sweep_job.set_inputs(
    # Categorical
    optimizer=Choice(["adam", "sgd", "rmsprop"]),

    # Continuous uniform
    dropout=Uniform(min_value=0.1, max_value=0.5),

    # Log-uniform (good for learning rates)
    # NOTE(review): the Azure ML docs describe LogUniform as
    # exp(uniform(min_value, max_value)), which would make this range roughly
    # 0.0067 to 0.37 rather than 1e-5 to 1e-1 — confirm the intended bounds.
    learning_rate=LogUniform(min_value=-5, max_value=-1),

    # Normal distribution
    weight_decay=Normal(mu=0.01, sigma=0.005),

    # Quantized (discrete steps)
    batch_size=QUniform(min_value=16, max_value=128, q=16),  # 16, 32, 48, ...

    # Integer choice
    num_layers=Choice([1, 2, 3, 4, 5])
)

Early Termination Policies

from azure.ai.ml.sweep import BanditPolicy, MedianStoppingPolicy, TruncationSelectionPolicy

# The three assignments below are alternatives: each one replaces the policy
# set by the previous assignment, so keep only the one you want.
# NOTE(review): the intervals presumably count reports of the primary metric
# rather than epochs as such; a training script that logs the metric only once
# would never reach delay_evaluation=5 — confirm against the Azure ML docs.

# Bandit policy - terminate runs falling behind
sweep_job.early_termination = BanditPolicy(
    slack_factor=0.1,  # Within 10% of best run
    evaluation_interval=1,  # Check every epoch
    delay_evaluation=5  # Start checking after 5 epochs
)

# Median stopping policy
sweep_job.early_termination = MedianStoppingPolicy(
    evaluation_interval=1,
    delay_evaluation=5
)

# Truncation selection - terminate worst performers
sweep_job.early_termination = TruncationSelectionPolicy(
    truncation_percentage=25,  # Terminate bottom 25%
    evaluation_interval=1
)

Analyzing Sweep Results

# Collect per-trial results into a DataFrame sorted by accuracy.
# Requires the mlflow and azureml-mlflow packages in the local environment.
import mlflow
import pandas as pd

# Get sweep job (re-fetched so status and properties are current)
sweep_job = ml_client.jobs.get(returned_job.name)

# Get best trial; the property is absent until the sweep has a best child run.
best_trial = sweep_job.properties.get("best_child_run_id")
print(f"Best trial: {best_trial}")

# Metrics and params logged by train.py live in the MLflow run behind each
# trial, not in the job's properties, so point MLflow at the workspace.
mlflow.set_tracking_uri(
    ml_client.workspaces.get(ml_client.workspace_name).mlflow_tracking_uri
)

# Get all child runs
child_jobs = ml_client.jobs.list(parent_job_name=sweep_job.name)

results = []
for job in child_jobs:
    if job.status != "Completed":
        continue
    # The job name doubles as the MLflow run ID.
    run_data = mlflow.get_run(job.name).data
    results.append({
        "run_id": job.name,
        "accuracy": run_data.metrics.get("accuracy"),
        "learning_rate": run_data.params.get("learning_rate"),
        "n_estimators": run_data.params.get("n_estimators"),
    })

# Create results dataframe
# Explicit columns keep the frame well-formed when no trial has completed.
numeric_columns = ["accuracy", "learning_rate", "n_estimators"]
results_df = pd.DataFrame(results, columns=["run_id", *numeric_columns])
# MLflow returns params as strings; convert so sorting and plotting are numeric.
results_df[numeric_columns] = results_df[numeric_columns].apply(
    pd.to_numeric, errors="coerce"
)
results_df = results_df.sort_values("accuracy", ascending=False)
print(results_df.head(10))

Visualization

import matplotlib.pyplot as plt
import seaborn as sns

# Parallel coordinates plot
# Only the numeric hyperparameter columns are drawn: by default every
# non-class column is plotted, and the string run_id column cannot be.
hyperparameter_columns = ['learning_rate', 'n_estimators']

fig, ax = plt.subplots(figsize=(12, 6))
pd.plotting.parallel_coordinates(
    results_df,
    'accuracy',
    cols=hyperparameter_columns,
    colormap='viridis',
    ax=ax
)
ax.set_title('Hyperparameter Space Exploration')
# Save through the figure handle so the right figure is written regardless of
# which one pyplot currently considers active.
fig.savefig('sweep_results.png')

# Hyperparameter vs metric scatter plots
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Learning rates span orders of magnitude, hence the log x-axis.
axes[0, 0].scatter(results_df['learning_rate'], results_df['accuracy'])
axes[0, 0].set_xlabel('Learning Rate')
axes[0, 0].set_ylabel('Accuracy')
axes[0, 0].set_xscale('log')

axes[0, 1].scatter(results_df['n_estimators'], results_df['accuracy'])
axes[0, 1].set_xlabel('N Estimators')
axes[0, 1].set_ylabel('Accuracy')

# ... more plots (axes[1, 0] and axes[1, 1] are still free)

fig.tight_layout()
fig.savefig('hyperparameter_analysis.png')

Registering Best Model

# Register the best trial's model in the workspace.
# Requires the mlflow and azureml-mlflow packages in the local environment.
import mlflow
from azure.ai.ml.entities import Model

# Without a best child run (e.g. the sweep has not finished) there is nothing
# to register; fail with a clear message instead of a lookup error on None.
if not best_trial:
    raise RuntimeError(
        f"Sweep {sweep_job.name} has no best child run yet; wait for it to complete."
    )

# Get best model from sweep
best_run = ml_client.jobs.get(best_trial)

# Download a local copy of the trial's artifacts for inspection.
# train.py logs the model as an MLflow artifact and the command declares no
# named output called "model", so the default artifact download is used.
ml_client.jobs.download(
    name=best_trial,
    download_path="./best_model"
)

# The accuracy metric is stored on the MLflow run, not in the job properties.
mlflow.set_tracking_uri(
    ml_client.workspaces.get(ml_client.workspace_name).mlflow_tracking_uri
)
best_accuracy = mlflow.get_run(best_trial).data.metrics.get("accuracy")

# Register model
# The path references the "model" artifact folder written by
# mlflow.sklearn.log_model directly in the job, so registration does not
# depend on the local download layout.
model = Model(
    path=f"azureml://jobs/{best_trial}/outputs/artifacts/paths/model/",
    name="optimized-classifier",
    type="mlflow_model",
    description=f"Best model from sweep {sweep_job.name}",
    properties={
        "accuracy": str(best_accuracy),
        "sweep_job": sweep_job.name
    }
)

ml_client.models.create_or_update(model)

Sweep jobs automate the tedious process of hyperparameter tuning, helping you find optimal configurations faster.\n\n## Takeaways\n\nAdd a concise, personal takeaway and recommended next steps here.\n

Michael John Peña

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.