August 21, 2022 1 min read

Automated Machine Learning with Azure ML AutoML

Tags: Azure Machine Learning, AutoML, Automation, Model Selection

Azure ML AutoML automates the process of selecting algorithms, tuning hyperparameters, and generating models. It democratizes ML by enabling rapid model development without deep expertise.

AutoML Classification

from azure.ai.ml import MLClient, Input, automl
from azure.ai.ml.automl import ClassificationPrimaryMetrics
from azure.identity import DefaultAzureCredential

# Connect to the target workspace; replace the placeholder identifiers below.
ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id="your-subscription",
    resource_group_name="your-rg",
    workspace_name="your-workspace"
)

# Configure AutoML classification (data, target and metric only; limits,
# featurization and algorithm choices are set through the job's set_* methods).
# NOTE(review): AutoML in SDK v2 documents MLTable training input, i.e.
# Input(type="mltable", path=<folder containing an MLTable file>) -- confirm
# that this CSV path is accepted as-is in your workspace.
classification_job = automl.classification(
    compute="cpu-cluster",
    experiment_name="automl-classification",
    training_data=Input(path="azureml://datastores/workspaceblobstore/paths/train.csv"),
    target_column_name="target",
    primary_metric=ClassificationPrimaryMetrics.ACCURACY,
    n_cross_validations=5
)

# Limits: the factory function does not document these as parameters, so they
# are applied explicitly via set_limits() to make sure they take effect.
classification_job.set_limits(
    timeout_minutes=60,            # budget for the whole experiment
    trial_timeout_minutes=20,      # budget for a single trial
    max_trials=50,
    max_concurrent_trials=4,
    enable_early_termination=True  # stop early when the score stops improving
)

# Automatic featurization (a featurization settings object is expected,
# not the bare string "auto", so use the setter)
classification_job.set_featurization(mode="auto")

# Set allowed models
classification_job.set_training(
    allowed_training_algorithms=[
        "LightGBM",
        "XGBoostClassifier",
        "RandomForest",
        "LogisticRegression"
    ]
)

# Submit job
returned_job = ml_client.jobs.create_or_update(classification_job)
print(f"AutoML job submitted: {returned_job.name}")

AutoML Regression

from azure.ai.ml.automl import RegressionPrimaryMetrics

# Configure AutoML regression (data, target and metric only)
regression_job = automl.regression(
    compute="cpu-cluster",
    experiment_name="automl-regression",
    training_data=Input(path="azureml://datastores/workspaceblobstore/paths/housing.csv"),
    target_column_name="price",
    primary_metric=RegressionPrimaryMetrics.NORMALIZED_ROOT_MEAN_SQUARED_ERROR,
    n_cross_validations=5
)

# Limits are applied through set_limits(); the factory function does not
# document them as parameters, so passing them there may not take effect.
regression_job.set_limits(
    timeout_minutes=120,
    trial_timeout_minutes=30,
    max_trials=100,
    max_concurrent_trials=4
)

# Enable deep learning and restrict the search to ONNX-compatible models
regression_job.set_training(
    enable_dnn_training=True,
    enable_onnx_compatible_models=True
)

# Submit job (reuses the ml_client created earlier)
returned_job = ml_client.jobs.create_or_update(regression_job)

AutoML Forecasting

from azure.ai.ml.automl import ForecastingPrimaryMetrics

# Configure AutoML forecasting
forecasting_job = automl.forecasting(
    compute="cpu-cluster",
    experiment_name="automl-forecasting",
    training_data=Input(path="azureml://datastores/workspaceblobstore/paths/sales.csv"),
    target_column_name="sales",
    primary_metric=ForecastingPrimaryMetrics.NORMALIZED_ROOT_MEAN_SQUARED_ERROR,
    n_cross_validations=3,

    # Forecasting-specific settings
    forecasting_settings=automl.ForecastingSettings(
        time_column_name="date",
        forecast_horizon=30,
        frequency="D",
        target_lags=[1, 7, 14],
        target_rolling_window_size=7,
        # One series per (store, product) pair
        time_series_id_column_names=["store_id", "product_id"]
    )
)

# Limits are applied through set_limits(); the factory function does not
# document them as parameters, so passing them there may not take effect.
forecasting_job.set_limits(
    timeout_minutes=180,
    max_trials=50
)

# Enable specific algorithms for time series
# NOTE(review): TCNForecaster is a deep-learning model -- confirm whether
# enable_dnn_training=True must also be set for it to be tried.
forecasting_job.set_training(
    allowed_training_algorithms=[
        "Prophet",
        "AutoArima",
        "ExponentialSmoothing",
        "TCNForecaster"
    ]
)

# Submit job (reuses the ml_client created earlier)
returned_job = ml_client.jobs.create_or_update(forecasting_job)

Featurization Settings

from azure.ai.ml.automl import ColumnTransformer

# Start from a classification job (fill in the basic settings as shown earlier)
classification_job = automl.classification(
    # ... basic settings
)

# Per-transformer overrides: which columns each transformer applies to and
# the parameters it should use
median_imputer = ColumnTransformer(
    fields=["age", "income"],
    parameters={"strategy": "median"},
)
hashed_one_hot = ColumnTransformer(
    fields=["city", "state"],
    parameters={"number_of_bits": 10},
)

# Explicit column purposes, overriding automatic type detection
column_types = {
    "age": "Numeric",
    "income": "Numeric",
    "category": "Categorical",
    "date_joined": "DateTime",
}

# Apply the custom featurization configuration
classification_job.set_featurization(
    mode="custom",
    transformer_params={
        "imputer": [median_imputer],
        "hash_one_hot_encoder": [hashed_one_hot],
    },
    blocked_transformers=["LabelEncoder"],  # Don't use label encoding
    column_name_and_types=column_types,
)

Analyzing AutoML Results

# pandas is needed for the results table below
import pandas as pd

# Get job results
automl_job = ml_client.jobs.get(returned_job.name)

# Get all child runs
child_runs = ml_client.jobs.list(parent_job_name=automl_job.name)

# Collect one row per completed child run.
# NOTE(review): the property keys read here ("run_algorithm", "accuracy",
# "f1_score_weighted", "duration") are taken as-is from this article --
# confirm they are populated for your job type; missing keys yield None.
results = [
    {
        "algorithm": run.properties.get("run_algorithm"),
        "accuracy": run.properties.get("accuracy"),
        "f1_score": run.properties.get("f1_score_weighted"),
        "duration": run.properties.get("duration")
    }
    for run in child_runs
    if run.status == "Completed"
]

# Fix the column set up front so that sorting still works when no child run
# has completed (an empty DataFrame would otherwise have no "accuracy" column).
results_df = pd.DataFrame(
    results, columns=["algorithm", "accuracy", "f1_score", "duration"]
)

# Property values presumably arrive as strings; convert so the ranking is
# numeric rather than lexicographic (unparseable/missing values become NaN
# and sort last).
results_df["accuracy"] = pd.to_numeric(results_df["accuracy"], errors="coerce")
results_df = results_df.sort_values("accuracy", ascending=False)

print("Top Models:")
print(results_df.head(10))

Get Best Model

from azure.ai.ml.entities import Model

# Get best model from AutoML
# NOTE(review): assumes the parent job exposes the best child run under the
# "best_child_run_id" property -- confirm for your SDK/service version.
best_run_id = automl_job.properties.get("best_child_run_id")
if best_run_id is None:
    # Fail with a clear message instead of passing None to jobs.get()
    raise ValueError(
        f"AutoML job {automl_job.name} has no 'best_child_run_id' property"
    )
best_run = ml_client.jobs.get(best_run_id)

print(f"Best model: {best_run.properties.get('run_algorithm')}")
print(f"Accuracy: {best_run.properties.get('accuracy')}")

# Download best model
download_dir = "./outputs"
ml_client.jobs.download(
    name=best_run_id,
    output_name="best_model",
    download_path=download_dir
)

# Register best model.
# jobs.download() places a named output under
# <download_path>/named-outputs/<output_name>, so point the model at that
# folder rather than directly at <download_path>/<output_name>.
model = Model(
    path=f"{download_dir}/named-outputs/best_model",
    name="automl-best-model",
    description=f"Best model from AutoML run {automl_job.name}",
    type="mlflow_model"
)

ml_client.models.create_or_update(model)

Model Explanations

# Enable model explanations
classification_job = automl.classification(
    # ... settings
)

classification_job.set_training(
    enable_model_explainability=True
)

# After job completes, get explanations
from azure.ai.ml.entities import Model

# Download explanation artifacts
explanations_dir = "./explanations"
ml_client.jobs.download(
    name=best_run_id,
    output_name="explanations",
    download_path=explanations_dir
)

# Load and analyze
import json

# jobs.download() places a named output under
# <download_path>/named-outputs/<output_name>.
# NOTE(review): the artifact file name is kept from the original article --
# confirm it against the files actually downloaded.
importance_path = (
    f"{explanations_dir}/named-outputs/explanations/global_importance.json"
)

# JSON is UTF-8 by specification; be explicit so decoding does not depend on
# the platform's default locale encoding.
with open(importance_path, encoding="utf-8") as f:
    importance = json.load(f)

# Print the ten features with the highest importance, largest first
for feature, imp in sorted(importance.items(), key=lambda x: -x[1])[:10]:
    print(f"{feature}: {imp:.4f}")

AutoML in Pipelines

from azure.ai.ml import dsl

# Pipeline: train a classifier with AutoML, then evaluate its best model on
# held-out data. Returns the best model and the evaluation metrics as
# pipeline outputs.
# NOTE(review): evaluate_model is not defined in this article; it is assumed
# to be a pipeline component taking `model` and `test_data` and exposing a
# `metrics` output.
@dsl.pipeline(
    name="automl_pipeline",
    compute="cpu-cluster"
)
def automl_training_pipeline(training_data: Input, test_data: Input):
    # Imported here so the snippet is self-contained; Output is required to
    # declare the AutoML node's model output below.
    from azure.ai.ml import Output

    # AutoML training step
    automl_step = automl.classification(
        training_data=training_data,
        target_column_name="target",
        primary_metric="accuracy",
        # The best-model output must be declared explicitly before it can be
        # referenced by downstream steps.
        outputs={"best_model": Output(type="mlflow_model")}
    )

    # Limits are applied through set_limits() rather than as factory kwargs
    automl_step.set_limits(timeout_minutes=60)

    # Evaluation step
    eval_step = evaluate_model(
        model=automl_step.outputs.best_model,
        test_data=test_data
    )

    return {
        "model": automl_step.outputs.best_model,
        "metrics": eval_step.outputs.metrics
    }

# Bind the pipeline's two data inputs to their datastore locations
train_input = Input(path="azureml://datastores/workspaceblobstore/paths/train.csv")
test_input = Input(path="azureml://datastores/workspaceblobstore/paths/test.csv")

# Instantiate the pipeline and submit it under its own experiment
pipeline_job = ml_client.jobs.create_or_update(
    automl_training_pipeline(training_data=train_input, test_data=test_input),
    experiment_name="automl-pipeline",
)

AutoML accelerates model development while maintaining production-quality results.