Hugging Face + Azure: Running Open Source Models in the Cloud
Hugging Face's model ecosystem and Azure's compute infrastructure complement each other well: open models on one side, managed GPUs for training and serving on the other. Today we'll look at the main ways to train and deploy Hugging Face models on Azure.
Integration Options
# Ways to use Hugging Face models on Azure
integration_options = {
    "azure_ml": {
        "description": "Full ML platform with Hugging Face integration",
        "use_case": "Production deployments, team collaboration"
    },
    "azure_container_instances": {
        "description": "Simple container deployment",
        "use_case": "Quick prototypes, APIs"
    },
    "azure_kubernetes": {
        "description": "Scalable container orchestration",
        "use_case": "High-traffic production"
    },
    "azure_vm": {
        "description": "Direct GPU VM access",
        "use_case": "Fine-tuning, experimentation"
    }
}
Azure ML with Hugging Face
Environment Setup
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
from azure.ai.ml.entities import Environment

# Connect to the workspace
ml_client = MLClient(
    DefaultAzureCredential(),
    subscription_id="your-subscription-id",
    resource_group_name="your-rg",
    workspace_name="your-workspace"
)

# Conda specification with the Hugging Face libraries
conda_yaml = """\
name: transformers
channels:
  - conda-forge
  - pytorch
dependencies:
  - python=3.10
  - pip
  - pip:
      - transformers>=4.30.0
      - torch>=2.0.0
      - accelerate
      - peft
      - bitsandbytes
      - datasets
      - evaluate
      - huggingface_hub
"""
with open("conda.yaml", "w") as f:
    f.write(conda_yaml)

# Create the environment from the conda file on top of a curated PyTorch image
env = Environment(
    name="huggingface-transformers",
    description="Environment for Hugging Face models",
    conda_file="conda.yaml",
    image="mcr.microsoft.com/azureml/curated/acpt-pytorch-2.0-cuda11.7:latest"
)
ml_client.environments.create_or_update(env)
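Environments are versioned on every registration. As a quick check that it registered, you can fetch the latest version back with the same ml_client:

# Fetch the most recently registered version of the environment
registered = ml_client.environments.get("huggingface-transformers", label="latest")
print(registered.name, registered.version)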
Training Script
# train_hf_model.py
import argparse
import mlflow
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model
from datasets import load_dataset

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, required=True)
    parser.add_argument("--output_dir", type=str, required=True)
    parser.add_argument("--data_path", type=str, required=True)
    parser.add_argument("--epochs", type=int, default=3)
    parser.add_argument("--batch_size", type=int, default=4)
    parser.add_argument("--learning_rate", type=float, default=2e-4)
    args = parser.parse_args()

    # Enable MLflow tracking (the Trainer also logs to MLflow automatically
    # via its built-in callback when mlflow is installed)
    mlflow.autolog()

    # Load model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(args.model_name)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    if tokenizer.pad_token is None:
        # Llama tokenizers ship without a pad token; padding below needs one
        tokenizer.pad_token = tokenizer.eos_token

    # Apply LoRA
    lora_config = LoraConfig(
        r=8,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        task_type="CAUSAL_LM"
    )
    model = get_peft_model(model, lora_config)

    # Load dataset
    dataset = load_dataset("json", data_files=args.data_path)

    # Tokenize
    def tokenize(examples):
        return tokenizer(
            examples["text"],
            truncation=True,
            max_length=512,
            padding="max_length"
        )

    tokenized = dataset.map(tokenize, batched=True)

    # Training
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.batch_size,
        learning_rate=args.learning_rate,
        logging_steps=10,
        save_strategy="epoch"
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized["train"],
        # Collator copies input_ids into labels so the model returns a loss
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
    )
    trainer.train()
    trainer.save_model()

if __name__ == "__main__":
    main()
Submit Training Job
from azure.ai.ml import command, Input, Output

# Define the job
job = command(
    code="./src",
    command="python train_hf_model.py --model_name ${{inputs.model_name}} --output_dir ${{outputs.model}} --data_path ${{inputs.data}} --epochs 3",
    environment="huggingface-transformers@latest",
    compute="gpu-cluster",
    inputs={
        "model_name": "meta-llama/Llama-2-7b-hf",
        "data": Input(type="uri_file", path="azureml://datastores/workspaceblobstore/paths/training_data.jsonl")
    },
    outputs={
        "model": Output(type="uri_folder", path="azureml://datastores/workspaceblobstore/paths/models/llama-finetuned")
    },
    environment_variables={
        "HF_TOKEN": "your-hf-token"  # Required for gated models; prefer a Key Vault secret over hardcoding
    }
)

# Submit
submitted_job = ml_client.jobs.create_or_update(job)
print(f"Job submitted: {submitted_job.name}")
Deploying Models
Managed Online Endpoint
from azure.ai.ml.entities import (
    ManagedOnlineEndpoint,
    ManagedOnlineDeployment,
    Model,
    CodeConfiguration
)

# Create endpoint
endpoint = ManagedOnlineEndpoint(
    name="hf-llm-endpoint",
    description="Hugging Face LLM endpoint",
    auth_mode="key"
)
ml_client.online_endpoints.begin_create_or_update(endpoint).result()

# Create deployment
deployment = ManagedOnlineDeployment(
    name="main",
    endpoint_name="hf-llm-endpoint",
    model=Model(path="./model"),
    code_configuration=CodeConfiguration(
        code="./score",
        scoring_script="score.py"
    ),
    environment="huggingface-transformers@latest",
    instance_type="Standard_NC6s_v3",
    instance_count=1
)
ml_client.online_deployments.begin_create_or_update(deployment).result()
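A fresh deployment receives no traffic until you route it explicitly. Sending 100% of requests to the "main" deployment looks like this:

# Route all endpoint traffic to the new deployment
endpoint = ml_client.online_endpoints.get("hf-llm-endpoint")
endpoint.traffic = {"main": 100}
ml_client.online_endpoints.begin_create_or_update(endpoint).result()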
Scoring Script
# score.py
import json
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def init():
    global model, tokenizer
    # AZUREML_MODEL_DIR points at the mounted model files
    model_path = os.environ.get("AZUREML_MODEL_DIR", "./model")
    model = AutoModelForCausalLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    if torch.cuda.is_available():
        model = model.to("cuda")

def run(raw_data):
    data = json.loads(raw_data)
    prompt = data.get("prompt", "")
    max_length = data.get("max_length", 100)
    inputs = tokenizer(prompt, return_tensors="pt")
    if torch.cuda.is_available():
        inputs = {k: v.to("cuda") for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=max_length)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"response": response}
Using Azure VMs Directly
# Create GPU VM
az vm create \
  --resource-group myResourceGroup \
  --name hf-training-vm \
  --image microsoft-dsvm:ubuntu-hpc:2004:latest \
  --size Standard_NC6s_v3 \
  --admin-username azureuser \
  --generate-ssh-keys

# SSH in and install the Hugging Face stack
ssh azureuser@<vm-ip>
pip install transformers peft accelerate bitsandbytes

# Log in to Hugging Face
huggingface-cli login --token $HF_TOKEN
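With bitsandbytes installed, you can load larger checkpoints in 4-bit so a 7B model fits comfortably on a single V100. A minimal sketch (Llama-2 is gated, so the HF_TOKEN login above must have succeeded):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# 4-bit weights cut GPU memory roughly 4x versus fp16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=bnb_config,
    device_map="auto"  # let accelerate place layers on the available GPU
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")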
Cost Optimization
cost_optimization = {
    "spot_instances": {
        "description": "Use spot VMs for training",
        "savings": "Up to 90% cheaper",
        "risk": "May be preempted"
    },
    "right_sizing": {
        "description": "Choose an appropriate GPU",
        "nc4as_t4_v3": "1x T4 - Inference, fine-tuning small models",
        "nc6s_v3": "1x V100 - Mid-size fine-tuning and inference",
        "nc12s_v3": "2x V100 - Larger models",
        "nd40rs_v2": "8x V100 - Large-scale training"
    },
    "auto_shutdown": {
        "description": "Stop VMs when not in use",
        "implementation": "Azure Automation or DevTest Labs"
    },
    "quantization": {
        "description": "Use 4-bit/8-bit models",
        "benefit": "Smaller GPU requirements"
    }
}
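For Azure ML training clusters, spot capacity comes down to one parameter: set the compute tier to low priority. A sketch, assuming the gpu-cluster name the training job above targets:

from azure.ai.ml.entities import AmlCompute

# Low-priority (spot) nodes are much cheaper but can be preempted,
# so pair them with checkpointing (save_strategy="epoch" above)
gpu_cluster = AmlCompute(
    name="gpu-cluster",
    size="Standard_NC6s_v3",
    min_instances=0,  # scale to zero when idle
    max_instances=2,
    tier="low_priority"
)
ml_client.compute.begin_create_or_update(gpu_cluster).result()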
Tomorrow we’ll explore the Hugging Face Model Hub.