Custom Model Deployment on Azure: From Fine-Tuning to Production
Deploying custom models to production requires careful consideration of scaling, monitoring, and cost management. Let’s explore the complete workflow from fine-tuned model to production deployment.
Deployment Options Overview
┌─────────────────────────────────────────────────────────────┐
│ Deployment Options │
├─────────────────┬─────────────────┬─────────────────────────┤
│ Serverless │ Managed │ Container │
├─────────────────┼─────────────────┼─────────────────────────┤
│ Pay-per-token │ Pay-per-hour │ Full control │
│ Auto-scaling │ Manual scaling │ Custom scaling │
│ Zero management │ Some management │ Full management │
│ Limited config │ More options │ Complete flexibility │
└─────────────────┴─────────────────┴─────────────────────────┘
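As a rough rule of thumb, the trade-offs above can be collapsed into a small decision helper. The function below is purely illustrative; the thresholds and flag names are assumptions for the sketch, not Azure guidance.
def choose_deployment_option(requests_per_day: int, traffic_is_spiky: bool, needs_custom_inference_logic: bool) -> str:
    """Illustrative heuristic mapping workload traits to a deployment option."""
    if needs_custom_inference_logic:
        # Multi-model serving or custom pre/post-processing points to containers on AKS
        return "container"
    if traffic_is_spiky or requests_per_day < 10_000:
        # Variable or low volume: pay-per-token serverless avoids paying for idle GPUs
        return "serverless"
    # Steady, high volume: dedicated managed compute is usually cheaper per token
    return "managed"
print(choose_deployment_option(50_000, traffic_is_spiky=False, needs_custom_inference_logic=False))  # "managed"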
Serverless Deployment
Best for: Variable workloads, quick start, minimal ops overhead
from azure.ai.foundry import AIFoundryClient
from azure.ai.foundry.deployments import ServerlessDeployment
client = AIFoundryClient(...)
# Deploy fine-tuned model serverless
deployment = client.deployments.create_serverless(
    ServerlessDeployment(
        name="my-custom-model-serverless",
        model=fine_tuned_model_name,
        rate_limits={
            "requests_per_minute": 100,
            "tokens_per_minute": 50000
        }
    )
)
print(f"Endpoint: {deployment.endpoint}")
print(f"API Key: {deployment.api_key}")
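Once the serverless endpoint is live, you can call it over HTTPS. The snippet below is a minimal sketch that assumes the endpoint exposes a chat-completions-style route and accepts the key in an api-key header; check the deployment details in the portal for the exact request contract.
import requests
# Hypothetical request shape; confirm the route and auth header for your endpoint
response = requests.post(
    f"{deployment.endpoint}/chat/completions",
    headers={"api-key": deployment.api_key},
    json={
        "messages": [{"role": "user", "content": "Summarize our Q3 results."}],
        "max_tokens": 200
    },
    timeout=60
)
print(response.json())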
Managed Compute Deployment
Best for: Predictable workloads, SLA requirements, cost optimization
from azure.ai.foundry.deployments import ManagedDeployment, ScaleSettings
# Deploy with managed compute
deployment = client.deployments.create_managed(
    ManagedDeployment(
        name="my-custom-model-managed",
        model=fine_tuned_model_name,
        compute={
            "sku": "Standard_NC24ads_A100_v4",
            "instance_count": 2
        },
        scale_settings=ScaleSettings(
            min_instances=1,
            max_instances=5,
            scale_type="manual"  # or "target_utilization"
        ),
        request_settings={
            "max_concurrent_requests": 10,
            "request_timeout_ms": 60000
        }
    )
)
# Wait for deployment
deployment.wait_for_completion()
print(f"Status: {deployment.status}")
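If you expect daily traffic swings, the same deployment can use utilization-based autoscaling instead of manual scaling. The sketch below is an assumption about the SDK surface (in particular the target_utilization_percentage parameter); adjust the names to whatever your SDK version exposes.
# Sketch: utilization-based autoscaling (parameter name below is an assumption)
autoscale = ScaleSettings(
    min_instances=1,
    max_instances=5,
    scale_type="target_utilization",
    target_utilization_percentage=70  # hypothetical knob: add instances above ~70% utilization
)
# Pass scale_settings=autoscale to ManagedDeployment in place of the manual settings above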
Container Deployment with AKS
Best for: Multi-model serving, custom inference logic, hybrid scenarios
# First, export the model
export_job = client.models.export(
    model_name=fine_tuned_model_name,
    format="onnx",  # or "pytorch", "safetensors"
    output_path="azureml://datastores/models/paths/my-model"
)
export_job.wait_for_completion()
# Dockerfile for custom inference server
FROM mcr.microsoft.com/azureml/inference-base:latest
COPY requirements.txt .
RUN pip install -r requirements.txt
COPY model/ /app/model/
COPY inference.py /app/
EXPOSE 8080
CMD ["python", "/app/inference.py"]
# inference.py
from fastapi import FastAPI
from pydantic import BaseModel
from typing import List
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
app = FastAPI()
# Load model once at startup and move it to the GPU when one is available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained("/app/model").to(device)
tokenizer = AutoTokenizer.from_pretrained("/app/model")
class ChatRequest(BaseModel):
    messages: List[dict]
    max_tokens: int = 500
    temperature: float = 0.7
class ChatResponse(BaseModel):
    content: str
    usage: dict
@app.post("/v1/chat/completions")
async def chat_completion(request: ChatRequest) -> ChatResponse:
    # Format messages with the model's chat template and append the assistant turn marker
    prompt = tokenizer.apply_chat_template(
        request.messages,
        tokenize=False,
        add_generation_prompt=True
    )
    # Tokenize and move inputs to the same device as the model
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=request.max_tokens,
            temperature=request.temperature,
            do_sample=True
        )
    # Decode only the newly generated tokens
    response_text = tokenizer.decode(
        outputs[0][inputs.input_ids.shape[1]:],
        skip_special_tokens=True
    )
    return ChatResponse(
        content=response_text,
        usage={
            "prompt_tokens": inputs.input_ids.shape[1],
            "completion_tokens": outputs.shape[1] - inputs.input_ids.shape[1],
            "total_tokens": outputs.shape[1]
        }
    )
@app.get("/health")
async def health():
    return {"status": "healthy"}
# Start the server when run via the Dockerfile's CMD
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8080)
# kubernetes/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: custom-model-inference
spec:
  replicas: 3
  selector:
    matchLabels:
      app: custom-model
  template:
    metadata:
      labels:
        app: custom-model
    spec:
      containers:
      - name: inference
        image: myregistry.azurecr.io/custom-model:v1
        resources:
          limits:
            nvidia.com/gpu: 1
            memory: "32Gi"
          requests:
            nvidia.com/gpu: 1
            memory: "16Gi"
        ports:
        - containerPort: 8080
        readinessProbe:
          httpGet:
            path: /health
            port: 8080
          initialDelaySeconds: 30   # give the model time to load before receiving traffic
        livenessProbe:
          httpGet:
            path: /health
            port: 8080
          initialDelaySeconds: 60   # avoid restarting the pod while weights are still loading
---
apiVersion: v1
kind: Service
metadata:
  name: custom-model-service
spec:
  selector:
    app: custom-model
  ports:
  - port: 80
    targetPort: 8080
  type: LoadBalancer
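After kubectl apply, the LoadBalancer service gets an external IP and the inference server can be smoke-tested end to end. The IP below is a placeholder; read the real one from kubectl get service custom-model-service.
import requests
# Replace with the EXTERNAL-IP reported by `kubectl get service custom-model-service`
BASE_URL = "http://<external-ip>"
assert requests.get(f"{BASE_URL}/health", timeout=10).json()["status"] == "healthy"
chat = requests.post(
    f"{BASE_URL}/v1/chat/completions",
    json={"messages": [{"role": "user", "content": "Hello!"}], "max_tokens": 50},
    timeout=60
)
print(chat.json()["content"])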
Blue-Green Deployments
from azure.ai.foundry.deployments import TrafficSplit
class BlueGreenDeploymentManager:
    """Manage blue-green deployments for custom models."""
    def __init__(self, client: AIFoundryClient):
        self.client = client
    async def deploy_new_version(
        self,
        endpoint_name: str,
        new_model: str,
        initial_traffic_percent: int = 10
    ):
        """Deploy new model version with traffic splitting."""
        # Get current deployment (blue)
        endpoint = self.client.endpoints.get(endpoint_name)
        blue_deployment = endpoint.deployments[0]
        # Create new deployment (green)
        green_deployment = await self.client.deployments.create_managed(
            ManagedDeployment(
                name=f"{endpoint_name}-green",
                model=new_model,
                compute=blue_deployment.compute
            )
        )
        # Split traffic
        await self.client.endpoints.update_traffic(
            endpoint_name=endpoint_name,
            traffic=TrafficSplit({
                blue_deployment.name: 100 - initial_traffic_percent,
                green_deployment.name: initial_traffic_percent
            })
        )
        return green_deployment
    async def promote_green(self, endpoint_name: str):
        """Promote green to 100% traffic."""
        endpoint = self.client.endpoints.get(endpoint_name)
        green_deployment = next(
            d for d in endpoint.deployments
            if d.name.endswith("-green")
        )
        await self.client.endpoints.update_traffic(
            endpoint_name=endpoint_name,
            traffic=TrafficSplit({green_deployment.name: 100})
        )
    async def rollback(self, endpoint_name: str):
        """Rollback to blue deployment."""
        endpoint = self.client.endpoints.get(endpoint_name)
        blue_deployment = next(
            d for d in endpoint.deployments
            if not d.name.endswith("-green")
        )
        await self.client.endpoints.update_traffic(
            endpoint_name=endpoint_name,
            traffic=TrafficSplit({blue_deployment.name: 100})
        )
# Usage
manager = BlueGreenDeploymentManager(client)
# Deploy new version with 10% traffic
await manager.deploy_new_version(
    endpoint_name="production-endpoint",
    new_model="my-model-v2",
    initial_traffic_percent=10
)
# Monitor metrics, then promote
await manager.promote_green("production-endpoint")
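The "monitor, then promote" step can itself be automated as a gradual ramp. The loop below is a sketch built on the manager above; get_green_error_rate is a hypothetical hook you would wire to your monitoring (see the next section), and the traffic steps, bake time, and 1% error budget are arbitrary choices for illustration.
import asyncio
async def progressive_rollout(manager, endpoint_name, new_model, get_green_error_rate):
    """Sketch: shift traffic to green in steps, rolling back if errors spike."""
    endpoint = manager.client.endpoints.get(endpoint_name)
    blue_name = endpoint.deployments[0].name
    green = await manager.deploy_new_version(endpoint_name, new_model, initial_traffic_percent=10)
    for percent in (25, 50, 100):
        await asyncio.sleep(15 * 60)  # let the previous step bake for 15 minutes
        if get_green_error_rate(green.name) > 0.01:  # arbitrary 1% error budget
            await manager.rollback(endpoint_name)
            return False
        if percent == 100:
            await manager.promote_green(endpoint_name)
        else:
            await manager.client.endpoints.update_traffic(
                endpoint_name=endpoint_name,
                traffic=TrafficSplit({blue_name: 100 - percent, green.name: percent})
            )
    return True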
Monitoring and Observability
from azure.ai.foundry.monitoring import DeploymentMonitor
monitor = DeploymentMonitor(client)
# Set up monitoring
monitor.configure(
    deployment_name="my-custom-model-managed",
    metrics=[
        "request_count",
        "latency_p50",
        "latency_p95",
        "latency_p99",
        "error_rate",
        "token_usage",
        "gpu_utilization"
    ],
    alerts=[
        {
            "name": "high-latency",
            "condition": "latency_p95 > 2000",
            "action": "email",
            "recipients": ["oncall@company.com"]
        },
        {
            "name": "high-error-rate",
            "condition": "error_rate > 0.01",
            "action": "pagerduty",
            "severity": "critical"
        }
    ]
)
# Get current metrics
metrics = monitor.get_metrics(
    deployment_name="my-custom-model-managed",
    time_range="last_1h"
)
print(f"Requests: {metrics['request_count']}")
print(f"P95 Latency: {metrics['latency_p95']}ms")
print(f"Error Rate: {metrics['error_rate']*100:.2f}%")
Cost Management
class DeploymentCostTracker:
    """Track and optimize deployment costs."""
    def __init__(self, client: AIFoundryClient):
        self.client = client
    def get_cost_breakdown(self, deployment_name: str, days: int = 30) -> dict:
        """Get detailed cost breakdown."""
        usage = self.client.deployments.get_usage(
            deployment_name=deployment_name,
            days=days
        )
        return {
            "compute_cost": usage.compute_hours * usage.compute_rate,
            "token_cost": usage.total_tokens * usage.token_rate,
            "storage_cost": usage.model_storage_gb * 0.02,
            "total": usage.total_cost,
            "cost_per_request": usage.total_cost / usage.request_count,
            "recommendations": self.get_recommendations(usage)
        }
    def get_recommendations(self, usage) -> list:
        recommendations = []
        if usage.avg_gpu_utilization < 0.3:
            recommendations.append(
                "Consider smaller compute SKU - GPU utilization is low"
            )
        if usage.requests_per_hour_variance > 0.8:
            recommendations.append(
                "Consider serverless deployment for variable workloads"
            )
        if usage.avg_batch_size < 2:
            recommendations.append(
                "Enable request batching to improve throughput"
            )
        return recommendations
tracker = DeploymentCostTracker(client)
costs = tracker.get_cost_breakdown("my-custom-model-managed")
print(f"Total cost (30d): ${costs['total']:.2f}")
print(f"Cost per request: ${costs['cost_per_request']:.4f}")
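The same tracker can be run across every deployment in a project to surface all recommendations in one place. The client.deployments.list() call below is an assumption about the SDK; substitute however you enumerate deployments.
# Sketch: weekly cost review across all deployments (assumes a list() method exists)
for d in client.deployments.list():
    breakdown = tracker.get_cost_breakdown(d.name, days=7)
    print(f"{d.name}: ${breakdown['total']:.2f} this week")
    for tip in breakdown["recommendations"]:
        print(f"  - {tip}")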
Custom model deployment is a critical skill for production AI systems. Choose the right deployment option based on your workload characteristics, and implement proper monitoring and cost management from day one.