Meta Llama 2 70B on Azure: Open Source Power
Meta’s Llama 2 70B is among the most capable open-weight large language models available. On Azure AI, it offers a compelling alternative to proprietary models for enterprises with requirements around customization, cost control, or data sovereignty.
Why Llama 2 70B?
- Open weights: Full transparency and customization potential
- No API costs: Pay only for compute
- Fine-tuning friendly: Customize for your domain
- Strong performance: Competitive with proprietary models
Deployment Options on Azure
Managed Compute Deployment
from azure.ai.ml import MLClient
from azure.ai.ml.entities import (
    ManagedOnlineEndpoint,
    ManagedOnlineDeployment
)
from azure.identity import DefaultAzureCredential

credential = DefaultAzureCredential()
ml_client = MLClient(
    credential=credential,
    subscription_id="your-subscription",
    resource_group="your-rg",
    workspace_name="your-workspace"
)

# Create endpoint
endpoint = ManagedOnlineEndpoint(
    name="llama-2-70b-endpoint",
    description="Llama 2 70B deployment",
    auth_mode="key"
)
ml_client.online_endpoints.begin_create_or_update(endpoint).result()

# Deploy model
deployment = ManagedOnlineDeployment(
    name="llama-2-70b",
    endpoint_name="llama-2-70b-endpoint",
    model="azureml://registries/azureml-meta/models/Llama-2-70b-chat/versions/1",
    instance_type="Standard_NC96ads_A100_v4",  # 4x A100 80GB
    instance_count=1,
    environment_variables={
        "TENSOR_PARALLEL_SIZE": "4"
    }
)
ml_client.online_deployments.begin_create_or_update(deployment).result()
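Creating the deployment alone doesn’t route requests to it; you also need to assign it traffic and pull the scoring URI and key your clients will use. A minimal sketch with the same MLClient (names match the example above):
# Send 100% of endpoint traffic to the new deployment
endpoint.traffic = {"llama-2-70b": 100}
ml_client.online_endpoints.begin_create_or_update(endpoint).result()

# Fetch the scoring URI and key used by the client code later in this post
endpoint_details = ml_client.online_endpoints.get(name="llama-2-70b-endpoint")
keys = ml_client.online_endpoints.get_keys(name="llama-2-70b-endpoint")
print(endpoint_details.scoring_uri)
print(keys.primary_key)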
Using Azure Container Instances for Dev/Test
# Pull and run locally first
docker pull mcr.microsoft.com/azureml/curated/llama-2-70b:latest
# Deploy to ACI
# Note: ACI GPU support is limited; confirm which GPU SKUs are available in your region and subscription
az container create \
  --resource-group your-rg \
  --name llama-2-70b-aci \
  --image mcr.microsoft.com/azureml/curated/llama-2-70b:latest \
  --cpu 8 \
  --memory 64 \
  --gpu-count 4 \
  --gpu-sku A100 \
  --ports 8000
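Once the container is running, you can look up its IP and smoke-test the serving port. The snippet below assumes a public IP and a /generate route; substitute whatever route and payload your serving image actually exposes.
# Get the container's public IP (add --ip-address Public and a DNS label on create if needed)
ACI_IP=$(az container show \
  --resource-group your-rg \
  --name llama-2-70b-aci \
  --query ipAddress.ip -o tsv)

# Hypothetical smoke test against the serving port
curl -X POST "http://${ACI_IP}:8000/generate" \
  -H "Content-Type: application/json" \
  -d '{"inputs": "Hello, Llama!", "parameters": {"max_new_tokens": 32}}'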
Interacting with the Model
import requests
class Llama2Client:
    def __init__(self, endpoint_url: str, api_key: str):
        self.endpoint_url = endpoint_url
        self.api_key = api_key

    def chat(
        self,
        messages: list,
        max_tokens: int = 1024,
        temperature: float = 0.7
    ) -> str:
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}"
        }
        # Format for Llama 2 chat template
        formatted_prompt = self._format_messages(messages)
        payload = {
            "inputs": formatted_prompt,
            "parameters": {
                "max_new_tokens": max_tokens,
                "temperature": temperature,
                "do_sample": True,
                "top_p": 0.9
            }
        }
        response = requests.post(
            f"{self.endpoint_url}/score",
            headers=headers,
            json=payload
        )
        return response.json()["generated_text"]
    def _format_messages(self, messages: list) -> str:
        """Format messages using the Llama 2 chat template.

        The system prompt is wrapped in <<SYS>> tags inside the first
        [INST] block, as the official template expects.
        """
        system = ""
        prompt = ""
        for msg in messages:
            if msg["role"] == "system":
                system = f"<<SYS>>\n{msg['content']}\n<</SYS>>\n\n"
            elif msg["role"] == "user":
                prompt += f"[INST] {system}{msg['content']} [/INST]"
                system = ""  # only the first user turn carries the system prompt
            elif msg["role"] == "assistant":
                prompt += f" {msg['content']}"
        return prompt
# Usage
client = Llama2Client(
    endpoint_url="https://llama-2-70b-endpoint.eastus.inference.ml.azure.com",
    api_key="your-api-key"
)

response = client.chat([
    {"role": "system", "content": "You are a helpful coding assistant."},
    {"role": "user", "content": "Write a Python function to calculate fibonacci numbers."}
])
print(response)
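In production, the bare requests.post call benefits from a timeout and retries. One way to add them, sketched with the standard requests/urllib3 retry machinery (the retry counts and backoff values are arbitrary starting points):
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
retries = Retry(
    total=3,
    backoff_factor=2,  # 2s, 4s, 8s between attempts
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["POST"],
)
session.mount("https://", HTTPAdapter(max_retries=retries))

response = session.post(
    f"{client.endpoint_url}/score",
    headers={
        "Content-Type": "application/json",
        "Authorization": f"Bearer {client.api_key}",
    },
    json={"inputs": "[INST] Ping [/INST]", "parameters": {"max_new_tokens": 8}},
    timeout=120,  # 70B generations can take a while
)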
Quantization for Cost Optimization
# Using bitsandbytes for quantization
from azure.ai.ml.entities import ManagedOnlineDeployment

deployment_4bit = ManagedOnlineDeployment(
    name="llama-2-70b-4bit",
    endpoint_name="llama-2-70b-endpoint",
    model="azureml://registries/azureml-meta/models/Llama-2-70b-chat/versions/1",
    instance_type="Standard_NC24ads_A100_v4",  # single A100 80GB
    instance_count=1,
    environment_variables={
        "QUANTIZATION": "4bit",
        "LOAD_IN_4BIT": "true"
    }
)
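The arithmetic behind the smaller instance type is straightforward: at 16-bit precision, the weights of a 70B-parameter model alone need roughly 140 GB, while 4-bit quantization brings that down to about 35 GB, which fits on a single 80 GB A100 (at some cost in output quality). A quick back-of-envelope check:
# Rough weight-only memory estimate; KV cache and activations add several GB more
PARAMS = 70e9
for precision, bytes_per_param in [("fp16", 2), ("int8", 1), ("int4", 0.5)]:
    print(f"{precision}: ~{PARAMS * bytes_per_param / 1e9:.0f} GB")
# fp16: ~140 GB -> needs 4x A100 80GB with tensor parallelism
# int8: ~70 GB  -> borderline on a single 80 GB A100
# int4: ~35 GB  -> fits on one A100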
Comparing with Proprietary Models
import time
from dataclasses import dataclass

@dataclass
class BenchmarkResult:
    model: str
    latency_ms: float
    tokens_per_second: float
    quality_score: float

def benchmark_models():
    test_prompt = "Explain the concept of recursion in programming."
    results = []

    # Llama 2 70B (uses the Llama2Client instance created above)
    start = time.time()
    llama_response = client.chat([{"role": "user", "content": test_prompt}])
    llama_latency = (time.time() - start) * 1000
    results.append(BenchmarkResult(
        model="Llama-2-70B",
        latency_ms=llama_latency,
        tokens_per_second=len(llama_response.split()) / (llama_latency / 1000),
        quality_score=0.85  # Manual evaluation
    ))
    return results

# Run benchmark
results = benchmark_models()
for r in results:
    print(f"{r.model}: {r.latency_ms:.0f}ms, {r.tokens_per_second:.1f} tok/s")
Cost Analysis
# Cost comparison (approximate)
MONTHLY_COSTS = {
    "llama-2-70b-managed": {
        "compute": 15000,  # 4x A100 running 24/7 for 30 days
        "inference": 0,    # no per-token cost
        "total": 15000
    },
    "gpt-4-turbo": {
        "compute": 0,
        "inference": 40000,  # high-volume usage billed per token
        "total": 40000
    }
}

def calculate_break_even(daily_tokens: int):
    """Compare self-hosting's fixed cost against per-token API pricing."""
    gpt4_cost_per_token = 0.00003  # approximate blended rate
    llama_fixed_cost = 15000

    gpt4_monthly = daily_tokens * 30 * gpt4_cost_per_token
    if gpt4_monthly > llama_fixed_cost:
        return f"Self-hosting saves ${gpt4_monthly - llama_fixed_cost:.0f}/month"
    else:
        return f"API is cheaper by ${llama_fixed_cost - gpt4_monthly:.0f}/month"

print(calculate_break_even(1_000_000))  # 1M tokens/day
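Using the same assumptions, you can also solve directly for the break-even volume: the $15,000/month fixed cost matches per-token API spend at roughly 16.7M tokens per day.
# Break-even volume implied by the assumptions above
llama_fixed_cost = 15000       # $/month for the managed 4x A100 deployment
gpt4_cost_per_token = 0.00003  # approximate blended rate

break_even_daily = llama_fixed_cost / (gpt4_cost_per_token * 30)
print(f"Break-even at ~{break_even_daily / 1e6:.1f}M tokens/day")  # ~16.7M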
Security Considerations
# Virtual Network Integration
from azure.ai.ml.entities import ManagedOnlineEndpoint
secure_endpoint = ManagedOnlineEndpoint(
    name="llama-2-70b-secure",
    auth_mode="aml_token",
    public_network_access="disabled",
    # VNet integration
)
# Data never leaves your Azure environment
# No external API calls
# Full audit logging
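With auth_mode="aml_token", clients authenticate with a short-lived Azure ML token instead of a static key. A sketch of what the call might look like, assuming the workspace MLClient from earlier and a scoring URI retrieved from the endpoint (with public network access disabled, the request must originate inside the VNet):
import requests

# get_keys returns a time-limited token object for token-authenticated endpoints
token = ml_client.online_endpoints.get_keys(name="llama-2-70b-secure").access_token
scoring_uri = ml_client.online_endpoints.get(name="llama-2-70b-secure").scoring_uri

response = requests.post(
    scoring_uri,
    headers={
        "Content-Type": "application/json",
        "Authorization": f"Bearer {token}",
    },
    json={"inputs": "[INST] Hello [/INST]", "parameters": {"max_new_tokens": 16}},
    timeout=120,
)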
Conclusion
Llama 2 70B on Azure provides enterprise-grade open-source AI with full control over data and costs. It’s ideal for organizations with high-volume inference needs or strict data sovereignty requirements.