Mistral 7B on Azure: Exploring Open-Source LLM Deployment
Mistral AI, the French AI startup founded by former DeepMind and Meta researchers, has released Mistral 7B, a compact yet powerful open-source language model. Now available on Azure, it offers an excellent cost-performance ratio for many use cases.
Understanding Mistral 7B
Mistral 7B punches above its weight class:
from dataclasses import dataclass
from typing import List

@dataclass
class MistralModel:
    name: str
    parameters: str
    context_window: int
    architecture: str
    license: str
    strengths: List[str]

mistral_7b = MistralModel(
    name="Mistral 7B",
    parameters="7.3B",
    context_window=8192,
    architecture="Dense Transformer with Sliding Window Attention",
    license="Apache 2.0",
    strengths=[
        "Excellent cost-efficiency",
        "Fast inference speed",
        "Strong performance for its size",
        "Apache 2.0 license for commercial use",
        "Sliding window attention for efficiency",
        "Good multilingual capabilities"
    ]
)

def compare_to_alternatives():
    """Compare Mistral 7B to similar-sized alternatives."""
    comparison = {
        "mistral-7b-vs-llama2-7b": {
            "performance": "Mistral 7B often outperforms Llama 2 7B on benchmarks",
            "speed": "Comparable inference speed",
            "context": "Mistral: 8K vs Llama 2: 4K tokens",
            "verdict": "Mistral 7B is generally preferred for new projects"
        },
        "mistral-7b-vs-gpt35": {
            "performance": "~85% of GPT-3.5 quality on many tasks",
            "cost": "Significantly cheaper when self-hosted",
            "speed": "Faster inference with local deployment",
            "verdict": "Great for high-volume, cost-sensitive applications"
        }
    }
    return comparison
Deploying Mistral 7B on Azure
Using Azure Machine Learning
from azure.ai.ml import MLClient
from azure.ai.ml.entities import (
    ManagedOnlineEndpoint,
    ManagedOnlineDeployment,
    Model,
    Environment
)
from azure.identity import DefaultAzureCredential

class MistralDeployer:
    def __init__(
        self,
        subscription_id: str,
        resource_group: str,
        workspace: str
    ):
        self.client = MLClient(
            DefaultAzureCredential(),
            subscription_id,
            resource_group,
            workspace
        )

    def create_endpoint(self, endpoint_name: str) -> ManagedOnlineEndpoint:
        """Create a managed online endpoint for Mistral."""
        endpoint = ManagedOnlineEndpoint(
            name=endpoint_name,
            description="Mistral 7B Instruct deployment",
            auth_mode="key"
        )
        return self.client.online_endpoints.begin_create_or_update(
            endpoint
        ).result()

    def deploy_mistral(
        self,
        endpoint_name: str,
        deployment_name: str = "mistral-7b-instruct"
    ):
        """Deploy Mistral 7B from the model registry."""
        # Reference model from Azure ML registry
        model = Model(
            path="azureml://registries/azureml/models/mistral-7b-instruct/versions/latest"
        )
        deployment = ManagedOnlineDeployment(
            name=deployment_name,
            endpoint_name=endpoint_name,
            model=model,
            instance_type="Standard_NC24ads_A100_v4",
            instance_count=1,
            environment_variables={
                "MAX_TOTAL_TOKENS": "8192",
                "MAX_INPUT_LENGTH": "7168"
            }
        )
        return self.client.online_deployments.begin_create_or_update(
            deployment
        ).result()

    def get_deployment_config(self) -> dict:
        """Get recommended deployment configuration."""
        return {
            "instance_type": "Standard_NC24ads_A100_v4",
            "min_instances": 1,
            "max_instances": 10,
            "target_utilization": 70,
            "estimated_throughput_tps": 400,
            "estimated_cost_per_hour": "$3.50"
        }

# Usage
deployer = MistralDeployer(
    subscription_id="your-subscription",
    resource_group="your-rg",
    workspace="your-workspace"
)

# Deploy
endpoint = deployer.create_endpoint("mistral-endpoint")
deployment = deployer.deploy_mistral("mistral-endpoint")
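Once the deployment completes, you need the endpoint's scoring URI and access key before client code (shown in the next section) can call it. Here is a minimal sketch, assuming the azure-ai-ml SDK's online_endpoints.get and get_keys operations against the same MLClient used above:

# Fetch connection details for the new endpoint so client code can call it.
endpoint_info = deployer.client.online_endpoints.get(name="mistral-endpoint")
keys = deployer.client.online_endpoints.get_keys(name="mistral-endpoint")
print(f"Scoring URI: {endpoint_info.scoring_uri}")
print(f"Primary key: {keys.primary_key}")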
Using the Deployed Model
Basic Inference
import requests
import json
from typing import List, Dict

class MistralClient:
    def __init__(self, endpoint_url: str, api_key: str):
        self.endpoint_url = endpoint_url
        self.api_key = api_key
        self.headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}"
        }

    def chat_completion(
        self,
        messages: List[Dict],
        temperature: float = 0.7,
        max_tokens: int = 1024,
        top_p: float = 0.95
    ) -> dict:
        """Generate chat completion with Mistral 7B."""
        # Mistral uses special tokens for chat
        prompt = self._format_chat_messages(messages)
        payload = {
            "inputs": prompt,
            "parameters": {
                "temperature": temperature,
                "max_new_tokens": max_tokens,
                "top_p": top_p,
                "do_sample": True,
                "return_full_text": False
            }
        }
        response = requests.post(
            f"{self.endpoint_url}/score",
            headers=self.headers,
            json=payload
        )
        return response.json()

    def _format_chat_messages(self, messages: List[Dict]) -> str:
        """Format messages for Mistral's [INST] chat template.

        Mistral has no dedicated system role, so a system message is
        prepended to the first user turn.
        """
        formatted = "<s>"
        system = ""
        for msg in messages:
            role = msg["role"]
            content = msg["content"]
            if role == "system":
                system = content
            elif role == "user":
                if system:
                    content = f"{system}\n\n{content}"
                    system = ""
                formatted += f"[INST] {content} [/INST]"
            elif role == "assistant":
                formatted += f" {content}</s>"
        return formatted

    def complete(
        self,
        prompt: str,
        max_tokens: int = 512,
        temperature: float = 0.7
    ) -> str:
        """Simple text completion."""
        payload = {
            "inputs": prompt,
            "parameters": {
                "max_new_tokens": max_tokens,
                "temperature": temperature,
                "do_sample": True
            }
        }
        response = requests.post(
            f"{self.endpoint_url}/score",
            headers=self.headers,
            json=payload
        )
        result = response.json()
        return result[0]["generated_text"]

# Usage
client = MistralClient(
    endpoint_url="https://your-endpoint.inference.ml.azure.com",
    api_key="your-api-key"
)

# Chat completion
response = client.chat_completion([
    {"role": "system", "content": "You are a helpful coding assistant."},
    {"role": "user", "content": "Write a Python function to calculate fibonacci numbers."}
])
print(response)
Streaming Responses
import sseclient

class MistralStreamingClient(MistralClient):
    def stream_chat(
        self,
        messages: List[Dict],
        temperature: float = 0.7
    ):
        """Stream chat responses token by token."""
        prompt = self._format_chat_messages(messages)
        payload = {
            "inputs": prompt,
            "parameters": {
                "temperature": temperature,
                "max_new_tokens": 1024,
                "do_sample": True,
                "stream": True
            }
        }
        response = requests.post(
            f"{self.endpoint_url}/score",
            headers=self.headers,
            json=payload,
            stream=True
        )
        client = sseclient.SSEClient(response)
        for event in client.events():
            if event.data:
                yield json.loads(event.data)

# Usage
streaming_client = MistralStreamingClient(endpoint_url, api_key)
for token in streaming_client.stream_chat([
    {"role": "user", "content": "Explain quantum computing in simple terms."}
]):
    print(token.get("token", {}).get("text", ""), end="", flush=True)
Use Cases for Mistral 7B
mistral_use_cases = {
    "high_volume_chat": {
        "description": "Customer support with many concurrent users",
        "why_mistral": "Low cost per request, fast response times",
        "example": "Handle 10K+ daily support queries at a fraction of GPT-3.5 cost"
    },
    "content_classification": {
        "description": "Categorizing documents, tickets, or messages",
        "why_mistral": "Simple task, high throughput needed, consistent results",
        "example": "Classify support tickets into 20 categories"
    },
    "code_assistance": {
        "description": "IDE autocomplete and simple code generation",
        "why_mistral": "Fast inference for real-time suggestions",
        "example": "Suggest code completions with <200ms latency"
    },
    "summarization": {
        "description": "Summarizing articles, documents, conversations",
        "why_mistral": "Good quality at lower cost for bulk processing",
        "example": "Summarize thousands of customer reviews daily"
    },
    "data_extraction": {
        "description": "Extracting structured data from unstructured text",
        "why_mistral": "Consistent format following, cost-effective",
        "example": "Extract contact info from emails at scale"
    }
}

def recommend_mistral_use(
    task_complexity: str,
    volume: str,
    latency_requirement: str,
    budget_sensitivity: str
) -> dict:
    """Determine if Mistral 7B is right for your use case."""
    score = 0
    reasons = []

    # Task complexity
    if task_complexity in ["low", "medium"]:
        score += 30
        reasons.append("Task complexity is suitable for Mistral 7B")
    else:
        reasons.append("Consider larger models for complex reasoning")

    # Volume
    if volume in ["high", "very_high"]:
        score += 30
        reasons.append("High volume makes cost savings significant")

    # Latency
    if latency_requirement == "low":
        score += 20
        reasons.append("Mistral 7B offers fast inference")

    # Budget
    if budget_sensitivity in ["high", "very_high"]:
        score += 20
        reasons.append("Cost-effective solution for budget-conscious deployments")

    return {
        "score": score,
        "recommendation": "Strongly consider Mistral 7B" if score >= 70 else "Evaluate alternatives",
        "reasons": reasons
    }
Performance Optimization
class MistralOptimizer:
    """Optimization strategies for Mistral 7B deployment."""

    @staticmethod
    def batch_requests(prompts: List[str], batch_size: int = 8) -> List[List[str]]:
        """Batch prompts for better throughput."""
        return [
            prompts[i:i + batch_size]
            for i in range(0, len(prompts), batch_size)
        ]

    @staticmethod
    def optimize_prompt(prompt: str) -> str:
        """Optimize prompt for Mistral's context window."""
        # Mistral works well with concise prompts
        # Remove unnecessary whitespace
        prompt = " ".join(prompt.split())
        # Keep prompts focused
        if len(prompt) > 6000:
            # Truncate with summary request
            prompt = prompt[:5500] + "\n\n[Content truncated. Summarize the key points above.]"
        return prompt

    @staticmethod
    def select_instance_type(
        expected_qps: float,
        latency_target_ms: float
    ) -> dict:
        """Select appropriate instance type based on requirements."""
        configs = {
            "low_traffic": {
                "instance": "Standard_NC6s_v3",
                "max_qps": 5,
                "avg_latency_ms": 500,
                "cost_per_hour": "$1.50"
            },
            "medium_traffic": {
                "instance": "Standard_NC24ads_A100_v4",
                "max_qps": 50,
                "avg_latency_ms": 200,
                "cost_per_hour": "$3.50"
            },
            "high_traffic": {
                "instance": "Standard_NC48ads_A100_v4",
                "max_qps": 100,
                "avg_latency_ms": 150,
                "cost_per_hour": "$7.00"
            }
        }
        if expected_qps <= 5 and latency_target_ms >= 500:
            return configs["low_traffic"]
        elif expected_qps <= 50:
            return configs["medium_traffic"]
        else:
            return configs["high_traffic"]

# Usage
optimizer = MistralOptimizer()
config = optimizer.select_instance_type(
    expected_qps=30,
    latency_target_ms=300
)
print(f"Recommended: {config['instance']} at {config['cost_per_hour']}")
Best Practices
- Start with Mistral 7B Instruct - The instruction-tuned version works best for chat and task completion
- Use appropriate temperature - 0.1-0.3 for factual tasks, 0.7-0.9 for creative tasks (see the sketch after this list)
- Leverage the 8K context - But be mindful that quality may degrade with very long contexts
- Batch when possible - Process multiple requests together for better throughput
- Monitor quality - Establish baselines and track performance over time
- Consider fine-tuning - For domain-specific tasks, fine-tuning can significantly improve results
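To make the temperature guidance concrete, here is a minimal sketch that maps a task type to sampling settings and reuses the MistralClient instance from the inference section. The TASK_PRESETS table and generate_reply helper are illustrative assumptions, not part of any Mistral or Azure API; the values simply follow the ranges suggested in the list above.

# Illustrative presets only: factual tasks get low temperature, creative tasks
# get higher values, following the guidance above. Tune after measuring quality.
TASK_PRESETS = {
    "extraction": {"temperature": 0.1, "top_p": 0.9},
    "classification": {"temperature": 0.2, "top_p": 0.9},
    "summarization": {"temperature": 0.3, "top_p": 0.95},
    "creative_writing": {"temperature": 0.8, "top_p": 0.95},
}

def generate_reply(client: MistralClient, task_type: str, messages: List[Dict]) -> dict:
    """Call the deployed endpoint with sampling settings suited to the task type."""
    preset = TASK_PRESETS.get(task_type, {"temperature": 0.7, "top_p": 0.95})
    return client.chat_completion(
        messages,
        temperature=preset["temperature"],
        top_p=preset["top_p"],
    )

# Usage
reply = generate_reply(client, "classification", [
    {"role": "user", "content": "Classify this ticket: 'My invoice total looks wrong.'"}
])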
Conclusion
Mistral 7B represents an excellent option for organizations looking to balance cost and performance. Its Apache 2.0 license, strong benchmark results, and efficient architecture make it a compelling choice for many production use cases. As open-source models continue to improve, expect Mistral and similar models to handle increasingly complex tasks.
Stay tuned for coverage of larger models as they become available on Azure.