November 22, 2023 1 min read

Open Source LLMs: Comparing Llama, Mistral, and Beyond

AI LLM Open Source Llama Mistral Benchmarks

Open Source LLMs: Comparing Llama, Mistral, and Beyond

The open-source LLM landscape has exploded in 2023. With multiple high-quality models available, choosing the right one for your use case requires understanding their strengths and trade-offs.

The Open Source Landscape

from dataclasses import dataclass
from typing import List, Dict, Optional
from enum import Enum

class ModelFamily(Enum):
    """Model families covered by the comparison.

    Each member's value is a human-readable "<publisher> <family>" string.
    """
    LLAMA = "Meta Llama"
    MISTRAL = "Mistral AI"
    FALCON = "TII Falcon"
    MPT = "MosaicML MPT"
    QWEN = "Alibaba Qwen"
    YI = "01.AI Yi"

@dataclass
class OpenSourceModel:
    """Descriptive record for one open-source LLM release.

    Attributes:
        name: Model identifier, e.g. "Llama-2-70B-Chat".
        family: The ModelFamily the model belongs to.
        parameters: Parameter count as free-form text, e.g. "70B" or
            "46.7B (12.9B active)".
        context_length: Maximum context window size.
        license: Name of the license the model is released under.
        release_date: Release month as a "YYYY-MM" string.
        specialties: Short labels describing what the model is suited for.
        benchmark_mmlu: MMLU benchmark score.
        benchmark_humaneval: HumanEval benchmark score, or None when no
            score is available for the model.
    """
    name: str
    family: ModelFamily
    parameters: str
    context_length: int
    license: str
    release_date: str
    specialties: List[str]
    benchmark_mmlu: float
    # Optional because not every model has a HumanEval result to record.
    benchmark_humaneval: Optional[float]

# Catalogue of the models compared in this post. Each entry lists identity
# first (name, family, size, release, license), then capabilities, then scores.
open_source_models = [
    OpenSourceModel(
        name="Llama-2-70B-Chat",
        family=ModelFamily.LLAMA,
        parameters="70B",
        release_date="2023-07",
        license="Llama 2 Community",
        context_length=4096,
        specialties=["General chat", "Reasoning", "Instruction following"],
        benchmark_mmlu=68.9,
        benchmark_humaneval=29.9,
    ),
    OpenSourceModel(
        name="Mixtral-8x7B-Instruct",
        family=ModelFamily.MISTRAL,
        parameters="46.7B (12.9B active)",
        release_date="2023-12",
        license="Apache 2.0",
        context_length=32768,
        specialties=["Long context", "Multilingual", "Efficient inference"],
        benchmark_mmlu=70.6,
        benchmark_humaneval=40.2,
    ),
    OpenSourceModel(
        name="Mistral-7B-Instruct",
        family=ModelFamily.MISTRAL,
        parameters="7B",
        release_date="2023-09",
        license="Apache 2.0",
        context_length=8192,
        specialties=["Fast inference", "Cost efficient", "Good base for fine-tuning"],
        benchmark_mmlu=60.1,
        benchmark_humaneval=30.5,
    ),
    OpenSourceModel(
        name="Falcon-180B-Chat",
        family=ModelFamily.FALCON,
        parameters="180B",
        release_date="2023-09",
        license="Falcon-180B TII License",
        context_length=2048,
        specialties=["Large scale", "Multilingual"],
        benchmark_mmlu=70.4,
        # No HumanEval score recorded for this model.
        benchmark_humaneval=None,
    ),
    OpenSourceModel(
        name="Qwen-72B-Chat",
        family=ModelFamily.QWEN,
        parameters="72B",
        release_date="2023-11",
        license="Qianwen License",
        context_length=32768,
        specialties=["Chinese-English", "Long context", "Tool use"],
        benchmark_mmlu=74.4,
        benchmark_humaneval=35.4,
    ),
]

Comprehensive Benchmark Comparison

import pandas as pd
from typing import Dict, List

def create_benchmark_comparison():
    """Build the benchmark comparison table.

    Returns:
        A pandas DataFrame with one row per model and one column per
        metric. Scores given as None below are missing values in the frame.
    """
    # Row order: every metric list below is aligned with this list.
    model_names = [
        "GPT-4 (reference)",
        "GPT-3.5-Turbo",
        "Llama-2-70B",
        "Mixtral-8x7B",
        "Mistral-7B",
        "Falcon-180B",
        "Qwen-72B",
    ]

    # Insertion order of the mapping determines the column order.
    columns = {"Model": model_names}
    columns["MMLU (knowledge)"] = [86.4, 70.0, 68.9, 70.6, 60.1, 70.4, 74.4]
    columns["HumanEval (code)"] = [67.0, 48.1, 29.9, 40.2, 30.5, None, 35.4]
    columns["GSM8K (math)"] = [92.0, 57.1, 56.8, 60.4, 52.2, None, 61.3]
    columns["ARC (reasoning)"] = [96.3, 85.2, 67.3, 70.2, 64.5, None, 65.4]
    columns["TruthfulQA"] = [59.0, 47.0, 45.0, 46.8, 42.1, 40.2, 52.1]
    columns["Context Length"] = [128000, 16384, 4096, 32768, 8192, 2048, 32768]
    columns["Relative Cost"] = [1.0, 0.1, 0.05, 0.04, 0.01, 0.08, 0.05]

    return pd.DataFrame(columns)

def calculate_value_score(row: pd.Series) -> float:
    """Calculate a value score (average performance divided by cost).

    Args:
        row: One row of the benchmark table. Must provide the
            "MMLU (knowledge)", "HumanEval (code)", "GSM8K (math)" and
            "Relative Cost" entries.

    Returns:
        The mean of the three benchmark scores (each scaled to 0-1)
        divided by the relative cost. A missing score (None or NaN)
        counts as 0. Returns 0.0 when the cost is 0.
    """
    def _as_fraction(column: str) -> float:
        score = row[column]
        # A None in the table becomes NaN inside a DataFrame, and NaN is
        # truthy, so `score or 0` would pass it through and turn the whole
        # result into NaN. Test for missing values explicitly instead.
        if pd.isna(score):
            return 0.0
        return score / 100

    # Average performance across knowledge, code and math (higher is better).
    perf_score = (
        _as_fraction("MMLU (knowledge)")
        + _as_fraction("HumanEval (code)")
        + _as_fraction("GSM8K (math)")
    ) / 3

    # Value = performance / cost; guard the division.
    cost = row["Relative Cost"]
    if cost == 0:
        return 0.0

    return perf_score / cost

# Generate comparison
# Build the benchmark table, then add a "Value Score" column by passing each
# row (axis=1) to calculate_value_score.
comparison = create_benchmark_comparison()
comparison["Value Score"] = comparison.apply(calculate_value_score, axis=1)
# Print the table, including the new column, as plain text.
print(comparison.to_string())

Use Case Decision Tree

def recommend_model(requirements: dict) -> dict:
    """Pick an open-source model for a set of requirements.

    The checks run in priority order and the first match wins:
    long context, latency, code generation, fine-tuning, Chinese
    language, then a general-purpose fallback.

    Args:
        requirements: Mapping with the optional keys "context_length"
            (default 4096), "budget" ("low", "medium" or "high";
            default "medium"), "task_type" (default "general"),
            "language" (default "english"), "latency_critical"
            (default False) and "needs_fine_tuning" (default False).

    Returns:
        A dict with the keys "model", "reason", "alternatives" and
        "considerations".
    """
    lookup = requirements.get
    context_needed = lookup("context_length", 4096)
    budget = lookup("budget", "medium")  # low, medium, high
    task_type = lookup("task_type", "general")
    language = lookup("language", "english")
    latency_critical = lookup("latency_critical", False)
    needs_fine_tuning = lookup("needs_fine_tuning", False)

    alternatives = []

    if context_needed > 8192:
        # Anything beyond 8K tokens needs one of the 32K-context models.
        if budget == "low":
            model, reason = (
                "Mixtral-8x7B-Instruct",
                "Best long-context performance at low cost",
            )
        else:
            model, reason = "Qwen-72B-Chat", "Highest quality with 32K context"
        alternatives = ["Llama-2-70B with context extension"]
    elif latency_critical:
        model, reason = (
            "Mistral-7B-Instruct",
            "Fastest inference, good quality for simple tasks",
        )
        alternatives = ["Llama-2-7B-Chat"]
    elif task_type == "code":
        model, reason = (
            "Mixtral-8x7B-Instruct",
            "Best code generation among open models",
        )
        alternatives = ["CodeLlama-34B", "Llama-2-70B"]
    elif needs_fine_tuning:
        if budget == "low":
            model, reason = "Mistral-7B", "Apache 2.0 license, efficient fine-tuning"
        else:
            model, reason = (
                "Llama-2-13B",
                "Good balance of capability and fine-tuning cost",
            )
    elif "chinese" in language.lower():
        model, reason = "Qwen-72B-Chat", "Superior Chinese language capabilities"
    elif budget == "high":
        model, reason = (
            "Llama-2-70B-Chat",
            "Best general performance among open models",
        )
    else:
        model, reason = (
            "Mixtral-8x7B-Instruct",
            "Near-GPT-3.5 performance at fraction of cost",
        )

    # The same general advice is attached to every recommendation.
    return {
        "model": model,
        "reason": reason,
        "alternatives": alternatives,
        "considerations": [
            "Test on your specific use case before production",
            "Consider fine-tuning for domain-specific tasks",
            "Monitor for quality issues compared to proprietary models",
            "Check license compatibility with your use case",
        ],
    }

# Example usage
# A context_length of 8000 does not exceed the 8192 long-context threshold in
# recommend_model, so the latency_critical flag decides this recommendation.
reqs = {
    "context_length": 8000,
    "budget": "medium",
    "task_type": "customer_support",
    "language": "english",
    "latency_critical": True
}

# Only the chosen model and its reason are printed; the returned dict also
# carries "alternatives" and "considerations".
recommendation = recommend_model(reqs)
print(f"Recommended: {recommendation['model']}")
print(f"Reason: {recommendation['reason']}")

Self-Hosting Considerations

def calculate_infrastructure_requirements(model_size: str) -> dict:
    """Look up approximate self-hosting requirements for a model size.

    Args:
        model_size: Size tier such as "7B", "13B" or "70B". Matching
            ignores letter case and surrounding whitespace, so "7b" and
            " 70B " resolve to their own tiers.

    Returns:
        A dict with the keys "gpu", "memory_required_gb",
        "estimated_monthly_cost_azure", "throughput_tokens_per_sec" and
        "quantization_options". Unknown sizes fall back to the "13B" entry.
    """
    # Approximate requirements for inference
    requirements = {
        "7B": {
            "gpu": "1x A10 (24GB) or 1x A100 (40GB)",
            "memory_required_gb": 14,
            "estimated_monthly_cost_azure": 1500,
            "throughput_tokens_per_sec": 50,
            "quantization_options": ["4-bit", "8-bit", "fp16"],
        },
        "13B": {
            "gpu": "1x A100 (40GB) or 2x A10",
            "memory_required_gb": 26,
            "estimated_monthly_cost_azure": 2500,
            "throughput_tokens_per_sec": 30,
            "quantization_options": ["4-bit", "8-bit"],
        },
        "70B": {
            "gpu": "2x A100 (80GB) or 8x A10",
            "memory_required_gb": 140,
            "estimated_monthly_cost_azure": 8000,
            "throughput_tokens_per_sec": 10,
            "quantization_options": ["4-bit", "8-bit"],
        },
    }

    # Normalise the key so a known tier written as "7b" is not silently
    # answered with the 13B fallback. Non-string input is looked up as-is.
    size_key = model_size.strip().upper() if isinstance(model_size, str) else model_size

    return requirements.get(size_key, requirements["13B"])

def compare_hosting_vs_api():
    """Return the static self-hosting vs. API cost write-up.

    Returns:
        A multi-line, Markdown-style string. It starts with a newline,
        every non-blank line is indented by four spaces, and it ends with
        a newline followed by four spaces.
    """
    # NOTE(review): $2,500/month at $0.002 per 1K tokens works out to roughly
    # 1.25B tokens/month, not the ~750M quoted below - confirm the figures.
    indent = "    "
    report_lines = (
        "# Self-Hosting vs API Cost Analysis",
        "",
        "## API (e.g., Azure OpenAI GPT-3.5-Turbo)",
        "- $0.002 per 1K tokens",
        "- No infrastructure management",
        "- Scales automatically",
        "- Break-even at ~750M tokens/month",
        "",
        "## Self-Hosted Mistral-7B (1x A100)",
        "- ~$2,500/month infrastructure",
        "- Can process unlimited tokens",
        "- Requires DevOps expertise",
        "- Better for high-volume, predictable workloads",
        "",
        "## Recommendation",
        "- < 500M tokens/month: Use API",
        "- > 1B tokens/month: Consider self-hosting",
        "- Variable load: Use API with self-hosted fallback",
    )
    # Blank separator lines stay empty; all other lines get the indent.
    body = "\n".join(indent + line if line else line for line in report_lines)
    return "\n" + body + "\n" + indent

Best Practices

1. Benchmark on your data - Public benchmarks don’t tell the full story
2. Start small, scale up - Begin with 7B models, increase if needed
3. Consider quantization - 4-bit models offer good quality/cost trade-off
4. Plan for fine-tuning - Choose models with permissive licenses
5. Monitor quality - Set up evaluation pipelines

Tomorrow, we’ll dive into model benchmarking methodologies and how to evaluate models for your specific needs!