Phi Model Family: Microsoft's Small Language Models Guide
Microsoft’s Phi model family represents the cutting edge of small language models (SLMs). These models deliver impressive capabilities while being small enough to run on-device. Let’s explore the Phi family and how to use them.
The Phi Model Lineup
Phi-1 (2023)
├── 1.3B parameters
├── Code-focused
└── Research model
Phi-2 (2023)
├── 2.7B parameters
├── General purpose
└── Outperforms larger models on many benchmarks
Phi-3 (2024)
├── Phi-3-mini: 3.8B parameters
├── Phi-3-small: 7B parameters
├── Phi-3-medium: 14B parameters
└── Multi-modal versions available
Phi-4 (2025) - Expected
├── Enhanced reasoning
├── Longer context
└── Better efficiency
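Those parameter counts map directly onto memory requirements, which is what makes on-device deployment feasible. A rough back-of-the-envelope sketch (weights only, ignoring KV cache and runtime overhead; the fp16 and 4-bit widths are typical choices, not official figures):
# Rough memory needed for model weights alone, at different precisions
def weight_memory_gb(params_billions: float, bits_per_weight: int) -> float:
    return params_billions * 1e9 * bits_per_weight / 8 / 1e9

for name, size_b in [("Phi-3-mini", 3.8), ("Phi-3-small", 7.0), ("Phi-3-medium", 14.0)]:
    print(f"{name}: ~{weight_memory_gb(size_b, 16):.1f} GB fp16, "
          f"~{weight_memory_gb(size_b, 4):.1f} GB at 4-bit")
At 4-bit quantization, Phi-3-mini fits comfortably within a few gigabytes of RAM, which is why it shows up repeatedly in the on-device scenarios below.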
Why Phi Models?
# Phi models excel at:
use_cases = {
    "on_device": {
        "description": "Run locally without cloud",
        "latency": "< 100ms",
        "privacy": "Data never leaves device"
    },
    "cost_effective": {
        "description": "Lower inference costs",
        "savings": "10-100x cheaper than GPT-4",
        "throughput": "Higher tokens/second"
    },
    "embedded": {
        "description": "IoT and edge devices",
        "memory": "< 4GB RAM",
        "power": "CPU inference possible"
    },
    "fine_tuning": {
        "description": "Customize for specific tasks",
        "data_needed": "Much less than large models",
        "compute": "Single GPU sufficient"
    }
}
Using Phi-3 on Azure
import os

from azure.ai.inference import ChatCompletionsClient
from azure.core.credentials import AzureKeyCredential

# Deploy Phi-3 from the Azure AI Model Catalog, then point the client at your endpoint
api_key = os.environ["AZURE_AI_API_KEY"]  # example variable name; use your own secret store

client = ChatCompletionsClient(
    endpoint="https://my-phi3-endpoint.eastus.inference.ml.azure.com",
    credential=AzureKeyCredential(api_key)
)

# Chat completion
response = client.complete(
    messages=[
        {"role": "system", "content": "You are a helpful coding assistant."},
        {"role": "user", "content": "Write a Python function to parse JSON safely"}
    ],
    model="phi-3-mini-128k-instruct",
    max_tokens=500,
    temperature=0.7
)
print(response.choices[0].message.content)
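For interactive UIs you will usually want streaming rather than waiting for the full reply. A hedged sketch of the streaming variant, based on the azure-ai-inference streaming samples (check your SDK version for the exact shape of each update):
# Stream tokens as they are generated, reusing the client defined above
stream = client.complete(
    messages=[{"role": "user", "content": "Write a Python function to parse JSON safely"}],
    model="phi-3-mini-128k-instruct",
    max_tokens=500,
    stream=True
)
for update in stream:
    if update.choices and update.choices[0].delta.content:
        print(update.choices[0].delta.content, end="", flush=True)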
Local Deployment with Ollama
# Pull Phi-3 with Ollama (install Ollama first from https://ollama.com)
ollama pull phi3:mini
# Run inference
ollama run phi3:mini "Explain data partitioning in simple terms"
# Python client for Ollama
import requests

def query_phi3(prompt: str) -> str:
    response = requests.post(
        "http://localhost:11434/api/generate",
        json={
            "model": "phi3:mini",
            "prompt": prompt,
            "stream": False
        }
    )
    return response.json()["response"]

# Use in applications
result = query_phi3("Write SQL to find duplicate records")
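For multi-turn conversations, Ollama also exposes a chat endpoint that takes a message list instead of a single prompt; a minimal sketch against its /api/chat route:
import requests

def chat_phi3(messages: list[dict]) -> str:
    # Multi-turn chat via Ollama's /api/chat endpoint
    response = requests.post(
        "http://localhost:11434/api/chat",
        json={"model": "phi3:mini", "messages": messages, "stream": False}
    )
    return response.json()["message"]["content"]

history = [{"role": "user", "content": "Explain data partitioning in simple terms"}]
reply = chat_phi3(history)
history += [{"role": "assistant", "content": reply},
            {"role": "user", "content": "Now give a concrete SQL example"}]
print(chat_phi3(history))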
ONNX Runtime Deployment
import onnxruntime_genai as og

class Phi3Inference:
    def __init__(self, model_path: str):
        self.model = og.Model(model_path)
        self.tokenizer = og.Tokenizer(self.model)

    def generate(
        self,
        prompt: str,
        max_tokens: int = 256,
        temperature: float = 0.7
    ) -> str:
        # Tokenize input
        input_tokens = self.tokenizer.encode(prompt)

        # Fresh generation parameters for each request
        params = og.GeneratorParams(self.model)
        params.set_search_options(
            max_length=len(input_tokens) + max_tokens,  # budget for prompt + new tokens
            temperature=temperature,
            top_p=0.9
        )
        params.input_ids = input_tokens

        # Generate token by token
        generator = og.Generator(self.model, params)
        output_tokens = []
        while not generator.is_done():
            generator.compute_logits()
            generator.generate_next_token()
            output_tokens.append(generator.get_next_tokens()[0])

        # Decode
        return self.tokenizer.decode(output_tokens)

# Usage
phi3 = Phi3Inference("phi-3-mini-onnx")
response = phi3.generate("Summarize the key benefits of data mesh:")
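To get a feel for on-device latency, you can wrap the class above in a simple timer (numbers vary widely by CPU; the whitespace word count is only a crude proxy for generated tokens):
import time

prompt = "List three advantages of small language models:"
start = time.perf_counter()
text = phi3.generate(prompt, max_tokens=128)
elapsed = time.perf_counter() - start

# Whitespace word count as a rough stand-in for tokens/second
print(f"~{len(text.split()) / elapsed:.1f} words/sec over {elapsed:.1f}s")
print(text)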
Fine-Tuning Phi-3
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model
import torch

def fine_tune_phi3(
    train_data,
    eval_data=None,
    model_name: str = "microsoft/Phi-3-mini-4k-instruct",
    output_dir: str = "./phi3-finetuned"
):
    # Load model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Configure LoRA for efficient fine-tuning
    # (Phi-3 fuses the Q/K/V projections into a single qkv_proj module)
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["qkv_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()  # typically well under 1% of parameters

    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=10,
        save_steps=100,
        evaluation_strategy="steps" if eval_data is not None else "no",
        eval_steps=100
    )

    # Train
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=eval_data,
        tokenizer=tokenizer
    )
    trainer.train()
    trainer.save_model()
    return output_dir
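After training finishes, the saved LoRA adapter is loaded back on top of the base model for inference. A minimal sketch with peft, reusing the imports from the snippet above and the default output_dir (the classification prompt is purely illustrative):
from peft import PeftModel

# Reload the base model, then attach the adapter saved by trainer.save_model()
base = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
model = PeftModel.from_pretrained(base, "./phi3-finetuned")

inputs = tokenizer(
    "Classify this ticket as bug, feature, or question: 'App crashes on login'",
    return_tensors="pt"
).to(base.device)
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))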
Phi-3 vs Other Models
# Benchmark comparison (approximate)
benchmarks = {
    "Phi-3-mini (3.8B)": {
        "MMLU": 68.8,
        "GSM8K": 75.0,
        "HumanEval": 58.5,
        "Memory_GB": 3.8,
        "Tokens_per_sec_cpu": 15
    },
    "Llama-3-8B": {
        "MMLU": 66.6,
        "GSM8K": 77.4,
        "HumanEval": 62.2,
        "Memory_GB": 8.0,
        "Tokens_per_sec_cpu": 8
    },
    "GPT-3.5-turbo": {
        "MMLU": 70.0,
        "GSM8K": 57.1,
        "HumanEval": 48.1,
        "Memory_GB": "N/A (cloud)",
        "Tokens_per_sec_cpu": "N/A"
    }
}
# Phi-3-mini achieves comparable quality while being:
# - roughly half the size of Llama-3-8B
# - about 2x faster for CPU inference
# - able to run on consumer hardware
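One way to make the size/quality trade-off concrete is to normalize a benchmark score by the memory footprint; a quick helper over the dict above (cloud-only entries are skipped since they have no local footprint):
# "Quality per GB" view of the comparison table
for name, scores in benchmarks.items():
    mem = scores["Memory_GB"]
    if isinstance(mem, (int, float)):
        print(f"{name}: {scores['MMLU'] / mem:.1f} MMLU points per GB")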
Best Use Cases for Phi
# When to use Phi models
ideal_scenarios = [
    {
        "scenario": "Code completion in IDE",
        "why": "Low latency needed, runs locally",
        "model": "phi-3-mini"
    },
    {
        "scenario": "Mobile app AI features",
        "why": "On-device, no API costs",
        "model": "phi-3-mini-quantized"
    },
    {
        "scenario": "High-volume classification",
        "why": "Cost at scale, fine-tunable",
        "model": "phi-3-small"
    },
    {
        "scenario": "Edge IoT processing",
        "why": "Offline capable, low power",
        "model": "phi-3-mini"
    },
    {
        "scenario": "Internal chatbot",
        "why": "Privacy, no data leaves org",
        "model": "phi-3-medium"
    }
]
# When to use larger models instead
use_larger_model = [
    "Complex multi-step reasoning",
    "Very long documents (>32K tokens)",
    "Creative writing tasks",
    "Latest knowledge needed",
    "Multi-modal with high quality"
]
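The two lists above can be folded into a simple routing rule for deciding when Phi is enough. An illustrative sketch only, not an official sizing guide; the flag names are made up for the example:
def pick_model(task: dict) -> str:
    """Crude routing between a Phi model and a larger hosted model."""
    # Hand off when the task matches the "use a larger model" list
    if (task.get("context_tokens", 0) > 32_000
            or task.get("complex_reasoning")
            or task.get("needs_latest_knowledge")
            or task.get("high_quality_multimodal")):
        return "larger hosted model"
    # Otherwise pick the smallest Phi that fits the deployment target
    if task.get("on_device") or task.get("edge"):
        return "phi-3-mini (quantized if memory is tight)"
    return "phi-3-small or phi-3-medium"

print(pick_model({"on_device": True, "context_tokens": 2_000}))  # phi-3-mini
print(pick_model({"complex_reasoning": True}))                   # larger hosted model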
Phi in Production
from fastapi import FastAPI
import uvicorn

app = FastAPI()

# Load model at startup
phi3 = None

@app.on_event("startup")
async def load_model():
    global phi3
    phi3 = Phi3Inference("phi-3-mini-onnx")

@app.post("/generate")
async def generate(prompt: str, max_tokens: int = 256):
    response = phi3.generate(prompt, max_tokens)
    return {"response": response}

@app.get("/health")
async def health():
    return {"status": "healthy", "model": "phi-3-mini"}

# Run: uvicorn app:app --host 0.0.0.0 --port 8000
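A quick smoke test against the running service. Note that the app above declares prompt and max_tokens as simple parameters, so FastAPI expects them as query parameters rather than a JSON body:
import requests

resp = requests.post(
    "http://localhost:8000/generate",
    params={"prompt": "Give me three tips for writing clear SQL", "max_tokens": 200}
)
print(resp.json()["response"])
print(requests.get("http://localhost:8000/health").json())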
Phi models represent the future of practical AI: powerful enough for real tasks, small enough to deploy anywhere. Start with Phi-3-mini for most use cases and scale up only if needed.