Local AI Models: Running Intelligence On-Device

Not every AI workload needs to call the cloud. Today I’m exploring when and how to run AI models locally on your device.

Why Local AI?

Benefits:

  • No network latency (no round trip to the cloud)
  • Works offline
  • Data never leaves device
  • No per-request costs
  • Regulatory compliance (data sovereignty)

Trade-offs:

  • Limited model size
  • Hardware requirements
  • Update complexity
  • Less capable than cloud models

When to Use Local vs Cloud

Use Case                      Local     Cloud
Real-time inference           Yes       Depends
Sensitive data                Yes       Requires compliance
Complex reasoning             Limited   Yes
Offline required              Yes       No
Cost-sensitive (high volume)  Yes       Expensive
Latest model capabilities     No        Yes
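
The table boils down to a small routing rule. Here is a minimal sketch; the function name and boolean flags are hypothetical, purely to make the decision explicit:

def choose_backend(offline_required: bool,
                   sensitive_data: bool,
                   needs_complex_reasoning: bool,
                   high_volume: bool) -> str:
    """Hypothetical router mirroring the table above."""
    if offline_required or sensitive_data:
        return "local"   # offline and data-sovereignty cases
    if needs_complex_reasoning:
        return "cloud"   # latest, most capable models
    if high_volume:
        return "local"   # avoid per-request costs
    return "cloud"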

Local Model Options

Small Language Models

# Phi-3-mini (3.8B parameters)
# FP16 weights are ~7.6GB; 4-bit quantized builds fit in roughly 4GB of RAM/VRAM

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_name = "microsoft/Phi-3-mini-4k-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

def generate(prompt: str, max_tokens: int = 200) -> str:
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=0.7,
        top_p=0.9
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

response = generate("Explain microservices in simple terms:")
print(response)

ONNX for Cross-Platform

import onnxruntime as ort
import numpy as np

class LocalLLM:
    def __init__(self, model_path: str):
        # Prefer the GPU provider when available, otherwise fall back to CPU
        self.session = ort.InferenceSession(
            model_path,
            providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
        )

    def generate_token(self, input_ids: np.ndarray, past_key_values=None):
        # Simplified: real decoder exports usually expose per-layer KV-cache
        # tensors rather than a single 'past_key_values' input
        inputs = {'input_ids': input_ids}
        if past_key_values is not None:
            inputs['past_key_values'] = past_key_values

        outputs = self.session.run(None, inputs)
        logits = outputs[0]
        new_past = outputs[1] if len(outputs) > 1 else None

        # Greedy selection
        next_token = np.argmax(logits[:, -1, :], axis=-1)
        return next_token, new_past
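
To actually produce text you wrap this in a decoding loop with the tokenizer that matches the exported model. A minimal greedy loop, assuming the export accepts input_ids alone (many decoder-only exports also require attention_mask and position_ids), might look like this:

from transformers import AutoTokenizer  # tokenizer paired with whichever model you exported

def greedy_generate(llm: LocalLLM, tokenizer, prompt: str, max_new_tokens: int = 50) -> str:
    # Start from the prompt tokens and append one greedy token per step
    input_ids = tokenizer(prompt, return_tensors="np")["input_ids"].astype(np.int64)
    for _ in range(max_new_tokens):
        next_token, _ = llm.generate_token(input_ids)
        input_ids = np.concatenate([input_ids, next_token.reshape(1, 1)], axis=-1)
        if next_token[0] == tokenizer.eos_token_id:
            break
    return tokenizer.decode(input_ids[0], skip_special_tokens=True)

# Hypothetical usage (paths and model name are examples only):
# tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
# llm = LocalLLM("./phi-3-mini.onnx")
# print(greedy_generate(llm, tokenizer, "Explain ONNX in one sentence."))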

llama.cpp for Efficient Inference

from llama_cpp import Llama

# Load quantized model (GGUF format)
llm = Llama(
    model_path="./phi-3-mini-4k-instruct-q4_k_m.gguf",
    n_ctx=4096,        # Context window
    n_threads=8,       # CPU threads
    n_gpu_layers=35,   # Layers on GPU (if available)
    verbose=False
)

def chat(user_message: str) -> str:
    response = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": user_message}
        ],
        max_tokens=500,
        temperature=0.7
    )
    return response['choices'][0]['message']['content']

answer = chat("What are the benefits of local AI models?")
print(answer)
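
For chat-style UIs you usually want tokens as they arrive. llama-cpp-python follows the OpenAI streaming format, so a streamed variant of the same call looks roughly like this (same llm instance as above):

def chat_stream(user_message: str) -> str:
    stream = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": user_message}
        ],
        max_tokens=500,
        temperature=0.7,
        stream=True
    )
    full_text = ""
    for chunk in stream:
        delta = chunk["choices"][0]["delta"]
        piece = delta.get("content", "")  # first chunk carries the role, not content
        print(piece, end="", flush=True)
        full_text += piece
    print()
    return full_text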

Local Vision Models

from transformers import AutoImageProcessor, AutoModelForImageClassification
from PIL import Image
import torch

# Load a small vision model
model_name = "microsoft/resnet-50"
processor = AutoImageProcessor.from_pretrained(model_name)
model = AutoModelForImageClassification.from_pretrained(model_name)

def classify_image(image_path: str) -> dict:
    image = Image.open(image_path)
    inputs = processor(images=image, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    predicted_idx = logits.argmax(-1).item()

    # Get top 5 predictions
    probs = torch.softmax(logits, dim=-1)
    top5_probs, top5_indices = torch.topk(probs, 5)

    return {
        "top_prediction": model.config.id2label[predicted_idx],
        "confidence": probs[0][predicted_idx].item(),
        "top_5": [
            {"label": model.config.id2label[idx.item()], "prob": prob.item()}
            for idx, prob in zip(top5_indices[0], top5_probs[0])
        ]
    }
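
Usage is a single call; the image path below is just a placeholder:

result = classify_image("photo.jpg")  # hypothetical local image file
print(result["top_prediction"], f"{result['confidence']:.1%}")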

Local Embeddings

from sentence_transformers import SentenceTransformer
import numpy as np

# Load a small, efficient embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')  # 80MB, 384 dimensions

def get_embeddings(texts: list[str]) -> np.ndarray:
    return model.encode(texts, show_progress_bar=False)

def semantic_search(query: str, documents: list[str], top_k: int = 5) -> list:
    # Encode query and documents
    query_embedding = get_embeddings([query])[0]
    doc_embeddings = get_embeddings(documents)

    # Compute cosine similarity
    similarities = np.dot(doc_embeddings, query_embedding) / (
        np.linalg.norm(doc_embeddings, axis=1) * np.linalg.norm(query_embedding)
    )

    # Get top-k results
    top_indices = np.argsort(similarities)[-top_k:][::-1]

    return [
        {"document": documents[i], "score": similarities[i]}
        for i in top_indices
    ]

# Usage
documents = [
    "Azure provides cloud computing services",
    "Python is a programming language",
    "Machine learning models can run locally",
    "Docker containers package applications"
]

results = semantic_search("cloud services", documents)
for r in results:
    print(f"{r['score']:.3f}: {r['document']}")

Hybrid Architecture

Combine local and cloud models:

class HybridAI:
    def __init__(self, local_model, cloud_client):
        self.local = local_model
        self.cloud = cloud_client
        self.local_capabilities = self._assess_local()

    def _assess_local(self) -> set:
        return {
            "simple_qa",
            "classification",
            "embedding",
            "summarization_short"
        }

    async def process(self, task: str, input_data: str) -> str:
        # Route based on task complexity
        if task in self.local_capabilities:
            return self._run_local(task, input_data)

        # Check if we can run locally with degraded quality
        if self._can_degrade(task):
            if not self._has_internet():
                return self._run_local(task, input_data)

        # Use cloud
        return await self._run_cloud(task, input_data)
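
    def _can_degrade(self, task: str) -> bool:
        # Example policy: tasks where a lower-quality local answer is acceptable
        return task in {"summarization_long", "general_qa"}

    def _has_internet(self) -> bool:
        # Simple connectivity probe; swap in whatever check suits your app
        import socket
        try:
            socket.create_connection(("1.1.1.1", 53), timeout=1).close()
            return True
        except OSError:
            return False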

    def _run_local(self, task: str, input_data: str) -> str:
        if task == "classification":
            return self.local.classify(input_data)
        elif task == "embedding":
            return self.local.embed(input_data)
        elif task == "simple_qa":
            return self.local.generate(input_data)
        # ... other tasks

    async def _run_cloud(self, task: str, input_data: str) -> str:
        response = await self.cloud.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": input_data}]
        )
        return response.choices[0].message.content
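
A quick way to exercise the router; LocalStub and the AsyncOpenAI client below are assumptions for illustration, standing in for whichever local runtime and cloud SDK you use:

import asyncio
from openai import AsyncOpenAI  # assumes the official OpenAI SDK for the cloud path

class LocalStub:
    """Hypothetical stand-in exposing the methods HybridAI calls."""
    def classify(self, text: str) -> str:
        return "general"
    def embed(self, text: str) -> str:
        return "[384-dim vector]"
    def generate(self, text: str) -> str:
        return f"(local answer to: {text})"

async def main():
    hybrid = HybridAI(local_model=LocalStub(), cloud_client=AsyncOpenAI())
    print(await hybrid.process("simple_qa", "What is a message queue?"))         # handled locally
    print(await hybrid.process("complex_reasoning", "Design a CQRS migration"))  # routed to the cloud (needs an API key)

asyncio.run(main())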

Performance Optimization

import torch
from contextlib import contextmanager
import time

@contextmanager
def inference_mode():
    """Context manager for optimized inference."""
    torch.set_grad_enabled(False)
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    try:
        yield
    finally:
        torch.set_grad_enabled(True)

class OptimizedLocalModel:
    def __init__(self, model_path: str):
        self.model = self._load_optimized(model_path)

    def _load_optimized(self, path: str):
        # Expects a fully pickled nn.Module saved via torch.save(model, path),
        # not just a state_dict
        model = torch.load(path, weights_only=False)
        model.eval()

        # Optimize for inference
        if torch.cuda.is_available():
            model = model.cuda()
            model = model.half()  # FP16 for speed

        # Try to compile (PyTorch 2.0+)
        try:
            model = torch.compile(model, mode="reduce-overhead")
        except Exception:
            # Older PyTorch or an unsupported backend; keep eager mode
            pass

        return model

    def predict(self, inputs):
        with inference_mode():
            return self.model(inputs)

Memory Management

import torch
from transformers import AutoModelForCausalLM

class MemoryEfficientInference:
    def __init__(self, model_path: str, max_memory_gb: float = 4.0):
        self.model_path = model_path
        self.max_memory = max_memory_gb * 1024**3  # Convert to bytes
        self.model = None

    def load_model(self):
        if self.model is None:
            self.model = self._load_with_limits()
        return self.model

    def unload_model(self):
        if self.model is not None:
            del self.model
            self.model = None
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            import gc
            gc.collect()

    def _load_with_limits(self):
        # Load with memory limits
        return AutoModelForCausalLM.from_pretrained(
            self.model_path,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            device_map="auto",
            max_memory={0: int(self.max_memory)}  # byte budget for GPU 0
        )
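
Typical on-demand usage keeps the model resident only while it is needed (the model name below is just an example):

engine = MemoryEfficientInference("microsoft/Phi-3-mini-4k-instruct", max_memory_gb=4.0)
model = engine.load_model()    # loaded lazily on first call
# ... run inference with `model` and a matching tokenizer ...
engine.unload_model()          # release GPU/CPU memory between jobs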

What’s Next

Tomorrow I’ll dive into Microsoft’s Phi-3 family of small language models.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.