
GPU Optimization for LLM Workloads

Maximizing GPU utilization is essential for cost-effective LLM deployment. This post walks through the main levers: memory management, multi-GPU placement, CUDA-level settings, memory-efficient attention, profiling, and batched inference.

GPU Memory Management

import torch
from transformers import AutoModelForCausalLM

# Check GPU memory
def print_gpu_memory():
    if torch.cuda.is_available():
        print(f"Allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
        print(f"Reserved: {torch.cuda.memory_reserved() / 1e9:.2f} GB")
        print(f"Max allocated: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")

# Clear cache
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

# Memory efficient model loading
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    torch_dtype=torch.float16,      # Half precision
    low_cpu_mem_usage=True,         # Efficient loading
    device_map="auto"               # Automatic placement
)
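
As a quick sanity check, FP16 weights take two bytes per parameter, so a 7B model should show roughly 14 GB allocated before any activations or KV cache. A small hypothetical helper to compare what print_gpu_memory reports against the weights themselves:

# Hypothetical helper: sum the bytes of the model's parameters
def expected_weight_gb(model):
    """Approximate weight memory in GB for the model's current dtype."""
    total_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
    return total_bytes / 1e9

print_gpu_memory()
print(f"Expected from weights alone: {expected_weight_gb(model):.2f} GB")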

Multi-GPU Strategies

# Device map for model parallelism
device_map = {
    "model.embed_tokens": 0,
    "model.layers.0": 0,
    "model.layers.1": 0,
    # ... more layers on GPU 0
    "model.layers.16": 1,
    "model.layers.17": 1,
    # ... more layers on GPU 1
    "model.norm": 1,
    "lm_head": 1
}

model_name = "meta-llama/Llama-2-7b-hf"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    torch_dtype=torch.float16
)

# Automatic device map
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # Accelerate handles distribution
    torch_dtype=torch.float16
)

# Balanced device map
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="balanced",  # Equal distribution
    torch_dtype=torch.float16
)
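
When letting Accelerate place layers for you, it often helps to cap per-device memory (leaving headroom for activations and the KV cache) and then check where everything landed. A minimal sketch, assuming two 24GB GPUs; the limits below are illustrative:

# Cap how much of each device Accelerate may use
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    max_memory={0: "20GiB", 1: "20GiB", "cpu": "48GiB"},
    torch_dtype=torch.float16
)

# Inspect the placement Accelerate chose
print(model.hf_device_map)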

CUDA Optimization Tips

# Enable TF32 for faster matmuls (Ampere+ GPUs)
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# Use cuDNN benchmark for consistent input sizes
torch.backends.cudnn.benchmark = True

# Compile model for faster execution (PyTorch 2.0+)
model = torch.compile(model, mode="reduce-overhead")

# Inference mode context
with torch.inference_mode():
    outputs = model.generate(**inputs)

# Disable gradient computation
@torch.no_grad()
def generate(model, inputs):
    return model.generate(**inputs)
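
To confirm these settings actually help on your hardware, a simple latency check with CUDA events is usually enough. A rough sketch, assuming the model and a tokenized inputs dict from earlier; the warmup and run counts are arbitrary:

def time_generation(model, inputs, warmup=3, runs=5, max_new_tokens=64):
    """Average generation latency in milliseconds, measured with CUDA events."""
    with torch.inference_mode():
        for _ in range(warmup):
            model.generate(**inputs, max_new_tokens=max_new_tokens)

        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        for _ in range(runs):
            model.generate(**inputs, max_new_tokens=max_new_tokens)
        end.record()
        torch.cuda.synchronize()  # wait for all queued kernels to finish

    return start.elapsed_time(end) / runs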

Memory-Efficient Attention

# Flash Attention 2 (much faster and memory efficient)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    attn_implementation="flash_attention_2",
    torch_dtype=torch.float16,
    device_map="auto"
)

# Benefits:
# - O(N) memory instead of O(N^2)
# - 2-4x faster for long sequences
# - Native in transformers for supported models
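
flash_attention_2 requires the separate flash-attn package and an Ampere-or-newer GPU. If it isn't installed, PyTorch's built-in scaled-dot-product attention ("sdpa") is a reasonable fallback; a sketch of that selection logic:

import importlib.util

# Prefer Flash Attention 2 when the flash-attn package is available,
# otherwise fall back to PyTorch's SDPA kernels
attn_impl = (
    "flash_attention_2"
    if importlib.util.find_spec("flash_attn") is not None
    else "sdpa"
)

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    attn_implementation=attn_impl,
    torch_dtype=torch.float16,
    device_map="auto"
)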

Gradient Checkpointing (Training)

# For training: trade compute for memory
from transformers import TrainingArguments

training_args = TrainingArguments(
    gradient_checkpointing=True,  # Recompute activations
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    # ... other args
)

# Manual gradient checkpointing
model.gradient_checkpointing_enable()
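
The trade-off is easy to quantify: run one forward/backward pass with and without checkpointing and compare peak allocations. A minimal sketch, assuming a small tokenized batch already on the GPU (the batch variable is hypothetical):

def peak_memory_for_step(model, batch):
    """Peak GPU memory (GB) for one forward/backward pass."""
    model.train()
    torch.cuda.reset_peak_memory_stats()
    loss = model(**batch, labels=batch["input_ids"]).loss
    loss.backward()
    model.zero_grad(set_to_none=True)
    return torch.cuda.max_memory_allocated() / 1e9

model.gradient_checkpointing_enable()
print(f"With checkpointing:    {peak_memory_for_step(model, batch):.2f} GB")

model.gradient_checkpointing_disable()
print(f"Without checkpointing: {peak_memory_for_step(model, batch):.2f} GB")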

Profiling GPU Usage

import torch.profiler

# Profile model execution
with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ],
    record_shapes=True,
    profile_memory=True,
    with_stack=True
) as prof:
    outputs = model.generate(**inputs, max_new_tokens=50)

# Print results
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

# Export for visualization
prof.export_chrome_trace("trace.json")
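
For longer loops, profiling every step is expensive. The profiler accepts a schedule that skips a step, warms up, and records only a few iterations; a sketch, assuming an iterable of prepared input batches (batches is hypothetical):

# Scheduled profiling: skip 1 step, warm up 1, record 3
with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ],
    schedule=torch.profiler.schedule(wait=1, warmup=1, active=3),
    on_trace_ready=torch.profiler.tensorboard_trace_handler("./profiler_logs"),
    profile_memory=True
) as prof:
    for batch in batches:  # hypothetical iterable of tokenized inputs
        model.generate(**batch, max_new_tokens=50)
        prof.step()        # advance the profiler schedule each iteration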

Optimizing Batch Inference

def optimize_batch_inference(model, tokenizer, texts, batch_size=8):
    """Optimized batch inference with proper GPU utilization."""

    # Sort by length to minimize padding
    indexed_texts = [(i, t, len(tokenizer.encode(t))) for i, t in enumerate(texts)]
    sorted_texts = sorted(indexed_texts, key=lambda x: x[2])

    results = [None] * len(texts)

    with torch.inference_mode():
        for i in range(0, len(sorted_texts), batch_size):
            batch = sorted_texts[i:i+batch_size]
            batch_texts = [t[1] for t in batch]
            batch_indices = [t[0] for t in batch]

            # Tokenize with dynamic padding
            inputs = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                return_tensors="pt"
            ).to(model.device)

            # Generate
            outputs = model.generate(
                **inputs,
                max_new_tokens=100,
                pad_token_id=tokenizer.eos_token_id
            )

            # Decode and store
            for j, idx in enumerate(batch_indices):
                results[idx] = tokenizer.decode(outputs[j], skip_special_tokens=True)

            # Clear cache between batches if needed
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    return results
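
A quick usage sketch, reusing the model from earlier. The pad-token assignment is an assumption: Llama's tokenizer ships without one, and batched generation needs a pad id.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
tokenizer.pad_token = tokenizer.eos_token  # Llama has no pad token by default

texts = [
    "Explain KV caching in one sentence.",
    "What does tensor parallelism do?",
    "Summarise Flash Attention.",
]

results = optimize_batch_inference(model, tokenizer, texts, batch_size=2)
for result in results:
    print(result)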

GPU Selection Guide

gpu_recommendations = {
    "inference_small": {
        "gpus": ["RTX 3090", "RTX 4090"],
        "models": "7B with quantization",
        "vram": "24GB"
    },
    "inference_medium": {
        "gpus": ["A100 40GB", "A6000"],
        "models": "7B-13B FP16",
        "vram": "40-48GB"
    },
    "inference_large": {
        "gpus": ["A100 80GB", "H100"],
        "models": "70B with quantization",
        "vram": "80GB+"
    },
    "training": {
        "gpus": ["A100", "H100"],
        "consideration": "Need multiple for large models",
        "vram": "80GB per GPU"
    }
}
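
A rough way to sanity-check these recommendations: weight memory is roughly parameter count times bytes per parameter, plus headroom for activations and the KV cache. A back-of-the-envelope sketch; the 20% overhead factor is an assumption, not a measured number:

def estimate_vram_gb(params_billion, bytes_per_param=2, overhead=1.2):
    """Rough VRAM estimate: weights plus a fudge factor for runtime overhead.

    bytes_per_param: 2 for FP16/BF16, 1 for INT8, 0.5 for 4-bit quantization.
    """
    return params_billion * bytes_per_param * overhead

print(f"7B  FP16:  ~{estimate_vram_gb(7):.0f} GB")         # ~17 GB
print(f"13B FP16:  ~{estimate_vram_gb(13):.0f} GB")        # ~31 GB
print(f"70B 4-bit: ~{estimate_vram_gb(70, 0.5):.0f} GB")   # ~42 GB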

Tomorrow we’ll explore Azure ML compute options.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.