LLM Fine-Tuning Strategies: When and How to Customize Models
Fine-tuning large language models allows you to customize their behavior for specific tasks. Today we’ll explore when fine-tuning makes sense and the various strategies available.
When to Fine-Tune
# Decision framework for fine-tuning
fine_tuning_decision = {
    "consider_fine_tuning_when": [
        "Specific domain terminology needed",
        "Consistent output format required",
        "Task requires specialized knowledge",
        "Prompt engineering alone isn't sufficient",
        "Cost reduction needed (shorter prompts)"
    ],
    "avoid_fine_tuning_when": [
        "Prompt engineering achieves goals",
        "RAG provides needed context",
        "Limited training data available",
        "Task changes frequently",
        "Base model already performs well"
    ]
}

# The hierarchy of customization
customization_hierarchy = [
    "1. Prompt engineering (start here)",
    "2. Few-shot examples in context",
    "3. RAG (Retrieval Augmented Generation)",
    "4. Fine-tuning (when above insufficient)"
]
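Before reaching for step 4, it is worth seeing how far the earlier steps go. As a quick illustration of step 2, a few-shot prompt simply embeds worked examples in the request itself (the ticket examples here are made up):
# Few-shot prompting: in-context examples, no training involved
few_shot_messages = [
    {"role": "system", "content": "Classify support tickets as 'billing' or 'technical'."},
    {"role": "user", "content": "I was charged twice this month."},
    {"role": "assistant", "content": "billing"},
    {"role": "user", "content": "The app crashes on startup."},
    {"role": "assistant", "content": "technical"},
    {"role": "user", "content": "My invoice shows the wrong amount."}  # the real query
]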
Fine-Tuning Approaches
Full Fine-Tuning
# Full fine-tuning updates all model parameters
full_finetuning = {
    "description": "Update all model weights",
    "requirements": {
        "compute": "Very high (multiple GPUs)",
        "memory": "Full model must fit in memory",
        "data": "Large high-quality dataset",
        "time": "Hours to days"
    },
    "pros": [
        "Maximum flexibility",
        "Best potential performance",
        "Full adaptation to domain"
    ],
    "cons": [
        "Very expensive",
        "Risk of catastrophic forgetting",
        "Requires significant infrastructure"
    ]
}
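To make "very high" concrete, here is a rough back-of-envelope for full fine-tuning with Adam in mixed precision (it ignores activations and framework overhead, and the 7B parameter count is just an example):
# Approximate GPU memory for full fine-tuning with Adam (rule of thumb)
params = 7e9                     # e.g. a 7B-parameter model
bytes_per_param = 2 + 4 + 2 + 8  # fp16 weights + fp32 master weights
                                 # + fp16 gradients + fp32 Adam moments
print(f"~{params * bytes_per_param / 1e9:.0f} GB")  # ~112 GB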
Parameter-Efficient Fine-Tuning (PEFT)
# PEFT methods update only a small subset of parameters
peft_methods = {
    "lora": {
        "name": "Low-Rank Adaptation",
        "params_trained": "< 1% of total",
        "memory": "Much lower than full",
        "quality": "Near full fine-tuning quality"
    },
    "qlora": {
        "name": "Quantized LoRA",
        "params_trained": "< 1% of total",
        "memory": "4-bit quantization + LoRA",
        "quality": "Good, with memory efficiency"
    },
    "prefix_tuning": {
        "name": "Prefix Tuning",
        "params_trained": "Prefix tokens only",
        "memory": "Very low",
        "quality": "Task-specific optimization"
    },
    "adapters": {
        "name": "Adapter Layers",
        "params_trained": "Small adapter modules",
        "memory": "Low",
        "quality": "Good for multi-task"
    }
}
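To make LoRA concrete, here is a minimal sketch using Hugging Face's peft library (assumes peft and transformers are installed; the base model and hyperparameter values are illustrative, not recommendations):
# Minimal LoRA setup with the peft library (illustrative values)
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

base_model = AutoModelForCausalLM.from_pretrained("gpt2")  # any causal LM
lora_config = LoraConfig(
    r=8,               # rank of the low-rank update matrices
    lora_alpha=16,     # scaling applied to the update
    lora_dropout=0.05,
    task_type="CAUSAL_LM"
)
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()  # typically well under 1% trainable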
Preparing Training Data
Data Format
# Standard fine-tuning data format
training_examples = [
    {
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What is the capital of France?"},
            {"role": "assistant", "content": "The capital of France is Paris."}
        ]
    },
    {
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Summarize this text: ..."},
            {"role": "assistant", "content": "Summary: ..."}
        ]
    }
]

# Save as JSONL, one example per line
import json

with open("training_data.jsonl", "w") as f:
    for example in training_examples:
        f.write(json.dumps(example) + "\n")
Data Quality Guidelines
data_quality_guidelines = {
    "quantity": {
        "minimum": "50-100 examples for simple tasks",
        "recommended": "500-1000 for complex tasks",
        "more_is_better": "But quality > quantity"
    },
    "quality": {
        "accuracy": "All examples must be correct",
        "consistency": "Same format and style throughout",
        "diversity": "Cover edge cases and variations",
        "relevance": "Only include task-relevant examples"
    },
    "format": {
        "clear_instructions": "System prompts define behavior",
        "realistic_inputs": "Match real-world usage",
        "ideal_outputs": "Show exactly what you want"
    }
}
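A practical corollary of these guidelines: hold out part of your data for evaluation before training. A minimal sketch (the 90/10 split is a common convention, not a rule):
# Shuffle and split examples into train/eval sets
import random

random.seed(42)  # reproducible shuffle
random.shuffle(training_examples)
split = int(0.9 * len(training_examples))
train_set = training_examples[:split]
eval_set = training_examples[split:]  # never shown to the model in training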
# Data validation
def validate_training_data(examples):
    issues = []
    for i, example in enumerate(examples):
        # Check structure
        if "messages" not in example:
            issues.append(f"Example {i}: Missing 'messages' key")
            continue
        messages = example["messages"]
        # Check roles
        roles = [m["role"] for m in messages]
        if "assistant" not in roles:
            issues.append(f"Example {i}: Missing assistant response")
        # Check content length
        for msg in messages:
            if len(msg["content"]) < 10:
                issues.append(f"Example {i}: Very short content detected")
    return issues
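Run the validator before uploading anything:
issues = validate_training_data(training_examples)
if issues:
    print("\n".join(issues))
else:
    print("All examples passed validation")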
Fine-Tuning with Azure OpenAI
from openai import AzureOpenAI

client = AzureOpenAI(
    azure_endpoint="https://your-resource.openai.azure.com/",
    api_key="your-api-key",
    api_version="2024-02-15-preview"
)

# Upload training file
with open("training_data.jsonl", "rb") as f:
    training_file = client.files.create(
        file=f,
        purpose="fine-tune"
    )
print(f"Training file ID: {training_file.id}")

# Create fine-tuning job
job = client.fine_tuning.jobs.create(
    training_file=training_file.id,
    model="gpt-35-turbo-0613",  # base model
    hyperparameters={
        "n_epochs": 3,
        "batch_size": 4,
        "learning_rate_multiplier": 0.1
    }
)
print(f"Job ID: {job.id}")
print(f"Status: {job.status}")

# Monitor job progress until it reaches a terminal state
import time

while job.status not in ["succeeded", "failed", "cancelled"]:
    time.sleep(60)
    job = client.fine_tuning.jobs.retrieve(job.id)
    print(f"Status: {job.status}")

if job.status == "succeeded":
    print(f"Fine-tuned model: {job.fine_tuned_model}")
Hyperparameter Tuning
# Key hyperparameters
hyperparameters = {
    "n_epochs": {
        "description": "Number of training passes",
        "default": 3,
        "guidance": "Start with 3, increase if underfitting"
    },
    "batch_size": {
        "description": "Examples per training step",
        "default": 4,
        "guidance": "Larger = more stable, smaller = more updates"
    },
    "learning_rate_multiplier": {
        "description": "Learning rate scaling",
        "default": 1.0,
        "guidance": "Lower for more subtle changes"
    }
}
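To spot under- or overfitting during a run, you can pull the job's training events, whose messages include loss values (a sketch; the exact message format varies by API version):
# Inspect recent training events for the fine-tuning job
events = client.fine_tuning.jobs.list_events(
    fine_tuning_job_id=job.id,
    limit=20
)
for event in events.data:
    print(event.message)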
# Experimentation strategy: train_model and evaluate_model are
# placeholders for your own training and evaluation wrappers
def hyperparameter_search():
    experiments = [
        {"n_epochs": 2, "learning_rate_multiplier": 0.1},
        {"n_epochs": 3, "learning_rate_multiplier": 0.1},
        {"n_epochs": 3, "learning_rate_multiplier": 0.5},
        {"n_epochs": 4, "learning_rate_multiplier": 0.1},
    ]
    results = []
    for params in experiments:
        job = train_model(params)
        metrics = evaluate_model(job.fine_tuned_model)
        results.append({"params": params, "metrics": metrics})
    return sorted(results, key=lambda x: x["metrics"]["accuracy"], reverse=True)
Evaluation
# Evaluation dataset (separate from training)
evaluation_examples = [
    {"input": "...", "expected_output": "..."},
    # ... more examples
]

def evaluate_fine_tuned_model(model_name, test_data):
    correct = 0
    total = len(test_data)
    for example in test_data:
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "user", "content": example["input"]}
            ]
        )
        actual = response.choices[0].message.content
        expected = example["expected_output"]
        # Your evaluation logic
        if evaluate_match(actual, expected):
            correct += 1
    accuracy = correct / total
    return {"accuracy": accuracy, "correct": correct, "total": total}
# Compare base vs fine-tuned on the held-out evaluation set
base_results = evaluate_fine_tuned_model("gpt-35-turbo", evaluation_examples)
finetuned_results = evaluate_fine_tuned_model("ft:gpt-35-turbo:...", evaluation_examples)
print(f"Base model accuracy: {base_results['accuracy']:.2%}")
print(f"Fine-tuned accuracy: {finetuned_results['accuracy']:.2%}")
Best Practices
best_practices = {
    "data": {
        "clean_data": "Ensure training data is accurate",
        "diverse_examples": "Cover the full range of use cases",
        "balanced": "Don't over-represent any category",
        "hold_out_test": "Keep evaluation data separate"
    },
    "training": {
        "start_small": "Begin with fewer epochs",
        "monitor_loss": "Watch for overfitting",
        "validate_early": "Test during training if possible"
    },
    "deployment": {
        "a_b_test": "Compare against base model",
        "monitor_performance": "Track real-world metrics",
        "iterate": "Fine-tune again with new data"
    }
}
Tomorrow we’ll dive deep into LoRA and QLoRA techniques.