
Parameter-Efficient Fine-Tuning: A Comprehensive Guide

Parameter-Efficient Fine-Tuning (PEFT) methods let you customize large models while training only a small fraction of their parameters. Today we’ll explore the landscape of PEFT techniques.

PEFT Overview

# Why PEFT matters
peft_benefits = {
    "memory_efficient": "Train on consumer GPUs",
    "storage_efficient": "Small adapter files (MBs vs GBs)",
    "modular": "Swap adapters without reloading base model",
    "fast": "Quicker training iterations",
    "prevents_forgetting": "Base model knowledge preserved"
}

# PEFT method categories
peft_categories = {
    "additive": ["Adapters", "Prefix Tuning", "Prompt Tuning"],
    "selective": ["BitFit", "Diff Pruning"],
    "reparameterization": ["LoRA", "AdaLoRA", "DoRA"]
}
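
To see what “a small fraction” means in practice, here’s a minimal sketch that wraps a base model with LoRA and prints the trainable share. gpt2 is just an illustrative checkpoint; the printed numbers will vary by model and config.

from transformers import AutoModelForCausalLM
from peft import LoraConfig, TaskType, get_peft_model

base_model = AutoModelForCausalLM.from_pretrained("gpt2")  # illustrative base
lora_config = LoraConfig(task_type=TaskType.CAUSAL_LM, r=8)
model = get_peft_model(base_model, lora_config)

# Prints something like:
# trainable params: 294,912 || all params: 124,734,720 || trainable%: 0.2364
model.print_trainable_parameters()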

Adapter Methods

Standard Adapters

# Note: classic bottleneck adapters are not part of Hugging Face's peft
# package; they live in the AdapterHub "adapters" library. A sketch,
# assuming that library's API:
import adapters
from adapters import SeqBnConfig

adapters.init(base_model)  # make a plain transformers model adapter-aware

# Adapter configuration
adapter_config = SeqBnConfig(
    reduction_factor=16,    # bottleneck width = hidden_size / 16
    non_linearity="relu",   # activation inside the adapter
)

# Apply and activate the adapter (base weights stay frozen)
base_model.add_adapter("my_task", config=adapter_config)
base_model.train_adapter("my_task")

# How adapters work:
"""
Original layer:
    output = layer(input)

With adapter:
    hidden = layer(input)
    adapter_output = down_project(hidden)  # Reduce dimension
    adapter_output = activation(adapter_output)
    adapter_output = up_project(adapter_output)  # Restore dimension
    output = hidden + adapter_output  # Residual connection
"""

Parallel Adapters

# Parallel adapters process the layer input alongside the original layer.
# In the AdapterHub library this is its own config class (ParBnConfig)
# rather than a parallel=True flag:
from adapters import ParBnConfig

parallel_adapter_config = ParBnConfig(
    reduction_factor=16,
)
base_model.add_adapter("parallel_task", config=parallel_adapter_config)

# Benefits:
# - Can be more expressive
# - Better gradient flow
# Trade-off: slightly more compute per forward pass
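
Mechanically, the parallel variant reads the layer’s input rather than its output. A minimal PyTorch sketch (dimensions are again assumptions):

import torch.nn as nn

class ParallelAdapterBlock(nn.Module):
    """Run a bottleneck adapter alongside the wrapped layer."""
    def __init__(self, layer, hidden_size=768, adapter_size=64):
        super().__init__()
        self.layer = layer
        self.down = nn.Linear(hidden_size, adapter_size)
        self.act = nn.ReLU()
        self.up = nn.Linear(adapter_size, hidden_size)

    def forward(self, x):
        # serial: adapter(layer(x)); parallel: layer(x) + adapter(x)
        return self.layer(x) + self.up(self.act(self.down(x)))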

Prefix Tuning

from peft import PrefixTuningConfig, TaskType

# Prefix tuning adds trainable key/value vectors to every attention layer
prefix_config = PrefixTuningConfig(
    task_type=TaskType.CAUSAL_LM,  # needed so peft wraps the model correctly
    num_virtual_tokens=20,    # Number of prefix tokens
    encoder_hidden_size=768,  # Hidden size of the prefix-encoding MLP
    prefix_projection=True    # Use an MLP to generate the prefix
)

model = get_peft_model(base_model, prefix_config)

# How it works:
"""
Normal attention:
    attention(Q, K, V)

With prefix:
    K' = concat(prefix_K, K)
    V' = concat(prefix_V, V)
    attention(Q, K', V')

The prefix tokens are learned during training.
"""

Prompt Tuning

from peft import PromptTuningConfig, PromptTuningInit, TaskType

# Soft prompt tuning
prompt_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    num_virtual_tokens=20,
    prompt_tuning_init=PromptTuningInit.TEXT,
    prompt_tuning_init_text="Classify the sentiment of the following text:",
    tokenizer_name_or_path=model_name  # checkpoint name of the base model
)

model = get_peft_model(base_model, prompt_config)

# How it works:
"""
Instead of discrete tokens: "Classify sentiment:"
Learn continuous embeddings that serve the same purpose
but are optimized for the specific task.

Input: [learned_prompt_embeddings] + [actual_input_embeddings]
"""

P-Tuning

from peft import PromptEncoderConfig, TaskType

# P-Tuning configuration (peft's PromptEncoderConfig implements P-Tuning)
p_tuning_config = PromptEncoderConfig(
    num_virtual_tokens=20,
    encoder_hidden_size=128,
    encoder_num_layers=2,
    encoder_dropout=0.1,
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(base_model, p_tuning_config)

# Difference from prompt tuning:
# - Uses an encoder (MLP/LSTM) to generate prompt embeddings
# - More expressive but more parameters
# - Better for complex tasks
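
As a rough sketch of the difference, the virtual-token embeddings pass through a small encoder before being prepended; the sizes below are assumptions, and peft’s actual prompt encoder also supports an LSTM variant.

import torch
import torch.nn as nn

num_virtual_tokens, hidden_size, encoder_hidden = 20, 768, 128

raw_embeddings = nn.Embedding(num_virtual_tokens, hidden_size)
prompt_encoder = nn.Sequential(
    nn.Linear(hidden_size, encoder_hidden),
    nn.ReLU(),
    nn.Linear(encoder_hidden, hidden_size),
)

indices = torch.arange(num_virtual_tokens)
prompt_embeddings = prompt_encoder(raw_embeddings(indices))  # (20, 768)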

IA3 (Infused Adapter by Inhibiting and Amplifying Inner Activations)

from peft import IA3Config, TaskType

# IA3 configuration (module names below follow LLaMA-style layers)
ia3_config = IA3Config(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["k_proj", "v_proj", "down_proj"],
    feedforward_modules=["down_proj"]  # must be a subset of target_modules
)

model = get_peft_model(base_model, ia3_config)

# How it works:
"""
Instead of adding new layers:
    output = activation * learned_vector

Very few parameters (just scaling vectors)
But effective for many tasks.
"""

Comparing PEFT Methods

comparison_table = """
| Method          | Params Added | Memory  | Quality | Use Case |
|-----------------|--------------|---------|---------|----------|
| Full Fine-tune  | 100%         | Highest | Best    | Unlimited resources |
| LoRA            | ~0.1%        | Medium  | High    | General purpose |
| QLoRA           | ~0.1%        | Low     | Good    | Limited GPU memory |
| Adapters        | ~1-5%        | Medium  | High    | Multi-task |
| Prefix Tuning   | ~0.1%        | Low     | Good    | NLU tasks |
| Prompt Tuning   | <0.1%        | Lowest  | Moderate| Simple classification |
| IA3             | <0.01%       | Lowest  | Good    | Memory constrained |
"""

Choosing the Right Method

def recommend_peft_method(scenario):
    recommendations = {
        "limited_gpu_memory": {
            "method": "QLoRA",
            "reasoning": "4-bit quantization minimizes memory"
        },
        "multiple_tasks": {
            "method": "Adapters",
            "reasoning": "Easy to swap task-specific adapters"
        },
        "simple_classification": {
            "method": "Prompt Tuning",
            "reasoning": "Minimal parameters, fast training"
        },
        "generation_task": {
            "method": "LoRA",
            "reasoning": "Good balance of quality and efficiency"
        },
        "instruction_following": {
            "method": "LoRA or QLoRA",
            "reasoning": "Well-suited for instruction tuning"
        },
        "knowledge_intensive": {
            "method": "Full fine-tune or LoRA with high rank",
            "reasoning": "Need more capacity for new knowledge"
        }
    }
    return recommendations.get(
        scenario, {"method": "LoRA", "reasoning": "Strong default choice"}
    )
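
For example (hypothetical call):

print(recommend_peft_method("limited_gpu_memory"))
# {'method': 'QLoRA', 'reasoning': '4-bit quantization minimizes memory'}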

Combining PEFT Methods

from peft import LoraConfig, get_peft_model

# You can combine multiple PEFT methods
# Example: LoRA for attention + Adapters for FFN

lora_config = LoraConfig(
    r=8,
    target_modules=["q_proj", "v_proj"]
)

# Apply LoRA first
model = get_peft_model(base_model, lora_config)

# Note: Combining methods requires careful implementation
# and isn't always beneficial
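
What peft does support cleanly is several adapters of the same type on one model, e.g. two LoRA adapters you can switch between. A sketch (the adapter names are illustrative):

style_config = LoraConfig(r=8, target_modules=["q_proj", "v_proj"])
model.add_adapter("style", style_config)

model.set_adapter("style")    # activate the "style" adapter
model.set_adapter("default")  # switch back to the original one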

Training Tips

training_tips = {
    "learning_rate": {
        "peft_general": "Higher than full fine-tuning (1e-4 to 3e-4)",
        "prompt_tuning": "Can be even higher (1e-3)",
        "lora": "2e-4 is a good starting point"
    },
    "batch_size": {
        "guidance": "Use gradient accumulation if memory limited",
        "example": "batch_size=1, gradient_accumulation_steps=16"
    },
    "epochs": {
        "small_dataset": "More epochs (5-10)",
        "large_dataset": "Fewer epochs (1-3)"
    },
    "regularization": {
        "dropout": "Use dropout in PEFT layers",
        "weight_decay": "Small amount (0.01) helps"
    }
}
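
Translated into Hugging Face TrainingArguments, the tips above might look like this (values are starting points, not gospel):

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./peft-run",
    learning_rate=2e-4,              # higher than typical full fine-tuning
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,  # effective batch size of 16
    num_train_epochs=3,
    weight_decay=0.01,
)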

Evaluation

def evaluate_peft_model(model, tokenizer, test_data, task_type="classification"):
    """Evaluate a PEFT model on a list of {"input", "label"} examples."""
    predictions = []
    labels = []

    for example in test_data:
        # Generate prediction (tokenize, generate, decode back to text)
        inputs = tokenizer(example["input"], return_tensors="pt").to(model.device)
        output_ids = model.generate(**inputs, max_new_tokens=32)
        predictions.append(tokenizer.decode(output_ids[0], skip_special_tokens=True))
        labels.append(example["label"])

    if task_type == "classification":
        accuracy = sum(p == l for p, l in zip(predictions, labels)) / len(labels)
        return {"accuracy": accuracy}

    elif task_type == "generation":
        # Use appropriate metrics (BLEU, ROUGE, etc.)
        from evaluate import load
        bleu = load("bleu")
        # BLEU expects one or more references per prediction
        score = bleu.compute(predictions=predictions, references=[[l] for l in labels])
        return {"bleu": score["bleu"]}
Tomorrow we’ll explore PEFT libraries and their practical usage.


Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.