October 12, 2023 1 min read

Direct Preference Optimization (DPO): A Simpler Alternative to RLHF

DPO LLM Training AI Alignment Machine Learning AI

Introduction

Direct Preference Optimization (DPO) offers a simpler alternative to traditional RLHF by eliminating the need for a separate reward model and PPO training. This post explains how DPO works and when to use it.

DPO vs RLHF

from dataclasses import dataclass
from typing import List, Dict
import math

@dataclass
class TrainingComparison:
    """Compare RLHF and DPO approaches"""

    @staticmethod
    def compare() -> Dict:
        return {
            "RLHF": {
                "stages": ["SFT", "Reward Model", "PPO"],
                "complexity": "High",
                "training_stability": "Challenging",
                "compute_cost": "High (3 models)",
                "hyperparameters": "Many (PPO + reward model)",
                "reward_hacking_risk": "Higher"
            },
            "DPO": {
                "stages": ["SFT", "DPO fine-tuning"],
                "complexity": "Lower",
                "training_stability": "More stable",
                "compute_cost": "Lower (1 model + reference)",
                "hyperparameters": "Few (beta mainly)",
                "reward_hacking_risk": "Lower"
            }
        }

DPO Mathematical Foundation

class DPOConcepts:
    """Core concepts of Direct Preference Optimization"""

    def __init__(self, beta: float = 0.1):
        self.beta = beta  # Temperature parameter

    def explain_objective(self) -> str:
        return """
DPO Objective:

The key insight of DPO is that the optimal policy under RLHF can be expressed
in closed form, allowing direct optimization without a reward model.

Loss function:
L_DPO = -E[log sigmoid(beta * (log(pi(y_w|x)/pi_ref(y_w|x)) - log(pi(y_l|x)/pi_ref(y_l|x))))]

Where:
- y_w: chosen (winning) response
- y_l: rejected (losing) response
- pi: policy being trained
- pi_ref: reference policy (SFT model)
- beta: temperature controlling deviation from reference
"""

    def compute_dpo_loss(
        self,
        log_prob_chosen_policy: float,
        log_prob_rejected_policy: float,
        log_prob_chosen_ref: float,
        log_prob_rejected_ref: float
    ) -> float:
        """Compute DPO loss for a single example"""
        # Log ratios
        chosen_ratio = log_prob_chosen_policy - log_prob_chosen_ref
        rejected_ratio = log_prob_rejected_policy - log_prob_rejected_ref

        # DPO loss
        logit = self.beta * (chosen_ratio - rejected_ratio)
        loss = -math.log(1 / (1 + math.exp(-logit)))

        return loss

    def compute_implicit_reward(
        self,
        log_prob_policy: float,
        log_prob_ref: float
    ) -> float:
        """Compute implicit reward from trained model"""
        return self.beta * (log_prob_policy - log_prob_ref)

Preparing DPO Training Data

@dataclass
class DPOExample:
    """Single DPO training example"""
    prompt: str
    chosen: str
    rejected: str

class DPODataset:
    """Dataset for DPO training"""

    def __init__(self):
        self.examples: List[DPOExample] = []

    def add_example(self, prompt: str, chosen: str, rejected: str):
        """Add preference pair"""
        self.examples.append(DPOExample(
            prompt=prompt,
            chosen=chosen,
            rejected=rejected
        ))

    def from_comparisons(self, comparisons: List[Dict]):
        """Create from comparison data"""
        for comp in comparisons:
            self.add_example(
                prompt=comp["prompt"],
                chosen=comp["response_a"] if comp["preference"] == "A" else comp["response_b"],
                rejected=comp["response_b"] if comp["preference"] == "A" else comp["response_a"]
            )

    def to_training_format(self, tokenizer=None) -> List[Dict]:
        """Convert to training format"""
        formatted = []
        for ex in self.examples:
            item = {
                "prompt": ex.prompt,
                "chosen": ex.chosen,
                "rejected": ex.rejected
            }

            if tokenizer:
                # Tokenize for actual training
                item["chosen_input_ids"] = tokenizer.encode(ex.prompt + ex.chosen)
                item["rejected_input_ids"] = tokenizer.encode(ex.prompt + ex.rejected)

            formatted.append(item)

        return formatted

    def validate(self) -> Dict:
        """Validate dataset"""
        issues = []

        for i, ex in enumerate(self.examples):
            if not ex.prompt.strip():
                issues.append(f"Example {i}: Empty prompt")
            if not ex.chosen.strip():
                issues.append(f"Example {i}: Empty chosen response")
            if not ex.rejected.strip():
                issues.append(f"Example {i}: Empty rejected response")
            if ex.chosen == ex.rejected:
                issues.append(f"Example {i}: Chosen equals rejected")

        return {
            "valid": len(issues) == 0,
            "total_examples": len(self.examples),
            "issues": issues
        }

# Example usage
dataset = DPODataset()

# Add examples
dataset.add_example(
    prompt="What is machine learning?",
    chosen="Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience without being explicitly programmed.",
    rejected="Machine learning is computers doing stuff."
)

dataset.add_example(
    prompt="Write a haiku about coding",
    chosen="Silent keystrokes fall\nLogic weaves through midnight hours\nBugs fade into dawn",
    rejected="Coding is fun and cool I like it a lot yes"
)

validation = dataset.validate()
print(f"Dataset valid: {validation['valid']}")
print(f"Total examples: {validation['total_examples']}")

DPO Training Implementation

class DPOTrainer:
    """Conceptual DPO trainer"""

    def __init__(self, beta: float = 0.1, learning_rate: float = 1e-6):
        self.beta = beta
        self.learning_rate = learning_rate
        self.dpo = DPOConcepts(beta=beta)

    def training_step(
        self,
        batch: List[DPOExample],
        policy_model,
        reference_model
    ) -> Dict:
        """Single training step (conceptual)"""
        total_loss = 0
        chosen_rewards = []
        rejected_rewards = []

        for example in batch:
            # Get log probabilities from models
            # In practice, these come from forward pass
            chosen_text = example.prompt + example.chosen
            rejected_text = example.prompt + example.rejected

            # Policy model log probs
            log_prob_chosen_policy = policy_model.log_prob(chosen_text)
            log_prob_rejected_policy = policy_model.log_prob(rejected_text)

            # Reference model log probs (frozen)
            log_prob_chosen_ref = reference_model.log_prob(chosen_text)
            log_prob_rejected_ref = reference_model.log_prob(rejected_text)

            # Compute loss
            loss = self.dpo.compute_dpo_loss(
                log_prob_chosen_policy,
                log_prob_rejected_policy,
                log_prob_chosen_ref,
                log_prob_rejected_ref
            )
            total_loss += loss

            # Track implicit rewards
            chosen_rewards.append(
                self.dpo.compute_implicit_reward(log_prob_chosen_policy, log_prob_chosen_ref)
            )
            rejected_rewards.append(
                self.dpo.compute_implicit_reward(log_prob_rejected_policy, log_prob_rejected_ref)
            )

        return {
            "loss": total_loss / len(batch),
            "chosen_reward_mean": sum(chosen_rewards) / len(chosen_rewards),
            "rejected_reward_mean": sum(rejected_rewards) / len(rejected_rewards),
            "reward_margin": (sum(chosen_rewards) - sum(rejected_rewards)) / len(batch)
        }

    def get_training_config(self) -> Dict:
        """Get recommended training configuration"""
        return {
            "beta": self.beta,
            "learning_rate": self.learning_rate,
            "batch_size": 4,
            "gradient_accumulation_steps": 8,
            "epochs": 1,
            "warmup_ratio": 0.1,
            "weight_decay": 0.01,
            "max_grad_norm": 1.0,
            "bf16": True,  # Use bfloat16 for stability
            "logging_steps": 10,
            "save_strategy": "epoch"
        }

Using TRL for DPO

def get_trl_dpo_example() -> str:
    """Example code for DPO training with TRL library"""
    return '''
# DPO Training with Hugging Face TRL
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DPOTrainer, DPOConfig

# Load model and tokenizer
model_name = "meta-llama/Llama-2-7b-hf"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Reference model (frozen copy)
ref_model = AutoModelForCausalLM.from_pretrained(model_name)

# Prepare dataset
train_dataset = Dataset.from_dict({
    "prompt": ["What is AI?", "Explain Python"],
    "chosen": ["AI is artificial intelligence...", "Python is a programming language..."],
    "rejected": ["AI is stuff", "Python is a snake"]
})

# DPO configuration
dpo_config = DPOConfig(
    beta=0.1,
    learning_rate=1e-6,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    warmup_ratio=0.1,
    logging_steps=10,
    output_dir="./dpo_model",
    bf16=True,
)

# Initialize trainer
trainer = DPOTrainer(
    model=model,
    ref_model=ref_model,
    args=dpo_config,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)

# Train
trainer.train()

# Save model
trainer.save_model("./dpo_final")
'''

print(get_trl_dpo_example())

DPO Variants and Extensions

class DPOVariants:
    """Variants and extensions of DPO"""

    @staticmethod
    def get_variants() -> List[Dict]:
        return [
            {
                "name": "IPO (Identity Preference Optimization)",
                "description": "Removes log-sigmoid, uses identity function",
                "advantage": "More robust to noisy preferences",
                "use_case": "When preference data has low agreement"
            },
            {
                "name": "KTO (Kahneman-Tversky Optimization)",
                "description": "Uses unpaired preference data",
                "advantage": "Works with thumbs up/down without pairs",
                "use_case": "When you only have binary feedback"
            },
            {
                "name": "ORPO (Odds Ratio Preference Optimization)",
                "description": "Combines SFT and preference alignment",
                "advantage": "Single stage training",
                "use_case": "When you want simpler pipeline"
            },
            {
                "name": "cDPO (Conservative DPO)",
                "description": "Adds regularization toward reference",
                "advantage": "Better preserves capabilities",
                "use_case": "When base model has important skills"
            }
        ]

    @staticmethod
    def choose_variant(scenario: str) -> str:
        """Recommend variant based on scenario"""
        scenarios = {
            "paired_preferences": "Standard DPO",
            "noisy_labels": "IPO",
            "only_ratings": "KTO",
            "limited_compute": "ORPO",
            "preserve_capabilities": "cDPO"
        }
        return scenarios.get(scenario, "Standard DPO")

Best Practices

class DPOBestPractices:
    """Best practices for DPO training"""

    @staticmethod
    def get_recommendations() -> List[Dict]:
        return [
            {
                "category": "Data Quality",
                "recommendations": [
                    "Ensure clear preference differences between chosen/rejected",
                    "Use multiple annotators and check agreement",
                    "Include diverse prompts covering target use cases",
                    "Balance different types of preferences"
                ]
            },
            {
                "category": "Hyperparameters",
                "recommendations": [
                    "Start with beta=0.1, adjust based on results",
                    "Use low learning rate (1e-6 to 5e-6)",
                    "Train for 1 epoch to avoid overfitting",
                    "Use gradient checkpointing for large models"
                ]
            },
            {
                "category": "Evaluation",
                "recommendations": [
                    "Monitor reward margin during training",
                    "Evaluate on held-out preference data",
                    "Check for capability degradation",
                    "Compare against SFT baseline"
                ]
            },
            {
                "category": "Common Issues",
                "recommendations": [
                    "If loss doesn't decrease, increase beta",
                    "If quality degrades, decrease beta or learning rate",
                    "If overfitting, reduce epochs or add regularization",
                    "Ensure reference model is frozen properly"
                ]
            }
        ]

# Print recommendations
for practice in DPOBestPractices.get_recommendations():
    print(f"\n{practice['category']}:")
    for rec in practice["recommendations"]:
        print(f"  - {rec}")

Conclusion

DPO provides a simpler, more stable alternative to RLHF for aligning language models with human preferences. By directly optimizing on preference pairs without a separate reward model, DPO reduces complexity while achieving comparable results. Choose DPO when you have clean preference pairs and want straightforward training.