7 min read
Constitutional AI: Self-Supervised Alignment
Introduction
Constitutional AI (CAI) is an approach developed by Anthropic that uses a set of written principles (a “constitution”) to guide AI behavior: the model critiques and revises its own responses against those principles. Because the feedback comes from the model itself, this approach reduces reliance on human feedback while improving alignment at scale.
Constitutional AI Overview
from dataclasses import dataclass
from typing import List, Dict
@dataclass
class ConstitutionalPrinciple:
    """One rule of a constitution, bundled with the prompts used to apply it.

    All four fields are plain strings. The generated constructor accepts
    them positionally in the order they are declared here.
    """

    # Short label for the principle (for example "Honesty").
    name: str
    # Plain-language statement of what the principle asks for.
    description: str
    # Instruction text to use when requesting a critique of a response.
    critique_prompt: str
    # Instruction text to use when requesting a revision of a response.
    revision_prompt: str
class Constitution:
    """Named, ordered list of principles.

    Attributes:
        name: Label given to this constitution at construction time.
        principles: The principles added so far, in insertion order.
    """

    def __init__(self, name: str):
        # Every constitution starts out empty; principles are added later.
        self.name = name
        self.principles: List[ConstitutionalPrinciple] = []

    def add_principle(self, principle: ConstitutionalPrinciple):
        """Append *principle* to the end of ``self.principles``."""
        self.principles.append(principle)

    def get_principle_names(self) -> List[str]:
        """Return the ``name`` of each stored principle, in insertion order."""
        names: List[str] = []
        for item in self.principles:
            names.append(item.name)
        return names
# Example constitution based on Anthropic's approach
def create_helpful_harmless_honest_constitution() -> Constitution:
    """Build the example "HHH Constitution".

    Returns:
        A new Constitution named "HHH Constitution" holding four principles,
        added in this order: Harmlessness, Helpfulness, Honesty,
        Respectfulness.
    """
    # One row per principle:
    # (name, description, critique_prompt, revision_prompt)
    rows = (
        (
            "Harmlessness",
            "Avoid generating harmful, dangerous, or illegal content",
            "Identify any harmful, dangerous, unethical, or illegal content in this response.",
            "Revise the response to remove harmful content while remaining helpful.",
        ),
        (
            "Helpfulness",
            "Provide useful, accurate, and relevant information",
            "Does this response adequately address the user's question? Is it accurate and useful?",
            "Revise to make the response more helpful and accurate.",
        ),
        (
            "Honesty",
            "Be truthful and acknowledge uncertainty",
            "Is this response honest? Does it acknowledge limitations or uncertainty where appropriate?",
            "Revise to be more honest about limitations and avoid false claims.",
        ),
        (
            "Respectfulness",
            "Treat users with respect and avoid discrimination",
            "Is this response respectful? Does it avoid bias or discrimination?",
            "Revise to be more respectful and remove any biased language.",
        ),
    )

    hhh = Constitution("HHH Constitution")
    for label, summary, critique_text, revision_text in rows:
        hhh.add_principle(ConstitutionalPrinciple(
            name=label,
            description=summary,
            critique_prompt=critique_text,
            revision_prompt=revision_text,
        ))
    return hhh
CAI Training Process
class CAIProcess:
    """Static, descriptive summary of the two CAI training stages."""

    @staticmethod
    def describe_stages() -> Dict:
        """Return a freshly built dict describing both training stages.

        The result has two keys, "stage_1_critique_revision" and
        "stage_2_rl_cai". Each maps to a dict with a "name" string, a
        "steps" list of strings and a "benefit" string.
        """
        supervised_stage = {
            "name": "Supervised Learning from AI Feedback (SL-CAI)",
            "steps": [
                "1. Generate initial response to prompt",
                "2. Ask model to critique response using principles",
                "3. Ask model to revise based on critique",
                "4. Use revised responses for supervised training",
            ],
            "benefit": "Creates training data without human annotation",
        }
        reinforcement_stage = {
            "name": "Reinforcement Learning from AI Feedback (RL-CAI)",
            "steps": [
                "1. Generate comparison pairs (original vs revised)",
                "2. Use model to rank pairs based on principles",
                "3. Train reward model on AI preferences",
                "4. Use RL to optimize policy",
            ],
            "benefit": "Scales alignment without human preference labels",
        }
        return {
            "stage_1_critique_revision": supervised_stage,
            "stage_2_rl_cai": reinforcement_stage,
        }
class CritiqueRevisionPipeline:
    """Pass a model's draft answer through critique and revision, principle by principle.

    ``model`` is any object with a ``generate(text)`` method; whatever that
    method returns is used as the response, critique or revision text.
    ``constitution`` supplies the ``principles`` that are applied in order.
    """

    def __init__(self, model, constitution: Constitution):
        self.model = model
        self.constitution = constitution

    def generate_initial_response(self, prompt: str) -> str:
        """Return the model's unrevised output for *prompt*."""
        draft = self.model.generate(prompt)
        return draft

    def critique(self, prompt: str, response: str, principle: ConstitutionalPrinciple) -> str:
        """Ask the model to critique *response* using ``principle.critique_prompt``."""
        # The empty first and last items give the text its leading and
        # trailing newline, matching the multi-line template layout.
        request = "\n".join((
            "",
            f"Original prompt: {prompt}",
            f"Response to critique: {response}",
            f"{principle.critique_prompt}",
            "Provide a detailed critique:",
            "",
        ))
        return self.model.generate(request)

    def revise(self, prompt: str, response: str, critique: str, principle: ConstitutionalPrinciple) -> str:
        """Ask the model to rewrite *response* in light of *critique*."""
        request = "\n".join((
            "",
            f"Original prompt: {prompt}",
            f"Original response: {response}",
            f"Critique: {critique}",
            f"{principle.revision_prompt}",
            "Revised response:",
            "",
        ))
        return self.model.generate(request)

    def full_revision_chain(self, prompt: str) -> Dict:
        """Generate a draft for *prompt*, then critique and revise it once per principle.

        Each principle works on the output of the previous one.

        Returns:
            Dict with "prompt", "initial_response", "final_response" and
            "revision_chain". The chain starts with an "initial" entry and
            then has one entry per principle holding its critique and the
            revised response.
        """
        draft = self.generate_initial_response(prompt)
        chain = [{"stage": "initial", "response": draft}]

        latest = draft
        for rule in self.constitution.principles:
            feedback = self.critique(prompt, latest, rule)
            # The revision sees the same text the critique was written about.
            latest = self.revise(prompt, latest, feedback, rule)
            chain.append(dict(stage=rule.name, critique=feedback, response=latest))

        return dict(
            prompt=prompt,
            initial_response=draft,
            final_response=latest,
            revision_chain=chain,
        )
Practical Implementation
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
class CAIImplementation:
    """Critique-and-revise loop driven by LangChain prompt | model chains.

    The constitution is always the one returned by
    ``create_helpful_harmless_honest_constitution()``.
    """

    def __init__(self, model_name: str = "gpt-4"):
        # The same chat model instance serves both critique and revision chains.
        self.llm = ChatOpenAI(model=model_name, temperature=0.7)
        self.constitution = create_helpful_harmless_honest_constitution()

    def create_critique_chain(self, principle: ConstitutionalPrinciple):
        """Return a ``prompt | llm`` chain that asks for a critique.

        The template is the same for every principle; principle-specific
        text is supplied through the template variables at invoke time, so
        *principle* itself is not read here.
        """
        template_lines = (
            "",
            "You are evaluating a response based on this principle: {principle_name}",
            "Description: {principle_description}",
            "Original user prompt: {user_prompt}",
            "Response to evaluate: {response}",
            "{critique_instructions}",
            'Provide a specific, constructive critique. If the response is acceptable, say "No issues found."',
            "Critique:",
            "",
        )
        critique_template = ChatPromptTemplate.from_template("\n".join(template_lines))
        return critique_template | self.llm

    def create_revision_chain(self, principle: ConstitutionalPrinciple):
        """Return a ``prompt | llm`` chain that asks for a revised response.

        As with the critique chain, *principle* is not read here; its text
        arrives through the template variables at invoke time.
        """
        template_lines = (
            "",
            "You are revising a response based on this critique and principle: {principle_name}",
            "Original user prompt: {user_prompt}",
            "Original response: {response}",
            "Critique: {critique}",
            "{revision_instructions}",
            "If the critique found no issues, return the original response unchanged.",
            "Otherwise, provide an improved response that addresses the critique.",
            "Revised response:",
            "",
        )
        revision_template = ChatPromptTemplate.from_template("\n".join(template_lines))
        return revision_template | self.llm

    def apply_constitution(self, user_prompt: str, initial_response: str) -> Dict:
        """Critique and revise *initial_response* once per principle, in order.

        Each principle operates on the previous principle's revised text.
        The ``.content`` attribute of each chain result is taken as the
        critique or revision text.

        Returns:
            Dict with "initial_response", "final_response" and "history";
            the history has one entry per principle with its name, critique
            and revised response.
        """
        history = []
        latest = initial_response

        for rule in self.constitution.principles:
            # Step 1: critique the current text under this principle.
            critique_inputs = {
                "principle_name": rule.name,
                "principle_description": rule.description,
                "user_prompt": user_prompt,
                "response": latest,
                "critique_instructions": rule.critique_prompt,
            }
            feedback = self.create_critique_chain(rule).invoke(critique_inputs).content

            # Step 2: revise that same text using the critique just obtained.
            revision_inputs = {
                "principle_name": rule.name,
                "user_prompt": user_prompt,
                "response": latest,
                "critique": feedback,
                "revision_instructions": rule.revision_prompt,
            }
            rewritten = self.create_revision_chain(rule).invoke(revision_inputs).content

            history.append({
                "principle": rule.name,
                "critique": feedback,
                "revised_response": rewritten,
            })
            latest = rewritten

        return {
            "initial_response": initial_response,
            "final_response": latest,
            "history": history,
        }
# Usage: push one hand-written draft answer through the default constitution.
# NOTE: these statements run at module level, so executing this file builds
# the ChatOpenAI client and invokes the critique/revision chains.

# Example with potentially problematic content
user_prompt = "How can I access someone else's email account?"
initial_response = "You could try guessing their password or using phishing..."

cai = CAIImplementation()
result = cai.apply_constitution(user_prompt, initial_response)
print("Final response: {}".format(result["final_response"]))
Building Custom Constitutions
class ConstitutionBuilder:
    """Fluent helper that assembles a Constitution one group of principles at a time.

    Every ``add_*`` method appends to the same underlying Constitution and
    returns the builder itself, so calls can be chained. ``build`` returns
    that Constitution object.
    """

    def __init__(self, name: str):
        self.constitution = Constitution(name)

    def add_safety_principles(self) -> 'ConstitutionBuilder':
        """Append "No Harmful Instructions" and "No Illegal Activity", in that order."""
        no_harmful_instructions = ConstitutionalPrinciple(
            name="No Harmful Instructions",
            description="Do not provide instructions for harmful activities",
            critique_prompt="Does this response provide instructions that could harm people or property?",
            revision_prompt="Remove any harmful instructions while explaining why you cannot help.",
        )
        no_illegal_activity = ConstitutionalPrinciple(
            name="No Illegal Activity",
            description="Do not assist with illegal activities",
            critique_prompt="Does this response assist with any illegal activities?",
            revision_prompt="Revise to refuse illegal requests and suggest legal alternatives.",
        )
        for rule in (no_harmful_instructions, no_illegal_activity):
            self.constitution.add_principle(rule)
        return self

    def add_accuracy_principles(self) -> 'ConstitutionBuilder':
        """Append "Factual Accuracy" and "Source Attribution", in that order."""
        factual_accuracy = ConstitutionalPrinciple(
            name="Factual Accuracy",
            description="Ensure factual claims are accurate",
            critique_prompt="Are there any factual errors or misleading claims in this response?",
            revision_prompt="Correct any factual errors and add caveats where uncertain.",
        )
        source_attribution = ConstitutionalPrinciple(
            name="Source Attribution",
            description="Acknowledge when information comes from specific sources",
            critique_prompt="Does this response properly attribute information or acknowledge uncertainty?",
            revision_prompt="Add appropriate attribution and acknowledge limitations.",
        )
        for rule in (factual_accuracy, source_attribution):
            self.constitution.add_principle(rule)
        return self

    def add_domain_principle(
        self,
        name: str,
        description: str,
        critique: str,
        revision: str
    ) -> 'ConstitutionBuilder':
        """Append one caller-defined principle.

        ``critique`` and ``revision`` become the principle's
        ``critique_prompt`` and ``revision_prompt`` respectively.
        """
        custom_rule = ConstitutionalPrinciple(
            name=name,
            description=description,
            critique_prompt=critique,
            revision_prompt=revision,
        )
        self.constitution.add_principle(custom_rule)
        return self

    def build(self) -> Constitution:
        """Return the Constitution assembled so far (the builder's own object, not a copy)."""
        return self.constitution
# Example: Customer service constitution
# Built step by step: the two safety principles first, then three
# domain-specific principles, appended in the order written below.
_customer_service_builder = ConstitutionBuilder("Customer Service AI")
_customer_service_builder.add_safety_principles()
_customer_service_builder.add_domain_principle(
    name="Professional Tone",
    description="Maintain professional and empathetic communication",
    critique="Is this response professional and empathetic?",
    revision="Revise to be more professional while showing empathy.",
)
_customer_service_builder.add_domain_principle(
    name="Solution Focused",
    description="Focus on resolving customer issues",
    critique="Does this response help solve the customer's problem?",
    revision="Revise to focus more on actionable solutions.",
)
_customer_service_builder.add_domain_principle(
    name="Company Policy Compliance",
    description="Ensure responses align with company policies",
    critique="Does this response align with standard company policies?",
    revision="Revise to comply with company guidelines.",
)
customer_service_constitution = _customer_service_builder.build()
# The temporary builder is only scaffolding; do not leave it in the module namespace.
del _customer_service_builder
Evaluation and Monitoring
class CAIEvaluator:
    """Summarise what a critique/revision run changed.

    All checks here are purely textual (string inequality and lengths);
    no model is called.
    """

    def __init__(self, constitution: Constitution):
        self.constitution = constitution

    def evaluate_revision(self, initial: str, revised: str, principle: ConstitutionalPrinciple) -> Dict:
        """Compare one response before and after revision under *principle*.

        Returns:
            Dict with the principle's name, "improvement_detected" (True
            when the two texts differ at all) and the length of each text.
        """
        # In practice, use an evaluator model
        return {
            "principle": principle.name,
            "improvement_detected": initial != revised,
            "initial_length": len(initial),
            "revised_length": len(revised),
        }

    def evaluate_full_chain(self, revision_result: Dict) -> Dict:
        """Summarise a whole revision run.

        Args:
            revision_result: Mapping with an "initial_response" string and a
                "history" list. Each history entry must provide
                "principle", "critique" and "revised_response".

        Returns:
            Dict with:
              * "total_principles_applied": number of principles in this
                evaluator's constitution,
              * "revisions_made": how many history entries differ from the
                response that preceded them,
              * "revision_rate": revisions_made divided by
                total_principles_applied, or 0.0 when the constitution has
                no principles,
              * "principle_evaluations": one summary dict per history entry.
        """
        history = revision_result["history"]
        # The initial response is only needed when there is a first entry
        # to compare it against.
        previous = revision_result["initial_response"] if history else None

        evaluations = []
        total_changes = 0
        for entry in history:
            revised = entry["revised_response"]
            changed = previous != revised
            if changed:
                total_changes += 1
            evaluations.append({
                "principle": entry["principle"],
                "changed": changed,
                "critique_length": len(entry["critique"]),
            })
            # Each entry is compared with the one immediately before it.
            previous = revised

        principle_count = len(self.constitution.principles)
        # An empty constitution used to raise ZeroDivisionError here.
        revision_rate = total_changes / principle_count if principle_count else 0.0

        return {
            "total_principles_applied": principle_count,
            "revisions_made": total_changes,
            "revision_rate": revision_rate,
            "principle_evaluations": evaluations,
        }
Conclusion
Constitutional AI provides a scalable approach to AI alignment by using self-critique and revision guided by explicit principles. By defining clear constitutions and implementing automated revision chains, organizations can improve AI safety and helpfulness without extensive human feedback collection.