Reinforcement Learning with AI
RLHF Pipeline
Reinforcement Learning from Human Feedback (RLHF) aligns language models with human preferences in three stages: supervised fine-tuning, reward modeling, and policy optimization.
Reward Model Training
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
class RewardModel(nn.Module):
    """Scalar reward head on top of a pretrained transformer backbone.

    The backbone is loaded with ``AutoModel.from_pretrained(model_name)`` and
    one hidden state per sequence is projected to a single reward value.

    Args:
        model_name: Checkpoint name or path passed to ``AutoModel``.
        pooling: Which token's hidden state summarizes the sequence.
            ``"last"`` (default) uses the last position whose attention mask
            is non-zero; ``"first"`` uses position 0 (the ``[CLS]`` slot of
            encoder-style models).

    Raises:
        ValueError: If ``pooling`` is neither ``"last"`` nor ``"first"``.
    """

    def __init__(self, model_name: str, pooling: str = "last"):
        super().__init__()
        if pooling not in ("last", "first"):
            raise ValueError(
                f"pooling must be 'last' or 'first', got {pooling!r}"
            )
        self.pooling = pooling
        self.transformer = AutoModel.from_pretrained(model_name)
        self.reward_head = nn.Linear(self.transformer.config.hidden_size, 1)

    @staticmethod
    def _last_attended_state(hidden_states, attention_mask):
        """Return, per sequence, the hidden state at the last attended position."""
        if attention_mask is None:
            # No mask means no padding: the final position is the last token.
            return hidden_states[:, -1, :]
        seq_len = hidden_states.shape[1]
        # argmax over the reversed mask yields the distance of the last
        # non-zero entry from the end, which is correct for both left- and
        # right-padded batches.
        offset_from_end = attention_mask.flip(dims=[1]).long().argmax(dim=1)
        last_index = (seq_len - 1 - offset_from_end).to(hidden_states.device)
        rows = torch.arange(hidden_states.shape[0], device=hidden_states.device)
        return hidden_states[rows, last_index]

    def forward(self, input_ids, attention_mask):
        """Compute one scalar reward per input sequence.

        Args:
            input_ids: Token ids forwarded to the backbone.
            attention_mask: Mask forwarded to the backbone and, for
                ``pooling="last"``, used to locate the last real token.

        Returns:
            The reward head's output with its trailing size-1 dimension
            squeezed away.
        """
        outputs = self.transformer(
            input_ids=input_ids, attention_mask=attention_mask
        )
        if hasattr(outputs, "last_hidden_state"):
            hidden_states = outputs.last_hidden_state
        elif isinstance(outputs, (tuple, list)):
            # Tuple-style outputs carry the hidden states first; slicing the
            # tuple itself would raise a TypeError.
            hidden_states = outputs[0]
        else:
            hidden_states = outputs

        if self.pooling == "first":
            pooled = hidden_states[:, 0, :]
        else:
            # With a causal backbone such as GPT-2, position 0 attends only to
            # itself, so its state says nothing about the rest of the text;
            # the last attended token has seen the whole sequence.
            pooled = self._last_attended_state(hidden_states, attention_mask)

        return self.reward_head(pooled).squeeze(-1)
class RewardModelTrainer:
    """Trains a reward model on (chosen, rejected) text pairs.

    Each step minimizes the pairwise logistic loss
    ``-log(sigmoid(r_chosen - r_rejected))`` with AdamW.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns a one-element reward tensor for a single text.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt")``
            whose result exposes ``"input_ids"`` and ``"attention_mask"``.
        lr: Learning rate for the AdamW optimizer.
    """

    def __init__(self, model, tokenizer, lr=1e-5):
        self.model = model
        self.tokenizer = tokenizer
        self.optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    def _score(self, text: str) -> torch.Tensor:
        """Tokenize one text and return the model's reward tensor for it."""
        # Each text is encoded on its own, so padding would be a no-op, and
        # tokenizers without a pad token (e.g. GPT-2) raise when asked to pad.
        encoded = self.tokenizer(text, return_tensors="pt")
        # Pass only the two tensors the model is called with; some tokenizers
        # also emit keys such as token_type_ids that the model may not accept.
        return self.model(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
        )

    def train_step(self, chosen: str, rejected: str):
        """Run one optimizer step on a single preference pair.

        Args:
            chosen: The preferred text.
            rejected: The dispreferred text.

        Returns:
            Tuple ``(loss, chosen_reward, rejected_reward)`` of Python floats,
            with both rewards taken from the forward pass before the update.
        """
        chosen_reward = self._score(chosen)
        rejected_reward = self._score(rejected)

        # logsigmoid stays finite where log(sigmoid(x)) underflows to log(0)
        # for strongly negative margins.
        margin = chosen_reward - rejected_reward
        loss = -torch.nn.functional.logsigmoid(margin).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item(), chosen_reward.item(), rejected_reward.item()
# Build the reward model and its tokenizer from the same "gpt2" checkpoint,
# then hand both to the pairwise-preference trainer (default learning rate).
model = RewardModel(model_name="gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
trainer = RewardModelTrainer(model=model, tokenizer=tokenizer)
PPO Implementation
import torch
import torch.nn as nn
from typing import Tuple
class PPOAgent:
    """Clipped-surrogate PPO updater for a policy with an optional value baseline.

    Args:
        policy_model: Object exposing ``get_log_probs(states, actions)``. If
            it also exposes ``parameters()``, its trainable parameters are
            optimized.
        value_model: Callable mapping ``states`` to value estimates, or
            ``None`` to use a zero baseline.
        clip_epsilon: Half-width of the clipping interval around ratio 1.
        lr: Learning rate of the default AdamW optimizer.
        optimizer: Optional pre-built optimizer; when given, ``lr`` is ignored.

    Note:
        ``update`` only optimizes the policy loss. The value model is used as
        a fixed baseline and is not fitted here.
    """

    def __init__(self, policy_model, value_model, clip_epsilon=0.2,
                 lr=1e-5, optimizer=None):
        self.policy = policy_model
        self.value = value_model
        self.clip_epsilon = clip_epsilon
        if optimizer is None:
            # update() steps self.optimizer, so it has to exist. Build one
            # over the policy's trainable parameters when there are any;
            # AdamW itself rejects an empty parameter list.
            parameters_fn = getattr(policy_model, "parameters", None)
            trainable = (
                [p for p in parameters_fn() if p.requires_grad]
                if callable(parameters_fn)
                else []
            )
            if trainable:
                optimizer = torch.optim.AdamW(trainable, lr=lr)
        self.optimizer = optimizer

    def compute_advantages(
        self,
        rewards: torch.Tensor,
        values: torch.Tensor,
        gamma: float = 0.99,
        lam: float = 0.95
    ) -> torch.Tensor:
        """Compute generalized advantage estimates over one trajectory.

        Walks the trajectory backwards, accumulating
        ``delta_t + gamma * lam * A_{t+1}`` where
        ``delta_t = r_t + gamma * V_{t+1} - V_t``; the value after the final
        step is taken to be 0.

        Args:
            rewards: Per-step rewards, indexed along the first dimension.
            values: Per-step value estimates aligned with ``rewards``.
            gamma: Discount factor.
            lam: GAE smoothing factor.

        Returns:
            Tensor shaped like ``rewards`` holding the advantages.
        """
        advantages = torch.zeros_like(rewards)
        num_steps = len(rewards)
        running_gae = 0
        for t in reversed(range(num_steps)):
            next_value = values[t + 1] if t + 1 < num_steps else 0
            delta = rewards[t] + gamma * next_value - values[t]
            running_gae = delta + gamma * lam * running_gae
            advantages[t] = running_gae
        return advantages

    def ppo_loss(
        self,
        old_log_probs: torch.Tensor,
        new_log_probs: torch.Tensor,
        advantages: torch.Tensor
    ) -> torch.Tensor:
        """Return the negated clipped surrogate objective, averaged over elements.

        Args:
            old_log_probs: Log-probabilities under the behavior policy.
            new_log_probs: Log-probabilities under the current policy.
            advantages: Advantage estimates aligned with the log-probabilities.
        """
        ratio = torch.exp(new_log_probs - old_log_probs)
        clipped_ratio = torch.clamp(
            ratio,
            1 - self.clip_epsilon,
            1 + self.clip_epsilon
        )
        unclipped_term = ratio * advantages
        clipped_term = clipped_ratio * advantages
        return -torch.min(unclipped_term, clipped_term).mean()

    def update(self, batch, epochs=4):
        """Run several PPO epochs on one batch and return the mean loss.

        Args:
            batch: Mapping with keys ``"states"``, ``"actions"``,
                ``"rewards"`` and ``"old_log_probs"``.
            epochs: Number of optimization passes over the batch.

        Returns:
            The policy loss averaged over the epochs, as a float.

        Raises:
            ValueError: If ``epochs`` is not positive.
            RuntimeError: If no optimizer is available.
        """
        if epochs <= 0:
            raise ValueError("epochs must be a positive integer")
        if self.optimizer is None:
            raise RuntimeError(
                "PPOAgent has no optimizer: the policy exposes no trainable "
                "parameters and none was passed to the constructor"
            )

        rewards = batch["rewards"]
        # Advantages are fixed targets: compute them once, without autograd,
        # so the policy loss cannot push gradients into the value estimates.
        with torch.no_grad():
            if self.value is None:
                values = torch.zeros_like(rewards)
            else:
                # Accept value heads that return a trailing size-1 dimension.
                values = self.value(batch["states"]).reshape(rewards.shape)
            advantages = self.compute_advantages(rewards, values)

        total_loss = 0.0
        for _ in range(epochs):
            new_log_probs = self.policy.get_log_probs(
                batch["states"], batch["actions"]
            )
            loss = self.ppo_loss(
                batch["old_log_probs"], new_log_probs, advantages
            )
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            total_loss += loss.item()
        return total_loss / epochs
RLHF Training Loop
class RLHFTrainer:
    """Samples responses from a policy and scores them with a reward model.

    Args:
        policy_model: Model exposing ``generate(**inputs, ...)``.
        reward_model: Callable invoked as
            ``reward_model(input_ids=..., attention_mask=...)`` that returns
            a one-element tensor for a single text.
        tokenizer: Tokenizer shared by generation, decoding and scoring.

    Note:
        A ``PPOAgent`` is constructed for the policy, but ``train_step`` does
        not invoke it; the step only generates and scores.
    """

    def __init__(self, policy_model, reward_model, tokenizer):
        self.policy = policy_model
        self.reward_model = reward_model
        self.tokenizer = tokenizer
        self.ppo_agent = PPOAgent(policy_model, None)

    def generate_response(self, prompt: str) -> str:
        """Sample up to 256 new tokens for ``prompt`` and decode the output.

        The first sequence returned by ``generate`` is decoded in full with
        special tokens skipped.
        """
        inputs = self.tokenizer(prompt, return_tensors="pt")
        with torch.no_grad():
            outputs = self.policy.generate(
                **inputs,
                max_new_tokens=256,
                temperature=0.7,
                do_sample=True
            )
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    def _score(self, text: str) -> float:
        """Return the reward model's scalar score for a single text."""
        # One text per call, so no padding is requested: it would be a no-op,
        # and tokenizers without a pad token (e.g. GPT-2) raise on it.
        encoded = self.tokenizer(text, return_tensors="pt")
        # Pass only the two tensors the reward model is called with.
        return self.reward_model(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
        ).item()

    def train_step(self, prompts: list):
        """Generate one response per prompt, score each, and report the mean.

        Args:
            prompts: Prompt strings.

        Returns:
            Tuple ``(responses, rewards)`` of parallel lists. Both are empty
            (and nothing is printed) when ``prompts`` is empty.
        """
        if not prompts:
            # Nothing to average; avoids dividing by zero below.
            return [], []

        responses = [self.generate_response(prompt) for prompt in prompts]
        with torch.no_grad():
            rewards = [self._score(response) for response in responses]

        print(f"Avg Reward: {sum(rewards)/len(rewards):.3f}")
        return responses, rewards
# policy_model, reward_model and training_prompts are not defined in this
# snippet and must be provided by the surrounding code before this runs.
trainer = RLHFTrainer(
    policy_model=policy_model,
    reward_model=reward_model,
    tokenizer=tokenizer,
)
# Ten passes over the same prompt list; each pass keeps only the latest
# responses and rewards.
for epoch in range(10):
    responses, rewards = trainer.train_step(prompts=training_prompts)
DPO (Direct Preference Optimization)
class DPOTrainer:
    """Holds a model, tokenizer and beta, and computes the DPO loss.

    Args:
        model: The model being optimized (stored, not used by ``dpo_loss``).
        tokenizer: Tokenizer paired with ``model`` (stored only).
        beta: Scale applied to the log-ratio margin inside the sigmoid.
    """

    def __init__(self, model, tokenizer, beta=0.1):
        self.model = model
        self.tokenizer = tokenizer
        self.beta = beta

    def dpo_loss(
        self,
        chosen_log_probs: torch.Tensor,
        rejected_log_probs: torch.Tensor,
        reference_log_probs: torch.Tensor,
        reference_rejected_log_probs: torch.Tensor
    ) -> torch.Tensor:
        """Return the mean Direct Preference Optimization loss.

        Computes ``-log(sigmoid(beta * ((chosen - ref_chosen) -
        (rejected - ref_rejected))))`` element-wise and averages it.

        Args:
            chosen_log_probs: Policy log-probabilities of the chosen responses.
            rejected_log_probs: Policy log-probabilities of the rejected
                responses.
            reference_log_probs: Reference log-probabilities subtracted from
                ``chosen_log_probs``.
            reference_rejected_log_probs: Reference log-probabilities
                subtracted from ``rejected_log_probs``.

        Returns:
            Scalar tensor with the mean loss.
        """
        chosen_ratio = chosen_log_probs - reference_log_probs
        rejected_ratio = rejected_log_probs - reference_rejected_log_probs
        logits = self.beta * (chosen_ratio - rejected_ratio)
        # logsigmoid stays finite where log(sigmoid(x)) underflows to log(0)
        # for strongly negative logits.
        return -torch.nn.functional.logsigmoid(logits).mean()
# Uses the default beta of 0.1.
# NOTE(review): `model` is whatever is bound at module level when this runs;
# presumably it should be the policy language model - confirm.
dpo_trainer = DPOTrainer(model=model, tokenizer=tokenizer)
Best Practices
- Use large, diverse human preference datasets
- Balance helpfulness with safety constraints
- Guard against reward hacking (e.g., penalize KL divergence from the reference policy)
- Monitor for distributional shift during training
- Use curriculum learning for complex alignment tasks
- Validate with red-teaming and adversarial testing