🎉 75% of content is free forever — Unlock Premium from $10/mo →
CW
Search courses…
💼 Services · ℹ️ About · ✉️ Contact · View Pricing Plans from $10

Reinforcement Learning with AI

🟢 Free Lesson

Advertisement

Reinforcement Learning with AI

RLHF Deep Dive: Pre-trained LLM · SFT Fine-tuning · Reward Model · PPO Optimization
RLHF Training Loop: Generate → Rank → Reward → Optimize → Repeat
PPO Algorithm
  • Proximal Policy Optimization
  • Clipped Objective Function
Alignment Goals
  • Helpfulness and Harmlessness
  • Truthfulness and Safety

RLHF Pipeline

Reinforcement Learning from Human Feedback aligns language models with human preferences through supervised fine-tuning, reward modeling, and policy optimization.

Reward Model Training

import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer

class RewardModel(nn.Module):
    """Pretrained transformer backbone with a linear head producing one scalar reward.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
        pooling: Which token's hidden state feeds the reward head.
            ``"last"`` (default) uses the last attended token of each
            sequence, which is the only position that has seen the whole
            input in a causal model such as GPT-2. ``"first"`` uses position
            0, the usual choice for encoder models with a leading [CLS] token.

    Raises:
        ValueError: If ``pooling`` is neither ``"first"`` nor ``"last"``.
    """

    def __init__(self, model_name: str, pooling: str = "last"):
        super().__init__()
        if pooling not in ("first", "last"):
            raise ValueError(f"pooling must be 'first' or 'last', got {pooling!r}")
        self.pooling = pooling
        self.transformer = AutoModel.from_pretrained(model_name)
        self.reward_head = nn.Linear(self.transformer.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask=None):
        """Return one reward per sequence, shape ``(batch,)``.

        Args:
            input_ids: Token ids, indexed as ``(batch, seq)``.
            attention_mask: Optional mask with 1 for real tokens and 0 for
                padding. When omitted, every position is treated as real.
        """
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)

        # Accept a model-output object, a plain tuple (return_dict=False),
        # or a bare hidden-state tensor.
        if hasattr(outputs, "last_hidden_state"):
            hidden_states = outputs.last_hidden_state
        elif isinstance(outputs, (tuple, list)):
            hidden_states = outputs[0]
        else:
            hidden_states = outputs

        if self.pooling == "first":
            pooled = hidden_states[:, 0, :]
        else:
            batch_size, seq_len = hidden_states.shape[:2]
            device = hidden_states.device
            if attention_mask is None:
                last_index = torch.full(
                    (batch_size,), seq_len - 1, dtype=torch.long, device=device
                )
            else:
                # mask * position is largest at the last attended token, so
                # this works for both left- and right-padded batches.
                positions = torch.arange(seq_len, device=device)
                last_index = (attention_mask.to(device).long() * positions).argmax(dim=1)
            rows = torch.arange(batch_size, device=device)
            pooled = hidden_states[rows, last_index]

        return self.reward_head(pooled).squeeze(-1)

class RewardModelTrainer:
    """Trains a reward model on (chosen, rejected) text pairs with a pairwise loss.

    Args:
        model: Callable module that accepts the tokenizer's output as keyword
            arguments and returns one reward per sequence.
        tokenizer: Callable that turns a string into model inputs.
        lr: Learning rate for the AdamW optimizer over ``model.parameters()``.
    """

    def __init__(self, model, tokenizer, lr=1e-5):
        self.model = model
        self.tokenizer = tokenizer
        self.optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    def train_step(self, chosen: str, rejected: str):
        """Run one optimizer step on a single preference pair.

        The loss is ``-log(sigmoid(r_chosen - r_rejected))``, which pushes the
        chosen reward above the rejected one.

        Returns:
            Tuple ``(loss, chosen_reward, rejected_reward)`` as Python floats.
        """
        # Each call encodes one sequence, so no padding is requested: it would
        # be a no-op, and tokenizers without a pad token (e.g. GPT-2's)
        # raise when asked to pad.
        chosen_enc = self.tokenizer(chosen, return_tensors="pt")
        rejected_enc = self.tokenizer(rejected, return_tensors="pt")

        chosen_reward = self.model(**chosen_enc)
        rejected_reward = self.model(**rejected_enc)

        # logsigmoid stays finite where log(sigmoid(x)) underflows to -inf
        # for strongly negative margins.
        margin = chosen_reward - rejected_reward
        loss = -torch.nn.functional.logsigmoid(margin).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item(), chosen_reward.item(), rejected_reward.item()

# GPT-2 backbone for the reward model, with its matching tokenizer.
model = RewardModel("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# GPT-2's tokenizer ships without a padding token, so any call with padding
# enabled raises; reuse the end-of-sequence token as the pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
trainer = RewardModelTrainer(model, tokenizer)

PPO Implementation

import torch
import torch.nn as nn
from typing import Tuple

class PPOAgent:
    """Clipped-objective PPO policy update with GAE advantages.

    Args:
        policy_model: Object exposing ``get_log_probs(states, actions)``; its
            trainable parameters are optimized by ``update``.
        value_model: Callable mapping states to value estimates, used only as
            a baseline for the advantages. May be ``None``, in which case a
            zero baseline is used. It is not trained by this class.
        clip_epsilon: Half-width of the probability-ratio clipping range.
        optimizer: Optimizer stepped by ``update``. When omitted, an AdamW
            optimizer over the policy's trainable parameters is created.
        lr: Learning rate for the automatically created optimizer.
    """

    def __init__(self, policy_model, value_model, clip_epsilon=0.2, optimizer=None, lr=1e-5):
        self.policy = policy_model
        self.value = value_model
        self.clip_epsilon = clip_epsilon

        # update() steps self.optimizer, so build one when the caller did not
        # supply it and the policy actually has something to train.
        if optimizer is None:
            parameters = getattr(policy_model, "parameters", None)
            trainable = (
                [p for p in parameters() if p.requires_grad]
                if callable(parameters)
                else []
            )
            if trainable:
                optimizer = torch.optim.AdamW(trainable, lr=lr)
        self.optimizer = optimizer

    def compute_advantages(
        self,
        rewards: torch.Tensor,
        values: torch.Tensor,
        gamma: float = 0.99,
        lam: float = 0.95
    ) -> torch.Tensor:
        """Compute Generalized Advantage Estimation over one trajectory.

        Args:
            rewards: Per-step rewards, indexed by time step.
            values: Per-step value estimates aligned with ``rewards``.
            gamma: Discount factor.
            lam: GAE smoothing factor.

        Returns:
            Tensor shaped like ``rewards`` holding the advantage of each step.
            The value after the final step is taken to be 0.
        """
        advantages = torch.zeros_like(rewards)
        num_steps = len(rewards)
        last_gae = 0.0

        # Walk backwards so each step can reuse the accumulated estimate of
        # the step after it.
        for t in reversed(range(num_steps)):
            next_value = values[t + 1] if t + 1 < num_steps else 0.0
            delta = rewards[t] + gamma * next_value - values[t]
            last_gae = delta + gamma * lam * last_gae
            advantages[t] = last_gae

        return advantages

    def ppo_loss(
        self,
        old_log_probs: torch.Tensor,
        new_log_probs: torch.Tensor,
        advantages: torch.Tensor
    ) -> torch.Tensor:
        """Return the negated clipped surrogate objective, averaged over steps.

        Args:
            old_log_probs: Log-probabilities of the taken actions under the
                policy that collected the data.
            new_log_probs: Log-probabilities of the same actions under the
                current policy.
            advantages: Advantage estimate for each step.
        """
        ratio = torch.exp(new_log_probs - old_log_probs)
        clipped_ratio = torch.clamp(
            ratio,
            1 - self.clip_epsilon,
            1 + self.clip_epsilon
        )

        unclipped = ratio * advantages
        clipped = clipped_ratio * advantages

        # Taking the elementwise minimum keeps the pessimistic bound.
        return -torch.min(unclipped, clipped).mean()

    def update(self, batch, epochs=4):
        """Run several optimization epochs on one batch and return the mean loss.

        Args:
            batch: Mapping with keys ``"states"``, ``"actions"``,
                ``"rewards"`` and ``"old_log_probs"``.
            epochs: Number of gradient steps taken on the batch (>= 1).

        Returns:
            Average policy loss over the epochs, as a Python float.

        Raises:
            ValueError: If ``epochs`` is less than 1.
            RuntimeError: If no optimizer is available.
        """
        if epochs < 1:
            raise ValueError(f"epochs must be >= 1, got {epochs}")
        if self.optimizer is None:
            raise RuntimeError(
                "PPOAgent has no optimizer: pass optimizer=... or use a policy "
                "model with trainable parameters"
            )

        # Advantages are fixed targets for the whole update: compute them once
        # and without gradients, so the policy loss cannot back-propagate into
        # the value model.
        with torch.no_grad():
            if self.value is None:
                values = torch.zeros_like(batch["rewards"])
            else:
                values = self.value(batch["states"])
            advantages = self.compute_advantages(batch["rewards"], values)

        total_loss = 0.0
        for _ in range(epochs):
            new_log_probs = self.policy.get_log_probs(batch["states"], batch["actions"])
            loss = self.ppo_loss(batch["old_log_probs"], new_log_probs, advantages)

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            total_loss += loss.item()

        return total_loss / epochs

RLHF Training Loop

class RLHFTrainer:
    """Generates responses with a policy model and scores them with a reward model.

    Args:
        policy_model: Model exposing ``generate(**inputs, ...)``.
        reward_model: Callable returning a single-element reward for one
            tokenized text.
        tokenizer: Callable tokenizer that also provides ``decode``.
    """

    def __init__(self, policy_model, reward_model, tokenizer):
        self.policy = policy_model
        self.reward_model = reward_model
        self.tokenizer = tokenizer
        # The PPO agent is created without a value model.
        self.ppo_agent = PPOAgent(policy_model, None)

    def generate_response(self, prompt: str) -> str:
        """Sample up to 256 new tokens for ``prompt`` and return the decoded text.

        The first generated sequence is decoded as returned by ``generate``,
        with special tokens stripped.
        """
        inputs = self.tokenizer(prompt, return_tensors="pt")

        with torch.no_grad():
            outputs = self.policy.generate(
                **inputs,
                max_new_tokens=256,
                temperature=0.7,
                do_sample=True
            )

        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    def train_step(self, prompts: list):
        """Generate one response per prompt, score each, and print the mean reward.

        NOTE: this step only collects responses and rewards; it does not call
        ``self.ppo_agent.update``, so the policy is not modified here.

        Args:
            prompts: Prompt strings. An empty list yields ``([], [])`` and
                prints nothing.

        Returns:
            Tuple ``(responses, rewards)`` of equal-length lists.
        """
        if not prompts:
            # Nothing to score; also avoids dividing by zero for the average.
            return [], []

        responses = [self.generate_response(p) for p in prompts]

        with torch.no_grad():
            # Each response is encoded on its own, so padding is not needed
            # (and tokenizers without a pad token raise when asked to pad).
            rewards = [
                self.reward_model(
                    **self.tokenizer(r, return_tensors="pt")
                ).item()
                for r in responses
            ]

        print(f"Avg Reward: {sum(rewards)/len(rewards):.3f}")

        return responses, rewards

# Build the RLHF trainer and run ten generate-and-score passes over the prompts.
# NOTE(review): policy_model, reward_model and training_prompts are not defined
# in the visible snippets — they must be provided before this runs. This also
# rebinds the module-level name `trainer`.
trainer = RLHFTrainer(policy_model, reward_model, tokenizer)
for epoch in range(10):
    # Each pass returns the generated texts and their scalar rewards.
    responses, rewards = trainer.train_step(training_prompts)

DPO (Direct Preference Optimization)

class DPOTrainer:
    """Holds a model, tokenizer and beta, and computes the DPO preference loss.

    Args:
        model: Policy model being optimized (stored, not used by ``dpo_loss``).
        tokenizer: Tokenizer paired with the model (stored, not used by
            ``dpo_loss``).
        beta: Scale applied to the log-ratio margin before the sigmoid.
    """

    def __init__(self, model, tokenizer, beta=0.1):
        self.model = model
        self.tokenizer = tokenizer
        self.beta = beta

    def dpo_loss(
        self,
        chosen_log_probs: torch.Tensor,
        rejected_log_probs: torch.Tensor,
        reference_log_probs: torch.Tensor,
        reference_rejected_log_probs: torch.Tensor
    ) -> torch.Tensor:
        """Return the mean DPO loss over a batch of preference pairs.

        Args:
            chosen_log_probs: Policy log-probabilities of the chosen responses.
            rejected_log_probs: Policy log-probabilities of the rejected
                responses.
            reference_log_probs: Reference log-probabilities of the *chosen*
                responses.
            reference_rejected_log_probs: Reference log-probabilities of the
                rejected responses.

        Returns:
            Scalar tensor ``mean(-log sigmoid(beta * (chosen_ratio - rejected_ratio)))``.
        """
        chosen_ratio = chosen_log_probs - reference_log_probs
        rejected_ratio = rejected_log_probs - reference_rejected_log_probs
        margin = self.beta * (chosen_ratio - rejected_ratio)

        # logsigmoid stays finite where log(sigmoid(x)) underflows to -inf
        # for strongly negative margins.
        return -torch.nn.functional.logsigmoid(margin).mean()

# DPO trainer with the default beta of 0.1.
# NOTE(review): presumably `model` and `tokenizer` are the ones created in the
# reward-model snippet — confirm that the intended policy model is passed here.
dpo_trainer = DPOTrainer(model, tokenizer)

Best Practices

  • Use large, diverse human preference datasets
  • Balance helpfulness with safety constraints
  • Implement proper reward hacking prevention
  • Monitor for distributional shift during training
  • Use curriculum learning for complex alignment tasks
  • Validate with red-teaming and adversarial testing
⭐

Premium Content

Reinforcement Learning with AI

Unlock this lesson and 900+ advanced tutorials with a Premium plan.

🎯 End-to-end Projects
💼 Interview Prep
📜 Certificates
🤝 Community Access

Already a member? Log in

Need Expert Generative AI Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement