Pretraining and Fine-tuning
Pretraining: Building Foundation Knowledge
Pretraining is the first stage where models learn general language patterns from massive unlabeled datasets.
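Pretraining Objectives
The two most common objectives are masked language modeling (MLM), where the model predicts tokens hidden from the input using context on both sides, and causal language modeling (CLM), where it predicts each next token from the preceding tokens only. The example below attaches one output head per objective to a shared transformer backbone.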
import torch
import torch.nn as nn
class PretrainingModel(nn.Module):
    """Transformer encoder backbone with separate MLM and CLM output heads.

    Token ids are embedded, passed through a stack of
    ``nn.TransformerEncoderLayer`` blocks, and projected to vocabulary
    logits by the head that matches the requested objective.

    Args:
        vocab_size: Number of tokens in the vocabulary; sets the number of
            embedding rows and the width of the output logits.
        d_model: Width of the embeddings and transformer hidden states.
        num_heads: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        batch_first: Layout of the token ids. ``False`` (the default, same
            as PyTorch's own default) means ``(seq_len, batch)``; ``True``
            means ``(batch, seq_len)``.

    Note:
        The embeddings carry no positional information: nothing besides
        ``nn.Embedding`` is applied before the encoder stack.
    """

    def __init__(self, vocab_size, d_model, num_heads, num_layers,
                 batch_first=False):
        super().__init__()
        self.batch_first = batch_first
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model, num_heads, batch_first=batch_first
            ),
            num_layers,
        )
        self.mlm_head = nn.Linear(d_model, vocab_size)
        self.clm_head = nn.Linear(d_model, vocab_size)

    def forward(self, x, task='clm'):
        """Return vocabulary logits for every position of ``x``.

        Args:
            x: Integer token ids laid out according to ``batch_first``
                (a 1-D unbatched sequence is also accepted).
            task: ``'mlm'`` selects bidirectional attention and the MLM
                head; any other value (default ``'clm'``) selects causal
                attention and the CLM head.

        Returns:
            Logits shaped like ``x`` with a trailing ``vocab_size`` axis.
        """
        use_mlm = task == 'mlm'

        attn_mask = None
        if not use_mlm:
            # Next-token prediction must not see the tokens it is asked to
            # predict, so block attention to every later position.
            seq_dim = 1 if (self.batch_first and x.dim() > 1) else 0
            seq_len = x.size(seq_dim)
            # Boolean mask: True marks a (query, key) pair that is disallowed.
            attn_mask = torch.triu(
                torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device),
                diagonal=1,
            )

        hidden = self.transformer(self.embedding(x), mask=attn_mask)
        head = self.mlm_head if use_mlm else self.clm_head
        return head(hidden)
# Hyperparameters for the pretraining run: model shape first, then the
# optimisation schedule.
config = dict(
    d_model=768,
    num_heads=12,
    num_layers=12,
    vocab_size=50257,
    batch_size=32,
    learning_rate=1e-4,
    warmup_steps=1000,
    max_steps=100000,
)
Fine-tuning Approaches
Supervised Fine-tuning (SFT)
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer
def _causal_lm_loss(model, tokenizer, texts):
    """Tokenize ``texts`` and return the model loss with padding excluded."""
    inputs = tokenizer(texts, return_tensors='pt',
                       padding=True, truncation=True)
    labels = inputs['input_ids'].clone()
    # -100 is the label value the Hugging Face loss ignores; without it the
    # model would also be trained to predict padding tokens.
    labels[inputs['attention_mask'] == 0] = -100
    outputs = model(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        labels=labels,
    )
    return outputs.loss


def supervised_finetuning(model_name, train_data, val_data, epochs=3):
    """Fine-tune a pretrained causal LM on batches of raw text.

    Args:
        model_name: Identifier handed to ``from_pretrained`` for both the
            model and its tokenizer.
        train_data: Iterable of batches; ``batch['text']`` is passed to the
            tokenizer unchanged. It is iterated once per epoch, so it must
            be re-iterable (a list or ``DataLoader``, not a generator).
        val_data: Batches in the same format, evaluated after every epoch
            without gradient updates. Pass ``None`` to skip validation.
        epochs: Number of passes over ``train_data``.

    Returns:
        The fine-tuned model.
    """
    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        # Some causal-LM tokenizers ship without a pad token, which makes
        # padding=True fail; reusing EOS is safe because padded positions
        # are masked out of both attention and the loss.
        tokenizer.pad_token = tokenizer.eos_token
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        num_batches = 0
        for batch in train_data:
            optimizer.zero_grad()
            loss = _causal_lm_loss(model, tokenizer, batch['text'])
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        # Count batches instead of calling len() so unsized iterables work,
        # and guard the division for an empty dataset.
        print(f"Epoch {epoch+1}: Train Loss = {total_loss/max(num_batches, 1):.4f}")

        if val_data is None:
            continue
        model.eval()
        val_loss = 0.0
        val_batches = 0
        with torch.no_grad():
            for batch in val_data:
                val_loss += _causal_lm_loss(model, tokenizer, batch['text']).item()
                val_batches += 1
        if val_batches:
            print(f"Epoch {epoch+1}: Val Loss = {val_loss/val_batches:.4f}")

    return model
RLHF (Reinforcement Learning from Human Feedback)
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
def rlhf_training(base_model_name, reward_model, train_prompts):
    """Optimise a causal LM against a reward model with PPO.

    NOTE(review): written against the legacy TRL ``PPOTrainer`` interface
    (``generate`` / ``step``); confirm against the installed TRL version.

    Args:
        base_model_name: Identifier handed to ``from_pretrained`` for the
            policy model and its tokenizer.
        reward_model: Callable invoked as ``reward_model(query, response)``
            once per sample, where both arguments are 1-D token-id tensors.
            It is assumed to return a scalar tensor, as ``PPOTrainer.step``
            expects -- TODO confirm against the reward model in use.
        train_prompts: Iterable of prompt strings.

    Returns:
        The PPO-trained model (with value head).

    Raises:
        ValueError: If there are fewer prompts than one PPO batch.
    """
    # Local import keeps this snippet runnable on its own.
    from transformers import AutoTokenizer

    config = PPOConfig(
        model_name=base_model_name,
        learning_rate=1.41e-5,
        batch_size=64,
        mini_batch_size=16,
        ppo_epochs=4,
        kl_penalty="kl",
        init_kl_coef=0.2,
    )

    prompts = list(train_prompts)
    if len(prompts) < config.batch_size:
        raise ValueError(
            f"need at least {config.batch_size} prompts for one PPO batch, "
            f"got {len(prompts)}"
        )

    model = AutoModelForCausalLMWithValueHead.from_pretrained(base_model_name)
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    if tokenizer.pad_token is None:
        # Batched generation pads the queries, so a pad token is required.
        tokenizer.pad_token = tokenizer.eos_token
    # The trainer needs the tokenizer explicitly; it does not load one from
    # the model name on its own.
    trainer = PPOTrainer(config=config, model=model, tokenizer=tokenizer)

    # Each step must receive exactly config.batch_size samples, so prompts
    # are consumed in full batches and an incomplete trailing batch is
    # skipped.
    last_start = len(prompts) - config.batch_size
    for start in range(0, last_start + 1, config.batch_size):
        batch_prompts = prompts[start:start + config.batch_size]
        # encode(..., return_tensors="pt") yields shape (1, seq_len); the
        # trainer works with lists of 1-D tensors, hence the squeeze.
        query_tensors = [
            tokenizer.encode(prompt, return_tensors="pt").squeeze(0)
            for prompt in batch_prompts
        ]
        response_tensors = trainer.generate(query_tensors, return_prompt=False)
        rewards = [
            reward_model(query, response)
            for query, response in zip(query_tensors, response_tensors)
        ]
        trainer.step(query_tensors, response_tensors, rewards)

    return model
Comparison Table
| Approach | Parameters Updated | Compute Cost | Performance |
|---|---|---|---|
| Full FT | 100% | High | Best |
| LoRA | 0.1-1% | Low | Good |
| QLoRA | 0.1-1% | Very Low | Good |
| Prefix | 0.01% | Very Low | Moderate |
| Adapter | 1-5% | Low | Good |
Best Practices
- Data Quality: Curate high-quality fine-tuning datasets
- Learning Rate: Use a lower learning rate for fine-tuning than for pretraining (typically 1e-5 to 5e-5)
- Evaluation: Monitor validation loss during training to detect overfitting early
- Regularization: Apply dropout and weight decay appropriately
Summary
Pretraining builds general knowledge; fine-tuning specializes for specific tasks. Choose the right approach based on your resources and requirements.
Next: We'll explore prompt engineering techniques.