Self-Supervised Learning
Masked Language Modeling (BERT-style)
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
class MLMPretrainer:
    """Build masked-language-modeling (MLM) batches for a pretrained checkpoint.

    Loads the model and tokenizer named by ``model_name`` and converts raw
    text into ``input_ids`` / ``attention_mask`` / ``labels`` dictionaries in
    which a random subset of tokens has been replaced by the mask token.
    """

    def __init__(self, model_name, mask_prob=0.15):
        """Load the model and tokenizer for ``model_name``.

        Args:
            model_name: Identifier handed to ``from_pretrained``.
            mask_prob: Probability with which each eligible token is masked.
        """
        self.model = AutoModel.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.mask_prob = mask_prob

    def mask_tokens(self, input_ids):
        """Randomly replace tokens with the mask token and build MLM labels.

        Every position is selected independently with probability
        ``self.mask_prob``. Positions holding one of the tokenizer's special
        ids (``tokenizer.all_special_ids``, when the tokenizer exposes it) are
        never selected, so markers such as padding are left intact.

        Args:
            input_ids: Integer tensor of token ids. It is NOT modified.

        Returns:
            A ``(masked_input_ids, labels)`` tuple of new tensors shaped like
            ``input_ids``. ``labels`` holds the original id at every masked
            position and ``-100`` everywhere else.
        """
        # Work on copies so the caller's tensor keeps the original ids.
        masked_inputs = input_ids.clone()
        labels = input_ids.clone()

        # Build the sampling probabilities on the same device as the ids so
        # the boolean mask below can index them directly.
        probability_matrix = torch.full(
            input_ids.shape, float(self.mask_prob), device=input_ids.device
        )

        # Never mask special tokens: give those positions probability 0.
        special_positions = torch.zeros_like(input_ids, dtype=torch.bool)
        for special_id in getattr(self.tokenizer, "all_special_ids", None) or ():
            special_positions |= input_ids == special_id
        probability_matrix.masked_fill_(special_positions, 0.0)

        masked_indices = torch.bernoulli(probability_matrix).bool()

        # -100 matches the default ``ignore_index`` of PyTorch's cross-entropy
        # loss, so unmasked positions do not contribute to the loss.
        labels[~masked_indices] = -100
        masked_inputs[masked_indices] = self.tokenizer.mask_token_id
        return masked_inputs, labels

    def create_mlm_batch(self, texts):
        """Tokenize ``texts`` and return a masked batch.

        Args:
            texts: Text input passed straight to the tokenizer.

        Returns:
            A dict with ``"input_ids"`` (masked ids), ``"attention_mask"``
            (as produced by the tokenizer) and ``"labels"``.
        """
        encodings = self.tokenizer(
            texts, truncation=True, padding=True, return_tensors="pt"
        )
        input_ids, labels = self.mask_tokens(encodings["input_ids"])
        return {
            "input_ids": input_ids,
            "attention_mask": encodings["attention_mask"],
            "labels": labels,
        }
class MLMHead(nn.Module):
    """Prediction head that turns hidden states into vocabulary logits.

    The head applies, in order: a width-preserving linear layer, GELU,
    layer normalization, and a final linear projection to ``vocab_size``.
    """

    def __init__(self, hidden_size, vocab_size):
        """Create the head's layers.

        Args:
            hidden_size: Size of the last dimension of the incoming states.
            vocab_size: Number of output logits per position.
        """
        super().__init__()
        # Transform stage: keeps the feature width at ``hidden_size``.
        self.dense = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.activation = nn.GELU()
        self.layer_norm = nn.LayerNorm(normalized_shape=hidden_size)
        # Output stage: one logit per vocabulary entry.
        self.decoder = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, hidden_states):
        """Return vocabulary logits for ``hidden_states``."""
        transformed = self.layer_norm(self.activation(self.dense(hidden_states)))
        return self.decoder(transformed)
# Example usage: build one masked batch from two short sentences.
# Constructing MLMPretrainer calls from_pretrained() for "bert-base-uncased".
pretrainer = MLMPretrainer("bert-base-uncased")
# `batch` is a dict with the keys "input_ids", "attention_mask" and "labels".
batch = pretrainer.create_mlm_batch(["Hello world", "MLM pretraining"])
SimCLR Contrastive Learning
import torch
import torch.nn as nn
import torchvision.transforms as T
class SimCLR(nn.Module):
    """SimCLR-style contrastive learner: encoder, projection head, NT-Xent loss.

    Subclasses ``nn.Module`` so the projection head (and the encoder, when it
    is itself a module) is registered and reachable through ``parameters()``,
    ``to()`` and ``state_dict()``.
    """

    def __init__(self, encoder, projection_dim=128, temperature=0.5):
        """Set up the projection head and the augmentation pipeline.

        Args:
            encoder: Callable feature extractor. It must expose an
                ``output_dim`` attribute giving its feature width.
            projection_dim: Width of the projected embeddings.
            temperature: Divisor applied to similarities in the loss.
        """
        super().__init__()
        self.encoder = encoder
        self.projector = nn.Sequential(
            nn.Linear(encoder.output_dim, 256),
            nn.ReLU(),
            nn.Linear(256, projection_dim),
        )
        self.temperature = temperature
        self.augment = T.Compose([
            T.RandomResizedCrop(224),
            T.RandomHorizontalFlip(),
            T.ColorJitter(0.8, 0.8, 0.8, 0.2),
            T.RandomGrayscale(p=0.2),
            T.ToTensor(),
        ])

    def nt_xent_loss(self, z_i, z_j):
        """Compute the NT-Xent loss for two batches of paired projections.

        Row ``k`` of ``z_i`` and row ``k`` of ``z_j`` form a positive pair.
        For every one of the ``2N`` embeddings, all other ``2N - 2``
        embeddings (from both views) act as negatives; only the similarity of
        an embedding with itself is excluded.

        Args:
            z_i: ``(N, D)`` projections of the first view.
            z_j: ``(N, D)`` projections of the second view.

        Returns:
            Scalar tensor: the cross-entropy averaged over all ``2N`` anchors.
        """
        batch_size = z_i.shape[0]

        # Stack both views so each anchor sees negatives from both of them.
        z = nn.functional.normalize(torch.cat([z_i, z_j], dim=0), dim=1)
        sim_matrix = torch.mm(z, z.t()) / self.temperature

        # An embedding must not be its own candidate: drop the diagonal.
        self_mask = torch.eye(2 * batch_size, dtype=torch.bool, device=z.device)
        sim_matrix = sim_matrix.masked_fill(self_mask, float("-inf"))

        # The positive for row k is row k + N, and vice versa.
        index = torch.arange(batch_size, device=z.device)
        targets = torch.cat([index + batch_size, index])
        return nn.functional.cross_entropy(sim_matrix, targets)

    def train_step(self, batch):
        """Augment ``batch`` twice, encode both views and return the loss.

        Args:
            batch: Iterable of images. Each item is passed through
                ``self.augment``, which ends in ``T.ToTensor()`` — assumes
                items are PIL images / arrays that transform accepts; TODO
                confirm against the data loader.

        Returns:
            The NT-Xent loss between the two augmented views.
        """
        # Two independent random augmentations of the same images.
        x_i = torch.stack([self.augment(x) for x in batch])
        x_j = torch.stack([self.augment(x) for x in batch])
        h_i = self.encoder(x_i)
        h_j = self.encoder(x_j)
        z_i = self.projector(h_i)
        z_j = self.projector(h_j)
        loss = self.nt_xent_loss(z_i, z_j)
        return loss
# Example usage: one contrastive training step.
# NOTE(review): `ResNetEncoder` and `images` are not defined in the visible
# source — presumably supplied by the surrounding project; confirm.
simclr = SimCLR(encoder=ResNetEncoder())
loss = simclr.train_step(images)
Text Contrastive Learning
class TextContrastiveLearner(nn.Module):
    """Contrastive learner over sentence embeddings from a text encoder.

    Subclasses ``nn.Module`` so the projection layer (and the encoder, when
    it is itself a module) is registered and reachable through
    ``parameters()``, ``to()`` and ``state_dict()``.
    """

    def __init__(self, model, temperature=0.07, tokenizer=None):
        """Store the encoder and create the projection layer.

        Args:
            model: Text encoder. It must expose ``config.hidden_size`` and,
                when called, return an object with ``last_hidden_state``.
            temperature: Divisor applied to similarities in the loss.
            tokenizer: Callable that turns a list of texts into the keyword
                arguments for ``model``. Required by ``get_embeddings``; it
                may also be assigned later via ``self.tokenizer``.
        """
        super().__init__()
        self.model = model
        self.tokenizer = tokenizer
        self.temperature = temperature
        self.projector = nn.Linear(model.config.hidden_size, 256)

    def get_embeddings(self, texts):
        """Encode ``texts`` into L2-normalized projected embeddings.

        Args:
            texts: Texts to tokenize and encode.

        Returns:
            Tensor with one unit-length, 256-dimensional row per text.

        Raises:
            ValueError: If no tokenizer has been provided.
        """
        if self.tokenizer is None:
            raise ValueError(
                "TextContrastiveLearner needs a tokenizer: pass tokenizer=... "
                "to the constructor or assign learner.tokenizer first."
            )
        encodings = self.tokenizer(
            texts, padding=True, truncation=True, return_tensors="pt"
        )
        # The encoder runs without gradient tracking, so only the projector
        # receives gradients from a loss computed on the returned embeddings.
        with torch.no_grad():
            outputs = self.model(**encodings)
        # Use the hidden state at the first sequence position as the summary.
        embeddings = outputs.last_hidden_state[:, 0, :]
        projected = self.projector(embeddings)
        return nn.functional.normalize(projected, dim=1)

    def contrastive_loss(self, embeddings_a, embeddings_b):
        """Symmetric cross-entropy loss over pairwise similarities.

        Row ``k`` of ``embeddings_a`` is treated as the match for row ``k``
        of ``embeddings_b``; every other row is a negative.

        Args:
            embeddings_a: ``(N, D)`` embeddings of the first set of texts.
            embeddings_b: ``(N, D)`` embeddings of the paired texts.

        Returns:
            Scalar tensor: mean of the a-to-b and b-to-a cross-entropies.
        """
        similarities = torch.mm(embeddings_a, embeddings_b.t()) / self.temperature
        labels = torch.arange(len(embeddings_a), device=similarities.device)
        loss_a2b = nn.functional.cross_entropy(similarities, labels)
        loss_b2a = nn.functional.cross_entropy(similarities.t(), labels)
        return (loss_a2b + loss_b2a) / 2
# Example usage: embed two aligned lists of paraphrases and score them.
# NOTE(review): `model`, `paraphrases_a` and `paraphrases_b` are not defined
# in the visible source — presumably supplied by the caller; confirm.
learner = TextContrastiveLearner(model)
embeddings_a = learner.get_embeddings(paraphrases_a)
embeddings_b = learner.get_embeddings(paraphrases_b)
# Row k of embeddings_a is treated as the positive for row k of embeddings_b.
loss = learner.contrastive_loss(embeddings_a, embeddings_b)
Best Practices
- Use strong augmentations for contrastive learning
- Large batch sizes improve contrastive performance
- Stop gradients through the target encoder in BYOL/DINO
- Use a momentum encoder for stable training
- Use the linear evaluation protocol for benchmarking
- Combine multiple pretext tasks for robustness