Language Model Fundamentals
What is a Language Model?
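A language model estimates the probability distribution over sequences of tokens. An autoregressive (causal) model learns to predict the next token given the previous tokens, while a masked model learns to predict hidden tokens from their surrounding context; both capture statistical patterns in language.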
Mathematical Foundation
The chain rule of probability decomposes sequence probability:
$$P(w_1, w_2, \ldots, w_n) = P(w_1)\, P(w_2 \mid w_1)\, P(w_3 \mid w_1, w_2) \cdots P(w_n \mid w_1, \ldots, w_{n-1}) = \prod_{i=1}^{n} P(w_i \mid w_1, \ldots, w_{i-1})$$
Training Objectives
Causal Language Modeling (CLM)
import torch
import torch.nn as nn
class CausalLM(nn.Module):
    """Next-token language model: embedding -> causally masked Transformer -> vocab logits.

    Inputs and outputs are batch-first, matching how the rest of this file
    slices tensors (``batch[:, :-1]``, ``logits[:, -1, :]``).
    """

    def __init__(self, vocab_size, d_model, num_heads, num_layers):
        """Build the model.

        Args:
            vocab_size: Number of distinct token ids (embedding rows and
                output logits).
            d_model: Width of the token embeddings and Transformer layers.
            num_heads: Attention heads per Transformer layer.
            num_layers: Number of stacked Transformer layers.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        # batch_first=True is required: forward() takes the sequence length
        # from dim 1. With the PyTorch default (batch_first=False) a
        # (batch, seq) input would be attended along the batch axis and the
        # (seq, seq) mask would not match.
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model, num_heads, batch_first=True),
            num_layers,
        )
        self.output = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Return next-token logits for every position.

        Args:
            x: Integer token ids of shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, seq_len, vocab_size)``; position ``t``
            only attends to positions ``<= t``.
        """
        seq_len = x.size(1)
        # True above the diagonal marks "not allowed to attend" (future
        # positions). Built on the input's device so GPU inputs work.
        mask = torch.triu(
            torch.ones(seq_len, seq_len, device=x.device), diagonal=1
        ).bool()

        hidden = self.embedding(x)
        hidden = self.transformer(hidden, mask=mask)
        return self.output(hidden)
# Training loop
def train_clm(model, dataloader, optimizer, epochs=10):
    """Train a causal language model with next-token cross-entropy.

    Args:
        model: Module mapping token ids ``(batch, seq_len)`` to logits
            ``(batch, seq_len, vocab_size)``.
        dataloader: Iterable yielding 2-D integer token tensors
            ``(batch, seq_len + 1)``. It is iterated once per epoch and does
            not need to define ``__len__``.
        optimizer: Optimizer over the model's parameters.
        epochs: Number of passes over ``dataloader``.

    Prints the mean batch loss after each epoch (``nan`` if the epoch
    produced no batches). Returns ``None``.
    """
    criterion = nn.CrossEntropyLoss()
    # The decoding helpers in this file switch the model to eval mode; make
    # sure dropout etc. are active again before training.
    model.train()

    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0

        for batch in dataloader:
            # Shift by one: position t is trained to predict token t + 1.
            inputs = batch[:, :-1]
            targets = batch[:, 1:]

            logits = model(inputs)
            loss = criterion(
                logits.reshape(-1, logits.size(-1)), targets.reshape(-1)
            )

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # Count batches ourselves instead of calling len(dataloader): works
        # for iterable-style loaders and avoids dividing by zero when empty.
        mean_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}: Loss = {mean_loss:.4f}")
Masked Language Modeling (MLM)
class MaskedLM(nn.Module):
    """Bidirectional Transformer for masked language modeling.

    Inputs and outputs are batch-first: token ids are ``(batch, seq_len)``
    and logits are ``(batch, seq_len, vocab_size)``.
    """

    def __init__(self, vocab_size, d_model, num_heads, num_layers,
                 mask_prob=0.15, mask_token_id=None):
        """Build the model.

        Args:
            vocab_size: Number of distinct token ids.
            d_model: Width of the token embeddings and Transformer layers.
            num_heads: Attention heads per Transformer layer.
            num_layers: Number of stacked Transformer layers.
            mask_prob: Probability with which each position is masked by
                ``create_masked_input``.
            mask_token_id: Id of the mask token inside the vocabulary. May be
                left as ``None`` and assigned to ``self.mask_token_id`` later,
                but it must be set before ``create_masked_input`` is called.

        Raises:
            ValueError: If ``mask_token_id`` is given but is not a valid row
                of the embedding table.
        """
        super().__init__()
        if mask_token_id is not None and not 0 <= mask_token_id < vocab_size:
            raise ValueError(
                f"mask_token_id must be in [0, {vocab_size}), got {mask_token_id}"
            )
        self.mask_prob = mask_prob
        # Previously never defined, so create_masked_input() always failed
        # with AttributeError.
        self.mask_token_id = mask_token_id
        self.embedding = nn.Embedding(vocab_size, d_model)
        # batch_first=True so attention runs over the sequence axis of a
        # (batch, seq_len) input instead of across the batch.
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model, num_heads, batch_first=True),
            num_layers,
        )
        self.output = nn.Linear(d_model, vocab_size)

    def create_masked_input(self, x):
        """Replace a random subset of tokens with the mask token.

        Args:
            x: Integer token ids; not modified.

        Returns:
            ``(masked_x, mask)``: a copy of ``x`` with masked positions set to
            ``self.mask_token_id``, and the boolean tensor (same shape as
            ``x``) marking which positions were masked.

        Raises:
            ValueError: If no mask token id has been configured.
        """
        if self.mask_token_id is None:
            raise ValueError(
                "mask_token_id is not set; pass it to MaskedLM(...) or assign "
                "model.mask_token_id before calling create_masked_input"
            )
        # Draw on x's device so the boolean index matches GPU inputs.
        mask = torch.rand(x.shape, device=x.device) < self.mask_prob
        masked_x = x.clone()
        masked_x[mask] = self.mask_token_id
        return masked_x, mask

    def forward(self, x, mask=None):
        """Return vocabulary logits for every position.

        Args:
            x: Integer token ids of shape ``(batch, seq_len)``.
            mask: Accepted for interface compatibility; not used by the
                forward pass.

        Returns:
            Tensor of shape ``(batch, seq_len, vocab_size)``.
        """
        hidden = self.embedding(x)
        hidden = self.transformer(hidden)
        return self.output(hidden)
Decoding Strategies
Decoding Implementation
import torch
import torch.nn.functional as F
def greedy_decode(model, input_ids, max_length=100):
    """Extend ``input_ids`` by repeatedly appending the most likely next token.

    Args:
        model: Module mapping token ids ``(batch, seq_len)`` to logits
            ``(batch, seq_len, vocab_size)``. It is switched to eval mode.
        input_ids: Prompt token ids of shape ``(batch, prompt_len)``.
        max_length: Number of tokens to generate (not the total length).

    Returns:
        Tensor of shape ``(batch, prompt_len + max_length)``.
    """
    model.eval()
    # Inference only: without no_grad every step would build an autograd
    # graph, wasting memory that grows with the generated length.
    with torch.no_grad():
        for _ in range(max_length):
            logits = model(input_ids)
            # Only the last position predicts the token that comes next.
            next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)
            input_ids = torch.cat([input_ids, next_token], dim=1)
    return input_ids
def top_k_decode(model, input_ids, k=50, max_length=100, temperature=1.0):
    """Generate tokens by sampling from the ``k`` highest-scoring candidates.

    Args:
        model: Module mapping token ids ``(batch, seq_len)`` to logits
            ``(batch, seq_len, vocab_size)``. It is switched to eval mode.
        input_ids: Prompt token ids of shape ``(batch, prompt_len)``.
        k: Number of candidates kept per step; values larger than the
            vocabulary keep every token. Tokens tied with the k-th score are
            kept as well.
        max_length: Number of tokens to generate (not the total length).
        temperature: Divides the logits before sampling. ``0`` selects the
            argmax deterministically instead of dividing by zero.

    Returns:
        Tensor of shape ``(batch, prompt_len + max_length)``.

    Raises:
        ValueError: If ``k < 1`` or ``temperature < 0``.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")
    if temperature < 0:
        raise ValueError(f"temperature must be non-negative, got {temperature}")

    model.eval()
    with torch.no_grad():
        for _ in range(max_length):
            last_logits = model(input_ids)[:, -1, :]

            if temperature == 0:
                next_token = last_logits.argmax(dim=-1, keepdim=True)
            else:
                scaled = last_logits / temperature
                top_values, _ = torch.topk(scaled, min(k, scaled.size(-1)))
                # Keep a trailing dim so each row is compared against its own
                # k-th value; a (batch,) threshold would broadcast along the
                # vocabulary axis and break for batch sizes above one.
                threshold = top_values[:, -1, None]
                scaled = scaled.masked_fill(scaled < threshold, float("-inf"))
                probs = F.softmax(scaled, dim=-1)
                next_token = torch.multinomial(probs, 1)

            input_ids = torch.cat([input_ids, next_token], dim=1)
    return input_ids
def nucleus_decode(model, input_ids, p=0.9, max_length=100, temperature=1.0):
    """Generate tokens with nucleus (top-p) sampling.

    At each step the smallest set of highest-probability tokens whose
    cumulative probability exceeds ``p`` is kept, and the next token is
    sampled from that set. The most likely token is always kept.

    Args:
        model: Module mapping token ids ``(batch, seq_len)`` to logits
            ``(batch, seq_len, vocab_size)``. It is switched to eval mode.
        input_ids: Prompt token ids of shape ``(batch, prompt_len)``.
        p: Cumulative probability threshold.
        max_length: Number of tokens to generate (not the total length).
        temperature: Divides the logits before sampling. ``0`` selects the
            argmax deterministically instead of dividing by zero.

    Returns:
        Tensor of shape ``(batch, prompt_len + max_length)``.

    Raises:
        ValueError: If ``temperature < 0``.
    """
    if temperature < 0:
        raise ValueError(f"temperature must be non-negative, got {temperature}")

    model.eval()
    with torch.no_grad():
        for _ in range(max_length):
            last_logits = model(input_ids)[:, -1, :]

            if temperature == 0:
                next_token = last_logits.argmax(dim=-1, keepdim=True)
            else:
                scaled = last_logits / temperature
                sorted_logits, sorted_indices = torch.sort(scaled, descending=True)
                cumulative_probs = torch.cumsum(
                    F.softmax(sorted_logits, dim=-1), dim=-1
                )

                # Mark everything past the threshold, then shift right by one
                # so the token that crosses the threshold is itself kept.
                remove_sorted = cumulative_probs > p
                remove_sorted[:, 1:] = remove_sorted[:, :-1].clone()
                remove_sorted[:, 0] = False

                # Map the flags from sorted order back to vocabulary order one
                # row at a time. Flattening the indices across the batch would
                # drop one row's unlikely tokens from every other row too.
                remove = torch.zeros_like(remove_sorted).scatter(
                    1, sorted_indices, remove_sorted
                )
                scaled = scaled.masked_fill(remove, float("-inf"))

                probs = F.softmax(scaled, dim=-1)
                next_token = torch.multinomial(probs, 1)

            input_ids = torch.cat([input_ids, next_token], dim=1)
    return input_ids
Temperature and Sampling
| Temperature | Effect | Use Case |
|---|---|---|
| 0.0 | Deterministic; equivalent to greedy decoding (take the argmax, since logits cannot be divided by zero) | Factual tasks |
| 0.7 | Balanced | General use |
| 1.0 | Standard sampling | Creative tasks |
| 1.5 | High diversity | Brainstorming |
Summary
Language models learn to predict tokens, and decoding strategies determine how outputs are generated. Understanding these fundamentals is essential for effective generative AI development.
Next: We'll explore pretraining and fine-tuning approaches.