Language Model Fundamentals

What is a Language Model?

A language model estimates the probability distribution over sequences of tokens. It learns to predict the next token given previous tokens, capturing statistical patterns in language.

Mathematical Foundation

The chain rule of probability decomposes the joint probability of a sequence into a product of next-token conditional probabilities:

P(w1, w2, ..., wn) = P(w1) * P(w2|w1) * P(w3|w1,w2) * ... * P(wn|w1,...,wn-1)

Training Objectives

Causal Language Modeling (CLM)

import torch
import torch.nn as nn

class CausalLM(nn.Module):
    """Causal (left-to-right) language model built on a masked Transformer encoder stack.

    Token ids of shape ``(batch, seq_len)`` are embedded, passed through
    ``num_layers`` self-attention layers under a causal mask, and projected to
    logits of shape ``(batch, seq_len, vocab_size)``.

    NOTE: no positional encoding is added to the embeddings in this module.

    Args:
        vocab_size: Number of distinct token ids.
        d_model: Embedding / hidden width.
        num_heads: Attention heads per layer.
        num_layers: Number of stacked encoder layers.
    """

    def __init__(self, vocab_size, d_model, num_heads, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.transformer = nn.TransformerEncoder(
            # batch_first=True is required: forward() treats dim 1 as the
            # sequence axis. With the default (sequence-first) layout the
            # (seq_len, seq_len) mask would not match the attention shape.
            nn.TransformerEncoderLayer(d_model, num_heads, batch_first=True),
            num_layers,
        )
        self.output = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Return next-token logits for every position of ``x``.

        Args:
            x: Integer token ids of shape ``(batch, seq_len)``.

        Returns:
            Logits of shape ``(batch, seq_len, vocab_size)``.
        """
        seq_len = x.size(1)
        # True above the diagonal => position i may not attend to positions > i.
        # Built on x's device so the model also works after .to("cuda").
        mask = torch.triu(
            torch.ones(seq_len, seq_len, device=x.device), diagonal=1
        ).bool()

        hidden = self.embedding(x)
        hidden = self.transformer(hidden, mask=mask)
        return self.output(hidden)

# Training loop
def train_clm(model, dataloader, optimizer, epochs=10):
    """Train ``model`` with the next-token (causal LM) cross-entropy objective.

    Each batch is a tensor of token ids of shape ``(batch, seq_len)``; the
    inputs are all tokens but the last and the targets are the same sequence
    shifted left by one position.

    Args:
        model: Callable mapping ``(batch, seq)`` ids to ``(batch, seq, vocab)`` logits.
        dataloader: Iterable yielding token-id tensors.
        optimizer: Optimizer over ``model``'s parameters.
        epochs: Number of passes over ``dataloader``.

    Prints the mean batch loss after every epoch (``nan`` if the dataloader
    produced no batches).
    """
    criterion = nn.CrossEntropyLoss()

    # A model left in eval mode (e.g. after generation) would otherwise be
    # trained with dropout disabled.
    model.train()

    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        for batch in dataloader:
            inputs, targets = batch[:, :-1], batch[:, 1:]

            logits = model(inputs)
            # Flatten (batch, seq, vocab) -> (batch*seq, vocab) for CrossEntropyLoss.
            loss = criterion(
                logits.reshape(-1, logits.size(-1)), targets.reshape(-1)
            )

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # Count batches ourselves so an empty dataloader cannot divide by zero.
        mean_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}: Loss = {mean_loss:.4f}")

Masked Language Modeling (MLM)

class MaskedLM(nn.Module):
    """Bidirectional Transformer encoder for masked language modeling.

    Args:
        vocab_size: Number of distinct token ids.
        d_model: Embedding / hidden width.
        num_heads: Attention heads per layer.
        num_layers: Number of stacked encoder layers.
        mask_prob: Probability with which each token is replaced by the mask token.
        mask_token_id: Id written into masked positions. Defaults to the last
            vocabulary id (``vocab_size - 1``), which callers should then
            reserve for the mask token.

    Raises:
        ValueError: If ``mask_token_id`` is outside ``[0, vocab_size)``.
    """

    def __init__(self, vocab_size, d_model, num_heads, num_layers, mask_prob=0.15,
                 mask_token_id=None):
        super().__init__()
        if mask_token_id is None:
            mask_token_id = vocab_size - 1
        if not 0 <= mask_token_id < vocab_size:
            raise ValueError(
                f"mask_token_id must be in [0, {vocab_size}), got {mask_token_id}"
            )
        self.mask_prob = mask_prob
        # create_masked_input() reads this attribute, so it must exist.
        self.mask_token_id = mask_token_id
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.transformer = nn.TransformerEncoder(
            # batch_first=True so attention runs over the sequence axis (dim 1)
            # of (batch, seq) inputs rather than across the batch.
            nn.TransformerEncoderLayer(d_model, num_heads, batch_first=True),
            num_layers,
        )
        self.output = nn.Linear(d_model, vocab_size)

    def create_masked_input(self, x):
        """Randomly replace tokens of ``x`` with the mask token.

        Args:
            x: Integer token ids; left unmodified.

        Returns:
            ``(masked_x, mask)``: a copy of ``x`` with masked positions set to
            ``self.mask_token_id``, and the boolean mask (same shape as ``x``)
            marking those positions.
        """
        # Sample on x's device so indexing works for GPU inputs as well.
        mask = torch.rand(x.shape, device=x.device) < self.mask_prob
        masked_x = x.masked_fill(mask, self.mask_token_id)
        return masked_x, mask

    def forward(self, x, mask=None):
        """Return per-position vocabulary logits for ``x``.

        Args:
            x: Integer token ids of shape ``(batch, seq_len)``.
            mask: Accepted for interface compatibility; currently unused.

        Returns:
            Logits of shape ``(batch, seq_len, vocab_size)``.
        """
        hidden = self.embedding(x)
        hidden = self.transformer(hidden)
        return self.output(hidden)

Decoding Strategies

Decoding Implementation

import torch
import torch.nn.functional as F

def greedy_decode(model, input_ids, max_length=100):
    """Extend ``input_ids`` by repeatedly appending the highest-scoring next token.

    Args:
        model: Callable mapping ``(batch, seq)`` ids to ``(batch, seq, vocab)`` logits.
            It is switched to eval mode and left there.
        input_ids: Prompt token ids of shape ``(batch, seq)``.
        max_length: Number of tokens to generate (appended to the prompt).

    Returns:
        Tensor of shape ``(batch, seq + max_length)`` holding prompt plus
        generated tokens.
    """
    model.eval()
    # Generation needs no gradients; disabling autograd avoids building a
    # graph for every forward pass.
    with torch.no_grad():
        for _ in range(max_length):
            next_token_logits = model(input_ids)[:, -1, :]
            next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
            input_ids = torch.cat([input_ids, next_token], dim=1)
    return input_ids

def top_k_decode(model, input_ids, k=50, max_length=100, temperature=1.0):
    """Generate tokens by sampling from the ``k`` most likely candidates at each step.

    Args:
        model: Callable mapping ``(batch, seq)`` ids to ``(batch, seq, vocab)`` logits.
            It is switched to eval mode and left there.
        input_ids: Prompt token ids of shape ``(batch, seq)``.
        k: Number of highest-scoring tokens kept per row; clamped to
            ``[1, vocab_size]``.
        max_length: Number of tokens to generate (appended to the prompt).
        temperature: Divides the logits before sampling. ``0`` selects the
            argmax deterministically instead of dividing by zero.

    Returns:
        Tensor of shape ``(batch, seq + max_length)``.
    """
    model.eval()
    with torch.no_grad():
        for _ in range(max_length):
            next_token_logits = model(input_ids)[:, -1, :]

            if temperature == 0:
                # Zero temperature is the greedy limit of sampling.
                next_token = next_token_logits.argmax(dim=-1, keepdim=True)
            else:
                next_token_logits = next_token_logits / temperature

                # Filter to top-k. Keep the threshold as a (batch, 1) column so
                # each row is compared against its own k-th largest logit.
                top_k = max(1, min(k, next_token_logits.size(-1)))
                top_k_values, _ = torch.topk(next_token_logits, top_k)
                threshold = top_k_values[:, -1:]
                next_token_logits = next_token_logits.masked_fill(
                    next_token_logits < threshold, float("-inf")
                )

                # Sample
                probs = F.softmax(next_token_logits, dim=-1)
                next_token = torch.multinomial(probs, 1)

            input_ids = torch.cat([input_ids, next_token], dim=1)
    return input_ids

def nucleus_decode(model, input_ids, p=0.9, max_length=100, temperature=1.0):
    """Generate tokens with nucleus (top-p) sampling.

    At each step the smallest set of highest-probability tokens whose
    cumulative probability exceeds ``p`` is kept (always at least one token),
    and the next token is sampled from that set.

    Args:
        model: Callable mapping ``(batch, seq)`` ids to ``(batch, seq, vocab)`` logits.
            It is switched to eval mode and left there.
        input_ids: Prompt token ids of shape ``(batch, seq)``.
        p: Cumulative-probability threshold of the nucleus.
        max_length: Number of tokens to generate (appended to the prompt).
        temperature: Divides the logits before sampling. ``0`` selects the
            argmax deterministically instead of dividing by zero.

    Returns:
        Tensor of shape ``(batch, seq + max_length)``.
    """
    model.eval()
    with torch.no_grad():
        for _ in range(max_length):
            next_token_logits = model(input_ids)[:, -1, :]

            if temperature == 0:
                # Zero temperature is the greedy limit of sampling.
                next_token = next_token_logits.argmax(dim=-1, keepdim=True)
            else:
                next_token_logits = next_token_logits / temperature

                # Sort probabilities
                sorted_logits, sorted_indices = torch.sort(
                    next_token_logits, descending=True
                )
                cumulative_probs = torch.cumsum(
                    F.softmax(sorted_logits, dim=-1), dim=-1
                )

                # Drop tokens once the cumulative probability is above p,
                # shifted right by one so the token that crosses the threshold
                # (and always the top token) is kept.
                exceeds = cumulative_probs > p
                sorted_remove = torch.zeros_like(exceeds)
                sorted_remove[:, 1:] = exceeds[:, :-1]

                # Map the per-row removal flags from sorted order back to
                # vocabulary order. Each row gets its own mask, so one
                # sequence's nucleus never affects another's.
                remove = torch.zeros_like(sorted_remove).scatter(
                    1, sorted_indices, sorted_remove
                )
                next_token_logits = next_token_logits.masked_fill(
                    remove, float("-inf")
                )

                probs = F.softmax(next_token_logits, dim=-1)
                next_token = torch.multinomial(probs, 1)

            input_ids = torch.cat([input_ids, next_token], dim=1)
    return input_ids

Temperature and Sampling

Temperature	Effect	Use Case
0.0	Deterministic, greedy	Factual tasks
0.7	Balanced	General use
1.0	Standard sampling	Creative tasks
1.5	High diversity	Brainstorming

Summary

Language models learn to predict tokens, and decoding strategies determine how outputs are generated. Understanding these fundamentals is essential for effective generative AI development.

Next: We'll explore pretraining and fine-tuning approaches.

Language Model Fundamentals

Language Model Fundamentals

What is a Language Model?

Mathematical Foundation

Training Objectives

Causal Language Modeling (CLM)

Masked Language Modeling (MLM)

Decoding Strategies

Decoding Implementation

Temperature and Sampling

Summary

Premium Content

Need Expert Generative AI Help?