Introduction
Transformers use self-attention mechanisms to process sequential data, forming the backbone of modern NLP models.
Attention Mechanism
import torch
import torch.nn as nn
class SelfAttention(nn.Module):
def __init__(self, embed_size, heads):
super().__init__()
self.embed_size = embed_size
self.heads = heads
self.head_dim = embed_size // heads
self.values = nn.Linear(embed_size, embed_size)
self.keys = nn.Linear(embed_size, embed_size)
self.queries = nn.Linear(embed_size, embed_size)
self.fc_out = nn.Linear(embed_size, embed_size)
def forward(self, values, keys, query, mask):
N = query.shape[0]
value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]
values = self.values(values)
keys = self.keys(keys)
queries = self.queries(query)
# Reshape for multi-head attention
values = values.reshape(N, value_len, self.heads, self.head_dim)
keys = keys.reshape(N, key_len, self.heads, self.head_dim)
queries = queries.reshape(N, query_len, self.heads, self.head_dim)
# Attention scores
energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])
if mask is not None:
energy = energy.masked_fill(mask == 0, float("-1e20"))
attention = torch.softmax(energy / (self.embed_size ** (1/2)), dim=3)
out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape(N, query_len, self.embed_size)
return self.fc_out(out)
BERT
from transformers import BertModel, BertTokenizer
bert = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
inputs = tokenizer("Hello world", return_tensors='pt')
outputs = bert(**inputs)
pooled = outputs.pooler_output
last_hidden = outputs.last_hidden_state
GPT Architecture
from transformers import GPT2Model, GPT2Tokenizer
gpt = GPT2Model.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
inputs = tokenizer("Once upon a time", return_tensors='pt')
outputs = gpt(**inputs)
Practice Problems
- Implement self-attention
- Use BERT for embeddings
- Fine-tune BERT
- Use GPT for generation
- Compare encoder vs decoder