NLP Fundamentals
NLP fundamentals cover the core techniques for processing and understanding human language, from basic tokenization to advanced transformer architectures.
Tokenization Methods
import re
from typing import List
class Tokenizer:
    """Small tokenizer offering word, character and toy subword splitting.

    ``method`` selects how :meth:`build_vocab`, :meth:`encode` and
    :meth:`decode` treat text: ``"word"`` uses :meth:`word_tokenize`; any
    other value uses :meth:`character_tokenize`.
    """

    def __init__(self, method: str = "word"):
        """Store the tokenization method and start with an empty vocabulary."""
        self.method = method
        # token -> integer id; populated by build_vocab().
        self.vocab = {}
        # integer id -> token; the inverse of ``vocab``.
        self.inverse_vocab = {}

    def word_tokenize(self, text: str) -> List[str]:
        """Lowercase *text*, turn punctuation into spaces, split on whitespace."""
        text = text.lower()
        text = re.sub(r'[^\w\s]', ' ', text)
        return text.split()

    def subword_tokenize(self, text: str, vocab_size: int = 1000) -> List[str]:
        """Split *text* into subword pieces, word by word.

        Each word from :meth:`word_tokenize` is passed through
        :meth:`_bpe_tokenize` and the pieces are concatenated in order.

        Args:
            text: Input string.
            vocab_size: Upper bound on the number of tokens returned; a
                longer result is truncated to this length.

        Returns:
            The list of subword tokens.
        """
        # The previous version also prepended every raw character of the
        # text, duplicating the input ahead of the actual subword pieces.
        pieces = [
            piece
            for word in self.word_tokenize(text)
            for piece in self._bpe_tokenize(word)
        ]
        return pieces[:vocab_size]

    def _bpe_tokenize(self, word: str) -> List[str]:
        """Apply one byte-pair-style merge to *word*.

        The most frequent adjacent character pair (first seen wins ties) is
        merged into a single symbol wherever it occurs, scanning left to
        right; all other characters are kept, so joining the result always
        reproduces *word*. Words shorter than two characters are returned
        as a single-element list.
        """
        chars = list(word)
        pair_counts = {}
        for left, right in zip(chars, chars[1:]):
            pair_counts[(left, right)] = pair_counts.get((left, right), 0) + 1
        if not pair_counts:
            return [word]
        best_pair = max(pair_counts, key=pair_counts.get)

        merged = []
        i = 0
        while i < len(chars):
            if i + 1 < len(chars) and (chars[i], chars[i + 1]) == best_pair:
                merged.append(chars[i] + chars[i + 1])
                i += 2  # skip both characters of the merged pair
            else:
                merged.append(chars[i])
                i += 1
        return merged

    def character_tokenize(self, text: str) -> List[str]:
        """Return the individual characters of *text*."""
        return list(text)

    def _tokenize(self, text: str) -> List[str]:
        """Tokenize *text* according to ``self.method``."""
        if self.method == "word":
            return self.word_tokenize(text)
        return self.character_tokenize(text)

    def build_vocab(self, texts: List[str]):
        """Build ``vocab`` / ``inverse_vocab`` from *texts*.

        Ids are assigned in descending frequency order, so the most common
        token receives id 0.
        """
        from collections import Counter

        token_counts = Counter(
            token for text in texts for token in self._tokenize(text)
        )
        self.vocab = {
            token: idx
            for idx, (token, _) in enumerate(token_counts.most_common())
        }
        self.inverse_vocab = {idx: token for token, idx in self.vocab.items()}

    def encode(self, text: str) -> List[int]:
        """Map *text* to a list of token ids using the current vocabulary.

        Tokens missing from the vocabulary are encoded as 0.
        """
        # NOTE(review): id 0 is also the id of the most frequent token once a
        # vocabulary is built, so unknown tokens are indistinguishable from
        # it. Kept as-is so existing ids stay stable.
        return [self.vocab.get(token, 0) for token in self._tokenize(text)]

    def decode(self, indices: List[int]) -> str:
        """Turn token ids back into text.

        Ids that are not in the vocabulary are skipped. Word tokens are
        joined with single spaces; for any other method the tokens are
        concatenated directly, so character-level encoding round-trips.
        """
        tokens = [
            self.inverse_vocab[i] for i in indices if i in self.inverse_vocab
        ]
        separator = " " if self.method == "word" else ""
        return separator.join(tokens)
# Usage
tokenizer = Tokenizer(method="word")
tokens = tokenizer.word_tokenize("Hello, world! This is NLP.")
# encode() looks tokens up in the vocabulary, so build it first; with an
# empty vocabulary every token would be encoded as 0.
tokenizer.build_vocab(["Hello, world! This is NLP."])
encoded = tokenizer.encode("Hello world")
Word Embeddings
import numpy as np
from collections import defaultdict
class Word2Vec:
    """Skip-gram-style embedding model trained with a full softmax.

    ``W_in`` holds one input embedding per vocabulary row and ``W_out``
    projects an embedding to one score per vocabulary entry.
    """

    def __init__(self, vocab_size: int, embedding_dim: int, learning_rate: float = 0.01):
        """Initialise both weight matrices with small random values."""
        self.W_in = np.random.randn(vocab_size, embedding_dim) * 0.01
        self.W_out = np.random.randn(embedding_dim, vocab_size) * 0.01
        self.lr = learning_rate

    def one_hot(self, idx: int, vocab_size: int) -> np.ndarray:
        """Return a length-``vocab_size`` vector that is 1 at *idx*, else 0."""
        vec = np.zeros(vocab_size)
        vec[idx] = 1
        return vec

    def forward(self, 中心词_idx: int, vocab_size: int) -> np.ndarray:
        """Return the softmax distribution over the vocabulary.

        Args:
            中心词_idx: Row index of the center word in ``W_in``. The
                parameter name (it means "center word") is kept unchanged
                so keyword callers keep working.
            vocab_size: Unused; kept for interface compatibility.

        Returns:
            1-D array of probabilities, one per column of ``W_out``.
        """
        h = self.W_in[中心词_idx]
        scores = h @ self.W_out
        # Subtract the max before exponentiating for numerical stability.
        exp_scores = np.exp(scores - np.max(scores))
        return exp_scores / exp_scores.sum()

    def train_step(self, center_idx: int, context_idx: int, vocab_size: int):
        """Run one SGD update for a (center, context) pair.

        Args:
            center_idx: Index of the center word.
            context_idx: Index of the context word to predict.
            vocab_size: Passed through to :meth:`forward`, which ignores it.

        Returns:
            The cross-entropy loss computed before the update.
        """
        probs = self.forward(center_idx, vocab_size)
        loss = -np.log(probs[context_idx] + 1e-8)  # epsilon avoids log(0)

        # Gradient of softmax cross-entropy w.r.t. the scores: probs - one_hot.
        grad_scores = probs.copy()
        grad_scores[context_idx] -= 1

        # Both gradients are computed from the pre-update weights.
        grad_W_out = np.outer(self.W_in[center_idx], grad_scores)
        grad_h = grad_scores @ self.W_out.T

        self.W_in[center_idx] -= self.lr * grad_h
        self.W_out -= self.lr * grad_W_out
        return loss

    def get_embedding(self, word_idx: int) -> np.ndarray:
        """Return the input embedding row for *word_idx*."""
        return self.W_in[word_idx]

    def most_similar(self, word_idx: int, top_k: int = 5) -> list:
        """Return indices of the *top_k* rows most cosine-similar to *word_idx*.

        The query word itself is always excluded. Rows with zero norm (or a
        zero-norm query) are given similarity 0 instead of NaN.

        Args:
            word_idx: Index of the query word.
            top_k: Maximum number of neighbours to return.

        Returns:
            List of row indices, most similar first; shorter than *top_k*
            when fewer other rows exist.
        """
        embedding = self.get_embedding(word_idx)
        dots = self.W_in @ embedding
        norms = np.linalg.norm(self.W_in, axis=1) * np.linalg.norm(embedding)
        # Divide only where the denominator is non-zero; NaN similarities
        # would otherwise sort to the front of the ranking.
        similarities = np.divide(
            dots,
            norms,
            out=np.zeros_like(dots, dtype=float),
            where=norms > 0,
        )
        ranked = np.argsort(similarities)[::-1]
        # Drop the query explicitly rather than assuming it sorts first:
        # duplicate embeddings can tie with it.
        ranked = ranked[ranked != word_idx]
        return ranked[:top_k].tolist()
# Usage: a model with 10000 vocabulary rows and 100-dimensional embeddings.
w2v = Word2Vec(10000, 100)
Attention Mechanism
import numpy as np
class SelfAttention:
    """Single-head scaled dot-product self-attention implemented with NumPy."""

    def __init__(self, d_model: int):
        """Create square ``(d_model, d_model)`` query/key/value projections."""
        self.d_model = d_model
        self.W_q = np.random.randn(d_model, d_model) * 0.01
        self.W_k = np.random.randn(d_model, d_model) * 0.01
        self.W_v = np.random.randn(d_model, d_model) * 0.01

    def forward(self, x: np.ndarray) -> "tuple[np.ndarray, np.ndarray]":
        """Attend over the second-to-last axis of *x*.

        Args:
            x: Array whose last axis has size ``d_model``. Any leading axes
                before the last two are treated as batch dimensions.

        Returns:
            ``(output, attention_weights)``: the attended values, and the
            softmax-normalised score matrix whose rows sum to 1.
        """
        Q = x @ self.W_q
        K = x @ self.W_k
        V = x @ self.W_v
        # Swap only the last two axes: identical to ``K.T`` for 2-D input,
        # but also correct when leading batch axes are present.
        scores = Q @ np.swapaxes(K, -1, -2) / np.sqrt(self.d_model)
        attention_weights = self.softmax(scores)
        output = attention_weights @ V
        return output, attention_weights

    def softmax(self, x: np.ndarray) -> np.ndarray:
        """Softmax over the last axis, shifted by the row max for stability."""
        exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return exp_x / exp_x.sum(axis=-1, keepdims=True)
class MultiHeadAttention:
    """Multi-head attention built from independent ``SelfAttention`` heads.

    The feature axis is split into ``num_heads`` contiguous slices of width
    ``d_k = d_model // num_heads``; each head attends over its own slice and
    the head outputs are concatenated and projected by ``W_o``.
    """

    def __init__(self, d_model: int, num_heads: int):
        """Create the heads and the output projection.

        Raises:
            ValueError: If ``num_heads`` is not positive or does not divide
                ``d_model`` evenly.
        """
        if num_heads <= 0 or d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive "
                f"num_heads ({num_heads})"
            )
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.attention_heads = [SelfAttention(self.d_k) for _ in range(num_heads)]
        self.W_o = np.random.randn(d_model, d_model) * 0.01

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Run every head on its feature slice and merge the results.

        Args:
            x: Array whose last axis has size ``d_model``.

        Returns:
            Array with the same shape as *x*.
        """
        # Each head's weights are (d_k, d_k), so it must receive a d_k-wide
        # slice; passing the full d_model-wide input fails to multiply.
        chunks = np.split(x, len(self.attention_heads), axis=-1)
        head_outputs = [
            head.forward(chunk)[0]  # element 0 is the output, 1 the weights
            for head, chunk in zip(self.attention_heads, chunks)
        ]
        concatenated = np.concatenate(head_outputs, axis=-1)
        return concatenated @ self.W_o
# Usage: single-head attention over a random sequence.
attention = SelfAttention(512)
x = np.random.randn(10, 512)  # shape (10, 512): 10 tokens, 512 dimensions
output, weights = attention.forward(x)
Transformer Block
class TransformerBlock:
    """One post-norm transformer layer: attention, then feed-forward.

    Each sub-layer is wrapped in a residual connection followed by layer
    normalisation.
    """

    def __init__(self, d_model: int, d_ff: int, num_heads: int):
        """Build the attention, normalisation and feed-forward sub-layers."""
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.ffn = FeedForward(d_model, d_ff)

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Apply the block to *x* and return the transformed array."""
        # MultiHeadAttention.forward returns a single array (not an
        # (output, weights) pair), so it must not be tuple-unpacked.
        attn_output = self.attention.forward(x)
        # Call .forward explicitly: the layers expose forward() and are not
        # guaranteed to be callable.
        x = self.norm1.forward(x + attn_output)
        ffn_output = self.ffn.forward(x)
        x = self.norm2.forward(x + ffn_output)
        return x
class LayerNorm:
    """Layer normalisation over the last axis with a learnable scale/shift."""

    def __init__(self, d_model: int, eps: float = 1e-6):
        """Initialise the scale to ones and the shift to zeros.

        Args:
            d_model: Size of the last axis of the inputs.
            eps: Constant added to the variance to avoid division by zero.
        """
        self.gamma = np.ones(d_model)
        self.beta = np.zeros(d_model)
        self.eps = eps

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Normalise *x* to zero mean / unit variance along its last axis."""
        mean = x.mean(axis=-1, keepdims=True)
        var = x.var(axis=-1, keepdims=True)
        x_norm = (x - mean) / np.sqrt(var + self.eps)
        return self.gamma * x_norm + self.beta

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Allow ``layer(x)`` as shorthand for ``layer.forward(x)``."""
        return self.forward(x)
class FeedForward:
    """Two-layer position-wise network: linear, ReLU, linear (no biases)."""

    def __init__(self, d_model: int, d_ff: int):
        """Draw both weight matrices from a normal distribution scaled by 0.01."""
        scale = 0.01
        # W1 is drawn before W2 so the random stream is consumed in order.
        self.W1 = np.random.randn(d_model, d_ff) * scale
        self.W2 = np.random.randn(d_ff, d_model) * scale

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Project *x* with ``W1``, clamp negatives to zero, project with ``W2``."""
        hidden = np.matmul(x, self.W1)
        activated = np.maximum(0, hidden)
        return np.matmul(activated, self.W2)
# Usage: one transformer block applied to a random (10, 512) input.
transformer = TransformerBlock(512, 2048, 8)
x = np.random.randn(10, 512)
output = transformer.forward(x)
Key Takeaways
- Tokenization splits text into tokens, which are then mapped to integer IDs for processing
- Embeddings capture semantic meaning in dense vector spaces
- Self-attention enables models to focus on the relevant parts of the input
- Transformers stack attention and feed-forward layers for deep processing
- Pre-trained models provide powerful starting points for NLP tasks