NLP Fundamentals

NLP fundamentals cover the core techniques for processing and understanding human language, from basic tokenization to advanced transformer architectures.

Tokenization Methods

import re
from typing import List

class Tokenizer:
    """Minimal text tokenizer with word, character and BPE-style subword modes.

    ``method`` selects the tokenization used by :meth:`build_vocab`,
    :meth:`encode` and :meth:`decode`: ``"word"`` uses :meth:`word_tokenize`,
    any other value uses :meth:`character_tokenize`.

    Index ``UNK_ID`` is reserved for ``UNK_TOKEN`` so that out-of-vocabulary
    tokens never share an id with a real vocabulary entry.
    """

    UNK_TOKEN = "<unk>"
    UNK_ID = 0

    def __init__(self, method: str = "word"):
        self.method = method
        self.vocab = {}          # token -> index
        self.inverse_vocab = {}  # index -> token

    def word_tokenize(self, text: str) -> List[str]:
        """Lowercase ``text``, replace punctuation with spaces, split on whitespace."""
        text = text.lower()
        text = re.sub(r'[^\w\s]', ' ', text)
        return text.split()

    def subword_tokenize(self, text: str, vocab_size: int = 1000) -> List[str]:
        """Split ``text`` into subword units using byte-pair-encoding merges.

        Every word starts as a list of characters. The most frequent adjacent
        symbol pair across all words is merged repeatedly until either
        ``vocab_size`` distinct symbols exist or no pair occurs at least
        twice (a pair seen once would only memorize a single word).

        Args:
            text: Raw input text; it is word-tokenized first.
            vocab_size: Upper bound on the number of distinct symbols
                (initial characters plus learned merges).

        Returns:
            The subword tokens of every word, in reading order. Joining the
            tokens of one word always reproduces that word.
        """
        words = [list(word) for word in self.word_tokenize(text)]
        symbols = {symbol for word in words for symbol in word}

        while len(symbols) < vocab_size:
            pair_counts = self._count_pairs(words)
            if not pair_counts:
                break
            # max() keeps the first pair seen among ties, so merges are deterministic.
            best_pair = max(pair_counts, key=pair_counts.get)
            if pair_counts[best_pair] < 2:
                break
            words = [self._merge_pair(word, best_pair) for word in words]
            symbols.add("".join(best_pair))

        return [symbol for word in words for symbol in word]

    def _bpe_tokenize(self, word: str) -> List[str]:
        """Apply a single BPE merge to ``word``.

        The most frequent adjacent character pair inside the word is merged
        into one symbol; all other characters are kept, so the returned
        pieces always concatenate back to ``word``. Words with fewer than two
        characters are returned unchanged as a one-element list.
        """
        symbols = list(word)
        pair_counts = self._count_pairs([symbols])
        if not pair_counts:
            return [word]

        best_pair = max(pair_counts, key=pair_counts.get)
        return self._merge_pair(symbols, best_pair)

    @staticmethod
    def _count_pairs(words):
        """Count adjacent symbol pairs over a list of symbol lists."""
        from collections import Counter
        return Counter(
            pair for symbols in words for pair in zip(symbols, symbols[1:])
        )

    @staticmethod
    def _merge_pair(symbols, pair):
        """Return ``symbols`` with each left-to-right occurrence of ``pair`` fused."""
        merged = []
        i = 0
        while i < len(symbols):
            if i + 1 < len(symbols) and (symbols[i], symbols[i + 1]) == pair:
                merged.append(symbols[i] + symbols[i + 1])
                i += 2
            else:
                merged.append(symbols[i])
                i += 1
        return merged

    def character_tokenize(self, text: str) -> List[str]:
        """Split ``text`` into individual characters (whitespace included)."""
        return list(text)

    def _tokenize(self, text: str) -> List[str]:
        """Tokenize ``text`` according to ``self.method``."""
        if self.method == "word":
            return self.word_tokenize(text)
        return self.character_tokenize(text)

    def build_vocab(self, texts: List[str]):
        """Build ``vocab`` / ``inverse_vocab`` from ``texts``.

        Index ``UNK_ID`` is assigned to ``UNK_TOKEN``; the remaining tokens
        receive consecutive ids in order of decreasing frequency.
        """
        from collections import Counter
        token_counts = Counter(
            token for text in texts for token in self._tokenize(text)
        )

        self.vocab = {self.UNK_TOKEN: self.UNK_ID}
        for token, _ in token_counts.most_common():
            # setdefault keeps the reserved id if a token equals UNK_TOKEN.
            self.vocab.setdefault(token, len(self.vocab))
        self.inverse_vocab = {idx: token for token, idx in self.vocab.items()}

    def encode(self, text: str) -> List[int]:
        """Map ``text`` to vocabulary ids; unknown tokens become ``UNK_ID``."""
        return [self.vocab.get(token, self.UNK_ID) for token in self._tokenize(text)]

    def decode(self, indices: List[int]) -> str:
        """Map ids back to text.

        Word tokens are joined with single spaces and character tokens are
        concatenated directly. Ids that are not in the vocabulary are
        rendered as ``UNK_TOKEN``.
        """
        tokens = [self.inverse_vocab.get(i, self.UNK_TOKEN) for i in indices]
        separator = " " if self.method == "word" else ""
        return separator.join(tokens)

# Usage
tokenizer = Tokenizer(method="word")
tokens = tokenizer.word_tokenize("Hello, world! This is NLP.")
# The vocabulary has to be built before encoding: with an empty vocabulary
# every token falls back to the default index.
tokenizer.build_vocab(["Hello, world! This is NLP."])
encoded = tokenizer.encode("Hello world")

Word Embeddings

import numpy as np
from collections import defaultdict

class Word2Vec:
    """Word-embedding model that predicts a context word from a center word.

    The model is two weight matrices and a full softmax over the vocabulary:
    ``W_in`` (``vocab_size x embedding_dim``) holds the word embeddings and
    ``W_out`` (``embedding_dim x vocab_size``) scores every vocabulary entry
    as a possible context word.
    """

    def __init__(self, vocab_size: int, embedding_dim: int, learning_rate: float = 0.01):
        # Small random initial weights, drawn from NumPy's global random state.
        self.W_in = np.random.randn(vocab_size, embedding_dim) * 0.01
        self.W_out = np.random.randn(embedding_dim, vocab_size) * 0.01
        self.lr = learning_rate

    def one_hot(self, idx: int, vocab_size: int) -> np.ndarray:
        """Return a length-``vocab_size`` vector that is 1 at ``idx`` and 0 elsewhere."""
        vec = np.zeros(vocab_size)
        vec[idx] = 1
        return vec

    # NOTE: the first parameter's name (it means "center word") is kept as-is
    # so that existing keyword calls keep working.
    def forward(self, 中心词_idx: int, vocab_size: int) -> np.ndarray:
        """Return the softmax distribution over context words for a center word.

        Args:
            中心词_idx: Row index of the center word in ``W_in``.
            vocab_size: Unused; the output length comes from ``W_out``.

        Returns:
            1-D array of probabilities, one per column of ``W_out``.
        """
        h = self.W_in[中心词_idx]
        scores = h @ self.W_out
        # Subtract the maximum before exponentiating for numerical stability.
        exp_scores = np.exp(scores - np.max(scores))
        probs = exp_scores / exp_scores.sum()
        return probs

    def train_step(self, center_idx: int, context_idx: int, vocab_size: int):
        """Run one gradient-descent step on a (center, context) pair.

        Args:
            center_idx: Index of the center word.
            context_idx: Index of the context word to be predicted.
            vocab_size: Passed through to :meth:`forward`.

        Returns:
            The cross-entropy loss measured before the update.
        """
        probs = self.forward(center_idx, vocab_size)
        loss = -np.log(probs[context_idx] + 1e-8)  # epsilon guards log(0)

        # Gradient of softmax cross-entropy w.r.t. the scores: probs - one_hot.
        grad_scores = probs.copy()
        grad_scores[context_idx] -= 1

        # Both gradients are computed from the pre-update weights.
        grad_W_out = np.outer(self.W_in[center_idx], grad_scores)
        grad_h = grad_scores @ self.W_out.T

        self.W_in[center_idx] -= self.lr * grad_h
        self.W_out -= self.lr * grad_W_out

        return loss

    def get_embedding(self, word_idx: int) -> np.ndarray:
        """Return the embedding row of ``word_idx`` (a view into ``W_in``)."""
        return self.W_in[word_idx]

    def most_similar(self, word_idx: int, top_k: int = 5) -> list:
        """Return the indices of the ``top_k`` words most similar to ``word_idx``.

        Similarity is the cosine between rows of ``W_in``. The query word is
        excluded explicitly rather than by assuming it ranks first, and
        zero-norm rows get similarity 0 instead of NaN.

        Args:
            word_idx: Index of the query word.
            top_k: Maximum number of neighbours to return.

        Returns:
            Word indices ordered by decreasing similarity; ties are ordered
            by increasing index. Fewer than ``top_k`` entries are returned
            when the vocabulary is too small.
        """
        embedding = self.get_embedding(word_idx)
        norms = np.linalg.norm(self.W_in, axis=1) * np.linalg.norm(embedding)
        similarities = (self.W_in @ embedding) / np.maximum(norms, 1e-12)

        # A stable sort on the negated scores gives a deterministic order on ties.
        ranked = np.argsort(-similarities, kind="stable")
        query = word_idx % self.W_in.shape[0]  # normalize negative indices
        ranked = ranked[ranked != query]
        return ranked[:max(top_k, 0)].tolist()

# Usage
# Build a model for a 10,000-entry vocabulary with 100-dimensional embeddings;
# W_in is created with shape (10000, 100) and W_out with shape (100, 10000).
w2v = Word2Vec(vocab_size=10000, embedding_dim=100)

Attention Mechanism

import numpy as np

class SelfAttention:
    """Single-head scaled dot-product self-attention with square projections.

    Queries, keys and values are all computed from the same input with
    ``d_model x d_model`` weight matrices.
    """

    def __init__(self, d_model: int):
        self.d_model = d_model
        # Small random initial weights, drawn from NumPy's global random state.
        self.W_q = np.random.randn(d_model, d_model) * 0.01
        self.W_k = np.random.randn(d_model, d_model) * 0.01
        self.W_v = np.random.randn(d_model, d_model) * 0.01

    def forward(self, x: np.ndarray) -> "tuple[np.ndarray, np.ndarray]":
        """Attend over the second-to-last axis of ``x``.

        Args:
            x: Array whose last axis has length ``d_model``. Any leading
                axes before the last two are treated as batch axes.

        Returns:
            ``(output, attention_weights)``: ``output`` has the shape of
            ``x``; ``attention_weights`` has one row per query position and
            each row sums to 1.
        """
        Q = x @ self.W_q
        K = x @ self.W_k
        V = x @ self.W_v

        # Swap only the last two axes: ``K.T`` would reverse every axis and
        # break inputs that carry leading batch axes.
        scores = Q @ np.swapaxes(K, -1, -2) / np.sqrt(self.d_model)
        attention_weights = self.softmax(scores)
        output = attention_weights @ V

        return output, attention_weights

    def softmax(self, x: np.ndarray) -> np.ndarray:
        """Softmax over the last axis, shifted by the row maximum for stability."""
        exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return exp_x / exp_x.sum(axis=-1, keepdims=True)

class MultiHeadAttention:
    """Multi-head self-attention built from independent ``SelfAttention`` heads.

    The feature axis of the input is split into ``num_heads`` equal slices of
    width ``d_k = d_model // num_heads``. Each head attends over its own
    slice, and the concatenated head outputs are mixed by ``W_o``.
    """

    def __init__(self, d_model: int, num_heads: int):
        """Create the heads and the output projection.

        Raises:
            ValueError: If ``d_model`` is not divisible by ``num_heads``.
        """
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.attention_heads = [SelfAttention(self.d_k) for _ in range(num_heads)]
        self.W_o = np.random.randn(d_model, d_model) * 0.01

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Apply every head to its slice of ``x`` and project the result.

        Args:
            x: Array whose last axis has length ``d_model``.

        Returns:
            Array with the same shape as ``x``.
        """
        # Each head owns d_k-sized weights, so it must receive a d_k-wide
        # slice of the features rather than the full d_model-wide input.
        slices = np.split(x, self.num_heads, axis=-1)

        head_outputs = []
        for head, head_input in zip(self.attention_heads, slices):
            output, _ = head.forward(head_input)
            head_outputs.append(output)

        concatenated = np.concatenate(head_outputs, axis=-1)
        return concatenated @ self.W_o

# Usage
# Run single-head self-attention over a random sequence.
attention = SelfAttention(d_model=512)
x = np.random.randn(10, 512)  # 10 tokens, 512 dimensions
# forward returns both the attended values (10 x 512 here) and the
# row-normalized attention weights (10 x 10 here).
output, weights = attention.forward(x)

Transformer Block

class TransformerBlock:
    """One transformer layer: multi-head attention followed by a feed-forward net.

    Each sub-layer is wrapped in a residual connection, and layer
    normalization is applied to the sum.
    """

    def __init__(self, d_model: int, d_ff: int, num_heads: int):
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.ffn = FeedForward(d_model, d_ff)

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Run the block on ``x`` and return an array of the same shape.

        Args:
            x: Array whose last axis has length ``d_model``.
        """
        # MultiHeadAttention.forward returns a single array, not an
        # (output, weights) pair, so it must not be tuple-unpacked.
        attn_output = self.attention.forward(x)
        # The sub-layers are invoked through their explicit ``forward`` methods.
        x = self.norm1.forward(x + attn_output)
        ffn_output = self.ffn.forward(x)
        x = self.norm2.forward(x + ffn_output)
        return x

class LayerNorm:
    """Layer normalization over the last axis with a learnable scale and shift.

    Instances are callable: ``norm(x)`` is equivalent to ``norm.forward(x)``.
    """

    def __init__(self, d_model: int, eps: float = 1e-6):
        self.gamma = np.ones(d_model)   # scale, starts as identity
        self.beta = np.zeros(d_model)   # shift, starts at zero
        self.eps = eps                  # keeps the denominator away from zero

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Normalize ``x`` to zero mean and unit variance along the last axis.

        Args:
            x: Array whose last axis has length ``d_model``.

        Returns:
            ``gamma * (x - mean) / sqrt(var + eps) + beta``, same shape as ``x``.
        """
        mean = x.mean(axis=-1, keepdims=True)
        var = x.var(axis=-1, keepdims=True)
        x_norm = (x - mean) / np.sqrt(var + self.eps)
        return self.gamma * x_norm + self.beta

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Allow the layer to be applied like a function."""
        return self.forward(x)

class FeedForward:
    """Two-layer position-wise network: linear, ReLU, linear (no bias terms)."""

    def __init__(self, d_model: int, d_ff: int):
        # Expansion weights are drawn first, then the projection weights.
        expand = np.random.randn(d_model, d_ff)
        self.W1 = expand * 0.01
        project = np.random.randn(d_ff, d_model)
        self.W2 = project * 0.01

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Map ``x`` through ``W1``, clamp negatives to zero, then apply ``W2``."""
        hidden = np.matmul(x, self.W1)
        activated = np.maximum(0, hidden)
        return np.matmul(activated, self.W2)

# Usage
# One block with model width 512, feed-forward width 2048 and 8 attention heads.
transformer = TransformerBlock(d_model=512, d_ff=2048, num_heads=8)
x = np.random.randn(10, 512)  # random input: 10 rows of 512 features
output = transformer.forward(x)

Key Takeaways

- Tokenization converts text into numerical tokens for processing.
- Embeddings capture semantic meaning in dense vector spaces.
- Self-attention enables models to focus on the relevant parts of the input.
- Transformers stack attention and feed-forward layers for deep processing.
- Pre-trained models provide powerful starting points for NLP tasks.

NLP Fundamentals

NLP Fundamentals

Tokenization Methods

Word Embeddings

Attention Mechanism

Transformer Block

Key Takeaways

Premium Content

Need Expert Generative AI Help?