🎉 75% of content is free forever — Unlock Premium from $10/mo →
CW
Search courses…
💼 Services · ℹ️ About · ✉️ Contact · View Pricing Plans from $10

NLP Fundamentals

🟢 Free Lesson

Advertisement

NLP Fundamentals

Raw Text ("Hello World", unstructured) → Tokenization (["Hello", "World"] → [101, 7592, 2088]) → Embeddings ([0.2, -0.5, ...], dense vectors) → Attention (self-attention; Q, K, V matrices) → Transformer (encoder-decoder, stacked layers)
Preprocessing Pipeline: Lowercase -> Remove Punctuation -> Stem
Feature Extraction: TF-IDF -> Word2Vec -> BERT Embeddings
Model Inference: Classification -> Generation -> Extraction
Post-processing: Decode -> Format -> Validate

NLP fundamentals cover the core techniques for processing and understanding human language, from basic tokenization to advanced transformer architectures.

Tokenization Methods

import re
from typing import List

class Tokenizer:
    """Small word/character tokenizer with a frequency-ordered vocabulary.

    ``method`` selects how ``build_vocab``, ``encode`` and ``decode`` split
    text: ``"word"`` uses :meth:`word_tokenize`; any other value uses
    :meth:`character_tokenize`.

    Attributes:
        method: Tokenization mode chosen at construction time.
        vocab: Mapping ``token -> id``; empty until :meth:`build_vocab` runs.
        inverse_vocab: Mapping ``id -> token``, the inverse of ``vocab``.
    """

    # Reserved at id 0 so unknown tokens never alias a real vocabulary entry.
    UNK_TOKEN = "<unk>"

    def __init__(self, method: str = "word"):
        self.method = method
        self.vocab = {}
        self.inverse_vocab = {}

    def word_tokenize(self, text: str) -> List[str]:
        """Lowercase *text*, turn punctuation into spaces, split on whitespace."""
        text = text.lower()
        text = re.sub(r'[^\w\s]', ' ', text)
        return text.split()

    def subword_tokenize(self, text: str, vocab_size: int = 1000) -> List[str]:
        """Split *text* into subword pieces, one BPE merge step per word.

        Args:
            text: Raw input text; it is normalized with :meth:`word_tokenize`.
            vocab_size: Upper bound on the number of pieces returned (the
                result is truncated to this length).

        Returns:
            The pieces of every word, in order. Concatenating the pieces of a
            word reproduces that word.
        """
        pieces = [
            piece
            for word in self.word_tokenize(text)
            for piece in self._bpe_tokenize(word)
        ]
        return pieces[:vocab_size]

    def _bpe_tokenize(self, word: str) -> List[str]:
        """Merge the most frequent adjacent character pair of *word* once.

        Every character of the word is kept: occurrences of the winning pair
        are fused into one two-character piece, scanning left to right without
        overlap, and all other characters stay single pieces. Ties between
        pairs go to the pair that appears first in the word.

        Returns:
            ``[word]`` when the word has fewer than two characters, otherwise
            the list of pieces.
        """
        chars = list(word)
        pair_counts = {}
        for pair in zip(chars, chars[1:]):
            pair_counts[pair] = pair_counts.get(pair, 0) + 1

        if not pair_counts:
            return [word]

        best_pair = max(pair_counts, key=pair_counts.get)

        pieces = []
        i = 0
        while i < len(chars):
            if i + 1 < len(chars) and (chars[i], chars[i + 1]) == best_pair:
                pieces.append(chars[i] + chars[i + 1])
                i += 2  # skip both characters of the merged pair
            else:
                pieces.append(chars[i])
                i += 1
        return pieces

    def character_tokenize(self, text: str) -> List[str]:
        """Return every character of *text* as its own token."""
        return list(text)

    def _tokenize(self, text: str) -> List[str]:
        """Tokenize *text* according to ``self.method``."""
        if self.method == "word":
            return self.word_tokenize(text)
        return self.character_tokenize(text)

    def build_vocab(self, texts: List[str]):
        """Build ``vocab`` and ``inverse_vocab`` from *texts*.

        Id 0 is reserved for ``UNK_TOKEN``; real tokens receive ids 1, 2, ...
        in descending frequency order (ties keep first-seen order).
        """
        from collections import Counter

        token_counts = Counter()
        for text in texts:
            token_counts.update(self._tokenize(text))

        self.vocab = {self.UNK_TOKEN: 0}
        for token, _ in token_counts.most_common():
            self.vocab.setdefault(token, len(self.vocab))
        self.inverse_vocab = {idx: token for token, idx in self.vocab.items()}

    def encode(self, text: str) -> List[int]:
        """Convert *text* to token ids; unknown tokens get the ``UNK_TOKEN`` id.

        Before :meth:`build_vocab` has run the vocabulary is empty and every
        token maps to 0.
        """
        unk_id = self.vocab.get(self.UNK_TOKEN, 0)
        return [self.vocab.get(token, unk_id) for token in self._tokenize(text)]

    def decode(self, indices: List[int]) -> str:
        """Convert token ids back to text.

        Word tokens are joined with single spaces; character tokens are
        concatenated directly so that spaces present in the original text are
        not doubled. Ids missing from the vocabulary decode to ``""``.
        """
        tokens = [self.inverse_vocab.get(i, "") for i in indices]
        separator = " " if self.method == "word" else ""
        return separator.join(tokens)

# Usage
tokenizer = Tokenizer(method="word")
tokens = tokenizer.word_tokenize("Hello, world! This is NLP.")
# Build the vocabulary before encoding: with an empty vocabulary every token
# falls back to id 0 and the encoding carries no information.
tokenizer.build_vocab(["Hello, world! This is NLP."])
encoded = tokenizer.encode("Hello world")

Word Embeddings

import numpy as np
from collections import defaultdict

class Word2Vec:
    """Skip-gram style word embedding model trained with a full softmax.

    Attributes:
        W_in: ``(vocab_size, embedding_dim)`` input embeddings, one row per word.
        W_out: ``(embedding_dim, vocab_size)`` output projection.
        lr: Step size used by :meth:`train_step`.
    """

    def __init__(self, vocab_size: int, embedding_dim: int, learning_rate: float = 0.01):
        # Small random initial weights, drawn from NumPy's global RNG.
        self.W_in = np.random.randn(vocab_size, embedding_dim) * 0.01
        self.W_out = np.random.randn(embedding_dim, vocab_size) * 0.01
        self.lr = learning_rate

    def one_hot(self, idx: int, vocab_size: int) -> np.ndarray:
        """Return a length-``vocab_size`` vector of zeros with a 1 at ``idx``."""
        vec = np.zeros(vocab_size)
        vec[idx] = 1
        return vec

    def forward(self, 中心词_idx: int, vocab_size: int) -> np.ndarray:
        """Return the softmax distribution over context words for a center word.

        Args:
            中心词_idx: Row index of the center word in ``W_in``. The identifier
                is Chinese for "center word"; it is kept unchanged so existing
                keyword callers keep working.
            vocab_size: Not used by the computation; kept for interface
                compatibility.

        Returns:
            A 1-D probability vector with one entry per column of ``W_out``.
        """
        h = self.W_in[中心词_idx]
        scores = h @ self.W_out
        # Subtract the maximum score before exponentiating to avoid overflow.
        exp_scores = np.exp(scores - np.max(scores))
        probs = exp_scores / exp_scores.sum()
        return probs

    def train_step(self, center_idx: int, context_idx: int, vocab_size: int):
        """Run one SGD step on a (center, context) pair.

        Both gradients are computed from the pre-update weights, then the
        center word's row of ``W_in`` and all of ``W_out`` are updated in place.

        Returns:
            The cross-entropy loss measured before the update.
        """
        probs = self.forward(center_idx, vocab_size)
        loss = -np.log(probs[context_idx] + 1e-8)  # epsilon guards log(0)

        # Gradient of softmax cross-entropy w.r.t. the scores: probs - one_hot.
        grad_scores = probs.copy()
        grad_scores[context_idx] -= 1

        grad_W_out = np.outer(self.W_in[center_idx], grad_scores)
        grad_h = grad_scores @ self.W_out.T

        self.W_in[center_idx] -= self.lr * grad_h
        self.W_out -= self.lr * grad_W_out

        return loss

    def get_embedding(self, word_idx: int) -> np.ndarray:
        """Return the input embedding (a row of ``W_in``) for ``word_idx``."""
        return self.W_in[word_idx]

    def most_similar(self, word_idx: int, top_k: int = 5) -> list:
        """Return the indices of the ``top_k`` words most similar to ``word_idx``.

        Similarity is the cosine between rows of ``W_in``. The query word is
        excluded explicitly rather than assumed to rank first, so duplicate
        embeddings cannot cause the query itself to be returned. Rows with
        zero norm get similarity 0 instead of NaN.

        Returns:
            At most ``top_k`` indices in descending similarity order; ties are
            resolved in favor of the lower index.
        """
        embedding = self.get_embedding(word_idx)
        norms = np.linalg.norm(self.W_in, axis=1) * np.linalg.norm(embedding)
        # Clamp the denominator so all-zero embeddings do not divide by zero.
        similarities = (self.W_in @ embedding) / np.maximum(norms, 1e-12)
        similarities[word_idx] = -np.inf  # never return the query word

        ranked = np.argsort(-similarities, kind="stable")
        limit = max(0, min(top_k, len(similarities) - 1))
        return ranked[:limit].tolist()

# Usage: a 10,000-word vocabulary with 100-dimensional embeddings.
w2v = Word2Vec(10000, 100)

Attention Mechanism

import numpy as np

class SelfAttention:
    """Single-head scaled dot-product self-attention.

    Attributes:
        d_model: Feature width of the input and of the Q/K/V projections.
        W_q, W_k, W_v: ``(d_model, d_model)`` projection matrices.
    """

    def __init__(self, d_model: int):
        self.d_model = d_model
        # Small random initial weights, drawn from NumPy's global RNG.
        self.W_q = np.random.randn(d_model, d_model) * 0.01
        self.W_k = np.random.randn(d_model, d_model) * 0.01
        self.W_v = np.random.randn(d_model, d_model) * 0.01

    def forward(
        self, x: np.ndarray, mask: "np.ndarray | None" = None
    ) -> "tuple[np.ndarray, np.ndarray]":
        """Attend every position of *x* to every other position.

        Args:
            x: ``(seq_len, d_model)`` input sequence.
            mask: Optional boolean array broadcastable to
                ``(seq_len, seq_len)``. ``True`` marks key positions a query
                may attend to; ``False`` positions receive (near-)zero weight.
                ``None`` (the default) allows all positions.

        Returns:
            ``(output, attention_weights)`` where ``output`` has the shape of
            *x* and ``attention_weights`` is ``(seq_len, seq_len)`` with rows
            summing to 1.
        """
        Q = x @ self.W_q
        K = x @ self.W_k
        V = x @ self.W_v

        # Scale by sqrt(d_model) to keep the dot products in a moderate range.
        scores = Q @ K.T / np.sqrt(self.d_model)
        if mask is not None:
            # A large finite negative (not -inf) keeps a fully masked row
            # well-defined: it degrades to uniform weights instead of NaN.
            scores = np.where(mask, scores, -1e9)
        attention_weights = self.softmax(scores)
        output = attention_weights @ V

        return output, attention_weights

    def softmax(self, x: np.ndarray) -> np.ndarray:
        """Numerically stable softmax over the last axis of *x*."""
        exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return exp_x / exp_x.sum(axis=-1, keepdims=True)

class MultiHeadAttention:
    """Multi-head self-attention built from independent ``SelfAttention`` heads.

    The feature dimension of the input is split into ``num_heads`` equal
    slices of width ``d_k``; each head attends over its own slice, and the
    head outputs are concatenated and mixed by the output projection ``W_o``.

    Attributes:
        d_model: Feature width of the input and output.
        num_heads: Number of attention heads.
        d_k: Per-head feature width, ``d_model // num_heads``.
        attention_heads: One ``SelfAttention(d_k)`` instance per head.
        W_o: ``(d_model, d_model)`` output projection.
    """

    def __init__(self, d_model: int, num_heads: int):
        """Create the heads and the output projection.

        Raises:
            ValueError: If ``num_heads`` is not positive or does not divide
                ``d_model`` evenly.
        """
        if num_heads <= 0 or d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.attention_heads = [SelfAttention(self.d_k) for _ in range(num_heads)]
        self.W_o = np.random.randn(d_model, d_model) * 0.01

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Apply every head to its slice of *x* and project the result.

        Args:
            x: ``(seq_len, d_model)`` input sequence.

        Returns:
            ``(seq_len, d_model)`` array.
        """
        # Each head holds d_k x d_k weights, so it must receive a d_k-wide
        # slice of the input rather than the full d_model-wide array.
        head_inputs = np.split(x, self.num_heads, axis=-1)
        head_outputs = [
            head.forward(chunk)[0]  # keep the output, drop the weights
            for head, chunk in zip(self.attention_heads, head_inputs)
        ]

        concatenated = np.concatenate(head_outputs, axis=-1)
        return concatenated @ self.W_o

# Usage: attend over a random sequence of 10 tokens with 512 features each.
attention = SelfAttention(512)
x = np.random.randn(10, 512)
output, weights = attention.forward(x)

Transformer Block

class TransformerBlock:
    """One post-norm transformer block: attention and feed-forward sublayers.

    Each sublayer is wrapped in a residual connection followed by layer
    normalization.

    Attributes:
        attention: ``MultiHeadAttention`` sublayer.
        norm1: ``LayerNorm`` applied after the attention residual.
        norm2: ``LayerNorm`` applied after the feed-forward residual.
        ffn: ``FeedForward`` sublayer.
    """

    def __init__(self, d_model: int, d_ff: int, num_heads: int):
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.ffn = FeedForward(d_model, d_ff)

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Run *x* through the block and return an array of the same shape."""
        # MultiHeadAttention.forward returns a single array, not a
        # (output, weights) pair, so it must not be tuple-unpacked.
        attn_output = self.attention.forward(x)
        # Call .forward explicitly: LayerNorm exposes its computation there.
        x = self.norm1.forward(x + attn_output)
        ffn_output = self.ffn.forward(x)
        x = self.norm2.forward(x + ffn_output)
        return x

class LayerNorm:
    """Layer normalization over the last axis with a learnable scale and shift.

    Attributes:
        gamma: Per-feature scale, initialized to ones.
        beta: Per-feature shift, initialized to zeros.
        eps: Constant added to the variance to avoid division by zero.
    """

    def __init__(self, d_model: int, eps: float = 1e-6):
        self.gamma = np.ones(d_model)
        self.beta = np.zeros(d_model)
        self.eps = eps

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Normalize *x* to zero mean and unit variance along its last axis.

        Returns:
            ``gamma * normalized + beta``, with the same shape as *x*.
        """
        mean = x.mean(axis=-1, keepdims=True)
        var = x.var(axis=-1, keepdims=True)
        x_norm = (x - mean) / np.sqrt(var + self.eps)
        return self.gamma * x_norm + self.beta

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Allow ``layer(x)`` as shorthand for ``layer.forward(x)``."""
        return self.forward(x)

class FeedForward:
    """Two-layer position-wise feed-forward network with a ReLU in between.

    Attributes:
        W1: ``(d_model, d_ff)`` expansion weights.
        W2: ``(d_ff, d_model)`` projection weights.
    """

    def __init__(self, d_model: int, d_ff: int):
        # Draw W1 before W2 so the global RNG is consumed in a fixed order.
        expand = np.random.randn(d_model, d_ff)
        self.W1 = expand * 0.01
        project = np.random.randn(d_ff, d_model)
        self.W2 = project * 0.01

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Expand *x* to ``d_ff`` features, apply ReLU, project back."""
        hidden = x @ self.W1
        activated = np.maximum(0, hidden)
        return activated @ self.W2

# Usage: push a random 10-token, 512-dimensional sequence through one block
# with a 2048-wide feed-forward layer and 8 attention heads.
transformer = TransformerBlock(512, 2048, 8)
x = np.random.randn(10, 512)
output = transformer.forward(x)

Key Takeaways

  • Tokenization converts text to numerical tokens for processing
  • Embeddings capture semantic meaning in dense vector spaces
  • Self-attention enables models to focus on relevant parts of input
  • Transformers stack attention and feed-forward layers for deep processing
  • Pre-trained models provide powerful starting points for NLP tasks

Premium Content

NLP Fundamentals

Unlock this lesson and 900+ advanced tutorials with a Premium plan.

🎯End-to-end Projects
💼Interview Prep
📜Certificates
🤝Community Access

Already a member? Log in

Need Expert Generative AI Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement