Advanced Text Classification

Text classification assigns labels to text – spam detection, topic routing, sentiment analysis. From classic TF-IDF + SVM to fine-tuned BERT, this lesson covers the full spectrum of techniques.

Why Advanced Text Classification Matters

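Customer support tickets need routing. Legal documents need categorization. Social media needs content moderation. Modern text classifiers can process millions of documents far faster than human reviewers, and on well-defined tasks with enough labeled data their accuracy can approach that of human annotators.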

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import MultiLabelBinarizer
import warnings
warnings.filterwarnings('ignore')

Generate Text Classification Dataset

# Build a synthetic, perfectly balanced corpus of `n` short documents.
# Each document is a bag of words sampled (with replacement) from one
# category template, mixed with a few stop-word-like noise tokens.
np.random.seed(42)
n = 2000

categories = {
    'sports': [
        'team won championship game player scored goal match tournament',
        'athlete training olympics gold medal record performance',
        'football basketball soccer baseball league season playoffs',
        'coach strategy draft pick transfer market club victory'
    ],
    'politics': [
        'president election campaign vote policy government senate',
        'democrat republican party congress legislation bill law',
        'debate speech announcement reform campaign agenda',
        'diplomatic foreign policy trade agreement summit meeting'
    ],
    'technology': [
        'software startup app development programming code algorithm',
        'artificial intelligence machine learning data cloud computing',
        'device smartphone laptop gadget launch release product',
        'cybersecurity privacy encryption blockchain cryptocurrency'
    ],
    'entertainment': [
        'movie film actor director box office premiere Oscar',
        'music album concert artist tour performance song',
        'celebrity gossip interview magazine award red carpet',
        'streaming series episode season release premiere show'
    ]
}

NOISE_VOCAB = ['the', 'a', 'is', 'was', 'has', 'will', 'for', 'with']

documents = []
labels = []
docs_per_category = n // len(categories)
for label, templates in categories.items():
    # Split the category quota across its templates so the corpus holds
    # `n` documents in total (the quota used to be applied per template,
    # which silently produced len(templates) times as many documents).
    docs_per_template = docs_per_category // len(templates)
    for template in templates:
        words = template.split()
        for _ in range(docs_per_template):
            # Randomly combine 8-14 template words (repeats allowed)
            selected = np.random.choice(words, size=np.random.randint(8, 15), replace=True)
            # Add 3-7 noise tokens and shuffle everything together
            noise_words = np.random.choice(NOISE_VOCAB, size=np.random.randint(3, 8))
            doc = ' '.join(np.random.permutation(list(selected) + list(noise_words)))
            documents.append(doc)
            labels.append(label)

# Shuffle documents and labels with the same permutation
indices = np.random.permutation(len(documents))
documents = [documents[i] for i in indices]
labels = [labels[i] for i in indices]

print(f"Dataset: {len(documents)} documents, {len(set(labels))} classes")
print(f"Class distribution: {pd.Series(labels).value_counts().to_dict()}")

TF-IDF + SVM

The classic baseline – fast, effective, and interpretable.

# TF-IDF + Linear SVM pipeline
svm_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=10000,
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.95,
        sublinear_tf=True
    )),
    ('svm', LinearSVC(C=1.0, max_iter=10000, random_state=42))
])

# Cross-validation (the same `cv` splitter is reused by later models so
# their scores are computed on identical folds)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(svm_pipeline, documents, labels, cv=cv, scoring='accuracy')
print(f"TF-IDF + SVM CV Accuracy: {scores.mean():.4f} ± {scores.std():.4f}")

# Feature importance: refit on the full corpus and inspect the linear weights
svm_pipeline.fit(documents, labels)
feature_names = svm_pipeline.named_steps['tfidf'].get_feature_names_out()
svm_model = svm_pipeline.named_steps['svm']
coefs = svm_model.coef_

print("\nTop features per class:")
# Rows of coef_ follow svm_model.classes_ (labels in sorted order), NOT the
# insertion order of `categories`; pairing them with categories.keys()
# would print each feature list under the wrong class name.
for class_label, class_coefs in zip(svm_model.classes_, coefs):
    top_indices = class_coefs.argsort()[-10:][::-1]
    top_features = [feature_names[j] for j in top_indices]
    print(f"  {class_label}: {', '.join(top_features)}")

Logistic Regression with TF-IDF

# Logistic-regression counterpart of the SVM baseline: unigram + bigram
# TF-IDF features feeding a regularised linear model.
lr_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
lr_model = LogisticRegression(C=1.0, max_iter=1000, random_state=42, n_jobs=-1)
lr_pipeline = Pipeline([('tfidf', lr_vectorizer), ('lr', lr_model)])

# Score with the shared `cv` splitter so the folds match the other models.
scores_lr = cross_val_score(
    lr_pipeline,
    documents,
    labels,
    cv=cv,
    scoring='accuracy',
)
lr_mean, lr_std = scores_lr.mean(), scores_lr.std()
print(f"TF-IDF + LR CV Accuracy: {lr_mean:.4f} ± {lr_std:.4f}")

FastText

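fastText, an open-source library from Facebook AI Research, trains linear text classifiers on averaged word and word n-gram embeddings. It is very fast to train and can optionally add character n-gram (subword) features.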

# FastText format preparation
def create_fasttext_data(documents, labels, filepath):
    """Write documents in fastText's supervised training format.

    One example is written per line as ``__label__<label> <text>``.

    Args:
        documents: Iterable of document texts.
        labels: Iterable of labels, paired positionally with ``documents``.
            Labels are written verbatim, so they should not contain
            whitespace.
        filepath: Destination path; an existing file is overwritten.
    """
    # fastText reads its input as UTF-8, so do not rely on the platform's
    # default encoding.
    with open(filepath, 'w', encoding='utf-8') as f:
        for doc, label in zip(documents, labels):
            # Collapse all whitespace runs (including embedded newlines) to
            # single spaces: a newline inside a document would otherwise
            # start a second, unlabeled example.
            text = ' '.join(str(doc).split())
            f.write(f"__label__{label} {text}\n")

# Prepare data: 80% train / 10% validation / 10% test.
# The cut points are derived from the corpus size instead of being
# hard-coded, so the proportions hold whatever len(documents) is.
n_total = len(documents)
train_end = int(0.8 * n_total)
valid_end = int(0.9 * n_total)

train_docs, train_labels = documents[:train_end], labels[:train_end]
valid_docs, valid_labels = documents[train_end:valid_end], labels[train_end:valid_end]
test_docs, test_labels = documents[valid_end:], labels[valid_end:]

create_fasttext_data(train_docs, train_labels, 'train.txt')
create_fasttext_data(valid_docs, valid_labels, 'valid.txt')
create_fasttext_data(test_docs, test_labels, 'test.txt')


def _fasttext_accuracy(ft_model, docs, gold_labels):
    """Return the fraction of `docs` whose top-1 prediction matches the gold label."""
    predicted = [ft_model.predict(doc)[0][0].replace('__label__', '') for doc in docs]
    return sum(p == g for p, g in zip(predicted, gold_labels)) / len(gold_labels)


# Train fastText model (optional dependency: only the import is guarded)
try:
    import fasttext
except ImportError:
    print("Install fasttext: pip install fasttext")
else:
    model = fasttext.train_supervised(
        'train.txt',
        lr=0.5,
        epoch=25,
        wordNgrams=2,
        dim=50,
        loss='softmax'
    )

    # Evaluate on the held-out test split
    accuracy = _fasttext_accuracy(model, test_docs, test_labels)
    print(f"FastText Accuracy: {accuracy:.4f}")

    # FastText with autotune. Hyperparameters are searched against the
    # validation split; tuning against test.txt would leak the test set
    # into model selection and inflate the reported accuracy.
    model_tuned = fasttext.train_supervised(
        'train.txt', autotuneValidationFile='valid.txt', autotuneDuration=30
    )
    tuned_accuracy = _fasttext_accuracy(model_tuned, test_docs, test_labels)
    print(f"FastText (autotuned) Accuracy: {tuned_accuracy:.4f}")

Fine-Tuning BERT

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, AdamW

class TextClassificationDataset(Dataset):
    """Torch dataset that tokenizes texts on access and maps labels to ids.

    Args:
        texts: Sequence of raw document strings.
        labels: Sequence of labels, one per text. Labels must be hashable
            and mutually sortable (e.g. all strings or all ints).
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')``
            and returning a mapping with ``input_ids`` and
            ``attention_mask`` entries.
        max_len: Length every example is padded or truncated to.
        label_map: Optional ``{label: class_index}`` mapping. Pass the
            training set's mapping when building a validation/test dataset
            so both share the same class indices.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128, label_map=None):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        if label_map is None:
            # Sort before enumerating: iteration order of a set of strings
            # changes between interpreter runs (hash randomization), which
            # would assign different class indices on every run.
            label_map = {label: idx for idx, label in enumerate(sorted(set(labels)))}
        self.label_map = label_map

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            # Drop only the leading batch dimension added by
            # return_tensors='pt'; a bare squeeze() would also remove the
            # sequence dimension when max_len == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': self.label_map[self.labels[idx]]
        }

class BERTClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        n_classes: Number of output classes (size of the logit vector).
        model_name: Name or path of the pretrained checkpoint passed to
            ``BertModel.from_pretrained``.
    """

    def __init__(self, n_classes, model_name='bert-base-uncased'):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        # Size the head from the loaded checkpoint instead of assuming 768,
        # so checkpoints with a different hidden size work with any
        # `model_name`.
        self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class logits for a batch of token ids."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.pooler_output
        return self.classifier(self.dropout(pooled))

# Training function
def train_bert(texts, labels, n_classes=4, epochs=3, batch_size=16):
    """Fine-tune a BERT classifier and report loss/accuracy per epoch.

    Args:
        texts: Sequence of training documents.
        labels: Sequence of labels, one per document.
        n_classes: Size of the classification head; must be at least the
            number of distinct values in ``labels``.
        epochs: Number of passes over the training data.
        batch_size: Mini-batch size.

    Returns:
        Tuple ``(model, tokenizer)`` with the fine-tuned model and the
        tokenizer used to encode the inputs.

    Raises:
        ValueError: If ``texts`` is empty, or if ``labels`` contains more
            distinct classes than ``n_classes``.
    """
    # Validate before downloading/loading any pretrained weights.
    if len(texts) == 0:
        raise ValueError("train_bert requires at least one training example")
    n_found = len(set(labels))
    if n_found > n_classes:
        raise ValueError(
            f"labels contain {n_found} distinct classes but n_classes={n_classes}"
        )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    dataset = TextClassificationDataset(texts, labels, tokenizer)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    model = BERTClassifier(n_classes).to(device)
    # Use PyTorch's AdamW: the transformers implementation is deprecated in
    # favour of torch.optim.AdamW. weight_decay=0.0 keeps the same default
    # the transformers optimizer used.
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.0)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        correct = 0
        total = 0

        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels_batch = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels_batch)
            loss.backward()
            optimizer.step()

            # Running training metrics (measured in train mode, so dropout
            # is active; these are not validation scores).
            total_loss += loss.item()
            predicted = outputs.detach().argmax(dim=1)
            correct += (predicted == labels_batch).sum().item()
            total += labels_batch.size(0)

        accuracy = correct / total
        print(f"Epoch {epoch+1}: Loss={total_loss/len(loader):.4f}, Accuracy={accuracy:.4f}")

    return model, tokenizer

# Demo only: training is not launched here; just announce that the
# fine-tuning pieces above are defined.
for notice in (
    "BERT fine-tuning pipeline ready",
    "For production: use transformers.Trainer or PyTorch Lightning",
):
    print(notice)

Multi-Label Classification

class MultiLabelClassifier:
    """Multi-label text classification with binary relevance.

    One independent binary logistic-regression model is trained per label
    on shared TF-IDF features.

    Args:
        n_labels: Number of label columns to model.
    """

    def __init__(self, n_labels):
        self.n_labels = n_labels
        self.classifiers = []
        self.vectorizer = TfidfVectorizer(max_features=5000)

    def fit(self, texts, labels_matrix):
        """Train one binary classifier per label.

        Args:
            texts: Iterable of documents.
            labels_matrix: Array-like of shape ``(n_documents, n_labels)``
                holding 0/1 indicators.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            ValueError: If ``labels_matrix`` is not two-dimensional with
                ``n_labels`` columns.
        """
        labels_matrix = np.asarray(labels_matrix)  # accept nested lists too
        if labels_matrix.ndim != 2 or labels_matrix.shape[1] != self.n_labels:
            raise ValueError(
                f"labels_matrix must have shape (n_documents, {self.n_labels}), "
                f"got {labels_matrix.shape}"
            )

        X = self.vectorizer.fit_transform(texts)

        # Build a fresh list: appending to the existing one would leave
        # stale models from a previous fit() in place, and predict() would
        # then index past the last label column.
        classifiers = []
        for i in range(self.n_labels):
            clf = LogisticRegression(C=1.0, max_iter=500)
            clf.fit(X, labels_matrix[:, i])
            classifiers.append(clf)
        self.classifiers = classifiers
        return self

    def predict(self, texts, threshold=0.5):
        """Return a 0/1 float matrix of shape ``(n_documents, n_labels)``.

        A label is switched on when its classifier's probability for class
        1 is at least ``threshold``.
        """
        X = self.vectorizer.transform(texts)
        # Size the output from X so `texts` may be any iterable, not only a
        # sized sequence.
        predictions = np.zeros((X.shape[0], self.n_labels))

        for i, clf in enumerate(self.classifiers):
            predictions[:, i] = clf.predict_proba(X)[:, 1] >= threshold

        return predictions

# Multi-label data: random 0/1 targets attached to the first n_docs documents
np.random.seed(42)
label_names = ['positive', 'negative', 'question', 'urgent']
n_labels = len(label_names)
n_docs = 500

multi_labels = np.random.binomial(1, 0.3, size=(n_docs, n_labels))
# Column 0 ('positive') is overwritten with the complement of column 1
# ('negative'), so every document carries exactly one of the two.
multi_labels[:, 0] = 1 - multi_labels[:, 1]

demo_docs = documents[:n_docs]
mlc = MultiLabelClassifier(n_labels)
mlc.fit(demo_docs, multi_labels)

# Predict on the same documents that were used for fitting, with a
# threshold of 0.3 instead of the default 0.5.
preds = mlc.predict(demo_docs, threshold=0.3)
labels_per_doc = preds.sum(axis=1)
print(f"Multi-label predictions: {preds.shape}")
print(f"Average labels per doc: {labels_per_doc.mean():.1f}")

Hierarchical Classification

class HierarchicalClassifier:
    """Two-level classification: coarse then fine-grained.

    A single coarse model picks the top-level class; a separate fine model,
    trained only on that class's documents, then picks the fine label.
    """

    def __init__(self):
        self.coarse_clf = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=5000)),
            ('clf', LinearSVC())
        ])
        # coarse label -> fitted fine-grained pipeline (only for coarse
        # classes that have at least two distinct fine labels)
        self.fine_classifiers = {}
        # coarse label -> its only fine label (coarse classes with exactly
        # one fine label need no model)
        self.single_fine_labels = {}

    def fit(self, texts, coarse_labels, fine_labels):
        """Fit the coarse model and one fine model per coarse class.

        Args:
            texts: Iterable of documents.
            coarse_labels: Top-level label of each document.
            fine_labels: Fine-grained label of each document.

        Returns:
            ``self``, so calls can be chained.
        """
        texts = list(texts)
        self.coarse_clf.fit(texts, coarse_labels)

        # Start from a clean slate so models from an earlier fit() cannot
        # leak into this one.
        self.fine_classifiers = {}
        self.single_fine_labels = {}

        # Group documents by coarse label in a single pass.
        grouped = {}
        for text, coarse, fine in zip(texts, coarse_labels, fine_labels):
            group_texts, group_fine = grouped.setdefault(coarse, ([], []))
            group_texts.append(text)
            group_fine.append(fine)

        for coarse, (group_texts, group_fine) in grouped.items():
            distinct_fine = set(group_fine)
            if len(distinct_fine) > 1:
                clf = Pipeline([
                    ('tfidf', TfidfVectorizer(max_features=3000)),
                    ('clf', LinearSVC())
                ])
                clf.fit(group_texts, group_fine)
                self.fine_classifiers[coarse] = clf
            else:
                # Only one possible answer: remember it instead of training.
                self.single_fine_labels[coarse] = next(iter(distinct_fine))
        return self

    def predict(self, texts):
        """Predict coarse and fine labels.

        Returns:
            Tuple ``(coarse, fine)``: the coarse model's prediction array
            and a list of fine labels aligned with ``texts``.
        """
        texts = list(texts)
        coarse = self.coarse_clf.predict(texts)

        # Batch documents by predicted coarse class so each fine model is
        # called once rather than once per document.
        positions_by_coarse = {}
        for position, c in enumerate(coarse):
            positions_by_coarse.setdefault(c, []).append(position)

        fine = [None] * len(texts)
        for c, positions in positions_by_coarse.items():
            clf = self.fine_classifiers.get(c)
            if clf is not None:
                group_preds = clf.predict([texts[p] for p in positions])
                for p, pred in zip(positions, group_preds):
                    fine[p] = pred
            else:
                # Use the class's single known fine label; fall back to the
                # coarse label itself only if nothing was recorded for it.
                fallback = self.single_fine_labels.get(c, c)
                for p in positions:
                    fine[p] = fallback
        return coarse, fine

Best Practices

TF-IDF + SVM as baseline – fast, interpretable, often competitive
BERT for accuracy – fine-tune when data is sufficient (>1000 samples per class)
fastText for speed – often close to BERT accuracy on simple tasks, with SVM-like speed
Multi-label needs threshold tuning – use precision-recall curves
Class weights for imbalance – critical for rare categories
Ensemble diverse models – combine TF-IDF and neural approaches

Summary

Text classification spans from TF-IDF + SVM baselines to fine-tuned transformers. Start simple, add complexity as needed. BERT achieves state-of-the-art accuracy but requires more data and compute. fastText bridges the gap for practical applications.