Advanced Text Classification
Text classification assigns labels to text — spam detection, topic routing, sentiment analysis. From classic TF-IDF + SVM to fine-tuned BERT, this lesson covers the full spectrum of techniques.
Why Advanced Text Classification Matters
Customer support tickets need routing. Legal documents need categorization. Social media needs content moderation. Modern text classifiers handle millions of documents with superhuman accuracy.
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import MultiLabelBinarizer
import warnings
warnings.filterwarnings('ignore')
Generate Text Classification Dataset
# Build a synthetic, class-balanced corpus. Each document is a bag of words
# sampled from one topic template and mixed with topic-neutral filler words.
np.random.seed(42)
n = 2000  # total number of documents across all classes

categories = {
    'sports': [
        'team won championship game player scored goal match tournament',
        'athlete training olympics gold medal record performance',
        'football basketball soccer baseball league season playoffs',
        'coach strategy draft pick transfer market club victory',
    ],
    'politics': [
        'president election campaign vote policy government senate',
        'democrat republican party congress legislation bill law',
        'debate speech announcement reform campaign agenda',
        'diplomatic foreign policy trade agreement summit meeting',
    ],
    'technology': [
        'software startup app development programming code algorithm',
        'artificial intelligence machine learning data cloud computing',
        'device smartphone laptop gadget launch release product',
        'cybersecurity privacy encryption blockchain cryptocurrency',
    ],
    'entertainment': [
        'movie film actor director box office premiere Oscar',
        'music album concert artist tour performance song',
        'celebrity gossip interview magazine award red carpet',
        'streaming series episode season release premiere show',
    ],
}

# Filler words that carry no topic signal; added to every document as noise.
noise_vocab = ['the', 'a', 'is', 'was', 'has', 'will', 'for', 'with']

documents = []
labels = []
for label, templates in categories.items():
    # Divide the budget by classes AND templates. Dividing by the number of
    # classes alone would generate len(templates) times too many documents.
    docs_per_template = n // (len(categories) * len(templates))
    for template in templates:
        words = template.split()
        for _ in range(docs_per_template):
            # 8-14 topic words, sampled with replacement (randint's upper bound is exclusive)
            selected = np.random.choice(words, size=np.random.randint(8, 15), replace=True)
            # 3-7 filler words
            noise_words = np.random.choice(noise_vocab, size=np.random.randint(3, 8))
            # Shuffle topic and filler words together so word order carries no signal
            doc = ' '.join(np.random.permutation(list(selected) + list(noise_words)))
            documents.append(doc)
            labels.append(label)

# Shuffle documents and labels with the same permutation so pairs stay aligned
indices = np.random.permutation(len(documents))
documents = [documents[i] for i in indices]
labels = [labels[i] for i in indices]

print(f"Dataset: {len(documents)} documents, {len(set(labels))} classes")
print(f"Class distribution: {pd.Series(labels).value_counts().to_dict()}")
TF-IDF + SVM
The classic baseline — fast, effective, and interpretable.
# TF-IDF + Linear SVM pipeline: unigram/bigram TF-IDF features feeding a linear SVM
svm_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=10000,
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.95,
        sublinear_tf=True,
    )),
    ('svm', LinearSVC(C=1.0, max_iter=10000, random_state=42)),
])

# Cross-validation: 5 stratified, shuffled folds. The whole pipeline is passed
# in, so the vectorizer is refit on each training fold.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(svm_pipeline, documents, labels, cv=cv, scoring='accuracy')
print(f"TF-IDF + SVM CV Accuracy: {scores.mean():.4f} ± {scores.std():.4f}")

# Feature importance: refit on all data and inspect the per-class weight vectors
svm_pipeline.fit(documents, labels)
feature_names = svm_pipeline.named_steps['tfidf'].get_feature_names_out()
svm_step = svm_pipeline.named_steps['svm']
coefs = svm_step.coef_

print("\nTop features per class:")
# Rows of coef_ follow svm_step.classes_ (sorted labels), not the insertion
# order of `categories`, so each row is paired with its label from classes_.
for class_label, class_coefs in zip(svm_step.classes_, coefs):
    top_indices = class_coefs.argsort()[-10:][::-1]  # ten largest weights, descending
    top_features = [feature_names[j] for j in top_indices]
    print(f"  {class_label}: {', '.join(top_features)}")
Logistic Regression with TF-IDF
# TF-IDF + multinomial logistic regression, scored with the same
# cross-validation splitter (`cv`) as the SVM baseline so results are comparable.
lr_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1, 2))),
    ('lr', LogisticRegression(C=1.0, max_iter=1000, random_state=42, n_jobs=-1)),
])
scores_lr = cross_val_score(lr_pipeline, documents, labels, cv=cv, scoring='accuracy')
print(f"TF-IDF + LR CV Accuracy: {scores_lr.mean():.4f} ± {scores_lr.std():.4f}")
FastText
Facebook's fastText for fast, effective text classification with subword features.
# FastText format preparation
def create_fasttext_data(documents, labels, filepath):
    """Write documents and labels to ``filepath`` in fastText supervised format.

    Each output line is ``__label__<label> <text>``.

    Args:
        documents: Iterable of document strings.
        labels: Iterable of labels, paired positionally with ``documents``.
        filepath: Destination path; an existing file is overwritten.
    """
    # Explicit UTF-8 so the output does not depend on the platform's default encoding.
    with open(filepath, 'w', encoding='utf-8') as f:
        for doc, label in zip(documents, labels):
            # The format is one example per line, so an embedded newline would
            # split a document into bogus examples. Collapse all whitespace runs
            # (including newlines and tabs) to single spaces.
            text = ' '.join(str(doc).split())
            f.write(f"__label__{label} {text}\n")
# Prepare data: hold out the last 20% of the (already shuffled) corpus for testing.
# The split is a ratio rather than a fixed index so it stays 80/20 at any corpus size.
split_idx = int(0.8 * len(documents))
train_docs = documents[:split_idx]
train_labels = labels[:split_idx]
test_docs = documents[split_idx:]
test_labels = labels[split_idx:]
create_fasttext_data(train_docs, train_labels, 'train.txt')
create_fasttext_data(test_docs, test_labels, 'test.txt')

# Autotune picks hyperparameters by scoring on a validation file. Using the test
# set for that would leak it into model selection, so carve a validation slice
# out of the training data instead.
tune_idx = int(0.9 * len(train_docs))
create_fasttext_data(train_docs[:tune_idx], train_labels[:tune_idx], 'tune_train.txt')
create_fasttext_data(train_docs[tune_idx:], train_labels[tune_idx:], 'valid.txt')

# Train fastText model. Only the import is guarded, so a genuine training error
# is not mistaken for a missing package.
try:
    import fasttext
except ImportError:
    print("Install fasttext: pip install fasttext")
else:
    model = fasttext.train_supervised(
        'train.txt',
        lr=0.5,
        epoch=25,
        wordNgrams=2,
        dim=50,
        loss='softmax',
    )
    # Predict: model.predict returns (labels, probabilities); take the top label
    # and strip the fastText prefix so it compares against the raw labels.
    predictions = [model.predict(doc)[0][0].replace('__label__', '') for doc in test_docs]
    accuracy = sum(p == l for p, l in zip(predictions, test_labels)) / len(test_labels)
    print(f"FastText Accuracy: {accuracy:.4f}")

    # FastText with autotune: search hyperparameters for up to 30 seconds,
    # validating on held-out training data, then score once on the test set.
    model_tuned = fasttext.train_supervised(
        'tune_train.txt', autotuneValidationFile='valid.txt', autotuneDuration=30
    )
    tuned_predictions = [
        model_tuned.predict(doc)[0][0].replace('__label__', '') for doc in test_docs
    ]
    tuned_accuracy = sum(p == l for p, l in zip(tuned_predictions, test_labels)) / len(test_labels)
    print(f"FastText (autotuned) Accuracy: {tuned_accuracy:.4f}")
Fine-Tuning BERT
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, AdamW
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and encodes its label as an int.

    Args:
        texts: Indexable sequence of raw strings.
        labels: Indexable sequence of labels, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')`` that
            returns a mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_len: Length every sequence is padded or truncated to.
        label_map: Optional ``{label: index}`` mapping. Pass the training
            dataset's mapping when building a validation or test dataset so the
            indices agree. When omitted, one is derived from ``labels``.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128, label_map=None):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        if label_map is None:
            # Sort the distinct labels: plain set() iteration order for strings
            # varies between interpreter runs, which would silently change the
            # label -> index assignment from one run to the next.
            label_map = {label: idx for idx, label in enumerate(sorted(set(labels)))}
        self.label_map = label_map

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        return {
            # Drop only the leading batch dimension; a bare squeeze() would also
            # collapse the sequence dimension when max_len == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': self.label_map[self.labels[idx]],
        }
class BERTClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        n_classes: Number of output logits.
        model_name: Name or path handed to ``BertModel.from_pretrained``.
    """

    def __init__(self, n_classes, model_name='bert-base-uncased'):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        # Size the head from the loaded encoder's config instead of hard-coding
        # 768, so checkpoints with a different hidden size also work.
        self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return classification logits computed from the encoder's pooled output."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.pooler_output
        return self.classifier(self.dropout(pooled))
# Training function
def train_bert(texts, labels, n_classes=4, epochs=3, batch_size=16):
    """Fine-tune a ``BERTClassifier`` on ``texts``/``labels``.

    Runs on CUDA when available, otherwise on CPU. After every epoch it prints
    the mean batch loss and the training accuracy.

    Args:
        texts: Sequence of raw strings.
        labels: Sequence of labels aligned with ``texts``.
        n_classes: Size of the classifier output; must be at least the number
            of distinct values in ``labels``.
        epochs: Number of passes over the data.
        batch_size: Mini-batch size for the ``DataLoader``.

    Returns:
        ``(model, tokenizer)``. The label -> index mapping used for training is
        also attached to the model as ``model.label_map`` so predictions can be
        decoded later.

    Raises:
        ValueError: If the inputs are empty or misaligned, or if ``labels``
            contains more distinct values than ``n_classes``.
    """
    # Validate before loading any pretrained weights so bad input fails fast.
    if len(texts) != len(labels):
        raise ValueError(
            f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
        )
    if len(texts) == 0:
        raise ValueError("train_bert requires at least one training example")
    n_distinct = len(set(labels))
    if n_distinct > n_classes:
        raise ValueError(
            f"labels contain {n_distinct} distinct classes but n_classes={n_classes}"
        )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    dataset = TextClassificationDataset(texts, labels, tokenizer)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    model = BERTClassifier(n_classes).to(device)
    # Use PyTorch's AdamW; the optimizer class exported by transformers is deprecated.
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        correct = 0
        total = 0
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels_batch = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels_batch)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            predicted = outputs.argmax(dim=1)
            correct += (predicted == labels_batch).sum().item()
            total += labels_batch.size(0)
        accuracy = correct / total
        print(f"Epoch {epoch+1}: Loss={total_loss/len(loader):.4f}, Accuracy={accuracy:.4f}")

    # Keep the mapping with the model; without it the caller cannot translate
    # predicted indices back into the original labels.
    model.label_map = dataset.label_map
    return model, tokenizer
# Demo only: train_bert is not invoked here; just announce that the pipeline
# is defined and point to production-grade training options.
print(
    "BERT fine-tuning pipeline ready",
    "For production: use transformers.Trainer or PyTorch Lightning",
    sep="\n",
)
Multi-Label Classification
class MultiLabelClassifier:
    """Multi-label text classification with binary relevance.

    One independent binary classifier is trained per label on shared TF-IDF
    features (vocabulary capped at 5000 terms).

    Args:
        n_labels: Number of label columns.
    """

    def __init__(self, n_labels):
        self.n_labels = n_labels
        self.classifiers = []
        self.vectorizer = TfidfVectorizer(max_features=5000)

    def fit(self, texts, labels_matrix):
        """Train one binary classifier per label.

        Args:
            texts: Iterable of raw documents.
            labels_matrix: 0/1 indicator array-like of shape
                ``(n_documents, n_labels)``.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            ValueError: If ``labels_matrix`` is not 2-D with ``n_labels`` columns.
        """
        from sklearn.dummy import DummyClassifier

        labels_matrix = np.asarray(labels_matrix)  # also accept nested lists
        if labels_matrix.ndim != 2 or labels_matrix.shape[1] != self.n_labels:
            raise ValueError(
                f"labels_matrix must have shape (n_documents, {self.n_labels}), "
                f"got {labels_matrix.shape}"
            )

        X = self.vectorizer.fit_transform(texts)
        # Build into a fresh list so a second fit() replaces the old models
        # instead of appending to them.
        classifiers = []
        for i in range(self.n_labels):
            y = labels_matrix[:, i]
            if np.unique(y).size < 2:
                # LogisticRegression cannot fit a single-class target; fall
                # back to a model that always predicts the observed value.
                clf = DummyClassifier(strategy='most_frequent')
            else:
                clf = LogisticRegression(C=1.0, max_iter=500)
            clf.fit(X, y)
            classifiers.append(clf)
        self.classifiers = classifiers
        return self

    def predict(self, texts, threshold=0.5):
        """Return a ``(n_documents, n_labels)`` float array of 0/1 decisions.

        A label is switched on when its classifier's probability for class 1
        is at least ``threshold``.
        """
        X = self.vectorizer.transform(texts)
        predictions = np.zeros((X.shape[0], self.n_labels))
        for i, clf in enumerate(self.classifiers):
            # Locate the positive class through classes_ rather than assuming
            # it is column 1; a label never seen as positive keeps its zeros.
            positive = np.flatnonzero(clf.classes_ == 1)
            if positive.size:
                predictions[:, i] = clf.predict_proba(X)[:, positive[0]] >= threshold
        return predictions
# Multi-label data: random 0/1 targets for the first n_docs documents
np.random.seed(42)
label_names = ['positive', 'negative', 'question', 'urgent']
n_labels = len(label_names)
n_docs = 500

# Each label is switched on independently with probability 0.3 ...
multi_labels = np.random.binomial(n=1, p=0.3, size=(n_docs, n_labels))
# ... then column 0 is overwritten with the exact complement of column 1,
# so the first two labels are never both on and never both off.
negative_column = multi_labels[:, 1]
multi_labels[:, 0] = 1 - negative_column

demo_texts = documents[:n_docs]
mlc = MultiLabelClassifier(n_labels)
mlc.fit(demo_texts, multi_labels)

# Predict on the same documents used for fitting, with a 0.3 threshold
# in place of the default 0.5
preds = mlc.predict(demo_texts, threshold=0.3)
print(f"Multi-label predictions: {preds.shape}")
print(f"Average labels per doc: {preds.sum(axis=1).mean():.1f}")
Hierarchical Classification
class HierarchicalClassifier:
    """Two-level classification: coarse then fine-grained.

    A single coarse model routes each text to a coarse class. Every coarse
    class that has more than one fine label gets its own fine-grained model,
    trained only on that class's texts.
    """

    def __init__(self):
        self.coarse_clf = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=5000)),
            ('clf', LinearSVC()),
        ])
        # coarse label -> fitted fine-grained pipeline
        self.fine_classifiers = {}
        # coarse label -> its only fine label, for classes that need no model
        self.single_fine_label = {}

    def fit(self, texts, coarse_labels, fine_labels):
        """Fit the coarse model and one fine model per multi-label coarse class.

        Args:
            texts: Iterable of raw documents.
            coarse_labels: Coarse label per document.
            fine_labels: Fine label per document.

        Returns:
            ``self``, so calls can be chained.
        """
        texts = list(texts)
        coarse_labels = list(coarse_labels)
        fine_labels = list(fine_labels)

        self.coarse_clf.fit(texts, coarse_labels)

        # Start from a clean slate so a refit never keeps stale fine models.
        self.fine_classifiers = {}
        self.single_fine_label = {}

        # Group the training data by coarse label in one pass instead of
        # rescanning every document once per coarse class.
        grouped = {}
        for text, coarse, fine in zip(texts, coarse_labels, fine_labels):
            bucket_texts, bucket_fine = grouped.setdefault(coarse, ([], []))
            bucket_texts.append(text)
            bucket_fine.append(fine)

        for coarse, (fine_texts, fine_targets) in grouped.items():
            distinct = set(fine_targets)
            if len(distinct) > 1:
                clf = Pipeline([
                    ('tfidf', TfidfVectorizer(max_features=3000)),
                    ('clf', LinearSVC()),
                ])
                clf.fit(fine_texts, fine_targets)
                self.fine_classifiers[coarse] = clf
            else:
                # Only one possible fine label: remember it, no model needed.
                self.single_fine_label[coarse] = next(iter(distinct))
        return self

    def predict(self, texts):
        """Return ``(coarse, fine)`` predictions for ``texts``.

        ``coarse`` is the coarse model's prediction array; ``fine`` is a list
        aligned with ``texts``. For a coarse class without a fine model, the
        fine prediction is that class's single known fine label, or the coarse
        label itself if none was recorded.
        """
        texts = list(texts)
        coarse = self.coarse_clf.predict(texts)

        # Bucket document positions by predicted coarse class so each fine
        # model runs once per batch rather than once per document.
        positions = {}
        for idx, c in enumerate(coarse):
            positions.setdefault(c, []).append(idx)

        fine = [None] * len(texts)
        for c, idxs in positions.items():
            clf = self.fine_classifiers.get(c)
            if clf is not None:
                batch_preds = clf.predict([texts[i] for i in idxs])
                for i, pred in zip(idxs, batch_preds):
                    fine[i] = pred
            else:
                fallback = self.single_fine_label.get(c, c)
                for i in idxs:
                    fine[i] = fallback
        return coarse, fine
Best Practices
- TF-IDF + SVM as baseline — fast, interpretable, often competitive
- BERT for accuracy — fine-tune when data is sufficient (>1000 samples per class)
- fastText for speed — near-BERT accuracy with SVM-like speed
- Multi-label needs threshold tuning — use precision-recall curves
- Class weights for imbalance — critical for rare categories
- Ensemble diverse models — combine TF-IDF and neural approaches
Summary
Text classification spans from TF-IDF + SVM baselines to fine-tuned transformers. Start simple, add complexity as needed. BERT achieves state-of-the-art but requires more data and compute. fastText bridges the gap for practical applications.