Text Classification
Text classification assigns categories to text inputs, supporting sentiment analysis, topic categorization, intent detection, and content moderation.
Traditional ML Classification
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
class TraditionalClassifier:
    """TF-IDF features feeding a logistic-regression model, wrapped in one pipeline."""

    def __init__(self):
        # Word unigrams and bigrams, capped at the 10,000 highest-ranked features.
        vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
        estimator = LogisticRegression(max_iter=1000)
        self.pipeline = Pipeline([
            ('tfidf', vectorizer),
            ('classifier', estimator),
        ])

    def train(self, texts: list, labels: list):
        """Fit on 80% of the data and report on the held-out 20%.

        The split uses a fixed seed (42), so repeated calls on the same data
        yield the same partition.

        Returns:
            The text report produced by ``classification_report`` for the
            held-out portion.
        """
        split = train_test_split(texts, labels, test_size=0.2, random_state=42)
        fit_texts, held_out_texts, fit_labels, held_out_labels = split
        self.pipeline.fit(fit_texts, fit_labels)
        held_out_predictions = self.pipeline.predict(held_out_texts)
        return classification_report(held_out_labels, held_out_predictions)

    def predict(self, texts: list) -> list:
        """Return whatever the fitted pipeline's ``predict`` yields for ``texts``."""
        fitted = self.pipeline
        return fitted.predict(texts)

    def predict_proba(self, texts: list) -> list:
        """Return whatever the fitted pipeline's ``predict_proba`` yields for ``texts``."""
        fitted = self.pipeline
        return fitted.predict_proba(texts)
# Usage: three labelled examples, each repeated 100 times for the train/test split.
classifier = TraditionalClassifier()
texts = ["Great product!", "Terrible service", "Amazing experience"]
labels = ["positive", "negative", "positive"]
report = classifier.train(texts * 100, labels * 100)
print(report)
Deep Learning Classifier
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
class TextDataset(Dataset):
    """Texts encoded as fixed-length index tensors, paired with their labels.

    Each text is lower-cased and split on whitespace. Tokens are looked up in
    ``vocab``; index 0 is used both for tokens missing from ``vocab`` and for
    right-padding up to ``max_len``.
    """

    def __init__(self, texts, labels, vocab, max_len=128):
        self.texts, self.labels = texts, labels
        self.vocab, self.max_len = vocab, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(token_index_tensor, label_tensor)`` for example ``idx``."""
        # Truncate first so at most max_len tokens are looked up.
        words = self.texts[idx].lower().split()[:self.max_len]
        token_ids = [self.vocab.get(word, 0) for word in words]
        # Right-pad with zeros to exactly max_len entries.
        token_ids.extend([0] * (self.max_len - len(token_ids)))
        return torch.tensor(token_ids), torch.tensor(self.labels[idx])
class TextCNN(nn.Module):
    """Convolutional text classifier with parallel kernels and max-over-time pooling.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding dimension (input channels of every convolution).
        num_classes: Number of output logits.
        num_filters: Output channels produced by each convolution.
        kernel_sizes: Widths of the parallel 1-D convolutions. Defaults to
            ``(2, 3, 4, 5)``, the values that were previously hard-coded.
        dropout: Dropout probability applied before the final linear layer.

    Raises:
        ValueError: If ``kernel_sizes`` is empty.
    """

    def __init__(self, vocab_size, embed_dim, num_classes, num_filters=100,
                 kernel_sizes=(2, 3, 4, 5), dropout=0.5):
        super().__init__()
        kernel_sizes = tuple(kernel_sizes)
        if not kernel_sizes:
            raise ValueError("kernel_sizes must contain at least one kernel width")
        self.kernel_sizes = kernel_sizes
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.convs = nn.ModuleList([
            nn.Conv1d(embed_dim, num_filters, kernel_size=k)
            for k in kernel_sizes
        ])
        self.dropout = nn.Dropout(dropout)
        # One pooled feature vector per kernel width is concatenated, so the
        # classifier input scales with the number of kernels.
        self.fc = nn.Linear(num_filters * len(kernel_sizes), num_classes)

    def forward(self, x):
        """Map a batch of token indices to class logits.

        The sequence must be at least as long as the largest kernel width;
        NOTE(review): shorter inputs are expected to be rejected by ``Conv1d``.
        """
        # Conv1d expects channels before length, so swap the last two axes
        # of the embedded batch.
        embedded = self.embedding(x).permute(0, 2, 1)
        # Max over the length axis keeps the strongest activation per filter.
        conv_outputs = [torch.relu(conv(embedded)).max(dim=2)[0] for conv in self.convs]
        concatenated = torch.cat(conv_outputs, dim=1)
        return self.fc(self.dropout(concatenated))
class TextLSTM(nn.Module):
    """Bidirectional LSTM classifier over embedded token indices."""

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        # Both directions are concatenated, hence twice the hidden size.
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        """Return class logits computed from the final hidden states of both directions."""
        _, (final_states, _) = self.lstm(self.embedding(x))
        # The last two entries of the final hidden state hold the two
        # directions of the (single) LSTM layer.
        second_last, last = final_states[-2], final_states[-1]
        features = torch.cat((second_last, last), dim=1)
        return self.fc(features)
# Training loop
def train_model(model, dataloader, epochs=10):
    """Train ``model`` with Adam (lr=1e-3) and cross-entropy loss.

    Prints the mean batch loss after every epoch.

    Args:
        model: Module called as ``model(batch_texts)`` to produce logits.
        dataloader: Re-iterable yielding ``(batch_texts, batch_labels)`` pairs.
            It does not need to implement ``__len__``.
        epochs: Number of passes over ``dataloader``.

    Raises:
        ValueError: If an epoch yields no batches, since no mean loss exists.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss()
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        # Count batches while iterating instead of calling len(dataloader):
        # this supports loaders without __len__ and avoids dividing by zero.
        num_batches = 0
        for batch_texts, batch_labels in dataloader:
            optimizer.zero_grad()
            outputs = model(batch_texts)
            loss = criterion(outputs, batch_labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        if num_batches == 0:
            raise ValueError("dataloader yielded no batches; cannot compute an epoch loss")
        print(f"Epoch {epoch+1}: Loss = {total_loss/num_batches:.4f}")
Transformer-Based Classification
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import numpy as np
class _EncodedTextDataset(torch.utils.data.Dataset):
    """Pair tokenizer output with labels as per-example dicts for ``Trainer``."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Slice every encoded field at idx and attach the matching label
        # under the "labels" key.
        item = {key: values[idx] for key, values in self.encodings.items()}
        item["labels"] = torch.as_tensor(self.labels[idx])
        return item


class TransformerClassifier:
    """Fine-tune and query a Hugging Face sequence-classification model."""

    def __init__(self, model_name: str = "bert-base-uncased", num_labels: int = 2):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name, num_labels=num_labels
        )

    def tokenize_data(self, texts: list, labels: list, max_length: int = 128):
        """Tokenize ``texts`` into padded, truncated PyTorch tensors.

        Returns:
            ``(encodings, labels)`` where ``labels`` is passed through unchanged.
        """
        encodings = self.tokenizer(
            texts, truncation=True, padding=True,
            max_length=max_length, return_tensors="pt"
        )
        return encodings, labels

    def compute_metrics(self, eval_pred):
        """Return ``{"accuracy": ...}`` for a ``(predictions, labels)`` pair."""
        predictions, labels = eval_pred
        preds = np.argmax(predictions, axis=-1)
        accuracy = (preds == labels).mean()
        # Plain float rather than a NumPy scalar.
        return {"accuracy": float(accuracy)}

    def train(self, train_texts, train_labels, val_texts, val_labels,
              output_dir: str = "./classifier_model"):
        """Fine-tune the model for three epochs, evaluating after each one.

        Args:
            train_texts, train_labels: Training examples and their class ids.
            val_texts, val_labels: Validation examples and their class ids.
            output_dir: Directory handed to ``TrainingArguments``.
        """
        import inspect

        train_encodings, train_labels = self.tokenize_data(train_texts, train_labels)
        val_encodings, val_labels = self.tokenize_data(val_texts, val_labels)
        # Trainer needs datasets that yield one dict per example including
        # the label; the raw tokenizer output carries no labels at all.
        train_dataset = _EncodedTextDataset(train_encodings, train_labels)
        val_dataset = _EncodedTextDataset(val_encodings, val_labels)

        # Newer transformers releases renamed `evaluation_strategy` to
        # `eval_strategy`; use whichever the installed version accepts.
        accepted = inspect.signature(TrainingArguments).parameters
        strategy_arg = (
            "eval_strategy" if "eval_strategy" in accepted else "evaluation_strategy"
        )
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=3,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=64,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir="./logs",
            **{strategy_arg: "epoch"},
        )
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=self.compute_metrics
        )
        trainer.train()

    def predict(self, text: str) -> dict:
        """Classify one text.

        Switches the model to evaluation mode as a side effect.

        Returns:
            ``{"class": predicted_class_index, "confidence": softmax_probability}``.
        """
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True)
        # Training may have moved the model to an accelerator; inputs must
        # live on the same device.
        inputs = inputs.to(self.model.device)
        # Inference needs dropout disabled and no autograd bookkeeping.
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(**inputs)
        probs = torch.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(probs, dim=-1).item()
        confidence = probs[0][predicted_class].item()
        return {"class": predicted_class, "confidence": confidence}
# Usage: train_texts, train_labels, val_texts and val_labels are not defined
# in this snippet; they must already exist when it runs.
classifier = TransformerClassifier(num_labels=3)
classifier.train(
    train_texts=train_texts,
    train_labels=train_labels,
    val_texts=val_texts,
    val_labels=val_labels,
)
result = classifier.predict("This movie was fantastic!")
Zero-Shot Classification
class ZeroShotClassifier:
    """Classify text against arbitrary labels with an NLI-based zero-shot pipeline."""

    def __init__(self, model_name: str = "facebook/bart-large-mnli"):
        # Imported here so the dependency is only needed once an instance
        # is actually created.
        from transformers import pipeline
        self.classifier = pipeline("zero-shot-classification", model=model_name)

    def classify(self, text: str, candidate_labels: list,
                 hypothesis_template: str = "This text is about {}") -> dict:
        """Score ``text`` against every candidate label.

        Args:
            text: The text to classify.
            candidate_labels: Labels to score.
            hypothesis_template: Template whose ``{}`` is filled with each label.

        Returns:
            A dict with the pipeline's ``labels`` and ``scores`` lists plus
            ``top_label`` and ``top_score``, taken from position 0 of each.
        """
        result = self.classifier(
            text,
            candidate_labels,
            hypothesis_template=hypothesis_template
        )
        return {
            "labels": result["labels"],
            "scores": result["scores"],
            "top_label": result["labels"][0],
            "top_score": result["scores"][0]
        }

    def classify_batch(self, texts: list, labels: list,
                       hypothesis_template: str = "This text is about {}") -> list:
        """Run ``classify`` on each text and return the results in input order."""
        return [
            self.classify(text, labels, hypothesis_template=hypothesis_template)
            for text in texts
        ]
# Usage: score one headline against four candidate topics.
classifier = ZeroShotClassifier()
result = classifier.classify(
    text="The stock market crashed today",
    candidate_labels=["finance", "sports", "politics", "technology"],
)
# The printed dict has "labels", "scores", "top_label" and "top_score" keys.
print(result)
Key Takeaways
- Traditional ML works well with limited data and simple tasks
- Deep learning captures complex patterns with enough training data
- Transformers provide state-of-the-art performance with fine-tuning
- Zero-shot enables classification without task-specific training
- Multi-label classification supports documents that belong to multiple categories at once