Text Classification

Text classification assigns categories to text inputs, supporting sentiment analysis, topic categorization, intent detection, and content moderation.

Traditional ML Classification

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

class TraditionalClassifier:
    """Text classifier that chains TF-IDF features into logistic regression."""

    def __init__(self):
        """Assemble the two-step scikit-learn pipeline (vectorizer, then model)."""
        # Unigrams and bigrams, keeping at most 10,000 features.
        vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
        estimator = LogisticRegression(max_iter=1000)
        self.pipeline = Pipeline([
            ('tfidf', vectorizer),
            ('classifier', estimator),
        ])

    def train(self, texts: list, labels: list):
        """Fit on 80% of the data and evaluate on the held-out 20%.

        Args:
            texts: Input documents.
            labels: One label per document, aligned with ``texts``.

        Returns:
            The ``classification_report`` computed on the held-out split.
        """
        # Fixed random_state keeps the split reproducible between runs.
        split = train_test_split(texts, labels, test_size=0.2, random_state=42)
        fit_texts, held_out_texts, fit_labels, held_out_labels = split

        self.pipeline.fit(fit_texts, fit_labels)
        held_out_predictions = self.pipeline.predict(held_out_texts)
        return classification_report(held_out_labels, held_out_predictions)

    def predict(self, texts: list) -> list:
        """Return the pipeline's predicted label for each text."""
        model = self.pipeline
        return model.predict(texts)

    def predict_proba(self, texts: list) -> list:
        """Return the pipeline's per-class probabilities for each text."""
        model = self.pipeline
        return model.predict_proba(texts)

# Usage
# Build the TF-IDF + logistic-regression classifier defined above.
classifier = TraditionalClassifier()
texts = ["Great product!", "Terrible service", "Amazing experience"]
labels = ["positive", "negative", "positive"]
# The three samples are repeated 100 times so train() has 300 rows to split
# 80/20; train() returns the classification report, which is printed here.
print(classifier.train(texts * 100, labels * 100))

Deep Learning Classifier

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

class TextDataset(Dataset):
    """Map-style dataset that turns raw texts into fixed-length index tensors."""

    def __init__(self, texts, labels, vocab, max_len=128):
        """Store the raw data; tokenization happens lazily in ``__getitem__``.

        Args:
            texts: Sequence of strings.
            labels: Sequence of labels, aligned with ``texts``.
            vocab: Mapping from lowercase token to integer index.
            max_len: Length every encoded sample is truncated or padded to.
        """
        self.texts, self.labels = texts, labels
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(token_index_tensor, label_tensor)`` for sample ``idx``."""
        # Lowercase, split on whitespace, and keep at most max_len tokens.
        words = self.texts[idx].lower().split()[:self.max_len]
        # Tokens missing from the vocabulary map to index 0.
        ids = [self.vocab.get(word, 0) for word in words]
        # Right-pad with 0 so every sample has exactly max_len entries.
        ids = ids + [0] * (self.max_len - len(ids))
        label = self.labels[idx]
        return torch.tensor(ids), torch.tensor(label)

class TextCNN(nn.Module):
    """Convolutional text classifier with several parallel kernel widths.

    Token indices are embedded, passed through one ``Conv1d`` per kernel width,
    max-pooled over the sequence dimension, concatenated, and projected to
    class logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding dimension (also the conv input channel count).
        num_classes: Number of output logits.
        num_filters: Output channels of each convolution.
        kernel_sizes: Kernel widths, one convolution per entry. Defaults to
            ``(2, 3, 4, 5)``, matching the previously hard-coded widths.
        dropout: Dropout probability applied before the final linear layer.

    Raises:
        ValueError: If ``kernel_sizes`` is empty.
    """

    def __init__(self, vocab_size, embed_dim, num_classes, num_filters=100,
                 kernel_sizes=(2, 3, 4, 5), dropout=0.5):
        super().__init__()
        kernel_sizes = tuple(kernel_sizes)
        if not kernel_sizes:
            raise ValueError("kernel_sizes must contain at least one kernel width")

        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.convs = nn.ModuleList([
            nn.Conv1d(embed_dim, num_filters, kernel_size=k)
            for k in kernel_sizes
        ])
        self.dropout = nn.Dropout(dropout)
        # Derive the classifier input width from the number of convolutions
        # instead of a magic "* 4" that must be kept in sync by hand.
        self.fc = nn.Linear(num_filters * len(kernel_sizes), num_classes)

    def forward(self, x):
        """Return class logits for a batch of token-index sequences.

        The convolutions use no padding, so the sequence length must be at
        least the largest kernel width.
        """
        # Conv1d expects channels before length, so swap the last two dims.
        embedded = self.embedding(x).permute(0, 2, 1)
        # Global max-pool each feature map over the sequence dimension.
        pooled = [
            torch.relu(conv(embedded)).max(dim=2).values for conv in self.convs
        ]
        concatenated = torch.cat(pooled, dim=1)
        return self.fc(self.dropout(concatenated))

class TextLSTM(nn.Module):
    """Bidirectional LSTM classifier over embedded token indices."""

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        """Create the embedding, the bidirectional LSTM, and the output layer."""
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        # Two directions are concatenated, hence twice the hidden size.
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        """Return class logits computed from the final hidden states."""
        _, (final_hidden, _) = self.lstm(self.embedding(x))
        # The last two entries of the final hidden state are the two directions.
        second_to_last, last = final_hidden[-2], final_hidden[-1]
        features = torch.cat((second_to_last, last), dim=1)
        return self.fc(features)

# Training loop
def train_model(model, dataloader, epochs=10, lr=1e-3):
    """Train ``model`` with Adam and cross-entropy loss, printing the mean loss per epoch.

    Args:
        model: Module mapping a batch of inputs to class logits.
        dataloader: Iterable yielding ``(inputs, labels)`` batches. It does not
            need to support ``len()``; batches are counted while iterating.
        epochs: Number of passes over ``dataloader``.
        lr: Learning rate passed to Adam.

    Raises:
        ValueError: If an epoch yields no batches, since there is nothing to
            train on and no mean loss to report.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        num_batches = 0
        for batch_texts, batch_labels in dataloader:
            optimizer.zero_grad()
            outputs = model(batch_texts)
            loss = criterion(outputs, batch_labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1

        # Counting batches avoids both len() on length-less iterables and a
        # ZeroDivisionError when the loader is empty.
        if num_batches == 0:
            raise ValueError("dataloader yielded no batches; nothing to train on")
        print(f"Epoch {epoch+1}: Loss = {total_loss/num_batches:.4f}")

Transformer-Based Classification

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import numpy as np

class _EncodedTextDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer output with integer labels."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.as_tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One row from every encoded field, plus the "labels" entry the model
        # needs in order to compute a loss.
        item = {key: values[idx] for key, values in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item


class TransformerClassifier:
    """Sequence classifier that fine-tunes a pretrained Hugging Face model."""

    def __init__(self, model_name: str = "bert-base-uncased", num_labels: int = 2):
        """Load the tokenizer and a classification model with ``num_labels`` outputs."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name, num_labels=num_labels
        )

    def tokenize_data(self, texts: list, labels: list, max_length: int = 128):
        """Tokenize ``texts`` into padded, truncated PyTorch tensors.

        Returns:
            A ``(encodings, labels)`` tuple; ``labels`` is passed through unchanged.
        """
        encodings = self.tokenizer(
            texts, truncation=True, padding=True,
            max_length=max_length, return_tensors="pt"
        )
        return encodings, labels

    @staticmethod
    def _build_dataset(encodings, labels):
        """Wrap encodings and labels in a dataset whose items include ``labels``."""
        return _EncodedTextDataset(encodings, labels)

    def compute_metrics(self, eval_pred):
        """Return ``{"accuracy": ...}`` for a ``(logits, labels)`` pair."""
        predictions, labels = eval_pred
        preds = np.argmax(predictions, axis=-1)
        # Plain float keeps the metric serializable when it is logged.
        accuracy = float((preds == labels).mean())
        return {"accuracy": accuracy}

    def train(self, train_texts, train_labels, val_texts, val_labels,
              output_dir: str = "./classifier_model"):
        """Fine-tune the model for three epochs, evaluating after each epoch.

        Args:
            train_texts: Training documents.
            train_labels: Integer class ids aligned with ``train_texts``.
            val_texts: Validation documents.
            val_labels: Integer class ids aligned with ``val_texts``.
            output_dir: Directory handed to ``TrainingArguments``.
        """
        import inspect

        train_encodings, train_labels = self.tokenize_data(train_texts, train_labels)
        val_encodings, val_labels = self.tokenize_data(val_texts, val_labels)

        # Trainer needs per-example dicts that contain "labels"; the raw
        # tokenizer output carries no labels, so no loss could be computed.
        train_dataset = self._build_dataset(train_encodings, train_labels)
        val_dataset = self._build_dataset(val_encodings, val_labels)

        # "evaluation_strategy" was renamed "eval_strategy" in newer
        # transformers releases; use whichever the installed version accepts.
        accepted = inspect.signature(TrainingArguments.__init__).parameters
        strategy_kwarg = (
            "eval_strategy" if "eval_strategy" in accepted else "evaluation_strategy"
        )

        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=3,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=64,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir="./logs",
            **{strategy_kwarg: "epoch"}
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=self.compute_metrics
        )

        trainer.train()

    def predict(self, text: str) -> dict:
        """Classify a single text.

        Returns:
            ``{"class": predicted_index, "confidence": softmax_probability}``.
        """
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True)
        # Training may have moved the model off the CPU; keep inputs with it.
        inputs = inputs.to(self.model.device)

        # Inference only: disable dropout and skip gradient tracking.
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(**inputs)

        probs = torch.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(probs, dim=-1).item()
        confidence = probs[0][predicted_class].item()
        return {"class": predicted_class, "confidence": confidence}

# Usage
# NOTE(review): train_texts, train_labels, val_texts and val_labels are not
# defined anywhere in this snippet; they must be provided before running it.
# num_labels=3 is forwarded to the pretrained model as its number of classes.
classifier = TransformerClassifier(num_labels=3)
classifier.train(train_texts, train_labels, val_texts, val_labels)
# predict() returns a dict with the keys "class" and "confidence".
result = classifier.predict("This movie was fantastic!")

Zero-Shot Classification

class ZeroShotClassifier:
    """Classify text against arbitrary labels with a zero-shot pipeline."""

    def __init__(self, model_name: str = "facebook/bart-large-mnli"):
        """Create the ``zero-shot-classification`` pipeline for ``model_name``."""
        # Imported here so the dependency is only needed when this class is used.
        from transformers import pipeline
        self.classifier = pipeline("zero-shot-classification", model=model_name)

    def classify(self, text: str, candidate_labels: list,
                 hypothesis_template: str = "This text is about {}") -> dict:
        """Score ``text`` against each candidate label.

        Args:
            text: The text to classify.
            candidate_labels: Labels to score the text against.
            hypothesis_template: Template passed to the pipeline; ``{}`` is the
                placeholder for a label.

        Returns:
            A dict with the pipeline's ``labels`` and ``scores`` plus
            ``top_label`` and ``top_score``, taken from the first entry of each.
        """
        result = self.classifier(
            text,
            candidate_labels,
            hypothesis_template=hypothesis_template
        )
        ranked_labels, ranked_scores = result["labels"], result["scores"]
        return {
            "labels": ranked_labels,
            "scores": ranked_scores,
            "top_label": ranked_labels[0],
            "top_score": ranked_scores[0]
        }

    def classify_batch(self, texts: list, labels: list,
                       hypothesis_template: str = "This text is about {}") -> list:
        """Run ``classify`` on every text and return the results in input order."""
        return [
            self.classify(text, labels, hypothesis_template=hypothesis_template)
            for text in texts
        ]

# Usage
# Instantiating with no arguments uses the default "facebook/bart-large-mnli" model.
classifier = ZeroShotClassifier()
result = classifier.classify(
    "The stock market crashed today",
    candidate_labels=["finance", "sports", "politics", "technology"]
)
# Prints a dict with the keys "labels", "scores", "top_label" and "top_score",
# e.g. top_label "finance"; the 0.95 score in the original note is illustrative.
print(result)

Key Takeaways

- Traditional ML works well with limited data and simple tasks.
- Deep learning captures complex patterns when enough training data is available.
- Transformers provide state-of-the-art performance when fine-tuned.
- Zero-shot classification enables classification without task-specific training.
- Multi-label classification supports documents that belong to multiple categories.

Text Classification

Text Classification

Traditional ML Classification

Deep Learning Classifier

Transformer-Based Classification

Zero-Shot Classification

Key Takeaways

Premium Content

Need Expert Generative AI Help?