Sentiment Analysis with NLP

Sentiment analysis determines whether text expresses positive, negative, or neutral opinions. It powers brand monitoring, customer feedback analysis, and market research. Modern approaches range from simple lexicons to fine-tuned transformers.

Sentiment Analysis Pipeline

Why Sentiment Analysis Matters

A product launch generates millions of social media posts. Manual review is impossible. Automated sentiment analysis provides real-time insight into public perception, enabling rapid response to crises and opportunities.

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import warnings
warnings.filterwarnings('ignore')

Lexicon-Based Sentiment

Lexicon-based methods are rule-based: they score a text by looking its words up in predefined sentiment dictionaries.

# Simple lexicon-based sentiment
positive_words = {'good', 'great', 'excellent', 'amazing', 'love', 'best', 'awesome', 'fantastic'}
negative_words = {'bad', 'terrible', 'awful', 'hate', 'worst', 'horrible', 'poor', 'disappointing'}

def lexicon_sentiment(text):
    """Classify *text* by counting distinct lexicon words.

    The text is lower-cased and split on whitespace, and punctuation attached
    to either end of a token is removed so that "great!" or "awful," still
    match the lexicon. Each distinct word is counted once.

    Negation is not handled: "not good" still counts as one positive word.

    Args:
        text: The string to classify.

    Returns:
        'positive', 'negative' or 'neutral', depending on whether more
        distinct positive or negative lexicon words occur in the text.
    """
    # ASCII punctuation stripped from token edges (same set as string.punctuation).
    edge_punct = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    words = {token.strip(edge_punct) for token in text.lower().split()}
    pos = len(words & positive_words)
    neg = len(words & negative_words)
    score = pos - neg
    if score > 0:
        return 'positive'
    if score < 0:
        return 'negative'
    return 'neutral'

# VADER (Valence Aware Dictionary and sEntiment Reasoner)
# Handles slang, emojis, capitalization, punctuation
#
# The package is optional: when it is missing, an install hint is printed
# and the demo below is skipped.
try:
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    analyzer = SentimentIntensityAnalyzer()
except ImportError:
    print("Install vaderSentiment: pip install vaderSentiment")
else:
    texts = [
        "This product is AMAZING! Best purchase ever!!!",
        "Terrible quality, broke after one day. Very disappointed.",
        "It's okay, nothing special.",
        "Not bad, could be better though.",
        "Absolutely LOVE this! Worth every penny!"
    ]

    # Print the first 50 characters of each sample with its compound score.
    for text in texts:
        scores = analyzer.polarity_scores(text)
        print(f"Text: {text[:50]}...")
        print(f"  Compound: {scores['compound']:.3f}")
        print()

LSTM Sentiment Classifier

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from collections import Counter

class TextDataset(Dataset):
    """Dataset that encodes whitespace-tokenised text as fixed-length index tensors.

    Texts are lower-cased and split on whitespace. Each token is mapped to its
    vocabulary index, unknown tokens map to ``UNK_IDX``, and every sequence is
    truncated or right-padded with ``PAD_IDX`` to exactly ``max_len`` entries.

    Args:
        texts: Sequence of raw strings.
        labels: Sequence of labels, one per text.
        vocab: Optional existing token -> index mapping (e.g. the training
            vocabulary when building a validation set). When omitted, a
            vocabulary is built from ``texts``.
        max_len: Length of every encoded sequence.
        min_freq: Minimum number of occurrences a token needs to enter a newly
            built vocabulary. Ignored when ``vocab`` is given.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    PAD_TOKEN = '<PAD>'
    UNK_TOKEN = '<UNK>'
    PAD_IDX = 0
    UNK_IDX = 1

    def __init__(self, texts, labels, vocab=None, max_len=100, min_freq=2):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )

        self.texts = texts
        self.labels = labels
        self.max_len = max_len
        self.min_freq = min_freq

        if vocab is None:
            self.build_vocab()
        else:
            self.vocab = vocab

    def build_vocab(self):
        """Build ``self.vocab`` from ``self.texts``.

        Indices 0 and 1 are reserved for the padding and unknown tokens; the
        remaining tokens are numbered in order of first appearance, keeping
        only those that occur at least ``self.min_freq`` times.
        """
        word_counts = Counter()
        for text in self.texts:
            word_counts.update(text.lower().split())

        self.vocab = {self.PAD_TOKEN: self.PAD_IDX, self.UNK_TOKEN: self.UNK_IDX}
        for word, count in word_counts.items():
            if count >= self.min_freq:
                self.vocab[word] = len(self.vocab)

    def encode(self, text):
        """Return a ``torch.long`` tensor of exactly ``max_len`` token indices."""
        tokens = text.lower().split()[:self.max_len]
        indices = [self.vocab.get(token, self.UNK_IDX) for token in tokens]
        # Right-pad short sequences so every sample has the same length.
        indices.extend([self.PAD_IDX] * (self.max_len - len(indices)))
        return torch.tensor(indices, dtype=torch.long)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(encoded_text, label)`` for sample ``idx``."""
        return self.encode(self.texts[idx]), self.labels[idx]

class LSTMSentiment(nn.Module):
    """Bidirectional LSTM classifier with attention pooling over time steps.

    Token indices are embedded (index 0 is the padding index), passed through
    a two-layer bidirectional LSTM, pooled into one vector per sequence with a
    learned attention weighting, and classified by a small feed-forward head.

    Padding is excluded from both the recurrence and the attention weights, so
    the logits for a sequence do not depend on how much trailing padding the
    batch carries.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding dimension.
        hidden_dim: LSTM hidden size per direction.
        n_classes: Number of output logits.
    """

    def __init__(self, vocab_size, embed_dim=64, hidden_dim=128, n_classes=3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True,
                            bidirectional=True, num_layers=2, dropout=0.3)
        # Both directions are concatenated, hence hidden_dim * 2 features.
        self.attention = nn.Linear(hidden_dim * 2, 1)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim * 2, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, n_classes)
        )

    def forward(self, x):
        """Return class logits of shape ``(batch, n_classes)``.

        Args:
            x: Long tensor of token indices with shape ``(batch, seq_len)``,
                padded with index 0.
        """
        seq_len = x.size(1)
        is_token = x.ne(self.embedding.padding_idx)

        # Effective length = position of the last non-pad token. Rows made
        # only of padding are clamped to length 1 so packing and the softmax
        # below stay well defined.
        positions = torch.arange(1, seq_len + 1, device=x.device)
        lengths = (positions * is_token).max(dim=1).values.clamp(min=1)

        embeds = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(
            embeds, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.lstm(packed)
        lstm_out, _ = nn.utils.rnn.pad_packed_sequence(
            packed_out, batch_first=True, total_length=seq_len
        )

        # Attention pooling: steps past each sequence's length get zero weight.
        attn_scores = self.attention(lstm_out).squeeze(-1)
        beyond_end = positions.unsqueeze(0) > lengths.unsqueeze(1)
        attn_scores = attn_scores.masked_fill(beyond_end, float('-inf'))
        attn_weights = torch.softmax(attn_scores, dim=1)
        context = torch.bmm(attn_weights.unsqueeze(1), lstm_out).squeeze(1)

        return self.classifier(context)

# Create sample data: five template reviews repeated 200 times
texts = 200 * [
    "This movie was absolutely fantastic and thrilling",
    "Terrible waste of time, awful acting and plot",
    "Good film with some great performances",
    "The worst movie I have ever seen in my life",
    "Brilliant direction and stunning cinematography",
]

# One label per review above, repeated in the same order.
labels = 200 * [2, 0, 1, 0, 2]  # 0=neg, 1=neutral, 2=pos

dataset = TextDataset(texts, labels)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

model = LSTMSentiment(len(dataset.vocab))
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Train for 10 epochs, reporting the mean batch loss every 5th epoch.
for epoch in range(10):
    model.train()
    total_loss = 0
    for batch_texts, batch_labels in loader:
        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()

        output = model(batch_texts)
        loss = criterion(output, batch_labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    if (epoch + 1) % 5 != 0:
        continue
    print(f"Epoch {epoch+1}: Loss={total_loss/len(loader):.4f}")

Fine-Tuning BERT for Sentiment

from transformers import (
    BertTokenizer, BertForSequenceClassification,
    AdamW, get_linear_schedule_with_warmup
)

class BERTSentiment:
    """Thin wrapper for fine-tuning and querying a BERT sequence classifier.

    Args:
        model_name: Name or path passed to ``from_pretrained`` for both the
            tokenizer and the model.
        n_classes: Number of output labels of the classification head.
    """

    def __init__(self, model_name='bert-base-uncased', n_classes=3):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertForSequenceClassification.from_pretrained(
            model_name, num_labels=n_classes
        ).to(self.device)

    def tokenize(self, texts, max_len=128):
        """Tokenize *texts* into padded, truncated PyTorch tensors."""
        return self.tokenizer(
            texts, padding=True, truncation=True,
            max_length=max_len, return_tensors='pt'
        )

    def train(self, texts, labels, epochs=3, batch_size=16, lr=2e-5):
        """Fine-tune the model on ``(text, label)`` pairs.

        Batches are taken in the order given and are not reshuffled between
        epochs, so callers should shuffle the data beforehand if needed.
        The mean batch loss is printed after every epoch.

        Args:
            texts: Iterable of input strings.
            labels: Iterable of integer class labels, paired with ``texts``.
            epochs: Number of passes over the data.
            batch_size: Number of samples per optimisation step.
            lr: Peak learning rate; it decays linearly to zero over training.

        Raises:
            ValueError: If there is no training data or ``batch_size < 1``.
        """
        dataset = list(zip(texts, labels))
        if not dataset:
            raise ValueError("train() requires at least one (text, label) pair")
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        # Ceiling division: a trailing partial batch is still one step.
        # Floor division here made the schedule end early and divided by zero
        # in the loss average when len(dataset) < batch_size.
        n_batches = (len(dataset) + batch_size - 1) // batch_size

        # transformers' own AdamW is deprecated upstream in favour of
        # torch.optim.AdamW; eps and weight_decay are set to the values that
        # optimizer used by default so training behaves the same.
        optimizer = torch.optim.AdamW(
            self.model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0
        )
        total_steps = n_batches * epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=total_steps
        )

        for epoch in range(epochs):
            self.model.train()
            total_loss = 0

            for i in range(0, len(dataset), batch_size):
                batch = dataset[i:i+batch_size]
                texts_batch = [t for t, _ in batch]
                labels_batch = torch.LongTensor([l for _, l in batch]).to(self.device)

                inputs = self.tokenize(texts_batch)
                inputs = {k: v.to(self.device) for k, v in inputs.items()}

                # Passing labels makes the model compute the loss itself.
                outputs = self.model(**inputs, labels=labels_batch)
                loss = outputs.loss

                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

                total_loss += loss.item()

            print(f"Epoch {epoch+1}: Loss={total_loss/n_batches:.4f}")

    def predict(self, texts):
        """Return the predicted class index for each text as a NumPy array.

        An empty list or tuple yields an empty integer array without calling
        the tokenizer or the model.
        """
        if isinstance(texts, (list, tuple)) and len(texts) == 0:
            return torch.empty(0, dtype=torch.long).numpy()

        self.model.eval()
        inputs = self.tokenize(texts)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self.model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=-1)

        return predictions.cpu().numpy()

# Initialize (requires transformers library)
# Any failure while building the classifier is reported together with its
# actual cause; KeyboardInterrupt and SystemExit are deliberately not caught.
try:
    bert_sentiment = BERTSentiment(n_classes=3)
except Exception as exc:
    print(f"Could not initialize BERT sentiment classifier: {exc!r}")
    print("Install transformers: pip install transformers")
else:
    print("BERT sentiment classifier initialized")

Aspect-Based Sentiment Analysis

class AspectSentimentAnalyzer:
    """Extract sentiment for specific aspects of text.

    An aspect is considered mentioned when one of its keywords occurs as a
    whole word, optionally with a plural "s"/"es" ending, case-insensitively.
    Whole-word matching avoids substring hits such as "team" inside "steam"
    or "cost" inside "costume".
    """

    def __init__(self):
        # Aspect name -> keywords that signal the aspect.
        self.aspects = {
            'product': ['product', 'item', 'device', 'phone', 'laptop'],
            'service': ['service', 'support', 'staff', 'help', 'team'],
            'price': ['price', 'cost', 'value', 'expensive', 'cheap', 'worth'],
            'quality': ['quality', 'build', 'material', 'durable', 'flimsy']
        }
        # Created on first use so aspect extraction works without VADER.
        self._analyzer = None

    def _mentions(self, aspect, text):
        """Return True if *text* contains a keyword of *aspect* as a whole word."""
        import re

        keywords = self.aspects[aspect]
        if not keywords:
            return False
        # Built per call so later edits to self.aspects are honoured.
        alternatives = '|'.join(re.escape(kw) for kw in keywords)
        pattern = r'\b(?:' + alternatives + r')(?:s|es)?\b'
        return re.search(pattern, text, flags=re.IGNORECASE) is not None

    def _get_analyzer(self):
        """Return the shared VADER analyzer, creating it on first use."""
        if self._analyzer is None:
            self._analyzer = SentimentIntensityAnalyzer()
        return self._analyzer

    def extract_aspects(self, text):
        """Return the names of all aspects mentioned in *text*, in dict order."""
        return [aspect for aspect in self.aspects if self._mentions(aspect, text)]

    def analyze(self, text):
        """Score each aspect mentioned in *text*.

        The text is split on '.', and each aspect is scored with the mean
        VADER compound score of the pieces that mention it.

        Returns:
            Dict mapping aspect name to a dict with keys 'sentiment' (mean
            compound score), 'label' ('positive' above 0.05, 'negative' below
            -0.05, otherwise 'neutral') and 'sentences' (number of pieces
            that mention the aspect). Empty when no aspect is mentioned.
        """
        results = {}
        sentences = text.split('.')

        for aspect in self.extract_aspects(text):
            relevant_sentences = [s for s in sentences if self._mentions(aspect, s)]
            if not relevant_sentences:
                continue

            analyzer = self._get_analyzer()
            scores = [analyzer.polarity_scores(s)['compound'] for s in relevant_sentences]
            mean_score = np.mean(scores)

            if mean_score > 0.05:
                label = 'positive'
            elif mean_score < -0.05:
                label = 'negative'
            else:
                label = 'neutral'

            results[aspect] = {
                'sentiment': mean_score,
                'label': label,
                'sentences': len(relevant_sentences)
            }

        return results

# Usage
# A NameError here means SentimentIntensityAnalyzer was never imported, i.e.
# vaderSentiment is missing; any other failure is reported with its cause
# instead of being mislabelled as a missing dependency.
try:
    analyzer = AspectSentimentAnalyzer()
    text = "The product quality is excellent but the price is too expensive. Service was okay."
    results = analyzer.analyze(text)
    for aspect, data in results.items():
        print(f"{aspect}: {data['label']} ({data['sentiment']:.3f})")
except NameError:
    print("Aspect analysis requires vaderSentiment")
except Exception as exc:
    print(f"Aspect analysis failed: {exc!r}")

Data Augmentation for Sentiment

def augment_text(text, n_augments=3, replace_prob=0.3, rng=None):
    """Simple text augmentation via synonym replacement.

    Each augmented copy is built from the original words; every word found in
    the synonym table is swapped for a randomly chosen synonym with
    probability ``replace_prob``. Punctuation attached to a word (as in
    "good.") is kept in place, and an all-caps or capitalised word keeps its
    casing style.

    Args:
        text: The sentence to augment.
        n_augments: Number of augmented copies to generate.
        replace_prob: Probability of replacing each eligible word.
        rng: Optional random source exposing ``random()`` and ``choice()``
            (e.g. ``np.random.default_rng(seed)``). Defaults to NumPy's
            global random state.

    Returns:
        A list whose first element is the original ``text``, followed by
        ``n_augments`` augmented strings (which may equal the original when
        no replacement was drawn).
    """
    synonyms = {
        'good': ['great', 'excellent', 'fine'],
        'bad': ['terrible', 'awful', 'poor'],
        'love': ['adore', 'enjoy', 'appreciate'],
        'hate': ['dislike', 'detest', 'loathe']
    }
    # ASCII punctuation stripped from word edges before the synonym lookup.
    edge_punct = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

    if rng is None:
        rng = np.random

    augmented = [text]
    words = text.split()

    for _ in range(n_augments):
        new_words = words.copy()
        for i, word in enumerate(words):
            core = word.strip(edge_punct)
            options = synonyms.get(core.lower())
            # Only draw a random number for words that can be replaced.
            if options is None or rng.random() >= replace_prob:
                continue

            replacement = str(rng.choice(options))
            if core.isupper():
                replacement = replacement.upper()
            elif core[0].isupper():
                replacement = replacement.capitalize()
            # Swap only the core so surrounding punctuation survives.
            new_words[i] = word.replace(core, replacement, 1)
        augmented.append(' '.join(new_words))

    return augmented

# Test augmentation: print the original sentence followed by its variants,
# one per line, each indented by two spaces.
text = "I love this product, it is very good"
augmented = augment_text(text)
print("\n".join(f"  {t}" for t in augmented))

Best Practices

Start with BERT – fine-tuned transformers dominate sentiment tasks
Domain-specific training – general models miss domain nuances
Handle negation – "not good" should be negative
Aspect-based for reviews – overall sentiment hides useful detail
Balance classes – oversample minority sentiment classes
Monitor drift – language evolves, models need retraining

Summary

Sentiment analysis ranges from lexicon rules to fine-tuned BERT. Lexicons are fast but shallow; LSTMs capture context; transformers achieve state-of-the-art accuracy. Aspect-based analysis reveals fine-grained opinions. Choose an approach based on your accuracy needs and computational constraints.