Sentiment Analysis with NLP
Sentiment analysis determines whether text expresses positive, negative, or neutral opinions. It powers brand monitoring, customer feedback analysis, and market research. Modern approaches range from simple lexicons to fine-tuned transformers.
Sentiment Analysis Pipeline
Why Sentiment Analysis Matters
A product launch generates millions of social media posts. Manual review is impossible. Automated sentiment analysis provides real-time insight into public perception, enabling rapid response to crises and opportunities.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import warnings
warnings.filterwarnings('ignore')
Lexicon-Based Sentiment
Rule-based approaches using predefined sentiment dictionaries.
# Simple lexicon-based sentiment
positive_words = {'good', 'great', 'excellent', 'amazing', 'love', 'best', 'awesome', 'fantastic'}
negative_words = {'bad', 'terrible', 'awful', 'hate', 'worst', 'horrible', 'poor', 'disappointing'}


def lexicon_sentiment(text):
    """Classify *text* as 'positive', 'negative' or 'neutral' by lexicon lookup.

    The text is lower-cased and split on whitespace; each token has its
    surrounding punctuation removed before it is compared with
    ``positive_words`` and ``negative_words``. Every distinct word counts
    once. The label follows the sign of (positive hits - negative hits).

    Args:
        text: The string to classify.

    Returns:
        'positive', 'negative' or 'neutral'.
    """
    import string  # local import keeps this snippet self-contained

    # Without stripping, "great!" or "awful," would never match the lexicon.
    words = {token.strip(string.punctuation) for token in text.lower().split()}
    score = len(words & positive_words) - len(words & negative_words)
    if score > 0:
        return 'positive'
    if score < 0:
        return 'negative'
    return 'neutral'
# VADER (Valence Aware Dictionary and sEntiment Reasoner)
# Handles slang, emojis, capitalization, punctuation
try:
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

    analyzer = SentimentIntensityAnalyzer()
    texts = [
        "This product is AMAZING! Best purchase ever!!!",
        "Terrible quality, broke after one day. Very disappointed.",
        "It's okay, nothing special.",
        "Not bad, could be better though.",
        "Absolutely LOVE this! Worth every penny!"
    ]

    for text in texts:
        scores = analyzer.polarity_scores(text)
        # One call emits the text line, the compound line and a blank line.
        print(
            f"Text: {text[:50]}...",
            f" Compound: {scores['compound']:.3f}",
            "",
            sep="\n",
        )
except ImportError:
    # The optional dependency is missing; tell the reader how to get it.
    print("Install vaderSentiment: pip install vaderSentiment")
LSTM Sentiment Classifier
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from collections import Counter
class TextDataset(Dataset):
    """Whitespace-tokenised text dataset yielding fixed-length index tensors.

    Each item is ``(indices, label)`` where ``indices`` is a ``torch.long``
    tensor of exactly ``max_len`` vocabulary ids (truncated or right-padded)
    and ``label`` is the corresponding entry of ``labels``, returned as-is.

    Args:
        texts: Sequence of strings.
        labels: Sequence of labels, indexed in parallel with ``texts``.
        vocab: Optional ready-made ``{token: index}`` mapping. When omitted,
            a vocabulary is built from ``texts``.
        max_len: Length every encoded sequence is padded/truncated to.
        min_freq: Minimum number of occurrences a token needs to enter a
            vocabulary built by :meth:`build_vocab` (default 2).
    """

    PAD_TOKEN = '<PAD>'
    UNK_TOKEN = '<UNK>'

    def __init__(self, texts, labels, vocab=None, max_len=100, min_freq=2):
        self.texts = texts
        self.labels = labels
        self.max_len = max_len
        self.min_freq = min_freq
        if vocab is None:
            self.build_vocab()
        else:
            self.vocab = vocab

    def build_vocab(self):
        """Build ``self.vocab`` from ``self.texts``.

        Ids 0 and 1 are reserved for the padding and unknown tokens; every
        lower-cased token seen at least ``min_freq`` times gets the next id
        in first-seen order.
        """
        word_counts = Counter()
        for text in self.texts:
            word_counts.update(text.lower().split())
        self.vocab = {self.PAD_TOKEN: 0, self.UNK_TOKEN: 1}
        # getattr: tolerate instances created without __init__.
        min_freq = getattr(self, 'min_freq', 2)
        for word, count in word_counts.items():
            if count >= min_freq:
                self.vocab[word] = len(self.vocab)

    def encode(self, text):
        """Return ``text`` as a ``torch.long`` tensor of length ``max_len``."""
        # Look the special ids up so a caller-supplied vocab with different
        # ids is honoured; fall back to the ids build_vocab assigns.
        pad_id = self.vocab.get(self.PAD_TOKEN, 0)
        unk_id = self.vocab.get(self.UNK_TOKEN, 1)
        tokens = text.lower().split()[:self.max_len]
        indices = [self.vocab.get(token, unk_id) for token in tokens]
        indices += [pad_id] * (self.max_len - len(indices))
        return torch.tensor(indices, dtype=torch.long)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.encode(self.texts[idx]), self.labels[idx]
class LSTMSentiment(nn.Module):
    """Two-layer bidirectional LSTM with attention pooling and an MLP head.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding width.
        hidden_dim: LSTM hidden size per direction.
        n_classes: Number of output logits.
    """

    def __init__(self, vocab_size, embed_dim=64, hidden_dim=128, n_classes=3):
        super().__init__()
        # Index 0 is the padding id; its embedding stays zero.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True,
                            bidirectional=True, num_layers=2, dropout=0.3)
        # Bidirectional output concatenates both directions: 2 * hidden_dim.
        self.attention = nn.Linear(hidden_dim * 2, 1)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim * 2, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, n_classes)
        )

    def forward(self, x):
        """Return class logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of shape (batch, seq_len); id 0 is padding.

        Returns:
            Tensor of shape (batch, n_classes).
        """
        embeds = self.embedding(x)
        lstm_out, _ = self.lstm(embeds)

        # Attention pooling over time steps.
        attn_scores = self.attention(lstm_out).squeeze(-1)
        # Padding must not receive attention. A large finite negative (not
        # -inf) keeps an all-padding row from turning into NaN: softmax then
        # degrades to uniform weights for that row.
        pad_mask = x == self.embedding.padding_idx
        attn_scores = attn_scores.masked_fill(
            pad_mask, torch.finfo(attn_scores.dtype).min
        )
        attn_weights = torch.softmax(attn_scores, dim=1)
        context = torch.bmm(attn_weights.unsqueeze(1), lstm_out).squeeze(1)
        return self.classifier(context)
# Create sample data: five template sentences repeated 200 times
texts = 200 * [
    "This movie was absolutely fantastic and thrilling",
    "Terrible waste of time, awful acting and plot",
    "Good film with some great performances",
    "The worst movie I have ever seen in my life",
    "Brilliant direction and stunning cinematography",
]
labels = 200 * [2, 0, 1, 0, 2]  # 0=neg, 1=neutral, 2=pos

dataset = TextDataset(texts, labels)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

model = LSTMSentiment(len(dataset.vocab))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(10):
    model.train()
    total_loss = 0
    for batch_texts, batch_labels in loader:
        # Clear stale gradients, then run forward/backward and update.
        optimizer.zero_grad()
        loss = criterion(model(batch_texts), batch_labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Report the mean batch loss every fifth epoch.
    if not (epoch + 1) % 5:
        mean_loss = total_loss / len(loader)
        print(f"Epoch {epoch + 1}: Loss={mean_loss:.4f}")
Fine-Tuning BERT for Sentiment
from transformers import (
BertTokenizer, BertForSequenceClassification,
AdamW, get_linear_schedule_with_warmup
)
class BERTSentiment:
    """Fine-tune and run a BERT sequence classifier for sentiment labels.

    Args:
        model_name: Name passed to ``from_pretrained`` for both the
            tokenizer and the model.
        n_classes: Number of output labels for the classification head.
    """

    def __init__(self, model_name='bert-base-uncased', n_classes=3):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertForSequenceClassification.from_pretrained(
            model_name, num_labels=n_classes
        ).to(self.device)

    def tokenize(self, texts, max_len=128):
        """Tokenize ``texts`` into padded, truncated PyTorch tensors."""
        return self.tokenizer(
            texts, padding=True, truncation=True,
            max_length=max_len, return_tensors='pt'
        )

    def train(self, texts, labels, epochs=3, batch_size=16, lr=2e-5):
        """Fine-tune the model on ``texts``/``labels``.

        Batches are taken in input order. The learning rate decays linearly
        to zero over the total number of optimizer steps, and gradients are
        clipped to norm 1.0. The mean batch loss is printed after each epoch.

        Args:
            texts: Sequence of strings.
            labels: Sequence of integer class ids, parallel to ``texts``.
            epochs: Number of passes over the data.
            batch_size: Examples per optimizer step.
            lr: Peak learning rate.

        Raises:
            ValueError: If there are no training examples.
        """
        dataset = list(zip(texts, labels))
        if not dataset:
            raise ValueError("train() needs at least one (text, label) pair")

        # Materialise the batches so the step count includes the trailing
        # partial batch. Floor division undercounts it, and is zero when the
        # dataset is smaller than one batch.
        batches = [
            dataset[i:i + batch_size]
            for i in range(0, len(dataset), batch_size)
        ]
        steps_per_epoch = len(batches)

        # torch.optim.AdamW replaces the deprecated AdamW re-export from
        # transformers. eps / weight_decay are spelled out to keep that
        # class's defaults.
        # NOTE(review): confirm against the transformers version previously
        # used.
        optimizer = torch.optim.AdamW(
            self.model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0
        )
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0,
            num_training_steps=steps_per_epoch * epochs
        )

        for epoch in range(epochs):
            self.model.train()
            total_loss = 0
            for batch in batches:
                texts_batch = [t for t, _ in batch]
                labels_batch = torch.LongTensor([l for _, l in batch]).to(self.device)
                inputs = self.tokenize(texts_batch)
                inputs = {k: v.to(self.device) for k, v in inputs.items()}

                # Passing labels makes the model return the loss itself.
                outputs = self.model(**inputs, labels=labels_batch)
                loss = outputs.loss
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                total_loss += loss.item()
            print(f"Epoch {epoch+1}: Loss={total_loss/steps_per_epoch:.4f}")

    def predict(self, texts):
        """Return the argmax class id for each text as a NumPy array."""
        self.model.eval()
        inputs = self.tokenize(texts)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=-1)
        return predictions.cpu().numpy()
# Initialize (requires transformers library)
try:
    bert_sentiment = BERTSentiment(n_classes=3)
    print("BERT sentiment classifier initialized")
except Exception as exc:
    # Still best-effort, but no longer a bare `except:`. That form also
    # swallowed KeyboardInterrupt/SystemExit and hid the real cause
    # (e.g. a failed model download).
    print("Install transformers: pip install transformers")
    print(f" Reason: {type(exc).__name__}: {exc}")
Aspect-Based Sentiment Analysis
class AspectSentimentAnalyzer:
    """Extract sentiment for specific aspects of text.

    An aspect is detected when any of its keywords occurs as a substring of
    the lower-cased text. Each detected aspect is scored with the mean VADER
    compound score of the '.'-separated sentences mentioning it.
    """

    def __init__(self):
        self.aspects = {
            'product': ['product', 'item', 'device', 'phone', 'laptop'],
            'service': ['service', 'support', 'staff', 'help', 'team'],
            'price': ['price', 'cost', 'value', 'expensive', 'cheap', 'worth'],
            'quality': ['quality', 'build', 'material', 'durable', 'flimsy']
        }
        # VADER analyzer, created on first use and then reused.
        self._vader = None

    def extract_aspects(self, text):
        """Return the aspect names whose keywords appear in ``text``.

        Matching is case-insensitive substring matching, so e.g. 'phone'
        also matches 'headphones'. Names come back in ``self.aspects`` order.
        """
        text_lower = text.lower()
        return [
            aspect
            for aspect, keywords in self.aspects.items()
            if any(kw in text_lower for kw in keywords)
        ]

    def analyze(self, text):
        """Score every aspect mentioned in ``text``.

        Returns:
            ``{aspect: {'sentiment': mean compound score,
            'label': 'positive' | 'negative' | 'neutral',
            'sentences': number of sentences mentioning the aspect}}``.
            Empty when no aspect is mentioned.
        """
        aspects = self.extract_aspects(text)
        results = {}
        if not aspects:
            # Nothing to score, so VADER is not needed at all.
            return results

        # Build the analyzer once per instance instead of on every call.
        vader = getattr(self, '_vader', None)
        if vader is None:
            vader = self._vader = SentimentIntensityAnalyzer()

        # Lower-case each sentence once rather than once per aspect.
        sentences = [(s, s.lower()) for s in text.split('.')]
        for aspect in aspects:
            keywords = self.aspects[aspect]
            relevant_sentences = [
                s for s, lowered in sentences
                if any(kw in lowered for kw in keywords)
            ]
            if relevant_sentences:
                scores = [vader.polarity_scores(s)['compound'] for s in relevant_sentences]
                mean_score = np.mean(scores)
                # A band of +/-0.05 around zero is labelled neutral.
                if mean_score > 0.05:
                    label = 'positive'
                elif mean_score < -0.05:
                    label = 'negative'
                else:
                    label = 'neutral'
                results[aspect] = {
                    'sentiment': mean_score,
                    'label': label,
                    'sentences': len(relevant_sentences)
                }
        return results
# Usage
try:
    analyzer = AspectSentimentAnalyzer()
    text = "The product quality is excellent but the price is too expensive. Service was okay."
    results = analyzer.analyze(text)
    for aspect, data in results.items():
        print(f"{aspect}: {data['label']} ({data['sentiment']:.3f})")
except Exception as exc:
    # Still best-effort, but no longer a bare `except:`. That form also
    # swallowed KeyboardInterrupt/SystemExit and hid what actually failed.
    print("Aspect analysis requires vaderSentiment")
    print(f" Reason: {type(exc).__name__}: {exc}")
Data Augmentation for Sentiment
def augment_text(text, n_augments=3, replace_prob=0.3, rng=None):
    """Simple text augmentation via synonym replacement.

    ``text`` is split on whitespace. In every augmented copy, each word found
    in the built-in synonym table is swapped for a random synonym with
    probability ``replace_prob``. Punctuation attached to a word ("good.",
    "(bad)") is kept in place, and the replacement follows the original
    word's capitalisation ("Good" -> "Great", "LOVE" -> "ADORE").

    Args:
        text: Sentence to augment.
        n_augments: Number of augmented copies to generate.
        replace_prob: Per-word replacement probability.
        rng: Optional random source exposing ``random()`` and ``choice()``
            (e.g. ``numpy.random.default_rng(seed)``). Defaults to NumPy's
            global random state.

    Returns:
        A list of ``n_augments + 1`` strings: the original ``text`` first,
        then the augmented copies (words re-joined with single spaces). A
        copy may equal the original when no word was replaced.
    """
    import string  # local import keeps this snippet self-contained

    synonyms = {
        'good': ['great', 'excellent', 'fine'],
        'bad': ['terrible', 'awful', 'poor'],
        'love': ['adore', 'enjoy', 'appreciate'],
        'hate': ['dislike', 'detest', 'loathe']
    }
    source = np.random if rng is None else rng
    punct = string.punctuation

    def split_token(token):
        # Split a token into (leading punctuation, core word, trailing punctuation).
        core = token.strip(punct)
        if not core:
            return token, '', ''
        lead = len(token) - len(token.lstrip(punct))
        return token[:lead], core, token[lead + len(core):]

    def match_case(template, replacement):
        # Give the replacement the same capitalisation style as the original.
        if len(template) > 1 and template.isupper():
            return replacement.upper()
        if template[0].isupper():
            return replacement.capitalize()
        return replacement

    # Tokenise once; every augmented copy starts from the same pieces.
    pieces = [split_token(word) for word in text.split()]

    augmented = [text]
    for _ in range(n_augments):
        new_words = []
        for lead, core, trail in pieces:
            options = synonyms.get(core.lower())
            # A random number is drawn only for replaceable words.
            if options and source.random() < replace_prob:
                core = match_case(core, str(source.choice(options)))
            new_words.append(f"{lead}{core}{trail}")
        augmented.append(' '.join(new_words))
    return augmented
# Test augmentation: show the original sentence followed by its variants
text = "I love this product, it is very good"
augmented = augment_text(text)
print("\n".join(f" {t}" for t in augmented))
Best Practices
- Start with BERT — fine-tuned transformers dominate sentiment tasks
- Domain-specific training — general models miss domain nuances
- Handle negation — "not good" should be negative
- Aspect-based for reviews — overall sentiment hides useful detail
- Balance classes — oversample minority sentiment classes
- Monitor drift — language evolves, models need retraining
Summary
Sentiment analysis ranges from lexicon rules to fine-tuned BERT. Lexicons are fast but shallow; LSTMs capture context; transformers achieve state-of-the-art. Aspect-based analysis reveals fine-grained opinions. Choose based on accuracy needs and computational constraints.