The Interview Question
ℹ️
Question: You're building a sentiment analysis system for product reviews:
- Dataset: 10M reviews with text, ratings, and metadata
- Requirements: Real-time inference, multilingual support, explainable predictions
Walk through your NLP pipeline:
- How do you preprocess text data at scale?
- How do you represent text as features?
- How do you handle domain-specific vocabulary?
- How do you evaluate and deploy NLP models?
Detailed Answer
1. Text Preprocessing Pipeline
import pandas as pd
import numpy as np
import re
import string
from typing import List, Dict, Tuple
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
import spacy
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Download required NLTK data.
# Newer NLTK releases look up 'punkt_tab' and 'averaged_perceptron_tagger_eng'
# instead of the legacy 'punkt' / 'averaged_perceptron_tagger' packages, so both
# generations are fetched to keep word_tokenize() and pos_tag() working.
for _resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_resource, quiet=True)
class TextPreprocessor:
    """Complete text preprocessing pipeline.

    Bundles regex-based cleaning, NLTK tokenization, stopword removal,
    lemmatization, stemming and POS tagging, a spaCy-based alternative, and
    simple negation marking.
    """

    # Words that open a negation scope in handle_negation().
    NEGATION_WORDS = frozenset({'not', 'no', 'never', 'neither', 'nobody', 'nothing'})
    # Punctuation tokens that close a negation scope early.
    NEGATION_TERMINATORS = frozenset({'.', ',', '!', '?'})

    # Compiled once: clean_text() runs for every document.
    _HTML_TAG_RE = re.compile(r'<[^>]+>')
    _URL_RE = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
    _EMAIL_RE = re.compile(r'\S+@\S+')
    _NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')
    _WHITESPACE_RE = re.compile(r'\s+')

    def __init__(self, language='english'):
        """Load the NLTK and spaCy resources used by the pipeline.

        Args:
            language: Name passed to ``stopwords.words`` to pick the stop list.
        """
        self.language = language
        self.stop_words = set(stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()
        # NOTE(review): the spaCy model is English regardless of `language`.
        self.nlp = spacy.load('en_core_web_sm')

    def clean_text(self, text: str) -> str:
        """Lowercase `text` and strip markup, URLs, e-mails and non-letters.

        Everything outside ``a-z`` and whitespace is removed, which includes
        digits, punctuation and non-ASCII letters.
        """
        text = text.lower()
        text = self._HTML_TAG_RE.sub('', text)
        text = self._URL_RE.sub('', text)
        text = self._EMAIL_RE.sub('', text)
        text = self._NON_LETTER_RE.sub('', text)
        return self._WHITESPACE_RE.sub(' ', text).strip()

    def tokenize(self, text: str) -> List[str]:
        """Split `text` into tokens with NLTK's ``word_tokenize``."""
        return word_tokenize(text)

    def remove_stopwords(self, tokens: List[str], keep=None) -> List[str]:
        """Drop tokens found in the stop list.

        Args:
            tokens: Tokens to filter.
            keep: Optional collection of tokens that survive even when they
                are stopwords (used to protect negation words).
        """
        if not keep:
            return [token for token in tokens if token not in self.stop_words]
        return [
            token for token in tokens
            if token in keep or token not in self.stop_words
        ]

    def lemmatize(self, tokens: List[str]) -> List[str]:
        """Lemmatize each token with the WordNet lemmatizer."""
        return [self.lemmatizer.lemmatize(token) for token in tokens]

    def stem(self, tokens: List[str]) -> List[str]:
        """Stem each token with the Porter stemmer."""
        return [self.stemmer.stem(token) for token in tokens]

    def pos_tag(self, tokens: List[str]) -> List[Tuple[str, str]]:
        """Return ``(token, tag)`` pairs from ``nltk.pos_tag``."""
        return nltk.pos_tag(tokens)

    def extract_nouns_adjectives(self, tokens: List[str]) -> List[str]:
        """Keep tokens whose POS tag starts with NN (noun) or JJ (adjective)."""
        return [
            word for word, tag in self.pos_tag(tokens)
            if tag.startswith(('NN', 'JJ'))
        ]

    def spacy_preprocess(self, text: str) -> List[str]:
        """Return spaCy lemmas for alphabetic, non-stop tokens longer than 2 chars."""
        doc = self.nlp(text)
        return [
            token.lemma_ for token in doc
            if not token.is_stop
            and not token.is_punct
            and not token.is_space
            and token.is_alpha
            and len(token) > 2
        ]

    def handle_negation(self, tokens: List[str], scope: int = 3) -> List[str]:
        """Mark tokens that follow a negation word (e.g. "not good" -> "not_good").

        A negation word is kept as-is and opens a scope. The next `scope`
        tokens get a ``not_`` prefix unless a punctuation terminator closes the
        scope first. Terminators are never prefixed.

        Args:
            tokens: Tokens in document order.
            scope: Maximum number of tokens marked after each negation word.

        Returns:
            A new token list of the same length.
        """
        processed = []
        remaining = 0  # tokens still to be marked in the current scope
        for token in tokens:
            if token in self.NEGATION_WORDS:
                processed.append(token)
                remaining = scope
            elif token in self.NEGATION_TERMINATORS:
                processed.append(token)
                remaining = 0
            elif remaining > 0:
                processed.append(f"not_{token}")
                remaining -= 1
            else:
                processed.append(token)
        return processed

    def preprocess_pipeline(self, texts: pd.Series,
                            clean=True,
                            remove_stop=True,
                            lemmatize=True,
                            handle_neg=True,
                            min_word_length=2) -> pd.Series:
        """Run the full preprocessing pipeline over a Series of documents.

        Steps: clean -> tokenize -> remove stopwords -> mark negation ->
        lemmatize (or stem) -> drop short tokens -> join with spaces.

        Args:
            texts: Raw documents; each value is converted with ``str()``.
            clean: Apply ``clean_text`` first.
            remove_stop: Remove stopwords.
            lemmatize: Lemmatize when True, stem when False.
            handle_neg: Apply ``handle_negation``.
            min_word_length: Minimum token length kept in the output.

        Returns:
            A Series of space-joined tokens sharing the index of `texts`.
        """
        processed_texts = []
        for text in texts:
            # Always coerce: missing values arrive as floats and would break
            # the tokenizer when cleaning is disabled.
            text = str(text)
            if clean:
                text = self.clean_text(text)

            tokens = self.tokenize(text)

            if remove_stop:
                # The stop list contains "not"/"no"; keep them so the negation
                # step below still has something to react to.
                keep = self.NEGATION_WORDS if handle_neg else None
                tokens = self.remove_stopwords(tokens, keep=keep)

            if handle_neg:
                tokens = self.handle_negation(tokens)

            tokens = self.lemmatize(tokens) if lemmatize else self.stem(tokens)

            tokens = [t for t in tokens if len(t) >= min_word_length]
            processed_texts.append(' '.join(tokens))
        return pd.Series(processed_texts, index=texts.index)
# Example usage
# preprocessor = TextPreprocessor()
# processed_texts = preprocessor.preprocess_pipeline(df['review_text'])
2. Text Representation Methods
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import Word2Vec, FastText
from sentence_transformers import SentenceTransformer
import torch
from transformers import AutoTokenizer, AutoModel
class TextVectorizer:
    """Collection of text-to-feature methods.

    Every method fits a fresh model on the texts it receives, stores that
    model in ``self.vectorizers`` under a fixed key, and returns the features.
    """

    def __init__(self):
        self.vocabulary = None
        self.vectorizers = {}

    @staticmethod
    def _mean_pool(token_vectors, vector_size):
        """Average a document's word vectors; all-zero vector when there are none."""
        if token_vectors:
            return np.mean(token_vectors, axis=0)
        return np.zeros(vector_size)

    def bag_of_words(self, texts: pd.Series, max_features=10000) -> np.ndarray:
        """Fit a unigram+bigram count vectorizer and return the count matrix.

        Terms must appear in at least 2 documents and in at most 95% of them.
        The fitted vectorizer is stored under ``'bow'``.
        """
        count_vec = CountVectorizer(
            ngram_range=(1, 2),
            min_df=2,
            max_df=0.95,
            max_features=max_features,
        )
        counts = count_vec.fit_transform(texts)
        self.vectorizers['bow'] = count_vec
        print(f"BoW shape: {counts.shape}")
        print(f"Vocabulary size: {len(count_vec.vocabulary_)}")
        return counts

    def tfidf(self, texts: pd.Series, max_features=10000) -> np.ndarray:
        """Fit a unigram+bigram TF-IDF vectorizer and return the weighted matrix.

        Uses sublinear term-frequency scaling and L2 row normalisation. The
        fitted vectorizer is stored under ``'tfidf'``.
        """
        tfidf_vec = TfidfVectorizer(
            ngram_range=(1, 2),
            min_df=2,
            max_df=0.95,
            max_features=max_features,
            sublinear_tf=True,
            norm='l2',
        )
        weights = tfidf_vec.fit_transform(texts)
        self.vectorizers['tfidf'] = tfidf_vec
        print(f"TF-IDF shape: {weights.shape}")
        return weights

    def word2vec(self, texts: pd.Series, vector_size=100, window=5) -> np.ndarray:
        """Train Word2Vec on whitespace tokens and mean-pool vectors per document.

        Words missing from the trained vocabulary are skipped; a document with
        no known words becomes a zero vector. Model stored under ``'word2vec'``.
        """
        docs = [doc.split() for doc in texts]
        w2v = Word2Vec(
            sentences=docs,
            vector_size=vector_size,
            window=window,
            min_count=2,
            workers=4,
            epochs=10,
        )
        pooled = [
            self._mean_pool([w2v.wv[w] for w in words if w in w2v.wv], vector_size)
            for words in docs
        ]
        self.vectorizers['word2vec'] = w2v
        return np.array(pooled)

    def fasttext(self, texts: pd.Series, vector_size=100) -> np.ndarray:
        """Train FastText on whitespace tokens and mean-pool vectors per document.

        Every token is looked up (no vocabulary filter); an empty document
        becomes a zero vector. Model stored under ``'fasttext'``.
        """
        from gensim.models import FastText
        docs = [doc.split() for doc in texts]
        ft = FastText(
            sentences=docs,
            vector_size=vector_size,
            window=5,
            min_count=2,
            workers=4,
        )
        pooled = [
            self._mean_pool([ft.wv[w] for w in words], vector_size)
            for words in docs
        ]
        self.vectorizers['fasttext'] = ft
        return np.array(pooled)

    def sentence_transformers(self, texts: pd.Series, model_name='all-MiniLM-L6-v2') -> np.ndarray:
        """Encode texts with a SentenceTransformer in batches of 32.

        The loaded model is stored under ``'sentence_transformers'``.
        """
        encoder = SentenceTransformer(model_name)
        sentence_vectors = encoder.encode(
            texts.tolist(), batch_size=32, show_progress_bar=True
        )
        self.vectorizers['sentence_transformers'] = encoder
        return sentence_vectors

    def contextual_embeddings(self, texts: pd.Series,
                              model_name='bert-base-uncased') -> np.ndarray:
        """Return the first-token ([CLS]) hidden state for each text.

        Texts are run through the model one at a time, truncated to 512
        tokens, without gradient tracking. Model stored under ``'contextual'``.
        """
        hf_tokenizer = AutoTokenizer.from_pretrained(model_name)
        encoder = AutoModel.from_pretrained(model_name)
        cls_vectors = []
        for document in texts:
            encoded = tokenizer_inputs = hf_tokenizer(
                document,
                return_tensors='pt',
                truncation=True,
                padding=True,
                max_length=512,
            )
            with torch.no_grad():
                hidden = encoder(**encoded)
            # Position 0 of the sequence axis is the [CLS] token.
            cls_vectors.append(hidden.last_hidden_state[:, 0, :].numpy().flatten())
        self.vectorizers['contextual'] = encoder
        return np.array(cls_vectors)

    def get_feature_names(self, method='tfidf'):
        """Return the fitted vocabulary terms for ``'bow'``/``'tfidf'``, else None."""
        if method not in ('bow', 'tfidf'):
            return None
        return self.vectorizers[method].get_feature_names_out()
# Example usage
# vectorizer = TextVectorizer()
# X_tfidf = vectorizer.tfidf(processed_texts)
# X_w2v = vectorizer.word2vec(processed_texts)
# X_sentence = vectorizer.sentence_transformers(texts)
3. Sentiment Analysis Implementation
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
from transformers import pipeline
import torch
class SentimentAnalyzer:
    """Sentiment analysis system built on TF-IDF classifiers and a transformer.

    Classical models are trained on TF-IDF features. The TF-IDF vectorizer
    fitted during training is reused for prediction and evaluation so that the
    feature space matches what the models were trained on.
    """

    # Labels a Hugging Face pipeline may emit for the positive class:
    # 'POSITIVE' for published sentiment checkpoints, 'LABEL_1' for a model
    # fine-tuned here without an explicit id2label mapping.
    _POSITIVE_LABELS = frozenset({'POSITIVE', 'LABEL_1'})

    def __init__(self):
        self.models = {}
        self.vectorizer = TextVectorizer()
        self.preprocessor = TextPreprocessor()
        # Checkpoint whose tokenizer matches the transformer model; updated by
        # train_transformer_model().
        self.transformer_model_name = 'distilbert-base-uncased'

    def prepare_data(self, df, text_column, label_column):
        """Preprocess the text column and split it 80/20 into train and test.

        Returns:
            ``(X_train, X_test, y_train, y_test)`` from ``train_test_split``
            with ``random_state=42``.
        """
        processed_texts = self.preprocessor.preprocess_pipeline(df[text_column])
        X_train, X_test, y_train, y_test = train_test_split(
            processed_texts,
            df[label_column],
            test_size=0.2,
            random_state=42
        )
        return X_train, X_test, y_train, y_test

    def _transform_tfidf(self, texts):
        """Project `texts` into the TF-IDF space fitted during training.

        Raises:
            RuntimeError: If no TF-IDF vectorizer has been fitted yet.
        """
        fitted = self.vectorizer.vectorizers.get('tfidf')
        if fitted is None:
            raise RuntimeError(
                "TF-IDF vectorizer is not fitted; call train_traditional_models() first"
            )
        return fitted.transform(texts)

    def train_traditional_models(self, X_train, y_train):
        """Fit logistic regression, naive Bayes and a linear SVM on TF-IDF features.

        Prints a 5-fold cross-validated accuracy per model, stores the fitted
        models in ``self.models`` and returns them as a dict.
        """
        from sklearn.model_selection import cross_val_score

        # Fits the TF-IDF vectorizer; it is reused at prediction time.
        X_train_tfidf = self.vectorizer.tfidf(X_train)
        models = {
            'logistic_regression': LogisticRegression(max_iter=1000, random_state=42),
            'naive_bayes': MultinomialNB(),
            'svm': LinearSVC(random_state=42)
        }
        trained_models = {}
        for name, model in models.items():
            print(f"\nTraining {name}...")
            model.fit(X_train_tfidf, y_train)
            trained_models[name] = model
            scores = cross_val_score(model, X_train_tfidf, y_train, cv=5, scoring='accuracy')
            print(f" CV Accuracy: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")
        self.models.update(trained_models)
        return trained_models

    def train_transformer_model(self, X_train, y_train, model_name='distilbert-base-uncased'):
        """Fine-tune a two-label sequence classifier and store it as 'transformer'.

        Args:
            X_train: Series of texts.
            y_train: Series of integer labels.
            model_name: Hugging Face checkpoint to start from.
        """
        from transformers import (
            AutoTokenizer,
            AutoModelForSequenceClassification,
            TrainingArguments,
            Trainer
        )
        from datasets import Dataset

        tokenizer = AutoTokenizer.from_pretrained(model_name)

        def tokenize_function(examples):
            return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=512)

        train_dataset = Dataset.from_dict({'text': X_train.tolist(), 'label': y_train.tolist()})
        train_dataset = train_dataset.map(tokenize_function, batched=True)

        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=2
        )
        training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=3,
            per_device_train_batch_size=16,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
        )
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
        )
        trainer.train()

        self.models['transformer'] = model
        # Remember the checkpoint so prediction loads the matching tokenizer.
        self.transformer_model_name = model_name
        return model

    def predict_sentiment(self, texts, method='ensemble'):
        """Predict sentiment labels for raw texts.

        Args:
            texts: Iterable of raw review strings.
            method: ``'ensemble'`` for a majority vote over the classical
                models, ``'transformer'`` for the fine-tuned transformer.

        Returns:
            An array of labels (ensemble) or a list of 0/1 ints (transformer).

        Raises:
            ValueError: If `method` is not recognised.
            RuntimeError: If the ensemble has no trained model or no fitted
                TF-IDF vectorizer.
        """
        if method not in ('ensemble', 'transformer'):
            raise ValueError(
                f"Unknown method {method!r}; expected 'ensemble' or 'transformer'"
            )

        if method == 'ensemble':
            classical = {
                name: model for name, model in self.models.items()
                if name != 'transformer'
            }
            if not classical:
                raise RuntimeError(
                    "No classical models trained; call train_traditional_models() first"
                )
            processed_texts = self.preprocessor.preprocess_pipeline(pd.Series(texts))
            # transform(), not fit_transform(): refitting on inference data
            # would produce a different vocabulary than the models expect.
            X_tfidf = self._transform_tfidf(processed_texts)

            from scipy.stats import mode
            all_preds = np.array([model.predict(X_tfidf) for model in classical.values()])
            return np.asarray(mode(all_preds, axis=0)[0]).ravel()

        sentiment_pipeline = pipeline(
            "sentiment-analysis",
            model=self.models['transformer'],
            tokenizer=self.transformer_model_name
        )
        results = sentiment_pipeline(texts)
        return [1 if r['label'] in self._POSITIVE_LABELS else 0 for r in results]

    def evaluate_models(self, X_test, y_test):
        """Score every classical model on held-out data.

        Returns:
            DataFrame indexed by model name with accuracy and weighted
            precision, recall and F1 columns; also printed.
        """
        from sklearn.metrics import (
            accuracy_score,
            f1_score,
            precision_score,
            recall_score,
        )

        X_test_tfidf = self._transform_tfidf(X_test)
        results = {}
        for name, model in self.models.items():
            if name == 'transformer':
                continue
            y_pred = model.predict(X_test_tfidf)
            results[name] = {
                'accuracy': accuracy_score(y_test, y_pred),
                'precision': precision_score(y_test, y_pred, average='weighted'),
                'recall': recall_score(y_test, y_pred, average='weighted'),
                'f1': f1_score(y_test, y_pred, average='weighted')
            }
        comparison = pd.DataFrame(results).T
        print("\nModel Comparison:")
        print("=" * 60)
        print(comparison)
        return comparison
# Example usage
# sentiment_analyzer = SentimentAnalyzer()
# X_train, X_test, y_train, y_test = sentiment_analyzer.prepare_data(df, 'review_text', 'sentiment')
# models = sentiment_analyzer.train_traditional_models(X_train, y_train)
# comparison = sentiment_analyzer.evaluate_models(X_test, y_test)
4. Topic Modeling
from sklearn.decomposition import LatentDirichletAllocation, NMF
from gensim.models import LdaMulticore, CoherenceModel
from gensim.corpora import Dictionary
import pyLDAvis
import pyLDAvis.gensim_models
class TopicModeler:
    """Topic modeling using LDA (gensim or scikit-learn) and NMF.

    ``self.models`` maps a model name to the fitted estimator. The gensim
    entry ``'lda'`` is a dict holding the model together with its dictionary,
    corpus and coherence score.
    """

    def __init__(self):
        self.models = {}
        self.topics = None

    def lda_gensim(self, texts: List[List[str]], n_topics=10):
        """Train a gensim LDA model on tokenized documents.

        The dictionary drops tokens seen in fewer than 5 documents or in more
        than half of them before the bag-of-words corpus is built.

        Returns:
            ``(lda_model, coherence_score)`` where the score is c_v coherence.
        """
        dictionary = Dictionary(texts)
        dictionary.filter_extremes(no_below=5, no_above=0.5)
        corpus = [dictionary.doc2bow(text) for text in texts]
        lda_model = LdaMulticore(
            corpus=corpus,
            id2word=dictionary,
            num_topics=n_topics,
            random_state=42,
            passes=10,
            workers=4
        )
        coherence_model = CoherenceModel(
            model=lda_model,
            texts=texts,
            dictionary=dictionary,
            coherence='c_v'
        )
        coherence_score = coherence_model.get_coherence()
        print(f"LDA Coherence Score: {coherence_score:.4f}")
        self.models['lda'] = {
            'model': lda_model,
            'dictionary': dictionary,
            'corpus': corpus,
            'coherence': coherence_score
        }
        return lda_model, coherence_score

    def sklearn_lda(self, texts_tfidf, n_topics=10, feature_names=None):
        """Fit scikit-learn's online LDA; print topics when feature names are given."""
        lda = LatentDirichletAllocation(
            n_components=n_topics,
            max_iter=10,
            learning_method='online',
            random_state=42
        )
        lda.fit(texts_tfidf)
        if feature_names is not None:
            self.display_topics(lda, feature_names, n_words=10)
        self.models['sklearn_lda'] = lda
        return lda

    def nmf(self, texts_tfidf, n_topics=10, feature_names=None):
        """Fit Non-negative Matrix Factorization; print topics when names are given."""
        nmf = NMF(
            n_components=n_topics,
            random_state=42,
            max_iter=500
        )
        nmf.fit(texts_tfidf)
        if feature_names is not None:
            self.display_topics(nmf, feature_names, n_words=10)
        self.models['nmf'] = nmf
        return nmf

    def display_topics(self, model, feature_names, n_words=10):
        """Print the `n_words` highest-weighted terms of each row of ``components_``."""
        print("\nTopics:")
        print("=" * 60)
        for topic_idx, topic in enumerate(model.components_):
            # argsort is ascending; the reversed slice yields the top weights first.
            top_words = [feature_names[i] for i in topic.argsort()[:-n_words - 1:-1]]
            print(f"Topic {topic_idx + 1}: {', '.join(top_words)}")

    def get_document_topics(self, texts_tfidf, model_name='nmf'):
        """Return the topic distribution of each document.

        Args:
            texts_tfidf: Feature matrix for the scikit-learn models, or an
                iterable of bag-of-words documents for the gensim ``'lda'``
                entry.
            model_name: Key in ``self.models``.

        Raises:
            KeyError: If `model_name` has not been trained.
        """
        entry = self.models[model_name]
        if isinstance(entry, dict):
            # gensim entry: unwrap the model instead of indexing the bookkeeping dict.
            lda_model = entry['model']
            return [
                lda_model.get_document_topics(bow, minimum_probability=0.0)
                for bow in texts_tfidf
            ]
        if hasattr(entry, 'transform'):
            return entry.transform(texts_tfidf)
        return entry[texts_tfidf]

    def visualize_topics(self, texts_tfidf, dictionary=None):
        """Write a pyLDAvis page for the gensim LDA model to lda_visualization.html.

        Does nothing when no gensim LDA model has been trained.

        Args:
            texts_tfidf: Unused; kept for interface compatibility.
            dictionary: gensim dictionary; defaults to the one stored by
                ``lda_gensim``.
        """
        entry = self.models.get('lda')
        if entry is None:
            return
        if dictionary is None:
            dictionary = entry['dictionary']
        vis_data = pyLDAvis.gensim_models.prepare(
            entry['model'],
            entry['corpus'],
            dictionary
        )
        pyLDAvis.save_html(vis_data, 'lda_visualization.html')
        print("Topic visualization saved to lda_visualization.html")
# Example usage
# topic_modeler = TopicModeler()
# tokenized_texts = [text.split() for text in processed_texts]
# lda_model, coherence = topic_modeler.lda_gensim(tokenized_texts, n_topics=10)
💡
Pro Tip: When choosing between LDA and NMF, remember that LDA gives a probabilistic topic distribution per document, which is easier to interpret, while NMF on TF-IDF features often produces more coherent topics. Always compare candidates using coherence scores, not just perplexity.
5. Real-World Application: Product Review Analysis
class ProductReviewAnalyzer:
    """Product review analysis: preprocessing, sentiment, topics and insights."""

    def __init__(self, sentiment_analyzer=None):
        """Create the pipeline components.

        Args:
            sentiment_analyzer: Optional already-trained analyzer. A new,
                untrained ``SentimentAnalyzer`` is created when omitted; it
                must be trained before ``analyze_reviews`` can predict.
        """
        self.preprocessor = TextPreprocessor()
        self.vectorizer = TextVectorizer()
        self.sentiment_analyzer = (
            sentiment_analyzer if sentiment_analyzer is not None else SentimentAnalyzer()
        )
        self.topic_modeler = TopicModeler()

    def analyze_reviews(self, df, text_column='review_text'):
        """Run the full analysis and return ``(df, insights)``.

        Adds a ``'sentiment'`` column to `df` in place.
        """
        print("Step 1: Preprocessing text...")
        processed_texts = self.preprocessor.preprocess_pipeline(df[text_column])

        print("\nStep 2: Creating text representations...")
        # Fits and stores the TF-IDF vectorizer on self.vectorizer.
        self.vectorizer.tfidf(processed_texts)

        print("\nStep 3: Sentiment analysis...")
        # predict_sentiment preprocesses its input itself, so pass the raw text;
        # passing processed_texts would preprocess every review twice.
        sentiments = self.sentiment_analyzer.predict_sentiment(df[text_column].tolist())
        df['sentiment'] = sentiments

        print("\nStep 4: Topic modeling...")
        tokenized_texts = [text.split() for text in processed_texts]
        self.topic_modeler.lda_gensim(tokenized_texts, n_topics=5)

        print("\nStep 5: Extracting insights...")
        insights = self.extract_insights(df, sentiments, text_column=text_column)
        return df, insights

    def extract_insights(self, df, sentiments, text_column='review_text'):
        """Summarise sentiment counts, ratings, aspects and recommendations.

        Args:
            df: Reviews; an optional ``'rating'`` column feeds the average.
            sentiments: One label per row of `df` (1 = positive, 0 = negative),
                as a list or array.
            text_column: Column holding the review text.

        Returns:
            Dict with keys ``total_reviews``, ``sentiment_distribution``,
            ``average_rating``, ``top_positive_aspects``,
            ``top_negative_aspects`` and ``recommendations``.
        """
        # Plain lists do not support element-wise comparison.
        labels = np.asarray(sentiments)
        positive_mask = labels == 1
        negative_mask = labels == 0

        insights = {
            'total_reviews': len(df),
            'sentiment_distribution': {
                'positive': int(positive_mask.sum()),
                'negative': int(negative_mask.sum())
            },
            'average_rating': df['rating'].mean() if 'rating' in df.columns else None,
            'top_positive_aspects': self._extract_aspects(df[positive_mask], text_column),
            'top_negative_aspects': self._extract_aspects(df[negative_mask], text_column),
        }
        # Recommendations are derived from the other entries, so they can only
        # be computed once the dict exists.
        insights['recommendations'] = self._generate_recommendations(insights)
        return insights

    def _extract_aspects(self, df_subset, text_column='review_text'):
        """Return the 10 most common lowercase noun chunks as ``(phrase, count)``."""
        all_text = ' '.join(df_subset[text_column].astype(str))
        # Only the first 10,000 characters are parsed, to bound spaCy's runtime.
        doc = self.preprocessor.nlp(all_text[:10000])
        phrase_counts = Counter(chunk.text.lower() for chunk in doc.noun_chunks)
        return phrase_counts.most_common(10)

    def _generate_recommendations(self, insights):
        """Turn sentiment share and top complaints into recommendation strings."""
        recommendations = []
        total = insights['total_reviews']
        if total:  # no share to compute for an empty review set
            pos_pct = insights['sentiment_distribution']['positive'] / total
            if pos_pct > 0.8:
                recommendations.append("Strong positive sentiment - leverage in marketing")
            elif pos_pct < 0.5:
                recommendations.append("Investigate negative sentiment drivers")
        if insights['top_negative_aspects']:
            top_negative = insights['top_negative_aspects'][0][0]
            recommendations.append(f"Address common complaint: {top_negative}")
        return recommendations
# Example usage
# analyzer = ProductReviewAnalyzer()
# analyzed_df, insights = analyzer.analyze_reviews(df)
6. Common Follow-Up Questions
Follow-up 1: How do you handle out-of-vocabulary words?
def handle_oov_words(texts, method='fasttext'):
    """Build a model or vectorizer that can represent out-of-vocabulary words.

    Args:
        texts: Iterable of raw text strings used for training.
        method: One of ``'fasttext'``, ``'character_ngrams'`` or ``'bpe'``.

    Returns:
        A trained FastText model, an unfitted HashingVectorizer over character
        n-grams, or a trained BPE tokenizer, depending on `method`.

    Raises:
        ValueError: If `method` is not one of the supported names.
    """
    if method == 'fasttext':
        # FastText composes word vectors from subword (character n-gram) vectors.
        from gensim.models import FastText
        tokenized_texts = [text.split() for text in texts]
        return FastText(
            sentences=tokenized_texts,
            vector_size=100,
            min_count=2,
            workers=4
        )

    if method == 'character_ngrams':
        # Hashed character n-grams need no vocabulary, so unseen words still map
        # to features.
        from sklearn.feature_extraction.text import HashingVectorizer
        return HashingVectorizer(
            analyzer='char_wb',
            ngram_range=(2, 4),
            n_features=2**18
        )

    if method == 'bpe':
        # Byte Pair Encoding (used by GPT-style models; BERT uses the related
        # WordPiece algorithm).
        from tokenizers import Tokenizer, models, trainers, pre_tokenizers
        tokenizer = Tokenizer(models.BPE())
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
        trainer = trainers.BpeTrainer(vocab_size=30000)
        # The corpus is in memory, so train from the iterable rather than from
        # files on disk.
        tokenizer.train_from_iterator(texts, trainer=trainer)
        return tokenizer

    raise ValueError(
        f"Unknown method {method!r}; expected 'fasttext', 'character_ngrams' or 'bpe'"
    )
Follow-up 2: How do you deploy NLP models at scale?
class NLPDeployment:
    """Helpers for deploying NLP models: ONNX export, HTTP serving, batching."""

    def __init__(self):
        self.models = {}

    def export_to_onnx(self, model, tokenizer, output_path):
        """Export `model` to ONNX at `output_path`.

        A sample sentence padded to 128 tokens is used as the tracing input.
        Batch size and sequence length are declared as dynamic axes.
        """
        import torch

        dummy_input = tokenizer(
            "Sample text",
            return_tensors="pt",
            padding="max_length",
            max_length=128,
            truncation=True
        )
        torch.onnx.export(
            model,
            (dummy_input['input_ids'], dummy_input['attention_mask']),
            output_path,
            input_names=['input_ids', 'attention_mask'],
            output_names=['logits'],
            dynamic_axes={
                'input_ids': {0: 'batch_size', 1: 'sequence_length'},
                'attention_mask': {0: 'batch_size', 1: 'sequence_length'},
                'logits': {0: 'batch_size'}
            }
        )

    def create_inference_server(self, model_path):
        """Return a FastAPI app exposing ``POST /predict``.

        NOTE: simplified example; `model_path` is not loaded and the endpoint
        returns a fixed response.
        """
        from fastapi import FastAPI
        from pydantic import BaseModel

        app = FastAPI()

        class PredictionRequest(BaseModel):
            text: str

        class PredictionResponse(BaseModel):
            sentiment: str
            confidence: float

        @app.post("/predict", response_model=PredictionResponse)
        def predict(request: PredictionRequest):
            # Load model and predict
            # This is a simplified example
            return PredictionResponse(
                sentiment="positive",
                confidence=0.95
            )

        return app

    def predict_batch(self, batch):
        """Predict one batch of texts; override this or pass `predict_fn`.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError(
            "No batch predictor configured: override predict_batch() or pass "
            "predict_fn to create_batch_pipeline()"
        )

    def create_batch_pipeline(self, texts, batch_size=32, predict_fn=None):
        """Run predictions over `texts` in fixed-size batches.

        Args:
            texts: Sequence of inputs (must support ``len`` and slicing).
            batch_size: Number of items per batch; must be at least 1.
            predict_fn: Callable taking one batch and returning an iterable of
                results. Defaults to ``self.predict_batch``.

        Returns:
            A flat list of results in input order.

        Raises:
            ValueError: If `batch_size` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        if predict_fn is None:
            predict_fn = self.predict_batch
        results = []
        for start in range(0, len(texts), batch_size):
            results.extend(predict_fn(texts[start:start + batch_size]))
        return results
Company-Specific Tips
ℹ️
Google Tips:
- Google heavily tests on transformer architectures
- Know how to fine-tune BERT for specific tasks
- Understand attention mechanisms and their interpretation
- Be comfortable with distributed training for NLP
Meta Tips:
- Meta focuses on multilingual NLP
- Know how to handle low-resource languages
- Understand how to build conversational AI
- Be familiar with large language model deployment
Quiz Section
Related Topics
- Text Preprocessing — Data cleaning for text data
- Feature Engineering — Creating NLP features
- Deep Learning — Neural network approaches
- Transformer Models — Advanced NLP architectures