NLP: Text Processing, TF-IDF, Word Embeddings

The Interview Question

ℹ️

Question: You're building a sentiment analysis system for product reviews:

Dataset: 10M reviews with text, ratings, and metadata
Requirements: Real-time inference, multilingual support, explainable predictions

Walk through your NLP pipeline:

How do you preprocess text data at scale?
How do you represent text as features?
How do you handle domain-specific vocabulary?
How do you evaluate and deploy NLP models?

Detailed Answer

1. Text Preprocessing Pipeline

import pandas as pd
import numpy as np
import re
import string
from typing import List, Dict, Tuple
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
import spacy
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK data at import time (quiet=True suppresses the progress output).
# NOTE(review): these run on every import and need network access on a cold cache;
# newer NLTK releases may expect 'punkt_tab' / 'averaged_perceptron_tagger_eng'
# instead of the names below — confirm against the pinned NLTK version.
nltk.download('punkt', quiet=True)  # presumably backs word_tokenize — TODO confirm
nltk.download('stopwords', quiet=True)  # presumably backs stopwords.words(...)
nltk.download('wordnet', quiet=True)  # presumably backs WordNetLemmatizer
nltk.download('averaged_perceptron_tagger', quiet=True)  # presumably backs nltk.pos_tag

class TextPreprocessor:
    """Text preprocessing pipeline: cleaning, tokenizing, negation marking, normalizing.

    Combines NLTK (tokenizer, stopwords, lemmatizer, stemmer, POS tagger) with a
    spaCy pipeline that is exposed as ``self.nlp``.
    """

    # Tokens that open a negation scope. They are protected from stopword removal
    # when negation handling is enabled, otherwise the scope could never open.
    NEGATION_WORDS = frozenset({'not', 'no', 'never', 'neither', 'nobody', 'nothing'})
    # Tokens that close a negation scope early.
    NEGATION_TERMINATORS = frozenset({'.', ',', '!', '?'})

    _HTML_TAG = re.compile(r'<[^>]+>')
    _URL = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
    _EMAIL = re.compile(r'\S+@\S+')
    _NON_ALPHA = re.compile(r'[^a-zA-Z\s]')
    _WHITESPACE = re.compile(r'\s+')

    def __init__(self, language='english', spacy_model='en_core_web_sm'):
        """
        Args:
            language: NLTK stopword list to load.
            spacy_model: Name of the spaCy pipeline to load. The default keeps
                the original English model; pass another model name for other
                languages.
        """
        self.language = language
        self.stop_words = set(stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()
        self.nlp = spacy.load(spacy_model)

    def clean_text(self, text: str) -> str:
        """Lowercase ``text`` and strip markup, URLs, e-mails and non-letters.

        Returns the cleaned text with runs of whitespace collapsed to one space.
        """
        text = text.lower()

        # Replace tags with a space (not the empty string) so that words on
        # either side of a tag such as <br> are not glued together.
        text = self._HTML_TAG.sub(' ', text)
        text = self._URL.sub('', text)
        text = self._EMAIL.sub('', text)

        # Drops digits and punctuation; note this also removes apostrophes,
        # so "don't" becomes "dont".
        text = self._NON_ALPHA.sub('', text)

        return self._WHITESPACE.sub(' ', text).strip()

    def tokenize(self, text: str) -> List[str]:
        """Split ``text`` into tokens with NLTK's ``word_tokenize``."""
        return word_tokenize(text)

    def remove_stopwords(self, tokens: List[str], keep=None) -> List[str]:
        """Drop tokens found in ``self.stop_words``.

        Args:
            tokens: Tokens to filter.
            keep: Optional collection of tokens that must survive even if they
                are stopwords (used to protect negation words).
        """
        if keep:
            return [
                token for token in tokens
                if token not in self.stop_words or token in keep
            ]
        return [token for token in tokens if token not in self.stop_words]

    def lemmatize(self, tokens: List[str]) -> List[str]:
        """Lemmatize each token with the WordNet lemmatizer."""
        return [self.lemmatizer.lemmatize(token) for token in tokens]

    def stem(self, tokens: List[str]) -> List[str]:
        """Stem each token with the Porter stemmer."""
        return [self.stemmer.stem(token) for token in tokens]

    def pos_tag(self, tokens: List[str]) -> List[Tuple[str, str]]:
        """Return ``(token, tag)`` pairs from ``nltk.pos_tag``."""
        return nltk.pos_tag(tokens)

    def extract_nouns_adjectives(self, tokens: List[str]) -> List[str]:
        """Keep only tokens whose POS tag starts with NN (noun) or JJ (adjective)."""
        tagged = self.pos_tag(tokens)
        return [word for word, tag in tagged if tag.startswith(('NN', 'JJ'))]

    def spacy_preprocess(self, text: str) -> List[str]:
        """Return spaCy lemmas for alphabetic, non-stopword tokens longer than 2 chars."""
        doc = self.nlp(text)

        tokens = [
            token.lemma_ for token in doc
            if not token.is_stop
            and not token.is_punct
            and not token.is_space
            and token.is_alpha
            and len(token) > 2
        ]

        return tokens

    def handle_negation(self, tokens: List[str], scope=3) -> List[str]:
        """Mark tokens that follow a negation word (e.g. "not good" -> "not not_good").

        The negation word itself is kept unchanged. The next ``scope`` tokens are
        prefixed with ``not_``; the scope ends early at a punctuation token in
        ``NEGATION_TERMINATORS``, which is emitted unchanged. A new negation
        word restarts the scope.

        Args:
            tokens: Input tokens.
            scope: Maximum number of tokens to mark after each negation word.
        """
        processed = []
        remaining = 0  # how many upcoming tokens are still inside the scope

        for token in tokens:
            if token in self.NEGATION_WORDS:
                remaining = scope
                processed.append(token)
            elif token in self.NEGATION_TERMINATORS:
                remaining = 0
                processed.append(token)
            elif remaining > 0:
                processed.append(f"not_{token}")
                remaining -= 1
            else:
                processed.append(token)

        return processed

    @staticmethod
    def _is_missing(value) -> bool:
        """True for None and float NaN (how pandas represents missing text)."""
        return value is None or (isinstance(value, float) and value != value)

    def preprocess_pipeline(self, texts: pd.Series,
                          clean=True,
                          remove_stop=True,
                          lemmatize=True,
                          handle_neg=True,
                          min_word_length=2) -> pd.Series:
        """Run the full pipeline over ``texts`` and return space-joined tokens.

        Args:
            texts: Series of raw texts; missing values are treated as empty text.
            clean: Apply ``clean_text`` first.
            remove_stop: Remove stopwords.
            lemmatize: Lemmatize when True, otherwise stem.
            handle_neg: Apply ``handle_negation``.
            min_word_length: Tokens shorter than this are dropped.

        Returns:
            Series of processed strings sharing the index of ``texts``.
        """
        # Negation words are stopwords in common lists; keep them when negation
        # handling is requested so handle_negation still sees them.
        protected = self.NEGATION_WORDS if handle_neg else None

        processed_texts = []

        for text in texts:
            # Always work on a str; a missing review must not become the token "nan".
            text = '' if self._is_missing(text) else str(text)

            if clean:
                text = self.clean_text(text)

            tokens = self.tokenize(text)

            if remove_stop:
                tokens = self.remove_stopwords(tokens, keep=protected)

            if handle_neg:
                tokens = self.handle_negation(tokens)

            if lemmatize:
                tokens = self.lemmatize(tokens)
            else:
                tokens = self.stem(tokens)

            tokens = [t for t in tokens if len(t) >= min_word_length]

            processed_texts.append(' '.join(tokens))

        return pd.Series(processed_texts, index=texts.index)

# Example usage
# preprocessor = TextPreprocessor()
# processed_texts = preprocessor.preprocess_pipeline(df['review_text'])

2. Text Representation Methods

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import Word2Vec, FastText
from sentence_transformers import SentenceTransformer
import torch
from transformers import AutoTokenizer, AutoModel

class TextVectorizer:
    """Fit several text representations and keep the fitted objects for reuse.

    Every fitting method stores its fitted vectorizer/model in
    ``self.vectorizers`` under a method key ('bow', 'tfidf', 'word2vec',
    'fasttext', 'sentence_transformers', 'contextual'). Use ``transform`` to
    encode new text with an already fitted representation instead of
    re-fitting, which would produce a different, incompatible feature space.
    """

    def __init__(self):
        self.vectorizers = {}
        self.vocabulary = None

    def bag_of_words(self, texts: pd.Series, max_features=10000):
        """Fit a unigram+bigram CountVectorizer and return the document-term matrix.

        Returns the result of ``fit_transform`` (a sparse matrix, not a dense
        ndarray).
        """
        vectorizer = CountVectorizer(
            max_features=max_features,
            ngram_range=(1, 2),  # Unigrams and bigrams
            min_df=2,  # Minimum document frequency
            max_df=0.95  # Maximum document frequency
        )

        X = vectorizer.fit_transform(texts)
        self.vectorizers['bow'] = vectorizer

        print(f"BoW shape: {X.shape}")
        print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")

        return X

    def tfidf(self, texts: pd.Series, max_features=10000):
        """Fit a unigram+bigram TfidfVectorizer and return the TF-IDF matrix.

        This FITS a new vectorizer on ``texts``; call it on training data only
        and use ``transform(texts, 'tfidf')`` for validation/inference data.
        Returns the result of ``fit_transform`` (a sparse matrix).
        """
        vectorizer = TfidfVectorizer(
            max_features=max_features,
            ngram_range=(1, 2),
            min_df=2,
            max_df=0.95,
            sublinear_tf=True,  # Apply sublinear tf scaling
            norm='l2'
        )

        X = vectorizer.fit_transform(texts)
        self.vectorizers['tfidf'] = vectorizer

        print(f"TF-IDF shape: {X.shape}")

        return X

    @staticmethod
    def _average_vectors(tokenized_texts, wv, vector_size, skip_unknown):
        """Average the word vectors of each document into one row.

        Args:
            tokenized_texts: Iterable of token lists.
            wv: Mapping-like object supporting ``word in wv`` and ``wv[word]``.
            vector_size: Dimensionality of the vectors.
            skip_unknown: When True, tokens for which ``word in wv`` is False
                are ignored; when False every token is looked up.

        Returns:
            float32 array of shape ``(n_documents, vector_size)``; documents
            without any usable token get an all-zero row.
        """
        rows = []
        for tokens in tokenized_texts:
            if skip_unknown:
                found = [wv[word] for word in tokens if word in wv]
            else:
                found = [wv[word] for word in tokens]
            if found:
                rows.append(np.mean(found, axis=0))
            else:
                rows.append(np.zeros(vector_size, dtype=np.float32))
        # Fixed dtype and an explicit reshape keep the result 2-D and uniform
        # even for empty input or when zero rows are mixed with averaged rows.
        return np.asarray(rows, dtype=np.float32).reshape(-1, vector_size)

    def word2vec(self, texts: pd.Series, vector_size=100, window=5) -> np.ndarray:
        """Train Word2Vec on whitespace-split ``texts`` and return averaged document vectors."""
        tokenized_texts = [text.split() for text in texts]

        model = Word2Vec(
            sentences=tokenized_texts,
            vector_size=vector_size,
            window=window,
            min_count=2,
            workers=4,
            epochs=10
        )

        self.vectorizers['word2vec'] = model

        # Words below min_count are not in model.wv and are skipped.
        return self._average_vectors(tokenized_texts, model.wv, vector_size, skip_unknown=True)

    def fasttext(self, texts: pd.Series, vector_size=100) -> np.ndarray:
        """Train FastText on whitespace-split ``texts`` and return averaged document vectors.

        Every token is looked up (no vocabulary filter), relying on FastText's
        subword vectors for out-of-vocabulary words.
        """
        from gensim.models import FastText

        tokenized_texts = [text.split() for text in texts]

        model = FastText(
            sentences=tokenized_texts,
            vector_size=vector_size,
            window=5,
            min_count=2,
            workers=4
        )

        self.vectorizers['fasttext'] = model

        return self._average_vectors(tokenized_texts, model.wv, vector_size, skip_unknown=False)

    def sentence_transformers(self, texts: pd.Series, model_name='all-MiniLM-L6-v2') -> np.ndarray:
        """Encode ``texts`` with a pre-trained SentenceTransformer model."""
        model = SentenceTransformer(model_name)

        vectors = model.encode(texts.tolist(), batch_size=32, show_progress_bar=True)

        self.vectorizers['sentence_transformers'] = model

        return vectors

    def contextual_embeddings(self, texts: pd.Series,
                             model_name='bert-base-uncased') -> np.ndarray:
        """Return one embedding per text: the hidden state at position 0 ([CLS]).

        Texts are encoded one at a time and truncated to 512 tokens.
        Only the model (not the tokenizer) is stored under 'contextual'.
        """
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name)

        embeddings = []

        for text in texts:
            inputs = tokenizer(
                text,
                return_tensors='pt',
                truncation=True,
                padding=True,
                max_length=512
            )

            # Inference only: no gradients needed.
            with torch.no_grad():
                outputs = model(**inputs)

            # Position 0 of the last hidden state is the [CLS] token.
            cls_embedding = outputs.last_hidden_state[:, 0, :].numpy()
            embeddings.append(cls_embedding.flatten())

        self.vectorizers['contextual'] = model

        return np.array(embeddings)

    def transform(self, texts, method='tfidf'):
        """Encode ``texts`` with an ALREADY FITTED representation (no re-fitting).

        Args:
            texts: Iterable of strings (preprocessed the same way as the
                training texts).
            method: One of 'bow', 'tfidf', 'word2vec', 'fasttext',
                'sentence_transformers'.

        Returns:
            Features in the same space as the matching fit method produced.

        Raises:
            ValueError: If ``method`` has not been fitted yet or is unsupported.
        """
        if method not in self.vectorizers:
            raise ValueError(
                f"No fitted '{method}' representation; call the fitting method first"
            )
        fitted = self.vectorizers[method]

        if method in ('bow', 'tfidf'):
            return fitted.transform(texts)
        if method in ('word2vec', 'fasttext'):
            tokenized_texts = [text.split() for text in texts]
            return self._average_vectors(
                tokenized_texts,
                fitted.wv,
                fitted.wv.vector_size,
                skip_unknown=(method == 'word2vec'),
            )
        if method == 'sentence_transformers':
            return fitted.encode(list(texts), batch_size=32, show_progress_bar=False)

        raise ValueError(f"transform() does not support method '{method}'")

    def get_feature_names(self, method='tfidf'):
        """Return the feature names of the fitted 'bow'/'tfidf' vectorizer.

        Returns None for any other method. Raises KeyError if the requested
        'bow'/'tfidf' vectorizer has not been fitted.
        """
        if method in ['bow', 'tfidf']:
            return self.vectorizers[method].get_feature_names_out()
        else:
            return None

# Example usage
# vectorizer = TextVectorizer()
# X_tfidf = vectorizer.tfidf(processed_texts)
# X_w2v = vectorizer.word2vec(processed_texts)
# X_sentence = vectorizer.sentence_transformers(texts)

3. Sentiment Analysis Implementation

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
from transformers import pipeline
import torch

class SentimentAnalyzer:
    """Sentiment analysis with TF-IDF + linear models and an optional transformer.

    Trained models live in ``self.models`` keyed by name; the key
    'transformer' is reserved for the fine-tuned transformer, every other key
    is treated as a TF-IDF based model.
    """

    def __init__(self):
        self.models = {}
        self.vectorizer = TextVectorizer()
        self.preprocessor = TextPreprocessor()
        # Tokenizer that matches the fine-tuned transformer (set during training).
        self.transformer_tokenizer = None

    def prepare_data(self, df, text_column, label_column):
        """Preprocess ``df[text_column]`` and split 80/20 into train and test.

        Returns:
            ``(X_train, X_test, y_train, y_test)`` where X are preprocessed texts.
        """
        processed_texts = self.preprocessor.preprocess_pipeline(df[text_column])

        X_train, X_test, y_train, y_test = train_test_split(
            processed_texts,
            df[label_column],
            test_size=0.2,
            random_state=42
        )

        return X_train, X_test, y_train, y_test

    def _fitted_tfidf(self):
        """Return the TF-IDF vectorizer fitted during training.

        Raises:
            RuntimeError: If no TF-IDF vectorizer has been fitted yet.
        """
        fitted = self.vectorizer.vectorizers.get('tfidf')
        if fitted is None:
            raise RuntimeError(
                "TF-IDF vectorizer is not fitted; call train_traditional_models() first"
            )
        return fitted

    @staticmethod
    def _majority_label(votes):
        """Return the most frequent label in ``votes``; ties go to the smallest label."""
        counts = Counter(votes.tolist())
        best = max(counts.values())
        return min(label for label, count in counts.items() if count == best)

    def train_traditional_models(self, X_train, y_train):
        """Fit TF-IDF on ``X_train`` and train logistic regression, naive Bayes and a linear SVM.

        Prints a 5-fold cross-validated accuracy per model, stores the fitted
        models in ``self.models`` and returns them as a dict.
        """
        from sklearn.model_selection import cross_val_score

        # This is the only place TF-IDF is fitted; prediction and evaluation
        # reuse the fitted vectorizer via transform().
        X_train_tfidf = self.vectorizer.tfidf(X_train)

        models = {
            'logistic_regression': LogisticRegression(max_iter=1000, random_state=42),
            'naive_bayes': MultinomialNB(),
            'svm': LinearSVC(random_state=42)
        }

        trained_models = {}

        for name, model in models.items():
            print(f"\nTraining {name}...")
            model.fit(X_train_tfidf, y_train)
            trained_models[name] = model

            # NOTE: the TF-IDF statistics were computed on all of X_train, so
            # these fold scores are slightly optimistic.
            scores = cross_val_score(model, X_train_tfidf, y_train, cv=5, scoring='accuracy')
            print(f"  CV Accuracy: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

        self.models.update(trained_models)
        return trained_models

    def train_transformer_model(self, X_train, y_train, model_name='distilbert-base-uncased'):
        """Fine-tune a binary sequence classifier and store it under 'transformer'.

        Labels are expected to be 0 (negative) / 1 (positive). The tokenizer is
        remembered so that inference uses the one matching ``model_name``.
        """
        from transformers import (
            AutoTokenizer,
            AutoModelForSequenceClassification,
            TrainingArguments,
            Trainer
        )
        from datasets import Dataset

        tokenizer = AutoTokenizer.from_pretrained(model_name)

        def tokenize_function(examples):
            return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=512)

        train_dataset = Dataset.from_dict({'text': X_train.tolist(), 'label': y_train.tolist()})
        train_dataset = train_dataset.map(tokenize_function, batched=True)

        # Name the classes so the inference pipeline reports NEGATIVE/POSITIVE
        # rather than the generic LABEL_0/LABEL_1.
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=2,
            id2label={0: 'NEGATIVE', 1: 'POSITIVE'},
            label2id={'NEGATIVE': 0, 'POSITIVE': 1}
        )

        training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=3,
            per_device_train_batch_size=16,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
        )

        trainer.train()

        self.models['transformer'] = model
        self.transformer_tokenizer = tokenizer
        return model

    def predict_sentiment(self, texts, method='ensemble'):
        """Predict sentiment labels for raw ``texts``.

        Args:
            texts: Iterable of raw (un-preprocessed) review strings.
            method: 'ensemble' for a majority vote of the TF-IDF models, or
                'transformer' for the fine-tuned transformer.

        Returns:
            Array of labels for 'ensemble'; list of 0/1 ints for 'transformer'.

        Raises:
            RuntimeError: If the ensemble is requested before training.
            ValueError: If ``method`` is not recognized.
        """
        if method == 'ensemble':
            classical = {
                name: model for name, model in self.models.items()
                if name != 'transformer'
            }
            if not classical:
                raise RuntimeError(
                    "No traditional models trained; call train_traditional_models() first"
                )

            processed_texts = self.preprocessor.preprocess_pipeline(pd.Series(texts))

            # transform (not fit) so features line up with what the models saw.
            X_tfidf = self._fitted_tfidf().transform(processed_texts)

            all_preds = np.array([model.predict(X_tfidf) for model in classical.values()])

            # One vote per model for each document (columns of all_preds).
            return np.array([self._majority_label(votes) for votes in all_preds.T])

        if method == 'transformer':
            tokenizer = getattr(self, 'transformer_tokenizer', None) or "distilbert-base-uncased"
            sentiment_pipeline = pipeline(
                "sentiment-analysis",
                model=self.models['transformer'],
                tokenizer=tokenizer
            )

            results = sentiment_pipeline(list(texts), truncation=True)
            # LABEL_1 covers models saved without explicit label names.
            return [1 if r['label'] in ('POSITIVE', 'LABEL_1') else 0 for r in results]

        raise ValueError(f"Unknown prediction method: {method!r}")

    def evaluate_models(self, X_test, y_test):
        """Score every TF-IDF model on preprocessed ``X_test`` and print a comparison table.

        Returns:
            DataFrame indexed by model name with accuracy and weighted
            precision/recall/F1 columns.
        """
        from sklearn.metrics import (
            accuracy_score,
            precision_score,
            recall_score,
            f1_score,
        )

        # Reuse the training-time vocabulary and IDF weights.
        X_test_tfidf = self._fitted_tfidf().transform(X_test)

        results = {}

        for name, model in self.models.items():
            if name != 'transformer':
                y_pred = model.predict(X_test_tfidf)

                results[name] = {
                    'accuracy': accuracy_score(y_test, y_pred),
                    'precision': precision_score(y_test, y_pred, average='weighted'),
                    'recall': recall_score(y_test, y_pred, average='weighted'),
                    'f1': f1_score(y_test, y_pred, average='weighted')
                }

        comparison = pd.DataFrame(results).T
        print("\nModel Comparison:")
        print("=" * 60)
        print(comparison)

        return comparison

# Example usage
# sentiment_analyzer = SentimentAnalyzer()
# X_train, X_test, y_train, y_test = sentiment_analyzer.prepare_data(df, 'review_text', 'sentiment')
# models = sentiment_analyzer.train_traditional_models(X_train, y_train)
# comparison = sentiment_analyzer.evaluate_models(X_test, y_test)

4. Topic Modeling

from sklearn.decomposition import LatentDirichletAllocation, NMF
from gensim.models import LdaMulticore, CoherenceModel
from gensim.corpora import Dictionary
import pyLDAvis
import pyLDAvis.gensim_models

class TopicModeler:
    """Topic modeling with gensim LDA, scikit-learn LDA and NMF.

    ``self.models`` holds fitted models by name. The gensim entry ('lda') is a
    dict with keys 'model', 'dictionary', 'corpus' and 'coherence'; the
    scikit-learn entries ('sklearn_lda', 'nmf') are the estimators themselves.
    """

    def __init__(self):
        self.models = {}
        self.topics = None

    def lda_gensim(self, texts: List[List[str]], n_topics=10):
        """Train gensim LDA on tokenized ``texts`` and compute c_v coherence.

        Returns:
            ``(lda_model, coherence_score)``.
        """
        dictionary = Dictionary(texts)

        # Drop tokens in fewer than 5 documents or in more than 50% of them.
        dictionary.filter_extremes(no_below=5, no_above=0.5)

        corpus = [dictionary.doc2bow(text) for text in texts]

        lda_model = LdaMulticore(
            corpus=corpus,
            id2word=dictionary,
            num_topics=n_topics,
            random_state=42,
            passes=10,
            workers=4
        )

        coherence_model = CoherenceModel(
            model=lda_model,
            texts=texts,
            dictionary=dictionary,
            coherence='c_v'
        )
        coherence_score = coherence_model.get_coherence()

        print(f"LDA Coherence Score: {coherence_score:.4f}")

        self.models['lda'] = {
            'model': lda_model,
            'dictionary': dictionary,
            'corpus': corpus,
            'coherence': coherence_score
        }

        return lda_model, coherence_score

    def sklearn_lda(self, texts_tfidf, n_topics=10, feature_names=None):
        """Fit scikit-learn LDA on a document-term matrix and optionally print topics.

        NOTE(review): LDA models word counts; despite the parameter name,
        a bag-of-words count matrix is presumably the better input than TF-IDF.
        """
        lda = LatentDirichletAllocation(
            n_components=n_topics,
            max_iter=10,
            learning_method='online',
            random_state=42
        )

        lda.fit(texts_tfidf)

        if feature_names is not None:
            self.display_topics(lda, feature_names, n_words=10)

        self.models['sklearn_lda'] = lda
        return lda

    def nmf(self, texts_tfidf, n_topics=10, feature_names=None):
        """Fit NMF on a TF-IDF matrix and optionally print topics."""
        nmf = NMF(
            n_components=n_topics,
            random_state=42,
            max_iter=500
        )

        nmf.fit(texts_tfidf)

        if feature_names is not None:
            self.display_topics(nmf, feature_names, n_words=10)

        self.models['nmf'] = nmf
        return nmf

    @staticmethod
    def _top_words(model, feature_names, n_words=10):
        """Return, per row of ``model.components_``, the ``n_words`` highest-weighted feature names."""
        return [
            # argsort is ascending; the reversed tail slice yields the
            # largest weights first.
            [feature_names[i] for i in topic.argsort()[:-n_words - 1:-1]]
            for topic in model.components_
        ]

    def display_topics(self, model, feature_names, n_words=10):
        """Print the top ``n_words`` words of every topic of a scikit-learn style model."""
        print("\nTopics:")
        print("=" * 60)

        for topic_idx, top_words in enumerate(self._top_words(model, feature_names, n_words)):
            print(f"Topic {topic_idx + 1}: {', '.join(top_words)}")

    def get_document_topics(self, texts_tfidf, model_name='nmf'):
        """Return the topic distribution of each document under a fitted model.

        Args:
            texts_tfidf: Document-term matrix for scikit-learn models; for the
                gensim model ('lda') a bag-of-words corpus built with the
                stored dictionary.
            model_name: Key in ``self.models``.
        """
        entry = self.models[model_name]

        # lda_gensim stores a dict; the actual model sits under 'model'.
        model = entry['model'] if isinstance(entry, dict) else entry

        if hasattr(model, 'transform'):
            topic_dist = model.transform(texts_tfidf)
        else:
            topic_dist = model[texts_tfidf]

        return topic_dist

    def visualize_topics(self, texts_tfidf, dictionary=None):
        """Write a pyLDAvis page for the gensim LDA model to 'lda_visualization.html'.

        Args:
            texts_tfidf: Unused; kept for interface compatibility.
            dictionary: gensim dictionary; defaults to the one stored by
                ``lda_gensim``.
        """
        entry = self.models.get('lda')
        if entry is None:
            print("No gensim LDA model trained; nothing to visualize")
            return

        if dictionary is None:
            dictionary = entry['dictionary']

        vis_data = pyLDAvis.gensim_models.prepare(
            entry['model'],
            entry['corpus'],
            dictionary
        )
        pyLDAvis.save_html(vis_data, 'lda_visualization.html')
        print("Topic visualization saved to lda_visualization.html")

# Example usage
# topic_modeler = TopicModeler()
# tokenized_texts = [text.split() for text in processed_texts]
# lda_model, coherence = topic_modeler.lda_gensim(tokenized_texts, n_topics=10)

💡

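Pro Tip: When choosing between LDA and NMF, remember that LDA gives a probabilistic topic mixture for every document, which is easy to interpret, while NMF on TF-IDF features often produces more coherent topics, especially on short texts such as reviews. Compare candidate models using topic coherence scores rather than perplexity alone: perplexity is only defined for LDA and does not reliably track how interpretable the topics are.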

5. Real-World Application: Product Review Analysis

class ProductReviewAnalyzer:
    """End-to-end review analysis: preprocessing, sentiment, topics, insights."""

    def __init__(self):
        self.preprocessor = TextPreprocessor()
        self.vectorizer = TextVectorizer()
        self.sentiment_analyzer = SentimentAnalyzer()
        self.topic_modeler = TopicModeler()

    def analyze_reviews(self, df, text_column='review_text'):
        """Run the full analysis on ``df`` and return ``(df, insights)``.

        Adds a 'sentiment' column to ``df`` in place. The sentiment analyzer
        must already have trained models.
        """
        print("Step 1: Preprocessing text...")
        processed_texts = self.preprocessor.preprocess_pipeline(df[text_column])

        print("\nStep 2: Creating text representations...")
        # Fits and stores the TF-IDF vectorizer on self.vectorizer; the matrix
        # itself is not used further in this method.
        self.vectorizer.tfidf(processed_texts)

        print("\nStep 3: Sentiment analysis...")
        # predict_sentiment preprocesses its input itself, so hand it the RAW
        # text; preprocessing twice would mangle tokens such as "not_good".
        sentiments = self.sentiment_analyzer.predict_sentiment(df[text_column].tolist())
        df['sentiment'] = sentiments

        print("\nStep 4: Topic modeling...")
        tokenized_texts = [text.split() for text in processed_texts]
        self.topic_modeler.lda_gensim(tokenized_texts, n_topics=5)

        print("\nStep 5: Extracting insights...")
        insights = self.extract_insights(df, sentiments, text_column=text_column)

        return df, insights

    def extract_insights(self, df, sentiments, text_column='review_text'):
        """Summarize sentiment counts, rating, frequent aspects and recommendations.

        Args:
            df: Reviews; must contain ``text_column``, may contain 'rating'.
            sentiments: Per-row labels (1 = positive, 0 = negative), any
                array-like aligned with ``df``.
            text_column: Column holding the review text.

        Returns:
            Dict with keys 'total_reviews', 'sentiment_distribution',
            'average_rating', 'top_positive_aspects', 'top_negative_aspects'
            and 'recommendations'.
        """
        # Lists compare as a whole with ==; an array gives element-wise masks.
        labels = np.asarray(sentiments)
        positive_mask = labels == 1
        negative_mask = labels == 0

        insights = {
            'total_reviews': len(df),
            'sentiment_distribution': {
                'positive': int(positive_mask.sum()),
                'negative': int(negative_mask.sum())
            },
            'average_rating': df['rating'].mean() if 'rating' in df.columns else None,
            'top_positive_aspects': self._extract_aspects(df[positive_mask], text_column),
            'top_negative_aspects': self._extract_aspects(df[negative_mask], text_column),
        }
        # Recommendations read the finished summary, so they are added after
        # the dict exists rather than inside its own literal.
        insights['recommendations'] = self._generate_recommendations(insights)

        return insights

    def _extract_aspects(self, df_subset, text_column='review_text'):
        """Return the 10 most common lower-cased noun chunks as ``(phrase, count)`` pairs.

        Only the first 10,000 characters of the concatenated text are parsed.
        Returns an empty list when ``df_subset`` has no rows.
        """
        if df_subset.empty:
            return []

        all_text = ' '.join(df_subset[text_column].dropna().astype(str))

        doc = self.preprocessor.nlp(all_text[:10000])  # Limit for performance

        noun_phrases = [chunk.text.lower() for chunk in doc.noun_chunks]

        return Counter(noun_phrases).most_common(10)

    def _generate_recommendations(self, insights):
        """Derive recommendation strings from the sentiment share and top complaint."""
        recommendations = []

        total = insights['total_reviews']
        if total:  # no share can be computed for an empty data set
            pos_pct = insights['sentiment_distribution']['positive'] / total

            if pos_pct > 0.8:
                recommendations.append("Strong positive sentiment - leverage in marketing")
            elif pos_pct < 0.5:
                recommendations.append("Investigate negative sentiment drivers")

        if insights['top_negative_aspects']:
            top_negative = insights['top_negative_aspects'][0][0]
            recommendations.append(f"Address common complaint: {top_negative}")

        return recommendations

# Example usage
# analyzer = ProductReviewAnalyzer()
# analyzed_df, insights = analyzer.analyze_reviews(df)

6. Common Follow-Up Questions

Follow-up 1: How do you handle out-of-vocabulary words?

def handle_oov_words(texts, method='fasttext'):
    """Build a model/vectorizer/tokenizer that can represent out-of-vocabulary words.

    Args:
        texts: Iterable of strings used as the training corpus.
        method: 'fasttext' (subword embeddings), 'character_ngrams'
            (hashed character n-grams) or 'bpe' (byte pair encoding).

    Returns:
        A trained gensim FastText model, an unfitted HashingVectorizer (it is
        stateless, so ``texts`` is not needed), or a trained ``tokenizers``
        Tokenizer, depending on ``method``.

    Raises:
        ValueError: If ``method`` is not one of the supported values.
    """
    if method == 'fasttext':
        # FastText uses subword information
        from gensim.models import FastText

        tokenized_texts = [text.split() for text in texts]

        return FastText(
            sentences=tokenized_texts,
            vector_size=100,
            min_count=2,
            workers=4
        )

    if method == 'character_ngrams':
        # Character n-grams can represent OOV words
        from sklearn.feature_extraction.text import HashingVectorizer

        return HashingVectorizer(
            analyzer='char_wb',
            ngram_range=(2, 4),
            n_features=2**18
        )

    if method == 'bpe':
        # Byte Pair Encoding (used by GPT-style models; BERT uses the related WordPiece)
        from tokenizers import Tokenizer, models, trainers, pre_tokenizers

        tokenizer = Tokenizer(models.BPE())
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

        trainer = trainers.BpeTrainer(vocab_size=30000)
        # Train on the in-memory corpus; there are no files to read from.
        tokenizer.train_from_iterator(texts, trainer=trainer)

        return tokenizer

    raise ValueError(
        f"Unknown method {method!r}; expected 'fasttext', 'character_ngrams' or 'bpe'"
    )

Follow-up 2: How do you deploy NLP models at scale?

class NLPDeployment:
    """Helpers for serving NLP models: ONNX export, an API skeleton, batching."""

    def __init__(self):
        self.models = {}

    def export_to_onnx(self, model, tokenizer, output_path):
        """Export ``model`` to ONNX at ``output_path``.

        The graph is traced with a 128-token dummy input; batch size and
        sequence length are declared dynamic for the inputs, batch size for
        the 'logits' output.
        """
        import torch

        # Put the model in inference mode before tracing.
        model.eval()

        dummy_input = tokenizer(
            "Sample text",
            return_tensors="pt",
            padding="max_length",
            max_length=128,
            truncation=True
        )

        torch.onnx.export(
            model,
            (dummy_input['input_ids'], dummy_input['attention_mask']),
            output_path,
            input_names=['input_ids', 'attention_mask'],
            output_names=['logits'],
            dynamic_axes={
                'input_ids': {0: 'batch_size', 1: 'sequence_length'},
                'attention_mask': {0: 'batch_size', 1: 'sequence_length'},
                'logits': {0: 'batch_size'}
            }
        )

    def create_inference_server(self, model_path):
        """Return a FastAPI app exposing POST /predict.

        This is a skeleton: ``model_path`` is not used yet and the endpoint
        returns a fixed placeholder response.
        """
        from fastapi import FastAPI
        from pydantic import BaseModel

        app = FastAPI()

        class PredictionRequest(BaseModel):
            text: str

        class PredictionResponse(BaseModel):
            sentiment: str
            confidence: float

        @app.post("/predict", response_model=PredictionResponse)
        def predict(request: PredictionRequest):
            # Load model and predict
            # This is a simplified example
            return PredictionResponse(
                sentiment="positive",
                confidence=0.95
            )

        return app

    def predict_batch(self, batch):
        """Predict for one batch of texts. Override this, or pass ``predict_fn``.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError(
            "Override predict_batch() or pass predict_fn to create_batch_pipeline()"
        )

    def create_batch_pipeline(self, texts, batch_size=32, predict_fn=None):
        """Run predictions over ``texts`` in consecutive slices of ``batch_size``.

        Args:
            texts: Sliceable sequence of inputs.
            batch_size: Number of items per batch; must be positive.
            predict_fn: Callable taking one batch and returning an iterable of
                results. Defaults to ``self.predict_batch``.

        Returns:
            Flat list of results in input order.

        Raises:
            ValueError: If ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")

        run_batch = predict_fn if predict_fn is not None else self.predict_batch

        results = []

        for start in range(0, len(texts), batch_size):
            results.extend(run_batch(texts[start:start + batch_size]))

        return results

Company-Specific Tips

ℹ️

Google Tips:

Google heavily tests on transformer architectures
Know how to fine-tune BERT for specific tasks
Understand attention mechanisms and their interpretation
Be comfortable with distributed training for NLP

Meta Tips:

Meta focuses on multilingual NLP
Know how to handle low-resource languages
Understand how to build conversational AI
Be familiar with large language model deployment

NLP: Text Processing, TF-IDF, Word Embeddings

NLP: Text Processing, TF-IDF, Word Embeddings

The Interview Question

Detailed Answer

1. Text Preprocessing Pipeline

2. Text Representation Methods

3. Sentiment Analysis Implementation

4. Topic Modeling

5. Real-World Application: Product Review Analysis

6. Common Follow-Up Questions

Company-Specific Tips

Quiz Section

Related Topics