Recommendation Systems: Collaborative and Content-Based
Recommendation systems power personalized experiences on platforms like Netflix, Amazon, and Spotify. This lesson covers the core algorithms.
Collaborative Filtering
<svg width="600" height="400" viewBox="0 0 600 400" xmlns="http://www.w3.org/2000/svg">
<rect width="600" height="400" fill="#f8f9fa" rx="10"/>
<text x="300" y="30" text-anchor="middle" font-size="18" font-weight="bold" fill="#2c3e50">Collaborative Filtering</text>
<!-- User-Item Matrix -->
<text x="50" y="65" font-size="12" font-weight="bold" fill="#2c3e50">User-Item Interaction Matrix</text>
<!-- Users -->
<text x="80" y="90" font-size="10" fill="#7f8c8d">Users</text>
<rect x="100" y="80" width="60" height="25" fill="#3498db" rx="3"/>
<text x="130" y="97" text-anchor="middle" font-size="9" fill="white">User 1</text>
<rect x="100" y="110" width="60" height="25" fill="#3498db" rx="3"/>
<text x="130" y="127" text-anchor="middle" font-size="9" fill="white">User 2</text>
<rect x="100" y="140" width="60" height="25" fill="#3498db" rx="3"/>
<text x="130" y="157" text-anchor="middle" font-size="9" fill="white">User 3</text>
<!-- Items -->
<text x="200" y="80" font-size="10" fill="#7f8c8d">Items</text>
<rect x="230" y="80" width="50" height="25" fill="#2ecc71" rx="3"/>
<text x="255" y="97" text-anchor="middle" font-size="9" fill="white">Item A</text>
<rect x="290" y="80" width="50" height="25" fill="#2ecc71" rx="3"/>
<text x="315" y="97" text-anchor="middle" font-size="9" fill="white">Item B</text>
<rect x="350" y="80" width="50" height="25" fill="#2ecc71" rx="3"/>
<text x="375" y="97" text-anchor="middle" font-size="9" fill="white">Item C</text>
<rect x="410" y="80" width="50" height="25" fill="#2ecc71" rx="3"/>
<text x="435" y="97" text-anchor="middle" font-size="9" fill="white">Item D</text>
<!-- Ratings -->
<rect x="230" y="110" width="50" height="25" fill="#f39c12" rx="3"/>
<text x="255" y="127" text-anchor="middle" font-size="9" fill="white">5</text>
<rect x="290" y="110" width="50" height="25" fill="#f39c12" rx="3"/>
<text x="315" y="127" text-anchor="middle" font-size="9" fill="white">3</text>
<rect x="350" y="110" width="50" height="25" fill="#f39c12" rx="3"/>
<text x="375" y="127" text-anchor="middle" font-size="9" fill="white">?</text>
<rect x="410" y="110" width="50" height="25" fill="#f39c12" rx="3"/>
<text x="435" y="127" text-anchor="middle" font-size="9" fill="white">4</text>
<!-- Goal -->
<text x="300" y="180" text-anchor="middle" font-size="12" fill="#2c3e50">Goal: Predict missing ratings (?)</text>
<!-- Methods -->
<rect x="50" y="210" width="230" height="120" fill="white" stroke="#3498db" stroke-width="2" rx="5"/>
<text x="165" y="235" text-anchor="middle" font-size="12" font-weight="bold" fill="#3498db">Memory-Based CF</text>
<text x="70" y="260" font-size="10" fill="#2c3e50">• User-User: Find similar users</text>
<text x="70" y="280" font-size="10" fill="#2c3e50">• Item-Item: Find similar items</text>
<text x="70" y="300" font-size="10" fill="#2c3e50">• Cosine/Pearson similarity</text>
<rect x="320" y="210" width="230" height="120" fill="white" stroke="#2ecc71" stroke-width="2" rx="5"/>
<text x="435" y="235" text-anchor="middle" font-size="12" font-weight="bold" fill="#2ecc71">Model-Based CF</text>
<text x="340" y="260" font-size="10" fill="#2c3e50">• Matrix Factorization (SVD)</text>
<text x="340" y="280" font-size="10" fill="#2c3e50">• Deep Learning (Neural CF)</text>
<text x="340" y="300" font-size="10" fill="#2c3e50">• Handles sparsity better</text>
<text x="300" y="360" text-anchor="middle" font-size="11" fill="#7f8c8d">Similarity: sim(u,v) = Σ(r_ui - r̄_u)(r_vi - r̄_v) / √(Σ(r_ui - r̄_u)² × Σ(r_vi - r̄_v)²)</text>
</svg>
User-User Collaborative Filtering
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
class UserBasedCF:
    """User-user collaborative filtering over a long-format ratings table.

    The ratings frame is read through its ``user_id``, ``item_id`` and
    ``rating`` columns. Call :meth:`compute_similarity` once (it is also
    triggered lazily by :meth:`predict`) and then use :meth:`predict` or
    :meth:`recommend`.
    """

    def __init__(self, ratings_df):
        self.ratings = ratings_df
        self.user_similarity = None

    def compute_similarity(self):
        """Compute the user-user cosine similarity matrix.

        Builds a user x item matrix in which unrated cells are filled with 0
        and stores the pairwise cosine similarities in
        ``self.user_similarity`` as a DataFrame labelled by ``user_id`` on
        both axes.

        Returns:
            self, so calls can be chained.
        """
        # Create user-item matrix
        self.user_item_matrix = self.ratings.pivot_table(
            index='user_id', columns='item_id', values='rating'
        ).fillna(0)

        # Compute cosine similarity
        similarity = cosine_similarity(self.user_item_matrix)
        self.user_similarity = pd.DataFrame(
            similarity,
            index=self.user_item_matrix.index,
            columns=self.user_item_matrix.index,
        )
        return self

    def predict(self, user_id, item_id, k=5):
        """Predict the rating of ``item_id`` by ``user_id``.

        The prediction is the similarity-weighted average of the ratings
        given to the item by the ``k`` users most similar to ``user_id``.
        The target user is never used as their own neighbour.

        Args:
            user_id: user to predict for; must be a column label of the
                similarity matrix (``KeyError`` otherwise).
            item_id: item whose rating is predicted.
            k: maximum number of neighbours to use.

        Returns:
            The predicted rating as a float, or ``0.0`` when no other user
            has rated the item.
        """
        if self.user_similarity is None:
            self.compute_similarity()

        # One rating per neighbour, keyed by user_id. Keying by user (rather
        # than relying on row order) keeps each rating paired with the right
        # similarity weight; duplicate (user, item) rows are averaged.
        item_rows = self.ratings[self.ratings['item_id'] == item_id]
        neighbour_ratings = item_rows.groupby('user_id')['rating'].mean()
        neighbour_ratings = neighbour_ratings.drop(user_id, errors='ignore')
        if neighbour_ratings.empty:
            return 0.0

        # Similarity of the target user to every candidate neighbour
        sim_scores = (
            self.user_similarity[user_id]
            .reindex(neighbour_ratings.index)
            .dropna()
        )
        top_k = sim_scores.nlargest(k)

        # Weighted average of ratings, aligned on user_id
        weights = top_k.to_numpy(dtype=float)
        values = neighbour_ratings.loc[top_k.index].to_numpy(dtype=float)
        # The small epsilon avoids a division by zero when all weights are 0
        return float(np.dot(weights, values) / (weights.sum() + 1e-8))

    def recommend(self, user_id, n_recommendations=10):
        """Generate the top-N recommendations for a user.

        Args:
            user_id: user to recommend for.
            n_recommendations: maximum number of items to return.

        Returns:
            A list of ``(item_id, predicted_rating)`` tuples for items the
            user has not rated, sorted by predicted rating, highest first.
        """
        # Get items not rated by user
        user_items = set(
            self.ratings[self.ratings['user_id'] == user_id]['item_id']
        )
        unrated_items = set(self.ratings['item_id']) - user_items

        # Predict ratings for unrated items
        predictions = [
            (item_id, self.predict(user_id, item_id))
            for item_id in unrated_items
        ]

        # Sort by predicted rating
        predictions.sort(key=lambda pair: pair[1], reverse=True)
        return predictions[:n_recommendations]
Item-Item Collaborative Filtering
class ItemBasedCF:
    """Item-item collaborative filtering over a long-format ratings table.

    The ratings frame is read through its ``user_id``, ``item_id`` and
    ``rating`` columns.
    """

    def __init__(self, ratings_df):
        self.ratings = ratings_df
        self.item_similarity = None

    def compute_similarity(self):
        """Compute the item-item cosine similarity matrix.

        Builds an item x user matrix in which missing ratings are filled
        with 0 and stores the pairwise cosine similarities in
        ``self.item_similarity`` as a DataFrame labelled by ``item_id`` on
        both axes.

        Returns:
            self, so calls can be chained.
        """
        # Create item-user matrix (transpose)
        self.item_user_matrix = self.ratings.pivot_table(
            index='item_id', columns='user_id', values='rating'
        ).fillna(0)

        # Compute cosine similarity
        similarity = cosine_similarity(self.item_user_matrix)
        self.item_similarity = pd.DataFrame(
            similarity,
            index=self.item_user_matrix.index,
            columns=self.item_user_matrix.index,
        )
        return self

    def predict(self, user_id, item_id, k=5):
        """Predict the rating of ``item_id`` by ``user_id`` using similar items.

        The prediction is the similarity-weighted average of the user's own
        ratings on the ``k`` items most similar to ``item_id``. The target
        item itself is never used as a neighbour.

        Args:
            user_id: user to predict for.
            item_id: item whose rating is predicted; must be a column label
                of the similarity matrix (``KeyError`` otherwise).
            k: maximum number of neighbouring items to use.

        Returns:
            The predicted rating as a float, or ``0.0`` when the user has
            rated no other item.
        """
        if self.item_similarity is None:
            self.compute_similarity()

        # The user's ratings keyed by item_id, so each rating stays paired
        # with the similarity of the item it belongs to.
        user_rows = self.ratings[self.ratings['user_id'] == user_id]
        rated = user_rows.groupby('item_id')['rating'].mean()
        rated = rated.drop(item_id, errors='ignore')
        if rated.empty:
            return 0.0

        # Get similarity scores with items the user has rated
        sim_scores = (
            self.item_similarity[item_id].reindex(rated.index).dropna()
        )

        # Get top k similar items (selected by label, not by position)
        top_k = sim_scores.nlargest(k)

        # Weighted average, aligned on item_id
        weights = top_k.to_numpy(dtype=float)
        values = rated.loc[top_k.index].to_numpy(dtype=float)
        # The small epsilon avoids a division by zero when all weights are 0
        return float(np.dot(weights, values) / (weights.sum() + 1e-8))
Matrix Factorization (SVD)
from scipy.sparse.linalg import svds
from sklearn.decomposition import TruncatedSVD
class MatrixFactorization:
    """Rating prediction via truncated SVD of the mean-centred rating matrix.

    Args:
        ratings_df: long-format ratings with ``user_id``, ``item_id`` and
            ``rating`` columns.
        n_factors: requested number of latent factors. It is capped at the
            smaller dimension of the user x item matrix during :meth:`fit`.
        rating_range: ``(low, high)`` bounds that :meth:`predict` clips to.
    """

    def __init__(self, ratings_df, n_factors=50, rating_range=(1, 5)):
        self.ratings = ratings_df
        self.n_factors = n_factors
        self.rating_range = rating_range
        self.user_factors = None
        self.item_factors = None

    def fit(self):
        """Decompose the rating matrix using SVD.

        Each user's ratings are centred on that user's mean over the items
        they actually rated; unrated cells then contribute 0 (i.e. "equal to
        the user's mean") to the factorised matrix.

        Returns:
            self, so calls can be chained.
        """
        # Create user-item matrix; NaN marks "not rated"
        observed = self.ratings.pivot_table(
            index='user_id', columns='item_id', values='rating'
        )
        self.user_item_matrix = observed.fillna(0)

        # Normalize by user mean. The mean is taken over observed ratings
        # only (NaNs are skipped) so unrated items do not drag it toward 0.
        row_means = observed.mean(axis=1)
        user_means = row_means.values.reshape(-1, 1)
        matrix_normalized = (
            observed.sub(row_means, axis=0).fillna(0).to_numpy(dtype=float)
        )

        # SVD decomposition. svds only supports k < min(shape), so fall back
        # to a dense SVD when every available factor is requested.
        max_rank = min(matrix_normalized.shape)
        n_factors = min(self.n_factors, max_rank)
        if n_factors < max_rank:
            U, sigma, Vt = svds(matrix_normalized, k=n_factors)
        else:
            U, sigma, Vt = np.linalg.svd(matrix_normalized, full_matrices=False)
        sigma = np.diag(sigma)

        # Store factors
        self.user_factors = U
        self.item_factors = Vt.T
        self.sigma = sigma
        self.user_means = user_means
        self.user_ids = self.user_item_matrix.index
        self.item_ids = self.user_item_matrix.columns
        return self

    def predict(self, user_id, item_id):
        """Predict the rating for a user-item pair.

        Returns:
            The reconstructed rating as a float, clipped to
            ``self.rating_range``.

        Raises:
            IndexError: if ``user_id`` or ``item_id`` was not seen in
                :meth:`fit`.
        """
        user_idx = np.where(self.user_ids == user_id)[0][0]
        item_idx = np.where(self.item_ids == item_id)[0][0]

        # Reconstruct rating: user mean + u_i . Sigma . v_j
        interaction = np.dot(
            self.user_factors[user_idx],
            np.dot(self.sigma, self.item_factors[item_idx]),
        )
        prediction = self.user_means[user_idx, 0] + interaction

        low, high = self.rating_range
        return float(np.clip(prediction, low, high))  # Clip to valid rating range

    def recommend(self, user_id, n_recommendations=10):
        """Generate recommendations for a user.

        Returns:
            A list of ``(item_id, predicted_rating)`` tuples for items the
            user has not rated, highest prediction first. Scores are not
            clipped because they are only used for ranking.

        Raises:
            IndexError: if ``user_id`` was not seen in :meth:`fit`.
        """
        user_idx = np.where(self.user_ids == user_id)[0][0]

        # Predict all ratings for this user
        interactions = np.dot(
            self.user_factors[user_idx],
            np.dot(self.sigma, self.item_factors.T),
        )
        predictions = self.user_means[user_idx] + interactions

        # Get unrated items
        rated_items = self.ratings[
            self.ratings['user_id'] == user_id
        ]['item_id'].values
        unrated_mask = ~np.isin(self.item_ids, rated_items)

        # Sort predictions
        item_predictions = list(
            zip(self.item_ids[unrated_mask], predictions[unrated_mask])
        )
        item_predictions.sort(key=lambda pair: pair[1], reverse=True)
        return item_predictions[:n_recommendations]
Content-Based Filtering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
class ContentBasedRecommender:
    """Recommend items whose text content is similar to given items.

    ``items_df`` is read through its ``item_id``, ``title`` and ``genres``
    columns plus whatever text columns are passed to :meth:`fit`. All
    lookups into the similarity matrix are positional, so the frame may use
    any index.
    """

    def __init__(self, items_df):
        self.items = items_df
        self.tfidf_matrix = None
        self.similarity_matrix = None

    def fit(self, text_columns=None):
        """Build the content similarity matrix.

        Args:
            text_columns: columns to concatenate into one text per item.
                Defaults to ``['title', 'description', 'genres']``.

        Returns:
            self, so calls can be chained.

        Note:
            Adds a ``content`` column to the items frame.
        """
        if text_columns is None:
            text_columns = ['title', 'description', 'genres']
        else:
            text_columns = list(text_columns)

        # Combine text features (missing values become empty strings)
        self.items['content'] = (
            self.items[text_columns].fillna('').astype(str).agg(' '.join, axis=1)
        )

        # TF-IDF vectorization
        tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
        self.tfidf_matrix = tfidf.fit_transform(self.items['content'])

        # Compute cosine similarity as a plain dot product of the TF-IDF rows
        self.similarity_matrix = linear_kernel(self.tfidf_matrix, self.tfidf_matrix)
        return self

    def recommend(self, item_id, n_recommendations=10):
        """Recommend items similar to ``item_id`` based on content.

        Returns:
            A DataFrame with ``item_id``, ``title`` and ``genres`` of the
            most similar items, most similar first. The query item is never
            included, even when another item ties with it on similarity.

        Raises:
            IndexError: if ``item_id`` is not present in the items frame.
        """
        matches = np.flatnonzero((self.items['item_id'] == item_id).to_numpy())
        item_idx = matches[0]

        # Get similarity scores, highest first; the stable sort keeps ties in
        # row order.
        scores = np.asarray(self.similarity_matrix[item_idx], dtype=float)
        ranked = np.argsort(-scores, kind='stable')

        # Exclude the item itself by position, not by assuming it sorts first
        ranked = ranked[ranked != item_idx][:n_recommendations]

        # Get recommendations
        return self.items.iloc[ranked][['item_id', 'title', 'genres']]

    def recommend_for_user(self, user_history, n_recommendations=10):
        """Recommend items based on a user's viewing history.

        Args:
            user_history: iterable of ``item_id`` values the user has
                interacted with.
            n_recommendations: maximum number of items to return.

        Returns:
            A DataFrame with ``item_id``, ``title`` and ``genres`` of the
            unwatched items with the highest average similarity to the
            history. It is empty when none of the history items are known.
        """
        columns = ['item_id', 'title', 'genres']

        # Positional mask of items the user has interacted with
        watched_mask = self.items['item_id'].isin(user_history).to_numpy()
        if not watched_mask.any():
            return self.items.iloc[0:0][columns]

        # Average similarity scores over the watched rows
        avg_similarities = np.mean(self.similarity_matrix[watched_mask], axis=0)

        # Exclude already watched: rank only the remaining candidates
        candidates = np.flatnonzero(~watched_mask)
        order = np.argsort(-avg_similarities[candidates], kind='stable')

        # Get top recommendations
        top_indices = candidates[order][:n_recommendations]
        return self.items.iloc[top_indices][columns]
Evaluation Metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error
def evaluate_recommendation_system(predictions, actuals, k=10):
    """Evaluate recommendation quality with binary relevance.

    Args:
        predictions: ranked sequence of recommended item ids, best first.
        actuals: collection of item ids that are truly relevant.
        k: cut-off rank; only ``predictions[:k]`` is scored.

    Returns:
        A dict with ``'Precision@K'``, ``'Recall@K'`` and ``'NDCG@K'``.

    Raises:
        ValueError: if ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")

    top_k = list(predictions[:k])
    relevant = set(actuals)
    hits = len(set(top_k) & relevant)

    # Precision@K
    precision = hits / k

    # Recall@K
    recall = hits / len(relevant) if relevant else 0

    # NDCG@K
    def dcg(scores):
        scores = np.asarray(scores, dtype=float)
        return np.sum(scores / np.log2(np.arange(2, len(scores) + 2)))

    # Credit each relevant item once, at its first position in the ranking,
    # so a repeated hit cannot push DCG above the ideal.
    seen = set()
    relevance = []
    for item in top_k:
        relevance.append(1 if item in relevant and item not in seen else 0)
        seen.add(item)

    # The ideal ranking puts every relevant item (up to k of them) first. It
    # depends on how many relevant items exist, not on how many were found.
    ideal_dcg = dcg([1] * min(len(relevant), k))
    ndcg = float(dcg(relevance) / ideal_dcg) if ideal_dcg > 0 else 0.0

    return {
        'Precision@K': precision,
        'Recall@K': recall,
        'NDCG@K': ndcg
    }
# Cross-validation
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate
# Reader configured for ratings on a 1-5 scale
reader = Reader(rating_scale=(1, 5))
# NOTE: ratings_df is not defined in this snippet; it must already exist and
# provide the user_id, item_id and rating columns selected here.
data = Dataset.load_from_df(ratings_df[['user_id', 'item_id', 'rating']], reader)
# Evaluate SVD
# (50 latent factors; random_state is fixed, presumably for reproducible runs)
svd = SVD(n_factors=50, random_state=42)
# Cross-validate with cv=5, requesting the RMSE and MAE measures
cv_results = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5)
# Print the mean of the test RMSE / MAE values returned by cross_validate
print(f"RMSE: {cv_results['test_rmse'].mean():.4f}")
print(f"MAE: {cv_results['test_mae'].mean():.4f}")
Key Takeaways
- Collaborative filtering uses user behavior patterns
- Content-based filtering uses item features
- Matrix factorization handles sparse data well
- Hybrid systems combine both approaches
- Use Precision@K, Recall@K, and NDCG to evaluate ranked recommendations, and RMSE/MAE to evaluate predicted ratings