Recommendation Engines
Recommendation systems predict what users will like based on their own past behavior and on the behavior of similar users. They power Netflix, Amazon, Spotify, and most other modern content platforms.
Recommendation System Architecture
Why Recommendations Matter
A good recommendation increases engagement, revenue, and user satisfaction. The difference between random suggestions and personalized recommendations can be worth billions in revenue.
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
import warnings
warnings.filterwarnings('ignore')
Generate User-Item Interaction Data
# Fix the global seed so the synthetic data set is reproducible.
np.random.seed(42)
n_users, n_items = 500, 200

# Low-rank latent structure: every user and item gets a 5-dimensional factor.
n_factors = 5
user_factors = 0.5 * np.random.randn(n_users, n_factors)
item_factors = 0.5 * np.random.randn(n_items, n_factors)

# Raw affinity = user . item + Gaussian noise, then mapped onto the 1-5 scale.
ratings_matrix = user_factors @ item_factors.T + 0.1 * np.random.randn(n_users, n_items)
ratings_matrix = (ratings_matrix * 2 + 3).clip(1, 5)

# Only ~30% of the (user, item) cells are observed; the rest are zeroed out.
mask = np.random.random((n_users, n_items)) < 0.3
sparse_ratings = np.where(mask, ratings_matrix, 0.0)

# Long-format interaction table: one row per observed rating.
rows, cols = np.nonzero(mask)
interactions = pd.DataFrame({
    'user_id': rows,
    'item_id': cols,
    'rating': sparse_ratings[rows, cols],
})

print(f"Interactions: {len(interactions)}")
print(f"Sparsity: {1 - len(interactions) / (n_users * n_items):.1%}")
print(f"Rating distribution:\n{interactions['rating'].describe()}")
User-Based Collaborative Filtering
class UserBasedCF:
    """User-based collaborative filtering.

    A rating is predicted as the target user's mean rating plus a
    similarity-weighted average of how far the ``k`` most similar users who
    rated the item deviated from their own mean ratings.
    """

    def __init__(self, k=20):
        # Maximum number of neighbours consulted per prediction.
        self.k = k

    def fit(self, ratings_df):
        """Build the user-item matrix, user similarities and per-user means.

        ratings_df: DataFrame with 'user_id', 'item_id' and 'rating' columns.
        Missing cells are stored as 0, and 0 means "not rated" in predict().
        """
        observed = ratings_df.pivot_table(
            index='user_id', columns='item_id', values='rating'
        )
        # Average only the ratings that exist (NaN cells are skipped); taking
        # the mean after zero-filling would drag every user mean toward 0.
        self.user_mean = observed.mean(axis=1).values
        self.ratings = observed.fillna(0)
        # User similarity
        self.user_sim = cosine_similarity(self.ratings)
        # A user must never act as their own neighbour.
        np.fill_diagonal(self.user_sim, 0)
        self.user_ids = self.ratings.index
        self.item_ids = self.ratings.columns

    def predict(self, user_id, item_id):
        """Predict the rating `user_id` would give `item_id`.

        Returns the mean of all user means for an unknown user or item, the
        user's own mean when no usable neighbour exists, and otherwise the
        neighbourhood estimate clipped to [1, 5].
        """
        if user_id not in self.user_ids or item_id not in self.item_ids:
            return self.user_mean.mean()
        user_idx = self.user_ids.get_loc(user_id)
        own_mean = self.user_mean[user_idx]

        # Row positions of every user who rated this item.
        item_ratings = self.ratings[item_id].values
        rated_indices = np.flatnonzero(item_ratings > 0)
        if rated_indices.size == 0:
            return own_mean

        similarities = self.user_sim[user_idx, rated_indices]
        # argsort positions refer to the rated subset, so map them back to
        # row positions in the full matrix before reading ratings and means.
        top_k_idx = similarities.argsort()[-self.k:]
        top_k_sims = similarities[top_k_idx]
        neighbours = rated_indices[top_k_idx]

        weight_total = np.abs(top_k_sims).sum()
        if weight_total == 0:
            return own_mean

        # Mean-centred weighted average of the neighbours' ratings.
        deviations = item_ratings[neighbours] - self.user_mean[neighbours]
        pred = own_mean + np.sum(top_k_sims * deviations) / weight_total
        return np.clip(pred, 1, 5)
# Fit the neighbourhood model (20 nearest users) on the full interaction log.
ubcf = UserBasedCF(k=20)
ubcf.fit(ratings_df=interactions)

# Smoke test: score a single (user, item) pair.
pred = ubcf.predict(user_id=0, item_id=5)
print(f"User-based CF prediction: {pred:.2f}")
Item-Based Collaborative Filtering
class ItemBasedCF:
    """Item-based collaborative filtering.

    A rating is predicted as the similarity-weighted average of the ratings
    the user gave to the ``k`` items most similar to the target item.
    """

    def __init__(self, k=20):
        # Maximum number of similar items consulted per prediction.
        self.k = k

    def fit(self, ratings_df):
        """Build the user-item matrix and the item-item cosine similarities.

        ratings_df: DataFrame with 'user_id', 'item_id' and 'rating' columns.
        Missing cells are stored as 0, and 0 means "not rated" in predict().
        """
        self.ratings = ratings_df.pivot_table(
            index='user_id', columns='item_id', values='rating'
        ).fillna(0)
        # Item similarity
        self.item_sim = cosine_similarity(self.ratings.T)
        # An item must never act as its own neighbour.
        np.fill_diagonal(self.item_sim, 0)
        self.item_ids = self.ratings.columns
        self.user_ids = self.ratings.index

    def _observed_mean(self):
        """Return the mean of the observed (> 0) ratings, used as fallback."""
        values = self.ratings.values
        observed = values[values > 0]
        # Averaging the zero-filled matrix would count every missing cell as
        # a rating of 0, so only observed cells contribute when any exist.
        return observed.mean() if observed.size else values.mean()

    def predict(self, user_id, item_id):
        """Predict the rating `user_id` would give `item_id`.

        Returns the mean observed rating when the user or item is unknown or
        no usable neighbour exists, otherwise the estimate clipped to [1, 5].
        """
        if user_id not in self.user_ids or item_id not in self.item_ids:
            return self._observed_mean()
        user_idx = self.user_ids.get_loc(user_id)
        item_idx = self.item_ids.get_loc(item_id)

        # Column positions of the items rated by this user.
        user_ratings = self.ratings.iloc[user_idx].values
        rated_indices = np.flatnonzero(user_ratings > 0)
        if rated_indices.size == 0:
            return self._observed_mean()

        similarities = self.item_sim[item_idx, rated_indices]
        # argsort positions refer to the rated subset, so map them back to
        # column positions in the full matrix before reading the ratings.
        top_k_idx = similarities.argsort()[-self.k:]
        top_k_sims = similarities[top_k_idx]
        top_k_ratings = user_ratings[rated_indices[top_k_idx]]

        weight_total = np.abs(top_k_sims).sum()
        if weight_total == 0:
            return self._observed_mean()
        pred = np.sum(top_k_sims * top_k_ratings) / weight_total
        return np.clip(pred, 1, 5)
# Fit the item-neighbourhood model (20 nearest items) on all interactions.
ibcf = ItemBasedCF(k=20)
ibcf.fit(ratings_df=interactions)

# Smoke test: score a single (user, item) pair.
pred = ibcf.predict(user_id=0, item_id=5)
print(f"Item-based CF prediction: {pred:.2f}")
Matrix Factorization (SVD)
class MatrixFactorization:
    """Matrix factorization using truncated SVD on mean-centred ratings."""

    def __init__(self, n_factors=20):
        # Number of latent dimensions kept by the SVD.
        self.n_factors = n_factors

    def fit(self, ratings_df):
        """Factorize the user-item matrix and cache the dense reconstruction.

        ratings_df: DataFrame with 'user_id', 'item_id' and 'rating' columns.
        Missing cells are stored as 0 in `self.ratings`, and 0 means
        "not rated".
        """
        self.ratings = ratings_df.pivot_table(
            index='user_id', columns='item_id', values='rating'
        ).fillna(0)
        self.user_ids = self.ratings.index
        self.item_ids = self.ratings.columns

        values = self.ratings.values
        rated = values > 0
        # Mean of the observed ratings only; also the cold-start fallback.
        self.global_mean = float(values[rated].mean()) if rated.any() else 0.0
        # Centre the observed cells and leave missing cells at 0, i.e. impute
        # them with the global mean. The mean added back after reconstruction
        # is then exactly the one that was subtracted here.
        centered = np.where(rated, values - self.global_mean, 0.0)

        # SVD
        self.svd = TruncatedSVD(n_components=self.n_factors, random_state=42)
        self.user_factors = self.svd.fit_transform(centered)
        self.item_factors = self.svd.components_.T
        # Reconstruct
        self.predicted_ratings = (
            self.user_factors @ self.item_factors.T + self.global_mean
        )

    def predict(self, user_id, item_id):
        """Return the reconstructed rating clipped to [1, 5].

        Unknown users or items get the global mean of the observed ratings.
        """
        if user_id not in self.user_ids or item_id not in self.item_ids:
            return self.global_mean
        user_idx = self.user_ids.get_loc(user_id)
        item_idx = self.item_ids.get_loc(item_id)
        return np.clip(self.predicted_ratings[user_idx, item_idx], 1, 5)

    def recommend(self, user_id, n=10):
        """Get top-n recommendations for a user.

        Returns a list of (item_id, score) pairs sorted by descending
        reconstructed score, restricted to items the user has not rated.
        Fewer than n pairs are returned when fewer unrated items exist; an
        unknown user yields an empty list.
        """
        if user_id not in self.user_ids:
            return []
        user_idx = self.user_ids.get_loc(user_id)
        scores = self.predicted_ratings[user_idx]
        # Rank only the unrated columns instead of overwriting rated scores:
        # `scores` is a view, so writing into it would corrupt
        # predicted_ratings for every later predict() call.
        unrated = np.flatnonzero(self.ratings.iloc[user_idx].values <= 0)
        best_first = np.argsort(scores[unrated])[::-1][:n]
        top_items = unrated[best_first]
        return [(self.item_ids[i], scores[i]) for i in top_items]
# Factorize the full interaction log with 20 latent factors.
mf = MatrixFactorization(n_factors=20)
mf.fit(ratings_df=interactions)

# Get recommendations
recs = mf.recommend(user_id=0, n=5)
print("Top 5 recommendations for user 0:")
for item_id, score in recs:
    print(f" Item {item_id}: {score:.2f}")
ALS (Alternating Least Squares)
class ALS:
    """Alternating Least Squares for matrix factorization.

    Alternates between solving a ridge-regularised least-squares problem for
    every user (items fixed) and for every item (users fixed), using only
    the observed ratings.
    """

    def __init__(self, n_factors=20, reg=0.1, n_iter=20, random_state=None):
        self.n_factors = n_factors
        self.reg = reg
        self.n_iter = n_iter
        # None keeps drawing from NumPy's global RNG; an int seeds a private
        # generator so repeated fits give identical factors.
        self.random_state = random_state

    def fit(self, ratings_df):
        """Learn user and item factors from the observed ratings.

        ratings_df: DataFrame with 'user_id', 'item_id' and 'rating' columns.
        A rating of 0 is treated as "not rated".
        """
        pivot = ratings_df.pivot_table(
            index='user_id', columns='item_id', values='rating'
        ).fillna(0)
        # Remember the labels: matrix rows and columns are positions in the
        # sorted id sets, not the ids themselves.
        self.user_ids = pivot.index
        self.item_ids = pivot.columns
        R = pivot.values
        n_users, n_items = R.shape

        # Observed-entry mask and cold-start fallback value.
        mask = R > 0
        self.global_mean = float(R[mask].mean()) if mask.any() else 0.0

        # Initialize
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)
        self.user_factors = rng.randn(n_users, self.n_factors) * 0.1
        self.item_factors = rng.randn(n_items, self.n_factors) * 0.1

        ridge = self.reg * np.eye(self.n_factors)
        for _ in range(self.n_iter):
            # Fix items, solve for users
            self._solve_rows(self.user_factors, self.item_factors, R, mask, ridge)
            # Fix users, solve for items (same problem on the transpose)
            self._solve_rows(self.item_factors, self.user_factors, R.T, mask.T, ridge)

        self.predicted_ratings = self.user_factors @ self.item_factors.T

    @staticmethod
    def _solve_rows(target, fixed, R, mask, ridge):
        """Update each row of `target` in place by ridge least squares.

        Row r is fitted against the rows of `fixed` selected by mask[r].
        Rows with no observed entry keep their current values.
        """
        for row in range(target.shape[0]):
            idx = np.flatnonzero(mask[row])
            if idx.size == 0:
                continue
            F = fixed[idx]
            # Normal equations: (F^T F + reg * I) x = F^T r
            target[row] = np.linalg.solve(F.T @ F + ridge, F.T @ R[row, idx])

    def predict(self, user_id, item_id):
        """Return the predicted rating clipped to [1, 5].

        Ids are mapped to matrix positions through the labels seen in fit();
        unknown users or items get the global mean of the observed ratings.
        """
        if user_id not in self.user_ids or item_id not in self.item_ids:
            return self.global_mean
        user_idx = self.user_ids.get_loc(user_id)
        item_idx = self.item_ids.get_loc(item_id)
        return np.clip(self.predicted_ratings[user_idx, item_idx], 1, 5)
# Train ALS with 20 factors, ridge strength 0.1 and 20 alternating sweeps.
als = ALS(n_factors=20, reg=0.1, n_iter=20)
als.fit(ratings_df=interactions)

# Smoke test: score a single (user, item) pair.
pred = als.predict(user_id=0, item_id=5)
print(f"ALS prediction: {pred:.2f}")
Content-Based Filtering
class ContentBasedRecommender:
    """Content-based filtering using item features.

    A rating is predicted as the similarity-weighted average of the user's
    own ratings over the items whose features are positively similar to the
    target item.
    """

    def __init__(self, default_rating=3.0):
        # Not read anywhere in this class; kept so the attribute still exists.
        self.similarity = None
        # Returned whenever no similarity-based estimate can be formed.
        self.default_rating = default_rating

    def fit(self, item_features, ratings_df):
        """
        item_features: DataFrame with item_id as index, feature columns
        ratings_df: DataFrame with 'user_id', 'item_id' and 'rating' columns
        """
        self.item_features = item_features
        self.item_sim = cosine_similarity(item_features)
        # An item must not contribute to its own prediction.
        np.fill_diagonal(self.item_sim, 0)
        self.ratings = ratings_df
        self.item_ids = item_features.index

    def predict(self, user_id, item_id):
        """Predict the rating `user_id` would give `item_id`.

        Returns `default_rating` when the user has no ratings, the item has
        no features, or none of the user's rated items is positively similar
        to the target. Assumes the item feature index has unique labels.
        """
        # Get items rated by this user
        user_ratings = self.ratings[self.ratings['user_id'] == user_id]
        if len(user_ratings) == 0 or item_id not in self.item_ids:
            return self.default_rating
        item_idx = self.item_ids.get_loc(item_id)

        # Vectorized id -> position lookup; -1 marks items without features.
        rated_pos = self.item_ids.get_indexer(user_ratings['item_id'].to_numpy())
        has_features = rated_pos >= 0
        sims = self.item_sim[item_idx, rated_pos[has_features]]
        scores = user_ratings['rating'].to_numpy()[has_features]

        # Cosine similarity can be negative. Negative weights make a weighted
        # average leave the rating range, or divide by zero when they cancel,
        # so only positively similar items contribute.
        positive = sims > 0
        if not positive.any():
            return self.default_rating
        return np.average(scores[positive], weights=sims[positive])
# Simulated genre features: 10 random numeric columns per item, indexed by
# item id.
item_features = pd.DataFrame(
    np.random.randn(n_items, 10),
    index=pd.RangeIndex(n_items),
)

cb = ContentBasedRecommender()
cb.fit(item_features=item_features, ratings_df=interactions)

# Smoke test: score a single (user, item) pair.
pred = cb.predict(user_id=0, item_id=5)
print(f"Content-based prediction: {pred:.2f}")
Hybrid Recommender
class HybridRecommender:
    """Combine collaborative and content-based filtering.

    The prediction is a fixed linear blend of two underlying models, each of
    which must expose predict(user_id, item_id).
    """

    def __init__(self, cf_weight=0.7, cb_weight=0.3, cf_model=None, cb_model=None):
        self.cf_weight = cf_weight
        self.cb_weight = cb_weight
        # The models may be passed here or assigned as attributes afterwards.
        self.cf_model = cf_model
        self.cb_model = cb_model

    def predict(self, user_id, item_id):
        """Return cf_weight * CF prediction + cb_weight * CB prediction.

        The weights are applied as given and are not normalised.
        Raises AttributeError if either model has not been set.
        """
        if self.cf_model is None or self.cb_model is None:
            raise AttributeError(
                "HybridRecommender needs both cf_model and cb_model to be set "
                "before predict() is called"
            )
        cf_pred = self.cf_model.predict(user_id, item_id)
        cb_pred = self.cb_model.predict(user_id, item_id)
        return self.cf_weight * cf_pred + self.cb_weight * cb_pred
# Blend the SVD model (70%) with the content-based model (30%).
hybrid = HybridRecommender(cf_weight=0.7, cb_weight=0.3)
hybrid.cf_model = mf
hybrid.cb_model = cb

# Smoke test: score a single (user, item) pair.
pred = hybrid.predict(user_id=0, item_id=5)
print(f"Hybrid prediction: {pred:.2f}")
Evaluation
from sklearn.model_selection import train_test_split

# Hold out 20% of the interactions for testing.
train, test = train_test_split(interactions, test_size=0.2, random_state=42)

# Evaluate MF
mf_eval = MatrixFactorization(n_factors=20)
mf_eval.fit(train)

# Score every held-out (user, item) pair and keep the true rating beside it.
predictions = []
actuals = []
for user, item, actual in zip(test['user_id'], test['item_id'], test['rating']):
    pred = mf_eval.predict(int(user), int(item))
    predictions.append(pred)
    actuals.append(actual)

# Both metrics are derived from the same error vector.
errors = np.array(predictions) - np.array(actuals)
rmse = np.sqrt(np.mean(errors ** 2))
mae = np.mean(np.abs(errors))
print(f"MF RMSE: {rmse:.4f}")
print(f"MF MAE: {mae:.4f}")
Best Practices
- Matrix factorization as baseline — SVD or ALS for collaborative filtering
- Cold start problem — use content-based methods for new users/items
- Implicit feedback — clicks, views, and time spent are more widely available than explicit ratings
- Diversity vs accuracy — balance relevant and novel recommendations
- A/B testing — online metrics matter more than offline metrics
- Scale with ALS — handles large sparse matrices efficiently
Summary
Recommendation engines predict user preferences through collaborative filtering, content-based methods, and hybrid approaches. Matrix factorization with ALS scales to millions of users and items. Master these techniques to build systems that drive engagement and revenue.