CW

Recommendation Systems: Collaborative and Content-Based

Module 10: Specialized ML · Free Lesson

Advertisement

Recommendation Systems: Collaborative and Content-Based

Recommendation systems power personalized experiences on platforms like Netflix, Amazon, and Spotify. This lesson covers the core algorithms.

Collaborative Filtering

<svg width="600" height="400" viewBox="0 0 600 400" xmlns="http://www.w3.org/2000/svg">
  <rect width="600" height="400" fill="#f8f9fa" rx="10"/>
  <text x="300" y="30" text-anchor="middle" font-size="18" font-weight="bold" fill="#2c3e50">Collaborative Filtering</text>
  
  <!-- User-Item Matrix -->
  <text x="50" y="65" font-size="12" font-weight="bold" fill="#2c3e50">User-Item Interaction Matrix</text>
  
  <!-- Users -->
  <text x="80" y="90" font-size="10" fill="#7f8c8d">Users</text>
  <rect x="100" y="80" width="60" height="25" fill="#3498db" rx="3"/>
  <text x="130" y="97" text-anchor="middle" font-size="9" fill="white">User 1</text>
  
  <rect x="100" y="110" width="60" height="25" fill="#3498db" rx="3"/>
  <text x="130" y="127" text-anchor="middle" font-size="9" fill="white">User 2</text>
  
  <rect x="100" y="140" width="60" height="25" fill="#3498db" rx="3"/>
  <text x="130" y="157" text-anchor="middle" font-size="9" fill="white">User 3</text>
  
  <!-- Items -->
  <text x="200" y="80" font-size="10" fill="#7f8c8d">Items</text>
  <rect x="230" y="80" width="50" height="25" fill="#2ecc71" rx="3"/>
  <text x="255" y="97" text-anchor="middle" font-size="9" fill="white">Item A</text>
  
  <rect x="290" y="80" width="50" height="25" fill="#2ecc71" rx="3"/>
  <text x="315" y="97" text-anchor="middle" font-size="9" fill="white">Item B</text>
  
  <rect x="350" y="80" width="50" height="25" fill="#2ecc71" rx="3"/>
  <text x="375" y="97" text-anchor="middle" font-size="9" fill="white">Item C</text>
  
  <rect x="410" y="80" width="50" height="25" fill="#2ecc71" rx="3"/>
  <text x="435" y="97" text-anchor="middle" font-size="9" fill="white">Item D</text>
  
  <!-- Ratings -->
  <rect x="230" y="110" width="50" height="25" fill="#f39c12" rx="3"/>
  <text x="255" y="127" text-anchor="middle" font-size="9" fill="white">5</text>
  
  <rect x="290" y="110" width="50" height="25" fill="#f39c12" rx="3"/>
  <text x="315" y="127" text-anchor="middle" font-size="9" fill="white">3</text>
  
  <rect x="350" y="110" width="50" height="25" fill="#f39c12" rx="3"/>
  <text x="375" y="127" text-anchor="middle" font-size="9" fill="white">?</text>
  
  <rect x="410" y="110" width="50" height="25" fill="#f39c12" rx="3"/>
  <text x="435" y="127" text-anchor="middle" font-size="9" fill="white">4</text>
  
  <!-- Goal -->
  <text x="300" y="180" text-anchor="middle" font-size="12" fill="#2c3e50">Goal: Predict missing ratings (?)</text>
  
  <!-- Methods -->
  <rect x="50" y="210" width="230" height="120" fill="white" stroke="#3498db" stroke-width="2" rx="5"/>
  <text x="165" y="235" text-anchor="middle" font-size="12" font-weight="bold" fill="#3498db">Memory-Based CF</text>
  <text x="70" y="260" font-size="10" fill="#2c3e50">• User-User: Find similar users</text>
  <text x="70" y="280" font-size="10" fill="#2c3e50">• Item-Item: Find similar items</text>
  <text x="70" y="300" font-size="10" fill="#2c3e50">• Cosine/Pearson similarity</text>
  
  <rect x="320" y="210" width="230" height="120" fill="white" stroke="#2ecc71" stroke-width="2" rx="5"/>
  <text x="435" y="235" text-anchor="middle" font-size="12" font-weight="bold" fill="#2ecc71">Model-Based CF</text>
  <text x="340" y="260" font-size="10" fill="#2c3e50">• Matrix Factorization (SVD)</text>
  <text x="340" y="280" font-size="10" fill="#2c3e50">• Deep Learning (Neural CF)</text>
  <text x="340" y="300" font-size="10" fill="#2c3e50">• Handles sparsity better</text>
  
  <text x="300" y="360" text-anchor="middle" font-size="11" fill="#7f8c8d">Pearson similarity: sim(u,v) = Σ(r_ui - r̄_u)(r_vi - r̄_v) / √(Σ(r_ui - r̄_u)² × Σ(r_vi - r̄_v)²)</text>
</svg>

User-User Collaborative Filtering

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class UserBasedCF:
    """User-user collaborative filtering over an explicit ratings table.

    ``ratings_df`` is accessed through the columns ``user_id``, ``item_id``
    and ``rating``. A rating is predicted as the similarity-weighted average
    of the ratings that the most similar users gave to the same item.
    """

    def __init__(self, ratings_df):
        self.ratings = ratings_df
        self.user_similarity = None

    def compute_similarity(self):
        """Build the user-user cosine similarity matrix.

        Stores the zero-filled user-item pivot in ``self.user_item_matrix``
        and the similarity table (users x users, labelled by ``user_id``) in
        ``self.user_similarity``.

        Returns:
            self, so calls can be chained.
        """
        self.user_item_matrix = self.ratings.pivot_table(
            index='user_id', columns='item_id', values='rating'
        ).fillna(0)

        users = self.user_item_matrix.index
        self.user_similarity = pd.DataFrame(
            cosine_similarity(self.user_item_matrix),
            index=users,
            columns=users,
        )
        return self

    def predict(self, user_id, item_id, k=5):
        """Predict the rating ``user_id`` would give to ``item_id``.

        Args:
            user_id: Target user; must be present in the similarity matrix.
            item_id: Item to score.
            k: Maximum number of neighbours to average over.

        Returns:
            Similarity-weighted mean rating of the ``k`` most similar users
            who rated the item, or ``0.0`` when nobody else rated it.

        Raises:
            KeyError: If ``user_id`` is not in the similarity matrix.
        """
        if self.user_similarity is None:
            self.compute_similarity()

        # One rating per neighbour, keyed by user_id. Duplicates are averaged,
        # which matches pivot_table's default aggregation used for similarity.
        item_rows = self.ratings[self.ratings['item_id'] == item_id]
        neighbour_ratings = (
            item_rows.dropna(subset=['rating'])
            .groupby('user_id')['rating']
            .mean()
            # The target user must not vote on their own prediction.
            .drop(user_id, errors='ignore')
        )
        if neighbour_ratings.empty:
            return 0.0

        similarities = self.user_similarity[user_id].loc[neighbour_ratings.index]
        top_k = similarities.nlargest(k)

        # Look ratings up by label so each weight meets its own user's rating;
        # relying on DataFrame row order pairs weights with the wrong users.
        aligned_ratings = neighbour_ratings.loc[top_k.index]

        # The epsilon keeps the division defined when all similarities are 0.
        return np.dot(top_k.values, aligned_ratings.values) / (top_k.sum() + 1e-8)

    def recommend(self, user_id, n_recommendations=10):
        """Return the top-N unrated items for ``user_id``.

        Returns:
            List of ``(item_id, predicted_rating)`` tuples, best first.
        """
        seen = set(self.ratings.loc[self.ratings['user_id'] == user_id, 'item_id'])

        # Walk items in first-appearance order (not set order) so that ties
        # in the predicted rating are resolved deterministically.
        candidates = [
            item
            for item in self.ratings['item_id'].drop_duplicates().tolist()
            if item not in seen
        ]

        scored = [(item, self.predict(user_id, item)) for item in candidates]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored[:n_recommendations]

Item-Item Collaborative Filtering

class ItemBasedCF:
    """Item-item collaborative filtering over an explicit ratings table.

    ``ratings_df`` is accessed through the columns ``user_id``, ``item_id``
    and ``rating``. A rating is predicted from the user's own ratings of the
    items most similar to the target item.
    """

    def __init__(self, ratings_df):
        self.ratings = ratings_df
        self.item_similarity = None

    def compute_similarity(self):
        """Build the item-item cosine similarity matrix.

        Stores the zero-filled item-user pivot in ``self.item_user_matrix``
        and the similarity table (items x items, labelled by ``item_id``) in
        ``self.item_similarity``.

        Returns:
            self, so calls can be chained.
        """
        self.item_user_matrix = self.ratings.pivot_table(
            index='item_id', columns='user_id', values='rating'
        ).fillna(0)

        items = self.item_user_matrix.index
        self.item_similarity = pd.DataFrame(
            cosine_similarity(self.item_user_matrix),
            index=items,
            columns=items,
        )
        return self

    def predict(self, user_id, item_id, k=5):
        """Predict the rating ``user_id`` would give to ``item_id``.

        Args:
            user_id: User whose rating history supplies the neighbours.
            item_id: Target item; must be present in the similarity matrix.
            k: Maximum number of similar items to average over.

        Returns:
            Similarity-weighted mean of the user's ratings for the ``k`` items
            most similar to ``item_id``, or ``0.0`` when the user has rated no
            other item.

        Raises:
            KeyError: If ``item_id`` is not in the similarity matrix.
        """
        if self.item_similarity is None:
            self.compute_similarity()

        # The user's ratings keyed by item_id. Duplicates are averaged, which
        # matches pivot_table's default aggregation used for similarity.
        user_rows = self.ratings[self.ratings['user_id'] == user_id]
        rated = (
            user_rows.dropna(subset=['rating'])
            .groupby('item_id')['rating']
            .mean()
            # The target item must not be its own neighbour.
            .drop(item_id, errors='ignore')
        )
        if rated.empty:
            return 0.0

        similarities = self.item_similarity[item_id].loc[rated.index]
        top_k = similarities.nlargest(k)

        # nlargest yields item-id labels, not positions: select the matching
        # ratings by label so every weight is paired with the right item.
        aligned_ratings = rated.loc[top_k.index]

        # The epsilon keeps the division defined when all similarities are 0.
        return np.dot(top_k.values, aligned_ratings.values) / (top_k.sum() + 1e-8)

Matrix Factorization (SVD)

from scipy.sparse.linalg import svds
from sklearn.decomposition import TruncatedSVD

class MatrixFactorization:
    """Rating prediction via truncated SVD of the mean-centred rating matrix.

    ``ratings_df`` is accessed through the columns ``user_id``, ``item_id``
    and ``rating``.

    Args:
        ratings_df: Ratings table.
        n_factors: Requested number of latent factors. ``fit`` lowers it when
            the matrix is too small for that many singular values.
        rating_range: ``(low, high)`` bounds that ``predict`` clips to.
    """

    def __init__(self, ratings_df, n_factors=50, rating_range=(1, 5)):
        self.ratings = ratings_df
        self.n_factors = n_factors
        self.rating_range = rating_range
        self.user_factors = None
        self.item_factors = None

    @staticmethod
    def _position(ids, key, label):
        """Return the position of ``key`` in ``ids`` or raise a clear IndexError."""
        matches = np.where(ids == key)[0]
        if matches.size == 0:
            raise IndexError(f"unknown {label}: {key!r}")
        return matches[0]

    def fit(self):
        """Decompose the centred user-item matrix with ``svds``.

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: If there are fewer than two users or two items, since
                ``svds`` needs ``k < min(matrix.shape)``.
        """
        observed = self.ratings.pivot_table(
            index='user_id', columns='item_id', values='rating'
        )
        self.user_item_matrix = observed.fillna(0)

        max_factors = min(observed.shape) - 1
        if max_factors < 1:
            raise ValueError(
                "matrix factorization needs at least 2 users and 2 items"
            )
        # svds rejects k >= min(shape); cap the request instead of failing.
        n_factors = min(self.n_factors, max_factors)

        # Average over the ratings a user actually gave. Averaging the
        # zero-filled matrix would drag every mean towards 0.
        row_means = observed.mean(axis=1)
        # Unrated cells become 0 after centring, i.e. "at the user's mean".
        matrix_normalized = (
            observed.sub(row_means, axis=0).fillna(0).to_numpy(dtype=float)
        )

        U, sigma, Vt = svds(matrix_normalized, k=n_factors)

        self.user_factors = U
        self.item_factors = Vt.T
        self.sigma = np.diag(sigma)
        self.user_means = row_means.values.reshape(-1, 1)
        self.user_ids = self.user_item_matrix.index
        self.item_ids = self.user_item_matrix.columns

        return self

    def predict(self, user_id, item_id):
        """Predict a single rating, clipped to ``rating_range``.

        Returns:
            A scalar rating.

        Raises:
            IndexError: If ``user_id`` or ``item_id`` was not seen in ``fit``.
        """
        user_idx = self._position(self.user_ids, user_id, 'user_id')
        item_idx = self._position(self.item_ids, item_id, 'item_id')

        # user_means is a column vector; take the scalar so the result is a
        # scalar too, not a length-1 array.
        baseline = self.user_means[user_idx, 0]
        interaction = (
            self.user_factors[user_idx] @ self.sigma @ self.item_factors[item_idx]
        )

        low, high = self.rating_range
        return np.clip(baseline + interaction, low, high)

    def recommend(self, user_id, n_recommendations=10):
        """Return the top-N items the user has not rated yet.

        Returns:
            List of ``(item_id, predicted_rating)`` tuples, best first. Scores
            are left unclipped so that ranking is not flattened at the bounds.

        Raises:
            IndexError: If ``user_id`` was not seen in ``fit``.
        """
        user_idx = self._position(self.user_ids, user_id, 'user_id')

        scores = self.user_means[user_idx, 0] + (
            self.user_factors[user_idx] @ self.sigma @ self.item_factors.T
        )

        rated_items = self.ratings.loc[
            self.ratings['user_id'] == user_id, 'item_id'
        ]
        unrated_mask = ~self.item_ids.isin(rated_items)

        ranked = sorted(
            zip(self.item_ids[unrated_mask], scores[unrated_mask]),
            key=lambda pair: pair[1],
            reverse=True,
        )
        return ranked[:n_recommendations]

Content-Based Filtering

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

class ContentBasedRecommender:
    """Content-based recommender built on TF-IDF text similarity.

    ``items_df`` is accessed through the columns ``item_id``, ``title`` and
    ``genres``, plus whichever text columns are passed to ``fit``. Rows of
    ``similarity_matrix`` correspond to row *positions* of ``self.items``,
    never to its index labels.
    """

    def __init__(self, items_df):
        self.items = items_df
        self.tfidf_matrix = None
        self.similarity_matrix = None

    def fit(self, text_columns=None):
        """Build the item-item content similarity matrix.

        Args:
            text_columns: Column name or list of column names to join into
                the text of each item. Defaults to
                ``['title', 'description', 'genres']``.

        Returns:
            self, so calls can be chained.
        """
        if text_columns is None:
            text_columns = ['title', 'description', 'genres']
        elif isinstance(text_columns, str):
            text_columns = [text_columns]

        # astype(str) lets non-string columns (e.g. a numeric year) be joined.
        content = self.items[list(text_columns)].apply(
            lambda row: ' '.join(row.fillna('').astype(str)), axis=1
        )
        # assign() returns a new frame, so the caller's DataFrame is not
        # modified behind their back.
        self.items = self.items.assign(content=content)

        tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
        self.tfidf_matrix = tfidf.fit_transform(self.items['content'])

        # With TfidfVectorizer's default L2 normalisation the plain dot
        # product is the cosine similarity.
        self.similarity_matrix = linear_kernel(self.tfidf_matrix, self.tfidf_matrix)

        return self

    def recommend(self, item_id, n_recommendations=10):
        """Return the items whose content is most similar to ``item_id``.

        Returns:
            DataFrame with columns ``item_id``, ``title``, ``genres``, most
            similar first, never containing the query item.

        Raises:
            IndexError: If ``item_id`` is not in ``self.items``.
        """
        item_idx = np.where(self.items['item_id'] == item_id)[0][0]

        scores = np.asarray(self.similarity_matrix[item_idx])
        # Stable sort: ties keep row order, as a stable descending sort does.
        order = np.argsort(-scores, kind='stable')

        # Remove the query item explicitly. It is not guaranteed to sort
        # first when another item has identical content (similarity 1.0).
        order = order[order != item_idx][:n_recommendations]
        return self.items.iloc[order][['item_id', 'title', 'genres']]

    def recommend_for_user(self, user_history, n_recommendations=10):
        """Recommend unseen items similar to the items in ``user_history``.

        Args:
            user_history: Iterable of ``item_id`` values the user interacted
                with.
            n_recommendations: Maximum number of rows to return.

        Returns:
            DataFrame with columns ``item_id``, ``title``, ``genres``, ranked
            by mean similarity to the history. Empty when none of the history
            items is known.
        """
        columns = ['item_id', 'title', 'genres']

        # Positional boolean mask: index labels may not be 0..n-1, so they
        # cannot be used to address rows of the similarity matrix.
        watched = self.items['item_id'].isin(user_history).to_numpy()
        if not watched.any():
            return self.items.iloc[:0][columns]

        avg_similarities = np.asarray(self.similarity_matrix)[watched].mean(axis=0)

        # Rank only unwatched items so watched ones can never fill the tail.
        candidates = np.flatnonzero(~watched)
        ranked = candidates[np.argsort(-avg_similarities[candidates], kind='stable')]
        return self.items.iloc[ranked[:n_recommendations]][columns]

Evaluation Metrics

from sklearn.metrics import mean_squared_error, mean_absolute_error

def evaluate_recommendation_system(predictions, actuals, k=10):
    """Score a ranked recommendation list against the relevant items.

    Args:
        predictions: Ranked sequence of recommended item ids, best first.
        actuals: Collection of item ids that are relevant for the user.
        k: Cut-off rank; only ``predictions[:k]`` is evaluated.

    Returns:
        Dict with the keys ``'Precision@K'``, ``'Recall@K'`` and ``'NDCG@K'``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    relevant = set(actuals)

    # Binary gain per rank. A repeated recommendation scores only once, so
    # duplicates cannot push DCG above the ideal.
    seen = set()
    relevance = []
    for item in predictions[:k]:
        is_hit = item in relevant and item not in seen
        relevance.append(1.0 if is_hit else 0.0)
        seen.add(item)
    hits = int(sum(relevance))

    precision = hits / k
    recall = hits / len(relevant) if relevant else 0

    def dcg(gains):
        gains = np.asarray(gains, dtype=float)
        discounts = np.log2(np.arange(2, gains.size + 2))
        return float(np.sum(gains / discounts))

    # The ideal ranking places every relevant item (up to k) at the top.
    # Sorting only the retrieved gains would rate any list whose hits come
    # first as perfect, however many relevant items it missed.
    ideal_dcg = dcg(np.ones(min(len(relevant), k)))
    ndcg = dcg(relevance) / ideal_dcg if ideal_dcg > 0 else 0.0

    return {
        'Precision@K': precision,
        'Recall@K': recall,
        'NDCG@K': ndcg
    }

# Cross-validation
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate

# Declare the rating scale as 1 to 5 for the surprise dataset loader.
reader = Reader(rating_scale=(1, 5))
# NOTE(review): `ratings_df` is not defined in this section; it must already
# exist with the columns 'user_id', 'item_id' and 'rating' (in that order).
data = Dataset.load_from_df(ratings_df[['user_id', 'item_id', 'rating']], reader)

# Evaluate SVD
# 50 latent factors; random_state is fixed to 42.
svd = SVD(n_factors=50, random_state=42)
# Cross-validate with cv=5, collecting RMSE and MAE.
cv_results = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5)
# Report the mean of the 'test_rmse' / 'test_mae' results
# (presumably one value per fold; confirm against the surprise docs).
print(f"RMSE: {cv_results['test_rmse'].mean():.4f}")
print(f"MAE: {cv_results['test_mae'].mean():.4f}")

Key Takeaways

  1. Collaborative filtering uses user behavior patterns
  2. Content-based filtering uses item features
  3. Matrix factorization handles sparse data well
  4. Hybrid systems combine both approaches
  5. Use Precision@K, Recall@K, and NDCG@K to evaluate ranking quality, and RMSE/MAE to evaluate rating prediction

Advertisement

Need Expert Data Science Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement