πŸŽ‰ 75% of content is free forever β€” Unlock Premium from $10/mo β†’
CW
Search courses…
πŸ’Ό Servicesℹ️ Aboutβœ‰οΈ ContactView Pricing Plansfrom $10

Clustering: K-Means, DBSCAN, Evaluation Methods

Data Science Interview PremiumClustering⭐ Premium

Advertisement

GOOGLE & LINKEDIN INTERVIEW QUESTION

Clustering: K-Means, DBSCAN, Evaluation Methods

Unsupervised Learning & Segmentation

The Interview Question

ℹ️

Question: You're building a customer segmentation system for a professional networking platform:

  • Dataset: 5M users with features: profile views, connections, posts, job applications, industry
  • Requirements: Identify distinct user segments for targeted marketing
  • Challenges: High-dimensional data, noise, varying cluster densities

Walk through your clustering approach:

  1. How do you determine the optimal number of clusters?
  2. Which clustering algorithm would you choose and why?
  3. How do you evaluate clustering quality?
  4. How do you interpret and use the segments?

Detailed Answer

1. Determining Optimal Number of Clusters

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import cdist
import warnings
warnings.filterwarnings('ignore')

class OptimalClustersFinder:
    """Find optimal number of clusters using multiple methods"""
    
    def __init__(self, X):
        self.X = X
        self.results = {}
    
    def elbow_method(self, k_range=range(2, 11)):
        """Elbow method using within-cluster sum of squares"""
        inertias = []
        K = []
        
        for k in k_range:
            kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
            kmeans.fit(self.X)
            inertias.append(kmeans.inertia_)
            K.append(k)
        
        # Calculate elbow point
        deltas = np.diff(inertias)
        delta_deltas = np.diff(deltas)
        elbow_idx = np.argmax(np.abs(delta_deltas)) + 2
        optimal_k_elbow = K[elbow_idx]
        
        self.results['elbow'] = {
            'k_values': K,
            'inertias': inertias,
            'optimal_k': optimal_k_elbow
        }
        
        return optimal_k_elbow
    
    def silhouette_method(self, k_range=range(2, 11)):
        """Silhouette analysis"""
        silhouette_scores = []
        K = []
        
        for k in k_range:
            kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
            labels = kmeans.fit_predict(self.X)
            score = silhouette_score(self.X, labels)
            silhouette_scores.append(score)
            K.append(k)
        
        optimal_k_silhouette = K[np.argmax(silhouette_scores)]
        
        self.results['silhouette'] = {
            'k_values': K,
            'scores': silhouette_scores,
            'optimal_k': optimal_k_silhouette,
            'best_score': max(silhouette_scores)
        }
        
        return optimal_k_silhouette
    
    def gap_statistic(self, k_range=range(2, 11), n_refs=10):
        """Gap statistic method"""
        gaps = []
        K = []
        
        for k in k_range:
            # Calculate WCSS for actual data
            kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
            kmeans.fit(self.X)
            WCSS = kmeans.inertia_
            
            # Calculate WCSS for reference datasets
            ref_WCSS = []
            for _ in range(n_refs):
                # Generate random reference data
                random_data = np.random.uniform(
                    low=self.X.min(axis=0),
                    high=self.X.max(axis=0),
                    size=self.X.shape
                )
                kmeans_ref = KMeans(n_clusters=k, random_state=42, n_init=10)
                kmeans_ref.fit(random_data)
                ref_WCSS.append(kmeans_ref.inertia_)
            
            # Calculate gap
            gap = np.mean(np.log(ref_WCSS)) - np.log(WCSS)
            gaps.append(gap)
            K.append(k)
        
        # Find optimal k (first k where gap(k) >= gap(k+1) - std_error)
        optimal_k_gap = K[np.argmax(gaps)]
        
        self.results['gap'] = {
            'k_values': K,
            'gaps': gaps,
            'optimal_k': optimal_k_gap
        }
        
        return optimal_k_gap
    
    def bayesian_information_criterion(self, k_range=range(2, 11)):
        """BIC for Gaussian Mixture Models"""
        from sklearn.mixture import GaussianMixture
        
        bics = []
        K = []
        
        for k in k_range:
            gmm = GaussianMixture(n_components=k, random_state=42)
            gmm.fit(self.X)
            bics.append(gmm.bic(self.X))
            K.append(k)
        
        optimal_k_bic = K[np.argmin(bics)]
        
        self.results['bic'] = {
            'k_values': K,
            'bics': bics,
            'optimal_k': optimal_k_bic
        }
        
        return optimal_k_bic
    
    def find_optimal_k(self, k_range=range(2, 11)):
        """Run all methods and find consensus"""
        print("Running multiple methods to find optimal k...")
        print("=" * 60)
        
        self.elbow_method(k_range)
        self.silhouette_method(k_range)
        self.gap_statistic(k_range)
        self.bayesian_information_criterion(k_range)
        
        # Consensus
        all_k = [
            self.results['elbow']['optimal_k'],
            self.results['silhouette']['optimal_k'],
            self.results['gap']['optimal_k'],
            self.results['bic']['optimal_k']
        ]
        
        # Find most common k
        from collections import Counter
        k_counts = Counter(all_k)
        consensus_k = k_counts.most_common(1)[0][0]
        
        print(f"\nOptimal k estimates:")
        print(f"  Elbow method: {self.results['elbow']['optimal_k']}")
        print(f"  Silhouette: {self.results['silhouette']['optimal_k']}")
        print(f"  Gap statistic: {self.results['gap']['optimal_k']}")
        print(f"  BIC: {self.results['bic']['optimal_k']}")
        print(f"\nConsensus optimal k: {consensus_k}")
        
        return consensus_k
    
    def visualize_methods(self):
        """Visualize all methods"""
        fig, axes = plt.subplots(2, 2, figsize=(12, 10))
        
        # Elbow method
        axes[0, 0].plot(self.results['elbow']['k_values'], 
                       self.results['elbow']['inertias'], 'bo-', linewidth=2)
        axes[0, 0].axvline(x=self.results['elbow']['optimal_k'], 
                          color='r', linestyle='--', label=f"Optimal k={self.results['elbow']['optimal_k']}")
        axes[0, 0].set_xlabel('Number of Clusters (k)')
        axes[0, 0].set_ylabel('Inertia')
        axes[0, 0].set_title('Elbow Method')
        axes[0, 0].legend()
        axes[0, 0].grid(True, alpha=0.3)
        
        # Silhouette method
        axes[0, 1].plot(self.results['silhouette']['k_values'],
                       self.results['silhouette']['scores'], 'go-', linewidth=2)
        axes[0, 1].axvline(x=self.results['silhouette']['optimal_k'],
                          color='r', linestyle='--', label=f"Optimal k={self.results['silhouette']['optimal_k']}")
        axes[0, 1].set_xlabel('Number of Clusters (k)')
        axes[0, 1].set_ylabel('Silhouette Score')
        axes[0, 1].set_title('Silhouette Method')
        axes[0, 1].legend()
        axes[0, 1].grid(True, alpha=0.3)
        
        # Gap statistic
        axes[1, 0].plot(self.results['gap']['k_values'],
                       self.results['gap']['gaps'], 'mo-', linewidth=2)
        axes[1, 0].axvline(x=self.results['gap']['optimal_k'],
                          color='r', linestyle='--', label=f"Optimal k={self.results['gap']['optimal_k']}")
        axes[1, 0].set_xlabel('Number of Clusters (k)')
        axes[1, 0].set_ylabel('Gap Statistic')
        axes[1, 0].set_title('Gap Statistic')
        axes[1, 0].legend()
        axes[1, 0].grid(True, alpha=0.3)
        
        # BIC
        axes[1, 1].plot(self.results['bic']['k_values'],
                       self.results['bic']['bics'], 'co-', linewidth=2)
        axes[1, 1].axvline(x=self.results['bic']['optimal_k'],
                          color='r', linestyle='--', label=f"Optimal k={self.results['bic']['optimal_k']}")
        axes[1, 1].set_xlabel('Number of Clusters (k)')
        axes[1, 1].set_ylabel('BIC')
        axes[1, 1].set_title('Bayesian Information Criterion')
        axes[1, 1].legend()
        axes[1, 1].grid(True, alpha=0.3)
        
        plt.tight_layout()
        plt.savefig('optimal_k_methods.png', dpi=150, bbox_inches='tight')
        plt.show()

# Example usage
# finder = OptimalClustersFinder(X_scaled)
# optimal_k = finder.find_optimal_k()
# finder.visualize_methods()

2. Clustering Algorithms Comparison

class ClusteringComparison:
    """Compare multiple clustering algorithms"""
    
    def __init__(self):
        self.models = {}
        self.results = {}
    
    def define_models(self, n_clusters=5):
        """Define clustering models"""
        self.models = {
            'kmeans': KMeans(n_clusters=n_clusters, random_state=42, n_init=10),
            'agglomerative': AgglomerativeClustering(n_clusters=n_clusters),
            'dbscan': DBSCAN(eps=0.5, min_samples=5),
            'hierarchical': AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
        }
        
        return self.models
    
    def find_dbscan_params(self, X, k=5):
        """Find optimal DBSCAN parameters using k-distance graph"""
        # Fit nearest neighbors
        neighbors = NearestNeighbors(n_neighbors=k)
        neighbors.fit(X)
        distances, indices = neighbors.kneighbors(X)
        
        # Sort distances to k-th neighbor
        distances = np.sort(distances[:, -1])
        
        # Find elbow point
        from kneed import KneeLocator
        kneedle = KneeLocator(range(len(distances)), distances, curve='convex', direction='increasing')
        
        optimal_eps = distances[kneedle.elbow] if kneedle.elbow else np.median(distances)
        
        print(f"Optimal eps for DBSCAN: {optimal_eps:.4f}")
        
        return optimal_eps
    
    def compare_performance(self, X):
        """Compare all clustering algorithms"""
        results = []
        
        for name, model in self.models.items():
            print(f"\nRunning {name}...")
            
            # Fit and predict
            if name == 'dbscan':
                labels = model.fit_predict(X)
            else:
                labels = model.fit_predict(X)
            
            # Calculate metrics (skip if only 1 cluster)
            n_clusters = len(np.unique(labels[labels >= 0]))
            
            if n_clusters > 1:
                # Exclude noise points for DBSCAN
                mask = labels >= 0
                if mask.sum() > 0:
                    silhouette = silhouette_score(X[mask], labels[mask])
                    calinski = calinski_harabasz_score(X[mask], labels[mask])
                    davies_bouldin = davies_bouldin_score(X[mask], labels[mask])
                else:
                    silhouette, calinski, davies_bouldin = 0, 0, float('inf')
            else:
                silhouette, calinski, davies_bouldin = 0, 0, float('inf')
            
            result = {
                'algorithm': name,
                'n_clusters': n_clusters,
                'n_noise': (labels == -1).sum() if name == 'dbscan' else 0,
                'silhouette': silhouette,
                'calinski_harabasz': calinski,
                'davies_bouldin': davies_bouldin
            }
            
            results.append(result)
        
        self.results = pd.DataFrame(results)
        return self.results
    
    def visualize_comparison(self):
        """Visualize algorithm comparison"""
        fig, axes = plt.subplots(1, 3, figsize=(15, 5))
        
        metrics = ['silhouette', 'calinski_harabasz', 'davies_bouldin']
        titles = ['Silhouette Score\n(higher=better)', 
                 'Calinski-Harabasz\n(higher=better)',
                 'Davies-Bouldin\n(lower=better)']
        
        for idx, (metric, title) in enumerate(zip(metrics, titles)):
            bars = axes[idx].barh(self.results['algorithm'], self.results[metric])
            axes[idx].set_xlabel(metric.replace('_', ' ').title())
            axes[idx].set_title(title)
            
            # Add value labels
            for bar, value in zip(bars, self.results[metric]):
                axes[idx].text(value + 0.01 * value, bar.get_y() + bar.get_height()/2,
                             f'{value:.3f}', va='center', fontsize=9)
        
        plt.tight_layout()
        plt.savefig('clustering_comparison.png', dpi=150, bbox_inches='tight')
        plt.show()

# Example usage
# comparator = ClusteringComparison()
# comparator.define_models(n_clusters=5)
# results = comparator.compare_performance(X_scaled)
# comparator.visualize_comparison()

3. Clustering Evaluation Metrics

class ClusteringEvaluator:
    """Comprehensive clustering evaluation"""
    
    def __init__(self, X, labels):
        self.X = X
        self.labels = labels
        self.n_clusters = len(np.unique(labels[labels >= 0]))
    
    def calculate_all_metrics(self):
        """Calculate comprehensive clustering metrics"""
        # Filter out noise points
        mask = self.labels >= 0
        X_filtered = self.X[mask]
        labels_filtered = self.labels[mask]
        
        metrics = {
            'n_clusters': self.n_clusters,
            'n_noise': (self.labels == -1).sum(),
            'cluster_sizes': pd.Series(labels_filtered).value_counts().to_dict()
        }
        
        if self.n_clusters > 1:
            # Internal metrics
            metrics['silhouette'] = silhouette_score(X_filtered, labels_filtered)
            metrics['calinski_harabasz'] = calinski_harabasz_score(X_filtered, labels_filtered)
            metrics['davies_bouldin'] = davies_bouldin_score(X_filtered, labels_filtered)
            
            # Inertia
            from sklearn.metrics import pairwise_distances_argmin_min
            centroids = np.array([X_filtered[labels_filtered == i].mean(axis=0) 
                                 for i in range(self.n_clusters)])
            _, distances = pairwise_distances_argmin_min(X_filtered, centroids)
            metrics['inertia'] = np.sum(distances ** 2)
            
            # Cluster separation
            metrics['inter_cluster_distance'] = self._inter_cluster_distance(centroids)
            metrics['intra_cluster_distance'] = self._intra_cluster_distance(X_filtered, labels_filtered)
        
        return metrics
    
    def _inter_cluster_distance(self, centroids):
        """Calculate inter-cluster distance"""
        from scipy.spatial.distance import pdist
        if len(centroids) > 1:
            distances = pdist(centroids)
            return np.mean(distances)
        return 0
    
    def _intra_cluster_distance(self, X, labels):
        """Calculate average intra-cluster distance"""
        intra_distances = []
        
        for i in range(self.n_clusters):
            cluster_points = X[labels == i]
            if len(cluster_points) > 1:
                from scipy.spatial.distance import pdist
                distances = pdist(cluster_points)
                intra_distances.append(np.mean(distances))
        
        return np.mean(intra_distances) if intra_distances else 0
    
    def plot_silhouette_analysis(self):
        """Plot silhouette analysis"""
        from sklearn.metrics import silhouette_samples
        
        sample_silhouette_values = silhouette_samples(self.X, self.labels)
        
        fig, ax = plt.subplots(figsize=(8, 6))
        
        y_lower = 10
        
        for i in range(self.n_clusters):
            ith_cluster_silhouette_values = sample_silhouette_values[self.labels == i]
            ith_cluster_silhouette_values.sort()
            
            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i
            
            color = plt.cm.tab20(i / self.n_clusters)
            ax.fill_betweenx(np.arange(y_lower, y_upper),
                            0, ith_cluster_silhouette_values,
                            facecolor=color, edgecolor=color, alpha=0.7)
            
            ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
            y_lower = y_upper + 10
        
        ax.set_title("Silhouette Analysis")
        ax.set_xlabel("Silhouette Coefficient")
        ax.set_ylabel("Cluster")
        
        # Add average silhouette line
        avg_silhouette = np.mean(sample_silhouette_values)
        ax.axvline(x=avg_silhouette, color="red", linestyle="--", 
                  label=f"Average: {avg_silhouette:.3f}")
        ax.legend()
        
        plt.tight_layout()
        plt.savefig('silhouette_analysis.png', dpi=150, bbox_inches='tight')
        plt.show()

# Example usage
# evaluator = ClusteringEvaluator(X_scaled, labels)
# metrics = evaluator.calculate_all_metrics()
# evaluator.plot_silhouette_analysis()

4. Real-World Application: User Segmentation

class UserSegmentation:
    """Complete user segmentation system"""
    
    def __init__(self):
        self.scaler = StandardScaler()
        self.pca = PCA(n_components=0.95)  # Retain 95% variance
        self.model = None
        self.segments = None
    
    def preprocess_features(self, df, feature_columns):
        """Preprocess features for clustering"""
        # Handle missing values
        X = df[feature_columns].fillna(0)
        
        # Log transform skewed features
        for col in feature_columns:
            if X[col].skew() > 1:
                X[col] = np.log1p(X[col])
        
        # Scale features
        X_scaled = self.scaler.fit_transform(X)
        
        # Reduce dimensionality if needed
        if X.shape[1] > 20:
            X_scaled = self.pca.fit_transform(X_scaled)
            print(f"PCA reduced dimensions from {len(feature_columns)} to {X_scaled.shape[1]}")
        
        return X_scaled
    
    def find_optimal_segments(self, X_scaled):
        """Find optimal number of segments"""
        finder = OptimalClustersFinder(X_scaled)
        optimal_k = finder.find_optimal_k(k_range=range(2, 15))
        return optimal_k
    
    def create_segments(self, X_scaled, n_clusters):
        """Create user segments"""
        self.model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        self.segments = self.model.fit_predict(X_scaled)
        
        return self.segments
    
    def profile_segments(self, df, feature_columns, segment_col='segment'):
        """Create detailed segment profiles"""
        df_with_segments = df.copy()
        df_with_segments[segment_col] = self.segments
        
        profiles = {}
        
        for segment in range(self.n_clusters):
            segment_data = df_with_segments[df_with_segments[segment_col] == segment]
            
            profile = {
                'size': len(segment_data),
                'percentage': len(segment_data) / len(df) * 100,
                'features': {}
            }
            
            for feature in feature_columns:
                if feature in segment_data.columns:
                    profile['features'][feature] = {
                        'mean': segment_data[feature].mean(),
                        'median': segment_data[feature].median(),
                        'std': segment_data[feature].std()
                    }
            
            profiles[f'Segment {segment}'] = profile
        
        return profiles
    
    def interpret_segments(self, profiles, feature_columns):
        """Generate human-readable segment interpretations"""
        interpretations = {}
        
        # Calculate global statistics
        global_means = {}
        for feature in feature_columns:
            means = [profiles[seg]['features'][feature]['mean'] 
                    for seg in profiles.keys()]
            global_means[feature] = np.mean(means)
        
        # Interpret each segment
        for segment_name, profile in profiles.items():
            interpretation = []
            
            for feature in feature_columns:
                segment_mean = profile['features'][feature]['mean']
                global_mean = global_means[feature]
                
                if segment_mean > global_mean * 1.2:
                    interpretation.append(f"High {feature}")
                elif segment_mean < global_mean * 0.8:
                    interpretation.append(f"Low {feature}")
            
            interpretations[segment_name] = {
                'profile': interpretation,
                'size': profile['size'],
                'percentage': profile['percentage']
            }
        
        return interpretations
    
    def create_marketing_strategies(self, interpretations):
        """Generate marketing strategies for each segment"""
        strategies = {}
        
        for segment_name, info in interpretations.items():
            profile = info['profile']
            
            if 'High engagement' in str(profile) and 'High connections' in str(profile):
                strategy = 'Power users - Focus on retention and advocacy programs'
            elif 'Low engagement' in str(profile) and 'New user' in str(profile):
                strategy = 'New users - Focus on onboarding and activation'
            elif 'High job applications' in str(profile):
                strategy = 'Job seekers - Focus on job-related features'
            elif 'High content creation' in str(profile):
                strategy = 'Content creators - Focus on creator tools and monetization'
            else:
                strategy = 'General users - Standard engagement campaigns'
            
            strategies[segment_name] = {
                'interpretation': profile,
                'strategy': strategy,
                'size': info['size'],
                'priority': 'High' if info['percentage'] > 20 else 'Medium'
            }
        
        return strategies

# Example usage
# segmentation = UserSegmentation()
# X_scaled = segmentation.preprocess_features(df, feature_columns)
# optimal_k = segmentation.find_optimal_segments(X_scaled)
# segments = segmentation.create_segments(X_scaled, optimal_k)
# profiles = segmentation.profile_segments(df, feature_columns)
# interpretations = segmentation.interpret_segments(profiles, feature_columns)
# strategies = segmentation.create_marketing_strategies(interpretations)

πŸ’‘

Pro Tip: When profiling segments, don't just look at means. Examine the full distribution, including skewness and outliers, to understand the true nature of each segment.

5. Common Follow-Up Questions

Follow-up 1: How do you handle high-dimensional data in clustering?

def handle_high_dimensionality(X, method='pca'):
    """Handle high-dimensional data for clustering"""
    
    if method == 'pca':
        from sklearn.decomposition import PCA
        pca = PCA(n_components=0.95)  # Retain 95% variance
        X_reduced = pca.fit_transform(X)
        print(f"PCA: {X.shape[1]} β†’ {X_reduced.shape[1]} dimensions")
        print(f"Explained variance: {pca.explained_variance_ratio_.sum():.2%}")
        
    elif method == 'tsne':
        from sklearn.manifold import TSNE
        tsne = TSNE(n_components=2, perplexity=30, random_state=42)
        X_reduced = tsne.fit_transform(X)
        print(f"t-SNE: {X.shape[1]} β†’ {X_reduced.shape[1]} dimensions")
        
    elif method == 'umap':
        import umap
        reducer = umap.UMAP(n_components=2, random_state=42)
        X_reduced = reducer.fit_transform(X)
        print(f"UMAP: {X.shape[1]} β†’ {X_reduced.shape[1]} dimensions")
    
    elif method == 'autoencoder':
        from tensorflow.keras.models import Model
        from tensorflow.keras.layers import Input, Dense
        
        # Simple autoencoder
        input_dim = X.shape[1]
        encoding_dim = 10
        
        input_layer = Input(shape=(input_dim,))
        encoded = Dense(encoding_dim, activation='relu')(input_layer)
        decoded = Dense(input_dim, activation='sigmoid')(encoded)
        
        autoencoder = Model(input_layer, decoded)
        encoder = Model(input_layer, encoded)
        
        autoencoder.compile(optimizer='adam', loss='mse')
        autoencoder.fit(X, X, epochs=50, batch_size=256, verbose=0)
        
        X_reduced = encoder.predict(X)
        print(f"Autoencoder: {X.shape[1]} β†’ {X_reduced.shape[1]} dimensions")
    
    return X_reduced

Follow-up 2: How do you handle streaming data for clustering?

class StreamingClustering:
    """Handle streaming data for clustering"""
    
    def __init__(self, initial_data, n_clusters=5):
        self.n_clusters = n_clusters
        self.centroids = None
        self.counts = np.zeros(n_clusters)
        self.initialize(initial_data)
    
    def initialize(self, X):
        """Initialize centroids using K-Means++"""
        from sklearn.cluster import KMeans
        kmeans = KMeans(n_clusters=self.n_clusters, init='k-means++', random_state=42)
        kmeans.fit(X)
        self.centroids = kmeans.cluster_centers_
    
    def partial_fit(self, X_batch):
        """Update centroids with new data batch"""
        # Assign points to nearest centroid
        from sklearn.metrics import pairwise_distances_argmin_min
        labels, _ = pairwise_distances_argmin_min(X_batch, self.centroids)
        
        # Update centroids using online K-Means
        for i in range(self.n_clusters):
            mask = labels == i
            if mask.any():
                self.counts[i] += mask.sum()
                # Online update: centroid = centroid + lr * (point - centroid)
                lr = 1 / self.counts[i]
                self.centroids[i] += lr * (X_batch[mask].mean(axis=0) - self.centroids[i])
    
    def predict(self, X):
        """Predict cluster for new points"""
        from sklearn.metrics import pairwise_distances_argmin_min
        labels, _ = pairwise_distances_argmin_min(X, self.centroids)
        return labels

# Example usage
# streaming = StreamingClustering(X_initial, n_clusters=5)
# for batch in data_stream:
#     streaming.partial_fit(batch)
# labels = streaming.predict(X_test)

Company-Specific Tips

ℹ️

Google Tips:

  • Google values scalable clustering algorithms
  • Be prepared to discuss Mini-Batch K-Means for large datasets
  • Know how to use clustering for anomaly detection
  • Understand hierarchical clustering for taxonomy

LinkedIn Tips:

  • LinkedIn focuses on user segmentation for recommendations
  • Know how to cluster professional profiles
  • Be comfortable with graph-based clustering
  • Understand how to use clustering for feed personalization

Quiz Section


Related Topics

Advertisement