Data Science Case Studies & Interview Prep

Data science interviews test product thinking, analytical frameworks, and communication. Master the patterns that top candidates use.

The Case Study Framework

Framework Steps

1. Clarify the problem    → Ask questions, define scope
2. Define metrics          → Choose north star and guardrails
3. Structure the analysis  → Break into components
4. Deep dive               → Explore hypotheses
5. Recommend               → Data-driven action
6. Follow up               → Monitor and iterate

Product Metrics

import pandas as pd
import numpy as np
from dataclasses import dataclass

@dataclass
class MetricsDefinition:
    """Bundle of metric names describing how one product type is measured.

    Attributes:
        north_star: Name of the single headline metric.
        input_metrics: Names of the metrics listed as inputs.
        guardrail_metrics: Names of the metrics listed as guardrails.
    """

    # Headline metric name.
    north_star: str
    # Metric names grouped as inputs.
    input_metrics: list
    # Metric names grouped as guardrails.
    guardrail_metrics: list

def define_product_metrics(product_type):
    """Return the metric definition for a product type.

    Args:
        product_type: One of ``"social_media"``, ``"ecommerce"`` or ``"saas"``.

    Returns:
        A ``MetricsDefinition`` holding the north-star metric name, the input
        metric names and the guardrail metric names for that product type.

    Raises:
        KeyError: If ``product_type`` is not one of the supported values. The
            message lists the supported product types.
    """
    metrics = {
        "social_media": MetricsDefinition(
            north_star="daily_active_users",
            input_metrics=[
                "new_signups",
                "session_length",
                "content_created",
                "shares_per_user",
            ],
            guardrail_metrics=[
                "spam_rate",
                "report_rate",
                "time_to_first_negative_action",
            ],
        ),
        "ecommerce": MetricsDefinition(
            north_star="revenue_per_visitor",
            input_metrics=[
                "conversion_rate",
                "average_order_value",
                "items_per_cart",
                # NOTE(review): "return_rate" is also listed as a guardrail
                # below; confirm it is meant to appear in both lists.
                "return_rate",
            ],
            guardrail_metrics=[
                "customer_satisfaction",
                "return_rate",
                "support_ticket_rate",
            ],
        ),
        "saas": MetricsDefinition(
            north_star="monthly_recurring_revenue",
            input_metrics=[
                "trial_conversion",
                "feature_adoption",
                "seats_per_account",
                "churn_rate",
            ],
            guardrail_metrics=[
                "time_to_value",
                "support_ticket_volume",
                "net_promoter_score",
            ],
        ),
    }
    try:
        return metrics[product_type]
    except KeyError:
        # Same exception type as a plain dict lookup, but with a message that
        # tells the caller which values are accepted.
        supported = ", ".join(sorted(metrics))
        raise KeyError(
            f"Unknown product_type {product_type!r}; expected one of: {supported}"
        ) from None

# AARRR metrics framework
def aarrr_metrics(user_data):
    """Compute AARRR metrics: Acquisition, Activation, Retention, Revenue, Referral.

    Args:
        user_data: pandas DataFrame that may contain several rows per user.
            Columns read: ``user_id``, ``is_new``, ``is_paid``,
            ``marketing_spend``, ``completed_onboarding``,
            ``days_since_signup``, ``is_active_day1``, ``is_active_day7``,
            ``is_active_day30``, ``revenue`` and ``referrals``. ``is_new`` and
            ``completed_onboarding`` are used as boolean row masks.

    Returns:
        dict with the keys ``acquisition``, ``activation``, ``retention``,
        ``revenue`` and ``referral``; each value is a dict of metric name to
        number. Ratios whose denominator is zero (no new users / no users)
        are reported as NaN.
    """
    metrics = {}
    per_user = user_data.groupby('user_id')
    total_users = user_data['user_id'].nunique()

    def user_level_rate(flag_column):
        # One value per user (their first row), so users with many rows do
        # not weigh more than users with a single row.
        return per_user[flag_column].first().mean()

    # Acquisition -- filter the new-user rows once and reuse them.
    new_rows = user_data[user_data['is_new']]
    new_users = new_rows['user_id'].nunique()
    metrics['acquisition'] = {
        'new_users': new_users,
        'cost_per_acquisition': (
            user_data['marketing_spend'].sum() / new_users if new_users else np.nan
        ),
        'organic_rate': 1 - new_rows['is_paid'].mean(),
    }

    # Activation
    onboarded_rows = user_data[user_data['completed_onboarding']]
    metrics['activation'] = {
        'activation_rate': user_level_rate('completed_onboarding'),
        'time_to_activation': onboarded_rows['days_since_signup'].median(),
    }

    # Retention -- all three horizons use the same per-user definition so
    # they are directly comparable with each other.
    metrics['retention'] = {
        'd1_retention': user_level_rate('is_active_day1'),
        'd7_retention': user_level_rate('is_active_day7'),
        'd30_retention': user_level_rate('is_active_day30'),
    }

    # Revenue
    # NOTE(review): arpu and arppu average over rows, not users; they equal
    # per-user figures only when user_data has exactly one row per user.
    paying = user_data[user_data['revenue'] > 0]
    metrics['revenue'] = {
        'arpu': user_data['revenue'].mean(),
        'arppu': paying['revenue'].mean(),
        'ltv': paying.groupby('user_id')['revenue'].sum().mean(),
    }

    # Referral
    metrics['referral'] = {
        'viral_coefficient': (
            user_data['referrals'].sum() / total_users if total_users else np.nan
        ),
        'organic_share': 1 - user_data['is_paid'].mean(),
    }

    return metrics

Funnel Analysis

import pandas as pd
import numpy as np

class FunnelAnalyzer:
    """Funnel conversion analysis over an event log.

    The event log is a pandas DataFrame with at least an ``event`` column and
    a ``user_id`` column. A user is counted at a step when they have at least
    one row with that event; the count does not require the user to appear at
    earlier steps.
    """

    def __init__(self, events_df):
        """Store the event log to analyse.

        Args:
            events_df: DataFrame with ``event`` and ``user_id`` columns, plus
                any columns later used for segmentation.
        """
        self.events = events_df

    @staticmethod
    def _funnel_table(events, steps):
        """Build the per-step user counts and conversion rates for one event log."""
        steps = list(steps)
        users = [
            events.loc[events['event'] == step, 'user_id'].nunique()
            for step in steps
        ]
        table = pd.DataFrame({
            'step': steps,
            'users': pd.Series(users, dtype='int64'),
        })
        # With no steps there is no entry step to divide by; the rate columns
        # are still created (empty) so the result always has the same schema.
        entry_users = table['users'].iloc[0] if len(table) else np.nan
        table['conversion_rate'] = table['users'] / entry_users
        table['step_conversion'] = table['users'] / table['users'].shift(1)
        table['drop_off'] = 1 - table['step_conversion']
        return table

    def compute_funnel(self, steps, segment_by=None):
        """Compute funnel conversion rates.

        Args:
            steps: Ordered event names that make up the funnel.
            segment_by: Optional name of a single column of the event log.
                When given, the funnel is computed separately for each value
                of that column.

        Returns:
            DataFrame with one row per step and the columns ``step``,
            ``users`` (unique users with that event), ``conversion_rate``
            (users relative to the first step), ``step_conversion`` (users
            relative to the previous step; NaN for the first step) and
            ``drop_off`` (``1 - step_conversion``). When ``segment_by`` is
            given, the per-segment tables are stacked and a leading column
            named after ``segment_by`` identifies the segment.
        """
        if segment_by is None:
            return self._funnel_table(self.events, steps)

        tables = []
        for segment, segment_events in self.events.groupby(segment_by):
            table = self._funnel_table(segment_events, steps)
            table.insert(0, segment_by, segment)
            tables.append(table)

        if not tables:
            # No segments at all (empty log): return an empty table with the
            # same columns instead of failing inside pd.concat.
            return pd.DataFrame(columns=[
                segment_by, 'step', 'users',
                'conversion_rate', 'step_conversion', 'drop_off',
            ])
        return pd.concat(tables, ignore_index=True)

    def identify_drop_off_points(self, funnel_df, threshold=0.3):
        """Return the funnel rows whose ``drop_off`` exceeds ``threshold``.

        Args:
            funnel_df: DataFrame with a ``drop_off`` column, such as the one
                returned by ``compute_funnel``.
            threshold: Drop-off fraction a step must strictly exceed.

        Returns:
            The matching rows of ``funnel_df``. Rows with a NaN drop-off (the
            first step) never match.
        """
        return funnel_df[funnel_df['drop_off'] > threshold]

    def segment_analysis(self, steps, segment_col):
        """Compare the funnel across the values of ``segment_col``.

        Args:
            steps: Ordered event names that make up the funnel.
            segment_col: Name of the event-log column to segment by.

        Returns:
            DataFrame indexed by step, in funnel order, with one user-count
            column per segment value plus a ``"<segment>_rate"`` column giving
            each count relative to that segment's first funnel step.
        """
        steps = list(steps)
        rows = []
        for segment in self.events[segment_col].unique():
            segment_events = self.events[self.events[segment_col] == segment]
            for step in steps:
                step_rows = segment_events[segment_events['event'] == step]
                rows.append({
                    'segment': segment,
                    'step': step,
                    'users': step_rows['user_id'].nunique(),
                })

        counts = pd.DataFrame(rows, columns=['segment', 'step', 'users'])
        pivot = counts.pivot(index='step', columns='segment', values='users')
        # pivot() returns the step index sorted alphabetically; restore funnel
        # order so that iloc[0] below really is the first funnel step.
        pivot = pivot.reindex(pd.Index(steps, name='step'))

        for segment in list(pivot.columns):
            pivot[f'{segment}_rate'] = pivot[segment] / pivot[segment].iloc[0]

        return pivot

# Example usage: a synthetic event log with four randomly drawn events per user.
# The seed and the order of the random draws are fixed, so the printed tables
# are reproducible.
np.random.seed(42)
n_users = 5000
_funnel_steps = ['page_view', 'signup', 'first_action', 'purchase']
_n_rows = n_users * 4

events = pd.DataFrame({
    'user_id': np.repeat(range(n_users), 4),
    'event': np.random.choice(_funnel_steps, _n_rows, p=[0.5, 0.3, 0.15, 0.05]),
    'device': np.random.choice(['mobile', 'desktop', 'tablet'], _n_rows),
    'channel': np.random.choice(['organic', 'paid', 'referral'], _n_rows),
})

analyzer = FunnelAnalyzer(events)

# Overall funnel.
funnel = analyzer.compute_funnel(_funnel_steps)
print(funnel)

# The same funnel broken down by device.
device_funnel = analyzer.segment_analysis(_funnel_steps, 'device')
print(device_funnel)

Experimentation Analysis

import numpy as np
from scipy import stats
from dataclasses import dataclass

@dataclass
class ExperimentResult:
    """Summary of a two-group experiment comparison.

    Attributes:
        control_mean: Mean of the control group.
        treatment_mean: Mean of the treatment group.
        lift: Relative lift value for the comparison.
        p_value: P-value of the comparison.
        ci_lower: Lower bound of the confidence interval.
        ci_upper: Upper bound of the confidence interval.
        significant: Whether the result is flagged as significant.
        power: Power value reported for the comparison.
    """

    # Group means.
    control_mean: float
    treatment_mean: float
    # Effect summary.
    lift: float
    p_value: float
    # Confidence-interval bounds.
    ci_lower: float
    ci_upper: float
    # Decision flag and reported power.
    significant: bool
    power: float

class ExperimentAnalyzer:
    """Sample-size planning and pooled two-sample t-test analysis.

    Args:
        alpha: Two-sided significance level used for the significance flag,
            the confidence interval and the power estimate.
        mde: Minimum detectable effect used by ``required_sample_size``.
    """

    def __init__(self, alpha=0.05, mde=0.02):
        self.alpha = alpha
        self.mde = mde

    def required_sample_size(self, baseline_std, power=0.8, baseline_mean=None):
        """Return the per-group sample size needed to detect the MDE.

        Uses ``n = 2 * (z_alpha + z_beta)^2 * sigma^2 / delta^2``.

        Args:
            baseline_std: Standard deviation of the metric.
            power: Target statistical power.
            baseline_mean: Optional baseline mean of the metric. When given,
                ``mde`` is treated as a relative change and the absolute
                effect is ``delta = mde * baseline_mean``. When omitted,
                ``delta = mde * baseline_std``, i.e. ``mde`` is a standardized
                effect size and ``baseline_std`` cancels out of the result.

        Returns:
            Required number of samples per group, rounded up to an int.
        """
        z_alpha = stats.norm.ppf(1 - self.alpha / 2)
        z_beta = stats.norm.ppf(power)

        if baseline_mean is None:
            delta = self.mde * baseline_std
        else:
            delta = self.mde * baseline_mean

        n = 2 * ((z_alpha + z_beta) ** 2) * (baseline_std ** 2) / delta ** 2
        return int(np.ceil(n))

    def analyze_two_sample(self, control_data, treatment_data):
        """Compare two samples with a pooled-variance (Student) t-test.

        Args:
            control_data: Observations of the control group.
            treatment_data: Observations of the treatment group.

        Returns:
            ExperimentResult with the group means, the relative lift
            ``(treatment - control) / control``, the two-sided p-value, the
            ``1 - alpha`` confidence interval for the difference in means,
            the significance flag (``p_value < alpha``) and an approximate
            observed power.
        """
        n_control = len(control_data)
        n_treatment = len(treatment_data)

        mean_control = np.mean(control_data)
        mean_treatment = np.mean(treatment_data)
        diff = mean_treatment - mean_control

        var_control = np.var(control_data, ddof=1)
        var_treatment = np.var(treatment_data, ddof=1)
        df = n_control + n_treatment - 2
        std_pooled = np.sqrt(
            ((n_control - 1) * var_control + (n_treatment - 1) * var_treatment) / df
        )

        se = std_pooled * np.sqrt(1 / n_control + 1 / n_treatment)
        t_stat = diff / se

        # sf() is the upper tail computed directly; 1 - cdf() loses precision
        # for large |t|.
        p_value = 2 * stats.t.sf(abs(t_stat), df)

        # Critical value from the configured alpha and the test's degrees of
        # freedom, so the interval agrees with the significance decision.
        t_crit = stats.t.ppf(1 - self.alpha / 2, df)
        ci_lower = diff - t_crit * se
        ci_upper = diff + t_crit * se

        # Approximate observed power: probability that a central-t variable
        # shifted by the observed |t| exceeds the critical value. |t| is used
        # because the test is two-sided.
        power = stats.t.sf(t_crit - abs(t_stat), df)

        return ExperimentResult(
            control_mean=mean_control,
            treatment_mean=mean_treatment,
            lift=diff / mean_control,
            p_value=p_value,
            ci_lower=ci_lower,
            ci_upper=ci_upper,
            significant=p_value < self.alpha,
            power=power
        )

# Sequential testing: planned interim looks with adjusted stopping boundaries, so peeking does not inflate the false-positive rate
class SequentialAnalyzer:
    """Group-sequential stopping rule with O'Brien-Fleming-shaped boundaries.

    Args:
        alpha: Two-sided significance level targeted at the final look.
        max_samples: Sample count that corresponds to full information.

    Attributes:
        boundaries: Critical values for ``|t|`` at ten evenly spaced
            information fractions from 0 to 1.
    """

    def __init__(self, alpha=0.05, max_samples=10000):
        self.alpha = alpha
        self.max_samples = max_samples
        self.boundaries = self._compute_boundaries()

    def _compute_boundaries(self):
        """Return the stopping boundaries, one per information fraction.

        Simplified O'Brien-Fleming shape: ``z_(1 - alpha/2) / sqrt(t)`` for
        information fraction ``t``. The boundary is infinite at ``t = 0``
        (never stop before any data), very strict early on, and equals the
        fixed-sample critical value at ``t = 1``. Exact O'Brien-Fleming
        designs use a somewhat larger constant that depends on the number of
        looks; this approximation does not apply that correction.
        """
        z_final = stats.norm.ppf(1 - self.alpha / 2)
        info_rates = np.linspace(0, 1, 10)
        return [
            np.inf if rate == 0 else z_final / np.sqrt(rate)
            for rate in info_rates
        ]

    def check_stop(self, sample_idx, t_statistic):
        """Return whether the experiment may stop at the current look.

        Args:
            sample_idx: Number of samples collected so far.
            t_statistic: Current test statistic.

        Returns:
            True when ``|t_statistic|`` reaches the boundary for the current
            information fraction. The fraction is rounded down to the nearest
            precomputed look, which selects the stricter (earlier) boundary.
        """
        # Clamp so that out-of-range sample counts cannot index outside the
        # boundary table (a negative index would wrap to the last entries).
        info_rate = min(max(sample_idx / self.max_samples, 0.0), 1.0)
        idx = int(info_rate * (len(self.boundaries) - 1))

        return abs(t_statistic) >= self.boundaries[idx]

Case Study: Engagement Drop

import pandas as pd
import numpy as np

def engagement_drop_case_study():
    """Walk through a framework for investigating an engagement drop.

    Builds illustrative data for each step of the investigation: a 30-day
    metric time series with a downward trend, per-segment impact figures, a
    log of recent changes, and a written hypothesis with recommended actions.
    The time series is drawn from NumPy's global random state, so it differs
    between calls unless the caller seeds it.

    Returns:
        dict with the keys ``metrics`` (DataFrame with ``date``, ``dau``,
        ``sessions_per_user`` and ``session_length``), ``device_impact`` and
        ``geo_impact`` (dicts of segment to percent change), ``changes_log``
        (list of dicts with ``date``, ``change`` and ``impact``) and
        ``hypothesis`` (str).
    """

    # Step 1: Validate the signal
    days = np.arange(30)
    metrics_over_time = pd.DataFrame({
        'date': pd.date_range('2024-01-01', periods=30),
        'dau': np.random.poisson(10000, 30) - days * 50,
        'sessions_per_user': np.random.normal(3, 0.3, 30) - days * 0.01,
        'session_length': np.random.normal(5, 0.5, 30) - days * 0.1
    })

    # Step 2: Segment the drop
    # By device
    device_impact = {
        'mobile': -15,  # % change
        'desktop': -5,
        'tablet': -3
    }

    # By geography
    geo_impact = {
        'US': -8,
        'EU': -12,
        'APAC': -20
    }

    # Step 3: Correlate with changes
    changes_log = [
        {'date': '2024-01-15', 'change': 'App update v2.3', 'impact': 'high'},
        {'date': '2024-01-20', 'change': 'Server migration', 'impact': 'medium'},
        {'date': '2024-01-25', 'change': 'Feature deprecation', 'impact': 'high'}
    ]

    # Step 4: Formulate hypothesis
    hypothesis = """
    The engagement drop is primarily driven by:
    1. App update v2.3 introducing UX friction on mobile (biggest impact)
    2. Server migration causing latency in APAC region
    3. Feature deprecation affecting power users
    
    Recommended actions:
    1. Roll back problematic UI changes in v2.3.1 hotfix
    2. Optimize CDN for APAC region
    3. Restore deprecated feature as opt-in for power users
    """

    # The change log is the evidence the hypothesis rests on, so it is
    # returned with the other artefacts instead of being discarded.
    return {
        'metrics': metrics_over_time,
        'device_impact': device_impact,
        'geo_impact': geo_impact,
        'changes_log': changes_log,
        'hypothesis': hypothesis
    }

# Run the case study and print its hypothesis / recommended-actions text.
result = engagement_drop_case_study()
print(result['hypothesis'])

Communication Tips

# STAR method for behavioral questions
def star_example():
    """Return a sample behavioral-interview answer in STAR form.

    Returns:
        dict mapping "Situation", "Task", "Action" and "Result" (in that
        order) to the text of each part of the answer.
    """
    situation = "Our recommendation model was underperforming, with CTR 20% below target"
    task = "I needed to identify the root cause and improve model performance"
    action = "I analyzed the feature distributions, discovered data drift in user preferences, and retrained with fresh data while adding new behavioral features"
    outcome = "CTR improved by 35%, exceeding the original target"

    answer = {}
    answer["Situation"] = situation
    answer["Task"] = task
    answer["Action"] = action
    answer["Result"] = outcome
    return answer

# Pyramid principle for presenting findings
def pyramid_presentation(finding):
    """Arrange a finding answer-first: conclusion, impact, evidence, risks.

    Args:
        finding: Mapping with the keys ``conclusion``, ``business_impact``,
            ``data_point_1``, ``data_point_2``, ``data_point_3`` and
            ``risks``.

    Returns:
        dict with ``recommendation``, ``why_it_matters``,
        ``supporting_evidence`` (list of the three data points, in order)
        and ``risks_and_mitigations``.

    Raises:
        KeyError: If ``finding`` lacks one of the keys listed above.
    """
    headline = finding['conclusion']
    impact = finding['business_impact']
    evidence = [finding[f'data_point_{number}'] for number in (1, 2, 3)]
    risks = finding['risks']

    return {
        "recommendation": headline,
        "why_it_matters": impact,
        "supporting_evidence": evidence,
        "risks_and_mitigations": risks,
    }

Key Takeaways

Structure first – Use consistent frameworks for every case
Metrics matter – Define north star and guardrail metrics early
Segment everything – The answer is usually in the segments
Communicate clearly – Lead with the answer, then support with evidence
Practice systematically – Do 2-3 cases per week for interview prep