Data Science Case Studies & Interview Prep
Data science interviews test product thinking, analytical frameworks, and communication. Master the patterns that top candidates use.
The Case Study Framework
Architecture Diagram
1. Clarify the problem — Ask questions, define scope
2. Define metrics — Choose north star and guardrails
3. Structure the analysis — Break into components
4. Deep dive — Explore hypotheses
5. Recommend — Data-driven action
6. Follow up — Monitor and iterate
Product Metrics
import pandas as pd
import numpy as np
from dataclasses import dataclass
@dataclass
class MetricsDefinition:
    """Named set of metrics tracked for one product type.

    Bundles a single headline ("north star") metric with two lists of
    metric names: the inputs tracked alongside it and the guardrails that
    are monitored while it is being optimised.
    """

    # Name of the headline metric.
    north_star: str
    # Names of the input metrics.
    input_metrics: list
    # Names of the guardrail metrics.
    guardrail_metrics: list
def define_product_metrics(product_type):
    """Return the MetricsDefinition registered for ``product_type``.

    Recognised values are "social_media", "ecommerce" and "saas". Any other
    value raises KeyError from the final lookup.
    """
    social_media = MetricsDefinition(
        north_star="daily_active_users",
        input_metrics=[
            "new_signups",
            "session_length",
            "content_created",
            "shares_per_user",
        ],
        guardrail_metrics=[
            "spam_rate",
            "report_rate",
            "time_to_first_negative_action",
        ],
    )
    ecommerce = MetricsDefinition(
        north_star="revenue_per_visitor",
        input_metrics=[
            "conversion_rate",
            "average_order_value",
            "items_per_cart",
            "return_rate",
        ],
        guardrail_metrics=[
            "customer_satisfaction",
            "return_rate",
            "support_ticket_rate",
        ],
    )
    saas = MetricsDefinition(
        north_star="monthly_recurring_revenue",
        input_metrics=[
            "trial_conversion",
            "feature_adoption",
            "seats_per_account",
            "churn_rate",
        ],
        guardrail_metrics=[
            "time_to_value",
            "support_ticket_volume",
            "net_promoter_score",
        ],
    )

    # All definitions are built on every call, then one is selected by key.
    catalogue = {
        "social_media": social_media,
        "ecommerce": ecommerce,
        "saas": saas,
    }
    return catalogue[product_type]
# AARRR metrics framework
def aarrr_metrics(user_data):
    """Compute AARRR metrics (Acquisition, Activation, Retention, Revenue, Referral).

    Args:
        user_data: pandas DataFrame that may contain several rows per user.
            Columns read: ``user_id``, ``is_new`` and ``completed_onboarding``
            (both used as boolean masks), ``marketing_spend``, ``is_paid``,
            ``days_since_signup``, ``is_active_day1``, ``is_active_day7``,
            ``is_active_day30``, ``revenue`` and ``referrals``.

    Returns:
        dict with the keys 'acquisition', 'activation', 'retention',
        'revenue' and 'referral'; each value is a dict mapping a metric name
        to a scalar. Ratios whose user-count denominator is zero are NaN.
    """
    nan = float('nan')
    per_user = user_data.groupby('user_id')
    metrics = {}

    # Acquisition
    new_rows = user_data[user_data['is_new']]
    new_users = new_rows['user_id'].nunique()  # computed once, used twice
    metrics['acquisition'] = {
        'new_users': new_users,
        # Undefined (NaN) rather than inf when nobody was acquired.
        'cost_per_acquisition': (
            user_data['marketing_spend'].sum() / new_users if new_users else nan
        ),
        'organic_rate': 1 - new_rows['is_paid'].mean(),
    }

    # Activation
    metrics['activation'] = {
        # One vote per user: the first recorded onboarding flag.
        'activation_rate': per_user['completed_onboarding'].first().mean(),
        # Median over the rows flagged as onboarded (row-level, not per user).
        'time_to_activation': (
            user_data[user_data['completed_onboarding']]['days_since_signup'].median()
        ),
    }

    # Retention
    # All three horizons are computed the same way: one value per user, then
    # the mean across users. Averaging raw rows would let users with more
    # rows count more, and averaging per-cohort means would let small
    # cohorts count as much as large ones.
    metrics['retention'] = {
        'd1_retention': per_user['is_active_day1'].first().mean(),
        'd7_retention': per_user['is_active_day7'].first().mean(),
        'd30_retention': per_user['is_active_day30'].first().mean(),
    }

    # Revenue
    paying = user_data[user_data['revenue'] > 0]
    metrics['revenue'] = {
        # 'arpu' and 'arppu' average over rows, so they are per-user figures
        # only when user_data holds one row per user.
        'arpu': user_data['revenue'].mean(),
        'arppu': paying['revenue'].mean(),
        # Mean of each paying user's summed revenue.
        'ltv': paying.groupby('user_id')['revenue'].sum().mean(),
    }

    # Referral
    total_users = user_data['user_id'].nunique()
    metrics['referral'] = {
        'viral_coefficient': (
            user_data['referrals'].sum() / total_users if total_users else nan
        ),
        'organic_share': 1 - user_data['is_paid'].mean(),
    }
    return metrics
Funnel Analysis
import pandas as pd
import numpy as np
class FunnelAnalyzer:
    """Funnel conversion analysis over an event log.

    The event log is a pandas DataFrame with at least ``user_id`` and
    ``event`` columns. A user counts towards a step when at least one of
    their rows has ``event`` equal to that step. Steps are counted
    independently of each other, so a user is counted at a later step even
    if they never appear at an earlier one.
    """

    _FUNNEL_COLUMNS = ['step', 'users', 'conversion_rate', 'step_conversion', 'drop_off']

    def __init__(self, events_df):
        self.events = events_df

    @staticmethod
    def _count_users(events, step):
        """Return the number of distinct users with an ``event`` equal to ``step``."""
        return events[events['event'] == step]['user_id'].nunique()

    def compute_funnel(self, steps, segment_by=None):
        """Compute funnel conversion rates.

        Args:
            steps: Ordered event names; the first one is the funnel entry.
            segment_by: Accepted for backward compatibility but not applied.
                Use ``segment_analysis`` for a per-segment funnel.

        Returns:
            DataFrame with one row per step and the columns ``step``,
            ``users``, ``conversion_rate`` (relative to the first step),
            ``step_conversion`` (relative to the previous step, NaN for the
            first row) and ``drop_off`` (``1 - step_conversion``). An empty
            ``steps`` yields an empty frame with those columns. A step with
            zero users in the denominator yields inf/NaN ratios.
        """
        steps = list(steps)
        if not steps:
            # Nothing to normalise against; return the schema only.
            return pd.DataFrame(columns=self._FUNNEL_COLUMNS)

        df = pd.DataFrame({
            'step': steps,
            'users': [self._count_users(self.events, step) for step in steps],
        })
        df['conversion_rate'] = df['users'] / df['users'].iloc[0]
        df['step_conversion'] = df['users'] / df['users'].shift(1)
        df['drop_off'] = 1 - df['step_conversion']
        return df

    def identify_drop_off_points(self, funnel_df, threshold=0.3):
        """Find steps with significant drop-offs.

        Returns the rows of ``funnel_df`` whose ``drop_off`` is strictly
        greater than ``threshold``. Rows with a NaN drop-off (the first
        step) never match.
        """
        return funnel_df[funnel_df['drop_off'] > threshold]

    def segment_analysis(self, steps, segment_col):
        """Compare the funnel across the values of ``segment_col``.

        Args:
            steps: Ordered event names; the first one is the funnel entry.
            segment_col: Column of the event log to segment by.

        Returns:
            DataFrame indexed by step, in the order given in ``steps``. It
            has one user-count column per segment value plus a
            ``<segment>_rate`` column holding that count divided by the
            segment's count at the first step.
        """
        steps = list(steps)
        rows = []
        for segment in self.events[segment_col].unique():
            segment_events = self.events[self.events[segment_col] == segment]
            for step in steps:
                rows.append({
                    'segment': segment,
                    'step': step,
                    'users': self._count_users(segment_events, step),
                })

        df = pd.DataFrame(rows, columns=['segment', 'step', 'users'])
        pivot = df.pivot(index='step', columns='segment', values='users')
        # pivot() returns the index sorted by label, which loses the funnel
        # order. Restore it so that iloc[0] below really is the entry step.
        pivot = pivot.reindex(pd.Index(steps, name='step'))

        for col in list(pivot.columns):
            pivot[f'{col}_rate'] = pivot[col] / pivot[col].iloc[0]
        return pivot
# Example usage
# Synthetic event log: four rows per user, with the event, device and
# channel drawn independently for every row (in that order).
np.random.seed(42)
n_users = 5000
events = pd.DataFrame({"user_id": np.repeat(range(n_users), 4)})
events["event"] = np.random.choice(
    ["page_view", "signup", "first_action", "purchase"],
    n_users * 4,
    p=[0.5, 0.3, 0.15, 0.05],
)
events["device"] = np.random.choice(["mobile", "desktop", "tablet"], n_users * 4)
events["channel"] = np.random.choice(["organic", "paid", "referral"], n_users * 4)

analyzer = FunnelAnalyzer(events)

# Overall funnel.
funnel = analyzer.compute_funnel(
    ["page_view", "signup", "first_action", "purchase"]
)
print(funnel)

# The same funnel broken down by device.
device_funnel = analyzer.segment_analysis(
    ["page_view", "signup", "first_action", "purchase"],
    "device",
)
print(device_funnel)
Experimentation Analysis
import numpy as np
from scipy import stats
from dataclasses import dataclass
@dataclass
class ExperimentResult:
    """Summary of one control-versus-treatment comparison."""

    # Sample mean of the control group.
    control_mean: float
    # Sample mean of the treatment group.
    treatment_mean: float
    # Lift of treatment over control.
    lift: float
    # p-value of the comparison.
    p_value: float
    # Lower and upper bounds of the reported confidence interval.
    ci_lower: float
    ci_upper: float
    # Whether the result was judged statistically significant.
    significant: bool
    # Estimated statistical power.
    power: float
class ExperimentAnalyzer:
    """Sample sizing and two-sample analysis for A/B experiments.

    Args:
        alpha: Two-sided significance level. It drives the significance
            decision, the confidence-interval width and the power estimate.
        mde: Minimum detectable effect used by ``required_sample_size``.
    """

    def __init__(self, alpha=0.05, mde=0.02):
        self.alpha = alpha
        self.mde = mde

    def required_sample_size(self, baseline_std, power=0.8, baseline_mean=None):
        """Return the per-group sample size for a two-sided two-sample test.

        Uses ``n = 2 * (z_alpha + z_beta)^2 * sigma^2 / delta^2``.

        Args:
            baseline_std: Standard deviation of the metric.
            power: Desired probability of detecting the effect.
            baseline_mean: Optional mean of the metric. When given, ``mde``
                is treated as a relative lift and ``delta = mde *
                baseline_mean``. When omitted, ``delta = mde *
                baseline_std``, i.e. ``mde`` is a standardised effect size
                and ``baseline_std`` cancels out of the result.

        Returns:
            The sample size per group, rounded up to an int.
        """
        z_alpha = stats.norm.ppf(1 - self.alpha / 2)
        z_beta = stats.norm.ppf(power)
        if baseline_mean is None:
            delta = self.mde * baseline_std
        else:
            delta = self.mde * baseline_mean
        n = 2 * ((z_alpha + z_beta) ** 2) * (baseline_std ** 2) / delta ** 2
        return int(np.ceil(n))

    def analyze_two_sample(self, control_data, treatment_data):
        """Compare two samples with a pooled-variance (Student) t-test.

        Args:
            control_data: Observations from the control group.
            treatment_data: Observations from the treatment group.

        Returns:
            ExperimentResult. ``lift`` is the difference in means relative
            to the control mean. ``ci_lower``/``ci_upper`` bound the
            difference in means at the ``1 - alpha`` level. ``power`` is an
            approximate post-hoc power at the observed effect.
        """
        n_control = len(control_data)
        n_treatment = len(treatment_data)
        mean_control = np.mean(control_data)
        mean_treatment = np.mean(treatment_data)
        diff = mean_treatment - mean_control

        var_control = np.var(control_data, ddof=1)
        var_treatment = np.var(treatment_data, ddof=1)
        df = n_control + n_treatment - 2
        std_pooled = np.sqrt(
            ((n_control - 1) * var_control + (n_treatment - 1) * var_treatment) / df
        )
        se = std_pooled * np.sqrt(1 / n_control + 1 / n_treatment)
        t_stat = diff / se

        # sf() keeps precision in the far tail, where 1 - cdf() rounds to 0.
        p_value = 2 * stats.t.sf(abs(t_stat), df)

        # The critical value comes from alpha and the same t-distribution as
        # the p-value, so the interval and the significance flag always agree.
        t_crit = stats.t.ppf(1 - self.alpha / 2, df)
        ci_lower = diff - t_crit * se
        ci_upper = diff + t_crit * se

        # Approximate power at the observed effect. The standardised effect
        # times sqrt(n_c * n_t / (n_c + n_t)) equals |t_stat|. The absolute
        # value makes the estimate symmetric, matching the two-sided test;
        # the negligible opposite rejection tail is ignored.
        power = stats.t.sf(t_crit - abs(t_stat), df)

        return ExperimentResult(
            control_mean=float(mean_control),
            treatment_mean=float(mean_treatment),
            lift=float(diff / mean_control),
            p_value=float(p_value),
            ci_lower=float(ci_lower),
            ci_upper=float(ci_upper),
            significant=bool(p_value < self.alpha),
            power=float(power),
        )
# Sequential testing (planned interim looks without the peeking problem)
class SequentialAnalyzer:
    """Group-sequential stopping rule with O'Brien-Fleming-shaped boundaries.

    Args:
        alpha: Two-sided significance level of the final analysis.
        max_samples: Planned maximum sample size; the information fraction
            is ``sample_idx / max_samples``.
    """

    def __init__(self, alpha=0.05, max_samples=10000):
        self.alpha = alpha
        self.max_samples = max_samples
        self.boundaries = self._compute_boundaries()

    def _compute_boundaries(self):
        """Return critical values at ten evenly spaced information fractions.

        Simplified O'Brien-Fleming shape: ``z_{alpha/2} / sqrt(t)`` at
        information fraction ``t``. The boundary is infinite at ``t = 0``
        and falls to the fixed-sample critical value at ``t = 1``. This is
        an approximation: it does not compute the exact constant for a
        given number of looks.
        """
        z_final = stats.norm.ppf(1 - self.alpha / 2)
        info_rates = np.linspace(0, 1, 10)
        boundaries = []
        for rate in info_rates:
            if rate == 0:
                # No information yet: stopping is impossible.
                boundaries.append(np.inf)
            else:
                boundaries.append(float(z_final / np.sqrt(rate)))
        return boundaries

    def check_stop(self, sample_idx, t_statistic):
        """Return True when ``|t_statistic|`` crosses the current boundary.

        The information fraction is rounded down to the nearest
        pre-computed look, so the stricter boundary applies between looks.
        Indices outside ``[0, max_samples]`` are clamped to the first or
        last boundary.
        """
        last = len(self.boundaries) - 1
        info_rate = sample_idx / self.max_samples
        # Clamp at both ends: a negative index would otherwise read the
        # boundary list from the end, or fall off it.
        idx = max(0, min(int(info_rate * last), last))
        return bool(abs(t_statistic) >= self.boundaries[idx])
Case Study: Engagement Drop
import pandas as pd
import numpy as np
def engagement_drop_case_study():
    """Framework for investigating engagement drops.

    Walks through the four steps of the investigation using synthetic,
    illustrative data. The time series is drawn from the global NumPy
    random state, so it is only reproducible if the caller seeds it.

    Returns:
        dict with the keys:
            'metrics': 30-day DataFrame with ``date``, ``dau``,
                ``sessions_per_user`` and ``session_length`` columns.
            'device_impact': percent change by device.
            'geo_impact': percent change by region.
            'changes_log': list of dicts describing the changes shipped
                during the window.
            'hypothesis': written-up hypothesis and recommended actions.
    """
    # Step 1: Validate the signal
    days = np.arange(30)
    metrics_over_time = pd.DataFrame({
        'date': pd.date_range('2024-01-01', periods=30),
        # Each series is noise around a baseline minus a linear decline.
        'dau': np.random.poisson(10000, 30) - days * 50,
        'sessions_per_user': np.random.normal(3, 0.3, 30) - days * 0.01,
        'session_length': np.random.normal(5, 0.5, 30) - days * 0.1,
    })

    # Step 2: Segment the drop
    # By device
    device_impact = {
        'mobile': -15,  # % change
        'desktop': -5,
        'tablet': -3,
    }
    # By geography
    geo_impact = {
        'US': -8,
        'EU': -12,
        'APAC': -20,
    }

    # Step 3: Correlate with changes
    changes_log = [
        {'date': '2024-01-15', 'change': 'App update v2.3', 'impact': 'high'},
        {'date': '2024-01-20', 'change': 'Server migration', 'impact': 'medium'},
        {'date': '2024-01-25', 'change': 'Feature deprecation', 'impact': 'high'},
    ]

    # Step 4: Formulate hypothesis
    hypothesis = """
The engagement drop is primarily driven by:
1. App update v2.3 introducing UX friction on mobile (biggest impact)
2. Server migration causing latency in APAC region
3. Feature deprecation affecting power users
Recommended actions:
1. Roll back problematic UI changes in v2.3.1 hotfix
2. Optimize CDN for APAC region
3. Restore deprecated feature as opt-in for power users
"""

    return {
        'metrics': metrics_over_time,
        'device_impact': device_impact,
        'geo_impact': geo_impact,
        # The hypothesis cites these changes, so return them as evidence.
        'changes_log': changes_log,
        'hypothesis': hypothesis,
    }
# Run the case study and print the written-up hypothesis.
result = engagement_drop_case_study()
print(result["hypothesis"])
Communication Tips
# STAR method for behavioral questions
def star_example():
    """Return a sample behavioral-interview answer laid out in STAR order.

    The dict has the keys "Situation", "Task", "Action" and "Result", in
    that order.
    """
    answer = {}
    answer["Situation"] = "Our recommendation model was underperforming, with CTR 20% below target"
    answer["Task"] = "I needed to identify the root cause and improve model performance"
    answer["Action"] = "I analyzed the feature distributions, discovered data drift in user preferences, and retrained with fresh data while adding new behavioral features"
    answer["Result"] = "CTR improved by 35%, exceeding the original target"
    return answer
# Pyramid principle for presenting findings
def pyramid_presentation(finding):
    """Arrange a finding answer-first, followed by its supporting evidence.

    ``finding`` must provide the keys 'conclusion', 'business_impact',
    'data_point_1', 'data_point_2', 'data_point_3' and 'risks'; a missing
    key raises KeyError. The three data points are returned as a list under
    'supporting_evidence'.
    """
    # Keys are read in the same order as they appear in the output.
    recommendation = finding['conclusion']
    why_it_matters = finding['business_impact']
    evidence = [finding[f'data_point_{i}'] for i in (1, 2, 3)]
    risks = finding['risks']

    return {
        "recommendation": recommendation,
        "why_it_matters": why_it_matters,
        "supporting_evidence": evidence,
        "risks_and_mitigations": risks,
    }
Key Takeaways
- Structure first — Use consistent frameworks for every case
- Metrics matter — Define north star and guardrail metrics early
- Segment everything — The answer is usually in the segments
- Communicate clearly — Lead with the answer, then support with evidence
- Practice systematically — Do 2-3 cases per week for interview prep