The Interview Question
"Netflix sees that users who watch more foreign content have higher retention. Does this mean foreign content causes higher retention? How would you investigate this?"
This question tests whether you can distinguish correlation from causation and apply statistical thinking to business problems.
Why Companies Ask This
ℹ️
Netflix and Google need data scientists who don't just run statistical tests, but who think statistically about business problems. They need to know you understand the limitations of data and can draw valid conclusions.
Interviewers evaluate:
- Correlation vs. Causation — Do you understand the distinction?
- Experimental Design — Can you design valid studies?
- Statistical Methods — Do you know which method to apply when?
- Practical Significance — Do you distinguish statistical from practical significance?
- Uncertainty Quantification — Can you communicate uncertainty clearly?
The Statistical Thinking Framework
1. Observation → Hypothesis
- What pattern do you observe?
- What's your initial hypothesis?
- What are alternative explanations?
2. Data → Evidence
- What data would help distinguish between explanations?
- What confounding variables might exist?
- What biases might be present?
3. Analysis → Inference
- What statistical method is appropriate?
- What are the assumptions?
- What do the results actually mean?
4. Conclusion → Action
- What can we conclude?
- What are the limitations?
- What should we do next?
Example: Foreign Content & Retention
Observation
"Users who watch foreign content have 23% higher retention than those who don't."
Initial Hypothesis
"Foreign content might be causing higher retention by providing more diverse content options."
Alternative Explanations
# Competing explanations for the observed foreign-content/retention link,
# keyed by the kind of mechanism each one proposes.
alternatives = dict(
    causal='Foreign content directly improves retention',
    self_selection='More engaged users watch more foreign content',
    confounding='A third factor (e.g., curiosity, open-mindedness) causes both',
    reverse_causation='Users who plan to stay longer are more likely to explore foreign content',
    survivorship='Users who churn early never discover foreign content',
)
# Key question: Which explanation is most likely?
# How would you distinguish between them?
Investigating with Observational Data
import pandas as pd
import numpy as np
from scipy import stats
def observational_analysis(users_df, viewing_df):
    """
    Analyze the relationship between foreign content and retention
    using observational data (no experiment).

    Four progressively stronger looks at the same question are computed:
    a raw correlation, correlations within tenure strata, a multiple
    regression, and a propensity-score-matched comparison.

    Args:
        users_df: DataFrame with one row per user. Columns read here:
            'user_id', 'retention_90d', 'tenure_months',
            'account_age_days', 'plan_price'.
        viewing_df: DataFrame of viewing records. Columns read here:
            'user_id', 'content_type', 'hours_watched'.

    Returns:
        dict with keys:
            'raw_correlation': correlation of foreign_ratio with retention_90d.
            'controlled_correlations': list of dicts (tenure_group,
                correlation, n_users), one per tenure bin with > 100 users.
            'regression_coefficient': OLS coefficient on foreign_ratio.
            'propensity_matched_effect': mean retention of treated users
                minus mean retention of their matched controls, or NaN when
                either group is empty.
    """
    # Third-party modelling libraries are imported at function scope, as in
    # the original snippet, so the module can be imported without them.
    import statsmodels.api as sm
    from sklearn.linear_model import LogisticRegression
    from sklearn.neighbors import NearestNeighbors

    covariates = ['tenure_months', 'total_hours',
                  'account_age_days', 'plan_price']

    # Merge user data with viewing behavior.
    # foreign_content_hours sums the hours of foreign records (the original
    # counted records instead of hours); foreign_ratio stays the share of
    # viewing records that are foreign. assign() returns a copy, so the
    # caller's viewing_df is not modified.
    is_foreign = viewing_df['content_type'] == 'foreign'
    per_record = viewing_df.assign(
        _is_foreign=is_foreign,
        _foreign_hours=viewing_df['hours_watched'].where(is_foreign, 0),
    )
    user_viewing = per_record.groupby('user_id').agg(
        foreign_content_hours=('_foreign_hours', 'sum'),
        total_hours=('hours_watched', 'sum'),
        foreign_ratio=('_is_foreign', 'mean'),
    ).reset_index()
    # Default (inner) merge: users with no viewing records are dropped.
    analysis_df = users_df.merge(user_viewing, on='user_id')

    # 1. Basic correlation
    correlation = analysis_df[['foreign_ratio', 'retention_90d']].corr().iloc[0, 1]
    print(f"Correlation between foreign content ratio and retention: {correlation:.3f}")

    # 2. Controlling for confounders
    # User tenure (longer-tenured users may watch more foreign content).
    # Bins are right-closed: (0, 3], (3, 6], ... Values outside (0, 999]
    # fall in no bin and are left out of the stratified correlations.
    tenure_groups = pd.cut(analysis_df['tenure_months'],
                           bins=[0, 3, 6, 12, 24, 999])
    controlled_correlations = []
    for group, group_data in analysis_df.groupby(tenure_groups, observed=True):
        # Skip small strata, whose correlation estimates would be noisy.
        if len(group_data) > 100:
            corr = group_data[['foreign_ratio', 'retention_90d']].corr().iloc[0, 1]
            controlled_correlations.append({
                'tenure_group': group,
                'correlation': corr,
                'n_users': len(group_data),
            })

    # 3. Multiple regression
    X = sm.add_constant(analysis_df[['foreign_ratio'] + covariates])
    y = analysis_df['retention_90d']
    model = sm.OLS(y, X).fit()
    print(model.summary())

    # 4. Propensity score matching
    # Match users who watch foreign content with similar users who don't.
    treatment = analysis_df['foreign_ratio'] > 0.5
    if treatment.all() or not treatment.any():
        # With only one group present there is nothing to match against
        # (and the classifier cannot be fit on a single class).
        treatment_effect = float('nan')
    else:
        # Estimate propensity scores
        X_prop = analysis_df[covariates]
        ps_model = LogisticRegression()
        ps_model.fit(X_prop, treatment)
        propensity_scores = ps_model.predict_proba(X_prop)[:, 1]

        # Match each treated user to the control user with the nearest
        # propensity score (with replacement: a control may be reused).
        treated_idx = np.where(treatment)[0]
        control_idx = np.where(~treatment)[0]
        nn = NearestNeighbors(n_neighbors=1)
        nn.fit(propensity_scores[control_idx].reshape(-1, 1))
        _, matches = nn.kneighbors(
            propensity_scores[treated_idx].reshape(-1, 1)
        )

        # Calculate matched treatment effect
        matched_control_idx = control_idx[matches.flatten()]
        treatment_effect = (
            analysis_df.iloc[treated_idx]['retention_90d'].mean() -
            analysis_df.iloc[matched_control_idx]['retention_90d'].mean()
        )

    return {
        'raw_correlation': correlation,
        'controlled_correlations': controlled_correlations,
        'regression_coefficient': model.params['foreign_ratio'],
        'propensity_matched_effect': treatment_effect,
    }
Key Statistical Concepts for Interviews
Hypothesis Testing
from scipy import stats
def hypothesis_test_example():
    """
    Common hypothesis testing scenarios in interviews.

    Runs three tests on small hard-coded samples, prints each result, and
    returns the statistics so they can be inspected programmatically.

    Returns:
        dict with keys:
            't_test': (t_stat, p_value) from an independent two-sample t-test.
            'chi_squared': (chi2, p_value, dof) from a contingency-table test.
            'mann_whitney': (u_stat, p_value) from a one-sided
                Mann-Whitney U test.
    """
    # Scenario 1: A/B test result
    control_conversions = [120, 115, 118, 122, 119]
    treatment_conversions = [135, 128, 132, 140, 133]
    # Control is passed first, so a negative t means treatment is higher.
    t_stat, t_p_value = stats.ttest_ind(control_conversions, treatment_conversions)
    print(f"A/B Test: t={t_stat:.3f}, p={t_p_value:.4f}")

    # Scenario 2: Chi-squared test for categorical data
    # Does payment method differ by user segment?
    observed = np.array([
        [450, 150],  # Premium users: credit, paypal
        [300, 200],  # Free users: credit, paypal
    ])
    chi2, chi2_p_value, dof, _expected = stats.chi2_contingency(observed)
    print(f"Chi-squared: χ²={chi2:.3f}, p={chi2_p_value:.4f}")

    # Scenario 3: Non-parametric test (when data isn't normal)
    # Mann-Whitney U test
    control = [12, 15, 18, 22, 25, 28, 31, 35, 40, 50]
    treatment = [18, 22, 25, 28, 32, 35, 40, 45, 52, 60]
    # alternative='less': tests whether control tends to be smaller than
    # treatment.
    u_stat, u_p_value = stats.mannwhitneyu(control, treatment, alternative='less')
    print(f"Mann-Whitney U: U={u_stat:.3f}, p={u_p_value:.4f}")

    return {
        't_test': (t_stat, t_p_value),
        'chi_squared': (chi2, chi2_p_value, dof),
        'mann_whitney': (u_stat, u_p_value),
    }
Bayesian Thinking
import numpy as np
def bayesian_ab_test(control_conversions, control_trials,
                     treatment_conversions, treatment_trials,
                     n_simulations=100000, seed=None):
    """
    Bayesian A/B test using Monte Carlo simulation.

    Each arm's conversion rate gets a Beta(1, 1) (uniform) prior, which
    is updated with the observed successes and failures. Samples drawn
    from the two posteriors are then compared.

    Args:
        control_conversions: Number of conversions in the control arm.
        control_trials: Number of trials in the control arm.
        treatment_conversions: Number of conversions in the treatment arm.
        treatment_trials: Number of trials in the treatment arm.
        n_simulations: Number of posterior samples drawn per arm.
        seed: Optional seed for reproducible results. When None (the
            default) the global ``np.random`` state is used, exactly as
            before this parameter existed.

    Returns:
        dict with keys:
            'prob_treatment_better': share of draws where treatment > control.
            'expected_loss': mean of max(control - treatment, 0) over draws.
            'credible_interval': (2.5th, 97.5th) percentile of
                treatment - control.
            'control_rate', 'treatment_rate': posterior mean rate per arm.

    Raises:
        ValueError: If any count is negative or conversions exceed trials.
    """
    # Fail early with a readable message instead of an opaque NumPy error
    # about non-positive Beta parameters.
    for arm, conversions, trials in (
        ('control', control_conversions, control_trials),
        ('treatment', treatment_conversions, treatment_trials),
    ):
        if conversions < 0 or trials < 0 or conversions > trials:
            raise ValueError(
                f"{arm}: need 0 <= conversions <= trials, "
                f"got conversions={conversions}, trials={trials}"
            )

    # Prior: Beta(1, 1) = uniform prior
    alpha_prior = 1
    beta_prior = 1

    # Posterior for control
    alpha_control = alpha_prior + control_conversions
    beta_control = beta_prior + (control_trials - control_conversions)

    # Posterior for treatment
    alpha_treatment = alpha_prior + treatment_conversions
    beta_treatment = beta_prior + (treatment_trials - treatment_conversions)

    # Sample from posteriors. A dedicated generator is used only when a
    # seed is given, so unseeded calls still follow np.random.seed().
    rng = np.random if seed is None else np.random.default_rng(seed)
    control_samples = rng.beta(alpha_control, beta_control, n_simulations)
    treatment_samples = rng.beta(alpha_treatment, beta_treatment, n_simulations)

    # Probability that treatment is better
    prob_treatment_better = (treatment_samples > control_samples).mean()

    # Expected loss (risk of choosing treatment when control is better)
    expected_loss = np.mean(
        np.maximum(control_samples - treatment_samples, 0)
    )

    # Credible interval for the difference
    diff_samples = treatment_samples - control_samples
    ci_lower = np.percentile(diff_samples, 2.5)
    ci_upper = np.percentile(diff_samples, 97.5)

    return {
        'prob_treatment_better': prob_treatment_better,
        'expected_loss': expected_loss,
        'credible_interval': (ci_lower, ci_upper),
        'control_rate': alpha_control / (alpha_control + beta_control),
        'treatment_rate': alpha_treatment / (alpha_treatment + beta_treatment),
    }
# Example
# Worked example: 12.0% vs 14.0% observed conversion on 1000 trials each.
experiment = {
    'control_conversions': 120,
    'control_trials': 1000,
    'treatment_conversions': 140,
    'treatment_trials': 1000,
}
result = bayesian_ab_test(**experiment)
print(f"P(Treatment > Control) = {result['prob_treatment_better']:.3f}")
print(f"95% Credible Interval: [{result['credible_interval'][0]:.4f}, {result['credible_interval'][1]:.4f}]")
Causal Inference
def causal_inference_methods():
    """
    Overview of causal inference methods for interviews.

    Returns:
        dict mapping each method name to a dict with three string
        entries: 'when' (when the method applies), 'assumption' (its key
        identifying assumption) and 'example' (a typical use case).
    """
    fields = ('when', 'assumption', 'example')
    # One row per method: (name, (when, assumption, example)).
    catalog = (
        ('randomized_experiment', (
            'When you can randomize treatment assignment',
            'Randomization eliminates confounding',
            'A/B testing a new feature',
        )),
        ('propensity_score_matching', (
            'When you have observational data and can estimate treatment probability',
            'Selection on observables (unconfoundedness)',
            'Estimating effect of marketing campaign',
        )),
        ('difference_in_differences', (
            'When you have pre/post data for treated and control groups',
            'Parallel trends (treatment and control would follow same trend without treatment)',
            'Evaluating policy change in one region',
        )),
        ('regression_discontinuity', (
            'When treatment is assigned based on a threshold',
            'Units near threshold are comparable',
            'Effect of financial aid on graduation (aid given above income threshold)',
        )),
        ('instrumental_variables', (
            'When you have a variable that affects treatment but not outcome directly',
            'Exclusion restriction (instrument only affects outcome through treatment)',
            'Effect of education on earnings (using distance to college as instrument)',
        )),
    )
    return {name: dict(zip(fields, details)) for name, details in catalog}
Common Statistical Mistakes
⚠️
These mistakes are red flags in interviews:
1. P-Hacking
# BAD: Testing multiple hypotheses and reporting only significant ones
metrics = ['retention', 'engagement', 'satisfaction', 'churn']
results = {
    metric: stats.ttest_ind(treatment[metric], control[metric]).pvalue
    for metric in metrics
}
# If you only report the significant ones, that's p-hacking

# GOOD: Pre-specify primary hypothesis and correct for multiple comparisons
from statsmodels.stats.multitest import multipletests
p_values = list(results.values())
# multipletests returns a 4-tuple; only the reject flags and the
# Bonferroni-adjusted p-values are kept.
reject, corrected_p = multipletests(p_values, method='bonferroni')[:2]
2. Ignoring Effect Size
# BAD: "p < 0.05, so it's significant!"
# GOOD: "p < 0.05, with an effect size of 2% improvement (95% CI: 0.5% to 3.5%)"
# Effect size matters more than p-value for business decisions
def report_with_effect_size(treatment_mean, control_mean, treatment_std, control_std, n):
    """
    Report results with effect size and practical significance.

    Args:
        treatment_mean: Mean of the treatment group.
        control_mean: Mean of the control group.
        treatment_std: Standard deviation of the treatment group.
        control_std: Standard deviation of the control group.
        n: Sample size, used for both groups.

    Returns:
        dict with 'absolute_difference' (treatment minus control),
        'relative_difference' (that difference divided by control_mean),
        'cohens_d' (difference divided by the pooled standard deviation)
        and 'practical_significance' (whether |d| exceeds 0.2).
    """
    gap = treatment_mean - control_mean

    # Pooled standard deviation; both groups carry n - 1 degrees of freedom.
    dof = n - 1
    weighted_variances = dof*treatment_std**2 + dof*control_std**2
    pooled_std = np.sqrt(weighted_variances / (2*n - 2))

    effect = gap / pooled_std

    summary = {}
    summary['absolute_difference'] = gap
    summary['relative_difference'] = gap / control_mean
    summary['cohens_d'] = effect
    summary['practical_significance'] = abs(effect) > 0.2  # Small effect threshold
    return summary
3. Confusing Statistical with Practical Significance
# Example: Large sample can make tiny effects "significant"
# With 1 million users, a 0.01% improvement might be p < 0.001
# But is it worth the engineering effort?
def is_practically_significant(effect_size, cost_of_implementation,
                               value_per_user, n_users):
    """
    Determine if an effect is practically significant.

    Args:
        effect_size: Per-user effect, multiplied by value_per_user.
        cost_of_implementation: Cost the projected value is divided by.
        value_per_user: Value attributed to each user.
        n_users: Number of users the effect applies to.

    Returns:
        dict with 'total_value' (effect_size * value_per_user * n_users),
        'roi' (total_value / cost_of_implementation) and 'recommendation'
        ('Implement' when roi is strictly greater than 2, otherwise
        'Do not implement').
    """
    projected_value = effect_size * value_per_user * n_users
    return_multiple = projected_value / cost_of_implementation

    if return_multiple > 2:
        verdict = 'Implement'
    else:
        verdict = 'Do not implement'

    return {
        'total_value': projected_value,
        'roi': return_multiple,
        'recommendation': verdict,
    }