The Interview Question
"Walk me through how you would design, run, and analyze an A/B test for a new recommendation algorithm on YouTube."
This question goes beyond basic A/B testing knowledge — it tests whether you can handle the real complexities of experimentation at scale.
Why Companies Ask This
ℹ️
Netflix and Google run thousands of experiments per year. They need data scientists who understand not just the statistics, but the practical challenges: sample ratio mismatch, network effects, novelty effects, and more.
Interviewers evaluate:
- Experimental Design — Can you set up a valid experiment?
- Statistical Knowledge — Do you understand power, significance, and effect sizes?
- Practical Awareness — Do you know what can go wrong in real experiments?
- Advanced Topics — Can you discuss heterogeneous treatment effects, interference, and long-term effects?
- Business Context — Do you understand when NOT to experiment?
The Complete A/B Testing Framework
Phase 1: Pre-Experiment Design
Step 1: Define the Hypothesis
Architecture Diagram
H₀: The new recommendation algorithm has no effect on watch time
H₁: The new recommendation algorithm increases average watch time by at least 5%
Step 2: Choose the Right Metric
# Primary metric: Average watch time per session
# This directly measures the value delivered to users
# Secondary metrics:
# Keyed by metric category; each value lists the metric names tracked
# in that category alongside the primary metric.
secondary_metrics = dict(
    engagement=[
        'click_through_rate',
        'videos_started',
        'completion_rate',
    ],
    satisfaction=[
        'return_rate_7d',
        'thumbs_up_rate',
        'survey_satisfaction',
    ],
    guardrail=[
        'app_crash_rate',
        'stream_buffering_rate',
        'customer_support_tickets',
    ],
)
Step 3: Calculate Sample Size
import numpy as np
from scipy.stats import norm
def calculate_sample_size(baseline_mean, baseline_std, mde, alpha=0.05, power=0.80):
    """
    Calculate required sample size for two-sample t-test.

    Uses the normal approximation for a two-sided test:
    n = 2 * ((z_{1 - alpha/2} + z_{power}) / d) ** 2, where d is the
    standardized effect size (Cohen's d).

    Args:
        baseline_mean: Current average watch time (minutes)
        baseline_std: Standard deviation of watch time (must be > 0)
        mde: Minimum detectable effect (relative to baseline_mean, 0.05 = 5%)
        alpha: Significance level, strictly between 0 and 1
        power: Statistical power, strictly between 0 and 1

    Returns:
        Required sample size per group (rounded up to an integer)

    Raises:
        ValueError: If baseline_std is not positive, alpha or power is
            outside (0, 1), or the implied effect size is zero.
    """
    if baseline_std <= 0:
        raise ValueError("baseline_std must be positive")
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")
    if not 0 < power < 1:
        raise ValueError("power must be strictly between 0 and 1")

    z_alpha = norm.ppf(1 - alpha / 2)
    z_beta = norm.ppf(power)

    # Effect size (Cohen's d): absolute lift expressed in standard deviations
    effect_size = (baseline_mean * mde) / baseline_std
    if effect_size == 0:
        # A zero effect would need an infinite sample; fail with a clear message
        # instead of an overflow when converting infinity to int.
        raise ValueError("baseline_mean * mde must be non-zero")

    # Sample size per group
    n = 2 * ((z_alpha + z_beta) / effect_size) ** 2
    return int(np.ceil(n))
# Example: YouTube watch time
# Sizes a test for a 5% relative lift using the function's default
# alpha=0.05 and power=0.80.
baseline_mean = 12 # minutes
baseline_std = 8 # minutes
mde = 0.05 # 5% relative improvement
n_required = calculate_sample_size(baseline_mean, baseline_std, mde)
print(f"Required sample size per group: {n_required:,}")
# Output: Required sample size per group: 2,791
# (effect size = 12 * 0.05 / 8 = 0.075; n = 2 * ((1.96 + 0.84) / 0.075)^2 ≈ 2,791)
Step 4: Determine Experiment Duration
def calculate_experiment_duration(required_sample_size, daily_traffic,
                                  traffic_allocation=0.5):
    """
    Calculate how long the experiment needs to run.

    Args:
        required_sample_size: Required sample size per group; it is doubled
            internally to cover both groups
        daily_traffic: Number of users available per day
        traffic_allocation: Fraction of daily traffic enrolled in the experiment

    Returns:
        Number of days (rounded up), including a 20% buffer

    Raises:
        ValueError: If daily_traffic * traffic_allocation is not positive.
    """
    total_sample_needed = required_sample_size * 2  # Both groups
    daily_sample = daily_traffic * traffic_allocation
    if daily_sample <= 0:
        raise ValueError("daily_traffic * traffic_allocation must be positive")

    days_needed = total_sample_needed / daily_sample
    # Add buffer for weekends and anomalies
    days_with_buffer = days_needed * 1.2
    return int(np.ceil(days_with_buffer))
# YouTube example
daily_active_users = 100_000_000
traffic_allocation = 0.1 # 10% of users in experiment
days = calculate_experiment_duration(n_required, daily_active_users, traffic_allocation)
print(f"Experiment duration: {days} days")
# Output: Experiment duration: 1 days
# (10,000,000 users per day enter the experiment, far more than the required
# sample, so the buffered duration rounds up to a single day)
Phase 2: Running the Experiment
Randomization Unit
# For YouTube: User-level randomization
# Why? Because showing different recommendations to the same user
# creates a terrible experience
# Alternative: Session-level (only if recommendations are session-independent)
# Alternative: Device-level (if cross-device sync isn't critical)
Sample Ratio Mismatch (SRM) Check
def check_srm(control_count, treatment_count, alpha=0.05, expected_ratio=0.5):
    """
    Check for Sample Ratio Mismatch — a common experiment bug.

    Runs a chi-squared goodness-of-fit test of the observed group sizes
    against the split the experiment was configured with.

    Args:
        control_count: Number of units observed in control
        treatment_count: Number of units observed in treatment
        alpha: Significance level below which a mismatch is reported
        expected_ratio: Expected share of units in control (0.5 = even split)

    Returns:
        Dict with 'chi2', 'p_value', 'srm_detected' and 'actual_ratio'
        (observed share of units in control)

    Raises:
        ValueError: If there are no units at all or expected_ratio is not
            strictly between 0 and 1.
    """
    # Imported locally so the function is self-contained.
    from scipy.stats import chisquare

    total = control_count + treatment_count
    if total <= 0:
        raise ValueError("control_count + treatment_count must be positive")
    if not 0 < expected_ratio < 1:
        raise ValueError("expected_ratio must be strictly between 0 and 1")

    expected_control = total * expected_ratio

    # Chi-squared test
    observed = np.array([control_count, treatment_count])
    expected = np.array([expected_control, total - expected_control])
    chi2, p_value = chisquare(observed, f_exp=expected)

    return {
        'chi2': chi2,
        'p_value': p_value,
        'srm_detected': p_value < alpha,
        'actual_ratio': control_count / total,
    }
# Example: Detecting a bug
# Near-even split: chi2 = 2 * 200**2 / 50_000 = 1.6 with 1 degree of freedom
result = check_srm(control_count=49_800, treatment_count=50_200)
# p_value ≈ 0.21 → no SRM detected
# 45/55 split: chi2 = 2 * 5_000**2 / 50_000 = 1000
result_buggy = check_srm(control_count=45_000, treatment_count=55_000)
# p_value < 0.001 → SRM detected! Investigation needed
Phase 3: Analyzing Results
Basic Analysis
from scipy import stats
import numpy as np
def analyze_ab_test(control_data, treatment_data, metric_name, alpha=0.05):
    """
    Compare a single metric between the control and treatment groups.

    Args:
        control_data: Container indexable by metric_name (control group)
        treatment_data: Container indexable by metric_name (treatment group)
        metric_name: Key of the metric to compare
        alpha: Significance level used for the 'significant' flag

    Returns:
        Dict with group means and standard deviations, absolute and relative
        difference, Welch t statistic and p-value, the 'significant' flag,
        a normal-approximation interval under 'ci_95', and Cohen's d.
    """
    ctrl = control_data[metric_name]
    trt = treatment_data[metric_name]
    n_ctrl, n_trt = len(ctrl), len(trt)

    mean_ctrl, mean_trt = ctrl.mean(), trt.mean()
    var_ctrl, var_trt = ctrl.var(), trt.var()
    lift = mean_trt - mean_ctrl

    summary = {
        'control_mean': mean_ctrl,
        'treatment_mean': mean_trt,
        'absolute_difference': lift,
        'relative_difference': lift / mean_ctrl,
        'control_std': ctrl.std(),
        'treatment_std': trt.std(),
    }

    # Welch's t-test: the two groups are not assumed to share a variance
    t_stat, p_value = stats.ttest_ind(trt, ctrl, equal_var=False)
    summary.update(
        t_statistic=t_stat,
        p_value=p_value,
        significant=p_value < alpha,
    )

    # Interval for the difference in means; 1.96 is fixed, so the interval
    # is always the 95% one regardless of alpha
    std_err = np.sqrt(var_ctrl / n_ctrl + var_trt / n_trt)
    summary['ci_95'] = (lift - 1.96 * std_err, lift + 1.96 * std_err)

    # Cohen's d: difference scaled by the pooled standard deviation
    pooled_var = (
        ((n_ctrl - 1) * var_ctrl + (n_trt - 1) * var_trt)
        / (n_ctrl + n_trt - 2)
    )
    summary['cohens_d'] = lift / np.sqrt(pooled_var)
    return summary
# Example
# NOTE(review): control_group and treatment_group are not defined in this
# snippet — presumably DataFrames with a 'watch_time_minutes' column; confirm.
results = analyze_ab_test(
control_data=control_group,
treatment_data=treatment_group,
metric_name='watch_time_minutes'
)
# Report the group means, the relative lift, the p-value and the 95% interval
print(f"Control: {results['control_mean']:.2f} min")
print(f"Treatment: {results['treatment_mean']:.2f} min")
print(f"Lift: {results['relative_difference']:.2%}")
print(f"p-value: {results['p_value']:.4f}")
print(f"95% CI: [{results['ci_95'][0]:.2f}, {results['ci_95'][1]:.2f}]")
Novelty and Primacy Effects
def check_novelty_effect(data, treatment_col='is_treatment',
                         time_col='days_since_experiment_start',
                         metric_col='watch_time'):
    """
    Check if the treatment effect changes over time (novelty effect).

    Rows are split at the median of time_col into an 'early' half (at or
    below the median) and a 'late' half; the treatment effect is the
    difference in mean metric between treated and untreated rows in each half.
    The input DataFrame is not modified.

    Args:
        data: DataFrame with one row per observation
        treatment_col: Boolean column, True for treatment rows
        time_col: Column holding the time since the experiment started
        metric_col: Column holding the metric to compare

    Returns:
        Dict with 'early_effect', 'late_effect', 'novelty_ratio'
        (early effect divided by the late effect, floored at 0.001) and
        'likely_novelty_effect' (ratio above 1.5). An effect is NaN when a
        period has no treated or no untreated rows.
    """
    in_treatment = data[treatment_col]
    # Split by early vs. late experiment
    is_early = data[time_col] <= data[time_col].median()

    def _period_effect(in_period):
        # Mean metric of treated rows minus mean metric of untreated rows
        treated = data.loc[in_period & in_treatment, metric_col]
        untreated = data.loc[in_period & ~in_treatment, metric_col]
        return treated.mean() - untreated.mean()

    early_effect = _period_effect(is_early)
    late_effect = _period_effect(~is_early)

    # If early effect >> late effect, novelty effect likely.
    # The 0.001 floor avoids dividing by a zero or negative late effect.
    novelty_ratio = early_effect / max(late_effect, 0.001)

    return {
        'early_effect': early_effect,
        'late_effect': late_effect,
        'novelty_ratio': novelty_ratio,
        'likely_novelty_effect': novelty_ratio > 1.5,
    }
Advanced Topics
Network Effects
⚠️
When users interact with each other (social features, marketplace), individual randomization can bias results. This is called "interference" or "spillover."
# Solution: Cluster randomization
# Randomize at the level of social groups, geographic clusters,
# or friend networks
# Example: Facebook friendship clusters
def cluster_randomization(users_df, friendships_df, treatment_rate=0.5, seed=42):
    """
    Randomize at the cluster level to avoid network effects.

    Clusters are the connected components of the friendship graph, so all
    users who are linked (directly or indirectly) share one assignment.

    Args:
        users_df: DataFrame with a 'user_id' column; an 'is_treatment'
            column is added to it in place
        friendships_df: DataFrame of edges with 'user_a' and 'user_b' columns
        treatment_rate: Fraction of clusters assigned to treatment
        seed: Seed for the cluster draw; the default reproduces the
            previously hard-coded seed of 42

    Returns:
        users_df with the boolean 'is_treatment' column set
    """
    import networkx as nx

    # Build friendship graph
    G = nx.from_pandas_edgelist(
        friendships_df, 'user_a', 'user_b'
    )
    # Users without any friendship never appear in the edge list. Add them as
    # singleton clusters so they can be drawn into treatment too, instead of
    # always falling into control.
    G.add_nodes_from(users_df['user_id'])

    # Find connected components (friend clusters)
    clusters = list(nx.connected_components(G))

    # Randomly assign clusters to treatment. A private RandomState yields the
    # same draw as seeding the global generator, without resetting the global
    # random state for the rest of the program.
    rng = np.random.RandomState(seed)
    treatment_clusters = rng.choice(
        len(clusters),
        size=int(len(clusters) * treatment_rate),
        replace=False
    )

    # Assign treatment status
    treatment_users = set()
    for idx in treatment_clusters:
        treatment_users.update(clusters[idx])

    users_df['is_treatment'] = users_df['user_id'].isin(treatment_users)
    return users_df
Heterogeneous Treatment Effects
from sklearn.ensemble import GradientBoostingRegressor
from econml.dml import CausalForestDML
def estimate_heterogeneous_effects(data, treatment_col, outcome_col,
                                   covariates):
    """
    Estimate how treatment effects vary across user segments.

    Fits a CausalForestDML model and summarizes the per-row effect estimates.
    A 'treatment_effect' column is added to data in place.

    Args:
        data: DataFrame holding treatment, outcome and covariate columns
        treatment_col: Name of the treatment column
        outcome_col: Name of the outcome column
        covariates: Covariate column names used as features

    Returns:
        Dict with 'average_effect', 'effect_std', 'top_segment' (most common
        covariate values among the 1000 rows with the largest estimated
        effect) and 'effect_distribution' (10/25/50/75/90th percentiles).
    """
    features = data[covariates]

    # Causal forest with gradient-boosted nuisance models.
    # NOTE(review): the treatment model is a regressor; if the treatment is
    # binary, confirm whether a discrete-treatment configuration is intended.
    forest = CausalForestDML(
        model_y=GradientBoostingRegressor(),
        model_t=GradientBoostingRegressor(),
        n_estimators=1000,
        random_state=42,
    )
    forest.fit(Y=data[outcome_col], T=data[treatment_col], X=features)

    # One effect estimate per row
    per_row_effects = forest.effect(features)

    # Rank rows by estimated effect to see who benefits most
    data['treatment_effect'] = per_row_effects
    best_rows = data.nlargest(1000, 'treatment_effect')

    quantiles = np.percentile(per_row_effects, [10, 25, 50, 75, 90])
    return {
        'average_effect': per_row_effects.mean(),
        'effect_std': per_row_effects.std(),
        'top_segment': best_rows[covariates].mode().iloc[0],
        'effect_distribution': quantiles,
    }
Common Pitfalls and How to Avoid Them
1. Peeking at Results Too Early
# Problem: Stopping early when you see "significance"
# This inflates false positive rate
# Solution: Sequential testing
from statsmodels.stats.power import TTestIndPower
def sequential_testing(alpha=0.05, max_looks=5):
    """
    Adjust significance threshold for multiple looks.

    Args:
        alpha: Overall significance level for the experiment
        max_looks: Maximum number of interim analyses planned (at least 1)

    Returns:
        Significance threshold to apply at each individual look

    Raises:
        ValueError: If max_looks is less than 1.
    """
    if max_looks < 1:
        raise ValueError("max_looks must be at least 1")

    # Bonferroni correction (conservative)
    adjusted_alpha = alpha / max_looks
    # Or use more sophisticated methods:
    # - O'Brien-Fleming boundaries
    # - Alpha spending functions
    # - Bayesian approaches
    return adjusted_alpha
2. Ignoring Multiple Metrics
# Problem: Testing 20 metrics, finding 1 "significant" by chance
# Solution: Control for multiple comparisons
def multiple_comparison_correction(p_values, method='bonferroni'):
    """
    Correct for multiple comparisons.

    Args:
        p_values: Sequence of raw p-values, one per tested metric
        method: 'bonferroni' or 'benjamini-hochberg'

    Returns:
        Adjusted p-values in the same order as the input: a list for
        'bonferroni', the array returned by statsmodels for
        'benjamini-hochberg'

    Raises:
        ValueError: If method is not one of the supported names.
    """
    if method == 'bonferroni':
        # Scale each p-value by the number of tests, capped at 1.0
        n_tests = len(p_values)
        return [min(p * n_tests, 1.0) for p in p_values]
    if method == 'benjamini-hochberg':
        from statsmodels.stats.multitest import multipletests
        _, corrected_p, _, _ = multipletests(p_values, method='fdr_bh')
        return corrected_p
    # Fail loudly rather than silently returning None for a misspelled method
    raise ValueError(
        f"Unknown method {method!r}; expected 'bonferroni' or 'benjamini-hochberg'"
    )
3. Not Accounting for User Heterogeneity
# Problem: Average effect masks important segment differences
# Solution: Always check subgroup analyses
def subgroup_analysis(data, treatment_col, metric_col, segment_col):
    """
    Check if treatment effect varies by segment.

    Args:
        data: DataFrame with one row per user
        treatment_col: Boolean column, True for treatment rows
        metric_col: Column holding the metric to compare
        segment_col: Column holding the segment label

    Returns:
        DataFrame with one row per segment (in order of first appearance)
        and columns 'segment', 'control_mean', 'treatment_mean', 'lift'
        (relative difference versus control) and 'n_users'
    """
    # Imported locally so the function is self-contained.
    import pandas as pd

    results = []
    for segment in data[segment_col].unique():
        segment_data = data[data[segment_col] == segment]
        is_treated = segment_data[treatment_col]
        control_mean = segment_data.loc[~is_treated, metric_col].mean()
        treatment_mean = segment_data.loc[is_treated, metric_col].mean()
        results.append({
            'segment': segment,
            'control_mean': control_mean,
            'treatment_mean': treatment_mean,
            'lift': (treatment_mean - control_mean) / control_mean,
            'n_users': len(segment_data),
        })
    return pd.DataFrame(results)