One-Way ANOVA
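Tests the null hypothesis that the population means of k groups are all equal (H₀: μ₁ = μ₂ = … = μ_k) against the alternative that at least one mean differs: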
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import matplotlib.pyplot as plt
# Simulated experiment: k fertilizer groups with n_each normally distributed
# yields per group.
np.random.seed(42)  # fixed seed so the simulated yields are reproducible
k = 4
n_each = 20
yield_sd = 3
true_means = {'A': 22, 'B': 25, 'C': 23, 'D': 28}
# Groups are drawn in A, B, C, D order so the global random stream is consumed
# in the same sequence on every run.
fertilizers = {
    name: np.random.normal(mu, yield_sd, n_each)
    for name, mu in true_means.items()
}
# Long-format table: one row per observation, labelled with its group.
df = pd.DataFrame({
    'fert': [name for name, vals in fertilizers.items() for _ in vals],
    'yield': [val for vals in fertilizers.values() for val in vals],
})
# One-way ANOVA F-test across all fertilizer groups.
anova_result = stats.f_oneway(*fertilizers.values())
F, p = anova_result
# Degrees of freedom for a balanced design: k - 1 between groups and
# k * (n_each - 1) within groups.
df_between = k - 1
df_within = k * (n_each - 1)
print(f"F({df_between},{df_within}) = {F:.4f}, p = {p:.6f}")
# ANOVA table
# `yield` is a reserved Python keyword, and the formula machinery evaluates
# each term as a Python expression, so a bare `yield` cannot be used as a
# variable name in the formula. Q("...") quotes the column name so it is
# looked up in `df` instead of being parsed as code.
model = ols('Q("yield") ~ C(fert)', data=df).fit()
# typ=1 requests Type I (sequential) sums of squares.
table = sm.stats.anova_lm(model, typ=1)
print("\nANOVA Table:")
print(table.round(4))
# Effect size: eta squared = SS_between / SS_total, the share of the total
# variation in yield that is attributable to the fertilizer group.
grand = df['yield'].mean()
# Weight each group's squared deviation by its own size so the sum stays
# correct even if the groups are not all the same length.
SS_b = sum(len(g) * (g.mean() - grand) ** 2 for g in fertilizers.values())
# Vectorised sum over the Series rather than Python's element-by-element sum().
SS_t = ((df['yield'] - grand) ** 2).sum()
eta2 = SS_b / SS_t
# Cohen's conventional benchmarks for eta squared: 0.01 small, 0.06 medium,
# 0.14 large. Values below 0.01 are reported as negligible rather than small.
if eta2 < 0.01:
    magnitude = 'negligible'
elif eta2 < 0.06:
    magnitude = 'small'
elif eta2 < 0.14:
    magnitude = 'medium'
else:
    magnitude = 'large'
print(f"\nη² = {eta2:.4f} ({magnitude} effect)")
# Tukey HSD post-hoc: all pairwise group comparisons at the default 5%
# family-wise significance level.
tukey = pairwise_tukeyhsd(endog=df['yield'], groups=df['fert'], alpha=0.05)
tukey_summary = tukey.summary()
print("\nTukey HSD:")
print(tukey_summary)
# Plot: box plots per group (left) and group means with 95% confidence
# intervals (right).
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
group_names = list(fertilizers.keys())
group_values = list(fertilizers.values())

# Left panel. Tick labels are set after drawing instead of through boxplot's
# `labels=` keyword, which newer Matplotlib releases deprecate in favour of
# `tick_labels=`; set_xticklabels works on both old and new versions.
axes[0].boxplot(group_values, patch_artist=True)
axes[0].set_xticklabels(group_names)
axes[0].axhline(grand, color='red', linestyle='--', label='Grand mean')
axes[0].set_title(f'Yield by Fertilizer\nF={F:.2f}, p={p:.4f}')
axes[0].legend()

# Right panel. Each group's standard deviation is estimated from a small
# sample, so the 95% interval uses the Student-t critical value with n - 1
# degrees of freedom rather than the normal-approximation 1.96, which would
# make the intervals too narrow.
means = [v.mean() for v in group_values]
sems = [v.std(ddof=1) / np.sqrt(len(v)) for v in group_values]
ci_half_widths = [
    stats.t.ppf(0.975, len(v) - 1) * s for v, s in zip(group_values, sems)
]
axes[1].errorbar(group_names, means, yerr=ci_half_widths,
                 fmt='o', capsize=5, markersize=8, color='steelblue')
axes[1].set_title('Group Means with 95% CI')

plt.tight_layout()
plt.savefig('one_way_anova.png', dpi=150)
plt.show()
Key Takeaways
- ANOVA tests all means simultaneously — avoids inflated Type I error of multiple t-tests
- F = MS_between/MS_within — large F means between-group variation exceeds random noise
- Assumptions: independent observations, approximately normal data within each group, and equal variances across groups (check the last with Levene's test)
- Post-hoc tests (Tukey, Bonferroni) identify which specific groups differ
- η² = SS_between / SS_total tells you the proportion of total variance explained by group membership