Chi-Square Test of Independence
Tests whether two categorical variables are independent (not associated) in a contingency table.
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
# Observed counts for the Gender x Preferred Learning Style contingency table.
# Rows are genders, columns are learning styles.
rows = ['Female', 'Male']
cols = ['Visual', 'Auditory', 'Kinesthetic']
data = np.array([
    [45, 30, 25],  # Female
    [30, 40, 35],  # Male
])

# Attach the labels to the raw array so the printed table is readable.
contingency = pd.DataFrame(data=data, index=rows, columns=cols)

print("Observed Frequencies:", contingency, sep="\n")
print(f"Total n = {data.sum()}")
# Chi-square test of independence.
# H0: the row variable and the column variable are independent.
# chi2_contingency returns the test statistic, its p-value, the degrees of
# freedom, and the table of expected counts under H0.
alpha = 0.05  # significance level used for the decision printed below
chi2, p, dof, expected = stats.chi2_contingency(contingency)

# The leading "\n" puts a blank line between sections of the output.
print("\nExpected Frequencies:")
print(pd.DataFrame(expected, index=rows, columns=cols).round(2))

print(f"\nχ²({dof}) = {chi2:.4f}")
print(f"p-value = {p:.4f}")
if p < alpha:
    decision = 'Reject H₀ — variables are associated'
else:
    decision = 'Fail to reject H₀ — no significant association'
print(f"Decision: {decision}")
# Effect size: Cramér's V = sqrt(chi2 / (n * (min(#rows, #cols) - 1))).
n = data.sum()                 # total number of observations in the table
min_dim = min(data.shape) - 1  # smaller table dimension, minus one
cramers_v = np.sqrt(chi2 / (n * min_dim))
print(f"Cramér's V = {cramers_v:.4f}")

# Translate V into the verbal label printed by this script (cut-offs 0.1, 0.3).
# NOTE(review): these cut-offs are a rule of thumb; confirm they match the
# convention you intend to report.
if cramers_v < 0.1:
    effect_label = 'small'
elif cramers_v < 0.3:
    effect_label = 'medium'
else:
    effect_label = 'large'
print(f"Effect size: {effect_label}")
# Residuals: which cells contribute most to chi-square?
# Each entry is (observed - expected) / sqrt(expected): positive means the cell
# holds more observations than independence predicts, negative means fewer.
observed = data
residuals = (observed - expected) / np.sqrt(expected)
resid_df = pd.DataFrame(residuals, index=rows, columns=cols).round(3)

# The leading "\n" separates this section from the previous output.
print("\nStandardized Residuals (|z| > 2 indicates significant cell):")
print(resid_df)
# Heatmaps: observed counts (left panel) next to the residuals (right panel).
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: raw counts, annotated as integers.
sns.heatmap(contingency, annot=True, fmt='d', cmap='Blues', ax=axes[0])
axes[0].set_title('Observed Frequencies')

# Right panel: diverging colormap centred on 0 with a fixed -3..3 colour range.
sns.heatmap(resid_df, annot=True, fmt='.2f', cmap='RdBu_r', center=0,
            vmin=-3, vmax=3, ax=axes[1])
# Two-line title: the "\n" escape keeps the legend hint on its own line.
axes[1].set_title('Standardized Residuals\n'
                  '(Red = higher than expected, Blue = lower)')

plt.tight_layout()
# Write the figure to disk, then display it.
plt.savefig('chi_square_independence.png', dpi=150)
plt.show()
# Fisher's Exact Test for small samples (2×2 tables)
# fisher_exact returns the odds ratio of the table and the p-value
# (two-sided by default).
small_table = np.array([[5, 3], [2, 10]])
oddsratio, p_fisher = stats.fisher_exact(small_table)

# The leading "\n" separates this section from the previous output.
print(f"\nFisher's Exact Test (2×2, small n): OR={oddsratio:.3f}, p={p_fisher:.4f}")
When to Use Fisher's Exact vs Chi-Square
- Chi-square: large n, with expected counts Eᵢⱼ ≥ 5 in at least 80% of cells and no cell with Eᵢⱼ < 1 (Cochran's rule)
- Fisher's Exact: small n, any 2×2 table with small expected counts
Key Takeaways
- χ² independence test works on contingency tables of any dimensions (r × c), not just 2×2
- Expected frequency = row total × column total / n
- Cramér's V measures association strength on a 0–1 scale; rule of thumb used here: <0.1 small, 0.1 to <0.3 medium, ≥0.3 large
- Standardized residuals show which cells drive the association
- Fisher's exact test is preferred over chi-square for small expected counts