Complete Regression Diagnostics
A comprehensive checklist for validating linear regression models.
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy import stats
import matplotlib.pyplot as plt
np.random.seed(42)
n = 150
X1 = np.random.uniform(1, 10, n)
X2 = np.random.normal(5, 2, n)
y = 2 + 3*X1 - 1.5*X2 + np.random.normal(0, 3, n)
# Add outlier and influential point
X1 = np.append(X1, [2, 9])
X2 = np.append(X2, [5, 5])
y = np.append(y, [40, -5]) # outliers
X = sm.add_constant(np.column_stack([X1, X2]))
model = sm.OLS(y, X).fit()
infl = model.get_influence()
resid_student = infl.resid_studentized_internal
leverage = infl.hat_matrix_diag
cooks_d = infl.cooks_distance[0]
# Full diagnostic panel
fig, axes = plt.subplots(3, 2, figsize=(12, 12))
# 1. Residuals vs Fitted
axes[0,0].scatter(model.fittedvalues, model.resid, alpha=0.5)
axes[0,0].axhline(0, color='red', linestyle='--')
axes[0,0].set_title('Residuals vs Fitted\n(check linearity & homoscedasticity)')
axes[0,0].set_xlabel('Fitted'); axes[0,0].set_ylabel('Residuals')
# 2. Q-Q plot
stats.probplot(model.resid, dist='norm', plot=axes[0,1])
axes[0,1].set_title('Normal Q-Q\n(check normality of residuals)')
# 3. Scale-Location
axes[1,0].scatter(model.fittedvalues, np.sqrt(np.abs(resid_student)), alpha=0.5, color='coral')
axes[1,0].set_title('Scale-Location\n(horizontal band = homoscedasticity)')
axes[1,0].set_xlabel('Fitted'); axes[1,0].set_ylabel('sqrt|Std Residuals|')
# 4. Cook's Distance
axes[1,1].stem(range(len(cooks_d)), cooks_d, markerfmt='o', linefmt='gray', basefmt='k-')
axes[1,1].axhline(4/n, color='red', linestyle='--', label=f'4/n={4/n:.3f}')
axes[1,1].set_title("Cook's Distance\n(influential observations)")
axes[1,1].legend()
# 5. Residuals vs Leverage
axes[2,0].scatter(leverage, resid_student, alpha=0.5, color='orchid')
axes[2,0].axhline(0, color='black', linestyle='--')
axes[2,0].set_title('Residuals vs Leverage')
axes[2,0].set_xlabel('Leverage'); axes[2,0].set_ylabel('Std Residuals')
# 6. Added Variable Plots (partial regression)
sm.graphics.plot_partregress_grid(model, fig=fig)
plt.tight_layout()
plt.savefig('regression_diagnostics_full.png', dpi=150)
plt.show()
# Flag problems
n_tot = len(y)
high_leverage = (leverage > 2*3/n_tot).sum()
outliers = (np.abs(resid_student) > 3).sum()
influential = (cooks_d > 4/n_tot).sum()
print(f"High leverage points (h > 2p/n): {high_leverage}")
print(f"Outliers (|std resid| > 3): {outliers}")
print(f"Influential points (Cook's D > 4/n): {influential}")
# Breusch-Pagan homoscedasticity
from statsmodels.stats.diagnostic import het_breuschpagan
bp, bp_p, _, _ = het_breuschpagan(model.resid, model.model.exog)
print(f"Breusch-Pagan test: p={bp_p:.4f} ({'homoscedastic' if bp_p>0.05 else 'heteroscedastic'})")
Diagnostics Checklist
| Check | Tool | Problem if... |
|---|---|---|
| Linearity | Residuals vs Fitted | Curved pattern |
| Normality | Q-Q plot, Shapiro-Wilk | Heavy tails, skew |
| Homoscedasticity | Scale-Location, BP test | Fan/trumpet shape |
| Independence | Durbin-Watson, ACF | DW << 2 or >> 2 |
| Outliers | Studentized residuals | |
| Leverage | Hat values | h > 2p/n |
| Influence | Cook's D | D > 4/n |
Key Takeaways
- Run all four standard diagnostic plots before trusting any regression output
- High leverage (unusual X) + outlier (unusual Y) = influential point → check it!
- Cook's D > 4/n flags influential points — remove, investigate, or use robust regression
- Heteroscedasticity: use robust SEs (HC3) or transform the response variable
- Never remove outliers without a substantive reason — they may be the most important observations