Residual Analysis

Regression Analysis

Diagnosing What's Wrong With Your Regression

Residual analysis reveals whether your model assumptions hold, identifies influential outliers, and detects non-linearity. It's the diagnostic toolkit that separates trustworthy models from misleading ones.

Clinical Studies — Detect influential patients that distort treatment effect estimates
Financial Forecasting — Identify time periods where models systematically fail
Manufacturing — Spot measurement errors and anomalous production runs

What remains after fitting the model tells you as much as what the model predicts.

Residuals are the differences between observed and predicted values:

Analyzing residuals reveals whether regression assumptions are met and identifies influential observations.


import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

import statsmodels.api as sm

from scipy import stats



np.random.seed(42)



# Build a regression model with potential issues

n = 80

X = np.random.uniform(1, 10, n)

y = 2 + 3*X + np.random.normal(0, 2, n)



# Add some outliers and influential points

X = np.append(X, [1.5, 9.5])

y = np.append(y, [25, 10])  # outliers



X_dm = sm.add_constant(X)

model = sm.OLS(y, X_dm).fit()



# Types of residuals

raw_resid = model.resid

student_resid = model.outlier_test()['student_resid']

influence = model.get_influence()

leverage = influence.hat_matrix_diag

cooks_d = influence.cooks_distance[0]

standardized_resid = influence.resid_studentized_internal



print("Influence Diagnostics:")

print(f"Max leverage: {leverage.max():.4f} (threshold: {2*(2/len(X)):.4f})")

print(f"Max Cook's D: {cooks_d.max():.4f} (threshold: 4/n = {4/len(X):.4f})")



high_lev = leverage > 2*(2/len(X))

high_cook = cooks_d > 4/len(X)

print(f"High leverage points: {high_lev.sum()}")

print(f"High Cook's D points: {high_cook.sum()}")



# 4-panel diagnostic plot

fig, axes = plt.subplots(2, 2, figsize=(12, 8))



# 1. Residuals vs Fitted

axes[0,0].scatter(model.fittedvalues, raw_resid, alpha=0.6, color='steelblue')

axes[0,0].axhline(0, color='red', linestyle='--', linewidth=2)

axes[0,0].set_xlabel('Fitted Values')

axes[0,0].set_ylabel('Residuals')

axes[0,0].set_title('Residuals vs Fitted')



# 2. Q-Q Plot

stats.probplot(raw_resid, dist='norm', plot=axes[0,1])

axes[0,1].set_title('Normal Q-Q Plot')



# 3. Scale-Location

axes[1,0].scatter(model.fittedvalues, np.sqrt(np.abs(standardized_resid)), alpha=0.6, color='coral')

axes[1,0].set_xlabel('Fitted Values')

axes[1,0].set_ylabel('v|Standardized Residuals|')

axes[1,0].set_title('Scale-Location (Homoscedasticity)')



# 4. Residuals vs Leverage (Cook's D)

axes[1,1].scatter(leverage, standardized_resid, alpha=0.6, color='orchid')

axes[1,1].axhline(0, color='black', linestyle='--')

axes[1,1].axvline(2*(2/len(X)), color='red', linestyle=':', label='Leverage threshold')

for i, (lev, res, cd) in enumerate(zip(leverage, standardized_resid, cooks_d)):

    if cd > 4/len(X):

        axes[1,1].annotate(f'Obs {i}', (lev, res), fontsize=8)

axes[1,1].set_xlabel('Leverage')

axes[1,1].set_ylabel('Standardized Residuals')

axes[1,1].set_title("Residuals vs Leverage (Cook's D)")

axes[1,1].legend()



plt.tight_layout()

plt.savefig('residual_analysis.png', dpi=150)

plt.show()



# Outlier test (Bonferroni-corrected)

print("\nOutlier Test (Bonferroni-corrected):")

outlier_results = model.outlier_test()

significant_outliers = outlier_results[outlier_results['bonf(p)'] < 0.05]

print(f"Significant outliers: {len(significant_outliers)}")

Key Diagnostics

|---------|---------|-----------|----------|

Residual Analysis — Diagnosing Regression Problems

Residual Analysis

Diagnosing What's Wrong With Your Regression

Key Diagnostics

Key Takeaways

Need Expert Statistics Help?