Logistic Regression
Logistic regression models the probability of a binary outcome as a function of predictors:
$$P(Y = 1 \mid X) = \frac{1}{1 + e^{-(\beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p)}}$$
The model is linear in the log-odds (logit):
$$\log\frac{P(Y = 1 \mid X)}{1 - P(Y = 1 \mid X)} = \beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p$$
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from scipy import stats
# Simulate a loan-default data set: credit score, income, debt-to-income (DTI).
np.random.seed(42)  # fixed seed so the simulated sample is reproducible
n = 300

# Predictors. The draw order (credit, income, dti) is kept stable so the
# seeded random stream always produces the same sample.
credit_score = np.random.normal(650, 80, n)
income = np.random.lognormal(10.5, 0.5, n)
dti = np.random.uniform(0.1, 0.7, n)

# True data-generating model on the log-odds scale.
# The intercept has to offset the large negative credit-score term
# (-0.008 * 650 ~= -5.2). An intercept near -5 would put the typical
# log-odds around -10 (P(default) ~ 5e-5), so a sample of 300 would almost
# surely contain no defaults at all; the logistic fit and the ROC analysis
# need both outcome classes to be present. An intercept of 3 gives a default
# rate on the order of 15-20%, which is still an imbalanced outcome.
intercept = 3.0
log_odds = intercept + (-0.008) * credit_score + (-0.00002) * income + 3 * dti
p_default = 1 / (1 + np.exp(-log_odds))  # inverse logit (sigmoid)
default = np.random.binomial(1, p_default)  # one Bernoulli draw per borrower

df = pd.DataFrame({
    'default': default,
    'credit': credit_score,
    'income': income,
    'dti': dti,
})
# Fit the logit model: default ~ const + credit + income + dti.
predictor_cols = ['credit', 'income', 'dti']
X = sm.add_constant(df[predictor_cols])  # adds the 'const' intercept column
logit_spec = sm.Logit(df['default'], X)
model = logit_spec.fit()
print(model.summary())
# Odds ratios
# Exponentiating the coefficients and their confidence bounds moves them
# from the log-odds scale to the odds-ratio scale.
print("\nOdds Ratios and 95% CI:")
conf_bounds = model.conf_int()  # column 0 = lower bound, column 1 = upper bound
coef_df = pd.DataFrame({
    'OR': np.exp(model.params),
    'CI_lower': np.exp(conf_bounds[0]),
    'CI_upper': np.exp(conf_bounds[1]),
    'p_value': model.pvalues,
})
coef_df = coef_df.drop('const')  # the intercept row is left out of the table
print(coef_df.round(4))
# Interpretation
# Report the DTI coefficient as an odds ratio and as a percent change in odds.
dti_odds_ratio = np.exp(model.params['dti'])
dti_pct_change = (dti_odds_ratio - 1) * 100
print("\nFor 1-unit increase in DTI ratio:")
print(f" Odds of default multiply by {dti_odds_ratio:.2f}")
print(f" ({dti_pct_change:.1f}% increase in odds)")
# Predicted probabilities
# Score the training rows, classify at a 0.5 cut-off, and report the share
# of rows whose predicted class matches the observed outcome.
probs = model.predict(X)
flagged = probs >= 0.5
pred_class = flagged.astype(int)
hits = pred_class == df['default']
accuracy = hits.mean()
print(f"\nAccuracy (threshold=0.5): {accuracy:.4f}")
# ROC curve
from sklearn.metrics import roc_curve, roc_auc_score

observed = df['default']
fpr, tpr, thresholds = roc_curve(observed, probs)
auc = roc_auc_score(observed, probs)

# Two side-by-side panels; the left one holds the ROC curve.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
roc_ax = axes[0]
roc_ax.plot(fpr, tpr, 'b-', linewidth=2)
roc_ax.plot([0, 1], [0, 1], 'k--')  # dashed diagonal reference line
roc_ax.set_title(f'ROC Curve (AUC = {auc:.4f})')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
# Right panel: predicted default probability across the observed credit-score
# range, one curve per DTI level, with income held at its sample median.
credit_range = np.linspace(credit_score.min(), credit_score.max(), 100)
median_income = np.median(income)
dti_scenarios = [
    (0.2, 'green', 'Low DTI (0.2)'),
    (0.4, 'blue', 'Med DTI (0.4)'),
    (0.6, 'red', 'High DTI (0.6)'),
]
prob_ax = axes[1]
for dti_val, color, label in dti_scenarios:
    # Scalars broadcast against credit_range, giving one row per grid point.
    new_X = pd.DataFrame({
        'const': 1,
        'credit': credit_range,
        'income': median_income,
        'dti': dti_val,
    })
    pred_p = model.predict(new_X)
    prob_ax.plot(credit_range, pred_p, color=color, linewidth=2, label=label)
prob_ax.set_title('Predicted Default Probability')
prob_ax.set_xlabel('Credit Score')
prob_ax.set_ylabel('P(Default)')
prob_ax.legend()

# Write the figure to disk before showing it.
plt.tight_layout()
plt.savefig('logistic_regression.png', dpi=150)
plt.show()
Key Takeaways
- Logistic regression outputs probabilities via the sigmoid function
- Coefficients are changes in the log-odds per one-unit increase in a predictor — exponentiate them to get odds ratios
- MLE, not OLS, is used to fit logistic regression
- The likelihood ratio test (LRT) is generally more reliable than the Wald test for hypothesis testing, especially with small samples or large coefficient estimates
- AUC-ROC is a better performance metric than accuracy for imbalanced classes