Elastic Net Regression
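Elastic Net combines the Ridge (L2) and Lasso (L1) penalties. In scikit-learn, `alpha` sets the overall penalty strength and `l1_ratio` sets the mix between the two:
- l1_ratio = 0: pure Ridge; l1_ratio = 1: pure Lasso; 0 < l1_ratio < 1: Elastic Net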
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import ElasticNet, ElasticNetCV, Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
# Fix the global NumPy seed so the synthetic data set is reproducible.
np.random.seed(42)
n, p = 200, 50  # n samples, p features

# Create groups of correlated features.
# Start from independent standard-normal columns, then overwrite columns
# 1-4, 6-9 and 11-14 with noisy copies of columns 0, 5 and 10 respectively,
# which yields 3 groups of 5 strongly correlated features.
X = np.random.randn(n, p)
for group_start in range(0, 15, 5):
    for j in range(group_start + 1, group_start + 5):
        X[:, j] = X[:, group_start] + np.random.randn(n) * 0.3

# Only the 15 grouped features carry signal; the remaining coefficients stay 0.
true_beta = np.zeros(p)
true_beta[:15] = [
    3, -2, 2, -1.5, 1,       # group starting at feature 0
    2.5, -2, 1.5, -1, 0.8,   # group starting at feature 5
    -3, 2, -2, 1, -1.5,      # group starting at feature 10
]

# Linear response plus Gaussian noise with standard deviation 2.
y = X @ true_beta + np.random.randn(n) * 2
# Hold out 30% of the samples for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Compare Ridge, Lasso, ElasticNet.
# Every estimator is wrapped in a pipeline that standardizes the features
# before the penalized fit. `results` maps the model name to its test MSE,
# its number of nonzero coefficients, and the coefficient vector itself.
results = {}
for name, Model, kwargs in [
    ('Ridge', Ridge, {'alpha': 1.0}),
    ('Lasso', Lasso, {'alpha': 0.1, 'max_iter': 10000}),
    ('ElasticNet', ElasticNet, {'alpha': 0.1, 'l1_ratio': 0.5, 'max_iter': 10000}),
]:
    model = Pipeline([('s', StandardScaler()), ('m', Model(**kwargs))])
    model.fit(X_train, y_train)
    test_mse = np.mean((y_test - model.predict(X_test)) ** 2)
    coef = model.named_steps['m'].coef_
    # Measure sparsity the same way for every model instead of assuming that
    # Ridge keeps all p coefficients.
    nonzero = int(np.count_nonzero(coef))
    results[name] = {'mse': test_mse, 'nonzero': nonzero, 'coef': coef}
    print(f"{name}: Test MSE={test_mse:.4f}, Nonzero={nonzero}/{p}")
# Cross-validated Elastic Net: let 5-fold CV choose alpha and l1_ratio
l1_ratio_grid = [0.1, 0.3, 0.5, 0.7, 0.9, 0.95, 1.0]
cv_estimator = ElasticNetCV(
    l1_ratio=l1_ratio_grid, cv=5, random_state=42, max_iter=10000
)
enet_cv = Pipeline([('s', StandardScaler()), ('m', cv_estimator)])
enet_cv.fit(X_train, y_train)

# Read the selected hyperparameters off the fitted estimator step.
fitted_cv = enet_cv.named_steps['m']
best_alpha, best_l1 = fitted_cv.alpha_, fitted_cv.l1_ratio_

# Mean squared error on the held-out test set.
cv_residuals = y_test - enet_cv.predict(X_test)
test_mse_cv = np.mean(cv_residuals ** 2)
print(f"\nElasticNetCV: best α={best_alpha:.4f}, l1_ratio={best_l1:.2f}, Test MSE={test_mse_cv:.4f}")
# Coefficient comparison: one bar chart per fitted model, side by side.
# The first 15 features (the ones given nonzero true coefficients) are drawn
# in red, all others in steelblue. The colour list is identical for every
# panel, so it is built once outside the loop.
colors = ['red' if j < 15 else 'steelblue' for j in range(p)]
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for ax, (name, res) in zip(axes, results.items()):
    ax.bar(range(p), res['coef'], color=colors, alpha=0.7)
    ax.axhline(0, color='black', linewidth=0.5)
    # Two-line title: the line break has to be an escaped "\n" inside the
    # literal, a raw newline would leave the string unterminated.
    ax.set_title(f'{name} (MSE={res["mse"]:.3f})\n'
                 f'{res["nonzero"]} nonzero coefficients')
    ax.set_xlabel('Feature Index')
plt.suptitle('Ridge vs Lasso vs Elastic Net Coefficients', fontsize=13)
plt.tight_layout()
# Save before show(): some backends clear the figure once the window closes.
plt.savefig('elastic_net.png', dpi=150)
plt.show()
Key Takeaways
- Elastic Net is best when predictors are grouped in correlated clusters
- Lasso tends to pick one feature from a correlated group, more or less arbitrarily; Elastic Net tends to keep or drop the whole group together (the "grouping effect")
- l1_ratio closer to 1: more like Lasso (sparser); closer to 0: more like Ridge
- ElasticNetCV selects both the penalty strength (`alpha` in scikit-learn, often written λ) and l1_ratio via cross-validation
- A sensible default for many ML pipelines: Elastic Net with cross-validated hyperparameters