Bayesian Optimization for Hyperparameters
Grid search is brute force; random search is smarter but still wasteful. Bayesian optimization builds a probabilistic model of the objective function and uses it to intelligently select the next hyperparameters to evaluate — finding better models in fewer iterations.
Bayesian Optimization Loop
Why Bayesian Optimization
Each model training run is expensive. Bayesian optimization minimizes wasted trials by learning from past results, focusing evaluation on promising regions of the hyperparameter space.
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
import optuna
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import warnings
warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)
Generate Dataset
# Synthetic classification problem shared by every experiment below:
# 2000 samples, 30 features (15 informative, 5 redundant), fixed seed.
X, y = make_classification(
    random_state=42,
    n_samples=2000,
    n_features=30,
    n_informative=15,
    n_redundant=5,
)

# Shuffled, stratified 5-fold splitter with a fixed seed.
cv = StratifiedKFold(shuffle=True, n_splits=5, random_state=42)

class_counts = np.bincount(y)
print(f"Dataset: {X.shape}, classes: {class_counts}")
Grid Search vs Random Search vs Bayesian
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# Grid Search — exhaustive but slow: every combination in the grid
# (3 * 4 * 3 = 36 candidates) is scored with 3-fold cross-validation.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
)
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X, y)
print(
    f"Grid Search: {grid_search.best_score_:.4f} with {grid_search.best_params_}"
)
# Random Search — draws 20 candidate settings from the value lists below
# instead of trying every combination; each is scored with 3-fold CV.
param_distributions = dict(
    n_estimators=[50, 100, 200, 300, 500],
    max_depth=[3, 5, 7, 10, 15, None],
    min_samples_split=[2, 5, 10, 15, 20],
)
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_distributions,
    n_iter=20,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X, y)
print(
    f"Random Search: {random_search.best_score_:.4f} with {random_search.best_params_}"
)
Hyperopt: Bayesian Optimization
Basic Hyperopt Usage
from hyperopt import hp
# Hyperopt search space: one labelled expression per RandomForest
# hyperparameter. The two hp.uniform entries are continuous; the objective
# casts them to int before building the model.
space = dict(
    n_estimators=hp.choice('n_estimators', [50, 100, 200, 300, 500]),
    max_depth=hp.choice('max_depth', [3, 5, 7, 10, 15, None]),
    min_samples_split=hp.uniform('min_samples_split', 2, 20),
    min_samples_leaf=hp.uniform('min_samples_leaf', 1, 10),
    max_features=hp.choice('max_features', ['sqrt', 'log2', 0.5, 0.8]),
)
def objective(params):
    """Score one hyperparameter sample for hyperopt.

    Args:
        params: Mapping with the keys ``n_estimators``, ``max_depth``,
            ``min_samples_split``, ``min_samples_leaf`` and ``max_features``.

    Returns:
        A dict with ``loss`` (the negated mean 3-fold ROC AUC on the
        module-level ``X``/``y``) and ``status`` set to ``STATUS_OK``.
    """
    rf_kwargs = {
        'n_estimators': params['n_estimators'],
        'max_depth': params['max_depth'],
        # These two may arrive as floats; the forest is given truncated ints.
        'min_samples_split': int(params['min_samples_split']),
        'min_samples_leaf': int(params['min_samples_leaf']),
        'max_features': params['max_features'],
    }
    forest = RandomForestClassifier(random_state=42, n_jobs=-1, **rf_kwargs)
    fold_aucs = cross_val_score(forest, X, y, cv=3, scoring='roc_auc')
    mean_auc = fold_aucs.mean()
    # The loss is the negated AUC so that a lower loss means a better model.
    return {'loss': -mean_auc, 'status': STATUS_OK}
from hyperopt import space_eval

# Run 50 TPE evaluations of `objective` over `space`, recording every
# evaluation in `trials`.
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
    # fmin has no `random_state` keyword; the seed is passed as `rstate`.
    # Recent hyperopt releases expect a numpy Generator here (older ones
    # expected np.random.RandomState(42)).
    rstate=np.random.default_rng(42),
)
# For hp.choice parameters `best` holds the *index* of the chosen option,
# not the option itself; space_eval maps it back to real parameter values.
print(f"Best params: {space_eval(space, best)}")
# Losses are negated AUCs, so flip the sign back for display.
print(f"Best AUC: {-trials.best_trial['result']['loss']:.4f}")
Hyperopt with Different Algorithms
from hyperopt import rand

# TPE (Tree-structured Parzen Estimator) — default, good for most cases.
# Keep a handle on each Trials object: the best loss is read from it later.
trials_tpe = Trials()
best_tpe = fmin(
    fn=objective, space=space, algo=tpe.suggest, max_evals=30, trials=trials_tpe
)

# Random search as baseline. The random suggester lives in `hyperopt.rand`;
# the `hp` module has no `rand` attribute.
trials_random = Trials()
best_random = fmin(
    fn=objective, space=space, algo=rand.suggest, max_evals=30, trials=trials_random
)

# Read the results from the Trials objects that were actually passed to fmin
# (a freshly constructed Trials() is empty and has no best trial).
print(f"TPE best AUC: {-trials_tpe.best_trial['result']['loss']:.4f}")
print(f"Random best AUC: {-trials_random.best_trial['result']['loss']:.4f}")
Optuna: Modern Bayesian Optimization
Optuna provides a cleaner API with built-in pruning and visualization.
Basic Optuna Usage
def objective_optuna(trial):
    """Optuna objective: mean 3-fold ROC AUC of a random forest.

    Args:
        trial: Optuna trial used to draw the five forest hyperparameters.

    Returns:
        Mean ROC AUC over 3-fold cross-validation on the module-level
        ``X``/``y``.
    """
    # Keyword arguments are evaluated left to right, so the parameters are
    # suggested in the same order as they are listed here.
    forest = RandomForestClassifier(
        n_estimators=trial.suggest_int('n_estimators', 50, 500),
        max_depth=trial.suggest_int('max_depth', 3, 20),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 20),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
        max_features=trial.suggest_categorical(
            'max_features', ['sqrt', 'log2', 0.5, 0.8]
        ),
        random_state=42,
        n_jobs=-1,
    )
    fold_aucs = cross_val_score(forest, X, y, cv=3, scoring='roc_auc')
    return fold_aucs.mean()
# Seeded TPE sampler; the study maximises the value returned by the objective.
tpe_sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(sampler=tpe_sampler, direction='maximize')
study.optimize(objective_optuna, show_progress_bar=True, n_trials=50)

print(f"Best AUC: {study.best_value:.4f}")
print(f"Best params: {study.best_params}")
Optuna with Pruning
Stop unpromising trials early to save computation.
def objective_pruning(trial):
    """Optuna objective that reports a running AUC after every CV fold.

    Args:
        trial: Optuna trial used to draw hyperparameters, receive the
            intermediate scores and decide whether to prune.

    Returns:
        Mean ROC AUC across all folds of the module-level ``cv`` splitter.

    Raises:
        optuna.TrialPruned: If the trial asks to be pruned after a fold.
    """
    forest = RandomForestClassifier(
        n_estimators=trial.suggest_int('n_estimators', 50, 500),
        max_depth=trial.suggest_int('max_depth', 3, 20),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 20),
        random_state=42,
    )

    # Each fold acts as one "step" of training for pruning purposes.
    fold_aucs = []
    for step, (fit_rows, holdout_rows) in enumerate(cv.split(X, y)):
        forest.fit(X[fit_rows], y[fit_rows])
        holdout_proba = forest.predict_proba(X[holdout_rows])[:, 1]
        fold_aucs.append(roc_auc_score(y[holdout_rows], holdout_proba))

        # Report the mean over the folds seen so far, then check for pruning.
        trial.report(np.mean(fold_aucs), step)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return np.mean(fold_aucs)
# Median pruner configured with 5 startup trials and 2 warm-up steps.
median_pruner = optuna.pruners.MedianPruner(n_warmup_steps=2, n_startup_trials=5)
study_pruned = optuna.create_study(pruner=median_pruner, direction='maximize')
study_pruned.optimize(objective_pruning, show_progress_bar=True, n_trials=30)

print(f"Pruned study best: {study_pruned.best_value:.4f}")
Optuna with Conditional Search Spaces
Different hyperparameters for different model choices.
def objective_conditional(trial):
    """Optuna objective that first picks a model family, then its parameters.

    Each family draws its own, prefixed hyperparameters (``rf_*``, ``gbm_*``,
    ``xgb_*``), so a trial only contains the parameters of the family it chose.

    Args:
        trial: Optuna trial used to draw the model type and its parameters.

    Returns:
        Mean ROC AUC over 3-fold cross-validation on the module-level
        ``X``/``y``.

    Raises:
        ImportError: If the 'xgb' family is sampled and xgboost is not
            installed.
    """
    model_type = trial.suggest_categorical('model_type', ['rf', 'gbm', 'xgb'])

    if model_type == 'rf':
        n_estimators = trial.suggest_int('rf_n_estimators', 50, 300)
        max_depth = trial.suggest_int('rf_max_depth', 3, 15)
        model = RandomForestClassifier(
            n_estimators=n_estimators, max_depth=max_depth, random_state=42
        )
    elif model_type == 'gbm':
        n_estimators = trial.suggest_int('gbm_n_estimators', 50, 300)
        # Learning rates are sampled on a log scale.
        learning_rate = trial.suggest_float('gbm_lr', 0.01, 0.3, log=True)
        max_depth = trial.suggest_int('gbm_max_depth', 3, 10)
        model = GradientBoostingClassifier(
            n_estimators=n_estimators, learning_rate=learning_rate,
            max_depth=max_depth, random_state=42
        )
    else:
        # `xgb` is not imported at the top of the file, so it is imported here;
        # only this branch needs it.
        import xgboost as xgb

        n_estimators = trial.suggest_int('xgb_n_estimators', 50, 300)
        learning_rate = trial.suggest_float('xgb_lr', 0.01, 0.3, log=True)
        max_depth = trial.suggest_int('xgb_max_depth', 3, 10)
        model = xgb.XGBClassifier(
            n_estimators=n_estimators, learning_rate=learning_rate,
            max_depth=max_depth, random_state=42, eval_metric='logloss'
        )

    scores = cross_val_score(model, X, y, cv=3, scoring='roc_auc')
    return scores.mean()
# Model-selection study: 40 trials, higher objective value is better.
study = optuna.create_study(direction='maximize')
study.optimize(objective_conditional, n_trials=40)

winning_params = study.best_params
print(f"Best model: {winning_params['model_type']}")
print(f"Best AUC: {study.best_value:.4f}")
Search Space Design
Good search spaces reflect the structure of the problem. The Expected Improvement (EI) acquisition function balances exploration and exploitation:
$$\mathrm{EI}(x) = \mathbb{E}\left[\max\bigl(f(x) - f(x^{+}),\, 0\bigr)\right]$$
where $f(x^{+})$ is the best observed value. EI is high where the model predicts improvement with high confidence (exploitation) or where uncertainty is high (exploration).
# Continuous parameters — use log scale for learning rates
# (bounds are given as logs of the actual range, 0.001 to 0.3)
learning_rate = hp.loguniform('learning_rate', np.log(0.001), np.log(0.3))
# Integer-valued parameters — listed explicitly with hp.choice here, which
# also lets None be one of the options
max_depth = hp.choice('max_depth', [3, 4, 5, 6, 7, 8, 10, 12, 15, None])
# Categorical parameters
optimizer = hp.choice('optimizer', ['adam', 'sgd', 'rmsprop'])
# Conditional parameters — depends on other choices
# NOTE(review): as written these are two independent expressions; nothing ties
# dropout_rate to use_dropout, so consuming code has to ignore dropout_rate
# itself when use_dropout is False.
use_dropout = hp.choice('use_dropout', [True, False])
dropout_rate = hp.uniform('dropout_rate', 0.1, 0.5) # only relevant if use_dropout=True
# Nested conditionals
def create_search_space():
    """Build a hyperopt search space whose parameters depend on the model type.

    The model type is an ``hp.choice`` between two complete sub-spaces, so the
    branch is chosen each time the space is sampled. A Python ``if`` on the
    ``hp.choice`` expression cannot do this: the expression has no value until
    it is sampled, so the ``if`` would be decided once, when the space is
    built, rather than per trial.

    Returns:
        A hyperopt expression that samples to a dict. Every sample has a
        ``model_type`` key ('neural' or 'tree') plus that model's own
        parameters: ``hidden_layers``, ``learning_rate`` and ``dropout`` for
        'neural'; ``n_estimators`` and ``max_depth`` for 'tree'.
    """
    neural_space = {
        'model_type': 'neural',
        'hidden_layers': hp.choice('hidden_layers', [1, 2, 3]),
        'learning_rate': hp.loguniform('lr', np.log(1e-4), np.log(1e-1)),
        'dropout': hp.uniform('dropout', 0.1, 0.5),
    }
    tree_space = {
        'model_type': 'tree',
        'n_estimators': hp.choice('n_estimators', [100, 200, 500]),
        'max_depth': hp.choice('max_depth', [5, 10, 15, 20]),
    }
    # Only the parameters of the selected sub-space are sampled for a trial.
    return hp.choice('model_type', [neural_space, tree_space])
Optuna Visualization
# NOTE: the figures are created but not displayed here; display or save the
# returned figure objects as needed.

# Optimization history
fig1 = optuna.visualization.plot_optimization_history(study)
print("Optimization history plotted")

# Parameter importances
fig2 = optuna.visualization.plot_param_importances(study)
print("Parameter importances plotted")

# Parallel coordinate plot
fig3 = optuna.visualization.plot_parallel_coordinate(study)
print("Parallel coordinates plotted")

# Slice plot — parameter relationships
fig4 = optuna.visualization.plot_slice(study)
print("Slice plot plotted")

# Contour plot — parameter interactions.
# `study` at this point is the conditional model-selection study, whose
# parameters are prefixed per model ('rf_', 'gbm_', 'xgb_'); the bare names
# 'max_depth' / 'n_estimators' do not exist in it and make plot_contour raise.
# Plot the depth/size interaction for the winning model family instead.
contour_prefix = study.best_params['model_type']
fig5 = optuna.visualization.plot_contour(
    study,
    params=[f'{contour_prefix}_max_depth', f'{contour_prefix}_n_estimators'],
)
print("Contour plot plotted")
Practical Comparison
# Collects {'score': ..., 'time': ...} for each search strategy.
results = {}

# Grid search
import time

start = time.time()
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid={'n_estimators': [50, 100, 200], 'max_depth': [5, 10, 15]},
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X, y)
# Elapsed time covers building and fitting the search.
results['Grid Search (9 combos)'] = dict(
    score=grid_search.best_score_,
    time=time.time() - start,
)
# Random search: 20 settings drawn from the integer ranges below.
start = time.time()
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions={
        'n_estimators': range(50, 500),
        'max_depth': range(3, 30),
    },
    n_iter=20,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X, y)
results['Random Search (20 iters)'] = dict(
    score=random_search.best_score_,
    time=time.time() - start,
)
# Bayesian (Optuna)
def obj(trial):
    """Return the mean 3-fold ROC AUC of a forest with sampled size and depth.

    Args:
        trial: Optuna trial used to draw ``n_estimators`` and ``max_depth``.

    Returns:
        Mean ROC AUC over 3-fold cross-validation on the module-level
        ``X``/``y``.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 30)
    }
    model = RandomForestClassifier(**params, random_state=42)
    return cross_val_score(model, X, y, cv=3, scoring='roc_auc').mean()


# Start the clock after the definition so only study creation and
# optimisation are timed.
start = time.time()
# Seed the sampler like the other methods in this comparison (seed 42), so the
# results table is reproducible from run to run.
study = optuna.create_study(
    direction='maximize', sampler=optuna.samplers.TPESampler(seed=42)
)
study.optimize(obj, n_trials=20)
results['Bayesian (20 trials)'] = {
    'score': study.best_value,
    'time': time.time() - start
}

# One row per search method, with 'score' and 'time' columns.
results_df = pd.DataFrame(results).T
print(results_df)
Best Practices
- Start with random search — establishes a baseline quickly
- Use log scale for learning rates — they span orders of magnitude
- Prune early — stop unpromising trials to save compute
- Set a timeout — prevents runaway optimization
- Use conditional spaces — model-specific hyperparameters matter
- Track everything — Optuna's database backend enables experiment management
Summary
Bayesian optimization with Optuna or Hyperopt finds better hyperparameters in fewer trials than grid or random search. Combined with pruning, conditional spaces, and visualization, it's the most efficient way to tune models. Master search space design and early stopping for maximum impact.