Bayesian Optimization for Hyperparameters

Grid search is brute force; random search is smarter but still wasteful. Bayesian optimization builds a probabilistic model of the objective function and uses it to intelligently select the next hyperparameters to evaluate – finding better models in fewer iterations.

Bayesian Optimization Loop

Why Bayesian Optimization

Each model training run is expensive. Bayesian optimization minimizes wasted trials by learning from past results, focusing evaluation on promising regions of the hyperparameter space.

import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
import optuna
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import warnings
# Suppress every warning raised through the `warnings` module for the rest of the script.
warnings.filterwarnings('ignore')

# Raise Optuna's logging threshold to WARNING so lower-level messages are not emitted.
optuna.logging.set_verbosity(optuna.logging.WARNING)

Generate Dataset

# Synthetic classification data used by every experiment in this file.
X, y = make_classification(
    n_samples=2000,
    n_features=30,
    n_informative=15,
    n_redundant=5,
    random_state=42,
)

# Shuffled, seeded 5-fold stratified splitter (used by the manual CV loop in the pruning example).
cv = StratifiedKFold(shuffle=True, n_splits=5, random_state=42)

# Report the feature-matrix shape and the number of samples per class.
print(f"Dataset: {X.shape}, classes: {np.bincount(y)}")

Grid Search vs Random Search vs Bayesian

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Grid search: fits every combination in the grid (3 * 4 * 3 = 36 candidates),
# each scored by 3-fold cross-validated ROC AUC.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
)

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X, y)
print(f"Grid Search: {grid_search.best_score_:.4f} with {grid_search.best_params_}")

# Random search: draws 20 candidate settings from the value lists below
# instead of enumerating every combination.
param_distributions = dict(
    n_estimators=[50, 100, 200, 300, 500],
    max_depth=[3, 5, 7, 10, 15, None],
    min_samples_split=[2, 5, 10, 15, 20],
)

random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_distributions,
    n_iter=20,
    scoring='roc_auc',
    cv=3,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X, y)
print(f"Random Search: {random_search.best_score_:.4f} with {random_search.best_params_}")

Hyperopt: Bayesian Optimization

Basic Hyperopt Usage

from hyperopt import hp

# Hyperopt search space for the random forest.
# hp.choice selects one entry from a list; hp.uniform draws a float between the two bounds
# (the objective casts the two float-valued parameters to int before use).
space = dict(
    n_estimators=hp.choice('n_estimators', [50, 100, 200, 300, 500]),
    max_depth=hp.choice('max_depth', [3, 5, 7, 10, 15, None]),
    min_samples_split=hp.uniform('min_samples_split', 2, 20),
    min_samples_leaf=hp.uniform('min_samples_leaf', 1, 10),
    max_features=hp.choice('max_features', ['sqrt', 'log2', 0.5, 0.8]),
)

def objective(params):
    """Hyperopt objective: cross-validate a random forest built from ``params``.

    Args:
        params: Mapping with the keys 'n_estimators', 'max_depth',
            'min_samples_split', 'min_samples_leaf' and 'max_features'.

    Returns:
        Dict with 'loss' set to the negated mean 3-fold ROC AUC (Hyperopt
        minimizes the loss) and 'status' set to STATUS_OK.
    """
    forest_kwargs = {
        'n_estimators': params['n_estimators'],
        'max_depth': params['max_depth'],
        # These two arrive as floats from the search space; pass them on as integers.
        'min_samples_split': int(params['min_samples_split']),
        'min_samples_leaf': int(params['min_samples_leaf']),
        'max_features': params['max_features'],
    }
    forest = RandomForestClassifier(random_state=42, n_jobs=-1, **forest_kwargs)
    fold_aucs = cross_val_score(forest, X, y, cv=3, scoring='roc_auc')
    mean_auc = fold_aucs.mean()
    return {'loss': -mean_auc, 'status': STATUS_OK}

from hyperopt import space_eval

# Run 50 TPE-guided evaluations of `objective`, recording every trial in `trials`.
trials = Trials()
best = fmin(
    fn=objective, space=space, algo=tpe.suggest,
    max_evals=50, trials=trials,
    # fmin has no `random_state` keyword; the seed is supplied through `rstate`.
    # (Recent Hyperopt releases expect a numpy Generator; older ones expect
    # np.random.RandomState(42) instead.)
    rstate=np.random.default_rng(42),
)

# `best` stores hp.choice parameters as indices into their option lists;
# space_eval maps the result back to the actual hyperparameter values.
best_params = space_eval(space, best)
print(f"Best params: {best_params}")
print(f"Best AUC: {-trials.best_trial['result']['loss']:.4f}")

Hyperopt with Different Algorithms

from hyperopt import rand

# Keep a handle on each Trials object so the results can be inspected after fmin returns.
trials_tpe = Trials()
trials_random = Trials()

# TPE (Tree-structured Parzen Estimator) – default, good for most cases
best_tpe = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=30, trials=trials_tpe)

# Random search as baseline – the algorithm lives in `hyperopt.rand`, not under `hyperopt.hp`
best_random = fmin(fn=objective, space=space, algo=rand.suggest, max_evals=30, trials=trials_random)

# Losses are negated AUCs, so flip the sign back when reporting.
print(f"TPE best AUC: {-trials_tpe.best_trial['result']['loss']:.4f}")
print(f"Random best AUC: {-trials_random.best_trial['result']['loss']:.4f}")

Optuna: Modern Bayesian Optimization

Optuna provides a cleaner API with built-in pruning and visualization.

Basic Optuna Usage

def objective_optuna(trial):
    """Optuna objective: mean 3-fold ROC AUC of a random forest.

    Args:
        trial: Optuna trial used to sample the five hyperparameters.

    Returns:
        Mean cross-validated ROC AUC (the study maximizes this value).
    """
    # Sample in a fixed order: three integer ranges, then leaf size, then the categorical.
    n_estimators = trial.suggest_int('n_estimators', 50, 500)
    max_depth = trial.suggest_int('max_depth', 3, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', 0.5, 0.8])

    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,
        n_jobs=-1,
    )
    fold_aucs = cross_val_score(forest, X, y, cv=3, scoring='roc_auc')
    return fold_aucs.mean()

# Seeded TPE sampler; the study maximizes the value returned by the objective.
tpe_sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(sampler=tpe_sampler, direction='maximize')
study.optimize(objective_optuna, show_progress_bar=True, n_trials=50)

# Report the best objective value and the hyperparameters that produced it.
print(f"Best AUC: {study.best_value:.4f}")
print(f"Best params: {study.best_params}")

Optuna with Pruning

Stop unpromising trials early to save computation.

def objective_pruning(trial):
    """Optuna objective that reports a running AUC after each CV fold so trials can be pruned.

    Args:
        trial: Optuna trial used to sample hyperparameters and to report
            intermediate values.

    Returns:
        Mean ROC AUC over all folds of the module-level ``cv`` splitter.

    Raises:
        optuna.TrialPruned: When the pruner decides to stop the trial after a fold.
    """
    # Keyword arguments are evaluated left to right, so the sampling order is
    # n_estimators, max_depth, min_samples_split.
    forest = RandomForestClassifier(
        n_estimators=trial.suggest_int('n_estimators', 50, 500),
        max_depth=trial.suggest_int('max_depth', 3, 20),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 20),
        random_state=42,
    )

    fold_aucs = []
    for step, (train_rows, valid_rows) in enumerate(cv.split(X, y)):
        forest.fit(X[train_rows], y[train_rows])
        positive_proba = forest.predict_proba(X[valid_rows])[:, 1]
        fold_aucs.append(roc_auc_score(y[valid_rows], positive_proba))

        # The running mean after this fold is the intermediate value the pruner sees.
        running_mean = np.mean(fold_aucs)
        trial.report(running_mean, step)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return np.mean(fold_aucs)

# Median pruner configured with 5 startup trials and 2 warm-up steps.
median_pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=2)
study_pruned = optuna.create_study(pruner=median_pruner, direction='maximize')
study_pruned.optimize(objective_pruning, show_progress_bar=True, n_trials=30)
print(f"Pruned study best: {study_pruned.best_value:.4f}")

Optuna with Conditional Search Spaces

Different hyperparameters for different model choices.

def objective_conditional(trial):
    """Optuna objective that first picks a model family, then samples only that family's hyperparameters.

    Args:
        trial: Optuna trial. 'model_type' is sampled from 'rf', 'gbm' and 'xgb';
            the remaining parameter names are prefixed with the chosen family.

    Returns:
        Mean 3-fold cross-validated ROC AUC of the selected model.
    """
    model_type = trial.suggest_categorical('model_type', ['rf', 'gbm', 'xgb'])

    if model_type == 'rf':
        n_estimators = trial.suggest_int('rf_n_estimators', 50, 300)
        max_depth = trial.suggest_int('rf_max_depth', 3, 15)
        model = RandomForestClassifier(
            n_estimators=n_estimators, max_depth=max_depth, random_state=42
        )
    elif model_type == 'gbm':
        n_estimators = trial.suggest_int('gbm_n_estimators', 50, 300)
        # Learning rates are sampled on a log scale.
        learning_rate = trial.suggest_float('gbm_lr', 0.01, 0.3, log=True)
        max_depth = trial.suggest_int('gbm_max_depth', 3, 10)
        model = GradientBoostingClassifier(
            n_estimators=n_estimators, learning_rate=learning_rate,
            max_depth=max_depth, random_state=42
        )
    else:
        # `xgb` is not among the file's top-level imports, so bind it here;
        # without this import the branch fails with NameError.
        import xgboost as xgb

        n_estimators = trial.suggest_int('xgb_n_estimators', 50, 300)
        learning_rate = trial.suggest_float('xgb_lr', 0.01, 0.3, log=True)
        max_depth = trial.suggest_int('xgb_max_depth', 3, 10)
        model = xgb.XGBClassifier(
            n_estimators=n_estimators, learning_rate=learning_rate,
            max_depth=max_depth, random_state=42, eval_metric='logloss'
        )

    scores = cross_val_score(model, X, y, cv=3, scoring='roc_auc')
    return scores.mean()

# Use a dedicated name instead of rebinding `study`: this study only samples
# 'model_type' and family-prefixed parameters, whereas the later contour plot
# asks `study` for 'max_depth' and 'n_estimators'.
study_conditional = optuna.create_study(direction='maximize')
study_conditional.optimize(objective_conditional, n_trials=40)
print(f"Best model: {study_conditional.best_params['model_type']}")
print(f"Best AUC: {study_conditional.best_value:.4f}")

Search Space Design

Good search spaces reflect the structure of the problem. The Expected Improvement (EI) acquisition function balances exploration and exploitation:

$$\alpha(x) = \mathbb{E}\left[\max\left(f(x) - f(x^+),\, 0\right)\right]$$

where $f(x^+)$ is the best value observed so far (assuming the objective is being maximized). EI is high where the model predicts improvement with high confidence (exploitation) or where uncertainty is high (exploration).

# Continuous parameters – use log scale for learning rates
# (hp.loguniform takes its bounds in log space, hence the np.log calls)
learning_rate = hp.loguniform('learning_rate', np.log(0.001), np.log(0.3))

# Integer-valued parameters – hp.choice over an explicit list of candidates
# (None is included as one of the options)
max_depth = hp.choice('max_depth', [3, 4, 5, 6, 7, 8, 10, 12, 15, None])

# Categorical parameters
optimizer = hp.choice('optimizer', ['adam', 'sgd', 'rmsprop'])

# Conditional parameters – depends on other choices
# NOTE(review): as written these are two independent expressions; nothing here
# links dropout_rate to the value of use_dropout.
use_dropout = hp.choice('use_dropout', [True, False])
dropout_rate = hp.uniform('dropout_rate', 0.1, 0.5)  # intended to matter only when use_dropout is True

# Nested conditionals
def create_search_space():
    """Build a conditional Hyperopt space whose hyperparameters depend on the model type.

    The branching is expressed as an ``hp.choice`` over two sub-space dicts.
    A Python ``if`` on the result of ``hp.choice`` would run once, while the
    space is being built and before anything is sampled, so it could never
    follow the sampled model type.

    Returns:
        A Hyperopt expression that samples to a dict. The dict always has a
        'model_type' key ('neural' or 'tree') plus only the hyperparameters
        belonging to that model type.
    """
    neural_space = {
        'model_type': 'neural',
        'hidden_layers': hp.choice('hidden_layers', [1, 2, 3]),
        # Learning rate sampled on a log scale between 1e-4 and 1e-1.
        'learning_rate': hp.loguniform('lr', np.log(1e-4), np.log(1e-1)),
        'dropout': hp.uniform('dropout', 0.1, 0.5),
    }
    tree_space = {
        'model_type': 'tree',
        'n_estimators': hp.choice('n_estimators', [100, 200, 500]),
        'max_depth': hp.choice('max_depth', [5, 10, 15, 20]),
    }
    # Only the selected branch's parameters are sampled for a given trial.
    return hp.choice('model_type', [neural_space, tree_space])

Optuna Visualization

# Short alias for the plotting namespace; every figure below is built from `study`.
viz = optuna.visualization

# Objective value across trials
fig1 = viz.plot_optimization_history(study)
print("Optimization history plotted")

# Relative importance of each hyperparameter
fig2 = viz.plot_param_importances(study)
print("Parameter importances plotted")

# One line per trial across all parameter axes
fig3 = viz.plot_parallel_coordinate(study)
print("Parallel coordinates plotted")

# Objective value against each individual parameter
fig4 = viz.plot_slice(study)
print("Slice plot plotted")

# Pairwise interaction; `study` must contain both 'max_depth' and 'n_estimators'
fig5 = viz.plot_contour(study, params=['max_depth', 'n_estimators'])
print("Contour plot plotted")

Practical Comparison

import time

# Maps a method label to its best CV score and wall-clock duration in seconds.
results = {}

# Grid search over a 3 x 3 grid
start = time.time()
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid={'n_estimators': [50, 100, 200], 'max_depth': [5, 10, 15]},
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X, y)
results['Grid Search (9 combos)'] = dict(
    score=grid_search.best_score_,
    time=time.time() - start,
)

# Random search: 20 draws from integer ranges for both parameters
start = time.time()
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions={'n_estimators': range(50, 500), 'max_depth': range(3, 30)},
    n_iter=20,
    scoring='roc_auc',
    cv=3,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X, y)
results['Random Search (20 iters)'] = dict(
    score=random_search.best_score_,
    time=time.time() - start,
)

# Bayesian (Optuna)
def obj(trial):
    """Optuna objective for the comparison: mean 3-fold ROC AUC of a random forest.

    Args:
        trial: Optuna trial used to sample 'n_estimators' and 'max_depth'.

    Returns:
        Mean cross-validated ROC AUC.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 30)
    }
    # n_jobs=-1 so this run uses all cores, like the grid and random searches
    # it is timed against.
    model = RandomForestClassifier(**params, random_state=42, n_jobs=-1)
    return cross_val_score(model, X, y, cv=3, scoring='roc_auc').mean()


# Start the clock after the definition so only the optimization itself is timed.
start = time.time()
# Seed the sampler so the comparison is reproducible, matching the seeded random search.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(obj, n_trials=20)
results['Bayesian (20 trials)'] = {
    'score': study.best_value,
    'time': time.time() - start
}

# One row per search method, with 'score' and 'time' columns.
results_df = pd.DataFrame(results).transpose()
print(results_df)

Best Practices

Start with random search – establishes a baseline quickly
Use log scale for learning rates – they span orders of magnitude
Prune early – stop unpromising trials to save compute
Set a timeout – prevents runaway optimization
Use conditional spaces – model-specific hyperparameters matter
Track everything – Optuna's database backend enables experiment management

Summary

Bayesian optimization with Optuna or Hyperopt typically finds better hyperparameters in fewer trials than grid or random search. Combined with pruning, conditional spaces, and visualization, it is usually the most efficient way to tune models. Master search space design and early stopping for maximum impact.