Parameters vs Hyperparameters
Parameters are learned from data during training (e.g., neural network weights). Hyperparameters are set before training and control the learning process.
# Examples of hyperparameters:
# Random Forest
# - n_estimators: number of trees
# - max_depth: tree depth
# - min_samples_split: minimum samples to split
# Neural Networks
# - learning_rate: step size for gradient descent
# - batch_size: samples per gradient update
# - num_layers: network depth
# XGBoost
# - learning_rate: shrinkage
# - max_depth: tree depth
# - n_estimators: boosting rounds
# Why tune?
# Default hyperparameters rarely give the best results
# Proper tuning can improve accuracy by 5-20%!
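To make the distinction concrete, here is a minimal sketch using scikit-learn's LogisticRegression (any estimator works the same way): constructor arguments are hyperparameters you choose, while fitted attributes are parameters learned from the data.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=200, random_state=42)

# Hyperparameter: set by you before training
model = LogisticRegression(C=0.1)  # C = inverse regularization strength

# Parameters: learned from the data during training
model.fit(X, y)
print(model.coef_, model.intercept_)  # fitted weights and bias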
Grid Search
Exhaustively search through all combinations of specified hyperparameters.
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
# Define parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
# This will try 3 * 4 * 3 * 3 = 108 combinations (540 model fits with 5-fold CV)!
model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    model,
    param_grid,
    cv=5,          # 5-fold cross-validation
    scoring='f1',  # Metric to optimize
    n_jobs=-1,     # Use all CPU cores
    verbose=2
)
grid_search.fit(X_train, y_train)
# Best results
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.4f}")
# Use best model
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)
# Pros: Thorough, guaranteed to find best in grid
# Cons: Very slow, exponential growth with parameters
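When the full grid is too slow, scikit-learn also offers successive halving, which starts all candidates on a small budget and keeps only the best performers for larger budgets. A minimal sketch reusing the param_grid above (HalvingGridSearchCV is still experimental, hence the extra enabling import):

from sklearn.experimental import enable_halving_search_cv  # noqa: F401 (enables the class below)
from sklearn.model_selection import HalvingGridSearchCV

halving_search = HalvingGridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    factor=3,      # keep roughly the top 1/3 of candidates each round
    cv=5,
    scoring='f1',
    n_jobs=-1
)
halving_search.fit(X_train, y_train)
print(halving_search.best_params_)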
Random Search
Randomly sample from parameter distributions. Often better than grid search!
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
# Define parameter distributions
param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(3, 20),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    'max_features': uniform(0.1, 0.9)  # Continuous: uniform(loc, scale) samples from [0.1, 1.0]
}
random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=50,     # Number of random combinations to sample
    cv=5,
    scoring='f1',
    n_jobs=-1,
    random_state=42,
    verbose=2
)
random_search.fit(X_train, y_train)
print(f"Best parameters: {random_search.best_params_}")
print(f"Best score: {random_search.best_score_:.4f}")
# Why Random Search is often better:
# - Same compute budget explores more values per parameter
# - Some parameters matter more than others
# - Not all combinations need to be tested
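Both search classes expose cv_results_, which is useful for seeing how scores varied across the sampled configurations; a small sketch with pandas:

import pandas as pd

# Rank all 50 sampled configurations by mean cross-validated score
results = pd.DataFrame(random_search.cv_results_)
cols = ['param_n_estimators', 'param_max_depth', 'mean_test_score', 'std_test_score']
print(results.sort_values('mean_test_score', ascending=False)[cols].head(10))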
Bayesian Optimization with Optuna
Intelligently explore the parameter space using a probabilistic model that learns from previous trials. A state-of-the-art approach.
# pip install optuna
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def objective(trial):
    # Define hyperparameter search space
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_float('max_features', 0.1, 1.0)
    }
    model = RandomForestClassifier(**params, random_state=42)
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='f1')
    return score.mean()
# Create study and optimize
study = optuna.create_study(direction='maximize') # Maximize F1
study.optimize(objective, n_trials=100, show_progress_bar=True)
# Results
print(f"Best trial: {study.best_trial.params}")
print(f"Best score: {study.best_value:.4f}")
# Train final model with best params
best_model = RandomForestClassifier(**study.best_trial.params, random_state=42)
best_model.fit(X_train, y_train)
# Visualize optimization (these return Plotly figures; .show() renders them outside notebooks)
optuna.visualization.plot_optimization_history(study).show()
optuna.visualization.plot_param_importances(study).show()
Optuna for Deep Learning
import optuna
import tensorflow as tf
from tensorflow import keras
def create_model(trial):
    # Architecture hyperparameters
    n_layers = trial.suggest_int('n_layers', 1, 4)
    units = trial.suggest_int('units', 32, 256)
    dropout = trial.suggest_float('dropout', 0.1, 0.5)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True)
    model = keras.Sequential()
    model.add(keras.layers.Input(shape=(X_train.shape[1],)))
    for i in range(n_layers):
        model.add(keras.layers.Dense(units, activation='relu'))
        model.add(keras.layers.Dropout(dropout))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model
def objective(trial):
    model = create_model(trial)
    # Training hyperparameters
    batch_size = trial.suggest_int('batch_size', 16, 128)
    # Pruning callback (stops unpromising trials early;
    # in recent Optuna versions this lives in the separate optuna-integration package)
    pruning_callback = optuna.integration.TFKerasPruningCallback(
        trial, 'val_accuracy'
    )
    history = model.fit(
        X_train, y_train,
        epochs=50,
        batch_size=batch_size,
        validation_split=0.2,
        callbacks=[pruning_callback],
        verbose=0
    )
    return max(history.history['val_accuracy'])
study = optuna.create_study(
    direction='maximize',
    pruner=optuna.pruners.MedianPruner()  # Early-stops bad trials
)
study.optimize(objective, n_trials=50)
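After optimization, it is worth checking how many trials the pruner actually cut short; for example:

from optuna.trial import TrialState

pruned = [t for t in study.trials if t.state == TrialState.PRUNED]
complete = [t for t in study.trials if t.state == TrialState.COMPLETE]
print(f"Pruned trials: {len(pruned)}, completed trials: {len(complete)}")
print(f"Best val_accuracy: {study.best_value:.4f}")
print(f"Best hyperparameters: {study.best_params}")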
XGBoost Tuning Guide
import xgboost as xgb
import optuna
from sklearn.model_selection import cross_val_score

def objective(trial):
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'booster': 'gbtree',
        # Most important parameters
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        # Regularization
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10, log=True),
        # Tree-specific
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }
    model = xgb.XGBClassifier(**params, random_state=42, n_jobs=-1)
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
    return score.mean()
# Tuning order recommendation:
# 1. n_estimators, learning_rate (most important)
# 2. max_depth, min_child_weight
# 3. subsample, colsample_bytree
# 4. reg_alpha, reg_lambda (regularization)
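The objective above still needs a study to drive it. One way to run it and refit the winner (note that the fixed keys 'objective', 'eval_metric', and 'booster' are not suggested by the trial, so they are not in study.best_params and must be re-added):

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
print(f"Best AUC: {study.best_value:.4f}")

# Refit the final model with the winning hyperparameters
final_model = xgb.XGBClassifier(
    **study.best_params,
    objective='binary:logistic',
    eval_metric='auc',
    random_state=42,
    n_jobs=-1
)
final_model.fit(X_train, y_train)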
Practical Tips
- Start with defaults: Get a baseline before tuning
- Focus on important parameters: Not all hyperparameters matter equally
- Use Random Search first: Quick exploration of the search space
- Then Bayesian optimization: For fine-tuning promising regions
- Always use cross-validation: Single train-test split is unreliable
- Set a time budget: Diminishing returns kick in after a certain point
- Log your experiments: Use MLflow or Weights & Biases (see the sketch below)
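For the last tip, a minimal MLflow sketch (assuming mlflow is installed and an Optuna study like the ones above):

import mlflow

with mlflow.start_run(run_name="rf-tuning"):
    mlflow.log_params(study.best_params)          # winning hyperparameters
    mlflow.log_metric("cv_f1", study.best_value)  # best cross-validated score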
Common Parameter Ranges
# Random Forest
param_dist_rf = {
    'n_estimators': (100, 500),
    'max_depth': (5, 30),
    'min_samples_split': (2, 20),
    'min_samples_leaf': (1, 10),
    'max_features': ['sqrt', 'log2', 0.3, 0.5, 0.7]
}
# XGBoost / LightGBM
param_dist_xgb = {
    'learning_rate': (0.01, 0.3),
    'n_estimators': (100, 1000),
    'max_depth': (3, 10),
    'min_child_weight': (1, 10),
    'subsample': (0.5, 1.0),
    'colsample_bytree': (0.5, 1.0),
    'reg_alpha': (1e-8, 10),   # Log scale
    'reg_lambda': (1e-8, 10)   # Log scale
}
# Neural Networks
param_dist_nn = {
    'learning_rate': (1e-5, 1e-2),  # Log scale
    'batch_size': [16, 32, 64, 128],
    'hidden_layers': (1, 5),
    'units_per_layer': (32, 512),
    'dropout': (0.0, 0.5)
}
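The ranges marked "Log scale" should not be sampled uniformly; with RandomizedSearchCV, scipy's loguniform distribution handles this, for example:

from scipy.stats import loguniform

# Replace the log-scale tuples with proper log-uniform distributions
param_dist_xgb['reg_alpha'] = loguniform(1e-8, 10)
param_dist_xgb['reg_lambda'] = loguniform(1e-8, 10)
param_dist_nn['learning_rate'] = loguniform(1e-5, 1e-2)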