Pipeline Regression Tuning
This example shows how to tune a scikit-learn Pipeline with GASearchCV. Pipeline parameters use the standard sklearn double-underscore syntax: regressor__max_depth.
Setup
python
import warnings
from pprint import pprint
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn_genetic import (
EvolutionConfig, GASearchCV, OptimizationConfig, PopulationConfig, RuntimeConfig,
)
from sklearn_genetic.callbacks import ConsecutiveStopping, DeltaThreshold, TimerStopping
from sklearn_genetic.plots import plot_fitness_evolution, plot_search_space
from sklearn_genetic.schedules import ExponentialAdapter, InverseAdapter
from sklearn_genetic.space import Categorical, Continuous, Integer
# Suppress UserWarning messages so the example output stays readable.
warnings.filterwarnings("ignore", category=UserWarning)
# One seed shared by the train/test split, the CV splitter, and the regressor.
RANDOM_STATE = 42
# as_frame=True makes the loader hand back pandas objects.
data = load_diabetes(as_frame=True)
X, y = data.data, data.target
# Hold out 30% of the rows; they are only used for the final comparison.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=RANDOM_STATE
)
# Shuffled 4-fold splitter, passed to GASearchCV as cv= later in the example.
cv = KFold(n_splits=4, shuffle=True, random_state=RANDOM_STATE)
print(f"Training shape: {X_train.shape}") # (309, 10)
print(f"Test shape: {X_test.shape}") # (133, 10)
Baseline Pipeline
python
def make_pipeline(**regressor_kwargs):
    """Build the two-step regression pipeline used throughout this example.

    Args:
        **regressor_kwargs: Keyword arguments forwarded unchanged to
            ``GradientBoostingRegressor``. ``random_state`` is always set to
            ``RANDOM_STATE`` by this function.

    Returns:
        A ``Pipeline`` with a ``"scaler"`` step (``StandardScaler``) followed
        by a ``"regressor"`` step (``GradientBoostingRegressor``). The step
        names are what the ``regressor__*`` search-space keys refer to.
    """
    scaler = StandardScaler()
    regressor = GradientBoostingRegressor(random_state=RANDOM_STATE, **regressor_kwargs)
    steps = [("scaler", scaler), ("regressor", regressor)]
    return Pipeline(steps)
def regression_metrics(estimator, X_eval, y_eval):
    """Score an estimator's predictions on an evaluation set.

    Args:
        estimator: Any object exposing ``predict(X)``.
        X_eval: Features to predict on.
        y_eval: True targets matching ``X_eval``.

    Returns:
        A dict with keys ``"r2"``, ``"rmse"`` and ``"mae"`` (in that order).
    """
    y_pred = estimator.predict(X_eval)
    mse = mean_squared_error(y_eval, y_pred)

    metrics = {}
    metrics["r2"] = r2_score(y_eval, y_pred)
    # RMSE is derived as the square root of the mean squared error.
    metrics["rmse"] = mse ** 0.5
    metrics["mae"] = mean_absolute_error(y_eval, y_pred)
    return metrics
# Reference model: the pipeline with no regressor overrides, fitted on the
# training split only.
baseline = make_pipeline()
baseline.fit(X_train, y_train)
# Score on the held-out test split so it can be compared with the tuned model.
baseline_metrics = regression_metrics(baseline, X_test, y_test)
print(baseline_metrics)
# {'r2': 0.430, 'rmse': 55.46, 'mae': 44.72}
Pipeline Search Space
Parameter names use the sklearn step__param convention.
python
param_grid = {
"regressor__n_estimators": Integer(40, 180),
"regressor__learning_rate": Continuous(0.01, 0.20, distribution="log-uniform"),
"regressor__max_depth": Integer(1, 4),
"regressor__min_samples_leaf": Integer(1, 12),
"regressor__subsample": Continuous(0.65, 1.0),
"regressor__loss": Categorical(["squared_error", "absolute_error", "huber"]),
}
Regression scorers
For metrics where smaller is better, use sklearn's negative scorer: "neg_root_mean_squared_error". The GA maximizes the fitness value, so negative RMSE increases as RMSE decreases.
Configure GASearchCV
python
# Genetic-algorithm search over the pipeline's regressor hyperparameters.
search = GASearchCV(
    estimator=make_pipeline(),
    param_grid=param_grid,
    # The scorer returns negated RMSE, so it is paired with criteria="max".
    scoring="neg_root_mean_squared_error",
    criteria="max",
    cv=cv,
    evolution_config=EvolutionConfig(
        population_size=12,
        generations=10,
        # Both probabilities are given as schedule adapters rather than fixed
        # floats; presumably each moves from initial_value toward end_value
        # over the generations -- NOTE(review): confirm against the schedules docs.
        crossover_probability=ExponentialAdapter(initial_value=0.8, end_value=0.4, adaptive_rate=0.15),
        mutation_probability=InverseAdapter(initial_value=0.25, end_value=0.08, adaptive_rate=0.25),
        tournament_size=3,
        elitism=True,
        keep_top_k=3,
    ),
    population_config=PopulationConfig(
        initializer="smart",
        # One hand-picked starting candidate; every value lies inside the
        # corresponding param_grid range.
        warm_start_configs=[{
            "regressor__n_estimators": 100,
            "regressor__learning_rate": 0.05,
            "regressor__max_depth": 2,
            "regressor__min_samples_leaf": 4,
            "regressor__subsample": 0.85,
            "regressor__loss": "squared_error",
        }],
    ),
    runtime_config=RuntimeConfig(
        n_jobs=-1,
        parallel_backend="auto",
        use_cache=True,
        verbose=True,
        return_train_score=False,
    ),
    optimization_config=OptimizationConfig(
        # Local-search settings.
        local_search=True,
        local_search_top_k=2,
        local_search_steps=1,
        local_search_radius=0.20,
        # Diversity-control settings.
        diversity_control=True,
        diversity_threshold=0.30,
        diversity_stagnation_generations=3,
        diversity_mutation_boost=1.8,
        random_immigrants_fraction=0.10,
        # Fitness-sharing settings.
        fitness_sharing=True,
        sharing_radius=0.40,
    ),
)
# Callbacks handed to search.fit() right after this list.
callbacks = [
    DeltaThreshold(threshold=0.01, generations=5, metric="fitness_best"),
    ConsecutiveStopping(generations=7, metric="fitness_best"),
    TimerStopping(total_seconds=120),
]
search.fit(X_train, y_train, callbacks=callbacks)
Evaluate Predictions
GASearchCV refits the best pipeline automatically, so you can call predict directly on the search object.
python
# best_score_ is on the scorer's scale, i.e. negated RMSE from cross-validation.
print("Best CV negative RMSE:", round(search.best_score_, 4))
pprint(search.best_params_)
# The search object itself is passed as the estimator; regression_metrics
# only needs its predict() method.
ga_metrics = regression_metrics(search, X_test, y_test)
pd.DataFrame([baseline_metrics, ga_metrics], index=["baseline", "ga_pipeline"])
Search Cost and Telemetry
python
print(search.fit_stats_)
# Example output:
# {'evaluated_candidates': 134, 'cache_hits': 1, 'random_immigrants': 3,
# 'local_refinement_candidates': 2, ...}
# Per-generation search history as a DataFrame.
history = pd.DataFrame(search.history)
# Columns of interest; the print that follows keeps only those actually
# present in history.columns.
cols = ["gen", "fitness", "fitness_max", "unique_individual_ratio",
        "genotype_diversity", "stagnation_generations"]
print(history[[c for c in cols if c in history.columns]].tail())
Visualize the Search
python
import matplotlib.pyplot as plt
# Fitness-evolution plot for the fitted search object.
plot_fitness_evolution(search)
plt.show()
# Show how two parameters were sampled
plot_search_space(search, features=["regressor__learning_rate", "regressor__max_depth"])
plt.show()
Practical Notes
- Use pipeline parameter names exactly as sklearn expects them (step__param).
- For regression metrics where smaller is better, use sklearn's negated scorers such as "neg_root_mean_squared_error".
- Compare holdout metrics, not only CV fitness.
- If the search revisits many candidates, inspect cache_hits in fit_stats_ and consider stronger diversity controls or a larger search space.
See Also
- Pipeline Tuning Guide — pipeline parameter naming and step configuration
- Search Space API — Continuous, Integer, Categorical reference
- Plots API — plot_fitness_evolution and plot_search_space reference
