Skip to content

Pipeline Regression Tuning

This example shows how to tune a scikit-learn Pipeline with GASearchCV. Pipeline parameters use the standard sklearn double-underscore syntax, step__param: for example, regressor__max_depth targets the max_depth parameter of the pipeline step named regressor.

Setup

python
import warnings
from pprint import pprint

import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn_genetic import (
    EvolutionConfig, GASearchCV, OptimizationConfig, PopulationConfig, RuntimeConfig,
)
from sklearn_genetic.callbacks import ConsecutiveStopping, DeltaThreshold, TimerStopping
from sklearn_genetic.plots import plot_fitness_evolution, plot_search_space
from sklearn_genetic.schedules import ExponentialAdapter, InverseAdapter
from sklearn_genetic.space import Categorical, Continuous, Integer

# Silence every UserWarning for the rest of the session
# (presumably to keep the tutorial output readable).
warnings.filterwarnings("ignore", category=UserWarning)

# One seed shared by the train/test split, the CV splitter, and the regressor.
RANDOM_STATE = 42

# as_frame=True requests pandas objects; features and target are read from
# the returned bunch's .data and .target attributes.
data = load_diabetes(as_frame=True)
X, y = data.data, data.target

# Hold out 30% of the rows; they are used only for the final comparison.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=RANDOM_STATE
)
# Shuffled 4-fold splitter, later handed to GASearchCV via cv=cv.
cv = KFold(n_splits=4, shuffle=True, random_state=RANDOM_STATE)

print(f"Training shape: {X_train.shape}")  # (309, 10)
print(f"Test shape: {X_test.shape}")       # (133, 10)

Baseline Pipeline

python
def make_pipeline(**regressor_kwargs):
    """Build the two-step regression pipeline used throughout this example.

    The steps are named ``"scaler"`` and ``"regressor"``; those names are the
    prefixes used in ``step__param`` keys such as ``regressor__max_depth``.

    Args:
        **regressor_kwargs: Forwarded unchanged to
            ``GradientBoostingRegressor``. ``random_state`` is already set to
            ``RANDOM_STATE`` here, so passing it again raises ``TypeError``.

    Returns:
        An unfitted ``Pipeline`` of ``StandardScaler`` followed by
        ``GradientBoostingRegressor``.
    """
    scaling_step = ("scaler", StandardScaler())
    model = GradientBoostingRegressor(random_state=RANDOM_STATE, **regressor_kwargs)
    steps = [scaling_step, ("regressor", model)]
    return Pipeline(steps)


def regression_metrics(estimator, X_eval, y_eval):
    """Score a fitted estimator on an evaluation set.

    Args:
        estimator: Any fitted object exposing ``predict(X)``.
        X_eval: Feature matrix to predict on.
        y_eval: True target values for ``X_eval``.

    Returns:
        A dict with the keys ``"r2"``, ``"rmse"`` and ``"mae"``, in that order.
    """
    y_pred = estimator.predict(X_eval)
    # RMSE is taken as the square root of the MSE.
    mse = mean_squared_error(y_eval, y_pred)

    scores = {}
    scores["r2"] = r2_score(y_eval, y_pred)
    scores["rmse"] = mse ** 0.5
    scores["mae"] = mean_absolute_error(y_eval, y_pred)
    return scores


# Reference point: the pipeline with no regressor overrides, fit on the
# training split only.
baseline = make_pipeline()
baseline.fit(X_train, y_train)
# Holdout metrics; these are compared with the tuned pipeline's metrics later.
baseline_metrics = regression_metrics(baseline, X_test, y_test)
print(baseline_metrics)
# Example output:
# {'r2': 0.430, 'rmse': 55.46, 'mae': 44.72}

Pipeline Search Space

Parameter names use the sklearn step__param convention, so every key below is prefixed with regressor__, the name of the pipeline step being tuned.

python
# Search space for the "regressor" step of the pipeline. Each key is
# "<step name>__<parameter>", and each value is a sklearn_genetic.space
# dimension describing what may be sampled for that parameter.
param_grid = {
    "regressor__n_estimators": Integer(40, 180),
    # Learning rate is sampled with distribution="log-uniform".
    "regressor__learning_rate": Continuous(0.01, 0.20, distribution="log-uniform"),
    "regressor__max_depth": Integer(1, 4),
    "regressor__min_samples_leaf": Integer(1, 12),
    "regressor__subsample": Continuous(0.65, 1.0),
    # Categorical dimension: one of the listed loss names is chosen.
    "regressor__loss": Categorical(["squared_error", "absolute_error", "huber"]),
}

Regression scorers

For metrics where smaller is better, use one of sklearn's negated scorers, such as "neg_root_mean_squared_error". The GA maximizes the fitness value, and negated RMSE increases as RMSE decreases, so maximizing the fitness minimizes the error.

Configure GASearchCV

python
# Build the search. Settings are grouped into four config objects
# (evolution, population, runtime, optimization).
# NOTE(review): the meaning of individual config fields is defined by
# sklearn_genetic, not by this file; confirm against its documentation
# before changing values.
search = GASearchCV(
    # Fresh, unfitted pipeline; its step names must match the param_grid keys.
    estimator=make_pipeline(),
    param_grid=param_grid,
    # Negated RMSE, so larger is better; paired with criteria="max".
    scoring="neg_root_mean_squared_error",
    criteria="max",
    cv=cv,
    evolution_config=EvolutionConfig(
        population_size=12,
        generations=10,
        # Both probabilities are schedules rather than constants: each moves
        # from initial_value to a lower end_value (0.8 -> 0.4, 0.25 -> 0.08).
        # NOTE(review): the exact curve depends on the adapter class and
        # adaptive_rate; see sklearn_genetic.schedules.
        crossover_probability=ExponentialAdapter(initial_value=0.8, end_value=0.4, adaptive_rate=0.15),
        mutation_probability=InverseAdapter(initial_value=0.25, end_value=0.08, adaptive_rate=0.25),
        tournament_size=3,
        elitism=True,
        keep_top_k=3,
    ),
    population_config=PopulationConfig(
        initializer="smart",
        # One hand-picked configuration. Its keys mirror param_grid and every
        # value lies inside the range or choice list declared there.
        warm_start_configs=[{
            "regressor__n_estimators": 100,
            "regressor__learning_rate": 0.05,
            "regressor__max_depth": 2,
            "regressor__min_samples_leaf": 4,
            "regressor__subsample": 0.85,
            "regressor__loss": "squared_error",
        }],
    ),
    runtime_config=RuntimeConfig(
        n_jobs=-1,
        parallel_backend="auto",
        use_cache=True,
        verbose=True,
        return_train_score=False,
    ),
    optimization_config=OptimizationConfig(
        local_search=True,
        local_search_top_k=2,
        local_search_steps=1,
        local_search_radius=0.20,
        diversity_control=True,
        diversity_threshold=0.30,
        diversity_stagnation_generations=3,
        diversity_mutation_boost=1.8,
        random_immigrants_fraction=0.10,
        fitness_sharing=True,
        sharing_radius=0.40,
    ),
)

# Three stopping callbacks passed to fit().
# NOTE(review): presumably the search halts as soon as any one of them
# triggers; confirm in the sklearn_genetic.callbacks documentation.
callbacks = [
    DeltaThreshold(threshold=0.01, generations=5, metric="fitness_best"),
    ConsecutiveStopping(generations=7, metric="fitness_best"),
    TimerStopping(total_seconds=120),
]

# The search sees only the training split; the test split stays untouched.
search.fit(X_train, y_train, callbacks=callbacks)

Evaluate Predictions

GASearchCV refits the best pipeline automatically, so you can call predict directly on the search object.

python
# best_score_ is on the scorer's scale: negated RMSE, because the search was
# configured with scoring="neg_root_mean_squared_error".
print("Best CV negative RMSE:", round(search.best_score_, 4))
pprint(search.best_params_)

# The search object is passed where an estimator is expected;
# regression_metrics only calls .predict() on it.
ga_metrics = regression_metrics(search, X_test, y_test)
# Bare expression: the table is displayed only in a notebook or REPL.
# When running this as a script, wrap the expression in print().
pd.DataFrame([baseline_metrics, ga_metrics], index=["baseline", "ga_pipeline"])

Search Cost and Telemetry

python
# Summary counters collected during fit.
print(search.fit_stats_)
# Example output (truncated):
# {'evaluated_candidates': 134, 'cache_hits': 1, 'random_immigrants': 3,
#  'local_refinement_candidates': 2, ...}

# Per-generation log as a DataFrame.
history = pd.DataFrame(search.history)
cols = ["gen", "fitness", "fitness_max", "unique_individual_ratio",
        "genotype_diversity", "stagnation_generations"]
# Keep only the columns that actually exist, so a column missing from the
# history does not raise KeyError; .tail() shows the last rows.
print(history[[c for c in cols if c in history.columns]].tail())
python
import matplotlib.pyplot as plt

# Both plot helpers take the fitted search object; plt.show() renders
# each figure in turn.
plot_fitness_evolution(search)
plt.show()

# Show how two parameters were sampled
plot_search_space(search, features=["regressor__learning_rate", "regressor__max_depth"])
plt.show()

Practical Notes

  • Use pipeline parameter names exactly as sklearn expects them (step__param).
  • For regression error metrics, where smaller is better, use sklearn's negated scorers such as "neg_root_mean_squared_error" so that a larger fitness value always means a better model.
  • Compare holdout metrics, not only CV fitness.
  • If the search revisits many candidates, inspect cache_hits in fit_stats_ and consider stronger diversity controls or a larger search space.

See Also

Released under the MIT License.