Development version
This is the latest (dev) documentation. It may contain unreleased features or breaking changes. For the stable release, use version 0.13.
Feature Selection With Noisy Data
This example adds synthetic noise features to the Iris dataset to make feature selection realistic. A useful selector should keep a small subset of the original measurements and discard most noise columns.
Setup
python
import warnings
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn_genetic import (
EvolutionConfig, GAFeatureSelectionCV, OptimizationConfig, PopulationConfig, RuntimeConfig,
)
from sklearn_genetic.callbacks import ConsecutiveStopping, DeltaThreshold, TimerStopping
from sklearn_genetic.schedules import ExponentialAdapter, InverseAdapter
# Suppress UserWarning messages raised while the example runs.
warnings.filterwarnings("ignore", category=UserWarning)
# Seed reused wherever this example accepts a random_state or creates a generator.
RANDOM_STATE = 42
rng = np.random.default_rng(RANDOM_STATE)
Add Noise Features
python
# Load Iris as pandas objects so the feature columns keep their names.
iris = load_iris(as_frame=True)
X_original, y = iris.data, iris.target

# Twelve columns drawn from the seeded generator, one row per Iris sample.
noise = pd.DataFrame(
    data=rng.normal(size=(len(X_original), 12)),
    columns=[f"noise_{i:02d}" for i in range(12)],
)

# Place the noise columns to the right of the original measurements.
X = pd.concat([X_original, noise], axis="columns")

# Stratified 70/30 hold-out split, plus the 3-fold CV object used later.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=RANDOM_STATE,
    stratify=y,
)
cv = StratifiedKFold(n_splits=3, random_state=RANDOM_STATE, shuffle=True)

print(f"Original features: {X_original.shape[1]}")  # 4
print(f"Noise features: {noise.shape[1]}")  # 12
print(f"Total features: {X.shape[1]}") # 16
Baseline With All Features
python
def make_svc_pipeline():
    """Build a new, unfitted scaling + SVC pipeline.

    Every call constructs fresh step objects, so the pipeline returned
    here is independent of any pipeline returned by an earlier call.

    Returns:
        Pipeline: a ``StandardScaler`` step named ``"scaler"`` followed by
        an RBF-kernel ``SVC`` step named ``"svc"`` seeded with
        ``RANDOM_STATE``.
    """
    return Pipeline([
        ("scaler", StandardScaler()),
        ("svc", SVC(kernel="rbf", C=2.0, gamma="scale", random_state=RANDOM_STATE)),
    ])
def evaluate(estimator, X_eval, y_eval):
    """Score a fitted estimator on an evaluation set.

    Args:
        estimator: Fitted object exposing ``predict(X)``.
        X_eval: Feature data passed to ``estimator.predict``.
        y_eval: True labels compared against the predictions.

    Returns:
        dict: ``{"accuracy": ..., "balanced_accuracy": ...}`` computed with
        ``accuracy_score`` and ``balanced_accuracy_score``.
    """
    predictions = estimator.predict(X_eval)
    return {
        "accuracy": accuracy_score(y_eval, predictions),
        "balanced_accuracy": balanced_accuracy_score(y_eval, predictions),
    }
# Fit the reference pipeline on every column (original + noise);
# Pipeline.fit returns the pipeline itself, so the call can be chained.
baseline = make_svc_pipeline().fit(X_train, y_train)

# Held-out scores that the feature selector is compared against later.
baseline_metrics = evaluate(baseline, X_test, y_test)
print(baseline_metrics)
# {'accuracy': 0.822, 'balanced_accuracy': 0.822}
Configure GAFeatureSelectionCV
GAFeatureSelectionCV searches over binary masks — 1 means the feature is selected, 0 means it is excluded.
python
# Each piece is built in the same order the nested call would evaluate it,
# then handed to the selector by name.
search_estimator = make_svc_pipeline()

# Schedule objects for the crossover and mutation probabilities.
crossover_schedule = ExponentialAdapter(
    initial_value=0.8, end_value=0.4, adaptive_rate=0.15
)
mutation_schedule = InverseAdapter(
    initial_value=0.30, end_value=0.08, adaptive_rate=0.25
)

evolution = EvolutionConfig(
    population_size=20,
    generations=15,
    crossover_probability=crossover_schedule,
    mutation_probability=mutation_schedule,
    tournament_size=3,
    elitism=True,
    keep_top_k=3,
)
population = PopulationConfig(initializer="smart")
runtime = RuntimeConfig(
    n_jobs=-1,
    parallel_backend="auto",
    use_cache=True,
    verbose=True,
)
optimization = OptimizationConfig(
    # local search
    local_search=True,
    local_search_top_k=2,
    local_search_steps=1,
    local_search_radius=0.15,
    # diversity control
    diversity_control=True,
    diversity_threshold=0.30,
    diversity_stagnation_generations=3,
    diversity_mutation_boost=1.8,
    random_immigrants_fraction=0.10,
    # fitness sharing
    fitness_sharing=True,
    sharing_radius=0.40,
)

selector = GAFeatureSelectionCV(
    estimator=search_estimator,
    cv=cv,
    scoring="balanced_accuracy",
    max_features=6,  # prefer compact subsets
    evolution_config=evolution,
    population_config=population,
    runtime_config=runtime,
    optimization_config=optimization,
)

# Callbacks handed to fit(); two of them monitor the "fitness_best" metric.
callbacks = [
    DeltaThreshold(threshold=0.001, generations=5, metric="fitness_best"),
    ConsecutiveStopping(generations=7, metric="fitness_best"),
    TimerStopping(total_seconds=90),
]
selector.fit(X_train, y_train, callbacks=callbacks)
Inspect Selected Features
The fitted selector exposes support_, just like sklearn feature selectors.
python
# Names of the training columns kept by the fitted selector's mask.
selected_features = X_train.columns[selector.support_]

# Tag every training column by whether it came from the original Iris frame.
feature_kinds = [
    "original" if name in X_original.columns else "noise"
    for name in X_train.columns
]

# One row per column: its name, whether it was selected, and its origin.
selected_summary = pd.DataFrame(
    {
        "feature": X_train.columns,
        "selected": selector.support_,
        "kind": feature_kinds,
    }
)

print(f"Selected {len(selected_features)} of {X_train.shape[1]} features")
print(selected_summary[selected_summary["selected"]])
Telemetry
python
# Evaluation mechanics
print(selector.fit_stats_)
# {'evaluated_candidates': 182, 'cache_hits': 0, 'random_immigrants': 16, ...}

# Per-generation stats
history = pd.DataFrame(selector.history)

# Columns of interest for the per-generation table.
telemetry_cols = [
    "gen",
    "fitness",
    "fitness_max",
    "fitness_std",
    "unique_individual_ratio",
    "genotype_diversity",
    "stagnation_generations",
]
print(history[[c for c in telemetry_cols if c in history.columns]])
python
import matplotlib.pyplot as plt
# One line per fitness column, plotted against the generation number.
ax = history.plot(
    x="gen",
    y=["fitness_best", "fitness_max", "fitness"],
    marker="o",
    figsize=(8, 4),
)
# Title and axis labels applied in a single call.
ax.set(
    title="Feature-selection fitness over generations",
    xlabel="Generation",
    ylabel="Balanced accuracy",
)
plt.show()
Compare Results
python
# Score the fitted selector on the held-out split with the same helper
# that produced baseline_metrics.
selector_metrics = evaluate(selector, X_test, y_test)

# Bind and print the table explicitly: a bare DataFrame expression that is
# not the final statement is discarded in a script and in a notebook cell.
comparison = pd.DataFrame(
    [baseline_metrics, selector_metrics],
    index=["all_features", "selected_features"],
)
print(comparison)
# selected_features typically improves on the noisy-input baseline
print(classification_report(y_test, selector.predict(X_test), target_names=iris.target_names))
Practical Notes
- `max_features` is a useful way to make the selection prefer compact solutions.
- If many candidates are skipped as invalid, increase `max_features` or reduce mutation strength.
- If diversity drops quickly, use `diversity_control`, `random_immigrants_fraction`, and `fitness_sharing` before simply adding more generations.
- Always compare with an all-feature baseline. A smaller selected subset is only useful if quality remains acceptable.
See Also
- GAFeatureSelectionCV API — full parameter reference
- Advanced Optimizer Control — diversity control, fitness sharing, local search
- Adaptive Schedules — how `ExponentialAdapter` and `InverseAdapter` work
