Feature Selection With Noisy Data
This example adds synthetic noise features to the Iris dataset to make feature selection realistic. A useful selector should keep a small subset of the original measurements and discard most noise columns.
Setup
python
import warnings
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn_genetic import (
EvolutionConfig, GAFeatureSelectionCV, OptimizationConfig, PopulationConfig, RuntimeConfig,
)
from sklearn_genetic.callbacks import ConsecutiveStopping, DeltaThreshold, TimerStopping
from sklearn_genetic.schedules import ExponentialAdapter, InverseAdapter
warnings.filterwarnings("ignore", category=UserWarning)
RANDOM_STATE = 42
rng = np.random.default_rng(RANDOM_STATE)Add Noise Features
python
iris = load_iris(as_frame=True)
X_original = iris.data
y = iris.target
noise = pd.DataFrame(
rng.normal(size=(X_original.shape[0], 12)),
columns=[f"noise_{i:02d}" for i in range(12)],
)
X = pd.concat([X_original, noise], axis=1)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.30, stratify=y, random_state=RANDOM_STATE
)
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)
print(f"Original features: {X_original.shape[1]}") # 4
print(f"Noise features: {noise.shape[1]}") # 12
print(f"Total features: {X.shape[1]}") # 16Baseline With All Features
python
def make_svc_pipeline():
return Pipeline([
("scaler", StandardScaler()),
("svc", SVC(kernel="rbf", C=2.0, gamma="scale", random_state=RANDOM_STATE)),
])
def evaluate(estimator, X_eval, y_eval):
predictions = estimator.predict(X_eval)
return {
"accuracy": accuracy_score(y_eval, predictions),
"balanced_accuracy": balanced_accuracy_score(y_eval, predictions),
}
baseline = make_svc_pipeline()
baseline.fit(X_train, y_train)
baseline_metrics = evaluate(baseline, X_test, y_test)
print(baseline_metrics)
# {'accuracy': 0.822, 'balanced_accuracy': 0.822}Configure GAFeatureSelectionCV
GAFeatureSelectionCV searches over binary masks — 1 means the feature is selected, 0 means it is excluded.
python
selector = GAFeatureSelectionCV(
estimator=make_svc_pipeline(),
cv=cv,
scoring="balanced_accuracy",
max_features=6, # prefer compact subsets
evolution_config=EvolutionConfig(
population_size=20,
generations=15,
crossover_probability=ExponentialAdapter(initial_value=0.8, end_value=0.4, adaptive_rate=0.15),
mutation_probability=InverseAdapter(initial_value=0.30, end_value=0.08, adaptive_rate=0.25),
tournament_size=3,
elitism=True,
keep_top_k=3,
),
population_config=PopulationConfig(initializer="smart"),
runtime_config=RuntimeConfig(n_jobs=-1, parallel_backend="auto", use_cache=True, verbose=True),
optimization_config=OptimizationConfig(
local_search=True,
local_search_top_k=2,
local_search_steps=1,
local_search_radius=0.15,
diversity_control=True,
diversity_threshold=0.30,
diversity_stagnation_generations=3,
diversity_mutation_boost=1.8,
random_immigrants_fraction=0.10,
fitness_sharing=True,
sharing_radius=0.40,
),
)
callbacks = [
DeltaThreshold(threshold=0.001, generations=5, metric="fitness_best"),
ConsecutiveStopping(generations=7, metric="fitness_best"),
TimerStopping(total_seconds=90),
]
selector.fit(X_train, y_train, callbacks=callbacks)Inspect Selected Features
The fitted selector exposes support_, just like sklearn feature selectors.
python
selected_features = X_train.columns[selector.support_]
selected_summary = pd.DataFrame({
"feature": X_train.columns,
"selected": selector.support_,
"kind": ["original" if c in X_original.columns else "noise" for c in X_train.columns],
})
print(f"Selected {len(selected_features)} of {X_train.shape[1]} features")
print(selected_summary[selected_summary["selected"]])Telemetry
python
# Evaluation mechanics
print(selector.fit_stats_)
# {'evaluated_candidates': 182, 'cache_hits': 0, 'random_immigrants': 16, ...}
# Per-generation stats
history = pd.DataFrame(selector.history)
telemetry_cols = [
"gen", "fitness", "fitness_max", "fitness_std",
"unique_individual_ratio", "genotype_diversity", "stagnation_generations",
]
print(history[[c for c in telemetry_cols if c in history.columns]])python
import matplotlib.pyplot as plt
ax = history.plot(
x="gen", y=["fitness_best", "fitness_max", "fitness"],
marker="o", figsize=(8, 4),
)
ax.set_title("Feature-selection fitness over generations")
ax.set_xlabel("Generation")
ax.set_ylabel("Balanced accuracy")
plt.show()Compare Results
python
selector_metrics = evaluate(selector, X_test, y_test)
pd.DataFrame(
[baseline_metrics, selector_metrics],
index=["all_features", "selected_features"],
)
# selected_features typically improves on the noisy-input baseline
print(classification_report(y_test, selector.predict(X_test), target_names=iris.target_names))Practical Notes
max_featuresis a useful way to make the selection prefer compact solutions.- If many candidates are skipped as invalid, increase
max_featuresor reduce mutation strength. - If diversity drops quickly, use
diversity_control,random_immigrants_fraction, andfitness_sharingbefore simply adding more generations. - Always compare with an all-feature baseline. A smaller selected subset is only useful if quality remains acceptable.
See Also
- GAFeatureSelectionCV API — full parameter reference
- Advanced Optimizer Control — diversity control, fitness sharing, local search
- Adaptive Schedules — how
ExponentialAdapterandInverseAdapterwork
