SelectByFIRST
We now demonstrate how to use FIRST for factor selection within a sklearn.pipeline.Pipeline via the SelectByFIRST class. If you have not installed pyfirst, please uncomment and run %pip install pyfirst below before proceeding.
# %pip install pyfirst
Imports
import numpy as np
from pyfirst import SelectByFIRST
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing, load_breast_cancer
Regression
Fetch Data
# Load the California housing data; the target is modelled on the log scale.
housing = fetch_california_housing()
X, y = housing.data, np.log(housing.target)
# Hold out 20% of the rows as a test set, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=43,
)
Build Pipeline and Train
# Chain FIRST factor selection with a random forest regressor and fit both
# steps on the training split.
pipe = Pipeline(
    steps=[
        ("selector", SelectByFIRST(regression=True, random_state=43)),
        ("estimator", RandomForestRegressor(random_state=43)),
    ],
).fit(X_train, y_train)
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
/tmp/ipykernel_780/33790229.py in ?()
----> 1 pipe = Pipeline([
2 ('selector', SelectByFIRST(regression=True,random_state=43)),
3 ('estimator', RandomForestRegressor(random_state=43))
4 ]).fit(X_train, y_train)
~/checkouts/readthedocs.org/user_builds/pyfirst/envs/stable/lib/python3.9/site-packages/sklearn/base.py in ?(estimator, *args, **kwargs)
1469 skip_parameter_validation=(
1470 prefer_skip_nested_validation or global_skip_validation
1471 )
1472 ):
-> 1473 return fit_method(estimator, *args, **kwargs)
~/checkouts/readthedocs.org/user_builds/pyfirst/envs/stable/lib/python3.9/site-packages/sklearn/pipeline.py in ?(self, X, y, **params)
465 self : object
466 Pipeline with fitted steps.
467 """
468 routed_params = self._check_method_params(method="fit", props=params)
--> 469 Xt = self._fit(X, y, routed_params)
470 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
471 if self._final_estimator != "passthrough":
472 last_step_params = routed_params[self.steps[-1][0]]
~/checkouts/readthedocs.org/user_builds/pyfirst/envs/stable/lib/python3.9/site-packages/sklearn/pipeline.py in ?(self, X, y, routed_params)
402 cloned_transformer = transformer
403 else:
404 cloned_transformer = clone(transformer)
405 # Fit or load from cache the current transformer
--> 406 X, fitted_transformer = fit_transform_one_cached(
407 cloned_transformer,
408 X,
409 y,
~/checkouts/readthedocs.org/user_builds/pyfirst/envs/stable/lib/python3.9/site-packages/joblib/memory.py in ?(self, *args, **kwargs)
311 def __call__(self, *args, **kwargs):
--> 312 return self.func(*args, **kwargs)
~/checkouts/readthedocs.org/user_builds/pyfirst/envs/stable/lib/python3.9/site-packages/sklearn/pipeline.py in ?(transformer, X, y, weight, message_clsname, message, params)
1308 with _print_elapsed_time(message_clsname, message):
1309 if hasattr(transformer, "fit_transform"):
1310 res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
1311 else:
-> 1312 res = transformer.fit(X, y, **params.get("fit", {})).transform(
1313 X, **params.get("transform", {})
1314 )
1315
~/checkouts/readthedocs.org/user_builds/pyfirst/envs/stable/lib/python3.9/site-packages/sklearn/utils/_set_output.py in ?(self, X, *args, **kwargs)
311 @wraps(f)
312 def wrapped(self, X, *args, **kwargs):
--> 313 data_to_wrap = f(self, X, *args, **kwargs)
314 if isinstance(data_to_wrap, tuple):
315 # only wrap the first output for cross decomposition
316 return_tuple = (
~/checkouts/readthedocs.org/user_builds/pyfirst/envs/stable/lib/python3.9/site-packages/sklearn/base.py in ?(self, X, y, **fit_params)
1097 # fit method of arity 1 (unsupervised transformation)
1098 return self.fit(X, **fit_params).transform(X)
1099 else:
1100 # fit method of arity 2 (supervised transformation)
-> 1101 return self.fit(X, y, **fit_params).transform(X)
~/checkouts/readthedocs.org/user_builds/pyfirst/envs/stable/lib/python3.9/site-packages/pyfirst/pyfirst.py in ?(self, X, y, n_mc, twin_mc)
659
660 if not self.regression:
661 assert np.unique(y).size == 2, f"Only binary classification is supported by FIRST."
662
--> 663 self.importance_ = FIRST(
664 X = X,
665 y = y,
666 n_knn = self.n_knn,
~/checkouts/readthedocs.org/user_builds/pyfirst/envs/stable/lib/python3.9/site-packages/pyfirst/pyfirst.py in ?(X, y, n_knn, approx_knn, n_mc, twin_mc, rescale, n_forward, n_jobs, random_state, verbose)
457 candidate = [i for i in factor_non_constant if i not in subset]
458 while len(candidate) > 0:
459 # compute total Sobol' effect for -x (x for current subset)
460 seeds = rng.randint(1e9, size=len(candidate))
--> 461 nx_var = Parallel(n_jobs=n_jobs,prefer='threads')(delayed(_exp_var_knn)(
462 X = X,
463 y = y,
464 subset = subset + [candidate[i]],
~/checkouts/readthedocs.org/user_builds/pyfirst/envs/stable/lib/python3.9/site-packages/joblib/parallel.py in ?(self, iterable)
1914 # If n_jobs==1, run the computation sequentially and return
1915 # immediately to avoid overheads.
1916 output = self._get_sequential_output(iterable)
1917 next(output)
-> 1918 return output if self.return_generator else list(output)
1919
1920 # Let's create an ID that uniquely identifies the current call. If the
1921 # call is interrupted early and that the same instance is immediately
~/checkouts/readthedocs.org/user_builds/pyfirst/envs/stable/lib/python3.9/site-packages/joblib/parallel.py in ?(self, iterable)
1857 finally:
1858 self.print_progress()
1859 self._running = False
1860 self._iterating = False
-> 1861 self._original_iterator = None
~/checkouts/readthedocs.org/user_builds/pyfirst/envs/stable/lib/python3.9/site-packages/pyfirst/pyfirst.py in ?(X, y, subset, factor_nunique, n_knn, approx_knn, n_mc, twin_mc, random_state)
112 else:
113 query_ind = rng.permutation(np.arange(n))[:n_mc]
114 else:
115 query_ind = np.arange(n)
--> 116 nn_index = _get_knn(X, X[query_ind,:], k=n_knn, approximate=approx_knn)
117 ev = np.mean(np.var(y[nn_index],ddof=1,axis=1))
118
119 return ev
~/checkouts/readthedocs.org/user_builds/pyfirst/envs/stable/lib/python3.9/site-packages/pyfirst/pyfirst.py in ?(data, query, k, approximate)
39 nn_engine.nprobe = 10
40 else:
41 nn_engine = faiss.IndexFlatL2(data.shape[1])
42 nn_engine.add(data)
---> 43 _, nn_index = nn_engine.search(query, k)
44 return nn_index
~/checkouts/readthedocs.org/user_builds/pyfirst/envs/stable/lib/python3.9/site-packages/faiss/class_wrappers.py in ?(self, x, k, params, D, I)
339 I = np.empty((n, k), dtype=np.int64)
340 else:
341 assert I.shape == (n, k)
342
--> 343 self.search_c(n, swig_ptr(x), k, swig_ptr(D), swig_ptr(I), params)
344 return D, I
~/checkouts/readthedocs.org/user_builds/pyfirst/envs/stable/lib/python3.9/site-packages/faiss/swigfaiss_avx512.py in ?(self, n, x, k, distances, labels, params)
2276 def search(self, n, x, k, distances, labels, params=None):
-> 2277 return _swigfaiss_avx512.IndexFlat_search(self, n, x, k, distances, labels, params)
KeyboardInterrupt:
Test \(R^2\) vs Full Model
# Test-set R^2 of the pipeline (FIRST selection followed by random forest).
pipe.score(X_test, y_test)
0.8536755908380326
# Baseline: the same random forest fitted on all factors, without selection.
full = RandomForestRegressor(random_state=43).fit(X_train, y_train)
# Test-set R^2 of the baseline, for comparison with the pipeline score.
full.score(X_test, y_test)
0.845913188558511
# Per-factor importance computed by the fitted FIRST selector step.
pipe['selector'].get_feature_importance()
array([0.00749229, 0. , 0. , 0. , 0. ,
0.01008637, 0.13938932, 0.15473464])
The random forest fitted on the 4 factors identified by FIRST achieves a test \(R^2\) comparable to that of the random forest fitted on the entire set of factors.
Binary Classification
Fetch Data
# Load the breast cancer data set used for the binary classification example.
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target
# Hold out 20% of the rows as a test set, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=43,
)
Build Pipeline and Train
# Chain FIRST factor selection (classification mode) with a random forest
# classifier and fit both steps on the training split.
pipe = Pipeline(
    steps=[
        ("selector", SelectByFIRST(regression=False, random_state=43)),
        ("estimator", RandomForestClassifier(random_state=43)),
    ],
).fit(X_train, y_train)
Test Accuracy vs Full Model
# Test-set accuracy of the pipeline (FIRST selection followed by random forest).
pipe.score(X_test, y_test)
0.9736842105263158
# Baseline: the same random forest fitted on all factors, without selection.
full = RandomForestClassifier(random_state=43).fit(X_train, y_train)
# Test-set accuracy of the baseline, for comparison with the pipeline score.
full.score(X_test, y_test)
0.9912280701754386
# Per-factor importance computed by the fitted FIRST selector step.
pipe['selector'].get_feature_importance()
array([0. , 0.04416144, 0.01358814, 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0.00339703, 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0.00339703, 0.0101911 , 0.14267543,
0. , 0. , 0. , 0. , 0. ])
The random forest fitted on the 6 factors identified by FIRST achieves a test accuracy comparable to that of the random forest fitted on the entire set of factors.