SelectByFIRST

We now demonstrate how to use FIRST for factor selection within a sklearn.pipeline.Pipeline via the SelectByFIRST class. If you have not installed pyfirst, please uncomment and run %pip install pyfirst below before proceeding.

# %pip install pyfirst

Imports

import numpy as np
from pyfirst import SelectByFIRST
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing, load_breast_cancer

Regression

Fetch Data

# Load the California housing data; the response used below is the natural
# log of the target.
housing = fetch_california_housing()
X, y = housing.data, np.log(housing.target)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=43,
)

Build Pipeline and Train

# Two-step pipeline: FIRST selects the factors, then a random forest
# regressor is fitted on the selector's output.
pipe = Pipeline(
    steps=[
        ("selector", SelectByFIRST(regression=True, random_state=43)),
        ("estimator", RandomForestRegressor(random_state=43)),
    ]
)
# Assign the result of fit so the notebook cell does not echo the pipeline.
pipe = pipe.fit(X_train, y_train)
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
/tmp/ipykernel_780/33790229.py in ?()
----> 1 pipe = Pipeline([
      2     ('selector', SelectByFIRST(regression=True,random_state=43)),
      3     ('estimator', RandomForestRegressor(random_state=43))
      4 ]).fit(X_train, y_train)

~/checkouts/readthedocs.org/user_builds/pyfirst/envs/stable/lib/python3.9/site-packages/sklearn/base.py in ?(estimator, *args, **kwargs)
   1469                 skip_parameter_validation=(
   1470                     prefer_skip_nested_validation or global_skip_validation
   1471                 )
   1472             ):
-> 1473                 return fit_method(estimator, *args, **kwargs)

~/checkouts/readthedocs.org/user_builds/pyfirst/envs/stable/lib/python3.9/site-packages/sklearn/pipeline.py in ?(self, X, y, **params)
    465         self : object
    466             Pipeline with fitted steps.
    467         """
    468         routed_params = self._check_method_params(method="fit", props=params)
--> 469         Xt = self._fit(X, y, routed_params)
    470         with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
    471             if self._final_estimator != "passthrough":
    472                 last_step_params = routed_params[self.steps[-1][0]]

~/checkouts/readthedocs.org/user_builds/pyfirst/envs/stable/lib/python3.9/site-packages/sklearn/pipeline.py in ?(self, X, y, routed_params)
    402                 cloned_transformer = transformer
    403             else:
    404                 cloned_transformer = clone(transformer)
    405             # Fit or load from cache the current transformer
--> 406             X, fitted_transformer = fit_transform_one_cached(
    407                 cloned_transformer,
    408                 X,
    409                 y,

~/checkouts/readthedocs.org/user_builds/pyfirst/envs/stable/lib/python3.9/site-packages/joblib/memory.py in ?(self, *args, **kwargs)
    311     def __call__(self, *args, **kwargs):
--> 312         return self.func(*args, **kwargs)

~/checkouts/readthedocs.org/user_builds/pyfirst/envs/stable/lib/python3.9/site-packages/sklearn/pipeline.py in ?(transformer, X, y, weight, message_clsname, message, params)
   1308     with _print_elapsed_time(message_clsname, message):
   1309         if hasattr(transformer, "fit_transform"):
   1310             res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
   1311         else:
-> 1312             res = transformer.fit(X, y, **params.get("fit", {})).transform(
   1313                 X, **params.get("transform", {})
   1314             )
   1315 

~/checkouts/readthedocs.org/user_builds/pyfirst/envs/stable/lib/python3.9/site-packages/sklearn/utils/_set_output.py in ?(self, X, *args, **kwargs)
    311     @wraps(f)
    312     def wrapped(self, X, *args, **kwargs):
--> 313         data_to_wrap = f(self, X, *args, **kwargs)
    314         if isinstance(data_to_wrap, tuple):
    315             # only wrap the first output for cross decomposition
    316             return_tuple = (

~/checkouts/readthedocs.org/user_builds/pyfirst/envs/stable/lib/python3.9/site-packages/sklearn/base.py in ?(self, X, y, **fit_params)
   1097             # fit method of arity 1 (unsupervised transformation)
   1098             return self.fit(X, **fit_params).transform(X)
   1099         else:
   1100             # fit method of arity 2 (supervised transformation)
-> 1101             return self.fit(X, y, **fit_params).transform(X)

~/checkouts/readthedocs.org/user_builds/pyfirst/envs/stable/lib/python3.9/site-packages/pyfirst/pyfirst.py in ?(self, X, y, n_mc, twin_mc)
    659 
    660         if not self.regression:
    661             assert np.unique(y).size == 2, f"Only binary classification is supported by FIRST."
    662 
--> 663         self.importance_ = FIRST(
    664             X = X,
    665             y = y,
    666             n_knn = self.n_knn,

~/checkouts/readthedocs.org/user_builds/pyfirst/envs/stable/lib/python3.9/site-packages/pyfirst/pyfirst.py in ?(X, y, n_knn, approx_knn, n_mc, twin_mc, rescale, n_forward, n_jobs, random_state, verbose)
    457         candidate = [i for i in factor_non_constant if i not in subset]
    458         while len(candidate) > 0:
    459             # compute total Sobol' effect for -x (x for current subset)
    460             seeds = rng.randint(1e9, size=len(candidate))
--> 461             nx_var = Parallel(n_jobs=n_jobs,prefer='threads')(delayed(_exp_var_knn)(
    462                 X = X,
    463                 y = y,
    464                 subset = subset + [candidate[i]],

~/checkouts/readthedocs.org/user_builds/pyfirst/envs/stable/lib/python3.9/site-packages/joblib/parallel.py in ?(self, iterable)
   1914             # If n_jobs==1, run the computation sequentially and return
   1915             # immediately to avoid overheads.
   1916             output = self._get_sequential_output(iterable)
   1917             next(output)
-> 1918             return output if self.return_generator else list(output)
   1919 
   1920         # Let's create an ID that uniquely identifies the current call. If the
   1921         # call is interrupted early and that the same instance is immediately

~/checkouts/readthedocs.org/user_builds/pyfirst/envs/stable/lib/python3.9/site-packages/joblib/parallel.py in ?(self, iterable)
   1857         finally:
   1858             self.print_progress()
   1859             self._running = False
   1860             self._iterating = False
-> 1861             self._original_iterator = None

~/checkouts/readthedocs.org/user_builds/pyfirst/envs/stable/lib/python3.9/site-packages/pyfirst/pyfirst.py in ?(X, y, subset, factor_nunique, n_knn, approx_knn, n_mc, twin_mc, random_state)
    112         else:
    113             query_ind = rng.permutation(np.arange(n))[:n_mc]
    114     else:
    115         query_ind = np.arange(n)
--> 116     nn_index = _get_knn(X, X[query_ind,:], k=n_knn, approximate=approx_knn)
    117     ev = np.mean(np.var(y[nn_index],ddof=1,axis=1))
    118 
    119     return ev

~/checkouts/readthedocs.org/user_builds/pyfirst/envs/stable/lib/python3.9/site-packages/pyfirst/pyfirst.py in ?(data, query, k, approximate)
     39         nn_engine.nprobe = 10
     40     else:
     41         nn_engine = faiss.IndexFlatL2(data.shape[1])
     42     nn_engine.add(data)
---> 43     _, nn_index = nn_engine.search(query, k)
     44     return nn_index

~/checkouts/readthedocs.org/user_builds/pyfirst/envs/stable/lib/python3.9/site-packages/faiss/class_wrappers.py in ?(self, x, k, params, D, I)
    339             I = np.empty((n, k), dtype=np.int64)
    340         else:
    341             assert I.shape == (n, k)
    342 
--> 343         self.search_c(n, swig_ptr(x), k, swig_ptr(D), swig_ptr(I), params)
    344         return D, I

~/checkouts/readthedocs.org/user_builds/pyfirst/envs/stable/lib/python3.9/site-packages/faiss/swigfaiss_avx512.py in ?(self, n, x, k, distances, labels, params)
   2276     def search(self, n, x, k, distances, labels, params=None):
-> 2277         return _swigfaiss_avx512.IndexFlat_search(self, n, x, k, distances, labels, params)

KeyboardInterrupt: 

Test \(R^2\) vs Full Model

# Test R^2 of the pipeline (FIRST selection followed by the random forest).
pipe.score(X_test, y_test)
0.8536755908380326
# Baseline for comparison: the same random forest fitted on every factor,
# with no selection step in front of it.
full = RandomForestRegressor(random_state=43)
full.fit(X_train, y_train)
full.score(X_test, y_test)
0.845913188558511
# Importance values stored by the fitted selector step, one entry per input factor.
# NOTE(review): presumably the nonzero entries are the factors FIRST kept — confirm
# against the pyfirst documentation.
pipe['selector'].get_feature_importance()
array([0.00749229, 0.        , 0.        , 0.        , 0.        ,
       0.01008637, 0.13938932, 0.15473464])

The random forest fitted on only the 4 factors identified by FIRST achieves a test \(R^2\) (0.854) comparable to that of the random forest fitted on the entire set of 8 factors (0.846).

Binary Classification

Fetch Data

# Load the breast cancer data; the target is the class label to predict.
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=43,
)

Build Pipeline and Train

# Two-step pipeline: FIRST (in classification mode) selects the factors, then
# a random forest classifier is fitted on the selector's output.
pipe = Pipeline(
    steps=[
        ("selector", SelectByFIRST(regression=False, random_state=43)),
        ("estimator", RandomForestClassifier(random_state=43)),
    ]
)
# Assign the result of fit so the notebook cell does not echo the pipeline.
pipe = pipe.fit(X_train, y_train)

Test Accuracy vs Full Model

# Test accuracy of the pipeline (FIRST selection followed by the random forest).
pipe.score(X_test, y_test)
0.9736842105263158
# Baseline for comparison: the same random forest classifier fitted on every
# factor, with no selection step in front of it.
full = RandomForestClassifier(random_state=43)
full.fit(X_train, y_train)
full.score(X_test, y_test)
0.9912280701754386
# Importance values stored by the fitted selector step, one entry per input factor.
# NOTE(review): presumably the nonzero entries are the factors FIRST kept — confirm
# against the pyfirst documentation.
pipe['selector'].get_feature_importance()
array([0.        , 0.04416144, 0.01358814, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.00339703, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.00339703, 0.0101911 , 0.14267543,
       0.        , 0.        , 0.        , 0.        , 0.        ])

The random forest fitted on only the 6 factors identified by FIRST achieves a test accuracy (0.974) comparable to that of the random forest fitted on the entire set of 30 factors (0.991).