Source code for pyfirst.pyfirst

import faiss
import numpy as np
import pandas as pd
from twinning import twin
from math import comb
from joblib import Parallel, delayed
from pandas.api.types import is_numeric_dtype, is_bool_dtype
from typing import Dict, List, Optional, Tuple, Union
from sklearn.base import BaseEstimator
from sklearn.feature_selection._base import SelectorMixin
from sklearn.utils import check_random_state
from sklearn.utils.validation import check_is_fitted

__all__ = ['TotalSobolKNN', 'ShapleySobolKNN', 'FIRSTRank', 'FIRST', 'SelectByFIRST']

def _check_X_y_and_preprocess(
        X:Union[pd.DataFrame, np.ndarray],
        y:Union[pd.Series, np.ndarray], 
        rescale:bool,
    ) -> Tuple[pd.DataFrame, np.ndarray]:
    # function to check for the input X and output y

    # make a copy to avoid mutation of input
    X = X.copy()
    y = y.copy()

    # check for input X 
    assert isinstance(X, pd.DataFrame) or isinstance(X, np.ndarray), f"X must be pd.DataFrame/np.ndarry type, but {type(X)} is provided."
    assert X.ndim == 2, f"Input X must be 2D."
    if isinstance(X, np.ndarray):
        X = pd.DataFrame(X, columns=[f"x{i}" for i in range(X.shape[1])])
    assert X.apply(lambda x: not hasattr(x, "sparse")).all(), f"X cannot be sparse. Please convert them to dense."
    assert X.apply(lambda x: np.isfinite(x).all() if is_numeric_dtype(x) else x.notna().all()).all(), f"X cannot contain any missing/infinite value."

    # check for output y
    assert isinstance(y, pd.Series) or isinstance(y, np.ndarray), f"y must be pd.Series/np.ndarray type, but {type(y)} is provided."
    if isinstance(y, pd.Series):
        y = y.to_numpy()
    y = y.squeeze()
    assert y.ndim == 1, f"Output y must be 1D."
    assert not hasattr(y, "sparse"), f"y cannot be sparse. Please convert it to dense."
    assert np.isfinite(y).all(), f"y cannot contain any missing/infinite value."
    if np.issubdtype(y.dtype, np.number):
        assert (y != y[0]).any(), f"y must have more than one unique value."
    else:
        uniq_y, y = np.unique(y, return_inverse=True)
        assert uniq_y.size == 2, f"y can only have two values for classification (only binary classification is supported)."

    # check consistent length between X and y
    assert X.shape[0] == y.size, f"Size of X ({X.shape[0]}) and y ({y.size}) does not match."

    # preprocess for input X if rescale
    if rescale:
        for col in X.columns:
            if is_numeric_dtype(X[col]) and X[col].nunique() > 1:
                if is_bool_dtype(X[col]) or X[col].isin([0,1]).all():
                    # make binary input -1/1, see Gelman 
                    # http://www.stat.columbia.edu/~gelman/research/published/standardizing7.pdf
                    X[col] = 2.0 * (X[col].astype(np.float32) - 0.5)
                else:
                    X[col] = (X[col] - np.mean(X[col])) / np.std(X[col])
    
    return X, y

def _check_mc_setting(
        n_mc: int,
        twin_mc: bool,
        y: np.ndarray,
    ) -> Tuple[int, bool]: 
    # function to check for using monte carlo subsamples

    # check for n_mc: positive integer and cannot be greater than n
    n = y.size
    n_mc = n if n_mc is None else min(n_mc, n)
    assert isinstance(n_mc, int) and n_mc > 0, f"n_mc ({n_mc}) must be a positive integer."

    # check for twin_mc: twinning subsample is available only when reduction ratio >= 2
    twin_mc = False if n//n_mc < 2 else twin_mc

    return n_mc, twin_mc

def _check_knn_setting(
        n_knn: int,
        approx_knn: bool,
        y: np.ndarray,
    ) -> Tuple[int, bool]:
    # function to check for nearest-neighbor search setting

    # check for n_knn 
    n_knn = (2 if np.unique(y).size > 2 else 3) if n_knn is None else n_knn
    assert isinstance(n_knn, int) and n_knn > 0, f"n_knn {(n_knn)} must be a postive integer."

    # approximate nearest-neighbor search is only supported for data size at least 10,000
    approx_knn = False if y.size < 1e4 else approx_knn

    return n_knn, approx_knn

def _preprocess_input(
        X:pd.DataFrame,
    ) -> np.ndarray:
    # preprocess input by transforming categorical features via one hot encoding 
    # numeric columns include bool, int, float
    p = X.shape[1]
    num_col_ind = [i for i in range(p) if is_numeric_dtype(X.iloc[:,i])]
    cat_col_ind = list(set(range(p)).difference(set(num_col_ind)))
    X = np.hstack([X.iloc[:,num_col_ind].values.astype(np.float32)] + [pd.get_dummies(X.iloc[:,i]).values.astype(np.float32) for i in cat_col_ind])
    return X

def _get_knn(
        data:np.ndarray, 
        query:np.ndarray, 
        k:int = 5, 
        approximate:bool = False,
    ) -> np.ndarray:
    # nearest-neighbor search using faiss
    assert k <= data.shape[0], f"k ({k}) cannot be greater than size of data ({data.shape[0]})."
    data = data.astype(np.float32)
    if approximate:
        nn_quantizer = faiss.IndexFlatL2(data.shape[1])
        nn_engine = faiss.IndexIVFFlat(nn_quantizer, data.shape[1], 100)
        nn_engine.train(data)
        nn_engine.nprobe = 10
    else:
        nn_engine = faiss.IndexFlatL2(data.shape[1])
    nn_engine.add(data)
    _, nn_index = nn_engine.search(query, k)
    return nn_index

def _exp_var_knn(
        X:pd.DataFrame,
        y:np.ndarray,
        subset:List[int],
        factor_nunique:Optional[np.ndarray] = None,
        n_knn:int = 2,
        approx_knn:bool = False,
        n_mc:int = None,
        twin_mc:bool = False,
        random_state:Union[int,np.random.RandomState] = None,
    ) -> float:

    # argument check for random state
    rng = check_random_state(random_state)

    # argument check for subset 
    assert isinstance(subset, List), f"subset must be a list of integer."
    if len(subset) > 0:
        assert isinstance(subset[0], int), f"subset must be a list of integer."
    else:
        return np.var(y, ddof=1)

    # argument check for month carlo subsample setting
    n_mc, twin_mc = _check_mc_setting(n_mc, twin_mc, y)

    # argument check for knn search setting
    n_knn, approx_knn = _check_knn_setting(n_knn, approx_knn, y)

    # check if any row of X duplicate and preprocess categorical features
    n, p = X.shape
    # get the subset 
    X = X.iloc[:,subset].copy()
    # check if any duplicated rows
    row_duplicated = False 
    if factor_nunique is None:
        if X.duplicated().any():
            row_duplicated = True 
    else:
        assert factor_nunique.size == p, f"factor_nunique ({factor_nunique.size}) must be the same size of X ({p})."
        factor_nunique = factor_nunique[subset]
        if (factor_nunique < n).all():
            if factor_nunique.size > 1:
                if X.duplicated().any():
                    row_duplicated = True
            else:
                row_duplicated = True
    # preprocess categorical features
    X = _preprocess_input(X)
    # handle duplicated row for nearest-neighbor search
    if row_duplicated:
        # random jittering to have unqiue rows
        X = np.hstack((X, 1e-3*rng.uniform(low=-1,high=1,size=(n,1))))
        faiss.cvar.distance_compute_blas_threshold = n + 1 # exact computation
    else:
        faiss.cvar.distance_compute_blas_threshold = 20 # default setting

    # nearest-neighbor search and compute expected conditional variance
    if n_mc < n:
        if twin_mc:
            r = n//n_mc
            keep_ind = rng.permutation(np.arange(n))[:(n_mc*r)]
            twin_ind = twin(data=X[keep_ind,:].astype(np.float64), r=r, u1=0)
            query_ind = keep_ind[twin_ind]
        else:
            query_ind = rng.permutation(np.arange(n))[:n_mc]
    else:
        query_ind = np.arange(n)
    nn_index = _get_knn(X, X[query_ind,:], k=n_knn, approximate=approx_knn)
    ev = np.mean(np.var(y[nn_index],ddof=1,axis=1))
        
    return ev

[docs] def TotalSobolKNN( X:Union[pd.DataFrame, np.ndarray], y:Union[pd.Series, np.ndarray], noise:bool, n_knn:int = None, approx_knn:bool = False, n_mc:int = None, twin_mc:bool = False, rescale:bool = True, n_jobs:int = 1, random_state:Union[int,np.random.RandomState] = None, ) -> np.ndarray: """Estimating Total Sobol' Indices from Data `TotalSobolKNN` provides consistent estimation of total Sobol' indices (Sobol', 2001) directly from scattered data. When the responses are noiseless (`noise=False`), it implements the Nearest-Neighbor estimator in Broto et al. (2020). When the responses are noisy (`noise=True`), it implements the Noise-Adjusted Nearest-Neighbor estimator in Huang and Joseph (2024). Parameters ---------- X : pd.DataFrame or np.ndarray A pd.DataFrame or np.ndarray for the factors / predictors. y : pd.Series or np.ndarray A pd.Series or np.ndarray for the responses. noise : bool A logical indicating whether the responses are noisy. n_knn : int, default=None The number of nearest-neighbor for the inner loop conditional variance estimation. `n_knn=2` is recommended for regression, and `n_knn=3` for binary classification. approx_knn : bool, default=False A logical indicating whether to use approximate nearest-neighbor search, otherwise exact search is used. It is supported when there are at least 10,000 data instances. n_mc : int, default=None The number of Monte Carlo samples for the outer loop expectation estimation. twin_mc : bool, default=False A logical indicating whether to use twinning subsamples, otherwise random subsamples are used. It is supported when the reduction ratio is at least 2. rescale : bool, default=True A logical indicating whether to standardize the factors / predictors. n_jobs : int, default=1 The number of jobs to run in parallel. `n_jobs=-1` means using all processors. random_state : int or RandomState instance, default=None A seed for controlling the randomness in breaking ties in nearest-neighbor search and finding random subsamples. Returns ---------- np.ndarray A numeric vector for the total Sobol' indices estimation. Notes ----- `Faiss` (Douze et al., 2024) is used for efficient nearest-neighbor search, with the approximate search (`approx_knn=True`) by the inverted file index (IVF). IVF reduces the search scope through first clustering data into Voronoi cells. To further accelerate, we also support the use of subsamples by specifying `n_mc`. Both random and twinning (Vakayil and Joseph, 2022) subsamples are available, where twinning subsamples provide better approximation for the full data. References ---------- Huang, C., & Joseph, V. R. (2025). Factor Importance Ranking and Selection using Total Indices. Technometrics. Sobol', I. M. (2001). Global sensitivity indices for nonlinear mathematical models and their Monte Carlo estimates. Mathematics and computers in simulation, 55(1-3), 271-280. Broto, B., Bachoc, F., & Depecker, M. (2020). Variance reduction for estimation of Shapley effects and adaptation to unknown input distribution. SIAM/ASA Journal on Uncertainty Quantification, 8(2), 693-716. Douze, M., Guzhva, A., Deng, C., Johnson, J., Szilvasy, G., Mazaré, P.E., Lomeli, M., Hosseini, L., & Jégou, H., (2024). The Faiss library. arXiv preprint arXiv:2401.08281. Vakayil, A., & Joseph, V. R. (2022). Data twinning. Statistical Analysis and Data Mining: The ASA Data Science Journal, 15(5), 598-610. """ # argument check for random state rng = check_random_state(random_state) # check X and y, and preprocess X, y = _check_X_y_and_preprocess(X, y, rescale) # check Monte Carlo subsample setting n_mc, twin_mc = _check_mc_setting(n_mc, twin_mc, y) # check knn setting n_knn, approx_knn = _check_knn_setting(n_knn, approx_knn, y) # get basic information for factor n, p = X.shape factor_nunique = X.nunique().to_numpy() factor_non_constant = [i for i in range(p) if factor_nunique[i] > 1] # compute total Sobol' indices if noise: noise_var = _exp_var_knn( X = X, y = y, subset = factor_non_constant, factor_nunique = factor_nunique, n_knn = n_knn, approx_knn = approx_knn, n_mc = n_mc, twin_mc = twin_mc, random_state = rng.randint(1e9, size=1)[0], ) else: noise_var = 0 y_var = max(np.var(y,ddof=1) - noise_var, 0) tsi = np.zeros(p) if y_var > 0: seeds = rng.randint(1e9, size=len(factor_non_constant)) xe_var = Parallel(n_jobs=n_jobs,prefer='threads')(delayed(_exp_var_knn)( X = X, y = y, subset = factor_non_constant[:i]+factor_non_constant[(i+1):], factor_nunique = factor_nunique, n_knn = n_knn, approx_knn = approx_knn, n_mc = n_mc, twin_mc = twin_mc, random_state = seeds[i], ) for i in range(len(factor_non_constant))) xe_var = np.array(xe_var) xi_var = np.maximum(xe_var - noise_var, 0) tsi[factor_non_constant] = xi_var / y_var return tsi
[docs] def ShapleySobolKNN( X:Union[pd.DataFrame, np.ndarray], y:Union[pd.Series, np.ndarray], noise:bool, n_knn:int = None, approx_knn:bool = False, n_mc:int = None, twin_mc:bool = False, rescale:bool = True, n_jobs:int = 1, random_state:Union[int,np.random.RandomState] = None, ) -> np.ndarray: """Estimating Shapley Sobol' Effect from Data `ShapleySobolKNN` provides consistent estimation of Shapley Sobol' Effect (Owen, 2014; Song et al., 2016) directly from scattered data. When the responses are noiseless (`noise=False`), it implements the Nearest-Neighbor estimator in Broto et al. (2020). When the responses are noisy (`noise=True`), it implements the Noise-Adjusted Nearest-Neighbor estimator in Huang and Joseph (2024). Parameters ---------- X : pd.DataFrame or np.ndarray A pd.DataFrame or np.ndarray for the factors / predictors. y : pd.Series or np.ndarray A pd.Series or np.ndarray for the responses. noise : bool A logical indicating whether the responses are noisy. n_knn : int, default=None The number of nearest-neighbor for the inner loop conditional variance estimation. `n_knn=2` is recommended for regression, and `n_knn=3` for binary classification. approx_knn : bool, default=False A logical indicating whether to use approximate nearest-neighbor search, otherwise exact search is used. It is supported when there are at least 10,000 data instances. n_mc : int, default=None The number of Monte Carlo samples for the outer loop expectation estimation. twin_mc : bool, default=False A logical indicating whether to use twinning subsamples, otherwise random subsamples are used. It is supported when the reduction ratio is at least 2. rescale : bool, default=True A logical indicating whether to standardize the factors / predictors. n_jobs : int, default=1 The number of jobs to run in parallel. `n_jobs=-1` means using all processors. random_state : int or RandomState instance, default=None A seed for controlling the randomness in breaking ties in nearest-neighbor search and finding random subsamples. Returns ---------- np.ndarray A numeric vector for the Shapley Sobol' effect estimation. Notes ----- `Faiss` (Douze et al., 2024) is used for efficient nearest-neighbor search, with the approximate search (`approx_knn=True`) by the inverted file index (IVF). IVF reduces the search scope through first clustering data into Voronoi cells. To further accelerate, we also support the use of subsamples by specifying `n_mc`. Both random and twinning (Vakayil and Joseph, 2022) subsamples are available, where twinning subsamples provide better approximation for the full data. References ---------- Huang, C., & Joseph, V. R. (2025). Factor Importance Ranking and Selection using Total Indices. Technometrics. Owen, A. B. (2014), “Sobol’indices and Shapley value,” SIAM/ASA Journal on Uncertainty Quantification, 2, 245–251. Song, E., Nelson, B. L., & Staum, J. (2016), “Shapley effects for global sensitivity analysis: Theory and computation,” SIAM/ASA Journal on Uncertainty Quantification, 4, 1060-1083. Broto, B., Bachoc, F., & Depecker, M. (2020). Variance reduction for estimation of Shapley effects and adaptation to unknown input distribution. SIAM/ASA Journal on Uncertainty Quantification, 8(2), 693-716. Douze, M., Guzhva, A., Deng, C., Johnson, J., Szilvasy, G., Mazaré, P.E., Lomeli, M., Hosseini, L., & Jégou, H., (2024). The Faiss library. arXiv preprint arXiv:2401.08281. Vakayil, A., & Joseph, V. R. (2022). Data twinning. Statistical Analysis and Data Mining: The ASA Data Science Journal, 15(5), 598-610. """ # argument check for random state rng = check_random_state(random_state) # check X and y, and preprocess X, y = _check_X_y_and_preprocess(X, y, rescale) # check Monte Carlo subsample setting n_mc, twin_mc = _check_mc_setting(n_mc, twin_mc, y) # check knn setting n_knn, approx_knn = _check_knn_setting(n_knn, approx_knn, y) # get basic information for factor n, p = X.shape factor_nunique = X.nunique().to_numpy() factor_non_constant = [i for i in range(p) if factor_nunique[i] > 1] # compute Shapley Sobol' Effect if noise: noise_var = _exp_var_knn( X = X, y = y, subset = factor_non_constant, factor_nunique = factor_nunique, n_knn = n_knn, approx_knn = approx_knn, n_mc = n_mc, twin_mc = twin_mc, random_state = rng.randint(1e9, size=1)[0], ) else: noise_var = 0 y_var = np.var(y,ddof=1) if y_var < noise_var: # noise dominated, no factor is important ssi = np.zeros(p) else: q = len(factor_non_constant) # number of non-constant factors u = [[bool(int(bit)) for bit in format(i, f'0{q}b')[::-1]] for i in range(2**q-1)] seeds = rng.randint(1e9, size=2**q-1) nx_var = Parallel(n_jobs=n_jobs,prefer='threads')(delayed(_exp_var_knn)( X = X, y = y, subset = [ind for ind, flag in zip(factor_non_constant, u[i]) if flag], factor_nunique = factor_nunique, n_knn = n_knn, approx_knn = approx_knn, n_mc = n_mc, twin_mc = twin_mc, random_state = seeds[i], ) for i in range(2**q-1)) nx_var = np.array(nx_var + [noise_var]) x_var = np.maximum(y_var - nx_var, 0.0) ssi_non_constant = np.zeros(q) for i in range(q): for s in range(0, 2**q, 2**(i+1)): for l in range(s, s+2**i): cardu = sum(u[l]) ssi_non_constant[i] += (x_var[l+2**i] - x_var[l]) / comb(q-1, cardu) ssi_non_constant = ssi_non_constant / q / (y_var - noise_var) ssi = np.zeros(p) ssi[factor_non_constant] = ssi_non_constant return ssi
[docs] def FIRSTRank( X:Union[pd.DataFrame, np.ndarray], y:Union[pd.Series, np.ndarray], noise:bool, n_knn:int = None, approx_knn:bool = False, n_mc:int = None, twin_mc:bool = False, rescale:bool = True, n_jobs:int = 1, random_state:Union[int,np.random.RandomState] = None, ) -> Dict: """Provides Factor Ranking via Cumulative Variance that can be explained. `FIRSTRank` provides factor ranking directly from scattered data via cumulative variance that can be explained. See Huang and Joseph (2025) for details. Parameters ---------- X : pd.DataFrame or np.ndarray A pd.DataFrame or np.ndarray for the factors / predictors. y : pd.Series or np.ndarray A pd.Series or np.ndarray for the responses. noise : bool A logical indicating whether the responses are noisy. n_knn : int, default=None The number of nearest-neighbor for the inner loop conditional variance estimation. `n_knn=2` is recommended for regression, and `n_knn=3` for binary classification. approx_knn : bool, default=False A logical indicating whether to use approximate nearest-neighbor search, otherwise exact search is used. It is supported when there are at least 10,000 data instances. n_mc : int, default=None The number of Monte Carlo samples for the outer loop expectation estimation. twin_mc : bool, default=False A logical indicating whether to use twinning subsamples, otherwise random subsamples are used. It is supported when the reduction ratio is at least 2. rescale : bool, default=True A logical indicating whether to standardize the factors / predictors. n_jobs : int, default=1 The number of jobs to run in parallel. `n_jobs=-1` means using all processors. random_state : int or RandomState instance, default=None A seed for controlling the randomness in breaking ties in nearest-neighbor search and finding random subsamples. Returns ---------- A dictionary with the following keys: - "ranking": np.ndarray Indices of the factors, ordered from most to least important, based on cumulative variance that can be explained. - "explained_variance": np.ndarray Cumulative variance explained by the factors in the ranking order. Notes ----- `Faiss` (Douze et al., 2024) is used for efficient nearest-neighbor search, with the approximate search (`approx_knn=True`) by the inverted file index (IVF). IVF reduces the search scope through first clustering data into Voronoi cells. To further accelerate, we also support the use of subsamples by specifying `n_mc`. Both random and twinning (Vakayil and Joseph, 2022) subsamples are available, where twinning subsamples provide better approximation for the full data. References ---------- Huang, C., & Joseph, V. R. (2025). Factor Importance Ranking and Selection using Total Indices. Technometrics. Sobol', I. M. (2001). Global sensitivity indices for nonlinear mathematical models and their Monte Carlo estimates. Mathematics and computers in simulation, 55(1-3), 271-280. Broto, B., Bachoc, F., & Depecker, M. (2020). Variance reduction for estimation of Shapley effects and adaptation to unknown input distribution. SIAM/ASA Journal on Uncertainty Quantification, 8(2), 693-716. Douze, M., Guzhva, A., Deng, C., Johnson, J., Szilvasy, G., Mazaré, P.E., Lomeli, M., Hosseini, L., & Jégou, H., (2024). The Faiss library. arXiv preprint arXiv:2401.08281. Vakayil, A., & Joseph, V. R. (2022). Data twinning. Statistical Analysis and Data Mining: The ASA Data Science Journal, 15(5), 598-610. """ # argument check for random state rng = check_random_state(random_state) # check X and y, and preprocess X, y = _check_X_y_and_preprocess(X, y, rescale) # check Monte Carlo subsample setting n_mc, twin_mc = _check_mc_setting(n_mc, twin_mc, y) # check knn setting n_knn, approx_knn = _check_knn_setting(n_knn, approx_knn, y) # get basic information for factor _, p = X.shape factor_nunique = X.nunique().to_numpy() factor_non_constant = [i for i in range(p) if factor_nunique[i] > 1] # factor importance ranking by the cumulative variance that can be explained if noise: noise_var = _exp_var_knn( X = X, y = y, subset = factor_non_constant, factor_nunique = factor_nunique, n_knn = n_knn, approx_knn = approx_knn, n_mc = n_mc, twin_mc = twin_mc, random_state = rng.randint(1e9, size=1)[0], ) else: noise_var = 0 y_var = max(np.var(y,ddof=1) - noise_var, 0) if y_var == 0: return { "ranking": np.arange(p), "explained_variance": np.zeros(p), } factor_ranking = [] factor_explained_variance = [] subset = factor_non_constant while len(subset) > 0: seeds = rng.randint(1e9, size=len(subset)) nx_var = Parallel(n_jobs=n_jobs,prefer='threads')(delayed(_exp_var_knn)( X = X, y = y, subset = subset[:i]+subset[(i+1):], factor_nunique = factor_nunique, n_knn = n_knn, approx_knn = approx_knn, n_mc = n_mc, twin_mc = twin_mc, random_state = seeds[i], ) for i in range(len(subset))) nx_var = np.array(nx_var) x_var = np.maximum(y_var - nx_var, 0) remove_ind = subset[np.argmax(x_var)] factor_ranking.append(remove_ind) factor_explained_variance.append(x_var.max() / y_var) subset.remove(remove_ind) factor_ranking = factor_ranking[::-1] factor_explained_variance = factor_explained_variance[::-1][1:] + [1] for i in range(p): if factor_nunique[i] == 1: factor_ranking.append(i) factor_explained_variance.append(1) return { "ranking": np.array(factor_ranking), "explained_variance": np.array(factor_explained_variance) }
[docs] def FIRST( X:Union[pd.DataFrame, np.ndarray], y:Union[pd.Series, np.ndarray], n_knn:int = None, approx_knn:bool = False, n_mc:int = None, twin_mc:bool = False, rescale:bool = True, n_forward:int = 2, n_jobs:int = 1, random_state:Union[int,np.random.RandomState] = None, return_option:str = 'selection', verbose:bool = False, ) -> np.ndarray: """Factor Importance Ranking and Selection using Total (Sobol') indices `FIRST` provides factor importance ranking and selection directly from scattered data without any model fitting. `FIRST` is a model-independent factor importance ranking and selection algorithm proposed in Huang and Joseph (2024). Factor importance is computed based on total Sobol' indices (Sobol', 2021), which is connected to the approximation error introduced by excluding the factor of interest (Huang and Joseph, 2024). The estimation procedure adapts from the Nearest-Neighbor estimator in Broto et al. (2020) to account for the noisy data. Integrating it with forward selection and backward elimination allows for factor selection. Parameters ---------- X : pd.DataFrame or np.ndarray A pd.DataFrame or np.ndarray for the factors / predictors. y : pd.Series or np.ndarray A pd.Series or np.ndarray for the responses. n_knn : int, default=None The number of nearest-neighbor for the inner loop conditional variance estimation. `n_knn=2` is recommended for regression, and `n_knn=3` for binary classification. approx_knn : bool, default=False A logical indicating whether to use approximate nearest-neighbor search, otherwise exact search is used. It is supported when there are at least 10,000 data instances. n_mc : int, default=None The number of Monte Carlo samples for the outer loop expectation estimation. twin_mc : bool, default=False A logical indicating whether to use twinning subsamples, otherwise random subsamples are used. It is supported when the reduction ratio is at least 2. rescale : bool, default=True A logical indicating whether to standardize the factors / predictors. n_forward : int, default=2 The number of times to run the forward selection phase to tradeoff between efficiency and accuracy. `n_forward=2` is recommended. To run the complete forward selection, please set `n_forward` to the number of factors / predictors. n_jobs : int, default=1 The number of jobs to run in parallel. `n_jobs=-1` means using all processors. random_state : int or RandomState instance, default=None A seed for controlling the randomness in breaking ties in nearest-neighbor search and finding random subsamples. return_option : str, default='selection' The options for the output of `FIRST`. Default is `selection`, which returns a binary vector indicating the factor selection, with `True` for selected variables and `False` otherwise. Other options include: (i) `total`, which return a numeric vector for total Sobol' effect and (ii) `shapley`, which return a numeric vector for Shapley Sobol' effect. See Huang and Joseph (2024) for details. verbose : bool, default=False A logical indicating whether to display intermediate results, e.g., the selected factor from each iteration. Returns ---------- np.ndarray A numeric vector for the factor selection, importance, or ranking depending on the value of the `return_option` argument. Notes ----- `FIRST` belongs to the class of forward-backward selection with early dropping algorithm (Borboudakis and Tsamardinos, 2019). In forward selection, each time we find the candidate that maximizes the output variance that can be explained. For candidates that cannot improve the variance explained conditional on the selected factors, they are removed from the candidate set. This forward selection step is run `n_forward` times to tradeoff between accuracy and efficiency. `n_forward = 2` is recommended in Yu et al. (2020). To run the complete forward selection, please set `n_forward` to the number of factors / predictors. In backward elimination, we again remove one factor at a time, starting with the factor that can improve the explained variance most, till no factor can further improve. `Faiss` (Douze et al., 2024) is used for efficient nearest-neighbor search, with the approximate search (`approx_knn=True`) by the inverted file index (IVF). IVF reduces the search scope through first clustering data into Voronoi cells. To further accelerate, we also support the use of subsamples by specifying `n_mc`. Both random and twinning (Vakayil and Joseph, 2022) subsamples are available, where twinning subsamples provide better approximation for the full data. For more details about `FIRST`, please see Huang and Joseph (2024). References ---------- Huang, C., & Joseph, V. R. (2025). Factor Importance Ranking and Selection using Total Indices. Technometrics. Sobol', I. M. (2001). Global sensitivity indices for nonlinear mathematical models and their Monte Carlo estimates. Mathematics and computers in simulation, 55(1-3), 271-280. Broto, B., Bachoc, F., & Depecker, M. (2020). Variance reduction for estimation of Shapley effects and adaptation to unknown input distribution. SIAM/ASA Journal on Uncertainty Quantification, 8(2), 693-716. Borboudakis, G., & Tsamardinos, I. (2019). Forward-backward selection with early dropping. The Journal of Machine Learning Research, 20(1), 276-314. Yu, K., Guo, X., Liu, L., Li, J., Wang, H., Ling, Z., & Wu, X. (2020). Causality-based feature selection: Methods and evaluations. ACM Computing Surveys (CSUR), 53(5), 1-36. Douze, M., Guzhva, A., Deng, C., Johnson, J., Szilvasy, G., Mazaré, P.E., Lomeli, M., Hosseini, L., & Jégou, H., (2024). The Faiss library. arXiv preprint arXiv:2401.08281. Vakayil, A., & Joseph, V. R. (2022). Data twinning. Statistical Analysis and Data Mining: The ASA Data Science Journal, 15(5), 598-610. """ # argument check for random state rng = check_random_state(random_state) # check X and y, and preprocess X, y = _check_X_y_and_preprocess(X, y, rescale) # check Monte Carlo subsample setting n_mc, twin_mc = _check_mc_setting(n_mc, twin_mc, y) # check knn setting n_knn, approx_knn = _check_knn_setting(n_knn, approx_knn, y) # check return option assert return_option in ['selection','total','shapley'], f"return_option ({return_option}) is not available, only selection/total/shapley/ranking is supported." # get basic information for factor n, p = X.shape factor_nunique = X.nunique().to_numpy() factor_non_constant = [i for i in range(p) if factor_nunique[i] > 1] # forward selection if verbose: print("Starting forward selection...") if len(factor_non_constant) < p: print(f"factors removed because of constant value: {' '.join(str(i) for i in range(p) if i not in factor_non_constant)}") y_var = np.var(y, ddof=1) subset = [] x_var_max = 0 for t in range(n_forward): if verbose: print(f"\nPhase-{(t+1):d} Forward Selection...") none_added_to_subset = True candidate = [i for i in factor_non_constant if i not in subset] while len(candidate) > 0: # compute total Sobol' effect for -x (x for current subset) seeds = rng.randint(1e9, size=len(candidate)) nx_var = Parallel(n_jobs=n_jobs,prefer='threads')(delayed(_exp_var_knn)( X = X, y = y, subset = subset + [candidate[i]], factor_nunique = factor_nunique, n_knn = n_knn, approx_knn = approx_knn, n_mc = n_mc, twin_mc = twin_mc, random_state = seeds[i], ) for i in range(len(candidate))) nx_var = np.array(nx_var) x_var = np.maximum(y_var - nx_var, 0) if verbose: print(f"\ncurrent selection: {' '.join(str(s) for s in subset)}") print(f"current variance explained: {x_var_max:.3f}") print("candidate to add:", ' '.join(f"{candidate[i]:d}({x_var[i]:.3f})" for i in range(len(candidate)))) if x_var.max() > x_var_max: # find the index to add such that the variance explained is maximized add_ind = candidate[np.argmax(x_var)] candidate = [candidate[i] for i in range(len(candidate)) if x_var[i] > x_var_max] candidate.remove(add_ind) subset.append(add_ind) none_added_to_subset = False x_var_max = x_var.max() if verbose: print(f"add candidate {add_ind:d}({x_var_max:.3f}).") else: break if none_added_to_subset: if verbose: print("early termination since none of the candidates can be added in this phase.") break # backward elimination if verbose: print("\nStarting backward elimination...") subset.sort() while len(subset) > 0: # compute total sobol' effect for -(x/{i}) (x for current subset) seeds = rng.randint(1e9, size=len(subset)) nx_var = Parallel(n_jobs=n_jobs,prefer='threads')(delayed(_exp_var_knn)( X = X, y = y, subset = subset[:i]+subset[(i+1):], factor_nunique = factor_nunique, n_knn = n_knn, approx_knn = approx_knn, n_mc = n_mc, twin_mc = twin_mc, random_state = seeds[i], ) for i in range(len(subset))) nx_var = np.array(nx_var) x_var = np.maximum(y_var - nx_var, 0) if verbose: print(f"\ncurrent selection: {' '.join(str(s) for s in subset)}") print(f"current variance explained: {x_var_max:.3f}") print("candidate to remove:", ' '.join(f"{subset[i]:d}({x_var[i]:.3f})" for i in range(len(subset)))) if x_var.max() >= x_var_max: # find the index to remove such that the variance explained is maximized remove_ind = subset[np.argmax(x_var)] subset.remove(remove_ind) x_var_max = x_var.max() if verbose: print(f"remove candidate {remove_ind:d}({x_var_max:.3f}).") else: break # compute importance via total Sobol' indices imp = np.zeros(p) if len(subset) > 0: noise_var = y_var - x_var_max imp[subset] = (nx_var - noise_var) / x_var_max # get the indices for selection selection = (imp > 0).astype(bool) if return_option == "selection": return selection elif return_option == "total": return imp elif return_option == "shapley": shapley_imp = np.zeros(p) shapley_imp[selection] = ShapleySobolKNN( X = X.loc[:,selection].copy(), y = y, noise = True, n_knn = n_knn, approx_knn = approx_knn, n_mc = n_mc, twin_mc = twin_mc, rescale = rescale, n_jobs = n_jobs, random_state = random_state, ) return shapley_imp else: return None
[docs] class SelectByFIRST(SelectorMixin, BaseEstimator): """Feature selector using FIRST This implements the feature selector class for FIRST (Huang and Joseph, 2024), a model-independent feature selection algorithm based on total Sobol' indices (Sobol', 2001). Parameters ---------- regression : bool, default=True A logical indicating whether the feature selector is for a regression or classification problem. n_knn : int, default=None The number of nearest-neighbor for the inner loop conditional variance estimation. `n_knn=2` is recommended for regression, and `n_knn=3` for binary classification. approx_knn : bool, default=False A logical indicating whether to use approximate nearest-neighbor search, otherwise exact search is used. It is supported when there are at least 10,000 data instances. rescale : bool, default=True A logical indicating whether to standardize the factors / predictors. n_forward : int, default=2 The number of times to run the forward selection phase to tradeoff between efficiency and accuracy. `n_forward=2` is recommended. n_jobs : int, default=1 The number of jobs to run in parallel. `n_jobs=-1` means using all processors. random_state : int or RandomState instance, default=None A seed for controlling the randomness in breaking ties in nearest-neighbor search and finding random subsamples. importance_option : str, default='total' The options for the feature importance. Default is `total`, which returns a numeric vector for total Sobol' effect and `shapley` for Shapley Sobol' effect. See Huang and Joseph (2024) for details. verbose : default = False A logical indicating whether to display intermediate results, e.g., the selected factor from each iteration. Attributes ---------- importance_ : np.ndarray Factor importance, with zero indicating that the factor is not important for predicting the response. Notes ----- FIRST belongs to the class of forward-backward selection with early dropping algorithm (Borboudakis and Tsamardinos, 2019). In forward selection, each time we find the candidate that maximizes the output variance that can be explained. For candidates that cannot improve the variance explained conditional on the selected factors, they are removed from the candidate set. This forward selection step is run `n_forward` times to tradeoff between accuracy and efficiency. `n_forward = 2` is recommended in (Yu et al., 2020). In backward elimination, we again remove one factor at a time, starting with the factor that can improve the explained variance most, till no factor can further improve. The estimation of the importance is via an adaptation of the Nearest-Neighbor estimator of Broto et al. (2020) for the total Sobol' indices. `Faiss` (Douze et al., 2024) is used for efficient nearest-neighbor search, with the approximate search (`approx_knn=True`) by the inverted file index (IVF). IVF reduces the search scope through first clustering data into Voronoi cells. For more details about FIRST, please see Huang and Joseph (2024). References ---------- Huang, C., & Joseph, V. R. (2025). Factor Importance Ranking and Selection using Total Indices. Technometrics. Sobol', I. M. (2001). Global sensitivity indices for nonlinear mathematical models and their Monte Carlo estimates. Mathematics and computers in simulation, 55(1-3), 271-280. Broto, B., Bachoc, F., & Depecker, M. (2020). Variance reduction for estimation of Shapley effects and adaptation to unknown input distribution. SIAM/ASA Journal on Uncertainty Quantification, 8(2), 693-716. Borboudakis, G., & Tsamardinos, I. (2019). Forward-backward selection with early dropping. The Journal of Machine Learning Research, 20(1), 276-314. Yu, K., Guo, X., Liu, L., Li, J., Wang, H., Ling, Z., & Wu, X. (2020). Causality-based feature selection: Methods and evaluations. ACM Computing Surveys (CSUR), 53(5), 1-36. Douze, M., Guzhva, A., Deng, C., Johnson, J., Szilvasy, G., Mazaré, P.E., Lomeli, M., Hosseini, L., & Jégou, H., (2024). The Faiss library. arXiv preprint arXiv:2401.08281. """ def __init__( self, regression:bool = True, n_knn:int = None, approx_knn:bool = False, rescale:bool = True, n_forward:int = 2, n_jobs:int = 1, random_state:Union[int,np.random.RandomState] = None, importance_option:str = 'total', verbose:bool = False, ): self.regression = regression self.n_knn = (2 if regression else 3) if n_knn is None else n_knn self.approx_knn = approx_knn self.rescale = rescale self.n_forward = n_forward self.n_jobs = n_jobs self.random_state = check_random_state(random_state) self.verbose = verbose self.importance_option = importance_option
[docs] def fit( self, X:Union[pd.DataFrame, np.ndarray], y:Union[pd.Series, np.ndarray], n_mc:int = None, twin_mc:bool = False, ): """Compute the factor importance from data Parameters ---------- X : pd.DataFrame or np.ndarray A pd.DataFrame or np.ndarray for the factors / predictors. y : pd.Series or np.ndarray A pd.Series or np.ndarray for the responses. n_mc : int, default=None The number of Monte Carlo samples for the outer loop expectation estimation. twin_mc : bool, default=False A logical indicating whether to use twinning subsamples, otherwise random subsamples are used. It is supported when the reduction ratio is at least 2. Returns ------- object Returns the instance itself. Notes ----- To further accelerate the importance computation, we support the use of subsamples by specifying `n_mc`. Both random and twinning (Vakayil and Joseph, 2022) subsamples are available, where twinning subsamples provide better approximation for the full data. References ---------- Vakayil, A., & Joseph, V. R. (2022). Data twinning. Statistical Analysis and Data Mining: The ASA Data Science Journal, 15(5), 598-610. """ if not self.regression: assert np.unique(y).size == 2, f"Only binary classification is supported by FIRST." self.importance_ = FIRST( X = X, y = y, n_knn = self.n_knn, approx_knn = self.approx_knn, n_mc = n_mc, twin_mc = twin_mc, rescale = self.rescale, n_forward = self.n_forward, n_jobs = self.n_jobs, random_state = self.random_state, return_option = self.importance_option, verbose = self.verbose, ) return self
[docs] def get_feature_importance(self): """Get the feature importance Returns ------- np.ndarray A numeric vector for the factor importance, with zero indicating that the factor is not important for predicting the response. """ check_is_fitted(self) return self.importance_
[docs] def _get_support_mask(self): """Get the boolean mask indicating which features are selected Returns ------- np.ndarray A boolean vector with True indicating the feature is selected. """ check_is_fitted(self) return self.importance_ > 0