Source code for slickml.selection._xgboost

import gc
from collections import defaultdict
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Set, Tuple, Union

import numpy as np
import pandas as pd
import xgboost as xgb
from matplotlib.figure import Figure
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler

from slickml.base import BaseXGBoostEstimator
from slickml.utils import Colors, add_noisy_features, check_var, df_to_csr
from slickml.visualization import plot_xfs_cv_results, plot_xfs_feature_frequency


# TODO(amir): ditch print with logging
# TODO(amir): expose `groups` in `cv.split()`
# TODO(amir): define an `abstractclass` under `base/selection` and inherit from it
# that should ease the process and reduce the amount of code we need to copy for each algo
# TODO(amir): currently we have all the `feature_importance` calculated at each fold of each iteration
# we can apply some stats (mean/median to be simple) on top of them and plot the `feature_importance`
# as well
@dataclass
class XGBoostFeatureSelector(BaseXGBoostEstimator):
    """XGBoost Feature Selector.

    Notes
    -----
    This is a wrapper using XGBoost [xgboost-api]_ to perform a frequency-based feature selection
    algorithm with n-folds cross-validation on top of an augmented data with noisy features
    iteratively. At each n-fold of cross-validation of each iteration, the best number of boosting
    rounds will be found to overcome the possibility of over-fitting, and the feature importance of
    the best trained model will be used to select the features. Finally, the frequency of the
    features that showed up at each feature importance phase of each cross-validation fold of each
    iteration will be the benchmark of feature selection. In principle, the maximum frequency of
    each feature can be `n_iter` times `n_splits`.

    Parameters
    ----------
    n_iter : int, optional
        Number of iterations to repeat the feature selection algorithm, by default 3

    num_boost_round : int, optional
        Number of boosting rounds to fit a model, by default 200

    n_splits : int, optional
        Number of folds for cross-validation, by default 4

    metrics : str, optional
        Metrics to be tracked at cross-validation fitting time depending on the task
        (classification vs regression) with possible values of "auc", "aucpr", "error", "logloss",
        "rmse", "rmsle", "mae". Note this is different than `eval_metric` that needs to be passed
        to the `params` dict, by default "auc"

    early_stopping_rounds : int, optional
        The criterion to early abort the ``xgboost.cv()`` phase if the test metric is not improved,
        by default 20

    random_state : int, optional
        Random seed number, by default 1367

    stratified : bool, optional
        Whether to use stratification of the targets (only available for classification tasks) to
        run ``xgboost.cv()`` to find the best number of boosting rounds at each fold of each
        iteration, by default True

    shuffle : bool, optional
        Whether to shuffle data to have the ability of building stratified folds in
        ``xgboost.cv()``, by default True

    sparse_matrix : bool, optional
        Whether to convert the input features to a sparse matrix with csr format or not. This would
        increase the speed of feature selection for relatively large/sparse datasets. Consequently,
        this would actually act like an un-optimized solution for a dense feature matrix.
        Additionally, this parameter cannot be used along with ``scale_mean=True``, since
        standardizing the feature matrix to have a mean value of zero would turn the feature matrix
        into a dense matrix. Therefore, our API bans this combination by default, by default False

    scale_mean : bool, optional
        Whether to standardize the feature matrix to have a mean value of zero per feature (center
        the features before scaling). As laid out in ``sparse_matrix``, ``scale_mean=False`` when
        using ``sparse_matrix=True``, since centering the feature matrix would decrease the
        sparsity, and in practice it does not make sense to use the sparse matrix method since it
        would make it worse. The ``StandardScaler`` object can be accessed via ``cls.scaler_`` if
        ``scale_mean`` or ``scale_std`` is used unless it is ``None``, by default False

    scale_std : bool, optional
        Whether to scale the feature matrix to have unit variance (or equivalently, unit standard
        deviation) per feature. The ``StandardScaler`` object can be accessed via ``cls.scaler_``
        if ``scale_mean`` or ``scale_std`` is used unless it is ``None``, by default False

    nth_noise_threshold : int, optional
        The threshold to keep all the features up to the `n-th` noisy feature at each fold of each
        iteration. For example, for a feature selection with 4 iterations and 5-folds cv, the
        maximum number of noisy features would be 4*5=20, by default 1

    importance_type : str, optional
        Importance type of ``xgboost.train()`` with possible values ``"weight"``, ``"gain"``,
        ``"total_gain"``, ``"cover"``, ``"total_cover"``, by default "total_gain"

    params : Dict[str, Union[str, float, int]], optional
        Set of parameters required for fitting a Booster, by default for a classification task
        {"eval_metric": "auc", "tree_method": "hist", "objective": "binary:logistic",
        "learning_rate": 0.05, "max_depth": 2, "min_child_weight": 1, "gamma": 0.0,
        "reg_alpha": 0.0, "reg_lambda": 1.0, "subsample": 0.9, "max_delta_step": 1,
        "verbosity": 0, "nthread": 4, "scale_pos_weight": 1} and by default for any regression
        task {"eval_metric": "rmse", "tree_method": "hist", "objective": "reg:squarederror",
        "learning_rate": 0.05, "max_depth": 2, "min_child_weight": 1, "gamma": 0.0,
        "reg_alpha": 0.0, "reg_lambda": 1.0, "subsample": 0.9, "max_delta_step": 1,
        "verbosity": 0, "nthread": 4}. Other options for objective: ``"reg:logistic"``,
        ``"reg:squaredlogerror"``

    verbose_eval : bool, optional
        Whether to show the results of `xgboost.train()` on train/test sets using `eval_metric`,
        by default False

    callbacks : bool, optional
        Whether to log the standard deviation of metrics on train data and track the early
        stopping criterion, by default False

    Methods
    -------
    fit(X, y)
        Fits the main feature selection algorithm

    get_feature_frequency()
        Returns the total feature frequency of the best model

    get_feature_importance()
        Returns feature importance based on `importance_type` at each fold of each iteration of
        the selection process as a dict of dataframes

    get_cv_results()
        Returns the total internal/external cross-validation results

    plot_frequency()
        Visualizes the selected features frequency as a bar chart

    plot_cv_results()
        Visualizes the cross-validation results

    get_params()
        Returns the final set of train parameters

    get_default_params()
        Returns the default set of train parameters

    Attributes
    ----------
    feature_importance_
        Returns a dict of all feature importance dataframes based on `importance_type` at each
        fold of each iteration during the selection process

    feature_frequency_
        Returns a pandas.DataFrame consisting of the total frequency of each feature during the
        selection process

    cv_results_
        Returns a dict of the total internal/external cross-validation results

    plotting_cv_
        Returns the required elements to visualize the histograms of total internal/external
        cross-validation results

    References
    ----------
    .. [xgboost-api] https://xgboost.readthedocs.io/en/latest/python/python_api.html
    .. [markers-api] https://matplotlib.org/stable/api/markers_api.html
    .. [seaborn-distplot-deprecation] https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
    """

    n_iter: Optional[int] = 3
    n_splits: Optional[int] = 4
    metrics: Optional[str] = "auc"
    num_boost_round: Optional[int] = 200
    sparse_matrix: Optional[bool] = False
    scale_mean: Optional[bool] = False
    scale_std: Optional[bool] = False
    early_stopping_rounds: Optional[int] = 20
    nth_noise_threshold: Optional[int] = 1
    random_state: Optional[int] = 1367
    importance_type: Optional[str] = "total_gain"
    stratified: Optional[bool] = True
    shuffle: Optional[bool] = True
    verbose_eval: Optional[bool] = False
    params: Optional[Dict[str, Union[str, float, int]]] = None
    callbacks: Optional[bool] = False
    def __post_init__(self) -> None:
        """Post instantiation validations and assignments."""
        super().__post_init__()
        check_var(self.n_iter, var_name="n_iter", dtypes=int)
        check_var(self.n_splits, var_name="n_splits", dtypes=int)
        check_var(
            self.metrics,
            var_name="metrics",
            dtypes=str,
            values=("auc", "aucpr", "error", "logloss", "rmse", "rmsle", "mae"),
        )
        check_var(self.early_stopping_rounds, var_name="early_stopping_rounds", dtypes=int)
        check_var(self.random_state, var_name="random_state", dtypes=int)
        check_var(self.nth_noise_threshold, var_name="nth_noise_threshold", dtypes=int)
        check_var(self.stratified, var_name="stratified", dtypes=bool)
        check_var(self.shuffle, var_name="shuffle", dtypes=bool)
        check_var(self.verbose_eval, var_name="verbose_eval", dtypes=bool)
        check_var(self.callbacks, var_name="callbacks", dtypes=bool)
        self._callbacks()

        # The default set of params can be updated based on the given params by user
        _default_params = self._default_params()
        if self.params is not None:
            check_var(self.params, var_name="params", dtypes=dict)
            _default_params.update(self.params)
            self.params = _default_params
        else:
            self.params = _default_params
[docs] def fit( self, X: Union[pd.DataFrame, np.ndarray], y: Union[List[float], np.ndarray, pd.Series], ) -> None: """Fits the main feature selection algorithm. Parameters ---------- X_train : Union[pd.DataFrame, np.ndarray] Input data for training (features) y_train : Union[List[float], np.ndarray, pd.Series] Input ground truth for training (targets) Returns ------- None """ self._check_X_y( X=X, y=y, ) self.cv_results_ = defaultdict(list) # type: ignore self.feature_importance_ = {} self.selected_features = [] # main algorithm loop for iteration in range(self.n_iter): # type: ignore print( str(Colors.BOLD) + "*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-* " + str(Colors.B_Green) + f"Iteration {iteration + 1}" + str(Colors.END) + str(Colors.BOLD) + " *-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*", ) # internal/external cross-validation results at each iteration int_cv_train2 = [] int_cv_test2 = [] ext_cv_train2 = [] ext_cv_test2 = [] # update random state self._random_state = self.random_state + iteration # type: ignore # add noisy featuers by permutation based on targets _X_permuted = add_noisy_features( X=self.X, random_state=self._random_state, prefix="noisy", ) _columns, _X_permuted_values = _X_permuted.columns.tolist(), _X_permuted.values # k-folds cross-validation (stratified only for classifications) if self.metrics in self._clf_metrics(): cv = StratifiedKFold( n_splits=self.n_splits, shuffle=self.shuffle, random_state=self._random_state, ) else: cv = KFold( n_splits=self.n_splits, shuffle=self.shuffle, random_state=self._random_state, ) # set a counter for nfolds cv ijk = 1 for train_index, test_index in cv.split(_X_permuted_values, self.y): _X_train, _X_test = pd.DataFrame( data=_X_permuted_values[train_index], columns=_columns, ), pd.DataFrame( data=_X_permuted_values[test_index], columns=_columns, ) _y_train, _y_test = self.y[train_index], self.y[test_index] # _dtrain / _dtest goes here self.dtrain_, self.dtest_ = self._dtrain( X_train=_X_train, y_train=_y_train, ), self._dtest( X_test=_X_test, y_test=_y_test, ) # watchlist during final training self._watchlist = [ (self.dtrain_, "train"), (self.dtest_, "eval"), ] # store training results self._evals_result: Dict[str, Any] = {} # call xgb cv self._cvr = self._cv() # append cv results self.cv_results_["int_cv_train"] += [self._cvr.iloc[-1][0]] self.cv_results_["int_cv_test"] += [self._cvr.iloc[-1][2]] # append temp cv results int_cv_train2.append(self._cvr.iloc[-1][0]) int_cv_test2.append(self._cvr.iloc[-1][2]) # store best trained model self._best_model = self._model() # store feature gain _feature_gain = self._xgb_imp_to_df() self.feature_importance_[f"model_iter{iteration+1}_fold{ijk}"] = _feature_gain # check wheather noisy feature is being selected if _feature_gain["feature"].str.contains("noisy").sum() != 0: _gain_threshold = _feature_gain.loc[ _feature_gain["feature"].str.contains("noisy"), self.importance_type, ].values.tolist()[ self.nth_noise_threshold - 1 # type: ignore ] else: _gain_threshold = 0.0 # select features where their gain > _gain_threshold self.selected_features.extend( _feature_gain.loc[ _feature_gain[self.importance_type] > _gain_threshold, "feature", ].values.tolist(), ) # final eval results for train/test external cross-validation if self.params is not None and isinstance(self.params["eval_metric"], str): self.cv_results_["ext_cv_train"] += [ self._evals_result["train"][self.params["eval_metric"]][-1], ] self.cv_results_["ext_cv_test"] += [ self._evals_result["eval"][self.params["eval_metric"]][-1], ] 
ext_cv_train2.append( self._evals_result["train"][self.params["eval_metric"]][-1], ) ext_cv_test2.append( self._evals_result["eval"][self.params["eval_metric"]][-1], ) # TODO(amir): ditch print with logging print( str(Colors.BOLD) + "*-*-*-*-*-*-*-*-*-*-*-* " + str(Colors.F_Green) + f"Fold = {ijk}/{self.n_splits}" + str(Colors.F_Black) + " -- " + str(Colors.F_Red) + f"Train {self.params['eval_metric'].upper()}" + " = " + f"{self._evals_result['train'][self.params['eval_metric']][-1]:.3f}" + str(Colors.F_Black) + " -- " + str(Colors.F_Blue) + f"Test {self.params['eval_metric'].upper()}" + " = " + f"{self._evals_result['eval'][self.params['eval_metric']][-1]:.3f}" + str(Colors.END) + str(Colors.BOLD) + " *-*-*-*-*-*-*-*-*-*-*-*", ) # free memory here at each fold del ( self._best_model, self._watchlist, self.dtrain_, self.dtest_, self._cvr, self._evals_result, _feature_gain, _X_train, _y_train, _X_test, _y_test, ) ijk += 1 gc.collect() # print internal metrics results # TODO(amir): replace print with logging if self.metrics is not None and self.n_splits is not None: print( str(Colors.BOLD) + "*-*-* " + str(Colors.GREEN) + f"Internal {self.n_splits}-Folds CV:" + str(Colors.END) + str(Colors.BOLD) + " -*-*- " + str(Colors.F_Red) + f"Train {self.metrics.upper()}" + " = " + f"{np.mean(int_cv_train2):.3f}" + " +/- " + f"{np.std(int_cv_train2):.3f}" + str(Colors.END) + str(Colors.BOLD) + " -*-*- " + str(Colors.F_Blue) + f"Test {self.metrics.upper()}" + " = " + f"{np.mean(int_cv_test2):.3f}" + " +/- " + f"{np.std(int_cv_test2):.3f}" + str(Colors.END) + str(Colors.BOLD) + " *-*-*", ) # print external eval_metric results # TODO(amir): replace print with logging if self.params is not None and isinstance(self.params["eval_metric"], str): print( str(Colors.BOLD) + "*-*-* " + str(Colors.GREEN) + f"External {self.n_splits}-Folds CV:" + str(Colors.END) + str(Colors.BOLD) + " -*-*- " + str(Colors.F_Red) + f"Train {self.params['eval_metric'].upper()}" + " = " + f"{np.mean(ext_cv_train2):.3f}" + " +/- " + f"{np.std(ext_cv_train2):.3f}" + str(Colors.END) + str(Colors.BOLD) + " -*-*- " + str(Colors.F_Blue) + f"Test {self.params['eval_metric'].upper()}" + " = " + f"{np.mean(ext_cv_test2):.3f}" + " +/- " + f"{np.std(ext_cv_test2):.3f}" + str(Colors.END) + str(Colors.BOLD) + " *-*-*\n", ) # free memory here at iteration del ( int_cv_train2, int_cv_test2, ext_cv_train2, ext_cv_test2, _X_permuted, _X_permuted_values, _columns, ) gc.collect() self.plotting_cv_ = self._plotting_cv() self.feature_frequency_ = self._freq() return None
    def plot_frequency(
        self,
        *,
        figsize: Optional[Tuple[Union[int, float], Union[int, float]]] = (8, 4),
        show_freq_pct: Optional[bool] = True,
        color: Optional[str] = "#87CEEB",
        marker: Optional[str] = "o",
        markersize: Optional[Union[int, float]] = 10,
        markeredgecolor: Optional[str] = "#1F77B4",
        markerfacecolor: Optional[str] = "#1F77B4",
        markeredgewidth: Optional[Union[int, float]] = 1,
        fontsize: Optional[Union[int, float]] = 12,
        save_path: Optional[str] = None,
        display_plot: Optional[bool] = True,
        return_fig: Optional[bool] = False,
    ) -> Optional[Figure]:
        """Visualizes the selected features frequency as a bar chart.

        Notes
        -----
        This plotting function can be used along with the ``feature_frequency_`` attribute of any
        frequency-based feature selection algorithm such as ``XGBoostFeatureSelector``.

        Parameters
        ----------
        figsize : tuple, optional
            Figure size, by default (8, 4)

        show_freq_pct : bool, optional
            Whether to show the features frequency in percent, by default True

        color : str, optional
            Color of the horizontal lines of lollipops, by default "#87CEEB"

        marker : str, optional
            Marker style of the lollipops. More valid marker styles can be found at
            [markers-api]_, by default "o"

        markersize : Union[int, float], optional
            Markersize, by default 10

        markeredgecolor : str, optional
            Marker edge color, by default "#1F77B4"

        markerfacecolor : str, optional
            Marker face color, by default "#1F77B4"

        markeredgewidth : Union[int, float], optional
            Marker edge width, by default 1

        fontsize : Union[int, float], optional
            Fontsize for xlabel and ylabel, and ticks parameters, by default 12

        save_path : str, optional
            The full or relative path to save the plot including the image format such as
            "myplot.png" or "../../myplot.pdf", by default None

        display_plot : bool, optional
            Whether to show the plot, by default True

        return_fig : bool, optional
            Whether to return figure object, by default False

        Returns
        -------
        Figure, optional
        """
        return plot_xfs_feature_frequency(
            freq=self.feature_frequency_,
            figsize=figsize,
            show_freq_pct=show_freq_pct,
            color=color,
            marker=marker,
            markersize=markersize,
            markeredgecolor=markeredgecolor,
            markerfacecolor=markerfacecolor,
            markeredgewidth=markeredgewidth,
            fontsize=fontsize,
            save_path=save_path,
            display_plot=display_plot,
            return_fig=return_fig,
        )
    def plot_cv_results(
        self,
        *,
        figsize: Optional[Tuple[Union[int, float], Union[int, float]]] = (10, 8),
        internalcvcolor: Optional[str] = "#4169E1",
        externalcvcolor: Optional[str] = "#8A2BE2",
        sharex: Optional[bool] = False,
        sharey: Optional[bool] = False,
        save_path: Optional[str] = None,
        display_plot: Optional[bool] = True,
        return_fig: Optional[bool] = False,
    ) -> Optional[Figure]:
        """Visualizes the cross-validation results.

        Notes
        -----
        It visualizes the internal and external cross-validation performance during the selection
        process. The `internal` refers to the performance of the train/test folds during the
        ``xgboost.cv()`` using ``metrics`` to help find the best number of boosting rounds, while
        the `external` refers to the performance of ``xgboost.train()`` based on the watchlist
        using ``eval_metric``. The required plotting elements are taken from the ``plotting_cv_``
        attribute of ``XGBoostFeatureSelector``. Additionally, `sns.distplot` was previously used,
        which is now deprecated. More details in [seaborn-distplot-deprecation]_.

        Parameters
        ----------
        figsize : tuple, optional
            Figure size, by default (10, 8)

        internalcvcolor : str, optional
            Color of the histograms for internal cv results, by default "#4169E1"

        externalcvcolor : str, optional
            Color of the histograms for external cv results, by default "#8A2BE2"

        sharex : bool, optional
            Whether to share "X" axis for each column of subplots, by default False

        sharey : bool, optional
            Whether to share "Y" axis for each row of subplots, by default False

        save_path : str, optional
            The full or relative path to save the plot including the image format such as
            "myplot.png" or "../../myplot.pdf", by default None

        display_plot : bool, optional
            Whether to show the plot, by default True

        return_fig : bool, optional
            Whether to return figure object, by default False

        Returns
        -------
        Figure, optional
        """
        return plot_xfs_cv_results(
            figsize=figsize,
            internalcvcolor=internalcvcolor,
            externalcvcolor=externalcvcolor,
            sharex=sharex,
            sharey=sharey,
            save_path=save_path,
            display_plot=display_plot,
            return_fig=return_fig,
            **self.plotting_cv_,
        )
    def get_params(self) -> Optional[Dict[str, Union[str, float, int]]]:
        """Returns the final set of train parameters.

        The default set of parameters will be updated with the new ones that passed to ``params``.

        See Also
        --------
        :meth:`get_default_params()`

        Returns
        -------
        Dict[str, Union[str, float, int]]
        """
        return self.params

    def get_default_params(self) -> Dict[str, Union[str, float, int]]:
        """Returns the default set of train parameters.

        The default set of parameters will be used when ``params=None``.

        See Also
        --------
        :meth:`get_params()`

        Returns
        -------
        Dict[str, Union[str, float, int]]
        """
        return self._default_params()

    def get_feature_importance(self) -> Dict[str, pd.DataFrame]:
        """Returns the feature importance of the trained booster based on the given ``importance_type``.

        Returns
        -------
        Dict[str, pd.DataFrame]
        """
        return self.feature_importance_

    def get_feature_frequency(self) -> pd.DataFrame:
        """Returns the total feature frequency of the best model at each fold of each iteration.

        Returns
        -------
        pd.DataFrame
        """
        return self.feature_frequency_

    def get_cv_results(self) -> pd.DataFrame:
        """Returns internal and external cross-validation results.

        Returns
        -------
        pd.DataFrame
        """
        return pd.DataFrame(data=self.cv_results_)
def _dtrain( self, X_train: Union[pd.DataFrame, np.ndarray], y_train: Union[List[float], np.ndarray, pd.Series], ) -> xgb.DMatrix: """Returns a proper dtrain matrix compatible with sparse/standardized matrices. Parameters ---------- X_train : Union[pd.DataFrame, np.ndarray] Input data for training (features) y_train : Union[List[float], np.ndarray, pd.Series] Input ground truth for training (targets) See Also -------- :meth:`_dtest()` Returns ------- xgb.DMatrix """ if self.scale_mean or self.scale_std: self._scaler = StandardScaler( copy=True, with_mean=self.scale_mean, with_std=self.scale_std, ) _X_train = pd.DataFrame( self._scaler.fit_transform(X_train), columns=X_train.columns.tolist(), ) else: self._scaler = None _X_train = X_train if not self.sparse_matrix: dtrain = xgb.DMatrix( data=_X_train, label=y_train, ) else: dtrain = xgb.DMatrix( data=df_to_csr( _X_train, fillna=0.0, verbose=False, ), label=y_train, feature_names=_X_train.columns.tolist(), ) return dtrain def _dtest( self, X_test: Union[pd.DataFrame, np.ndarray], y_test: Optional[Union[List[float], np.ndarray, pd.Series]] = None, ) -> xgb.DMatrix: """Returns a proper dtest matrix compatible with sparse/standardized matrices. If ``scale_mean=True`` or ``scale_std=True``, the ``StandardScaler`` object ``(scaler_)`` which is being fitted on ``X_train`` will be used to **only** transform ``X_test`` to make sure there is no data leak in the transformation. Additionally, ``y_test`` is optional since it might not be available while validating the model (inference). Parameters ---------- X_test : Union[pd.DataFrame, np.ndarray] Input data for testing (features) y_test : Union[List[float], np.ndarray, pd.Series], optional Input ground truth for testing (targets) See Also -------- :meth:`_dtrain()` Returns ------- xgb.DMatrix """ if self.scale_mean or self.scale_std: _X_test = pd.DataFrame( self._scaler.transform(X_test), columns=X_test.columns.tolist(), ) else: _X_test = X_test if not self.sparse_matrix: dtest = xgb.DMatrix( data=_X_test, label=y_test, ) else: dtest = xgb.DMatrix( data=df_to_csr( _X_test, fillna=0.0, verbose=False, ), label=y_test, feature_names=_X_test.columns.tolist(), ) return dtest def _xgb_imp_to_df(self) -> pd.DataFrame: """Returns the feature importance dict object into a pandas dataframe. Returns ------- pd.DataFrame """ data: Dict[str, List[float]] = { "feature": [], f"{self.importance_type}": [], } features_gain = self._best_model.get_score(importance_type=self.importance_type) for key, val in features_gain.items(): data["feature"].append(key) data[f"{self.importance_type}"].append(val) return ( pd.DataFrame(data) .sort_values( by=f"{self.importance_type}", ascending=False, ) .reset_index(drop=True) ) def _cv(self) -> pd.DataFrame: """Returns XGBoost cross-validation results to find the best number of boosting rounds. 
Returns ------- pd.DataFrame """ if self.metrics in self._clf_metrics(): return xgb.cv( params=self.params, dtrain=self.dtrain_, num_boost_round=self.num_boost_round, nfold=self.n_splits, stratified=self.stratified, metrics=self.metrics, early_stopping_rounds=self.early_stopping_rounds, seed=self._random_state, verbose_eval=self.verbose_eval, shuffle=self.shuffle, callbacks=self.callbacks, as_pandas=True, ) else: return xgb.cv( params=self.params, dtrain=self.dtrain_, num_boost_round=self.num_boost_round, nfold=self.n_splits, metrics=self.metrics, early_stopping_rounds=self.early_stopping_rounds, seed=self._random_state, verbose_eval=self.verbose_eval, shuffle=self.shuffle, callbacks=self.callbacks, as_pandas=True, ) def _model(self) -> xgb.Booster: """Returns the trained `xgb.Booster` model based on the best number of boosting round. Returns ------- xgb.Booster """ return xgb.train( params=self.params, dtrain=self.dtrain_, num_boost_round=len(self._cvr) - 1, evals=self._watchlist, evals_result=self._evals_result, verbose_eval=self.verbose_eval, ) def _freq(self) -> pd.DataFrame: """Returns feature frequency of the selected features. Returns ------- pd.DataFrame """ unique_elements, counts_elements = np.unique( self.selected_features, return_counts=True, ) feature_frequency = pd.DataFrame( data={ "Feature": list(unique_elements), "Frequency": [float(i) for i in list(counts_elements)], }, ) if self.n_splits is not None and self.n_iter is not None: feature_frequency["Frequency (%)"] = round( (feature_frequency["Frequency"] / float(self.n_splits * self.n_iter) * 100), ndigits=2, ) return feature_frequency.sort_values( by=["Frequency", "Frequency (%)"], ascending=[False, False], ).reset_index(drop=True) # TODO(amir): investigate more for other callback options ? def _callbacks(self) -> None: """Returns a list of callbacks. The implemented callbacks are including ``xgboost.callback.EvaluationMonitor`` and ``xgboost.callback.EarlyStopping`` [callback-api]_. Returns ------- None """ if self.callbacks: # TODO(amir): ditch print with logger print( "Warning: The `cv` will break if the `early_stopping_rounds` criterion was not satisfied.", ) # TODO(amir): use type overload self.callbacks = [ # type: ignore xgb.callback.EvaluationMonitor( rank=0, period=1, show_stdv=True, ), xgb.callback.EarlyStopping( rounds=self.early_stopping_rounds, ), ] else: self.callbacks = None return None def _default_params(self) -> Dict[str, Union[str, float, int]]: """Default set of parameters when the class is being instantiated with ``params=None``. Notes ----- The default set of parameters would be a little bit different depends on the type of selection whether a classification or regression `metric` is being used. Returns ------- Dict[str, Union[str, float, int]] """ if self.metrics in self._clf_metrics(): return { "eval_metric": "auc", "tree_method": "hist", "objective": "binary:logistic", "learning_rate": 0.05, "max_depth": 2, "min_child_weight": 1, "gamma": 0.0, "reg_alpha": 0.0, "reg_lambda": 1.0, "subsample": 0.9, "max_delta_step": 1, "verbosity": 0, "nthread": 4, "scale_pos_weight": 1, } else: return { "eval_metric": "rmse", "tree_method": "hist", "objective": "reg:squarederror", "learning_rate": 0.05, "max_depth": 2, "min_child_weight": 1, "gamma": 0.0, "reg_alpha": 0.0, "reg_lambda": 1.0, "subsample": 0.9, "max_delta_step": 1, "verbosity": 0, "nthread": 4, } def _plotting_cv(self) -> Dict[str, Any]: """Returns the required elements for plotting cross-validation results. 
Returns ------- Dict[str, Any] """ p: Dict[str, Any] = {} if ( self.metrics is not None and self.params is not None and isinstance(self.params["eval_metric"], str) ): p["metric"] = self.metrics.upper() p["eval_metric"] = self.params["eval_metric"].upper() p["n_splits"] = self.n_splits p["int_cv_train"] = self.cv_results_["int_cv_train"] p["int_cv_test"] = self.cv_results_["int_cv_test"] p["ext_cv_train"] = self.cv_results_["ext_cv_train"] p["ext_cv_test"] = self.cv_results_["ext_cv_test"] return p def _clf_metrics(self) -> Set[str]: """Returns the default classification metrics. Returns ------- Set[str] """ return { "auc", "aucpr", "error", "logloss", }
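
For reference, a minimal usage sketch of the selector defined above. The dataset (scikit-learn's ``load_breast_cancer``), the parameter values, and the public import path ``slickml.selection.XGBoostFeatureSelector`` are illustrative assumptions, not part of this module; the call pattern itself follows the ``fit``/getter/plot methods documented in the class docstring.

import pandas as pd
from sklearn.datasets import load_breast_cancer

from slickml.selection import XGBoostFeatureSelector  # assumed public import path

# illustrative data: any pandas DataFrame of features plus a binary target works here
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

# run the frequency-based selection: with 3 iterations and 4 CV folds,
# each feature can show up at most n_iter * n_splits = 12 times
xfs = XGBoostFeatureSelector(n_iter=3, n_splits=4, metrics="auc")
xfs.fit(X, y)

# features ranked by how often their importance beat the noisy-feature threshold
print(xfs.get_feature_frequency())

# internal (xgboost.cv) vs external (xgboost.train) cross-validation metrics
print(xfs.get_cv_results())

# optional visualizations
xfs.plot_frequency()
xfs.plot_cv_results()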