Source code for slickml.classification._xgboostcv

from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
import xgboost as xgb
from matplotlib.figure import Figure

from slickml.classification._xgboost import XGBoostClassifier
from slickml.utils import Colors, check_var
from slickml.visualization import plot_xgb_cv_results


# TODO(amir): currently there is a bug in `sphinx-autoapi` that ignores the doc for inherited classes
# https://github.com/readthedocs/sphinx-autoapi/issues/272
# for now, I have turned on `"private-members"`
# TODO(amir): add the functionality to receive multiple metrics as `List[str]` as well
@dataclass
class XGBoostCVClassifier(XGBoostClassifier):
    """XGBoost CV Classifier.

    This is a wrapper around ``XGBoostClassifier`` to train an XGBoost [xgboost-api]_ model using
    the optimum number of boosting rounds found from the inputs. It runs ``xgboost.cv()`` with
    n-folds cross-validation and trains the final model with the best number of boosting rounds
    to avoid over-fitting.

    Parameters
    ----------
    num_boost_round : int, optional
        Number of boosting rounds to fit a model, by default 200

    n_splits : int, optional
        Number of folds for cross-validation, by default 4

    metrics : str, optional
        Metrics to be tracked at cross-validation fitting time with possible values of "auc",
        "aucpr", "error", "logloss". Note that this is different from ``eval_metric``, which
        needs to be passed to the ``params`` dict, by default "auc"

    early_stopping_rounds : int, optional
        The criterion to early abort the ``xgboost.cv()`` phase if the test metric is not
        improved, by default 20

    random_state : int, optional
        Random seed number, by default 1367

    stratified : bool, optional
        Whether to use stratification of the targets to run ``xgboost.cv()`` to find the best
        number of boosting rounds at each fold of each iteration, by default True

    shuffle : bool, optional
        Whether to shuffle data to have the ability of building stratified folds in
        ``xgboost.cv()``, by default True

    sparse_matrix : bool, optional
        Whether to convert the input features to a sparse matrix with csr format or not. This
        would increase the speed of feature selection for relatively large/sparse datasets.
        Consequently, this would actually act like an un-optimized solution for a dense feature
        matrix. Additionally, this feature cannot be used along with ``scale_mean=True``, since
        standardizing the feature matrix to have a mean value of zero would turn the feature
        matrix into a dense matrix. Therefore, by default our API banned this feature,
        by default False

    scale_mean : bool, optional
        Whether to standardize the feature matrix to have a mean value of zero per feature
        (center the features before scaling). As laid out in ``sparse_matrix``,
        ``scale_mean=False`` when using ``sparse_matrix=True``, since centering the feature
        matrix would decrease the sparsity and in practice it does not make any sense to use a
        sparse matrix method and it would make it worse. The ``StandardScaler`` object can be
        accessed via ``cls.scaler_`` if ``scale_mean`` or ``scale_std`` is used unless it is
        ``None``, by default False

    scale_std : bool, optional
        Whether to scale the feature matrix to have unit variance (or equivalently, unit standard
        deviation) per feature. The ``StandardScaler`` object can be accessed via ``cls.scaler_``
        if ``scale_mean`` or ``scale_std`` is used unless it is ``None``, by default False

    importance_type : str, optional
        Importance type of ``xgboost.train()`` with possible values ``"weight"``, ``"gain"``,
        ``"total_gain"``, ``"cover"``, ``"total_cover"``, by default "total_gain"

    params : Dict[str, Union[str, float, int]], optional
        Set of parameters required for fitting a Booster, by default {"eval_metric": "auc",
        "tree_method": "hist", "objective": "binary:logistic", "learning_rate": 0.05,
        "max_depth": 2, "min_child_weight": 1, "gamma": 0.0, "reg_alpha": 0.0, "reg_lambda": 1.0,
        "subsample": 0.9, "max_delta_step": 1, "verbosity": 0, "nthread": 4,
        "scale_pos_weight": 1}

    verbose : bool, optional
        Whether to log the final results of ``xgboost.cv()``, by default True

    callbacks : bool, optional
        Whether to log the standard deviation of metrics on train data and track the early
        stopping criterion, by default False

    Methods
    -------
    fit(X_train, y_train)
        Fits a ``XGBoost.Booster`` to input training data. A proper ``dtrain_`` matrix based on
        the chosen options i.e. ``sparse_matrix``, ``scale_mean``, ``scale_std`` is being created
        based on the passed ``X_train`` and ``y_train``

    predict_proba(X_test, y_test)
        Returns prediction probabilities for the positive class. ``predict_proba()`` only reports
        the probability of the positive class, while the sklearn API returns probabilities for
        both classes and slicing like ``pred_proba[:, 1]`` is needed for positive class
        predictions. Additionally, ``y_test`` is optional since the targets might not be
        available at validation (inference) time

    predict(X_test, y_test, threshold=0.5)
        Returns prediction classes based on the threshold. The default ``threshold=0.5`` might
        not give you the best results, while you can find the optimum thresholds based on
        different algorithms including the Youden Index, maximizing the area under the
        sensitivity-specificity curve, and maximizing the area under the precision-recall curve
        by using ``BinaryClassificationMetrics``

    get_cv_results()
        Returns the mean value of the metrics in ``n_splits`` cross-validation for each boosting
        round

    get_params()
        Returns the final set of train parameters. The default set of parameters will be updated
        with the new ones that are passed to ``params``

    get_default_params()
        Returns the default set of train parameters. The default set of parameters will be used
        when ``params=None``

    get_feature_importance()
        Returns the feature importance of the trained booster based on the given
        ``importance_type``

    get_shap_explainer()
        Returns the ``shap.TreeExplainer``

    plot_cv_results()
        Visualizes cross-validation results

    plot_shap_summary()
        Visualizes Shapley values summary plot

    plot_shap_waterfall()
        Visualizes Shapley values waterfall plot

    Attributes
    ----------
    cv_results_ : pd.DataFrame
        The mean value of the metrics in ``n_splits`` cross-validation for each boosting round

    feature_importance_ : pd.DataFrame
        Features importance based on the given ``importance_type``

    scaler_ : StandardScaler, optional
        Standardization object when ``scale_mean=True`` or ``scale_std=True`` unless it is
        ``None``

    X_train_ : pd.DataFrame
        Fitted and transformed features when ``scale_mean=True`` or ``scale_std=True``.
        Otherwise, it will be the same as the passed ``X_train`` features

    X_test_ : pd.DataFrame
        Transformed features when ``scale_mean=True`` or ``scale_std=True`` using ``clf.scaler_``
        that has been fitted on the ``X_train`` and ``y_train`` data. Otherwise, it will be the
        same as the passed ``X_test`` features

    dtrain_ : xgb.DMatrix
        Training data matrix via ``xgboost.DMatrix(clf.X_train_, clf.y_train)``

    dtest_ : xgb.DMatrix
        Testing data matrix via ``xgboost.DMatrix(clf.X_test_, clf.y_test)`` or
        ``xgboost.DMatrix(clf.X_test_, None)`` when ``y_test`` is not available in inference

    shap_values_train_ : np.ndarray
        Shapley values from ``TreeExplainer`` using ``X_train_``

    shap_values_test_ : np.ndarray
        Shapley values from ``TreeExplainer`` using ``X_test_``

    shap_explainer_ : shap.TreeExplainer
        Shap TreeExplainer object

    model_ : xgboost.Booster
        XGBoost Booster object

    See Also
    --------
    :class:`slickml.classification.XGBoostClassifier`

    References
    ----------
    .. [callback-api] https://xgboost.readthedocs.io/en/latest/python/python_api.html#callback-api
    .. [linestyles-api] https://matplotlib.org/3.1.0/gallery/lines_bars_and_markers/linestyles.html
    """

    num_boost_round: Optional[int] = 200
    n_splits: Optional[int] = 4
    metrics: Optional[str] = "auc"
    early_stopping_rounds: Optional[int] = 20
    random_state: Optional[int] = 1367
    stratified: Optional[bool] = True
    shuffle: Optional[bool] = True
    sparse_matrix: Optional[bool] = False
    scale_mean: Optional[bool] = False
    scale_std: Optional[bool] = False
    importance_type: Optional[str] = "total_gain"
    params: Optional[Dict[str, Union[str, float, int]]] = None
    verbose: Optional[bool] = True
    callbacks: Optional[bool] = False
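
    # Illustrative (hypothetical) instantiation sketch, not part of the library: the defaults
    # above can be overridden at construction time, e.g. with a larger boosting budget and a
    # custom ``params`` dict whose keys are standard ``xgboost`` training parameters as
    # documented in the class docstring:
    #
    #   clf = XGBoostCVClassifier(
    #       num_boost_round=500,
    #       metrics="aucpr",
    #       params={"eval_metric": "aucpr", "max_depth": 3, "learning_rate": 0.01},
    #   )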

    def __post_init__(self) -> None:
        """Post instantiation validations and assignments."""
        super().__post_init__()
        check_var(self.n_splits, var_name="n_splits", dtypes=int)
        check_var(
            self.metrics,
            var_name="metrics",
            dtypes=str,
            values=("auc", "aucpr", "error", "logloss"),
        )
        check_var(self.early_stopping_rounds, var_name="early_stopping_rounds", dtypes=int)
        check_var(self.random_state, var_name="random_state", dtypes=int)
        check_var(self.stratified, var_name="stratified", dtypes=bool)
        check_var(self.shuffle, var_name="shuffle", dtypes=bool)
        check_var(self.verbose, var_name="verbose", dtypes=bool)
        check_var(self.callbacks, var_name="callbacks", dtypes=bool)
        # replace the boolean `callbacks` flag with the actual xgboost callback objects (or None)
        self._callbacks()

    def fit(
        self,
        X_train: Union[pd.DataFrame, np.ndarray],
        y_train: Union[List[float], np.ndarray, pd.Series],
    ) -> None:
        """Fits a ``XGBoost.Booster`` to input training data based on the best number of
        boosting rounds.

        Parameters
        ----------
        X_train : Union[pd.DataFrame, np.ndarray]
            Input data for training (features)

        y_train : Union[List[float], np.ndarray, pd.Series]
            Input ground truth for training (targets)

        See Also
        --------
        :meth:`xgboost.cv()`
        :meth:`xgboost.train()`

        Returns
        -------
        None
        """
        # build the training DMatrix (with optional scaling/sparsification as configured),
        # run n-fold cross-validation to pick the best number of boosting rounds, then train
        # the final booster on the full training data
        self.dtrain_ = self._dtrain(
            X_train=X_train,
            y_train=y_train,
        )
        self.cv_results_ = self._cv()
        if self.verbose:
            self._verbose_log()
        self.model_ = self._model()
        self.feature_importance_ = self._imp_to_df()

        return None

    def get_cv_results(self) -> pd.DataFrame:
        """Returns cross-validation results.

        Returns
        -------
        pd.DataFrame
        """
        return self.cv_results_

    def plot_cv_results(
        self,
        figsize: Optional[Tuple[Union[int, float], Union[int, float]]] = (8, 5),
        linestyle: Optional[str] = "--",
        train_label: Optional[str] = "Train",
        test_label: Optional[str] = "Test",
        train_color: Optional[str] = "navy",
        train_std_color: Optional[str] = "#B3C3F3",
        test_color: Optional[str] = "purple",
        test_std_color: Optional[str] = "#D0AAF3",
        save_path: Optional[str] = None,
        display_plot: Optional[bool] = False,
        return_fig: Optional[bool] = False,
    ) -> Optional[Figure]:
        """Visualizes the cross-validation results and the evolution of metrics through the
        number of boosting rounds.

        Parameters
        ----------
        figsize : Tuple[Union[int, float], Union[int, float]], optional
            Figure size, by default (8, 5)

        linestyle : str, optional
            Style of lines [linestyles-api]_, by default "--"

        train_label : str, optional
            Label in the figure legend for the train line, by default "Train"

        test_label : str, optional
            Label in the figure legend for the test line, by default "Test"

        train_color : str, optional
            Color of the training line, by default "navy"

        train_std_color : str, optional
            Edge color of the training std bars, by default "#B3C3F3"

        test_color : str, optional
            Color of the testing line, by default "purple"

        test_std_color : str, optional
            Edge color of the testing std bars, by default "#D0AAF3"

        save_path : str, optional
            The full or relative path to save the plot including the image format such as
            "myplot.png" or "../../myplot.pdf", by default None

        display_plot : bool, optional
            Whether to show the plot, by default False

        return_fig : bool, optional
            Whether to return the figure object, by default False

        Returns
        -------
        Figure, optional
        """
        return plot_xgb_cv_results(
            cv_results=self.cv_results_,
            figsize=figsize,
            linestyle=linestyle,
            train_label=train_label,
            test_label=test_label,
            train_color=train_color,
            train_std_color=train_std_color,
            test_color=test_color,
            test_std_color=test_std_color,
            save_path=save_path,
            display_plot=display_plot,
            return_fig=return_fig,
        )

    def _cv(self) -> pd.DataFrame:
        """Returns the XGBoost cv_results based on the best number of boosting rounds.

        Returns
        -------
        pd.DataFrame
        """
        return xgb.cv(
            params=self.params,
            dtrain=self.dtrain_,
            num_boost_round=self.num_boost_round,
            nfold=self.n_splits,
            stratified=self.stratified,
            metrics=self.metrics,
            early_stopping_rounds=self.early_stopping_rounds,
            seed=self.random_state,
            shuffle=self.shuffle,
            callbacks=self.callbacks,
            as_pandas=True,
        )

    def _model(self) -> xgb.Booster:
        """Fits a ``xgboost.Booster`` based on the best number of boosting rounds on the
        ``dtrain_`` matrix.

        Returns
        -------
        xgb.Booster
        """
        return xgb.train(
            params=self.params,
            dtrain=self.dtrain_,
            num_boost_round=len(self.cv_results_) - 1,
        )

    # TODO(amir): investigate more for other callback options ?
    def _callbacks(self) -> None:
        """Returns a list of callbacks.

        The implemented callbacks include ``xgboost.callback.EvaluationMonitor`` and
        ``xgboost.callback.EarlyStopping`` [callback-api]_.

        Returns
        -------
        None
        """
        if self.callbacks:
            # TODO(amir): we receive bool from user and define callbacks; so mypy complains
            # we prolly need to use type overloads here
            self.callbacks = [  # type: ignore
                xgb.callback.EvaluationMonitor(
                    rank=0,
                    period=1,
                    show_stdv=True,
                ),
                xgb.callback.EarlyStopping(
                    rounds=self.early_stopping_rounds,
                ),
            ]
        else:
            self.callbacks = None

        return None

    # TODO(amir): ditch print with logger
    def _verbose_log(self) -> None:
        """Logs n-folds cross-validation results.

        Returns
        -------
        None
        """
        if self.metrics is not None:
            # `cv_results_` columns follow the `xgboost.cv()` ordering:
            # [train-{metric}-mean, train-{metric}-std, test-{metric}-mean, test-{metric}-std]
            print(
                str(Colors.BOLD)
                + "*-* "
                + str(Colors.GREEN)
                + f"Best Boosting Round = {len(self.cv_results_) - 1}"
                + str(Colors.END)
                + str(Colors.BOLD)
                + " -*- "
                + str(Colors.F_Red)
                + f"{self.n_splits}-Folds CV {self.metrics.upper()}: "
                + str(Colors.END)
                + str(Colors.BOLD)
                + str(Colors.B_Blue)
                + f"Train = {self.cv_results_.iloc[-1][0]:.3f}"
                + " +/- "
                + f"{self.cv_results_.iloc[-1][1]:.3f}"
                + str(Colors.END)
                + str(Colors.BOLD)
                + " -*- "
                + str(Colors.B_Magenta)
                + f"Test = {self.cv_results_.iloc[-1][2]:.3f}"
                + " +/- "
                + f"{self.cv_results_.iloc[-1][3]:.3f}"
                + str(Colors.END)
                + str(Colors.BOLD)
                + " *-*",
            )
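

# The block below is an illustrative usage sketch and not part of the library itself. It assumes
# `scikit-learn` is installed and uses its breast-cancer dataset purely as example data; any
# binary-classification feature matrix and target vector would work the same way.
if __name__ == "__main__":
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split

    # example data: features as a DataFrame, binary targets as a Series
    X, y = load_breast_cancer(return_X_y=True, as_frame=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        stratify=y,
        random_state=1367,
    )

    # cross-validated fit: the best number of boosting rounds is picked via `xgboost.cv()`
    clf = XGBoostCVClassifier(num_boost_round=200, n_splits=4, metrics="auc")
    clf.fit(X_train, y_train)

    # mean/std of the tracked metric per boosting round, and positive-class probabilities
    cv_results = clf.get_cv_results()
    y_pred_proba = clf.predict_proba(X_test, y_test)

    # optional visualization of the cross-validation curves
    clf.plot_cv_results(display_plot=True)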