Source code for slickml.classification._xgboost

from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
import shap
import xgboost as xgb
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.figure import Figure
from sklearn.base import ClassifierMixin

from slickml.base import BaseXGBoostEstimator
from slickml.utils import check_var
from slickml.visualization import (
    plot_shap_summary,
    plot_shap_waterfall,
    plot_xgb_feature_importance,
)


# TODO(amir): add the functionality for multi-class classification as well
@dataclass
class XGBoostClassifier(BaseXGBoostEstimator, ClassifierMixin):
    """XGBoost Classifier.

    This is a wrapper that uses the XGBoost classifier to train an XGBoost [xgboost-api]_
    model with the number of boosting rounds from the inputs. This is also the base class
    for ``XGBoostCVClassifier``.

    Parameters
    ----------
    num_boost_round : int, optional
        Number of boosting rounds to fit a model, by default 200

    sparse_matrix : bool, optional
        Whether to convert the input features to sparse matrix with csr format or not. This
        would increase the speed of feature selection for relatively large/sparse datasets.
        Consequently, it would actually act like an unoptimized solution for a dense feature
        matrix. Additionally, this parameter cannot be used along with ``scale_mean=True``,
        since standardizing the feature matrix to have a mean value of zero would turn the
        feature matrix into a dense matrix. Therefore, by default our API bans this
        combination, by default False

    scale_mean : bool, optional
        Whether to standardize the feature matrix to have a mean value of zero per feature
        (center the features before scaling). As laid out in ``sparse_matrix``,
        ``scale_mean`` must be ``False`` when using ``sparse_matrix=True``, since centering
        the feature matrix would decrease its sparsity, and in practice it does not make any
        sense to combine the two; it would only make things worse. The ``StandardScaler``
        object can be accessed via ``cls.scaler_`` if ``scale_mean`` or ``scale_std`` is
        used unless it is ``None``, by default False

    scale_std : bool, optional
        Whether to scale the feature matrix to have unit variance (or equivalently, unit
        standard deviation) per feature. The ``StandardScaler`` object can be accessed via
        ``cls.scaler_`` if ``scale_mean`` or ``scale_std`` is used unless it is ``None``,
        by default False

    importance_type : str, optional
        Importance type of ``xgboost.train()`` with possible values ``"weight"``, ``"gain"``,
        ``"total_gain"``, ``"cover"``, ``"total_cover"``, by default "total_gain"

    params : Dict[str, Union[str, float, int]], optional
        Set of parameters required for fitting a Booster, by default {"eval_metric": "auc",
        "tree_method": "hist", "objective": "binary:logistic", "learning_rate": 0.05,
        "max_depth": 2, "min_child_weight": 1, "gamma": 0.0, "reg_alpha": 0.0,
        "reg_lambda": 1.0, "subsample": 0.9, "max_delta_step": 1, "verbosity": 0,
        "nthread": 4, "scale_pos_weight": 1}

    Methods
    -------
    fit(X_train, y_train)
        Fits a ``XGBoost.Booster`` to input training data. The proper ``dtrain_`` matrix,
        based on the chosen options i.e. ``sparse_matrix``, ``scale_mean``, ``scale_std``,
        is created from the passed ``X_train`` and ``y_train``

    predict_proba(X_test, y_test)
        Returns prediction probabilities for the positive class. ``predict_proba()`` only
        reports the probability of the positive class, while the sklearn API returns both
        classes, so slicing like ``pred_proba[:, 1]`` is needed there for positive class
        predictions. Additionally, ``y_test`` is optional since the targets might not be
        available in validation (inference)

    predict(X_test, y_test, threshold=0.5)
        Returns prediction classes based on the threshold. The default ``threshold=0.5``
        might not give you the best results; you can find the optimum threshold based on
        different algorithms including the Youden Index, maximizing the area under the
        sensitivity-specificity curve, and maximizing the area under the precision-recall
        curve by using ``BinaryClassificationMetrics``

    plot_feature_importance()
        Visualizes the XGBoost feature importance as a bar chart

    get_params()
        Returns the final set of train parameters. The default set of parameters will be
        updated with the new ones that were passed to ``params``

    get_default_params()
        Returns the default set of train parameters. The default set of parameters will be
        used when ``params=None``

    get_feature_importance()
        Returns the feature importance of the trained booster based on the given
        ``importance_type``

    get_shap_explainer()
        Returns the ``shap.TreeExplainer``

    plot_shap_summary()
        Visualizes Shapley values summary plot

    plot_shap_waterfall()
        Visualizes Shapley values waterfall plot

    Attributes
    ----------
    feature_importance_ : pd.DataFrame
        Features importance based on the given ``importance_type``

    scaler_ : StandardScaler, optional
        Standardization object when ``scale_mean=True`` or ``scale_std=True`` unless it is
        ``None``

    X_train_ : pd.DataFrame
        Fitted and transformed features when ``scale_mean=True`` or ``scale_std=True``.
        Otherwise, it will be the same as the passed ``X_train`` features

    X_test_ : pd.DataFrame
        Transformed features when ``scale_mean=True`` or ``scale_std=True`` using
        ``clf.scaler_`` that has been fitted on the ``X_train`` and ``y_train`` data.
        Otherwise, it will be the same as the passed ``X_test`` features

    dtrain_ : xgb.DMatrix
        Training data matrix via ``xgboost.DMatrix(clf.X_train_, clf.y_train)``

    dtest_ : xgb.DMatrix
        Testing data matrix via ``xgboost.DMatrix(clf.X_test_, clf.y_test)`` or
        ``xgboost.DMatrix(clf.X_test_, None)`` when ``y_test`` is not available in inference

    shap_values_train_ : np.ndarray
        Shapley values from ``TreeExplainer`` using ``X_train_``

    shap_values_test_ : np.ndarray
        Shapley values from ``TreeExplainer`` using ``X_test_``

    shap_explainer_ : shap.TreeExplainer
        Shap TreeExplainer object

    model_ : xgboost.Booster
        XGBoost Booster object

    References
    ----------
    .. [xgboost-api] https://xgboost.readthedocs.io/en/latest/python/python_api.html
    .. [markers-api] https://matplotlib.org/stable/api/markers_api.html
    .. [shap-api] https://shap-lrjball.readthedocs.io/en/latest/generated/shap.summary_plot.html
    """

    num_boost_round: Optional[int] = 200
    sparse_matrix: Optional[bool] = False
    scale_mean: Optional[bool] = False
    scale_std: Optional[bool] = False
    importance_type: Optional[str] = "total_gain"
    params: Optional[Dict[str, Union[str, float, int]]] = None
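    # Hypothetical end-to-end sketch (not part of the original module), assuming a pandas
    # feature matrix ``X_train``/``X_test`` and binary targets ``y_train`` already exist,
    # and that the class is re-exported from ``slickml.classification``:
    #
    #   >>> from slickml.classification import XGBoostClassifier
    #   >>> clf = XGBoostClassifier(num_boost_round=300, params={"max_depth": 3})
    #   >>> clf.fit(X_train, y_train)
    #   >>> y_pred_proba = clf.predict_proba(X_test)
    #   >>> y_pred = clf.predict(X_test, threshold=0.5)
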
    def __post_init__(self) -> None:
        """Post instantiation validations and assignments."""
        super().__post_init__()
        # The default set of params can be updated based on the given params by user
        _default_params = self._default_params()
        if self.params:
            check_var(
                self.params,
                var_name="params",
                dtypes=dict,
            )
            _default_params.update(self.params)
            self.params = _default_params
        else:
            self.params = _default_params
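    # Illustrative sketch of the merge behavior above: user-passed ``params`` are applied
    # on top of the defaults rather than replacing them, so passing a single key keeps the
    # remaining default values intact:
    #
    #   >>> clf = XGBoostClassifier(params={"max_depth": 5})
    #   >>> clf.get_params()["max_depth"]
    #   5
    #   >>> clf.get_params()["eval_metric"]  # untouched default
    #   'auc'
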
    def fit(
        self,
        X_train: Union[pd.DataFrame, np.ndarray],
        y_train: Union[List[float], np.ndarray, pd.Series],
    ) -> None:
        """Fits a ``XGBoost.Booster`` to input training data.

        Notes
        -----
        The proper ``dtrain_`` matrix, based on the chosen options i.e. ``sparse_matrix``,
        ``scale_mean``, ``scale_std``, is created from the passed ``X_train`` and
        ``y_train``.

        Parameters
        ----------
        X_train : Union[pd.DataFrame, np.ndarray]
            Input data for training (features)

        y_train : Union[List[float], np.ndarray, pd.Series]
            Input ground truth for training (targets)

        See Also
        --------
        :meth:`xgboost.train()`

        Returns
        -------
        None
        """
        self.dtrain_ = self._dtrain(
            X_train=X_train,
            y_train=y_train,
        )
        self.model_ = self._model()
        self.feature_importance_ = self._imp_to_df()
        return None
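    # Illustrative sketch: after ``fit()``, the trained booster and its derived artifacts
    # are exposed as attributes (assuming ``X_train``/``y_train`` exist):
    #
    #   >>> clf.fit(X_train, y_train)
    #   >>> clf.model_               # xgboost.Booster
    #   >>> clf.dtrain_              # xgb.DMatrix built from X_train_ and y_train
    #   >>> clf.feature_importance_  # pd.DataFrame sorted by importance_type
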
    def predict_proba(
        self,
        X_test: Union[pd.DataFrame, np.ndarray],
        y_test: Optional[Union[List[float], np.ndarray, pd.Series]] = None,
    ) -> np.ndarray:
        """Returns the prediction probabilities for the positive class.

        Notes
        -----
        ``predict_proba()`` only reports the probability of the positive class, while the
        sklearn API returns both classes, so slicing like ``pred_proba[:, 1]`` is needed
        there for positive class predictions. Additionally, ``y_test`` is optional since the
        targets might not be available in validation (inference).

        Parameters
        ----------
        X_test : Union[pd.DataFrame, np.ndarray]
            Input data for testing (features)

        y_test : Union[List[float], np.ndarray, pd.Series], optional
            Input ground truth for testing (targets)

        Returns
        -------
        np.ndarray
        """
        self.dtest_ = self._dtest(
            X_test=X_test,
            y_test=y_test,
        )
        self.y_pred_proba_ = self.model_.predict(
            data=self.dtest_,
            output_margin=False,
        )
        return self.y_pred_proba_
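    # Illustrative sketch: unlike the sklearn API, the output here is 1-D and is already
    # the positive-class probability, so no ``[:, 1]`` slicing is needed:
    #
    #   >>> proba = clf.predict_proba(X_test)
    #   >>> proba.shape  # (n_samples,), not (n_samples, 2)
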
    def predict(
        self,
        X_test: Union[pd.DataFrame, np.ndarray],
        y_test: Optional[Union[List[float], np.ndarray, pd.Series]] = None,
        threshold: Optional[float] = 0.5,
    ) -> np.ndarray:
        """Returns the prediction classes based on the threshold.

        Notes
        -----
        The default ``threshold=0.5`` might not give you the best results; you can find the
        optimum threshold based on different algorithms including the Youden Index,
        maximizing the area under the sensitivity-specificity curve, and maximizing the area
        under the precision-recall curve by using ``BinaryClassificationMetrics``.

        Parameters
        ----------
        X_test : Union[pd.DataFrame, np.ndarray]
            Input data for testing (features)

        y_test : Union[List[float], np.ndarray, pd.Series], optional
            Input ground truth for testing (targets)

        threshold : float, optional
            Inclusive threshold value to binarize ``y_pred_proba_`` into ``y_pred_`` where
            any value that satisfies ``y_pred_proba_ >= threshold`` will be set to
            ``class=1 (positive class)``. Note that ``">="`` is used instead of ``">"``,
            by default 0.5

        Returns
        -------
        np.ndarray
        """
        self.dtest_ = self._dtest(
            X_test=X_test,
            y_test=y_test,
        )
        self.y_pred_proba_ = self.model_.predict(
            self.dtest_,
            output_margin=False,
        )
        self.y_pred_ = (self.y_pred_proba_ >= threshold).astype(int)
        return self.y_pred_
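    # Illustrative sketch: a tuned threshold (e.g., one found via
    # ``BinaryClassificationMetrics``) can replace the default 0.5 cutoff:
    #
    #   >>> y_pred = clf.predict(X_test, threshold=0.35)
    #   >>> # equivalent to (clf.predict_proba(X_test) >= 0.35).astype(int)
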
    def plot_feature_importance(
        self,
        figsize: Optional[Tuple[Union[int, float], Union[int, float]]] = (8, 5),
        color: Optional[str] = "#87CEEB",
        marker: Optional[str] = "o",
        markersize: Optional[Union[int, float]] = 10,
        markeredgecolor: Optional[str] = "#1F77B4",
        markerfacecolor: Optional[str] = "#1F77B4",
        markeredgewidth: Optional[Union[int, float]] = 1,
        fontsize: Optional[Union[int, float]] = 12,
        save_path: Optional[str] = None,
        display_plot: Optional[bool] = True,
        return_fig: Optional[bool] = False,
    ) -> Optional[Figure]:
        """Visualizes the XGBoost feature importance (the ``feature_importance_``
        attribute) as a bar chart.

        Parameters
        ----------
        figsize : Tuple[Union[int, float], Union[int, float]], optional
            Figure size, by default (8, 5)

        color : str, optional
            Color of the horizontal lines of lollipops, by default "#87CEEB"

        marker : str, optional
            Marker style of the lollipops. More valid marker styles can be found at
            [markers-api]_, by default "o"

        markersize : Union[int, float], optional
            Markersize, by default 10

        markeredgecolor : str, optional
            Marker edge color, by default "#1F77B4"

        markerfacecolor : str, optional
            Marker face color, by default "#1F77B4"

        markeredgewidth : Union[int, float], optional
            Marker edge width, by default 1

        fontsize : Union[int, float], optional
            Fontsize for xlabel and ylabel, and ticks parameters, by default 12

        save_path : str, optional
            The full or relative path to save the plot including the image format such as
            "myplot.png" or "../../myplot.pdf", by default None

        display_plot : bool, optional
            Whether to show the plot, by default True

        return_fig : bool, optional
            Whether to return figure object, by default False

        Returns
        -------
        Figure, optional
        """
        return plot_xgb_feature_importance(
            feature_importance=self.feature_importance_,
            figsize=figsize,
            color=color,
            marker=marker,
            markersize=markersize,
            markeredgecolor=markeredgecolor,
            markerfacecolor=markerfacecolor,
            markeredgewidth=markeredgewidth,
            fontsize=fontsize,
            save_path=save_path,
            display_plot=display_plot,
            return_fig=return_fig,
        )
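    # Illustrative sketch of customizing the lollipop chart and saving it to disk (all
    # keyword values below are arbitrary choices, not library defaults):
    #
    #   >>> fig = clf.plot_feature_importance(
    #   ...     figsize=(10, 6),
    #   ...     color="navy",
    #   ...     save_path="feature_importance.png",
    #   ...     return_fig=True,
    #   ... )
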
    def plot_shap_summary(
        self,
        validation: Optional[bool] = True,
        plot_type: Optional[str] = "dot",
        figsize: Optional[Union[str, Tuple[float, float]]] = "auto",
        color: Optional[str] = None,
        cmap: Optional[LinearSegmentedColormap] = None,
        max_display: Optional[int] = 20,
        feature_names: Optional[List[str]] = None,
        layered_violin_max_num_bins: Optional[int] = 10,
        title: Optional[str] = None,
        sort: Optional[bool] = True,
        color_bar: Optional[bool] = True,
        class_names: Optional[List[str]] = None,
        class_inds: Optional[List[int]] = None,
        color_bar_label: Optional[str] = "Feature Value",
        save_path: Optional[str] = None,
        display_plot: Optional[bool] = True,
    ) -> None:
        """Visualizes a shap beeswarm plot as a summary of shapley values.

        Notes
        -----
        This is a helper function to plot the ``shap`` summary plot based on all types of
        ``shap.Explainer`` including ``shap.LinearExplainer`` for linear models,
        ``shap.TreeExplainer`` for tree-based models, and ``shap.DeepExplainer`` for deep
        neural network models. More details are available at [shap-api]_. Note that this
        function should be run after ``predict_proba()`` to make sure ``X_test_`` is
        instantiated, or set ``validation=False``.

        Parameters
        ----------
        validation : bool, optional
            Whether to calculate Shap values using the validation data ``X_test`` or not.
            When ``validation=False``, Shap values are calculated using ``X_train``,
            by default True

        plot_type : str, optional
            The type of summary plot where possible options are "bar", "dot", "violin",
            "layered_violin", and "compact_dot". Recommendations are "dot" for single-output
            problems such as binary classification, "bar" for multi-output problems, and
            "compact_dot" for Shap interactions, by default "dot"

        figsize : tuple, optional
            Figure size where "auto" is an auto-scaled figure size based on the number of
            features that are being displayed. Passing a single float will cause each row to
            be that many inches high. Passing a pair of floats will scale the plot by that
            number of inches. If None is passed then the size of the current figure will be
            left unchanged, by default "auto"

        color : str, optional
            Color of the plots when ``plot_type="violin"`` or ``plot_type="layered_violin"``
            is the "RdBl" color-map, while the color of the horizontal lines when
            ``plot_type="bar"`` is "#D0AAF3", by default None

        cmap : LinearSegmentedColormap, optional
            Color map when ``plot_type="violin"`` or ``plot_type="layered_violin"``,
            by default "RdBl"

        max_display : int, optional
            Limit to show the number of features in the plot, by default 20

        feature_names : List[str], optional
            List of feature names to pass. It should follow the order of features,
            by default None

        layered_violin_max_num_bins : int, optional
            The number of bins for calculating the violin plots ranges and outliers,
            by default 10

        title : str, optional
            Title of the plot, by default None

        sort : bool, optional
            Flag to plot sorted shap values in descending order, by default True

        color_bar : bool, optional
            Flag to show a color bar when ``plot_type="dot"`` or ``plot_type="violin"``,
            by default True

        class_names : List[str], optional
            List of class names for multi-output problems, by default None

        class_inds : List[int], optional
            List of class indices for multi-output problems, by default None

        color_bar_label : str, optional
            Label for the color bar, by default "Feature Value"

        save_path : str, optional
            The full or relative path to save the plot including the image format such as
            "myplot.png" or "../../myplot.pdf", by default None

        display_plot : bool, optional
            Whether to show the plot, by default True

        Returns
        -------
        None
        """
        self._explainer()
        if validation:
            shap_values = self.shap_values_test_
            features = self.X_test_
        else:
            shap_values = self.shap_values_train_
            features = self.X_train_

        return plot_shap_summary(
            shap_values=shap_values,
            features=features,
            plot_type=plot_type,
            figsize=figsize,
            color=color,
            cmap=cmap,
            max_display=max_display,
            feature_names=feature_names,
            layered_violin_max_num_bins=layered_violin_max_num_bins,
            title=title,
            sort=sort,
            color_bar=color_bar,
            class_names=class_names,
            class_inds=class_inds,
            color_bar_label=color_bar_label,
            save_path=save_path,
            display_plot=display_plot,
        )
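    # Illustrative sketch: ``predict_proba()`` must run first so that ``X_test_`` exists;
    # otherwise pass ``validation=False`` to use the training data instead:
    #
    #   >>> clf.predict_proba(X_test)
    #   >>> clf.plot_shap_summary(plot_type="dot", max_display=10)
    #   >>> clf.plot_shap_summary(validation=False)  # Shap values on X_train_
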
    def plot_shap_waterfall(
        self,
        validation: Optional[bool] = True,
        figsize: Optional[Tuple[float, float]] = (8, 5),
        bar_color: Optional[str] = "#B3C3F3",
        bar_thickness: Optional[Union[float, int]] = 0.5,
        line_color: Optional[str] = "purple",
        marker: Optional[str] = "o",
        markersize: Optional[Union[int, float]] = 7,
        markeredgecolor: Optional[str] = "purple",
        markerfacecolor: Optional[str] = "purple",
        markeredgewidth: Optional[Union[int, float]] = 1,
        max_display: Optional[int] = 20,
        title: Optional[str] = None,
        fontsize: Optional[Union[int, float]] = 12,
        save_path: Optional[str] = None,
        display_plot: Optional[bool] = True,
        return_fig: Optional[bool] = False,
    ) -> Optional[Figure]:
        """Visualizes the Shapley values as a waterfall plot.

        Notes
        -----
        The waterfall is defined as the cumulative/composite ratios of shap values per
        feature. Therefore, it can easily be seen how much explainability is achieved with
        each feature. Note that this function should be run after ``predict_proba()`` to
        make sure ``X_test_`` is instantiated, or set ``validation=False``.

        Parameters
        ----------
        validation : bool, optional
            Whether to calculate Shap values using the validation data ``X_test`` or not.
            When ``validation=False``, Shap values are calculated using ``X_train``,
            by default True

        figsize : Tuple[float, float], optional
            Figure size, by default (8, 5)

        bar_color : str, optional
            Color of the horizontal bar lines, by default "#B3C3F3"

        bar_thickness : Union[float, int], optional
            Thickness (height) of the horizontal bar lines, by default 0.5

        line_color : str, optional
            Color of the line plot, by default "purple"

        marker : str, optional
            Marker style of the lollipops. More valid marker styles can be found at
            [markers-api]_, by default "o"

        markersize : Union[int, float], optional
            Markersize, by default 7

        markeredgecolor : str, optional
            Marker edge color, by default "purple"

        markerfacecolor : str, optional
            Marker face color, by default "purple"

        markeredgewidth : Union[int, float], optional
            Marker edge width, by default 1

        max_display : int, optional
            Limit to show the number of features in the plot, by default 20

        title : str, optional
            Title of the plot, by default None

        fontsize : Union[int, float], optional
            Fontsize for xlabel and ylabel, and ticks parameters, by default 12

        save_path : str, optional
            The full or relative path to save the plot including the image format such as
            "myplot.png" or "../../myplot.pdf", by default None

        display_plot : bool, optional
            Whether to show the plot, by default True

        return_fig : bool, optional
            Whether to return figure object, by default False

        Returns
        -------
        Figure, optional
        """
        self._explainer()
        if validation:
            shap_values = self.shap_values_test_
            features = self.X_test_
        else:
            shap_values = self.shap_values_train_
            features = self.X_train_

        return plot_shap_waterfall(
            shap_values=shap_values,
            features=features,
            figsize=figsize,
            bar_color=bar_color,
            bar_thickness=bar_thickness,
            line_color=line_color,
            marker=marker,
            markersize=markersize,
            markeredgecolor=markeredgecolor,
            markerfacecolor=markerfacecolor,
            markeredgewidth=markeredgewidth,
            max_display=max_display,
            title=title,
            fontsize=fontsize,
            save_path=save_path,
            display_plot=display_plot,
            return_fig=return_fig,
        )
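    # Illustrative sketch mirroring the summary plot: run ``predict_proba()`` first (or
    # pass ``validation=False``), then limit the display to the top features:
    #
    #   >>> clf.plot_shap_waterfall(max_display=15, title="SHAP Waterfall")
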
    def get_params(self) -> Optional[Dict[str, Union[str, float, int]]]:
        """Returns the final set of train parameters.

        The default set of parameters will be updated with the new ones that were passed to
        ``params``.

        See Also
        --------
        :meth:`get_default_params()`

        Returns
        -------
        Dict[str, Union[str, float, int]]
        """
        return self.params
    def get_default_params(self) -> Dict[str, Union[str, float, int]]:
        """Returns the default set of train parameters.

        The default set of parameters will be used when ``params=None``.

        See Also
        --------
        :meth:`get_params()`

        Returns
        -------
        Dict[str, Union[str, float, int]]
        """
        return self._default_params()
    def get_feature_importance(self) -> pd.DataFrame:
        """Returns the feature importance of the trained booster based on the given
        ``importance_type``.

        Returns
        -------
        pd.DataFrame
        """
        return self.feature_importance_
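    # Illustrative sketch: with ``importance_type="total_gain"`` (the default), the
    # returned frame has one row per feature, sorted in descending order. The values shown
    # below are made up purely for illustration:
    #
    #   >>> clf.get_feature_importance().head(2)
    #      feature  total_gain
    #   0       f3       12.48
    #   1       f0        7.91
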
    def get_shap_explainer(self) -> shap.TreeExplainer:
        """Returns the ``shap.TreeExplainer`` object.

        Returns
        -------
        shap.TreeExplainer
        """
        self._explainer()
        return self.shap_explainer_
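    # Illustrative sketch (an assumption beyond the built-in helpers): the returned
    # ``shap.TreeExplainer`` can be reused for custom shap analyses:
    #
    #   >>> explainer = clf.get_shap_explainer()
    #   >>> shap_values = explainer.shap_values(clf.X_test_)
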
    def _model(self) -> xgb.Booster:
        """Fits a ``XGBoost.Booster`` for the given number of boosting rounds on the
        ``dtrain_`` matrix.

        Returns
        -------
        xgb.Booster
        """
        return xgb.train(
            params=self.params,
            dtrain=self.dtrain_,
            num_boost_round=self.num_boost_round,  # type: ignore
        )

    def _explainer(self) -> None:
        """Fits a ``shap.TreeExplainer`` on the ``X_train_`` and ``X_test_`` data.

        Returns
        -------
        None
        """
        self.shap_explainer_ = shap.TreeExplainer(
            model=self.model_,
        )
        self.shap_values_test_ = self.shap_explainer_.shap_values(
            X=self.X_test_,
        )
        self.shap_values_train_ = self.shap_explainer_.shap_values(
            X=self.X_train_,
        )
        return None

    def _imp_to_df(self) -> pd.DataFrame:
        """Converts the feature importance object to ``pd.DataFrame``.

        Returns
        -------
        pd.DataFrame
        """
        data: Dict[str, List[float]] = {
            "feature": [],
            f"{self.importance_type}": [],
        }
        features_gain = self.model_.get_score(
            importance_type=self.importance_type,
        )
        for key, val in features_gain.items():
            data["feature"].append(key)
            data[f"{self.importance_type}"].append(val)

        return (
            pd.DataFrame(data)
            .sort_values(
                by=f"{self.importance_type}",
                ascending=False,
            )
            .reset_index(
                drop=True,
            )
        )

    @staticmethod
    def _default_params() -> Dict[str, Union[str, float, int]]:
        """Default set of parameters when the class is being instantiated with
        ``params=None``.

        Returns
        -------
        Dict[str, Union[str, float, int]]
        """
        return {
            "eval_metric": "auc",
            "tree_method": "hist",
            "objective": "binary:logistic",
            "learning_rate": 0.05,
            "max_depth": 2,
            "min_child_weight": 1,
            "gamma": 0.0,
            "reg_alpha": 0.0,
            "reg_lambda": 1.0,
            "subsample": 0.9,
            "max_delta_step": 1,
            "verbosity": 0,
            "nthread": 4,
            "scale_pos_weight": 1,
        }