Source code for slickml.classification._xgboost

from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
import shap
import xgboost as xgb
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.figure import Figure
from sklearn.base import ClassifierMixin

from slickml.base import BaseXGBoostEstimator
from slickml.utils import check_var
from slickml.visualization import (
    plot_shap_summary,
    plot_shap_waterfall,
    plot_xgb_feature_importance,
)


# TODO(amir): add the functionality for multi-class classification as well
@dataclass
class XGBoostClassifier(BaseXGBoostEstimator, ClassifierMixin):
    """XGBoost Classifier.

    This is a wrapper using the XGBoost classifier to train an XGBoost [xgboost-api]_ model using
    the number of boosting rounds from the inputs. This is also the base class for
    ``XGBoostCVClassifier``.

    Parameters
    ----------
    num_boost_round : int, optional
        Number of boosting rounds to fit a model, by default 200

    sparse_matrix : bool, optional
        Whether to convert the input features to sparse matrix with csr format or not. This would
        increase the speed of feature selection for relatively large/sparse datasets. Consequently,
        this would actually act like an un-optimized solution for a dense feature matrix.
        Additionally, this parameter cannot be used along with ``scale_mean=True`` since
        standardizing the feature matrix to have a mean value of zero would turn the feature
        matrix into a dense matrix. Therefore, by default our API banned this feature, by default
        False

    scale_mean : bool, optional
        Whether to standardize the feature matrix to have a mean value of zero per feature (center
        the features before scaling). As laid out in ``sparse_matrix``, ``scale_mean=False`` when
        using ``sparse_matrix=True``, since centering the feature matrix would decrease the
        sparsity; in practice it does not make any sense to use the sparse matrix method then, and
        it would make it worse. The ``StandardScaler`` object can be accessed via ``cls.scaler_``
        if ``scale_mean`` or ``scale_std`` is used unless it is ``None``, by default False

    scale_std : bool, optional
        Whether to scale the feature matrix to have unit variance (or equivalently, unit standard
        deviation) per feature. The ``StandardScaler`` object can be accessed via ``cls.scaler_``
        if ``scale_mean`` or ``scale_std`` is used unless it is ``None``, by default False

    importance_type : str, optional
        Importance type of ``xgboost.train()`` with possible values ``"weight"``, ``"gain"``,
        ``"total_gain"``, ``"cover"``, ``"total_cover"``, by default "total_gain"

    params : Dict[str, Union[str, float, int]], optional
        Set of parameters required for fitting a Booster, by default {"eval_metric": "auc",
        "tree_method": "hist", "objective": "binary:logistic", "learning_rate": 0.05,
        "max_depth": 2, "min_child_weight": 1, "gamma": 0.0, "reg_alpha": 0.0, "reg_lambda": 1.0,
        "subsample": 0.9, "max_delta_step": 1, "verbosity": 0, "nthread": 4,
        "scale_pos_weight": 1}

    Methods
    -------
    fit(X_train, y_train)
        Fits an ``XGBoost.Booster`` to input training data. A proper ``dtrain_`` matrix based on
        the chosen options i.e. ``sparse_matrix``, ``scale_mean``, ``scale_std`` is created based
        on the passed ``X_train`` and ``y_train``

    predict_proba(X_test, y_test)
        Returns prediction probabilities for the positive class. ``predict_proba()`` only reports
        the probability of the positive class, while the sklearn API returns it for both classes
        and slicing like ``pred_proba[:, 1]`` is needed for positive class predictions.
        Additionally, ``y_test`` is optional since the targets might not be available in
        validation (inference)

    predict(X_test, y_test, threshold=0.5)
        Returns prediction classes based on the threshold. The default ``threshold=0.5`` might not
        give you the best results; you can find the optimum thresholds based on different
        algorithms including Youden Index, maximizing the area under the sensitivity-specificity
        curve, and maximizing the area under the precision-recall curve by using
        ``BinaryClassificationMetrics``

    get_params()
        Returns the final set of train parameters. The default set of parameters will be updated
        with the new ones that are passed to ``params``

    get_default_params()
        Returns the default set of train parameters. The default set of parameters will be used
        when ``params=None``

    get_feature_importance()
        Returns the feature importance of the trained booster based on the given
        ``importance_type``

    get_shap_explainer()
        Returns the ``shap.TreeExplainer``

    plot_shap_summary()
        Visualizes Shapley values summary plot

    plot_shap_waterfall()
        Visualizes Shapley values waterfall plot

    Attributes
    ----------
    feature_importance_ : pd.DataFrame
        Features importance based on the given ``importance_type``

    scaler_ : StandardScaler, optional
        Standardization object when ``scale_mean=True`` or ``scale_std=True`` unless it is
        ``None``

    X_train_ : pd.DataFrame
        Fitted and transformed features when ``scale_mean=True`` or ``scale_std=True``. Otherwise,
        it will be the same as the passed ``X_train`` features

    X_test_ : pd.DataFrame
        Transformed features when ``scale_mean=True`` or ``scale_std=True`` using ``clf.scaler_``
        that has been fitted on the ``X_train`` and ``y_train`` data. Otherwise, it will be the
        same as the passed ``X_test`` features

    dtrain_ : xgb.DMatrix
        Training data matrix via ``xgboost.DMatrix(clf.X_train_, clf.y_train)``

    dtest_ : xgb.DMatrix
        Testing data matrix via ``xgboost.DMatrix(clf.X_test_, clf.y_test)`` or
        ``xgboost.DMatrix(clf.X_test_, None)`` when ``y_test`` is not available in inference

    shap_values_train_ : np.ndarray
        Shapley values from ``TreeExplainer`` using ``X_train_``

    shap_values_test_ : np.ndarray
        Shapley values from ``TreeExplainer`` using ``X_test_``

    shap_explainer_ : shap.TreeExplainer
        Shap TreeExplainer object

    model_ : xgboost.Booster
        XGBoost Booster object

    References
    ----------
    .. [xgboost-api] https://xgboost.readthedocs.io/en/latest/python/python_api.html
    .. [markers-api] https://matplotlib.org/stable/api/markers_api.html
    .. [shap-api] https://shap-lrjball.readthedocs.io/en/latest/generated/shap.summary_plot.html
    """

    num_boost_round: Optional[int] = 200
    sparse_matrix: Optional[bool] = False
    scale_mean: Optional[bool] = False
    scale_std: Optional[bool] = False
    importance_type: Optional[str] = "total_gain"
    params: Optional[Dict[str, Union[str, float, int]]] = None
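
    # A minimal usage sketch of the class above (illustrative only; `X_train`, `y_train`, and
    # `X_test` are assumed pandas/numpy objects supplied by the caller, not part of this module):
    #
    #   clf = XGBoostClassifier(num_boost_round=300, params={"max_depth": 3})
    #   clf.fit(X_train, y_train)
    #   y_pred_proba = clf.predict_proba(X_test)
    #   y_pred = clf.predict(X_test, threshold=0.5)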

    def __post_init__(self) -> None:
        """Post instantiation validations and assignments."""
        super().__post_init__()
        # The default set of params can be updated based on the given params by user
        _default_params = self._default_params()
        if self.params:
            check_var(
                self.params,
                var_name="params",
                dtypes=dict,
            )
            _default_params.update(self.params)
            self.params = _default_params
        else:
            self.params = _default_params
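
    # Sketch of how the params merge above behaves (assumed override values, for illustration):
    #
    #   clf = XGBoostClassifier(params={"max_depth": 4})
    #   clf.get_params()
    #   # -> the default params with "max_depth" overridden to 4; untouched defaults such as
    #   #    "eval_metric": "auc" are preserved.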

    def fit(
        self,
        X_train: Union[pd.DataFrame, np.ndarray],
        y_train: Union[List[float], np.ndarray, pd.Series],
    ) -> None:
        """Fits a ``XGBoost.Booster`` to input training data.

        Notes
        -----
        A proper ``dtrain_`` matrix based on the chosen options i.e. ``sparse_matrix``,
        ``scale_mean``, ``scale_std`` is created based on the passed ``X_train`` and ``y_train``.

        Parameters
        ----------
        X_train : Union[pd.DataFrame, np.ndarray]
            Input data for training (features)

        y_train : Union[List[float], np.ndarray, pd.Series]
            Input ground truth for training (targets)

        See Also
        --------
        :meth:`xgboost.train()`

        Returns
        -------
        None
        """
        self.dtrain_ = self._dtrain(
            X_train=X_train,
            y_train=y_train,
        )
        self.model_ = self._model()
        self.feature_importance_ = self._imp_to_df()

        return None
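
    # Sketch of the scaling options used by the fit above (illustrative): when either scaling flag
    # is enabled, the fitted scaler and the transformed features are exposed as attributes:
    #
    #   clf = XGBoostClassifier(scale_mean=True, scale_std=True)
    #   clf.fit(X_train, y_train)
    #   clf.scaler_    # fitted StandardScaler
    #   clf.X_train_   # standardized training features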

    def predict_proba(
        self,
        X_test: Union[pd.DataFrame, np.ndarray],
        y_test: Optional[Union[List[float], np.ndarray, pd.Series]] = None,
    ) -> np.ndarray:
        """Returns the prediction probabilities for the positive class.

        Notes
        -----
        ``predict_proba()`` only reports the probability of the positive class, while the sklearn
        API returns it for both classes and slicing like ``pred_proba[:, 1]`` is needed for
        positive class predictions. Additionally, ``y_test`` is optional since the targets might
        not be available in validation (inference).

        Parameters
        ----------
        X_test : Union[pd.DataFrame, np.ndarray]
            Input data for testing (features)

        y_test : Union[List[float], np.ndarray, pd.Series], optional
            Input ground truth for testing (targets)

        Returns
        -------
        np.ndarray
        """
        self.dtest_ = self._dtest(
            X_test=X_test,
            y_test=y_test,
        )
        self.y_pred_proba_ = self.model_.predict(
            data=self.dtest_,
            output_margin=False,
        )

        return self.y_pred_proba_
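
    # Shape note for the method above (illustrative): unlike scikit-learn's ``predict_proba()``,
    # which returns an (n_samples, 2) array, this returns a 1-D array of positive-class
    # probabilities:
    #
    #   proba = clf.predict_proba(X_test)   # proba.shape == (n_samples,)
    #   # the sklearn equivalent would be sk_clf.predict_proba(X_test)[:, 1]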

    def predict(
        self,
        X_test: Union[pd.DataFrame, np.ndarray],
        y_test: Optional[Union[List[float], np.ndarray, pd.Series]] = None,
        threshold: Optional[float] = 0.5,
    ) -> np.ndarray:
        """Returns the prediction classes based on the threshold.

        Notes
        -----
        The default ``threshold=0.5`` might not give you the best results; you can find the
        optimum thresholds based on different algorithms including Youden Index, maximizing the
        area under the sensitivity-specificity curve, and maximizing the area under the
        precision-recall curve by using ``BinaryClassificationMetrics``.

        Parameters
        ----------
        X_test : Union[pd.DataFrame, np.ndarray]
            Input data for testing (features)

        y_test : Union[List[float], np.ndarray, pd.Series], optional
            Input ground truth for testing (targets)

        threshold : float, optional
            Inclusive threshold value to binarize ``y_pred_proba_`` into ``y_pred_`` where any
            value that satisfies ``y_pred_proba_ >= threshold`` will be set to ``class=1``
            (positive class). Note that ``">="`` is used instead of ``">"``, by default 0.5

        Returns
        -------
        np.ndarray
        """
        self.dtest_ = self._dtest(
            X_test=X_test,
            y_test=y_test,
        )
        self.y_pred_proba_ = self.model_.predict(
            self.dtest_,
            output_margin=False,
        )
        self.y_pred_ = (self.y_pred_proba_ >= threshold).astype(int)

        return self.y_pred_
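
    # Thresholding sketch for the method above (assumed numbers, for illustration): with
    # y_pred_proba_ = [0.2, 0.5, 0.7] and threshold=0.5, the inclusive ">=" rule yields
    # y_pred_ = [0, 1, 1].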

    def plot_feature_importance(
        self,
        figsize: Optional[Tuple[Union[int, float], Union[int, float]]] = (8, 5),
        color: Optional[str] = "#87CEEB",
        marker: Optional[str] = "o",
        markersize: Optional[Union[int, float]] = 10,
        markeredgecolor: Optional[str] = "#1F77B4",
        markerfacecolor: Optional[str] = "#1F77B4",
        markeredgewidth: Optional[Union[int, float]] = 1,
        fontsize: Optional[Union[int, float]] = 12,
        save_path: Optional[str] = None,
        display_plot: Optional[bool] = True,
        return_fig: Optional[bool] = False,
    ) -> Optional[Figure]:
        """Visualizes the XGBoost feature importance as a bar chart.

        Notes
        -----
        The plot uses the fitted ``feature_importance_`` attribute as its input data.

        Parameters
        ----------
        figsize : Tuple[Union[int, float], Union[int, float]], optional
            Figure size, by default (8, 5)

        color : str, optional
            Color of the horizontal lines of lollipops, by default "#87CEEB"

        marker : str, optional
            Marker style of the lollipops. More valid marker styles can be found at [markers-api]_,
            by default "o"

        markersize : Union[int, float], optional
            Markersize, by default 10

        markeredgecolor : str, optional
            Marker edge color, by default "#1F77B4"

        markerfacecolor : str, optional
            Marker face color, by default "#1F77B4"

        markeredgewidth : Union[int, float], optional
            Marker edge width, by default 1

        fontsize : Union[int, float], optional
            Fontsize for xlabel and ylabel, and ticks parameters, by default 12

        save_path : str, optional
            The full or relative path to save the plot including the image format such as
            "myplot.png" or "../../myplot.pdf", by default None

        display_plot : bool, optional
            Whether to show the plot, by default True

        return_fig : bool, optional
            Whether to return figure object, by default False

        Returns
        -------
        Figure, optional
        """
        return plot_xgb_feature_importance(
            feature_importance=self.feature_importance_,
            figsize=figsize,
            color=color,
            marker=marker,
            markersize=markersize,
            markeredgecolor=markeredgecolor,
            markerfacecolor=markerfacecolor,
            markeredgewidth=markeredgewidth,
            fontsize=fontsize,
            save_path=save_path,
            display_plot=display_plot,
            return_fig=return_fig,
        )
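
    # Usage sketch for the plot above (illustrative; requires a prior ``fit()`` call so that
    # ``feature_importance_`` exists; the save path is a placeholder):
    #
    #   clf.plot_feature_importance(figsize=(10, 6), save_path="feature_importance.png")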

    def plot_shap_summary(
        self,
        validation: Optional[bool] = True,
        plot_type: Optional[str] = "dot",
        figsize: Optional[Union[str, Tuple[float, float]]] = "auto",
        color: Optional[str] = None,
        cmap: Optional[LinearSegmentedColormap] = None,
        max_display: Optional[int] = 20,
        feature_names: Optional[List[str]] = None,
        layered_violin_max_num_bins: Optional[int] = 10,
        title: Optional[str] = None,
        sort: Optional[bool] = True,
        color_bar: Optional[bool] = True,
        class_names: Optional[List[str]] = None,
        class_inds: Optional[List[int]] = None,
        color_bar_label: Optional[str] = "Feature Value",
        save_path: Optional[str] = None,
        display_plot: Optional[bool] = True,
    ) -> None:
        """Visualizes a shap beeswarm plot as a summary of shapley values.

        Notes
        -----
        This is a helper function to plot the ``shap`` summary plot based on all types of
        ``shap.Explainer`` including ``shap.LinearExplainer`` for linear models,
        ``shap.TreeExplainer`` for tree-based models, and ``shap.DeepExplainer`` for deep neural
        network models. More details are available at [shap-api]_. Note that this function should
        be run after ``predict_proba()`` to make sure ``X_test_`` is instantiated, or set
        ``validation=False``.

        Parameters
        ----------
        validation : bool, optional
            Whether to calculate Shap values using the validation data ``X_test`` or not. When
            ``validation=False``, Shap values are calculated using ``X_train``, by default True

        plot_type : str, optional
            The type of summary plot where possible options are "bar", "dot", "violin",
            "layered_violin", and "compact_dot". Recommendations are "dot" for single-output such
            as binary classifications, "bar" for multi-output problems, "compact_dot" for Shap
            interactions, by default "dot"

        figsize : tuple, optional
            Figure size where "auto" is auto-scaled figure size based on the number of features
            that are being displayed. Passing a single float will cause each row to be that many
            inches high. Passing a pair of floats will scale the plot by that number of inches. If
            None is passed then the size of the current figure will be left unchanged, by default
            "auto"

        color : str, optional
            Color of plots when ``plot_type="violin"`` and ``plot_type="layered_violin"`` is the
            "RdBl" color-map, while the color of the horizontal lines when ``plot_type="bar"`` is
            "#D0AAF3", by default None

        cmap : LinearSegmentedColormap, optional
            Color map when ``plot_type="violin"`` and ``plot_type="layered_violin"``, by default
            "RdBl"

        max_display : int, optional
            Limit to show the number of features in the plot, by default 20

        feature_names : List[str], optional
            List of feature names to pass. It should follow the order of features, by default None

        layered_violin_max_num_bins : int, optional
            The number of bins for calculating the violin plots ranges and outliers, by default 10

        title : str, optional
            Title of the plot, by default None

        sort : bool, optional
            Flag to plot sorted shap values in descending order, by default True

        color_bar : bool, optional
            Flag to show a color bar when ``plot_type="dot"`` or ``plot_type="violin"``

        class_names : List[str], optional
            List of class names for multi-output problems, by default None

        class_inds : List[int], optional
            List of class indices for multi-output problems, by default None

        color_bar_label : str, optional
            Label for color bar, by default "Feature Value"

        save_path : str, optional
            The full or relative path to save the plot including the image format such as
            "myplot.png" or "../../myplot.pdf", by default None

        display_plot : bool, optional
            Whether to show the plot, by default True

        Returns
        -------
        None
        """
        self._explainer()
        if validation:
            shap_values = self.shap_values_test_
            features = self.X_test_
        else:
            shap_values = self.shap_values_train_
            features = self.X_train_

        return plot_shap_summary(
            shap_values=shap_values,
            features=features,
            plot_type=plot_type,
            figsize=figsize,
            color=color,
            cmap=cmap,
            max_display=max_display,
            feature_names=feature_names,
            layered_violin_max_num_bins=layered_violin_max_num_bins,
            title=title,
            sort=sort,
            color_bar=color_bar,
            class_names=class_names,
            class_inds=class_inds,
            color_bar_label=color_bar_label,
            save_path=save_path,
            display_plot=display_plot,
        )
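
    # Usage sketch for the summary plot above (illustrative): ``predict_proba()`` must run first
    # so that ``X_test_`` exists when ``validation=True``:
    #
    #   clf.predict_proba(X_test)
    #   clf.plot_shap_summary(plot_type="dot", max_display=10)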

    def plot_shap_waterfall(
        self,
        validation: Optional[bool] = True,
        figsize: Optional[Tuple[float, float]] = (8, 5),
        bar_color: Optional[str] = "#B3C3F3",
        bar_thickness: Optional[Union[float, int]] = 0.5,
        line_color: Optional[str] = "purple",
        marker: Optional[str] = "o",
        markersize: Optional[Union[int, float]] = 7,
        markeredgecolor: Optional[str] = "purple",
        markerfacecolor: Optional[str] = "purple",
        markeredgewidth: Optional[Union[int, float]] = 1,
        max_display: Optional[int] = 20,
        title: Optional[str] = None,
        fontsize: Optional[Union[int, float]] = 12,
        save_path: Optional[str] = None,
        display_plot: Optional[bool] = True,
        return_fig: Optional[bool] = False,
    ) -> Optional[Figure]:
        """Visualizes the Shapley values as a waterfall plot.

        Notes
        -----
        Waterfall is defined as the cumulative/composite ratios of shap values per feature.
        Therefore, it can easily be seen how much explainability is achieved with each feature.
        Note that this function should be run after ``predict_proba()`` to make sure ``X_test_``
        is instantiated, or set ``validation=False``.

        Parameters
        ----------
        validation : bool, optional
            Whether to calculate Shap values using the validation data ``X_test`` or not. When
            ``validation=False``, Shap values are calculated using ``X_train``, by default True

        figsize : Tuple[float, float], optional
            Figure size, by default (8, 5)

        bar_color : str, optional
            Color of the horizontal bar lines, by default "#B3C3F3"

        bar_thickness : Union[float, int], optional
            Thickness (height) of the horizontal bar lines, by default 0.5

        line_color : str, optional
            Color of the line plot, by default "purple"

        marker : str, optional
            Marker style of the lollipops. More valid marker styles can be found at [markers-api]_,
            by default "o"

        markersize : Union[int, float], optional
            Markersize, by default 7

        markeredgecolor : str, optional
            Marker edge color, by default "purple"

        markerfacecolor : str, optional
            Marker face color, by default "purple"

        markeredgewidth : Union[int, float], optional
            Marker edge width, by default 1

        max_display : int, optional
            Limit to show the number of features in the plot, by default 20

        title : str, optional
            Title of the plot, by default None

        fontsize : Union[int, float], optional
            Fontsize for xlabel and ylabel, and ticks parameters, by default 12

        save_path : str, optional
            The full or relative path to save the plot including the image format such as
            "myplot.png" or "../../myplot.pdf", by default None

        display_plot : bool, optional
            Whether to show the plot, by default True

        return_fig : bool, optional
            Whether to return figure object, by default False

        Returns
        -------
        Figure, optional
        """
        self._explainer()
        if validation:
            shap_values = self.shap_values_test_
            features = self.X_test_
        else:
            shap_values = self.shap_values_train_
            features = self.X_train_

        return plot_shap_waterfall(
            shap_values=shap_values,
            features=features,
            figsize=figsize,
            bar_color=bar_color,
            bar_thickness=bar_thickness,
            line_color=line_color,
            marker=marker,
            markersize=markersize,
            markeredgecolor=markeredgecolor,
            markerfacecolor=markerfacecolor,
            markeredgewidth=markeredgewidth,
            max_display=max_display,
            title=title,
            fontsize=fontsize,
            save_path=save_path,
            display_plot=display_plot,
            return_fig=return_fig,
        )
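
    # Usage sketch for the waterfall plot above (illustrative): with ``validation=False`` the Shap
    # values are computed on ``X_train_``, so no prior ``predict_proba()`` call is required:
    #
    #   clf.plot_shap_waterfall(validation=False, max_display=15)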

    def get_params(self) -> Optional[Dict[str, Union[str, float, int]]]:
        """Returns the final set of train parameters.

        The default set of parameters will be updated with the new ones that are passed to
        ``params``.

        See Also
        --------
        :meth:`get_default_params()`

        Returns
        -------
        Dict[str, Union[str, float, int]]
        """
        return self.params

    def get_default_params(self) -> Dict[str, Union[str, float, int]]:
        """Returns the default set of train parameters.

        The default set of parameters will be used when ``params=None``.

        See Also
        --------
        :meth:`get_params()`

        Returns
        -------
        Dict[str, Union[str, float, int]]
        """
        return self._default_params()
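
    # Sketch contrasting the two getters above (assumed override, for illustration):
    #
    #   clf = XGBoostClassifier(params={"max_depth": 5})
    #   clf.get_default_params()["max_depth"]  # 2  (library default)
    #   clf.get_params()["max_depth"]          # 5  (defaults updated with the user override)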

    def get_feature_importance(self) -> pd.DataFrame:
        """Returns the feature importance of the trained booster based on the given ``importance_type``.

        Returns
        -------
        pd.DataFrame
        """
        return self.feature_importance_

    def get_shap_explainer(self) -> shap.TreeExplainer:
        """Returns the ``shap.TreeExplainer`` object.

        Returns
        -------
        shap.TreeExplainer
        """
        self._explainer()

        return self.shap_explainer_

    def _model(self) -> xgb.Booster:
        """Fits a ``XGBoost.Booster`` based on the given number of boosting rounds on the ``dtrain_`` matrix.

        Returns
        -------
        xgb.Booster
        """
        return xgb.train(
            params=self.params,
            dtrain=self.dtrain_,
            num_boost_round=self.num_boost_round,  # type: ignore
        )

    def _explainer(self) -> None:
        """Fits a ``shap.TreeExplainer`` on the ``X_train_`` and ``X_test_`` data.

        Returns
        -------
        None
        """
        self.shap_explainer_ = shap.TreeExplainer(
            model=self.model_,
        )
        self.shap_values_test_ = self.shap_explainer_.shap_values(
            X=self.X_test_,
        )
        self.shap_values_train_ = self.shap_explainer_.shap_values(
            X=self.X_train_,
        )

        return None

    def _imp_to_df(self) -> pd.DataFrame:
        """Converts the feature importance object to ``pd.DataFrame``.

        Returns
        -------
        pd.DataFrame
        """
        data: Dict[str, List[float]] = {
            "feature": [],
            f"{self.importance_type}": [],
        }
        features_gain = self.model_.get_score(
            importance_type=self.importance_type,
        )
        for key, val in features_gain.items():
            data["feature"].append(key)
            data[f"{self.importance_type}"].append(val)

        return (
            pd.DataFrame(data)
            .sort_values(
                by=f"{self.importance_type}",
                ascending=False,
            )
            .reset_index(
                drop=True,
            )
        )

    @staticmethod
    def _default_params() -> Dict[str, Union[str, float, int]]:
        """Default set of parameters when the class is being instantiated with ``params=None``.

        Returns
        -------
        Dict[str, Union[str, float, int]]
        """
        return {
            "eval_metric": "auc",
            "tree_method": "hist",
            "objective": "binary:logistic",
            "learning_rate": 0.05,
            "max_depth": 2,
            "min_child_weight": 1,
            "gamma": 0.0,
            "reg_alpha": 0.0,
            "reg_lambda": 1.0,
            "subsample": 0.9,
            "max_delta_step": 1,
            "verbosity": 0,
            "nthread": 4,
            "scale_pos_weight": 1,
        }
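

# A self-contained, hedged smoke test for the classifier above. It is not part of the library API;
# it only illustrates the intended call sequence on synthetic data and can be removed safely.
if __name__ == "__main__":
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split

    # Build a small synthetic binary-classification dataset
    X, y = make_classification(
        n_samples=500,
        n_features=10,
        n_informative=5,
        random_state=42,
    )
    X = pd.DataFrame(X, columns=[f"F_{i}" for i in range(X.shape[1])])
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
    )

    # Train with a modest number of boosting rounds, then score the hold-out split
    clf = XGBoostClassifier(num_boost_round=100)
    clf.fit(X_train, y_train)
    y_pred_proba = clf.predict_proba(X_test, y_test)
    y_pred = clf.predict(X_test, y_test, threshold=0.5)
    print(clf.get_feature_importance().head())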