Source code for slickml.optimization._bayesianopt

from dataclasses import dataclass
from typing import Dict, List, Optional, Set, Tuple, Union

import numpy as np
import pandas as pd
import xgboost as xgb
from bayes_opt import BayesianOptimization

from slickml.base import BaseXGBoostEstimator
from slickml.utils import check_var


# TODO(amir): add multi-class objective for multi-label classification
# TODO(amir): currently the `params` don't do anything. we need to make the process of combining
# `params` and `params_bounds` dynamic with `_xgb_eval()`
@dataclass
class XGBoostBayesianOptimizer(BaseXGBoostEstimator):
    """XGBoost Hyper-Parameters Tuner using Bayesian Optimization.

    This is a wrapper that uses the Bayesian Optimization algorithm [bayesian-optimization]_ to tune
    the hyper-parameters of XGBoost [xgboost-api]_ via the ``xgboost.cv()`` functionality with
    n-folds cross-validation, iteratively. This feature can be used to find the optimized set of
    hyper-parameters for both classification and regression tasks.

    Notes
    -----
    The optimizer objective is always to maximize the target values. Therefore, when using a metric
    such as ``logloss``, ``error``, ``mae``, ``rmse``, or ``rmsle``, the negative value of the
    metric will be maximized. One of the big pitfalls of the current implementation is the way
    hyper-parameters are sampled from ``params_bounds``: integer-valued parameters cannot be
    sampled directly. Therefore, for some cases i.e. ``max_depth`` we must cast the sampled value,
    which is mathematically wrong (i.e. ``f(1.1) != f(1)``).

    Parameters
    ----------
    n_iter : int, optional
        Number of iteration rounds for hyper-parameters tuning after initialization, by default 10

    n_init_iter : int, optional
        Number of initial iterations to initialize the optimizer, by default 5

    n_splits : int, optional
        Number of folds for cross-validation, by default 4

    metrics : str, optional
        Metrics to be tracked at cross-validation fitting time depending on the task
        (classification vs regression) with possible values of "auc", "aucpr", "error", "logloss",
        "rmse", "rmsle", "mae". Note this is different than ``eval_metric`` that needs to be passed
        to the ``params`` dict, by default "auc"

    objective : str, optional
        Objective function depending on the task, whether it is regression or classification.
        Possible objectives for classification are ``"binary:logistic"`` and for regression
        ``"reg:logistic"``, ``"reg:squarederror"``, and ``"reg:squaredlogerror"``, by default
        "binary:logistic"

    acquisition_criterion : str, optional
        Acquisition criterion method with possible options of ``"ei"`` (Expected Improvement),
        ``"ucb"`` (Upper Confidence Bounds), and ``"poi"`` (Probability Of Improvement), by
        default "ei"

    params_bounds : Dict[str, Tuple[Union[int, float], Union[int, float]]], optional
        Set of hyper-parameters boundaries for Bayesian Optimization where all fields are required,
        by default {"max_depth": (2, 7), "learning_rate": (0, 1), "min_child_weight": (1, 20),
        "colsample_bytree": (0.1, 1.0), "subsample": (0.1, 1), "gamma": (0, 1),
        "reg_alpha": (0, 1), "reg_lambda": (0, 1)}

    num_boost_round : int, optional
        Number of boosting rounds to fit a model, by default 200

    early_stopping_rounds : int, optional
        The criterion to early abort the ``xgboost.cv()`` phase if the test metric is not improved,
        by default 20

    random_state : int, optional
        Random seed number, by default 1367

    stratified : bool, optional
        Whether to use stratification of the targets (only available for classification tasks) to
        run ``xgboost.cv()`` to find the best number of boosting rounds at each fold of each
        iteration, by default True

    shuffle : bool, optional
        Whether to shuffle data to have the ability of building stratified folds in
        ``xgboost.cv()``, by default True

    sparse_matrix : bool, optional
        Whether to convert the input features to sparse matrix with csr format or not. This would
        increase the speed of feature selection for relatively large/sparse datasets. Consequently,
        this would actually act like an un-optimized solution for a dense feature matrix.
        Additionally, this parameter cannot be used along with ``scale_mean=True``, since
        standardizing the feature matrix to have a mean value of zero would turn the feature matrix
        into a dense matrix. Therefore, by default our API banned this feature, by default False

    scale_mean : bool, optional
        Whether to standardize the feature matrix to have a mean value of zero per feature (center
        the features before scaling). As laid out in ``sparse_matrix``, ``scale_mean=False`` when
        using ``sparse_matrix=True``, since centering the feature matrix would decrease the
        sparsity, and in practice it does not make any sense to use the sparse matrix method as it
        would make it worse. The ``StandardScaler`` object can be accessed via ``cls.scaler_`` if
        ``scale_mean`` or ``scale_std`` is used unless it is ``None``, by default False

    scale_std : bool, optional
        Whether to scale the feature matrix to have unit variance (or equivalently, unit standard
        deviation) per feature. The ``StandardScaler`` object can be accessed via ``cls.scaler_``
        if ``scale_mean`` or ``scale_std`` is used unless it is ``None``, by default False

    importance_type : str, optional
        Importance type of ``xgboost.train()`` with possible values ``"weight"``, ``"gain"``,
        ``"total_gain"``, ``"cover"``, ``"total_cover"``, by default "total_gain"

    verbose : bool, optional
        Whether to show the Bayesian Optimization progress at each iteration, by default True

    Methods
    -------
    fit(X, y)
        Fits the Bayesian optimization algorithm to tune the hyper-parameters

    get_optimizer()
        Returns the fitted Bayesian Optimization object

    get_results()
        Returns all the optimization results including target and params

    get_best_results()
        Returns the results based on the best (tuned) hyper-parameters

    get_best_params()
        Returns the tuned hyper-parameters as a dictionary

    get_params_bounds()
        Returns the parameters boundaries

    Attributes
    ----------
    optimizer_
        Returns the fitted Bayesian Optimization object

    results_
        Returns all the optimization results including target and params

    best_params_
        Returns the tuned hyper-parameters as a dictionary

    best_results_
        Returns the results based on the best (tuned) hyper-parameters

    References
    ----------
    .. [xgboost-api] https://xgboost.readthedocs.io/en/latest/python/python_api.html
    .. [bayesian-optimization] https://github.com/fmfn/BayesianOptimization
    """

    n_iter: Optional[int] = 10
    n_init_iter: Optional[int] = 5
    n_splits: Optional[int] = 4
    metrics: Optional[str] = "auc"
    objective: Optional[str] = "binary:logistic"
    acquisition_criterion: Optional[str] = "ei"
    params_bounds: Optional[Dict[str, Tuple[Union[int, float], Union[int, float]]]] = None
    num_boost_round: Optional[int] = 200
    early_stopping_rounds: Optional[int] = 20
    sparse_matrix: Optional[bool] = False
    scale_mean: Optional[bool] = False
    scale_std: Optional[bool] = False
    importance_type: Optional[str] = "total_gain"
    stratified: Optional[bool] = True
    shuffle: Optional[bool] = True
    random_state: Optional[int] = 1367
    verbose: Optional[bool] = True
    def __post_init__(self) -> None:
        """Post instantiation validations and assignments."""
        super().__post_init__()
        check_var(
            self.n_iter,
            var_name="n_iter",
            dtypes=int,
        )
        check_var(
            self.n_init_iter,
            var_name="n_init_iter",
            dtypes=int,
        )
        check_var(
            self.n_splits,
            var_name="n_splits",
            dtypes=int,
        )
        check_var(
            self.metrics,
            var_name="metrics",
            dtypes=str,
            values=(
                "auc",
                "aucpr",
                "error",
                "logloss",
                "rmse",
                "rmsle",
                "mae",
            ),
        )
        check_var(
            self.early_stopping_rounds,
            var_name="early_stopping_rounds",
            dtypes=int,
        )
        check_var(
            self.random_state,
            var_name="random_state",
            dtypes=int,
        )
        check_var(
            self.stratified,
            var_name="stratified",
            dtypes=bool,
        )
        check_var(
            self.shuffle,
            var_name="shuffle",
            dtypes=bool,
        )
        check_var(
            self.verbose,
            var_name="verbose",
            dtypes=bool,
        )
        # TODO(amir): use type overload
        self.verbose = self._verbose()  # type: ignore
        check_var(
            self.objective,
            var_name="objective",
            dtypes=str,
            values=(
                "binary:logistic",
                "reg:squarederror",
                "reg:squaredlogerror",
                "reg:logistic",
            ),
        )
        check_var(
            self.acquisition_criterion,
            var_name="acquisition_criterion",
            dtypes=str,
            values=(
                "ei",
                "ucb",
                "poi",
            ),
        )

        # The default set of params bounds can be updated based on the params bounds given by user
        _default_params_bounds = self._default_params_bounds()
        if self.params_bounds is not None:
            check_var(
                self.params_bounds,
                var_name="params_bounds",
                dtypes=dict,
            )
            _default_params_bounds.update(self.params_bounds)
            self.params_bounds = _default_params_bounds
        else:
            self.params_bounds = self._default_params_bounds()

        # classification/regression metrics and objectives should be aligned
        self._metrics_and_objectives_should_be_aligned()
    def fit(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Union[List[float], np.ndarray, pd.Series],
    ) -> None:
        """Fits the main hyper-parameter tuning algorithm.

        Notes
        -----
        At each iteration, one set of parameters gets sampled from the ``params_bounds`` and the
        evaluation occurs based on the cross-validation results. The Bayesian optimizer always
        maximizes the objective. Therefore, we should be careful with metrics that are supposed to
        be minimized i.e. ``error``. For those, we maximize ``(-1) * metric`` instead. One of the
        big pitfalls of the current implementation is the way hyper-parameters are sampled from the
        ``params_bounds``: integer-valued parameters cannot be sampled directly. Therefore, for
        some cases i.e. ``max_depth`` we must cast the sampled value, which is mathematically wrong
        (i.e. ``f(1.1) != f(1)``).

        Parameters
        ----------
        X : Union[pd.DataFrame, np.ndarray]
            Input data for training (features)

        y : Union[List[float], np.ndarray, pd.Series]
            Input ground truth for training (targets)

        Returns
        -------
        None
        """

        def _xgb_eval(
            max_depth: float,
            subsample: float,
            colsample_bytree: float,
            min_child_weight: float,
            learning_rate: float,
            gamma: float,
            reg_alpha: float,
            reg_lambda: float,
        ) -> float:
            """Inner hyper-parameter evaluation.

            Returns
            -------
            float
            """
            params = self._inner_params(
                max_depth=max_depth,
                subsample=subsample,
                colsample_bytree=colsample_bytree,
                min_child_weight=min_child_weight,
                learning_rate=learning_rate,
                gamma=gamma,
                reg_alpha=reg_alpha,
                reg_lambda=reg_lambda,
            )
            if self.metrics in self._clf_metrics():
                _cvr = xgb.cv(
                    params=params,
                    dtrain=self.dtrain_,
                    num_boost_round=self.num_boost_round,
                    nfold=self.n_splits,
                    stratified=self.stratified,
                    metrics=self.metrics,
                    early_stopping_rounds=self.early_stopping_rounds,
                    seed=self.random_state,
                    shuffle=self.shuffle,
                )
            else:
                _cvr = xgb.cv(
                    params=params,
                    dtrain=self.dtrain_,
                    num_boost_round=self.num_boost_round,
                    nfold=self.n_splits,
                    metrics=self.metrics,
                    early_stopping_rounds=self.early_stopping_rounds,
                    seed=self.random_state,
                    shuffle=self.shuffle,
                )

            if self.metrics in self._metrics_should_be_minimized():
                return (-1) * _cvr.iloc[-1][2]
            else:
                return _cvr.iloc[-1][2]

        self.dtrain_ = self._dtrain(
            X_train=X,
            y_train=y,
        )
        self.optimizer_ = BayesianOptimization(
            f=_xgb_eval,
            pbounds=self.params_bounds,
            random_state=self.random_state,
            verbose=self.verbose,
            constraint=None,
            bounds_transformer=None,
        )
        self.optimizer_.maximize(
            init_points=self.n_init_iter,
            n_iter=self.n_iter,
            acq=self.acquisition_criterion,
            kappa=2.576,
            kappa_decay=1,
            kappa_decay_delay=0,
            xi=0.0,
        )
        self.results_ = self.get_results()
        self.best_params_ = self.get_best_params()
        self.best_results_ = self.get_best_results()

        return None
    def get_params_bounds(self) -> Optional[Dict[str, Tuple[Union[int, float], Union[int, float]]]]:
        """Returns the hyper-parameters boundaries for the tuning process.

        Returns
        -------
        Dict[str, Tuple[Union[int, float], Union[int, float]]]
        """
        return self.params_bounds

    def get_optimizer(self) -> BayesianOptimization:
        """Returns the fitted Bayesian Optimization object.

        Returns
        -------
        BayesianOptimization
        """
        return self.optimizer_

    def get_results(self) -> pd.DataFrame:
        """Returns the hyper-parameter optimization results.

        Returns
        -------
        pd.DataFrame
        """
        frames = []
        for idx, res in enumerate(self.optimizer_.res):
            data = res["params"]
            data[self.metrics] = res["target"]
            frames.append(
                pd.DataFrame(
                    data=data,
                    index=[idx],
                ),
            )
        df_results = pd.concat(
            frames,
            axis=0,
        )
        df_results["max_depth"] = df_results["max_depth"].astype(int)

        return df_results

    def get_best_params(self) -> Dict[str, Union[str, float, int]]:
        """Returns the tuned results of the optimization as the best set of hyper-parameters.

        Returns
        -------
        Dict[str, Union[str, float, int]]
        """
        targets = []
        for _, rs in enumerate(self.optimizer_.res):
            targets.append(rs["target"])
        best_params = self.optimizer_.res[targets.index(max(targets))]["params"]
        best_params["max_depth"] = int(best_params["max_depth"])

        return best_params

    def get_best_results(self) -> pd.DataFrame:
        """Returns the performance of the best (tuned) set of hyper-parameters.

        Returns
        -------
        pd.DataFrame
        """
        cond = self.results_[self.metrics] == self.results_[self.metrics].max()

        return self.results_.loc[cond, :].reset_index(drop=True)
    def _default_params_bounds(self) -> Dict[str, Tuple[Union[int, float], Union[int, float]]]:
        """Default set of parameters when the class is being instantiated with ``params_bounds=None``.

        Notes
        -----
        The default set of parameters would be a little bit different depending on whether a
        classification or regression ``metric`` is being used.

        Returns
        -------
        Dict[str, Tuple[Union[int, float], Union[int, float]]]
        """
        return {
            "max_depth": (2, 7),
            "learning_rate": (0.0, 1.0),
            "min_child_weight": (1.0, 20.0),
            "colsample_bytree": (0.1, 1.0),
            "subsample": (0.1, 1.0),
            "gamma": (0.0, 1.0),
            "reg_alpha": (0.0, 1.0),
            "reg_lambda": (0.0, 1.0),
        }

    def _verbose(self) -> int:
        """Returns verbosity level based on ``verbose``.

        Returns
        -------
        int
        """
        return 2 if self.verbose else 0

    def _inner_params(
        self,
        max_depth: float,
        subsample: float,
        colsample_bytree: float,
        min_child_weight: float,
        learning_rate: float,
        gamma: float,
        reg_alpha: float,
        reg_lambda: float,
    ) -> Dict[str, Union[str, float, int, None]]:
        """Default set of parameters passed in inner evaluation.

        Notes
        -----
        The default set of inner parameters would be a little bit different depending on whether a
        classification or regression ``metric`` is being used.

        Returns
        -------
        Dict[str, Union[str, float, int, None]]
        """
        _params = {
            "eval_metric": self.metrics,
            "objective": self.objective,
            "reg_alpha": reg_alpha,
            "reg_lambda": reg_lambda,
            "learning_rate": learning_rate,
            "max_depth": int(max_depth),
            "min_child_weight": min_child_weight,
            "gamma": gamma,
            "subsample": subsample,
            "colsample_bytree": colsample_bytree,
            "tree_method": "hist",
            "nthread": 4,
            "max_delta_step": 1,
            "verbosity": 0,
        }
        # TODO(amir): this way probably breaks for imbalanced classification
        if self.metrics in self._clf_metrics():
            _params["scale_pos_weight"] = 1

        return _params

    def _metrics_should_be_minimized(self) -> Set[str]:
        """Returns the default metrics that should be minimized.

        Returns
        -------
        Set[str]
        """
        return {
            "error",
            "logloss",
            "rmse",
            "rmsle",
            "mae",
        }

    def _clf_metrics(self) -> Set[str]:
        """Returns the default classification metrics.

        Returns
        -------
        Set[str]
        """
        return {
            "auc",
            "aucpr",
            "error",
            "logloss",
        }

    def _clf_objectives(self) -> Set[str]:
        """Returns the default classification objectives.

        Returns
        -------
        Set[str]
        """
        return {
            "binary:logistic",
        }

    def _metrics_and_objectives_should_be_aligned(self) -> None:
        """Predicate to validate the given metric and objective are aligned.

        Raises
        ------
        ValueError

        Returns
        -------
        None
        """
        if self.metrics in self._clf_metrics() and self.objective not in self._clf_objectives():
            raise ValueError("Classification metrics cannot be used with regression objectives.")

        if self.metrics not in self._clf_metrics() and self.objective in self._clf_objectives():
            raise ValueError("Regression metrics cannot be used with classification objectives.")

        return None
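The sketch below (not part of the module) shows one way the public API documented above could be
used end-to-end: instantiate the tuner, call ``fit(X, y)``, then read the tuned hyper-parameters
and their cross-validated performance. The scikit-learn breast-cancer data, the package-level
import path ``slickml.optimization``, and the reduced iteration counts are illustrative
assumptions, not guarantees from this source file. Since ``__post_init__()`` merges a user-provided
``params_bounds`` into the defaults, only the bounds you want to narrow need to be passed.

    # A minimal usage sketch, assuming scikit-learn is available for toy data and that
    # XGBoostBayesianOptimizer is re-exported from the slickml.optimization package namespace.
    import pandas as pd
    from sklearn.datasets import load_breast_cancer

    from slickml.optimization import XGBoostBayesianOptimizer

    data = load_breast_cancer()
    X = pd.DataFrame(data.data, columns=data.feature_names)
    y = data.target

    xbo = XGBoostBayesianOptimizer(
        n_iter=5,            # illustrative: fewer tuning rounds than the default of 10
        n_init_iter=2,       # illustrative: fewer random initialization points than the default of 5
        n_splits=3,
        metrics="auc",       # for metrics such as "rmse" or "logloss", the reported target is (-1) * metric
        objective="binary:logistic",
        params_bounds={
            # only override a couple of bounds; the remaining ones fall back to the defaults above
            "max_depth": (2, 5),
            "learning_rate": (0.01, 0.3),
        },
    )
    xbo.fit(X, y)

    print(xbo.get_best_params())   # tuned hyper-parameters as a dict (max_depth already cast to int)
    print(xbo.get_best_results())  # cross-validated performance of the best set
    print(xbo.get_results())       # full optimization history (target + sampled params)

Note that for a regression task the same flow applies with, e.g., ``metrics="rmse"`` and
``objective="reg:squarederror"``; the target column returned by ``get_results()`` then holds the
negated metric, since the Bayesian optimizer only maximizes.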