from dataclasses import dataclass
from typing import Dict, List, Optional, Set, Tuple, Union
import numpy as np
import pandas as pd
import xgboost as xgb
from bayes_opt import BayesianOptimization
from bayes_opt.util import UtilityFunction
from slickml.base import BaseXGBoostEstimator
from slickml.utils import check_var
# TODO(amir): add multi-class objective for multi-label classification
# TODO(amir): currently the `params` don't do anything. we need to make the combination of
# `params` and `params_pbounds` dynamic within `_xgb_eval()`
@dataclass
class XGBoostBayesianOptimizer(BaseXGBoostEstimator):
"""XGBoost Hyper-Parameters Tuner using Bayesian Optimization.
This is a wrapper using the Bayesian Optimization algorithm [bayesian-optimization]_ to tune
the hyper-parameters of XGBoost [xgboost-api]_ iteratively via the ``xgboost.cv()``
functionality with n-folds cross-validation. This feature can be used to find the optimized
set of hyper-parameters for both classification and regression tasks.
Notes
-----
The optimizer objective is always to maximize the target values. Therefore, in case of using a
metric such as ``logloss``, ``error``, ``mae``, ``rmse``, or ``rmsle``, the negative value of the
metric will be maximized. One of the big pitfalls of the current implementation is the way we
are sampling hyper-parameters from the ``params_bounds``: sampling an integer is not possible.
Therefore, for some cases, i.e. ``max_depth``, we must cast the sampled value, which is
mathematically wrong (i.e. ``f(1.1) != f(1)``).
Parameters
----------
n_iter : int, optional
Number of iteration rounds for hyper-parameters tuning after initialization, by default 10
n_init_iter : int, optional
Number of initial iterations to initialize the optimizer, by default 5
n_splits : int, optional
Number of folds for cross-validation, by default 4
metrics : str, optional
Metric to be tracked at cross-validation fitting time depending on the task
(classification vs regression) with possible values of "auc", "aucpr", "error", "logloss",
"rmse", "rmsle", and "mae". Note that this is different from the ``eval_metric`` that needs to
be passed to the ``params`` dict, by default "auc"
objective : str, optional
Objective function depending on the task, whether it is regression or classification. Possible
objectives for classification are ``"binary:logistic"`` and for regression are ``"reg:logistic"``,
``"reg:squarederror"``, and ``"reg:squaredlogerror"``, by default "binary:logistic"
acquisition_criterion : str, optional
Acquisition criterion method with possible options of ``"ei"`` (Expected Improvement),
``"ucb"`` (Upper Confidence Bounds), and ``"poi"`` (Probability Of Improvement), by default "ei"
params_bounds : Dict[str, Tuple[Union[int, float], Union[int, float]]], optional
Set of hyper-parameters boundaries for Bayesian Optimization where all fields are required,
by default {"max_depth" : (2, 7), "learning_rate" : (0, 1), "min_child_weight" : (1, 20),
"colsample_bytree": (0.1, 1.0), "subsample" : (0.1, 1), "gamma" : (0, 1),
"reg_alpha" : (0, 1), "reg_lambda" : (0, 1)}
num_boost_round : int, optional
Number of boosting rounds to fit a model, by default 200
early_stopping_rounds : int, optional
The criterion to early stop the ``xgboost.cv()`` phase if the test metric has not improved,
by default 20
random_state : int, optional
Random seed number, by default 1367
stratified : bool, optional
Whether to use stratification of the targets (only available for classification tasks) to run
``xgboost.cv()`` to find the best number of boosting rounds at each fold of each iteration,
by default True
shuffle : bool, optional
Whether to shuffle data to have the ability of building stratified folds in ``xgboost.cv()``,
by default True
sparse_matrix : bool, optional
Whether to convert the input features to a sparse matrix in CSR format or not. This would
increase the training speed for relatively large/sparse datasets. Consequently, it would
actually act like an un-optimized solution for a dense feature matrix. Additionally, this
parameter cannot be used along with ``scale_mean=True``, since standardizing the feature matrix
to have a mean value of zero would turn it into a dense matrix. Therefore, our API bans this
combination, by default False
scale_mean : bool, optional
Whether to standardize the feature matrix to have a mean value of zero per feature (center
the features before scaling). As laid out in ``sparse_matrix``, ``scale_mean`` must be ``False``
when using ``sparse_matrix=True``, since centering the feature matrix would decrease its
sparsity and, in practice, it would defeat the purpose of using the sparse matrix method. The
``StandardScaler`` object can be accessed via ``cls.scaler_`` if ``scale_mean`` or ``scale_std``
is used; otherwise it is ``None``, by default False
scale_std : bool, optional
Whether to scale the feature matrix to have unit variance (or equivalently, unit standard
deviation) per feature. The ``StandardScaler`` object can be accessed via ``cls.scaler_``
if ``scale_mean`` or ``scale_std`` is used; otherwise it is ``None``, by default False
importance_type : str, optional
Importance type of ``xgboost.train()`` with possible values ``"weight"``, ``"gain"``,
``"total_gain"``, ``"cover"``, ``"total_cover"``, by default "total_gain"
verbose : bool, optional
Whether to show the Bayesian Optimization progress at each iteration, by default True
Methods
-------
fit(X, y)
Fits the Bayesian optimization algorithm to tune the hyper-parameters
get_optimizer()
Returns the fitted Bayesian Optimization object
get_results()
Returns all the optimization results including target and params
get_best_results()
Returns the results based on the best (tuned) hyper-parameters
get_best_params()
Returns the tuned hyper-parameters as a dictionary
get_params_bounds()
Returns the parameters boundaries
Attributes
----------
optimizer_
Returns the fitted Bayesian Optimization object
results_
Returns all the optimization results including target and params
best_params_
Returns the tuned hyper-parameters as a dictionary
best_results_
Returns the results based on the best (tuned) hyper-parameters
References
----------
.. [xgboost-api] https://xgboost.readthedocs.io/en/latest/python/python_api.html
.. [bayesian-optimization] https://github.com/fmfn/BayesianOptimization
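Examples
--------
A minimal usage sketch, assuming a feature matrix ``X_train`` and a binary target
``y_train`` are already available (both names are placeholders here). Any user-given
``params_bounds`` entries are merged on top of the default boundaries::

    xbo = XGBoostBayesianOptimizer(
        n_iter=10,
        n_init_iter=5,
        metrics="auc",
        objective="binary:logistic",
        params_bounds={"max_depth": (2, 10)},
    )
    xbo.fit(X_train, y_train)
    best_params = xbo.get_best_params()
    best_results = xbo.get_best_results()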
"""
n_iter: Optional[int] = 10
n_init_iter: Optional[int] = 5
n_splits: Optional[int] = 4
metrics: Optional[str] = "auc"
objective: Optional[str] = "binary:logistic"
acquisition_criterion: Optional[str] = "ei"
params_bounds: Optional[Dict[str, Tuple[Union[int, float], Union[int, float]]]] = None
num_boost_round: Optional[int] = 200
early_stopping_rounds: Optional[int] = 20
sparse_matrix: Optional[bool] = False
scale_mean: Optional[bool] = False
scale_std: Optional[bool] = False
importance_type: Optional[str] = "total_gain"
stratified: Optional[bool] = True
shuffle: Optional[bool] = True
random_state: Optional[int] = 1367
verbose: Optional[bool] = True
def __post_init__(self) -> None:
"""Post instantiation validations and assignments."""
super().__post_init__()
check_var(
self.n_iter,
var_name="n_iter",
dtypes=int,
)
check_var(
self.n_init_iter,
var_name="n_iter",
dtypes=int,
)
check_var(
self.n_splits,
var_name="n_splits",
dtypes=int,
)
check_var(
self.metrics,
var_name="metrics",
dtypes=str,
values=(
"auc",
"aucpr",
"error",
"logloss",
"rmse",
"rmsle",
"mae",
),
)
check_var(
self.early_stopping_rounds,
var_name="early_stopping_rounds",
dtypes=int,
)
check_var(
self.random_state,
var_name="random_state",
dtypes=int,
)
check_var(
self.stratified,
var_name="stratified",
dtypes=bool,
)
check_var(
self.shuffle,
var_name="shuffle",
dtypes=bool,
)
check_var(
self.verbose,
var_name="verbose",
dtypes=bool,
)
# TODO(amir): use type overload
self.verbose = self._verbose() # type: ignore
check_var(
self.objective,
var_name="objective",
dtypes=str,
values=(
"binary:logistic",
"reg:squarederror",
"reg:squaredlogerror",
"reg:logistic",
),
)
check_var(
self.acquisition_criterion,
var_name="acquisition_criterion",
dtypes=str,
values=(
"ei",
"ucb",
"poi",
),
)
# The default set of params bounds can be updated based on the params bounds given by the user
_default_params_bounds = self._default_params_bounds()
if self.params_bounds is not None:
check_var(
self.params_bounds,
var_name="params_bounds",
dtypes=dict,
)
_default_params_bounds.update(self.params_bounds)
self.params_bounds = _default_params_bounds
else:
self.params_bounds = self._default_params_bounds()
# classification/regression metrics and objectives should be aligned
self._metrics_and_objectives_should_be_aligned()
def fit(
self,
X: Union[pd.DataFrame, np.ndarray],
y: Union[List[float], np.ndarray, pd.Series],
) -> None:
"""Fits the main hyper-parameter tuning algorithm.
Notes
-----
At each iteration, one set of parameters gets passed from the ``params_bounds`` and the
evaluation occurs based on the cross-validation results. The Bayesian optimizer always
maximizes the objectives. Therefore, we should be careful with values of ``self.metrics``
that are supposed to get minimized, i.e. ``error``. For those, we maximize ``(-1) * metric``.
One of the big pitfalls of the current implementation is the way we are sampling
hyper-parameters from the ``params_bounds``: sampling an integer is not possible. Therefore,
for some cases, i.e. ``max_depth``, we must cast the sampled value, which is mathematically
wrong (i.e. ``f(1.1) != f(1)``).
Parameters
----------
X : Union[pd.DataFrame, np.ndarray]
Input data for training (features)
y : Union[List[float], np.ndarray, pd.Series]
Input ground truth for training (targets)
Returns
-------
None
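Examples
--------
A sketch of the sign convention laid out in the Notes above; for a metric that is
supposed to get minimized, such as ``"logloss"``, the stored target is the negated
``test-logloss-mean`` of the final cross-validation round, so the best iteration is
still the one with the maximum target (``X_train``/``y_train`` are placeholders)::

    xbo = XGBoostBayesianOptimizer(metrics="logloss", objective="binary:logistic")
    xbo.fit(X_train, y_train)
    # the "logloss" column of ``results_`` holds (-1) * test-logloss-mean
    xbo.get_best_results()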
"""
def _xgb_eval(
max_depth: float,
subsample: float,
colsample_bytree: float,
min_child_weight: float,
learning_rate: float,
gamma: float,
reg_alpha: float,
reg_lambda: float,
) -> float:
"""Inner hyper-parameter evaluation.
Returns
-------
float
"""
params = self._inner_params(
max_depth=max_depth,
subsample=subsample,
colsample_bytree=colsample_bytree,
min_child_weight=min_child_weight,
learning_rate=learning_rate,
gamma=gamma,
reg_alpha=reg_alpha,
reg_lambda=reg_lambda,
)
# stratified folds are only applicable to classification metrics
if self.metrics in self._clf_metrics():
_cvr = xgb.cv(
params=params,
dtrain=self.dtrain_,
num_boost_round=self.num_boost_round,
nfold=self.n_splits,
stratified=self.stratified,
metrics=self.metrics,
early_stopping_rounds=self.early_stopping_rounds,
seed=self.random_state,
shuffle=self.shuffle,
)
else:
_cvr = xgb.cv(
params=params,
dtrain=self.dtrain_,
num_boost_round=self.num_boost_round,
nfold=self.n_splits,
metrics=self.metrics,
early_stopping_rounds=self.early_stopping_rounds,
seed=self.random_state,
shuffle=self.shuffle,
)
# the third column of the ``xgboost.cv()`` results is "test-{metrics}-mean"
if self.metrics in self._metrics_should_be_minimized():
return (-1) * _cvr.iloc[-1, 2]
else:
return _cvr.iloc[-1, 2]
self.dtrain_ = self._dtrain(
X_train=X,
y_train=y,
)
self.optimizer_ = BayesianOptimization(
f=_xgb_eval,
pbounds=self.params_bounds,
random_state=self.random_state,
verbose=self.verbose,
constraint=None,
bounds_transformer=None,
)
self.optimizer_.maximize(
init_points=self.n_init_iter,
n_iter=self.n_iter,
acquisition_function=UtilityFunction(
kind=self.acquisition_criterion,
kappa=2.576,
xi=0.0,
kappa_decay=1,
kappa_decay_delay=0,
),
)
self.results_ = self.get_results()
self.best_params_ = self.get_best_params()
self.best_results_ = self.get_best_results()
return None
def get_params_bounds(
self,
) -> Optional[Dict[str, Tuple[Union[int, float], Union[int, float]]]]:
"""Returns the hyper-parameters boundaries for the tuning process.
Returns
-------
Dict[str, Tuple[Union[int, float], Union[int, float]]]
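Examples
--------
A quick sketch of inspecting the active boundaries; without a user-given
``params_bounds``, the values come from ``_default_params_bounds()``::

    xbo = XGBoostBayesianOptimizer()
    bounds = xbo.get_params_bounds()
    bounds["max_depth"]  # (2, 7) by default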
"""
return self.params_bounds
def get_optimizer(self) -> BayesianOptimization:
"""Return the Bayesian Optimization object.
Returns
-------
BayesianOptimization
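Examples
--------
A sketch of drilling into the underlying ``bayes_opt`` object after ``fit()``, e.g.
to inspect its own record of the probed points and the best one::

    opt = xbo.get_optimizer()
    opt.res  # list of {"target": ..., "params": {...}} entries, one per iteration
    opt.max  # the entry with the maximum target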
"""
return self.optimizer_
def get_results(self) -> pd.DataFrame:
"""Returns the hyper-parameter optimization results.
Returns
-------
pd.DataFrame
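Examples
--------
A sketch of the returned frame after ``fit()``; each row is one iteration with the
sampled hyper-parameters as columns plus a column named after ``metrics`` (here
assumed to be ``"auc"``) holding the target value::

    df = xbo.get_results()
    df[["max_depth", "learning_rate", "auc"]].head()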
"""
frames = []
for idx, res in enumerate(self.optimizer_.res):
data = res["params"]
data[self.metrics] = res["target"]
frames.append(
pd.DataFrame(
data=data,
index=[idx],
),
)
df_results = pd.concat(
frames,
axis=0,
)
df_results["max_depth"] = df_results["max_depth"].astype(int)
return df_results
def get_best_params(self) -> Dict[str, Union[str, float, int]]:
"""Returns the tuned results of the optimization as the best set of hyper-parameters.
Returns
-------
Dict[str, Union[str, float, int]]
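Examples
--------
A sketch of retrieving the tuned values after ``fit()``; the keys follow the entries
of ``params_bounds`` and ``max_depth`` is cast back to ``int``::

    params = xbo.get_best_params()
    params["max_depth"]      # e.g., 4
    params["learning_rate"]  # e.g., 0.123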
"""
targets = []
for rs in self.optimizer_.res:
targets.append(rs["target"])
best_params = self.optimizer_.res[targets.index(max(targets))]["params"]
best_params["max_depth"] = int(best_params["max_depth"])
return best_params
def get_best_results(self) -> pd.DataFrame:
"""Returns the performance of the best (tuned) set of hyper-parameters.
Returns
-------
pd.DataFrame
"""
cond = self.results_[self.metrics] == self.results_[self.metrics].max()
return self.results_.loc[cond, :].reset_index(drop=True)
def _default_params_bounds(
self,
) -> Dict[str, Tuple[Union[int, float], Union[int, float]]]:
"""Default set of parameters when the class is being instantiated with ``params_bounds=None``.
Notes
-----
The default set of parameters would be a little bit different depends on the type of selection
whether a classification or regression `metric` is being used.
Returns
-------
Dict[str, Union[str, float, int]]
"""
return {
"max_depth": (2, 7),
"learning_rate": (0.0, 1.0),
"min_child_weight": (1.0, 20.0),
"colsample_bytree": (0.1, 1.0),
"subsample": (0.1, 1.0),
"gamma": (0.0, 1.0),
"reg_alpha": (0.0, 1.0),
"reg_lambda": (0.0, 1.0),
}
def _verbose(self) -> int:
"""Returns verbosity level based on `verbose`.
Returns
-------
int
"""
return 2 if self.verbose else 0
def _inner_params(
self,
max_depth: float,
subsample: float,
colsample_bytree: float,
min_child_weight: float,
learning_rate: float,
gamma: float,
reg_alpha: float,
reg_lambda: float,
) -> Dict[str, Union[str, float, int, None]]:
"""Default set of parameters passed in inner evaluation.
Notes
-----
The default set of inners parameters would be a little bit different depends on the type of
task whether a classification or regression `metric` is being used.
Returns
-------
Dict[str, Union[str, float, int]]
"""
_params = {
"eval_metric": self.metrics,
"objective": self.objective,
"reg_alpha": reg_alpha,
"reg_lambda": reg_lambda,
"learning_rate": learning_rate,
"max_depth": int(max_depth),
"min_child_weight": min_child_weight,
"gamma": gamma,
"subsample": subsample,
"colsample_bytree": colsample_bytree,
"tree_method": "hist",
"nthread": 4,
"max_delta_step": 1,
"verbosity": 0,
}
# TODO(amir): this way probably breaks for imbalanced classification
if self.metrics in self._clf_metrics():
_params["scale_pos_weight"] = 1
return _params
def _metrics_should_be_minimized(self) -> Set[str]:
"""Returns the default metrics that should be minimized.
Returns
-------
Set[str]
"""
return {
"error",
"logloss",
"rmse",
"rmsle",
"mae",
}
def _clf_metrics(self) -> Set[str]:
"""Returns the default classification metrics.
Returns
-------
Set[str]
"""
return {
"auc",
"aucpr",
"error",
"logloss",
}
def _clf_objectives(self) -> Set[str]:
"""Returns the default classification objectives.
Returns
-------
Set[str]
"""
return {
"binary:logistic",
}
def _metrics_and_objectives_should_be_aligned(self) -> None:
"""Predicate to validate the given metric and objective are aligned.
Raises
------
ValueError
Returns
-------
None
"""
if self.metrics in self._clf_metrics() and self.objective not in self._clf_objectives():
raise ValueError(
"Classification metrics cannot be used with regression objectives.",
)
if self.metrics not in self._clf_metrics() and self.objective in self._clf_objectives():
raise ValueError(
"Regression metrics cannot be used with classification objectives.",
)
return None