Source code for slickml.optimization._hyperopt

from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Set, Union

import hyperopt
import numpy as np
import pandas as pd
import xgboost as xgb
from hyperopt.pyll import stochastic

from slickml.base import BaseXGBoostEstimator
from slickml.utils import check_var


@dataclass
class XGBoostHyperOptimizer(BaseXGBoostEstimator):
    """XGBoost Hyper-Parameters Tuner using HyperOpt Optimization.

    This is a wrapper using HyperOpt [hyperopt]_, a Python library for serial and parallel
    optimization over search spaces, which may include real-valued, discrete, and conditional
    dimensions, to tune the hyper-parameters of XGBoost [xgboost-api]_ using ``xgboost.cv()``
    functionality with n-folds cross-validation iteratively. This feature can be used to find the
    optimized set of hyper-parameters for both classification and regression tasks.

    Notes
    -----
    The optimizer objective is always to minimize the target values. Therefore, in case of using
    a metric such as ``auc`` or ``aucpr``, the negative value of the metric will be minimized.

    Parameters
    ----------
    n_iter : int, optional
        Maximum number of iteration rounds for hyper-parameters tuning before convergence,
        by default 100
    n_splits : int, optional
        Number of folds for cross-validation, by default 4
    metrics : str, optional
        Metrics to be tracked at cross-validation fitting time depending on the task
        (classification vs regression) with possible values of "auc", "aucpr", "error", "logloss",
        "rmse", "rmsle", "mae". Note this is different than `eval_metric` that needs to be passed
        to `params` dict, by default "auc"
    objective : str, optional
        Objective function depending on the task whether it is regression or classification.
        Possible objectives for classification are ``"binary:logistic"`` and for regression
        ``"reg:logistic"``, ``"reg:squarederror"``, and ``"reg:squaredlogerror"``,
        by default "binary:logistic"
    params_bounds : Dict[str, Any], optional
        Set of hyper-parameters boundaries for HyperOpt using ``hyperopt.hp`` and
        ``hyperopt.pyll_utils``, by default {"max_depth": (2, 7), "learning_rate": (0, 1),
        "min_child_weight": (1, 20), "colsample_bytree": (0.1, 1.0), "subsample": (0.1, 1),
        "gamma": (0, 1), "reg_alpha": (0, 1), "reg_lambda": (0, 1)}
    num_boost_round : int, optional
        Number of boosting rounds to fit a model, by default 200
    early_stopping_rounds : int, optional
        The criterion to early abort the ``xgboost.cv()`` phase if the test metric is not improved,
        by default 20
    random_state : int, optional
        Random seed number, by default 1367
    stratified : bool, optional
        Whether to use stratification of the targets (only available for classification tasks) to
        run ``xgboost.cv()`` to find the best number of boosting rounds at each fold of each
        iteration, by default True
    shuffle : bool, optional
        Whether to shuffle data to have the ability of building stratified folds in
        ``xgboost.cv()``, by default True
    sparse_matrix : bool, optional
        Whether to convert the input features to sparse matrix with csr format or not. This would
        increase the speed of feature selection for relatively large/sparse datasets. Consequently,
        this would actually act like an un-optimized solution for a dense feature matrix.
        Additionally, this parameter cannot be used along with ``scale_mean=True`` since
        standardizing the feature matrix to have a mean value of zero would turn the feature
        matrix into a dense matrix. Therefore, by default our API banned this feature,
        by default False
    scale_mean : bool, optional
        Whether to standardize the feature matrix to have a mean value of zero per feature (center
        the features before scaling). As laid out in ``sparse_matrix``, ``scale_mean=False`` when
        using ``sparse_matrix=True``, since centering the feature matrix would decrease the
        sparsity, and in practice it does not make any sense to use the sparse matrix method and
        it would make it worse. The ``StandardScaler`` object can be accessed via ``cls.scaler_``
        if ``scale_mean`` or ``scale_std`` is used unless it is ``None``, by default False
    scale_std : bool, optional
        Whether to scale the feature matrix to have unit variance (or equivalently, unit standard
        deviation) per feature. The ``StandardScaler`` object can be accessed via ``cls.scaler_``
        if ``scale_mean`` or ``scale_std`` is used unless it is ``None``, by default False
    importance_type : str, optional
        Importance type of ``xgboost.train()`` with possible values ``"weight"``, ``"gain"``,
        ``"total_gain"``, ``"cover"``, ``"total_cover"``, by default "total_gain"
    verbose : bool, optional
        Whether to show the HyperOpt Optimization progress at each iteration, by default True

    Methods
    -------
    fit(X, y)
        Fits the HyperOpt optimization algorithm to tune the hyper-parameters
    get_best_params()
        Returns the tuned hyper-parameters as a dictionary
    get_results()
        Returns all the optimization trials
    get_trials()
        Returns the trials object
    get_params_bounds()
        Returns the parameters boundaries

    Attributes
    ----------
    best_params_
        Returns the tuned hyper-parameters as a dictionary
    results_
        Returns all the optimization trials as results

    References
    ----------
    .. [xgboost-api] https://xgboost.readthedocs.io/en/latest/python/python_api.html
    .. [hyperopt] https://github.com/hyperopt/hyperopt
    """

    n_iter: Optional[int] = 100
    n_splits: Optional[int] = 4
    metrics: Optional[str] = "auc"
    num_boost_round: Optional[int] = 200
    objective: Optional[str] = "binary:logistic"
    params_bounds: Optional[Dict[str, Any]] = None
    early_stopping_rounds: Optional[int] = 20
    sparse_matrix: Optional[bool] = False
    scale_mean: Optional[bool] = False
    scale_std: Optional[bool] = False
    importance_type: Optional[str] = "total_gain"
    stratified: Optional[bool] = True
    shuffle: Optional[bool] = True
    random_state: Optional[int] = 1367
    verbose: Optional[bool] = True
    def __post_init__(self) -> None:
        """Post instantiation validations and assignments."""
        super().__post_init__()
        check_var(self.n_iter, var_name="n_iter", dtypes=int)
        check_var(self.n_splits, var_name="n_splits", dtypes=int)
        check_var(
            self.metrics,
            var_name="metrics",
            dtypes=str,
            values=(
                "auc",
                "aucpr",
                "error",
                "logloss",
                "rmse",
                "rmsle",
                "mae",
            ),
        )
        check_var(
            self.objective,
            var_name="objective",
            dtypes=str,
            values=(
                "binary:logistic",
                "reg:squarederror",
                "reg:squaredlogerror",
                "reg:logistic",
            ),
        )
        check_var(self.early_stopping_rounds, var_name="early_stopping_rounds", dtypes=int)
        check_var(self.random_state, var_name="random_state", dtypes=int)
        check_var(self.stratified, var_name="stratified", dtypes=bool)
        check_var(self.shuffle, var_name="shuffle", dtypes=bool)
        check_var(self.verbose, var_name="verbose", dtypes=bool)

        # The default set of params bounds can be updated based on the params bounds given by the user
        _default_params_bounds = self._default_params_bounds()
        if self.params_bounds is not None:
            check_var(self.params_bounds, var_name="params_bounds", dtypes=dict)
            # TODO(amir): here we update the defaults while the user only wants to test out a small
            # search space; we can also just let the user decide; this is doable since the inner
            # scope only needs `space`
            _default_params_bounds.update(self.params_bounds)
            self.params_bounds = _default_params_bounds
        else:
            self.params_bounds = self._default_params_bounds()

        # classification/regression metrics and objectives should be aligned
        self._metrics_and_objectives_should_be_aligned()
    def fit(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Union[List[float], np.ndarray, pd.Series],
    ) -> None:
        """Fits the main hyper-parameter tuning algorithm.

        Notes
        -----
        At each iteration, one set of parameters gets passed from the `params_bounds` and the
        evaluation occurs based on the cross-validation results. The hyper-parameter optimizer
        always minimizes the objective. Therefore, we should be careful when using values of
        `self.metrics` that are supposed to get maximized, i.e. `auc`. For those, we can
        minimize `(-1) * metric`.

        Parameters
        ----------
        X : Union[pd.DataFrame, np.ndarray]
            Input data for training (features)
        y : Union[List[float], np.ndarray, pd.Series]
            Input ground truth for training (targets)

        Returns
        -------
        None
        """

        def _xgb_eval(space: Dict[str, Any]) -> Dict[str, Union[float, str]]:
            """Inner hyper-parameter evaluation.

            Returns
            -------
            Dict[str, Union[float, str]]
            """
            params = stochastic.sample(space)
            params.update(self._inner_params())
            if self.metrics in self._clf_metrics():
                _cvr = xgb.cv(
                    params=params,
                    dtrain=self.dtrain_,
                    num_boost_round=self.num_boost_round,
                    nfold=self.n_splits,
                    stratified=self.stratified,
                    metrics=self.metrics,
                    early_stopping_rounds=self.early_stopping_rounds,
                    seed=self.random_state,
                    shuffle=self.shuffle,
                )
            else:
                _cvr = xgb.cv(
                    params=params,
                    dtrain=self.dtrain_,
                    num_boost_round=self.num_boost_round,
                    nfold=self.n_splits,
                    metrics=self.metrics,
                    early_stopping_rounds=self.early_stopping_rounds,
                    seed=self.random_state,
                    shuffle=self.shuffle,
                )
            # column index 2 of the ``xgboost.cv()`` results is the test-metric mean of the last round
            if self.metrics in self._metrics_should_be_minimized():
                loss = _cvr.iloc[-1][2]
            else:
                loss = (-1) * _cvr.iloc[-1][2]
            return {
                "loss": loss,
                "status": hyperopt.STATUS_OK,
            }

        self.dtrain_ = self._dtrain(
            X_train=X,
            y_train=y,
        )
        self.trials_ = hyperopt.Trials()
        try:
            self.best_params_ = hyperopt.fmin(
                fn=_xgb_eval,
                space=self.params_bounds,
                algo=hyperopt.tpe.suggest,
                max_evals=self.n_iter,
                trials=self.trials_,
                verbose=self.verbose,
                rstate=None,
                allow_trials_fmin=True,
                catch_eval_exceptions=False,
                return_argmin=True,
                max_queue_len=1,
                timeout=None,
                loss_threshold=None,
                pass_expr_memo_ctrl=None,
                points_to_evaluate=None,
                show_progressbar=True,
                early_stop_fn=None,
                trials_save_file="",
            )
            self.results_ = self.trials_.trials
        # TODO(amir): log error
        except Exception as e:
            self.best_params_ = {
                "status": hyperopt.STATUS_FAIL,
                "exception": str(e),
            }

        return None
    def get_best_params(self) -> Dict[str, Union[str, float, int]]:
        """Returns the tuned results of the optimization as the best set of hyper-parameters.

        Returns
        -------
        Dict[str, Union[str, float, int]]
        """
        return self.best_params_
    def get_results(self) -> List[Dict[str, Any]]:
        """Returns all trials results.

        Returns
        -------
        List[Dict[str, Any]]
        """
        return self.results_
    def get_trials(self) -> hyperopt.Trials:
        """Returns the `Trials` object passed to the optimizer.

        Returns
        -------
        hyperopt.Trials
        """
        return self.trials_
    def get_params_bounds(self) -> Optional[Dict[str, Any]]:
        """Returns the hyper-parameters boundaries for the tuning process.

        Returns
        -------
        Dict[str, Any]
        """
        return self.params_bounds
    # TODO(amir): check the type checker for return type
    def _default_params_bounds(self) -> Dict[str, Any]:
        """Default set of parameters when the class is being instantiated with ``params_bounds=None``.

        Notes
        -----
        The default set of parameters would be a little bit different depending on the type of
        selection, whether a classification or regression `metric` is being used.

        Returns
        -------
        Dict[str, Any]
        """
        return {
            "max_depth": hyperopt.hp.choice("max_depth", range(2, 7)),
            "learning_rate": hyperopt.hp.quniform("learning_rate", 0.01, 1.0, 0.01),
            "min_child_weight": hyperopt.hp.quniform("min_child_weight", 1.0, 20.0, 1),
            "colsample_bytree": hyperopt.hp.quniform("colsample_bytree", 0.1, 1.0, 0.01),
            "subsample": hyperopt.hp.quniform("subsample", 0.1, 1, 0.01),
            "gamma": hyperopt.hp.quniform("gamma", 0.0, 1.0, 0.01),
            "reg_alpha": hyperopt.hp.quniform("reg_alpha", 0.0, 1.0, 0.01),
            "reg_lambda": hyperopt.hp.quniform("reg_lambda", 0.0, 1.0, 0.01),
        }

    def _inner_params(self) -> Dict[str, Union[str, float, int]]:
        """Default set of parameters passed in inner evaluation.

        Notes
        -----
        The default set of inner parameters would be a little bit different depending on the type
        of task, whether a classification or regression `metric` is being used.

        Returns
        -------
        Dict[str, Union[str, float, int]]
        """
        _params = {
            "eval_metric": self.metrics,
            "objective": self.objective,
            "tree_method": "hist",
            "nthread": 4,
            "max_delta_step": 1,
            "verbosity": 0,
        }
        # TODO(amir): this would probably break for imbalanced classification
        if self.metrics in self._clf_metrics():
            _params["scale_pos_weight"] = 1

        return _params  # type: ignore

    def _metrics_should_be_minimized(self) -> Set[str]:
        """Returns the default metrics that should be minimized.

        Returns
        -------
        Set[str]
        """
        return {
            "error",
            "logloss",
            "rmse",
            "rmsle",
            "mae",
        }

    def _clf_metrics(self) -> Set[str]:
        """Returns the default classification metrics.

        Returns
        -------
        Set[str]
        """
        return {
            "auc",
            "aucpr",
            "error",
            "logloss",
        }

    def _clf_objectives(self) -> Set[str]:
        """Returns the default classification objectives.

        Returns
        -------
        Set[str]
        """
        return {
            "binary:logistic",
        }

    def _metrics_and_objectives_should_be_aligned(self) -> None:
        """Predicate to validate the given metric and objective are aligned.

        Raises
        ------
        ValueError

        Returns
        -------
        None
        """
        if self.metrics in self._clf_metrics() and self.objective not in self._clf_objectives():
            raise ValueError("Classification metrics cannot be used with regression objectives.")
        if self.metrics not in self._clf_metrics() and self.objective in self._clf_objectives():
            raise ValueError("Regression metrics cannot be used with classification objectives.")

        return None
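

# ---------------------------------------------------------------------------
# Usage sketch (not part of the library source above): a minimal, hedged
# example of how this class is typically driven for a binary classification
# task. It assumes scikit-learn is installed for demo data; the dataset,
# iteration count, and search-space bounds below are illustrative choices,
# not defaults of the library.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from sklearn.datasets import load_breast_cancer

    # demo data: a small binary-classification dataset from scikit-learn
    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    # a narrower search space expressed with ``hyperopt.hp``; any keys given
    # here override the corresponding defaults from ``_default_params_bounds()``
    # during ``__post_init__()``
    custom_bounds = {
        "max_depth": hyperopt.hp.choice("max_depth", range(2, 5)),
        "learning_rate": hyperopt.hp.uniform("learning_rate", 0.01, 0.3),
    }

    xho = XGBoostHyperOptimizer(
        n_iter=20,
        n_splits=4,
        metrics="auc",
        objective="binary:logistic",
        params_bounds=custom_bounds,
        random_state=1367,
    )
    xho.fit(X, y)

    # tuned hyper-parameters; note that hyperopt reports the index of the
    # chosen value for ``hp.choice`` entries such as "max_depth"
    print(xho.get_best_params())

    # full per-iteration details are available via the trials object
    trials = xho.get_trials()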