Source code for slickml.base._estimator

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Dict, List, Optional, Union

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.base import BaseEstimator
from sklearn.preprocessing import StandardScaler

from slickml.utils._transform import array_to_df, check_var, df_to_csr


@dataclass
class BaseXGBoostEstimator(ABC, BaseEstimator):
    """Base Estimator for XGBoost.

    Notes
    -----
    This is an `abstractbaseclass` using XGBoost [xgboost-api]_ that can be used for any estimator
    using XGBoost as the base estimator such as ``XGBoostCVClassifier``, ``XGBoostRegressor``,
    ``XGBoostFeatureSelector``, ``XGBoostBayesianOptimizer``, and so on. This base estimator comes
    with the base validation utilities that reduce the amount of copy/paste code in the downstream
    classes.

    Parameters
    ----------
    num_boost_round : int
        Number of boosting rounds to fit a model

    sparse_matrix : bool
        Whether to convert the input features to sparse matrix with csr format or not. This would
        increase the speed of feature selection for relatively large/sparse datasets. Consequently,
        this would actually act like an un-optimized solution for a dense feature matrix.
        Additionally, this parameter cannot be used along with ``scale_mean=True``, since
        standardizing the feature matrix to have a mean value of zero would turn it into a dense
        matrix. Therefore, by default, our API disallows this combination

    scale_mean : bool
        Whether to standardize the feature matrix to have a mean value of zero per feature (center
        the features before scaling). As laid out in ``sparse_matrix``, ``scale_mean=False`` is
        required when using ``sparse_matrix=True``, since centering the feature matrix would
        decrease its sparsity and, in practice, it would only make the sparse matrix method worse.
        The ``StandardScaler`` object can be accessed via ``cls.scaler_`` if ``scale_mean`` or
        ``scale_std`` is used; otherwise, it is ``None``

    scale_std : bool
        Whether to scale the feature matrix to have unit variance (or, equivalently, unit standard
        deviation) per feature. The ``StandardScaler`` object can be accessed via ``cls.scaler_``
        if ``scale_mean`` or ``scale_std`` is used; otherwise, it is ``None``

    importance_type : str
        Importance type of ``xgboost.train()`` with possible values ``"weight"``, ``"gain"``,
        ``"total_gain"``, ``"cover"``, ``"total_cover"``

    params : Dict[str, Union[str, float, int]], optional
        Set of parameters required for fitting a Booster

    Methods
    -------
    fit(X, y)
        Abstract method to fit a model to the features/targets depending on the task

    References
    ----------
    .. [xgboost-api] https://xgboost.readthedocs.io/en/latest/python/python_api.html
    """

    num_boost_round: Optional[int]
    sparse_matrix: Optional[bool]
    scale_mean: Optional[bool]
    scale_std: Optional[bool]
    importance_type: Optional[str]
    params: Optional[Dict[str, Union[str, float, int]]] = None

    def __post_init__(self) -> None:
        """Post instantiation validations and assignments."""
        check_var(
            self.num_boost_round,
            var_name="num_boost_round",
            dtypes=int,
        )
        check_var(
            self.sparse_matrix,
            var_name="sparse_matrix",
            dtypes=bool,
        )
        check_var(
            self.scale_mean,
            var_name="scale_mean",
            dtypes=bool,
        )
        check_var(
            self.scale_std,
            var_name="scale_std",
            dtypes=bool,
        )
        check_var(
            self.importance_type,
            var_name="importance_type",
            dtypes=str,
            values=(
                "weight",
                "gain",
                "total_gain",
                "cover",
                "total_cover",
            ),
        )

        # The `StandardScaler` with `mean=True` would turn a sparse matrix into a dense matrix
        if self.sparse_matrix and self.scale_mean:
            raise ValueError(
                "The scale_mean should be False in conjunction with sparse_matrix=True.",
            )

    @abstractmethod
    def fit(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Union[List[float], np.ndarray, pd.Series],
    ) -> None:
        """`Abstractmethod` to fit a model to the features/targets depending on the task.

        Parameters
        ----------
        X : Union[pd.DataFrame, np.ndarray]
            Input data for training (features)

        y : Union[List[float], np.ndarray, pd.Series]
            Input ground truth for training (targets)

        Returns
        -------
        None
        """
        ...  # pragma: no cover

    # TODO(amir): check the `y_train` type; maybe we need to have `list_to_array()` in utils?
    def _dtrain(
        self,
        X_train: Union[pd.DataFrame, np.ndarray],
        y_train: Union[List[float], np.ndarray, pd.Series],
    ) -> xgb.DMatrix:
        """Returns a proper dtrain matrix compatible with sparse/standardized matrices.

        Parameters
        ----------
        X_train : Union[pd.DataFrame, np.ndarray]
            Input data for training (features)

        y_train : Union[List[float], np.ndarray, pd.Series]
            Input ground truth for training (targets)

        See Also
        --------
        :meth:`_dtest()`

        Returns
        -------
        xgb.DMatrix
        """
        check_var(
            X_train,
            var_name="X_train",
            dtypes=(
                pd.DataFrame,
                np.ndarray,
            ),
        )
        check_var(
            y_train,
            var_name="y_train",
            dtypes=(
                list,
                np.ndarray,
                pd.Series,
            ),
        )
        if isinstance(X_train, np.ndarray):
            self.X_train = array_to_df(
                X=X_train,
                prefix="F",
                delimiter="_",
            )
        else:
            self.X_train = X_train
        if not isinstance(y_train, np.ndarray):
            self.y_train = np.array(y_train)
        else:
            self.y_train = y_train
        # TODO(amir): move `StandardScaler` to utils
        if self.scale_mean or self.scale_std:
            self.scaler_ = StandardScaler(
                copy=True,
                with_mean=self.scale_mean,
                with_std=self.scale_std,
            )
            self.X_train_ = pd.DataFrame(
                self.scaler_.fit_transform(self.X_train),
                columns=self.X_train.columns.tolist(),
            )
        else:
            self.scaler_ = None
            self.X_train_ = self.X_train
        if not self.sparse_matrix:
            dtrain = xgb.DMatrix(
                data=self.X_train_,
                label=self.y_train,
            )
        else:
            dtrain = xgb.DMatrix(
                data=df_to_csr(
                    self.X_train_,
                    fillna=0.0,
                    verbose=False,
                ),
                label=self.y_train,
                feature_names=self.X_train_.columns.tolist(),
            )
        return dtrain

    def _dtest(
        self,
        X_test: Union[pd.DataFrame, np.ndarray],
        y_test: Optional[Union[List[float], np.ndarray, pd.Series]] = None,
    ) -> xgb.DMatrix:
        """Returns a proper dtest matrix compatible with sparse/standardized matrices.

        If ``scale_mean=True`` or ``scale_std=True``, the ``StandardScaler`` object ``(scaler_)``
        that was fitted on ``X_train`` will be used to **only** transform ``X_test``, to make sure
        there is no data leakage in the transformation. Additionally, ``y_test`` is optional since
        it might not be available while validating the model (inference).

        Parameters
        ----------
        X_test : Union[pd.DataFrame, np.ndarray]
            Input data for testing (features)

        y_test : Union[List[float], np.ndarray, pd.Series], optional
            Input ground truth for testing (targets)

        See Also
        --------
        :meth:`_dtrain()`

        Returns
        -------
        xgb.DMatrix
        """
        check_var(
            X_test,
            var_name="X_test",
            dtypes=(
                pd.DataFrame,
                np.ndarray,
            ),
        )
        if y_test is not None:
            check_var(
                y_test,
                var_name="y_test",
                dtypes=(
                    list,
                    np.ndarray,
                    pd.Series,
                ),
            )
            if not isinstance(y_test, np.ndarray):
                self.y_test = np.array(y_test)
            else:
                self.y_test = y_test
        else:
            self.y_test = y_test
        if isinstance(X_test, np.ndarray):
            self.X_test = array_to_df(
                X=X_test,
                prefix="F",
                delimiter="_",
            )
        else:
            self.X_test = X_test
        if self.scale_mean or self.scale_std:
            self.X_test_ = pd.DataFrame(
                self.scaler_.transform(self.X_test),
                columns=self.X_test.columns.tolist(),
            )
        else:
            self.X_test_ = self.X_test
        if not self.sparse_matrix:
            dtest = xgb.DMatrix(
                data=self.X_test_,
                label=self.y_test,
            )
        else:
            dtest = xgb.DMatrix(
                data=df_to_csr(
                    self.X_test_,
                    fillna=0.0,
                    verbose=False,
                ),
                label=self.y_test,
                feature_names=self.X_test_.columns.tolist(),
            )
        return dtest

    def _check_X_y(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Union[List[float], np.ndarray, pd.Series],
    ) -> None:
        """Validates/pre-processes the input matrices (features/targets).
        Returns
        -------
        None
        """
        check_var(
            X,
            var_name="X",
            dtypes=(
                pd.DataFrame,
                np.ndarray,
            ),
        )
        check_var(
            y,
            var_name="y",
            dtypes=(
                list,
                np.ndarray,
                pd.Series,
            ),
        )
        if isinstance(X, np.ndarray):
            self.X = array_to_df(
                X=X,
                prefix="F",
                delimiter="_",
            )
        else:
            self.X = X
        if not isinstance(y, np.ndarray):
            self.y = np.array(y)
        else:
            self.y = y

        return None
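

# ------------------------------------------------------------------------------------------------
# Usage sketch (illustrative only; not part of slickml). The subclass below shows how a downstream
# estimator could implement the abstract ``fit()`` on top of ``BaseXGBoostEstimator``, reusing
# ``_dtrain()`` for training and ``_dtest()`` for leak-free transformation at inference time. The
# class name ``_ExampleXGBoostRegressor``, its default hyper-parameters, and the ``predict()``
# helper are assumptions made for demonstration; slickml's real downstream classes (for example,
# ``XGBoostRegressor``) may differ. It relies on the imports and definitions above.
@dataclass
class _ExampleXGBoostRegressor(BaseXGBoostEstimator):
    """Minimal, hypothetical regressor built on top of ``BaseXGBoostEstimator``."""

    num_boost_round: Optional[int] = 200
    sparse_matrix: Optional[bool] = False
    scale_mean: Optional[bool] = False
    scale_std: Optional[bool] = False
    importance_type: Optional[str] = "total_gain"
    params: Optional[Dict[str, Union[str, float, int]]] = None

    def fit(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Union[List[float], np.ndarray, pd.Series],
    ) -> None:
        # `_dtrain()` validates the inputs, applies the optional scaler, and builds the DMatrix
        dtrain = self._dtrain(X_train=X, y_train=y)
        self.model_ = xgb.train(
            params=self.params or {"objective": "reg:squarederror"},
            dtrain=dtrain,
            num_boost_round=self.num_boost_round,
        )
        return None

    def predict(
        self,
        X: Union[pd.DataFrame, np.ndarray],
    ) -> np.ndarray:
        # `_dtest()` reuses the scaler fitted on the training data (no leakage) before predicting
        return self.model_.predict(self._dtest(X_test=X))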
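

# ------------------------------------------------------------------------------------------------
# Configuration note (illustrative only). As validated in ``__post_init__()``, ``sparse_matrix=True``
# cannot be combined with ``scale_mean=True`` since centering would densify the feature matrix, so
# that combination raises a ``ValueError``. When ``scale_mean`` or ``scale_std`` is enabled, the
# fitted ``StandardScaler`` becomes available as ``scaler_`` once ``_dtrain()`` has been called;
# otherwise ``scaler_`` is ``None``. The tiny demo below uses the hypothetical
# ``_ExampleXGBoostRegressor`` defined above and synthetic data.
if __name__ == "__main__":  # pragma: no cover
    try:
        _ExampleXGBoostRegressor(sparse_matrix=True, scale_mean=True)
    except ValueError as error:
        print(error)

    reg = _ExampleXGBoostRegressor(scale_mean=True, scale_std=True)
    X_demo = pd.DataFrame({"f1": [0.1, 0.4, 0.7, 1.0], "f2": [1.0, 2.0, 3.0, 4.0]})
    y_demo = [0.0, 1.0, 2.0, 3.0]
    reg.fit(X=X_demo, y=y_demo)
    print(reg.scaler_.mean_)  # scaler fitted on the training features only
    print(reg.predict(X_demo))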