Source code for slickml.base._estimator

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Dict, List, Optional, Union

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.base import BaseEstimator
from sklearn.preprocessing import StandardScaler

from slickml.utils._transform import array_to_df, check_var, df_to_csr


@dataclass
class BaseXGBoostEstimator(ABC, BaseEstimator):
    """Base Estimator for XGBoost.

    Notes
    -----
    This is an `abstractbaseclass` using XGBoost [xgboost-api]_ that can be used for any estimator
    using XGBoost as the base estimator such as ``XGBoostCVClassifier``, ``XGBoostRegressor``,
    ``XGBoostFeatureSelector``, ``XGBoostBayesianOptimizer``, and so on. This base estimator comes
    with the base validation utilities that reduce the amount of copy/paste code in the downstream
    classes.

    Parameters
    ----------
    num_boost_round : int
        Number of boosting rounds to fit a model

    sparse_matrix : bool
        Whether to convert the input features to sparse matrix with csr format or not. This would
        increase the speed of feature selection for relatively large/sparse datasets. Consequently,
        this would actually act like an un-optimized solution for a dense feature matrix.
        Additionally, this parameter cannot be used along with ``scale_mean=True``, since
        standardizing the feature matrix to have a mean value of zero would turn it into a dense
        matrix. Therefore, by default, our API disallows this combination

    scale_mean : bool
        Whether to standardize the feature matrix to have a mean value of zero per feature (center
        the features before scaling). As laid out in ``sparse_matrix``, ``scale_mean=False`` is
        required when using ``sparse_matrix=True``, since centering the feature matrix would
        decrease its sparsity and, in practice, it would only make the sparse matrix method worse.
        The ``StandardScaler`` object can be accessed via ``cls.scaler_`` if ``scale_mean`` or
        ``scale_std`` is used; otherwise, it is ``None``

    scale_std : bool
        Whether to scale the feature matrix to have unit variance (or, equivalently, unit standard
        deviation) per feature. The ``StandardScaler`` object can be accessed via ``cls.scaler_``
        if ``scale_mean`` or ``scale_std`` is used; otherwise, it is ``None``

    importance_type : str
        Importance type of ``xgboost.train()`` with possible values ``"weight"``, ``"gain"``,
        ``"total_gain"``, ``"cover"``, ``"total_cover"``

    params : Dict[str, Union[str, float, int]], optional
        Set of parameters required for fitting a Booster

    Methods
    -------
    fit(X, y)
        Abstract method to fit a model to the features/targets depending on the task

    References
    ----------
    .. [xgboost-api] https://xgboost.readthedocs.io/en/latest/python/python_api.html
    """

    num_boost_round: Optional[int]
    sparse_matrix: Optional[bool]
    scale_mean: Optional[bool]
    scale_std: Optional[bool]
    importance_type: Optional[str]
    params: Optional[Dict[str, Union[str, float, int]]] = None

    def __post_init__(self) -> None:
        """Post instantiation validations and assignments."""
        check_var(
            self.num_boost_round,
            var_name="num_boost_round",
            dtypes=int,
        )
        check_var(
            self.sparse_matrix,
            var_name="sparse_matrix",
            dtypes=bool,
        )
        check_var(
            self.scale_mean,
            var_name="scale_mean",
            dtypes=bool,
        )
        check_var(
            self.scale_std,
            var_name="scale_std",
            dtypes=bool,
        )
        check_var(
            self.importance_type,
            var_name="importance_type",
            dtypes=str,
            values=(
                "weight",
                "gain",
                "total_gain",
                "cover",
                "total_cover",
            ),
        )

        # The `StandardScaler` with `mean=True` would turn a sparse matrix into a dense matrix
        if self.sparse_matrix and self.scale_mean:
            raise ValueError(
                "The scale_mean should be False in conjunction with sparse_matrix=True.",
            )

    @abstractmethod
    def fit(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Union[List[float], np.ndarray, pd.Series],
    ) -> None:
        """`Abstractmethod` to fit a model to the features/targets depending on the task.

        Parameters
        ----------
        X : Union[pd.DataFrame, np.ndarray]
            Input data for training (features)

        y : Union[List[float], np.ndarray, pd.Series]
            Input ground truth for training (targets)

        Returns
        -------
        None
        """
        ...  # pragma: no cover

    # TODO(amir): check the `y_train` type; maybe we need to have `list_to_array()` in utils?
    def _dtrain(
        self,
        X_train: Union[pd.DataFrame, np.ndarray],
        y_train: Union[List[float], np.ndarray, pd.Series],
    ) -> xgb.DMatrix:
        """Returns a proper dtrain matrix compatible with sparse/standardized matrices.

        Parameters
        ----------
        X_train : Union[pd.DataFrame, np.ndarray]
            Input data for training (features)

        y_train : Union[List[float], np.ndarray, pd.Series]
            Input ground truth for training (targets)

        See Also
        --------
        :meth:`_dtest()`

        Returns
        -------
        xgb.DMatrix
        """
        check_var(
            X_train,
            var_name="X_train",
            dtypes=(
                pd.DataFrame,
                np.ndarray,
            ),
        )
        check_var(
            y_train,
            var_name="y_train",
            dtypes=(
                list,
                np.ndarray,
                pd.Series,
            ),
        )
        if isinstance(X_train, np.ndarray):
            self.X_train = array_to_df(
                X=X_train,
                prefix="F",
                delimiter="_",
            )
        else:
            self.X_train = X_train
        if not isinstance(y_train, np.ndarray):
            self.y_train = np.array(y_train)
        else:
            self.y_train = y_train
        # TODO(amir): move `StandardScaler` to utils
        if self.scale_mean or self.scale_std:
            self.scaler_ = StandardScaler(
                copy=True,
                with_mean=self.scale_mean,
                with_std=self.scale_std,
            )
            self.X_train_ = pd.DataFrame(
                self.scaler_.fit_transform(self.X_train),
                columns=self.X_train.columns.tolist(),
            )
        else:
            self.scaler_ = None
            self.X_train_ = self.X_train
        if not self.sparse_matrix:
            dtrain = xgb.DMatrix(
                data=self.X_train_,
                label=self.y_train,
            )
        else:
            dtrain = xgb.DMatrix(
                data=df_to_csr(
                    self.X_train_,
                    fillna=0.0,
                    verbose=False,
                ),
                label=self.y_train,
                feature_names=self.X_train_.columns.tolist(),
            )
        return dtrain

    def _dtest(
        self,
        X_test: Union[pd.DataFrame, np.ndarray],
        y_test: Optional[Union[List[float], np.ndarray, pd.Series]] = None,
    ) -> xgb.DMatrix:
        """Returns a proper dtest matrix compatible with sparse/standardized matrices.

        If ``scale_mean=True`` or ``scale_std=True``, the ``StandardScaler`` object ``(scaler_)``
        that was fitted on ``X_train`` will be used to **only** transform ``X_test``, to make sure
        there is no data leakage in the transformation. Additionally, ``y_test`` is optional since
        it might not be available while validating the model (inference).

        Parameters
        ----------
        X_test : Union[pd.DataFrame, np.ndarray]
            Input data for testing (features)

        y_test : Union[List[float], np.ndarray, pd.Series], optional
            Input ground truth for testing (targets)

        See Also
        --------
        :meth:`_dtrain()`

        Returns
        -------
        xgb.DMatrix
        """
        check_var(
            X_test,
            var_name="X_test",
            dtypes=(
                pd.DataFrame,
                np.ndarray,
            ),
        )
        if y_test is not None:
            check_var(
                y_test,
                var_name="y_test",
                dtypes=(
                    list,
                    np.ndarray,
                    pd.Series,
                ),
            )
            if not isinstance(y_test, np.ndarray):
                self.y_test = np.array(y_test)
            else:
                self.y_test = y_test
        else:
            self.y_test = y_test
        if isinstance(X_test, np.ndarray):
            self.X_test = array_to_df(
                X=X_test,
                prefix="F",
                delimiter="_",
            )
        else:
            self.X_test = X_test
        if self.scale_mean or self.scale_std:
            self.X_test_ = pd.DataFrame(
                self.scaler_.transform(self.X_test),
                columns=self.X_test.columns.tolist(),
            )
        else:
            self.X_test_ = self.X_test
        if not self.sparse_matrix:
            dtest = xgb.DMatrix(
                data=self.X_test_,
                label=self.y_test,
            )
        else:
            dtest = xgb.DMatrix(
                data=df_to_csr(
                    self.X_test_,
                    fillna=0.0,
                    verbose=False,
                ),
                label=self.y_test,
                feature_names=self.X_test_.columns.tolist(),
            )
        return dtest

    def _check_X_y(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Union[List[float], np.ndarray, pd.Series],
    ) -> None:
        """Validates/pre-processes the input matrices (features/targets).
        Returns
        -------
        None
        """
        check_var(
            X,
            var_name="X",
            dtypes=(
                pd.DataFrame,
                np.ndarray,
            ),
        )
        check_var(
            y,
            var_name="y",
            dtypes=(
                list,
                np.ndarray,
                pd.Series,
            ),
        )
        if isinstance(X, np.ndarray):
            self.X = array_to_df(
                X=X,
                prefix="F",
                delimiter="_",
            )
        else:
            self.X = X
        if not isinstance(y, np.ndarray):
            self.y = np.array(y)
        else:
            self.y = y

        return None
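

# ------------------------------------------------------------------------------------------------
# Usage sketch (illustrative only; not part of slickml). The subclass below shows how a downstream
# estimator could implement the abstract ``fit()`` on top of ``BaseXGBoostEstimator``, reusing
# ``_dtrain()`` for training and ``_dtest()`` for leak-free transformation at inference time. The
# class name ``_ExampleXGBoostRegressor``, its default hyper-parameters, and the ``predict()``
# helper are assumptions made for demonstration; slickml's real downstream classes (for example,
# ``XGBoostRegressor``) may differ. It relies on the imports and definitions above.
@dataclass
class _ExampleXGBoostRegressor(BaseXGBoostEstimator):
    """Minimal, hypothetical regressor built on top of ``BaseXGBoostEstimator``."""

    num_boost_round: Optional[int] = 200
    sparse_matrix: Optional[bool] = False
    scale_mean: Optional[bool] = False
    scale_std: Optional[bool] = False
    importance_type: Optional[str] = "total_gain"
    params: Optional[Dict[str, Union[str, float, int]]] = None

    def fit(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Union[List[float], np.ndarray, pd.Series],
    ) -> None:
        # `_dtrain()` validates the inputs, applies the optional scaler, and builds the DMatrix
        dtrain = self._dtrain(X_train=X, y_train=y)
        self.model_ = xgb.train(
            params=self.params or {"objective": "reg:squarederror"},
            dtrain=dtrain,
            num_boost_round=self.num_boost_round,
        )
        return None

    def predict(
        self,
        X: Union[pd.DataFrame, np.ndarray],
    ) -> np.ndarray:
        # `_dtest()` reuses the scaler fitted on the training data (no leakage) before predicting
        return self.model_.predict(self._dtest(X_test=X))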
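

# ------------------------------------------------------------------------------------------------
# Configuration note (illustrative only). As validated in ``__post_init__()``, ``sparse_matrix=True``
# cannot be combined with ``scale_mean=True`` since centering would densify the feature matrix, so
# that combination raises a ``ValueError``. When ``scale_mean`` or ``scale_std`` is enabled, the
# fitted ``StandardScaler`` becomes available as ``scaler_`` once ``_dtrain()`` has been called;
# otherwise ``scaler_`` is ``None``. The tiny demo below uses the hypothetical
# ``_ExampleXGBoostRegressor`` defined above and synthetic data.
if __name__ == "__main__":  # pragma: no cover
    try:
        _ExampleXGBoostRegressor(sparse_matrix=True, scale_mean=True)
    except ValueError as error:
        print(error)

    reg = _ExampleXGBoostRegressor(scale_mean=True, scale_std=True)
    X_demo = pd.DataFrame({"f1": [0.1, 0.4, 0.7, 1.0], "f2": [1.0, 2.0, 3.0, 4.0]})
    y_demo = [0.0, 1.0, 2.0, 3.0]
    reg.fit(X=X_demo, y=y_demo)
    print(reg.scaler_.mean_)  # scaler fitted on the training features only
    print(reg.predict(X_demo))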