from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Union
import glmnet
import numpy as np
import pandas as pd
import shap
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.figure import Figure
from sklearn.base import BaseEstimator, ClassifierMixin
from slickml.utils import array_to_df, check_var, df_to_csr
from slickml.visualization import (
plot_glmnet_coeff_path,
plot_glmnet_cv_results,
plot_shap_summary,
plot_shap_waterfall,
)
# TODO(amir): add lolli-pop plot for coeff
@dataclass
class GLMNetCVClassifier(BaseEstimator, ClassifierMixin):
"""GLMNet CV Classifier.
This is a wrapper around GLM-Net [glmnet-api]_ that trains a Regularized Linear Model via logistic
regression and finds the optimal penalty values through N-fold cross-validation. In principle,
GLMNet (also known as ElasticNet) can also be used for feature selection and dimensionality
reduction through the LASSO (Least Absolute Shrinkage and Selection Operator) part of the
algorithm, while the Ridge Regression part of the algorithm keeps the solution stable.
Parameters
----------
alpha : float, optional
The stability parameter with possible values ``0 <= alpha <= 1``, where ``alpha=0.0``
and ``alpha=1.0`` lead to classic Ridge and LASSO regression models, respectively, by
default 0.5
n_lambda : int, optional
Maximum number of penalty values to compute, by default 100
n_splits : int, optional
Number of cross-validation folds for computing performance metrics and determining
``lambda_best_`` and ``lambda_max_``. If non-zero, must be at least 3, by default 3
metric : str, optional
Metric used for model selection during cross-validation. Valid options are ``"accuracy"``,
``"roc_auc"`` (alias ``"auc"``), ``"average_precision"``, ``"precision"``, and ``"recall"``.
The metric affects the selection of ``lambda_best_`` and ``lambda_max_``; thus, fitting the
same data with different metrics will result in the selection of different models, by
default "auc"
scale : bool, optional
Whether to standardize the input features to have a mean value of 0.0 and a standard
deviation of 1.0 prior to fitting. The final coefficients will be on the scale of the
original data regardless of this step; therefore, there is no need to pre-process the data
when using ``scale=True``, by default True
sparse_matrix : bool, optional
Whether to convert the input features to a sparse matrix in CSR format. This can speed up
training for relatively large, sparse datasets. Note that this parameter cannot be used
along with ``scale=True``, since standardizing the feature matrix to have a mean value of
zero would turn it into a dense matrix, by default False
fit_intercept : bool, optional
Include an intercept term in the model, by default True
cut_point : float, optional
The cut point to use for selecting ``lambda_best_``. Based on this value, ``lambda_best_``
is chosen as ``arg_max(lambda)`` such that
``cv_score(lambda) >= cv_score(lambda_max_) - cut_point * standard_error(lambda_max_)``,
by default 1.0
min_lambda_ratio : float, optional
In combination with ``n_lambda``, the ratio of the smallest and largest values of lambda
computed ``(min_lambda/max_lambda >= min_lambda_ratio)``, by default 1e-4
tolerance : float, optional
Convergence criteria tolerance, by default 1e-7
max_iter : int, optional
Maximum passes over the data, by default 100000
random_state : int, optional
Seed for the random number generator. The glmnet solver is not deterministic; this seed is
used for determining the CV folds, by default 1367
lambda_path : Union[List[float], np.ndarray, pd.Series], optional
In place of supplying ``n_lambda``, provide an array of specific values to compute. The
specified values must be in decreasing order. When None, the path of lambda values will be
determined automatically. A maximum of ``n_lambda`` values will be computed, by default None
max_features : int, optional
Optional maximum number of features with nonzero coefficients after regularization. If not
set, defaults to the number of features (``X_train.shape[1]``) during fit. Note that this
will be ignored if the user specifies ``lambda_path``, by default None
Methods
-------
fit(X_train, y_train)
Fits a ``glmnet.LogitNet`` to the input training data. The proper ``X_train`` matrix is
built from the passed ``X_train`` and ``y_train`` based on the chosen options, i.e.
``sparse_matrix`` and ``scale``
predict_proba(X_test, y_test)
Returns the prediction probabilities for the positive class. ``predict_proba()`` only
reports the probability of the positive class, while the sklearn API returns both classes
and slicing like ``pred_proba[:, 1]`` is needed for positive-class predictions.
Additionally, ``y_test`` is optional since the targets might not be available at
validation (inference) time
predict(X_test, y_test, threshold=0.5)
Returns the prediction classes based on the threshold. The default ``threshold=0.5`` might
not give the best results; optimal thresholds can be found via different algorithms,
including the Youden Index, maximizing the area under the sensitivity-specificity curve,
and maximizing the area under the precision-recall curve, using
``BinaryClassificationMetrics``
plot_coeff_path()
Visualizes the coefficients' paths
plot_cv_results()
Visualizes the cross-validation results
plot_shap_summary()
Visualizes Shapley values summary plot
plot_shap_waterfall()
Visualizes Shapley values waterfall plot
get_shap_explainer()
Returns the fitted ``shap.LinearExplainer`` object
get_params()
Returns model's parameters
get_intercept()
Returns model's intercept
get_coeffs()
Returns non-zero coefficients
get_cv_results()
Returns cross-validation results
get_results()
Returns model's total results
Attributes
----------
X_train : pd.DataFrame
Returns training data set
X_test : pd.DataFrame
Returns transformed testing data set
y_train : np.ndarray
Returns the list of training ground truth binary values [0, 1]
y_test : np.ndarray
Returns the list of testing ground truth binary values [0, 1]
coeff_ : pd.DataFrame
Returns the model's non-zero coefficients
intercept_ : float
Returns the model's intercept
cv_results_ : pd.DataFrame
Returns the cross-validation results
results_ : Dict[str, Any]
Returns the model's total results
params_ : Dict[str, Any]
Returns model's fitting parameters
shap_values_train_ : np.ndarray
Shapley values from ``LinearExplainer`` using ``X_train``
shap_values_test_ : np.ndarray
Shapley values from ``LinearExplainer`` using ``X_test``
shap_explainer_ : shap.LinearExplainer
Shap ``LinearExplainer`` with an independent masker built on ``X_train``
model_ : glmnet.LogitNet
Returns fitted ``glmnet.LogitNet`` model
References
----------
.. [glmnet-api] https://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html
.. [markers-api] https://matplotlib.org/stable/api/markers_api.html
.. [yscale] https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.yscale.html
.. [shap-api] https://shap-lrjball.readthedocs.io/en/latest/generated/shap.summary_plot.html
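Examples
--------
A minimal usage sketch; ``X_train``, ``y_train``, and ``X_test`` below are user-supplied
placeholders (a features frame and binary targets), not objects defined by the library.

>>> from slickml.classification import GLMNetCVClassifier
>>> clf = GLMNetCVClassifier(alpha=0.5, n_splits=4, metric="auc")
>>> clf.fit(X_train, y_train)
>>> y_pred_proba = clf.predict_proba(X_test)
>>> y_pred = clf.predict(X_test, threshold=0.5)
>>> coeffs = clf.get_coeffs()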
"""
alpha: Optional[float] = 0.5
n_lambda: Optional[int] = 100
n_splits: Optional[int] = 3
metric: Optional[str] = "auc"
scale: Optional[bool] = True
sparse_matrix: Optional[bool] = False
fit_intercept: Optional[bool] = True
cut_point: Optional[float] = 1.0
min_lambda_ratio: Optional[float] = 1e-4
tolerance: Optional[float] = 1e-7
max_iter: Optional[int] = 100000
random_state: Optional[int] = 1367
lambda_path: Optional[Union[List[float], np.ndarray, pd.Series]] = None
max_features: Optional[int] = None
def __post_init__(self) -> None:
"""Post instantiation validations and assignments."""
check_var(
self.alpha,
var_name="alpha",
dtypes=float,
)
check_var(
self.n_lambda,
var_name="n_lambda",
dtypes=int,
)
check_var(
self.n_splits,
var_name="n_splits",
dtypes=int,
)
# TODO(amir): metric should be able to be a `CALLABLE` as well with signature "scorer(estimator, X, y)".
check_var(
self.metric,
var_name="metric",
dtypes=str,
values=(
"auc",
"roc_auc",
"average_precision",
"precision",
"recall",
"accuracy",
),
)
self._transform_metric()
check_var(
self.scale,
var_name="scale",
dtypes=bool,
)
check_var(
self.sparse_matrix,
var_name="sparse_matrix",
dtypes=bool,
)
check_var(
self.fit_intercept,
var_name="fit_intercept",
dtypes=bool,
)
check_var(
self.cut_point,
var_name="cut_point",
dtypes=float,
)
check_var(
self.min_lambda_ratio,
var_name="min_lambda_ratio",
dtypes=float,
)
check_var(
self.tolerance,
var_name="tolerance",
dtypes=float,
)
check_var(
self.max_iter,
var_name="max_iter",
dtypes=int,
)
check_var(
self.random_state,
var_name="random_state",
dtypes=int,
)
if self.max_features is not None:
check_var(
self.max_features,
var_name="max_features",
dtypes=int,
)
if self.lambda_path is not None:
check_var(
self.lambda_path,
var_name="lambda_path",
dtypes=(
list,
np.ndarray,
pd.Series,
),
)
if not isinstance(self.lambda_path, np.ndarray):
self.lambda_path = np.array(self.lambda_path)
# Standardizing via `scale=True` would turn a sparse matrix into a dense matrix
if self.sparse_matrix and self.scale:
raise ValueError(
"scale must be False when using sparse_matrix=True.",
)
# TODO(amir): expose `groups` in args since glmnet supports it
def fit(
self,
X_train: Union[pd.DataFrame, np.ndarray],
y_train: Union[List[float], np.ndarray, pd.Series],
) -> None:
"""Fits a ``glmnet.LogitNet`` to input training data.
Notes
-----
For the cases that ``sparse_matrix=True``, a CSR format of the input will be used via
``df_to_csr()`` function.
Parameters
----------
X_train : Union[pd.DataFrame, np.ndarray]
Input data for training (features)
y_train : Union[List[float], np.ndarray, pd.Series]
Input ground truth for training (targets)
Returns
-------
None
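Examples
--------
A minimal sketch; ``X_train`` and ``y_train`` are user-supplied placeholders. Note that
``sparse_matrix=True`` requires ``scale=False``.

>>> clf = GLMNetCVClassifier(sparse_matrix=True, scale=False)
>>> clf.fit(X_train, y_train)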
"""
self._dtrain(
X_train=X_train,
y_train=y_train,
)
self.model_ = self._model()
self.coeff_ = self._coeff_to_df()
self.results_ = self._results()
self.cv_results_ = self._cv_results()
self.intercept_ = self.model_.intercept_
self.params_ = self.model_.get_params()
return None
def predict_proba(
self,
X_test: Union[pd.DataFrame, np.ndarray],
y_test: Optional[Union[List[float], np.ndarray, pd.Series]] = None,
lamb: Optional[np.ndarray] = None,
) -> np.ndarray:
"""Returns the prediction probabilities for the positive class.
Notes
-----
``predict_proba()`` only reports the probability of the positive class, while the sklearn
API returns both classes and slicing like ``pred_proba[:, 1]`` is needed for positive-class
predictions. Additionally, ``y_test`` is optional since the targets might not be available
at validation (inference) time.
Parameters
----------
X_test : Union[pd.DataFrame, np.ndarray]
Input data for testing (features)
y_test : Union[List[float], np.ndarray, pd.Series], optional
Input ground truth for testing (targets)
lamb : np.ndarray, optional
Values with shape ``(n_lambda,)`` of lambda from ``lambda_path_`` from which to make
predictions. If no values are provided (None), the returned predictions will be those
corresponding to ``lambda_best_``. The values of ``lamb`` must also be in the range of
``lambda_path_``; values greater than ``max(lambda_path_)`` or less than
``min(lambda_path_)`` will be clipped
Returns
-------
np.ndarray
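Examples
--------
A sketch assuming a fitted classifier ``clf`` and a user-supplied ``X_test``; the output
holds positive-class probabilities only, so no ``[:, 1]`` slicing is needed.

>>> y_pred_proba = clf.predict_proba(X_test)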
"""
self._dtest(
X_test=X_test,
y_test=y_test,
)
if self.sparse_matrix:
self.y_pred_proba_ = self.model_.predict_proba(
X=df_to_csr(
self.X_test,
fillna=0.0,
verbose=False,
),
lamb=lamb,
)[:, 1]
else:
self.y_pred_proba_ = self.model_.predict_proba(
X=self.X_test,
lamb=lamb,
)[:, 1]
return self.y_pred_proba_
def predict(
self,
X_test: Union[pd.DataFrame, np.ndarray],
y_test: Optional[Union[List[float], np.ndarray, pd.Series]] = None,
threshold: Optional[float] = 0.5,
lamb: Optional[np.ndarray] = None,
) -> np.ndarray:
"""Returns the prediction classes based on the threshold.
Notes
-----
The default ``threshold=0.5`` might not give the best results; optimal thresholds can be
found via different algorithms, including the Youden Index, maximizing the area under the
sensitivity-specificity curve, and maximizing the area under the precision-recall curve,
using ``BinaryClassificationMetrics``.
Parameters
----------
X_test : Union[pd.DataFrame, np.ndarray]
Input data for testing (features)
y_test : Union[List[float], np.ndarray, pd.Series], optional
Input ground truth for testing (targets)
threshold : float, optional
Inclusive threshold value to binarize ``y_pred_proba_`` into ``y_pred_``, where any value
that satisfies ``y_pred_proba_ >= threshold`` will be set to ``class=1`` (positive class).
Note that ``">="`` is used instead of ``">"``, by default 0.5
lamb : np.ndarray, optional
Values with shape ``(n_lambda,)`` of lambda from ``lambda_path_`` from which to make
predictions. If no values are provided (None), the returned predictions will be those
corresponding to ``lambda_best_``. The values of ``lamb`` must also be in the range of
``lambda_path_``; values greater than ``max(lambda_path_)`` or less than
``min(lambda_path_)`` will be clipped
Returns
-------
np.ndarray
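Examples
--------
A sketch assuming a fitted classifier ``clf``; the threshold here is illustrative and can
be tuned via ``BinaryClassificationMetrics``.

>>> y_pred = clf.predict(X_test, threshold=0.4)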
"""
y_pred_proba = self.predict_proba(
X_test=X_test,
y_test=y_test,
lamb=lamb,
)
self.y_pred_ = (y_pred_proba >= threshold).astype(int)
return self.y_pred_
def plot_cv_results(
self,
figsize: Optional[Tuple[Union[int, float], Union[int, float]]] = (8, 5),
marker: Optional[str] = "o",
markersize: Optional[Union[int, float]] = 5,
color: Optional[str] = "red",
errorbarcolor: Optional[str] = "black",
maxlambdacolor: Optional[str] = "purple",
bestlambdacolor: Optional[str] = "navy",
linestyle: Optional[str] = "--",
fontsize: Optional[Union[int, float]] = 12,
grid: Optional[bool] = True,
legend: Optional[bool] = True,
legendloc: Optional[Union[int, str]] = "best",
xlabel: Optional[str] = None,
ylabel: Optional[str] = None,
title: Optional[str] = None,
save_path: Optional[str] = None,
display_plot: Optional[bool] = True,
return_fig: Optional[bool] = False,
) -> Optional[Figure]:
"""Visualizes the GLMNet cross-validation results.
Notes
-----
The underlying plotting function can also be used directly with the ``results_`` attribute
of either ``GLMNetCVClassifier`` or ``GLMNetCVRegressor`` passed as ``kwargs``.
Parameters
----------
figsize : tuple, optional
Figure size, by default (8, 5)
marker : str, optional
Marker style of the metric to distinguish the error bars. More valid marker styles can be
found at [markers-api]_, by default "o"
markersize : Union[int, float], optional
Markersize, by default 5
color : str, optional
Line and marker color, by default "red"
errorbarcolor : str, optional
Error bar color, by default "black"
maxlambdacolor : str, optional
Color of vertical line for ``lambda_max_``, by default "purple"
bestlambdacolor : str, optional
Color of vertical line for ``lambda_best_``, by default "navy"
linestyle : str, optional
Linestyle of vertical lambda lines, by default "--"
fontsize : Union[int, float], optional
Fontsize of the title. The fontsizes of xlabel, ylabel, tick_params, and legend are resized
with 0.85, 0.85, 0.75, and 0.85 fraction of title fontsize, respectively, by default 12
grid : bool, optional
Whether to show (x,y) grid on the plot or not, by default True
legend : bool, optional
Whether to show legend on the plot or not, by default True
legendloc : Union[int, str], optional
Location of legend, by default "best"
xlabel : str, optional
Xlabel of the plot, by default "-Log(Lambda)"
ylabel : str, optional
Ylabel of the plot, by default "{n_splits}-Folds CV Mean {metric}"
title : str, optional
Title of the plot, by default "Best {lambda_best} with {n} Features"
save_path : str, optional
The full or relative path to save the plot including the image format such as
"myplot.png" or "../../myplot.pdf", by default None
display_plot : bool, optional
Whether to show the plot, by default True
return_fig : bool, optional
Whether to return figure object, by default False
See Also
--------
:class:`slickml.classification.GLMNetCVClassifier`
:class:`slickml.regression.GLMNetCVRegressor`
Returns
-------
Figure, optional
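Examples
--------
A sketch assuming a fitted classifier ``clf``; ``return_fig=True`` hands back the
``matplotlib`` figure for further customization.

>>> fig = clf.plot_cv_results(marker="s", color="navy", return_fig=True)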
"""
return plot_glmnet_cv_results(
figsize=figsize,
marker=marker,
markersize=markersize,
color=color,
errorbarcolor=errorbarcolor,
maxlambdacolor=maxlambdacolor,
bestlambdacolor=bestlambdacolor,
linestyle=linestyle,
fontsize=fontsize,
grid=grid,
legend=legend,
legendloc=legendloc,
xlabel=xlabel,
ylabel=ylabel,
title=title,
save_path=save_path,
display_plot=display_plot,
return_fig=return_fig,
**self.results_,
)
def plot_coeff_path(
self,
figsize: Optional[Tuple[Union[int, float], Union[int, float]]] = (8, 5),
linestyle: Optional[str] = "-",
fontsize: Optional[Union[int, float]] = 12,
grid: Optional[bool] = True,
legend: Optional[bool] = True,
legendloc: Optional[Union[int, str]] = "center",
xlabel: Optional[str] = None,
ylabel: Optional[str] = "Coefficients",
title: Optional[str] = None,
bbox_to_anchor: Tuple[float, float] = (1.1, 0.5),
yscale: Optional[str] = "linear",
save_path: Optional[str] = None,
display_plot: Optional[bool] = True,
return_fig: Optional[bool] = False,
) -> Optional[Figure]:
"""Visualizes the GLMNet coefficients' paths.
Parameters
----------
figsize : tuple, optional
Figure size, by default (8, 5)
linestyle : str, optional
Linestyle of paths, by default "-"
fontsize : Union[int, float], optional
Fontsize of the title. The fontsizes of xlabel, ylabel, tick_params, and legend are resized
with 0.85, 0.85, 0.75, and 0.85 fraction of title fontsize, respectively, by default 12
grid : bool, optional
Whether to show (x,y) grid on the plot or not, by default True
legend : bool, optional
Whether to show legend on the plot or not, by default True
legendloc : Union[int, str], optional
Location of legend, by default "center"
xlabel : str, optional
Xlabel of the plot, by default "-Log(Lambda)"
ylabel : str, optional
Ylabel of the plot, by default "Coefficients"
title : str, optional
Title of the plot, by default "Best {lambda_best} with {n} Features"
bbox_to_anchor : Tuple[float, float], optional
Relative coordinates for legend location outside of the plot, by default (1.1, 0.5)
yscale : str, optional
Scale for y-axis (coefficients). Possible options are ``"linear"``, ``"log"``, ``"symlog"``,
and ``"logit"`` [yscale]_, by default "linear"
save_path : str, optional
The full or relative path to save the plot including the image format such as
"myplot.png" or "../../myplot.pdf", by default None
display_plot : bool, optional
Whether to show the plot, by default True
return_fig : bool, optional
Whether to return figure object, by default False
Returns
-------
Figure, optional
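Examples
--------
A sketch assuming a fitted classifier ``clf``; ``yscale="symlog"`` can help when the
coefficients span several orders of magnitude in both signs.

>>> clf.plot_coeff_path(yscale="symlog")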
"""
return plot_glmnet_coeff_path(
figsize=figsize,
linestyle=linestyle,
fontsize=fontsize,
grid=grid,
legend=legend,
legendloc=legendloc,
xlabel=xlabel,
ylabel=ylabel,
title=title,
bbox_to_anchor=bbox_to_anchor,
yscale=yscale,
save_path=save_path,
display_plot=display_plot,
return_fig=return_fig,
**self.results_,
)
def plot_shap_summary(
self,
validation: Optional[bool] = True,
plot_type: Optional[str] = "dot",
figsize: Optional[Union[str, Tuple[float, float]]] = "auto",
color: Optional[str] = None,
cmap: Optional[LinearSegmentedColormap] = None,
max_display: Optional[int] = 20,
feature_names: Optional[List[str]] = None,
layered_violin_max_num_bins: Optional[int] = 10,
title: Optional[str] = None,
sort: Optional[bool] = True,
color_bar: Optional[bool] = True,
class_names: Optional[List[str]] = None,
class_inds: Optional[List[int]] = None,
color_bar_label: Optional[str] = "Feature Value",
save_path: Optional[str] = None,
display_plot: Optional[bool] = True,
) -> None:
"""Visualizes shap beeswarm plot as summary of shapley values.
Notes
-----
This is a helper function to plot the ``shap`` summary plot based on all types of
``shap.Explainer``, including ``shap.LinearExplainer`` for linear models, ``shap.TreeExplainer``
for tree-based models, and ``shap.DeepExplainer`` for deep neural network models. More
details are available at [shap-api]_. Note that this function should be run after
``predict_proba()`` to make sure ``X_test`` is instantiated, or set ``validation=False``.
Parameters
----------
validation : bool, optional
Whether to calculate SHAP values using the validation data ``X_test``. When
``validation=False``, SHAP values are calculated using ``X_train``, by default True
plot_type : str, optional
The type of summary plot where possible options are "bar", "dot", "violin", "layered_violin",
and "compact_dot". Recommendations are "dot" for single-output such as binary classifications,
"bar" for multi-output problems, "compact_dot" for Shap interactions, by default "dot"
figsize : tuple, optional
Figure size where "auto" is auto-scaled figure size based on the number of features that are
being displayed. Passing a single float will cause each row to be that many inches high.
Passing a pair of floats will scale the plot by that number of inches. If None is passed
then the size of the current figure will be left unchanged, by default "auto"
color : str, optional
Color of the plots. For ``plot_type="violin"`` and ``plot_type="layered_violin"`` the
default is the "RdBl" color-map, while for ``plot_type="bar"`` the horizontal lines
default to "#D0AAF3", by default None
cmap : LinearSegmentedColormap, optional
Color map for ``plot_type="violin"`` and ``plot_type="layered_violin"``, by default "RdBl"
max_display : int, optional
Limit to show the number of features in the plot, by default 20
feature_names : List[str], optional
List of feature names to pass. It should follow the order of features, by default None
layered_violin_max_num_bins : int, optional
The number of bins for calculating the violin plots ranges and outliers, by default 10
title : str, optional
Title of the plot, by default None
sort : bool, optional
Flag to plot sorted SHAP values in descending order, by default True
color_bar : bool, optional
Flag to show a color bar when ``plot_type="dot"`` or ``plot_type="violin"``, by default True
class_names : List[str], optional
List of class names for multi-output problems, by default None
class_inds : List[int], optional
List of class indices for multi-output problems, by default None
color_bar_label : str, optional
Label for color bar, by default "Feature Value"
save_path : str, optional
The full or relative path to save the plot including the image format such as
"myplot.png" or "../../myplot.pdf", by default None
display_plot : bool, optional
Whether to show the plot, by default True
Returns
-------
None
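Examples
--------
A sketch assuming a fitted classifier ``clf``; ``predict_proba()`` is called first so that
``X_test`` is set, otherwise pass ``validation=False`` to use ``X_train``.

>>> _ = clf.predict_proba(X_test)
>>> clf.plot_shap_summary(plot_type="bar", max_display=10)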
"""
self._explainer()
if validation:
shap_values = self.shap_values_test_
features = self.X_test
else:
shap_values = self.shap_values_train_
features = self.X_train
return plot_shap_summary(
shap_values=shap_values,
features=features,
plot_type=plot_type,
figsize=figsize,
color=color,
cmap=cmap,
max_display=max_display,
feature_names=feature_names,
layered_violin_max_num_bins=layered_violin_max_num_bins,
title=title,
sort=sort,
color_bar=color_bar,
class_names=class_names,
class_inds=class_inds,
color_bar_label=color_bar_label,
save_path=save_path,
display_plot=display_plot,
)
def plot_shap_waterfall(
self,
validation: Optional[bool] = True,
figsize: Optional[Tuple[float, float]] = (8, 5),
bar_color: Optional[str] = "#B3C3F3",
bar_thickness: Optional[Union[float, int]] = 0.5,
line_color: Optional[str] = "purple",
marker: Optional[str] = "o",
markersize: Optional[Union[int, float]] = 7,
markeredgecolor: Optional[str] = "purple",
markerfacecolor: Optional[str] = "purple",
markeredgewidth: Optional[Union[int, float]] = 1,
max_display: Optional[int] = 20,
title: Optional[str] = None,
fontsize: Optional[Union[int, float]] = 12,
save_path: Optional[str] = None,
display_plot: Optional[bool] = True,
return_fig: Optional[bool] = False,
) -> Optional[Figure]:
"""Visualizes the Shapley values as a waterfall plot.
Notes
-----
The waterfall is defined as the cumulative/composite ratios of the SHAP values per feature.
Therefore, it is easy to see how much explainability is gained with each additional
feature. Note that this function should be run after ``predict_proba()`` to make sure
``X_test`` is instantiated, or set ``validation=False``.
Parameters
----------
validation : bool, optional
Whether to calculate SHAP values using the validation data ``X_test``. When
``validation=False``, SHAP values are calculated using ``X_train``, by default True
figsize : Tuple[float, float], optional
Figure size, by default (8, 5)
bar_color : str, optional
Color of the horizontal bar lines, by default "#B3C3F3"
bar_thickness : Union[float, int], optional
Thickness (height) of the horizontal bar lines, by default 0.5
line_color : str, optional
Color of the line plot, by default "purple"
marker : str, optional
Marker style of the lollipops. More valid marker styles can be found at [markers-api]_, by default "o"
markersize : Union[int, float], optional
Markersize, by default 7
markeredgecolor : str, optional
Marker edge color, by default "purple"
markerfacecolor: str, optional
Marker face color, by default "purple"
markeredgewidth : Union[int, float], optional
Marker edge width, by default 1
max_display : int, optional
Limit to show the number of features in the plot, by default 20
title : str, optional
Title of the plot, by default None
fontsize : Union[int, float], optional
Fontsize for xlabel and ylabel, and ticks parameters, by default 12
save_path : str, optional
The full or relative path to save the plot including the image format such as
"myplot.png" or "../../myplot.pdf", by default None
display_plot : bool, optional
Whether to show the plot, by default True
return_fig : bool, optional
Whether to return figure object, by default False
Returns
-------
Figure, optional
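Examples
--------
A sketch assuming a fitted classifier ``clf``; as with ``plot_shap_summary()``, call
``predict_proba()`` first or pass ``validation=False``.

>>> _ = clf.predict_proba(X_test)
>>> fig = clf.plot_shap_waterfall(max_display=10, return_fig=True)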
"""
self._explainer()
if validation:
shap_values = self.shap_values_test_
features = self.X_test
else:
shap_values = self.shap_values_train_
features = self.X_train
return plot_shap_waterfall(
shap_values=shap_values,
features=features,
figsize=figsize,
bar_color=bar_color,
bar_thickness=bar_thickness,
line_color=line_color,
marker=marker,
markersize=markersize,
markeredgecolor=markeredgecolor,
markerfacecolor=markerfacecolor,
markeredgewidth=markeredgewidth,
max_display=max_display,
title=title,
fontsize=fontsize,
save_path=save_path,
display_plot=display_plot,
return_fig=return_fig,
)
def get_intercept(self) -> float:
"""Returns the model's intercept.
Returns
-------
float
"""
return self.intercept_
def get_coeffs(
self,
output: Optional[str] = "dataframe",
) -> Union[Dict[str, float], pd.DataFrame]:
"""Returns model's coefficients in different format.
Parameters
----------
output : str, optional
Output format with possible values of "dataframe" and "dict", by default "dataframe"
Returns
-------
Union[Dict[str, float], pd.DataFrame]
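Examples
--------
A sketch assuming a fitted classifier ``clf``; the frame carries "feature" and "coeff"
columns, while ``output="dict"`` maps feature names to coefficients.

>>> coeff_df = clf.get_coeffs()
>>> coeff_dict = clf.get_coeffs(output="dict")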
"""
check_var(
output,
var_name="output",
dtypes=str,
values=("dataframe", "dict"),
)
if output == "dataframe":
return self._coeff_to_df()
else:
return self._coeff_to_dict()
def get_params(self) -> Dict[str, Any]:
"""Returns model's parameters.
Returns
-------
Dict[str, Any]
"""
return self.params_
def get_shap_explainer(self) -> shap.LinearExplainer:
"""Returns ``shap.LinearExplainer`` object.
Returns
-------
shap.LinearExplainer
"""
self._explainer()
return self.shap_explainer_
def get_cv_results(self) -> pd.DataFrame:
"""Returns model's cross-validation results.
See Also
--------
:meth:`get_results()`
Returns
-------
pd.DataFrame
"""
return self.cv_results_
def get_results(self) -> Dict[str, Any]:
"""Returns model's total results.
See Also
--------
:meth:`get_cv_results()`
Returns
-------
Dict[str, Any]
"""
return self.results_
def _dtrain(
self,
X_train: Union[pd.DataFrame, np.ndarray],
y_train: Union[List[float], np.ndarray, pd.Series],
) -> None:
"""Returns the features matrix and targets array.
Parameters
----------
X_train : Union[pd.DataFrame, np.ndarray]
Input data for training (features)
y_train : Union[List[float], np.ndarray, pd.Series]
Input ground truth for training (targets)
See Also
--------
:meth:`_dtest()`
Returns
-------
None
"""
check_var(
X_train,
var_name="X_train",
dtypes=(
pd.DataFrame,
np.ndarray,
),
)
check_var(
y_train,
var_name="y_train",
dtypes=(
list,
np.ndarray,
pd.Series,
),
)
if isinstance(X_train, np.ndarray):
self.X_train = array_to_df(
X=X_train,
prefix="F",
delimiter="_",
)
else:
self.X_train = X_train
if not isinstance(y_train, np.ndarray):
self.y_train = np.array(y_train)
else:
self.y_train = y_train
return None
def _dtest(
self,
X_test: Union[pd.DataFrame, np.ndarray],
y_test: Optional[Union[List[float], np.ndarray, pd.Series]] = None,
) -> None:
"""Returns the features matrix and targets array.
Note that ``y_test`` is optional since it might not be available while validating the
model (inference).
Parameters
----------
X_test : Union[pd.DataFrame, np.ndarray]
Input data for testing (features)
y_test : Union[List[float], np.ndarray, pd.Series], optional
Input ground truth for testing (targets), by default None
See Also
--------
:meth:`_dtrain()`
Returns
-------
None
"""
check_var(
X_test,
var_name="X_test",
dtypes=(
pd.DataFrame,
np.ndarray,
),
)
if y_test is not None:
check_var(
y_test,
var_name="y_test",
dtypes=(
list,
np.ndarray,
pd.Series,
),
)
if not isinstance(y_test, np.ndarray):
self.y_test = np.array(y_test)
else:
self.y_test = y_test
else:
self.y_test = y_test
if isinstance(X_test, np.ndarray):
self.X_test = array_to_df(
X=X_test,
prefix="F",
delimiter="_",
)
else:
self.X_test = X_test
return None
def _model(self) -> glmnet.LogitNet:
"""Fits a ``glmnet.LogtNet`` model.
Returns
-------
glmnet.LogitNet
"""
model = glmnet.LogitNet(
alpha=self.alpha,
n_lambda=self.n_lambda,
min_lambda_ratio=self.min_lambda_ratio,
lambda_path=self.lambda_path,
standardize=self.scale,
fit_intercept=self.fit_intercept,
cut_point=self.cut_point,
n_splits=self.n_splits,
scoring=self.metric,
n_jobs=-1,
tol=self.tolerance,
max_iter=self.max_iter,
random_state=self.random_state,
max_features=self.max_features,
verbose=False,
)
if self.sparse_matrix:
# TODO(amir): expose groups: array, shape (n_samples,)
# Group labels for the samples used while splitting the dataset into train/test set.
# If the groups are specified, the groups will be passed to
# ``sklearn.model_selection.GroupKFold``. If None, then data will be split randomly
# for K-fold cross-validation via ``sklearn.model_selection.KFold``.
model.fit(
X=df_to_csr(
self.X_train,
fillna=0.0,
verbose=False,
),
y=self.y_train,
sample_weight=None,
relative_penalties=None,
groups=None,
)
else:
model.fit(
X=self.X_train,
y=self.y_train,
sample_weight=None,
relative_penalties=None,
groups=None,
)
return model
def _explainer(self) -> None:
"""Fits a ``shap.LinearExplainer`` using an independent masker.
Returns
-------
None
"""
# TODO(amir): currently, this does not make sense
# https://shap.readthedocs.io/en/latest/generated/shap.explainers.Linear.html
# https://stackoverflow.com/questions/66560839/what-do-maskers-really-do-in-shap-package-and-fit-them-to-train-or-test
self.shap_explainer_ = shap.LinearExplainer(
model=self.model_,
masker=shap.maskers.Independent(
data=self.X_train,
),
)
self.shap_values_train_ = self.shap_explainer_.shap_values(
X=self.X_train,
)
self.shap_values_test_ = self.shap_explainer_.shap_values(
X=self.X_test,
)
return None
def _coeff_to_df(self) -> pd.DataFrame:
"""Returns the non-zero coeff for the ``lambda_best_``.
See Also
--------
:meth:`_coeff_to_dict()`
Returns
-------
pd.DataFrame
"""
return (
pd.DataFrame(
data=self._coeff_to_dict().items(),
columns=[
"feature",
"coeff",
],
)
.sort_values(
by="coeff",
ascending=False,
)
.reset_index(
drop=True,
)
)
def _coeff_to_dict(self) -> Dict[str, float]:
"""Returns the non-zero coeff for the ``lambda_best_``.
See Also
--------
:meth:`_coeff_to_df()`
Returns
-------
Dict[str, float]
"""
idx = list(
np.nonzero(
np.reshape(
self.model_.coef_,
(1, -1),
),
)[1],
)
# TODO(amir): why I have this here ? [self.model_.coef_[0][i] for i in idx],
return dict(
zip(
[self.X_train.columns.tolist()[i] for i in idx],
[self.model_.coef_.reshape(-1, self.model_.coef_.shape[-1])[0][i] for i in idx],
),
)
def _results(self) -> Dict[str, Any]:
"""Returns fitted ``glmnet.LogitNet`` results as a nested dictionary.
Returns
-------
Dict[str, Any]
"""
results = {}
results["coeff"] = self._coeff_to_dict()
results["coeff_path"] = dict(
zip(
[f"{col}" for col in self.X_train.columns.tolist()],
(self.model_.coef_path_.reshape(-1, self.model_.coef_path_.shape[-1])).tolist(),
),
)
results["cv_standard_error"] = self.model_.cv_standard_error_.tolist()
results["cv_mean_score"] = self.model_.cv_mean_score_.tolist()
results["lambda_path"] = self.model_.lambda_path_.tolist()
results["lambda_best"] = self.model_.lambda_best_[0]
results["lambda_max"] = self.model_.lambda_max_
results["n_lambda"] = self.model_.n_lambda_
results["intercept"] = self.model_.intercept_
results["intercept_path"] = self.model_.intercept_path_.tolist()[0]
results["params"] = self.model_.get_params()
results["module"] = self.model_.__module__
return results
def _cv_results(self) -> pd.DataFrame:
"""Returns fitted ``glmnet.LogitNet`` results.
Results include coefficient paths, intercept paths, lambda paths, and the mean and
standard error of the metric through cross-validation.
Returns
-------
pd.DataFrame
"""
df = pd.DataFrame(
(
self.model_.coef_path_.reshape(
-1,
self.model_.coef_path_.shape[-1],
)
).T,
columns=[f"{col}_coeff_path" for col in self.X_train.columns.tolist()],
)
df["intercept_path"] = (
self.model_.intercept_path_.reshape(
-1,
self.model_.intercept_path_.shape[-1],
)
).T
df["lambda_path"] = self.model_.lambda_path_
df["cv_standard_error"] = self.model_.cv_standard_error_
df["cv_mean_score"] = self.model_.cv_standard_error_
return df
# TODO(amir): we probably need to do more complex patterns for precision recall too
def _transform_metric(self) -> None:
"""Transforms the given metric by user and post instantiation step.
Returns
-------
None
"""
if self.metric == "auc":
self.metric = "roc_auc"
return None