"""
Wrappers around native `scikit-learn` estimators.
`sklearndf` wrappers accept and return data frames (while `scikit-learn` transformers
usually return numpy arrays, and may not accept data frames as input).
Otherwise, the wrappers are designed to precisely mirror the API and behavior of the
native estimators they wrap.
The wrappers also implement the additional column attributes introduced by `sklearndf`,
:meth:`~EstimatorDF.feature_names_in_`, :meth:`~TransformerDF.feature_names_out_`, and
:meth:`~TransformerDF.feature_names_original_`.
"""
from __future__ import annotations
import inspect
import logging
import warnings
from abc import ABCMeta
from collections.abc import Iterable, Mapping, Sequence
from functools import update_wrapper
from typing import Any, Callable, Generic, Optional, TypeVar, Union, cast
import numpy as np
import numpy.typing as npt
import pandas as pd
import sklearn.utils.metaestimators as sklearn_meta
from scipy import sparse
from sklearn.base import (
BaseEstimator,
ClassifierMixin,
ClusterMixin,
MetaEstimatorMixin,
RegressorMixin,
TransformerMixin,
)
from pytools.api import AllTracker, inheritdoc, public_module_prefix
from sklearndf import (
ClassifierDF,
ClusterDF,
EstimatorDF,
LearnerDF,
RegressorDF,
SupervisedLearnerDF,
TransformerDF,
__sklearn_1_6__,
__sklearn_version__,
)
if __sklearn_version__ >= __sklearn_1_6__:
from sklearn.utils import Tags
# module-level logger, named after this module
log = logging.getLogger(__name__)

# names exported by this module; the AllTracker created below is used to check
# this list against the symbols defined after it
__all__ = [
    "ClassifierWrapperDF",
    "ClusterWrapperDF",
    "EstimatorWrapperDF",
    "EstimatorWrapperDFMeta",
    "LearnerWrapperDF",
    "MetaEstimatorWrapperDF",
    "RegressorWrapperDF",
    "SupervisedLearnerWrapperDF",
    "TransformerWrapperDF",
]
#
# type variables
#
# unconstrained type variable
T = TypeVar("T")
# any callable; used to keep the type of forwarded methods
T_Callable = TypeVar("T_Callable", bound=Callable[..., Any])
# the target ``y``: ``None``, a series, or a data frame
T_Target = TypeVar("T_Target", bound=Optional[Union[pd.Series, pd.DataFrame]])
# native scikit-learn types that the wrappers delegate to
T_NativeEstimator = TypeVar("T_NativeEstimator", bound=BaseEstimator)
T_NativeTransformer = TypeVar("T_NativeTransformer", bound=TransformerMixin)
T_NativeLearner = TypeVar(
    "T_NativeLearner", bound=Union[RegressorMixin, ClassifierMixin, ClusterMixin]
)
T_NativeSupervisedLearner = TypeVar(
    "T_NativeSupervisedLearner", bound=Union[RegressorMixin, ClassifierMixin]
)
T_NativeRegressor = TypeVar("T_NativeRegressor", bound=RegressorMixin)
T_NativeClassifier = TypeVar("T_NativeClassifier", bound=ClassifierMixin)
T_NativeCluster = TypeVar("T_NativeCluster", bound=ClusterMixin)
# "self type" for methods of EstimatorWrapperDF that return the wrapper itself
T_EstimatorWrapperDF = TypeVar(
    "T_EstimatorWrapperDF", bound="EstimatorWrapperDF[BaseEstimator]"
)
#
# Ensure all symbols introduced below are included in __all__
#
# the tracker is handed the module globals as they are at this point, so it must
# be created after the type variables and before the public definitions
__tracker = AllTracker(globals())
#
# base wrapper classes
#
def _make_init(cls: type) -> Callable[..., None]:
def __init__(self: type, *args: Any, **kwargs: Any) -> None:
""""""
cast(EstimatorWrapperDF, super(cls, self)).__init__( # type: ignore
*args, **kwargs
)
return __init__
@inheritdoc(match="[see superclass]")
class EstimatorWrapperDF(
    EstimatorDF, Generic[T_NativeEstimator], metaclass=EstimatorWrapperDFMeta
):
    """
    Base class of DF wrappers for native estimators conforming with the `scikit-learn`
    API.

    A wrapper holds a native delegate estimator, records the names of the input
    features (and of the outputs, if any) seen during fitting, and forwards access
    to public attributes it does not define itself to the delegate.
    """

    __native_base_class__ = BaseEstimator

    # name of the private keyword argument used by from_fitted() to hand an already
    # fitted delegate estimator to the constructor
    __ARG_FITTED_DELEGATE_CONTEXT = "__EstimatorWrapperDF_fitted"

    #: The native estimator that this wrapper delegates to.
    _native_estimator: T_NativeEstimator

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """
        :param args: positional arguments to use when initializing a new delegate
            estimator
        :param kwargs: keyword arguments to use when initializing a new delegate
            estimator
        """
        super().__init__()

        self._features_in: Optional[pd.Index] = None
        self._outputs: Optional[list[str]] = None

        # check if a fitted estimator was passed by class method from_fitted
        fitted_delegate_context = cast(
            Optional[tuple[T_NativeEstimator, pd.Index, int]],
            kwargs.get(EstimatorWrapperDF.__ARG_FITTED_DELEGATE_CONTEXT, None),
        )

        _native_estimator: T_NativeEstimator

        if fitted_delegate_context is None:
            # create a new delegate estimator with the given parameters
            # noinspection PyProtectedMember
            _native_estimator = type(self).__wrapped__(*args, **kwargs)
            self._reset_fit()
        else:
            # NOTE(review): the number of outputs is stored in ``_n_outputs``, which
            # no code visible in this class reads; ``_outputs`` stays None on this
            # path - confirm this is intended
            (
                _native_estimator,
                self._features_in,
                self._n_outputs,
            ) = fitted_delegate_context

        self._native_estimator = _native_estimator

        self._validate_delegate_estimator()

    def __new__(
        cls: type[T_EstimatorWrapperDF], *args: Any, **kwargs: Any
    ) -> T_EstimatorWrapperDF:
        # Refuse to instantiate wrapper classes that have no ``__wrapped__``
        # attribute, i.e., no native estimator class associated with them.
        try:
            cls.__wrapped__
        except AttributeError:
            # the AttributeError itself carries no useful information for callers
            raise TypeError(
                f"cannot instantiate wrapper class {cls.__name__}: "
                "need to specify class argument 'native' in class definition"
            ) from None
        else:
            return cast(type[EstimatorDF], super()).__new__(cls)

    @property
    def is_fitted(self) -> bool:
        """[see superclass]"""
        # the ingoing features are recorded by _post_fit() and cleared by
        # _reset_fit()
        return self._features_in is not None

    @property
    def native_estimator(self) -> T_NativeEstimator:
        """
        The native estimator that this wrapper delegates to.
        """
        return self._native_estimator

    @property
    def feature_names_in_(self) -> pd.Index:
        """[see superclass]"""
        return self._check_feature_names_in(
            super().feature_names_in_, warning_stacklevel=2
        )

    @property
    def n_features_in_(self) -> int:
        """[see superclass]"""
        return self._check_n_features_in(super().n_features_in_, warning_stacklevel=2)

    def _check_feature_names_in(
        self, wrapper_feature_names_in: pd.Index, *, warning_stacklevel: int
    ) -> pd.Index:
        # Check that the given feature names are the same as the ingoing feature names
        # recorded by the native estimator, if present. Issue a warning if the feature
        # names differ.
        # Return the same feature names that were passed to this method.

        # noinspection PyBroadException
        try:
            feature_names_in_native = self.native_estimator.feature_names_in_
        except Exception:
            # best effort: any failure to read the native feature names means
            # there is nothing to compare against
            return wrapper_feature_names_in

        if not np.array_equal(wrapper_feature_names_in.values, feature_names_in_native):
            warnings.warn(
                "conflicting input feature names: "
                "the input feature names recorded by this estimator are "
                f"{wrapper_feature_names_in}, but the input feature names recorded by "
                f"the wrapped native estimator are {feature_names_in_native}",
                stacklevel=warning_stacklevel + 1,
            )
        return wrapper_feature_names_in

    def _check_n_features_in(
        self, wrapper_n_features: int, *, warning_stacklevel: int
    ) -> int:
        # Check that the given number of features is the same as the number of features
        # recorded by the native estimator, if present. Issue a warning if the number of
        # features differ.
        # Return the same number of features that were passed to this method.

        # noinspection PyBroadException
        try:
            n_features_native = self.native_estimator.n_features_in_
        except Exception:
            # best effort: any failure to read the native feature count means
            # there is nothing to compare against
            return wrapper_n_features

        if wrapper_n_features != n_features_native:
            warnings.warn(
                "conflicting number of features: "
                "the number of features recorded by this estimator is "
                f"{wrapper_n_features}, but the number of features recorded by "
                f"the wrapped native estimator is {n_features_native}",
                stacklevel=warning_stacklevel + 1,
            )
        return wrapper_n_features

    @property
    def _estimator_type(self) -> Optional[str]:
        # mirror the estimator type of the delegate; None if it does not define one
        try:
            # noinspection PyProtectedMember
            return cast(str, self.native_estimator._estimator_type)
        except AttributeError:
            return None

    if __sklearn_version__ >= __sklearn_1_6__:

        def __sklearn_tags__(self) -> Tags:
            # report the tags of the delegate estimator
            return self.native_estimator.__sklearn_tags__()

    @classmethod
    def from_fitted(
        cls: type[T_EstimatorWrapperDF],
        estimator: T_NativeEstimator,
        features_in: pd.Index,
        n_outputs: int,
    ) -> T_EstimatorWrapperDF:
        """
        Make a new wrapped DF estimator, delegating to a given native estimator that
        has already been fitted.

        :param estimator: the fitted native estimator to use as the delegate
        :param features_in: the column names of X used for fitting the estimator
        :param n_outputs: the number of outputs in y used for fitting the estimator
        :return: the wrapped data frame estimator
        """
        # the fitted context is passed to the constructor through a private
        # keyword argument, see __init__()
        return cls(
            **{
                EstimatorWrapperDF.__ARG_FITTED_DELEGATE_CONTEXT: (
                    estimator,
                    features_in,
                    n_outputs,
                )
            }
        )

    def get_params(self, deep: bool = True) -> Mapping[str, Any]:
        """[see superclass]"""
        return cast(Mapping[str, Any], self._native_estimator.get_params(deep=deep))

    def set_params(self: T_EstimatorWrapperDF, **params: Any) -> T_EstimatorWrapperDF:
        """[see superclass]"""
        self._native_estimator.set_params(**params)
        return self

    # noinspection PyPep8Naming
    def fit(
        self: T_EstimatorWrapperDF,
        X: Union[pd.DataFrame, pd.Series],
        y: Optional[Union[pd.Series, pd.DataFrame]] = None,
        **fit_params: Any,
    ) -> T_EstimatorWrapperDF:
        """[see superclass]"""
        # discard the state of any previous fit before validating the arguments
        self._reset_fit()

        try:
            X, y = self._validate_parameter_types(X, y)
            self._fit(X, y, **fit_params)
            self._post_fit(X, y, **fit_params)

        except Exception as cause:
            # leave the wrapper in the "not fitted" state if fitting failed, and
            # prefix the error message with the wrapper class and method name
            self._reset_fit()
            raise self._make_verbose_exception(self.fit.__name__, cause) from cause

        return self

    def _validate_delegate_estimator(self) -> None:
        # Called as the last step of the estimator wrapper's constructor.
        # No validation required by default; to be overloaded as needed.
        pass

    def _get_features_in(self) -> pd.Index:
        assert self._features_in is not None, "estimator is fitted"
        return self._features_in

    def _get_outputs(self) -> Optional[list[str]]:
        return self._outputs

    def _reset_fit(self) -> None:
        # forget the features and outputs recorded by a previous fit
        self._features_in = None
        self._outputs = None

    # noinspection PyPep8Naming
    def _fit(
        self,
        X: pd.DataFrame,
        y: Optional[Union[pd.Series, pd.DataFrame]],
        **fit_params: Any,
    ) -> T_NativeEstimator:
        # fit the delegate estimator on the (possibly converted) X and y
        # noinspection PyUnresolvedReferences
        return cast(
            T_NativeEstimator,
            self._native_estimator.fit(
                self._prepare_X_for_delegate(X),
                self._prepare_y_for_delegate(y),
                **fit_params,
            ),
        )

    # noinspection PyPep8Naming,PyUnusedLocal
    def _post_fit(
        self,
        X: pd.DataFrame,
        y: Optional[Union[pd.Series, pd.DataFrame]] = None,
        **fit_params: Any,
    ) -> None:
        # record the ingoing feature names and the names of the outputs
        self._features_in = X.columns.rename(self.COL_FEATURE)
        if y is None:
            self._outputs = None
        elif isinstance(y, pd.Series):
            self._outputs = [y.name]
        else:
            self._outputs = y.columns.tolist()

    # noinspection PyPep8Naming
    def _validate_parameter_types(
        self,
        X: Union[pd.Series, pd.DataFrame],
        y: T_Target,
        *,
        expected_columns: Optional[pd.Index] = None,
    ) -> tuple[pd.DataFrame, T_Target]:
        # Check that the X and y parameters are valid data frames and series,
        # and return X as a data frame and y as a series or data frame.
        #
        # If X is a series, convert it to a data frame with a single column.
        #
        # If expected_columns is not None, check that the columns of X match
        # the expected columns.

        if isinstance(X, pd.Series):
            if X.name is None:
                raise ValueError(
                    "the name of the series passed as arg X must not be None"
                )
            X = X.to_frame()
        elif not isinstance(X, pd.DataFrame):
            raise TypeError("arg X must be a DataFrame or a Series")

        # columns are only verified once the estimator is fitted
        if self.is_fitted:
            EstimatorWrapperDF._verify_df(
                df_name="arg X",
                df=X,
                expected_columns=(
                    self.feature_names_in_
                    if expected_columns is None
                    else expected_columns
                ),
            )

        if y is not None and not isinstance(y, (pd.Series, pd.DataFrame)):
            raise TypeError("arg y must be None, or a pandas series or data frame")

        return X, y

    @staticmethod
    def _verify_df(
        df_name: str,
        df: pd.DataFrame,
        expected_columns: pd.Index,
        expected_index: Optional[pd.Index] = None,
    ) -> None:
        # Raise a ValueError if the column labels of the data frame (and, if
        # expected_index is given, its row labels) do not match the expected labels.

        def _verify_labels(axis: str, actual: pd.Index, expected: pd.Index) -> None:
            missing_columns = expected.difference(actual)
            extra_columns = actual.difference(expected)
            error_detail: list[str] = []

            # check that we have the expected number of columns
            if len(actual) != len(expected):
                error_detail.append(
                    f"expected {len(expected)} elements but got {len(actual)}"
                )

            # check that all the expected columns are in place
            if len(missing_columns) > 0:
                error_detail.append(
                    f"missing elements: "
                    f"{', '.join(str(item) for item in missing_columns)}"
                )

            # check that there are no unexpected columns
            if len(extra_columns) > 0:
                error_detail.append(
                    f"extra elements: "
                    f"{', '.join(str(item) for item in extra_columns)}"
                )

            # raise an exception if we have encountered any errors
            if error_detail:
                raise ValueError(
                    f"{df_name} data frame does not have expected {axis} index "
                    f"({'; '.join(error_detail)})"
                )

        _verify_labels(axis="column", actual=df.columns, expected=expected_columns)
        if expected_index is not None:
            _verify_labels(axis="row", actual=df.index, expected=expected_index)

    def _validate_delegate_attribute(self, attribute_name: str) -> None:
        if not hasattr(self.native_estimator, attribute_name):
            raise AttributeError(
                f"delegate estimator of type {type(self.native_estimator).__name__} "
                f"does not have attribute {attribute_name}"
            )

    # noinspection PyPep8Naming
    def _prepare_X_for_delegate(
        self, X: pd.DataFrame
    ) -> Union[pd.DataFrame, npt.NDArray[Any], sparse.csr_matrix]:
        # convert X before passing it to the delegate estimator
        return self._adjust_X_type_for_delegate(self._adjust_X_columns_for_delegate(X))

    def _prepare_y_for_delegate(
        self, y: Optional[Union[pd.Series, pd.DataFrame]]
    ) -> Optional[Union[pd.Series, pd.DataFrame, npt.NDArray[Any], sparse.csr_matrix]]:
        # convert y before passing it to the delegate estimator
        return self._adjust_y_type_for_delegate(y)

    # noinspection PyPep8Naming
    def _adjust_X_columns_for_delegate(self, X: pd.DataFrame) -> pd.DataFrame:
        # make sure columns of X are aligned with frame used to fit this estimator

        if not self.is_fitted:
            # return X unchanged if estimator is not fitted yet
            return X

        features_in = self._get_features_in()
        if X.columns.is_(features_in):
            # same index object: the columns are already in the expected order
            return X
        else:
            return X.reindex(columns=features_in, copy=False)

    # noinspection PyPep8Naming
    def _adjust_X_type_for_delegate(
        self, X: pd.DataFrame
    ) -> Union[pd.DataFrame, npt.NDArray[Any], sparse.csr_matrix]:
        # Convert X before passing it to the delegate estimator.
        # By default, does nothing, but can be overridden.
        return X

    def _adjust_y_type_for_delegate(
        self, y: Optional[Union[pd.Series, pd.DataFrame]]
    ) -> Optional[Union[pd.Series, pd.DataFrame, npt.NDArray[Any], sparse.csr_matrix]]:
        # Convert y before passing it to the delegate estimator.
        # By default, does nothing.
        return y

    def _make_verbose_exception(self, method: str, cause: Exception) -> Exception:
        # Create an exception of the same type as the cause, with the wrapper class
        # and method name prepended to the message; fall back to a RuntimeError if
        # the exception type cannot be instantiated from a single message.
        verbose_message = f"{type(self).__name__}.{method}: {cause}"
        # noinspection PyBroadException
        try:
            return type(cause)(verbose_message)
        except Exception:
            return RuntimeError(verbose_message)

    def __dir__(self) -> Iterable[str]:
        # include non-private attributes of delegate estimator in directory
        return {
            *super().__dir__(),
            *(
                attr
                for attr in self._native_estimator.__dir__()
                if not attr.startswith("_")
            ),
        }

    def __getattr__(self, name: str) -> Any:
        # This method is only called if the attribute name is not found in the
        # instance's dictionary, and __getattribute__() has raised an AttributeError.

        # For private attributes, give up and raise attribute error.
        if name.startswith("_"):
            # The following is expected to raise an AttributeError; should the
            # lookup succeed after all, return its result rather than None
            return self.__getattribute__(name)
        else:
            # For public attributes, try to get the attribute from the delegate
            # estimator. If the attribute is not found, raise an attribute error.
            try:
                return getattr(self._native_estimator, name)
            except AttributeError:
                # The following is expected to raise an AttributeError; should
                # the lookup succeed after all, return its result rather than None
                return self.__getattribute__(name)

    def __setattr__(self, name: str, value: Any) -> None:
        # This method is called whenever an attribute assignment is attempted.

        # For private attributes, set the attribute in this wrapper object.
        if name.startswith("_"):
            super().__setattr__(name, value)

        else:
            # For public attributes, set the attribute in this wrapper object only
            # if it is already defined. Otherwise, set the attribute in the delegate
            # estimator.
            try:
                self.__getattribute__(name)
            except AttributeError:
                # The attribute is not defined in this wrapper object, so set it in
                # the delegate estimator.
                setattr(self._native_estimator, name, value)
            else:
                # The attribute is defined in this wrapper object, so set it here.
                super().__setattr__(name, value)
@inheritdoc(match="[see superclass]")
class LearnerWrapperDF(
    LearnerDF,
    EstimatorWrapperDF[T_NativeLearner],
    Generic[T_NativeLearner],
):
    """
    Base class of DF wrappers for native `learners` conforming with the `scikit-learn`
    API.

    Learners in `scikit-learn` typically are regressors, classifiers, or clusterers.
    """

    #: Name of :class:`~pandas.Series` objects containing the predictions of
    #: single-output learners.
    #:
    #: See :meth:`~.LearnerDF.predict`.
    COL_PREDICTION = "prediction"

    # noinspection PyPep8Naming
    def predict(
        self, X: Union[pd.Series, pd.DataFrame], **predict_params: Any
    ) -> Union[pd.Series, pd.DataFrame]:
        """[see superclass]"""
        X, _ = self._validate_parameter_types(X, None)

        # noinspection PyUnresolvedReferences
        return self._prediction_to_series_or_frame(
            X,
            self.native_estimator.predict(
                self._prepare_X_for_delegate(X), **predict_params
            ),
        )

    # noinspection PyPep8Naming
    def _prediction_to_series_or_frame(
        self, X: pd.DataFrame, y: Union[npt.NDArray[Any], pd.Series, pd.DataFrame]
    ) -> Union[pd.Series, pd.DataFrame]:
        # Convert the predictions y made for X to a series (1-dimensional
        # predictions) or a data frame (2-dimensional predictions), named after the
        # outputs recorded during fitting.
        #
        # Raises a ValueError if the length or the dimensionality of y is not as
        # expected, and a TypeError if y is neither a numpy array, a series, nor a
        # data frame.

        if len(y) != len(X):
            raise ValueError(
                f"length of prediction ({len(y)}) does not match length of X ({len(X)})"
            )

        outputs: Optional[list[str]] = self._get_outputs()

        if y.ndim == 1:
            if outputs is None:
                # in case predict() was called without first calling fit(),
                # no outputs are known, so we use the default output name
                outputs = [self.COL_PREDICTION]
            elif len(outputs) != 1:
                raise ValueError(
                    f"expected {len(outputs)} predictions, but got 1 prediction"
                )
        elif y.ndim == 2:
            if outputs is None:
                # in case predict() was called without first calling fit(),
                # no outputs are known, so we use default output names
                outputs = [
                    f"{self.COL_PREDICTION}_{output}" for output in range(y.shape[1])
                ]
            elif y.shape[1] != len(outputs):
                raise ValueError(
                    f"expected {len(outputs)} predictions, "
                    f"but got {y.shape[1]} prediction{'' if y.shape[1] == 1 else 's'}"
                )
        else:
            raise ValueError(
                f"got {y.ndim}-dimensional prediction, "
                f"but expected 1- or 2-dimensional prediction"
            )

        if isinstance(y, pd.Series):
            return y.rename(outputs[0])
        elif isinstance(y, pd.DataFrame):
            return y.set_axis(outputs, axis=1)
        elif isinstance(y, np.ndarray):
            # the length of y and its dimensionality (1 or 2) have already been
            # validated above, so only these two cases remain
            if y.ndim == 1:
                # single-output predictions yield a numpy array of shape (n_samples)
                return pd.Series(data=y, name=outputs[0], index=X.index)
            # multi-output predictions yield a numpy array of shape (n_samples,
            # n_outputs)
            return pd.DataFrame(data=y, columns=outputs, index=X.index)

        raise TypeError(
            f"unexpected data type returned as prediction: {type(y).__name__}"
        )
@inheritdoc(match="[see superclass]")
class SupervisedLearnerWrapperDF(
    SupervisedLearnerDF,
    LearnerWrapperDF[T_NativeSupervisedLearner],
    Generic[T_NativeSupervisedLearner],
    metaclass=ABCMeta,
):
    """
    Base class of DF wrappers for native `supervised learners` conforming with the
    `scikit-learn` API.

    Supervised learners in `scikit-learn` typically are regressors or classifiers.
    """

    # noinspection PyPep8Naming
    def score(
        self,
        X: Union[pd.Series, pd.DataFrame],
        y: pd.Series,
        sample_weight: Optional[pd.Series] = None,
    ) -> float:
        """[see superclass]"""
        X, y = self._validate_parameter_types(X, y)
        if y is None:
            raise ValueError("arg y must not be None")
        if sample_weight is not None and not isinstance(sample_weight, pd.Series):
            raise TypeError("arg sample_weight must be None or a Series")

        # the sample weights are passed on to the delegate unchanged, as the third
        # positional argument
        return cast(
            float,
            self.native_estimator.score(
                self._prepare_X_for_delegate(X),
                self._prepare_y_for_delegate(y),
                sample_weight,
            ),
        )
@inheritdoc(match="[see superclass]")
class RegressorWrapperDF(
    RegressorDF,
    SupervisedLearnerWrapperDF[T_NativeRegressor],
    Generic[T_NativeRegressor],
    metaclass=ABCMeta,
):
    """
    Base class of DF wrappers for native regressors conforming with the `scikit-learn`
    API.
    """

    __native_base_class__ = RegressorMixin

    # noinspection PyPep8Naming
    def score(
        self,
        X: Union[pd.Series, pd.DataFrame],
        y: pd.Series,
        sample_weight: Optional[pd.Series] = None,
    ) -> float:
        """[see superclass]"""
        # explicitly use the wrapper implementation of score(), regardless of the
        # position of RegressorDF in the method resolution order
        return cast(
            float,
            SupervisedLearnerWrapperDF.score(self, X, y, sample_weight=sample_weight),
        )
@inheritdoc(match="[see superclass]")
class ClassifierWrapperDF(
    ClassifierDF,
    SupervisedLearnerWrapperDF[T_NativeClassifier],
    Generic[T_NativeClassifier],
    metaclass=ABCMeta,
):
    """
    Base class of DF wrappers for native classifiers conforming with the `scikit-learn`
    API.
    """

    __native_base_class__ = ClassifierMixin

    def _get_classes(self) -> Union[npt.NDArray[Any], list[npt.NDArray[Any]]]:
        # the class labels as recorded by the delegate estimator
        return cast(
            Union[npt.NDArray[Any], list[npt.NDArray[Any]]],
            self._native_estimator.classes_,
        )

    # noinspection PyPep8Naming
    def predict_proba(
        self, X: Union[pd.Series, pd.DataFrame], **predict_params: Any
    ) -> Union[pd.DataFrame, list[pd.DataFrame]]:
        """[see superclass]"""

        self._ensure_delegate_method("predict_proba")

        X, _ = self._validate_parameter_types(X, None)

        # noinspection PyUnresolvedReferences
        return self._prediction_with_class_labels(
            X,
            self.native_estimator.predict_proba(
                self._prepare_X_for_delegate(X), **predict_params
            ),
        )

    # noinspection PyPep8Naming
    def predict_log_proba(
        self, X: Union[pd.Series, pd.DataFrame], **predict_params: Any
    ) -> Union[pd.DataFrame, list[pd.DataFrame]]:
        """[see superclass]"""

        self._ensure_delegate_method("predict_log_proba")

        X, _ = self._validate_parameter_types(X, None)

        # noinspection PyUnresolvedReferences
        return self._prediction_with_class_labels(
            X,
            self.native_estimator.predict_log_proba(
                self._prepare_X_for_delegate(X), **predict_params
            ),
        )

    # noinspection PyPep8Naming
    def decision_function(
        self, X: Union[pd.Series, pd.DataFrame], **predict_params: Any
    ) -> Union[pd.Series, pd.DataFrame]:
        """[see superclass]"""

        self._ensure_delegate_method("decision_function")

        X, _ = self._validate_parameter_types(X, None)

        # noinspection PyUnresolvedReferences
        return self._prediction_with_class_labels(
            X,
            self.native_estimator.decision_function(
                self._prepare_X_for_delegate(X), **predict_params
            ),
        )

    def _ensure_delegate_method(self, method: str) -> None:
        # raise a NotImplementedError if the delegate lacks the given method
        if not hasattr(self.native_estimator, method):
            raise NotImplementedError(
                f"{type(self.native_estimator).__name__} does not implement method "
                f"{method}"
            )

    # noinspection PyPep8Naming
    def _prediction_with_class_labels(
        self,
        X: pd.DataFrame,
        prediction: Union[
            pd.Series, pd.DataFrame, list[npt.NDArray[Any]], npt.NDArray[Any]
        ],
        classes: Optional[Sequence[Any]] = None,
    ) -> Union[pd.Series, pd.DataFrame, list[pd.DataFrame]]:
        # Label a prediction with class names: a data frame or 2-dimensional array
        # gets one column per class, a 1-dimensional array becomes a series named
        # after the second class.
        #
        # If no classes are given, use the classes_ attribute of the delegate
        # estimator; if that is missing, too, use a range index over the outputs.

        if classes is None:
            classes = getattr(self.native_estimator, "classes_", None)
            if classes is None:
                classes = pd.RangeIndex(self._get_n_outputs())

        if isinstance(prediction, pd.DataFrame):
            return prediction.set_axis(classes, axis=1)
        elif isinstance(prediction, np.ndarray):
            if len(prediction) == len(X):
                # predictions of probabilities are usually provided as a NumPy array
                # the same length as X
                if prediction.ndim == 1:
                    # for a binary classifier, we get a series with probabilities
                    # for the second class
                    return pd.Series(data=prediction, index=X.index, name=classes[1])
                elif prediction.ndim == 2:
                    # for a multi-class classifier, we get a two-dimensional NumPy
                    # array with probabilities for each class
                    return pd.DataFrame(data=prediction, index=X.index, columns=classes)
            # wrong length, or neither 1- nor 2-dimensional
            raise TypeError(
                f"ndarray with unexpected shape returned as prediction: "
                f"{prediction.shape}"
            )
        else:
            raise TypeError(
                f"unexpected type or prediction result: {type(prediction).__name__}"
            )

    # noinspection PyPep8Naming
    def score(
        self,
        X: Union[pd.Series, pd.DataFrame],
        y: pd.Series,
        sample_weight: Optional[pd.Series] = None,
    ) -> float:
        """[see superclass]"""
        # explicitly use the wrapper implementation of score(), regardless of the
        # position of ClassifierDF in the method resolution order
        return cast(
            float,
            SupervisedLearnerWrapperDF.score(self, X, y, sample_weight=sample_weight),
        )
# noinspection PyPep8Naming
@inheritdoc(match="[see superclass]")
class ClusterWrapperDF(
    ClusterDF,
    LearnerWrapperDF[T_NativeCluster],
    Generic[T_NativeCluster],
    metaclass=ABCMeta,
):
    """
    Base class of DF wrappers for native clusterers conforming with the scikit-learn
    API.
    """

    __native_base_class__ = ClusterMixin

    #: Name of the series of cluster labels.
    COL_LABELS = "labels"

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """[see superclass]"""
        super().__init__(*args, **kwargs)
        # row index of the data frame used for fitting; set by _post_fit()
        self._x_index: Optional[pd.Index] = None

    def _get_labels(self) -> pd.Series:
        # the labels recorded by the delegate, indexed like the X used for fitting
        return pd.Series(
            data=self._native_estimator.labels_,
            name=self.COL_LABELS,
            index=self._x_index,
        )

    def fit_predict(
        self,
        X: Union[pd.Series, pd.DataFrame],
        y: Optional[Union[pd.Series, pd.DataFrame]] = None,
        **fit_predict_params: Any,
    ) -> Union[pd.Series, pd.DataFrame]:
        """[see superclass]"""

        self._reset_fit()

        try:
            X, y = self._validate_parameter_types(X, y)

            # fitting a clusterer produces a single output column for labels
            self._outputs = [ClusterWrapperDF.COL_LABELS]

            # Ignore a PyCharm warning that is caused by scikit-learn incorrectly
            # omitting optional arguments from the abstract method declaration
            # of ClusterMixin.fit_predict():
            # noinspection PyArgumentList
            result = self._prediction_to_series_or_frame(
                X,
                self.native_estimator.fit_predict(
                    self._prepare_X_for_delegate(X),
                    self._prepare_y_for_delegate(y),
                    **fit_predict_params,
                ),
            )

            self._post_fit(X, y, **fit_predict_params)

        except Exception as cause:
            # leave the wrapper in the "not fitted" state if fitting failed, and
            # prefix the error message with the wrapper class and method name
            self._reset_fit()
            raise self._make_verbose_exception(
                self.fit_predict.__name__, cause
            ) from cause

        return result

    def _post_fit(
        self,
        X: pd.DataFrame,
        y: Optional[Union[pd.Series, pd.DataFrame]] = None,
        **fit_params: Any,
    ) -> None:
        super()._post_fit(X, y, **fit_params)
        # remember the row index of X, used to index the labels in _get_labels()
        self._x_index = X.index

    def _reset_fit(self) -> None:
        super()._reset_fit()
        self._x_index = None
#
# Meta estimator wrappers
#
#
# private factory implementation
#
def _mirror_attributes(
wrapper_class: type[EstimatorWrapperDF[T_NativeEstimator]],
native_estimator: type[T_NativeEstimator],
wrapper_module: str,
) -> None:
wrapper_name = wrapper_class.__name__
wrapper_attributes: set[str] = set(dir(wrapper_class))
for name, member in vars(native_estimator).items():
if member is None or name in wrapper_attributes:
continue
alias = _make_alias(
wrapper_module=wrapper_module,
wrapper_name=wrapper_name,
name=name,
delegate_cls=native_estimator,
delegate=member,
)
if alias is not None:
setattr(wrapper_class, name, alias)
def _make_alias(
wrapper_module: str, wrapper_name: str, name: str, delegate_cls: type, delegate: Any
) -> Optional[Union[Callable[..., Any], property]]:
if inspect.isfunction(delegate):
return _make_method_alias(
wrapper_module=wrapper_module,
wrapper_name=wrapper_name,
name=name,
delegate_cls=delegate_cls,
delegate_method=delegate,
)
elif inspect.isdatadescriptor(delegate):
return _make_descriptor_alias(
delegate_cls=delegate_cls, delegate_descriptor=delegate
)
else:
return None
def _make_method_alias(
    wrapper_module: str,
    wrapper_name: str,
    name: str,
    delegate_cls: type,
    delegate_method: T_Callable,
) -> T_Callable:
    # Create a method that forwards calls to a native delegate estimator, carrying
    # the name and annotations of the delegate method and a docstring that refers
    # to the delegate method.

    forwarder = _make_forwarder(delegate_method)
    _update_wrapper(
        wrapper=forwarder,
        wrapped=delegate_method,
        wrapper_module=wrapper_module,
        wrapper_parent=wrapper_name,
    )
    delegate_cls_name = _full_class_name(cls=delegate_cls)
    forwarder.__doc__ = f"See :meth:`{delegate_cls_name}.{name}`"
    return forwarder
def _make_descriptor_alias(delegate_cls: type, delegate_descriptor: Any) -> property:
    # Create a property that forwards getting, setting, and deleting an attribute
    # to the given descriptor, applied to the native delegate estimator of the
    # wrapper instance.

    documentation = (
        f"See documentation of :class:`{_full_class_name(cls=delegate_cls)}`."
    )

    # __set__ and __delete__ are looked up only when the property is actually set
    # or deleted, so descriptors lacking either still get a working getter
    return property(
        lambda self: delegate_descriptor.__get__(self._native_estimator),
        lambda self, value: cast(Callable[..., None], delegate_descriptor.__set__)(
            self._native_estimator, value
        ),
        lambda self: cast(Callable[..., None], delegate_descriptor.__delete__)(
            self._native_estimator
        ),
        documentation,
    )
def _make_forwarder(delegate_method: T_Callable) -> T_Callable:
    # Wrap a function defined in a native estimator class in a function that is
    # called on a wrapper instance, and invokes the original function on the
    # wrapper's native delegate estimator instead.

    # noinspection PyShadowingNames
    def _forwarder(
        self: EstimatorWrapperDF[BaseEstimator], *args: Any, **kwargs: Any
    ) -> Any:
        native = self._native_estimator
        return delegate_method(native, *args, **kwargs)

    return cast(T_Callable, _forwarder)
def _update_wrapper(
wrapper: Any,
wrapped: Any,
wrapper_module: str,
wrapper_parent: str,
) -> None:
update_wrapper(
wrapper, wrapped, assigned=("__name__", "__annotations__"), updated=()
)
wrapper.__module__ = wrapper_module
if wrapper_parent:
wrapper.__qualname__ = f"{wrapper_parent}.{wrapper.__name__}"
else:
wrapper.__qualname__ = wrapper.__name__
def _update_class_docstring(
df_estimator_type: type[EstimatorWrapperDF[T_NativeEstimator]],
sklearn_native_estimator_type: type[T_NativeEstimator],
) -> None:
base_doc = sklearn_native_estimator_type.__doc__
if not base_doc:
return
base_doc_lines = base_doc.split("\n")
# use the first paragraph as the tag line
tag_lines: list[str] = []
for line in base_doc_lines:
# end of paragraph reached?
stripped = line.strip()
if stripped:
# no: append line to tag lines
tag_lines.append(stripped)
elif tag_lines:
# empty line, and we already have tag lines: stop here
break
estimator_name = _full_class_name(cls=sklearn_native_estimator_type)
df_estimator_type.__doc__ = "\n".join(
[
*tag_lines,
"",
(
f"""
.. note:: This class is a wrapper around class :class:`{estimator_name}`.
It provides enhanced support for :mod:`pandas` data frames, and otherwise
delegates all attribute access and method calls to an associated
:class:`~{estimator_name}` instance.
"""
),
]
)
def _full_class_name(cls: type) -> str:
# get the full name of the class, including the module prefix
try:
module_name = cls.__module__
except AttributeError as e:
raise RuntimeError(f"cannot get module for {cls}") from e
if module_name != "__main__":
module_name = public_module_prefix(module_name)
return f"{module_name}.{cls.__qualname__}"
#
# validate __all__
#
# NOTE(review): presumably checks the symbols defined since the tracker was
# created against __all__ - confirm against the pytools AllTracker documentation
__tracker.validate()