# Source code for facet.simulation._result

"""
Core implementation of :mod:`facet.simulation`
"""


import logging
from typing import Generic, Sequence, TypeVar

import numpy as np
import pandas as pd
from scipy import stats

from pytools.api import AllTracker

from facet.data.partition import Partitioner

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Names exported as the public API of this module.
__all__ = [
    "UnivariateSimulationResult",
]


#
# Type variables
#

# Type of the values handled by a partitioner (used as ``Partitioner[T_Values]``);
# bounded by ``np.generic``, the base class of numpy scalar types.
T_Values = TypeVar("T_Values", bound=np.generic)


#
# Ensure all symbols introduced below are included in __all__
#

# NOTE(review): the tracker receives the module globals at this point, i.e. after
# the type variables and before the class definitions; presumably the position of
# this statement determines which symbols are checked against ``__all__`` --
# confirm against the pytools ``AllTracker`` documentation before moving it.
__tracker = AllTracker(globals())


class UnivariateSimulationResult(Generic[T_Values]):
    """
    Summary result of a univariate simulation.

    Holds, for every partition of the simulated feature, the mean prediction, the
    standard error of that mean, and the bounds of a symmetric confidence interval
    derived from both; see attribute :attr:`.data`.
    """

    #: The simulation result as a data frame, indexed by the central values of the
    #: partitions for which the simulation was run, with the following columns:
    #:
    #: - :attr:`.COL_MEAN`: the mean predictions for the simulated values
    #: - :attr:`.COL_SEM`: the standard errors of the mean predictions
    #: - :attr:`.COL_LOWER_BOUND`: the lower bounds of the confidence intervals for the
    #:   simulation outcomes, based on mean, standard error of the mean, and
    #:   :attr:`confidence_level`
    #: - :attr:`.COL_UPPER_BOUND`: the upper bounds of the confidence intervals for the
    #:   simulation outcomes, based on mean, standard error of the mean, and
    #:   :attr:`confidence_level`
    data: pd.DataFrame

    #: The partitioner used to generate feature values to be simulated.
    partitioner: Partitioner[T_Values]

    #: Name of the simulated feature.
    feature_name: str

    #: Name of the target for which outputs are simulated.
    output_name: str

    #: The unit of the simulated outputs (e.g., uplift or class probability).
    output_unit: str

    #: The average observed actual output, acting as the baseline of the simulation.
    baseline: float

    #: The width :math:`\alpha` of the confidence interval
    #: determined by the standard error of the mean, with :math:`0 < \alpha < 1`.
    confidence_level: float

    #: The name of the row index of attribute :attr:`.data`, denoting partitions
    #: represented by their central values or by a category.
    IDX_PARTITION = "partition"

    #: The name of a series of mean simulated values per partition.
    COL_MEAN = "mean"

    #: The name of a series of standard errors of mean simulated values per partition.
    COL_SEM = "sem"

    #: The name of a series of lower CI bounds of simulated values per partition.
    COL_LOWER_BOUND = "lower_bound"

    #: The name of a series of upper CI bounds of simulated values per partition.
    COL_UPPER_BOUND = "upper_bound"

    def __init__(
        self,
        *,
        partitioner: Partitioner[T_Values],
        mean: Sequence[float],
        sem: Sequence[float],
        feature_name: str,
        output_name: str,
        output_unit: str,
        baseline: float,
        confidence_level: float,
    ) -> None:
        """
        :param partitioner: the partitioner used to generate feature values to be
            simulated
        :param mean: mean predictions for the values representing each partition
        :param sem: standard errors of the mean predictions for the values representing
            each partition
        :param feature_name: name of the simulated feature
        :param output_name: name of the target for which outputs are simulated
        :param output_unit: the unit of the simulated outputs
            (e.g., uplift or class probability)
        :param baseline: the average observed actual output, acting as the baseline
            of the simulation
        :param confidence_level: the width of the confidence interval determined by
            the standard error of the mean, ranging between 0.0 and 1.0 (exclusive)
        :raises ValueError: if the partitioner is not fitted, if the length of
            ``mean`` or ``sem`` differs from the number of partitions, or if
            ``confidence_level`` is not strictly between 0.0 and 1.0
        """
        super().__init__()

        if not partitioner.is_fitted:
            raise ValueError("arg partitioner must be fitted")

        n_partitions = len(partitioner.partitions_)

        # both sequences must provide exactly one value per partition
        for seq, seq_name in [(mean, "mean"), (sem, "sem")]:
            if len(seq) != n_partitions:
                raise ValueError(
                    f"length of arg {seq_name} must correspond to "
                    f"the number of partitions (n={n_partitions})"
                )

        # written as a chained comparison so that NaN is rejected as well
        if not (0.0 < confidence_level < 1.0):
            raise ValueError(
                f"arg confidence_level={confidence_level} is not "
                "in the range between 0.0 and 1.0 (exclusive)"
            )

        self.partitioner = partitioner
        self.feature_name = feature_name
        self.output_name = output_name
        self.output_unit = output_unit
        self.baseline = baseline
        self.confidence_level = confidence_level

        # convert mean and sem to numpy arrays
        mean_arr = np.array(mean)
        sem_arr = np.array(sem)

        # get the width of the confidence interval (this is a negative number):
        # the quantile of the standard normal distribution is taken at the lower
        # tail probability (1 - confidence_level) / 2, which is below 0.5
        ci_width = stats.norm.ppf((1.0 - self.confidence_level) / 2.0) * sem_arr

        # since ci_width is negative, adding it yields the lower bound and
        # subtracting it yields the upper bound
        self.data = pd.DataFrame(
            data={
                UnivariateSimulationResult.COL_MEAN: mean_arr,
                UnivariateSimulationResult.COL_SEM: sem_arr,
                UnivariateSimulationResult.COL_LOWER_BOUND: mean_arr + ci_width,
                UnivariateSimulationResult.COL_UPPER_BOUND: mean_arr - ci_width,
            },
            index=pd.Index(
                partitioner.partitions_, name=UnivariateSimulationResult.IDX_PARTITION
            ),
        )