# Copyright (c) PyWhy contributors. All rights reserved.
# Licensed under the MIT License.
"""
Doubly Robust Learner. The method uses the doubly robust correction to construct doubly
robust estimates of all the potential outcomes of each sample. It then estimates a CATE model
by regressing the potential outcome differences on the heterogeneity features X.
References
----------
Dylan Foster, Vasilis Syrgkanis (2019).
Orthogonal Statistical Learning.
ACM Conference on Learning Theory. https://arxiv.org/abs/1901.09036
Robins, J.M., Rotnitzky, A., and Zhao, L.P. (1994).
Estimation of regression coefficients when some regressors are not always observed.
Journal of the American Statistical Association 89, 846–866.
Bang, H. and Robins, J.M. (2005).
Doubly robust estimation in missing data and causal inference models.
Biometrics 61, 962–972.
Tsiatis, A.A. (2006).
Semiparametric Theory and Missing Data.
New York: Springer.
.. testcode::
:hide:
import numpy as np
import scipy.special
np.set_printoptions(suppress=True)
"""
from warnings import warn
from copy import deepcopy
import numpy as np
from sklearn.base import clone
from sklearn.linear_model import (LassoCV, LinearRegression,
LogisticRegressionCV)
from sklearn.ensemble import RandomForestRegressor
from .._ortho_learner import _OrthoLearner
from .._cate_estimator import (DebiasedLassoCateEstimatorDiscreteMixin, BaseCateEstimator,
ForestModelFinalCateEstimatorDiscreteMixin,
StatsModelsCateEstimatorDiscreteMixin, LinearCateEstimator)
from ..inference import GenericModelFinalInferenceDiscrete
from ..grf import RegressionForest
from ..sklearn_extensions.linear_model import (
DebiasedLasso, StatsModelsLinearRegression, WeightedLassoCVWrapper)
from ..sklearn_extensions.model_selection import ModelSelector, SingleModelSelector, get_selector
from ..utilities import (_deprecate_positional, check_high_dimensional,
filter_none_kwargs, inverse_onehot, get_feature_names_or_default)
from .._shap import _shap_explain_multitask_model_cate, _shap_explain_model_cate
class _ModelNuisance(ModelSelector):
def __init__(self,
model_propensity: SingleModelSelector,
model_regression: SingleModelSelector,
min_propensity,
discrete_outcome):
self._model_propensity = model_propensity
self._model_regression = model_regression
self._min_propensity = min_propensity
self._discrete_outcome = discrete_outcome
def _combine(self, X, W):
return np.hstack([arr for arr in [X, W] if arr is not None])
def train(self, is_selecting, folds, Y, T, X=None, W=None, *, sample_weight=None, groups=None):
if Y.ndim != 1 and (Y.ndim != 2 or Y.shape[1] != 1):
raise ValueError("The outcome matrix must be of shape ({0}, ) or ({0}, 1), "
"instead got {1}.".format(len(X), Y.shape))
if (X is None) and (W is None):
raise AttributeError("At least one of X or W has to not be None!")
if np.any(np.all(T == 0, axis=0)) or (not np.any(np.all(T == 0, axis=1))):
raise AttributeError("Provided crossfit folds contain training splits that " +
"don't contain all treatments")
XW = self._combine(X, W)
filtered_kwargs = filter_none_kwargs(sample_weight=sample_weight)
self._model_propensity.train(is_selecting, folds, XW, inverse_onehot(T), groups=groups, **filtered_kwargs)
self._model_regression.train(is_selecting, folds, np.hstack([XW, T]), Y, groups=groups, **filtered_kwargs)
return self
def score(self, Y, T, X=None, W=None, *, sample_weight=None, groups=None):
XW = self._combine(X, W)
filtered_kwargs = filter_none_kwargs(sample_weight=sample_weight)
propensity_score = self._model_propensity.score(XW, inverse_onehot(T), **filtered_kwargs)
regression_score = self._model_regression.score(np.hstack([XW, T]), Y, **filtered_kwargs)
return propensity_score, regression_score
def predict(self, Y, T, X=None, W=None, *, sample_weight=None, groups=None):
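# Build the doubly robust pseudo-outcomes: column 0 holds the estimate for the baseline
# treatment and column t + 1 the estimate for treatment t. Each column is the regression
# prediction E[Y | X, W, T=t] plus the inverse-propensity-weighted residual of the samples
# that actually received treatment t. Propensities are clipped at min_propensity to avoid
# division by (near) zero.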
XW = self._combine(X, W)
propensities = np.maximum(self._model_propensity.predict_proba(XW), self._min_propensity)
n = T.shape[0]
Y_pred = np.zeros((T.shape[0], T.shape[1] + 1))
T_counter = np.zeros(T.shape)
if hasattr(self._model_regression, 'predict_proba'):
if self._discrete_outcome:
Y_pred[:, 0] = self._model_regression.predict_proba(np.hstack([XW, T_counter]))[:, 1].reshape(n)
else:
raise AttributeError("Cannot use a classifier for model_regression when discrete_outcome=False!")
else:
if self._discrete_outcome:
warn("A regressor was passed to model_regression when discrete_outcome=True. "
"Using a classifier is recommended.", UserWarning)
Y_pred[:, 0] = self._model_regression.predict(np.hstack([XW, T_counter])).reshape(n)
Y_pred[:, 0] += (Y.reshape(n) - Y_pred[:, 0]) * np.all(T == 0, axis=1) / propensities[:, 0]
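# Apply the same doubly robust correction for each non-baseline treatment arm.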
for t in np.arange(T.shape[1]):
T_counter = np.zeros(T.shape)
T_counter[:, t] = 1
if self._discrete_outcome and hasattr(self._model_regression, 'predict_proba'):
Y_pred[:, t + 1] = self._model_regression.predict_proba(np.hstack([XW, T_counter]))[:, 1].reshape(n)
else:
Y_pred[:, t + 1] = self._model_regression.predict(np.hstack([XW, T_counter])).reshape(n)
Y_pred[:, t + 1] += (Y.reshape(n) - Y_pred[:, t + 1]) * (T[:, t] == 1) / propensities[:, t + 1]
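# Propensity of the treatment that each sample actually received; returned alongside the
# pseudo-outcomes so the final stage can rescale sample variances when sample_var is supplied.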
T_complete = np.hstack(((np.all(T == 0, axis=1) * 1).reshape(-1, 1), T))
propensities_weight = np.sum(propensities * T_complete, axis=1)
return Y_pred.reshape(Y.shape + (T.shape[1] + 1,)), propensities_weight.reshape((n,))
def _make_first_stage_selector(model, is_discrete, random_state):
if model == "auto":
model = ['linear', 'forest']
return get_selector(model, is_discrete=is_discrete, random_state=random_state)
class _ModelFinal:
# Coding Remark: The reasoning around the multitask_model_final could have been simplified if
# we had simply wrapped the model_final with a MultiOutputRegressor. However, because we also want
# to allow model_final objects whose fit(X, y) can accept X=None
# (e.g. the StatsModelsLinearRegression), we cannot take that route, because MultiOutputRegressor
# checks that X is a 2D array.
def __init__(self, model_final, featurizer, multitask_model_final):
self._model_final = clone(model_final, safe=False)
self._featurizer = clone(featurizer, safe=False)
self._multitask_model_final = multitask_model_final
return
def fit(self, Y, T, X=None, W=None, *, nuisances,
sample_weight=None, freq_weight=None, sample_var=None, groups=None):
Y_pred, propensities = nuisances
self.d_y = Y_pred.shape[1:-1] # track whether there's a Y dimension (must be a singleton)
self.d_t = Y_pred.shape[-1] - 1 # track # of treatment (exclude baseline treatment)
if (X is not None) and (self._featurizer is not None):
X = self._featurizer.fit_transform(X)
if self._multitask_model_final:
ys = Y_pred[..., 1:] - Y_pred[..., [0]] # subtract control results from each other arm
if self.d_y: # need to squeeze out singleton so that we fit on 2D array
ys = ys.squeeze(1)
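# The DR pseudo-outcome divides the observed residual by the propensity of the realized
# treatment, so any supplied sample variance is inflated by 1 / propensity**2.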
weighted_sample_var = np.tile((sample_var / propensities**2).reshape((-1, 1)),
self.d_t) if sample_var is not None else None
filtered_kwargs = filter_none_kwargs(sample_weight=sample_weight,
freq_weight=freq_weight, sample_var=weighted_sample_var)
self.model_cate = self._model_final.fit(X, ys, **filtered_kwargs)
else:
weighted_sample_var = sample_var / propensities**2 if sample_var is not None else None
filtered_kwargs = filter_none_kwargs(sample_weight=sample_weight,
freq_weight=freq_weight, sample_var=weighted_sample_var)
self.models_cate = [clone(self._model_final, safe=False).fit(X, Y_pred[..., t] - Y_pred[..., 0],
**filtered_kwargs)
for t in np.arange(1, Y_pred.shape[-1])]
return self
def predict(self, X=None):
if (X is not None) and (self._featurizer is not None):
X = self._featurizer.transform(X)
if self._multitask_model_final:
pred = self.model_cate.predict(X).reshape((-1, self.d_t))
if self.d_y: # need to reintroduce singleton Y dimension
return pred[:, np.newaxis, :]
return pred
else:
preds = np.array([mdl.predict(X).reshape((-1,) + self.d_y) for mdl in self.models_cate])
return np.moveaxis(preds, 0, -1) # move treatment dim to end
def score(self, Y, T, X=None, W=None, *, nuisances, sample_weight=None, groups=None):
if (X is not None) and (self._featurizer is not None):
X = self._featurizer.transform(X)
Y_pred, _ = nuisances
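# The score is the MSE of the fitted CATE model(s) against the doubly robust
# pseudo-outcome differences on the scoring data.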
if self._multitask_model_final:
Y_pred_diff = Y_pred[..., 1:] - Y_pred[..., [0]]
cate_pred = self.model_cate.predict(X).reshape((-1, self.d_t))
if self.d_y:
cate_pred = cate_pred[:, np.newaxis, :]
return np.mean(np.average((Y_pred_diff - cate_pred)**2, weights=sample_weight, axis=0))
else:
scores = []
for t in np.arange(1, Y_pred.shape[-1]):
# since we only allow single dimensional y, we could flatten the prediction
Y_pred_diff = (Y_pred[..., t] - Y_pred[..., 0]).flatten()
cate_pred = self.models_cate[t - 1].predict(X).flatten()
score = np.average((Y_pred_diff - cate_pred)**2, weights=sample_weight, axis=0)
scores.append(score)
return np.mean(scores)
class DRLearner(_OrthoLearner):
"""
CATE estimator that uses doubly-robust correction techniques to account for
covariate shift (selection bias) between the treatment arms. The estimator is a special
case of an :class:`._OrthoLearner` estimator, so it follows the two
stage process, where a set of nuisance functions are estimated in the first stage in a crossfitting
manner and a final stage estimates the CATE model. See the documentation of
:class:`._OrthoLearner` for a description of this two stage process.
In this estimator, the CATE is estimated by using the following estimating equations. If we let:
.. math ::
Y_{i, t}^{DR} = E[Y | X_i, W_i, T_i=t]\
+ \\frac{Y_i - E[Y | X_i, W_i, T_i=t]}{Pr[T_i=t | X_i, W_i]} \\cdot 1\\{T_i=t\\}
Then the following estimating equation holds:
.. math ::
E\\left[Y_{i, t}^{DR} - Y_{i, 0}^{DR} | X_i\\right] = \\theta_t(X_i)
Thus if we estimate the nuisance functions :math:`h(X, W, T) = E[Y | X, W, T]` and
:math:`p_t(X, W)=Pr[T=t | X, W]` in the first stage, we can estimate the final stage cate for each
treatment t, by running a regression, regressing :math:`Y_{i, t}^{DR} - Y_{i, 0}^{DR}` on :math:`X_i`.
The problem of estimating the nuisance function :math:`p` is a simple multi-class classification
problem of predicting the label :math:`T` from :math:`X, W`. The :class:`.DRLearner`
class takes as input the parameter ``model_propensity``, which is an arbitrary scikit-learn
classifier, that is internally used to solve this classification problem.
The second nuisance function :math:`h` is a simple regression problem and the :class:`.DRLearner`
class takes as input the parameter ``model_regression``, which is an arbitrary scikit-learn regressor that
is internally used to solve this regression problem.
The final stage is a multi-task regression problem whose targets are the labels
:math:`Y_{i, t}^{DR} - Y_{i, 0}^{DR}` for each non-baseline treatment t. The :class:`.DRLearner` takes as
input the parameter ``model_final``, which is any scikit-learn regressor that is internally used to solve
this multi-task regression problem. If the parameter ``multitask_model_final`` is False, then this model is
assumed to be a mono-task regressor, and separate clones of it are used to solve each regression target
separately.
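As an illustration of the construction above, the following minimal sketch (for exposition only; unlike
this class it skips cross-fitting) builds the doubly robust pseudo-outcomes for a binary treatment with
plain scikit-learn nuisance models and then runs the final-stage regression:
.. testcode::
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
rng = np.random.default_rng(0)
X = rng.normal(size=(500, 3))
T = rng.binomial(1, 1 / (1 + np.exp(-X[:, 0])))
y = (1 + .5 * X[:, 0]) * T + X[:, 0] + rng.normal(scale=.1, size=500)
# nuisance fits (hypothetical stand-ins for the internally fitted, cross-fitted models)
reg = LinearRegression().fit(np.column_stack([X, T]), y)
prop = LogisticRegression().fit(X, T)
p = np.clip(prop.predict_proba(X), 1e-6, None)
# regression predictions under each counterfactual treatment
y0 = reg.predict(np.column_stack([X, np.zeros_like(T)]))
y1 = reg.predict(np.column_stack([X, np.ones_like(T)]))
# doubly robust pseudo-outcomes for t = 0 and t = 1
y0_dr = y0 + (y - y0) * (T == 0) / p[:, 0]
y1_dr = y1 + (y - y1) * (T == 1) / p[:, 1]
# final stage: regress the pseudo-outcome difference on X to estimate theta_1(X)
cate_model = LinearRegression().fit(X, y1_dr - y0_dr)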
Parameters
----------
model_propensity: estimator, default ``'auto'``
Classifier for Pr[T=t | X, W]. Trained by regressing treatments on (features, controls) concatenated.
- If ``'auto'``, the model will be the best-fitting of a set of linear and forest models
- Otherwise, see :ref:`model_selection` for the range of supported options
model_regression: estimator, default ``'auto'``
Estimator for E[Y | X, W, T]. Trained by regressing Y on (features, controls, one-hot-encoded treatments)
concatenated. The one-hot-encoding excludes the baseline treatment.
- If ``'auto'``, the model will be the best-fitting of a set of linear and forest models
- Otherwise, see :ref:`model_selection` for the range of supported options;
if a single model is specified it should be a classifier if `discrete_outcome` is True
and a regressor otherwise
model_final :
estimator for the final CATE model. Trained by regressing the doubly robust potential outcome
differences on the features X.
- If X is None, then the fit method of model_final should be able to handle X=None.
- If featurizer is not None and X is not None, then it is trained on the outcome of
featurizer.fit_transform(X).
- If multitask_model_final is True, then this model must support multitasking
and it is trained by regressing all doubly robust target outcomes on the (featurized) features simultaneously.
- The output of the predict(X) of the trained model will contain the CATEs for each treatment compared to
baseline treatment (lexicographically smallest). If multitask_model_final is False, it is assumed to be a
mono-task model and a separate clone of the model is trained for each outcome. Then predict(X) of the t-th
clone will be the CATE of the t-th lexicographically ordered treatment compared to the baseline.
discrete_outcome: bool, default False
Whether the outcome should be treated as binary
multitask_model_final : bool, default False
Whether the model_final should be treated as a multi-task model. See description of model_final.
featurizer : :term:`transformer`, optional
Must support fit_transform and transform. Used to create composite features in the final CATE regression.
It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
If featurizer=None, then CATE is trained on X.
min_propensity : float, default ``1e-6``
The minimum propensity at which to clip propensity estimates to avoid dividing by zero.
categories: 'auto' or list, default 'auto'
The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values).
The first category will be treated as the control treatment.
cv: int, cross-validation generator or an iterable, default 2
Determines the cross-validation splitting strategy.
Possible inputs for cv are:
- None, to use the default 3-fold cross-validation,
- integer, to specify the number of folds.
- :term:`CV splitter`
- An iterable yielding (train, test) splits as arrays of indices.
For integer/None inputs, if the treatment is discrete
:class:`~sklearn.model_selection.StratifiedKFold` is used, else,
:class:`~sklearn.model_selection.KFold` is used
(with a random shuffle in either case).
Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all
W, X are None, then we call `split(ones((T.shape[0], 1)), T)`.
mc_iters: int, optional
The number of times to rerun the first stage models to reduce the variance of the nuisances.
mc_agg: {'mean', 'median'}, default 'mean'
How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of
cross-fitting.
random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None
If int, random_state is the seed used by the random number generator;
If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
by :mod:`np.random<numpy.random>`.
allow_missing: bool
Whether to allow missing values in X, W. If True, will need to supply model_propensity,
model_regression, and model_final that can handle missing values.
use_ray: bool, default False
Whether to use Ray to parallelize the cross-fitting step. If True, Ray must be installed.
ray_remote_func_options : dict, default None
Options to pass to the remote function when using Ray.
See https://docs.ray.io/en/latest/ray-core/api/doc/ray.remote.html
Examples
--------
A simple example with the default models:
.. testcode::
:hide:
import numpy as np
import scipy.special
np.set_printoptions(suppress=True)
.. testcode::
from econml.dr import DRLearner
np.random.seed(123)
X = np.random.normal(size=(1000, 3))
T = np.random.binomial(2, scipy.special.expit(X[:, 0]))
sigma = 0.001
y = (1 + .5*X[:, 0]) * T + X[:, 0] + np.random.normal(0, sigma, size=(1000,))
est = DRLearner()
est.fit(y, T, X=X, W=None)
>>> est.const_marginal_effect(X[:2])
array([[0.516931..., 0.995704...],
[0.356427..., 0.671870...]])
>>> est.effect(X[:2], T0=0, T1=1)
array([0.516931..., 0.356427...])
>>> est.score_
2.84365756...
>>> est.score(y, T, X=X)
1.06259465...
>>> est.model_cate(T=1).coef_
array([ 0.447095..., -0.001013... , 0.018982...])
>>> est.model_cate(T=2).coef_
array([ 0.925055..., -0.012357... , 0.033489...])
>>> est.cate_feature_names()
['X0', 'X1', 'X2']
Beyond default models:
.. testcode::
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from econml.dr import DRLearner
np.random.seed(123)
X = np.random.normal(size=(1000, 3))
T = np.random.binomial(2, scipy.special.expit(X[:, 0]))
sigma = 0.01
y = (1 + .5*X[:, 0]) * T + X[:, 0] + np.random.normal(0, sigma, size=(1000,))
est = DRLearner(model_propensity=RandomForestClassifier(n_estimators=100, min_samples_leaf=10),
model_regression=RandomForestRegressor(n_estimators=100, min_samples_leaf=10),
model_final=LassoCV(cv=3),
featurizer=None)
est.fit(y, T, X=X, W=None)
>>> est.score_
1.7...
>>> est.const_marginal_effect(X[:3])
array([[0.68..., 1.10...],
[0.56..., 0.79... ],
[0.34..., 0.10... ]])
>>> est.model_cate(T=2).coef_
array([0.74..., 0. , 0. ])
>>> est.model_cate(T=2).intercept_
1.9...
>>> est.model_cate(T=1).coef_
array([0.24..., 0.00..., 0. ])
>>> est.model_cate(T=1).intercept_
0.94...
Attributes
----------
score_ : float
The MSE in the final doubly robust potential outcome regressions, i.e.
.. math::
\\frac{1}{n_t} \\sum_{t=1}^{n_t} \\frac{1}{n} \\sum_{i=1}^n (Y_{i, t}^{DR} - \\hat{\\theta}_t(X_i))^2
where n_t is the number of treatments (excluding control).
If `sample_weight` is not None at fit time, then a weighted average across samples is returned.
"""
def __init__(self, *,
model_propensity='auto',
model_regression='auto',
model_final=StatsModelsLinearRegression(),
discrete_outcome=False,
multitask_model_final=False,
featurizer=None,
min_propensity=1e-6,
categories='auto',
cv=2,
mc_iters=None,
mc_agg='mean',
random_state=None,
allow_missing=False,
use_ray=False,
ray_remote_func_options=None
):
self.model_propensity = clone(model_propensity, safe=False)
self.model_regression = clone(model_regression, safe=False)
self.model_final = clone(model_final, safe=False)
self.multitask_model_final = multitask_model_final
self.featurizer = clone(featurizer, safe=False)
self.min_propensity = min_propensity
super().__init__(cv=cv,
mc_iters=mc_iters,
mc_agg=mc_agg,
discrete_outcome=discrete_outcome,
discrete_treatment=True,
treatment_featurizer=None, # treatment featurization not supported with discrete treatment
discrete_instrument=False, # no instrument, so doesn't matter
categories=categories,
random_state=random_state,
allow_missing=allow_missing,
use_ray=use_ray,
ray_remote_func_options=ray_remote_func_options
)
def _gen_allowed_missing_vars(self):
return ['X', 'W'] if self.allow_missing else []
# override only so that we can exclude treatment featurization verbiage in docstring
def const_marginal_effect(self, X=None):
"""
Calculate the constant marginal CATE :math:`\\theta(·)`.
The marginal effect is conditional on a vector of
features on a set of m test samples X[i].
Parameters
----------
X: (m, d_x) matrix, optional
Features for each sample.
Returns
-------
theta: (m, d_y, d_t) matrix or (d_y, d_t) matrix if X is None
Constant marginal CATE of each treatment on each outcome for each sample X[i].
Note that when Y or T is a vector rather than a 2-dimensional array,
the corresponding singleton dimensions in the output will be collapsed
(e.g. if both are vectors, then the output of this method will also be a vector)
"""
return super().const_marginal_effect(X=X)
# override only so that we can exclude treatment featurization verbiage in docstring
def const_marginal_ate(self, X=None):
"""
Calculate the average constant marginal CATE :math:`E_X[\\theta(X)]`.
Parameters
----------
X: (m, d_x) matrix, optional
Features for each sample.
Returns
-------
theta: (d_y, d_t) matrix
Average constant marginal CATE of each treatment on each outcome.
Note that when Y or T is a vector rather than a 2-dimensional array,
the corresponding singleton dimensions in the output will be collapsed
(e.g. if both are vectors, then the output of this method will be a scalar)
"""
return super().const_marginal_ate(X=X)
def _get_inference_options(self):
options = super()._get_inference_options()
if not self.multitask_model_final:
options.update(auto=GenericModelFinalInferenceDiscrete)
else:
options.update(auto=lambda: None)
return options
def _gen_ortho_learner_model_nuisance(self):
model_propensity = _make_first_stage_selector(self.model_propensity, True, self.random_state)
model_regression = _make_first_stage_selector(self.model_regression, self.discrete_outcome, self.random_state)
return _ModelNuisance(model_propensity, model_regression, self.min_propensity, self.discrete_outcome)
def _gen_featurizer(self):
return clone(self.featurizer, safe=False)
def _gen_model_final(self):
return clone(self.model_final, safe=False)
def _gen_ortho_learner_model_final(self):
return _ModelFinal(self._gen_model_final(), self._gen_featurizer(), self.multitask_model_final)
def fit(self, Y, T, *, X=None, W=None, sample_weight=None, freq_weight=None, sample_var=None, groups=None,
cache_values=False, inference='auto'):
"""
Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`.
Parameters
----------
Y: (n,) vector of length n
Outcomes for each sample
T: (n,) vector of length n
Treatments for each sample
X:(n, d_x) matrix, optional
Features for each sample
W:(n, d_w) matrix, optional
Controls for each sample
sample_weight : (n,) array_like, optional
Individual weights for each sample. If None, it assumes equal weight.
freq_weight: (n,) array_like of int, optional
Weight for the observation. Observation i is treated as the mean
outcome of freq_weight[i] independent observations.
When ``sample_var`` is not None, this should be provided.
sample_var : (n,) nd array_like, optional
Variance of the outcome(s) of the original freq_weight[i] observations that were used to
compute the mean outcome represented by observation i.
groups: (n,) vector, optional
All rows corresponding to the same group will be kept together during splitting.
If groups is not None, the `cv` argument passed to this class's initializer
must support a 'groups' argument to its split method.
cache_values: bool, default False
Whether to cache inputs and first stage results, which will allow refitting a different final model
inference: str, :class:`.Inference` instance, or None
Method for performing inference. This estimator supports 'bootstrap'
(or an instance of :class:`.BootstrapInference`).
Returns
-------
self: DRLearner instance
"""
# Replacing fit from _OrthoLearner, to enforce Z=None and improve the docstring
return super().fit(Y, T, X=X, W=W,
sample_weight=sample_weight, freq_weight=freq_weight, sample_var=sample_var, groups=groups,
cache_values=cache_values, inference=inference)
def refit_final(self, *, inference='auto'):
return super().refit_final(inference=inference)
refit_final.__doc__ = _OrthoLearner.refit_final.__doc__
def score(self, Y, T, X=None, W=None, sample_weight=None):
"""
Score the fitted CATE model on a new data set. Generates nuisance parameters
for the new data set based on the fitted nuisance models created at fit time.
It uses the mean prediction of the models fitted by the different crossfit folds.
Then calculates the MSE of the fitted CATE models against the doubly robust potential outcome
differences on the new data set.
If model_final does not have a score method, then it raises an :exc:`.AttributeError`
Parameters
----------
Y: (n,) vector of length n
Outcomes for each sample
T: (n,) vector of length n
Treatments for each sample
X:(n, d_x) matrix, optional
Features for each sample
W:(n, d_w) matrix, optional
Controls for each sample
sample_weight:(n,) vector, optional
Weights for each sample
Returns
-------
score: float
The MSE of the final CATE model on the new data.
"""
# Replacing score from _OrthoLearner, to enforce Z=None and improve the docstring
return super().score(Y, T, X=X, W=W, sample_weight=sample_weight)
@property
def multitask_model_cate(self):
"""
Get the fitted final CATE model.
Returns
-------
multitask_model_cate: object of type(`model_final`)
An instance of the model_final object that was fitted after calling fit, whose
vector of outcomes corresponds to the CATE model for each treatment, compared to baseline.
Available only when multitask_model_final=True.
"""
if not self.ortho_learner_model_final_._multitask_model_final:
raise AttributeError("Separate CATE models were fitted for each treatment! Use model_cate.")
return self.ortho_learner_model_final_.model_cate
def model_cate(self, T=1):
"""
Get the fitted final CATE model.
Parameters
----------
T: alphanumeric
The treatment with respect to which we want the fitted CATE model.
Returns
-------
model_cate: object of type(model_final)
An instance of the model_final object that was fitted after calling fit, which corresponds
to the CATE model for treatment T=t, compared to baseline. Available when multitask_model_final=False.
"""
if self.ortho_learner_model_final_._multitask_model_final:
raise AttributeError("A single multitask model was fitted for all treatments! Use multitask_model_cate.")
_, T = self._expand_treatments(None, T)
ind = inverse_onehot(T).item() - 1
assert ind >= 0, "No model was fitted for the control"
return self.ortho_learner_model_final_.models_cate[ind]
@property
def models_propensity(self):
"""
Get the fitted propensity models.
Returns
-------
models_propensity: nested list of objects of type(`model_propensity`)
A nested list of instances of the `model_propensity` object. The number of sublists equals the number
of Monte Carlo iterations; each element in a sublist corresponds to a crossfitting
fold and is the model instance that was fitted for that training fold.
"""
return [[mdl._model_propensity.best_model for mdl in mdls] for mdls in super().models_nuisance_]
@property
def models_regression(self):
"""
Get the fitted regression models.
Returns
-------
model_regression: nested list of objects of type(`model_regression`)
A nested list of instances of the model_regression object. The number of sublists equals the number
of Monte Carlo iterations; each element in a sublist corresponds to a crossfitting
fold and is the model instance that was fitted for that training fold.
"""
return [[mdl._model_regression.best_model for mdl in mdls] for mdls in super().models_nuisance_]
@property
def nuisance_scores_propensity(self):
"""Gets the score for the propensity model on out-of-sample training data"""
return self.nuisance_scores_[0]
@property
def nuisance_scores_regression(self):
"""Gets the score for the regression model on out-of-sample training data"""
return self.nuisance_scores_[1]
@property
def featurizer_(self):
"""
Get the fitted featurizer.
Returns
-------
featurizer: object of type(`featurizer`)
An instance of the fitted featurizer that was used to preprocess X in the final CATE model training.
Available only when featurizer is not None and X is not None.
"""
return self.ortho_learner_model_final_._featurizer
def cate_feature_names(self, feature_names=None):
"""
Get the output feature names.
Parameters
----------
feature_names: list of str of length X.shape[1] or None
The names of the input features. If None and X is a dataframe, it defaults to the column names
from the dataframe.
Returns
-------
out_feature_names: list of str or None
The names of the output features :math:`\\phi(X)`, i.e. the features with respect to which the
final CATE model for each treatment is linear. It is the names of the features that are associated
with each entry of the :meth:`coef_` parameter. Available only when the featurizer is not None and has
a method: `get_feature_names(feature_names)`. Otherwise None is returned.
"""
if self._d_x is None:
# Handles the corner case when X=None but featurizer might be not None
return None
if feature_names is None:
feature_names = self._input_names["feature_names"]
if self.featurizer_ is None:
return feature_names
return get_feature_names_or_default(self.featurizer_, feature_names)
@property
def model_final_(self):
return self.ortho_learner_model_final_._model_final
@property
def fitted_models_final(self):
return self.ortho_learner_model_final_.models_cate
def shap_values(self, X, *, feature_names=None, treatment_names=None, output_names=None, background_samples=100):
if self.ortho_learner_model_final_._multitask_model_final:
return _shap_explain_multitask_model_cate(self.const_marginal_effect, self.multitask_model_cate, X,
self._d_t, self._d_y,
featurizer=self.featurizer_,
feature_names=feature_names,
treatment_names=treatment_names,
output_names=output_names,
input_names=self._input_names,
background_samples=background_samples)
else:
return _shap_explain_model_cate(self.const_marginal_effect, self.fitted_models_final,
X, self._d_t, self._d_y,
featurizer=self.featurizer_,
feature_names=feature_names,
treatment_names=treatment_names,
output_names=output_names,
input_names=self._input_names,
background_samples=background_samples)
shap_values.__doc__ = LinearCateEstimator.shap_values.__doc__
class LinearDRLearner(StatsModelsCateEstimatorDiscreteMixin, DRLearner):
"""
Special case of the :class:`.DRLearner` where the final stage
is a Linear Regression on a low dimensional set of features. In this case, inference
can be performed via the asymptotic normal characterization of the estimated parameters.
This is computationally faster than bootstrap inference. To do this, just leave the setting ``inference='auto'``
unchanged, or explicitly set ``inference='statsmodels'``, or alter the covariance type calculation via
``inference=StatsModelsInferenceDiscrete(cov_type='HC1')``.
More concretely, this estimator assumes that the final cate model for each treatment takes a linear form:
.. math ::
\\theta_t(X) = \\left\\langle \\theta_t, \\phi(X) \\right\\rangle + \\beta_t
where :math:`\\phi(X)` is the output features of the featurizer, or `X` if featurizer is None. :math:`\\beta_t`
is an intercept of the CATE, which is included if ``fit_cate_intercept=True`` (default). It fits this by
running a standard ordinary least squares (OLS) regression, regressing the doubly robust outcome differences on X:
.. math ::
\\min_{\\theta_t, \\beta_t}\
E_n\\left[\\left(Y_{i, t}^{DR} - Y_{i, 0}^{DR}\
- \\left\\langle \\theta_t, \\phi(X_i) \\right\\rangle - \\beta_t\\right)^2\\right]
Then inference can be performed via standard approaches for OLS inference, using asymptotic normal approximations
of the estimated parameters. The default covariance estimator used is heteroskedasticity robust (HC1).
For other methods see :class:`.StatsModelsInferenceDiscrete`. You can invoke them by setting:
``inference=StatsModelsInferenceDiscrete(cov_type=...)``.
This approach is valid even if the CATE model is not linear in :math:`\\phi(X)`. In this case it performs
inference on the best linear approximation of the CATE model.
Parameters
----------
model_propensity: estimator, default ``'auto'``
Classifier for Pr[T=t | X, W]. Trained by regressing treatments on (features, controls) concatenated.
- If ``'auto'``, the model will be the best-fitting of a set of linear and forest models
- Otherwise, see :ref:`model_selection` for the range of supported options
model_regression: estimator, default ``'auto'``
Estimator for E[Y | X, W, T]. Trained by regressing Y on (features, controls, one-hot-encoded treatments)
concatenated. The one-hot-encoding excludes the baseline treatment.
- If ``'auto'``, the model will be the best-fitting of a set of linear and forest models
- Otherwise, see :ref:`model_selection` for the range of supported options;
if a single model is specified it should be a classifier if `discrete_outcome` is True
and a regressor otherwise
featurizer : :term:`transformer`, optional
Must support fit_transform and transform. Used to create composite features in the final CATE regression.
It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
If featurizer=None, then CATE is trained on X.
fit_cate_intercept : bool, default True
Whether the linear CATE model should have a constant term.
discrete_outcome: bool, default False
Whether the outcome should be treated as binary
min_propensity : float, default ``1e-6``
The minimum propensity at which to clip propensity estimates to avoid dividing by zero.
categories: 'auto' or list, default 'auto'
The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values).
The first category will be treated as the control treatment.
cv: int, cross-validation generator or an iterable, default 2
Determines the cross-validation splitting strategy.
Possible inputs for cv are:
- None, to use the default 3-fold cross-validation,
- integer, to specify the number of folds.
- :term:`CV splitter`
- An iterable yielding (train, test) splits as arrays of indices.
For integer/None inputs, if the treatment is discrete
:class:`~sklearn.model_selection.StratifiedKFold` is used, else,
:class:`~sklearn.model_selection.KFold` is used
(with a random shuffle in either case).
Unless an iterable is used, we call `split(X,T)` to generate the splits.
mc_iters: int, optional
The number of times to rerun the first stage models to reduce the variance of the nuisances.
mc_agg: {'mean', 'median'}, default 'mean'
How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of
cross-fitting.
random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None
If int, random_state is the seed used by the random number generator;
If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
by :mod:`np.random<numpy.random>`.
allow_missing: bool
Whether to allow missing values in W. If True, will need to supply model_propensity and
model_regression that can handle missing values.
enable_federation: bool, default False
Whether to enable federation for the final model. This has a memory cost so should be enabled only
if this model will be aggregated with other models.
use_ray: bool, default False
Whether to use Ray to parallelize the cross-fitting step. If True, Ray must be installed.
ray_remote_func_options : dict, default None
Options to pass to the remote function when using Ray.
See https://docs.ray.io/en/latest/ray-core/api/doc/ray.remote.html
Examples
--------
A simple example with the default models:
.. testcode::
:hide:
import numpy as np
import scipy.special
np.set_printoptions(suppress=True)
.. testcode::
from econml.dr import DRLearner, LinearDRLearner
np.random.seed(123)
X = np.random.normal(size=(1000, 3))
T = np.random.binomial(2, scipy.special.expit(X[:, 0]))
y = (1 + .5*X[:, 0]) * T + X[:, 0] + np.random.normal(size=(1000,))
est = LinearDRLearner()
est.fit(y, T, X=X, W=None)
>>> est.effect(X[:3])
array([ 0.432476..., 0.359739..., -0.085326...])
>>> est.effect_interval(X[:3])
(array([ 0.084145... , -0.178020..., -0.734688...]), array([0.780807..., 0.897500..., 0.564035...]))
>>> est.coef_(T=1)
array([ 0.450620..., -0.008792..., 0.075242...])
>>> est.coef__interval(T=1)
(array([ 0.156233... , -0.252177..., -0.159805...]), array([0.745007..., 0.234592..., 0.310290...]))
>>> est.intercept_(T=1)
0.90916103...
>>> est.intercept__interval(T=1)
(0.66855287..., 1.14976919...)
Attributes
----------
score_ : float
The MSE in the final doubly robust potential outcome regressions, i.e.
.. math::
\\frac{1}{n_t} \\sum_{t=1}^{n_t} \\frac{1}{n} \\sum_{i=1}^n (Y_{i, t}^{DR} - \\hat{\\theta}_t(X_i))^2
where n_t is the number of treatments (excluding control).
If `sample_weight` is not None at fit time, then a weighted average across samples is returned.
"""
def __init__(self, *,
model_propensity='auto',
model_regression='auto',
featurizer=None,
fit_cate_intercept=True,
discrete_outcome=False,
min_propensity=1e-6,
categories='auto',
cv=2,
mc_iters=None,
mc_agg='mean',
random_state=None,
allow_missing=False,
enable_federation=False,
use_ray=False,
ray_remote_func_options=None):
self.fit_cate_intercept = fit_cate_intercept
self.enable_federation = enable_federation
super().__init__(model_propensity=model_propensity,
model_regression=model_regression,
model_final=None,
discrete_outcome=discrete_outcome,
featurizer=featurizer,
multitask_model_final=False,
min_propensity=min_propensity,
categories=categories,
cv=cv,
mc_iters=mc_iters,
mc_agg=mc_agg,
random_state=random_state,
allow_missing=allow_missing,
use_ray=use_ray,
ray_remote_func_options=ray_remote_func_options
)
def _gen_allowed_missing_vars(self):
return ['W'] if self.allow_missing else []
def _gen_model_final(self):
return StatsModelsLinearRegression(fit_intercept=self.fit_cate_intercept,
enable_federation=self.enable_federation)
def _gen_ortho_learner_model_final(self):
return _ModelFinal(self._gen_model_final(), self._gen_featurizer(), False)
def fit(self, Y, T, *, X=None, W=None, sample_weight=None, freq_weight=None, sample_var=None, groups=None,
cache_values=False, inference='auto'):
"""
Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`.
Parameters
----------
Y: (n,) vector of length n
Outcomes for each sample
T: (n,) vector of length n
Treatments for each sample
X:(n, d_x) matrix, optional
Features for each sample
W:(n, d_w) matrix, optional
Controls for each sample
sample_weight : (n,) array_like, optional
Individual weights for each sample. If None, it assumes equal weight.
freq_weight: (n,) array_like of int, optional
Weight for the observation. Observation i is treated as the mean
outcome of freq_weight[i] independent observations.
When ``sample_var`` is not None, this should be provided.
sample_var : (n,) nd array_like, optional
Variance of the outcome(s) of the original freq_weight[i] observations that were used to
compute the mean outcome represented by observation i.
groups: (n,) vector, optional
All rows corresponding to the same group will be kept together during splitting.
If groups is not None, the `cv` argument passed to this class's initializer
must support a 'groups' argument to its split method.
cache_values: bool, default False
Whether to cache inputs and first stage results, which will allow refitting a different final model
inference: str, :class:`.Inference` instance, or None
Method for performing inference. This estimator supports ``'bootstrap'``
(or an instance of :class:`.BootstrapInference`) and ``'statsmodels'``
(or an instance of :class:`.StatsModelsInferenceDiscrete`).
Returns
-------
self: DRLearner instance
"""
# Replacing fit from DRLearner, to add statsmodels inference in docstring
return super().fit(Y, T, X=X, W=W,
sample_weight=sample_weight, freq_weight=freq_weight, sample_var=sample_var, groups=groups,
cache_values=cache_values, inference=inference)
@property
def fit_cate_intercept_(self):
return self.model_final_.fit_intercept
@property
def multitask_model_cate(self):
# Replacing this property, which is invalid for this class, so that we make the
# docstring empty and it does not appear in the docs.
return super().multitask_model_cate
@property
def multitask_model_final(self):
return False
@multitask_model_final.setter
def multitask_model_final(self, value):
if value:
raise ValueError("Parameter `multitask_model_final` cannot change from `False` for this estimator!")
@property
def model_final(self):
return self._gen_model_final()
@model_final.setter
def model_final(self, model):
if model is not None:
raise ValueError("Parameter `model_final` cannot be altered for this estimator!")
class SparseLinearDRLearner(DebiasedLassoCateEstimatorDiscreteMixin, DRLearner):
"""
Special case of the :class:`.DRLearner` where the final stage
is a Debiased Lasso Regression. In this case, inference can be performed via the debiased lasso approach
and its asymptotic normal characterization of the estimated parameters. This is computationally
faster than bootstrap inference. Leave the default ``inference='auto'`` unchanged, or explicitly set
``inference='debiasedlasso'`` at fit time to enable inference via asymptotic normality.
More concretely, this estimator assumes that the final cate model for each treatment takes a linear form:
.. math ::
\\theta_t(X) = \\left\\langle \\theta_t, \\phi(X) \\right\\rangle + \\beta_t
where :math:`\\phi(X)` is the output features of the featurizer, or `X` if featurizer is None. :math:`\\beta_t`
is an intercept of the CATE, which is included if ``fit_cate_intercept=True`` (default). It fits this by
running a debiased lasso regression (i.e. :math:`\\ell_1`-penalized regression with debiasing),
regressing the doubly robust outcome differences on X: i.e. first solves the penalized square loss problem
.. math ::
\\min_{\\theta_t, \\beta_t}\
E_n\\left[\\left(Y_{i, t}^{DR} - Y_{i, 0}^{DR}\
- \\left\\langle \\theta_t, \\phi(X_i) \\right\\rangle - \\beta_t\\right)^2\\right]\
+ \\lambda \\left\\lVert \\theta_t \\right\\rVert_1
and then adds a debiasing correction to the solution. If alpha='auto' (recommended), then the penalty
weight :math:`\\lambda` is set optimally via cross-validation.
This approach is valid even if the CATE model is not linear in :math:`\\phi(X)`. In this case it performs
inference on the best sparse linear approximation of the CATE model.
Parameters
----------
model_propensity: estimator, default ``'auto'``
Classifier for Pr[T=t | X, W]. Trained by regressing treatments on (features, controls) concatenated.
- If ``'auto'``, the model will be the best-fitting of a set of linear and forest models
- Otherwise, see :ref:`model_selection` for the range of supported options
model_regression: estimator, default ``'auto'``
Estimator for E[Y | X, W, T]. Trained by regressing Y on (features, controls, one-hot-encoded treatments)
concatenated. The one-hot-encoding excludes the baseline treatment.
- If ``'auto'``, the model will be the best-fitting of a set of linear and forest models
- Otherwise, see :ref:`model_selection` for the range of supported options;
if a single model is specified it should be a classifier if `discrete_outcome` is True
and a regressor otherwise
featurizer : :term:`transformer`, optional
Must support fit_transform and transform. Used to create composite features in the final CATE regression.
It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
If featurizer=None, then CATE is trained on X.
fit_cate_intercept : bool, default True
Whether the linear CATE model should have a constant term.
discrete_outcome: bool, default False
Whether the outcome should be treated as binary
alpha: str or float, default 'auto'
CATE L1 regularization applied through the debiased lasso in the final model.
'auto' corresponds to a CV form of the :class:`DebiasedLasso`.
n_alphas : int, default 100
How many alphas to try if alpha='auto'
alpha_cov : str or float, default 'auto'
The regularization alpha that is used when constructing the pseudo inverse of
the covariance matrix Theta used for correcting the final stage lasso coefficient
in the debiased lasso. Each such regression corresponds to the regression of one feature
on the remainder of the features.
n_alphas_cov : int, default 10
How many alpha_cov to try if alpha_cov='auto'.
max_iter : int, default 1000
The maximum number of iterations in the Debiased Lasso
tol : float, default 1e-4
The tolerance for the optimization: if the updates are
smaller than ``tol``, the optimization code checks the
dual gap for optimality and continues until it is smaller
than ``tol``.
n_jobs : int or None, optional
The number of jobs to run in parallel for both `fit` and `predict`.
``None`` means 1 unless in a :func:`joblib.parallel_backend` context.
``-1`` means using all processors.
min_propensity : float, default ``1e-6``
The minimum propensity at which to clip propensity estimates to avoid dividing by zero.
categories: 'auto' or list, default 'auto'
The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values).
The first category will be treated as the control treatment.
cv: int, cross-validation generator or an iterable, default 2
Determines the cross-validation splitting strategy.
Possible inputs for cv are:
- None, to use the default 3-fold cross-validation,
- integer, to specify the number of folds.
- :term:`CV splitter`
- An iterable yielding (train, test) splits as arrays of indices.
For integer/None inputs, if the treatment is discrete
:class:`~sklearn.model_selection.StratifiedKFold` is used, else,
:class:`~sklearn.model_selection.KFold` is used
(with a random shuffle in either case).
Unless an iterable is used, we call `split(X,T)` to generate the splits.
mc_iters: int, optional
The number of times to rerun the first stage models to reduce the variance of the nuisances.
mc_agg: {'mean', 'median'}, default 'mean'
How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of
cross-fitting.
random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None
If int, random_state is the seed used by the random number generator;
If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
by :mod:`np.random<numpy.random>`.
allow_missing: bool
Whether to allow missing values in W. If True, will need to supply model_propensity and
model_regression that can handle missing values.
use_ray: bool, default False
Whether to use Ray to parallelize the cross-validation step. If True, Ray must be installed.
ray_remote_func_options : dict, default None
Options to pass to the remote function when using Ray.
See https://docs.ray.io/en/latest/ray-core/api/doc/ray.remote.html
Examples
--------
A simple example with the default models:
.. testcode::
:hide:
import numpy as np
import scipy.special
np.set_printoptions(suppress=True)
.. testcode::
from econml.dr import DRLearner, SparseLinearDRLearner
np.random.seed(123)
X = np.random.normal(size=(1000, 3))
T = np.random.binomial(2, scipy.special.expit(X[:, 0]))
y = (1 + .5*X[:, 0]) * T + X[:, 0] + np.random.normal(size=(1000,))
est = SparseLinearDRLearner()
est.fit(y, T, X=X, W=None)
>>> est.effect(X[:3])
array([ 0.43..., 0.35..., -0.08... ])
>>> est.effect_interval(X[:3])
(array([-0.01..., -0.26..., -0.81...]), array([0.87..., 0.98..., 0.65...]))
>>> est.coef_(T=1)
array([ 0.44..., -0.00..., 0.07...])
>>> est.coef__interval(T=1)
(array([ 0.19... , -0.24..., -0.17...]), array([0.70..., 0.22..., 0.32...]))
>>> est.intercept_(T=1)
0.90...
>>> est.intercept__interval(T=1)
(0.66..., 1.14...)
Attributes
----------
score_ : float
The MSE in the final doubly robust potential outcome regressions, i.e.
.. math::
\\frac{1}{n_t} \\sum_{t=1}^{n_t} \\frac{1}{n} \\sum_{i=1}^n (Y_{i, t}^{DR} - \\hat{\\theta}_t(X_i))^2
where n_t is the number of treatments (excluding control).
If `sample_weight` is not None at fit time, then a weighted average across samples is returned.
"""
def __init__(self, *,
model_propensity='auto',
model_regression='auto',
featurizer=None,
fit_cate_intercept=True,
discrete_outcome=False,
alpha='auto',
n_alphas=100,
alpha_cov='auto',
n_alphas_cov=10,
max_iter=1000,
tol=1e-4,
n_jobs=None,
min_propensity=1e-6,
categories='auto',
cv=2,
mc_iters=None,
mc_agg='mean',
random_state=None,
allow_missing=False,
use_ray=False,
ray_remote_func_options=None):
self.fit_cate_intercept = fit_cate_intercept
self.alpha = alpha
self.n_alphas = n_alphas
self.alpha_cov = alpha_cov
self.n_alphas_cov = n_alphas_cov
self.max_iter = max_iter
self.tol = tol
self.n_jobs = n_jobs
super().__init__(model_propensity=model_propensity,
model_regression=model_regression,
model_final=None,
discrete_outcome=discrete_outcome,
featurizer=featurizer,
multitask_model_final=False,
min_propensity=min_propensity,
categories=categories,
cv=cv,
mc_iters=mc_iters,
mc_agg=mc_agg,
random_state=random_state,
allow_missing=allow_missing,
use_ray=use_ray,
ray_remote_func_options=ray_remote_func_options)
def _gen_allowed_missing_vars(self):
return ['W'] if self.allow_missing else []
def _gen_model_final(self):
return DebiasedLasso(alpha=self.alpha,
n_alphas=self.n_alphas,
alpha_cov=self.alpha_cov,
n_alphas_cov=self.n_alphas_cov,
fit_intercept=self.fit_cate_intercept,
max_iter=self.max_iter,
tol=self.tol,
n_jobs=self.n_jobs,
random_state=self.random_state)
def _gen_ortho_learner_model_final(self):
return _ModelFinal(self._gen_model_final(), self._gen_featurizer(), False)
def fit(self, Y, T, *, X=None, W=None, sample_weight=None, groups=None,
cache_values=False, inference='auto'):
"""
Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`.
Parameters
----------
Y: (n,) vector of length n
Outcomes for each sample
T: (n,) vector of length n
Treatments for each sample
X:(n, d_x) matrix, optional
Features for each sample
W:(n, d_w) matrix, optional
Controls for each sample
sample_weight : (n,) array_like or None
Individual weights for each sample. If None, it assumes equal weight.
groups: (n,) vector, optional
All rows corresponding to the same group will be kept together during splitting.
If groups is not None, the `cv` argument passed to this class's initializer
must support a 'groups' argument to its split method.
cache_values: bool, default False
Whether to cache inputs and first stage results, which will allow refitting a different final model
inference: str, :class:`.Inference` instance, or None
Method for performing inference. This estimator supports ``'bootstrap'``
(or an instance of :class:`.BootstrapInference`) and ``'debiasedlasso'``
(or an instance of :class:`.LinearModelInferenceDiscrete`).
Returns
-------
self: DRLearner instance
"""
# TODO: support freq_weight and sample_var in debiased lasso
# Replacing fit from DRLearner, to add debiasedlasso inference in docstring
check_high_dimensional(X, T, threshold=5, featurizer=self.featurizer,
discrete_treatment=self.discrete_treatment,
msg="The number of features in the final model (< 5) is too small for a sparse model. "
"We recommend using the LinearDRLearner for this low-dimensional setting.")
return super().fit(Y, T, X=X, W=W,
sample_weight=sample_weight, groups=groups,
cache_values=cache_values, inference=inference)
@property
def fit_cate_intercept_(self):
return self.model_final_.fit_intercept
@property
def multitask_model_final(self):
return False
@multitask_model_final.setter
def multitask_model_final(self, value):
if value:
raise ValueError("Parameter `multitask_model_final` cannot change from `False` for this estimator!")
@property
def model_final(self):
return self._gen_model_final()
@model_final.setter
def model_final(self, model):
if model is not None:
raise ValueError("Parameter `model_final` cannot be altered for this estimator!")
class ForestDRLearner(ForestModelFinalCateEstimatorDiscreteMixin, DRLearner):
""" Instance of DRLearner with a :class:`~econml.grf.RegressionForest`
as a final model, so as to enable non-parametric inference.
Parameters
----------
model_propensity: estimator, default ``'auto'``
Classifier for Pr[T=t | X, W]. Trained by regressing treatments on (features, controls) concatenated.
- If ``'auto'``, the model will be the best-fitting of a set of linear and forest models
- Otherwise, see :ref:`model_selection` for the range of supported options
model_regression: estimator, default ``'auto'``
Estimator for E[Y | X, W, T]. Trained by regressing Y on (features, controls, one-hot-encoded treatments)
concatenated. The one-hot-encoding excludes the baseline treatment.
- If ``'auto'``, the model will be the best-fitting of a set of linear and forest models
- Otherwise, see :ref:`model_selection` for the range of supported options;
if a single model is specified it should be a classifier if `discrete_outcome` is True
and a regressor otherwise
discrete_outcome: bool, default False
Whether the outcome should be treated as binary
min_propensity : float, default ``1e-6``
The minimum propensity at which to clip propensity estimates to avoid dividing by zero.
categories: 'auto' or list, default 'auto'
The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values).
The first category will be treated as the control treatment.
cv: int, cross-validation generator or an iterable, default 2
Determines the cross-validation splitting strategy.
Possible inputs for cv are:
- None, to use the default 3-fold cross-validation,
- integer, to specify the number of folds.
- :term:`CV splitter`
- An iterable yielding (train, test) splits as arrays of indices.
For integer/None inputs, if the treatment is discrete
:class:`~sklearn.model_selection.StratifiedKFold` is used, else,
:class:`~sklearn.model_selection.KFold` is used
(with a random shuffle in either case).
Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all
W, X are None, then we call `split(ones((T.shape[0], 1)), T)`.
mc_iters: int, optional
The number of times to rerun the first stage models to reduce the variance of the nuisances.
mc_agg: {'mean', 'median'}, default 'mean'
How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of
cross-fitting.
n_estimators : int, default 1000
The total number of trees in the forest. The forest consists of
sqrt(n_estimators) sub-forests, where each sub-forest
contains sqrt(n_estimators) trees.
max_depth : int or None, optional
The maximum depth of the tree. If None, then nodes are expanded until
all leaves are pure or until all leaves contain less than
min_samples_split samples.
min_samples_split : int, float, default 2
The minimum number of splitting samples required to split an internal node.
- If int, then consider `min_samples_split` as the minimum number.
- If float, then `min_samples_split` is a fraction and
`ceil(min_samples_split * n_samples)` are the minimum
number of samples for each split.
min_samples_leaf : int, float, default 1
The minimum number of samples required to be at a leaf node.
A split point at any depth will only be considered if it leaves at
least ``min_samples_leaf`` splitting samples in each of the left and
right branches. This may have the effect of smoothing the model,
especially in regression. After construction the tree is also pruned
so that there are at least min_samples_leaf estimation samples on
each leaf.
- If int, then consider `min_samples_leaf` as the minimum number.
- If float, then `min_samples_leaf` is a fraction and
`ceil(min_samples_leaf * n_samples)` are the minimum
number of samples for each node.
min_weight_fraction_leaf : float, default 0.
The minimum weighted fraction of the sum total of weights (of all
splitting samples) required to be at a leaf node. Samples have
equal weight when sample_weight is not provided. After construction
the tree is pruned so that the fraction of the sum total weight
of the estimation samples contained in each leaf node is at
least min_weight_fraction_leaf
max_features : int, float, str, or None, default "auto"
The number of features to consider when looking for the best split:
- If int, then consider `max_features` features at each split.
- If float, then `max_features` is a fraction and
`int(max_features * n_features)` features are considered at each
split.
- If "auto", then `max_features=n_features`.
- If "sqrt", then `max_features=sqrt(n_features)`.
- If "log2", then `max_features=log2(n_features)`.
- If None, then `max_features=n_features`.
Note: the search for a split does not stop until at least one
valid partition of the node samples is found, even if it requires to
effectively inspect more than ``max_features`` features.
min_impurity_decrease : float, default 0.
A node will be split if this split induces a decrease of the impurity
greater than or equal to this value.
The weighted impurity decrease equation is the following::
N_t / N * (impurity - N_t_R / N_t * right_impurity
- N_t_L / N_t * left_impurity)
where ``N`` is the total number of split samples, ``N_t`` is the number of
split samples at the current node, ``N_t_L`` is the number of split samples in the
left child, and ``N_t_R`` is the number of split samples in the right child.
``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
if ``sample_weight`` is passed.
max_samples : int or float in (0, .5], default .45,
The number of samples to use for each subsample that is used to train each tree:
- If int, then train each tree on `max_samples` samples, sampled without replacement from all the samples
- If float, then train each tree on ceil(`max_samples` * `n_samples`), sampled without replacement
from all the samples.
min_balancedness_tol: float in [0, .5], default .45
How imbalanced a split we can tolerate. This enforces that each split leaves at least
(.5 - min_balancedness_tol) fraction of samples on each side of the split; or fraction
of the total weight of samples, when sample_weight is not None. The default value ensures
that at least 5% of the parent node weight falls on each side of the split. Set it to 0.0 for no
balancedness and to .5 for perfectly balanced splits. For the formal inference theory
to be valid, this has to be any positive constant bounded away from zero.
honest : bool, default True
Whether to use honest trees, i.e. half of the samples are used for
creating the tree structure and the other half for the estimation at
the leafs. If False, then all samples are used for both parts.
subforest_size : int, default 4,
The number of trees in each sub-forest that is used in the bootstrap-of-little-bags calculation.
The parameter `n_estimators` must be divisible by `subforest_size`. Should typically be a small constant.
n_jobs : int or None, default -1
The number of jobs to run in parallel for both `fit` and `predict`.
``None`` means 1 unless in a :func:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
verbose : int, default 0
Controls the verbosity when fitting and predicting.
random_state : int, RandomState instance, or None, default None
If int, random_state is the seed used by the random number generator;
If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
by :mod:`np.random<numpy.random>`.
allow_missing: bool
Whether to allow missing values in W. If True, will need to supply model_propensity and
model_regression that can handle missing values.
use_ray: bool, default False
Whether to use Ray to parallelize the cross-validation step. If True, Ray must be installed.
ray_remote_func_options : dict, default None
Options to pass to the remote function when using Ray.
See https://docs.ray.io/en/latest/ray-core/api/doc/ray.remote.html
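Examples
--------
A short usage sketch with the default first-stage models (added for illustration; numerical outputs are
omitted since they depend on the forest's randomness, and a smaller ``n_estimators`` is used only to keep
the example fast):
.. testcode::
import numpy as np
import scipy.special
from econml.dr import ForestDRLearner
np.random.seed(123)
X = np.random.normal(size=(1000, 3))
T = np.random.binomial(2, scipy.special.expit(X[:, 0]))
y = (1 + .5*X[:, 0]) * T + X[:, 0] + np.random.normal(size=(1000,))
est = ForestDRLearner(n_estimators=100)
est.fit(y, T, X=X, W=None)
point = est.effect(X[:3], T0=0, T1=1)
lb, ub = est.effect_interval(X[:3], T0=0, T1=1, alpha=0.05)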
"""
def __init__(self, *,
model_regression="auto",
model_propensity="auto",
discrete_outcome=False,
featurizer=None,
min_propensity=1e-6,
categories='auto',
cv=2,
mc_iters=None,
mc_agg='mean',
n_estimators=1000,
max_depth=None,
min_samples_split=5,
min_samples_leaf=5,
min_weight_fraction_leaf=0.,
max_features="auto",
min_impurity_decrease=0.,
max_samples=.45,
min_balancedness_tol=.45,
honest=True,
subforest_size=4,
n_jobs=-1,
verbose=0,
random_state=None,
allow_missing=False,
use_ray=False,
ray_remote_func_options=None):
self.n_estimators = n_estimators
self.max_depth = max_depth
self.min_samples_split = min_samples_split
self.min_samples_leaf = min_samples_leaf
self.min_weight_fraction_leaf = min_weight_fraction_leaf
self.max_features = max_features
self.min_impurity_decrease = min_impurity_decrease
self.max_samples = max_samples
self.min_balancedness_tol = min_balancedness_tol
self.honest = honest
self.subforest_size = subforest_size
self.n_jobs = n_jobs
self.verbose = verbose
super().__init__(model_regression=model_regression,
model_propensity=model_propensity,
model_final=None,
discrete_outcome=discrete_outcome,
featurizer=featurizer,
multitask_model_final=False,
min_propensity=min_propensity,
categories=categories,
cv=cv,
mc_iters=mc_iters,
mc_agg=mc_agg,
random_state=random_state,
allow_missing=allow_missing,
use_ray=use_ray,
ray_remote_func_options=ray_remote_func_options)
def _gen_allowed_missing_vars(self):
return ['W'] if self.allow_missing else []
def _gen_model_final(self):
return RegressionForest(n_estimators=self.n_estimators,
max_depth=self.max_depth,
min_samples_split=self.min_samples_split,
min_samples_leaf=self.min_samples_leaf,
min_weight_fraction_leaf=self.min_weight_fraction_leaf,
max_features=self.max_features,
min_impurity_decrease=self.min_impurity_decrease,
max_samples=self.max_samples,
min_balancedness_tol=self.min_balancedness_tol,
honest=self.honest,
inference=True,
subforest_size=self.subforest_size,
n_jobs=self.n_jobs,
random_state=self.random_state,
verbose=self.verbose,
warm_start=False)
def _gen_ortho_learner_model_final(self):
return _ModelFinal(self._gen_model_final(), self._gen_featurizer(), False)
def fit(self, Y, T, *, X=None, W=None, sample_weight=None, groups=None,
cache_values=False, inference='auto'):
"""
Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`.
Parameters
----------
Y: (n,) vector of length n
Outcomes for each sample
T: (n,) vector of length n
Treatments for each sample
X: (n, d_x) matrix, optional
Features for each sample
W: (n, d_w) matrix, optional
Controls for each sample
sample_weight : (n,) array_like or None
Individual weights for each sample. If None, it assumes equal weight.
groups: (n,) vector, optional
All rows corresponding to the same group will be kept together during splitting.
If groups is not None, the `cv` argument passed to this class's initializer
must support a 'groups' argument to its split method.
cache_values: bool, default False
Whether to cache inputs and first stage results, which will allow refitting a different final model
inference: str, `Inference` instance, or None
Method for performing inference. This estimator supports 'bootstrap'
(or an instance of :class:`.BootstrapInference`) and 'blb'
(for Bootstrap-of-Little-Bags based inference)
Returns
-------
self
"""
if X is None:
raise ValueError("This estimator does not support X=None!")
return super().fit(Y, T, X=X, W=W,
sample_weight=sample_weight, groups=groups,
cache_values=cache_values, inference=inference)
@property
def multitask_model_cate(self):
# Replacing this property, which is invalid for this class, to remove its docstring from the docs
return super().multitask_model_cate
@property
def multitask_model_final(self):
return False
@multitask_model_final.setter
def multitask_model_final(self, value):
if value:
raise ValueError("Parameter `multitask_model_final` cannot change from `False` for this estimator!")
@property
def model_final(self):
return self._gen_model_final()
@model_final.setter
def model_final(self, model):
if model is not None:
raise ValueError("Parameter `model_final` cannot be altered for this estimator!")