Source code for econml.dr._drlearner

# Copyright (c) PyWhy contributors. All rights reserved.
# Licensed under the MIT License.

"""
Doubly Robust Learner. The method uses the doubly robust correction to construct doubly
robust estimates of all the potential outcomes of each sample. It then estimates a CATE model
by regressing the potential outcome differences on the heterogeneity features X.

References
----------

Dylan Foster, Vasilis Syrgkanis (2019).
    Orthogonal Statistical Learning.
    ACM Conference on Learning Theory. https://arxiv.org/abs/1901.09036

Robins, J.M., Rotnitzky, A., and Zhao, L.P. (1994).
    Estimation of regression coefficients when some regressors are not always observed.
    Journal of the American Statistical Association 89, 846–866.

Bang, H. and Robins, J.M. (2005).
    Doubly robust estimation in missing data and causal inference models.
    Biometrics 61, 962–972.

Tsiatis AA (2006).
    Semiparametric Theory and Missing Data.
    New York: Springer.

.. testcode::
    :hide:

    import numpy as np
    import scipy.special
    np.set_printoptions(suppress=True)

"""

from warnings import warn
from copy import deepcopy

import numpy as np
from sklearn.base import clone
from sklearn.linear_model import (LassoCV, LinearRegression,
                                  LogisticRegressionCV)
from sklearn.ensemble import RandomForestRegressor


from .._ortho_learner import _OrthoLearner
from .._cate_estimator import (DebiasedLassoCateEstimatorDiscreteMixin, BaseCateEstimator,
                               ForestModelFinalCateEstimatorDiscreteMixin,
                               StatsModelsCateEstimatorDiscreteMixin, LinearCateEstimator)
from ..inference import GenericModelFinalInferenceDiscrete
from ..grf import RegressionForest
from ..sklearn_extensions.linear_model import (
    DebiasedLasso, StatsModelsLinearRegression, WeightedLassoCVWrapper)
from ..sklearn_extensions.model_selection import ModelSelector, SingleModelSelector, get_selector
from ..utilities import (_deprecate_positional, check_high_dimensional,
                         filter_none_kwargs, inverse_onehot, get_feature_names_or_default)
from .._shap import _shap_explain_multitask_model_cate, _shap_explain_model_cate


class _ModelNuisance(ModelSelector):
    def __init__(self,
                 model_propensity: SingleModelSelector,
                 model_regression: SingleModelSelector,
                 min_propensity,
                 discrete_outcome):
        self._model_propensity = model_propensity
        self._model_regression = model_regression
        self._min_propensity = min_propensity
        self._discrete_outcome = discrete_outcome

    def _combine(self, X, W):
        return np.hstack([arr for arr in [X, W] if arr is not None])

    def train(self, is_selecting, folds, Y, T, X=None, W=None, *, sample_weight=None, groups=None):
        if Y.ndim != 1 and (Y.ndim != 2 or Y.shape[1] != 1):
            raise ValueError("The outcome matrix must be of shape ({0}, ) or ({0}, 1), "
                             "instead got {1}.".format(len(X), Y.shape))
        if (X is None) and (W is None):
            raise AttributeError("At least one of X or W has to not be None!")
        if np.any(np.all(T == 0, axis=0)) or (not np.any(np.all(T == 0, axis=1))):
            raise AttributeError("Provided crossfit folds contain training splits that " +
                                 "don't contain all treatments")
        XW = self._combine(X, W)
        filtered_kwargs = filter_none_kwargs(sample_weight=sample_weight)

        self._model_propensity.train(is_selecting, folds, XW, inverse_onehot(T), groups=groups, **filtered_kwargs)
        self._model_regression.train(is_selecting, folds, np.hstack([XW, T]), Y, groups=groups, **filtered_kwargs)
        return self

    def score(self, Y, T, X=None, W=None, *, sample_weight=None, groups=None):
        XW = self._combine(X, W)
        filtered_kwargs = filter_none_kwargs(sample_weight=sample_weight)

        propensity_score = self._model_propensity.score(XW, inverse_onehot(T), **filtered_kwargs)
        regression_score = self._model_regression.score(np.hstack([XW, T]), Y, **filtered_kwargs)

        return propensity_score, regression_score

    def predict(self, Y, T, X=None, W=None, *, sample_weight=None, groups=None):
        XW = self._combine(X, W)
        propensities = np.maximum(self._model_propensity.predict_proba(XW), self._min_propensity)
        n = T.shape[0]
        Y_pred = np.zeros((T.shape[0], T.shape[1] + 1))
        T_counter = np.zeros(T.shape)
        if hasattr(self._model_regression, 'predict_proba'):
            if self._discrete_outcome:
                Y_pred[:, 0] = self._model_regression.predict_proba(np.hstack([XW, T_counter]))[:, 1].reshape(n)
            else:
                raise AttributeError("Cannot use a classifier for model_regression when discrete_outcome=False!")
        else:
            if self._discrete_outcome:
                warn("A regressor was passed to model_regression when discrete_outcome=True. "
                     "Using a classifier is recommended.", UserWarning)
            Y_pred[:, 0] = self._model_regression.predict(np.hstack([XW, T_counter])).reshape(n)
        Y_pred[:, 0] += (Y.reshape(n) - Y_pred[:, 0]) * np.all(T == 0, axis=1) / propensities[:, 0]
        for t in np.arange(T.shape[1]):
            T_counter = np.zeros(T.shape)
            T_counter[:, t] = 1
            if self._discrete_outcome and hasattr(self._model_regression, 'predict_proba'):
                Y_pred[:, t + 1] = self._model_regression.predict_proba(np.hstack([XW, T_counter]))[:, 1].reshape(n)
            else:
                Y_pred[:, t + 1] = self._model_regression.predict(np.hstack([XW, T_counter])).reshape(n)
            Y_pred[:, t + 1] += (Y.reshape(n) - Y_pred[:, t + 1]) * (T[:, t] == 1) / propensities[:, t + 1]
        T_complete = np.hstack(((np.all(T == 0, axis=1) * 1).reshape(-1, 1), T))
        propensities_weight = np.sum(propensities * T_complete, axis=1)
        return Y_pred.reshape(Y.shape + (T.shape[1] + 1,)), propensities_weight.reshape((n,))

    @property
    def needs_fit(self):
        return self._model_propensity.needs_fit or self._model_regression.needs_fit
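
# An illustrative, self-contained sketch (not part of the econml API) of the doubly robust
# pseudo-outcome that _ModelNuisance.predict computes for each arm t:
#     Y_t^DR = E[Y | X, W, T=t] + (Y - E[Y | X, W, T=t]) * 1{T=t} / Pr[T=t | X, W]
# The helper name below is hypothetical; it assumes plain numpy arrays, pre-fitted sklearn-style
# models, and that the classifier's predict_proba columns are ordered [control, arm 1, ..., arm d_t],
# as they are when the classifier is trained on inverse_onehot(T) labels.
def _dr_pseudo_outcomes_sketch(Y, T_onehot, XW, regression_model, propensity_model, min_propensity=1e-6):
    """Return an (n, d_t + 1) array of doubly robust potential-outcome estimates."""
    n, d_t = T_onehot.shape
    propensities = np.maximum(propensity_model.predict_proba(XW), min_propensity)
    Y_dr = np.zeros((n, d_t + 1))
    for t in range(d_t + 1):  # column 0 is the baseline (control) arm
        T_counter = np.zeros((n, d_t))
        if t > 0:
            T_counter[:, t - 1] = 1
        mu_t = regression_model.predict(np.hstack([XW, T_counter]))  # outcome regression E[Y | X, W, T=t]
        in_arm = np.all(T_onehot == 0, axis=1) if t == 0 else (T_onehot[:, t - 1] == 1)
        Y_dr[:, t] = mu_t + (Y - mu_t) * in_arm / propensities[:, t]  # inverse-propensity correction term
    return Y_dr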


def _make_first_stage_selector(model, is_discrete, random_state):
    if model == "auto":
        model = ['linear', 'forest']
    return get_selector(model, is_discrete=is_discrete, random_state=random_state)


class _ModelFinal:
    # Coding Remark: The reasoning around the multitask_model_final could have been simplified if
    # we simply wrapped the model_final with a MultiOutputRegressor. However, because we also want
    # to allow even for model_final objects whose fit(X, y) can accept X=None
    # (e.g. the StatsModelsLinearRegression), we cannot take that route, because the MultiOutputRegressor
    # checks that X is a 2D array.
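    # For illustration (hypothetical call, not executed anywhere): a wrapped final model would look like
    #     MultiOutputRegressor(LinearRegression()).fit(X, ys)
    # which fails sklearn's input validation when X is None, whereas StatsModelsLinearRegression, as noted
    # above, accepts fit(None, ys), so the per-column handling is kept explicit in this class instead.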
    def __init__(self, model_final, featurizer, multitask_model_final):
        self._model_final = clone(model_final, safe=False)
        self._featurizer = clone(featurizer, safe=False)
        self._multitask_model_final = multitask_model_final
        return

    def fit(self, Y, T, X=None, W=None, *, nuisances,
            sample_weight=None, freq_weight=None, sample_var=None, groups=None):
        Y_pred, propensities = nuisances
        self.d_y = Y_pred.shape[1:-1]  # track whether there's a Y dimension (must be a singleton)
        self.d_t = Y_pred.shape[-1] - 1  # track # of treatment (exclude baseline treatment)
        if (X is not None) and (self._featurizer is not None):
            X = self._featurizer.fit_transform(X)

        if self._multitask_model_final:
            ys = Y_pred[..., 1:] - Y_pred[..., [0]]  # subtract control results from each other arm
            if self.d_y:  # need to squeeze out singleton so that we fit on 2D array
                ys = ys.squeeze(1)
            weighted_sample_var = np.tile((sample_var / propensities**2).reshape((-1, 1)),
                                          self.d_t) if sample_var is not None else None
            filtered_kwargs = filter_none_kwargs(sample_weight=sample_weight,
                                                 freq_weight=freq_weight, sample_var=weighted_sample_var)
            self.model_cate = self._model_final.fit(X, ys, **filtered_kwargs)
        else:
            weighted_sample_var = sample_var / propensities**2 if sample_var is not None else None
            filtered_kwargs = filter_none_kwargs(sample_weight=sample_weight,
                                                 freq_weight=freq_weight, sample_var=weighted_sample_var)
            self.models_cate = [clone(self._model_final, safe=False).fit(X, Y_pred[..., t] - Y_pred[..., 0],
                                                                         **filtered_kwargs)
                                for t in np.arange(1, Y_pred.shape[-1])]
        return self

    def predict(self, X=None):
        if (X is not None) and (self._featurizer is not None):
            X = self._featurizer.transform(X)
        if self._multitask_model_final:
            pred = self.model_cate.predict(X).reshape((-1, self.d_t))
            if self.d_y:  # need to reintroduce singleton Y dimension
                return pred[:, np.newaxis, :]
            return pred
        else:
            preds = np.array([mdl.predict(X).reshape((-1,) + self.d_y) for mdl in self.models_cate])
            return np.moveaxis(preds, 0, -1)  # move treatment dim to end

    def score(self, Y, T, X=None, W=None, *, nuisances, sample_weight=None, groups=None):
        if (X is not None) and (self._featurizer is not None):
            X = self._featurizer.transform(X)
        Y_pred, _ = nuisances
        if self._multitask_model_final:
            Y_pred_diff = Y_pred[..., 1:] - Y_pred[..., [0]]
            cate_pred = self.model_cate.predict(X).reshape((-1, self.d_t))
            if self.d_y:
                cate_pred = cate_pred[:, np.newaxis, :]
            return np.mean(np.average((Y_pred_diff - cate_pred)**2, weights=sample_weight, axis=0))

        else:
            scores = []
            for t in np.arange(1, Y_pred.shape[-1]):
                # since we only allow single dimensional y, we could flatten the prediction
                Y_pred_diff = (Y_pred[..., t] - Y_pred[..., 0]).flatten()
                cate_pred = self.models_cate[t - 1].predict(X).flatten()
                score = np.average((Y_pred_diff - cate_pred)**2, weights=sample_weight, axis=0)
                scores.append(score)
            return np.mean(scores)
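
# Illustrative sketch (not part of the public API) of the final stage that _ModelFinal performs when
# multitask_model_final=False: one clone of the final model per non-baseline arm, fit on the doubly
# robust contrasts Y_t^DR - Y_0^DR against the (featurized) heterogeneity features X. The function
# name is hypothetical and the pseudo-outcomes are assumed to be a plain (n, d_t + 1) numpy array,
# e.g. as produced by the pseudo-outcome sketch earlier in this module.
def _fit_final_stage_sketch(Y_dr, X, model_final):
    """Fit one clone of ``model_final`` per treatment arm on the doubly robust contrasts."""
    models = []
    for t in range(1, Y_dr.shape[1]):
        target = Y_dr[:, t] - Y_dr[:, 0]  # doubly robust CATE target for arm t vs. the baseline arm
        models.append(clone(model_final, safe=False).fit(X, target))
    return models  # models[t - 1].predict(X) then estimates theta_t(X)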


[docs]class DRLearner(_OrthoLearner): """ CATE estimator that uses doubly-robust correction techniques to account for covariate shift (selection bias) between the treatment arms. The estimator is a special case of an :class:`._OrthoLearner` estimator, so it follows the two stage process, where a set of nuisance functions are estimated in the first stage in a crossfitting manner and a final stage estimates the CATE model. See the documentation of :class:`._OrthoLearner` for a description of this two stage process. In this estimator, the CATE is estimated by using the following estimating equations. If we let: .. math :: Y_{i, t}^{DR} = E[Y | X_i, W_i, T_i=t]\ + \\frac{Y_i - E[Y | X_i, W_i, T_i=t]}{Pr[T_i=t | X_i, W_i]} \\cdot 1\\{T_i=t\\} Then the following estimating equation holds: .. math :: E\\left[Y_{i, t}^{DR} - Y_{i, 0}^{DR} | X_i\\right] = \\theta_t(X_i) Thus if we estimate the nuisance functions :math:`h(X, W, T) = E[Y | X, W, T]` and :math:`p_t(X, W)=Pr[T=t | X, W]` in the first stage, we can estimate the final stage cate for each treatment t, by running a regression, regressing :math:`Y_{i, t}^{DR} - Y_{i, 0}^{DR}` on :math:`X_i`. The problem of estimating the nuisance function :math:`p` is a simple multi-class classification problem of predicting the label :math:`T` from :math:`X, W`. The :class:`.DRLearner` class takes as input the parameter ``model_propensity``, which is an arbitrary scikit-learn classifier, that is internally used to solve this classification problem. The second nuisance function :math:`h` is a simple regression problem and the :class:`.DRLearner` class takes as input the parameter ``model_regressor``, which is an arbitrary scikit-learn regressor that is internally used to solve this regression problem. The final stage is multi-task regression problem with outcomes the labels :math:`Y_{i, t}^{DR} - Y_{i, 0}^{DR}` for each non-baseline treatment t. The :class:`.DRLearner` takes as input parameter ``model_final``, which is any scikit-learn regressor that is internally used to solve this multi-task regresion problem. If the parameter ``multitask_model_final`` is False, then this model is assumed to be a mono-task regressor, and separate clones of it are used to solve each regression target separately. Parameters ---------- model_propensity: estimator, default ``'auto'`` Classifier for Pr[T=t | X, W]. Trained by regressing treatments on (features, controls) concatenated. - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - Otherwise, see :ref:`model_selection` for the range of supported options model_regression: estimator, default ``'auto'`` Estimator for E[Y | X, W, T]. Trained by regressing Y on (features, controls, one-hot-encoded treatments) concatenated. The one-hot-encoding excludes the baseline treatment. - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - Otherwise, see :ref:`model_selection` for the range of supported options; if a single model is specified it should be a classifier if `discrete_outcome` is True and a regressor otherwise model_final : estimator for the final cate model. Trained on regressing the doubly robust potential outcomes on (features X). - If X is None, then the fit method of model_final should be able to handle X=None. - If featurizer is not None and X is not None, then it is trained on the outcome of featurizer.fit_transform(X). 
- If multitask_model_final is True, then this model must support multitasking and it is trained by regressing all doubly robust target outcomes on (featurized) features simultanteously. - The output of the predict(X) of the trained model will contain the CATEs for each treatment compared to baseline treatment (lexicographically smallest). If multitask_model_final is False, it is assumed to be a mono-task model and a separate clone of the model is trained for each outcome. Then predict(X) of the t-th clone will be the CATE of the t-th lexicographically ordered treatment compared to the baseline. discrete_outcome: bool, default False Whether the outcome should be treated as binary multitask_model_final : bool, default False Whether the model_final should be treated as a multi-task model. See description of model_final. featurizer : :term:`transformer`, optional Must support fit_transform and transform. Used to create composite features in the final CATE regression. It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X). If featurizer=None, then CATE is trained on X. min_propensity : float, default ``1e-6`` The minimum propensity at which to clip propensity estimates to avoid dividing by zero. categories: 'auto' or list, default 'auto' The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). The first category will be treated as the control treatment. cv: int, cross-validation generator or an iterable, default 2 Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 3-fold cross-validation, - integer, to specify the number of folds. - :term:`CV splitter` - An iterable yielding (train, test) splits as arrays of indices. For integer/None inputs, if the treatment is discrete :class:`~sklearn.model_selection.StratifiedKFold` is used, else, :class:`~sklearn.model_selection.KFold` is used (with a random shuffle in either case). Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. mc_iters: int, optional The number of times to rerun the first stage models to reduce the variance of the nuisances. mc_agg: {'mean', 'median'}, default 'mean' How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of cross-fitting. random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None If int, random_state is the seed used by the random number generator; If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used by :mod:`np.random<numpy.random>`. allow_missing: bool Whether to allow missing values in X, W. If True, will need to supply model_propensity, model_regression, and model_final that can handle missing values. use_ray: bool, default False Whether to use Ray to parallelize the cross-fitting step. If True, Ray must be installed. ray_remote_func_options : dict, default None Options to pass to the remote function when using Ray. See https://docs.ray.io/en/latest/ray-core/api/doc/ray.remote.html Examples -------- A simple example with the default models: .. testcode:: :hide: import numpy as np import scipy.special np.set_printoptions(suppress=True) .. 
testcode:: from econml.dr import DRLearner np.random.seed(123) X = np.random.normal(size=(1000, 3)) T = np.random.binomial(2, scipy.special.expit(X[:, 0])) sigma = 0.001 y = (1 + .5*X[:, 0]) * T + X[:, 0] + np.random.normal(0, sigma, size=(1000,)) est = DRLearner() est.fit(y, T, X=X, W=None) >>> est.const_marginal_effect(X[:2]) array([[0.516931..., 0.995704...], [0.356427..., 0.671870...]]) >>> est.effect(X[:2], T0=0, T1=1) array([0.516931..., 0.356427...]) >>> est.score_ 2.84365756... >>> est.score(y, T, X=X) 1.06259465... >>> est.model_cate(T=1).coef_ array([ 0.447095..., -0.001013... , 0.018982...]) >>> est.model_cate(T=2).coef_ array([ 0.925055..., -0.012357... , 0.033489...]) >>> est.cate_feature_names() ['X0', 'X1', 'X2'] Beyond default models: .. testcode:: from sklearn.linear_model import LassoCV from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from econml.dr import DRLearner np.random.seed(123) X = np.random.normal(size=(1000, 3)) T = np.random.binomial(2, scipy.special.expit(X[:, 0])) sigma = 0.01 y = (1 + .5*X[:, 0]) * T + X[:, 0] + np.random.normal(0, sigma, size=(1000,)) est = DRLearner(model_propensity=RandomForestClassifier(n_estimators=100, min_samples_leaf=10), model_regression=RandomForestRegressor(n_estimators=100, min_samples_leaf=10), model_final=LassoCV(cv=3), featurizer=None) est.fit(y, T, X=X, W=None) >>> est.score_ 1.7... >>> est.const_marginal_effect(X[:3]) array([[0.68..., 1.10...], [0.56..., 0.79... ], [0.34..., 0.10... ]]) >>> est.model_cate(T=2).coef_ array([0.74..., 0. , 0. ]) >>> est.model_cate(T=2).intercept_ 1.9... >>> est.model_cate(T=1).coef_ array([0.24..., 0.00..., 0. ]) >>> est.model_cate(T=1).intercept_ 0.94... Attributes ---------- score_ : float The MSE in the final doubly robust potential outcome regressions, i.e. .. math:: \\frac{1}{n_t} \\sum_{t=1}^{n_t} \\frac{1}{n} \\sum_{i=1}^n (Y_{i, t}^{DR} - \\hat{\\theta}_t(X_i))^2 where n_t is the number of treatments (excluding control). If `sample_weight` is not None at fit time, then a weighted average across samples is returned. """
    def __init__(self, *,
                 model_propensity='auto',
                 model_regression='auto',
                 model_final=StatsModelsLinearRegression(),
                 discrete_outcome=False,
                 multitask_model_final=False,
                 featurizer=None,
                 min_propensity=1e-6,
                 categories='auto',
                 cv=2,
                 mc_iters=None,
                 mc_agg='mean',
                 random_state=None,
                 allow_missing=False,
                 use_ray=False,
                 ray_remote_func_options=None):
        self.model_propensity = clone(model_propensity, safe=False)
        self.model_regression = clone(model_regression, safe=False)
        self.model_final = clone(model_final, safe=False)
        self.multitask_model_final = multitask_model_final
        self.featurizer = clone(featurizer, safe=False)
        self.min_propensity = min_propensity
        super().__init__(cv=cv,
                         mc_iters=mc_iters,
                         mc_agg=mc_agg,
                         discrete_outcome=discrete_outcome,
                         discrete_treatment=True,
                         treatment_featurizer=None,  # treatment featurization not supported with discrete treatment
                         discrete_instrument=False,  # no instrument, so doesn't matter
                         categories=categories,
                         random_state=random_state,
                         allow_missing=allow_missing,
                         use_ray=use_ray,
                         ray_remote_func_options=ray_remote_func_options)

    def _gen_allowed_missing_vars(self):
        return ['X', 'W'] if self.allow_missing else []

    # override only so that we can exclude treatment featurization verbiage in docstring
    def const_marginal_effect(self, X=None):
        """
        Calculate the constant marginal CATE :math:`\\theta(·)`.

        The marginal effect is conditional on a vector of features on a set of m test samples X[i].

        Parameters
        ----------
        X: (m, d_x) matrix, optional
            Features for each sample.

        Returns
        -------
        theta: (m, d_y, d_t) matrix or (d_y, d_t) matrix if X is None
            Constant marginal CATE of each treatment on each outcome for each sample X[i].
            Note that when Y or T is a vector rather than a 2-dimensional array,
            the corresponding singleton dimensions in the output will be collapsed
            (e.g. if both are vectors, then the output of this method will also be a vector)
        """
        return super().const_marginal_effect(X=X)

    # override only so that we can exclude treatment featurization verbiage in docstring
    def const_marginal_ate(self, X=None):
        """
        Calculate the average constant marginal CATE :math:`E_X[\\theta(X)]`.

        Parameters
        ----------
        X: (m, d_x) matrix, optional
            Features for each sample.

        Returns
        -------
        theta: (d_y, d_t) matrix
            Average constant marginal CATE of each treatment on each outcome.
            Note that when Y or T is a vector rather than a 2-dimensional array,
            the corresponding singleton dimensions in the output will be collapsed
            (e.g. if both are vectors, then the output of this method will be a scalar)
        """
        return super().const_marginal_ate(X=X)

    def _get_inference_options(self):
        options = super()._get_inference_options()
        if not self.multitask_model_final:
            options.update(auto=GenericModelFinalInferenceDiscrete)
        else:
            options.update(auto=lambda: None)
        return options

    def _gen_ortho_learner_model_nuisance(self):
        model_propensity = _make_first_stage_selector(self.model_propensity, True, self.random_state)
        model_regression = _make_first_stage_selector(self.model_regression, self.discrete_outcome,
                                                      self.random_state)
        return _ModelNuisance(model_propensity, model_regression, self.min_propensity, self.discrete_outcome)

    def _gen_featurizer(self):
        return clone(self.featurizer, safe=False)

    def _gen_model_final(self):
        return clone(self.model_final, safe=False)

    def _gen_ortho_learner_model_final(self):
        return _ModelFinal(self._gen_model_final(), self._gen_featurizer(), self.multitask_model_final)

[docs] def fit(self, Y, T, *, X=None, W=None, sample_weight=None, freq_weight=None, sample_var=None, groups=None, cache_values=False, inference='auto'): """ Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. Parameters ---------- Y: (n,) vector of length n Outcomes for each sample T: (n,) vector of length n Treatments for each sample X:(n, d_x) matrix, optional Features for each sample W:(n, d_w) matrix, optional Controls for each sample sample_weight : (n,) array_like, optional Individual weights for each sample. If None, it assumes equal weight. freq_weight: (n,) array_like of int, optional Weight for the observation. Observation i is treated as the mean outcome of freq_weight[i] independent observations. When ``sample_var`` is not None, this should be provided. sample_var : (n,) nd array_like, optional Variance of the outcome(s) of the original freq_weight[i] observations that were used to compute the mean outcome represented by observation i. groups: (n,) vector, optional All rows corresponding to the same group will be kept together during splitting. If groups is not None, the `cv` argument passed to this class's initializer must support a 'groups' argument to its split method. cache_values: bool, default False Whether to cache inputs and first stage results, which will allow refitting a different final model inference: str, :class:`.Inference` instance, or None Method for performing inference. This estimator supports 'bootstrap' (or an instance of :class:`.BootstrapInference`). Returns ------- self: DRLearner instance """ # Replacing fit from _OrthoLearner, to enforce Z=None and improve the docstring return super().fit(Y, T, X=X, W=W, sample_weight=sample_weight, freq_weight=freq_weight, sample_var=sample_var, groups=groups, cache_values=cache_values, inference=inference)
    def refit_final(self, *, inference='auto'):
        return super().refit_final(inference=inference)
    refit_final.__doc__ = _OrthoLearner.refit_final.__doc__

[docs] def score(self, Y, T, X=None, W=None, sample_weight=None): """ Score the fitted CATE model on a new data set. Generates nuisance parameters for the new data set based on the fitted residual nuisance models created at fit time. It uses the mean prediction of the models fitted by the different crossfit folds. Then calculates the MSE of the final residual Y on residual T regression. If model_final does not have a score method, then it raises an :exc:`.AttributeError` Parameters ---------- Y: (n,) vector of length n Outcomes for each sample T: (n,) vector of length n Treatments for each sample X:(n, d_x) matrix, optional Features for each sample W:(n, d_w) matrix, optional Controls for each sample sample_weight:(n,) vector, optional Weights for each samples Returns ------- score: float The MSE of the final CATE model on the new data. """ # Replacing score from _OrthoLearner, to enforce Z=None and improve the docstring return super().score(Y, T, X=X, W=W, sample_weight=sample_weight)
    @property
    def multitask_model_cate(self):
        """
        Get the fitted final CATE model.

        Returns
        -------
        multitask_model_cate: object of type(`model_final`)
            An instance of the model_final object that was fitted after calling fit,
            whose vector of outcomes corresponds to the CATE model for each treatment, compared to
            baseline. Available only when multitask_model_final=True.
        """
        if not self.ortho_learner_model_final_._multitask_model_final:
            raise AttributeError("Separate CATE models were fitted for each treatment! Use model_cate.")
        return self.ortho_learner_model_final_.model_cate

    def model_cate(self, T=1):
        """
        Get the fitted final CATE model.

        Parameters
        ----------
        T: alphanumeric
            The treatment with respect to which we want the fitted CATE model.

        Returns
        -------
        model_cate: object of type(model_final)
            An instance of the model_final object that was fitted after calling fit
            which corresponds to the CATE model for treatment T=t, compared to baseline.
            Available when multitask_model_final=False.
        """
        if self.ortho_learner_model_final_._multitask_model_final:
            raise AttributeError("A single multitask model was fitted for all treatments! Use multitask_model_cate.")
        _, T = self._expand_treatments(None, T)
        ind = inverse_onehot(T).item() - 1
        assert ind >= 0, "No model was fitted for the control"
        return self.ortho_learner_model_final_.models_cate[ind]

    @property
    def models_propensity(self):
        """
        Get the fitted propensity models.

        Returns
        -------
        models_propensity: nested list of objects of type(`model_propensity`)
            A nested list of instances of the `model_propensity` object. The number of sublists equals the
            number of monte carlo iterations; each element in a sublist corresponds to a crossfitting fold
            and is the model instance that was fitted for that training fold.
        """
        return [[mdl._model_propensity.best_model for mdl in mdls] for mdls in super().models_nuisance_]

    @property
    def models_regression(self):
        """
        Get the fitted regression models.

        Returns
        -------
        model_regression: nested list of objects of type(`model_regression`)
            A nested list of instances of the model_regression object. The number of sublists equals the
            number of monte carlo iterations; each element in a sublist corresponds to a crossfitting fold
            and is the model instance that was fitted for that training fold.
        """
        return [[mdl._model_regression.best_model for mdl in mdls] for mdls in super().models_nuisance_]

    @property
    def nuisance_scores_propensity(self):
        """Gets the score for the propensity model on out-of-sample training data"""
        return self.nuisance_scores_[0]

    @property
    def nuisance_scores_regression(self):
        """Gets the score for the regression model on out-of-sample training data"""
        return self.nuisance_scores_[1]

    @property
    def featurizer_(self):
        """
        Get the fitted featurizer.

        Returns
        -------
        featurizer: object of type(`featurizer`)
            An instance of the fitted featurizer that was used to preprocess X in the final CATE model training.
            Available only when featurizer is not None and X is not None.
        """
        return self.ortho_learner_model_final_._featurizer

    def cate_feature_names(self, feature_names=None):
        """
        Get the output feature names.

        Parameters
        ----------
        feature_names: list of str of length X.shape[1] or None
            The names of the input features. If None and X is a dataframe, it defaults to the column names
            from the dataframe.

        Returns
        -------
        out_feature_names: list of str or None
            The names of the output features :math:`\\phi(X)`, i.e. the features with respect to which the
            final CATE model for each treatment is linear. It is the names of the features that are associated
            with each entry of the :meth:`coef_` parameter. Available only when the featurizer is not None and
            has a method: `get_feature_names(feature_names)`. Otherwise None is returned.
        """
        if self._d_x is None:
            # Handles the corner case when X=None but featurizer might be not None
            return None
        if feature_names is None:
            feature_names = self._input_names["feature_names"]
        if self.featurizer_ is None:
            return feature_names
        return get_feature_names_or_default(self.featurizer_, feature_names)

    @property
    def model_final_(self):
        return self.ortho_learner_model_final_._model_final

    @property
    def fitted_models_final(self):
        return self.ortho_learner_model_final_.models_cate

    def shap_values(self, X, *, feature_names=None, treatment_names=None, output_names=None, background_samples=100):
        if self.ortho_learner_model_final_._multitask_model_final:
            return _shap_explain_multitask_model_cate(self.const_marginal_effect, self.multitask_model_cate, X,
                                                      self._d_t, self._d_y,
                                                      featurizer=self.featurizer_,
                                                      feature_names=feature_names,
                                                      treatment_names=treatment_names,
                                                      output_names=output_names,
                                                      input_names=self._input_names,
                                                      background_samples=background_samples)
        else:
            return _shap_explain_model_cate(self.const_marginal_effect, self.fitted_models_final,
                                            X, self._d_t, self._d_y,
                                            featurizer=self.featurizer_,
                                            feature_names=feature_names,
                                            treatment_names=treatment_names,
                                            output_names=output_names,
                                            input_names=self._input_names,
                                            background_samples=background_samples)
    shap_values.__doc__ = LinearCateEstimator.shap_values.__doc__

[docs]class LinearDRLearner(StatsModelsCateEstimatorDiscreteMixin, DRLearner): """ Special case of the :class:`.DRLearner` where the final stage is a Linear Regression on a low dimensional set of features. In this case, inference can be performed via the asymptotic normal characterization of the estimated parameters. This is computationally faster than bootstrap inference. To do this, just leave the setting ``inference='auto'`` unchanged, or explicitly set ``inference='statsmodels'`` or alter the covariance type calculation via ``inference=StatsModelsInferenceDiscrete(cov_type='HC1)``. More concretely, this estimator assumes that the final cate model for each treatment takes a linear form: .. math :: \\theta_t(X) = \\left\\langle \\theta_t, \\phi(X) \\right\\rangle + \\beta_t where :math:`\\phi(X)` is the outcome features of the featurizers, or `X` if featurizer is None. :math:`\\beta_t` is an intercept of the CATE, which is included if ``fit_cate_intercept=True`` (Default). It fits this by running a standard ordinary linear regression (OLS), regressing the doubly robust outcome differences on X: .. math :: \\min_{\\theta_t, \\beta_t}\ E_n\\left[\\left(Y_{i, t}^{DR} - Y_{i, 0}^{DR}\ - \\left\\langle \\theta_t, \\phi(X_i) \\right\\rangle - \\beta_t\\right)^2\\right] Then inference can be performed via standard approaches for inference of OLS, via asympotic normal approximations of the estimated parameters. The default covariance estimator used is heteroskedasticity robust (HC1). For other methods see :class:`.StatsModelsInferenceDiscrete`. Use can invoke them by setting: ``inference=StatsModelsInferenceDiscrete(cov_type=...)``. This approach is valid even if the CATE model is not linear in :math:`\\phi(X)`. In this case it performs inference on the best linear approximation of the CATE model. Parameters ---------- model_propensity: estimator, default ``'auto'`` Classifier for Pr[T=t | X, W]. Trained by regressing treatments on (features, controls) concatenated. - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - Otherwise, see :ref:`model_selection` for the range of supported options model_regression: estimator, default ``'auto'`` Estimator for E[Y | X, W, T]. Trained by regressing Y on (features, controls, one-hot-encoded treatments) concatenated. The one-hot-encoding excludes the baseline treatment. - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - Otherwise, see :ref:`model_selection` for the range of supported options; if a single model is specified it should be a classifier if `discrete_outcome` is True and a regressor otherwise featurizer : :term:`transformer`, optional Must support fit_transform and transform. Used to create composite features in the final CATE regression. It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X). If featurizer=None, then CATE is trained on X. fit_cate_intercept : bool, default True Whether the linear CATE model should have a constant term. discrete_outcome: bool, default False Whether the outcome should be treated as binary min_propensity : float, default ``1e-6`` The minimum propensity at which to clip propensity estimates to avoid dividing by zero. categories: 'auto' or list, default 'auto' The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). The first category will be treated as the control treatment. 
cv: int, cross-validation generator or an iterable, default 2 Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 3-fold cross-validation, - integer, to specify the number of folds. - :term:`CV splitter` - An iterable yielding (train, test) splits as arrays of indices. For integer/None inputs, if the treatment is discrete :class:`~sklearn.model_selection.StratifiedKFold` is used, else, :class:`~sklearn.model_selection.KFold` is used (with a random shuffle in either case). Unless an iterable is used, we call `split(X,T)` to generate the splits. mc_iters: int, optional The number of times to rerun the first stage models to reduce the variance of the nuisances. mc_agg: {'mean', 'median'}, default 'mean' How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of cross-fitting. random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None If int, random_state is the seed used by the random number generator; If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used by :mod:`np.random<numpy.random>`. allow_missing: bool Whether to allow missing values in W. If True, will need to supply model_propensity and model_regression that can handle missing values. enable_federation: bool, default False Whether to enable federation for the final model. This has a memory cost so should be enabled only if this model will be aggregated with other models. use_ray: bool, default False Whether to use Ray to parallelize the cross-fitting step. If True, Ray must be installed. ray_remote_func_options : dict, default None Options to pass to the remote function when using Ray. See https://docs.ray.io/en/latest/ray-core/api/doc/ray.remote.html Examples -------- A simple example with the default models: .. testcode:: :hide: import numpy as np import scipy.special np.set_printoptions(suppress=True) .. testcode:: from econml.dr import DRLearner, LinearDRLearner np.random.seed(123) X = np.random.normal(size=(1000, 3)) T = np.random.binomial(2, scipy.special.expit(X[:, 0])) y = (1 + .5*X[:, 0]) * T + X[:, 0] + np.random.normal(size=(1000,)) est = LinearDRLearner() est.fit(y, T, X=X, W=None) >>> est.effect(X[:3]) array([ 0.432476..., 0.359739..., -0.085326...]) >>> est.effect_interval(X[:3]) (array([ 0.084145... , -0.178020..., -0.734688...]), array([0.780807..., 0.897500..., 0.564035...])) >>> est.coef_(T=1) array([ 0.450620..., -0.008792..., 0.075242...]) >>> est.coef__interval(T=1) (array([ 0.156233... , -0.252177..., -0.159805...]), array([0.745007..., 0.234592..., 0.310290...])) >>> est.intercept_(T=1) 0.90916103... >>> est.intercept__interval(T=1) (0.66855287..., 1.14976919...) Attributes ---------- score_ : float The MSE in the final doubly robust potential outcome regressions, i.e. .. math:: \\frac{1}{n_t} \\sum_{t=1}^{n_t} \\frac{1}{n} \\sum_{i=1}^n (Y_{i, t}^{DR} - \\hat{\\theta}_t(X_i))^2 where n_t is the number of treatments (excluding control). If `sample_weight` is not None at fit time, then a weighted average across samples is returned. """
    def __init__(self, *,
                 model_propensity='auto',
                 model_regression='auto',
                 featurizer=None,
                 fit_cate_intercept=True,
                 discrete_outcome=False,
                 min_propensity=1e-6,
                 categories='auto',
                 cv=2,
                 mc_iters=None,
                 mc_agg='mean',
                 random_state=None,
                 allow_missing=False,
                 enable_federation=False,
                 use_ray=False,
                 ray_remote_func_options=None):
        self.fit_cate_intercept = fit_cate_intercept
        self.enable_federation = enable_federation
        super().__init__(model_propensity=model_propensity,
                         model_regression=model_regression,
                         model_final=None,
                         discrete_outcome=discrete_outcome,
                         featurizer=featurizer,
                         multitask_model_final=False,
                         min_propensity=min_propensity,
                         categories=categories,
                         cv=cv,
                         mc_iters=mc_iters,
                         mc_agg=mc_agg,
                         random_state=random_state,
                         allow_missing=allow_missing,
                         use_ray=use_ray,
                         ray_remote_func_options=ray_remote_func_options)

    def _gen_allowed_missing_vars(self):
        return ['W'] if self.allow_missing else []

    def _gen_model_final(self):
        return StatsModelsLinearRegression(fit_intercept=self.fit_cate_intercept,
                                           enable_federation=self.enable_federation)

    def _gen_ortho_learner_model_final(self):
        return _ModelFinal(self._gen_model_final(), self._gen_featurizer(), False)

[docs] def fit(self, Y, T, *, X=None, W=None, sample_weight=None, freq_weight=None, sample_var=None, groups=None, cache_values=False, inference='auto'): """ Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. Parameters ---------- Y: (n,) vector of length n Outcomes for each sample T: (n,) vector of length n Treatments for each sample X:(n, d_x) matrix, optional Features for each sample W:(n, d_w) matrix, optional Controls for each sample sample_weight : (n,) array_like, optional Individual weights for each sample. If None, it assumes equal weight. freq_weight: (n,) array_like of int, optional Weight for the observation. Observation i is treated as the mean outcome of freq_weight[i] independent observations. When ``sample_var`` is not None, this should be provided. sample_var : (n,) nd array_like, optional Variance of the outcome(s) of the original freq_weight[i] observations that were used to compute the mean outcome represented by observation i. groups: (n,) vector, optional All rows corresponding to the same group will be kept together during splitting. If groups is not None, the `cv` argument passed to this class's initializer must support a 'groups' argument to its split method. cache_values: bool, default False Whether to cache inputs and first stage results, which will allow refitting a different final model inference: str, :class:`.Inference` instance, or None Method for performing inference. This estimator supports ``'bootstrap'`` (or an instance of :class:`.BootstrapInference`) and ``'statsmodels'`` (or an instance of :class:`.StatsModelsInferenceDiscrete`). Returns ------- self: DRLearner instance """ # Replacing fit from DRLearner, to add statsmodels inference in docstring return super().fit(Y, T, X=X, W=W, sample_weight=sample_weight, freq_weight=freq_weight, sample_var=sample_var, groups=groups, cache_values=cache_values, inference=inference)
    @property
    def fit_cate_intercept_(self):
        return self.model_final_.fit_intercept

    @property
    def multitask_model_cate(self):
        # Replacing this method which is invalid for this class, so that we make the
        # docstring empty and it does not appear in the docs.
        return super().multitask_model_cate

    @property
    def multitask_model_final(self):
        return False

    @multitask_model_final.setter
    def multitask_model_final(self, value):
        if value:
            raise ValueError("Parameter `multitask_model_final` cannot change from `False` for this estimator!")

    @property
    def model_final(self):
        return self._gen_model_final()

    @model_final.setter
    def model_final(self, model):
        if model is not None:
            raise ValueError("Parameter `model_final` cannot be altered for this estimator!")

[docs]class SparseLinearDRLearner(DebiasedLassoCateEstimatorDiscreteMixin, DRLearner): """ Special case of the :class:`.DRLearner` where the final stage is a Debiased Lasso Regression. In this case, inference can be performed via the debiased lasso approach and its asymptotic normal characterization of the estimated parameters. This is computationally faster than bootstrap inference. Leave the default ``inference='auto'`` unchanged, or explicitly set ``inference='debiasedlasso'`` at fit time to enable inference via asymptotic normality. More concretely, this estimator assumes that the final cate model for each treatment takes a linear form: .. math :: \\theta_t(X) = \\left\\langle \\theta_t, \\phi(X) \\right\\rangle + \\beta_t where :math:`\\phi(X)` is the outcome features of the featurizers, or `X` if featurizer is None. :math:`\\beta_t` is a an intercept of the CATE, which is included if ``fit_cate_intercept=True`` (Default). It fits this by running a debiased lasso regression (i.e. :math:`\\ell_1`-penalized regression with debiasing), regressing the doubly robust outcome differences on X: i.e. first solves the penalized square loss problem .. math :: \\min_{\\theta_t, \\beta_t}\ E_n\\left[\\left(Y_{i, t}^{DR} - Y_{i, 0}^{DR}\ - \\left\\langle \\theta_t, \\phi(X_i) \\right\\rangle - \\beta_t\\right)^2\\right]\ + \\lambda \\left\\lVert \\theta_t \\right\\rVert_1 and then adds a debiasing correction to the solution. If alpha='auto' (recommended), then the penalty weight :math:`\\lambda` is set optimally via cross-validation. This approach is valid even if the CATE model is not linear in :math:`\\phi(X)`. In this case it performs inference on the best sparse linear approximation of the CATE model. Parameters ---------- model_propensity: estimator, default ``'auto'`` Classifier for Pr[T=t | X, W]. Trained by regressing treatments on (features, controls) concatenated. - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - Otherwise, see :ref:`model_selection` for the range of supported options model_regression: estimator, default ``'auto'`` Estimator for E[Y | X, W, T]. Trained by regressing Y on (features, controls, one-hot-encoded treatments) concatenated. The one-hot-encoding excludes the baseline treatment. - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - Otherwise, see :ref:`model_selection` for the range of supported options; if a single model is specified it should be a classifier if `discrete_outcome` is True and a regressor otherwise featurizer : :term:`transformer`, optional Must support fit_transform and transform. Used to create composite features in the final CATE regression. It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X). If featurizer=None, then CATE is trained on X. fit_cate_intercept : bool, default True Whether the linear CATE model should have a constant term. discrete_outcome: bool, default False Whether the outcome should be treated as binary alpha: str | float, optional., default 'auto'. CATE L1 regularization applied through the debiased lasso in the final model. 'auto' corresponds to a CV form of the :class:`DebiasedLasso`. n_alphas : int, default 100 How many alphas to try if alpha='auto' alpha_cov : str | float, default 'auto' The regularization alpha that is used when constructing the pseudo inverse of the covariance matrix Theta used to for correcting the final state lasso coefficient in the debiased lasso. 
Each such regression corresponds to the regression of one feature on the remainder of the features. n_alphas_cov : int, default 10 How many alpha_cov to try if alpha_cov='auto'. max_iter : int, default 1000 The maximum number of iterations in the Debiased Lasso tol : float, default 1e-4 The tolerance for the optimization: if the updates are smaller than ``tol``, the optimization code checks the dual gap for optimality and continues until it is smaller than ``tol``. n_jobs : int or None, optional The number of jobs to run in parallel for both `fit` and `predict`. ``None`` means 1 unless in a :func:`joblib.parallel_backend` context. ``-1`` means using all processors. min_propensity : float, default ``1e-6`` The minimum propensity at which to clip propensity estimates to avoid dividing by zero. categories: 'auto' or list, default 'auto' The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). The first category will be treated as the control treatment. cv: int, cross-validation generator or an iterable, default 2 Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 3-fold cross-validation, - integer, to specify the number of folds. - :term:`CV splitter` - An iterable yielding (train, test) splits as arrays of indices. For integer/None inputs, if the treatment is discrete :class:`~sklearn.model_selection.StratifiedKFold` is used, else, :class:`~sklearn.model_selection.KFold` is used (with a random shuffle in either case). Unless an iterable is used, we call `split(X,T)` to generate the splits. mc_iters: int, optional The number of times to rerun the first stage models to reduce the variance of the nuisances. mc_agg: {'mean', 'median'}, default 'mean' How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of cross-fitting. random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None If int, random_state is the seed used by the random number generator; If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used by :mod:`np.random<numpy.random>`. allow_missing: bool Whether to allow missing values in W. If True, will need to supply model_propensity and model_regression that can handle missing values. use_ray: bool, default False Whether to use Ray to parallelize the cross-validation step. If True, Ray must be installed. ray_remote_func_options : dict, default None Options to pass to the remote function when using Ray. See https://docs.ray.io/en/latest/ray-core/api/doc/ray.remote.html Examples -------- A simple example with the default models: .. testcode:: :hide: import numpy as np import scipy.special np.set_printoptions(suppress=True) .. testcode:: from econml.dr import DRLearner, SparseLinearDRLearner np.random.seed(123) X = np.random.normal(size=(1000, 3)) T = np.random.binomial(2, scipy.special.expit(X[:, 0])) y = (1 + .5*X[:, 0]) * T + X[:, 0] + np.random.normal(size=(1000,)) est = SparseLinearDRLearner() est.fit(y, T, X=X, W=None) >>> est.effect(X[:3]) array([ 0.43..., 0.35..., -0.08... ]) >>> est.effect_interval(X[:3]) (array([-0.01..., -0.26..., -0.81...]), array([0.87..., 0.98..., 0.65...])) >>> est.coef_(T=1) array([ 0.44..., -0.00..., 0.07...]) >>> est.coef__interval(T=1) (array([ 0.19... , -0.24..., -0.17...]), array([0.70..., 0.22..., 0.32...])) >>> est.intercept_(T=1) 0.90... 
>>> est.intercept__interval(T=1) (0.66..., 1.14...) Attributes ---------- score_ : float The MSE in the final doubly robust potential outcome regressions, i.e. .. math:: \\frac{1}{n_t} \\sum_{t=1}^{n_t} \\frac{1}{n} \\sum_{i=1}^n (Y_{i, t}^{DR} - \\hat{\\theta}_t(X_i))^2 where n_t is the number of treatments (excluding control). If `sample_weight` is not None at fit time, then a weighted average across samples is returned. """
    def __init__(self, *,
                 model_propensity='auto',
                 model_regression='auto',
                 featurizer=None,
                 fit_cate_intercept=True,
                 discrete_outcome=False,
                 alpha='auto',
                 n_alphas=100,
                 alpha_cov='auto',
                 n_alphas_cov=10,
                 max_iter=1000,
                 tol=1e-4,
                 n_jobs=None,
                 min_propensity=1e-6,
                 categories='auto',
                 cv=2,
                 mc_iters=None,
                 mc_agg='mean',
                 random_state=None,
                 allow_missing=False,
                 use_ray=False,
                 ray_remote_func_options=None):
        self.fit_cate_intercept = fit_cate_intercept
        self.alpha = alpha
        self.n_alphas = n_alphas
        self.alpha_cov = alpha_cov
        self.n_alphas_cov = n_alphas_cov
        self.max_iter = max_iter
        self.tol = tol
        self.n_jobs = n_jobs
        super().__init__(model_propensity=model_propensity,
                         model_regression=model_regression,
                         model_final=None,
                         discrete_outcome=discrete_outcome,
                         featurizer=featurizer,
                         multitask_model_final=False,
                         min_propensity=min_propensity,
                         categories=categories,
                         cv=cv,
                         mc_iters=mc_iters,
                         mc_agg=mc_agg,
                         random_state=random_state,
                         allow_missing=allow_missing,
                         use_ray=use_ray,
                         ray_remote_func_options=ray_remote_func_options)

    def _gen_allowed_missing_vars(self):
        return ['W'] if self.allow_missing else []

    def _gen_model_final(self):
        return DebiasedLasso(alpha=self.alpha,
                             n_alphas=self.n_alphas,
                             alpha_cov=self.alpha_cov,
                             n_alphas_cov=self.n_alphas_cov,
                             fit_intercept=self.fit_cate_intercept,
                             max_iter=self.max_iter,
                             tol=self.tol,
                             n_jobs=self.n_jobs,
                             random_state=self.random_state)

    def _gen_ortho_learner_model_final(self):
        return _ModelFinal(self._gen_model_final(), self._gen_featurizer(), False)

[docs] def fit(self, Y, T, *, X=None, W=None, sample_weight=None, groups=None, cache_values=False, inference='auto'): """ Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. Parameters ---------- Y: (n,) vector of length n Outcomes for each sample T: (n,) vector of length n Treatments for each sample X:(n, d_x) matrix, optional Features for each sample W:(n, d_w) matrix, optional Controls for each sample sample_weight : (n,) array_like or None Individual weights for each sample. If None, it assumes equal weight. groups: (n,) vector, optional All rows corresponding to the same group will be kept together during splitting. If groups is not None, the `cv` argument passed to this class's initializer must support a 'groups' argument to its split method. cache_values: bool, default False Whether to cache inputs and first stage results, which will allow refitting a different final model inference: str, :class:`.Inference` instance, or None Method for performing inference. This estimator supports ``'bootstrap'`` (or an instance of :class:`.BootstrapInference`) and ``'debiasedlasso'`` (or an instance of :class:`.LinearModelInferenceDiscrete`). Returns ------- self: DRLearner instance """ # TODO: support freq_weight and sample_var in debiased lasso # Replacing fit from DRLearner, to add debiasedlasso inference in docstring check_high_dimensional(X, T, threshold=5, featurizer=self.featurizer, discrete_treatment=self.discrete_treatment, msg="The number of features in the final model (< 5) is too small for a sparse model. " "We recommend using the LinearDRLearner for this low-dimensional setting.") return super().fit(Y, T, X=X, W=W, sample_weight=sample_weight, groups=groups, cache_values=cache_values, inference=inference)
    @property
    def fit_cate_intercept_(self):
        return self.model_final_.fit_intercept

    @property
    def multitask_model_final(self):
        return False

    @multitask_model_final.setter
    def multitask_model_final(self, value):
        if value:
            raise ValueError("Parameter `multitask_model_final` cannot change from `False` for this estimator!")

    @property
    def model_final(self):
        return self._gen_model_final()

    @model_final.setter
    def model_final(self, model):
        if model is not None:
            raise ValueError("Parameter `model_final` cannot be altered for this estimator!")

[docs]class ForestDRLearner(ForestModelFinalCateEstimatorDiscreteMixin, DRLearner):
    """
    Instance of DRLearner with a :class:`~econml.grf.RegressionForest`
    as a final model, so as to enable non-parametric inference.

    Parameters
    ----------
    model_propensity: estimator, default ``'auto'``
        Classifier for Pr[T=t | X, W]. Trained by regressing treatments on (features, controls) concatenated.

        - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models
        - Otherwise, see :ref:`model_selection` for the range of supported options

    model_regression: estimator, default ``'auto'``
        Estimator for E[Y | X, W, T]. Trained by regressing Y on (features, controls, one-hot-encoded treatments)
        concatenated. The one-hot-encoding excludes the baseline treatment.

        - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models
        - Otherwise, see :ref:`model_selection` for the range of supported options;
          if a single model is specified it should be a classifier if `discrete_outcome` is True
          and a regressor otherwise

    discrete_outcome: bool, default False
        Whether the outcome should be treated as binary

    featurizer : :term:`transformer`, optional
        Must support fit_transform and transform. Used to create composite features in the
        final CATE regression. It is ignored if X is None. The final CATE will be trained on
        the outcome of featurizer.fit_transform(X). If featurizer=None, then CATE is trained on X.

    min_propensity : float, default ``1e-6``
        The minimum propensity at which to clip propensity estimates to avoid dividing by zero.

    categories: 'auto' or list, default 'auto'
        The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values).
        The first category will be treated as the control treatment.

    cv: int, cross-validation generator or an iterable, default 2
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 3-fold cross-validation,
        - integer, to specify the number of folds.
        - :term:`CV splitter`
        - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if the treatment is discrete
        :class:`~sklearn.model_selection.StratifiedKFold` is used, else,
        :class:`~sklearn.model_selection.KFold` is used
        (with a random shuffle in either case).

        Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all
        W, X are None, then we call `split(ones((T.shape[0], 1)), T)`.

    mc_iters: int, optional
        The number of times to rerun the first stage models to reduce the variance of the nuisances.

    mc_agg: {'mean', 'median'}, default 'mean'
        How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo
        iterations of cross-fitting.

    n_estimators : int, default 1000
        The total number of trees in the forest. The forest consists of sqrt(n_estimators) sub-forests,
        each containing sqrt(n_estimators) trees.

    max_depth : int or None, optional
        The maximum depth of the tree. If None, then nodes are expanded until
        all leaves are pure or until all leaves contain less than
        min_samples_split samples.

    min_samples_split : int, float, default 5
        The minimum number of splitting samples required to split an internal node.

        - If int, then consider `min_samples_split` as the minimum number.
        - If float, then `min_samples_split` is a fraction and
          `ceil(min_samples_split * n_samples)` are the minimum
          number of samples for each split.

    min_samples_leaf : int, float, default 5
        The minimum number of samples required to be at a leaf node. A split point at any depth will only
        be considered if it leaves at least ``min_samples_leaf`` splitting samples in each of the left and
        right branches. This may have the effect of smoothing the model, especially in regression. After
        construction the tree is also pruned so that there are at least min_samples_leaf estimation samples
        on each leaf.

        - If int, then consider `min_samples_leaf` as the minimum number.
        - If float, then `min_samples_leaf` is a fraction and
          `ceil(min_samples_leaf * n_samples)` are the minimum
          number of samples for each node.

    min_weight_fraction_leaf : float, default 0.
        The minimum weighted fraction of the sum total of weights (of all
        splitting samples) required to be at a leaf node. Samples have
        equal weight when sample_weight is not provided. After construction
        the tree is pruned so that the fraction of the sum total weight
        of the estimation samples contained in each leaf node is at least min_weight_fraction_leaf.

    max_features : int, float, str, or None, default "auto"
        The number of features to consider when looking for the best split:

        - If int, then consider `max_features` features at each split.
        - If float, then `max_features` is a fraction and
          `int(max_features * n_features)` features are considered at each split.
        - If "auto", then `max_features=n_features`.
        - If "sqrt", then `max_features=sqrt(n_features)`.
        - If "log2", then `max_features=log2(n_features)`.
        - If None, then `max_features=n_features`.

        Note: the search for a split does not stop until at least one
        valid partition of the node samples is found, even if it requires to
        effectively inspect more than ``max_features`` features.

    min_impurity_decrease : float, default 0.
        A node will be split if this split induces a decrease of the impurity
        greater than or equal to this value.
        The weighted impurity decrease equation is the following::

            N_t / N * (impurity - N_t_R / N_t * right_impurity
                                - N_t_L / N_t * left_impurity)

        where ``N`` is the total number of split samples, ``N_t`` is the number of
        split samples at the current node, ``N_t_L`` is the number of split samples in the
        left child, and ``N_t_R`` is the number of split samples in the right child.
        ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
        if ``sample_weight`` is passed.

    max_samples : int or float in (0, .5], default .45
        The number of samples to use for each subsample that is used to train each tree:

        - If int, then train each tree on `max_samples` samples, sampled without replacement
          from all the samples
        - If float, then train each tree on `ceil(max_samples * n_samples)`, sampled without
          replacement from all the samples.

    min_balancedness_tol: float in [0, .5], default .45
        How imbalanced a split we can tolerate. This enforces that each split leaves at least
        (.5 - min_balancedness_tol) fraction of samples on each side of the split; or fraction
        of the total weight of samples, when sample_weight is not None. The default value ensures
        that at least 5% of the parent node weight falls on each side of the split. Set it to 0.0
        for no balancedness constraint and to .5 for perfectly balanced splits. For the formal
        inference theory to be valid, this has to be any positive constant bounded away from zero.

    honest : bool, default True
        Whether to use honest trees, i.e. half of the samples are used for
        creating the tree structure and the other half for the estimation at the leaves. If False,
        then all samples are used for both parts.

    subforest_size : int, default 4
        The number of trees in each sub-forest that is used in the bootstrap-of-little-bags calculation.
        The parameter `n_estimators` must be divisible by `subforest_size`. Should typically be a small constant.

    n_jobs : int or None, default -1
        The number of jobs to run in parallel for both `fit` and `predict`.
        ``None`` means 1 unless in a :func:`joblib.parallel_backend` context. ``-1`` means using all processors.
        See :term:`Glossary <n_jobs>` for more details.

    verbose : int, default 0
        Controls the verbosity when fitting and predicting.

    random_state : int, RandomState instance, or None, default None
        If int, random_state is the seed used by the random number generator;
        If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
        If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
        by :mod:`np.random<numpy.random>`.

    allow_missing: bool, default False
        Whether to allow missing values in W. If True, will need to supply model_propensity and
        model_regression that can handle missing values.

    use_ray: bool, default False
        Whether to use Ray to parallelize the cross-validation step. If True, Ray must be installed.

    ray_remote_func_options : dict, default None
        Options to pass to the remote function when using Ray.
        See https://docs.ray.io/en/latest/ray-core/api/doc/ray.remote.html
    """
[docs]    def __init__(self, *,
                 model_regression="auto",
                 model_propensity="auto",
                 discrete_outcome=False,
                 featurizer=None,
                 min_propensity=1e-6,
                 categories='auto',
                 cv=2,
                 mc_iters=None,
                 mc_agg='mean',
                 n_estimators=1000,
                 max_depth=None,
                 min_samples_split=5,
                 min_samples_leaf=5,
                 min_weight_fraction_leaf=0.,
                 max_features="auto",
                 min_impurity_decrease=0.,
                 max_samples=.45,
                 min_balancedness_tol=.45,
                 honest=True,
                 subforest_size=4,
                 n_jobs=-1,
                 verbose=0,
                 random_state=None,
                 allow_missing=False,
                 use_ray=False,
                 ray_remote_func_options=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.min_impurity_decrease = min_impurity_decrease
        self.max_samples = max_samples
        self.min_balancedness_tol = min_balancedness_tol
        self.honest = honest
        self.subforest_size = subforest_size
        self.n_jobs = n_jobs
        self.verbose = verbose
        super().__init__(model_regression=model_regression,
                         model_propensity=model_propensity,
                         model_final=None,
                         discrete_outcome=discrete_outcome,
                         featurizer=featurizer,
                         multitask_model_final=False,
                         min_propensity=min_propensity,
                         categories=categories,
                         cv=cv,
                         mc_iters=mc_iters,
                         mc_agg=mc_agg,
                         random_state=random_state,
                         allow_missing=allow_missing,
                         use_ray=use_ray,
                         ray_remote_func_options=ray_remote_func_options)
    def _gen_allowed_missing_vars(self):
        return ['W'] if self.allow_missing else []

    def _gen_model_final(self):
        return RegressionForest(n_estimators=self.n_estimators,
                                max_depth=self.max_depth,
                                min_samples_split=self.min_samples_split,
                                min_samples_leaf=self.min_samples_leaf,
                                min_weight_fraction_leaf=self.min_weight_fraction_leaf,
                                max_features=self.max_features,
                                min_impurity_decrease=self.min_impurity_decrease,
                                max_samples=self.max_samples,
                                min_balancedness_tol=self.min_balancedness_tol,
                                honest=self.honest,
                                inference=True,
                                subforest_size=self.subforest_size,
                                n_jobs=self.n_jobs,
                                random_state=self.random_state,
                                verbose=self.verbose,
                                warm_start=False)

    def _gen_ortho_learner_model_final(self):
        return _ModelFinal(self._gen_model_final(), self._gen_featurizer(), False)
[docs]    def fit(self, Y, T, *, X=None, W=None, sample_weight=None, groups=None,
            cache_values=False, inference='auto'):
        """
        Estimate the counterfactual model from data, i.e. estimates functions τ(·,·,·), ∂τ(·,·).

        Parameters
        ----------
        Y: (n × d_y) matrix or vector of length n
            Outcomes for each sample
        T: (n × d_t) matrix or vector of length n
            Treatments for each sample
        X: (n × d_x) matrix, optional
            Features for each sample
        W: (n × d_w) matrix, optional
            Controls for each sample
        sample_weight : (n,) array_like or None
            Individual weights for each sample. If None, it assumes equal weight.
        groups: (n,) vector, optional
            All rows corresponding to the same group will be kept together during splitting.
            If groups is not None, the `cv` argument passed to this class's initializer
            must support a 'groups' argument to its split method.
        cache_values: bool, default False
            Whether to cache inputs and first stage results, which will allow refitting a different final model
        inference: str, `Inference` instance, or None
            Method for performing inference. This estimator supports 'bootstrap'
            (or an instance of :class:`.BootstrapInference`) and 'blb'
            (for Bootstrap-of-Little-Bags based inference)

        Returns
        -------
        self
        """
        if X is None:
            raise ValueError("This estimator does not support X=None!")
        return super().fit(Y, T, X=X, W=W,
                           sample_weight=sample_weight, groups=groups,
                           cache_values=cache_values, inference=inference)
[docs]    def multitask_model_cate(self):
        # Overriding only to suppress the inherited docstring; delegate to the parent implementation.
        return super().multitask_model_cate()
    @property
    def multitask_model_final(self):
        return False

    @multitask_model_final.setter
    def multitask_model_final(self, value):
        if value:
            raise ValueError("Parameter `multitask_model_final` cannot change from `False` for this estimator!")

    @property
    def model_final(self):
        return self._gen_model_final()

    @model_final.setter
    def model_final(self, model):
        if model is not None:
            raise ValueError("Parameter `model_final` cannot be altered for this estimator!")
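
The following is a minimal usage sketch, not part of the library source: it shows how a ForestDRLearner might be constructed and fit with a binary treatment, using the ``'blb'`` inference option documented above to obtain bootstrap-of-little-bags confidence intervals. The synthetic data-generating process, variable names, and hyperparameter values below are illustrative assumptions only.

    import numpy as np
    from econml.dr import ForestDRLearner

    rng = np.random.default_rng(123)
    n = 2000
    X = rng.normal(size=(n, 3))           # heterogeneity features
    W = rng.normal(size=(n, 2))           # additional controls
    T = rng.binomial(1, 0.5, size=n)      # binary treatment
    # hypothetical outcome with a CATE of (1 + X[:, 0])
    Y = (1 + X[:, 0]) * T + W[:, 0] + rng.normal(scale=0.5, size=n)

    est = ForestDRLearner(n_estimators=1000, min_samples_leaf=10, random_state=0)
    est.fit(Y, T, X=X, W=W, inference='blb')
    cate = est.effect(X)                          # per-sample CATE point estimates
    lb, ub = est.effect_interval(X, alpha=0.05)   # bootstrap-of-little-bags intervals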