# Source code for econml.dml.causal_forest

# Copyright (c) PyWhy contributors. All rights reserved.
# Licensed under the MIT License.

from warnings import warn

import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from sklearn.base import clone, BaseEstimator
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from itertools import product
from .dml import _BaseDML
from .dml import _make_first_stage_selector
from ..sklearn_extensions.linear_model import WeightedLassoCVWrapper
from ..sklearn_extensions.model_selection import WeightedStratifiedKFold
from ..inference import NormalInferenceResults
from ..inference._inference import Inference
from ..utilities import (add_intercept, shape, check_inputs, check_input_arrays,
                         _deprecate_positional, cross_product, Summary)
from ..grf import CausalForest, MultiOutputGRF
from .._cate_estimator import LinearCateEstimator
from .._shap import _shap_explain_multitask_model_cate
from .._ortho_learner import _OrthoLearner


class _CausalForestFinalWrapper:

    def __init__(self, model_final, featurizer, discrete_treatment, drate):
        self._model = clone(model_final, safe=False)
        self._original_featurizer = clone(featurizer, safe=False)
        self._featurizer = self._original_featurizer
        self._discrete_treatment = discrete_treatment
        self._drate = drate

    def _combine(self, X, fitting=True):
        if X is not None:
            if self._featurizer is not None:
                F = self._featurizer.fit_transform(X) if fitting else self._featurizer.transform(X)
            else:
                F = X
        else:
            raise AttributeError("Cannot use this method with X=None. Consider "
                                 "using the LinearDML estimator.")
        return F

    def _ate_and_stderr(self, drpreds, mask=None):
        if mask is not None:
            drpreds = drpreds[mask]
        point = np.nanmean(drpreds, axis=0).reshape(self._d_y + self._d_t)
        nonnan = np.sum(~np.isnan(drpreds), axis=0)
        stderr = (np.nanstd(drpreds, axis=0) / np.sqrt(nonnan)).reshape(self._d_y + self._d_t)
        return point, stderr

    def fit(self, X, T, T_res, Y_res, sample_weight=None, freq_weight=None, sample_var=None, groups=None):
        # Track training dimensions to see if Y or T is a vector instead of a 2-dimensional array
        self._d_t = shape(T_res)[1:]
        self._d_y = shape(Y_res)[1:]
        fts = self._combine(X)
        if T_res.ndim == 1:
            T_res = T_res.reshape((-1, 1))
        if Y_res.ndim == 1:
            Y_res = Y_res.reshape((-1, 1))
        self._model.fit(fts, T_res, Y_res, sample_weight=sample_weight)
        # Fit a doubly robust average effect
        if self._discrete_treatment and self._drate:
            oob_preds = self._model.oob_predict(fts)
            self._oob_preds = oob_preds
            if np.any(np.isnan(oob_preds)):
                warn("Could not generate out-of-bag predictions on some training data. "
                     "Consider increasing the number of trees. `ate_` results will take the "
                     "average of the subset of training data for which out-of-bag predictions "
                     "where available.")
            residuals = Y_res - np.einsum('ijk,ik->ij', oob_preds, T_res)
            propensities = T - T_res
            VarT = np.clip(propensities * (1 - propensities), 1e-2, np.inf)
            drpreds = oob_preds
            drpreds += cross_product(residuals, T_res / VarT).reshape((-1, Y_res.shape[1], T_res.shape[1]))
            drpreds[np.isnan(oob_preds)] = np.nan
            self.ate_, self.ate_stderr_ = self._ate_and_stderr(drpreds)
            self.att_ = []
            self.att_stderr_ = []
            att, stderr = self._ate_and_stderr(drpreds, np.all(T == 0, axis=1))
            self.att_.append(att)
            self.att_stderr_.append(stderr)
            for t in range(self._d_t[0]):
                att, stderr = self._ate_and_stderr(drpreds, (T[:, t] == 1))
                self.att_.append(att)
                self.att_stderr_.append(stderr)

        return self

    def predict(self, X):
        return self._model.predict(self._combine(X, fitting=False)).reshape((-1,) + self._d_y + self._d_t)

    @property
    def ate_(self):
        if not self._discrete_treatment:
            raise AttributeError("Doubly Robust ATE calculation on training data "
                                 "is available only on discrete treatments!")
        if not self._drate:
            raise AttributeError("Doubly Robust ATE calculation on training data "
                                 "is available only when `drate=True`!")
        return self._ate

    @ate_.setter
    def ate_(self, value):
        self._ate = value

    @property
    def ate_stderr_(self):
        if not self._discrete_treatment:
            raise AttributeError("Doubly Robust ATE calculation on training data "
                                 "is available only on discrete treatments!")
        if not self._drate:
            raise AttributeError("Doubly Robust ATE calculation on training data "
                                 "is available only when `drate=True`!")
        return self._ate_stderr

    @ate_stderr_.setter
    def ate_stderr_(self, value):
        self._ate_stderr = value

    @property
    def att_(self):
        if not self._discrete_treatment:
            raise AttributeError("Doubly Robust ATT calculation on training data "
                                 "is available only on discrete treatments!")
        if not self._drate:
            raise AttributeError("Doubly Robust ATT calculation on training data "
                                 "is available only when `drate=True`!")
        return self._att

    @att_.setter
    def att_(self, value):
        self._att = value

    @property
    def att_stderr_(self):
        if not self._discrete_treatment:
            raise AttributeError("Doubly Robust ATT calculation on training data "
                                 "is available only on discrete treatments!")
        if not self._drate:
            raise AttributeError("Doubly Robust ATT calculation on training data "
                                 "is available only when `drate=True`!")
        return self._att_stderr

    @att_stderr_.setter
    def att_stderr_(self, value):
        self._att_stderr = value


class _GenericSingleOutcomeModelFinalWithCovInference(Inference):

    def prefit(self, estimator, *args, **kwargs):
        self.model_final = estimator.model_final_
        self.featurizer = estimator.featurizer_ if hasattr(estimator, 'featurizer_') else None

    def fit(self, estimator, *args, **kwargs):
        # once the estimator has been fit, it's kosher to store d_t here
        # (which needs to have been expanded if there's a discrete treatment)
        self._est = estimator
        self._d_t = estimator._d_t
        self._d_y = estimator._d_y
        self.d_t = self._d_t[0] if self._d_t else 1
        self.d_y = self._d_y[0] if self._d_y else 1

    def const_marginal_effect_interval(self, X, *, alpha=0.05):
        return self.const_marginal_effect_inference(X).conf_int(alpha=alpha)

    def const_marginal_effect_inference(self, X):
        if X is None:
            raise ValueError("This inference method currently does not support X=None!")
        if self.featurizer is not None:
            X = self.featurizer.transform(X)
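        # Note: the final GRF model returns both point predictions and an estimate of their sampling
        # covariance (from the bootstrap-of-little-bags when `inference=True`); the diagonal of that
        # covariance yields the per-(outcome, treatment) standard errors used below.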
        pred, pred_var = self.model_final.predict_and_var(X)
        pred = pred.reshape((-1,) + self._d_y + self._d_t)
        pred_stderr = np.sqrt(np.diagonal(pred_var, axis1=2, axis2=3).reshape((-1,) + self._d_y + self._d_t))
        return NormalInferenceResults(d_t=self.d_t, d_y=self.d_y, pred=pred,
                                      pred_stderr=pred_stderr, mean_pred_stderr=None, inf_type='effect')

    def effect_interval(self, X, *, T0, T1, alpha=0.05):
        return self.effect_inference(X, T0=T0, T1=T1).conf_int(alpha=alpha)

    def effect_inference(self, X, *, T0, T1):
        if X is None:
            raise ValueError("This inference method currently does not support X=None!")
        X, T0, T1 = self._est._expand_treatments(X, T0, T1)
        if self.featurizer is not None:
            X = self.featurizer.transform(X)
        dT = T1 - T0
        if dT.ndim == 1:
            dT = dT.reshape((-1, 1))
        pred, pred_var = self.model_final.predict_projection_and_var(X, dT)
        pred = pred.reshape((-1,) + self._d_y)
        pred_stderr = np.sqrt(pred_var.reshape((-1,) + self._d_y))
        return NormalInferenceResults(d_t=None, d_y=self.d_y, pred=pred,
                                      pred_stderr=pred_stderr, mean_pred_stderr=None, inf_type='effect')

    def marginal_effect_interval(self, T, X, alpha=0.05):
        return self.marginal_effect_inference(T, X).conf_int(alpha=alpha)

    def marginal_effect_inference(self, T, X):
        if X is None:
            raise ValueError("This inference method currently does not support X=None!")
        if not self._est._original_treatment_featurizer:
            return self.const_marginal_effect_inference(X)
        X, T = self._est._expand_treatments(X, T, transform=False)
        if self.featurizer is not None:
            X = self.featurizer.transform(X)

        feat_T = self._est.transformer.transform(T)
        jac_T = self._est.transformer.jac(T)

        d_t_orig = T.shape[1:]
        d_t_orig = d_t_orig[0] if d_t_orig else 1

        d_y = self._d_y[0] if self._d_y else 1
        d_t = self._d_t[0] if self._d_t else 1

        output_shape = [X.shape[0]]
        if self._d_y:
            output_shape.append(self._d_y[0])
        if T.shape[1:]:
            output_shape.append(T.shape[1])
        me_pred = np.zeros(shape=output_shape)
        me_stderr = np.zeros(shape=output_shape)

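        # Chain rule for featurized treatments: the marginal effect with respect to each original
        # treatment coordinate is theta(X) contracted with the Jacobian of the treatment featurizer
        # at T, so each loop iteration below projects the forest predictions onto one Jacobian slice
        # and propagates the corresponding variance.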
        for i in range(d_t_orig):
            # conditionally index multiple dimensions depending on shapes of T, Y and feat_T
            jac_index = [slice(None)]
            me_index = [slice(None)]
            if self._d_y:
                me_index.append(slice(None))
            if T.shape[1:]:
                jac_index.append(i)
                me_index.append(i)
            if feat_T.shape[1:]:  # if featurized T is not a vector
                jac_index.append(slice(None))

            jac_slice = jac_T[tuple(jac_index)]
            if jac_slice.ndim == 1:
                jac_slice = jac_slice.reshape((-1, 1))

            e_pred, e_var = self.model_final.predict_projection_and_var(X, jac_slice)
            e_stderr = np.sqrt(e_var)

            if not self._d_y:
                e_pred = e_pred.squeeze(axis=1)
                e_stderr = e_stderr.squeeze(axis=1)

            me_pred[tuple(me_index)] = e_pred
            me_stderr[tuple(me_index)] = e_stderr

        return NormalInferenceResults(d_t=d_t_orig, d_y=self.d_y, pred=me_pred,
                                      pred_stderr=me_stderr, mean_pred_stderr=None, inf_type='effect')

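# Illustrative usage sketch (editor's addition, not part of the library): the inference class above
# is what backs the 'blb' and 'auto' inference options on a fitted CausalForestDML, e.g.
#
#     est = CausalForestDML(discrete_treatment=True)
#     est.fit(y, T, X=X, inference='blb')                  # 'auto' resolves to the same inference
#     lb, ub = est.const_marginal_effect_interval(X, alpha=0.05)
#     res = est.effect_inference(X, T0=0, T1=1)            # NormalInferenceResults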

class CausalForestDML(_BaseDML):
    """A Causal Forest [cfdml1]_ combined with double machine learning based residualization of the treatment
    and outcome variables.

    It fits a forest that solves the local moment equation problem:

    .. code-block::

        E[ (Y - E[Y|X, W] - <theta(x), T - E[T|X, W]> - beta(x)) (T;1) | X=x] = 0

    where E[Y|X, W] and E[T|X, W] are fitted in a first stage in a cross-fitting manner.

    Parameters
    ----------
    model_y: estimator, default ``'auto'``
        Determines how to fit the outcome to the features.

        - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models
        - Otherwise, see :ref:`model_selection` for the range of supported options;
          if a single model is specified it should be a classifier if `discrete_outcome` is True
          and a regressor otherwise

    model_t: estimator, default ``'auto'``
        Determines how to fit the treatment to the features.

        - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models
        - Otherwise, see :ref:`model_selection` for the range of supported options;
          if a single model is specified it should be a classifier if `discrete_treatment` is True
          and a regressor otherwise

    featurizer : :term:`transformer`, optional
        Must support fit_transform and transform. Used to create composite features in the
        final CATE regression. It is ignored if X is None. The final CATE will be trained on the outcome of
        featurizer.fit_transform(X). If featurizer=None, then CATE is trained on X.

    treatment_featurizer : :term:`transformer`, optional
        Must support fit_transform and transform. Used to create composite treatment in the
        final CATE regression. The final CATE will be trained on the outcome of
        featurizer.fit_transform(T). If featurizer=None, then CATE is trained on T.

    discrete_outcome: bool, default ``False``
        Whether the outcome should be treated as binary

    discrete_treatment: bool, default ``False``
        Whether the treatment values should be treated as categorical, rather than continuous, quantities

    categories: 'auto' or list, default 'auto'
        The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values).
        The first category will be treated as the control treatment.

    cv: int, cross-validation generator or an iterable, default 2
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 3-fold cross-validation,
        - integer, to specify the number of folds.
        - :term:`CV splitter`
        - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if the treatment is discrete
        :class:`~sklearn.model_selection.StratifiedKFold` is used, else,
        :class:`~sklearn.model_selection.KFold` is used
        (with a random shuffle in either case).

        Unless an iterable is used, we call `split(X, T)` to generate the splits.

    mc_iters: int, optional
        The number of times to rerun the first stage models to reduce the variance of the nuisances.

    mc_agg: {'mean', 'median'}, default 'mean'
        How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo
        iterations of cross-fitting.

    drate : bool, default True
        Whether to calculate doubly robust average treatment effect estimate on training data at fit time.
        This happens only if `discrete_treatment=True`. Doubly robust ATE estimation on the training data
        is not available for continuous treatments.

    n_estimators : int, default 100
        Number of trees

    criterion : {``"mse"``, ``"het"``}, default "mse"
        The function to measure the quality of a split. Supported criteria are ``"mse"`` for the mean
        squared error in a linear moment estimation tree and ``"het"`` for heterogeneity score.

        - The ``"mse"`` criterion finds splits that minimize the score

          .. code-block::

            sum_{child} E[(Y - <theta(child), T> - beta(child))^2 | X=child] weight(child)

          Internally, for the case of more than two treatments, or two treatments with
          ``fit_intercept=True``, this criterion is approximated by computationally simpler variants.
          In particular, it is replaced by:

          .. code-block::

            sum_{child} weight(child) * rho(child).T @ E[(T;1) @ (T;1).T | X in child] @ rho(child)

          where:

          .. code-block::

            rho(child) := E[(T;1) @ (T;1).T | X in parent]^{-1}
                            * E[(Y - <theta(x), T> - beta(x)) (T;1) | X in child]

          This can be thought of as a heterogeneity-inducing score, but one that puts more weight on
          scores with a large minimum eigenvalue of the child jacobian ``E[(T;1) @ (T;1).T | X in child]``,
          which leads to smaller variance of the estimate and stronger identification of the parameters.

        - The "het" criterion finds splits that maximize the pure parameter heterogeneity score

          .. code-block::

            sum_{child} weight(child) * rho(child)[:n_T].T @ rho(child)[:n_T]

          This can be thought of as an approximation to the ideal heterogeneity score:

          .. code-block::

            weight(left) * weight(right) || theta(left) - theta(right)||_2^2 / weight(parent)^2

          as outlined in [cfdml1]_

    max_depth : int, default None
        The maximum depth of the tree. If None, then nodes are expanded until
        all leaves are pure or until all leaves contain less than
        min_samples_split samples.

    min_samples_split : int or float, default 10
        The minimum number of samples required to split an internal node:

        - If int, then consider `min_samples_split` as the minimum number.
        - If float, then `min_samples_split` is a fraction and
          `ceil(min_samples_split * n_samples)` are the minimum
          number of samples for each split.

    min_samples_leaf : int or float, default 5
        The minimum number of samples required to be at a leaf node.
        A split point at any depth will only be considered if it leaves at
        least ``min_samples_leaf`` training samples in each of the left and
        right branches. This may have the effect of smoothing the model,
        especially in regression.

        - If int, then consider `min_samples_leaf` as the minimum number.
        - If float, then `min_samples_leaf` is a fraction and
          `ceil(min_samples_leaf * n_samples)` are the minimum
          number of samples for each node.

    min_weight_fraction_leaf : float, default 0.0
        The minimum weighted fraction of the sum total of weights (of all
        the input samples) required to be at a leaf node. Samples have
        equal weight when sample_weight is not provided.

    min_var_fraction_leaf : None or float in (0, 1], default None
        A constraint on some proxy of the variation of the treatment vector that should be contained within each
        leaf as a percentage of the total variance of the treatment vector on the whole sample. This avoids
        performing splits where the variance of the treatment is small and hence the local parameter
        is not well identified and has high variance. The proxy of variance differs across criteria,
        primarily for computational efficiency reasons. If ``criterion='het'``, then this constraint
        translates to::

            for all i in {1, ..., T.shape[1]}:
                Var(T[i] | X in leaf) > `min_var_fraction_leaf` * Var(T[i])

        If ``criterion='mse'``, because the criterion stores more information about the leaf for
        every candidate split, then this constraint imposes further constraints on the pairwise correlations
        of different coordinates of each treatment, i.e.::

            for all i neq j:
                sqrt( Var(T[i]|X in leaf) * Var(T[j]|X in leaf)
                    * ( 1 - rho(T[i], T[j]| in leaf)^2 ) )
                    > `min_var_fraction_leaf` sqrt( Var(T[i]) * Var(T[j]) * (1 - rho(T[i], T[j])^2 ) )

        where rho(X, Y) is the Pearson correlation coefficient of two random variables X, Y. Thus this
        constraint also enforces that no two pairs of treatments be very co-linear within a leaf. This
        extra constraint primarily has bite in the case of more than two input treatments and also avoids
        leaves where the parameter estimate has large variance due to local co-linearities of the treatments.

    min_var_leaf_on_val : bool, default False
        Whether the `min_var_fraction_leaf` constraint should also be enforced to hold on the validation set of
        the honest split too. If ``min_var_leaf=None`` then this flag does nothing. Setting this to True should
        be done with caution, as this partially violates the honesty structure, since the treatment variable
        of the validation set is used to inform the split structure of the tree. However, this is a benign
        dependence as it only uses local correlation structure of the treatment T to decide whether
        a split is feasible.

    max_features : int, float, {"auto", "sqrt", "log2"}, or None, default None
        The number of features to consider when looking for the best split:

        - If int, then consider `max_features` features at each split.
        - If float, then `max_features` is a fraction and
          `int(max_features * n_features)` features are considered at each split.
        - If "auto", then `max_features=n_features`.
        - If "sqrt", then `max_features=sqrt(n_features)`.
        - If "log2", then `max_features=log2(n_features)`.
        - If None, then `max_features=n_features`.

        Note: the search for a split does not stop until at least one
        valid partition of the node samples is found, even if it requires to
        effectively inspect more than ``max_features`` features.

    min_impurity_decrease : float, default 0.0
        A node will be split if this split induces a decrease of the impurity
        greater than or equal to this value.
        The weighted impurity decrease equation is the following::

            N_t / N * (impurity - N_t_R / N_t * right_impurity
                                - N_t_L / N_t * left_impurity)

        where ``N`` is the total number of samples, ``N_t`` is the number of
        samples at the current node, ``N_t_L`` is the number of samples in the
        left child, and ``N_t_R`` is the number of samples in the right child.
        ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
        if ``sample_weight`` is passed.

    max_samples : int or float in (0, 1], default .45
        The number of samples to use for each subsample that is used to train each tree:

        - If int, then train each tree on `max_samples` samples, sampled without replacement from all the samples
        - If float, then train each tree on `ceil(max_samples * n_samples)` samples, sampled without replacement
          from all the samples.

        If ``inference=True``, then `max_samples` must either be an integer smaller than `n_samples//2` or a float
        less than or equal to .5.

    min_balancedness_tol: float in [0, .5], default .45
        How imbalanced a split we can tolerate. This enforces that each split leaves at least
        (.5 - min_balancedness_tol) fraction of samples on each side of the split; or fraction
        of the total weight of samples, when sample_weight is not None. The default value ensures
        that at least 5% of the parent node weight falls in each side of the split. Set it to 0.0 for no
        balancedness and to .5 for perfectly balanced splits. For the formal inference theory
        to be valid, this has to be any positive constant bounded away from zero.

    honest : bool, default True
        Whether each tree should be trained in an honest manner, i.e. the training set is split into two
        equal sized subsets, the train and the val set. All samples in train are used to create the split
        structure and all samples in val are used to calculate the value of each node in the tree.

    inference : bool, default True
        Whether inference (i.e. confidence interval construction and uncertainty quantification of the estimates)
        should be enabled. If ``inference=True``, then the estimator uses a bootstrap-of-little-bags approach
        to calculate the covariance of the parameter vector, with an objective Bayesian debiasing correction to
        ensure that variance quantities are positive.

    fit_intercept : bool, default True
        Whether we should fit an intercept nuisance parameter beta(x).

    subforest_size : int, default 4
        The number of trees in each sub-forest that is used in the bootstrap-of-little-bags calculation.
        The parameter `n_estimators` must be divisible by `subforest_size`. Should typically be a small constant.

    n_jobs : int or None, default -1
        The number of parallel jobs to be used for parallelism; follows joblib semantics.
        `n_jobs=-1` means all available cpu cores. `n_jobs=None` means no parallelism.

    random_state : int, RandomState instance, or None, default None
        Controls the randomness of the estimator. The features are always
        randomly permuted at each split. When ``max_features < n_features``, the algorithm will
        select ``max_features`` at random at each split before finding the best
        split among them. But the best found split may vary across different
        runs, even if ``max_features=n_features``. That is the case, if the
        improvement of the criterion is identical for several splits and one
        split has to be selected at random. To obtain a deterministic behaviour
        during fitting, ``random_state`` has to be fixed to an integer.

    verbose : int, default 0
        Controls the verbosity when fitting and predicting.

    allow_missing: bool
        Whether to allow missing values in W. If True, will need to supply model_y, model_t that can handle
        missing values.

    use_ray: bool, default False
        Whether to use Ray to parallelize the cross-validation step. If True, Ray must be installed.

    ray_remote_func_options : dict, default None
        Options to pass to the remote function when using Ray.
        See https://docs.ray.io/en/latest/ray-core/api/doc/ray.remote.html

    Examples
    --------
    A simple example with the default models and discrete treatment:

    .. testcode::
        :hide:

        import numpy as np
        import scipy.special
        np.set_printoptions(suppress=True)

    .. testcode::

        from econml.dml import CausalForestDML

        np.random.seed(123)
        X = np.random.normal(size=(1000, 5))
        T = np.random.binomial(1, scipy.special.expit(X[:, 0]))
        y = (1 + .5*X[:, 0]) * T + X[:, 0] + np.random.normal(size=(1000,))
        est = CausalForestDML(discrete_treatment=True)
        est.fit(y, T, X=X, W=None)

    >>> est.effect(X[:3])
    array([0.62947..., 1.64576..., 0.68496...])
    >>> est.effect_interval(X[:3])
    (array([0.19136..., 1.17143..., 0.10789...]),
     array([1.06758..., 2.12009..., 1.26203...]))

    Attributes
    ----------
    ate_ : ndarray of shape (n_outcomes, n_treatments)
        The average constant marginal treatment effect of each treatment for each outcome,
        averaged over the training data and with a doubly robust correction. Available only
        when `discrete_treatment=True` and `drate=True`.

    ate_stderr_ : ndarray of shape (n_outcomes, n_treatments)
        The standard error of the `ate_` attribute.

    feature_importances_ : ndarray of shape (n_features,)
        The feature importances based on the amount of parameter heterogeneity they create.
        The higher, the more important the feature.
        The importance of a feature is computed as the (normalized) total heterogeneity that the feature
        creates. Each split that the feature was chosen adds::

            parent_weight * (left_weight * right_weight)
                * mean((value_left[k] - value_right[k])**2) / parent_weight**2

        to the importance of the feature. Each such quantity is also weighted by the depth of the split.
        By default splits below `max_depth=4` are not used in this calculation and also each split
        at depth `depth` is re-weighted by 1 / (1 + `depth`)**2.0. See the method ``feature_importances``
        for a way to change these defaults.

    References
    ----------
    .. [cfdml1] Athey, Susan, Julie Tibshirani, and Stefan Wager. "Generalized random forests."
        The Annals of Statistics 47.2 (2019): 1148-1178
        https://arxiv.org/pdf/1610.01271.pdf
    """

    def __init__(self, *,
                 model_y='auto',
                 model_t='auto',
                 featurizer=None,
                 treatment_featurizer=None,
                 discrete_outcome=False,
                 discrete_treatment=False,
                 categories='auto',
                 cv=2,
                 mc_iters=None,
                 mc_agg='mean',
                 drate=True,
                 n_estimators=100,
                 criterion="mse",
                 max_depth=None,
                 min_samples_split=10,
                 min_samples_leaf=5,
                 min_weight_fraction_leaf=0.,
                 min_var_fraction_leaf=None,
                 min_var_leaf_on_val=False,
                 max_features="auto",
                 min_impurity_decrease=0.,
                 max_samples=.45,
                 min_balancedness_tol=.45,
                 honest=True,
                 inference=True,
                 fit_intercept=True,
                 subforest_size=4,
                 n_jobs=-1,
                 random_state=None,
                 verbose=0,
                 allow_missing=False,
                 use_ray=False,
                 ray_remote_func_options=None):
        # TODO: consider whether we need more care around stateful featurizers,
        # since we clone it and fit separate copies
        self.drate = drate
        self.model_y = clone(model_y, safe=False)
        self.model_t = clone(model_t, safe=False)
        self.featurizer = clone(featurizer, safe=False)
        self.categories = categories
        self.cv = cv
        self.n_estimators = n_estimators
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.min_var_fraction_leaf = min_var_fraction_leaf
        self.min_var_leaf_on_val = min_var_leaf_on_val
        self.max_features = max_features
        self.min_impurity_decrease = min_impurity_decrease
        self.max_samples = max_samples
        self.min_balancedness_tol = min_balancedness_tol
        self.honest = honest
        self.inference = inference
        self.fit_intercept = fit_intercept
        self.subforest_size = subforest_size
        self.n_jobs = n_jobs
        self.verbose = verbose
        super().__init__(discrete_outcome=discrete_outcome,
                         discrete_treatment=discrete_treatment,
                         treatment_featurizer=treatment_featurizer,
                         categories=categories,
                         cv=cv,
                         mc_iters=mc_iters,
                         mc_agg=mc_agg,
                         random_state=random_state,
                         allow_missing=allow_missing,
                         use_ray=use_ray,
                         ray_remote_func_options=ray_remote_func_options)

    def _gen_allowed_missing_vars(self):
        return ['W'] if self.allow_missing else []

    def _get_inference_options(self):
        options = super()._get_inference_options()
        options.update(blb=_GenericSingleOutcomeModelFinalWithCovInference)
        options.update(auto=_GenericSingleOutcomeModelFinalWithCovInference)
        return options

    def _gen_featurizer(self):
        return clone(self.featurizer, safe=False)

    def _gen_model_y(self):
        return _make_first_stage_selector(self.model_y, self.discrete_outcome, self.random_state)

    def _gen_model_t(self):
        return _make_first_stage_selector(self.model_t, self.discrete_treatment, self.random_state)

    def _gen_model_final(self):
        return MultiOutputGRF(CausalForest(n_estimators=self.n_estimators,
                                           criterion=self.criterion,
                                           max_depth=self.max_depth,
                                           min_samples_split=self.min_samples_split,
                                           min_samples_leaf=self.min_samples_leaf,
                                           min_weight_fraction_leaf=self.min_weight_fraction_leaf,
                                           min_var_fraction_leaf=self.min_var_fraction_leaf,
                                           min_var_leaf_on_val=self.min_var_leaf_on_val,
                                           max_features=self.max_features,
                                           min_impurity_decrease=self.min_impurity_decrease,
                                           max_samples=self.max_samples,
                                           min_balancedness_tol=self.min_balancedness_tol,
                                           honest=self.honest,
                                           inference=self.inference,
                                           fit_intercept=self.fit_intercept,
                                           subforest_size=self.subforest_size,
                                           n_jobs=self.n_jobs,
                                           random_state=self.random_state,
                                           verbose=self.verbose,
                                           warm_start=False))

    def _gen_rlearner_model_final(self):
        return _CausalForestFinalWrapper(self._gen_model_final(), self._gen_featurizer(),
                                         self.discrete_treatment, self.drate)

    @property
    def tunable_params(self):
        return ['n_estimators', 'criterion', 'max_depth', 'min_samples_split',
                'min_samples_leaf', 'min_weight_fraction_leaf', 'min_var_fraction_leaf',
                'min_var_leaf_on_val', 'max_features', 'min_impurity_decrease',
                'max_samples', 'min_balancedness_tol', 'honest', 'inference',
                'fit_intercept', 'subforest_size']

    def tune(self, Y, T, *, X=None, W=None,
             sample_weight=None, groups=None,
             params='auto'):
        """
        Tunes the major hyperparameters of the final stage causal forest based on out-of-sample
        R-score performance. It trains small forests of 100 trees on a grid of parameters and
        tests the out-of-sample R-score. After the function is called, all parameters of `self`
        have been set to the optimal hyperparameters found. The estimator however remains unfitted,
        so you need to call fit afterwards to fit the estimator with the chosen hyperparameters.
        The list of tunable parameters can be accessed via the property `tunable_params`.

        Parameters
        ----------
        Y: (n × d_y) matrix or vector of length n
            Outcomes for each sample
        T: (n × dₜ) matrix or vector of length n
            Treatments for each sample
        X: (n × dₓ) matrix
            Features for each sample
        W: (n × d_w) matrix, optional
            Controls for each sample
        sample_weight: (n,) vector, optional
            Weights for each row
        groups: (n,) vector, optional
            All rows corresponding to the same group will be kept together during splitting.
            If groups is not None, the `cv` argument passed to this class's initializer
            must support a 'groups' argument to its split method.
        params: dict or 'auto', default 'auto'
            A dictionary that contains the grid of hyperparameters to try, i.e.
            {'param1': [value1, value2, ...], 'param2': [value1, value2, ...], ...}
            If `params='auto'`, then a default grid is used.

        Returns
        -------
        self : CausalForestDML object
            The tuned causal forest object. This is the same object (not a copy) as the original one, but where
            all parameters of the object have been set to the best performing parameters from the tuning grid.
        """
        from ..score import RScorer  # import here to avoid circular import issue

        Y, T, X, sample_weight, groups = check_input_arrays(Y, T, X, sample_weight, groups)
        W, = check_input_arrays(
            W, force_all_finite='allow-nan' if 'W' in self._gen_allowed_missing_vars() else True)

        if params == 'auto':
            params = {
                'min_weight_fraction_leaf': [0.0001, .01],
                'max_depth': [3, 5, None],
                'min_var_fraction_leaf': [0.001, .01]
            }
        else:
            # If custom param grid, check that only estimator parameters are being altered
            estimator_param_names = self.tunable_params
            for key in params.keys():
                if key not in estimator_param_names:
                    raise ValueError(f"Parameter `{key}` is not a tunable causal forest parameter.")

        strata = None
        if self.discrete_treatment:
            strata = self._strata(Y, T, X=X, W=W, sample_weight=sample_weight, groups=groups)
        # use 0.699 instead of 0.7 as train size so that if there are 5 examples in a stratum, we get 2 in test
        train, test = train_test_split(np.arange(Y.shape[0]), train_size=0.699,
                                       random_state=self.random_state, stratify=strata)
        ytrain, yval, Ttrain, Tval = Y[train], Y[test], T[train], T[test]
        Xtrain, Xval = (X[train], X[test]) if X is not None else (None, None)
        Wtrain, Wval = (W[train], W[test]) if W is not None else (None, None)
        groups_train, groups_val = (groups[train], groups[test]) if groups is not None else (None, None)
        if sample_weight is not None:
            sample_weight_train, sample_weight_val = sample_weight[train], sample_weight[test]
        else:
            sample_weight_train, sample_weight_val = None, None

        est = clone(self, safe=False)
        est.n_estimators = 100
        est.inference = False

        scorer = RScorer(model_y=est.model_y, model_t=est.model_t,
                         discrete_treatment=est.discrete_treatment, categories=est.categories,
                         cv=est.cv, mc_iters=est.mc_iters, mc_agg=est.mc_agg,
                         random_state=est.random_state)
        scorer.fit(yval, Tval, X=Xval, W=Wval, sample_weight=sample_weight_val, groups=groups_val)

        names = params.keys()
        scores = []
        for it, values in enumerate(product(*params.values())):
            for key, value in zip(names, values):
                setattr(est, key, value)
            if it == 0:
                est.fit(ytrain, Ttrain, X=Xtrain, W=Wtrain,
                        sample_weight=sample_weight_train, groups=groups_train, cache_values=True)
            else:
                est.refit_final()
            scores.append((scorer.score(est), tuple(zip(names, values))))

        bestind = np.argmax([s[0] for s in scores])
        _, best_params = scores[bestind]
        for key, value in best_params:
            setattr(self, key, value)

        return self
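
    # Illustrative tuning sketch (editor's addition; the grid values here are arbitrary examples,
    # not defaults):
    #
    #     est = CausalForestDML(discrete_treatment=True)
    #     est.tune(y, T, X=X, params={'max_depth': [3, None], 'min_samples_leaf': [5, 20]})
    #     est.fit(y, T, X=X)   # refit with the tuned hyperparameters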

    # override only so that we can update the docstring to indicate support for `blb`
    def fit(self, Y, T, *, X=None, W=None, sample_weight=None, groups=None,
            cache_values=False, inference='auto'):
        """
        Estimate the counterfactual model from data, i.e. estimates functions τ(·,·,·), ∂τ(·,·).

        Parameters
        ----------
        Y: (n × d_y) matrix or vector of length n
            Outcomes for each sample
        T: (n × dₜ) matrix or vector of length n
            Treatments for each sample
        X: (n × dₓ) matrix
            Features for each sample
        W: (n × d_w) matrix, optional
            Controls for each sample
        sample_weight : (n,) array_like or None
            Individual weights for each sample. If None, it assumes equal weight.
        groups: (n,) vector, optional
            All rows corresponding to the same group will be kept together during splitting.
            If groups is not None, the `cv` argument passed to this class's initializer
            must support a 'groups' argument to its split method.
        cache_values: bool, default False
            Whether to cache inputs and first stage results, which will allow refitting a different final model
        inference: str, :class:`.Inference` instance, or None
            Method for performing inference. This estimator supports 'bootstrap'
            (or an instance of :class:`.BootstrapInference`), 'blb' or 'auto'
            (for Bootstrap-of-Little-Bags based inference)

        Returns
        -------
        self
        """
        if X is None:
            raise ValueError("This estimator does not support X=None!")
        return super().fit(Y, T, X=X, W=W, sample_weight=sample_weight, groups=groups,
                           cache_values=cache_values, inference=inference)

    def refit_final(self, *, inference='auto'):
        return super().refit_final(inference=inference)
    refit_final.__doc__ = _OrthoLearner.refit_final.__doc__

    def feature_importances(self, max_depth=4, depth_decay_exponent=2.0):
        imps = self.model_final_.feature_importances(max_depth=max_depth,
                                                     depth_decay_exponent=depth_decay_exponent)
        return imps.reshape(self._d_y + (-1,))
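
    # Editor's note: `feature_importances()` returns the heterogeneity-based importances reshaped to
    # self._d_y + (n_features,), i.e. one row per outcome when Y is 2-dimensional; the
    # `feature_importances_` property further down simply calls it with the defaults
    # (max_depth=4, depth_decay_exponent=2.0).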

    def summary(self, alpha=0.05, value=0, decimals=3, feature_names=None, treatment_names=None, output_names=None):
        """
        Get a summary of the doubly robust ATE/ATT estimates on the training data, along with a
        population summary of the CATE predictions when `cache_values=True` was used at fit time.

        Parameters
        ----------
        alpha: float in [0, 1], default 0.05
            The overall level of confidence of the reported interval.
            The alpha/2, 1-alpha/2 confidence interval is reported.
        value: float, default 0
            The mean value of the metric you'd like to test under null hypothesis.
        decimals: int, default 3
            Number of decimal places to round each column to.
        feature_names: list of str, optional
            The input of the feature names
        treatment_names: list of str, optional
            The names of the treatments
        output_names: list of str, optional
            The names of the outputs

        Returns
        -------
        smry : Summary instance
            this holds the summary tables and text, which can be printed or
            converted to various output formats.
        """
        # Get input names
        treatment_names = self.cate_treatment_names(treatment_names)
        output_names = self.cate_output_names(output_names)
        # Summary
        if self._cached_values is not None:
            print("Population summary of CATE predictions on Training Data")
            smry = self.const_marginal_ate_inference(self._cached_values.X).summary(
                alpha=alpha, value=value, decimals=decimals,
                output_names=output_names, treatment_names=treatment_names)
        else:
            print("Population summary results are available only if `cache_values=True` at fit time!")
            smry = Summary()
        d_t = self._d_t[0] if self._d_t else 1
        d_y = self._d_y[0] if self._d_y else 1

        try:
            intercept_table = self.ate__inference().summary_frame(
                alpha=alpha, value=value, decimals=decimals,
                feature_names=None, treatment_names=treatment_names, output_names=output_names)
            intercept_array = intercept_table.values
            intercept_headers = intercept_table.columns.tolist()
            n_level = intercept_table.index.nlevels
            if n_level > 1:
                intercept_stubs = ["|".join(ind_value) for ind_value in intercept_table.index.values]
            else:
                intercept_stubs = intercept_table.index.tolist()
            intercept_title = 'Doubly Robust ATE on Training Data Results'
            smry.add_table(intercept_array, intercept_headers, intercept_stubs, intercept_title)
        except Exception as e:
            print("Doubly Robust ATE on Training Data Results: ", str(e))
        for t in range(0, d_t + 1):
            try:
                intercept_table = self.att__inference(T=t).summary_frame(
                    alpha=alpha, value=value, decimals=decimals,
                    feature_names=None, output_names=output_names)
                intercept_array = intercept_table.values
                intercept_headers = intercept_table.columns.tolist()
                n_level = intercept_table.index.nlevels
                if n_level > 1:
                    intercept_stubs = ["|".join(ind_value) for ind_value in intercept_table.index.values]
                else:
                    intercept_stubs = intercept_table.index.tolist()
                intercept_title = "Doubly Robust ATT(T={}) on Training Data Results".format(t)
                smry.add_table(intercept_array, intercept_headers, intercept_stubs, intercept_title)
            except Exception as e:
                print("Doubly Robust ATT on Training Data Results: ", str(e))
                break
        if len(smry.tables) > 0:
            return smry

    def shap_values(self, X, *, feature_names=None, treatment_names=None, output_names=None,
                    background_samples=100):
        return _shap_explain_multitask_model_cate(self.const_marginal_effect, self.model_cate.estimators_, X,
                                                  self._d_t, self._d_y,
                                                  featurizer=self.featurizer_,
                                                  feature_names=feature_names,
                                                  treatment_names=treatment_names,
                                                  output_names=output_names,
                                                  input_names=self._input_names,
                                                  background_samples=background_samples)
    shap_values.__doc__ = LinearCateEstimator.shap_values.__doc__

    def ate__inference(self):
        """
        Returns
        -------
        ate__inference : NormalInferenceResults
            Inference results information for the `ate_` attribute, which is the average constant
            marginal treatment effect of each treatment for each outcome, averaged over the training
            data and with a doubly robust correction. Available only when `discrete_treatment=True`
            and `drate=True`.
        """
        return NormalInferenceResults(d_t=self._d_t[0] if self._d_t else 1,
                                      d_y=self._d_y[0] if self._d_y else 1,
                                      pred=self.ate_,
                                      pred_stderr=self.ate_stderr_,
                                      mean_pred_stderr=None,
                                      inf_type='ate',
                                      feature_names=self.cate_feature_names(),
                                      output_names=self.cate_output_names(),
                                      treatment_names=self.cate_treatment_names())

    @property
    def ate_(self):
        return self.rlearner_model_final_.ate_

    @property
    def ate_stderr_(self):
        return self.rlearner_model_final_.ate_stderr_

    def att__inference(self, *, T):
        """
        Parameters
        ----------
        T : int
            The index of the treatment for which to get the ATT. It corresponds to the
            lexicographic rank of the discrete input treatments.

        Returns
        -------
        att__inference : NormalInferenceResults
            Inference results information for the `att_` attribute, which is the average constant
            marginal treatment effect of each treatment for each outcome, averaged over the training
            data treated with treatment T and with a doubly robust correction. Available only when
            `discrete_treatment=True` and `drate=True`.
        """
        return NormalInferenceResults(d_t=self._d_t[0] if self._d_t else 1,
                                      d_y=self._d_y[0] if self._d_y else 1,
                                      pred=self.att_(T=T),
                                      pred_stderr=self.att_stderr_(T=T),
                                      mean_pred_stderr=None,
                                      inf_type='att',
                                      feature_names=self.cate_feature_names(),
                                      output_names=self.cate_output_names(),
                                      treatment_names=self.cate_treatment_names())

    def att_(self, *, T):
        """
        Parameters
        ----------
        T : int
            The index of the treatment for which to get the ATT. It corresponds to the
            lexicographic rank of the discrete input treatments.

        Returns
        -------
        att_ : ndarray (n_y, n_t)
            The average constant marginal treatment effect of each treatment for each outcome, averaged
            over the training data treated with treatment T and with a doubly robust correction.
            Singleton dimensions are dropped if input variable was a vector.
        """
        return self.rlearner_model_final_.att_[T]
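
    # Editor's note: the index T follows the order in which the wrapper's fit() builds its att_ list:
    # T=0 is the control/baseline group (all one-hot treatment columns zero) and T=k (k >= 1) is the
    # group that received the k-th non-control treatment category in lexicographic order.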

    def att_stderr_(self, *, T):
        """
        Parameters
        ----------
        T : int
            The index of the treatment for which to get the ATT. It corresponds to the
            lexicographic rank of the discrete input treatments.

        Returns
        -------
        att_stderr_ : ndarray (n_y, n_t)
            The standard error of the corresponding `att_`
        """
        return self.rlearner_model_final_.att_stderr_[T]

    @property
    def feature_importances_(self):
        return self.feature_importances()

    @property
    def model_final(self):
        return self._gen_model_final()

    @model_final.setter
    def model_final(self, model):
        if model is not None:
            raise ValueError("Parameter `model_final` cannot be altered for this estimator!")

    def __len__(self):
        """Return the number of estimators in the ensemble."""
        return self.model_cate.__len__()

    def __getitem__(self, index):
        """Return the index'th estimator in the ensemble."""
        return self.model_cate.__getitem__(index)

    def __iter__(self):
        """Return iterator over estimators in the ensemble."""
        return self.model_cate.__iter__()