import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("whitegrid")
import gc # memory cleaning
import os
import psutil
process = psutil.Process(os.getpid())
print("Memory usage:", process.memory_info().rss/1024/1024,"MB")
We consider the following setting:
$\mathbf{X}\sim \mathcal{N}(\mathbf{0}, \mathbf{Id}_{d\times d})$ or $\mathrm{Uniform}([0,1]^d)$
$(Y(0),Y(1)) \sim \mathcal{N}(m(\mathbf{X}),\Sigma(\mathbf{X}))$
$W \sim \mathrm{Bernoulli}(e(\mathbf{X}))$
We assume that, given $\mathbf{X}$, $(Y(0),Y(1))$ is a 2-dimensional Gaussian random vector. More precisely, let $(G_0,G_1,G_2)$ be 3 independent standard Gaussians, and consider $$ Y(0) := \sqrt{\sigma_{01}(\mathbf{X})}\,G_0 + \sqrt{\sigma_{00}(\mathbf{X}) - \sigma_{01}(\mathbf{X})}\, G_1 + m(\mathbf{X})(0), $$ and $$ Y(1) := \sqrt{\sigma_{01}(\mathbf{X})}\,G_0 + \sqrt{\sigma_{11}(\mathbf{X}) - \sigma_{01}(\mathbf{X})}\, G_2 + m(\mathbf{X})(1). $$ Since $G_0$ is the only source of randomness shared by $Y(0)$ and $Y(1)$, it is readily checked that $$ m(\mathbf{X}) = \big(m(\mathbf{X})(0),\, m(\mathbf{X})(1)\big), $$ and $$ \Sigma(\mathbf{X}) = \begin{pmatrix} \sigma_{00}(\mathbf{X}) & \sigma_{01}(\mathbf{X}) \\ \sigma_{01}(\mathbf{X}) & \sigma_{11}(\mathbf{X}) \end{pmatrix}. $$
According to this construction, the covariates $X^{(0)}$ and $X^{(5)}$ are irrelevant noise w.r.t. the outcomes $(Y(0),Y(1))$. We also remark that this setting is not covered by the usual modelling assumptions of meta-learners.
def e_func(x):
    # propensity score e(x)
    return np.sin(x[1]*x[2]+x[3])*.5+.5

def mX0(x):
    # conditional mean of Y(0)
    #return x[3]+x[2]+x[1]
    return 1.

def mX1(x):
    # conditional mean of Y(1)
    return np.sin(x[4])+3.*x[1] +5.
    #return 5.

def sigma00(x):
    # conditional variance of Y(0), clipped to [1, 3]
    return max(min(np.abs(x[1]+x[4])*1.9,3.),1.)
    #return 1.

def sigma11(x):
    # conditional variance of Y(1), clipped to [1, 3]
    return max(min(np.abs(x[2]+x[3])*1.9,3.),1.)
    #return 1

def sigma01(x):
    # conditional covariance between Y(0) and Y(1); constant here
    #return min(sigma00(x),sigma11(x))*0.5
    #return np.abs(np.sin(x[1]+2*x[2]+3*x[4]))*.4
    return 0.9
N_total = 5000
N_train = 4000
N_test = N_total - N_train
d = 6
X = np.random.normal(0,1,(N_total,d))
#X = np.random.uniform(0,1,(N_total,d))
e = np.apply_along_axis(e_func, 1, X)
W = np.random.binomial(size=N_total, n=1, p= e)
# instrumental gaussians
G = np.random.normal(0,1,(N_total,3))
# generate (Y(0),Y(1)) following the construction above
m0, m1 = np.apply_along_axis(mX0, 1, X), np.apply_along_axis(mX1, 1, X)
s00, s11, s01 = [np.apply_along_axis(f, 1, X) for f in (sigma00, sigma11, sigma01)]
Y0 = m0 + np.sqrt(s01)*G[:,0] + np.sqrt(s00 - s01)*G[:,1]
Y1 = m1 + np.sqrt(s01)*G[:,0] + np.sqrt(s11 - s01)*G[:,2]
Yi = W*Y1 + (1-W)*Y0
## split train/test data
X_train = X[0:N_train]
X_test = X[N_train:]
Y0_train = Y0[0:N_train]
Y0_test = Y0[N_train:]
Y1_train = Y1[0:N_train]
Y1_test = Y1[N_train:]
Yi_train = Yi[0:N_train]
Yi_test = Yi[N_train:]
W_train = W[0:N_train]
W_test = W[N_train:]
# check memory usage
process = psutil.Process(os.getpid())
print(process.memory_info().rss/1024/1024,"MB")
# get true values/samples of L((Y(0),Y(1)) | X = x)
def sampleY(x, N):
    G = np.random.normal(0,1,(N,3))
    Y0x = mX0(x) + np.sqrt(sigma01(x))*G[:,0] + np.sqrt(sigma00(x)-sigma01(x))*G[:,1]
    Y1x = mX1(x) + np.sqrt(sigma01(x))*G[:,0] + np.sqrt(sigma11(x)-sigma01(x))*G[:,2]
    return Y0x, Y1x

def getInfoY(x):
    return {'mX0': mX0(x),
            'mX1': mX1(x),
            'sigma00': sigma00(x),
            'sigma11': sigma11(x),
            'sigma01': sigma01(x)}
# Viz of (Y(0),Y(1))
plt.figure(figsize = (10,5))
plt.subplot(121)
plt.plot(Y0_train[:1000],Y1_train[:1000],"p",alpha = 0.1, color = "grey")
plt.title("Points cloud of $(Y_i(0),Y_i(1))$")
plt.subplot(122)
sns.kdeplot(x=Y0_train[:1000], y=Y1_train[:1000], fill=True, cbar=True)
plt.title("KDE plot of $(Y_i(0),Y_i(1))$")
plt.figure(figsize = (15,5))
plt.subplot(131)
plt.hist(Yi_train,bins = 30, color = "lightgrey")
plt.title("Histogram of $Y_i(W_i)$")
plt.subplot(132)
plt.hist(Y0_train,bins = 30, color = "lightgrey")
plt.title("Histogram of $Y_i(0)$")
plt.subplot(133)
plt.hist(Y1_train,bins = 30, color = "lightgrey")
plt.title("Histogram of $Y_i(1)$")
IndexTest = 10
Y0x,Y1x = sampleY(X[IndexTest],1000)
plt.figure(figsize = (10,10))
plt.plot(Y0x,Y1x,"p", color = "grey",alpha = 0.2)
InfoTest = getInfoY(X[IndexTest])
plt.title("Samples of $\mathcal{L}((Y(0),Y(1)) | X =$"+str(X[IndexTest])+"$)$")
print(X[IndexTest])
print(InfoTest)
np.cov(Y0x,Y1x)
A model trained on the training data $(\mathbf{X}_i,Y_i;1\leq i \leq N)$ is denoted by $$ \mathcal{M}_{\left\{\mathbf{X}_i\sim Y_i \;|\; i\in [N]\right\}}. $$ The associated prediction at a point $\mathbf{x}$ is denoted by $$ \mathbf{Pred}\left(\mathbf{x} \;|\; \mathcal{M}_{\left\{\mathbf{X}_i\sim Y_i \;|\; i\in [N]\right\}} \right). $$ This notation is more compatible with models from the Bayesian nonparametric literature, as it emphasizes the randomness brought by the model itself. When the underlying model is Random Forests-based, we replace $\mathcal{M}$ by $\mathcal{T}$. Below we list the constructions of the T-, S- and X-learners.
Splitting the training data into two subgroups according to $W_i$, we consider the estimator $\hat{\mu}_{\omega}^T$ of $\mu_{\omega}$ defined as follows: $$ \hat{\mu}_{\omega}^T(\mathbf{x}) :=\mathbf{Pred}\left(\mathbf{x} \;|\; \mathcal{M}_{\left\{\mathbf{X}_i\sim Y_i \;|\; W_i = \omega,\;i\in [N]\right\}}\right). $$ Then, the estimator $\hat{\tau}^T$ of $\tau$ is defined by $$ \hat{\tau}^T(\mathbf{x}) = \hat{\mu}_{1}^T(\mathbf{x}) - \hat{\mu}_{0}^T(\mathbf{x}). $$
Treating $W_i$ as an additional covariate rather than giving it a special role, the estimator $\hat{\mu}_{\omega}^S$ of $\mu_{\omega}$ is defined as follows: $$ \hat{\mu}_{\omega}^S(\mathbf{x}) := \mathbf{Pred}\left((\mathbf{x},\omega) \;|\; \mathcal{M}_{\left\{(\mathbf{X}_i,W_i)\sim Y_i \;|\; i\in [N]\right\}}\right). $$ Then, the estimator $\hat{\tau}^S$ of $\tau$ is defined by $$ \hat{\tau}^S(\mathbf{x}) = \hat{\mu}_{1}^S(\mathbf{x}) - \hat{\mu}_{0}^S(\mathbf{x}). $$
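A minimal sketch of the S-learner under this notation (an illustration only, not the implementation used below; we assume a RandomForestRegressor base learner and the training arrays X_train, W_train, Yi_train defined above, and the helper names fit_S and tau_S are ours):
# Hedged S-learner sketch: treat W as an extra covariate of a single model.
# Assumptions: RandomForestRegressor as base learner; X_train, W_train, Yi_train as above.
import numpy as np
from sklearn.ensemble import RandomForestRegressor
def fit_S(X_tr, W_tr, Y_tr):
    XW = np.column_stack([X_tr, W_tr])           # append the treatment indicator
    return RandomForestRegressor(n_estimators=100).fit(XW, Y_tr)
def tau_S(model, x):
    x = np.atleast_2d(x)
    x1 = np.column_stack([x, np.ones(len(x))])   # prediction with W forced to 1
    x0 = np.column_stack([x, np.zeros(len(x))])  # prediction with W forced to 0
    return model.predict(x1) - model.predict(x0)
# Example usage: S_model = fit_S(X_train, W_train, Yi_train); tau_hat = tau_S(S_model, X_test)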
By estimating the "missing" potential outcomes in the same spirit as the T-learner, we define
$$ \hat{\tau}_{1}^X(\mathbf{x}) := \mathbf{Pred}\left(\mathbf{x} \;|\; \mathcal{M}_{\left\{\mathbf{X}_i \sim Y_i - \hat{\mu}_0^T(\mathbf{X}_i) \;|\; W_i = 1,\;i\in [N]\right\}}\right), $$and
$$ \hat{\tau}_{0}^X(\mathbf{x}) :=\mathbf{Pred}\left(\mathbf{x} \;|\; \mathcal{M}_{\left\{\mathbf{X}_i \sim \hat{\mu}_1^T(\mathbf{X}_i) - Y_i \;|\; W_i = 0,\;i\in [N]\right\}}\right). $$The final estimator $\hat{\tau}^X$ of $\tau$ is constructed by taking the convex combination of the two estimators defined above, that is
$$ \hat{\tau}^X(\mathbf{x}) := g(\mathbf{x})\hat{\tau}_1^X(\mathbf{x}) + (1-g(\mathbf{x}))\hat{\tau}_0^X(\mathbf{x}). $$For the choice of $g(\mathbf{x})$, one may use an estimator $\hat{e}(\mathbf{x})$ of the propensity score, or simply take $0$ or $1$ depending on the data.
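A minimal sketch of the X-learner just described (again an illustration rather than this notebook's implementation; we assume random forests as base learners, fix g(x) = 1/2, and redo the first-stage T-learner fits so the cell is self-contained):
# Hedged X-learner sketch with g(x) = 1/2.
# Assumptions: RandomForestRegressor everywhere; X_train, W_train, Yi_train as above.
import numpy as np
from sklearn.ensemble import RandomForestRegressor
treated = (W_train == 1)
mu0_rf = RandomForestRegressor(n_estimators=100).fit(X_train[~treated], Yi_train[~treated])
mu1_rf = RandomForestRegressor(n_estimators=100).fit(X_train[treated], Yi_train[treated])
# impute the individual treatment effects on each subgroup
D1 = Yi_train[treated] - mu0_rf.predict(X_train[treated])      # treated units
D0 = mu1_rf.predict(X_train[~treated]) - Yi_train[~treated]    # control units
tau1_rf = RandomForestRegressor(n_estimators=100).fit(X_train[treated], D1)
tau0_rf = RandomForestRegressor(n_estimators=100).fit(X_train[~treated], D0)
def tau_X(x, g=0.5):
    x = np.atleast_2d(x)
    return g*tau1_rf.predict(x) + (1.-g)*tau0_rf.predict(x)    # convex combination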
Since the conditional distribution of $(Y(0),Y(1))$ given $\mathbf{X}$ is Gaussian, its estimation only requires the conditional mean estimators $\hat{\mu}_0^T$ and $\hat{\mu}_1^T$ together with an estimator of the covariance matrix. The latter is constructed in the same spirit as the X-learner:
$$ \hat{\sigma}_{00}(\mathbf{x}) := \mathbf{Pred}\left(\mathbf{x}\;|\; \mathcal{M}_{\left\{\mathbf{X}_i \sim Y_i^2 \;|\; W_i = 0,\;i\in [N]\right\}}\right) - \hat{\mu}_0^T(\mathbf{x})^2 , $$and
$$ \hat{\sigma}_{11}(\mathbf{x}) := \mathbf{Pred}\left(\mathbf{x}\;|\; \mathcal{M}_{\left\{\mathbf{X}_i \sim Y_i^2 \;|\; W_i = 1,\;i\in [N]\right\}}\right) - \hat{\mu}_1^T(\mathbf{x})^2 , $$as well as
$$ \begin{aligned} & \hat{\sigma}_{01}(\mathbf{x}) \\ :=& g(\mathbf{x}) \mathbf{Pred}\left(\mathbf{x}\;|\; \mathcal{M}_{\left\{\mathbf{X}_i \sim \hat{\mu}_0^T(\mathbf{X}_i)Y_i \;|\; W_i = 1,\;i\in [N]\right\}}\right) \\& +(1-g(\mathbf{x})) \mathbf{Pred}\left(\mathbf{x}\;|\; \mathcal{M}_{\left\{\mathbf{X}_i \sim Y_i \hat{\mu}_1^T(\mathbf{X}_i) \;|\; W_i = 0,\;i\in [N]\right\}}\right) - \hat{\mu}_0^T(\mathbf{x}) \hat{\mu}_1^T(\mathbf{x}) , \end{aligned} $$where $g(\mathbf{x})$ can be any $[0,1]$-valued function, which can be chosen as an estimator of the propensity score. For simplicity, we take $g(\mathbf{x})\equiv 1/2$.
# divide training data according to treatment assignment (boolean indexing)
treated_train = (W_train == 1)
X_train_treated = X_train[treated_train]
Yi_train_treated = Yi_train[treated_train]
X_train_control = X_train[~treated_train]
Yi_train_control = Yi_train[~treated_train]
# test data
treated_test = (W_test == 1)
X_test_treated = X_test[treated_test]
Yi_test_treated = Yi_test[treated_test]
X_test_control = X_test[~treated_test]
Yi_test_control = Yi_test[~treated_test]
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
#mu0T_hat = RandomForestRegressor(n_estimators = 100, min_samples_split=2)
#mu0T_hat = XGBRegressor(learning_rate = 0.1,
# objective = "reg:squarederror",
# n_estimators = 30)
mu0T_hat = AdaBoostRegressor(learning_rate = 0.1, n_estimators = 10)
mu0T_hat.fit(X_train_control, Yi_train_control)
#mu1T_hat = RandomForestRegressor(n_estimators = 100, min_samples_split=2)
#mu1T_hat = AdaBoostRegressor(learning_rate = 0.1, n_estimators = 10)
mu1T_hat = XGBRegressor(learning_rate = 0.1,
objective = "reg:squarederror",
n_estimators = 30)
mu1T_hat.fit(X_train_treated, Yi_train_treated)
## other choices for base learners
# from sklearn.linear_model import LinearRegression
# from sklearn.ensemble import AdaBoostRegressor
# mu0T_hat = AdaBoostRegressor()
# mu0T_hat.fit(X_train_control, Yi_train_control)
# mu1T_hat = AdaBoostRegressor()
# mu1T_hat.fit(X_train_treated, Yi_train_treated)
We randomly choose 100 sample points from the treated and control groups of the test data, and compare the predictions of $Y(0)$ (resp. $Y(1)$) with the reference values.
To evaluate the quality of prediction, we consider:
RMSE: root-mean-square error $$ \mathrm{RMSE} := \sqrt{\frac{1}{N}\sum_{i=1}^N (\mathrm{Pred}_i - \mathrm{Ref}_i)^2}. $$
Score: coefficient of determination $$ \mathrm{score} := 1 - \frac{R_{\mathrm{res}}}{R_{\mathrm{tot}}}, \qquad R_{\mathrm{res}} := \sum_{i=1}^N (\mathrm{Ref}_i - \mathrm{Pred}_i)^2, \quad R_{\mathrm{tot}} := \sum_{i=1}^N (\mathrm{Ref}_i - \overline{\mathrm{Ref}})^2. $$
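As a quick illustrative check (the arrays below are placeholders, not data from this notebook), these two definitions agree with the scikit-learn functions used later:
# Sanity check of the definitions above against sklearn (placeholder arrays).
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
ref  = np.array([1.0, 2.0, 3.0, 4.0])
pred = np.array([1.1, 1.9, 3.2, 3.8])
rmse = np.sqrt(np.mean((pred - ref)**2))          # RMSE as defined above
R_res = np.sum((ref - pred)**2)
R_tot = np.sum((ref - ref.mean())**2)
score = 1. - R_res/R_tot                          # coefficient of determination
assert np.isclose(rmse, np.sqrt(mean_squared_error(ref, pred)))
assert np.isclose(score, r2_score(ref, pred))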
To illustrate the quality of estimation, we first draw a subsample of size 100 from the test data, then we plot respectively the predictions and the reference values.
For the choice of the base-learners, we mainly consider:
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
https://scikit-garden.github.io/examples/MondrianTreeRegressor/
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostRegressor.html
https://xgboost.readthedocs.io/en/latest/tutorials/model.html
for parameters: https://xgboost.readthedocs.io/en/latest/parameter.html
plt.figure(figsize=(15,5))
RandomSubgroup = np.random.choice(range(np.sum(1-W_test)),100)
plt.plot(mu1T_hat.predict(X_test_control[RandomSubgroup]),color="grey",alpha=0.5,label="prediction")
plt.plot(
np.apply_along_axis(mX1,1,X_test[np.extract(1-W_test,range(N_test))[RandomSubgroup]]),
color="darkred",alpha=0.5,label="reference")
plt.title("Estimation of $Y(1)$ for control group")
plt.legend()
print("size of treated group in training data: {}, {:.1%}.".format(sum(W_train),sum(W_train)/N_train))
print("Score on the test dataset:",mu1T_hat.score(X_test, np.apply_along_axis(mX1,1,X_test)))
print("RMSE on the test dataset:", np.sqrt(mean_squared_error(np.apply_along_axis(mX1,1,X_test),mu1T_hat.predict(X_test))))
plt.figure(figsize=(15,5))
RandomSubgroup = np.random.choice(range(np.sum(W_test)),100)
plt.plot(mu0T_hat.predict(X_test_treated[RandomSubgroup]),color="grey",alpha=0.5,label="prediction")
plt.plot(
np.apply_along_axis(mX0,1,X_test[np.extract(W_test,range(N_test))[RandomSubgroup]]),
color="darkred",alpha=0.5,label="reference")
plt.title("Estimation of $Y(0)$ for treated group")
plt.legend()
print("size of treated group in training data: {}, {:.1%}.".format(sum(1-W_train),sum(1-W_train)/N_train))
print("Score on the test dataset:", mu0T_hat.score(X_test, np.apply_along_axis(mX0,1,X_test)))
print("RMSE on the test dataset:", np.sqrt(mean_squared_error(np.apply_along_axis(mX0,1,X_test),mu0T_hat.predict(X_test))))
from sklearn.metrics import r2_score
#mu00_hat = RandomForestRegressor(n_estimators = 100, min_samples_split=2)
#mu00_hat = AdaBoostRegressor(learning_rate = 0.1, n_estimators = 20)
mu00_hat = XGBRegressor(learning_rate = 0.1,
n_estimators = 50,
objective = "reg:squarederror",
gamma=2.)
mu00_hat.fit(X_train_control, Yi_train_control**2)
def sigma00_hat(x):
    return mu00_hat.predict(x) - mu0T_hat.predict(x)**2
#mu11_hat = RandomForestRegressor(n_estimators = 100, min_samples_split=2)
#mu11_hat = AdaBoostRegressor(learning_rate = 0.1, n_estimators = 50)
mu11_hat = XGBRegressor(learning_rate = 0.1,
booster = "gbtree",
objective = "reg:squarederror",
n_estimators = 20,
gamma=1.,
subsample = 1.,
reg_lambda = 1.)
mu11_hat.fit(X_train_treated, Yi_train_treated**2)
def sigma11_hat(x):
    return mu11_hat.predict(x) - mu1T_hat.predict(x)**2
mu01_hat_0 = RandomForestRegressor(n_estimators = 100, min_samples_split=5)
#mu01_hat_0.fit(X_train_control,Yi_train_control*mu1T_hat.predict(X_train_control))
mu01_hat_0.fit(X_train_control,Yi_train_control*mu1T_hat.predict(X_train_control))
mu01_hat_1 = RandomForestRegressor(n_estimators = 100, min_samples_split=5)
#mu01_hat_1.fit(X_train_treated,Yi_train_treated*mu0T_hat.predict(X_train_treated))
mu01_hat_1.fit(X_train_treated,Yi_train_treated*mu0T_hat.predict(X_train_treated))
def sigma01_hat(x):
    #return mu01_hat_0.predict(x)*(1-np.apply_along_axis(e_func, 1, x)) + mu01_hat_1.predict(x)*np.apply_along_axis(e_func, 1, x)
    return mu01_hat_0.predict(x)*.5 + mu01_hat_1.predict(x)*.5 - mu0T_hat.predict(x)*mu1T_hat.predict(x)
plt.figure(figsize = (15,5))
RandomSubgroup = np.random.choice(range(N_test),100)
plt.plot(np.apply_along_axis(sigma00, 1, X_test[RandomSubgroup]),color="darkred",alpha=0.5,label="reference")
plt.plot(sigma00_hat(X_test[RandomSubgroup]),color="grey",alpha=0.5,label="prediction")
plt.title("Estimation of $\sigma_{00}(\mathbf{x})$")
plt.legend()
print("score:", r2_score(sigma00_hat(X_test),np.apply_along_axis(sigma00, 1, X_test)))
print("RMSE on the test dataset:", np.sqrt(mean_squared_error(np.apply_along_axis(sigma00,1,X_test),sigma00_hat(X_test))))
plt.figure(figsize = (15,5))
RandomSubgroup = np.random.choice(range(N_test),100)
plt.plot(np.apply_along_axis(sigma11, 1, X_test[RandomSubgroup]),color="darkred",alpha=0.5,label="reference")
plt.plot(sigma11_hat(X_test[RandomSubgroup]),color="grey",alpha=0.5,label="prediction")
plt.title("Estimation of $\sigma_{11}(\mathbf{x})$")
plt.legend()
print("score:", r2_score(sigma11_hat(X_test),np.apply_along_axis(sigma11, 1, X_test)))
print("RMSE on the test dataset:", np.sqrt(mean_squared_error(np.apply_along_axis(sigma11,1,X_test),sigma11_hat(X_test))))
plt.figure(figsize = (15,5))
RandomSubgroup = np.random.choice(range(N_test),100)
plt.plot(np.apply_along_axis(sigma01, 1, X_test[RandomSubgroup]),color="darkred",alpha=0.5,label="reference")
plt.plot(sigma01_hat(X_test[RandomSubgroup]),color="grey",alpha=0.5,label="prediction")
plt.title("Estimation of $\sigma_{01}(\mathbf{x})$")
plt.legend()
print("score:", r2_score(sigma01_hat(X_test),np.apply_along_axis(sigma01, 1, X_test)))
print("RMSE on the test dataset:", np.sqrt(mean_squared_error(np.apply_along_axis(sigma01,1,X_test),sigma01_hat(X_test))))
We can see that even in the case where the covariance matrix is constant, these estimators fail to provide reliable estimates. This is due to error propagation: chaining meta-learners to extract more information is risky.
Any ideas on how to construct covariance estimators in this Gaussian case?
from skgarden import MondrianForestRegressor, MondrianTreeRegressor
mu0T_hat = MondrianForestRegressor(n_estimators = 100, min_samples_split=2)
mu0T_hat.fit(X_train_control, Yi_train_control)
mu1T_hat = MondrianForestRegressor(n_estimators = 100, min_samples_split=2)
mu1T_hat.fit(X_train_treated, Yi_train_treated)
plt.figure(figsize=(15,5))
RandomSubgroup = np.random.choice(range(np.sum(1-W_test)),100)
plt.plot(mu1T_hat.predict(X_test_control[RandomSubgroup]),color="grey",alpha=0.5,label="prediction")
plt.plot(
np.apply_along_axis(mX1,1,X_test[np.extract(1-W_test,range(N_test))[RandomSubgroup]]),
color="darkred",alpha=0.5,label="reference")
plt.title("Estimation of $Y(1)$ for control group")
plt.legend()
print("size of treated group in training data: {}, {:.1%}.".format(sum(W_train),sum(W_train)/N_train))
print("Score on the test dataset:", mu1T_hat.score(X_test, np.apply_along_axis(mX1,1,X_test)))
plt.figure(figsize=(15,5))
RandomSubgroup = np.random.choice(range(np.sum(W_test)),100)
plt.plot(mu0T_hat.predict(X_test_treated[RandomSubgroup]),color="grey",alpha=0.5,label="prediction")
plt.plot(
np.apply_along_axis(mX0,1,X_test[np.extract(W_test,range(N_test))[RandomSubgroup]]),
color="darkred",alpha=0.5,label="reference")
plt.title("Estimation of $Y(0)$ for treated group")
plt.legend()
print("size of treated group in training data: {}, {:.1%}.".format(sum(1-W_train),sum(1-W_train)/N_train))
print("Score on the test dataset:", mu0T_hat.score(X_test, np.apply_along_axis(mX0,1,X_test)))
from sklearn.tree import DecisionTreeRegressor
def samplerY0x(x, n_estimators, N):
    # approximate sampler of L(Y(0) | X = x): collect the predictions of
    # n_estimators independently refitted single-tree (Mondrian) regressors
    Y0x_empirical = np.zeros(n_estimators)
    for i in range(n_estimators):
        #nu0T_hat = DecisionTreeRegressor()
        #nu0T_hat = RandomForestRegressor(n_estimators=1, bootstrap=False)
        nu0T_hat = MondrianForestRegressor(n_estimators=1, bootstrap=True)
        nu0T_hat.fit(X_train_control, Yi_train_control)
        # bootstrap manually
        # BootstrapSize = 10000
        # N_train_0 = sum(1-W_train)
        # nu0T_hat.fit(X_train_control[np.random.choice(range(N_train_0),BootstrapSize)], Yi_train_control[np.random.choice(range(N_train_0),BootstrapSize)])
        Y0x_empirical[i] = nu0T_hat.predict(x.reshape(1,d))
    # optional Gaussian smoothing of the empirical distribution (disabled)
    # Y0x_sample = np.zeros(N)
    # for i in range(N):
    #     m = np.random.choice(Y0x_empirical,1)
    #     Y0x_sample[i] = np.random.normal(m,1.5,1)
    # return Y0x_sample
    return np.random.choice(Y0x_empirical, N)

def samplerY1x(x, n_estimators, N):
    Y1x_empirical = np.zeros(n_estimators)
    for i in range(n_estimators):
        nu1T_hat = MondrianForestRegressor(n_estimators=1, bootstrap=True)
        nu1T_hat.fit(X_train_treated, Yi_train_treated)
        Y1x_empirical[i] = nu1T_hat.predict(x.reshape(1,d))
    return np.random.choice(Y1x_empirical, N)
TestIndex = 15
y0 = samplerY0x(X_test[TestIndex],20,1000)
y1 = samplerY1x(X_test[TestIndex],20,1000)
plt.figure(figsize = (10,5))
plt.subplot(121)
plt.hist(y0,bins =30, color="lightgrey")
plt.title("Hist of Y(0) | X = x")
plt.subplot(122)
plt.hist(y1,bins =30, color="lightgrey")
plt.title("Hist of Y(1) | X = x")
getInfoY(X_test[TestIndex])
np.mean(y0)
np.var(y0)
np.mean(y1)
np.var(y1)
gc.collect()
# Sanity check of the base learner on a simple regression problem (here: a constant mean plus noise).
X = np.random.uniform(0,1,(5000,5))
#Y = np.random.normal(0,1.,5000) +np.sin(X[:,1]+X[:,3]) +X[:,2]**2
Y = np.random.normal(0,1.,5000) + 1
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
reg = AdaBoostRegressor(learning_rate = 0.1, n_estimators = 10)
#reg = XGBRegressor(n_estimators = 20)
#reg = RandomForestRegressor(n_estimators = 100)
reg.fit(X[0:4000,:],Y[0:4000])
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(np.zeros(1000)+1.,reg.predict(X[4000:])))
np.mean(reg.predict(X[4000:]))
np.mean(Y[4000:])