In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("whitegrid")
import gc # memory cleaning

import os
import psutil
process = psutil.Process(os.getpid())
print("Memory usage:", process.memory_info().rss/1024/1024,"MB") 
Memory usage: 142.046875 MB

Generate synthetic data

We consider the following setting:

  1. $\mathbf{X}\sim \mathcal{N}(\mathbf{0}, \mathbf{Id}_{d\times d})$ or $\mathrm{Uniform}([0,1]^d)$

  2. $(Y(0),Y(1)) \sim \mathcal{N}(m(\mathbf{X}),\Sigma(\mathbf{X}))$

  3. $W \sim \mathrm{Bernoulli}(e(\mathbf{X}))$

We assume that, given $\mathbf{X}$, $(Y(0),Y(1))$ is a 2-dimensional Gaussian random vector. More precisely, let $(G_0,G_1,G_2)$ be 3 independent standard Gaussians; we consider $$ Y(0) := \sqrt{\sigma_{01}(\mathbf{X})}\,G_0 + \sqrt{\sigma_{00}(\mathbf{X}) - \sigma_{01}(\mathbf{X})}\, G_1 + m(\mathbf{X})(0), $$ and $$ Y(1) := \sqrt{\sigma_{01}(\mathbf{X})}\,G_0 + \sqrt{\sigma_{11}(\mathbf{X}) - \sigma_{01}(\mathbf{X})}\, G_2 + m(\mathbf{X})(1). $$ It is readily checked that $$ m(\mathbf{X}) = (m(\mathbf{X})(0),\, m(\mathbf{X})(1)), $$ and $$ \Sigma(\mathbf{X}) = \begin{pmatrix} \sigma_{00}(\mathbf{X}) & \sigma_{01}(\mathbf{X}) \\ \sigma_{01}(\mathbf{X}) & \sigma_{11}(\mathbf{X}) \end{pmatrix}. $$
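As a quick check (assuming $0 \leq \sigma_{01}(\mathbf{X}) \leq \min(\sigma_{00}(\mathbf{X}),\sigma_{11}(\mathbf{X}))$, so that the square roots are well defined), since $G_0$ is the only Gaussian shared by the two expressions, $$ \mathrm{Cov}(Y(0),Y(1)\mid \mathbf{X}) = \sigma_{01}(\mathbf{X}), \qquad \mathrm{Var}(Y(0)\mid \mathbf{X}) = \sigma_{01}(\mathbf{X}) + \bigl(\sigma_{00}(\mathbf{X}) - \sigma_{01}(\mathbf{X})\bigr) = \sigma_{00}(\mathbf{X}), $$ and similarly $\mathrm{Var}(Y(1)\mid \mathbf{X}) = \sigma_{11}(\mathbf{X})$.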

Details on the choice of functions:

mode 1: constant covariance w.r.t. $\mathbf{X}$

  • $e(\mathbf{X}) = \sin(X^{(1)}+X^{(2)})/2 +0.5$
  • $m(\mathbf{X})(0) = X^{(3)}+X^{(1)} $
  • $m(\mathbf{X})(1) = X^{(4)}+0.3\times X^{(2)}$
  • $\sigma_{00}(\mathbf{X}) = 1$
  • $\sigma_{11}(\mathbf{X}) = 1$
  • $\sigma_{01}(\mathbf{X}) = 0.5$

mode 2: dependent covariance matrix w.r.t. $\mathbf{X}$

  • $e(\mathbf{X}) = \sin(X^{(1)}+X^{(2)})/2 +0.5$
  • $m(\mathbf{X})(0) = X^{(3)}+X^{(1)} $
  • $m(\mathbf{X})(1) = X^{(4)}+0.3\times X^{(2)}$
  • $\sigma_{00}(\mathbf{X}) = |X^{(1)} + X^{(4)}|$
  • $\sigma_{11}(\mathbf{X}) = |X^{(2)} + X^{(3)}|$
  • $\sigma_{01}(\mathbf{X}) = \min(\sigma_{00}(\mathbf{X}),\sigma_{11}(\mathbf{X}))/2$

remark

According to our construction, the covariates $X^{(0)}$ and $X^{(5)}$ are irrelevant noise w.r.t. the outcomes $(Y(0),Y(1))$. We also remark that this setting is not tied to the modelling of Meta-learners. A compact sketch of the mode 1 functions is given just below; the actual data-generating cell uses a modified variant of these choices.
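A minimal sketch of the mode 1 functions as listed above, assuming the superscript $X^{(j)}$ corresponds to the 0-based column x[j] (as in the data-generating cell below):

# mode 1 (sketch): constant covariance w.r.t. X
def e_mode1(x):
    return np.sin(x[1] + x[2])/2 + 0.5
def m0_mode1(x):
    return x[3] + x[1]
def m1_mode1(x):
    return x[4] + 0.3*x[2]
def sigma00_mode1(x):
    return 1.
def sigma11_mode1(x):
    return 1.
def sigma01_mode1(x):
    return 0.5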

In [2]:
def e_func(x):
    return np.sin(x[1]*x[2]+x[3])*.5+.5
def mX0(x):
    #return x[3]+x[2]+x[1]
    return 1. 
def mX1(x):
    return np.sin(x[4])+3.*x[1] +5.
    #return 5. 
def sigma00(x):
    return max(min(np.abs(x[1]+x[4])*1.9,3.),1.)
    #return 1.
def sigma11(x):
    return max(min(np.abs(x[2]+x[3])*1.9,3.),1.)
    #return 1
def sigma01(x):
    #return min(sigma00(x),sigma11(x))*0.5 
    #return np.abs(np.sin(x[1]+2*x[2]+3*x[4]))*.4
    return 0.9 

N_total = 5000
N_train = 4000 
N_test = N_total - N_train
d = 6
X = np.random.normal(0,1,(N_total,d))
#X = np.random.uniform(0,1,(N_total,d))


e = np.apply_along_axis(e_func, 1, X)
W = np.random.binomial(size=N_total, n=1, p= e)
# instrumental gaussians
G = np.random.normal(0,1,(N_total,3)) 
# generate (Y(0),Y(1))
Y0 = np.apply_along_axis(mX0, 1, X)+np.sqrt(np.apply_along_axis(sigma01, 1, X))*G[:,0] +np.sqrt(np.apply_along_axis(sigma00, 1, X) -np.apply_along_axis(sigma01, 1, X) )*G[:,1] 
Y1 = np.apply_along_axis(mX1, 1, X)+np.sqrt(np.apply_along_axis(sigma01, 1, X))*G[:,0] +np.sqrt(np.apply_along_axis(sigma11, 1, X) -np.apply_along_axis(sigma01, 1, X) )*G[:,2] 
Yi = W*Y1 + (1-W)*Y0


## split train/test data
X_train = X[0:N_train]
X_test = X[N_train:]

Y0_train = Y0[0:N_train]
Y0_test = Y0[N_train:]

Y1_train = Y1[0:N_train]
Y1_test = Y1[N_train:]

Yi_train = Yi[0:N_train]
Yi_test = Yi[N_train:]

W_train = W[0:N_train]
W_test = W[N_train:]
In [3]:
# check memory usage
process = psutil.Process(os.getpid())
print(process.memory_info().rss/1024/1024,"MB") 
143.52734375 MB
In [4]:
# get true values/samples of L((Y(0),Y(1)) | X = x )
def sampleY(x, N):
    G = np.random.normal(0,1,(N,3))
    Y0x = mX0(x) + np.sqrt(sigma01(x))*G[:,0]  + np.sqrt(sigma00(x)-sigma01(x))*G[:,1]
    Y1x = mX1(x) + np.sqrt(sigma01(x))*G[:,0]  + np.sqrt(sigma11(x)-sigma01(x))*G[:,2]
    return Y0x,Y1x
def getInfoY(x):
    return {'mX0':mX0(x),
            'mX1':mX1(x),
            'sigma00':sigma00(x),
            'sigma11':sigma11(x),
            'sigma01':sigma01(x)}

Some visualizations of the data

In [5]:
# Viz of (Y(0),Y(1))
plt.figure(figsize = (10,5))
plt.subplot(121)
plt.plot(Y0_train[:1000],Y1_train[:1000],"p",alpha = 0.1, color = "grey")
plt.title("Points cloud of $(Y_i(0),Y_i(1))$")
plt.subplot(122)
sns.kdeplot(Y0_train[:1000],Y1_train[:1000],shade=True, cbar=True)
plt.title("KDE plot of $(Y_i(0),Y_i(1))$")
Out[5]:
Text(0.5, 1.0, 'KDE plot of $(Y_i(0),Y_i(1))$')
In [6]:
plt.figure(figsize = (15,5))
plt.subplot(131)
plt.hist(Yi_train,bins = 30, color = "lightgrey")
plt.title("Histogram of $Y_i(W_i)$")
plt.subplot(132)
plt.hist(Y0_train,bins = 30, color = "lightgrey")
plt.title("Histogram of $Y_i(0)$")
plt.subplot(133)
plt.hist(Y1_train,bins = 30, color = "lightgrey")
plt.title("Histogram of $Y_i(1)$")
Out[6]:
Text(0.5, 1.0, 'Histogram of $Y_i(1)$')
In [7]:
IndexTest = 10
Y0x,Y1x = sampleY(X[IndexTest],1000)
plt.figure(figsize = (10,10))
plt.plot(Y0x,Y1x,"p", color = "grey",alpha = 0.2)
InfoTest = getInfoY(X[IndexTest])
plt.title("Samples of $\mathcal{L}((Y(0),Y(1)) | X =$"+str(X[IndexTest])+"$)$")
print(X[IndexTest])
print(InfoTest)
[-0.72820706 -1.47569508 -1.7427602   1.17488743  1.65302154  0.27001527]
{'mX0': 1.0, 'mX1': 1.5695361795205853, 'sigma00': 1.0, 'sigma11': 1.0789582530334885, 'sigma01': 0.9}
In [8]:
np.cov(Y0x,Y1x)
Out[8]:
array([[1.05382309, 0.93168275],
       [0.93168275, 1.08420357]])

Meta-learners for conditional distribution estimation.

Notation

A model trained on the training data $(\mathbf{X}_i,Y_i;\,1\leq i \leq N)$ is denoted by $$ \mathcal{M}_{\left\{\mathbf{X}_i\sim Y_i \;|\; i\in [N]\right\}}. $$ The associated prediction at a point $\mathbf{x}$ is denoted by $$ \mathbf{Pred}\left(\mathbf{x} \;|\; \mathcal{M}_{\left\{\mathbf{X}_i\sim Y_i \;|\; i\in [N]\right\}} \right). $$ This notation is closer to the conventions of Bayesian nonparametrics, emphasizing the randomness brought by the model itself. When the underlying model is Random Forest-based, we replace $\mathcal{M}$ by $\mathcal{T}$. Below we list the constructions of the T-, S- and X-learners.

T-learners:

Splitting the data into two subgroups according to $W_i$, we consider the estimator $\hat{\mu}_{\omega}^T$ of $\mu_{\omega}$ defined as follows: $$ \hat{\mu}_{\omega}^T(\mathbf{x}) :=\mathbf{Pred}\left(\mathbf{x} \;|\; \mathcal{M}_{\left\{\mathbf{X}_i\sim Y_i \;|\; W_i = \omega,\;i\in [N]\right\}}\right). $$ Then, the estimator $\hat{\tau}^T$ of $\tau$ is defined by $$ \hat{\tau}^T(\mathbf{x}) = \hat{\mu}_{1}^T(\mathbf{x}) - \hat{\mu}_{0}^T(\mathbf{x}). $$
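A minimal sketch of this construction, reusing the arrays X_train, W_train, Yi_train and X_test generated above and taking a Random Forest as the base learner (an assumption; any regressor works, and the cells further below build the same subgroups with np.extract):

from sklearn.ensemble import RandomForestRegressor

# one regressor per treatment arm
mu0_T = RandomForestRegressor(n_estimators=100).fit(X_train[W_train == 0], Yi_train[W_train == 0])
mu1_T = RandomForestRegressor(n_estimators=100).fit(X_train[W_train == 1], Yi_train[W_train == 1])

# T-learner CATE estimate: difference of the two conditional-mean predictions
tau_T = mu1_T.predict(X_test) - mu0_T.predict(X_test)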

S-learners:

Without specifying the role of $W_i$, the estimator $\hat{\mu}_{\omega}^S$ of $\mu_{\omega}$ is defined as follows: $$ \hat{\mu}_{\omega}^S(\mathbf{x}) := \mathbf{Pred}\left((\mathbf{x},\omega) \;|\; \mathcal{M}_{\left\{(\mathbf{X}_i,W_i)\sim Y_i \;|\; i\in [N]\right\}}\right). $$ Then, the estimator $\hat{\tau}^S$ of $\tau$ is defined by $$ \hat{\tau}^S(\mathbf{x}) = \hat{\mu}_{1}^S(\mathbf{x}) - \hat{\mu}_{0}^S(\mathbf{x}). $$
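A minimal sketch under the same assumptions: the treatment indicator is appended to the covariates, and the two predictions differ only in that coordinate.

from sklearn.ensemble import RandomForestRegressor

# a single regressor on the augmented covariates (X, W)
mu_S = RandomForestRegressor(n_estimators=100).fit(np.column_stack([X_train, W_train]), Yi_train)

# predict with W forced to 1 and to 0, then take the difference
tau_S = (mu_S.predict(np.column_stack([X_test, np.ones(len(X_test))]))
         - mu_S.predict(np.column_stack([X_test, np.zeros(len(X_test))])))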

X-learners:

By imputing the "missing" potential outcomes in the same spirit as T-learners, we define

$$ \hat{\tau}_{1}^X(\mathbf{x}) := \mathbf{Pred}\left(\mathbf{x} \;|\; \mathcal{M}_{\left\{\mathbf{X}_i \sim Y_i - \hat{\mu}_0^T(\mathbf{X}_i) \;|\; W_i = 1,\;i\in [N]\right\}}\right), $$

and

$$ \hat{\tau}_{0}^X(\mathbf{x}) :=\mathbf{Pred}\left(\mathbf{x} \;|\; \mathcal{M}_{\left\{\mathbf{X}_i \sim \hat{\mu}_1^T(\mathbf{X}_i) - Y_i \;|\; W_i = 0,\;i\in [N]\right\}}\right). $$

The final estimator $\hat{\tau}^X$ of $\tau$ is constructed by taking the convex combination of the two estimators defined above, that is

$$ \hat{\tau}^X(\mathbf{x}) := g(\mathbf{x})\hat{\tau}_1^X(\mathbf{x}) + (1-g(\mathbf{x}))\hat{\tau}_0^X(\mathbf{x}). $$

For the choice of $g(\mathbf{x})$, one may use an estimator $\hat{e}(\mathbf{x})$ of the propensity score, or simply take the constant $0$ or $1$, depending on the data.
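A minimal sketch under the same assumptions, reusing mu0_T and mu1_T from the T-learner sketch above and taking the constant weight $g \equiv 1/2$ for illustration:

from sklearn.ensemble import RandomForestRegressor

treated, control = (W_train == 1), (W_train == 0)

# imputed individual treatment effects on each subgroup
D1 = Yi_train[treated] - mu0_T.predict(X_train[treated])
D0 = mu1_T.predict(X_train[control]) - Yi_train[control]

# second-stage regressions of the imputed effects on the covariates
tau1_X = RandomForestRegressor(n_estimators=100).fit(X_train[treated], D1)
tau0_X = RandomForestRegressor(n_estimators=100).fit(X_train[control], D0)

# convex combination with g(x) = 1/2 (a propensity estimate could be used instead)
g = 0.5
tau_X = g*tau1_X.predict(X_test) + (1 - g)*tau0_X.predict(X_test)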

Stage 1: parametric models

The conditional mean estimators $\hat{\mu}_0^T$ and $\hat{\mu}_1^T$ are the first ingredients required for conditional distribution estimation. The estimators for the covariance matrix are constructed in the flavor of X-learners:

$$ \hat{\sigma}_{00}(\mathbf{x}) := \mathbf{Pred}\left(\mathbf{x}\;|\; \mathcal{M}_{\left\{\mathbf{X}_i \sim Y_i^2 \;|\; W_i = 0,\;i\in [N]\right\}}\right) - \hat{\mu}_0^T(\mathbf{x})^2 , $$

and

$$ \hat{\sigma}_{11}(\mathbf{x}) := \mathbf{Pred}\left(\mathbf{x}\;|\; \mathcal{M}_{\left\{\mathbf{X}_i \sim Y_i^2 \;|\; W_i = 1,\;i\in [N]\right\}}\right) - \hat{\mu}_1^T(\mathbf{x})^2 , $$

as well as

$$ \begin{aligned} & \hat{\sigma}_{01}(\mathbf{x}) \\ :=& g(\mathbf{x}) \mathbf{Pred}\left(\mathbf{x}\;|\; \mathcal{M}_{\left\{\mathbf{X}_i \sim \hat{\mu}_0^T(\mathbf{X}_i)Y_i \;|\; W_i = 1,\;i\in [N]\right\}}\right) \\& +(1-g(\mathbf{x})) \mathbf{Pred}\left(\mathbf{x}\;|\; \mathcal{M}_{\left\{\mathbf{X}_i \sim Y_i \hat{\mu}_1^T(\mathbf{X}_i) \;|\; W_i = 0,\;i\in [N]\right\}}\right) - \hat{\mu}_0^T(\mathbf{x}) \hat{\mu}_1^T(\mathbf{x}) , \end{aligned} $$

where $g(\mathbf{x})$ can be any $[0,1]$-valued function, which can be chosen as an estimator of the propensity score. For simplicity, we take $g(\mathbf{x})\equiv 1/2$.
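A brief motivation for these plug-in formulas: the diagonal terms rely on the identity $$ \sigma_{\omega\omega}(\mathbf{x}) = \mathbb{E}\left[Y(\omega)^2 \mid \mathbf{X}=\mathbf{x}\right] - \mu_{\omega}(\mathbf{x})^2, \qquad \omega\in\{0,1\}, $$ so regressing $Y_i^2$ on $\mathbf{X}_i$ within each treatment arm and subtracting the squared conditional-mean estimate targets the conditional variance. For the off-diagonal term, the unobserved potential outcome in the product $Y(0)Y(1)$ is replaced by its T-learner estimate, in the same plug-in spirit as the X-learner.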

In [9]:
# divide training data according to treatment assignment.

W_multi = np.transpose(np.array([W_train for i in range(d)])).reshape(N_train,d)

X_train_treated = np.extract(W_multi,X_train) 
X_train_treated = np.transpose(X_train_treated).reshape(np.sum(W_train),d)
Yi_train_treated = np.extract(W_train,Yi_train)

X_train_control = np.extract(1-W_multi,X_train) 
X_train_control = np.transpose(X_train_control).reshape(N_train - np.sum(W_train),d)
Yi_train_control = np.extract(1-W_train,Yi_train)

# test data

W_test_multi = np.transpose(np.array([W_test for i in range(d)])).reshape(N_test,d)
X_test_treated = np.extract(W_test_multi,X_test) 
X_test_treated = np.transpose(X_test_treated).reshape(np.sum(W_test),d)
Yi_test_treated = np.extract(W_test,Yi_test)

X_test_control = np.extract(1-W_test_multi,X_test) 
X_test_control = np.transpose(X_test_control).reshape(N_test - np.sum(W_test),d)
Yi_test_control = np.extract(1-W_test,Yi_test)
In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor 
from sklearn.metrics import mean_squared_error
#mu0T_hat = RandomForestRegressor(n_estimators = 100, min_samples_split=2)
#mu0T_hat = XGBRegressor(learning_rate = 0.1,
#                        objective = "reg:squarederror",
#                        n_estimators = 30)
mu0T_hat = AdaBoostRegressor(learning_rate = 0.1, n_estimators = 10)
mu0T_hat.fit(X_train_control, Yi_train_control)
#mu1T_hat = RandomForestRegressor(n_estimators = 100, min_samples_split=2)
#mu1T_hat = AdaBoostRegressor(learning_rate = 0.1, n_estimators = 10)
mu1T_hat = XGBRegressor(learning_rate = 0.1,
                        objective = "reg:squarederror",
                        n_estimators = 30)
mu1T_hat.fit(X_train_treated, Yi_train_treated)
Out[10]:
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=30,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)
In [11]:
## other choices for base learners
# from sklearn.linear_model import LinearRegression
# from sklearn.ensemble import AdaBoostRegressor
# mu0T_hat = AdaBoostRegressor()
# mu0T_hat.fit(X_train_control, Yi_train_control)
# mu1T_hat = AdaBoostRegressor()
# mu1T_hat.fit(X_train_treated, Yi_train_treated)

Illustration of the estimation of $\hat{\mu}_0^T$ and $\hat{\mu}_1^T$

We randomly choose 100 sample points in the treated and control groups of the test data, respectively, and compare the predictions of $Y(0)$ (resp. $Y(1)$) with the reference values.

Additional Info:

Evaluation

To evaluate the quality of prediction, we consider:

  • RMSE: root-mean-square error $$ \mathrm{RMSE} := \sqrt{\frac{1}{N}\sum_{i=1}^N (\mathrm{Pred}_i - \mathrm{Ref}_i)^2}. $$

  • Score: coefficient of determination, defined by $$ \mathrm{score} := 1 - \frac{R_{res}}{R_{tot}}, $$ where $R_{res}$ is the residual sum of squares and $R_{tot}$ the total sum of squares. (A short scikit-learn sketch of both metrics follows this list.)
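A minimal sketch of the two metrics, where y_ref and y_pred are hypothetical arrays of reference values and predictions:

from sklearn.metrics import mean_squared_error, r2_score

y_ref = np.array([1.0, 2.0, 3.0])   # hypothetical reference values
y_pred = np.array([1.1, 1.9, 3.2])  # hypothetical predictions
rmse = np.sqrt(mean_squared_error(y_ref, y_pred))  # root-mean-square error
score = r2_score(y_ref, y_pred)                    # coefficient of determination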

Visualization

To illustrate the quality of the estimation, we first draw a subsample of size 100 from the test data, then we plot respectively the predictions and the reference values.

Base-learners

For the choice of the base-learners, we mainly consider:

Forests-based
  • Random Forests:

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

  • Mondrian Forests:

https://scikit-garden.github.io/examples/MondrianTreeRegressor/

Boosting-based
  • AdaBoost:

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostRegressor.html

  • XGBoost:

https://xgboost.readthedocs.io/en/latest/tutorials/model.html

for parameters: https://xgboost.readthedocs.io/en/latest/parameter.html

In [12]:
plt.figure(figsize=(15,5))
RandomSubgroup = np.random.choice(range(np.sum(1-W_test)),100)
plt.plot(mu1T_hat.predict(X_test_control[RandomSubgroup]),color="grey",alpha=0.5,label="prediction")
plt.plot(
    np.apply_along_axis(mX1,1,X_test[np.extract(1-W_test,range(N_test))[RandomSubgroup]]),
    color="darkred",alpha=0.5,label="reference")
plt.title("Estimation of $Y(1)$ for control group")
plt.legend()
print("size of treated group in training data: {}, {:.1%}.".format(sum(W_train),sum(W_train)/N_train))
print("Score on the test dataset:",mu1T_hat.score(X_test, np.apply_along_axis(mX1,1,X_test)))
print("RMSE on the test dataset:", np.sqrt(mean_squared_error(np.apply_along_axis(mX1,1,X_test),mu1T_hat.predict(X_test))))
size of treated group in training data: 1966, 49.1%.
Score on the test dataset: 0.9809638846149588
RMSE on the test dataset: 0.4267775054935421
In [13]:
plt.figure(figsize=(15,5))
RandomSubgroup = np.random.choice(range(np.sum(W_test)),100)
plt.plot(mu0T_hat.predict(X_test_treated[RandomSubgroup]),color="grey",alpha=0.5,label="prediction")
plt.plot(
    np.apply_along_axis(mX0,1,X_test[np.extract(W_test,range(N_test))[RandomSubgroup]]),
    color="darkred",alpha=0.5,label="reference")
plt.title("Estimation of $Y(0)$ for treated group")
plt.legend()
print("size of treated group in training data: {}, {:.1%}.".format(sum(1-W_train),sum(1-W_train)/N_train))
print("Score on the test dataset:", mu0T_hat.score(X_test, np.apply_along_axis(mX0,1,X_test)))
print("RMSE on the test dataset:", np.sqrt(mean_squared_error(np.apply_along_axis(mX0,1,X_test),mu0T_hat.predict(X_test))))
size of control group in training data: 2034, 50.8%.
Score on the test dataset: 0.0
RMSE on the test dataset: 0.04314684780702415

Covariance matrix estimators

In [14]:
from sklearn.metrics import r2_score


#mu00_hat = RandomForestRegressor(n_estimators = 100, min_samples_split=2)
#mu00_hat = AdaBoostRegressor(learning_rate = 0.1, n_estimators = 20)
mu00_hat = XGBRegressor(learning_rate = 0.1,
                        n_estimators = 50,
                        objective = "reg:squarederror",
                       gamma=2.)
mu00_hat.fit(X_train_control, Yi_train_control**2)
def sigma00_hat(x):
    return mu00_hat.predict(x) - mu0T_hat.predict(x)**2
#mu11_hat = RandomForestRegressor(n_estimators = 100, min_samples_split=2)
#mu11_hat = AdaBoostRegressor(learning_rate = 0.1, n_estimators = 50)
mu11_hat = XGBRegressor(learning_rate = 0.1,
                        booster = "gbtree",
                        objective = "reg:squarederror",
                        n_estimators = 20,
                       gamma=1.,
                       subsample = 1.,
                       reg_lambda = 1.)
mu11_hat.fit(X_train_treated, Yi_train_treated**2)
def sigma11_hat(x):
    return mu11_hat.predict(x) - mu1T_hat.predict(x)**2
In [15]:
mu01_hat_0 = RandomForestRegressor(n_estimators = 100, min_samples_split=5)
#mu01_hat_0.fit(X_train_control,Yi_train_control*mu1T_hat.predict(X_train_control))
mu01_hat_0.fit(X_train_control,Yi_train_control*mu1T_hat.predict(X_train_control))
mu01_hat_1 = RandomForestRegressor(n_estimators = 100, min_samples_split=5)
#mu01_hat_1.fit(X_train_treated,Yi_train_treated*mu0T_hat.predict(X_train_treated))
mu01_hat_1.fit(X_train_treated,Yi_train_treated*mu0T_hat.predict(X_train_treated))
def sigma01_hat(x):
    #return mu01_hat_0.predict(x)*(1-np.apply_along_axis(e_func, 1, x)) + mu01_hat_1.predict(x)*np.apply_along_axis(e_func, 1, x)
    return mu01_hat_0.predict(x)*.5 + mu01_hat_1.predict(x)*.5 - mu0T_hat.predict(x)*mu1T_hat.predict(x)
In [16]:
plt.figure(figsize = (15,5))
RandomSubgroup = np.random.choice(range(N_test),100)
plt.plot(np.apply_along_axis(sigma00, 1, X_test[RandomSubgroup]),color="darkred",alpha=0.5,label="reference")
plt.plot(sigma00_hat(X_test[RandomSubgroup]),color="grey",alpha=0.5,label="prediction")
plt.title("Estimation of $\sigma_{00}(\mathbf{x})$")
plt.legend()
print("score:", r2_score(sigma00_hat(X_test),np.apply_along_axis(sigma00, 1, X_test)))
print("RMSE on the test dataset:", np.sqrt(mean_squared_error(np.apply_along_axis(sigma00,1,X_test),sigma00_hat(X_test))))
score: -0.17280967980716433
RMSE on the test dataset: 0.80172318115857
In [17]:
plt.figure(figsize = (15,5))
RandomSubgroup = np.random.choice(range(N_test),100)
plt.plot(np.apply_along_axis(sigma11, 1, X_test[RandomSubgroup]),color="darkred",alpha=0.5,label="reference")
plt.plot(sigma11_hat(X_test[RandomSubgroup]),color="grey",alpha=0.5,label="prediction")
plt.title("Estimation of $\sigma_{11}(\mathbf{x})$")
plt.legend()
print("score:", r2_score(sigma11_hat(X_test),np.apply_along_axis(sigma11, 1, X_test)))
print("RMSE on the test dataset:", np.sqrt(mean_squared_error(np.apply_along_axis(sigma11,1,X_test),sigma11_hat(X_test))))
score: -0.33208254994340014
RMSE on the test dataset: 3.320856544220381
In [18]:
plt.figure(figsize = (15,5))
RandomSubgroup = np.random.choice(range(N_test),100)
plt.plot(np.apply_along_axis(sigma01, 1, X_test[RandomSubgroup]),color="darkred",alpha=0.5,label="reference")
plt.plot(sigma01_hat(X_test[RandomSubgroup]),color="grey",alpha=0.5,label="prediction")
plt.title("Estimation of $\sigma_{01}(\mathbf{x})$")
plt.legend()
print("score:", r2_score(sigma01_hat(X_test),np.apply_along_axis(sigma01, 1, X_test)))
print("RMSE on the test dataset:", np.sqrt(mean_squared_error(np.apply_along_axis(sigma01,1,X_test),sigma01_hat(X_test))))
score: -0.5913874033780999
RMSE on the test dataset: 1.3488601456961298

Comments

We can see that even in the case where the covariance matrix is constant, these estimators fail to provide reliable estimates. This is due to the propagation of errors: coupling Meta-learners in order to extract more information is risky, since the errors of the first-stage estimators accumulate.

Any ideas on how to construct covariance estimators in this Gaussian case?

Test with Mondrian Forests

In [19]:
from skgarden import MondrianForestRegressor, MondrianTreeRegressor

mu0T_hat = MondrianForestRegressor(n_estimators = 100, min_samples_split=2)
mu0T_hat.fit(X_train_control, Yi_train_control)

mu1T_hat = MondrianForestRegressor(n_estimators = 100, min_samples_split=2)
mu1T_hat.fit(X_train_treated, Yi_train_treated)
/home/qiming/.local/lib/python3.7/site-packages/sklearn/externals/joblib/__init__.py:15: DeprecationWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.
  warnings.warn(msg, category=DeprecationWarning)
/home/qiming/.local/lib/python3.7/site-packages/sklearn/externals/six.py:31: DeprecationWarning: The module is deprecated in version 0.21 and will be removed in version 0.23 since we've dropped support for Python 2.7. Please rely on the official version of six (https://pypi.org/project/six/).
  "(https://pypi.org/project/six/).", DeprecationWarning)
Out[19]:
MondrianForestRegressor(bootstrap=False, max_depth=None, min_samples_split=2,
                        n_estimators=100, n_jobs=1, random_state=None,
                        verbose=0)
In [20]:
plt.figure(figsize=(15,5))
RandomSubgroup = np.random.choice(range(np.sum(1-W_test)),100)
plt.plot(mu1T_hat.predict(X_test_control[RandomSubgroup]),color="grey",alpha=0.5,label="prediction")
plt.plot(
    np.apply_along_axis(mX1,1,X_test[np.extract(1-W_test,range(N_test))[RandomSubgroup]]),
    color="darkred",alpha=0.5,label="reference")
plt.title("Estimation of $Y(1)$ for control group")
plt.legend()
print("size of treated group in training data: {}, {:.1%}.".format(sum(W_train),sum(W_train)/N_train))
print(mu1T_hat.score(X_test, np.apply_along_axis(mX1,1,X_test)))
size of treated group in training data: 1966, 49.1%.
0.8911961289343913
In [21]:
plt.figure(figsize=(15,5))
RandomSubgroup = np.random.choice(range(np.sum(W_test)),100)
plt.plot(mu0T_hat.predict(X_test_treated[RandomSubgroup]),color="grey",alpha=0.5,label="prediction")
plt.plot(
    np.apply_along_axis(mX0,1,X_test[np.extract(W_test,range(N_test))[RandomSubgroup]]),
    color="darkred",alpha=0.5,label="reference")
plt.title("Estimation of $Y(0)$ for treated group")
plt.legend()
print("size of treated group in training data: {}, {:.1%}.".format(sum(1-W_train),sum(1-W_train)/N_train))
print("Score on the test dataset:", mu0T_hat.score(X_test, np.apply_along_axis(mX0,1,X_test)))
size of control group in training data: 2034, 50.8%.
Score on the test dataset: 0.0

Forest-based conditional distribution estimation

In [22]:
from sklearn.tree import DecisionTreeRegressor
In [23]:
def samplerY0x(x,n_estimators,N):
    # build an empirical distribution of single-forest predictions at x,
    # each forest being fitted with bootstrap on the control group
    Y0x_empirical = np.zeros(n_estimators)
    for i in range(n_estimators):
        #nu0T_hat = DecisionTreeRegressor()
        #nu0T_hat = RandomForestRegressor(n_estimators=1, bootstrap=False)
        nu0T_hat = MondrianForestRegressor(n_estimators=1, bootstrap=True)
        nu0T_hat.fit(X_train_control, Yi_train_control)
        
        # bootstrap manually
        # BootstrapSize = 10000 
        # N_train_0 = sum(1-W_train)
        # nu0T_hat.fit(X_train_control[np.random.choice(range(N_train_0),BootstrapSize)], Yi_train_control[np.random.choice(range(N_train_0),BootstrapSize)])
        Y0x_empirical[i] = nu0T_hat.predict(x.reshape(1,d))
    Y0x_sample = np.zeros(N)  # only used by the commented smoothing variant below
    # for i in range(N):
    #     m = np.random.choice(Y0x_empirical,1)
    #     Y0x_sample[i] = np.random.normal(m,1.5,1)
    #     
    # return Y0x_sample
    # resample N points from the empirical distribution of predictions
    return np.random.choice(Y0x_empirical,N)
        
def samplerY1x(x,n_estimators,N):
    # same construction on the treated group
    Y1x_empirical = np.zeros(n_estimators)
    for i in range(n_estimators):
        nu1T_hat = MondrianForestRegressor(n_estimators = 1, bootstrap=True)
        nu1T_hat.fit(X_train_treated, Yi_train_treated)
        Y1x_empirical[i] = nu1T_hat.predict(x.reshape(1,d))
    return np.random.choice(Y1x_empirical,N)
In [24]:
TestIndex = 15 
y0 = samplerY0x(X_test[TestIndex],20,1000)
y1 = samplerY1x(X_test[TestIndex],20,1000)
In [25]:
plt.figure(figsize = (10,5))
plt.subplot(121)
plt.hist(y0,bins =30, color="lightgrey")
plt.title("Hist of Y(0) | X = x")
plt.subplot(122)
plt.hist(y1,bins =30, color="lightgrey")
plt.title("Hist of Y(1) | X = x")
Out[25]:
Text(0.5, 1.0, 'Hist of Y(1) | X = x')
In [26]:
getInfoY(X_test[TestIndex])
Out[26]:
{'mX0': 1.0,
 'mX1': 4.892827337359855,
 'sigma00': 1.0,
 'sigma11': 3.0,
 'sigma01': 0.9}
In [27]:
np.mean(y0)
Out[27]:
1.1836145285563544
In [28]:
np.var(y0)
Out[28]:
0.7183969863114369
In [29]:
np.mean(y1)
Out[29]:
4.642889865338803
In [30]:
np.var(y1)
Out[30]:
3.136919068070546
In [31]:
gc.collect()
Out[31]:
12445

Drafts

In [32]:
X = np.random.uniform(0,1,(5000,5))
#Y = np.random.normal(0,1.,5000) +np.sin(X[:,1]+X[:,3]) +X[:,2]**2
Y = np.random.normal(0,1.,5000) + 1
In [33]:
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor 
reg = AdaBoostRegressor(learning_rate = 0.1, n_estimators = 10)
#reg = XGBRegressor(n_estimators = 20) 
#reg = RandomForestRegressor(n_estimators = 100)
reg.fit(X[0:4000,:],Y[0:4000])
Out[33]:
AdaBoostRegressor(base_estimator=None, learning_rate=0.1, loss='linear',
                  n_estimators=10, random_state=None)
In [34]:
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(np.zeros(1000)+1.,reg.predict(X[4000:])))
Out[34]:
0.06853218806163983
In [35]:
np.mean(reg.predict(X[4000:]))
Out[35]:
1.0161601788924157
In [36]:
np.mean(Y[4000:])
Out[36]:
1.019043940319142
In [ ]: