import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("whitegrid")
import gc # memory cleaning
import os
import psutil
process = psutil.Process(os.getpid())
print("Memory usage:", process.memory_info().rss/1024/1024,"MB")
We consider the following setting:
$\mathbf{X}\sim \mathcal{N}(\mathbf{0}, \mathbf{Id}_{d\times d})$ or $\mathrm{Uniform}([0,1]^d)$
$(Y(0),Y(1)) \sim \mathcal{N}(m(\mathbf{X}),\Sigma(\mathbf{X}))$
$W \sim \mathrm{Bernoulli}(e(\mathbf{X}))$
We assume that, given $\mathbf{X}$, $(Y(0),Y(1))$ is a 2-dimensional Gaussian random vector. More precisely, let $(G_0,G_1,G_2)$ be 3 independent standard Gaussians, and consider $$ Y(0) := \sqrt{\sigma_{01}(\mathbf{X})}\,G_0 + \sqrt{\sigma_{00}(\mathbf{X}) - \sigma_{01}(\mathbf{X})}\, G_1 + m(\mathbf{X})(0), $$ and $$ Y(1) := \sqrt{\sigma_{01}(\mathbf{X})}\,G_0 + \sqrt{\sigma_{11}(\mathbf{X}) - \sigma_{01}(\mathbf{X})}\, G_2 + m(\mathbf{X})(1). $$ Since $G_0$ is the only source of randomness shared by $Y(0)$ and $Y(1)$, it is readily checked that $$ m(\mathbf{X}) = \big(m(\mathbf{X})(0),\, m(\mathbf{X})(1)\big), $$ and $$ \Sigma(\mathbf{X}) = \begin{pmatrix} \sigma_{00}(\mathbf{X}) & \sigma_{01}(\mathbf{X}) \\ \sigma_{01}(\mathbf{X}) & \sigma_{11}(\mathbf{X}) \end{pmatrix}. $$
According to this construction, the covariates $X^{(0)}$ and $X^{(5)}$ are irrelevant noise w.r.t. the outcomes $(Y(0),Y(1))$. We also remark that this setting is not covered by the usual modelling assumptions of meta-learners.
def e_func(x):
    # propensity score e(x)
    return np.sin(x[1]*x[2]+x[3])*.5+.5

def mX0(x):
    # conditional mean of Y(0)
    #return x[3]+x[2]+x[1]
    return 1.

def mX1(x):
    # conditional mean of Y(1)
    return np.sin(x[4])+3.*x[1] +5.
    #return 5.

def sigma00(x):
    # conditional variance of Y(0), clipped to [1, 3]
    return max(min(np.abs(x[1]+x[4])*1.9,3.),1.)
    #return 1.

def sigma11(x):
    # conditional variance of Y(1), clipped to [1, 3]
    return max(min(np.abs(x[2]+x[3])*1.9,3.),1.)
    #return 1

def sigma01(x):
    # conditional covariance between Y(0) and Y(1); constant here
    #return min(sigma00(x),sigma11(x))*0.5
    #return np.abs(np.sin(x[1]+2*x[2]+3*x[4]))*.4
    return 0.9
N_total = 5000
N_train = 4000
N_test = N_total - N_train
d = 6
X = np.random.normal(0,1,(N_total,d))
#X = np.random.uniform(0,1,(N_total,d))
e = np.apply_along_axis(e_func, 1, X)
W = np.random.binomial(size=N_total, n=1, p= e)
# instrumental gaussians
G = np.random.normal(0,1,(N_total,3))
# generate (Y(0),Y(1)) following the construction above
m0, m1 = np.apply_along_axis(mX0, 1, X), np.apply_along_axis(mX1, 1, X)
s00, s11, s01 = [np.apply_along_axis(f, 1, X) for f in (sigma00, sigma11, sigma01)]
Y0 = m0 + np.sqrt(s01)*G[:,0] + np.sqrt(s00 - s01)*G[:,1]
Y1 = m1 + np.sqrt(s01)*G[:,0] + np.sqrt(s11 - s01)*G[:,2]
Yi = W*Y1 + (1-W)*Y0
## split train/test data
X_train = X[0:N_train]
X_test = X[N_train:]
Y0_train = Y0[0:N_train]
Y0_test = Y0[N_train:]
Y1_train = Y1[0:N_train]
Y1_test = Y1[N_train:]
Yi_train = Yi[0:N_train]
Yi_test = Yi[N_train:]
W_train = W[0:N_train]
W_test = W[N_train:]
# check memory usage
process = psutil.Process(os.getpid())
print(process.memory_info().rss/1024/1024,"MB")
# get true values/samples of L((Y(0),Y(1)) | X = x)
def sampleY(x, N):
    G = np.random.normal(0,1,(N,3))
    Y0x = mX0(x) + np.sqrt(sigma01(x))*G[:,0] + np.sqrt(sigma00(x)-sigma01(x))*G[:,1]
    Y1x = mX1(x) + np.sqrt(sigma01(x))*G[:,0] + np.sqrt(sigma11(x)-sigma01(x))*G[:,2]
    return Y0x, Y1x

def getInfoY(x):
    return {'mX0': mX0(x),
            'mX1': mX1(x),
            'sigma00': sigma00(x),
            'sigma11': sigma11(x),
            'sigma01': sigma01(x)}
# Viz of (Y(0),Y(1))
plt.figure(figsize = (10,5))
plt.subplot(121)
plt.plot(Y0_train[:1000],Y1_train[:1000],"p",alpha = 0.1, color = "grey")
plt.title("Points cloud of $(Y_i(0),Y_i(1))$")
plt.subplot(122)
sns.kdeplot(x=Y0_train[:1000], y=Y1_train[:1000], fill=True, cbar=True)
plt.title("KDE plot of $(Y_i(0),Y_i(1))$")
plt.figure(figsize = (15,5))
plt.subplot(131)
plt.hist(Yi_train,bins = 30, color = "lightgrey")
plt.title("Histogram of $Y_i(W_i)$")
plt.subplot(132)
plt.hist(Y0_train,bins = 30, color = "lightgrey")
plt.title("Histogram of $Y_i(0)$")
plt.subplot(133)
plt.hist(Y1_train,bins = 30, color = "lightgrey")
plt.title("Histogram of $Y_i(1)$")
IndexTest = 10
Y0x,Y1x = sampleY(X[IndexTest],1000)
plt.figure(figsize = (10,10))
plt.plot(Y0x,Y1x,"p", color = "grey",alpha = 0.2)
InfoTest = getInfoY(X[IndexTest])
plt.title("Samples of $\mathcal{L}((Y(0),Y(1)) | X =$"+str(X[IndexTest])+"$)$")
print(X[IndexTest])
print(InfoTest)
np.cov(Y0x,Y1x)
A model trained on the training data $(\mathbf{X}_i,Y_i;1\leq i \leq N)$ is denoted by $$ \mathcal{M}_{\left\{\mathbf{X}_i\sim Y_i \;|\; i\in [N]\right\}}. $$ The associated prediction at a point $\mathbf{x}$ is denoted by $$ \mathbf{Pred}\left(\mathbf{x} \;|\; \mathcal{M}_{\left\{\mathbf{X}_i\sim Y_i \;|\; i\in [N]\right\}} \right). $$ This notation is more compatible with models from the Bayesian nonparametric literature, as it emphasizes the randomness brought by the model itself. When the underlying model is Random Forests-based, we replace $\mathcal{M}$ by $\mathcal{T}$. Below we list the constructions of the T-, S- and X-learners.
Splitting the training data into two subgroups according to $W_i$, we consider the estimator $\hat{\mu}_{\omega}^T$ of $\mu_{\omega}$ defined as follows: $$ \hat{\mu}_{\omega}^T(\mathbf{x}) :=\mathbf{Pred}\left(\mathbf{x} \;|\; \mathcal{M}_{\left\{\mathbf{X}_i\sim Y_i \;|\; W_i = \omega,\;i\in [N]\right\}}\right). $$ Then, the estimator $\hat{\tau}^T$ of $\tau$ is defined by $$ \hat{\tau}^T(\mathbf{x}) = \hat{\mu}_{1}^T(\mathbf{x}) - \hat{\mu}_{0}^T(\mathbf{x}). $$
Treating $W_i$ as an additional covariate rather than giving it a special role, the estimator $\hat{\mu}_{\omega}^S$ of $\mu_{\omega}$ is defined as follows: $$ \hat{\mu}_{\omega}^S(\mathbf{x}) := \mathbf{Pred}\left((\mathbf{x},\omega) \;|\; \mathcal{M}_{\left\{(\mathbf{X}_i,W_i)\sim Y_i \;|\; i\in [N]\right\}}\right). $$ Then, the estimator $\hat{\tau}^S$ of $\tau$ is defined by $$ \hat{\tau}^S(\mathbf{x}) = \hat{\mu}_{1}^S(\mathbf{x}) - \hat{\mu}_{0}^S(\mathbf{x}). $$
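A minimal sketch of the S-learner under this notation (an illustration only, not the implementation used below; we assume a RandomForestRegressor base learner and the training arrays X_train, W_train, Yi_train defined above, and the helper names fit_S and tau_S are ours):
# Hedged S-learner sketch: treat W as an extra covariate of a single model.
# Assumptions: RandomForestRegressor as base learner; X_train, W_train, Yi_train as above.
import numpy as np
from sklearn.ensemble import RandomForestRegressor
def fit_S(X_tr, W_tr, Y_tr):
    XW = np.column_stack([X_tr, W_tr])           # append the treatment indicator
    return RandomForestRegressor(n_estimators=100).fit(XW, Y_tr)
def tau_S(model, x):
    x = np.atleast_2d(x)
    x1 = np.column_stack([x, np.ones(len(x))])   # prediction with W forced to 1
    x0 = np.column_stack([x, np.zeros(len(x))])  # prediction with W forced to 0
    return model.predict(x1) - model.predict(x0)
# Example usage: S_model = fit_S(X_train, W_train, Yi_train); tau_hat = tau_S(S_model, X_test)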
By estimating the "missing" potential outcomes in the same spirit as the T-learner, we define
$$ \hat{\tau}_{1}^X(\mathbf{x}) := \mathbf{Pred}\left(\mathbf{x} \;|\; \mathcal{M}_{\left\{\mathbf{X}_i \sim Y_i - \hat{\mu}_0^T(\mathbf{X}_i) \;|\; W_i = 1,\;i\in [N]\right\}}\right), $$and
$$ \hat{\tau}_{0}^X(\mathbf{x}) :=\mathbf{Pred}\left(\mathbf{x} \;|\; \mathcal{M}_{\left\{\mathbf{X}_i \sim \hat{\mu}_1^T(\mathbf{X}_i) - Y_i \;|\; W_i = 0,\;i\in [N]\right\}}\right). $$The final estimator $\hat{\tau}^X$ of $\tau$ is constructed by taking the convex combination of the two estimators defined above, that is
$$ \hat{\tau}^X(\mathbf{x}) := g(\mathbf{x})\hat{\tau}_1^X(\mathbf{x}) + (1-g(\mathbf{x}))\hat{\tau}_0^X(\mathbf{x}). $$For the choice of $g(\mathbf{x})$, one may use an estimator $\hat{e}(\mathbf{x})$ of the propensity score, or simply take $0$ or $1$ depending on the data.
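A minimal sketch of the X-learner just described (again an illustration rather than this notebook's implementation; we assume random forests as base learners, fix g(x) = 1/2, and redo the first-stage T-learner fits so the cell is self-contained):
# Hedged X-learner sketch with g(x) = 1/2.
# Assumptions: RandomForestRegressor everywhere; X_train, W_train, Yi_train as above.
import numpy as np
from sklearn.ensemble import RandomForestRegressor
treated = (W_train == 1)
mu0_rf = RandomForestRegressor(n_estimators=100).fit(X_train[~treated], Yi_train[~treated])
mu1_rf = RandomForestRegressor(n_estimators=100).fit(X_train[treated], Yi_train[treated])
# impute the individual treatment effects on each subgroup
D1 = Yi_train[treated] - mu0_rf.predict(X_train[treated])      # treated units
D0 = mu1_rf.predict(X_train[~treated]) - Yi_train[~treated]    # control units
tau1_rf = RandomForestRegressor(n_estimators=100).fit(X_train[treated], D1)
tau0_rf = RandomForestRegressor(n_estimators=100).fit(X_train[~treated], D0)
def tau_X(x, g=0.5):
    x = np.atleast_2d(x)
    return g*tau1_rf.predict(x) + (1.-g)*tau0_rf.predict(x)    # convex combination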
Since the conditional distribution of $(Y(0),Y(1))$ given $\mathbf{X}$ is Gaussian, its estimation only requires the conditional mean estimators $\hat{\mu}_0^T$ and $\hat{\mu}_1^T$ together with an estimator of the covariance matrix. The latter is constructed in the same spirit as the X-learner:
$$ \hat{\sigma}_{00}(\mathbf{x}) := \mathbf{Pred}\left(\mathbf{x}\;|\; \mathcal{M}_{\left\{\mathbf{X}_i \sim Y_i^2 \;|\; W_i = 0,\;i\in [N]\right\}}\right) - \hat{\mu}_0^T(\mathbf{x})^2 , $$and
$$ \hat{\sigma}_{11}(\mathbf{x}) := \mathbf{Pred}\left(\mathbf{x}\;|\; \mathcal{M}_{\left\{\mathbf{X}_i \sim Y_i^2 \;|\; W_i = 1,\;i\in [N]\right\}}\right) - \hat{\mu}_1^T(\mathbf{x})^2 , $$as well as
$$ \begin{aligned} & \hat{\sigma}_{01}(\mathbf{x}) \\ :=& g(\mathbf{x}) \mathbf{Pred}\left(\mathbf{x}\;|\; \mathcal{M}_{\left\{\mathbf{X}_i \sim \hat{\mu}_0^T(\mathbf{X}_i)Y_i \;|\; W_i = 1,\;i\in [N]\right\}}\right) \\& +(1-g(\mathbf{x})) \mathbf{Pred}\left(\mathbf{x}\;|\; \mathcal{M}_{\left\{\mathbf{X}_i \sim Y_i \hat{\mu}_1^T(\mathbf{X}_i) \;|\; W_i = 0,\;i\in [N]\right\}}\right) - \hat{\mu}_0^T(\mathbf{x}) \hat{\mu}_1^T(\mathbf{x}) , \end{aligned} $$where $g(\mathbf{x})$ can be any $[0,1]$-valued function, which can be chosen as an estimator of the propensity score. For simplicity, we take $g(\mathbf{x})\equiv 1/2$.
# divide training data according to treatment assignment (boolean indexing)
treated_train = (W_train == 1)
X_train_treated = X_train[treated_train]
Yi_train_treated = Yi_train[treated_train]
X_train_control = X_train[~treated_train]
Yi_train_control = Yi_train[~treated_train]
# test data
treated_test = (W_test == 1)
X_test_treated = X_test[treated_test]
Yi_test_treated = Yi_test[treated_test]
X_test_control = X_test[~treated_test]
Yi_test_control = Yi_test[~treated_test]
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
#mu0T_hat = RandomForestRegressor(n_estimators = 100, min_samples_split=2)
#mu0T_hat = XGBRegressor(learning_rate = 0.1,
# objective = "reg:squarederror",
# n_estimators = 30)
mu0T_hat = AdaBoostRegressor(learning_rate = 0.1, n_estimators = 10)
mu0T_hat.fit(X_train_control, Yi_train_control)
#mu1T_hat = RandomForestRegressor(n_estimators = 100, min_samples_split=2)
#mu1T_hat = AdaBoostRegressor(learning_rate = 0.1, n_estimators = 10)
mu1T_hat = XGBRegressor(learning_rate = 0.1,
objective = "reg:squarederror",
n_estimators = 30)
mu1T_hat.fit(X_train_treated, Yi_train_treated)
## other choices for base learners
# from sklearn.linear_model import LinearRegression
# from sklearn.ensemble import AdaBoostRegressor
# mu0T_hat = AdaBoostRegressor()
# mu0T_hat.fit(X_train_control, Yi_train_control)
# mu1T_hat = AdaBoostRegressor()
# mu1T_hat.fit(X_train_treated, Yi_train_treated)
We randomly choose 100 sample points from the treated and control groups of the test data, and compare the predictions of $Y(0)$ (resp. $Y(1)$) with the reference values.
To evaluate the quality of prediction, we consider:
RMSE: root-mean-square error $$ \mathrm{RMSE} := \sqrt{\frac{1}{N}\sum_{i=1}^N (\mathrm{Pred}_i - \mathrm{Ref}_i)^2}. $$
Score: coefficient of determination $$ \mathrm{score} := 1 - \frac{R_{\mathrm{res}}}{R_{\mathrm{tot}}}, \qquad R_{\mathrm{res}} := \sum_{i=1}^N (\mathrm{Ref}_i - \mathrm{Pred}_i)^2, \quad R_{\mathrm{tot}} := \sum_{i=1}^N (\mathrm{Ref}_i - \overline{\mathrm{Ref}})^2. $$
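As a quick illustrative check (the arrays below are placeholders, not data from this notebook), these two definitions agree with the scikit-learn functions used later:
# Sanity check of the definitions above against sklearn (placeholder arrays).
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
ref  = np.array([1.0, 2.0, 3.0, 4.0])
pred = np.array([1.1, 1.9, 3.2, 3.8])
rmse = np.sqrt(np.mean((pred - ref)**2))          # RMSE as defined above
R_res = np.sum((ref - pred)**2)
R_tot = np.sum((ref - ref.mean())**2)
score = 1. - R_res/R_tot                          # coefficient of determination
assert np.isclose(rmse, np.sqrt(mean_squared_error(ref, pred)))
assert np.isclose(score, r2_score(ref, pred))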
To illustrate the quality of estimation, we first draw a subsample of size 100 from the test data, then we plot respectively the predictions and the reference values.
For the choice of the base-learners, we mainly consider:
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
https://scikit-garden.github.io/examples/MondrianTreeRegressor/
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostRegressor.html
https://xgboost.readthedocs.io/en/latest/tutorials/model.html
for parameters: https://xgboost.readthedocs.io/en/latest/parameter.html
plt.figure(figsize=(15,5))
RandomSubgroup = np.random.choice(range(np.sum(1-W_test)),100)
plt.plot(mu1T_hat.predict(X_test_control[RandomSubgroup]),color="grey",alpha=0.5,label="prediction")
plt.plot(
np.apply_along_axis(mX1,1,X_test[np.extract(1-W_test,range(N_test))[RandomSubgroup]]),
color="darkred",alpha=0.5,label="reference")
plt.title("Estimation of $Y(1)$ for control group")
plt.legend()
print("size of treated group in training data: {}, {:.1%}.".format(sum(W_train),sum(W_train)/N_train))
print("Score on the test dataset:",mu1T_hat.score(X_test, np.apply_along_axis(mX1,1,X_test)))
print("RMSE on the test dataset:", np.sqrt(mean_squared_error(np.apply_along_axis(mX1,1,X_test),mu1T_hat.predict(X_test))))
plt.figure(figsize=(15,5))
RandomSubgroup = np.random.choice(range(np.sum(W_test)),100)
plt.plot(mu0T_hat.predict(X_test_treated[RandomSubgroup]),color="grey",alpha=0.5,label="prediction")
plt.plot(
np.apply_along_axis(mX0,1,X_test[np.extract(W_test,range(N_test))[RandomSubgroup]]),
color="darkred",alpha=0.5,label="reference")
plt.title("Estimation of $Y(0)$ for treated group")
plt.legend()
print("size of treated group in training data: {}, {:.1%}.".format(sum(1-W_train),sum(1-W_train)/N_train))
print("Score on the test dataset:", mu0T_hat.score(X_test, np.apply_along_axis(mX0,1,X_test)))
print("RMSE on the test dataset:", np.sqrt(mean_squared_error(np.apply_along_axis(mX0,1,X_test),mu0T_hat.predict(X_test))))
from sklearn.metrics import r2_score
#mu00_hat = RandomForestRegressor(n_estimators = 100, min_samples_split=2)
#mu00_hat = AdaBoostRegressor(learning_rate = 0.1, n_estimators = 20)
mu00_hat = XGBRegressor(learning_rate = 0.1,
n_estimators = 50,
objective = "reg:squarederror",
gamma=2.)
mu00_hat.fit(X_train_control, Yi_train_control**2)
def sigma00_hat(x):
    return mu00_hat.predict(x) - mu0T_hat.predict(x)**2
#mu11_hat = RandomForestRegressor(n_estimators = 100, min_samples_split=2)
#mu11_hat = AdaBoostRegressor(learning_rate = 0.1, n_estimators = 50)
mu11_hat = XGBRegressor(learning_rate = 0.1,
booster = "gbtree",
objective = "reg:squarederror",
n_estimators = 20,
gamma=1.,
subsample = 1.,
reg_lambda = 1.)
mu11_hat.fit(X_train_treated, Yi_train_treated**2)
def sigma11_hat(x):
    return mu11_hat.predict(x) - mu1T_hat.predict(x)**2
mu01_hat_0 = RandomForestRegressor(n_estimators = 100, min_samples_split=5)
#mu01_hat_0.fit(X_train_control,Yi_train_control*mu1T_hat.predict(X_train_control))
mu01_hat_0.fit(X_train_control,Yi_train_control*mu1T_hat.predict(X_train_control))
mu01_hat_1 = RandomForestRegressor(n_estimators = 100, min_samples_split=5)
#mu01_hat_1.fit(X_train_treated,Yi_train_treated*mu0T_hat.predict(X_train_treated))
mu01_hat_1.fit(X_train_treated,Yi_train_treated*mu0T_hat.predict(X_train_treated))
def sigma01_hat(x):
    #return mu01_hat_0.predict(x)*(1-np.apply_along_axis(e_func, 1, x)) + mu01_hat_1.predict(x)*np.apply_along_axis(e_func, 1, x)
    return mu01_hat_0.predict(x)*.5 + mu01_hat_1.predict(x)*.5 - mu0T_hat.predict(x)*mu1T_hat.predict(x)
plt.figure(figsize = (15,5))
RandomSubgroup = np.random.choice(range(N_test),100)
plt.plot(np.apply_along_axis(sigma00, 1, X_test[RandomSubgroup]),color="darkred",alpha=0.5,label="reference")
plt.plot(sigma00_hat(X_test[RandomSubgroup]),color="grey",alpha=0.5,label="prediction")
plt.title("Estimation of $\sigma_{00}(\mathbf{x})$")
plt.legend()
print("score:", r2_score(sigma00_hat(X_test),np.apply_along_axis(sigma00, 1, X_test)))
print("RMSE on the test dataset:", np.sqrt(mean_squared_error(np.apply_along_axis(sigma00,1,X_test),sigma00_hat(X_test))))
plt.figure(figsize = (15,5))
RandomSubgroup = np.random.choice(range(N_test),100)
plt.plot(np.apply_along_axis(sigma11, 1, X_test[RandomSubgroup]),color="darkred",alpha=0.5,label="reference")
plt.plot(sigma11_hat(X_test[RandomSubgroup]),color="grey",alpha=0.5,label="prediction")
plt.title("Estimation of $\sigma_{11}(\mathbf{x})$")
plt.legend()
print("score:", r2_score(sigma11_hat(X_test),np.apply_along_axis(sigma11, 1, X_test)))
print("RMSE on the test dataset:", np.sqrt(mean_squared_error(np.apply_along_axis(sigma11,1,X_test),sigma11_hat(X_test))))
plt.figure(figsize = (15,5))
RandomSubgroup = np.random.choice(range(N_test),100)
plt.plot(np.apply_along_axis(sigma01, 1, X_test[RandomSubgroup]),color="darkred",alpha=0.5,label="reference")
plt.plot(sigma01_hat(X_test[RandomSubgroup]),color="grey",alpha=0.5,label="prediction")
plt.title("Estimation of $\sigma_{01}(\mathbf{x})$")
plt.legend()
print("score:", r2_score(sigma01_hat(X_test),np.apply_along_axis(sigma01, 1, X_test)))
print("RMSE on the test dataset:", np.sqrt(mean_squared_error(np.apply_along_axis(sigma01,1,X_test),sigma01_hat(X_test))))
We can see that even in the case where the covariance matrix is constant, these estimators fail to provide reliable estimates. This is due to error propagation: chaining meta-learners to extract more information is risky.
Any ideas on how to construct covariance estimators in this Gaussian case?
from skgarden import MondrianForestRegressor, MondrianTreeRegressor
mu0T_hat = MondrianForestRegressor(n_estimators = 100, min_samples_split=2)
mu0T_hat.fit(X_train_control, Yi_train_control)
mu1T_hat = MondrianForestRegressor(n_estimators = 100, min_samples_split=2)
mu1T_hat.fit(X_train_treated, Yi_train_treated)
plt.figure(figsize=(15,5))
RandomSubgroup = np.random.choice(range(np.sum(1-W_test)),100)
plt.plot(mu1T_hat.predict(X_test_control[RandomSubgroup]),color="grey",alpha=0.5,label="prediction")
plt.plot(
np.apply_along_axis(mX1,1,X_test[np.extract(1-W_test,range(N_test))[RandomSubgroup]]),
color="darkred",alpha=0.5,label="reference")
plt.title("Estimation of $Y(1)$ for control group")
plt.legend()
print("size of treated group in training data: {}, {:.1%}.".format(sum(W_train),sum(W_train)/N_train))
print("Score on the test dataset:", mu1T_hat.score(X_test, np.apply_along_axis(mX1,1,X_test)))
plt.figure(figsize=(15,5))
RandomSubgroup = np.random.choice(range(np.sum(W_test)),100)
plt.plot(mu0T_hat.predict(X_test_treated[RandomSubgroup]),color="grey",alpha=0.5,label="prediction")
plt.plot(
np.apply_along_axis(mX0,1,X_test[np.extract(W_test,range(N_test))[RandomSubgroup]]),
color="darkred",alpha=0.5,label="reference")
plt.title("Estimation of $Y(0)$ for treated group")
plt.legend()
print("size of treated group in training data: {}, {:.1%}.".format(sum(1-W_train),sum(1-W_train)/N_train))
print("Score on the test dataset:", mu0T_hat.score(X_test, np.apply_along_axis(mX0,1,X_test)))
from sklearn.tree import DecisionTreeRegressor
def samplerY0x(x, n_estimators, N):
    # approximate sampler of L(Y(0) | X = x): collect the predictions of
    # n_estimators independently refitted single-tree (Mondrian) regressors
    Y0x_empirical = np.zeros(n_estimators)
    for i in range(n_estimators):
        #nu0T_hat = DecisionTreeRegressor()
        #nu0T_hat = RandomForestRegressor(n_estimators=1, bootstrap=False)
        nu0T_hat = MondrianForestRegressor(n_estimators=1, bootstrap=True)
        nu0T_hat.fit(X_train_control, Yi_train_control)
        # bootstrap manually
        # BootstrapSize = 10000
        # N_train_0 = sum(1-W_train)
        # nu0T_hat.fit(X_train_control[np.random.choice(range(N_train_0),BootstrapSize)], Yi_train_control[np.random.choice(range(N_train_0),BootstrapSize)])
        Y0x_empirical[i] = nu0T_hat.predict(x.reshape(1,d))
    # optional Gaussian smoothing of the empirical distribution (disabled)
    # Y0x_sample = np.zeros(N)
    # for i in range(N):
    #     m = np.random.choice(Y0x_empirical,1)
    #     Y0x_sample[i] = np.random.normal(m,1.5,1)
    # return Y0x_sample
    return np.random.choice(Y0x_empirical, N)

def samplerY1x(x, n_estimators, N):
    Y1x_empirical = np.zeros(n_estimators)
    for i in range(n_estimators):
        nu1T_hat = MondrianForestRegressor(n_estimators=1, bootstrap=True)
        nu1T_hat.fit(X_train_treated, Yi_train_treated)
        Y1x_empirical[i] = nu1T_hat.predict(x.reshape(1,d))
    return np.random.choice(Y1x_empirical, N)
TestIndex = 15
y0 = samplerY0x(X_test[TestIndex],20,1000)
y1 = samplerY1x(X_test[TestIndex],20,1000)
plt.figure(figsize = (10,5))
plt.subplot(121)
plt.hist(y0,bins =30, color="lightgrey")
plt.title("Hist of Y(0) | X = x")
plt.subplot(122)
plt.hist(y1,bins =30, color="lightgrey")
plt.title("Hist of Y(1) | X = x")
getInfoY(X_test[TestIndex])
np.mean(y0)
np.var(y0)
np.mean(y1)
np.var(y1)
gc.collect()
# Sanity check of the base learner on a simple regression problem (here: a constant mean plus noise).
X = np.random.uniform(0,1,(5000,5))
#Y = np.random.normal(0,1.,5000) +np.sin(X[:,1]+X[:,3]) +X[:,2]**2
Y = np.random.normal(0,1.,5000) + 1
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
reg = AdaBoostRegressor(learning_rate = 0.1, n_estimators = 10)
#reg = XGBRegressor(n_estimators = 20)
#reg = RandomForestRegressor(n_estimators = 100)
reg.fit(X[0:4000,:],Y[0:4000])
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(np.zeros(1000)+1.,reg.predict(X[4000:])))
np.mean(reg.predict(X[4000:]))
np.mean(Y[4000:])