In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# import seaborn as sns
# sns.set_style("whitegrid")
import gc  # garbage collection, for manual memory cleanup

import os
import psutil
process = psutil.Process(os.getpid())
print("Memory usage:", process.memory_info().rss/1024/1024,"MB") 
Memory usage: 80.609375 MB
In [2]:
N_total = 150000
N_train = 13000
X = np.random.uniform(0,1,(N_total,3))
def obj_func(x):
    """
    True conditional expectation E[Y | X = x] = x[1] + x[2].
    """
    return x[1] + x[2]

def obj_func2(x):
    """
    True conditional variance Var[Y | X = x] = x[0] + x[1].
    """
    return x[0] + x[1]
#Y = np.random.normal(0,1.,N_total) + np.apply_along_axis(obj_func,1,X)
# Draw Y[i] ~ N(E[Y | X[i]], Var[Y | X[i]]); np.random.normal expects the std dev
Y = np.zeros(N_total)
for i in range(N_total):
    Y[i] = np.random.normal(obj_func(X[i]), np.sqrt(obj_func2(X[i])))
#Y = np.random.normal(0,1.,5000) + 1
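The loop above draws one sample at a time; np.random.normal also accepts array-valued mean and scale, so the same simulation can be vectorized. A minimal sketch, assuming the same X, obj_func and obj_func2 as above:

mu = X[:, 1] + X[:, 2]                # E[Y | X], matches obj_func
sigma = np.sqrt(X[:, 0] + X[:, 1])    # sqrt of Var[Y | X], matches obj_func2
Y_vec = np.random.normal(mu, sigma)   # one draw per row of X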
In [3]:
from sklearn.ensemble import RandomForestRegressor 
from xgboost import XGBRegressor 
#reg = AdaBoostRegressor(learning_rate = 0.1, n_estimators = 10)
#reg = XGBRegressor(n_estimators = 20) 
reg = XGBRegressor(n_estimators = 100,
                   #max_depth = 15,
                   objective="reg:squarederror"
                  ) 
#reg = RandomForestRegressor(n_estimators = 100)
reg.fit(X[0:N_train,:],Y[0:N_train])
Out[3]:
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)
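A possible sanity check on this fit: XGBRegressor exposes feature_importances_, and since E[Y | X] depends only on x[1] and x[2], most of the weight should land on the last two features.

reg.feature_importances_  # expect the first coordinate to get importance close to zero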
In [4]:
from sklearn.metrics import mean_squared_error, r2_score
np.sqrt(mean_squared_error(np.apply_along_axis(obj_func,1,X[N_train:]),reg.predict(X[N_train:])))
Out[4]:
0.08790360711412781
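Note that this RMSE is measured against the true conditional expectation on the hold-out points, not against the noisy Y. The same comparison can also be scored with r2_score (already imported above); a minimal sketch using the fitted reg:

truth_mean = np.apply_along_axis(obj_func, 1, X[N_train:])
r2_score(truth_mean, reg.predict(X[N_train:]))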
In [5]:
plt.figure(figsize=(15,5))
plt.plot(np.apply_along_axis(obj_func,1,X[N_train:N_train+100]), color = "darkred", label="ref")
plt.plot(reg.predict(X[N_train:N_train+100]), color = "grey", label="pred")
plt.title("Estimation of conditional expectation")
plt.grid()
plt.legend()
Out[5]:
<matplotlib.legend.Legend at 0x7f5b756b7d50>
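Why the next cell fits a second model on Y**2: by the second-moment identity

    Var[Y | X] = E[Y**2 | X] - E[Y | X]**2

reg2 estimates E[Y**2 | X], and the conditional-variance prediction is then reg2.predict(X) - reg.predict(X)**2, which is exactly what the cells below compute and plot.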
In [6]:
#reg2 = RandomForestRegressor(n_estimators = 100)
reg2 = XGBRegressor(n_estimators = 100,
                   #max_depth = 5,
                   objective="reg:squarederror") 
reg2.fit(X[0:N_train,:],Y[0:N_train]**2)
Out[6]:
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)
In [7]:
np.sqrt(mean_squared_error(np.apply_along_axis(obj_func2,1,X[N_train:]),reg2.predict(X[N_train:])-reg.predict(X[N_train:])**2))
Out[7]:
0.23482567442774577
In [8]:
r2_score(np.apply_along_axis(obj_func2,1,X[N_train:]),reg2.predict(X[N_train:])-reg.predict(X[N_train:])**2)
Out[8]:
0.6684678441686359
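One practical caveat: the plug-in estimate E[Y**2 | X] - E[Y | X]**2 built from two fitted models is not guaranteed to be non-negative in finite samples, so it is common to clip it at zero before using it as a variance. A minimal sketch, assuming the fitted reg and reg2:

var_pred = reg2.predict(X[N_train:]) - reg.predict(X[N_train:])**2
var_pred = np.maximum(var_pred, 0.0)  # a variance cannot be negative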
In [9]:
plt.figure(figsize=(15,5))
#plt.plot(np.zeros(100)+1., color = "darkred", label="ref")
plt.plot(np.apply_along_axis(obj_func2,1,X[N_train:N_train+100]), color = "darkred", label="ref")
plt.plot(reg2.predict(X[N_train:N_train+100])-reg.predict(X[N_train:N_train+100])**2, color = "grey", label="pred")
plt.title("Estimation of conditional variance")
plt.grid()
plt.legend()
Out[9]:
<matplotlib.legend.Legend at 0x7f5b6056e910>
In [10]:
print("Memory usage:", process.memory_info().rss/1024/1024,"MB") 
Memory usage: 161.26953125 MB
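gc is imported at the top for memory cleanup but never called; if the fitted models are no longer needed, a minimal cleanup sketch is:

del reg, reg2
gc.collect()
print("Memory usage:", process.memory_info().rss/1024/1024, "MB")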