Commit 35771cb2 authored by Clara Betancourt

Merge branch 'clara_issue_39_try_new_base_models' into devel

Parents: f2000f51 45950de4
@@ -20,7 +20,8 @@ import torch
# sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import NearestNeighbors
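For orientation, a minimal sketch of the common sklearn fit/predict pattern behind the newly imported base models; the data here is a synthetic stand-in and the snippet is not part of this commit:

# Minimal sketch of the shared sklearn workflow used by the new base
# models below; synthetic stand-in data, not the UBA features.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

rng = np.random.default_rng(0)
x = rng.normal(size=(200, 5))        # stand-in feature matrix
y = x @ rng.normal(size=5) + rng.normal(scale=0.1, size=200)

for model in (LinearRegression(),
              GradientBoostingRegressor(random_state=0)):
    model.fit(x[:150], y[:150])      # fit on a train split
    y_hat = model.predict(x[150:])   # predict on held-out rows
    rmse = mean_squared_error(y[150:], y_hat) ** 0.5
    print(type(model).__name__,
          f'RMSE={rmse:.3f}, R2={r2_score(y[150:], y_hat):.3f}')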
@@ -40,7 +41,7 @@ class SpatiotemporalMean:
    """
    A model that just returns the mean of all labels
    """
    def __init__(self, use_val=False):
    def __init__(self, use_val=False, cv_seed=None):
        """
        Initializing the class
@@ -104,7 +105,7 @@ class SpatialMean:
    If there is no measurement at all, return the mean of
    the CAMS reanalyses of that time step.
    """
    def __init__(self, use_val=False):
    def __init__(self, use_val=False, cv_seed=None):
        """
        Initializing the class
@@ -185,11 +186,100 @@ class SpatialMean:
        self.y_hat = torch.load(self.prediction_path)
class Linear:
    """
    Training and using linear regression.
    """
    def __init__(self, use_val=False, cv_seed=None):
        """
        Initializing the class
        use_val = True means the validation set is used for fitting
        """
        self.use_val = use_val
        self.cv_seed = cv_seed
        if use_val:
            prediction_file = 'linear_predictions_useval.pt'
        else:
            prediction_file = 'linear_predictions.pt'
        if cv_seed is not None:
            directory = settings.resources_dir + 'cross_validation/' + \
                        str(self.cv_seed) + '/' + 'models/'
        else:
            directory = settings.resources_dir + 'models/'
        self.prediction_path = directory + prediction_file

    def __str__(self):
        """
        The name of the simple model.
        """
        return 'Linear'

    def read_predictions(self):
        """
        Read previously saved predictions; compute and save them
        first if they do not exist yet.
        """
        if not os.path.exists(self.prediction_path):
            self.save_predictions()
        self.y_hat = torch.load(self.prediction_path)

    def save_predictions(self, features=None, print_=True):
        """
        Train the linear model, then save its predictions.
        features can be a list of features to use
        print_ indicates whether to print info or not
        """
        if print_:
            print('getting linear predictions...')

        # read in data
        ug = UBAGraph()
        x_df = pd.read_csv(ug.x_path, index_col=0)
        y_df = pd.read_csv(ug.y_path, index_col=0)
        if self.cv_seed is not None:
            p1 = settings.resources_dir
            p2 = settings.resources_dir + f'cross_validation/{self.cv_seed}/'
            ug.mask_path = ug.mask_path.replace(p1, p2)
        mask_df = pd.read_csv(ug.mask_path, index_col=0)

        # use standard features from feature selection if no features given
        if not features:
            features = ['cams_o3', 'nightlight', 'day_of_year', 'alt',
                        'hour_of_day', 'pbl_height', 'temperature',
                        'day_of_week', 'relative_alt', 'cams_no2',
                        'type', 'relative_humidity', 'wind_v',
                        'cams_nox_emissions', 'population_density',
                        'type_of_area', 'cloud_cover', 'wind_u']

        # prepare data
        x_df = x_df[features]
        x_df = get_one_hot(x_df)
        x = x_df.values
        if self.use_val:
            mask = mask_df.train_mask | mask_df.val_mask
        else:
            mask = mask_df.train_mask
        x_train = x_df[mask].values
        y_train = y_df[mask].values.reshape(-1)

        # fit model
        lr = LinearRegression()
        lr.fit(x_train, y_train)

        # save predictions
        y_hat = lr.predict(x)
        y_hat_pytorch = torch.tensor(y_hat.astype(np.float32)).view(-1, 1)
        torch.save(y_hat_pytorch, self.prediction_path)
        if print_:
            print(f'written to {self.prediction_path}')
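A usage sketch for the new Linear baseline, mirroring how the other model classes in this module are driven; it assumes settings.resources_dir and the UBAGraph csv files are in place and is not part of this commit:

# Usage sketch (assumes the repository's resource files exist):
model = Linear(use_val=False)
model.save_predictions()     # fits LinearRegression, writes the .pt file
model.read_predictions()     # loads self.y_hat from disk
print(model, model.y_hat.shape)    # 'Linear' and an (N, 1) tensor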
class Cams:
    """
    Use the modeled ozone values from CAMS
    """
    def __init__(self, use_val=False):
    def __init__(self, use_val=False, cv_seed=None):
        """
        Initializing the class
@@ -459,6 +549,99 @@ class RandomForest:
        print(f'written to {self.prediction_path}')
class GradientBoostingTrees:
    """
    Training and using gradient boosting trees.
    """
    def __init__(self, use_val=False, cv_seed=None):
        """
        Initializing the class
        use_val = True means the validation set is used for fitting
        """
        self.use_val = use_val
        self.cv_seed = cv_seed
        if use_val:
            prediction_file = 'gradientboostingtrees_predictions_useval.pt'
        else:
            prediction_file = 'gradientboostingtrees_predictions.pt'
        if cv_seed is not None:
            directory = settings.resources_dir + 'cross_validation/' + \
                        str(self.cv_seed) + '/' + 'models/'
        else:
            directory = settings.resources_dir + 'models/'
        self.prediction_path = directory + prediction_file

    def __str__(self):
        """
        The name of the simple model.
        """
        return 'GradientBoostingTrees'

    def read_predictions(self):
        """
        Read previously saved predictions; compute and save them
        first if they do not exist yet.
        """
        if not os.path.exists(self.prediction_path):
            self.save_predictions()
        self.y_hat = torch.load(self.prediction_path)

    def save_predictions(self, print_=True, n_estimators=300,
                         learning_rate=0.1, max_depth=3):
        """
        Train the gradient boosting trees, then save the predictions.
        print_ indicates whether to print info or not
        """
        if print_:
            print('getting gradient boosted predictions...')

        # read in data
        ug = UBAGraph()
        x_df = pd.read_csv(ug.x_path, index_col=0)
        y_df = pd.read_csv(ug.y_path, index_col=0)
        if self.cv_seed is not None:
            p1 = settings.resources_dir
            p2 = settings.resources_dir + f'cross_validation/{self.cv_seed}/'
            ug.mask_path = ug.mask_path.replace(p1, p2)
        mask_df = pd.read_csv(ug.mask_path, index_col=0)

        # use standard features from feature selection
        features = ['cams_o3', 'nightlight', 'day_of_year', 'alt',
                    'hour_of_day', 'pbl_height', 'temperature',
                    'day_of_week', 'relative_alt', 'cams_no2',
                    'type', 'relative_humidity', 'wind_v',
                    'cams_nox_emissions', 'population_density',
                    'type_of_area', 'cloud_cover', 'wind_u']

        # prepare data
        x_df = x_df[features]
        x_df = get_one_hot(x_df)
        x = x_df.values
        if self.use_val:
            mask = mask_df.train_mask | mask_df.val_mask
        else:
            mask = mask_df.train_mask
        x_train = x_df[mask].values
        y_train = y_df[mask].values.reshape(-1)

        # fit model
        gbt = GradientBoostingRegressor(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            random_state=settings.random_seed,
        )
        gbt.fit(x_train, y_train)

        # save predictions
        y_hat = gbt.predict(x)
        y_hat_pytorch = torch.tensor(y_hat.astype(np.float32)).view(-1, 1)
        torch.save(y_hat_pytorch, self.prediction_path)
        if print_:
            print(f'written to {self.prediction_path}')
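An analogous usage sketch for the new GradientBoostingTrees baseline; the keyword values shown are simply the defaults of save_predictions (not part of this commit):

# Usage sketch; keyword values are the save_predictions() defaults:
gbt = GradientBoostingTrees(use_val=True)
gbt.save_predictions(n_estimators=300, learning_rate=0.1, max_depth=3)
gbt.read_predictions()
print(gbt, gbt.y_hat.shape)    # 'GradientBoostingTrees', (N, 1) tensor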
def tune_nnh():
    """
    tuning the samples parameter of NearestNeighborHybrid
@@ -484,6 +667,37 @@ def tune_nnh():
    print('\n\n\n')
def tune_gradient_boosted_trees():
    """
    tuning the hyperparameters of the GradientBoostingTrees model
    """
    print('tune gradient boosted tree model...')

    # UBA Graph data
    ug = UBAGraph()
    ug.get_dataset()
    mask_df = pd.read_csv(ug.mask_path, index_col=0)
    y = ug[0].y.numpy().reshape(-1)[mask_df['val_mask']]

    # for n_estimators in [100, 200, 300, 400, 500]:
    #     print('\nn_estimators=', n_estimators)
    # for max_depth in [4, 6, 8, 10]:
    #     print('\nmax_depth=', max_depth)
    for learning_rate in [0.2, 0.25, 0.05, 0.1, 0.15]:
        print('\n learning_rate=', learning_rate)
        gbt = GradientBoostingTrees()
        # gbt.save_predictions(n_estimators=n_estimators)
        # gbt.save_predictions(max_depth=max_depth)
        gbt.save_predictions(learning_rate=learning_rate)
        gbt.read_predictions()
        y_hat = gbt.y_hat.numpy().reshape(-1)[mask_df['val_mask']]
        rmse = (mean_squared_error(y, y_hat))**.5
        r2 = r2_score(y, y_hat)
        print(f' val RMSE: {rmse:.3f}, val R2: {r2:.3f}')
        print('\n')
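The function above tunes one hyperparameter at a time against the fixed validation split. A sketch of a joint search with sklearn's GridSearchCV as a possible alternative; it is not part of this commit, it cross-validates within the training data rather than scoring on the fixed val_mask, and x_train/y_train are assumed prepared as in save_predictions:

# Alternative sketch, not in this commit: joint search over the same
# grids with sklearn's GridSearchCV. Unlike the loop above, this
# cross-validates inside the training data instead of using val_mask.
# x_train, y_train assumed prepared as in save_predictions().
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [3, 4, 6, 8, 10],
    'learning_rate': [0.05, 0.1, 0.15, 0.2, 0.25],
}
search = GridSearchCV(GradientBoostingRegressor(random_state=0),
                      param_grid,
                      scoring='neg_root_mean_squared_error',
                      cv=3, n_jobs=-1)
search.fit(x_train, y_train)
print(search.best_params_, 'RMSE:', -search.best_score_)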
def select_rf_features():
    """
    Choose features for the simple model.
@@ -587,8 +801,9 @@ def evaluate_models(cv_seed=None):
        models = [NearestNeighborHybrid, RandomForest]
        use_vals = [True]
    else:
        models = [SpatiotemporalMean, SpatialMean, Cams,
                  NearestNeighborHybrid, RandomForest]
        models = [SpatiotemporalMean, SpatialMean, Linear,
                  Cams, NearestNeighborHybrid, RandomForest,
                  GradientBoostingTrees]
        use_vals = [False, True]

    specs = [(model, use_val) for model in models for use_val in use_vals]
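The body of evaluate_models is elided from this diff; a purely hypothetical sketch of how such (model, use_val) specs might be consumed:

# Hypothetical sketch; the actual loop body is elided from this diff.
for model_cls, use_val in specs:
    model = model_cls(use_val=use_val, cv_seed=cv_seed)
    model.read_predictions()
    # ...score model.y_hat against the labels on the evaluation mask...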
@@ -633,34 +848,45 @@ def tune_and_save_models():
    Also save the predictions where the validation set is used
    for training.
    """
    model = SpatiotemporalMean()
    model.save_predictions()
    model = SpatiotemporalMean(use_val=True)
    model.save_predictions()
    # model = SpatiotemporalMean()
    # model.save_predictions()
    # model = SpatiotemporalMean(use_val=True)
    # model.save_predictions()
    # model = SpatialMean()
    # model.save_predictions()
    # model = SpatialMean(use_val=True)
    # model.save_predictions()
    # model = Linear()
    # model.save_predictions()
    # model = Linear(use_val=True)
    # model.save_predictions()
    model = SpatialMean()
    model.save_predictions()
    model = SpatialMean(use_val=True)
    model.save_predictions()
    # model = Cams()
    # model.save_predictions()
    # model = Cams(use_val=True)
    # model.save_predictions()
    model = Cams()
    model.save_predictions()
    model = Cams(use_val=True)
    model.save_predictions()
    # tune_nnh()
    # model = NearestNeighborHybrid()
    # model.save_predictions()
    # model = NearestNeighborHybrid(use_val=True)
    # model.save_predictions()
    tune_nnh()
    model = NearestNeighborHybrid()
    model.save_predictions()
    model = NearestNeighborHybrid(use_val=True)
    model.save_predictions()
    # select_rf_features()
    # model = RandomForest()
    # model.save_predictions()
    # model = RandomForest(use_val=True)
    # model.save_predictions()
    select_rf_features()
    model = RandomForest()
    model.save_predictions()
    model = RandomForest(use_val=True)
    model.save_predictions()
    tune_gradient_boosted_trees()
    # model = GradientBoostingTrees()
    # model.save_predictions()
    # model = GradientBoostingTrees(use_val=True)
    # model.save_predictions()
    evaluate_models()
    # evaluate_models()
def save_models_for_cross_validation():
@@ -683,5 +909,5 @@ def save_models_for_cross_validation():
if __name__ == '__main__':
    # tune_and_save_models()
    save_models_for_cross_validation()
    tune_and_save_models()
    # save_models_for_cross_validation()
@@ -364,9 +364,11 @@ def get_characteristics_df(how='read', cv_seed=None):
    lin_file = 'nearestneighborhybrid_predictions_useval.pt'
    stm_cs_file = 'SpatiotemporalMean_cs_predictions_useval.pyt'
    sm_cs_file = 'SpatialMean_cs_predictions_useval.pyt'
    cams_cs_file = 'Cams_cs_predictions_useval.pyt'
    nnh_cs_file = 'NearestNeighborHybrid_cs_predictions_useval.pyt'
    rf_cs_file = 'RandomForest_cs_predictions_useval.pyt'
    lin_imp = torch.load(model_path+lin_file).numpy().reshape(-1)
    cams_cs_imp = torch.load(model_path+cams_cs_file).numpy().reshape(-1)
    nnh_cs_imp = torch.load(model_path+nnh_cs_file).numpy().reshape(-1)
    rf_cs_imp = torch.load(model_path+rf_cs_file).numpy().reshape(-1)
    if cv_seed is None:
@@ -374,8 +376,8 @@ def get_characteristics_df(how='read', cv_seed=None):
        sm_cs_imp = torch.load(model_path+sm_cs_file).numpy().reshape(-1)

    # set up df
    columns = ['y_true', 'y_imputed',
               'y_stm_cs', 'y_sm_cs', 'y_nnh_cs', 'y_rf_cs',
    columns = ['y_true', 'y_imputed', 'y_stm_cs',
               'y_sm_cs', 'y_cams_cs', 'y_nnh_cs', 'y_rf_cs',
               'gap_len', 'correlated',
               'station_id', 'n_neighbors', 'test_mask']
    df = pd.DataFrame(columns=columns, index=y_true_df.index)
@@ -383,6 +385,7 @@ def get_characteristics_df(how='read', cv_seed=None):
    # columns that we can take from other data frames
    df.y_true = y_true_df.y
    df.y_nnh_cs = nnh_cs_imp
    df.y_cams_cs = cams_cs_imp
    df.y_rf_cs = rf_cs_imp
    df.station_id = reg_df.station_id
    df.test_mask = mask_df.test_mask
@@ -403,8 +406,8 @@ def get_characteristics_df(how='read', cv_seed=None):
    for gap_idx, gap_row in gap_df_filtered.iterrows():
        if gap_idx % 5000 == 0:
            print(f'{gap_idx/len(gap_df_filtered)*100:.2f} %')
            df.to_csv(save_path)
            print(f'intermediate save to {save_path}')
            # df.to_csv(save_path)
            # print(f'intermediate save to {save_path}')
        start_idx = gap_row.start_idx
        end_idx = gap_row.start_idx + gap_row.len - 1
        len_ = gap_row.len
@@ -500,8 +503,8 @@ if __name__ == '__main__':
    test_bootstrap_ = False
    count_exceedances_ = False
    number_of_neighbors_ = False
    get_characteristics_df_ = False
    evaluate_cross_validation_ = True
    get_characteristics_df_ = True
    evaluate_cross_validation_ = False

    if index_of_agreement_:
        # test index of agreement should be .959
@@ -546,8 +549,8 @@ if __name__ == '__main__':
    if get_characteristics_df_:
        # df = get_characteristics_df(how='read')
        df = get_characteristics_df(how='prepare')
        for cv_seed in settings.cv_seeds:
            get_characteristics_df(how='prepare', cv_seed=cv_seed)
        # for cv_seed in settings.cv_seeds:
        #     get_characteristics_df(how='prepare', cv_seed=cv_seed)

    if evaluate_cross_validation_:
        evaluate_cross_validation()
@@ -690,8 +690,8 @@ def n_neighbors_vs_r2():
    df = get_characteristics_df()
    n_list = np.unique(df.n_neighbors).tolist()
    r2_n_dict = {}
    models = ['y_stm_cs', 'y_sm_cs', 'y_nnh_cs', 'y_rf_cs']
    colors = ['rosybrown', 'lightcoral', 'firebrick', 'blue']
    models = ['y_stm_cs', 'y_sm_cs', 'y_cams_cs', 'y_nnh_cs', 'y_rf_cs']
    colors = ['rosybrown', 'lightcoral', 'red', 'firebrick', 'blue']
    for model in models:
        r2_n_list = []
        for n in n_list:
@@ -722,7 +722,7 @@ def n_neighbors_vs_r2():
    ax.set_ylim(-.1, 1.1)
    ax.set_xlabel('# neighbors')
    ax.set_ylabel('R2 score')
    ax.legend()
    # ax.legend()

    # save
    n_neigh_vs_r2_path = settings.output_dir + 'n_neigh_vs_r2.png'
@@ -741,8 +741,8 @@ def gap_len_vs_r2():
    df = get_characteristics_df()
    len_list = list(range(1, 51))
    r2_len_dict = {}
    models = ['y_stm_cs', 'y_sm_cs', 'y_nnh_cs', 'y_rf_cs']
    colors = ['rosybrown', 'lightcoral', 'firebrick', 'blue']
    models = ['y_stm_cs', 'y_sm_cs', 'y_cams_cs', 'y_nnh_cs', 'y_rf_cs']
    colors = ['rosybrown', 'lightcoral', 'red', 'firebrick', 'blue']
    for model in models:
        r2_len_list = []
        for len_ in len_list:
@@ -776,7 +776,7 @@ def gap_len_vs_r2():
                linestyle='--')
    plt.xlim(-.9, 30)
    plt.ylim(-.3, 1.1)
    plt.legend()
    # plt.legend()
    plt.xlabel('gap length')
    plt.ylabel('R2 score')
@@ -899,12 +899,12 @@ if __name__ == '__main__':
    station_ids_on_map_ = False
    o3_value_matrix_ = False
    mask_matrix_ = False
    time_series_ = True
    time_series_ = False
    basic_statistics_and_histogram_ = False
    exceedances_ = False
    true_vs_imputed_ = False
    n_neighbors_vs_r2_ = False
    gap_len_vs_r2_ = False
    n_neighbors_vs_r2_ = True
    gap_len_vs_r2_ = True
    imputed_dataset_matrix_ = False
    modeled_time_series_ = False