diff --git a/source/models/simple.py b/source/models/simple.py index 07024b68058f02846d0c57aa03854ba89a7a8ce5..1b3ff7ed28e59d28b82301237486e6b57d91f702 100644 --- a/source/models/simple.py +++ b/source/models/simple.py @@ -20,7 +20,8 @@ import torch # sklearn from sklearn.preprocessing import StandardScaler -from sklearn.ensemble import RandomForestRegressor +from sklearn.linear_model import LinearRegression +from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor from sklearn.metrics import mean_squared_error, r2_score from sklearn.neighbors import NearestNeighbors @@ -40,7 +41,7 @@ class SpatiotemporalMean: """ A model that just returns the mean of all labels """ - def __init__(self, use_val=False): + def __init__(self, use_val=False, cv_seed=None): """ Initializing the class @@ -104,7 +105,7 @@ class SpatialMean: If there is no measurement at all, return the mean of the CAMS reanalyses of that time step. """ - def __init__(self, use_val=False): + def __init__(self, use_val=False, cv_seed=None): """ Initializing the class @@ -185,11 +186,100 @@ class SpatialMean: self.y_hat = torch.load(self.prediction_path) +class Linear: + """ + Training and using random forests. + """ + def __init__(self, use_val=False, cv_seed=None): + """ + Initializing the class + + use_val = True means the validation set is used for fitting + """ + self.use_val = use_val + self.cv_seed = cv_seed + if use_val: + prediction_file = 'linear_predictions_useval.pt' + else: + prediction_file = 'linear_predictions.pt' + if cv_seed is not None: + directory = settings.resources_dir + 'cross_validation/' + \ + str(self.cv_seed) + '/' + 'models/' + else: + directory = settings.resources_dir + 'models/' + self.prediction_path = directory + prediction_file + + def __str__(self): + """ + The name of the simple model. + """ + return 'Linear' + + def read_predictions(self): + """ + Predictions were saved for future use + """ + if not os.path.exists(self.prediction_path): + self.save_predictions() + + self.y_hat = torch.load(self.prediction_path) + + def save_predictions(self, features=None, print_=True): + """ + Train the linear model, then save model and predictions + + features can be a list of features to use + print_ indicates whether to print info or not + """ + if print_: + print('getting linear predictions...') + # read in data + ug = UBAGraph() + x_df = pd.read_csv(ug.x_path, index_col=0) + y_df = pd.read_csv(ug.y_path, index_col=0) + if self.cv_seed is not None: + p1 = settings.resources_dir + p2 = settings.resources_dir + f'cross_validation/{self.cv_seed}/' + ug.mask_path = ug.mask_path.replace(p1, p2) + mask_df = pd.read_csv(ug.mask_path, index_col=0) + + # use standard features from feature selection if no features given + if not features: + features = ['cams_o3', 'nightlight', 'day_of_year', 'alt', + 'hour_of_day', 'pbl_height', 'temperature', + 'day_of_week', 'relative_alt', 'cams_no2', + 'type', 'relative_humidity', 'wind_v', + 'cams_nox_emissions', 'population_density', + 'type_of_area', 'cloud_cover', 'wind_u'] + + # prepare data + x_df = x_df[features] + x_df = get_one_hot(x_df) + x = x_df.values + if self.use_val: + mask = mask_df.train_mask | mask_df.val_mask + else: + mask = mask_df.train_mask + x_train = x_df[mask].values + y_train = y_df[mask].values.reshape(-1) + + # fit model + rf = LinearRegression() + rf.fit(x_train, y_train) + + # save predictions + y_hat = rf.predict(x) + y_hat_pytorch = torch.tensor(y_hat.astype(np.float32)).view(-1, 1) + torch.save(y_hat_pytorch, self.prediction_path) + if print_: + print(f'written to {self.prediction_path}') + + class Cams: """ Use the modeled ozone values from CAMS """ - def __init__(self, use_val=False): + def __init__(self, use_val=False, cv_seed=None): """ Initializing the class @@ -459,6 +549,99 @@ class RandomForest: print(f'written to {self.prediction_path}') +class GradientBoostingTrees: + """ + Training and using gradient boosting trees. + """ + def __init__(self, use_val=False, cv_seed=None): + """ + Initializing the class + + use_val = True means the validation set is used for fitting + """ + self.use_val = use_val + self.cv_seed = cv_seed + if use_val: + prediction_file = 'gradientboostingtrees_predictions_useval.pt' + else: + prediction_file = 'gradientboostingtrees_predictions.pt' + if cv_seed is not None: + directory = settings.resources_dir + 'cross_validation/' + \ + str(self.cv_seed) + '/' + 'models/' + else: + directory = settings.resources_dir + 'models/' + self.prediction_path = directory + prediction_file + + def __str__(self): + """ + The name of the simple model. + """ + return 'GradientBoostingTrees' + + def read_predictions(self): + """ + Predictions were saved for future use + """ + if not os.path.exists(self.prediction_path): + self.save_predictions() + + self.y_hat = torch.load(self.prediction_path) + + def save_predictions(self, print_=True, n_estimators=300, + learning_rate=0.1, max_depth=3): + """ + Train the random gradient boosting, then save predictions + + print_ indicates whether to print info or not + """ + if print_: + print('getting gradient boosted predictions...') + # read in data + ug = UBAGraph() + x_df = pd.read_csv(ug.x_path, index_col=0) + y_df = pd.read_csv(ug.y_path, index_col=0) + if self.cv_seed is not None: + p1 = settings.resources_dir + p2 = settings.resources_dir + f'cross_validation/{self.cv_seed}/' + ug.mask_path = ug.mask_path.replace(p1, p2) + mask_df = pd.read_csv(ug.mask_path, index_col=0) + + # use standard features from feature selection + features = ['cams_o3', 'nightlight', 'day_of_year', 'alt', + 'hour_of_day', 'pbl_height', 'temperature', + 'day_of_week', 'relative_alt', 'cams_no2', + 'type', 'relative_humidity', 'wind_v', + 'cams_nox_emissions', 'population_density', + 'type_of_area', 'cloud_cover', 'wind_u'] + + # prepare data + x_df = x_df[features] + x_df = get_one_hot(x_df) + x = x_df.values + if self.use_val: + mask = mask_df.train_mask | mask_df.val_mask + else: + mask = mask_df.train_mask + x_train = x_df[mask].values + y_train = y_df[mask].values.reshape(-1) + + # fit model + rf = GradientBoostingRegressor( + n_estimators=n_estimators, + learning_rate=learning_rate, + max_depth=max_depth, + random_state=settings.random_seed, + ) + rf.fit(x_train, y_train) + + # save predictions + y_hat = rf.predict(x) + y_hat_pytorch = torch.tensor(y_hat.astype(np.float32)).view(-1, 1) + torch.save(y_hat_pytorch, self.prediction_path) + if print_: + print(f'written to {self.prediction_path}') + + def tune_nnh(): """ tuning the samples parameter of NearestNeighborHybrid @@ -484,6 +667,37 @@ def tune_nnh(): print('\n\n\n') +def tune_gradient_boosted_trees(): + """ + tuning the samples parameter of NearestNeighborHybrid + """ + print('tune gradient boosted tree model...') + + # UBA Graph data + ug = UBAGraph() + ug.get_dataset() + mask_df = pd.read_csv(ug.mask_path, index_col=0) + y = ug[0].y.numpy().reshape(-1)[mask_df['val_mask']] + + # for n_estimators in [100, 200, 300, 400, 500]: + # print('\nn_estimators=', n_estimators) + # for max_depth=3 in [4, 6, 8, 10]: + # print('\nmax_depth=', max_depth) + for learning_rate in [0.2, 0.25, 0.05, 0.1, 0.15]: + print('\n learning_rate=', learning_rate) + gbt = GradientBoostingTrees() + # gbt.save_predictions(n_estimators=n_estimators) + # gbt.save_predictions(max_depth=max_depth) + gbt.save_predictions(learning_rate=learning_rate) + gbt.read_predictions() + y_hat = gbt.y_hat.numpy().reshape(-1)[mask_df['val_mask']] + rmse = (mean_squared_error(y, y_hat))**.5 + r2 = r2_score(y, y_hat) + + print(f' val RMSE: {rmse:.3f}, val R2: {r2:.3f}') + print('\n') + + def select_rf_features(): """ Choose features for the simple model. @@ -587,8 +801,9 @@ def evaluate_models(cv_seed=None): models = [NearestNeighborHybrid, RandomForest] use_vals = [True] else: - models = [SpatiotemporalMean, SpatialMean, Cams, - NearestNeighborHybrid, RandomForest] + models = [SpatiotemporalMean, SpatialMean, Linear, + Cams, NearestNeighborHybrid, RandomForest, + GradientBoostingTrees] use_vals = [False, True] specs = [(model, use_val) for model in models for use_val in use_vals] @@ -633,34 +848,45 @@ def tune_and_save_models(): Also save the predictions where the validation set is used for training. """ - model = SpatiotemporalMean() - model.save_predictions() - model = SpatiotemporalMean(use_val=True) - model.save_predictions() - - model = SpatialMean() - model.save_predictions() - model = SpatialMean(use_val=True) - model.save_predictions() - - model = Cams() - model.save_predictions() - model = Cams(use_val=True) - model.save_predictions() - - tune_nnh() - model = NearestNeighborHybrid() - model.save_predictions() - model = NearestNeighborHybrid(use_val=True) - model.save_predictions() - - select_rf_features() - model = RandomForest() - model.save_predictions() - model = RandomForest(use_val=True) - model.save_predictions() - - evaluate_models() + # model = SpatiotemporalMean() + # model.save_predictions() + # model = SpatiotemporalMean(use_val=True) + # model.save_predictions() + + # model = SpatialMean() + # model.save_predictions() + # model = SpatialMean(use_val=True) + # model.save_predictions() + + # model = Linear() + # model.save_predictions() + # model = Linear(use_val=True) + # model.save_predictions() + + # model = Cams() + # model.save_predictions() + # model = Cams(use_val=True) + # model.save_predictions() + + # tune_nnh() + # model = NearestNeighborHybrid() + # model.save_predictions() + # model = NearestNeighborHybrid(use_val=True) + # model.save_predictions() + + # select_rf_features() + # model = RandomForest() + # model.save_predictions() + # model = RandomForest(use_val=True) + # model.save_predictions() + + tune_gradient_boosted_trees() + # model = GradientBoostingTrees() + # model.save_predictions() + # model = GradientBoostingTrees(use_val=True) + # model.save_predictions() + + # evaluate_models() def save_models_for_cross_validation(): @@ -683,5 +909,5 @@ def save_models_for_cross_validation(): if __name__ == '__main__': - # tune_and_save_models() - save_models_for_cross_validation() + tune_and_save_models() + # save_models_for_cross_validation() diff --git a/source/postprocessing/evaluation_tools.py b/source/postprocessing/evaluation_tools.py index a1526d58ad7b140cc97fb227b69e2d0a059eca84..7482970a564de26c993856a18375d00ba2e731fb 100644 --- a/source/postprocessing/evaluation_tools.py +++ b/source/postprocessing/evaluation_tools.py @@ -364,9 +364,11 @@ def get_characteristics_df(how='read', cv_seed=None): lin_file = 'nearestneighborhybrid_predictions_useval.pt' stm_cs_file = 'SpatiotemporalMean_cs_predictions_useval.pyt' sm_cs_file = 'SpatialMean_cs_predictions_useval.pyt' + cams_cs_file = 'Cams_cs_predictions_useval.pyt' nnh_cs_file = 'NearestNeighborHybrid_cs_predictions_useval.pyt' rf_cs_file = 'RandomForest_cs_predictions_useval.pyt' lin_imp = torch.load(model_path+lin_file).numpy().reshape(-1) + cams_cs_imp = torch.load(model_path+cams_cs_file).numpy().reshape(-1) nnh_cs_imp = torch.load(model_path+nnh_cs_file).numpy().reshape(-1) rf_cs_imp = torch.load(model_path+rf_cs_file).numpy().reshape(-1) if cv_seed is None: @@ -374,8 +376,8 @@ def get_characteristics_df(how='read', cv_seed=None): sm_cs_imp = torch.load(model_path+sm_cs_file).numpy().reshape(-1) # set up df - columns = ['y_true', 'y_imputed', - 'y_stm_cs', 'y_sm_cs', 'y_nnh_cs', 'y_rf_cs', + columns = ['y_true', 'y_imputed', 'y_stm_cs', + 'y_sm_cs', 'y_cams_cs', 'y_nnh_cs', 'y_rf_cs', 'gap_len', 'correlated', 'station_id', 'n_neighbors', 'test_mask'] df = pd.DataFrame(columns=columns, index=y_true_df.index) @@ -383,6 +385,7 @@ def get_characteristics_df(how='read', cv_seed=None): # columns that we can take from other data frames df.y_true = y_true_df.y df.y_nnh_cs = nnh_cs_imp + df.y_cams_cs = cams_cs_imp df.y_rf_cs = rf_cs_imp df.station_id = reg_df.station_id df.test_mask = mask_df.test_mask @@ -403,8 +406,8 @@ def get_characteristics_df(how='read', cv_seed=None): for gap_idx, gap_row in gap_df_filtered.iterrows(): if gap_idx % 5000 == 0: print(f'{gap_idx/len(gap_df_filtered)*100:.2f} %') - df.to_csv(save_path) - print(f'intermediate save to {save_path}') + # df.to_csv(save_path) + # print(f'intermediate save to {save_path}') start_idx = gap_row.start_idx end_idx = gap_row.start_idx + gap_row.len - 1 len_ = gap_row.len @@ -500,8 +503,8 @@ if __name__ == '__main__': test_bootstrap_ = False count_exceedances_ = False number_of_neighbors_ = False - get_characteristics_df_ = False - evaluate_cross_validation_ = True + get_characteristics_df_ = True + evaluate_cross_validation_ = False if index_of_agreement_: # test index of agreement should be.959 @@ -546,8 +549,8 @@ if __name__ == '__main__': if get_characteristics_df_: # df = get_characteristics_df(how='read') df = get_characteristics_df(how='prepare') - for cv_seed in settings.cv_seeds: - get_characteristics_df(how='prepare', cv_seed=cv_seed) + # for cv_seed in settings.cv_seeds: + # get_characteristics_df(how='prepare', cv_seed=cv_seed) if evaluate_cross_validation_: evaluate_cross_validation() diff --git a/source/visualizations/plots_for_paper.py b/source/visualizations/plots_for_paper.py index 1ef6c310218e0d33fd2d1afd47114aa4a2c8a7e5..172ea57748f28378a082b9c99170bcf11342b0e6 100644 --- a/source/visualizations/plots_for_paper.py +++ b/source/visualizations/plots_for_paper.py @@ -690,8 +690,8 @@ def n_neighbors_vs_r2(): df = get_characteristics_df() n_list = np.unique(df.n_neighbors).tolist() r2_n_dict = {} - models = ['y_stm_cs', 'y_sm_cs', 'y_nnh_cs', 'y_rf_cs'] - colors = ['rosybrown', 'lightcoral', 'firebrick', 'blue'] + models = ['y_stm_cs', 'y_sm_cs', 'y_cams_cs', 'y_nnh_cs', 'y_rf_cs'] + colors = ['rosybrown', 'lightcoral', 'red', 'firebrick', 'blue'] for model in models: r2_n_list = [] for n in n_list: @@ -722,7 +722,7 @@ def n_neighbors_vs_r2(): ax.set_ylim(-.1, 1.1) ax.set_xlabel('# neighbors') ax.set_ylabel('R2 score') - ax.legend() + # ax.legend() # save n_neigh_vs_r2_path = settings.output_dir + 'n_neigh_vs_r2.png' @@ -741,8 +741,8 @@ def gap_len_vs_r2(): df = get_characteristics_df() len_list = list(range(1, 51)) r2_len_dict = {} - models = ['y_stm_cs', 'y_sm_cs', 'y_nnh_cs', 'y_rf_cs'] - colors = ['rosybrown', 'lightcoral', 'firebrick', 'blue'] + models = ['y_stm_cs', 'y_sm_cs', 'y_cams_cs', 'y_nnh_cs', 'y_rf_cs'] + colors = ['rosybrown', 'lightcoral', 'red', 'firebrick', 'blue'] for model in models: r2_len_list = [] for len_ in len_list: @@ -776,7 +776,7 @@ def gap_len_vs_r2(): linestyle='--') plt.xlim(-.9, 30) plt.ylim(-.3, 1.1) - plt.legend() + # plt.legend() plt.xlabel('gap length') plt.ylabel('R2 score') @@ -899,12 +899,12 @@ if __name__ == '__main__': station_ids_on_map_ = False o3_value_matrix_ = False mask_matrix_ = False - time_series_ = True + time_series_ = False basic_statistics_and_histogram_ = False exceedances_ = False true_vs_imputed_ = False - n_neighbors_vs_r2_ = False - gap_len_vs_r2_ = False + n_neighbors_vs_r2_ = True + gap_len_vs_r2_ = True imputed_dataset_matrix_ = False modeled_time_series_ = False