diff --git a/source/experiments/cs_uba_graph.py b/source/experiments/cs_uba_graph.py index 69ba9262c6d9a3e3dee2bb471c5fb04c45dd481b..7b117c4336bbc349ed0e13a06f215e7c38346991 100644 --- a/source/experiments/cs_uba_graph.py +++ b/source/experiments/cs_uba_graph.py @@ -29,6 +29,7 @@ from preprocessing.uba import UBAGraph from models.correct_and_smooth import CorrectAndSmooth from models.simple import SpatiotemporalMean from models.simple import SpatialMean +from models.simple import Cams from models.simple import NearestNeighborHybrid from models.simple import RandomForest from postprocessing.evaluation_tools import index_of_agreement @@ -39,19 +40,27 @@ class CSWorkflow: """ A class for all correct and smooth workflows """ - def __init__(self, simple_model, use_val=False): + def __init__(self, simple_model, use_val=False, cv_seed=None): """ Initialize the workflow with a simple model and indicate whether correct and smooth should be performed. """ self.simple_model_class = simple_model self.use_val = use_val + self.cv_seed = cv_seed def get_dataset(self): """ Read in the UBAGraph dataset """ ug = UBAGraph() + if self.cv_seed is not None: + p1 = settings.resources_dir + p2 = settings.resources_dir + f'cross_validation/{self.cv_seed}/' + ug.missing_o3_mask_pt_path = ug.missing_o3_mask_pt_path.replace(p1, p2) + ug.train_mask_pt_path = ug.train_mask_pt_path.replace(p1, p2) + ug.val_mask_pt_path = ug.val_mask_pt_path.replace(p1, p2) + ug.test_mask_pt_path = ug.test_mask_pt_path.replace(p1, p2) ug.get_dataset() dataset = ug self.data = dataset[0] @@ -60,7 +69,10 @@ class CSWorkflow: """ Read predictions of simple model """ - self.sm = self.simple_model_class(use_val=self.use_val) + if self.cv_seed is not None: + self.sm = self.simple_model_class(use_val=self.use_val, cv_seed=self.cv_seed) + else: + self.sm = self.simple_model_class(use_val=self.use_val) print(f'\nRead predictions for {self.sm} from {self.sm.prediction_path}') self.sm.read_predictions() self.y_hat_simple_model = self.sm.y_hat @@ -132,7 +144,8 @@ class CSWorkflow: else: evaluation_set = 'val' print(f'Bin evaluation on {evaluation_set} set') - gle = GapLengthEvaluation() + gle = GapLengthEvaluation(cv_seed=self.cv_seed) + pd.set_option('display.max_columns', 500) print(gle.evaluate_bins_separately(evaluation_set+'_mask', self.data.y, y_hat)) print() @@ -145,8 +158,12 @@ def tune_correct_and_smooth(): """ print(f"\n{'*'*29}\n** TUNE CORRECT AND SMOOTH **\n{'*'*29}\n") simple_models = [ - SpatiotemporalMean, SpatialMean, - NearestNeighborHybrid, RandomForest] + Cams, + SpatiotemporalMean, + SpatialMean, + NearestNeighborHybrid, + RandomForest + ] correction_alphas = np.arange(0., 1.1, .2) num_correction_layers = [10] smoothing_alphas = np.arange(0., 1.1, .2) @@ -185,22 +202,29 @@ def tune_correct_and_smooth(): print(f'tuning results written to {settings.output_dir}tuning.csv') -def apply_correct_and_smooth(save=False): +def apply_correct_and_smooth(save=False, cv_seed=None): """ Finally, apply correct using training and validation set! If save is set to true, the saved predictions are overwritten. 
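
+    If cv_seed is given, the data splits and the simple-model
+    predictions are read from the corresponding
+    cross_validation/{cv_seed}/ directory.
     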
""" print(f"\n{'*'*30}\n** APPLY CORRECT AND SMOOTH **\n{'*'*30}\n") - specs = [ - (SpatiotemporalMean, .6, 10, 0., 0, 2.), # final - (SpatialMean, .6, 10, .2, 10, 1.75), # final - (NearestNeighborHybrid, 0., 0, .2, 10, 0.), # final - (RandomForest, .6, 10, 0., 0, 2.0), # final - ] + if cv_seed is not None: + specs = [ + (NearestNeighborHybrid, 0., 0, .2, 10, 0.), # final + (RandomForest, .6, 10, 0., 0, 2.0), # final + ] + else: + specs = [ + (Cams, .6, 10, .2, 10, 2.), # final + (SpatiotemporalMean, .6, 10, 0., 0, 2.), # final + (SpatialMean, .6, 10, .2, 10, 1.75), # final + (NearestNeighborHybrid, 0., 0, .2, 10, 0.), # final + (RandomForest, .6, 10, 0., 0, 2.0), # final + ] for simple_model, correction_alpha, num_correction_layers, \ smoothing_alpha, num_smoothing_layers, scale in specs: - csw = CSWorkflow(simple_model=simple_model, use_val=True) + csw = CSWorkflow(simple_model=simple_model, use_val=True, cv_seed=cv_seed) csw.get_dataset() csw.get_simple_model() csw.correct_and_smooth(correction_alpha=correction_alpha, @@ -210,7 +234,10 @@ def apply_correct_and_smooth(save=False): scale=scale) csw.evaluate() if save: - dir_ = settings.resources_dir + 'models/' + if cv_seed is not None: + dir_ = settings.resources_dir + f'cross_validation/{cv_seed}/models/' + else: + dir_ = settings.resources_dir + 'models/' file_ = f'{simple_model()}_cs_predictions_useval.pyt' torch.save(csw.y_hat_smooth, dir_+file_) print(f'written to {dir_+file_}\n\n') @@ -221,7 +248,9 @@ if __name__ == '__main__': Start a workflow """ # tune_correct_and_smooth() - apply_correct_and_smooth() + # apply_correct_and_smooth() + for cv_seed in settings.cv_seeds: + apply_correct_and_smooth(save=True, cv_seed=cv_seed) diff --git a/source/models/simple.py b/source/models/simple.py index e81827f5e0bab3ca24f79071086ddabf8b7b0b5c..07024b68058f02846d0c57aa03854ba89a7a8ce5 100644 --- a/source/models/simple.py +++ b/source/models/simple.py @@ -185,6 +185,60 @@ class SpatialMean: self.y_hat = torch.load(self.prediction_path) +class Cams: + """ + Use the modeled ozone values from CAMS + """ + def __init__(self, use_val=False): + """ + Initializing the class + + use_val = True means the validation set is used for fitting + (so it does nothing for this class) + """ + self.use_val = use_val + prediction_dir = settings.resources_dir + 'models/' + if use_val: + prediction_file = 'cams_predictions_useval.pt' + else: + prediction_file = 'cams_predictions.pt' + self.prediction_path = prediction_dir + prediction_file + + def __str__(self): + """ + The name of the Mean simple model. + """ + return 'Cams' + + def save_predictions(self): + """ + Fitting the model. The whole datset is given + as an argument, since different models need + different data for fitting. + """ + print('preparing predictions for Cams model...') + + # load dataset + hd_cams_o3 = HourlyData('cams_o3') + hd_cams_o3.read_from_file() + y_hat_df = hd_cams_o3.df.T.copy() + + # save + y_flat_numpy = y_hat_df.values.reshape(-1).astype(np.float32) + y_flat_pytorch = torch.tensor(y_flat_numpy).view(-1, 1) + torch.save(y_flat_pytorch, self.prediction_path) + print(f'written to {self.prediction_path}') + + def read_predictions(self): + """ + Predictions were saved for future use + """ + if not os.path.exists(self.prediction_path): + self.save_predictions() + + self.y_hat = torch.load(self.prediction_path) + + class NearestNeighborHybrid: """ A model that performs simple imputation @@ -194,18 +248,23 @@ class NearestNeighborHybrid: Nearest neighbors in feature dimension. 
     (for longer gaps)
     """
-    def __init__(self, use_val=False):
+    def __init__(self, use_val=False, cv_seed=None):
         """
         Initializing the class
 
         use_val = True means the validation set is used for fitting
         """
         self.use_val = use_val
-        prediction_dir = settings.resources_dir + 'models/'
+        self.cv_seed = cv_seed
         if use_val:
             prediction_file = 'nearestneighborhybrid_predictions_useval.pt'
         else:
             prediction_file = 'nearestneighborhybrid_predictions.pt'
+        if cv_seed is not None:
+            prediction_dir = settings.resources_dir + 'cross_validation/' + \
+                str(self.cv_seed) + '/models/'
+        else:
+            prediction_dir = settings.resources_dir + 'models/'
         self.prediction_path = prediction_dir + prediction_file
 
     def __str__(self):
@@ -238,7 +297,7 @@
 
         # UBA Graph data
         ug = UBAGraph()
-        ug.get_dataset()
+        ug.get_dataset(cv_seed=self.cv_seed)
 
         # set test or val and test samples to nan
         mask_flat_df = pd.read_csv(ug.mask_path, index_col=0)
@@ -296,20 +355,25 @@ class RandomForest:
     """
     Training and using random forests.
     """
-    def __init__(self, use_val=False):
+    def __init__(self, use_val=False, cv_seed=None):
         """
         Initializing the class
 
         use_val = True means the validation set is used for fitting
         """
         self.use_val = use_val
-        directory = settings.resources_dir + 'models/'
+        self.cv_seed = cv_seed
         if use_val:
             prediction_file = 'randomforest_predictions_useval.pt'
             model_file = 'randomforest_model_useval.pkl'
         else:
             prediction_file = 'randomforest_predictions.pt'
             model_file = 'randomforest_model.pkl'
+        if cv_seed is not None:
+            directory = settings.resources_dir + 'cross_validation/' + \
+                str(self.cv_seed) + '/models/'
+        else:
+            directory = settings.resources_dir + 'models/'
         self.prediction_path = directory + prediction_file
         self.model_path = directory + model_file
@@ -350,6 +414,10 @@
         ug = UBAGraph()
         x_df = pd.read_csv(ug.x_path, index_col=0)
         y_df = pd.read_csv(ug.y_path, index_col=0)
+        if self.cv_seed is not None:
+            p1 = settings.resources_dir
+            p2 = settings.resources_dir + f'cross_validation/{self.cv_seed}/'
+            ug.mask_path = ug.mask_path.replace(p1, p2)
         mask_df = pd.read_csv(ug.mask_path, index_col=0)
 
         # use standard features from feature selection if no features given
@@ -492,7 +560,7 @@
             break
 
 
-def evaluate_models():
+def evaluate_models(cv_seed=None):
     """
     Evaluate all simple models
     """
@@ -503,20 +571,31 @@
 
     # read graph dataset
     ug = UBAGraph()
+    if cv_seed is not None:
+        p1 = settings.resources_dir
+        p2 = settings.resources_dir + f'cross_validation/{cv_seed}/'
+        ug.missing_o3_mask_pt_path = ug.missing_o3_mask_pt_path.replace(p1, p2)
+        ug.train_mask_pt_path = ug.train_mask_pt_path.replace(p1, p2)
+        ug.val_mask_pt_path = ug.val_mask_pt_path.replace(p1, p2)
+        ug.test_mask_pt_path = ug.test_mask_pt_path.replace(p1, p2)
     ug.get_dataset()
     dataset = ug[0]
     y = dataset.y.numpy().reshape(-1)
 
     # which models to evaluate
-    models = [SpatiotemporalMean, SpatialMean,
-              NearestNeighborHybrid, RandomForest]
-    use_vals = [False, True]
+    if cv_seed is not None:
+        models = [NearestNeighborHybrid, RandomForest]
+        use_vals = [True]
+    else:
+        models = [SpatiotemporalMean, SpatialMean, Cams,
+                  NearestNeighborHybrid, RandomForest]
+        use_vals = [False, True]
     specs = [(model, use_val) for model in models
              for use_val in use_vals]
 
     # evaluation
     for model, use_val in specs:
-        m = model(use_val=use_val)
+        if cv_seed is not None:
+            m = model(use_val=use_val, cv_seed=cv_seed)
+        else:
+            # SpatiotemporalMean, SpatialMean and Cams take no cv_seed
+            m = model(use_val=use_val)
         m.read_predictions()
         y_hat = m.y_hat.numpy().reshape(-1)
         if use_val:
@@ -541,38 +620,68 @@
         print(f'R2: {train_r2:.3f}, RMSE: {train_rmse:.3f}, d: {train_d:.3f}')
         print(f'evaluation on {test_mask_name} set')
         print(f'R2: {test_r2:.3f}, RMSE: {test_rmse:.3f}, d: {test_d:.3f}')
-        gle = GapLengthEvaluation()
-        print(f'bin evaluation on {test_mask_name}')
-        print(gle.evaluate_bins_separately(test_mask_name+'_mask', y, y_hat))
+        if cv_seed is None:
+            gle = GapLengthEvaluation()
+            print(f'bin evaluation on {test_mask_name}')
+            print(gle.evaluate_bins_separately(test_mask_name+'_mask', y, y_hat))
         print('-'*20)
 
 
-if __name__ == '__main__':
+def tune_and_save_models():
     """
     Tune the simple models, then save their predictions.
     Also save the predictions where the validation set is
     used for training.
     """
-    # model = SpatiotemporalMean()
-    # model.save_predictions()
-    # model = SpatiotemporalMean(use_val=True)
-    # model.save_predictions()
-
-    # model = SpatialMean()
-    # model.save_predictions()
-    # model = SpatialMean(use_val=True)
-    # model.save_predictions()
-
-    # tune_nnh()
-    # model = NearestNeighborHybrid()
-    # model.save_predictions()
-    # model = NearestNeighborHybrid(use_val=True)
-    # model.save_predictions()
-
-    # select_rf_features()
-    # model = RandomForest()
-    # model.save_predictions()
-    # model = RandomForest(use_val=True)
-    # model.save_predictions()
+    model = SpatiotemporalMean()
+    model.save_predictions()
+    model = SpatiotemporalMean(use_val=True)
+    model.save_predictions()
+
+    model = SpatialMean()
+    model.save_predictions()
+    model = SpatialMean(use_val=True)
+    model.save_predictions()
+
+    model = Cams()
+    model.save_predictions()
+    model = Cams(use_val=True)
+    model.save_predictions()
+
+    tune_nnh()
+    model = NearestNeighborHybrid()
+    model.save_predictions()
+    model = NearestNeighborHybrid(use_val=True)
+    model.save_predictions()
+
+    select_rf_features()
+    model = RandomForest()
+    model.save_predictions()
+    model = RandomForest(use_val=True)
+    model.save_predictions()
 
     evaluate_models()
+
+
+def save_models_for_cross_validation():
+    """
+    Retrain the simple models for different data splits.
+    N.B.: the cv_seed option is only available for nearest neighbor
+    hybrid and random forest, our final chosen models.
+    """
+    print('saving models for cross-validation...')
+    for cv_seed in settings.cv_seeds:
+        print('cross validation seed', cv_seed)
+
+        # model = NearestNeighborHybrid(use_val=True, cv_seed=cv_seed)
+        # model.save_predictions()
+
+        # model = RandomForest(use_val=True, cv_seed=cv_seed)
+        # model.save_predictions()
+
+        evaluate_models(cv_seed=cv_seed)
+
+
+if __name__ == '__main__':
+    # tune_and_save_models()
+    save_models_for_cross_validation()
diff --git a/source/postprocessing/compare_with_uba_validated_dataset.py b/source/postprocessing/compare_with_uba_validated_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..b789d316815ec387c49d949636902a6d82e25fd5
--- /dev/null
+++ b/source/postprocessing/compare_with_uba_validated_dataset.py
@@ -0,0 +1,71 @@
+"""
+This file compares the final imputed dataset with the final validated
+dataset from UBA.
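+Two boolean masks drive the comparison: one marks samples present in
+both datasets (to quantify the differences), the other marks samples
+present only in the validated UBA dataset (to check the robustness of
+our imputations on data never seen during training).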
+""" + +# general +import pdb + +# data science +import pandas as pd +import numpy as np + +# machine learning +from sklearn.metrics import r2_score + +# own +import settings +from retrieval.uba_validated import UBARetrieval +from retrieval.toar_db import HourlyData +from retrieval.utils import print_statistics +from postprocessing.evaluation_tools import index_of_agreement +from postprocessing.evaluation_tools import rmse +from visualizations.plots_for_paper import o3_value_matrix + + +if __name__ == '__main__': + """ + Comparison of UBA validation and imputed dataset + """ + print('comparing imputations with final UBA dataset...') + + # read in final validated dataset + ur = UBARetrieval() + ur.read_from_file() + uba_df = ur.df + y = uba_df.to_numpy() + + # read in final imputed dataset + imputed_path = settings.resources_dir + 'imputed_dataset/' + imputed_o3_file = 'imputed_o3.csv' + imputed_df = pd.read_csv(imputed_path+imputed_o3_file, + index_col=0) + imputed_info_file = 'imputed_info.csv' # True if imputed + imputed_info_df = pd.read_csv(imputed_path+imputed_info_file, + index_col=0) + y_hat = imputed_df.to_numpy() + + # we need two masks, + # one is true if data is avaliable in both datasets, to compare them + # (to estimate the differences) + # one is true if data is available in the final UBA dataset but + # not in our training dataset, to ensure robustness of our imputations + comparison_mask = np.full_like(uba_df, True, dtype=bool) + comparison_mask[pd.isna(uba_df).values] = False + comparison_mask[imputed_info_df] = False + evaluation_mask = np.full_like(uba_df, True, dtype=bool) + evaluation_mask[pd.isna(uba_df).values] = False + evaluation_mask[~imputed_info_df] = False + + # print basic info about the masks + N = uba_df.size + specs = [('comparison', comparison_mask), + ('evaluation', evaluation_mask)] + for string, mask in specs: + print('\n', string) + n = mask.sum() + print(f'{n} samples = {n/N*100:.1f}% of all samples') + r2_ = r2_score(y[mask], y_hat[mask]) + rmse_ = rmse(y[mask], y_hat[mask]) + d_ = index_of_agreement(y[mask], y_hat[mask]) + print(f'R2: {r2_:.2f}, RMSE: {rmse_:.2f}, d: {d_:.2f}') diff --git a/source/postprocessing/evaluation_tools.py b/source/postprocessing/evaluation_tools.py index 19c329a958e61ea14af79ca2abca986fce820ad3..a1526d58ad7b140cc97fb227b69e2d0a059eca84 100644 --- a/source/postprocessing/evaluation_tools.py +++ b/source/postprocessing/evaluation_tools.py @@ -89,12 +89,18 @@ class GapLengthEvaluation: and evaluation routines to be imported from experiment workflows. 
""" - def __init__(self): + def __init__(self, cv_seed=None): """ Initialize the class """ # UBA graph has all data paths needed self.ug = UBAGraph() + self.cv_seed = cv_seed + if self.cv_seed is not None: + p1 = settings.resources_dir + p2 = settings.resources_dir + f'cross_validation/{self.cv_seed}/' + self.ug.gap_path = self.ug.gap_path.replace(p1, p2) + self.ug.mask_path = self.ug.mask_path.replace(p1, p2) self.gap_df = pd.read_csv(self.ug.gap_path, index_col=0) self.mask_df = pd.read_csv(self.ug.mask_path, index_col=0) @@ -325,34 +331,48 @@ def number_of_neighbors(station_id, return_ids=False): return(n_neighbors, distances) -def get_characteristics_df(how='read'): +def get_characteristics_df(how='read', cv_seed=None): """ A dataframe where we preprocess some gap characteristics for easier plotting """ - # read in data - ug = UBAGraph() - y_true_df = pd.read_csv(ug.y_path, index_col=0) - gap_df = pd.read_csv(ug.gap_path, index_col=0) - mask_df = pd.read_csv(ug.mask_path, index_col=0) - reg_df = pd.read_csv(ug.reg_path, index_col=0) - model_path = settings.resources_dir + 'models/' - lin_file = 'nearestneighborhybrid_predictions_useval.pt' - stm_cs_file = 'SpatiotemporalMean_cs_predictions_useval.pyt' - sm_cs_file = 'SpatialMean_cs_predictions_useval.pyt' - nnh_cs_file = 'NearestNeighborHybrid_cs_predictions_useval.pyt' - rf_cs_file = 'RandomForest_cs_predictions_useval.pyt' - lin_imp = torch.load(model_path+lin_file).numpy().reshape(-1) - stm_cs_imp = torch.load(model_path+stm_cs_file).numpy().reshape(-1) - sm_cs_imp = torch.load(model_path+sm_cs_file).numpy().reshape(-1) - nnh_cs_imp = torch.load(model_path+nnh_cs_file).numpy().reshape(-1) - rf_cs_imp = torch.load(model_path+rf_cs_file).numpy().reshape(-1) - # path - save_path = settings.output_dir + 'characteristics_tmp.csv' + if cv_seed is None: + save_path = settings.output_dir + 'characteristics_tmp.csv' + else: + save_path = settings.output_dir + f'characteristics_tmp_{cv_seed}.csv' # set up and fill df if how == 'prepare': + print('\npreparing characteristics df') + print(f'cv_seed = {cv_seed}') + + # read in data + ug = UBAGraph() + if cv_seed is not None: + p1 = settings.resources_dir + p2 = settings.resources_dir + f'cross_validation/{cv_seed}/' + ug.gap_path = ug.gap_path.replace(p1, p2) + ug.mask_path = ug.mask_path.replace(p1, p2) + y_true_df = pd.read_csv(ug.y_path, index_col=0) + gap_df = pd.read_csv(ug.gap_path, index_col=0) + mask_df = pd.read_csv(ug.mask_path, index_col=0) + reg_df = pd.read_csv(ug.reg_path, index_col=0) + model_path = settings.resources_dir + 'models/' + if cv_seed is not None: + model_path = model_path.replace(p1, p2) + lin_file = 'nearestneighborhybrid_predictions_useval.pt' + stm_cs_file = 'SpatiotemporalMean_cs_predictions_useval.pyt' + sm_cs_file = 'SpatialMean_cs_predictions_useval.pyt' + nnh_cs_file = 'NearestNeighborHybrid_cs_predictions_useval.pyt' + rf_cs_file = 'RandomForest_cs_predictions_useval.pyt' + lin_imp = torch.load(model_path+lin_file).numpy().reshape(-1) + nnh_cs_imp = torch.load(model_path+nnh_cs_file).numpy().reshape(-1) + rf_cs_imp = torch.load(model_path+rf_cs_file).numpy().reshape(-1) + if cv_seed is None: + stm_cs_imp = torch.load(model_path+stm_cs_file).numpy().reshape(-1) + sm_cs_imp = torch.load(model_path+sm_cs_file).numpy().reshape(-1) + # set up df columns = ['y_true', 'y_imputed', 'y_stm_cs', 'y_sm_cs', 'y_nnh_cs', 'y_rf_cs', @@ -362,12 +382,13 @@ def get_characteristics_df(how='read'): # columns that we can take from other data frames df.y_true = y_true_df.y - 
df.y_stm_cs = stm_cs_imp - df.y_sm_cs = sm_cs_imp df.y_nnh_cs = nnh_cs_imp df.y_rf_cs = rf_cs_imp df.station_id = reg_df.station_id df.test_mask = mask_df.test_mask + if cv_seed is None: + df.y_stm_cs = stm_cs_imp + df.y_sm_cs = sm_cs_imp # number of neighbors of each station station_ids = np.unique(df.station_id) @@ -380,9 +401,10 @@ def get_characteristics_df(how='read'): gap_df_filtered = gap_df[gap_df.type=='test_mask'] gap_df_filtered.reset_index(inplace=True) for gap_idx, gap_row in gap_df_filtered.iterrows(): - if gap_idx % 1000 == 0: + if gap_idx % 5000 == 0: print(f'{gap_idx/len(gap_df_filtered)*100:.2f} %') df.to_csv(save_path) + print(f'intermediate save to {save_path}') start_idx = gap_row.start_idx end_idx = gap_row.start_idx + gap_row.len - 1 len_ = gap_row.len @@ -398,6 +420,7 @@ def get_characteristics_df(how='read'): df.loc[start_idx:end_idx, 'y_imputed'] = \ rf_cs_imp[start_idx:end_idx+1] df.to_csv(save_path) + print(f'written to {save_path}') # if it was already prepared elif how == 'read': @@ -407,6 +430,66 @@ def get_characteristics_df(how='read'): return df +def evaluate_cross_validation(): + """ + Read in all cross validation preprocessed results and print + their statistics + """ + print('evaluate cross validation...') + + # read in data + dict_ = {} + r2_list = [] + rmse_list = [] + d_list = [] + df = pd.read_csv(settings.output_dir+'characteristics_tmp.csv', + index_col=0) + dict_[1] = df + for cv_seed in range(1, 12): + if cv_seed != 1: + df = pd.read_csv(settings.output_dir+f'characteristics_tmp_{cv_seed}.csv', + index_col=0) + dict_[cv_seed] = df + + # prepare scatter true vs. imputed + df = dict_[cv_seed] + y_tru_short = df[(df.test_mask) & + (df.gap_len<=5)].y_true + y_imp_short = df[(df.test_mask) & + (df.gap_len<=5)].y_imputed + y_tru_long = df[(df.test_mask) & + (df.gap_len>5)].y_true + y_imp_long = df[(df.test_mask) & + (df.gap_len>5)].y_imputed + y_tru = df[df.test_mask].y_true + y_imp = df[df.test_mask].y_imputed + + # print statistics, only for summary + for tru, imp, what in [#(y_tru_short, y_imp_short, 'short'), + #(y_tru_long, y_imp_long, 'long'), + (y_tru, y_imp, 'summary') ]: + r2 = r2_score(tru, imp) + rmse = (mean_squared_error(tru, imp))**.5 + d = index_of_agreement(tru, imp) + if cv_seed == 1: + print('\noriginal test set:') + print(f'r2: {r2:.5f}') + print(f'rmse: {rmse:.5f}') + print(f'd: {d:.5f}\n') + r2_list.append(r2) + rmse_list.append(rmse) + d_list.append(d) + print('cross validation:') + print(f'\nbest r2: {np.max(r2_list):.5f}') + print(f'worst r2: {np.min(r2_list):.5f}\n') + print(f'best rmse: {np.min(rmse_list):.5f}') + print(f'worst rmse: {np.max(rmse_list):.5f}\n') + print(f'best d: {np.max(d_list):.5f}') + print(f'worst d: {np.min(d_list):.5f}\n') + + + + if __name__ == '__main__': """ Start or test routines. 
@@ -415,9 +498,10 @@
     gap_length_evaluation_ = False
     print_best_hyperparameters_ = False
     test_bootstrap_ = False
-    count_exceedances_ = True
+    count_exceedances_ = False
     number_of_neighbors_ = False
     get_characteristics_df_ = False
+    evaluate_cross_validation_ = True
 
     if index_of_agreement_:
         # test index of agreement should be.959
@@ -462,4 +546,9 @@
     if get_characteristics_df_:
         # df = get_characteristics_df(how='read')
        df = get_characteristics_df(how='prepare')
+        for cv_seed in settings.cv_seeds:
+            get_characteristics_df(how='prepare', cv_seed=cv_seed)
+
+    if evaluate_cross_validation_:
+        evaluate_cross_validation()
diff --git a/source/preprocessing/uba.py b/source/preprocessing/uba.py
index d98e3d4373317b8cce9356c80f9570a9cba7870f..ded0345fda8b8abef765fa2715d73f8ea7ffed7e 100644
--- a/source/preprocessing/uba.py
+++ b/source/preprocessing/uba.py
@@ -9,6 +9,7 @@ import pdb
 import warnings
 import pickle as pkl
 import datetime
+import os
 
 # data science
 import numpy as np
@@ -26,6 +27,7 @@ from sklearn.neighbors import NearestNeighbors
 import settings
 from retrieval.toar_db import StationData
 from retrieval.toar_db import HourlyData
+from retrieval.uba_validated import UBARetrieval
 from preprocessing.utils import geo_to_cartesian, neighbor_index_filter, \
     loc_weight, time_weight, get_one_hot
 
@@ -216,20 +218,33 @@
         y_df.to_csv(self.y_path)
         print(f'written to {self.y_path}')
 
-    def get_gaps(self):
+    def get_gaps(self, uba_validated=False):
         """
         Analyze the gaps in measurement data, their length.
+        There is an option to analyze the validated UBA dataset
+        instead; it is not needed to prepare the UBA graph dataset,
+        so the default is uba_validated=False.
         """
         print('analyzing existing gaps...')
         print('correlated gaps...')
 
         # read in data
-        y_df_flat = pd.read_csv(self.y_path, index_col=0)
+        if uba_validated:
+            ur = UBARetrieval()
+            ur.read_from_file()
+            y_df = ur.df
+            y_values = ur.df.values.T.reshape(-1)
+            y_df_flat = pd.DataFrame(index=range(len(y_values)),
+                                     columns=['y'],
+                                     data=y_values)
+            y_df_flat.index.name = 'node_index'
+        else:
+            hd = HourlyData('o3')
+            hd.read_from_file()
+            y_df = hd.df
+            y_df_flat = pd.read_csv(self.y_path, index_col=0)
         reg_df = pd.read_csv(self.reg_path, index_col=0)
         reg_df.datetime = pd.to_datetime(reg_df.datetime)
-        hd = HourlyData('o3')
-        hd.read_from_file()
-        y_df = hd.df
         y_df.index = pd.to_datetime(y_df.index) + settings.time_offset
         y_df.columns = [int(col) for col in y_df.columns]
@@ -319,10 +334,16 @@
         station_id_old = station_id
 
         # save
-        gap_df.to_csv(self.gap_path)
-        print(f'written to {self.gap_path}')
-
-    def get_mask(self):
+        if uba_validated:
+            gap_path = settings.resources_dir + \
+                'uba_validated_dataset/gap.csv'
+            gap_df.to_csv(gap_path)
+            print(f'written to {gap_path}')
+        else:
+            gap_df.to_csv(self.gap_path)
+            print(f'written to {self.gap_path}')
+
+    def get_mask(self, cv_seed=None):
         """
         - missing_o3_mask (True if o3 is missing)
         - train_mask (True if training sample)
@@ -348,7 +369,10 @@
         y_df.index = pd.to_datetime(y_df.index) + settings.time_offset
 
         # random seed
-        random.seed(settings.random_seed)
+        if cv_seed is not None:
+            random.seed(cv_seed)
+        else:
+            random.seed(settings.random_seed)
 
         # initialize mask df
         columns = ['missing_o3_mask', 'train_mask', 'val_mask',
@@ -439,7 +463,7 @@
         shopping_list_corr.reverse()
         shopping_list.reverse()
 
-        dis = 0 # tmp
+        # dis = 0 # tmp
print('\nflagging correlated gaps...') correlated = True @@ -549,6 +573,14 @@ class UBAGraph(InMemoryDataset): assert mask_df.sum().sum() == len(mask_df) # save + if cv_seed is not None: + self.mask_path = self.mask_path.replace(settings.resources_dir, + settings.resources_dir + \ + f'cross_validation/{cv_seed}/') + self.gap_path = self.gap_path.replace(settings.resources_dir, + settings.resources_dir + \ + f'cross_validation/{cv_seed}/') + mask_df.to_csv(self.mask_path) print(f'\nwritten to {self.mask_path}') gap_df.to_csv(self.gap_path) @@ -670,12 +702,38 @@ class UBAGraph(InMemoryDataset): weight_df.to_csv(self.weight_path) print(f'written to {self.weight_path}') - def convert_to_tensor(self): + def convert_to_tensor(self, cv_seed=None): """ Convert all the .csv files to tensors """ print('converting to pytorch tensors...') + # masks + if cv_seed is not None: + p1 = settings.resources_dir + p2 = settings.resources_dir + f'cross_validation/{cv_seed}/' + self.mask_path = self.mask_path.replace(p1, p2) + self.missing_o3_mask_pt_path = self.missing_o3_mask_pt_path.replace(p1, p2) + self.train_mask_pt_path = self.train_mask_pt_path.replace(p1, p2) + self.val_mask_pt_path = self.val_mask_pt_path.replace(p1, p2) + self.test_mask_pt_path = self.test_mask_pt_path.replace(p1, p2) + mask_df = pd.read_csv(self.mask_path, index_col=0) + missing_o3_mask = torch.tensor(mask_df.missing_o3_mask).view(-1) + torch.save(missing_o3_mask, self.missing_o3_mask_pt_path) + print(f'written to {self.missing_o3_mask_pt_path}') + train_mask = torch.tensor(mask_df.train_mask).view(-1) + torch.save(train_mask, self.train_mask_pt_path) + print(f'written to {self.train_mask_pt_path}') + val_mask = torch.tensor(mask_df.val_mask).view(-1) + torch.save(val_mask, self.val_mask_pt_path) + print(f'written to {self.val_mask_pt_path}') + test_mask = torch.tensor(mask_df.test_mask).view(-1) + torch.save(test_mask, self.test_mask_pt_path) + print(f'written to {self.test_mask_pt_path}') + del mask_df + if cv_seed is not None: + return + # x x_df = pd.read_csv(self.x_path, index_col=0) x_df = get_one_hot(x_df) @@ -707,22 +765,6 @@ class UBAGraph(InMemoryDataset): print(f'written to {self.y_pt_path}') del y_df - # masks - mask_df = pd.read_csv(self.mask_path, index_col=0) - missing_o3_mask = torch.tensor(mask_df.missing_o3_mask).view(-1) - torch.save(missing_o3_mask, self.missing_o3_mask_pt_path) - print(f'written to {self.missing_o3_mask_pt_path}') - train_mask = torch.tensor(mask_df.train_mask).view(-1) - torch.save(train_mask, self.train_mask_pt_path) - print(f'written to {self.train_mask_pt_path}') - val_mask = torch.tensor(mask_df.val_mask).view(-1) - torch.save(val_mask, self.val_mask_pt_path) - print(f'written to {self.val_mask_pt_path}') - test_mask = torch.tensor(mask_df.test_mask).view(-1) - torch.save(test_mask, self.test_mask_pt_path) - print(f'written to {self.test_mask_pt_path}') - del mask_df - # position pos_df = pd.read_csv(self.pos_path, index_col=0) pos = torch.tensor(pos_df[['lon', 'lat']].values.astype(np.float32)) @@ -730,7 +772,7 @@ class UBAGraph(InMemoryDataset): print(f'written to {self.pos_pt_path}') del pos_df - def get_dataset(self): + def get_dataset(self, cv_seed=None): """ Finalize the Pytorch dataset, include everything that is in the gAQBench dataset. 
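+        With cv_seed set, the mask tensors are loaded from the
+        corresponding cross_validation/{cv_seed}/ directory.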
@@ -740,11 +782,19 @@
         x = torch.load(self.x_pt_path)
         edge_index = torch.load(self.edge_index_pt_path)
         edge_weight = torch.load(self.edge_weight_pt_path)
         y = torch.load(self.y_pt_path)
+        pos = torch.load(self.pos_pt_path)
+        if cv_seed is not None:
+            p1 = settings.resources_dir
+            p2 = settings.resources_dir + f'cross_validation/{cv_seed}/'
+            self.mask_path = self.mask_path.replace(p1, p2)
+            self.missing_o3_mask_pt_path = self.missing_o3_mask_pt_path.replace(p1, p2)
+            self.train_mask_pt_path = self.train_mask_pt_path.replace(p1, p2)
+            self.val_mask_pt_path = self.val_mask_pt_path.replace(p1, p2)
+            self.test_mask_pt_path = self.test_mask_pt_path.replace(p1, p2)
         missing_o3_mask = torch.load(self.missing_o3_mask_pt_path)
         train_mask = torch.load(self.train_mask_pt_path)
         val_mask = torch.load(self.val_mask_pt_path)
         test_mask = torch.load(self.test_mask_pt_path)
-        pos = torch.load(self.pos_pt_path)
 
         # create dataset
         data = Data(x=x, edge_index=edge_index, edge_weight=edge_weight,
@@ -798,9 +848,40 @@
     print(f'Is undirected: {data.is_undirected()}')
 
 
+def cross_validation_data_splits():
+    """
+    Obtain and save 10 different cross validation data splits
+    """
+    print('obtain data splits for cross validation...')
+
+    for cv_seed in settings.cv_seeds:
+        print(f'\ncross validation set {cv_seed}')
+
+        # create all folders needed for cross validation
+        cv_folder = settings.resources_dir + \
+            f'cross_validation/{cv_seed}/'
+        preproc_folder = cv_folder + 'uba_graph_preproc/'
+        dataset_folder = cv_folder + 'uba_graph_dataset/'
+        model_folder = cv_folder + 'models/'
+        for folder in [cv_folder, preproc_folder, dataset_folder,
+                       model_folder]:
+            if not os.path.isdir(folder):
+                os.mkdir(folder)
+
+        # initialize the data preparation class and create the masks
+        ug = UBAGraph()
+        ug.get_mask(cv_seed=cv_seed)
+
+        # initialize a fresh instance before converting to tensors,
+        # since get_mask(cv_seed=...) rewrites the path attributes
+        # in place
+        ug = UBAGraph()
+        ug.convert_to_tensor(cv_seed=cv_seed)
+
+
 if __name__ == '__main__':
     """
     Start routines
     """
-    # dataset_workflow()
     print_graph_statistics()
+    # cross_validation_data_splits()
+    # dataset_workflow()
diff --git a/source/retrieval/uba_validated.py b/source/retrieval/uba_validated.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7550d30079c3143f94d52b3939d557cfad4642d
--- /dev/null
+++ b/source/retrieval/uba_validated.py
@@ -0,0 +1,143 @@
+"""
+This file reads in the final validated UBA dataset and converts it
+into the same format as the preprocessed ozone data from the TOAR
+database.
+"""
+
+# general
+import pdb
+
+# data science
+import pandas as pd
+import numpy as np
+
+# own
+import settings
+from retrieval.toar_db import StationData
+from retrieval.toar_db import HourlyData
+from retrieval.utils import print_statistics
+from retrieval.utils import find_toar_stations
+
+
+class UBARetrieval():
+    """
+    A class for preparing the validated UBA data.
+    """
+    def __init__(self):
+        """
+        Initialize class.
+        
+ """ + dir_ = settings.resources_dir + \ + 'uba_validated_dataset/' + self.in_path = dir_ + 'original/DE2011O3_1SMW_20220905.csv' + self.path = dir_ + 'hourly_o3_validated.csv' + + def prepare_uba_data(self): + """ + Read in the original csv file and convert it to our format + """ + print('Prepare validated UBA data...') + + # read in original data + original_df = pd.read_csv(self.in_path, + delimiter=';') + + # prepare final dataframe + hd = HourlyData('o3') + hd.read_from_file() + new_uba_df = pd.DataFrame(index=hd.df.index, + columns=hd.df.columns, + dtype=float) + + # constants for ug/m3 to ppb and missing value + c = 0.50124 + m = -9 + + # list with all TOAR station ids + sd = StationData() + sd.read_from_file() + toar_station_list = sd.df.index.to_list() + + # list with all uba station ids + uba_station_list = [station[1:-1] for station in + np.unique(original_df.Station)] + + # pair toar station ids with uba station ids + uba_toar_dict = {} + for uba_station in uba_station_list: + toar_stations_of_uba = find_toar_stations(uba_station) + for toar_station in toar_stations_of_uba: + if toar_station in toar_station_list: + uba_toar_dict[uba_station] = toar_station + print(f'Number of UBA stations: {len(uba_station_list)}') + print(f'Number of TOAR stations: {len(toar_station_list)}') + print(f'Number of matched stations: {len(uba_toar_dict)}') + + # fill the final dataframe + for idx, row in original_df.iterrows(): + uba_station = row.Station[1:-1] + + if idx % 10000 == 0: + print(f'{idx/len(original_df)*100:.1f} %') + + if uba_station in uba_toar_dict.keys(): + toar_station = uba_toar_dict[uba_station] + else: + continue + year = row.Datum[1:5] + month = row.Datum[5:7] + day = row.Datum[7:9] + for data_col in original_df.columns[5:-2]: + hour = data_col[4:6] + day_off = pd.to_timedelta(0, unit='d') + if hour == '24': + hour = '00' + day_off = pd.to_timedelta(1, unit='d') + dt_string = f'{year}-{month}-{day} {hour}:00:00' + dt = pd.to_datetime(dt_string) - \ + pd.to_timedelta(1, unit='h') + \ + day_off + if row[data_col] > m: + s = str(toar_station) + d = str(dt) + if d in new_uba_df.index: + new_uba_df.at[d, s] = row[data_col] * c + else: + print(d) + + # print statistics and prepare for saving + self.df = new_uba_df + print_statistics('o3_validated', self.df) + + def save_to_file(self): + """ + This file goes to the resources. + """ + self.df.to_csv(self.path) + print(f'Saved to file: {self.path}') + + def read_from_file(self): + """ + If the file already exists, read it! + """ + self.df = pd.read_csv(self.path, index_col=0) + + def get_gaps(self): + """ + Here we reuse a routine from graph dataset prep + """ + print('Save the dataset by using save_to_file(self), then') + print('Run the following to obtain a csv file with gaps:') + print('ug = UBAGraph()') + print('ug.get_gaps(uba_validated=True)') + + +if __name__ == '__main__': + """ + Start the specified routine. 
+ """ + ur = UBARetrieval() + # ur.prepare_uba_data() + # ur.save_to_file() + ur.read_from_file() + ur.get_gaps() diff --git a/source/retrieval/utils.py b/source/retrieval/utils.py index b84bbfce08af3712386218d114879b7a17e966d3..e4d4422b5d6968456ff08ac204d1a870c6386b16 100644 --- a/source/retrieval/utils.py +++ b/source/retrieval/utils.py @@ -37,6 +37,19 @@ def query_db(query): exit() +def find_toar_stations(uba_id): + """ + input: a string containing an UBA station id + output: a list of TOAR numid integers belonging to that station + """ + query = f""" + SELECT numid FROM stations + WHERE station_id = '{uba_id}'; + """ + df = query_db(query) + return df.numid.to_list() + + def print_statistics(name, data): """ Basic statistics for data sanity checks @@ -68,3 +81,4 @@ def print_statistics(name, data): # print statistics print(f'Basic statistics for {name}:') print(s) + diff --git a/source/settings.py b/source/settings.py index 60849b84e16304179e11b6e44a94c996c4aa0ff3..e87b844fc0474c1bfa1c53ee8f4ac6cb25f9a56b 100644 --- a/source/settings.py +++ b/source/settings.py @@ -25,8 +25,9 @@ output_dir = ROOTDIR + '/output/' # data resources directory resources_dir = '/p/project/deepacf/intelliaq/ozone-imputation-data/' -# random seed +# random seeds random_seed = 1 +cv_seeds = range(2, 12) # global model/data settings datetime_start = '2011-01-01 00:00:00' # start of dataset in UTC diff --git a/source/visualizations/plots_for_paper.py b/source/visualizations/plots_for_paper.py index fe891176f87f71bbb05f0d5c15f3f77baa9ded10..1ef6c310218e0d33fd2d1afd47114aa4a2c8a7e5 100644 --- a/source/visualizations/plots_for_paper.py +++ b/source/visualizations/plots_for_paper.py @@ -28,7 +28,7 @@ import settings from retrieval.toar_db import StationData, HourlyData from preprocessing.uba import UBAGraph from preprocessing.utils import geo_to_cartesian - +from retrieval.uba_validated import UBARetrieval from postprocessing.evaluation_tools import count_exceedances from postprocessing.evaluation_tools import number_of_neighbors from postprocessing.evaluation_tools import get_characteristics_df @@ -239,16 +239,21 @@ def station_ids_on_map(): plt.close() -def o3_value_matrix(): +def o3_value_matrix(validated=False): """ All o3 measurements we have """ print('o3 value matrix...') # read data - hd = HourlyData('o3') - hd.read_from_file() - data = hd.df.values.T + if validated: + ur = UBARetrieval() + ur.read_from_file() + data = ur.df.values.T + else: + hd = HourlyData('o3') + hd.read_from_file() + data = hd.df.values.T vmin = 0. 
vmax = np.nanpercentile(data, 99) @@ -258,7 +263,10 @@ def o3_value_matrix(): ax.axis('off') # save figure - save_path = settings.output_dir + 'o3_value_matrix.png' + if validated: + save_path = settings.output_dir + 'o3_value_matrix_validated.png' + else: + save_path = settings.output_dir + 'o3_value_matrix.png' plt.savefig(save_path, bbox_inches='tight', pad_inches=0, dpi=2000) print(f'saved to {save_path}') plt.close() @@ -273,20 +281,23 @@ def o3_value_matrix(): ax.axis('off') # print info - print('2.4 million values') - print('15.0 % missing values in total') - print('first longer correlated gap') - print('2011-08-19 09:00:00 -- 2011-08-20 04:00:00 local time') - print('length 19 h') + # print('2.4 million values') + # print('15.0 % missing values in total') + # print('first longer correlated gap') + # print('2011-08-19 09:00:00 -- 2011-08-20 04:00:00 local time') + # print('length 19 h') # save colorbar - save_path = settings.output_dir + 'o3_value_matrix_colorbar.png' + if validated: + save_path = settings.output_dir + 'o3_value_matrix_validated_colorbar.png' + else: + save_path = settings.output_dir + 'o3_value_matrix_colorbar.png' plt.savefig(save_path, bbox_inches='tight', pad_inches=0.) print(f'colorbar saved to {save_path}') plt.close() -def mask_matrix(): +def mask_matrix(cv_seed=None): """ the missing, train, test, val masks as a matrix. This plot was not published. @@ -295,6 +306,10 @@ def mask_matrix(): # read data ug = UBAGraph() + if cv_seed is not None: + ug.mask_path = ug.mask_path.replace(settings.resources_dir, + settings.resources_dir + \ + f'cross_validation/{cv_seed}/') mask_df = pd.read_csv(ug.mask_path, index_col=0) hd = HourlyData('o3') hd.read_from_file() @@ -319,7 +334,10 @@ def mask_matrix(): ax.axis('off') # save figure - save_path = settings.output_dir + 'mask_matrix.png' + if cv_seed is not None: + save_path = settings.output_dir + f'mask_matrix_{cv_seed}.png' + else: + save_path = settings.output_dir + 'mask_matrix.png' plt.savefig(save_path, bbox_inches='tight', pad_inches=0, dpi=2000) print(f'saved to {save_path}') plt.close() @@ -381,6 +399,12 @@ def time_series(gap_id=None): 'file_with_cs': 'SpatialMean_cs_predictions_useval.pyt', 'color': 'steelblue' } + numerical_model_dict = { + 'model_name': 'Numerical model', + 'file_without_cs': 'cams_predictions_useval.pt', + 'file_with_cs': 'Cams_cs_predictions_useval.pyt', + 'color': 'steelblue' + } nearest_neighbor_hybrid_dict = { 'model_name': 'Nearest neighbor hybrid', 'file_without_cs': 'nearestneighborhybrid_predictions_useval.pt', @@ -394,6 +418,7 @@ def time_series(gap_id=None): 'color': 'steelblue' } dicts = [spatiotemporal_mean_dict, spatial_mean_dict, + numerical_model_dict, nearest_neighbor_hybrid_dict, random_forest_dict] model_path = settings.resources_dir + 'models/' for dict_ in dicts: @@ -403,7 +428,7 @@ def time_series(gap_id=None): dict_['data_with_cs'] = data_with_cs.numpy().reshape(-1) # setting up plot - fig, axes = plt.subplots(4, 1, figsize=(10, 7), sharex=True) + fig, axes = plt.subplots(5, 1, figsize=(12.5, 7), sharex=True) for model_idx, ax in enumerate(axes): dict_ = dicts[model_idx] ax.fill_between(range(start_idx, gap_start_idx+1), @@ -874,10 +899,10 @@ if __name__ == '__main__': station_ids_on_map_ = False o3_value_matrix_ = False mask_matrix_ = False - time_series_ = False + time_series_ = True basic_statistics_and_histogram_ = False exceedances_ = False - true_vs_imputed_ = True + true_vs_imputed_ = False n_neighbors_vs_r2_ = False gap_len_vs_r2_ = False imputed_dataset_matrix_ 
= False @@ -891,6 +916,8 @@ if __name__ == '__main__': o3_value_matrix() if mask_matrix_: mask_matrix() + for cv_seed in settings.cv_seeds: + mask_matrix(cv_seed=cv_seed) if time_series_: time_series() # for gap_id in range(104526, 104533):
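
Note on the repeated path redirection: the same .replace() pattern for
pointing paths into cross_validation/{cv_seed}/ recurs above in
cs_uba_graph.py, simple.py, evaluation_tools.py, uba.py and
plots_for_paper.py. A shared helper could centralize it. The sketch
below is a suggestion only and is not part of the diff; cv_path is a
hypothetical name, and it assumes settings.resources_dir ends with a
slash, as it does in settings.py:

    import settings

    def cv_path(path, cv_seed=None):
        """
        Redirect a resources path into the given cross-validation
        split. Returns the path unchanged when cv_seed is None.
        """
        if cv_seed is None:
            return path
        return path.replace(
            settings.resources_dir,
            settings.resources_dir + f'cross_validation/{cv_seed}/')

Each block like ug.mask_path = ug.mask_path.replace(p1, p2) would then
collapse to ug.mask_path = cv_path(ug.mask_path, cv_seed).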