diff --git a/source/experiments/cs_time_resolved_ozone.py b/source/experiments/cs_time_resolved_ozone.py
index d4fdba5787308e03593daabb6eaf98701a8b262d..641c47cfc3594a2bc458f3bc86e3065efd2f91c8 100644
--- a/source/experiments/cs_time_resolved_ozone.py
+++ b/source/experiments/cs_time_resolved_ozone.py
@@ -35,35 +35,33 @@ x_train = data.x[data.train_mask].numpy()
 y_train = data.y[data.train_mask].numpy().reshape(-1)
 x_val = data.x[data.val_mask].numpy()
 y_val = data.y[data.val_mask].numpy().reshape(-1)
-# model = pkl.load(open(tro.rf_path, 'rb'))
-# pdb.set_trace()
-# y_val_hat = model.predict(x_val)
-y_val_hat = np.full_like(y_val, fill_value=np.mean(y_train))
+model = pkl.load(open(tro.rf_path, 'rb'))
+y_val_hat = model.predict(x_val)
+# y_val_hat = np.full_like(y_val, fill_value=np.mean(y_train))
 rmse = (mean_squared_error(y_val, y_val_hat))**.5
 r2 = r2_score(y_val, y_val_hat)
 print('======================')
 print('Baseline results:')
-print(f'RMSE: {rmse:.2f}, R2: {r2:.2f}')
-('======================')
+print(f'RMSE: {rmse:.3f}, R2: {r2:.3f}')
+print('======================')
 print('Correct and smooth')
-cs = CorrectAndSmooth(num_correction_layers=10, correction_alpha=.75,
-                      num_smoothing_layers=10, smoothing_alpha=0.4,
+cs = CorrectAndSmooth(num_correction_layers=20, correction_alpha=.75,
+                      num_smoothing_layers=20, smoothing_alpha=0.1,
                       autoscale=True)  # autoscale is misleading...
 x = data.x.numpy()
-# y_hat = model.predict(x)
-y_hat = np.full_like(data.y.numpy(), fill_value=np.mean(data.y[data.train_mask].numpy()))
+y_hat = model.predict(x)
+# y_hat = np.full_like(data.y.numpy(), fill_value=np.mean(data.y[data.train_mask].numpy()))
 y_hat = torch.tensor(y_hat, dtype=torch.float32).view(-1, 1)
 y_soft = cs.correct(y_soft=y_hat, y_true=data.y[data.train_mask],
                     mask=data.train_mask, edge_index=data.edge_index,
                     edge_weight=data.edge_weight)
 y_val_soft = y_soft[data.val_mask].numpy()
-# pdb.set_trace()
 rmse = (mean_squared_error(y_val, y_val_soft))**.5
 r2 = r2_score(y_val, y_val_soft)
 print(f'After correct:')
-print(f'RMSE: {rmse:.2f}, R2: {r2:.2f}')
+print(f'RMSE: {rmse:.3f}, R2: {r2:.3f}')
 y_soft2 = cs.smooth(y_soft=y_soft, y_true=data.y[data.train_mask],
                     mask=data.train_mask, edge_index=data.edge_index,
@@ -72,7 +70,7 @@ y_val_soft2 = y_soft2[data.val_mask].numpy()
 rmse = (mean_squared_error(y_val, y_val_soft2))**.5
 r2 = r2_score(y_val, y_val_soft2)
 print(f'After smooth:')
-print(f'RMSE: {rmse:.2f}, R2: {r2:.2f}')
+print(f'RMSE: {rmse:.3f}, R2: {r2:.3f}')
 exit()
 print('Incoming node degree vs. error in test set:')
diff --git a/source/visualizations/preanalysis_plots.py b/source/visualizations/preanalysis_plots.py
index c337c18d2ff0776edfb6c0a5bfb2b3e63870e0da..04ba15f6d532b64fd1bf734990efcc48231383c1 100644
--- a/source/visualizations/preanalysis_plots.py
+++ b/source/visualizations/preanalysis_plots.py
@@ -16,8 +16,8 @@ import matplotlib.pyplot as plt
 
 # own
 import settings
-# from utils import query_db
 from preprocessing.aqbench import AQBenchGraph
+from preprocessing.time_resolved import TimeResolvedOzone
 
 
 def time_series_lenght():
@@ -95,30 +95,95 @@ def missing_values():
     """
     color the time series according to missing values
     """
-    # find all time series
-    file_list = [f for f in os.listdir(settings.resources_dir)
-                 if f.startswith('hourly_')]
-
-    for f in file_list:
-        print(f'Plotting missing values for {f}...')
-        # info
-        var = f.lstrip('hourly_').strip('.csv')
-        df = pd.read_csv(settings.resources_dir+f, index_col=0)
-
-        # plot the data
-        plt.figure()
-        plt.imshow(df.values.T, aspect=100, interpolation='none')
-        plt.yticks([])
-        plt.xticks([])
-        ax = plt.gca()
-        ax.spines['right'].set_visible(False)
-        ax.spines['left'].set_visible(False)
-        ax.spines['top'].set_visible(False)
-        ax.spines['bottom'].set_visible(False)
-
-        plt.savefig(settings.output_dir+f'missing_{var}.pdf')
-        plt.close()
-
+    print('missing values...')
+
+    # read in data
+    tro = TimeResolvedOzone()
+    x_df = pd.read_csv(tro.x_path, index_col=0)
+    y_df = pd.read_csv(tro.y_path, index_col=0)
+    reg_df = pd.read_csv(tro.reg_path, index_col=0)
+
+    print(x_df.columns)
+
+    # reshape to 2d field
+    n_stations = len(np.unique(reg_df.station_id))
+    n_timesteps = len(np.unique(reg_df.datetime))
+    y_2d = y_df.values.reshape(n_stations, n_timesteps)
+    print(f'stations: {n_stations}')
+    print(f'timesteps: {n_timesteps}')
+    print(f'min: {np.nanmin(y_2d)}')
+    print(f'max: {np.nanmax(y_2d)}')
+    print(f'mean: {np.nanmean(y_2d)}')
+    print(f'missing: {np.count_nonzero(np.isnan(y_2d))/(n_stations*n_timesteps)*100}')
+
+    # info
+    var = 'o3'
+
+    # plot the data
+    z = int(n_timesteps/n_stations)
+    plt.figure(figsize=(z*3, 3))
+    plt.imshow(y_2d[0:n_stations,0:z*n_stations], interpolation='none')
+    plt.yticks([])
+    plt.xticks([])
+    ax = plt.gca()
+    ax.spines['right'].set_visible(False)
+    ax.spines['left'].set_visible(False)
+    ax.spines['top'].set_visible(False)
+    ax.spines['bottom'].set_visible(False)
+
+    path = f'{settings.output_dir}missing_{var}.png'
+    plt.savefig(path, dpi=250, bbox_inches='tight', pad_inches=0)
+    print(f'saved to {path}')
+    plt.close()
+
+    bins = np.arange(-5, 120, 5, dtype=int)
+    plt.hist(y_df.values, bins=bins, log=True)
+    plt.grid()
+    path = f'{settings.output_dir}hist_{var}.png'
+    plt.savefig(path)
+    print(f'saved to {path}')
+
+
+def visualize_masks():
+    """
+    Showing the data split.
+    """
+    # read in data
+    tro = TimeResolvedOzone()
+    mask_df = pd.read_csv(tro.mask_path, index_col=0)
+    reg_df = pd.read_csv(tro.reg_path, index_col=0)
+
+    # prepare data
+    n_stations = len(np.unique(reg_df.station_id))
+    n_timesteps = len(np.unique(reg_df.datetime))
+
+    missing_o3_mask = mask_df.missing_o3_mask.values.reshape(n_stations, n_timesteps)
+    val_mask = mask_df.val_mask.values.reshape(n_stations, n_timesteps)
+    test_mask = mask_df.test_mask.values.reshape(n_stations, n_timesteps)
+
+    data = np.zeros((n_stations, n_timesteps))
+    data[missing_o3_mask] = 3.
+    data[val_mask] = 2.
+    data[test_mask] = 1.
+
+    # plot
+    z = int(n_timesteps/n_stations)
+    plt.figure(figsize=(z*3, 3))
+    plt.imshow(data[0:n_stations,0:z*n_stations], interpolation='none',
+               cmap='Accent')
+    plt.yticks([])
+    plt.xticks([])
+    ax = plt.gca()
+    ax.spines['right'].set_visible(False)
+    ax.spines['left'].set_visible(False)
+    ax.spines['top'].set_visible(False)
+    ax.spines['bottom'].set_visible(False)
+
+    # save
+    path = f'{settings.output_dir}masks.png'
+    plt.savefig(path, dpi=250, bbox_inches='tight', pad_inches=0)
+    print(f'saved to {path}')
+    plt.close()
 
 def visualize_graph():
     """
@@ -160,10 +225,12 @@ if __name__ == '__main__':
     """
     time_series_lenght_ = False
     missing_values_ = True
+    visualize_masks_ = False
     visualize_graph_ = False
     if time_series_lenght_: time_series_lenght()
     if missing_values_: missing_values()
+    if visualize_masks_: visualize_masks()
     if visualize_graph_: visualize_graph()