Commit 81ca62f6 authored by Clara Betancourt

Merge branch 'clara_issue05_graph_of_uba_stations' into devel

parents 62597813 91668181
...
@@ -86,6 +86,7 @@ elif [[ $MCHN == "juwels" ]]
    source ${activate_virt_env}
    pip install --upgrade pip
    pip install -r $STPD"requirements_juwels.txt"
    python -m pip install "dask[dataframe]" --upgrade
    export PYTHONPATH=${CWD}:$PYTHONPATH >> ${activate_virt_env}
    export PYTHONPATH=${CWD}/source/:$PYTHONPATH >> ${activate_virt_env}
...
"""
Try the Correct and Smooth algorithm for missing data imputation
"""
# general
import pdb
import pickle as pkl
# data science
import numpy as np
# sklearn
from sklearn.metrics import mean_squared_error, r2_score
# pytorch
import torch
from models.cs import CorrectAndSmooth
# plotting
import matplotlib.pyplot as plt
# own package
import settings
from preprocessing.time_resolved import TimeResolvedOzone
print('Read in Dataset')
tro = TimeResolvedOzone()
tro.get_dataset()
dataset = tro
data = dataset[0]
print('Baseline model')
x_train = data.x[data.train_mask].numpy()
y_train = data.y[data.train_mask].numpy().reshape(-1)
x_val = data.x[data.val_mask].numpy()
y_val = data.y[data.val_mask].numpy().reshape(-1)
# model = pkl.load(open(tro.rf_path, 'rb'))
# pdb.set_trace()
# y_val_hat = model.predict(x_val)
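# fallback baseline: predict the training-set mean for every validation
# sample (the commented-out random forest above would replace this)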
y_val_hat = np.full_like(y_val, fill_value=np.mean(y_train))
rmse = (mean_squared_error(y_val, y_val_hat))**.5
r2 = r2_score(y_val, y_val_hat)
print('======================')
print('Baseline results:')
print(f'RMSE: {rmse:.2f}, R2: {r2:.2f}')
print('======================')
print('Correct and smooth')
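# Correct and Smooth (Huang et al., 2021) post-processes a base prediction
# in two stages: 'correct' diffuses the training residuals y - y_hat over
# the graph, then 'smooth' propagates the corrected labels themselves.
# The alphas and layer counts below set the strength and depth of each
# propagation.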
cs = CorrectAndSmooth(num_correction_layers=10, correction_alpha=.75,
                      num_smoothing_layers=10, smoothing_alpha=0.4,
                      autoscale=True)  # autoscale is misleading...
x = data.x.numpy()
# y_hat = model.predict(x)
y_hat = np.full_like(data.y.numpy(), fill_value=np.mean(data.y[data.train_mask].numpy()))
y_hat = torch.tensor(y_hat, dtype=torch.float32).view(-1, 1)
y_soft = cs.correct(y_soft=y_hat, y_true=data.y[data.train_mask],
                    mask=data.train_mask, edge_index=data.edge_index,
                    edge_weight=data.edge_weight)
y_val_soft = y_soft[data.val_mask].numpy()
# pdb.set_trace()
rmse = (mean_squared_error(y_val, y_val_soft))**.5
r2 = r2_score(y_val, y_val_soft)
print('After correct:')
print(f'RMSE: {rmse:.2f}, R2: {r2:.2f}')
y_soft2 = cs.smooth(y_soft=y_soft, y_true=data.y[data.train_mask],
                    mask=data.train_mask, edge_index=data.edge_index,
                    edge_weight=data.edge_weight)
y_val_soft2 = y_soft2[data.val_mask].numpy()
rmse = (mean_squared_error(y_val, y_val_soft2))**.5
r2 = r2_score(y_val, y_val_soft2)
print('After smooth:')
print(f'RMSE: {rmse:.2f}, R2: {r2:.2f}')
exit()  # early exit: the degree-vs-error analysis below is currently skipped
print('Incoming node degree vs. error in test set:')
node_degrees = []
absolute_errors = []
for node_idx in range(data.num_nodes):
    if data.train_mask[node_idx].item():
        continue
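    # weighted incoming degree: sum the incoming edge weights, dropping
    # the largest one (assumed here to be the self-loop weight)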
    edge_weights = data.edge_weight[data.edge_index[1, :] == node_idx]
    edge_weights = torch.sort(edge_weights.view(-1)).values[:-1]
    node_degrees.append(torch.sum(edge_weights).item())
    y = data.y[node_idx].item()
    y_hat = y_soft2[node_idx].item()
    absolute_errors.append(np.abs(y - y_hat))
plt.scatter(node_degrees, absolute_errors, color='navy', alpha=.035)
plt.title('Random Forest on time resolved ozone')
plt.xlabel('incoming node degree')
plt.ylabel('absolute error')
# plt.show()
plt.savefig(settings.output_dir+'cs_time_resolved_node_degree_vs_error.png')
This diff is collapsed.
# general
import pdb
# data science
import numpy as np
import pandas as pd
import torch

def geo_to_cartesian(lons, lats):
    """
    Maps longitudes and latitudes onto a 3D grid, which is practical
    for distance measures
    inputs: lons, lats in degrees (pandas Series)
    returns: numpy arrays x, y, z
""" """
r = 6371. r = 6371.
...
@@ -16,3 +25,35 @@ def geo_to_cartesian(lons, lats):
    z = r * np.sin(lats)
    return x, y, z
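The middle of this function is collapsed in the diff above. For reference, here is a minimal self-contained sketch of the same degrees-to-Cartesian mapping; the deg2rad conversion and the x/y formulas are assumptions inferred from the visible r and z lines, not the hidden code itself:

import numpy as np

def geo_to_cartesian_sketch(lons, lats):
    # convert degrees to radians (assumed to happen in the collapsed lines)
    lons = np.deg2rad(np.asarray(lons))
    lats = np.deg2rad(np.asarray(lats))
    r = 6371.  # mean Earth radius in km
    x = r * np.cos(lats) * np.cos(lons)
    y = r * np.cos(lats) * np.sin(lons)
    z = r * np.sin(lats)
    return x, y, z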
def neighbor_index_filter(row, neighbor_df, reg_df, radius, time_window):
    """
    For a given row in reg_df, find the indices of all rows that belong
    to neighboring stations and lie within the given time window
    """
    trg_id = row.station_id
    trg_datetime = row.datetime
    neigh_ids = neighbor_df.at[trg_id, 'neighbors']
    rad_filter = reg_df.station_id.isin(neigh_ids)
    time_filter = abs(reg_df.datetime - trg_datetime) <= time_window
    src_idxs = reg_df[rad_filter & time_filter].index.to_list()
    return src_idxs
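A toy illustration of how this filter might be applied; the station ids, timestamps, and the layout of neighbor_df are invented for the example. Note that radius is accepted but unused inside the function, since the neighborhood is precomputed in neighbor_df:

import pandas as pd

# hypothetical inputs: neighbor_df maps each station to its spatial
# neighbors, reg_df holds one row per (station, datetime) observation
neighbor_df = pd.DataFrame({'neighbors': [['B', 'C'], ['A'], ['A']]},
                           index=pd.Index(['A', 'B', 'C'], name='station_id'))
reg_df = pd.DataFrame({'station_id': ['A', 'B', 'C'],
                       'datetime': pd.to_datetime(['2011-01-01 00:00',
                                                   '2011-01-01 01:00',
                                                   '2011-01-02 00:00'])})
row = reg_df.iloc[0]
src_idxs = neighbor_index_filter(row, neighbor_df, reg_df, radius=50.,
                                 time_window=pd.Timedelta('3h'))
# -> [1]: only station B's reading lies within 3 hours of row's datetime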
def get_one_hot(x_df):
    """
    If this data frame contains 'type' or 'type_of_area',
    replace them with one-hot encoded columns
    """
    for col in ['type', 'type_of_area']:
        if col in x_df.columns:
            dummies = pd.get_dummies(x_df[col], prefix=col)
            x_df.drop(col, axis=1, inplace=True)
            x_df = pd.concat([x_df, dummies], axis=1)
    return x_df.copy()
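A quick usage sketch on a made-up frame; the column values are invented, and the dummy columns come out as 0/1 or boolean depending on the pandas version:

import pandas as pd

x_df = pd.DataFrame({'o3': [42.0, 38.0],
                     'type': ['background', 'traffic']})
x_df = get_one_hot(x_df)
print(x_df.columns.to_list())
# ['o3', 'type_background', 'type_traffic']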
...
@@ -7,6 +7,7 @@ and data files that we use in our project.
# basic packages
import os
import pathlib
import pandas as pd
import pdb
# find the source of our project, only to know the directory
...
@@ -31,6 +32,10 @@ random_seed = 1
datetime_start = '2009-01-01 00:00:00'
datetime_end = '2013-12-31 23:59:59'
filter_datetime_start = '2011-01-01 00:00:00'
filter_datetime_end = '2011-12-31 23:59:59'
time_offset = pd.Timedelta('0 days 02:00:00')
if __name__ == '__main__':
    print('SOURCEDIR:', SOURCEDIR_pos)
...