Commit 81ca62f6 authored by Clara Betancourt

Merge branch 'clara_issue05_graph_of_uba_stations' into devel

parents 62597813 91668181
...
@@ -86,6 +86,7 @@ elif [[ $MCHN == "juwels" ]]
    source ${activate_virt_env}
    pip install --upgrade pip
    pip install -r $STPD"requirements_juwels.txt"
    python -m pip install "dask[dataframe]" --upgrade
    export PYTHONPATH=${CWD}:$PYTHONPATH >> ${activate_virt_env}
    export PYTHONPATH=${CWD}/source/:$PYTHONPATH >> ${activate_virt_env}
...
"""
Try the Correct and Smooth algorithm for missing data imputation
"""
# general
import pdb
import pickle as pkl
# data science
import numpy as np
# sklearn
from sklearn.metrics import mean_squared_error, r2_score
# pytorch
import torch
from models.cs import CorrectAndSmooth
# plotting
import matplotlib.pyplot as plt
# own package
import settings
from preprocessing.time_resolved import TimeResolvedOzone
print('Read in Dataset')
tro = TimeResolvedOzone()
tro.get_dataset()
dataset = tro
data = dataset[0]
print('Baseline model')
x_train = data.x[data.train_mask].numpy()
y_train = data.y[data.train_mask].numpy().reshape(-1)
x_val = data.x[data.val_mask].numpy()
y_val = data.y[data.val_mask].numpy().reshape(-1)
# model = pkl.load(open(tro.rf_path, 'rb'))
# pdb.set_trace()
# y_val_hat = model.predict(x_val)
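# fallback baseline: predict the training-set mean for every validation
# sample (the commented-out random forest above would replace this)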
y_val_hat = np.full_like(y_val, fill_value=np.mean(y_train))
rmse = (mean_squared_error(y_val, y_val_hat))**.5
r2 = r2_score(y_val, y_val_hat)
print('======================')
print('Baseline results:')
print(f'RMSE: {rmse:.2f}, R2: {r2:.2f}')
print('======================')
print('Correct and smooth')
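# Correct and Smooth (Huang et al., 2021) post-processes a base prediction
# in two stages: 'correct' diffuses the training residuals y - y_hat over
# the graph, then 'smooth' propagates the corrected labels themselves.
# The alphas and layer counts below set the strength and depth of each
# propagation.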
cs = CorrectAndSmooth(num_correction_layers=10, correction_alpha=.75,
                      num_smoothing_layers=10, smoothing_alpha=0.4,
                      autoscale=True)  # autoscale is misleading...
x = data.x.numpy()
# y_hat = model.predict(x)
y_hat = np.full_like(data.y.numpy(), fill_value=np.mean(data.y[data.train_mask].numpy()))
y_hat = torch.tensor(y_hat, dtype=torch.float32).view(-1, 1)
y_soft = cs.correct(y_soft=y_hat, y_true=data.y[data.train_mask],
                    mask=data.train_mask, edge_index=data.edge_index,
                    edge_weight=data.edge_weight)
y_val_soft = y_soft[data.val_mask].numpy()
# pdb.set_trace()
rmse = (mean_squared_error(y_val, y_val_soft))**.5
r2 = r2_score(y_val, y_val_soft)
print('After correct:')
print(f'RMSE: {rmse:.2f}, R2: {r2:.2f}')
y_soft2 = cs.smooth(y_soft=y_soft, y_true=data.y[data.train_mask],
                    mask=data.train_mask, edge_index=data.edge_index,
                    edge_weight=data.edge_weight)
y_val_soft2 = y_soft2[data.val_mask].numpy()
rmse = (mean_squared_error(y_val, y_val_soft2))**.5
r2 = r2_score(y_val, y_val_soft2)
print('After smooth:')
print(f'RMSE: {rmse:.2f}, R2: {r2:.2f}')
exit()  # early exit: the degree-vs-error analysis below is currently skipped
print('Incoming node degree vs. error in test set:')
node_degrees = []
absolute_errors = []
for node_idx in range(data.num_nodes):
    if data.train_mask[node_idx].item():
        continue
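    # weighted incoming degree: sum the incoming edge weights, dropping
    # the largest one (assumed here to be the self-loop weight)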
    edge_weights = data.edge_weight[data.edge_index[1, :] == node_idx]
    edge_weights = torch.sort(edge_weights.view(-1)).values[:-1]
    node_degrees.append(torch.sum(edge_weights).item())
    y = data.y[node_idx].item()
    y_hat = y_soft2[node_idx].item()
    absolute_errors.append(np.abs(y - y_hat))
plt.scatter(node_degrees, absolute_errors, color='navy', alpha=.035)
plt.title('Random Forest on time resolved ozone')
plt.xlabel('incoming node degree')
plt.ylabel('absolute error')
# plt.show()
plt.savefig(settings.output_dir+'cs_time_resolved_node_degree_vs_error.png')
This diff is collapsed.
# general
import pdb
# data science
import numpy as np
import pandas as pd
import torch

def geo_to_cartesian(lons, lats):
    """
    Maps longitudes and latitudes onto a 3D grid, which is practical
    for distance measures
    inputs: lons, lats in degrees (pandas Series)
    returns: numpy arrays x, y, z
""" """
r = 6371. r = 6371.
...
@@ -16,3 +25,35 @@ def geo_to_cartesian(lons, lats):
    z = r * np.sin(lats)
    return x, y, z
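The middle of this function is collapsed in the diff above. For reference, here is a minimal self-contained sketch of the same degrees-to-Cartesian mapping; the deg2rad conversion and the x/y formulas are assumptions inferred from the visible r and z lines, not the hidden code itself:

import numpy as np

def geo_to_cartesian_sketch(lons, lats):
    # convert degrees to radians (assumed to happen in the collapsed lines)
    lons = np.deg2rad(np.asarray(lons))
    lats = np.deg2rad(np.asarray(lats))
    r = 6371.  # mean Earth radius in km
    x = r * np.cos(lats) * np.cos(lons)
    y = r * np.cos(lats) * np.sin(lons)
    z = r * np.sin(lats)
    return x, y, z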
def neighbor_index_filter(row, neighbor_df, reg_df, radius, time_window):
    """
    For a given row in reg_df, find the indices of all rows that belong
    to neighboring stations and lie within the given time window
    """
    trg_id = row.station_id
    trg_datetime = row.datetime
    neigh_ids = neighbor_df.at[trg_id, 'neighbors']
    rad_filter = reg_df.station_id.isin(neigh_ids)
    time_filter = abs(reg_df.datetime - trg_datetime) <= time_window
    src_idxs = reg_df[rad_filter & time_filter].index.to_list()
    return src_idxs
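A toy illustration of how this filter might be applied; the station ids, timestamps, and the layout of neighbor_df are invented for the example. Note that radius is accepted but unused inside the function, since the neighborhood is precomputed in neighbor_df:

import pandas as pd

# hypothetical inputs: neighbor_df maps each station to its spatial
# neighbors, reg_df holds one row per (station, datetime) observation
neighbor_df = pd.DataFrame({'neighbors': [['B', 'C'], ['A'], ['A']]},
                           index=pd.Index(['A', 'B', 'C'], name='station_id'))
reg_df = pd.DataFrame({'station_id': ['A', 'B', 'C'],
                       'datetime': pd.to_datetime(['2011-01-01 00:00',
                                                   '2011-01-01 01:00',
                                                   '2011-01-02 00:00'])})
row = reg_df.iloc[0]
src_idxs = neighbor_index_filter(row, neighbor_df, reg_df, radius=50.,
                                 time_window=pd.Timedelta('3h'))
# -> [1]: only station B's reading lies within 3 hours of row's datetime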
def get_one_hot(x_df):
    """
    If this data frame contains 'type' or 'type_of_area',
    replace them with one-hot encoded columns
    """
    for col in ['type', 'type_of_area']:
        if col in x_df.columns:
            dummies = pd.get_dummies(x_df[col], prefix=col)
            x_df.drop(col, axis=1, inplace=True)
            x_df = pd.concat([x_df, dummies], axis=1)
    return x_df.copy()
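A quick usage sketch on a made-up frame; the column values are invented, and the dummy columns come out as 0/1 or boolean depending on the pandas version:

import pandas as pd

x_df = pd.DataFrame({'o3': [42.0, 38.0],
                     'type': ['background', 'traffic']})
x_df = get_one_hot(x_df)
print(x_df.columns.to_list())
# ['o3', 'type_background', 'type_traffic']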
...
@@ -7,6 +7,7 @@ and data files that we use in our project.
# basic packages
import os
import pathlib
import pandas as pd
import pdb
# find the source of our project, only to know the directory
...
@@ -31,6 +32,10 @@ random_seed = 1
datetime_start = '2009-01-01 00:00:00'
datetime_end = '2013-12-31 23:59:59'
filter_datetime_start = '2011-01-01 00:00:00'
filter_datetime_end = '2011-12-31 23:59:59'
time_offset = pd.Timedelta('0 days 02:00:00')
if __name__ == '__main__':
    print('SOURCEDIR:', SOURCEDIR_pos)
...