"""
Creates a PyTorch dataset of time-resolved ozone data.
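
Typical usage (a sketch; it assumes the preprocessed .csv files
created by the steps in main() already exist, and the feature list
is only an example):

    tro = TimeResolvedOzone()
    tro.convert_to_tensor(features=['hour_of_day', 'day_of_year'])
    tro.get_dataset()
    data = tro[0]  # single graph with x, y, edge_index, weights, masks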
"""
# general
import ast
import random
import pdb
import warnings
import pickle as pkl
import datetime
# data science
import numpy as np
import pandas as pd
import dask.dataframe as ddf
# sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics.pairwise import cosine_similarity
# pytorch
import torch
from torch_geometric.data import Data, InMemoryDataset
# own package
import settings
from retrieval.toar_db import StationData
from retrieval.toar_db import HourlyData
from preprocessing.utils import geo_to_cartesian, get_one_hot, \
neighbor_index_filter
# suppress future warnings
warnings.filterwarnings('ignore')
class TimeResolvedOzone(InMemoryDataset):
"""
A dataset for graph ML on time resolved
ozone data.
"""
def __init__(self):
"""
Initialize the dataset and define all raw, preprocessed and
tensor file paths.
"""
# super class
InMemoryDataset.__init__(self)
# raw data paths
self.raw_data_dir = settings.resources_dir + 'time_resolved_raw/'
self.station_path = self.raw_data_dir + 'stations.csv'
# preprocessed data paths
self.out_dir = settings.resources_dir + 'time_resolved_preproc/'
self.reg_path = self.out_dir + 'reg.csv'
self.x_path = self.out_dir + 'x.csv'
self.y_path = self.out_dir + 'y.csv'
self.gap_path = self.out_dir + 'gap.csv'
self.mask_path = self.out_dir + 'mask.csv'
self.imp_path = self.out_dir + 'imp.csv'
self.rf_path = self.out_dir + 'rf.pkl'
self.pos_path = self.out_dir + 'pos.csv'
self.edge_path = self.out_dir + 'edge.csv'
self.edge_tmp_path = self.out_dir + 'edge_tmp.csv'
self.weight_path = self.out_dir + 'weight.csv'
# torch dataset paths
self.dataset_dir = settings.resources_dir + 'time_resolved_dataset/'
self.x_pt_path = self.dataset_dir + 'x.pt'
self.edge_index_pt_path = self.dataset_dir + 'edge_index.pt'
self.edge_weight_pt_path = self.dataset_dir + 'edge_weights.pt'
self.y_pt_path = self.dataset_dir + 'y.pt'
self.missing_o3_mask_pt_path = self.dataset_dir + 'missing_o3_mask.pt'
self.train_mask_pt_path = self.dataset_dir + 'train_mask.pt'
self.val_mask_pt_path = self.dataset_dir + 'val_mask.pt'
self.test_mask_pt_path = self.dataset_dir + 'test_mask.pt'
self.pos_pt_path = self.dataset_dir + 'pos.pt'
def get_reg(self):
"""
Create a register file that maps each node index to its station
id and datetime.
"""
print('preprocessing register...')
# example data
hd = HourlyData('o3')
hd.read_from_file()
# filter the datetime index and add time offset
time_offset = settings.time_offset
hd.df.index = pd.to_datetime(hd.df.index) + time_offset
start = pd.to_datetime(settings.filter_datetime_start)
end = pd.to_datetime(settings.filter_datetime_end)
filter_condition = hd.df.index.to_series().between(start, end)
hd.df = hd.df[filter_condition].copy()
# set up df
n_samples_total = len(hd.df.index)*len(hd.df.columns)
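# one node per (station, time step) pair; the loop below fills the
# register in station-major order, one station's full time series after another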
cols = ['station_id', 'datetime']
reg_df = pd.DataFrame(index=range(n_samples_total),
columns=cols)
reg_df.index.name = 'node_index'
# fill df
idx = 0
for station_idx in hd.df.columns:
for time_idx in hd.df.index:
reg_df.loc[idx] = [station_idx, time_idx]
idx += 1
if idx % 500000 == 0:
print(f'{idx/len(reg_df)*100:.1f} %')
# save
reg_df.to_csv(self.reg_path)
print(f'written to {self.reg_path}')
def get_x(self):
"""
A file with all node features
"""
print('preprocessing x...')
# organize features
easy_names = ['hour_of_day', 'day_of_year']
rea_names = ['temp', 'relhum', 'cloudcover',
'pblheight', 'u', 'v']
meta_names = ['alt', 'relative_alt', 'population_density',
'nightlight', 'type', 'type_of_area']
# set up df
feature_names = easy_names + rea_names + meta_names
reg_df = pd.read_csv(self.reg_path, index_col=0,
converters={'datetime': pd.Timestamp})
x_df = pd.DataFrame(columns=feature_names, index=reg_df.index)
# easy fields
x_df.hour_of_day = [d.hour for d in reg_df.datetime]
x_df.day_of_year = [d.dayofyear for d in reg_df.datetime]
# rea fields, filter year and add time offset
for rea_name in rea_names:
hd = HourlyData(rea_name)
hd.read_from_file()
time_offset = settings.time_offset
hd.df.index = pd.to_datetime(hd.df.index) + time_offset
start = pd.to_datetime(settings.filter_datetime_start)
end = pd.to_datetime(settings.filter_datetime_end)
filter_condition = hd.df.index.to_series().between(start, end)
hd.df = hd.df[filter_condition].copy()
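# transpose before flattening so the values are station-major
# (one station's full time series after another), matching the register order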
x_df[rea_name] = hd.df.values.T.reshape(-1)
# meta fields
sd = StationData()
sd.read_from_file()
for idx, col in sd.df.iterrows():
id_filter = reg_df.station_id==idx
for meta_name in meta_names:
x_df.loc[id_filter, meta_name] = col[meta_name]
# rename fields to their final names
mapper = {'temp': 'temperature',
'relhum': 'relative_humidity',
'cloudcover': 'cloud_cover',
'pblheight': 'pbl_height',
'u': 'wind_u',
'v': 'wind_v'}
x_df.rename(columns=mapper, inplace=True)
# save
x_df.to_csv(self.x_path)
print(f'written to {self.x_path}')
def get_y(self):
"""
A file with all the labels (i.e. ozone values)
"""
print('preprocessing y...')
hd = HourlyData('o3')
hd.read_from_file()
# filter
time_offset = settings.time_offset
hd.df.index = pd.to_datetime(hd.df.index) + time_offset
start = pd.to_datetime(settings.filter_datetime_start)
end = pd.to_datetime(settings.filter_datetime_end)
filter_condition = hd.df.index.to_series().between(start, end)
hd.df = hd.df[filter_condition].copy()
# flatten station-major: move through all time steps of one station before the next
y_values = hd.df.values.T.reshape(-1)
y_df = pd.DataFrame(index=range(len(y_values)),
columns=['y'],
data=y_values)
y_df.index.name = 'node_index'
# save
y_df.to_csv(self.y_path)
print(f'written to {self.y_path}')
def get_gaps(self):
"""
Analyze the gaps in the measurement data and their lengths.
Twins are gaps at other stations that share the same start
and end datetime.
"""
print('analyzing existing gaps...')
# read in data
y_df = pd.read_csv(self.y_path, index_col=0)
reg_df = pd.read_csv(self.reg_path, index_col=0)
# set up df
columns=['station_id', 'type', 'start_datetime',
'end_datetime', 'start_idx', 'len', 'twins']
gap_df = pd.DataFrame(columns=columns)
gap_df.index.name = 'gap_index'
# set all values to 1 and all nans to 0
y_df.loc[y_df.y==y_df.y, 'y'] = 1
y_df.loc[y_df.y!=y_df.y, 'y'] = 0
# find consecutive gaps and no gaps
gap_idx = 0
y_old = y_df.y[0]
station_id_old = reg_df.loc[0, 'station_id']
y_index_gap_start = y_df.index[0]
gap_len = 0
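# scan node by node; a change of value state (NaN vs. valid) or of
# station id closes the current run and starts a new one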
for y_index, row in y_df.iterrows():
if y_index % 500000 == 0:
print(f'{y_index/len(y_df)*100:.1f} %')
y = row.y
station_id = reg_df.loc[y_index, 'station_id']
if y==y_old and station_id==station_id_old:
# increase len
gap_len += 1
else:
# write gap to gap df and initialize new one
gap_type = 'missing_o3' if y_old == 0 else 'value'
if gap_type == 'missing_o3':
start = reg_df.loc[y_index_gap_start, 'datetime']
end = reg_df.loc[y_index_gap_start+gap_len, 'datetime']
twins = 0
gap_df.loc[gap_idx, :] = [station_id_old, gap_type,
start, end,
y_index_gap_start, gap_len, twins]
# print([station_id_old, gap_type,
# y_index_gap_start, gap_len])
gap_idx += 1
y_index_gap_start = y_index
gap_len = 1
y_old = y
station_id_old = station_id
# find twins with same start_datetime and end datetime
print('analyzing twins of gaps...')
for idx, row in gap_df.iterrows():
if idx % 10000 == 0:
print(f'{idx/len(gap_df)*100:.1f} %')
twins = []
station_id = row.station_id
start = row.start_datetime
end = row.end_datetime
id_filter = gap_df.station_id != station_id
start_filter = gap_df.start_datetime == start
end_filter = gap_df.end_datetime == end
gap_df_filtered = gap_df[id_filter&start_filter&end_filter]
twins = gap_df_filtered.index.to_list()
gap_df.at[idx, 'twins'] = len(twins)
# save
gap_df.to_csv(self.gap_path)
print(f'written to {self.gap_path}')
def get_mask(self):
"""
- missing_o3_mask (True if o3 is missing)
- train_mask (True if training sample)
- val_mask (True if val sample)
- test_mask (True if test sample)
n.b. all masks should have the same statistical properties as
missing_o3_mask
"""
print('preparing masks...')
# read in the gap catalogue and the ozone df
gap_df = pd.read_csv(self.gap_path, index_col=0)
reg_df = pd.read_csv(self.reg_path, index_col=0)
y_df = pd.read_csv(self.y_path, index_col=0)
# initialize mask df
columns = ['missing_o3_mask', 'train_mask', 'val_mask',
'test_mask']
mask_df = pd.DataFrame(columns=columns, index=y_df.index,
data=False)
mask_df.index.name = 'node_index'
# missing o3 is simply where we do not have any measurement
mask_df.loc[y_df.y!=y_df.y, 'missing_o3_mask'] = True
# histogram of gap lengths; bin edges in hours correspond to
# 1h, 2h, 3h, 4h, 6h, 12h, 1d, 2d, 1w, 2w, 4w, 12w, 1y
len_series = gap_df[gap_df.type=='missing_o3'].len
bins = [1, 2, 3, 4, 6, 12, 24, 48, 168, 336, 672, 2016, 8760]
freq, _ = np.histogram(len_series, bins=bins)
freq = list(freq)
# for val and test, we have to mask series at random.
# reverse, because we need long consecutive series for the
# first masks
bins.reverse()
freq.reverse()
random.seed(settings.random_seed)
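# for each length bin, draw synthetic gaps at random stations and
# start times until enough val and test gaps of that length exist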
for idx, bin_ in enumerate(bins[:-1]):
lower_limit = bins[idx+1]
upper_limit = bins[idx]
# half to the val mask, half to the test mask
n_required = min(round(freq[idx]/2), 1000)
# print(f'{n_required} in [{lower_limit},{upper_limit})')
for mask in ['val_mask', 'test_mask']:
masks_found = 0
while masks_found < round(n_required/2):
# pick random len in interval
len_ = random.randrange(lower_limit, upper_limit)
# pick random start index of the gap
start_idx = random.choice(mask_df.index)
start_station = reg_df.at[start_idx, 'station_id']
# add gap len to obtain end index
end_idx = start_idx + len_
# useless if it exceeds the original index
if end_idx > mask_df.index[-1]:
print('Exceeds df boundaries')
continue
end_station = reg_df.at[end_idx, 'station_id']
if start_station != end_station:
print('Different station')
continue
# useless if more than 10 % of the nodes are already masked or missing
mask_df_filtered = mask_df[start_idx:end_idx]
missing_values = len(mask_df_filtered[
(mask_df_filtered==True).any(axis=1)])
if missing_values * 10 > len_:
print('Too many missing values')
continue
# set those to true that are not true elsewhere
index_list = mask_df_filtered[
(mask_df_filtered==False).all(axis=1)].index
mask_df.loc[index_list, mask] = True
start_datetime = reg_df.at[start_idx, 'datetime']
end_datetime = reg_df.at[end_idx, 'datetime']
twins = 0
_ = [start_station, mask, start_datetime,
end_datetime, start_idx, len_, twins]
gap_df.loc[len(gap_df), :] = _
masks_found += 1
print(start_station, missing_values, len_)
# train mask is where no other mask applies
index_list = mask_df[(mask_df==False).all(axis=1)].index
mask_df.loc[index_list, 'train_mask'] = True
# save
mask_df.to_csv(self.mask_path)
print(f'written to {self.mask_path}')
gap_df.to_csv(self.gap_path)
print(f'written to {self.gap_path}')
def get_rf(self, features=None, return_r2=False, save=True):
"""
Fit a simple random forest model to x and y, then save it.
A feature list and the option to return the validation R2 are
provided for feature-selection purposes.
"""
if not return_r2:
print('baselines...')
# read in data
x_df = pd.read_csv(self.x_path, index_col=0)
y_df = pd.read_csv(self.y_path, index_col=0)
mask_df = pd.read_csv(self.mask_path, index_col=0)
# choosing features, one-hot encoding
if features:
x_df = x_df[features]
x_df = get_one_hot(x_df)
# prepare data
x_train = x_df[mask_df.train_mask].to_numpy()
x_val = x_df[mask_df.val_mask].to_numpy()
y_train = y_df[mask_df.train_mask].to_numpy().reshape(-1)
y_val = y_df[mask_df.val_mask].to_numpy().reshape(-1)
# fit model
rf = RandomForestRegressor(random_state=settings.random_seed,
n_jobs=-1, max_depth=None,
n_estimators=100)
rf.fit(x_train, y_train)
# evaluate
y_val_hat = rf.predict(x_val)
y_train_hat = rf.predict(x_train)
rmse_train = (mean_squared_error(y_train, y_train_hat))**.5
r2_train = r2_score(y_train, y_train_hat)
rmse_val = (mean_squared_error(y_val, y_val_hat))**.5
r2_val = r2_score(y_val, y_val_hat)
# if the r2 is not returned, print stats
if not return_r2:
print('======================')
if features:
print('Features')
print([f for f in features])
print('Baseline results for train:')
print(f'RMSE: {rmse_train:.2f}, R2: {r2_train:.2f}')
print('Baseline results for val:')
print(f'RMSE: {rmse_val:.2f}, R2: {r2_val:.2f}')
print('======================')
# save
if save:
pkl.dump(rf, open(self.rf_path, 'wb'))
print(f'written to {self.rf_path}')
# return metric
if return_r2:
return r2_val
def get_imp(self, features=None):
"""
Feature importances of the RF, used to weight the features
when computing the edge weights
"""
print('importances of baseline model...')
# read in data and model
x_df = pd.read_csv(self.x_path, index_col=0)
if features:
x_df = x_df[features]
x_df = get_one_hot(x_df)
rf = pkl.load(open(self.rf_path, 'rb'))
# feature importances
imp_df = pd.DataFrame(columns=['feature', 'importance'],
index=range(len(x_df.columns)))
imp_df.feature = x_df.columns
imp_df.importance = rf.feature_importances_
# save
imp_df.to_csv(self.imp_path)
print(f'written to {self.imp_path}')
def get_pos(self):
"""
Position of every node: lon and lat
"""
print('preprocessing positions of nodes...')
# read in data
reg_df = pd.read_csv(self.reg_path, index_col=0)
sd = StationData()
sd.read_from_file()
station_df = sd.df
# initialize position data frame
pos_df = pd.DataFrame(index=reg_df.index,
columns=['lon', 'lat', 'datetime'])
pos_df.index.name = 'node_index'
# write positions to df
for idx, col in station_df.iterrows():
id_filter = reg_df.station_id==idx
for coordinate in ['lon', 'lat']:
pos_df.loc[id_filter, coordinate] = col[coordinate]
pos_df.datetime = reg_df.datetime
# save
pos_df.to_csv(self.pos_path)
print(f'written to {self.pos_path}')
def get_edges(self):
"""
Collect all edges. This routine runs best on the mem192
partition of JUWELS.
It also produces the edge_tmp.csv file, which makes
calculating the edge weights a lot faster.
"""
print('preprocessing edges...')
# settings for edges, radius in km and time window
radius = 50.
time_window = pd.Timedelta('0 days 06:00:00')
# read in data
station_df = pd.read_csv(self.station_path, index_col=0)
reg_df = pd.read_csv(self.reg_path, index_col=0,
parse_dates=['datetime'])
# reg_df = reg_df[0:int(1/1000*len(reg_df))].copy() <-- for testing
# initialize edge dataframe, n.b. it is reorganized later
edge_df = pd.DataFrame(index=reg_df.index,
columns=['source_indices'])
edge_df.index.name = 'target_index'
# find neighbors of stations in given radius
neighbor_df = pd.DataFrame(index=station_df.index,
columns=['neighbors'])
ids = station_df.index
x_, y_, z_ = geo_to_cartesian(station_df.lon.values,
station_df.lat.values)
coords = np.array([x_, y_, z_]).T
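# neighbor search in cartesian space; for small radii the euclidean
# (chord) distance closely approximates the great-circle distance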
nn = NearestNeighbors()
nn.fit(coords)
_, idx_lists = nn.radius_neighbors(coords, radius=radius)
for trg_idx, src_idx_list in enumerate(idx_lists):
trg_id = ids[trg_idx]
src_ids = [ids[src_idx] for src_idx in sorted(src_idx_list)]
neighbor_df.at[trg_id, 'neighbors'] = src_ids
# edges exist if measurements are at neighboring stations
# and in the given time window
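# the neighbor lookup is independent for every row of the register,
# so it is parallelized with dask across partitions and processes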
reg_ddf = ddf.from_pandas(reg_df, npartitions=6)
datetime_start = datetime.datetime.now()
print(datetime_start)
args = (neighbor_df, reg_df, radius, time_window)
edge_dseries = reg_ddf.apply(neighbor_index_filter,
args=args,
axis=1,
meta=edge_df['source_indices'])
edge_series = edge_dseries.compute(scheduler='multiprocessing')
edge_df.source_indices = edge_series
print(datetime.datetime.now() - datetime_start)
# save temporary edge_df
edge_df.to_csv(self.edge_tmp_path)
print(f'written to {self.edge_tmp_path}')
# reorganize the edge_df, so one edge gets one row.
source_index_list, target_index_list = [], []
for index, row in edge_df.iterrows():
source_indices = row.source_indices
target_index = index
source_index_list += source_indices
target_index_list += [target_index]*len(source_indices)
edge_df = pd.DataFrame()
edge_df['source_node_index'] = source_index_list
edge_df['target_node_index'] = target_index_list
edge_df.index.name = 'edge_index'
# save
edge_df.to_csv(self.edge_path)
print(f'written to {self.edge_path}')
def get_edge_weights(self, features=None):
"""
Weight edges by cosine similarity
"""
print('preprocessing edge weights...')
# read in data
edge_tmp_df = pd.read_csv(self.edge_tmp_path, index_col=0)
x_df = pd.read_csv(self.x_path, index_col=0)
imp_df = pd.read_csv(self.imp_path, index_col=0)
# prepare x
if features:
x_df = x_df[features]
x_df = get_one_hot(x_df)
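# standardize every feature and scale it by its random forest importance,
# so that important features dominate the cosine similarity below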
for feature in x_df.columns:
scaler = StandardScaler()
scale_data = x_df[feature].to_numpy().reshape(-1, 1)
imp = float(imp_df[imp_df.feature==feature].importance.iloc[0])
x_df[feature] = scaler.fit_transform(scale_data) * imp
# iterate through all node indices
edge_weight_list = []
for target_node_index in edge_tmp_df.index:
if target_node_index % 500000 == 0:
print(f'{target_node_index/len(edge_tmp_df)*100:.1f} %')
source_node_indices = ast.literal_eval(edge_tmp_df.at[target_node_index, 'source_indices'])
x_source_nodes = x_df.loc[source_node_indices]
x_target_node = x_df.loc[target_node_index]
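# cosine similarity between the importance-weighted feature vectors;
# negative similarities are clipped to zero and the weights are
# normalized to sum to one over the incoming edges of each target node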
weights_source_nodes = cosine_similarity(x_source_nodes,
np.array(x_target_node).reshape(1, -1)
).clip(0).reshape(-1)
weights_source_nodes = weights_source_nodes/sum(weights_source_nodes)
edge_weight_list += weights_source_nodes.tolist()
# edge weight dataframe
weight_df = pd.DataFrame(columns=['edge_weight'])
weight_df['edge_weight'] = edge_weight_list
weight_df.index.name = 'edge_index'
# save
weight_df.to_csv(self.weight_path)
print(f'written to {self.weight_path}')
def convert_to_tensor(self, features=None):
"""
Convert all the .csv files to tensors
"""
print('converting to pytorch tensors...')
# x
x_df = pd.read_csv(self.x_path, index_col=0)
if features:
x_df = x_df[features]
x_df = get_one_hot(x_df)
x = torch.tensor(x_df.values.astype(np.float32))
torch.save(x, self.x_pt_path)
print(f'written to {self.x_pt_path}')
del x_df
# edges
edge_df = pd.read_csv(self.edge_path, index_col=0)
src_list = edge_df.source_node_index.to_list()
trg_list = edge_df.target_node_index.to_list()
edge_index = torch.tensor(np.array([src_list, trg_list])).detach()
torch.save(edge_index, self.edge_index_pt_path)
print(f'written to {self.edge_index_pt_path}')
del edge_df
# edge weights
weight_df = pd.read_csv(self.weight_path, index_col=0)
edge_weights = torch.tensor(weight_df.values.astype(np.float32)).view(-1, 1)
torch.save(edge_weights, self.edge_weight_pt_path)
print(f'written to {self.edge_weight_pt_path}')
del weight_df
# y
y_df = pd.read_csv(self.y_path, index_col=0)
y = torch.tensor(y_df.values.astype(np.float32)).view(-1, 1)
torch.save(y, self.y_pt_path)
print(f'written to {self.y_pt_path}')
del y_df
# masks
mask_df = pd.read_csv(self.mask_path, index_col=0)
missing_o3_mask = torch.tensor(mask_df.missing_o3_mask).view(-1)
torch.save(missing_o3_mask, self.missing_o3_mask_pt_path)
print(f'written to {self.missing_o3_mask_pt_path}')
train_mask = torch.tensor(mask_df.train_mask).view(-1)
torch.save(train_mask, self.train_mask_pt_path)
print(f'written to {self.train_mask_pt_path}')
val_mask = torch.tensor(mask_df.val_mask).view(-1)
torch.save(val_mask, self.val_mask_pt_path)
print(f'written to {self.val_mask_pt_path}')
test_mask = torch.tensor(mask_df.test_mask).view(-1)
torch.save(test_mask, self.test_mask_pt_path)
print(f'written to {self.test_mask_pt_path}')
del mask_df
# position
pos_df = pd.read_csv(self.pos_path, index_col=0)
pos = torch.tensor(pos_df[['lon', 'lat']].values.astype(np.float32))
torch.save(pos, self.pos_pt_path)
print(f'written to {self.pos_pt_path}')
del pos_df
def get_dataset(self, features=None):
"""
Finalize the PyTorch dataset, including everything that is
in the gAQBench dataset.
"""
# read in data
x = torch.load(self.x_pt_path)
edge_index = torch.load(self.edge_index_pt_path)
edge_weight = torch.load(self.edge_weight_pt_path)
y = torch.load(self.y_pt_path)
missing_o3_mask = torch.load(self.missing_o3_mask_pt_path)
train_mask = torch.load(self.train_mask_pt_path)
val_mask = torch.load(self.val_mask_pt_path)
test_mask = torch.load(self.test_mask_pt_path)
pos = torch.load(self.pos_pt_path)
# create dataset
data = Data(x=x, edge_index=edge_index, edge_weight=edge_weight,
y=y, missing_o3_mask=missing_o3_mask,
train_mask=train_mask, val_mask=val_mask,
test_mask=test_mask, pos=pos)
self.data, self.slices = self.collate([data])
def main():
"""
Full workflow to create the dataset
"""
# features are important for the simple model and the edge weights
features = ['hour_of_day', 'day_of_year', 'relative_alt',
'pbl_height', 'temperature', 'alt',
'population_density', 'type', 'wind_v', 'wind_u',
'cloud_cover']
tro = TimeResolvedOzone()
# tro.get_reg()
# tro.get_x()
# tro.get_y()
# tro.get_gaps()
# tro.get_mask()
# tro.get_rf(features=features)
# tro.get_imp(features=features)
# tro.get_pos()
# tro.get_edges()
# tro.get_edge_weights(features=features)
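# the preprocessing steps above are commented out here; uncomment them
# to rebuild the intermediate .csv files from scratch, in the given order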
tro.convert_to_tensor(features=features)
tro.get_dataset()
dataset = tro
print(f'Dataset: {dataset}:')
print('======================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of node features: {dataset.num_features}')
print(f'Number of edge features: {dataset.num_edge_features}')
data = dataset[0] # Get the first graph object.
print(data)
print('======================')
# Gather some statistics about the graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {(2*data.num_edges) / data.num_nodes:.2f}')
print(f'Number of training nodes: {data.train_mask.sum()}')
print(f'Training node share: {int(data.train_mask.sum()) / data.num_nodes:.2f}')
print(f'Contains isolated nodes: {data.has_isolated_nodes()}')
print(f'Contains self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')
def feature_selection():
"""
Forward feature selection for the simple model
"""
# initialize class
tro = TimeResolvedOzone()
# read in data
x_df = pd.read_csv(tro.x_path, index_col=0)
all_features = x_df.columns
# create pairs
pair_list = []
for f1 in all_features:
for f2 in all_features:
condition1 = f1 != f2
condition2 = [f2, f1] not in pair_list
if (condition1 and condition2):
pair_list.append([f1, f2])
# train on all pairs and append to r2 list
print('\ntesting 2 features:')
r2_list = []
for pair in pair_list:
r2 = tro.get_rf(features=pair, return_r2=True, save=False)
r2_list.append(r2)
print(f'{pair}, {r2:.3f}')
# look for the winning feature
chosen_features = pair_list[r2_list.index(max(r2_list))]
best_r2 = max(r2_list)
print('\nwinners:')
print(f'{chosen_features}, {best_r2:.3f}')
# append new features iteratively until r2 decreases
new_best_r2 = 1.
while new_best_r2 > best_r2:
print(f'\ntesting {len(chosen_features)+1} features:')
candidates_list = []
new_r2_list = []
for feature in all_features:
if feature in chosen_features:
continue
candidates = chosen_features + [feature]
r2 = tro.get_rf(features=candidates, return_r2=True,
save=False)
candidates_list.append(candidates)
new_r2_list.append(r2)
print(f'{candidates}, {r2:.3f}')
chosen_features = candidates_list[new_r2_list.index(
max(new_r2_list))]
new_best_r2 = max(new_r2_list)
print('winners:')
print(f'{chosen_features}, {new_best_r2:.3f}')
# todo: new best r2 and best r2 are not comparable
if __name__ == '__main__':
"""
Start routines
"""
main()
# feature_selection()