time_resolved.py
    """
    Creates a pytorch Dataset which contains time
    resolved data.
    """
    
    # general
    import random
    import pdb
    import warnings
    import pickle as pkl
    import datetime
    
    # data science
    import numpy as np
    import pandas as pd
    import dask.dataframe as ddf
    
    # sklearn
    from sklearn.preprocessing import StandardScaler
    from sklearn.neighbors import NearestNeighbors
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_squared_error, r2_score
    from sklearn.metrics.pairwise import cosine_similarity
    
    # pytorch
    import torch
    from torch_geometric.data import Data, InMemoryDataset
    
    # own package
    import settings
    from retrieval.toar_db import StationData
    from retrieval.toar_db import HourlyData
    from preprocessing.utils import geo_to_cartesian, get_one_hot, \
                                    neighbor_index_filter
    
    
    # suppress future warnings
    warnings.filterwarnings('ignore')
    
    
    class TimeResolvedOzone(InMemoryDataset):
        """
        A dataset for graph ML on time resolved
        ozone data.
        """
        def __init__(self):
            """
            Initialize the class
            """
            # super class
            InMemoryDataset.__init__(self)
    
            # raw data paths
            self.raw_data_dir = settings.resources_dir + 'time_resolved_raw/'
            self.station_path = self.raw_data_dir + 'stations.csv'
    
            # preprocessed data paths
            self.out_dir = settings.resources_dir + 'time_resolved_preproc/'
            self.reg_path = self.out_dir + 'reg.csv'
            self.x_path = self.out_dir + 'x.csv'
            self.y_path = self.out_dir + 'y.csv'
            self.gap_path = self.out_dir + 'gap.csv'
            self.mask_path = self.out_dir + 'mask.csv'
            self.imp_path = self.out_dir + 'imp.csv'
            self.rf_path = self.out_dir + 'rf.pkl'
            self.pos_path = self.out_dir + 'pos.csv'
            self.edge_path = self.out_dir + 'edge.csv'
            self.edge_tmp_path = self.out_dir + 'edge_tmp.csv'
            self.weight_path = self.out_dir + 'weight.csv'
    
            # torch dataset paths
            self.dataset_dir = settings.resources_dir + 'time_resolved_dataset/'
            self.x_pt_path = self.dataset_dir + 'x.pt'
            self.edge_index_pt_path = self.dataset_dir + 'edge_index.pt'
            self.edge_weight_pt_path = self.dataset_dir + 'edge_weights.pt'
            self.y_pt_path = self.dataset_dir + 'y.pt'
            self.missing_o3_mask_pt_path = self.dataset_dir + 'missing_o3_mask.pt'
            self.train_mask_pt_path = self.dataset_dir + 'train_mask.pt'
            self.val_mask_pt_path = self.dataset_dir + 'val_mask.pt'
            self.test_mask_pt_path = self.dataset_dir + 'test_mask.pt'
            self.pos_pt_path = self.dataset_dir + 'pos.pt'
    
        def get_reg(self):
            """
            Create a register file from which the sample id, the corresponding
            station id, and the datetime of every node can be read.
            """
            print('preprocessing register...')
    
            # example data
            hd = HourlyData('o3')
            hd.read_from_file()
    
            # filter the datetime index and add time offset
            time_offset = settings.time_offset
            hd.df.index = pd.to_datetime(hd.df.index) + time_offset
            start = pd.to_datetime(settings.filter_datetime_start)
            end = pd.to_datetime(settings.filter_datetime_end)
            filter_condition = hd.df.index.to_series().between(start, end)
            hd.df = hd.df[filter_condition].copy()
    
            # set up df
            n_samples_total = len(hd.df.index)*len(hd.df.columns)
            cols = ['station_id', 'datetime']
            reg_df = pd.DataFrame(index=range(n_samples_total),
                                  columns=cols)
            reg_df.index.name = 'node_index'
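
            # n.b. node indices run station-major: all time steps of the first
            # station, then all time steps of the next station, and so on; the
            # flattening with .values.T.reshape(-1) in get_x() and get_y()
            # relies on this ordering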
    
            # fill df
            idx = 0
            for station_idx in hd.df.columns:
                for time_idx in hd.df.index:
                    reg_df.loc[idx] = [station_idx, time_idx]
                    idx += 1
                    if idx % 500000 == 0:
                        print(f'{idx/len(reg_df)*100:.1f} %')
    
            # save
            reg_df.to_csv(self.reg_path)
            print(f'written to {self.reg_path}')
    
        def get_x(self):
            """
            A file with all node features
            """
            print('preprocessing x...')
    
            # organize features
            easy_names = ['hour_of_day', 'day_of_year']
            rea_names = ['temp', 'relhum', 'cloudcover',
                         'pblheight', 'u', 'v']
            meta_names = ['alt', 'relative_alt', 'population_density',
                          'nightlight', 'type', 'type_of_area']
    
            # set up df
            feature_names = easy_names + rea_names + meta_names
            reg_df = pd.read_csv(self.reg_path, index_col=0,
                                 converters={'datetime': pd.Timestamp})
            x_df = pd.DataFrame(columns=feature_names, index=reg_df.index)
    
            # easy fields
            x_df.hour_of_day = [d.hour for d in reg_df.datetime]
            x_df.day_of_year = [d.dayofyear for d in reg_df.datetime]
    
            # rea fields, filter year and add time offset
            for rea_name in rea_names:
                hd = HourlyData(rea_name)
                hd.read_from_file()
                time_offset = settings.time_offset
                hd.df.index = pd.to_datetime(hd.df.index) + time_offset
                start = pd.to_datetime(settings.filter_datetime_start)
                end = pd.to_datetime(settings.filter_datetime_end)
                filter_condition = hd.df.index.to_series().between(start, end)
                hd.df = hd.df[filter_condition].copy()
                x_df[rea_name] = hd.df.values.T.reshape(-1)
    
            # meta fields
            sd = StationData()
            sd.read_from_file()
            for idx, col in sd.df.iterrows():
                id_filter = reg_df.station_id==idx
                for meta_name in meta_names:
                    x_df.loc[id_filter, meta_name] = col[meta_name]
    
            # rename fields to their final names
            mapper = {'temp': 'temperature',
                     'relhum': 'relative_humidity',
                     'cloudcover': 'cloud_cover',
                     'pblheight': 'pbl_height',
                     'u': 'wind_u',
                     'v': 'wind_v'}
            x_df.rename(columns=mapper, inplace=True)
    
            # save
            x_df.to_csv(self.x_path)
            print(f'written to {self.x_path}')
    
        def get_y(self):
            """
            A file with all the labels (i.e. ozone values)
            """
            print('preprocessing y...')
            hd = HourlyData('o3')
            hd.read_from_file()
    
            # filter
            time_offset = settings.time_offset
            hd.df.index = pd.to_datetime(hd.df.index) + time_offset
            start = pd.to_datetime(settings.filter_datetime_start)
            end = pd.to_datetime(settings.filter_datetime_end)
            filter_condition = hd.df.index.to_series().between(start, end)
            hd.df = hd.df[filter_condition].copy()
    
            # flatten station by station, moving through all time steps of one
            # station before the next (matches the register order from get_reg)
            y_values = hd.df.values.T.reshape(-1)
            y_df = pd.DataFrame(index=range(len(y_values)),
                                columns=['y'],
                                data=y_values)
            y_df.index.name = 'node_index'
    
            # save
            y_df.to_csv(self.y_path)
            print(f'written to {self.y_path}')
    
        def get_gaps(self):
            """
            Analyze the gaps in the measurement data and their lengths.

            Twins are gaps at other stations that share the same start
            and end datetime.
            """
            print('analyzing existing gaps...')
    
            # read in data
            y_df = pd.read_csv(self.y_path, index_col=0)
            reg_df = pd.read_csv(self.reg_path, index_col=0)
    
            # set up df
            columns=['station_id', 'type', 'start_datetime',
                     'end_datetime', 'start_idx', 'len', 'twins']
            gap_df = pd.DataFrame(columns=columns)
            gap_df.index.name = 'gap_index'
    
            # set all values to 1 and all nans to 0
            # (a value is NaN exactly when it does not equal itself)
            y_df.loc[y_df.y==y_df.y, 'y'] = 1
            y_df.loc[y_df.y!=y_df.y, 'y'] = 0
    
            # find consecutive gaps and no gaps
            gap_idx = 0
            y_old = y_df.y[0]
            station_id_old = reg_df.loc[0, 'station_id']
            y_index_gap_start = y_df.index[0]
            gap_len = 0
            for y_index, row in y_df.iterrows():
                if y_index % 500000 == 0:
                    print(f'{y_index/len(y_df)*100:.1f} %')
                y = row.y
                station_id = reg_df.loc[y_index, 'station_id']
                if y==y_old and station_id==station_id_old:
                    # increase len
                    gap_len += 1
                else:
                    # write gap to gap df and initialize new one
                    gap_type = 'missing_o3' if y_old== 0 else 'value'
                    if gap_type == 'missing_o3':
                        start = reg_df.loc[y_index_gap_start, 'datetime']
                        end = reg_df.loc[y_index_gap_start+gap_len, 'datetime']
                        twins = 0
                        gap_df.loc[gap_idx, :] = [station_id_old, gap_type,
                                                  start, end,
                                                  y_index_gap_start, gap_len, twins]
                    # print([station_id_old, gap_type,
                    #        y_index_gap_start, gap_len])
                        gap_idx += 1
                    y_index_gap_start = y_index
                    gap_len = 1
                y_old = y
                station_id_old = station_id
    
            # find twins with same start_datetime and end datetime
            print('analyzing twins of gaps...')
            for idx, row in gap_df.iterrows():
                if idx % 10000 == 0:
                    print(f'{idx/len(gap_df)*100:.1f} %')
                twins = []
                station_id = row.station_id
                start = row.start_datetime
                end = row.end_datetime
                id_filter = gap_df.station_id != station_id
                start_filter = gap_df.start_datetime == start
                end_filter = gap_df.end_datetime == end
                gap_df_filtered = gap_df[id_filter&start_filter&end_filter]
                twins = gap_df_filtered.index.to_list()
                gap_df.at[idx, 'twins'] = len(twins)
    
            # save
            gap_df.to_csv(self.gap_path)
            print(f'written to {self.gap_path}')
    
        def get_mask(self):
            """
            - missing_o3_mask (True if o3 is missing)
            - train_mask (True if training sample)
            - val_mask (True if val sample)
            - test_mask (True if test sample)
            n.b. all masks should have the same statistical properties as
            missing_o3_mask
            """
            print('preparing masks...')
    
            # read in gap catalogue, register and ozone df
            gap_df = pd.read_csv(self.gap_path, index_col=0)
            reg_df = pd.read_csv(self.reg_path, index_col=0)
            y_df = pd.read_csv(self.y_path, index_col=0)
    
            # initialize mask df
            columns = ['missing_o3_mask', 'train_mask', 'val_mask',
                       'test_mask']
            mask_df = pd.DataFrame(columns=columns, index=y_df.index,
                                   data=False)
            mask_df.index.name = 'node_index'
    
            # missing o3 is simply where we do not have any measurement
            mask_df.loc[y_df.y!=y_df.y, 'missing_o3_mask'] = True
    
            # histogram of gap lengths, bin edges at
            # 1h, 2h, 3h, 4h, 6h, 12h, 1d, 2d, 1w, 2w, 4w, 12w, 1y
            len_series = gap_df[gap_df.type=='missing_o3'].len
            bins = [1, 2, 3, 4, 6, 12, 24, 48, 168, 336, 672, 2016, 8760]
            freq, _ = np.histogram(len_series, bins=bins)
            freq = list(freq)
    
            # for val and test, mask series at random, mirroring the observed
            # gap-length distribution; reverse the bins so the longest gaps are
            # placed first, while long consecutive unmasked series still exist
            bins.reverse()
            freq.reverse()
            random.seed(settings.random_seed)
    
            for idx, bin_ in enumerate(bins[:-1]):
                lower_limit = bins[idx+1]
                upper_limit = bins[idx]
                # insert about half as many synthetic gaps as observed in this
                # bin (capped at 1000), split evenly between val and test masks
                n_required = min(round(freq[idx]/2), 1000)
                # print(f'{n_required} in [{lower_limit},{upper_limit})')
                for mask in ['val_mask', 'test_mask']:
                    masks_found = 0
                    while masks_found < round(n_required/2):
                        # pick random len in interval
                        len_ = random.randrange(lower_limit, upper_limit)
                        # pick random start index of the gap
                        start_idx = random.choice(mask_df.index)
                        start_station = reg_df.at[start_idx, 'station_id']
                        # add gap len to obtain end index
                        end_idx = start_idx + len_
                        # useless if it exceeds the original index
                        if end_idx > mask_df.index[-1]:
                            print('Exceeds df boundaries')
                            continue
                        end_station = reg_df.at[end_idx, 'station_id']
                        if start_station != end_station:
                            print('Different station')
                            continue
                        # useless if more than 10 % of the values are already True
                        mask_df_filtered = mask_df[start_idx:end_idx]
                        missing_values = len(mask_df_filtered[
                                     (mask_df_filtered==True).any(axis=1)])
                        if missing_values * 10 > len_:
                            print('Too many missing values')
                            continue
                        # set those to true that are not true elsewhere
                        index_list = mask_df_filtered[
                               (mask_df_filtered==False).all(axis=1)].index
                        mask_df.loc[index_list, mask] = True
                        start_datetime = reg_df.at[start_idx, 'datetime']
                        end_datetime = reg_df.at[end_idx, 'datetime']
                        twins = 0
                        _ = [start_station, mask, start_datetime,
                             end_datetime, start_idx, len_, twins]
                        gap_df.loc[len(gap_df), :] = _
                        masks_found += 1
                        print(start_station, missing_values, len_)
    
            # train mask is where no other mask applies
            index_list = mask_df[(mask_df==False).all(axis=1)].index
            mask_df.loc[index_list, 'train_mask'] = True
    
            # save
            mask_df.to_csv(self.mask_path)
            print(f'written to {self.mask_path}')
            gap_df.to_csv(self.gap_path)
            print(f'written to {self.gap_path}')
    
        def get_rf(self, features=None, return_r2=False, save=True):
            """
            Fit a simple model to x and y, then save it
    
            A feature list and the option to return the r2 are given
            for feature selection purposes.
            """
            if not return_r2:
                print('baselines...')
    
            # read in data
            x_df = pd.read_csv(self.x_path, index_col=0)
            y_df = pd.read_csv(self.y_path, index_col=0)
            mask_df = pd.read_csv(self.mask_path, index_col=0)
    
            # choosing features, one-hot encoding
            if features:
                x_df = x_df[features]
            x_df = get_one_hot(x_df)
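            # n.b. get_one_hot (preprocessing.utils) is assumed to one-hot
            # encode the categorical columns, e.g. type and type_of_area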
    
            # prepare data
            x_train = x_df[mask_df.train_mask].to_numpy()
            x_val = x_df[mask_df.val_mask].to_numpy()
            y_train = y_df[mask_df.train_mask].to_numpy().reshape(-1)
            y_val = y_df[mask_df.val_mask].to_numpy().reshape(-1)
    
            # fit model
            rf = RandomForestRegressor(random_state=settings.random_seed,
                                       n_jobs=-1, max_depth=None,
                                       n_estimators=100)
            rf.fit(x_train, y_train)
    
            # evaluate
            y_val_hat = rf.predict(x_val)
            y_train_hat = rf.predict(x_train)
            rmse_train = (mean_squared_error(y_train, y_train_hat))**.5
            r2_train = r2_score(y_train, y_train_hat)
            rmse_val = (mean_squared_error(y_val, y_val_hat))**.5
            r2_val = r2_score(y_val, y_val_hat)
    
            # if the r2 is not returned, print stats
            if not return_r2:
                print('======================')
                if features:
                    print('Features')
                    print([f for f in features])
                print('Baseline results for train:')
                print(f'RMSE: {rmse_train:.2f}, R2: {r2_train:.2f}')
                print('Baseline results for val:')
                print(f'RMSE: {rmse_val:.2f}, R2: {r2_val:.2f}')
                print('======================')
    
            # save
            if save:
                pkl.dump(rf, open(self.rf_path, 'wb'))
                print(f'written to {self.rf_path}')
    
            # return metric
            if return_r2:
                return r2_val
    
        def get_imp(self, features=None):
            """
            Feature importances of RF to define the graph structure
            """
            print('importances of baseline model...')
    
            # read in data and model
            x_df = pd.read_csv(self.x_path, index_col=0)
            if features:
                x_df = x_df[features]
            x_df = get_one_hot(x_df)
            rf = pkl.load(open(self.rf_path, 'rb'))
    
            # feature importances
            imp_df = pd.DataFrame(columns=['feature', 'importance'],
                                  index=range(len(x_df.columns)))
            imp_df.feature = x_df.columns
            imp_df.importance = rf.feature_importances_
    
            # save
            imp_df.to_csv(self.imp_path)
            print(f'written to {self.imp_path}')
    
        def get_pos(self):
            """
            Position of every node: lon and lat
            """
            print('preprocessing positions of nodes...')
    
            # read in data
            reg_df = pd.read_csv(self.reg_path, index_col=0)
            sd = StationData()
            sd.read_from_file()
            station_df = sd.df
    
            # initialize position data frame
            pos_df = pd.DataFrame(index=reg_df.index,
                                  columns=['lon', 'lat', 'datetime'])
            pos_df.index.name = 'node_index'
    
            # write positions to df
            for idx, col in station_df.iterrows():
                id_filter = reg_df.station_id==idx
                for coordinate in ['lon', 'lat']:
                    pos_df.loc[id_filter, coordinate] = col[coordinate]
            pos_df.datetime = reg_df.datetime
    
            # save
            pos_df.to_csv(self.pos_path)
            print(f'written to {self.pos_path}')
    
        def get_edges(self):
            """
            Collect all edges. This routine is best run on the mem192
            partition of JUWELS.
    
            This routine also produces the edge_tmp.csv file,
            which makes calculating edge weights a lot faster.
            """
            print('preprocessing edges...')
    
            # settings for edges, radius in km and time window
            radius = 50.
            time_window = pd.Timedelta('0 days 06:00:00')
    
            # read in data
            station_df = pd.read_csv(self.station_path, index_col=0)
            reg_df = pd.read_csv(self.reg_path, index_col=0,
                                 parse_dates=['datetime'])
            # reg_df = reg_df[0:int(1/1000*len(reg_df))].copy() <-- for testing
    
            # initialize edge dataframe, n.b. it is reorganized later
            edge_df = pd.DataFrame(index=reg_df.index,
                                    columns=['source_indices'])
            edge_df.index.name = 'target_index'
    
            # find neighbors of stations in given radius
            neighbor_df = pd.DataFrame(index=station_df.index,
                                       columns=['neighbors'])
            ids = station_df.index
            x_, y_, z_ = geo_to_cartesian(station_df.lon.values,
                                          station_df.lat.values)
            coords = np.array([x_, y_, z_]).T
            nn = NearestNeighbors()
            nn.fit(coords)
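            # n.b. radius_neighbors uses Euclidean distance in the cartesian
            # space; assuming geo_to_cartesian returns coordinates in km, this
            # matches the 50 km radius above (chord length approximates arc
            # length at this scale)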
            _, idx_lists = nn.radius_neighbors(coords, radius=radius)
            for trg_idx, src_idx_list in enumerate(idx_lists):
                trg_id = ids[trg_idx]
                src_ids = [ids[src_idx] for src_idx in sorted(src_idx_list)]
                neighbor_df.at[trg_id, 'neighbors'] = src_ids
    
            # edges exist if measurements are at neighboring stations
            # and in the given time window
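            # n.b. neighbor_index_filter (preprocessing.utils) is expected to
            # return, for each register row, the node indices at neighboring
            # stations whose datetime lies within time_window of that row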
            reg_ddf = ddf.from_pandas(reg_df, npartitions=6)
            datetime_start = datetime.datetime.now()
            print(datetime_start)
            args = (neighbor_df, reg_df, radius, time_window)
            edge_dseries = reg_ddf.apply(neighbor_index_filter,
                                         args=args,
                                         axis=1,
                                         meta=edge_df['source_indices'])
            edge_series = edge_dseries.compute(scheduler='multiprocessing')
            edge_df.source_indices = edge_series
            print(datetime.datetime.now() - datetime_start)
    
            # save temporary edge_df
            edge_df.to_csv(self.edge_tmp_path)
            print(f'written to {self.edge_tmp_path}')
    
            # reorganize the edge_df, so one edge gets one row.
            source_index_list, target_index_list = [], []
            for index, row in edge_df.iterrows():
                source_indices = row.source_indices
                target_index = index
                source_index_list += source_indices
                target_index_list += [target_index]*len(source_indices)
            edge_df = pd.DataFrame()
            edge_df['source_node_index'] = source_index_list
            edge_df['target_node_index'] = target_index_list
            edge_df.index.name = 'edge_index'
    
            # save
            edge_df.to_csv(self.edge_path)
            print(f'written to {self.edge_path}')
    
        def get_edge_weights(self, features=None):
            """
            Weight edges by cosine similarity
            """
            print('preprocessing edge weights...')
    
            # read in data
            edge_tmp_df = pd.read_csv(self.edge_tmp_path, index_col=0)
            x_df = pd.read_csv(self.x_path, index_col=0)
            imp_df = pd.read_csv(self.imp_path, index_col=0)
    
            # prepare x
            if features:
                x_df = x_df[features]
            x_df = get_one_hot(x_df)
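            # scale every feature to zero mean and unit variance, then weight
            # it by its random forest importance so that important features
            # dominate the cosine similarity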
            for feature in x_df.columns:
                scaler = StandardScaler()
                scale_data = x_df[feature].to_numpy().reshape(-1, 1)
                imp = float(imp_df.loc[imp_df.feature==feature, 'importance'].iloc[0])
                x_df[feature] = scaler.fit_transform(scale_data) * imp
    
            # iterate through all target nodes; clip negative similarities to
            # zero and normalize so the incoming edge weights of each target
            # node sum to one
            edge_weight_list = []
            for target_node_index in edge_tmp_df.index:
                if target_node_index % 500000 == 0:
                    print(f'{target_node_index/len(edge_tmp_df)*100:.1f} %')
                source_node_indices = eval(edge_tmp_df.at[target_node_index, 'source_indices'])
                x_source_nodes = x_df.loc[source_node_indices]
                x_target_node = x_df.loc[target_node_index]
                weights_source_nodes = cosine_similarity(x_source_nodes,
                                       np.array(x_target_node).reshape(1, -1)
                                       ).clip(0).reshape(-1)
                weights_source_nodes = weights_source_nodes/sum(weights_source_nodes)
                edge_weight_list +=  weights_source_nodes.tolist()
    
            # edge weight dataframe
            weight_df = pd.DataFrame(columns=['edge_weight'])
            weight_df['edge_weight'] = edge_weight_list
            weight_df.index.name = 'edge_index'
    
            # save
            weight_df.to_csv(self.weight_path)
            print(f'written to {self.weight_path}')
    
        def convert_to_tensor(self, features=None):
            """
            Convert all the .csv files to tensors
            """
            print('converting to pytorch tensors...')
    
            # x
            x_df = pd.read_csv(self.x_path, index_col=0)
            if features:
                x_df = x_df[features]
            x_df = get_one_hot(x_df)
            x = torch.tensor(x_df.values.astype(np.float32))
            torch.save(x, self.x_pt_path)
            print(f'written to {self.x_pt_path}')
            del x_df
    
            # edges
            edge_df = pd.read_csv(self.edge_path, index_col=0)
            src_list = edge_df.source_node_index.to_list()
            trg_list = edge_df.target_node_index.to_list()
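            # stack source and target lists into a [2, num_edges] tensor, the
            # edge_index layout expected by torch_geometric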
            edge_index = torch.tensor(np.array([src_list, trg_list])).detach()
            torch.save(edge_index, self.edge_index_pt_path)
            print(f'written to {self.edge_index_pt_path}')
            del edge_df
    
            # edge weights
            weight_df = pd.read_csv(self.weight_path, index_col=0)
            edge_weights = torch.tensor(weight_df.values.astype(np.float32)).view(-1, 1)
            torch.save(edge_weights, self.edge_weight_pt_path)
            print(f'written to {self.edge_weight_pt_path}')
            del weight_df
    
            # y
            y_df = pd.read_csv(self.y_path, index_col=0)
            y = torch.tensor(y_df.values.astype(np.float32)).view(-1, 1)
            torch.save(y, self.y_pt_path)
            print(f'written to {self.y_pt_path}')
            del y_df
    
            # masks
            mask_df = pd.read_csv(self.mask_path, index_col=0)
            missing_o3_mask = torch.tensor(mask_df.missing_o3_mask).view(-1)
            torch.save(missing_o3_mask, self.missing_o3_mask_pt_path)
            print(f'written to {self.missing_o3_mask_pt_path}')
            train_mask = torch.tensor(mask_df.train_mask).view(-1)
            torch.save(train_mask, self.train_mask_pt_path)
            print(f'written to {self.train_mask_pt_path}')
            val_mask = torch.tensor(mask_df.val_mask).view(-1)
            torch.save(val_mask, self.val_mask_pt_path)
            print(f'written to {self.val_mask_pt_path}')
            test_mask = torch.tensor(mask_df.test_mask).view(-1)
            torch.save(test_mask, self.test_mask_pt_path)
            print(f'written to {self.test_mask_pt_path}')
            del mask_df
    
            # position
            pos_df = pd.read_csv(self.pos_path, index_col=0)
            pos = torch.tensor(pos_df[['lon', 'lat']].values.astype(np.float32))
            torch.save(pos, self.pos_pt_path)
            print(f'written to {self.pos_pt_path}')
            del pos_df
    
        def get_dataset(self, features=None):
            """
            Finalize the PyTorch dataset, including everything that is
            in the gAQBench dataset.
            """
            # read in data
            x = torch.load(self.x_pt_path)
            edge_index = torch.load(self.edge_index_pt_path)
            edge_weight = torch.load(self.edge_weight_pt_path)
            y = torch.load(self.y_pt_path)
            missing_o3_mask = torch.load(self.missing_o3_mask_pt_path)
            train_mask = torch.load(self.train_mask_pt_path)
            val_mask = torch.load(self.val_mask_pt_path)
            test_mask = torch.load(self.test_mask_pt_path)
            pos = torch.load(self.pos_pt_path)
    
            # create dataset
            data = Data(x=x, edge_index=edge_index, edge_weight=edge_weight,
                        y=y, missing_o3_mask=missing_o3_mask,
                        train_mask=train_mask, val_mask=val_mask,
                        test_mask=test_mask, pos=pos)
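            # collate wraps the single graph into the InMemoryDataset storage
            # format: self.data holds the concatenated tensors, self.slices
            # the corresponding offsets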
            self.data, self.slices = self.collate([data])
    
    
    def main():
        """
        Full workflow to create the dataset
        """
        # the feature list is used for the simple baseline model and the edge weights
        features = ['hour_of_day', 'day_of_year', 'relative_alt',
                    'pbl_height', 'temperature', 'alt',
                    'population_density', 'type', 'wind_v', 'wind_u',
                    'cloud_cover']
        tro = TimeResolvedOzone()
        # tro.get_reg()
        # tro.get_x()
        # tro.get_y()
        # tro.get_gaps()
        # tro.get_mask()
        # tro.get_rf(features=features)
        # tro.get_imp(features=features)
        # tro.get_pos()
        # tro.get_edges()
        # tro.get_edge_weights(features=features)
        tro.convert_to_tensor(features=features)
        tro.get_dataset()
        dataset = tro
    
        print(f'Dataset: {dataset}:')
        print('======================')
        print(f'Number of graphs: {len(dataset)}')
        print(f'Number of node features: {dataset.num_features}')
        print(f'Number of edge features: {dataset.num_edge_features}')
        data = dataset[0]  # Get the first graph object.
        print(data)
        print('======================')
        # Gather some statistics about the graph.
        print(f'Number of nodes: {data.num_nodes}')
        print(f'Number of edges: {data.num_edges}')
        print(f'Average node degree: {(2*data.num_edges) / data.num_nodes:.2f}')
        print(f'Number of training nodes: {data.train_mask.sum()}')
        print(f'Training node share: {int(data.train_mask.sum()) / data.num_nodes:.2f}')
        print(f'Contains isolated nodes: {data.has_isolated_nodes()}')
        print(f'Contains self-loops: {data.has_self_loops()}')
        print(f'Is undirected: {data.is_undirected()}')
    
    
    def feature_selection():
        """
        Forward feature selection for the simple model
        """
        # initialize class
        tro = TimeResolvedOzone()
    
        # read in data
        x_df = pd.read_csv(tro.x_path, index_col=0)
        all_features = x_df.columns
    
        # create pairs
        pair_list = []
        for f1 in all_features:
            for f2 in all_features:
                condition1 = f1 != f2
                condition2 = [f2, f1] not in pair_list
                if (condition1 and condition2):
                    pair_list.append([f1, f2])
    
        # train on all pairs and append to r2 list
        print('\ntesting 2 features:')
        r2_list = []
        for pair in pair_list:
            r2 = tro.get_rf(features=pair, return_r2=True, save=False)
            r2_list.append(r2)
            print(f'{pair}, {r2:.3f}')
    
        # look for the winning feature
        chosen_features = pair_list[r2_list.index(max(r2_list))]
        best_r2 = max(r2_list)
        print('\nwinners:')
        print(f'{chosen_features}, {best_r2:.3f}')
    
        # append new features iteratively until r2 decreases
        new_best_r2 = 1.
        while new_best_r2 > best_r2:
            print(f'\ntesting {len(chosen_features)+1} features:')
            candidates_list = []
            new_r2_list = []
            for feature in all_features:
                if feature in chosen_features:
                    continue
                candidates = chosen_features + [feature]
                r2 = tro.get_rf(features=candidates, return_r2=True,
                                save=False)
                candidates_list.append(candidates)
                new_r2_list.append(r2)
                print(f'{candidates}, {r2:.3f}')
            chosen_features = candidates_list[new_r2_list.index(
                                                         max(new_r2_list))]
            new_best_r2 = max(new_r2_list)
            print('winners:')
            print(f'{chosen_features}, {new_best_r2:.3f}')
    
        # todo: new best r2 and best r2 are not comparable
    
    
    if __name__ == '__main__':
        """
        Start routines
        """
        main()
        # feature_selection()