time_resolved.py
    """
    Creates a pytorch Dataset which contains time
    resolved data.
    """
    
    # general
    import random
    import pdb
    import warnings
    import pickle as pkl
    import datetime
    
    # data science
    import numpy as np
    import pandas as pd
    import dask.dataframe as ddf
    
    # sklearn
    from sklearn.preprocessing import StandardScaler
    from sklearn.neighbors import NearestNeighbors
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_squared_error, r2_score
    from sklearn.metrics.pairwise import cosine_similarity
    
    # pytorch
    import torch
    from torch_geometric.data import Data, InMemoryDataset
    
    # own package
    import settings
    from retrieval.toar_db import StationData
    from retrieval.toar_db import HourlyData
    from preprocessing.utils import geo_to_cartesian, get_one_hot, \
                                    neighbor_index_filter
    
    
    # suppress future warnings
    warnings.filterwarnings('ignore')
    
    
    class TimeResolvedOzone(InMemoryDataset):
        """
        A dataset for graph ML on time resolved
        ozone data.
        """
        def __init__(self):
            """
            Initialize the class
            """
            # super class
            InMemoryDataset.__init__(self)
    
            # raw data paths
            self.raw_data_dir = settings.resources_dir + 'time_resolved_raw/'
            self.station_path = self.raw_data_dir + 'stations.csv'
    
            # preprocessed data paths
            self.out_dir = settings.resources_dir + 'time_resolved_preproc/'
            self.reg_path = self.out_dir + 'reg.csv'
            self.x_path = self.out_dir + 'x.csv'
            self.y_path = self.out_dir + 'y.csv'
            self.gap_path = self.out_dir + 'gap.csv'
            self.mask_path = self.out_dir + 'mask.csv'
            self.imp_path = self.out_dir + 'imp.csv'
            self.rf_path = self.out_dir + 'rf.pkl'
            self.pos_path = self.out_dir + 'pos.csv'
            self.edge_path = self.out_dir + 'edge.csv'
            self.edge_tmp_path = self.out_dir + 'edge_tmp.csv'
            self.weight_path = self.out_dir + 'weight.csv'
    
            # torch dataset paths
            self.dataset_dir = settings.resources_dir + 'time_resolved_dataset/'
            self.x_pt_path = self.dataset_dir + 'x.pt'
            self.edge_index_pt_path = self.dataset_dir + 'edge_index.pt'
            self.edge_weight_pt_path = self.dataset_dir + 'edge_weights.pt'
            self.y_pt_path = self.dataset_dir + 'y.pt'
            self.missing_o3_mask_pt_path = self.dataset_dir + 'missing_o3_mask.pt'
            self.train_mask_pt_path = self.dataset_dir + 'train_mask.pt'
            self.val_mask_pt_path = self.dataset_dir + 'val_mask.pt'
            self.test_mask_pt_path = self.dataset_dir + 'test_mask.pt'
            self.pos_pt_path = self.dataset_dir + 'pos.pt'
    
        def get_reg(self):
            """
            Create a register file from which the sample id, the corresponding
            station id, and the datetime of every node can be read.
            """
            print('preprocessing register...')
    
            # example data
            hd = HourlyData('o3')
            hd.read_from_file()
    
            # filter the datetime index and add time offset
            time_offset = settings.time_offset
            hd.df.index = pd.to_datetime(hd.df.index) + time_offset
            start = pd.to_datetime(settings.filter_datetime_start)
            end = pd.to_datetime(settings.filter_datetime_end)
            filter_condition = hd.df.index.to_series().between(start, end)
            hd.df = hd.df[filter_condition].copy()
    
            # set up df
            n_samples_total = len(hd.df.index)*len(hd.df.columns)
            cols = ['station_id', 'datetime']
            reg_df = pd.DataFrame(index=range(n_samples_total),
                                  columns=cols)
            reg_df.index.name = 'node_index'
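
            # n.b. node indices run station-major: all time steps of the first
            # station, then all time steps of the next station, and so on; the
            # flattening with .values.T.reshape(-1) in get_x() and get_y()
            # relies on this ordering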
    
            # fill df
            idx = 0
            for station_idx in hd.df.columns:
                for time_idx in hd.df.index:
                    reg_df.loc[idx] = [station_idx, time_idx]
                    idx += 1
                    if idx % 500000 == 0:
                        print(f'{idx/len(reg_df)*100:.1f} %')
    
            # save
            reg_df.to_csv(self.reg_path)
            print(f'written to {self.reg_path}')
    
        def get_x(self):
            """
            A file with all node features
            """
            print('preprocessing x...')
    
            # organize features
            easy_names = ['hour_of_day', 'day_of_year']
            rea_names = ['temp', 'relhum', 'cloudcover',
                         'pblheight', 'u', 'v']
            meta_names = ['alt', 'relative_alt', 'population_density',
                          'nightlight', 'type', 'type_of_area']
    
            # set up df
            feature_names = easy_names + rea_names + meta_names
            reg_df = pd.read_csv(self.reg_path, index_col=0,
                                 converters={'datetime': pd.Timestamp})
            x_df = pd.DataFrame(columns=feature_names, index=reg_df.index)
    
            # easy fields
            x_df.hour_of_day = [d.hour for d in reg_df.datetime]
            x_df.day_of_year = [d.dayofyear for d in reg_df.datetime]
    
            # rea fields, filter year and add time offset
            for rea_name in rea_names:
                hd = HourlyData(rea_name)
                hd.read_from_file()
                time_offset = settings.time_offset
                hd.df.index = pd.to_datetime(hd.df.index) + time_offset
                start = pd.to_datetime(settings.filter_datetime_start)
                end = pd.to_datetime(settings.filter_datetime_end)
                filter_condition = hd.df.index.to_series().between(start, end)
                hd.df = hd.df[filter_condition].copy()
                x_df[rea_name] = hd.df.values.T.reshape(-1)
    
            # meta fields
            sd = StationData()
            sd.read_from_file()
            for idx, col in sd.df.iterrows():
                id_filter = reg_df.station_id==idx
                for meta_name in meta_names:
                    x_df.loc[id_filter, meta_name] = col[meta_name]
    
            # rename fields to their final names
            mapper = {'temp': 'temperature',
                     'relhum': 'relative_humidity',
                     'cloudcover': 'cloud_cover',
                     'pblheight': 'pbl_height',
                     'u': 'wind_u',
                     'v': 'wind_v'}
            x_df.rename(columns=mapper, inplace=True)
    
            # save
            x_df.to_csv(self.x_path)
            print(f'written to {self.x_path}')
    
        def get_y(self):
            """
            A file with all the labels (i.e. ozone values)
            """
            print('preprocessing y...')
            hd = HourlyData('o3')
            hd.read_from_file()
    
            # filter
            time_offset = settings.time_offset
            hd.df.index = pd.to_datetime(hd.df.index) + time_offset
            start = pd.to_datetime(settings.filter_datetime_start)
            end = pd.to_datetime(settings.filter_datetime_end)
            filter_condition = hd.df.index.to_series().between(start, end)
            hd.df = hd.df[filter_condition].copy()
    
            # flatten station by station, moving through all time steps of one
            # station before the next (matches the register order from get_reg)
            y_values = hd.df.values.T.reshape(-1)
            y_df = pd.DataFrame(index=range(len(y_values)),
                                columns=['y'],
                                data=y_values)
            y_df.index.name = 'node_index'
    
            # save
            y_df.to_csv(self.y_path)
            print(f'written to {self.y_path}')
    
        def get_gaps(self):
            """
            Analyze the gaps in the measurement data and their lengths.

            Twins are gaps at other stations that share the same start
            and end datetime.
            """
            print('analyzing existing gaps...')
    
            # read in data
            y_df = pd.read_csv(self.y_path, index_col=0)
            reg_df = pd.read_csv(self.reg_path, index_col=0)
    
            # set up df
            columns=['station_id', 'type', 'start_datetime',
                     'end_datetime', 'start_idx', 'len', 'twins']
            gap_df = pd.DataFrame(columns=columns)
            gap_df.index.name = 'gap_index'
    
            # set all values to 1 and all nans to 0
            # (a value is NaN exactly when it does not equal itself)
            y_df.loc[y_df.y==y_df.y, 'y'] = 1
            y_df.loc[y_df.y!=y_df.y, 'y'] = 0
    
            # find consecutive gaps and no gaps
            gap_idx = 0
            y_old = y_df.y[0]
            station_id_old = reg_df.loc[0, 'station_id']
            y_index_gap_start = y_df.index[0]
            gap_len = 0
            for y_index, row in y_df.iterrows():
                if y_index % 500000 == 0:
                    print(f'{y_index/len(y_df)*100:.1f} %')
                y = row.y
                station_id = reg_df.loc[y_index, 'station_id']
                if y==y_old and station_id==station_id_old:
                    # increase len
                    gap_len += 1
                else:
                    # write gap to gap df and initialize new one
                    gap_type = 'missing_o3' if y_old== 0 else 'value'
                    if gap_type == 'missing_o3':
                        start = reg_df.loc[y_index_gap_start, 'datetime']
                        end = reg_df.loc[y_index_gap_start+gap_len, 'datetime']
                        twins = 0
                        gap_df.loc[gap_idx, :] = [station_id_old, gap_type,
                                                  start, end,
                                                  y_index_gap_start, gap_len, twins]
                    # print([station_id_old, gap_type,
                    #        y_index_gap_start, gap_len])
                        gap_idx += 1
                    y_index_gap_start = y_index
                    gap_len = 1
                y_old = y
                station_id_old = station_id
    
            # find twins with same start_datetime and end datetime
            print('analyzing twins of gaps...')
            for idx, row in gap_df.iterrows():
                if idx % 10000 == 0:
                    print(f'{idx/len(gap_df)*100:.1f} %')
                twins = []
                station_id = row.station_id
                start = row.start_datetime
                end = row.end_datetime
                id_filter = gap_df.station_id != station_id
                start_filter = gap_df.start_datetime == start
                end_filter = gap_df.end_datetime == end
                gap_df_filtered = gap_df[id_filter&start_filter&end_filter]
                twins = gap_df_filtered.index.to_list()
                gap_df.at[idx, 'twins'] = len(twins)
    
            # save
            gap_df.to_csv(self.gap_path)
            print(f'written to {self.gap_path}')
    
        def get_mask(self):
            """
            - missing_o3_mask (True if o3 is missing)
            - train_mask (True if training sample)
            - val_mask (True if val sample)
            - test_mask (True if test sample)
            n.b. all masks should have the same statistical properties as
            missing_o3_mask
            """
            print('preparing masks...')
    
            # read in gap catalogue, register and ozone df
            gap_df = pd.read_csv(self.gap_path, index_col=0)
            reg_df = pd.read_csv(self.reg_path, index_col=0)
            y_df = pd.read_csv(self.y_path, index_col=0)
    
            # initialize mask df
            columns = ['missing_o3_mask', 'train_mask', 'val_mask',
                       'test_mask']
            mask_df = pd.DataFrame(columns=columns, index=y_df.index,
                                   data=False)
            mask_df.index.name = 'node_index'
    
            # missing o3 is simply where we do not have any measurement
            mask_df.loc[y_df.y!=y_df.y, 'missing_o3_mask'] = True
    
            # histogram of gap lengths, bin edges at
            # 1h, 2h, 3h, 4h, 6h, 12h, 1d, 2d, 1w, 2w, 4w, 12w, 1y
            len_series = gap_df[gap_df.type=='missing_o3'].len
            bins = [1, 2, 3, 4, 6, 12, 24, 48, 168, 336, 672, 2016, 8760]
            freq, _ = np.histogram(len_series, bins=bins)
            freq = list(freq)
    
            # for val and test, mask series at random, mirroring the observed
            # gap-length distribution; reverse the bins so the longest gaps are
            # placed first, while long consecutive unmasked series still exist
            bins.reverse()
            freq.reverse()
            random.seed(settings.random_seed)
    
            for idx, bin_ in enumerate(bins[:-1]):
                lower_limit = bins[idx+1]
                upper_limit = bins[idx]
                # insert about half as many synthetic gaps as observed in this
                # bin (capped at 1000), split evenly between val and test masks
                n_required = min(round(freq[idx]/2), 1000)
                # print(f'{n_required} in [{lower_limit},{upper_limit})')
                for mask in ['val_mask', 'test_mask']:
                    masks_found = 0
                    while masks_found < round(n_required/2):
                        # pick random len in interval
                        len_ = random.randrange(lower_limit, upper_limit)
                        # pick random start index of the gap
                        start_idx = random.choice(mask_df.index)
                        start_station = reg_df.at[start_idx, 'station_id']
                        # add gap len to obtain end index
                        end_idx = start_idx + len_
                        # useless if it exceeds the original index
                        if end_idx > mask_df.index[-1]:
                            print('Exceeds df boundaries')
                            continue
                        end_station = reg_df.at[end_idx, 'station_id']
                        if start_station != end_station:
                            print('Different station')
                            continue
                        # useless if more than 10 % of the values are already True
                        mask_df_filtered = mask_df[start_idx:end_idx]
                        missing_values = len(mask_df_filtered[
                                     (mask_df_filtered==True).any(axis=1)])
                        if missing_values * 10 > len_:
                            print('Too many missing values')
                            continue
                        # set those to true that are not true elsewhere
                        index_list = mask_df_filtered[
                               (mask_df_filtered==False).all(axis=1)].index
                        mask_df.loc[index_list, mask] = True
                        start_datetime = reg_df.at[start_idx, 'datetime']
                        end_datetime = reg_df.at[end_idx, 'datetime']
                        twins = 0
                        _ = [start_station, mask, start_datetime,
                             end_datetime, start_idx, len_, twins]
                        gap_df.loc[len(gap_df), :] = _
                        masks_found += 1
                        print(start_station, missing_values, len_)
    
            # train mask is where no other mask applies
            index_list = mask_df[(mask_df==False).all(axis=1)].index
            mask_df.loc[index_list, 'train_mask'] = True
    
            # save
            mask_df.to_csv(self.mask_path)
            print(f'written to {self.mask_path}')
            gap_df.to_csv(self.gap_path)
            print(f'written to {self.gap_path}')
    
        def get_rf(self, features=None, return_r2=False, save=True):
            """
            Fit a simple model to x and y, then save it
    
            A feature list and the option to return the r2 are given
            for feature selection purposes.
            """
            if not return_r2:
                print('baselines...')
    
            # read in data
            x_df = pd.read_csv(self.x_path, index_col=0)
            y_df = pd.read_csv(self.y_path, index_col=0)
            mask_df = pd.read_csv(self.mask_path, index_col=0)
    
            # choosing features, one-hot encoding
            if features:
                x_df = x_df[features]
            x_df = get_one_hot(x_df)
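            # n.b. get_one_hot (preprocessing.utils) is assumed to one-hot
            # encode the categorical columns, e.g. type and type_of_area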
    
            # prepare data
            x_train = x_df[mask_df.train_mask].to_numpy()
            x_val = x_df[mask_df.val_mask].to_numpy()
            y_train = y_df[mask_df.train_mask].to_numpy().reshape(-1)
            y_val = y_df[mask_df.val_mask].to_numpy().reshape(-1)
    
            # fit model
            rf = RandomForestRegressor(random_state=settings.random_seed,
                                       n_jobs=-1, max_depth=None,
                                       n_estimators=100)
            rf.fit(x_train, y_train)
    
            # evaluate
            y_val_hat = rf.predict(x_val)
            y_train_hat = rf.predict(x_train)
            rmse_train = (mean_squared_error(y_train, y_train_hat))**.5
            r2_train = r2_score(y_train, y_train_hat)
            rmse_val = (mean_squared_error(y_val, y_val_hat))**.5
            r2_val = r2_score(y_val, y_val_hat)
    
            # if the r2 is not returned, print stats
            if not return_r2:
                print('======================')
                if features:
                    print('Features')
                    print([f for f in features])
                print('Baseline results for train:')
                print(f'RMSE: {rmse_train:.2f}, R2: {r2_train:.2f}')
                print('Baseline results for val:')
                print(f'RMSE: {rmse_val:.2f}, R2: {r2_val:.2f}')
                print('======================')
    
            # save
            if save:
                pkl.dump(rf, open(self.rf_path, 'wb'))
                print(f'written to {self.rf_path}')
    
            # return metric
            if return_r2:
                return r2_val
    
        def get_imp(self, features=None):
            """
            Feature importances of RF to define the graph structure
            """
            print('importances of baseline model...')
    
            # read in data and model
            x_df = pd.read_csv(self.x_path, index_col=0)
            if features:
                x_df = x_df[features]
            x_df = get_one_hot(x_df)
            rf = pkl.load(open(self.rf_path, 'rb'))
    
            # feature importances
            imp_df = pd.DataFrame(columns=['feature', 'importance'],
                                  index=range(len(x_df.columns)))
            imp_df.feature = x_df.columns
            imp_df.importance = rf.feature_importances_
    
            # save
            imp_df.to_csv(self.imp_path)
            print(f'written to {self.imp_path}')
    
        def get_pos(self):
            """
            Position of every node: lon and lat
            """
            print('preprocessing positions of nodes...')
    
            # read in data
            reg_df = pd.read_csv(self.reg_path, index_col=0)
            sd = StationData()
            sd.read_from_file()
            station_df = sd.df
    
            # initialize position data frame
            pos_df = pd.DataFrame(index=reg_df.index,
                                  columns=['lon', 'lat', 'datetime'])
            pos_df.index.name = 'node_index'
    
            # write positions to df
            for idx, col in station_df.iterrows():
                id_filter = reg_df.station_id==idx
                for coordinate in ['lon', 'lat']:
                    pos_df.loc[id_filter, coordinate] = col[coordinate]
            pos_df.datetime = reg_df.datetime
    
            # save
            pos_df.to_csv(self.pos_path)
            print(f'written to {self.pos_path}')
    
        def get_edges(self):
            """
            Collect all edges. This routine is best run on the mem192
            partition of JUWELS.
    
            This routine also produces the edge_tmp.csv file,
            which makes calculating edge weights a lot faster.
            """
            print('preprocessing edges...')
    
            # settings for edges, radius in km and time window
            radius = 50.
            time_window = pd.Timedelta('0 days 06:00:00')
    
            # read in data
            station_df = pd.read_csv(self.station_path, index_col=0)
            reg_df = pd.read_csv(self.reg_path, index_col=0,
                                 parse_dates=['datetime'])
            # reg_df = reg_df[0:int(1/1000*len(reg_df))].copy() <-- for testing
    
            # initialize edge dataframe, n.b. it is reorganized later
            edge_df = pd.DataFrame(index=reg_df.index,
                                    columns=['source_indices'])
            edge_df.index.name = 'target_index'
    
            # find neighbors of stations in given radius
            neighbor_df = pd.DataFrame(index=station_df.index,
                                       columns=['neighbors'])
            ids = station_df.index
            x_, y_, z_ = geo_to_cartesian(station_df.lon.values,
                                          station_df.lat.values)
            coords = np.array([x_, y_, z_]).T
            nn = NearestNeighbors()
            nn.fit(coords)
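            # n.b. radius_neighbors uses Euclidean distance in the cartesian
            # space; assuming geo_to_cartesian returns coordinates in km, this
            # matches the 50 km radius above (chord length approximates arc
            # length at this scale)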
            _, idx_lists = nn.radius_neighbors(coords, radius=radius)
            for trg_idx, src_idx_list in enumerate(idx_lists):
                trg_id = ids[trg_idx]
                src_ids = [ids[src_idx] for src_idx in sorted(src_idx_list)]
                neighbor_df.at[trg_id, 'neighbors'] = src_ids
    
            # edges exist if measurements are at neighboring stations
            # and in the given time window
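            # n.b. neighbor_index_filter (preprocessing.utils) is expected to
            # return, for each register row, the node indices at neighboring
            # stations whose datetime lies within time_window of that row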
            reg_ddf = ddf.from_pandas(reg_df, npartitions=6)
            datetime_start = datetime.datetime.now()
            print(datetime_start)
            args = (neighbor_df, reg_df, radius, time_window)
            edge_dseries = reg_ddf.apply(neighbor_index_filter,
                                         args=args,
                                         axis=1,
                                         meta=edge_df['source_indices'])
            edge_series = edge_dseries.compute(scheduler='multiprocessing')
            edge_df.source_indices = edge_series
            print(datetime.datetime.now() - datetime_start)
    
            # save temporary edge_df
            edge_df.to_csv(self.edge_tmp_path)
            print(f'written to {self.edge_tmp_path}')
    
            # reorganize the edge_df, so one edge gets one row.
            source_index_list, target_index_list = [], []
            for index, row in edge_df.iterrows():
                source_indices = row.source_indices
                target_index = index
                source_index_list += source_indices
                target_index_list += [target_index]*len(source_indices)
            edge_df = pd.DataFrame()
            edge_df['source_node_index'] = source_index_list
            edge_df['target_node_index'] = target_index_list
            edge_df.index.name = 'edge_index'
    
            # save
            edge_df.to_csv(self.edge_path)
            print(f'written to {self.edge_path}')
    
        def get_edge_weights(self, features=None):
            """
            Weight edges by cosine similarity
            """
            print('preprocessing edge weights...')
    
            # read in data
            edge_tmp_df = pd.read_csv(self.edge_tmp_path, index_col=0)
            x_df = pd.read_csv(self.x_path, index_col=0)
            imp_df = pd.read_csv(self.imp_path, index_col=0)
    
            # prepare x
            if features:
                x_df = x_df[features]
            x_df = get_one_hot(x_df)
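            # scale every feature to zero mean and unit variance, then weight
            # it by its random forest importance so that important features
            # dominate the cosine similarity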
            for feature in x_df.columns:
                scaler = StandardScaler()
                scale_data = x_df[feature].to_numpy().reshape(-1, 1)
                imp = float(imp_df.loc[imp_df.feature==feature, 'importance'].iloc[0])
                x_df[feature] = scaler.fit_transform(scale_data) * imp
    
            # iterate through all target nodes; clip negative similarities to
            # zero and normalize so the incoming edge weights of each target
            # node sum to one
            edge_weight_list = []
            for target_node_index in edge_tmp_df.index:
                if target_node_index % 500000 == 0:
                    print(f'{target_node_index/len(edge_tmp_df)*100:.1f} %')
                source_node_indices = eval(edge_tmp_df.at[target_node_index, 'source_indices'])
                x_source_nodes = x_df.loc[source_node_indices]
                x_target_node = x_df.loc[target_node_index]
                weights_source_nodes = cosine_similarity(x_source_nodes,
                                       np.array(x_target_node).reshape(1, -1)
                                       ).clip(0).reshape(-1)
                weights_source_nodes = weights_source_nodes/sum(weights_source_nodes)
                edge_weight_list +=  weights_source_nodes.tolist()
    
            # edge weight dataframe
            weight_df = pd.DataFrame(columns=['edge_weight'])
            weight_df['edge_weight'] = edge_weight_list
            weight_df.index.name = 'edge_index'
    
            # save
            weight_df.to_csv(self.weight_path)
            print(f'written to {self.weight_path}')
    
        def convert_to_tensor(self, features=None):
            """
            Convert all the .csv files to tensors
            """
            print('converting to pytorch tensors...')
    
            # x
            x_df = pd.read_csv(self.x_path, index_col=0)
            if features:
                x_df = x_df[features]
            x_df = get_one_hot(x_df)
            x = torch.tensor(x_df.values.astype(np.float32))
            torch.save(x, self.x_pt_path)
            print(f'written to {self.x_pt_path}')
            del x_df
    
            # edges
            edge_df = pd.read_csv(self.edge_path, index_col=0)
            src_list = edge_df.source_node_index.to_list()
            trg_list = edge_df.target_node_index.to_list()
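            # stack source and target lists into a [2, num_edges] tensor, the
            # edge_index layout expected by torch_geometric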
            edge_index = torch.tensor(np.array([src_list, trg_list])).detach()
            torch.save(edge_index, self.edge_index_pt_path)
            print(f'written to {self.edge_index_pt_path}')
            del edge_df
    
            # edge weights
            weight_df = pd.read_csv(self.weight_path, index_col=0)
            edge_weights = torch.tensor(weight_df.values.astype(np.float32)).view(-1, 1)
            torch.save(edge_weights, self.edge_weight_pt_path)
            print(f'written to {self.edge_weight_pt_path}')
            del weight_df
    
            # y
            y_df = pd.read_csv(self.y_path, index_col=0)
            y = torch.tensor(y_df.values.astype(np.float32)).view(-1, 1)
            torch.save(y, self.y_pt_path)
            print(f'written to {self.y_pt_path}')
            del y_df
    
            # masks
            mask_df = pd.read_csv(self.mask_path, index_col=0)
            missing_o3_mask = torch.tensor(mask_df.missing_o3_mask).view(-1)
            torch.save(missing_o3_mask, self.missing_o3_mask_pt_path)
            print(f'written to {self.missing_o3_mask_pt_path}')
            train_mask = torch.tensor(mask_df.train_mask).view(-1)
            torch.save(train_mask, self.train_mask_pt_path)
            print(f'written to {self.train_mask_pt_path}')
            val_mask = torch.tensor(mask_df.val_mask).view(-1)
            torch.save(val_mask, self.val_mask_pt_path)
            print(f'written to {self.val_mask_pt_path}')
            test_mask = torch.tensor(mask_df.test_mask).view(-1)
            torch.save(test_mask, self.test_mask_pt_path)
            print(f'written to {self.test_mask_pt_path}')
            del mask_df
    
            # position
            pos_df = pd.read_csv(self.pos_path, index_col=0)
            pos = torch.tensor(pos_df[['lon', 'lat']].values.astype(np.float32))
            torch.save(pos, self.pos_pt_path)
            print(f'written to {self.pos_pt_path}')
            del pos_df
    
        def get_dataset(self, features=None):
            """
            Finalize the PyTorch dataset, including everything that is
            in the gAQBench dataset.
            """
            # read in data
            x = torch.load(self.x_pt_path)
            edge_index = torch.load(self.edge_index_pt_path)
            edge_weight = torch.load(self.edge_weight_pt_path)
            y = torch.load(self.y_pt_path)
            missing_o3_mask = torch.load(self.missing_o3_mask_pt_path)
            train_mask = torch.load(self.train_mask_pt_path)
            val_mask = torch.load(self.val_mask_pt_path)
            test_mask = torch.load(self.test_mask_pt_path)
            pos = torch.load(self.pos_pt_path)
    
            # create dataset
            data = Data(x=x, edge_index=edge_index, edge_weight=edge_weight,
                        y=y, missing_o3_mask=missing_o3_mask,
                        train_mask=train_mask, val_mask=val_mask,
                        test_mask=test_mask, pos=pos)
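            # collate wraps the single graph into the InMemoryDataset storage
            # format: self.data holds the concatenated tensors, self.slices
            # the corresponding offsets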
            self.data, self.slices = self.collate([data])
    
    
    def main():
        """
        Full workflow to create the dataset
        """
        # the feature list is used for the simple baseline model and the edge weights
        features = ['hour_of_day', 'day_of_year', 'relative_alt',
                    'pbl_height', 'temperature', 'alt',
                    'population_density', 'type', 'wind_v', 'wind_u',
                    'cloud_cover']
        tro = TimeResolvedOzone()
        # tro.get_reg()
        # tro.get_x()
        # tro.get_y()
        # tro.get_gaps()
        # tro.get_mask()
        # tro.get_rf(features=features)
        # tro.get_imp(features=features)
        # tro.get_pos()
        # tro.get_edges()
        # tro.get_edge_weights(features=features)
        tro.convert_to_tensor(features=features)
        tro.get_dataset()
        dataset = tro
    
        print(f'Dataset: {dataset}:')
        print('======================')
        print(f'Number of graphs: {len(dataset)}')
        print(f'Number of node features: {dataset.num_features}')
        print(f'Number of edge features: {dataset.num_edge_features}')
        data = dataset[0]  # Get the first graph object.
        print(data)
        print('======================')
        # Gather some statistics about the graph.
        print(f'Number of nodes: {data.num_nodes}')
        print(f'Number of edges: {data.num_edges}')
        print(f'Average node degree: {(2*data.num_edges) / data.num_nodes:.2f}')
        print(f'Number of training nodes: {data.train_mask.sum()}')
        print(f'Training node share: {int(data.train_mask.sum()) / data.num_nodes:.2f}')
        print(f'Contains isolated nodes: {data.has_isolated_nodes()}')
        print(f'Contains self-loops: {data.has_self_loops()}')
        print(f'Is undirected: {data.is_undirected()}')
    
    
    def feature_selection():
        """
        Forward feature selection for the simple model
        """
        # initialize class
        tro = TimeResolvedOzone()
    
        # read in data
        x_df = pd.read_csv(tro.x_path, index_col=0)
        all_features = x_df.columns
    
        # create pairs
        pair_list = []
        for f1 in all_features:
            for f2 in all_features:
                condition1 = f1 != f2
                condition2 = [f2, f1] not in pair_list
                if (condition1 and condition2):
                    pair_list.append([f1, f2])
    
        # train on all pairs and append to r2 list
        print('\ntesting 2 features:')
        r2_list = []
        for pair in pair_list:
            r2 = tro.get_rf(features=pair, return_r2=True, save=False)
            r2_list.append(r2)
            print(f'{pair}, {r2:.3f}')
    
        # look for the winning feature
        chosen_features = pair_list[r2_list.index(max(r2_list))]
        best_r2 = max(r2_list)
        print('\nwinners:')
        print(f'{chosen_features}, {best_r2:.3f}')
    
        # append new features iteratively until r2 decreases
        new_best_r2 = 1.
        while new_best_r2 > best_r2:
            print(f'\ntesting {len(chosen_features)+1} features:')
            candidates_list = []
            new_r2_list = []
            for feature in all_features:
                if feature in chosen_features:
                    continue
                candidates = chosen_features + [feature]
                r2 = tro.get_rf(features=candidates, return_r2=True,
                                save=False)
                candidates_list.append(candidates)
                new_r2_list.append(r2)
                print(f'{candidates}, {r2:.3f}')
            chosen_features = candidates_list[new_r2_list.index(
                                                         max(new_r2_list))]
            new_best_r2 = max(new_r2_list)
            print('winners:')
            print(f'{chosen_features}, {new_best_r2:.3f}')
    
        # todo: new best r2 and best r2 are not comparable
    
    
    if __name__ == '__main__':
        """
        Start routines
        """
        main()
        # feature_selection()