Commit 00d65a75 authored by lukas leufen

implementation of extremes in data preparation class

parent 4d408aeb
3 merge requests: !90 WIP: new release update, !89 Resolve "release branch / CI on gpu", !77 Resolve "Upsample "extremes" in standardised data space"
@@ -5,7 +5,7 @@ import datetime as dt
from functools import reduce
import logging
import os
from typing import Union, List, Iterable, Tuple
import numpy as np
import pandas as pd
@@ -17,6 +17,8 @@ from src import statistics
# define a more general date type for type hinting
date = Union[dt.date, dt.datetime]
str_or_list = Union[str, List[str]]
number = Union[float, int]
num_or_list = Union[number, List[number]]

class DataPrep(object):
@@ -58,6 +60,8 @@ class DataPrep(object):
        self.history = None
        self.label = None
        self.observation = None
        self.extremes_history = None
        self.extremes_labels = None
        self.kwargs = kwargs
        self.data = None
        self.meta = None
@@ -420,6 +424,64 @@ class DataPrep(object):
    def get_transposed_label(self):
        return self.label.squeeze("Stations").transpose("datetime", "window").copy()

    def multiply_extremes(self, extreme_values: num_or_list = 1., extremes_on_right_tail_only: bool = False,
                          timedelta: Tuple[int, str] = (1, 'm')):
        """
        This method extracts extreme values from self.label as defined by the argument extreme_values. One can also
        decide to extract only extremes on the right tail of the distribution. When extreme_values is a list of
        floats/ints, all values larger than each entry (and smaller than its negative counterpart, as extraction is
        performed in standardised space) are extracted iteratively. If for example extreme_values = [1., 2.], a value
        of 1.5 is extracted once (for the 0th entry of the list), while a value of 2.5 is extracted twice (once for
        each entry). timedelta is used to mark the extracted values by adding, by default, one minute to each
        timestamp. As TOAR data are hourly, these "artificial" data points can easily be identified later. Extreme
        inputs and labels are stored in self.extremes_history and self.extremes_labels, respectively.

        :param extreme_values: user definition of extreme
        :param extremes_on_right_tail_only: if False, also extract values which are smaller than -extreme_values;
            if True, only extract values larger than extreme_values
        :param timedelta: used as arguments for np.timedelta64 in order to mark extreme values on the datetime axis
        """
        # check type of inputs
        extreme_values = helpers.to_list(extreme_values)
        extreme_values.sort()
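        # note: number.__args__ resolves to (float, int), the members of the Union alias defined at module level,
        # so isinstance can check against both types at once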
        for i in extreme_values:
            if not isinstance(i, number.__args__):
                raise TypeError(f"Elements of list extreme_values have to be {number.__args__}, but at least element "
                                f"{i} is type {type(i)}")
        for extr_val in extreme_values:
            # check if some extreme values have already been extracted
            if (self.extremes_labels is None) or (self.extremes_history is None):
                # extract extremes based on occurrence in labels
                if extremes_on_right_tail_only:
                    extreme_label_idx = (self.label > extr_val).any(axis=0).values.reshape(-1,)
                else:
                    extreme_label_idx = np.concatenate(((self.label < -extr_val).any(axis=0).values.reshape(-1, 1),
                                                        (self.label > extr_val).any(axis=0).values.reshape(-1, 1)),
                                                       axis=1).any(axis=1)
                extremes_label = self.label[..., extreme_label_idx]
                extremes_history = self.history[..., extreme_label_idx, :]
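                # shift the timestamps of the extracted samples by the given timedelta (default: one minute) so that
                # the duplicated, "artificial" points can be distinguished from the original hourly data later on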
                extremes_label.datetime.values += np.timedelta64(*timedelta)
                extremes_history.datetime.values += np.timedelta64(*timedelta)
                self.extremes_labels = extremes_label.squeeze('Stations').transpose('datetime', 'window')
                self.extremes_history = extremes_history.transpose('datetime', 'window', 'Stations', 'variables')
            else:  # at least one extr_val iteration is done already, i.e. self.extremes_labels is not None
                if extremes_on_right_tail_only:
                    extreme_label_idx = (self.extremes_labels > extr_val).any(axis=1).values.reshape(-1,)
                else:
                    extreme_label_idx = np.concatenate(
                        ((self.extremes_labels < -extr_val).any(axis=1).values.reshape(-1, 1),
                         (self.extremes_labels > extr_val).any(axis=1).values.reshape(-1, 1)),
                        axis=1).any(axis=1)
                # compare only against the already extracted extremes to minimise computational costs
                extremes_label = self.extremes_labels[extreme_label_idx, ...]
                extremes_history = self.extremes_history[extreme_label_idx, ...]
                extremes_label.datetime.values += np.timedelta64(*timedelta)
                extremes_history.datetime.values += np.timedelta64(*timedelta)
                self.extremes_labels = xr.concat([self.extremes_labels, extremes_label], dim='datetime')
                self.extremes_history = xr.concat([self.extremes_history, extremes_history], dim='datetime')

if __name__ == "__main__":
    dp = DataPrep('data/', 'dummy', 'DEBW107', ['o3', 'temp'], statistics_per_var={'o3': 'dma8eu', 'temp': 'maximum'})
...
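The __main__ example above needs station data on disk. To illustrate the oversampling logic of multiply_extremes without any data files, here is a minimal, self-contained sketch of the same iterative extraction on toy data; it is not part of the commit, the dimension names "datetime" and "window" mirror the ones used in the method, the values are made up, and only the two-tailed branch is shown.

import numpy as np
import xarray as xr

# toy standardised labels: six hourly samples, forecast window of length 2
timestamps = np.datetime64("2020-01-01T00", "h") + np.arange(6)
labels = xr.DataArray(
    np.array([[0.5, 0.2], [1.5, 0.1], [2.5, 0.3], [-1.7, 0.0], [0.1, 0.4], [2.2, 2.1]]),
    dims=("datetime", "window"),
    coords={"datetime": timestamps, "window": [1, 2]},
)

extreme_values = [1., 2.]
extremes = None
for extr_val in sorted(extreme_values):
    if extremes is None:
        # first threshold: scan the full label set, both tails
        idx = ((labels > extr_val) | (labels < -extr_val)).any(dim="window")
        chunk = labels[idx.values, :]
    else:
        # further thresholds: only re-check what has already been extracted
        idx = ((extremes > extr_val) | (extremes < -extr_val)).any(dim="window")
        chunk = extremes[idx.values, :]
    # mark the duplicated samples by shifting their timestamps by one minute
    chunk = chunk.assign_coords(datetime=chunk.datetime.values + np.timedelta64(1, "m"))
    extremes = chunk if extremes is None else xr.concat([extremes, chunk], dim="datetime")

print(extremes.datetime.values)
# the 1.5 and -1.7 samples appear once (+1 min), the 2.5 and 2.2 samples twice (+1 min and +2 min)

In the DataPrep method the same duplication is applied to self.history in parallel, so the oversampled inputs and labels stay aligned.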
@@ -52,7 +52,7 @@ class TestBootstraps:
        boot_no_init.number_bootstraps = 50
        assert boot_no_init.valid_bootstrap_file(station, variables, 20) == (False, 60)

    def test_shuffle_single_variable(self, boot_no_init):
        data = np.array([[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]])
        res = boot_no_init.shuffle_single_variable(data, chunks=(2, 3)).compute()
        assert res.shape == data.shape
...