From 45199b6c8b1bc9198007cb7d699d686f8e07ff5e Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Thu, 17 Jun 2021 15:28:10 +0200 Subject: [PATCH 01/58] apply_oversampling calculates the desired oversampling_rates --- mlair/run_modules/pre_processing.py | 30 +++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/mlair/run_modules/pre_processing.py b/mlair/run_modules/pre_processing.py index 9d44ce0..6f3c1ce 100644 --- a/mlair/run_modules/pre_processing.py +++ b/mlair/run_modules/pre_processing.py @@ -8,6 +8,8 @@ import os import traceback from typing import Tuple import multiprocessing + +import numpy as np import requests import psutil @@ -65,9 +67,37 @@ class PreProcessing(RunEnvironment): raise ValueError("Couldn't find any valid data according to given parameters. Abort experiment run.") self.data_store.set("stations", valid_stations) self.split_train_val_test() + self.apply_oversampling() self.report_pre_processing() self.prepare_competitors() + def apply_oversampling(self): + #if Abfrage for oversampling=True/False + bins = 10 + rates_cap = 20 + data = self.data_store.get('data_collection', 'train') + histogram = np.array(bins) + #get min and max of the whole data + min = 0 + max = 0 + for station in data: + min = np.minimum(np.amin(station.get_Y(as_numpy=True)), min) + max = np.maximum(np.amax(station.get_Y(as_numpy=True)), max) + for station in data: + # erstelle Histogramm mit numpy für jede Station + hist, _ = np.histogram(station.get_Y(as_numpy=True), bins=bins, range=(min,max)) + #histograms.append(hist) + histogram = histogram + hist + # Addiere alle Histogramme zusammen + #histogram = histograms[0]+histograms[1]+histograms[2]+histograms[3] + #teile durch gesamtanzahl + histogram = 1/np.sum(histogram) * histogram + #mult mit 1/häufigste Klasse + histogram = 1/np.amax(histogram) * histogram + #Oversampling 1/Kl + oversampling_rates = 1 / histogram + oversampling_rates_capped = np.minimum(oversampling_rates, rates_cap) + def 
report_pre_processing(self): """Log some metrics on data and create latex report.""" logging.debug(20 * '##') -- GitLab From 21203700f3665cd6e5dd17a75622b745b1cb17f2 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Fri, 18 Jun 2021 18:49:09 +0200 Subject: [PATCH 02/58] apply_oversampling calculates the desired oversampling_rates --- package_licenses.md | 78 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 package_licenses.md diff --git a/package_licenses.md b/package_licenses.md new file mode 100644 index 0000000..007e63b --- /dev/null +++ b/package_licenses.md @@ -0,0 +1,78 @@ +Package | License | Link +---|---|--- +absl-py==0.11.0|Apache 2.0|(https://pypi.org/project/absl-py/) +appdirs==1.4.4|MIT|(https://pypi.org/project/appdirs/) +astor==0.8.1|BSD 3-Clause|(https://pypi.org/project/astor/) +astropy==4.1|BSD 3-Clause|(https://pypi.org/project/astropy/) +attrs==20.3.0|MIT|(https://pypi.org/project/attrs/) +bottleneck==1.3.2|BSD Simplified|(https://pypi.org/project/Bottleneck/) +cached-property==1.5.2|BSD|(https://pypi.org/project/cached-property/) +certifi==2020.12.5|MPL 2.0|(https://pypi.org/project/certifi/) +cftime==1.4.1|MIT|(https://pypi.org/project/cftime/) +chardet==4.0.0|LGPL|(https://pypi.org/project/chardet/) +coverage==5.4|Apache 2.0|(https://pypi.org/project/coverage/) +cycler==0.10.0|BSD|(https://pypi.org/project/Cycler/) +dask==2021.2.0|BSD|(https://pypi.org/project/dask/) +dill==0.3.3|BSD 3-Clause|(https://pypi.org/project/dill/) +fsspec==0.8.5|BSD|(https://pypi.org/project/fsspec/) +gast==0.4.0|BSD 3-Clause|(https://pypi.org/project/gast/) +grpcio==1.35.0|Apache 2.0|(https://pypi.org/project/grpcio/) +h5py==2.10.0|BSD|(https://pypi.org/project/h5py/) +idna==2.10|BSD|(https://pypi.org/project/idna/) +importlib-metadata==3.4.0|Apache|(https://pypi.org/project/importlib-metadata/) +iniconfig==1.1.1|MIT|(https://pypi.org/project/iniconfig/) +Keras==2.2.4|MIT|(https://pypi.org/project/keras/) 
+Keras-Applications==1.0.8|MIT|(https://pypi.org/project/Keras-Applications/) +Keras-Preprocessing==1.1.2|MIT|(https://pypi.org/project/Keras-Preprocessing/) +kiwisolver==1.3.1|BSD|(https://pypi.org/project/kiwisolver/) +locket==0.2.1|BSD 2-Clause|(https://pypi.org/project/locket/) +Markdown==3.3.3|BSD|(https://pypi.org/project/Markdown/) +matplotlib==3.3.4|PSF|(https://pypi.org/project/matplotlib/) +mock==4.0.3|BSD|(https://pypi.org/project/mock/) +netCDF4==1.5.5.1|MIT|(https://pypi.org/project/netCDF4/) +numpy==1.19.5|BSD|(https://pypi.org/project/numpy/) +ordered-set==4.0.2|MIT|(https://pypi.org/project/ordered-set/) +packaging==20.9|BSD 2-Clause or Apache 2.0|(https://pypi.org/project/packaging/) +pandas==1.1.5|BSD|(https://pypi.org/project/pandas/) +partd==1.1.0|BSD|(https://pypi.org/project/partd/) +patsy==0.5.1|BSD 2-Clause|(https://pypi.org/project/patsy/) +Pillow==8.1.0|HPND|(https://pypi.org/project/Pillow/) +pluggy==0.13.1|MIT|(https://pypi.org/project/pluggy/) +protobuf==3.15.0|BSD 3-Clause|(https://pypi.org/project/protobuf/) +psutil==5.8.0|BSD|(https://pypi.org/project/psutil/) +py==1.10.0|MIT|(https://pypi.org/project/py/) +pydot==1.4.2|MIT|(https://pypi.org/project/pydot/) +pyparsing==2.4.7|MIT|(https://pypi.org/project/pyparsing/) +pyshp==2.1.3|MIT|(https://pypi.org/project/pyshp/) +pytest==6.2.2|MIT|(https://pypi.org/project/pytest/) +pytest-cov==2.11.1|MIT|(https://pypi.org/project/pytest-cov/) +pytest-html==3.1.1|MPL 2.0|(https://pypi.org/project/pytest-html/) +pytest-lazy-fixture==0.6.3|MIT|(https://pypi.org/project/pytest-lazy-fixture/) +pytest-metadata==1.11.0|MPL 2.0|(https://pypi.org/project/pytest-metadata/) +pytest-sugar==0.9.4|BSD|(https://pypi.org/project/pytest-sugar/) +python-dateutil==2.8.1|Apache or BSD|(https://pypi.org/project/python-dateutil/) +pytz==2021.1|MIT|(https://pypi.org/project/pytz/) +PyYAML==5.4.1|MIT|(https://pypi.org/project/PyYAML/) +requests==2.25.1|Apache 2.0|(https://pypi.org/project/requests/) 
+scipy==1.5.4|BSD|(https://pypi.org/project/scipy/) +seaborn==0.11.1|BSD 3-Clause|(https://pypi.org/project/seaborn/) +six==1.15.0|MIT|(https://pypi.org/project/six/) +statsmodels==0.12.2|BSD|(https://pypi.org/project/statsmodels/) +tabulate==0.8.8|MIT|(https://pypi.org/project/tabulate/) +tensorboard==1.13.1|Apache 2.0|(https://pypi.org/project/tensorboard/) +tensorflow==1.13.1|Apache 2.0|(https://pypi.org/project/tensorflow/) +tensorflow-estimator==1.13.0|Apache 2.0|(https://pypi.org/project/tensorflow-estimator/) +termcolor==1.1.0|MIT|(https://pypi.org/project/termcolor/) +toml==0.10.2|MIT|(https://pypi.org/project/toml/) +toolz==0.11.1|BSD|(https://pypi.org/project/toolz/) +typing-extensions==3.7.4.3|PSF|(https://pypi.org/project/typing-extensions/) +urllib3==1.26.3|MIT|(https://pypi.org/project/urllib3/) +Werkzeug==1.0.1|BSD 3-Clause|(https://pypi.org/project/Werkzeug/) +wget==3.2|Public Domain|(https://pypi.org/project/wget/) +xarray==0.16.2|Apache|(https://pypi.org/project/xarray/) +zipp==3.4.0|MIT|(https://pypi.org/project/zipp/) +shapely==1.7.0|BSD|(https://pypi.org/project/Shapely/) +Cartopy==0.18.0|LGPLv3+|(https://pypi.org/project/Cartopy/) + +## Different Licenses +Apache, Apache 2.0, BSD, BSD Simplified, BSD 2-Clause, BSD 3-Clause, HPND, LGPLv3+, MIT, MPL 2.0, PSF, Public Domain -- GitLab From f54a7b8729c9281bb1259abf71ef4c97ea1aaacf Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Mon, 21 Jun 2021 11:39:13 +0200 Subject: [PATCH 03/58] Changes according to the threads, using histogram += hist and histogram /= np.amax(histogram) leads to error because of wrong shape --- mlair/run_modules/pre_processing.py | 31 +++++++++++++---------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/mlair/run_modules/pre_processing.py b/mlair/run_modules/pre_processing.py index 6f3c1ce..05bd61d 100644 --- a/mlair/run_modules/pre_processing.py +++ b/mlair/run_modules/pre_processing.py @@ -71,32 +71,29 @@ class PreProcessing(RunEnvironment): 
self.report_pre_processing() self.prepare_competitors() - def apply_oversampling(self): - #if Abfrage for oversampling=True/False - bins = 10 - rates_cap = 20 + def apply_oversampling(self, bins=10, rates_cap=20): + #if request for oversampling=True/False data = self.data_store.get('data_collection', 'train') histogram = np.array(bins) #get min and max of the whole data - min = 0 - max = 0 + total_min = 0 + total_max = 0 for station in data: - min = np.minimum(np.amin(station.get_Y(as_numpy=True)), min) - max = np.maximum(np.amax(station.get_Y(as_numpy=True)), max) + total_min = np.minimum(np.amin(station.get_Y(as_numpy=True)), total_min) + total_max = np.maximum(np.amax(station.get_Y(as_numpy=True)), total_max) for station in data: - # erstelle Histogramm mit numpy für jede Station - hist, _ = np.histogram(station.get_Y(as_numpy=True), bins=bins, range=(min,max)) - #histograms.append(hist) + # Create histogram for each station + hist, _ = np.histogram(station.get_Y(as_numpy=True), bins=bins, range=(total_min,total_max)) + # Add up histograms histogram = histogram + hist - # Addiere alle Histogramme zusammen - #histogram = histograms[0]+histograms[1]+histograms[2]+histograms[3] - #teile durch gesamtanzahl - histogram = 1/np.sum(histogram) * histogram - #mult mit 1/häufigste Klasse + # Scale down to most frequent class=1 histogram = 1/np.amax(histogram) * histogram - #Oversampling 1/Kl + # Get Oversampling rates (with and without cap) oversampling_rates = 1 / histogram oversampling_rates_capped = np.minimum(oversampling_rates, rates_cap) + # Add to datastore + self.data_store.set('oversampling_rates', oversampling_rates, 'training') + self.data_store.set('oversampling_rates_capped', oversampling_rates_capped, 'training') def report_pre_processing(self): """Log some metrics on data and create latex report.""" -- GitLab From faf3c2c609a177217b26a7e0ac7e80f747214901 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Mon, 21 Jun 2021 13:34:51 +0200 Subject: [PATCH 
04/58] Trying to make bins and rates_cap more flexible, inserting default values. --- mlair/configuration/defaults.py | 3 +++ mlair/run_modules/experiment_setup.py | 7 ++++++- mlair/run_modules/pre_processing.py | 4 +++- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/mlair/configuration/defaults.py b/mlair/configuration/defaults.py index 785aab8..2b817f5 100644 --- a/mlair/configuration/defaults.py +++ b/mlair/configuration/defaults.py @@ -56,6 +56,9 @@ DEFAULT_DATA_ORIGIN = {"cloudcover": "REA", "humidity": "REA", "pblheight": "REA DEFAULT_USE_MULTIPROCESSING = True DEFAULT_USE_MULTIPROCESSING_ON_DEBUG = False +DEFAULT_BINS = 10 +DEFAULT_RATES_CAP = 20 + def get_defaults(): """Return all default parameters set in defaults.py""" diff --git a/mlair/run_modules/experiment_setup.py b/mlair/run_modules/experiment_setup.py index 24fedaa..e28eb76 100644 --- a/mlair/run_modules/experiment_setup.py +++ b/mlair/run_modules/experiment_setup.py @@ -215,7 +215,8 @@ class ExperimentSetup(RunEnvironment): create_new_bootstraps=None, data_path: str = None, batch_path: str = None, login_nodes=None, hpc_hosts=None, model=None, batch_size=None, epochs=None, data_handler=None, data_origin: Dict = None, competitors: list = None, competitor_path: str = None, - use_multiprocessing: bool = None, use_multiprocessing_on_debug: bool = None, **kwargs): + use_multiprocessing: bool = None, use_multiprocessing_on_debug: bool = None, + bins=None, rates_cap=None, **kwargs): # create run framework super().__init__() @@ -360,6 +361,10 @@ class ExperimentSetup(RunEnvironment): # set model architecture class self._set_param("model_class", model, VanillaModel) + # set params for oversampling + self._set_param("bins", bins, default=DEFAULT_BINS) + self._set_param("rates_cap", rates_cap, default=DEFAULT_RATES_CAP) + # set remaining kwargs if len(kwargs) > 0: for k, v in kwargs.items(): diff --git a/mlair/run_modules/pre_processing.py b/mlair/run_modules/pre_processing.py index 
05bd61d..92cdac4 100644 --- a/mlair/run_modules/pre_processing.py +++ b/mlair/run_modules/pre_processing.py @@ -71,9 +71,11 @@ class PreProcessing(RunEnvironment): self.report_pre_processing() self.prepare_competitors() - def apply_oversampling(self, bins=10, rates_cap=20): + def apply_oversampling(self): #if request for oversampling=True/False data = self.data_store.get('data_collection', 'train') + bins = self.data_store.get_default('bins') + rates_cap = self.data_store.get_default('rates_cap') histogram = np.array(bins) #get min and max of the whole data total_min = 0 -- GitLab From 690fca578908175b2cd930e7645376a1642a9f49 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Mon, 21 Jun 2021 16:21:37 +0200 Subject: [PATCH 05/58] Trying to make bins and rates_cap more flexible, inserting default values. --- mlair/configuration/defaults.py | 5 ++-- mlair/data_handler/default_data_handler.py | 32 ++++++++++++++++++++++ mlair/run_modules/experiment_setup.py | 11 +++++--- mlair/run_modules/pre_processing.py | 14 ++++++---- 4 files changed, 50 insertions(+), 12 deletions(-) diff --git a/mlair/configuration/defaults.py b/mlair/configuration/defaults.py index 2b817f5..c6e6178 100644 --- a/mlair/configuration/defaults.py +++ b/mlair/configuration/defaults.py @@ -55,9 +55,8 @@ DEFAULT_DATA_ORIGIN = {"cloudcover": "REA", "humidity": "REA", "pblheight": "REA "pm10": "", "so2": ""} DEFAULT_USE_MULTIPROCESSING = True DEFAULT_USE_MULTIPROCESSING_ON_DEBUG = False - -DEFAULT_BINS = 10 -DEFAULT_RATES_CAP = 20 +DEFAULT_OVERSAMPLING_BINS = 10 +DEFAULT_OVERSAMPLING_RATES_CAP = 20 def get_defaults(): diff --git a/mlair/data_handler/default_data_handler.py b/mlair/data_handler/default_data_handler.py index 11461ad..0c6d2dd 100644 --- a/mlair/data_handler/default_data_handler.py +++ b/mlair/data_handler/default_data_handler.py @@ -166,6 +166,38 @@ class DefaultDataHandler(AbstractDataHandler): def apply_transformation(self, data, base="target", dim=0, inverse=False): return 
self.id_class.apply_transformation(data, dim=dim, base=base, inverse=inverse) + def apply_oversampling(self, bin_edges, oversampling_rates): + self._load() + if (self._X is None) or (self._Y is None): + logging.debug(f"{str(self.id_class)} has no data for X or Y, skip multiply extremes") + return + Y = self._Y + X = self._X + for i_bin in range(len(bin_edges)-1): + bin_start = bin_edges[i_bin] + if i_bin == len(bin_edges) - 1: + bin_end = bin_edges[i_bin+1]+1 + else: + bin_end = bin_edges[i_bin + 1] + rate = oversampling_rates[i_bin] + + # extract extremes based on occurrence in labels + other_dims = remove_items(list(Y.dims), self.time_dim) + extreme_idx = xr.concat([(Y >= bin_start).any(dim=other_dims[0]), + (Y < bin_end).any(dim=other_dims[0])], + dim=other_dims[0]).all(dim=other_dims[0]) + + extremes_X = list(map(lambda x: x.sel(**{self.time_dim: extreme_idx}), X)) + self._add_timedelta(extremes_X, dim, timedelta) + # extremes_X = list(map(lambda x: x.coords[dim].values + np.timedelta64(*timedelta), extremes_X)) + + extremes_Y = Y.sel(**{dim: extreme_idx}) + extremes_Y.coords[dim].values += np.timedelta64(*timedelta) + + self._Y_extreme = xr.concat([Y, extremes_Y], dim=dim) + self._X_extreme = list(map(lambda x1, x2: xr.concat([x1, x2], dim=dim), X, extremes_X)) + + def multiply_extremes(self, extreme_values: num_or_list = 1., extremes_on_right_tail_only: bool = False, timedelta: Tuple[int, str] = (1, 'm'), dim=DEFAULT_TIME_DIM): """ diff --git a/mlair/run_modules/experiment_setup.py b/mlair/run_modules/experiment_setup.py index e28eb76..b249491 100644 --- a/mlair/run_modules/experiment_setup.py +++ b/mlair/run_modules/experiment_setup.py @@ -19,7 +19,7 @@ from mlair.configuration.defaults import DEFAULT_STATIONS, DEFAULT_VAR_ALL_DICT, DEFAULT_VAL_MIN_LENGTH, DEFAULT_TEST_START, DEFAULT_TEST_END, DEFAULT_TEST_MIN_LENGTH, DEFAULT_TRAIN_VAL_MIN_LENGTH, \ DEFAULT_USE_ALL_STATIONS_ON_ALL_DATA_SETS, DEFAULT_EVALUATE_BOOTSTRAPS, DEFAULT_CREATE_NEW_BOOTSTRAPS, \ 
DEFAULT_NUMBER_OF_BOOTSTRAPS, DEFAULT_PLOT_LIST, DEFAULT_SAMPLING, DEFAULT_DATA_ORIGIN, DEFAULT_ITER_DIM, \ - DEFAULT_USE_MULTIPROCESSING, DEFAULT_USE_MULTIPROCESSING_ON_DEBUG + DEFAULT_USE_MULTIPROCESSING, DEFAULT_USE_MULTIPROCESSING_ON_DEBUG, DEFAULT_OVERSAMPLING_BINS, DEFAULT_OVERSAMPLING_RATES_CAP from mlair.data_handler import DefaultDataHandler from mlair.run_modules.run_environment import RunEnvironment from mlair.model_modules.fully_connected_networks import FCN_64_32_16 as VanillaModel @@ -183,6 +183,9 @@ class ExperimentSetup(RunEnvironment): :param use_multiprocessing: Enable parallel preprocessing (postprocessing not implemented yet) by setting this parameter to `True` (default). If set to `False` the computation is performed in an serial approach. Multiprocessing is disabled when running in debug mode and cannot be switched on. + :param oversampling_bins: Sets the number of classes in which the training data is split. The training samples are then + oversampled according to the frequency of the different classes. 
+ :param oversampling_rates_cap: Sets the maximum oversampling rate that is applied to a class """ @@ -216,7 +219,7 @@ class ExperimentSetup(RunEnvironment): hpc_hosts=None, model=None, batch_size=None, epochs=None, data_handler=None, data_origin: Dict = None, competitors: list = None, competitor_path: str = None, use_multiprocessing: bool = None, use_multiprocessing_on_debug: bool = None, - bins=None, rates_cap=None, **kwargs): + oversampling_bins=None, oversampling_rates_cap=None, **kwargs): # create run framework super().__init__() @@ -362,8 +365,8 @@ class ExperimentSetup(RunEnvironment): self._set_param("model_class", model, VanillaModel) # set params for oversampling - self._set_param("bins", bins, default=DEFAULT_BINS) - self._set_param("rates_cap", rates_cap, default=DEFAULT_RATES_CAP) + self._set_param("oversampling_bins", oversampling_bins, default=DEFAULT_OVERSAMPLING_BINS) + self._set_param("oversampling_rates_cap", oversampling_rates_cap, default=DEFAULT_OVERSAMPLING_RATES_CAP) # set remaining kwargs if len(kwargs) > 0: diff --git a/mlair/run_modules/pre_processing.py b/mlair/run_modules/pre_processing.py index 92cdac4..4e41e84 100644 --- a/mlair/run_modules/pre_processing.py +++ b/mlair/run_modules/pre_processing.py @@ -74,8 +74,8 @@ class PreProcessing(RunEnvironment): def apply_oversampling(self): #if request for oversampling=True/False data = self.data_store.get('data_collection', 'train') - bins = self.data_store.get_default('bins') - rates_cap = self.data_store.get_default('rates_cap') + bins = self.data_store.get('oversampling_bins') + rates_cap = self.data_store.get('oversampling_rates_cap') histogram = np.array(bins) #get min and max of the whole data total_min = 0 @@ -83,9 +83,10 @@ class PreProcessing(RunEnvironment): for station in data: total_min = np.minimum(np.amin(station.get_Y(as_numpy=True)), total_min) total_max = np.maximum(np.amax(station.get_Y(as_numpy=True)), total_max) + bin_edges = [] for station in data: # Create histogram for 
each station - hist, _ = np.histogram(station.get_Y(as_numpy=True), bins=bins, range=(total_min,total_max)) + hist, bin_edges = np.histogram(station.get_Y(as_numpy=True), bins=bins, range=(total_min,total_max)) # Add up histograms histogram = histogram + hist # Scale down to most frequent class=1 @@ -94,8 +95,11 @@ class PreProcessing(RunEnvironment): oversampling_rates = 1 / histogram oversampling_rates_capped = np.minimum(oversampling_rates, rates_cap) # Add to datastore - self.data_store.set('oversampling_rates', oversampling_rates, 'training') - self.data_store.set('oversampling_rates_capped', oversampling_rates_capped, 'training') + self.data_store.set('oversampling_rates', oversampling_rates, 'train') + self.data_store.set('oversampling_rates_capped', oversampling_rates_capped, 'train') + self.data_store.set('oversampling_bin_edges', bin_edges) + for station in data: + station.apply_oversampling(bin_edges, oversampling_rates_capped) def report_pre_processing(self): """Log some metrics on data and create latex report.""" -- GitLab From 1ebf5239ac01c63295bad58fdaec150dd9d39b09 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Tue, 22 Jun 2021 11:08:27 +0200 Subject: [PATCH 06/58] Todo: l.209 --- mlair/data_handler/default_data_handler.py | 37 +++++++++++++++------- 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/mlair/data_handler/default_data_handler.py b/mlair/data_handler/default_data_handler.py index 0c6d2dd..d0ee9d0 100644 --- a/mlair/data_handler/default_data_handler.py +++ b/mlair/data_handler/default_data_handler.py @@ -166,13 +166,15 @@ class DefaultDataHandler(AbstractDataHandler): def apply_transformation(self, data, base="target", dim=0, inverse=False): return self.id_class.apply_transformation(data, dim=dim, base=base, inverse=inverse) - def apply_oversampling(self, bin_edges, oversampling_rates): + def apply_oversampling(self, bin_edges, oversampling_rates, timedelta: Tuple[int, str] = (1, 's'), timedelta2: Tuple[int, str] = (1, 
'ms')): self._load() if (self._X is None) or (self._Y is None): logging.debug(f"{str(self.id_class)} has no data for X or Y, skip multiply extremes") return Y = self._Y X = self._X + complete_extremes_X_list = [] + complete_extremes_Y_list = [] for i_bin in range(len(bin_edges)-1): bin_start = bin_edges[i_bin] if i_bin == len(bin_edges) - 1: @@ -186,16 +188,27 @@ class DefaultDataHandler(AbstractDataHandler): extreme_idx = xr.concat([(Y >= bin_start).any(dim=other_dims[0]), (Y < bin_end).any(dim=other_dims[0])], dim=other_dims[0]).all(dim=other_dims[0]) - - extremes_X = list(map(lambda x: x.sel(**{self.time_dim: extreme_idx}), X)) - self._add_timedelta(extremes_X, dim, timedelta) - # extremes_X = list(map(lambda x: x.coords[dim].values + np.timedelta64(*timedelta), extremes_X)) - - extremes_Y = Y.sel(**{dim: extreme_idx}) - extremes_Y.coords[dim].values += np.timedelta64(*timedelta) - - self._Y_extreme = xr.concat([Y, extremes_Y], dim=dim) - self._X_extreme = list(map(lambda x1, x2: xr.concat([x1, x2], dim=dim), X, extremes_X)) + extremes_X_list =[] + extremes_Y_list = [] + for i in range(np.ceil(rate).astype(int)): + sel = extreme_idx.coords[self.time_dim].values + if rate-i < 1: + rest = int(len(sel)*(rate-i)) + sel = np.random.choice(sel, rest, replace=False) + extremes_X = list(map(lambda x: x.sel(**{self.time_dim: sel}), X)) + self._add_timedelta(extremes_X, self.time_dim, (i,timedelta[1])) + self._add_timedelta(extremes_X, self.time_dim, (i_bin, timedelta2[1])) + extremes_Y = Y.sel(**{self.time_dim: sel}) + extremes_Y.coords[self.time_dim] = extremes_Y.coords[self.time_dim].values + i*np.timedelta64(*timedelta) + i_bin*np.timedelta64(*timedelta2) + extremes_X_list.append(extremes_X) + extremes_Y_list.append(extremes_Y) + + complete_extremes_X_list.append(extremes_X_list) + complete_extremes_Y_list.append(extremes_Y_list) + + #Convert complete_extremes_X_list (list of lists of xarrays) into xarray and give it to self._X_extreme + #self._X_extreme = 
[[xr.concat(X_list, dim=self.time_dim) for X_list in complete_X_list] for complete_X_list in complete_extremes_X_list] + #self._Y_extreme = [[xr.concat(Y_list, dim=self.time_dim) for Y_list in complete_Y_list] for complete_Y_list in complete_extremes_Y_list] def multiply_extremes(self, extreme_values: num_or_list = 1., extremes_on_right_tail_only: bool = False, @@ -264,7 +277,7 @@ class DefaultDataHandler(AbstractDataHandler): @staticmethod def _add_timedelta(data, dim, timedelta): for d in data: - d.coords[dim].values += np.timedelta64(*timedelta) + d.coords[dim] = d.coords[dim].values + np.timedelta64(*timedelta) @classmethod def transformation(cls, set_stations, **kwargs): -- GitLab From ad6f38147416d853fabb90ca850c9eded343aff0 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Tue, 22 Jun 2021 12:05:49 +0200 Subject: [PATCH 07/58] Changed list_based storage to directly concatenating with self._X_extreme --- mlair/data_handler/default_data_handler.py | 28 ++++++++++++++-------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/mlair/data_handler/default_data_handler.py b/mlair/data_handler/default_data_handler.py index d0ee9d0..9be2f87 100644 --- a/mlair/data_handler/default_data_handler.py +++ b/mlair/data_handler/default_data_handler.py @@ -168,13 +168,15 @@ class DefaultDataHandler(AbstractDataHandler): def apply_oversampling(self, bin_edges, oversampling_rates, timedelta: Tuple[int, str] = (1, 's'), timedelta2: Tuple[int, str] = (1, 'ms')): self._load() + self._X_extreme = None + self._X_extreme = None if (self._X is None) or (self._Y is None): logging.debug(f"{str(self.id_class)} has no data for X or Y, skip multiply extremes") return Y = self._Y X = self._X - complete_extremes_X_list = [] - complete_extremes_Y_list = [] + #complete_extremes_X_list = [] + #complete_extremes_Y_list = [] for i_bin in range(len(bin_edges)-1): bin_start = bin_edges[i_bin] if i_bin == len(bin_edges) - 1: @@ -188,8 +190,8 @@ class 
DefaultDataHandler(AbstractDataHandler): extreme_idx = xr.concat([(Y >= bin_start).any(dim=other_dims[0]), (Y < bin_end).any(dim=other_dims[0])], dim=other_dims[0]).all(dim=other_dims[0]) - extremes_X_list =[] - extremes_Y_list = [] + #extremes_X_list =[] + #extremes_Y_list = [] for i in range(np.ceil(rate).astype(int)): sel = extreme_idx.coords[self.time_dim].values if rate-i < 1: @@ -200,12 +202,18 @@ class DefaultDataHandler(AbstractDataHandler): self._add_timedelta(extremes_X, self.time_dim, (i_bin, timedelta2[1])) extremes_Y = Y.sel(**{self.time_dim: sel}) extremes_Y.coords[self.time_dim] = extremes_Y.coords[self.time_dim].values + i*np.timedelta64(*timedelta) + i_bin*np.timedelta64(*timedelta2) - extremes_X_list.append(extremes_X) - extremes_Y_list.append(extremes_Y) - - complete_extremes_X_list.append(extremes_X_list) - complete_extremes_Y_list.append(extremes_Y_list) - + if (self._X_extreme is None) or (self._Y_extreme is None): + self._X_extreme = extremes_X + self._Y_extreme = extremes_Y + else: + self._X_extreme = list(map(lambda x1, x2: xr.concat([x1, x2], dim=self.time_dim), self._X_extreme, extremes_X)) + self._Y_extreme = xr.concat([self._Y_extreme, extremes_Y], dim=self.time_dim) + #extremes_X_list.append(extremes_X) + #extremes_Y_list.append(extremes_Y) + + #complete_extremes_X_list.append(extremes_X_list) + #complete_extremes_Y_list.append(extremes_Y_list) + test = 0 #Convert complete_extremes_X_list (list of lists of xarrays) into xarray and give it to self._X_extreme #self._X_extreme = [[xr.concat(X_list, dim=self.time_dim) for X_list in complete_X_list] for complete_X_list in complete_extremes_X_list] #self._Y_extreme = [[xr.concat(Y_list, dim=self.time_dim) for Y_list in complete_Y_list] for complete_Y_list in complete_extremes_Y_list] -- GitLab From 6d070b5a98ed9260b5c5436861524af9a19310d1 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Tue, 22 Jun 2021 12:06:56 +0200 Subject: [PATCH 08/58] Changed list_based storage to directly 
concatenating with self._X_extreme --- mlair/data_handler/default_data_handler.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mlair/data_handler/default_data_handler.py b/mlair/data_handler/default_data_handler.py index 9be2f87..4ae5dc0 100644 --- a/mlair/data_handler/default_data_handler.py +++ b/mlair/data_handler/default_data_handler.py @@ -213,7 +213,6 @@ class DefaultDataHandler(AbstractDataHandler): #complete_extremes_X_list.append(extremes_X_list) #complete_extremes_Y_list.append(extremes_Y_list) - test = 0 #Convert complete_extremes_X_list (list of lists of xarrays) into xarray and give it to self._X_extreme #self._X_extreme = [[xr.concat(X_list, dim=self.time_dim) for X_list in complete_X_list] for complete_X_list in complete_extremes_X_list] #self._Y_extreme = [[xr.concat(Y_list, dim=self.time_dim) for Y_list in complete_Y_list] for complete_Y_list in complete_extremes_Y_list] -- GitLab From 02b687631544785dc72ec7e3d923a038fd5d4b17 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Wed, 23 Jun 2021 12:23:14 +0200 Subject: [PATCH 09/58] Fixed the error and solved some threads --- mlair/data_handler/default_data_handler.py | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/mlair/data_handler/default_data_handler.py b/mlair/data_handler/default_data_handler.py index 4ae5dc0..7db868e 100644 --- a/mlair/data_handler/default_data_handler.py +++ b/mlair/data_handler/default_data_handler.py @@ -169,14 +169,12 @@ class DefaultDataHandler(AbstractDataHandler): def apply_oversampling(self, bin_edges, oversampling_rates, timedelta: Tuple[int, str] = (1, 's'), timedelta2: Tuple[int, str] = (1, 'ms')): self._load() self._X_extreme = None - self._X_extreme = None + self._Y_extreme = None if (self._X is None) or (self._Y is None): logging.debug(f"{str(self.id_class)} has no data for X or Y, skip multiply extremes") return Y = self._Y X = self._X - #complete_extremes_X_list = [] - #complete_extremes_Y_list = [] for i_bin in 
range(len(bin_edges)-1): bin_start = bin_edges[i_bin] if i_bin == len(bin_edges) - 1: @@ -190,10 +188,9 @@ class DefaultDataHandler(AbstractDataHandler): extreme_idx = xr.concat([(Y >= bin_start).any(dim=other_dims[0]), (Y < bin_end).any(dim=other_dims[0])], dim=other_dims[0]).all(dim=other_dims[0]) - #extremes_X_list =[] - #extremes_Y_list = [] + extreme_idx = extreme_idx[extreme_idx] + sel = extreme_idx.coords[self.time_dim].values for i in range(np.ceil(rate).astype(int)): - sel = extreme_idx.coords[self.time_dim].values if rate-i < 1: rest = int(len(sel)*(rate-i)) sel = np.random.choice(sel, rest, replace=False) @@ -208,15 +205,7 @@ class DefaultDataHandler(AbstractDataHandler): else: self._X_extreme = list(map(lambda x1, x2: xr.concat([x1, x2], dim=self.time_dim), self._X_extreme, extremes_X)) self._Y_extreme = xr.concat([self._Y_extreme, extremes_Y], dim=self.time_dim) - #extremes_X_list.append(extremes_X) - #extremes_Y_list.append(extremes_Y) - - #complete_extremes_X_list.append(extremes_X_list) - #complete_extremes_Y_list.append(extremes_Y_list) - #Convert complete_extremes_X_list (list of lists of xarrays) into xarray and give it to self._X_extreme - #self._X_extreme = [[xr.concat(X_list, dim=self.time_dim) for X_list in complete_X_list] for complete_X_list in complete_extremes_X_list] - #self._Y_extreme = [[xr.concat(Y_list, dim=self.time_dim) for Y_list in complete_Y_list] for complete_Y_list in complete_extremes_Y_list] - + self._store(fresh_store=True) def multiply_extremes(self, extreme_values: num_or_list = 1., extremes_on_right_tail_only: bool = False, timedelta: Tuple[int, str] = (1, 'm'), dim=DEFAULT_TIME_DIM): -- GitLab From e81da4988f9c3cc5d234145ada05d4c57f50ad9f Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Mon, 28 Jun 2021 10:12:24 +0200 Subject: [PATCH 10/58] Final fixes --- mlair/configuration/defaults.py | 2 +- mlair/data_handler/default_data_handler.py | 33 +++++++++++----------- mlair/run_modules/pre_processing.py | 33 
++++++++++++++++++++++ 3 files changed, 51 insertions(+), 17 deletions(-) diff --git a/mlair/configuration/defaults.py b/mlair/configuration/defaults.py index c6e6178..f2538e9 100644 --- a/mlair/configuration/defaults.py +++ b/mlair/configuration/defaults.py @@ -56,7 +56,7 @@ DEFAULT_DATA_ORIGIN = {"cloudcover": "REA", "humidity": "REA", "pblheight": "REA DEFAULT_USE_MULTIPROCESSING = True DEFAULT_USE_MULTIPROCESSING_ON_DEBUG = False DEFAULT_OVERSAMPLING_BINS = 10 -DEFAULT_OVERSAMPLING_RATES_CAP = 20 +DEFAULT_OVERSAMPLING_RATES_CAP = 100 def get_defaults(): diff --git a/mlair/data_handler/default_data_handler.py b/mlair/data_handler/default_data_handler.py index 7db868e..784e194 100644 --- a/mlair/data_handler/default_data_handler.py +++ b/mlair/data_handler/default_data_handler.py @@ -190,22 +190,23 @@ class DefaultDataHandler(AbstractDataHandler): dim=other_dims[0]).all(dim=other_dims[0]) extreme_idx = extreme_idx[extreme_idx] sel = extreme_idx.coords[self.time_dim].values - for i in range(np.ceil(rate).astype(int)): - if rate-i < 1: - rest = int(len(sel)*(rate-i)) - sel = np.random.choice(sel, rest, replace=False) - extremes_X = list(map(lambda x: x.sel(**{self.time_dim: sel}), X)) - self._add_timedelta(extremes_X, self.time_dim, (i,timedelta[1])) - self._add_timedelta(extremes_X, self.time_dim, (i_bin, timedelta2[1])) - extremes_Y = Y.sel(**{self.time_dim: sel}) - extremes_Y.coords[self.time_dim] = extremes_Y.coords[self.time_dim].values + i*np.timedelta64(*timedelta) + i_bin*np.timedelta64(*timedelta2) - if (self._X_extreme is None) or (self._Y_extreme is None): - self._X_extreme = extremes_X - self._Y_extreme = extremes_Y - else: - self._X_extreme = list(map(lambda x1, x2: xr.concat([x1, x2], dim=self.time_dim), self._X_extreme, extremes_X)) - self._Y_extreme = xr.concat([self._Y_extreme, extremes_Y], dim=self.time_dim) - self._store(fresh_store=True) + if len(extreme_idx)>0: + for i in range(np.ceil(rate).astype(int)): + if rate-i < 1: + rest = 
int(len(sel)*(rate-i))+1 + sel = np.random.choice(sel, rest, replace=False) + extremes_X = list(map(lambda x: x.sel(**{self.time_dim: sel}), X)) + self._add_timedelta(extremes_X, self.time_dim, (i, timedelta[1])) + self._add_timedelta(extremes_X, self.time_dim, (i_bin, timedelta2[1])) + extremes_Y = Y.sel(**{self.time_dim: sel}) + extremes_Y.coords[self.time_dim] = extremes_Y.coords[self.time_dim].values + i*np.timedelta64(*timedelta) + i_bin*np.timedelta64(*timedelta2) + if (self._X_extreme is None) or (self._Y_extreme is None): + self._X_extreme = extremes_X + self._Y_extreme = extremes_Y + else: + self._X_extreme = list(map(lambda x1, x2: xr.concat([x1, x2], dim=self.time_dim), self._X_extreme, extremes_X)) + self._Y_extreme = xr.concat([self._Y_extreme, extremes_Y], dim=self.time_dim) + #self._store(fresh_store=True) def multiply_extremes(self, extreme_values: num_or_list = 1., extremes_on_right_tail_only: bool = False, timedelta: Tuple[int, str] = (1, 'm'), dim=DEFAULT_TIME_DIM): diff --git a/mlair/run_modules/pre_processing.py b/mlair/run_modules/pre_processing.py index 4e41e84..9ef5c3f 100644 --- a/mlair/run_modules/pre_processing.py +++ b/mlair/run_modules/pre_processing.py @@ -14,6 +14,8 @@ import requests import psutil import pandas as pd +import xarray as xr +from matplotlib import pyplot as plt from mlair.data_handler import DataCollection, AbstractDataHandler from mlair.helpers import TimeTracking, to_list, tables @@ -98,8 +100,39 @@ class PreProcessing(RunEnvironment): self.data_store.set('oversampling_rates', oversampling_rates, 'train') self.data_store.set('oversampling_rates_capped', oversampling_rates_capped, 'train') self.data_store.set('oversampling_bin_edges', bin_edges) + Y = None + Y_extreme = None for station in data: station.apply_oversampling(bin_edges, oversampling_rates_capped) + if Y is None: + Y = station._Y + Y_extreme = station._Y_extreme + else: + Y = xr.concat([Y, station._Y], dim="Stations") + Y_extreme = xr.concat([Y_extreme, 
station._Y_extreme], dim="Stations") + + fig, ax = plt.subplots(nrows=2, ncols=2) + fig.suptitle(f"Window Size=1, Bins={bins}, rates_cap={rates_cap}") + Y_hist = Y.plot.hist(bins=bin_edges, histtype="step", label="Before", ax=ax[0,0])[0] + Y_extreme_hist = Y_extreme.plot.hist(bins=bin_edges, histtype="step", label="After", ax=ax[0,0])[0] + ax[0,0].set_title(f"Histogram before-after oversampling") + ax[0,0].legend() + Y_hist_dens = Y.plot.hist(bins=bin_edges, density=True, histtype="step", label="Before", ax=ax[0,1])[0] + Y_extreme_hist_dens = Y_extreme.plot.hist(bins=bin_edges, density=True, histtype="step", label="After", ax=ax[0,1])[0] + ax[0,1].set_title(f"Density-Histogram before-after oversampling") + ax[0,1].legend() + real_oversampling = Y_extreme_hist/Y_hist + ax[1,0].plot(range(len(real_oversampling)), oversampling_rates_capped, label="Desired oversampling_rates") + ax[1,0].plot(range(len(real_oversampling)), real_oversampling, label="Actual Oversampling Rates") + ax[1,0].set_title(f"Oversampling rates") + ax[1,0].legend() + ax[1,1].plot(range(len(real_oversampling)), real_oversampling / oversampling_rates_capped, + label="Actual/Desired Rate") + ax[1,1].set_title(f"Deviation from desired Oversampling rate") + ax[1,1].legend() + plt.show() + #data[1]._Y.where(data[1]._Y > bin_edges[9], drop=True) + #data[1]._Y_extreme.where(data[1]._Y_extreme > bin_edges[9], drop=True) def report_pre_processing(self): """Log some metrics on data and create latex report.""" -- GitLab From a01c49118276aa9db5a9e1b9e0a5a01ed9083a45 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Mon, 28 Jun 2021 10:53:14 +0200 Subject: [PATCH 11/58] Fixed bug with window_size=1 --- mlair/data_handler/default_data_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlair/data_handler/default_data_handler.py b/mlair/data_handler/default_data_handler.py index 784e194..fc5a4d9 100644 --- a/mlair/data_handler/default_data_handler.py +++ 
b/mlair/data_handler/default_data_handler.py @@ -177,7 +177,7 @@ class DefaultDataHandler(AbstractDataHandler): X = self._X for i_bin in range(len(bin_edges)-1): bin_start = bin_edges[i_bin] - if i_bin == len(bin_edges) - 1: + if i_bin == len(bin_edges) - 2: bin_end = bin_edges[i_bin+1]+1 else: bin_end = bin_edges[i_bin + 1] -- GitLab From 8233f14e45331b946a945ae9b59a710d44012213 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Mon, 28 Jun 2021 12:14:38 +0200 Subject: [PATCH 12/58] Merged with IntelliO3_ts_architecture, made oversampling_method as parameter to use bin_oversampling. Run Scripts for IntelliO3_architecture with and without oversampling. --- mlair/configuration/defaults.py | 1 + mlair/run_modules/experiment_setup.py | 6 ++-- mlair/run_modules/pre_processing.py | 7 +++-- run_with_oversampling.py | 43 +++++++++++++++++++++++++++ run_without_oversampling.py | 43 +++++++++++++++++++++++++++ 5 files changed, 96 insertions(+), 4 deletions(-) create mode 100644 run_with_oversampling.py create mode 100644 run_without_oversampling.py diff --git a/mlair/configuration/defaults.py b/mlair/configuration/defaults.py index f2538e9..7b7584a 100644 --- a/mlair/configuration/defaults.py +++ b/mlair/configuration/defaults.py @@ -57,6 +57,7 @@ DEFAULT_USE_MULTIPROCESSING = True DEFAULT_USE_MULTIPROCESSING_ON_DEBUG = False DEFAULT_OVERSAMPLING_BINS = 10 DEFAULT_OVERSAMPLING_RATES_CAP = 100 +DEFAULT_OVERSAMPLING_METHOD = None def get_defaults(): diff --git a/mlair/run_modules/experiment_setup.py b/mlair/run_modules/experiment_setup.py index b249491..edf1cdf 100644 --- a/mlair/run_modules/experiment_setup.py +++ b/mlair/run_modules/experiment_setup.py @@ -19,7 +19,8 @@ from mlair.configuration.defaults import DEFAULT_STATIONS, DEFAULT_VAR_ALL_DICT, DEFAULT_VAL_MIN_LENGTH, DEFAULT_TEST_START, DEFAULT_TEST_END, DEFAULT_TEST_MIN_LENGTH, DEFAULT_TRAIN_VAL_MIN_LENGTH, \ DEFAULT_USE_ALL_STATIONS_ON_ALL_DATA_SETS, DEFAULT_EVALUATE_BOOTSTRAPS, DEFAULT_CREATE_NEW_BOOTSTRAPS, \ 
DEFAULT_NUMBER_OF_BOOTSTRAPS, DEFAULT_PLOT_LIST, DEFAULT_SAMPLING, DEFAULT_DATA_ORIGIN, DEFAULT_ITER_DIM, \ - DEFAULT_USE_MULTIPROCESSING, DEFAULT_USE_MULTIPROCESSING_ON_DEBUG, DEFAULT_OVERSAMPLING_BINS, DEFAULT_OVERSAMPLING_RATES_CAP + DEFAULT_USE_MULTIPROCESSING, DEFAULT_USE_MULTIPROCESSING_ON_DEBUG, DEFAULT_OVERSAMPLING_BINS, \ + DEFAULT_OVERSAMPLING_RATES_CAP, DEFAULT_OVERSAMPLING_METHOD from mlair.data_handler import DefaultDataHandler from mlair.run_modules.run_environment import RunEnvironment from mlair.model_modules.fully_connected_networks import FCN_64_32_16 as VanillaModel @@ -219,7 +220,7 @@ class ExperimentSetup(RunEnvironment): hpc_hosts=None, model=None, batch_size=None, epochs=None, data_handler=None, data_origin: Dict = None, competitors: list = None, competitor_path: str = None, use_multiprocessing: bool = None, use_multiprocessing_on_debug: bool = None, - oversampling_bins=None, oversampling_rates_cap=None, **kwargs): + oversampling_bins=None, oversampling_rates_cap=None, oversampling_method = None, **kwargs): # create run framework super().__init__() @@ -367,6 +368,7 @@ class ExperimentSetup(RunEnvironment): # set params for oversampling self._set_param("oversampling_bins", oversampling_bins, default=DEFAULT_OVERSAMPLING_BINS) self._set_param("oversampling_rates_cap", oversampling_rates_cap, default=DEFAULT_OVERSAMPLING_RATES_CAP) + self._set_param("oversampling_method", oversampling_method, default=DEFAULT_OVERSAMPLING_METHOD) # set remaining kwargs if len(kwargs) > 0: diff --git a/mlair/run_modules/pre_processing.py b/mlair/run_modules/pre_processing.py index 9ef5c3f..e265bd2 100644 --- a/mlair/run_modules/pre_processing.py +++ b/mlair/run_modules/pre_processing.py @@ -69,12 +69,15 @@ class PreProcessing(RunEnvironment): raise ValueError("Couldn't find any valid data according to given parameters. 
Abort experiment run.") self.data_store.set("stations", valid_stations) self.split_train_val_test() - self.apply_oversampling() + if self.data_store.get('oversampling_method')=='bin_oversampling': + logging.debug("Apply Oversampling") + self.apply_oversampling() + else: + logging.debug("No Oversampling") self.report_pre_processing() self.prepare_competitors() def apply_oversampling(self): - #if request for oversampling=True/False data = self.data_store.get('data_collection', 'train') bins = self.data_store.get('oversampling_bins') rates_cap = self.data_store.get('oversampling_rates_cap') diff --git a/run_with_oversampling.py b/run_with_oversampling.py new file mode 100644 index 0000000..cbab9b4 --- /dev/null +++ b/run_with_oversampling.py @@ -0,0 +1,43 @@ +__author__ = "Lukas Leufen" +__date__ = '2020-06-29' + +import argparse +from mlair.workflows import DefaultWorkflow +from mlair.helpers import remove_items +from mlair.configuration.defaults import DEFAULT_PLOT_LIST +from mlair.model_modules.model_class import IntelliO3_ts_architecture +import os + + +def load_stations(): + import json + try: + filename = 'supplement/station_list_north_german_plain_rural.json' + with open(filename, 'r') as jfile: + stations = json.load(jfile) + except FileNotFoundError: + stations = None + return stations + + +def main(parser_args): + plots = remove_items(DEFAULT_PLOT_LIST, "PlotConditionalQuantiles") + workflow = DefaultWorkflow( # stations=load_stations(), + # stations=["DEBW087","DEBW013", "DEBW107", "DEBW076"], + stations=["DEBW013", "DEBW087", "DEBW107", "DEBW076"], + train_model=False, create_new_model=True, network="UBA", + model=IntelliO3_ts_architecture, oversampling_method="bin_oversampling", + evaluate_bootstraps=False, # plot_list=["PlotCompetitiveSkillScore"], + competitors=["test_model", "test_model2"], + competitor_path=os.path.join(os.getcwd(), "data", "comp_test"), + window_lead_time=1, oversampling_bins=10, oversampling_rates_cap=100, + **parser_args.__dict__) 
+ workflow.run() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--experiment_date', metavar='--exp_date', type=str, default="testrun", + help="set experiment date as string") + args = parser.parse_args() + main(args) diff --git a/run_without_oversampling.py b/run_without_oversampling.py new file mode 100644 index 0000000..3c69b45 --- /dev/null +++ b/run_without_oversampling.py @@ -0,0 +1,43 @@ +__author__ = "Lukas Leufen" +__date__ = '2020-06-29' + +import argparse +from mlair.workflows import DefaultWorkflow +from mlair.helpers import remove_items +from mlair.configuration.defaults import DEFAULT_PLOT_LIST +from mlair.model_modules.model_class import IntelliO3_ts_architecture +import os + + +def load_stations(): + import json + try: + filename = 'supplement/station_list_north_german_plain_rural.json' + with open(filename, 'r') as jfile: + stations = json.load(jfile) + except FileNotFoundError: + stations = None + return stations + + +def main(parser_args): + plots = remove_items(DEFAULT_PLOT_LIST, "PlotConditionalQuantiles") + workflow = DefaultWorkflow( # stations=load_stations(), + # stations=["DEBW087","DEBW013", "DEBW107", "DEBW076"], + stations=["DEBW013", "DEBW087", "DEBW107", "DEBW076"], + train_model=False, create_new_model=True, network="UBA", + model=IntelliO3_ts_architecture, + evaluate_bootstraps=False, # plot_list=["PlotCompetitiveSkillScore"], + competitors=["test_model", "test_model2"], + competitor_path=os.path.join(os.getcwd(), "data", "comp_test"), + window_lead_time=1, oversampling_bins=10, oversampling_rates_cap=100, + **parser_args.__dict__) + workflow.run() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--experiment_date', metavar='--exp_date', type=str, default="testrun", + help="set experiment date as string") + args = parser.parse_args() + main(args) -- GitLab From 50e0dd7491817ce5736af7ca401877d045a11c59 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" 
Date: Tue, 29 Jun 2021 14:09:26 +0200 Subject: [PATCH 13/58] Set-up to run with and without oversampling on hdfml --- mlair/run_modules/pre_processing.py | 45 +++++++++++++++-------------- run_with_oversampling.py | 15 ++++++---- run_without_oversampling.py | 15 ++++++---- 3 files changed, 41 insertions(+), 34 deletions(-) diff --git a/mlair/run_modules/pre_processing.py b/mlair/run_modules/pre_processing.py index e265bd2..215c0bb 100644 --- a/mlair/run_modules/pre_processing.py +++ b/mlair/run_modules/pre_processing.py @@ -114,28 +114,29 @@ class PreProcessing(RunEnvironment): Y = xr.concat([Y, station._Y], dim="Stations") Y_extreme = xr.concat([Y_extreme, station._Y_extreme], dim="Stations") - fig, ax = plt.subplots(nrows=2, ncols=2) - fig.suptitle(f"Window Size=1, Bins={bins}, rates_cap={rates_cap}") - Y_hist = Y.plot.hist(bins=bin_edges, histtype="step", label="Before", ax=ax[0,0])[0] - Y_extreme_hist = Y_extreme.plot.hist(bins=bin_edges, histtype="step", label="After", ax=ax[0,0])[0] - ax[0,0].set_title(f"Histogram before-after oversampling") - ax[0,0].legend() - Y_hist_dens = Y.plot.hist(bins=bin_edges, density=True, histtype="step", label="Before", ax=ax[0,1])[0] - Y_extreme_hist_dens = Y_extreme.plot.hist(bins=bin_edges, density=True, histtype="step", label="After", ax=ax[0,1])[0] - ax[0,1].set_title(f"Density-Histogram before-after oversampling") - ax[0,1].legend() - real_oversampling = Y_extreme_hist/Y_hist - ax[1,0].plot(range(len(real_oversampling)), oversampling_rates_capped, label="Desired oversampling_rates") - ax[1,0].plot(range(len(real_oversampling)), real_oversampling, label="Actual Oversampling Rates") - ax[1,0].set_title(f"Oversampling rates") - ax[1,0].legend() - ax[1,1].plot(range(len(real_oversampling)), real_oversampling / oversampling_rates_capped, - label="Actual/Desired Rate") - ax[1,1].set_title(f"Deviation from desired Oversampling rate") - ax[1,1].legend() - plt.show() - #data[1]._Y.where(data[1]._Y > bin_edges[9], drop=True) - 
#data[1]._Y_extreme.where(data[1]._Y_extreme > bin_edges[9], drop=True) + ''' + if not on HPC: + fig, ax = plt.subplots(nrows=2, ncols=2) + fig.suptitle(f"Window Size=1, Bins={bins}, rates_cap={rates_cap}") + Y_hist = Y.plot.hist(bins=bin_edges, histtype="step", label="Before", ax=ax[0,0])[0] + Y_extreme_hist = Y_extreme.plot.hist(bins=bin_edges, histtype="step", label="After", ax=ax[0,0])[0] + ax[0,0].set_title(f"Histogram before-after oversampling") + ax[0,0].legend() + Y_hist_dens = Y.plot.hist(bins=bin_edges, density=True, histtype="step", label="Before", ax=ax[0,1])[0] + Y_extreme_hist_dens = Y_extreme.plot.hist(bins=bin_edges, density=True, histtype="step", label="After", ax=ax[0,1])[0] + ax[0,1].set_title(f"Density-Histogram before-after oversampling") + ax[0,1].legend() + real_oversampling = Y_extreme_hist/Y_hist + ax[1,0].plot(range(len(real_oversampling)), oversampling_rates_capped, label="Desired oversampling_rates") + ax[1,0].plot(range(len(real_oversampling)), real_oversampling, label="Actual Oversampling Rates") + ax[1,0].set_title(f"Oversampling rates") + ax[1,0].legend() + ax[1,1].plot(range(len(real_oversampling)), real_oversampling / oversampling_rates_capped, + label="Actual/Desired Rate") + ax[1,1].set_title(f"Deviation from desired Oversampling rate") + ax[1,1].legend() + plt.show() + ''' def report_pre_processing(self): """Log some metrics on data and create latex report.""" diff --git a/run_with_oversampling.py b/run_with_oversampling.py index cbab9b4..b21e5e6 100644 --- a/run_with_oversampling.py +++ b/run_with_oversampling.py @@ -9,8 +9,12 @@ from mlair.model_modules.model_class import IntelliO3_ts_architecture import os -def load_stations(): +def load_stations(external_station_list): import json + if external_station_list is None: + filename = 'supplement/station_list_north_german_plain_rural.json' + else: + filename = external_station_list try: filename = 'supplement/station_list_north_german_plain_rural.json' with open(filename, 'r') as 
jfile: @@ -22,15 +26,14 @@ def load_stations(): def main(parser_args): plots = remove_items(DEFAULT_PLOT_LIST, "PlotConditionalQuantiles") - workflow = DefaultWorkflow( # stations=load_stations(), - # stations=["DEBW087","DEBW013", "DEBW107", "DEBW076"], - stations=["DEBW013", "DEBW087", "DEBW107", "DEBW076"], - train_model=False, create_new_model=True, network="UBA", + workflow = DefaultWorkflow(stations=load_stations('supplement/German_background_station.json')[:75], + #stations=["DEBW013", "DEBW087", "DEBW107", "DEBW076"], + train_model=True, create_new_model=True, network="UBA", model=IntelliO3_ts_architecture, oversampling_method="bin_oversampling", evaluate_bootstraps=False, # plot_list=["PlotCompetitiveSkillScore"], competitors=["test_model", "test_model2"], competitor_path=os.path.join(os.getcwd(), "data", "comp_test"), - window_lead_time=1, oversampling_bins=10, oversampling_rates_cap=100, + window_lead_time=2, oversampling_bins=10, oversampling_rates_cap=100, **parser_args.__dict__) workflow.run() diff --git a/run_without_oversampling.py b/run_without_oversampling.py index 3c69b45..5b51ffa 100644 --- a/run_without_oversampling.py +++ b/run_without_oversampling.py @@ -9,8 +9,12 @@ from mlair.model_modules.model_class import IntelliO3_ts_architecture import os -def load_stations(): +def load_stations(external_station_list = None): import json + if external_station_list is None: + filename = 'supplement/station_list_north_german_plain_rural.json' + else: + filename = external_station_list try: filename = 'supplement/station_list_north_german_plain_rural.json' with open(filename, 'r') as jfile: @@ -22,15 +26,14 @@ def load_stations(): def main(parser_args): plots = remove_items(DEFAULT_PLOT_LIST, "PlotConditionalQuantiles") - workflow = DefaultWorkflow( # stations=load_stations(), - # stations=["DEBW087","DEBW013", "DEBW107", "DEBW076"], - stations=["DEBW013", "DEBW087", "DEBW107", "DEBW076"], - train_model=False, create_new_model=True, network="UBA", + 
workflow = DefaultWorkflow(stations=load_stations('supplement/German_background_station.json')[:75], + #stations=["DEBW013", "DEBW087", "DEBW107", "DEBW076"], + train_model=True, create_new_model=True, network="UBA", model=IntelliO3_ts_architecture, evaluate_bootstraps=False, # plot_list=["PlotCompetitiveSkillScore"], competitors=["test_model", "test_model2"], competitor_path=os.path.join(os.getcwd(), "data", "comp_test"), - window_lead_time=1, oversampling_bins=10, oversampling_rates_cap=100, + window_lead_time=2, oversampling_bins=10, oversampling_rates_cap=100, **parser_args.__dict__) workflow.run() -- GitLab From 789b65623a0d4ce43461a9a7c2ac0e4a74e0703e Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Tue, 29 Jun 2021 16:12:38 +0200 Subject: [PATCH 14/58] Set-up to run with and without oversampling on hdfml, full stationlist, 150epochs --- run_with_oversampling.py | 3 ++- run_without_oversampling.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/run_with_oversampling.py b/run_with_oversampling.py index b21e5e6..023a8a6 100644 --- a/run_with_oversampling.py +++ b/run_with_oversampling.py @@ -26,8 +26,9 @@ def load_stations(external_station_list): def main(parser_args): plots = remove_items(DEFAULT_PLOT_LIST, "PlotConditionalQuantiles") - workflow = DefaultWorkflow(stations=load_stations('supplement/German_background_station.json')[:75], + workflow = DefaultWorkflow(stations=load_stations('supplement/German_background_station.json'), #stations=["DEBW013", "DEBW087", "DEBW107", "DEBW076"], + epochs=150, train_model=True, create_new_model=True, network="UBA", model=IntelliO3_ts_architecture, oversampling_method="bin_oversampling", evaluate_bootstraps=False, # plot_list=["PlotCompetitiveSkillScore"], diff --git a/run_without_oversampling.py b/run_without_oversampling.py index 5b51ffa..bda899c 100644 --- a/run_without_oversampling.py +++ b/run_without_oversampling.py @@ -26,8 +26,9 @@ def load_stations(external_station_list = None): def 
main(parser_args): plots = remove_items(DEFAULT_PLOT_LIST, "PlotConditionalQuantiles") - workflow = DefaultWorkflow(stations=load_stations('supplement/German_background_station.json')[:75], + workflow = DefaultWorkflow(stations=load_stations('supplement/German_background_station.json'), #stations=["DEBW013", "DEBW087", "DEBW107", "DEBW076"], + epochs=150, train_model=True, create_new_model=True, network="UBA", model=IntelliO3_ts_architecture, evaluate_bootstraps=False, # plot_list=["PlotCompetitiveSkillScore"], -- GitLab From e126258bba423fd70cfd455be1686b601afc2467 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Wed, 30 Jun 2021 08:51:38 +0200 Subject: [PATCH 15/58] Fixed Error in load_stations --- run_without_oversampling.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/run_without_oversampling.py b/run_without_oversampling.py index bda899c..7b0826b 100644 --- a/run_without_oversampling.py +++ b/run_without_oversampling.py @@ -12,11 +12,9 @@ import os def load_stations(external_station_list = None): import json if external_station_list is None: - filename = 'supplement/station_list_north_german_plain_rural.json' - else: - filename = external_station_list + external_station_list = 'supplement/station_list_north_german_plain_rural.json' try: - filename = 'supplement/station_list_north_german_plain_rural.json' + filename = external_station_list with open(filename, 'r') as jfile: stations = json.load(jfile) except FileNotFoundError: -- GitLab From 2bef91773fcfcd0c8a9371c27bf21766bda1cc5b Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Wed, 30 Jun 2021 09:14:30 +0200 Subject: [PATCH 16/58] Fixed Error in load_stations --- run_with_oversampling.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/run_with_oversampling.py b/run_with_oversampling.py index 023a8a6..d234d1e 100644 --- a/run_with_oversampling.py +++ b/run_with_oversampling.py @@ -9,14 +9,12 @@ from mlair.model_modules.model_class import 
IntelliO3_ts_architecture import os -def load_stations(external_station_list): +def load_stations(external_station_list = None): import json if external_station_list is None: filename = 'supplement/station_list_north_german_plain_rural.json' - else: - filename = external_station_list try: - filename = 'supplement/station_list_north_german_plain_rural.json' + filename = external_station_list with open(filename, 'r') as jfile: stations = json.load(jfile) except FileNotFoundError: -- GitLab From f3367dc890d4d1e10e06768443a342490fa760f0 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Wed, 30 Jun 2021 09:21:28 +0200 Subject: [PATCH 17/58] Fixed Error in load_stations --- run_with_oversampling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run_with_oversampling.py b/run_with_oversampling.py index d234d1e..78da204 100644 --- a/run_with_oversampling.py +++ b/run_with_oversampling.py @@ -9,7 +9,7 @@ from mlair.model_modules.model_class import IntelliO3_ts_architecture import os -def load_stations(external_station_list = None): +def load_stations(external_station_list=None): import json if external_station_list is None: filename = 'supplement/station_list_north_german_plain_rural.json' -- GitLab From 3814ac6ad991aec18b0ed843233d9a681ae54e2b Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Wed, 30 Jun 2021 09:26:04 +0200 Subject: [PATCH 18/58] Fixed Error in load_stations --- run_with_oversampling.py | 4 ++-- run_without_oversampling.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/run_with_oversampling.py b/run_with_oversampling.py index 78da204..30371a1 100644 --- a/run_with_oversampling.py +++ b/run_with_oversampling.py @@ -12,7 +12,7 @@ import os def load_stations(external_station_list=None): import json if external_station_list is None: - filename = 'supplement/station_list_north_german_plain_rural.json' + external_station_list = 'supplement/station_list_north_german_plain_rural.json' try: filename = external_station_list 
with open(filename, 'r') as jfile: @@ -24,7 +24,7 @@ def load_stations(external_station_list=None): def main(parser_args): plots = remove_items(DEFAULT_PLOT_LIST, "PlotConditionalQuantiles") - workflow = DefaultWorkflow(stations=load_stations('supplement/German_background_station.json'), + workflow = DefaultWorkflow(stations=load_stations('supplement/German_background_stations.json'), #stations=["DEBW013", "DEBW087", "DEBW107", "DEBW076"], epochs=150, train_model=True, create_new_model=True, network="UBA", diff --git a/run_without_oversampling.py b/run_without_oversampling.py index 7b0826b..b88cb14 100644 --- a/run_without_oversampling.py +++ b/run_without_oversampling.py @@ -24,7 +24,7 @@ def load_stations(external_station_list = None): def main(parser_args): plots = remove_items(DEFAULT_PLOT_LIST, "PlotConditionalQuantiles") - workflow = DefaultWorkflow(stations=load_stations('supplement/German_background_station.json'), + workflow = DefaultWorkflow(stations=load_stations('supplement/German_background_stations.json'), #stations=["DEBW013", "DEBW087", "DEBW107", "DEBW076"], epochs=150, train_model=True, create_new_model=True, network="UBA", -- GitLab From 8156b244fc6c301c2d0239155e3374872da0b9ac Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Wed, 30 Jun 2021 10:58:09 +0200 Subject: [PATCH 19/58] Fixed Error in load_stations --- run_with_oversampling.py | 4 ++-- run_without_oversampling.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/run_with_oversampling.py b/run_with_oversampling.py index 30371a1..2d8ddb4 100644 --- a/run_with_oversampling.py +++ b/run_with_oversampling.py @@ -2,7 +2,7 @@ __author__ = "Lukas Leufen" __date__ = '2020-06-29' import argparse -from mlair.workflows import DefaultWorkflow +from mlair.workflows import DefaultWorkflowHPC from mlair.helpers import remove_items from mlair.configuration.defaults import DEFAULT_PLOT_LIST from mlair.model_modules.model_class import IntelliO3_ts_architecture @@ -24,7 +24,7 @@ 
def load_stations(external_station_list=None): def main(parser_args): plots = remove_items(DEFAULT_PLOT_LIST, "PlotConditionalQuantiles") - workflow = DefaultWorkflow(stations=load_stations('supplement/German_background_stations.json'), + workflow = DefaultWorkflowHPC(stations=load_stations('supplement/German_background_stations.json'), #stations=["DEBW013", "DEBW087", "DEBW107", "DEBW076"], epochs=150, train_model=True, create_new_model=True, network="UBA", diff --git a/run_without_oversampling.py b/run_without_oversampling.py index b88cb14..64620ea 100644 --- a/run_without_oversampling.py +++ b/run_without_oversampling.py @@ -2,14 +2,14 @@ __author__ = "Lukas Leufen" __date__ = '2020-06-29' import argparse -from mlair.workflows import DefaultWorkflow +from mlair.workflows import DefaultWorkflowHPC from mlair.helpers import remove_items from mlair.configuration.defaults import DEFAULT_PLOT_LIST from mlair.model_modules.model_class import IntelliO3_ts_architecture import os -def load_stations(external_station_list = None): +def load_stations(external_station_list=None): import json if external_station_list is None: external_station_list = 'supplement/station_list_north_german_plain_rural.json' @@ -24,7 +24,7 @@ def load_stations(external_station_list = None): def main(parser_args): plots = remove_items(DEFAULT_PLOT_LIST, "PlotConditionalQuantiles") - workflow = DefaultWorkflow(stations=load_stations('supplement/German_background_stations.json'), + workflow = DefaultWorkflowHPC(stations=load_stations('supplement/German_background_stations.json'), #stations=["DEBW013", "DEBW087", "DEBW107", "DEBW076"], epochs=150, train_model=True, create_new_model=True, network="UBA", -- GitLab From 4ef91f3062b7641b8cbfde14c43a8dc9bfc57ba2 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Thu, 15 Jul 2021 13:11:08 +0200 Subject: [PATCH 20/58] Moved Plots from pre_processing to data_insight_plotting. Added PlotOversampling to Default_plot_list. 
--- mlair/configuration/defaults.py | 2 +- mlair/model_modules/model_class.py | 10 ++-- mlair/plotting/data_insight_plotting.py | 70 +++++++++++++++++++++++++ mlair/run_modules/post_processing.py | 21 +++++++- mlair/run_modules/pre_processing.py | 3 +- run.py | 1 + 6 files changed, 99 insertions(+), 8 deletions(-) diff --git a/mlair/configuration/defaults.py b/mlair/configuration/defaults.py index 7b7584a..fc4f7f0 100644 --- a/mlair/configuration/defaults.py +++ b/mlair/configuration/defaults.py @@ -48,7 +48,7 @@ DEFAULT_CREATE_NEW_BOOTSTRAPS = False DEFAULT_NUMBER_OF_BOOTSTRAPS = 20 DEFAULT_PLOT_LIST = ["PlotMonthlySummary", "PlotStationMap", "PlotClimatologicalSkillScore", "PlotTimeSeries", "PlotCompetitiveSkillScore", "PlotBootstrapSkillScore", "PlotConditionalQuantiles", - "PlotAvailability", "PlotAvailabilityHistogram", "PlotDataHistogram"] + "PlotAvailability", "PlotAvailabilityHistogram", "PlotDataHistogram", "PlotOversampling"] DEFAULT_SAMPLING = "daily" DEFAULT_DATA_ORIGIN = {"cloudcover": "REA", "humidity": "REA", "pblheight": "REA", "press": "REA", "relhum": "REA", "temp": "REA", "totprecip": "REA", "u": "REA", "v": "REA", "no": "", "no2": "", "o3": "", diff --git a/mlair/model_modules/model_class.py b/mlair/model_modules/model_class.py index 9a0e97d..ec7f217 100644 --- a/mlair/model_modules/model_class.py +++ b/mlair/model_modules/model_class.py @@ -377,7 +377,7 @@ class IntelliO3_ts_architecture(AbstractModelClass): # apply to model self.set_model() self.set_compile_options() - self.set_custom_objects(loss=self.compile_options["loss"], + self.set_custom_objects(loss=self.compile_options["loss"][0], SymmetricPadding2D=SymmetricPadding2D, LearningRateDecay=LearningRateDecay) @@ -407,14 +407,14 @@ class IntelliO3_ts_architecture(AbstractModelClass): pool_settings_dict1 = {'pool_kernel': (3, 1), 'tower_filter': 16, 'activation': activation} conv_settings_dict2 = { - 'tower_1': {'reduction_filter': 64, 'tower_filter': 32 * 2, 'tower_kernel': (3, 1), + 
'tower_1': {'reduction_filter': 64, 'tower_filter': 32 * 2 * 2, 'tower_kernel': (3, 1), 'activation': activation}, - 'tower_2': {'reduction_filter': 64, 'tower_filter': 32 * 2, 'tower_kernel': (5, 1), + 'tower_2': {'reduction_filter': 64, 'tower_filter': 32 * 2 * 2, 'tower_kernel': (5, 1), 'activation': activation}, - 'tower_3': {'reduction_filter': 64, 'tower_filter': 32 * 2, 'tower_kernel': (1, 1), + 'tower_3': {'reduction_filter': 64, 'tower_filter': 32 * 2 * 2, 'tower_kernel': (1, 1), 'activation': activation} } - pool_settings_dict2 = {'pool_kernel': (3, 1), 'tower_filter': 32, 'activation': activation} + pool_settings_dict2 = {'pool_kernel': (3, 1), 'tower_filter': 32*2, 'activation': activation} ########################################## inception_model = InceptionModelBase() diff --git a/mlair/plotting/data_insight_plotting.py b/mlair/plotting/data_insight_plotting.py index 8a56307..3440321 100644 --- a/mlair/plotting/data_insight_plotting.py +++ b/mlair/plotting/data_insight_plotting.py @@ -19,6 +19,76 @@ from mlair.data_handler import DataCollection from mlair.helpers import TimeTrackingWrapper, to_list from mlair.plotting.abstract_plot_class import AbstractPlotClass +@TimeTrackingWrapper +class PlotOversamplingHistogram(AbstractPlotClass): + + def __init__(self, Y, Y_extreme, bin_edges, plot_folder: str = ".", + plot_name="oversampling_histogram"): + + super().__init__(plot_folder, plot_name) + self._plot(Y, Y_extreme, bin_edges) + self._save() + + def _plot(self, Y, Y_extreme, bin_edges): + fig, ax = plt.subplots(1, 1) + Y.plot.hist(bins=bin_edges, histtype="step", label="Before", ax=ax)[0] + Y_extreme.plot.hist(bins=bin_edges, histtype="step", label="After", ax=ax)[0] + ax.set_title(f"Histogram before-after oversampling") + ax.legend() + + +@TimeTrackingWrapper +class PlotOversamplingDensityHistogram(AbstractPlotClass): + + def __init__(self, Y, Y_extreme, bin_edges, plot_folder: str = ".", + plot_name="oversampling_density_histogram"): + 
super().__init__(plot_folder, plot_name) + self._plot(Y, Y_extreme, bin_edges) + self._save() + + def _plot(self, Y, Y_extreme, bin_edges): + fig, ax = plt.subplots(1, 1) + Y.plot.hist(bins=bin_edges, density=True, histtype="step", label="Before", ax=ax)[0] + Y_extreme.plot.hist(bins=bin_edges, density=True, histtype="step", label="After", ax=ax)[0] + ax.set_title(f"Density Histogram before-after oversampling") + ax.legend() + + +@TimeTrackingWrapper +class PlotOversamplingRates(AbstractPlotClass): + + def __init__(self, Y, Y_extreme, bin_edges, oversampling_rates, Y_hist, Y_extreme_hist, plot_folder: str = ".", + plot_name="oversampling_rates"): + super().__init__(plot_folder, plot_name) + self._plot(Y, Y_extreme, bin_edges, oversampling_rates, Y_hist, Y_extreme_hist) + self._save() + + def _plot(self, Y, Y_extreme, bin_edges, oversampling_rates, Y_hist, Y_extreme_hist): + fig, ax = plt.subplots(1, 1) + real_oversampling = Y_extreme_hist[0] / Y_hist[0] + ax.plot(range(len(real_oversampling)), oversampling_rates, label="Desired oversampling_rates") + ax.plot(range(len(real_oversampling)), real_oversampling, label="Actual Oversampling Rates") + ax.set_title(f"Oversampling rates") + ax.legend() + + +@TimeTrackingWrapper +class PlotOversamplingRatesDeviation(AbstractPlotClass): + + def __init__(self, Y, Y_extreme, bin_edges, oversampling_rates, Y_hist, Y_extreme_hist, plot_folder: str = ".", + plot_name="oversampling_rates_deviation"): + super().__init__(plot_folder, plot_name) + self._plot(Y, Y_extreme, bin_edges, oversampling_rates, Y_hist, Y_extreme_hist) + self._save() + + def _plot(self, Y, Y_extreme, bin_edges, oversampling_rates, Y_hist, Y_extreme_hist): + fig, ax = plt.subplots(1, 1) + real_oversampling = Y_extreme_hist[0] / Y_hist[0] + ax.plot(range(len(real_oversampling)), real_oversampling / oversampling_rates, + label="Actual/Desired Rate") + ax.set_title(f"Deviation from desired oversampling rates") + ax.legend() + @TimeTrackingWrapper class 
PlotStationMap(AbstractPlotClass): # pragma: no cover diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py index 89a6f20..742acf2 100644 --- a/mlair/run_modules/post_processing.py +++ b/mlair/run_modules/post_processing.py @@ -22,7 +22,8 @@ from mlair.model_modules import AbstractModelClass from mlair.plotting.postprocessing_plotting import PlotMonthlySummary, PlotClimatologicalSkillScore, \ PlotCompetitiveSkillScore, PlotTimeSeries, PlotBootstrapSkillScore, PlotConditionalQuantiles, PlotSeparationOfScales from mlair.plotting.data_insight_plotting import PlotStationMap, PlotAvailability, PlotAvailabilityHistogram, \ - PlotPeriodogram, PlotDataHistogram + PlotPeriodogram, PlotDataHistogram, PlotOversamplingHistogram, PlotOversamplingDensityHistogram, \ + PlotOversamplingRates, PlotOversamplingRatesDeviation from mlair.run_modules.run_environment import RunEnvironment @@ -305,6 +306,24 @@ class PostProcessing(RunEnvironment): target_dim = self.data_store.get("target_dim") iter_dim = self.data_store.get("iter_dim") + try: + if (self.data_store.get('oversampling_method')=='bin_oversampling') and ( + "PlotOversampling" in plot_list): + bin_edges = self.data_store.get('oversampling_bin_edges') + oversampling_rates = self.data_store.get('oversampling_rates_capped','train') + Y = self.data_store.get('Oversampling_Y') + Y_extreme = self.data_store.get('Oversampling_Y_extreme') + Y_hist = Y.plot.hist(bins=bin_edges, histtype="step") + Y_extreme_hist = Y_extreme.plot.hist(bins=bin_edges, histtype="step") + PlotOversamplingHistogram(Y, Y_extreme, bin_edges, plot_folder=self.plot_path) + PlotOversamplingDensityHistogram(Y, Y_extreme, bin_edges, plot_folder=self.plot_path) + PlotOversamplingRates(Y, Y_extreme, bin_edges, oversampling_rates, Y_hist, Y_extreme_hist, + plot_folder=self.plot_path) + PlotOversamplingRatesDeviation(Y, Y_extreme, bin_edges, oversampling_rates, Y_hist, + Y_extreme_hist, plot_folder=self.plot_path) + except Exception 
as e: + logging.error(f"Could not create plot OversamplingPlots due to the following error: {e}") + try: if ("filter" in self.test_data[0].get_X(as_numpy=False)[0].coords) and ( "PlotSeparationOfScales" in plot_list): diff --git a/mlair/run_modules/pre_processing.py b/mlair/run_modules/pre_processing.py index 215c0bb..69f14be 100644 --- a/mlair/run_modules/pre_processing.py +++ b/mlair/run_modules/pre_processing.py @@ -113,7 +113,8 @@ class PreProcessing(RunEnvironment): else: Y = xr.concat([Y, station._Y], dim="Stations") Y_extreme = xr.concat([Y_extreme, station._Y_extreme], dim="Stations") - + self.data_store.set('Oversampling_Y', Y) + self.data_store.set('Oversampling_Y_extreme', Y_extreme) ''' if not on HPC: fig, ax = plt.subplots(nrows=2, ncols=2) diff --git a/run.py b/run.py index f2bb336..05b43ad 100644 --- a/run.py +++ b/run.py @@ -25,6 +25,7 @@ def main(parser_args): # stations=["DEBW087","DEBW013", "DEBW107", "DEBW076"], stations=["DEBW013", "DEBW087", "DEBW107", "DEBW076"], train_model=False, create_new_model=True, network="UBA", + oversampling_method="bin_oversampling", oversampling_bins=10, oversampling_rates_cap=100, window_lead_time=2, evaluate_bootstraps=False, # plot_list=["PlotCompetitiveSkillScore"], competitors=["test_model", "test_model2"], competitor_path=os.path.join(os.getcwd(), "data", "comp_test"), -- GitLab From 8bf45b43249a703e1eb5ce14073063f6f1e50f74 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Thu, 15 Jul 2021 15:44:23 +0200 Subject: [PATCH 21/58] Oversampling Plots combined in PlotOversampling method --- mlair/plotting/data_insight_plotting.py | 66 +++++++++---------------- mlair/run_modules/post_processing.py | 12 +---- 2 files changed, 25 insertions(+), 53 deletions(-) diff --git a/mlair/plotting/data_insight_plotting.py b/mlair/plotting/data_insight_plotting.py index 3440321..aff3b4c 100644 --- a/mlair/plotting/data_insight_plotting.py +++ b/mlair/plotting/data_insight_plotting.py @@ -20,70 +20,50 @@ from mlair.helpers 
import TimeTrackingWrapper, to_list from mlair.plotting.abstract_plot_class import AbstractPlotClass @TimeTrackingWrapper -class PlotOversamplingHistogram(AbstractPlotClass): +class PlotOversampling(AbstractPlotClass): - def __init__(self, Y, Y_extreme, bin_edges, plot_folder: str = ".", - plot_name="oversampling_histogram"): + def __init__(self, Y, Y_extreme, bin_edges, oversampling_rates, plot_folder: str = ".", + plot_names=["oversampling_histogram", "oversampling_density_histogram", "oversampling_rates", + "oversampling_rates_deviation"]): - super().__init__(plot_folder, plot_name) - self._plot(Y, Y_extreme, bin_edges) + super().__init__(plot_folder, plot_names[0]) + Y_hist, Y_extreme_hist = self._plot_oversampling_histogram(Y, Y_extreme, bin_edges) + real_oversampling = Y_extreme_hist / Y_hist + self._save() + self.plot_name = plot_names[1] + self._plot_oversampling_density_histogram(Y, Y_extreme, bin_edges) + self._save() + self.plot_name = plot_names[2] + self._plot_oversampling_rates(oversampling_rates, real_oversampling) + self._save() + self.plot_name = plot_names[3] + self._plot_oversampling_rates_deviation(oversampling_rates, real_oversampling) self._save() - def _plot(self, Y, Y_extreme, bin_edges): + def _plot_oversampling_histogram(self, Y, Y_extreme, bin_edges): fig, ax = plt.subplots(1, 1) - Y.plot.hist(bins=bin_edges, histtype="step", label="Before", ax=ax)[0] - Y_extreme.plot.hist(bins=bin_edges, histtype="step", label="After", ax=ax)[0] + Y_hist = Y.plot.hist(bins=bin_edges, histtype="step", label="Before", ax=ax)[0] + Y_extreme_hist = Y_extreme.plot.hist(bins=bin_edges, histtype="step", label="After", ax=ax)[0] ax.set_title(f"Histogram before-after oversampling") ax.legend() + return Y_hist, Y_extreme_hist - -@TimeTrackingWrapper -class PlotOversamplingDensityHistogram(AbstractPlotClass): - - def __init__(self, Y, Y_extreme, bin_edges, plot_folder: str = ".", - plot_name="oversampling_density_histogram"): - super().__init__(plot_folder, 
plot_name) - self._plot(Y, Y_extreme, bin_edges) - self._save() - - def _plot(self, Y, Y_extreme, bin_edges): + def _plot_oversampling_density_histogram(self, Y, Y_extreme, bin_edges): fig, ax = plt.subplots(1, 1) Y.plot.hist(bins=bin_edges, density=True, histtype="step", label="Before", ax=ax)[0] Y_extreme.plot.hist(bins=bin_edges, density=True, histtype="step", label="After", ax=ax)[0] ax.set_title(f"Density Histogram before-after oversampling") ax.legend() - -@TimeTrackingWrapper -class PlotOversamplingRates(AbstractPlotClass): - - def __init__(self, Y, Y_extreme, bin_edges, oversampling_rates, Y_hist, Y_extreme_hist, plot_folder: str = ".", - plot_name="oversampling_rates"): - super().__init__(plot_folder, plot_name) - self._plot(Y, Y_extreme, bin_edges, oversampling_rates, Y_hist, Y_extreme_hist) - self._save() - - def _plot(self, Y, Y_extreme, bin_edges, oversampling_rates, Y_hist, Y_extreme_hist): + def _plot_oversampling_rates(self, oversampling_rates, real_oversampling): fig, ax = plt.subplots(1, 1) - real_oversampling = Y_extreme_hist[0] / Y_hist[0] ax.plot(range(len(real_oversampling)), oversampling_rates, label="Desired oversampling_rates") ax.plot(range(len(real_oversampling)), real_oversampling, label="Actual Oversampling Rates") ax.set_title(f"Oversampling rates") ax.legend() - -@TimeTrackingWrapper -class PlotOversamplingRatesDeviation(AbstractPlotClass): - - def __init__(self, Y, Y_extreme, bin_edges, oversampling_rates, Y_hist, Y_extreme_hist, plot_folder: str = ".", - plot_name="oversampling_rates_deviation"): - super().__init__(plot_folder, plot_name) - self._plot(Y, Y_extreme, bin_edges, oversampling_rates, Y_hist, Y_extreme_hist) - self._save() - - def _plot(self, Y, Y_extreme, bin_edges, oversampling_rates, Y_hist, Y_extreme_hist): + def _plot_oversampling_rates_deviation(self, oversampling_rates, real_oversampling): fig, ax = plt.subplots(1, 1) - real_oversampling = Y_extreme_hist[0] / Y_hist[0] ax.plot(range(len(real_oversampling)), 
real_oversampling / oversampling_rates, label="Actual/Desired Rate") ax.set_title(f"Deviation from desired oversampling rates") diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py index 742acf2..74e786a 100644 --- a/mlair/run_modules/post_processing.py +++ b/mlair/run_modules/post_processing.py @@ -22,8 +22,7 @@ from mlair.model_modules import AbstractModelClass from mlair.plotting.postprocessing_plotting import PlotMonthlySummary, PlotClimatologicalSkillScore, \ PlotCompetitiveSkillScore, PlotTimeSeries, PlotBootstrapSkillScore, PlotConditionalQuantiles, PlotSeparationOfScales from mlair.plotting.data_insight_plotting import PlotStationMap, PlotAvailability, PlotAvailabilityHistogram, \ - PlotPeriodogram, PlotDataHistogram, PlotOversamplingHistogram, PlotOversamplingDensityHistogram, \ - PlotOversamplingRates, PlotOversamplingRatesDeviation + PlotPeriodogram, PlotDataHistogram, PlotOversampling from mlair.run_modules.run_environment import RunEnvironment @@ -313,14 +312,7 @@ class PostProcessing(RunEnvironment): oversampling_rates = self.data_store.get('oversampling_rates_capped','train') Y = self.data_store.get('Oversampling_Y') Y_extreme = self.data_store.get('Oversampling_Y_extreme') - Y_hist = Y.plot.hist(bins=bin_edges, histtype="step") - Y_extreme_hist = Y_extreme.plot.hist(bins=bin_edges, histtype="step") - PlotOversamplingHistogram(Y, Y_extreme, bin_edges, plot_folder=self.plot_path) - PlotOversamplingDensityHistogram(Y, Y_extreme, bin_edges, plot_folder=self.plot_path) - PlotOversamplingRates(Y, Y_extreme, bin_edges, oversampling_rates, Y_hist, Y_extreme_hist, - plot_folder=self.plot_path) - PlotOversamplingRatesDeviation(Y, Y_extreme, bin_edges, oversampling_rates, Y_hist, - Y_extreme_hist, plot_folder=self.plot_path) + PlotOversampling(Y, Y_extreme, bin_edges, oversampling_rates, plot_folder=self.plot_path) except Exception as e: logging.error(f"Could not create plot OversamplingPlots due to the following error: 
{e}") -- GitLab From 4010a036b84f3ac62f25675a46d089aa5b35fb7d Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Thu, 15 Jul 2021 15:57:19 +0200 Subject: [PATCH 22/58] Added PlotOversampling to default_plot_list in test_defaults.py --- test/test_configuration/test_defaults.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_configuration/test_defaults.py b/test/test_configuration/test_defaults.py index 16606d8..0f098dc 100644 --- a/test/test_configuration/test_defaults.py +++ b/test/test_configuration/test_defaults.py @@ -68,4 +68,4 @@ class TestAllDefaults: assert DEFAULT_PLOT_LIST == ["PlotMonthlySummary", "PlotStationMap", "PlotClimatologicalSkillScore", "PlotTimeSeries", "PlotCompetitiveSkillScore", "PlotBootstrapSkillScore", "PlotConditionalQuantiles", "PlotAvailability", "PlotAvailabilityHistogram", - "PlotDataHistogram"] + "PlotDataHistogram","PlotOversampling"] -- GitLab From 3d824606ec7bfbc7f886ddf6264f5e141268a7aa Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Mon, 19 Jul 2021 11:40:40 +0200 Subject: [PATCH 23/58] Coded the structure for PlotOversamplingContingency --- mlair/configuration/defaults.py | 3 +- mlair/plotting/postprocessing_plotting.py | 45 +++++++++++++++++++++++ mlair/run_modules/post_processing.py | 14 ++++++- test/test_configuration/test_defaults.py | 2 +- 4 files changed, 61 insertions(+), 3 deletions(-) diff --git a/mlair/configuration/defaults.py b/mlair/configuration/defaults.py index fc4f7f0..31b58a5 100644 --- a/mlair/configuration/defaults.py +++ b/mlair/configuration/defaults.py @@ -48,7 +48,8 @@ DEFAULT_CREATE_NEW_BOOTSTRAPS = False DEFAULT_NUMBER_OF_BOOTSTRAPS = 20 DEFAULT_PLOT_LIST = ["PlotMonthlySummary", "PlotStationMap", "PlotClimatologicalSkillScore", "PlotTimeSeries", "PlotCompetitiveSkillScore", "PlotBootstrapSkillScore", "PlotConditionalQuantiles", - "PlotAvailability", "PlotAvailabilityHistogram", "PlotDataHistogram", "PlotOversampling"] + "PlotAvailability", "PlotAvailabilityHistogram", 
"PlotDataHistogram", "PlotOversampling", + "PlotOversamplingContingency"] DEFAULT_SAMPLING = "daily" DEFAULT_DATA_ORIGIN = {"cloudcover": "REA", "humidity": "REA", "pblheight": "REA", "press": "REA", "relhum": "REA", "temp": "REA", "totprecip": "REA", "u": "REA", "v": "REA", "no": "", "no2": "", "o3": "", diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py index 491aa52..3bef0c3 100644 --- a/mlair/plotting/postprocessing_plotting.py +++ b/mlair/plotting/postprocessing_plotting.py @@ -28,6 +28,51 @@ logging.getLogger('matplotlib').setLevel(logging.WARNING) # matplotlib.use("TkAgg") # import matplotlib.pyplot as plt +@TimeTrackingWrapper +class PlotOversamplingContingency(AbstractPlotClass): + + def __init__(self, predictions, labels, plot_folder: str = ".", + plot_names=["oversampling_threat_score", "oversampling_hit_rate", "oversampling_false_alarm_rate", + "oversampling_all_scores"]): + + super().__init__(plot_folder, plot_names[0]) + ts = [] + h = [] + f = [] + max_label = 0 + min_label = 0 + for threshold in range(min_label, max_label): + true_above = 0 + false_above = 0 + false_below = 0 + true_below = 0 + for prediction, label in predictions, labels: + if prediction >= threshold: + if label >= threshold: + true_above = + 1 + else: + false_above = + 1 + else: + if label >= threshold: + false_below = + 1 + else: + true_below = + 1 + ts.append(true_above/(true_above+false_above+false_below)) + h.append(true_above/(true_above+false_below)) + f.append(false_above/(false_above+true_below)) + plt.plot(range(min_label, max_label), ts) + self._save() + self.plot_name = plot_names[1] + plt.plot(range(min_label, max_label), h) + self._save() + self.plot_name = plot_names[2] + plt.plot(range(min_label, max_label), f) + self.plot_name = plot_names[3] + plt.plot(range(min_label, max_label), ts) + plt.plot(range(min_label, max_label), h) + plt.plot(range(min_label, max_label), f) + self._save() + @TimeTrackingWrapper class 
PlotMonthlySummary(AbstractPlotClass): diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py index 74e786a..6acfcfb 100644 --- a/mlair/run_modules/post_processing.py +++ b/mlair/run_modules/post_processing.py @@ -20,7 +20,8 @@ from mlair.helpers import TimeTracking, statistics, extract_value, remove_items, from mlair.model_modules.linear_model import OrdinaryLeastSquaredModel from mlair.model_modules import AbstractModelClass from mlair.plotting.postprocessing_plotting import PlotMonthlySummary, PlotClimatologicalSkillScore, \ - PlotCompetitiveSkillScore, PlotTimeSeries, PlotBootstrapSkillScore, PlotConditionalQuantiles, PlotSeparationOfScales + PlotCompetitiveSkillScore, PlotTimeSeries, PlotBootstrapSkillScore, PlotConditionalQuantiles,\ + PlotSeparationOfScales, PlotOversamplingContingency from mlair.plotting.data_insight_plotting import PlotStationMap, PlotAvailability, PlotAvailabilityHistogram, \ PlotPeriodogram, PlotDataHistogram, PlotOversampling from mlair.run_modules.run_environment import RunEnvironment @@ -305,6 +306,17 @@ class PostProcessing(RunEnvironment): target_dim = self.data_store.get("target_dim") iter_dim = self.data_store.get("iter_dim") + try: + if (self.data_store.get('oversampling_method')=='bin_oversampling') and ( + "PlotOversamplingContingency" in plot_list): + bin_edges = self.data_store.get('oversampling_bin_edges') + oversampling_rates = self.data_store.get('oversampling_rates_capped','train') + predictions = None + labels = None + PlotOversampling(predictions, labels, plot_folder=self.plot_path) + except Exception as e: + logging.error(f"Could not create plot OversamplingPlots due to the following error: {e}") + try: if (self.data_store.get('oversampling_method')=='bin_oversampling') and ( "PlotOversampling" in plot_list): diff --git a/test/test_configuration/test_defaults.py b/test/test_configuration/test_defaults.py index 0f098dc..922de35 100644 --- a/test/test_configuration/test_defaults.py +++ 
b/test/test_configuration/test_defaults.py @@ -68,4 +68,4 @@ class TestAllDefaults: assert DEFAULT_PLOT_LIST == ["PlotMonthlySummary", "PlotStationMap", "PlotClimatologicalSkillScore", "PlotTimeSeries", "PlotCompetitiveSkillScore", "PlotBootstrapSkillScore", "PlotConditionalQuantiles", "PlotAvailability", "PlotAvailabilityHistogram", - "PlotDataHistogram","PlotOversampling"] + "PlotDataHistogram","PlotOversampling","PlotOversamplingContingency"] -- GitLab From dde77c42e3de8e17093361e7b14b0a58d556cc13 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Fri, 23 Jul 2021 08:24:45 +0200 Subject: [PATCH 24/58] Coded the structure for PlotOversamplingContingency --- mlair/run_modules/post_processing.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py index 6acfcfb..c5f5b2d 100644 --- a/mlair/run_modules/post_processing.py +++ b/mlair/run_modules/post_processing.py @@ -309,13 +309,11 @@ class PostProcessing(RunEnvironment): try: if (self.data_store.get('oversampling_method')=='bin_oversampling') and ( "PlotOversamplingContingency" in plot_list): - bin_edges = self.data_store.get('oversampling_bin_edges') - oversampling_rates = self.data_store.get('oversampling_rates_capped','train') predictions = None labels = None - PlotOversampling(predictions, labels, plot_folder=self.plot_path) + PlotOversamplingContingency(predictions, labels, plot_folder=self.plot_path) except Exception as e: - logging.error(f"Could not create plot OversamplingPlots due to the following error: {e}") + logging.error(f"Could not create plot OversamplingContingencyPlots due to the following error: {e}") try: if (self.data_store.get('oversampling_method')=='bin_oversampling') and ( -- GitLab From a31e2e7e1aad5f5123714329b434dff765664abb Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Fri, 23 Jul 2021 10:20:51 +0200 Subject: [PATCH 25/58] .. 
--- mlair/data_handler/default_data_handler.py | 2 +- mlair/plotting/data_insight_plotting.py | 12 ++++++++- mlair/plotting/postprocessing_plotting.py | 4 +++ mlair/run_modules/experiment_setup.py | 20 +++++++++------ mlair/run_modules/post_processing.py | 10 +++++--- mlair/run_modules/pre_processing.py | 30 ++++------------------ 6 files changed, 39 insertions(+), 39 deletions(-) diff --git a/mlair/data_handler/default_data_handler.py b/mlair/data_handler/default_data_handler.py index fc5a4d9..acc3caa 100644 --- a/mlair/data_handler/default_data_handler.py +++ b/mlair/data_handler/default_data_handler.py @@ -206,7 +206,7 @@ class DefaultDataHandler(AbstractDataHandler): else: self._X_extreme = list(map(lambda x1, x2: xr.concat([x1, x2], dim=self.time_dim), self._X_extreme, extremes_X)) self._Y_extreme = xr.concat([self._Y_extreme, extremes_Y], dim=self.time_dim) - #self._store(fresh_store=True) + def multiply_extremes(self, extreme_values: num_or_list = 1., extremes_on_right_tail_only: bool = False, timedelta: Tuple[int, str] = (1, 'm'), dim=DEFAULT_TIME_DIM): diff --git a/mlair/plotting/data_insight_plotting.py b/mlair/plotting/data_insight_plotting.py index aff3b4c..ccea0b8 100644 --- a/mlair/plotting/data_insight_plotting.py +++ b/mlair/plotting/data_insight_plotting.py @@ -22,11 +22,14 @@ from mlair.plotting.abstract_plot_class import AbstractPlotClass @TimeTrackingWrapper class PlotOversampling(AbstractPlotClass): - def __init__(self, Y, Y_extreme, bin_edges, oversampling_rates, plot_folder: str = ".", + def __init__(self, data, bin_edges, oversampling_rates, plot_folder: str = ".", plot_names=["oversampling_histogram", "oversampling_density_histogram", "oversampling_rates", "oversampling_rates_deviation"]): super().__init__(plot_folder, plot_names[0]) + + Y_hist, Y_extreme_hist = self._calculate_hist(data, bin_edges) + Y_hist, Y_extreme_hist = self._plot_oversampling_histogram(Y, Y_extreme, bin_edges) real_oversampling = Y_extreme_hist / Y_hist self._save() 
@@ -40,6 +43,13 @@ class PlotOversampling(AbstractPlotClass): self._plot_oversampling_rates_deviation(oversampling_rates, real_oversampling) self._save() + def _calculate_histogram(self, data, bin_edges): + Y_hist = np.zeros(len(bin_edges),1) + Y_extreme_hist = np.zeros(len(bin_edges), 1) + for station in data: + Y = station.get_Y(as_numpy=True, upsampling=False) + Y_extreme = station.get_Y(as_numpy=True, upsampling=True) + def _plot_oversampling_histogram(self, Y, Y_extreme, bin_edges): fig, ax = plt.subplots(1, 1) Y_hist = Y.plot.hist(bins=bin_edges, histtype="step", label="Before", ax=ax)[0] diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py index 3bef0c3..6723378 100644 --- a/mlair/plotting/postprocessing_plotting.py +++ b/mlair/plotting/postprocessing_plotting.py @@ -41,6 +41,10 @@ class PlotOversamplingContingency(AbstractPlotClass): f = [] max_label = 0 min_label = 0 + for station in station_names: + file = os.path.join(file_path, file_name % station) + forecast = xr.open_dataarray(file) + competitors = extract_method(station) for threshold in range(min_label, max_label): true_above = 0 false_above = 0 diff --git a/mlair/run_modules/experiment_setup.py b/mlair/run_modules/experiment_setup.py index edf1cdf..cefd450 100644 --- a/mlair/run_modules/experiment_setup.py +++ b/mlair/run_modules/experiment_setup.py @@ -238,15 +238,23 @@ class ExperimentSetup(RunEnvironment): self._set_param("bootstrap_path", bootstrap_path) self._set_param("train_model", train_model, default=DEFAULT_TRAIN_MODEL) self._set_param("fraction_of_training", fraction_of_train, default=DEFAULT_FRACTION_OF_TRAINING) + self._set_param("batch_size", batch_size, default=DEFAULT_BATCH_SIZE) + self._set_param("epochs", epochs, default=DEFAULT_EPOCHS) + + # set params for oversampling + self._set_param("oversampling_bins", oversampling_bins, default=DEFAULT_OVERSAMPLING_BINS) + self._set_param("oversampling_rates_cap", oversampling_rates_cap, 
default=DEFAULT_OVERSAMPLING_RATES_CAP) + self._set_param("oversampling_method", oversampling_method, default=DEFAULT_OVERSAMPLING_METHOD) self._set_param("extreme_values", extreme_values, default=DEFAULT_EXTREME_VALUES, scope="train") self._set_param("extremes_on_right_tail_only", extremes_on_right_tail_only, default=DEFAULT_EXTREMES_ON_RIGHT_TAIL_ONLY, scope="train") - self._set_param("upsampling", extreme_values is not None, scope="train") - upsampling = self.data_store.get("upsampling", "train") + upsampling = (extreme_values is not None) or (oversampling_method is not None) + self._set_param("upsampling", upsampling, scope="train") permute_data = DEFAULT_PERMUTE_DATA if permute_data_on_training is None else permute_data_on_training self._set_param("permute_data", permute_data or upsampling, scope="train") - self._set_param("batch_size", batch_size, default=DEFAULT_BATCH_SIZE) - self._set_param("epochs", epochs, default=DEFAULT_EPOCHS) + if (extreme_values is not None) and (oversampling_method is not None): + logging.info("Parameters extreme_values and oversampling_method are set. 
In this case only " + "oversampling_method is used.") # set experiment name sampling = self._set_param("sampling", sampling, default=DEFAULT_SAMPLING) # always related to output sampling @@ -365,10 +373,6 @@ class ExperimentSetup(RunEnvironment): # set model architecture class self._set_param("model_class", model, VanillaModel) - # set params for oversampling - self._set_param("oversampling_bins", oversampling_bins, default=DEFAULT_OVERSAMPLING_BINS) - self._set_param("oversampling_rates_cap", oversampling_rates_cap, default=DEFAULT_OVERSAMPLING_RATES_CAP) - self._set_param("oversampling_method", oversampling_method, default=DEFAULT_OVERSAMPLING_METHOD) # set remaining kwargs if len(kwargs) > 0: diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py index c5f5b2d..2febedb 100644 --- a/mlair/run_modules/post_processing.py +++ b/mlair/run_modules/post_processing.py @@ -311,7 +311,9 @@ class PostProcessing(RunEnvironment): "PlotOversamplingContingency" in plot_list): predictions = None labels = None - PlotOversamplingContingency(predictions, labels, plot_folder=self.plot_path) + PlotOversamplingContingency(extract_method=self.load_competitors(), station_names=self.test_data.keys(), + file_path=path, file_name=r"forecasts_%s_test.nc", + plot_folder=self.plot_path) except Exception as e: logging.error(f"Could not create plot OversamplingContingencyPlots due to the following error: {e}") @@ -320,9 +322,9 @@ class PostProcessing(RunEnvironment): "PlotOversampling" in plot_list): bin_edges = self.data_store.get('oversampling_bin_edges') oversampling_rates = self.data_store.get('oversampling_rates_capped','train') - Y = self.data_store.get('Oversampling_Y') - Y_extreme = self.data_store.get('Oversampling_Y_extreme') - PlotOversampling(Y, Y_extreme, bin_edges, oversampling_rates, plot_folder=self.plot_path) + #Y = self.data_store.get('Oversampling_Y') + #Y_extreme = self.data_store.get('Oversampling_Y_extreme') + 
PlotOversampling(self.train_data, bin_edges, oversampling_rates, plot_folder=self.plot_path) except Exception as e: logging.error(f"Could not create plot OversamplingPlots due to the following error: {e}") diff --git a/mlair/run_modules/pre_processing.py b/mlair/run_modules/pre_processing.py index 69f14be..2d6dc3b 100644 --- a/mlair/run_modules/pre_processing.py +++ b/mlair/run_modules/pre_processing.py @@ -91,7 +91,7 @@ class PreProcessing(RunEnvironment): bin_edges = [] for station in data: # Create histogram for each station - hist, bin_edges = np.histogram(station.get_Y(as_numpy=True), bins=bins, range=(total_min,total_max)) + hist, bin_edges = np.histogram(station.get_Y(as_numpy=True), bins=bins, range=(total_min, total_max)) # Add up histograms histogram = histogram + hist # Scale down to most frequent class=1 @@ -103,10 +103,11 @@ class PreProcessing(RunEnvironment): self.data_store.set('oversampling_rates', oversampling_rates, 'train') self.data_store.set('oversampling_rates_capped', oversampling_rates_capped, 'train') self.data_store.set('oversampling_bin_edges', bin_edges) - Y = None - Y_extreme = None + #Y = None + #Y_extreme = None for station in data: station.apply_oversampling(bin_edges, oversampling_rates_capped) + ''' if Y is None: Y = station._Y Y_extreme = station._Y_extreme @@ -116,28 +117,7 @@ class PreProcessing(RunEnvironment): self.data_store.set('Oversampling_Y', Y) self.data_store.set('Oversampling_Y_extreme', Y_extreme) ''' - if not on HPC: - fig, ax = plt.subplots(nrows=2, ncols=2) - fig.suptitle(f"Window Size=1, Bins={bins}, rates_cap={rates_cap}") - Y_hist = Y.plot.hist(bins=bin_edges, histtype="step", label="Before", ax=ax[0,0])[0] - Y_extreme_hist = Y_extreme.plot.hist(bins=bin_edges, histtype="step", label="After", ax=ax[0,0])[0] - ax[0,0].set_title(f"Histogram before-after oversampling") - ax[0,0].legend() - Y_hist_dens = Y.plot.hist(bins=bin_edges, density=True, histtype="step", label="Before", ax=ax[0,1])[0] - Y_extreme_hist_dens 
= Y_extreme.plot.hist(bins=bin_edges, density=True, histtype="step", label="After", ax=ax[0,1])[0] - ax[0,1].set_title(f"Density-Histogram before-after oversampling") - ax[0,1].legend() - real_oversampling = Y_extreme_hist/Y_hist - ax[1,0].plot(range(len(real_oversampling)), oversampling_rates_capped, label="Desired oversampling_rates") - ax[1,0].plot(range(len(real_oversampling)), real_oversampling, label="Actual Oversampling Rates") - ax[1,0].set_title(f"Oversampling rates") - ax[1,0].legend() - ax[1,1].plot(range(len(real_oversampling)), real_oversampling / oversampling_rates_capped, - label="Actual/Desired Rate") - ax[1,1].set_title(f"Deviation from desired Oversampling rate") - ax[1,1].legend() - plt.show() - ''' + def report_pre_processing(self): """Log some metrics on data and create latex report.""" -- GitLab From 7a96f1991a7702987d6bc9a6b3fa2ac48454f100 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Fri, 23 Jul 2021 19:42:05 +0200 Subject: [PATCH 26/58] PlotOversampling fixed --- mlair/plotting/data_insight_plotting.py | 33 ++++++++++++++----------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/mlair/plotting/data_insight_plotting.py b/mlair/plotting/data_insight_plotting.py index ccea0b8..a2007f3 100644 --- a/mlair/plotting/data_insight_plotting.py +++ b/mlair/plotting/data_insight_plotting.py @@ -28,13 +28,12 @@ class PlotOversampling(AbstractPlotClass): super().__init__(plot_folder, plot_names[0]) - Y_hist, Y_extreme_hist = self._calculate_hist(data, bin_edges) - - Y_hist, Y_extreme_hist = self._plot_oversampling_histogram(Y, Y_extreme, bin_edges) + Y_hist, Y_extreme_hist, Y_hist_dens, Y_extreme_hist_dens = self._calculate_hist(data, bin_edges) real_oversampling = Y_extreme_hist / Y_hist + self._plot_oversampling_histogram(Y_hist, Y_extreme_hist, bin_edges) self._save() self.plot_name = plot_names[1] - self._plot_oversampling_density_histogram(Y, Y_extreme, bin_edges) + self._plot_oversampling_density_histogram(Y_hist_dens, 
Y_extreme_hist_dens, bin_edges) self._save() self.plot_name = plot_names[2] self._plot_oversampling_rates(oversampling_rates, real_oversampling) @@ -43,25 +42,31 @@ class PlotOversampling(AbstractPlotClass): self._plot_oversampling_rates_deviation(oversampling_rates, real_oversampling) self._save() - def _calculate_histogram(self, data, bin_edges): - Y_hist = np.zeros(len(bin_edges),1) - Y_extreme_hist = np.zeros(len(bin_edges), 1) + def _calculate_hist(self, data, bin_edges): + Y_hist = np.zeros(len(bin_edges)-1) + Y_extreme_hist = np.zeros(len(bin_edges)-1) for station in data: Y = station.get_Y(as_numpy=True, upsampling=False) Y_extreme = station.get_Y(as_numpy=True, upsampling=True) + Y_hist = Y_hist + np.histogram(Y, bins=bin_edges)[0] + Y_extreme_hist = Y_extreme_hist + np.histogram(Y_extreme, bins=bin_edges)[0] + Y_hist_dens = Y_hist/np.sum(Y_hist) + Y_extreme_hist_dens = Y_extreme_hist / np.sum(Y_extreme_hist) + return Y_hist, Y_extreme_hist, Y_hist_dens, Y_extreme_hist_dens - def _plot_oversampling_histogram(self, Y, Y_extreme, bin_edges): + def _plot_oversampling_histogram(self, Y_hist, Y_extreme_hist, bin_edges): fig, ax = plt.subplots(1, 1) - Y_hist = Y.plot.hist(bins=bin_edges, histtype="step", label="Before", ax=ax)[0] - Y_extreme_hist = Y_extreme.plot.hist(bins=bin_edges, histtype="step", label="After", ax=ax)[0] + ax.step(bin_edges, np.append(0,Y_hist), label="Before oversampling") + ax.step(bin_edges, np.append(0,Y_extreme_hist), label="After oversampling") ax.set_title(f"Histogram before-after oversampling") ax.legend() - return Y_hist, Y_extreme_hist - def _plot_oversampling_density_histogram(self, Y, Y_extreme, bin_edges): + def _plot_oversampling_density_histogram(self, Y_hist_dens, Y_extreme_hist_dens, bin_edges): fig, ax = plt.subplots(1, 1) - Y.plot.hist(bins=bin_edges, density=True, histtype="step", label="Before", ax=ax)[0] - Y_extreme.plot.hist(bins=bin_edges, density=True, histtype="step", label="After", ax=ax)[0] + ax.step(bin_edges, 
np.append(0,Y_hist_dens), label="Before oversampling") + ax.step(bin_edges, np.append(0,Y_extreme_hist_dens), label="After oversampling") + #ax.stairs(Y_hist_dens, bin_edges, label="Before oversampling") + #ax.stairs(Y_extreme_hist_dens, bin_edges, label="After oversampling") ax.set_title(f"Density Histogram before-after oversampling") ax.legend() -- GitLab From f73acaa44dd0cdca0d487a2b642092d7dcc85e06 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Mon, 26 Jul 2021 14:25:34 +0200 Subject: [PATCH 27/58] Commits vor merge --- mlair/data_handler/default_data_handler.py | 1 + mlair/plotting/data_insight_plotting.py | 18 ++- mlair/plotting/postprocessing_plotting.py | 159 ++++++++++++++++----- mlair/run_modules/experiment_setup.py | 2 +- mlair/run_modules/post_processing.py | 12 +- mlair/run_modules/pre_processing.py | 10 -- 6 files changed, 137 insertions(+), 65 deletions(-) diff --git a/mlair/data_handler/default_data_handler.py b/mlair/data_handler/default_data_handler.py index acc3caa..8d977e1 100644 --- a/mlair/data_handler/default_data_handler.py +++ b/mlair/data_handler/default_data_handler.py @@ -206,6 +206,7 @@ class DefaultDataHandler(AbstractDataHandler): else: self._X_extreme = list(map(lambda x1, x2: xr.concat([x1, x2], dim=self.time_dim), self._X_extreme, extremes_X)) self._Y_extreme = xr.concat([self._Y_extreme, extremes_Y], dim=self.time_dim) + self._store(fresh_store=True) def multiply_extremes(self, extreme_values: num_or_list = 1., extremes_on_right_tail_only: bool = False, diff --git a/mlair/plotting/data_insight_plotting.py b/mlair/plotting/data_insight_plotting.py index a2007f3..2637663 100644 --- a/mlair/plotting/data_insight_plotting.py +++ b/mlair/plotting/data_insight_plotting.py @@ -22,6 +22,8 @@ from mlair.plotting.abstract_plot_class import AbstractPlotClass @TimeTrackingWrapper class PlotOversampling(AbstractPlotClass): + #Todo: Build histograms correctly + def __init__(self, data, bin_edges, oversampling_rates, plot_folder: str = ".", 
plot_names=["oversampling_histogram", "oversampling_density_histogram", "oversampling_rates", "oversampling_rates_deviation"]): @@ -33,7 +35,7 @@ class PlotOversampling(AbstractPlotClass): self._plot_oversampling_histogram(Y_hist, Y_extreme_hist, bin_edges) self._save() self.plot_name = plot_names[1] - self._plot_oversampling_density_histogram(Y_hist_dens, Y_extreme_hist_dens, bin_edges) + self._plot_oversampling_histogram(Y_hist_dens, Y_extreme_hist_dens, bin_edges) self._save() self.plot_name = plot_names[2] self._plot_oversampling_rates(oversampling_rates, real_oversampling) @@ -56,15 +58,11 @@ class PlotOversampling(AbstractPlotClass): def _plot_oversampling_histogram(self, Y_hist, Y_extreme_hist, bin_edges): fig, ax = plt.subplots(1, 1) - ax.step(bin_edges, np.append(0,Y_hist), label="Before oversampling") - ax.step(bin_edges, np.append(0,Y_extreme_hist), label="After oversampling") - ax.set_title(f"Histogram before-after oversampling") - ax.legend() - - def _plot_oversampling_density_histogram(self, Y_hist_dens, Y_extreme_hist_dens, bin_edges): - fig, ax = plt.subplots(1, 1) - ax.step(bin_edges, np.append(0,Y_hist_dens), label="Before oversampling") - ax.step(bin_edges, np.append(0,Y_extreme_hist_dens), label="After oversampling") + ax.hist(bin_edges[:-1], bin_edges, weights=Y_hist, label="Before oversampling") + ax.hist(bin_edges[:-1], bin_edges, weights=Y_extreme_hist, label="After oversampling") + #ax.plot(bin_edges[:-1] + 0.5 * interval_width, weights, label=f"{subset}", c=colors[subset]) + #ax.step(bin_edges, np.append(0,Y_hist), label="Before oversampling") + #ax.step(bin_edges, np.append(0,Y_extreme_hist), label="After oversampling") #ax.stairs(Y_hist_dens, bin_edges, label="Before oversampling") #ax.stairs(Y_extreme_hist_dens, bin_edges, label="After oversampling") ax.set_title(f"Density Histogram before-after oversampling") diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py index 6723378..b5e76e5 100644 
--- a/mlair/plotting/postprocessing_plotting.py +++ b/mlair/plotting/postprocessing_plotting.py @@ -18,7 +18,7 @@ from matplotlib.backends.backend_pdf import PdfPages from mlair import helpers from mlair.data_handler.iterator import DataCollection -from mlair.helpers import TimeTrackingWrapper +from mlair.helpers import TimeTrackingWrapper, to_list from mlair.plotting.abstract_plot_class import AbstractPlotClass logging.getLogger('matplotlib').setLevel(logging.WARNING) @@ -30,53 +30,140 @@ logging.getLogger('matplotlib').setLevel(logging.WARNING) @TimeTrackingWrapper class PlotOversamplingContingency(AbstractPlotClass): + #Todo: 1. Make competitors flexible + # 2. Get min and max_label - def __init__(self, predictions, labels, plot_folder: str = ".", + def __init__(self, station_names, file_path, comp_path, file_name, plot_folder: str = ".", model_name: str = "nn", + obs_name: str = "obs", comp_names: str = "IntelliO3", plot_names=["oversampling_threat_score", "oversampling_hit_rate", "oversampling_false_alarm_rate", "oversampling_all_scores"]): super().__init__(plot_folder, plot_names[0]) - ts = [] - h = [] - f = [] - max_label = 0 - min_label = 0 - for station in station_names: - file = os.path.join(file_path, file_name % station) - forecast = xr.open_dataarray(file) - competitors = extract_method(station) - for threshold in range(min_label, max_label): - true_above = 0 - false_above = 0 - false_below = 0 - true_below = 0 - for prediction, label in predictions, labels: - if prediction >= threshold: - if label >= threshold: - true_above = + 1 - else: - false_above = + 1 - else: - if label >= threshold: - false_below = + 1 - else: - true_below = + 1 - ts.append(true_above/(true_above+false_above+false_below)) - h.append(true_above/(true_above+false_below)) - f.append(false_above/(false_above+true_below)) - plt.plot(range(min_label, max_label), ts) + self._stations = station_names + self._file_path = file_path + self._comp_path = comp_path + self._file_name = 
file_name + self._model_name = model_name + self._obs_name = obs_name + self._comp_names = to_list(comp_names) + true_above, false_above, false_below, true_below, borders = self._calculate_contingencies() + ts, h, f = self._calculate_scores(true_above, false_above, false_below, true_below) + min_label = borders[0] + max_label = borders[1] + plt.plot(range(min_label, max_label), ts, label="threat score") + plt.legend() self._save() self.plot_name = plot_names[1] - plt.plot(range(min_label, max_label), h) + plt.plot(range(min_label, max_label), h, label="hit rate") + plt.legend() self._save() self.plot_name = plot_names[2] - plt.plot(range(min_label, max_label), f) + plt.plot(range(min_label, max_label), f, label="false alarm rate") + plt.legend() + self._save() self.plot_name = plot_names[3] - plt.plot(range(min_label, max_label), ts) - plt.plot(range(min_label, max_label), h) - plt.plot(range(min_label, max_label), f) + plt.plot(range(min_label, max_label), ts, label="threat score") + plt.plot(range(min_label, max_label), h, label="hit rate") + plt.plot(range(min_label, max_label), f, label="false alarm rate") + plt.legend() self._save() + def _create_competitor_forecast(self, station_name: str, competitor_name: str) -> xr.DataArray: + """ + Load and format the competing forecast of a distinct model indicated by `competitor_name` for a distinct station + indicated by `station_name`. The name of the competitor is set in the `type` axis as indicator. This method will + raise either a `FileNotFoundError` or `KeyError` if no competitor could be found for the given station. Either + there is no file provided in the expected path or no forecast for given `competitor_name` in the forecast file. 
+ + :param station_name: name of the station to load data for + :param competitor_name: name of the model + :return: the forecast of the given competitor + """ + path = os.path.join(self._comp_path, competitor_name) + file = os.path.join(path, f"forecasts_{station_name}_test.nc") + data = xr.open_dataarray(file) + # data = data.expand_dims(Stations=[station_name]) # ToDo: remove line + forecast = data.sel(type=[self._model_name]) + forecast.coords["type"] = [competitor_name] + return forecast + + def _load_competitors(self, station_name: str, comp) -> xr.DataArray: + """ + Load all requested and available competitors for a given station. Forecasts must be available in the competitor + path like `//forecasts__test.nc`. The naming style is equal for all + forecasts of MLAir, so that forecasts of a different experiment can easily be copied into the competitor path + without any change. + + :param station_name: station indicator to load competitors for + + :return: a single xarray with all competing forecasts + """ + competing_predictions = [] + for competitor_name in comp: + try: + prediction = self._create_competitor_forecast(station_name, competitor_name) + competing_predictions.append(prediction) + except (FileNotFoundError, KeyError): + logging.debug(f"No competitor found for combination '{station_name}' and '{competitor_name}'.") + continue + return xr.concat(competing_predictions, "type") if len(competing_predictions) > 0 else None + + def _calculate_contingencies(self): + for station in self._stations: + file = os.path.join(self._file_path, self._file_name % station) + forecast_file = xr.open_dataarray(file) + obs = forecast_file.sel(type=self._obs_name) + model = forecast_file.sel(type=self._model_name) + competitors = [self._load_competitors(station, [comp]).sel(type=comp) for comp in self._comp_names] + min_label = 0 + max_label = 100 + borders = [min_label, max_label] + true_above = [] + false_above = [] + false_below = [] + true_below = [] + for threshold 
in range(min_label, max_label): + ta, fa, fb, tb = self._single_contingency(obs, model, threshold) + true_above.append(ta) + false_above.append(fa) + false_below.append(fb) + true_below.append(tb) + return np.array(true_above), np.array(false_above), np.array(false_below), np.array(true_below), borders + + + def _single_contingency(self, obs, pred, threshold): + ta = 0 + fa = 0 + fb = 0 + tb = 0 + observations = obs.values.flatten() + predictions = pred.values.flatten() + for i in range(len(observations)): + if predictions[i] >= threshold: + if observations[i] >= threshold: + ta += + 1 + else: + fa += + 1 + else: + if observations[i] >= threshold: + fb += 1 + else: + tb += 1 + return ta, fa, fb, tb + + def _calculate_scores(self, true_above, false_above, false_below, true_below): + np.seterr(divide="ignore") + np.seterr(divide="ignore") + ts = true_above/(true_above + false_above + false_below) + h = true_above/(true_above + false_below) + f = false_above/(false_above + true_below) + np.nan_to_num(ts, copy=False) + np.nan_to_num(h, copy=False) + np.nan_to_num(f, copy=False) + return ts, h, f + + + @TimeTrackingWrapper class PlotMonthlySummary(AbstractPlotClass): diff --git a/mlair/run_modules/experiment_setup.py b/mlair/run_modules/experiment_setup.py index cefd450..c5687e3 100644 --- a/mlair/run_modules/experiment_setup.py +++ b/mlair/run_modules/experiment_setup.py @@ -363,7 +363,7 @@ class ExperimentSetup(RunEnvironment): # set competitors self._set_param("competitors", competitors, default=[]) competitor_path_default = os.path.join(self.data_store.get("data_path"), "competitors", - "_".join(self.data_store.get("target_var"))) + "_".join(to_list(self.data_store.get("target_var")))) self._set_param("competitor_path", competitor_path, default=competitor_path_default) # check variables, statistics and target variable diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py index 2febedb..8a59480 100644 --- 
a/mlair/run_modules/post_processing.py +++ b/mlair/run_modules/post_processing.py @@ -309,11 +309,9 @@ class PostProcessing(RunEnvironment): try: if (self.data_store.get('oversampling_method')=='bin_oversampling') and ( "PlotOversamplingContingency" in plot_list): - predictions = None - labels = None - PlotOversamplingContingency(extract_method=self.load_competitors(), station_names=self.test_data.keys(), - file_path=path, file_name=r"forecasts_%s_test.nc", - plot_folder=self.plot_path) + PlotOversamplingContingency(station_names=self.test_data.keys(), file_path=path, comp_path=self.competitor_path, + comp_names=self.competitors, + file_name=r"forecasts_%s_test.nc", plot_folder=self.plot_path) except Exception as e: logging.error(f"Could not create plot OversamplingContingencyPlots due to the following error: {e}") @@ -321,9 +319,7 @@ class PostProcessing(RunEnvironment): if (self.data_store.get('oversampling_method')=='bin_oversampling') and ( "PlotOversampling" in plot_list): bin_edges = self.data_store.get('oversampling_bin_edges') - oversampling_rates = self.data_store.get('oversampling_rates_capped','train') - #Y = self.data_store.get('Oversampling_Y') - #Y_extreme = self.data_store.get('Oversampling_Y_extreme') + oversampling_rates = self.data_store.get('oversampling_rates_capped', 'train') PlotOversampling(self.train_data, bin_edges, oversampling_rates, plot_folder=self.plot_path) except Exception as e: logging.error(f"Could not create plot OversamplingPlots due to the following error: {e}") diff --git a/mlair/run_modules/pre_processing.py b/mlair/run_modules/pre_processing.py index 2d6dc3b..c006552 100644 --- a/mlair/run_modules/pre_processing.py +++ b/mlair/run_modules/pre_processing.py @@ -107,16 +107,6 @@ class PreProcessing(RunEnvironment): #Y_extreme = None for station in data: station.apply_oversampling(bin_edges, oversampling_rates_capped) - ''' - if Y is None: - Y = station._Y - Y_extreme = station._Y_extreme - else: - Y = xr.concat([Y, station._Y], 
dim="Stations") - Y_extreme = xr.concat([Y_extreme, station._Y_extreme], dim="Stations") - self.data_store.set('Oversampling_Y', Y) - self.data_store.set('Oversampling_Y_extreme', Y_extreme) - ''' def report_pre_processing(self): -- GitLab From ca894beedfb143274f204ae29f2cd96178950c06 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Tue, 27 Jul 2021 11:05:25 +0200 Subject: [PATCH 28/58] added to_list import --- mlair/run_modules/experiment_setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mlair/run_modules/experiment_setup.py b/mlair/run_modules/experiment_setup.py index 4755fff..1bd37a6 100644 --- a/mlair/run_modules/experiment_setup.py +++ b/mlair/run_modules/experiment_setup.py @@ -10,6 +10,7 @@ from dill.source import getsource from mlair.configuration import path_config from mlair import helpers +from mlair.helpers import to_list from mlair.configuration.defaults import DEFAULT_STATIONS, DEFAULT_VAR_ALL_DICT, DEFAULT_NETWORK, DEFAULT_STATION_TYPE, \ DEFAULT_START, DEFAULT_END, DEFAULT_WINDOW_HISTORY_SIZE, DEFAULT_OVERWRITE_LOCAL_DATA, \ DEFAULT_HPC_LOGIN_LIST, DEFAULT_HPC_HOST_LIST, DEFAULT_CREATE_NEW_MODEL, DEFAULT_TRAIN_MODEL, \ -- GitLab From ec8e445b36c9ace337a2b327cc648d395b1340a6 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Tue, 27 Jul 2021 11:22:18 +0200 Subject: [PATCH 29/58] arranged default_plot_list in test in the right order --- mlair/configuration/defaults.py | 4 ++-- test/test_configuration/test_defaults.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlair/configuration/defaults.py b/mlair/configuration/defaults.py index 0081541..47aaf08 100644 --- a/mlair/configuration/defaults.py +++ b/mlair/configuration/defaults.py @@ -50,8 +50,8 @@ DEFAULT_BOOTSTRAP_TYPE = "singleinput" DEFAULT_BOOTSTRAP_METHOD = "shuffle" DEFAULT_PLOT_LIST = ["PlotMonthlySummary", "PlotStationMap", "PlotClimatologicalSkillScore", "PlotTimeSeries", "PlotCompetitiveSkillScore", "PlotBootstrapSkillScore", "PlotConditionalQuantiles", - 
"PlotAvailability", "PlotAvailabilityHistogram", "PlotDataHistogram", "PlotOversampling", - "PlotOversamplingContingency", "PlotPeriodogram"] + "PlotAvailability", "PlotAvailabilityHistogram", "PlotDataHistogram", "PlotPeriodogram", + "PlotOversampling", "PlotOversamplingContingency"] DEFAULT_SAMPLING = "daily" DEFAULT_DATA_ORIGIN = {"cloudcover": "REA", "humidity": "REA", "pblheight": "REA", "press": "REA", "relhum": "REA", "temp": "REA", "totprecip": "REA", "u": "REA", "v": "REA", "no": "", "no2": "", "o3": "", diff --git a/test/test_configuration/test_defaults.py b/test/test_configuration/test_defaults.py index 27f38ce..bef3c98 100644 --- a/test/test_configuration/test_defaults.py +++ b/test/test_configuration/test_defaults.py @@ -68,5 +68,5 @@ class TestAllDefaults: assert DEFAULT_PLOT_LIST == ["PlotMonthlySummary", "PlotStationMap", "PlotClimatologicalSkillScore", "PlotTimeSeries", "PlotCompetitiveSkillScore", "PlotBootstrapSkillScore", "PlotConditionalQuantiles", "PlotAvailability", "PlotAvailabilityHistogram", - "PlotDataHistogram", "PlotPeriodogram","PlotOversampling", + "PlotDataHistogram", "PlotPeriodogram", "PlotOversampling", "PlotOversamplingContingency"] -- GitLab From 8ad74c53051a9e27870c58ef97c5cf5e1c4d51f1 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Mon, 2 Aug 2021 16:30:23 +0200 Subject: [PATCH 30/58] Made PlotOversamplingContingency plot competitors --- mlair/plotting/data_insight_plotting.py | 11 +- mlair/plotting/postprocessing_plotting.py | 123 ++++++++++++++-------- mlair/run_modules/post_processing.py | 4 +- 3 files changed, 82 insertions(+), 56 deletions(-) diff --git a/mlair/plotting/data_insight_plotting.py b/mlair/plotting/data_insight_plotting.py index c4c1f4a..0a3b28c 100644 --- a/mlair/plotting/data_insight_plotting.py +++ b/mlair/plotting/data_insight_plotting.py @@ -23,8 +23,6 @@ from mlair.plotting.abstract_plot_class import AbstractPlotClass @TimeTrackingWrapper class PlotOversampling(AbstractPlotClass): - #Todo: Build 
histograms correctly - def __init__(self, data, bin_edges, oversampling_rates, plot_folder: str = ".", plot_names=["oversampling_histogram", "oversampling_density_histogram", "oversampling_rates", "oversampling_rates_deviation"]): @@ -59,13 +57,8 @@ class PlotOversampling(AbstractPlotClass): def _plot_oversampling_histogram(self, Y_hist, Y_extreme_hist, bin_edges): fig, ax = plt.subplots(1, 1) - ax.hist(bin_edges[:-1], bin_edges, weights=Y_hist, label="Before oversampling") - ax.hist(bin_edges[:-1], bin_edges, weights=Y_extreme_hist, label="After oversampling") - #ax.plot(bin_edges[:-1] + 0.5 * interval_width, weights, label=f"{subset}", c=colors[subset]) - #ax.step(bin_edges, np.append(0,Y_hist), label="Before oversampling") - #ax.step(bin_edges, np.append(0,Y_extreme_hist), label="After oversampling") - #ax.stairs(Y_hist_dens, bin_edges, label="Before oversampling") - #ax.stairs(Y_extreme_hist_dens, bin_edges, label="After oversampling") + ax.hist(bin_edges[:-1], bin_edges, weights=Y_hist, label="Before oversampling", histtype="step") + ax.hist(bin_edges[:-1], bin_edges, weights=Y_extreme_hist, label="After oversampling", histtype="step") ax.set_title(f"Density Histogram before-after oversampling") ax.legend() diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py index 29ed405..a0d54c1 100644 --- a/mlair/plotting/postprocessing_plotting.py +++ b/mlair/plotting/postprocessing_plotting.py @@ -30,8 +30,7 @@ logging.getLogger('matplotlib').setLevel(logging.WARNING) @TimeTrackingWrapper class PlotOversamplingContingency(AbstractPlotClass): - #Todo: 1. Make competitors flexible - # 2. 
Get min and max_label + #Todo: Get min and max_label def __init__(self, station_names, file_path, comp_path, file_name, plot_folder: str = ".", model_name: str = "nn", obs_name: str = "obs", comp_names: str = "IntelliO3", @@ -43,31 +42,40 @@ class PlotOversamplingContingency(AbstractPlotClass): self._file_path = file_path self._comp_path = comp_path self._file_name = file_name - self._model_name = model_name self._obs_name = obs_name + self._model_name = model_name self._comp_names = to_list(comp_names) - true_above, false_above, false_below, true_below, borders = self._calculate_contingencies() - ts, h, f = self._calculate_scores(true_above, false_above, false_below, true_below) - min_label = borders[0] - max_label = borders[1] - plt.plot(range(min_label, max_label), ts, label="threat score") - plt.legend() + self._all_names = [self._model_name] + self._all_names.extend(self._comp_names) + self._plot_names = plot_names + contingency_array, borders = self._calculate_contingencies() + self._scores = ["ts", "h", "f"] + score_array = self._calculate_all_scores(contingency_array) + self._min_label = borders[0] + self._max_label = borders[1] + self._plot_counter = 0 + + self._plot(score_array, "ts") self._save() - self.plot_name = plot_names[1] - plt.plot(range(min_label, max_label), h, label="hit rate") - plt.legend() + self._plot(score_array, "h") self._save() - self.plot_name = plot_names[2] - plt.plot(range(min_label, max_label), f, label="false alarm rate") - plt.legend() + self._plot(score_array, "f") self._save() - self.plot_name = plot_names[3] - plt.plot(range(min_label, max_label), ts, label="threat score") - plt.plot(range(min_label, max_label), h, label="hit rate") - plt.plot(range(min_label, max_label), f, label="false alarm rate") - plt.legend() + self._plot(score_array, "all_scores") self._save() + def _plot(self, data, score): + if score == "all_scores": + for score_name in data.scores.values.tolist(): + plt.plot(range(self._min_label, self._max_label), 
data.loc[dict(type="nn", scores=score_name)], label=score_name) + else: + for type in data.type.values.tolist(): + plt.plot(range(self._min_label, self._max_label), data.loc[dict(type=type, scores=score)], label=type) + plt.legend() + self.plot_name = self._plot_names[self._plot_counter] + self._plot_counter = self._plot_counter + 1 + + def _create_competitor_forecast(self, station_name: str, competitor_name: str) -> xr.DataArray: """ Load and format the competing forecast of a distinct model indicated by `competitor_name` for a distinct station @@ -109,27 +117,29 @@ class PlotOversamplingContingency(AbstractPlotClass): return xr.concat(competing_predictions, "type") if len(competing_predictions) > 0 else None def _calculate_contingencies(self): + min_label = 0 + max_label = 100 + borders = [min_label, max_label] + thresholds = np.arange(min_label, max_label) + contingency_cell = ["ta", "fa", "fb", "tb"] + contingency_array = xr.DataArray(dims=["thresholds", "contingency_cell", "type"], + coords=[thresholds, contingency_cell, self._all_names]) + contingency_array = contingency_array.fillna(0) for station in self._stations: file = os.path.join(self._file_path, self._file_name % station) forecast_file = xr.open_dataarray(file) obs = forecast_file.sel(type=self._obs_name) - model = forecast_file.sel(type=self._model_name) + predictions = [forecast_file.sel(type=self._model_name)] competitors = [self._load_competitors(station, [comp]).sel(type=comp) for comp in self._comp_names] - min_label = 0 - max_label = 100 - borders = [min_label, max_label] - true_above = [] - false_above = [] - false_below = [] - true_below = [] + predictions.extend(competitors) for threshold in range(min_label, max_label): - ta, fa, fb, tb = self._single_contingency(obs, model, threshold) - true_above.append(ta) - false_above.append(fa) - false_below.append(fb) - true_below.append(tb) - return np.array(true_above), np.array(false_above), np.array(false_below), np.array(true_below), borders - + 
for pred in predictions: + ta, fa, fb, tb = self._single_contingency(obs, pred, threshold) + contingency_array.loc[dict(thresholds=threshold, contingency_cell="ta", type=pred.type.values)] = ta + contingency_array.loc[dict(thresholds=threshold, contingency_cell="fa", type=pred.type.values)] = fa + contingency_array.loc[dict(thresholds=threshold, contingency_cell="fb", type=pred.type.values)] = fb + contingency_array.loc[dict(thresholds=threshold, contingency_cell="tb", type=pred.type.values)] = tb + return contingency_array, borders def _single_contingency(self, obs, pred, threshold): ta = 0 @@ -151,16 +161,39 @@ class PlotOversamplingContingency(AbstractPlotClass): tb += 1 return ta, fa, fb, tb - def _calculate_scores(self, true_above, false_above, false_below, true_below): - np.seterr(divide="ignore") - np.seterr(divide="ignore") - ts = true_above/(true_above + false_above + false_below) - h = true_above/(true_above + false_below) - f = false_above/(false_above + true_below) - np.nan_to_num(ts, copy=False) - np.nan_to_num(h, copy=False) - np.nan_to_num(f, copy=False) - return ts, h, f + def _calculate_all_scores(self, contingency_array): + score_array = xr.DataArray(dims=["scores", "thresholds", "type"], + coords=[self._scores, contingency_array.thresholds.values, + contingency_array.type.values]) + for type in score_array.type.values.tolist(): + for threshold in score_array.thresholds.values.tolist(): + for score in score_array.scores.values.tolist(): + score_value = self._calculate_scores(contingency_array.loc[dict(type=type, + thresholds=threshold)].values, score) + score_array.loc[dict(type=type, thresholds=threshold, scores=score)] = score_value + return score_array + + def _calculate_scores(self, contingency, score): + true_above = contingency[0] + false_above = contingency[1] + false_below = contingency[2] + true_below = contingency[3] + if score == "ts": + if (true_above + false_above + false_below) == 0: + score_value = 1 + else: + score_value = 
true_above/(true_above + false_above + false_below) + elif score == "h": + if (true_above + false_below) == 0: + score_value = 1 + else: + score_value = true_above/(true_above + false_below) + elif score == "f": + if (false_above + true_below) == 0: + score_value = 1 + else: + score_value = false_above/(false_above + true_below) + return score_value diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py index cef2c65..80d40a6 100644 --- a/mlair/run_modules/post_processing.py +++ b/mlair/run_modules/post_processing.py @@ -333,8 +333,8 @@ class PostProcessing(RunEnvironment): try: if (self.data_store.get('oversampling_method')=='bin_oversampling') and ( "PlotOversamplingContingency" in plot_list): - PlotOversamplingContingency(station_names=self.test_data.keys(), file_path=path, comp_path=self.competitor_path, - comp_names=self.competitors, + PlotOversamplingContingency(station_names=self.test_data.keys(), file_path=path, + comp_path=self.competitor_path, comp_names=self.competitors, file_name=r"forecasts_%s_test.nc", plot_folder=self.plot_path) except Exception as e: logging.error(f"Could not create plot OversamplingContingencyPlots due to the following error: {e}") -- GitLab From 527ddd38953581e49092975e68cde1faf18abcbd Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Wed, 4 Aug 2021 09:01:50 +0200 Subject: [PATCH 31/58] Made PlotOversamplingContingency get min and max_threshold --- mlair/plotting/postprocessing_plotting.py | 30 ++++++++++++++--------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py index a0d54c1..ed11483 100644 --- a/mlair/plotting/postprocessing_plotting.py +++ b/mlair/plotting/postprocessing_plotting.py @@ -48,11 +48,10 @@ class PlotOversamplingContingency(AbstractPlotClass): self._all_names = [self._model_name] self._all_names.extend(self._comp_names) self._plot_names = plot_names - contingency_array, borders 
= self._calculate_contingencies() + self._min_threshold, self._max_threshold = self._min_max_threshold() + contingency_array = self._calculate_contingencies() self._scores = ["ts", "h", "f"] score_array = self._calculate_all_scores(contingency_array) - self._min_label = borders[0] - self._max_label = borders[1] self._plot_counter = 0 self._plot(score_array, "ts") @@ -67,10 +66,10 @@ class PlotOversamplingContingency(AbstractPlotClass): def _plot(self, data, score): if score == "all_scores": for score_name in data.scores.values.tolist(): - plt.plot(range(self._min_label, self._max_label), data.loc[dict(type="nn", scores=score_name)], label=score_name) + plt.plot(range(self._min_threshold, self._max_threshold), data.loc[dict(type="nn", scores=score_name)], label=score_name) else: for type in data.type.values.tolist(): - plt.plot(range(self._min_label, self._max_label), data.loc[dict(type=type, scores=score)], label=type) + plt.plot(range(self._min_threshold, self._max_threshold), data.loc[dict(type=type, scores=score)], label=type) plt.legend() self.plot_name = self._plot_names[self._plot_counter] self._plot_counter = self._plot_counter + 1 @@ -116,11 +115,20 @@ class PlotOversamplingContingency(AbstractPlotClass): continue return xr.concat(competing_predictions, "type") if len(competing_predictions) > 0 else None + def _min_max_threshold(self): + min_threshold = 0 + max_threshold = 0 + for station in self._stations: + file = os.path.join(self._file_path, self._file_name % station) + forecast_file = xr.open_dataarray(file) + obs = forecast_file.sel(type=self._obs_name) + obs = obs.fillna(0) + min_threshold = np.minimum(min_threshold, int(np.min(obs.values.flatten()))) + max_threshold = np.maximum(max_threshold, int(np.max(obs.values.flatten()))) + return min_threshold, max_threshold + def _calculate_contingencies(self): - min_label = 0 - max_label = 100 - borders = [min_label, max_label] - thresholds = np.arange(min_label, max_label) + thresholds = 
np.arange(self._min_threshold, self._max_threshold) contingency_cell = ["ta", "fa", "fb", "tb"] contingency_array = xr.DataArray(dims=["thresholds", "contingency_cell", "type"], coords=[thresholds, contingency_cell, self._all_names]) @@ -132,14 +140,14 @@ class PlotOversamplingContingency(AbstractPlotClass): predictions = [forecast_file.sel(type=self._model_name)] competitors = [self._load_competitors(station, [comp]).sel(type=comp) for comp in self._comp_names] predictions.extend(competitors) - for threshold in range(min_label, max_label): + for threshold in range(self._min_threshold, self._max_threshold): for pred in predictions: ta, fa, fb, tb = self._single_contingency(obs, pred, threshold) contingency_array.loc[dict(thresholds=threshold, contingency_cell="ta", type=pred.type.values)] = ta contingency_array.loc[dict(thresholds=threshold, contingency_cell="fa", type=pred.type.values)] = fa contingency_array.loc[dict(thresholds=threshold, contingency_cell="fb", type=pred.type.values)] = fb contingency_array.loc[dict(thresholds=threshold, contingency_cell="tb", type=pred.type.values)] = tb - return contingency_array, borders + return contingency_array def _single_contingency(self, obs, pred, threshold): ta = 0 -- GitLab From c5e0e40ee0b7aecbf3fc7c8ccdd687eecccca718 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Wed, 4 Aug 2021 09:27:53 +0200 Subject: [PATCH 32/58] Gives ValueError if window_lead_time of competitors and model dont match --- mlair/run_modules/post_processing.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py index 80d40a6..4defc4d 100644 --- a/mlair/run_modules/post_processing.py +++ b/mlair/run_modules/post_processing.py @@ -138,7 +138,16 @@ class PostProcessing(RunEnvironment): except (FileNotFoundError, KeyError): logging.debug(f"No competitor found for combination '{station_name}' and '{competitor_name}'.") continue - return 
xr.concat(competing_predictions, "type") if len(competing_predictions) > 0 else None + if len(competing_predictions)==0: + return None + else: + comp_array = xr.concat(competing_predictions, "type") + if len(comp_array.coords[self.ahead_dim]) == self.window_lead_time: + return comp_array + else: + raise ValueError(f"Ahead dimensions of competitors do not match model." + f" Competitor ahead: {len(comp_array.coords[self.ahead_dim])}" + f" but window_lead_time is {self.window_lead_time}.") def bootstrap_postprocessing(self, create_new_bootstraps: bool, _iter: int = 0, bootstrap_type="singleinput", bootstrap_method="shuffle") -> None: -- GitLab From 1a6a693add27dae6ff2d88a14b64f0b63eb888df Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Wed, 4 Aug 2021 10:36:06 +0200 Subject: [PATCH 33/58] Added bias and titles to PlotOversamplingContingency --- mlair/plotting/postprocessing_plotting.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py index ed11483..6801add 100644 --- a/mlair/plotting/postprocessing_plotting.py +++ b/mlair/plotting/postprocessing_plotting.py @@ -35,7 +35,7 @@ class PlotOversamplingContingency(AbstractPlotClass): def __init__(self, station_names, file_path, comp_path, file_name, plot_folder: str = ".", model_name: str = "nn", obs_name: str = "obs", comp_names: str = "IntelliO3", plot_names=["oversampling_threat_score", "oversampling_hit_rate", "oversampling_false_alarm_rate", - "oversampling_all_scores"]): + "oversampling_bias", "oversampling_all_scores"]): super().__init__(plot_folder, plot_names[0]) self._stations = station_names @@ -50,7 +50,7 @@ class PlotOversamplingContingency(AbstractPlotClass): self._plot_names = plot_names self._min_threshold, self._max_threshold = self._min_max_threshold() contingency_array = self._calculate_contingencies() - self._scores = ["ts", "h", "f"] + self._scores = ["ts", "h", "f", "b"] 
score_array = self._calculate_all_scores(contingency_array) self._plot_counter = 0 @@ -60,6 +60,8 @@ class PlotOversamplingContingency(AbstractPlotClass): self._save() self._plot(score_array, "f") self._save() + self._plot(score_array, "b") + self._save() self._plot(score_array, "all_scores") self._save() @@ -70,6 +72,7 @@ class PlotOversamplingContingency(AbstractPlotClass): else: for type in data.type.values.tolist(): plt.plot(range(self._min_threshold, self._max_threshold), data.loc[dict(type=type, scores=score)], label=type) + plt.title(self._plot_names[self._plot_counter][13:]) plt.legend() self.plot_name = self._plot_names[self._plot_counter] self._plot_counter = self._plot_counter + 1 @@ -132,7 +135,7 @@ class PlotOversamplingContingency(AbstractPlotClass): contingency_cell = ["ta", "fa", "fb", "tb"] contingency_array = xr.DataArray(dims=["thresholds", "contingency_cell", "type"], coords=[thresholds, contingency_cell, self._all_names]) - contingency_array = contingency_array.fillna(0) + contingency_array = contingency_array.fillna(1) for station in self._stations: file = os.path.join(self._file_path, self._file_name % station) forecast_file = xr.open_dataarray(file) @@ -193,14 +196,19 @@ class PlotOversamplingContingency(AbstractPlotClass): score_value = true_above/(true_above + false_above + false_below) elif score == "h": if (true_above + false_below) == 0: - score_value = 1 + score_value = 0 else: score_value = true_above/(true_above + false_below) elif score == "f": if (false_above + true_below) == 0: - score_value = 1 + score_value = 0 else: score_value = false_above/(false_above + true_below) + elif score == "b": + if (true_above + false_below) == 0: + score_value = 0 + else: + score_value = (true_above + false_above)/(true_above + false_below) return score_value -- GitLab From 24efafbb30cad99ab7b28f76ab2aaa3806933ac0 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Wed, 4 Aug 2021 13:03:18 +0200 Subject: [PATCH 34/58] Updated run-scripts for hdfml 
--- run_with_oversampling.py | 4 ++-- run_without_oversampling.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/run_with_oversampling.py b/run_with_oversampling.py index 2d8ddb4..0effa8c 100644 --- a/run_with_oversampling.py +++ b/run_with_oversampling.py @@ -30,9 +30,9 @@ def main(parser_args): train_model=True, create_new_model=True, network="UBA", model=IntelliO3_ts_architecture, oversampling_method="bin_oversampling", evaluate_bootstraps=False, # plot_list=["PlotCompetitiveSkillScore"], - competitors=["test_model", "test_model2"], + competitors=["IntelliO3"], competitor_path=os.path.join(os.getcwd(), "data", "comp_test"), - window_lead_time=2, oversampling_bins=10, oversampling_rates_cap=100, + window_lead_time=1, oversampling_bins=10, oversampling_rates_cap=100, **parser_args.__dict__) workflow.run() diff --git a/run_without_oversampling.py b/run_without_oversampling.py index 64620ea..d888f18 100644 --- a/run_without_oversampling.py +++ b/run_without_oversampling.py @@ -30,9 +30,9 @@ def main(parser_args): train_model=True, create_new_model=True, network="UBA", model=IntelliO3_ts_architecture, evaluate_bootstraps=False, # plot_list=["PlotCompetitiveSkillScore"], - competitors=["test_model", "test_model2"], + #competitors=["test_model", "test_model2"], competitor_path=os.path.join(os.getcwd(), "data", "comp_test"), - window_lead_time=2, oversampling_bins=10, oversampling_rates_cap=100, + window_lead_time=1, #oversampling_bins=10, oversampling_rates_cap=100, **parser_args.__dict__) workflow.run() -- GitLab From b15528b91d93da42f4f473c5e09d27f4224088b6 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Mon, 9 Aug 2021 11:42:43 +0200 Subject: [PATCH 35/58] Before making new branch --- run_without_oversampling.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/run_without_oversampling.py b/run_without_oversampling.py index d888f18..c1714e6 100644 --- a/run_without_oversampling.py +++ b/run_without_oversampling.py @@ 
-24,9 +24,9 @@ def load_stations(external_station_list=None): def main(parser_args): plots = remove_items(DEFAULT_PLOT_LIST, "PlotConditionalQuantiles") - workflow = DefaultWorkflowHPC(stations=load_stations('supplement/German_background_stations.json'), - #stations=["DEBW013", "DEBW087", "DEBW107", "DEBW076"], - epochs=150, + workflow = DefaultWorkflowHPC(#stations=load_stations('supplement/German_background_stations.json'), + stations=["DEBW013", "DEBW087", "DEBW107", "DEBW076"], + epochs=1, train_model=True, create_new_model=True, network="UBA", model=IntelliO3_ts_architecture, evaluate_bootstraps=False, # plot_list=["PlotCompetitiveSkillScore"], -- GitLab From caf68737cec1e90ec2d558f89b0739aed59a4ffa Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Fri, 13 Aug 2021 13:42:40 +0200 Subject: [PATCH 36/58] Workaround in statistics.py, climatological_skill_scores --- mlair/helpers/statistics.py | 46 ++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/mlair/helpers/statistics.py b/mlair/helpers/statistics.py index a1e713a..b63cbe8 100644 --- a/mlair/helpers/statistics.py +++ b/mlair/helpers/statistics.py @@ -301,7 +301,7 @@ class SkillScores: observation_name=self.observation_name) for (first, second) in combinations] return skill_score - + ''' def climatological_skill_scores(self, internal_data: Data, forecast_name: str) -> xr.DataArray: """ Calculate climatological skill scores according to Murphy (1988). @@ -341,6 +341,50 @@ class SkillScores: external_data=external_data).values.flatten()) return skill_score + ''' + + def climatological_skill_scores(self, internal_data: Data, forecast_name: str) -> xr.DataArray: + """ + Calculate climatological skill scores according to Murphy (1988). + + Calculate all CASES I - IV and terms [ABC][I-IV]. Internal data has to be set by initialisation, external data + is part of parameters. 
+ + :param internal_data: internal data + :param forecast_name: name of the forecast to use for this calculation (must be available in `data`) + + :return: all CASES as well as all terms + """ + if self.external_data is None: + ahead_names = [] + else: + ahead_names = list(self.external_data[self.ahead_dim].data) + + all_terms = ['AI', 'AII', 'AIII', 'AIV', 'BI', 'BII', 'BIV', 'CI', 'CIV', 'CASE I', 'CASE II', 'CASE III', + 'CASE IV'] + skill_score = xr.DataArray(np.full((len(all_terms), len(ahead_names)), np.nan), coords=[all_terms, ahead_names], + dims=['terms', self.ahead_dim]) + + for iahead in ahead_names: + data = internal_data.sel({self.ahead_dim: iahead}) + + skill_score.loc[["CASE I", "AI", "BI", "CI"], iahead] = np.stack(self._climatological_skill_score( + data, mu_type=1, forecast_name=forecast_name, observation_name=self.observation_name).values.flatten()) + + skill_score.loc[["CASE II", "AII", "BII"], iahead] = np.stack(self._climatological_skill_score( + data, mu_type=2, forecast_name=forecast_name, observation_name=self.observation_name).values.flatten()) + + if self.external_data is not None and self.observation_name in self.external_data.coords["type"]: + external_data = self.external_data.sel({self.ahead_dim: iahead, "type": [self.observation_name]}) + skill_score.loc[["CASE III", "AIII"], iahead] = np.stack(self._climatological_skill_score( + data, mu_type=3, forecast_name=forecast_name, observation_name=self.observation_name, + external_data=external_data).values.flatten()) + + skill_score.loc[["CASE IV", "AIV", "BIV", "CIV"], iahead] = np.stack(self._climatological_skill_score( + data, mu_type=4, forecast_name=forecast_name, observation_name=self.observation_name, + external_data=external_data).values.flatten()) + + return skill_score def _climatological_skill_score(self, internal_data, observation_name, forecast_name, mu_type=1, external_data=None): -- GitLab From a21535e36ca27ab656b2aee355f91a53513f9df0 Mon Sep 17 00:00:00 2001 From: 
"v.gramlich1" Date: Sat, 14 Aug 2021 13:56:08 +0200 Subject: [PATCH 37/58] Fixed the problem in _load_competitors in post_processing and postprocessing_plotting, reverted the workaround in statistics --- mlair/helpers/statistics.py | 3 ++- mlair/plotting/postprocessing_plotting.py | 11 ++++++++++- mlair/run_modules/post_processing.py | 11 +---------- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/mlair/helpers/statistics.py b/mlair/helpers/statistics.py index b63cbe8..90f4a8c 100644 --- a/mlair/helpers/statistics.py +++ b/mlair/helpers/statistics.py @@ -301,7 +301,7 @@ class SkillScores: observation_name=self.observation_name) for (first, second) in combinations] return skill_score - ''' + def climatological_skill_scores(self, internal_data: Data, forecast_name: str) -> xr.DataArray: """ Calculate climatological skill scores according to Murphy (1988). @@ -385,6 +385,7 @@ class SkillScores: external_data=external_data).values.flatten()) return skill_score + ''' def _climatological_skill_score(self, internal_data, observation_name, forecast_name, mu_type=1, external_data=None): diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py index 6801add..b7c0366 100644 --- a/mlair/plotting/postprocessing_plotting.py +++ b/mlair/plotting/postprocessing_plotting.py @@ -116,7 +116,16 @@ class PlotOversamplingContingency(AbstractPlotClass): except (FileNotFoundError, KeyError): logging.debug(f"No competitor found for combination '{station_name}' and '{competitor_name}'.") continue - return xr.concat(competing_predictions, "type") if len(competing_predictions) > 0 else None + if len(competing_predictions)==0: + return None + else: + comp_array = xr.concat(competing_predictions, "type") + if len(comp_array.coords[self.ahead_dim]) == self.window_lead_time: + return comp_array + else: + raise ValueError(f"Ahead dimensions of competitors do not match model." 
+ f" Competitor ahead: {len(comp_array.coords[self.ahead_dim])}" + f" but window_lead_time is {self.window_lead_time}.") def _min_max_threshold(self): min_threshold = 0 diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py index 4defc4d..80d40a6 100644 --- a/mlair/run_modules/post_processing.py +++ b/mlair/run_modules/post_processing.py @@ -138,16 +138,7 @@ class PostProcessing(RunEnvironment): except (FileNotFoundError, KeyError): logging.debug(f"No competitor found for combination '{station_name}' and '{competitor_name}'.") continue - if len(competing_predictions)==0: - return None - else: - comp_array = xr.concat(competing_predictions, "type") - if len(comp_array.coords[self.ahead_dim]) == self.window_lead_time: - return comp_array - else: - raise ValueError(f"Ahead dimensions of competitors do not match model." - f" Competitor ahead: {len(comp_array.coords[self.ahead_dim])}" - f" but window_lead_time is {self.window_lead_time}.") + return xr.concat(competing_predictions, "type") if len(competing_predictions) > 0 else None def bootstrap_postprocessing(self, create_new_bootstraps: bool, _iter: int = 0, bootstrap_type="singleinput", bootstrap_method="shuffle") -> None: -- GitLab From f7b7f37a1b482738c91a11d9a4080d6ae998dc8d Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Sat, 14 Aug 2021 16:18:48 +0200 Subject: [PATCH 38/58] reverted last commit --- mlair/helpers/statistics.py | 3 +-- mlair/plotting/postprocessing_plotting.py | 11 +---------- mlair/run_modules/post_processing.py | 11 ++++++++++- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/mlair/helpers/statistics.py b/mlair/helpers/statistics.py index 90f4a8c..b63cbe8 100644 --- a/mlair/helpers/statistics.py +++ b/mlair/helpers/statistics.py @@ -301,7 +301,7 @@ class SkillScores: observation_name=self.observation_name) for (first, second) in combinations] return skill_score - + ''' def climatological_skill_scores(self, internal_data: Data, forecast_name: 
str) -> xr.DataArray: """ Calculate climatological skill scores according to Murphy (1988). @@ -385,7 +385,6 @@ class SkillScores: external_data=external_data).values.flatten()) return skill_score - ''' def _climatological_skill_score(self, internal_data, observation_name, forecast_name, mu_type=1, external_data=None): diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py index b7c0366..6801add 100644 --- a/mlair/plotting/postprocessing_plotting.py +++ b/mlair/plotting/postprocessing_plotting.py @@ -116,16 +116,7 @@ class PlotOversamplingContingency(AbstractPlotClass): except (FileNotFoundError, KeyError): logging.debug(f"No competitor found for combination '{station_name}' and '{competitor_name}'.") continue - if len(competing_predictions)==0: - return None - else: - comp_array = xr.concat(competing_predictions, "type") - if len(comp_array.coords[self.ahead_dim]) == self.window_lead_time: - return comp_array - else: - raise ValueError(f"Ahead dimensions of competitors do not match model." 
- f" Competitor ahead: {len(comp_array.coords[self.ahead_dim])}" - f" but window_lead_time is {self.window_lead_time}.") + return xr.concat(competing_predictions, "type") if len(competing_predictions) > 0 else None def _min_max_threshold(self): min_threshold = 0 diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py index 80d40a6..0444ccf 100644 --- a/mlair/run_modules/post_processing.py +++ b/mlair/run_modules/post_processing.py @@ -138,7 +138,16 @@ class PostProcessing(RunEnvironment): except (FileNotFoundError, KeyError): logging.debug(f"No competitor found for combination '{station_name}' and '{competitor_name}'.") continue - return xr.concat(competing_predictions, "type") if len(competing_predictions) > 0 else None + if len(competing_predictions) == 0: + return None + else: + comp_array = xr.concat(competing_predictions, "type") + if len(comp_array.coords[self.ahead_dim]) == self.window_lead_time: + return comp_array + else: + raise ValueError(f"Ahead dimensions of competitors do not match model." 
+ f" Competitor ahead: {len(comp_array.coords[self.ahead_dim])}" + f" but window_lead_time is {self.window_lead_time}.") def bootstrap_postprocessing(self, create_new_bootstraps: bool, _iter: int = 0, bootstrap_type="singleinput", bootstrap_method="shuffle") -> None: -- GitLab From 43127055212eaeba77fd454453ac84025e9adbed Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Mon, 16 Aug 2021 10:52:27 +0200 Subject: [PATCH 39/58] For debugging on HPC --- mlair/helpers/statistics.py | 5 +---- mlair/plotting/postprocessing_plotting.py | 10 +++++++++- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/mlair/helpers/statistics.py b/mlair/helpers/statistics.py index b63cbe8..2f47bac 100644 --- a/mlair/helpers/statistics.py +++ b/mlair/helpers/statistics.py @@ -355,10 +355,7 @@ class SkillScores: :return: all CASES as well as all terms """ - if self.external_data is None: - ahead_names = [] - else: - ahead_names = list(self.external_data[self.ahead_dim].data) + ahead_names = list(internal_data[self.ahead_dim].data) all_terms = ['AI', 'AII', 'AIII', 'AIV', 'BI', 'BII', 'BIV', 'CI', 'CIV', 'CASE I', 'CASE II', 'CASE III', 'CASE IV'] diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py index 6801add..55f74a1 100644 --- a/mlair/plotting/postprocessing_plotting.py +++ b/mlair/plotting/postprocessing_plotting.py @@ -121,7 +121,9 @@ class PlotOversamplingContingency(AbstractPlotClass): def _min_max_threshold(self): min_threshold = 0 max_threshold = 0 + logging.info("min_max thresholds") for station in self._stations: + logging.info(f"{station}") file = os.path.join(self._file_path, self._file_name % station) forecast_file = xr.open_dataarray(file) obs = forecast_file.sel(type=self._obs_name) @@ -139,17 +141,23 @@ class PlotOversamplingContingency(AbstractPlotClass): for station in self._stations: file = os.path.join(self._file_path, self._file_name % station) forecast_file = xr.open_dataarray(file) + 
logging.info(f"{station}: load obs") obs = forecast_file.sel(type=self._obs_name) + logging.info(f"{station}: load pred") predictions = [forecast_file.sel(type=self._model_name)] + logging.info(f"{station}: load comp") competitors = [self._load_competitors(station, [comp]).sel(type=comp) for comp in self._comp_names] predictions.extend(competitors) + logging.info(f"itearate over thresholds") for threshold in range(self._min_threshold, self._max_threshold): - for pred in predictions: + for i, pred in enumerate(predictions): + logging.info(i) ta, fa, fb, tb = self._single_contingency(obs, pred, threshold) contingency_array.loc[dict(thresholds=threshold, contingency_cell="ta", type=pred.type.values)] = ta contingency_array.loc[dict(thresholds=threshold, contingency_cell="fa", type=pred.type.values)] = fa contingency_array.loc[dict(thresholds=threshold, contingency_cell="fb", type=pred.type.values)] = fb contingency_array.loc[dict(thresholds=threshold, contingency_cell="tb", type=pred.type.values)] = tb + logging.info(f"{station}: finished") return contingency_array def _single_contingency(self, obs, pred, threshold): -- GitLab From da4e0ebcace899a8fb9ba766b917c5676745bd83 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Mon, 16 Aug 2021 12:49:33 +0200 Subject: [PATCH 40/58] Fixed error --- mlair/plotting/postprocessing_plotting.py | 6 ++++-- run_with_oversampling.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py index 55f74a1..bf73f18 100644 --- a/mlair/plotting/postprocessing_plotting.py +++ b/mlair/plotting/postprocessing_plotting.py @@ -146,8 +146,10 @@ class PlotOversamplingContingency(AbstractPlotClass): logging.info(f"{station}: load pred") predictions = [forecast_file.sel(type=self._model_name)] logging.info(f"{station}: load comp") - competitors = [self._load_competitors(station, [comp]).sel(type=comp) for comp in self._comp_names] - 
predictions.extend(competitors) + for comp in self._comp_names: + c = self._load_competitors(station, [comp]) + if c is not None: + predictions.append(c.sel(type=comp)) logging.info(f"itearate over thresholds") for threshold in range(self._min_threshold, self._max_threshold): for i, pred in enumerate(predictions): diff --git a/run_with_oversampling.py b/run_with_oversampling.py index 0effa8c..39cf7e1 100644 --- a/run_with_oversampling.py +++ b/run_with_oversampling.py @@ -31,7 +31,7 @@ def main(parser_args): model=IntelliO3_ts_architecture, oversampling_method="bin_oversampling", evaluate_bootstraps=False, # plot_list=["PlotCompetitiveSkillScore"], competitors=["IntelliO3"], - competitor_path=os.path.join(os.getcwd(), "data", "comp_test"), + competitor_path="/p/project/deepacf/intelliaq/gramlich1/mlair/competitors/o3", window_lead_time=1, oversampling_bins=10, oversampling_rates_cap=100, **parser_args.__dict__) workflow.run() -- GitLab From 7269e186b931cef66aa446c13e0460caf33b1765 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Mon, 16 Aug 2021 13:51:26 +0200 Subject: [PATCH 41/58] debugging in postprocessing_plotting --- mlair/plotting/postprocessing_plotting.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py index bf73f18..da47771 100644 --- a/mlair/plotting/postprocessing_plotting.py +++ b/mlair/plotting/postprocessing_plotting.py @@ -145,15 +145,15 @@ class PlotOversamplingContingency(AbstractPlotClass): obs = forecast_file.sel(type=self._obs_name) logging.info(f"{station}: load pred") predictions = [forecast_file.sel(type=self._model_name)] - logging.info(f"{station}: load comp") + logging.info(f"{station}: load comp, comp_list:{self._comp_names}") for comp in self._comp_names: c = self._load_competitors(station, [comp]) if c is not None: + logging.info(f"{station}: {comp} is not None") predictions.append(c.sel(type=comp)) logging.info(f"itearate 
over thresholds") for threshold in range(self._min_threshold, self._max_threshold): - for i, pred in enumerate(predictions): - logging.info(i) + for pred in predictions: ta, fa, fb, tb = self._single_contingency(obs, pred, threshold) contingency_array.loc[dict(thresholds=threshold, contingency_cell="ta", type=pred.type.values)] = ta contingency_array.loc[dict(thresholds=threshold, contingency_cell="fa", type=pred.type.values)] = fa -- GitLab From 36da21628b064288df7bc9f9318124594fa97d4c Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Tue, 17 Aug 2021 11:40:35 +0200 Subject: [PATCH 42/58] +1 on every contingency cell --- mlair/plotting/postprocessing_plotting.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py index da47771..c6ec665 100644 --- a/mlair/plotting/postprocessing_plotting.py +++ b/mlair/plotting/postprocessing_plotting.py @@ -155,10 +155,10 @@ class PlotOversamplingContingency(AbstractPlotClass): for threshold in range(self._min_threshold, self._max_threshold): for pred in predictions: ta, fa, fb, tb = self._single_contingency(obs, pred, threshold) - contingency_array.loc[dict(thresholds=threshold, contingency_cell="ta", type=pred.type.values)] = ta - contingency_array.loc[dict(thresholds=threshold, contingency_cell="fa", type=pred.type.values)] = fa - contingency_array.loc[dict(thresholds=threshold, contingency_cell="fb", type=pred.type.values)] = fb - contingency_array.loc[dict(thresholds=threshold, contingency_cell="tb", type=pred.type.values)] = tb + contingency_array.loc[dict(thresholds=threshold, contingency_cell="ta", type=pred.type.values)] = ta + 1 + contingency_array.loc[dict(thresholds=threshold, contingency_cell="fa", type=pred.type.values)] = fa + 1 + contingency_array.loc[dict(thresholds=threshold, contingency_cell="fb", type=pred.type.values)] = fb + 1 + contingency_array.loc[dict(thresholds=threshold, contingency_cell="tb", 
type=pred.type.values)] = tb + 1 logging.info(f"{station}: finished") return contingency_array -- GitLab From 4a662aa58afa711559783a1438d7a3af6e2d9e3d Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Tue, 17 Aug 2021 11:59:55 +0200 Subject: [PATCH 43/58] added contingency_cell plots --- mlair/plotting/postprocessing_plotting.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py index c6ec665..668e379 100644 --- a/mlair/plotting/postprocessing_plotting.py +++ b/mlair/plotting/postprocessing_plotting.py @@ -35,7 +35,7 @@ class PlotOversamplingContingency(AbstractPlotClass): def __init__(self, station_names, file_path, comp_path, file_name, plot_folder: str = ".", model_name: str = "nn", obs_name: str = "obs", comp_names: str = "IntelliO3", plot_names=["oversampling_threat_score", "oversampling_hit_rate", "oversampling_false_alarm_rate", - "oversampling_bias", "oversampling_all_scores"]): + "oversampling_bias", "oversampling_all_scores", "contingency_table"]): super().__init__(plot_folder, plot_names[0]) self._stations = station_names @@ -64,6 +64,24 @@ class PlotOversamplingContingency(AbstractPlotClass): self._save() self._plot(score_array, "all_scores") self._save() + self._plot_contingency(contingency_array, self._model_name) + self._save() + for comp in self._comp_names: + self._plot_contingency(contingency_array, comp) + self._save() + + def _plot_contingency(self, contingency_array, type): + plt.plot(range(self._min_threshold, self._max_threshold), + contingency_array.loc[dict(contingency_cell="ta", type=type)], label="a") + plt.plot(range(self._min_threshold, self._max_threshold), + contingency_array.loc[dict(contingency_cell="fa", type=type)], label="b") + plt.plot(range(self._min_threshold, self._max_threshold), + contingency_array.loc[dict(contingency_cell="fb", type=type)], label="c") + plt.plot(range(self._min_threshold, 
self._max_threshold), + contingency_array.loc[dict(contingency_cell="tb", type=type)], label="d") + plt.title(f"contingency table {type}") + plt.legend() + self.plot_name = f"contingency_table_{type}" def _plot(self, data, score): if score == "all_scores": -- GitLab From f9ce4697e1289e838b6c77ed5a66ee0324faed40 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Tue, 17 Aug 2021 12:09:23 +0200 Subject: [PATCH 44/58] remove logging from postprocessing_plotting.py, minor_tail_loss=MSE --- mlair/plotting/postprocessing_plotting.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py index 668e379..4b41c49 100644 --- a/mlair/plotting/postprocessing_plotting.py +++ b/mlair/plotting/postprocessing_plotting.py @@ -139,9 +139,7 @@ class PlotOversamplingContingency(AbstractPlotClass): def _min_max_threshold(self): min_threshold = 0 max_threshold = 0 - logging.info("min_max thresholds") for station in self._stations: - logging.info(f"{station}") file = os.path.join(self._file_path, self._file_name % station) forecast_file = xr.open_dataarray(file) obs = forecast_file.sel(type=self._obs_name) @@ -159,17 +157,12 @@ class PlotOversamplingContingency(AbstractPlotClass): for station in self._stations: file = os.path.join(self._file_path, self._file_name % station) forecast_file = xr.open_dataarray(file) - logging.info(f"{station}: load obs") obs = forecast_file.sel(type=self._obs_name) - logging.info(f"{station}: load pred") predictions = [forecast_file.sel(type=self._model_name)] - logging.info(f"{station}: load comp, comp_list:{self._comp_names}") for comp in self._comp_names: c = self._load_competitors(station, [comp]) if c is not None: - logging.info(f"{station}: {comp} is not None") predictions.append(c.sel(type=comp)) - logging.info(f"itearate over thresholds") for threshold in range(self._min_threshold, self._max_threshold): for pred in predictions: ta, fa, fb, tb = 
self._single_contingency(obs, pred, threshold) @@ -177,7 +170,6 @@ class PlotOversamplingContingency(AbstractPlotClass): contingency_array.loc[dict(thresholds=threshold, contingency_cell="fa", type=pred.type.values)] = fa + 1 contingency_array.loc[dict(thresholds=threshold, contingency_cell="fb", type=pred.type.values)] = fb + 1 contingency_array.loc[dict(thresholds=threshold, contingency_cell="tb", type=pred.type.values)] = tb + 1 - logging.info(f"{station}: finished") return contingency_array def _single_contingency(self, obs, pred, threshold): -- GitLab From c9b79258aa630f47c88bf697dfd0b3666b208dd7 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Tue, 17 Aug 2021 12:10:14 +0200 Subject: [PATCH 45/58] minor_tail_loss=MSE --- mlair/model_modules/model_class.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlair/model_modules/model_class.py b/mlair/model_modules/model_class.py index ec7f217..bf97864 100644 --- a/mlair/model_modules/model_class.py +++ b/mlair/model_modules/model_class.py @@ -458,7 +458,7 @@ class IntelliO3_ts_architecture(AbstractModelClass): def set_compile_options(self): self.compile_options = {"optimizer": keras.optimizers.adam(lr=self.initial_lr, amsgrad=True), - "loss": [l_p_loss(4), keras.losses.mean_squared_error], + "loss": [keras.losses.mean_squared_error, keras.losses.mean_squared_error], "metrics": ['mse'], "loss_weights": [.01, .99] } \ No newline at end of file -- GitLab From a1c804ce308897e383d70df2d236203ece281597 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Wed, 18 Aug 2021 08:06:26 +0200 Subject: [PATCH 46/58] add IntelliO3_ts_architecture_freeze for two phase, add external_weights parameter to load existing model with the given path --- mlair/configuration/defaults.py | 1 + mlair/model_modules/model_class.py | 20 ++++++++++- mlair/run_modules/experiment_setup.py | 6 ++-- mlair/run_modules/training.py | 6 +++- run_two_phase.py | 51 +++++++++++++++++++++++++++ run_with_oversampling.py | 17 ++++----- 6 
files changed, 89 insertions(+), 12 deletions(-) create mode 100644 run_two_phase.py diff --git a/mlair/configuration/defaults.py b/mlair/configuration/defaults.py index 47aaf08..dd59201 100644 --- a/mlair/configuration/defaults.py +++ b/mlair/configuration/defaults.py @@ -63,6 +63,7 @@ DEFAULT_OVERSAMPLING_BINS = 10 DEFAULT_OVERSAMPLING_RATES_CAP = 100 DEFAULT_OVERSAMPLING_METHOD = None +DEFAULT_EXTERNAL_WEIGHTS = None def get_defaults(): diff --git a/mlair/model_modules/model_class.py b/mlair/model_modules/model_class.py index bf97864..a5c6a8e 100644 --- a/mlair/model_modules/model_class.py +++ b/mlair/model_modules/model_class.py @@ -381,6 +381,7 @@ class IntelliO3_ts_architecture(AbstractModelClass): SymmetricPadding2D=SymmetricPadding2D, LearningRateDecay=LearningRateDecay) + def set_model(self): """ Build the model. @@ -461,4 +462,21 @@ class IntelliO3_ts_architecture(AbstractModelClass): "loss": [keras.losses.mean_squared_error, keras.losses.mean_squared_error], "metrics": ['mse'], "loss_weights": [.01, .99] - } \ No newline at end of file + } + +class IntelliO3_ts_architecture_freeze(IntelliO3_ts_architecture): + def __init__(self, input_shape: list, output_shape: list): + super().__init__(input_shape, output_shape) + self.freeze_layers() + self.initial_lr = 1e-5 + ''' + def freeze_layers(self): + for layer in self.model.layers: + if not isinstance(layer, keras.layers.core.Dense): + layer.trainable = False + ''' + + def freeze_layers(self): + for layer in self.model.layers: + if layer.name not in ["minor_1_out_Dense", "Main_out_Dense"]: + layer.trainable = False \ No newline at end of file diff --git a/mlair/run_modules/experiment_setup.py b/mlair/run_modules/experiment_setup.py index 1bd37a6..6d8bde1 100644 --- a/mlair/run_modules/experiment_setup.py +++ b/mlair/run_modules/experiment_setup.py @@ -24,7 +24,7 @@ from mlair.configuration.defaults import DEFAULT_STATIONS, DEFAULT_VAR_ALL_DICT, DEFAULT_USE_MULTIPROCESSING, DEFAULT_USE_MULTIPROCESSING_ON_DEBUG, 
DEFAULT_OVERSAMPLING_BINS, \ DEFAULT_OVERSAMPLING_RATES_CAP, DEFAULT_OVERSAMPLING_METHOD, \ DEFAULT_MAX_NUMBER_MULTIPROCESSING, \ - DEFAULT_BOOTSTRAP_TYPE, DEFAULT_BOOTSTRAP_METHOD + DEFAULT_BOOTSTRAP_TYPE, DEFAULT_BOOTSTRAP_METHOD, DEFAULT_EXTERNAL_WEIGHTS from mlair.data_handler import DefaultDataHandler from mlair.run_modules.run_environment import RunEnvironment from mlair.model_modules.fully_connected_networks import FCN_64_32_16 as VanillaModel @@ -225,7 +225,8 @@ class ExperimentSetup(RunEnvironment): data_origin: Dict = None, competitors: list = None, competitor_path: str = None, use_multiprocessing: bool = None, use_multiprocessing_on_debug: bool = None, max_number_multiprocessing: int = None, start_script: Union[Callable, str] = None, - oversampling_bins=None, oversampling_rates_cap=None, oversampling_method = None, **kwargs): + oversampling_bins=None, oversampling_rates_cap=None, oversampling_method=None, external_weights=None, + **kwargs): # create run framework super().__init__() @@ -287,6 +288,7 @@ class ExperimentSetup(RunEnvironment): # set model path self._set_param("model_path", None, os.path.join(experiment_path, "model")) path_config.check_path_and_create(self.data_store.get("model_path")) + self._set_param("external_weights", external_weights, default=DEFAULT_EXTERNAL_WEIGHTS) # set plot path default_plot_path = os.path.join(experiment_path, "plots") diff --git a/mlair/run_modules/training.py b/mlair/run_modules/training.py index 00e8eae..9cafbbf 100644 --- a/mlair/run_modules/training.py +++ b/mlair/run_modules/training.py @@ -171,7 +171,11 @@ class Training(RunEnvironment): except IndexError: epo_timing = None self.save_callbacks_as_json(history, lr, epo_timing) - self.load_best_model(checkpoint.filepath) + external_weights = self.data_store.get("external_weights") + if external_weights is not None: + self.load_best_model(external_weights) + else: + self.load_best_model(checkpoint.filepath) self.create_monitoring_plots(history, lr) def 
save_model(self) -> None: diff --git a/run_two_phase.py b/run_two_phase.py new file mode 100644 index 0000000..421c050 --- /dev/null +++ b/run_two_phase.py @@ -0,0 +1,51 @@ +__author__ = "Lukas Leufen" +__date__ = '2020-06-29' + +import argparse +from mlair.workflows import DefaultWorkflowHPC +from mlair.helpers import remove_items +from mlair.configuration.defaults import DEFAULT_PLOT_LIST +from mlair.model_modules.model_class import IntelliO3_ts_architecture, IntelliO3_ts_architecture_freeze +import os + + +def load_stations(external_station_list=None): + import json + if external_station_list is None: + external_station_list = 'supplement/station_list_north_german_plain_rural.json' + try: + filename = external_station_list + with open(filename, 'r') as jfile: + stations = json.load(jfile) + except FileNotFoundError: + stations = None + return stations + +# 1. How to load existing model +# https://www.tensorflow.org/tutorials/images/transfer_learning +# 3. How many epochs? +# 4. Full data set? +# 5. lower learning rate? 
+ + +def main(parser_args): + plots = remove_items(DEFAULT_PLOT_LIST, "PlotConditionalQuantiles") + workflow = DefaultWorkflowHPC(#stations=load_stations('supplement/German_background_stations.json'), + stations=["DEBW013", "DEBW087", "DEBW107", "DEBW076"], + epochs=50, external_weights="/home/vincentgramlich/mlair/data/weights/test_weight.h5", + train_model=True, create_new_model=False, network="UBA", + model=IntelliO3_ts_architecture_freeze, + evaluate_bootstraps=False, # plot_list=["PlotCompetitiveSkillScore"], + # competitors=["test_model", "test_model2"], + competitor_path=os.path.join(os.getcwd(), "data", "comp_test"), + window_lead_time=1, # oversampling_bins=10, oversampling_rates_cap=100, + **parser_args.__dict__) + workflow.run() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--experiment_date', metavar='--exp_date', type=str, default="testrun", + help="set experiment date as string") + args = parser.parse_args() + main(args) diff --git a/run_with_oversampling.py b/run_with_oversampling.py index 39cf7e1..781885f 100644 --- a/run_with_oversampling.py +++ b/run_with_oversampling.py @@ -24,15 +24,16 @@ def load_stations(external_station_list=None): def main(parser_args): plots = remove_items(DEFAULT_PLOT_LIST, "PlotConditionalQuantiles") - workflow = DefaultWorkflowHPC(stations=load_stations('supplement/German_background_stations.json'), - #stations=["DEBW013", "DEBW087", "DEBW107", "DEBW076"], - epochs=150, + workflow = DefaultWorkflowHPC(#stations=load_stations('supplement/German_background_stations.json'), + stations=["DEBW013", "DEBW087", "DEBW107", "DEBW076"], + epochs=1, train_model=True, create_new_model=True, network="UBA", - model=IntelliO3_ts_architecture, oversampling_method="bin_oversampling", - evaluate_bootstraps=False, # plot_list=["PlotCompetitiveSkillScore"], - competitors=["IntelliO3"], - competitor_path="/p/project/deepacf/intelliaq/gramlich1/mlair/competitors/o3", - window_lead_time=1, 
oversampling_bins=10, oversampling_rates_cap=100, + #model=IntelliO3_ts_architecture, + oversampling_method="bin_oversampling", + evaluate_bootstraps=False, plot_list=["PlotOversamplingContingency"], + competitors=["intellitest"], + #competitor_path="/p/project/deepacf/intelliaq/gramlich1/mlair/competitors/o3", + window_lead_time=1, oversampling_bins=2, oversampling_rates_cap=2, **parser_args.__dict__) workflow.run() -- GitLab From 14dad201d9291ecefea616a3f5d4f1a6dd96de4d Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Wed, 18 Aug 2021 11:01:09 +0200 Subject: [PATCH 47/58] Change PlotOversamplingContingency to PlotContingency and run it also when oversampling_method is None --- mlair/configuration/defaults.py | 2 +- mlair/plotting/postprocessing_plotting.py | 6 +++--- mlair/run_modules/post_processing.py | 7 +++---- test/test_configuration/test_defaults.py | 2 +- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/mlair/configuration/defaults.py b/mlair/configuration/defaults.py index dd59201..0dd2a07 100644 --- a/mlair/configuration/defaults.py +++ b/mlair/configuration/defaults.py @@ -51,7 +51,7 @@ DEFAULT_BOOTSTRAP_METHOD = "shuffle" DEFAULT_PLOT_LIST = ["PlotMonthlySummary", "PlotStationMap", "PlotClimatologicalSkillScore", "PlotTimeSeries", "PlotCompetitiveSkillScore", "PlotBootstrapSkillScore", "PlotConditionalQuantiles", "PlotAvailability", "PlotAvailabilityHistogram", "PlotDataHistogram", "PlotPeriodogram", - "PlotOversampling", "PlotOversamplingContingency"] + "PlotOversampling", "PlotContingency"] DEFAULT_SAMPLING = "daily" DEFAULT_DATA_ORIGIN = {"cloudcover": "REA", "humidity": "REA", "pblheight": "REA", "press": "REA", "relhum": "REA", "temp": "REA", "totprecip": "REA", "u": "REA", "v": "REA", "no": "", "no2": "", "o3": "", diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py index 4b41c49..6eba642 100644 --- a/mlair/plotting/postprocessing_plotting.py +++ 
b/mlair/plotting/postprocessing_plotting.py @@ -29,13 +29,13 @@ logging.getLogger('matplotlib').setLevel(logging.WARNING) # import matplotlib.pyplot as plt @TimeTrackingWrapper -class PlotOversamplingContingency(AbstractPlotClass): +class PlotContingency(AbstractPlotClass): #Todo: Get min and max_label def __init__(self, station_names, file_path, comp_path, file_name, plot_folder: str = ".", model_name: str = "nn", obs_name: str = "obs", comp_names: str = "IntelliO3", - plot_names=["oversampling_threat_score", "oversampling_hit_rate", "oversampling_false_alarm_rate", - "oversampling_bias", "oversampling_all_scores", "contingency_table"]): + plot_names=["contingency_threat_score", "contingency_hit_rate", "contingency_false_alarm_rate", + "contingency_bias", "contingency_all_scores", "contingency_table"]): super().__init__(plot_folder, plot_names[0]) self._stations = station_names diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py index 0444ccf..e6da5f0 100644 --- a/mlair/run_modules/post_processing.py +++ b/mlair/run_modules/post_processing.py @@ -21,7 +21,7 @@ from mlair.model_modules.linear_model import OrdinaryLeastSquaredModel from mlair.model_modules import AbstractModelClass from mlair.plotting.postprocessing_plotting import PlotMonthlySummary, PlotClimatologicalSkillScore, \ PlotCompetitiveSkillScore, PlotTimeSeries, PlotBootstrapSkillScore, PlotConditionalQuantiles,\ - PlotSeparationOfScales, PlotOversamplingContingency + PlotSeparationOfScales, PlotContingency from mlair.plotting.data_insight_plotting import PlotStationMap, PlotAvailability, PlotAvailabilityHistogram, \ PlotPeriodogram, PlotDataHistogram, PlotOversampling from mlair.run_modules.run_environment import RunEnvironment @@ -340,9 +340,8 @@ class PostProcessing(RunEnvironment): iter_dim = self.data_store.get("iter_dim") try: - if (self.data_store.get('oversampling_method')=='bin_oversampling') and ( - "PlotOversamplingContingency" in plot_list): - 
PlotOversamplingContingency(station_names=self.test_data.keys(), file_path=path, + if ("PlotContingency" in plot_list): + PlotContingency(station_names=self.test_data.keys(), file_path=path, comp_path=self.competitor_path, comp_names=self.competitors, file_name=r"forecasts_%s_test.nc", plot_folder=self.plot_path) except Exception as e: diff --git a/test/test_configuration/test_defaults.py b/test/test_configuration/test_defaults.py index bef3c98..97d80eb 100644 --- a/test/test_configuration/test_defaults.py +++ b/test/test_configuration/test_defaults.py @@ -69,4 +69,4 @@ class TestAllDefaults: "PlotTimeSeries", "PlotCompetitiveSkillScore", "PlotBootstrapSkillScore", "PlotConditionalQuantiles", "PlotAvailability", "PlotAvailabilityHistogram", "PlotDataHistogram", "PlotPeriodogram", "PlotOversampling", - "PlotOversamplingContingency"] + "PlotContingency"] -- GitLab From 85eca0975ad58c70be2b76314e60601f10b11c3b Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Tue, 24 Aug 2021 12:28:35 +0200 Subject: [PATCH 48/58] Fixed finetuning, allowed correct learning_rate --- mlair/model_modules/model_class.py | 15 +++++--- mlair/plotting/postprocessing_plotting.py | 3 +- mlair/run_modules/model_setup.py | 13 +++++-- mlair/run_modules/training.py | 6 +-- run_with_finetuning.py | 45 +++++++++++++++++++++++ run_without_finetuning.py | 45 +++++++++++++++++++++++ 6 files changed, 111 insertions(+), 16 deletions(-) create mode 100644 run_with_finetuning.py create mode 100644 run_without_finetuning.py diff --git a/mlair/model_modules/model_class.py b/mlair/model_modules/model_class.py index a5c6a8e..8343426 100644 --- a/mlair/model_modules/model_class.py +++ b/mlair/model_modules/model_class.py @@ -364,17 +364,18 @@ class IntelliO3_ts_architecture(AbstractModelClass): assert len(output_shape) == 1 super().__init__(input_shape[0], output_shape[0]) - from mlair.model_modules.keras_extensions import LearningRateDecay - # settings self.dropout_rate = .35 self.regularizer = 
keras.regularizers.l2(0.01) self.initial_lr = 1e-4 - self.lr_decay = LearningRateDecay(base_lr=self.initial_lr, drop=.94, epochs_drop=10) self.activation = keras.layers.ELU self.padding = "SymPad2D" + self.apply_to_model() # apply to model + def apply_to_model(self): + from mlair.model_modules.keras_extensions import LearningRateDecay + self.lr_decay = LearningRateDecay(base_lr=self.initial_lr, drop=.94, epochs_drop=10) self.set_model() self.set_compile_options() self.set_custom_objects(loss=self.compile_options["loss"][0], @@ -467,9 +468,12 @@ class IntelliO3_ts_architecture(AbstractModelClass): class IntelliO3_ts_architecture_freeze(IntelliO3_ts_architecture): def __init__(self, input_shape: list, output_shape: list): super().__init__(input_shape, output_shape) + self.freeze_layers() self.initial_lr = 1e-5 - ''' + self.apply_to_model() + # self.lr_decay = None + def freeze_layers(self): for layer in self.model.layers: if not isinstance(layer, keras.layers.core.Dense): @@ -479,4 +483,5 @@ class IntelliO3_ts_architecture_freeze(IntelliO3_ts_architecture): def freeze_layers(self): for layer in self.model.layers: if layer.name not in ["minor_1_out_Dense", "Main_out_Dense"]: - layer.trainable = False \ No newline at end of file + layer.trainable = False + ''' \ No newline at end of file diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py index 6eba642..2c8fd6e 100644 --- a/mlair/plotting/postprocessing_plotting.py +++ b/mlair/plotting/postprocessing_plotting.py @@ -30,7 +30,6 @@ logging.getLogger('matplotlib').setLevel(logging.WARNING) @TimeTrackingWrapper class PlotContingency(AbstractPlotClass): - #Todo: Get min and max_label def __init__(self, station_names, file_path, comp_path, file_name, plot_folder: str = ".", model_name: str = "nn", obs_name: str = "obs", comp_names: str = "IntelliO3", @@ -90,7 +89,7 @@ class PlotContingency(AbstractPlotClass): else: for type in data.type.values.tolist(): 
plt.plot(range(self._min_threshold, self._max_threshold), data.loc[dict(type=type, scores=score)], label=type) - plt.title(self._plot_names[self._plot_counter][13:]) + plt.title(self._plot_names[self._plot_counter]) plt.legend() self.plot_name = self._plot_names[self._plot_counter] self._plot_counter = self._plot_counter + 1 diff --git a/mlair/run_modules/model_setup.py b/mlair/run_modules/model_setup.py index 83f4a2b..28a4e99 100644 --- a/mlair/run_modules/model_setup.py +++ b/mlair/run_modules/model_setup.py @@ -83,7 +83,10 @@ class ModelSetup(RunEnvironment): self.plot_model() # load weights if no training shall be performed - if not self._train_model and not self._create_new_model: + external_weights = self.data_store.get("external_weights") + if external_weights is not None: + self.load_weights(external_weights) + elif not self._train_model and not self._create_new_model: self.load_weights() # create checkpoint @@ -131,11 +134,13 @@ class ModelSetup(RunEnvironment): save_best_only=True, mode='auto') self.data_store.set("callbacks", callbacks, self.scope) - def load_weights(self): + def load_weights(self, external_weight=None): """Try to load weights from existing model or skip if not possible.""" + if external_weight is None: + external_weight = self.model_name try: - self.model.load_weights(self.model_name) - logging.info(f"reload weights from model {self.model_name} ...") + self.model.load_weights(external_weight) + logging.info(f"reload weights from model {external_weight} ...") except OSError: logging.info('no weights to reload...') diff --git a/mlair/run_modules/training.py b/mlair/run_modules/training.py index 9cafbbf..00e8eae 100644 --- a/mlair/run_modules/training.py +++ b/mlair/run_modules/training.py @@ -171,11 +171,7 @@ class Training(RunEnvironment): except IndexError: epo_timing = None self.save_callbacks_as_json(history, lr, epo_timing) - external_weights = self.data_store.get("external_weights") - if external_weights is not None: - 
self.load_best_model(external_weights) - else: - self.load_best_model(checkpoint.filepath) + self.load_best_model(checkpoint.filepath) self.create_monitoring_plots(history, lr) def save_model(self) -> None: diff --git a/run_with_finetuning.py b/run_with_finetuning.py new file mode 100644 index 0000000..6561179 --- /dev/null +++ b/run_with_finetuning.py @@ -0,0 +1,45 @@ +__author__ = "Lukas Leufen" +__date__ = '2020-06-29' + +import argparse +from mlair.workflows import DefaultWorkflow +from mlair.helpers import remove_items +from mlair.configuration.defaults import DEFAULT_PLOT_LIST +from mlair.model_modules.model_class import IntelliO3_ts_architecture, IntelliO3_ts_architecture_freeze +import os + + +def load_stations(): + import json + try: + filename = 'supplement/station_list_north_german_plain_rural.json' + with open(filename, 'r') as jfile: + stations = json.load(jfile) + except FileNotFoundError: + stations = None + return stations + + +def main(parser_args): + plots = remove_items(DEFAULT_PLOT_LIST, "PlotConditionalQuantiles") + workflow = DefaultWorkflow( # stations=load_stations(), + stations=["DEBW087", "DEBW013", "DEBW107", "DEBW076"], + #stations=["DEBW013", "DEBW087"], + epochs=1, external_weights="/home/vincentgramlich/mlair/data/weights/testrun_network_daily_model-best.h5", + train_model=True, create_new_model=True, network="UBA", + model=IntelliO3_ts_architecture_freeze, + window_lead_time=1, + #oversampling_method="bin_oversampling", oversampling_bins=10, oversampling_rates_cap=100, window_lead_time=2, + evaluate_bootstraps=False, plot_list=["PlotContingency"], + competitors=["withoutfinetuning"], + competitor_path=os.path.join(os.getcwd(), "data", "competitors", "o3"), + **parser_args.__dict__, start_script=__file__) + workflow.run() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--experiment_date', metavar='--exp_date', type=str, default="testrun", + help="set experiment date as string") + args = 
parser.parse_args() + main(args) diff --git a/run_without_finetuning.py b/run_without_finetuning.py new file mode 100644 index 0000000..61644a3 --- /dev/null +++ b/run_without_finetuning.py @@ -0,0 +1,45 @@ +__author__ = "Lukas Leufen" +__date__ = '2020-06-29' + +import argparse +from mlair.workflows import DefaultWorkflow +from mlair.helpers import remove_items +from mlair.configuration.defaults import DEFAULT_PLOT_LIST +from mlair.model_modules.model_class import IntelliO3_ts_architecture, IntelliO3_ts_architecture_freeze +import os + + +def load_stations(): + import json + try: + filename = 'supplement/station_list_north_german_plain_rural.json' + with open(filename, 'r') as jfile: + stations = json.load(jfile) + except FileNotFoundError: + stations = None + return stations + + +def main(parser_args): + plots = remove_items(DEFAULT_PLOT_LIST, "PlotConditionalQuantiles") + workflow = DefaultWorkflow( # stations=load_stations(), + stations=["DEBW087","DEBW013", "DEBW107", "DEBW076"], + #stations=["DEBW013", "DEBW087"], + epochs=5, #external_weights="/home/vincentgramlich/mlair/data/weights/test_weight.h5", + train_model=True, create_new_model=True, network="UBA", + model=IntelliO3_ts_architecture, + window_lead_time=1, + #oversampling_method="bin_oversampling", oversampling_bins=10, oversampling_rates_cap=100, window_lead_time=2, + evaluate_bootstraps=False, plot_list=plots, + #competitors=["testcompetitor", "testcompetitor2"], + competitor_path=os.path.join(os.getcwd(), "data", "competitors"), + **parser_args.__dict__, start_script=__file__) + workflow.run() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--experiment_date', metavar='--exp_date', type=str, default="testrun", + help="set experiment date as string") + args = parser.parse_args() + main(args) \ No newline at end of file -- GitLab From 3dd15c90a814320bc26cbaec0f75e346b9eb4db3 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Tue, 24 Aug 2021 20:18:03 +0200 
Subject: [PATCH 49/58] Fix in the default_data_handler.py in multiply_extremes --- mlair/data_handler/default_data_handler.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mlair/data_handler/default_data_handler.py b/mlair/data_handler/default_data_handler.py index c97d57e..2ffdd49 100644 --- a/mlair/data_handler/default_data_handler.py +++ b/mlair/data_handler/default_data_handler.py @@ -273,14 +273,16 @@ class DefaultDataHandler(AbstractDataHandler): else: extreme_idx = xr.concat([(Y < -extr_val).any(dim=other_dims[0]), (Y > extr_val).any(dim=other_dims[0])], - dim=other_dims[1]).any(dim=other_dims[1]) + dim=other_dims[0]).any(dim=other_dims[0]) - extremes_X = list(map(lambda x: x.sel(**{dim: extreme_idx}), X)) + sel = extreme_idx[extreme_idx].coords[dim].values + extremes_X = list(map(lambda x: x.sel(**{dim: sel}), X)) self._add_timedelta(extremes_X, dim, timedelta) # extremes_X = list(map(lambda x: x.coords[dim].values + np.timedelta64(*timedelta), extremes_X)) extremes_Y = Y.sel(**{dim: extreme_idx}) - extremes_Y.coords[dim].values += np.timedelta64(*timedelta) + #extremes_Y.coords[dim].values += np.timedelta64(*timedelta) + self._add_timedelta(extremes_Y, dim, timedelta) self._Y_extreme = xr.concat([Y, extremes_Y], dim=dim) self._X_extreme = list(map(lambda x1, x2: xr.concat([x1, x2], dim=dim), X, extremes_X)) -- GitLab From 3e6397d0b1a516a5caf0bd315c9498dc8a101363 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Tue, 31 Aug 2021 17:30:54 +0200 Subject: [PATCH 50/58] Fixed an error in postprocessing_plotting.py --- mlair/plotting/postprocessing_plotting.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py index 2c8fd6e..a4bd68f 100644 --- a/mlair/plotting/postprocessing_plotting.py +++ b/mlair/plotting/postprocessing_plotting.py @@ -165,10 +165,10 @@ class PlotContingency(AbstractPlotClass): for threshold in 
range(self._min_threshold, self._max_threshold): for pred in predictions: ta, fa, fb, tb = self._single_contingency(obs, pred, threshold) - contingency_array.loc[dict(thresholds=threshold, contingency_cell="ta", type=pred.type.values)] = ta + 1 - contingency_array.loc[dict(thresholds=threshold, contingency_cell="fa", type=pred.type.values)] = fa + 1 - contingency_array.loc[dict(thresholds=threshold, contingency_cell="fb", type=pred.type.values)] = fb + 1 - contingency_array.loc[dict(thresholds=threshold, contingency_cell="tb", type=pred.type.values)] = tb + 1 + contingency_array.loc[dict(thresholds=threshold, contingency_cell="ta", type=pred.type.values)] = contingency_array.loc[dict(thresholds=threshold, contingency_cell="ta", type=pred.type.values)] + ta + contingency_array.loc[dict(thresholds=threshold, contingency_cell="fa", type=pred.type.values)] = contingency_array.loc[dict(thresholds=threshold, contingency_cell="fa", type=pred.type.values)] + fa + contingency_array.loc[dict(thresholds=threshold, contingency_cell="fb", type=pred.type.values)] = contingency_array.loc[dict(thresholds=threshold, contingency_cell="fb", type=pred.type.values)] + fb + contingency_array.loc[dict(thresholds=threshold, contingency_cell="tb", type=pred.type.values)] = contingency_array.loc[dict(thresholds=threshold, contingency_cell="tb", type=pred.type.values)] + tb return contingency_array def _single_contingency(self, obs, pred, threshold): -- GitLab From ba46ed2291aee512f1e5f2acb92b9b8683fed5b9 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Tue, 31 Aug 2021 21:55:27 +0200 Subject: [PATCH 51/58] Added model_plot_name to postprocessing_plotting.py --- mlair/plotting/postprocessing_plotting.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py index a4bd68f..3f11f52 100644 --- a/mlair/plotting/postprocessing_plotting.py +++ b/mlair/plotting/postprocessing_plotting.py @@ 
-32,7 +32,7 @@ logging.getLogger('matplotlib').setLevel(logging.WARNING) class PlotContingency(AbstractPlotClass): def __init__(self, station_names, file_path, comp_path, file_name, plot_folder: str = ".", model_name: str = "nn", - obs_name: str = "obs", comp_names: str = "IntelliO3", + model_plot_name: str = "nn", obs_name: str = "obs", comp_names: str = "IntelliO3", plot_names=["contingency_threat_score", "contingency_hit_rate", "contingency_false_alarm_rate", "contingency_bias", "contingency_all_scores", "contingency_table"]): @@ -43,6 +43,7 @@ class PlotContingency(AbstractPlotClass): self._file_name = file_name self._obs_name = obs_name self._model_name = model_name + self._model_plot_name = model_plot_name self._comp_names = to_list(comp_names) self._all_names = [self._model_name] self._all_names.extend(self._comp_names) @@ -88,7 +89,11 @@ class PlotContingency(AbstractPlotClass): plt.plot(range(self._min_threshold, self._max_threshold), data.loc[dict(type="nn", scores=score_name)], label=score_name) else: for type in data.type.values.tolist(): - plt.plot(range(self._min_threshold, self._max_threshold), data.loc[dict(type=type, scores=score)], label=type) + if type in "nn": + plt.plot(range(self._min_threshold, self._max_threshold), data.loc[dict(type=type, scores=score)], + label=self._model_plot_name) + else: + plt.plot(range(self._min_threshold, self._max_threshold), data.loc[dict(type=type, scores=score)], label=type) plt.title(self._plot_names[self._plot_counter]) plt.legend() self.plot_name = self._plot_names[self._plot_counter] -- GitLab From 9ab23560877037cdebde08f97e86ff67a4ed53f0 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Wed, 1 Sep 2021 23:02:44 +0200 Subject: [PATCH 52/58] Add gilbert skill score to PlotContingency --- mlair/plotting/postprocessing_plotting.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py index 
3f11f52..4ca6412 100644 --- a/mlair/plotting/postprocessing_plotting.py +++ b/mlair/plotting/postprocessing_plotting.py @@ -33,7 +33,8 @@ class PlotContingency(AbstractPlotClass): def __init__(self, station_names, file_path, comp_path, file_name, plot_folder: str = ".", model_name: str = "nn", model_plot_name: str = "nn", obs_name: str = "obs", comp_names: str = "IntelliO3", - plot_names=["contingency_threat_score", "contingency_hit_rate", "contingency_false_alarm_rate", + plot_names=["contingency_gilbert_skill_score", "contingency_threat_score", "contingency_hit_rate", + "contingency_false_alarm_rate", "contingency_bias", "contingency_all_scores", "contingency_table"]): super().__init__(plot_folder, plot_names[0]) @@ -50,10 +51,12 @@ class PlotContingency(AbstractPlotClass): self._plot_names = plot_names self._min_threshold, self._max_threshold = self._min_max_threshold() contingency_array = self._calculate_contingencies() - self._scores = ["ts", "h", "f", "b"] + self._scores = ["gss", "ts", "h", "f", "b"] score_array = self._calculate_all_scores(contingency_array) self._plot_counter = 0 + self._plot(score_array, "gss") + self._save() self._plot(score_array, "ts") self._save() self._plot(score_array, "h") @@ -213,6 +216,14 @@ class PlotContingency(AbstractPlotClass): false_above = contingency[1] false_below = contingency[2] true_below = contingency[3] + if score == "gss": + frequency_above_threshold = (true_above + false_below)/(true_above + false_above + false_below + true_below) + forecasts_above_threshold = true_above + false_above + chance_hits = frequency_above_threshold*forecasts_above_threshold + if (true_above + false_above + false_below - chance_hits) == 0: + score_value = 1 + else: + score_value = (true_above - chance_hits)/(true_above + false_above + false_below - chance_hits) if score == "ts": if (true_above + false_above + false_below) == 0: score_value = 1 -- GitLab From ad811eef236e827f128bcd7b6d4f85f6eaf72dc9 Mon Sep 17 00:00:00 2001 From: 
"v.gramlich1" Date: Thu, 2 Sep 2021 08:47:17 +0200 Subject: [PATCH 53/58] Add model classes IntelliO3_ts_architecture_finetune_all_dense, IntelliO3_ts_architecture_finetune_outputs and IntelliO3_ts_architecture_finetune_main_output --- mlair/model_modules/model_class.py | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/mlair/model_modules/model_class.py b/mlair/model_modules/model_class.py index 8343426..29fe727 100644 --- a/mlair/model_modules/model_class.py +++ b/mlair/model_modules/model_class.py @@ -465,7 +465,7 @@ class IntelliO3_ts_architecture(AbstractModelClass): "loss_weights": [.01, .99] } -class IntelliO3_ts_architecture_freeze(IntelliO3_ts_architecture): +class IntelliO3_ts_architecture_finetune_all_dense(IntelliO3_ts_architecture): def __init__(self, input_shape: list, output_shape: list): super().__init__(input_shape, output_shape) @@ -478,10 +478,31 @@ class IntelliO3_ts_architecture_freeze(IntelliO3_ts_architecture): for layer in self.model.layers: if not isinstance(layer, keras.layers.core.Dense): layer.trainable = False - ''' + +class IntelliO3_ts_architecture_finetune_outputs(IntelliO3_ts_architecture): + def __init__(self, input_shape: list, output_shape: list): + super().__init__(input_shape, output_shape) + + self.freeze_layers() + self.initial_lr = 1e-5 + self.apply_to_model() + # self.lr_decay = None def freeze_layers(self): for layer in self.model.layers: if layer.name not in ["minor_1_out_Dense", "Main_out_Dense"]: layer.trainable = False - ''' \ No newline at end of file + +class IntelliO3_ts_architecture_finetune_main_output(IntelliO3_ts_architecture): + def __init__(self, input_shape: list, output_shape: list): + super().__init__(input_shape, output_shape) + + self.freeze_layers() + self.initial_lr = 1e-5 + self.apply_to_model() + # self.lr_decay = None + + def freeze_layers(self): + for layer in self.model.layers: + if layer.name not in ["Main_out_Dense"]: + layer.trainable = False \ No 
newline at end of file -- GitLab From 6d39cf4536bdf0a4b117f83d5440bb4a4e194166 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Thu, 2 Sep 2021 08:51:01 +0200 Subject: [PATCH 54/58] Add imports to run_with_finetuning.py IntelliO3_ts_architecture_finetune_all_dense, IntelliO3_ts_architecture_finetune_outputs and IntelliO3_ts_architecture_finetune_main_output --- run_with_finetuning.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/run_with_finetuning.py b/run_with_finetuning.py index 6561179..fc334a5 100644 --- a/run_with_finetuning.py +++ b/run_with_finetuning.py @@ -5,7 +5,8 @@ import argparse from mlair.workflows import DefaultWorkflow from mlair.helpers import remove_items from mlair.configuration.defaults import DEFAULT_PLOT_LIST -from mlair.model_modules.model_class import IntelliO3_ts_architecture, IntelliO3_ts_architecture_freeze +from mlair.model_modules.model_class import IntelliO3_ts_architecture, IntelliO3_ts_architecture_finetune_all_dense, \ + IntelliO3_ts_architecture_finetune_outputs, IntelliO3_ts_architecture_finetune_main_output import os @@ -27,7 +28,7 @@ def main(parser_args): #stations=["DEBW013", "DEBW087"], epochs=1, external_weights="/home/vincentgramlich/mlair/data/weights/testrun_network_daily_model-best.h5", train_model=True, create_new_model=True, network="UBA", - model=IntelliO3_ts_architecture_freeze, + model=IntelliO3_ts_architecture_finetune_all_dense, window_lead_time=1, #oversampling_method="bin_oversampling", oversampling_bins=10, oversampling_rates_cap=100, window_lead_time=2, evaluate_bootstraps=False, plot_list=["PlotContingency"], -- GitLab From 4f94dddaaed602c384e40d0420ca6f26e938d757 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Fri, 3 Sep 2021 13:39:27 +0200 Subject: [PATCH 55/58] Set correct order in freeze layers in model_class.py --- mlair/model_modules/model_class.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mlair/model_modules/model_class.py 
b/mlair/model_modules/model_class.py index 29fe727..f7cae7a 100644 --- a/mlair/model_modules/model_class.py +++ b/mlair/model_modules/model_class.py @@ -469,9 +469,9 @@ class IntelliO3_ts_architecture_finetune_all_dense(IntelliO3_ts_architecture): def __init__(self, input_shape: list, output_shape: list): super().__init__(input_shape, output_shape) - self.freeze_layers() self.initial_lr = 1e-5 self.apply_to_model() + self.freeze_layers() # self.lr_decay = None def freeze_layers(self): @@ -483,9 +483,9 @@ class IntelliO3_ts_architecture_finetune_outputs(IntelliO3_ts_architecture): def __init__(self, input_shape: list, output_shape: list): super().__init__(input_shape, output_shape) - self.freeze_layers() self.initial_lr = 1e-5 self.apply_to_model() + self.freeze_layers() # self.lr_decay = None def freeze_layers(self): @@ -497,9 +497,9 @@ class IntelliO3_ts_architecture_finetune_main_output(IntelliO3_ts_architecture): def __init__(self, input_shape: list, output_shape: list): super().__init__(input_shape, output_shape) - self.freeze_layers() self.initial_lr = 1e-5 self.apply_to_model() + self.freeze_layers() # self.lr_decay = None def freeze_layers(self): -- GitLab From a1d4356fdbd8479706067eddde68c23c5f078a8a Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Wed, 27 Oct 2021 08:31:06 +0200 Subject: [PATCH 56/58] Output tables with all values for contingency tables and scores --- mlair/plotting/postprocessing_plotting.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py index 4ca6412..29ef70a 100644 --- a/mlair/plotting/postprocessing_plotting.py +++ b/mlair/plotting/postprocessing_plotting.py @@ -51,8 +51,10 @@ class PlotContingency(AbstractPlotClass): self._plot_names = plot_names self._min_threshold, self._max_threshold = self._min_max_threshold() contingency_array = self._calculate_contingencies() + self._save_tables(contingency_array, "contingency") 
self._scores = ["gss", "ts", "h", "f", "b"] score_array = self._calculate_all_scores(contingency_array) + self._save_tables(score_array, "scores_contingency") self._plot_counter = 0 self._plot(score_array, "gss") @@ -73,6 +75,25 @@ class PlotContingency(AbstractPlotClass): self._plot_contingency(contingency_array, comp) self._save() + def _save_tables(self, array, name): + for model in self._all_names: + type_array = array.sel(type=model).drop("type") + if name is "contingency": + df = pd.DataFrame({"a": type_array.sel(contingency_cell="ta").values, + "b": type_array.sel(contingency_cell="fa").values, + "c": type_array.sel(contingency_cell="fb").values, + "d": type_array.sel(contingency_cell="tb").values}) + else: + df = pd.DataFrame({"gss": type_array.sel(scores="gss").values, + "ts": type_array.sel(scores="ts").values, + "h": type_array.sel(scores="h").values, + "f": type_array.sel(scores="f").values, + "b": type_array.sel(scores="b").values}) + + helpers.tables.save_to_tex(self.plot_folder, f"table_{name}_{model}", + helpers.tables.create_column_format_for_tex(df), df) + + def _plot_contingency(self, contingency_array, type): plt.plot(range(self._min_threshold, self._max_threshold), contingency_array.loc[dict(contingency_cell="ta", type=type)], label="a") -- GitLab From 202af052deb6058257c27816850d50ee4fb3474f Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Wed, 27 Oct 2021 15:14:42 +0200 Subject: [PATCH 57/58] Enable loading weights with external_weights when trainable and create model is False --- mlair/run_modules/post_processing.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py index e6da5f0..aef5c8e 100644 --- a/mlair/run_modules/post_processing.py +++ b/mlair/run_modules/post_processing.py @@ -301,11 +301,16 @@ class PostProcessing(RunEnvironment): :return: the model """ + external_weights = self.data_store.get("external_weights") try: model = 
self.data_store.get("best_model") except NameNotFoundInDataStore: logging.info("No model was saved in data store. Try to load model from experiment path.") - model_name = self.data_store.get("model_name", "model") + if external_weights is not None: + logging.info("load model from external_weights path") + model_name = external_weights + else: + model_name = self.data_store.get("model_name", "model") model_class: AbstractModelClass = self.data_store.get("model", "model") model = keras.models.load_model(model_name, custom_objects=model_class.custom_objects) return model -- GitLab From 1179e38f39bef778aa5d6f16f4ac930863e22027 Mon Sep 17 00:00:00 2001 From: "v.gramlich1" Date: Tue, 2 Nov 2021 12:58:43 +0100 Subject: [PATCH 58/58] Plot Oversampling plots with bin_edges_retransformed --- mlair/plotting/data_insight_plotting.py | 6 +++--- mlair/run_modules/post_processing.py | 3 ++- mlair/run_modules/pre_processing.py | 5 +++++ 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/mlair/plotting/data_insight_plotting.py b/mlair/plotting/data_insight_plotting.py index 0a3b28c..29693b3 100644 --- a/mlair/plotting/data_insight_plotting.py +++ b/mlair/plotting/data_insight_plotting.py @@ -23,7 +23,7 @@ from mlair.plotting.abstract_plot_class import AbstractPlotClass @TimeTrackingWrapper class PlotOversampling(AbstractPlotClass): - def __init__(self, data, bin_edges, oversampling_rates, plot_folder: str = ".", + def __init__(self, data, bin_edges, bin_edges_retransformed, oversampling_rates, plot_folder: str = ".", plot_names=["oversampling_histogram", "oversampling_density_histogram", "oversampling_rates", "oversampling_rates_deviation"]): @@ -31,10 +31,10 @@ class PlotOversampling(AbstractPlotClass): Y_hist, Y_extreme_hist, Y_hist_dens, Y_extreme_hist_dens = self._calculate_hist(data, bin_edges) real_oversampling = Y_extreme_hist / Y_hist - self._plot_oversampling_histogram(Y_hist, Y_extreme_hist, bin_edges) + self._plot_oversampling_histogram(Y_hist, 
Y_extreme_hist, bin_edges_retransformed) self._save() self.plot_name = plot_names[1] - self._plot_oversampling_histogram(Y_hist_dens, Y_extreme_hist_dens, bin_edges) + self._plot_oversampling_histogram(Y_hist_dens, Y_extreme_hist_dens, bin_edges_retransformed) self._save() self.plot_name = plot_names[2] self._plot_oversampling_rates(oversampling_rates, real_oversampling) diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py index aef5c8e..a19ddbe 100644 --- a/mlair/run_modules/post_processing.py +++ b/mlair/run_modules/post_processing.py @@ -356,8 +356,9 @@ class PostProcessing(RunEnvironment): if (self.data_store.get('oversampling_method')=='bin_oversampling') and ( "PlotOversampling" in plot_list): bin_edges = self.data_store.get('oversampling_bin_edges') + bin_edges_retransformed = self.data_store.get('oversampling_bin_edges_retransformed') oversampling_rates = self.data_store.get('oversampling_rates_capped', 'train') - PlotOversampling(self.train_data, bin_edges, oversampling_rates, plot_folder=self.plot_path) + PlotOversampling(self.train_data, bin_edges, bin_edges_retransformed, oversampling_rates, plot_folder=self.plot_path) except Exception as e: logging.error(f"Could not create plot OversamplingPlots due to the following error: {e}") diff --git a/mlair/run_modules/pre_processing.py b/mlair/run_modules/pre_processing.py index 3354e78..ef6f325 100644 --- a/mlair/run_modules/pre_processing.py +++ b/mlair/run_modules/pre_processing.py @@ -99,10 +99,15 @@ class PreProcessing(RunEnvironment): # Get Oversampling rates (with and without cap) oversampling_rates = 1 / histogram oversampling_rates_capped = np.minimum(oversampling_rates, rates_cap) + # Get transformer variables + o3_mean = self.data_store.get("transformation")[0]["o3"]["mean"].values + o3_std = self.data_store.get("transformation")[0]["o3"]["std"].values + bin_edges_retransformed = np.floor(bin_edges*o3_std+o3_mean) # Add to datastore 
self.data_store.set('oversampling_rates', oversampling_rates, 'train') self.data_store.set('oversampling_rates_capped', oversampling_rates_capped, 'train') self.data_store.set('oversampling_bin_edges', bin_edges) + self.data_store.set('oversampling_bin_edges_retransformed', bin_edges_retransformed) #Y = None #Y_extreme = None for station in data: -- GitLab