diff --git a/mlair/data_handler/data_handler_single_station.py b/mlair/data_handler/data_handler_single_station.py index 429f9604593539e6060716c37b4c9736b6beed40..3e962fa8c4fa96637dea86db98e29a180deeac5b 100644 --- a/mlair/data_handler/data_handler_single_station.py +++ b/mlair/data_handler/data_handler_single_station.py @@ -53,7 +53,7 @@ class DataHandlerSingleStation(AbstractDataHandler): _hash = ["station", "statistics_per_var", "data_origin", "station_type", "network", "sampling", "target_dim", "target_var", "time_dim", "iter_dim", "window_dim", "window_history_size", "window_history_offset", - "window_lead_time", "interpolation_limit", "interpolation_method"] + "window_lead_time", "interpolation_limit", "interpolation_method", "variables"] def __init__(self, station, data_path, statistics_per_var=None, station_type=DEFAULT_STATION_TYPE, network=DEFAULT_NETWORK, sampling: Union[str, Tuple[str]] = DEFAULT_SAMPLING, diff --git a/mlair/data_handler/input_bootstraps.py b/mlair/data_handler/input_bootstraps.py index b8ad614f2317e804d415b23308df760f4dd8da7f..2c0027f4df701a5e5956bc74fadbba11e7fafda5 100644 --- a/mlair/data_handler/input_bootstraps.py +++ b/mlair/data_handler/input_bootstraps.py @@ -75,6 +75,14 @@ class BootstrapIterator(Iterator): else: return self._method.apply(data) + def _prepare_data_for_next(self): + index_dimension_collection = self._collection[self._position] + nboot = self._data.number_of_bootstraps + _X, _Y = self._data.data.get_data(as_numpy=False) + _X = list(map(lambda x: x.expand_dims({self.boot_dim: range(nboot)}, axis=-1), _X)) + _Y = _Y.expand_dims({self.boot_dim: range(nboot)}, axis=-1) + return _X, _Y, index_dimension_collection + class BootstrapIteratorSingleInput(BootstrapIterator): _position: int = None @@ -85,11 +93,12 @@ class BootstrapIteratorSingleInput(BootstrapIterator): def __next__(self): """Return next element or stop iteration.""" try: - index, dimension = self._collection[self._position] - nboot = self._data.number_of_bootstraps - _X, _Y = self._data.data.get_data(as_numpy=False) - _X = list(map(lambda x: x.expand_dims({self.boot_dim: range(nboot)}, axis=-1), _X)) - _Y = _Y.expand_dims({self.boot_dim: range(nboot)}, axis=-1) + _X, _Y, (index, dimension) = self._prepare_data_for_next() + # index, dimension = self._collection[self._position] + # nboot = self._data.number_of_bootstraps + # _X, _Y = self._data.data.get_data(as_numpy=False) + # _X = list(map(lambda x: x.expand_dims({self.boot_dim: range(nboot)}, axis=-1), _X)) + # _Y = _Y.expand_dims({self.boot_dim: range(nboot)}, axis=-1) single_variable = _X[index].sel({self._dimension: [dimension]}) bootstrapped_variable = self.apply_bootstrap_method(single_variable.values) bootstrapped_data = xr.DataArray(bootstrapped_variable, coords=single_variable.coords, @@ -117,11 +126,7 @@ class BootstrapIteratorVariable(BootstrapIterator): def __next__(self): """Return next element or stop iteration.""" try: - dimension = self._collection[self._position] - nboot = self._data.number_of_bootstraps - _X, _Y = self._data.data.get_data(as_numpy=False) - _X = list(map(lambda x: x.expand_dims({self.boot_dim: range(nboot)}, axis=-1), _X)) - _Y = _Y.expand_dims({self.boot_dim: range(nboot)}, axis=-1) + _X, _Y, dimension = self._prepare_data_for_next() for index in range(len(_X)): if dimension in _X[index].coords[self._dimension]: single_variable = _X[index].sel({self._dimension: [dimension]}) @@ -150,11 +155,7 @@ class BootstrapIteratorBranch(BootstrapIterator): def __next__(self): try: - index = self._collection[self._position] - nboot = self._data.number_of_bootstraps - _X, _Y = self._data.data.get_data(as_numpy=False) - _X = list(map(lambda x: x.expand_dims({self.boot_dim: range(nboot)}, axis=-1), _X)) - _Y = _Y.expand_dims({self.boot_dim: range(nboot)}, axis=-1) + _X, _Y, index = self._prepare_data_for_next() for dimension in _X[index].coords[self._dimension].values: single_variable = _X[index].sel({self._dimension: [dimension]}) bootstrapped_variable = self.apply_bootstrap_method(single_variable.values) @@ -172,6 +173,59 @@ class BootstrapIteratorBranch(BootstrapIterator): return list(range(len(data.get_X(as_numpy=False)))) +class BootstrapIteratorVariableSets(BootstrapIterator): + _variable_set_splitters: list = ['Sect', 'SectLeft', 'SectRight',] + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def __next__(self): + try: + _X, _Y, (index, dimensions) = self._prepare_data_for_next() + for dimension in dimensions: # _X[index].coords[self._dimension].values: + single_variable = _X[index].sel({self._dimension: [dimension]}) + bootstrapped_variable = self.apply_bootstrap_method(single_variable.values) + bootstrapped_data = xr.DataArray(bootstrapped_variable, coords=single_variable.coords, + dims=single_variable.dims) + _X[index] = bootstrapped_data.combine_first(_X[index]).transpose(*_X[index].dims) + self._position += 1 + except IndexError: + raise StopIteration() + _X, _Y = self._to_numpy(_X), self._to_numpy(_Y) + # dimension_return_by_seperator = [sec for i, var in enumerate(dimensions) for sec in self._variable_set_splitters if (var.endswith(sec) and i ==0)] + # return self._reshape(_X), self._reshape(_Y), (index, dimension_return_by_seperator[0]) + return self._reshape(_X), self._reshape(_Y), (index, dimensions) + + @classmethod + def create_collection(cls, data, dim): + # l = set() + # for i, x in enumerate(data.get_X(as_numpy=False)): + # l.update(x.indexes[dim].to_list()) + # # l.update(['O3Sect', 'O3SectLeft', 'O3SectRight']) # ToDo Remove : just for testing + # return [[var for var in to_list(l) if var.endswith(collection_name)] for collection_name in cls._variable_set_splitters] + + l = [] + for i, x in enumerate(data.get_X(as_numpy=False)): + l.append(x.indexes[dim].to_list()) + # l[0] = l[0] + ['o3Sect', 'o3SectLeft', 'o3SectRight', 'no2Sect', 'no2SectLeft', 'no2SectRight'] + + res = [[var for var in l[i] if var.endswith(collection_name)] for collection_name in cls._variable_set_splitters] + base_vars = [var for var in l[i] if not var.endswith(tuple(cls._variable_set_splitters))] + res.append(base_vars) + res = [(i, dimensions) for i, _ in enumerate(data.get_X(as_numpy=False)) for dimensions in res] + return res + # return list(chain(*res)) + # [[(0, 'o3'), (0, 'relhum'), (0, 'temp'), (0, 'u'), (0, 'v'), (0, 'no'), (0, 'no2'), (0, 'cloudcover'), + # (0, 'pblheight')]] + + + + # l = [] + # for i, x in enumerate(data.get_X(as_numpy=False)): + # l.append(list(map(lambda y: (i, y), x.indexes[dim]))) + # return list(chain(*l)) + + class ShuffleBootstraps: @staticmethod @@ -225,10 +279,12 @@ class Bootstraps(Iterable): self.bootstrap_method = {"shuffle": ShuffleBootstraps(), "zero_mean": MeanBootstraps(mean=0)}.get( bootstrap_method) # todo adjust number of bootstraps if mean bootstrapping + self.bootstrap_type = bootstrap_type self.BootstrapIterator = {"singleinput": BootstrapIteratorSingleInput, "branch": BootstrapIteratorBranch, - "variable": BootstrapIteratorVariable}.get(bootstrap_type, - BootstrapIteratorSingleInput) + "variable": BootstrapIteratorVariable, + "group_of_variables": BootstrapIteratorVariableSets, + }.get(bootstrap_type, BootstrapIteratorSingleInput) def __iter__(self): return self.BootstrapIterator(self, self.bootstrap_method) @@ -236,6 +292,11 @@ class Bootstraps(Iterable): def __len__(self): return len(self.BootstrapIterator.create_collection(self.data, self.bootstrap_dimension)) + def __repr__(self): + return f"Bootstraps(data={self.data}, number_of_bootstraps={self.number_of_bootstraps}, " \ + f"bootstrap_dimension='{self.bootstrap_dimension}', bootstrap_type='{self.bootstrap_type}', " \ + f"bootstrap_method='{self.bootstrap_method}')" + def bootstraps(self): return self.BootstrapIterator.create_collection(self.data, self.bootstrap_dimension) diff --git a/mlair/data_handler/iterator.py b/mlair/data_handler/iterator.py index f2e3b689512ee99524eef8445f84a5a3bdb60f90..e353f84d85a0871b00964899efb2a79bf555aefc 100644 --- a/mlair/data_handler/iterator.py +++ b/mlair/data_handler/iterator.py @@ -9,6 +9,7 @@ import math import os import shutil import pickle +import logging import dill from typing import Tuple, List @@ -142,6 +143,7 @@ class KerasIterator(keras.utils.Sequence): remaining = None mod_rank = self._get_model_rank() for data in self._collection: + logging.debug(f"prepare batches for {str(data)}") X = data.get_X(upsampling=self.upsampling) Y = [data.get_Y(upsampling=self.upsampling)[0] for _ in range(mod_rank)] if self.upsampling: diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py index 2abdbecbd45f187179598e69c8d286280106e35b..4f5107ac18617832b764f13b27352154e1913f15 100644 --- a/mlair/plotting/postprocessing_plotting.py +++ b/mlair/plotting/postprocessing_plotting.py @@ -8,6 +8,7 @@ import os import sys import warnings from typing import Dict, List, Tuple, Union +import ast import matplotlib import matplotlib.pyplot as plt @@ -128,7 +129,7 @@ class PlotMonthlySummary(AbstractPlotClass): # pragma: no cover """ data = self._data.to_dataset(name='values').to_dask_dataframe() logging.debug("... start plotting") - color_palette = [matplotlib.colors.cnames["green"]] + sns.color_palette("Blues_d", + color_palette = [matplotlib.colors.cnames["green"]] + sns.color_palette("Blues_r", self._window_lead_time).as_hex() ax = sns.boxplot(x='index', y='values', hue='ahead', data=data.compute(), whis=1.5, palette=color_palette, flierprops={'marker': '.', 'markersize': 1}, showmeans=True, @@ -464,7 +465,7 @@ class PlotClimatologicalSkillScore(AbstractPlotClass): # pragma: no cover fig, ax = plt.subplots() if not score_only: fig.set_size_inches(11.7, 8.27) - sns.boxplot(x="terms", y="data", hue="ahead", data=self._data, ax=ax, whis=1.5, palette="Blues_d", + sns.boxplot(x="terms", y="data", hue="ahead", data=self._data, ax=ax, whis=1.5, palette="Blues_r", showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, flierprops={"marker": "."}) ax.axhline(y=0, color="grey", linewidth=.5) ax.set(ylabel=f"{self._label_add(score_only)}skill score", xlabel="", title="summary of all stations", @@ -556,7 +557,7 @@ class PlotCompetitiveSkillScore(AbstractPlotClass): # pragma: no cover fig, ax = plt.subplots(figsize=(size, size * 0.8)) data = self._filter_comparisons(self._data) if single_model_comparison is True else self._data order = self._create_pseudo_order(data) - sns.boxplot(x="comparison", y="data", hue="ahead", data=data, whis=1.5, ax=ax, palette="Blues_d", + sns.boxplot(x="comparison", y="data", hue="ahead", data=data, whis=1.5, ax=ax, palette="Blues_r", showmeans=True, meanprops={"markersize": 3, "markeredgecolor": "k"}, flierprops={"marker": "."}, order=order) ax.axhline(y=0, color="grey", linewidth=.5) @@ -571,7 +572,7 @@ class PlotCompetitiveSkillScore(AbstractPlotClass): # pragma: no cover fig, ax = plt.subplots() data = self._filter_comparisons(self._data) if single_model_comparison is True else self._data order = self._create_pseudo_order(data) - sns.boxplot(y="comparison", x="data", hue="ahead", data=data, whis=1.5, ax=ax, palette="Blues_d", + sns.boxplot(y="comparison", x="data", hue="ahead", data=data, whis=1.5, ax=ax, palette="Blues_r", showmeans=True, meanprops={"markersize": 3, "markeredgecolor": "k"}, flierprops={"marker": "."}, order=order) ax.axvline(x=0, color="grey", linewidth=.5) @@ -637,7 +638,7 @@ class PlotSectorialSkillScore(AbstractPlotClass): # pragma: no cover size = max([len(np.unique(self._data.sector)), 6]) fig, ax = plt.subplots(figsize=(size, size * 0.8)) data = self._data - sns.boxplot(x="sector", y="data", hue="ahead", data=data, whis=1, ax=ax, palette="Blues_d", + sns.boxplot(x="sector", y="data", hue="ahead", data=data, whis=1, ax=ax, palette="Blues_r", showmeans=True, meanprops={"markersize": 3, "markeredgecolor": "k"}, flierprops={"marker": "."}, ) ax.axhline(y=0, color="grey", linewidth=.5) @@ -652,7 +653,7 @@ class PlotSectorialSkillScore(AbstractPlotClass): # pragma: no cover """Plot skill scores of the comparisons, but vertically aligned.""" fig, ax = plt.subplots() data = self._data - sns.boxplot(y="sector", x="data", hue="ahead", data=data, whis=1.5, ax=ax, palette="Blues_d", + sns.boxplot(y="sector", x="data", hue="ahead", data=data, whis=1.5, ax=ax, palette="Blues_r", showmeans=True, meanprops={"markersize": 3, "markeredgecolor": "k"}, flierprops={"marker": "."}, ) ax.axvline(x=0, color="grey", linewidth=.5) @@ -761,7 +762,8 @@ class PlotFeatureImportanceSkillScore(AbstractPlotClass): # pragma: no cover return {"singleinput": "single input"}.get(boot_type, boot_type) def _set_title(self, model_name, branch=None): - title_d = {"single input": "Single Inputs", "branch": "Input Branches", "variable": "Variables"} + title_d = {"single input": "Single Inputs", "branch": "Input Branches", "variable": "Variables", + "group_of_variables": "grouped variables"} base_title = f"{model_name}\nImportance of {title_d[self._boot_type]}" additional = [] @@ -841,7 +843,16 @@ class PlotFeatureImportanceSkillScore(AbstractPlotClass): # pragma: no cover num = arr[:, 0] if arr.shape[keep] == 1: # keep dim has only length 1, no number tags required return num - new_val = arr[:, keep] + if self._boot_type == "group_of_variables": + h = [] + for i, subset in enumerate(arr[:, keep]): + group_name = self.findstem(ast.literal_eval(subset)) + if group_name == '': + group_name = "Base" + h.append(group_name) + new_val = h + else: + new_val = arr[:, keep] if self._all_values_are_equal(num, axis=0): return new_val elif as_unique is True: @@ -849,6 +860,54 @@ class PlotFeatureImportanceSkillScore(AbstractPlotClass): # pragma: no cover else: raise NotImplementedError + @staticmethod + def findstem(arr): + """ + directly taken from: https://www.geeksforgeeks.org/longest-common-substring-array-strings/ + + # Python 3 program to find the stem + # of given list of words + + # function to find the stem (longest + # common substring) from the string array + :param arr: + :type arr: + :return: + :rtype: + """ + + # Determine size of the array + n = len(arr) + + # Take first word from array + # as reference + s = arr[0] + l = len(s) + + res = "" + + for i in range(l): + for j in range(i + 1, l + 1): + + # generating all possible substrings + # of our reference string arr[0] i.e s + stem = s[i:j] + k = 1 + for k in range(1, n): + + # Check if the generated stem is + # common to all words + if stem not in arr[k]: + break + + # If current substring is present in + # all strings and its length is greater + # than current result + if (k + 1 == n and len(res) < len(stem)): + res = stem + + return res + @staticmethod def _get_number_tag(values, split_by): arr = np.array([v.split(split_by) for v in values]) @@ -872,7 +931,7 @@ class PlotFeatureImportanceSkillScore(AbstractPlotClass): # pragma: no cover return "" if score_only else "terms and " def _plot(self, branch=None, separate_vars=None): - """Plot climatological skill score.""" + """Plot feature importance skill score.""" if separate_vars is None: self._plot_all_variables(branch) else: @@ -917,7 +976,7 @@ class PlotFeatureImportanceSkillScore(AbstractPlotClass): # pragma: no cover first_box_width = .8 sns.boxplot(x=self._x_name, y="data", hue=self._ahead_dim, data=data_first, ax=ax[0], whis=1., - palette="Blues_d", showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, + palette="Blues_r", showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, showfliers=False, width=first_box_width) ax[0].set(ylabel=f"skill score", xlabel="") if self._ylim is not None: @@ -925,7 +984,7 @@ class PlotFeatureImportanceSkillScore(AbstractPlotClass): # pragma: no cover ax[0].set(ylim=_ylim) sns.boxplot(x=self._x_name, y="data", hue=self._ahead_dim, data=data_second, ax=ax[1], whis=1.5, - palette="Blues_d", showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, + palette="Blues_r", showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, showfliers=False, flierprops={"marker": "."}) ax[1].set(ylabel="", xlabel="") @@ -1018,17 +1077,17 @@ class PlotFeatureImportanceSkillScore(AbstractPlotClass): # pragma: no cover if self._boot_type == "branch": fig, ax = plt.subplots(figsize=(0.5 + 2 / len(plot_data[self._x_name].unique()) + len(plot_data[self._x_name].unique()),4)) sns.boxplot(x=self._x_name, y="data", hue=self._ahead_dim, data=plot_data, ax=ax, whis=1., - palette="Blues_d", showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, + palette="Blues_r", showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, showfliers=False, width=0.8) else: fig, ax = plt.subplots() - sns.boxplot(x=self._x_name, y="data", hue=self._ahead_dim, data=plot_data, ax=ax, whis=1.5, palette="Blues_d", + sns.boxplot(x=self._x_name, y="data", hue=self._ahead_dim, data=plot_data, ax=ax, whis=1.5, palette="Blues_r", showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, showfliers=False) ax.axhline(y=0, color="grey", linewidth=.5) #<<<<<<< HEAD #======= - if group_size > 1: - [ax.axvline(x + .5, color='grey') for i, x in enumerate(ax.get_xticks(), start=1) if i % group_size == 0] + # if group_size > 1: + # [ax.axvline(x + .5, color='grey') for i, x in enumerate(ax.get_xticks(), start=1) if i % group_size == 0] plt.xticks(rotation=45, horizontalalignment="right") ax.set(ylabel=f"skill score", xlabel="", title="summary of all stations") @@ -1154,7 +1213,7 @@ class PlotTimeSeries: # pragma: no cover return f, ax[:, 0], factor def _plot_ahead(self, ax, data): - color = sns.color_palette("Blues_d", self._window_lead_time).as_hex() + color = sns.color_palette("Blues_r", self._window_lead_time).as_hex() for ahead in data.coords[self._ahead_dim].values: plot_data = data.sel({"type": self._model_name, self._ahead_dim: ahead}).drop(["type", self._ahead_dim]).squeeze().shift(index=ahead) label = f"{ahead}{self._sampling}"