diff --git a/docs/_source/_plots/data_availability_histogram_hist.png b/docs/_source/_plots/data_availability_histogram_hist.png new file mode 100644 index 0000000000000000000000000000000000000000..30f940145b5db8a6af1b8ce55c6737d9022da419 Binary files /dev/null and b/docs/_source/_plots/data_availability_histogram_hist.png differ diff --git a/docs/_source/_plots/data_availability_histogram_hist_cum.png b/docs/_source/_plots/data_availability_histogram_hist_cum.png new file mode 100644 index 0000000000000000000000000000000000000000..afb9796161739d060eebcd980258bc1bb1b9a14f Binary files /dev/null and b/docs/_source/_plots/data_availability_histogram_hist_cum.png differ diff --git a/docs/_source/_plots/skill_score_bootstrap_separated.png b/docs/_source/_plots/skill_score_bootstrap_separated.png new file mode 100644 index 0000000000000000000000000000000000000000..cddb0f005286e08617bf43fd55fed9b5862856d5 Binary files /dev/null and b/docs/_source/_plots/skill_score_bootstrap_separated.png differ diff --git a/mlair/configuration/defaults.py b/mlair/configuration/defaults.py index ce42fc0eed6e891bc0a0625666da3dccfcc8a3ee..1862d6734430d42a2d0cda0b199acef97b58bebb 100644 --- a/mlair/configuration/defaults.py +++ b/mlair/configuration/defaults.py @@ -48,7 +48,7 @@ DEFAULT_CREATE_NEW_BOOTSTRAPS = False DEFAULT_NUMBER_OF_BOOTSTRAPS = 20 DEFAULT_PLOT_LIST = ["PlotMonthlySummary", "PlotStationMap", "PlotClimatologicalSkillScore", "PlotTimeSeries", "PlotCompetitiveSkillScore", "PlotBootstrapSkillScore", "PlotConditionalQuantiles", - "PlotAvailability", "PlotSeparationOfScales"] + "PlotAvailability", "PlotAvailabilityHistogram", "PlotSeparationOfScales"] DEFAULT_SAMPLING = "daily" DEFAULT_DATA_ORIGIN = {"cloudcover": "REA", "humidity": "REA", "pblheight": "REA", "press": "REA", "relhum": "REA", "temp": "REA", "totprecip": "REA", "u": "REA", "v": "REA", "no": "", "no2": "", "o3": "", diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py index f0b6baeb0b56126ccccb80c9da993fb406428d93..5383914d6229e2c7017dada2bdcb9c315e327cf7 100644 --- a/mlair/plotting/postprocessing_plotting.py +++ b/mlair/plotting/postprocessing_plotting.py @@ -1,6 +1,6 @@ """Collection of plots to evaluate a model, create overviews on data or forecasts.""" __author__ = "Lukas Leufen, Felix Kleinert" -__date__ = '2019-12-17' +__date__ = '2020-11-23' import logging import math @@ -8,10 +8,10 @@ import os import warnings from typing import Dict, List, Tuple - import matplotlib import matplotlib.patches as mpatches import matplotlib.pyplot as plt +import matplotlib.dates as mdates import numpy as np import pandas as pd import seaborn as sns @@ -75,7 +75,7 @@ class AbstractPlotClass: """ - def __init__(self, plot_folder, plot_name, resolution=500): + def __init__(self, plot_folder, plot_name, resolution=500, rc_params=None): """Set up plot folder and name, and plot resolution (default 500dpi).""" plot_folder = os.path.abspath(plot_folder) if not os.path.exists(plot_folder): @@ -83,6 +83,15 @@ class AbstractPlotClass: self.plot_folder = plot_folder self.plot_name = plot_name self.resolution = resolution + if rc_params is None: + rc_params = {'axes.labelsize': 'large', + 'xtick.labelsize': 'large', + 'ytick.labelsize': 'large', + 'legend.fontsize': 'large', + 'axes.titlesize': 'large', + } + self.rc_params = rc_params + self._update_rc_params() def _plot(self, *args): """Abstract plot class needs to be implemented in inheritance.""" @@ -95,6 +104,24 @@ class AbstractPlotClass: plt.savefig(plot_name, 
dpi=self.resolution, **kwargs)
         plt.close('all')
 
+    def _update_rc_params(self):
+        plt.rcParams.update(self.rc_params)
+
+    @staticmethod
+    def _get_sampling(sampling):
+        if sampling == "daily":
+            return "D"
+        elif sampling == "hourly":
+            return "h"
+
+    @staticmethod
+    def get_dataset_colors():
+        """
+        Standard colors used for train, val and test sets during postprocessing.
+        """
+        colors = {"train": "#e69f00", "val": "#009e73", "test": "#56b4e9"}  # hex code
+        return colors
+
 
 @TimeTrackingWrapper
 class PlotMonthlySummary(AbstractPlotClass):
@@ -113,18 +140,19 @@ class PlotMonthlySummary(AbstractPlotClass):
     :param window_lead_time: lead time to plot, if window_lead_time is higher than the available lead time or not given
         the maximum lead time from data is used. (default None -> use maximum lead time from data).
     :param plot_folder: path to save the plot (default: current directory)
+    :param target_var_unit: unit of the target variable used in the y-axis label (default: 'ppb')
     """
 
     def __init__(self, stations: List, data_path: str, name: str, target_var: str, window_lead_time: int = None,
-                 plot_folder: str = "."):
+                 plot_folder: str = ".", target_var_unit: str = 'ppb'):
         """Set attributes and create plot."""
         super().__init__(plot_folder, "monthly_summary_box_plot")
         self._data_path = data_path
         self._data_name = name
         self._data = self._prepare_data(stations)
         self._window_lead_time = self._get_window_lead_time(window_lead_time)
-        self._plot(target_var)
+        self._plot(target_var, target_var_unit)
         self._save()
 
     def _prepare_data(self, stations: List) -> xr.DataArray:
@@ -176,7 +204,12 @@ class PlotMonthlySummary(AbstractPlotClass):
             window_lead_time = ahead_steps
         return min(ahead_steps, window_lead_time)
 
-    def _plot(self, target_var: str):
+    @staticmethod
+    def _spell_out_chemical_concentrations(short_name: str):
+        short2long = {'o3': 'ozone', 'no': 'nitrogen oxide', 'no2': 'nitrogen dioxide', 'nox': 'nitrogen oxides'}
+        return f"{short2long[short_name]} concentration"
+
+    def _plot(self, target_var: str, target_var_unit: str):
         """
         Create a monthly grouped box plot over all stations but with separate boxes for each lead time step.
@@ -189,7 +222,8 @@ class PlotMonthlySummary(AbstractPlotClass): ax = sns.boxplot(x='index', y='values', hue='ahead', data=data.compute(), whis=1., palette=color_palette, flierprops={'marker': '.', 'markersize': 1}, showmeans=True, meanprops={'markersize': 1, 'markeredgecolor': 'k'}) - ax.set(xlabel='month', ylabel=f'{target_var}') + ylabel = self._spell_out_chemical_concentrations(target_var) + ax.set(xlabel='month', ylabel=f'{ylabel} (in {target_var_unit})') plt.tight_layout() @@ -453,7 +487,8 @@ class PlotConditionalQuantiles(AbstractPlotClass): def _plot(self): """Start plotting routines: overall plot and seasonal (if enabled).""" - logging.info(f"start plotting {self.__class__.__name__}, scheduled number of plots: {(len(self._seasons) + 1) * 2}") + logging.info( + f"start plotting {self.__class__.__name__}, scheduled number of plots: {(len(self._seasons) + 1) * 2}") if len(self._seasons) > 0: self._plot_seasons() @@ -504,7 +539,7 @@ class PlotConditionalQuantiles(AbstractPlotClass): # add histogram of the segmented data (pred_name) handles, labels = ax.get_legend_handles_labels() segmented_data.loc[x_model, d, :].to_pandas().hist(bins=self._bins, ax=ax2, color='k', alpha=.3, grid=False, - rwidth=1) + rwidth=1) # add legend plt.legend(handles[:3] + [handles[-1]], self._opts["legend"], loc='upper left', fontsize='large') # adjust limits and set labels @@ -688,25 +723,37 @@ class PlotBootstrapSkillScore(AbstractPlotClass): (score_only=True, default). Y-axis is adjusted following the data and not hard coded. The plot is saved under plot_folder path with name skill_score_clim_{extra_name_tag}{model_setup}.pdf and resolution of 500dpi. + By passing a list `separate_vars` containing variable names, a second plot is created showing the `separate_vars` + and the remaining variables side by side with different scaling. + .. image:: ../../../../../_source/_plots/skill_score_bootstrap.png :width: 400 + .. image:: ../../../../../_source/_plots/skill_score_bootstrap_separated.png + :width: 400 + """ - def __init__(self, data: Dict, plot_folder: str = ".", model_setup: str = ""): + def __init__(self, data: Dict, plot_folder: str = ".", model_setup: str = "", separate_vars: List = None): """ Set attributes and create plot. :param data: dictionary with station names as keys and 2D xarrays as values, consist on axis ahead and terms. 
     :param plot_folder: path to save the plot (default: current directory)
     :param model_setup: architecture type to specify plot name (default "CNN")
+    :param separate_vars: variables to plot separately (default: ['o3'])
     """
     super().__init__(plot_folder, f"skill_score_bootstrap_{model_setup}")
+    if separate_vars is None:
+        separate_vars = ['o3']
     self._labels = None
     self._x_name = "boot_var"
     self._data = self._prepare_data(data)
     self._plot()
     self._save()
+    self.plot_name += '_separated'
+    self._plot(separate_vars=separate_vars)
+    self._save(bbox_inches='tight')
 
     def _prepare_data(self, data: Dict) -> pd.DataFrame:
         """
@@ -719,11 +766,30 @@ class PlotBootstrapSkillScore(AbstractPlotClass):
         :return: pre-processed data set
         """
         data = helpers.dict_to_xarray(data, "station").sortby(self._x_name)
+        new_boot_coords = self._return_vars_without_number_tag(data.coords['boot_var'].values, split_by='_', keep=1)
+        data = data.assign_coords({'boot_var': new_boot_coords})
         self._labels = [str(i) + "d" for i in data.coords["ahead"].values]
         if "station" not in data.dims:
             data = data.expand_dims("station")
         return data.to_dataframe("data").reset_index(level=[0, 1, 2])
 
+    def _return_vars_without_number_tag(self, values, split_by, keep):
+        arr = np.array([v.split(split_by) for v in values])
+        num = arr[:, 0]
+        new_val = arr[:, keep]
+        if self._all_values_are_equal(num, axis=0):
+            return new_val
+        else:
+            raise NotImplementedError
+
+    @staticmethod
+    def _all_values_are_equal(arr, axis=0):
+        if np.all(arr == arr[0], axis=axis):
+            return True
+        else:
+            return False
+
     def _label_add(self, score_only: bool):
         """
         Add the phrase "terms and " if score_only is disabled or empty string (if score_only=True).
@@ -733,12 +799,111 @@ class PlotBootstrapSkillScore(AbstractPlotClass):
         """
         return "" if score_only else "terms and "
 
-    def _plot(self):
+    def _plot(self, separate_vars=None):
         """Plot climatological skill score."""
+        if separate_vars is None:
+            self._plot_all_variables()
+        else:
+            self._plot_selected_variables(separate_vars)
+
+    def _plot_selected_variables(self, separate_vars: List):
+        data = self._data
+        self.raise_error_if_separate_vars_do_not_exist(data, separate_vars)
+        all_variables = self._get_unique_values_from_column_of_df(data, 'boot_var')
+        remaining_vars = helpers.remove_items(all_variables, separate_vars)
+        data_first = self._select_data(df=data, variables=separate_vars, column_name='boot_var')
+        data_second = self._select_data(df=data, variables=remaining_vars, column_name='boot_var')
+
+        fig, ax = plt.subplots(nrows=1, ncols=2,
+                               gridspec_kw={'width_ratios': [len(separate_vars), len(remaining_vars)]})
+        if len(separate_vars) > 1:
+            first_box_width = .8
+        else:
+            first_box_width = 2.
+
+        sns.boxplot(x=self._x_name, y="data", hue="ahead", data=data_first, ax=ax[0], whis=1., palette="Blues_d",
+                    showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"},
+                    flierprops={"marker": "."}, width=first_box_width)
+        ax[0].set(ylabel="skill score", xlabel="")
+
+        sns.boxplot(x=self._x_name, y="data", hue="ahead", data=data_second, ax=ax[1], whis=1., palette="Blues_d",
+                    showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"},
+                    flierprops={"marker": "."})
+        ax[1].set(ylabel="", xlabel="")
+        ax[1].yaxis.tick_right()
+        handles, _ = ax[1].get_legend_handles_labels()
+        for sax in ax:
+            matplotlib.pyplot.sca(sax)
+            sax.axhline(y=0, color="grey", linewidth=.5)
+            plt.xticks(rotation=45, ha='right')
+            sax.legend_.remove()
+
+        fig.legend(handles, self._labels, loc='upper center', ncol=len(handles) + 1)
+
+        def align_yaxis(ax1, ax2):
+            """
+            Align zeros of the two axes, zooming them out by same ratio.
+
+            This function is copied from https://stackoverflow.com/a/41259922
+            """
+            axes = (ax1, ax2)
+            extrema = [ax.get_ylim() for ax in axes]
+            tops = [extr[1] / (extr[1] - extr[0]) for extr in extrema]
+            # Ensure that plots (intervals) are ordered bottom to top:
+            if tops[0] > tops[1]:
+                axes, extrema, tops = [list(reversed(l)) for l in (axes, extrema, tops)]
+
+            # How much would the plot overflow if we kept current zoom levels?
+            tot_span = tops[1] + 1 - tops[0]
+
+            b_new_t = extrema[0][0] + tot_span * (extrema[0][1] - extrema[0][0])
+            t_new_b = extrema[1][1] - tot_span * (extrema[1][1] - extrema[1][0])
+            axes[0].set_ylim(extrema[0][0], b_new_t)
+            axes[1].set_ylim(t_new_b, extrema[1][1])
+
+        align_yaxis(ax[0], ax[1])
+        align_yaxis(ax[0], ax[1])
+
+    @staticmethod
+    def _select_data(df: pd.DataFrame, variables: List[str], column_name: str) -> pd.DataFrame:
+        for i, variable in enumerate(variables):
+            if i == 0:
+                selected_data = df.loc[df[column_name] == variable]
+            else:
+                tmp_var = df.loc[df[column_name] == variable]
+                selected_data = pd.concat([selected_data, tmp_var], axis=0)
+        return selected_data
+
+    def raise_error_if_separate_vars_do_not_exist(self, data, separate_vars):
+        if not self._variables_exist_in_df(df=data, variables=separate_vars):
+            raise ValueError("At least one entry of `separate_vars` does not exist in the plot data.")
+
+    @staticmethod
+    def _get_unique_values_from_column_of_df(df: pd.DataFrame, column_name: str) -> List:
+        return list(df[column_name].unique())
+
+    def _variables_exist_in_df(self, df: pd.DataFrame, variables: List[str], column_name: str = 'boot_var'):
+        vars_in_df = set(self._get_unique_values_from_column_of_df(df, column_name))
+        return set(variables).issubset(vars_in_df)
+
+    def _plot_all_variables(self):
+        """Plot the skill scores of all variables in a single panel."""
         fig, ax = plt.subplots()
         sns.boxplot(x=self._x_name, y="data", hue="ahead", data=self._data, ax=ax, whis=1., palette="Blues_d",
                     showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, flierprops={"marker": "."})
         ax.axhline(y=0, color="grey", linewidth=.5)
+        plt.xticks(rotation=45)
         ax.set(ylabel=f"skill score", xlabel="", title="summary of all stations")
         handles, _ = ax.get_legend_handles_labels()
         ax.legend(handles, self._labels)
@@ -859,6 +1024,7 @@ class PlotTimeSeries:
     def _get_time_range(data):
         def f(x, f_x):
             return pd.to_datetime(f_x(x.index.values)).year
+
         return f(data, min), f(data, max)
 
     @staticmethod
@@ -911,8 +1077,10 @@ class PlotAvailability(AbstractPlotClass):
         # create standard Gantt plot for all stations (currently in single pdf file with single page)
         super().__init__(plot_folder, "data_availability")
         self.dim = time_dimension
-        self.linewidth = None
         self.sampling = self._get_sampling(sampling)
+        self.linewidth = None
+        if self.sampling == 'h':
+            self.linewidth = 0.001
         plot_dict = self._prepare_data(generators)
         lgd = self._plot(plot_dict)
         self._save(bbox_extra_artists=(lgd,), bbox_inches="tight")
@@ -927,13 +1095,6 @@ class PlotAvailability(AbstractPlotClass):
         lgd = self._plot(plot_dict_summary)
         self._save(bbox_extra_artists=(lgd,), bbox_inches="tight")
 
-    def _get_sampling(self, sampling):
-        if sampling == "daily":
-            return "D"
-        elif sampling == "hourly":
-            self.linewidth = 0.001
-            return "h"
-
     def _prepare_data(self, generators: Dict[str, DataCollection]):
         plt_dict = {}
         for subset, data_collection in generators.items():
@@ -978,9 +1139,7 @@ class PlotAvailability(AbstractPlotClass):
         return plt_dict
 
     def _plot(self, plt_dict):
-        # colors = {"train": "orange", "val": "blueishgreen", "test": "skyblue"}  # color names
-        colors = {"train": "#e69f00", "val": "#009e73", "test": "#56b4e9"}  # hex code
-        # colors = {"train": (230, 159, 0), "val": (0, 158, 115), "test": (86, 180, 233)}  # in rgb but as abs values
+        colors = self.get_dataset_colors()
         pos = 0
         height = 0.8  # should be <= 1
         yticklabels = []
@@ -1025,6 +1184,169 @@ class PlotSeparationOfScales(AbstractPlotClass):
         self._save()
 
 
+@TimeTrackingWrapper
+class PlotAvailabilityHistogram(AbstractPlotClass):
+    """
+    Create data availability plots as histograms.
+
+    Each entry of each generator is checked for `notnull()` values along the datetime axis (boolean).
+    Calling this class creates two histograms per generator:
+
+    1) data_availability_histogram: datetime (x-axis) vs. number of stations with available data (y-axis)
+    2) data_availability_histogram_cumulative: number of samples (x-axis) vs. number of stations having at least
+       this number of samples (y-axis)
+
+    .. image:: ../../../../../_source/_plots/data_availability_histogram_hist.png
+        :width: 400
+
+    .. image:: ../../../../../_source/_plots/data_availability_histogram_hist_cum.png
+        :width: 400
+
+    """
+
+    def __init__(self, generators: Dict[str, DataCollection], plot_folder: str = ".",
+                 subset_dim: str = 'DataSet', history_dim: str = 'window',
+                 station_dim: str = 'Stations'):
+        super().__init__(plot_folder, "data_availability_histogram")
+
+        self.subset_dim = subset_dim
+        self.history_dim = history_dim
+        self.station_dim = station_dim
+
+        self.freq = None
+        self.temporal_dim = None
+        self.target_dim = None
+        self._prepare_data(generators)
+
+        for plt_type in self.allowed_plot_types:
+            plot_name_tmp = self.plot_name
+            self.plot_name += '_' + plt_type
+            self._plot(plt_type=plt_type)
+            self._save()
+            self.plot_name = plot_name_tmp
+
+    def _set_dims_from_datahandler(self, data_handler):
+        self.temporal_dim = data_handler.id_class.time_dim
+        self.target_dim = data_handler.id_class.target_dim
+        self.freq = self._get_sampling(data_handler.id_class.sampling)
+
+    @property
+    def allowed_plot_types(self):
+        plot_types = ['hist', 'hist_cum']
+        return plot_types
+
+    def _prepare_data(self, generators: Dict[str, DataCollection]):
+        """
+        Prepare data to be used by the plot methods.
+
+        Creates xarrays which are sums of valid data (boolean sums) across i) station_dim and ii) temporal_dim.
+        """
+        avail_data_time_sum = {}
+        avail_data_station_sum = {}
+        dataset_time_interval = {}
+        for subset, generator in generators.items():
+            avail_list = []
+            for station in generator:
+                self._set_dims_from_datahandler(data_handler=station)
+                station_data_x = station.get_X(as_numpy=False)[0]
+                station_data_x = station_data_x.loc[{self.history_dim: 0,  # select most recent window frame
+                                                     self.target_dim: station_data_x[self.target_dim].values[0]}]
+                avail_list.append(station_data_x.notnull())
+            avail_data = xr.concat(avail_list, dim=self.station_dim).notnull()
+            avail_data_time_sum[subset] = avail_data.sum(dim=self.station_dim)
+            avail_data_station_sum[subset] = avail_data.sum(dim=self.temporal_dim)
+            dataset_time_interval[subset] = self._get_first_and_last_indexelement_from_xarray(
+                avail_data_time_sum[subset], dim_name=self.temporal_dim, return_type='as_dict')
+        avail_data_amount = xr.concat(avail_data_time_sum.values(),
+                                      pd.Index(avail_data_time_sum.keys(), name=self.subset_dim))
+        full_time_index = self._make_full_time_index(avail_data_amount.coords[self.temporal_dim].values,
+                                                     freq=self.freq)
+        self.avail_data_cum_sum = xr.concat(avail_data_station_sum.values(),
+                                            pd.Index(avail_data_station_sum.keys(), name=self.subset_dim))
+        self.avail_data_amount = avail_data_amount.reindex({self.temporal_dim: full_time_index})
+        self.dataset_time_interval = dataset_time_interval
+
+    @staticmethod
+    def _get_first_and_last_indexelement_from_xarray(xarray, dim_name, return_type='as_tuple'):
+        if isinstance(xarray, xr.DataArray):
+            first = xarray.coords[dim_name].values[0]
+            last = xarray.coords[dim_name].values[-1]
+            if return_type == 'as_tuple':
+                return first, last
+            elif return_type == 'as_dict':
+                return {'first': first, 'last': last}
+            else:
+                raise TypeError(f"return_type must be 'as_tuple' or 'as_dict', but is '{return_type}'")
+        else:
+            raise TypeError(f"xarray must be of type xr.DataArray, but is of type {type(xarray)}")
+
+    @staticmethod
+    def _make_full_time_index(irregular_time_index, freq):
+        full_time_index = pd.date_range(start=irregular_time_index[0], end=irregular_time_index[-1], freq=freq)
+        return full_time_index
+
+    def _plot(self, plt_type='hist', *args):
+        if plt_type == 'hist':
+            self._plot_hist()
+        elif plt_type == 'hist_cum':
+            self._plot_hist_cum()
+        else:
+            raise ValueError(f"plt_type must be 'hist' or 'hist_cum', but is '{plt_type}'")
+
+    def _plot_hist(self, *args):
+        colors = self.get_dataset_colors()
+        fig, axes = plt.subplots(figsize=(10, 3))
+        for subset in self.dataset_time_interval.keys():
+            plot_dataset = self.avail_data_amount.sel(
+                {self.subset_dim: subset,
+                 self.temporal_dim: slice(self.dataset_time_interval[subset]['first'],
+                                          self.dataset_time_interval[subset]['last'])})
+            plot_dataset.plot.step(color=colors[subset], ax=axes, label=subset)
+            plt.fill_between(plot_dataset.coords[self.temporal_dim].values, plot_dataset.values, color=colors[subset])
+
+        lgd = fig.legend(loc="upper right", ncol=len(self.dataset_time_interval),
+                         facecolor='white', framealpha=1, edgecolor='black')
+        for lgd_line in lgd.get_lines():
+            lgd_line.set_linewidth(4.0)
+        plt.gca().xaxis.set_major_locator(mdates.YearLocator())
+        plt.title('')
+        plt.ylabel('Number of samples')
+        plt.tight_layout()
+
+    def _plot_hist_cum(self, *args):
+        colors = self.get_dataset_colors()
+        fig, axes = plt.subplots(figsize=(10, 3))
+        n_bins = int(self.avail_data_cum_sum.max().values)
+        bins = np.arange(0, n_bins + 1)
+        descending_subsets = self.avail_data_cum_sum.max(dim=self.station_dim).sortby(
+            self.avail_data_cum_sum.max(dim=self.station_dim), ascending=False
+        ).coords[self.subset_dim].values
+
+        for subset in descending_subsets:
+            # cumulative=-1 draws a reversed cumulative histogram:
+            # y(x) = number of stations providing at least x valid samples
+            self.avail_data_cum_sum.sel({self.subset_dim: subset}).plot.hist(ax=axes, bins=bins, label=subset,
+                                                                             cumulative=-1, color=colors[subset])
+
+        lgd = fig.legend(loc="upper right", ncol=len(self.dataset_time_interval),
+                         facecolor='white', framealpha=1, edgecolor='black')
+        plt.title('')
+        plt.ylabel('Number of stations')
+        plt.xlabel('Number of samples')
+        plt.xlim((bins[0], bins[-1]))
+        plt.tight_layout()
+
+
 if __name__ == "__main__":
     stations = ['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087']
     path = "../../testrun_network/forecasts"
diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py
index 3dc91cbd54094f116f0d959fb9c845751e998464..21fbb3edfa2c9665db12f7ea6bb0f9310ca324a1 100644
--- a/mlair/run_modules/post_processing.py
+++ b/mlair/run_modules/post_processing.py
@@ -19,8 +19,8 @@ from mlair.helpers import TimeTracking, statistics, extract_value
 from mlair.model_modules.linear_model import OrdinaryLeastSquaredModel
 from mlair.model_modules.model_class import AbstractModelClass
 from mlair.plotting.postprocessing_plotting import PlotMonthlySummary, PlotStationMap, PlotClimatologicalSkillScore, \
-    PlotCompetitiveSkillScore, PlotTimeSeries, PlotBootstrapSkillScore, PlotAvailability, PlotConditionalQuantiles, \
-    PlotSeparationOfScales
+    PlotCompetitiveSkillScore, PlotTimeSeries, PlotBootstrapSkillScore, PlotAvailability, PlotAvailabilityHistogram, \
+    PlotConditionalQuantiles, PlotSeparationOfScales
 from mlair.run_modules.run_environment import RunEnvironment
 
 
@@ -293,6 +293,10 @@ class PostProcessing(RunEnvironment):
         if "PlotAvailability" in plot_list:
             avail_data = {"train": self.train_data, "val": self.val_data, "test": self.test_data}
             PlotAvailability(avail_data, plot_folder=self.plot_path, time_dimension=time_dimension)
+        if "PlotAvailabilityHistogram" in plot_list:
+            avail_data = {"train": self.train_data, "val": self.val_data, "test": self.test_data}
+            PlotAvailabilityHistogram(avail_data, plot_folder=self.plot_path)
 
     def calculate_test_score(self):
         """Evaluate test score of model and save locally."""
diff --git a/mlair/run_modules/pre_processing.py b/mlair/run_modules/pre_processing.py
index 13a4b38184ea3c00c9cf5990c35c6d7bdcb6225d..7f546865f3ab3ebae8b48f375300035f5be766a4 100644
--- a/mlair/run_modules/pre_processing.py
+++ b/mlair/run_modules/pre_processing.py
@@ -115,9 +115,47 @@ class PreProcessing(RunEnvironment):
         precision = 4
         path = os.path.join(self.data_store.get("experiment_path"), "latex_report")
         path_config.check_path_and_create(path)
-        set_names = ["train", "val", "test"]
-        df = pd.DataFrame(columns=meta_data + set_names)
-        for set_name in set_names:
+        names_of_set = ["train", "val", "test"]
+        df = self.create_info_df(meta_data, meta_round, names_of_set, precision)
+        column_format = self.create_column_format_for_tex(df)
+        self.save_to_tex(path=path, filename="station_sample_size.tex", column_format=column_format, df=df)
+        self.save_to_md(path=path, filename="station_sample_size.md", df=df)
+        df_nometa = df.drop(meta_data, axis=1)
+        self.save_to_tex(path=path, filename="station_sample_size_short.tex", column_format=column_format,
+                         df=df_nometa)
+        self.save_to_md(path=path, filename="station_sample_size_short.md", df=df_nometa)
+        df_descr = self.create_describe_df(df_nometa)
+        column_format = self.create_column_format_for_tex(df_descr)
+        self.save_to_tex(path=path, filename="station_describe_short.tex", column_format=column_format, df=df_descr)
+        self.save_to_md(path=path, filename="station_describe_short.md", df=df_descr)
+
+    @staticmethod
+    def create_describe_df(df, percentiles=None, ignore_last_lines: int = 2):
+        if percentiles is None:
+            percentiles = [.05, .1, .25, .5, .75, .9, .95]
+        df_descr = df.iloc[:-ignore_last_lines].astype('float32').describe(
+            percentiles=percentiles).astype('int32')
+        df_descr = pd.concat([df.loc[['# Samples']], df_descr]).T
+        df_descr.rename(columns={"# Samples": "no. samples", "count": "no. stations"}, inplace=True)
+        df_descr_colnames = list(df_descr.columns)
+        df_descr_colnames = [df_descr_colnames[1]] + [df_descr_colnames[0]] + df_descr_colnames[2:]
+        df_descr = df_descr[df_descr_colnames]
+        return df_descr
+
+    @staticmethod
+    def save_to_tex(path, filename, column_format, df, na_rep='---'):
+        df.to_latex(os.path.join(path, filename), na_rep=na_rep, column_format=column_format)
+
+    @staticmethod
+    def save_to_md(path, filename, df, mode="w", encoding='utf-8', tablefmt="github"):
+        df.to_markdown(open(os.path.join(path, filename), mode=mode, encoding=encoding),
+                       tablefmt=tablefmt)
+
+    def create_info_df(self, meta_data, meta_round, names_of_set, precision):
+        df = pd.DataFrame(columns=meta_data + names_of_set)
+        for set_name in names_of_set:
             data = self.data_store.get("data_collection", set_name)
             for station in data:
                 station_name = str(station.id_class)
@@ -130,15 +168,21 @@ class PreProcessing(RunEnvironment):
         df.sort_index(inplace=True)
         df = df.reindex(df.index.drop(["# Stations", "# Samples"]).to_list() + ["# Stations", "# Samples"], )
         df.index.name = 'stat. ID'
+        return df
+
+    @staticmethod
+    def create_column_format_for_tex(df: pd.DataFrame) -> str:
+        """
+        Creates column format for latex table based on the shape of a given DataFrame.
+
+        Calculates number of columns and uses 'c' as column position.
First element is set to 'l', last to 'r' + """ column_format = np.repeat('c', df.shape[1] + 1) column_format[0] = 'l' column_format[-1] = 'r' column_format = ''.join(column_format.tolist()) - df.to_latex(os.path.join(path, "station_sample_size.tex"), na_rep='---', column_format=column_format) - df.to_markdown(open(os.path.join(path, "station_sample_size.md"), mode="w", encoding='utf-8'), - tablefmt="github") - df.drop(meta_data, axis=1).to_latex(os.path.join(path, "station_sample_size_short.tex"), na_rep='---', - column_format=column_format) + return column_format + def split_train_val_test(self) -> None: """ diff --git a/test/test_configuration/test_defaults.py b/test/test_configuration/test_defaults.py index fffe7c84075eeeab37ebf59d52bc42dbf87bf522..ae81ef2ef0a15ad08f14ad19312f04040ab71263 100644 --- a/test/test_configuration/test_defaults.py +++ b/test/test_configuration/test_defaults.py @@ -70,4 +70,5 @@ class TestAllDefaults: assert DEFAULT_NUMBER_OF_BOOTSTRAPS == 20 assert DEFAULT_PLOT_LIST == ["PlotMonthlySummary", "PlotStationMap", "PlotClimatologicalSkillScore", "PlotTimeSeries", "PlotCompetitiveSkillScore", "PlotBootstrapSkillScore", - "PlotConditionalQuantiles", "PlotAvailability", "PlotSeparationOfScales"] + "PlotConditionalQuantiles", "PlotAvailability", "PlotAvailabilityHistogram", + "PlotSeparationOfScales"] diff --git a/test/test_run_modules/test_pre_processing.py b/test/test_run_modules/test_pre_processing.py index 602c563fa9e346bd944b20aee2c038970791730e..e7d05bdb51aeb069beda002f9b53439853edff91 100644 --- a/test/test_run_modules/test_pre_processing.py +++ b/test/test_run_modules/test_pre_processing.py @@ -9,6 +9,8 @@ from mlair.helpers import PyTestRegex from mlair.run_modules.experiment_setup import ExperimentSetup from mlair.run_modules.pre_processing import PreProcessing from mlair.run_modules.run_environment import RunEnvironment +import pandas as pd +import numpy as np class TestPreProcessing: @@ -117,3 +119,38 @@ class TestPreProcessing: assert pre.transformation(data_preparation, stations) is None class data_preparation_no_trans: pass assert pre.transformation(data_preparation_no_trans, stations) is None + + @pytest.fixture + def dummy_df(self): + data_dict = {'station_name': {'DEBW013': 'Stuttgart Bad Cannstatt', 'DEBW076': 'Baden-Baden', + 'DEBW087': 'Schwäbische_Alb', 'DEBW107': 'Tübingen', + 'DEBY081': 'Garmisch-Partenkirchen/Kreuzeckbahnstraße', '# Stations': np.nan, + '# Samples': np.nan}, + 'station_lon': {'DEBW013': 9.2297, 'DEBW076': 8.2202, 'DEBW087': 9.2076, 'DEBW107': 9.0512, + 'DEBY081': 11.0631, '# Stations': np.nan, '# Samples': np.nan}, + 'station_lat': {'DEBW013': 48.8088, 'DEBW076': 48.7731, 'DEBW087': 48.3458, 'DEBW107': 48.5077, + 'DEBY081': 47.4764, '# Stations': np.nan, '# Samples': np.nan}, + 'station_alt': {'DEBW013': 235.0, 'DEBW076': 148.0, 'DEBW087': 798.0, 'DEBW107': 325.0, + 'DEBY081': 735.0, '# Stations': np.nan, '# Samples': np.nan}, + 'train': {'DEBW013': 1413, 'DEBW076': 3002, 'DEBW087': 3016, 'DEBW107': 1782, 'DEBY081': 2837, + '# Stations': 6, '# Samples': 12050}, + 'val': {'DEBW013': 698, 'DEBW076': 715, 'DEBW087': 700, 'DEBW107': 701, 'DEBY081': 456, + '# Stations': 6, '# Samples': 3270}, + 'test': {'DEBW013': 1066, 'DEBW076': 696, 'DEBW087': 1080, 'DEBW107': 1080, 'DEBY081': 700, + '# Stations': 6, '# Samples': 4622}} + df = pd.DataFrame.from_dict(data_dict) + return df + + def test_create_column_format_for_tex(self): + df = pd.DataFrame(np.ones((2, 1))) + df_col = PreProcessing.create_column_format_for_tex(df) # len: 1+1 + 
assert df_col == 'lr' + assert len(df_col) == 2 + df = pd.DataFrame(np.ones((2, 2))) + df_col = PreProcessing.create_column_format_for_tex(df) # len: 2+1 + assert df_col == 'lcr' + assert len(df_col) == 3 + df = pd.DataFrame(np.ones((2, 3))) + df_col = PreProcessing.create_column_format_for_tex(df) # len: 3+1 + assert df_col == 'lccr' + assert len(df_col) == 4
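
Note for reviewers: the sketch below shows how the new `PlotAvailabilityHistogram` entry added to `DEFAULT_PLOT_LIST` could be enabled in a run. It is a hypothetical usage example, not part of this change set; it assumes MLAir is installed and that `plot_list` is forwarded to `ExperimentSetup` as in the standard workflow.

```python
# Hypothetical usage sketch (not part of the diff): enable the new availability histogram
# next to the existing Gantt-style availability plot. Station IDs are taken from the
# __main__ example in postprocessing_plotting.py; plot_list is assumed to be passed
# through to ExperimentSetup as usual.
import mlair

mlair.run(stations=['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087'],
          plot_list=["PlotMonthlySummary", "PlotAvailability", "PlotAvailabilityHistogram"])
```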
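
The new `rc_params` argument and the shared `get_dataset_colors()` helper on `AbstractPlotClass` are what the plots in this diff rely on for consistent fonts and subset colors. The snippet below is a minimal, hypothetical subclass (the name `PlotDummy` is made up) illustrating how a custom plot could override the font defaults and reuse the subset colors; it follows the same `__init__`/`_plot`/`_save` pattern as the existing plot classes.

```python
# Hypothetical sketch (not part of the diff): a toy subclass of AbstractPlotClass that
# overrides the new rc_params defaults and reuses the shared dataset colors.
import matplotlib.pyplot as plt
from mlair.plotting.postprocessing_plotting import AbstractPlotClass


class PlotDummy(AbstractPlotClass):  # made-up class name, for illustration only

    def __init__(self, plot_folder="."):
        # smaller fonts instead of the 'large' defaults set in AbstractPlotClass
        super().__init__(plot_folder, "dummy_plot", resolution=300,
                         rc_params={"axes.labelsize": "medium", "legend.fontsize": "small"})
        self._plot()
        self._save()

    def _plot(self, *args):
        colors = self.get_dataset_colors()  # {"train": "#e69f00", "val": "#009e73", "test": "#56b4e9"}
        fig, ax = plt.subplots()
        for i, (subset, color) in enumerate(colors.items()):
            ax.bar(i, 1, color=color, label=subset)
        ax.set_xticks(range(len(colors)))
        ax.set_xticklabels(colors.keys())
        ax.legend()
```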