diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py index f0b6baeb0b56126ccccb80c9da993fb406428d93..581af2a009133c0e994ba3e33b105b5125f129dd 100644 --- a/mlair/plotting/postprocessing_plotting.py +++ b/mlair/plotting/postprocessing_plotting.py @@ -1,6 +1,6 @@ """Collection of plots to evaluate a model, create overviews on data or forecasts.""" __author__ = "Lukas Leufen, Felix Kleinert" -__date__ = '2019-12-17' +__date__ = '2020-11-23' import logging import math @@ -8,10 +8,10 @@ import os import warnings from typing import Dict, List, Tuple - import matplotlib import matplotlib.patches as mpatches import matplotlib.pyplot as plt +import matplotlib.dates as mdates import numpy as np import pandas as pd import seaborn as sns @@ -75,7 +75,7 @@ class AbstractPlotClass: """ - def __init__(self, plot_folder, plot_name, resolution=500): + def __init__(self, plot_folder, plot_name, resolution=500, rc_params=None): """Set up plot folder and name, and plot resolution (default 500dpi).""" plot_folder = os.path.abspath(plot_folder) if not os.path.exists(plot_folder): @@ -83,6 +83,15 @@ class AbstractPlotClass: self.plot_folder = plot_folder self.plot_name = plot_name self.resolution = resolution + if rc_params is None: + rc_params = {'axes.labelsize': 'large', + 'xtick.labelsize': 'large', + 'ytick.labelsize': 'large', + 'legend.fontsize': 'large', + 'axes.titlesize': 'large', + } + self.rc_params = rc_params + self._update_rc_params() def _plot(self, *args): """Abstract plot class needs to be implemented in inheritance.""" @@ -95,6 +104,24 @@ class AbstractPlotClass: plt.savefig(plot_name, dpi=self.resolution, **kwargs) plt.close('all') + def _update_rc_params(self): + plt.rcParams.update(self.rc_params) + + @staticmethod + def _get_sampling(sampling): + if sampling == "daily": + return "D" + elif sampling == "hourly": + return "h" + + @staticmethod + def get_dataset_colors(): + """ + Standard colors used for train-, val-, and test-sets during postprocessing + """ + colors = {"train": "#e69f00", "val": "#009e73", "test": "#56b4e9"} # hex code + return colors + @TimeTrackingWrapper class PlotMonthlySummary(AbstractPlotClass): @@ -113,18 +140,19 @@ class PlotMonthlySummary(AbstractPlotClass): :param window_lead_time: lead time to plot, if window_lead_time is higher than the available lead time or not given the maximum lead time from data is used. (default None -> use maximum lead time from data). :param plot_folder: path to save the plot (default: current directory) + :param target_var_unit: unit of target var for plot legend (default= ppb) """ def __init__(self, stations: List, data_path: str, name: str, target_var: str, window_lead_time: int = None, - plot_folder: str = "."): + plot_folder: str = ".", target_var_unit: str = 'ppb'): """Set attributes and create plot.""" super().__init__(plot_folder, "monthly_summary_box_plot") self._data_path = data_path self._data_name = name self._data = self._prepare_data(stations) self._window_lead_time = self._get_window_lead_time(window_lead_time) - self._plot(target_var) + self._plot(target_var, target_var_unit) self._save() def _prepare_data(self, stations: List) -> xr.DataArray: @@ -176,7 +204,12 @@ class PlotMonthlySummary(AbstractPlotClass): window_lead_time = ahead_steps return min(ahead_steps, window_lead_time) - def _plot(self, target_var: str): + @staticmethod + def _spell_out_chemical_concentrations(short_name: str): + short2long = {'o3': 'ozone', 'no': 'nitrogen oxide', 'no2': 'nitrogen dioxide', 'nox': 'nitrogen dioxides'} + return f"{short2long[short_name]} concentration" + + def _plot(self, target_var: str, target_var_unit: str): """ Create a monthly grouped box plot over all stations but with separate boxes for each lead time step. @@ -189,7 +222,8 @@ class PlotMonthlySummary(AbstractPlotClass): ax = sns.boxplot(x='index', y='values', hue='ahead', data=data.compute(), whis=1., palette=color_palette, flierprops={'marker': '.', 'markersize': 1}, showmeans=True, meanprops={'markersize': 1, 'markeredgecolor': 'k'}) - ax.set(xlabel='month', ylabel=f'{target_var}') + ylabel = self._spell_out_chemical_concentrations(target_var) + ax.set(xlabel='month', ylabel=f'{ylabel} (in {target_var_unit})') plt.tight_layout() @@ -453,7 +487,8 @@ class PlotConditionalQuantiles(AbstractPlotClass): def _plot(self): """Start plotting routines: overall plot and seasonal (if enabled).""" - logging.info(f"start plotting {self.__class__.__name__}, scheduled number of plots: {(len(self._seasons) + 1) * 2}") + logging.info( + f"start plotting {self.__class__.__name__}, scheduled number of plots: {(len(self._seasons) + 1) * 2}") if len(self._seasons) > 0: self._plot_seasons() @@ -504,7 +539,7 @@ class PlotConditionalQuantiles(AbstractPlotClass): # add histogram of the segmented data (pred_name) handles, labels = ax.get_legend_handles_labels() segmented_data.loc[x_model, d, :].to_pandas().hist(bins=self._bins, ax=ax2, color='k', alpha=.3, grid=False, - rwidth=1) + rwidth=1) # add legend plt.legend(handles[:3] + [handles[-1]], self._opts["legend"], loc='upper left', fontsize='large') # adjust limits and set labels @@ -693,20 +728,26 @@ class PlotBootstrapSkillScore(AbstractPlotClass): """ - def __init__(self, data: Dict, plot_folder: str = ".", model_setup: str = ""): + def __init__(self, data: Dict, plot_folder: str = ".", model_setup: str = "", separate_vars: List = None): """ Set attributes and create plot. :param data: dictionary with station names as keys and 2D xarrays as values, consist on axis ahead and terms. :param plot_folder: path to save the plot (default: current directory) :param model_setup: architecture type to specify plot name (default "CNN") + :param separate_vars: variables to plot separated (default: ['o3']) """ super().__init__(plot_folder, f"skill_score_bootstrap_{model_setup}") + if separate_vars is None: + separate_vars = ['o3'] self._labels = None self._x_name = "boot_var" self._data = self._prepare_data(data) self._plot() self._save() + self.plot_name += '_separated' + self._plot(separate_vars=separate_vars) + self._save(bbox_inches='tight') def _prepare_data(self, data: Dict) -> pd.DataFrame: """ @@ -719,11 +760,30 @@ class PlotBootstrapSkillScore(AbstractPlotClass): :return: pre-processed data set """ data = helpers.dict_to_xarray(data, "station").sortby(self._x_name) + new_boot_coords = self._return_vars_without_number_tag(data.coords['boot_var'].values, split_by='_', keep=1) + data = data.assign_coords({'boot_var': new_boot_coords}) self._labels = [str(i) + "d" for i in data.coords["ahead"].values] if "station" not in data.dims: data = data.expand_dims("station") return data.to_dataframe("data").reset_index(level=[0, 1, 2]) + def _return_vars_without_number_tag(self, values, split_by, keep): + arr = np.array([v.split(split_by) for v in values]) + num = arr[:, 0] + new_val = arr[:, keep] + if self._all_values_are_equal(num, axis=0): + return new_val + else: + raise NotImplementedError + + + @staticmethod + def _all_values_are_equal(arr, axis=0): + if np.all(arr == arr[0], axis=axis): + return True + else: + return False + def _label_add(self, score_only: bool): """ Add the phrase "terms and " if score_only is disabled or empty string (if score_only=True). @@ -733,12 +793,111 @@ class PlotBootstrapSkillScore(AbstractPlotClass): """ return "" if score_only else "terms and " - def _plot(self): + def _plot(self, separate_vars=None): """Plot climatological skill score.""" + if separate_vars is None: + self._plot_all_variables() + else: + self._plot_selected_variables(separate_vars) + + def _plot_selected_variables(self, separate_vars: List): + # if separate_vars is None: + # separate_vars = ['o3'] + data = self._data + self.raise_error_if_separate_vars_do_not_exist(data, separate_vars) + all_variables = self._get_unique_values_from_column_of_df(data, 'boot_var') + # remaining_vars = helpers.list_pop(all_variables, separate_vars) #remove_items + remaining_vars = helpers.remove_items(all_variables, separate_vars) + data_first = self._select_data(df=data, variables=separate_vars, column_name='boot_var') + data_second = self._select_data(df=data, variables=remaining_vars, column_name='boot_var') + + fig, ax = plt.subplots(nrows=1, ncols=2, + gridspec_kw={'width_ratios': [len(separate_vars), + len(remaining_vars) + ] + } + ) + if len(separate_vars) > 1: + first_box_width = .8 + else: + first_box_width = 2. + + sns.boxplot(x=self._x_name, y="data", hue="ahead", data=data_first, ax=ax[0], whis=1., palette="Blues_d", + showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, + flierprops={"marker": "."}, width=first_box_width + ) + ax[0].set(ylabel=f"skill score", xlabel="") + + sns.boxplot(x=self._x_name, y="data", hue="ahead", data=data_second, ax=ax[1], whis=1., palette="Blues_d", + showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, + flierprops={"marker": "."}, + ) + ax[1].set(ylabel="", xlabel="") + ax[1].yaxis.tick_right() + handles, _ = ax[1].get_legend_handles_labels() + for sax in ax: + matplotlib.pyplot.sca(sax) + sax.axhline(y=0, color="grey", linewidth=.5) + plt.xticks(rotation=45, ha='right') + sax.legend_.remove() + + fig.legend(handles, self._labels, loc='upper center', ncol=len(handles) + 1, ) + + def align_yaxis(ax1, ax2): + """ + Align zeros of the two axes, zooming them out by same ratio + + This function is copy pasted from https://stackoverflow.com/a/41259922 + """ + axes = (ax1, ax2) + extrema = [ax.get_ylim() for ax in axes] + tops = [extr[1] / (extr[1] - extr[0]) for extr in extrema] + # Ensure that plots (intervals) are ordered bottom to top: + if tops[0] > tops[1]: + axes, extrema, tops = [list(reversed(l)) for l in (axes, extrema, tops)] + + # How much would the plot overflow if we kept current zoom levels? + tot_span = tops[1] + 1 - tops[0] + + b_new_t = extrema[0][0] + tot_span * (extrema[0][1] - extrema[0][0]) + t_new_b = extrema[1][1] - tot_span * (extrema[1][1] - extrema[1][0]) + axes[0].set_ylim(extrema[0][0], b_new_t) + axes[1].set_ylim(t_new_b, extrema[1][1]) + + align_yaxis(ax[0], ax[1]) + align_yaxis(ax[0], ax[1]) + + @staticmethod + def _select_data(df: pd.DataFrame, variables: List[str], column_name: str) -> pd.DataFrame: + for i, variable in enumerate(variables): + if i == 0: + selected_data = df.loc[df[column_name] == variable] + else: + tmp_var = df.loc[df[column_name] == variable] + selected_data = pd.concat([selected_data, tmp_var], axis=0) + return selected_data + + def raise_error_if_separate_vars_do_not_exist(self, data, separate_vars): + if not self._variables_exist_in_df(df=data, variables=separate_vars): + raise ValueError(f"At least one entry of `separate_vars' does not exist in `self.data' ") + + @staticmethod + def _get_unique_values_from_column_of_df(df: pd.DataFrame, column_name: str) -> List: + return list(df[column_name].unique()) + + def _variables_exist_in_df(self, df: pd.DataFrame, variables: List[str], column_name: str = 'boot_var'): + vars_in_df = set(self._get_unique_values_from_column_of_df(df, column_name)) + return set(variables).issubset(vars_in_df) + + def _plot_all_variables(self): + """ + + """ fig, ax = plt.subplots() sns.boxplot(x=self._x_name, y="data", hue="ahead", data=self._data, ax=ax, whis=1., palette="Blues_d", showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, flierprops={"marker": "."}) ax.axhline(y=0, color="grey", linewidth=.5) + plt.xticks(rotation=45) ax.set(ylabel=f"skill score", xlabel="", title="summary of all stations") handles, _ = ax.get_legend_handles_labels() ax.legend(handles, self._labels) @@ -859,6 +1018,7 @@ class PlotTimeSeries: def _get_time_range(data): def f(x, f_x): return pd.to_datetime(f_x(x.index.values)).year + return f(data, min), f(data, max) @staticmethod @@ -978,9 +1138,7 @@ class PlotAvailability(AbstractPlotClass): return plt_dict def _plot(self, plt_dict): - # colors = {"train": "orange", "val": "blueishgreen", "test": "skyblue"} # color names - colors = {"train": "#e69f00", "val": "#009e73", "test": "#56b4e9"} # hex code - # colors = {"train": (230, 159, 0), "val": (0, 158, 115), "test": (86, 180, 233)} # in rgb but as abs values + colors = self.get_dataset_colors() pos = 0 height = 0.8 # should be <= 1 yticklabels = [] @@ -1025,6 +1183,155 @@ class PlotSeparationOfScales(AbstractPlotClass): self._save() +@TimeTrackingWrapper +class PlotAvailabilityHistogram(AbstractPlotClass): + """ + Create data availability plots as histogram. + + Each entry of each generator is checked for `notnull()` values along all the datetime axis (boolean). + Calling this class creates two different types of histograms where each generator + + 1) data_availability_histogram: datetime (xaxis) vs. number of stations with availabile data (yaxis) + 2) data_availability_histogram_cumulative: number of samples (xaxis) vs. number of stations having at least number + of samples (yaxis) + + """ + + def __init__(self, generators: Dict[str, DataCollection], plot_folder: str = ".", sampling="daily", + subset_dim: str = 'DataSet', temporal_dim: str = 'datetime', history_dim: str = 'window', + station_dim: str = 'Stations', target_dim='variables'): + + super().__init__(plot_folder, "data_availability_histogram") + self.freq = self._get_sampling(sampling) + self.subset_dim = subset_dim + self.temporal_dim = temporal_dim + self.history_dim = history_dim + self.station_dim = station_dim + self.target_dim = target_dim + self._prepare_data(generators) + + for plt_type in self.allowed_plot_types: + plot_name_tmp = self.plot_name + self.plot_name += '_' + plt_type + self._plot(plt_type=plt_type) + self._save() + self.plot_name = plot_name_tmp + + @property + def allowed_plot_types(self): + plot_types = ['hist', 'hist_cum'] + return plot_types + + def _prepare_data(self, generators: Dict[str, DataCollection]): + """ + Prepares data to be used by plot methods. + + Creates xarrays which are sums of valid data (boolean sums) across i) station_dim and ii) temporal_dim + """ + avail_data_time_sum = {} + avail_data_station_sum = {} + dataset_time_interval = {} + for subset, generator in generators.items(): + avail_list = [] + for station in generator: + station_data_x = station.get_X(as_numpy=False)[0] + station_data_x = station_data_x.loc[{self.history_dim: 0, # select recent window frame + self.target_dim: station_data_x[self.target_dim].values[0]}] + avail_list.append(station_data_x.notnull()) + avail_data = xr.concat(avail_list, dim=self.station_dim).notnull() + avail_data_time_sum[subset] = avail_data.sum(dim=self.station_dim) + avail_data_station_sum[subset] = avail_data.sum(dim=self.temporal_dim) + dataset_time_interval[subset] = self._get_first_and_last_indexelement_from_xarray( + avail_data_time_sum[subset], dim_name=self.temporal_dim, return_type='as_dict' + ) + avail_data_amount = xr.concat(avail_data_time_sum.values(), pd.Index(avail_data_time_sum.keys(), + name=self.subset_dim) + ) + full_time_index = self._make_full_time_index(avail_data_amount.coords[self.temporal_dim].values, freq=self.freq) + self.avail_data_cum_sum = xr.concat(avail_data_station_sum.values(), pd.Index(avail_data_station_sum.keys(), + name=self.subset_dim)) + self.avail_data_amount = avail_data_amount.reindex({self.temporal_dim: full_time_index}) + self.dataset_time_interval = dataset_time_interval + + @staticmethod + def _get_first_and_last_indexelement_from_xarray(xarray, dim_name, return_type='as_tuple'): + if isinstance(xarray, xr.DataArray): + first = xarray.coords[dim_name].values[0] + last = xarray.coords[dim_name].values[-1] + if return_type == 'as_tuple': + return first, last + elif return_type == 'as_dict': + return {'first': first, 'last': last} + else: + raise TypeError(f"return_type must be 'as_tuple' or 'as_dict', but is '{return_type}'") + else: + raise TypeError(f"xarray must be of type xr.DataArray, but is of type {type(xarray)}") + + @staticmethod + def _make_full_time_index(irregular_time_index, freq): + full_time_index = pd.date_range(start=irregular_time_index[0], end=irregular_time_index[-1], freq=freq) + return full_time_index + + def _plot(self, plt_type='hist', *args): + if plt_type == 'hist': + self._plot_hist() + elif plt_type == 'hist_cum': + self._plot_hist_cum() + else: + raise ValueError(f"plt_type mus be 'hist' or 'hist_cum', but is {type}") + + def _plot_hist(self, *args): + colors = self.get_dataset_colors() + fig, axes = plt.subplots(figsize=(10, 3)) + for i, subset in enumerate(self.dataset_time_interval.keys()): + plot_dataset = self.avail_data_amount.sel({self.subset_dim: subset, + self.temporal_dim: slice( + self.dataset_time_interval[subset]['first'], + self.dataset_time_interval[subset]['last'] + ) + } + ) + + plot_dataset.plot.step(color=colors[subset], ax=axes, label=subset) + plt.fill_between(plot_dataset.coords[self.temporal_dim].values, plot_dataset.values, color=colors[subset]) + + lgd = fig.legend(loc="upper right", ncol=len(self.dataset_time_interval), + facecolor='white', framealpha=1, edgecolor='black') + for lgd_line in lgd.get_lines(): + lgd_line.set_linewidth(4.0) + plt.gca().xaxis.set_major_locator(mdates.YearLocator()) + plt.title('') + plt.ylabel('Number of samples') + plt.tight_layout() + + def _plot_hist_cum(self, *args): + colors = self.get_dataset_colors() + fig, axes = plt.subplots(figsize=(10, 3)) + n_bins = int(self.avail_data_cum_sum.max().values) + bins = np.arange(0, n_bins+1) + descending_subsets = self.avail_data_cum_sum.max(dim=self.station_dim).sortby( + self.avail_data_cum_sum.max(dim=self.station_dim), ascending=False + ).coords[self.subset_dim].values + + for subset in descending_subsets: + self.avail_data_cum_sum.sel({self.subset_dim: subset}).plot.hist(ax=axes, + bins=bins, + label=subset, + cumulative=-1, + color=colors[subset], + # alpha=.5 + ) + + lgd = fig.legend(loc="upper right", ncol=len(self.dataset_time_interval), + facecolor='white', framealpha=1, edgecolor='black') + plt.title('') + plt.ylabel('Number of stations') + plt.xlabel('Number of samples') + plt.xlim((bins[0], bins[-1])) + plt.tight_layout() + + + if __name__ == "__main__": stations = ['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087'] path = "../../testrun_network/forecasts"