Ghost User
--- a/mlair/plotting/postprocessing_plotting.py

+ 321

− 14
+++ b/mlair/plotting/postprocessing_plotting.py

+ 321

− 14
 """Collection of plots to evaluate a model, create overviews on data or forecasts."""
 __author__ = "Lukas Leufen, Felix Kleinert"
-__date__ = '2019-12-17'
+__date__ = '2020-11-23'
 import logging
 import math
 @@ -8,10 +8,10 @@ import os
 @@ -8,10 +8,10 @@ import os
 import warnings
 from typing import Dict, List, Tuple
 import matplotlib
 import matplotlib.patches as mpatches
 import matplotlib.pyplot as plt
+import matplotlib.dates as mdates
 import numpy as np
 import pandas as pd
 import seaborn as sns
 @@ -75,7 +75,7 @@ class AbstractPlotClass:
 @@ -75,7 +75,7 @@ class AbstractPlotClass:
    """
-    def __init__(self, plot_folder, plot_name, resolution=500):
+    def __init__(self, plot_folder, plot_name, resolution=500, rc_params=None):
        """Set up plot folder and name, and plot resolution (default 500dpi)."""
        plot_folder = os.path.abspath(plot_folder)
        if not os.path.exists(plot_folder):
 @@ -83,6 +83,15 @@ class AbstractPlotClass:
 @@ -83,6 +83,15 @@ class AbstractPlotClass:
        self.plot_folder = plot_folder
        self.plot_name = plot_name
        self.resolution = resolution
+        if rc_params is None:
+            rc_params = {'axes.labelsize': 'large',
+                         'xtick.labelsize': 'large',
+                         'ytick.labelsize': 'large',
+                         'legend.fontsize': 'large',
+                         'axes.titlesize': 'large',
+                         }
+        self.rc_params = rc_params
+        self._update_rc_params()
    def _plot(self, *args):
        """Abstract plot class needs to be implemented in inheritance."""
 @@ -95,6 +104,24 @@ class AbstractPlotClass:
 @@ -95,6 +104,24 @@ class AbstractPlotClass:
        plt.savefig(plot_name, dpi=self.resolution, **kwargs)
        plt.close('all')
+    def _update_rc_params(self):
+        plt.rcParams.update(self.rc_params)
+    @staticmethod
+    def _get_sampling(sampling):
+        if sampling == "daily":
+            return "D"
+        elif sampling == "hourly":
+            return "h"
+    @staticmethod
+    def get_dataset_colors():
+        """
+        Standard colors used for train-, val-, and test-sets during postprocessing
+        """
+        colors = {"train": "#e69f00", "val": "#009e73", "test": "#56b4e9"}  # hex code
+        return colors
 @TimeTrackingWrapper
 class PlotMonthlySummary(AbstractPlotClass):
 @@ -113,18 +140,19 @@ class PlotMonthlySummary(AbstractPlotClass):
 @@ -113,18 +140,19 @@ class PlotMonthlySummary(AbstractPlotClass):
    :param window_lead_time: lead time to plot, if window_lead_time is higher than the available lead time or not given
        the maximum lead time from data is used. (default None -> use maximum lead time from data).
    :param plot_folder: path to save the plot (default: current directory)
+    :param target_var_unit: unit of target var for plot legend (default= ppb)
    """
    def __init__(self, stations: List, data_path: str, name: str, target_var: str, window_lead_time: int = None,
-                 plot_folder: str = "."):
+                 plot_folder: str = ".", target_var_unit: str = 'ppb'):
        """Set attributes and create plot."""
        super().__init__(plot_folder, "monthly_summary_box_plot")
        self._data_path = data_path
        self._data_name = name
        self._data = self._prepare_data(stations)
        self._window_lead_time = self._get_window_lead_time(window_lead_time)
-        self._plot(target_var)
+        self._plot(target_var, target_var_unit)
        self._save()
    def _prepare_data(self, stations: List) -> xr.DataArray:
 @@ -176,7 +204,12 @@ class PlotMonthlySummary(AbstractPlotClass):
 @@ -176,7 +204,12 @@ class PlotMonthlySummary(AbstractPlotClass):
            window_lead_time = ahead_steps
        return min(ahead_steps, window_lead_time)
-    def _plot(self, target_var: str):
+    @staticmethod
+    def _spell_out_chemical_concentrations(short_name: str):
+        short2long = {'o3': 'ozone', 'no': 'nitrogen oxide', 'no2': 'nitrogen dioxide', 'nox': 'nitrogen dioxides'}
+        return f"{short2long[short_name]} concentration"
+    def _plot(self, target_var: str, target_var_unit: str):
        """
        Create a monthly grouped box plot over all stations but with separate boxes for each lead time step.
 @@ -189,7 +222,8 @@ class PlotMonthlySummary(AbstractPlotClass):
 @@ -189,7 +222,8 @@ class PlotMonthlySummary(AbstractPlotClass):
        ax = sns.boxplot(x='index', y='values', hue='ahead', data=data.compute(), whis=1., palette=color_palette,
                         flierprops={'marker': '.', 'markersize': 1}, showmeans=True,
                         meanprops={'markersize': 1, 'markeredgecolor': 'k'})
-        ax.set(xlabel='month', ylabel=f'{target_var}')
+        ylabel = self._spell_out_chemical_concentrations(target_var)
+        ax.set(xlabel='month', ylabel=f'{ylabel} (in {target_var_unit})')
        plt.tight_layout()
 @@ -453,7 +487,8 @@ class PlotConditionalQuantiles(AbstractPlotClass):
 @@ -453,7 +487,8 @@ class PlotConditionalQuantiles(AbstractPlotClass):
    def _plot(self):
        """Start plotting routines: overall plot and seasonal (if enabled)."""
-        logging.info(f"start plotting {self.__class__.__name__}, scheduled number of plots: {(len(self._seasons) + 1) * 2}")
+        logging.info(
+            f"start plotting {self.__class__.__name__}, scheduled number of plots: {(len(self._seasons) + 1) * 2}")
        if len(self._seasons) > 0:
            self._plot_seasons()
 @@ -504,7 +539,7 @@ class PlotConditionalQuantiles(AbstractPlotClass):
 @@ -504,7 +539,7 @@ class PlotConditionalQuantiles(AbstractPlotClass):
            # add histogram of the segmented data (pred_name)
            handles, labels = ax.get_legend_handles_labels()
            segmented_data.loc[x_model, d, :].to_pandas().hist(bins=self._bins, ax=ax2, color='k', alpha=.3, grid=False,
-                                                                 rwidth=1)
+                                                               rwidth=1)
            # add legend
            plt.legend(handles[:3] + [handles[-1]], self._opts["legend"], loc='upper left', fontsize='large')
            # adjust limits and set labels
 @@ -693,20 +728,26 @@ class PlotBootstrapSkillScore(AbstractPlotClass):
 @@ -693,20 +728,26 @@ class PlotBootstrapSkillScore(AbstractPlotClass):
    """
-    def __init__(self, data: Dict, plot_folder: str = ".", model_setup: str = ""):
+    def __init__(self, data: Dict, plot_folder: str = ".", model_setup: str = "", separate_vars: List = None):
        """
        Set attributes and create plot.
        :param data: dictionary with station names as keys and 2D xarrays as values, consist on axis ahead and terms.
        :param plot_folder: path to save the plot (default: current directory)
        :param model_setup: architecture type to specify plot name (default "CNN")
+        :param separate_vars: variables to plot separated (default: ['o3'])
        """
        super().__init__(plot_folder, f"skill_score_bootstrap_{model_setup}")
+        if separate_vars is None:
+            separate_vars = ['o3']
        self._labels = None
        self._x_name = "boot_var"
        self._data = self._prepare_data(data)
        self._plot()
        self._save()
+        self.plot_name += '_separated'
+        self._plot(separate_vars=separate_vars)
+        self._save(bbox_inches='tight')
    def _prepare_data(self, data: Dict) -> pd.DataFrame:
        """
 @@ -719,11 +760,30 @@ class PlotBootstrapSkillScore(AbstractPlotClass):
 @@ -719,11 +760,30 @@ class PlotBootstrapSkillScore(AbstractPlotClass):
        :return: pre-processed data set
        """
        data = helpers.dict_to_xarray(data, "station").sortby(self._x_name)
+        new_boot_coords = self._return_vars_without_number_tag(data.coords['boot_var'].values, split_by='_', keep=1)
+        data = data.assign_coords({'boot_var': new_boot_coords})
        self._labels = [str(i) + "d" for i in data.coords["ahead"].values]
        if "station" not in data.dims:
            data = data.expand_dims("station")
        return data.to_dataframe("data").reset_index(level=[0, 1, 2])
+    def _return_vars_without_number_tag(self, values, split_by, keep):
+        arr = np.array([v.split(split_by) for v in values])
+        num = arr[:, 0]
+        new_val = arr[:, keep]
+        if self._all_values_are_equal(num, axis=0):
+            return new_val
+        else:
+            raise NotImplementedError
+    @staticmethod
+    def _all_values_are_equal(arr, axis=0):
+        if np.all(arr == arr[0], axis=axis):
+            return True
+        else:
+            return False
    def _label_add(self, score_only: bool):
        """
        Add the phrase "terms and " if score_only is disabled or empty string (if score_only=True).
 @@ -733,12 +793,111 @@ class PlotBootstrapSkillScore(AbstractPlotClass):
 @@ -733,12 +793,111 @@ class PlotBootstrapSkillScore(AbstractPlotClass):
        """
        return "" if score_only else "terms and "
-    def _plot(self):
+    def _plot(self, separate_vars=None):
        """Plot climatological skill score."""
+        if separate_vars is None:
+            self._plot_all_variables()
+        else:
+            self._plot_selected_variables(separate_vars)
+    def _plot_selected_variables(self, separate_vars: List):
+        # if separate_vars is None:
+        #     separate_vars = ['o3']
+        data = self._data
+        self.raise_error_if_separate_vars_do_not_exist(data, separate_vars)
+        all_variables = self._get_unique_values_from_column_of_df(data, 'boot_var')
+        # remaining_vars = helpers.list_pop(all_variables, separate_vars) #remove_items
+        remaining_vars = helpers.remove_items(all_variables, separate_vars)
+        data_first = self._select_data(df=data, variables=separate_vars, column_name='boot_var')
+        data_second = self._select_data(df=data, variables=remaining_vars, column_name='boot_var')
+        fig, ax = plt.subplots(nrows=1, ncols=2,
+                               gridspec_kw={'width_ratios': [len(separate_vars),
+                                                             len(remaining_vars)
+                                                             ]
+                                            }
+                               )
+        if len(separate_vars) > 1:
+            first_box_width = .8
+        else:
+            first_box_width = 2.
+        sns.boxplot(x=self._x_name, y="data", hue="ahead", data=data_first, ax=ax[0], whis=1., palette="Blues_d",
+                    showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"},
+                    flierprops={"marker": "."}, width=first_box_width
+                    )
+        ax[0].set(ylabel=f"skill score", xlabel="")
+        sns.boxplot(x=self._x_name, y="data", hue="ahead", data=data_second, ax=ax[1], whis=1., palette="Blues_d",
+                    showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"},
+                    flierprops={"marker": "."},
+                    )
+        ax[1].set(ylabel="", xlabel="")
+        ax[1].yaxis.tick_right()
+        handles, _ = ax[1].get_legend_handles_labels()
+        for sax in ax:
+            matplotlib.pyplot.sca(sax)
+            sax.axhline(y=0, color="grey", linewidth=.5)
+            plt.xticks(rotation=45, ha='right')
+            sax.legend_.remove()
+        fig.legend(handles, self._labels, loc='upper center', ncol=len(handles) + 1, )
+        def align_yaxis(ax1, ax2):
+            """
+            Align zeros of the two axes, zooming them out by same ratio
+            This function is copy pasted from https://stackoverflow.com/a/41259922
+            """
+            axes = (ax1, ax2)
+            extrema = [ax.get_ylim() for ax in axes]
+            tops = [extr[1] / (extr[1] - extr[0]) for extr in extrema]
+            # Ensure that plots (intervals) are ordered bottom to top:
+            if tops[0] > tops[1]:
+                axes, extrema, tops = [list(reversed(l)) for l in (axes, extrema, tops)]
+            # How much would the plot overflow if we kept current zoom levels?
+            tot_span = tops[1] + 1 - tops[0]
+            b_new_t = extrema[0][0] + tot_span * (extrema[0][1] - extrema[0][0])
+            t_new_b = extrema[1][1] - tot_span * (extrema[1][1] - extrema[1][0])
+            axes[0].set_ylim(extrema[0][0], b_new_t)
+            axes[1].set_ylim(t_new_b, extrema[1][1])
+        align_yaxis(ax[0], ax[1])
+        align_yaxis(ax[0], ax[1])
+    @staticmethod
+    def _select_data(df: pd.DataFrame, variables: List[str], column_name: str) -> pd.DataFrame:
+        for i, variable in enumerate(variables):
+            if i == 0:
+                selected_data = df.loc[df[column_name] == variable]
+            else:
+                tmp_var = df.loc[df[column_name] == variable]
+                selected_data = pd.concat([selected_data, tmp_var], axis=0)
+        return selected_data
+    def raise_error_if_separate_vars_do_not_exist(self, data, separate_vars):
+        if not self._variables_exist_in_df(df=data, variables=separate_vars):
+            raise ValueError(f"At least one entry of `separate_vars' does not exist in `self.data' ")
+    @staticmethod
+    def _get_unique_values_from_column_of_df(df: pd.DataFrame, column_name: str) -> List:
+        return list(df[column_name].unique())
+    def _variables_exist_in_df(self, df: pd.DataFrame, variables: List[str], column_name: str = 'boot_var'):
+        vars_in_df = set(self._get_unique_values_from_column_of_df(df, column_name))
+        return set(variables).issubset(vars_in_df)
+    def _plot_all_variables(self):
+        """
+        """
        fig, ax = plt.subplots()
        sns.boxplot(x=self._x_name, y="data", hue="ahead", data=self._data, ax=ax, whis=1., palette="Blues_d",
                    showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, flierprops={"marker": "."})
        ax.axhline(y=0, color="grey", linewidth=.5)
+        plt.xticks(rotation=45)
        ax.set(ylabel=f"skill score", xlabel="", title="summary of all stations")
        handles, _ = ax.get_legend_handles_labels()
        ax.legend(handles, self._labels)
 @@ -859,6 +1018,7 @@ class PlotTimeSeries:
 @@ -859,6 +1018,7 @@ class PlotTimeSeries:
    def _get_time_range(data):
        def f(x, f_x):
            return pd.to_datetime(f_x(x.index.values)).year
        return f(data, min), f(data, max)
    @staticmethod
 @@ -978,9 +1138,7 @@ class PlotAvailability(AbstractPlotClass):
 @@ -978,9 +1138,7 @@ class PlotAvailability(AbstractPlotClass):
        return plt_dict
    def _plot(self, plt_dict):
-        # colors = {"train": "orange", "val": "blueishgreen", "test": "skyblue"}  # color names
+        colors = self.get_dataset_colors()
-        colors = {"train": "#e69f00", "val": "#009e73", "test": "#56b4e9"}  # hex code
-        # colors = {"train": (230, 159, 0), "val": (0, 158, 115), "test": (86, 180, 233)}  # in rgb but as abs values
        pos = 0
        height = 0.8  # should be <= 1
        yticklabels = []
 @@ -1025,6 +1183,155 @@ class PlotSeparationOfScales(AbstractPlotClass):
 @@ -1025,6 +1183,155 @@ class PlotSeparationOfScales(AbstractPlotClass):
            self._save()
+@TimeTrackingWrapper
+class PlotAvailabilityHistogram(AbstractPlotClass):
+    """
+    Create data availability plots as histogram.
+    Each entry of each generator is checked for `notnull()` values along all the datetime axis (boolean).
+    Calling this class creates two different types of histograms where each generator
+    1) data_availability_histogram: datetime (xaxis) vs. number of stations with availabile data (yaxis)
+    2) data_availability_histogram_cumulative: number of samples (xaxis) vs. number of stations having at least number
+       of samples (yaxis)
+    """
+    def __init__(self, generators: Dict[str, DataCollection], plot_folder: str = ".", sampling="daily",
+                 subset_dim: str = 'DataSet', temporal_dim: str = 'datetime', history_dim: str = 'window',
+                 station_dim: str = 'Stations', target_dim='variables'):
+        super().__init__(plot_folder, "data_availability_histogram")
+        self.freq = self._get_sampling(sampling)
+        self.subset_dim = subset_dim
+        self.temporal_dim = temporal_dim
+        self.history_dim = history_dim
+        self.station_dim = station_dim
+        self.target_dim = target_dim
+        self._prepare_data(generators)
+        for plt_type in self.allowed_plot_types:
+            plot_name_tmp = self.plot_name
+            self.plot_name += '_' + plt_type
+            self._plot(plt_type=plt_type)
+            self._save()
+            self.plot_name = plot_name_tmp
+    @property
+    def allowed_plot_types(self):
+        plot_types = ['hist', 'hist_cum']
+        return plot_types
+    def _prepare_data(self, generators: Dict[str, DataCollection]):
+        """
+        Prepares data to be used by plot methods.
+        Creates xarrays which are sums of valid data (boolean sums) across i) station_dim and ii) temporal_dim
+        """
+        avail_data_time_sum = {}
+        avail_data_station_sum = {}
+        dataset_time_interval = {}
+        for subset, generator in generators.items():
+            avail_list = []
+            for station in generator:
+                station_data_x = station.get_X(as_numpy=False)[0]
+                station_data_x = station_data_x.loc[{self.history_dim: 0,  # select recent window frame
+                                                     self.target_dim: station_data_x[self.target_dim].values[0]}]
+                avail_list.append(station_data_x.notnull())
+            avail_data = xr.concat(avail_list, dim=self.station_dim).notnull()
+            avail_data_time_sum[subset] = avail_data.sum(dim=self.station_dim)
+            avail_data_station_sum[subset] = avail_data.sum(dim=self.temporal_dim)
+            dataset_time_interval[subset] = self._get_first_and_last_indexelement_from_xarray(
+                avail_data_time_sum[subset], dim_name=self.temporal_dim, return_type='as_dict'
+            )
+        avail_data_amount = xr.concat(avail_data_time_sum.values(), pd.Index(avail_data_time_sum.keys(),
+                                                                             name=self.subset_dim)
+                                      )
+        full_time_index = self._make_full_time_index(avail_data_amount.coords[self.temporal_dim].values, freq=self.freq)
+        self.avail_data_cum_sum = xr.concat(avail_data_station_sum.values(), pd.Index(avail_data_station_sum.keys(),
+                                                                                      name=self.subset_dim))
+        self.avail_data_amount = avail_data_amount.reindex({self.temporal_dim: full_time_index})
+        self.dataset_time_interval = dataset_time_interval
+    @staticmethod
+    def _get_first_and_last_indexelement_from_xarray(xarray, dim_name, return_type='as_tuple'):
+        if isinstance(xarray, xr.DataArray):
+            first = xarray.coords[dim_name].values[0]
+            last = xarray.coords[dim_name].values[-1]
+            if return_type == 'as_tuple':
+                return first, last
+            elif return_type == 'as_dict':
+                return {'first': first, 'last': last}
+            else:
+                raise TypeError(f"return_type must be 'as_tuple' or 'as_dict', but is '{return_type}'")
+        else:
+            raise TypeError(f"xarray must be of type xr.DataArray, but is of type {type(xarray)}")
+    @staticmethod
+    def _make_full_time_index(irregular_time_index, freq):
+        full_time_index = pd.date_range(start=irregular_time_index[0], end=irregular_time_index[-1], freq=freq)
+        return full_time_index
+    def _plot(self, plt_type='hist', *args):
+        if plt_type == 'hist':
+            self._plot_hist()
+        elif plt_type == 'hist_cum':
+            self._plot_hist_cum()
+        else:
+            raise ValueError(f"plt_type mus be 'hist' or 'hist_cum', but is {type}")
+    def _plot_hist(self, *args):
+        colors = self.get_dataset_colors()
+        fig, axes = plt.subplots(figsize=(10, 3))
+        for i, subset in enumerate(self.dataset_time_interval.keys()):
+            plot_dataset = self.avail_data_amount.sel({self.subset_dim: subset,
+                                                       self.temporal_dim: slice(
+                                                           self.dataset_time_interval[subset]['first'],
+                                                           self.dataset_time_interval[subset]['last']
+                                                       )
+                                                       }
+                                                      )
+            plot_dataset.plot.step(color=colors[subset], ax=axes, label=subset)
+            plt.fill_between(plot_dataset.coords[self.temporal_dim].values, plot_dataset.values, color=colors[subset])
+        lgd = fig.legend(loc="upper right", ncol=len(self.dataset_time_interval),
+                         facecolor='white', framealpha=1, edgecolor='black')
+        for lgd_line in lgd.get_lines():
+            lgd_line.set_linewidth(4.0)
+        plt.gca().xaxis.set_major_locator(mdates.YearLocator())
+        plt.title('')
+        plt.ylabel('Number of samples')
+        plt.tight_layout()
+    def _plot_hist_cum(self, *args):
+        colors = self.get_dataset_colors()
+        fig, axes = plt.subplots(figsize=(10, 3))
+        n_bins = int(self.avail_data_cum_sum.max().values)
+        bins = np.arange(0, n_bins+1)
+        descending_subsets = self.avail_data_cum_sum.max(dim=self.station_dim).sortby(
+            self.avail_data_cum_sum.max(dim=self.station_dim), ascending=False
+        ).coords[self.subset_dim].values
+        for subset in descending_subsets:
+            self.avail_data_cum_sum.sel({self.subset_dim: subset}).plot.hist(ax=axes,
+                                                                             bins=bins,
+                                                                             label=subset,
+                                                                             cumulative=-1,
+                                                                             color=colors[subset],
+                                                                             # alpha=.5
+                                                                             )
+        lgd = fig.legend(loc="upper right", ncol=len(self.dataset_time_interval),
+                         facecolor='white', framealpha=1, edgecolor='black')
+        plt.title('')
+        plt.ylabel('Number of stations')
+        plt.xlabel('Number of samples')
+        plt.xlim((bins[0], bins[-1]))
+        plt.tight_layout()
 if __name__ == "__main__":
    stations = ['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087']
    path = "../../testrun_network/forecasts"