diff --git a/mlair/data_handler/data_handler_wrf_chem.py b/mlair/data_handler/data_handler_wrf_chem.py index 1d42649bdde2b571ff9ba77dd8d3027386ae275d..94dcc9c2b30bd4658d0c29dc7bde913d3f70c3d5 100644 --- a/mlair/data_handler/data_handler_wrf_chem.py +++ b/mlair/data_handler/data_handler_wrf_chem.py @@ -1248,6 +1248,7 @@ class DataHandlerSectorGrid(DataHandlerSingleGridColumn): def wind_upstream_sector_by_name(self, wind_upstream_sector_by_name: xr.DataArray): self._wind_upstream_sector_by_name = wind_upstream_sector_by_name + @TimeTrackingWrapper def _store_wind_upstream_sector_by_name(self): file_name = os.path.join(self.experiment_path, f"data/{self.station[0]}_{self.start}_{self.end}_upstream_wind_sector.nc") @@ -1257,6 +1258,7 @@ class DataHandlerSectorGrid(DataHandlerSingleGridColumn): dim=list(set(dims_to_expand) - {self.iter_dim, self.target_dim}), drop=True) wind_upstream_sector_by_name = wind_upstream_sector_by_name.to_dataset(self.iter_dim) wind_upstream_sector_by_name.to_netcdf(file_name) + logging.info(f"_store_wind_upstream_sector_by_name: {file_name}") @TimeTrackingWrapper def extract_data_from_loader(self, loader): diff --git a/mlair/helpers/statistics.py b/mlair/helpers/statistics.py index d4c25bb8891b88e114ea9244cf308bd810910daf..028032253b6051b303b21cae398822a13a9a3a2b 100644 --- a/mlair/helpers/statistics.py +++ b/mlair/helpers/statistics.py @@ -212,6 +212,11 @@ def mean_squared_error(a, b, dim=None): return np.square(a - b).mean(dim) +def mean_squared_error_nan(a: xr.DataArray, b: xr.DataArray, dim=None) -> xr.DataArray: + """Calculate mean squared error.""" + return xr.ufuncs.square(a - b).mean(dim, skipna=True) + + def mean_absolute_error(a, b, dim=None): """Calculate mean absolute error.""" return np.abs(a - b).mean(dim) @@ -231,7 +236,7 @@ def skill_score_based_on_mse(data: xr.DataArray, obs_name: str, pred_name: str, obs = data.sel({competitor_dim: obs_name}) pred = data.sel({competitor_dim: pred_name}) ref = data.sel({competitor_dim: ref_name}) - ss = 1 - mean_squared_error(obs, pred, dim=aggregation_dim) / mean_squared_error(obs, ref, dim=aggregation_dim) + ss = 1 - mean_squared_error_nan(obs, pred, dim=aggregation_dim) / mean_squared_error_nan(obs, ref, dim=aggregation_dim) return ss @@ -270,7 +275,8 @@ def represent_p_values_as_asteriks(p_values: pd.Series, threshold_representation f"`threshold_representation' keys mus be in strictly " f"decreasing order but is: {threshold_representation.keys()}") - asteriks = pd.Series().reindex_like(p_values) + # asteriks = pd.Series().reindex_like(p_values) + asteriks = pd.DataFrame().reindex_like(p_values) for k, v in threshold_representation.items(): asteriks[p_values < k] = v return asteriks diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py index a4768d96b1cdea15c28d2af9f59020b550a82afe..335e67399e239c82d17f572dbc7a3bc3d401d409 100644 --- a/mlair/plotting/postprocessing_plotting.py +++ b/mlair/plotting/postprocessing_plotting.py @@ -22,7 +22,7 @@ from scipy.stats import mannwhitneyu from mlair import helpers from mlair.data_handler.iterator import DataCollection -from mlair.helpers import TimeTrackingWrapper +from mlair.helpers import TimeTrackingWrapper, tables from mlair.plotting.abstract_plot_class import AbstractPlotClass from mlair.helpers.statistics import mann_whitney_u_test, represent_p_values_as_asteriks @@ -65,7 +65,7 @@ class PlotMonthlySummary(AbstractPlotClass): # pragma: no cover self._data = self._prepare_data(stations) self._window_lead_time = self._get_window_lead_time(window_lead_time) self._plot(target_var, target_var_unit) - self._save() + # self._save() def _prepare_data(self, stations: List) -> xr.DataArray: """ @@ -131,14 +131,33 @@ class PlotMonthlySummary(AbstractPlotClass): # pragma: no cover """ data = self._data.to_dataset(name='values').to_dask_dataframe() logging.debug("... start plotting") + plot_path = os.path.join(os.path.abspath(self.plot_folder), f"{self.plot_name}.pdf") + pdf_pages = matplotlib.backends.backend_pdf.PdfPages(plot_path) color_palette = [matplotlib.colors.cnames["green"]] + sns.color_palette("Blues_r", self._window_lead_time).as_hex() - ax = sns.boxplot(x='index', y='values', hue='ahead', data=data.compute(), whis=1.5, palette=color_palette, - flierprops={'marker': '.', 'markersize': 1}, showmeans=True, - meanprops={'markersize': 1, 'markeredgecolor': 'k'}) - ylabel = self._spell_out_chemical_concentrations(target_var) - ax.set(xlabel='month', ylabel=f'{ylabel} (in {target_var_unit})') - plt.tight_layout() + for i in range(3): + ax = sns.boxplot(x='index', y='values', hue='ahead', data=data.compute(), whis=1.5, palette=color_palette, + flierprops={'marker': '.', 'markersize': 1}, showmeans=True, + meanprops={'markersize': 1, 'markeredgecolor': 'k'}) + ylabel = self._spell_out_chemical_concentrations(target_var) + ax.set(xlabel='month', ylabel=f'{ylabel} (in {target_var_unit})') + + handles, labels = ax.axes.get_legend_handles_labels() + if i > 0: + ax.legend().remove() + if i == 1: + ax.legend(handles[:len(color_palette)-1], labels[:len(color_palette)-1], + ncol=len(color_palette)-1, loc='lower center') + if i == 2: + ax.legend(handles[:len(color_palette)-1], labels[:len(color_palette)-1], + ncol=len(color_palette)-1, bbox_to_anchor=(0., 1.02, 1., .102), loc=3) + + plt.tight_layout() + pdf_pages.savefig() + # close all open figures / plots + pdf_pages.close() + plt.close('all') + @TimeTrackingWrapper @@ -644,7 +663,7 @@ class PlotSectorialSkillScore(AbstractPlotClass): # pragma: no cover fig, ax = plt.subplots(figsize=(size, size * 0.8)) data = self._data sns.boxplot(x="sector", y="data", hue="ahead", data=data, whis=1, ax=ax, palette="Blues_r", - showmeans=True, meanprops={"markersize": 3, "markeredgecolor": "k"}, flierprops={"marker": "."}, + showmeans=False, meanprops={"markersize": 3, "markeredgecolor": "k"}, flierprops={"marker": "."}, ) ax.axhline(y=0, color="grey", linewidth=.5) ax.set(ylabel=f"skill score ({self._model_setup} vs. {self._reference_model})", xlabel="sector", @@ -659,7 +678,7 @@ class PlotSectorialSkillScore(AbstractPlotClass): # pragma: no cover fig, ax = plt.subplots() data = self._data sns.boxplot(y="sector", x="data", hue="ahead", data=data, whis=1.5, ax=ax, palette="Blues_r", - showmeans=True, meanprops={"markersize": 3, "markeredgecolor": "k"}, flierprops={"marker": "."}, + showmeans=False, meanprops={"markersize": 3, "markeredgecolor": "k"}, flierprops={"marker": "."}, ) ax.axvline(x=0, color="grey", linewidth=.5) ax.set(xlabel=f"skill score ({self._model_setup} vs. {self._reference_model})", ylabel="sector", @@ -1291,7 +1310,8 @@ class PlotSampleUncertaintyFromBootstrap(AbstractPlotClass): # pragma: no cover def __init__(self, data: xr.DataArray, plot_folder: str = ".", model_type_dim: str = "type", error_measure: str = "mse", error_unit: str = None, dim_name_boots: str = 'boots', - block_length: str = None, model_name: str = "NN", model_indicator: str = "nn"): + block_length: str = None, model_name: str = "NN", model_indicator: str = "nn", + report_path: str = None): super().__init__(plot_folder, "sample_uncertainty_from_bootstrap") default_name = self.plot_name self.model_type_dim = model_type_dim @@ -1300,6 +1320,7 @@ class PlotSampleUncertaintyFromBootstrap(AbstractPlotClass): # pragma: no cover self.error_unit = error_unit self.block_length = block_length self.model_name = model_name + self.report_path = report_path data = self.rename_model_indicator(data, model_name, model_indicator) self.prepare_data(data) @@ -1322,9 +1343,9 @@ class PlotSampleUncertaintyFromBootstrap(AbstractPlotClass): # pragma: no cover @property def get_asteriks_from_mann_whitney_u_result(self): - return represent_p_values_as_asteriks(mann_whitney_u_test(data=self._data_table, - reference_col_name=self.model_name, - axis=0, alternative="two-sided").iloc[-1]) + utest = mann_whitney_u_test(data=self._data_table, reference_col_name=self.model_name, + axis=0, alternative="two-sided") + return represent_p_values_as_asteriks(utest).iloc[-1], utest def rename_model_indicator(self, data, model_name, model_indicator): data.coords[self.model_type_dim] = [{model_indicator: model_name}.get(n, n) @@ -1345,7 +1366,6 @@ class PlotSampleUncertaintyFromBootstrap(AbstractPlotClass): # pragma: no cover data_table = self._data_table n_boots = self._n_boots size = len(np.unique(data_table.columns)) - asteriks = self.get_asteriks_from_mann_whitney_u_result if orientation == "v": figsize, width = (size, 5), 0.4 elif orientation == "h": @@ -1358,6 +1378,13 @@ class PlotSampleUncertaintyFromBootstrap(AbstractPlotClass): # pragma: no cover flierprops={"marker": "o", "markerfacecolor": "black", "markeredgecolor": "none", "markersize": 3}, boxprops={'facecolor': 'none', 'edgecolor': 'k'}, width=width, orient=orientation) + if apply_u_test: #ToDo move calc of u-test from plotting to postproc! + asteriks, utest = self.get_asteriks_from_mann_whitney_u_result + column_format = tables.create_column_format_for_tex(utest) + file_name = f"{self.plot_name}.%s" + tables.save_to_tex(self.report_path, file_name % "tex", column_format=column_format, df=utest) + tables.save_to_md(self.report_path, file_name % "md", df=utest) + utest.to_csv(os.path.join(self.report_path, file_name % "csv")) if orientation == "v": if apply_u_test: ax = self.set_significance_bars(asteriks, ax, data_table, orientation) diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py index 04bcbf753bc2ed2a15a8df41eaff852b3395a27c..d529bc715006c08d16b56c40cac98d0bc90787cd 100644 --- a/mlair/run_modules/post_processing.py +++ b/mlair/run_modules/post_processing.py @@ -164,7 +164,7 @@ class PostProcessing(RunEnvironment): logging.warning("Dimension `XTIME' does not exist") ds = ds.squeeze() logging.info(f"PostProcessing.load_upstream_wind_sector({name_of_set}: shape of `upstream_wind_sector' is " - f"{ds.shape}\ndims are {ds.dims}") + f"{ds.shape}\ndims are {ds.dims}, ds.isnull().any()={ds.isnull().any()}") self.upstream_wind_sector = ds @TimeTrackingWrapper @@ -675,11 +675,13 @@ class PostProcessing(RunEnvironment): try: if "PlotSampleUncertaintyFromBootstrap" in plot_list and self.uncertainty_estimate is not None: block_length = self.data_store.get_default("block_length", default="1m", scope="uncertainty_estimate") + report_path = os.path.join(self.data_store.get("experiment_path"), "latex_report") PlotSampleUncertaintyFromBootstrap( data=self.uncertainty_estimate, plot_folder=self.plot_path, model_type_dim=self.model_type_dim, dim_name_boots=self.uncertainty_estimate_boot_dim, error_measure="mean squared error", error_unit=fr"{self.target_var_unit}$^2$", block_length=block_length, - model_name=self.model_display_name, model_indicator=self.forecast_indicator) + model_name=self.model_display_name, model_indicator=self.forecast_indicator, + report_path=report_path) except Exception as e: logging.error(f"Could not create plot PlotSampleUncertaintyFromBootstrap due to the following error: {e}" @@ -957,7 +959,10 @@ class PostProcessing(RunEnvironment): file = os.path.join(path, f"forecasts_{str(station)}_test.nc") with xr.open_dataarray(file) as da: return da.load() - except (IndexError, KeyError, FileNotFoundError): + except (IndexError, KeyError, FileNotFoundError) as e: + logging.error(f"Could not load external test data for station {station} from " + f"{path}) due to the following error:\n{sys.exc_info()[0]}\n" + f"{sys.exc_info()[1]}\n{sys.exc_info()[2]}") return None def _combine_forecasts(self, forecast, competitor, dim=None): @@ -1003,17 +1008,20 @@ class PostProcessing(RunEnvironment): wind_sectors = self.data_store.get("wind_sectors", "general") h_sector_skill_scores = [] for sec in wind_sectors: + + # prepare ds to drop nans only for time steps which are currently not eq. sec: + # https://stackoverflow.com/questions/52553925/python-xarray-remove-coordinates-with-all-missing-variables + hds = ds.where(self.upstream_wind_sector.squeeze() == sec) + # |__________________ stack all dims without time ________________| |_ drop along index_ | |_unstack_| + hds = hds.stack(z=(self.iter_dim, self.ahead_dim, self.model_type_dim)).dropna(self.index_dim, + how="all").unstack() h_sector_skill_scores.append( - # statistics.SkillScores(None).general_skill_score(ds.where(self.upstream_wind_sector.squeeze() == sec), - # forecast_name=self.model_display_name, - # reference_name=ref_name, - # observation_name=self.observation_indicator) - statistics.skill_score_based_on_mse( - ds.where(self.upstream_wind_sector.squeeze() == sec).dropna(dim=self.index_dim), - obs_name=self.observation_indicator, pred_name=self.model_display_name, - ref_name=ref_name).assign_coords({"sector": sec} - ) - ) + statistics.skill_score_based_on_mse(hds, + obs_name=self.observation_indicator, + pred_name=self.model_display_name, + ref_name=ref_name).assign_coords({"sector": sec} + ) + ) sector_skill_scores = xr.concat(h_sector_skill_scores, dim="sector") sector_skill_scores = sector_skill_scores.assign_attrs({f"reference_model": ref_name}) return sector_skill_scores @@ -1164,3 +1172,4 @@ class PostProcessing(RunEnvironment): file_name = f"error_report_{metric}_{model_type}.%s".replace(' ', '_').replace('/', '_') tables.save_to_tex(report_path, file_name % "tex", column_format=column_format, df=df) tables.save_to_md(report_path, file_name % "md", df=df) + df.to_csv(os.path.join(report_path, file_name % "csv"))