diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9f1ba73f5a3b3fada8624dfd89fa6f12488a877b..f3ec1ab98cf8e46b97e2d803518ed57c6cfd4622 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -23,13 +23,71 @@ version: paths: - badges/ +### Tests (from scratch) ### +tests (from scratch): + tags: + - base + - zam347 + stage: test + only: + - master + - /^release.*$/ + - develop + variables: + FAILURE_THRESHOLD: 100 + TEST_TYPE: "scratch" + before_script: + - chmod +x ./CI/update_badge.sh + - ./CI/update_badge.sh > /dev/null + script: + - zypper --non-interactive install binutils libproj-devel gdal-devel + - zypper --non-interactive install proj geos-devel + - pip install -r requirements.txt + - chmod +x ./CI/run_pytest.sh + - ./CI/run_pytest.sh + after_script: + - ./CI/update_badge.sh > /dev/null + artifacts: + name: pages + when: always + paths: + - badges/ + - test_results/ + +### Tests (on GPU) ### +tests (on GPU): + tags: + - gpu + - zam347 + stage: test + only: + - master + - /^release.*$/ + - develop + variables: + FAILURE_THRESHOLD: 100 + TEST_TYPE: "gpu" + before_script: + - chmod +x ./CI/update_badge.sh + - ./CI/update_badge.sh > /dev/null + script: + - pip install -r requirements.txt + - chmod +x ./CI/run_pytest.sh + - ./CI/run_pytest.sh + after_script: + - ./CI/update_badge.sh > /dev/null + artifacts: + name: pages + when: always + paths: + - badges/ + - test_results/ + ### Tests ### tests: tags: - - leap + - machinelearningtools - zam347 - - base - - django stage: test variables: FAILURE_THRESHOLD: 100 @@ -51,10 +109,8 @@ tests: coverage: tags: - - leap + - machinelearningtools - zam347 - - base - - django stage: test variables: FAILURE_THRESHOLD: 50 @@ -79,7 +135,6 @@ coverage: pages: stage: pages tags: - - leap - zam347 - base script: diff --git a/CI/run_pytest.sh b/CI/run_pytest.sh index a7d883dcc95e0c16541af00ed0891e2d31dee82c..baa7ef8e892fc2d9efdd30094917ca492017de3d 100644 --- a/CI/run_pytest.sh +++ b/CI/run_pytest.sh @@ -1,24 +1,30 @@ #!/bin/bash # run pytest for all run_modules -python3 -m pytest --html=report.html --self-contained-html test/ | tee test_results.out +python3.6 -m pytest --html=report.html --self-contained-html test/ | tee test_results.out IS_FAILED=$? +if [ -z ${TEST_TYPE+x} ]; then + TEST_TYPE=""; else + TEST_TYPE="_"${TEST_TYPE}; +fi + # move html test report -mkdir test_results/ +TEST_RESULTS_DIR="test_results${TEST_TYPE}/" +mkdir ${TEST_RESULTS_DIR} BRANCH_NAME=$( echo -e "${CI_COMMIT_REF_NAME////_}") -mkdir test_results/${BRANCH_NAME} -mkdir test_results/recent -cp report.html test_results/${BRANCH_NAME}/. -cp report.html test_results/recent/. +mkdir "${TEST_RESULTS_DIR}/${BRANCH_NAME}" +mkdir "${TEST_RESULTS_DIR}/recent" +cp report.html "${TEST_RESULTS_DIR}/${BRANCH_NAME}/." +cp report.html "${TEST_RESULTS_DIR}/recent/." if [[ "${CI_COMMIT_REF_NAME}" = "master" ]]; then - cp -r report.html test_results/. + cp -r report.html ${TEST_RESULTS_DIR}/. fi # exit 0 if no tests implemented RUN_NO_TESTS="$(grep -c 'no tests ran' test_results.out)" -if [[ ${RUN_NO_TESTS} > 0 ]]; then +if [[ ${RUN_NO_TESTS} -gt 0 ]]; then echo "no test available" echo "incomplete" > status.txt echo "no tests avail" > incomplete.txt @@ -27,20 +33,19 @@ fi # extract if tests passed or not TEST_FAILED="$(grep -oP '(\d+\s{1}failed)' test_results.out)" -TEST_FAILED="$(echo ${TEST_FAILED} | (grep -oP '\d*'))" +TEST_FAILED="$(echo "${TEST_FAILED}" | (grep -oP '\d*'))" TEST_PASSED="$(grep -oP '\d+\s{1}passed' test_results.out)" -TEST_PASSED="$(echo ${TEST_PASSED} | (grep -oP '\d*'))" +TEST_PASSED="$(echo "${TEST_PASSED}" | (grep -oP '\d*'))" if [[ -z "$TEST_FAILED" ]]; then TEST_FAILED=0 fi -let "TEST_PASSED=${TEST_PASSED}-${TEST_FAILED}" - +(( TEST_PASSED=TEST_PASSED-TEST_FAILED )) # calculate metrics -let "SUM=${TEST_FAILED}+${TEST_PASSED}" -let "TEST_PASSED_RATIO=${TEST_PASSED}*100/${SUM}" +(( SUM=TEST_FAILED+TEST_PASSED )) +(( TEST_PASSED_RATIO=TEST_PASSED*100/SUM )) # report -if [[ ${IS_FAILED} == 0 ]]; then +if [[ ${IS_FAILED} -eq 0 ]]; then if [[ ${TEST_PASSED_RATIO} -lt 100 ]]; then echo "only ${TEST_PASSED_RATIO}% passed" echo "incomplete" > status.txt diff --git a/CI/run_pytest_coverage.sh b/CI/run_pytest_coverage.sh index 2157192d49a15baa048968b799aa264941152c1b..45916427f1521843923fb94e49dc661241dc0369 100644 --- a/CI/run_pytest_coverage.sh +++ b/CI/run_pytest_coverage.sh @@ -1,17 +1,16 @@ #!/usr/bin/env bash # run coverage twice, 1) for html deploy 2) for success evaluation -python3 -m pytest --cov=src --cov-report html test/ -python3 -m pytest --cov=src --cov-report term test/ | tee coverage_results.out +python3.6 -m pytest --cov=src --cov-report term --cov-report html test/ | tee coverage_results.out IS_FAILED=$? # move html coverage report mkdir coverage/ BRANCH_NAME=$( echo -e "${CI_COMMIT_REF_NAME////_}") -mkdir coverage/${BRANCH_NAME} -mkdir coverage/recent -cp -r htmlcov/* coverage/${BRANCH_NAME}/. +mkdir "coverage/${BRANCH_NAME}" +mkdir "coverage/recent" +cp -r htmlcov/* "coverage/${BRANCH_NAME}/." cp -r htmlcov/* coverage/recent/. if [[ "${CI_COMMIT_REF_NAME}" = "master" ]]; then cp -r htmlcov/* coverage/. @@ -19,10 +18,10 @@ fi # extract coverage information COVERAGE_RATIO="$(grep -oP '\d+\%' coverage_results.out | tail -1)" -COVERAGE_RATIO="$(echo ${COVERAGE_RATIO} | (grep -oP '\d*'))" +COVERAGE_RATIO="$(echo "${COVERAGE_RATIO}" | (grep -oP '\d*'))" # report -if [[ ${IS_FAILED} == 0 ]]; then +if [[ ${IS_FAILED} -eq 0 ]]; then if [[ ${COVERAGE_RATIO} -lt ${COVERAGE_PASS_THRESHOLD} ]]; then echo "only ${COVERAGE_RATIO}% covered" echo "incomplete" > status.txt diff --git a/requirements.txt b/requirements.txt index e7c2f439966f6b085348af3078c814c7f0511024..b46f44416cf6560ecc0b62f8d22dd7d547a036c6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -53,6 +53,7 @@ seaborn==0.10.0 --no-binary shapely Shapely==1.7.0 six==1.11.0 statsmodels==0.11.1 +tabulate tensorboard==1.13.1 tensorflow-estimator==1.13.0 tensorflow==1.13.1 diff --git a/requirements_gpu.txt b/requirements_gpu.txt index 9d1c2d62da0864d2626c7ada1aac4dcf6f633630..6ce4df8fe164408024e21db5ea94a692fb5dbf26 100644 --- a/requirements_gpu.txt +++ b/requirements_gpu.txt @@ -53,6 +53,7 @@ seaborn==0.10.0 --no-binary shapely Shapely==1.7.0 six==1.11.0 statsmodels==0.11.1 +tabulate tensorboard==1.13.1 tensorflow-estimator==1.13.0 tensorflow-gpu==1.13.1 diff --git a/src/helpers.py b/src/helpers.py index 4d733ebbaf8761a815cd8669720fc12ba60ef4e7..df234b3c2e621a7ab93d19de8777864720522e40 100644 --- a/src/helpers.py +++ b/src/helpers.py @@ -5,18 +5,20 @@ __date__ = '2019-10-21' import datetime as dt +from functools import wraps import logging import math import os import getpass import socket -import sys import time +import types + import keras.backend as K import xarray as xr -from typing import Dict, Callable +from typing import Dict, Callable, Pattern, Union def to_list(arg): @@ -45,6 +47,19 @@ def l_p_loss(power: int): return loss +class TimeTrackingWrapper: + + def __init__(self, func): + wraps(func)(self) + + def __call__(self, *args, **kwargs): + with TimeTracking(name=self.__wrapped__.__name__): + return self.__wrapped__(*args, **kwargs) + + def __get__(self, instance, cls): + return types.MethodType(self, instance) + + class TimeTracking(object): """ Track time to measure execution time. Time tracking automatically starts on initialisation and ends by calling stop @@ -103,9 +118,10 @@ def get_host(): def prepare_host(create_new=True, sampling="daily"): + hostname = get_host() user = getpass.getuser() - + runner_regex = re.compile(r"runner-.*-project-2411-concurrent-\d+") if hostname == "ZAM144": path = f"/home/{user}/Data/toar_{sampling}/" elif hostname == "zam347": @@ -116,7 +132,7 @@ def prepare_host(create_new=True, sampling="daily"): path = f"/p/project/cjjsc42/{user}/DATA/toar_{sampling}/" elif (len(hostname) > 2) and (hostname[:2] in ['jw', 'ju']): path = f"/p/project/deepacf/intelliaq/{user}/DATA/toar_{sampling}/" - elif "runner-6HmDp9Qd-project-2411-concurrent" in hostname: + elif runner_regex.match(hostname) is not None: path = f"/home/{user}/machinelearningtools/data/toar_{sampling}/" else: raise OSError(f"unknown host '{hostname}'") @@ -159,7 +175,7 @@ def set_bootstrap_path(bootstrap_path, data_path, sampling): class PyTestRegex: """Assert that a given string meets some expectations.""" - def __init__(self, pattern: str, flags: int = 0): + def __init__(self, pattern: Union[str, Pattern], flags: int = 0): self._regex = re.compile(pattern, flags) def __eq__(self, actual: str) -> bool: diff --git a/src/model_modules/inception_model.py b/src/model_modules/inception_model.py index 6467b3245ad097af6ef17e596f85264eef383d7a..15739556d7d28d9e7e6ecc454615d82fb81a2754 100644 --- a/src/model_modules/inception_model.py +++ b/src/model_modules/inception_model.py @@ -75,12 +75,9 @@ class InceptionModelBase: name=f'Block_{self.number_of_blocks}{self.block_part_name()}_1x1')(input_x) tower = self.act(tower, activation, **act_settings) - # tower = self.padding_layer(padding)(padding=padding_size, - # name=f'Block_{self.number_of_blocks}{self.block_part_name()}_Pad' - # )(tower) tower = Padding2D(padding)(padding=padding_size, - name=f'Block_{self.number_of_blocks}{self.block_part_name()}_Pad' - )(tower) + name=f'Block_{self.number_of_blocks}{self.block_part_name()}_Pad' + )(tower) tower = layers.Conv2D(tower_filter, tower_kernel, @@ -111,29 +108,6 @@ class InceptionModelBase: else: return act_name.__name__ - # @staticmethod - # def padding_layer(padding): - # allowed_paddings = { - # 'RefPad2D': ReflectionPadding2D, 'ReflectionPadding2D': ReflectionPadding2D, - # 'SymPad2D': SymmetricPadding2D, 'SymmetricPadding2D': SymmetricPadding2D, - # 'ZeroPad2D': keras.layers.ZeroPadding2D, 'ZeroPadding2D': keras.layers.ZeroPadding2D - # } - # if isinstance(padding, str): - # try: - # pad2d = allowed_paddings[padding] - # except KeyError as einfo: - # raise NotImplementedError( - # f"`{einfo}' is not implemented as padding. " - # "Use one of those: i) `RefPad2D', ii) `SymPad2D', iii) `ZeroPad2D'") - # else: - # if padding in allowed_paddings.values(): - # pad2d = padding - # else: - # raise TypeError(f"`{padding.__name__}' is not a valid padding layer type. " - # "Use one of those: " - # "i) ReflectionPadding2D, ii) SymmetricPadding2D, iii) ZeroPadding2D") - # return pad2d - def create_pool_tower(self, input_x, pool_kernel, tower_filter, activation='relu', max_pooling=True, **kwargs): """ This function creates a "MaxPooling tower block" @@ -159,7 +133,6 @@ class InceptionModelBase: block_type = "AvgPool" pooling = layers.AveragePooling2D - # tower = self.padding_layer(padding)(padding=padding_size, name=block_name+'Pad')(input_x) tower = Padding2D(padding)(padding=padding_size, name=block_name+'Pad')(input_x) tower = pooling(pool_kernel, strides=(1, 1), padding='valid', name=block_name+block_type)(tower) @@ -215,35 +188,6 @@ class InceptionModelBase: return block -# if __name__ == '__main__': -# from keras.models import Model -# from keras.layers import Conv2D, Flatten, Dense, Input -# import numpy as np -# -# -# kernel_1 = (3, 3) -# kernel_2 = (5, 5) -# x = np.array(range(2000)).reshape(-1, 10, 10, 1) -# y = x.mean(axis=(1, 2)) -# -# x_input = Input(shape=x.shape[1:]) -# pad1 = PadUtils.get_padding_for_same(kernel_size=kernel_1) -# x_out = InceptionModelBase.padding_layer('RefPad2D')(padding=pad1, name="RefPAD1")(x_input) -# # x_out = ReflectionPadding2D(padding=pad1, name="RefPAD")(x_input) -# x_out = Conv2D(5, kernel_size=kernel_1, activation='relu')(x_out) -# -# pad2 = PadUtils.get_padding_for_same(kernel_size=kernel_2) -# x_out = InceptionModelBase.padding_layer(SymmetricPadding2D)(padding=pad2, name="SymPAD1")(x_out) -# # x_out = SymmetricPadding2D(padding=pad2, name="SymPAD")(x_out) -# x_out = Conv2D(2, kernel_size=kernel_2, activation='relu')(x_out) -# x_out = Flatten()(x_out) -# x_out = Dense(1, activation='linear')(x_out) -# -# model = Model(inputs=x_input, outputs=x_out) -# model.compile('adam', loss='mse') -# model.summary() -# # model.fit(x, y, epochs=10) - if __name__ == '__main__': print(__name__) from keras.datasets import cifar10 diff --git a/src/plotting/postprocessing_plotting.py b/src/plotting/postprocessing_plotting.py index 44752d67dcacb769cf63fdbedac4b57757432abe..4fcb1f49d828f47c36a5597341585896a19fcc9a 100644 --- a/src/plotting/postprocessing_plotting.py +++ b/src/plotting/postprocessing_plotting.py @@ -15,15 +15,37 @@ import pandas as pd import seaborn as sns import xarray as xr from matplotlib.backends.backend_pdf import PdfPages +import matplotlib.patches as mpatches from src import helpers -from src.helpers import TimeTracking -from src.run_modules.run_environment import RunEnvironment +from src.helpers import TimeTracking, TimeTrackingWrapper +from src.data_handling.data_generator import DataGenerator logging.getLogger('matplotlib').setLevel(logging.WARNING) -class PlotMonthlySummary(RunEnvironment): +class AbstractPlotClass: + + def __init__(self, plot_folder, plot_name, resolution=500): + self.plot_folder = plot_folder + self.plot_name = plot_name + self.resolution = resolution + + def _plot(self, *args): + raise NotImplementedError + + def _save(self, **kwargs): + """ + Standard save method to store plot locally. Name of and path to plot need to be set on initialisation + """ + plot_name = os.path.join(os.path.abspath(self.plot_folder), f"{self.plot_name}.pdf") + logging.debug(f"... save plot to {plot_name}") + plt.savefig(plot_name, dpi=self.resolution, **kwargs) + plt.close('all') + + +@TimeTrackingWrapper +class PlotMonthlySummary(AbstractPlotClass): """ Show a monthly summary over all stations for each lead time ("ahead") as box and whiskers plot. The plot is saved in data_path with name monthly_summary_box_plot.pdf and 500dpi resolution. @@ -40,12 +62,13 @@ class PlotMonthlySummary(RunEnvironment): the maximum lead time from data is used. (default None -> use maximum lead time from data). :param plot_folder: path to save the plot (default: current directory) """ - super().__init__() + super().__init__(plot_folder, "monthly_summary_box_plot") self._data_path = data_path self._data_name = name self._data = self._prepare_data(stations) self._window_lead_time = self._get_window_lead_time(window_lead_time) - self._plot(target_var, plot_folder) + self._plot(target_var) + self._save() def _prepare_data(self, stations: List) -> xr.DataArray: """ @@ -89,12 +112,11 @@ class PlotMonthlySummary(RunEnvironment): window_lead_time = ahead_steps return min(ahead_steps, window_lead_time) - def _plot(self, target_var: str, plot_folder: str): + def _plot(self, target_var: str): """ Main plot function that creates a monthly grouped box plot over all stations but with separate boxes for each lead time step. :param target_var: display name of the target variable on plot's axis - :param plot_folder: path to save the plot """ data = self._data.to_dataset(name='values').to_dask_dataframe() logging.debug("... start plotting") @@ -104,21 +126,10 @@ class PlotMonthlySummary(RunEnvironment): meanprops={'markersize': 1, 'markeredgecolor': 'k'}) ax.set(xlabel='month', ylabel=f'{target_var}') plt.tight_layout() - self._save(plot_folder) - - @staticmethod - def _save(plot_folder): - """ - Standard save method to store plot locally. The name of this plot is static. - :param plot_folder: path to save the plot - """ - plot_name = os.path.join(os.path.abspath(plot_folder), 'monthly_summary_box_plot.pdf') - logging.debug(f"... save plot to {plot_name}") - plt.savefig(plot_name, dpi=500) - plt.close('all') -class PlotStationMap(RunEnvironment): +@TimeTrackingWrapper +class PlotStationMap(AbstractPlotClass): """ Plot geographical overview of all used stations as squares. Different data sets can be colorised by its key in the input dictionary generators. The key represents the color to plot on the map. Currently, there is only a white @@ -133,12 +144,10 @@ class PlotStationMap(RunEnvironment): as value. :param plot_folder: path to save the plot (default: current directory) """ - super().__init__() + super().__init__(plot_folder, "station_map") self._ax = None - if self.data_store.get("hostname")[:2] in self.data_store.get("hpc_hosts"): - logging.info(f"Running on a hpc system {self.data_store.get('hostname')}. Skip {self.__class__.__name__}...") - else: - self._plot(generators, plot_folder) + self._plot(generators) + self._save() def _draw_background(self): """ @@ -170,12 +179,11 @@ class PlotStationMap(RunEnvironment): station_coords.loc['station_lat'].values) self._ax.plot(IDx, IDy, mfc=color, mec='k', marker='s', markersize=6, transform=ccrs.PlateCarree()) - def _plot(self, generators: Dict, plot_folder: str): + def _plot(self, generators: Dict): """ Main plot function to create the station map plot. Sets figure and calls all required sub-methods. :param generators: dictionary with the plot color of each data set as key and the generator containing all stations as value. - :param plot_folder: path to save the plot """ import cartopy.crs as ccrs @@ -184,40 +192,15 @@ class PlotStationMap(RunEnvironment): self._ax.set_extent([0, 20, 42, 58], crs=ccrs.PlateCarree()) self._draw_background() self._plot_stations(generators) - self._save(plot_folder) - - @staticmethod - def _save(plot_folder): - """ - Standard save method to store plot locally. The name of this plot is static. - :param plot_folder: path to save the plot - """ - plot_name = os.path.join(os.path.abspath(plot_folder), 'station_map.pdf') - logging.debug(f"... save plot to {plot_name}") - plt.savefig(plot_name, dpi=500) - plt.close('all') -def plot_conditional_quantiles(stations: list, plot_folder: str = ".", rolling_window: int = 3, ref_name: str = 'obs', - pred_name: str = 'CNN', season: str = "", forecast_path: str = None, - plot_name_affix: str = "", units: str = "ppb"): +@TimeTrackingWrapper +class PlotConditionalQuantiles(AbstractPlotClass): """ - This plot was originally taken from Murphy, Brown and Chen (1989): - https://journals.ametsoc.org/doi/pdf/10.1175/1520-0434%281989%29004%3C0485%3ADVOTF%3E2.0.CO%3B2 - - :param stations: stations to include in the plot (forecast data needs to be available already) - :param plot_folder: path to save the plot (default: current directory) - :param rolling_window: the rolling window mean will smooth the plot appearance (no smoothing in bin calculation, - this is only a cosmetic step, default: 3) - :param ref_name: name of the reference data series - :param pred_name: name of the investigated data series - :param season: season name to highlight if not empty - :param forecast_path: path to save the plot file - :param plot_name_affix: name to specify this plot (e.g. 'cali-ref', default: '') - :param units: units of the forecasted values (default: ppb) + This class creates cond.quantile plots as originally proposed by Murphy, Brown and Chen (1989) [But in log scale] + + Link to paper: https://journals.ametsoc.org/doi/pdf/10.1175/1520-0434%281989%29004%3C0485%3ADVOTF%3E2.0.CO%3B2 """ - time = TimeTracking() - logging.debug(f"started plot_conditional_quantiles()") # ignore warnings if nans appear in quantile grouping warnings.filterwarnings("ignore", message="All-NaN slice encountered") # ignore warnings if mean is calculated on nans @@ -225,115 +208,242 @@ def plot_conditional_quantiles(stations: list, plot_folder: str = ".", rolling_w # ignore warnings for y tick = 0 on log scale (instead of 0.00001 or similar) warnings.filterwarnings("ignore", message="Attempted to set non-positive bottom ylim on a log-scaled axis.") - def load_data(): + def __init__(self, stations: List, data_pred_path: str, plot_folder: str = ".", plot_per_seasons=True, + rolling_window: int = 3, model_mame: str = "CNN", obs_name: str = "obs", **kwargs): + """ + + :param stations: all stations to plot + :param data_pred_path: path to dir which contains the forecasts as .nc files + :param plot_folder: path where the plots are stored + :param plot_per_seasons: if `True' create cond. quantile plots for seasons (DJF, MAM, JJA, SON) individually + :param rolling_window: smoothing of quantiles (3 is used by Murphy et al.) + :param model_mame: name of the model prediction as stored in netCDF file (for example "CNN") + :param obs_name: name of observation as stored in netCDF file (for example "obs") + :param kwargs: Some further arguments which are listed in self._opts + """ + super().__init__(plot_folder, "conditional_quantiles") + + self._data_pred_path = data_pred_path + self._stations = stations + self._rolling_window = rolling_window + self._model_name = model_mame + self._obs_name = obs_name + + self._opts = {"q": kwargs.get("q", [.1, .25, .5, .75, .9]), + "linetype": kwargs.get("linetype", [':', '-.', '--', '-.', ':']), + "legend": kwargs.get("legend", + ['.10th and .90th quantile', '.25th and .75th quantile', '.50th quantile', + 'reference 1:1']), + "data_unit": kwargs.get("data_unit", "ppb"), + } + if plot_per_seasons is True: + self.seasons = ['DJF', 'MAM', 'JJA', 'SON'] + else: + self.seasons = "" + self._data = self._load_data() + self._bins = self._get_bins_from_rage_of_data() + + self._plot() + + def _load_data(self): + """ + This method loads forcast data + + :return: + """ logging.debug("... load data") data_collector = [] - for station in stations: - file = os.path.join(forecast_path, f"forecasts_{station}_test.nc") + for station in self._stations: + file = os.path.join(self._data_pred_path, f"forecasts_{station}_test.nc") data_tmp = xr.open_dataarray(file) - data_collector.append(data_tmp.loc[:, :, ['CNN', 'obs', 'OLS']].assign_coords(station=station)) - return xr.concat(data_collector, dim='station').transpose('index', 'type', 'ahead', 'station') + data_collector.append(data_tmp.loc[:, :, [self._model_name, self._obs_name]].assign_coords(station=station)) + res = xr.concat(data_collector, dim='station').transpose('index', 'type', 'ahead', 'station') + return res + + def _segment_data(self, data, x_model): + """ + This method creates segmented data which is used for cond. quantile plots - def segment_data(data): + :param data: + :param x_model: + :return: + """ logging.debug("... segment data") # combine index and station to multi index data = data.stack(z=['index', 'station']) # replace multi index by simple position index (order is not relevant anymore) data.coords['z'] = range(len(data.coords['z'])) - # segment data of pred_name into bins - data.loc[pred_name, ...] = data.loc[pred_name, ...].to_pandas().T.apply(pd.cut, bins=bins, - labels=bins[1:]).T.values + # segment data of x_model into bins + data.loc[x_model, ...] = data.loc[x_model, ...].to_pandas().T.apply(pd.cut, bins=self._bins, + labels=self._bins[1:]).T.values return data - def create_quantile_panel(data, q): + @staticmethod + def _labels(plot_type, data_unit="ppb"): + """ + Helper method to correctly assign (x,y) labels to plots, depending on like-base or cali-ref factorization + + :param plot_type: + :param data_unit: + :return: + """ + names = (f"forecast concentration (in {data_unit})", f"observed concentration (in {data_unit})") + if plot_type == "obs": + return names + else: + return names[::-1] + + def _get_bins_from_rage_of_data(self): + """ + Get array of bins to use for quantiles + + :return: + """ + return np.arange(0, math.ceil(self._data.max().max()) + 1, 1).astype(int) + + def _create_quantile_panel(self, data, x_model, y_model): + """ + Clculate quantiles + + :param data: + :param x_model: + :param y_model: + :return: + """ logging.debug("... create quantile panel") # create empty xarray with dims: time steps ahead, quantiles, bin index (numbers create in previous step) - quantile_panel = xr.DataArray(np.full([data.ahead.shape[0], len(q), bins[1:].shape[0]], np.nan), - coords=[data.ahead, q, bins[1:]], dims=['ahead', 'quantiles', 'categories']) + quantile_panel = xr.DataArray( + np.full([data.ahead.shape[0], len(self._opts["q"]), self._bins[1:].shape[0]], np.nan), + coords=[data.ahead, self._opts["q"], self._bins[1:]], dims=['ahead', 'quantiles', 'categories']) # ensure that the coordinates are in the right order quantile_panel = quantile_panel.transpose('ahead', 'quantiles', 'categories') # calculate for each bin of the pred_name data the quantiles of the ref_name data - for bin in bins[1:]: - mask = (data.loc[pred_name, ...] == bin) - quantile_panel.loc[..., bin] = data.loc[ref_name, ...].where(mask).quantile(q, dim=['z']).T - + for bin in self._bins[1:]: + mask = (data.loc[x_model, ...] == bin) + quantile_panel.loc[..., bin] = data.loc[y_model, ...].where(mask).quantile(self._opts["q"], + dim=['z']).T return quantile_panel - def labels(plot_type, data_unit="ppb"): - names = (f"forecast concentration (in {data_unit})", f"observed concentration (in {data_unit})") - if plot_type == "obs": - return names - else: - return names[::-1] + @staticmethod + def add_affix(x): + """ + Helper method to add additional information on plot name + + :param x: + :return: + """ + return f"_{x}" if len(x) > 0 else "" + + def _prepare_plots(self, data, x_model, y_model): + """ + Get segmented_data and quantile_panel + + :param data: + :param x_model: + :param y_model: + :return: + """ + segmented_data = self._segment_data(data, x_model) + quantile_panel = self._create_quantile_panel(segmented_data, x_model, y_model) + return segmented_data, quantile_panel + + def _plot(self): + """ + Main plot call + + :return: + """ + logging.info(f"start plotting {self.__class__.__name__}, scheduled number of plots: {(len(self.seasons) + 1) * 2}") + + if len(self.seasons) > 0: + self._plot_seasons() + self._plot_all() + + def _plot_seasons(self): + """ + Seasonal plot call + + :return: + """ + for season in self.seasons: + self._plot_base(data=self._data.where(self._data['index.season'] == season), x_model=self._model_name, + y_model=self._obs_name, plot_name_affix="cali-ref", season=season) + self._plot_base(data=self._data.where(self._data['index.season'] == season), x_model=self._obs_name, + y_model=self._model_name, plot_name_affix="like-base", season=season) + + def _plot_all(self): + """ + Full plot call + + :return: + """ + self._plot_base(data=self._data, x_model=self._model_name, y_model=self._obs_name, plot_name_affix="cali-ref") + self._plot_base(data=self._data, x_model=self._obs_name, y_model=self._model_name, plot_name_affix="like-base") + + @TimeTrackingWrapper + def _plot_base(self, data, x_model, y_model, plot_name_affix, season=""): + """ + Base method to create cond. quantile plots. Is called from _plot_all and _plot_seasonal + + :param data: data which is used to create cond. quantile plot + :param x_model: name of model on x axis (can also be obs) + :param y_model: name of model on y axis (can also be obs) + :param plot_name_affix: should be `cali-ref' or `like-base' + :param season: List of seasons to use + :return: + """ + + segmented_data, quantile_panel = self._prepare_plots(data, x_model, y_model) + ylabel, xlabel = self._labels(x_model, self._opts["data_unit"]) + plot_name = f"{self.plot_name}{self.add_affix(season)}{self.add_affix(plot_name_affix)}_plot.pdf" + #f"{base_name}{add_affix(season)}{add_affix(plot_name_affix)}_plot.pdf" + plot_path = os.path.join(os.path.abspath(self.plot_folder), plot_name) + pdf_pages = matplotlib.backends.backend_pdf.PdfPages(plot_path) + logging.debug(f"... plot path is {plot_path}") + + # create plot for each time step ahead + y2_max = 0 + for iteration, d in enumerate(segmented_data.ahead): + logging.debug(f"... plotting {d.values} time step(s) ahead") + # plot smoothed lines with rolling mean + smooth_data = quantile_panel.loc[d, ...].rolling(categories=self._rolling_window, + center=True).mean().to_pandas().T + ax = smooth_data.plot(style=self._opts["linetype"], color='black', legend=False) + ax2 = ax.twinx() + # add reference line + ax.plot([0, self._bins.max()], [0, self._bins.max()], color='k', label='reference 1:1', linewidth=.8) + # add histogram of the segmented data (pred_name) + handles, labels = ax.get_legend_handles_labels() + segmented_data.loc[x_model, d, :].to_pandas().hist(bins=self._bins, ax=ax2, color='k', alpha=.3, grid=False, + rwidth=1) + # add legend + plt.legend(handles[:3] + [handles[-1]], self._opts["legend"], loc='upper left', fontsize='large') + # adjust limits and set labels + ax.set(xlim=(0, self._bins.max()), ylim=(0, self._bins.max())) + ax.set_xlabel(xlabel, fontsize='x-large') + ax.tick_params(axis='x', which='major', labelsize=15) + ax.set_ylabel(ylabel, fontsize='x-large') + ax.tick_params(axis='y', which='major', labelsize=15) + ax2.yaxis.label.set_color('gray') + ax2.tick_params(axis='y', colors='gray') + ax2.yaxis.labelpad = -15 + ax2.set_yscale('log') + if iteration == 0: + y2_max = ax2.get_ylim()[1] + 100 + ax2.set(ylim=(0, y2_max * 10 ** 8), yticks=np.logspace(0, 4, 5)) + ax2.set_ylabel(' sample size', fontsize='x-large') + ax2.tick_params(axis='y', which='major', labelsize=15) + # set title and save current figure + title = f"{d.values} time step(s) ahead{f' ({season})' if len(season) > 0 else ''}" + plt.title(title) + pdf_pages.savefig() + # close all open figures / plots + pdf_pages.close() + plt.close('all') + - xlabel, ylabel = labels(ref_name, units) - - opts = {"q": [.1, .25, .5, .75, .9], "linetype": [':', '-.', '--', '-.', ':'], - "legend": ['.10th and .90th quantile', '.25th and .75th quantile', '.50th quantile', 'reference 1:1'], - "xlabel": xlabel, "ylabel": ylabel} - - # set name and path of the plot - base_name = "conditional_quantiles" - def add_affix(x): return f"_{x}" if len(x) > 0 else "" - plot_name = f"{base_name}{add_affix(season)}{add_affix(plot_name_affix)}_plot.pdf" - plot_path = os.path.join(os.path.abspath(plot_folder), plot_name) - - # check forecast path - if forecast_path is None: - raise ValueError("Forecast path is not given but required.") - - # load data and set data bins - orig_data = load_data() - bins = np.arange(0, math.ceil(orig_data.max().max()) + 1, 1).astype(int) - segmented_data = segment_data(orig_data) - quantile_panel = create_quantile_panel(segmented_data, q=opts["q"]) - - # init pdf output - pdf_pages = matplotlib.backends.backend_pdf.PdfPages(plot_path) - logging.debug(f"... plot path is {plot_path}") - - # create plot for each time step ahead - y2_max = 0 - for iteration, d in enumerate(segmented_data.ahead): - logging.debug(f"... plotting {d.values} time step(s) ahead") - # plot smoothed lines with rolling mean - smooth_data = quantile_panel.loc[d, ...].rolling(categories=rolling_window, center=True).mean().to_pandas().T - ax = smooth_data.plot(style=opts["linetype"], color='black', legend=False) - ax2 = ax.twinx() - # add reference line - ax.plot([0, bins.max()], [0, bins.max()], color='k', label='reference 1:1', linewidth=.8) - # add histogram of the segmented data (pred_name) - handles, labels = ax.get_legend_handles_labels() - segmented_data.loc[pred_name, d, :].to_pandas().hist(bins=bins, ax=ax2, color='k', alpha=.3, grid=False, - rwidth=1) - # add legend - plt.legend(handles[:3] + [handles[-1]], opts["legend"], loc='upper left', fontsize='large') - # adjust limits and set labels - ax.set(xlim=(0, bins.max()), ylim=(0, bins.max())) - ax.set_xlabel(opts["xlabel"], fontsize='x-large') - ax.tick_params(axis='x', which='major', labelsize=15) - ax.set_ylabel(opts["ylabel"], fontsize='x-large') - ax.tick_params(axis='y', which='major', labelsize=15) - ax2.yaxis.label.set_color('gray') - ax2.tick_params(axis='y', colors='gray') - ax2.yaxis.labelpad = -15 - ax2.set_yscale('log') - if iteration == 0: - y2_max = ax2.get_ylim()[1] + 100 - ax2.set(ylim=(0, y2_max * 10 ** 8), yticks=np.logspace(0, 4, 5)) - ax2.set_ylabel(' sample size', fontsize='x-large') - ax2.tick_params(axis='y', which='major', labelsize=15) - # set title and save current figure - title = f"{d.values} time step(s) ahead{f' ({season})' if len(season) > 0 else ''}" - plt.title(title) - pdf_pages.savefig() - # close all open figures / plots - pdf_pages.close() - plt.close('all') - logging.info(f"plot_conditional_quantiles() finished after {time}") - - -class PlotClimatologicalSkillScore(RunEnvironment): +@TimeTrackingWrapper +class PlotClimatologicalSkillScore(AbstractPlotClass): """ Create plot of climatological skill score after Murphy (1988) as box plot over all stations. A forecast time step (called "ahead") is separately shown to highlight the differences for each prediction time step. Either each single @@ -351,10 +461,11 @@ class PlotClimatologicalSkillScore(RunEnvironment): :param extra_name_tag: additional tag that can be included in the plot name (default "") :param model_setup: architecture type to specify plot name (default "CNN") """ - super().__init__() + super().__init__(plot_folder, f"skill_score_clim_{extra_name_tag}{model_setup}") self._labels = None self._data = self._prepare_data(data, score_only) - self._plot(plot_folder, score_only, extra_name_tag, model_setup) + self._plot(score_only) + self._save() def _prepare_data(self, data: Dict, score_only: bool) -> pd.DataFrame: """ @@ -378,13 +489,10 @@ class PlotClimatologicalSkillScore(RunEnvironment): """ return "" if score_only else "terms and " - def _plot(self, plot_folder, score_only, extra_name_tag, model_setup): + def _plot(self, score_only): """ Main plot function to plot climatological skill score. - :param plot_folder: path to save the plot :param score_only: if true plot only scores of CASE I to IV, otherwise plot all single terms - :param extra_name_tag: additional tag that can be included in the plot name - :param model_setup: architecture type to specify plot name """ fig, ax = plt.subplots() if not score_only: @@ -396,24 +504,10 @@ class PlotClimatologicalSkillScore(RunEnvironment): handles, _ = ax.get_legend_handles_labels() ax.legend(handles, self._labels) plt.tight_layout() - self._save(plot_folder, extra_name_tag, model_setup) - @staticmethod - def _save(plot_folder, extra_name_tag, model_setup): - """ - Standard save method to store plot locally. The name of this plot is dynamic. It includes the model setup like - 'CNN' and can additionally be adjusted using an extra name tag. - :param plot_folder: path to save the plot - :param extra_name_tag: additional tag that can be included in the plot name - :param model_setup: architecture type to specify plot name - """ - plot_name = os.path.join(plot_folder, f"skill_score_clim_{extra_name_tag}{model_setup}.pdf") - logging.debug(f"... save plot to {plot_name}") - plt.savefig(plot_name, dpi=500) - plt.close('all') - -class PlotCompetitiveSkillScore(RunEnvironment): +@TimeTrackingWrapper +class PlotCompetitiveSkillScore(AbstractPlotClass): """ Create competitive skill score for the given model setup and the reference models ordinary least squared ("ols") and the persistence forecast ("persi") for all lead times ("ahead"). The plot is saved under plot_folder with the name @@ -426,10 +520,11 @@ class PlotCompetitiveSkillScore(RunEnvironment): :param plot_folder: path to save the plot (default: current directory) :param model_setup: architecture type (default "CNN") """ - super().__init__() + super().__init__(plot_folder, f"skill_score_competitive_{model_setup}") self._labels = None self._data = self._prepare_data(data) - self._plot(plot_folder, model_setup) + self._plot() + self._save() def _prepare_data(self, data: pd.DataFrame) -> pd.DataFrame: """ @@ -446,12 +541,9 @@ class PlotCompetitiveSkillScore(RunEnvironment): self._labels = [str(i) + "d" for i in data.index.levels[1].values] return data.stack(level=0).reset_index(level=2, drop=True).reset_index(name="data") - def _plot(self, plot_folder, model_setup): + def _plot(self): """ Main plot function to plot skill scores of the comparisons cnn-persi, ols-persi and cnn-ols. - :param plot_folder: path to save the plot - :param model_setup: - :return: architecture type to specify plot name """ fig, ax = plt.subplots() sns.boxplot(x="comparison", y="data", hue="ahead", data=self._data, whis=1., ax=ax, palette="Blues_d", @@ -463,7 +555,6 @@ class PlotCompetitiveSkillScore(RunEnvironment): handles, _ = ax.get_legend_handles_labels() ax.legend(handles, self._labels) plt.tight_layout() - self._save(plot_folder, model_setup) def _ylim(self) -> Tuple[float, float]: """ @@ -475,20 +566,9 @@ class PlotCompetitiveSkillScore(RunEnvironment): upper = helpers.float_round(self._data.max()[2], 2) + 0.1 return lower, upper - @staticmethod - def _save(plot_folder, model_setup): - """ - Standard save method to store plot locally. The name of this plot is dynamic by including the model setup. - :param plot_folder: path to save the plot - :param model_setup: architecture type to specify plot name - """ - plot_name = os.path.join(plot_folder, f"skill_score_competitive_{model_setup}.pdf") - logging.debug(f"... save plot to {plot_name}") - plt.savefig(plot_name, dpi=500) - plt.close() - -class PlotBootstrapSkillScore(RunEnvironment): +@TimeTrackingWrapper +class PlotBootstrapSkillScore(AbstractPlotClass): """ Create plot of climatological skill score after Murphy (1988) as box plot over all stations. A forecast time step (called "ahead") is separately shown to highlight the differences for each prediction time step. Either each single @@ -504,11 +584,12 @@ class PlotBootstrapSkillScore(RunEnvironment): :param plot_folder: path to save the plot (default: current directory) :param model_setup: architecture type to specify plot name (default "CNN") """ - super().__init__() + super().__init__(plot_folder, f"skill_score_bootstrap_{model_setup}") self._labels = None self._x_name = "boot_var" self._data = self._prepare_data(data) - self._plot(plot_folder, model_setup) + self._plot() + self._save() def _prepare_data(self, data: Dict) -> pd.DataFrame: """ @@ -529,11 +610,9 @@ class PlotBootstrapSkillScore(RunEnvironment): """ return "" if score_only else "terms and " - def _plot(self, plot_folder, model_setup): + def _plot(self): """ Main plot function to plot climatological skill score. - :param plot_folder: path to save the plot - :param model_setup: architecture type to specify plot name """ fig, ax = plt.subplots() sns.boxplot(x=self._x_name, y="data", hue="ahead", data=self._data, ax=ax, whis=1., palette="Blues_d", @@ -543,27 +622,13 @@ class PlotBootstrapSkillScore(RunEnvironment): handles, _ = ax.get_legend_handles_labels() ax.legend(handles, self._labels) plt.tight_layout() - self._save(plot_folder, model_setup) - - @staticmethod - def _save(plot_folder, model_setup): - """ - Standard save method to store plot locally. The name of this plot is dynamic. It includes the model setup like - 'CNN' and can additionally be adjusted using an extra name tag. - :param plot_folder: path to save the plot - :param model_setup: architecture type to specify plot name - """ - plot_name = os.path.join(plot_folder, f"skill_score_bootstrap_{model_setup}.pdf") - logging.debug(f"... save plot to {plot_name}") - plt.savefig(plot_name, dpi=500) - plt.close('all') -class PlotTimeSeries(RunEnvironment): +@TimeTrackingWrapper +class PlotTimeSeries: def __init__(self, stations: List, data_path: str, name: str, window_lead_time: int = None, plot_folder: str = ".", sampling="daily"): - super().__init__() self._data_path = data_path self._data_name = name self._stations = stations @@ -675,3 +740,111 @@ class PlotTimeSeries(RunEnvironment): plot_name = os.path.join(os.path.abspath(plot_folder), 'timeseries_plot.pdf') logging.debug(f"... save plot to {plot_name}") return matplotlib.backends.backend_pdf.PdfPages(plot_name) + + +@TimeTrackingWrapper +class PlotAvailability(AbstractPlotClass): + + def __init__(self, generators: Dict[str, DataGenerator], plot_folder: str = ".", sampling="daily", + summary_name="data availability"): + # create standard Gantt plot for all stations (currently in single pdf file with single page) + super().__init__(plot_folder, "data_availability") + self.sampling = self._get_sampling(sampling) + plot_dict = self._prepare_data(generators) + lgd = self._plot(plot_dict) + self._save(bbox_extra_artists=(lgd, ), bbox_inches="tight") + # create summary Gantt plot (is data in at least one station available) + self.plot_name += "_summary" + plot_dict_summary = self._summarise_data(generators, summary_name) + lgd = self._plot(plot_dict_summary) + self._save(bbox_extra_artists=(lgd, ), bbox_inches="tight") + # combination of station and summary plot, last element is summary broken bar + self.plot_name = "data_availability_combined" + plot_dict_summary.update(plot_dict) + lgd = self._plot(plot_dict_summary) + self._save(bbox_extra_artists=(lgd, ), bbox_inches="tight") + + @staticmethod + def _get_sampling(sampling): + if sampling == "daily": + return "D" + elif sampling == "hourly": + return "h" + + def _prepare_data(self, generators: Dict[str, DataGenerator]): + plt_dict = {} + for subset, generator in generators.items(): + stations = generator.stations + for station in stations: + station_data = generator.get_data_generator(station) + labels = station_data.get_transposed_label().resample(datetime=self.sampling, skipna=True).mean() + labels_bool = labels.sel(window=1).notnull() + group = (labels_bool != labels_bool.shift(datetime=1)).cumsum() + plot_data = pd.DataFrame({"avail": labels_bool.values, "group": group.values}, index=labels.datetime.values) + t = plot_data.groupby("group").apply(lambda x: (x["avail"].head(1)[0], x.index[0], x.shape[0])) + t2 = [i[1:] for i in t if i[0]] + + if plt_dict.get(station) is None: + plt_dict[station] = {subset: t2} + else: + plt_dict[station].update({subset: t2}) + return plt_dict + + def _summarise_data(self, generators: Dict[str, DataGenerator], summary_name: str): + plt_dict = {} + for subset, generator in generators.items(): + all_data = None + stations = generator.stations + for station in stations: + station_data = generator.get_data_generator(station) + labels = station_data.get_transposed_label().resample(datetime=self.sampling, skipna=True).mean() + labels_bool = labels.sel(window=1).notnull() + if all_data is None: + all_data = labels_bool + else: + tmp = all_data.combine_first(labels_bool) # expand dims to merged datetime coords + all_data = np.logical_or(tmp, labels_bool).combine_first(all_data) # apply logical on merge and fill missing with all_data + + group = (all_data != all_data.shift(datetime=1)).cumsum() + plot_data = pd.DataFrame({"avail": all_data.values, "group": group.values}, index=all_data.datetime.values) + t = plot_data.groupby("group").apply(lambda x: (x["avail"].head(1)[0], x.index[0], x.shape[0])) + t2 = [i[1:] for i in t if i[0]] + if plt_dict.get(summary_name) is None: + plt_dict[summary_name] = {subset: t2} + else: + plt_dict[summary_name].update({subset: t2}) + return plt_dict + + def _plot(self, plt_dict): + # colors = {"train": "orange", "val": "blueishgreen", "test": "skyblue"} # color names + colors = {"train": "#e69f00", "val": "#009e73", "test": "#56b4e9"} # hex code + # colors = {"train": (230, 159, 0), "val": (0, 158, 115), "test": (86, 180, 233)} # in rgb but as abs values + pos = 0 + height = 0.8 # should be <= 1 + yticklabels = [] + number_of_stations = len(plt_dict.keys()) + fig, ax = plt.subplots(figsize=(10, number_of_stations/3)) + for station, d in sorted(plt_dict.items(), reverse=True): + pos += 1 + for subset, color in colors.items(): + plt_data = d.get(subset) + if plt_data is None: + continue + ax.broken_barh(plt_data, (pos, height), color=color, edgecolor="white") + yticklabels.append(station) + + ax.set_ylim([height, number_of_stations + 1]) + ax.set_yticks(np.arange(len(plt_dict.keys()))+1+height/2) + ax.set_yticklabels(yticklabels) + handles = [mpatches.Patch(color=c, label=k) for k, c in colors.items()] + lgd = plt.legend(handles=handles, bbox_to_anchor=(0, 1, 1, 0.2), loc="lower center", ncol=len(handles)) + return lgd + + +if __name__ == "__main__": + stations = ['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087'] + path = "../../testrun_network/forecasts" + plt_path = "../../" + + con_quan_cls = PlotConditionalQuantiles(stations, path, plt_path) + diff --git a/src/run_modules/experiment_setup.py b/src/run_modules/experiment_setup.py index ac6806ebb80aafc179048b21bae323e035cc4b62..278a481f80038a49829c00e99a156c12f461b985 100644 --- a/src/run_modules/experiment_setup.py +++ b/src/run_modules/experiment_setup.py @@ -23,9 +23,10 @@ DEFAULT_VAR_ALL_DICT = {'o3': 'dma8eu', 'relhum': 'average_values', 'temp': 'max 'pblheight': 'maximum'} DEFAULT_TRANSFORMATION = {"scope": "data", "method": "standardise", "mean": "estimate"} DEFAULT_PLOT_LIST = ["PlotMonthlySummary", "PlotStationMap", "PlotClimatologicalSkillScore", "PlotTimeSeries", - "PlotCompetitiveSkillScore", "PlotBootstrapSkillScore", "plot_conditional_quantiles"] -DEFAULT_HPC_LOGIN_LIST = ["ju"] -DEFAULT_HPC_HOST_LIST = ["jw", "jr"] #first part of node names for Juwels (jw[comp], ju[login]) and Jureca(jr). + "PlotCompetitiveSkillScore", "PlotBootstrapSkillScore", "PlotConditionalQuantiles", + "PlotAvailability"] +DEFAULT_HPC_LOGIN_LIST = ["ju"] +DEFAULT_HPC_HOST_LIST = ["jw", "jr"] # first part of node names for Juwels (jw[comp], ju[login]) and Jureca(jr). class ExperimentSetup(RunEnvironment): diff --git a/src/run_modules/post_processing.py b/src/run_modules/post_processing.py index f6f2dece85ac62f8e1739cc5068b3f0041dd4c03..b0bce5e69336dc8c48106315ace6ee5ca1a6b2b9 100644 --- a/src/run_modules/post_processing.py +++ b/src/run_modules/post_processing.py @@ -20,8 +20,8 @@ from src.helpers import TimeTracking from src.model_modules.linear_model import OrdinaryLeastSquaredModel from src.model_modules.model_class import AbstractModelClass from src.plotting.postprocessing_plotting import PlotMonthlySummary, PlotStationMap, PlotClimatologicalSkillScore, \ - PlotCompetitiveSkillScore, PlotTimeSeries, PlotBootstrapSkillScore -from src.plotting.postprocessing_plotting import plot_conditional_quantiles + PlotCompetitiveSkillScore, PlotTimeSeries, PlotBootstrapSkillScore, PlotAvailability, PlotConditionalQuantiles +# from src.plotting.postprocessing_plotting import plot_conditional_quantiles from src.run_modules.run_environment import RunEnvironment from typing import Dict @@ -37,6 +37,7 @@ class PostProcessing(RunEnvironment): self.test_data: DataGenerator = self.data_store.get("generator", "test") self.test_data_distributed = Distributor(self.test_data, self.model, self.batch_size) self.train_data: DataGenerator = self.data_store.get("generator", "train") + self.val_data: DataGenerator = self.data_store.get("generator", "val") self.train_val_data: DataGenerator = self.data_store.get("generator", "train_val") self.plot_path: str = self.data_store.get("plot_path") self.target_var = self.data_store.get("target_var") @@ -194,13 +195,15 @@ class PostProcessing(RunEnvironment): if self.bootstrap_skill_scores is not None and "PlotBootstrapSkillScore" in plot_list: PlotBootstrapSkillScore(self.bootstrap_skill_scores, plot_folder=self.plot_path, model_setup="CNN") - if "plot_conditional_quantiles" in plot_list: - plot_conditional_quantiles(self.test_data.stations, pred_name="CNN", ref_name="obs", - forecast_path=path, plot_name_affix="cali-ref", plot_folder=self.plot_path) - plot_conditional_quantiles(self.test_data.stations, pred_name="obs", ref_name="CNN", - forecast_path=path, plot_name_affix="like-bas", plot_folder=self.plot_path) - if ("PlotStationMap" in plot_list):# and (not self.data_store.get("hostname")[:2] == "jw"): - PlotStationMap(generators={'b': self.test_data}, plot_folder=self.plot_path) + + if "PlotConditionalQuantiles" in plot_list: + PlotConditionalQuantiles(self.test_data.stations, data_pred_path=path, plot_folder=self.plot_path) + if "PlotStationMap" in plot_list: + if self.data_store.get("hostname")[:2] in self.data_store.get("hpc_hosts"): + logging.warning( + f"Skip 'PlotStationMap` because running on a hpc node: {self.data_store.get('hostname')}") + else: + PlotStationMap(generators={'b': self.test_data}, plot_folder=self.plot_path) if "PlotMonthlySummary" in plot_list: PlotMonthlySummary(self.test_data.stations, path, r"forecasts_%s_test.nc", self.target_var, plot_folder=self.plot_path) @@ -213,6 +216,9 @@ class PostProcessing(RunEnvironment): if "PlotTimeSeries" in plot_list: PlotTimeSeries(self.test_data.stations, path, r"forecasts_%s_test.nc", plot_folder=self.plot_path, sampling=self._sampling) + if "PlotAvailability" in plot_list: + avail_data = {"train": self.train_data, "val": self.val_data, "test": self.test_data} + PlotAvailability(avail_data, plot_folder=self.plot_path) def calculate_test_score(self): test_score = self.model.evaluate_generator(generator=self.test_data_distributed.distribute_on_batches(), diff --git a/src/run_modules/pre_processing.py b/src/run_modules/pre_processing.py index b5de28b3c21d83ea00e4319deb34b0a43d41811c..551ea599a3114b7b97f5bcb146cf6e131e324eb5 100644 --- a/src/run_modules/pre_processing.py +++ b/src/run_modules/pre_processing.py @@ -3,10 +3,14 @@ __date__ = '2019-11-25' import logging +import os from typing import Tuple, Dict, List +import numpy as np +import pandas as pd + from src.data_handling.data_generator import DataGenerator -from src.helpers import TimeTracking +from src.helpers import TimeTracking, check_path_and_create from src.join import EmptyQueryResult from src.run_modules.run_environment import RunEnvironment @@ -54,6 +58,58 @@ class PreProcessing(RunEnvironment): logging.debug(f"Number of test stations: {n_test}") logging.debug(f"TEST SHAPE OF GENERATOR CALL: {self.data_store.get('generator', 'test')[0][0].shape}" f"{self.data_store.get('generator', 'test')[0][1].shape}") + self.create_latex_report() + + def create_latex_report(self): + """ + This function creates tables with information on the station meta data and a summary on subset sample sizes. + + * station_sample_size.md: see table below + * station_sample_size.tex: same as table below, but as latex table + * station_sample_size_short.tex: reduced size table without any meta data besides station ID, as latex table + + All tables are stored inside experiment_path inside the folder latex_report. The table format (e.g. which meta + data is highlighted) is currently hardcoded to have a stable table style. If further styles are needed, it is + better to add an additional style than modifying the existing table styles. + + | stat. ID | station_name | station_lon | station_lat | station_alt | train | val | test | + |------------|-------------------------------------------|---------------|---------------|---------------|---------|-------|--------| + | DEBW013 | Stuttgart Bad Cannstatt | 9.2297 | 48.8088 | 235 | 1434 | 712 | 1080 | + | DEBW076 | Baden-Baden | 8.2202 | 48.7731 | 148 | 3037 | 722 | 710 | + | DEBW087 | Schwäbische_Alb | 9.2076 | 48.3458 | 798 | 3044 | 714 | 1087 | + | DEBW107 | Tübingen | 9.0512 | 48.5077 | 325 | 1803 | 715 | 1087 | + | DEBY081 | Garmisch-Partenkirchen/Kreuzeckbahnstraße | 11.0631 | 47.4764 | 735 | 2935 | 525 | 714 | + | # Stations | nan | nan | nan | nan | 6 | 6 | 6 | + | # Samples | nan | nan | nan | nan | 12253 | 3388 | 4678 | + + """ + meta_data = ['station_name', 'station_lon', 'station_lat', 'station_alt'] + meta_round = ["station_lon", "station_lat", "station_alt"] + precision = 4 + path = os.path.join(self.data_store.get("experiment_path"), "latex_report") + check_path_and_create(path) + set_names = ["train", "val", "test"] + df = pd.DataFrame(columns=meta_data+set_names) + for set_name in set_names: + data: DataGenerator = self.data_store.get("generator", set_name) + for station in data.stations: + df.loc[station, set_name] = data.get_data_generator(station).get_transposed_label().shape[0] + if df.loc[station, meta_data].isnull().any(): + df.loc[station, meta_data] = data.get_data_generator(station).meta.loc[meta_data].values.flatten() + df.loc["# Samples", set_name] = df.loc[:, set_name].sum() + df.loc["# Stations", set_name] = df.loc[:, set_name].count() + df[meta_round] = df[meta_round].astype(float).round(precision) + df.sort_index(inplace=True) + df = df.reindex(df.index.drop(["# Stations", "# Samples"]).to_list() + ["# Stations", "# Samples"], ) + df.index.name = 'stat. ID' + column_format = np.repeat('c', df.shape[1]+1) + column_format[0] = 'l' + column_format[-1] = 'r' + column_format = ''.join(column_format.tolist()) + df.to_latex(os.path.join(path, "station_sample_size.tex"), na_rep='---', column_format=column_format) + df.to_markdown(open(os.path.join(path, "station_sample_size.md"), mode="w", encoding='utf-8'), tablefmt="github") + df.drop(meta_data, axis=1).to_latex(os.path.join(path, "station_sample_size_short.tex"), na_rep='---', + column_format=column_format) def split_train_val_test(self) -> None: """ diff --git a/test/test_helpers.py b/test/test_helpers.py index 6121254bdaa3ad64dd98939825eb4ced515ccb4e..0065a94b7b18d88c2e86e60df5633d47ba15f42a 100644 --- a/test/test_helpers.py +++ b/test/test_helpers.py @@ -144,7 +144,7 @@ class TestGetHost: class TestPrepareHost: @mock.patch("socket.gethostname", side_effect=["linux-aa9b", "ZAM144", "zam347", "jrtest", "jwtest", - "runner-6HmDp9Qd-project-2411-concurrent"]) + "runner-6HmDp9Qd-project-2411-concurrent-01"]) @mock.patch("getpass.getuser", return_value="testUser") @mock.patch("os.path.exists", return_value=True) def test_prepare_host(self, mock_host, mock_user, mock_path): @@ -176,23 +176,6 @@ class TestPrepareHost: # assert "does not exist for host 'linux-aa9b'" in e.value.args[0] assert PyTestRegex(r"path '.*' does not exist for host '.*'\.") == e.value.args[0] -# @mock.patch("socket.gethostname", side_effect=["linux-aa9b", "ZAM144", "zam347", "jrtest", "jwtest", -# "runner-6HmDp9Qd-project-2411-concurrent"]) -# @mock.patch("os.getlogin", side_effect=OSError) -# @mock.patch("os.path.exists", return_value=True) -# def test_os_error(self, mock_path, mock_user, mock_host): -# path = prepare_host() -# assert path == "/home/default/machinelearningtools/data/toar_daily/" -# path = prepare_host() -# assert path == "/home/default/Data/toar_daily/" -# path = prepare_host() -# assert path == "/home/default/Data/toar_daily/" -# path = prepare_host() -# assert path == "/p/project/cjjsc42/default/DATA/toar_daily/" -# path = prepare_host() -# assert path == "/p/home/jusers/default/juwels/intelliaq/DATA/toar_daily/" -# path = prepare_host() -# assert path == '/home/default/machinelearningtools/data/toar_daily/' @mock.patch("socket.gethostname", side_effect=["linux-aa9b"]) @mock.patch("getpass.getuser", return_value="testUser")