diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 4a59b5b91edbe7a918a80884cf9e38a5d70a8826..f97ad5b3258a86811966d2cf58e0fe905c4b12fb 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -41,12 +41,16 @@ tests (from scratch): before_script: - chmod +x ./CI/update_badge.sh - ./CI/update_badge.sh > /dev/null + - source /opt/venv/bin/activate script: - pip install --upgrade pip - - pip install numpy wheel six==1.15.0 - - zypper --non-interactive install binutils libproj-devel gdal-devel - - zypper --non-interactive install proj geos-devel - # - cat requirements.txt | cut -f1 -d"#" | sed '/^\s*$/d' | xargs -L 1 pip install + - zypper --no-gpg-checks addrepo https://download.opensuse.org/repositories/Application:Geo/15.4/Application:Geo.repo + - zypper --no-gpg-checks refresh + - zypper --no-gpg-checks --non-interactive install proj=8.2.1 + - zypper --no-gpg-checks --non-interactive install geos=3.11.0 + - zypper --no-gpg-checks --non-interactive install geos-devel=3.9.1 + - zypper --no-gpg-checks --non-interactive install libproj22=8.2.1 + - zypper --no-gpg-checks --non-interactive install binutils libproj-devel gdal-devel - pip install -r requirements.txt - chmod +x ./CI/run_pytest.sh - ./CI/run_pytest.sh @@ -60,34 +64,6 @@ tests (from scratch): - badges/ - test_results/ -### Tests (on GPU) ### -#tests (on GPU): -# tags: -# - gpu -# - zam347 -# stage: test -# only: -# - master -# - /^release.*$/ -# - develop -# variables: -# FAILURE_THRESHOLD: 100 -# TEST_TYPE: "gpu" -# before_script: -# - chmod +x ./CI/update_badge.sh -# - ./CI/update_badge.sh > /dev/null -# script: -# - pip install -r test/requirements_tf_skip.txt -# - chmod +x ./CI/run_pytest.sh -# - ./CI/run_pytest.sh -# after_script: -# - ./CI/update_badge.sh > /dev/null -# artifacts: -# name: pages -# when: always -# paths: -# - badges/ -# - test_results/ ### Tests ### tests: @@ -100,6 +76,7 @@ tests: before_script: - chmod +x ./CI/update_badge.sh - ./CI/update_badge.sh > /dev/null + - source /opt/venv/bin/activate script: - pip install -r requirements.txt - chmod +x ./CI/run_pytest.sh @@ -125,6 +102,7 @@ coverage: before_script: - chmod +x ./CI/update_badge.sh - ./CI/update_badge.sh > /dev/null + - source /opt/venv/bin/activate script: - pip install -r requirements.txt - chmod +x ./CI/run_pytest_coverage.sh @@ -148,13 +126,15 @@ sphinx docs: before_script: - chmod +x ./CI/update_badge.sh - ./CI/update_badge.sh > /dev/null + - source /opt/venv/bin/activate script: - pip install -r requirements.txt - pip install -r docs/requirements_docs.txt - chmod +x ./CI/create_documentation.sh - ./CI/create_documentation.sh after_script: - - ./CI/update_badge.sh > /dev/null + # - ./CI/update_badge.sh > /dev/null + - ./CI/update_badge.sh when: always artifacts: name: pages diff --git a/CHANGELOG.md b/CHANGELOG.md index 988e3e5a7863868cead1a2fec7c7b6d1c750b8d8..0418eed5422d00f288879f5e9d8128558118c401 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,30 @@ # Changelog All notable changes to this project will be documented in this file. 
+## v2.2.0 - 2022-08-16 - new data sources and python3.9 + +### general: +* new data sources: era5 data and ToarDB V2 +* CAMS competitor available +* improved execution speed +* MLAir is now updated to python3.9 + +### new features: +* new data loading method to load era5 data on Jülich systems (#393) +* new data loading method to load data from ToarDB V2 (#396) +* implemented competitor model using CAMS ensemble forecasts (#394) +* OLS competitor is only calculated if provided in competitor list (#404) +* experimental: snapshot creation to skip preprocessing stage (#346, #405, #406) +* new workflow HyperSearchWorkflow stopping after training stage (#408) + +### technical: +* fixed minor issues and improved execution speed in postprocessing (#401, #413) +* improved speed in keras iterator creation (#409) +* solved bug for very long competitor time series (#395) +* updated python, HPC and CI environment (#402, #403, #407, #410) +* fix for climateFIR data handler (#399) +* fix for report model error (#416) + ## v2.1.0 - 2022-06-07 - new evaluation metrics and improved training ### general: diff --git a/CI/Dockerfile b/CI/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..f3b99b2f8b78129d3fff4d49d88be54613bf5929 --- /dev/null +++ b/CI/Dockerfile @@ -0,0 +1,65 @@ +# ---- base node ---- +FROM opensuse/leap:latest AS base +MAINTAINER Lukas Leufen <l.leufen@fz-juelich.de> + +# install git +RUN zypper --non-interactive install git + +# install python3 +RUN zypper --non-interactive install python39 python39-devel + +# install pip +RUN zypper --non-interactive install python39-pip + +# upgrade pip +RUN pip3.9 install --upgrade pip + +# install curl +RUN zypper --non-interactive install curl + +# install make +RUN zypper --non-interactive install make + +# install gcc +RUN zypper --non-interactive install gcc-c++ + +# create and activate venv +ENV VIRTUAL_ENV=/opt/venv +RUN python3.9 -m venv $VIRTUAL_ENV +ENV PATH="$VIRTUAL_ENV/bin:$PATH" +# RUN source venv/bin/activate + +# ---- test node ---- +FROM base AS test + +# install pytest +RUN pip install pytest pytest-html pytest-lazy-fixture + +# ---- coverage node ---- +FROM test AS coverage + +# install pytest coverage +RUN pip install pytest-cov + +# ---- docs node ---- +FROM base AS docs + +# install sphinx +RUN pip install sphinx + +# ---- MLAir ready to use ---- +FROM base AS mlair + +# install geo packages +RUN zypper --no-gpg-checks addrepo https://download.opensuse.org/repositories/Application:Geo/15.4/Application:Geo.repo +RUN zypper --no-gpg-checks refresh +RUN zypper --no-gpg-checks --non-interactive install proj=8.2.1 +RUN zypper --no-gpg-checks --non-interactive install geos=3.10.3 +RUN zypper --no-gpg-checks --non-interactive install geos-devel=3.9.1 +RUN zypper --no-gpg-checks --non-interactive install libproj22=8.2.1 +RUN zypper --no-gpg-checks --non-interactive install binutils libproj-devel gdal-devel + +# install requirements +ADD requirements.txt . +RUN pip install -r requirements.txt + diff --git a/CI/run_pytest.sh b/CI/run_pytest.sh index baa7ef8e892fc2d9efdd30094917ca492017de3d..060569abac395c49d5a5fcda80a29726d5e9001a 100644 --- a/CI/run_pytest.sh +++ b/CI/run_pytest.sh @@ -1,7 +1,7 @@ #!/bin/bash # run pytest for all run_modules -python3.6 -m pytest --html=report.html --self-contained-html test/ | tee test_results.out +python -m pytest --html=report.html --self-contained-html test/ | tee test_results.out IS_FAILED=$? 
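For orientation, a minimal sketch of how the multi-stage image defined in `CI/Dockerfile` above could be built and checked locally. The stage names come from the Dockerfile itself; the `mlair-ci` tags are purely illustrative and not part of the CI configuration:

```bash
# build the individual stages of CI/Dockerfile under illustrative tags
docker build -f CI/Dockerfile --target test -t mlair-ci:test .
docker build -f CI/Dockerfile --target coverage -t mlair-ci:coverage .
docker build -f CI/Dockerfile --target docs -t mlair-ci:docs .
docker build -f CI/Dockerfile --target mlair -t mlair-ci:latest .

# the venv created in the base stage is already on PATH inside the image,
# which is why the CI jobs can simply `source /opt/venv/bin/activate`
docker run --rm mlair-ci:test python --version
```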
diff --git a/CI/run_pytest_coverage.sh b/CI/run_pytest_coverage.sh index 24d916b1a32da714abc2e5de0ac2b4c2790752a9..f6efaf2fb78223d3c41e7d7ba0a40c87befd3296 100644 --- a/CI/run_pytest_coverage.sh +++ b/CI/run_pytest_coverage.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash # run coverage twice, 1) for html deploy 2) for success evaluation -python3.6 -m pytest --cov=mlair --cov-report term --cov-report html test/ | tee coverage_results.out +python -m pytest --cov=mlair --cov-report term --cov-report html test/ | tee coverage_results.out IS_FAILED=$? diff --git a/CI/update_badge.sh b/CI/update_badge.sh index 6238b16c4552b5d1230a7772fa020e1c9a505f44..f677273b25a7c2e69154e7948aff3b55b137d770 100644 --- a/CI/update_badge.sh +++ b/CI/update_badge.sh @@ -11,9 +11,6 @@ printf "%s\n" ${EXIT_STATUS} # fetch badge_status BADGE_STATUS="${CI_COMMIT_REF_NAME}:${CI_JOB_NAME}" -# replace - with -- -BADGE_STATUS=$( echo -e "${BADGE_STATUS//\-/--}") - # Set values for shields.io fields based on STATUS if [[ ${EXIT_STATUS} = "running" ]]; then @@ -47,9 +44,9 @@ fi while getopts b:c:s: option do case ${option} in - b) BADGE_STATUS=$( echo -e "${OPTARG//\-/--}");; - c) BADGE_COLOR=$( echo -e "${OPTARG//\-/--}");; - s) BADGE_SUBJECT=$( echo -e "${OPTARG//\-/--}");; + b) BADGE_STATUS=$( echo -e "${OPTARG// /%20}");; + c) BADGE_COLOR=$( echo -e "${OPTARG// /%20}");; + s) BADGE_SUBJECT=$( echo -e "${OPTARG// /%20}");; esac done @@ -64,18 +61,15 @@ fi RECENT_BADGE_FILENAME="badge_recent-${CI_JOB_NAME}.svg" # Get the badge from shields.io -SHIELDS_IO_NAME=${BADGE_STATUS}-${BADGE_SUBJECT}-${BADGE_COLOR}.svg -printf "%s\n" "INFO: Fetching badge ${SHIELDS_IO_NAME} from shields.io to ${BADGE_FILENAME}." -printf "%s\n" "${SHIELDS_IO_NAME//\_/__}" -printf "%s\n" "${SHIELDS_IO_NAME//\#/%23}" - -SHIELDS_IO_NAME="$( echo -e "${SHIELDS_IO_NAME//\_/__}" )" -SHIELDS_IO_NAME="$( echo -e "${SHIELDS_IO_NAME//\#/%23}")" -curl "https://img.shields.io/badge/${SHIELDS_IO_NAME}" > "${BADGE_FILENAME}" -echo "https://img.shields.io/badge/${SHIELDS_IO_NAME}" -SHIELDS_IO_NAME_RECENT="RECENT:${SHIELDS_IO_NAME}" -curl "https://img.shields.io/badge/${SHIELDS_IO_NAME_RECENT}" > "${RECENT_BADGE_FILENAME}" -echo "${SHIELDS_IO_NAME_RECENT}" > testRecentName.txt +SHIELDS_IO_NAME="https://img.shields.io/static/v1?label=${BADGE_STATUS}&message=${BADGE_SUBJECT}&color=${BADGE_COLOR}" +SHIELDS_IO_NAME="$( echo -e "${SHIELDS_IO_NAME// /%20}" )" +echo "${SHIELDS_IO_NAME}" +curl "${SHIELDS_IO_NAME}" > "${BADGE_FILENAME}" +SHIELDS_IO_NAME="https://img.shields.io/static/v1?label=RECENT%3A${BADGE_STATUS}&message=${BADGE_SUBJECT}&color=${BADGE_COLOR}" +SHIELDS_IO_NAME="$( echo -e "${SHIELDS_IO_NAME// /%20}" )" +echo "${SHIELDS_IO_NAME}" > testRecentName.txt +echo "${SHIELDS_IO_NAME}" +curl "${SHIELDS_IO_NAME}" > "${RECENT_BADGE_FILENAME}" # if [[ ! 
-d ./badges ]]; then diff --git a/HPC_setup/create_runscripts_HPC.sh b/HPC_setup/create_runscripts_HPC.sh index 730aa52ef42144826bd000d88c0fc81c9d508de0..b3d9d644334d06ff674a22274bf4e04619853b15 100755 --- a/HPC_setup/create_runscripts_HPC.sh +++ b/HPC_setup/create_runscripts_HPC.sh @@ -85,7 +85,7 @@ source venv_${hpcsys}/bin/activate timestamp=\`date +"%Y-%m-%d_%H%M-%S"\` -export PYTHONPATH=\${PWD}/venv_${hpcsys}/lib/python3.8/site-packages:\${PYTHONPATH} +export PYTHONPATH=\${PWD}/venv_${hpcsys}/lib/python3.9/site-packages:\${PYTHONPATH} srun --cpu-bind=none python run.py --experiment_date=\$timestamp EOT @@ -111,7 +111,7 @@ source venv_${hpcsys}/bin/activate timestamp=\`date +"%Y-%m-%d_%H%M-%S"\` -export PYTHONPATH=\${PWD}/venv_${hpcsys}/lib/python3.8/site-packages:\${PYTHONPATH} +export PYTHONPATH=\${PWD}/venv_${hpcsys}/lib/python3.9/site-packages:\${PYTHONPATH} srun --cpu-bind=none python run_HPC.py --experiment_date=\$timestamp EOT diff --git a/HPC_setup/mlt_modules_hdfml.sh b/HPC_setup/mlt_modules_hdfml.sh index df8ae0830ad70c572955447b1c5e87341b8af9ec..4efc5a69987f4a4687080740b93543bcf8107c4c 100644 --- a/HPC_setup/mlt_modules_hdfml.sh +++ b/HPC_setup/mlt_modules_hdfml.sh @@ -8,13 +8,12 @@ module --force purge module use $OTHERSTAGES -ml Stages/2020 -ml GCCcore/.10.3.0 +ml Stages/2022 +ml GCCcore/.11.2.0 -ml Jupyter/2021.3.1-Python-3.8.5 -ml Python/3.8.5 -ml TensorFlow/2.5.0-Python-3.8.5 -ml SciPy-Stack/2021-Python-3.8.5 -ml dask/2.22.0-Python-3.8.5 -ml GEOS/3.8.1-Python-3.8.5 -ml Graphviz/2.44.1 \ No newline at end of file +ml Python/3.9.6 +ml TensorFlow/2.6.0-CUDA-11.5 +ml dask/2021.9.1 +ml GEOS/3.9.1 +ml Cartopy/0.20.0 +ml Graphviz/2.49.3 diff --git a/HPC_setup/mlt_modules_juwels.sh b/HPC_setup/mlt_modules_juwels.sh index ffacfe6fc45302dfa60b108ca2493d9a27408df1..37636fb8834601768ade2d86dc8c7287e273a5d4 100755 --- a/HPC_setup/mlt_modules_juwels.sh +++ b/HPC_setup/mlt_modules_juwels.sh @@ -8,13 +8,12 @@ module --force purge module use $OTHERSTAGES -ml Stages/2020 -ml GCCcore/.10.3.0 +ml Stages/2022 +ml GCCcore/.11.2.0 -ml Jupyter/2021.3.1-Python-3.8.5 -ml Python/3.8.5 -ml TensorFlow/2.5.0-Python-3.8.5 -ml SciPy-Stack/2021-Python-3.8.5 -ml dask/2.22.0-Python-3.8.5 -ml GEOS/3.8.1-Python-3.8.5 -ml Graphviz/2.44.1 \ No newline at end of file +ml Python/3.9.6 +ml TensorFlow/2.6.0-CUDA-11.5 +ml dask/2021.9.1 +ml GEOS/3.9.1 +ml Cartopy/0.20.0 +ml Graphviz/2.49.3 diff --git a/HPC_setup/requirements_HDFML_additionals.txt b/HPC_setup/requirements_HDFML_additionals.txt index ebfac3cd0d989a8845f2a3fceba33d562b898b8d..1a9e8524906115e02338dcf80137081ab7165697 100644 --- a/HPC_setup/requirements_HDFML_additionals.txt +++ b/HPC_setup/requirements_HDFML_additionals.txt @@ -1,15 +1,21 @@ -astropy==4.1 -bottleneck==1.3.2 -cached-property==1.5.2 -iniconfig==1.1.1 -ordered-set==4.0.2 -pyshp==2.1.3 -pytest-html==3.1.1 -pytest-lazy-fixture==0.6.3 -pytest-metadata==1.11.0 -pytest-sugar==0.9.4 -tabulate==0.8.8 +astropy==5.1 +pytz==2022.1 +python-dateutil==2.8.2 +requests==2.28.1 +werkzeug>=0.11.15 +wheel>=0.26 +six==1.15.0 +psutil==5.9.1 +pyparsing==3.0.9 +packaging==21.3 +timezonefinder==5.2.0 +patsy==0.5.2 +statsmodels==0.13.2 +seaborn==0.11.2 +xarray==0.16.2 +tabulate==0.8.10 wget==3.2 ---no-binary shapely Shapely==1.7.0 - -#Cartopy==0.18.0 +pydot==1.4.2 +netcdf4==1.6.0 +tensorflow-probability==0.14.1 +tzwhere \ No newline at end of file diff --git a/HPC_setup/requirements_JUWELS_additionals.txt b/HPC_setup/requirements_JUWELS_additionals.txt index 
ebfac3cd0d989a8845f2a3fceba33d562b898b8d..1a9e8524906115e02338dcf80137081ab7165697 100644 --- a/HPC_setup/requirements_JUWELS_additionals.txt +++ b/HPC_setup/requirements_JUWELS_additionals.txt @@ -1,15 +1,21 @@ -astropy==4.1 -bottleneck==1.3.2 -cached-property==1.5.2 -iniconfig==1.1.1 -ordered-set==4.0.2 -pyshp==2.1.3 -pytest-html==3.1.1 -pytest-lazy-fixture==0.6.3 -pytest-metadata==1.11.0 -pytest-sugar==0.9.4 -tabulate==0.8.8 +astropy==5.1 +pytz==2022.1 +python-dateutil==2.8.2 +requests==2.28.1 +werkzeug>=0.11.15 +wheel>=0.26 +six==1.15.0 +psutil==5.9.1 +pyparsing==3.0.9 +packaging==21.3 +timezonefinder==5.2.0 +patsy==0.5.2 +statsmodels==0.13.2 +seaborn==0.11.2 +xarray==0.16.2 +tabulate==0.8.10 wget==3.2 ---no-binary shapely Shapely==1.7.0 - -#Cartopy==0.18.0 +pydot==1.4.2 +netcdf4==1.6.0 +tensorflow-probability==0.14.1 +tzwhere \ No newline at end of file diff --git a/HPC_setup/setup_venv_hdfml.sh b/HPC_setup/setup_venv_hdfml.sh index f1b4a63f9a5c90d7afacb5c3dc027adb4e6e29fc..11c273b477ea26383e53799ae0025ceb5c947a4a 100644 --- a/HPC_setup/setup_venv_hdfml.sh +++ b/HPC_setup/setup_venv_hdfml.sh @@ -22,7 +22,7 @@ python3 -m venv ${cur}../venv_hdfml source ${cur}/../venv_hdfml/bin/activate # export path for side-packages -export PYTHONPATH=${cur}/../venv_hdfml/lib/python3.8/site-packages:${PYTHONPATH} +export PYTHONPATH=${cur}/../venv_hdfml/lib/python3.9/site-packages:${PYTHONPATH} echo "##### START INSTALLING requirements_HDFML_additionals.txt #####" pip install -r ${cur}/requirements_HDFML_additionals.txt diff --git a/HPC_setup/setup_venv_juwels.sh b/HPC_setup/setup_venv_juwels.sh index 3e1f489532ef118522ccd37dd56cf6e6306046ac..8d609b8f5094de4e3840aad50656b5c11ff1a86d 100755 --- a/HPC_setup/setup_venv_juwels.sh +++ b/HPC_setup/setup_venv_juwels.sh @@ -22,7 +22,7 @@ python3 -m venv ${cur}/../venv_juwels source ${cur}/../venv_juwels/bin/activate # export path for side-packages -export PYTHONPATH=${cur}/../venv_juwels/lib/python3.8/site-packages:${PYTHONPATH} +export PYTHONPATH=${cur}/../venv_juwels/lib/python3.9/site-packages:${PYTHONPATH} echo "##### START INSTALLING requirements_JUWELS_additionals.txt #####" diff --git a/README.md b/README.md index 792c6d4a06564eb050467f271f660761ec4d3d71..212aac2ac068e88f957b8f8cfb756c8ce9e476f9 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ HPC systems, see [here](#special-instructions-for-installation-on-jülich-hpc-sy * Installation of **MLAir**: * Either clone MLAir from the [gitlab repository](https://gitlab.jsc.fz-juelich.de/esde/machine-learning/mlair.git) and use it without installation (beside the requirements) - * or download the distribution file ([current version](https://gitlab.jsc.fz-juelich.de/esde/machine-learning/mlair/-/blob/master/dist/mlair-2.1.0-py3-none-any.whl)) + * or download the distribution file ([current version](https://gitlab.jsc.fz-juelich.de/esde/machine-learning/mlair/-/blob/master/dist/mlair-2.2.0-py3-none-any.whl)) and install it via `pip install <dist_file>.whl`. In this case, you can simply import MLAir in any python script inside your virtual environment using `import mlair`. 
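To make the installation instructions above concrete, a small sketch of installing the new 2.2.0 wheel into a fresh virtual environment; the relative paths and the python3.9 interpreter name are assumptions about the local setup, not part of this change:

```bash
# assumes the repository root as working directory and python3.9 on PATH
python3.9 -m venv venv
source venv/bin/activate
pip install dist/mlair-2.2.0-py3-none-any.whl
python -c "import mlair; print(mlair.__version_info__)"
```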
diff --git a/dist/mlair-2.2.0-py3-none-any.whl b/dist/mlair-2.2.0-py3-none-any.whl new file mode 100644 index 0000000000000000000000000000000000000000..808b281ee1756e832d4f92e455a34088a9b6e1a6 Binary files /dev/null and b/dist/mlair-2.2.0-py3-none-any.whl differ diff --git a/docs/_source/installation.rst b/docs/_source/installation.rst index 6cbf8c424bdd29470c23eb95a9b5d3a5071cf39f..182489dbd5fd60c38808eebf66ac32bd661ec6ca 100644 --- a/docs/_source/installation.rst +++ b/docs/_source/installation.rst @@ -27,7 +27,7 @@ Installation of MLAir * Install all requirements from `requirements.txt <https://gitlab.jsc.fz-juelich.de/esde/machine-learning/mlair/-/blob/master/requirements.txt>`_ preferably in a virtual environment * Either clone MLAir from the `gitlab repository <https://gitlab.jsc.fz-juelich.de/esde/machine-learning/mlair.git>`_ -* or download the distribution file (`current version <https://gitlab.jsc.fz-juelich.de/esde/machine-learning/mlair/-/blob/master/dist/mlair-2.1.0-py3-none-any.whl>`_) +* or download the distribution file (`current version <https://gitlab.jsc.fz-juelich.de/esde/machine-learning/mlair/-/blob/master/dist/mlair-2.2.0-py3-none-any.whl>`_) and install it via :py:`pip install <dist_file>.whl`. In this case, you can simply import MLAir in any python script inside your virtual environment using :py:`import mlair`. diff --git a/docs/requirements_docs.txt b/docs/requirements_docs.txt index ee455d83f0debc10faa09ffd82cad9a77930d936..66fca62c011263ddb81ab43b2c5258789073e641 100644 --- a/docs/requirements_docs.txt +++ b/docs/requirements_docs.txt @@ -2,8 +2,8 @@ sphinx==3.0.3 sphinx-autoapi==1.8.4 sphinx-autodoc-typehints==1.12.0 sphinx-rtd-theme==0.4.3 -#recommonmark==0.6.0 m2r2==0.3.1 docutils<0.18 mistune==0.8.4 -setuptools>=59.5.0 \ No newline at end of file +setuptools>=59.5.0 +Jinja2<3.1 \ No newline at end of file diff --git a/mlair/__init__.py b/mlair/__init__.py index 901947e5313a183e3909687b1fea0096075f836c..20388a18ac8ebdf37c4e17aa462839bb5b6b8e11 100644 --- a/mlair/__init__.py +++ b/mlair/__init__.py @@ -1,6 +1,6 @@ __version_info__ = { 'major': 2, - 'minor': 1, + 'minor': 2, 'micro': 0, } diff --git a/mlair/configuration/defaults.py b/mlair/configuration/defaults.py index b630261dbf58d7402f8c3cacaee153347ad4f1e3..9bb15068ce3a5ad934f7b0251b84cb19f37702f6 100644 --- a/mlair/configuration/defaults.py +++ b/mlair/configuration/defaults.py @@ -9,7 +9,6 @@ DEFAULT_STATIONS = ['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087'] DEFAULT_VAR_ALL_DICT = {'o3': 'dma8eu', 'relhum': 'average_values', 'temp': 'maximum', 'u': 'average_values', 'v': 'average_values', 'no': 'dma8eu', 'no2': 'dma8eu', 'cloudcover': 'average_values', 'pblheight': 'maximum'} -DEFAULT_NETWORK = "AIRBASE" DEFAULT_STATION_TYPE = "background" DEFAULT_VARIABLES = DEFAULT_VAR_ALL_DICT.keys() DEFAULT_START = "1997-01-01" @@ -49,6 +48,7 @@ DEFAULT_TEST_END = "2017-12-31" DEFAULT_TEST_MIN_LENGTH = 90 DEFAULT_TRAIN_VAL_MIN_LENGTH = 180 DEFAULT_USE_ALL_STATIONS_ON_ALL_DATA_SETS = True +DEFAULT_COMPETITORS = ["ols"] DEFAULT_DO_UNCERTAINTY_ESTIMATE = True DEFAULT_UNCERTAINTY_ESTIMATE_BLOCK_LENGTH = "1m" DEFAULT_UNCERTAINTY_ESTIMATE_EVALUATE_COMPETITORS = True @@ -69,6 +69,7 @@ DEFAULT_DATA_ORIGIN = {"cloudcover": "REA", "humidity": "REA", "pblheight": "REA DEFAULT_USE_MULTIPROCESSING = True DEFAULT_USE_MULTIPROCESSING_ON_DEBUG = False DEFAULT_MAX_NUMBER_MULTIPROCESSING = 16 +DEFAULT_CREATE_SNAPSHOT = False def get_defaults(): diff --git a/mlair/configuration/era5_settings.py 
b/mlair/configuration/era5_settings.py new file mode 100644 index 0000000000000000000000000000000000000000..9f44176bd50bf95226a0ea7a4913152a34619f9a --- /dev/null +++ b/mlair/configuration/era5_settings.py @@ -0,0 +1,19 @@ +"""Settings to access not public era5 data.""" + +from typing import Tuple + + +def era5_settings(sampling="daily") -> Tuple[str, str]: + """ + Check for sampling as only hourly resolution is supported by era5 and return path on HPC systems. + + :param sampling: temporal resolution to load data for, only hourly supported (default "daily") + + :return: HPC path + """ + if sampling == "hourly": # pragma: no branch + ERA5_DATA_PATH = "." + FILE_NAMES = "*.nc" + else: + raise NameError(f"Given sampling {sampling} is not supported, only hourly sampling can be used.") + return ERA5_DATA_PATH, FILE_NAMES diff --git a/mlair/configuration/snapshot_names.py b/mlair/configuration/snapshot_names.py new file mode 100644 index 0000000000000000000000000000000000000000..8526363eecc068efbd08a108f58a14dad9425490 --- /dev/null +++ b/mlair/configuration/snapshot_names.py @@ -0,0 +1,354 @@ +animals = ["Aardvark", + "Aardwolf", + "Albatross", + "Alligator", + "Alpaca", + "Amphibian", + "Anaconda", + "Angelfish", + "Anglerfish", + "Ant", + "Anteater", + "Antelope", + "Antlion", + "Ape", + "Aphid", + "Armadillo", + "Asp", + "Baboon", + "Badger", + "Bandicoot", + "Barnacle", + "Barracuda", + "Basilisk", + "Bass", + "Bat", + "Bear", + "Beaver", + "Bedbug", + "Bee", + "Beetle", + "Bird", + "Bison", + "Blackbird", + "Boa", + "Boar", + "Bobcat", + "Bobolink", + "Bonobo", + "Booby", + "Bovid", + "Bug", + "Butterfly", + "Buzzard", + "Camel", + "Canid", + "Canidae", + "Capybara", + "Cardinal", + "Caribou", + "Carp", + "Cat", + "Caterpillar", + "Catfish", + "Catshark", + "Cattle", + "Centipede", + "Cephalopod", + "Chameleon", + "Cheetah", + "Chickadee", + "Chicken", + "Chimpanzee", + "Chinchilla", + "Chipmunk", + "Cicada", + "Clam", + "Clownfish", + "Cobra", + "Cockroach", + "Cod", + "Condor", + "Constrictor", + "Coral", + "Cougar", + "Cow", + "Coyote", + "Crab", + "Crane", + "Crawdad", + "Crayfish", + "Cricket", + "Crocodile", + "Crow", + "Cuckoo", + "Damselfly", + "Deer", + "Dingo", + "Dinosaur", + "Dog", + "Dolphin", + "Donkey", + "Dormouse", + "Dove", + "Dragon", + "Dragonfly", + "Duck", + "Eagle", + "Earthworm", + "Earwig", + "Echidna", + "Eel", + "Egret", + "Elephant", + "Elk", + "Emu", + "Ermine", + "Falcon", + "Felidae", + "Ferret", + "Finch", + "Firefly", + "Fish", + "Flamingo", + "Flea", + "Fly", + "Flyingfish", + "Fowl", + "Fox", + "Frog", + "Galliform", + "Gamefowl", + "Gayal", + "Gazelle", + "Gecko", + "Gerbil", + "Gibbon", + "Giraffe", + "Goat", + "Goldfish", + "Goose", + "Gopher", + "Gorilla", + "Grasshopper", + "Grouse", + "Guan", + "Guanaco", + "Guineafowl", + "Gull", + "Guppy", + "Haddock", + "Halibut", + "Hamster", + "Hare", + "Harrier", + "Hawk", + "Hedgehog", + "Heron", + "Herring", + "Hippopotamus", + "Hookworm", + "Hornet", + "Horse", + "Hoverfly", + "Hummingbird", + "Hyena", + "Iguana", + "Impala", + "Jackal", + "Jaguar", + "Jay", + "Jellyfish", + "Junglefowl", + "Kangaroo", + "Kingfisher", + "Kite", + "Kiwi", + "Koala", + "Koi", + "Krill", + "Ladybug", + "Lamprey", + "Landfowl", + "Lark", + "Leech", + "Lemming", + "Lemur", + "Leopard", + "Leopon", + "Limpet", + "Lion", + "Lizard", + "Llama", + "Lobster", + "Locust", + "Loon", + "Louse", + "Lungfish", + "Lynx", + "Macaw", + "Mackerel", + "Magpie", + "Mammal", + "Manatee", + "Mandrill", + "Marlin", + "Marmoset", + "Marmot", + 
"Marsupial", + "Marten", + "Mastodon", + "Meadowlark", + "Meerkat", + "Mink", + "Minnow", + "Mite", + "Mockingbird", + "Mole", + "Mollusk", + "Mongoose", + "Monkey", + "Moose", + "Mosquito", + "Moth", + "Mouse", + "Mule", + "Muskox", + "Narwhal", + "Newt", + "Nightingale", + "Ocelot", + "Octopus", + "Opossum", + "Orangutan", + "Orca", + "Ostrich", + "Otter", + "Owl", + "Ox", + "Panda", + "Panther", + "Parakeet", + "Parrot", + "Parrotfish", + "Partridge", + "Peacock", + "Peafowl", + "Pelican", + "Penguin", + "Perch", + "Pheasant", + "Pig", + "Pigeon", + "Pike", + "Pinniped", + "Piranha", + "Planarian", + "Platypus", + "Pony", + "Porcupine", + "Porpoise", + "Possum", + "Prawn", + "Primate", + "Ptarmigan", + "Puffin", + "Puma", + "Python", + "Quail", + "Quelea", + "Quokka", + "Rabbit", + "Raccoon", + "Rat", + "Rattlesnake", + "Raven", + "Reindeer", + "Reptile", + "Rhinoceros", + "Roadrunner", + "Rodent", + "Rook", + "Rooster", + "Roundworm", + "Sailfish", + "Salamander", + "Salmon", + "Sawfish", + "Scallop", + "Scorpion", + "Seahorse", + "Shark", + "Sheep", + "Shrew", + "Shrimp", + "Silkworm", + "Silverfish", + "Skink", + "Skunk", + "Sloth", + "Slug", + "Smelt", + "Snail", + "Snake", + "Snipe", + "Sole", + "Sparrow", + "Spider", + "Spoonbill", + "Squid", + "Squirrel", + "Starfish", + "Stingray", + "Stoat", + "Stork", + "Sturgeon", + "Swallow", + "Swan", + "Swift", + "Swordfish", + "Swordtail", + "Tahr", + "Takin", + "Tapir", + "Tarantula", + "Tarsier", + "Termite", + "Tern", + "Thrush", + "Tick", + "Tiger", + "Tiglon", + "Toad", + "Tortoise", + "Toucan", + "Trout", + "Tuna", + "Turkey", + "Turtle", + "Tyrannosaurus", + "Urial", + "Vicuna", + "Viper", + "Vole", + "Vulture", + "Wallaby", + "Walrus", + "Warbler", + "Wasp", + "Weasel", + "Whale", + "Whippet", + "Whitefish", + "Wildcat", + "Wildebeest", + "Wildfowl", + "Wolf", + "Wolverine", + "Wombat", + "Woodpecker", + "Worm", + "Wren", + "Xerinae", + "Yak", + "Zebra", ] diff --git a/mlair/configuration/toar_data_v2_settings.py b/mlair/configuration/toar_data_v2_settings.py new file mode 100644 index 0000000000000000000000000000000000000000..a8bb9f42cf5a1967f150aa18019c2dbdc89f43a2 --- /dev/null +++ b/mlair/configuration/toar_data_v2_settings.py @@ -0,0 +1,20 @@ +"""Settings to access https://toar-data.fz-juelich.de""" +from typing import Tuple, Dict + + +def toar_data_v2_settings(sampling="daily") -> Tuple[str, Dict]: + """ + Set url for toar-data and required headers. Headers information is not required for now. + + :param sampling: temporal resolution to access. 
+ :return: Service url and optional headers + """ + if sampling == "daily": # pragma: no branch + TOAR_SERVICE_URL = "https://toar-data.fz-juelich.de/statistics/api/v1/" + headers = {} + elif sampling == "hourly" or sampling == "meta": + TOAR_SERVICE_URL = "https://toar-data.fz-juelich.de/api/v2/" + headers = {} + else: + raise NameError(f"Given sampling {sampling} is not supported, choose from either daily or hourly sampling.") + return TOAR_SERVICE_URL, headers diff --git a/mlair/data_handler/abstract_data_handler.py b/mlair/data_handler/abstract_data_handler.py index 9ea163fcad2890580e9c44e4bda0627d6419dc9f..a82e5005e8b30f9e3978ae61859e6b80746d95f1 100644 --- a/mlair/data_handler/abstract_data_handler.py +++ b/mlair/data_handler/abstract_data_handler.py @@ -22,6 +22,9 @@ class AbstractDataHandler(object): """Return initialised class.""" return cls(*args, **kwargs) + def __len__(self, upsampling=False): + raise NotImplementedError + @classmethod def requirements(cls, skip_args=None): """Return requirements and own arguments without duplicates.""" diff --git a/mlair/data_handler/data_handler_mixed_sampling.py b/mlair/data_handler/data_handler_mixed_sampling.py index 84596ad081b922a92a91b3df0513a4e730b8eb53..eaa6a21175bd5f88c32c9c3cb74947c0cc0956a3 100644 --- a/mlair/data_handler/data_handler_mixed_sampling.py +++ b/mlair/data_handler/data_handler_mixed_sampling.py @@ -63,8 +63,7 @@ class DataHandlerMixedSamplingSingleStation(DataHandlerSingleStation): vars = [self.variables, self.target_var] stats_per_var = helpers.select_from_dict(self.statistics_per_var, vars[ind]) data, self.meta = self.load_data(self.path[ind], self.station, stats_per_var, self.sampling[ind], - self.station_type, self.network, self.store_data_locally, self.data_origin, - self.start, self.end) + self.store_data_locally, self.data_origin, self.start, self.end) data = self.interpolate(data, dim=self.time_dim, method=self.interpolation_method[ind], limit=self.interpolation_limit[ind], sampling=self.sampling[ind]) @@ -115,7 +114,7 @@ class DataHandlerMixedSamplingWithFilterSingleStation(DataHandlerMixedSamplingSi def make_input_target(self): """ - A FIR filter is applied on the input data that has hourly resolution. Lables Y are provided as aggregated values + A FIR filter is applied on the input data that has hourly resolution. Labels Y are provided as aggregated values with daily resolution. 
""" self._data = tuple(map(self.load_and_interpolate, [0, 1])) # load input (0) and target (1) data @@ -147,8 +146,7 @@ class DataHandlerMixedSamplingWithFilterSingleStation(DataHandlerMixedSamplingSi stats_per_var = helpers.select_from_dict(self.statistics_per_var, vars[ind]) data, self.meta = self.load_data(self.path[ind], self.station, stats_per_var, self.sampling[ind], - self.station_type, self.network, self.store_data_locally, self.data_origin, - start, end) + self.store_data_locally, self.data_origin, start, end) data = self.interpolate(data, dim=self.time_dim, method=self.interpolation_method[ind], limit=self.interpolation_limit[ind], sampling=self.sampling[ind]) return data @@ -353,6 +351,7 @@ class DataHandlerMixedSamplingWithClimateAndFirFilter(DataHandlerMixedSamplingWi sp_keys = {k: copy.deepcopy(kwargs[k]) for k in cls.data_handler_unfiltered.requirements() if k in kwargs} sp_keys = cls.build_update_transformation(sp_keys, dh_type="unfiltered_chem") cls.prepare_build(sp_keys, chem_vars, cls.chem_indicator) + cls.correct_overwrite_option(sp_keys) sp_chem_unfiltered = cls.data_handler_unfiltered(station, **sp_keys) if len(meteo_vars) > 0: cls.set_data_handler_fir_pos(**kwargs) @@ -364,11 +363,18 @@ class DataHandlerMixedSamplingWithClimateAndFirFilter(DataHandlerMixedSamplingWi sp_keys = {k: copy.deepcopy(kwargs[k]) for k in cls.data_handler_unfiltered.requirements() if k in kwargs} sp_keys = cls.build_update_transformation(sp_keys, dh_type="unfiltered_meteo") cls.prepare_build(sp_keys, meteo_vars, cls.meteo_indicator) + cls.correct_overwrite_option(sp_keys) sp_meteo_unfiltered = cls.data_handler_unfiltered(station, **sp_keys) dp_args = {k: copy.deepcopy(kwargs[k]) for k in cls.own_args("id_class") if k in kwargs} return cls(sp_chem, sp_meteo, sp_chem_unfiltered, sp_meteo_unfiltered, chem_vars, meteo_vars, **dp_args) + @classmethod + def correct_overwrite_option(cls, kwargs): + """Set `overwrite_local_data=False`.""" + if "overwrite_local_data" in kwargs: + kwargs["overwrite_local_data"] = False + @classmethod def set_data_handler_fir_pos(cls, **kwargs): """ diff --git a/mlair/data_handler/data_handler_single_station.py b/mlair/data_handler/data_handler_single_station.py index 4217583d4b7ae03a2529deaae38fd33234bba5db..ec0f1f73282979a1e69945e1ad7f6817bdf3ba12 100644 --- a/mlair/data_handler/data_handler_single_station.py +++ b/mlair/data_handler/data_handler_single_station.py @@ -20,8 +20,9 @@ import xarray as xr from mlair.configuration import check_path_and_create from mlair import helpers -from mlair.helpers import join, statistics, TimeTrackingWrapper +from mlair.helpers import statistics, TimeTrackingWrapper, filter_dict_by_value, select_from_dict from mlair.data_handler.abstract_data_handler import AbstractDataHandler +from mlair.helpers import data_sources # define a more general date type for type hinting date = Union[dt.date, dt.datetime] @@ -38,8 +39,6 @@ class DataHandlerSingleStation(AbstractDataHandler): indicates that not all values up to t0 are used, a positive values indicates usage of values at t>t0. Default is 0. 
""" - DEFAULT_STATION_TYPE = "background" - DEFAULT_NETWORK = "AIRBASE" DEFAULT_VAR_ALL_DICT = {'o3': 'dma8eu', 'relhum': 'average_values', 'temp': 'maximum', 'u': 'average_values', 'v': 'average_values', 'no': 'dma8eu', 'no2': 'dma8eu', 'cloudcover': 'average_values', 'pblheight': 'maximum'} @@ -58,12 +57,11 @@ class DataHandlerSingleStation(AbstractDataHandler): chem_vars = ["benzene", "ch4", "co", "ethane", "no", "no2", "nox", "o3", "ox", "pm1", "pm10", "pm2p5", "propane", "so2", "toluene"] - _hash = ["station", "statistics_per_var", "data_origin", "station_type", "network", "sampling", "target_dim", - "target_var", "time_dim", "iter_dim", "window_dim", "window_history_size", "window_history_offset", - "window_lead_time", "interpolation_limit", "interpolation_method", "variables", "window_history_end"] + _hash = ["station", "statistics_per_var", "data_origin", "sampling", "target_dim", "target_var", "time_dim", + "iter_dim", "window_dim", "window_history_size", "window_history_offset", "window_lead_time", + "interpolation_limit", "interpolation_method", "variables", "window_history_end"] - def __init__(self, station, data_path, statistics_per_var=None, station_type=DEFAULT_STATION_TYPE, - network=DEFAULT_NETWORK, sampling: Union[str, Tuple[str]] = DEFAULT_SAMPLING, + def __init__(self, station, data_path, statistics_per_var=None, sampling: Union[str, Tuple[str]] = DEFAULT_SAMPLING, target_dim=DEFAULT_TARGET_DIM, target_var=DEFAULT_TARGET_VAR, time_dim=DEFAULT_TIME_DIM, iter_dim=DEFAULT_ITER_DIM, window_dim=DEFAULT_WINDOW_DIM, window_history_size=DEFAULT_WINDOW_HISTORY_SIZE, window_history_offset=DEFAULT_WINDOW_HISTORY_OFFSET, @@ -87,8 +85,6 @@ class DataHandlerSingleStation(AbstractDataHandler): self.input_data, self.target_data = None, None self._transformation = self.setup_transformation(transformation) - self.station_type = station_type - self.network = network self.sampling = sampling self.target_dim = target_dim self.target_var = target_var @@ -140,9 +136,8 @@ class DataHandlerSingleStation(AbstractDataHandler): return self._data.shape, self.get_X().shape, self.get_Y().shape def __repr__(self): - return f"StationPrep(station={self.station}, data_path='{self.path}', " \ + return f"StationPrep(station={self.station}, data_path='{self.path}', data_origin={self.data_origin}, " \ f"statistics_per_var={self.statistics_per_var}, " \ - f"station_type='{self.station_type}', network='{self.network}', " \ f"sampling='{self.sampling}', target_dim='{self.target_dim}', target_var='{self.target_var}', " \ f"time_dim='{self.time_dim}', window_history_size={self.window_history_size}, " \ f"window_lead_time={self.window_lead_time}, interpolation_limit={self.interpolation_limit}, " \ @@ -169,8 +164,12 @@ class DataHandlerSingleStation(AbstractDataHandler): return self.get_transposed_label() def get_coordinates(self): - coords = self.meta.loc[["station_lon", "station_lat"]].astype(float) - return coords.rename(index={"station_lon": "lon", "station_lat": "lat"}).to_dict()[str(self)] + try: + coords = self.meta.loc[["station_lon", "station_lat"]].astype(float) + coords = coords.rename(index={"station_lon": "lon", "station_lat": "lat"}) + except KeyError: + coords = self.meta.loc[["lon", "lat"]].astype(float) + return coords.to_dict()[str(self)] def call_transform(self, inverse=False): opts_input = self._transformation[0] @@ -301,8 +300,7 @@ class DataHandlerSingleStation(AbstractDataHandler): def make_input_target(self): data, self.meta = self.load_data(self.path, self.station, self.statistics_per_var, 
self.sampling, - self.station_type, self.network, self.store_data_locally, self.data_origin, - self.start, self.end) + self.store_data_locally, self.data_origin, self.start, self.end) self._data = self.interpolate(data, dim=self.time_dim, method=self.interpolation_method, limit=self.interpolation_limit, sampling=self.sampling) self.set_inputs_and_targets() @@ -320,8 +318,8 @@ class DataHandlerSingleStation(AbstractDataHandler): self.make_observation(self.target_dim, self.target_var, self.time_dim) self.remove_nan(self.time_dim) - def load_data(self, path, station, statistics_per_var, sampling, station_type=None, network=None, - store_data_locally=False, data_origin: Dict = None, start=None, end=None): + def load_data(self, path, station, statistics_per_var, sampling, store_data_locally=False, + data_origin: Dict = None, start=None, end=None): """ Load data and meta data either from local disk (preferred) or download new data by using a custom download method. @@ -339,35 +337,34 @@ class DataHandlerSingleStation(AbstractDataHandler): if os.path.exists(meta_file): os.remove(meta_file) data, meta = self.download_data(file_name, meta_file, station, statistics_per_var, sampling, - station_type=station_type, network=network, - store_data_locally=store_data_locally, data_origin=data_origin) + store_data_locally=store_data_locally, data_origin=data_origin, + time_dim=self.time_dim, target_dim=self.target_dim, iter_dim=self.iter_dim) logging.debug(f"loaded new data") else: try: logging.debug(f"try to load local data from: {file_name}") data = xr.open_dataarray(file_name) meta = pd.read_csv(meta_file, index_col=0) - self.check_station_meta(meta, station, station_type, network) + self.check_station_meta(meta, station, data_origin, statistics_per_var) logging.debug("loading finished") except FileNotFoundError as e: logging.debug(e) logging.debug(f"load new data") data, meta = self.download_data(file_name, meta_file, station, statistics_per_var, sampling, - station_type=station_type, network=network, - store_data_locally=store_data_locally, data_origin=data_origin) + store_data_locally=store_data_locally, data_origin=data_origin, + time_dim=self.time_dim, target_dim=self.target_dim, + iter_dim=self.iter_dim) logging.debug("loading finished") # create slices and check for negative concentration. data = self._slice_prep(data, start=start, end=end) data = self.check_for_negative_concentrations(data) return data, meta - @staticmethod - def download_data_from_join(file_name: str, meta_file: str, station, statistics_per_var, sampling, - station_type=None, network=None, store_data_locally=True, data_origin: Dict = None, - time_dim=DEFAULT_TIME_DIM, target_dim=DEFAULT_TARGET_DIM, iter_dim=DEFAULT_ITER_DIM) \ - -> [xr.DataArray, pd.DataFrame]: + def download_data(self, file_name: str, meta_file: str, station, statistics_per_var, sampling, + store_data_locally=True, data_origin: Dict = None, time_dim=DEFAULT_TIME_DIM, + target_dim=DEFAULT_TARGET_DIM, iter_dim=DEFAULT_ITER_DIM) -> [xr.DataArray, pd.DataFrame]: """ - Download data from TOAR database using the JOIN interface. + Download data from TOAR database using the JOIN interface or load local era5 data. Data is transformed to a xarray dataset. If class attribute store_data_locally is true, data is additionally stored locally using given names for file and meta file. 
@@ -378,8 +375,40 @@ class DataHandlerSingleStation(AbstractDataHandler): :return: downloaded data and its meta data """ df_all = {} - df, meta = join.download_join(station_name=station, stat_var=statistics_per_var, station_type=station_type, - network_name=network, sampling=sampling, data_origin=data_origin) + df_era5, df_toar = None, None + meta_era5, meta_toar = None, None + if data_origin is not None: + era5_origin = filter_dict_by_value(data_origin, "era5", True) + era5_stats = select_from_dict(statistics_per_var, era5_origin.keys()) + toar_origin = filter_dict_by_value(data_origin, "era5", False) + toar_stats = select_from_dict(statistics_per_var, era5_origin.keys(), filter_cond=False) + assert len(era5_origin) + len(toar_origin) == len(data_origin) + assert len(era5_stats) + len(toar_stats) == len(statistics_per_var) + else: + era5_origin, toar_origin = None, None + era5_stats, toar_stats = statistics_per_var, statistics_per_var + + # load data + if era5_origin is not None and len(era5_stats) > 0: + # load era5 data + df_era5, meta_era5 = data_sources.era5.load_era5(station_name=station, stat_var=era5_stats, + sampling=sampling, data_origin=era5_origin) + if toar_origin is None or len(toar_stats) > 0: + # load combined data from toar-data (v2 & v1) + df_toar, meta_toar = data_sources.toar_data.download_toar(station=station, toar_stats=toar_stats, + sampling=sampling, data_origin=toar_origin) + + if df_era5 is None and df_toar is None: + raise data_sources.toar_data.EmptyQueryResult(f"No data available for era5 and toar-data") + + df = pd.concat([df_era5, df_toar], axis=1, sort=True) + if meta_era5 is not None and meta_toar is not None: + meta = meta_era5.combine_first(meta_toar) + else: + meta = meta_era5 if meta_era5 is not None else meta_toar + meta.loc["data_origin"] = str(data_origin) + meta.loc["statistics_per_var"] = str(statistics_per_var) + df_all[station[0]] = df # convert df_all to xarray xarr = {k: xr.DataArray(v, dims=[time_dim, target_dim]) for k, v in df_all.items()} @@ -390,28 +419,22 @@ class DataHandlerSingleStation(AbstractDataHandler): meta.to_csv(meta_file) return xarr, meta - def download_data(self, *args, **kwargs): - data, meta = self.download_data_from_join(*args, **kwargs, time_dim=self.time_dim, target_dim=self.target_dim, - iter_dim=self.iter_dim) - return data, meta - @staticmethod - def check_station_meta(meta, station, station_type, network): + def check_station_meta(meta, station, data_origin, statistics_per_var): """ Search for the entries in meta data and compare the value with the requested values. Will raise a FileNotFoundError if the values mismatch. """ - if station_type is not None: - check_dict = {"station_type": station_type, "network_name": network} - for (k, v) in check_dict.items(): - if v is None: - continue - if meta.at[k, station[0]] != v: - logging.debug(f"meta data does not agree with given request for {k}: {v} (requested) != " - f"{meta.at[k, station[0]]} (local). Raise FileNotFoundError to trigger new " - f"grapping from web.") - raise FileNotFoundError + check_dict = {"data_origin": str(data_origin), "statistics_per_var": str(statistics_per_var)} + for (k, v) in check_dict.items(): + if v is None or k not in meta.index: + continue + if meta.at[k, station[0]] != v: + logging.debug(f"meta data does not agree with given request for {k}: {v} (requested) != " + f"{meta.at[k, station[0]]} (local). 
Raise FileNotFoundError to trigger new " + f"grapping from web.") + raise FileNotFoundError def check_for_negative_concentrations(self, data: xr.DataArray, minimum: int = 0) -> xr.DataArray: """ diff --git a/mlair/data_handler/data_handler_with_filter.py b/mlair/data_handler/data_handler_with_filter.py index 47ccc5510c8135745c518611504cd02900a1f883..e5760e9afb52f9d55071214fb632601d744f124e 100644 --- a/mlair/data_handler/data_handler_with_filter.py +++ b/mlair/data_handler/data_handler_with_filter.py @@ -68,8 +68,7 @@ class DataHandlerFilterSingleStation(DataHandlerSingleStation): def make_input_target(self): data, self.meta = self.load_data(self.path, self.station, self.statistics_per_var, self.sampling, - self.station_type, self.network, self.store_data_locally, self.data_origin, - self.start, self.end) + self.store_data_locally, self.data_origin, self.start, self.end) self._data = self.interpolate(data, dim=self.time_dim, method=self.interpolation_method, limit=self.interpolation_limit) self.set_inputs_and_targets() diff --git a/mlair/data_handler/default_data_handler.py b/mlair/data_handler/default_data_handler.py index 300e0435c4e8441e299675319e2c72604ebb3200..69c9537b10ca583adf84480636680a99ab265a67 100644 --- a/mlair/data_handler/default_data_handler.py +++ b/mlair/data_handler/default_data_handler.py @@ -22,7 +22,7 @@ import xarray as xr from mlair.data_handler.abstract_data_handler import AbstractDataHandler from mlair.helpers import remove_items, to_list, TimeTrackingWrapper -from mlair.helpers.join import EmptyQueryResult +from mlair.helpers.data_sources.toar_data import EmptyQueryResult number = Union[float, int] @@ -55,6 +55,8 @@ class DefaultDataHandler(AbstractDataHandler): self._X_extreme = None self._Y_extreme = None self._data_intersection = None + self._len = None + self._len_upsampling = None self._use_multiprocessing = use_multiprocessing self._max_number_multiprocessing = max_number_multiprocessing _name_affix = str(f"{str(self.id_class)}_{name_affix}" if name_affix is not None else id(self)) @@ -134,6 +136,12 @@ class DefaultDataHandler(AbstractDataHandler): def __repr__(self): return str(self._collection[0]) + def __len__(self, upsampling=False): + if upsampling is False: + return self._len + else: + return self._len_upsampling + def get_X_original(self): X = [] for data in self._collection: @@ -168,12 +176,13 @@ class DefaultDataHandler(AbstractDataHandler): dim = self.time_dim intersect = reduce(np.intersect1d, map(lambda x: x.coords[dim].values, X_original)) if len(intersect) < max(self.min_length, 1): - X, Y = None, None + raise ValueError(f"There is no intersection of X.") else: X = list(map(lambda x: x.sel({dim: intersect}), X_original)) Y = Y_original.sel({dim: intersect}) self._data_intersection = intersect self._X, self._Y = X, Y + self._len = len(self._data_intersection) def get_observation(self): dim = self.time_dim @@ -205,13 +214,10 @@ class DefaultDataHandler(AbstractDataHandler): if True only extract values larger than extreme_values :param timedelta: used as arguments for np.timedelta in order to mark extreme values on datetime """ - # check if X or Y is None - if (self._X is None) or (self._Y is None): - logging.debug(f"{str(self.id_class)} has no data for X or Y, skip multiply extremes") - return if extreme_values is None: logging.debug(f"No extreme values given, skip multiply extremes") self._X_extreme, self._Y_extreme = self._X, self._Y + self._len_upsampling = self._len return # check type if inputs @@ -247,6 +253,7 @@ class 
DefaultDataHandler(AbstractDataHandler): self._Y_extreme = xr.concat([Y, extremes_Y], dim=dim) self._X_extreme = list(map(lambda x1, x2: xr.concat([x1, x2], dim=dim), X, extremes_X)) + self._len_upsampling = len(self._X_extreme[0].coords[dim]) @staticmethod def _add_timedelta(data, dim, timedelta): diff --git a/mlair/data_handler/iterator.py b/mlair/data_handler/iterator.py index 3fc25a90f861c65d38aa6b7019095210035d4c2d..af75905bd511a4cfb8d8b5023325c678f83f0799 100644 --- a/mlair/data_handler/iterator.py +++ b/mlair/data_handler/iterator.py @@ -8,7 +8,8 @@ import numpy as np import math import os import shutil -import pickle +import psutil +import multiprocessing import logging import dill from typing import Tuple, List @@ -75,7 +76,7 @@ class DataCollection(Iterable): class KerasIterator(keras.utils.Sequence): def __init__(self, collection: DataCollection, batch_size: int, batch_path: str, shuffle_batches: bool = False, - model=None, upsampling=False, name=None): + model=None, upsampling=False, name=None, use_multiprocessing=False, max_number_multiprocessing=1): self._collection = collection batch_path = os.path.join(batch_path, str(name if name is not None else id(self))) self._path = os.path.join(batch_path, "%i.pickle") @@ -85,7 +86,7 @@ class KerasIterator(keras.utils.Sequence): self.upsampling = upsampling self.indexes: list = [] self._cleanup_path(batch_path) - self._prepare_batches() + self._prepare_batches(use_multiprocessing, max_number_multiprocessing) def __len__(self) -> int: return len(self.indexes) @@ -96,7 +97,13 @@ class KerasIterator(keras.utils.Sequence): def _get_model_rank(self): if self.model is not None: - mod_out = self.model.output_shape + try: + mod_out = self.model.output_shape + except AttributeError as e: + # ToDo replace except statemnet with something meaningful. Depending on BNN architecture the attr + # output_shape might not be defined. We use it here to check the number of tails -> make sure multiple + # tails would also work with BNNs in future versions + mod_out = (None, None) if isinstance(mod_out, tuple): # only one output branch: (None, ahead) mod_rank = 1 elif isinstance(mod_out, list): # multiple output branches, e.g.: [(None, ahead), (None, ahead)] @@ -119,62 +126,61 @@ class KerasIterator(keras.utils.Sequence): """Concatenate two lists of data along axis=0.""" return list(map(lambda n1, n2: np.concatenate((n1, n2), axis=0), old, new)) - def _get_batch(self, data_list: List[np.ndarray], b: int) -> List[np.ndarray]: - """Get batch according to batch size from data list.""" - return list(map(lambda data: data[b * self.batch_size:(b + 1) * self.batch_size, ...], data_list)) - @staticmethod - def _permute_data(X, Y): - p = np.random.permutation(len(X[0])) # equiv to .shape[0] - X = list(map(lambda x: x[p], X)) - Y = list(map(lambda x: x[p], Y)) - return X, Y + def _concatenate_multi(*args: List[np.ndarray]) -> List[np.ndarray]: + """Concatenate two lists of data along axis=0.""" + return list(map(lambda *_args: np.concatenate(_args, axis=0), *args)) - def _prepare_batches(self) -> None: + def _prepare_batches(self, use_multiprocessing=False, max_process=1) -> None: """ Prepare all batches as locally stored files. Walk through all elements of collection and split (or merge) data according to the batch size. Too long data - sets are divided into multiple batches. Not fully filled batches are merged with data from the next collection - element. If data is remaining after the last element, it is saved as smaller batch. 
All batches are enumerated - beginning from 0. A list with all batch numbers is stored in class's parameter indexes. + sets are divided into multiple batches. Not fully filled batches are retained together with remains from the + next collection elements. These retained data are concatenated and also split into batches. If data are still + remaining afterwards, they are saved as final smaller batch. All batches are enumerated by a running index + starting at 0. A list with all batch numbers is stored in class's parameter indexes. This method can either + use a serial approach or use multiprocessing to decrease computational time. """ index = 0 - remaining = None + remaining = [] mod_rank = self._get_model_rank() + n_process = min([psutil.cpu_count(logical=False), len(self._collection), max_process]) # use only physical cpus + if n_process > 1 and use_multiprocessing is True: # parallel solution + pool = multiprocessing.Pool(n_process) + output = [] + else: + pool = None + output = None for data in self._collection: - logging.debug(f"prepare batches for {str(data)}") - X, _Y = data.get_data(upsampling=self.upsampling) - Y = [_Y[0] for _ in range(mod_rank)] - if self.upsampling: - X, Y = self._permute_data(X, Y) - if remaining is not None: - X, Y = self._concatenate(X, remaining[0]), self._concatenate(Y, remaining[1]) + length = data.__len__(self.upsampling) + batches = _get_number_of_mini_batches(length, self.batch_size) + if pool is None: + res = f_proc(data, self.upsampling, mod_rank, self.batch_size, self._path, index) + if res is not None: + remaining.append(res) + else: + output.append(pool.apply_async(f_proc, args=(data, self.upsampling, mod_rank, self.batch_size, self._path, index))) + index += batches + if output is not None: + for p in output: + res = p.get() + if res is not None: + remaining.append(res) + pool.close() + if len(remaining) > 0: + X = self._concatenate_multi(*[e[0] for e in remaining]) + Y = self._concatenate_multi(*[e[1] for e in remaining]) length = X[0].shape[0] - batches = self._get_number_of_mini_batches(length) - for b in range(batches): - batch_X, batch_Y = self._get_batch(X, b), self._get_batch(Y, b) - self._save_to_pickle(X=batch_X, Y=batch_Y, index=index) + batches = _get_number_of_mini_batches(length, self.batch_size) + remaining = f_proc((X, Y), self.upsampling, mod_rank, self.batch_size, self._path, index) + index += batches + if remaining is not None: + _save_to_pickle(self._path, X=remaining[0], Y=remaining[1], index=index) index += 1 - if (batches * self.batch_size) < length: # keep remaining to concatenate with next data element - remaining = (self._get_batch(X, batches), self._get_batch(Y, batches)) - else: - remaining = None - if remaining is not None: # add remaining as smaller batch - self._save_to_pickle(X=remaining[0], Y=remaining[1], index=index) - index += 1 self.indexes = np.arange(0, index).tolist() - - def _save_to_pickle(self, X: List[np.ndarray], Y: List[np.ndarray], index: int) -> None: - """Save data as pickle file with variables X and Y and given index as <index>.pickle .""" - data = {"X": X, "Y": Y} - file = self._path % index - with open(file, "wb") as f: - dill.dump(data, f) - - def _get_number_of_mini_batches(self, number_of_samples: int) -> int: - """Return number of mini batches as the floored ration of number of samples to batch size.""" - return math.floor(number_of_samples / self.batch_size) + if pool is not None: + pool.join() @staticmethod def _cleanup_path(path: str, create_new: bool = True) -> None: @@ -188,3 +194,49 @@ 
class KerasIterator(keras.utils.Sequence):
         """Randomly shuffle indexes if enabled."""
         if self.shuffle is True:
             np.random.shuffle(self.indexes)
+
+
+def _save_to_pickle(path, X: List[np.ndarray], Y: List[np.ndarray], index: int) -> None:
+    """Save data as pickle file with variables X and Y and given index as <index>.pickle ."""
+    data = {"X": X, "Y": Y}
+    file = path % index
+    with open(file, "wb") as f:
+        dill.dump(data, f)
+
+
+def _get_batch(data_list: List[np.ndarray], b: int, batch_size: int) -> List[np.ndarray]:
+    """Get batch according to batch size from data list."""
+    return list(map(lambda data: data[b * batch_size:(b + 1) * batch_size, ...], data_list))
+
+
+def _permute_data(X, Y):
+    p = np.random.permutation(len(X[0]))  # equiv to .shape[0]
+    X = list(map(lambda x: x[p], X))
+    Y = list(map(lambda x: x[p], Y))
+    return X, Y
+
+
+def _get_number_of_mini_batches(number_of_samples: int, batch_size: int) -> int:
+    """Return number of mini batches as the floored ratio of number of samples to batch size."""
+    return math.floor(number_of_samples / batch_size)
+
+
+def f_proc(data, upsampling, mod_rank, batch_size, _path, index):
+    if isinstance(data, tuple) is True:
+        X, _Y = data
+    else:
+        X, _Y = data.get_data(upsampling=upsampling)
+    Y = [_Y[0] for _ in range(mod_rank)]
+    if upsampling:
+        X, Y = _permute_data(X, Y)
+    length = X[0].shape[0]
+    batches = _get_number_of_mini_batches(length, batch_size)
+    for b in range(batches):
+        batch_X, batch_Y = _get_batch(X, b, batch_size), _get_batch(Y, b, batch_size)
+        _save_to_pickle(_path, X=batch_X, Y=batch_Y, index=index)
+        index += 1
+    if (batches * batch_size) < length:  # keep remaining to concatenate with next data element
+        remaining = (_get_batch(X, batches, batch_size), _get_batch(Y, batches, batch_size))
+    else:
+        remaining = None
+    return remaining
diff --git a/mlair/helpers/__init__.py b/mlair/helpers/__init__.py
index 3a5b8699a11ae39c0d3510a534db1dd144419d09..cf50fa05885d576bd64de67b83df3c8ed6d272e2 100644
--- a/mlair/helpers/__init__.py
+++ b/mlair/helpers/__init__.py
@@ -1,6 +1,7 @@
 """Collection of different supporting functions and classes."""
 
-from .testing import PyTestRegex, PyTestAllEqual
+from .testing import PyTestRegex, PyTestAllEqual, check_nested_equality
 from .time_tracking import TimeTracking, TimeTrackingWrapper
 from .logger import Logger
-from .helpers import remove_items, float_round, dict_to_xarray, to_list, extract_value, select_from_dict, make_keras_pickable, sort_like
+from .helpers import remove_items, float_round, dict_to_xarray, to_list, extract_value, select_from_dict, \
+    make_keras_pickable, sort_like, filter_dict_by_value
diff --git a/mlair/helpers/data_sources/__init__.py b/mlair/helpers/data_sources/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b753bc3afb961be65ff0f934ef4f0de08804a0b
--- /dev/null
+++ b/mlair/helpers/data_sources/__init__.py
@@ -0,0 +1,10 @@
+"""
+Data Sources.
+
+The module data_sources collects different data sources, namely ERA5, TOAR-Data v1 (JOIN), and TOAR-Data v2
+"""
+
+__author__ = "Lukas Leufen"
+__date__ = "2022-07-05"
+
+from . import era5, join, toar_data, toar_data_v2
diff --git a/mlair/helpers/data_sources/era5.py b/mlair/helpers/data_sources/era5.py
new file mode 100644
index 0000000000000000000000000000000000000000..8eb7a03b2629db1d006e03fcc9d30b2af714c270
--- /dev/null
+++ b/mlair/helpers/data_sources/era5.py
@@ -0,0 +1,88 @@
+"""Methods to load era5 data."""
+__author__ = "Lukas Leufen"
+__date__ = "2022-06-09"
+
+import logging
+import os
+
+import pandas as pd
+import xarray as xr
+
+from mlair import helpers
+from mlair.configuration.era5_settings import era5_settings
+from mlair.configuration.toar_data_v2_settings import toar_data_v2_settings
+from mlair.helpers.data_sources.toar_data_v2 import load_station_information, combine_meta_data, correct_timezone
+from mlair.helpers.data_sources.toar_data import EmptyQueryResult
+from mlair.helpers.meteo import relative_humidity_from_dewpoint
+
+
+def load_era5(station_name, stat_var, sampling, data_origin):
+
+    # make sure station_name parameter is a list
+    station_name = helpers.to_list(station_name)
+
+    # get data path
+    data_path, file_names = era5_settings(sampling)
+
+    # correct stat_var values if data is not aggregated (hourly)
+    if sampling == "hourly":
+        stat_var = {key: "values" for key in stat_var.keys()}
+    else:
+        raise ValueError(f"Given sampling {sampling} is not supported, only hourly sampling can be used.")
+
+    # load station meta using toar-data v2 API
+    meta_url_base, headers = toar_data_v2_settings("meta")
+    station_meta = load_station_information(station_name, meta_url_base, headers)
+
+    # sel data for station using sel method nearest
+    logging.info(f"load data for {station_meta['codes'][0]} from ERA5")
+    try:
+        with xr.open_mfdataset(os.path.join(data_path, file_names)) as data:
+            lon, lat = station_meta["coordinates"]["lng"], station_meta["coordinates"]["lat"]
+            station_dask = data.sel(lon=lon, lat=lat, method="nearest", drop=True)
+            station_data = station_dask.to_array().T.compute()
+    except OSError as e:
+        logging.info(f"Cannot load era5 data from path {data_path} and filenames {file_names} due to: {e}")
+        return None, None
+
+    # transform data and meta to pandas
+    station_data = station_data.to_pandas()
+    if "relhum" in stat_var:
+        station_data["RHw"] = relative_humidity_from_dewpoint(station_data["D2M"], station_data["T2M"])
+    station_data.columns = _rename_era5_variables(station_data.columns)
+
+    # check if all requested variables are available
+    if set(stat_var).issubset(station_data.columns) is False:
+        missing_variables = set(stat_var).difference(station_data.columns)
+        origin = helpers.select_from_dict(data_origin, missing_variables)
+        options = f"station={station_name}, origin={origin}"
+        raise EmptyQueryResult(f"No data found for variables {missing_variables} and options {options} in ERA5.")
+    else:
+        station_data = station_data[stat_var]
+
+    # convert to local timezone
+    station_data = correct_timezone(station_data, station_meta, sampling)
+
+    variable_meta = _emulate_meta_data(station_data)
+    meta = combine_meta_data(station_meta, variable_meta)
+    meta = pd.DataFrame.from_dict(meta, orient='index')
+    meta.columns = station_name
+    return station_data, meta
+
+
+def _emulate_meta_data(station_data):
+    general_meta = {"sampling_frequency": "hourly", "data_origin": "model", "data_origin_type": "model"}
+    roles_meta = {"roles": [{"contact": {"organisation": {"name": "ERA5", "longname": "ECMWF"}}}]}
+    variable_meta = {var: {"variable": {"name": var}, **roles_meta, **general_meta} for var in station_data.columns}
+    return variable_meta
+
+
+def _rename_era5_variables(era5_names): + mapper = {"SP": "press", "U10M": "u", "V10M": "v", "T2M": "temp", "D2M": "dew", "BLH": "pblheight", + "TCC": "cloudcover", "RHw": "relhum"} + era5_names = list(era5_names) + try: + join_names = list(map(lambda x: mapper[x], era5_names)) + return join_names + except KeyError as e: + raise KeyError(f"Cannot map names from era5 to join naming convention: {e}") \ No newline at end of file diff --git a/mlair/helpers/data_sources/join.py b/mlair/helpers/data_sources/join.py new file mode 100644 index 0000000000000000000000000000000000000000..a978b2712a83b21f3c1256b2bf0826da63bdda3a --- /dev/null +++ b/mlair/helpers/data_sources/join.py @@ -0,0 +1,366 @@ +"""Functions to access join database.""" +__author__ = 'Felix Kleinert, Lukas Leufen' +__date__ = '2019-10-16' + +import datetime as dt +import logging +from typing import Iterator, Union, List, Dict, Tuple + +import pandas as pd + +from mlair import helpers +from mlair.configuration.join_settings import join_settings +from mlair.helpers.data_sources import toar_data, toar_data_v2 + + +# join_url_base = 'https://join.fz-juelich.de/services/rest/surfacedata/' +str_or_none = Union[str, None] + + +def download_join(station_name: Union[str, List[str]], stat_var: dict, station_type: str = None, + sampling: str = "daily", data_origin: Dict = None) -> [pd.DataFrame, pd.DataFrame]: + """ + Read data from JOIN/TOAR. + + :param station_name: Station name e.g. DEBY122 + :param stat_var: key as variable like 'O3', values as statistics on keys like 'mean' + :param station_type: set the station type like "traffic" or "background", can be none + :param sampling: sampling rate of the downloaded data, either set to daily or hourly (default daily) + :param data_origin: additional dictionary to specify data origin as key (for variable) value (origin) pair. Valid + origins are "REA" for reanalysis data and "" (empty string) for observational data. 
+ + :returns: data frame with all variables and statistics and meta data frame with all meta information + """ + # make sure station_name parameter is a list + station_name = helpers.to_list(station_name) + + # split network and origin information + data_origin, network_name = split_network_and_origin(data_origin) + + # get data connection settings + join_url_base, headers = join_settings(sampling) + + # load series information + vars_dict, data_origin = load_series_information(station_name, station_type, network_name, join_url_base, headers, + data_origin, stat_var) + + # check if all requested variables are available + if set(stat_var).issubset(vars_dict) is False: + missing_variables = set(stat_var).difference(vars_dict) + origin = helpers.select_from_dict(data_origin, missing_variables) + options = f"station={station_name}, type={station_type}, network={network_name}, origin={origin}" + raise toar_data.EmptyQueryResult(f"No data found for variables {missing_variables} and options {options} in " + f"JOIN.") + + # correct stat_var values if data is not aggregated (hourly) + if sampling == "hourly": + stat_var = {key: "values" for key in stat_var.keys()} + + # download all variables with given statistic + data = None + df = None + meta = {} + logging.info(f"load data for {station_name[0]} from JOIN") + for var in _lower_list(sorted(vars_dict.keys())): + if var in stat_var.keys(): + + logging.debug('load: {}'.format(var)) + + # create data link + opts = {'base': join_url_base, 'service': 'stats', 'id': vars_dict[var], 'statistics': stat_var[var], + 'sampling': sampling, 'capture': 0, 'format': 'json'} + + # load data + data = toar_data.get_data(opts, headers) + + # adjust data format if given as list of list + # no branch cover because this just happens when downloading hourly data using a secret token, not available + # for CI testing. 
+ if isinstance(data, list): # pragma: no branch + data = correct_data_format(data) + + # correct namespace of statistics + stat = toar_data.correct_stat_name(stat_var[var]) + + # store data in pandas dataframe + df = _save_to_pandas(df, data, stat, var) + meta[var] = _correct_meta(data["metadata"]) + + logging.debug('finished: {}'.format(var)) + + if data: + # load station meta using toar-data v2 API and convert to local timezone + meta_url_base, headers = toar_data_v2.toar_data_v2_settings("meta") + station_meta = toar_data_v2.load_station_information(station_name, meta_url_base, headers) + df = toar_data_v2.correct_timezone(df, station_meta, sampling) + + # create meta data + meta = toar_data_v2.combine_meta_data(station_meta, meta) + meta = pd.DataFrame.from_dict(meta, orient='index') + meta.columns = station_name + return df, meta + else: + raise toar_data.EmptyQueryResult("No data found in JOIN.") + + +def _correct_meta(meta): + meta_out = {} + for k, v in meta.items(): + if k.startswith("station"): + _k = k.split("_", 1)[1] + _d = meta_out.get("station", {}) + _d[_k] = v + meta_out["station"] = _d + elif k.startswith("parameter"): + _k = k.split("_", 1)[1] + _d = meta_out.get("variable", {}) + _d[_k] = v + meta_out["variable"] = _d + elif k == "network_name": + if v == "AIRBASE": + _d = {"name": "EEA", "longname": "European Environment Agency", "kind": "government"} + elif v == "UBA": + _d = {"name": "UBA", "longname": "Umweltbundesamt", "kind": "government", "country": "Germany"} + else: + _d = {"name": v} + meta_out["roles"] = [{"contact": {"organisation": _d}}] + elif k in ["google_resolution", "numid"]: + continue + else: + meta_out[k] = v + return meta_out + + +def split_network_and_origin(origin_network_dict: dict) -> Tuple[Union[None, dict], Union[None, dict]]: + """ + Split given dict into network and data origin. + + Method is required to transform Toar-Data v2 structure (using only origin) into Toar-Data v1 (JOIN) structure (which + uses origin and network parameter). Furthermore, EEA network (v2) is renamed to AIRBASE (v1). + """ + if origin_network_dict is None or len(origin_network_dict) == 0: + data_origin, network = None, None + else: + data_origin = {} + network = {} + for k, v in origin_network_dict.items(): + network[k] = [] + for _network in helpers.to_list(v): + if _network.lower() == "EEA".lower(): + network[k].append("AIRBASE") + elif _network.lower() != "REA".lower(): + network[k].append(_network) + if "REA" in v: + data_origin[k] = "REA" + else: + data_origin[k] = "" + network[k] = filter_network(network[k]) + return data_origin, network + + +def filter_network(network: list) -> Union[list, None]: + """ + Filter given list of networks. + + :param network: list of various network names (can contain duplicates) + :return: sorted list with unique entries + """ + sorted_network = [] + for v in list(filter(lambda x: x != "", network)): + if v not in sorted_network: + sorted_network.append(v) + if len(sorted_network) == 0: + sorted_network = None + return sorted_network + + +def correct_data_format(data): + """ + Transform to the standard data format. + + For some cases (e.g. hourly data), the data is returned as list instead of a dictionary with keys datetime, values + and metadata. This functions addresses this issue and transforms the data into the dictionary version. 
+ + :param data: data in hourly format + + :return: the same data but formatted to fit with aggregated format + """ + formatted = {"datetime": [], + "values": [], + "metadata": data[-1]} + for d in data[:-1]: + for k, v in zip(["datetime", "values"], d): + formatted[k].append(v) + return formatted + + +def load_series_information(station_name: List[str], station_type: str_or_none, network_name: str_or_none, + join_url_base: str, headers: Dict, data_origin: Dict = None, stat_var: Dict = None) -> [Dict, Dict]: + """ + List all series ids that are available for given station id and network name. + + :param station_name: Station name e.g. DEBW107 + :param station_type: station type like "traffic" or "background" + :param network_name: measurement network of the station like "UBA" or "AIRBASE" + :param join_url_base: base url name to download data from + :param headers: additional headers information like authorization, can be empty + :param data_origin: additional information to select a distinct series e.g. from reanalysis (REA) or from observation + ("", empty string). This dictionary should contain a key for each variable and the information as key + :return: all available series for requested station stored in an dictionary with parameter name (variable) as key + and the series id as value. + """ + network_name_opts = _create_network_name_opts(network_name) + parameter_name_opts = _create_parameter_name_opts(stat_var) + opts = {"base": join_url_base, "service": "search", "station_id": station_name[0], "station_type": station_type, + "network_name": network_name_opts, "as_dict": "true", "parameter_name": parameter_name_opts, + "columns": "id,network_name,station_id,parameter_name,parameter_label,parameter_attribute"} + station_vars = toar_data.get_data(opts, headers) + logging.debug(f"{station_name}: {station_vars}") + return _select_distinct_series(station_vars, data_origin, network_name) + + +def _create_parameter_name_opts(stat_var): + if stat_var is None: + parameter_name_opts = None + else: + parameter_name_opts = ",".join(stat_var.keys()) + return parameter_name_opts + + +def _create_network_name_opts(network_name): + if network_name is None: + network_name_opts = network_name + elif isinstance(network_name, list): + network_name_opts = ",".join(helpers.to_list(network_name)) + elif isinstance(network_name, dict): + _network = [] + for v in network_name.values(): + _network.extend(helpers.to_list(v)) + network_name_opts = ",".join(filter(lambda x: x is not None, set(_network))) + network_name_opts = None if len(network_name_opts) == 0 else network_name_opts + else: + raise TypeError(f"network_name parameter must be of type None, list, or dict. Given is {type(network_name)}.") + return network_name_opts + + +def _select_distinct_series(vars: List[Dict], data_origin: Dict = None, network_name: Union[str, List[str]] = None) \ + -> [Dict, Dict]: + """ + Select distinct series ids for all variables. Also check if a parameter is from REA or not. + """ + data_origin = {} if data_origin is None else data_origin + selected, data_origin = _select_distinct_data_origin(vars, data_origin) + + network_name = [] if network_name is None else network_name + selected = _select_distinct_network(selected, network_name) + + # extract id + selected = {k: v["id"] for k, v in selected.items()} + return selected, data_origin + + +def _select_distinct_network(vars: dict, network_name: Union[list, dict]) -> dict: + """ + Select distinct series regarding network name. 
The order the network names are provided in parameter `network_name` + indicates priority (from high to low). If no network name is provided, first entry is used and a logging info is + issued. In case network names are given but no match can be found, this method raises a ValueError. + + :param vars: dictionary with all series candidates already grouped by variable name as key. Value should be a list + of possible candidates to select from. Each candidate must be a dictionary with at least keys `id` and + `network_name`. + :param network_name: list of networks to use with increasing priority (1st element has priority). Can be empty list + indicating to use always first candidate for each variable. + :return: dictionary with single series reference for each variable + """ + if isinstance(network_name, (list, str)): + network_name = {var: helpers.to_list(network_name) for var in vars.keys()} + selected = {} + for var, series in vars.items(): + res = [] + network_list = helpers.to_list(network_name.get(var, []) or []) + for network in network_list: + res.extend(list(filter(lambda x: x["network_name"].upper() == network.upper(), series))) + if len(res) > 0: # use first match which has the highest priority + selected[var] = res[0] + else: + if len(network_list) == 0: # just print message which network is used if none is provided + selected[var] = series[0] + logging.info(f"Could not find a valid match for variable {var} and networks {network_name.get(var, [])}" + f"! Therefore, use first answer from JOIN: {series[0]}") + else: # raise error if network name is provided but no match could be found + raise ValueError(f"Cannot find a valid match for requested networks {network_name.get(var, [])} and " + f"variable {var} as only following networks are available in JOIN: " + f"{list(map(lambda x: x['network_name'], series))}") + return selected + + +def _select_distinct_data_origin(vars: List[Dict], data_origin: Dict) -> (Dict[str, List], Dict): + """ + Select distinct series regarding their data origin. Series are grouped as list according to their variable's name. + As series can be reported with different network attribution, results might contain multiple entries for a variable. + This method assumes the default data origin for chemical variables as `` (empty source) and for meteorological + variables as `REA`. + :param vars: list of all entries to check data origin for + :param data_origin: data origin to match series with, if empty default values are used + :return: dictionary with unique variable names as keys and list of respective series as values + """ + data_origin_default = {"cloudcover": "REA", "humidity": "REA", "pblheight": "REA", "press": "REA", "relhum": "REA", + "temp": "REA", "totprecip": "REA", "u": "REA", "v": "REA", + "no": "", "no2": "", "o3": "", "pm10": "", "so2": ""} + selected = {} + for var in vars: + name = var["parameter_name"].lower() + var_attr = var["parameter_attribute"].lower() + if name not in data_origin.keys(): + data_origin.update({name: data_origin_default.get(name, "")}) + attr = data_origin.get(name, "").lower() + if var_attr == attr: + selected[name] = selected.get(name, []) + helpers.to_list(var) + return selected, data_origin + + +def _save_to_pandas(df: Union[pd.DataFrame, None], data: dict, stat: str, var: str) -> pd.DataFrame: + """ + Save given data in data frame. + + If given data frame is not empty, the data is appened as new column. 
+ + :param df: data frame to append the new data, can be none + :param data: new data to append or format as data frame containing the keys 'datetime' and '<stat>' + :param stat: extracted statistic to get values from data (e.g. 'mean', 'dma8eu') + :param var: variable the data is from (e.g. 'o3') + + :return: new created or concatenated data frame + """ + if len(data["datetime"][0]) == 19: + str_format = "%Y-%m-%d %H:%M:%S" + else: + str_format = "%Y-%m-%d %H:%M" + index = map(lambda s: dt.datetime.strptime(s, str_format), data['datetime']) + if df is None: + df = pd.DataFrame(data[stat], index=index, columns=[var]) + else: + df = pd.concat([df, pd.DataFrame(data[stat], index=index, columns=[var])], axis=1) + return df + + +def _lower_list(args: List[str]) -> Iterator[str]: + """ + Lower all elements of given list. + + :param args: list with string entries to lower + + :return: iterator that lowers all list entries + """ + for string in args: + yield string.lower() + + +if __name__ == "__main__": + logging.basicConfig(level=logging.DEBUG) + var_all_dic = {'o3': 'dma8eu', 'relhum': 'average_values', 'temp': 'maximum', 'u': 'average_values', + 'v': 'average_values', 'no': 'dma8eu', 'no2': 'dma8eu', 'cloudcover': 'average_values', + 'pblheight': 'maximum'} + station = 'DEBW107' + # download_join(station, var_all_dic, sampling="daily") + download_join(station, var_all_dic, sampling="hourly") diff --git a/mlair/helpers/data_sources/toar_data.py b/mlair/helpers/data_sources/toar_data.py new file mode 100644 index 0000000000000000000000000000000000000000..27522855cbe0f3c6f0b78d1598709a694fc7b862 --- /dev/null +++ b/mlair/helpers/data_sources/toar_data.py @@ -0,0 +1,128 @@ +__author__ = "Lukas Leufen" +__date__ = "2022-07-05" + + +from typing import Union, List, Dict + +from . import join, toar_data_v2 + +import requests +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry +import pandas as pd + + +class EmptyQueryResult(Exception): + """Exception that get raised if a query to JOIN returns empty results.""" + + pass + + +def create_url(base: str, service: str, param_id: Union[str, int, None] = None, + **kwargs: Union[str, int, float, None]) -> str: + """ + Create a request url with given base url, service type and arbitrarily many additional keyword arguments. + + :param base: basic url of the rest service + :param service: service type, e.g. series, stats + :param param_id: id for a distinct service, is added between ending / of service and ? of kwargs + :param kwargs: keyword pairs for optional request specifications, e.g. 'statistics=maximum' + + :return: combined url as string + """ + url = f"{base}" + if not url.endswith("/"): + url += "/" + if service is not None: + url = f"{url}{service}" + if not url.endswith("/"): + url += "/" + if param_id is not None: + url = f"{url}{param_id}" + if len(kwargs) > 0: + url = f"{url}?{'&'.join(f'{k}={v}' for k, v in kwargs.items() if v is not None)}" + return url + + +def get_data(opts: Dict, headers: Dict, as_json: bool = True) -> Union[Dict, List, str]: + """ + Download join data using requests framework. + + Data is returned as json like structure. Depending on the response structure, this can lead to a list or dictionary. 
+ + :param opts: options to create the request url + :param headers: additional headers information like authorization, can be empty + :param as_json: extract response as json if true (default True) + + :return: requested data (either as list or dictionary) + """ + url = create_url(**opts) + try: + response = retries_session().get(url, headers=headers, timeout=(5, None)) # timeout=(open, read) + if response.status_code == 200: + return response.json() if as_json is True else response.text + else: + raise EmptyQueryResult(f"There was an error (STATUS {response.status_code}) for request {url}") + except requests.exceptions.RetryError as e: + raise EmptyQueryResult(f"There was an RetryError for request {url}: {e}") + + +def retries_session(max_retries=3): + retry_strategy = Retry(total=max_retries, + backoff_factor=0.1, + status_forcelist=[429, 500, 502, 503, 504], + method_whitelist=["HEAD", "GET", "OPTIONS"]) + adapter = HTTPAdapter(max_retries=retry_strategy) + http = requests.Session() + http.mount("https://", adapter) + http.mount("http://", adapter) + return http + + +def download_toar(station, toar_stats, sampling, data_origin): + + try: + # load data from toar-data (v2) + df_toar, meta_toar = toar_data_v2.download_toar(station, toar_stats, sampling=sampling, data_origin=data_origin) + except (AttributeError, EmptyQueryResult, KeyError, requests.ConnectionError, ValueError, IndexError): + df_toar, meta_toar = None, None + + try: + # load join data (toar-data v1) + df_join, meta_join = join.download_join(station_name=station, stat_var=toar_stats, sampling=sampling, + data_origin=data_origin) + except (AttributeError, EmptyQueryResult, KeyError, requests.ConnectionError, ValueError, IndexError): + df_join, meta_join = None, None + + # merge both data sources with priority on toar-data v2 + if df_toar is not None and df_join is not None: + df_merged = merge_toar_join(df_toar, df_join, sampling) + meta_merged = meta_toar + else: + df_merged = df_toar if df_toar is not None else df_join + meta_merged = meta_toar if df_toar is not None else meta_join + return df_merged, meta_merged + + +def merge_toar_join(df_toar, df_join, sampling): + start_date = min([df_toar.index.min(), df_join.index.min()]) + end_date = max([df_toar.index.max(), df_join.index.max()]) + freq = {"hourly": "1H", "daily": "1d"}.get(sampling) + full_time = pd.date_range(start_date, end_date, freq=freq) + full_data = df_toar.reindex(full_time) + full_data.update(df_join, overwrite=False) + return full_data + + +def correct_stat_name(stat: str) -> str: + """ + Map given statistic name to new namespace defined by mapping dict. + + Return given name stat if not element of mapping namespace. 
+ + :param stat: namespace from JOIN server + + :return: stat mapped to local namespace + """ + mapping = {'average_values': 'mean', 'maximum': 'max', 'minimum': 'min'} + return mapping.get(stat, stat) diff --git a/mlair/helpers/data_sources/toar_data_v2.py b/mlair/helpers/data_sources/toar_data_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..0fa53a7eb23f11675eeef2c12a7d5dceec3c38ac --- /dev/null +++ b/mlair/helpers/data_sources/toar_data_v2.py @@ -0,0 +1,238 @@ +"""Functions to access https://toar-data.fz-juelich.de/api/v2/""" +__author__ = 'Lukas Leufen' +__date__ = '2022-06-30' + + +import logging +from typing import Union, List, Dict +from io import StringIO + +import pandas as pd +import pytz +from timezonefinder import TimezoneFinder + +from mlair.configuration.toar_data_v2_settings import toar_data_v2_settings +from mlair.helpers import to_list +from mlair.helpers.data_sources.toar_data import EmptyQueryResult, get_data, correct_stat_name + + +str_or_none = Union[str, None] + + +def download_toar(station_name: Union[str, List[str]], stat_var: dict, + sampling: str = "daily", data_origin: Dict = None): + """ + Download data from https://toar-data.fz-juelich.de/api/v2/ + + Uses station name to indicate measurement site and keys of stat_var to indicate variable name. If data origin is + given, this method tries to load time series for this origin. In case no origin is provided, this method loads data + with the highest priority according to toar-data's order parameter. + + :param station_name: + :param stat_var: + :param sampling: + :param data_origin: + :return: + """ + + # make sure station_name parameter is a list + station_name = to_list(station_name) + + # also ensure that given data_origin dict is no reference + if data_origin is None or len(data_origin) == 0: + data_origin = None + else: + data_origin = {k: v for (k, v) in data_origin.items()} + + # get data connection settings for meta + meta_url_base, headers = toar_data_v2_settings("meta") + + # load variables + var_meta = load_variables_information(stat_var, meta_url_base, headers) + + # load station meta + station_meta = load_station_information(station_name, meta_url_base, headers) + + # load series information + timeseries_meta = load_timeseries_information(station_meta, var_meta, meta_url_base, headers, data_origin) + + # # correct stat_var values if data is not aggregated (hourly) + # if sampling == "hourly": + # stat_var = {key: "values" for key in stat_var.keys()} + + logging.info(f"load data for {station_meta['codes'][0]} from TOAR-DATA") + # get data connection settings for data + data_url_base, headers = toar_data_v2_settings(sampling) + + data_dict = {} + for var, meta in timeseries_meta.items(): + logging.debug(f"load {var}") + meta_and_opts = prepare_meta(meta, sampling, stat_var, var) + data_var = [] + for var_meta, opts in meta_and_opts: + data_var.extend(load_timeseries_data(var_meta, data_url_base, opts, headers, sampling)) + data_dict[var] = merge_data(*data_var, sampling=sampling) + data = pd.DataFrame.from_dict(data_dict) + data = correct_timezone(data, station_meta, sampling) + + meta = combine_meta_data(station_meta, {k: v[0] for k, v in timeseries_meta.items()}) + meta = pd.DataFrame.from_dict(meta, orient='index') + meta.columns = station_name + return data, meta + + +def merge_data(*args, sampling="hourly"): + start_date = min(map(lambda x: x.index.min(), args)) + end_date = max(map(lambda x: x.index.max(), args)) + freq = {"hourly": "1H", "daily": "1d"}.get(sampling) + 
full_time = pd.date_range(start_date, end_date, freq=freq) + full_data = args[0].reindex(full_time) + if not isinstance(full_data, pd.DataFrame): + full_data = full_data.to_frame() + for d in args[1:]: + full_data.update(d, overwrite=False) + return full_data.squeeze() + + +def correct_timezone(data, meta, sampling): + """ + Extract timezone information and convert data index to this timezone. + + Uses UTC if no information is provided. Note that is method only modifies data in with sampling='hourly'. In all + other cases, it returns just the given data without any change. This method expects date index of data to be in UTC. + Timezone information is not added to the index to get rid of daylight saving time and ambiguous timestamps. + """ + if sampling == "hourly": + tz_info = meta.get("timezone", "UTC") + try: + tz = pytz.timezone(tz_info) + except pytz.exceptions.UnknownTimeZoneError as e: + lon, lat = meta["coordinates"]["lng"], meta["coordinates"]["lat"] + tz = pytz.timezone(TimezoneFinder().timezone_at(lng=lon, lat=lat)) + index = data.index + index = index.tz_localize(None) + utc_offset = tz.utcoffset(index[0]) - tz.dst(index[0]) + data.index = index + utc_offset + return data + + +def prepare_meta(meta, sampling, stat_var, var): + out = [] + for m in meta: + opts = {} + if sampling == "daily": + opts["timeseries_id"] = m.pop("id") + m["id"] = None + opts["names"] = stat_var[var] + opts["sampling"] = sampling + out.append(([m], opts)) + return out + + +def combine_meta_data(station_meta, timeseries_meta): + meta = {} + for k, v in station_meta.items(): + if k == "codes": + meta[k] = v[0] + elif k in ["coordinates", "additional_metadata", "globalmeta"]: + for _key, _val in v.items(): + if _key == "lng": + meta["lon"] = _val + else: + meta[_key] = _val + elif k in ["changelog", "roles", "annotations", "aux_images", "aux_docs", "aux_urls"]: + continue + else: + meta[k] = v + for var, var_meta in timeseries_meta.items(): + for k, v in var_meta.items(): + if k in ["additional_metadata", "station", "programme", "annotations", "changelog"]: + continue + elif k == "roles": + for _key, _val in v[0]["contact"]["organisation"].items(): + new_k = f"{var}_organisation_{_key}" + meta[new_k] = _val + elif k == "variable": + for _key, _val in v.items(): + new_k = f"{var}_{_key}" + meta[new_k] = _val + else: + new_k = f"{var}_{k}" + meta[new_k] = v + return meta + + +def load_timeseries_data(timeseries_meta, url_base, opts, headers, sampling): + coll = [] + for meta in timeseries_meta: + series_id = meta["id"] + # opts = {"base": url_base, "service": f"data/timeseries/{series_id}"} + opts = {"base": url_base, "service": f"data/timeseries", "param_id": series_id, "format": "csv", **opts} + if sampling != "hourly": + opts["service"] = None + res = get_data(opts, headers, as_json=False) + data = pd.read_csv(StringIO(res), comment="#", index_col="datetime", parse_dates=True, + infer_datetime_format=True) + if len(data.index) > 0: + data = data[correct_stat_name(opts.get("names", "value"))].rename(meta["variable"]["name"]) + coll.append(data) + return coll + + +def load_station_information(station_name: List[str], url_base: str, headers: Dict): + # opts = {"base": url_base, "service": f"stationmeta/{station_name[0]}"} + opts = {"base": url_base, "service": f"stationmeta", "param_id": station_name[0]} + return get_data(opts, headers) + + +def load_timeseries_information(station_meta, var_meta, url_base: str, headers: Dict, + data_origin: Dict = None) -> [Dict, Dict]: + timeseries_id_dict = {} + missing = 
[] + for var, meta in var_meta.items(): + timeseries_id_dict[var] = [] + opts = {"base": url_base, "service": "search", "station_id": station_meta["id"], "variable_id": meta["id"]} + res = get_data(opts, headers) + if len(res) == 0: + missing.append((var, meta)) + # raise EmptyQueryResult(f"Cannot find any timeseries for station id {station_meta['id']} " + # f"({station_meta['codes'][0]}) and variable id {meta['id']} ({var}).") + if data_origin is not None: + var_origin = data_origin[var] + timeseries_id_dict[var] = select_timeseries_by_origin(res, var_origin) + # if len(timeseries_id_dict[var]) == 0: + # raise EmptyQueryResult(f"Cannot find any timeseries for station id {station_meta['id']} " + # f"({station_meta['codes'][0]}), variable id {meta['id']} ({var}) " + # f"and timeseries origin {var_origin}.") + if data_origin is None or len(timeseries_id_dict[var]) == 0: + timeseries_id_dict[var] = select_timeseries_by_order(res) + if len(missing) > 0: + missing = ",".join([f"{m[0]} ({m[1]['id']})" for m in missing]) + raise EmptyQueryResult(f"Cannot find any timeseries for station id {station_meta['id']} " + f"({station_meta['codes'][0]}) and variables {missing}.") + return timeseries_id_dict + + +def select_timeseries_by_order(toar_meta): + order_dict = {meta["order"]: meta for meta in toar_meta} + res = [order_dict[order] for order in sorted(order_dict.keys())] + return res + + +def select_timeseries_by_origin(toar_meta, var_origin): + res = [] + for origin in to_list(var_origin): + for meta in toar_meta: + for roles in meta["roles"]: + if roles["contact"]["organisation"]["name"].lower() == origin.lower(): + res.append(meta) + break + return res + + +def load_variables_information(var_dict, url_base, headers): + var_meta_dict = {} + for var in var_dict.keys(): + opts = {"base": url_base, "service": f"variables", "param_id": var} + var_meta_dict[var] = get_data(opts, headers) + return var_meta_dict diff --git a/mlair/helpers/filter.py b/mlair/helpers/filter.py index 247c4fc9c7c6d57d721c1d0895cc8c719b1bd4a5..5fc3df951ed5dec9e94ed7d34d8dc02bafddf262 100644 --- a/mlair/helpers/filter.py +++ b/mlair/helpers/filter.py @@ -214,6 +214,7 @@ class ClimateFIRFilter(FIRFilter): h = [] if self.sel_opts is not None: self.sel_opts = self.sel_opts if isinstance(self.sel_opts, dict) else {self.time_dim: self.sel_opts} + self._check_sel_opts() sampling = {1: "1d", 24: "1H"}.get(int(self.fs)) logging.debug(f"{self.display_name}: create diurnal_anomalies") if self.apriori_diurnal is True and sampling == "1H": @@ -303,6 +304,10 @@ class ClimateFIRFilter(FIRFilter): except Exception as e: logging.info(f"Could not plot climate fir filter due to following reason:\n{e}") + def _check_sel_opts(self): + if len(self.data.sel(**self.sel_opts).coords[self.time_dim]) == 0: + raise ValueError(f"Abort {self.__class__.__name__} as no data is available after applying sel_opts to data") + @staticmethod def _next_order(order: list, minimum_length: Union[int, None], pos: int, window: Union[str, tuple]) -> int: next_order = 0 diff --git a/mlair/helpers/helpers.py b/mlair/helpers/helpers.py index b583cf7dc473db96181f88b0ab26e60ee225240d..ca69f28557c6386f021b137e5861660f40b867d9 100644 --- a/mlair/helpers/helpers.py +++ b/mlair/helpers/helpers.py @@ -57,7 +57,7 @@ def to_list(obj: Any) -> List: :return: list containing obj, or obj itself (if obj was already a list) """ - if isinstance(obj, (set, tuple)): + if isinstance(obj, (set, tuple, type({}.keys()))): obj = list(obj) elif not isinstance(obj, list): obj = [obj] @@ -176,16 
+176,17 @@ def remove_items(obj: Union[List, Dict, Tuple], items: Any): raise TypeError(f"{inspect.stack()[0][3]} does not support type {type(obj)}.") -def select_from_dict(dict_obj: dict, sel_list: Any, remove_none=False): +def select_from_dict(dict_obj: dict, sel_list: Any, remove_none: bool = False, filter_cond: bool = True) -> dict: """ Extract all key values pairs whose key is contained in the sel_list. - Does not perform a check if all elements of sel_list are keys of dict_obj. Therefore the number of pairs in the - returned dict is always smaller or equal to the number of elements in the sel_list. + Does not perform a check if all elements of sel_list are keys of dict_obj. Therefore, the number of pairs in the + returned dict is always smaller or equal to the number of elements in the sel_list. If `filter_cond` is given, this + method either return the parts of the input dictionary that are included or not in `sel_list`. """ sel_list = to_list(sel_list) assert isinstance(dict_obj, dict) - sel_dict = {k: v for k, v in dict_obj.items() if k in sel_list} + sel_dict = {k: v for k, v in dict_obj.items() if (k in sel_list) is filter_cond} sel_dict = sel_dict if not remove_none else {k: v for k, v in sel_dict.items() if v is not None} return sel_dict @@ -252,6 +253,19 @@ def convert2xrda(arr: Union[xr.DataArray, xr.Dataset, np.ndarray, int, float], return xr.DataArray(arr, **kwargs) +def filter_dict_by_value(dictionary: dict, filter_val: Any, filter_cond: bool) -> dict: + """ + Filter dictionary by its values. + + :param dictionary: dict to filter + :param filter_val: search only for key value pair with a value equal to filter_val + :param filter_cond: indicate to use either all dict entries that fulfil the filter_val criteria (if `True`) or that + do not match the criteria (if `False`) + :returns: a filtered dict with either matching or non-matching elements depending on the `filter_cond` + """ + return dict(filter(lambda x: (x[1] == filter_val) is filter_cond, dictionary.items())) + + # def convert_size(size_bytes): # if size_bytes == 0: # return "0B" diff --git a/mlair/helpers/join.py b/mlair/helpers/join.py deleted file mode 100644 index 67591b29a4e4bcc8b3083869825aed09ebebaf58..0000000000000000000000000000000000000000 --- a/mlair/helpers/join.py +++ /dev/null @@ -1,275 +0,0 @@ -"""Functions to access join database.""" -__author__ = 'Felix Kleinert, Lukas Leufen' -__date__ = '2019-10-16' - -import datetime as dt -import logging -from typing import Iterator, Union, List, Dict - -import pandas as pd -import requests -from requests.adapters import HTTPAdapter -from requests.packages.urllib3.util.retry import Retry - -from mlair import helpers -from mlair.configuration.join_settings import join_settings - -# join_url_base = 'https://join.fz-juelich.de/services/rest/surfacedata/' -str_or_none = Union[str, None] - - -class EmptyQueryResult(Exception): - """Exception that get raised if a query to JOIN returns empty results.""" - - pass - - -def download_join(station_name: Union[str, List[str]], stat_var: dict, station_type: str = None, - network_name: str = None, sampling: str = "daily", data_origin: Dict = None) -> [pd.DataFrame, - pd.DataFrame]: - """ - Read data from JOIN/TOAR. - - :param station_name: Station name e.g. 
DEBY122 - :param stat_var: key as variable like 'O3', values as statistics on keys like 'mean' - :param station_type: set the station type like "traffic" or "background", can be none - :param network_name: set the measurement network like "UBA" or "AIRBASE", can be none - :param sampling: sampling rate of the downloaded data, either set to daily or hourly (default daily) - :param data_origin: additional dictionary to specify data origin as key (for variable) value (origin) pair. Valid - origins are "REA" for reanalysis data and "" (empty string) for observational data. - - :returns: data frame with all variables and statistics and meta data frame with all meta information - """ - # make sure station_name parameter is a list - station_name = helpers.to_list(station_name) - - # also ensure that given data_origin dict is no reference - data_origin = None if data_origin is None else {k: v for (k, v) in data_origin.items()} - - # get data connection settings - join_url_base, headers = join_settings(sampling) - - # load series information - vars_dict, data_origin = load_series_information(station_name, station_type, network_name, join_url_base, headers, - data_origin) - - # check if all requested variables are available - if set(stat_var).issubset(vars_dict) is False: - missing_variables = set(stat_var).difference(vars_dict) - origin = helpers.select_from_dict(data_origin, missing_variables) - options = f"station={station_name}, type={station_type}, network={network_name}, origin={origin}" - raise EmptyQueryResult(f"No data found for variables {missing_variables} and options {options} in JOIN.") - - # correct stat_var values if data is not aggregated (hourly) - if sampling == "hourly": - stat_var = {key: "values" for key in stat_var.keys()} - - # download all variables with given statistic - data = None - df = None - logging.info(f"load data for {station_name[0]} from JOIN") - for var in _lower_list(sorted(vars_dict.keys())): - if var in stat_var.keys(): - - logging.debug('load: {}'.format(var)) - - # create data link - opts = {'base': join_url_base, 'service': 'stats', 'id': vars_dict[var], 'statistics': stat_var[var], - 'sampling': sampling, 'capture': 0, 'format': 'json'} - - # load data - data = get_data(opts, headers) - - # adjust data format if given as list of list - # no branch cover because this just happens when downloading hourly data using a secret token, not available - # for CI testing. - if isinstance(data, list): # pragma: no branch - data = correct_data_format(data) - - # correct namespace of statistics - stat = _correct_stat_name(stat_var[var]) - - # store data in pandas dataframe - df = _save_to_pandas(df, data, stat, var) - - logging.debug('finished: {}'.format(var)) - - if data: - meta = pd.DataFrame.from_dict(data['metadata'], orient='index') - meta.columns = station_name - return df, meta - else: - raise EmptyQueryResult("No data found in JOIN.") - - -def correct_data_format(data): - """ - Transform to the standard data format. - - For some cases (e.g. hourly data), the data is returned as list instead of a dictionary with keys datetime, values - and metadata. This functions addresses this issue and transforms the data into the dictionary version. 
- - :param data: data in hourly format - - :return: the same data but formatted to fit with aggregated format - """ - formatted = {"datetime": [], - "values": [], - "metadata": data[-1]} - for d in data[:-1]: - for k, v in zip(["datetime", "values"], d): - formatted[k].append(v) - return formatted - - -def get_data(opts: Dict, headers: Dict) -> Union[Dict, List]: - """ - Download join data using requests framework. - - Data is returned as json like structure. Depending on the response structure, this can lead to a list or dictionary. - - :param opts: options to create the request url - :param headers: additional headers information like authorization, can be empty - - :return: requested data (either as list or dictionary) - """ - url = create_url(**opts) - response = retries_session().get(url, headers=headers, timeout=(5, None)) # timeout=(open, read) - if response.status_code == 200: - return response.json() - else: - raise EmptyQueryResult(f"There was an error (STATUS {response.status_code}) for request {url}") - - -def retries_session(max_retries=3): - retry_strategy = Retry(total=max_retries, - backoff_factor=0.1, - status_forcelist=[429, 500, 502, 503, 504], - method_whitelist=["HEAD", "GET", "OPTIONS"]) - adapter = HTTPAdapter(max_retries=retry_strategy) - http = requests.Session() - http.mount("https://", adapter) - http.mount("http://", adapter) - return http - - -def load_series_information(station_name: List[str], station_type: str_or_none, network_name: str_or_none, - join_url_base: str, headers: Dict, data_origin: Dict = None) -> [Dict, Dict]: - """ - List all series ids that are available for given station id and network name. - - :param station_name: Station name e.g. DEBW107 - :param station_type: station type like "traffic" or "background" - :param network_name: measurement network of the station like "UBA" or "AIRBASE" - :param join_url_base: base url name to download data from - :param headers: additional headers information like authorization, can be empty - :param data_origin: additional information to select a distinct series e.g. from reanalysis (REA) or from observation - ("", empty string). This dictionary should contain a key for each variable and the information as key - :return: all available series for requested station stored in an dictionary with parameter name (variable) as key - and the series id as value. - """ - opts = {"base": join_url_base, "service": "search", "station_id": station_name[0], "station_type": station_type, - "network_name": network_name, "as_dict": "true", - "columns": "id,network_name,station_id,parameter_name,parameter_label,parameter_attribute"} - station_vars = get_data(opts, headers) - logging.debug(f"{station_name}: {station_vars}") - return _select_distinct_series(station_vars, data_origin) - - -def _select_distinct_series(vars: List[Dict], data_origin: Dict = None) -> [Dict, Dict]: - """ - Select distinct series ids for all variables. Also check if a parameter is from REA or not. - """ - data_origin_default = {"cloudcover": "REA", "humidity": "REA", "pblheight": "REA", "press": "REA", "relhum": "REA", - "temp": "REA", "totprecip": "REA", "u": "REA", "v": "REA", - "no": "", "no2": "", "o3": "", "pm10": "", "so2": ""} - if data_origin is None: - data_origin = {} - # ToDo: maybe press, wdir, wspeed from obs? or also temp, ... ? 
- selected = {} - for var in vars: - name = var["parameter_name"].lower() - var_attr = var["parameter_attribute"].lower() - if name not in data_origin.keys(): - data_origin.update({name: data_origin_default.get(name, "")}) - attr = data_origin.get(name, "").lower() - if var_attr == attr: - selected[name] = var["id"] - return selected, data_origin - - -def _save_to_pandas(df: Union[pd.DataFrame, None], data: dict, stat: str, var: str) -> pd.DataFrame: - """ - Save given data in data frame. - - If given data frame is not empty, the data is appened as new column. - - :param df: data frame to append the new data, can be none - :param data: new data to append or format as data frame containing the keys 'datetime' and '<stat>' - :param stat: extracted statistic to get values from data (e.g. 'mean', 'dma8eu') - :param var: variable the data is from (e.g. 'o3') - - :return: new created or concatenated data frame - """ - if len(data["datetime"][0]) == 19: - str_format = "%Y-%m-%d %H:%M:%S" - else: - str_format = "%Y-%m-%d %H:%M" - index = map(lambda s: dt.datetime.strptime(s, str_format), data['datetime']) - if df is None: - df = pd.DataFrame(data[stat], index=index, columns=[var]) - else: - df = pd.concat([df, pd.DataFrame(data[stat], index=index, columns=[var])], axis=1) - return df - - -def _correct_stat_name(stat: str) -> str: - """ - Map given statistic name to new namespace defined by mapping dict. - - Return given name stat if not element of mapping namespace. - - :param stat: namespace from JOIN server - - :return: stat mapped to local namespace - """ - mapping = {'average_values': 'mean', 'maximum': 'max', 'minimum': 'min'} - return mapping.get(stat, stat) - - -def _lower_list(args: List[str]) -> Iterator[str]: - """ - Lower all elements of given list. - - :param args: list with string entries to lower - - :return: iterator that lowers all list entries - """ - for string in args: - yield string.lower() - - -def create_url(base: str, service: str, **kwargs: Union[str, int, float, None]) -> str: - """ - Create a request url with given base url, service type and arbitrarily many additional keyword arguments. - - :param base: basic url of the rest service - :param service: service type, e.g. series, stats - :param kwargs: keyword pairs for optional request specifications, e.g. 
'statistics=maximum' - - :return: combined url as string - """ - if not base.endswith("/"): - base += "/" - url = f"{base}{service}/?{'&'.join(f'{k}={v}' for k, v in kwargs.items() if v is not None)}" - return url - - -if __name__ == "__main__": - logging.basicConfig(level=logging.DEBUG) - var_all_dic = {'o3': 'dma8eu', 'relhum': 'average_values', 'temp': 'maximum', 'u': 'average_values', - 'v': 'average_values', 'no': 'dma8eu', 'no2': 'dma8eu', 'cloudcover': 'average_values', - 'pblheight': 'maximum'} - station = 'DEBW107' - # download_join(station, var_all_dic, sampling="daily") - download_join(station, var_all_dic, sampling="hourly") diff --git a/mlair/helpers/meteo.py b/mlair/helpers/meteo.py new file mode 100644 index 0000000000000000000000000000000000000000..c43d4ff23239f4ebff2b130779b3f8e2323620ca --- /dev/null +++ b/mlair/helpers/meteo.py @@ -0,0 +1,14 @@ + +import numpy as np + + +def relative_humidity_from_dewpoint(dew, temp): + return np.clip(100 * e_sat(dew) / e_sat(temp), 0, 100) + + +def e_sat(temperature): + a1 = 611.21 # Pa + a3 = 17.502 + a4 = 32.19 # K + T0 = 273.16 # K + return a1 * np.exp(a3 * (temperature - T0) / (temperature - a4)) diff --git a/mlair/helpers/statistics.py b/mlair/helpers/statistics.py index 7633a2a9c1842219d7af7b9c7b2b4f23a034cbdf..5f3aa45161530ff7d425ccbc7625dd7e081d8839 100644 --- a/mlair/helpers/statistics.py +++ b/mlair/helpers/statistics.py @@ -419,10 +419,13 @@ class SkillScores: skill_score.loc[["CASE III", "AIII"], iahead] = np.stack(self._climatological_skill_score( data, mu_type=3, forecast_name=forecast_name, observation_name=self.observation_name, external_data=external_data).values.flatten()) - - skill_score.loc[["CASE IV", "AIV", "BIV", "CIV"], iahead] = np.stack(self._climatological_skill_score( - data, mu_type=4, forecast_name=forecast_name, observation_name=self.observation_name, - external_data=external_data).values.flatten()) + try: + skill_score.loc[["CASE IV", "AIV", "BIV", "CIV"], iahead] = np.stack( + self._climatological_skill_score(data, mu_type=4, forecast_name=forecast_name, + observation_name=self.observation_name, + external_data=external_data).values.flatten()) + except ValueError: + pass return skill_score diff --git a/mlair/helpers/testing.py b/mlair/helpers/testing.py index eb8982ae3625cfccedf894717eebf299faffb3ee..21658ea52f194863ad709ae7efbea96a81d29cd9 100644 --- a/mlair/helpers/testing.py +++ b/mlair/helpers/testing.py @@ -1,4 +1,5 @@ """Helper functions that are used to simplify testing.""" +import logging import re from typing import Union, Pattern, List import inspect @@ -105,52 +106,70 @@ def get_all_args(*args, remove=None, add=None): return res -def check_nested_equality(obj1, obj2, precision=None): +def check_nested_equality(obj1, obj2, precision=None, skip_args=None): """Check for equality in nested structures. 
Use precision to indicate number of decimals to check for consistency""" assert precision is None or isinstance(precision, int) - + message = "" try: - print(f"check type {type(obj1)} and {type(obj2)}") + # print(f"check type {type(obj1)} and {type(obj2)}") + message = f"{type(obj1)}!={type(obj2)}\n{obj1} and {obj2} do not match" assert type(obj1) == type(obj2) - if isinstance(obj1, (tuple, list)): - print(f"check length {len(obj1)} and {len(obj2)}") + # print(f"check length {len(obj1)} and {len(obj2)}") + message = f"{len(obj1)}!={len(obj2)}\nlengths of {obj1} and {obj2} do not match" assert len(obj1) == len(obj2) for pos in range(len(obj1)): - print(f"check pos {obj1[pos]} and {obj2[pos]}") - assert check_nested_equality(obj1[pos], obj2[pos], precision) is True + # print(f"check pos {obj1[pos]} and {obj2[pos]}") + message = f"{obj1[pos]}!={obj2[pos]}\nobjects on pos {pos} of {obj1} and {obj2} do not match" + assert check_nested_equality(obj1[pos], obj2[pos], precision=precision, skip_args=skip_args) is True elif isinstance(obj1, dict): - print(f"check keys {obj1.keys()} and {obj2.keys()}") - assert sorted(obj1.keys()) == sorted(obj2.keys()) - for k in obj1.keys(): - print(f"check pos {obj1[k]} and {obj2[k]}") - assert check_nested_equality(obj1[k], obj2[k], precision) is True + obj1_keys, obj2_keys = obj1.keys(), obj2.keys() + if skip_args is not None and isinstance(skip_args, (str, list)): + skip_args = to_list(skip_args) + obj1_keys = list(set(obj1_keys).difference(skip_args)) + obj2_keys = list(set(obj2_keys).difference(skip_args)) + # print(f"check keys {obj1.keys()} and {obj2.keys()}") + message = f"{sorted(obj1_keys)}!={sorted(obj2_keys)}\n{set(obj1_keys).symmetric_difference(obj2_keys)} " \ + f"are not in both sorted key lists" + assert sorted(obj1_keys) == sorted(obj2_keys) + for k in obj1_keys: + # print(f"check pos {obj1[k]} and {obj2[k]}") + message = f"{obj1[k]}!={obj2[k]}\nobjects for key {k} of {obj1} and {obj2} do not match" + assert check_nested_equality(obj1[k], obj2[k], precision=precision, skip_args=skip_args) is True elif isinstance(obj1, xr.DataArray): if precision is None: - print(f"check xr {obj1} and {obj2}") + # print(f"check xr {obj1} and {obj2}") + message = f"{obj1}!={obj2}\n{obj1} and {obj2} do not match" assert xr.testing.assert_equal(obj1, obj2) is None else: - print(f"check xr {obj1} and {obj2} with precision {precision}") + # print(f"check xr {obj1} and {obj2} with precision {precision}") + message = f"{obj1}!={obj2} with precision {precision}\n{obj1} and {obj2} do not match" assert xr.testing.assert_allclose(obj1, obj2, atol=10**(-precision)) is None elif isinstance(obj1, np.ndarray): if precision is None: - print(f"check np {obj1} and {obj2}") + # print(f"check np {obj1} and {obj2}") + message = f"{obj1}!={obj2}\n{obj1} and {obj2} do not match" assert np.testing.assert_array_equal(obj1, obj2) is None else: - print(f"check np {obj1} and {obj2} with precision {precision}") + # print(f"check np {obj1} and {obj2} with precision {precision}") + message = f"{obj1}!={obj2} with precision {precision}\n{obj1} and {obj2} do not match" assert np.testing.assert_array_almost_equal(obj1, obj2, decimal=precision) is None else: if isinstance(obj1, (int, float)) and isinstance(obj2, (int, float)): if precision is None: - print(f"check number equal {obj1} and {obj2}") + # print(f"check number equal {obj1} and {obj2}") + message = f"{obj1}!={obj2}\n{obj1} and {obj2} do not match" assert np.testing.assert_equal(obj1, obj2) is None else: - print(f"check number equal {obj1} 
and {obj2} with precision {precision}") + # print(f"check number equal {obj1} and {obj2} with precision {precision}") + message = f"{obj1}!={obj2} with precision {precision}\n{obj1} and {obj2} do not match" assert np.testing.assert_almost_equal(obj1, obj2, decimal=precision) is None else: - print(f"check equal {obj1} and {obj2}") + # print(f"check equal {obj1} and {obj2}") + message = f"{obj1}!={obj2}\n{obj1} and {obj2} do not match" assert obj1 == obj2 except AssertionError: + logging.info(message) return False return True diff --git a/mlair/model_modules/convolutional_networks.py b/mlair/model_modules/convolutional_networks.py index 2270c1ee2abf8b17913e6017181cffcde17bd923..7bdd2ce210c126bf47dcf02c28f4efaacf789457 100644 --- a/mlair/model_modules/convolutional_networks.py +++ b/mlair/model_modules/convolutional_networks.py @@ -75,7 +75,7 @@ class CNNfromConfig(AbstractModelClass): # apply to model self.set_model() self.set_compile_options() - self.set_custom_objects(loss=custom_loss([keras.losses.mean_squared_error, var_loss]), var_loss=var_loss) + self.set_custom_objects(loss=self.compile_options["loss"][0], var_loss=var_loss) def set_model(self): x_input = keras.layers.Input(shape=self._input_shape) diff --git a/mlair/model_modules/probability_models.py b/mlair/model_modules/probability_models.py new file mode 100644 index 0000000000000000000000000000000000000000..9ffe77a5c561a4903a548062f2b015c623e66c69 --- /dev/null +++ b/mlair/model_modules/probability_models.py @@ -0,0 +1,864 @@ +""" +>>> MyCustomisedModel().model.compile(**kwargs) == MyCustomisedModel().compile(**kwargs) +True + +""" + +import mlair.model_modules.keras_extensions + +__author__ = "Felix Kleinert" +__date__ = '2022-07-08' + +import tensorflow as tf +import tensorflow.keras as keras +import tensorflow_probability as tfp +tfd = tfp.distributions +tfb = tfp.bijectors +tfpl = tfp.layers + +import logging +from mlair.model_modules import AbstractModelClass +from mlair.model_modules.inception_model import InceptionModelBase +from mlair.model_modules.flatten import flatten_tail +from mlair.model_modules.advanced_paddings import PadUtils, Padding2D, SymmetricPadding2D +from mlair.model_modules.loss import l_p_loss + + +class MyUnetProb(AbstractModelClass): + def __init__(self, input_shape: list, output_shape: list, num_of_training_samples: int): + super().__init__(input_shape[0], output_shape[0]) + self.first_filter_size = 16 # 16*2#self._input_shape[-1] # 16 + self.lstm_units = 64 * 2 # * 2 + self.kernel_size = (3, 1) # (3,1) + self.activation = "elu" + self.pool_size = (2, 1) + + self.num_of_training_samples = num_of_training_samples + # self.divergence_fn = lambda q, p, _: tfd.kl_divergence(q, p) / input_shape[0][0] + self.divergence_fn = lambda q, p, _: tfd.kl_divergence(q, p) / num_of_training_samples + + # self.loss_fn = lambda y_true, y_pred: -y_pred.log_prob(y_true) + # self.loss = nll + + self.dropout = .15 # .2 + self.k_mixed_components = 2 + self.kernel_regularizer = keras.regularizers.l1_l2(l1=0.01, l2=0.01) + self.bias_regularizer = keras.regularizers.l1_l2(l1=0.01, l2=0.01) + + self.kernel_initializer = 'he_normal' + + self.dense_units = 32 * 2 + self.initial_lr = 0.001 + + # apply to model + self.set_model() + self.set_compile_options() + self.set_custom_objects(SymmetricPadding2D=SymmetricPadding2D, loss=self.loss, divergence_fn=self.divergence_fn) + + def set_model(self): + input_train = keras.layers.Input(shape=self._input_shape) + pad_size = PadUtils.get_padding_for_same(self.kernel_size) + + c1 = 
Padding2D("SymPad2D")(padding=pad_size)(input_train) + c1 = tfpl.Convolution2DReparameterization( + self.first_filter_size, self.kernel_size, padding='valid', + kernel_prior_fn=tfpl.default_multivariate_normal_fn, + kernel_posterior_fn=tfpl.default_mean_field_normal_fn(), + kernel_divergence_fn=self.divergence_fn, + bias_prior_fn=tfpl.default_multivariate_normal_fn, + bias_posterior_fn=tfpl.default_mean_field_normal_fn(), + bias_divergence_fn=self.divergence_fn, + activation=self.activation, + )(c1) + #c1 = keras.layers.Conv2D(self.first_filter_size, self.kernel_size, activation=self.activation, + # kernel_initializer=self.kernel_initializer, kernel_regularizer=self.kernel_regularizer, + # bias_regularizer=self.bias_regularizer)(c1) + c1 = keras.layers.Dropout(self.dropout)(c1) + c1 = Padding2D("SymPad2D")(padding=pad_size)(c1) + c1 = tfpl.Convolution2DReparameterization( + self.first_filter_size, self.kernel_size, padding='valid', + kernel_prior_fn=tfpl.default_multivariate_normal_fn, + kernel_posterior_fn=tfpl.default_mean_field_normal_fn(), + kernel_divergence_fn=self.divergence_fn, + bias_prior_fn=tfpl.default_multivariate_normal_fn, + bias_posterior_fn=tfpl.default_mean_field_normal_fn(), + bias_divergence_fn=self.divergence_fn, + activation=self.activation, + name='c1' + )(c1) + #c1 = keras.layers.Conv2D(self.first_filter_size, self.kernel_size, activation=self.activation, + # kernel_initializer=self.kernel_initializer, name='c1', + # kernel_regularizer=self.kernel_regularizer, + # bias_regularizer=self.bias_regularizer)(c1) + p1 = c1 + # p1 = keras.layers.MaxPooling2D(self.pool_size)(c1) + + c2 = Padding2D("SymPad2D")(padding=pad_size)(p1) + c2 = tfpl.Convolution2DReparameterization( + self.first_filter_size * 2, self.kernel_size, padding='valid', + kernel_prior_fn=tfpl.default_multivariate_normal_fn, + kernel_posterior_fn=tfpl.default_mean_field_normal_fn(), + kernel_divergence_fn=self.divergence_fn, + bias_prior_fn=tfpl.default_multivariate_normal_fn, + bias_posterior_fn=tfpl.default_mean_field_normal_fn(), + bias_divergence_fn=self.divergence_fn, + activation=self.activation, + )(c2) + # c2 = keras.layers.Conv2D(self.first_filter_size * 2, self.kernel_size, activation=self.activation, + # kernel_initializer=self.kernel_initializer, kernel_regularizer=self.kernel_regularizer, + # bias_regularizer=self.bias_regularizer)(c2) + c2 = keras.layers.Dropout(self.dropout)(c2) + c2 = Padding2D("SymPad2D")(padding=pad_size)(c2) + c2 = tfpl.Convolution2DReparameterization( + self.first_filter_size * 2, self.kernel_size, padding='valid', + kernel_prior_fn=tfpl.default_multivariate_normal_fn, + kernel_posterior_fn=tfpl.default_mean_field_normal_fn(), + kernel_divergence_fn=self.divergence_fn, + bias_prior_fn=tfpl.default_multivariate_normal_fn, + bias_posterior_fn=tfpl.default_mean_field_normal_fn(), + bias_divergence_fn=self.divergence_fn, + activation=self.activation, name="c2" + )(c2) + # c2 = keras.layers.Conv2D(self.first_filter_size * 2, self.kernel_size, activation=self.activation, + # kernel_initializer=self.kernel_initializer, name='c2', + # kernel_regularizer=self.kernel_regularizer, + # bias_regularizer=self.bias_regularizer)(c2) + p2 = c2 + # p2 = keras.layers.MaxPooling2D(self.pool_size)(c2) + + c3 = Padding2D("SymPad2D")(padding=pad_size)(p2) + c3 = tfpl.Convolution2DReparameterization( + self.first_filter_size * 4, self.kernel_size, padding='valid', + kernel_prior_fn=tfpl.default_multivariate_normal_fn, + kernel_posterior_fn=tfpl.default_mean_field_normal_fn(), + 
kernel_divergence_fn=self.divergence_fn, + bias_prior_fn=tfpl.default_multivariate_normal_fn, + bias_posterior_fn=tfpl.default_mean_field_normal_fn(), + bias_divergence_fn=self.divergence_fn, + activation=self.activation + )(c3) + # c3 = keras.layers.Conv2D(self.first_filter_size * 4, self.kernel_size, activation=self.activation, + # kernel_initializer=self.kernel_initializer, kernel_regularizer=self.kernel_regularizer, + # bias_regularizer=self.bias_regularizer)(c3) + c3 = keras.layers.Dropout(self.dropout * 2)(c3) + c3 = Padding2D("SymPad2D")(padding=pad_size)(c3) + c3 = tfpl.Convolution2DReparameterization( + self.first_filter_size * 4, self.kernel_size, padding='valid', + kernel_prior_fn=tfpl.default_multivariate_normal_fn, + kernel_posterior_fn=tfpl.default_mean_field_normal_fn(), + kernel_divergence_fn=self.divergence_fn, + bias_prior_fn=tfpl.default_multivariate_normal_fn, + bias_posterior_fn=tfpl.default_mean_field_normal_fn(), + bias_divergence_fn=self.divergence_fn, + activation=self.activation, name="c3" + )(c3) + # c3 = keras.layers.Conv2D(self.first_filter_size * 4, self.kernel_size, activation=self.activation, + # kernel_initializer=self.kernel_initializer, name='c3', + # kernel_regularizer=self.kernel_regularizer, + # bias_regularizer=self.bias_regularizer)(c3) + # p3 = c3 + p3 = keras.layers.MaxPooling2D(self.pool_size)(c3) + + ### own LSTM Block ### + ls1 = keras.layers.Reshape((p3.shape[1], p3.shape[-1]))(p3) + ls1 = keras.layers.LSTM(self.lstm_units, return_sequences=True)(ls1) + ls1 = keras.layers.LSTM(self.lstm_units, return_sequences=True)(ls1) + c4 = keras.layers.Reshape((p3.shape[1], 1, -1))(ls1) + + ### own 2nd LSTM Block ### + ls2 = keras.layers.Reshape((c3.shape[1], c3.shape[-1]))(c3) + ls2 = keras.layers.LSTM(self.lstm_units, return_sequences=True)(ls2) + ls2 = keras.layers.LSTM(self.lstm_units, return_sequences=True)(ls2) + c4_2 = keras.layers.Reshape((c3.shape[1], 1, -1))(ls2) + + u7 = keras.layers.UpSampling2D(size=(3, 1))(c4) + cn3 = Padding2D("SymPad2D")(padding=pad_size)(c3) + # u7 = c4 + u7 = keras.layers.concatenate([u7, cn3], name="u7_c3") + c7 = u7 + # c7 = Padding2D("SymPad2D")(padding=pad_size)(u7) + c7 = tfpl.Convolution2DReparameterization( + self.first_filter_size * 4, self.kernel_size, padding='valid', + kernel_prior_fn=tfpl.default_multivariate_normal_fn, + kernel_posterior_fn=tfpl.default_mean_field_normal_fn(), + kernel_divergence_fn=self.divergence_fn, + bias_prior_fn=tfpl.default_multivariate_normal_fn, + bias_posterior_fn=tfpl.default_mean_field_normal_fn(), + bias_divergence_fn=self.divergence_fn, + activation=self.activation + )(c7) + # c7 = keras.layers.Conv2D(self.first_filter_size * 4, self.kernel_size, activation=self.activation, + # kernel_initializer=self.kernel_initializer, kernel_regularizer=self.kernel_regularizer, + # bias_regularizer=self.bias_regularizer)(c7) + c7 = keras.layers.concatenate([c7, c4_2], name="Concat_2nd_LSTM") + c7 = keras.layers.Dropout(self.dropout * 2)(c7) + c7 = Padding2D("SymPad2D")(padding=pad_size)(c7) + c7 = tfpl.Convolution2DReparameterization( + self.first_filter_size * 4, self.kernel_size, padding='valid', + kernel_prior_fn=tfpl.default_multivariate_normal_fn, + kernel_posterior_fn=tfpl.default_mean_field_normal_fn(), + kernel_divergence_fn=self.divergence_fn, + bias_prior_fn=tfpl.default_multivariate_normal_fn, + bias_posterior_fn=tfpl.default_mean_field_normal_fn(), + bias_divergence_fn=self.divergence_fn, + activation=self.activation, name='c7_to_u8' + )(c7) + # c7 = 
keras.layers.Conv2D(self.first_filter_size * 4, self.kernel_size, activation=self.activation, + # kernel_initializer=self.kernel_initializer, name='c7_to_u8', + # kernel_regularizer=self.kernel_regularizer, + # bias_regularizer=self.bias_regularizer)(c7) + + + # u8 = Padding2D("SymPad2D")(padding=pad_size)(c7) + # u8 = keras.layers.Conv2DTranspose(32, self.pool_size, strides=self.pool_size)(u8) + u8 = c7 + # u8 = c3 + u8 = keras.layers.concatenate([u8, c2], name="u8_c2") + c8 = Padding2D("SymPad2D")(padding=pad_size)(u8) + c8 = tfpl.Convolution2DReparameterization( + self.first_filter_size * 2, self.kernel_size, padding='valid', + kernel_prior_fn=tfpl.default_multivariate_normal_fn, + kernel_posterior_fn=tfpl.default_mean_field_normal_fn(), + kernel_divergence_fn=self.divergence_fn, + bias_prior_fn=tfpl.default_multivariate_normal_fn, + bias_posterior_fn=tfpl.default_mean_field_normal_fn(), + bias_divergence_fn=self.divergence_fn, + activation=self.activation + )(c8) + # c8 = keras.layers.Conv2D(self.first_filter_size * 2, self.kernel_size, activation=self.activation, + # kernel_initializer=self.kernel_initializer, kernel_regularizer=self.kernel_regularizer, + # bias_regularizer=self.bias_regularizer)(c8) + c8 = keras.layers.Dropout(self.dropout)(c8) + c8 = Padding2D("SymPad2D")(padding=pad_size)(c8) + c8 = tfpl.Convolution2DReparameterization( + self.first_filter_size * 2, self.kernel_size, padding='valid', + kernel_prior_fn=tfpl.default_multivariate_normal_fn, + kernel_posterior_fn=tfpl.default_mean_field_normal_fn(), + kernel_divergence_fn=self.divergence_fn, + bias_prior_fn=tfpl.default_multivariate_normal_fn, + bias_posterior_fn=tfpl.default_mean_field_normal_fn(), + bias_divergence_fn=self.divergence_fn, + activation=self.activation, name='c8_to_u9' + )(c8) + # c8 = keras.layers.Conv2D(self.first_filter_size * 2, self.kernel_size, activation=self.activation, + # kernel_initializer=self.kernel_initializer, name='c8_to_u9', + # kernel_regularizer=self.kernel_regularizer, + # bias_regularizer=self.bias_regularizer)(c8) + + # u9 = Padding2D("SymPad2D")(padding=pad_size)(c8) + # u9 = keras.layers.Conv2DTranspose(16, self.pool_size, strides=self.pool_size)(u9) + u9 = c8 + u9 = keras.layers.concatenate([u9, c1], name="u9_c1") + c9 = Padding2D("SymPad2D")(padding=pad_size)(u9) + c9 = tfpl.Convolution2DReparameterization( + self.first_filter_size, self.kernel_size, padding='valid', + kernel_prior_fn=tfpl.default_multivariate_normal_fn, + kernel_posterior_fn=tfpl.default_mean_field_normal_fn(), + kernel_divergence_fn=self.divergence_fn, + bias_prior_fn=tfpl.default_multivariate_normal_fn, + bias_posterior_fn=tfpl.default_mean_field_normal_fn(), + bias_divergence_fn=self.divergence_fn, + activation=self.activation, + )(c9) + # c9 = keras.layers.Conv2D(self.first_filter_size, self.kernel_size, activation=self.activation, + # kernel_initializer=self.kernel_initializer, kernel_regularizer=self.kernel_regularizer, + # bias_regularizer=self.bias_regularizer)(c9) + c9 = keras.layers.Dropout(self.dropout)(c9) + c9 = Padding2D("SymPad2D")(padding=pad_size)(c9) + c9 = tfpl.Convolution2DReparameterization( + self.first_filter_size, self.kernel_size, padding='valid', + kernel_prior_fn=tfpl.default_multivariate_normal_fn, + kernel_posterior_fn=tfpl.default_mean_field_normal_fn(), + kernel_divergence_fn=self.divergence_fn, + bias_prior_fn=tfpl.default_multivariate_normal_fn, + bias_posterior_fn=tfpl.default_mean_field_normal_fn(), + bias_divergence_fn=self.divergence_fn, + activation=self.activation, + 
)(c9) + # c9 = keras.layers.Conv2D(self.first_filter_size, self.kernel_size, activation=self.activation, + # kernel_initializer=self.kernel_initializer, name='c9', + # kernel_regularizer=self.kernel_regularizer, + # bias_regularizer=self.bias_regularizer)(c9) + + # outputs = keras.layers.Conv2D(1, (1, 1), activation='sigmoid')(c9) + dl = keras.layers.Flatten()(c9) + dl = keras.layers.Dropout(self.dropout)(dl) + + # outputs = tfpl.DenseVariational(tfpl.MultivariateNormalTriL.params_size(self._output_shape), + # make_posterior_fn=self.posterior, + # make_prior_fn=self.prior)(dl) + # outputs = tfpl.MultivariateNormalTriL(self._output_shape)(outputs) + # outputs = keras.layers.Dense(units=self._output_shape)(dl) + + #outputs = keras.layers.Dense(tfpl.IndependentNormal.params_size(self._output_shape), + # )(dl) + #outputs = tfpl.DenseVariational(units=tfpl.IndependentNormal.params_size(self._output_shape), + # #make_prior_fn=self.prior, + # make_prior_fn=prior_trainable, + # make_posterior_fn=self.posterior, + # )(dl) + #outputs = VarDense(units=tfpl.IndependentNormal.params_size(self._output_shape), + # make_prior_fn=self.prior, + # make_posterior_fn=self.posterior, + # )(dl) + + + #outputs = tfpl.IndependentNormal(self._output_shape)(outputs) + params_size = tfpl.MixtureSameFamily.params_size( + self.k_mixed_components, + component_params_size=tfpl.MultivariateNormalTriL.params_size(self._output_shape) + ) + + pars = tf.keras.layers.Dense(params_size)(dl) + # pars = DenseVariationalCustom( + # units=params_size, make_prior_fn=prior, make_posterior_fn=posterior, + # kl_use_exact=True, kl_weight=1./self.x_train_shape)(dl) + + outputs = tfpl.MixtureSameFamily(self.k_mixed_components, + tfpl.MultivariateNormalTriL( + self._output_shape, + convert_to_tensor_fn=tfp.distributions.Distribution.mode + ) + )(pars) + + self.model = keras.Model(inputs=input_train, outputs=outputs) + + def set_compile_options(self): + # self.optimizer = keras.optimizers.Adam(lr=self.initial_lr, + # clipnorm=self.clipnorm, + # ) + + # loss = nll + # self.optimizer = tf.keras.optimizers.RMSprop(learning_rate=self.initial_lr) + self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.initial_lr) + self.loss = nll + self.compile_options = {"metrics": ["mse", "mae"]} + + # loss = keras.losses.MeanSquaredError() + # self.compile_options = {"loss": [loss]} + + @staticmethod + def prior(kernel_size, bias_size, dtype=None): + n = kernel_size + bias_size + + prior_model = tf.keras.Sequential([ + + tfpl.DistributionLambda( + # Note: Our prior is a non-trianable distribution + lambda t: tfd.MultivariateNormalDiag(loc=tf.zeros(n), scale_diag=tf.ones(n))) + ]) + + return prior_model + + @staticmethod + def posterior(kernel_size, bias_size, dtype=None): + n = kernel_size + bias_size + + posterior_model = tf.keras.Sequential([ + + tfpl.VariableLayer(tfpl.MultivariateNormalTriL.params_size(n), dtype=dtype), + tfpl.MultivariateNormalTriL(n) + ]) + + return posterior_model + + + + +class MyCNNProb(AbstractModelClass): + """ + Taken fromhttps://towardsdatascience.com/uncertainty-in-deep-learning-bayesian-cnn-tensorflow-probability-758d7482bef6 + and modified to our data + """ + def __init__(self, input_shape: list, output_shape: list): + super().__init__(input_shape[0], output_shape[0]) + self.initial_lr = 0.001 + + self.divergence_fn = lambda q, p, q_tensor : self.approximate_kl(q, p, q_tensor) / 1000 # check how to get num of samples included here + # apply to model + self.set_model() + self.set_compile_options() + 
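For illustration, the mixture output head that MyUnetProb.set_model builds above is easier to follow in isolation. A minimal sketch under assumed toy shapes (the 16-feature input, the hidden Dense layer, event_size and k_components are placeholders; only the head and the negative log-likelihood loss mirror the code above):

import tensorflow as tf
import tensorflow_probability as tfp

tfd, tfpl = tfp.distributions, tfp.layers

event_size = 4     # size of the target vector, e.g. window_lead_time (assumption)
k_components = 2   # number of mixture components, as in self.k_mixed_components

inputs = tf.keras.Input(shape=(16,))                           # placeholder features
hidden = tf.keras.layers.Dense(32, activation="relu")(inputs)  # placeholder body

# a single deterministic Dense layer outputs all parameters of the mixture at once
params_size = tfpl.MixtureSameFamily.params_size(
    k_components,
    component_params_size=tfpl.MultivariateNormalTriL.params_size(event_size))
params = tf.keras.layers.Dense(params_size)(hidden)

# mixture of full-covariance Gaussians over all lead times; the mode is used
# whenever a point forecast (plain tensor) is requested from the distribution
outputs = tfpl.MixtureSameFamily(
    k_components,
    tfpl.MultivariateNormalTriL(event_size,
                                convert_to_tensor_fn=tfd.Distribution.mode))(params)

model = tf.keras.Model(inputs, outputs)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
              loss=lambda y_true, y_pred: -y_pred.log_prob(y_true))   # same idea as nll()

Training against the distribution output works because the tfp layers return a tensor-coercible distribution, so y_pred.log_prob(y_true) is available inside the Keras loss.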
self.set_custom_objects(loss_fn=self.loss_fn ) + + + @staticmethod + def loss_fn(y_true, y_pred): + return -y_pred.log_prob(y_true) + + + # For Reparameterization Layers + + @staticmethod + def custom_normal_prior(dtype, shape, name, trainable, add_variable_fn): + distribution = tfd.Normal(loc = 0.1 * tf.ones(shape, dtype), + scale = 1.5 * tf.ones(shape, dtype)) + batch_ndims = tf.size(distribution.batch_shape_tensor()) + + distribution = tfd.Independent(distribution, + reinterpreted_batch_ndims = batch_ndims) + return distribution + + @staticmethod + def laplace_prior(dtype, shape, name, trainable, add_variable_fn): + distribution = tfd.Laplace(loc = tf.zeros(shape, dtype), + scale = tf.ones(shape, dtype)) + batch_ndims = tf.size(distribution.batch_shape_tensor()) + + distribution = tfd.Independent(distribution, + reinterpreted_batch_ndims = batch_ndims) + return distribution + + + @staticmethod + def approximate_kl(q, p, q_tensor): + return tf.reduce_mean(q.log_prob(q_tensor) - p.log_prob(q_tensor)) + + + def conv_reparameterization_layer(self, filters, kernel_size, activation): + # For simplicity, we use default prior and posterior. + # In the next parts, we will use custom mixture prior and posteriors. + return tfpl.Convolution2DReparameterization( + filters = filters, + kernel_size = kernel_size, + activation = activation, + padding = 'same', + kernel_posterior_fn = tfpl.default_mean_field_normal_fn(is_singular=False), + kernel_prior_fn = tfpl.default_multivariate_normal_fn, + + bias_prior_fn = tfpl.default_multivariate_normal_fn, + bias_posterior_fn = tfpl.default_mean_field_normal_fn(is_singular=False), + + kernel_divergence_fn = self.divergence_fn, + bias_divergence_fn = self.divergence_fn) + + def set_model(self): + bayesian_cnn = tf.keras.Sequential([ + tf.keras.layers.InputLayer(self._input_shape), + self.conv_reparameterization_layer(16, 3, 'swish'), + #tf.keras.layers.MaxPooling2D(2), + self.conv_reparameterization_layer(32, 3, 'swish'), + #tf.keras.layers.MaxPooling2D(2), + self.conv_reparameterization_layer(64, 3, 'swish'), + #tf.keras.layers.MaxPooling2D(2), + self.conv_reparameterization_layer(128, 3, 'swish'), + #tf.keras.layers.GlobalMaxPooling2D(), + tfpl.DenseReparameterization( + units=tfpl.IndependentNormal.params_size(self._output_shape), activation=None, + kernel_posterior_fn=tfpl.default_mean_field_normal_fn(is_singular=False), + kernel_prior_fn=tfpl.default_multivariate_normal_fn, + bias_prior_fn=tfpl.default_multivariate_normal_fn, + bias_posterior_fn=tfpl.default_mean_field_normal_fn(is_singular=False), + kernel_divergence_fn=self.divergence_fn, + bias_divergence_fn=self.divergence_fn), + tfpl.IndependentNormal(self._output_shape) + ]) + + input_train = keras.layers.Input(shape=self._input_shape) + x = self.conv_reparameterization_layer(16, 3, 'swish')(input_train) + # tf.keras.layers.MaxPooling2D(2), + x = self.conv_reparameterization_layer(32, 3, 'swish')(x) + # tf.keras.layers.MaxPooling2D(2), + x = self.conv_reparameterization_layer(64, 3, 'swish')(x) + # tf.keras.layers.MaxPooling2D(2), + x = self.conv_reparameterization_layer(128, 3, 'swish')(x) + x = tf.keras.layers.Flatten()(x) + # x = tfpl.DenseReparameterization( + # units=tfpl.IndependentNormal.params_size(self._output_shape), activation=None, + # kernel_posterior_fn=tfpl.default_mean_field_normal_fn(is_singular=False), + # kernel_prior_fn=tfpl.default_multivariate_normal_fn, + # bias_prior_fn=tfpl.default_multivariate_normal_fn, + # 
bias_posterior_fn=tfpl.default_mean_field_normal_fn(is_singular=False), + # kernel_divergence_fn=self.divergence_fn, + # bias_divergence_fn=self.divergence_fn)(x) + # outputs = tfpl.IndependentNormal(self._output_shape)(x) + x = tf.keras.layers.Dense(tfpl.IndependentNormal.params_size(event_shape=self._output_shape))(x) + outputs = tfpl.IndependentNormal(event_shape=self._output_shape)(x) + # outputs = tfpl.DistributionLambda( + # make_distribution_fn=lambda t: tfd.Normal( + # loc=t[..., 0], scale=tf.exp(t[..., 1])), + # convert_to_tensor_fn=lambda s: s.sample(30))(x) + + + bnn = keras.Model(inputs=input_train, outputs=outputs) + self.model = bnn + + + logging.info(f"model summary:\n{self.model.summary()}") + + def set_compile_options(self): + self.optimizer = tf.keras.optimizers.Adam(lr=self.initial_lr, + # clipnorm=self.clipnorm, + ) + + loss = self.loss_fn + # self.compile_options = {"loss": [loss], "metrics": ["mse", "mae"]} + + # loss = keras.losses.MeanSquaredError() + self.compile_options = {"loss": [loss]} + + + + +class VarDense(tf.keras.layers.Layer): + + def __init__(self, + units, + make_posterior_fn, + make_prior_fn, + kl_weight=None, + kl_use_exact=False, + activation=None, + use_bias=True, + activity_regularizer=None, + **kwargs + ): + super().__init__(**kwargs) + self.units = units + self.make_posterior_fn = make_posterior_fn + self.make_prior_fn = make_prior_fn + self.kl_weight = kl_weight, + self.kl_use_exact = kl_use_exact, + self.activation = activation, + self.use_bias = use_bias, + self.activity_regularizer = activity_regularizer + self.tfpllayer = tfpl.DenseVariational(units=self.units, + make_prior_fn=self.make_prior_fn, + make_posterior_fn=self.make_posterior_fn, + kl_weight=self.kl_weight, + kl_use_exact=self.kl_use_exact, + use_bias=self.use_bias, + activity_regularizer=self.activity_regularizer + ) + + def call(self, inputs): + return self.tfpllayer(inputs) + + + + + def get_config(self): + config = super().get_config().copy() + config.update({ + "units": self.units, + "make_posterior_fn": self.make_posterior_fn, + "make_prior_fn": self.make_prior_fn, + "kl_weight": self.kl_weight, + "kl_use_exact": self.kl_use_exact, + "activation": self.activation, + "use_bias": self.use_bias, + "activity_regularizer": self.activity_regularizer, + }) + return config + + +def prior_trainable(kernel_size, bias_size=0, dtype=None): + n = kernel_size + bias_size + return tf.keras.Sequential([ + tfp.layers.VariableLayer(n, dtype=dtype), + tfp.layers.DistributionLambda(lambda t: tfd.Independent( + tfd.Normal(loc=t, scale=1), + reinterpreted_batch_ndims=1)), + ]) + + +class ProbTestModel(AbstractModelClass): + def __init__(self, input_shape: list, output_shape: list): + super().__init__(input_shape[0], output_shape[0]) + self.initial_lr = 0.001 + self.loss = nll + + self.set_model() + self.set_compile_options() + self.set_custom_objects(nll=nll) + + def set_model(self): + + x_in = keras.layers.Input(self._input_shape) + x = keras.layers.Conv2D(kernel_size=(3,1), filters=8, + activation='relu', padding="same")(x_in) + x = keras.layers.Flatten()(x) + x = keras.layers.Dense(tfpl.IndependentNormal.params_size(self._output_shape))(x) + out = tfpl.IndependentNormal(self._output_shape)(x) + model = keras.Model(inputs=x_in, outputs=out) + + + #model = tf.keras.Sequential([ + # keras.layers.InputLayer(self._input_shape), + # keras.layers.Conv2D(kernel_size=(3,1), filters=8, + # activation='relu', padding="same"), + + # keras.layers.Flatten(), + + # 
keras.layers.Dense(tfpl.IndependentNormal.params_size(self._output_shape)), + # tfpl.IndependentNormal(self._output_shape, + # convert_to_tensor_fn=tfp.distributions.Distribution.sample + # ) + + #]) + self.model = model + logging.info(self.model.summary()) + + def set_compile_options(self): + self.optimizer = tf.keras.optimizers.RMSprop(lr=self.initial_lr, + # clipnorm=self.clipnorm, + ) + +class ProbTestModel2(AbstractModelClass): + def __init__(self, input_shape: list, output_shape: list): + super().__init__(input_shape[0], output_shape[0]) + self.initial_lr = 0.001 + self.loss = nll + self.divergence_fn = lambda q, p, _: tfd.kl_divergence(q, p) / input_shape[0][0] + + self.set_model() + self.set_compile_options() + self.set_custom_objects(nll=nll) + + def set_model(self): + model = tf.keras.Sequential([ + tf.keras.layers.InputLayer(self._input_shape), + #tf.keras.layers.Conv2D(kernel_size=(3,1), filters=8, + # activation='relu', padding="same"), + Convolution2DReparameterizationCustom( + 8, (3,1), padding='same', + kernel_prior_fn=tfpl.default_multivariate_normal_fn, + kernel_posterior_fn=tfpl.default_mean_field_normal_fn(), + kernel_divergence_fn=self.divergence_fn, + bias_prior_fn=tfpl.default_multivariate_normal_fn, + bias_posterior_fn=tfpl.default_mean_field_normal_fn(), + bias_divergence_fn=self.divergence_fn, + activation='relu'), + + tf.keras.layers.Flatten(), + + tf.keras.layers.Dense(tfpl.MultivariateNormalTriL.params_size(self._output_shape)), + tfpl.MultivariateNormalTriL(self._output_shape, + convert_to_tensor_fn=tfp.distributions.Distribution.mode + ) + + ]) + self.model = model + logging.info(self.model.summary()) + + def set_compile_options(self): + self.optimizer = tf.keras.optimizers.RMSprop(lr=self.initial_lr, + # clipnorm=self.clipnorm, + ) + + + + +class ProbTestModel3(AbstractModelClass): + def __init__(self, input_shape: list, output_shape: list): + super().__init__(input_shape[0], output_shape[0]) + + self.x_train_shape=100. + self.set_model() + self.set_compile_options() + self.set_custom_objects(nll=nll) + + def set_model(self): + model = tf.keras.Sequential([ + keras.layers.Flatten(input_shape=self._input_shape), + # Epistemic uncertainty + tfpl.DenseVariational(units=8, + make_prior_fn=prior, + make_posterior_fn=posterior, + kl_weight=1/self.x_train_shape, + kl_use_exact=False, + activation='sigmoid'), + + tfpl.DenseVariational(units=tfpl.IndependentNormal.params_size(1), + make_prior_fn=prior, + make_posterior_fn=posterior, + kl_use_exact=False, + kl_weight=1/self.x_train_shape), + + # Aleatoric uncertainty + tfpl.IndependentNormal(1) + ]) + logging.warning(model.summary()) + self.model = model + + def set_compile_options(self): + self.optimizer = tf.keras.optimizers.RMSprop() + + +class ProbTestModel4(AbstractModelClass): + def __init__(self, input_shape: list, output_shape: list): + super().__init__(input_shape[0], output_shape[0]) + + self.x_train_shape = 100. 
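Usage note for the variational test models around here: because DenseVariational (and the reparameterization layers) resample their weights on every forward pass while IndependentNormal carries the learned observation noise, epistemic and aleatoric uncertainty can be separated by repeated prediction. A minimal sketch, assuming model is one of these models and x a prepared input batch:

import numpy as np

def predict_with_uncertainty(model, x, n_passes=50):
    """Collect several stochastic forward passes of a variational model."""
    means, stddevs = [], []
    for _ in range(n_passes):
        dist = model(x)                       # tfp distribution, e.g. IndependentNormal
        means.append(dist.mean().numpy())
        stddevs.append(dist.stddev().numpy())
    means = np.stack(means)
    epistemic = means.std(axis=0)               # spread caused by sampled weights
    aleatoric = np.stack(stddevs).mean(axis=0)  # learned observation noise
    return means.mean(axis=0), epistemic, aleatoric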
+ self.set_model() + self.set_compile_options() + self.set_custom_objects(nll=nll) + + def set_model(self): + model = tf.keras.Sequential([ + keras.layers.Flatten(input_shape=self._input_shape), + # Epistemic uncertainty + DenseVariationalCustom(units=8, + make_prior_fn=prior, + make_posterior_fn=posterior, + kl_weight=1 / self.x_train_shape, + kl_use_exact=False, + activation='sigmoid'), + + DenseVariationalCustom(units=tfpl.IndependentNormal.params_size(self._output_shape), + make_prior_fn=prior, + make_posterior_fn=posterior, + kl_use_exact=False, + kl_weight=1 / self.x_train_shape), + + # Aleatoric uncertainty + tfpl.IndependentNormal(self._output_shape) + ]) + logging.warning(model.summary()) + self.model = model + + def set_compile_options(self): + self.optimizer = tf.keras.optimizers.RMSprop() + self.loss = nll + + + +class ProbTestModelMixture(AbstractModelClass): + def __init__(self, input_shape: list, output_shape: list): + super().__init__(input_shape[0], output_shape[0]) + self.initial_lr = 0.001 + self.loss = nll + self.divergence_fn = lambda q, p, _: tfd.kl_divergence(q, p) / input_shape[0][0] + self.k_mixed_components = 2 + + self.set_model() + self.set_compile_options() + self.set_custom_objects(nll=nll) + + def set_model(self): + x_input = tf.keras.layers.Input(self._input_shape) + #tf.keras.layers.Conv2D(kernel_size=(3,1), filters=8, + # activation='relu', padding="same"), + x = Convolution2DReparameterizationCustom( + 8, (3, 1), padding='same', + kernel_prior_fn=tfpl.default_multivariate_normal_fn, + kernel_posterior_fn=tfpl.default_mean_field_normal_fn(), + kernel_divergence_fn=self.divergence_fn, + bias_prior_fn=tfpl.default_multivariate_normal_fn, + bias_posterior_fn=tfpl.default_mean_field_normal_fn(), + bias_divergence_fn=self.divergence_fn, + activation='relu', + )(x_input) + + x = tf.keras.layers.Flatten()(x) + + params_size = tfpl.MixtureSameFamily.params_size( + self.k_mixed_components, + component_params_size=tfpl.MultivariateNormalTriL.params_size(self._output_shape) + ) + + x = tf.keras.layers.Dense(params_size)(x) + # tfpl.MultivariateNormalTriL(self._output_shape, + # convert_to_tensor_fn=tfp.distributions.Distribution.mode + # ) + out = tfpl.MixtureSameFamily(self.k_mixed_components, tfpl.MultivariateNormalTriL(self._output_shape, + convert_to_tensor_fn=tfp.distributions.Distribution.mode + ))(x) + + self.model = tf.keras.Model(inputs=[x_input], outputs=out) + logging.info(self.model.summary()) + + def set_compile_options(self): + self.optimizer = tf.keras.optimizers.RMSprop(lr=self.initial_lr, + # clipnorm=self.clipnorm, + ) + + + +def nll(y_true, y_pred): + """ + This function should return the negative log-likelihood of each sample + in y_true given the predicted distribution y_pred. If y_true is of shape + [B, E] and y_pred has batch shape [B] and event_shape [E], the output + should be a Tensor of shape [B]. 
+ """ + return -y_pred.log_prob(y_true) + + +# Posterior +def posterior(kernel_size, bias_size, dtype=None): + + n = kernel_size + bias_size + + posterior_model = tf.keras.Sequential([ + + tfpl.VariableLayer(tfpl.MultivariateNormalTriL.params_size(n), dtype=dtype), + tfpl.MultivariateNormalTriL(n) + ]) + + return posterior_model + +# Prior - diagonal MVN ~ N(0, 1) +def prior(kernel_size, bias_size, dtype=None): + + n = kernel_size + bias_size + + prior_model = tf.keras.Sequential([ + + tfpl.DistributionLambda( + # Note: Our prior is a non-trianable distribution + lambda t: tfd.MultivariateNormalDiag(loc=tf.zeros(n), scale_diag=tf.ones(n))) + ]) + + return prior_model + + +class DenseVariationalCustom(tfpl.DenseVariational): + """ + Trying to implement a DensVar that can be stored: + https://github.com/tensorflow/probability/commit/0ca065fb526b50ce38b68f7d5b803f02c78c8f16# + """ + + def get_config(self): + config = super().get_config().copy() + config.update({ + 'units': self.units, + 'make_posterior_fn': self._make_posterior_fn, + 'make_prior_fn': self._make_prior_fn + }) + return config + + +class Convolution2DReparameterizationCustom(tfpl.Convolution2DReparameterization): + def get_config(self): + config = super().get_config().copy() + config.update({ + # 'units': self.units, + # 'make_posterior_fn': self._make_posterior_fn, + # 'make_prior_fn': self._make_prior_fn, + # 'kernel_divergence_fn': self.divergence_fn, + }) + return config + + +if __name__ == "__main__": + + mylayer = DenseVariationalCustom(units=8, + make_prior_fn=prior, + make_posterior_fn=posterior, + kl_weight=1/100., + kl_use_exact=False, + activation='sigmoid') + + print(mylayer) + + +#### How to access mixture model parameters: +# https://stackoverflow.com/questions/65918888/mixture-parameters-from-a-tensorflow-probability-mixture-density-network +# from MLAir perspective: +#gm = self.model.model(input_data) +# +#mixing parameters +#gm.mixture_distribution.probs_parameter() +# +#for parameters see keys and select +#gm.components_distribution.parameters.keys() diff --git a/mlair/plotting/abstract_plot_class.py b/mlair/plotting/abstract_plot_class.py index 21e5d9413b490a4be5281c2a80308be558fe64c8..a26023bb6cb8772623479491ac8bcc731dd42223 100644 --- a/mlair/plotting/abstract_plot_class.py +++ b/mlair/plotting/abstract_plot_class.py @@ -72,7 +72,10 @@ class AbstractPlotClass: # pragma: no cover self._update_rc_params() def __del__(self): - plt.close('all') + try: + plt.close('all') + except ImportError: + pass def _plot(self, *args): """Abstract plot class needs to be implemented in inheritance.""" diff --git a/mlair/reference_models/abstract_reference_model.py b/mlair/reference_models/abstract_reference_model.py index e187e7ef62e3fe84f7ba2149a490f63ac718308f..f400447385be2f29e2ebf969ef16f3df0a67fd99 100644 --- a/mlair/reference_models/abstract_reference_model.py +++ b/mlair/reference_models/abstract_reference_model.py @@ -17,13 +17,14 @@ class AbstractReferenceModel(ABC): def __init__(self, *args, **kwargs): pass - def make_reference_available_locally(self): + def make_reference_available_locally(self, *args): raise NotImplementedError @staticmethod def is_reference_available_locally(reference_path) -> bool: """ Checks if reference is available locally + :param reference_path: look in this path for data """ try: diff --git a/mlair/reference_models/reference_model_cams.py b/mlair/reference_models/reference_model_cams.py new file mode 100644 index 
0000000000000000000000000000000000000000..1db19c05a846ec948d3eda71727d11dd597643fa --- /dev/null +++ b/mlair/reference_models/reference_model_cams.py @@ -0,0 +1,56 @@ +__author__ = "Lukas Leufen" +__date__ = "2022-06-27" + + +from mlair.configuration.path_config import check_path_and_create +from mlair.reference_models.abstract_reference_model import AbstractReferenceModel +import os +import xarray as xr +import pandas as pd + + +class CAMSforecast(AbstractReferenceModel): + + def __init__(self, ref_name: str, ref_store_path: str = None, data_path: str = None): + + super().__init__() + self.ref_name = ref_name + if ref_store_path is None: + ref_store_path = f"{self.ref_name}/" + self.ref_store_path = ref_store_path + if data_path is None: + self.data_path = os.path.abspath(".") + else: + self.data_path = os.path.abspath(data_path) + self.file_pattern = "forecasts_%s_test.nc" + self.time_dim = "index" + self.ahead_dim = "ahead" + self.type_dim = "type" + + def make_reference_available_locally(self, stations): + "dma8eu_ENS_FORECAST_2019-04-09.nc" + missing_stations = self.list_locally_available_references(self.ref_store_path, stations) + if len(missing_stations) > 0: + check_path_and_create(self.ref_store_path) + dataset = xr.open_mfdataset(os.path.join(self.data_path, "dma8eu_ENS_FORECAST_*.nc")) + darray = dataset.to_array().sortby(["longitude", "latitude"]) + for station, coords in missing_stations.items(): + lon, lat = coords["lon"], coords["lat"] + station_data = darray.sel(longitude=lon, latitude=lat, method="nearest", drop=True).squeeze(drop=True) + station_data = station_data.expand_dims(dim={self.type_dim: [self.ref_name]}).compute() + station_data.coords[self.time_dim] = station_data.coords[self.time_dim] - pd.Timedelta(days=1) + station_data.coords[self.ahead_dim] = station_data.coords[self.ahead_dim] + 1 + file_name = self.file_pattern % str(station) + station_data.to_netcdf(os.path.join(self.ref_store_path, file_name)) + + @staticmethod + def list_locally_available_references(reference_path, stations) -> dict: + try: + file_list = os.listdir(reference_path) + if len(file_list) > 0: + res = {k: v for k, v in stations.items() if all(k not in x for x in file_list)} + else: + res = stations + except FileNotFoundError: + res = stations + return res diff --git a/mlair/run_modules/experiment_setup.py b/mlair/run_modules/experiment_setup.py index d807db14c96a4a30fde791e54c8b1b32e519fb9c..f89633cbe0f80f26dbb2481ca24a7fd294ee6888 100644 --- a/mlair/run_modules/experiment_setup.py +++ b/mlair/run_modules/experiment_setup.py @@ -10,7 +10,7 @@ from dill.source import getsource from mlair.configuration import path_config from mlair import helpers -from mlair.configuration.defaults import DEFAULT_STATIONS, DEFAULT_VAR_ALL_DICT, DEFAULT_NETWORK, DEFAULT_STATION_TYPE, \ +from mlair.configuration.defaults import DEFAULT_STATIONS, DEFAULT_VAR_ALL_DICT, DEFAULT_STATION_TYPE, \ DEFAULT_START, DEFAULT_END, DEFAULT_WINDOW_HISTORY_SIZE, DEFAULT_OVERWRITE_LOCAL_DATA, \ DEFAULT_HPC_LOGIN_LIST, DEFAULT_HPC_HOST_LIST, DEFAULT_CREATE_NEW_MODEL, DEFAULT_TRAIN_MODEL, \ DEFAULT_FRACTION_OF_TRAINING, DEFAULT_EXTREME_VALUES, DEFAULT_EXTREMES_ON_RIGHT_TAIL_ONLY, DEFAULT_PERMUTE_DATA, \ @@ -23,8 +23,8 @@ from mlair.configuration.defaults import DEFAULT_STATIONS, DEFAULT_VAR_ALL_DICT, DEFAULT_USE_MULTIPROCESSING, DEFAULT_USE_MULTIPROCESSING_ON_DEBUG, DEFAULT_MAX_NUMBER_MULTIPROCESSING, \ DEFAULT_FEATURE_IMPORTANCE_BOOTSTRAP_TYPE, DEFAULT_FEATURE_IMPORTANCE_BOOTSTRAP_METHOD, DEFAULT_OVERWRITE_LAZY_DATA, \ 
DEFAULT_UNCERTAINTY_ESTIMATE_BLOCK_LENGTH, DEFAULT_UNCERTAINTY_ESTIMATE_EVALUATE_COMPETITORS, \ - DEFAULT_UNCERTAINTY_ESTIMATE_N_BOOTS, DEFAULT_DO_UNCERTAINTY_ESTIMATE, DEFAULT_EARLY_STOPPING_EPOCHS, \ - DEFAULT_RESTORE_BEST_MODEL_WEIGHTS + DEFAULT_UNCERTAINTY_ESTIMATE_N_BOOTS, DEFAULT_DO_UNCERTAINTY_ESTIMATE, DEFAULT_CREATE_SNAPSHOT, \ + DEFAULT_EARLY_STOPPING_EPOCHS, DEFAULT_RESTORE_BEST_MODEL_WEIGHTS, DEFAULT_COMPETITORS from mlair.data_handler import DefaultDataHandler from mlair.run_modules.run_environment import RunEnvironment from mlair.model_modules.fully_connected_networks import FCN_64_32_16 as VanillaModel @@ -196,7 +196,12 @@ class ExperimentSetup(RunEnvironment): :param transformation_file: Use transformation options from this file for transformation :param calculate_fresh_transformation: can either be True or False, indicates if new transformation options should be calculated in any case (transformation_file is not used in this case!). - + :param snapshot_path: path to store snapshot of current run (default inside experiment path) + :param create_snapshot: indicate if a snapshot is taken from current run or not (default False) + :param snapshot_load_path: path to load a snapshot from (default None). In contrast to `snapshot_path`, which is + only for storing a snapshot, `snapshot_load_path` indicates where to load the snapshot from. If this parameter + is not provided at all, no snapshot is loaded. Note, the workflow will apply the default preprocessing without + loading a snapshot only if this parameter is None! """ def __init__(self, @@ -236,7 +241,8 @@ class ExperimentSetup(RunEnvironment): overwrite_lazy_data: bool = None, uncertainty_estimate_block_length: str = None, uncertainty_estimate_evaluate_competitors: bool = None, uncertainty_estimate_n_boots: int = None, do_uncertainty_estimate: bool = None, model_display_name: str = None, transformation_file: str = None, - calculate_fresh_transformation: bool = None, **kwargs): + calculate_fresh_transformation: bool = None, snapshot_load_path: str = None, + create_snapshot: bool = None, snapshot_path: str = None, **kwargs): # create run framework super().__init__() @@ -312,6 +318,13 @@ class ExperimentSetup(RunEnvironment): self._set_param("tmp_path", None, os.path.join(experiment_path, "tmp")) path_config.check_path_and_create(self.data_store.get("tmp_path"), remove_existing=True) + # snapshot settings + self._set_param("snapshot_path", snapshot_path, default=os.path.join(experiment_path, "snapshot")) + path_config.check_path_and_create(self.data_store.get("snapshot_path"), remove_existing=False) + self._set_param("create_snapshot", create_snapshot, default=DEFAULT_CREATE_SNAPSHOT) + if snapshot_load_path is not None: + self._set_param("snapshot_load_path", snapshot_load_path) + # setup for data self._set_param("stations", stations, default=DEFAULT_STATIONS, apply=helpers.to_list) self._set_param("statistics_per_var", statistics_per_var, default=DEFAULT_VAR_ALL_DICT) @@ -404,7 +417,7 @@ class ExperimentSetup(RunEnvironment): raise IndexError(f"Given model_display_name {model_display_name} is also present in the competitors " f"variable {competitors}. To assure a proper workflow it is required to have unique names " f"for each model and competitor. 
Please use a different model display name or competitor.") - self._set_param("competitors", competitors, default=[]) + self._set_param("competitors", competitors, default=DEFAULT_COMPETITORS) competitor_path_default = os.path.join(self.data_store.get("data_path"), "competitors", "_".join(self.data_store.get("target_var"))) self._set_param("competitor_path", competitor_path, default=competitor_path_default) diff --git a/mlair/run_modules/model_setup.py b/mlair/run_modules/model_setup.py index eab8012b983a0676620bbc66f65ff79b31165aeb..b51a3f9c76ace4feef72e4c96945b463fb69a673 100644 --- a/mlair/run_modules/model_setup.py +++ b/mlair/run_modules/model_setup.py @@ -74,6 +74,9 @@ class ModelSetup(RunEnvironment): # set channels depending on inputs self._set_shapes() + # set number of training samples (total) + self._set_num_of_training_samples() + # build model graph using settings from my_model_settings() self.build_model() @@ -103,6 +106,19 @@ class ModelSetup(RunEnvironment): shape = list(map(lambda y: y.shape[1:], self.data_store.get("data_collection", "train")[0].get_Y())) self.data_store.set("output_shape", shape, self.scope) + def _set_num_of_training_samples(self): + """ Set number of training samples - needed for example for Bayesian NNs""" + samples = 0 + for s in self.data_store.get("data_collection", "train"): + if isinstance(s.get_Y(), list): + s_sam = s.get_Y()[0].shape[0] + elif isinstance(s.get_Y(), tuple): + s_sam = s.get_Y().shape[0] + else: + s_sam = np.nan + samples += s_sam + self.num_of_training_samples = samples + def compile_model(self): """ Compiles the keras model. Compile options are mandatory and have to be set by implementing set_compile() method @@ -162,6 +178,11 @@ class ModelSetup(RunEnvironment): """Build model using input and output shapes from data store.""" model = self.data_store.get("model_class") args_list = model.requirements() + if "num_of_training_samples" in args_list: + self.data_store.set("num_of_training_samples", self.num_of_training_samples, scope=self.scope) + logging.info(f"Store number of training samples ({self.num_of_training_samples}) in data_store: " + f"self.data_store.set('num_of_training_samples', {self.num_of_training_samples}, scope='{self.scope}')") + args = self.data_store.create_args_dict(args_list, self.scope) self.model = model(**args) self.get_model_settings() @@ -185,9 +206,12 @@ class ModelSetup(RunEnvironment): def plot_model(self): # pragma: no cover """Plot model architecture as `<model_name>.pdf`.""" - with tf.device("/cpu:0"): - file_name = f"{self.model_name.rsplit('.', 1)[0]}.pdf" - keras.utils.plot_model(self.model, to_file=file_name, show_shapes=True, show_layer_names=True) + try: + with tf.device("/cpu:0"): + file_name = f"{self.model_name.rsplit('.', 1)[0]}.pdf" + keras.utils.plot_model(self.model, to_file=file_name, show_shapes=True, show_layer_names=True) + except Exception as e: + logging.info(f"Can not plot model due to: {e}") def report_model(self): # report model settings @@ -200,10 +224,13 @@ class ModelSetup(RunEnvironment): if v is None: continue if isinstance(v, list): - if isinstance(v[0], dict): - v = ["{" + vi + "}" for vi in [",".join(f"{_f(str(uk))}:{_f(str(uv))}" for uk, uv in d.items()) for d in v]] + if len(v) > 0: + if isinstance(v[0], dict): + v = ["{" + vi + "}" for vi in [",".join(f"{_f(str(uk))}:{_f(str(uv))}" for uk, uv in d.items()) for d in v]] + else: + v = ",".join(_f(str(u)) for u in v) else: - v = ",".join(_f(str(u)) for u in v) + v = "[]" if "<" in str(v): v = _f(str(v)) df.loc[k] = 
str(v) diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py index 00d82f3c6f48c3560e31d62b5bed4ddbd2bc49be..a48a82b25804da34d07807073b0c153408e4e028 100644 --- a/mlair/run_modules/post_processing.py +++ b/mlair/run_modules/post_processing.py @@ -8,11 +8,13 @@ import logging import os import sys import traceback +import copy from typing import Dict, Tuple, Union, List, Callable import numpy as np import pandas as pd import xarray as xr +import datetime as dt from mlair.configuration import path_config from mlair.data_handler import Bootstraps, KerasIterator @@ -33,7 +35,7 @@ class PostProcessing(RunEnvironment): Perform post-processing for performance evaluation. Schedule of post-processing: - #. train a ordinary least squared model (ols) for reference + #. train an ordinary least squared model (ols) for reference #. create forecasts for nn, ols, and persistence #. evaluate feature importance with bootstrapped predictions #. calculate skill scores @@ -220,8 +222,9 @@ class PostProcessing(RunEnvironment): for station in all_stations: # test data external_data = self._get_external_data(station, self.forecast_path) - if external_data is not None: - pass + if external_data is None: + logging.info(f"skip calculate_block_mse for {station} as no external_data are available") + continue # competitors if evaluate_competitors is True: competitor = self.load_competitors(station) @@ -261,11 +264,17 @@ class PostProcessing(RunEnvironment): """Ensure time dimension to be equidistant. Sometimes dates if missing values have been dropped.""" start_data = data.coords[dim].values[0] freq = {"daily": "1D", "hourly": "1H"}.get(sampling) - datetime_index = pd.DataFrame(index=pd.date_range(start, end, freq=freq)) + _ind = pd.date_range(start, end, freq=freq) # two steps required to include all hours of end interval + datetime_index = pd.DataFrame(index=pd.date_range(_ind.min(), _ind.max() + dt.timedelta(days=1), + closed="left", freq=freq)) t = data.sel({dim: start_data}, drop=True) res = xr.DataArray(coords=[datetime_index.index, *[t.coords[c] for c in t.coords]], dims=[dim, *t.coords]) res = res.transpose(*data.dims) - res.loc[data.coords] = data + if data.shape == res.shape: + res.loc[data.coords] = data + else: + _d = data.sel({dim: slice(start, end)}) + res.loc[_d.coords] = _d return res def load_competitors(self, station_name: str) -> xr.DataArray: @@ -341,7 +350,8 @@ class PostProcessing(RunEnvironment): return d[..., pos] # forecast - with TimeTracking(name=f"{inspect.stack()[0].function} ({bootstrap_type}, {bootstrap_method})"): + with TimeTracking(name=f"{inspect.stack()[0].function} ({bootstrap_type}, {bootstrap_method})", + log_on_enter=True): # extract all requirements from data store number_of_bootstraps = self.data_store.get("n_boots", "feature_importance") dims = [self.uncertainty_estimate_boot_dim, self.index_dim, self.ahead_dim, self.model_type_dim] @@ -610,19 +620,16 @@ class PostProcessing(RunEnvironment): try: if "PlotStationMap" in plot_list: - if self.data_store.get("hostname")[:2] in self.data_store.get("hpc_hosts") or self.data_store.get( - "hostname")[:6] in self.data_store.get("hpc_hosts"): - logging.warning( - f"Skip 'PlotStationMap` because running on a hpc node: {self.data_store.get('hostname')}") - else: - gens = [(self.train_data, {"marker": 5, "ms": 9}), - (self.val_data, {"marker": 6, "ms": 9}), - (self.test_data, {"marker": 4, "ms": 9})] - PlotStationMap(generators=gens, plot_folder=self.plot_path) - gens = [(self.train_val_data, {"marker": 8, 
"ms": 9}), - (self.test_data, {"marker": 9, "ms": 9})] - PlotStationMap(generators=gens, plot_folder=self.plot_path, plot_name="station_map_var") + gens = [(self.train_data, {"marker": 5, "ms": 9}), + (self.val_data, {"marker": 6, "ms": 9}), + (self.test_data, {"marker": 4, "ms": 9})] + PlotStationMap(generators=gens, plot_folder=self.plot_path) + gens = [(self.train_val_data, {"marker": 8, "ms": 9}), + (self.test_data, {"marker": 9, "ms": 9})] + PlotStationMap(generators=gens, plot_folder=self.plot_path, plot_name="station_map_var") except Exception as e: + if self.data_store.get("hostname")[:2] in self.data_store.get("hpc_hosts") or self.data_store.get("hostname")[:6] in self.data_store.get("hpc_hosts"): + logging.info(f"PlotStationMap might have failed as current workflow is running on hpc node {self.data_store.get('hostname')}. To download geographic elements, please run PlotStationMap once on login node.") logging.error(f"Could not create plot PlotStationMap due to the following error: {e}" f"\n{sys.exc_info()[0]}\n{sys.exc_info()[1]}\n{sys.exc_info()[2]}") @@ -691,8 +698,12 @@ class PostProcessing(RunEnvironment): @TimeTrackingWrapper def train_ols_model(self): """Train ordinary least squared model on train data.""" - logging.info(f"start train_ols_model on train data") - self.ols_model = OrdinaryLeastSquaredModel(self.train_data) + if "ols" in map(lambda x: x.lower(), self.competitors): + logging.info(f"start train_ols_model on train data") + self.ols_model = OrdinaryLeastSquaredModel(self.train_data) + self.competitors = [e for e in self.competitors if e.lower() != "ols"] + else: + logging.info(f"Skip train ols model as it is not present in competitors.") @TimeTrackingWrapper def make_prediction(self, subset): @@ -716,20 +727,26 @@ class PostProcessing(RunEnvironment): # get scaling parameters transformation_func = data.apply_transformation + nn_output = self.model.predict(input_data) + for normalised in [True, False]: # create empty arrays nn_prediction, persistence_prediction, ols_prediction, observation = self._create_empty_prediction_arrays( target_data, count=4) # nn forecast - nn_prediction = self._create_nn_forecast(input_data, nn_prediction, transformation_func, normalised) + nn_prediction = self._create_nn_forecast(copy.deepcopy(nn_output), nn_prediction, transformation_func, normalised) # persistence persistence_prediction = self._create_persistence_forecast(observation_data, persistence_prediction, transformation_func, normalised) # ols - ols_prediction = self._create_ols_forecast(input_data, ols_prediction, transformation_func, normalised) + if self.ols_model is not None: + ols_prediction = self._create_ols_forecast(input_data, ols_prediction, transformation_func, + normalised) + else: + ols_prediction = None # observation observation = self._create_observation(target_data, observation, transformation_func, normalised) @@ -761,6 +778,7 @@ class PostProcessing(RunEnvironment): indicated by `station_name`. The name of the competitor is set in the `type` axis as indicator. This method will raise either a `FileNotFoundError` or `KeyError` if no competitor could be found for the given station. Either there is no file provided in the expected path or no forecast for given `competitor_name` in the forecast file. + Forecast is trimmed on interval start and end of test subset. 
:param station_name: name of the station to load data for :param competitor_name: name of the model @@ -770,9 +788,14 @@ class PostProcessing(RunEnvironment): file = os.path.join(path, f"forecasts_{station_name}_test.nc") with xr.open_dataarray(file) as da: data = da.load() - forecast = data.sel(type=[self.forecast_indicator]) - forecast.coords[self.model_type_dim] = [competitor_name] - return forecast + if self.forecast_indicator in data.coords[self.model_type_dim]: + forecast = data.sel({self.model_type_dim: [self.forecast_indicator]}) + forecast.coords[self.model_type_dim] = [competitor_name] + else: + forecast = data.sel({self.model_type_dim: [competitor_name]}) + # limit forecast to time range of test subset + start, end = self.data_store.get("start", "test"), self.data_store.get("end", "test") + return self.create_full_time_dim(forecast, self.index_dim, self._sampling, start, end) def _create_observation(self, data, _, transformation_func: Callable, normalised: bool) -> xr.DataArray: """ @@ -807,8 +830,8 @@ class PostProcessing(RunEnvironment): tmp_ols = self.ols_model.predict(input_data) target_shape = ols_prediction.values.shape if target_shape != tmp_ols.shape: - if len(target_shape)==2: - new_values = np.swapaxes(tmp_ols,1,0) + if len(target_shape) == 2: + new_values = np.swapaxes(tmp_ols, 1, 0) else: new_values = np.swapaxes(tmp_ols, 2, 0) else: @@ -839,7 +862,7 @@ class PostProcessing(RunEnvironment): persistence_prediction = transformation_func(persistence_prediction, "target", inverse=True) return persistence_prediction - def _create_nn_forecast(self, input_data: xr.DataArray, nn_prediction: xr.DataArray, transformation_func: Callable, + def _create_nn_forecast(self, nn_output: xr.DataArray, nn_prediction: xr.DataArray, transformation_func: Callable, normalised: bool) -> xr.DataArray: """ Create NN forecast for given input data. @@ -848,22 +871,22 @@ class PostProcessing(RunEnvironment): output of the main branch is returned (not all minor branches, if the network has multiple output branches). The main branch is defined to be the last entry of all outputs. - :param input_data: transposed history from DataPrep + :param nn_output: Full NN model output :param nn_prediction: empty array in right shape to fill with data :param transformation_func: a callable function to apply inverse transformation :param normalised: transform prediction in original space if false, or use normalised predictions if true :return: filled data array with nn predictions """ - tmp_nn = self.model.predict(input_data) - if isinstance(tmp_nn, list): - nn_prediction.values = tmp_nn[-1] - elif tmp_nn.ndim == 3: - nn_prediction.values = tmp_nn[-1, ...] - elif tmp_nn.ndim == 2: - nn_prediction.values = tmp_nn + + if isinstance(nn_output, list): + nn_prediction.values = nn_output[-1] + elif nn_output.ndim == 3: + nn_prediction.values = nn_output[-1, ...] 
+ elif nn_output.ndim == 2: + nn_prediction.values = nn_output else: - raise NotImplementedError(f"Number of dimension of model output must be 2 or 3, but not {tmp_nn.dims}.") + raise NotImplementedError(f"Number of dimension of model output must be 2 or 3, but not {nn_output.dims}.") if not normalised: nn_prediction = transformation_func(nn_prediction, base="target", inverse=True) return nn_prediction @@ -912,6 +935,7 @@ class PostProcessing(RunEnvironment): :return: xarray of dimension 3: index, ahead_names, # predictions """ + kwargs = {k: v for k, v in kwargs.items() if v is not None} keys = list(kwargs.keys()) res = xr.DataArray(np.full((len(index.index), len(ahead_names), len(keys)), np.nan), coords=[index.index, ahead_names, keys], dims=[index_dim, ahead_dim, type_dim]) diff --git a/mlair/run_modules/pre_processing.py b/mlair/run_modules/pre_processing.py index 0e416acbca4d66d5844e1179c7653ac5a9934f28..fc1ae4b7ad63a51b623aacb3d846d33ca3a482e0 100644 --- a/mlair/run_modules/pre_processing.py +++ b/mlair/run_modules/pre_processing.py @@ -16,9 +16,10 @@ import dill import pandas as pd from mlair.data_handler import DataCollection, AbstractDataHandler -from mlair.helpers import TimeTracking, to_list, tables +from mlair.helpers import TimeTracking, to_list, tables, remove_items from mlair.configuration import path_config -from mlair.helpers.join import EmptyQueryResult +from mlair.helpers.data_sources.toar_data import EmptyQueryResult +from mlair.helpers.testing import check_nested_equality from mlair.run_modules.run_environment import RunEnvironment @@ -59,16 +60,22 @@ class PreProcessing(RunEnvironment): self._run() def _run(self): - stations = self.data_store.get("stations") - data_handler = self.data_store.get("data_handler") - _, valid_stations = self.validate_station(data_handler, stations, - "preprocessing") # , store_processed_data=False) - if len(valid_stations) == 0: - raise ValueError("Couldn't find any valid data according to given parameters. Abort experiment run.") - self.data_store.set("stations", valid_stations) - self.split_train_val_test() + snapshot_load_path = self.data_store.get_default("snapshot_load_path", default=None) + if snapshot_load_path is None: + stations = self.data_store.get("stations") + data_handler = self.data_store.get("data_handler") + _, valid_stations = self.validate_station(data_handler, stations, + "preprocessing") # , store_processed_data=False) + if len(valid_stations) == 0: + raise ValueError("Couldn't find any valid data according to given parameters. 
Abort experiment run.") + self.data_store.set("stations", valid_stations) + self.split_train_val_test() + else: + self.load_snapshot(snapshot_load_path) self.report_pre_processing() self.prepare_competitors() + if self.data_store.get_default("create_snapshot", False) is True: + self.create_snapshot() def report_pre_processing(self): """Log some metrics on data and create latex report.""" @@ -114,8 +121,8 @@ class PreProcessing(RunEnvironment): +------------+-------------------------------------------+---------------+---------------+---------------+---------+-------+--------+ """ - meta_cols = ['station_name', 'station_lon', 'station_lat', 'station_alt'] - meta_round = ["station_lon", "station_lat", "station_alt"] + meta_cols = ["name", "lat", "lon", "alt", "country", "state", "type", "type_of_area", "toar1_category"] + meta_round = ["lat", "lon", "alt"] precision = 4 path = os.path.join(self.data_store.get("experiment_path"), "latex_report") path_config.check_path_and_create(path) @@ -129,13 +136,10 @@ class PreProcessing(RunEnvironment): tables.save_to_tex(path=path, filename="station_sample_size_short.tex", column_format=column_format, df=df_nometa) tables.save_to_md(path=path, filename="station_sample_size_short.md", df=df_nometa) - # df_nometa.to_latex(os.path.join(path, "station_sample_size_short.tex"), na_rep='---', - # column_format=column_format) df_descr = self.create_describe_df(df_nometa) column_format = tables.create_column_format_for_tex(df_descr) tables.save_to_tex(path=path, filename="station_describe_short.tex", column_format=column_format, df=df_descr) tables.save_to_md(path=path, filename="station_describe_short.md", df=df_descr) - # df_descr.to_latex(os.path.join(path, "station_describe_short.tex"), na_rep='---', column_format=column_format) @staticmethod def create_describe_df(df, percentiles=None, ignore_last_lines: int = 2): @@ -305,7 +309,8 @@ class PreProcessing(RunEnvironment): def store_data_handler_attributes(self, data_handler, collection): store_attributes = data_handler.store_attributes() if len(store_attributes) > 0: - logging.info("store data requested by the data handler") + logging.info(f"store following parameters ({len(store_attributes)}) requested by the data handler: " + f"{','.join(store_attributes)}") attrs = {} for dh in collection: station = str(dh) @@ -369,10 +374,69 @@ class PreProcessing(RunEnvironment): logging.info("Prepare IntelliO3-ts-v1 model") from mlair.reference_models.reference_model_intellio3_v1 import IntelliO3_ts_v1 path = os.path.join(self.data_store.get("competitor_path"), competitor_name) - IntelliO3_ts_v1("IntelliO3-ts-v1", path).make_reference_available_locally(remove_tmp_dir=False) + IntelliO3_ts_v1("IntelliO3-ts-v1", ref_store_path=path).make_reference_available_locally(remove_tmp_dir=False) + elif competitor_name.lower() == "CAMS".lower(): + logging.info("Prepare CAMS forecasts") + from mlair.reference_models.reference_model_cams import CAMSforecast + data_path = self.data_store.get_default("cams_data_path", default=None) + path = os.path.join(self.data_store.get("competitor_path"), competitor_name) + stations = {} + for subset in ["train", "val", "test"]: + data_collection = self.data_store.get("data_collection", subset) + stations.update({str(s): s.get_coordinates() for s in data_collection if s not in stations}) + CAMSforecast("CAMS", ref_store_path=path, data_path=data_path).make_reference_available_locally(stations) + else: + logging.info(f"No preparation required for competitor {competitor_name} as no specific 
instruction " + f"is provided.") else: logging.info("No preparation required because no competitor was provided to the workflow.") + def create_snapshot(self): + logging.info("create snapshot for preprocessing") + from mlair.configuration.snapshot_names import animals + for i_try in range(10): + snapshot_name = random.choice(animals).lower() + snapshot_path = os.path.abspath(self.data_store.get("snapshot_path")) + path_config.check_path_and_create(snapshot_path, remove_existing=False) + _snapshot_file = os.path.join(snapshot_path, f"snapshot_preprocessing_{snapshot_name}.pickle") + if not os.path.exists(_snapshot_file): + logging.info(f"store snapshot at: {_snapshot_file}") + with open(_snapshot_file, "wb") as f: + dill.dump(self.data_store, f, protocol=4) + print(_snapshot_file) + return + logging.info(f"Could not create snapshot at {_snapshot_file} as file is already existing ({i_try + 1}/10)") + logging.info(f"Could not create any snapshot after 10/10 tries.") + + def load_snapshot(self, file): + logging.info(f"load snapshot for preprocessing from {file}") + with open(file, "rb") as f: + snapshot = dill.load(f) + excluded_params = ["activation", "activation_output", "add_dense_layer", "batch_normalization", "batch_path", + "batch_size", "block_length", "bootstrap_method", "bootstrap_path", "bootstrap_type", + "competitor_path", "competitors", "create_new_bootstraps", "create_new_model", + "create_snapshot", "data_collection", "debug_mode", "dense_layer_configuration", + "do_uncertainty_estimate", "dropout", "dropout_rnn", "early_stopping_epochs", "epochs", + "evaluate_competitors", "evaluate_feature_importance", "experiment_name", "experiment_path", + "exponent_last_layer", "forecast_path", "fraction_of_training", "hostname", "hpc_hosts", + "kernel_regularizer", "kernel_size", "layer_configuration", "log_level_stream", + "logging_path", "login_nodes", "loss_type", "loss_weights", "max_number_multiprocessing", + "model_class", "model_display_name", "model_path", "n_boots", "n_hidden", "n_layer", + "neighbors", "plot_list", "plot_path", "regularizer", "restore_best_model_weights", + "snapshot_load_path", "snapshot_path", "stations", "tmp_path", "train_model", + "transformation", "use_multiprocessing", ] + + data_handler = self.data_store.get("data_handler") + model_class = self.data_store.get("model_class") + excluded_params = list(set(excluded_params + data_handler.store_attributes() + model_class.requirements())) + + if check_nested_equality(self.data_store._store, snapshot._store, skip_args=excluded_params) is True: + self.update_datastore(snapshot, excluded_params=remove_items(excluded_params, ["transformation", + "data_collection", + "stations"])) + else: + raise ReferenceError("provided snapshot does not match with the current experiment setup. 
Abort this run!") + def f_proc(data_handler, station, name_affix, store, return_strategy="", tmp_path=None, **kwargs): """ @@ -385,8 +449,8 @@ def f_proc(data_handler, station, name_affix, store, return_strategy="", tmp_pat res = data_handler.build(station, name_affix=name_affix, store_processed_data=store, **kwargs) except (AttributeError, EmptyQueryResult, KeyError, requests.ConnectionError, ValueError, IndexError) as e: formatted_lines = traceback.format_exc().splitlines() - logging.info( - f"remove station {station} because it raised an error: {e} -> {' | '.join(f_inspect_error(formatted_lines))}") + logging.info(f"remove station {station} because it raised an error: {e} -> " + f"{' | '.join(f_inspect_error(formatted_lines))}") logging.debug(f"detailed information for removal of station {station}: {traceback.format_exc()}") res = None if return_strategy == "result": @@ -401,10 +465,11 @@ def f_proc(data_handler, station, name_affix, store, return_strategy="", tmp_pat def f_proc_create_info_df(data, meta_cols): - station_name = str(data.id_class) - res = {"station_name": station_name, "Y_shape": data.get_Y()[0].shape[0], - "meta": data.id_class.meta.loc[meta_cols].values.flatten()} - return res + station_name = str(data.id_class) + meta = data.id_class.meta + res = {"station_name": station_name, "Y_shape": data.get_Y()[0].shape[0], + "meta": meta.reindex(meta_cols).values.flatten()} + return res def f_inspect_error(formatted): diff --git a/mlair/run_modules/run_environment.py b/mlair/run_modules/run_environment.py index df34345b4fb67e764f6e4d8d6570a5fafb762304..2bc81750bf86d60e0c59bbf0fef68ae9c29138c9 100644 --- a/mlair/run_modules/run_environment.py +++ b/mlair/run_modules/run_environment.py @@ -11,8 +11,7 @@ import time from mlair.helpers.datastore import DataStoreByScope as DataStoreObject from mlair.helpers.datastore import NameNotFoundInDataStore -from mlair.helpers import Logger -from mlair.helpers import TimeTracking +from mlair.helpers import Logger, to_list, TimeTracking from mlair.plotting.tracker_plot import TrackPlot @@ -114,18 +113,22 @@ class RunEnvironment(object): not as inheritance from this class, log file is copied and data store is cleared. 
""" if not self.del_by_exit: - self.time.stop() - logging.info(f"{self._name} finished after {self.time}") - self.del_by_exit = True - # copy log file and clear data store only if called as base class and not as super class - if self.__class__.__name__ == "RunEnvironment": + if hasattr(self, "time"): + self.time.stop() try: - self.__plot_tracking() - self.__save_tracking() - self.__move_log_file() - except FileNotFoundError: + logging.info(f"{self._name} finished after {self.time}") + except NameError: pass - self.data_store.clear_data_store() + self.del_by_exit = True + # copy log file and clear data store only if called as base class and not as super class + if self.__class__.__name__ == "RunEnvironment": + try: + self.__plot_tracking() + self.__save_tracking() + self.__move_log_file() + except (FileNotFoundError, NameError): + pass + self.data_store.clear_data_store() def __enter__(self): """Enter run environment.""" @@ -169,6 +172,22 @@ class RunEnvironment(object): new_file = filename_pattern % counter return new_file + @classmethod + def update_datastore(cls, new_data_store: DataStoreObject, excluded_params=None, apply_full_replacement=False): + if apply_full_replacement is True: + RunEnvironment.data_store = new_data_store + else: + assert type(RunEnvironment.data_store) == type(new_data_store) + scopes = new_data_store.list_all_scopes() + excluded_params = to_list(excluded_params) + for s in scopes: + # name, scope and value + entries = new_data_store.search_scope(s, current_scope_only=True, return_all=True) + for e in entries: + if e[0] not in excluded_params: + # name, value, scope + RunEnvironment.data_store.set(e[0], e[2], e[1]) + @staticmethod def do_stuff(length=2): """Just a placeholder method for testing without any sense.""" diff --git a/mlair/run_modules/training.py b/mlair/run_modules/training.py index 5ce906122ef184d6dcad5527e923e44f04028fe5..0d7bb98f109b612cf3cffc3dc31541bb1733c541 100644 --- a/mlair/run_modules/training.py +++ b/mlair/run_modules/training.py @@ -19,7 +19,7 @@ from mlair.model_modules.keras_extensions import CallbackHandler from mlair.plotting.training_monitoring import PlotModelHistory, PlotModelLearningRate from mlair.run_modules.run_environment import RunEnvironment from mlair.configuration import path_config -from mlair.helpers import to_list, tables +from mlair.helpers import to_list, tables, TimeTrackingWrapper class Training(RunEnvironment): @@ -102,6 +102,7 @@ class Training(RunEnvironment): """ self.model.make_predict_function() + @TimeTrackingWrapper def _set_gen(self, mode: str) -> None: """ Set and distribute the generators for given mode regarding batch size. @@ -109,9 +110,11 @@ class Training(RunEnvironment): :param mode: name of set, should be from ["train", "val", "test"] """ collection = self.data_store.get("data_collection", mode) - kwargs = self.data_store.create_args_dict(["upsampling", "shuffle_batches", "batch_path"], scope=mode) + kwargs = self.data_store.create_args_dict(["upsampling", "shuffle_batches", "batch_path", "use_multiprocessing", + "max_number_multiprocessing"], scope=mode) setattr(self, f"{mode}_set", KerasIterator(collection, self.batch_size, model=self.model, name=mode, **kwargs)) + @TimeTrackingWrapper def set_generators(self) -> None: """ Set all generators for training, validation, and testing subsets. 
diff --git a/mlair/workflows/custom_workflows.py b/mlair/workflows/custom_workflows.py new file mode 100644 index 0000000000000000000000000000000000000000..868a4d24b513110e500079d9c2cae9a22379cc4d --- /dev/null +++ b/mlair/workflows/custom_workflows.py @@ -0,0 +1,22 @@ +"""Collection of custom workflows.""" + +__author__ = "Lukas Leufen" +__date__ = '2022-07-20' + + +from mlair.run_modules import ExperimentSetup, PreProcessing, ModelSetup, Training +from mlair.workflows.default_workflow import DefaultWorkflow + + +class HyperSearchWorkflow(DefaultWorkflow): + """ + A workflow executing ExperimentSetup, PreProcessing, ModelSetup, and Training. PostProcessing is not added to this + workflow which is intended to be used in a training cycle. + """ + + def _setup(self, **kwargs): + """Set up default workflow.""" + self.add(ExperimentSetup, **kwargs) + self.add(PreProcessing) + self.add(ModelSetup) + self.add(Training) diff --git a/requirements.txt b/requirements.txt index 3afc17b67fddbf5a269df1e1b7e103045630a290..f644ae9257c0b5a18492f8a2d0ef27d1246ec0d4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,34 +1,35 @@ -astropy==4.1 +astropy==5.1 auto_mix_prep==0.2.0 -Cartopy==0.18.0 -dask==2021.3.0 +Cartopy==0.20.0 +dask==2021.9.1 dill==0.3.3 -fsspec==2021.11.0 -keras==2.6.0 -keras_nightly==2.5.0.dev2021032900 +fsspec==2021.10.1 +Keras==2.6.0 locket==0.2.1 -matplotlib==3.3.4 +matplotlib==3.4.3 mock==4.0.3 -netcdf4==1.5.8 -numpy==1.19.5 -pandas==1.1.5 +netcdf4==1.6.0 +numpy~=1.19.2 +pandas==1.3.4 partd==1.2.0 -psutil==5.8.0 +psutil==5.9.1 pydot==1.4.2 pytest==6.2.2 pytest-cov==2.11.1 pytest-html==3.1.1 pytest-lazy-fixture==0.6.3 -requests==2.25.1 -scipy==1.5.2 -seaborn==0.11.1 +requests==2.28.1 +scipy==1.7.1 +seaborn==0.11.2 setuptools==47.1.0 --no-binary shapely Shapely==1.8.0 six==1.15.0 -statsmodels==0.12.2 -tabulate==0.8.9 -tensorflow==2.5.0 -toolz==0.11.2 -typing_extensions==3.7.4.3 +statsmodels==0.13.2 +tabulate==0.8.10 +tensorflow==2.6.0 +tensorflow-probability==0.14.1 +timezonefinder==5.2.0 +toolz==0.11.1 +typing_extensions~=3.7.4 wget==3.2 xarray==0.16.2 diff --git a/run_bnn.py b/run_bnn.py new file mode 100644 index 0000000000000000000000000000000000000000..3642ec6d522aff51516a7eb710d00b04ab137d50 --- /dev/null +++ b/run_bnn.py @@ -0,0 +1,61 @@ +__author__ = "Felix Kleinert" +__date__ = '2022-08-05' + +import argparse +from mlair.workflows import DefaultWorkflow +# from mlair.model_modules.recurrent_networks import RNN as chosen_model +from mlair.helpers import remove_items +from mlair.configuration.defaults import DEFAULT_PLOT_LIST +from mlair.model_modules.probability_models import ProbTestModel4, MyUnetProb, ProbTestModel2, ProbTestModelMixture +import os +import tensorflow as tf + + +def load_stations(case=0): + import json + cases = { + 0: 'supplement/station_list_north_german_plain_rural.json', + 1: 'supplement/station_list_north_german_plain.json', + 2: 'supplement/German_background_stations.json', + } + try: + filename = cases[case] + with open(filename, 'r') as jfile: + stations = json.load(jfile) + except FileNotFoundError: + stations = None + return stations + + +def main(parser_args): + # tf.compat.v1.disable_v2_behavior() + plots = remove_items(DEFAULT_PLOT_LIST, ["PlotConditionalQuantiles", "PlotPeriodogram"]) + stats_per_var = {'o3': 'dma8eu', 'relhum': 'average_values', 'temp': 'maximum', 'u': 'average_values', + 'v': 'average_values', 'no': 'dma8eu', 'no2': 'dma8eu', 'cloudcover': 'average_values', + 'pblheight': 'maximum'} + workflow = DefaultWorkflow( # 
stations=load_stations(), + #stations=["DEBW087","DEBW013", "DEBW107", "DEBW076"], + stations=load_stations(2), + model=MyUnetProb, + window_lead_time=4, + window_history_size=6, + epochs=100, + batch_size=1024, + train_model=False, create_new_model=True, network="UBA", + evaluate_feature_importance=False, # plot_list=["PlotCompetitiveSkillScore"], + # competitors=["test_model", "test_model2"], + competitor_path=os.path.join(os.getcwd(), "data", "comp_test"), + variables=list(stats_per_var.keys()), + statistics_per_var=stats_per_var, + target_var="o3", + target_var_unit="ppb", + **parser_args.__dict__, start_script=__file__) + workflow.run() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--experiment_date', metavar='--exp_date', type=str, default="testrun", + help="set experiment date as string") + args = parser.parse_args() + main(args) diff --git a/test/test_configuration/test_defaults.py b/test/test_configuration/test_defaults.py index 07a5aa2f543b1992baf10421de4b28133feb0eac..b46590290eff09ac98d549c7d38010eb5506d09c 100644 --- a/test/test_configuration/test_defaults.py +++ b/test/test_configuration/test_defaults.py @@ -31,7 +31,6 @@ class TestAllDefaults: 'v': 'average_values', 'no': 'dma8eu', 'no2': 'dma8eu', 'cloudcover': 'average_values', 'pblheight': 'maximum'} - assert DEFAULT_NETWORK == "AIRBASE" assert DEFAULT_STATION_TYPE == "background" assert DEFAULT_VARIABLES == DEFAULT_VAR_ALL_DICT.keys() assert DEFAULT_START == "1997-01-01" diff --git a/test/test_data_handler/test_iterator.py b/test/test_data_handler/test_iterator.py index bb8ecb5d216519b3662a5baa4d463780b4c29d8c..b5fb30a90c99d33e9dfe3db1346cfd7f43549fc9 100644 --- a/test/test_data_handler/test_iterator.py +++ b/test/test_data_handler/test_iterator.py @@ -1,4 +1,5 @@ from mlair.data_handler.iterator import DataCollection, StandardIterator, KerasIterator +from mlair.data_handler.iterator import _get_number_of_mini_batches, _get_batch, _permute_data, _save_to_pickle, f_proc from mlair.helpers.testing import PyTestAllEqual from mlair.model_modules.model_class import MyBranchedModel from mlair.model_modules.fully_connected_networks import FCN_64_32_16 @@ -89,6 +90,14 @@ class DummyData: def __init__(self, number_of_samples=np.random.randint(100, 150)): np.random.seed(45) self.number_of_samples = number_of_samples + self._len = self.number_of_samples + self._len_upsampling = self.number_of_samples + + def __len__(self, upsampling=False): + if upsampling is False: + return self._len + else: + return self._len_upsampling def get_X(self, upsampling=False, as_numpy=True): np.random.seed(45) @@ -152,13 +161,6 @@ class TestKerasIterator: iterator._cleanup_path(path, create_new=False) assert os.path.exists(path) is False - def test_get_number_of_mini_batches(self): - iterator = object.__new__(KerasIterator) - iterator.batch_size = 36 - assert iterator._get_number_of_mini_batches(30) == 0 - assert iterator._get_number_of_mini_batches(40) == 1 - assert iterator._get_number_of_mini_batches(72) == 2 - def test_len(self): iterator = object.__new__(KerasIterator) iterator.indexes = [0, 1, 2, 3, 4, 5] @@ -175,25 +177,6 @@ class TestKerasIterator: for i in range(3): assert PyTestAllEqual([new_arr[i], test_arr[i]]) - def test_get_batch(self): - arr = DummyData(20).get_X() - iterator = object.__new__(KerasIterator) - iterator.batch_size = 19 - batch1 = iterator._get_batch(arr, 0) - assert batch1[0].shape[0] == 19 - batch2 = iterator._get_batch(arr, 1) - assert batch2[0].shape[0] == 1 - - def 
test_save_to_pickle(self, path): - os.makedirs(path) - d = DummyData(20) - X, Y = d.get_X(), d.get_Y() - iterator = object.__new__(KerasIterator) - iterator._path = os.path.join(path, "%i.pickle") - assert os.path.exists(iterator._path % 2) is False - iterator._save_to_pickle(X=X, Y=Y, index=2) - assert os.path.exists(iterator._path % 2) is True - def test_prepare_batches(self, collection, path): iterator = object.__new__(KerasIterator) iterator._collection = collection @@ -292,14 +275,111 @@ class TestKerasIterator: with pytest.raises(TypeError): iterator._get_model_rank() - def test_permute(self): - iterator = object.__new__(KerasIterator) + +class TestGetNumberOfMiniBatches: + + def test_get_number_of_mini_batches(self): + batch_size = 36 + assert _get_number_of_mini_batches(30, batch_size) == 0 + assert _get_number_of_mini_batches(40, batch_size) == 1 + assert _get_number_of_mini_batches(72, batch_size) == 2 + + +class TestGetBatch: + + def test_get_batch(self): + arr = DummyData(20).get_X() + batch_size = 19 + batch1 = _get_batch(arr, 0, batch_size) + assert batch1[0].shape[0] == 19 + batch2 = _get_batch(arr, 1, batch_size) + assert batch2[0].shape[0] == 1 + + +class TestSaveToPickle: + + @pytest.fixture + def path(self): + p = os.path.join(os.path.dirname(os.path.abspath(__file__)), "testdata") + shutil.rmtree(p, ignore_errors=True) if os.path.exists(p) else None + yield p + shutil.rmtree(p, ignore_errors=True) + + def test_save_to_pickle(self, path): + os.makedirs(path) + d = DummyData(20) + X, Y = d.get_X(), d.get_Y() + _path = os.path.join(path, "%i.pickle") + assert os.path.exists(_path % 2) is False + _save_to_pickle(_path, X=X, Y=Y, index=2) + assert os.path.exists(_path % 2) is True + + +class TestPermuteData: + + def test_permute_data(self): X = [np.array([[1, 2, 3, 4], [1.1, 2.1, 3.1, 4.1], [1.2, 2.2, 3.2, 4.2]], dtype="f2")] Y = [np.array([1, 2, 3])] - X_p, Y_p = iterator._permute_data(X, Y) + X_p, Y_p = _permute_data(X, Y) assert X_p[0].shape == X[0].shape assert Y_p[0].shape == Y[0].shape assert np.testing.assert_almost_equal(X_p[0].sum(), X[0].sum(), 2) is None assert np.testing.assert_almost_equal(Y_p[0].sum(), Y[0].sum(), 2) is None + + +class TestFProc: + + @pytest.fixture + def collection(self): + coll = [] + for i in range(3): + coll.append(DummyData(50 + i)) + data_coll = DataCollection(collection=coll) + return data_coll + + @pytest.fixture + def collection_small(self): + coll = [] + for i in range(3): + coll.append(DummyData(5 + i)) + data_coll = DataCollection(collection=coll) + return data_coll + + @pytest.fixture + def path(self): + p = os.path.join(os.path.dirname(os.path.abspath(__file__)), "testdata") + shutil.rmtree(p, ignore_errors=True) if os.path.exists(p) else None + os.makedirs(p) + yield p + shutil.rmtree(p, ignore_errors=True) + + def test_f_proc(self, collection, path): + data = collection[0] + upsampling = False + mod_rank = 2 + batch_size = 32 + remaining = f_proc(data, upsampling, mod_rank, batch_size, os.path.join(path, "%i.pickle"), 0) + assert isinstance(remaining, tuple) + assert len(remaining) == 2 + assert isinstance(remaining[0], list) + assert len(remaining[0]) == 3 + assert remaining[0][0].shape == (18, 14, 5) + + def test_f_proc_no_remaining(self, collection, path): + data = collection[0] + upsampling = False + mod_rank = 2 + batch_size = 50 + remaining = f_proc(data, upsampling, mod_rank, batch_size, os.path.join(path, "%i.pickle"), 0) + assert remaining is None + + def test_f_proc_X_Y(self, collection, path): + data = collection[0] + 
X, Y = data.get_data()
+        upsampling = False
+        mod_rank = 2
+        batch_size = 40
+        remaining = f_proc((X, Y), upsampling, mod_rank, batch_size, os.path.join(path, "%i.pickle"), 0)
+        assert remaining[0][0].shape == (10, 14, 5)
diff --git a/test/test_helpers/test_data_sources/test_join.py b/test/test_helpers/test_data_sources/test_join.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9b12f5a7ff20e898695de0a0f035bed023674f2
--- /dev/null
+++ b/test/test_helpers/test_data_sources/test_join.py
@@ -0,0 +1,344 @@
+from typing import Iterable
+
+import pytest
+
+from mlair.helpers.data_sources.join import *
+from mlair.helpers.data_sources.join import _save_to_pandas, _lower_list, _select_distinct_series, \
+    _select_distinct_data_origin, _select_distinct_network
+from mlair.configuration.join_settings import join_settings
+from mlair.helpers.testing import check_nested_equality
+from mlair.helpers.data_sources.toar_data import EmptyQueryResult
+
+
+class TestDownloadJoin:
+
+    def test_download_single_var(self):
+        data, meta = download_join("DEBW107", {"o3": "dma8eu"})
+        assert data.columns == "o3"
+        assert meta.columns == "DEBW107"
+
+    def test_download_empty(self):
+        with pytest.raises(EmptyQueryResult) as e:
+            download_join("DEBW107", {"o3": "dma8eu"}, "traffic")
+        assert e.value.args[-1] == "No data found for variables {'o3'} and options station=['DEBW107'], type=traffic," \
+                                   " network=None, origin={} in JOIN."
+
+    def test_download_incomplete(self):
+        with pytest.raises(EmptyQueryResult) as e:
+            download_join("DEBW107", {"o3": "dma8eu", "o10": "maximum"}, "background")
+        assert e.value.args[-1] == "No data found for variables {'o10'} and options station=['DEBW107'], " \
+                                   "type=background, network=None, origin={} in JOIN."
+        with pytest.raises(EmptyQueryResult) as e:
+            download_join("DEBW107", {"o3": "dma8eu", "o10": "maximum"}, "background", data_origin={"o10": ""})
+        assert e.value.args[-1] == "No data found for variables {'o10'} and options station=['DEBW107'], " \
+                                   "type=background, network={'o10': None}, origin={'o10': ''} in JOIN."
+ + +class TestCorrectDataFormat: + + def test_correct_data_format(self): + list_data = [["2020-01-01 06:00:01", 23.], ["2020-01-01 06:00:11", 24.], ["2020-01-01 06:00:21", 25.], + ["2020-01-01 06:00:31", 26.], ["2020-01-01 06:00:41", 27.], ["2020-01-01 06:00:51", 23.], + {"station": "test_station_001", "author": "ME", "success": True}] + dict_data = correct_data_format(list_data) + assert dict_data == {"datetime": ["2020-01-01 06:00:01", "2020-01-01 06:00:11", "2020-01-01 06:00:21", + "2020-01-01 06:00:31", "2020-01-01 06:00:41", "2020-01-01 06:00:51"], + "values": [23., 24., 25., 26., 27., 23.], + "metadata": {"station": "test_station_001", "author": "ME", "success": True}} + + +class TestLoadSeriesInformation: + + def test_standard_query(self): + expected_subset = {'o3': 17057, 'no2': 17058, 'temp': 85587, 'wspeed': 17060} + res, orig = load_series_information(['DEBW107'], None, None, join_settings()[0], {}) + assert expected_subset.items() <= res.items() + + def test_empty_result(self): + res, orig = load_series_information(['DEBW107'], "traffic", None, join_settings()[0], {}) + assert res == {} + + +class TestSelectDistinctDataOrigin: + + @pytest.fixture + def vars(self): + return [{'id': 16686, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'no2', + 'parameter_label': 'NO2', 'parameter_attribute': ''}, + {'id': 16687, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'o3', + 'parameter_label': 'O3', 'parameter_attribute': ''}, + {'id': 16692, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'press', + 'parameter_label': 'PRESS--LANUV', 'parameter_attribute': ''}, + {'id': 16693, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'temp', + 'parameter_label': 'TEMP--LANUV', 'parameter_attribute': ''}, + {'id': 54036, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'cloudcover', + 'parameter_label': 'CLOUDCOVER', 'parameter_attribute': 'REA'}, + {'id': 88491, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'temp', + 'parameter_label': 'TEMP-REA-MIUB', 'parameter_attribute': 'REA'}, + {'id': 102660, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'press', + 'parameter_label': 'PRESS-REA-MIUB', 'parameter_attribute': 'REA'}, + {'id': 26692, 'network_name': 'AIRBASE', 'station_id': 'DENW053', 'parameter_name': 'press', + 'parameter_label': 'PRESS', 'parameter_attribute': 'REA'}] + + def test_no_origin_given(self, vars): + res, orig = _select_distinct_data_origin(vars, {}) + expected = { + "no2": [{'id': 16686, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'no2', + 'parameter_label': 'NO2', 'parameter_attribute': ''}], + "o3": [{'id': 16687, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'o3', + 'parameter_label': 'O3', 'parameter_attribute': ''}], + "cloudcover": [{'id': 54036, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'cloudcover', + 'parameter_label': 'CLOUDCOVER', 'parameter_attribute': 'REA'}], + "temp": [{'id': 88491, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'temp', + 'parameter_label': 'TEMP-REA-MIUB', 'parameter_attribute': 'REA'}], + "press": [{'id': 102660, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'press', + 'parameter_label': 'PRESS-REA-MIUB', 'parameter_attribute': 'REA'}, + {'id': 26692, 'network_name': 'AIRBASE', 'station_id': 'DENW053', + 'parameter_name': 'press', 'parameter_label': 'PRESS', 'parameter_attribute': 'REA'}]} + + assert 
check_nested_equality(res, expected) is True + # assert res == {"no2": 16686, "o3": 16687, "cloudcover": 54036, "temp": 88491, "press": 102660} + assert orig == {"no2": "", "o3": "", "cloudcover": "REA", "temp": "REA", "press": "REA"} + + def test_different_origins(self, vars): + origin = {"no2": "test", "temp": "", "cloudcover": "REA"} + res, orig = _select_distinct_data_origin(vars, data_origin=origin) + expected = { + "o3": [{'id': 16687, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'o3', + 'parameter_label': 'O3', 'parameter_attribute': ''}], + "cloudcover": [{'id': 54036, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'cloudcover', + 'parameter_label': 'CLOUDCOVER', 'parameter_attribute': 'REA'}], + "temp": [{'id': 16693, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'temp', + 'parameter_label': 'TEMP--LANUV', 'parameter_attribute': ''}], + "press": [{'id': 102660, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'press', + 'parameter_label': 'PRESS-REA-MIUB', 'parameter_attribute': 'REA'}, + {'id': 26692, 'network_name': 'AIRBASE', 'station_id': 'DENW053', + 'parameter_name': 'press', 'parameter_label': 'PRESS', 'parameter_attribute': 'REA'}]} + assert check_nested_equality(res, expected) is True + # assert res == {"o3": 16687, "press": 102660, "temp": 16693, "cloudcover": 54036} + assert orig == {"no2": "test", "o3": "", "cloudcover": "REA", "temp": "", "press": "REA"} + + +class TestSelectDistinctNetwork: + + @pytest.fixture + def vars(self): + return { + "no2": [{'id': 16686, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'no2', + 'parameter_label': 'NO2', 'parameter_attribute': ''}], + "o3": [{'id': 16687, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'o3', + 'parameter_label': 'O3', 'parameter_attribute': ''}], + "cloudcover": [{'id': 54036, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'cloudcover', + 'parameter_label': 'CLOUDCOVER', 'parameter_attribute': 'REA'}], + "temp": [{'id': 88491, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'temp', + 'parameter_label': 'TEMP-REA-MIUB', 'parameter_attribute': 'REA'}], + "press": [{'id': 102660, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'press', + 'parameter_label': 'PRESS-REA-MIUB', 'parameter_attribute': 'REA'}, + {'id': 26692, 'network_name': 'AIRBASE', 'station_id': 'DENW053', + 'parameter_name': 'press', 'parameter_label': 'PRESS', 'parameter_attribute': 'REA'}]} + + def test_no_network_given(self, caplog, vars): + caplog.set_level(logging.INFO) + res = _select_distinct_network(vars, []) + expected = { + "no2": {'id': 16686, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'no2', + 'parameter_label': 'NO2', 'parameter_attribute': ''}, + "o3": {'id': 16687, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'o3', + 'parameter_label': 'O3', 'parameter_attribute': ''}, + "cloudcover": {'id': 54036, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'cloudcover', + 'parameter_label': 'CLOUDCOVER', 'parameter_attribute': 'REA'}, + "temp": {'id': 88491, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'temp', + 'parameter_label': 'TEMP-REA-MIUB', 'parameter_attribute': 'REA'}, + "press": {'id': 102660, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'press', + 'parameter_label': 'PRESS-REA-MIUB', 'parameter_attribute': 'REA'}} + assert check_nested_equality(res, expected) is True 
+ + message = "Could not find a valid match for variable %s and networks []! Therefore, use first answer from JOIN:" + assert message % "no2" in caplog.messages[0] + assert message % "o3" in caplog.messages[1] + assert message % "cloudcover" in caplog.messages[2] + assert message % "temp" in caplog.messages[3] + assert message % "press" in caplog.messages[4] + + def test_single_network_given(self, vars): + res = _select_distinct_network(vars, ["UBA"]) + expected = { + "no2": {'id': 16686, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'no2', + 'parameter_label': 'NO2', 'parameter_attribute': ''}, + "o3": {'id': 16687, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'o3', + 'parameter_label': 'O3', 'parameter_attribute': ''}, + "cloudcover": {'id': 54036, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'cloudcover', + 'parameter_label': 'CLOUDCOVER', 'parameter_attribute': 'REA'}, + "temp": {'id': 88491, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'temp', + 'parameter_label': 'TEMP-REA-MIUB', 'parameter_attribute': 'REA'}, + "press": {'id': 102660, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'press', + 'parameter_label': 'PRESS-REA-MIUB', 'parameter_attribute': 'REA'}} + assert check_nested_equality(res, expected) is True + + def test_single_network_given_no_match(self, vars): + with pytest.raises(ValueError) as e: # AIRBASE not avail for all variables + _select_distinct_network(vars, ["AIRBASE"]) + assert e.value.args[-1] == "Cannot find a valid match for requested networks ['AIRBASE'] and variable no2 as " \ + "only following networks are available in JOIN: ['UBA']" + + with pytest.raises(ValueError) as e: # both requested networks are not available for all variables + _select_distinct_network(vars, ["LUBW", "EMEP"]) + assert e.value.args[-1] == "Cannot find a valid match for requested networks ['LUBW', 'EMEP'] and variable " \ + "no2 as only following networks are available in JOIN: ['UBA']" + + def test_multiple_networks_given(self, vars): + res = _select_distinct_network(vars, ["UBA", "AIRBASE"]) + expected = { + "no2": {'id': 16686, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'no2', + 'parameter_label': 'NO2', 'parameter_attribute': ''}, + "o3": {'id': 16687, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'o3', + 'parameter_label': 'O3', 'parameter_attribute': ''}, + "cloudcover": {'id': 54036, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'cloudcover', + 'parameter_label': 'CLOUDCOVER', 'parameter_attribute': 'REA'}, + "temp": {'id': 88491, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'temp', + 'parameter_label': 'TEMP-REA-MIUB', 'parameter_attribute': 'REA'}, + "press": {'id': 102660, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'press', + 'parameter_label': 'PRESS-REA-MIUB', 'parameter_attribute': 'REA'}} + assert check_nested_equality(res, expected) is True + + res = _select_distinct_network(vars, ["AIRBASE", "UBA"]) + expected = { + "no2": {'id': 16686, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'no2', + 'parameter_label': 'NO2', 'parameter_attribute': ''}, + "o3": {'id': 16687, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'o3', + 'parameter_label': 'O3', 'parameter_attribute': ''}, + "cloudcover": {'id': 54036, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'cloudcover', + 'parameter_label': 'CLOUDCOVER', 
'parameter_attribute': 'REA'}, + "temp": {'id': 88491, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'temp', + 'parameter_label': 'TEMP-REA-MIUB', 'parameter_attribute': 'REA'}, + "press": {'id': 26692, 'network_name': 'AIRBASE', 'station_id': 'DENW053', + 'parameter_name': 'press', 'parameter_label': 'PRESS', 'parameter_attribute': 'REA'}} + assert check_nested_equality(res, expected) is True + + def test_multiple_networks_given_by_dict(self, vars): + res = _select_distinct_network(vars, {"no2": "UBA", "o3": ["UBA", "AIRBASE"], "temp": ["AIRBASE", "UBA"], + "press": ["AIRBASE", "UBA"]}) + expected = { + "no2": {'id': 16686, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'no2', + 'parameter_label': 'NO2', 'parameter_attribute': ''}, + "o3": {'id': 16687, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'o3', + 'parameter_label': 'O3', 'parameter_attribute': ''}, + "cloudcover": {'id': 54036, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'cloudcover', + 'parameter_label': 'CLOUDCOVER', 'parameter_attribute': 'REA'}, + "temp": {'id': 88491, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'temp', + 'parameter_label': 'TEMP-REA-MIUB', 'parameter_attribute': 'REA'}, + "press": {'id': 26692, 'network_name': 'AIRBASE', 'station_id': 'DENW053', + 'parameter_name': 'press', 'parameter_label': 'PRESS', 'parameter_attribute': 'REA'}} + assert check_nested_equality(res, expected) is True + + +class TestSelectDistinctSeries: + + @pytest.fixture + def vars(self): + return [{'id': 16686, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'no2', + 'parameter_label': 'NO2', 'parameter_attribute': ''}, + {'id': 16687, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'o3', + 'parameter_label': 'O3', 'parameter_attribute': ''}, + {'id': 16692, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'press', + 'parameter_label': 'PRESS--LANUV', 'parameter_attribute': ''}, + {'id': 16693, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'temp', + 'parameter_label': 'TEMP--LANUV', 'parameter_attribute': ''}, + {'id': 54036, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'cloudcover', + 'parameter_label': 'CLOUDCOVER', 'parameter_attribute': 'REA'}, + {'id': 88491, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'temp', + 'parameter_label': 'TEMP-REA-MIUB', 'parameter_attribute': 'REA'}, + {'id': 102660, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'press', + 'parameter_label': 'PRESS-REA-MIUB', 'parameter_attribute': 'REA'}, + {'id': 26692, 'network_name': 'AIRBASE', 'station_id': 'DENW053', 'parameter_name': 'press', + 'parameter_label': 'PRESS', 'parameter_attribute': 'REA'}] + + def test_no_origin_given(self, vars): + res, orig = _select_distinct_series(vars) + assert res == {"no2": 16686, "o3": 16687, "cloudcover": 54036, "temp": 88491, "press": 102660} + assert orig == {"no2": "", "o3": "", "cloudcover": "REA", "temp": "REA", "press": "REA"} + + def test_different_origins(self, vars): + origin = {"no2": "test", "temp": "", "cloudcover": "REA"} + res, orig = _select_distinct_series(vars, data_origin=origin) + assert res == {"o3": 16687, "press": 102660, "temp": 16693, "cloudcover": 54036} + assert orig == {"no2": "test", "o3": "", "cloudcover": "REA", "temp": "", "press": "REA"} + res, orig = _select_distinct_series(vars, data_origin={}) + assert res == {"cloudcover": 54036, "no2": 16686, "o3": 16687, 
"press": 102660, "temp": 88491} + assert orig == {"no2": "", "o3": "", "temp": "REA", "press": "REA", "cloudcover": "REA"} + + def test_different_networks(self, vars): + res, orig = _select_distinct_series(vars, network_name="UBA") + assert res == {"no2": 16686, "o3": 16687, "cloudcover": 54036, "temp": 88491, "press": 102660} + assert orig == {"no2": "", "o3": "", "cloudcover": "REA", "temp": "REA", "press": "REA"} + + res, orig = _select_distinct_series(vars, network_name=["UBA", "EMEP", "AIRBASE"]) + assert res == {"no2": 16686, "o3": 16687, "cloudcover": 54036, "temp": 88491, "press": 102660} + assert orig == {"no2": "", "o3": "", "cloudcover": "REA", "temp": "REA", "press": "REA"} + + res, orig = _select_distinct_series(vars, network_name=["EMEP", "AIRBASE", "UBA"]) + assert res == {"no2": 16686, "o3": 16687, "cloudcover": 54036, "temp": 88491, "press": 26692} + assert orig == {"no2": "", "o3": "", "cloudcover": "REA", "temp": "REA", "press": "REA"} + + def test_network_not_available(self, vars): + with pytest.raises(ValueError) as e: + _select_distinct_series(vars, network_name="AIRBASE") + assert e.value.args[-1] == "Cannot find a valid match for requested networks ['AIRBASE'] and variable no2 as " \ + "only following networks are available in JOIN: ['UBA']" + + def test_different_network_and_origin(self, vars): + origin = {"no2": "test", "temp": "", "cloudcover": "REA"} + res, orig = _select_distinct_series(vars, data_origin=origin, network_name=["EMEP", "AIRBASE", "UBA"]) + assert res == {"o3": 16687, "press": 26692, "temp": 16693, "cloudcover": 54036} + assert orig == {"no2": "test", "o3": "", "cloudcover": "REA", "temp": "", "press": "REA"} + + +class TestSaveToPandas: + + @staticmethod + def convert_date(date): + return map(lambda s: dt.datetime.strptime(s, "%Y-%m-%d %H:%M"), date) + + @pytest.fixture + def date(self): + return ['1997-01-01 00:00', '1997-01-02 00:00', '1997-01-03 00:00', '1997-01-04 00:00'] + + @pytest.fixture + def date_len19(self): + return ['1997-01-01 00:00:00', '1997-01-02 00:00:00', '1997-01-03 00:00:00', '1997-01-04 00:00:00'] + + @pytest.fixture + def values(self): + return [86.21, 94.76, 76.96, 99.89] + + @pytest.fixture + def alternative_values(self): + return [20.0, 25.2, 25.1, 23.6] + + @pytest.fixture + def create_df(self, date, values): + return pd.DataFrame(values, index=self.convert_date(date), columns=['cloudcover']) + + def test_empty_df(self, date, values, create_df): + data = {'datetime': date, 'mean': values, 'metadata': None} + assert pd.testing.assert_frame_equal(create_df, _save_to_pandas(None, data, 'mean', 'cloudcover')) is None + + def test_not_empty_df(self, date, alternative_values, create_df): + data = {'datetime': date, 'max': alternative_values, 'metadata': None} + next_df = pd.DataFrame(data["max"], index=self.convert_date(date), columns=['temperature']) + df_concat = pd.concat([create_df, next_df], axis=1) + assert pd.testing.assert_frame_equal(df_concat, _save_to_pandas(create_df, data, 'max', 'temperature')) is None + + def test_alternative_date_format(self, date_len19, values, create_df): + data = {'datetime': date_len19, 'mean': values, 'metadata': None} + assert pd.testing.assert_frame_equal(create_df, _save_to_pandas(None, data, 'mean', 'cloudcover')) is None + + +class TestLowerList: + + def test_string_lowering(self): + list_iterator = _lower_list(["Capitalised", "already_small", "UPPERCASE", "veRyStRaNGe"]) + assert isinstance(list_iterator, Iterable) + assert list(list_iterator) == ["capitalised", "already_small", 
"uppercase", "verystrange"] + + diff --git a/test/test_helpers/test_data_sources/test_toar_data.py b/test/test_helpers/test_data_sources/test_toar_data.py new file mode 100644 index 0000000000000000000000000000000000000000..abaec10cc580b592d85d7dcc842616c67777f174 --- /dev/null +++ b/test/test_helpers/test_data_sources/test_toar_data.py @@ -0,0 +1,53 @@ +from mlair.configuration.join_settings import join_settings +from mlair.helpers.data_sources.toar_data import get_data, create_url, correct_stat_name + + +class TestGetData: + + def test(self): + opts = {"base": join_settings()[0], "service": "series", "station_id": 'DEBW107', "network_name": "UBA", + "parameter_name": "o3,no2"} + assert get_data(opts, headers={}) == [[17057, 'UBA', 'DEBW107', 'O3'], [17058, 'UBA', 'DEBW107', 'NO2']] + + +class TestCreateUrl: + + def test_minimal_args_given(self): + url = create_url("www.base.edu", "testingservice") + assert url == "www.base.edu/testingservice/" + + def test_given_kwargs(self): + url = create_url("www.base2.edu/", "testingservice", mood="happy", confidence=0.98) + assert url == "www.base2.edu/testingservice/?mood=happy&confidence=0.98" + + def test_single_kwargs(self): + url = create_url("www.base2.edu/", "testingservice", mood="undefined") + assert url == "www.base2.edu/testingservice/?mood=undefined" + + def test_none_kwargs(self): + url = create_url("www.base2.edu/", "testingservice", mood="sad", happiness=None, stress_factor=100) + assert url == "www.base2.edu/testingservice/?mood=sad&stress_factor=100" + + def test_param_id(self): + url = create_url("www.base.edu", "testingservice", param_id="2001") + assert url == "www.base.edu/testingservice/2001" + + def test_param_id_kwargs(self): + url = create_url("www.base.edu", "testingservice", param_id=2001, mood="sad", happiness=None, stress_factor=100) + assert url == "www.base.edu/testingservice/2001?mood=sad&stress_factor=100" + + url = create_url("www.base.edu", "testingservice", param_id=2001, mood="sad", series_id=222) + assert url == "www.base.edu/testingservice/2001?mood=sad&series_id=222" + + +class TestCorrectStatName: + + def test_nothing_to_do(self): + assert correct_stat_name("dma8eu") == "dma8eu" + assert correct_stat_name("max") == "max" + + def test_correct_string(self): + assert correct_stat_name("maximum") == "max" + assert correct_stat_name("minimum") == "min" + assert correct_stat_name("average_values") == "mean" + diff --git a/test/test_helpers/test_helpers.py b/test/test_helpers/test_helpers.py index 70640be9d56d71e4f68145b3bb68fb835e1e27a5..6f787d5835bd917fcfc55341d93a2d302f2c6e6e 100644 --- a/test/test_helpers/test_helpers.py +++ b/test/test_helpers/test_helpers.py @@ -12,8 +12,9 @@ import mock import pytest import string -from mlair.helpers import to_list, dict_to_xarray, float_round, remove_items, extract_value, select_from_dict, sort_like -from mlair.helpers import PyTestRegex +from mlair.helpers import to_list, dict_to_xarray, float_round, remove_items, extract_value, select_from_dict, \ + sort_like, filter_dict_by_value +from mlair.helpers import PyTestRegex, check_nested_equality from mlair.helpers import Logger, TimeTracking from mlair.helpers.helpers import is_xarray, convert2xrda, relative_round @@ -223,6 +224,10 @@ class TestSelectFromDict: assert select_from_dict(dictionary, ["a", "e"]) == {"a": 1, "e": None} assert select_from_dict(dictionary, ["a", "e"], remove_none=True) == {"a": 1} + def test_select_condition(self, dictionary): + assert select_from_dict(dictionary, ["a", "e"], filter_cond=False) == 
{"b": 23, "c": "last"} + assert select_from_dict(dictionary, ["a", "c"], filter_cond=False, remove_none=True) == {"b": 23} + class TestRemoveItems: @@ -487,3 +492,22 @@ class TestSortLike: l_obj = [1, 2, 3, 8, 4] with pytest.raises(AssertionError) as e: sort_like(l_obj, [1, 2, 3, 5, 6, 7, 8]) + + +class TestFilterDictByValue: + + def test_filter_dict_by_value(self): + data_origin = {'o3': '', 'no': '', 'no2': '', 'relhum': 'REA', 'u': 'REA', 'cloudcover': 'REA', 'temp': 'era5'} + expected = {'temp': 'era5'} + assert check_nested_equality(filter_dict_by_value(data_origin, "era5", True), expected) is True + expected = {'o3': '', 'no': '', 'no2': '', 'relhum': 'REA', 'u': 'REA', 'cloudcover': 'REA'} + assert check_nested_equality(filter_dict_by_value(data_origin, "era5", False), expected) is True + expected = {'o3': '', 'no': '', 'no2': ''} + assert check_nested_equality(filter_dict_by_value(data_origin, "", True), expected) is True + + def test_filter_dict_by_value_not_avail(self): + data_origin = {'o3': '', 'no': '', 'no2': '', 'relhum': 'REA', 'u': 'REA', 'cloudcover': 'REA', 'temp': 'era5'} + expected = {} + assert check_nested_equality(filter_dict_by_value(data_origin, "not_avail", True), expected) is True + assert check_nested_equality(filter_dict_by_value(data_origin, "EA", True), expected) is True + diff --git a/test/test_helpers/test_join.py b/test/test_helpers/test_join.py deleted file mode 100644 index e903669bf63f4056a8278401b07818d31a09616d..0000000000000000000000000000000000000000 --- a/test/test_helpers/test_join.py +++ /dev/null @@ -1,179 +0,0 @@ -from typing import Iterable - -import pytest - -from mlair.helpers.join import * -from mlair.helpers.join import _save_to_pandas, _correct_stat_name, _lower_list, _select_distinct_series -from mlair.configuration.join_settings import join_settings - - -class TestDownloadJoin: - - def test_download_single_var(self): - data, meta = download_join("DEBW107", {"o3": "dma8eu"}) - assert data.columns == "o3" - assert meta.columns == "DEBW107" - - def test_download_empty(self): - with pytest.raises(EmptyQueryResult) as e: - download_join("DEBW107", {"o3": "dma8eu"}, "traffic") - assert e.value.args[-1] == "No data found for variables {'o3'} and options station=['DEBW107'], type=traffic," \ - " network=None, origin={} in JOIN." - - def test_download_incomplete(self): - with pytest.raises(EmptyQueryResult) as e: - download_join("DEBW107", {"o3": "dma8eu", "o10": "maximum"}, "background") - assert e.value.args[-1] == "No data found for variables {'o10'} and options station=['DEBW107'], " \ - "type=background, network=None, origin={} in JOIN." - with pytest.raises(EmptyQueryResult) as e: - download_join("DEBW107", {"o3": "dma8eu", "o10": "maximum"}, "background", data_origin={"o10": ""}) - assert e.value.args[-1] == "No data found for variables {'o10'} and options station=['DEBW107'], " \ - "type=background, network=None, origin={'o10': ''} in JOIN." 
- - -class TestCorrectDataFormat: - - def test_correct_data_format(self): - list_data = [["2020-01-01 06:00:01", 23.], ["2020-01-01 06:00:11", 24.], ["2020-01-01 06:00:21", 25.], - ["2020-01-01 06:00:31", 26.], ["2020-01-01 06:00:41", 27.], ["2020-01-01 06:00:51", 23.], - {"station": "test_station_001", "author": "ME", "success": True}] - dict_data = correct_data_format(list_data) - assert dict_data == {"datetime": ["2020-01-01 06:00:01", "2020-01-01 06:00:11", "2020-01-01 06:00:21", - "2020-01-01 06:00:31", "2020-01-01 06:00:41", "2020-01-01 06:00:51"], - "values": [23., 24., 25., 26., 27., 23.], - "metadata": {"station": "test_station_001", "author": "ME", "success": True}} - - -class TestGetData: - - def test(self): - opts = {"base": join_settings()[0], "service": "series", "station_id": 'DEBW107', "network_name": "UBA", - "parameter_name": "o3,no2"} - assert get_data(opts, headers={}) == [[17057, 'UBA', 'DEBW107', 'O3'], [17058, 'UBA', 'DEBW107', 'NO2']] - - -class TestLoadSeriesInformation: - - def test_standard_query(self): - expected_subset = {'o3': 23031, 'no2': 39002, 'temp': 85584, 'wspeed': 17060} - res, orig = load_series_information(['DEBW107'], None, None, join_settings()[0], {}) - assert expected_subset.items() <= res.items() - - def test_empty_result(self): - res, orig = load_series_information(['DEBW107'], "traffic", None, join_settings()[0], {}) - assert res == {} - - -class TestSelectDistinctSeries: - - @pytest.fixture - def vars(self): - return [{'id': 16686, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'no2', - 'parameter_label': 'NO2', 'parameter_attribute': ''}, - {'id': 16687, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'o3', - 'parameter_label': 'O3', - 'parameter_attribute': ''}, - {'id': 16692, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'press', - 'parameter_label': 'PRESS--LANUV', 'parameter_attribute': ''}, - {'id': 16693, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'temp', - 'parameter_label': 'TEMP--LANUV', 'parameter_attribute': ''}, - {'id': 54036, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'cloudcover', - 'parameter_label': 'CLOUDCOVER', 'parameter_attribute': 'REA'}, - {'id': 88491, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'temp', - 'parameter_label': 'TEMP-REA-MIUB', 'parameter_attribute': 'REA'}, - {'id': 102660, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'press', - 'parameter_label': 'PRESS-REA-MIUB', 'parameter_attribute': 'REA'}] - - def test_no_origin_given(self, vars): - res, orig = _select_distinct_series(vars) - assert res == {"no2": 16686, "o3": 16687, "cloudcover": 54036, "temp": 88491, "press": 102660} - assert orig == {"no2": "", "o3": "", "cloudcover": "REA", "temp": "REA", "press": "REA"} - - def test_different_origins(self, vars): - origin = {"no2": "test", "temp": "", "cloudcover": "REA"} - res, orig = _select_distinct_series(vars, data_origin=origin) - assert res == {"o3": 16687, "press": 102660, "temp": 16693, "cloudcover": 54036} - assert orig == {"no2": "test", "o3": "", "cloudcover": "REA", "temp": "", "press": "REA"} - res, orig = _select_distinct_series(vars, data_origin={}) - assert res == {"cloudcover": 54036, "no2": 16686, "o3": 16687, "press": 102660, "temp": 88491} - assert orig == {"no2": "", "o3": "", "temp": "REA", "press": "REA", "cloudcover": "REA"} - - -class TestSaveToPandas: - - @staticmethod - def convert_date(date): - return map(lambda s: 
dt.datetime.strptime(s, "%Y-%m-%d %H:%M"), date) - - @pytest.fixture - def date(self): - return ['1997-01-01 00:00', '1997-01-02 00:00', '1997-01-03 00:00', '1997-01-04 00:00'] - - @pytest.fixture - def date_len19(self): - return ['1997-01-01 00:00:00', '1997-01-02 00:00:00', '1997-01-03 00:00:00', '1997-01-04 00:00:00'] - - @pytest.fixture - def values(self): - return [86.21, 94.76, 76.96, 99.89] - - @pytest.fixture - def alternative_values(self): - return [20.0, 25.2, 25.1, 23.6] - - @pytest.fixture - def create_df(self, date, values): - return pd.DataFrame(values, index=self.convert_date(date), columns=['cloudcover']) - - def test_empty_df(self, date, values, create_df): - data = {'datetime': date, 'mean': values, 'metadata': None} - assert pd.testing.assert_frame_equal(create_df, _save_to_pandas(None, data, 'mean', 'cloudcover')) is None - - def test_not_empty_df(self, date, alternative_values, create_df): - data = {'datetime': date, 'max': alternative_values, 'metadata': None} - next_df = pd.DataFrame(data["max"], index=self.convert_date(date), columns=['temperature']) - df_concat = pd.concat([create_df, next_df], axis=1) - assert pd.testing.assert_frame_equal(df_concat, _save_to_pandas(create_df, data, 'max', 'temperature')) is None - - def test_alternative_date_format(self, date_len19, values, create_df): - data = {'datetime': date_len19, 'mean': values, 'metadata': None} - assert pd.testing.assert_frame_equal(create_df, _save_to_pandas(None, data, 'mean', 'cloudcover')) is None - - -class TestCorrectStatName: - - def test_nothing_to_do(self): - assert _correct_stat_name("dma8eu") == "dma8eu" - assert _correct_stat_name("max") == "max" - - def test_correct_string(self): - assert _correct_stat_name("maximum") == "max" - assert _correct_stat_name("minimum") == "min" - assert _correct_stat_name("average_values") == "mean" - - -class TestLowerList: - - def test_string_lowering(self): - list_iterator = _lower_list(["Capitalised", "already_small", "UPPERCASE", "veRyStRaNGe"]) - assert isinstance(list_iterator, Iterable) - assert list(list_iterator) == ["capitalised", "already_small", "uppercase", "verystrange"] - - -class TestCreateUrl: - - def test_minimal_args_given(self): - url = create_url("www.base.edu", "testingservice") - assert url == "www.base.edu/testingservice/?" 
- - def test_given_kwargs(self): - url = create_url("www.base2.edu/", "testingservice", mood="happy", confidence=0.98) - assert url == "www.base2.edu/testingservice/?mood=happy&confidence=0.98" - - def test_single_kwargs(self): - url = create_url("www.base2.edu/", "testingservice", mood="undefined") - assert url == "www.base2.edu/testingservice/?mood=undefined" - - def test_none_kwargs(self): - url = create_url("www.base2.edu/", "testingservice", mood="sad", happiness=None, stress_factor=100) - assert url == "www.base2.edu/testingservice/?mood=sad&stress_factor=100" diff --git a/test/test_model_modules/test_abstract_model_class.py b/test/test_model_modules/test_abstract_model_class.py index a1ec4c63a2b3b44c26bbf722a3d4d84aec112bec..2a1578aa28c061fce40be2e3f2f2a29306663463 100644 --- a/test/test_model_modules/test_abstract_model_class.py +++ b/test/test_model_modules/test_abstract_model_class.py @@ -147,16 +147,16 @@ class TestAbstractModelClass: with pytest.raises(ValueError) as einfo: amc.compile_options = {"optimizer": keras.optimizers.Adam()} assert "Got different values or arguments for same argument: self.optimizer=<class" \ - " 'tensorflow.python.keras.optimizer_v2.gradient_descent.SGD'> and " \ - "'optimizer': <class 'tensorflow.python.keras.optimizer_v2.adam.Adam'>" in str(einfo.value) + " 'keras.optimizer_v2.gradient_descent.SGD'> and " \ + "'optimizer': <class 'keras.optimizer_v2.adam.Adam'>" in str(einfo.value) def test_compile_options_setter_as_mix_attr_dict_invalid_duplicates_same_optimizer_other_args(self, amc): amc.optimizer = keras.optimizers.SGD(lr=0.1) with pytest.raises(ValueError) as einfo: amc.compile_options = {"optimizer": keras.optimizers.SGD(lr=0.001)} assert "Got different values or arguments for same argument: self.optimizer=<class" \ - " 'tensorflow.python.keras.optimizer_v2.gradient_descent.SGD'> and " \ - "'optimizer': <class 'tensorflow.python.keras.optimizer_v2.gradient_descent.SGD'>" in str(einfo.value) + " 'keras.optimizer_v2.gradient_descent.SGD'> and " \ + "'optimizer': <class 'keras.optimizer_v2.gradient_descent.SGD'>" in str(einfo.value) def test_compile_options_setter_as_dict_invalid_keys(self, amc): with pytest.raises(ValueError) as einfo: diff --git a/test/test_model_modules/test_flatten_tail.py b/test/test_model_modules/test_flatten_tail.py index 83861be561fbe164d09048f1b748b51977b2fc27..b53e381ea1cecc1d6dfbe019264c726f11946479 100644 --- a/test/test_model_modules/test_flatten_tail.py +++ b/test/test_model_modules/test_flatten_tail.py @@ -27,7 +27,7 @@ class TestGetActivation: def test_layer_act(self, model_input): x_in = get_activation(model_input, activation=ELU, name='adv_layer') act = x_in._keras_history[0] - assert act.name == 'adv_layer' + assert act.name == 'tf.nn.elu' def test_layer_act_invalid(self, model_input): with pytest.raises(TypeError) as einfo: @@ -62,8 +62,8 @@ class TestFlattenTail: assert final_dense.units == 2 assert final_dense.kernel_regularizer is None inner_act = self.step_in(final_dense) - assert inner_act.name == 'Main_tail_act' - assert inner_act.__class__.__name__ == 'ELU' + assert inner_act.name == 'tf.nn.elu_1' + assert inner_act.__class__.__name__ == 'TFOpLambda' inner_dense = self.step_in(inner_act) assert inner_dense.name == 'Main_tail_inner_Dense' assert inner_dense.units == 64 @@ -112,9 +112,8 @@ class TestFlattenTail: 'dtype': 'float32', 'data_format': 'channels_last'} reduc_act = self.step_in(flatten) - assert reduc_act.get_config() == {'name': 'Main_tail_all_conv_act', 'trainable': True, - 'dtype': 'float32', 
'alpha': 1.0} - + assert reduc_act.get_config() == {'name': 'tf.nn.elu_2', 'trainable': True, 'function': 'nn.elu', + 'dtype': 'float32'} reduc_conv = self.step_in(reduc_act) assert reduc_conv.kernel_size == (1, 1) diff --git a/test/test_model_modules/test_inception_model.py b/test/test_model_modules/test_inception_model.py index 0ed975d054841d9d4cfb8b4c964fa0cd2d4e2667..0a0dd38fa9d354c1243127df1ae9e079a6ca88e9 100644 --- a/test/test_model_modules/test_inception_model.py +++ b/test/test_model_modules/test_inception_model.py @@ -43,7 +43,7 @@ class TestInceptionModelBase: assert base.part_of_block == 1 assert tower.name == 'Block_0a_act_2/Relu:0' act_layer = tower._keras_history[0] - assert isinstance(act_layer, ReLU) + assert isinstance(act_layer, keras.layers.ReLU) assert act_layer.name == "Block_0a_act_2" # check previous element of tower (conv2D) conv_layer = self.step_in(act_layer) @@ -60,7 +60,7 @@ class TestInceptionModelBase: assert pad_layer.name == 'Block_0a_Pad' # check previous element of tower (activation) act_layer2 = self.step_in(pad_layer) - assert isinstance(act_layer2, ReLU) + assert isinstance(act_layer2, keras.layers.ReLU) assert act_layer2.name == "Block_0a_act_1" # check previous element of tower (conv2D) conv_layer2 = self.step_in(act_layer2) @@ -80,7 +80,7 @@ class TestInceptionModelBase: # assert tower.name == 'Block_0a_act_2/Relu:0' assert tower.name == 'Block_0a_act_2/Relu:0' act_layer = tower._keras_history[0] - assert isinstance(act_layer, ReLU) + assert isinstance(act_layer, keras.layers.ReLU) assert act_layer.name == "Block_0a_act_2" # check previous element of tower (batch_normal) batch_layer = self.step_in(act_layer) @@ -101,7 +101,7 @@ class TestInceptionModelBase: assert pad_layer.name == 'Block_0a_Pad' # check previous element of tower (activation) act_layer2 = self.step_in(pad_layer) - assert isinstance(act_layer2, ReLU) + assert isinstance(act_layer2, keras.layers.ReLU) assert act_layer2.name == "Block_0a_act_1" # check previous element of tower (conv2D) conv_layer2 = self.step_in(act_layer2) @@ -124,7 +124,7 @@ class TestInceptionModelBase: tower = base.create_conv_tower(activation=keras.layers.LeakyReLU, **opts) assert tower.name == 'Block_0b_act_2/LeakyRelu:0' act_layer = tower._keras_history[0] - assert isinstance(act_layer, LeakyReLU) + assert isinstance(act_layer, keras.layers.LeakyReLU) assert act_layer.name == "Block_0b_act_2" def test_create_conv_tower_1x1(self, base, input_x): @@ -134,7 +134,7 @@ class TestInceptionModelBase: assert base.part_of_block == 1 assert tower.name == 'Block_0a_act_1/Relu:0' act_layer = tower._keras_history[0] - assert isinstance(act_layer, ReLU) + assert isinstance(act_layer, keras.layers.ReLU) assert act_layer.name == "Block_0a_act_1" # check previous element of tower (conv2D) conv_layer = self.step_in(act_layer) @@ -160,7 +160,7 @@ class TestInceptionModelBase: assert base.part_of_block == 1 assert tower.name == 'Block_0a_act_1/Relu:0' act_layer = tower._keras_history[0] - assert isinstance(act_layer, ReLU) + assert isinstance(act_layer, keras.layers.ReLU) assert act_layer.name == "Block_0a_act_1" # check previous element of tower (conv2D) conv_layer = self.step_in(act_layer) diff --git a/test/test_run_modules/test_model_setup.py b/test/test_run_modules/test_model_setup.py index 6e8d3ea9ebab40c79b17b2fba386322a630f00e1..295954a7cf53927939d50de5a84d474cb1818026 100644 --- a/test/test_run_modules/test_model_setup.py +++ b/test/test_run_modules/test_model_setup.py @@ -139,6 +139,14 @@ class DummyData: def 
__init__(self, number_of_samples=np.random.randint(100, 150)): self.number_of_samples = number_of_samples + self._len = self.number_of_samples + self._len_upsampling = self.number_of_samples + + def __len__(self, upsampling=False): + if upsampling is False: + return self._len + else: + return self._len_upsampling def get_X(self, upsampling=False, as_numpy=True): X1 = np.random.randint(0, 10, size=(self.number_of_samples, 14, 1, 5)) # samples, window, variables diff --git a/test/test_run_modules/test_post_processing.py b/test/test_run_modules/test_post_processing.py deleted file mode 100644 index 67897451eaa5de065b644d1f9868a846cb57d84e..0000000000000000000000000000000000000000 --- a/test/test_run_modules/test_post_processing.py +++ /dev/null @@ -1,4 +0,0 @@ -class TestPostProcessing: - - def test_init(self): - pass diff --git a/test/test_run_modules/test_pre_processing.py b/test/test_run_modules/test_pre_processing.py index 1dafdbd5c4882932e3d57e726e7a06bea22a745d..6646e1a4795756edd1792ef91f535132e8cde61d 100644 --- a/test/test_run_modules/test_pre_processing.py +++ b/test/test_run_modules/test_pre_processing.py @@ -9,8 +9,6 @@ from mlair.helpers import PyTestRegex from mlair.run_modules.experiment_setup import ExperimentSetup from mlair.run_modules.pre_processing import PreProcessing from mlair.run_modules.run_environment import RunEnvironment -import pandas as pd -import numpy as np import multiprocessing @@ -30,30 +28,31 @@ class TestPreProcessing: @pytest.fixture def obj_with_exp_setup(self): - ExperimentSetup(stations=['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087', 'DEBW001'], + ExperimentSetup(stations=['DEBW107', 'DEBW013', 'DEBW087', 'DEBW99X'], statistics_per_var={'o3': 'dma8eu', 'temp': 'maximum'}, station_type="background", - data_handler=DefaultDataHandler) + data_origin={'o3': 'UBA', 'temp': 'UBA'}, data_handler=DefaultDataHandler) pre = object.__new__(PreProcessing) super(PreProcessing, pre).__init__() yield pre RunEnvironment().__del__() def test_init(self, caplog): - ExperimentSetup(stations=['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087'], - statistics_per_var={'o3': 'dma8eu', 'temp': 'maximum'}) + ExperimentSetup(stations=['DEBW107', 'DEBW013', 'DEBW087'], + statistics_per_var={'o3': 'dma8eu', 'temp': 'maximum'}, + data_origin={'o3': 'UBA', 'temp': 'UBA'}) caplog.clear() caplog.set_level(logging.INFO) with PreProcessing(): assert caplog.record_tuples[0] == ('root', 20, 'PreProcessing started') assert caplog.record_tuples[1] == ('root', 20, 'check valid stations started (preprocessing)') - assert caplog.record_tuples[-6] == ('root', 20, PyTestRegex(r'run for \d+:\d+:\d+ \(hh:mm:ss\) to check 5 ' - r'station\(s\). Found 5/5 valid stations.')) + assert caplog.record_tuples[-6] == ('root', 20, PyTestRegex(r'run for \d+:\d+:\d+ \(hh:mm:ss\) to check 3 ' + r'station\(s\). 
Found 3/3 valid stations.')) assert caplog.record_tuples[-5] == ('root', 20, "use serial create_info_df (train)") assert caplog.record_tuples[-4] == ('root', 20, "use serial create_info_df (val)") assert caplog.record_tuples[-3] == ('root', 20, "use serial create_info_df (test)") assert caplog.record_tuples[-2] == ('root', 20, "Searching for competitors to be prepared for use.") - assert caplog.record_tuples[-1] == ('root', 20, "No preparation required because no competitor was provided" - " to the workflow.") + assert caplog.record_tuples[-1] == ('root', 20, "No preparation required for competitor ols as no specific " + "instruction is provided.") RunEnvironment().__del__() def test_run(self, obj_with_exp_setup): @@ -71,29 +70,29 @@ class TestPreProcessing: "extreme_values", "extremes_on_right_tail_only", "upsampling"] assert data_store.search_scope("general.train") == sorted(expected_params) assert data_store.search_name("data_collection") == sorted(["general.train", "general.val", "general.test", - "general.train_val"]) + "general.train_val"]) def test_create_set_split_not_all_stations(self, caplog, obj_with_exp_setup): caplog.set_level(logging.DEBUG) obj_with_exp_setup.data_store.set("use_all_stations_on_all_data_sets", False, "general") obj_with_exp_setup.create_set_split(slice(0, 2), "awesome") - assert ('root', 10, "Awesome stations (len=2): ['DEBW107', 'DEBY081']") in caplog.record_tuples + assert ('root', 10, "Awesome stations (len=2): ['DEBW107', 'DEBW013']") in caplog.record_tuples data_store = obj_with_exp_setup.data_store assert isinstance(data_store.get("data_collection", "general.awesome"), DataCollection) with pytest.raises(NameNotFoundInScope): data_store.get("data_collection", "general") - assert data_store.get("stations", "general.awesome") == ["DEBW107", "DEBY081"] + assert data_store.get("stations", "general.awesome") == ["DEBW107", "DEBW013"] def test_create_set_split_all_stations(self, caplog, obj_with_exp_setup): caplog.set_level(logging.DEBUG) obj_with_exp_setup.create_set_split(slice(0, 2), "awesome") - message = "Awesome stations (len=6): ['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087', 'DEBW001']" + message = "Awesome stations (len=4): ['DEBW107', 'DEBW013', 'DEBW087', 'DEBW99X']" assert ('root', 10, message) in caplog.record_tuples data_store = obj_with_exp_setup.data_store assert isinstance(data_store.get("data_collection", "general.awesome"), DataCollection) with pytest.raises(NameNotFoundInScope): data_store.get("data_collection", "general") - assert data_store.get("stations", "general.awesome") == ['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087'] + assert data_store.get("stations", "general.awesome") == ['DEBW107', 'DEBW013', 'DEBW087'] @pytest.mark.parametrize("name", (None, "tester")) def test_validate_station_serial(self, caplog, obj_with_exp_setup, name): @@ -108,8 +107,8 @@ class TestPreProcessing: expected = "check valid stations started" + ' (%s)' % (name if name else 'all') assert caplog.record_tuples[0] == ('root', 20, expected) assert caplog.record_tuples[1] == ('root', 20, "use serial validate station approach") - assert caplog.record_tuples[-1] == ('root', 20, PyTestRegex(r'run for \d+:\d+:\d+ \(hh:mm:ss\) to check 6 ' - r'station\(s\). Found 5/6 valid stations.')) + assert caplog.record_tuples[-1] == ('root', 20, PyTestRegex(r'run for \d+:\d+:\d+ \(hh:mm:ss\) to check 4 ' + r'station\(s\). 
Found 3/4 valid stations.')) @mock.patch("psutil.cpu_count", return_value=3) @mock.patch("multiprocessing.Pool", return_value=multiprocessing.Pool(3)) @@ -126,8 +125,8 @@ class TestPreProcessing: assert caplog.record_tuples[0] == ('root', 20, "check valid stations started (all)") assert caplog.record_tuples[1] == ('root', 20, "use parallel validate station approach") assert caplog.record_tuples[2] == ('root', 20, "running 3 processes in parallel") - assert caplog.record_tuples[-1] == ('root', 20, PyTestRegex(r'run for \d+:\d+:\d+ \(hh:mm:ss\) to check 6 ' - r'station\(s\). Found 5/6 valid stations.')) + assert caplog.record_tuples[-1] == ('root', 20, PyTestRegex(r'run for \d+:\d+:\d+ \(hh:mm:ss\) to check 4 ' + r'station\(s\). Found 3/4 valid stations.')) def test_split_set_indices(self, obj_super_init): dummy_list = list(range(0, 15)) @@ -146,24 +145,3 @@ class TestPreProcessing: class data_preparation_no_trans: pass assert pre.transformation(data_preparation_no_trans, stations) is None - - # @pytest.fixture - # def dummy_df(self): - # data_dict = {'station_name': {'DEBW013': 'Stuttgart Bad Cannstatt', 'DEBW076': 'Baden-Baden', - # 'DEBW087': 'Schwäbische_Alb', 'DEBW107': 'Tübingen', - # 'DEBY081': 'Garmisch-Partenkirchen/Kreuzeckbahnstraße', '# Stations': np.nan, - # '# Samples': np.nan}, - # 'station_lon': {'DEBW013': 9.2297, 'DEBW076': 8.2202, 'DEBW087': 9.2076, 'DEBW107': 9.0512, - # 'DEBY081': 11.0631, '# Stations': np.nan, '# Samples': np.nan}, - # 'station_lat': {'DEBW013': 48.8088, 'DEBW076': 48.7731, 'DEBW087': 48.3458, 'DEBW107': 48.5077, - # 'DEBY081': 47.4764, '# Stations': np.nan, '# Samples': np.nan}, - # 'station_alt': {'DEBW013': 235.0, 'DEBW076': 148.0, 'DEBW087': 798.0, 'DEBW107': 325.0, - # 'DEBY081': 735.0, '# Stations': np.nan, '# Samples': np.nan}, - # 'train': {'DEBW013': 1413, 'DEBW076': 3002, 'DEBW087': 3016, 'DEBW107': 1782, 'DEBY081': 2837, - # '# Stations': 6, '# Samples': 12050}, - # 'val': {'DEBW013': 698, 'DEBW076': 715, 'DEBW087': 700, 'DEBW107': 701, 'DEBY081': 456, - # '# Stations': 6, '# Samples': 3270}, - # 'test': {'DEBW013': 1066, 'DEBW076': 696, 'DEBW087': 1080, 'DEBW107': 1080, 'DEBY081': 700, - # '# Stations': 6, '# Samples': 4622}} - # df = pd.DataFrame.from_dict(data_dict) - # return df diff --git a/test/test_run_modules/test_training.py b/test/test_run_modules/test_training.py index 8f1fcd1943f9f203e738053017e00f8c269afef1..cdaa7f506d6b4b655dc582a331eea5a71b776c32 100644 --- a/test/test_run_modules/test_training.py +++ b/test/test_run_modules/test_training.py @@ -23,29 +23,6 @@ from mlair.run_modules.run_environment import RunEnvironment from mlair.run_modules.training import Training -def my_test_model(activation, window_history_size, channels, output_size, dropout_rate, add_minor_branch=False): - inception_model = InceptionModelBase() - conv_settings_dict1 = { - 'tower_1': {'reduction_filter': 8, 'tower_filter': 8 * 2, 'tower_kernel': (3, 1), 'activation': activation}, - 'tower_2': {'reduction_filter': 8, 'tower_filter': 8 * 2, 'tower_kernel': (5, 1), 'activation': activation}, } - pool_settings_dict1 = {'pool_kernel': (3, 1), 'tower_filter': 8 * 2, 'activation': activation} - X_input = keras.layers.Input(shape=(window_history_size + 1, 1, channels)) - X_in = inception_model.inception_block(X_input, conv_settings_dict1, pool_settings_dict1) - if add_minor_branch: - out = [flatten_tail(X_in, inner_neurons=64, activation=activation, output_neurons=4, - output_activation='linear', reduction_filter=64, - name='Minor_1', dropout_rate=dropout_rate, 
- )] - else: - out = [] - X_in = keras.layers.Dropout(dropout_rate)(X_in) - out.append(flatten_tail(X_in, inner_neurons=64, activation=activation, output_neurons=output_size, - output_activation='linear', reduction_filter=64, - name='Main', dropout_rate=dropout_rate, - )) - return keras.Model(inputs=X_input, outputs=out) - - class TestTraining: @pytest.fixture @@ -90,15 +67,6 @@ class TestTraining: RunEnvironment().__del__() except AssertionError: pass - # try: - # yield obj - # finally: - # if os.path.exists(path): - # shutil.rmtree(path) - # try: - # RunEnvironment().__del__() - # except AssertionError: - # pass @pytest.fixture def learning_rate(self): @@ -150,12 +118,16 @@ class TestTraining: return {'o3': 'dma8eu', 'temp': 'maximum'} @pytest.fixture - def data_collection(self, path, window_history_size, window_lead_time, statistics_per_var): - data_prep = DefaultDataHandler.build(['DEBW107'], data_path=os.path.join(path, 'data'), + def data_origin(self): + return {'o3': 'UBA', 'temp': 'UBA'} + + @pytest.fixture + def data_collection(self, path, window_history_size, window_lead_time, statistics_per_var, data_origin): + data_prep = DefaultDataHandler.build('DEBW107', data_path=os.path.join(path, 'data'), experiment_path=os.path.join(path, 'exp_path'), statistics_per_var=statistics_per_var, station_type="background", - network="AIRBASE", sampling="daily", target_dim="variables", - target_var="o3", time_dim="datetime", + sampling="daily", target_dim="variables", + target_var="o3", time_dim="datetime", data_origin=data_origin, window_history_size=window_history_size, window_lead_time=window_lead_time, name_affix="train") return DataCollection([data_prep]) diff --git a/test/test_workflows/test_default_workflow.py b/test/test_workflows/test_default_workflow.py index c7c198a4821f779329b9f5f19b04e757d8ebc7da..790fb5f5de2fef207c64fdc430028f0739eb20fa 100644 --- a/test/test_workflows/test_default_workflow.py +++ b/test/test_workflows/test_default_workflow.py @@ -23,7 +23,7 @@ class TestDefaultWorkflow: flow = DefaultWorkflow(stations="test", real_kwarg=4) assert flow._registry[0].__name__ == ExperimentSetup.__name__ assert len(flow._registry_kwargs[0].keys()) == 3 - assert list(flow._registry_kwargs[0].keys()) == ["experiment_date", "stations", "real_kwarg"] + assert sorted(list(flow._registry_kwargs[0].keys())) == ["experiment_date", "real_kwarg", "stations"] def test_setup(self): flow = DefaultWorkflow()