diff --git a/.gitignore b/.gitignore index 277f76853b32137a2b20b3a8c75c10508b56d96e..0dff6567d88591716ae603a4796570b5ee213fb2 100644 --- a/.gitignore +++ b/.gitignore @@ -127,6 +127,6 @@ virt_env*/ # Ignore (Batch) runscripts video_prediction_tools/HPC_scripts/** !video_prediction_tools/HPC_scripts/*_template.sh -video_prediction_tools/nonHPC_scripts/** -!video_prediction_tools/nonHPC_scripts/*_template.sh +video_prediction_tools/no_HPC_scripts/** +!video_prediction_tools/no_HPC_scripts/*_template.sh diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 4242bb656f4356b42afd00a25033efcb03687b30..2627a6dc2a8190b75313462a2faeaed4e69199fa 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -9,9 +9,9 @@ Loading: - era5 stage: build script: - - echo "dataset testing" -# - cd /data_era5 -# - ls -ls + - echo "Dataset testing" + - cd /data_era5/2017 + - ls -ls EnvSetup: @@ -26,7 +26,7 @@ Preprocessing: - era5 stage: build script: - - echo "Building-preprocessing" + - echo "Building preprocessing" Training: @@ -34,14 +34,34 @@ Training: - era5 stage: build script: - - echo "Building-Training" + - echo "Building training" + + +Postprocessing: + tags: + - era5 + stage: build + script: + - echo "Building postprocessing" + - zypper --non-interactive install gcc gcc-c++ gcc-fortran + - zypper --non-interactive install openmpi openmpi-devel + - zypper --non-interactive install python3 + - ls /usr/lib64/mpi/gcc/openmpi/bin + - export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH + - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/lib64/mpi/gcc/openmpi/bin + - export PATH=$PATH:/usr/lib64/mpi/gcc/openmpi/bin + - mpicxx -showme:link -pthread -L/usr/lib64/mpi/gcc/openmpi/bin -lmpi_cxx -lmpi -lopen-rte -lopen-pal -ldl -Wl,--export-dynamic -lnsl -lutil -lm -ldl + - pip install -r video_prediction_tools/env_setup/requirements_non_HPC.txt + - chmod +x ./video_prediction_tools/other_scripts/visualize_postprocess_era5_template.sh + - ./video_prediction_tools/other_scripts/visualize_postprocess_era5_template.sh + test: tags: - era5 stage: build script: - - echo "model testing" + - echo "Model testing" # - zypper --non-interactive install python3-pip # - zypper --non-interactive install python-devel # - pip install --upgrade pip diff --git a/README.md b/README.md index 1ec6bdfbfcb9d9b6e68d12e5cd2f6227688b033c..41f5573e40dcb055024db6d8e6c33144872be129 100644 --- a/README.md +++ b/README.md @@ -5,69 +5,76 @@ ## Table of Contents -- [Introduction to Atmospheric Machine learning Benchmarking System](#introduction-to-atmopsheric-machine-learning-benchmarking-system) +- [Introduction to Atmospheric Machine Learning Benchmarking System](#introduction-to-atmopsheric-machine-learning-benchmarking-system) - [Prepare your dataset](#prepare-your-dataset) - + [Access ERA5 dataset (~TB)](#access-era5-dataset---tb-) + + [Access the ERA5 dataset (~TB)](#access-the-era5-dataset---tb-) + [Dry run with small samples (~15 GB)](#dry-run-with-small-samples---15-gb-) + [Climatological mean data](#climatological-mean-data) - [Prerequisites](#prerequisites) - [Installation](#installation) + * [Get NVIDIA's TF1.15 container](#get-nvidia-s-tf115-container) - [Start with AMBS](#start-with-ambs) - * [Set-up virtual environment](#set-up-virtual-environment) - + [On Jülich's HPC systems](#on-j-lich-s-hpc-systems) + * [Set-up the virtual environment](#set-up-the-virtual-environment) + + [On JSC's HPC-system](#on-jsc-s-hpc-system) + [On other HPC systems](#on-other-hpc-systems) + - [Case I - Usage of singularity TF1.15 
container](#case-i---usage-of-singularity-tf115-container)
+      - [Case II - Without singularity TF1.15 container](#case-ii---without-singularity-tf115-container)
+      - [Further details on the arguments](#further-details-on-the-arguments)
+    + [Other systems](#other-systems)
+      - [Case I - Usage of singularity TF1.15 container](#case-i---usage-of-singularity-tf115-container-1)
+      - [Case II - Without singularity TF1.15 container](#case-ii---without-singularity-tf115-container-1)
+      - [Further details](#further-details)
   * [Run the workflow](#run-the-workflow)
-  * [Preparation with NVIDIA's TF1.15 singularity containers](#preparation-with-nvidia-s-tf115-singularity-containers)
-  * [Create specific runscripts](#create-specific-runscripts)
+    + [Create specific runscripts](#create-specific-runscripts)
   * [Running the workflow substeps](#running-the-workflow-substeps)
   * [Compare and visualize the results](#compare-and-visualize-the-results)
   * [Input and Output folder structure and naming convention](#input-and-output-folder-structure-and-naming-convention)
-- [Benchmarking architectures:](#benchmarking-architectures-)
+- [Benchmarking architectures](#benchmarking-architectures)
 - [Contributors and contact](#contributors-and-contact)
 - [On-going work](#on-going-work)
+## Introduction to Atmospheric Machine Learning Benchmarking System
-## Introduction to Atmopsheric Machine learning Benchmarking System
-
-**A**tmopsheric **M**achine learning **B**enchmarking **S**ystem (AMBS) aims to provide state-of-the-art video prediction methods applied to the meteorological domain. In the scope of the current application, the hourly evolution of the 2m temperature over a used-defined region is focused.
-
-Different Deep Learning video prediction architectures such as convLSTM and SAVP are trained with ERA5 reanalysis to perform a prediction for 12 hours based on the previous 12 hours. In addition to the 2m temperature (2t) itself, other variables can be fed to the video frame prediction models to enhance their capability to learn the complex physical processes driving the diurnal cycle of temperature. Currently, the recommended additional meteorological variables are the 850 hPa temperature (t850) and the total cloud cover (tcc) as described in our preprint GMD paper.
+**A**tmospheric **M**achine Learning **B**enchmarking **S**ystem (AMBS) aims to provide state-of-the-art video prediction methods applied to the meteorological domain. In the scope of the current application, the focus is on the hourly evolution of the 2m temperature over a user-defined region.
+Different Deep Learning video prediction architectures such as ConvLSTM and SAVP are trained on ERA5 reanalysis data to perform a prediction for 12 hours based on the previous 12 hours. In addition to the 2m temperature (2t) itself, other variables can be fed to the video frame prediction models to enhance their capability to learn the complex physical processes driving the diurnal cycle of temperature. Currently, the recommended additional meteorological variables are the 850 hPa temperature (t850) and the total cloud cover (tcc), as described in our GMD preprint paper.
 ## Prepare your dataset
-#### Access ERA5 dataset (~TB)
-The experiment described in the GMD paper relies on the rather large ERA5 dataset with 13 years data.
+#### Access the ERA5 dataset (~TB)
+The experiments described in the GMD paper rely on the ERA5 dataset, from which 13 years of data are used for the video prediction models (training, validation and test datasets).
-- For the users of JSC HPC system: You access the data from the followin path: /p/fastdata/slmet/slmet111/met_data/ecmwf/era5/grib. If you meet access permission issue please contact: Stein, Olaf <o.stein@fz-juelich.de>
+- For users of JSC's HPC system: Access to the ERA5 dataset is possible via the data repository [meteocloud](https://datapub.fz-juelich.de/slcs/meteocloud/). The corresponding path to the grib-data files (used for data extraction, see below) is: `/p/fastdata/slmet/slmet111/met_data/ecmwf/era5/grib`. If you meet access permission issues, please contact: Stein, Olaf <o.stein@fz-juelich.de>
-- For the users of other HPC sytems: You can retrieve the ERA5 data from the ECMWF MARS archive by specifying a resolution of 0.3° in the retrieval script (keyword "GRID", "https://confluence.ecmwf.int/pages/viewpage.action?pageId=123799065 "). The variable names and the corresponding paramID can be found in the ECMWF documentaation website [ERA5 documentations](https://confluence.ecmwf.int/display/CKB/ERA5%3A+data+documentation#ERA5:datadocumentation-Howtoacknowledge,citeandrefertoERA5)
+- For other users (also on other HPC-systems): You can retrieve the ERA5 data from the [ECMWF MARS archive](https://confluence.ecmwf.int/display/CKB/ERA5%3A+data+documentation#ERA5:datadocumentation-DataorganisationandhowtodownloadERA5). Once you have access to the archive, the data can be downloaded by specifying a resolution of 0.3° in the retrieval script (keyword "GRID", see [here](https://confluence.ecmwf.int/pages/viewpage.action?pageId=123799065)). The variable names and the corresponding paramID can be found on the ECMWF documentation website [ERA5 documentation](https://confluence.ecmwf.int/display/CKB/ERA5%3A+data+documentation#ERA5:datadocumentation-Howtoacknowledge,citeandrefertoERA5). For further information on the ERA5 dataset, please consult the [documentation](https://confluence.ecmwf.int/display/CKB/ERA5%3A+data+documentation) provided by ECMWF.
-We recommend the users to store the data following the input structure of the described [in the following description](#input-and-output-folder-structure-and-naming-convention)
+We recommend users to store the data following the directory structure for the input data described [below](#input-and-output-folder-structure-and-naming-convention).
 #### Dry run with small samples (~15 GB)
-In our application, we are dealing with the large dataset. Nevertheless, we also prepared rather small samples ~ 15 GB (3 months data with few variables) to help the users to be able fast test the workflow. The data can be downloaded through the following link [link!!] . For the users of deepacf project in JSC: You can also access from the following path `cd /p/project/deepacf/deeprain/video_prediction_shared_folder/GMD_samples`
+In our application, the typical use-case is to work on a large dataset. Nevertheless, we also prepared an example dataset (one month of data from each of the years 2007, 2008 and 2009 with a few variables) to help users run quick tests on their own machine. The data can be obtained on request from Bing Gong <b.gong@fz-juelich.de>. Users of the deepacf-project at JSC can also access the files from `/p/project/deepacf/deeprain/video_prediction_shared_folder/GMD_samples`.
 #### Climatological mean data
-climatological mean which is inferred at each grid point from the ERA5 reanalysis data between 1990 and 2019 is used in the postprocess step. The data can be downloaded along with the small samples [link!!] .
+
+To compute anomaly correlations in the postprocessing step (see below), climatological mean data is required. This data constitutes the climatological mean for each daytime hour and each month of the period 1990-2019.
+For convenience, the data is also provided with our frozen version of the code and can be downloaded from [zenodo-link!!]().
 ## Prerequisites
 - Linux or macOS
-- Python 3.6
-- CPU or NVIDIA GPU + CUDA CuDNN
+- Python>=3.6
+- NVIDIA GPU + CUDA CuDNN or CPU (small dataset only)
 - MPI
-- Tensorflow 1.13.1 or CUDA-enabled NVIDIA TensorFlow 1.15 within a singularity container
-- CDO >= 1.9.5
+- TensorFlow 1.13.1 or [CUDA-enabled NVIDIA](https://docs.nvidia.com/deeplearning/frameworks/tensorflow-release-notes/overview.html#overview) TensorFlow 1.15 within a [singularity](https://sylabs.io/guides/3.5/user-guide/quick_start.html) container
+- [CDO](https://code.mpimet.mpg.de/projects/cdo/embedded/index.html) >= 1.9.5
 ## Installation
+
 Clone this repo by typing the following command in your personal target dirctory:
 ```bash
@@ -84,121 +91,170 @@ This will create a directory called `ambs` under which this README-file and two
 Thus, change into this subdirectory after cloning:
 ```bash
-cd ambs/video_preditcion_tools/
+cd ambs/video_prediction_tools/
 ```
+### Get NVIDIA's TF1.15 container
-## Start with AMBS
+In case your HPC-system allows for the usage of singularity containers (as JSC's HPC-system does) or if you have an NVIDIA GPU available, you can run the workflow with the help of NVIDIA's TensorFlow 1.15 containers. Note that this is the recommended approach!
+To get the correct container version, check your NVIDIA driver with the help of `nvidia-smi`. Then search [here](https://docs.nvidia.com/deeplearning/frameworks/tensorflow-release-notes/index.html) for a suitable container version (try to get the latest possible container) and download the singularity image via
-### Set-up virtual environment
+```
+singularity pull <path_to_image>/nvidia_tensorflow_<version>-tf1-py3.sif docker://nvcr.io/nvidia/tensorflow:<version>-tf1-py3
+```
+where `<version>` is set accordingly. Ensure that your current target directory (`<path_to_image>`) offers enough disk space. The respective images are about 3-5 GB in size.
+Then create a symbolic link of the singularity container into the `HPC_scripts`- and `no_HPC_scripts`-directory, respectively:
+```
+ln -s <path_to_image>/nvidia_tensorflow_<version>-tf1-py3.sif HPC_scripts/tensorflow_<version>-tf1-py3.sif
+ln -s <path_to_image>/nvidia_tensorflow_<version>-tf1-py3.sif no_HPC_scripts/tensorflow_<version>-tf1-py3.sif
+```
+Note the slightly different name of the symbolic link, which makes it easy to distinguish it from the original image file.
-AMBS is a tool for the users who develop on HPC systems with Slurm batch systems since the large-scale dataset and architectures would be used.
-However, aforementioned we also provide a small dataset and runscripts for the users that can explore the tool on their personal computer systems.
-In such case, we provide three approaches to set up your virtual environment based on systems that the users work on: Jülich HPC system, other HPC systems, or other computer systems. The introduction is described below.
+For users with access to JSC's HPC-system: The required singularity image is available from `ambs/video_prediction_tools/HPC_scripts`. Thus, simply set `<path_to_image>` accordingly in the commands above.
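+As a concrete illustration (assuming, for example, the 21.09 release, i.e. the version that the runscript templates reference via `tensorflow_21.09-tf1-py3.sif`), the pull and link commands above might read:
+```
+singularity pull <path_to_image>/nvidia_tensorflow_21.09-tf1-py3.sif docker://nvcr.io/nvidia/tensorflow:21.09-tf1-py3
+ln -s <path_to_image>/nvidia_tensorflow_21.09-tf1-py3.sif HPC_scripts/tensorflow_21.09-tf1-py3.sif
+ln -s <path_to_image>/nvidia_tensorflow_21.09-tf1-py3.sif no_HPC_scripts/tensorflow_21.09-tf1-py3.sif
+```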
+Note that you need to log in to your [Judoor account](https://judoor.fz-juelich.de/login) and specifically request access to the restricted container software beforehand!
-#### On Jülich's HPC systems
+In case your operating system supports TF1.13 (or TF1.15) with GPU-support but does not allow for the usage of NVIDIA's singularity containers, you can set your environment up as described below.
-The following commands will setup a customized virtual environment on a known HPC-system at JSC (Juwels, Juwels Booster or HDF-ML). The script `create_env.sh` automatically detects on which machine it is executed and loads/installs all required Python (binary) modules and packages. The virtual environment with the name provide by user is then set up in a subdirectory `[...]/ambs/video_prediction_tools/virtual_envs/<env_name>` the top-level directory (`[...]/ambs/video_prediction_tools`).
+## Start with AMBS
-```bash
-cd env_setup
-source create_env.sh <env_name>
-```
+### Set-up the virtual environment
+
+The workflow can be set up on different operating systems. The related virtual environment can be set up with the help of the `create_env.sh`-script under the `env_setup`-directory.
+This script will place all virtual environments under the `virtual_envs`-directory.
+Depending on your system, you may do the following:
-This also already sets up the runscript templates with regards to the five steps of the workflow for you under the folder `[...]/ambs/video_prediction_tools/JSC_scripts`.
+#### On JSC's HPC-system
+After linking the TF1.15 singularity container into the runscript directories (see previous step), simply run
+```
+source create_env.sh <my_virtual_env>
+```
+where `<my_virtual_env>` corresponds to a user-defined name of the virtual environment.
-By default, the runscript templates make use of the standard target base directory `/p/project/deepacf/deeprain/video_prediction_shared_folder/`. This directory will serve as your standard top-level direcotry to store the output of each step in the workflow see details in the [folder structure section]( #input-and-output-folder-tructure-and-naming-convention). In case that you want to deviate from this, you may call `create_env.sh` to setup a new root direcotyr as follows:
+By default, the script assumes that all data (input and preprocessed data as well as trained models and data from postprocessing) will be stored in the shared directory `/p/project/deepacf/deeprain/video_prediction_shared_folder/`. This directory is called 'base-directory' in the following.
-```bash
-source create_env.sh <env_name> -base_dir=<my_target_dir>
+In case that you (need to) deviate from this, you can set a customized base-directory. For this, add the `-base_dir`-flag to the call of `create_env.sh`, i.e.:
+```
+source create_env.sh <my_virtual_env> -base_dir=<my_target_dir>
 ```
-**Note** that suifficient read-write permissions and a reasonable amount of memory space is mandatory for your alternative standard output directory.
+**Note:** Sufficient read-write permissions and a reasonable amount of disk space are mandatory for alternative base-directories.
+
 #### On other HPC systems
-Setting up the environment on other HPC is different from the ones in JSC since there is quite diversity with regards to the available software stack. The users need to load the modules manually. We prepare the templates for each step of workflow under the `HPC_scripts` . The users can follow the guidance to customise the templates.
+The AMBS workflow can also be run on other HPC-systems. The runscripts under `HPC_scripts` can still be used provided that your HPC-system uses SLURM for managing jobs. Otherwise, you may try to use the runscripts under `no_HPC_scripts` or set up your own runscripts based on your operating system.
-#### Other systems
+##### Case I - Usage of singularity TF1.15 container
-AMBS also allows the users to test on other non-HPC machines. You may enter the folder `../ambs/video_prediction_tools/env_setup` and excute:
+After retrieving a singularity container that fits your operating HPC-system (see [above](#get-nvidia-s-tf115-container)), create a virtual environment as follows:
+```
+source create_env.sh <my_virtual_env> -base_dir=<my_target_dir> -tf_container=<used_container>
+```
+Further details on the arguments are given after Case II.
-```bash
-source create_env_non_HPC.sh <env_name>
+##### Case II - Without singularity TF1.15 container
+In case that running singularity containers is not possible for you, but your operating HPC-system provides the usage of TF1.13 (or later) via modules, the source code can still be run.
+However, this requires you to populate `modules_train.sh` where all required modules are listed. Note that you also need to load modules for opening and reading h5- and netCDF-files. Afterwards, the virtual environment can be created by
+```
+source create_env.sh <my_virtual_env> -base_dir=<my_target_dir> -l_nocontainer
 ```
-Then the virtual enviornment will be created under `../ambs/video_prediction_tools/virtual_envs`. The required packages (`requirement_non_HPC.txt`) will be installed.
-### Run the workflow
+##### Further details on the arguments
+In the set-up commands for the virtual environment mentioned above, `<my_virtual_env>` corresponds to the user-defined name of the virtual environment. `<my_target_dir>` points to an (existing) directory which offers enough disk space to store large amounts of data (>>100 GB).
+This directory should also already hold the ERA5-data as described [above](#access-the-era5-dataset---tb-). Besides, the basic directory tree for the output of the workflow steps should follow the description provided [here](#input-and-output-folder-structure-and-naming-convention).
+The argument `-tf_container=<used_container>` allows you to specify the used singularity container (in Case I only!). Thus, `<used_container>` should correspond to `tensorflow_<version>-tf1-py3.sif` as described in this [section](#get-nvidia-s-tf115-container) above.
-Depending on the computing system you are working on, the workflow steps will be invoked by dedicated runscripts either from the directory `JSC_scripts/` (on known HPC-systems, see above) or from the directory `HPC_scripts/`, `other_scripts/`
-To help the users conduct different experiments with different configuration (e.g. input variables, hyperparameters etc). Each runscript can be set up conveniently with the help of the Python-script `generate_runscript.py`. Its usage as well the workflow runscripts are described subsequently.
+#### Other systems
+On other systems with access to an NVIDIA GPU, the virtual environment can be set up as follows.
+In case that you don't have access to an NVIDIA GPU, you can still run TensorFlow on your CPU. However, training becomes very slow then and thus, we recommend to just test with the small dataset mentioned [above](#dry-run-with-small-samples---15-gb-).
-### Preparation with NVIDIA's TF1.15 singularity containers
+Again, we describe the steps to set up the virtual environment for both cases in the following.
-Since 2022, JSC HPC does not support TF1.X in the current stack software system. As an intermediate solution before the TF2 version being ready,
-a singularity container with a CUDA-enabled NVIDIA TensorFlow v1.15 was made available which has to be reflected when setting up the virtual environment and when submiiting the job.
+##### Case I - Usage of singularity TF1.15 container
- Then, you can either download container image ([Link](https://docs.nvidia.com/deeplearning/frameworks/tensorflow-release-notes/rel_21-09.html#rel_21-09)) and place it under the folder`HPC_script`; Or you can access to the image though the symlink command as below, if you are part of the *deepacf*project (still link to the `HPC_scripts`-directory)
+After retrieving a singularity container that fits your operating machine (see [above](#get-nvidia-s-tf115-container)), create a virtual environment as follows:
+```
+source create_env.sh <my_virtual_env> -base_dir=<my_target_dir> -l_nohpc
+```
+Further details on the arguments are given after Case II.
-```bash
-ln -sf /p/project/deepacf/deeprain/video_prediction_shared_folder/containers_juwels_booster/nvidia_tensorflow_21.09-tf1-py3.sif
+##### Case II - Without singularity TF1.15 container
+
+Without using a singularity container (and using your CPU instead), please run
+```
+source create_env.sh <my_virtual_env> -base_dir=<my_target_dir> -l_nocontainer -l_nohpc
 ```
+**Note:** To reproduce the results of the GMD paper, we recommend using Case II.
+
+##### Further details
+Further details on the used arguments are provided [above](#further-details-on-the-arguments). The only exception is the `-l_nohpc`-flag, which indicates that you are not running on an HPC-system.
+
+
+### Run the workflow
+
+Depending on the computing system you are working on, the workflow steps will be invoked by dedicated runscripts either from the directory `HPC_scripts/` or from `no_HPC_scripts/`. The directory names are self-explanatory.
+To help the users conduct different experiments with varying configurations (e.g. input variables, hyperparameters etc.), each runscript can be set up conveniently with the help of the Python-script `generate_runscript.py`. Its usage as well as the workflow runscripts are described subsequently.
-Note that if you are the user of JSC HPC system, you need to log in [Judoor account] (https://judoor.fz-juelich.de/login) and specifically ask for the request to access to the restricted container software.
-### Create specific runscripts
+#### Create specific runscripts
 Specific runscripts for each workflow substep (see below) are generated conveniently by keyboard interaction.
-The interactive Python script thereby has to be executed in an activated virtual environment with some additional modules! After prompting
+The interactive Python script `generate_runscript.py` thereby has to be executed after running `create_env.sh`. Note that `create_env.sh` only creates a new virtual environment if `<env_name>` has not been used before. If the corresponding virtual environment already exists, it is simply activated.
+
+After invoking
 ```bash
-python generate_runscript.py
+python generate_runscript.py --venv_path <env_name>
 ```
-
-You will be asked first which workflow runscript shall be generated. You can chose one of the workflow step name:
+you will first be asked which workflow runscript shall be generated. You can choose one of the following workflow step names:
 - extract
 - preprocess1
 - preprocess2
 - train
 - postprocess
-The subsequent keyboard interactions then allow the user to make individual settings to the workflow step at hand. By pressing simply Enter, the user may receive some guidance for the keyboard interaction.
+The subsequent keyboard interaction then allows the user to make individual settings to the workflow step at hand. By simply pressing Enter, the user may receive some guidance for the keyboard interaction.
 Note that the runscript creation of later workflow substeps depends on the preceding steps (i.e. by checking the arguments from keyboard interaction).
-Thus, they should be created sequentially instead of all at once at the beginning.
+Thus, they should be created sequentially instead of all at once at the beginning!
-**Warning**: the `generate_runscript.py` currently is only for the JSC users. You can skip this step for non-JSC HPC users. If you have different settings for various experiments, you can simply copy the template to a new file where you can customize your setting.
-### Running the workflow substeps
-Having created the runscript by keyboard interaction, the workflow substeps can be run sequentially. Depending on the machine you are working on, change either to `JSC_scripts/` (on Juwels, Juwels Booster or HDF-ML), `HPC_scripts/` or `other_scripts/` . The respective runscripts for all steps of the workflow are located whose order is as follows. Note that `[sbatch]` only has to precede on one of the HPC systems. Besides data extraction and preprocessing step 1 are only mandatory when ERA5 data is subject to the application.
+**Note I**: Running a workflow step is also mandatory before the runscript for the next workflow step can be created.
+**Note II**: Remember to activate your virtual environment before running `generate_runscript.py`. For this, you can simply run
+```
+source create_env.sh <env_name>
+```
+where `<env_name>` corresponds to the name of your already existing virtual environment.
+
+### Running the workflow substeps
-Note we provide default configurations for each runscripts
-that the users still need to manully configure flags based on which project and HPC systems you work on. Particurly, you must configure the flag `#SBATCH --account =<your computing project name>` with your project name. For partitions `#SBATCH --partition`, we refer the users to the following link [JUWELS/JUWELS Booster](https://apps.fz-juelich.de/jsc/hps/juwels/batchsystem.html#slurm-partitions) for further information. If you are using HDF-ML system, you can simply use `batch` as partition.
+Having created the runscript by keyboard interaction, the workflow substeps can be run sequentially.
-Now it is time to run the AMBS workflow
-1. Data Extraction: This script retrieves the demanded variables for user-defined years from complete ERA% reanalysis grib-files and stores the data into netCDF-files.
+Note that you have to adapt the `account`, the `partition` as well as the e-mail address in the runscripts in case you are running on an HPC-system other than JSC's HPC-systems (HDF-ML, Juwels Cluster and Juwels Booster).
+Now, it is time to run the AMBS workflow:
+1. **Data Extraction**:<br> This script retrieves the demanded variables for user-defined years from complete ERA5 reanalysis grib-files and stores the data into netCDF-files.
 ```bash
 [sbatch] ./data_extraction_era5.sh
 ```
-2. Data Preprocessing: Crop the ERA 5-data (multiple years possible) to the region of interest (preprocesing step 1). All the year data will be touched once and the statistics are calculated and saved in the output folder. The TFrecord-files which are fed to the trained model (next workflow step) are created afterwards. Thus, two cases exist at this stage:
+2. **Data Preprocessing**:<br> Crop the ERA5-data (multiple years possible) to the region of interest (preprocessing step 1). All data of the chosen years is processed once and the statistics are calculated and saved in the output folder. The TFrecord-files which are fed to the model during training (next workflow step) are created afterwards. Thus, two substeps exist at this stage:
 ```bash
 [sbatch] ./preprocess_data_era5_step1.sh
 [sbatch] ./preprocess_data_era5_step2.sh
 ```
-3. Training: Training of one of the available models with the preprocessed data.
-Note that the `exp_id` is generated automatically when running `generate_runscript.py`.
+3. **Training**:<br> Training of one of the available models with the preprocessed data. Note that the `exp_id` is generated automatically when running `generate_runscript.py`.
 ```bash
 [sbatch] ./train_model_era5_<exp_id>.sh
 ```
-4. Postprocess: Create some plots and calculate the evaluation metrics for test dataset. Note that the `exp_id` is generated automatically when running `generate_runscript.py`.
+4. **Postprocessing**:<br> Create some plots and calculate the evaluation metrics for the test dataset. Note that the `exp_id` is generated automatically when running `generate_runscript.py`.
 ```bash
 [sbatch] ./visualize_postprocess_era5_<exp_id>.sh
@@ -206,12 +262,12 @@
 ### Compare and visualize the results
-AMBS also provide the tool (called met_postprocess) for the users to compare different experiments results and visualize the results as shown in GMD paper through `meta_postprocess` step. The runscript template are also prepared in the `HPC_scripts`, `JSC_scripts`, and `other_scripts`.
+AMBS also provides a tool (called meta-postprocessing) for the users to compare the results of different experiments and to visualize them as shown in the GMD paper through the `meta_postprocess`-step. The runscript templates are also prepared in `HPC_scripts` and `no_HPC_scripts`.
 ### Input and Output folder structure and naming convention
-To successfully run the workflow and enable to track the result from each step, inputs and output directories, and the file name convention should be constructed as described below:
+To successfully run the workflow and enable tracking the results from each workflow step, the input and output directories as well as the file name convention should be constructed as described below:
-We demonstrate an example of inputs structure for ERA5 dataset. In detail, the data is recorded hourly and stored into two grib files. The file with postfix `*_ml.grb` consists of multi-layers of the variables, whereas `_sf.grb` only includes the surface data.
+Below, we first show the input data structure for the ERA5 dataset. In detail, the data is recorded hourly and stored in two different kinds of grib files. The file with suffix `*_ml.grb` consists of multi-layer data, whereas `*_sf.grb` only includes the surface data.
 ```
 ├── ERA5 dataset
 │   ├── [Year]
 │   │   ├── [Month]
 │   │   │   ├── *_ml.grb
 │   │   │   ├── *_sf.grb
 │   │   │   ├── ...
``` -The root output directory should be set up when you run the workflow at the first time as aformentioned. The output strucutre for each step of the workflow along with the file name convention are described below: +The root output directory should be set up when you run the workflow at the first time as aformentioned. + +The output structure for each step of the workflow along with the file name convention are described below: ``` ├── ExtractedData │ ├── [Year] @@ -291,8 +349,8 @@ Here we give some examples to explain the name conventions: |Note: Y2016to2017M01to12 = Y2016M01to12_Y2017M01to12 -## Benchmarking architectures: -Currently, the workflow include the following ML architectures, and we are working on integrating more into the system. +## Benchmarking architectures +Currently, the workflow includes the following ML architectures, and we are working on integrating more into the system. - ConvLSTM: [paper](https://papers.nips.cc/paper/5955-convolutional-lstm-network-a-machine-learning-approach-for-precipitation-nowcasting.pdf),[code](https://github.com/loliverhennigh/Convolutional-LSTM-in-Tensorflow) - Stochastic Adversarial Video Prediction (SAVP): [paper](https://arxiv.org/pdf/1804.01523.pdf),[code](https://github.com/alexlee-gk/video_prediction) - Variational Autoencoder:[paper](https://arxiv.org/pdf/1312.6114.pdf) @@ -315,4 +373,3 @@ Former code developers are Scarlet Stadtler and Severin Hussmann. - Integrate precipitation data and new architecture used in our submitted CVPR paper - Integrate the ML benchmark datasets such as Moving MNIST - diff --git a/video_prediction_tools/HPC_scripts/data_extraction_era5_template.sh b/video_prediction_tools/HPC_scripts/data_extraction_era5_template.sh index 518fe84998745905092c0d7a7e62e97119d8909e..d4741e796c2762d8056204bc36e21655af1e70dd 100644 --- a/video_prediction_tools/HPC_scripts/data_extraction_era5_template.sh +++ b/video_prediction_tools/HPC_scripts/data_extraction_era5_template.sh @@ -1,6 +1,6 @@ #!/bin/bash -x -## Controlling Batch-job : Need input -#SBATCH --account=<Project name> +## Controlling Batch-job +#SBATCH --account=<your_project> #SBATCH --nodes=1 #SBATCH --ntasks=13 ##SBATCH --ntasks-per-node=13 @@ -13,28 +13,26 @@ #SBATCH --mail-type=ALL #SBATCH --mail-user=me@somewhere.com -##Load basic Python module: Need input -#module load Python - - -##Create and activate a virtual environment: Need input -#VENV_NAME=<my_venv> -#Python -m venv ../virtual_envs/${VENV_NAME} -#source ../virtual_envs/${VENV_NAME}/bin/activate - - -## Install required packages -# set PYTHONPATH... -BASE_DIR="$(pwd)" -WORKING_DIR=="$(BASE_DIR "$dir")" -export PYTHONPATH=${WORKING_DIR}/virtual_envs/${VENV_NAME}/lib/python3.8/site-packages:$PYTHONPATH -export PYTHONPATH=${WORKING_DIR}:$PYTHONPATH -export PYTHONPATH=${WORKING_DIR}/utils:$PYTHONPATH -export PYTHONPATH=${WORKING_DIR}/model_modules:$PYTHONPATH -export PYTHONPATH=${WORKING_DIR}/postprocess:$PYTHONPATH -# ... install requirements -pip install --no-cache-dir -r ../env_setup/requirements.txt - +######### Template identifier (don't remove) ######### +echo "Do not run the template scripts" +exit 99 +######### Template identifier (don't remove) ######### + +# Name of virtual environment +VIRT_ENV_NAME="my_venv" + +# Activate virtual environment if needed (and possible) +if [ -z ${VIRTUAL_ENV} ]; then + if [[ -f ../virtual_envs/${VIRT_ENV_NAME}/bin/activate ]]; then + echo "Activating virtual environment..." 
+ source ../virtual_envs/${VIRT_ENV_NAME}/bin/activate + else + echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..." + exit 1 + fi +fi +# Loading modules +source ../env_setup/modules_preprocess+extract.sh # Declare path-variables (dest_dir will be set and configured automatically via generate_runscript.py) source_dir=/my/path/to/era5 diff --git a/video_prediction_tools/HPC_scripts/meta_postprocess_era5_template.sh b/video_prediction_tools/HPC_scripts/meta_postprocess_era5_template.sh index 7d9dcd10cabf0b44ae75ead14711059c5c167d3c..45d467d0d457c572814790cb8dbfdd540c16d5e8 100644 --- a/video_prediction_tools/HPC_scripts/meta_postprocess_era5_template.sh +++ b/video_prediction_tools/HPC_scripts/meta_postprocess_era5_template.sh @@ -1,55 +1,22 @@ #!/bin/bash -x -## Controlling Batch-job: Need input -#SBATCH --account=<Project name> +## Controlling Batch-job +#SBATCH --account=<your_project> #SBATCH --nodes=1 -#SBATCH --ntasks=13 +#SBATCH --ntasks=1 ##SBATCH --ntasks-per-node=13 #SBATCH --cpus-per-task=1 -#SBATCH --output=Data_Preprocess_step1_era5-out.%j -#SBATCH --error=Data_Preprocess_step1era5-err.%j -#SBATCH --time=04:20:00 +#SBATCH --output=meta_postprocess_era5-out.%j +#SBATCH --error=meta_postprocess_era5-err.%j +#SBATCH --time=00:20:00 #SBATCH --partition=batch #SBATCH --gres=gpu:0 #SBATCH --mail-type=ALL #SBATCH --mail-user=me@somewhere.com -##Load basic Python module: Need input -#module load Python - - -##Create and activate a virtual environment: Need input -#VENV_NAME=<my_venv> -#Python -m venv ../virtual_envs/${VENV_NAME} -#source ../virtual_envs/${VENV_NAME}/bin/activate - -## Install required packages -# set PYTHONPATH... -WORKING_DIR="$(pwd)" -BASE_DIR=="$(WORKING_DIR "$dir")" -export PYTHONPATH=${BASE_DIR}/virtual_envs/${VENV_NAME}/lib/python3.8/site-packages:$PYTHONPATH -export PYTHONPATH=${BASE_DIR}:$PYTHONPATH -export PYTHONPATH=${BASE_DIR}/utils:$PYTHONPATH -export PYTHONPATH=${BASE_DIR}/model_modules:$PYTHONPATH -export PYTHONPATH=${BASE_DIR}/postprocess:$PYTHONPATH -# ... install requirements -pip install --no-cache-dir -r ../env_setup/requirements.txt - -# Name of virtual environment -VENV_NAME=venv_hdfml -# Name of container image (must be available in working directory) -CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif" -WRAPPER="${BASE_DIR}/env_setup/wrapper_container.sh" - -# sanity checks -if [[ ! -f ${CONTAINER_IMG} ]]; then - echo "ERROR: Cannot find required TF1.15 container image '${CONTAINER_IMG}'." - exit 1 -fi - -if [[ ! -f ${WRAPPER} ]]; then - echo "ERROR: Cannot find wrapper-script '${WRAPPER}' for TF1.15 container image." 
- exit 1 -fi +######### Template identifier (don't remove) ######### +echo "Do not run the template scripts" +exit 99 +######### Template identifier (don't remove) ######### # Declare input parameters root_dir=/p/project/deepacf/deeprain/video_prediction_shared_folder/ diff --git a/video_prediction_tools/HPC_scripts/preprocess_data_era5_step1_template.sh b/video_prediction_tools/HPC_scripts/preprocess_data_era5_step1_template.sh index cc500654c49e4b619399aa8685d51b7299836d42..990095c1f5fa00e4058e08d355830e0fa620b0f3 100644 --- a/video_prediction_tools/HPC_scripts/preprocess_data_era5_step1_template.sh +++ b/video_prediction_tools/HPC_scripts/preprocess_data_era5_step1_template.sh @@ -1,64 +1,59 @@ #!/bin/bash -x -## Controlling Batch-job : Need input -#SBATCH --account=<Project name> +## Controlling Batch-job +#SBATCH --account=<your_project> #SBATCH --nodes=1 #SBATCH --ntasks=13 -##SBATCH --ntasks-per-node=13 +##SBATCH --ntasks-per-node=12 #SBATCH --cpus-per-task=1 -#SBATCH --output=Data_Preprocess_step1_era5-out.%j -#SBATCH --error=Data_Preprocess_step1era5-err.%j +#SBATCH --output=DataPreprocess_era5_step1-out.%j +#SBATCH --error=DataPreprocess_era5_step1-err.%j #SBATCH --time=04:20:00 -#SBATCH --partition=batch #SBATCH --gres=gpu:0 +#SBATCH --partition=batch #SBATCH --mail-type=ALL #SBATCH --mail-user=me@somewhere.com -##Load basic Python module: Need input -#module load Python - - -##Create and activate a virtual environment : Need input -#VENV_NAME=<my_venv> -#Python -m venv ../virtual_envs/${VENV_NAME} -#source ../virtual_envs/${VENV_NAME}/bin/activate - +######### Template identifier (don't remove) ######### +echo "Do not run the template scripts" +exit 99 +######### Template identifier (don't remove) ######### + +# Name of virtual environment +VIRT_ENV_NAME="my_venv" + +# Activate virtual environment if needed (and possible) +if [ -z ${VIRTUAL_ENV} ]; then + if [[ -f ../virtual_envs/${VIRT_ENV_NAME}/bin/activate ]]; then + echo "Activating virtual environment..." + source ../virtual_envs/${VIRT_ENV_NAME}/bin/activate + else + echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..." + exit 1 + fi +fi +# Loading modules +source ../env_setup/modules_preprocess+extract.sh + + +# select years and variables for dataset and define target domain +years=( "2015" ) +variables=( "t2" "t2" "t2" ) +sw_corner=( -999.9 -999.9) +nyx=( -999 -999 ) -## Install required packages -# set PYTHONPATH... -BASE_DIR="$(pwd)" -WORKING_DIR=="$(BASE_DIR "$dir")" -export PYTHONPATH=${WORKING_DIR}/virtual_envs/${VENV_NAME}/lib/python3.8/site-packages:$PYTHONPATH -export PYTHONPATH=${WORKING_DIR}:$PYTHONPATH -export PYTHONPATH=${WORKING_DIR}/utils:$PYTHONPATH -export PYTHONPATH=${WORKING_DIR}/model_modules:$PYTHONPATH -export PYTHONPATH=${WORKING_DIR}/postprocess:$PYTHONPATH -# ... 
install requirements -pip install --no-cache-dir -r ../env_setup/requirements.txt - - -# select years for dataset -declare -a years=( - "2017" - ) - -max_year=`echo "${years[*]}" | sort -nr | head -n1` -min_year=`echo "${years[*]}" | sort -nr | tail -n1` # set some paths -# note, that destination_dir is used during runtime to set a proper experiment directory -exp_id=xxx # experiment identifier is set by 'generate_workflow_runscripts.sh' -source_dir=${SAVE_DIR}/extractedData -destination_dir=${SAVE_DIR}/preprocessedData/era5-Y${min_year}to${max_year}M01to12 -script_dir=`pwd` - -for year in "${years[@]}"; - do - echo "Year $year" - echo "source_dir ${source_dir}/${year}" - mpirun -np 2 python ../../workflow_parallel_frame_prediction/DataPreprocess/mpi_stager_v2_process_netCDF.py \ - --source_dir ${source_dir} -scr_dir ${script_dir} -exp_dir ${exp_id} \ - --destination_dir ${destination_dir} --years ${years} --vars T2 MSL gph500 --lat_s 74 --lat_e 202 --lon_s 550 --lon_e 710 - done - +# note, that destination_dir is adjusted during runtime based on the data +source_dir=/my/path/to/extracted/data/ +destination_dir=/my/path/to/pickle/files +# execute Python-scripts +for year in "${years[@]}"; do + echo "start preprocessing data for year ${year}" + srun python ../main_scripts/main_preprocess_data_step1.py \ + --source_dir ${source_dir} --destination_dir ${destination_dir} --years "${year}" \ + --vars "${variables[0]}" "${variables[1]}" "${variables[2]}" \ + --sw_corner "${sw_corner[0]}" "${sw_corner[1]}" --nyx "${nyx[0]}" "${nyx[1]}" +done +#srun python ../../workflow_parallel_frame_prediction/DataPreprocess/mpi_split_data_multi_years.py --destination_dir ${destination_dir} --varnames T2 MSL gph500 diff --git a/video_prediction_tools/HPC_scripts/preprocess_data_era5_step2_template.sh b/video_prediction_tools/HPC_scripts/preprocess_data_era5_step2_template.sh index 1afb89088c008666c974adfc1bebe96e0c68f169..daa48d352ce6b1eca9c2f76692e68ca3e786273e 100644 --- a/video_prediction_tools/HPC_scripts/preprocess_data_era5_step2_template.sh +++ b/video_prediction_tools/HPC_scripts/preprocess_data_era5_step2_template.sh @@ -1,41 +1,29 @@ #!/bin/bash -x -## Controlling Batch-job: Need input -#SBATCH --account=<Project name> +#SBATCH --account=<your_project> #SBATCH --nodes=1 #SBATCH --ntasks=13 ##SBATCH --ntasks-per-node=13 #SBATCH --cpus-per-task=1 -#SBATCH --output=Data_Preprocess_step1_era5-out.%j -#SBATCH --error=Data_Preprocess_step1era5-err.%j -#SBATCH --time=04:20:00 -#SBATCH --partition=batch +#SBATCH --output=DataPreprocess_era5_step2-out.%j +#SBATCH --error=DataPreprocess_era5_step2-err.%j +#SBATCH --time=04:00:00 #SBATCH --gres=gpu:0 +#SBATCH --partition=batch #SBATCH --mail-type=ALL #SBATCH --mail-user=me@somewhere.com -##Load basic Python module: Need input -#module load Python - - -##Create and activate a virtual environment: Need input -#VENV_NAME=<my_venv> -#Python -m venv ../virtual_envs/${VENV_NAME} -#source ../virtual_envs/${VENV_NAME}/bin/activate - -## Install required packages -# set PYTHONPATH... -WORKING_DIR="$(pwd)" -BASE_DIR=="$(WORKING_DIR "$dir")" -export PYTHONPATH=${BASE_DIR}/virtual_envs/${VENV_NAME}/lib/python3.8/site-packages:$PYTHONPATH -export PYTHONPATH=${BASE_DIR}:$PYTHONPATH -export PYTHONPATH=${BASE_DIR}/utils:$PYTHONPATH -export PYTHONPATH=${BASE_DIR}/model_modules:$PYTHONPATH -export PYTHONPATH=${BASE_DIR}/postprocess:$PYTHONPATH -# ... 
install requirements -pip install --no-cache-dir -r ../env_setup/requirements.txt +######### Template identifier (don't remove) ######### +echo "Do not run the template scripts" +exit 99 +######### Template identifier (don't remove) ######### +# auxiliary variables +WORK_DIR="$(pwd)" +BASE_DIR=$(dirname "$WORK_DIR") # Name of virtual environment -VENV_NAME=venv_hdfml +VIRT_ENV_NAME="my_venv" +# !!! ADAPAT DEPENDING ON USAGE OF CONTAINER !!! +# For container usage, comment in the follwoing lines # Name of container image (must be available in working directory) CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif" WRAPPER="${BASE_DIR}/env_setup/wrapper_container.sh" @@ -58,14 +46,32 @@ module purge source_dir=/my/path/to/pkl/files/ destination_dir=/my/path/to/tfrecords/files -sequence_length=24 +sequence_length=20 sequences_per_file=10 # run Preprocessing (step 2 where Tf-records are generated) -# run postprocessing/generation of model results including evaluation metrics export CUDA_VISIBLE_DEVICES=0 ## One node, single GPU srun --mpi=pspmix --cpu-bind=none \ - singularity exec --nv "${CONTAINER_IMG}" "${WRAPPER}" ${VENV_NAME} \ + singularity exec --nv "${CONTAINER_IMG}" "${WRAPPER}" ${VIRT_ENV_NAME} \ python3 ../main_scripts/main_preprocess_data_step2.py -source_dir ${source_dir} -dest_dir ${destination_dir} \ -sequence_length ${sequence_length} -sequences_per_file ${sequences_per_file} +# WITHOUT container usage, comment in the follwoing lines (and uncomment the lines above) +# Activate virtual environment if needed (and possible) +#if [ -z ${VIRTUAL_ENV} ]; then +# if [[ -f ../virtual_envs/${VIRT_ENV_NAME}/bin/activate ]]; then +# echo "Activating virtual environment..." +# source ../virtual_envs/${VIRT_ENV_NAME}/bin/activate +# else +# echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..." +# exit 1 +# fi +#fi +# +# Loading modules +#module purge +#source ../env_setup/modules_train.sh +#export CUDA_VISIBLE_DEVICES=0 +# +# srun python3 ../main_scripts/main_preprocess_data_step2.py -source_dir ${source_dir} -dest_dir ${destination_dir} \ +# -sequence_length ${sequence_length} -sequences_per_file ${sequences_per_file} diff --git a/video_prediction_tools/HPC_scripts/preprocess_data_moving_mnist_template.sh b/video_prediction_tools/HPC_scripts/preprocess_data_moving_mnist_template.sh new file mode 100644 index 0000000000000000000000000000000000000000..f72950255efa181ca95b9b4f13c81efafe1e7733 --- /dev/null +++ b/video_prediction_tools/HPC_scripts/preprocess_data_moving_mnist_template.sh @@ -0,0 +1,71 @@ +#!/bin/bash -x +#SBATCH --account=<your_project> +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +##SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=1 +#SBATCH --output=DataPreprocess_moving_mnist-out.%j +#SBATCH --error=DataPreprocess_moving_mnist-err.%j +#SBATCH --time=04:00:00 +#SBATCH --partition=batch +#SBATCH --mail-type=ALL +#SBATCH --mail-user=me@somewhere.com + +######### Template identifier (don't remove) ######### +echo "Do not run the template scripts" +exit 99 +######### Template identifier (don't remove) ######### + +# Name of virtual environment +VIRT_ENV_NAME="my_venv" + +# !!! ADAPAT DEPENDING ON USAGE OF CONTAINER !!! +# For container usage, comment in the follwoing lines +# Name of container image (must be available in working directory) +CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif" +WRAPPER="${BASE_DIR}/env_setup/wrapper_container.sh" + +# sanity checks +if [[ ! 
-f ${CONTAINER_IMG} ]]; then + echo "ERROR: Cannot find required TF1.15 container image '${CONTAINER_IMG}'." + exit 1 +fi + +if [[ ! -f ${WRAPPER} ]]; then + echo "ERROR: Cannot find wrapper-script '${WRAPPER}' for TF1.15 container image." + exit 1 +fi + +# clean-up modules to avoid conflicts between host and container settings +module purge + +# declare directory-variables which will be modified generate_runscript.py +source_dir=/my/path/to/mnist/raw/data/ +destination_dir=/my/path/to/mnist/tfrecords/ + +# run Preprocessing (step 2 where Tf-records are generated) +# run postprocessing/generation of model results including evaluation metrics +export CUDA_VISIBLE_DEVICES=0 +## One node, single GPU +srun --mpi=pspmix --cpu-bind=none \ + singularity exec --nv "${CONTAINER_IMG}" "${WRAPPER}" ${VIRT_ENV_NAME} \ + python3 ../video_prediction/datasets/moving_mnist.py ${source_dir} ${destination_dir} + +# WITHOUT container usage, comment in the follwoing lines (and uncomment the lines above) +# Activate virtual environment if needed (and possible) +#if [ -z ${VIRTUAL_ENV} ]; then +# if [[ -f ../virtual_envs/${VIRT_ENV_NAME}/bin/activate ]]; then +# echo "Activating virtual environment..." +# source ../virtual_envs/${VIRT_ENV_NAME}/bin/activate +# else +# echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..." +# exit 1 +# fi +#fi +# +# Loading modules +#module purge +#source ../env_setup/modules_train.sh +#export CUDA_VISIBLE_DEVICES=0 +# +# srun python3 .../video_prediction/datasets/moving_mnist.py ${source_dir} ${destination_dir} \ No newline at end of file diff --git a/video_prediction_tools/HPC_scripts/train_model_era5_template.sh b/video_prediction_tools/HPC_scripts/train_model_era5_template.sh old mode 100644 new mode 100755 index 0f7b054908d09087dc266751157959f906e33fd8..2cbc4c4a2f59c0e7b68327be747afd7e3f9e06ce --- a/video_prediction_tools/HPC_scripts/train_model_era5_template.sh +++ b/video_prediction_tools/HPC_scripts/train_model_era5_template.sh @@ -1,41 +1,27 @@ #!/bin/bash -x -## Controlling Batch-job: Need input -#SBATCH --account=<Project name> +#SBATCH --account=<your_project> #SBATCH --nodes=1 -#SBATCH --ntasks=13 -##SBATCH --ntasks-per-node=13 -#SBATCH --cpus-per-task=1 -#SBATCH --output=Data_Preprocess_step1_era5-out.%j -#SBATCH --error=Data_Preprocess_step1era5-err.%j -#SBATCH --time=04:20:00 -#SBATCH --partition=batch -#SBATCH --gres=gpu:0 +#SBATCH --ntasks=1 +#SBATCH --output=train_model_era5-out.%j +#SBATCH --error=train_model_era5-err.%j +#SBATCH --time=24:00:00 +#SBATCH --gres=gpu:1 +#SBATCH --partition=some_partition #SBATCH --mail-type=ALL #SBATCH --mail-user=me@somewhere.com -##Load basic Python module: Need input -#module load Python - - -##Create and activate a virtual environment: Need input -#VENV_NAME=<my_venv> -#Python -m venv ../virtual_envs/${VENV_NAME} -#source ../virtual_envs/${VENV_NAME}/bin/activate - -## Install required packages -# set PYTHONPATH... -WORKING_DIR="$(pwd)" -BASE_DIR=="$(WORKING_DIR "$dir")" -export PYTHONPATH=${BASE_DIR}/virtual_envs/${VENV_NAME}/lib/python3.8/site-packages:$PYTHONPATH -export PYTHONPATH=${BASE_DIR}:$PYTHONPATH -export PYTHONPATH=${BASE_DIR}/utils:$PYTHONPATH -export PYTHONPATH=${BASE_DIR}/model_modules:$PYTHONPATH -export PYTHONPATH=${BASE_DIR}/postprocess:$PYTHONPATH -# ... 
install requirements -pip install --no-cache-dir -r ../env_setup/requirements.txt +######### Template identifier (don't remove) ######### +echo "Do not run the template scripts" +exit 99 +######### Template identifier (don't remove) ######### +# auxiliary variables +WORK_DIR="$(pwd)" +BASE_DIR=$(dirname "$WORK_DIR") # Name of virtual environment -VENV_NAME=venv_hdfml +VIRT_ENV_NAME="my_venv" +# !!! ADAPAT DEPENDING ON USAGE OF CONTAINER !!! +# For container usage, comment in the follwoing lines # Name of container image (must be available in working directory) CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif" WRAPPER="${BASE_DIR}/env_setup/wrapper_container.sh" @@ -54,7 +40,6 @@ fi # clean-up modules to avoid conflicts between host and container settings module purge - # declare directory-variables which will be modified by generate_runscript.py source_dir=/my/path/to/tfrecords/files destination_dir=/my/model/output/path @@ -72,3 +57,22 @@ srun --mpi=pspmix --cpu-bind=none \ python3 "${BASE_DIR}"/main_scripts/main_train_models.py --input_dir ${source_dir} --datasplit_dict ${datasplit_dict} \ --dataset era5 --model ${model} --model_hparams_dict ${model_hparams} --output_dir ${destination_dir}/ +# WITHOUT container usage, comment in the follwoing lines (and uncomment the lines above) +# Activate virtual environment if needed (and possible) +#if [ -z ${VIRTUAL_ENV} ]; then +# if [[ -f ../virtual_envs/${VIRT_ENV_NAME}/bin/activate ]]; then +# echo "Activating virtual environment..." +# source ../virtual_envs/${VIRT_ENV_NAME}/bin/activate +# else +# echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..." +# exit 1 +# fi +#fi +# +# Loading modules +#module purge +#source ../env_setup/modules_train.sh +#export CUDA_VISIBLE_DEVICES=0 +# +# srun python3 "${BASE_DIR}"/main_scripts/main_train_models.py --input_dir ${source_dir} --datasplit_dict ${datasplit_dict} \ +# --dataset era5 --model ${model} --model_hparams_dict ${model_hparams} --output_dir ${destination_dir}/ \ No newline at end of file diff --git a/video_prediction_tools/JSC_scripts/train_model_moving_mnist_template.sh b/video_prediction_tools/HPC_scripts/train_model_moving_mnist_template.sh similarity index 68% rename from video_prediction_tools/JSC_scripts/train_model_moving_mnist_template.sh rename to video_prediction_tools/HPC_scripts/train_model_moving_mnist_template.sh index 0f25f6906d63918d376d697fbec98eadfb1ad9a0..322d0fac362119032f558232e8161321434d2f2f 100755 --- a/video_prediction_tools/JSC_scripts/train_model_moving_mnist_template.sh +++ b/video_prediction_tools/HPC_scripts/train_model_moving_mnist_template.sh @@ -1,5 +1,5 @@ #!/bin/bash -x -#SBATCH --account=deepacf +#SBATCH --account=<your_project> #SBATCH --nodes=1 #SBATCH --ntasks=1 ##SBATCH --ntasks-per-node=1 @@ -10,8 +10,7 @@ #SBATCH --gres=gpu:1 #SBATCH --partition=gpus #SBATCH --mail-type=ALL -#SBATCH --mail-user=b.gong@fz-juelich.de -##jutil env activate -p cjjsc42 +#SBATCH --mail-user=me@somewhere.com ######### Template identifier (don't remove) ######### echo "Do not run the template scripts" @@ -23,6 +22,8 @@ WORK_DIR=`pwd` BASE_DIR=$(dirname "$WORK_DIR") # Name of virtual environment VIRT_ENV_NAME="my_venv" +# !!! ADAPAT DEPENDING ON USAGE OF CONTAINER !!! 
+# For container usage, comment in the follwoing lines # Name of container image (must be available in working directory) CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif" WRAPPER="${BASE_DIR}/env_setup/wrapper_container.sh" @@ -52,7 +53,6 @@ dataset=moving_mnist model_hparams=../hparams/${dataset}/${model}/model_hparams.json destination_dir=${destination_dir}/${model}/"$(date +"%Y%m%dT%H%M")_"$USER"" -# rund training # run training in container export CUDA_VISIBLE_DEVICES=0 ## One node, single GPU @@ -60,3 +60,23 @@ srun --mpi=pspmix --cpu-bind=none \ singularity exec --nv "${CONTAINER_IMG}" "${WRAPPER}" ${VIRT_ENV_NAME} \ python ../main_scripts/train.py --input_dir ${source_dir}/tfrecords/ --dataset ${dataset} --model ${model} \ --model_hparams_dict ${model_hparams} --output_dir "${destination_dir}"/ + +# WITHOUT container usage, comment in the follwoing lines (and uncomment the lines above) +# Activate virtual environment if needed (and possible) +#if [ -z ${VIRTUAL_ENV} ]; then +# if [[ -f ../virtual_envs/${VIRT_ENV_NAME}/bin/activate ]]; then +# echo "Activating virtual environment..." +# source ../virtual_envs/${VIRT_ENV_NAME}/bin/activate +# else +# echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..." +# exit 1 +# fi +#fi +# +# Loading modules +#module purge +#source ../env_setup/modules_train.sh +#export CUDA_VISIBLE_DEVICES=0 +# +# srun python3 ../main_scripts/train.py --input_dir ${source_dir}/tfrecords/ --dataset ${dataset} --model ${model} \ +# --model_hparams_dict ${model_hparams} --output_dir "${destination_dir}"/ \ No newline at end of file diff --git a/video_prediction_tools/HPC_scripts/visualize_postprocess_era5_template.sh b/video_prediction_tools/HPC_scripts/visualize_postprocess_era5_template.sh index e7f169337b5bddb47fc62116bce6b2af96991d7d..6239b82ff4b18e85b045d011dce50077bd93c1f2 100644 --- a/video_prediction_tools/HPC_scripts/visualize_postprocess_era5_template.sh +++ b/video_prediction_tools/HPC_scripts/visualize_postprocess_era5_template.sh @@ -1,41 +1,29 @@ #!/bin/bash -x -## Controlling Batch-job: Need input -#SBATCH --account=<Project name> +#SBATCH --account=<your_project> #SBATCH --nodes=1 -#SBATCH --ntasks=13 -##SBATCH --ntasks-per-node=13 +#SBATCH --ntasks=1 +##SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=1 -#SBATCH --output=Data_Preprocess_step1_era5-out.%j -#SBATCH --error=Data_Preprocess_step1era5-err.%j -#SBATCH --time=04:20:00 -#SBATCH --partition=batch -#SBATCH --gres=gpu:0 +#SBATCH --output=postprocess_era5-out.%j +#SBATCH --error=postprocess_era5-err.%j +#SBATCH --time=01:00:00 +#SBATCH --gres=gpu:1 +#SBATCH --partition=gpus #SBATCH --mail-type=ALL #SBATCH --mail-user=me@somewhere.com -##Load basic Python module: Need input -#module load Python +######### Template identifier (don't remove) ######### +echo "Do not run the template scripts" +exit 99 +######### Template identifier (don't remove) ######### - -##Create and activate a virtual environment: Need input -#VENV_NAME=<my_venv> -#Python -m venv ../virtual_envs/${VENV_NAME} -#source ../virtual_envs/${VENV_NAME}/bin/activate - -## Install required packages -# set PYTHONPATH... -WORKING_DIR="$(pwd)" -BASE_DIR=="$(WORKING_DIR "$dir")" -export PYTHONPATH=${BASE_DIR}/virtual_envs/${VENV_NAME}/lib/python3.8/site-packages:$PYTHONPATH -export PYTHONPATH=${BASE_DIR}:$PYTHONPATH -export PYTHONPATH=${BASE_DIR}/utils:$PYTHONPATH -export PYTHONPATH=${BASE_DIR}/model_modules:$PYTHONPATH -export PYTHONPATH=${BASE_DIR}/postprocess:$PYTHONPATH -# ... 
install requirements -pip install --no-cache-dir -r ../env_setup/requirements.txt - -# Name of virtual environment -VENV_NAME=venv_hdfml +# auxiliary variables +WORK_DIR="$(pwd)" +BASE_DIR=$(dirname "$WORK_DIR") +# Name of virtual environment +VIRT_ENV_NAME="my_venv" +# !!! ADAPT DEPENDING ON USAGE OF CONTAINER !!! +# For container usage, comment in the following lines # Name of container image (must be available in working directory) CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif" WRAPPER="${BASE_DIR}/env_setup/wrapper_container.sh" @@ -51,10 +39,11 @@ fi + +# clean-up modules to avoid conflicts between host and container settings +module purge # declare directory-variables which will be modified by generate_runscript.py # Note: source_dir is only needed for retrieving the base-directory -source_dir=/my/source/dir/ checkpoint_dir=/my/trained/model/dir results_dir=/my/results/dir lquick="" @@ -69,3 +58,24 @@ srun --mpi=pspmix --cpu-bind=none \ --num_stochastic_samples 1 ${lquick} \ > postprocess_era5-out_all."${SLURM_JOB_ID}" +# WITHOUT container usage, comment in the following lines (and uncomment the lines above) +# Activate virtual environment if needed (and possible) +#if [ -z ${VIRTUAL_ENV} ]; then +# if [[ -f ../virtual_envs/${VIRT_ENV_NAME}/bin/activate ]]; then +# echo "Activating virtual environment..." +# source ../virtual_envs/${VIRT_ENV_NAME}/bin/activate +# else +# echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..." +# exit 1 +# fi +#fi +# +# Loading modules +#module purge +#source ../env_setup/modules_train.sh +#export CUDA_VISIBLE_DEVICES=0 +# +# srun python3 ../main_scripts/main_visualize_postprocess.py --checkpoint ${checkpoint_dir} --mode test \ +# --results_dir ${results_dir} --batch_size 4 \ +# --num_stochastic_samples 1 ${lquick} \ +# > postprocess_era5-out_all."${SLURM_JOB_ID}" \ No newline at end of file diff --git a/video_prediction_tools/JSC_scripts/visualize_postprocess_moving_mnist_template.sh b/video_prediction_tools/HPC_scripts/visualize_postprocess_moving_mnist_template.sh similarity index 66% rename from video_prediction_tools/JSC_scripts/visualize_postprocess_moving_mnist_template.sh rename to video_prediction_tools/HPC_scripts/visualize_postprocess_moving_mnist_template.sh index c57beecc13959eb2dd28654f5289f0c0b122a71c..142193121fb12ea792d0350eac859652512438a1 100755 --- a/video_prediction_tools/JSC_scripts/visualize_postprocess_moving_mnist_template.sh +++ b/video_prediction_tools/HPC_scripts/visualize_postprocess_moving_mnist_template.sh @@ -1,5 +1,5 @@ #!/bin/bash -x -#SBATCH --account=deepacf +#SBATCH --account=<your_project> #SBATCH --nodes=1 #SBATCH --ntasks=1 ##SBATCH --ntasks-per-node=1 @@ -10,8 +10,7 @@ #SBATCH --gres=gpu:1 #SBATCH --partition=develgpus #SBATCH --mail-type=ALL -#SBATCH --mail-user=b.gong@fz-juelich.de -##jutil env activate -p cjjsc42 +#SBATCH --mail-user=me@somewhere.com ######### Template identifier (don't remove) ######### echo "Do not run the template scripts" @@ -23,6 +22,8 @@ WORK_DIR="$(pwd)" BASE_DIR=$(dirname "$WORK_DIR") # Name of virtual environment VIRT_ENV_NAME="my_venv" +# !!! ADAPT DEPENDING ON USAGE OF CONTAINER !!!
+# For container usage, comment in the following lines # Name of container image (must be available in working directory) CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif" WRAPPER="${BASE_DIR}/env_setup/wrapper_container.sh" @@ -57,3 +58,23 @@ srun --mpi=pspmix --cpu-bind=none \ --checkpoint ${checkpoint_dir}/${model} --mode test --model ${model} --results_dir ${results_dir}/${model} \ --batch_size 2 --dataset era5 > generate_era5-out."${SLURM_JOB_ID}" +# WITHOUT container usage, comment in the following lines (and uncomment the lines above) +# Activate virtual environment if needed (and possible) +#if [ -z ${VIRTUAL_ENV} ]; then +# if [[ -f ../virtual_envs/${VIRT_ENV_NAME}/bin/activate ]]; then +# echo "Activating virtual environment..." +# source ../virtual_envs/${VIRT_ENV_NAME}/bin/activate +# else +# echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..." +# exit 1 +# fi +#fi +# +# Loading modules +#module purge +#source ../env_setup/modules_train.sh +#export CUDA_VISIBLE_DEVICES=0 +# +# srun python3 ../scripts/generate_movingmnist.py --input_dir ${source_dir}/ --dataset_hparams sequence_length=20 \ +# --checkpoint ${checkpoint_dir}/${model} --mode test --model ${model} --results_dir ${results_dir}/${model} \ +# --batch_size 2 --dataset era5 > generate_era5-out."${SLURM_JOB_ID}" \ No newline at end of file diff --git a/video_prediction_tools/JSC_scripts/data_extraction_era5_template.sh b/video_prediction_tools/JSC_scripts/data_extraction_era5_template.sh deleted file mode 100644 index f856eb55e47eb89fa9dbdba96e78dbe050ecdfab..0000000000000000000000000000000000000000 --- a/video_prediction_tools/JSC_scripts/data_extraction_era5_template.sh +++ /dev/null @@ -1,51 +0,0 @@ -#!/bin/bash -x -## Controlling Batch-job -#SBATCH --account=deepacf -#SBATCH --nodes=1 -#SBATCH --ntasks=13 -##SBATCH --ntasks-per-node=13 -#SBATCH --cpus-per-task=1 -#SBATCH --output=data_extraction_era5-out.%j -#SBATCH --error=data_extraction_era5-err.%j -#SBATCH --time=04:20:00 -#SBATCH --partition=batch -#SBATCH --gres=gpu:0 -#SBATCH --mail-type=ALL -#SBATCH --mail-user=me@somewhere.com - -######### Template identifier (don't remove) ######### -echo "Do not run the template scripts" -exit 99 -######### Template identifier (don't remove) ######### - -jutil env activate -p deepacf - -# Name of virtual environment -VIRT_ENV_NAME="my_venv" - -# Loading mouldes -source ../env_setup/modules_preprocess+extract.sh -# Activate virtual environment if needed (and possible) -if [ -z ${VIRTUAL_ENV} ]; then - if [[ -f ../${VIRT_ENV_NAME}/bin/activate ]]; then - echo "Activating virtual environment..." - source ../${VIRT_ENV_NAME}/bin/activate - else - echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..."
- exit 1 - fi -fi - -# Declare path-variables (dest_dir will be set and configured automatically via generate_runscript.py) -source_dir=/my/path/to/era5 -destination_dir=/my/path/to/extracted/data -varmap_file=/my/path/to/varmapping/file - -years=( "2015" ) - -# Run data extraction -for year in "${years[@]}"; do - echo "Perform ERA5-data extraction for year ${year}" - srun python ../main_scripts/main_data_extraction.py --source_dir ${source_dir} --target_dir ${destination_dir} \ - --year ${year} --varslist_path ${varmap_file} -done diff --git a/video_prediction_tools/JSC_scripts/meta_postprocess_era5_template.sh b/video_prediction_tools/JSC_scripts/meta_postprocess_era5_template.sh deleted file mode 100644 index ec8b6eb42c0f0bef9dbc1d70701408b6fabda7f0..0000000000000000000000000000000000000000 --- a/video_prediction_tools/JSC_scripts/meta_postprocess_era5_template.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash -x -## Controlling Batch-job -#SBATCH --account=deepacf -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -##SBATCH --ntasks-per-node=13 -#SBATCH --cpus-per-task=1 -#SBATCH --output=meta_postprocess_era5-out.%j -#SBATCH --error=meta_postprocess_era5-err.%j -#SBATCH --time=00:20:00 -#SBATCH --partition=batch -#SBATCH --gres=gpu:0 -#SBATCH --mail-type=ALL -#SBATCH --mail-user=me@somewhere.com - -######### Template identifier (don't remove) ######### -echo "Do not run the template scripts" -exit 99 -######### Template identifier (don't remove) ######### -jutil env activate -p deepacf - - -# Declare input parameters -root_dir=/p/project/deepacf/deeprain/video_prediction_shared_folder/ -analysis_config=video_prediction_tools/meta_postprocess_config/meta_config.json -metric=mse -exp_id=test -enable_skill_scores=True - -srun python ../main_scripts/main_meta_postprocess.py --root_dir ${root_dir} --analysis_config ${analysis_config} \ - --metric ${metric} --exp_id ${exp_id} --enable_skill_scores ${enable_skill_scores} diff --git a/video_prediction_tools/JSC_scripts/preprocess_data_era5_step1_template.sh b/video_prediction_tools/JSC_scripts/preprocess_data_era5_step1_template.sh deleted file mode 100644 index 80d4de5266bc57c944bd57ffa5359512b4f23a4b..0000000000000000000000000000000000000000 --- a/video_prediction_tools/JSC_scripts/preprocess_data_era5_step1_template.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash -x -## Controlling Batch-job -#SBATCH --account=deepacf -#SBATCH --nodes=1 -#SBATCH --ntasks=13 -##SBATCH --ntasks-per-node=12 -#SBATCH --cpus-per-task=1 -#SBATCH --output=DataPreprocess_era5_step1-out.%j -#SBATCH --error=DataPreprocess_era5_step1-err.%j -#SBATCH --time=04:20:00 -#SBATCH --gres=gpu:0 -#SBATCH --partition=batch -#SBATCH --mail-type=ALL -#SBATCH --mail-user=me@somewhere.com - -######### Template identifier (don't remove) ######### -echo "Do not run the template scripts" -exit 99 -######### Template identifier (don't remove) ######### - -# Name of virtual environment -VIRT_ENV_NAME="my_venv" - -# Activate virtual environment if needed (and possible) -if [ -z ${VIRTUAL_ENV} ]; then - if [[ -f ../virtual_envs/${VIRT_ENV_NAME}/bin/activate ]]; then - echo "Activating virtual environment..." - source ../virtual_envs/${VIRT_ENV_NAME}/bin/activate - else - echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..." 
- exit 1 - fi -fi -# Loading mouldes -source ../env_setup/modules_preprocess+extract.sh - - -# select years and variables for dataset and define target domain -years=( "2015" ) -variables=( "t2" "t2" "t2" ) -sw_corner=( -999.9 -999.9) -nyx=( -999 -999 ) - -# set some paths -# note, that destination_dir is adjusted during runtime based on the data -source_dir=/my/path/to/extracted/data/ -destination_dir=/my/path/to/pickle/files - -# execute Python-scripts -for year in "${years[@]}"; do - echo "start preprocessing data for year ${year}" - srun python ../main_scripts/main_preprocess_data_step1.py \ - --source_dir ${source_dir} --destination_dir ${destination_dir} --years "${year}" \ - --vars "${variables[0]}" "${variables[1]}" "${variables[2]}" \ - --sw_corner "${sw_corner[0]}" "${sw_corner[1]}" --nyx "${nyx[0]}" "${nyx[1]}" -done - - -#srun python ../../workflow_parallel_frame_prediction/DataPreprocess/mpi_split_data_multi_years.py --destination_dir ${destination_dir} --varnames T2 MSL gph500 diff --git a/video_prediction_tools/JSC_scripts/preprocess_data_era5_step2_template.sh b/video_prediction_tools/JSC_scripts/preprocess_data_era5_step2_template.sh deleted file mode 100644 index e0440dff5ab507f0ba475485781ab63283f8f4dc..0000000000000000000000000000000000000000 --- a/video_prediction_tools/JSC_scripts/preprocess_data_era5_step2_template.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/bin/bash -x -#SBATCH --account=deepacf -#SBATCH --nodes=1 -#SBATCH --ntasks=13 -##SBATCH --ntasks-per-node=13 -#SBATCH --cpus-per-task=1 -#SBATCH --output=DataPreprocess_era5_step2-out.%j -#SBATCH --error=DataPreprocess_era5_step2-err.%j -#SBATCH --time=04:00:00 -#SBATCH --gres=gpu:0 -#SBATCH --partition=batch -#SBATCH --mail-type=ALL -#SBATCH --mail-user=me@somewhere.com - -######### Template identifier (don't remove) ######### -echo "Do not run the template scripts" -exit 99 -######### Template identifier (don't remove) ######### - -# auxiliary variables -WORK_DIR="$(pwd)" -BASE_DIR=$(dirname "$WORK_DIR") -# Name of virtual environment -VIRT_ENV_NAME="my_venv" -# Name of container image (must be available in working directory) -CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif" -WRAPPER="${BASE_DIR}/env_setup/wrapper_container.sh" - -# sanity checks -if [[ ! -f ${CONTAINER_IMG} ]]; then - echo "ERROR: Cannot find required TF1.15 container image '${CONTAINER_IMG}'." - exit 1 -fi - -if [[ ! -f ${WRAPPER} ]]; then - echo "ERROR: Cannot find wrapper-script '${WRAPPER}' for TF1.15 container image." 
- exit 1 -fi - -# clean-up modules to avoid conflicts between host and container settings -module purge - -# declare directory-variables which will be modified by config_runscript.py -source_dir=/my/path/to/pkl/files/ -destination_dir=/my/path/to/tfrecords/files - -sequence_length=20 -sequences_per_file=10 -# run Preprocessing (step 2 where Tf-records are generated) -# run postprocessing/generation of model results including evaluation metrics -export CUDA_VISIBLE_DEVICES=0 -## One node, single GPU -srun --mpi=pspmix --cpu-bind=none \ - singularity exec --nv "${CONTAINER_IMG}" "${WRAPPER}" ${VIRT_ENV_NAME} \ - python3 ../main_scripts/main_preprocess_data_step2.py -source_dir ${source_dir} -dest_dir ${destination_dir} \ - -sequence_length ${sequence_length} -sequences_per_file ${sequences_per_file} - diff --git a/video_prediction_tools/JSC_scripts/preprocess_data_moving_mnist_template.sh b/video_prediction_tools/JSC_scripts/preprocess_data_moving_mnist_template.sh deleted file mode 100644 index fba90ba9caec50822503a735751b5342ef3398fb..0000000000000000000000000000000000000000 --- a/video_prediction_tools/JSC_scripts/preprocess_data_moving_mnist_template.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash -x -#SBATCH --account=deepacf -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -##SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=1 -#SBATCH --output=DataPreprocess_moving_mnist-out.%j -#SBATCH --error=DataPreprocess_moving_mnist-err.%j -#SBATCH --time=04:00:00 -#SBATCH --partition=batch -#SBATCH --mail-type=ALL -#SBATCH --mail-user=me@somewhere.com - -######### Template identifier (don't remove) ######### -echo "Do not run the template scripts" -exit 99 -######### Template identifier (don't remove) ######### - -# Name of virtual environment -VIRT_ENV_NAME="my_venv" - -# Loading mouldes -source ../env_setup/modules_train.sh -# Activate virtual environment if needed (and possible) -if [ -z ${VIRTUAL_ENV} ]; then - if [[ -f ../${VIRT_ENV_NAME}/bin/activate ]]; then - echo "Activating virtual environment..." - source ../${VIRT_ENV_NAME}/bin/activate - else - echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..." 
- exit 1 - fi -fi - -# declare directory-variables which will be modified generate_runscript.py -source_dir=/my/path/to/mnist/raw/data/ -destination_dir=/my/path/to/mnist/tfrecords/ - -# run Preprocessing (step 2 where Tf-records are generated) -# run postprocessing/generation of model results including evaluation metrics -export CUDA_VISIBLE_DEVICES=0 -## One node, single GPU -srun --mpi=pspmix --cpu-bind=none \ - singularity exec --nv "${CONTAINER_IMG}" "${WRAPPER}" ${VIRT_ENV_NAME} \ - python3 ../video_prediction/datasets/moving_mnist.py ${source_dir} ${destination_dir} diff --git a/video_prediction_tools/JSC_scripts/visualize_postprocess_era5_template.sh b/video_prediction_tools/JSC_scripts/visualize_postprocess_era5_template.sh deleted file mode 100644 index be3e67c03f8384de39e9d193ad206e44695282df..0000000000000000000000000000000000000000 --- a/video_prediction_tools/JSC_scripts/visualize_postprocess_era5_template.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/bash -x -#SBATCH --account=deepacf -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -##SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=1 -#SBATCH --output=postprocess_era5-out.%j -#SBATCH --error=postprocess_era5-err.%j -#SBATCH --time=01:00:00 -#SBATCH --gres=gpu:1 -#SBATCH --partition=gpus -#SBATCH --mail-type=ALL -#SBATCH --mail-user=me@somewhere.com - -######### Template identifier (don't remove) ######### -echo "Do not run the template scripts" -exit 99 -######### Template identifier (don't remove) ######### - -# auxiliary variables -WORK_DIR="$(pwd)" -BASE_DIR=$(dirname "$WORK_DIR") -# Name of virtual environment -VIRT_ENV_NAME="my_venv" -# Name of container image (must be available in working directory) -CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif" -WRAPPER="${BASE_DIR}/env_setup/wrapper_container.sh" - -# sanity checks -if [[ ! -f ${CONTAINER_IMG} ]]; then - echo "ERROR: Cannot find required TF1.15 container image '${CONTAINER_IMG}'." - exit 1 -fi - -if [[ ! -f ${WRAPPER} ]]; then - echo "ERROR: Cannot find wrapper-script '${WRAPPER}' for TF1.15 container image." 
- exit 1 -fi - -# clean-up modules to avoid conflicts between host and container settings -module purge - -# declare directory-variables which will be modified by generate_runscript.py -# Note: source_dir is only needed for retrieving the base-directory -source_dir=/my/source/dir/ -checkpoint_dir=/my/trained/model/dir -results_dir=/my/results/dir -lquick="" - -# run postprocessing/generation of model results including evaluation metrics -export CUDA_VISIBLE_DEVICES=0 -## One node, single GPU -srun --mpi=pspmix --cpu-bind=none \ - singularity exec --nv "${CONTAINER_IMG}" "${WRAPPER}" ${VIRT_ENV_NAME} \ - python3 ../main_scripts/main_visualize_postprocess.py --checkpoint ${checkpoint_dir} --mode test \ - --results_dir ${results_dir} --batch_size 4 \ - --num_stochastic_samples 1 ${lquick} \ - > postprocess_era5-out_all."${SLURM_JOB_ID}" diff --git a/video_prediction_tools/env_setup/create_env.sh b/video_prediction_tools/env_setup/create_env.sh index 574a57e33b90e453d48d47ec4c013ca766320bde..285fc4832217444cd8c1f384f1525643da75a7e3 100755 --- a/video_prediction_tools/env_setup/create_env.sh +++ b/video_prediction_tools/env_setup/create_env.sh @@ -19,7 +19,26 @@ check_argin() { if [[ $argin == *"-base_dir="* ]]; then base_outdir=${argin#"-base_dir="} fi + if [[ $argin == *"-tf_container="* ]]; then + TF_CONTAINER_NAME=${argin#"-tf_container="} + fi + if [[ $argin == *"-l_nocontainer"* ]]; then + bool_container=0 + fi + if [[ $argin == *"-l_nohpc"* ]]; then + bool_hpc=0 + fi done + if [[ -z "${bool_container}" ]]; then + bool_container=1 + fi + if [[ -z "${bool_hpc}" ]]; then + bool_hpc=1 + fi + # in case that no TF-container is set manually, set the default + if [[ -z "${TF_CONTAINER_NAME}" ]]; then + TF_CONTAINER_NAME="tensorflow_21.09-tf1-py3.sif" + fi } # **************** Auxiliary functions **************** @@ -38,9 +57,7 @@ if [[ -z "$1" ]]; then return fi -if [[ "$#" -gt 1 ]]; then - check_argin ${@:2} # sets base_outdir if provided -fi +check_argin ${@:2} # sets further variables # set some variables HOST_NAME="$(hostname)" @@ -49,12 +66,15 @@ THIS_DIR="$(pwd)" WORKING_DIR="$(dirname "$THIS_DIR")" EXE_DIR="$(basename "$THIS_DIR")" ENV_DIR=${WORKING_DIR}/virtual_envs/${ENV_NAME} -TF_CONTAINER=${WORKING_DIR}/HPC_scripts/tensorflow_21.09-tf1-py3.sif +TF_CONTAINER=${WORKING_DIR}/HPC_scripts/${TF_CONTAINER_NAME} +if [[ ${bool_hpc} == 0 ]]; then + TF_CONTAINER=${WORKING_DIR}/no_HPC_scripts/${TF_CONTAINER_NAME} +fi ## perform sanity checks modules_purge="" -if [[ ! -f ${TF_CONTAINER} ]]; then +if [[ ! -f ${TF_CONTAINER} ]] && [[ ${bool_container} == 1 ]]; then echo "ERROR: Cannot find required TF1.15 container image '${TF_CONTAINER}'." return fi @@ -70,10 +90,15 @@ if [[ "${EXE_DIR}" != "env_setup" ]]; then fi if ! [[ "${HOST_NAME}" == hdfml* || "${HOST_NAME}" == *jwlogin* ]]; then - echo "ERROR: AMBS-workflow is currently only supported on the Juelich HPC-systems HDF-ML, Juwels and Juwels Booster" - return - # unset PYTHONPATH on every other machine that is not a known HPC-system - # unset PYTHONPATH + if [[ ${bool_container} == 0 ]]; then + echo "Execution without container. Please ensure that you fulfill the software requirements for Preprocessing." + if [[ ${bool_hpc} == 1 ]]; then + echo "Make use of modules provided on your HPC-system if possible, i.e. adapt modules_preprocess.sh and modules_train.sh." + fi + fi + if [[ ${bool_hpc} == 0 ]]; then + echo "Running on a non-HPC system. Ensure that you fulfill the software requirements on your machine, e.g. CDO." 
+ fi fi if [[ -d ${ENV_DIR} ]]; then @@ -88,19 +113,38 @@ fi if [[ "$ENV_EXIST" == 0 ]]; then # Activate virtual environment and install additional Python packages. echo "Configuring and activating virtual environment on ${HOST_NAME}" + + if [[ ${bool_container} == 1 ]]; then + if [[ ${bool_hpc} == 1 ]]; then + module purge + fi + singularity exec --nv "${TF_CONTAINER}" ./install_venv_container.sh "${ENV_DIR}" - module purge - singularity exec --nv "${TF_CONTAINER}" ./install_venv_container.sh "${ENV_DIR}" - - info_str="Virtual environment ${ENV_DIR} has been set up successfully." + info_str="Virtual environment ${ENV_DIR} has been set up successfully." + else + if [[ ${bool_hpc} == 1 ]]; then + source ${THIS_DIR}/modules_train.sh + fi + unset PYTHONPATH + ./install_venv.sh "${ENV_DIR}" + + # Activate virtual environment again + source "${ENV_DIR}/bin/activate" + + if [[ ${bool_hpc} == 0 ]]; then + pip3 install --no-cache-dir tensorflow==1.13.1 + fi + fi elif [[ "$ENV_EXIST" == 1 ]]; then info_str="Virtual environment ${ENV_DIR} already exists." fi ## load modules (for running runscript-generator... echo "${info_str}" -echo "Load modules to enable running of runscript generator '${ENV_DIR}'." -source ${THIS_DIR}/modules_preprocess+extract.sh +if [[ ${bool_hpc} == 1 ]]; then + echo "Load modules to enable running of runscript generator '${ENV_DIR}'." + source ${THIS_DIR}/modules_preprocess+extract.sh +fi ## ... and prepare runscripts echo "Set up runscript template for user ${USER}..." diff --git a/video_prediction_tools/env_setup/create_env_non_HPC.sh b/video_prediction_tools/env_setup/create_env_non_HPC.sh deleted file mode 100644 index 2adc5cf578d9ae75cb8ab192c322bb9256480fc8..0000000000000000000000000000000000000000 --- a/video_prediction_tools/env_setup/create_env_non_HPC.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env bash -# -# __authors__ = Bing Gong -# __date__ = '2022_02_20' - -unset PYTHONPATH - -ENV_NAME=$1 -THIS_DIR="$(pwd)" -WORKING_DIR="$(dirname "$THIS_DIR")" -VENV_BASE=${WORKING_DIR}/virtual_envs -VENV_DIR=${WORKING_DIR}/virtual_envs/${ENV_NAME} -ACT_VENV="${VENV_DIR}/bin/activate" - -# check if directory to virtual environment is parsed -if [ -z "$1" ]; then - echo "ERROR: Provide a name to set up the virtual environment." - return -fi - - -#Create virtual enviornment -if ! [[ -d "${VENV_BASE}" ]]; then - mkdir "${VENV_BASE}" - echo "Installing virtualenv under ${VENV_BASE}..." 
- cd "${VENV_BASE}" - python3 -m virtualenv -p python3 ${ENV_NAME} - #activate source directory - source ${VENV_DIR}/bin/activate -fi - -#Install site packages -pip install --no-cache-dir -r requirements_non_HPC.txt -echo "The site-packages is installed for non_HPC users" - -## Add modules from the project -unset PYTHONPATH -export PYTHONPATH=${WORKING_DIR}:$PYTHONPATH -export PYTHONPATH=${WORKING_DIR}/utils:$PYTHONPATH -export PYTHONPATH=${WORKING_DIR}/model_modules:$PYTHONPATH -export PYTHONPATH=${WORKING_DIR}/postprocess:$PYTHONPATH - - -#ensure the PYTHONPATH is appended when activating the virtual enviornemnt -echo 'export PYTHONPATH='${WORKING_DIR}':$PYTHONPATH' >> ${ACT_VENV} -echo 'export PYTHONPATH='${WORKING_DIR}'/utils:$PYTHONPATH' >> ${ACT_VENV} -echo 'export PYTHONPATH='${WORKING_DIR}'/model_modules:$PYTHONPATH' >> ${ACT_VENV} -echo 'export PYTHONPATH='${WORKING_DIR}'/postprocess:$PYTHONPATH' >> ${ACT_VENV} - - -# get back to basic directory -cd "${WORKING_DIR}" || exit - - diff --git a/video_prediction_tools/env_setup/install_venv.sh b/video_prediction_tools/env_setup/install_venv.sh new file mode 100755 index 0000000000000000000000000000000000000000..3ff7e7b83046ab88a7a7624c1f15bdac324b1492 --- /dev/null +++ b/video_prediction_tools/env_setup/install_venv.sh @@ -0,0 +1,90 @@ +#!/usr/bin/env bash +# +# __authors__ = Bing Gong, Michael Langguth +# __date__ = '2022_02_28' +# __last_update__ = '2022_02_28' by Michael Langguth +# +# **************** Description **************** +# This auxiliary script sets up the virtual environment OUTSIDE singularity container. +# **************** Description **************** + +# set some basic variables +BASE_DIR="$(pwd)" +VENV_DIR=$1 +VENV_NAME="$(basename "${VENV_DIR}")" +VENV_BASE="$(dirname "${VENV_DIR}")" +WORKING_DIR="$(dirname "${VENV_BASE}")" +VENV_REQ=${BASE_DIR}/requirements_nocontainer.txt + +# sanity checks + +# check if directory to virtual environment is parsed +if [ -z "$1" ]; then + echo "ERROR: Provide a name to set up the virtual environment." + return +fi + +# check if virtual environment is not already existing +if [ -d "$1" ]; then + echo "ERROR: Target directory of virtual environment ${1} already exists. Chosse another directory path." + return +fi + +# check for requirement-file +if [ ! -f "${VENV_REQ}" ]; then + echo "ERROR: Cannot find requirement-file '${VENV_REQ}' to set up virtual environment." + return +fi + +# get Python-version +PYTHON_VERSION=$(python3 -c 'import sys; version=sys.version_info[:2]; print("{0}.{1}".format(*version))') +unset PYTHONPATH + +# create or change to base directory for virtual environment (i.e. where the virtualenv-module is placed) +if ! [[ -d "${VENV_BASE}" ]]; then + mkdir "${VENV_BASE}" + # Install virtualenv in this directory + echo "Installing virtualenv under ${VENV_BASE}..." + pip3 install --target="${VENV_BASE}/" virtualenv + # Change into the base-directory of virtual environments... + cd "${VENV_BASE}" || return +else + # Change into the base-directory of virtual environments... + cd "${VENV_BASE}" || return + if ! python3 -m virtualenv --version >/dev/null; then + echo "WARNING: Base directory for virtual environment exists, but virtualenv-module is unavailable." + echo "Try installing virtualenv." + pip3 install --target="${VENV_BASE}/" virtualenv + fi + echo "Virtualenv is already installed." 
+fi + + +# Set-up virtual environment in base directory for virtual environments +python3 -m virtualenv "${VENV_NAME}" +# Activate virtual environment and install required packages +echo "Activating virtual environment ${VENV_NAME} to install required Python modules..." +ACT_VENV="${VENV_DIR}/bin/activate" +source "${VENV_DIR}/bin/activate" +# set PYTHONPATH... +export PYTHONPATH="" +export PYTHONPATH=${WORKING_DIR}/virtual_envs/${VENV_NAME}/lib/python${PYTHON_VERSION}/site-packages:$PYTHONPATH +export PYTHONPATH=${WORKING_DIR}:$PYTHONPATH +export PYTHONPATH=${WORKING_DIR}/utils:$PYTHONPATH +export PYTHONPATH=${WORKING_DIR}/model_modules:$PYTHONPATH +export PYTHONPATH=${WORKING_DIR}/postprocess:$PYTHONPATH +# ... also ensure that PYTHONPATH is appended when activating the virtual environment... +echo 'export PYTHONPATH='"" >> ${ACT_VENV} +echo 'export PYTHONPATH='${WORKING_DIR}'/virtual_envs/'${VENV_NAME}'/lib/python'${PYTHON_VERSION}'/site-packages:$PYTHONPATH' >> ${ACT_VENV} +echo 'export PYTHONPATH='${WORKING_DIR}':$PYTHONPATH' >> ${ACT_VENV} +echo 'export PYTHONPATH='${WORKING_DIR}'/utils:$PYTHONPATH' >> ${ACT_VENV} +echo 'export PYTHONPATH='${WORKING_DIR}'/model_modules:$PYTHONPATH' >> ${ACT_VENV} +echo 'export PYTHONPATH='${WORKING_DIR}'/postprocess:$PYTHONPATH' >> ${ACT_VENV} +# ... install requirements +pip3 install --no-cache-dir -r "${VENV_REQ}" + +# get back to basic directory +cd "${BASE_DIR}" || exit + + + diff --git a/video_prediction_tools/env_setup/install_venv_container.sh b/video_prediction_tools/env_setup/install_venv_container.sh index 3e5c35b9c4d635179fafab47ee65e153b80d2380..45065c48a9f88b7c96a965255ca0165f79800129 100755 --- a/video_prediction_tools/env_setup/install_venv_container.sh +++ b/video_prediction_tools/env_setup/install_venv_container.sh @@ -41,6 +41,9 @@ if [ ! -f "${VENV_REQ}" ]; then return fi +# get Python-version +PYTHON_VERSION=$(python3 -c 'import sys; version=sys.version_info[:2]; print("{0}.{1}".format(*version))') + # create or change to base directory for virtual environment (i.e. where the virtualenv-module is placed) if ! [[ -d "${VENV_BASE}" ]]; then mkdir "${VENV_BASE}" @@ -53,8 +56,9 @@ else # Change into the base-directory of virtual environments... cd "${VENV_BASE}" || return if ! python -m virtualenv --version >/dev/null; then - echo "ERROR: Base directory for virtual environment exists, but virtualenv-module is unavailable." - exit + echo "WARNING: Base directory for virtual environment exists, but virtualenv-module is unavailable." + echo "Try installation." + pip3 install --target="${VENV_BASE}"/ virtualenv fi echo "Virtualenv is already installed." fi @@ -67,8 +71,8 @@ echo "Actiavting virtual environment ${VENV_NAME} to install required Python mod ACT_VENV="${VENV_DIR}/bin/activate" source "${VENV_DIR}/bin/activate" # set PYTHONPATH... 
-export PYTHONPATH=/usr/local/lib/python3.8/dist-packages/:$PYTHONPATH -export PYTHONPATH=${WORKING_DIR}/virtual_envs/${VENV_NAME}/lib/python3.8/site-packages:$PYTHONPATH +export PYTHONPATH=/usr/local/lib/python${PYTHON_VERSION}/dist-packages/:$PYTHONPATH +export PYTHONPATH=${WORKING_DIR}/virtual_envs/${VENV_NAME}/lib/python${PYTHON_VERSION}/site-packages:$PYTHONPATH export PYTHONPATH=${WORKING_DIR}:$PYTHONPATH export PYTHONPATH=${WORKING_DIR}/utils:$PYTHONPATH export PYTHONPATH=${WORKING_DIR}/model_modules:$PYTHONPATH diff --git a/video_prediction_tools/env_setup/modules_train.sh b/video_prediction_tools/env_setup/modules_train.sh new file mode 100644 index 0000000000000000000000000000000000000000..095aec5512dba953a01c62233248ad1b47f5e822 --- /dev/null +++ b/video_prediction_tools/env_setup/modules_train.sh @@ -0,0 +1,3 @@ +# Please populate this file when you run on an HPC-system without singularity containers +# You may refer to the file modules_preprocess+extract.sh as a blueprint which serves for the data extraction and the +# preprocessing step 1 on JSC's HPC-system. \ No newline at end of file diff --git a/video_prediction_tools/env_setup/requirements.txt b/video_prediction_tools/env_setup/requirements.txt index 35f6eb2439047f0697b04a0bac87bfb361fa0790..9c188138ea805d0f05203938a910e6247d4dd8ac 100755 --- a/video_prediction_tools/env_setup/requirements.txt +++ b/video_prediction_tools/env_setup/requirements.txt @@ -4,10 +4,10 @@ pandas==0.25.3 xarray==0.16.0 basemap==1.3.0 numpy==1.17.3 # although this numpy-version is in the container, we set it here to avoid any further installation -scikit-image==0.18.1 +scikit-image==0.17.2 opencv-python-headless==4.2.0.34 netcdf4==1.5.8 -metadata==0.2 +#metadata==0.2 normalization==0.4 utils==1.0.1 diff --git a/video_prediction_tools/env_setup/requirements_non_HPC.txt b/video_prediction_tools/env_setup/requirements_nocontainer.txt similarity index 58% rename from video_prediction_tools/env_setup/requirements_non_HPC.txt rename to video_prediction_tools/env_setup/requirements_nocontainer.txt index de100f9268b6112007f6b9a5df0549fe94718cfa..dc8475e048298372f4ddcfe137a39a1fb16766b9 100755 --- a/video_prediction_tools/env_setup/requirements_non_HPC.txt +++ b/video_prediction_tools/env_setup/requirements_nocontainer.txt @@ -3,12 +3,12 @@ mpi4py==3.0.1 pandas==0.25.3 xarray==0.16.0 basemap==1.3.0 -imageio==2.15.0 # although this numpy-version is in the container, we set it here to avoid any further installation +imageio==2.15.0 +numpy==1.17.3 scikit-image==0.17.2 opencv-python-headless==4.2.0.34 netcdf4==1.5.8 #metadata==0.2 normalization==0.4 utils==1.0.1 -tensorflow==1.13.1 diff --git a/video_prediction_tools/env_setup/wrapper_container.sh b/video_prediction_tools/env_setup/wrapper_container.sh index cfe716bee9f610b4a44988fc2ff6e4be048d06b4..97089e38a8dacc809bab5a3f9bbc62c9d5997690 100755 --- a/video_prediction_tools/env_setup/wrapper_container.sh +++ b/video_prediction_tools/env_setup/wrapper_container.sh @@ -8,8 +8,11 @@ VENV_DIR=$WORKING_DIR/virtual_envs/$1 shift # replaces $1 by $2, so that $@ does not include the name of the virtual environment anymore # sanity checks -if [[ "${EXE_DIR}" != "HPC_scripts" ]]; then - echo "ERROR: Run the setup-script for the enviornment from the HPC_scripts-directory!" +if [[ "${EXE_DIR}" = "HPC_scripts" ]] || [[ "${EXE_DIR}" = "no_HPC_scripts" ]]; +then + echo "The runscript is running under the folder ${EXE_DIR}" +else + echo "ERROR: Run the setup-script for the environment from the (no_)HPC_scripts-directory!"
exit fi diff --git a/video_prediction_tools/main_scripts/main_train_models.py b/video_prediction_tools/main_scripts/main_train_models.py index 9e58de96a31913eb19678e151fac5c46d6e80409..b16e33919d9f335e0d2b45ad3309ad901e568f57 100644 --- a/video_prediction_tools/main_scripts/main_train_models.py +++ b/video_prediction_tools/main_scripts/main_train_models.py @@ -562,7 +562,8 @@ class BestModelSelector(object): Class to select the best performing model from multiple checkpoints created during training """ - def __init__(self, model_dir: str, eval_metric: str, criterion: str = "min", channel: int = 0, seed: int = 42): + def __init__(self, model_dir: str, eval_metric: str, ltest: bool, criterion: str = "min", channel: int = 0, + seed: int = 42): """ Class to retrieve the best model checkpoint. The last one is also retained. :param model_dir: path to directory where checkpoints are saved (the trained model output directory) @@ -570,6 +571,7 @@ class BestModelSelector(object): :param criterion: set to 'min' ('max') for negatively (positively) oriented metrics :param channel: channel of data used for selection :param seed: seed for the Postprocess-instance + :param ltest: flag to allow bootstrapping in Postprocessing on tiny datasets """ method = self.__class__.__name__ # sanity check @@ -581,6 +583,7 @@ class BestModelSelector(object): self.channel = channel self.metric = eval_metric self.checkpoint_base_dir = model_dir + self.ltest = ltest self.checkpoints_all = BestModelSelector.get_checkpoints_dirs(model_dir) self.ncheckpoints = len(self.checkpoints_all) # evaluate all checkpoints... @@ -604,7 +607,7 @@ class BestModelSelector(object): results_dir_eager = os.path.join(checkpoint, "results_eager") eager_eval = Postprocess(results_dir=results_dir_eager, checkpoint=checkpoint, data_mode="val", batch_size=32, seed=self.seed, eval_metrics=[eval_metric], channel=self.channel, frac_data=0.33, - lquick=True) + lquick=True, ltest=self.ltest) eager_eval.run() eager_eval.handle_eval_metrics() @@ -728,6 +731,8 @@ def main(): parser.add_argument("--frac_intv_save", type=float, default=0.01, help="Fraction of all iteration steps to define the saving interval.") parser.add_argument("--seed", default=1234, type=int) + parser.add_argument("--test_mode", "-test", dest="test_mode", default=False, action="store_true", + help="Test mode for postprocessing to allow bootstrapping on small datasets.") args = parser.parse_args() # start timing for the whole run @@ -753,7 +758,7 @@ def main(): # select best model if args.dataset == "era5" and args.frac_start_save < 1.: - _ = BestModelSelector(args.output_dir, "mse") + _ = BestModelSelector(args.output_dir, "mse", args.test_mode) timeit_finish = time.time() print("Selecting the best model checkpoint took {0:.2f} minutes.".format((timeit_finish - timeit_after_train)/60.)) else: diff --git a/video_prediction_tools/main_scripts/main_visualize_postprocess.py b/video_prediction_tools/main_scripts/main_visualize_postprocess.py index deab9c03666f56237c937622b2613f16fb7be249..0c9f8e434b9c1706b55a7ba6a8a99b3a7156e628 100644 --- a/video_prediction_tools/main_scripts/main_visualize_postprocess.py +++ b/video_prediction_tools/main_scripts/main_visualize_postprocess.py @@ -37,8 +37,9 @@ class Postprocess(TrainModel): def __init__(self, results_dir: str = None, checkpoint: str = None, data_mode: str = "test", batch_size: int = None, gpu_mem_frac: float = None, num_stochastic_samples: int = 1, stochastic_plot_id: int = 0, seed: int = None, channel: int = 0, run_mode: str = 
"deterministic", lquick: bool = None, - frac_data: float = 1., eval_metrics: List = ("mse", "psnr", "ssim", "acc"), args=None, - clim_path: str = "/p/scratch/deepacf/video_prediction_shared_folder/preprocessedData/T2monthly/climatology_t2m_1991-2020.nc"): + frac_data: float = 1., eval_metrics: List = ("mse", "psnr", "ssim", "acc"), ltest=False, + clim_path: str = "/p/scratch/deepacf/video_prediction_shared_folder/preprocessedData/T2monthly/"+ + "climatology_t2m_1991-2020.nc", args=None): """ Initialization of the class instance for postprocessing (generation of forecasts from trained model + basic evauation). @@ -56,6 +57,7 @@ class Postprocess(TrainModel): :param lquick: flag for quick evaluation :param frac_data: fraction of dataset to be used for evaluation (only applied when shuffling is active) :param eval_metrics: metrics used to evaluate the trained model + :param ltest: flag for test mode to allow bootstrapping on tiny datasets :param clim_path: the path to the netCDF-file storing climatolgical data :param args: namespace of parsed arguments """ @@ -85,10 +87,8 @@ class Postprocess(TrainModel): # configuration of basic evaluation self.eval_metrics = eval_metrics self.nboots_block = 1000 - if lquick: - self.block_length = 7 - else: - self.block_length = 7 * 24 # this corresponds to a block length of 7 days in case of hourly forecasts + self.block_length = 7 * 24 # this corresponds to a block length of 7 days in case of hourly forecasts + if ltest: self.block_length = 1 # initialize evrything to get an executable Postprocess instance if args is not None: self.save_args_to_option_json() # create options.json in results directory @@ -1268,8 +1268,10 @@ def main(): help="(Only) metric to evaluate when quick evaluation (-lquick) is chosen.") parser.add_argument("--climatology_file", "-clim_fl", dest="clim_fl", type=str, default=False, help="The path to the climatology_t2m_1991-2020.nc file ") - parse.add_argument("--frac_data", "-f_dt", dest="f_dt",type=float,default=1, - help="fraction of dataset to be used for evaluation (only applied when shuffling is active)") + parser.add_argument("--frac_data", "-f_dt", dest="f_dt", type=float, default=1., + help="Fraction of dataset to be used for evaluation (only applied when shuffling is active).") + parser.add_argument("--test_mode", "-test", dest="test_mode", default=False, action="store_true", + help="Test mode for postprocessing to allow bootstrapping on small datasets.") args = parser.parse_args() method = os.path.basename(__file__) @@ -1296,7 +1298,7 @@ def main(): batch_size=args.batch_size, num_stochastic_samples=args.num_stochastic_samples, gpu_mem_frac=args.gpu_mem_frac, seed=args.seed, args=args, eval_metrics=eval_metrics, channel=args.channel, lquick=args.lquick, - clim_path=args.clim_fl,frac_data=args.frac_data) + clim_path=args.clim_fl,frac_data=args.frac_data, ltest=args.test_mode) # run the postprocessing postproc_instance.run() postproc_instance.handle_eval_metrics() diff --git a/video_prediction_tools/other_scripts/data_extraction_era5_template.sh b/video_prediction_tools/no_HPC_scripts/data_extraction_era5_template.sh similarity index 72% rename from video_prediction_tools/other_scripts/data_extraction_era5_template.sh rename to video_prediction_tools/no_HPC_scripts/data_extraction_era5_template.sh index 5e10b6d43587b3a9b2f813d8ea4cd80f03518b4e..4e3c4d3a96f4ffbe1f0c238dc650eba7760151ff 100644 --- a/video_prediction_tools/other_scripts/data_extraction_era5_template.sh +++ 
b/video_prediction_tools/no_HPC_scripts/data_extraction_era5_template.sh @@ -1,6 +1,11 @@ #!/bin/bash -x -#User's input : your virtual enviornment name +######### Template identifier (don't remove) ######### +echo "Do not run the template scripts" +exit 99 +######### Template identifier (don't remove) ######### + +# Name of virtual environment VIRT_ENV_NAME=venv_test echo "Activating virtual environment..." @@ -13,7 +18,7 @@ varmap_file=/my/path/to/varmapping/file years=( "2007" ) -#The number of nodes should be equal to the number of 1 preprcessed folder plus 1 +#The number of nodes should be equal to the number of 1 preprocessed folder plus 1 n_nodes=3 # Run data extraction diff --git a/video_prediction_tools/other_scripts/meta_postprocess_era5_template.sh b/video_prediction_tools/no_HPC_scripts/meta_postprocess_era5_template.sh similarity index 90% rename from video_prediction_tools/other_scripts/meta_postprocess_era5_template.sh rename to video_prediction_tools/no_HPC_scripts/meta_postprocess_era5_template.sh index fb980cb2effa3d95c7dc24b55c4079d879ed0635..6a75c72835d7356e17fa0f7251d15ce2a3a475f7 100644 --- a/video_prediction_tools/other_scripts/meta_postprocess_era5_template.sh +++ b/video_prediction_tools/no_HPC_scripts/meta_postprocess_era5_template.sh @@ -6,8 +6,7 @@ VIRT_ENV_NAME=venv_test echo "Activating virtual environment..." source ../virtual_envs/${VIRT_ENV_NAME}/bin/activate -#the source directory contains the tfrecords -root_dir=/home/b.gong/ +root_dir=/path/to/root/output/directory analysis_config=../meta_postprocess_config/meta_config.json vim ${analysis_config} metric=mse diff --git a/video_prediction_tools/other_scripts/preprocess_data_era5_step1_template.sh b/video_prediction_tools/no_HPC_scripts/preprocess_data_era5_step1_template.sh similarity index 53% rename from video_prediction_tools/other_scripts/preprocess_data_era5_step1_template.sh rename to video_prediction_tools/no_HPC_scripts/preprocess_data_era5_step1_template.sh index e8f84bcc1fbc0e4beab851944276eef092f16876..8519569a32d6a86c98363c85eb707ffa75e4960f 100644 --- a/video_prediction_tools/other_scripts/preprocess_data_era5_step1_template.sh +++ b/video_prediction_tools/no_HPC_scripts/preprocess_data_era5_step1_template.sh @@ -1,10 +1,22 @@ #!/bin/bash -x -#User's input : your virtual enviornment name +######### Template identifier (don't remove) ######### +echo "Do not run the template scripts" +exit 99 +######### Template identifier (don't remove) ######### + +# Name of virtual environment VIRT_ENV_NAME=venv_test -echo "Activating virtual environment..." -source ../virtual_envs/${VIRT_ENV_NAME}/bin/activate +if [ -z ${VIRTUAL_ENV} ]; then + if [[ -f ../virtual_envs/${VIRT_ENV_NAME}/bin/activate ]]; then + echo "Activating virtual environment..." + source ../virtual_envs/${VIRT_ENV_NAME}/bin/activate + else + echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..." 
+ exit 1 + fi +fi #select years and variables for dataset and define target domain years=( "2007" ) @@ -16,7 +28,7 @@ nyx=( 40 40 ) source_dir=/home/b.gong/data_era5 destination_dir=/home/b.gong/preprocessed_data -#The number of nodes should be equal to the number of 1 preprcessed folder plus 1 +#The number of nodes should be equal to the number of preprocessed folders plus 1 n_nodes=3 for year in "${years[@]}"; do diff --git a/video_prediction_tools/no_HPC_scripts/preprocess_data_era5_step2_template.sh b/video_prediction_tools/no_HPC_scripts/preprocess_data_era5_step2_template.sh new file mode 100644 index 0000000000000000000000000000000000000000..441924e7238cf87edbcca4b5a757f60358c3ecc4 --- /dev/null +++ b/video_prediction_tools/no_HPC_scripts/preprocess_data_era5_step2_template.sh @@ -0,0 +1,64 @@ +#!/bin/bash -x + +######### Template identifier (don't remove) ######### +echo "Do not run the template scripts" +exit 99 +######### Template identifier (don't remove) ######### + +# auxiliary variables +WORK_DIR="$(pwd)" +BASE_DIR=$(dirname "$WORK_DIR") +# Name of virtual environment +VIRT_ENV_NAME=venv_test + +# declare directory-variables which will be modified by config_runscript.py +source_dir=/my/path/to/pkl/files/ +destination_dir=/my/path/to/tfrecords/files + +sequence_length=20 +sequences_per_file=10 + +#the number of nodes should be the number of processed folders (months) plus 1 +n_nodes=3 + +# !!! ADAPT DEPENDING ON USAGE OF CONTAINER !!! +# For container usage, comment in the following lines +# Name of container image (must be available in working directory) +CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif" +WRAPPER="${BASE_DIR}/env_setup/wrapper_container.sh" + +# sanity checks +if [[ ! -f ${CONTAINER_IMG} ]]; then + echo "ERROR: Cannot find required TF1.15 container image '${CONTAINER_IMG}'." + exit 1 +fi + +if [[ ! -f ${WRAPPER} ]]; then + echo "ERROR: Cannot find wrapper-script '${WRAPPER}' for TF1.15 container image." + exit 1 +fi + +mpirun -n ${n_nodes} singularity exec --nv "${CONTAINER_IMG}" "${WRAPPER}" ${VIRT_ENV_NAME} \ +python3 ../main_scripts/main_preprocess_data_step2.py -source_dir ${source_dir} -dest_dir ${destination_dir} \ +-sequence_length ${sequence_length} -sequences_per_file ${sequences_per_file} + +# WITHOUT container usage, comment in the following lines (and uncomment the lines above) +# Activate virtual environment if needed (and possible) +#if [ -z ${VIRTUAL_ENV} ]; then +# if [[ -f ../virtual_envs/${VIRT_ENV_NAME}/bin/activate ]]; then +# echo "Activating virtual environment..." +# source ../virtual_envs/${VIRT_ENV_NAME}/bin/activate +# else +# echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..."
+# exit 1 +# fi +#fi +# +# Run preprocessing step 2 +#mpirun -n ${n_nodes} python3 ../main_scripts/main_preprocess_data_step2.py -source_dir ${source_dir} -dest_dir ${destination_dir} \ +# -sequence_length ${sequence_length} -sequences_per_file ${sequences_per_file} + + + + + diff --git a/video_prediction_tools/JSC_scripts/train_model_era5_template.sh b/video_prediction_tools/no_HPC_scripts/train_model_era5_template.sh old mode 100755 new mode 100644 similarity index 52% rename from video_prediction_tools/JSC_scripts/train_model_era5_template.sh rename to video_prediction_tools/no_HPC_scripts/train_model_era5_template.sh index 8d9d7d0e8780cc5152f8e22106b878caa6ee8e83..2d0c6d34125e5ea1734683ca5d91837494f61ab6 --- a/video_prediction_tools/JSC_scripts/train_model_era5_template.sh +++ b/video_prediction_tools/no_HPC_scripts/train_model_era5_template.sh @@ -1,14 +1,4 @@ #!/bin/bash -x -#SBATCH --account=deepacf -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -#SBATCH --output=train_model_era5-out.%j -#SBATCH --error=train_model_era5-err.%j -#SBATCH --time=24:00:00 -#SBATCH --gres=gpu:1 -#SBATCH --partition=some_partition -#SBATCH --mail-type=ALL -#SBATCH --mail-user=me@somewhere.com ######### Template identifier (don't remove) ######### echo "Do not run the template scripts" @@ -20,6 +10,18 @@ WORK_DIR="$(pwd)" BASE_DIR=$(dirname "$WORK_DIR") # Name of virtual environment VIRT_ENV_NAME="my_venv" + +# declare directory-variables which will be modified by generate_runscript.py +source_dir=/my/path/to/tfrecords/files +destination_dir=/my/model/output/path + +# valid identifiers for model-argument are: convLSTM, savp, mcnet and vae +model=convLSTM +datasplit_dict=${destination_dir}/data_split.json +model_hparams=${destination_dir}/model_hparams.json + +# !!! ADAPT DEPENDING ON USAGE OF CONTAINER !!! +# For container usage, comment in the following lines # Name of container image (must be available in working directory) CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif" WRAPPER="${BASE_DIR}/env_setup/wrapper_container.sh" @@ -37,21 +39,28 @@ fi # clean-up modules to avoid conflicts between host and container settings module purge +# run training in container +export CUDA_VISIBLE_DEVICES=0 +## One node, single GPU +singularity exec --nv "${CONTAINER_IMG}" "${WRAPPER}" ${VIRT_ENV_NAME} \ +python3 "${BASE_DIR}"/main_scripts/main_train_models.py --input_dir ${source_dir} --datasplit_dict ${datasplit_dict} \ +--dataset era5 --model ${model} --model_hparams_dict ${model_hparams} --output_dir ${destination_dir}/ -# declare directory-variables which will be modified by generate_runscript.py -source_dir=/my/path/to/tfrecords/files -destination_dir=/my/model/output/path +# WITHOUT container usage, comment in the following lines (and uncomment the lines above) +# Activate virtual environment if needed (and possible) +#if [ -z ${VIRTUAL_ENV} ]; then +# if [[ -f ../virtual_envs/${VIRT_ENV_NAME}/bin/activate ]]; then +# echo "Activating virtual environment..." +# source ../virtual_envs/${VIRT_ENV_NAME}/bin/activate +# else +# echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..."
+# exit 1 +# fi +#fi +# +# Run training +#python3 ../main_scripts/main_train_models.py --input_dir ${source_dir} --datasplit_dict ${datasplit_dict} \ +# --dataset era5 --model ${model} --model_hparams_dict ${model_hparams} --output_dir ${destination_dir}/ -# valid identifiers for model-argument are: convLSTM, savp, mcnet and vae -model=convLSTM -datasplit_dict=${destination_dir}/data_split.json -model_hparams=${destination_dir}/model_hparams.json -# run training in container -export CUDA_VISIBLE_DEVICES=0 -## One node, single GPU -srun --mpi=pspmix --cpu-bind=none \ - singularity exec --nv "${CONTAINER_IMG}" "${WRAPPER}" ${VIRT_ENV_NAME} \ - python3 "${BASE_DIR}"/main_scripts/main_train_models.py --input_dir ${source_dir} --datasplit_dict ${datasplit_dict} \ - --dataset era5 --model ${model} --model_hparams_dict ${model_hparams} --output_dir ${destination_dir}/ diff --git a/video_prediction_tools/no_HPC_scripts/visualize_postprocess_era5_template.sh b/video_prediction_tools/no_HPC_scripts/visualize_postprocess_era5_template.sh new file mode 100644 index 0000000000000000000000000000000000000000..24ecbb6fe043f2b1930348d6728f8c9d23c8f831 --- /dev/null +++ b/video_prediction_tools/no_HPC_scripts/visualize_postprocess_era5_template.sh @@ -0,0 +1,67 @@ +#!/bin/bash -x + +######### Template identifier (don't remove) ######### +echo "Do not run the template scripts" +exit 99 +######### Template identifier (don't remove) ######### + +# auxiliary variables +WORK_DIR="$(pwd)" +BASE_DIR=$(dirname "$WORK_DIR") + +# User's input: name of your virtual environment +VIRT_ENV_NAME=venv_test +# !!! ADAPT DEPENDING ON USAGE OF CONTAINER !!! +# For container usage, comment in the following lines +# Name of container image (must be available in working directory) +CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif" +WRAPPER="${BASE_DIR}/env_setup/wrapper_container.sh" + +# sanity checks +if [[ ! -f ${CONTAINER_IMG} ]]; then + echo "ERROR: Cannot find required TF1.15 container image '${CONTAINER_IMG}'." + exit 1 +fi + +if [[ ! -f ${WRAPPER} ]]; then + echo "ERROR: Cannot find wrapper-script '${WRAPPER}' for TF1.15 container image." + exit 1 +fi + +# declare directory-variables which will be modified by generate_runscript.py +# Note: source_dir is only needed for retrieving the base-directory +checkpoint_dir=/my/trained/model/dir +results_dir=/my/results/dir +climate_file=/my/path/to/climatology_file +lquick="" + +# run postprocessing/generation of model results including evaluation metrics +export CUDA_VISIBLE_DEVICES=0 + +# For running on small datasets (e.g. the dry run), parse -test to the Python-script +singularity exec --nv "${CONTAINER_IMG}" "${WRAPPER}" ${VIRT_ENV_NAME} \ +python3 ../main_scripts/main_visualize_postprocess.py --checkpoint ${checkpoint_dir} --mode test \ + --results_dir ${results_dir} --batch_size 4 \ + --num_stochastic_samples 1 \ + --lquick_evaluation --climatology_file ${climate_file} + + +# WITHOUT container usage, comment in the following lines (and uncomment the lines above) +# Activate virtual environment if needed (and possible) +#if [ -z ${VIRTUAL_ENV} ]; then +# if [[ -f ../virtual_envs/${VIRT_ENV_NAME}/bin/activate ]]; then +# echo "Activating virtual environment..." +# source ../virtual_envs/${VIRT_ENV_NAME}/bin/activate +# else +# echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..." +# exit 1 +# fi +#fi +# +# Run postprocessing
# For running on small datasets (e.g.
the dry run), parse -test to the Python-script +# python3 ../main_scripts/main_visualize_postprocess.py --checkpoint ${checkpoint_dir} --mode test \ +# --results_dir ${results_dir} --batch_size 4 \ +# --num_stochastic_samples 1 \ +# --lquick_evaluation --climatology_file ${climate_file} + diff --git a/video_prediction_tools/other_scripts/preprocess_data_era5_step2_template.sh b/video_prediction_tools/other_scripts/preprocess_data_era5_step2_template.sh deleted file mode 100644 index 9af83c2d845ddb3397d9ee76145446e8136bc05e..0000000000000000000000000000000000000000 --- a/video_prediction_tools/other_scripts/preprocess_data_era5_step2_template.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -x - -#User's input : your virtual enviornment name -VIRT_ENV_NAME=venv_test - -echo "Activating virtual environment..." -source ../virtual_envs/${VIRT_ENV_NAME}/bin/activate - -sequence_length=20 -sequences_per_file=10 -source_dir=/home/b.gong/preprocessed_data-40x40-990N2010E-2t/pickle -base_dir="$(dirname "$source_dir")" -destination_dir=${base_dir}/tfrecords -#the number of the nodes should be the number of processed folder (month) plus 1 -n_nodes=3 - -mpirun -n ${n_nodes} python3 ../main_scripts/main_preprocess_data_step2.py -source_dir ${source_dir} -dest_dir ${destination_dir} \ - -sequence_length ${sequence_length} -sequences_per_file ${sequences_per_file} - - - - - diff --git a/video_prediction_tools/other_scripts/train_model_era5_template.sh b/video_prediction_tools/other_scripts/train_model_era5_template.sh deleted file mode 100644 index 526ea14c16859e092a987104360df75c6b45efa5..0000000000000000000000000000000000000000 --- a/video_prediction_tools/other_scripts/train_model_era5_template.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash -x - -#your virtual enviornment name -VIRT_ENV_NAME=venv_test - -echo "Activating virtual environment..." -source ../virtual_envs/${VIRT_ENV_NAME}/bin/activate - -# the source directory contains the tfrecords -source_dir=/home/b.gong/preprocessed_data-40x40-990N2010E-2t/tfrecords -destination_dir=/home/b.gong/model2/ - -#select models -model=savp -mkdir ${destination_dir} -cp ../hparams/era5/${model}/model_hparams_template.json ${destination_dir}/model_hparams.json -cp ../data_split/era5/datasplit.json ${destination_dir}/data_split.json - -#copy the configuration to destination_dir -vim ${destination_dir}/data_split.json -vim ${destination_dir}/model_hparams.json - -datasplit_dict=${destination_dir}/data_split.json -model_hparams=${destination_dir}/model_hparams.json - -python3 ../main_scripts/main_train_models.py --input_dir ${source_dir} --datasplit_dict ${datasplit_dict} \ - --dataset era5 --model ${model} --model_hparams_dict ${model_hparams} --output_dir ${destination_dir}/ - - - diff --git a/video_prediction_tools/other_scripts/visualize_postprocess_era5_template.sh b/video_prediction_tools/other_scripts/visualize_postprocess_era5_template.sh deleted file mode 100644 index 8a063501382054535d8f50fb1c1fd8cd3e1fda7f..0000000000000000000000000000000000000000 --- a/video_prediction_tools/other_scripts/visualize_postprocess_era5_template.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -x - -#User's input : your virtual enviornment name -VIRT_ENV_NAME=venv_test - -echo "Activating virtual environment..." 
-source ../virtual_envs/${VIRT_ENV_NAME}/bin/activate - -#the source directory contains the tfrecords -checkpoint_dir=/home/b.gong/model/checkpoint_89 -results_dir=/home/b.gong/results/ -lquick=1 -climate_file=/home/b.gong/data_era5/T2monthly/climatology_t2m_1991-2020.nc -#select models -model=convLSTM -mkdir ${results_dir} -python3 ../main_scripts/main_visualize_postprocess.py --checkpoint ${checkpoint_dir} --mode test \ - --results_dir ${results_dir} --batch_size 4 \ - --num_stochastic_samples 1 \ - --lquick_evaluation ${lquick} --climatology_file ${climate_file} - diff --git a/video_prediction_tools/utils/runscript_generator/config_postprocess.py b/video_prediction_tools/utils/runscript_generator/config_postprocess.py index eff2694bbd1dc381726e54bb4fb46bf0af5ddb4c..544337834fd4a9a1241f8e20e2cf186d8a2265ce 100755 --- a/video_prediction_tools/utils/runscript_generator/config_postprocess.py +++ b/video_prediction_tools/utils/runscript_generator/config_postprocess.py @@ -76,24 +76,32 @@ class Config_Postprocess(Config_runscript_base): self.model = os.path.basename(dir_base) # List the subdirectories... _ = Config_Postprocess.get_subdir_list(dir_base) - # ... and obtain the checkpoint directory + + # Choose the checkpoint directory + ckp_req_str = "Choose a checkpoint directory from the list above:" + ckp_req_err = NotADirectoryError("Could not find the passed directory.") + dir_base = Config_Postprocess.keyboard_interaction(ckp_req_str, Config_Postprocess.check_dir, ckp_req_err, + prefix2arg=dir_base+"/", ntries=2) + # List the subdirectories... + _ = Config_Postprocess.get_subdir_list(dir_base) + # ... and obtain the model directory with checkpoints trained_dir_req_str = "Choose a trained model from the experiment list above:" trained_err = FileNotFoundError("No trained model parameters found.") - self.checkpoint_dir = Config_Postprocess.keyboard_interaction(trained_dir_req_str, Config_Postprocess.check_traindir, trained_err, ntries=3, prefix2arg=dir_base+"/") + # get the relevant information from checkpoint_dir in order to construct source_dir and results_dir # (following naming convention) cp_dir_split = Config_Postprocess.path_rec_split(self.checkpoint_dir) cp_dir_split = list(filter(None, cp_dir_split)) # get rid of empty list elements - base_dir, exp_dir_base, exp_dir = "/"+os.path.join(*cp_dir_split[:-4]), cp_dir_split[-3], cp_dir_split[-1] + base_dir, exp_dir_base, exp_dir = "/"+os.path.join(*cp_dir_split[:-4]), cp_dir_split[-3], cp_dir_split[-2] self.runscript_target = self.rscrpt_tmpl_prefix + self.dataset + "_" + exp_dir + ".sh" # Set results_dir - self.results_dir = os.path.join(base_dir, "results", exp_dir_base, self.model, exp_dir) + self.results_dir = os.path.join(base_dir, "results", exp_dir_base,self.model, exp_dir) return # Decide if quick evaluation should be performed diff --git a/video_prediction_tools/utils/runscript_generator/config_utils.py b/video_prediction_tools/utils/runscript_generator/config_utils.py index e29ab9457615cbb6641aa27e0d5b9da2ac9bdc9f..02882f889ea8d1a9bb0a267263ee9b7fb6e804af 100755 --- a/video_prediction_tools/utils/runscript_generator/config_utils.py +++ b/video_prediction_tools/utils/runscript_generator/config_utils.py @@ -33,8 +33,8 @@ class Config_runscript_base: if lhpc: self.runscript_dir = "../HPC_scripts" else: - self.runscript_dir = "../nonHPC_scripts" - + self.runscript_dir = "../no_HPC_scripts" + self.long_name_wrk_step = None self.rscrpt_tmpl_prefix = None self.runscript_template = None diff --git
a/video_prediction_tools/utils/runscript_generator/convert_runscript.sh b/video_prediction_tools/utils/runscript_generator/convert_runscript.sh index a2c0607aa707db95f085af8ec253f73920b7ddcb..a5ab4188b93a4743050606358049c48c70697af4 100755 --- a/video_prediction_tools/utils/runscript_generator/convert_runscript.sh +++ b/video_prediction_tools/utils/runscript_generator/convert_runscript.sh @@ -93,13 +93,14 @@ fi if [[ `grep "#SBATCH --error=" ${target_script}` ]]; then sed -i "s|#SBATCH --error=.*|#SBATCH --error=${log_str}-err\.%j|g" ${target_script} fi -# set correct e-mail address in Batch scripts on Juwels and HDF-ML +# set correct e-mail address (only when jutil-tool is available) +if ! [[ -z `command -v jutil` ]]; then + USER_EMAIL=$(jutil user show -o json | grep email | cut -f2 -d':' | cut -f1 -d',' | cut -f2 -d'"') +else + USER_EMAIL="" +fi +sed -i "s/--mail-user=.*/--mail-user=$USER_EMAIL/g" ${target_script} if [[ "${HOST_NAME}" == hdfml* || "${HOST_NAME}" == *juwels* ]]; then - if ! [[ -z `command -v jutil` ]]; then - USER_EMAIL=$(jutil user show -o json | grep email | cut -f2 -d':' | cut -f1 -d',' | cut -f2 -d'"') - else - USER_EMAIL="" - fi - sed -i "s/--mail-user=.*/--mail-user=$USER_EMAIL/g" ${target_script} + sed -i "s/--account=.*/--account=deepacf/g" ${target_script} fi # end diff --git a/video_prediction_tools/utils/runscript_generator/setup_runscript_templates.sh b/video_prediction_tools/utils/runscript_generator/setup_runscript_templates.sh index ef4c533f67243953e5c50f87f01e75b744f99d10..a398fc6e1a55292ac9429b7a734232877e1091b0 100755 --- a/video_prediction_tools/utils/runscript_generator/setup_runscript_templates.sh +++ b/video_prediction_tools/utils/runscript_generator/setup_runscript_templates.sh @@ -46,8 +46,8 @@ else fi fi -echo "Start setting up templates under nonHPC_scripts/..." -for f in "${BASE_DIR}"/JSC_scripts/*template.sh; do +echo "Start setting up templates under HPC_scripts/..." +for f in "${BASE_DIR}"/HPC_scripts/*template.sh; do echo "Setting up ${f}..." fnew=${f%%.*}_${USER}.sh cp "${f}" "${fnew}" @@ -55,8 +55,8 @@ for f in "${BASE_DIR}"/JSC_scripts/*template.sh; do echo "Setting up ${f}..." fnew=${f%%.*}_${USER}.sh cp "${f}" "${fnew}" done echo "Done!" -echo "Start setting up templates under HPC_scripts/" -for f in "${BASE_DIR}"/HPC_scripts/*template.sh; do +echo "Start setting up templates under no_HPC_scripts/" +for f in "${BASE_DIR}"/no_HPC_scripts/*template.sh; do echo "Setting up ${f}..." fnew=${f%%.*}_${USER}.sh cp "${f}" "${fnew}"
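As a usage illustration for the reworked create_env.sh changed above: a minimal sketch of how the environment set-up might be invoked, assuming the script is sourced from video_prediction_tools/env_setup and using the arguments parsed in check_argin (-base_dir=..., -tf_container=..., -l_nocontainer, -l_nohpc); the environment name my_venv and all paths are placeholders.

    # default: set-up with the TF1.15 singularity container on an HPC-system
    cd video_prediction_tools/env_setup
    source create_env.sh my_venv -base_dir=/my/output/base_dir

    # set-up without the container and/or on a non-HPC machine (flags introduced in this patch)
    source create_env.sh my_venv -l_nocontainer
    source create_env.sh my_venv -l_nocontainer -l_nohpc

    # explicitly select the container image expected under the (no_)HPC_scripts directory
    source create_env.sh my_venv -tf_container=tensorflow_21.09-tf1-py3.sif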