Commit e21ee5c9 authored by Michael Langguth

Add scripts to set-up virtual environments.

parent 7d679b00
#!/bin/bash -x
#SBATCH --account=deepacf
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --output=train_model_era5_container-out.%j
#SBATCH --error=train_model_era5_container-err.%j
#SBATCH --time=24:00:00
##SBATCH --time=00:20:00
#SBATCH --gres=gpu:1
#SBATCH --partition=booster
#SBATCH --mail-type=ALL
#SBATCH --mail-user=me@somewhere.com
### Two nodes, 8 GPUs
##SBATCH --nodes=2
##SBATCH --ntasks=8
##SBATCH --ntasks-per-node=4
##SBATCH --gres=gpu:4
## Remember to adjust the srun job submission below accordingly!
WORK_DIR="$(pwd)"
BASE_DIR=$(dirname "$WORK_DIR")
# Name of virtual environment
VIRT_ENV_NAME="my_venv"
# Name of container image (must be available in working directory)
CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif"
# clean up modules to avoid conflicts between host and container settings
module purge
# declare directory variables: adapt source_dir (path to the TFRecord files) and destination_dir (model output directory) to your setup
source_dir=/my/path/to/tfrecords/files
destination_dir=/my/model/output/path
# valid identifiers for model-argument are: convLSTM, savp, mcnet and vae
model=convLSTM
datasplit_dict=${destination_dir}/data_split.json
model_hparams=${destination_dir}/model_hparams.json
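# Both JSON files are plain dictionaries. Their exact keys depend on the chosen model;
# the following is only an illustrative sketch for model_hparams.json (the key names are
# assumptions, not taken from this repository):
#   {"batch_size": 8, "lr": 0.001, "max_epochs": 20}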
# run training in container
export CUDA_VISIBLE_DEVICES=0,1,2,3
## One node, single GPU
srun --mpi=pspmix --cpu-bind=none \
singularity exec --nv ${CONTAINER_IMG} ./wrapper_container.sh ${VIRT_ENV_NAME} python3 ${BASE_DIR}/main_scripts/main_train_models.py \
--input_dir ${source_dir} --datasplit_dict ${datasplit_dict} --dataset era5 --model ${model} --model_hparams_dict ${model_hparams} --output_dir ${destination_dir}/
## Two nodes, 8 GPUs
#srun -N 2 -n 8 --ntasks-per-node 4 singularity exec --nv ${CONTAINER_IMG} ./wrapper_container.sh ${VIRT_ENV_NAME} python3 ${BASE_DIR}/main_scripts/main_train_models.py \
# --input_dir ${source_dir} --datasplit_dict ${datasplit_dict} --dataset era5 --model ${model} --model_hparams_dict ${model_hparams} --output_dir ${destination_dir}/
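The runscript relies on a wrapper script wrapper_container.sh in the working directory, which is expected to activate the virtual environment inside the container before launching Python. The wrapper itself is not part of this excerpt; the following is only a minimal sketch under that assumption (the venv location is assumed and may need to be adapted to where create_env.sh placed it):

#!/bin/bash
# wrapper_container.sh (hypothetical sketch)
# $1: name of the virtual environment, remaining arguments: command to execute
VIRT_ENV_NAME=$1
shift
# assumed location of the venv relative to the working directory; adjust if necessary
source "../${VIRT_ENV_NAME}/bin/activate"
# run the actual command (here: python3 main_train_models.py ...)
"$@"

After adjusting account, partition and the directory variables, the job is submitted as usual with sbatch, e.g. sbatch train_model_era5_container.sh (the runscript name is inferred from the output-file pattern above).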
@@ -2,52 +2,49 @@
#SBATCH --account=deepacf
#SBATCH --nodes=1
#SBATCH --ntasks=1
##SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=1
#SBATCH --output=train_era5-out.%j
#SBATCH --error=train_era5-err.%j
#SBATCH --time=20:00:00
#SBATCH --output=train_model_era5_container-out.%j
#SBATCH --error=train_model_era5_container-err.%j
#SBATCH --time=24:00:00
##SBATCH --time=00:20:00
#SBATCH --gres=gpu:1
#SBATCH --partition=gpus
#SBATCH --partition=booster
#SBATCH --mail-type=ALL
#SBATCH --mail-user=b.gong@fz-juelich.de
##jutil env activate -p cjjsc42
#SBATCH --mail-user=me@somewhere.com
######### Template identifier (don't remove) #########
echo "Do not run the template scripts"
exit 99
######### Template identifier (don't remove) #########
### Two nodes, 8 GPUs
##SBATCH --nodes=2
##SBATCH --ntasks=8
##SBATCH --ntasks-per-node=4
##SBATCH --gres=gpu:4
## Remember to adjust the srun job submission below accordingly!
WORK_DIR=`pwd`
BASE_DIR=$(dirname "$WORK_DIR")
# Name of virtual environment
VIRT_ENV_NAME="my_venv"
# Name of container image (must be available in working directory)
CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif"
# clean up modules to avoid conflicts between host and container settings
module purge
# Load modules
source ../env_setup/modules_train.sh
# Activate virtual environment if needed (and possible)
if [ -z "${VIRTUAL_ENV}" ]; then
  if [[ -f ../${VIRT_ENV_NAME}/bin/activate ]]; then
    echo "Activating virtual environment..."
    source ../${VIRT_ENV_NAME}/bin/activate
  else
    echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..."
    exit 1
  fi
fi
# declare directory-variables which will be modified by config_runscript.py
source_dir=/p/project/deepacf/deeprain/video_prediction_shared_folder/preprocessedData/
destination_dir=/p/project/deepacf/deeprain/video_prediction_shared_folder/models/
# declare directory variables: adapt source_dir (path to the TFRecord files) and destination_dir (model output directory) to your setup
source_dir=/my/path/to/tfrecords/files
destination_dir=/my/model/output/path
# valid identifiers for model-argument are: convLSTM, savp, mcnet and vae
# Note: destination_dir must not end with "/", otherwise the checkpoints are not saved correctly in the results directory
model=convLSTM
datasplit_dict=../data_split/cv_test.json
datasplit_dict=${destination_dir}/data_split.json
model_hparams=${destination_dir}/model_hparams.json
dataset=era5
# If you train SAVP, please uncomment the following CUDA configuration
#CUDA_VISIBLE_DEVICES=1
# run training in container
export CUDA_VISIBLE_DEVICES=0,1,2,3
## One node, single GPU
srun --mpi=pspmix --cpu-bind=none \
singularity exec --nv ${CONTAINER_IMG} ./wrapper_container.sh ${VIRT_ENV_NAME} python3 ${BASE_DIR}/main_scripts/main_train_models.py \
--input_dir ${source_dir} --datasplit_dict ${datasplit_dict} --dataset era5 --model ${model} --model_hparams_dict ${model_hparams} --output_dir ${destination_dir}/
# run training
srun python ../main_scripts/main_train_models.py --input_dir ${source_dir} --datasplit_dict ${datasplit_dict} \
--dataset ${dataset} --model ${model} --model_hparams_dict ${model_hparams} --output_dir ${destination_dir} --checkpoint ${destination_dir}
## Two nodes, 8 GPUs
#srun -N 2 -n 8 --ntasks-per-node 4 singularity exec --nv ${CONTAINER_IMG} ./wrapper_container.sh ${VIRT_ENV_NAME} python3 ${BASE_DIR}/main_scripts/main_train_models.py \
# --input_dir ${source_dir} --datasplit_dict ${datasplit_dict} --dataset era5 --model ${model} --model_hparams_dict ${model_hparams} --output_dir ${destination_dir}/
@@ -19,63 +19,45 @@ check_argin() {
    if [[ $argin == *"-base_dir="* ]]; then
      base_outdir=${argin#"-base_dir="}
    fi
    if [[ $argin == *"-lcontainer"* ]]; then
      bool_container=1
    fi
  done
  if [[ -z "${bool_container}" ]]; then
    bool_container=0
  fi
}
# **************** Auxiliary functions ****************
# **************** Actual script ****************
# some first sanity checks
if [[ ${BASH_SOURCE[0]} == ${0} ]]; then
if [[ ${BASH_SOURCE[0]} == "${0}" ]]; then
echo "ERROR: 'create_env.sh' must be sourced, i.e. execute by prompting 'source create_env.sh [virt_env_name]'"
exit 1
fi
# from now on, just return if something unexpected occurs instead of exiting,
# since exiting would close the terminal (i.e. log the user out)
if [[ ! -n "$1" ]]; then
if [[ -z "$1" ]]; then
echo "ERROR: Provide a name to set up the virtual environment, i.e. execute by prompting 'source create_env.sh [virt_env_name]"
return
fi
if [[ "$#" -gt 1 ]]; then
check_argin ${@:2} # sets base_outdir if provided, always sets l_container
check_argin ${@:2} # sets base_outdir if provided
fi
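# Example invocation (the base_dir value is purely illustrative):
#   source create_env.sh my_venv -base_dir=/path/to/output_base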
# set some variables
HOST_NAME=`hostname`
HOST_NAME="$(hostname)"
ENV_NAME=$1
ENV_SETUP_DIR=`pwd`
WORKING_DIR="$(dirname "$ENV_SETUP_DIR")"
EXE_DIR="$(basename "$ENV_SETUP_DIR")"
THIS_DIR="$(pwd)"
WORKING_DIR="$(dirname "$THIS_DIR")"
EXE_DIR="$(basename "$THIS_DIR")"
ENV_DIR=${WORKING_DIR}/${ENV_NAME}
TF_CONTAINER=${WORKING_DIR}/env_setup/tensorflow_21.09-tf1-py3.sif
TF_CONTAINER=${WORKING_DIR}/HPC_scripts/tensorflow_21.09-tf1-py3.sif
## perform sanity checks
# correct bool_container if host is Juwels Booster and ensure running singularity
if [[ "${bool_container}" == 0 ]]; then
echo "******************************************** NOTE ********************************************"
echo " Set up virtual environment without TF1.15-container. "
echo " Note that training without container using GPUs on the Juelich HPC-systems is not possible! "
echo "******************************************** NOTE ********************************************"
fi
modules_purge=""
if [[ "${bool_container}" == 1 ]]; then
echo "Virtual environment will be set up in TensorFlow 1.15-container."
modules_purge=purge
# Check if the singularity container image exists
if [[ ! -f "${TF_CONTAINER}" ]]; then
echo "ERROR: Could not found required TensorFlow 1.15-container under ${TF_CONTAINER}"
if [[ ! -f ${CONTAINER_IMG} ]]; then
echo "ERROR: Cannot find required TF1.15 container image '${CONTAINER_IMG}'."
return
fi
fi
# further sanity checks:
# * ensure execution from env_setup-directory
@@ -95,16 +77,10 @@ else
fi
## check availability of required modules
if [[ "${HOST_NAME}" == hdfml* || "${HOST_NAME}" == *jwlogin* && ! "${HOST_NAME}" == *jwlogin2[2-4]* ]]; then
if [[ "${HOST_NAME}" == hdfml* || "${HOST_NAME}" == *jwlogin* ]]; then
# load modules and check for their availability
echo "***** Checking modules required during the workflow... *****"
source ${ENV_SETUP_DIR}/modules_preprocess.sh purge
source ${ENV_SETUP_DIR}/modules_train.sh purge
source ${ENV_SETUP_DIR}/modules_postprocess.sh ${modules_purge}
elif [[ "${HOST_NAME}" == *jwlogin2[2-4]* ]]; then
echo "***** Old Stages are not available on Juwels Booster ****"
echo "***** To check modules for preprocessing, ****"
echo "***** run this script on Juwels or HDF-ML. ****"
source "${ENV_SETUP_DIR}"/modules_preprocess.sh purge
else
echo "ERROR: AMBS-workflow is currently only supported on the Juelich HPC-systems HDF-ML, Juwels and Juwels Booster"
return
@@ -117,7 +93,6 @@ if [[ "$ENV_EXIST" == 0 ]]; then
# Activate virtual environment and install additional Python packages.
echo "Configuring and activating virtual environment on ${HOST_NAME}"
if [[ "${bool_container}" == 1 ]]; then
singularity exec --nv "${TF_CONTAINER}" ./install_venv_container.sh "${ENV_DIR}"
else
# create virtual environment here
......
@@ -2,17 +2,18 @@
#
# __authors__ = Bing Gong, Michael Langguth
# __date__ = '2021_10_28'
# __last_update__ = '2021_10_28' by Michael Langguth
# __last_update__ = '2022_01_26' by Michael Langguth
#
# **************** Description ****************
# This auxiliary script sets up the virtual environment within a singularity container.
# **************** Description ****************
# set some basic variables
BASE_DIR=`pwd`
BASE_DIR="$(pwd)"
VENV_BASE=$1
VENV_NAME="$(basename "${VENV_BASE}")"
VENV_DIR=${VENV_BASE}/${VENV_NAME}
VENV_REQ=${BASE_DIR}/requirements.txt
# sanity checks
# check if we are running in a container
@@ -34,8 +35,8 @@ if [ -d "$1" ]; then
fi
# check for requirement-file
if [ ! -f "${BASE_DIR}/requirements_container.txt" ]; then
echo "ERROR: Cannot find requirement-file ${BASE_DIR}/requirements_container.txt to set up virtual environment."
if [ ! -f "${VENV_REQ}" ]; then
echo "ERROR: Cannot find requirement-file '${VENV_REQ}' to set up virtual environment."
return
fi
@@ -55,7 +56,7 @@ source "${VENV_DIR}/bin/activate"
# set PYTHONPATH and install packages
export PYTHONPATH="/usr/local/lib/python3.8/dist-packages/"
echo 'export PYTHONPATH="/usr/local/lib/python3.8/dist-packages/"' >> "${VENV_DIR}/bin/activate"
pip install -r "${BASE_DIR}/requirements_container.txt"
pip install -r "${VENV_REQ}"
# go back to the base directory
cd "${BASE_DIR}" || exit
......
opencv-python==4.2.0.34
hickle
matplotlib==3.3.0
mpi4py==3.0.3
pandas==0.25.3
scikit-image==0.18.1
opencv-python-headless==4.2.0.34
matplotlib==3.3.0
mpi4py==3.0.3
pandas==0.25.3
scikit-image==0.18.1
opencv-python-headless==4.2.0.34