From a4af4d26027a2a3949f2d688d1444c100c4f81bd Mon Sep 17 00:00:00 2001
From: Fahad Khalid <f.khalid@fz-juelich.de>
Date: Wed, 26 Jun 2019 12:26:20 +0200
Subject: [PATCH] Added job scripts for JUWELS stage 2019a. Keras and
 Tensorflow are working both independently and with Horovod. PyTorch code is
 generating dataset related errors that need to be investigated; possibly just
 a matter of updating the custom pre-downloaded-dataset handling code.

---
Reviewer notes (free text between the three-dash line and the first diff is
not part of the commit message and is skipped when the patch is applied):

* This patch adds six Slurm batch scripts, one per framework (Keras, PyTorch,
  TensorFlow), each in a single-task variant and a Horovod variant. Every
  script loads its software modules and then starts `mnist.py` through srun.
* NOTE(review): the three Horovod scripts request --nodes=2 and --gres=gpu:4
  together with --ntasks=4 and --ntasks-per-node=4. According to the sbatch
  documentation, --ntasks takes precedence and --ntasks-per-node then acts as
  a per-node maximum, so only 4 tasks are requested in total while GPUs are
  reserved on two nodes. Presumably either --ntasks=8 or --nodes=1 was
  intended; confirm before relying on these scripts.

 horovod/keras/submit_job_juwels_python3.sh    | 22 +++++++++++++++++++
 horovod/pytorch/submit_job_juwels_python3.sh  | 22 +++++++++++++++++++
 .../tensorflow/submit_job_juwels_python3.sh   | 22 +++++++++++++++++++
 keras/submit_job_juwels_python3.sh            | 20 +++++++++++++++++
 pytorch/submit_job_juwels_python3.sh          | 20 +++++++++++++++++
 tensorflow/submit_job_juwels_python3.sh       | 19 ++++++++++++++++
 6 files changed, 125 insertions(+)
 create mode 100755 horovod/keras/submit_job_juwels_python3.sh
 create mode 100755 horovod/pytorch/submit_job_juwels_python3.sh
 create mode 100755 horovod/tensorflow/submit_job_juwels_python3.sh
 create mode 100755 keras/submit_job_juwels_python3.sh
 create mode 100755 pytorch/submit_job_juwels_python3.sh
 create mode 100755 tensorflow/submit_job_juwels_python3.sh

diff --git a/horovod/keras/submit_job_juwels_python3.sh b/horovod/keras/submit_job_juwels_python3.sh
new file mode 100755
index 0000000..76fa6cd
--- /dev/null
+++ b/horovod/keras/submit_job_juwels_python3.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration
+#SBATCH --nodes=2
+#SBATCH --ntasks=4
+#SBATCH --ntasks-per-node=4
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=HOROVOD_KERAS_MNIST
+#SBATCH --gres=gpu:4 --partition=develgpus
+#SBATCH --mail-type=ALL
+
+# Load the required modules
+module load GCC/8.3.0
+module load MVAPICH2/2.3.1-GDR
+module load TensorFlow/1.13.1-GPU-Python-3.6.8
+module load Keras/2.2.4-GPU-Python-3.6.8
+module load Horovod/0.16.2-GPU-Python-3.6.8
+
+# Run the program
+srun python -u mnist.py
diff --git a/horovod/pytorch/submit_job_juwels_python3.sh b/horovod/pytorch/submit_job_juwels_python3.sh
new file mode 100755
index 0000000..754793f
--- /dev/null
+++ b/horovod/pytorch/submit_job_juwels_python3.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration
+#SBATCH --nodes=2
+#SBATCH --ntasks=4
+#SBATCH --ntasks-per-node=4
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=HOROVOD_PYTORCH_MNIST
+#SBATCH --gres=gpu:4 --partition=develgpus
+#SBATCH --mail-type=ALL
+
+# Load the required modules
+module load GCC/8.3.0
+module load MVAPICH2/2.3.1-GDR
+module load PyTorch/1.1.0-GPU-Python-3.6.8
+module load torchvision/0.3.0-GPU-Python-3.6.8
+module load Horovod/0.16.2-GPU-Python-3.6.8
+
+# Run the program
+srun python -u mnist.py
diff --git a/horovod/tensorflow/submit_job_juwels_python3.sh b/horovod/tensorflow/submit_job_juwels_python3.sh
new file mode 100755
index 0000000..bf0b4e6
--- /dev/null
+++ b/horovod/tensorflow/submit_job_juwels_python3.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration
+#SBATCH --nodes=2
+#SBATCH --ntasks=4
+#SBATCH --ntasks-per-node=4
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=HOROVOD_TFLOW_MNIST
+#SBATCH --gres=gpu:4 --partition=develgpus
+#SBATCH --mail-type=ALL
+
+# Load the required modules
+module load GCC/8.3.0
+module load MVAPICH2/2.3.1-GDR
+module load TensorFlow/1.13.1-GPU-Python-3.6.8
+module load Keras/2.2.4-GPU-Python-3.6.8
+module load Horovod/0.16.2-GPU-Python-3.6.8
+
+# Run the program
+srun python -u mnist.py
diff --git a/keras/submit_job_juwels_python3.sh b/keras/submit_job_juwels_python3.sh
new file mode 100755
index 0000000..429c440
--- /dev/null
+++ b/keras/submit_job_juwels_python3.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=KERAS_MNIST
+#SBATCH --gres=gpu:1 --partition=develgpus
+#SBATCH --mail-type=ALL
+
+# Load the required modules
+module load GCC/8.3.0
+module load TensorFlow/1.13.1-GPU-Python-3.6.8
+module load Keras/2.2.4-GPU-Python-3.6.8
+
+# Run the program
+srun python -u mnist.py
diff --git a/pytorch/submit_job_juwels_python3.sh b/pytorch/submit_job_juwels_python3.sh
new file mode 100755
index 0000000..15f53ac
--- /dev/null
+++ b/pytorch/submit_job_juwels_python3.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=PYTORCH_MNIST
+#SBATCH --gres=gpu:1 --partition=develgpus
+#SBATCH --mail-type=ALL
+
+# Load the required modules
+module load GCC/8.3.0
+module load PyTorch/1.1.0-GPU-Python-3.6.8
+module load torchvision/0.3.0-GPU-Python-3.6.8
+
+# Run the program
+srun python -u mnist.py
diff --git a/tensorflow/submit_job_juwels_python3.sh b/tensorflow/submit_job_juwels_python3.sh
new file mode 100755
index 0000000..fda7d98
--- /dev/null
+++ b/tensorflow/submit_job_juwels_python3.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=TFLOW_MNIST
+#SBATCH --gres=gpu:1 --partition=develgpus
+#SBATCH --mail-type=ALL
+
+# Load the required modules
+module load GCC/8.3.0
+module load TensorFlow/1.13.1-GPU-Python-3.6.8
+
+# Run the program
+srun python -u mnist.py
-- 
GitLab