From a4af4d26027a2a3949f2d688d1444c100c4f81bd Mon Sep 17 00:00:00 2001
From: Fahad Khalid <f.khalid@fz-juelich.de>
Date: Wed, 26 Jun 2019 12:26:20 +0200
Subject: [PATCH] Added job scripts for JUWELS stage 2019a. Keras and
 TensorFlow are working both independently and with Horovod. PyTorch code is
 generating dataset-related errors that need to be investigated; possibly just
 a matter of updating the custom pre-downloaded-dataset handling code.

---
 horovod/keras/submit_job_juwels_python3.sh    | 22 +++++++++++++++++++
 horovod/pytorch/submit_job_juwels_python3.sh  | 22 +++++++++++++++++++
 .../tensorflow/submit_job_juwels_python3.sh   | 22 +++++++++++++++++++
 keras/submit_job_juwels_python3.sh            | 20 +++++++++++++++++
 pytorch/submit_job_juwels_python3.sh          | 20 +++++++++++++++++
 tensorflow/submit_job_juwels_python3.sh       | 19 ++++++++++++++++
 6 files changed, 125 insertions(+)
 create mode 100755 horovod/keras/submit_job_juwels_python3.sh
 create mode 100755 horovod/pytorch/submit_job_juwels_python3.sh
 create mode 100755 horovod/tensorflow/submit_job_juwels_python3.sh
 create mode 100755 keras/submit_job_juwels_python3.sh
 create mode 100755 pytorch/submit_job_juwels_python3.sh
 create mode 100755 tensorflow/submit_job_juwels_python3.sh

diff --git a/horovod/keras/submit_job_juwels_python3.sh b/horovod/keras/submit_job_juwels_python3.sh
new file mode 100755
index 0000000..76fa6cd
--- /dev/null
+++ b/horovod/keras/submit_job_juwels_python3.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration: 2 nodes x 4 tasks per node = 8 tasks (--ntasks must match, else GPUs idle)
+#SBATCH --nodes=2
+#SBATCH --ntasks=8
+#SBATCH --ntasks-per-node=4
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=HOROVOD_KERAS_MNIST
+#SBATCH --gres=gpu:4 --partition=develgpus
+#SBATCH --mail-type=ALL
+
+# Load the required modules (compiler and MPI first, then the framework stack)
+module load GCC/8.3.0
+module load MVAPICH2/2.3.1-GDR
+module load TensorFlow/1.13.1-GPU-Python-3.6.8
+module load Keras/2.2.4-GPU-Python-3.6.8
+module load Horovod/0.16.2-GPU-Python-3.6.8
+
+# Run the program: srun starts one copy per task; -u keeps Python output unbuffered
+srun python -u mnist.py
diff --git a/horovod/pytorch/submit_job_juwels_python3.sh b/horovod/pytorch/submit_job_juwels_python3.sh
new file mode 100755
index 0000000..754793f
--- /dev/null
+++ b/horovod/pytorch/submit_job_juwels_python3.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration: 2 nodes x 4 tasks per node = 8 tasks (--ntasks must match, else GPUs idle)
+#SBATCH --nodes=2
+#SBATCH --ntasks=8
+#SBATCH --ntasks-per-node=4
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=HOROVOD_PYTORCH_MNIST
+#SBATCH --gres=gpu:4 --partition=develgpus
+#SBATCH --mail-type=ALL
+
+# Load the required modules (compiler and MPI first, then the framework stack)
+module load GCC/8.3.0
+module load MVAPICH2/2.3.1-GDR
+module load PyTorch/1.1.0-GPU-Python-3.6.8
+module load torchvision/0.3.0-GPU-Python-3.6.8
+module load Horovod/0.16.2-GPU-Python-3.6.8
+
+# Run the program: srun starts one copy per task; -u keeps Python output unbuffered
+srun python -u mnist.py
diff --git a/horovod/tensorflow/submit_job_juwels_python3.sh b/horovod/tensorflow/submit_job_juwels_python3.sh
new file mode 100755
index 0000000..bf0b4e6
--- /dev/null
+++ b/horovod/tensorflow/submit_job_juwels_python3.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration: 2 nodes x 4 tasks per node = 8 tasks (--ntasks must match, else GPUs idle)
+#SBATCH --nodes=2
+#SBATCH --ntasks=8
+#SBATCH --ntasks-per-node=4
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=HOROVOD_TFLOW_MNIST
+#SBATCH --gres=gpu:4 --partition=develgpus
+#SBATCH --mail-type=ALL
+
+# Load the required modules (compiler and MPI first, then the framework stack)
+module load GCC/8.3.0
+module load MVAPICH2/2.3.1-GDR
+module load TensorFlow/1.13.1-GPU-Python-3.6.8
+module load Keras/2.2.4-GPU-Python-3.6.8
+module load Horovod/0.16.2-GPU-Python-3.6.8
+
+# Run the program: srun starts one copy per task; -u keeps Python output unbuffered
+srun python -u mnist.py
diff --git a/keras/submit_job_juwels_python3.sh b/keras/submit_job_juwels_python3.sh
new file mode 100755
index 0000000..429c440
--- /dev/null
+++ b/keras/submit_job_juwels_python3.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration: 1 node, 1 task, 1 GPU, 10-minute limit on the develgpus partition
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=KERAS_MNIST
+#SBATCH --gres=gpu:1 --partition=develgpus
+#SBATCH --mail-type=ALL
+
+# Load the required modules (compiler first, then TensorFlow and Keras)
+module load GCC/8.3.0
+module load TensorFlow/1.13.1-GPU-Python-3.6.8
+module load Keras/2.2.4-GPU-Python-3.6.8
+
+# Run the program as a Slurm job step; -u keeps Python's stdout/stderr unbuffered
+srun python -u mnist.py
diff --git a/pytorch/submit_job_juwels_python3.sh b/pytorch/submit_job_juwels_python3.sh
new file mode 100755
index 0000000..15f53ac
--- /dev/null
+++ b/pytorch/submit_job_juwels_python3.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration: 1 node, 1 task, 1 GPU, 10-minute limit on the develgpus partition
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=PYTORCH_MNIST
+#SBATCH --gres=gpu:1 --partition=develgpus
+#SBATCH --mail-type=ALL
+
+# Load the required modules (compiler first, then PyTorch and torchvision)
+module load GCC/8.3.0
+module load PyTorch/1.1.0-GPU-Python-3.6.8
+module load torchvision/0.3.0-GPU-Python-3.6.8
+
+# Run the program as a Slurm job step; -u keeps Python's stdout/stderr unbuffered
+srun python -u mnist.py
diff --git a/tensorflow/submit_job_juwels_python3.sh b/tensorflow/submit_job_juwels_python3.sh
new file mode 100755
index 0000000..fda7d98
--- /dev/null
+++ b/tensorflow/submit_job_juwels_python3.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration: 1 node, 1 task, 1 GPU, 10-minute limit on the develgpus partition
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=TFLOW_MNIST
+#SBATCH --gres=gpu:1 --partition=develgpus
+#SBATCH --mail-type=ALL
+
+# Load the required modules (compiler first, then TensorFlow)
+module load GCC/8.3.0
+module load TensorFlow/1.13.1-GPU-Python-3.6.8
+
+# Run the program as a Slurm job step; -u keeps Python's stdout/stderr unbuffered
+srun python -u mnist.py
-- 
GitLab