Skip to content
Snippets Groups Projects
Commit 2751d5ef authored by Fahad Khalid
Browse files

Added/updated job scripts for data-distributed training.

parent d028ba3c
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env bash
#
# Slurm batch job: data-distributed MNIST training
# (mnist_data_distributed.py) launched through srun.
#
# Scheduler directives follow. They must stay ahead of the first
# executable command, because sbatch stops reading directives there.
#
# Geometry: 2 nodes x 4 tasks per node = 8 tasks in total.
#SBATCH --nodes=2
#SBATCH --ntasks=8
#SBATCH --ntasks-per-node=4
# stdout / stderr files; %j expands to the numeric job ID.
#SBATCH --output=output_%j.out
#SBATCH --error=error_%j.er
# Wall-clock limit of ten minutes (HH:MM:SS).
#SBATCH --time=00:10:00
#SBATCH --job-name=HVD_DATA_DIST
# Four GPUs on each node, on the "develgpus" partition
# (presumably one GPU per task -- confirm against the training script).
#SBATCH --gres=gpu:4
#SBATCH --partition=develgpus
# E-mail notification on all job events.
#SBATCH --mail-type=ALL

# Software stack. The order matters and is preserved: compiler first,
# then MPI, then the frameworks built on top of them.
job_modules=(
  GCC/8.3.0
  MVAPICH2/2.3.1-GDR
  TensorFlow/1.13.1-GPU-Python-3.6.8
  Horovod/0.16.2-GPU-Python-3.6.8
)
for job_module in "${job_modules[@]}"; do
  module load "${job_module}"
done

# Start one Python process per Slurm task; -u keeps Python's output
# unbuffered so progress shows up in the log files as it happens.
srun python -u mnist_data_distributed.py
#!/usr/bin/env bash
#
# LSF batch job: data-distributed MNIST training
# (mnist_data_distributed.py) launched through mpirun.
#
# Scheduler directives follow.
#
# Queue "normal", run-time limit of 10 minutes.
#BSUB -q normal
#BSUB -W 10
# Geometry: 8 tasks in total, 4 per host (ptile), 4 GPUs per host.
#BSUB -n 8
#BSUB -R "span[ptile=4]"
#BSUB -gpu "num=4"
# stderr / stdout files; %J expands to the job ID.
# NOTE(review): the error file uses "." after "error" while the output
# file uses "_" -- confirm that the difference is intended.
#BSUB -e "error.%J.er"
#BSUB -o "output_%J.out"
#BSUB -J HVD_DATA_DIST

# Software stack, loaded in the original order.
job_modules=(
  python/3.6.1
  tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130
  horovod/0.15.2
)
for job_module in "${job_modules[@]}"; do
  module load "${job_module}"
done

# mpirun options, kept in an array so each one can be annotated.
mpirun_opts=(
  -bind-to none          # do not pin processes to cores
  -map-by slot           # place ranks slot by slot
  -x NCCL_DEBUG=INFO     # forward NCCL debug logging to every rank
  -x LD_LIBRARY_PATH     # forward the library search path
  -x PATH                # forward the executable search path
  -mca pml ob1           # select the ob1 point-to-point layer
  -mca btl '^openib'     # exclude the openib byte-transfer layer
)

# Launch the training script; -u keeps Python's output unbuffered.
mpirun "${mpirun_opts[@]}" python -u mnist_data_distributed.py
...@@ -17,8 +17,5 @@ module load MVAPICH2/2.3.1-GDR ...@@ -17,8 +17,5 @@ module load MVAPICH2/2.3.1-GDR
module load TensorFlow/1.13.1-GPU-Python-3.6.8 module load TensorFlow/1.13.1-GPU-Python-3.6.8
module load Horovod/0.16.2-GPU-Python-3.6.8 module load Horovod/0.16.2-GPU-Python-3.6.8
# Source the virtual environment
source activate venv_dl_hpc4ns/bin/activate
# Run the program # Run the program
srun python -u mnist_data_distributed.py srun python -u mnist_data_distributed.py
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment