diff --git a/horovod/keras/submit_job_jureca_python3.sh b/horovod/keras/submit_job_jureca_python3.sh index 561a45df30794898960af6aa37ed927cc20db714..76fa6cd115b81f19c3422449e48da8b407e7f828 100755 --- a/horovod/keras/submit_job_jureca_python3.sh +++ b/horovod/keras/submit_job_jureca_python3.sh @@ -3,22 +3,20 @@ # Slurm job configuration #SBATCH --nodes=2 #SBATCH --ntasks=4 -#SBATCH --ntasks-per-node=2 +#SBATCH --ntasks-per-node=4 #SBATCH --output=output_%j.out #SBATCH --error=error_%j.er #SBATCH --time=00:10:00 #SBATCH --job-name=HOROVOD_KERAS_MNIST -#SBATCH --gres=gpu:2 --partition=develgpus +#SBATCH --gres=gpu:4 --partition=develgpus #SBATCH --mail-type=ALL # Load the required modules -module use /usr/local/software/jureca/OtherStages -module load Stages/2018b -module load GCC/7.3.0 -module load MVAPICH2/2.3-GDR -module load TensorFlow/1.12.0-GPU-Python-3.6.6 -module load Keras/2.2.4-GPU-Python-3.6.6 -module load Horovod/0.15.2-GPU-Python-3.6.6 +module load GCC/8.3.0 +module load MVAPICH2/2.3.1-GDR +module load TensorFlow/1.13.1-GPU-Python-3.6.8 +module load Keras/2.2.4-GPU-Python-3.6.8 +module load Horovod/0.16.2-GPU-Python-3.6.8 # Run the program srun python -u mnist.py diff --git a/horovod/pytorch/mnist.py b/horovod/pytorch/mnist.py index 3d1b9c584ab4079dfddc9fe5f6633ad9ab2145b4..4d90a01b5d2df3a203357984f6abf2fb7fa4f0cb 100644 --- a/horovod/pytorch/mnist.py +++ b/horovod/pytorch/mnist.py @@ -57,7 +57,7 @@ if args.cuda: dataset_file = os.path.join(data_dir, data_file) # [HPCNS] Dataset filename for this rank -dataset_for_rank = 'MNIST-data-%d' % hvd.rank() +dataset_for_rank = 'MNIST' # [HPCNS] If the path already exists, remove it if os.path.exists(dataset_for_rank): @@ -68,7 +68,7 @@ shutil.copytree(dataset_file, dataset_for_rank) kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} train_dataset = \ - datasets.MNIST(dataset_for_rank, train=True, download=False, + datasets.MNIST('', train=True, download=False, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) @@ -80,7 +80,7 @@ train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs) test_dataset = \ - datasets.MNIST(dataset_for_rank, train=False, download=False, transform=transforms.Compose([ + datasets.MNIST('', train=False, download=False, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])) diff --git a/horovod/pytorch/.submit_job_jureca_python3.sh b/horovod/pytorch/submit_job_jureca_python3.sh similarity index 54% rename from horovod/pytorch/.submit_job_jureca_python3.sh rename to horovod/pytorch/submit_job_jureca_python3.sh index 1afd8012e9c0ebb3078c0972bb05ede9caadbf2d..754793f7b3a86aca289b8d9b105b5057fb207d4a 100755 --- a/horovod/pytorch/.submit_job_jureca_python3.sh +++ b/horovod/pytorch/submit_job_jureca_python3.sh @@ -3,20 +3,20 @@ # Slurm job configuration #SBATCH --nodes=2 #SBATCH --ntasks=4 -#SBATCH --ntasks-per-node=2 +#SBATCH --ntasks-per-node=4 #SBATCH --output=output_%j.out #SBATCH --error=error_%j.er #SBATCH --time=00:10:00 #SBATCH --job-name=HOROVOD_PYTORCH_MNIST -#SBATCH --gres=gpu:2 --partition=develgpus +#SBATCH --gres=gpu:4 --partition=develgpus #SBATCH --mail-type=ALL # Load the required modules -module load GCC/7.3.0 -module load MVAPICH2/2.3-GDR -module load PyTorch/1.0.0-GPU-Python-3.6.6 -module load torchvision/0.2.1-GPU-Python-3.6.6 -module load Horovod/0.15.2-GPU-Python-3.6.6 +module load GCC/8.3.0 +module load MVAPICH2/2.3.1-GDR +module load PyTorch/1.1.0-GPU-Python-3.6.8 +module load torchvision/0.3.0-GPU-Python-3.6.8 +module load Horovod/0.16.2-GPU-Python-3.6.8 # Run the program srun python -u mnist.py diff --git a/horovod/tensorflow/submit_job_jureca_python3.sh b/horovod/tensorflow/submit_job_jureca_python3.sh index 2a50c8b813a7c7eeb56bdfd993b8738585fb6342..bf0b4e6cd79ed32f491170f037d1b248d2156f2f 100755 --- a/horovod/tensorflow/submit_job_jureca_python3.sh +++ b/horovod/tensorflow/submit_job_jureca_python3.sh @@ -3,21 +3,20 @@ # Slurm job configuration #SBATCH --nodes=2 #SBATCH --ntasks=4 -#SBATCH --ntasks-per-node=2 +#SBATCH --ntasks-per-node=4 #SBATCH --output=output_%j.out #SBATCH --error=error_%j.er #SBATCH --time=00:10:00 #SBATCH --job-name=HOROVOD_TFLOW_MNIST -#SBATCH --gres=gpu:2 --partition=develgpus +#SBATCH --gres=gpu:4 --partition=develgpus #SBATCH --mail-type=ALL # Load the required modules -module use /usr/local/software/jureca/OtherStages -module load Stages/2018b -module load GCC/7.3.0 -module load MVAPICH2/2.3-GDR -module load TensorFlow/1.12.0-GPU-Python-3.6.6 -module load Horovod/0.15.2-GPU-Python-3.6.6 +module load GCC/8.3.0 +module load MVAPICH2/2.3.1-GDR +module load TensorFlow/1.13.1-GPU-Python-3.6.8 +module load Keras/2.2.4-GPU-Python-3.6.8 +module load Horovod/0.16.2-GPU-Python-3.6.8 # Run the program srun python -u mnist.py diff --git a/pytorch/mnist.py b/pytorch/mnist.py index d4092b614e9cc2045952884199c63eafef5f7e5b..19bcac053726b51c1cb8d1c393546f70d037d6fd 100644 --- a/pytorch/mnist.py +++ b/pytorch/mnist.py @@ -108,7 +108,7 @@ def main(): dataset_file = os.path.join(data_dir, data_file) # [HPCNS] A copy of the dataset in the current directory - dataset_copy = 'MNIST-data' + dataset_copy = 'MNIST' # [HPCNS] If the path already exists, remove it if os.path.exists(dataset_copy): @@ -120,14 +120,14 @@ def main(): kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} train_loader = torch.utils.data.DataLoader( - datasets.MNIST(dataset_copy, train=True, download=False, + datasets.MNIST('', train=True, download=False, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])), batch_size=args.batch_size, shuffle=True, **kwargs) test_loader = torch.utils.data.DataLoader( - datasets.MNIST(dataset_copy, train=False, download=False, transform=transforms.Compose([ + datasets.MNIST('', train=False, download=False, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])), diff --git a/pytorch/submit_job_jureca_python3.sh b/pytorch/submit_job_jureca_python3.sh index d6d0de120c5c74a4e520a2a7f2cbd807da3e8c35..15f53ac1a55630cc5c628413738dacd4fab4429e 100755 --- a/pytorch/submit_job_jureca_python3.sh +++ b/pytorch/submit_job_jureca_python3.sh @@ -12,12 +12,9 @@ #SBATCH --mail-type=ALL # Load the required modules -module use /usr/local/software/jureca/OtherStages -module load Stages/2018b -module load GCC/7.3.0 -module load MVAPICH2/2.3-GDR -module load PyTorch/1.0.0-GPU-Python-3.6.6 -module load torchvision/0.2.1-GPU-Python-3.6.6 +module load GCC/8.3.0 +module load PyTorch/1.1.0-GPU-Python-3.6.8 +module load torchvision/0.3.0-GPU-Python-3.6.8 # Run the program srun python -u mnist.py diff --git a/tensorflow/submit_job_jureca_python3.sh b/tensorflow/submit_job_jureca_python3.sh index f9791cf51bce987caf46d4790523b784b339a853..fa294f1cb401c9cda6a1c20ab716419a64262e07 100755 --- a/tensorflow/submit_job_jureca_python3.sh +++ b/tensorflow/submit_job_jureca_python3.sh @@ -7,7 +7,7 @@ #SBATCH --output=output_%j.out #SBATCH --error=error_%j.er #SBATCH --time=00:10:00 -#SBATCH --job-name=TENSORFLOW_MNIST +#SBATCH --job-name=TFLOW_MNIST #SBATCH --gres=gpu:1 --partition=develgpus #SBATCH --mail-type=ALL