diff --git a/scripts/cyclone_basilisk/.gitkeep b/scripts/CYCLONE_BASILISK/.gitkeep similarity index 100% rename from scripts/cyclone_basilisk/.gitkeep rename to scripts/CYCLONE_BASILISK/.gitkeep diff --git a/scripts/CYCLONE_BASILISK/_script.sh b/scripts/CYCLONE_BASILISK/_script.sh new file mode 100644 index 0000000000000000000000000000000000000000..4840e23f677e2aca7f6fe0ddc4a60217e2b58467 --- /dev/null +++ b/scripts/CYCLONE_BASILISK/_script.sh @@ -0,0 +1,16 @@ +#!/bin/bash +#SBATCH --job-name=job +#SBATCH --output=job.out +#SBATCH --error=job.err +#SBATCH --account=%account% +#SBATCH --partition=%partition% +#SBATCH --nodes=%nodes% + +#MODULES BEGIN cyclone basilisk +module purge +ml load SWIG/4.0.2-GCCcore-10.2.0 Bison/3.7.1-GCCcore-10.2.0 CMake/3.18.4-GCCcore-10.2.0 Python/3.8.6-GCCcore-10.2.0 flex/2.6.4-GCCcore-10.2.0 glew/2.2.0-GCCcore-10.2.0-osmesa Mesa/20.2.1-GCCcore-10.2.0 libGLU/9.0.1-GCCcore-10.2.0 OpenMPI/4.0.5-GCC-10.2.0 +#MODULES END + +source your/env_path/bin/activate + +srun --exclusive %executable% diff --git a/scripts/CYCLONE_BASILISK/lamec.json b/scripts/CYCLONE_BASILISK/lamec.json new file mode 100644 index 0000000000000000000000000000000000000000..2837d615492f1ae7c1cb69f192c84ee7f493e8a6 --- /dev/null +++ b/scripts/CYCLONE_BASILISK/lamec.json @@ -0,0 +1 @@ +{"template": "_script.sh"} diff --git a/scripts/cyclone_horovod/.gitkeep b/scripts/CYCLONE_Horovod/.gitkeep similarity index 100% rename from scripts/cyclone_horovod/.gitkeep rename to scripts/CYCLONE_Horovod/.gitkeep diff --git a/scripts/CYCLONE_Horovod/_script.sh b/scripts/CYCLONE_Horovod/_script.sh new file mode 100644 index 0000000000000000000000000000000000000000..bf0f6b929e26cb648fa235f2bf00229abd4cc193 --- /dev/null +++ b/scripts/CYCLONE_Horovod/_script.sh @@ -0,0 +1,19 @@ +#!/bin/bash +#SBATCH --job-name=job +#SBATCH --output=job.out +#SBATCH --error=job.err +#SBATCH --account=%account% +#SBATCH --partition=%partition% +#SBATCH --nodes=%nodes% +#SBATCH --gpus-per-node=4 +#SBATCH --ntasks-per-node=4 + +#MODULES BEGIN cyclone horovod +module purge +ml load h5py tqdm matplotlib PyTorch/1.9.1-fosscuda-2020b Horovod/0.22.0-fosscuda-2020b-PyTorch-1.9.1 +#MODULES END + +source your/env_path/bin/activate + +# Horovod NCCL/MPI setup +srun --cpu-bind=none python3 -u %executable% diff --git a/scripts/CYCLONE_Horovod/lamec.json b/scripts/CYCLONE_Horovod/lamec.json new file mode 100644 index 0000000000000000000000000000000000000000..2837d615492f1ae7c1cb69f192c84ee7f493e8a6 --- /dev/null +++ b/scripts/CYCLONE_Horovod/lamec.json @@ -0,0 +1 @@ +{"template": "_script.sh"} diff --git a/scripts/DEEP_DeepSpeed/DEEP_DeepSpeed_script.sh b/scripts/DEEP_DeepSpeed/DEEP_DeepSpeed_script.sh new file mode 100644 index 0000000000000000000000000000000000000000..e40ea0ad78120b998d814955e5968a67098b6324 --- /dev/null +++ b/scripts/DEEP_DeepSpeed/DEEP_DeepSpeed_script.sh @@ -0,0 +1,30 @@ +#!/bin/bash +#SBATCH --job-name=job +#SBATCH --output=job.out +#SBATCH --error=job.err +#SBATCH --account=%account% +#SBATCH --partition=%partition% +#SBATCH --nodes=%nodes% +#SBATCH --gpus-per-node=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=12 +#SBATCH --exclusive +#SBATCH --gres=gpu:1 + +#MODULES BEGIN DEEP DeepSpeed +ml Stages/2024 GCC OpenMPI CUDA/12 cuDNN MPI-settings/CUDA +ml Python CMake HDF5 PnetCDF libaio mpi4py +#MODULES END + +# variables for specific HPC +export CUDA_VISIBLE_DEVICES="0" +ln -sf /usr/lib64/libcuda.so.1 +ln -sf /usr/lib64/libnvidia-ml.so.1 +export LD_LIBRARY_PATH=.:/usr/local/cuda-11.7/lib64:$LD_LIBRARY_PATH + +source 
your/env_path/bin/activate + +# DeepSpeed NCCL/MPI setup +export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)i +export MASTER_PORT=29500 +srun --cpu-bind=none python %executable% --deepspeed diff --git a/scripts/DEEP_DeepSpeed/lamec.json b/scripts/DEEP_DeepSpeed/lamec.json new file mode 100644 index 0000000000000000000000000000000000000000..961eb5dffb1f88ccd8264a28b34690333203d180 --- /dev/null +++ b/scripts/DEEP_DeepSpeed/lamec.json @@ -0,0 +1 @@ +{"template": "DEEP_DeepSpeed_script.sh"} diff --git a/scripts/DEEP_HeAT/DEEP_HeAT_script.sh b/scripts/DEEP_HeAT/DEEP_HeAT_script.sh new file mode 100644 index 0000000000000000000000000000000000000000..3586839ff747dc2665979bc56fae48de93e0fb48 --- /dev/null +++ b/scripts/DEEP_HeAT/DEEP_HeAT_script.sh @@ -0,0 +1,28 @@ +#!/bin/bash +#SBATCH --job-name=job +#SBATCH --output=job.out +#SBATCH --error=job.err +#SBATCH --account=%account% +#SBATCH --partition=%partition% +#SBATCH --nodes=%nodes% +#SBATCH --gpus-per-node=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=12 +#SBATCH --exclusive +#SBATCH --gres=gpu:1 + +#MODULES BEGIN DEEP HeAT +ml Stages/2024 GCC OpenMPI CUDA/12 cuDNN MPI-settings/CUDA +ml Python CMake HDF5 PnetCDF libaio mpi4py +#MODULES END + +# variables for specific HPC +export CUDA_VISIBLE_DEVICES="0" +ln -sf /usr/lib64/libcuda.so.1 +ln -sf /usr/lib64/libnvidia-ml.so.1 +export LD_LIBRARY_PATH=.:/usr/local/cuda-11.7/lib64:$LD_LIBRARY_PATH + +source your/env_path/bin/activate + +# HeAT NCCL setup +srun --cpu-bind=none python3 -u %executable% diff --git a/scripts/DEEP_HeAT/lamec.json b/scripts/DEEP_HeAT/lamec.json new file mode 100644 index 0000000000000000000000000000000000000000..22c7482fa0cc46924223ae3497c93ee2065b455a --- /dev/null +++ b/scripts/DEEP_HeAT/lamec.json @@ -0,0 +1 @@ +{"template": "DEEP_HeAT_script.sh"} diff --git a/scripts/DEEP_Horovod/DEEP_Horovod_script.sh b/scripts/DEEP_Horovod/DEEP_Horovod_script.sh new file mode 100644 index 0000000000000000000000000000000000000000..a077b9a51476d4a9cd3a6bf726c70eb6090b9a5a --- /dev/null +++ b/scripts/DEEP_Horovod/DEEP_Horovod_script.sh @@ -0,0 +1,28 @@ +#!/bin/bash +#SBATCH --job-name=job +#SBATCH --output=job.out +#SBATCH --error=job.err +#SBATCH --account=%account% +#SBATCH --partition=%partition% +#SBATCH --nodes=%nodes% +#SBATCH --gpus-per-node=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=12 +#SBATCH --exclusive +#SBATCH --gres=gpu:1 + +#MODULES BEGIN DEEP Horovod +ml Stages/2024 GCC OpenMPI CUDA/12 cuDNN MPI-settings/CUDA +ml Python CMake HDF5 PnetCDF libaio mpi4py +#MODULES END + +# variables for specific HPC +export CUDA_VISIBLE_DEVICES="0" +ln -sf /usr/lib64/libcuda.so.1 +ln -sf /usr/lib64/libnvidia-ml.so.1 +export LD_LIBRARY_PATH=.:/usr/local/cuda-11.7/lib64:$LD_LIBRARY_PATH + +source your/env_path/bin/activate + +# Horovod NCCL/MPI setup +srun --mpi=pspmix python3 -u %executable% diff --git a/scripts/DEEP_Horovod/lamec.json b/scripts/DEEP_Horovod/lamec.json new file mode 100644 index 0000000000000000000000000000000000000000..7e0350b5fc7833f15f2102f4686d4e2e1cafe7e8 --- /dev/null +++ b/scripts/DEEP_Horovod/lamec.json @@ -0,0 +1 @@ +{"template": "DEEP_Horovod_script.sh"} diff --git a/scripts/DEEP_Pytorch-DDP/DEEP_Pytorch-DDP_script.sh b/scripts/DEEP_Pytorch-DDP/DEEP_Pytorch-DDP_script.sh new file mode 100644 index 0000000000000000000000000000000000000000..5f415a2ef4634872b829a4f1ed0011e516e14297 --- /dev/null +++ b/scripts/DEEP_Pytorch-DDP/DEEP_Pytorch-DDP_script.sh @@ -0,0 +1,36 @@ +#!/bin/bash +#SBATCH --job-name=job +#SBATCH 
--output=job.out +#SBATCH --error=job.err +#SBATCH --account=%account% +#SBATCH --partition=%partition% +#SBATCH --nodes=%nodes% +#SBATCH --gpus-per-node=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=12 +#SBATCH --exclusive +#SBATCH --gres=gpu:1 + +#MODULES BEGIN DEEP Pytorch-DDP +ml Stages/2024 GCC OpenMPI CUDA/12 cuDNN MPI-settings/CUDA +ml Python CMake HDF5 PnetCDF libaio mpi4py +#MODULES END + +# variables for specific HPC +export CUDA_VISIBLE_DEVICES="0" +ln -sf /usr/lib64/libcuda.so.1 +ln -sf /usr/lib64/libnvidia-ml.so.1 +export LD_LIBRARY_PATH=.:/usr/local/cuda-11.7/lib64:$LD_LIBRARY_PATH + +source your/env_path/bin/activate + + # DDP NCCL setup +srun --cpu-bind=none bash -c "torchrun \ + --log_dir='logs' \ + --nnodes=$SLURM_NNODES \ + --nproc_per_node=$SLURM_GPUS_PER_NODE \ + --rdzv_id=$SLURM_JOB_ID \ + --rdzv_backend=c10d \ + --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \ + --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ +%executable%" diff --git a/scripts/DEEP_Pytorch-DDP/lamec.json b/scripts/DEEP_Pytorch-DDP/lamec.json new file mode 100644 index 0000000000000000000000000000000000000000..000e9a617309150327057beaecb5a8008136d295 --- /dev/null +++ b/scripts/DEEP_Pytorch-DDP/lamec.json @@ -0,0 +1 @@ +{"template": "DEEP_Pytorch-DDP_script.sh"} diff --git a/scripts/JURECA_DeepSpeed/JURECA_DeepSpeed_script.sh b/scripts/JURECA_DeepSpeed/JURECA_DeepSpeed_script.sh new file mode 100644 index 0000000000000000000000000000000000000000..7a88e71ae4f2b0ed798f28e69750587eec4b084a --- /dev/null +++ b/scripts/JURECA_DeepSpeed/JURECA_DeepSpeed_script.sh @@ -0,0 +1,26 @@ +#!/bin/bash +#SBATCH --job-name=job +#SBATCH --output=job.out +#SBATCH --error=job.err +#SBATCH --account=%account% +#SBATCH --partition=%partition% +#SBATCH --nodes=%nodes% +#SBATCH --gpus-per-node=4 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=128 +#SBATCH --exclusive +#SBATCH --gres=gpu:4 + +#MODULES BEGIN JURECA DeepSpeed +ml GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py +#MODULES END + +# variables for specific HPC +export CUDA_VISIBLE_DEVICES="0,1,2,3" + +source your/env_path/bin/activate + +# DeepSpeed NCCL/MPI setup +export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)i +export MASTER_PORT=29500 +srun --cpu-bind=none python %executable% --deepspeed diff --git a/scripts/JURECA_DeepSpeed/lamec.json b/scripts/JURECA_DeepSpeed/lamec.json new file mode 100644 index 0000000000000000000000000000000000000000..61354153e70b81cedf80265edcb7c7fefe6ee790 --- /dev/null +++ b/scripts/JURECA_DeepSpeed/lamec.json @@ -0,0 +1 @@ +{"template": "JURECA_DeepSpeed_script.sh"} diff --git a/scripts/JURECA_HeAT/JURECA_HeAT_script.sh b/scripts/JURECA_HeAT/JURECA_HeAT_script.sh new file mode 100644 index 0000000000000000000000000000000000000000..1fe2f161af84a02d1dae966430c34aa005401911 --- /dev/null +++ b/scripts/JURECA_HeAT/JURECA_HeAT_script.sh @@ -0,0 +1,24 @@ +#!/bin/bash +#SBATCH --job-name=job +#SBATCH --output=job.out +#SBATCH --error=job.err +#SBATCH --account=%account% +#SBATCH --partition=%partition% +#SBATCH --nodes=%nodes% +#SBATCH --gpus-per-node=4 +#SBATCH --ntasks-per-node=4 +#SBATCH --cpus-per-task=32 +#SBATCH --exclusive +#SBATCH --gres=gpu:4 + +#MODULES BEGIN JURECA HeAT +ml GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py +#MODULES END + +# variables for specific HPC +export CUDA_VISIBLE_DEVICES="0,1,2,3" + +source your/env_path/bin/activate + +# HeAT NCCL setup +srun --cpu-bind=none 
python3 -u %executable% diff --git a/scripts/JURECA_HeAT/lamec.json b/scripts/JURECA_HeAT/lamec.json new file mode 100644 index 0000000000000000000000000000000000000000..0d23a8a7e99d4f044bcacd8ff48512b481ac2ae1 --- /dev/null +++ b/scripts/JURECA_HeAT/lamec.json @@ -0,0 +1 @@ +{"template": "JURECA_HeAT_script.sh"} diff --git a/scripts/JURECA_Horovod/JURECA_Horovod_script.sh b/scripts/JURECA_Horovod/JURECA_Horovod_script.sh new file mode 100644 index 0000000000000000000000000000000000000000..49de9fc8bccc93da07a611d84b332aeb5838d95a --- /dev/null +++ b/scripts/JURECA_Horovod/JURECA_Horovod_script.sh @@ -0,0 +1,24 @@ +#!/bin/bash +#SBATCH --job-name=job +#SBATCH --output=job.out +#SBATCH --error=job.err +#SBATCH --account=%account% +#SBATCH --partition=%partition% +#SBATCH --nodes=%nodes% +#SBATCH --gpus-per-node=4 +#SBATCH --ntasks-per-node=4 +#SBATCH --cpus-per-task=32 +#SBATCH --exclusive +#SBATCH --gres=gpu:4 + +#MODULES BEGIN JURECA Horovod +ml GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py +#MODULES END + +# variables for specific HPC +export CUDA_VISIBLE_DEVICES="0,1,2,3" + +source your/env_path/bin/activate + +# Horovod NCCL/MPI setup +srun --cpu-bind=none python3 -u %executable% diff --git a/scripts/JURECA_Horovod/lamec.json b/scripts/JURECA_Horovod/lamec.json new file mode 100644 index 0000000000000000000000000000000000000000..74ea6d2c76b2265d189eadacb40204adf0d9c6b2 --- /dev/null +++ b/scripts/JURECA_Horovod/lamec.json @@ -0,0 +1 @@ +{"template": "JURECA_Horovod_script.sh"} diff --git a/scripts/JURECA_Pytorch-DDP/JURECA_Pytorch-DDP_script.sh b/scripts/JURECA_Pytorch-DDP/JURECA_Pytorch-DDP_script.sh new file mode 100644 index 0000000000000000000000000000000000000000..cf6cdc49755f6599ba09717a007e341e518d1459 --- /dev/null +++ b/scripts/JURECA_Pytorch-DDP/JURECA_Pytorch-DDP_script.sh @@ -0,0 +1,32 @@ +#!/bin/bash +#SBATCH --job-name=job +#SBATCH --output=job.out +#SBATCH --error=job.err +#SBATCH --account=%account% +#SBATCH --partition=%partition% +#SBATCH --nodes=%nodes% +#SBATCH --gpus-per-node=4 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=128 +#SBATCH --exclusive +#SBATCH --gres=gpu:4 + +#MODULES BEGIN JURECA Pytorch-DDP +ml GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py +#MODULES END + +# variables for specific HPC +export CUDA_VISIBLE_DEVICES="0,1,2,3" + +source your/env_path/bin/activate + + # DDP NCCL setup +srun --cpu-bind=none bash -c "torchrun \ + --log_dir='logs' \ + --nnodes=$SLURM_NNODES \ + --nproc_per_node=$SLURM_GPUS_PER_NODE \ + --rdzv_id=$SLURM_JOB_ID \ + --rdzv_backend=c10d \ + --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \ + --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ +%executable%" diff --git a/scripts/JURECA_Pytorch-DDP/lamec.json b/scripts/JURECA_Pytorch-DDP/lamec.json new file mode 100644 index 0000000000000000000000000000000000000000..a9d0f0ba52bf724cde832c784dbdd00ff525d1a6 --- /dev/null +++ b/scripts/JURECA_Pytorch-DDP/lamec.json @@ -0,0 +1 @@ +{"template": "JURECA_Pytorch-DDP_script.sh"} diff --git a/scripts/JUWELS_DeepSpeed/JUWELS_DeepSpeed_script.sh b/scripts/JUWELS_DeepSpeed/JUWELS_DeepSpeed_script.sh new file mode 100644 index 0000000000000000000000000000000000000000..3a3ba90b75bc477f15f31967cf7e6c266c5d19d2 --- /dev/null +++ b/scripts/JUWELS_DeepSpeed/JUWELS_DeepSpeed_script.sh @@ -0,0 +1,26 @@ +#!/bin/bash +#SBATCH --job-name=job +#SBATCH --output=job.out +#SBATCH --error=job.err +#SBATCH --account=%account% +#SBATCH 
--partition=%partition% +#SBATCH --nodes=%nodes% +#SBATCH --gpus-per-node=4 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=48 +#SBATCH --exclusive +#SBATCH --gres=gpu:4 + +#MODULES BEGIN JUWELS DeepSpeed +ml GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py +#MODULES END + +# variables for specific HPC +export CUDA_VISIBLE_DEVICES="0,1,2,3" + +source your/env_path/bin/activate + +# DeepSpeed NCCL/MPI setup +export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)i +export MASTER_PORT=29500 +srun --cpu-bind=none python %executable% --deepspeed diff --git a/scripts/JUWELS_DeepSpeed/lamec.json b/scripts/JUWELS_DeepSpeed/lamec.json new file mode 100644 index 0000000000000000000000000000000000000000..90d2f1f8964a33e925e2d6a62932cc51fed05a19 --- /dev/null +++ b/scripts/JUWELS_DeepSpeed/lamec.json @@ -0,0 +1 @@ +{"template": "JUWELS_DeepSpeed_script.sh"} diff --git a/scripts/JUWELS_HeAT/JUWELS_HeAT_script.sh b/scripts/JUWELS_HeAT/JUWELS_HeAT_script.sh new file mode 100644 index 0000000000000000000000000000000000000000..e26eb11e93f2a536f718f3c2846d0d1eaea127e1 --- /dev/null +++ b/scripts/JUWELS_HeAT/JUWELS_HeAT_script.sh @@ -0,0 +1,24 @@ +#!/bin/bash +#SBATCH --job-name=job +#SBATCH --output=job.out +#SBATCH --error=job.err +#SBATCH --account=%account% +#SBATCH --partition=%partition% +#SBATCH --nodes=%nodes% +#SBATCH --gpus-per-node=4 +#SBATCH --ntasks-per-node=4 +#SBATCH --cpus-per-task=12 +#SBATCH --exclusive +#SBATCH --gres=gpu:4 + +#MODULES BEGIN JUWELS HeAT +ml GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py +#MODULES END + +# variables for specific HPC +export CUDA_VISIBLE_DEVICES="0,1,2,3" + +source your/env_path/bin/activate + +# HeAT NCCL setup +srun --cpu-bind=none python3 -u %executable% diff --git a/scripts/JUWELS_HeAT/lamec.json b/scripts/JUWELS_HeAT/lamec.json new file mode 100644 index 0000000000000000000000000000000000000000..8d272462b82edd2e57ca133d6f5ba6e13696fe20 --- /dev/null +++ b/scripts/JUWELS_HeAT/lamec.json @@ -0,0 +1 @@ +{"template": "JUWELS_HeAT_script.sh"} diff --git a/scripts/JUWELS_Horovod/JUWELS_Horovod_script.sh b/scripts/JUWELS_Horovod/JUWELS_Horovod_script.sh new file mode 100644 index 0000000000000000000000000000000000000000..e2d3daca09eb442adc1b82d6df06c15fc28a6675 --- /dev/null +++ b/scripts/JUWELS_Horovod/JUWELS_Horovod_script.sh @@ -0,0 +1,24 @@ +#!/bin/bash +#SBATCH --job-name=job +#SBATCH --output=job.out +#SBATCH --error=job.err +#SBATCH --account=%account% +#SBATCH --partition=%partition% +#SBATCH --nodes=%nodes% +#SBATCH --gpus-per-node=4 +#SBATCH --ntasks-per-node=4 +#SBATCH --cpus-per-task=12 +#SBATCH --exclusive +#SBATCH --gres=gpu:4 + +#MODULES BEGIN JUWELS Horovod +ml GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py +#MODULES END + +# variables for specific HPC +export CUDA_VISIBLE_DEVICES="0,1,2,3" + +source your/env_path/bin/activate + +# Horovod NCCL/MPI setup +srun --cpu-bind=none python3 -u %executable% diff --git a/scripts/JUWELS_Horovod/lamec.json b/scripts/JUWELS_Horovod/lamec.json new file mode 100644 index 0000000000000000000000000000000000000000..6a62839cedcb3babe1e9a9e99cc2c975005ceb35 --- /dev/null +++ b/scripts/JUWELS_Horovod/lamec.json @@ -0,0 +1 @@ +{"template": "JUWELS_Horovod_script.sh"} diff --git a/scripts/JUWELS_Pytorch-DDP/JUWELS_Pytorch-DDP_script.sh b/scripts/JUWELS_Pytorch-DDP/JUWELS_Pytorch-DDP_script.sh new file mode 100644 index 0000000000000000000000000000000000000000..b6f9abdb5868278f397d2472539843581ff4184c --- 
/dev/null +++ b/scripts/JUWELS_Pytorch-DDP/JUWELS_Pytorch-DDP_script.sh @@ -0,0 +1,32 @@ +#!/bin/bash +#SBATCH --job-name=job +#SBATCH --output=job.out +#SBATCH --error=job.err +#SBATCH --account=%account% +#SBATCH --partition=%partition% +#SBATCH --nodes=%nodes% +#SBATCH --gpus-per-node=4 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=48 +#SBATCH --exclusive +#SBATCH --gres=gpu:4 + +#MODULES BEGIN JUWELS Pytorch-DDP +ml GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py +#MODULES END + +# variables for specific HPC +export CUDA_VISIBLE_DEVICES="0,1,2,3" + +source your/env_path/bin/activate + + # DDP NCCL setup +srun --cpu-bind=none bash -c "torchrun \ + --log_dir='logs' \ + --nnodes=$SLURM_NNODES \ + --nproc_per_node=$SLURM_GPUS_PER_NODE \ + --rdzv_id=$SLURM_JOB_ID \ + --rdzv_backend=c10d \ + --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \ + --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ +%executable%" diff --git a/scripts/JUWELS_Pytorch-DDP/lamec.json b/scripts/JUWELS_Pytorch-DDP/lamec.json new file mode 100644 index 0000000000000000000000000000000000000000..ee980baf09d21edd5f84c165de6354f8aa0430f3 --- /dev/null +++ b/scripts/JUWELS_Pytorch-DDP/lamec.json @@ -0,0 +1 @@ +{"template": "JUWELS_Pytorch-DDP_script.sh"} diff --git a/scripts/LUMI_HeAT/LUMI_HeAT_script.sh b/scripts/LUMI_HeAT/LUMI_HeAT_script.sh new file mode 100644 index 0000000000000000000000000000000000000000..250b51a5030ce6db1ad9df8a2d3b3fa21a6f5fd7 --- /dev/null +++ b/scripts/LUMI_HeAT/LUMI_HeAT_script.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#SBATCH --job-name=job +#SBATCH --output=job.out +#SBATCH --error=job.err +#SBATCH --account=%account% +#SBATCH --partition=%partition% +#SBATCH --nodes=%nodes% +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --cpus-per-task=8 +#SBATCH --exclusive +#SBATCH --gres=gpu:8 + +#MODULES BEGIN LUMI HeAT +ml LUMI/22.08 partition/G rocm ModulePowerUser/LUMI buildtools cray-python +#MODULES END + +# variables for specific HPC +HIP_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" +export ROCR_VISIBLE_DEVICES=$SLURM_LOCALID +export LD_LIBRARY_PATH=$HIP_LIB_PATH:$LD_LIBRARY_PATH +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET_GDR_LEVEL=3 +export MIOPEN_USER_DB_PATH=/tmp/${USER}-miopen-cache-${SLURM_JOB_ID} +export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} +export CXI_FORK_SAFE=1 +export CXI_FORK_SAFE_HP=1 +export FI_CXI_DISABLE_CQ_HUGETLB=1 +export HCC_AMDGPU_TARGET=gfx90a +export HIP_LAUNCH_BLOCKING=1 +export NCCL_ASYNC_ERROR_HANDLING=1 +export NCCL_IB_TIMEOUT=50 +export UCX_RC_TIMEOUT=4s +export NCCL_IB_RETRY_CNT=10 + +source your/env_path/bin/activate + +#HeAT NCCL setup +srun --cpu-bind=none python3 -u %executable% diff --git a/scripts/LUMI_HeAT/lamec.json b/scripts/LUMI_HeAT/lamec.json new file mode 100644 index 0000000000000000000000000000000000000000..0ff777cc219e63e5a7051f151a3b739f11e769d3 --- /dev/null +++ b/scripts/LUMI_HeAT/lamec.json @@ -0,0 +1 @@ +{"template": "LUMI_HeAT_script.sh"} diff --git a/scripts/VEGA_Basilisk/_script.sh b/scripts/VEGA_Basilisk/_script.sh new file mode 100644 index 0000000000000000000000000000000000000000..57530b0ae1cd06a263a5e3f22863357c8bc0efa2 --- /dev/null +++ b/scripts/VEGA_Basilisk/_script.sh @@ -0,0 +1,16 @@ +#!/bin/bash +#SBATCH --job-name=job +#SBATCH --output=job.out +#SBATCH --error=job.err +#SBATCH --account=%account% +#SBATCH --partition=%partition% +#SBATCH --nodes=%nodes% + +#MODULES BEGIN VEGA Basilisk +module purge +ml load Bison/3.7.1-GCCcore-10.2.0 
CMake/3.18.4-GCCcore-10.2.0 Python/3.8.6-GCCcore-10.2.0 flex/2.6.4-GCCcore-10.2.0 SWIG/4.0.2-GCCcore-10.3.0 Mesa/20.2.1-GCCcore-10.2.0 libGLU/9.0.1-GCCcore-10.2.0 OpenMPI/4.1.3-GCC-10.3.0 ImageMagick/7.0.10-35-GCCcore-10.2.0 FFmpeg/4.4.2-GCCcore-11.3.0 +#MODULES END + +source your/env_path/bin/activate + +srun --mpi=pmix %executable% diff --git a/scripts/VEGA_Basilisk/lamec.json b/scripts/VEGA_Basilisk/lamec.json new file mode 100644 index 0000000000000000000000000000000000000000..2837d615492f1ae7c1cb69f192c84ee7f493e8a6 --- /dev/null +++ b/scripts/VEGA_Basilisk/lamec.json @@ -0,0 +1 @@ +{"template": "_script.sh"} diff --git a/scripts/cyclone_basilisk/basilisk_cfd.sh b/scripts/cyclone_basilisk/basilisk_cfd.sh deleted file mode 100644 index a39763b4863be39f6c0cc66332f693a89cd296a8..0000000000000000000000000000000000000000 --- a/scripts/cyclone_basilisk/basilisk_cfd.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=case0 -#SBATCH --account=p084 -#SBATCH --nodes=1 -#SBATCH --ntasks=20 -#SBATCH --hint=nomultithread -###SBATCH --mem=180G -#SBATCH --time=24:00:00 -#SBATCH --output=out.%j -#SBATCH --error=log.%j -#SBATCH --partition=cpu -#SBATCH --exclusive - -module purge -module load SWIG/4.0.2-GCCcore-10.2.0 Bison/3.7.1-GCCcore-10.2.0 CMake/3.18.4-GCCcore-10.2.0 Python/3.8.6-GCCcore-10.2.0 flex/2.6.4-GCCcore-10.2.0 glew/2.2.0-GCCcore-10.2.0-osmesa -module load Mesa/20.2.1-GCCcore-10.2.0 libGLU/9.0.1-GCCcore-10.2.0 -module load OpenMPI/4.0.5-GCC-10.2.0 -##module load FFmpeg -export BASILISK=/onyx/data/p084/basilisk/src -export PATH=$PATH:$BASILISK - -echo "Starting at `date`" -echo "Running on hosts: $SLURM_NODELIST" -echo "Running on $SLURM_NNODES nodes." -echo "Running on $SLURM_NPROCS processors." -echo "Job id is $SLURM_JOBID" - -ax_max=40 -ax_min=0 -ay_max=40 -ay_min=0 - b_max=10 - b_min=0 -xc_max=0.5 -xc_min=0.2 -yc_max=0.8 -yc_min=0.5 - -file="params.in" - -if ! [[ -f "restart" ]] ; then - RANDOM=$(date +%s%N | cut -b10-19) # give a seed - echo "$RANDOM / 32767 * ($ax_max-$ax_min) + $ax_min" | bc -l > $file - echo "$RANDOM / 32767 * ($ay_max-$ay_min) + $ay_min" | bc -l >> $file - echo "$RANDOM / 32767 * ( $b_max- $b_min) + $b_min" | bc -l >> $file - echo "$RANDOM / 32767 * ($xc_max-$xc_min) + $xc_min" | bc -l >> $file - echo "$RANDOM / 32767 * ($yc_max-$yc_min) + $yc_min" | bc -l >> $file -fi - -if ! [[ -d "output/" ]] ; then - mkdir output/ - mkdir output/wet_area/ - mkdir output/facets/ - mkdir output/my_output/ -fi - -#CC99='mpicc -std=c99' qcc -O2 -Wall -D_MPI=1 sessileweb_no_opengl.c -o run -lm -L$EBROOTLIBGLU/lib -lGLU -L$EBROOTMESA/lib -lOSMesa -L/onyx/data/p084/basilisk_new/basilisk/src/gl -lglutils -lfb_osmesa -CC99='mpicc -std=c99' qcc -O2 -Wall -D_MPI=1 drop.c -o run -lm -L$EBROOTLIBGLU/lib -lGLU -L$EBROOTGLEW/lib64 -lGLEW -L/onyx/data/p084/basilisk/src/gl -lfb_glx -lglutils -L$EBROOTLIBGLVND/lib -lGL -L$EBROOTX11/lib -lX11 - -srun --exclusive -K1 -n $SLURM_NTASKS ./run -# 2> log-$SLURM_NTASKS > out-$SLURM_NTASKS -# 2> log > out - -echo "Program finished with exit code $? 
at: `date`" diff --git a/scripts/cyclone_basilisk/lamec.json b/scripts/cyclone_basilisk/lamec.json deleted file mode 100644 index 1064dc7fa865bf41c2bb546218a75c84d3972532..0000000000000000000000000000000000000000 --- a/scripts/cyclone_basilisk/lamec.json +++ /dev/null @@ -1 +0,0 @@ -{"startscript": "basilisk_cfd.sh"} \ No newline at end of file diff --git a/scripts/cyclone_horovod/FNO_launch.sh b/scripts/cyclone_horovod/FNO_launch.sh deleted file mode 100644 index 65316943b445a55d9f297353dbc07e5e4bf93c8d..0000000000000000000000000000000000000000 --- a/scripts/cyclone_horovod/FNO_launch.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=FNO # Job name -#SBATCH --partition=gpu # Partition -#SBATCH --nodes=8 # Number of nodes -#BATCH --gpus-per-node=4 # Number of GPUs per node -####SBATCH --gres=gpu:4 # Number of GPUs per node -#####SBATCH --ntasks-per-node=4 # Number of tasks -#SBATCH --output=job.%j.out # Stdout (%j=jobId) -#SBATCH --error=job.%j.err # Stderr (%j=jobId) -#SBATCH --time=24:00:00 # Walltime -#SBATCH -A p101 - -module purge -module load h5py -module load tqdm -module load matplotlib -module load PyTorch/1.9.1-fosscuda-2020b -module load Horovod/0.22.0-fosscuda-2020b-PyTorch-1.9.1 - -echo "Starting at `date`" -echo "Running on hosts: $SLURM_NODELIST" -echo "Running on $SLURM_NNODES nodes." -echo "Running on $SLURM_NPROCS processors." -echo "Job id is $SLURM_JOBID" - -srun python3 train_mixF_hrvd.py - - - -echo "Program finished with exit code $? at: `date`" diff --git a/scripts/cyclone_horovod/lamec.json b/scripts/cyclone_horovod/lamec.json deleted file mode 100644 index 212f4d1f2a70bf54baf8e81fe5a1a044522ed4b0..0000000000000000000000000000000000000000 --- a/scripts/cyclone_horovod/lamec.json +++ /dev/null @@ -1 +0,0 @@ -{"startscript": "FNO_launch.sh"} \ No newline at end of file diff --git a/scripts/deep_clang/clang_script.sh b/scripts/deep_clang/clang_script.sh deleted file mode 100644 index 088a5dcbbb16ad60dfcbac19569029c8ea5476de..0000000000000000000000000000000000000000 --- a/scripts/deep_clang/clang_script.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env bash -#SBATCH --account=%account% -#SBATCH --partition=%partition% -#SBATCH --nodes=%nodes% -#SBATCH --time=0:00:10 - -%undefined% - -PROGNAME="%executable%" - -ml Stages/Devel-2019a Clang/10.0.1 - -clang "$PROGNAME".c -o "$PROGNAME" diff --git a/scripts/deep_clang/lamec.json b/scripts/deep_clang/lamec.json deleted file mode 100644 index 1b2ee6452ebcf6a48f7ed802436a32496ec222dc..0000000000000000000000000000000000000000 --- a/scripts/deep_clang/lamec.json +++ /dev/null @@ -1 +0,0 @@ -{"template": "clang_script.sh"} diff --git a/scripts/deep_ddp/DDP_startscript_deep.sh b/scripts/deep_ddp/DDP_startscript_deep.sh deleted file mode 100644 index 8990317ac16544486f6feabad44539ef63bc53aa..0000000000000000000000000000000000000000 --- a/scripts/deep_ddp/DDP_startscript_deep.sh +++ /dev/null @@ -1,102 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=TorchTest -#SBATCH --account=deepext -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job.out -#SBATCH --error=job.err -#SBATCH --time=0-01:00:00 - -# configure node and process count on the CM -#SBATCH --partition=dp-esb -# SBATCH --partition=dp-dam -#SBATCH --nodes=4 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=16 -#SBATCH --gpus-per-node=1 -#SBATCH --exclusive - -# parameters -debug=false # do debug -bs=96 # batch-size -epochs=1 # epochs -lr=0.01 # learning rate - -# dataset -# MNIST 
-#dataDir="/p/project/prcoe12/RAISE/data_MNIST/" -#COMMAND="DDP_pytorch_mnist.py" - -# AT -dataDir="/p/project/prcoe12/RAISE/T31/" -COMMAND="DDP_pytorch_AT.py" - -EXEC="$COMMAND \ - --batch-size $bs \ - --epochs $epochs \ - --lr $lr \ - --nworker $SLURM_CPUS_PER_TASK \ - --data-dir $dataDir" - -# set modules -ml --force purge -ml use $OTHERSTAGES -ml Stages/2022 GCC/11.2.0 OpenMPI/4.1.2 cuDNN/8.3.1.22-CUDA-11.5 NCCL/2.11.4-CUDA-11.5 Python/3.9.6 - -# recent bug: https://gitlab.jsc.fz-juelich.de/software-team/easybuild/-/wikis/Failed-to-initialize-NVML-Driver-library-version-mismatch-message -ml -nvidia-driver/.default - -# set env - pip -source /p/project/prcoe12/RAISE/envAI_deepv/bin/activate - -# set env - conda -#source /p/project/prcoe12/RAISE/miniconda3_deepv/etc/profile.d/conda.sh -#conda activate - -# New CUDA drivers on the compute nodes -ln -s /usr/lib64/libcuda.so.1 . -ln -s /usr/lib64/libnvidia-ml.so.1 . -LD_LIBRARY_PATH=.:/usr/local/cuda/lib64:$LD_LIBRARY_PATH - -# sleep a sec -sleep 1 - -# job info -echo "TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: SLURM_NODEID: $SLURM_NODEID" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi - -# set comm, CUDA and OMP -#export PSP_CUDA=1 # not needed atm -#export PSP_UCP=1 # not needed atm -export CUDA_VISIBLE_DEVICES="0" -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" > 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi - -# launch -srun bash -c "torchrun \ - --log_dir='logs' \ - --nnodes=$SLURM_NNODES \ - --nproc_per_node=$SLURM_GPUS_PER_NODE \ - --rdzv_id=$SLURM_JOB_ID \ - --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ - --rdzv_backend=c10d \ - --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \ - $EXEC" - -# eof diff --git a/scripts/deep_ddp/README.md b/scripts/deep_ddp/README.md deleted file mode 100644 index de5d06c0319c5bec100b6144c357e4f141940737..0000000000000000000000000000000000000000 --- a/scripts/deep_ddp/README.md +++ /dev/null @@ -1,42 +0,0 @@ -# DL using DDP on deepv - -# source -https://github.com/pytorch/pytorch#from-source - -# current isues -1. dirty fix to infiniband IPs\ -https://github.com/pytorch/pytorch/issues/73656 - -# to-do -1. - -# done -1. CUDA is back! -2. connection issues are solved -3. updated to torch 1.10.0 -4. updated to torch 1.10.2 -5. infiniband IPs updated - -# usage - pip -1. clone -2. run `./createENV.sh` -3. submit `sbatch DDP_startscript_deep.sh` - -# usage - conda -1. clone -2. run `./conda_torch.sh` -3. modify `DDP_startscript_deep.sh`\ -comment out previous source\ -`source /p/project/prcoe12/RAISE/envAI_deepv/bin/activate`\ -uncomment:\ -`source /p/project/prcoe12/RAISE/miniconda3_deepv/etc/profile.d/conda.sh`\ -`conda activate` -4. submit `sbatch DDP_startscript_deep.sh` - -# updates -1. with the new Stage2020, Conda is no longer needed! Simply use the envAI_deep as:\ -`ml use $OTHERSTAGES`\ -`ml Stages/2022 GCC OpenMPI Python cuDNN NCCL Python`\ -`source /p/project/prcoe12/RAISE/envAI_deepv/bin/activate` -2. shared memory type performance increase is adapted, simply increase `--cpus-per-task` -3. 
migrated to OpenMPI (pscom issues) and updated to IB IPs diff --git a/scripts/deep_ddp/conda_torch.sh b/scripts/deep_ddp/conda_torch.sh deleted file mode 100755 index 59213f0c87e4b60f882ac15d4bb26a574cfe1723..0000000000000000000000000000000000000000 --- a/scripts/deep_ddp/conda_torch.sh +++ /dev/null @@ -1,80 +0,0 @@ -#!/bin/sh -# author: EI -# version: 210709a - -# get dir -iDir=$PWD - -# set modules -module --force purge -module use $OTHERSTAGES -ml Stages/2020 GCC/9.3.0 ParaStationMPI/5.4.7-1-mt CMake Ninja cuDNN NCCL mpi-settings/CUDA - -# conda -if [ -d "${iDir}/miniconda3" ];then - echo "miniconda3 already installed!" - source ${iDir}/miniconda3/etc/profile.d/conda.sh - conda activate -else - echo "miniconda3 will be compiled to ${iDir}/miniconda3!" - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh - bash Miniconda3-latest-Linux-x86_64.sh -p ${iDir}/miniconda3 -b - source ${iDir}/miniconda3/etc/profile.d/conda.sh - conda activate - # std libs - conda install -y astunparse numpy pyyaml mkl mkl-include setuptools cffi typing_extensions future six requests dataclasses Pillow --force-reinstall - # cuda - check version with yours - conda install -c pytorch -y magma-cuda110 --force-reinstall - conda install -y pkg-config libuv --force-reinstall - rm -f Miniconda3-latest-Linux-x86_64.sh -fi - -# torch -if [ -d "${iDir}/pytorch/build" ];then - echo 'pytorch already installed!' -else - # clone pytorch - if [ -d "${iDir}/pytorch" ];then - echo 'pytorch repo is found!' - else - git clone --recursive https://github.com/pytorch/pytorch pytorch - fi - - # update repos - cd pytorch - git submodule sync - git submodule update --init --recursive - - # install pytorch - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} - export TMPDIR=${iDir}/tmp - python setup.py clean - CMAKE_C_COMPILER=$(which mpicc) CMAKE_CXX_COMPILER=$(which mpicxx) USE_DISTRIBUTED=ON USE_MPI=ON USE_CUDA=ON NCCL_ROOT_DIR=$EBROOTNCCL USE_NCCL=ON USE_GLOO=ON CUDNN_ROOT=$EBROOTCUDNN USE_CUDNN=ON python setup.py install - cd .. -fi - -# torchvision -if [ -d "${iDir}/torchvision/build" ];then - echo 'torchvision already installed!' -else - # clone torchvision - if [ -d "${iDir}/torchvision" ];then - echo 'torchvision repo is found!' - else - git clone --recursive https://github.com/pytorch/vision.git torchvision - fi - - # update repos - cd torchvision - git submodule sync - git submodule update --init --recursive - - # install torchvision - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} - export TMPDIR=${iDir}/tmp - python setup.py clean - CMAKE_C_COMPILER=$(which mpicc) CMAKE_CXX_COMPILER=$(which mpicxx) FORCE_CUDA=ON python setup.py install -fi - -echo 'done!' 
-# eof diff --git a/scripts/deep_ddp/createEnv.sh b/scripts/deep_ddp/createEnv.sh deleted file mode 100755 index fd9886be0c24576567e788eaefbdc13d7ac3f4c4..0000000000000000000000000000000000000000 --- a/scripts/deep_ddp/createEnv.sh +++ /dev/null @@ -1,193 +0,0 @@ -#!/bin/bash -# -*- coding: utf-8 -*- -# author: EI -# version: 221121a -# creates machine specific python env - -# set modules -ml --force purge - -# get sys info -cDir=$PWD -sysN="$(uname -n | cut -f2- -d.)" -echo "system:${sysN}" -echo - -cont1=false -if [ "$sysN" = 'deepv' ] ; then - ml use $OTHERSTAGES - ml Stages/2022 GCC/11.2.0 ParaStationMPI/5.5.0-1 cuDNN/8.3.1.22-CUDA-11.5 NCCL/2.12.7-1-CUDA-11.5 Python/3.9.6 CMake/3.21.1 - #ml Stages/2022 GCC/11.2.0 OpenMPI/4.1.2 cuDNN/8.3.1.22-CUDA-11.5 NCCL/2.12.7-1-CUDA-11.5 Python/3.9.6 CMake/3.21.1 - cont1=true -elif [ "$sysN" = 'juwels' ] ; then - ml GCC ParaStationMPI Python CMake - cont1=true -elif [ "$sysN" = 'jureca' ] ; then - #ml Stages/2022 GCC ParaStationMPI Python CMake NCCL libaio # Horovod iassues with pscom?? - ml Stages/2022 GCC OpenMPI Python NCCL cuDNN libaio CMake - cont1=true -else - echo - echo 'unknown system detected' - echo 'canceling' - echo -fi -echo "modules loaded" -echo - -# get python version -pver="$(python --version 2>&1 | awk {'print $2'} | cut -f1-2 -d.)" -echo "python version is ${pver}" -echo - -if [ "$cont1" = true ] ; then - if [ -d "${cDir}/envAI_${sysN}" ];then - echo 'env already exist' - echo - - source envAI_${sysN}/bin/activate - else - # create env - python3 -m venv envAI_${sysN} - - # get headers for pip - if [ -f "${cDir}/envAI_${sysN}/bin/pip3" ]; then - echo 'pip already exist' - else - cp "$(which pip3)" $cDir/envAI_${sysN}/bin/ - ln -s $cDir/envAI_${sysN}/bin/pip3 $cDir/envAI_${sysN}/bin/pip${pver} - var="#!$cDir/envAI_${sysN}/bin/python${pver}" - sed -i "1s|.*|$var|" $cDir/envAI_${sysN}/bin/pip3 - fi - - # activate env - source envAI_${sysN}/bin/activate - - echo "a new env is created in ${cDir}" - echo "activation is done via:" - echo "source ${cDir}/envAI_${sysN}/bin/activate" - fi -fi - -# install torch -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - echo 'Torch already installed' - echo -else - export TMPDIR=${cDir} - - #pip3 install \ - # torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1+cu113 -f \ - # https://download.pytorch.org/whl/cu113/torch_stable.html --no-cache-dir - - ## Stages/2022 - CUDA/11.5 - #pip3 install \ - # torch==1.11.0+cu115 torchvision==0.12.0+cu115 torchaudio==0.11.0+cu115 -f \ - # https://download.pytorch.org/whl/cu115/torch_stable.html --no-cache-dir - - # Stages/2022 - CUDA/11.7 - pip3 install \ - torch==1.13.0+cu117 torchvision==0.14.0+cu117 torchaudio==0.13.0+cu117 -f \ - https://download.pytorch.org/whl/cu117/torch_stable.html --no-cache-dir -fi - -# install horovod -if [ -f "${cDir}/envAI_${sysN}/bin/horovodrun" ]; then - echo 'Horovod already installed' - echo -else - pip3 install --no-cache-dir wheel - #export HOROVOD_DEBUG=1 - export HOROVOD_GPU=CUDA - export HOROVOD_CUDA_HOME=$EBROOTCUDA - #export HOROVOD_WITH_MPI=1 - export HOROVOD_MPI_THREADS_DISABLE=1 - #export HOROVOD_GPU_OPERATIONS=MPI # only turn this off - export HOROVOD_GPU_OPERATIONS=NCCL # only turn this off - export HOROVOD_NCCL_HOME=$EBROOTNCCL - export HOROVOD_WITH_PYTORCH=1 - export HOROVOD_WITHOUT_TENSORFLOW=1 - export HOROVOD_WITHOUT_MXNET=1 - export TMPDIR=${cDir} - - pip3 install --no-cache-dir horovod --ignore-installed -fi - -# install deepspeed -if [ -f "${cDir}/envAI_${sysN}/bin/deepspeed" ]; then - 
echo 'DeepSpeed already installed' - echo -else - #export DS_BUILD_OPS=1 - # if above not working?? recursion error use this - export DS_BUILD_FUSED_ADAM=1 - export DS_BUILD_UTILS=1 - if [ "$sysN" = 'deepv' ] ; then - #fix libaio issues via: - export DS_BUILD_AIO=0 - fi - export TMPDIR=${cDir} - - pip3 install --no-cache-dir DeepSpeed - - # add this to deepspeed/launcher/launch.py l.126 - var=' args.node_rank=int(os.environ.get("SLURM_PROCID",0))' - sed -i "126s|.*|$var|" $cDir/envAI_${sysN}/lib/python${pver}/site-packages/deepspeed/launcher/launch.py -fi - -# install heat -if [ -d "${cDir}/envAI_${sysN}/lib/python${pver}/site-packages/heat" ]; then - echo 'HeAT already installed' - echo -else - export TMPDIR=${cDir} - - # need to modify setup.py to accep torch>1.9 for heat - wget https://files.pythonhosted.org/packages/5d/3a/4781f1e6910753bfdfa6712c83c732c60e675d8de14983926a0d9306c7a6/heat-1.1.1.tar.gz - tar xzf heat-1.1.1.tar.gz - var=' "torch>=1.7.0",' - sed -i "36s|.*|$var|" heat-1.1.1/setup.py - var=' "torchvision>=0.8.0",' - sed -i "39s|.*|$var|" heat-1.1.1/setup.py - - # create tar again! - rm -rf heat-1.1.1.tar.gz - tar czf heat-1.1.1.tar.gz heat-1.1.1 - rm -rf heat-1.1.1 - - pip3 install --no-cache-dir 'heat-1.1.1.tar.gz[hdf5,netcdf]' - - rm -rf heat-1.1.1.tar.gz -fi - -# get rest of the libraries$ -if [ "$cont1" = true ] ; then - # install rest - pip3 install -r reqs.txt --ignore-installed - - # modify l.4 of /torchnlp/_third_party/weighted_random_sampler.py - var='int_classes = int' - sed -i "4s|.*|$var|" \ - $cDir/envAI_${sysN}/lib/python${pver}/site-packages/torchnlp/_third_party/weighted_random_sampler.py -fi - -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - sed -i -e '3,8s/^/#/' ${cDir}/envAI_${sysN}/bin/torchrun - echo """ -import re -import sys -from torch.distributed.run import main -from torch.distributed.elastic.agent.server import api as sapi - -def new_get_fq_hostname(): - return _orig_get_fq_hostname().replace('.', 'i.', 1) - -if __name__ == '__main__': - _orig_get_fq_hostname = sapi._get_fq_hostname - sapi._get_fq_hostname = new_get_fq_hostname - sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) - sys.exit(main()) -""" >> ${cDir}/envAI_${sysN}/bin/torchrun -fi - -#eof diff --git a/scripts/deep_ddp/lamec.json b/scripts/deep_ddp/lamec.json deleted file mode 100644 index 8e4595add2c83b22847dd952abab945a614dedab..0000000000000000000000000000000000000000 --- a/scripts/deep_ddp/lamec.json +++ /dev/null @@ -1 +0,0 @@ -{"startscript": "DDP_startscript_deep.sh"} \ No newline at end of file diff --git a/scripts/deep_ddp/reqs.txt b/scripts/deep_ddp/reqs.txt deleted file mode 100755 index 20310b90fd6de0eb8f66c984b045b147c09a89dc..0000000000000000000000000000000000000000 --- a/scripts/deep_ddp/reqs.txt +++ /dev/null @@ -1,6 +0,0 @@ -Pillow -pyparsing -python-dateutil -matplotlib -h5py -pytorch-nlp diff --git a/scripts/deep_deepspeed/DS_config.json b/scripts/deep_deepspeed/DS_config.json deleted file mode 100644 index ec1f0221568969e8236b1e3ff5f3699f0f68b5b0..0000000000000000000000000000000000000000 --- a/scripts/deep_deepspeed/DS_config.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "train_micro_batch_size_per_gpu": 96, - "gradient_accumulation_steps": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.01 - } - }, - "fp16": { - "enabled": false - }, - "zero_optimization": false -} diff --git a/scripts/deep_deepspeed/DS_startscript_deep.sh b/scripts/deep_deepspeed/DS_startscript_deep.sh deleted file mode 100644 index 
d99e75942413c2fbe2db23745d663f26d3b8c0d3..0000000000000000000000000000000000000000 --- a/scripts/deep_deepspeed/DS_startscript_deep.sh +++ /dev/null @@ -1,84 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=DSTest -#SBATCH --account=deepext -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job.out -#SBATCH --error=job.err -#SBATCH --time=00:30:00 - -# configure node and process count on the CM -#SBATCH --partition=dp-esb -#SBATCH --nodes=2 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=16 -#SBATCH --gpus-per-node=1 -#SBATCH --exclusive - -# parameters -debug=false # do nccl debug -epochs=10 # epochs -lr=0.01 # learning rate -bs=96 # batch-size - -# AT -dataDir="/p/project/prcoe12/RAISE/T31/" -COMMAND="DS_pytorch_AT.py" -EXEC="$COMMAND \ - --batch-size $bs \ - --epochs $epochs \ - --lr $lr \ - --nworker $SLURM_CPUS_PER_TASK \ - --data-dir $dataDir" - -# set modules -ml --force purge -ml use $OTHERSTAGES -ml Stages/2022 GCC/11.2.0 OpenMPI/4.1.2 cuDNN/8.3.1.22-CUDA-11.5 NCCL/2.11.4-CUDA-11.5 Python/3.9.6 - -# set env - pip -source /p/project/prcoe12/RAISE/envAI_deepv/bin/activate - -# sleep a sec -sleep 1 - -# job info -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: SLURM_NODEID: $SLURM_NODEID" -echo "DEBUG: SLURM_LOCALID: $SLURM_LOCALID" -echo "DEBUG: SLURM_PROCID: $SLURM_PROCID" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -#### do not change this part -# create node-list -sysN=$(scontrol show hostnames) -for i in $sysN; do - x+=\"$i\":[$CUDA_VISIBLE_DEVICES], -done -WID=`echo {${x::-1}} | base64 -w 0` - -# modify config file with parameters -sed -i "2s|.*| \"train_micro_batch_size_per_gpu\": ${bs},|" DS_config.json -#### - -srun python3 -m deepspeed.launcher.launch \ - --node_rank $SLURM_PROCID \ - --master_addr ${SLURMD_NODENAME}i \ - --master_port 29500 \ - --world_info $WID \ - $EXEC --deepspeed_mpi --deepspeed_config DS_config.json - -# eof diff --git a/scripts/deep_deepspeed/README.md b/scripts/deep_deepspeed/README.md deleted file mode 100644 index 1f92e3a956c9d90742f201b3a7d5da538e922281..0000000000000000000000000000000000000000 --- a/scripts/deep_deepspeed/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# DL using DeepSpeed on deepv - -# source -https://github.com/microsoft/DeepSpeed - -# current isues -1. - -# to-do -1. - -# usage - pip -1. clone -2. run `./createENV.sh` -3. submit `sbatch DS_startscript_deep.sh` diff --git a/scripts/deep_deepspeed/createEnv.sh b/scripts/deep_deepspeed/createEnv.sh deleted file mode 100755 index 4558743e8e899f6f19fe9b8bc058491256f6c61e..0000000000000000000000000000000000000000 --- a/scripts/deep_deepspeed/createEnv.sh +++ /dev/null @@ -1,174 +0,0 @@ -#!/bin/bash -# -*- coding: utf-8 -*- -# author: EI -# version: 220302a -# creates machine specific python env - -# set modules -ml --force purge - -# get sys info -cDir=$PWD -sysN="$(uname -n | cut -f2- -d.)" -echo "system:${sysN}" -echo - -cont1=false -if [ "$sysN" = 'deepv' ] ; then - ml use $OTHERSTAGES - #ml Stages/2022 GCC ParaStationMPI cuDNN NCCL Python CMake # Horovod issues with pscom?? 
- ml Stages/2022 GCC OpenMPI cuDNN NCCL Python CMake - cont1=true -elif [ "$sysN" = 'juwels' ] ; then - ml GCC ParaStationMPI Python CMake - cont1=true -elif [ "$sysN" = 'jureca' ] ; then - #ml Stages/2022 GCC ParaStationMPI Python CMake NCCL libaio # Horovod iassues with pscom?? - ml Stages/2022 GCC OpenMPI Python NCCL cuDNN libaio CMake - cont1=true -else - echo - echo 'unknown system detected' - echo 'canceling' - echo -fi -echo "modules loaded" -echo - -# get python version -pver="$(python --version 2>&1 | awk {'print $2'} | cut -f1-2 -d.)" -echo "python version is ${pver}" -echo - -if [ "$cont1" = true ] ; then - if [ -d "${cDir}/envAI_${sysN}" ];then - echo 'env already exist' - echo - - source envAI_${sysN}/bin/activate - else - # create env - python3 -m venv envAI_${sysN} - - # get headers for pip - if [ -f "${cDir}/envAI_${sysN}/bin/pip3" ]; then - echo 'pip already exist' - else - cp "$(which pip3)" $cDir/envAI_${sysN}/bin/ - ln -s $cDir/envAI_${sysN}/bin/pip3 $cDir/envAI_${sysN}/bin/pip${pver} - var="#!$cDir/envAI_${sysN}/bin/python${pver}" - sed -i "1s|.*|$var|" $cDir/envAI_${sysN}/bin/pip3 - fi - - # activate env - source envAI_${sysN}/bin/activate - - echo "a new env is created in ${cDir}" - echo "activation is done via:" - echo "source ${cDir}/envAI_${sysN}/bin/activate" - fi -fi - -# install torch -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - echo 'Torch already installed' - echo -else - export TMPDIR=${cDir} - - pip3 install \ - torch==1.10.2+cu113 torchvision==0.11.3+cu113 torchaudio==0.10.2+cu113 \ - -f https://download.pytorch.org/whl/cu113/torch_stable.html --no-cache-dir -fi - -# install horovod -if [ -f "${cDir}/envAI_${sysN}/bin/horovodrun" ]; then - echo 'Horovod already installed' - echo -else - export HOROVOD_GPU=CUDA - export HOROVOD_GPU_OPERATIONS=NCCL - export HOROVOD_WITH_PYTORCH=1 - export TMPDIR=${cDir} - - pip3 install --no-cache-dir horovod --ignore-installed -fi - -# install deepspeed -if [ -f "${cDir}/envAI_${sysN}/bin/deepspeed" ]; then - echo 'DeepSpeed already installed' - echo -else - export DS_BUILD_OPS=1 - # if above not working?? recursion error use this - #export DS_BUILD_FUSED_ADAM=1 - #export DS_BUILD_UTILS=1 - if [ "$sysN" = 'deepv' ] ; then - #fix libaio issues via: - export DS_BUILD_AIO=0 - fi - export TMPDIR=${cDir} - - pip3 install --no-cache-dir DeepSpeed - - # add this to deepspeed/launcher/launch.py l.85 - var=' args.node_rank=int(os.environ.get("SLURM_PROCID",0))' - sed -i "85s|.*|$var|" $cDir/envAI_${sysN}/lib/python${pver}/site-packages/deepspeed/launcher/launch.py -fi - -# install heat -if [ -d "${cDir}/envAI_${sysN}/lib/python${pver}/site-packages/heat" ]; then - echo 'HeAT already installed' - echo -else - export TMPDIR=${cDir} - - # need to modify setup.py to accep torch>1.9 for heat - wget https://files.pythonhosted.org/packages/5d/3a/4781f1e6910753bfdfa6712c83c732c60e675d8de14983926a0d9306c7a6/heat-1.1.1.tar.gz - tar xzf heat-1.1.1.tar.gz - var=' "torch>=1.7.0",' - sed -i "36s|.*|$var|" heat-1.1.1/setup.py - var=' "torchvision>=0.8.0",' - sed -i "39s|.*|$var|" heat-1.1.1/setup.py - - # create tar again! 
- rm -rf heat-1.1.1.tar.gz - tar czf heat-1.1.1.tar.gz heat-1.1.1 - rm -rf heat-1.1.1 - - pip3 install --no-cache-dir 'heat-1.1.1.tar.gz[hdf5,netcdf]' - - rm -rf heat-1.1.1.tar.gz -fi - -# get rest of the libraries$ -if [ "$cont1" = true ] ; then - # install rest - pip3 install -r reqs.txt --ignore-installed - - # modify l.4 of /torchnlp/_third_party/weighted_random_sampler.py - var='int_classes = int' - sed -i "4s|.*|$var|" \ - $cDir/envAI_${sysN}/lib/python${pver}/site-packages/torchnlp/_third_party/weighted_random_sampler.py -fi - -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - sed -i -e '3,8s/^/#/' ${cDir}/envAI_${sysN}/bin/torchrun - echo """ -import re -import sys -from torch.distributed.run import main -from torch.distributed.elastic.agent.server import api as sapi - -def new_get_fq_hostname(): - return _orig_get_fq_hostname().replace('.', 'i.', 1) - -if __name__ == '__main__': - _orig_get_fq_hostname = sapi._get_fq_hostname - sapi._get_fq_hostname = new_get_fq_hostname - sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) - sys.exit(main()) -""" >> ${cDir}/envAI_${sysN}/bin/torchrun -fi - -#eof diff --git a/scripts/deep_deepspeed/lamec.json b/scripts/deep_deepspeed/lamec.json deleted file mode 100644 index b1572ed4b5ac84409ff6cb91e575344301c84b95..0000000000000000000000000000000000000000 --- a/scripts/deep_deepspeed/lamec.json +++ /dev/null @@ -1 +0,0 @@ -{"startscript": "DS_startscript_deep.sh"} \ No newline at end of file diff --git a/scripts/deep_deepspeed/reqs.txt b/scripts/deep_deepspeed/reqs.txt deleted file mode 100755 index 20310b90fd6de0eb8f66c984b045b147c09a89dc..0000000000000000000000000000000000000000 --- a/scripts/deep_deepspeed/reqs.txt +++ /dev/null @@ -1,6 +0,0 @@ -Pillow -pyparsing -python-dateutil -matplotlib -h5py -pytorch-nlp diff --git a/scripts/deep_heat/HeAT_startscript_deep.sh b/scripts/deep_heat/HeAT_startscript_deep.sh deleted file mode 100644 index 5454a785db114cdfa28efbbb540940b9c150b8ad..0000000000000000000000000000000000000000 --- a/scripts/deep_heat/HeAT_startscript_deep.sh +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=HeATTest -#SBATCH --account=deepext -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job.out -#SBATCH --error=job.err -#SBATCH --time=01:00:00 - -# configure node and process count on the CM -#SBATCH --partition=dp-esb -#SBATCH --nodes=4 -#SBATCH --ntasks-per-node=4 -#SBATCH --cpus-per-task=1 -#SBATCH --gpus-per-node=1 -#SBATCH --exclusive - -# parameters -debug=false # do debug -bs=96 # batch-size -epochs=1 # epochs -lr=0.01 # learning rate - -# dataset -# MNIST -#dataDir="/p/project/prcoe12/RAISE/data_MNIST/" -#COMMAND="DDP_pytorch_mnist.py" - -# AT -dataDir="/p/project/prcoe12/RAISE/T31/" -COMMAND="HeAT_pytorch_AT.py" - -EXEC="$COMMAND \ - --batch-size $bs \ - --epochs $epochs \ - --lr $lr \ - --data-dir $dataDir" - -# set modules -ml --force purge -ml use $OTHERSTAGES -ml Stages/2022 GCC/11.2.0 OpenMPI/4.1.2 cuDNN/8.3.1.22-CUDA-11.5 NCCL/2.11.4-CUDA-11.5 Python/3.9.6 - -# set env - pip -source /p/project/prcoe12/RAISE/envAI_deepv/bin/activate - -# set env - conda -#source /p/project/prcoe12/RAISE/miniconda3_deepv/etc/profile.d/conda.sh -#conda activate - -# sleep a sec -sleep 1 - -# job info -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: 
$SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: SLURM_NODEID: $SLURM_NODEID" -echo "DEBUG: SLURM_LOCALID: $SLURM_LOCALID" -echo "DEBUG: SLURM_PROCID: $SLURM_PROCID" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# set comm, CUDA and OMP -#PSP_CUDA=1 # not needed atm -#PSP_UCP=1 # not needed atm -#PSP_OPENIB=1 # not needed atm -#export NCCL_SOCKET_IFNAME=ib # not needed atm -#export NCCL_IB_HCA=ipogif0 # not needed atm -#export NCCL_IB_CUDA_SUPPORT=1 # not needed atm -export CUDA_VISIBLE_DEVICES="0" -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" > 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi - -# launch -srun --mpi=pspmix python3 -u $EXEC - -# eof diff --git a/scripts/deep_heat/README.md b/scripts/deep_heat/README.md deleted file mode 100644 index 74a97ada596a5f3a30181b88e0616259bbd8eec8..0000000000000000000000000000000000000000 --- a/scripts/deep_heat/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# DL using HeAT/PyTorch on Jureca DC - -# source -https://github.com/helmholtz-analytics/heat - -# current isues -1. no alternative to --mpi=pspmix with OMPI, but works - -# to-do -1. - -# usage - pip -1. clone -2. run `./createENV.sh` -3. submit `sbatch HeAT_startscript_deep.sh` diff --git a/scripts/deep_heat/createEnv.sh b/scripts/deep_heat/createEnv.sh deleted file mode 100755 index 4558743e8e899f6f19fe9b8bc058491256f6c61e..0000000000000000000000000000000000000000 --- a/scripts/deep_heat/createEnv.sh +++ /dev/null @@ -1,174 +0,0 @@ -#!/bin/bash -# -*- coding: utf-8 -*- -# author: EI -# version: 220302a -# creates machine specific python env - -# set modules -ml --force purge - -# get sys info -cDir=$PWD -sysN="$(uname -n | cut -f2- -d.)" -echo "system:${sysN}" -echo - -cont1=false -if [ "$sysN" = 'deepv' ] ; then - ml use $OTHERSTAGES - #ml Stages/2022 GCC ParaStationMPI cuDNN NCCL Python CMake # Horovod issues with pscom?? - ml Stages/2022 GCC OpenMPI cuDNN NCCL Python CMake - cont1=true -elif [ "$sysN" = 'juwels' ] ; then - ml GCC ParaStationMPI Python CMake - cont1=true -elif [ "$sysN" = 'jureca' ] ; then - #ml Stages/2022 GCC ParaStationMPI Python CMake NCCL libaio # Horovod iassues with pscom?? 
- ml Stages/2022 GCC OpenMPI Python NCCL cuDNN libaio CMake - cont1=true -else - echo - echo 'unknown system detected' - echo 'canceling' - echo -fi -echo "modules loaded" -echo - -# get python version -pver="$(python --version 2>&1 | awk {'print $2'} | cut -f1-2 -d.)" -echo "python version is ${pver}" -echo - -if [ "$cont1" = true ] ; then - if [ -d "${cDir}/envAI_${sysN}" ];then - echo 'env already exist' - echo - - source envAI_${sysN}/bin/activate - else - # create env - python3 -m venv envAI_${sysN} - - # get headers for pip - if [ -f "${cDir}/envAI_${sysN}/bin/pip3" ]; then - echo 'pip already exist' - else - cp "$(which pip3)" $cDir/envAI_${sysN}/bin/ - ln -s $cDir/envAI_${sysN}/bin/pip3 $cDir/envAI_${sysN}/bin/pip${pver} - var="#!$cDir/envAI_${sysN}/bin/python${pver}" - sed -i "1s|.*|$var|" $cDir/envAI_${sysN}/bin/pip3 - fi - - # activate env - source envAI_${sysN}/bin/activate - - echo "a new env is created in ${cDir}" - echo "activation is done via:" - echo "source ${cDir}/envAI_${sysN}/bin/activate" - fi -fi - -# install torch -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - echo 'Torch already installed' - echo -else - export TMPDIR=${cDir} - - pip3 install \ - torch==1.10.2+cu113 torchvision==0.11.3+cu113 torchaudio==0.10.2+cu113 \ - -f https://download.pytorch.org/whl/cu113/torch_stable.html --no-cache-dir -fi - -# install horovod -if [ -f "${cDir}/envAI_${sysN}/bin/horovodrun" ]; then - echo 'Horovod already installed' - echo -else - export HOROVOD_GPU=CUDA - export HOROVOD_GPU_OPERATIONS=NCCL - export HOROVOD_WITH_PYTORCH=1 - export TMPDIR=${cDir} - - pip3 install --no-cache-dir horovod --ignore-installed -fi - -# install deepspeed -if [ -f "${cDir}/envAI_${sysN}/bin/deepspeed" ]; then - echo 'DeepSpeed already installed' - echo -else - export DS_BUILD_OPS=1 - # if above not working?? recursion error use this - #export DS_BUILD_FUSED_ADAM=1 - #export DS_BUILD_UTILS=1 - if [ "$sysN" = 'deepv' ] ; then - #fix libaio issues via: - export DS_BUILD_AIO=0 - fi - export TMPDIR=${cDir} - - pip3 install --no-cache-dir DeepSpeed - - # add this to deepspeed/launcher/launch.py l.85 - var=' args.node_rank=int(os.environ.get("SLURM_PROCID",0))' - sed -i "85s|.*|$var|" $cDir/envAI_${sysN}/lib/python${pver}/site-packages/deepspeed/launcher/launch.py -fi - -# install heat -if [ -d "${cDir}/envAI_${sysN}/lib/python${pver}/site-packages/heat" ]; then - echo 'HeAT already installed' - echo -else - export TMPDIR=${cDir} - - # need to modify setup.py to accep torch>1.9 for heat - wget https://files.pythonhosted.org/packages/5d/3a/4781f1e6910753bfdfa6712c83c732c60e675d8de14983926a0d9306c7a6/heat-1.1.1.tar.gz - tar xzf heat-1.1.1.tar.gz - var=' "torch>=1.7.0",' - sed -i "36s|.*|$var|" heat-1.1.1/setup.py - var=' "torchvision>=0.8.0",' - sed -i "39s|.*|$var|" heat-1.1.1/setup.py - - # create tar again! 
- rm -rf heat-1.1.1.tar.gz - tar czf heat-1.1.1.tar.gz heat-1.1.1 - rm -rf heat-1.1.1 - - pip3 install --no-cache-dir 'heat-1.1.1.tar.gz[hdf5,netcdf]' - - rm -rf heat-1.1.1.tar.gz -fi - -# get rest of the libraries$ -if [ "$cont1" = true ] ; then - # install rest - pip3 install -r reqs.txt --ignore-installed - - # modify l.4 of /torchnlp/_third_party/weighted_random_sampler.py - var='int_classes = int' - sed -i "4s|.*|$var|" \ - $cDir/envAI_${sysN}/lib/python${pver}/site-packages/torchnlp/_third_party/weighted_random_sampler.py -fi - -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - sed -i -e '3,8s/^/#/' ${cDir}/envAI_${sysN}/bin/torchrun - echo """ -import re -import sys -from torch.distributed.run import main -from torch.distributed.elastic.agent.server import api as sapi - -def new_get_fq_hostname(): - return _orig_get_fq_hostname().replace('.', 'i.', 1) - -if __name__ == '__main__': - _orig_get_fq_hostname = sapi._get_fq_hostname - sapi._get_fq_hostname = new_get_fq_hostname - sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) - sys.exit(main()) -""" >> ${cDir}/envAI_${sysN}/bin/torchrun -fi - -#eof diff --git a/scripts/deep_heat/example_mnist_heat.py b/scripts/deep_heat/example_mnist_heat.py deleted file mode 100644 index b2a552629aa21c71e43159060eca8a6154b3ede9..0000000000000000000000000000000000000000 --- a/scripts/deep_heat/example_mnist_heat.py +++ /dev/null @@ -1,184 +0,0 @@ -# example from : https://github.com/helmholtz-analytics/heat/blob/master/examples/nn/mnist.py - -from __future__ import print_function -import argparse -import sys -import time -import torch - -sys.path.append("../../") -import heat as ht -import heat.nn.functional as F -import heat.optim as optim -from heat.optim.lr_scheduler import StepLR -from heat.utils import vision_transforms -from heat.utils.data.mnist import MNISTDataset - -""" -This file is an example script for how to use the HeAT DataParallel class to train a network on the MNIST dataset. -To run this file execute the following in the examples/nn/ directory: - mpirun -np N python -u mnist.py -where N is the number of processes. 
-""" - - -class Net(ht.nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = ht.nn.Conv2d(1, 32, 3, 1) - self.conv2 = ht.nn.Conv2d(32, 64, 3, 1) - self.dropout1 = ht.nn.Dropout2d(0.25) - self.dropout2 = ht.nn.Dropout2d(0.5) - self.fc1 = ht.nn.Linear(9216, 128) - self.fc2 = ht.nn.Linear(128, 10) - - def forward(self, x): - x = self.conv1(x) - x = F.relu(x) - x = self.conv2(x) - x = F.relu(x) - x = F.max_pool2d(x, 2) - x = self.dropout1(x) - x = torch.flatten(x, 1) - x = self.fc1(x) - x = F.relu(x) - x = self.dropout2(x) - x = self.fc2(x) - output = F.log_softmax(x, dim=1) - return output - - -def train(args, model, device, train_loader, optimizer, epoch): - model.train() - t_list = [] - for batch_idx, (data, target) in enumerate(train_loader): - t = time.perf_counter() - data, target = data.to(device), target.to(device) - optimizer.zero_grad() - output = model(data) - loss = F.nll_loss(output, target) - loss.backward() - optimizer.step() - if batch_idx % args.log_interval == 0: - print( - f"Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} " - f"({100.0 * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}" - ) - if args.dry_run: - break - t_list.append(time.perf_counter() - t) - print("average time", sum(t_list) / len(t_list)) - - -def test(model, device, test_loader): - model.eval() - test_loss = 0 - correct = 0 - with torch.no_grad(): - for data, target in test_loader: - data, target = data.to(device), target.to(device) - output = model(data) - test_loss += F.nll_loss(output, target, reduction="sum").item() # sum up batch loss - pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability - correct += pred.eq(target.view_as(pred)).sum().item() - test_loss /= len(test_loader.dataset) - print( - f"\nTest set: Average loss: {test_loss:.4f}, Accuracy: {correct}/{len(test_loader.dataset)}" - f" ({100.0 * correct / len(test_loader.dataset):.0f}%)\n" - ) - - -def main(): - # Training settings - parser = argparse.ArgumentParser(description="PyTorch MNIST Example") - parser.add_argument( - "--batch-size", - type=int, - default=64, - metavar="N", - help="input batch size for training (default: 64)", - ) - parser.add_argument( - "--test-batch-size", - type=int, - default=1000, - metavar="N", - help="input batch size for testing (default: 1000)", - ) - parser.add_argument( - "--epochs", - type=int, - default=14, - metavar="N", - help="number of epochs to train (default: 14)", - ) - parser.add_argument( - "--lr", type=float, default=1.0, metavar="LR", help="learning rate (default: 1.0)" - ) - parser.add_argument( - "--gamma", - type=float, - default=0.7, - metavar="M", - help="Learning rate step gamma (default: 0.7)", - ) - parser.add_argument( - "--no-cuda", action="store_true", default=False, help="disables CUDA training" - ) - parser.add_argument( - "--dry-run", action="store_true", default=False, help="quickly check a single pass" - ) - parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)") - parser.add_argument( - "--log-interval", - type=int, - default=10, - metavar="N", - help="how many batches to wait before logging training status", - ) - parser.add_argument( - "--save-model", action="store_true", default=False, help="For Saving the current Model" - ) - args = parser.parse_args() - use_cuda = not args.no_cuda and torch.cuda.is_available() - torch.manual_seed(args.seed) - device = torch.device("cuda" if use_cuda else "cpu") - kwargs = {"batch_size": args.batch_size} - 
if use_cuda: - kwargs.update({"num_workers": 0, "pin_memory": True}) - transform = ht.utils.vision_transforms.Compose( - [vision_transforms.ToTensor(), vision_transforms.Normalize((0.1307,), (0.3081,))] - ) - - dataDir="/p/project/prcoe12/RAISE/data_MNIST/" - dataset1 = MNISTDataset(dataDir, train=True, transform=transform, ishuffle=False) - dataset2 = MNISTDataset( - dataDir, train=False, transform=transform, ishuffle=False, test_set=True - ) - - train_loader = ht.utils.data.datatools.DataLoader(dataset=dataset1, **kwargs) - test_loader = ht.utils.data.datatools.DataLoader(dataset=dataset2, **kwargs) - model = Net().to(device) - optimizer = optim.Adadelta(model.parameters(), lr=args.lr) - blocking = False - dp_optim = ht.optim.DataParallelOptimizer(optimizer, blocking=blocking) - scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma) - dp_model = ht.nn.DataParallel( - model, comm=dataset1.comm, optimizer=dp_optim, blocking_parameter_updates=blocking - ) - - for epoch in range(1, args.epochs + 1): - train(args, dp_model, device, train_loader, dp_optim, epoch) - test(dp_model, device, test_loader) - scheduler.step() - if epoch + 1 == args.epochs: - train_loader.last_epoch = True - test_loader.last_epoch = True - - if args.save_model: - torch.save(model.state_dict(), "mnist_cnn.pt") - - -if __name__ == "__main__": - main() - diff --git a/scripts/deep_heat/lamec.json b/scripts/deep_heat/lamec.json deleted file mode 100644 index d1bf1b27df9fd3984cd046733eeccea2901e07b3..0000000000000000000000000000000000000000 --- a/scripts/deep_heat/lamec.json +++ /dev/null @@ -1 +0,0 @@ -{"startscript": "HeAT_startscript_deep.sh"} \ No newline at end of file diff --git a/scripts/deep_heat/reqs.txt b/scripts/deep_heat/reqs.txt deleted file mode 100755 index 20310b90fd6de0eb8f66c984b045b147c09a89dc..0000000000000000000000000000000000000000 --- a/scripts/deep_heat/reqs.txt +++ /dev/null @@ -1,6 +0,0 @@ -Pillow -pyparsing -python-dateutil -matplotlib -h5py -pytorch-nlp diff --git a/scripts/deep_horovod/Hor_startscript_deep.sh b/scripts/deep_horovod/Hor_startscript_deep.sh deleted file mode 100644 index 1feaee8493e6042cee6078be02c61109a4cd1fda..0000000000000000000000000000000000000000 --- a/scripts/deep_horovod/Hor_startscript_deep.sh +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=TorchTest -#SBATCH --account=deepext -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job.out -#SBATCH --error=job.err -#SBATCH --time=01:00:00 - -# configure node and process count on the CM -#SBATCH --partition=dp-esb -#SBATCH --nodes=4 -#SBATCH --ntasks-per-node=4 -#SBATCH --cpus-per-task=1 -#SBATCH --gpus-per-node=1 -#SBATCH --exclusive - -# parameters -debug=false # do debug -bs=96 # batch-size -epochs=1 # epochs -lr=0.01 # learning rate - -# dataset -# MNIST -#dataDir="/p/project/prcoe12/RAISE/data_MNIST/" -#COMMAND="DDP_pytorch_mnist.py" - -# AT -dataDir="/p/project/prcoe12/RAISE/T31/" -COMMAND="Hor_pytorch_AT.py" - -EXEC="$COMMAND \ - --batch-size $bs \ - --epochs $epochs \ - --lr $lr \ - --data-dir $dataDir" - -# set modules -ml --force purge -ml use $OTHERSTAGES -ml Stages/2022 GCC/11.2.0 OpenMPI/4.1.2 cuDNN/8.3.1.22-CUDA-11.5 NCCL/2.11.4-CUDA-11.5 Python/3.9.6 - -# set env - pip -source /p/project/prcoe12/RAISE/envAI_deepv/bin/activate - -# set env - conda -#source /p/project/prcoe12/RAISE/miniconda3_deepv/etc/profile.d/conda.sh -#conda activate - -# sleep a sec -sleep 1 - -# job info -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_JOB_ID: 
$SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: SLURM_NODEID: $SLURM_NODEID" -echo "DEBUG: SLURM_LOCALID: $SLURM_LOCALID" -echo "DEBUG: SLURM_PROCID: $SLURM_PROCID" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# set comm, CUDA and OMP -#PSP_CUDA=1 # not needed atm -#PSP_UCP=1 # not needed atm -#PSP_OPENIB=1 # not needed atm -#export NCCL_SOCKET_IFNAME=ib # not needed atm -#export NCCL_IB_HCA=ipogif0 # not needed atm -#export NCCL_IB_CUDA_SUPPORT=1 # not needed atm -export CUDA_VISIBLE_DEVICES="0" -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" > 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi - -# launch -srun --mpi=pspmix python3 -u $EXEC - -# eof diff --git a/scripts/deep_horovod/README.md b/scripts/deep_horovod/README.md deleted file mode 100644 index d88c2ef2f5d9026c0646723165e4cf9af1a32503..0000000000000000000000000000000000000000 --- a/scripts/deep_horovod/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# DL using Horovod on deepv - -# source -https://github.com/horovod/horovod - -# current isues -1. no alternative to --mpi=pspmix with OMPI, but works - -# to-do -1. - -# usage - pip -1. clone -2. run `./createENV.sh` -3. submit `sbatch Hor_startscript_deep.sh` diff --git a/scripts/deep_horovod/createEnv.sh b/scripts/deep_horovod/createEnv.sh deleted file mode 100755 index 8aa89ef9ece51f12b7f85b02535ccd6a9bac1018..0000000000000000000000000000000000000000 --- a/scripts/deep_horovod/createEnv.sh +++ /dev/null @@ -1,174 +0,0 @@ -#!/bin/bash -# -*- coding: utf-8 -*- -# author: EI -# version: 230120a -# creates machine specific python env -# env ONLY - -# set modules -module --force purge - -# get sys info -#sysN="$(uname -n | cut -f2- -d.)" -sysN="deepv" -echo "system:${sysN}" -echo - -# create tmp dir -mkdir -p $PWD/tmp -export TMPDIR=$PWD/tmp - -if [ "$sysN" = 'deepv' ] ; then - module use $OTHERSTAGES - # main - ml Stages/2022 NVHPC/22.1 OpenMPI/4.1.2 NCCL/2.15.1-1-CUDA-11.5 cuDNN/8.3.1.22-CUDA-11.5 - - # side - ml Python/3.9.6 HDF5 CMake - - # version mismatch fix - ml -nvidia-driver/.default - - # new cuda drivers in comp node, only use this if salloc - ln -s /usr/lib64/libcuda.so.1 . - ln -s /usr/lib64/libnvidia-ml.so.1 . 
- LD_LIBRARY_PATH=.:/usr/local/cuda/lib64:$LD_LIBRARY_PATH - -elif [ "$sysN" = 'juwels' ] ; then - ml GCC ParaStationMPI Python CMake -elif [ "$sysN" = 'jureca' ] ; then - # main - ml Stages/2022 NVHPC/22.1 ParaStationMPI/5.5.0-1-mt NCCL/2.14.3-1-CUDA-11.5 cuDNN/8.3.1.22-CUDA-11.5 - - # side - ml Python/3.9.6 libaio/0.3.112 HDF5/1.12.1 PnetCDF/1.12.2 mpi-settings/CUDA CMake/3.21.1 -else - echo 'unknown system detected' - echo 'canceling' - exit -fi -echo 'modules loaded' -echo - -# get python version -pver="$(python --version 2>&1 | awk {'print $2'} | cut -f1-2 -d.)" -echo "python version is ${pver}" -echo - -if [ -d "$PWD/envAI_${sysN}" ];then - echo 'env already exist' - echo - - source envAI_${sysN}/bin/activate -else - # create env - python3 -m venv envAI_${sysN} - - # get headers for pip - if [ -f "$PWD/envAI_${sysN}/bin/pip3" ]; then - echo 'pip already exist' - else - cp "$(which pip3)" $PWD/envAI_${sysN}/bin/ - ln -s $PWD/envAI_${sysN}/bin/pip3 $PWD/envAI_${sysN}/bin/pip${pver} - var="#!$PWD/envAI_${sysN}/bin/python${pver}" - sed -i "1s|.*|$var|" $PWD/envAI_${sysN}/bin/pip3 - fi - - # activate env - source envAI_${sysN}/bin/activate - - echo "a new env is created in $PWD" - echo "activation is done via:" - echo "source $PWD/envAI_${sysN}/bin/activate" -fi - -# install torch -if [ -f "$PWD/envAI_${sysN}/bin/torchrun" ]; then - echo 'Torch already installed' - echo -else - pip3 install --no-cache-dir \ - torch==1.13.0+cu117 torchvision==0.14.0+cu117 torchaudio==0.13.0+cu117 -f \ - https://download.pytorch.org/whl/cu117/torch_stable.html --no-cache-dir -fi - -# install horovod -if [ -f "$PWD/envAI_${sysN}/bin/horovodrun" ]; then - echo 'Horovod already installed' - echo -else - #export HOROVOD_DEBUG=1 - export HOROVOD_WITH_MPI=1 - export HOROVOD_MPI_THREADS_DISABLE=1 - export HOROVOD_GPU=CUDA - #export HOROVOD_GPU_OPERATIONS=MPI - export HOROVOD_CUDA_HOME=$EBROOTCUDA - export HOROVOD_GPU_OPERATIONS=NCCL - export HOROVOD_NCCL_HOME=$EBROOTNCCL - export HOROVOD_WITH_PYTORCH=1 - export HOROVOD_WITHOUT_TENSORFLOW=1 - export HOROVOD_WITHOUT_MXNET=1 - - pip3 install --no-cache-dir wheel --ignore-installed - pip3 install --no-cache-dir horovod==0.25.0 --ignore-installed -fi - -# install deepspeed -if [ -f "$PWD/envAI_${sysN}/bin/deepspeed" ]; then - echo 'DeepSpeed already installed' - echo -else - # compile all opt. stuff - not needed & not working - #export DS_BUILD_OPS=1 - # compile req. opt. 
stuff - export DS_BUILD_FUSED_ADAM=1 - export DS_BUILD_UTILS=1 - - pip3 install --no-cache-dir DeepSpeed=0.9.1 - - # add this to .../deepspeed/launcher/launch.py l.219 - var=' args.node_rank=int(os.environ.get("SLURM_PROCID",0))' - sed -i "219s|.*|$var|" $PWD/envAI_${sysN}/lib/python${pver}/site-packages/deepspeed/launcher/launch.py -fi - -# install heat -if [ -d "$PWD/envAI_${sysN}/lib/python${pver}/site-packages/heat" ]; then - echo 'HeAT already installed' - echo -else - export CFLAGS="-noswitcherror" - export CXXFLAGS="-noswitcherror" - - pip3 install heat[hdf5,netcdf] -fi - -# get rest of the libraries -# install rest -pip3 install -r reqs.txt --ignore-installed - -# modify l.4 of /torchnlp/_third_party/weighted_random_sampler.py -var='int_classes = int' -sed -i "4s|.*|$var|" \ - $PWD/envAI_${sysN}/lib/python${pver}/site-packages/torchnlp/_third_party/weighted_random_sampler.py - -# fix IB IP config -if [ -f "$PWD/envAI_${sysN}/bin/torchrun" ]; then - sed -i -e '3,8s/^/#/' $PWD/envAI_${sysN}/bin/torchrun - echo """ -import re -import sys -from torch.distributed.run import main -from torch.distributed.elastic.agent.server import api as sapi - -def new_get_fq_hostname(): - return _orig_get_fq_hostname().replace('.', 'i.', 1) - -if __name__ == '__main__': - _orig_get_fq_hostname = sapi._get_fq_hostname - sapi._get_fq_hostname = new_get_fq_hostname - sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) - sys.exit(main()) -""" >> $PWD/envAI_${sysN}/bin/torchrun -fi - -#eof diff --git a/scripts/deep_horovod/lamec.json b/scripts/deep_horovod/lamec.json deleted file mode 100644 index 4aff71d30c25f064280724e030d6fd813c4c4c5d..0000000000000000000000000000000000000000 --- a/scripts/deep_horovod/lamec.json +++ /dev/null @@ -1 +0,0 @@ -{"startscript": "Hor_startscript_deep.sh"} \ No newline at end of file diff --git a/scripts/deep_horovod/pytorch_mnist.py b/scripts/deep_horovod/pytorch_mnist.py deleted file mode 100644 index 2d0c5ac0709000a609b7a5afaed85fc199762fca..0000000000000000000000000000000000000000 --- a/scripts/deep_horovod/pytorch_mnist.py +++ /dev/null @@ -1,205 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -# MNIST train in deepv -# origin: https://github.com/horovod/horovod/blob/master/examples/pytorch/pytorch_mnist.py -# changes L.132 from 'num_workers': 1 to 'num_workers': 0 - -import argparse -import os -from filelock import FileLock - -import torch.multiprocessing as mp -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -from torchvision import datasets, transforms -import torch.utils.data.distributed -import horovod.torch as hvd - -# Training settings -parser = argparse.ArgumentParser(description='PyTorch MNIST Example') -parser.add_argument('--batch-size', type=int, default=64, metavar='N', - help='input batch size for training (default: 64)') -parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', - help='input batch size for testing (default: 1000)') -parser.add_argument('--epochs', type=int, default=10, metavar='N', - help='number of epochs to train (default: 10)') -parser.add_argument('--lr', type=float, default=0.01, metavar='LR', - help='learning rate (default: 0.01)') -parser.add_argument('--momentum', type=float, default=0.5, metavar='M', - help='SGD momentum (default: 0.5)') -parser.add_argument('--no-cuda', action='store_true', default=False, - help='disables CUDA training') -parser.add_argument('--seed', type=int, default=42, metavar='S', - help='random seed (default: 42)') 
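# the remaining flags control logging frequency, fp16 allreduce compression,
# Adasum-based reduction, gradient pre-division, and the dataset location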
-parser.add_argument('--log-interval', type=int, default=10, metavar='N', - help='how many batches to wait before logging training status') -parser.add_argument('--fp16-allreduce', action='store_true', default=False, - help='use fp16 compression during allreduce') -parser.add_argument('--use-adasum', action='store_true', default=False, - help='use adasum algorithm to do reduction') -parser.add_argument('--gradient-predivide-factor', type=float, default=1.0, - help='apply gradient predivide factor in optimizer (default: 1.0)') -parser.add_argument('--data-dir', - help='location of the training dataset in the local filesystem (will be downloaded if needed)') - - -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(1, 10, kernel_size=5) - self.conv2 = nn.Conv2d(10, 20, kernel_size=5) - self.conv2_drop = nn.Dropout2d() - self.fc1 = nn.Linear(320, 50) - self.fc2 = nn.Linear(50, 10) - - def forward(self, x): - x = F.relu(F.max_pool2d(self.conv1(x), 2)) - x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) - x = x.view(-1, 320) - x = F.relu(self.fc1(x)) - x = F.dropout(x, training=self.training) - x = self.fc2(x) - return F.log_softmax(x) - - -def train(epoch): - model.train() - # Horovod: set epoch to sampler for shuffling. - train_sampler.set_epoch(epoch) - for batch_idx, (data, target) in enumerate(train_loader): - if args.cuda: - data, target = data.cuda(), target.cuda() - optimizer.zero_grad() - output = model(data) - loss = F.nll_loss(output, target) - loss.backward() - optimizer.step() - if batch_idx % args.log_interval == 0: - # Horovod: use train_sampler to determine the number of examples in - # this worker's partition. - print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( - epoch, batch_idx * len(data), len(train_sampler), - 100. * batch_idx / len(train_loader), loss.item())) - - -def metric_average(val, name): - tensor = torch.tensor(val) - avg_tensor = hvd.allreduce(tensor, name=name) - return avg_tensor.item() - - -def test(): - model.eval() - test_loss = 0. - test_accuracy = 0. - for data, target in test_loader: - if args.cuda: - data, target = data.cuda(), target.cuda() - output = model(data) - # sum up batch loss - test_loss += F.nll_loss(output, target, size_average=False).item() - # get the index of the max log-probability - pred = output.data.max(1, keepdim=True)[1] - test_accuracy += pred.eq(target.data.view_as(pred)).cpu().float().sum() - - # Horovod: use test_sampler to determine the number of examples in - # this worker's partition. - test_loss /= len(test_sampler) - test_accuracy /= len(test_sampler) - - # Horovod: average metric values across workers. - test_loss = metric_average(test_loss, 'avg_loss') - test_accuracy = metric_average(test_accuracy, 'avg_accuracy') - - # Horovod: print output only on first rank. - if hvd.rank() == 0: - print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format( - test_loss, 100. * test_accuracy)) - - -if __name__ == '__main__': - args = parser.parse_args() - args.cuda = not args.no_cuda and torch.cuda.is_available() - - # Horovod: initialize library. - hvd.init() - torch.manual_seed(args.seed) - - if args.cuda: - # Horovod: pin GPU to local rank. - torch.cuda.set_device(hvd.local_rank()) - torch.cuda.manual_seed(args.seed) - - - # Horovod: limit # of CPU threads to be used per worker. 
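    # (one thread per process avoids CPU oversubscription when several Horovod
    #  ranks share a node)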
- torch.set_num_threads(1) - - #kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} - kwargs = {'num_workers': 0, 'pin_memory': True} if args.cuda else {} - # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent - # issues with Infiniband implementations that are not fork-safe - if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context') and - mp._supports_context and 'forkserver' in mp.get_all_start_methods()): - kwargs['multiprocessing_context'] = 'forkserver' - - data_dir = args.data_dir or './data' - with FileLock(os.path.expanduser("~/.horovod_lock")): - train_dataset = \ - datasets.MNIST(data_dir, train=True, download=True, - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) - - # Horovod: use DistributedSampler to partition the training data. - train_sampler = torch.utils.data.distributed.DistributedSampler( - train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) - train_loader = torch.utils.data.DataLoader( - train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs) - - test_dataset = \ - datasets.MNIST(data_dir, train=False, transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) - # Horovod: use DistributedSampler to partition the test data. - test_sampler = torch.utils.data.distributed.DistributedSampler( - test_dataset, num_replicas=hvd.size(), rank=hvd.rank()) - test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.test_batch_size, - sampler=test_sampler, **kwargs) - - model = Net() - - # By default, Adasum doesn't need scaling up learning rate. - lr_scaler = hvd.size() if not args.use_adasum else 1 - - if args.cuda: - # Move model to GPU. - model.cuda() - # If using GPU Adasum allreduce, scale learning rate by local_size. - if args.use_adasum and hvd.nccl_built(): - lr_scaler = hvd.local_size() - - # Horovod: scale learning rate by lr_scaler. - optimizer = optim.SGD(model.parameters(), lr=args.lr * lr_scaler, - momentum=args.momentum) - - # Horovod: broadcast parameters & optimizer state. - hvd.broadcast_parameters(model.state_dict(), root_rank=0) - hvd.broadcast_optimizer_state(optimizer, root_rank=0) - - # Horovod: (optional) compression algorithm. - compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none - - # Horovod: wrap optimizer with DistributedOptimizer. 
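    # (the wrapper hooks into backward() and allreduces gradients across ranks,
    #  averaged or combined via Adasum as selected above and optionally fp16
    #  compressed, before each optimizer.step())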
- optimizer = hvd.DistributedOptimizer(optimizer, - named_parameters=model.named_parameters(), - compression=compression, - op=hvd.Adasum if args.use_adasum else hvd.Average, - gradient_predivide_factor=args.gradient_predivide_factor) - - for epoch in range(1, args.epochs + 1): - train(epoch) - test() diff --git a/scripts/deep_horovod/pytorch_synthetic_benchmark.py b/scripts/deep_horovod/pytorch_synthetic_benchmark.py deleted file mode 100644 index 473d7c999cf810c381d2c4a3414aebaeac386944..0000000000000000000000000000000000000000 --- a/scripts/deep_horovod/pytorch_synthetic_benchmark.py +++ /dev/null @@ -1,127 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -# Synthetic train in deepv -# origin: https://github.com/horovod/horovod/blob/master/examples/pytorch/pytorch_synthetic_benchmark.py -# changes - -import argparse -import torch.backends.cudnn as cudnn -import torch.nn.functional as F -import torch.optim as optim -import torch.utils.data.distributed -from torchvision import models -import horovod.torch as hvd -import timeit -import numpy as np - -# Benchmark settings -parser = argparse.ArgumentParser(description='PyTorch Synthetic Benchmark', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument('--fp16-allreduce', action='store_true', default=False, - help='use fp16 compression during allreduce') - -parser.add_argument('--model', type=str, default='resnet50', - help='model to benchmark') -parser.add_argument('--batch-size', type=int, default=32, - help='input batch size') - -parser.add_argument('--num-warmup-batches', type=int, default=10, - help='number of warm-up batches that don\'t count towards benchmark') -parser.add_argument('--num-batches-per-iter', type=int, default=10, - help='number of batches per benchmark iteration') -parser.add_argument('--num-iters', type=int, default=10, - help='number of benchmark iterations') - -parser.add_argument('--no-cuda', action='store_true', default=False, - help='disables CUDA training') - -parser.add_argument('--use-adasum', action='store_true', default=False, - help='use adasum algorithm to do reduction') - -args = parser.parse_args() -args.cuda = not args.no_cuda and torch.cuda.is_available() - -hvd.init() - -if args.cuda: - # Horovod: pin GPU to local rank. - torch.cuda.set_device(hvd.local_rank()) - -cudnn.benchmark = True - -# Set up standard model. -model = getattr(models, args.model)() - -# By default, Adasum doesn't need scaling up learning rate. -lr_scaler = hvd.size() if not args.use_adasum else 1 - -if args.cuda: - # Move model to GPU. - model.cuda() - # If using GPU Adasum allreduce, scale learning rate by local_size. - if args.use_adasum and hvd.nccl_built(): - lr_scaler = hvd.local_size() - -optimizer = optim.SGD(model.parameters(), lr=0.01 * lr_scaler) - -# Horovod: (optional) compression algorithm. -compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none - -# Horovod: wrap optimizer with DistributedOptimizer. -optimizer = hvd.DistributedOptimizer(optimizer, - named_parameters=model.named_parameters(), - compression=compression, - op=hvd.Adasum if args.use_adasum else hvd.Average) - -# Horovod: broadcast parameters & optimizer state. 
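# (rank 0's initial weights and optimizer state are sent to all ranks so every
#  worker starts from identical parameters)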
-hvd.broadcast_parameters(model.state_dict(), root_rank=0) -hvd.broadcast_optimizer_state(optimizer, root_rank=0) - -# Set up fixed fake data -data = torch.randn(args.batch_size, 3, 224, 224) -target = torch.LongTensor(args.batch_size).random_() % 1000 -if args.cuda: - data, target = data.cuda(), target.cuda() - - -def benchmark_step(): - optimizer.zero_grad() - output = model(data) - loss = F.cross_entropy(output, target) - loss.backward() - optimizer.step() - - -def log(s, nl=True): - if hvd.rank() != 0: - return - print(s, end='\n' if nl else '') - - -log('Model: %s' % args.model) -log('Batch size: %d' % args.batch_size) -device = 'GPU' if args.cuda else 'CPU' -log('Number of %ss: %d' % (device, hvd.size())) - -# Warm-up -log('Running warmup...') -timeit.timeit(benchmark_step, number=args.num_warmup_batches) - -# Benchmark -log('Running benchmark...') -img_secs = [] -for x in range(args.num_iters): - time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter) - img_sec = args.batch_size * args.num_batches_per_iter / time - log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device)) - img_secs.append(img_sec) - -# Results -img_sec_mean = np.mean(img_secs) -img_sec_conf = 1.96 * np.std(img_secs) -log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf)) -log('Total img/sec on %d %s(s): %.1f +-%.1f' % - (hvd.size(), device, hvd.size() * img_sec_mean, hvd.size() * img_sec_conf)) - -# eof diff --git a/scripts/deep_horovod/reqs.txt b/scripts/deep_horovod/reqs.txt deleted file mode 100755 index 20310b90fd6de0eb8f66c984b045b147c09a89dc..0000000000000000000000000000000000000000 --- a/scripts/deep_horovod/reqs.txt +++ /dev/null @@ -1,6 +0,0 @@ -Pillow -pyparsing -python-dateutil -matplotlib -h5py -pytorch-nlp diff --git a/scripts/deep_tensorflow/Create_Jupyter_deepv.ipynb b/scripts/deep_tensorflow/Create_Jupyter_deepv.ipynb deleted file mode 100644 index aedaf68a9d5805a5f52f3296a1f55704f4145fad..0000000000000000000000000000000000000000 --- a/scripts/deep_tensorflow/Create_Jupyter_deepv.ipynb +++ /dev/null @@ -1,489 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "emerging-record", - "metadata": { - "toc-hr-collapsed": false - }, - "source": [ - "# Create your own Jupyter Kernel\n", - "\n", - "Often the standard kernel do not provide all features you need for your work. This might be that certain modules are not loaded or packages are not installed. \n", - "With your own kernel you can overcome that problem easily and define your own environment, in which you work.\n", - "\n", - "This notebook shows you how you can build your own kernel for a **python environment**.\n", - "\n", - "-------------------------" - ] - }, - { - "cell_type": "markdown", - "id": "imported-mason", - "metadata": {}, - "source": [ - "## Building your own Jupyter kernel is a three step process\n", - "1. Create/Pimp new virtual Python environment\n", - " * venv\n", - "2. Create/Edit launch script for the Jupyter kernel\n", - " * kernel.sh\n", - "3. 
Create/Edit Jupyter kernel configuration\n", - " * kernel.json" - ] - }, - { - "cell_type": "markdown", - "id": "middle-viewer", - "metadata": {}, - "source": [ - "### Settings" - ] - }, - { - "cell_type": "markdown", - "id": "color-sponsorship", - "metadata": {}, - "source": [ - "* Set kernel name\n", - " - must be lower case\n", - " - change if you like" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "acknowledged-remains", - "metadata": {}, - "outputs": [], - "source": [ - "# INPUT NEEDED:\n", - "KERNEL_NAME=${USER}_kernel\n", - "\n", - "export KERNEL_NAME=$(echo \"${KERNEL_NAME}\" | awk '{print tolower($0)}')\n", - "echo ${KERNEL_NAME} # double check" - ] - }, - { - "cell_type": "markdown", - "id": "sustained-generation", - "metadata": {}, - "source": [ - "* List directories where JupyterLab will search for kernels" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "governmental-check", - "metadata": {}, - "outputs": [], - "source": [ - "# JUPYTER SEARCH PATH (for kernels-directory)\n", - "echo \"jupyter search paths for kernels-directories\"\n", - "if [ -z $JUPYTER_PATH ]; then\n", - " echo \"$HOME/.local/share/jupyter\"\n", - "else\n", - " tr ':' '\\n' <<< \"$JUPYTER_PATH\"\n", - "fi" - ] - }, - { - "cell_type": "markdown", - "id": "later-launch", - "metadata": {}, - "source": [ - "<div class=\"alert alert-block alert-info\">\n", - "<b>Attention:</b>\n", - "Please choose 'private kernel' if you are unsure.</br>\n", - "Using 'project kernel's need to be enabled for your project first by our Jupyter-JSC admins.\n", - "</div>\n", - "\n", - "* Set kernel type\n", - " - private kernel = \"\\${HOME}/.local/\" \n", - " - project kernel = \"\\${PROJECT}/.local/\" \n", - " - other kernel = \"\\<your-path\\>\" (ensure it is part of $JUPYTER_PATH or your kernel will not be found by JuypterLab)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "reported-shirt", - "metadata": {}, - "outputs": [], - "source": [ - "# INPUT NEEDED:\n", - "export KERNEL_TYPE=private # private, project or other\n", - "export KERNEL_SPECS_PREFIX=${HOME}/.local\n", - "\n", - "###################\n", - "# project kernel\n", - "if [ \"${KERNEL_TYPE}\" == \"project\" ]; then\n", - " export KERNEL_SPECS_PREFIX=${PROJECT}/.local\n", - " echo \"project kernel\"\n", - "# private kernel\n", - "elif [ \"${KERNEL_TYPE}\" == \"private\" ]; then\n", - " export KERNEL_SPECS_PREFIX=${HOME}/.local\n", - " echo \"private kernel\"\n", - "else\n", - " if [ ! 
-d \"$KERNEL_SPECS_PREFIX\" ]; then\n", - " echo \"ERROR: please create directory $KERNEL_SPECS_PREFIX\"\n", - " fi\n", - " echo \"other kernel\"\n", - "fi\n", - "export KERNEL_SPECS_DIR=${KERNEL_SPECS_PREFIX}/share/jupyter/kernels\n", - "\n", - "# check if kernel name is unique\n", - "if [ -d \"${KERNEL_SPECS_DIR}/${KERNEL_NAME}\" ]; then\n", - " echo \"ERROR: Kernel already exists in ${KERNEL_SPECS_DIR}/${KERNEL_NAME}\"\n", - " echo \" Rename kernel name or remove directory.\"\n", - "fi\n", - "\n", - "echo ${KERNEL_SPECS_DIR}/${KERNEL_NAME} # double check" - ] - }, - { - "cell_type": "markdown", - "id": "finnish-darwin", - "metadata": {}, - "source": [ - "* Set directory for kernels virtual environment\n", - " - change if you like" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "furnished-durham", - "metadata": {}, - "outputs": [], - "source": [ - "# INPUT NEEDED:\n", - "export KERNEL_VENVS_DIR=${PROJECT}/${USER}/jupyter/kernels\n", - "\n", - "###################\n", - "mkdir -p ${KERNEL_VENVS_DIR}\n", - "if [ \"${KERNEL_TYPE}\" != \"private\" ] && [ \"${KERNEL_TYPE}\" != \"other\" ]; then\n", - " echo \"Please check the permissions and ensure your project partners have read/execute permissions:\"\n", - " namei -l ${KERNEL_VENVS_DIR}\n", - "fi\n", - "\n", - "echo ${KERNEL_VENVS_DIR} # double check\n", - "ls -lt ${KERNEL_VENVS_DIR}" - ] - }, - { - "cell_type": "markdown", - "id": "settled-terminology", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "everyday-moral", - "metadata": {}, - "source": [ - "## 1. Create/Pimp new virtual Python environment" - ] - }, - { - "cell_type": "markdown", - "id": "defined-better", - "metadata": {}, - "source": [ - "* 1.1 - Load basic Python module" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "thrown-masters", - "metadata": {}, - "outputs": [], - "source": [ - "# set modules\n", - "sysN=\"$(uname -n | cut -f2- -d.)\"\n", - "ml --force purge\n", - "if [ \"$sysN\" = 'deepv' ] ; then\n", - " ml use $OTHERSTAGES\n", - " ml Stages/2022 GCC OpenMPI cuDNN NCCL Python CMake\n", - "elif [ \"$sysN\" = 'jureca' ] ; then\n", - " ml Stages/2022 GCC OpenMPI Python NCCL cuDNN libaio CMake\n", - "else\n", - " echo 'unknown system detected'\n", - "fi\n", - "ml list # double check" - ] - }, - { - "cell_type": "markdown", - "id": "deadly-assist", - "metadata": {}, - "source": [ - "* 1.2 - Create and activate a virtual environment for the kernel and ensure python packages installed in the virtual environment are always prefered" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "tutorial-raleigh", - "metadata": {}, - "outputs": [], - "source": [ - "which python\n", - "if [ -d \"${KERNEL_VENVS_DIR}/${KERNEL_NAME}\" ]; then\n", - " echo \"ERROR: Directory for virtual environment already ${KERNEL_VENVS_DIR}/${KERNEL_NAME}\"\n", - " echo \" Rename kernel name or remove directory.\"\n", - "else\n", - " python -m venv --system-site-packages ${KERNEL_VENVS_DIR}/${KERNEL_NAME}\n", - " source ${KERNEL_VENVS_DIR}/${KERNEL_NAME}/bin/activate\n", - " export PYTHONPATH=${VIRTUAL_ENV}/lib/python3.9/site-packages:${PYTHONPATH}\n", - " echo ${VIRTUAL_ENV} # double check\n", - "fi" - ] - }, - { - "cell_type": "markdown", - "id": "congressional-stream", - "metadata": {}, - "source": [ - "* 1.3 - Install Python libraries required for communication with Jupyter" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bronze-columbia", - "metadata": {}, - "outputs": [], 
- "source": [ - "cp \"$(which pip3)\" ${VIRTUAL_ENV}/bin/\n", - "var=$VIRTUAL_ENV/bin/python3.9\n", - "sed -i \"1s|.*|$var|\" ${VIRTUAL_ENV}/bin/pip3" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "adjacent-saturday", - "metadata": {}, - "outputs": [], - "source": [ - "which pip3\n", - "pip3 install --ignore-installed ipykernel\n", - "if [ -z \"${VIRTUAL_ENV}\" ]; then\n", - " echo \"ERROR: Virtual environment not successfully initialized.\"\n", - "else\n", - " pip3 install --ignore-installed ipykernel\n", - " ls ${VIRTUAL_ENV}/lib/python3.9/site-packages/ # double check\n", - "fi" - ] - }, - { - "cell_type": "markdown", - "id": "alleged-johns", - "metadata": {}, - "source": [ - "* 1.4 - Install whatever else you need in your Python virtual environment (using pip)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "mysterious-cement", - "metadata": {}, - "outputs": [], - "source": [ - "#pip install <python-package you need>\n", - "pip3 install --upgrade tensorflow --no-cache-dir" - ] - }, - { - "cell_type": "markdown", - "id": "cosmetic-status", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "colonial-migration", - "metadata": {}, - "source": [ - "## 2. Create/Edit launch script for the Jupyter kernel" - ] - }, - { - "cell_type": "markdown", - "id": "ambient-commerce", - "metadata": {}, - "source": [ - "* 2.1 - Create launch script, which loads your Python virtual environment and starts the ipykernel process inside:\n", - "\n", - "<div class=\"alert alert-block alert-info\">\n", - "<b>Attention:</b>\n", - "You MUST load the exactly the same modules as you did above for your virtual Python environment.\n", - "</div>" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "heated-ivory", - "metadata": {}, - "outputs": [], - "source": [ - "echo '#!/bin/bash'\"\n", - "\n", - "# Load basic Python module\n", - "module purge\n", - "module use \"'$OTHERSTAGES'\"\n", - "ml Stages/2022 GCC OpenMPI cuDNN NCCL Python CMake\n", - " \n", - "# Activate your Python virtual environment\n", - "source ${KERNEL_VENVS_DIR}/${KERNEL_NAME}/bin/activate\n", - " \n", - "# Ensure python packages installed in the virtual environment are always prefered\n", - "export PYTHONPATH=${VIRTUAL_ENV}/lib/python3.8/site-packages:\"'${PYTHONPATH}'\"\n", - " \n", - "exec python -m ipykernel \"'$@' > ${VIRTUAL_ENV}/kernel.sh\n", - "chmod +x ${VIRTUAL_ENV}/kernel.sh\n", - "\n", - "cat ${VIRTUAL_ENV}/kernel.sh # double check" - ] - }, - { - "cell_type": "markdown", - "id": "proof-portland", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "inner-silence", - "metadata": {}, - "source": [ - "## 3. 
Create/Edit Jupyter kernel configuration" - ] - }, - { - "cell_type": "markdown", - "id": "greater-princeton", - "metadata": {}, - "source": [ - "* 3.1 - Create Jupyter kernel configuration directory and files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ongoing-officer", - "metadata": {}, - "outputs": [], - "source": [ - "python -m ipykernel install --name=${KERNEL_NAME} --prefix ${VIRTUAL_ENV}\n", - "export VIRTUAL_ENV_KERNELS=${VIRTUAL_ENV}/share/jupyter/kernels" - ] - }, - { - "cell_type": "markdown", - "id": "documented-motor", - "metadata": {}, - "source": [ - "* 3.2 - Adjust kernel.json file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "finnish-apple", - "metadata": {}, - "outputs": [], - "source": [ - "mv ${VIRTUAL_ENV_KERNELS}/${KERNEL_NAME}/kernel.json ${VIRTUAL_ENV_KERNELS}/${KERNEL_NAME}/kernel.json.orig\n", - "\n", - "echo '{\n", - " \"argv\": [\n", - " \"'${KERNEL_VENVS_DIR}/${KERNEL_NAME}/kernel.sh'\",\n", - " \"-m\",\n", - " \"ipykernel_launcher\",\n", - " \"-f\",\n", - " \"{connection_file}\"\n", - " ],\n", - " \"display_name\": \"'${KERNEL_NAME}'\",\n", - " \"language\": \"python\"\n", - "}' > ${VIRTUAL_ENV_KERNELS}/${KERNEL_NAME}/kernel.json\n", - "\n", - "cat ${VIRTUAL_ENV_KERNELS}/${KERNEL_NAME}/kernel.json # double check" - ] - }, - { - "cell_type": "markdown", - "id": "english-sixth", - "metadata": {}, - "source": [ - "* 3.3 - Create link to kernel specs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "phantom-provision", - "metadata": {}, - "outputs": [], - "source": [ - "mkdir -p ${KERNEL_SPECS_DIR}\n", - "cd ${KERNEL_SPECS_DIR}\n", - "ln -s ${VIRTUAL_ENV_KERNELS}/${KERNEL_NAME} .\n", - "\n", - "echo -e \"\\n\\nThe new kernel '${KERNEL_NAME}' was added to your kernels in '${KERNEL_SPECS_DIR}/'\\n\"\n", - "ls ${KERNEL_SPECS_DIR} # double check" - ] - }, - { - "cell_type": "markdown", - "id": "based-jonathan", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "numerical-hobby", - "metadata": {}, - "source": [ - "## 4. Cleanup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "handmade-smith", - "metadata": {}, - "outputs": [], - "source": [ - "deactivate" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Bash", - "language": "bash", - "name": "bash" - }, - "language_info": { - "codemirror_mode": "shell", - "file_extension": ".sh", - "mimetype": "text/x-sh", - "name": "bash" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/scripts/deep_tensorflow/README.md b/scripts/deep_tensorflow/README.md deleted file mode 100644 index 211255c25226edcdeb52a8bc441fda71fea23d5b..0000000000000000000000000000000000000000 --- a/scripts/deep_tensorflow/README.md +++ /dev/null @@ -1,21 +0,0 @@ -# DL using TensorFlow with Jupyter on deepv - -# source -https://github.com/tensorflow/tensorflow - -# to-do -1. add notebooks - -# usage - pip -1. clone -2. run `bash createENV_TF.sh` -4. submit `sbatch TF_startscript_deep.sh` - -# usage - jupyter -1. clone -2. run `bash createENV_TF.sh` -3. run `bash jupyterAddKernel.sh testAI_deepv` -4. open via `https://jupyter-jsc.fz-juelich.de/hub/login` - -# updates -1. 
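Condensed into commands, the deep_tensorflow usage described above looks roughly like the sketch below; the checkout path is a placeholder, and the script and environment names (createEnv_TF.sh, TF_startscript_deep.sh, jupyterAddKernel.sh, testAI_deepv) are the ones shipped in this folder.

# sketch of the deep_tensorflow workflow on deepv (checkout path is a placeholder)
cd /path/to/repo/scripts/deep_tensorflow

# 1) build the venv and install TensorFlow + Horovod (creates testAI_deepv on deepv)
bash createEnv_TF.sh

# 2a) batch route: run the synthetic benchmark through SLURM
sbatch TF_startscript_deep.sh

# 2b) Jupyter route: register the venv as a kernel, then open
#     https://jupyter-jsc.fz-juelich.de/hub/login and select it
bash jupyterAddKernel.sh testAI_deepv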
diff --git a/scripts/deep_tensorflow/TF_startscript_deep.sh b/scripts/deep_tensorflow/TF_startscript_deep.sh deleted file mode 100644 index b0fb2ef697bc5e481d8869ac5003493c56d26671..0000000000000000000000000000000000000000 --- a/scripts/deep_tensorflow/TF_startscript_deep.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash -# shellcheck disable=SC2206 -#SBATCH --job-name=TFtest -#SBATCH --account=deepext -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job.out -#SBATCH --error=job.err - -#SBATCH --partition=dp-esb -#SBATCH --nodes=1 -#SBATCH --tasks-per-node=4 -#SBATCH --gpus-per-node=1 -#SBATCH --time=05:00:00 -#SBATCH --exclusive - -ml --force purge -ml use $OTHERSTAGES -ml Stages/2022 GCC/11.2.0 OpenMPI/4.1.2 cuDNN/8.3.1.22-CUDA-11.5 NCCL/2.11.4-CUDA-11.5 Python/3.9.6 - -source /p/project/prcoe12/RAISE/testAI_deepv/bin/activate - -# job info -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: SLURM_NODEID: $SLURM_NODEID" -echo "DEBUG: SLURM_LOCALID: $SLURM_LOCALID" -echo "DEBUG: SLURM_PROCID: $SLURM_PROCID" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -echo - -export CUDA_VISIBLE_DEVICES="0" -export OMP_NUM_THREADS=1 - -srun --cpu-bind=none --mpi=pspmix python3 -u tensorflow2_synthetic_benchmark.py diff --git a/scripts/deep_tensorflow/createEnv_TF.sh b/scripts/deep_tensorflow/createEnv_TF.sh deleted file mode 100755 index d309269d87ef9486bc0f208019eac248406cc4a8..0000000000000000000000000000000000000000 --- a/scripts/deep_tensorflow/createEnv_TF.sh +++ /dev/null @@ -1,99 +0,0 @@ -#!/bin/bash -# -*- coding: utf-8 -*- -# author: EI -# version: 220302a -# creates machine specific python env - -# set modules -ml --force purge - -# get sys info -cDir=$PWD -sysN="$(uname -n | cut -f2- -d.)" -echo "system:${sysN}" -echo - -cont1=false -if [ "$sysN" = 'deepv' ] ; then - ml use $OTHERSTAGES - ml Stages/2022 GCC OpenMPI cuDNN NCCL Python CMake - cont1=true -elif [ "$sysN" = 'juwels' ] ; then - ml Stages/2022 GCC ParaStationMPI Python CMake NCCL libaio cuDNN - cont1=true -elif [ "$sysN" = 'jureca' ] ; then - ml Stages/2022 GCC OpenMPI Python NCCL cuDNN libaio CMake - cont1=true -else - echo - echo 'unknown system detected' - echo 'canceling' - echo -fi -echo "modules loaded" -echo - -# get python version -pver="$(python --version 2>&1 | awk {'print $2'} | cut -f1-2 -d.)" -echo "python version is ${pver}" -echo - -if [ "$cont1" = true ] ; then - if [ -d "${cDir}/testAI_${sysN}" ];then - echo 'env already exist' - echo - - source testAI_${sysN}/bin/activate - else - # create env - python3 -m venv testAI_${sysN} - - # get headers for pip - if [ -f "${cDir}/testAI_${sysN}/bin/pip3" ]; then - echo 'pip already exist' - else - cp "$(which pip3)" $cDir/testAI_${sysN}/bin/ - ln -s $cDir/testAI_${sysN}/bin/pip3 $cDir/testAI_${sysN}/bin/pip${pver} - var="#!$cDir/testAI_${sysN}/bin/python${pver}" - sed -i "1s|.*|$var|" $cDir/testAI_${sysN}/bin/pip3 - fi - - # activate env - source testAI_${sysN}/bin/activate - - echo "a new env is created in ${cDir}" - echo "activation is done via:" - echo "source ${cDir}/testAI_${sysN}/bin/activate" - fi -fi - -# install TF -if [ -f "${cDir}/testAI_${sysN}/bin/tensorboard" ]; then - echo 'TF already installed' - echo 
-else - export TMPDIR=${cDir} - - pip3 install --upgrade tensorflow --no-cache-dir -fi - -# install horovod -if [ -f "${cDir}/testAI_${sysN}/bin/horovodrun" ]; then - echo 'Horovod already installed' - echo -else - export HOROVOD_GPU=CUDA - export HOROVOD_GPU_OPERATIONS=NCCL - export HOROVOD_WITH_TENSORFLOW=1 - export TMPDIR=${cDir} - - pip3 install --no-cache-dir horovod --ignore-installed -fi - -# get rest of the libraries$ -if [ "$cont1" = true ] ; then - pip3 install -r reqs_TF.txt --ignore-installed -fi - - -# eof diff --git a/scripts/deep_tensorflow/jupyterAddKernel.sh b/scripts/deep_tensorflow/jupyterAddKernel.sh deleted file mode 100755 index 741c247534d1752ccaa1e757c9ce3bc0bc4ed94b..0000000000000000000000000000000000000000 --- a/scripts/deep_tensorflow/jupyterAddKernel.sh +++ /dev/null @@ -1,109 +0,0 @@ -#!/bin/bash -# -*- coding: utf-8 -*- -# author: EI -# version: 220408a -# adds jupyter to an existing python env -# usage: bash jupyterAddKernel.sh <env_location> - -# get sys info -sysN="$(uname -n | cut -f2- -d.)" -cDir=$PWD -ENV_LOC=$cDir/$1 -export TMPDIR=$PWD -echo "system:${sysN}" -echo "env location: $ENV_LOC" -echo - -# warn if wrong bash command -if [ -z "$1" ];then - echo 'wrong usage: try: bash jupyterAddKernel.sh <env_location>' - exit -fi - -# set modules -ml --force purge -if [ "$sysN" = 'deepv' ] ; then - ml use $OTHERSTAGES - ml Stages/2022 GCC OpenMPI cuDNN NCCL Python CMake -elif [ "$sysN" = 'jureca' ] ; then - ml Stages/2022 GCC OpenMPI Python NCCL cuDNN libaio CMake -else - echo - echo 'unknown system detected' - echo 'canceling' - echo -fi -echo "modules loaded" -echo - -# kernel info -KERNEL_NAME=envAI_jk_${sysN} -KERNEL_SPECS_PREFIX=${HOME}/.local -KERNEL_SPECS_DIR=${KERNEL_SPECS_PREFIX}/share/jupyter/kernels - -# get python version -pver="$(python --version 2>&1 | awk {'print $2'} | cut -f1-2 -d.)" -echo "python version is ${pver}" -echo - -# environment that jupyter is built on -if [ -z "${ENV_LOC}" ];then - echo 'env does not exist' - echo 'usage: bash jupyterAddKernel.sh env_location' - exit -else - source ${ENV_LOC}/bin/activate - export PYTHONPATH=${VIRTUAL_ENV}/lib/python${pver}/site-packages:${PYTHONPATH} -fi - -# create/Edit launch script for the Jupyter kernel -if [ -f "${VIRTUAL_ENV}/kernel.sh" ];then - echo "kernel.sh exist!" 
-else - echo '#!/bin/bash'" - -# Load basic Python module -ml --force purge -ml use $OTHERSTAGES -ml Stages/2022 GCC OpenMPI cuDNN NCCL Python CMake - -# Activate your Python virtual environment -source ${VIRTUAL_ENV}/bin/activate - -# Ensure python packages installed in the virtual environment are always prefered -export PYTHONPATH=${VIRTUAL_ENV}/lib/python${pver}/site-packages:"'${PYTHONPATH}'" - -exec python3 -m ipykernel "'$@' > ${VIRTUAL_ENV}/kernel.sh - chmod +x ${VIRTUAL_ENV}/kernel.sh - - echo 'kernel.sh:' - cat ${VIRTUAL_ENV}/kernel.sh # double check -fi - -# create Jupyter kernel configuration directory and files -pip3 install --ignore-installed ipykernel --no-cache-dir -${VIRTUAL_ENV}/bin/python3 -m ipykernel install --name=${KERNEL_NAME} --prefix ${VIRTUAL_ENV} -VIRTUAL_ENV_KERNELS=${VIRTUAL_ENV}/share/jupyter/kernels - -# adjust kernel.json file -mv ${VIRTUAL_ENV_KERNELS}/${KERNEL_NAME}/kernel.json ${VIRTUAL_ENV_KERNELS}/${KERNEL_NAME}/kernel.json.orig # backup -echo '{ - "argv": [ - "'${VIRTUAL_ENV}/kernel.sh'", - "-m", - "ipykernel_launcher", - "-f", - "{connection_file}" - ], - "display_name": "'${KERNEL_NAME}'", - "language": "python" -}' > ${VIRTUAL_ENV_KERNELS}/${KERNEL_NAME}/kernel.json - -# create link to kernel specs -mkdir -p ${KERNEL_SPECS_DIR} -cd ${KERNEL_SPECS_DIR} -ln -s ${VIRTUAL_ENV_KERNELS}/${KERNEL_NAME} . - -echo -e "\n\nThe new kernel '${KERNEL_NAME}' was added to your kernels in '${KERNEL_SPECS_DIR}/'\n" - -#eof diff --git a/scripts/deep_tensorflow/jupyterCreateKernel.sh b/scripts/deep_tensorflow/jupyterCreateKernel.sh deleted file mode 100755 index a0c0ae951637038ac0d4f8d6484184e30d5f5dac..0000000000000000000000000000000000000000 --- a/scripts/deep_tensorflow/jupyterCreateKernel.sh +++ /dev/null @@ -1,123 +0,0 @@ -#!/bin/bash -# -*- coding: utf-8 -*- -# author: EI -# version: 220408a -# creates machine specific jupyter kernel - -# get sys info -sysN="$(uname -n | cut -f2- -d.)" -cDir=$PWD -export TMPDIR=$PWD -echo "system:${sysN}" -echo - -# set modules -ml --force purge -if [ "$sysN" = 'deepv' ] ; then - ml use $OTHERSTAGES - ml Stages/2022 GCC OpenMPI cuDNN NCCL Python CMake -elif [ "$sysN" = 'jureca' ] ; then - ml Stages/2022 GCC OpenMPI Python NCCL cuDNN libaio CMake -else - echo - echo 'unknown system detected' - echo 'canceling' - echo -fi -echo "modules loaded" -echo - -# kernel info -KERNEL_NAME=kernel_${sysN} -KERNEL_SPECS_PREFIX=${HOME}/.local -KERNEL_SPECS_DIR=${KERNEL_SPECS_PREFIX}/share/jupyter/kernels - -# get python version -pver="$(python --version 2>&1 | awk {'print $2'} | cut -f1-2 -d.)" -echo "python version is ${pver}" -echo - -# create and activate a virtual environment for the kernel -if [ -d "${cDir}/kernelAI_${sysN}" ];then - echo 'env already existi:' - - source ${cDir}/kernelAI_${sysN}/bin/activate - export PYTHONPATH=${VIRTUAL_ENV}/lib/python${pver}/site-packages:${PYTHONPATH} -else - # create env - python3 -m venv --system-site-packages ${cDir}/kernelAI_${sysN} - - # get headers for pip - if [ -f "${cDir}/kernelAI_${sysN}/bin/pip3" ]; then - echo 'pip already exist' - echo - else - cp "$(which pip3)" $cDir/kernelAI_${sysN}/bin/ - ln -s $cDir/kernelAI_${sysN}/bin/pip3 $cDir/kernelAI_${sysN}/bin/pip${pver} - var="#!$cDir/kernelAI_${sysN}/bin/python${pver}" - sed -i "1s|.*|$var|" $cDir/kernelAI_${sysN}/bin/pip3 - fi - - # activate env - source ${cDir}/kernelAI_${sysN}/bin/activate - export PYTHONPATH=${VIRTUAL_ENV}/lib/python${pver}/site-packages:${PYTHONPATH} -fi -echo 'location of new venv:' -echo ${VIRTUAL_ENV} # double check 
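# what follows mirrors the notebook recipe: write a kernel.sh wrapper that
# loads a base module stack and re-activates this venv, register an ipykernel
# spec inside the venv, rewrite its kernel.json to launch through kernel.sh,
# and symlink the spec into ${KERNEL_SPECS_DIR} so JupyterLab can discover it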
-echo - -# create/Edit launch script for the Jupyter kernel -if [ -f "${VIRTUAL_ENV}/kernel.sh" ];then - echo "kernel.sh exist!" -else - echo '#!/bin/bash'" - -# Load basic Python module -ml GCC ParaStationMPI Python - -# Activate your Python virtual environment -source ${VIRTUAL_ENV}/bin/activate - -# Ensure python packages installed in the virtual environment are always prefered -export PYTHONPATH=${VIRTUAL_ENV}/lib/python${pver}/site-packages:"'${PYTHONPATH}'" - -exec python3 -m ipykernel "'$@' > ${VIRTUAL_ENV}/kernel.sh - chmod +x ${VIRTUAL_ENV}/kernel.sh - - echo 'kernel.sh:' - cat ${VIRTUAL_ENV}/kernel.sh # double check -fi - -# create Jupyter kernel configuration directory and files -pip3 install --ignore-installed ipykernel --no-cache-dir -${VIRTUAL_ENV}/bin/python3 -m ipykernel install --name=${KERNEL_NAME} --prefix ${VIRTUAL_ENV} -VIRTUAL_ENV_KERNELS=${VIRTUAL_ENV}/share/jupyter/kernels - -# adjust kernel.json file -mv ${VIRTUAL_ENV_KERNELS}/${KERNEL_NAME}/kernel.json ${VIRTUAL_ENV_KERNELS}/${KERNEL_NAME}/kernel.json.orig # backup -echo '{ - "argv": [ - "'${VIRTUAL_ENV}/kernel.sh'", - "-m", - "ipykernel_launcher", - "-f", - "{connection_file}" - ], - "display_name": "'${KERNEL_NAME}'", - "language": "python" -}' > ${VIRTUAL_ENV_KERNELS}/${KERNEL_NAME}/kernel.json - -# create link to kernel specs -mkdir -p ${KERNEL_SPECS_DIR} -cd ${KERNEL_SPECS_DIR} -ln -s ${VIRTUAL_ENV_KERNELS}/${KERNEL_NAME} . - -echo -e "\n\nThe new kernel '${KERNEL_NAME}' was added to your kernels in '${KERNEL_SPECS_DIR}/'\n" - -echo 'load this env as: -ml --force purge -ml use $OTHERSTAGES -ml Stages/2022 GCC OpenMPI cuDNN NCCL Python CMake -source ${cDir}/kernelAI_${sysN}/bin/activate' - -#eof diff --git a/scripts/deep_tensorflow/lamec.json b/scripts/deep_tensorflow/lamec.json deleted file mode 100644 index 8c582bbd9f2365d3a43ac8c11f538f5b2a79c694..0000000000000000000000000000000000000000 --- a/scripts/deep_tensorflow/lamec.json +++ /dev/null @@ -1 +0,0 @@ -{"startscript": "TF_startscript_deep.sh"} \ No newline at end of file diff --git a/scripts/deep_tensorflow/tensorflow2_synthetic_benchmark.py b/scripts/deep_tensorflow/tensorflow2_synthetic_benchmark.py deleted file mode 100644 index 04b45c1bb4ca8773d1be6687e067c3136e9a95af..0000000000000000000000000000000000000000 --- a/scripts/deep_tensorflow/tensorflow2_synthetic_benchmark.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright 2019 Uber Technologies, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -import argparse -import os -import numpy as np -import timeit - -import tensorflow as tf -import horovod.tensorflow as hvd -from tensorflow.keras import applications - -# Benchmark settings -parser = argparse.ArgumentParser(description='TensorFlow Synthetic Benchmark', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument('--fp16-allreduce', action='store_true', default=False, - help='use fp16 compression during allreduce') - -parser.add_argument('--model', type=str, default='ResNet50', - help='model to benchmark') -parser.add_argument('--batch-size', type=int, default=32, - help='input batch size') - -parser.add_argument('--num-warmup-batches', type=int, default=10, - help='number of warm-up batches that don\'t count towards benchmark') -parser.add_argument('--num-batches-per-iter', type=int, default=10, - help='number of batches per benchmark iteration') -parser.add_argument('--num-iters', type=int, default=100, - help='number of benchmark iterations') - -parser.add_argument('--no-cuda', action='store_true', default=False, - help='disables CUDA training') - - -args = parser.parse_args() -args.cuda = not args.no_cuda - -# Horovod: initialize Horovod. -hvd.init() - -# Horovod: pin GPU to be used to process local rank (one GPU per process) -if args.cuda: - gpus = tf.config.experimental.list_physical_devices('GPU') - for gpu in gpus: - tf.config.experimental.set_memory_growth(gpu, True) - if gpus: - tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') -else: - os.environ["CUDA_VISIBLE_DEVICES"] = "-1" - -# Set up standard model. -model = getattr(applications, args.model)(weights=None) -opt = tf.optimizers.SGD(0.01) - -data = tf.random.uniform([args.batch_size, 224, 224, 3]) -target = tf.random.uniform([args.batch_size, 1], minval=0, maxval=999, dtype=tf.int64) - - -@tf.function -def benchmark_step(first_batch): - # Horovod: (optional) compression algorithm. - compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none - - # Horovod: use DistributedGradientTape - with tf.GradientTape() as tape: - probs = model(data, training=True) - loss = tf.losses.sparse_categorical_crossentropy(target, probs) - - # Horovod: add Horovod Distributed GradientTape. - tape = hvd.DistributedGradientTape(tape, compression=compression) - - gradients = tape.gradient(loss, model.trainable_variables) - opt.apply_gradients(zip(gradients, model.trainable_variables)) - - # Horovod: broadcast initial variable states from rank 0 to all other processes. - # This is necessary to ensure consistent initialization of all workers when - # training is started with random weights or restored from a checkpoint. - # - # Note: broadcast should be done after the first gradient step to ensure optimizer - # initialization. 
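    # (tf.optimizers create their slot variables lazily on the first
    #  apply_gradients call, so broadcasting earlier would miss them)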
- if first_batch: - hvd.broadcast_variables(model.variables, root_rank=0) - hvd.broadcast_variables(opt.variables(), root_rank=0) - - -def log(s, nl=True): - if hvd.rank() != 0: - return - print(s, end='\n' if nl else '') - - -log('Model: %s' % args.model) -log('Batch size: %d' % args.batch_size) -device = 'GPU' if args.cuda else 'CPU' -log('Number of %ss: %d' % (device, hvd.size())) - - -with tf.device(device): - # Warm-up - log('Running warmup...') - benchmark_step(first_batch=True) - timeit.timeit(lambda: benchmark_step(first_batch=False), - number=args.num_warmup_batches) - - # Benchmark - log('Running benchmark...') - img_secs = [] - for x in range(args.num_iters): - time = timeit.timeit(lambda: benchmark_step(first_batch=False), - number=args.num_batches_per_iter) - img_sec = args.batch_size * args.num_batches_per_iter / time - log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device)) - img_secs.append(img_sec) - - # Results - img_sec_mean = np.mean(img_secs) - img_sec_conf = 1.96 * np.std(img_secs) - log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf)) - log('Total img/sec on %d %s(s): %.1f +-%.1f' % - (hvd.size(), device, hvd.size() * img_sec_mean, hvd.size() * img_sec_conf)) diff --git a/scripts/jureca_ddp/DDP_startscript.sh b/scripts/jureca_ddp/DDP_startscript.sh deleted file mode 100644 index 5d8409a7305de83835ae377d57fc3466c86f8615..0000000000000000000000000000000000000000 --- a/scripts/jureca_ddp/DDP_startscript.sh +++ /dev/null @@ -1,107 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=TorchTest -#SBATCH --account=zam -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job.out -#SBATCH --error=job.err -#SBATCH --time=00:15:00 - -# configure node and process count on the CM -#SBATCH --partition=dc-gpu-devel -#SBATCH --nodes=4 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --gpus-per-node=4 -#SBATCH --exclusive - -# gres options have to be disabled for deepv -#SBATCH --gres=gpu:4 - -# parameters -debug=false # do debug -bs=32 # batch-size -epochs=5 # epochs -lr=0.01 # learning rate - -# AT -dataDir="/p/scratch/raise-ctp2/T31_LD/" -COMMAND="DDP_pytorch_AT.py" -EXEC="$COMMAND \ - --batch-size $bs \ - --epochs $epochs \ - --lr $lr \ - --nworker $SLURM_CPUS_PER_TASK \ - --data-dir $dataDir" - - -### do not modify below ### - - -# set modules -ml --force purge -ml Stages/2022 NVHPC/22.3 ParaStationMPI/5.5.0-1-mt NCCL/2.12.7-1-CUDA-11.5 cuDNN/8.3.1.22-CUDA-11.5 -ml Python/3.9.6 libaio/0.3.112 HDF5/1.12.1-serial mpi-settings/CUDA - -# set env -source /p/project/raise-ctp1/RAISE/envAI_jureca/bin/activate - -# sleep a sec -sleep 1 - -# job info -echo "DEBUG: TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# set comm -export CUDA_VISIBLE_DEVICES="0,1,2,3" -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" > 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi - -# launch -srun --cpu-bind=none bash -c "torchrun \ - --log_dir='logs' \ - --nnodes=$SLURM_NNODES \ - 
--nproc_per_node=$SLURM_GPUS_PER_NODE \ - --rdzv_id=$SLURM_JOB_ID \ - --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ - --rdzv_backend=c10d \ - --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \ - $EXEC" - -# add --globres=fs:cscratch@just flag to l. 78 if High Performance Storage Tier (HPST) - -# nsys profiler: following https://gist.github.com/mcarilli/376821aa1a7182dfcf59928a7cde3223 -#srun --cpu-bind=none nsys profile \ -# --trace=cublas,cuda,cudnn,nvtx,osrt \ -# --sample=cpu \ -# --stats=true \ -# --force-overwrite=true \ -# -o ./prof.out bash -c "torchrun \ -# --log_dir='logs' \ -# --nnodes=$SLURM_NNODES \ -# --nproc_per_node=$SLURM_GPUS_PER_NODE \ -# --rdzv_id=$SLURM_JOB_ID \ -# --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ -# --rdzv_backend=c10d \ -# --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \ -# $EXEC" - -# eof diff --git a/scripts/jureca_ddp/DDP_startscript_container.sh b/scripts/jureca_ddp/DDP_startscript_container.sh deleted file mode 100644 index 377d16ed5c248d957dbe5a96180d3f613f9f6257..0000000000000000000000000000000000000000 --- a/scripts/jureca_ddp/DDP_startscript_container.sh +++ /dev/null @@ -1,81 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=AMDTorchTest -#SBATCH --account=zam -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job.out -#SBATCH --error=job.err -#SBATCH --time=00:15:00 - -# configure node and process count on the CM -#SBATCH --partition=dc-mi200 -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=12 -#SBATCH --exclusive - -# parameters -debug=false # do debug -bs=32 # batch-size -epochs=5 # epochs -lr=0.01 # learning rate - -# AT -dataDir="/p/scratch/raise-ctp1/T31_LD/" -COMMAND="DDP_pytorch_AT.py" -EXEC="$COMMAND \ - --batch-size $bs \ - --epochs $epochs \ - --lr $lr \ - --nworker $SLURM_CPUS_PER_TASK \ - --data-dir $dataDir" - - -### do not modify below ### - - -# set modules -ml Architecture/jureca_mi200 -ml GCC/11.2.0 OpenMPI/4.1.4 ROCm/5.3.0 CMake/3.23.1 -ml UCX-settings/RC-ROCm - -# set env variables -export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -mkdir -p $SLURM_SUBMIT_DIR/tmp -export MIOPEN_USER_DB_PATH=$SLURM_SUBMIT_DIR/tmp -export NCCL_DEBUG=WARN - -# job info -echo "DEBUG: TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# launch container -srun --cpu-bind=none bash -c "apptainer exec --rocm \ - torch_rocm_docker.sif \ - python -m fixed_torch_run \ - --log_dir='logs' \ - --nnodes=$SLURM_NNODES \ - --nproc_per_node=8 \ - --rdzv_id=$SLURM_JOB_ID \ - --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ - --rdzv_backend=c10d \ - --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \ - $EXEC" - -#eof diff --git a/scripts/jureca_ddp/README.md b/scripts/jureca_ddp/README.md deleted file mode 100644 index 
df2234127426006d3f27a8663adc397257d7c6d6..0000000000000000000000000000000000000000 --- a/scripts/jureca_ddp/README.md +++ /dev/null @@ -1,39 +0,0 @@ -# DL using DDP on jureca dc gpu - -# DDP source -https://github.com/pytorch/pytorch#from-source - -# jureca user documentation -https://apps.fz-juelich.de/jsc/hps/jureca/index.html - -# current isues -1. torchrun: Hostname/endpoint mismatch not handled\ -workaround is to modify torchrun and use included batch script\ -simply run `createEnv.sh` to install fixed torch\ -discussion in: https://github.com/pytorch/pytorch/issues/73656 -2. for containers, instead of #1, use `fixed_torch_run.py` -- follow usage - containers. - -# to-do -1. - -# done -1. tested containers (for both NVIDIA & AMD GPUs):\ -https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch \ -https://www.amd.com/en/technologies/infinity-hub/pytorch \ -https://hub.docker.com/r/rocm/pytorch - - -# usage - Python Env -1. run `./createEnv.sh` to create env and install torch -2. select a case from CASES folder -3. submit `sbatch DDP_startscript.sh` - -# usage - containers (note this for AMD partition - modify for NVIDIA) -1. run `./createContainer.sh` to use and build Torch/ROCm container -2. select a case from CASES folder -3. submit `sbatch DDP_startscript_container.sh` - -# usage - Source Code -1. run `./createEnv_MPI.sh` to create Conda env and install torch with MPI support -2. select a case from CASES folder -3. submit `sbatch DDP_startscript.sh` diff --git a/scripts/jureca_ddp/createContainer.sh b/scripts/jureca_ddp/createContainer.sh deleted file mode 100644 index 3d7f58402d96f71e0b14a6097a92f0e17c4441ed..0000000000000000000000000000000000000000 --- a/scripts/jureca_ddp/createContainer.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash -# -*- coding: utf-8 -*- -# author: EI -# version: 2212008a -# pull and build containers for PyTorch/ROCm - -# load modules -ml Architecture/jureca_mi200 -ml GCC/11.2.0 OpenMPI/4.1.2 ROCm/5.3.0 CMake/3.23.1 -ml UCX-settings/RC-ROCm - -# create Cache/TMP so that $HOME would not be used -mkdir -p Cache -mkdir -p TMP -export APPTAINER_CACHEDIR=$(mktemp -d -p $PWD/Cache) -export APPTAINER_TMPDIR=$(mktemp -d -p $PWD/TMP) - -# official AMD container with Torch==1.10.0 -# apptainer pull torch_rocm_amd.sif docker://amdih/pytorch:rocm5.0_ubuntu18.04_py3.7_pytorch_1.10.0 - -# docker AMD container with Torch==1.12.1 -apptainer pull torch_rocm_docker.sif docker://rocm/pytorch - -#eof diff --git a/scripts/jureca_ddp/createEnv.sh b/scripts/jureca_ddp/createEnv.sh deleted file mode 100755 index 9d635bea23e61f3fbbab46bc42d9a0078b2e7e3b..0000000000000000000000000000000000000000 --- a/scripts/jureca_ddp/createEnv.sh +++ /dev/null @@ -1,183 +0,0 @@ -#!/bin/bash -# -*- coding: utf-8 -*- -# author: EI -# version: 220328a -# creates machine specific python env - -# set modules -ml --force purge - -# get sys info -cDir=$PWD -sysN="$(uname -n | cut -f2- -d.)" -echo "system:${sysN}" -echo - -cont1=false -if [ "$sysN" = 'deepv' ] ; then - ml use $OTHERSTAGES - ml Stages/2022 NVHPC/22.1 OpenMPI/4.1.2 NCCL/2.15.1-1-CUDA-11.5 cuDNN/8.3.1.22-CUDA-11.5 - ml Python/3.9.6 HDF5 CMake - ml -nvidia-driver/.default - cont1=true -elif [ "$sysN" = 'juwels' ] ; then - ml GCC ParaStationMPI Python CMake - cont1=true -elif [ "$sysN" = 'jureca' ] ; then - ml Stages/2022 NVHPC/22.1 ParaStationMPI/5.5.0-1-mt NCCL/2.11.4-CUDA-11.5 cuDNN/8.3.1.22-CUDA-11.5 - ml Python/3.9.6 CMake HDF5 PnetCDF libaio/0.3.112 mpi-settings/CUDA - cont1=true -else - echo - echo 'unknown system detected' - echo 
'canceling' - echo -fi -echo "modules loaded" -echo - -# get python version -pver="$(python --version 2>&1 | awk {'print $2'} | cut -f1-2 -d.)" -echo "python version is ${pver}" -echo - -# create env -if [ "$cont1" = true ] ; then - if [ -d "${cDir}/envAI_${sysN}" ];then - echo 'env already exist' - echo - - source envAI_${sysN}/bin/activate - else - python3 -m venv envAI_${sysN} - - # get headers for pip - if [ -f "${cDir}/envAI_${sysN}/bin/pip3" ]; then - echo 'pip already exist' - else - cp "$(which pip3)" $cDir/envAI_${sysN}/bin/ - ln -s $cDir/envAI_${sysN}/bin/pip3 $cDir/envAI_${sysN}/bin/pip${pver} - var="#!$cDir/envAI_${sysN}/bin/python${pver}" - sed -i "1s|.*|$var|" $cDir/envAI_${sysN}/bin/pip3 - fi - - # activate env - source envAI_${sysN}/bin/activate - - echo "a new env is created in ${cDir}" - echo "activation is done via:" - echo "source ${cDir}/envAI_${sysN}/bin/activate" - fi -fi - -# install torch -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - echo 'Torch already installed' - echo -else - export TMPDIR=${cDir} - - pip3 install \ - torch==1.11.0+cu115 torchvision==0.12.0+cu115 torchaudio==0.11.0+cu115 -f \ - https://download.pytorch.org/whl/cu115/torch_stable.html --no-cache-dir -fi - -# install horovod -if [ -f "${cDir}/envAI_${sysN}/bin/horovodrun" ]; then - echo 'Horovod already installed' - echo -else - export HOROVOD_WITH_MPI=1 - export HOROVOD_MPI_THREADS_DISABLE=1 - export HOROVOD_GPU=CUDA - export HOROVOD_GPU_OPERATIONS=NCCL - export HOROVOD_CUDA_HOME=$EBROOTCUDA - export HOROVOD_NCCL_HOME=$EBROOTNCCL - export HOROVOD_WITH_PYTORCH=1 - export TMPDIR=${cDir} - - pip3 install --no-cache-dir horovod --ignore-installed -fi - -# install deepspeed -if [ -f "${cDir}/envAI_${sysN}/bin/deepspeed" ]; then - echo 'DeepSpeed already installed' - echo -else - # compile all opt. stuff - not needed & not working - #export DS_BUILD_OPS=1 - # compile req. opt. stuff - export DS_BUILD_FUSED_ADAM=1 - export DS_BUILD_UTILS=1 - if [ "$sysN" = 'deepv' ] ; then - #fix libaio issues via: - export DS_BUILD_AIO=0 - fi - export TMPDIR=${cDir} - - pip3 install --no-cache-dir DeepSpeed - - # add this to .../deepspeed/launcher/launch.py l.93 - var=' args.node_rank=int(os.environ.get("SLURM_PROCID",0))' - sed -i "93s|.*|$var|" $cDir/envAI_${sysN}/lib/python${pver}/site-packages/deepspeed/launcher/launch.py -fi - -# install heat -if [ -d "${cDir}/envAI_${sysN}/lib/python${pver}/site-packages/heat" ]; then - echo 'HeAT already installed' - echo -else - export TMPDIR=${cDir} - export CFLAGS="-noswitcherror" - export CXXFLAGS="-noswitcherror" - - # need to modify setup.py to accep torch>1.9 for heat - wget https://files.pythonhosted.org/packages/5d/3a/4781f1e6910753bfdfa6712c83c732c60e675d8de14983926a0d9306c7a6/heat-1.1.1.tar.gz - tar xzf heat-1.1.1.tar.gz - var=' "torch>=1.7.0",' - sed -i "36s|.*|$var|" heat-1.1.1/setup.py - var=' "torchvision>=0.8.0",' - sed -i "39s|.*|$var|" heat-1.1.1/setup.py - - # create tar again! 
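# (The archive is rebuilt because pip installs heat from the local tarball
# below, so the patched setup.py must end up inside heat-1.1.1.tar.gz for the
# relaxed torch/torchvision pins to take effect.)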
- rm -rf heat-1.1.1.tar.gz - tar czf heat-1.1.1.tar.gz heat-1.1.1 - rm -rf heat-1.1.1 - - pip3 install --no-cache-dir 'heat-1.1.1.tar.gz[hdf5,netcdf]' - - rm -rf heat-1.1.1.tar.gz -fi - -# get rest of the libraries$ -if [ "$cont1" = true ] ; then - # install rest - pip3 install -r reqs.txt --ignore-installed - - # modify l.4 of /torchnlp/_third_party/weighted_random_sampler.py - var='int_classes = int' - sed -i "4s|.*|$var|" \ - $cDir/envAI_${sysN}/lib/python${pver}/site-packages/torchnlp/_third_party/weighted_random_sampler.py -fi - -# fix IB IP config -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - sed -i -e '3,8s/^/#/' ${cDir}/envAI_${sysN}/bin/torchrun - echo """ -import re -import sys -from torch.distributed.run import main -from torch.distributed.elastic.agent.server import api as sapi - -def new_get_fq_hostname(): - return _orig_get_fq_hostname().replace('.', 'i.', 1) - -if __name__ == '__main__': - _orig_get_fq_hostname = sapi._get_fq_hostname - sapi._get_fq_hostname = new_get_fq_hostname - sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) - sys.exit(main()) -""" >> ${cDir}/envAI_${sysN}/bin/torchrun -fi - -#eof diff --git a/scripts/jureca_ddp/createEnv_MPI.sh b/scripts/jureca_ddp/createEnv_MPI.sh deleted file mode 100644 index 2b35e8fa56def98771076f80ef348a55a4aeaa0e..0000000000000000000000000000000000000000 --- a/scripts/jureca_ddp/createEnv_MPI.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash -# -*- coding: utf-8 -*- -# author: EI -# version: 221026a -# creates machine specific PyTorch with MPI support using Conda -# use compute node to compile! - -# jureca modules -ml --force purge -ml Stages/2022 GCC/11.2.0 ParaStationMPI/5.5.0-1 NCCL/2.12.7-1-CUDA-11.5 -ml cuDNN/8.3.1.22-CUDA-11.5 libaio/0.3.112 mpi-settings/CUDA CMake/3.21.1 -ml Ninja-Python/1.10.2 - -# get CUDA version in the system -CUDA_ver="$(echo $EBVERSIONCUDA 2>&1 | tr -d .)" - -# miniconda -download=false -if [ -d "$PWD/miniconda3" ];then - echo "miniconda3 already installed!" -else - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh - bash Miniconda3-latest-Linux-x86_64.sh -p $PWD/miniconda3 -b - download=true -fi - -if [ "$download" = true ] ; then - # std libs - conda install -y astunparse numpy pyyaml mkl mkl-include setuptools cffi \ - typing_extensions future six requests dataclasses Pillow --force-reinstall - - # cuda support (v11.5) - conda install -c pytorch -y magma-cuda$CUDA_ver --force-reinstall - conda install -y pkg-config libuv --force-reinstall - - # fix older library issue - cp $EBROOTGCC/lib64/libstdc++.so.6.0.29 $CONDA_PREFIX/lib/ - pushd $CONDA_PREFIX/lib/ - rm -f libstdc++.so.6 - ln -s libstdc++.so.6.0.29 libstdc++.so.6 - popd -fi - -# enable Conda env -source $PWD/miniconda3/etc/profile.d/conda.sh -conda activate - -# pytorch with mpi support -if [ -d "$PWD/pytorch/build/test.dat" ];then - echo 'pytorch already installed!' 
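# (Note: the guard above applies -d, a directory test, to
# $PWD/pytorch/build/test.dat; if that marker is a regular file, -f is
# presumably the intended check, otherwise the source build below is redone
# on every run.)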
-else - git clone --recursive https://github.com/pytorch/pytorch pytorch - pushd pytorch - rm -rf build - git submodule sync - git submodule update --init --recursive - - # install pytorch with custom flags - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} - - mkdir tmp - export TMPDIR=$PWD/tmp - export CUDA_HOME=$CUDA_HOME - python3 setup.py clean - CMAKE_C_COMPILER=$(which mpicc) CMAKE_CXX_COMPILER=$(which mpicxx) \ - USE_DISTRIBUTED=ON USE_MPI=ON CUDA_ROOT_DIR=$EBROOTCUDA USE_CUDA=ON \ - NCCL_ROOT_DIR=$EBROOTNCCL USE_NCCL=ON USE_GLOO=ON \ - CUDNN_ROOT=$EBROOTCUDNN USE_CUDNN=ON \ - python3 setup.py install - popd -fi - -#eof diff --git a/scripts/jureca_ddp/fixed_torch_run.py b/scripts/jureca_ddp/fixed_torch_run.py deleted file mode 100644 index cca970624b086399b5cc01f949d4881b191bb950..0000000000000000000000000000000000000000 --- a/scripts/jureca_ddp/fixed_torch_run.py +++ /dev/null @@ -1,51 +0,0 @@ -from argparse import ArgumentParser -import ipaddress -import runpy -import socket - -from torch.distributed.elastic.agent.server import api as sapi - - -def parse_host(): - parser = ArgumentParser() - parser.add_argument('--rdzv_endpoint') - endpoint = parser.parse_known_args()[0].rdzv_endpoint - host = ( - endpoint.split(':', 1)[0] - if endpoint - else None - ) - return host - - -def fix_torch_run(host): - _orig_get_fq_hostname = sapi._get_fq_hostname - - if host: - try: - ipaddress.ip_address(host) - is_ip = True - except ValueError: - is_ip = False - - if is_ip: - def new_get_fq_hostname(): - return socket.gethostbyaddr(host)[0] - else: - def new_get_fq_hostname(): - return socket.getfqdn(host) - else: - new_get_fq_hostname = _orig_get_fq_hostname - - sapi._get_fq_hostname = new_get_fq_hostname - - -def main(): - host = parse_host() - fix_torch_run(host) - runpy.run_module('torch.distributed.run', run_name='__main__') - - -if __name__ == '__main__': - main() - diff --git a/scripts/jureca_ddp/lamec.json b/scripts/jureca_ddp/lamec.json deleted file mode 100644 index a36ad5345ea21cdae8672d53fd40e52ea1cada36..0000000000000000000000000000000000000000 --- a/scripts/jureca_ddp/lamec.json +++ /dev/null @@ -1 +0,0 @@ -{"startscript": "DDP_startscript.sh"} \ No newline at end of file diff --git a/scripts/jureca_ddp/reqs.txt b/scripts/jureca_ddp/reqs.txt deleted file mode 100755 index 2d7bb74bbb496829cfebbe2c70a0fd7ec64585c7..0000000000000000000000000000000000000000 --- a/scripts/jureca_ddp/reqs.txt +++ /dev/null @@ -1,11 +0,0 @@ -Pillow -pyparsing -python-dateutil -matplotlib -h5py -pytorch-nlp -pyprof -filelock -scipy -perlin_noise -noise diff --git a/scripts/jureca_deepspeed/DS_config.json b/scripts/jureca_deepspeed/DS_config.json deleted file mode 100644 index ec1f0221568969e8236b1e3ff5f3699f0f68b5b0..0000000000000000000000000000000000000000 --- a/scripts/jureca_deepspeed/DS_config.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "train_micro_batch_size_per_gpu": 96, - "gradient_accumulation_steps": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.01 - } - }, - "fp16": { - "enabled": false - }, - "zero_optimization": false -} diff --git a/scripts/jureca_deepspeed/DS_startscript_deep.sh b/scripts/jureca_deepspeed/DS_startscript_deep.sh deleted file mode 100644 index 2578883eb3c1c2f29b26224dda86276fd4c26c36..0000000000000000000000000000000000000000 --- a/scripts/jureca_deepspeed/DS_startscript_deep.sh +++ /dev/null @@ -1,96 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=DStest -#SBATCH --account=raise-ctp1 -#SBATCH --mail-user= 
-#SBATCH --mail-type=ALL -#SBATCH --output=job.out -#SBATCH --error=job.err -#SBATCH --time=02:00:00 - -# configure node and process count on the CM -#SBATCH --partition=dc-gpu -#SBATCH --nodes=2 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --gpus-per-node=4 -#SBATCH --exclusive - -# gres options have to be disabled for deepv -#SBATCH --gres=gpu:4 - -# parameters -debug=false # do nccl debug -epochs=1 # epochs -be='nccl' # backend -lr=0.001 # learning rate -bs=2 # batch-size - -# AT -dataDir='/p/scratch/raise-ctp1/inanc2/T31_LD/' -COMMAND="DS_pytorch_AT.py" - -EXEC=$COMMAND" --batch-size $bs - --epochs $epochs - --backend $be - --nworker $SLURM_CPUS_PER_TASK - --benchrun - --data-dir $dataDir" - -# set modules -ml --force purge -ml Stages/2022 NVHPC/22.3 ParaStationMPI/5.5.0-1-mt NCCL/2.12.7-1-CUDA-11.5 cuDNN/8.3.1.22-CUDA-11.5 -ml Python/3.9.6 libaio/0.3.112 HDF5/1.12.1-serial mpi-settings/CUDA - -# set env -source /p/project/raise-ctp1/RAISE/envAI_jureca/bin/activate - -# sleep a sec -sleep 1 - -# job info -echo "DEBUG: TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# set comm -export CUDA_VISIBLE_DEVICES="0,1,2,3" -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" > 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi - -#### do not change this part -# create node-list -sysN=$(eval "scontrol show hostnames") -for i in $sysN; do - x+=\"$i\":[$CUDA_VISIBLE_DEVICES], -done -WID=`echo {${x::-1}} | base64 -w 0` - -# modify config file with parameters -sed -i "2s|.*| \"train_micro_batch_size_per_gpu\": ${bs},|" DS_config.json -sed -i "7s|.*| \"lr\": ${lr}|" DS_config.json -#### - -# launch -srun python -m deepspeed.launcher.launch \ - --node_rank $SLURM_PROCID \ - --master_addr ${SLURMD_NODENAME}i \ - --master_port 29500 \ - --world_info $WID \ - $EXEC --deepspeed_mpi --deepspeed_config DS_config.json - -# eof diff --git a/scripts/jureca_deepspeed/README.md b/scripts/jureca_deepspeed/README.md deleted file mode 100644 index d0a70f88c2f04545509951f048bc24f6fad30e63..0000000000000000000000000000000000000000 --- a/scripts/jureca_deepspeed/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# DL using DeepSpeed on Jureca DC - -# source -https://github.com/microsoft/DeepSpeed - -# current isues -1. - -# to-do -1. - -# usage - pip -1. clone -2. run `./createENV.sh` -3. 
submit `sbatch DS_startscript_deep.sh` diff --git a/scripts/jureca_deepspeed/createEnv.sh b/scripts/jureca_deepspeed/createEnv.sh deleted file mode 100755 index e5cc3af47484d29cf67a2dc137c219b1890161bf..0000000000000000000000000000000000000000 --- a/scripts/jureca_deepspeed/createEnv.sh +++ /dev/null @@ -1,180 +0,0 @@ -#!/bin/bash -# -*- coding: utf-8 -*- -# author: EI -# version: 220328a -# creates machine specific python env - -# set modules -ml --force purge - -# get sys info -cDir=$PWD -sysN="$(uname -n | cut -f2- -d.)" -echo "system:${sysN}" -echo - -cont1=false -if [ "$sysN" = 'deepv' ] ; then - ml use $OTHERSTAGES - ml Stages/2022 GCC OpenMPI cuDNN NCCL Python CMake - cont1=true -elif [ "$sysN" = 'juwels' ] ; then - ml GCC ParaStationMPI Python CMake - cont1=true -elif [ "$sysN" = 'jureca' ] ; then - ml Stages/2022 NVHPC ParaStationMPI/5.5.0-1-mt Python CMake NCCL/2.11.4-CUDA-11.5 cuDNN libaio HDF5 PnetCDF mpi-settings/CUDA - cont1=true -else - echo - echo 'unknown system detected' - echo 'canceling' - echo -fi -echo "modules loaded" -echo - -# get python version -pver="$(python --version 2>&1 | awk {'print $2'} | cut -f1-2 -d.)" -echo "python version is ${pver}" -echo - -# create env -if [ "$cont1" = true ] ; then - if [ -d "${cDir}/envAI_${sysN}" ];then - echo 'env already exist' - echo - - source envAI_${sysN}/bin/activate - else - python3 -m venv envAI_${sysN} - - # get headers for pip - if [ -f "${cDir}/envAI_${sysN}/bin/pip3" ]; then - echo 'pip already exist' - else - cp "$(which pip3)" $cDir/envAI_${sysN}/bin/ - ln -s $cDir/envAI_${sysN}/bin/pip3 $cDir/envAI_${sysN}/bin/pip${pver} - var="#!$cDir/envAI_${sysN}/bin/python${pver}" - sed -i "1s|.*|$var|" $cDir/envAI_${sysN}/bin/pip3 - fi - - # activate env - source envAI_${sysN}/bin/activate - - echo "a new env is created in ${cDir}" - echo "activation is done via:" - echo "source ${cDir}/envAI_${sysN}/bin/activate" - fi -fi - -# install torch -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - echo 'Torch already installed' - echo -else - export TMPDIR=${cDir} - - pip3 install \ - torch==1.11.0+cu115 torchvision==0.12.0+cu115 torchaudio==0.11.0+cu115 -f \ - https://download.pytorch.org/whl/cu115/torch_stable.html --no-cache-dir -fi - -# install horovod -if [ -f "${cDir}/envAI_${sysN}/bin/horovodrun" ]; then - echo 'Horovod already installed' - echo -else - export HOROVOD_WITH_MPI=1 - export HOROVOD_MPI_THREADS_DISABLE=1 - export HOROVOD_GPU=CUDA - export HOROVOD_GPU_OPERATIONS=NCCL - export HOROVOD_CUDA_HOME=$EBROOTCUDA - export HOROVOD_NCCL_HOME=$EBROOTNCCL - export HOROVOD_WITH_PYTORCH=1 - export TMPDIR=${cDir} - - pip3 install --no-cache-dir horovod --ignore-installed -fi - -# install deepspeed -if [ -f "${cDir}/envAI_${sysN}/bin/deepspeed" ]; then - echo 'DeepSpeed already installed' - echo -else - # compile all opt. stuff - not needed & not working - #export DS_BUILD_OPS=1 - # compile req. opt. 
stuff - export DS_BUILD_FUSED_ADAM=1 - export DS_BUILD_UTILS=1 - if [ "$sysN" = 'deepv' ] ; then - #fix libaio issues via: - export DS_BUILD_AIO=0 - fi - export TMPDIR=${cDir} - - pip3 install --no-cache-dir DeepSpeed - - # add this to .../deepspeed/launcher/launch.py l.93 - var=' args.node_rank=int(os.environ.get("SLURM_PROCID",0))' - sed -i "93s|.*|$var|" $cDir/envAI_${sysN}/lib/python${pver}/site-packages/deepspeed/launcher/launch.py -fi - -# install heat -if [ -d "${cDir}/envAI_${sysN}/lib/python${pver}/site-packages/heat" ]; then - echo 'HeAT already installed' - echo -else - export TMPDIR=${cDir} - export CFLAGS="-noswitcherror" - export CXXFLAGS="-noswitcherror" - - # need to modify setup.py to accep torch>1.9 for heat - wget https://files.pythonhosted.org/packages/5d/3a/4781f1e6910753bfdfa6712c83c732c60e675d8de14983926a0d9306c7a6/heat-1.1.1.tar.gz - tar xzf heat-1.1.1.tar.gz - var=' "torch>=1.7.0",' - sed -i "36s|.*|$var|" heat-1.1.1/setup.py - var=' "torchvision>=0.8.0",' - sed -i "39s|.*|$var|" heat-1.1.1/setup.py - - # create tar again! - rm -rf heat-1.1.1.tar.gz - tar czf heat-1.1.1.tar.gz heat-1.1.1 - rm -rf heat-1.1.1 - - pip3 install --no-cache-dir 'heat-1.1.1.tar.gz[hdf5,netcdf]' - - rm -rf heat-1.1.1.tar.gz -fi - -# get rest of the libraries$ -if [ "$cont1" = true ] ; then - # install rest - pip3 install -r reqs.txt --ignore-installed - - # modify l.4 of /torchnlp/_third_party/weighted_random_sampler.py - var='int_classes = int' - sed -i "4s|.*|$var|" \ - $cDir/envAI_${sysN}/lib/python${pver}/site-packages/torchnlp/_third_party/weighted_random_sampler.py -fi - -# fix IB IP config -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - sed -i -e '3,8s/^/#/' ${cDir}/envAI_${sysN}/bin/torchrun - echo """ -import re -import sys -from torch.distributed.run import main -from torch.distributed.elastic.agent.server import api as sapi - -def new_get_fq_hostname(): - return _orig_get_fq_hostname().replace('.', 'i.', 1) - -if __name__ == '__main__': - _orig_get_fq_hostname = sapi._get_fq_hostname - sapi._get_fq_hostname = new_get_fq_hostname - sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) - sys.exit(main()) -""" >> ${cDir}/envAI_${sysN}/bin/torchrun -fi - -#eof diff --git a/scripts/jureca_deepspeed/lamec.json b/scripts/jureca_deepspeed/lamec.json deleted file mode 100644 index b1572ed4b5ac84409ff6cb91e575344301c84b95..0000000000000000000000000000000000000000 --- a/scripts/jureca_deepspeed/lamec.json +++ /dev/null @@ -1 +0,0 @@ -{"startscript": "DS_startscript_deep.sh"} \ No newline at end of file diff --git a/scripts/jureca_deepspeed/reqs.txt b/scripts/jureca_deepspeed/reqs.txt deleted file mode 100755 index 8d4888638b41cd595be271f60fb51d8edcf00275..0000000000000000000000000000000000000000 --- a/scripts/jureca_deepspeed/reqs.txt +++ /dev/null @@ -1,8 +0,0 @@ -Pillow -pyparsing -python-dateutil -matplotlib -h5py -pytorch-nlp -pyprof -filelock diff --git a/scripts/jureca_graphcore/GC_pytorch_mnist.py b/scripts/jureca_graphcore/GC_pytorch_mnist.py deleted file mode 100644 index 438cebd72e961f7ae55dc76f503a88ff0276a527..0000000000000000000000000000000000000000 --- a/scripts/jureca_graphcore/GC_pytorch_mnist.py +++ /dev/null @@ -1,346 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# author: EI -# version: 220615a - -# std libs -import argparse, sys, os, time, numpy as np, random -from tqdm import tqdm - -# ml libs -import torch -import torch.distributed as dist -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -from 
torchvision import datasets, transforms - -# Graphcore (GC) additions -import poptorch - -# parsed settings -def pars_ini(): - global args - parser = argparse.ArgumentParser(description='PyTorch MNIST Example') - - # IO parsers - parser.add_argument('--data-dir', default='./', - help='location of the training dataset in the local filesystem') - parser.add_argument('--restart-int', type=int, default=10, - help='restart interval per epoch (default: 10)') - - # model parsers - parser.add_argument('--batch-size', type=int, default=64, - help='input batch size for training (default: 64)') - parser.add_argument('--test-batch-size', type=int, default=64, - help='input batch size for testing (default: 64)') - parser.add_argument('--epochs', type=int, default=10, - help='number of epochs to train (default: 10)') - parser.add_argument('--lr', type=float, default=0.01, - help='learning rate (default: 0.01)') - parser.add_argument('--concM', type=int, default=100, - help='conc MNIST to this factor (default: 1)') - parser.add_argument('--momentum', type=float, default=0.5, - help='momentum in SGD optimizer (default: 0.5)') - parser.add_argument('--shuff', action='store_true', default=False, - help='shuffle dataset (default: False)') - - # debug parsers - parser.add_argument('--testrun', action='store_true', default=False, - help='do a test run with seed (default: False)') - parser.add_argument('--nseed', type=int, default=0, - help='seed integer for reproducibility (default: 0)') - parser.add_argument('--log-int', type=int, default=10, - help='log interval per training') - - # parallel parsers - parser.add_argument('--nworker', type=int, default=0, - help='number of workers in DataLoader (default: 0 - only main)') - parser.add_argument('--prefetch', type=int, default=2, - help='prefetch data in DataLoader (default: 2)') - parser.add_argument('--benchrun', action='store_true', default=False, - help='do a bench run w/o IO (default: False)') - - # GC parsers - """ - Device iteration defines the number of iterations the device should - run over the data before returning to the user. - This is equivalent to running the IPU in a loop over that the specified - number of iterations, with a new batch of data each time. However, increasing - deviceIterations is more efficient because the loop runs on the IPU directly. - """ - parser.add_argument('--device-iterations', type=int, default=50, - help='check code! 
(default: 50)') - - args = parser.parse_args() - - # set minimum of 3 epochs when benchmarking (last epoch produces logs) - args.epochs = 3 if args.epochs < 3 and args.benchrun else args.epochs - -# network -class Block(nn.Module): - def __init__(self, in_channels, num_filters, kernel_size, pool_size): - super(Block, self).__init__() - self.conv = nn.Conv2d(in_channels, - num_filters, - kernel_size=kernel_size) - self.pool = nn.MaxPool2d(kernel_size=pool_size) - self.relu = nn.ReLU() - - def forward(self, x): - x = self.conv(x) - x = self.pool(x) - x = self.relu(x) - return x - -class Network(nn.Module): - def __init__(self): - super(Network, self).__init__() - self.layer1 = Block(1, 10, 5, 2) - self.layer2 = Block(10, 20, 5, 2) - self.layer3 = nn.Linear(320, 50) - self.layer3_act = nn.ReLU() - self.layer3_dropout = torch.nn.Dropout(0.5) - self.layer4 = nn.Linear(50, 10) - # GC - loss is defined in the network - self.loss = nn.NLLLoss() - - def forward(self, x, labels=None): - x = self.layer1(x) - x = self.layer2(x) - x = x.view(-1, 320) - x = self.layer3_act(self.layer3(x)) - x = self.layer4(self.layer3_dropout(x)) - x = nn.functional.log_softmax(x) - if self.training: - return x, self.loss(x, labels) - return x - -# train loop - GC -def train(model, train_loader, epoch): - model.train() - t_list = [] - loss_acc=0 - for batch_idx, (data, target) in enumerate(train_loader): - t = time.perf_counter() - pred,loss = model(data,target) - if batch_idx % args.log_int == 0: - print( - f'Train epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ' - f'({100.0 * batch_idx / len(train_loader):.0f}%)]\t\tLoss: {loss.item():.6f}') - t_list.append(time.perf_counter() - t) - loss_acc+= loss.item() - print('TIMER: train time', sum(t_list) / len(t_list),'s') - return loss_acc - -# test loop - GC -def test(model, test_loader): - model.eval() - test_loss = 0 - for data, labels in test_loader: - output = model(data) - test_loss += accuracy(output, labels) - print('Accuracy on test set: {:0.2f}%'.format(test_loss / len(test_loader)),'\n') - -def accuracy(predictions, labels): - _, ind = torch.max(predictions, 1) - labels = labels[-predictions.size()[0]:] - accuracy = torch.sum(torch.eq(ind, labels)).item() / labels.size()[0] * 100.0 - return accuracy - -# save state of the training -def save_state(model,res_name,is_best): - if is_best: - rt = time.time() - torch.save(model.state_dict(),'./'+res_name) - print(f'DEBUG: state is saved') - -# main -def main(): - # get parse args - pars_ini() - - # get directory - program_dir = os.getcwd() - - # start the time.time for profiling - st = time.time() - - # deterministic testrun - if args.testrun: - torch.manual_seed(args.nseed) - g = torch.Generator() - g.manual_seed(args.nseed) - - # some debug - print('TIMER: initialise:', time.time()-st, 's') - print('DEBUG: sys.version:',sys.version,'\n') - - print('DEBUG: IO parsers:') - print('DEBUG: args.data_dir:',args.data_dir) - print('DEBUG: args.restart_int:',args.restart_int,'\n') - - print('DEBUG: model parsers:') - print('DEBUG: args.batch_size:',args.batch_size) - print('DEBUG: args.test_batch_size:',args.test_batch_size) - print('DEBUG: args.epochs:',args.epochs) - print('DEBUG: args.lr:',args.lr) - print('DEBUG: args.concM:',args.concM) - print('DEBUG: args.momentum:',args.momentum) - print('DEBUG: args.shuff:',args.shuff,'\n') - - print('DEBUG: debug parsers:') - print('DEBUG: args.testrun:',args.testrun) - print('DEBUG: args.nseed:',args.nseed) - print('DEBUG: args.log_int:',args.log_int,'\n') - - 
print('DEBUG: parallel parsers:') - print('DEBUG: args.nworker:',args.nworker) - print('DEBUG: args.prefetch:',args.prefetch) - print('DEBUG: args.benchrun:',args.benchrun,'\n') - - print('DEBUG: GC parsers:') - print('DEBUG: args.device_iterations:',args.device_iterations,'\n') - -# load datasets - data_dir = args.data_dir - mnist_scale = args.concM - largeData = [] - for i in range(mnist_scale): - largeData.append( - datasets.MNIST(data_dir, train=True, download=False, - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) - ) - - # concat data - training_dataset = torch.utils.data.ConcatDataset(largeData) - - mnist_scale = args.concM - largeData = [] - for i in range(mnist_scale): - largeData.append( - datasets.MNIST(data_dir, train=False, download=False, - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) - ) - - # concat data - test_dataset = torch.utils.data.ConcatDataset(largeData) - -# GC - set training options - """ - To accelerate the training deviceIterations=50 is set - data loader will pick 50 batches of data per step. - """ - training_opts = poptorch.Options() - training_opts.deviceIterations(args.device_iterations) - -# GC - data loader provided by PopTorch - args.shuff = args.shuff and not args.testrun - train_loader = poptorch.DataLoader( - options=training_opts, - dataset=training_dataset, - batch_size=args.batch_size, - shuffle=args.shuff, - drop_last=True, - num_workers=args.nworker - ) - - """ - A `poptorch.Options()` instance contains a set of default hyperparameters and options for the IPU. - """ - test_loader = poptorch.DataLoader( - options=poptorch.Options(), - dataset=test_dataset, - batch_size=args.test_batch_size, - num_workers=args.nworker - ) - - print('TIMER: read and concat data:', time.time()-st, 's') - -# create CNN model - model = Network() - -# optimizer - optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) - -# GC - distribute model to IPU - train_model = poptorch.trainingModel( - model, - training_opts, - optimizer=optimizer - ) - -# GC - distribute model to IPU w/o training options (for testing) - test_model = poptorch.inferenceModel(model,options=poptorch.Options()) - -# resume state if any - best_acc = np.Inf - res_name='checkpoint.pth.tar' - start_epoch = 1 - if os.path.isfile(res_name) and not args.benchrun: - try: - checkpoint = torch.load(program_dir+'/'+res_name) - start_epoch = checkpoint['epoch'] - print(f'WARNING: restarting from {start_epoch} epoch') - except: - print(f'WARNING: restart file cannot be loaded, restarting!') - - if start_epoch>=args.epochs+1: - print(f'WARNING: given epochs are less than the one in the restart file!\n' - f'WARNING: SYS.EXIT is issued') - sys.exit() - -# start trainin/testing loop - print('TIMER: initialization:', time.time()-st, 's') - print(f'\nDEBUG: start training') - print(f'--------------------------------------------------------') - - et = time.time() - for epoch in range(start_epoch, args.epochs + 1): - lt = time.time() - - # GC - combines forward + backward - loss_acc = train(train_model, train_loader, epoch) - - # GC - testing - acc_test = test(test_model, test_loader) - - # save first epoch timer - if epoch == start_epoch: - first_ep_t = time.time()-lt - - print('TIMER: epoch time:', time.time()-lt, 's') - -# GC - unload models from IPU - train_model.detachFromDevice() - test_model.detachFromDevice() - -# save final state - if not args.benchrun: - 
save_state(train_model,res_name,True) - - # some debug - print(f'\n--------------------------------------------------------') - print('DEBUG: training results:\n') - print('TIMER: first epoch time:', first_ep_t, ' s') - print('TIMER: last epoch time:', time.time()-lt, ' s') - print('TIMER: average epoch time:', (time.time()-et)/args.epochs, ' s') - print('TIMER: total epoch time:', time.time()-et, ' s') - if epoch > 1: - print('TIMER: total epoch-1 time:', time.time()-et-first_ep_t, ' s') - print('TIMER: average epoch-1 time:', (time.time()-et-first_ep_t)/(args.epochs-1), ' s') - if args.benchrun: - print('TIMER: total epoch-2 time:', lt-first_ep_t, ' s') - print('TIMER: average epoch-2 time:', (lt-first_ep_t)/(args.epochs-2), ' s') - -if __name__ == "__main__": - main() - sys.exit() - -#eof diff --git a/scripts/jureca_graphcore/GC_startscript.sh b/scripts/jureca_graphcore/GC_startscript.sh deleted file mode 100644 index 1c9f928bf0d67d90af81c780a7bde8d21a8d6601..0000000000000000000000000000000000000000 --- a/scripts/jureca_graphcore/GC_startscript.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=GC_test -#SBATCH --account=zam -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job.out -#SBATCH --error=job.err -#SBATCH --time=01:00:00 - -# configure node and process count on the CM -#SBATCH --partition=dc-ipu -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=64 -#SBATCH --exclusive - -srun apptainer run pytorch.sif -- python3 \ - ./GC_pytorch_mnist.py \ - --data-dir /p/scratch/raise-ctp1/data_MNIST/ \ - --nworker $SLURM_CPUS_PER_TASK \ - --concM 100 - -# eof diff --git a/scripts/jureca_graphcore/README.md b/scripts/jureca_graphcore/README.md deleted file mode 100644 index dafa0b7a9f7e0db392f823c25269832bc454e796..0000000000000000000000000000000000000000 --- a/scripts/jureca_graphcore/README.md +++ /dev/null @@ -1,28 +0,0 @@ -# DL using Graphcore IPU - -# Graphcore PyTorch documentation -https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/pytorch_to_poptorch.html# - -# jureca user documentation -https://apps.fz-juelich.de/jsc/hps/jureca/index.html - -# current isues -1. no parallel training - -# to-do -1. implement parallelization - -# done -1. initial mnist tests show 8x better performance than A100 - -# usage -apptainer is used for the containers -0. to use containers in Jureca, (if not done!) from JuDoor, click "Request access to restricted software", then "Access to other restricted software", and accept the agreement! ! finally, reset ssh -1. pull Graphcore SDK `apptainer pull poplar.sif docker://docker.io/graphcore/poplar:2.4.0` -2. build Graphcore SDK with PyTorch `apptainer build pytorch.sif docker://docker.io/graphcore/pytorch` \ -this comes with Torch-1.10.0 -3. additional libraries are needed: \ -`apptainer shell pytorch.sif` -`> pip3 install torchvision==1.11.0 tqdm h5py --user` -`> exit` -4. 
submit `sbatch GC_startscript.sh` diff --git a/scripts/jureca_graphcore/lamec.json b/scripts/jureca_graphcore/lamec.json deleted file mode 100644 index fe05ab67e3c1cffe3df4c751ceec2acfbf9c46b7..0000000000000000000000000000000000000000 --- a/scripts/jureca_graphcore/lamec.json +++ /dev/null @@ -1 +0,0 @@ -{"startscript": "GC_startscript.sh"} \ No newline at end of file diff --git a/scripts/jureca_heat/HeAT_startscript_deep.sh b/scripts/jureca_heat/HeAT_startscript_deep.sh deleted file mode 100644 index a48cb924b2ce6482d458c4c2675d67f8b805c099..0000000000000000000000000000000000000000 --- a/scripts/jureca_heat/HeAT_startscript_deep.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=Heattest -#SBATCH --account=raise-ctp1 -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job.out -#SBATCH --error=job.err -#SBATCH --time=00:30:00 - -# configure node and process count on the CM -#SBATCH --partition=dc-gpu -#SBATCH --nodes=2 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --gpus-per-node=4 -#SBATCH --exclusive - -# gres options have to be disabled for deepv -#SBATCH --gres=gpu:4 - -# parameters -debug=false -bs=3 -epochs=1 -lr=0.0001 -dataDir='/p/scratch/raise-ctp1/T31/' -COMMAND="HeAT_pytorch_AT.py - --batch-size $bs --epochs $epochs --lr $lr --nworker $SLURM_CPUS_PER_TASK --data-dir $dataDir" - -# command to exec -echo "DEBUG: EXECUTE=$COMMAND" - -# set modules -ml --force purge -ml Stages/2022 NVHPC/22.3 ParaStationMPI/5.5.0-1-mt NCCL/2.12.7-1-CUDA-11.5 cuDNN/8.3.1.22-CUDA-11.5 -ml Python/3.9.6 libaio/0.3.112 HDF5/1.12.1-serial mpi-settings/CUDA - -# set env -source /p/project/raise-ctp1/RAISE/envAI_jureca/bin/activate - -# sleep a sec -sleep 1 - -# job info -echo "DEBUG: TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# set comm -export CUDA_VISIBLE_DEVICES="0,1,2,3" -if [ "$SLURM_CPUS_PER_TASK" > 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi - -# execute -srun --cpu-bind=none python3 -u $COMMAND - -# eof diff --git a/scripts/jureca_heat/README.md b/scripts/jureca_heat/README.md deleted file mode 100644 index c3c4afd537aacb2bd4b512ed7f55d6b4940ba9f5..0000000000000000000000000000000000000000 --- a/scripts/jureca_heat/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# DL using HeAT/PyTorch on deepv - -# source -https://github.com/helmholtz-analytics/heat - -# current isues -1. - -# to-do -1. - -# usage - pip -1. clone -2. run `./createENV.sh` -3. 
submit `sbatch HeAT_startscript_deep.sh` diff --git a/scripts/jureca_heat/createEnv.sh b/scripts/jureca_heat/createEnv.sh deleted file mode 100755 index e5cc3af47484d29cf67a2dc137c219b1890161bf..0000000000000000000000000000000000000000 --- a/scripts/jureca_heat/createEnv.sh +++ /dev/null @@ -1,180 +0,0 @@ -#!/bin/bash -# -*- coding: utf-8 -*- -# author: EI -# version: 220328a -# creates machine specific python env - -# set modules -ml --force purge - -# get sys info -cDir=$PWD -sysN="$(uname -n | cut -f2- -d.)" -echo "system:${sysN}" -echo - -cont1=false -if [ "$sysN" = 'deepv' ] ; then - ml use $OTHERSTAGES - ml Stages/2022 GCC OpenMPI cuDNN NCCL Python CMake - cont1=true -elif [ "$sysN" = 'juwels' ] ; then - ml GCC ParaStationMPI Python CMake - cont1=true -elif [ "$sysN" = 'jureca' ] ; then - ml Stages/2022 NVHPC ParaStationMPI/5.5.0-1-mt Python CMake NCCL/2.11.4-CUDA-11.5 cuDNN libaio HDF5 PnetCDF mpi-settings/CUDA - cont1=true -else - echo - echo 'unknown system detected' - echo 'canceling' - echo -fi -echo "modules loaded" -echo - -# get python version -pver="$(python --version 2>&1 | awk {'print $2'} | cut -f1-2 -d.)" -echo "python version is ${pver}" -echo - -# create env -if [ "$cont1" = true ] ; then - if [ -d "${cDir}/envAI_${sysN}" ];then - echo 'env already exist' - echo - - source envAI_${sysN}/bin/activate - else - python3 -m venv envAI_${sysN} - - # get headers for pip - if [ -f "${cDir}/envAI_${sysN}/bin/pip3" ]; then - echo 'pip already exist' - else - cp "$(which pip3)" $cDir/envAI_${sysN}/bin/ - ln -s $cDir/envAI_${sysN}/bin/pip3 $cDir/envAI_${sysN}/bin/pip${pver} - var="#!$cDir/envAI_${sysN}/bin/python${pver}" - sed -i "1s|.*|$var|" $cDir/envAI_${sysN}/bin/pip3 - fi - - # activate env - source envAI_${sysN}/bin/activate - - echo "a new env is created in ${cDir}" - echo "activation is done via:" - echo "source ${cDir}/envAI_${sysN}/bin/activate" - fi -fi - -# install torch -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - echo 'Torch already installed' - echo -else - export TMPDIR=${cDir} - - pip3 install \ - torch==1.11.0+cu115 torchvision==0.12.0+cu115 torchaudio==0.11.0+cu115 -f \ - https://download.pytorch.org/whl/cu115/torch_stable.html --no-cache-dir -fi - -# install horovod -if [ -f "${cDir}/envAI_${sysN}/bin/horovodrun" ]; then - echo 'Horovod already installed' - echo -else - export HOROVOD_WITH_MPI=1 - export HOROVOD_MPI_THREADS_DISABLE=1 - export HOROVOD_GPU=CUDA - export HOROVOD_GPU_OPERATIONS=NCCL - export HOROVOD_CUDA_HOME=$EBROOTCUDA - export HOROVOD_NCCL_HOME=$EBROOTNCCL - export HOROVOD_WITH_PYTORCH=1 - export TMPDIR=${cDir} - - pip3 install --no-cache-dir horovod --ignore-installed -fi - -# install deepspeed -if [ -f "${cDir}/envAI_${sysN}/bin/deepspeed" ]; then - echo 'DeepSpeed already installed' - echo -else - # compile all opt. stuff - not needed & not working - #export DS_BUILD_OPS=1 - # compile req. opt. 
stuff - export DS_BUILD_FUSED_ADAM=1 - export DS_BUILD_UTILS=1 - if [ "$sysN" = 'deepv' ] ; then - #fix libaio issues via: - export DS_BUILD_AIO=0 - fi - export TMPDIR=${cDir} - - pip3 install --no-cache-dir DeepSpeed - - # add this to .../deepspeed/launcher/launch.py l.93 - var=' args.node_rank=int(os.environ.get("SLURM_PROCID",0))' - sed -i "93s|.*|$var|" $cDir/envAI_${sysN}/lib/python${pver}/site-packages/deepspeed/launcher/launch.py -fi - -# install heat -if [ -d "${cDir}/envAI_${sysN}/lib/python${pver}/site-packages/heat" ]; then - echo 'HeAT already installed' - echo -else - export TMPDIR=${cDir} - export CFLAGS="-noswitcherror" - export CXXFLAGS="-noswitcherror" - - # need to modify setup.py to accep torch>1.9 for heat - wget https://files.pythonhosted.org/packages/5d/3a/4781f1e6910753bfdfa6712c83c732c60e675d8de14983926a0d9306c7a6/heat-1.1.1.tar.gz - tar xzf heat-1.1.1.tar.gz - var=' "torch>=1.7.0",' - sed -i "36s|.*|$var|" heat-1.1.1/setup.py - var=' "torchvision>=0.8.0",' - sed -i "39s|.*|$var|" heat-1.1.1/setup.py - - # create tar again! - rm -rf heat-1.1.1.tar.gz - tar czf heat-1.1.1.tar.gz heat-1.1.1 - rm -rf heat-1.1.1 - - pip3 install --no-cache-dir 'heat-1.1.1.tar.gz[hdf5,netcdf]' - - rm -rf heat-1.1.1.tar.gz -fi - -# get rest of the libraries$ -if [ "$cont1" = true ] ; then - # install rest - pip3 install -r reqs.txt --ignore-installed - - # modify l.4 of /torchnlp/_third_party/weighted_random_sampler.py - var='int_classes = int' - sed -i "4s|.*|$var|" \ - $cDir/envAI_${sysN}/lib/python${pver}/site-packages/torchnlp/_third_party/weighted_random_sampler.py -fi - -# fix IB IP config -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - sed -i -e '3,8s/^/#/' ${cDir}/envAI_${sysN}/bin/torchrun - echo """ -import re -import sys -from torch.distributed.run import main -from torch.distributed.elastic.agent.server import api as sapi - -def new_get_fq_hostname(): - return _orig_get_fq_hostname().replace('.', 'i.', 1) - -if __name__ == '__main__': - _orig_get_fq_hostname = sapi._get_fq_hostname - sapi._get_fq_hostname = new_get_fq_hostname - sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) - sys.exit(main()) -""" >> ${cDir}/envAI_${sysN}/bin/torchrun -fi - -#eof diff --git a/scripts/jureca_heat/lamec.json b/scripts/jureca_heat/lamec.json deleted file mode 100644 index d1bf1b27df9fd3984cd046733eeccea2901e07b3..0000000000000000000000000000000000000000 --- a/scripts/jureca_heat/lamec.json +++ /dev/null @@ -1 +0,0 @@ -{"startscript": "HeAT_startscript_deep.sh"} \ No newline at end of file diff --git a/scripts/jureca_heat/reqs.txt b/scripts/jureca_heat/reqs.txt deleted file mode 100755 index 8d4888638b41cd595be271f60fb51d8edcf00275..0000000000000000000000000000000000000000 --- a/scripts/jureca_heat/reqs.txt +++ /dev/null @@ -1,8 +0,0 @@ -Pillow -pyparsing -python-dateutil -matplotlib -h5py -pytorch-nlp -pyprof -filelock diff --git a/scripts/jureca_horovod/Hor_startscript_deep.sh b/scripts/jureca_horovod/Hor_startscript_deep.sh deleted file mode 100644 index 315640ec6ffc48ec8f133666566b9a436e2e827d..0000000000000000000000000000000000000000 --- a/scripts/jureca_horovod/Hor_startscript_deep.sh +++ /dev/null @@ -1,82 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=HorTest -#SBATCH --account=raise-ctp1 -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job.out -#SBATCH --error=job.err -#SBATCH --time=00:15:00 - -# configure node and process count on the CM -#SBATCH --partition=dc-gpu -#SBATCH --nodes=2 -#SBATCH --ntasks-per-node=4 
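# (Horovod uses one rank per GPU, so --ntasks-per-node=4 matches the four
# GPUs requested per node below; the DDP startscript instead uses one task
# per node and lets torchrun spawn the per-GPU workers. The OMP_NUM_THREADS
# guard further down, [ "$SLURM_CPUS_PER_TASK" > 0 ], treats > as a shell
# redirection inside single brackets; a sketch of the arithmetic form:
#   if [ "$SLURM_CPUS_PER_TASK" -gt 0 ]; then
#     export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
#   fi )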
-#SBATCH --cpus-per-task=32 -#SBATCH --gpus-per-node=4 -#SBATCH --exclusive - -# gres options have to be disabled for deepv -#SBATCH --gres=gpu:4 - -# command to exec -debug=false # do nccl debug -bs=2 # batch-size -epochs=10 # epochs -lr=0.01 # learning rate - -dataDir='/p/scratch/raise-ctp1/T31_LD/' -COMMAND="Hor_pytorch_AT.py" -EXEC=$COMMAND" --batch-size $bs - --epochs $epochs - --lr $lr - --nworker $SLURM_CPUS_PER_TASK - --data-dir $dataDir" - -# set modules -ml --force purge -ml Stages/2022 NVHPC/22.3 ParaStationMPI/5.5.0-1-mt NCCL/2.12.7-1-CUDA-11.5 cuDNN/8.3.1.22-CUDA-11.5 -ml Python/3.9.6 libaio/0.3.112 HDF5/1.12.1-serial mpi-settings/CUDA - -# set env -source /p/project/raise-ctp1/RAISE/envAI_jureca/bin/activate - -# sleep a sec -sleep 1 - -# job info -echo "DEBUG: TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# set comm -export CUDA_VISIBLE_DEVICES="0,1,2,3" -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" > 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi - -# launch -srun --cpu-bind=none python3 -u $EXEC - -# nsys profiler: following https://gist.github.com/mcarilli/376821aa1a7182dfcf59928a7cde3223 -#srun --cpu-bind=none nsys profile \ -# --trace=cublas,cuda,cudnn,nvtx,osrt \ -# --sample=cpu \ -# --stats=true \ -# --force-overwrite=true \ -# -o ./prof.out python3 -u $EXEC - -# eof diff --git a/scripts/jureca_horovod/README.md b/scripts/jureca_horovod/README.md deleted file mode 100644 index 90520a33155e2fb065d3d12b59c2d20a3bf3aaec..0000000000000000000000000000000000000000 --- a/scripts/jureca_horovod/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# DL using Horovod on Jureca DC - -# source -https://github.com/horovod/horovod - -# current isues -1. mpi-settings/CUDA is only available via NVHPC/ParaStationMPI - -# to-do -1. wait for GCC - -# usage - pip -1. clone -2. run `./createENV.sh` -3. 
submit `sbatch Hor_startscript_deep.sh` diff --git a/scripts/jureca_horovod/createEnv.sh b/scripts/jureca_horovod/createEnv.sh deleted file mode 100755 index e5cc3af47484d29cf67a2dc137c219b1890161bf..0000000000000000000000000000000000000000 --- a/scripts/jureca_horovod/createEnv.sh +++ /dev/null @@ -1,180 +0,0 @@ -#!/bin/bash -# -*- coding: utf-8 -*- -# author: EI -# version: 220328a -# creates machine specific python env - -# set modules -ml --force purge - -# get sys info -cDir=$PWD -sysN="$(uname -n | cut -f2- -d.)" -echo "system:${sysN}" -echo - -cont1=false -if [ "$sysN" = 'deepv' ] ; then - ml use $OTHERSTAGES - ml Stages/2022 GCC OpenMPI cuDNN NCCL Python CMake - cont1=true -elif [ "$sysN" = 'juwels' ] ; then - ml GCC ParaStationMPI Python CMake - cont1=true -elif [ "$sysN" = 'jureca' ] ; then - ml Stages/2022 NVHPC ParaStationMPI/5.5.0-1-mt Python CMake NCCL/2.11.4-CUDA-11.5 cuDNN libaio HDF5 PnetCDF mpi-settings/CUDA - cont1=true -else - echo - echo 'unknown system detected' - echo 'canceling' - echo -fi -echo "modules loaded" -echo - -# get python version -pver="$(python --version 2>&1 | awk {'print $2'} | cut -f1-2 -d.)" -echo "python version is ${pver}" -echo - -# create env -if [ "$cont1" = true ] ; then - if [ -d "${cDir}/envAI_${sysN}" ];then - echo 'env already exist' - echo - - source envAI_${sysN}/bin/activate - else - python3 -m venv envAI_${sysN} - - # get headers for pip - if [ -f "${cDir}/envAI_${sysN}/bin/pip3" ]; then - echo 'pip already exist' - else - cp "$(which pip3)" $cDir/envAI_${sysN}/bin/ - ln -s $cDir/envAI_${sysN}/bin/pip3 $cDir/envAI_${sysN}/bin/pip${pver} - var="#!$cDir/envAI_${sysN}/bin/python${pver}" - sed -i "1s|.*|$var|" $cDir/envAI_${sysN}/bin/pip3 - fi - - # activate env - source envAI_${sysN}/bin/activate - - echo "a new env is created in ${cDir}" - echo "activation is done via:" - echo "source ${cDir}/envAI_${sysN}/bin/activate" - fi -fi - -# install torch -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - echo 'Torch already installed' - echo -else - export TMPDIR=${cDir} - - pip3 install \ - torch==1.11.0+cu115 torchvision==0.12.0+cu115 torchaudio==0.11.0+cu115 -f \ - https://download.pytorch.org/whl/cu115/torch_stable.html --no-cache-dir -fi - -# install horovod -if [ -f "${cDir}/envAI_${sysN}/bin/horovodrun" ]; then - echo 'Horovod already installed' - echo -else - export HOROVOD_WITH_MPI=1 - export HOROVOD_MPI_THREADS_DISABLE=1 - export HOROVOD_GPU=CUDA - export HOROVOD_GPU_OPERATIONS=NCCL - export HOROVOD_CUDA_HOME=$EBROOTCUDA - export HOROVOD_NCCL_HOME=$EBROOTNCCL - export HOROVOD_WITH_PYTORCH=1 - export TMPDIR=${cDir} - - pip3 install --no-cache-dir horovod --ignore-installed -fi - -# install deepspeed -if [ -f "${cDir}/envAI_${sysN}/bin/deepspeed" ]; then - echo 'DeepSpeed already installed' - echo -else - # compile all opt. stuff - not needed & not working - #export DS_BUILD_OPS=1 - # compile req. opt. 
stuff - export DS_BUILD_FUSED_ADAM=1 - export DS_BUILD_UTILS=1 - if [ "$sysN" = 'deepv' ] ; then - #fix libaio issues via: - export DS_BUILD_AIO=0 - fi - export TMPDIR=${cDir} - - pip3 install --no-cache-dir DeepSpeed - - # add this to .../deepspeed/launcher/launch.py l.93 - var=' args.node_rank=int(os.environ.get("SLURM_PROCID",0))' - sed -i "93s|.*|$var|" $cDir/envAI_${sysN}/lib/python${pver}/site-packages/deepspeed/launcher/launch.py -fi - -# install heat -if [ -d "${cDir}/envAI_${sysN}/lib/python${pver}/site-packages/heat" ]; then - echo 'HeAT already installed' - echo -else - export TMPDIR=${cDir} - export CFLAGS="-noswitcherror" - export CXXFLAGS="-noswitcherror" - - # need to modify setup.py to accep torch>1.9 for heat - wget https://files.pythonhosted.org/packages/5d/3a/4781f1e6910753bfdfa6712c83c732c60e675d8de14983926a0d9306c7a6/heat-1.1.1.tar.gz - tar xzf heat-1.1.1.tar.gz - var=' "torch>=1.7.0",' - sed -i "36s|.*|$var|" heat-1.1.1/setup.py - var=' "torchvision>=0.8.0",' - sed -i "39s|.*|$var|" heat-1.1.1/setup.py - - # create tar again! - rm -rf heat-1.1.1.tar.gz - tar czf heat-1.1.1.tar.gz heat-1.1.1 - rm -rf heat-1.1.1 - - pip3 install --no-cache-dir 'heat-1.1.1.tar.gz[hdf5,netcdf]' - - rm -rf heat-1.1.1.tar.gz -fi - -# get rest of the libraries$ -if [ "$cont1" = true ] ; then - # install rest - pip3 install -r reqs.txt --ignore-installed - - # modify l.4 of /torchnlp/_third_party/weighted_random_sampler.py - var='int_classes = int' - sed -i "4s|.*|$var|" \ - $cDir/envAI_${sysN}/lib/python${pver}/site-packages/torchnlp/_third_party/weighted_random_sampler.py -fi - -# fix IB IP config -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - sed -i -e '3,8s/^/#/' ${cDir}/envAI_${sysN}/bin/torchrun - echo """ -import re -import sys -from torch.distributed.run import main -from torch.distributed.elastic.agent.server import api as sapi - -def new_get_fq_hostname(): - return _orig_get_fq_hostname().replace('.', 'i.', 1) - -if __name__ == '__main__': - _orig_get_fq_hostname = sapi._get_fq_hostname - sapi._get_fq_hostname = new_get_fq_hostname - sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) - sys.exit(main()) -""" >> ${cDir}/envAI_${sysN}/bin/torchrun -fi - -#eof diff --git a/scripts/jureca_horovod/lamec.json b/scripts/jureca_horovod/lamec.json deleted file mode 100644 index 4aff71d30c25f064280724e030d6fd813c4c4c5d..0000000000000000000000000000000000000000 --- a/scripts/jureca_horovod/lamec.json +++ /dev/null @@ -1 +0,0 @@ -{"startscript": "Hor_startscript_deep.sh"} \ No newline at end of file diff --git a/scripts/jureca_horovod/reqs.txt b/scripts/jureca_horovod/reqs.txt deleted file mode 100755 index 8d4888638b41cd595be271f60fb51d8edcf00275..0000000000000000000000000000000000000000 --- a/scripts/jureca_horovod/reqs.txt +++ /dev/null @@ -1,8 +0,0 @@ -Pillow -pyparsing -python-dateutil -matplotlib -h5py -pytorch-nlp -pyprof -filelock diff --git a/scripts/jureca_libtorch/MNIST/CMakeLists.txt b/scripts/jureca_libtorch/MNIST/CMakeLists.txt deleted file mode 100644 index acaf77157c301b0ea35b575cbe7c9482b956b79b..0000000000000000000000000000000000000000 --- a/scripts/jureca_libtorch/MNIST/CMakeLists.txt +++ /dev/null @@ -1,30 +0,0 @@ -cmake_minimum_required(VERSION 3.1 FATAL_ERROR) -project(mnist) -set(CMAKE_CXX_STANDARD 14) - -find_package(Torch REQUIRED) - -option(DOWNLOAD_MNIST "Download the MNIST dataset from the internet" ON) -if (DOWNLOAD_MNIST) - message(STATUS "Downloading MNIST dataset") - execute_process( - COMMAND python 
${CMAKE_CURRENT_LIST_DIR}/../download_mnist.py - -d ${CMAKE_BINARY_DIR}/data - ERROR_VARIABLE DOWNLOAD_ERROR) - if (DOWNLOAD_ERROR) - message(FATAL_ERROR "Error downloading MNIST dataset: ${DOWNLOAD_ERROR}") - endif() -endif() - -add_executable(mnist mnist.cpp) -target_compile_features(mnist PUBLIC cxx_range_for) -target_link_libraries(mnist ${TORCH_LIBRARIES}) - -if (MSVC) - file(GLOB TORCH_DLLS "${TORCH_INSTALL_PREFIX}/lib/*.dll") - add_custom_command(TARGET mnist - POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy_if_different - ${TORCH_DLLS} - $<TARGET_FILE_DIR:mnist>) -endif (MSVC) diff --git a/scripts/jureca_libtorch/MNIST/LibTorch_startscript.sh b/scripts/jureca_libtorch/MNIST/LibTorch_startscript.sh deleted file mode 100644 index 04baaa32b1c5af8db735701236a5c7b5a379ad98..0000000000000000000000000000000000000000 --- a/scripts/jureca_libtorch/MNIST/LibTorch_startscript.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=LibTorchTest -#SBATCH --account=raise-ctp1 -#SBATCH --output=job.out -#SBATCH --error=job.err -#SBATCH --time=00:15:00 -#SBATCH --partition=dc-gpu-devel -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=64 -#SBATCH --exclusive -#SBATCH --gres=gpu:1 - -ml NVHPC/22.3 cuDNN CMake - -echo "DEBUG: $(date)" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" - -export CUDA_VISIBLE_DEVICES="0" -export OMP_NUM_THREADS=1 - -srun ./mnist diff --git a/scripts/jureca_libtorch/MNIST/compile.sh b/scripts/jureca_libtorch/MNIST/compile.sh deleted file mode 100644 index 1ed0e2769cbab14dd50179bb1078e81db77fce43..0000000000000000000000000000000000000000 --- a/scripts/jureca_libtorch/MNIST/compile.sh +++ /dev/null @@ -1,19 +0,0 @@ -# compile mnist.cpp with latest LibTorch - -# load libraries -ml NVHPC/22.3 CMake/3.21.1 cuDNN/8.3.1.22-CUDA-11.5 Python/3.9.6 - -# get libtorch w/ gpu -wget https://download.pytorch.org/libtorch/cu116/libtorch-cxx11-abi-shared-with-deps-1.12.0%2Bcu116.zip -unzip libtorch-cxx11-abi-shared-with-deps-1.12.0+cu116.zip -libtorch_dir=$PWD/libtorch - -# compile mnist.cpp with libtorch w/ gpu to build folder -mkdir -p build -pushd build -cmake -DCMAKE_PREFIX_PATH=${libtorch_dir} -DDOWNLOAD_MNIST=ON .. -cmake --build . --config Release -mv mnist .. 
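# (Typical sequence, assuming the modules above are available on a login or
# compute node: run `bash compile.sh` once to fetch LibTorch and build the
# binary, then submit `sbatch LibTorch_startscript.sh`, which launches
# `srun ./mnist` from the same directory.)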
-popd - -# eof diff --git a/scripts/jureca_libtorch/MNIST/download_mnist.py b/scripts/jureca_libtorch/MNIST/download_mnist.py deleted file mode 100644 index 2a5068ffb8e7c35e18b75aee81774e9163374445..0000000000000000000000000000000000000000 --- a/scripts/jureca_libtorch/MNIST/download_mnist.py +++ /dev/null @@ -1,88 +0,0 @@ -from __future__ import division -from __future__ import print_function - -import argparse -import gzip -import os -import sys -import urllib - -try: - from urllib.error import URLError - from urllib.request import urlretrieve -except ImportError: - from urllib2 import URLError - from urllib import urlretrieve - -RESOURCES = [ - 'train-images-idx3-ubyte.gz', - 'train-labels-idx1-ubyte.gz', - 't10k-images-idx3-ubyte.gz', - 't10k-labels-idx1-ubyte.gz', -] - - -def report_download_progress(chunk_number, chunk_size, file_size): - if file_size != -1: - percent = min(1, (chunk_number * chunk_size) / file_size) - bar = '#' * int(64 * percent) - sys.stdout.write('\r0% |{:<64}| {}%'.format(bar, int(percent * 100))) - - -def download(destination_path, url, quiet): - if os.path.exists(destination_path): - if not quiet: - print('{} already exists, skipping ...'.format(destination_path)) - else: - print('Downloading {} ...'.format(url)) - try: - hook = None if quiet else report_download_progress - urlretrieve(url, destination_path, reporthook=hook) - except URLError: - raise RuntimeError('Error downloading resource!') - finally: - if not quiet: - # Just a newline. - print() - - -def unzip(zipped_path, quiet): - unzipped_path = os.path.splitext(zipped_path)[0] - if os.path.exists(unzipped_path): - if not quiet: - print('{} already exists, skipping ... '.format(unzipped_path)) - return - with gzip.open(zipped_path, 'rb') as zipped_file: - with open(unzipped_path, 'wb') as unzipped_file: - unzipped_file.write(zipped_file.read()) - if not quiet: - print('Unzipped {} ...'.format(zipped_path)) - - -def main(): - parser = argparse.ArgumentParser( - description='Download the MNIST dataset from the internet') - parser.add_argument( - '-d', '--destination', default='.', help='Destination directory') - parser.add_argument( - '-q', - '--quiet', - action='store_true', - help="Don't report about progress") - options = parser.parse_args() - - if not os.path.exists(options.destination): - os.makedirs(options.destination) - - try: - for resource in RESOURCES: - path = os.path.join(options.destination, resource) - url = 'http://yann.lecun.com/exdb/mnist/{}'.format(resource) - download(path, url, options.quiet) - unzip(path, options.quiet) - except KeyboardInterrupt: - print('Interrupted') - - -if __name__ == '__main__': - main() diff --git a/scripts/jureca_libtorch/MNIST/mnist.cpp b/scripts/jureca_libtorch/MNIST/mnist.cpp deleted file mode 100755 index edd51eda48e75e2c825ad0c3dc67ea0098ac3291..0000000000000000000000000000000000000000 --- a/scripts/jureca_libtorch/MNIST/mnist.cpp +++ /dev/null @@ -1,179 +0,0 @@ -#include <torch/torch.h> - -#include <cstddef> -#include <cstdio> -#include <iostream> -#include <string> -#include <vector> -#include <chrono> - -using namespace std::chrono; -using Clock = std::chrono::steady_clock; - -// Where to find the MNIST dataset. -const char* kDataRoot = "./data"; - -// The batch size for training. -const int64_t kTrainBatchSize = 64; - -// The batch size for testing. -const int64_t kTestBatchSize = 64; - -// The number of epochs to train. -const int64_t kNumberOfEpochs = 3; - -// After how many batches to log a new update with the loss value. 
-const int64_t kLogInterval = 10; - -struct Net : torch::nn::Module { - Net() - : conv1(torch::nn::Conv2dOptions(1, 10, /*kernel_size=*/5)), - conv2(torch::nn::Conv2dOptions(10, 20, /*kernel_size=*/5)), - fc1(320, 50), - fc2(50, 10) { - register_module("conv1", conv1); - register_module("conv2", conv2); - register_module("conv2_drop", conv2_drop); - register_module("fc1", fc1); - register_module("fc2", fc2); - } - - torch::Tensor forward(torch::Tensor x) { - x = torch::relu(torch::max_pool2d(conv1->forward(x), 2)); - x = torch::relu( - torch::max_pool2d(conv2_drop->forward(conv2->forward(x)), 2)); - x = x.view({-1, 320}); - x = torch::relu(fc1->forward(x)); - x = torch::dropout(x, /*p=*/0.5, /*training=*/is_training()); - x = fc2->forward(x); - return torch::log_softmax(x, /*dim=*/1); - } - - torch::nn::Conv2d conv1; - torch::nn::Conv2d conv2; - torch::nn::Dropout2d conv2_drop; - torch::nn::Linear fc1; - torch::nn::Linear fc2; -}; - -template <typename DataLoader> -void train( - size_t epoch, - Net& model, - torch::Device device, - DataLoader& data_loader, - torch::optim::Optimizer& optimizer, - size_t dataset_size) { - model.train(); - size_t batch_idx = 0; - for (auto& batch : data_loader) { - auto data = batch.data.to(device), targets = batch.target.to(device); - optimizer.zero_grad(); - auto output = model.forward(data); - auto loss = torch::nll_loss(output, targets); - AT_ASSERT(!std::isnan(loss.template item<float>())); - loss.backward(); - optimizer.step(); - - if (batch_idx++ % kLogInterval == 0) { - std::printf( - "\rTrain Epoch: %ld [%5ld/%5ld] Loss: %.4f\n", - epoch, - batch_idx * batch.data.size(0), - dataset_size, - loss.template item<float>()); - } - } -} - -template <typename DataLoader> -void test( - Net& model, - torch::Device device, - DataLoader& data_loader, - size_t dataset_size) { - torch::NoGradGuard no_grad; - model.eval(); - double test_loss = 0; - int32_t correct = 0; - for (const auto& batch : data_loader) { - auto data = batch.data.to(device), targets = batch.target.to(device); - auto output = model.forward(data); - test_loss += torch::nll_loss( - output, - targets, - /*weight=*/{}, - torch::Reduction::Sum) - .template item<float>(); - auto pred = output.argmax(1); - correct += pred.eq(targets).sum().template item<int64_t>(); - } - - test_loss /= dataset_size; - std::printf( - "\nTest set: Average loss: %.4f | Accuracy: %.3f\n", - test_loss, - static_cast<double>(correct) / dataset_size); -} - -auto main() -> int { - torch::manual_seed(1); - - torch::DeviceType device_type; - if (torch::cuda::is_available()) { - std::cout << "CUDA available! Training on GPU." << std::endl; - device_type = torch::kCUDA; - } else { - std::cout << "Training on CPU." 
<< std::endl; - device_type = torch::kCPU; - } - torch::Device device(device_type); - - Net model; - model.to(device); - - auto train_dataset = torch::data::datasets::MNIST(kDataRoot) - .map(torch::data::transforms::Normalize<>(0.1307, 0.3081)) - .map(torch::data::transforms::Stack<>()); - - //std::cout << typeid(train_dataset).name() << '\n'; - auto test_dat = train_dataset.append(); - - - const size_t train_dataset_size = train_dataset.size().value(); - auto train_loader = - torch::data::make_data_loader<torch::data::samplers::SequentialSampler>( - std::move(train_dataset), kTrainBatchSize); - - auto test_dataset = torch::data::datasets::MNIST( - kDataRoot, torch::data::datasets::MNIST::Mode::kTest) - .map(torch::data::transforms::Normalize<>(0.1307, 0.3081)) - .map(torch::data::transforms::Stack<>()); - const size_t test_dataset_size = test_dataset.size().value(); - auto test_loader = - torch::data::make_data_loader(std::move(test_dataset), kTestBatchSize); - - torch::optim::SGD optimizer( - model.parameters(), torch::optim::SGDOptions(0.01).momentum(0.5)); - - // timer start - auto st = Clock::now(); - auto et1 = Clock::now(); - auto et2 = Clock::now(); - - // start loop - std::cout << "starting!" << std::endl; - for (size_t epoch = 1; epoch <= kNumberOfEpochs; ++epoch) { - et1 = Clock::now(); - train(epoch, model, device, *train_loader, optimizer, train_dataset_size); - test(model, device, *test_loader, test_dataset_size); - et2 = Clock::now(); - std::cout << "epoch:" << epoch<<" / " << - duration_cast<milliseconds>(et2-et1).count()/1000.0 << " sec" << std::endl; - } - - // timer end - auto et3 = Clock::now(); - std::cout << "\nfinal time:"<< - duration_cast<milliseconds>(et3-st).count()/1000.0 << " sec" << std::endl; -} diff --git a/scripts/jureca_libtorch/README.md b/scripts/jureca_libtorch/README.md deleted file mode 100644 index ab26d71c7f2bc4b90534c30ed360a4e1b916431a..0000000000000000000000000000000000000000 --- a/scripts/jureca_libtorch/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# DL using LibTorch (C++ Torch) - -# documentation -https://github.com/pytorch/pytorch/blob/master/docs/libtorch.rst - -# current isues -1. no distributed training - -# to-do -1. implement distributed training - -# done -1. as Python version is a wrapper, no performance difference -2. can simply be used alongisde a c++ code w/o Cpython -3. very limited compared to Python version (many classes/functions are missing) - -# usage -1. simply compile `mnist.cpp` using the `cmake` file as `bash compile.sh` -2. submit compiled `mnist` with `sbatch LibTorch_startscript.sh` diff --git a/scripts/jureca_libtorch/TorchVision/compile_jpeg.sh b/scripts/jureca_libtorch/TorchVision/compile_jpeg.sh deleted file mode 100755 index fc5e96478aa6dff88fd07a066d02fe67b029b89a..0000000000000000000000000000000000000000 --- a/scripts/jureca_libtorch/TorchVision/compile_jpeg.sh +++ /dev/null @@ -1,14 +0,0 @@ -# load libraries -ml NVHPC/22.3 CMake/3.21.1 cuDNN/8.3.1.22-CUDA-11.5 - -git clone https://github.com/winlibs/libjpeg.git -cd libjpeg - -rm -rf build -mkdir -p build -mkdir -p install -pushd build -cmake -DCMAKE_INSTALL_PREFIX=../install -DCMAKE_BUILD_TYPE=Release .. 
-make -j -make install -popd diff --git a/scripts/jureca_libtorch/TorchVision/compile_png.sh b/scripts/jureca_libtorch/TorchVision/compile_png.sh deleted file mode 100755 index 80dd76eca309d6f3e65279aa414b6100e9126804..0000000000000000000000000000000000000000 --- a/scripts/jureca_libtorch/TorchVision/compile_png.sh +++ /dev/null @@ -1,14 +0,0 @@ -# load libraries -ml NVHPC/22.3 CMake/3.21.1 cuDNN/8.3.1.22-CUDA-11.5 - -wget http://prdownloads.sourceforge.net/libpng/libpng-1.6.37.tar.gz?download -mv 'libpng-1.6.37.tar.gz?download' libpng-1.6.37.tar.gz -tar xzf libpng-1.6.37.tar.gz - -pushd libpng-1.6.37 -rm -rf build -mkdir -p build -./configure --prefix=${PWD}/build -make -make install -popd diff --git a/scripts/jureca_libtorch/TorchVision/compile_torchvision.sh b/scripts/jureca_libtorch/TorchVision/compile_torchvision.sh deleted file mode 100755 index f3b5acfa028677a69323f380a9f412d28e60f2e5..0000000000000000000000000000000000000000 --- a/scripts/jureca_libtorch/TorchVision/compile_torchvision.sh +++ /dev/null @@ -1,44 +0,0 @@ -# compile torchvision for dataloading (optional) - -# load libraries -ml NVHPC/22.3 CMake/3.21.1 cuDNN/8.3.1.22-CUDA-11.5 - -# get libtorch w/ gpu -wget https://download.pytorch.org/libtorch/cu116/libtorch-cxx11-abi-shared-with-deps-1.12.0%2Bcu116.zip -unzip libtorch-cxx11-abi-shared-with-deps-1.12.0+cu116.zip -libtorch_dir=$PWD/libtorch - -# get png packages for torchvision -./compile_png.sh -libpng_dir=$PWD/libpng-1.6.37/build - -# get jpeg packages -./compile_png.sh -libjpeg_dir=$PWD/libjpeg/install - -# current dir -m_dir=$PWD - -# get torchvision -git clone https://github.com/pytorch/vision.git - -# compile torchvision -pushd torchvision -rm -rf build -mkdir -p build -mkdir -p install -cd build -cmake -DCMAKE_PREFIX_PATH=${libtorch_dir} \ - -DWITH_CUDA=on \ - -DPNG_LIBRARY=${libpng_dir}/lib/libpng.so \ - -DPNG_PNG_INCLUDE_DIR=${libpng_dir}/include \ - -DJPEG_LIBRARY=${libjpeg_dir}/lib64/libjpeg.so \ - -DJPEG_INCLUDE_DIR=${libjpeg_dir}/include \ - -DCMAKE_INSTALL_PREFIX=../install \ - -DCMAKE_BUILD_TYPE=Release .. 
- -make -j -make install -popd - -# eof diff --git a/scripts/jureca_libtorch/lamec.json b/scripts/jureca_libtorch/lamec.json deleted file mode 100644 index a8d025c21fcf84b3dfc1fc34d94547f4aaa9e5b1..0000000000000000000000000000000000000000 --- a/scripts/jureca_libtorch/lamec.json +++ /dev/null @@ -1 +0,0 @@ -{"startscript": "MNIST/LibTorch_startscript.sh"} \ No newline at end of file diff --git a/scripts/jureca_raytune/.gitkeep b/scripts/jureca_raytune/.gitkeep deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/scripts/jureca_raytune/README.md b/scripts/jureca_raytune/README.md deleted file mode 100644 index bd3dd8ba876fd4df91b8dac41e82240d7a1766c6..0000000000000000000000000000000000000000 --- a/scripts/jureca_raytune/README.md +++ /dev/null @@ -1,7 +0,0 @@ -# Simple Ray Tune script working with cifar10 dataset on JURECA-DC - -Steps: -- create environment by running *create_jureca_env.sh* (or use your own env) -- run startscript *jureca_run_ray.sh* - -Also includes a TensorFlow version (cifar_tune_tf.py) with TFMirroredStrategy for data-parallelism on a node-level diff --git a/scripts/jureca_raytune/RayTune+DDP/.gitkeep b/scripts/jureca_raytune/RayTune+DDP/.gitkeep deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/scripts/jureca_raytune/RayTune+DDP/cifar_tune.py b/scripts/jureca_raytune/RayTune+DDP/cifar_tune.py deleted file mode 100644 index 50fa034e987e09b37b037bf2bc28e2b9465efb0b..0000000000000000000000000000000000000000 --- a/scripts/jureca_raytune/RayTune+DDP/cifar_tune.py +++ /dev/null @@ -1,132 +0,0 @@ -# general imports -import numpy as np -import os - -# PyTorch imports -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -import torch.distributed as dist -import torchvision -import torchvision.transforms as transforms -import torchvision.models as models - -# Ray Tune imports -import ray -from ray import tune -from ray.tune import CLIReporter - - -# method to average the parameters over all GPUs - -# mean of field over GPUs -def par_mean(field): - res = torch.tensor(field).float() - res = res.cuda() - dist.all_reduce(res,op=dist.ReduceOp.SUM,group=None,async_op=True).wait() - res/=dist.get_world_size() - return res - - -# dataloading method -def load_data(data_dir=None): - transform = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) - ]) - - trainset = torchvision.datasets.CIFAR10( - root=data_dir, train=True, download=False, transform=transform) - - return trainset - - -# cifar training method -def train_cifar(config): - - # get model - net = models.resnet18() - - # perpare model for RayTune - net = ray.train.torch.prepare_model(net) - - # loss and optimizer definition - criterion = nn.CrossEntropyLoss() - optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9) - - - # get the training set - trainset = load_data('/p/project/raise-ctp2/cifar10/data') - - # define dataloader with hyperparameters set by RayTune - train_loader = torch.utils.data.DataLoader( - trainset, - batch_size=int(config["batch_size"]), - shuffle=True, - num_workers=8) - - # prepare dataloader for RayTune - train_loader = ray.train.torch.prepare_data_loader(train_loader) - - - for epoch in range(20): # loop over the dataset multiple times - - loss = 0 - - for i, data in enumerate(train_loader, 0): - # get the inputs; data is a list of [inputs, 
labels] - inputs, labels = data - - # zero the parameter gradients - optimizer.zero_grad() - - # forward + backward + optimize - outputs = net(inputs) - loss = criterion(outputs, labels) - loss.backward() - optimizer.step() - - loss = par_mean(loss) - - # report metric of interest back to RayTune - ray.train.report(loss = loss.item()) - - print("Finished Training") - - -def main(num_samples, max_num_epochs, gpus_per_trial): - ray.init(address='auto') - - - # prepare RayTune with PyTorch DDP backend, num_workers specifies the number of GPUs to use per trial - from ray.train import Trainer - trainer = Trainer(backend="torch", num_workers=gpus_per_trial, use_gpu=True) - - # convert the train function to a Ray trainable - trainable = trainer.to_tune_trainable(train_cifar) - - # set search space - config = { - "batch_size": tune.choice([64, 128, 256, 512]), - "lr": tune.loguniform(10e-5, 1) - } - - - reporter = CLIReporter( - max_report_frequency=60) - - # run hyperparameter optimization - result = tune.run( - trainable, - local_dir=os.path.join(os.path.abspath(os.getcwd()), "ray_results"), - config=config, - num_samples=num_samples, - progress_reporter=reporter, - verbose=1, - scheduler=None) - - -if __name__ == "__main__": - # You can change the number of GPUs per trial here: - main(num_samples=10, max_num_epochs=30, gpus_per_trial=4) \ No newline at end of file diff --git a/scripts/jureca_raytune/RayTune+DDP/create_env.sh b/scripts/jureca_raytune/RayTune+DDP/create_env.sh deleted file mode 100644 index 3ccd263d10f67195367a976cb9588baf43b8e829..0000000000000000000000000000000000000000 --- a/scripts/jureca_raytune/RayTune+DDP/create_env.sh +++ /dev/null @@ -1,16 +0,0 @@ -ml --force purge -ml Stages/2022 GCC/11.2.0 CUDA/11.5 Python/3.9.6 PyTorch/1.11-CUDA-11.5 torchvision/0.12.0 - -## create vritual environment -python3 -m venv ddp_ray_env - -source ddp_ray_env/bin/activate - -# RAY TUNE 2.0 NOT WORKING -pip3 install ray==1.9.0 ray[tune]==1.9.0 ray[train]==1.9.0 - - -# might be necessay, might be not -pip3 install requests -pip3 install pytz -pip3 install python-dateutil diff --git a/scripts/jureca_raytune/RayTune+DDP/jureca_ray_ddp_startscript.sh b/scripts/jureca_raytune/RayTune+DDP/jureca_ray_ddp_startscript.sh deleted file mode 100644 index b8bc31f732b41386e67d9b7b96c0169450aba6e1..0000000000000000000000000000000000000000 --- a/scripts/jureca_raytune/RayTune+DDP/jureca_ray_ddp_startscript.sh +++ /dev/null @@ -1,78 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=RayTuneDDP -#SBATCH --account=raise-ctp2 -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job.out -#SBATCH --error=job.err -#SBATCH --time=01:00:00 - -# configure node and process count on the CM -#SBATCH --partition=dc-gpu-devel -#SBATCH --nodes=2 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=128 -#SBATCH --gpus-per-node=4 -#SBATCH --exclusive - -# gres options have to be disabled for deepv -#SBATCH --gres=gpu:4 - - -ml --force purge -ml Stages/2022 GCC/11.2.0 CUDA/11.5 Python/3.9.6 PyTorch/1.11-CUDA-11.5 torchvision/0.12.0 - - -num_gpus=4 -# set env -source ddp_ray_env/bin/activate - - -# set comm -export CUDA_VISIBLE_DEVICES="0,1,2,3" -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" > 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi - -# launch -# Getting the node names -nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -nodes_array=($nodes) - -head_node=${nodes_array[0]} - -# __doc_head_ray_start__ -port=8374 - -echo "Starting HEAD at $head_node" -srun 
--nodes=1 --ntasks=1 -w "$head_node" \ - ray start --head --node-ip-address="$head_node"i --port=$port \ - --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus 4 --block & -# __doc_head_ray_end__ - -# __doc_worker_ray_start__ - -# optional, though may be useful in certain versions of Ray < 1.0. -sleep 10 - -# number of nodes other than the head node -worker_num=$((SLURM_JOB_NUM_NODES - 1)) - -for ((i = 1; i <= worker_num; i++)); do - node_i=${nodes_array[$i]} - echo "Starting WORKER $i at $node_i" - srun --nodes=1 --ntasks=1 -w "$node_i" \ - ray start --address "$head_node"i:"$port" \ - --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus 4 --block & - sleep 5 -done - -echo "Ready" - -python3 -u cifar_tune.py - - -# eof diff --git a/scripts/jureca_raytune/Ray_2.4/.gitkeep b/scripts/jureca_raytune/Ray_2.4/.gitkeep deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/scripts/jureca_raytune/Ray_2.4/ASHA/.gitkeep b/scripts/jureca_raytune/Ray_2.4/ASHA/.gitkeep deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/scripts/jureca_raytune/Ray_2.4/ASHA/cifar_tune_asha.py b/scripts/jureca_raytune/Ray_2.4/ASHA/cifar_tune_asha.py deleted file mode 100644 index 689a457ebeb585dc3343fbe603b29e4216b08f74..0000000000000000000000000000000000000000 --- a/scripts/jureca_raytune/Ray_2.4/ASHA/cifar_tune_asha.py +++ /dev/null @@ -1,427 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -"""! @brief AI4HPC """ - -## -# @mainpage AI4HPC -# -# @section description_main Description -# Hyperparameter optimization of neural networks with Ray Tune library. -# -# -# -# @section notes_main Notes -# - The data directory of the cifar-10 dataset has the be specified in the startscript -# -# Copyright (c) 2023 RAISE, All rights reserved. - - -## -# @file cifar_tune_asha.py -# -# @brief Optimizing the hyperparameters of a ResNet18 trained on the cifar-10 dataset with Ray Tune libray and the ASHA algorithm. -# -# @section description_cifar_tune_asha description -# A standard ResNet18 model is trained on the cifar-10 vision dataset. To optimize the performance, multiple -# training runs (trials) with different hyperparameters (chagend learning rate and batch size) are performed using -# the Ray Tune library. The overall hyperparameter optimization process, as well as the single training runs can be -# parallelized across multiple GPUs. Trials with low performance (in terms of test set acuracy) are terminated early -# with the ASHA aglorithm. 
-# -# -# @section libraries_main Libraries/Modules -# - argparse standard library (https://docs.python.org/3/library/argparse.html) -# - Parse command-line options -# - sys standard library (https://docs.python.org/3/library/sys.html) -# - System commands -# - os standard library (https://docs.python.org/3/library/os.html) -# - OS commands -# - time standard library (https://docs.python.org/3/library/time.html) -# - Access timers for profilers -# - numpy library (https://numpy.org/) -# - Access numpy functions -# - random standard library (https://docs.python.org/3/library/time.html) -# - Generate random numbers -# - matplotlib library (https://matplotlib.org/) -# - Post-process data for validation -# - torch library (https://pytorch.org/) -# - ML framework -# - torchvision library (https://pypi.org/project/torchvision/) -# - Torch library additions for popular datasets and their transformations -# - ray libray (https://www.ray.io/) -# - Framework for distributed computing with a focus on hyperparameter optimization -# - pytz library (https://pythonhosted.org/pytz/) -# - Library for accurate and cross platform timezone calculation -# - python-dateutil (https://github.com/dateutil/dateutil) -# - Extension to pythons datetimes features -# - typing-extensions (https://pypi.org/project/typing-extensions/) -# - Support for different type systems -# -# @section notes_doxygen_example Notes -# - None. -# -# @section todo TODO -# - None. -# -# @section author Author(s) -# - Created by MA on 04/05/2023. -# - Modified by -# -# Copyright (c) 2023 RAISE, All rights reserved. - - - - -# load general modules -import argparse -import os -import time -import numpy as np - -# load torch and torchvision modules -import torch -import torch.nn as nn -import torch.optim as optim -import torch.distributed as dist - -import torchvision -from torchvision import datasets, transforms, models - -# load ray modules -import ray -from ray import tune -from ray.tune import CLIReporter -from ray.tune.schedulers import ASHAScheduler -from ray.air import session, RunConfig -import ray.train as train -from ray.train.torch import TorchTrainer -from ray.air.config import ScalingConfig -from ray.tune.tuner import Tuner, TuneConfig - - -def parsIni(): - parser = argparse.ArgumentParser(description='Ray Tune Cifar-10 Example') - parser.add_argument('--num-samples', type=int, default=24, metavar='N', - help='number of samples to train (default: 24)') - parser.add_argument('--max-iterations', type=int, default=10, metavar='N', - help='maximum iterations to train (default: 10)') - parser.add_argument('--par-workers', type=int, default=1, metavar='N', - help='parallel workers to train on a single trial (default: 1)') - parser.add_argument('--scheduler', type=str, default='RAND', - help='scheduler for tuning (default: RandomSearch)') - parser.add_argument('--data-dir', type=str, default='', - help='data directory for cifar-10 dataset') - - return parser - -def accuracy(output, target): - """! function that computes the accuracy of an output and target vector - @param output vector that the model predicted - @param target actual vector - - @return correct number of correct predictions - @return total number of total elements - """ - # get the index of the max log-probability - pred = output.max(1, keepdim=True)[1] - - # count correct classifications - correct = pred.eq(target.view_as(pred)).cpu().float().sum() - - # count total samples - total = target.size(0) - return correct, total - -def par_mean(field): - """! 
function that averages a field across all workers to a worker - @param field field in worker that should be averaged - - @return mean field - """ - - # convert field to tensor - res = torch.Tensor([field]) - - # move field to GPU/worker - res = res.cuda() - - # AllReduce operation - dist.all_reduce(res,op=dist.ReduceOp.SUM,group=None,async_op=True).wait() - - # average of number of workers - res/=dist.get_world_size() - - return res - -def par_sum(field): - """! function that sums a field across all workers to a worker - @param field field in worker that should be summed up - - @return sum of all fields - """ - # convert field to tensor - res = torch.Tensor([field]) - - # move field to GPU/worker - res = res.cuda() - - # AllReduce operation - dist.all_reduce(res,op=dist.ReduceOp.SUM,group=None,async_op=True).wait() - - return res - -def load_data(data_dir=None): - """! function that loads training and test set of cifar-10 - @param data_dir directory where the data is stored - - @return train_set training set of cifar-10 - @return test_set test set of cifar-10 - """ - # vision preprocessing values - mean = [x / 255 for x in [125.3, 123.0, 113.9]] - std = [x / 255 for x in [63.0, 62.1, 66.7]] - - # transformations for the training set - transform_train = transforms.Compose([ - transforms.RandomCrop(32, padding=4), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - transforms.Normalize(mean, std), - ]) - - # transformations for the testset - transform_test = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize(mean, std), - ]) - - - # load the cifar-10 dataset from directory - train_set = torchvision.datasets.CIFAR10( - root=data_dir, train=True, download=False, transform=transform_train) - - test_set = torchvision.datasets.CIFAR10( - root=data_dir, train=False, download=False, transform=transform_test) - - - return train_set, test_set - - -def train_cifar(config): - """! 
function to train a ResNet on cifar-10 with different hyperparameters - @param config hyperparameter search space - """ - - # load a ResNet model - model = models.resnet18() - - # prepare the model for Ray Tune - model = train.torch.prepare_model(model) - - # define optimizer and loss function - criterion = nn.CrossEntropyLoss() - optimizer = optim.SGD(model.parameters(), lr=config["lr"]*dist.get_world_size()) - - # load the training and test data - train_set, test_set = load_data(str(config["data_dir"])) - - # define the train and test dataloader - train_loader = torch.utils.data.DataLoader( - train_set, - batch_size=int(config["batch_size"]), - shuffle=True, - num_workers=30) - - test_loader = torch.utils.data.DataLoader( - test_set, - batch_size=int(config["batch_size"]), - shuffle=False, - num_workers=30) - - # prepare the dataloaders for Ray Tune - train_loader = train.torch.prepare_data_loader(train_loader) - test_loader = train.torch.prepare_data_loader(test_loader) - - - # prepare metrics - train_acc = 0 - train_correct = 0 - train_total = 0 - - test_acc = 0 - test_correct = 0 - test_total = 0 - - # training and testing loop - for epoch in range(100): - - # prepare model for training and loop over training dataset - model.train() - for i, (images, target) in enumerate(train_loader): - - # compute output - optimizer.zero_grad() - output = model(images) - - # compute loss - loss = criterion(output, target) - - # count correct classifications - tmp_correct, tmp_total = accuracy(output, target) - train_correct +=tmp_correct - train_total +=tmp_total - - # backpropagation and optimization step - loss.backward() - optimizer.step() - - # average the train metrics over all workers - train_correct = par_sum(train_correct) - train_total = par_sum(train_total) - - # compute final training accuracy - train_acc = train_correct/train_total - - # only perform the testing loop every 10 epochs - if ((epoch+1)%10 == 0): - - # prepare model for testing and loop over test dataset - model.eval() - with torch.no_grad(): - for i, (images, target) in enumerate(test_loader): - - # compute output - output = model(images) - - # count correct classifications - tmp_correct, tmp_total = accuracy(output, target) - test_correct +=tmp_correct - test_total +=tmp_total - - # average the test metrics over all workers - test_correct = par_sum(test_correct) - test_total = par_sum(test_total) - - # compute final test accuracy - test_acc = test_correct/test_total - - # report the training and testing accuracy back to the head node of Ray Tune - session.report({"train_acc": train_acc.item(), "test_acc": test_acc.item()}) - - - -def main(args): - """! 
main function - @param args input arguments - """ - - # initalize Ray with the correct adress and node ip adress - ray.init(address=os.environ['ip_head'], _node_ip_address=os.environ["head_node_ip"]) - - - # define the hyperparameter search space - config = { - "batch_size": tune.choice([64, 128, 256, 512]), - "lr": tune.loguniform(10e-5, 1), - "data_dir": tune.choice([args.data_dir]), - } - - # select a hyperparameter optimization algorithm - if (args.scheduler == "ASHA"): - # Asynchronous Successive Halving Algorithm - scheduler = ASHAScheduler( - # the number of iterations to allow the trials to run at max - max_t=args.max_iterations, - # how many iterations before a bad trials get terminated - grace_period=2, - # which percentage of trials to terminate - reduction_factor=3) - - # set search algorithm - search_alg = None - - if (args.scheduler == "RAND"): - # random scheduler - scheduler = None - search_alg = None - - # define a reporter/logger to specifify which metrics to print out during the optimization process - reporter = CLIReporter( - metric_columns=["train_acc", "test_acc", "training_iteration", "time_this_iter_s", "time_total_s"], - max_report_frequency=60) - - - # define the general RunConfig of Ray Tune - run_config = RunConfig( - # name of the training run (directory name). - name="cifar_test_training", - # directory to store the ray tune results in . - local_dir=os.path.join(os.path.abspath(os.getcwd()), "ray_results"), - # logger - progress_reporter=reporter, - # stopping criterion when to end the optimization process - stop={"training_iteration": args.max_iterations} - - ) - - # wrapping the torch training function inside a TorchTrainer logic - trainer = TorchTrainer( - # torch training function - train_loop_per_worker=train_cifar, - # default hyperparameters for the function - train_loop_config={"batch_size": 64, "lr": 0.1, "data_dir": "/"}, - # setting the default resources/workers to use for the training function, including the number of CPUs and GPUs - scaling_config=ScalingConfig(num_workers=args.par_workers, use_gpu=True, resources_per_worker={"CPU": 30, "GPU": 1}), - ) - - # defining the hyperparameter tuner - tuner = Tuner( - # function to tune - trainer, - # hyperparameter search space - param_space={"train_loop_config": config}, - # the tuning configuration - tune_config=TuneConfig( - # define how many trials to evaluate - num_samples=args.num_samples, - # define which metric to use for measuring the performance of the trials - metric="test_acc", - # if the metric should be maximized or minimized - mode="max", - # define which scheduler to use - scheduler=scheduler, - # define which search algorithm to use - search_alg=search_alg, - ), - run_config=run_config - ) - - # measure the total runtime - start_time = time.time() - - # start the optimization process - result = tuner.fit() - - runtime = time.time() - start_time - - # print total runtime - print("Total runtime: ", runtime) - - # print metrics of the best trial - best_result = result.get_best_result(metric="test_acc", mode="max") - - print("Best result metrics: ", best_result) - - # print results dataframe - print("Result dataframe: ") - print(result.get_dataframe().sort_values("test_acc", ascending=False)) - - -if __name__ == "__main__": - - # get custom arguments from parser - parser = parsIni() - args = parser.parse_args() - - # call the main function to launch Ray - main(args) \ No newline at end of file diff --git a/scripts/jureca_raytune/Ray_2.4/ASHA/jureca_ray_startscript.sh 
b/scripts/jureca_raytune/Ray_2.4/ASHA/jureca_ray_startscript.sh deleted file mode 100644 index 514498a3e482bb8b0ad90b3779fb3c1d577b4ba1..0000000000000000000000000000000000000000 --- a/scripts/jureca_raytune/Ray_2.4/ASHA/jureca_ray_startscript.sh +++ /dev/null @@ -1,79 +0,0 @@ -#!/bin/bash -# shellcheck disable=SC2206 - -#SBATCH --job-name=ray_cifar_test -#SBATCH --account= -#SBATCH --output=ray_test_cifar.out -#SBATCH --error=ray_test_cifar.err -#SBATCH --partition=dc-gpu -#SBATCH --nodes=2 -#SBATCH --tasks-per-node=1 -#SBATCH --cpus-per-task=128 -#SBATCH --gres=gpu:4 -#SBATCH --time=00:30:00 -#SBATCH --exclusive - - -ml --force purge - -ml Stages/2023 GCC/11.3.0 OpenMPI/4.1.4 PyTorch/1.12.0-CUDA-11.7 torchvision/0.13.1-CUDA-11.7 - -source ray_tune_env/bin/activate - -COMMAND="cifar_tune_asha.py --scheduler ASHA --num-samples 12 --par-workers 2 --max-iterations 2 --data-dir /p/scratch/raise-ctp2/cifar10/data " - -echo $COMMAND - -sleep 1 -# make sure CUDA devices are visible -export CUDA_VISIBLE_DEVICES="0,1,2,3" -export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK} - -num_gpus=4 - -## Limit number of max pending trials -export TUNE_MAX_PENDING_TRIALS_PG=$(($SLURM_NNODES * 4)) - -## Disable Ray Usage Stats -export RAY_USAGE_STATS_DISABLE=1 - - -####### this part is taken from the ray example slurm script ##### -set -x - -# __doc_head_address_start__ - -# Getting the node names -nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -nodes_array=($nodes) - -head_node=${nodes_array[0]} - -port=7638 - -export ip_head="$head_node"i:"$port" -export head_node_ip="$head_node"i - -echo "Starting HEAD at $head_node" -srun --nodes=1 --ntasks=1 -w "$head_node" \ - ray start --head --node-ip-address="$head_node"i --port=$port \ - --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus $num_gpus --block & - -# optional, though may be useful in certain versions of Ray < 1.0. -sleep 10 - -# number of nodes other than the head node -worker_num=$((SLURM_JOB_NUM_NODES - 1)) - -for ((i = 1; i <= worker_num; i++)); do - node_i=${nodes_array[$i]} - echo "Starting WORKER $i at $node_i" - srun --nodes=1 --ntasks=1 -w "$node_i" \ - ray start --address "$head_node"i:"$port" --redis-password='5241590000000000' \ - --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus $num_gpus --block & - sleep 5 -done - -echo "Ready" - -python -u $COMMAND diff --git a/scripts/jureca_raytune/Ray_2.4/BOHB/.gitkeep b/scripts/jureca_raytune/Ray_2.4/BOHB/.gitkeep deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/scripts/jureca_raytune/Ray_2.4/BOHB/cifar_tune_bohb.py b/scripts/jureca_raytune/Ray_2.4/BOHB/cifar_tune_bohb.py deleted file mode 100644 index 035d72af3c2936b89ebc99effb8494048f3f285d..0000000000000000000000000000000000000000 --- a/scripts/jureca_raytune/Ray_2.4/BOHB/cifar_tune_bohb.py +++ /dev/null @@ -1,427 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -"""! @brief AI4HPC """ - -## -# @mainpage AI4HPC -# -# @section description_main Description -# Hyperparameter optimization of neural networks with Ray Tune library. -# -# -# -# @section notes_main Notes -# - The data directory of the cifar-10 dataset has the be specified in the startscript -# -# Copyright (c) 2023 RAISE, All rights reserved. - - -## -# @file cifar_tune_bohb.py -# -# @brief Optimizing the hyperparameters of a ResNet18 trained on the cifar-10 dataset with Ray Tune libray and the BOHB algorithm. 
-# -# @section description_cifar_tune_bohb description -# A standard ResNet18 model is trained on the cifar-10 vision dataset. To optimize the performance, multiple -# training runs (trials) with different hyperparameters (chagend learning rate and batch size) are performed using -# the Ray Tune library. The overall hyperparameter optimization process, as well as the single training runs can be -# parallelized across multiple GPUs. Trials with low performance (in terms of test set acuracy) are terminated early -# and their resources are assigned to new samples with the BOHB aglorithm. -# -# -# @section libraries_main Libraries/Modules -# - argparse standard library (https://docs.python.org/3/library/argparse.html) -# - Parse command-line options -# - sys standard library (https://docs.python.org/3/library/sys.html) -# - System commands -# - os standard library (https://docs.python.org/3/library/os.html) -# - OS commands -# - time standard library (https://docs.python.org/3/library/time.html) -# - Access timers for profilers -# - numpy library (https://numpy.org/) -# - Access numpy functions -# - torch library (https://pytorch.org/) -# - ML framework -# - torchvision library (https://pypi.org/project/torchvision/) -# - Torch library additions for popular datasets and their transformations -# - ray libray (https://www.ray.io/) -# - Framework for distributed computing with a focus on hyperparameter optimization -# - pytz library (https://pythonhosted.org/pytz/) -# - Library for accurate and cross platform timezone calculation -# - python-dateutil (https://github.com/dateutil/dateutil) -# - Extension to pythons datetimes features -# - typing-extensions (https://pypi.org/project/typing-extensions/) -# - Support for different type systems -# - hpbandster library (https://automl.github.io/HpBandSter/build/html/quickstart.html) -# - Library for performing hyperband operations -# - ConfigSpace library (https://automl.github.io/ConfigSpace/main/) -# - Library to manage configuration and search spaces for hyperparameter optimization -# -# @section notes_doxygen_example Notes -# - None. -# -# @section todo TODO -# - None. -# -# @section author Author(s) -# - Created by MA on 04/05/2023. -# - Modified by -# -# Copyright (c) 2023 RAISE, All rights reserved. 
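# A minimal, self-contained sketch of the BOHB pairing that main() below wires into the
# Tuner: HyperBandForBOHB allocates the per-trial budget via successive-halving brackets,
# while TuneBOHB proposes new configurations from a model fitted on completed trials, and
# the two are meant to be used together. Values mirror this script; max_t is assumed to
# be the --max-iterations default of 10.
from ray.tune.schedulers.hb_bohb import HyperBandForBOHB
from ray.tune.search.bohb import TuneBOHB
from ray.tune.tuner import TuneConfig

bohb_scheduler = HyperBandForBOHB(time_attr="training_iteration", max_t=10, reduction_factor=3)
bohb_search = TuneBOHB(seed=42)
tune_config = TuneConfig(metric="test_acc", mode="max",
                         scheduler=bohb_scheduler, search_alg=bohb_search)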
- - -# load general modules -import argparse -import os -import time -import numpy as np - -# load torch and torchvision modules -import torch -import torch.nn as nn -import torch.optim as optim -import torch.distributed as dist - -import torchvision -from torchvision import datasets, transforms, models - -# load ray modules -import ray -from ray import tune -from ray.tune import CLIReporter -from ray.tune.schedulers.hb_bohb import HyperBandForBOHB -from ray.tune.search.bohb import TuneBOHB -from ray.air import session, RunConfig -import ray.train as train -from ray.train.torch import TorchTrainer -from ray.air.config import ScalingConfig -from ray.tune.tuner import Tuner, TuneConfig - - - - -def parsIni(): - parser = argparse.ArgumentParser(description='Ray Tune Cifar-10 Example') - parser.add_argument('--num-samples', type=int, default=24, metavar='N', - help='number of samples to train (default: 24)') - parser.add_argument('--max-iterations', type=int, default=10, metavar='N', - help='maximum iterations to train (default: 10)') - parser.add_argument('--par-workers', type=int, default=1, metavar='N', - help='parallel workers to train on a single trial (default: 1)') - parser.add_argument('--scheduler', type=str, default='RAND', - help='scheduler for tuning (default: RandomSearch)') - parser.add_argument('--data-dir', type=str, default='', - help='data directory for cifar-10 dataset') - - return parser - -def accuracy(output, target): - """! function that computes the accuracy of an output and target vector - @param output vector that the model predicted - @param target actual vector - - @return correct number of correct predictions - @return total number of total elements - """ - # get the index of the max log-probability - pred = output.max(1, keepdim=True)[1] - - # count correct classifications - correct = pred.eq(target.view_as(pred)).cpu().float().sum() - - # count total samples - total = target.size(0) - return correct, total - -def par_mean(field): - """! function that averages a field across all workers to a worker - @param field field in worker that should be averaged - - @return mean field - """ - - # convert field to tensor - res = torch.Tensor([field]) - - # move field to GPU/worker - res = res.cuda() - - # AllReduce operation - dist.all_reduce(res,op=dist.ReduceOp.SUM,group=None,async_op=True).wait() - - # average of number of workers - res/=dist.get_world_size() - - return res - -def par_sum(field): - """! function that sums a field across all workers to a worker - @param field field in worker that should be summed up - - @return sum of all fields - """ - # convert field to tensor - res = torch.Tensor([field]) - - # move field to GPU/worker - res = res.cuda() - - # AllReduce operation - dist.all_reduce(res,op=dist.ReduceOp.SUM,group=None,async_op=True).wait() - - return res - -def load_data(data_dir=None): - """! 
function that loads training and test set of cifar-10 - @param data_dir directory where the data is stored - - @return train_set training set of cifar-10 - @return test_set test set of cifar-10 - """ - # vision preprocessing values - mean = [x / 255 for x in [125.3, 123.0, 113.9]] - std = [x / 255 for x in [63.0, 62.1, 66.7]] - - # transformations for the training set - transform_train = transforms.Compose([ - transforms.RandomCrop(32, padding=4), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - transforms.Normalize(mean, std), - ]) - - # transformations for the testset - transform_test = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize(mean, std), - ]) - - - # load the cifar-10 dataset from directory - train_set = torchvision.datasets.CIFAR10( - root=data_dir, train=True, download=False, transform=transform_train) - - test_set = torchvision.datasets.CIFAR10( - root=data_dir, train=False, download=False, transform=transform_test) - - - return train_set, test_set - - -def train_cifar(config): - """! function to train a ResNet on cifar-10 with different hyperparameters - @param config hyperparameter search space - """ - - # load a ResNet model - model = models.resnet18() - - # prepare the model for Ray Tune - model = train.torch.prepare_model(model) - - # define optimizer and loss function - criterion = nn.CrossEntropyLoss() - optimizer = optim.SGD(model.parameters(), lr=config["lr"]*dist.get_world_size()) - - # load the training and test data - train_set, test_set = load_data(str(config["data_dir"])) - - # define the train and test dataloader - train_loader = torch.utils.data.DataLoader( - train_set, - batch_size=int(config["batch_size"]), - shuffle=True, - num_workers=30) - - test_loader = torch.utils.data.DataLoader( - test_set, - batch_size=int(config["batch_size"]), - shuffle=False, - num_workers=30) - - # prepare the dataloaders for Ray Tune - train_loader = train.torch.prepare_data_loader(train_loader) - test_loader = train.torch.prepare_data_loader(test_loader) - - - # prepare metrics - train_acc = 0 - train_correct = 0 - train_total = 0 - - test_acc = 0 - test_correct = 0 - test_total = 0 - - # training and testing loop - for epoch in range(100): - - # prepare model for training and loop over training dataset - model.train() - for i, (images, target) in enumerate(train_loader): - - # compute output - optimizer.zero_grad() - output = model(images) - - # compute loss - loss = criterion(output, target) - - # count correct classifications - tmp_correct, tmp_total = accuracy(output, target) - train_correct +=tmp_correct - train_total +=tmp_total - - # backpropagation and optimization step - loss.backward() - optimizer.step() - - # average the train metrics over all workers - train_correct = par_sum(train_correct) - train_total = par_sum(train_total) - - # compute final training accuracy - train_acc = train_correct/train_total - - # only perform the testing loop every 10 epochs - if ((epoch+1)%10 == 0): - - # prepare model for testing and loop over test dataset - model.eval() - with torch.no_grad(): - for i, (images, target) in enumerate(test_loader): - - # compute output - output = model(images) - - # count correct classifications - tmp_correct, tmp_total = accuracy(output, target) - test_correct +=tmp_correct - test_total +=tmp_total - - # average the test metrics over all workers - test_correct = par_sum(test_correct) - test_total = par_sum(test_total) - - # compute final test accuracy - test_acc = test_correct/test_total - - # report the training and 
testing accuracy back to the head node of Ray Tune - session.report({"train_acc": train_acc.item(), "test_acc": test_acc.item()}) - - - -def main(args): - """! main function - @param args input arguments - """ - - # initalize Ray with the correct adress and node ip adress - ray.init(address=os.environ['ip_head'], _node_ip_address=os.environ["head_node_ip"]) - - - # define the hyperparameter search space - config = { - "batch_size": tune.choice([64, 128, 256, 512]), - "lr": tune.loguniform(10e-5, 1), - "data_dir": tune.choice([args.data_dir]), - } - - # select a hyperparameter optimization algorithm - - if (args.scheduler == "BOHB"): - # Bayesian Optimization and HyperBand - scheduler = HyperBandForBOHB( - # time attribute - time_attr="training_iteration", - # the number of iterations to allow the trials to run at max - max_t=args.max_iterations, - # which percentage of trials to terminate - reduction_factor=3) - - search_alg = TuneBOHB(seed=42) - - if (args.scheduler == "RAND"): - # random scheduler - scheduler = None - search_alg = None - - # define a reporter/logger to specifify which metrics to print out during the optimization process - reporter = CLIReporter( - metric_columns=["train_acc", "test_acc", "training_iteration", "time_this_iter_s", "time_total_s"], - max_report_frequency=60) - - - # define the general RunConfig of Ray Tune - run_config = RunConfig( - # name of the training run (directory name). - name="cifar_test_training", - # directory to store the ray tune results in . - local_dir=os.path.join(os.path.abspath(os.getcwd()), "ray_results"), - # logger - progress_reporter=reporter, - # stopping criterion when to end the optimization process - stop={"training_iteration": args.max_iterations} - - ) - - # wrapping the torch training function inside a TorchTrainer logic - trainer = TorchTrainer( - # torch training function - train_loop_per_worker=train_cifar, - # default hyperparameters for the function - train_loop_config={"batch_size": 64, "lr": 0.1, "data_dir": "/"}, - # setting the default resources/workers to use for the training function, including the number of CPUs and GPUs - scaling_config=ScalingConfig(num_workers=args.par_workers, use_gpu=True, resources_per_worker={"CPU": 30, "GPU": 1}), - ) - - # defining the hyperparameter tuner - tuner = Tuner( - # function to tune - trainer, - # hyperparameter search space - param_space={"train_loop_config": config}, - # the tuning configuration - tune_config=TuneConfig( - # define how many trials to evaluate - num_samples=args.num_samples, - # define which metric to use for measuring the performance of the trials - metric="test_acc", - # if the metric should be maximized or minimized - mode="max", - # define which scheduler to use - scheduler=scheduler, - # define which search algorithm to use - search_alg=search_alg), - run_config=run_config - ) - - # measure the total runtime - start_time = time.time() - - # start the optimization process - result = tuner.fit() - - runtime = time.time() - start_time - - # print total runtime - print("Total runtime: ", runtime) - - # print metrics of the best trial - best_result = result.get_best_result(metric="test_acc", mode="max") - - print("Best result metrics: ", best_result) - - # print results dataframe - print("Result dataframe: ") - print(result.get_dataframe().sort_values("test_acc", ascending=False)) - - -if __name__ == "__main__": - - # get custom arguments from parser - parser = parsIni() - args = parser.parse_args() - - # call the main function to launch Ray - main(args) \ No 
newline at end of file diff --git a/scripts/jureca_raytune/Ray_2.4/BOHB/jureca_ray_startscript.sh b/scripts/jureca_raytune/Ray_2.4/BOHB/jureca_ray_startscript.sh deleted file mode 100644 index f35209fb2b0465d3566f0394e180e54aa5f67d80..0000000000000000000000000000000000000000 --- a/scripts/jureca_raytune/Ray_2.4/BOHB/jureca_ray_startscript.sh +++ /dev/null @@ -1,78 +0,0 @@ -#!/bin/bash -# shellcheck disable=SC2206 - -#SBATCH --job-name=ray_cifar_test -#SBATCH --account= -#SBATCH --output=ray_test_cifar.out -#SBATCH --error=ray_test_cifar.err -#SBATCH --partition=dc-gpu -#SBATCH --nodes=2 -#SBATCH --tasks-per-node=1 -#SBATCH --cpus-per-task=128 -#SBATCH --gres=gpu:4 -#SBATCH --time=00:30:00 -#SBATCH --exclusive - -ml --force purge - -ml Stages/2023 GCC/11.3.0 OpenMPI/4.1.4 PyTorch/1.12.0-CUDA-11.7 torchvision/0.13.1-CUDA-11.7 - -source ray_tune_env/bin/activate - -COMMAND="cifar_tune_bohb.py --scheduler BOHB --num-samples 12 --par-workers 2 --max-iterations 2 --data-dir /p/scratch/raise-ctp2/cifar10/data " - -echo $COMMAND - -sleep 1 -# make sure CUDA devices are visible -export CUDA_VISIBLE_DEVICES="0,1,2,3" -export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK} - -num_gpus=4 - -## Limit number of max pending trials -export TUNE_MAX_PENDING_TRIALS_PG=$(($SLURM_NNODES * 4)) - -## Disable Ray Usage Stats -export RAY_USAGE_STATS_DISABLE=1 - - -####### this part is taken from the ray example slurm script ##### -set -x - -# __doc_head_address_start__ - -# Getting the node names -nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -nodes_array=($nodes) - -head_node=${nodes_array[0]} - -port=7638 - -export ip_head="$head_node"i:"$port" -export head_node_ip="$head_node"i - -echo "Starting HEAD at $head_node" -srun --nodes=1 --ntasks=1 -w "$head_node" \ - ray start --head --node-ip-address="$head_node"i --port=$port \ - --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus $num_gpus --block & - -# optional, though may be useful in certain versions of Ray < 1.0. -sleep 10 - -# number of nodes other than the head node -worker_num=$((SLURM_JOB_NUM_NODES - 1)) - -for ((i = 1; i <= worker_num; i++)); do - node_i=${nodes_array[$i]} - echo "Starting WORKER $i at $node_i" - srun --nodes=1 --ntasks=1 -w "$node_i" \ - ray start --address "$head_node"i:"$port" --redis-password='5241590000000000' \ - --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus $num_gpus --block & - sleep 5 -done - -echo "Ready" - -python -u $COMMAND diff --git a/scripts/jureca_raytune/Ray_2.4/PBT/.gitkeep b/scripts/jureca_raytune/Ray_2.4/PBT/.gitkeep deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/scripts/jureca_raytune/Ray_2.4/PBT/cifar_tune_pbt.py b/scripts/jureca_raytune/Ray_2.4/PBT/cifar_tune_pbt.py deleted file mode 100644 index 87a43d66a21c7d315aaedbd8fafecbfd68983d32..0000000000000000000000000000000000000000 --- a/scripts/jureca_raytune/Ray_2.4/PBT/cifar_tune_pbt.py +++ /dev/null @@ -1,459 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -"""! @brief AI4HPC """ - -## -# @mainpage AI4HPC -# -# @section description_main Description -# Hyperparameter optimization of neural networks with Ray Tune library. -# -# -# -# @section notes_main Notes -# - The data directory of the cifar-10 dataset has the be specified in the startscript -# -# Copyright (c) 2023 RAISE, All rights reserved. - - -## -# @file cifar_tune_pbt.py -# -# @brief Optimizing the hyperparameters of a ResNet18 trained on the cifar-10 dataset with Ray Tune libray and the PBT algorithm. 
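# A minimal, self-contained sketch of the PBT scheduler configured in main() further
# down; the values mirror this script. Every perturbation_interval iterations, trials in
# the bottom quantile_fraction copy the checkpoint and hyperparameters of trials in the
# top quantile_fraction and mutate them (here only the learning rate is mutated).
from ray import tune
from ray.tune.schedulers import PopulationBasedTraining

pbt_scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    perturbation_interval=1,
    hyperparam_mutations={"train_loop_config": {"lr": tune.loguniform(10e-5, 1)}},
    quantile_fraction=0.33,
    resample_probability=0,
)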
-# -# @section description_cifar_tune_pbt description -# A standard ResNet18 model is trained on the cifar-10 vision dataset. To optimize the performance, multiple -# training runs (trials) with different hyperparameters (chagend learning rate and batch size) are performed using -# the Ray Tune library. The overall hyperparameter optimization process, as well as the single training runs can be -# parallelized across multiple GPUs. Trials with low performance (in terms of test set acuracy) copy the hyperparameters -# of better performing trials and apply mutations with the PBT aglorithm. -# -# @section libraries_main Libraries/Modules -# - argparse standard library (https://docs.python.org/3/library/argparse.html) -# - Parse command-line options -# - sys standard library (https://docs.python.org/3/library/sys.html) -# - System commands -# - os standard library (https://docs.python.org/3/library/os.html) -# - OS commands -# - time standard library (https://docs.python.org/3/library/time.html) -# - Access timers for profilers -# - numpy library (https://numpy.org/) -# - Access numpy functions -# - torch library (https://pytorch.org/) -# - ML framework -# - torchvision library (https://pypi.org/project/torchvision/) -# - Torch library additions for popular datasets and their transformations -# - ray libray (https://www.ray.io/) -# - Framework for distributed computing with a focus on hyperparameter optimization -# - pytz library (https://pythonhosted.org/pytz/) -# - Library for accurate and cross platform timezone calculation -# - python-dateutil (https://github.com/dateutil/dateutil) -# - Extension to pythons datetimes features -# - typing-extensions (https://pypi.org/project/typing-extensions/) -# - Support for different type systems -# -# @section notes_doxygen_example Notes -# - None. -# -# @section todo TODO -# - None. -# -# @section author Author(s) -# - Created by MA on 04/05/2023. -# - Modified by -# -# Copyright (c) 2023 RAISE, All rights reserved. - -# load general modules -import argparse -import os -import time -import numpy as np - -# load torch and torchvision modules -import torch -import torch.nn as nn -import torch.optim as optim -import torch.distributed as dist - -import torchvision -from torchvision import datasets, transforms, models - -# load ray modules -import ray -from ray import tune -from ray.tune import CLIReporter -from ray.tune.schedulers import PopulationBasedTraining -from ray.air import session, Checkpoint, RunConfig -import ray.train as train -from ray.train.torch import TorchTrainer -from ray.air.config import ScalingConfig -from ray.tune.tuner import Tuner, TuneConfig - - - - -def parsIni(): - parser = argparse.ArgumentParser(description='Ray Tune Cifar-10 Example') - parser.add_argument('--num-samples', type=int, default=24, metavar='N', - help='number of samples to train (default: 24)') - parser.add_argument('--max-iterations', type=int, default=10, metavar='N', - help='maximum iterations to train (default: 10)') - parser.add_argument('--par-workers', type=int, default=1, metavar='N', - help='parallel workers to train on a single trial (default: 1)') - parser.add_argument('--scheduler', type=str, default='RAND', - help='scheduler for tuning (default: RandomSearch)') - parser.add_argument('--data-dir', type=str, default='', - help='data directory for cifar-10 dataset') - - return parser - -def accuracy(output, target): - """! 
function that computes the accuracy of an output and target vector - @param output vector that the model predicted - @param target actual vector - - @return correct number of correct predictions - @return total number of total elements - """ - # get the index of the max log-probability - pred = output.max(1, keepdim=True)[1] - - # count correct classifications - correct = pred.eq(target.view_as(pred)).cpu().float().sum() - - # count total samples - total = target.size(0) - return correct, total - -def par_mean(field): - """! function that averages a field across all workers to a worker - @param field field in worker that should be averaged - - @return mean field - """ - - # convert field to tensor - res = torch.Tensor([field]) - - # move field to GPU/worker - res = res.cuda() - - # AllReduce operation - dist.all_reduce(res,op=dist.ReduceOp.SUM,group=None,async_op=True).wait() - - # average of number of workers - res/=dist.get_world_size() - - return res - -def par_sum(field): - """! function that sums a field across all workers to a worker - @param field field in worker that should be summed up - - @return sum of all fields - """ - # convert field to tensor - res = torch.Tensor([field]) - - # move field to GPU/worker - res = res.cuda() - - # AllReduce operation - dist.all_reduce(res,op=dist.ReduceOp.SUM,group=None,async_op=True).wait() - - return res - -def load_data(data_dir=None): - """! function that loads training and test set of cifar-10 - @param data_dir directory where the data is stored - - @return train_set training set of cifar-10 - @return test_set test set of cifar-10 - """ - # vision preprocessing values - mean = [x / 255 for x in [125.3, 123.0, 113.9]] - std = [x / 255 for x in [63.0, 62.1, 66.7]] - - # transformations for the training set - transform_train = transforms.Compose([ - transforms.RandomCrop(32, padding=4), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - transforms.Normalize(mean, std), - ]) - - # transformations for the testset - transform_test = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize(mean, std), - ]) - - - # load the cifar-10 dataset from directory - train_set = torchvision.datasets.CIFAR10( - root=data_dir, train=True, download=False, transform=transform_train) - - test_set = torchvision.datasets.CIFAR10( - root=data_dir, train=False, download=False, transform=transform_test) - - - return train_set, test_set - - -def train_cifar(config): - """! function to train a ResNet on cifar-10 with different hyperparameters - @param config hyperparameter search space - """ - # PBT specific variable - step = 1 - - print("Starting Trials") - - # load a ResNet model - model = models.resnet18() - - # define optimizer and loss function - criterion = nn.CrossEntropyLoss() - optimizer = optim.SGD(model.parameters(), lr=config["lr"]*dist.get_world_size()) - - if session.get_checkpoint(): - # Load model state and iteration step from checkpoint. - checkpoint_dict = session.get_checkpoint().to_dict() - model.load_state_dict(checkpoint_dict["model_state_dict"]) - # Load optimizer state (needed since we're using momentum), - # then set the `lr` and `momentum` according to the config. - optimizer.load_state_dict(checkpoint_dict["optimizer_state_dict"]) - - # Note: Make sure to increment the checkpointed step by 1 to get the current step. 
- last_step = checkpoint_dict["step"] - step = last_step + 1 - - - - # prepare the model for Ray Tune - model = train.torch.prepare_model(model) - - # load the training and test data - train_set, test_set = load_data(str(config["data_dir"])) - - # define the train and test dataloader - train_loader = torch.utils.data.DataLoader( - train_set, - batch_size=int(config["batch_size"]), - shuffle=True, - num_workers=30) - - test_loader = torch.utils.data.DataLoader( - test_set, - batch_size=int(config["batch_size"]), - shuffle=False, - num_workers=30) - - # prepare the dataloaders for Ray Tune - train_loader = train.torch.prepare_data_loader(train_loader) - test_loader = train.torch.prepare_data_loader(test_loader) - - - # prepare metrics - train_acc = 0 - train_correct = 0 - train_total = 0 - - test_acc = 0 - test_correct = 0 - test_total = 0 - - # training and testing loop - for epoch in range(100): - - # prepare model for training and loop over training dataset - model.train() - for i, (images, target) in enumerate(train_loader): - - # compute output - optimizer.zero_grad() - output = model(images) - - # compute loss - loss = criterion(output, target) - - # count correct classifications - tmp_correct, tmp_total = accuracy(output, target) - train_correct +=tmp_correct - train_total +=tmp_total - - # backpropagation and optimization step - loss.backward() - optimizer.step() - - # average the train metrics over all workers - train_correct = par_sum(train_correct) - train_total = par_sum(train_total) - - # compute final training accuracy - train_acc = train_correct/train_total - - # only perform the testing loop every 10 epochs - if ((epoch+1)%10 == 0): - - # prepare model for testing and loop over test dataset - model.eval() - with torch.no_grad(): - for i, (images, target) in enumerate(test_loader): - - # compute output - output = model(images) - - # count correct classifications - tmp_correct, tmp_total = accuracy(output, target) - test_correct +=tmp_correct - test_total +=tmp_total - - # average the test metrics over all workers - test_correct = par_sum(test_correct) - test_total = par_sum(test_total) - - # compute final test accuracy - test_acc = test_correct/test_total - - # checkpoint the training - checkpoint = Checkpoint.from_dict({ - "step": step, - "model_state_dict": model.state_dict(), - "optimizer_state_dict": optimizer.state_dict(), - }) - - # report the training and testing accuracy back to the head node of Ray Tune - session.report({"train_acc": train_acc.item(), "test_acc": test_acc.item()}, checkpoint=checkpoint) - - step += 1 - - - -def main(args): - """! main function - @param args input arguments - """ - - # initalize Ray with the correct adress and node ip adress - ray.init(address=os.environ['ip_head'], _node_ip_address=os.environ["head_node_ip"]) - - - # define the (original) hyperparameter search space - config = { - "batch_size": tune.choice([64, 128, 256, 512]), - "lr": tune.loguniform(10e-5, 1), - "data_dir": tune.choice([args.data_dir]), - } - - # define the mutation config - mutation_config = {"lr": tune.loguniform(10e-5, 1),} - - # select a hyperparameter optimization algorithm - - if (args.scheduler == "PBT"): - # Population Based Training - scheduler = PopulationBasedTraining( - # time attribute - time_attr="training_iteration", - # intervals at that perturbations occur, - perturbation_interval=1, - # specification of hyperparameter mutatation search space (can be different than original search space!) 
- hyperparam_mutations={"train_loop_config": mutation_config}, - # the parameters of the top quantile_fraction percentage trials are transfered to the bottom quantile_fraction percentage of trials - quantile_fraction=0.33, - # probability to resample from original hyperparameter search space - resample_probability=0, - ) - - search_alg= None - - if (args.scheduler == "RAND"): - # random scheduler - scheduler = None - search_alg = None - - # define a reporter/logger to specifify which metrics to print out during the optimization process - reporter = CLIReporter( - metric_columns=["train_acc", "test_acc", "training_iteration", "time_this_iter_s", "time_total_s"], - max_report_frequency=60) - - - # define the general RunConfig of Ray Tune - run_config = RunConfig( - # name of the training run (directory name). - name="cifar_test_training", - # directory to store the ray tune results in . - local_dir=os.path.join(os.path.abspath(os.getcwd()), "ray_results"), - # logger - progress_reporter=reporter, - # stopping criterion when to end the optimization process - stop={"training_iteration": args.max_iterations}, - #checkpointing - checkpoint_config=ray.air.CheckpointConfig( - checkpoint_score_attribute="test_acc", - ), - - ) - - # wrapping the torch training function inside a TorchTrainer logic - trainer = TorchTrainer( - # torch training function - train_loop_per_worker=train_cifar, - # default hyperparameters for the function - train_loop_config={"batch_size": 64, "lr": 0.1, "data_dir": "/"}, - # setting the default resources/workers to use for the training function, including the number of CPUs and GPUs - scaling_config=ScalingConfig(num_workers=args.par_workers, use_gpu=True, resources_per_worker={"CPU": 30, "GPU": 1}), - ) - - # defining the hyperparameter tuner - tuner = Tuner( - # function to tune - trainer, - # general hyperparameter search space - param_space={"train_loop_config": config}, - # the tuning configuration - tune_config=TuneConfig( - # define how many trials to evaluate - num_samples=args.num_samples, - # define which metric to use for measuring the performance of the trials - metric="test_acc", - # if the metric should be maximized or minimized - mode="max", - # define which scheduler to use - scheduler=scheduler, - # define which search algorithm to use - search_alg=search_alg), - run_config=run_config, - ) - - # measure the total runtime - start_time = time.time() - - # start the optimization process - result = tuner.fit() - - runtime = time.time() - start_time - - # print total runtime - print("Total runtime: ", runtime) - - # print metrics of the best trial - best_result = result.get_best_result(metric="test_acc", mode="max") - - print("Best result metrics: ", best_result) - - # print results dataframe - print("Result dataframe: ") - print(result.get_dataframe().sort_values("test_acc", ascending=False)) - - -if __name__ == "__main__": - - # get custom arguments from parser - parser = parsIni() - args = parser.parse_args() - - # call the main function to launch Ray - main(args) \ No newline at end of file diff --git a/scripts/jureca_raytune/Ray_2.4/PBT/jureca_ray_startscript.sh b/scripts/jureca_raytune/Ray_2.4/PBT/jureca_ray_startscript.sh deleted file mode 100644 index e2fbc413b27e4d196f57d0b9d5bcfb08060d3c54..0000000000000000000000000000000000000000 --- a/scripts/jureca_raytune/Ray_2.4/PBT/jureca_ray_startscript.sh +++ /dev/null @@ -1,78 +0,0 @@ -#!/bin/bash -# shellcheck disable=SC2206 - -#SBATCH --job-name=ray_cifar_test -#SBATCH --account= -#SBATCH 
--output=ray_test_cifar.out -#SBATCH --error=ray_test_cifar.err -#SBATCH --partition=dc-gpu -#SBATCH --nodes=2 -#SBATCH --tasks-per-node=1 -#SBATCH --cpus-per-task=128 -#SBATCH --gres=gpu:4 -#SBATCH --time=00:30:00 -#SBATCH --exclusive - -ml --force purge - -ml Stages/2023 GCC/11.3.0 OpenMPI/4.1.4 PyTorch/1.12.0-CUDA-11.7 torchvision/0.13.1-CUDA-11.7 - -source ray_tune_env/bin/activate - -COMMAND="cifar_tune_pbt.py --scheduler PBT --num-samples 8 --par-workers 2 --max-iterations 5 --data-dir /p/scratch/raise-ctp2/cifar10/data " - -echo $COMMAND - -sleep 1 -# make sure CUDA devices are visible -export CUDA_VISIBLE_DEVICES="0,1,2,3" -export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK} - -num_gpus=4 - -## Limit number of max pending trials -export TUNE_MAX_PENDING_TRIALS_PG=$(($SLURM_NNODES * 4)) - -## Disable Ray Usage Stats -export RAY_USAGE_STATS_DISABLE=1 - - -####### this part is taken from the ray example slurm script ##### -set -x - -# __doc_head_address_start__ - -# Getting the node names -nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -nodes_array=($nodes) - -head_node=${nodes_array[0]} - -port=7638 - -export ip_head="$head_node"i:"$port" -export head_node_ip="$head_node"i - -echo "Starting HEAD at $head_node" -srun --nodes=1 --ntasks=1 -w "$head_node" \ - ray start --head --node-ip-address="$head_node"i --port=$port \ - --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus $num_gpus --block & - -# optional, though may be useful in certain versions of Ray < 1.0. -sleep 10 - -# number of nodes other than the head node -worker_num=$((SLURM_JOB_NUM_NODES - 1)) - -for ((i = 1; i <= worker_num; i++)); do - node_i=${nodes_array[$i]} - echo "Starting WORKER $i at $node_i" - srun --nodes=1 --ntasks=1 -w "$node_i" \ - ray start --address "$head_node"i:"$port" --redis-password='5241590000000000' \ - --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus $num_gpus --block & - sleep 5 -done - -echo "Ready" - -python -u $COMMAND diff --git a/scripts/jureca_raytune/Ray_2.4/build_ray_env.sh b/scripts/jureca_raytune/Ray_2.4/build_ray_env.sh deleted file mode 100644 index 645dea2fc55fb0f0e98b038f0eb4fa53682edada..0000000000000000000000000000000000000000 --- a/scripts/jureca_raytune/Ray_2.4/build_ray_env.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -ml --force purge - -ml Stages/2023 GCC/11.3.0 OpenMPI/4.1.4 PyTorch/1.12.0-CUDA-11.7 torchvision/0.13.1-CUDA-11.7 - -python3 -m venv ray_tune_env - -source ray_tune_env/bin/activate - -pip3 install ray==2.4.0 ray[tune]==2.4.0 -pip3 install python-dateutil pytz typing-extensions -pip3 install hpbandster ConfigSpace -deactivate \ No newline at end of file diff --git a/scripts/jureca_raytune/Ray_2.4/hpo.md b/scripts/jureca_raytune/Ray_2.4/hpo.md deleted file mode 100644 index ee02067c1021bbdc0e6618f5e77e2eef272b2110..0000000000000000000000000000000000000000 --- a/scripts/jureca_raytune/Ray_2.4/hpo.md +++ /dev/null @@ -1,58 +0,0 @@ -# Hyperparameter Optimization of Machine Learning Models with Ray Tune - -For the optimization of the hyperparameters of neural networks (such as learning rate or batch size) or machine learning models in general, the Ray Tune library (current version supported is 2.4.0) can be used. The library features a smooth integration of PyTorch-based training scripts and enables two stages of parallelism: - -- each training of a model with different hyperparameters (trial) can run in parallel on multiple GPUs (e.g. 
via PyTorch-DDP) -- several trials can run in parallel on an HPC machine (via Ray Tune itself) - -For installation of Ray Tune, run the installation script - -```bash -bash build_ray_env.sh -``` - -After installation, several examples are available: - -1. [Optimizing a ResNet18 on cifar-10 with ASHA or Random Search schedulers](https://gitlab.jsc.fz-juelich.de/CoE-RAISE/FZJ/ai-for-hpc/-/tree/main/Jureca_RayTune/Ray_2.4/ASHA) -2. [Optimizing a ResNet18 on cifar-10 with BOHB or Random Search schedulers](https://gitlab.jsc.fz-juelich.de/CoE-RAISE/FZJ/ai-for-hpc/-/tree/main/Jureca_RayTune/Ray_2.4/BOHB) -3. [Optimizing a ResNet18 on cifar-10 with PBT or Random Search schedulers (including checkpointing)](https://gitlab.jsc.fz-juelich.de/CoE-RAISE/FZJ/ai-for-hpc/-/tree/main/Jureca_RayTune/Ray_2.4/PBT) - - -The [ASHA](https://arxiv.org/pdf/1810.05934.pdf) scheduler is a variation of Random Search with early stopping of under-performing trials. The [BOHB](http://proceedings.mlr.press/v80/falkner18a/falkner18a.pdf) scheduler uses Bayesian Optimization in combination with early stopping, while the [PBT](https://arxiv.org/pdf/1711.09846.pdf) scheduler uses evolutionary optimization and is well suited for optimizing non-stationary hyperparameters (such as learning rate schedules). - -The following parameters can be set for each script: - -- num-samples: number of samples (trials) to evaluate -- max-iterations: for how long to train the trials at max -- par-workers: how many workers to allocate per trial -- scheduler: which scheduler to use -- data-dir: directory where the datasets are stored - -To submit a job to the JURECA-DC-GPU machine, use the following command: - -```bash -sbatch jureca_ray_startscript.sh -``` - -For communication via the InfiniBand network, it is important to specify the node IP address in the startscript (when launching Ray) in the following format: - -```bash ---node-ip-address="$head_node"i -``` - -and - -```bash ---address "$head_node"i:"$port" -``` - -If multiple Ray instances run on the same machine, there might be problems if they all use the same port value (7638), so it is advisable to change it to a different value in that case. - - - - - - - - - diff --git a/scripts/jureca_raytune/Ray_2.4/hpo.py b/scripts/jureca_raytune/Ray_2.4/hpo.py deleted file mode 100644 index 95894939d2158e1ea0d6569f9d487e23b4c30348..0000000000000000000000000000000000000000 --- a/scripts/jureca_raytune/Ray_2.4/hpo.py +++ /dev/null @@ -1,449 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -"""! @brief HPO """ - -## -# @mainpage HPO -# -# @section description_main Description -# Hyperparameter optimization of neural networks with the Ray Tune library. -# -# -# -# @section notes_main Notes -# - The data directory of the CIFAR-10 dataset has to be specified in the startscript -# -# Copyright (c) 2023 RAISE, All rights reserved. - - -## -# @file hpo.py -# -# @brief Optimizing the hyperparameters of a ResNet18 trained on the CIFAR-10 dataset with the Ray Tune library -# and the ASHA, BOHB and PBT algorithms. -# -# @section hpo description -# A standard ResNet18 model is trained on the CIFAR-10 vision dataset. To optimize the performance, multiple -# training runs (trials) with different hyperparameters (changed learning rate and batch size) are performed using -# the Ray Tune library. The overall hyperparameter optimization process, as well as the single training runs can be -# parallelized across multiple GPUs. 
-# For ASHA: Trials with low performance (in terms of test set accuracy) are terminated early -# with the ASHA algorithm. -# For BOHB: Trials with low performance (in terms of test set accuracy) are terminated early -# and their resources are assigned to new samples with the BOHB algorithm. -# For PBT: Trials with low performance (in terms of test set accuracy) copy the hyperparameters -# of better performing trials and apply mutations with the PBT algorithm. -# For RAND: Random Search termination (no algorithm) -# -# -# @section libraries_main Libraries/Modules -# - argparse standard library (https://docs.python.org/3/library/argparse.html) -# - Parse command-line options -# - sys standard library (https://docs.python.org/3/library/sys.html) -# - System commands -# - os standard library (https://docs.python.org/3/library/os.html) -# - OS commands -# - time standard library (https://docs.python.org/3/library/time.html) -# - Access timers for profilers -# - numpy library (https://numpy.org/) -# - Access numpy functions -# - random standard library (https://docs.python.org/3/library/random.html) -# - Generate random numbers -# - matplotlib library (https://matplotlib.org/) -# - Post-process data for validation -# - torch library (https://pytorch.org/) -# - ML framework -# - torchvision library (https://pypi.org/project/torchvision/) -# - Torch library additions for popular datasets and their transformations -# - ray library (https://www.ray.io/) -# - Framework for distributed computing with a focus on hyperparameter optimization -# -# @section notes_doxygen_example Notes -# - None. -# -# @section todo TODO -# - None. -# -# @section author Author(s) -# - Created by MA on 04/05/2023. -# - Modified by EI on 05/05/2023. -# -# Copyright (c) 2023 RAISE, All rights reserved. - -# load general modules -import argparse -import os -import time -import numpy as np - -# load torch and torchvision modules -import torch -import torch.nn as nn -import torch.optim as optim -import torch.distributed as dist -import torchvision -from torchvision import datasets, transforms, models - -# load ray modules -import ray -from ray import tune -from ray.tune import CLIReporter -from ray.air import session, RunConfig -import ray.train as train -from ray.train.torch import TorchTrainer -from ray.air.config import ScalingConfig -from ray.tune.tuner import Tuner, TuneConfig - -def parsIni(): - """! parse arguments - - @param --num-samples #samples - @param --max-iterations max. iterations - @param --ngpus parallel workers per trial - @param --scheduler scheduler: ASHA, BOHB, PBT, RAND (no algorithm) - @param --data-dir dataset location - """ - parser = argparse.ArgumentParser(description='HPO Suite for AI4HPC') - parser.add_argument('--num-samples', type=int, default=24, metavar='N', - help='number of samples to train (default: 24)') - parser.add_argument('--max-iterations', type=int, default=10, metavar='N', - help='maximum iterations to train (default: 10)') - parser.add_argument('--ngpus', type=int, default=1, metavar='N', - help='number of GPUs used in a single trial (default: 1)') - parser.add_argument('--scheduler', type=str, default='RAND', - help='scheduler for tuning (default: RandomSearch)') - parser.add_argument('--data-dir', type=str, default='', - help='data directory for cifar-10 dataset') - return parser - -def accuracy(output, target): - """! 
function that computes the accuracy of an output and target vector - @param output vector that the model predicted - @param target actual vector - - @return correct number of correct predictions - @return total number of total elements - """ - # get the index of the max log-probability - pred = output.max(1, keepdim=True)[1] - - # count correct classifications - correct = pred.eq(target.view_as(pred)).cpu().float().sum() - - # count total samples - total = target.size(0) - return correct, total - -def par_sum(field): - """! function that sums a field across all workers to a worker - @param field field in worker that should be summed up - - @return sum of all fields - """ - # convert field to tensor - res = torch.Tensor([field]) - - # move field to GPU/worker - res = res.cuda() - - # AllReduce operation - dist.all_reduce(res,op=dist.ReduceOp.SUM,group=None,async_op=True).wait() - - return res - -def load_data(data_dir=None): - """! function that loads training and test set of cifar-10 - @param data_dir directory where the data is stored - - @return train_set training set of cifar-10 - @return test_set test set of cifar-10 - """ - # vision preprocessing values - mean = [x / 255 for x in [125.3, 123.0, 113.9]] - std = [x / 255 for x in [63.0, 62.1, 66.7]] - - # transformations for the training set - transform_train = transforms.Compose([ - transforms.RandomCrop(32, padding=4), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - transforms.Normalize(mean, std), - ]) - - # transformations for the testset - transform_test = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize(mean, std), - ]) - - # load the cifar-10 dataset from directory - train_set = torchvision.datasets.CIFAR10( - root=data_dir, train=True, download=False, transform=transform_train) - - test_set = torchvision.datasets.CIFAR10( - root=data_dir, train=False, download=False, transform=transform_test) - - return train_set, test_set - -def train_cifar(config): - """! function to train a ResNet on cifar-10 with different hyperparameters - @param config hyperparameter search space - """ - # load a ResNet model - model = models.resnet18() - - # define optimizer and loss function - criterion = nn.CrossEntropyLoss() - optimizer = optim.SGD(model.parameters(), lr=config["lr"]*dist.get_world_size()) - - if session.get_checkpoint() and args.scheduler == "PBT": - step = 1 - # Load model state and iteration step from checkpoint. - checkpoint_dict = session.get_checkpoint().to_dict() - model.load_state_dict(checkpoint_dict["model_state_dict"]) - # Load optimizer state (needed since we're using momentum), - # then set the `lr` and `momentum` according to the config. - optimizer.load_state_dict(checkpoint_dict["optimizer_state_dict"]) - - # Note: Make sure to increment the checkpointed step by 1 to get the current step. 
- last_step = checkpoint_dict["step"] - step = last_step + 1 - - # prepare the model for Ray Tune - model = train.torch.prepare_model(model) - - # load the training and test data - train_set, test_set = load_data(str(config["data_dir"])) - - # define the train and test dataloader - train_loader = torch.utils.data.DataLoader( - train_set, - batch_size=int(config["batch_size"]), - shuffle=True, - num_workers=30) - - test_loader = torch.utils.data.DataLoader( - test_set, - batch_size=int(config["batch_size"]), - shuffle=False, - num_workers=30) - - # prepare the dataloaders for Ray Tune - train_loader = train.torch.prepare_data_loader(train_loader) - test_loader = train.torch.prepare_data_loader(test_loader) - - # prepare metrics - train_acc = 0 - train_correct = 0 - train_total = 0 - - test_acc = 0 - test_correct = 0 - test_total = 0 - - # training and testing loop - for epoch in range(100): - # prepare model for training and loop over training dataset - model.train() - for i, (images, target) in enumerate(train_loader): - # compute output - optimizer.zero_grad() - output = model(images) - - # compute loss - loss = criterion(output, target) - - # count correct classifications - tmp_correct, tmp_total = accuracy(output, target) - train_correct +=tmp_correct - train_total +=tmp_total - - # backpropagation and optimization step - loss.backward() - optimizer.step() - - # average the train metrics over all workers - train_correct = par_sum(train_correct) - train_total = par_sum(train_total) - - # compute final training accuracy - train_acc = train_correct/train_total - - # only perform the testing loop every 10 epochs - if ((epoch+1)%10 == 0): - # prepare model for testing and loop over test dataset - model.eval() - with torch.no_grad(): - for i, (images, target) in enumerate(test_loader): - - # compute output - output = model(images) - - # count correct classifications - tmp_correct, tmp_total = accuracy(output, target) - test_correct +=tmp_correct - test_total +=tmp_total - - # average the test metrics over all workers - test_correct = par_sum(test_correct) - test_total = par_sum(test_total) - - # compute final test accuracy - test_acc = test_correct/test_total - - # report the training and testing accuracy back to the head node of Ray Tune - session.report({"train_acc": train_acc.item(), "test_acc": test_acc.item()}) - - # PBT specific - if args.scheduler == "PBT": - step += 1 - -def main(args): - """! 
main function - @param args input arguments - """ - - # initialize Ray with the correct address and node IP address - ray.init(address=os.environ['ip_head'], _node_ip_address=os.environ["head_node_ip"]) - - # define the hyperparameter search space - config = { - "batch_size": tune.choice([64, 128, 256, 512]), - "lr": tune.loguniform(10e-5, 1), - "data_dir": tune.choice([args.data_dir]), - } - - # set search algorithm - search_alg = None - - if (args.scheduler == "ASHA"): - from ray.tune.schedulers import ASHAScheduler - # Asynchronous Successive Halving Algorithm - scheduler = ASHAScheduler( - # the number of iterations to allow the trials to run at max - max_t=args.max_iterations, - # how many iterations before a bad trial gets terminated - grace_period=2, - # which percentage of trials to terminate - reduction_factor=3) - - elif (args.scheduler == "BOHB"): - from ray.tune.schedulers.hb_bohb import HyperBandForBOHB - from ray.tune.search.bohb import TuneBOHB - # Bayesian Optimization and HyperBand - scheduler = HyperBandForBOHB( - # time attribute - time_attr="training_iteration", - # the number of iterations to allow the trials to run at max - max_t=args.max_iterations, - # which percentage of trials to terminate - reduction_factor=3) - - # modify search algorithm for BOHB - search_alg = TuneBOHB(seed=42) - - elif (args.scheduler == "PBT"): - from ray.tune.schedulers import PopulationBasedTraining - from ray.air import session, Checkpoint - # define the mutation config - mutation_config = {"lr": tune.loguniform(10e-5, 1),} - - # Population Based Training - scheduler = PopulationBasedTraining( - # time attribute - time_attr="training_iteration", - # interval at which perturbations occur - perturbation_interval=1, - # specification of the hyperparameter mutation search space (can be different than original search space!) - hyperparam_mutations={"train_loop_config": mutation_config}, - # the parameters of the top quantile_fraction percentage trials are transferred to the bottom - # quantile_fraction percentage of trials - quantile_fraction=0.33, - # probability to resample from original hyperparameter search space - resample_probability=0, - ) - - elif (args.scheduler == "RAND"): - # random scheduler - scheduler = None - - # define a reporter/logger to specify which metrics to print out during the optimization process - reporter = CLIReporter( - metric_columns=["train_acc", "test_acc", "training_iteration", "time_this_iter_s", "time_total_s"], - max_report_frequency=60) - - # define the general RunConfig of Ray Tune - run_config = RunConfig( - # name of the training run (directory name). - name="cifar_test_training", - # directory to store the Ray Tune results in. 
- local_dir=os.path.join(os.path.abspath(os.getcwd()), "ray_results"), - # logger - progress_reporter=reporter, - # stopping criterion when to end the optimization process - stop={"training_iteration": args.max_iterations}) - - if (args.scheduler == "PBT"): - #checkpointing - run_config.checkpoint_config=ray.air.CheckpointConfig(checkpoint_score_attribute="test_acc") - - # wrapping the torch training function inside a TorchTrainer logic - trainer = TorchTrainer( - # torch training function - train_loop_per_worker=train_cifar, - # default hyperparameters for the function - train_loop_config={"batch_size": 64, "lr": 0.1, "data_dir": "/"}, - # setting the default resources/workers to use for the training function, including the number of CPUs and GPUs - scaling_config=ScalingConfig(num_workers=args.ngpus, use_gpu=True, resources_per_worker={"CPU": 30, "GPU": 1}), - ) - - # defining the hyperparameter tuner - tuner = Tuner( - # function to tune - trainer, - # hyperparameter search space - param_space={"train_loop_config": config}, - # the tuning configuration - tune_config=TuneConfig( - # define how many trials to evaluate - num_samples=args.num_samples, - # define which metric to use for measuring the performance of the trials - metric="test_acc", - # if the metric should be maximized or minimized - mode="max", - # define which scheduler to use - scheduler=scheduler, - # define which search algorithm to use - search_alg=search_alg), - run_config=run_config - ) - - # measure the total runtime - start_time = time.time() - - # start the optimization process - result = tuner.fit() - - runtime = time.time() - start_time - - # print total runtime - print("Total runtime: ", runtime) - - # print metrics of the best trial - best_result = result.get_best_result(metric="test_acc", mode="max") - - print("Best result metrics: ", best_result) - - # print results dataframe - print("Result dataframe: ") - print(result.get_dataframe().sort_values("test_acc", ascending=False)) - -if __name__ == "__main__": - # get custom arguments from parser - parser = parsIni() - args = parser.parse_args() - - # call the main function to launch Ray - main(args) - -# eof diff --git a/scripts/jureca_raytune/cifar_tune.py b/scripts/jureca_raytune/cifar_tune.py deleted file mode 100644 index 59bacbcebf66a07f403bcbe2afcda81ddc227b7b..0000000000000000000000000000000000000000 --- a/scripts/jureca_raytune/cifar_tune.py +++ /dev/null @@ -1,104 +0,0 @@ -from functools import partial -import numpy as np -import os -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -import torchvision -import torchvision.transforms as transforms -import torchvision.models as models -import ray -from ray import tune -from ray.tune import CLIReporter - -def load_data(data_dir=None): - transform = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) - ]) - - trainset = torchvision.datasets.CIFAR10( - root=data_dir, train=True, download=False, transform=transform) - - return trainset - - -def train_cifar(config, data_dir=None): - - net = models.resnet18() - - device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - - net.to(device) - - criterion = nn.CrossEntropyLoss() - optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9) - - - trainset = load_data(data_dir) - - - trainloader = torch.utils.data.DataLoader( - trainset, - batch_size=int(config["batch_size"]), - shuffle=True, - num_workers=0) - - - for epoch in range(10): # loop 
over the dataset multiple times - running_loss = 0.0 - epoch_steps = 0 - running_correct = 0 - for i, data in enumerate(trainloader, 0): - # get the inputs; data is a list of [inputs, labels] - inputs, labels = data - inputs, labels = inputs.to(device), labels.to(device) - - # zero the parameter gradients - optimizer.zero_grad() - - # forward + backward + optimize - outputs = net(inputs) - loss = criterion(outputs, labels) - pred = outputs.argmax(dim=1, keepdim=True) - loss.backward() - optimizer.step() - - running_correct += pred.eq(labels.view_as(pred)).sum().item() - - - tune.report(loss = loss.item(), accuracy=running_correct / len(trainset)) - - print("Finished Training") - - -def main(num_samples=10, max_num_epochs=10, gpus_per_trial=1): - ray.init(address='auto') - - - config = { - "batch_size": tune.choice([64, 128, 256, 512]), - "lr": tune.loguniform(10e-5, 1) - } - - result = tune.run( - partial(train_cifar, data_dir='/p/project/raise-ctp2/cifar10/data'), - local_dir=os.path.join(os.path.abspath(os.getcwd()), "ray_results"), - resources_per_trial={"cpu": 8, "gpu": gpus_per_trial}, - config=config, - num_samples=num_samples, - scheduler=None) - - - best_trial = result.get_best_trial("loss", "min", "last") - print("Best trial config: {}".format(best_trial.config)) - print("Best trial final validation loss: {}".format( - best_trial.last_result["loss"])) - print("Best trial final validation accuracy: {}".format( - best_trial.last_result["accuracy"])) - - -if __name__ == "__main__": - # You can change the number of GPUs per trial here: - main(num_samples=10, max_num_epochs=10, gpus_per_trial=1) \ No newline at end of file diff --git a/scripts/jureca_raytune/cifar_tune_tf.py b/scripts/jureca_raytune/cifar_tune_tf.py deleted file mode 100644 index ab9572d62bbf6c9cc96798d52c53abf1237dcabe..0000000000000000000000000000000000000000 --- a/scripts/jureca_raytune/cifar_tune_tf.py +++ /dev/null @@ -1,76 +0,0 @@ -from functools import partial -import os -import tensorflow as tf -from tensorflow.keras import datasets, layers, models -import tensorflow_datasets as tfds -from tensorflow.keras.applications.resnet50 import ResNet50 -import ray -from ray import tune -from ray.tune import CLIReporter -from ray.tune.integration.keras import TuneReportCallback - - -# transform functions for data preprocessing -def train_transform(inputs): - i = inputs["image"] - i = tf.cast(i, tf.float32) - i = tf.image.resize(i, size=[256,256]) - i = tf.image.random_crop(i, size=[224,224,3]) - i = tf.image.random_flip_left_right(i) - i = tf.keras.applications.resnet50.preprocess_input(i) - i = i / 255.0 - return (i, inputs["label"]) - -def val_transform(inputs): - i = inputs["image"] - i = tf.cast(i, tf.float32) - i = tf.image.resize(i, size=[256,256]) - i = tf.image.central_crop(i, 224/256) - i = tf.keras.applications.resnet50.preprocess_input(i) - i = i / 255.0 - return (i, inputs["label"]) - -# main train function -def train_cifar(config, data_dir=None): - - strategy = tf.distribute.MirroredStrategy() - - # load data - train_ds, test_ds = tfds.load('cifar10', split=['train','test'], data_dir=data_dir, download=False) - - with strategy.scope(): - # prepare data and load model - train_ds=train_ds.map(train_transform).batch(config["batch_size"]) - test_ds=test_ds.map(val_transform).batch(config["batch_size"]) - - model = ResNet50(weights=None) - - - # compile and run model - model.compile(optimizer='adam', - loss=tf.keras.losses.SparseCategoricalCrossentropy(), - metrics=['accuracy']) - - history = 
model.fit(train_ds,validation_data=test_ds, epochs=10, verbose=2, callbacks=[TuneReportCallback({"loss": "loss"})]) - - -def main(num_samples=10, max_num_epochs=10, gpus_per_trial=4): - ray.init(address='auto') - - - config = { - "batch_size": tune.choice([32, 64, 128, 256]) - } - - result = tune.run( - partial(train_cifar, data_dir='/p/project/raise-ctp2/tensorflow_datasets/'), - local_dir=os.path.join(os.path.abspath(os.getcwd()), "ray_results"), - resources_per_trial={"cpu": 8, "gpu": gpus_per_trial}, - config=config, - num_samples=num_samples, - scheduler=None) - - -if __name__ == "__main__": - # You can change the number of GPUs per trial here: - main(num_samples=10, max_num_epochs=10, gpus_per_trial=4) diff --git a/scripts/jureca_raytune/create_jureca_env.sh b/scripts/jureca_raytune/create_jureca_env.sh deleted file mode 100644 index 345b8a09a7ac72b7876d6b70a756942a62accf3c..0000000000000000000000000000000000000000 --- a/scripts/jureca_raytune/create_jureca_env.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -ml --force purge - -ml Stages/2022 GCC/11.2.0 OpenMPI/4.1.2 PyTorch/1.11-CUDA-11.5 torchvision/0.12.0-CUDA-11.5 - -python3 -m venv ray_tune_env - -source ray_tune_env/bin/activate - -pip3 install ray ray[tune] - -## optional: -## pip3 install tensorflow tensorflow-datasets - -deactivate diff --git a/scripts/jureca_raytune/jureca_run_ray.sh b/scripts/jureca_raytune/jureca_run_ray.sh deleted file mode 100644 index a983e5b0420c7b4f89d31f251fb066474f4d3fe5..0000000000000000000000000000000000000000 --- a/scripts/jureca_raytune/jureca_run_ray.sh +++ /dev/null @@ -1,92 +0,0 @@ -#!/bin/bash -# shellcheck disable=SC2206 -#SBATCH --job-name=RayTuneTest -#SBATCH --account=raise-ctp2 -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=ray_tune.out -#SBATCH --error=ray_tune.err - -#SBATCH --partition=dc-gpu -#SBATCH --nodes=4 -#SBATCH --tasks-per-node=1 -#SBATCH --cpus-per-task=128 -#SBATCH --gres=gpu:4 -#SBATCH --time=01:00:00 -#SBATCH --exclusive - -ml --force purge - -ml Stages/2022 GCC/11.2.0 OpenMPI/4.1.2 PyTorch/1.11-CUDA-11.5 torchvision/0.12.0-CUDA-11.5 - -source ray_tune_env/bin/activate - -sleep 1 -# make sure CUDA devices are visible -export CUDA_VISIBLE_DEVICES="0,1,2,3" - -num_gpus=4 - -## Limit number of max pending trials -export TUNE_MAX_PENDING_TRIALS_PG=$(($SLURM_NNODES * 4)) - -## Disable Ray Usage Stats -export RAY_USAGE_STATS_DISABLE=1 - -####### this part is taken from the ray example slurm script ##### -set -x - -# __doc_head_address_start__ - -# Getting the node names -nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -nodes_array=($nodes) - -head_node=${nodes_array[0]} -head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) - -# if we detect a space character in the head node IP, we'll -# convert it to an ipv4 address. This step is optional. -if [[ "$head_node_ip" == *" "* ]]; then -IFS=' ' read -ra ADDR <<<"$head_node_ip" -if [[ ${#ADDR[0]} -gt 16 ]]; then - head_node_ip=${ADDR[1]} -else - head_node_ip=${ADDR[0]} -fi -echo "IPV6 address detected. 
We split the IPV4 address as $head_node_ip" -fi -# __doc_head_address_end__ - -# __doc_head_ray_start__ -port=6379 -ip_head=$head_node_ip:$port -export ip_head -echo "IP Head: $ip_head" - -echo "Starting HEAD at $head_node" -srun --nodes=1 --ntasks=1 -w "$head_node" \ - ray start --head --node-ip-address="$head_node"i --port=$port \ - --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus $num_gpus --block & -# __doc_head_ray_end__ - -# __doc_worker_ray_start__ - -# optional, though may be useful in certain versions of Ray < 1.0. -sleep 10 - -# number of nodes other than the head node -worker_num=$((SLURM_JOB_NUM_NODES - 1)) - -for ((i = 1; i <= worker_num; i++)); do - node_i=${nodes_array[$i]} - echo "Starting WORKER $i at $node_i" - srun --nodes=1 --ntasks=1 -w "$node_i" \ - ray start --address "$head_node"i:"$port" --redis-password='5241590000000000' \ - --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus $num_gpus --block & - sleep 5 -done - -echo "Ready" - -python3 -u cifar_tune.py diff --git a/scripts/juwels_ddp/README.md b/scripts/juwels_ddp/README.md deleted file mode 100644 index fb592ebbefe9688a86503219e2b12f6ae4ba8eb7..0000000000000000000000000000000000000000 --- a/scripts/juwels_ddp/README.md +++ /dev/null @@ -1,33 +0,0 @@ -# DL using DDP on juwels booster - -### DDP source -https://github.com/pytorch/pytorch#from-source - -### juwels documentation -https://apps.fz-juelich.de/jsc/hps/juwels/index.html - -### current isues -1. torchrun: Hostname/endpoint mismatch not handled\ -workaround is to modify torchrun and use included batch script\ -simply run `createEnv.sh` to install fixed torch\ -discussion in: https://github.com/pytorch/pytorch/issues/73656 -2. for containers, instead of #1, use `fixed_torch_run.py` -- follow usage - containers. - -### to-do -1. - -### done -1. fixed local IPs for TCP -2. tested containers \ -https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch -3. Scale up to 2400 GPUs using NCCL backend - -### usage - Python Env -1. run `./env_build.sh` to create env and install torch -2. select a case from CASES folder -3. submit `sbatch env_batch.sh` - -### usage - containers -1. run `./container_build.sh` to build .sif -2. select a case from CASES folder -3. 
submit `sbatch container_batch.sh` diff --git a/scripts/juwels_ddp/container_batch.sh b/scripts/juwels_ddp/container_batch.sh deleted file mode 100644 index 3aa57161daa39c7f53b27db4c7abd598730c9ed4..0000000000000000000000000000000000000000 --- a/scripts/juwels_ddp/container_batch.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=TorchContTest -#SBATCH --account=slfse -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job.out -#SBATCH --error=job.err -#SBATCH --time=0-00:15:00 - -# configure node and process count on the CM -#SBATCH --partition=develbooster -#SBATCH --nodes=4 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=24 -#SBATCH --gpus-per-node=4 -#SBATCH --exclusive - -# gres options have to be disabled for deepv -#SBATCH --gres=gpu:4 - -# parameters -debug=false # do debug -dataDir='/p/scratch/raise-ctp2/inanc2/T31/' -COMMAND="DDP_ATBL_CAE_mod.py" - -EXEC="$COMMAND \ - --batch-size 1 \ - --epochs 10 \ - --lr 0.001 \ - --nworker $SLURM_CPUS_PER_TASK \ - --shuff \ - --scale-lr \ - --schedule \ - --data-dir $dataDir" - - -### do not modify below ### - - -# set modules and envs -ml GCC/11.3.0 OpenMPI/4.1.4 cuDNN/8.6.0.163-CUDA-11.7 Apptainer-Tools/2023 -source $SLURM_SUBMIT_DIR/torch_env//bin/activate - -# set env vars -export CUDA_VISIBLE_DEVICES="0,1,2,3" -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" > 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi -export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK} - -# job info -echo "DEBUG: TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_GPUS_PER_NODE: $SLURM_GPUS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: SLURM_NODEID: $SLURM_NODEID" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -ech - -# launch container -srun --cpu-bind=none bash -c "apptainer exec --nv torch.sif \ - python -m fixed_torch_run \ - --log_dir='logs' \ - --nnodes=$SLURM_NNODES \ - --nproc_per_node=$SLURM_GPUS_PER_NODE \ - --rdzv_id=$SLURM_JOB_ID \ - --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ - --rdzv_backend=c10d \ - --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \ - $EXEC" - -#eof diff --git a/scripts/juwels_ddp/container_build.sh b/scripts/juwels_ddp/container_build.sh deleted file mode 100644 index 7c405a1e573fab9187b20b642f2cc28c957a052f..0000000000000000000000000000000000000000 --- a/scripts/juwels_ddp/container_build.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -# -*- coding: utf-8 -*- -# author: EI -# version: 2212008a -# pull and build containers for PyTorch/NVIDIA - -# load modules -ml GCC/11.3.0 OpenMPI/4.1.4 cuDNN/8.6.0.163-CUDA-11.7 Apptainer-Tools/2023 - -# create Cache/TMP so that $HOME would not be used -mkdir -p Cache -mkdir -p TMP -export APPTAINER_CACHEDIR=$(mktemp -d -p $PWD/Cache) -export APPTAINER_TMPDIR=$(mktemp -d -p $PWD/TMP) - -# official NVIDIA NVCR container with Torch==2.0.0 -apptainer pull torch.sif docker://nvcr.io/nvidia/pytorch:23.03-py3 - -# run bash to create envs -echo "running ./container_env.sh" -apptainer exec torch.sif bash -c 
"./container_env.sh" - -#eof diff --git a/scripts/juwels_ddp/container_env.sh b/scripts/juwels_ddp/container_env.sh deleted file mode 100644 index 641140a0fb58bb839e638c8e8163c7c8d8f2353f..0000000000000000000000000000000000000000 --- a/scripts/juwels_ddp/container_env.sh +++ /dev/null @@ -1,13 +0,0 @@ -nname='torch_env' - -# create env inside container -python3 -m venv $nname --system-site-packages -source ${nname}/bin/activate - -# install wheels -- from this point on, feel free to add anything -pip3 install -r reqs.txt - -# modify l.4 of /torchnlp/_third_party/weighted_random_sampler.py -var='int_classes = int' -sed -i "4s|.*|$var|" \ - $PWD/${nname}/lib/python3.8/site-packages/torchnlp/_third_party/weighted_random_sampler.py diff --git a/scripts/juwels_ddp/createEnv.sh b/scripts/juwels_ddp/createEnv.sh deleted file mode 100755 index ea826fd456d4b97b90e03deffa0540cf81619300..0000000000000000000000000000000000000000 --- a/scripts/juwels_ddp/createEnv.sh +++ /dev/null @@ -1,193 +0,0 @@ -#!/bin/bash -# -*- coding: utf-8 -*- -# author: EI -# version: 220328a -# creates machine specific python env - -# set modules -ml --force purge - -# get sys info -cDir=$PWD -sysN="$(uname -n | cut -f2- -d.)" -echo "system:${sysN}" -echo - -cont1=false -if [[ $sysN = 'deepv' || $sysN = 'dp-esb'* ]] ; then - sysN=deepv - ml use $OTHERSTAGES - ml Stages/2022 NVHPC/22.1 OpenMPI/4.1.2 NCCL/2.15.1-1-CUDA-11.5 cuDNN/8.3.1.22-CUDA-11.5 - ml Python/3.9.6 HDF5 CMake - ml -nvidia-driver/.default - cont1=true -elif [ "$sysN" = 'juwels' ] ; then - ml Stages/2023 StdEnv/2023 NVHPC/23.1 OpenMPI/4.1.4 cuDNN/8.6.0.163-CUDA-11.7 - ml Python/3.10.4 CMake HDF5 PnetCDF libaio/0.3.112 - cont1=true -elif [ "$sysN" = 'jureca' ] ; then - #ml Stages/2023 StdEnv/2023 NVHPC/23.1 OpenMPI/4.1.4 cuDNN/8.6.0.163-CUDA-11.7 - ml Stages/2023 StdEnv/2023 GCC/11.3.0 OpenMPI/4.1.4 cuDNN/8.6.0.163-CUDA-11.7 - ml Python/3.10.4 CMake HDF5 PnetCDF libaio/0.3.112 - cont1=true -else - echo - echo 'unknown system detected' - echo 'canceling' - echo -fi -echo "modules loaded" -echo - -# get python version -pver="$(python --version 2>&1 | awk {'print $2'} | cut -f1-2 -d.)" -echo "python version is ${pver}" -echo - -# create env -if [ "$cont1" = true ] ; then - if [ -d "${cDir}/envAI_${sysN}" ];then - echo 'env already exist' - echo - - source envAI_${sysN}/bin/activate - else - python3 -m venv envAI_${sysN} - - # get headers for pip - if [ -f "${cDir}/envAI_${sysN}/bin/pip3" ]; then - echo 'pip already exist' - else - cp "$(which pip3)" $cDir/envAI_${sysN}/bin/ - ln -s $cDir/envAI_${sysN}/bin/pip3 $cDir/envAI_${sysN}/bin/pip${pver} - var="#!$cDir/envAI_${sysN}/bin/python${pver}" - sed -i "1s|.*|$var|" $cDir/envAI_${sysN}/bin/pip3 - fi - - # activate env - source envAI_${sysN}/bin/activate - - echo "a new env is created in ${cDir}" - echo "activation is done via:" - echo "source ${cDir}/envAI_${sysN}/bin/activate" - fi -fi - -# set tmp dir env var -export TMPDIR=${cDir} - -# install torch -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - echo 'Torch already installed' - echo -else - # Stages/2023 - CUDA/11.7 - torch 2.0 stable - pip3 install torch torchvision torchaudio --no-cache-dir -fi - -# install horovod -if [ -f "${cDir}/envAI_${sysN}/bin/horovodrun" ]; then - echo 'Horovod already installed' - echo -else - # compiler vars - export LDSHARED="$CC -shared" && - - # CPU vars - export HOROVOD_WITH_MPI=1 - export HOROVOD_MPI_THREADS_DISABLE=1 - export HOROVOD_CPU_OPERATIONS=MPI - - # GPU vars - #export HOROVOD_GPU=CUDA - #export 
HOROVOD_CUDA_HOME=$EBROOTCUDA - #export HOROVOD_GPU_OPERATIONS=MPI - #export HOROVOD_GPU_OPERATIONS=NCCL - export HOROVOD_GPU_ALLREDUCE=NCCL - export HOROVOD_NCCL_LINK=SHARED - export HOROVOD_NCCL_HOME=$EBROOTNCCL - - # Host language vars - export HOROVOD_WITH_PYTORCH=1 - export HOROVOD_WITHOUT_TENSORFLOW=1 - export HOROVOD_WITHOUT_MXNET=1 - - pip3 install --no-cache-dir wheel - pip3 install --no-cache-dir horovod -fi - -# install deepspeed -if [ -f "${cDir}/envAI_${sysN}/bin/deepspeed" ]; then - echo 'DeepSpeed already installed' - echo -else - # compile all opt. stuff - not needed & not working - #export DS_BUILD_OPS=1 - # compile req. opt. stuff - export DS_BUILD_FUSED_ADAM=1 - export DS_BUILD_UTILS=1 - if [ "$sysN" = 'deepv' ] ; then - #fix libaio issues via: - export DS_BUILD_AIO=0 - fi - - pip3 install --no-cache-dir DeepSpeed - - # add this to .../deepspeed/launcher/launch.py l.93 - var=' args.node_rank=int(os.environ.get("SLURM_PROCID",0))' - sed -i "132s|.*|$var|" $cDir/envAI_${sysN}/lib/python${pver}/site-packages/deepspeed/launcher/launch.py -fi - -# install heat -if [ -d "${cDir}/envAI_${sysN}/lib/python${pver}/site-packages/heat" ]; then - echo 'HeAT already installed' - echo -else - export CFLAGS="-noswitcherror" - export CXXFLAGS="-noswitcherror" - - # experimental - # modify setup.py to accep torch>1.7 for heat - git clone --recursive https://github.com/helmholtz-analytics/heat.git heat - var=' "torch>=1.7.0",' - sed -i "36s|.*|$var|" heat/setup.py - - # create tar ball - tar czf heat.tar.gz - - # install experimental heat - pip3 install --no-cache-dir 'heat.tar.gz[hdf5,netcdf]' -fi - -# get rest of the libraries$ -if [ "$cont1" = true ] ; then - # install rest - pip3 install -r reqs.txt - - # modify l.4 of /torchnlp/_third_party/weighted_random_sampler.py - var='int_classes = int' - sed -i "4s|.*|$var|" \ - $cDir/envAI_${sysN}/lib/python${pver}/site-packages/torchnlp/_third_party/weighted_random_sampler.py -fi - -# fix IB IP config -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - sed -i -e '3,8s/^/#/' ${cDir}/envAI_${sysN}/bin/torchrun - echo """ -import re -import sys -from torch.distributed.run import main -from torch.distributed.elastic.agent.server import api as sapi - -def new_get_fq_hostname(): - return _orig_get_fq_hostname().replace('.', 'i.', 1) - -if __name__ == '__main__': - _orig_get_fq_hostname = sapi._get_fq_hostname - sapi._get_fq_hostname = new_get_fq_hostname - sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) - sys.exit(main()) -""" >> ${cDir}/envAI_${sysN}/bin/torchrun -fi - -#eof diff --git a/scripts/juwels_ddp/env_batch.sh b/scripts/juwels_ddp/env_batch.sh deleted file mode 100644 index 5c3b7eb8611239abdad9f8897550ef7529a8ebb9..0000000000000000000000000000000000000000 --- a/scripts/juwels_ddp/env_batch.sh +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=TorchTest -#SBATCH --account=slfse -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job.out -#SBATCH --error=job.err -#SBATCH --time=0-00:15:00 - -# configure node and process count on the CM -#SBATCH --partition=develbooster -#SBATCH --nodes=4 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=24 -#SBATCH --gpus-per-node=4 -#SBATCH --exclusive - -# gres options have to be disabled for deepv -#SBATCH --gres=gpu:4 - -# parameters -debug=false # do debug -dataDir='/p/scratch/raise-ctp2/inanc2/T31/' -COMMAND="DDP_ATBL_CAE_mod.py" - -EXEC="$COMMAND \ - --batch-size 1 \ - --epochs 10 \ - --lr 0.001 \ - --nworker 
$SLURM_CPUS_PER_TASK \ - --shuff \ - --scale-lr \ - --schedule \ - --data-dir $dataDir" - - -### do not modify below ### - - -# set modules -ml --force purge -ml Stages/2023 StdEnv/2023 NVHPC/23.1 OpenMPI/4.1.4 cuDNN/8.6.0.163-CUDA-11.7 -ml Python/3.10.4 HDF5 libaio/0.3.112 - -# set env -source /p/project/prcoe12/RAISE/envAI_juwels/bin/activate - -# sleep a sec -sleep 1 - -# set env vars -export CUDA_VISIBLE_DEVICES="0,1,2,3" -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" > 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi -export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK} - -# job info -echo "DEBUG: TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_GPUS_PER_NODE: $SLURM_GPUS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: SLURM_NODEID: $SLURM_NODEID" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# launch -srun --cpu-bind=none bash -c "torchrun \ - --log_dir='logs' \ - --nnodes=$SLURM_NNODES \ - --nproc_per_node=$SLURM_GPUS_PER_NODE \ - --rdzv_id=$SLURM_JOB_ID \ - --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ - --rdzv_backend=c10d \ - --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \ - $EXEC" - -# eof diff --git a/scripts/juwels_ddp/env_build.sh b/scripts/juwels_ddp/env_build.sh deleted file mode 100755 index b237a1b0c8d1a1736aee0110a03d67c4ebbe6ec2..0000000000000000000000000000000000000000 --- a/scripts/juwels_ddp/env_build.sh +++ /dev/null @@ -1,151 +0,0 @@ -#!/bin/bash -# -*- coding: utf-8 -*- -# author: EI -# version: 220211a -# creates machine specific python env -# note: Stage 2023 has issues, this uses Stage 2022 instead - -# set modules -module --force purge - -# get sys info -cDir=$PWD -sysN="$(uname -n | cut -f2- -d.)" -echo "system:${sysN}" -echo - -cont1=false -if [ "$sysN" = 'deepv' ] ; then - module use $OTHERSTAGES - ml GCC ParaStationMPI/5.4.9-1-mt Python cuDNN NCCL Python - cont1=true -elif [ "$sysN" = 'juwels' ] ; then - ml Stages/2022 GCC ParaStationMPI Python CMake - cont1=true -elif [ "$sysN" = 'jureca' ] ; then - #ml Stages/2022 GCC ParaStationMPI Python CMake NCCL libaio # Horovod issues with pscom?? 
- ml Stages/2022 GCC OpenMPI Python NCCL cuDNN libaio CMake - cont1=true -else - echo - echo 'unknown system detected' - echo 'canceling' - echo -fi -echo "modules loaded" -echo - -# get python version -pver="$(python --version 2>&1 | awk {'print $2'} | cut -f1-2 -d.)" -echo "python version is ${pver}" -echo - -if [ "$cont1" = true ] ; then - if [ -d "${cDir}/envAI_${sysN}" ];then - echo 'env already exist' - echo - - source envAI_${sysN}/bin/activate - else - # create env - python3 -m venv envAI_${sysN} - - # get headers for pip - if [ -f "${cDir}/envAI_${sysN}/bin/pip3" ]; then - echo 'pip already exist' - else - cp "$(which pip3)" $cDir/envAI_${sysN}/bin/ - ln -s $cDir/envAI_${sysN}/bin/pip3 $cDir/envAI_${sysN}/bin/pip${pver} - var="#!$cDir/envAI_${sysN}/bin/python${pver}" - sed -i "1s|.*|$var|" $cDir/envAI_${sysN}/bin/pip3 - fi - - # activate env - source envAI_${sysN}/bin/activate - - echo "a new env is created in ${cDir}" - echo "activation is done via:" - echo "source ${cDir}/envAI_${sysN}/bin/activate" - fi -fi - -# install torch -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - echo 'Torch already installed' - echo -else - export TMPDIR=${cDir} - - pip3 install \ - torch==1.10.2+cu113 torchvision==0.11.3+cu113 torchaudio==0.10.2+cu113 \ - -f https://download.pytorch.org/whl/cu113/torch_stable.html --no-cache-dir -fi - -# install horovod -if [ -f "${cDir}/envAI_${sysN}/bin/horovodrun" ]; then - echo 'Horovod already installed' - echo -else - export HOROVOD_GPU=CUDA - export HOROVOD_GPU_OPERATIONS=NCCL - export HOROVOD_WITH_PYTORCH=1 - export TMPDIR=${cDir} - - pip3 install --no-cache-dir horovod --ignore-installed -fi - -# install deepspeed -if [ -f "${cDir}/envAI_${sysN}/bin/deepspeed" ]; then - echo 'DeepSpeed already installed' - echo -else - export DS_BUILD_OPS=1 - # if above not working?? recursion error use this - #export DS_BUILD_FUSED_ADAM=1 - #export DS_BUILD_UTILS=1 - export TMPDIR=${cDir} - - pip3 install --no-cache-dir DeepSpeed - - add this to .../deepspeed/launcher/launch.py l.70 - var=' argsy1.node_rank=int(os.environ.get("SLURM_PROCID",0))' - sed -i "85s|.*|$var|" $cDir/envAI_${sysN}/lib/python${pver}/site-packages/deepspeed/launcher/launch.py -fi - -# install heat -if [ -d "${cDir}/envAI_${sysN}/lib/python${pver}/site-packages/heat" ]; then - echo 'HeAT already installed' - echo -else - export TMPDIR=${cDir} - - # need to modify setup.py to accep torch>1.9 for heat - wget https://files.pythonhosted.org/packages/5d/3a/4781f1e6910753bfdfa6712c83c732c60e675d8de14983926a0d9306c7a6/heat-1.1.1.tar.gz - tar xzf heat-1.1.1.tar.gz - var=' "torch>=1.7.0",' - sed -i "36s|.*|$var|" heat-1.1.1/setup.py - var=' "torchvision>=0.8.0",' - sed -i "39s|.*|$var|" heat-1.1.1/setup.py - - # create tar again! 
- rm -rf heat-1.1.1.tar.gz - tar czf heat-1.1.1.tar.gz heat-1.1.1 - rm -rf heat-1.1.1 - - pip3 install --no-cache-dir 'heat-1.1.1.tar.gz[hdf5,netcdf]' - - rm -rf heat-1.1.1.tar.gz -fi - -# get rest of the libraries$ -if [ "$cont1" = true ] ; then - # install rest - pip3 install -r reqs.txt --ignore-installed - - # modify l.4 of /torchnlp/_third_party/weighted_random_sampler.py - var='int_classes = int' - sed -i "4s|.*|$var|" \ - $cDir/envAI_${sysN}/lib/python${pver}/site-packages/torchnlp/_third_party/weighted_random_sampler.py -fi - -#eof diff --git a/scripts/juwels_ddp/fixed_torch_run.py b/scripts/juwels_ddp/fixed_torch_run.py deleted file mode 100644 index cca970624b086399b5cc01f949d4881b191bb950..0000000000000000000000000000000000000000 --- a/scripts/juwels_ddp/fixed_torch_run.py +++ /dev/null @@ -1,51 +0,0 @@ -from argparse import ArgumentParser -import ipaddress -import runpy -import socket - -from torch.distributed.elastic.agent.server import api as sapi - - -def parse_host(): - parser = ArgumentParser() - parser.add_argument('--rdzv_endpoint') - endpoint = parser.parse_known_args()[0].rdzv_endpoint - host = ( - endpoint.split(':', 1)[0] - if endpoint - else None - ) - return host - - -def fix_torch_run(host): - _orig_get_fq_hostname = sapi._get_fq_hostname - - if host: - try: - ipaddress.ip_address(host) - is_ip = True - except ValueError: - is_ip = False - - if is_ip: - def new_get_fq_hostname(): - return socket.gethostbyaddr(host)[0] - else: - def new_get_fq_hostname(): - return socket.getfqdn(host) - else: - new_get_fq_hostname = _orig_get_fq_hostname - - sapi._get_fq_hostname = new_get_fq_hostname - - -def main(): - host = parse_host() - fix_torch_run(host) - runpy.run_module('torch.distributed.run', run_name='__main__') - - -if __name__ == '__main__': - main() - diff --git a/scripts/juwels_ddp/install_pyDDP.sh b/scripts/juwels_ddp/install_pyDDP.sh deleted file mode 100755 index 59213f0c87e4b60f882ac15d4bb26a574cfe1723..0000000000000000000000000000000000000000 --- a/scripts/juwels_ddp/install_pyDDP.sh +++ /dev/null @@ -1,80 +0,0 @@ -#!/bin/sh -# author: EI -# version: 210709a - -# get dir -iDir=$PWD - -# set modules -module --force purge -module use $OTHERSTAGES -ml Stages/2020 GCC/9.3.0 ParaStationMPI/5.4.7-1-mt CMake Ninja cuDNN NCCL mpi-settings/CUDA - -# conda -if [ -d "${iDir}/miniconda3" ];then - echo "miniconda3 already installed!" - source ${iDir}/miniconda3/etc/profile.d/conda.sh - conda activate -else - echo "miniconda3 will be compiled to ${iDir}/miniconda3!" - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh - bash Miniconda3-latest-Linux-x86_64.sh -p ${iDir}/miniconda3 -b - source ${iDir}/miniconda3/etc/profile.d/conda.sh - conda activate - # std libs - conda install -y astunparse numpy pyyaml mkl mkl-include setuptools cffi typing_extensions future six requests dataclasses Pillow --force-reinstall - # cuda - check version with yours - conda install -c pytorch -y magma-cuda110 --force-reinstall - conda install -y pkg-config libuv --force-reinstall - rm -f Miniconda3-latest-Linux-x86_64.sh -fi - -# torch -if [ -d "${iDir}/pytorch/build" ];then - echo 'pytorch already installed!' -else - # clone pytorch - if [ -d "${iDir}/pytorch" ];then - echo 'pytorch repo is found!' 
-  else
-    git clone --recursive https://github.com/pytorch/pytorch pytorch
-  fi
-
-  # update repos
-  cd pytorch
-  git submodule sync
-  git submodule update --init --recursive
-
-  # install pytorch
-  export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
-  export TMPDIR=${iDir}/tmp
-  python setup.py clean
-  CMAKE_C_COMPILER=$(which mpicc) CMAKE_CXX_COMPILER=$(which mpicxx) USE_DISTRIBUTED=ON USE_MPI=ON USE_CUDA=ON NCCL_ROOT_DIR=$EBROOTNCCL USE_NCCL=ON USE_GLOO=ON CUDNN_ROOT=$EBROOTCUDNN USE_CUDNN=ON python setup.py install
-  cd ..
-fi
-
-# torchvision
-if [ -d "${iDir}/torchvision/build" ]; then
-  echo 'torchvision already installed!'
-else
-  # clone torchvision
-  if [ -d "${iDir}/torchvision" ]; then
-    echo 'torchvision repo is found!'
-  else
-    git clone --recursive https://github.com/pytorch/vision.git torchvision
-  fi
-
-  # update repos
-  cd torchvision
-  git submodule sync
-  git submodule update --init --recursive
-
-  # install torchvision
-  export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
-  export TMPDIR=${iDir}/tmp
-  python setup.py clean
-  CMAKE_C_COMPILER=$(which mpicc) CMAKE_CXX_COMPILER=$(which mpicxx) FORCE_CUDA=ON python setup.py install
-fi
-
-echo 'done!'
-# eof
diff --git a/scripts/juwels_ddp/lamec.json b/scripts/juwels_ddp/lamec.json
deleted file mode 100644
index af5277f77f8cbb767eab6123b61c94a6efecc8da..0000000000000000000000000000000000000000
--- a/scripts/juwels_ddp/lamec.json
+++ /dev/null
@@ -1 +0,0 @@
-{"startscript": "env_batch.sh"}
\ No newline at end of file
diff --git a/scripts/juwels_ddp/reqs.txt b/scripts/juwels_ddp/reqs.txt
deleted file mode 100644
index 3db480947faffa7072d83cddce58a8e90c6ae82c..0000000000000000000000000000000000000000
--- a/scripts/juwels_ddp/reqs.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-python-hostlist
-Pillow
-pyparsing
-python-dateutil
-matplotlib
-h5py
-pytorch-nlp
-pyprof
-filelock
-scipy
-perlin_noise
-noise
diff --git a/scripts/vega_basilisk/.gitkeep b/scripts/vega_basilisk/.gitkeep
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/scripts/vega_basilisk/basilisk_cfd.sh b/scripts/vega_basilisk/basilisk_cfd.sh
deleted file mode 100644
index faaa5c8c8b6fb0d8f7e68fae56956615b9a7fbf5..0000000000000000000000000000000000000000
--- a/scripts/vega_basilisk/basilisk_cfd.sh
+++ /dev/null
@@ -1,71 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=case001
-#SBATCH --account=r2203-054-users
-#SBATCH --nodes=1
-#SBATCH --ntasks=32
-#SBATCH --hint=nomultithread
-#SBATCH --mem=64G
-#SBATCH --time=24:00:00
-#SBATCH --output=job.%j.out
-#SBATCH --error=job.%j.err
-#SBATCH --partition=cpu
-
-module purge
-module load gc
-module load openmpi/gnu/4.1.2.1
-
-echo "Starting at `date`"
-echo "Running on hosts: $SLURM_NODELIST"
-echo "Running on $SLURM_NNODES nodes."
-echo "Running on $SLURM_NPROCS processors."
-echo "Job id is $SLURM_JOBID"
-pi=`echo "4*a(1)" | bc -l`
-p1_max=`echo "$pi/6.0" | bc -l`
-p1_min=0
-p2_max=`echo "$pi/18.0" | bc -l`
-p2_min=0
-p3_max=5.0
-p3_min=5.0
-p4_max=20.0
-p4_min=0
-p5_max=20.0
-p5_min=0
-p6_max=`echo "$pi/2.0" | bc -l`
-p6_min=`echo "$pi/2.0" | bc -l`
-p7_max=`echo "$pi" | bc -l`
-p7_min=`echo "$pi" | bc -l`
-xc_max=0.6
-xc_min=0.4
-yc_max=0.6
-yc_min=0.4
-
-file="params.in"
-
-if ! [[ -f "restart" ]] ; then
-  RANDOM=$(date +%s%N | cut -b10-19) # give a seed
-  echo "$RANDOM / 32767 * ($p1_max-$p1_min) + $p1_min" | bc -l > $file
-  echo "$RANDOM / 32767 * ($p2_max-$p2_min) + $p2_min" | bc -l >> $file
-  echo "$RANDOM / 32767 * ($p3_max+$p3_min) - $p3_min" | bc -l >> $file
-  echo "$RANDOM / 32767 * ($p4_max-$p4_min) + $p4_min" | bc -l >> $file
-  echo "$RANDOM / 32767 * ($p5_max-$p5_min) + $p5_min" | bc -l >> $file
-  echo "$RANDOM / 32767 * ($p6_max+$p6_min) - $p6_min" | bc -l >> $file
-  echo "$RANDOM / 32767 * ($p7_max+$p7_min) - $p7_min" | bc -l >> $file
-  echo "$RANDOM / 32767 * ($xc_max-$xc_min) + $xc_min" | bc -l >> $file
-  echo "$RANDOM / 32767 * ($yc_max-$yc_min) + $yc_min" | bc -l >> $file
-fi
-
-
-if ! [[ -d "output/" ]] ; then
-  mkdir output/
-  mkdir output/wet_area/
-  mkdir output/facets/
-  mkdir output/my_output/
-fi
-
-CC99='mpicc -std=c99' qcc -O2 -Wall -D_MPI=1 drop.c -o run -lm
-
-srun --mpi=pmix -K1 -n $SLURM_NTASKS ./run #\
-###srun --mpi=pmix --exclusive -K1 -n $SLURM_NTASKS ./run #\
-### 2> log > out
-
-echo "Program finished with exit code $? at: `date`"
diff --git a/scripts/vega_basilisk/basilisk_pde.sh b/scripts/vega_basilisk/basilisk_pde.sh
deleted file mode 100644
index 3f7fb9e218ce82565f9cbc1749c5ae4a16d2115a..0000000000000000000000000000000000000000
--- a/scripts/vega_basilisk/basilisk_pde.sh
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=jobrun
-#SBATCH --nodes=1
-#SBATCH --ntasks=12
-#SBATCH --hint=nomultithread
-#SBATCH --mem=10G
-#SBATCH --time=0:40:00
-#SBATCH --output %j.out
-#SBATCH --error %j_log
-#SBATCH --partition=cpu
-#SBATCH --mail-type=end # send email when job ends
-##SBATCH --mail-user=
-
-set -e
-if [ -z "$1" ]; then
-  echo "Missing file for compilation";
-  exit 1;
-fi
-
-# create configuration backup
-cat ${0} > '.case.cfg'
-cat ${1} >> '.case.cfg'
-
-# get WALLTIME
-twLine=$(eval grep -m 1 "time=" ${PWD}/launch.sh)
-WALLTIME=${twLine##*=}
-
-module purge
-module load Bison/3.7.1-GCCcore-10.2.0 CMake/3.18.4-GCCcore-10.2.0 Python/3.8.6-GCCcore-10.2.0 flex/2.6.4-GCCcore-10.2.0 SWIG/4.0.2-GCCcore-10.3.0
-module load Mesa/20.2.1-GCCcore-10.2.0 libGLU/9.0.1-GCCcore-10.2.0
-module load OpenMPI/4.1.3-GCC-10.3.0
-module load ImageMagick/7.0.10-35-GCCcore-10.2.0
-module load FFmpeg/4.4.2-GCCcore-11.3.0
-
-export BASILISK=/ceph/hpc/home/euyiannisv/basilisk/src
-export PATH=$PATH:$BASILISK
-
-
-echo "Starting at `date`"
-echo "Running on hosts: $SLURM_NODELIST"
-echo "Running on $SLURM_NNODES nodes."
-echo "Running on $SLURM_NPROCS processors."
-echo "Job id is $SLURM_JOBID"
-
-PWD=$(eval pwd)
-echo "Executable for $1 at ${PWD}"
-
-GLLIBS="-L${BASILISK}/gl -lglutils -lfb_osmesa -lOSMesa -lGLU"
-CC99='mpicc -std=c99' qcc -O2 -DINCLINED -DBDF2 -Wall -D_MPI=1 -I${PWD}/utils $1 -o run ${GLLIBS} -lm
-
-if [ $? -eq 0 ]; then
-  echo "Compilation Success.";
-else
-  echo "Compilation error.";
-  exit 1;
-fi
-
-srun --mpi=pmix -K1 -n $SLURM_NTASKS ${PWD}/run -m $WALLTIME >out 2>log
-
-echo "Program finished with exit code $? at: `date`"
diff --git a/scripts/vega_basilisk/lamec.json b/scripts/vega_basilisk/lamec.json
deleted file mode 100644
index c714558e48ac2237e955c724f0fc67632e3da290..0000000000000000000000000000000000000000
--- a/scripts/vega_basilisk/lamec.json
+++ /dev/null
@@ -1 +0,0 @@
-{"startscript": "basilisk_cfd.sh"}