From c76caaf9039f205a1a5dcf0d68e03d4f85a3d410 Mon Sep 17 00:00:00 2001 From: Xin Liu <xi.liu@fz-juelich.de> Date: Mon, 25 Mar 2024 15:51:01 +0100 Subject: [PATCH] Update 152 files - /scripts/vega_basilisk/.gitkeep - /scripts/vega_basilisk/basilisk_cfd.sh - /scripts/vega_basilisk/basilisk_pde.sh - /scripts/vega_basilisk/lamec.json - /scripts/juwels_ddp/README.md - /scripts/juwels_ddp/container_batch.sh - /scripts/juwels_ddp/container_build.sh - /scripts/juwels_ddp/container_env.sh - /scripts/juwels_ddp/createEnv.sh - /scripts/juwels_ddp/env_batch.sh - /scripts/juwels_ddp/env_build.sh - /scripts/juwels_ddp/fixed_torch_run.py - /scripts/juwels_ddp/install_pyDDP.sh - /scripts/juwels_ddp/lamec.json - /scripts/juwels_ddp/reqs.txt - /scripts/jureca_ddp/DDP_startscript.sh - /scripts/jureca_ddp/DDP_startscript_container.sh - /scripts/jureca_ddp/README.md - /scripts/jureca_ddp/createContainer.sh - /scripts/jureca_ddp/createEnv.sh - /scripts/jureca_ddp/createEnv_MPI.sh - /scripts/jureca_ddp/fixed_torch_run.py - /scripts/jureca_ddp/lamec.json - /scripts/jureca_ddp/reqs.txt - /scripts/jureca_deepspeed/DS_config.json - /scripts/jureca_deepspeed/DS_startscript_deep.sh - /scripts/jureca_deepspeed/README.md - /scripts/jureca_deepspeed/createEnv.sh - /scripts/jureca_deepspeed/lamec.json - /scripts/jureca_deepspeed/reqs.txt - /scripts/jureca_graphcore/GC_pytorch_mnist.py - /scripts/jureca_graphcore/GC_startscript.sh - /scripts/jureca_graphcore/README.md - /scripts/jureca_graphcore/lamec.json - /scripts/jureca_heat/HeAT_startscript_deep.sh - /scripts/jureca_heat/README.md - /scripts/jureca_heat/createEnv.sh - /scripts/jureca_heat/lamec.json - /scripts/jureca_heat/reqs.txt - /scripts/jureca_horovod/Hor_startscript_deep.sh - /scripts/jureca_horovod/README.md - /scripts/jureca_horovod/createEnv.sh - /scripts/jureca_horovod/lamec.json - /scripts/jureca_horovod/reqs.txt - /scripts/jureca_libtorch/MNIST/CMakeLists.txt - /scripts/jureca_libtorch/MNIST/LibTorch_startscript.sh - /scripts/jureca_libtorch/MNIST/compile.sh - /scripts/jureca_libtorch/MNIST/download_mnist.py - /scripts/jureca_libtorch/MNIST/mnist.cpp - /scripts/jureca_libtorch/TorchVision/compile_jpeg.sh - /scripts/jureca_libtorch/TorchVision/compile_png.sh - /scripts/jureca_libtorch/TorchVision/compile_torchvision.sh - /scripts/jureca_libtorch/README.md - /scripts/jureca_libtorch/lamec.json - /scripts/jureca_raytune/RayTune+DDP/.gitkeep - /scripts/jureca_raytune/RayTune+DDP/cifar_tune.py - /scripts/jureca_raytune/RayTune+DDP/create_env.sh - /scripts/jureca_raytune/RayTune+DDP/jureca_ray_ddp_startscript.sh - /scripts/jureca_raytune/Ray_2.4/ASHA/.gitkeep - /scripts/jureca_raytune/Ray_2.4/ASHA/cifar_tune_asha.py - /scripts/jureca_raytune/Ray_2.4/ASHA/jureca_ray_startscript.sh - /scripts/jureca_raytune/Ray_2.4/BOHB/.gitkeep - /scripts/jureca_raytune/Ray_2.4/BOHB/cifar_tune_bohb.py - /scripts/jureca_raytune/Ray_2.4/BOHB/jureca_ray_startscript.sh - /scripts/jureca_raytune/Ray_2.4/PBT/.gitkeep - /scripts/jureca_raytune/Ray_2.4/PBT/cifar_tune_pbt.py - /scripts/jureca_raytune/Ray_2.4/PBT/jureca_ray_startscript.sh - /scripts/jureca_raytune/Ray_2.4/.gitkeep - /scripts/jureca_raytune/Ray_2.4/build_ray_env.sh - /scripts/jureca_raytune/Ray_2.4/hpo.md - /scripts/jureca_raytune/Ray_2.4/hpo.py - /scripts/jureca_raytune/.gitkeep - /scripts/jureca_raytune/README.md - /scripts/jureca_raytune/cifar_tune.py - /scripts/jureca_raytune/cifar_tune_tf.py - /scripts/jureca_raytune/create_jureca_env.sh - /scripts/jureca_raytune/jureca_run_ray.sh - /scripts/cyclone_basilisk/.gitkeep - 
/scripts/cyclone_basilisk/basilisk_cfd.sh - /scripts/cyclone_basilisk/lamec.json - /scripts/cyclone_horovod/.gitkeep - /scripts/cyclone_horovod/FNO_launch.sh - /scripts/cyclone_horovod/lamec.json - /scripts/deep_clang/clang_script.sh - /scripts/deep_clang/lamec.json - /scripts/deep_ddp/DDP_startscript_deep.sh - /scripts/deep_ddp/README.md - /scripts/deep_ddp/conda_torch.sh - /scripts/deep_ddp/createEnv.sh - /scripts/deep_ddp/lamec.json - /scripts/deep_ddp/reqs.txt - /scripts/deep_deepspeed/DS_config.json - /scripts/deep_deepspeed/DS_startscript_deep.sh - /scripts/deep_deepspeed/README.md - /scripts/deep_deepspeed/createEnv.sh - /scripts/deep_deepspeed/lamec.json - /scripts/deep_deepspeed/reqs.txt - /scripts/deep_heat/HeAT_startscript_deep.sh - /scripts/deep_heat/README.md - /scripts/deep_heat/createEnv.sh - /scripts/deep_heat/example_mnist_heat.py - /scripts/deep_heat/lamec.json - /scripts/deep_heat/reqs.txt - /scripts/deep_horovod/Hor_startscript_deep.sh - /scripts/deep_horovod/README.md - /scripts/deep_horovod/createEnv.sh - /scripts/deep_horovod/lamec.json - /scripts/deep_horovod/pytorch_mnist.py - /scripts/deep_horovod/pytorch_synthetic_benchmark.py - /scripts/deep_horovod/reqs.txt - /scripts/deep_tensorflow/Create_Jupyter_deepv.ipynb - /scripts/deep_tensorflow/README.md - /scripts/deep_tensorflow/TF_startscript_deep.sh - /scripts/deep_tensorflow/createEnv_TF.sh - /scripts/deep_tensorflow/jupyterAddKernel.sh - /scripts/deep_tensorflow/jupyterCreateKernel.sh - /scripts/deep_tensorflow/lamec.json - /scripts/deep_tensorflow/tensorflow2_synthetic_benchmark.py - /scripts/LUMI_HeAT/lamec.json - /scripts/LUMI_HeAT/LUMI_HeAT_script.sh - /scripts/VEGA_Basilisk/_script.sh - /scripts/VEGA_Basilisk/lamec.json - /scripts/JURECA_Pytorch-DDP/JURECA_Pytorch-DDP_script.sh - /scripts/JURECA_Pytorch-DDP/lamec.json - /scripts/JURECA_Horovod/JURECA_Horovod_script.sh - /scripts/JURECA_Horovod/lamec.json - /scripts/JURECA_HeAT/JURECA_HeAT_script.sh - /scripts/JURECA_HeAT/lamec.json - /scripts/JURECA_DeepSpeed/JURECA_DeepSpeed_script.sh - /scripts/JURECA_DeepSpeed/lamec.json - /scripts/CYCLONE_BASILISK/.gitkeep - /scripts/CYCLONE_BASILISK/_script.sh - /scripts/CYCLONE_BASILISK/lamec.json - /scripts/CYCLONE_Horovod/.gitkeep - /scripts/CYCLONE_Horovod/_script.sh - /scripts/CYCLONE_Horovod/lamec.json - /scripts/JUWELS_DeepSpeed/JUWELS_DeepSpeed_script.sh - /scripts/JUWELS_DeepSpeed/lamec.json - /scripts/JUWELS_Pytorch-DDP/JUWELS_Pytorch-DDP_script.sh - /scripts/JUWELS_Pytorch-DDP/lamec.json - /scripts/JUWELS_Horovod/JUWELS_Horovod_script.sh - /scripts/JUWELS_Horovod/lamec.json - /scripts/JUWELS_HeAT/JUWELS_HeAT_script.sh - /scripts/JUWELS_HeAT/lamec.json - /scripts/DEEP_Pytorch-DDP/DEEP_Pytorch-DDP_script.sh - /scripts/DEEP_Pytorch-DDP/lamec.json - /scripts/DEEP_Horovod/DEEP_Horovod_script.sh - /scripts/DEEP_Horovod/lamec.json - /scripts/DEEP_DeepSpeed/DEEP_DeepSpeed_script.sh - /scripts/DEEP_DeepSpeed/lamec.json - /scripts/DEEP_HeAT/DEEP_HeAT_script.sh - /scripts/DEEP_HeAT/lamec.json --- .../.gitkeep | 0 scripts/CYCLONE_BASILISK/_script.sh | 16 + scripts/CYCLONE_BASILISK/lamec.json | 1 + .../.gitkeep | 0 scripts/CYCLONE_Horovod/_script.sh | 19 + scripts/CYCLONE_Horovod/lamec.json | 1 + .../DEEP_DeepSpeed/DEEP_DeepSpeed_script.sh | 30 ++ scripts/DEEP_DeepSpeed/lamec.json | 1 + scripts/DEEP_HeAT/DEEP_HeAT_script.sh | 28 + scripts/DEEP_HeAT/lamec.json | 1 + scripts/DEEP_Horovod/DEEP_Horovod_script.sh | 28 + scripts/DEEP_Horovod/lamec.json | 1 + .../DEEP_Pytorch-DDP_script.sh | 36 ++ 
scripts/DEEP_Pytorch-DDP/lamec.json | 1 + .../JURECA_DeepSpeed_script.sh | 26 + scripts/JURECA_DeepSpeed/lamec.json | 1 + scripts/JURECA_HeAT/JURECA_HeAT_script.sh | 24 + scripts/JURECA_HeAT/lamec.json | 1 + .../JURECA_Horovod/JURECA_Horovod_script.sh | 24 + scripts/JURECA_Horovod/lamec.json | 1 + .../JURECA_Pytorch-DDP_script.sh | 32 ++ scripts/JURECA_Pytorch-DDP/lamec.json | 1 + .../JUWELS_DeepSpeed_script.sh | 26 + scripts/JUWELS_DeepSpeed/lamec.json | 1 + scripts/JUWELS_HeAT/JUWELS_HeAT_script.sh | 24 + scripts/JUWELS_HeAT/lamec.json | 1 + .../JUWELS_Horovod/JUWELS_Horovod_script.sh | 24 + scripts/JUWELS_Horovod/lamec.json | 1 + .../JUWELS_Pytorch-DDP_script.sh | 32 ++ scripts/JUWELS_Pytorch-DDP/lamec.json | 1 + scripts/LUMI_HeAT/LUMI_HeAT_script.sh | 39 ++ scripts/LUMI_HeAT/lamec.json | 1 + scripts/VEGA_Basilisk/_script.sh | 16 + scripts/VEGA_Basilisk/lamec.json | 1 + scripts/cyclone_basilisk/basilisk_cfd.sh | 64 --- scripts/cyclone_basilisk/lamec.json | 1 - scripts/cyclone_horovod/FNO_launch.sh | 31 -- scripts/cyclone_horovod/lamec.json | 1 - scripts/deep_clang/clang_script.sh | 13 - scripts/deep_clang/lamec.json | 1 - scripts/deep_ddp/DDP_startscript_deep.sh | 102 ---- scripts/deep_ddp/README.md | 42 -- scripts/deep_ddp/conda_torch.sh | 80 --- scripts/deep_ddp/createEnv.sh | 193 ------- scripts/deep_ddp/lamec.json | 1 - scripts/deep_ddp/reqs.txt | 6 - scripts/deep_deepspeed/DS_config.json | 14 - scripts/deep_deepspeed/DS_startscript_deep.sh | 84 --- scripts/deep_deepspeed/README.md | 15 - scripts/deep_deepspeed/createEnv.sh | 174 ------- scripts/deep_deepspeed/lamec.json | 1 - scripts/deep_deepspeed/reqs.txt | 6 - scripts/deep_heat/HeAT_startscript_deep.sh | 90 ---- scripts/deep_heat/README.md | 15 - scripts/deep_heat/createEnv.sh | 174 ------- scripts/deep_heat/example_mnist_heat.py | 184 ------- scripts/deep_heat/lamec.json | 1 - scripts/deep_heat/reqs.txt | 6 - scripts/deep_horovod/Hor_startscript_deep.sh | 90 ---- scripts/deep_horovod/README.md | 15 - scripts/deep_horovod/createEnv.sh | 174 ------- scripts/deep_horovod/lamec.json | 1 - scripts/deep_horovod/pytorch_mnist.py | 205 -------- .../pytorch_synthetic_benchmark.py | 127 ----- scripts/deep_horovod/reqs.txt | 6 - .../Create_Jupyter_deepv.ipynb | 489 ------------------ scripts/deep_tensorflow/README.md | 21 - .../deep_tensorflow/TF_startscript_deep.sh | 41 -- scripts/deep_tensorflow/createEnv_TF.sh | 99 ---- scripts/deep_tensorflow/jupyterAddKernel.sh | 109 ---- .../deep_tensorflow/jupyterCreateKernel.sh | 123 ----- scripts/deep_tensorflow/lamec.json | 1 - .../tensorflow2_synthetic_benchmark.py | 131 ----- scripts/jureca_ddp/DDP_startscript.sh | 107 ---- .../jureca_ddp/DDP_startscript_container.sh | 81 --- scripts/jureca_ddp/README.md | 39 -- scripts/jureca_ddp/createContainer.sh | 24 - scripts/jureca_ddp/createEnv.sh | 183 ------- scripts/jureca_ddp/createEnv_MPI.sh | 73 --- scripts/jureca_ddp/fixed_torch_run.py | 51 -- scripts/jureca_ddp/lamec.json | 1 - scripts/jureca_ddp/reqs.txt | 11 - scripts/jureca_deepspeed/DS_config.json | 14 - .../jureca_deepspeed/DS_startscript_deep.sh | 96 ---- scripts/jureca_deepspeed/README.md | 15 - scripts/jureca_deepspeed/createEnv.sh | 180 ------- scripts/jureca_deepspeed/lamec.json | 1 - scripts/jureca_deepspeed/reqs.txt | 8 - scripts/jureca_graphcore/GC_pytorch_mnist.py | 346 ------------- scripts/jureca_graphcore/GC_startscript.sh | 25 - scripts/jureca_graphcore/README.md | 28 - scripts/jureca_graphcore/lamec.json | 1 - scripts/jureca_heat/HeAT_startscript_deep.sh | 71 --- 
scripts/jureca_heat/README.md | 15 - scripts/jureca_heat/createEnv.sh | 180 ------- scripts/jureca_heat/lamec.json | 1 - scripts/jureca_heat/reqs.txt | 8 - .../jureca_horovod/Hor_startscript_deep.sh | 82 --- scripts/jureca_horovod/README.md | 15 - scripts/jureca_horovod/createEnv.sh | 180 ------- scripts/jureca_horovod/lamec.json | 1 - scripts/jureca_horovod/reqs.txt | 8 - scripts/jureca_libtorch/MNIST/CMakeLists.txt | 30 -- .../MNIST/LibTorch_startscript.sh | 24 - scripts/jureca_libtorch/MNIST/compile.sh | 19 - .../jureca_libtorch/MNIST/download_mnist.py | 88 ---- scripts/jureca_libtorch/MNIST/mnist.cpp | 179 ------- scripts/jureca_libtorch/README.md | 19 - .../TorchVision/compile_jpeg.sh | 14 - .../TorchVision/compile_png.sh | 14 - .../TorchVision/compile_torchvision.sh | 44 -- scripts/jureca_libtorch/lamec.json | 1 - scripts/jureca_raytune/.gitkeep | 0 scripts/jureca_raytune/README.md | 7 - scripts/jureca_raytune/RayTune+DDP/.gitkeep | 0 .../jureca_raytune/RayTune+DDP/cifar_tune.py | 132 ----- .../jureca_raytune/RayTune+DDP/create_env.sh | 16 - .../RayTune+DDP/jureca_ray_ddp_startscript.sh | 78 --- scripts/jureca_raytune/Ray_2.4/.gitkeep | 0 scripts/jureca_raytune/Ray_2.4/ASHA/.gitkeep | 0 .../Ray_2.4/ASHA/cifar_tune_asha.py | 427 --------------- .../Ray_2.4/ASHA/jureca_ray_startscript.sh | 79 --- scripts/jureca_raytune/Ray_2.4/BOHB/.gitkeep | 0 .../Ray_2.4/BOHB/cifar_tune_bohb.py | 427 --------------- .../Ray_2.4/BOHB/jureca_ray_startscript.sh | 78 --- scripts/jureca_raytune/Ray_2.4/PBT/.gitkeep | 0 .../Ray_2.4/PBT/cifar_tune_pbt.py | 459 ---------------- .../Ray_2.4/PBT/jureca_ray_startscript.sh | 78 --- .../jureca_raytune/Ray_2.4/build_ray_env.sh | 13 - scripts/jureca_raytune/Ray_2.4/hpo.md | 58 --- scripts/jureca_raytune/Ray_2.4/hpo.py | 449 ---------------- scripts/jureca_raytune/cifar_tune.py | 104 ---- scripts/jureca_raytune/cifar_tune_tf.py | 76 --- scripts/jureca_raytune/create_jureca_env.sh | 15 - scripts/jureca_raytune/jureca_run_ray.sh | 92 ---- scripts/juwels_ddp/README.md | 33 -- scripts/juwels_ddp/container_batch.sh | 85 --- scripts/juwels_ddp/container_build.sh | 23 - scripts/juwels_ddp/container_env.sh | 13 - scripts/juwels_ddp/createEnv.sh | 193 ------- scripts/juwels_ddp/env_batch.sh | 90 ---- scripts/juwels_ddp/env_build.sh | 151 ------ scripts/juwels_ddp/fixed_torch_run.py | 51 -- scripts/juwels_ddp/install_pyDDP.sh | 80 --- scripts/juwels_ddp/lamec.json | 1 - scripts/juwels_ddp/reqs.txt | 12 - scripts/vega_basilisk/.gitkeep | 0 scripts/vega_basilisk/basilisk_cfd.sh | 71 --- scripts/vega_basilisk/basilisk_pde.sh | 60 --- scripts/vega_basilisk/lamec.json | 1 - 150 files changed, 440 insertions(+), 8660 deletions(-) rename scripts/{cyclone_basilisk => CYCLONE_BASILISK}/.gitkeep (100%) create mode 100644 scripts/CYCLONE_BASILISK/_script.sh create mode 100644 scripts/CYCLONE_BASILISK/lamec.json rename scripts/{cyclone_horovod => CYCLONE_Horovod}/.gitkeep (100%) create mode 100644 scripts/CYCLONE_Horovod/_script.sh create mode 100644 scripts/CYCLONE_Horovod/lamec.json create mode 100644 scripts/DEEP_DeepSpeed/DEEP_DeepSpeed_script.sh create mode 100644 scripts/DEEP_DeepSpeed/lamec.json create mode 100644 scripts/DEEP_HeAT/DEEP_HeAT_script.sh create mode 100644 scripts/DEEP_HeAT/lamec.json create mode 100644 scripts/DEEP_Horovod/DEEP_Horovod_script.sh create mode 100644 scripts/DEEP_Horovod/lamec.json create mode 100644 scripts/DEEP_Pytorch-DDP/DEEP_Pytorch-DDP_script.sh create mode 100644 scripts/DEEP_Pytorch-DDP/lamec.json create mode 100644 
scripts/JURECA_DeepSpeed/JURECA_DeepSpeed_script.sh create mode 100644 scripts/JURECA_DeepSpeed/lamec.json create mode 100644 scripts/JURECA_HeAT/JURECA_HeAT_script.sh create mode 100644 scripts/JURECA_HeAT/lamec.json create mode 100644 scripts/JURECA_Horovod/JURECA_Horovod_script.sh create mode 100644 scripts/JURECA_Horovod/lamec.json create mode 100644 scripts/JURECA_Pytorch-DDP/JURECA_Pytorch-DDP_script.sh create mode 100644 scripts/JURECA_Pytorch-DDP/lamec.json create mode 100644 scripts/JUWELS_DeepSpeed/JUWELS_DeepSpeed_script.sh create mode 100644 scripts/JUWELS_DeepSpeed/lamec.json create mode 100644 scripts/JUWELS_HeAT/JUWELS_HeAT_script.sh create mode 100644 scripts/JUWELS_HeAT/lamec.json create mode 100644 scripts/JUWELS_Horovod/JUWELS_Horovod_script.sh create mode 100644 scripts/JUWELS_Horovod/lamec.json create mode 100644 scripts/JUWELS_Pytorch-DDP/JUWELS_Pytorch-DDP_script.sh create mode 100644 scripts/JUWELS_Pytorch-DDP/lamec.json create mode 100644 scripts/LUMI_HeAT/LUMI_HeAT_script.sh create mode 100644 scripts/LUMI_HeAT/lamec.json create mode 100644 scripts/VEGA_Basilisk/_script.sh create mode 100644 scripts/VEGA_Basilisk/lamec.json delete mode 100644 scripts/cyclone_basilisk/basilisk_cfd.sh delete mode 100644 scripts/cyclone_basilisk/lamec.json delete mode 100644 scripts/cyclone_horovod/FNO_launch.sh delete mode 100644 scripts/cyclone_horovod/lamec.json delete mode 100644 scripts/deep_clang/clang_script.sh delete mode 100644 scripts/deep_clang/lamec.json delete mode 100644 scripts/deep_ddp/DDP_startscript_deep.sh delete mode 100644 scripts/deep_ddp/README.md delete mode 100755 scripts/deep_ddp/conda_torch.sh delete mode 100755 scripts/deep_ddp/createEnv.sh delete mode 100644 scripts/deep_ddp/lamec.json delete mode 100755 scripts/deep_ddp/reqs.txt delete mode 100644 scripts/deep_deepspeed/DS_config.json delete mode 100644 scripts/deep_deepspeed/DS_startscript_deep.sh delete mode 100644 scripts/deep_deepspeed/README.md delete mode 100755 scripts/deep_deepspeed/createEnv.sh delete mode 100644 scripts/deep_deepspeed/lamec.json delete mode 100755 scripts/deep_deepspeed/reqs.txt delete mode 100644 scripts/deep_heat/HeAT_startscript_deep.sh delete mode 100644 scripts/deep_heat/README.md delete mode 100755 scripts/deep_heat/createEnv.sh delete mode 100644 scripts/deep_heat/example_mnist_heat.py delete mode 100644 scripts/deep_heat/lamec.json delete mode 100755 scripts/deep_heat/reqs.txt delete mode 100644 scripts/deep_horovod/Hor_startscript_deep.sh delete mode 100644 scripts/deep_horovod/README.md delete mode 100755 scripts/deep_horovod/createEnv.sh delete mode 100644 scripts/deep_horovod/lamec.json delete mode 100644 scripts/deep_horovod/pytorch_mnist.py delete mode 100644 scripts/deep_horovod/pytorch_synthetic_benchmark.py delete mode 100755 scripts/deep_horovod/reqs.txt delete mode 100644 scripts/deep_tensorflow/Create_Jupyter_deepv.ipynb delete mode 100644 scripts/deep_tensorflow/README.md delete mode 100644 scripts/deep_tensorflow/TF_startscript_deep.sh delete mode 100755 scripts/deep_tensorflow/createEnv_TF.sh delete mode 100755 scripts/deep_tensorflow/jupyterAddKernel.sh delete mode 100755 scripts/deep_tensorflow/jupyterCreateKernel.sh delete mode 100644 scripts/deep_tensorflow/lamec.json delete mode 100644 scripts/deep_tensorflow/tensorflow2_synthetic_benchmark.py delete mode 100644 scripts/jureca_ddp/DDP_startscript.sh delete mode 100644 scripts/jureca_ddp/DDP_startscript_container.sh delete mode 100644 scripts/jureca_ddp/README.md delete mode 100644 
scripts/jureca_ddp/createContainer.sh delete mode 100755 scripts/jureca_ddp/createEnv.sh delete mode 100644 scripts/jureca_ddp/createEnv_MPI.sh delete mode 100644 scripts/jureca_ddp/fixed_torch_run.py delete mode 100644 scripts/jureca_ddp/lamec.json delete mode 100755 scripts/jureca_ddp/reqs.txt delete mode 100644 scripts/jureca_deepspeed/DS_config.json delete mode 100644 scripts/jureca_deepspeed/DS_startscript_deep.sh delete mode 100644 scripts/jureca_deepspeed/README.md delete mode 100755 scripts/jureca_deepspeed/createEnv.sh delete mode 100644 scripts/jureca_deepspeed/lamec.json delete mode 100755 scripts/jureca_deepspeed/reqs.txt delete mode 100644 scripts/jureca_graphcore/GC_pytorch_mnist.py delete mode 100644 scripts/jureca_graphcore/GC_startscript.sh delete mode 100644 scripts/jureca_graphcore/README.md delete mode 100644 scripts/jureca_graphcore/lamec.json delete mode 100644 scripts/jureca_heat/HeAT_startscript_deep.sh delete mode 100644 scripts/jureca_heat/README.md delete mode 100755 scripts/jureca_heat/createEnv.sh delete mode 100644 scripts/jureca_heat/lamec.json delete mode 100755 scripts/jureca_heat/reqs.txt delete mode 100644 scripts/jureca_horovod/Hor_startscript_deep.sh delete mode 100644 scripts/jureca_horovod/README.md delete mode 100755 scripts/jureca_horovod/createEnv.sh delete mode 100644 scripts/jureca_horovod/lamec.json delete mode 100755 scripts/jureca_horovod/reqs.txt delete mode 100644 scripts/jureca_libtorch/MNIST/CMakeLists.txt delete mode 100644 scripts/jureca_libtorch/MNIST/LibTorch_startscript.sh delete mode 100644 scripts/jureca_libtorch/MNIST/compile.sh delete mode 100644 scripts/jureca_libtorch/MNIST/download_mnist.py delete mode 100755 scripts/jureca_libtorch/MNIST/mnist.cpp delete mode 100644 scripts/jureca_libtorch/README.md delete mode 100755 scripts/jureca_libtorch/TorchVision/compile_jpeg.sh delete mode 100755 scripts/jureca_libtorch/TorchVision/compile_png.sh delete mode 100755 scripts/jureca_libtorch/TorchVision/compile_torchvision.sh delete mode 100644 scripts/jureca_libtorch/lamec.json delete mode 100644 scripts/jureca_raytune/.gitkeep delete mode 100644 scripts/jureca_raytune/README.md delete mode 100644 scripts/jureca_raytune/RayTune+DDP/.gitkeep delete mode 100644 scripts/jureca_raytune/RayTune+DDP/cifar_tune.py delete mode 100644 scripts/jureca_raytune/RayTune+DDP/create_env.sh delete mode 100644 scripts/jureca_raytune/RayTune+DDP/jureca_ray_ddp_startscript.sh delete mode 100644 scripts/jureca_raytune/Ray_2.4/.gitkeep delete mode 100644 scripts/jureca_raytune/Ray_2.4/ASHA/.gitkeep delete mode 100644 scripts/jureca_raytune/Ray_2.4/ASHA/cifar_tune_asha.py delete mode 100644 scripts/jureca_raytune/Ray_2.4/ASHA/jureca_ray_startscript.sh delete mode 100644 scripts/jureca_raytune/Ray_2.4/BOHB/.gitkeep delete mode 100644 scripts/jureca_raytune/Ray_2.4/BOHB/cifar_tune_bohb.py delete mode 100644 scripts/jureca_raytune/Ray_2.4/BOHB/jureca_ray_startscript.sh delete mode 100644 scripts/jureca_raytune/Ray_2.4/PBT/.gitkeep delete mode 100644 scripts/jureca_raytune/Ray_2.4/PBT/cifar_tune_pbt.py delete mode 100644 scripts/jureca_raytune/Ray_2.4/PBT/jureca_ray_startscript.sh delete mode 100644 scripts/jureca_raytune/Ray_2.4/build_ray_env.sh delete mode 100644 scripts/jureca_raytune/Ray_2.4/hpo.md delete mode 100644 scripts/jureca_raytune/Ray_2.4/hpo.py delete mode 100644 scripts/jureca_raytune/cifar_tune.py delete mode 100644 scripts/jureca_raytune/cifar_tune_tf.py delete mode 100644 scripts/jureca_raytune/create_jureca_env.sh delete mode 100644 
scripts/jureca_raytune/jureca_run_ray.sh
delete mode 100644 scripts/juwels_ddp/README.md
delete mode 100644 scripts/juwels_ddp/container_batch.sh
delete mode 100644 scripts/juwels_ddp/container_build.sh
delete mode 100644 scripts/juwels_ddp/container_env.sh
delete mode 100755 scripts/juwels_ddp/createEnv.sh
delete mode 100644 scripts/juwels_ddp/env_batch.sh
delete mode 100755 scripts/juwels_ddp/env_build.sh
delete mode 100644 scripts/juwels_ddp/fixed_torch_run.py
delete mode 100755 scripts/juwels_ddp/install_pyDDP.sh
delete mode 100644 scripts/juwels_ddp/lamec.json
delete mode 100644 scripts/juwels_ddp/reqs.txt
delete mode 100644 scripts/vega_basilisk/.gitkeep
delete mode 100644 scripts/vega_basilisk/basilisk_cfd.sh
delete mode 100644 scripts/vega_basilisk/basilisk_pde.sh
delete mode 100644 scripts/vega_basilisk/lamec.json
diff --git a/scripts/cyclone_basilisk/.gitkeep b/scripts/CYCLONE_BASILISK/.gitkeep
similarity index 100%
rename from scripts/cyclone_basilisk/.gitkeep
rename to scripts/CYCLONE_BASILISK/.gitkeep
diff --git a/scripts/CYCLONE_BASILISK/_script.sh b/scripts/CYCLONE_BASILISK/_script.sh
new file mode 100644
index 0000000..4840e23
--- /dev/null
+++ b/scripts/CYCLONE_BASILISK/_script.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+#SBATCH --job-name=job
+#SBATCH --output=job.out
+#SBATCH --error=job.err
+#SBATCH --account=%account%
+#SBATCH --partition=%partition%
+#SBATCH --nodes=%nodes%
+
+#MODULES BEGIN cyclone basilisk
+module purge
+ml load SWIG/4.0.2-GCCcore-10.2.0 Bison/3.7.1-GCCcore-10.2.0 CMake/3.18.4-GCCcore-10.2.0 Python/3.8.6-GCCcore-10.2.0 flex/2.6.4-GCCcore-10.2.0 glew/2.2.0-GCCcore-10.2.0-osmesa Mesa/20.2.1-GCCcore-10.2.0 libGLU/9.0.1-GCCcore-10.2.0 OpenMPI/4.0.5-GCC-10.2.0
+#MODULES END
+
+source your/env_path/bin/activate
+
+srun --exclusive %executable%
diff --git a/scripts/CYCLONE_BASILISK/lamec.json b/scripts/CYCLONE_BASILISK/lamec.json
new file mode 100644
index 0000000..2837d61
--- /dev/null
+++ b/scripts/CYCLONE_BASILISK/lamec.json
@@ -0,0 +1 @@
+{"template": "_script.sh"}
diff --git a/scripts/cyclone_horovod/.gitkeep b/scripts/CYCLONE_Horovod/.gitkeep
similarity index 100%
rename from scripts/cyclone_horovod/.gitkeep
rename to scripts/CYCLONE_Horovod/.gitkeep
diff --git a/scripts/CYCLONE_Horovod/_script.sh b/scripts/CYCLONE_Horovod/_script.sh
new file mode 100644
index 0000000..bf0f6b9
--- /dev/null
+++ b/scripts/CYCLONE_Horovod/_script.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+#SBATCH --job-name=job
+#SBATCH --output=job.out
+#SBATCH --error=job.err
+#SBATCH --account=%account%
+#SBATCH --partition=%partition%
+#SBATCH --nodes=%nodes%
+#SBATCH --gpus-per-node=4
+#SBATCH --ntasks-per-node=4
+
+#MODULES BEGIN cyclone horovod
+module purge
+ml load h5py tqdm matplotlib PyTorch/1.9.1-fosscuda-2020b Horovod/0.22.0-fosscuda-2020b-PyTorch-1.9.1
+#MODULES END
+
+source your/env_path/bin/activate
+
+# Horovod NCCL/MPI setup
+srun --cpu-bind=none python3 -u %executable%
diff --git a/scripts/CYCLONE_Horovod/lamec.json b/scripts/CYCLONE_Horovod/lamec.json
new file mode 100644
index 0000000..2837d61
--- /dev/null
+++ b/scripts/CYCLONE_Horovod/lamec.json
@@ -0,0 +1 @@
+{"template": "_script.sh"}
diff --git a/scripts/DEEP_DeepSpeed/DEEP_DeepSpeed_script.sh b/scripts/DEEP_DeepSpeed/DEEP_DeepSpeed_script.sh
new file mode 100644
index 0000000..e40ea0a
--- /dev/null
+++ b/scripts/DEEP_DeepSpeed/DEEP_DeepSpeed_script.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+#SBATCH --job-name=job
+#SBATCH --output=job.out
+#SBATCH --error=job.err
+#SBATCH --account=%account%
+#SBATCH --partition=%partition%
+#SBATCH --nodes=%nodes%
+#SBATCH --gpus-per-node=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=12
+#SBATCH --exclusive
+#SBATCH --gres=gpu:1
+
+#MODULES BEGIN DEEP DeepSpeed
+ml Stages/2024 GCC OpenMPI CUDA/12 cuDNN MPI-settings/CUDA
+ml Python CMake HDF5 PnetCDF libaio mpi4py
+#MODULES END
+
+# variables for specific HPC
+export CUDA_VISIBLE_DEVICES="0"
+ln -sf /usr/lib64/libcuda.so.1
+ln -sf /usr/lib64/libnvidia-ml.so.1
+export LD_LIBRARY_PATH=.:/usr/local/cuda-11.7/lib64:$LD_LIBRARY_PATH
+
+source your/env_path/bin/activate
+
+# DeepSpeed NCCL/MPI setup
+export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)i
+export MASTER_PORT=29500
+srun --cpu-bind=none python %executable% --deepspeed
diff --git a/scripts/DEEP_DeepSpeed/lamec.json b/scripts/DEEP_DeepSpeed/lamec.json
new file mode 100644
index 0000000..961eb5d
--- /dev/null
+++ b/scripts/DEEP_DeepSpeed/lamec.json
@@ -0,0 +1 @@
+{"template": "DEEP_DeepSpeed_script.sh"}
diff --git a/scripts/DEEP_HeAT/DEEP_HeAT_script.sh b/scripts/DEEP_HeAT/DEEP_HeAT_script.sh
new file mode 100644
index 0000000..3586839
--- /dev/null
+++ b/scripts/DEEP_HeAT/DEEP_HeAT_script.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+#SBATCH --job-name=job
+#SBATCH --output=job.out
+#SBATCH --error=job.err
+#SBATCH --account=%account%
+#SBATCH --partition=%partition%
+#SBATCH --nodes=%nodes%
+#SBATCH --gpus-per-node=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=12
+#SBATCH --exclusive
+#SBATCH --gres=gpu:1
+
+#MODULES BEGIN DEEP HeAT
+ml Stages/2024 GCC OpenMPI CUDA/12 cuDNN MPI-settings/CUDA
+ml Python CMake HDF5 PnetCDF libaio mpi4py
+#MODULES END
+
+# variables for specific HPC
+export CUDA_VISIBLE_DEVICES="0"
+ln -sf /usr/lib64/libcuda.so.1
+ln -sf /usr/lib64/libnvidia-ml.so.1
+export LD_LIBRARY_PATH=.:/usr/local/cuda-11.7/lib64:$LD_LIBRARY_PATH
+
+source your/env_path/bin/activate
+
+# HeAT NCCL setup
+srun --cpu-bind=none python3 -u %executable%
diff --git a/scripts/DEEP_HeAT/lamec.json b/scripts/DEEP_HeAT/lamec.json
new file mode 100644
index 0000000..22c7482
--- /dev/null
+++ b/scripts/DEEP_HeAT/lamec.json
@@ -0,0 +1 @@
+{"template": "DEEP_HeAT_script.sh"}
diff --git a/scripts/DEEP_Horovod/DEEP_Horovod_script.sh b/scripts/DEEP_Horovod/DEEP_Horovod_script.sh
new file mode 100644
index 0000000..a077b9a
--- /dev/null
+++ b/scripts/DEEP_Horovod/DEEP_Horovod_script.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+#SBATCH --job-name=job
+#SBATCH --output=job.out
+#SBATCH --error=job.err
+#SBATCH --account=%account%
+#SBATCH --partition=%partition%
+#SBATCH --nodes=%nodes%
+#SBATCH --gpus-per-node=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=12
+#SBATCH --exclusive
+#SBATCH --gres=gpu:1
+
+#MODULES BEGIN DEEP Horovod
+ml Stages/2024 GCC OpenMPI CUDA/12 cuDNN MPI-settings/CUDA
+ml Python CMake HDF5 PnetCDF libaio mpi4py
+#MODULES END
+
+# variables for specific HPC
+export CUDA_VISIBLE_DEVICES="0"
+ln -sf /usr/lib64/libcuda.so.1
+ln -sf /usr/lib64/libnvidia-ml.so.1
+export LD_LIBRARY_PATH=.:/usr/local/cuda-11.7/lib64:$LD_LIBRARY_PATH
+
+source your/env_path/bin/activate
+
+# Horovod NCCL/MPI setup
+srun --mpi=pspmix python3 -u %executable%
diff --git a/scripts/DEEP_Horovod/lamec.json b/scripts/DEEP_Horovod/lamec.json
new file mode 100644
index 0000000..7e0350b
--- /dev/null
+++ b/scripts/DEEP_Horovod/lamec.json
@@ -0,0 +1 @@
+{"template": "DEEP_Horovod_script.sh"}
diff --git a/scripts/DEEP_Pytorch-DDP/DEEP_Pytorch-DDP_script.sh b/scripts/DEEP_Pytorch-DDP/DEEP_Pytorch-DDP_script.sh
new file mode 100644
index 0000000..5f415a2
--- /dev/null
+++ b/scripts/DEEP_Pytorch-DDP/DEEP_Pytorch-DDP_script.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+#SBATCH --job-name=job
+#SBATCH --output=job.out
+#SBATCH --error=job.err
+#SBATCH --account=%account%
+#SBATCH --partition=%partition%
+#SBATCH --nodes=%nodes%
+#SBATCH --gpus-per-node=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=12
+#SBATCH --exclusive
+#SBATCH --gres=gpu:1
+
+#MODULES BEGIN DEEP Pytorch-DDP
+ml Stages/2024 GCC OpenMPI CUDA/12 cuDNN MPI-settings/CUDA
+ml Python CMake HDF5 PnetCDF libaio mpi4py
+#MODULES END
+
+# variables for specific HPC
+export CUDA_VISIBLE_DEVICES="0"
+ln -sf /usr/lib64/libcuda.so.1
+ln -sf /usr/lib64/libnvidia-ml.so.1
+export LD_LIBRARY_PATH=.:/usr/local/cuda-11.7/lib64:$LD_LIBRARY_PATH
+
+source your/env_path/bin/activate
+
+ # DDP NCCL setup
+srun --cpu-bind=none bash -c "torchrun \
+ --log_dir='logs' \
+ --nnodes=$SLURM_NNODES \
+ --nproc_per_node=$SLURM_GPUS_PER_NODE \
+ --rdzv_id=$SLURM_JOB_ID \
+ --rdzv_backend=c10d \
+ --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \
+ --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \
+%executable%"
diff --git a/scripts/DEEP_Pytorch-DDP/lamec.json b/scripts/DEEP_Pytorch-DDP/lamec.json
new file mode 100644
index 0000000..000e9a6
--- /dev/null
+++ b/scripts/DEEP_Pytorch-DDP/lamec.json
@@ -0,0 +1 @@
+{"template": "DEEP_Pytorch-DDP_script.sh"}
diff --git a/scripts/JURECA_DeepSpeed/JURECA_DeepSpeed_script.sh b/scripts/JURECA_DeepSpeed/JURECA_DeepSpeed_script.sh
new file mode 100644
index 0000000..7a88e71
--- /dev/null
+++ b/scripts/JURECA_DeepSpeed/JURECA_DeepSpeed_script.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+#SBATCH --job-name=job
+#SBATCH --output=job.out
+#SBATCH --error=job.err
+#SBATCH --account=%account%
+#SBATCH --partition=%partition%
+#SBATCH --nodes=%nodes%
+#SBATCH --gpus-per-node=4
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=128
+#SBATCH --exclusive
+#SBATCH --gres=gpu:4
+
+#MODULES BEGIN JURECA DeepSpeed
+ml GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py
+#MODULES END
+
+# variables for specific HPC
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+
+source your/env_path/bin/activate
+
+# DeepSpeed NCCL/MPI setup
+export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)i
+export MASTER_PORT=29500
+srun --cpu-bind=none python %executable% --deepspeed
diff --git a/scripts/JURECA_DeepSpeed/lamec.json b/scripts/JURECA_DeepSpeed/lamec.json
new file mode 100644
index 0000000..6135415
--- /dev/null
+++ b/scripts/JURECA_DeepSpeed/lamec.json
@@ -0,0 +1 @@
+{"template": "JURECA_DeepSpeed_script.sh"}
diff --git a/scripts/JURECA_HeAT/JURECA_HeAT_script.sh b/scripts/JURECA_HeAT/JURECA_HeAT_script.sh
new file mode 100644
index 0000000..1fe2f16
--- /dev/null
+++ b/scripts/JURECA_HeAT/JURECA_HeAT_script.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+#SBATCH --job-name=job
+#SBATCH --output=job.out
+#SBATCH --error=job.err
+#SBATCH --account=%account%
+#SBATCH --partition=%partition%
+#SBATCH --nodes=%nodes%
+#SBATCH --gpus-per-node=4
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=32
+#SBATCH --exclusive
+#SBATCH --gres=gpu:4
+
+#MODULES BEGIN JURECA HeAT
+ml GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py
+#MODULES END
+
+# variables for specific HPC
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+
+source your/env_path/bin/activate
+
+# HeAT NCCL setup
+srun --cpu-bind=none python3 -u %executable%
diff --git a/scripts/JURECA_HeAT/lamec.json b/scripts/JURECA_HeAT/lamec.json
new file mode 100644
index 0000000..0d23a8a
--- /dev/null
+++ b/scripts/JURECA_HeAT/lamec.json
@@ -0,0 +1 @@
+{"template": "JURECA_HeAT_script.sh"}
diff --git a/scripts/JURECA_Horovod/JURECA_Horovod_script.sh b/scripts/JURECA_Horovod/JURECA_Horovod_script.sh
new file mode 100644
index 0000000..49de9fc
--- /dev/null
+++ b/scripts/JURECA_Horovod/JURECA_Horovod_script.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+#SBATCH --job-name=job
+#SBATCH --output=job.out
+#SBATCH --error=job.err
+#SBATCH --account=%account%
+#SBATCH --partition=%partition%
+#SBATCH --nodes=%nodes%
+#SBATCH --gpus-per-node=4
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=32
+#SBATCH --exclusive
+#SBATCH --gres=gpu:4
+
+#MODULES BEGIN JURECA Horovod
+ml GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py
+#MODULES END
+
+# variables for specific HPC
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+
+source your/env_path/bin/activate
+
+# Horovod NCCL/MPI setup
+srun --cpu-bind=none python3 -u %executable%
diff --git a/scripts/JURECA_Horovod/lamec.json b/scripts/JURECA_Horovod/lamec.json
new file mode 100644
index 0000000..74ea6d2
--- /dev/null
+++ b/scripts/JURECA_Horovod/lamec.json
@@ -0,0 +1 @@
+{"template": "JURECA_Horovod_script.sh"}
diff --git a/scripts/JURECA_Pytorch-DDP/JURECA_Pytorch-DDP_script.sh b/scripts/JURECA_Pytorch-DDP/JURECA_Pytorch-DDP_script.sh
new file mode 100644
index 0000000..cf6cdc4
--- /dev/null
+++ b/scripts/JURECA_Pytorch-DDP/JURECA_Pytorch-DDP_script.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+#SBATCH --job-name=job
+#SBATCH --output=job.out
+#SBATCH --error=job.err
+#SBATCH --account=%account%
+#SBATCH --partition=%partition%
+#SBATCH --nodes=%nodes%
+#SBATCH --gpus-per-node=4
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=128
+#SBATCH --exclusive
+#SBATCH --gres=gpu:4
+
+#MODULES BEGIN JURECA Pytorch-DDP
+ml GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py
+#MODULES END
+
+# variables for specific HPC
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+
+source your/env_path/bin/activate
+
+ # DDP NCCL setup
+srun --cpu-bind=none bash -c "torchrun \
+ --log_dir='logs' \
+ --nnodes=$SLURM_NNODES \
+ --nproc_per_node=$SLURM_GPUS_PER_NODE \
+ --rdzv_id=$SLURM_JOB_ID \
+ --rdzv_backend=c10d \
+ --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \
+ --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \
+%executable%"
diff --git a/scripts/JURECA_Pytorch-DDP/lamec.json b/scripts/JURECA_Pytorch-DDP/lamec.json
new file mode 100644
index 0000000..a9d0f0b
--- /dev/null
+++ b/scripts/JURECA_Pytorch-DDP/lamec.json
@@ -0,0 +1 @@
+{"template": "JURECA_Pytorch-DDP_script.sh"}
diff --git a/scripts/JUWELS_DeepSpeed/JUWELS_DeepSpeed_script.sh b/scripts/JUWELS_DeepSpeed/JUWELS_DeepSpeed_script.sh
new file mode 100644
index 0000000..3a3ba90
--- /dev/null
+++ b/scripts/JUWELS_DeepSpeed/JUWELS_DeepSpeed_script.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+#SBATCH --job-name=job
+#SBATCH --output=job.out
+#SBATCH --error=job.err
+#SBATCH --account=%account%
+#SBATCH --partition=%partition%
+#SBATCH --nodes=%nodes%
+#SBATCH --gpus-per-node=4
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=48
+#SBATCH --exclusive
+#SBATCH --gres=gpu:4
+
+#MODULES BEGIN JUWELS DeepSpeed
+ml GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py
+#MODULES END
+
+# variables for specific HPC
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+
+source your/env_path/bin/activate
+
+# DeepSpeed NCCL/MPI setup
+export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)i
+export MASTER_PORT=29500
+srun --cpu-bind=none python %executable% --deepspeed
diff --git a/scripts/JUWELS_DeepSpeed/lamec.json b/scripts/JUWELS_DeepSpeed/lamec.json
new file mode 100644
index 0000000..90d2f1f
--- /dev/null
+++ b/scripts/JUWELS_DeepSpeed/lamec.json
@@ -0,0 +1 @@
+{"template": "JUWELS_DeepSpeed_script.sh"}
diff --git a/scripts/JUWELS_HeAT/JUWELS_HeAT_script.sh b/scripts/JUWELS_HeAT/JUWELS_HeAT_script.sh
new file mode 100644
index 0000000..e26eb11
--- /dev/null
+++ b/scripts/JUWELS_HeAT/JUWELS_HeAT_script.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+#SBATCH --job-name=job
+#SBATCH --output=job.out
+#SBATCH --error=job.err
+#SBATCH --account=%account%
+#SBATCH --partition=%partition%
+#SBATCH --nodes=%nodes%
+#SBATCH --gpus-per-node=4
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=12
+#SBATCH --exclusive
+#SBATCH --gres=gpu:4
+
+#MODULES BEGIN JUWELS HeAT
+ml GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py
+#MODULES END
+
+# variables for specific HPC
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+
+source your/env_path/bin/activate
+
+# HeAT NCCL setup
+srun --cpu-bind=none python3 -u %executable%
diff --git a/scripts/JUWELS_HeAT/lamec.json b/scripts/JUWELS_HeAT/lamec.json
new file mode 100644
index 0000000..8d27246
--- /dev/null
+++ b/scripts/JUWELS_HeAT/lamec.json
@@ -0,0 +1 @@
+{"template": "JUWELS_HeAT_script.sh"}
diff --git a/scripts/JUWELS_Horovod/JUWELS_Horovod_script.sh b/scripts/JUWELS_Horovod/JUWELS_Horovod_script.sh
new file mode 100644
index 0000000..e2d3dac
--- /dev/null
+++ b/scripts/JUWELS_Horovod/JUWELS_Horovod_script.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+#SBATCH --job-name=job
+#SBATCH --output=job.out
+#SBATCH --error=job.err
+#SBATCH --account=%account%
+#SBATCH --partition=%partition%
+#SBATCH --nodes=%nodes%
+#SBATCH --gpus-per-node=4
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=12
+#SBATCH --exclusive
+#SBATCH --gres=gpu:4
+
+#MODULES BEGIN JUWELS Horovod
+ml GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py
+#MODULES END
+
+# variables for specific HPC
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+
+source your/env_path/bin/activate
+
+# Horovod NCCL/MPI setup
+srun --cpu-bind=none python3 -u %executable%
diff --git a/scripts/JUWELS_Horovod/lamec.json b/scripts/JUWELS_Horovod/lamec.json
new file mode 100644
index 0000000..6a62839
--- /dev/null
+++ b/scripts/JUWELS_Horovod/lamec.json
@@ -0,0 +1 @@
+{"template": "JUWELS_Horovod_script.sh"}
diff --git a/scripts/JUWELS_Pytorch-DDP/JUWELS_Pytorch-DDP_script.sh b/scripts/JUWELS_Pytorch-DDP/JUWELS_Pytorch-DDP_script.sh
new file mode 100644
index 0000000..b6f9abd
--- /dev/null
+++ b/scripts/JUWELS_Pytorch-DDP/JUWELS_Pytorch-DDP_script.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+#SBATCH --job-name=job
+#SBATCH --output=job.out
+#SBATCH --error=job.err
+#SBATCH --account=%account%
+#SBATCH --partition=%partition%
+#SBATCH --nodes=%nodes%
+#SBATCH --gpus-per-node=4
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=48
+#SBATCH --exclusive
+#SBATCH --gres=gpu:4
+
+#MODULES BEGIN JUWELS Pytorch-DDP
+ml GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py
+#MODULES END
+
+# variables for specific HPC
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+
+source your/env_path/bin/activate
+
+ # DDP NCCL setup
+srun --cpu-bind=none bash -c "torchrun \
+ --log_dir='logs' \
+ --nnodes=$SLURM_NNODES \
+ --nproc_per_node=$SLURM_GPUS_PER_NODE \
+ --rdzv_id=$SLURM_JOB_ID \
+ --rdzv_backend=c10d \
+ --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \
+ --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \
+%executable%"
diff --git a/scripts/JUWELS_Pytorch-DDP/lamec.json b/scripts/JUWELS_Pytorch-DDP/lamec.json
new file mode 100644
index 0000000..ee980ba
--- /dev/null
+++ b/scripts/JUWELS_Pytorch-DDP/lamec.json
@@ -0,0 +1 @@
+{"template": "JUWELS_Pytorch-DDP_script.sh"}
diff --git a/scripts/LUMI_HeAT/LUMI_HeAT_script.sh b/scripts/LUMI_HeAT/LUMI_HeAT_script.sh
new file mode 100644
index 0000000..250b51a
--- /dev/null
+++ b/scripts/LUMI_HeAT/LUMI_HeAT_script.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+#SBATCH --job-name=job
+#SBATCH --output=job.out
+#SBATCH --error=job.err
+#SBATCH --account=%account%
+#SBATCH --partition=%partition%
+#SBATCH --nodes=%nodes%
+#SBATCH --gpus-per-node=8
+#SBATCH --ntasks-per-node=8
+#SBATCH --cpus-per-task=8
+#SBATCH --exclusive
+#SBATCH --gres=gpu:8
+
+#MODULES BEGIN LUMI HeAT
+ml LUMI/22.08 partition/G rocm ModulePowerUser/LUMI buildtools cray-python
+#MODULES END
+
+# variables for specific HPC
+HIP_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
+export ROCR_VISIBLE_DEVICES=$SLURM_LOCALID
+export LD_LIBRARY_PATH=$HIP_LIB_PATH:$LD_LIBRARY_PATH
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET_GDR_LEVEL=3
+export MIOPEN_USER_DB_PATH=/tmp/${USER}-miopen-cache-${SLURM_JOB_ID}
+export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
+export CXI_FORK_SAFE=1
+export CXI_FORK_SAFE_HP=1
+export FI_CXI_DISABLE_CQ_HUGETLB=1
+export HCC_AMDGPU_TARGET=gfx90a
+export HIP_LAUNCH_BLOCKING=1
+export NCCL_ASYNC_ERROR_HANDLING=1
+export NCCL_IB_TIMEOUT=50
+export UCX_RC_TIMEOUT=4s
+export NCCL_IB_RETRY_CNT=10
+
+source your/env_path/bin/activate
+
+#HeAT NCCL setup
+srun --cpu-bind=none python3 -u %executable%
diff --git a/scripts/LUMI_HeAT/lamec.json b/scripts/LUMI_HeAT/lamec.json
new file mode 100644
index 0000000..0ff777c
--- /dev/null
+++ b/scripts/LUMI_HeAT/lamec.json
@@ -0,0 +1 @@
+{"template": "LUMI_HeAT_script.sh"}
diff --git a/scripts/VEGA_Basilisk/_script.sh b/scripts/VEGA_Basilisk/_script.sh
new file mode 100644
index 0000000..57530b0
--- /dev/null
+++ b/scripts/VEGA_Basilisk/_script.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+#SBATCH --job-name=job
+#SBATCH --output=job.out
+#SBATCH --error=job.err
+#SBATCH --account=%account%
+#SBATCH --partition=%partition%
+#SBATCH --nodes=%nodes%
+
+#MODULES BEGIN VEGA Basilisk
+module purge
+ml load Bison/3.7.1-GCCcore-10.2.0 CMake/3.18.4-GCCcore-10.2.0 Python/3.8.6-GCCcore-10.2.0 flex/2.6.4-GCCcore-10.2.0 SWIG/4.0.2-GCCcore-10.3.0 Mesa/20.2.1-GCCcore-10.2.0 libGLU/9.0.1-GCCcore-10.2.0 OpenMPI/4.1.3-GCC-10.3.0 ImageMagick/7.0.10-35-GCCcore-10.2.0 FFmpeg/4.4.2-GCCcore-11.3.0
+#MODULES END
+
+source your/env_path/bin/activate
+
+srun --mpi=pmix %executable%
diff --git a/scripts/VEGA_Basilisk/lamec.json b/scripts/VEGA_Basilisk/lamec.json
new file mode 100644
index 0000000..2837d61
--- /dev/null
+++ b/scripts/VEGA_Basilisk/lamec.json
@@ -0,0 +1 @@
+{"template": "_script.sh"}
diff --git a/scripts/cyclone_basilisk/basilisk_cfd.sh b/scripts/cyclone_basilisk/basilisk_cfd.sh
deleted file mode 100644
index a39763b..0000000
--- a/scripts/cyclone_basilisk/basilisk_cfd.sh
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=case0
-#SBATCH --account=p084
-#SBATCH --nodes=1
-#SBATCH --ntasks=20
-#SBATCH --hint=nomultithread
-###SBATCH --mem=180G
-#SBATCH --time=24:00:00
-#SBATCH --output=out.%j
-#SBATCH --error=log.%j
-#SBATCH --partition=cpu
-#SBATCH --exclusive
-
-module purge
-module load SWIG/4.0.2-GCCcore-10.2.0 Bison/3.7.1-GCCcore-10.2.0 CMake/3.18.4-GCCcore-10.2.0 Python/3.8.6-GCCcore-10.2.0
flex/2.6.4-GCCcore-10.2.0 glew/2.2.0-GCCcore-10.2.0-osmesa -module load Mesa/20.2.1-GCCcore-10.2.0 libGLU/9.0.1-GCCcore-10.2.0 -module load OpenMPI/4.0.5-GCC-10.2.0 -##module load FFmpeg -export BASILISK=/onyx/data/p084/basilisk/src -export PATH=$PATH:$BASILISK - -echo "Starting at `date`" -echo "Running on hosts: $SLURM_NODELIST" -echo "Running on $SLURM_NNODES nodes." -echo "Running on $SLURM_NPROCS processors." -echo "Job id is $SLURM_JOBID" - -ax_max=40 -ax_min=0 -ay_max=40 -ay_min=0 - b_max=10 - b_min=0 -xc_max=0.5 -xc_min=0.2 -yc_max=0.8 -yc_min=0.5 - -file="params.in" - -if ! [[ -f "restart" ]] ; then - RANDOM=$(date +%s%N | cut -b10-19) # give a seed - echo "$RANDOM / 32767 * ($ax_max-$ax_min) + $ax_min" | bc -l > $file - echo "$RANDOM / 32767 * ($ay_max-$ay_min) + $ay_min" | bc -l >> $file - echo "$RANDOM / 32767 * ( $b_max- $b_min) + $b_min" | bc -l >> $file - echo "$RANDOM / 32767 * ($xc_max-$xc_min) + $xc_min" | bc -l >> $file - echo "$RANDOM / 32767 * ($yc_max-$yc_min) + $yc_min" | bc -l >> $file -fi - -if ! [[ -d "output/" ]] ; then - mkdir output/ - mkdir output/wet_area/ - mkdir output/facets/ - mkdir output/my_output/ -fi - -#CC99='mpicc -std=c99' qcc -O2 -Wall -D_MPI=1 sessileweb_no_opengl.c -o run -lm -L$EBROOTLIBGLU/lib -lGLU -L$EBROOTMESA/lib -lOSMesa -L/onyx/data/p084/basilisk_new/basilisk/src/gl -lglutils -lfb_osmesa -CC99='mpicc -std=c99' qcc -O2 -Wall -D_MPI=1 drop.c -o run -lm -L$EBROOTLIBGLU/lib -lGLU -L$EBROOTGLEW/lib64 -lGLEW -L/onyx/data/p084/basilisk/src/gl -lfb_glx -lglutils -L$EBROOTLIBGLVND/lib -lGL -L$EBROOTX11/lib -lX11 - -srun --exclusive -K1 -n $SLURM_NTASKS ./run -# 2> log-$SLURM_NTASKS > out-$SLURM_NTASKS -# 2> log > out - -echo "Program finished with exit code $? at: `date`" diff --git a/scripts/cyclone_basilisk/lamec.json b/scripts/cyclone_basilisk/lamec.json deleted file mode 100644 index 1064dc7..0000000 --- a/scripts/cyclone_basilisk/lamec.json +++ /dev/null @@ -1 +0,0 @@ -{"startscript": "basilisk_cfd.sh"} \ No newline at end of file diff --git a/scripts/cyclone_horovod/FNO_launch.sh b/scripts/cyclone_horovod/FNO_launch.sh deleted file mode 100644 index 6531694..0000000 --- a/scripts/cyclone_horovod/FNO_launch.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=FNO # Job name -#SBATCH --partition=gpu # Partition -#SBATCH --nodes=8 # Number of nodes -#BATCH --gpus-per-node=4 # Number of GPUs per node -####SBATCH --gres=gpu:4 # Number of GPUs per node -#####SBATCH --ntasks-per-node=4 # Number of tasks -#SBATCH --output=job.%j.out # Stdout (%j=jobId) -#SBATCH --error=job.%j.err # Stderr (%j=jobId) -#SBATCH --time=24:00:00 # Walltime -#SBATCH -A p101 - -module purge -module load h5py -module load tqdm -module load matplotlib -module load PyTorch/1.9.1-fosscuda-2020b -module load Horovod/0.22.0-fosscuda-2020b-PyTorch-1.9.1 - -echo "Starting at `date`" -echo "Running on hosts: $SLURM_NODELIST" -echo "Running on $SLURM_NNODES nodes." -echo "Running on $SLURM_NPROCS processors." -echo "Job id is $SLURM_JOBID" - -srun python3 train_mixF_hrvd.py - - - -echo "Program finished with exit code $? 
at: `date`" diff --git a/scripts/cyclone_horovod/lamec.json b/scripts/cyclone_horovod/lamec.json deleted file mode 100644 index 212f4d1..0000000 --- a/scripts/cyclone_horovod/lamec.json +++ /dev/null @@ -1 +0,0 @@ -{"startscript": "FNO_launch.sh"} \ No newline at end of file diff --git a/scripts/deep_clang/clang_script.sh b/scripts/deep_clang/clang_script.sh deleted file mode 100644 index 088a5dc..0000000 --- a/scripts/deep_clang/clang_script.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env bash -#SBATCH --account=%account% -#SBATCH --partition=%partition% -#SBATCH --nodes=%nodes% -#SBATCH --time=0:00:10 - -%undefined% - -PROGNAME="%executable%" - -ml Stages/Devel-2019a Clang/10.0.1 - -clang "$PROGNAME".c -o "$PROGNAME" diff --git a/scripts/deep_clang/lamec.json b/scripts/deep_clang/lamec.json deleted file mode 100644 index 1b2ee64..0000000 --- a/scripts/deep_clang/lamec.json +++ /dev/null @@ -1 +0,0 @@ -{"template": "clang_script.sh"} diff --git a/scripts/deep_ddp/DDP_startscript_deep.sh b/scripts/deep_ddp/DDP_startscript_deep.sh deleted file mode 100644 index 8990317..0000000 --- a/scripts/deep_ddp/DDP_startscript_deep.sh +++ /dev/null @@ -1,102 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=TorchTest -#SBATCH --account=deepext -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job.out -#SBATCH --error=job.err -#SBATCH --time=0-01:00:00 - -# configure node and process count on the CM -#SBATCH --partition=dp-esb -# SBATCH --partition=dp-dam -#SBATCH --nodes=4 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=16 -#SBATCH --gpus-per-node=1 -#SBATCH --exclusive - -# parameters -debug=false # do debug -bs=96 # batch-size -epochs=1 # epochs -lr=0.01 # learning rate - -# dataset -# MNIST -#dataDir="/p/project/prcoe12/RAISE/data_MNIST/" -#COMMAND="DDP_pytorch_mnist.py" - -# AT -dataDir="/p/project/prcoe12/RAISE/T31/" -COMMAND="DDP_pytorch_AT.py" - -EXEC="$COMMAND \ - --batch-size $bs \ - --epochs $epochs \ - --lr $lr \ - --nworker $SLURM_CPUS_PER_TASK \ - --data-dir $dataDir" - -# set modules -ml --force purge -ml use $OTHERSTAGES -ml Stages/2022 GCC/11.2.0 OpenMPI/4.1.2 cuDNN/8.3.1.22-CUDA-11.5 NCCL/2.11.4-CUDA-11.5 Python/3.9.6 - -# recent bug: https://gitlab.jsc.fz-juelich.de/software-team/easybuild/-/wikis/Failed-to-initialize-NVML-Driver-library-version-mismatch-message -ml -nvidia-driver/.default - -# set env - pip -source /p/project/prcoe12/RAISE/envAI_deepv/bin/activate - -# set env - conda -#source /p/project/prcoe12/RAISE/miniconda3_deepv/etc/profile.d/conda.sh -#conda activate - -# New CUDA drivers on the compute nodes -ln -s /usr/lib64/libcuda.so.1 . -ln -s /usr/lib64/libnvidia-ml.so.1 . 
-LD_LIBRARY_PATH=.:/usr/local/cuda/lib64:$LD_LIBRARY_PATH - -# sleep a sec -sleep 1 - -# job info -echo "TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: SLURM_NODEID: $SLURM_NODEID" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi - -# set comm, CUDA and OMP -#export PSP_CUDA=1 # not needed atm -#export PSP_UCP=1 # not needed atm -export CUDA_VISIBLE_DEVICES="0" -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" > 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi - -# launch -srun bash -c "torchrun \ - --log_dir='logs' \ - --nnodes=$SLURM_NNODES \ - --nproc_per_node=$SLURM_GPUS_PER_NODE \ - --rdzv_id=$SLURM_JOB_ID \ - --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ - --rdzv_backend=c10d \ - --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \ - $EXEC" - -# eof diff --git a/scripts/deep_ddp/README.md b/scripts/deep_ddp/README.md deleted file mode 100644 index de5d06c..0000000 --- a/scripts/deep_ddp/README.md +++ /dev/null @@ -1,42 +0,0 @@ -# DL using DDP on deepv - -# source -https://github.com/pytorch/pytorch#from-source - -# current isues -1. dirty fix to infiniband IPs\ -https://github.com/pytorch/pytorch/issues/73656 - -# to-do -1. - -# done -1. CUDA is back! -2. connection issues are solved -3. updated to torch 1.10.0 -4. updated to torch 1.10.2 -5. infiniband IPs updated - -# usage - pip -1. clone -2. run `./createENV.sh` -3. submit `sbatch DDP_startscript_deep.sh` - -# usage - conda -1. clone -2. run `./conda_torch.sh` -3. modify `DDP_startscript_deep.sh`\ -comment out previous source\ -`source /p/project/prcoe12/RAISE/envAI_deepv/bin/activate`\ -uncomment:\ -`source /p/project/prcoe12/RAISE/miniconda3_deepv/etc/profile.d/conda.sh`\ -`conda activate` -4. submit `sbatch DDP_startscript_deep.sh` - -# updates -1. with the new Stage2020, Conda is no longer needed! Simply use the envAI_deep as:\ -`ml use $OTHERSTAGES`\ -`ml Stages/2022 GCC OpenMPI Python cuDNN NCCL Python`\ -`source /p/project/prcoe12/RAISE/envAI_deepv/bin/activate` -2. shared memory type performance increase is adapted, simply increase `--cpus-per-task` -3. migrated to OpenMPI (pscom issues) and updated to IB IPs diff --git a/scripts/deep_ddp/conda_torch.sh b/scripts/deep_ddp/conda_torch.sh deleted file mode 100755 index 59213f0..0000000 --- a/scripts/deep_ddp/conda_torch.sh +++ /dev/null @@ -1,80 +0,0 @@ -#!/bin/sh -# author: EI -# version: 210709a - -# get dir -iDir=$PWD - -# set modules -module --force purge -module use $OTHERSTAGES -ml Stages/2020 GCC/9.3.0 ParaStationMPI/5.4.7-1-mt CMake Ninja cuDNN NCCL mpi-settings/CUDA - -# conda -if [ -d "${iDir}/miniconda3" ];then - echo "miniconda3 already installed!" - source ${iDir}/miniconda3/etc/profile.d/conda.sh - conda activate -else - echo "miniconda3 will be compiled to ${iDir}/miniconda3!" 
- wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh - bash Miniconda3-latest-Linux-x86_64.sh -p ${iDir}/miniconda3 -b - source ${iDir}/miniconda3/etc/profile.d/conda.sh - conda activate - # std libs - conda install -y astunparse numpy pyyaml mkl mkl-include setuptools cffi typing_extensions future six requests dataclasses Pillow --force-reinstall - # cuda - check version with yours - conda install -c pytorch -y magma-cuda110 --force-reinstall - conda install -y pkg-config libuv --force-reinstall - rm -f Miniconda3-latest-Linux-x86_64.sh -fi - -# torch -if [ -d "${iDir}/pytorch/build" ];then - echo 'pytorch already installed!' -else - # clone pytorch - if [ -d "${iDir}/pytorch" ];then - echo 'pytorch repo is found!' - else - git clone --recursive https://github.com/pytorch/pytorch pytorch - fi - - # update repos - cd pytorch - git submodule sync - git submodule update --init --recursive - - # install pytorch - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} - export TMPDIR=${iDir}/tmp - python setup.py clean - CMAKE_C_COMPILER=$(which mpicc) CMAKE_CXX_COMPILER=$(which mpicxx) USE_DISTRIBUTED=ON USE_MPI=ON USE_CUDA=ON NCCL_ROOT_DIR=$EBROOTNCCL USE_NCCL=ON USE_GLOO=ON CUDNN_ROOT=$EBROOTCUDNN USE_CUDNN=ON python setup.py install - cd .. -fi - -# torchvision -if [ -d "${iDir}/torchvision/build" ];then - echo 'torchvision already installed!' -else - # clone torchvision - if [ -d "${iDir}/torchvision" ];then - echo 'torchvision repo is found!' - else - git clone --recursive https://github.com/pytorch/vision.git torchvision - fi - - # update repos - cd torchvision - git submodule sync - git submodule update --init --recursive - - # install torchvision - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} - export TMPDIR=${iDir}/tmp - python setup.py clean - CMAKE_C_COMPILER=$(which mpicc) CMAKE_CXX_COMPILER=$(which mpicxx) FORCE_CUDA=ON python setup.py install -fi - -echo 'done!' -# eof diff --git a/scripts/deep_ddp/createEnv.sh b/scripts/deep_ddp/createEnv.sh deleted file mode 100755 index fd9886b..0000000 --- a/scripts/deep_ddp/createEnv.sh +++ /dev/null @@ -1,193 +0,0 @@ -#!/bin/bash -# -*- coding: utf-8 -*- -# author: EI -# version: 221121a -# creates machine specific python env - -# set modules -ml --force purge - -# get sys info -cDir=$PWD -sysN="$(uname -n | cut -f2- -d.)" -echo "system:${sysN}" -echo - -cont1=false -if [ "$sysN" = 'deepv' ] ; then - ml use $OTHERSTAGES - ml Stages/2022 GCC/11.2.0 ParaStationMPI/5.5.0-1 cuDNN/8.3.1.22-CUDA-11.5 NCCL/2.12.7-1-CUDA-11.5 Python/3.9.6 CMake/3.21.1 - #ml Stages/2022 GCC/11.2.0 OpenMPI/4.1.2 cuDNN/8.3.1.22-CUDA-11.5 NCCL/2.12.7-1-CUDA-11.5 Python/3.9.6 CMake/3.21.1 - cont1=true -elif [ "$sysN" = 'juwels' ] ; then - ml GCC ParaStationMPI Python CMake - cont1=true -elif [ "$sysN" = 'jureca' ] ; then - #ml Stages/2022 GCC ParaStationMPI Python CMake NCCL libaio # Horovod iassues with pscom?? 
- ml Stages/2022 GCC OpenMPI Python NCCL cuDNN libaio CMake - cont1=true -else - echo - echo 'unknown system detected' - echo 'canceling' - echo -fi -echo "modules loaded" -echo - -# get python version -pver="$(python --version 2>&1 | awk {'print $2'} | cut -f1-2 -d.)" -echo "python version is ${pver}" -echo - -if [ "$cont1" = true ] ; then - if [ -d "${cDir}/envAI_${sysN}" ];then - echo 'env already exist' - echo - - source envAI_${sysN}/bin/activate - else - # create env - python3 -m venv envAI_${sysN} - - # get headers for pip - if [ -f "${cDir}/envAI_${sysN}/bin/pip3" ]; then - echo 'pip already exist' - else - cp "$(which pip3)" $cDir/envAI_${sysN}/bin/ - ln -s $cDir/envAI_${sysN}/bin/pip3 $cDir/envAI_${sysN}/bin/pip${pver} - var="#!$cDir/envAI_${sysN}/bin/python${pver}" - sed -i "1s|.*|$var|" $cDir/envAI_${sysN}/bin/pip3 - fi - - # activate env - source envAI_${sysN}/bin/activate - - echo "a new env is created in ${cDir}" - echo "activation is done via:" - echo "source ${cDir}/envAI_${sysN}/bin/activate" - fi -fi - -# install torch -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - echo 'Torch already installed' - echo -else - export TMPDIR=${cDir} - - #pip3 install \ - # torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1+cu113 -f \ - # https://download.pytorch.org/whl/cu113/torch_stable.html --no-cache-dir - - ## Stages/2022 - CUDA/11.5 - #pip3 install \ - # torch==1.11.0+cu115 torchvision==0.12.0+cu115 torchaudio==0.11.0+cu115 -f \ - # https://download.pytorch.org/whl/cu115/torch_stable.html --no-cache-dir - - # Stages/2022 - CUDA/11.7 - pip3 install \ - torch==1.13.0+cu117 torchvision==0.14.0+cu117 torchaudio==0.13.0+cu117 -f \ - https://download.pytorch.org/whl/cu117/torch_stable.html --no-cache-dir -fi - -# install horovod -if [ -f "${cDir}/envAI_${sysN}/bin/horovodrun" ]; then - echo 'Horovod already installed' - echo -else - pip3 install --no-cache-dir wheel - #export HOROVOD_DEBUG=1 - export HOROVOD_GPU=CUDA - export HOROVOD_CUDA_HOME=$EBROOTCUDA - #export HOROVOD_WITH_MPI=1 - export HOROVOD_MPI_THREADS_DISABLE=1 - #export HOROVOD_GPU_OPERATIONS=MPI # only turn this off - export HOROVOD_GPU_OPERATIONS=NCCL # only turn this off - export HOROVOD_NCCL_HOME=$EBROOTNCCL - export HOROVOD_WITH_PYTORCH=1 - export HOROVOD_WITHOUT_TENSORFLOW=1 - export HOROVOD_WITHOUT_MXNET=1 - export TMPDIR=${cDir} - - pip3 install --no-cache-dir horovod --ignore-installed -fi - -# install deepspeed -if [ -f "${cDir}/envAI_${sysN}/bin/deepspeed" ]; then - echo 'DeepSpeed already installed' - echo -else - #export DS_BUILD_OPS=1 - # if above not working?? 
recursion error use this - export DS_BUILD_FUSED_ADAM=1 - export DS_BUILD_UTILS=1 - if [ "$sysN" = 'deepv' ] ; then - #fix libaio issues via: - export DS_BUILD_AIO=0 - fi - export TMPDIR=${cDir} - - pip3 install --no-cache-dir DeepSpeed - - # add this to deepspeed/launcher/launch.py l.126 - var=' args.node_rank=int(os.environ.get("SLURM_PROCID",0))' - sed -i "126s|.*|$var|" $cDir/envAI_${sysN}/lib/python${pver}/site-packages/deepspeed/launcher/launch.py -fi - -# install heat -if [ -d "${cDir}/envAI_${sysN}/lib/python${pver}/site-packages/heat" ]; then - echo 'HeAT already installed' - echo -else - export TMPDIR=${cDir} - - # need to modify setup.py to accep torch>1.9 for heat - wget https://files.pythonhosted.org/packages/5d/3a/4781f1e6910753bfdfa6712c83c732c60e675d8de14983926a0d9306c7a6/heat-1.1.1.tar.gz - tar xzf heat-1.1.1.tar.gz - var=' "torch>=1.7.0",' - sed -i "36s|.*|$var|" heat-1.1.1/setup.py - var=' "torchvision>=0.8.0",' - sed -i "39s|.*|$var|" heat-1.1.1/setup.py - - # create tar again! - rm -rf heat-1.1.1.tar.gz - tar czf heat-1.1.1.tar.gz heat-1.1.1 - rm -rf heat-1.1.1 - - pip3 install --no-cache-dir 'heat-1.1.1.tar.gz[hdf5,netcdf]' - - rm -rf heat-1.1.1.tar.gz -fi - -# get rest of the libraries$ -if [ "$cont1" = true ] ; then - # install rest - pip3 install -r reqs.txt --ignore-installed - - # modify l.4 of /torchnlp/_third_party/weighted_random_sampler.py - var='int_classes = int' - sed -i "4s|.*|$var|" \ - $cDir/envAI_${sysN}/lib/python${pver}/site-packages/torchnlp/_third_party/weighted_random_sampler.py -fi - -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - sed -i -e '3,8s/^/#/' ${cDir}/envAI_${sysN}/bin/torchrun - echo """ -import re -import sys -from torch.distributed.run import main -from torch.distributed.elastic.agent.server import api as sapi - -def new_get_fq_hostname(): - return _orig_get_fq_hostname().replace('.', 'i.', 1) - -if __name__ == '__main__': - _orig_get_fq_hostname = sapi._get_fq_hostname - sapi._get_fq_hostname = new_get_fq_hostname - sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) - sys.exit(main()) -""" >> ${cDir}/envAI_${sysN}/bin/torchrun -fi - -#eof diff --git a/scripts/deep_ddp/lamec.json b/scripts/deep_ddp/lamec.json deleted file mode 100644 index 8e4595a..0000000 --- a/scripts/deep_ddp/lamec.json +++ /dev/null @@ -1 +0,0 @@ -{"startscript": "DDP_startscript_deep.sh"} \ No newline at end of file diff --git a/scripts/deep_ddp/reqs.txt b/scripts/deep_ddp/reqs.txt deleted file mode 100755 index 20310b9..0000000 --- a/scripts/deep_ddp/reqs.txt +++ /dev/null @@ -1,6 +0,0 @@ -Pillow -pyparsing -python-dateutil -matplotlib -h5py -pytorch-nlp diff --git a/scripts/deep_deepspeed/DS_config.json b/scripts/deep_deepspeed/DS_config.json deleted file mode 100644 index ec1f022..0000000 --- a/scripts/deep_deepspeed/DS_config.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "train_micro_batch_size_per_gpu": 96, - "gradient_accumulation_steps": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.01 - } - }, - "fp16": { - "enabled": false - }, - "zero_optimization": false -} diff --git a/scripts/deep_deepspeed/DS_startscript_deep.sh b/scripts/deep_deepspeed/DS_startscript_deep.sh deleted file mode 100644 index d99e759..0000000 --- a/scripts/deep_deepspeed/DS_startscript_deep.sh +++ /dev/null @@ -1,84 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=DSTest -#SBATCH --account=deepext -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job.out -#SBATCH --error=job.err -#SBATCH --time=00:30:00 - -# 
configure node and process count on the CM -#SBATCH --partition=dp-esb -#SBATCH --nodes=2 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=16 -#SBATCH --gpus-per-node=1 -#SBATCH --exclusive - -# parameters -debug=false # do nccl debug -epochs=10 # epochs -lr=0.01 # learning rate -bs=96 # batch-size - -# AT -dataDir="/p/project/prcoe12/RAISE/T31/" -COMMAND="DS_pytorch_AT.py" -EXEC="$COMMAND \ - --batch-size $bs \ - --epochs $epochs \ - --lr $lr \ - --nworker $SLURM_CPUS_PER_TASK \ - --data-dir $dataDir" - -# set modules -ml --force purge -ml use $OTHERSTAGES -ml Stages/2022 GCC/11.2.0 OpenMPI/4.1.2 cuDNN/8.3.1.22-CUDA-11.5 NCCL/2.11.4-CUDA-11.5 Python/3.9.6 - -# set env - pip -source /p/project/prcoe12/RAISE/envAI_deepv/bin/activate - -# sleep a sec -sleep 1 - -# job info -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: SLURM_NODEID: $SLURM_NODEID" -echo "DEBUG: SLURM_LOCALID: $SLURM_LOCALID" -echo "DEBUG: SLURM_PROCID: $SLURM_PROCID" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -#### do not change this part -# create node-list -sysN=$(scontrol show hostnames) -for i in $sysN; do - x+=\"$i\":[$CUDA_VISIBLE_DEVICES], -done -WID=`echo {${x::-1}} | base64 -w 0` - -# modify config file with parameters -sed -i "2s|.*| \"train_micro_batch_size_per_gpu\": ${bs},|" DS_config.json -#### - -srun python3 -m deepspeed.launcher.launch \ - --node_rank $SLURM_PROCID \ - --master_addr ${SLURMD_NODENAME}i \ - --master_port 29500 \ - --world_info $WID \ - $EXEC --deepspeed_mpi --deepspeed_config DS_config.json - -# eof diff --git a/scripts/deep_deepspeed/README.md b/scripts/deep_deepspeed/README.md deleted file mode 100644 index 1f92e3a..0000000 --- a/scripts/deep_deepspeed/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# DL using DeepSpeed on deepv - -# source -https://github.com/microsoft/DeepSpeed - -# current isues -1. - -# to-do -1. - -# usage - pip -1. clone -2. run `./createENV.sh` -3. submit `sbatch DS_startscript_deep.sh` diff --git a/scripts/deep_deepspeed/createEnv.sh b/scripts/deep_deepspeed/createEnv.sh deleted file mode 100755 index 4558743..0000000 --- a/scripts/deep_deepspeed/createEnv.sh +++ /dev/null @@ -1,174 +0,0 @@ -#!/bin/bash -# -*- coding: utf-8 -*- -# author: EI -# version: 220302a -# creates machine specific python env - -# set modules -ml --force purge - -# get sys info -cDir=$PWD -sysN="$(uname -n | cut -f2- -d.)" -echo "system:${sysN}" -echo - -cont1=false -if [ "$sysN" = 'deepv' ] ; then - ml use $OTHERSTAGES - #ml Stages/2022 GCC ParaStationMPI cuDNN NCCL Python CMake # Horovod issues with pscom?? - ml Stages/2022 GCC OpenMPI cuDNN NCCL Python CMake - cont1=true -elif [ "$sysN" = 'juwels' ] ; then - ml GCC ParaStationMPI Python CMake - cont1=true -elif [ "$sysN" = 'jureca' ] ; then - #ml Stages/2022 GCC ParaStationMPI Python CMake NCCL libaio # Horovod iassues with pscom?? 
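The `--world_info` value assembled in DS_startscript_deep.sh above is a base64-encoded JSON map from hostname to the GPU indices used on that host; the `${x::-1}` strips the trailing comma before the braces are added. When the launcher misbehaves, decoding that blob is an easy first check. A small sketch, with hypothetical node names:

```bash
# inspect the world_info blob built in DS_startscript_deep.sh (sketch only)
echo "$WID" | base64 -d
# expected shape for two nodes with one visible GPU each (hypothetical names):
# {"dp-esb01":[0],"dp-esb02":[0]}
```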
- ml Stages/2022 GCC OpenMPI Python NCCL cuDNN libaio CMake - cont1=true -else - echo - echo 'unknown system detected' - echo 'canceling' - echo -fi -echo "modules loaded" -echo - -# get python version -pver="$(python --version 2>&1 | awk {'print $2'} | cut -f1-2 -d.)" -echo "python version is ${pver}" -echo - -if [ "$cont1" = true ] ; then - if [ -d "${cDir}/envAI_${sysN}" ];then - echo 'env already exist' - echo - - source envAI_${sysN}/bin/activate - else - # create env - python3 -m venv envAI_${sysN} - - # get headers for pip - if [ -f "${cDir}/envAI_${sysN}/bin/pip3" ]; then - echo 'pip already exist' - else - cp "$(which pip3)" $cDir/envAI_${sysN}/bin/ - ln -s $cDir/envAI_${sysN}/bin/pip3 $cDir/envAI_${sysN}/bin/pip${pver} - var="#!$cDir/envAI_${sysN}/bin/python${pver}" - sed -i "1s|.*|$var|" $cDir/envAI_${sysN}/bin/pip3 - fi - - # activate env - source envAI_${sysN}/bin/activate - - echo "a new env is created in ${cDir}" - echo "activation is done via:" - echo "source ${cDir}/envAI_${sysN}/bin/activate" - fi -fi - -# install torch -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - echo 'Torch already installed' - echo -else - export TMPDIR=${cDir} - - pip3 install \ - torch==1.10.2+cu113 torchvision==0.11.3+cu113 torchaudio==0.10.2+cu113 \ - -f https://download.pytorch.org/whl/cu113/torch_stable.html --no-cache-dir -fi - -# install horovod -if [ -f "${cDir}/envAI_${sysN}/bin/horovodrun" ]; then - echo 'Horovod already installed' - echo -else - export HOROVOD_GPU=CUDA - export HOROVOD_GPU_OPERATIONS=NCCL - export HOROVOD_WITH_PYTORCH=1 - export TMPDIR=${cDir} - - pip3 install --no-cache-dir horovod --ignore-installed -fi - -# install deepspeed -if [ -f "${cDir}/envAI_${sysN}/bin/deepspeed" ]; then - echo 'DeepSpeed already installed' - echo -else - export DS_BUILD_OPS=1 - # if above not working?? recursion error use this - #export DS_BUILD_FUSED_ADAM=1 - #export DS_BUILD_UTILS=1 - if [ "$sysN" = 'deepv' ] ; then - #fix libaio issues via: - export DS_BUILD_AIO=0 - fi - export TMPDIR=${cDir} - - pip3 install --no-cache-dir DeepSpeed - - # add this to deepspeed/launcher/launch.py l.85 - var=' args.node_rank=int(os.environ.get("SLURM_PROCID",0))' - sed -i "85s|.*|$var|" $cDir/envAI_${sysN}/lib/python${pver}/site-packages/deepspeed/launcher/launch.py -fi - -# install heat -if [ -d "${cDir}/envAI_${sysN}/lib/python${pver}/site-packages/heat" ]; then - echo 'HeAT already installed' - echo -else - export TMPDIR=${cDir} - - # need to modify setup.py to accep torch>1.9 for heat - wget https://files.pythonhosted.org/packages/5d/3a/4781f1e6910753bfdfa6712c83c732c60e675d8de14983926a0d9306c7a6/heat-1.1.1.tar.gz - tar xzf heat-1.1.1.tar.gz - var=' "torch>=1.7.0",' - sed -i "36s|.*|$var|" heat-1.1.1/setup.py - var=' "torchvision>=0.8.0",' - sed -i "39s|.*|$var|" heat-1.1.1/setup.py - - # create tar again! 
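The sed patch applied to deepspeed/launcher/launch.py above makes the launcher take its node rank from SLURM instead of a pdsh-style rank argument. This relies on the start scripts running exactly one launcher task per node, so that SLURM_PROCID enumerates the nodes. A quick way to see that mapping, as a sketch (account and partition options omitted):

```bash
# with one task per node, SLURM_PROCID counts nodes 0..N-1 and can stand in
# for the node rank expected by deepspeed.launcher.launch (illustrative only)
srun --nodes=2 --ntasks-per-node=1 \
  bash -c 'echo "host=$SLURMD_NODENAME  SLURM_PROCID=$SLURM_PROCID"'
```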
- rm -rf heat-1.1.1.tar.gz - tar czf heat-1.1.1.tar.gz heat-1.1.1 - rm -rf heat-1.1.1 - - pip3 install --no-cache-dir 'heat-1.1.1.tar.gz[hdf5,netcdf]' - - rm -rf heat-1.1.1.tar.gz -fi - -# get rest of the libraries$ -if [ "$cont1" = true ] ; then - # install rest - pip3 install -r reqs.txt --ignore-installed - - # modify l.4 of /torchnlp/_third_party/weighted_random_sampler.py - var='int_classes = int' - sed -i "4s|.*|$var|" \ - $cDir/envAI_${sysN}/lib/python${pver}/site-packages/torchnlp/_third_party/weighted_random_sampler.py -fi - -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - sed -i -e '3,8s/^/#/' ${cDir}/envAI_${sysN}/bin/torchrun - echo """ -import re -import sys -from torch.distributed.run import main -from torch.distributed.elastic.agent.server import api as sapi - -def new_get_fq_hostname(): - return _orig_get_fq_hostname().replace('.', 'i.', 1) - -if __name__ == '__main__': - _orig_get_fq_hostname = sapi._get_fq_hostname - sapi._get_fq_hostname = new_get_fq_hostname - sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) - sys.exit(main()) -""" >> ${cDir}/envAI_${sysN}/bin/torchrun -fi - -#eof diff --git a/scripts/deep_deepspeed/lamec.json b/scripts/deep_deepspeed/lamec.json deleted file mode 100644 index b1572ed..0000000 --- a/scripts/deep_deepspeed/lamec.json +++ /dev/null @@ -1 +0,0 @@ -{"startscript": "DS_startscript_deep.sh"} \ No newline at end of file diff --git a/scripts/deep_deepspeed/reqs.txt b/scripts/deep_deepspeed/reqs.txt deleted file mode 100755 index 20310b9..0000000 --- a/scripts/deep_deepspeed/reqs.txt +++ /dev/null @@ -1,6 +0,0 @@ -Pillow -pyparsing -python-dateutil -matplotlib -h5py -pytorch-nlp diff --git a/scripts/deep_heat/HeAT_startscript_deep.sh b/scripts/deep_heat/HeAT_startscript_deep.sh deleted file mode 100644 index 5454a78..0000000 --- a/scripts/deep_heat/HeAT_startscript_deep.sh +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=HeATTest -#SBATCH --account=deepext -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job.out -#SBATCH --error=job.err -#SBATCH --time=01:00:00 - -# configure node and process count on the CM -#SBATCH --partition=dp-esb -#SBATCH --nodes=4 -#SBATCH --ntasks-per-node=4 -#SBATCH --cpus-per-task=1 -#SBATCH --gpus-per-node=1 -#SBATCH --exclusive - -# parameters -debug=false # do debug -bs=96 # batch-size -epochs=1 # epochs -lr=0.01 # learning rate - -# dataset -# MNIST -#dataDir="/p/project/prcoe12/RAISE/data_MNIST/" -#COMMAND="DDP_pytorch_mnist.py" - -# AT -dataDir="/p/project/prcoe12/RAISE/T31/" -COMMAND="HeAT_pytorch_AT.py" - -EXEC="$COMMAND \ - --batch-size $bs \ - --epochs $epochs \ - --lr $lr \ - --data-dir $dataDir" - -# set modules -ml --force purge -ml use $OTHERSTAGES -ml Stages/2022 GCC/11.2.0 OpenMPI/4.1.2 cuDNN/8.3.1.22-CUDA-11.5 NCCL/2.11.4-CUDA-11.5 Python/3.9.6 - -# set env - pip -source /p/project/prcoe12/RAISE/envAI_deepv/bin/activate - -# set env - conda -#source /p/project/prcoe12/RAISE/miniconda3_deepv/etc/profile.d/conda.sh -#conda activate - -# sleep a sec -sleep 1 - -# job info -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: SLURM_NODEID: $SLURM_NODEID" -echo "DEBUG: SLURM_LOCALID: 
$SLURM_LOCALID" -echo "DEBUG: SLURM_PROCID: $SLURM_PROCID" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# set comm, CUDA and OMP -#PSP_CUDA=1 # not needed atm -#PSP_UCP=1 # not needed atm -#PSP_OPENIB=1 # not needed atm -#export NCCL_SOCKET_IFNAME=ib # not needed atm -#export NCCL_IB_HCA=ipogif0 # not needed atm -#export NCCL_IB_CUDA_SUPPORT=1 # not needed atm -export CUDA_VISIBLE_DEVICES="0" -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" > 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi - -# launch -srun --mpi=pspmix python3 -u $EXEC - -# eof diff --git a/scripts/deep_heat/README.md b/scripts/deep_heat/README.md deleted file mode 100644 index 74a97ad..0000000 --- a/scripts/deep_heat/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# DL using HeAT/PyTorch on Jureca DC - -# source -https://github.com/helmholtz-analytics/heat - -# current isues -1. no alternative to --mpi=pspmix with OMPI, but works - -# to-do -1. - -# usage - pip -1. clone -2. run `./createENV.sh` -3. submit `sbatch HeAT_startscript_deep.sh` diff --git a/scripts/deep_heat/createEnv.sh b/scripts/deep_heat/createEnv.sh deleted file mode 100755 index 4558743..0000000 --- a/scripts/deep_heat/createEnv.sh +++ /dev/null @@ -1,174 +0,0 @@ -#!/bin/bash -# -*- coding: utf-8 -*- -# author: EI -# version: 220302a -# creates machine specific python env - -# set modules -ml --force purge - -# get sys info -cDir=$PWD -sysN="$(uname -n | cut -f2- -d.)" -echo "system:${sysN}" -echo - -cont1=false -if [ "$sysN" = 'deepv' ] ; then - ml use $OTHERSTAGES - #ml Stages/2022 GCC ParaStationMPI cuDNN NCCL Python CMake # Horovod issues with pscom?? - ml Stages/2022 GCC OpenMPI cuDNN NCCL Python CMake - cont1=true -elif [ "$sysN" = 'juwels' ] ; then - ml GCC ParaStationMPI Python CMake - cont1=true -elif [ "$sysN" = 'jureca' ] ; then - #ml Stages/2022 GCC ParaStationMPI Python CMake NCCL libaio # Horovod iassues with pscom?? 
- ml Stages/2022 GCC OpenMPI Python NCCL cuDNN libaio CMake - cont1=true -else - echo - echo 'unknown system detected' - echo 'canceling' - echo -fi -echo "modules loaded" -echo - -# get python version -pver="$(python --version 2>&1 | awk {'print $2'} | cut -f1-2 -d.)" -echo "python version is ${pver}" -echo - -if [ "$cont1" = true ] ; then - if [ -d "${cDir}/envAI_${sysN}" ];then - echo 'env already exist' - echo - - source envAI_${sysN}/bin/activate - else - # create env - python3 -m venv envAI_${sysN} - - # get headers for pip - if [ -f "${cDir}/envAI_${sysN}/bin/pip3" ]; then - echo 'pip already exist' - else - cp "$(which pip3)" $cDir/envAI_${sysN}/bin/ - ln -s $cDir/envAI_${sysN}/bin/pip3 $cDir/envAI_${sysN}/bin/pip${pver} - var="#!$cDir/envAI_${sysN}/bin/python${pver}" - sed -i "1s|.*|$var|" $cDir/envAI_${sysN}/bin/pip3 - fi - - # activate env - source envAI_${sysN}/bin/activate - - echo "a new env is created in ${cDir}" - echo "activation is done via:" - echo "source ${cDir}/envAI_${sysN}/bin/activate" - fi -fi - -# install torch -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - echo 'Torch already installed' - echo -else - export TMPDIR=${cDir} - - pip3 install \ - torch==1.10.2+cu113 torchvision==0.11.3+cu113 torchaudio==0.10.2+cu113 \ - -f https://download.pytorch.org/whl/cu113/torch_stable.html --no-cache-dir -fi - -# install horovod -if [ -f "${cDir}/envAI_${sysN}/bin/horovodrun" ]; then - echo 'Horovod already installed' - echo -else - export HOROVOD_GPU=CUDA - export HOROVOD_GPU_OPERATIONS=NCCL - export HOROVOD_WITH_PYTORCH=1 - export TMPDIR=${cDir} - - pip3 install --no-cache-dir horovod --ignore-installed -fi - -# install deepspeed -if [ -f "${cDir}/envAI_${sysN}/bin/deepspeed" ]; then - echo 'DeepSpeed already installed' - echo -else - export DS_BUILD_OPS=1 - # if above not working?? recursion error use this - #export DS_BUILD_FUSED_ADAM=1 - #export DS_BUILD_UTILS=1 - if [ "$sysN" = 'deepv' ] ; then - #fix libaio issues via: - export DS_BUILD_AIO=0 - fi - export TMPDIR=${cDir} - - pip3 install --no-cache-dir DeepSpeed - - # add this to deepspeed/launcher/launch.py l.85 - var=' args.node_rank=int(os.environ.get("SLURM_PROCID",0))' - sed -i "85s|.*|$var|" $cDir/envAI_${sysN}/lib/python${pver}/site-packages/deepspeed/launcher/launch.py -fi - -# install heat -if [ -d "${cDir}/envAI_${sysN}/lib/python${pver}/site-packages/heat" ]; then - echo 'HeAT already installed' - echo -else - export TMPDIR=${cDir} - - # need to modify setup.py to accep torch>1.9 for heat - wget https://files.pythonhosted.org/packages/5d/3a/4781f1e6910753bfdfa6712c83c732c60e675d8de14983926a0d9306c7a6/heat-1.1.1.tar.gz - tar xzf heat-1.1.1.tar.gz - var=' "torch>=1.7.0",' - sed -i "36s|.*|$var|" heat-1.1.1/setup.py - var=' "torchvision>=0.8.0",' - sed -i "39s|.*|$var|" heat-1.1.1/setup.py - - # create tar again! 
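Because the HeAT install above repacks the 1.1.1 sdist after loosening its torch/torchvision pins, it is worth confirming that the patched package really imports next to the torch version already in the venv. A one-line check, as a sketch (assumes the envAI_* environment is active):

```bash
# confirm the repacked HeAT 1.1.1 imports against the venv's torch build
python3 -c "import heat as ht, torch; print('heat', ht.__version__, '| torch', torch.__version__)"
```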
- rm -rf heat-1.1.1.tar.gz - tar czf heat-1.1.1.tar.gz heat-1.1.1 - rm -rf heat-1.1.1 - - pip3 install --no-cache-dir 'heat-1.1.1.tar.gz[hdf5,netcdf]' - - rm -rf heat-1.1.1.tar.gz -fi - -# get rest of the libraries$ -if [ "$cont1" = true ] ; then - # install rest - pip3 install -r reqs.txt --ignore-installed - - # modify l.4 of /torchnlp/_third_party/weighted_random_sampler.py - var='int_classes = int' - sed -i "4s|.*|$var|" \ - $cDir/envAI_${sysN}/lib/python${pver}/site-packages/torchnlp/_third_party/weighted_random_sampler.py -fi - -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - sed -i -e '3,8s/^/#/' ${cDir}/envAI_${sysN}/bin/torchrun - echo """ -import re -import sys -from torch.distributed.run import main -from torch.distributed.elastic.agent.server import api as sapi - -def new_get_fq_hostname(): - return _orig_get_fq_hostname().replace('.', 'i.', 1) - -if __name__ == '__main__': - _orig_get_fq_hostname = sapi._get_fq_hostname - sapi._get_fq_hostname = new_get_fq_hostname - sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) - sys.exit(main()) -""" >> ${cDir}/envAI_${sysN}/bin/torchrun -fi - -#eof diff --git a/scripts/deep_heat/example_mnist_heat.py b/scripts/deep_heat/example_mnist_heat.py deleted file mode 100644 index b2a5526..0000000 --- a/scripts/deep_heat/example_mnist_heat.py +++ /dev/null @@ -1,184 +0,0 @@ -# example from : https://github.com/helmholtz-analytics/heat/blob/master/examples/nn/mnist.py - -from __future__ import print_function -import argparse -import sys -import time -import torch - -sys.path.append("../../") -import heat as ht -import heat.nn.functional as F -import heat.optim as optim -from heat.optim.lr_scheduler import StepLR -from heat.utils import vision_transforms -from heat.utils.data.mnist import MNISTDataset - -""" -This file is an example script for how to use the HeAT DataParallel class to train a network on the MNIST dataset. -To run this file execute the following in the examples/nn/ directory: - mpirun -np N python -u mnist.py -where N is the number of processes. 
-""" - - -class Net(ht.nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = ht.nn.Conv2d(1, 32, 3, 1) - self.conv2 = ht.nn.Conv2d(32, 64, 3, 1) - self.dropout1 = ht.nn.Dropout2d(0.25) - self.dropout2 = ht.nn.Dropout2d(0.5) - self.fc1 = ht.nn.Linear(9216, 128) - self.fc2 = ht.nn.Linear(128, 10) - - def forward(self, x): - x = self.conv1(x) - x = F.relu(x) - x = self.conv2(x) - x = F.relu(x) - x = F.max_pool2d(x, 2) - x = self.dropout1(x) - x = torch.flatten(x, 1) - x = self.fc1(x) - x = F.relu(x) - x = self.dropout2(x) - x = self.fc2(x) - output = F.log_softmax(x, dim=1) - return output - - -def train(args, model, device, train_loader, optimizer, epoch): - model.train() - t_list = [] - for batch_idx, (data, target) in enumerate(train_loader): - t = time.perf_counter() - data, target = data.to(device), target.to(device) - optimizer.zero_grad() - output = model(data) - loss = F.nll_loss(output, target) - loss.backward() - optimizer.step() - if batch_idx % args.log_interval == 0: - print( - f"Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} " - f"({100.0 * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}" - ) - if args.dry_run: - break - t_list.append(time.perf_counter() - t) - print("average time", sum(t_list) / len(t_list)) - - -def test(model, device, test_loader): - model.eval() - test_loss = 0 - correct = 0 - with torch.no_grad(): - for data, target in test_loader: - data, target = data.to(device), target.to(device) - output = model(data) - test_loss += F.nll_loss(output, target, reduction="sum").item() # sum up batch loss - pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability - correct += pred.eq(target.view_as(pred)).sum().item() - test_loss /= len(test_loader.dataset) - print( - f"\nTest set: Average loss: {test_loss:.4f}, Accuracy: {correct}/{len(test_loader.dataset)}" - f" ({100.0 * correct / len(test_loader.dataset):.0f}%)\n" - ) - - -def main(): - # Training settings - parser = argparse.ArgumentParser(description="PyTorch MNIST Example") - parser.add_argument( - "--batch-size", - type=int, - default=64, - metavar="N", - help="input batch size for training (default: 64)", - ) - parser.add_argument( - "--test-batch-size", - type=int, - default=1000, - metavar="N", - help="input batch size for testing (default: 1000)", - ) - parser.add_argument( - "--epochs", - type=int, - default=14, - metavar="N", - help="number of epochs to train (default: 14)", - ) - parser.add_argument( - "--lr", type=float, default=1.0, metavar="LR", help="learning rate (default: 1.0)" - ) - parser.add_argument( - "--gamma", - type=float, - default=0.7, - metavar="M", - help="Learning rate step gamma (default: 0.7)", - ) - parser.add_argument( - "--no-cuda", action="store_true", default=False, help="disables CUDA training" - ) - parser.add_argument( - "--dry-run", action="store_true", default=False, help="quickly check a single pass" - ) - parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)") - parser.add_argument( - "--log-interval", - type=int, - default=10, - metavar="N", - help="how many batches to wait before logging training status", - ) - parser.add_argument( - "--save-model", action="store_true", default=False, help="For Saving the current Model" - ) - args = parser.parse_args() - use_cuda = not args.no_cuda and torch.cuda.is_available() - torch.manual_seed(args.seed) - device = torch.device("cuda" if use_cuda else "cpu") - kwargs = {"batch_size": args.batch_size} - 
if use_cuda: - kwargs.update({"num_workers": 0, "pin_memory": True}) - transform = ht.utils.vision_transforms.Compose( - [vision_transforms.ToTensor(), vision_transforms.Normalize((0.1307,), (0.3081,))] - ) - - dataDir="/p/project/prcoe12/RAISE/data_MNIST/" - dataset1 = MNISTDataset(dataDir, train=True, transform=transform, ishuffle=False) - dataset2 = MNISTDataset( - dataDir, train=False, transform=transform, ishuffle=False, test_set=True - ) - - train_loader = ht.utils.data.datatools.DataLoader(dataset=dataset1, **kwargs) - test_loader = ht.utils.data.datatools.DataLoader(dataset=dataset2, **kwargs) - model = Net().to(device) - optimizer = optim.Adadelta(model.parameters(), lr=args.lr) - blocking = False - dp_optim = ht.optim.DataParallelOptimizer(optimizer, blocking=blocking) - scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma) - dp_model = ht.nn.DataParallel( - model, comm=dataset1.comm, optimizer=dp_optim, blocking_parameter_updates=blocking - ) - - for epoch in range(1, args.epochs + 1): - train(args, dp_model, device, train_loader, dp_optim, epoch) - test(dp_model, device, test_loader) - scheduler.step() - if epoch + 1 == args.epochs: - train_loader.last_epoch = True - test_loader.last_epoch = True - - if args.save_model: - torch.save(model.state_dict(), "mnist_cnn.pt") - - -if __name__ == "__main__": - main() - diff --git a/scripts/deep_heat/lamec.json b/scripts/deep_heat/lamec.json deleted file mode 100644 index d1bf1b2..0000000 --- a/scripts/deep_heat/lamec.json +++ /dev/null @@ -1 +0,0 @@ -{"startscript": "HeAT_startscript_deep.sh"} \ No newline at end of file diff --git a/scripts/deep_heat/reqs.txt b/scripts/deep_heat/reqs.txt deleted file mode 100755 index 20310b9..0000000 --- a/scripts/deep_heat/reqs.txt +++ /dev/null @@ -1,6 +0,0 @@ -Pillow -pyparsing -python-dateutil -matplotlib -h5py -pytorch-nlp diff --git a/scripts/deep_horovod/Hor_startscript_deep.sh b/scripts/deep_horovod/Hor_startscript_deep.sh deleted file mode 100644 index 1feaee8..0000000 --- a/scripts/deep_horovod/Hor_startscript_deep.sh +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=TorchTest -#SBATCH --account=deepext -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job.out -#SBATCH --error=job.err -#SBATCH --time=01:00:00 - -# configure node and process count on the CM -#SBATCH --partition=dp-esb -#SBATCH --nodes=4 -#SBATCH --ntasks-per-node=4 -#SBATCH --cpus-per-task=1 -#SBATCH --gpus-per-node=1 -#SBATCH --exclusive - -# parameters -debug=false # do debug -bs=96 # batch-size -epochs=1 # epochs -lr=0.01 # learning rate - -# dataset -# MNIST -#dataDir="/p/project/prcoe12/RAISE/data_MNIST/" -#COMMAND="DDP_pytorch_mnist.py" - -# AT -dataDir="/p/project/prcoe12/RAISE/T31/" -COMMAND="Hor_pytorch_AT.py" - -EXEC="$COMMAND \ - --batch-size $bs \ - --epochs $epochs \ - --lr $lr \ - --data-dir $dataDir" - -# set modules -ml --force purge -ml use $OTHERSTAGES -ml Stages/2022 GCC/11.2.0 OpenMPI/4.1.2 cuDNN/8.3.1.22-CUDA-11.5 NCCL/2.11.4-CUDA-11.5 Python/3.9.6 - -# set env - pip -source /p/project/prcoe12/RAISE/envAI_deepv/bin/activate - -# set env - conda -#source /p/project/prcoe12/RAISE/miniconda3_deepv/etc/profile.d/conda.sh -#conda activate - -# sleep a sec -sleep 1 - -# job info -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: 
$SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: SLURM_NODEID: $SLURM_NODEID" -echo "DEBUG: SLURM_LOCALID: $SLURM_LOCALID" -echo "DEBUG: SLURM_PROCID: $SLURM_PROCID" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# set comm, CUDA and OMP -#PSP_CUDA=1 # not needed atm -#PSP_UCP=1 # not needed atm -#PSP_OPENIB=1 # not needed atm -#export NCCL_SOCKET_IFNAME=ib # not needed atm -#export NCCL_IB_HCA=ipogif0 # not needed atm -#export NCCL_IB_CUDA_SUPPORT=1 # not needed atm -export CUDA_VISIBLE_DEVICES="0" -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" > 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi - -# launch -srun --mpi=pspmix python3 -u $EXEC - -# eof diff --git a/scripts/deep_horovod/README.md b/scripts/deep_horovod/README.md deleted file mode 100644 index d88c2ef..0000000 --- a/scripts/deep_horovod/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# DL using Horovod on deepv - -# source -https://github.com/horovod/horovod - -# current isues -1. no alternative to --mpi=pspmix with OMPI, but works - -# to-do -1. - -# usage - pip -1. clone -2. run `./createENV.sh` -3. submit `sbatch Hor_startscript_deep.sh` diff --git a/scripts/deep_horovod/createEnv.sh b/scripts/deep_horovod/createEnv.sh deleted file mode 100755 index 8aa89ef..0000000 --- a/scripts/deep_horovod/createEnv.sh +++ /dev/null @@ -1,174 +0,0 @@ -#!/bin/bash -# -*- coding: utf-8 -*- -# author: EI -# version: 230120a -# creates machine specific python env -# env ONLY - -# set modules -module --force purge - -# get sys info -#sysN="$(uname -n | cut -f2- -d.)" -sysN="deepv" -echo "system:${sysN}" -echo - -# create tmp dir -mkdir -p $PWD/tmp -export TMPDIR=$PWD/tmp - -if [ "$sysN" = 'deepv' ] ; then - module use $OTHERSTAGES - # main - ml Stages/2022 NVHPC/22.1 OpenMPI/4.1.2 NCCL/2.15.1-1-CUDA-11.5 cuDNN/8.3.1.22-CUDA-11.5 - - # side - ml Python/3.9.6 HDF5 CMake - - # version mismatch fix - ml -nvidia-driver/.default - - # new cuda drivers in comp node, only use this if salloc - ln -s /usr/lib64/libcuda.so.1 . - ln -s /usr/lib64/libnvidia-ml.so.1 . 
- LD_LIBRARY_PATH=.:/usr/local/cuda/lib64:$LD_LIBRARY_PATH - -elif [ "$sysN" = 'juwels' ] ; then - ml GCC ParaStationMPI Python CMake -elif [ "$sysN" = 'jureca' ] ; then - # main - ml Stages/2022 NVHPC/22.1 ParaStationMPI/5.5.0-1-mt NCCL/2.14.3-1-CUDA-11.5 cuDNN/8.3.1.22-CUDA-11.5 - - # side - ml Python/3.9.6 libaio/0.3.112 HDF5/1.12.1 PnetCDF/1.12.2 mpi-settings/CUDA CMake/3.21.1 -else - echo 'unknown system detected' - echo 'canceling' - exit -fi -echo 'modules loaded' -echo - -# get python version -pver="$(python --version 2>&1 | awk {'print $2'} | cut -f1-2 -d.)" -echo "python version is ${pver}" -echo - -if [ -d "$PWD/envAI_${sysN}" ];then - echo 'env already exist' - echo - - source envAI_${sysN}/bin/activate -else - # create env - python3 -m venv envAI_${sysN} - - # get headers for pip - if [ -f "$PWD/envAI_${sysN}/bin/pip3" ]; then - echo 'pip already exist' - else - cp "$(which pip3)" $PWD/envAI_${sysN}/bin/ - ln -s $PWD/envAI_${sysN}/bin/pip3 $PWD/envAI_${sysN}/bin/pip${pver} - var="#!$PWD/envAI_${sysN}/bin/python${pver}" - sed -i "1s|.*|$var|" $PWD/envAI_${sysN}/bin/pip3 - fi - - # activate env - source envAI_${sysN}/bin/activate - - echo "a new env is created in $PWD" - echo "activation is done via:" - echo "source $PWD/envAI_${sysN}/bin/activate" -fi - -# install torch -if [ -f "$PWD/envAI_${sysN}/bin/torchrun" ]; then - echo 'Torch already installed' - echo -else - pip3 install --no-cache-dir \ - torch==1.13.0+cu117 torchvision==0.14.0+cu117 torchaudio==0.13.0+cu117 -f \ - https://download.pytorch.org/whl/cu117/torch_stable.html --no-cache-dir -fi - -# install horovod -if [ -f "$PWD/envAI_${sysN}/bin/horovodrun" ]; then - echo 'Horovod already installed' - echo -else - #export HOROVOD_DEBUG=1 - export HOROVOD_WITH_MPI=1 - export HOROVOD_MPI_THREADS_DISABLE=1 - export HOROVOD_GPU=CUDA - #export HOROVOD_GPU_OPERATIONS=MPI - export HOROVOD_CUDA_HOME=$EBROOTCUDA - export HOROVOD_GPU_OPERATIONS=NCCL - export HOROVOD_NCCL_HOME=$EBROOTNCCL - export HOROVOD_WITH_PYTORCH=1 - export HOROVOD_WITHOUT_TENSORFLOW=1 - export HOROVOD_WITHOUT_MXNET=1 - - pip3 install --no-cache-dir wheel --ignore-installed - pip3 install --no-cache-dir horovod==0.25.0 --ignore-installed -fi - -# install deepspeed -if [ -f "$PWD/envAI_${sysN}/bin/deepspeed" ]; then - echo 'DeepSpeed already installed' - echo -else - # compile all opt. stuff - not needed & not working - #export DS_BUILD_OPS=1 - # compile req. opt. 
stuff - export DS_BUILD_FUSED_ADAM=1 - export DS_BUILD_UTILS=1 - - pip3 install --no-cache-dir DeepSpeed=0.9.1 - - # add this to .../deepspeed/launcher/launch.py l.219 - var=' args.node_rank=int(os.environ.get("SLURM_PROCID",0))' - sed -i "219s|.*|$var|" $PWD/envAI_${sysN}/lib/python${pver}/site-packages/deepspeed/launcher/launch.py -fi - -# install heat -if [ -d "$PWD/envAI_${sysN}/lib/python${pver}/site-packages/heat" ]; then - echo 'HeAT already installed' - echo -else - export CFLAGS="-noswitcherror" - export CXXFLAGS="-noswitcherror" - - pip3 install heat[hdf5,netcdf] -fi - -# get rest of the libraries -# install rest -pip3 install -r reqs.txt --ignore-installed - -# modify l.4 of /torchnlp/_third_party/weighted_random_sampler.py -var='int_classes = int' -sed -i "4s|.*|$var|" \ - $PWD/envAI_${sysN}/lib/python${pver}/site-packages/torchnlp/_third_party/weighted_random_sampler.py - -# fix IB IP config -if [ -f "$PWD/envAI_${sysN}/bin/torchrun" ]; then - sed -i -e '3,8s/^/#/' $PWD/envAI_${sysN}/bin/torchrun - echo """ -import re -import sys -from torch.distributed.run import main -from torch.distributed.elastic.agent.server import api as sapi - -def new_get_fq_hostname(): - return _orig_get_fq_hostname().replace('.', 'i.', 1) - -if __name__ == '__main__': - _orig_get_fq_hostname = sapi._get_fq_hostname - sapi._get_fq_hostname = new_get_fq_hostname - sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) - sys.exit(main()) -""" >> $PWD/envAI_${sysN}/bin/torchrun -fi - -#eof diff --git a/scripts/deep_horovod/lamec.json b/scripts/deep_horovod/lamec.json deleted file mode 100644 index 4aff71d..0000000 --- a/scripts/deep_horovod/lamec.json +++ /dev/null @@ -1 +0,0 @@ -{"startscript": "Hor_startscript_deep.sh"} \ No newline at end of file diff --git a/scripts/deep_horovod/pytorch_mnist.py b/scripts/deep_horovod/pytorch_mnist.py deleted file mode 100644 index 2d0c5ac..0000000 --- a/scripts/deep_horovod/pytorch_mnist.py +++ /dev/null @@ -1,205 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -# MNIST train in deepv -# origin: https://github.com/horovod/horovod/blob/master/examples/pytorch/pytorch_mnist.py -# changes L.132 from 'num_workers': 1 to 'num_workers': 0 - -import argparse -import os -from filelock import FileLock - -import torch.multiprocessing as mp -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -from torchvision import datasets, transforms -import torch.utils.data.distributed -import horovod.torch as hvd - -# Training settings -parser = argparse.ArgumentParser(description='PyTorch MNIST Example') -parser.add_argument('--batch-size', type=int, default=64, metavar='N', - help='input batch size for training (default: 64)') -parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', - help='input batch size for testing (default: 1000)') -parser.add_argument('--epochs', type=int, default=10, metavar='N', - help='number of epochs to train (default: 10)') -parser.add_argument('--lr', type=float, default=0.01, metavar='LR', - help='learning rate (default: 0.01)') -parser.add_argument('--momentum', type=float, default=0.5, metavar='M', - help='SGD momentum (default: 0.5)') -parser.add_argument('--no-cuda', action='store_true', default=False, - help='disables CUDA training') -parser.add_argument('--seed', type=int, default=42, metavar='S', - help='random seed (default: 42)') -parser.add_argument('--log-interval', type=int, default=10, metavar='N', - help='how many batches to wait before logging training status') 
-parser.add_argument('--fp16-allreduce', action='store_true', default=False, - help='use fp16 compression during allreduce') -parser.add_argument('--use-adasum', action='store_true', default=False, - help='use adasum algorithm to do reduction') -parser.add_argument('--gradient-predivide-factor', type=float, default=1.0, - help='apply gradient predivide factor in optimizer (default: 1.0)') -parser.add_argument('--data-dir', - help='location of the training dataset in the local filesystem (will be downloaded if needed)') - - -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(1, 10, kernel_size=5) - self.conv2 = nn.Conv2d(10, 20, kernel_size=5) - self.conv2_drop = nn.Dropout2d() - self.fc1 = nn.Linear(320, 50) - self.fc2 = nn.Linear(50, 10) - - def forward(self, x): - x = F.relu(F.max_pool2d(self.conv1(x), 2)) - x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) - x = x.view(-1, 320) - x = F.relu(self.fc1(x)) - x = F.dropout(x, training=self.training) - x = self.fc2(x) - return F.log_softmax(x) - - -def train(epoch): - model.train() - # Horovod: set epoch to sampler for shuffling. - train_sampler.set_epoch(epoch) - for batch_idx, (data, target) in enumerate(train_loader): - if args.cuda: - data, target = data.cuda(), target.cuda() - optimizer.zero_grad() - output = model(data) - loss = F.nll_loss(output, target) - loss.backward() - optimizer.step() - if batch_idx % args.log_interval == 0: - # Horovod: use train_sampler to determine the number of examples in - # this worker's partition. - print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( - epoch, batch_idx * len(data), len(train_sampler), - 100. * batch_idx / len(train_loader), loss.item())) - - -def metric_average(val, name): - tensor = torch.tensor(val) - avg_tensor = hvd.allreduce(tensor, name=name) - return avg_tensor.item() - - -def test(): - model.eval() - test_loss = 0. - test_accuracy = 0. - for data, target in test_loader: - if args.cuda: - data, target = data.cuda(), target.cuda() - output = model(data) - # sum up batch loss - test_loss += F.nll_loss(output, target, size_average=False).item() - # get the index of the max log-probability - pred = output.data.max(1, keepdim=True)[1] - test_accuracy += pred.eq(target.data.view_as(pred)).cpu().float().sum() - - # Horovod: use test_sampler to determine the number of examples in - # this worker's partition. - test_loss /= len(test_sampler) - test_accuracy /= len(test_sampler) - - # Horovod: average metric values across workers. - test_loss = metric_average(test_loss, 'avg_loss') - test_accuracy = metric_average(test_accuracy, 'avg_accuracy') - - # Horovod: print output only on first rank. - if hvd.rank() == 0: - print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format( - test_loss, 100. * test_accuracy)) - - -if __name__ == '__main__': - args = parser.parse_args() - args.cuda = not args.no_cuda and torch.cuda.is_available() - - # Horovod: initialize library. - hvd.init() - torch.manual_seed(args.seed) - - if args.cuda: - # Horovod: pin GPU to local rank. - torch.cuda.set_device(hvd.local_rank()) - torch.cuda.manual_seed(args.seed) - - - # Horovod: limit # of CPU threads to be used per worker. 
- torch.set_num_threads(1) - - #kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} - kwargs = {'num_workers': 0, 'pin_memory': True} if args.cuda else {} - # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent - # issues with Infiniband implementations that are not fork-safe - if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context') and - mp._supports_context and 'forkserver' in mp.get_all_start_methods()): - kwargs['multiprocessing_context'] = 'forkserver' - - data_dir = args.data_dir or './data' - with FileLock(os.path.expanduser("~/.horovod_lock")): - train_dataset = \ - datasets.MNIST(data_dir, train=True, download=True, - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) - - # Horovod: use DistributedSampler to partition the training data. - train_sampler = torch.utils.data.distributed.DistributedSampler( - train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) - train_loader = torch.utils.data.DataLoader( - train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs) - - test_dataset = \ - datasets.MNIST(data_dir, train=False, transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) - # Horovod: use DistributedSampler to partition the test data. - test_sampler = torch.utils.data.distributed.DistributedSampler( - test_dataset, num_replicas=hvd.size(), rank=hvd.rank()) - test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.test_batch_size, - sampler=test_sampler, **kwargs) - - model = Net() - - # By default, Adasum doesn't need scaling up learning rate. - lr_scaler = hvd.size() if not args.use_adasum else 1 - - if args.cuda: - # Move model to GPU. - model.cuda() - # If using GPU Adasum allreduce, scale learning rate by local_size. - if args.use_adasum and hvd.nccl_built(): - lr_scaler = hvd.local_size() - - # Horovod: scale learning rate by lr_scaler. - optimizer = optim.SGD(model.parameters(), lr=args.lr * lr_scaler, - momentum=args.momentum) - - # Horovod: broadcast parameters & optimizer state. - hvd.broadcast_parameters(model.state_dict(), root_rank=0) - hvd.broadcast_optimizer_state(optimizer, root_rank=0) - - # Horovod: (optional) compression algorithm. - compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none - - # Horovod: wrap optimizer with DistributedOptimizer. 
- optimizer = hvd.DistributedOptimizer(optimizer, - named_parameters=model.named_parameters(), - compression=compression, - op=hvd.Adasum if args.use_adasum else hvd.Average, - gradient_predivide_factor=args.gradient_predivide_factor) - - for epoch in range(1, args.epochs + 1): - train(epoch) - test() diff --git a/scripts/deep_horovod/pytorch_synthetic_benchmark.py b/scripts/deep_horovod/pytorch_synthetic_benchmark.py deleted file mode 100644 index 473d7c9..0000000 --- a/scripts/deep_horovod/pytorch_synthetic_benchmark.py +++ /dev/null @@ -1,127 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -# Synthetic train in deepv -# origin: https://github.com/horovod/horovod/blob/master/examples/pytorch/pytorch_synthetic_benchmark.py -# changes - -import argparse -import torch.backends.cudnn as cudnn -import torch.nn.functional as F -import torch.optim as optim -import torch.utils.data.distributed -from torchvision import models -import horovod.torch as hvd -import timeit -import numpy as np - -# Benchmark settings -parser = argparse.ArgumentParser(description='PyTorch Synthetic Benchmark', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument('--fp16-allreduce', action='store_true', default=False, - help='use fp16 compression during allreduce') - -parser.add_argument('--model', type=str, default='resnet50', - help='model to benchmark') -parser.add_argument('--batch-size', type=int, default=32, - help='input batch size') - -parser.add_argument('--num-warmup-batches', type=int, default=10, - help='number of warm-up batches that don\'t count towards benchmark') -parser.add_argument('--num-batches-per-iter', type=int, default=10, - help='number of batches per benchmark iteration') -parser.add_argument('--num-iters', type=int, default=10, - help='number of benchmark iterations') - -parser.add_argument('--no-cuda', action='store_true', default=False, - help='disables CUDA training') - -parser.add_argument('--use-adasum', action='store_true', default=False, - help='use adasum algorithm to do reduction') - -args = parser.parse_args() -args.cuda = not args.no_cuda and torch.cuda.is_available() - -hvd.init() - -if args.cuda: - # Horovod: pin GPU to local rank. - torch.cuda.set_device(hvd.local_rank()) - -cudnn.benchmark = True - -# Set up standard model. -model = getattr(models, args.model)() - -# By default, Adasum doesn't need scaling up learning rate. -lr_scaler = hvd.size() if not args.use_adasum else 1 - -if args.cuda: - # Move model to GPU. - model.cuda() - # If using GPU Adasum allreduce, scale learning rate by local_size. - if args.use_adasum and hvd.nccl_built(): - lr_scaler = hvd.local_size() - -optimizer = optim.SGD(model.parameters(), lr=0.01 * lr_scaler) - -# Horovod: (optional) compression algorithm. -compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none - -# Horovod: wrap optimizer with DistributedOptimizer. -optimizer = hvd.DistributedOptimizer(optimizer, - named_parameters=model.named_parameters(), - compression=compression, - op=hvd.Adasum if args.use_adasum else hvd.Average) - -# Horovod: broadcast parameters & optimizer state. 
-hvd.broadcast_parameters(model.state_dict(), root_rank=0) -hvd.broadcast_optimizer_state(optimizer, root_rank=0) - -# Set up fixed fake data -data = torch.randn(args.batch_size, 3, 224, 224) -target = torch.LongTensor(args.batch_size).random_() % 1000 -if args.cuda: - data, target = data.cuda(), target.cuda() - - -def benchmark_step(): - optimizer.zero_grad() - output = model(data) - loss = F.cross_entropy(output, target) - loss.backward() - optimizer.step() - - -def log(s, nl=True): - if hvd.rank() != 0: - return - print(s, end='\n' if nl else '') - - -log('Model: %s' % args.model) -log('Batch size: %d' % args.batch_size) -device = 'GPU' if args.cuda else 'CPU' -log('Number of %ss: %d' % (device, hvd.size())) - -# Warm-up -log('Running warmup...') -timeit.timeit(benchmark_step, number=args.num_warmup_batches) - -# Benchmark -log('Running benchmark...') -img_secs = [] -for x in range(args.num_iters): - time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter) - img_sec = args.batch_size * args.num_batches_per_iter / time - log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device)) - img_secs.append(img_sec) - -# Results -img_sec_mean = np.mean(img_secs) -img_sec_conf = 1.96 * np.std(img_secs) -log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf)) -log('Total img/sec on %d %s(s): %.1f +-%.1f' % - (hvd.size(), device, hvd.size() * img_sec_mean, hvd.size() * img_sec_conf)) - -# eof diff --git a/scripts/deep_horovod/reqs.txt b/scripts/deep_horovod/reqs.txt deleted file mode 100755 index 20310b9..0000000 --- a/scripts/deep_horovod/reqs.txt +++ /dev/null @@ -1,6 +0,0 @@ -Pillow -pyparsing -python-dateutil -matplotlib -h5py -pytorch-nlp diff --git a/scripts/deep_tensorflow/Create_Jupyter_deepv.ipynb b/scripts/deep_tensorflow/Create_Jupyter_deepv.ipynb deleted file mode 100644 index aedaf68..0000000 --- a/scripts/deep_tensorflow/Create_Jupyter_deepv.ipynb +++ /dev/null @@ -1,489 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "emerging-record", - "metadata": { - "toc-hr-collapsed": false - }, - "source": [ - "# Create your own Jupyter Kernel\n", - "\n", - "Often the standard kernel do not provide all features you need for your work. This might be that certain modules are not loaded or packages are not installed. \n", - "With your own kernel you can overcome that problem easily and define your own environment, in which you work.\n", - "\n", - "This notebook shows you how you can build your own kernel for a **python environment**.\n", - "\n", - "-------------------------" - ] - }, - { - "cell_type": "markdown", - "id": "imported-mason", - "metadata": {}, - "source": [ - "## Building your own Jupyter kernel is a three step process\n", - "1. Create/Pimp new virtual Python environment\n", - " * venv\n", - "2. Create/Edit launch script for the Jupyter kernel\n", - " * kernel.sh\n", - "3. 
Create/Edit Jupyter kernel configuration\n", - " * kernel.json" - ] - }, - { - "cell_type": "markdown", - "id": "middle-viewer", - "metadata": {}, - "source": [ - "### Settings" - ] - }, - { - "cell_type": "markdown", - "id": "color-sponsorship", - "metadata": {}, - "source": [ - "* Set kernel name\n", - " - must be lower case\n", - " - change if you like" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "acknowledged-remains", - "metadata": {}, - "outputs": [], - "source": [ - "# INPUT NEEDED:\n", - "KERNEL_NAME=${USER}_kernel\n", - "\n", - "export KERNEL_NAME=$(echo \"${KERNEL_NAME}\" | awk '{print tolower($0)}')\n", - "echo ${KERNEL_NAME} # double check" - ] - }, - { - "cell_type": "markdown", - "id": "sustained-generation", - "metadata": {}, - "source": [ - "* List directories where JupyterLab will search for kernels" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "governmental-check", - "metadata": {}, - "outputs": [], - "source": [ - "# JUPYTER SEARCH PATH (for kernels-directory)\n", - "echo \"jupyter search paths for kernels-directories\"\n", - "if [ -z $JUPYTER_PATH ]; then\n", - " echo \"$HOME/.local/share/jupyter\"\n", - "else\n", - " tr ':' '\\n' <<< \"$JUPYTER_PATH\"\n", - "fi" - ] - }, - { - "cell_type": "markdown", - "id": "later-launch", - "metadata": {}, - "source": [ - "<div class=\"alert alert-block alert-info\">\n", - "<b>Attention:</b>\n", - "Please choose 'private kernel' if you are unsure.</br>\n", - "Using 'project kernel's need to be enabled for your project first by our Jupyter-JSC admins.\n", - "</div>\n", - "\n", - "* Set kernel type\n", - " - private kernel = \"\\${HOME}/.local/\" \n", - " - project kernel = \"\\${PROJECT}/.local/\" \n", - " - other kernel = \"\\<your-path\\>\" (ensure it is part of $JUPYTER_PATH or your kernel will not be found by JuypterLab)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "reported-shirt", - "metadata": {}, - "outputs": [], - "source": [ - "# INPUT NEEDED:\n", - "export KERNEL_TYPE=private # private, project or other\n", - "export KERNEL_SPECS_PREFIX=${HOME}/.local\n", - "\n", - "###################\n", - "# project kernel\n", - "if [ \"${KERNEL_TYPE}\" == \"project\" ]; then\n", - " export KERNEL_SPECS_PREFIX=${PROJECT}/.local\n", - " echo \"project kernel\"\n", - "# private kernel\n", - "elif [ \"${KERNEL_TYPE}\" == \"private\" ]; then\n", - " export KERNEL_SPECS_PREFIX=${HOME}/.local\n", - " echo \"private kernel\"\n", - "else\n", - " if [ ! 
-d \"$KERNEL_SPECS_PREFIX\" ]; then\n", - " echo \"ERROR: please create directory $KERNEL_SPECS_PREFIX\"\n", - " fi\n", - " echo \"other kernel\"\n", - "fi\n", - "export KERNEL_SPECS_DIR=${KERNEL_SPECS_PREFIX}/share/jupyter/kernels\n", - "\n", - "# check if kernel name is unique\n", - "if [ -d \"${KERNEL_SPECS_DIR}/${KERNEL_NAME}\" ]; then\n", - " echo \"ERROR: Kernel already exists in ${KERNEL_SPECS_DIR}/${KERNEL_NAME}\"\n", - " echo \" Rename kernel name or remove directory.\"\n", - "fi\n", - "\n", - "echo ${KERNEL_SPECS_DIR}/${KERNEL_NAME} # double check" - ] - }, - { - "cell_type": "markdown", - "id": "finnish-darwin", - "metadata": {}, - "source": [ - "* Set directory for kernels virtual environment\n", - " - change if you like" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "furnished-durham", - "metadata": {}, - "outputs": [], - "source": [ - "# INPUT NEEDED:\n", - "export KERNEL_VENVS_DIR=${PROJECT}/${USER}/jupyter/kernels\n", - "\n", - "###################\n", - "mkdir -p ${KERNEL_VENVS_DIR}\n", - "if [ \"${KERNEL_TYPE}\" != \"private\" ] && [ \"${KERNEL_TYPE}\" != \"other\" ]; then\n", - " echo \"Please check the permissions and ensure your project partners have read/execute permissions:\"\n", - " namei -l ${KERNEL_VENVS_DIR}\n", - "fi\n", - "\n", - "echo ${KERNEL_VENVS_DIR} # double check\n", - "ls -lt ${KERNEL_VENVS_DIR}" - ] - }, - { - "cell_type": "markdown", - "id": "settled-terminology", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "everyday-moral", - "metadata": {}, - "source": [ - "## 1. Create/Pimp new virtual Python environment" - ] - }, - { - "cell_type": "markdown", - "id": "defined-better", - "metadata": {}, - "source": [ - "* 1.1 - Load basic Python module" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "thrown-masters", - "metadata": {}, - "outputs": [], - "source": [ - "# set modules\n", - "sysN=\"$(uname -n | cut -f2- -d.)\"\n", - "ml --force purge\n", - "if [ \"$sysN\" = 'deepv' ] ; then\n", - " ml use $OTHERSTAGES\n", - " ml Stages/2022 GCC OpenMPI cuDNN NCCL Python CMake\n", - "elif [ \"$sysN\" = 'jureca' ] ; then\n", - " ml Stages/2022 GCC OpenMPI Python NCCL cuDNN libaio CMake\n", - "else\n", - " echo 'unknown system detected'\n", - "fi\n", - "ml list # double check" - ] - }, - { - "cell_type": "markdown", - "id": "deadly-assist", - "metadata": {}, - "source": [ - "* 1.2 - Create and activate a virtual environment for the kernel and ensure python packages installed in the virtual environment are always prefered" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "tutorial-raleigh", - "metadata": {}, - "outputs": [], - "source": [ - "which python\n", - "if [ -d \"${KERNEL_VENVS_DIR}/${KERNEL_NAME}\" ]; then\n", - " echo \"ERROR: Directory for virtual environment already ${KERNEL_VENVS_DIR}/${KERNEL_NAME}\"\n", - " echo \" Rename kernel name or remove directory.\"\n", - "else\n", - " python -m venv --system-site-packages ${KERNEL_VENVS_DIR}/${KERNEL_NAME}\n", - " source ${KERNEL_VENVS_DIR}/${KERNEL_NAME}/bin/activate\n", - " export PYTHONPATH=${VIRTUAL_ENV}/lib/python3.9/site-packages:${PYTHONPATH}\n", - " echo ${VIRTUAL_ENV} # double check\n", - "fi" - ] - }, - { - "cell_type": "markdown", - "id": "congressional-stream", - "metadata": {}, - "source": [ - "* 1.3 - Install Python libraries required for communication with Jupyter" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bronze-columbia", - "metadata": {}, - "outputs": [], 
- "source": [ - "cp \"$(which pip3)\" ${VIRTUAL_ENV}/bin/\n", - "var=$VIRTUAL_ENV/bin/python3.9\n", - "sed -i \"1s|.*|$var|\" ${VIRTUAL_ENV}/bin/pip3" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "adjacent-saturday", - "metadata": {}, - "outputs": [], - "source": [ - "which pip3\n", - "pip3 install --ignore-installed ipykernel\n", - "if [ -z \"${VIRTUAL_ENV}\" ]; then\n", - " echo \"ERROR: Virtual environment not successfully initialized.\"\n", - "else\n", - " pip3 install --ignore-installed ipykernel\n", - " ls ${VIRTUAL_ENV}/lib/python3.9/site-packages/ # double check\n", - "fi" - ] - }, - { - "cell_type": "markdown", - "id": "alleged-johns", - "metadata": {}, - "source": [ - "* 1.4 - Install whatever else you need in your Python virtual environment (using pip)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "mysterious-cement", - "metadata": {}, - "outputs": [], - "source": [ - "#pip install <python-package you need>\n", - "pip3 install --upgrade tensorflow --no-cache-dir" - ] - }, - { - "cell_type": "markdown", - "id": "cosmetic-status", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "colonial-migration", - "metadata": {}, - "source": [ - "## 2. Create/Edit launch script for the Jupyter kernel" - ] - }, - { - "cell_type": "markdown", - "id": "ambient-commerce", - "metadata": {}, - "source": [ - "* 2.1 - Create launch script, which loads your Python virtual environment and starts the ipykernel process inside:\n", - "\n", - "<div class=\"alert alert-block alert-info\">\n", - "<b>Attention:</b>\n", - "You MUST load the exactly the same modules as you did above for your virtual Python environment.\n", - "</div>" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "heated-ivory", - "metadata": {}, - "outputs": [], - "source": [ - "echo '#!/bin/bash'\"\n", - "\n", - "# Load basic Python module\n", - "module purge\n", - "module use \"'$OTHERSTAGES'\"\n", - "ml Stages/2022 GCC OpenMPI cuDNN NCCL Python CMake\n", - " \n", - "# Activate your Python virtual environment\n", - "source ${KERNEL_VENVS_DIR}/${KERNEL_NAME}/bin/activate\n", - " \n", - "# Ensure python packages installed in the virtual environment are always prefered\n", - "export PYTHONPATH=${VIRTUAL_ENV}/lib/python3.8/site-packages:\"'${PYTHONPATH}'\"\n", - " \n", - "exec python -m ipykernel \"'$@' > ${VIRTUAL_ENV}/kernel.sh\n", - "chmod +x ${VIRTUAL_ENV}/kernel.sh\n", - "\n", - "cat ${VIRTUAL_ENV}/kernel.sh # double check" - ] - }, - { - "cell_type": "markdown", - "id": "proof-portland", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "inner-silence", - "metadata": {}, - "source": [ - "## 3. 
Create/Edit Jupyter kernel configuration" - ] - }, - { - "cell_type": "markdown", - "id": "greater-princeton", - "metadata": {}, - "source": [ - "* 3.1 - Create Jupyter kernel configuration directory and files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ongoing-officer", - "metadata": {}, - "outputs": [], - "source": [ - "python -m ipykernel install --name=${KERNEL_NAME} --prefix ${VIRTUAL_ENV}\n", - "export VIRTUAL_ENV_KERNELS=${VIRTUAL_ENV}/share/jupyter/kernels" - ] - }, - { - "cell_type": "markdown", - "id": "documented-motor", - "metadata": {}, - "source": [ - "* 3.2 - Adjust kernel.json file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "finnish-apple", - "metadata": {}, - "outputs": [], - "source": [ - "mv ${VIRTUAL_ENV_KERNELS}/${KERNEL_NAME}/kernel.json ${VIRTUAL_ENV_KERNELS}/${KERNEL_NAME}/kernel.json.orig\n", - "\n", - "echo '{\n", - " \"argv\": [\n", - " \"'${KERNEL_VENVS_DIR}/${KERNEL_NAME}/kernel.sh'\",\n", - " \"-m\",\n", - " \"ipykernel_launcher\",\n", - " \"-f\",\n", - " \"{connection_file}\"\n", - " ],\n", - " \"display_name\": \"'${KERNEL_NAME}'\",\n", - " \"language\": \"python\"\n", - "}' > ${VIRTUAL_ENV_KERNELS}/${KERNEL_NAME}/kernel.json\n", - "\n", - "cat ${VIRTUAL_ENV_KERNELS}/${KERNEL_NAME}/kernel.json # double check" - ] - }, - { - "cell_type": "markdown", - "id": "english-sixth", - "metadata": {}, - "source": [ - "* 3.3 - Create link to kernel specs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "phantom-provision", - "metadata": {}, - "outputs": [], - "source": [ - "mkdir -p ${KERNEL_SPECS_DIR}\n", - "cd ${KERNEL_SPECS_DIR}\n", - "ln -s ${VIRTUAL_ENV_KERNELS}/${KERNEL_NAME} .\n", - "\n", - "echo -e \"\\n\\nThe new kernel '${KERNEL_NAME}' was added to your kernels in '${KERNEL_SPECS_DIR}/'\\n\"\n", - "ls ${KERNEL_SPECS_DIR} # double check" - ] - }, - { - "cell_type": "markdown", - "id": "based-jonathan", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "numerical-hobby", - "metadata": {}, - "source": [ - "## 4. Cleanup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "handmade-smith", - "metadata": {}, - "outputs": [], - "source": [ - "deactivate" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Bash", - "language": "bash", - "name": "bash" - }, - "language_info": { - "codemirror_mode": "shell", - "file_extension": ".sh", - "mimetype": "text/x-sh", - "name": "bash" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/scripts/deep_tensorflow/README.md b/scripts/deep_tensorflow/README.md deleted file mode 100644 index 211255c..0000000 --- a/scripts/deep_tensorflow/README.md +++ /dev/null @@ -1,21 +0,0 @@ -# DL using TensorFlow with Jupyter on deepv - -# source -https://github.com/tensorflow/tensorflow - -# to-do -1. add notebooks - -# usage - pip -1. clone -2. run `bash createENV_TF.sh` -4. submit `sbatch TF_startscript_deep.sh` - -# usage - jupyter -1. clone -2. run `bash createENV_TF.sh` -3. run `bash jupyterAddKernel.sh testAI_deepv` -4. open via `https://jupyter-jsc.fz-juelich.de/hub/login` - -# updates -1. 
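The TensorFlow environment used by these scripts comes from a generic pip wheel, so before relying on it for jobs it helps to confirm GPU visibility on a compute node. A minimal check, as a sketch (assumes the testAI_* environment is active and the same modules are loaded as in the start script):

```bash
# confirm the pip-installed TensorFlow can actually see the node's GPU
python3 -c "import tensorflow as tf; print(tf.__version__, tf.config.list_physical_devices('GPU'))"
```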
diff --git a/scripts/deep_tensorflow/TF_startscript_deep.sh b/scripts/deep_tensorflow/TF_startscript_deep.sh deleted file mode 100644 index b0fb2ef..0000000 --- a/scripts/deep_tensorflow/TF_startscript_deep.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash -# shellcheck disable=SC2206 -#SBATCH --job-name=TFtest -#SBATCH --account=deepext -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job.out -#SBATCH --error=job.err - -#SBATCH --partition=dp-esb -#SBATCH --nodes=1 -#SBATCH --tasks-per-node=4 -#SBATCH --gpus-per-node=1 -#SBATCH --time=05:00:00 -#SBATCH --exclusive - -ml --force purge -ml use $OTHERSTAGES -ml Stages/2022 GCC/11.2.0 OpenMPI/4.1.2 cuDNN/8.3.1.22-CUDA-11.5 NCCL/2.11.4-CUDA-11.5 Python/3.9.6 - -source /p/project/prcoe12/RAISE/testAI_deepv/bin/activate - -# job info -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: SLURM_NODEID: $SLURM_NODEID" -echo "DEBUG: SLURM_LOCALID: $SLURM_LOCALID" -echo "DEBUG: SLURM_PROCID: $SLURM_PROCID" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -echo - -export CUDA_VISIBLE_DEVICES="0" -export OMP_NUM_THREADS=1 - -srun --cpu-bind=none --mpi=pspmix python3 -u tensorflow2_synthetic_benchmark.py diff --git a/scripts/deep_tensorflow/createEnv_TF.sh b/scripts/deep_tensorflow/createEnv_TF.sh deleted file mode 100755 index d309269..0000000 --- a/scripts/deep_tensorflow/createEnv_TF.sh +++ /dev/null @@ -1,99 +0,0 @@ -#!/bin/bash -# -*- coding: utf-8 -*- -# author: EI -# version: 220302a -# creates machine specific python env - -# set modules -ml --force purge - -# get sys info -cDir=$PWD -sysN="$(uname -n | cut -f2- -d.)" -echo "system:${sysN}" -echo - -cont1=false -if [ "$sysN" = 'deepv' ] ; then - ml use $OTHERSTAGES - ml Stages/2022 GCC OpenMPI cuDNN NCCL Python CMake - cont1=true -elif [ "$sysN" = 'juwels' ] ; then - ml Stages/2022 GCC ParaStationMPI Python CMake NCCL libaio cuDNN - cont1=true -elif [ "$sysN" = 'jureca' ] ; then - ml Stages/2022 GCC OpenMPI Python NCCL cuDNN libaio CMake - cont1=true -else - echo - echo 'unknown system detected' - echo 'canceling' - echo -fi -echo "modules loaded" -echo - -# get python version -pver="$(python --version 2>&1 | awk {'print $2'} | cut -f1-2 -d.)" -echo "python version is ${pver}" -echo - -if [ "$cont1" = true ] ; then - if [ -d "${cDir}/testAI_${sysN}" ];then - echo 'env already exist' - echo - - source testAI_${sysN}/bin/activate - else - # create env - python3 -m venv testAI_${sysN} - - # get headers for pip - if [ -f "${cDir}/testAI_${sysN}/bin/pip3" ]; then - echo 'pip already exist' - else - cp "$(which pip3)" $cDir/testAI_${sysN}/bin/ - ln -s $cDir/testAI_${sysN}/bin/pip3 $cDir/testAI_${sysN}/bin/pip${pver} - var="#!$cDir/testAI_${sysN}/bin/python${pver}" - sed -i "1s|.*|$var|" $cDir/testAI_${sysN}/bin/pip3 - fi - - # activate env - source testAI_${sysN}/bin/activate - - echo "a new env is created in ${cDir}" - echo "activation is done via:" - echo "source ${cDir}/testAI_${sysN}/bin/activate" - fi -fi - -# install TF -if [ -f "${cDir}/testAI_${sysN}/bin/tensorboard" ]; then - echo 'TF already installed' - echo -else - export TMPDIR=${cDir} - - pip3 install --upgrade tensorflow --no-cache-dir -fi - -# install horovod -if [ -f 
"${cDir}/testAI_${sysN}/bin/horovodrun" ]; then - echo 'Horovod already installed' - echo -else - export HOROVOD_GPU=CUDA - export HOROVOD_GPU_OPERATIONS=NCCL - export HOROVOD_WITH_TENSORFLOW=1 - export TMPDIR=${cDir} - - pip3 install --no-cache-dir horovod --ignore-installed -fi - -# get rest of the libraries$ -if [ "$cont1" = true ] ; then - pip3 install -r reqs_TF.txt --ignore-installed -fi - - -# eof diff --git a/scripts/deep_tensorflow/jupyterAddKernel.sh b/scripts/deep_tensorflow/jupyterAddKernel.sh deleted file mode 100755 index 741c247..0000000 --- a/scripts/deep_tensorflow/jupyterAddKernel.sh +++ /dev/null @@ -1,109 +0,0 @@ -#!/bin/bash -# -*- coding: utf-8 -*- -# author: EI -# version: 220408a -# adds jupyter to an existing python env -# usage: bash jupyterAddKernel.sh <env_location> - -# get sys info -sysN="$(uname -n | cut -f2- -d.)" -cDir=$PWD -ENV_LOC=$cDir/$1 -export TMPDIR=$PWD -echo "system:${sysN}" -echo "env location: $ENV_LOC" -echo - -# warn if wrong bash command -if [ -z "$1" ];then - echo 'wrong usage: try: bash jupyterAddKernel.sh <env_location>' - exit -fi - -# set modules -ml --force purge -if [ "$sysN" = 'deepv' ] ; then - ml use $OTHERSTAGES - ml Stages/2022 GCC OpenMPI cuDNN NCCL Python CMake -elif [ "$sysN" = 'jureca' ] ; then - ml Stages/2022 GCC OpenMPI Python NCCL cuDNN libaio CMake -else - echo - echo 'unknown system detected' - echo 'canceling' - echo -fi -echo "modules loaded" -echo - -# kernel info -KERNEL_NAME=envAI_jk_${sysN} -KERNEL_SPECS_PREFIX=${HOME}/.local -KERNEL_SPECS_DIR=${KERNEL_SPECS_PREFIX}/share/jupyter/kernels - -# get python version -pver="$(python --version 2>&1 | awk {'print $2'} | cut -f1-2 -d.)" -echo "python version is ${pver}" -echo - -# environment that jupyter is built on -if [ -z "${ENV_LOC}" ];then - echo 'env does not exist' - echo 'usage: bash jupyterAddKernel.sh env_location' - exit -else - source ${ENV_LOC}/bin/activate - export PYTHONPATH=${VIRTUAL_ENV}/lib/python${pver}/site-packages:${PYTHONPATH} -fi - -# create/Edit launch script for the Jupyter kernel -if [ -f "${VIRTUAL_ENV}/kernel.sh" ];then - echo "kernel.sh exist!" -else - echo '#!/bin/bash'" - -# Load basic Python module -ml --force purge -ml use $OTHERSTAGES -ml Stages/2022 GCC OpenMPI cuDNN NCCL Python CMake - -# Activate your Python virtual environment -source ${VIRTUAL_ENV}/bin/activate - -# Ensure python packages installed in the virtual environment are always prefered -export PYTHONPATH=${VIRTUAL_ENV}/lib/python${pver}/site-packages:"'${PYTHONPATH}'" - -exec python3 -m ipykernel "'$@' > ${VIRTUAL_ENV}/kernel.sh - chmod +x ${VIRTUAL_ENV}/kernel.sh - - echo 'kernel.sh:' - cat ${VIRTUAL_ENV}/kernel.sh # double check -fi - -# create Jupyter kernel configuration directory and files -pip3 install --ignore-installed ipykernel --no-cache-dir -${VIRTUAL_ENV}/bin/python3 -m ipykernel install --name=${KERNEL_NAME} --prefix ${VIRTUAL_ENV} -VIRTUAL_ENV_KERNELS=${VIRTUAL_ENV}/share/jupyter/kernels - -# adjust kernel.json file -mv ${VIRTUAL_ENV_KERNELS}/${KERNEL_NAME}/kernel.json ${VIRTUAL_ENV_KERNELS}/${KERNEL_NAME}/kernel.json.orig # backup -echo '{ - "argv": [ - "'${VIRTUAL_ENV}/kernel.sh'", - "-m", - "ipykernel_launcher", - "-f", - "{connection_file}" - ], - "display_name": "'${KERNEL_NAME}'", - "language": "python" -}' > ${VIRTUAL_ENV_KERNELS}/${KERNEL_NAME}/kernel.json - -# create link to kernel specs -mkdir -p ${KERNEL_SPECS_DIR} -cd ${KERNEL_SPECS_DIR} -ln -s ${VIRTUAL_ENV_KERNELS}/${KERNEL_NAME} . 
- -echo -e "\n\nThe new kernel '${KERNEL_NAME}' was added to your kernels in '${KERNEL_SPECS_DIR}/'\n" - -#eof diff --git a/scripts/deep_tensorflow/jupyterCreateKernel.sh b/scripts/deep_tensorflow/jupyterCreateKernel.sh deleted file mode 100755 index a0c0ae9..0000000 --- a/scripts/deep_tensorflow/jupyterCreateKernel.sh +++ /dev/null @@ -1,123 +0,0 @@ -#!/bin/bash -# -*- coding: utf-8 -*- -# author: EI -# version: 220408a -# creates machine specific jupyter kernel - -# get sys info -sysN="$(uname -n | cut -f2- -d.)" -cDir=$PWD -export TMPDIR=$PWD -echo "system:${sysN}" -echo - -# set modules -ml --force purge -if [ "$sysN" = 'deepv' ] ; then - ml use $OTHERSTAGES - ml Stages/2022 GCC OpenMPI cuDNN NCCL Python CMake -elif [ "$sysN" = 'jureca' ] ; then - ml Stages/2022 GCC OpenMPI Python NCCL cuDNN libaio CMake -else - echo - echo 'unknown system detected' - echo 'canceling' - echo -fi -echo "modules loaded" -echo - -# kernel info -KERNEL_NAME=kernel_${sysN} -KERNEL_SPECS_PREFIX=${HOME}/.local -KERNEL_SPECS_DIR=${KERNEL_SPECS_PREFIX}/share/jupyter/kernels - -# get python version -pver="$(python --version 2>&1 | awk {'print $2'} | cut -f1-2 -d.)" -echo "python version is ${pver}" -echo - -# create and activate a virtual environment for the kernel -if [ -d "${cDir}/kernelAI_${sysN}" ];then - echo 'env already existi:' - - source ${cDir}/kernelAI_${sysN}/bin/activate - export PYTHONPATH=${VIRTUAL_ENV}/lib/python${pver}/site-packages:${PYTHONPATH} -else - # create env - python3 -m venv --system-site-packages ${cDir}/kernelAI_${sysN} - - # get headers for pip - if [ -f "${cDir}/kernelAI_${sysN}/bin/pip3" ]; then - echo 'pip already exist' - echo - else - cp "$(which pip3)" $cDir/kernelAI_${sysN}/bin/ - ln -s $cDir/kernelAI_${sysN}/bin/pip3 $cDir/kernelAI_${sysN}/bin/pip${pver} - var="#!$cDir/kernelAI_${sysN}/bin/python${pver}" - sed -i "1s|.*|$var|" $cDir/kernelAI_${sysN}/bin/pip3 - fi - - # activate env - source ${cDir}/kernelAI_${sysN}/bin/activate - export PYTHONPATH=${VIRTUAL_ENV}/lib/python${pver}/site-packages:${PYTHONPATH} -fi -echo 'location of new venv:' -echo ${VIRTUAL_ENV} # double check -echo - -# create/Edit launch script for the Jupyter kernel -if [ -f "${VIRTUAL_ENV}/kernel.sh" ];then - echo "kernel.sh exist!" 
-else - echo '#!/bin/bash'" - -# Load basic Python module -ml GCC ParaStationMPI Python - -# Activate your Python virtual environment -source ${VIRTUAL_ENV}/bin/activate - -# Ensure python packages installed in the virtual environment are always prefered -export PYTHONPATH=${VIRTUAL_ENV}/lib/python${pver}/site-packages:"'${PYTHONPATH}'" - -exec python3 -m ipykernel "'$@' > ${VIRTUAL_ENV}/kernel.sh - chmod +x ${VIRTUAL_ENV}/kernel.sh - - echo 'kernel.sh:' - cat ${VIRTUAL_ENV}/kernel.sh # double check -fi - -# create Jupyter kernel configuration directory and files -pip3 install --ignore-installed ipykernel --no-cache-dir -${VIRTUAL_ENV}/bin/python3 -m ipykernel install --name=${KERNEL_NAME} --prefix ${VIRTUAL_ENV} -VIRTUAL_ENV_KERNELS=${VIRTUAL_ENV}/share/jupyter/kernels - -# adjust kernel.json file -mv ${VIRTUAL_ENV_KERNELS}/${KERNEL_NAME}/kernel.json ${VIRTUAL_ENV_KERNELS}/${KERNEL_NAME}/kernel.json.orig # backup -echo '{ - "argv": [ - "'${VIRTUAL_ENV}/kernel.sh'", - "-m", - "ipykernel_launcher", - "-f", - "{connection_file}" - ], - "display_name": "'${KERNEL_NAME}'", - "language": "python" -}' > ${VIRTUAL_ENV_KERNELS}/${KERNEL_NAME}/kernel.json - -# create link to kernel specs -mkdir -p ${KERNEL_SPECS_DIR} -cd ${KERNEL_SPECS_DIR} -ln -s ${VIRTUAL_ENV_KERNELS}/${KERNEL_NAME} . - -echo -e "\n\nThe new kernel '${KERNEL_NAME}' was added to your kernels in '${KERNEL_SPECS_DIR}/'\n" - -echo 'load this env as: -ml --force purge -ml use $OTHERSTAGES -ml Stages/2022 GCC OpenMPI cuDNN NCCL Python CMake -source ${cDir}/kernelAI_${sysN}/bin/activate' - -#eof diff --git a/scripts/deep_tensorflow/lamec.json b/scripts/deep_tensorflow/lamec.json deleted file mode 100644 index 8c582bb..0000000 --- a/scripts/deep_tensorflow/lamec.json +++ /dev/null @@ -1 +0,0 @@ -{"startscript": "TF_startscript_deep.sh"} \ No newline at end of file diff --git a/scripts/deep_tensorflow/tensorflow2_synthetic_benchmark.py b/scripts/deep_tensorflow/tensorflow2_synthetic_benchmark.py deleted file mode 100644 index 04b45c1..0000000 --- a/scripts/deep_tensorflow/tensorflow2_synthetic_benchmark.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright 2019 Uber Technologies, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -import argparse -import os -import numpy as np -import timeit - -import tensorflow as tf -import horovod.tensorflow as hvd -from tensorflow.keras import applications - -# Benchmark settings -parser = argparse.ArgumentParser(description='TensorFlow Synthetic Benchmark', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument('--fp16-allreduce', action='store_true', default=False, - help='use fp16 compression during allreduce') - -parser.add_argument('--model', type=str, default='ResNet50', - help='model to benchmark') -parser.add_argument('--batch-size', type=int, default=32, - help='input batch size') - -parser.add_argument('--num-warmup-batches', type=int, default=10, - help='number of warm-up batches that don\'t count towards benchmark') -parser.add_argument('--num-batches-per-iter', type=int, default=10, - help='number of batches per benchmark iteration') -parser.add_argument('--num-iters', type=int, default=100, - help='number of benchmark iterations') - -parser.add_argument('--no-cuda', action='store_true', default=False, - help='disables CUDA training') - - -args = parser.parse_args() -args.cuda = not args.no_cuda - -# Horovod: initialize Horovod. -hvd.init() - -# Horovod: pin GPU to be used to process local rank (one GPU per process) -if args.cuda: - gpus = tf.config.experimental.list_physical_devices('GPU') - for gpu in gpus: - tf.config.experimental.set_memory_growth(gpu, True) - if gpus: - tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') -else: - os.environ["CUDA_VISIBLE_DEVICES"] = "-1" - -# Set up standard model. -model = getattr(applications, args.model)(weights=None) -opt = tf.optimizers.SGD(0.01) - -data = tf.random.uniform([args.batch_size, 224, 224, 3]) -target = tf.random.uniform([args.batch_size, 1], minval=0, maxval=999, dtype=tf.int64) - - -@tf.function -def benchmark_step(first_batch): - # Horovod: (optional) compression algorithm. - compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none - - # Horovod: use DistributedGradientTape - with tf.GradientTape() as tape: - probs = model(data, training=True) - loss = tf.losses.sparse_categorical_crossentropy(target, probs) - - # Horovod: add Horovod Distributed GradientTape. - tape = hvd.DistributedGradientTape(tape, compression=compression) - - gradients = tape.gradient(loss, model.trainable_variables) - opt.apply_gradients(zip(gradients, model.trainable_variables)) - - # Horovod: broadcast initial variable states from rank 0 to all other processes. - # This is necessary to ensure consistent initialization of all workers when - # training is started with random weights or restored from a checkpoint. - # - # Note: broadcast should be done after the first gradient step to ensure optimizer - # initialization. 
- if first_batch: - hvd.broadcast_variables(model.variables, root_rank=0) - hvd.broadcast_variables(opt.variables(), root_rank=0) - - -def log(s, nl=True): - if hvd.rank() != 0: - return - print(s, end='\n' if nl else '') - - -log('Model: %s' % args.model) -log('Batch size: %d' % args.batch_size) -device = 'GPU' if args.cuda else 'CPU' -log('Number of %ss: %d' % (device, hvd.size())) - - -with tf.device(device): - # Warm-up - log('Running warmup...') - benchmark_step(first_batch=True) - timeit.timeit(lambda: benchmark_step(first_batch=False), - number=args.num_warmup_batches) - - # Benchmark - log('Running benchmark...') - img_secs = [] - for x in range(args.num_iters): - time = timeit.timeit(lambda: benchmark_step(first_batch=False), - number=args.num_batches_per_iter) - img_sec = args.batch_size * args.num_batches_per_iter / time - log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device)) - img_secs.append(img_sec) - - # Results - img_sec_mean = np.mean(img_secs) - img_sec_conf = 1.96 * np.std(img_secs) - log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf)) - log('Total img/sec on %d %s(s): %.1f +-%.1f' % - (hvd.size(), device, hvd.size() * img_sec_mean, hvd.size() * img_sec_conf)) diff --git a/scripts/jureca_ddp/DDP_startscript.sh b/scripts/jureca_ddp/DDP_startscript.sh deleted file mode 100644 index 5d8409a..0000000 --- a/scripts/jureca_ddp/DDP_startscript.sh +++ /dev/null @@ -1,107 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=TorchTest -#SBATCH --account=zam -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job.out -#SBATCH --error=job.err -#SBATCH --time=00:15:00 - -# configure node and process count on the CM -#SBATCH --partition=dc-gpu-devel -#SBATCH --nodes=4 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --gpus-per-node=4 -#SBATCH --exclusive - -# gres options have to be disabled for deepv -#SBATCH --gres=gpu:4 - -# parameters -debug=false # do debug -bs=32 # batch-size -epochs=5 # epochs -lr=0.01 # learning rate - -# AT -dataDir="/p/scratch/raise-ctp2/T31_LD/" -COMMAND="DDP_pytorch_AT.py" -EXEC="$COMMAND \ - --batch-size $bs \ - --epochs $epochs \ - --lr $lr \ - --nworker $SLURM_CPUS_PER_TASK \ - --data-dir $dataDir" - - -### do not modify below ### - - -# set modules -ml --force purge -ml Stages/2022 NVHPC/22.3 ParaStationMPI/5.5.0-1-mt NCCL/2.12.7-1-CUDA-11.5 cuDNN/8.3.1.22-CUDA-11.5 -ml Python/3.9.6 libaio/0.3.112 HDF5/1.12.1-serial mpi-settings/CUDA - -# set env -source /p/project/raise-ctp1/RAISE/envAI_jureca/bin/activate - -# sleep a sec -sleep 1 - -# job info -echo "DEBUG: TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# set comm -export CUDA_VISIBLE_DEVICES="0,1,2,3" -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" > 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi - -# launch -srun --cpu-bind=none bash -c "torchrun \ - --log_dir='logs' \ - --nnodes=$SLURM_NNODES \ - --nproc_per_node=$SLURM_GPUS_PER_NODE \ - --rdzv_id=$SLURM_JOB_ID \ - 
--rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ - --rdzv_backend=c10d \ - --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \ - $EXEC" - -# add --globres=fs:cscratch@just flag to l. 78 if High Performance Storage Tier (HPST) - -# nsys profiler: following https://gist.github.com/mcarilli/376821aa1a7182dfcf59928a7cde3223 -#srun --cpu-bind=none nsys profile \ -# --trace=cublas,cuda,cudnn,nvtx,osrt \ -# --sample=cpu \ -# --stats=true \ -# --force-overwrite=true \ -# -o ./prof.out bash -c "torchrun \ -# --log_dir='logs' \ -# --nnodes=$SLURM_NNODES \ -# --nproc_per_node=$SLURM_GPUS_PER_NODE \ -# --rdzv_id=$SLURM_JOB_ID \ -# --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ -# --rdzv_backend=c10d \ -# --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \ -# $EXEC" - -# eof diff --git a/scripts/jureca_ddp/DDP_startscript_container.sh b/scripts/jureca_ddp/DDP_startscript_container.sh deleted file mode 100644 index 377d16e..0000000 --- a/scripts/jureca_ddp/DDP_startscript_container.sh +++ /dev/null @@ -1,81 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=AMDTorchTest -#SBATCH --account=zam -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job.out -#SBATCH --error=job.err -#SBATCH --time=00:15:00 - -# configure node and process count on the CM -#SBATCH --partition=dc-mi200 -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=12 -#SBATCH --exclusive - -# parameters -debug=false # do debug -bs=32 # batch-size -epochs=5 # epochs -lr=0.01 # learning rate - -# AT -dataDir="/p/scratch/raise-ctp1/T31_LD/" -COMMAND="DDP_pytorch_AT.py" -EXEC="$COMMAND \ - --batch-size $bs \ - --epochs $epochs \ - --lr $lr \ - --nworker $SLURM_CPUS_PER_TASK \ - --data-dir $dataDir" - - -### do not modify below ### - - -# set modules -ml Architecture/jureca_mi200 -ml GCC/11.2.0 OpenMPI/4.1.4 ROCm/5.3.0 CMake/3.23.1 -ml UCX-settings/RC-ROCm - -# set env variables -export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -mkdir -p $SLURM_SUBMIT_DIR/tmp -export MIOPEN_USER_DB_PATH=$SLURM_SUBMIT_DIR/tmp -export NCCL_DEBUG=WARN - -# job info -echo "DEBUG: TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# launch container -srun --cpu-bind=none bash -c "apptainer exec --rocm \ - torch_rocm_docker.sif \ - python -m fixed_torch_run \ - --log_dir='logs' \ - --nnodes=$SLURM_NNODES \ - --nproc_per_node=8 \ - --rdzv_id=$SLURM_JOB_ID \ - --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ - --rdzv_backend=c10d \ - --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \ - $EXEC" - -#eof diff --git a/scripts/jureca_ddp/README.md b/scripts/jureca_ddp/README.md deleted file mode 100644 index df22341..0000000 --- a/scripts/jureca_ddp/README.md +++ /dev/null @@ -1,39 +0,0 @@ -# DL using DDP on jureca dc gpu - -# DDP source -https://github.com/pytorch/pytorch#from-source - -# jureca user documentation 
-https://apps.fz-juelich.de/jsc/hps/jureca/index.html - -# current isues -1. torchrun: Hostname/endpoint mismatch not handled\ -workaround is to modify torchrun and use included batch script\ -simply run `createEnv.sh` to install fixed torch\ -discussion in: https://github.com/pytorch/pytorch/issues/73656 -2. for containers, instead of #1, use `fixed_torch_run.py` -- follow usage - containers. - -# to-do -1. - -# done -1. tested containers (for both NVIDIA & AMD GPUs):\ -https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch \ -https://www.amd.com/en/technologies/infinity-hub/pytorch \ -https://hub.docker.com/r/rocm/pytorch - - -# usage - Python Env -1. run `./createEnv.sh` to create env and install torch -2. select a case from CASES folder -3. submit `sbatch DDP_startscript.sh` - -# usage - containers (note this for AMD partition - modify for NVIDIA) -1. run `./createContainer.sh` to use and build Torch/ROCm container -2. select a case from CASES folder -3. submit `sbatch DDP_startscript_container.sh` - -# usage - Source Code -1. run `./createEnv_MPI.sh` to create Conda env and install torch with MPI support -2. select a case from CASES folder -3. submit `sbatch DDP_startscript.sh` diff --git a/scripts/jureca_ddp/createContainer.sh b/scripts/jureca_ddp/createContainer.sh deleted file mode 100644 index 3d7f584..0000000 --- a/scripts/jureca_ddp/createContainer.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash -# -*- coding: utf-8 -*- -# author: EI -# version: 2212008a -# pull and build containers for PyTorch/ROCm - -# load modules -ml Architecture/jureca_mi200 -ml GCC/11.2.0 OpenMPI/4.1.2 ROCm/5.3.0 CMake/3.23.1 -ml UCX-settings/RC-ROCm - -# create Cache/TMP so that $HOME would not be used -mkdir -p Cache -mkdir -p TMP -export APPTAINER_CACHEDIR=$(mktemp -d -p $PWD/Cache) -export APPTAINER_TMPDIR=$(mktemp -d -p $PWD/TMP) - -# official AMD container with Torch==1.10.0 -# apptainer pull torch_rocm_amd.sif docker://amdih/pytorch:rocm5.0_ubuntu18.04_py3.7_pytorch_1.10.0 - -# docker AMD container with Torch==1.12.1 -apptainer pull torch_rocm_docker.sif docker://rocm/pytorch - -#eof diff --git a/scripts/jureca_ddp/createEnv.sh b/scripts/jureca_ddp/createEnv.sh deleted file mode 100755 index 9d635be..0000000 --- a/scripts/jureca_ddp/createEnv.sh +++ /dev/null @@ -1,183 +0,0 @@ -#!/bin/bash -# -*- coding: utf-8 -*- -# author: EI -# version: 220328a -# creates machine specific python env - -# set modules -ml --force purge - -# get sys info -cDir=$PWD -sysN="$(uname -n | cut -f2- -d.)" -echo "system:${sysN}" -echo - -cont1=false -if [ "$sysN" = 'deepv' ] ; then - ml use $OTHERSTAGES - ml Stages/2022 NVHPC/22.1 OpenMPI/4.1.2 NCCL/2.15.1-1-CUDA-11.5 cuDNN/8.3.1.22-CUDA-11.5 - ml Python/3.9.6 HDF5 CMake - ml -nvidia-driver/.default - cont1=true -elif [ "$sysN" = 'juwels' ] ; then - ml GCC ParaStationMPI Python CMake - cont1=true -elif [ "$sysN" = 'jureca' ] ; then - ml Stages/2022 NVHPC/22.1 ParaStationMPI/5.5.0-1-mt NCCL/2.11.4-CUDA-11.5 cuDNN/8.3.1.22-CUDA-11.5 - ml Python/3.9.6 CMake HDF5 PnetCDF libaio/0.3.112 mpi-settings/CUDA - cont1=true -else - echo - echo 'unknown system detected' - echo 'canceling' - echo -fi -echo "modules loaded" -echo - -# get python version -pver="$(python --version 2>&1 | awk {'print $2'} | cut -f1-2 -d.)" -echo "python version is ${pver}" -echo - -# create env -if [ "$cont1" = true ] ; then - if [ -d "${cDir}/envAI_${sysN}" ];then - echo 'env already exist' - echo - - source envAI_${sysN}/bin/activate - else - python3 -m venv envAI_${sysN} - - # get headers for pip - 
if [ -f "${cDir}/envAI_${sysN}/bin/pip3" ]; then - echo 'pip already exist' - else - cp "$(which pip3)" $cDir/envAI_${sysN}/bin/ - ln -s $cDir/envAI_${sysN}/bin/pip3 $cDir/envAI_${sysN}/bin/pip${pver} - var="#!$cDir/envAI_${sysN}/bin/python${pver}" - sed -i "1s|.*|$var|" $cDir/envAI_${sysN}/bin/pip3 - fi - - # activate env - source envAI_${sysN}/bin/activate - - echo "a new env is created in ${cDir}" - echo "activation is done via:" - echo "source ${cDir}/envAI_${sysN}/bin/activate" - fi -fi - -# install torch -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - echo 'Torch already installed' - echo -else - export TMPDIR=${cDir} - - pip3 install \ - torch==1.11.0+cu115 torchvision==0.12.0+cu115 torchaudio==0.11.0+cu115 -f \ - https://download.pytorch.org/whl/cu115/torch_stable.html --no-cache-dir -fi - -# install horovod -if [ -f "${cDir}/envAI_${sysN}/bin/horovodrun" ]; then - echo 'Horovod already installed' - echo -else - export HOROVOD_WITH_MPI=1 - export HOROVOD_MPI_THREADS_DISABLE=1 - export HOROVOD_GPU=CUDA - export HOROVOD_GPU_OPERATIONS=NCCL - export HOROVOD_CUDA_HOME=$EBROOTCUDA - export HOROVOD_NCCL_HOME=$EBROOTNCCL - export HOROVOD_WITH_PYTORCH=1 - export TMPDIR=${cDir} - - pip3 install --no-cache-dir horovod --ignore-installed -fi - -# install deepspeed -if [ -f "${cDir}/envAI_${sysN}/bin/deepspeed" ]; then - echo 'DeepSpeed already installed' - echo -else - # compile all opt. stuff - not needed & not working - #export DS_BUILD_OPS=1 - # compile req. opt. stuff - export DS_BUILD_FUSED_ADAM=1 - export DS_BUILD_UTILS=1 - if [ "$sysN" = 'deepv' ] ; then - #fix libaio issues via: - export DS_BUILD_AIO=0 - fi - export TMPDIR=${cDir} - - pip3 install --no-cache-dir DeepSpeed - - # add this to .../deepspeed/launcher/launch.py l.93 - var=' args.node_rank=int(os.environ.get("SLURM_PROCID",0))' - sed -i "93s|.*|$var|" $cDir/envAI_${sysN}/lib/python${pver}/site-packages/deepspeed/launcher/launch.py -fi - -# install heat -if [ -d "${cDir}/envAI_${sysN}/lib/python${pver}/site-packages/heat" ]; then - echo 'HeAT already installed' - echo -else - export TMPDIR=${cDir} - export CFLAGS="-noswitcherror" - export CXXFLAGS="-noswitcherror" - - # need to modify setup.py to accep torch>1.9 for heat - wget https://files.pythonhosted.org/packages/5d/3a/4781f1e6910753bfdfa6712c83c732c60e675d8de14983926a0d9306c7a6/heat-1.1.1.tar.gz - tar xzf heat-1.1.1.tar.gz - var=' "torch>=1.7.0",' - sed -i "36s|.*|$var|" heat-1.1.1/setup.py - var=' "torchvision>=0.8.0",' - sed -i "39s|.*|$var|" heat-1.1.1/setup.py - - # create tar again! 
- rm -rf heat-1.1.1.tar.gz - tar czf heat-1.1.1.tar.gz heat-1.1.1 - rm -rf heat-1.1.1 - - pip3 install --no-cache-dir 'heat-1.1.1.tar.gz[hdf5,netcdf]' - - rm -rf heat-1.1.1.tar.gz -fi - -# get rest of the libraries$ -if [ "$cont1" = true ] ; then - # install rest - pip3 install -r reqs.txt --ignore-installed - - # modify l.4 of /torchnlp/_third_party/weighted_random_sampler.py - var='int_classes = int' - sed -i "4s|.*|$var|" \ - $cDir/envAI_${sysN}/lib/python${pver}/site-packages/torchnlp/_third_party/weighted_random_sampler.py -fi - -# fix IB IP config -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - sed -i -e '3,8s/^/#/' ${cDir}/envAI_${sysN}/bin/torchrun - echo """ -import re -import sys -from torch.distributed.run import main -from torch.distributed.elastic.agent.server import api as sapi - -def new_get_fq_hostname(): - return _orig_get_fq_hostname().replace('.', 'i.', 1) - -if __name__ == '__main__': - _orig_get_fq_hostname = sapi._get_fq_hostname - sapi._get_fq_hostname = new_get_fq_hostname - sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) - sys.exit(main()) -""" >> ${cDir}/envAI_${sysN}/bin/torchrun -fi - -#eof diff --git a/scripts/jureca_ddp/createEnv_MPI.sh b/scripts/jureca_ddp/createEnv_MPI.sh deleted file mode 100644 index 2b35e8f..0000000 --- a/scripts/jureca_ddp/createEnv_MPI.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash -# -*- coding: utf-8 -*- -# author: EI -# version: 221026a -# creates machine specific PyTorch with MPI support using Conda -# use compute node to compile! - -# jureca modules -ml --force purge -ml Stages/2022 GCC/11.2.0 ParaStationMPI/5.5.0-1 NCCL/2.12.7-1-CUDA-11.5 -ml cuDNN/8.3.1.22-CUDA-11.5 libaio/0.3.112 mpi-settings/CUDA CMake/3.21.1 -ml Ninja-Python/1.10.2 - -# get CUDA version in the system -CUDA_ver="$(echo $EBVERSIONCUDA 2>&1 | tr -d .)" - -# miniconda -download=false -if [ -d "$PWD/miniconda3" ];then - echo "miniconda3 already installed!" -else - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh - bash Miniconda3-latest-Linux-x86_64.sh -p $PWD/miniconda3 -b - download=true -fi - -if [ "$download" = true ] ; then - # std libs - conda install -y astunparse numpy pyyaml mkl mkl-include setuptools cffi \ - typing_extensions future six requests dataclasses Pillow --force-reinstall - - # cuda support (v11.5) - conda install -c pytorch -y magma-cuda$CUDA_ver --force-reinstall - conda install -y pkg-config libuv --force-reinstall - - # fix older library issue - cp $EBROOTGCC/lib64/libstdc++.so.6.0.29 $CONDA_PREFIX/lib/ - pushd $CONDA_PREFIX/lib/ - rm -f libstdc++.so.6 - ln -s libstdc++.so.6.0.29 libstdc++.so.6 - popd -fi - -# enable Conda env -source $PWD/miniconda3/etc/profile.d/conda.sh -conda activate - -# pytorch with mpi support -if [ -d "$PWD/pytorch/build/test.dat" ];then - echo 'pytorch already installed!' 
-else - git clone --recursive https://github.com/pytorch/pytorch pytorch - pushd pytorch - rm -rf build - git submodule sync - git submodule update --init --recursive - - # install pytorch with custom flags - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} - - mkdir tmp - export TMPDIR=$PWD/tmp - export CUDA_HOME=$CUDA_HOME - python3 setup.py clean - CMAKE_C_COMPILER=$(which mpicc) CMAKE_CXX_COMPILER=$(which mpicxx) \ - USE_DISTRIBUTED=ON USE_MPI=ON CUDA_ROOT_DIR=$EBROOTCUDA USE_CUDA=ON \ - NCCL_ROOT_DIR=$EBROOTNCCL USE_NCCL=ON USE_GLOO=ON \ - CUDNN_ROOT=$EBROOTCUDNN USE_CUDNN=ON \ - python3 setup.py install - popd -fi - -#eof diff --git a/scripts/jureca_ddp/fixed_torch_run.py b/scripts/jureca_ddp/fixed_torch_run.py deleted file mode 100644 index cca9706..0000000 --- a/scripts/jureca_ddp/fixed_torch_run.py +++ /dev/null @@ -1,51 +0,0 @@ -from argparse import ArgumentParser -import ipaddress -import runpy -import socket - -from torch.distributed.elastic.agent.server import api as sapi - - -def parse_host(): - parser = ArgumentParser() - parser.add_argument('--rdzv_endpoint') - endpoint = parser.parse_known_args()[0].rdzv_endpoint - host = ( - endpoint.split(':', 1)[0] - if endpoint - else None - ) - return host - - -def fix_torch_run(host): - _orig_get_fq_hostname = sapi._get_fq_hostname - - if host: - try: - ipaddress.ip_address(host) - is_ip = True - except ValueError: - is_ip = False - - if is_ip: - def new_get_fq_hostname(): - return socket.gethostbyaddr(host)[0] - else: - def new_get_fq_hostname(): - return socket.getfqdn(host) - else: - new_get_fq_hostname = _orig_get_fq_hostname - - sapi._get_fq_hostname = new_get_fq_hostname - - -def main(): - host = parse_host() - fix_torch_run(host) - runpy.run_module('torch.distributed.run', run_name='__main__') - - -if __name__ == '__main__': - main() - diff --git a/scripts/jureca_ddp/lamec.json b/scripts/jureca_ddp/lamec.json deleted file mode 100644 index a36ad53..0000000 --- a/scripts/jureca_ddp/lamec.json +++ /dev/null @@ -1 +0,0 @@ -{"startscript": "DDP_startscript.sh"} \ No newline at end of file diff --git a/scripts/jureca_ddp/reqs.txt b/scripts/jureca_ddp/reqs.txt deleted file mode 100755 index 2d7bb74..0000000 --- a/scripts/jureca_ddp/reqs.txt +++ /dev/null @@ -1,11 +0,0 @@ -Pillow -pyparsing -python-dateutil -matplotlib -h5py -pytorch-nlp -pyprof -filelock -scipy -perlin_noise -noise diff --git a/scripts/jureca_deepspeed/DS_config.json b/scripts/jureca_deepspeed/DS_config.json deleted file mode 100644 index ec1f022..0000000 --- a/scripts/jureca_deepspeed/DS_config.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "train_micro_batch_size_per_gpu": 96, - "gradient_accumulation_steps": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.01 - } - }, - "fp16": { - "enabled": false - }, - "zero_optimization": false -} diff --git a/scripts/jureca_deepspeed/DS_startscript_deep.sh b/scripts/jureca_deepspeed/DS_startscript_deep.sh deleted file mode 100644 index 2578883..0000000 --- a/scripts/jureca_deepspeed/DS_startscript_deep.sh +++ /dev/null @@ -1,96 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=DStest -#SBATCH --account=raise-ctp1 -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job.out -#SBATCH --error=job.err -#SBATCH --time=02:00:00 - -# configure node and process count on the CM -#SBATCH --partition=dc-gpu -#SBATCH --nodes=2 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --gpus-per-node=4 -#SBATCH --exclusive - -# gres options have to be 
disabled for deepv -#SBATCH --gres=gpu:4 - -# parameters -debug=false # do nccl debug -epochs=1 # epochs -be='nccl' # backend -lr=0.001 # learning rate -bs=2 # batch-size - -# AT -dataDir='/p/scratch/raise-ctp1/inanc2/T31_LD/' -COMMAND="DS_pytorch_AT.py" - -EXEC=$COMMAND" --batch-size $bs - --epochs $epochs - --backend $be - --nworker $SLURM_CPUS_PER_TASK - --benchrun - --data-dir $dataDir" - -# set modules -ml --force purge -ml Stages/2022 NVHPC/22.3 ParaStationMPI/5.5.0-1-mt NCCL/2.12.7-1-CUDA-11.5 cuDNN/8.3.1.22-CUDA-11.5 -ml Python/3.9.6 libaio/0.3.112 HDF5/1.12.1-serial mpi-settings/CUDA - -# set env -source /p/project/raise-ctp1/RAISE/envAI_jureca/bin/activate - -# sleep a sec -sleep 1 - -# job info -echo "DEBUG: TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# set comm -export CUDA_VISIBLE_DEVICES="0,1,2,3" -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" > 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi - -#### do not change this part -# create node-list -sysN=$(eval "scontrol show hostnames") -for i in $sysN; do - x+=\"$i\":[$CUDA_VISIBLE_DEVICES], -done -WID=`echo {${x::-1}} | base64 -w 0` - -# modify config file with parameters -sed -i "2s|.*| \"train_micro_batch_size_per_gpu\": ${bs},|" DS_config.json -sed -i "7s|.*| \"lr\": ${lr}|" DS_config.json -#### - -# launch -srun python -m deepspeed.launcher.launch \ - --node_rank $SLURM_PROCID \ - --master_addr ${SLURMD_NODENAME}i \ - --master_port 29500 \ - --world_info $WID \ - $EXEC --deepspeed_mpi --deepspeed_config DS_config.json - -# eof diff --git a/scripts/jureca_deepspeed/README.md b/scripts/jureca_deepspeed/README.md deleted file mode 100644 index d0a70f8..0000000 --- a/scripts/jureca_deepspeed/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# DL using DeepSpeed on Jureca DC - -# source -https://github.com/microsoft/DeepSpeed - -# current isues -1. - -# to-do -1. - -# usage - pip -1. clone -2. run `./createENV.sh` -3. 
submit `sbatch DS_startscript_deep.sh` diff --git a/scripts/jureca_deepspeed/createEnv.sh b/scripts/jureca_deepspeed/createEnv.sh deleted file mode 100755 index e5cc3af..0000000 --- a/scripts/jureca_deepspeed/createEnv.sh +++ /dev/null @@ -1,180 +0,0 @@ -#!/bin/bash -# -*- coding: utf-8 -*- -# author: EI -# version: 220328a -# creates machine specific python env - -# set modules -ml --force purge - -# get sys info -cDir=$PWD -sysN="$(uname -n | cut -f2- -d.)" -echo "system:${sysN}" -echo - -cont1=false -if [ "$sysN" = 'deepv' ] ; then - ml use $OTHERSTAGES - ml Stages/2022 GCC OpenMPI cuDNN NCCL Python CMake - cont1=true -elif [ "$sysN" = 'juwels' ] ; then - ml GCC ParaStationMPI Python CMake - cont1=true -elif [ "$sysN" = 'jureca' ] ; then - ml Stages/2022 NVHPC ParaStationMPI/5.5.0-1-mt Python CMake NCCL/2.11.4-CUDA-11.5 cuDNN libaio HDF5 PnetCDF mpi-settings/CUDA - cont1=true -else - echo - echo 'unknown system detected' - echo 'canceling' - echo -fi -echo "modules loaded" -echo - -# get python version -pver="$(python --version 2>&1 | awk {'print $2'} | cut -f1-2 -d.)" -echo "python version is ${pver}" -echo - -# create env -if [ "$cont1" = true ] ; then - if [ -d "${cDir}/envAI_${sysN}" ];then - echo 'env already exist' - echo - - source envAI_${sysN}/bin/activate - else - python3 -m venv envAI_${sysN} - - # get headers for pip - if [ -f "${cDir}/envAI_${sysN}/bin/pip3" ]; then - echo 'pip already exist' - else - cp "$(which pip3)" $cDir/envAI_${sysN}/bin/ - ln -s $cDir/envAI_${sysN}/bin/pip3 $cDir/envAI_${sysN}/bin/pip${pver} - var="#!$cDir/envAI_${sysN}/bin/python${pver}" - sed -i "1s|.*|$var|" $cDir/envAI_${sysN}/bin/pip3 - fi - - # activate env - source envAI_${sysN}/bin/activate - - echo "a new env is created in ${cDir}" - echo "activation is done via:" - echo "source ${cDir}/envAI_${sysN}/bin/activate" - fi -fi - -# install torch -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - echo 'Torch already installed' - echo -else - export TMPDIR=${cDir} - - pip3 install \ - torch==1.11.0+cu115 torchvision==0.12.0+cu115 torchaudio==0.11.0+cu115 -f \ - https://download.pytorch.org/whl/cu115/torch_stable.html --no-cache-dir -fi - -# install horovod -if [ -f "${cDir}/envAI_${sysN}/bin/horovodrun" ]; then - echo 'Horovod already installed' - echo -else - export HOROVOD_WITH_MPI=1 - export HOROVOD_MPI_THREADS_DISABLE=1 - export HOROVOD_GPU=CUDA - export HOROVOD_GPU_OPERATIONS=NCCL - export HOROVOD_CUDA_HOME=$EBROOTCUDA - export HOROVOD_NCCL_HOME=$EBROOTNCCL - export HOROVOD_WITH_PYTORCH=1 - export TMPDIR=${cDir} - - pip3 install --no-cache-dir horovod --ignore-installed -fi - -# install deepspeed -if [ -f "${cDir}/envAI_${sysN}/bin/deepspeed" ]; then - echo 'DeepSpeed already installed' - echo -else - # compile all opt. stuff - not needed & not working - #export DS_BUILD_OPS=1 - # compile req. opt. 
stuff - export DS_BUILD_FUSED_ADAM=1 - export DS_BUILD_UTILS=1 - if [ "$sysN" = 'deepv' ] ; then - #fix libaio issues via: - export DS_BUILD_AIO=0 - fi - export TMPDIR=${cDir} - - pip3 install --no-cache-dir DeepSpeed - - # add this to .../deepspeed/launcher/launch.py l.93 - var=' args.node_rank=int(os.environ.get("SLURM_PROCID",0))' - sed -i "93s|.*|$var|" $cDir/envAI_${sysN}/lib/python${pver}/site-packages/deepspeed/launcher/launch.py -fi - -# install heat -if [ -d "${cDir}/envAI_${sysN}/lib/python${pver}/site-packages/heat" ]; then - echo 'HeAT already installed' - echo -else - export TMPDIR=${cDir} - export CFLAGS="-noswitcherror" - export CXXFLAGS="-noswitcherror" - - # need to modify setup.py to accep torch>1.9 for heat - wget https://files.pythonhosted.org/packages/5d/3a/4781f1e6910753bfdfa6712c83c732c60e675d8de14983926a0d9306c7a6/heat-1.1.1.tar.gz - tar xzf heat-1.1.1.tar.gz - var=' "torch>=1.7.0",' - sed -i "36s|.*|$var|" heat-1.1.1/setup.py - var=' "torchvision>=0.8.0",' - sed -i "39s|.*|$var|" heat-1.1.1/setup.py - - # create tar again! - rm -rf heat-1.1.1.tar.gz - tar czf heat-1.1.1.tar.gz heat-1.1.1 - rm -rf heat-1.1.1 - - pip3 install --no-cache-dir 'heat-1.1.1.tar.gz[hdf5,netcdf]' - - rm -rf heat-1.1.1.tar.gz -fi - -# get rest of the libraries$ -if [ "$cont1" = true ] ; then - # install rest - pip3 install -r reqs.txt --ignore-installed - - # modify l.4 of /torchnlp/_third_party/weighted_random_sampler.py - var='int_classes = int' - sed -i "4s|.*|$var|" \ - $cDir/envAI_${sysN}/lib/python${pver}/site-packages/torchnlp/_third_party/weighted_random_sampler.py -fi - -# fix IB IP config -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - sed -i -e '3,8s/^/#/' ${cDir}/envAI_${sysN}/bin/torchrun - echo """ -import re -import sys -from torch.distributed.run import main -from torch.distributed.elastic.agent.server import api as sapi - -def new_get_fq_hostname(): - return _orig_get_fq_hostname().replace('.', 'i.', 1) - -if __name__ == '__main__': - _orig_get_fq_hostname = sapi._get_fq_hostname - sapi._get_fq_hostname = new_get_fq_hostname - sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) - sys.exit(main()) -""" >> ${cDir}/envAI_${sysN}/bin/torchrun -fi - -#eof diff --git a/scripts/jureca_deepspeed/lamec.json b/scripts/jureca_deepspeed/lamec.json deleted file mode 100644 index b1572ed..0000000 --- a/scripts/jureca_deepspeed/lamec.json +++ /dev/null @@ -1 +0,0 @@ -{"startscript": "DS_startscript_deep.sh"} \ No newline at end of file diff --git a/scripts/jureca_deepspeed/reqs.txt b/scripts/jureca_deepspeed/reqs.txt deleted file mode 100755 index 8d48886..0000000 --- a/scripts/jureca_deepspeed/reqs.txt +++ /dev/null @@ -1,8 +0,0 @@ -Pillow -pyparsing -python-dateutil -matplotlib -h5py -pytorch-nlp -pyprof -filelock diff --git a/scripts/jureca_graphcore/GC_pytorch_mnist.py b/scripts/jureca_graphcore/GC_pytorch_mnist.py deleted file mode 100644 index 438cebd..0000000 --- a/scripts/jureca_graphcore/GC_pytorch_mnist.py +++ /dev/null @@ -1,346 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# author: EI -# version: 220615a - -# std libs -import argparse, sys, os, time, numpy as np, random -from tqdm import tqdm - -# ml libs -import torch -import torch.distributed as dist -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -from torchvision import datasets, transforms - -# Graphcore (GC) additions -import poptorch - -# parsed settings -def pars_ini(): - global args - parser = argparse.ArgumentParser(description='PyTorch MNIST 
Example') - - # IO parsers - parser.add_argument('--data-dir', default='./', - help='location of the training dataset in the local filesystem') - parser.add_argument('--restart-int', type=int, default=10, - help='restart interval per epoch (default: 10)') - - # model parsers - parser.add_argument('--batch-size', type=int, default=64, - help='input batch size for training (default: 64)') - parser.add_argument('--test-batch-size', type=int, default=64, - help='input batch size for testing (default: 64)') - parser.add_argument('--epochs', type=int, default=10, - help='number of epochs to train (default: 10)') - parser.add_argument('--lr', type=float, default=0.01, - help='learning rate (default: 0.01)') - parser.add_argument('--concM', type=int, default=100, - help='conc MNIST to this factor (default: 1)') - parser.add_argument('--momentum', type=float, default=0.5, - help='momentum in SGD optimizer (default: 0.5)') - parser.add_argument('--shuff', action='store_true', default=False, - help='shuffle dataset (default: False)') - - # debug parsers - parser.add_argument('--testrun', action='store_true', default=False, - help='do a test run with seed (default: False)') - parser.add_argument('--nseed', type=int, default=0, - help='seed integer for reproducibility (default: 0)') - parser.add_argument('--log-int', type=int, default=10, - help='log interval per training') - - # parallel parsers - parser.add_argument('--nworker', type=int, default=0, - help='number of workers in DataLoader (default: 0 - only main)') - parser.add_argument('--prefetch', type=int, default=2, - help='prefetch data in DataLoader (default: 2)') - parser.add_argument('--benchrun', action='store_true', default=False, - help='do a bench run w/o IO (default: False)') - - # GC parsers - """ - Device iteration defines the number of iterations the device should - run over the data before returning to the user. - This is equivalent to running the IPU in a loop over that the specified - number of iterations, with a new batch of data each time. However, increasing - deviceIterations is more efficient because the loop runs on the IPU directly. - """ - parser.add_argument('--device-iterations', type=int, default=50, - help='check code! 
(default: 50)') - - args = parser.parse_args() - - # set minimum of 3 epochs when benchmarking (last epoch produces logs) - args.epochs = 3 if args.epochs < 3 and args.benchrun else args.epochs - -# network -class Block(nn.Module): - def __init__(self, in_channels, num_filters, kernel_size, pool_size): - super(Block, self).__init__() - self.conv = nn.Conv2d(in_channels, - num_filters, - kernel_size=kernel_size) - self.pool = nn.MaxPool2d(kernel_size=pool_size) - self.relu = nn.ReLU() - - def forward(self, x): - x = self.conv(x) - x = self.pool(x) - x = self.relu(x) - return x - -class Network(nn.Module): - def __init__(self): - super(Network, self).__init__() - self.layer1 = Block(1, 10, 5, 2) - self.layer2 = Block(10, 20, 5, 2) - self.layer3 = nn.Linear(320, 50) - self.layer3_act = nn.ReLU() - self.layer3_dropout = torch.nn.Dropout(0.5) - self.layer4 = nn.Linear(50, 10) - # GC - loss is defined in the network - self.loss = nn.NLLLoss() - - def forward(self, x, labels=None): - x = self.layer1(x) - x = self.layer2(x) - x = x.view(-1, 320) - x = self.layer3_act(self.layer3(x)) - x = self.layer4(self.layer3_dropout(x)) - x = nn.functional.log_softmax(x) - if self.training: - return x, self.loss(x, labels) - return x - -# train loop - GC -def train(model, train_loader, epoch): - model.train() - t_list = [] - loss_acc=0 - for batch_idx, (data, target) in enumerate(train_loader): - t = time.perf_counter() - pred,loss = model(data,target) - if batch_idx % args.log_int == 0: - print( - f'Train epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ' - f'({100.0 * batch_idx / len(train_loader):.0f}%)]\t\tLoss: {loss.item():.6f}') - t_list.append(time.perf_counter() - t) - loss_acc+= loss.item() - print('TIMER: train time', sum(t_list) / len(t_list),'s') - return loss_acc - -# test loop - GC -def test(model, test_loader): - model.eval() - test_loss = 0 - for data, labels in test_loader: - output = model(data) - test_loss += accuracy(output, labels) - print('Accuracy on test set: {:0.2f}%'.format(test_loss / len(test_loader)),'\n') - -def accuracy(predictions, labels): - _, ind = torch.max(predictions, 1) - labels = labels[-predictions.size()[0]:] - accuracy = torch.sum(torch.eq(ind, labels)).item() / labels.size()[0] * 100.0 - return accuracy - -# save state of the training -def save_state(model,res_name,is_best): - if is_best: - rt = time.time() - torch.save(model.state_dict(),'./'+res_name) - print(f'DEBUG: state is saved') - -# main -def main(): - # get parse args - pars_ini() - - # get directory - program_dir = os.getcwd() - - # start the time.time for profiling - st = time.time() - - # deterministic testrun - if args.testrun: - torch.manual_seed(args.nseed) - g = torch.Generator() - g.manual_seed(args.nseed) - - # some debug - print('TIMER: initialise:', time.time()-st, 's') - print('DEBUG: sys.version:',sys.version,'\n') - - print('DEBUG: IO parsers:') - print('DEBUG: args.data_dir:',args.data_dir) - print('DEBUG: args.restart_int:',args.restart_int,'\n') - - print('DEBUG: model parsers:') - print('DEBUG: args.batch_size:',args.batch_size) - print('DEBUG: args.test_batch_size:',args.test_batch_size) - print('DEBUG: args.epochs:',args.epochs) - print('DEBUG: args.lr:',args.lr) - print('DEBUG: args.concM:',args.concM) - print('DEBUG: args.momentum:',args.momentum) - print('DEBUG: args.shuff:',args.shuff,'\n') - - print('DEBUG: debug parsers:') - print('DEBUG: args.testrun:',args.testrun) - print('DEBUG: args.nseed:',args.nseed) - print('DEBUG: args.log_int:',args.log_int,'\n') - - 
print('DEBUG: parallel parsers:') - print('DEBUG: args.nworker:',args.nworker) - print('DEBUG: args.prefetch:',args.prefetch) - print('DEBUG: args.benchrun:',args.benchrun,'\n') - - print('DEBUG: GC parsers:') - print('DEBUG: args.device_iterations:',args.device_iterations,'\n') - -# load datasets - data_dir = args.data_dir - mnist_scale = args.concM - largeData = [] - for i in range(mnist_scale): - largeData.append( - datasets.MNIST(data_dir, train=True, download=False, - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) - ) - - # concat data - training_dataset = torch.utils.data.ConcatDataset(largeData) - - mnist_scale = args.concM - largeData = [] - for i in range(mnist_scale): - largeData.append( - datasets.MNIST(data_dir, train=False, download=False, - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) - ) - - # concat data - test_dataset = torch.utils.data.ConcatDataset(largeData) - -# GC - set training options - """ - To accelerate the training deviceIterations=50 is set - data loader will pick 50 batches of data per step. - """ - training_opts = poptorch.Options() - training_opts.deviceIterations(args.device_iterations) - -# GC - data loader provided by PopTorch - args.shuff = args.shuff and not args.testrun - train_loader = poptorch.DataLoader( - options=training_opts, - dataset=training_dataset, - batch_size=args.batch_size, - shuffle=args.shuff, - drop_last=True, - num_workers=args.nworker - ) - - """ - A `poptorch.Options()` instance contains a set of default hyperparameters and options for the IPU. - """ - test_loader = poptorch.DataLoader( - options=poptorch.Options(), - dataset=test_dataset, - batch_size=args.test_batch_size, - num_workers=args.nworker - ) - - print('TIMER: read and concat data:', time.time()-st, 's') - -# create CNN model - model = Network() - -# optimizer - optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) - -# GC - distribute model to IPU - train_model = poptorch.trainingModel( - model, - training_opts, - optimizer=optimizer - ) - -# GC - distribute model to IPU w/o training options (for testing) - test_model = poptorch.inferenceModel(model,options=poptorch.Options()) - -# resume state if any - best_acc = np.Inf - res_name='checkpoint.pth.tar' - start_epoch = 1 - if os.path.isfile(res_name) and not args.benchrun: - try: - checkpoint = torch.load(program_dir+'/'+res_name) - start_epoch = checkpoint['epoch'] - print(f'WARNING: restarting from {start_epoch} epoch') - except: - print(f'WARNING: restart file cannot be loaded, restarting!') - - if start_epoch>=args.epochs+1: - print(f'WARNING: given epochs are less than the one in the restart file!\n' - f'WARNING: SYS.EXIT is issued') - sys.exit() - -# start trainin/testing loop - print('TIMER: initialization:', time.time()-st, 's') - print(f'\nDEBUG: start training') - print(f'--------------------------------------------------------') - - et = time.time() - for epoch in range(start_epoch, args.epochs + 1): - lt = time.time() - - # GC - combines forward + backward - loss_acc = train(train_model, train_loader, epoch) - - # GC - testing - acc_test = test(test_model, test_loader) - - # save first epoch timer - if epoch == start_epoch: - first_ep_t = time.time()-lt - - print('TIMER: epoch time:', time.time()-lt, 's') - -# GC - unload models from IPU - train_model.detachFromDevice() - test_model.detachFromDevice() - -# save final state - if not args.benchrun: - 
save_state(train_model,res_name,True) - - # some debug - print(f'\n--------------------------------------------------------') - print('DEBUG: training results:\n') - print('TIMER: first epoch time:', first_ep_t, ' s') - print('TIMER: last epoch time:', time.time()-lt, ' s') - print('TIMER: average epoch time:', (time.time()-et)/args.epochs, ' s') - print('TIMER: total epoch time:', time.time()-et, ' s') - if epoch > 1: - print('TIMER: total epoch-1 time:', time.time()-et-first_ep_t, ' s') - print('TIMER: average epoch-1 time:', (time.time()-et-first_ep_t)/(args.epochs-1), ' s') - if args.benchrun: - print('TIMER: total epoch-2 time:', lt-first_ep_t, ' s') - print('TIMER: average epoch-2 time:', (lt-first_ep_t)/(args.epochs-2), ' s') - -if __name__ == "__main__": - main() - sys.exit() - -#eof diff --git a/scripts/jureca_graphcore/GC_startscript.sh b/scripts/jureca_graphcore/GC_startscript.sh deleted file mode 100644 index 1c9f928..0000000 --- a/scripts/jureca_graphcore/GC_startscript.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=GC_test -#SBATCH --account=zam -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job.out -#SBATCH --error=job.err -#SBATCH --time=01:00:00 - -# configure node and process count on the CM -#SBATCH --partition=dc-ipu -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=64 -#SBATCH --exclusive - -srun apptainer run pytorch.sif -- python3 \ - ./GC_pytorch_mnist.py \ - --data-dir /p/scratch/raise-ctp1/data_MNIST/ \ - --nworker $SLURM_CPUS_PER_TASK \ - --concM 100 - -# eof diff --git a/scripts/jureca_graphcore/README.md b/scripts/jureca_graphcore/README.md deleted file mode 100644 index dafa0b7..0000000 --- a/scripts/jureca_graphcore/README.md +++ /dev/null @@ -1,28 +0,0 @@ -# DL using Graphcore IPU - -# Graphcore PyTorch documentation -https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/pytorch_to_poptorch.html# - -# jureca user documentation -https://apps.fz-juelich.de/jsc/hps/jureca/index.html - -# current isues -1. no parallel training - -# to-do -1. implement parallelization - -# done -1. initial mnist tests show 8x better performance than A100 - -# usage -apptainer is used for the containers -0. to use containers in Jureca, (if not done!) from JuDoor, click "Request access to restricted software", then "Access to other restricted software", and accept the agreement! ! finally, reset ssh -1. pull Graphcore SDK `apptainer pull poplar.sif docker://docker.io/graphcore/poplar:2.4.0` -2. build Graphcore SDK with PyTorch `apptainer build pytorch.sif docker://docker.io/graphcore/pytorch` \ -this comes with Torch-1.10.0 -3. additional libraries are needed: \ -`apptainer shell pytorch.sif` -`> pip3 install torchvision==1.11.0 tqdm h5py --user` -`> exit` -4. 
submit `sbatch GC_startscript.sh` diff --git a/scripts/jureca_graphcore/lamec.json b/scripts/jureca_graphcore/lamec.json deleted file mode 100644 index fe05ab6..0000000 --- a/scripts/jureca_graphcore/lamec.json +++ /dev/null @@ -1 +0,0 @@ -{"startscript": "GC_startscript.sh"} \ No newline at end of file diff --git a/scripts/jureca_heat/HeAT_startscript_deep.sh b/scripts/jureca_heat/HeAT_startscript_deep.sh deleted file mode 100644 index a48cb92..0000000 --- a/scripts/jureca_heat/HeAT_startscript_deep.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=Heattest -#SBATCH --account=raise-ctp1 -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job.out -#SBATCH --error=job.err -#SBATCH --time=00:30:00 - -# configure node and process count on the CM -#SBATCH --partition=dc-gpu -#SBATCH --nodes=2 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --gpus-per-node=4 -#SBATCH --exclusive - -# gres options have to be disabled for deepv -#SBATCH --gres=gpu:4 - -# parameters -debug=false -bs=3 -epochs=1 -lr=0.0001 -dataDir='/p/scratch/raise-ctp1/T31/' -COMMAND="HeAT_pytorch_AT.py - --batch-size $bs --epochs $epochs --lr $lr --nworker $SLURM_CPUS_PER_TASK --data-dir $dataDir" - -# command to exec -echo "DEBUG: EXECUTE=$COMMAND" - -# set modules -ml --force purge -ml Stages/2022 NVHPC/22.3 ParaStationMPI/5.5.0-1-mt NCCL/2.12.7-1-CUDA-11.5 cuDNN/8.3.1.22-CUDA-11.5 -ml Python/3.9.6 libaio/0.3.112 HDF5/1.12.1-serial mpi-settings/CUDA - -# set env -source /p/project/raise-ctp1/RAISE/envAI_jureca/bin/activate - -# sleep a sec -sleep 1 - -# job info -echo "DEBUG: TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# set comm -export CUDA_VISIBLE_DEVICES="0,1,2,3" -if [ "$SLURM_CPUS_PER_TASK" > 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi - -# execute -srun --cpu-bind=none python3 -u $COMMAND - -# eof diff --git a/scripts/jureca_heat/README.md b/scripts/jureca_heat/README.md deleted file mode 100644 index c3c4afd..0000000 --- a/scripts/jureca_heat/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# DL using HeAT/PyTorch on deepv - -# source -https://github.com/helmholtz-analytics/heat - -# current isues -1. - -# to-do -1. - -# usage - pip -1. clone -2. run `./createENV.sh` -3. 
submit `sbatch HeAT_startscript_deep.sh` diff --git a/scripts/jureca_heat/createEnv.sh b/scripts/jureca_heat/createEnv.sh deleted file mode 100755 index e5cc3af..0000000 --- a/scripts/jureca_heat/createEnv.sh +++ /dev/null @@ -1,180 +0,0 @@ -#!/bin/bash -# -*- coding: utf-8 -*- -# author: EI -# version: 220328a -# creates machine specific python env - -# set modules -ml --force purge - -# get sys info -cDir=$PWD -sysN="$(uname -n | cut -f2- -d.)" -echo "system:${sysN}" -echo - -cont1=false -if [ "$sysN" = 'deepv' ] ; then - ml use $OTHERSTAGES - ml Stages/2022 GCC OpenMPI cuDNN NCCL Python CMake - cont1=true -elif [ "$sysN" = 'juwels' ] ; then - ml GCC ParaStationMPI Python CMake - cont1=true -elif [ "$sysN" = 'jureca' ] ; then - ml Stages/2022 NVHPC ParaStationMPI/5.5.0-1-mt Python CMake NCCL/2.11.4-CUDA-11.5 cuDNN libaio HDF5 PnetCDF mpi-settings/CUDA - cont1=true -else - echo - echo 'unknown system detected' - echo 'canceling' - echo -fi -echo "modules loaded" -echo - -# get python version -pver="$(python --version 2>&1 | awk {'print $2'} | cut -f1-2 -d.)" -echo "python version is ${pver}" -echo - -# create env -if [ "$cont1" = true ] ; then - if [ -d "${cDir}/envAI_${sysN}" ];then - echo 'env already exist' - echo - - source envAI_${sysN}/bin/activate - else - python3 -m venv envAI_${sysN} - - # get headers for pip - if [ -f "${cDir}/envAI_${sysN}/bin/pip3" ]; then - echo 'pip already exist' - else - cp "$(which pip3)" $cDir/envAI_${sysN}/bin/ - ln -s $cDir/envAI_${sysN}/bin/pip3 $cDir/envAI_${sysN}/bin/pip${pver} - var="#!$cDir/envAI_${sysN}/bin/python${pver}" - sed -i "1s|.*|$var|" $cDir/envAI_${sysN}/bin/pip3 - fi - - # activate env - source envAI_${sysN}/bin/activate - - echo "a new env is created in ${cDir}" - echo "activation is done via:" - echo "source ${cDir}/envAI_${sysN}/bin/activate" - fi -fi - -# install torch -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - echo 'Torch already installed' - echo -else - export TMPDIR=${cDir} - - pip3 install \ - torch==1.11.0+cu115 torchvision==0.12.0+cu115 torchaudio==0.11.0+cu115 -f \ - https://download.pytorch.org/whl/cu115/torch_stable.html --no-cache-dir -fi - -# install horovod -if [ -f "${cDir}/envAI_${sysN}/bin/horovodrun" ]; then - echo 'Horovod already installed' - echo -else - export HOROVOD_WITH_MPI=1 - export HOROVOD_MPI_THREADS_DISABLE=1 - export HOROVOD_GPU=CUDA - export HOROVOD_GPU_OPERATIONS=NCCL - export HOROVOD_CUDA_HOME=$EBROOTCUDA - export HOROVOD_NCCL_HOME=$EBROOTNCCL - export HOROVOD_WITH_PYTORCH=1 - export TMPDIR=${cDir} - - pip3 install --no-cache-dir horovod --ignore-installed -fi - -# install deepspeed -if [ -f "${cDir}/envAI_${sysN}/bin/deepspeed" ]; then - echo 'DeepSpeed already installed' - echo -else - # compile all opt. stuff - not needed & not working - #export DS_BUILD_OPS=1 - # compile req. opt. 
stuff - export DS_BUILD_FUSED_ADAM=1 - export DS_BUILD_UTILS=1 - if [ "$sysN" = 'deepv' ] ; then - #fix libaio issues via: - export DS_BUILD_AIO=0 - fi - export TMPDIR=${cDir} - - pip3 install --no-cache-dir DeepSpeed - - # add this to .../deepspeed/launcher/launch.py l.93 - var=' args.node_rank=int(os.environ.get("SLURM_PROCID",0))' - sed -i "93s|.*|$var|" $cDir/envAI_${sysN}/lib/python${pver}/site-packages/deepspeed/launcher/launch.py -fi - -# install heat -if [ -d "${cDir}/envAI_${sysN}/lib/python${pver}/site-packages/heat" ]; then - echo 'HeAT already installed' - echo -else - export TMPDIR=${cDir} - export CFLAGS="-noswitcherror" - export CXXFLAGS="-noswitcherror" - - # need to modify setup.py to accep torch>1.9 for heat - wget https://files.pythonhosted.org/packages/5d/3a/4781f1e6910753bfdfa6712c83c732c60e675d8de14983926a0d9306c7a6/heat-1.1.1.tar.gz - tar xzf heat-1.1.1.tar.gz - var=' "torch>=1.7.0",' - sed -i "36s|.*|$var|" heat-1.1.1/setup.py - var=' "torchvision>=0.8.0",' - sed -i "39s|.*|$var|" heat-1.1.1/setup.py - - # create tar again! - rm -rf heat-1.1.1.tar.gz - tar czf heat-1.1.1.tar.gz heat-1.1.1 - rm -rf heat-1.1.1 - - pip3 install --no-cache-dir 'heat-1.1.1.tar.gz[hdf5,netcdf]' - - rm -rf heat-1.1.1.tar.gz -fi - -# get rest of the libraries$ -if [ "$cont1" = true ] ; then - # install rest - pip3 install -r reqs.txt --ignore-installed - - # modify l.4 of /torchnlp/_third_party/weighted_random_sampler.py - var='int_classes = int' - sed -i "4s|.*|$var|" \ - $cDir/envAI_${sysN}/lib/python${pver}/site-packages/torchnlp/_third_party/weighted_random_sampler.py -fi - -# fix IB IP config -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - sed -i -e '3,8s/^/#/' ${cDir}/envAI_${sysN}/bin/torchrun - echo """ -import re -import sys -from torch.distributed.run import main -from torch.distributed.elastic.agent.server import api as sapi - -def new_get_fq_hostname(): - return _orig_get_fq_hostname().replace('.', 'i.', 1) - -if __name__ == '__main__': - _orig_get_fq_hostname = sapi._get_fq_hostname - sapi._get_fq_hostname = new_get_fq_hostname - sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) - sys.exit(main()) -""" >> ${cDir}/envAI_${sysN}/bin/torchrun -fi - -#eof diff --git a/scripts/jureca_heat/lamec.json b/scripts/jureca_heat/lamec.json deleted file mode 100644 index d1bf1b2..0000000 --- a/scripts/jureca_heat/lamec.json +++ /dev/null @@ -1 +0,0 @@ -{"startscript": "HeAT_startscript_deep.sh"} \ No newline at end of file diff --git a/scripts/jureca_heat/reqs.txt b/scripts/jureca_heat/reqs.txt deleted file mode 100755 index 8d48886..0000000 --- a/scripts/jureca_heat/reqs.txt +++ /dev/null @@ -1,8 +0,0 @@ -Pillow -pyparsing -python-dateutil -matplotlib -h5py -pytorch-nlp -pyprof -filelock diff --git a/scripts/jureca_horovod/Hor_startscript_deep.sh b/scripts/jureca_horovod/Hor_startscript_deep.sh deleted file mode 100644 index 315640e..0000000 --- a/scripts/jureca_horovod/Hor_startscript_deep.sh +++ /dev/null @@ -1,82 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=HorTest -#SBATCH --account=raise-ctp1 -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job.out -#SBATCH --error=job.err -#SBATCH --time=00:15:00 - -# configure node and process count on the CM -#SBATCH --partition=dc-gpu -#SBATCH --nodes=2 -#SBATCH --ntasks-per-node=4 -#SBATCH --cpus-per-task=32 -#SBATCH --gpus-per-node=4 -#SBATCH --exclusive - -# gres options have to be disabled for deepv -#SBATCH --gres=gpu:4 - -# command to exec -debug=false # do nccl debug 
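# note: the parameters below are collected into the EXEC string and handed to srun at the bottom of
# this script; setting debug=true additionally exports NCCL_DEBUG=INFO before the launch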
-bs=2 # batch-size -epochs=10 # epochs -lr=0.01 # learning rate - -dataDir='/p/scratch/raise-ctp1/T31_LD/' -COMMAND="Hor_pytorch_AT.py" -EXEC=$COMMAND" --batch-size $bs - --epochs $epochs - --lr $lr - --nworker $SLURM_CPUS_PER_TASK - --data-dir $dataDir" - -# set modules -ml --force purge -ml Stages/2022 NVHPC/22.3 ParaStationMPI/5.5.0-1-mt NCCL/2.12.7-1-CUDA-11.5 cuDNN/8.3.1.22-CUDA-11.5 -ml Python/3.9.6 libaio/0.3.112 HDF5/1.12.1-serial mpi-settings/CUDA - -# set env -source /p/project/raise-ctp1/RAISE/envAI_jureca/bin/activate - -# sleep a sec -sleep 1 - -# job info -echo "DEBUG: TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# set comm -export CUDA_VISIBLE_DEVICES="0,1,2,3" -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" > 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi - -# launch -srun --cpu-bind=none python3 -u $EXEC - -# nsys profiler: following https://gist.github.com/mcarilli/376821aa1a7182dfcf59928a7cde3223 -#srun --cpu-bind=none nsys profile \ -# --trace=cublas,cuda,cudnn,nvtx,osrt \ -# --sample=cpu \ -# --stats=true \ -# --force-overwrite=true \ -# -o ./prof.out python3 -u $EXEC - -# eof diff --git a/scripts/jureca_horovod/README.md b/scripts/jureca_horovod/README.md deleted file mode 100644 index 90520a3..0000000 --- a/scripts/jureca_horovod/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# DL using Horovod on Jureca DC - -# source -https://github.com/horovod/horovod - -# current isues -1. mpi-settings/CUDA is only available via NVHPC/ParaStationMPI - -# to-do -1. wait for GCC - -# usage - pip -1. clone -2. run `./createENV.sh` -3. 
submit `sbatch Hor_startscript_deep.sh` diff --git a/scripts/jureca_horovod/createEnv.sh b/scripts/jureca_horovod/createEnv.sh deleted file mode 100755 index e5cc3af..0000000 --- a/scripts/jureca_horovod/createEnv.sh +++ /dev/null @@ -1,180 +0,0 @@ -#!/bin/bash -# -*- coding: utf-8 -*- -# author: EI -# version: 220328a -# creates machine specific python env - -# set modules -ml --force purge - -# get sys info -cDir=$PWD -sysN="$(uname -n | cut -f2- -d.)" -echo "system:${sysN}" -echo - -cont1=false -if [ "$sysN" = 'deepv' ] ; then - ml use $OTHERSTAGES - ml Stages/2022 GCC OpenMPI cuDNN NCCL Python CMake - cont1=true -elif [ "$sysN" = 'juwels' ] ; then - ml GCC ParaStationMPI Python CMake - cont1=true -elif [ "$sysN" = 'jureca' ] ; then - ml Stages/2022 NVHPC ParaStationMPI/5.5.0-1-mt Python CMake NCCL/2.11.4-CUDA-11.5 cuDNN libaio HDF5 PnetCDF mpi-settings/CUDA - cont1=true -else - echo - echo 'unknown system detected' - echo 'canceling' - echo -fi -echo "modules loaded" -echo - -# get python version -pver="$(python --version 2>&1 | awk {'print $2'} | cut -f1-2 -d.)" -echo "python version is ${pver}" -echo - -# create env -if [ "$cont1" = true ] ; then - if [ -d "${cDir}/envAI_${sysN}" ];then - echo 'env already exist' - echo - - source envAI_${sysN}/bin/activate - else - python3 -m venv envAI_${sysN} - - # get headers for pip - if [ -f "${cDir}/envAI_${sysN}/bin/pip3" ]; then - echo 'pip already exist' - else - cp "$(which pip3)" $cDir/envAI_${sysN}/bin/ - ln -s $cDir/envAI_${sysN}/bin/pip3 $cDir/envAI_${sysN}/bin/pip${pver} - var="#!$cDir/envAI_${sysN}/bin/python${pver}" - sed -i "1s|.*|$var|" $cDir/envAI_${sysN}/bin/pip3 - fi - - # activate env - source envAI_${sysN}/bin/activate - - echo "a new env is created in ${cDir}" - echo "activation is done via:" - echo "source ${cDir}/envAI_${sysN}/bin/activate" - fi -fi - -# install torch -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - echo 'Torch already installed' - echo -else - export TMPDIR=${cDir} - - pip3 install \ - torch==1.11.0+cu115 torchvision==0.12.0+cu115 torchaudio==0.11.0+cu115 -f \ - https://download.pytorch.org/whl/cu115/torch_stable.html --no-cache-dir -fi - -# install horovod -if [ -f "${cDir}/envAI_${sysN}/bin/horovodrun" ]; then - echo 'Horovod already installed' - echo -else - export HOROVOD_WITH_MPI=1 - export HOROVOD_MPI_THREADS_DISABLE=1 - export HOROVOD_GPU=CUDA - export HOROVOD_GPU_OPERATIONS=NCCL - export HOROVOD_CUDA_HOME=$EBROOTCUDA - export HOROVOD_NCCL_HOME=$EBROOTNCCL - export HOROVOD_WITH_PYTORCH=1 - export TMPDIR=${cDir} - - pip3 install --no-cache-dir horovod --ignore-installed -fi - -# install deepspeed -if [ -f "${cDir}/envAI_${sysN}/bin/deepspeed" ]; then - echo 'DeepSpeed already installed' - echo -else - # compile all opt. stuff - not needed & not working - #export DS_BUILD_OPS=1 - # compile req. opt. 
stuff - export DS_BUILD_FUSED_ADAM=1 - export DS_BUILD_UTILS=1 - if [ "$sysN" = 'deepv' ] ; then - #fix libaio issues via: - export DS_BUILD_AIO=0 - fi - export TMPDIR=${cDir} - - pip3 install --no-cache-dir DeepSpeed - - # add this to .../deepspeed/launcher/launch.py l.93 - var=' args.node_rank=int(os.environ.get("SLURM_PROCID",0))' - sed -i "93s|.*|$var|" $cDir/envAI_${sysN}/lib/python${pver}/site-packages/deepspeed/launcher/launch.py -fi - -# install heat -if [ -d "${cDir}/envAI_${sysN}/lib/python${pver}/site-packages/heat" ]; then - echo 'HeAT already installed' - echo -else - export TMPDIR=${cDir} - export CFLAGS="-noswitcherror" - export CXXFLAGS="-noswitcherror" - - # need to modify setup.py to accep torch>1.9 for heat - wget https://files.pythonhosted.org/packages/5d/3a/4781f1e6910753bfdfa6712c83c732c60e675d8de14983926a0d9306c7a6/heat-1.1.1.tar.gz - tar xzf heat-1.1.1.tar.gz - var=' "torch>=1.7.0",' - sed -i "36s|.*|$var|" heat-1.1.1/setup.py - var=' "torchvision>=0.8.0",' - sed -i "39s|.*|$var|" heat-1.1.1/setup.py - - # create tar again! - rm -rf heat-1.1.1.tar.gz - tar czf heat-1.1.1.tar.gz heat-1.1.1 - rm -rf heat-1.1.1 - - pip3 install --no-cache-dir 'heat-1.1.1.tar.gz[hdf5,netcdf]' - - rm -rf heat-1.1.1.tar.gz -fi - -# get rest of the libraries$ -if [ "$cont1" = true ] ; then - # install rest - pip3 install -r reqs.txt --ignore-installed - - # modify l.4 of /torchnlp/_third_party/weighted_random_sampler.py - var='int_classes = int' - sed -i "4s|.*|$var|" \ - $cDir/envAI_${sysN}/lib/python${pver}/site-packages/torchnlp/_third_party/weighted_random_sampler.py -fi - -# fix IB IP config -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - sed -i -e '3,8s/^/#/' ${cDir}/envAI_${sysN}/bin/torchrun - echo """ -import re -import sys -from torch.distributed.run import main -from torch.distributed.elastic.agent.server import api as sapi - -def new_get_fq_hostname(): - return _orig_get_fq_hostname().replace('.', 'i.', 1) - -if __name__ == '__main__': - _orig_get_fq_hostname = sapi._get_fq_hostname - sapi._get_fq_hostname = new_get_fq_hostname - sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) - sys.exit(main()) -""" >> ${cDir}/envAI_${sysN}/bin/torchrun -fi - -#eof diff --git a/scripts/jureca_horovod/lamec.json b/scripts/jureca_horovod/lamec.json deleted file mode 100644 index 4aff71d..0000000 --- a/scripts/jureca_horovod/lamec.json +++ /dev/null @@ -1 +0,0 @@ -{"startscript": "Hor_startscript_deep.sh"} \ No newline at end of file diff --git a/scripts/jureca_horovod/reqs.txt b/scripts/jureca_horovod/reqs.txt deleted file mode 100755 index 8d48886..0000000 --- a/scripts/jureca_horovod/reqs.txt +++ /dev/null @@ -1,8 +0,0 @@ -Pillow -pyparsing -python-dateutil -matplotlib -h5py -pytorch-nlp -pyprof -filelock diff --git a/scripts/jureca_libtorch/MNIST/CMakeLists.txt b/scripts/jureca_libtorch/MNIST/CMakeLists.txt deleted file mode 100644 index acaf771..0000000 --- a/scripts/jureca_libtorch/MNIST/CMakeLists.txt +++ /dev/null @@ -1,30 +0,0 @@ -cmake_minimum_required(VERSION 3.1 FATAL_ERROR) -project(mnist) -set(CMAKE_CXX_STANDARD 14) - -find_package(Torch REQUIRED) - -option(DOWNLOAD_MNIST "Download the MNIST dataset from the internet" ON) -if (DOWNLOAD_MNIST) - message(STATUS "Downloading MNIST dataset") - execute_process( - COMMAND python ${CMAKE_CURRENT_LIST_DIR}/../download_mnist.py - -d ${CMAKE_BINARY_DIR}/data - ERROR_VARIABLE DOWNLOAD_ERROR) - if (DOWNLOAD_ERROR) - message(FATAL_ERROR "Error downloading MNIST dataset: ${DOWNLOAD_ERROR}") - endif() -endif() - 
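# build the mnist example binary and link it against the LibTorch libraries resolved by find_package(Torch)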
-add_executable(mnist mnist.cpp) -target_compile_features(mnist PUBLIC cxx_range_for) -target_link_libraries(mnist ${TORCH_LIBRARIES}) - -if (MSVC) - file(GLOB TORCH_DLLS "${TORCH_INSTALL_PREFIX}/lib/*.dll") - add_custom_command(TARGET mnist - POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy_if_different - ${TORCH_DLLS} - $<TARGET_FILE_DIR:mnist>) -endif (MSVC) diff --git a/scripts/jureca_libtorch/MNIST/LibTorch_startscript.sh b/scripts/jureca_libtorch/MNIST/LibTorch_startscript.sh deleted file mode 100644 index 04baaa3..0000000 --- a/scripts/jureca_libtorch/MNIST/LibTorch_startscript.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=LibTorchTest -#SBATCH --account=raise-ctp1 -#SBATCH --output=job.out -#SBATCH --error=job.err -#SBATCH --time=00:15:00 -#SBATCH --partition=dc-gpu-devel -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=64 -#SBATCH --exclusive -#SBATCH --gres=gpu:1 - -ml NVHPC/22.3 cuDNN CMake - -echo "DEBUG: $(date)" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" - -export CUDA_VISIBLE_DEVICES="0" -export OMP_NUM_THREADS=1 - -srun ./mnist diff --git a/scripts/jureca_libtorch/MNIST/compile.sh b/scripts/jureca_libtorch/MNIST/compile.sh deleted file mode 100644 index 1ed0e27..0000000 --- a/scripts/jureca_libtorch/MNIST/compile.sh +++ /dev/null @@ -1,19 +0,0 @@ -# compile mnist.cpp with latest LibTorch - -# load libraries -ml NVHPC/22.3 CMake/3.21.1 cuDNN/8.3.1.22-CUDA-11.5 Python/3.9.6 - -# get libtorch w/ gpu -wget https://download.pytorch.org/libtorch/cu116/libtorch-cxx11-abi-shared-with-deps-1.12.0%2Bcu116.zip -unzip libtorch-cxx11-abi-shared-with-deps-1.12.0+cu116.zip -libtorch_dir=$PWD/libtorch - -# compile mnist.cpp with libtorch w/ gpu to build folder -mkdir -p build -pushd build -cmake -DCMAKE_PREFIX_PATH=${libtorch_dir} -DDOWNLOAD_MNIST=ON .. -cmake --build . --config Release -mv mnist .. -popd - -# eof diff --git a/scripts/jureca_libtorch/MNIST/download_mnist.py b/scripts/jureca_libtorch/MNIST/download_mnist.py deleted file mode 100644 index 2a5068f..0000000 --- a/scripts/jureca_libtorch/MNIST/download_mnist.py +++ /dev/null @@ -1,88 +0,0 @@ -from __future__ import division -from __future__ import print_function - -import argparse -import gzip -import os -import sys -import urllib - -try: - from urllib.error import URLError - from urllib.request import urlretrieve -except ImportError: - from urllib2 import URLError - from urllib import urlretrieve - -RESOURCES = [ - 'train-images-idx3-ubyte.gz', - 'train-labels-idx1-ubyte.gz', - 't10k-images-idx3-ubyte.gz', - 't10k-labels-idx1-ubyte.gz', -] - - -def report_download_progress(chunk_number, chunk_size, file_size): - if file_size != -1: - percent = min(1, (chunk_number * chunk_size) / file_size) - bar = '#' * int(64 * percent) - sys.stdout.write('\r0% |{:<64}| {}%'.format(bar, int(percent * 100))) - - -def download(destination_path, url, quiet): - if os.path.exists(destination_path): - if not quiet: - print('{} already exists, skipping ...'.format(destination_path)) - else: - print('Downloading {} ...'.format(url)) - try: - hook = None if quiet else report_download_progress - urlretrieve(url, destination_path, reporthook=hook) - except URLError: - raise RuntimeError('Error downloading resource!') - finally: - if not quiet: - # Just a newline. - print() - - -def unzip(zipped_path, quiet): - unzipped_path = os.path.splitext(zipped_path)[0] - if os.path.exists(unzipped_path): - if not quiet: - print('{} already exists, skipping ... 
'.format(unzipped_path)) - return - with gzip.open(zipped_path, 'rb') as zipped_file: - with open(unzipped_path, 'wb') as unzipped_file: - unzipped_file.write(zipped_file.read()) - if not quiet: - print('Unzipped {} ...'.format(zipped_path)) - - -def main(): - parser = argparse.ArgumentParser( - description='Download the MNIST dataset from the internet') - parser.add_argument( - '-d', '--destination', default='.', help='Destination directory') - parser.add_argument( - '-q', - '--quiet', - action='store_true', - help="Don't report about progress") - options = parser.parse_args() - - if not os.path.exists(options.destination): - os.makedirs(options.destination) - - try: - for resource in RESOURCES: - path = os.path.join(options.destination, resource) - url = 'http://yann.lecun.com/exdb/mnist/{}'.format(resource) - download(path, url, options.quiet) - unzip(path, options.quiet) - except KeyboardInterrupt: - print('Interrupted') - - -if __name__ == '__main__': - main() diff --git a/scripts/jureca_libtorch/MNIST/mnist.cpp b/scripts/jureca_libtorch/MNIST/mnist.cpp deleted file mode 100755 index edd51ed..0000000 --- a/scripts/jureca_libtorch/MNIST/mnist.cpp +++ /dev/null @@ -1,179 +0,0 @@ -#include <torch/torch.h> - -#include <cstddef> -#include <cstdio> -#include <iostream> -#include <string> -#include <vector> -#include <chrono> - -using namespace std::chrono; -using Clock = std::chrono::steady_clock; - -// Where to find the MNIST dataset. -const char* kDataRoot = "./data"; - -// The batch size for training. -const int64_t kTrainBatchSize = 64; - -// The batch size for testing. -const int64_t kTestBatchSize = 64; - -// The number of epochs to train. -const int64_t kNumberOfEpochs = 3; - -// After how many batches to log a new update with the loss value. 
-const int64_t kLogInterval = 10; - -struct Net : torch::nn::Module { - Net() - : conv1(torch::nn::Conv2dOptions(1, 10, /*kernel_size=*/5)), - conv2(torch::nn::Conv2dOptions(10, 20, /*kernel_size=*/5)), - fc1(320, 50), - fc2(50, 10) { - register_module("conv1", conv1); - register_module("conv2", conv2); - register_module("conv2_drop", conv2_drop); - register_module("fc1", fc1); - register_module("fc2", fc2); - } - - torch::Tensor forward(torch::Tensor x) { - x = torch::relu(torch::max_pool2d(conv1->forward(x), 2)); - x = torch::relu( - torch::max_pool2d(conv2_drop->forward(conv2->forward(x)), 2)); - x = x.view({-1, 320}); - x = torch::relu(fc1->forward(x)); - x = torch::dropout(x, /*p=*/0.5, /*training=*/is_training()); - x = fc2->forward(x); - return torch::log_softmax(x, /*dim=*/1); - } - - torch::nn::Conv2d conv1; - torch::nn::Conv2d conv2; - torch::nn::Dropout2d conv2_drop; - torch::nn::Linear fc1; - torch::nn::Linear fc2; -}; - -template <typename DataLoader> -void train( - size_t epoch, - Net& model, - torch::Device device, - DataLoader& data_loader, - torch::optim::Optimizer& optimizer, - size_t dataset_size) { - model.train(); - size_t batch_idx = 0; - for (auto& batch : data_loader) { - auto data = batch.data.to(device), targets = batch.target.to(device); - optimizer.zero_grad(); - auto output = model.forward(data); - auto loss = torch::nll_loss(output, targets); - AT_ASSERT(!std::isnan(loss.template item<float>())); - loss.backward(); - optimizer.step(); - - if (batch_idx++ % kLogInterval == 0) { - std::printf( - "\rTrain Epoch: %ld [%5ld/%5ld] Loss: %.4f\n", - epoch, - batch_idx * batch.data.size(0), - dataset_size, - loss.template item<float>()); - } - } -} - -template <typename DataLoader> -void test( - Net& model, - torch::Device device, - DataLoader& data_loader, - size_t dataset_size) { - torch::NoGradGuard no_grad; - model.eval(); - double test_loss = 0; - int32_t correct = 0; - for (const auto& batch : data_loader) { - auto data = batch.data.to(device), targets = batch.target.to(device); - auto output = model.forward(data); - test_loss += torch::nll_loss( - output, - targets, - /*weight=*/{}, - torch::Reduction::Sum) - .template item<float>(); - auto pred = output.argmax(1); - correct += pred.eq(targets).sum().template item<int64_t>(); - } - - test_loss /= dataset_size; - std::printf( - "\nTest set: Average loss: %.4f | Accuracy: %.3f\n", - test_loss, - static_cast<double>(correct) / dataset_size); -} - -auto main() -> int { - torch::manual_seed(1); - - torch::DeviceType device_type; - if (torch::cuda::is_available()) { - std::cout << "CUDA available! Training on GPU." << std::endl; - device_type = torch::kCUDA; - } else { - std::cout << "Training on CPU." 
<< std::endl; - device_type = torch::kCPU; - } - torch::Device device(device_type); - - Net model; - model.to(device); - - auto train_dataset = torch::data::datasets::MNIST(kDataRoot) - .map(torch::data::transforms::Normalize<>(0.1307, 0.3081)) - .map(torch::data::transforms::Stack<>()); - - //std::cout << typeid(train_dataset).name() << '\n'; - auto test_dat = train_dataset.append(); - - - const size_t train_dataset_size = train_dataset.size().value(); - auto train_loader = - torch::data::make_data_loader<torch::data::samplers::SequentialSampler>( - std::move(train_dataset), kTrainBatchSize); - - auto test_dataset = torch::data::datasets::MNIST( - kDataRoot, torch::data::datasets::MNIST::Mode::kTest) - .map(torch::data::transforms::Normalize<>(0.1307, 0.3081)) - .map(torch::data::transforms::Stack<>()); - const size_t test_dataset_size = test_dataset.size().value(); - auto test_loader = - torch::data::make_data_loader(std::move(test_dataset), kTestBatchSize); - - torch::optim::SGD optimizer( - model.parameters(), torch::optim::SGDOptions(0.01).momentum(0.5)); - - // timer start - auto st = Clock::now(); - auto et1 = Clock::now(); - auto et2 = Clock::now(); - - // start loop - std::cout << "starting!" << std::endl; - for (size_t epoch = 1; epoch <= kNumberOfEpochs; ++epoch) { - et1 = Clock::now(); - train(epoch, model, device, *train_loader, optimizer, train_dataset_size); - test(model, device, *test_loader, test_dataset_size); - et2 = Clock::now(); - std::cout << "epoch:" << epoch<<" / " << - duration_cast<milliseconds>(et2-et1).count()/1000.0 << " sec" << std::endl; - } - - // timer end - auto et3 = Clock::now(); - std::cout << "\nfinal time:"<< - duration_cast<milliseconds>(et3-st).count()/1000.0 << " sec" << std::endl; -} diff --git a/scripts/jureca_libtorch/README.md b/scripts/jureca_libtorch/README.md deleted file mode 100644 index ab26d71..0000000 --- a/scripts/jureca_libtorch/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# DL using LibTorch (C++ Torch) - -# documentation -https://github.com/pytorch/pytorch/blob/master/docs/libtorch.rst - -# current isues -1. no distributed training - -# to-do -1. implement distributed training - -# done -1. as Python version is a wrapper, no performance difference -2. can simply be used alongisde a c++ code w/o Cpython -3. very limited compared to Python version (many classes/functions are missing) - -# usage -1. simply compile `mnist.cpp` using the `cmake` file as `bash compile.sh` -2. submit compiled `mnist` with `sbatch LibTorch_startscript.sh` diff --git a/scripts/jureca_libtorch/TorchVision/compile_jpeg.sh b/scripts/jureca_libtorch/TorchVision/compile_jpeg.sh deleted file mode 100755 index fc5e964..0000000 --- a/scripts/jureca_libtorch/TorchVision/compile_jpeg.sh +++ /dev/null @@ -1,14 +0,0 @@ -# load libraries -ml NVHPC/22.3 CMake/3.21.1 cuDNN/8.3.1.22-CUDA-11.5 - -git clone https://github.com/winlibs/libjpeg.git -cd libjpeg - -rm -rf build -mkdir -p build -mkdir -p install -pushd build -cmake -DCMAKE_INSTALL_PREFIX=../install -DCMAKE_BUILD_TYPE=Release .. 
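# an out-of-tree build with a local install prefix keeps libjpeg self-contained;
# compile_torchvision.sh later consumes it from libjpeg/install via -DJPEG_LIBRARY and -DJPEG_INCLUDE_DIR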
-make -j -make install -popd diff --git a/scripts/jureca_libtorch/TorchVision/compile_png.sh b/scripts/jureca_libtorch/TorchVision/compile_png.sh deleted file mode 100755 index 80dd76e..0000000 --- a/scripts/jureca_libtorch/TorchVision/compile_png.sh +++ /dev/null @@ -1,14 +0,0 @@ -# load libraries -ml NVHPC/22.3 CMake/3.21.1 cuDNN/8.3.1.22-CUDA-11.5 - -wget http://prdownloads.sourceforge.net/libpng/libpng-1.6.37.tar.gz?download -mv 'libpng-1.6.37.tar.gz?download' libpng-1.6.37.tar.gz -tar xzf libpng-1.6.37.tar.gz - -pushd libpng-1.6.37 -rm -rf build -mkdir -p build -./configure --prefix=${PWD}/build -make -make install -popd diff --git a/scripts/jureca_libtorch/TorchVision/compile_torchvision.sh b/scripts/jureca_libtorch/TorchVision/compile_torchvision.sh deleted file mode 100755 index f3b5acf..0000000 --- a/scripts/jureca_libtorch/TorchVision/compile_torchvision.sh +++ /dev/null @@ -1,44 +0,0 @@ -# compile torchvision for dataloading (optional) - -# load libraries -ml NVHPC/22.3 CMake/3.21.1 cuDNN/8.3.1.22-CUDA-11.5 - -# get libtorch w/ gpu -wget https://download.pytorch.org/libtorch/cu116/libtorch-cxx11-abi-shared-with-deps-1.12.0%2Bcu116.zip -unzip libtorch-cxx11-abi-shared-with-deps-1.12.0+cu116.zip -libtorch_dir=$PWD/libtorch - -# get png packages for torchvision -./compile_png.sh -libpng_dir=$PWD/libpng-1.6.37/build - -# get jpeg packages -./compile_jpeg.sh -libjpeg_dir=$PWD/libjpeg/install - -# current dir -m_dir=$PWD - -# get torchvision -git clone https://github.com/pytorch/vision.git - -# compile torchvision -pushd torchvision -rm -rf build -mkdir -p build -mkdir -p install -cd build -cmake -DCMAKE_PREFIX_PATH=${libtorch_dir} \ - -DWITH_CUDA=on \ - -DPNG_LIBRARY=${libpng_dir}/lib/libpng.so \ - -DPNG_PNG_INCLUDE_DIR=${libpng_dir}/include \ - -DJPEG_LIBRARY=${libjpeg_dir}/lib64/libjpeg.so \ - -DJPEG_INCLUDE_DIR=${libjpeg_dir}/include \ - -DCMAKE_INSTALL_PREFIX=../install \ - -DCMAKE_BUILD_TYPE=Release .. 
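# flag summary: CMAKE_PREFIX_PATH points CMake at the LibTorch unpacked above, WITH_CUDA=on enables
# the CUDA image ops, the PNG_*/JPEG_* variables hand over the locally built libpng and libjpeg from
# the companion compile_png.sh/compile_jpeg.sh scripts, and CMAKE_INSTALL_PREFIX keeps the install
# inside torchvision/install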
- -make -j -make install -popd - -# eof diff --git a/scripts/jureca_libtorch/lamec.json b/scripts/jureca_libtorch/lamec.json deleted file mode 100644 index a8d025c..0000000 --- a/scripts/jureca_libtorch/lamec.json +++ /dev/null @@ -1 +0,0 @@ -{"startscript": "MNIST/LibTorch_startscript.sh"} \ No newline at end of file diff --git a/scripts/jureca_raytune/.gitkeep b/scripts/jureca_raytune/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/scripts/jureca_raytune/README.md b/scripts/jureca_raytune/README.md deleted file mode 100644 index bd3dd8b..0000000 --- a/scripts/jureca_raytune/README.md +++ /dev/null @@ -1,7 +0,0 @@ -# Simple Ray Tune script working with cifar10 dataset on JURECA-DC - -Steps: -- create environment by running *create_jureca_env.sh* (or use your own env) -- run startscript *jureca_run_ray.sh* - -Also includes a TensorFlow version (cifar_tune_tf.py) with TFMirroredStrategy for data-parallelism on a node-level diff --git a/scripts/jureca_raytune/RayTune+DDP/.gitkeep b/scripts/jureca_raytune/RayTune+DDP/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/scripts/jureca_raytune/RayTune+DDP/cifar_tune.py b/scripts/jureca_raytune/RayTune+DDP/cifar_tune.py deleted file mode 100644 index 50fa034..0000000 --- a/scripts/jureca_raytune/RayTune+DDP/cifar_tune.py +++ /dev/null @@ -1,132 +0,0 @@ -# general imports -import numpy as np -import os - -# PyTorch imports -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -import torch.distributed as dist -import torchvision -import torchvision.transforms as transforms -import torchvision.models as models - -# Ray Tune imports -import ray -from ray import tune -from ray.tune import CLIReporter - - -# method to average the parameters over all GPUs - -# mean of field over GPUs -def par_mean(field): - res = torch.tensor(field).float() - res = res.cuda() - dist.all_reduce(res,op=dist.ReduceOp.SUM,group=None,async_op=True).wait() - res/=dist.get_world_size() - return res - - -# dataloading method -def load_data(data_dir=None): - transform = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) - ]) - - trainset = torchvision.datasets.CIFAR10( - root=data_dir, train=True, download=False, transform=transform) - - return trainset - - -# cifar training method -def train_cifar(config): - - # get model - net = models.resnet18() - - # perpare model for RayTune - net = ray.train.torch.prepare_model(net) - - # loss and optimizer definition - criterion = nn.CrossEntropyLoss() - optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9) - - - # get the training set - trainset = load_data('/p/project/raise-ctp2/cifar10/data') - - # define dataloader with hyperparameters set by RayTune - train_loader = torch.utils.data.DataLoader( - trainset, - batch_size=int(config["batch_size"]), - shuffle=True, - num_workers=8) - - # prepare dataloader for RayTune - train_loader = ray.train.torch.prepare_data_loader(train_loader) - - - for epoch in range(20): # loop over the dataset multiple times - - loss = 0 - - for i, data in enumerate(train_loader, 0): - # get the inputs; data is a list of [inputs, labels] - inputs, labels = data - - # zero the parameter gradients - optimizer.zero_grad() - - # forward + backward + optimize - outputs = net(inputs) - loss = criterion(outputs, labels) - loss.backward() - optimizer.step() - - loss = par_mean(loss) - - # report metric of interest back to RayTune - ray.train.report(loss = 
loss.item()) - - print("Finished Training") - - -def main(num_samples, max_num_epochs, gpus_per_trial): - ray.init(address='auto') - - - # prepare RayTune with PyTorch DDP backend, num_workers specifies the number of GPUs to use per trial - from ray.train import Trainer - trainer = Trainer(backend="torch", num_workers=gpus_per_trial, use_gpu=True) - - # convert the train function to a Ray trainable - trainable = trainer.to_tune_trainable(train_cifar) - - # set search space - config = { - "batch_size": tune.choice([64, 128, 256, 512]), - "lr": tune.loguniform(10e-5, 1) - } - - - reporter = CLIReporter( - max_report_frequency=60) - - # run hyperparameter optimization - result = tune.run( - trainable, - local_dir=os.path.join(os.path.abspath(os.getcwd()), "ray_results"), - config=config, - num_samples=num_samples, - progress_reporter=reporter, - verbose=1, - scheduler=None) - - -if __name__ == "__main__": - # You can change the number of GPUs per trial here: - main(num_samples=10, max_num_epochs=30, gpus_per_trial=4) \ No newline at end of file diff --git a/scripts/jureca_raytune/RayTune+DDP/create_env.sh b/scripts/jureca_raytune/RayTune+DDP/create_env.sh deleted file mode 100644 index 3ccd263..0000000 --- a/scripts/jureca_raytune/RayTune+DDP/create_env.sh +++ /dev/null @@ -1,16 +0,0 @@ -ml --force purge -ml Stages/2022 GCC/11.2.0 CUDA/11.5 Python/3.9.6 PyTorch/1.11-CUDA-11.5 torchvision/0.12.0 - -## create vritual environment -python3 -m venv ddp_ray_env - -source ddp_ray_env/bin/activate - -# RAY TUNE 2.0 NOT WORKING -pip3 install ray==1.9.0 ray[tune]==1.9.0 ray[train]==1.9.0 - - -# might be necessay, might be not -pip3 install requests -pip3 install pytz -pip3 install python-dateutil diff --git a/scripts/jureca_raytune/RayTune+DDP/jureca_ray_ddp_startscript.sh b/scripts/jureca_raytune/RayTune+DDP/jureca_ray_ddp_startscript.sh deleted file mode 100644 index b8bc31f..0000000 --- a/scripts/jureca_raytune/RayTune+DDP/jureca_ray_ddp_startscript.sh +++ /dev/null @@ -1,78 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=RayTuneDDP -#SBATCH --account=raise-ctp2 -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job.out -#SBATCH --error=job.err -#SBATCH --time=01:00:00 - -# configure node and process count on the CM -#SBATCH --partition=dc-gpu-devel -#SBATCH --nodes=2 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=128 -#SBATCH --gpus-per-node=4 -#SBATCH --exclusive - -# gres options have to be disabled for deepv -#SBATCH --gres=gpu:4 - - -ml --force purge -ml Stages/2022 GCC/11.2.0 CUDA/11.5 Python/3.9.6 PyTorch/1.11-CUDA-11.5 torchvision/0.12.0 - - -num_gpus=4 -# set env -source ddp_ray_env/bin/activate - - -# set comm -export CUDA_VISIBLE_DEVICES="0,1,2,3" -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" > 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi - -# launch -# Getting the node names -nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -nodes_array=($nodes) - -head_node=${nodes_array[0]} - -# __doc_head_ray_start__ -port=8374 - -echo "Starting HEAD at $head_node" -srun --nodes=1 --ntasks=1 -w "$head_node" \ - ray start --head --node-ip-address="$head_node"i --port=$port \ - --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus 4 --block & -# __doc_head_ray_end__ - -# __doc_worker_ray_start__ - -# optional, though may be useful in certain versions of Ray < 1.0. 
-sleep 10 - -# number of nodes other than the head node -worker_num=$((SLURM_JOB_NUM_NODES - 1)) - -for ((i = 1; i <= worker_num; i++)); do - node_i=${nodes_array[$i]} - echo "Starting WORKER $i at $node_i" - srun --nodes=1 --ntasks=1 -w "$node_i" \ - ray start --address "$head_node"i:"$port" \ - --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus 4 --block & - sleep 5 -done - -echo "Ready" - -python3 -u cifar_tune.py - - -# eof diff --git a/scripts/jureca_raytune/Ray_2.4/.gitkeep b/scripts/jureca_raytune/Ray_2.4/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/scripts/jureca_raytune/Ray_2.4/ASHA/.gitkeep b/scripts/jureca_raytune/Ray_2.4/ASHA/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/scripts/jureca_raytune/Ray_2.4/ASHA/cifar_tune_asha.py b/scripts/jureca_raytune/Ray_2.4/ASHA/cifar_tune_asha.py deleted file mode 100644 index 689a457..0000000 --- a/scripts/jureca_raytune/Ray_2.4/ASHA/cifar_tune_asha.py +++ /dev/null @@ -1,427 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -"""! @brief AI4HPC """ - -## -# @mainpage AI4HPC -# -# @section description_main Description -# Hyperparameter optimization of neural networks with Ray Tune library. -# -# -# -# @section notes_main Notes -# - The data directory of the cifar-10 dataset has the be specified in the startscript -# -# Copyright (c) 2023 RAISE, All rights reserved. - - -## -# @file cifar_tune_asha.py -# -# @brief Optimizing the hyperparameters of a ResNet18 trained on the cifar-10 dataset with Ray Tune libray and the ASHA algorithm. -# -# @section description_cifar_tune_asha description -# A standard ResNet18 model is trained on the cifar-10 vision dataset. To optimize the performance, multiple -# training runs (trials) with different hyperparameters (chagend learning rate and batch size) are performed using -# the Ray Tune library. The overall hyperparameter optimization process, as well as the single training runs can be -# parallelized across multiple GPUs. Trials with low performance (in terms of test set acuracy) are terminated early -# with the ASHA aglorithm. -# -# -# @section libraries_main Libraries/Modules -# - argparse standard library (https://docs.python.org/3/library/argparse.html) -# - Parse command-line options -# - sys standard library (https://docs.python.org/3/library/sys.html) -# - System commands -# - os standard library (https://docs.python.org/3/library/os.html) -# - OS commands -# - time standard library (https://docs.python.org/3/library/time.html) -# - Access timers for profilers -# - numpy library (https://numpy.org/) -# - Access numpy functions -# - random standard library (https://docs.python.org/3/library/time.html) -# - Generate random numbers -# - matplotlib library (https://matplotlib.org/) -# - Post-process data for validation -# - torch library (https://pytorch.org/) -# - ML framework -# - torchvision library (https://pypi.org/project/torchvision/) -# - Torch library additions for popular datasets and their transformations -# - ray libray (https://www.ray.io/) -# - Framework for distributed computing with a focus on hyperparameter optimization -# - pytz library (https://pythonhosted.org/pytz/) -# - Library for accurate and cross platform timezone calculation -# - python-dateutil (https://github.com/dateutil/dateutil) -# - Extension to pythons datetimes features -# - typing-extensions (https://pypi.org/project/typing-extensions/) -# - Support for different type systems -# -# @section notes_doxygen_example Notes -# - None. -# -# @section todo TODO -# - None. 
-# -# @section author Author(s) -# - Created by MA on 04/05/2023. -# - Modified by -# -# Copyright (c) 2023 RAISE, All rights reserved. - - - - -# load general modules -import argparse -import os -import time -import numpy as np - -# load torch and torchvision modules -import torch -import torch.nn as nn -import torch.optim as optim -import torch.distributed as dist - -import torchvision -from torchvision import datasets, transforms, models - -# load ray modules -import ray -from ray import tune -from ray.tune import CLIReporter -from ray.tune.schedulers import ASHAScheduler -from ray.air import session, RunConfig -import ray.train as train -from ray.train.torch import TorchTrainer -from ray.air.config import ScalingConfig -from ray.tune.tuner import Tuner, TuneConfig - - -def parsIni(): - parser = argparse.ArgumentParser(description='Ray Tune Cifar-10 Example') - parser.add_argument('--num-samples', type=int, default=24, metavar='N', - help='number of samples to train (default: 24)') - parser.add_argument('--max-iterations', type=int, default=10, metavar='N', - help='maximum iterations to train (default: 10)') - parser.add_argument('--par-workers', type=int, default=1, metavar='N', - help='parallel workers to train on a single trial (default: 1)') - parser.add_argument('--scheduler', type=str, default='RAND', - help='scheduler for tuning (default: RandomSearch)') - parser.add_argument('--data-dir', type=str, default='', - help='data directory for cifar-10 dataset') - - return parser - -def accuracy(output, target): - """! function that computes the accuracy of an output and target vector - @param output vector that the model predicted - @param target actual vector - - @return correct number of correct predictions - @return total number of total elements - """ - # get the index of the max log-probability - pred = output.max(1, keepdim=True)[1] - - # count correct classifications - correct = pred.eq(target.view_as(pred)).cpu().float().sum() - - # count total samples - total = target.size(0) - return correct, total - -def par_mean(field): - """! function that averages a field across all workers to a worker - @param field field in worker that should be averaged - - @return mean field - """ - - # convert field to tensor - res = torch.Tensor([field]) - - # move field to GPU/worker - res = res.cuda() - - # AllReduce operation - dist.all_reduce(res,op=dist.ReduceOp.SUM,group=None,async_op=True).wait() - - # average of number of workers - res/=dist.get_world_size() - - return res - -def par_sum(field): - """! function that sums a field across all workers to a worker - @param field field in worker that should be summed up - - @return sum of all fields - """ - # convert field to tensor - res = torch.Tensor([field]) - - # move field to GPU/worker - res = res.cuda() - - # AllReduce operation - dist.all_reduce(res,op=dist.ReduceOp.SUM,group=None,async_op=True).wait() - - return res - -def load_data(data_dir=None): - """! 
function that loads training and test set of cifar-10 - @param data_dir directory where the data is stored - - @return train_set training set of cifar-10 - @return test_set test set of cifar-10 - """ - # vision preprocessing values - mean = [x / 255 for x in [125.3, 123.0, 113.9]] - std = [x / 255 for x in [63.0, 62.1, 66.7]] - - # transformations for the training set - transform_train = transforms.Compose([ - transforms.RandomCrop(32, padding=4), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - transforms.Normalize(mean, std), - ]) - - # transformations for the testset - transform_test = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize(mean, std), - ]) - - - # load the cifar-10 dataset from directory - train_set = torchvision.datasets.CIFAR10( - root=data_dir, train=True, download=False, transform=transform_train) - - test_set = torchvision.datasets.CIFAR10( - root=data_dir, train=False, download=False, transform=transform_test) - - - return train_set, test_set - - -def train_cifar(config): - """! function to train a ResNet on cifar-10 with different hyperparameters - @param config hyperparameter search space - """ - - # load a ResNet model - model = models.resnet18() - - # prepare the model for Ray Tune - model = train.torch.prepare_model(model) - - # define optimizer and loss function - criterion = nn.CrossEntropyLoss() - optimizer = optim.SGD(model.parameters(), lr=config["lr"]*dist.get_world_size()) - - # load the training and test data - train_set, test_set = load_data(str(config["data_dir"])) - - # define the train and test dataloader - train_loader = torch.utils.data.DataLoader( - train_set, - batch_size=int(config["batch_size"]), - shuffle=True, - num_workers=30) - - test_loader = torch.utils.data.DataLoader( - test_set, - batch_size=int(config["batch_size"]), - shuffle=False, - num_workers=30) - - # prepare the dataloaders for Ray Tune - train_loader = train.torch.prepare_data_loader(train_loader) - test_loader = train.torch.prepare_data_loader(test_loader) - - - # prepare metrics - train_acc = 0 - train_correct = 0 - train_total = 0 - - test_acc = 0 - test_correct = 0 - test_total = 0 - - # training and testing loop - for epoch in range(100): - - # prepare model for training and loop over training dataset - model.train() - for i, (images, target) in enumerate(train_loader): - - # compute output - optimizer.zero_grad() - output = model(images) - - # compute loss - loss = criterion(output, target) - - # count correct classifications - tmp_correct, tmp_total = accuracy(output, target) - train_correct +=tmp_correct - train_total +=tmp_total - - # backpropagation and optimization step - loss.backward() - optimizer.step() - - # average the train metrics over all workers - train_correct = par_sum(train_correct) - train_total = par_sum(train_total) - - # compute final training accuracy - train_acc = train_correct/train_total - - # only perform the testing loop every 10 epochs - if ((epoch+1)%10 == 0): - - # prepare model for testing and loop over test dataset - model.eval() - with torch.no_grad(): - for i, (images, target) in enumerate(test_loader): - - # compute output - output = model(images) - - # count correct classifications - tmp_correct, tmp_total = accuracy(output, target) - test_correct +=tmp_correct - test_total +=tmp_total - - # average the test metrics over all workers - test_correct = par_sum(test_correct) - test_total = par_sum(test_total) - - # compute final test accuracy - test_acc = test_correct/test_total - - # report the training and 
testing accuracy back to the head node of Ray Tune - session.report({"train_acc": train_acc.item(), "test_acc": test_acc.item()}) - - - -def main(args): - """! main function - @param args input arguments - """ - - # initalize Ray with the correct adress and node ip adress - ray.init(address=os.environ['ip_head'], _node_ip_address=os.environ["head_node_ip"]) - - - # define the hyperparameter search space - config = { - "batch_size": tune.choice([64, 128, 256, 512]), - "lr": tune.loguniform(10e-5, 1), - "data_dir": tune.choice([args.data_dir]), - } - - # select a hyperparameter optimization algorithm - if (args.scheduler == "ASHA"): - # Asynchronous Successive Halving Algorithm - scheduler = ASHAScheduler( - # the number of iterations to allow the trials to run at max - max_t=args.max_iterations, - # how many iterations before a bad trials get terminated - grace_period=2, - # which percentage of trials to terminate - reduction_factor=3) - - # set search algorithm - search_alg = None - - if (args.scheduler == "RAND"): - # random scheduler - scheduler = None - search_alg = None - - # define a reporter/logger to specifify which metrics to print out during the optimization process - reporter = CLIReporter( - metric_columns=["train_acc", "test_acc", "training_iteration", "time_this_iter_s", "time_total_s"], - max_report_frequency=60) - - - # define the general RunConfig of Ray Tune - run_config = RunConfig( - # name of the training run (directory name). - name="cifar_test_training", - # directory to store the ray tune results in . - local_dir=os.path.join(os.path.abspath(os.getcwd()), "ray_results"), - # logger - progress_reporter=reporter, - # stopping criterion when to end the optimization process - stop={"training_iteration": args.max_iterations} - - ) - - # wrapping the torch training function inside a TorchTrainer logic - trainer = TorchTrainer( - # torch training function - train_loop_per_worker=train_cifar, - # default hyperparameters for the function - train_loop_config={"batch_size": 64, "lr": 0.1, "data_dir": "/"}, - # setting the default resources/workers to use for the training function, including the number of CPUs and GPUs - scaling_config=ScalingConfig(num_workers=args.par_workers, use_gpu=True, resources_per_worker={"CPU": 30, "GPU": 1}), - ) - - # defining the hyperparameter tuner - tuner = Tuner( - # function to tune - trainer, - # hyperparameter search space - param_space={"train_loop_config": config}, - # the tuning configuration - tune_config=TuneConfig( - # define how many trials to evaluate - num_samples=args.num_samples, - # define which metric to use for measuring the performance of the trials - metric="test_acc", - # if the metric should be maximized or minimized - mode="max", - # define which scheduler to use - scheduler=scheduler, - # define which search algorithm to use - search_alg=search_alg, - ), - run_config=run_config - ) - - # measure the total runtime - start_time = time.time() - - # start the optimization process - result = tuner.fit() - - runtime = time.time() - start_time - - # print total runtime - print("Total runtime: ", runtime) - - # print metrics of the best trial - best_result = result.get_best_result(metric="test_acc", mode="max") - - print("Best result metrics: ", best_result) - - # print results dataframe - print("Result dataframe: ") - print(result.get_dataframe().sort_values("test_acc", ascending=False)) - - -if __name__ == "__main__": - - # get custom arguments from parser - parser = parsIni() - args = parser.parse_args() - - # call the main 
function to launch Ray - main(args) \ No newline at end of file diff --git a/scripts/jureca_raytune/Ray_2.4/ASHA/jureca_ray_startscript.sh b/scripts/jureca_raytune/Ray_2.4/ASHA/jureca_ray_startscript.sh deleted file mode 100644 index 514498a..0000000 --- a/scripts/jureca_raytune/Ray_2.4/ASHA/jureca_ray_startscript.sh +++ /dev/null @@ -1,79 +0,0 @@ -#!/bin/bash -# shellcheck disable=SC2206 - -#SBATCH --job-name=ray_cifar_test -#SBATCH --account= -#SBATCH --output=ray_test_cifar.out -#SBATCH --error=ray_test_cifar.err -#SBATCH --partition=dc-gpu -#SBATCH --nodes=2 -#SBATCH --tasks-per-node=1 -#SBATCH --cpus-per-task=128 -#SBATCH --gres=gpu:4 -#SBATCH --time=00:30:00 -#SBATCH --exclusive - - -ml --force purge - -ml Stages/2023 GCC/11.3.0 OpenMPI/4.1.4 PyTorch/1.12.0-CUDA-11.7 torchvision/0.13.1-CUDA-11.7 - -source ray_tune_env/bin/activate - -COMMAND="cifar_tune_asha.py --scheduler ASHA --num-samples 12 --par-workers 2 --max-iterations 2 --data-dir /p/scratch/raise-ctp2/cifar10/data " - -echo $COMMAND - -sleep 1 -# make sure CUDA devices are visible -export CUDA_VISIBLE_DEVICES="0,1,2,3" -export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK} - -num_gpus=4 - -## Limit number of max pending trials -export TUNE_MAX_PENDING_TRIALS_PG=$(($SLURM_NNODES * 4)) - -## Disable Ray Usage Stats -export RAY_USAGE_STATS_DISABLE=1 - - -####### this part is taken from the ray example slurm script ##### -set -x - -# __doc_head_address_start__ - -# Getting the node names -nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -nodes_array=($nodes) - -head_node=${nodes_array[0]} - -port=7638 - -export ip_head="$head_node"i:"$port" -export head_node_ip="$head_node"i - -echo "Starting HEAD at $head_node" -srun --nodes=1 --ntasks=1 -w "$head_node" \ - ray start --head --node-ip-address="$head_node"i --port=$port \ - --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus $num_gpus --block & - -# optional, though may be useful in certain versions of Ray < 1.0. -sleep 10 - -# number of nodes other than the head node -worker_num=$((SLURM_JOB_NUM_NODES - 1)) - -for ((i = 1; i <= worker_num; i++)); do - node_i=${nodes_array[$i]} - echo "Starting WORKER $i at $node_i" - srun --nodes=1 --ntasks=1 -w "$node_i" \ - ray start --address "$head_node"i:"$port" --redis-password='5241590000000000' \ - --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus $num_gpus --block & - sleep 5 -done - -echo "Ready" - -python -u $COMMAND diff --git a/scripts/jureca_raytune/Ray_2.4/BOHB/.gitkeep b/scripts/jureca_raytune/Ray_2.4/BOHB/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/scripts/jureca_raytune/Ray_2.4/BOHB/cifar_tune_bohb.py b/scripts/jureca_raytune/Ray_2.4/BOHB/cifar_tune_bohb.py deleted file mode 100644 index 035d72a..0000000 --- a/scripts/jureca_raytune/Ray_2.4/BOHB/cifar_tune_bohb.py +++ /dev/null @@ -1,427 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -"""! @brief AI4HPC """ - -## -# @mainpage AI4HPC -# -# @section description_main Description -# Hyperparameter optimization of neural networks with Ray Tune library. -# -# -# -# @section notes_main Notes -# - The data directory of the cifar-10 dataset has the be specified in the startscript -# -# Copyright (c) 2023 RAISE, All rights reserved. - - -## -# @file cifar_tune_bohb.py -# -# @brief Optimizing the hyperparameters of a ResNet18 trained on the cifar-10 dataset with Ray Tune libray and the BOHB algorithm. -# -# @section description_cifar_tune_bohb description -# A standard ResNet18 model is trained on the cifar-10 vision dataset. 
To optimize the performance, multiple -# training runs (trials) with different hyperparameters (chagend learning rate and batch size) are performed using -# the Ray Tune library. The overall hyperparameter optimization process, as well as the single training runs can be -# parallelized across multiple GPUs. Trials with low performance (in terms of test set acuracy) are terminated early -# and their resources are assigned to new samples with the BOHB aglorithm. -# -# -# @section libraries_main Libraries/Modules -# - argparse standard library (https://docs.python.org/3/library/argparse.html) -# - Parse command-line options -# - sys standard library (https://docs.python.org/3/library/sys.html) -# - System commands -# - os standard library (https://docs.python.org/3/library/os.html) -# - OS commands -# - time standard library (https://docs.python.org/3/library/time.html) -# - Access timers for profilers -# - numpy library (https://numpy.org/) -# - Access numpy functions -# - torch library (https://pytorch.org/) -# - ML framework -# - torchvision library (https://pypi.org/project/torchvision/) -# - Torch library additions for popular datasets and their transformations -# - ray libray (https://www.ray.io/) -# - Framework for distributed computing with a focus on hyperparameter optimization -# - pytz library (https://pythonhosted.org/pytz/) -# - Library for accurate and cross platform timezone calculation -# - python-dateutil (https://github.com/dateutil/dateutil) -# - Extension to pythons datetimes features -# - typing-extensions (https://pypi.org/project/typing-extensions/) -# - Support for different type systems -# - hpbandster library (https://automl.github.io/HpBandSter/build/html/quickstart.html) -# - Library for performing hyperband operations -# - ConfigSpace library (https://automl.github.io/ConfigSpace/main/) -# - Library to manage configuration and search spaces for hyperparameter optimization -# -# @section notes_doxygen_example Notes -# - None. -# -# @section todo TODO -# - None. -# -# @section author Author(s) -# - Created by MA on 04/05/2023. -# - Modified by -# -# Copyright (c) 2023 RAISE, All rights reserved. 
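# The snippet below is not part of the original file; it is a minimal sketch of how the BOHB
# combination described above is typically wired together with the Ray Tune 2.x API used in this
# repository. The `trainer` (a TorchTrainer around train_cifar) and `config` (the search space)
# are assumed to be built exactly as in the ASHA variant of this script; TuneBOHB additionally
# requires the ConfigSpace/hpbandster packages listed in the header.
from ray.tune.schedulers.hb_bohb import HyperBandForBOHB
from ray.tune.search.bohb import TuneBOHB
from ray.tune.tuner import Tuner, TuneConfig

def make_bohb_tuner(trainer, config, num_samples, max_iterations):
    # HyperBandForBOHB decides which running trials to stop or promote in each bracket,
    # while TuneBOHB proposes new hyperparameter configurations via Bayesian optimization
    scheduler = HyperBandForBOHB(
        time_attr="training_iteration",
        max_t=max_iterations,
        reduction_factor=3)
    search_alg = TuneBOHB()
    return Tuner(
        trainer,
        param_space={"train_loop_config": config},
        tune_config=TuneConfig(
            num_samples=num_samples,
            metric="test_acc",
            mode="max",
            scheduler=scheduler,
            search_alg=search_alg))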
- - -# load general modules -import argparse -import os -import time -import numpy as np - -# load torch and torchvision modules -import torch -import torch.nn as nn -import torch.optim as optim -import torch.distributed as dist - -import torchvision -from torchvision import datasets, transforms, models - -# load ray modules -import ray -from ray import tune -from ray.tune import CLIReporter -from ray.tune.schedulers.hb_bohb import HyperBandForBOHB -from ray.tune.search.bohb import TuneBOHB -from ray.air import session, RunConfig -import ray.train as train -from ray.train.torch import TorchTrainer -from ray.air.config import ScalingConfig -from ray.tune.tuner import Tuner, TuneConfig - - - - -def parsIni(): - parser = argparse.ArgumentParser(description='Ray Tune Cifar-10 Example') - parser.add_argument('--num-samples', type=int, default=24, metavar='N', - help='number of samples to train (default: 24)') - parser.add_argument('--max-iterations', type=int, default=10, metavar='N', - help='maximum iterations to train (default: 10)') - parser.add_argument('--par-workers', type=int, default=1, metavar='N', - help='parallel workers to train on a single trial (default: 1)') - parser.add_argument('--scheduler', type=str, default='RAND', - help='scheduler for tuning (default: RandomSearch)') - parser.add_argument('--data-dir', type=str, default='', - help='data directory for cifar-10 dataset') - - return parser - -def accuracy(output, target): - """! function that computes the accuracy of an output and target vector - @param output vector that the model predicted - @param target actual vector - - @return correct number of correct predictions - @return total number of total elements - """ - # get the index of the max log-probability - pred = output.max(1, keepdim=True)[1] - - # count correct classifications - correct = pred.eq(target.view_as(pred)).cpu().float().sum() - - # count total samples - total = target.size(0) - return correct, total - -def par_mean(field): - """! function that averages a field across all workers to a worker - @param field field in worker that should be averaged - - @return mean field - """ - - # convert field to tensor - res = torch.Tensor([field]) - - # move field to GPU/worker - res = res.cuda() - - # AllReduce operation - dist.all_reduce(res,op=dist.ReduceOp.SUM,group=None,async_op=True).wait() - - # average of number of workers - res/=dist.get_world_size() - - return res - -def par_sum(field): - """! function that sums a field across all workers to a worker - @param field field in worker that should be summed up - - @return sum of all fields - """ - # convert field to tensor - res = torch.Tensor([field]) - - # move field to GPU/worker - res = res.cuda() - - # AllReduce operation - dist.all_reduce(res,op=dist.ReduceOp.SUM,group=None,async_op=True).wait() - - return res - -def load_data(data_dir=None): - """! 
function that loads training and test set of cifar-10 - @param data_dir directory where the data is stored - - @return train_set training set of cifar-10 - @return test_set test set of cifar-10 - """ - # vision preprocessing values - mean = [x / 255 for x in [125.3, 123.0, 113.9]] - std = [x / 255 for x in [63.0, 62.1, 66.7]] - - # transformations for the training set - transform_train = transforms.Compose([ - transforms.RandomCrop(32, padding=4), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - transforms.Normalize(mean, std), - ]) - - # transformations for the testset - transform_test = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize(mean, std), - ]) - - - # load the cifar-10 dataset from directory - train_set = torchvision.datasets.CIFAR10( - root=data_dir, train=True, download=False, transform=transform_train) - - test_set = torchvision.datasets.CIFAR10( - root=data_dir, train=False, download=False, transform=transform_test) - - - return train_set, test_set - - -def train_cifar(config): - """! function to train a ResNet on cifar-10 with different hyperparameters - @param config hyperparameter search space - """ - - # load a ResNet model - model = models.resnet18() - - # prepare the model for Ray Tune - model = train.torch.prepare_model(model) - - # define optimizer and loss function - criterion = nn.CrossEntropyLoss() - optimizer = optim.SGD(model.parameters(), lr=config["lr"]*dist.get_world_size()) - - # load the training and test data - train_set, test_set = load_data(str(config["data_dir"])) - - # define the train and test dataloader - train_loader = torch.utils.data.DataLoader( - train_set, - batch_size=int(config["batch_size"]), - shuffle=True, - num_workers=30) - - test_loader = torch.utils.data.DataLoader( - test_set, - batch_size=int(config["batch_size"]), - shuffle=False, - num_workers=30) - - # prepare the dataloaders for Ray Tune - train_loader = train.torch.prepare_data_loader(train_loader) - test_loader = train.torch.prepare_data_loader(test_loader) - - - # prepare metrics - train_acc = 0 - train_correct = 0 - train_total = 0 - - test_acc = 0 - test_correct = 0 - test_total = 0 - - # training and testing loop - for epoch in range(100): - - # prepare model for training and loop over training dataset - model.train() - for i, (images, target) in enumerate(train_loader): - - # compute output - optimizer.zero_grad() - output = model(images) - - # compute loss - loss = criterion(output, target) - - # count correct classifications - tmp_correct, tmp_total = accuracy(output, target) - train_correct +=tmp_correct - train_total +=tmp_total - - # backpropagation and optimization step - loss.backward() - optimizer.step() - - # average the train metrics over all workers - train_correct = par_sum(train_correct) - train_total = par_sum(train_total) - - # compute final training accuracy - train_acc = train_correct/train_total - - # only perform the testing loop every 10 epochs - if ((epoch+1)%10 == 0): - - # prepare model for testing and loop over test dataset - model.eval() - with torch.no_grad(): - for i, (images, target) in enumerate(test_loader): - - # compute output - output = model(images) - - # count correct classifications - tmp_correct, tmp_total = accuracy(output, target) - test_correct +=tmp_correct - test_total +=tmp_total - - # average the test metrics over all workers - test_correct = par_sum(test_correct) - test_total = par_sum(test_total) - - # compute final test accuracy - test_acc = test_correct/test_total - - # report the training and 
testing accuracy back to the head node of Ray Tune - session.report({"train_acc": train_acc.item(), "test_acc": test_acc.item()}) - - - -def main(args): - """! main function - @param args input arguments - """ - - # initalize Ray with the correct adress and node ip adress - ray.init(address=os.environ['ip_head'], _node_ip_address=os.environ["head_node_ip"]) - - - # define the hyperparameter search space - config = { - "batch_size": tune.choice([64, 128, 256, 512]), - "lr": tune.loguniform(10e-5, 1), - "data_dir": tune.choice([args.data_dir]), - } - - # select a hyperparameter optimization algorithm - - if (args.scheduler == "BOHB"): - # Bayesian Optimization and HyperBand - scheduler = HyperBandForBOHB( - # time attribute - time_attr="training_iteration", - # the number of iterations to allow the trials to run at max - max_t=args.max_iterations, - # which percentage of trials to terminate - reduction_factor=3) - - search_alg = TuneBOHB(seed=42) - - if (args.scheduler == "RAND"): - # random scheduler - scheduler = None - search_alg = None - - # define a reporter/logger to specifify which metrics to print out during the optimization process - reporter = CLIReporter( - metric_columns=["train_acc", "test_acc", "training_iteration", "time_this_iter_s", "time_total_s"], - max_report_frequency=60) - - - # define the general RunConfig of Ray Tune - run_config = RunConfig( - # name of the training run (directory name). - name="cifar_test_training", - # directory to store the ray tune results in . - local_dir=os.path.join(os.path.abspath(os.getcwd()), "ray_results"), - # logger - progress_reporter=reporter, - # stopping criterion when to end the optimization process - stop={"training_iteration": args.max_iterations} - - ) - - # wrapping the torch training function inside a TorchTrainer logic - trainer = TorchTrainer( - # torch training function - train_loop_per_worker=train_cifar, - # default hyperparameters for the function - train_loop_config={"batch_size": 64, "lr": 0.1, "data_dir": "/"}, - # setting the default resources/workers to use for the training function, including the number of CPUs and GPUs - scaling_config=ScalingConfig(num_workers=args.par_workers, use_gpu=True, resources_per_worker={"CPU": 30, "GPU": 1}), - ) - - # defining the hyperparameter tuner - tuner = Tuner( - # function to tune - trainer, - # hyperparameter search space - param_space={"train_loop_config": config}, - # the tuning configuration - tune_config=TuneConfig( - # define how many trials to evaluate - num_samples=args.num_samples, - # define which metric to use for measuring the performance of the trials - metric="test_acc", - # if the metric should be maximized or minimized - mode="max", - # define which scheduler to use - scheduler=scheduler, - # define which search algorithm to use - search_alg=search_alg), - run_config=run_config - ) - - # measure the total runtime - start_time = time.time() - - # start the optimization process - result = tuner.fit() - - runtime = time.time() - start_time - - # print total runtime - print("Total runtime: ", runtime) - - # print metrics of the best trial - best_result = result.get_best_result(metric="test_acc", mode="max") - - print("Best result metrics: ", best_result) - - # print results dataframe - print("Result dataframe: ") - print(result.get_dataframe().sort_values("test_acc", ascending=False)) - - -if __name__ == "__main__": - - # get custom arguments from parser - parser = parsIni() - args = parser.parse_args() - - # call the main function to launch Ray - main(args) \ No 
newline at end of file diff --git a/scripts/jureca_raytune/Ray_2.4/BOHB/jureca_ray_startscript.sh b/scripts/jureca_raytune/Ray_2.4/BOHB/jureca_ray_startscript.sh deleted file mode 100644 index f35209f..0000000 --- a/scripts/jureca_raytune/Ray_2.4/BOHB/jureca_ray_startscript.sh +++ /dev/null @@ -1,78 +0,0 @@ -#!/bin/bash -# shellcheck disable=SC2206 - -#SBATCH --job-name=ray_cifar_test -#SBATCH --account= -#SBATCH --output=ray_test_cifar.out -#SBATCH --error=ray_test_cifar.err -#SBATCH --partition=dc-gpu -#SBATCH --nodes=2 -#SBATCH --tasks-per-node=1 -#SBATCH --cpus-per-task=128 -#SBATCH --gres=gpu:4 -#SBATCH --time=00:30:00 -#SBATCH --exclusive - -ml --force purge - -ml Stages/2023 GCC/11.3.0 OpenMPI/4.1.4 PyTorch/1.12.0-CUDA-11.7 torchvision/0.13.1-CUDA-11.7 - -source ray_tune_env/bin/activate - -COMMAND="cifar_tune_bohb.py --scheduler BOHB --num-samples 12 --par-workers 2 --max-iterations 2 --data-dir /p/scratch/raise-ctp2/cifar10/data " - -echo $COMMAND - -sleep 1 -# make sure CUDA devices are visible -export CUDA_VISIBLE_DEVICES="0,1,2,3" -export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK} - -num_gpus=4 - -## Limit number of max pending trials -export TUNE_MAX_PENDING_TRIALS_PG=$(($SLURM_NNODES * 4)) - -## Disable Ray Usage Stats -export RAY_USAGE_STATS_DISABLE=1 - - -####### this part is taken from the ray example slurm script ##### -set -x - -# __doc_head_address_start__ - -# Getting the node names -nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -nodes_array=($nodes) - -head_node=${nodes_array[0]} - -port=7638 - -export ip_head="$head_node"i:"$port" -export head_node_ip="$head_node"i - -echo "Starting HEAD at $head_node" -srun --nodes=1 --ntasks=1 -w "$head_node" \ - ray start --head --node-ip-address="$head_node"i --port=$port \ - --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus $num_gpus --block & - -# optional, though may be useful in certain versions of Ray < 1.0. -sleep 10 - -# number of nodes other than the head node -worker_num=$((SLURM_JOB_NUM_NODES - 1)) - -for ((i = 1; i <= worker_num; i++)); do - node_i=${nodes_array[$i]} - echo "Starting WORKER $i at $node_i" - srun --nodes=1 --ntasks=1 -w "$node_i" \ - ray start --address "$head_node"i:"$port" --redis-password='5241590000000000' \ - --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus $num_gpus --block & - sleep 5 -done - -echo "Ready" - -python -u $COMMAND diff --git a/scripts/jureca_raytune/Ray_2.4/PBT/.gitkeep b/scripts/jureca_raytune/Ray_2.4/PBT/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/scripts/jureca_raytune/Ray_2.4/PBT/cifar_tune_pbt.py b/scripts/jureca_raytune/Ray_2.4/PBT/cifar_tune_pbt.py deleted file mode 100644 index 87a43d6..0000000 --- a/scripts/jureca_raytune/Ray_2.4/PBT/cifar_tune_pbt.py +++ /dev/null @@ -1,459 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -"""! @brief AI4HPC """ - -## -# @mainpage AI4HPC -# -# @section description_main Description -# Hyperparameter optimization of neural networks with Ray Tune library. -# -# -# -# @section notes_main Notes -# - The data directory of the cifar-10 dataset has the be specified in the startscript -# -# Copyright (c) 2023 RAISE, All rights reserved. - - -## -# @file cifar_tune_pbt.py -# -# @brief Optimizing the hyperparameters of a ResNet18 trained on the cifar-10 dataset with Ray Tune libray and the PBT algorithm. -# -# @section description_cifar_tune_pbt description -# A standard ResNet18 model is trained on the cifar-10 vision dataset. 
To optimize the performance, multiple -# training runs (trials) with different hyperparameters (changed learning rate and batch size) are performed using -# the Ray Tune library. The overall hyperparameter optimization process, as well as the single training runs, can be -# parallelized across multiple GPUs. Trials with low performance (in terms of test set accuracy) copy the hyperparameters -# of better performing trials and apply mutations with the PBT algorithm. -# -# @section libraries_main Libraries/Modules -# - argparse standard library (https://docs.python.org/3/library/argparse.html) -# - Parse command-line options -# - sys standard library (https://docs.python.org/3/library/sys.html) -# - System commands -# - os standard library (https://docs.python.org/3/library/os.html) -# - OS commands -# - time standard library (https://docs.python.org/3/library/time.html) -# - Access timers for profilers -# - numpy library (https://numpy.org/) -# - Access numpy functions -# - torch library (https://pytorch.org/) -# - ML framework -# - torchvision library (https://pypi.org/project/torchvision/) -# - Torch library additions for popular datasets and their transformations -# - ray library (https://www.ray.io/) -# - Framework for distributed computing with a focus on hyperparameter optimization -# - pytz library (https://pythonhosted.org/pytz/) -# - Library for accurate and cross-platform timezone calculation -# - python-dateutil (https://github.com/dateutil/dateutil) -# - Extensions to Python's datetime features -# - typing-extensions (https://pypi.org/project/typing-extensions/) -# - Support for different type systems -# -# @section notes_doxygen_example Notes -# - None. -# -# @section todo TODO -# - None. -# -# @section author Author(s) -# - Created by MA on 04/05/2023. -# - Modified by -# -# Copyright (c) 2023 RAISE, All rights reserved. - -# load general modules -import argparse -import os -import time -import numpy as np - -# load torch and torchvision modules -import torch -import torch.nn as nn -import torch.optim as optim -import torch.distributed as dist - -import torchvision -from torchvision import datasets, transforms, models - -# load ray modules -import ray -from ray import tune -from ray.tune import CLIReporter -from ray.tune.schedulers import PopulationBasedTraining -from ray.air import session, Checkpoint, RunConfig -import ray.train as train -from ray.train.torch import TorchTrainer -from ray.air.config import ScalingConfig -from ray.tune.tuner import Tuner, TuneConfig - - - - -def parsIni(): - parser = argparse.ArgumentParser(description='Ray Tune Cifar-10 Example') - parser.add_argument('--num-samples', type=int, default=24, metavar='N', - help='number of samples to train (default: 24)') - parser.add_argument('--max-iterations', type=int, default=10, metavar='N', - help='maximum iterations to train (default: 10)') - parser.add_argument('--par-workers', type=int, default=1, metavar='N', - help='parallel workers to train on a single trial (default: 1)') - parser.add_argument('--scheduler', type=str, default='RAND', - help='scheduler for tuning (default: RandomSearch)') - parser.add_argument('--data-dir', type=str, default='', - help='data directory for cifar-10 dataset') - - return parser - -def accuracy(output, target): - """! 
function that computes the accuracy of an output and target vector - @param output vector that the model predicted - @param target actual vector - - @return correct number of correct predictions - @return total number of total elements - """ - # get the index of the max log-probability - pred = output.max(1, keepdim=True)[1] - - # count correct classifications - correct = pred.eq(target.view_as(pred)).cpu().float().sum() - - # count total samples - total = target.size(0) - return correct, total - -def par_mean(field): - """! function that averages a field across all workers to a worker - @param field field in worker that should be averaged - - @return mean field - """ - - # convert field to tensor - res = torch.Tensor([field]) - - # move field to GPU/worker - res = res.cuda() - - # AllReduce operation - dist.all_reduce(res,op=dist.ReduceOp.SUM,group=None,async_op=True).wait() - - # average of number of workers - res/=dist.get_world_size() - - return res - -def par_sum(field): - """! function that sums a field across all workers to a worker - @param field field in worker that should be summed up - - @return sum of all fields - """ - # convert field to tensor - res = torch.Tensor([field]) - - # move field to GPU/worker - res = res.cuda() - - # AllReduce operation - dist.all_reduce(res,op=dist.ReduceOp.SUM,group=None,async_op=True).wait() - - return res - -def load_data(data_dir=None): - """! function that loads training and test set of cifar-10 - @param data_dir directory where the data is stored - - @return train_set training set of cifar-10 - @return test_set test set of cifar-10 - """ - # vision preprocessing values - mean = [x / 255 for x in [125.3, 123.0, 113.9]] - std = [x / 255 for x in [63.0, 62.1, 66.7]] - - # transformations for the training set - transform_train = transforms.Compose([ - transforms.RandomCrop(32, padding=4), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - transforms.Normalize(mean, std), - ]) - - # transformations for the testset - transform_test = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize(mean, std), - ]) - - - # load the cifar-10 dataset from directory - train_set = torchvision.datasets.CIFAR10( - root=data_dir, train=True, download=False, transform=transform_train) - - test_set = torchvision.datasets.CIFAR10( - root=data_dir, train=False, download=False, transform=transform_test) - - - return train_set, test_set - - -def train_cifar(config): - """! function to train a ResNet on cifar-10 with different hyperparameters - @param config hyperparameter search space - """ - # PBT specific variable - step = 1 - - print("Starting Trials") - - # load a ResNet model - model = models.resnet18() - - # define optimizer and loss function - criterion = nn.CrossEntropyLoss() - optimizer = optim.SGD(model.parameters(), lr=config["lr"]*dist.get_world_size()) - - if session.get_checkpoint(): - # Load model state and iteration step from checkpoint. - checkpoint_dict = session.get_checkpoint().to_dict() - model.load_state_dict(checkpoint_dict["model_state_dict"]) - # Load optimizer state (needed since we're using momentum), - # then set the `lr` and `momentum` according to the config. - optimizer.load_state_dict(checkpoint_dict["optimizer_state_dict"]) - - # Note: Make sure to increment the checkpointed step by 1 to get the current step. 
- last_step = checkpoint_dict["step"] - step = last_step + 1 - - - - # prepare the model for Ray Tune - model = train.torch.prepare_model(model) - - # load the training and test data - train_set, test_set = load_data(str(config["data_dir"])) - - # define the train and test dataloader - train_loader = torch.utils.data.DataLoader( - train_set, - batch_size=int(config["batch_size"]), - shuffle=True, - num_workers=30) - - test_loader = torch.utils.data.DataLoader( - test_set, - batch_size=int(config["batch_size"]), - shuffle=False, - num_workers=30) - - # prepare the dataloaders for Ray Tune - train_loader = train.torch.prepare_data_loader(train_loader) - test_loader = train.torch.prepare_data_loader(test_loader) - - - # prepare metrics - train_acc = 0 - train_correct = 0 - train_total = 0 - - test_acc = 0 - test_correct = 0 - test_total = 0 - - # training and testing loop - for epoch in range(100): - - # prepare model for training and loop over training dataset - model.train() - for i, (images, target) in enumerate(train_loader): - - # compute output - optimizer.zero_grad() - output = model(images) - - # compute loss - loss = criterion(output, target) - - # count correct classifications - tmp_correct, tmp_total = accuracy(output, target) - train_correct +=tmp_correct - train_total +=tmp_total - - # backpropagation and optimization step - loss.backward() - optimizer.step() - - # average the train metrics over all workers - train_correct = par_sum(train_correct) - train_total = par_sum(train_total) - - # compute final training accuracy - train_acc = train_correct/train_total - - # only perform the testing loop every 10 epochs - if ((epoch+1)%10 == 0): - - # prepare model for testing and loop over test dataset - model.eval() - with torch.no_grad(): - for i, (images, target) in enumerate(test_loader): - - # compute output - output = model(images) - - # count correct classifications - tmp_correct, tmp_total = accuracy(output, target) - test_correct +=tmp_correct - test_total +=tmp_total - - # average the test metrics over all workers - test_correct = par_sum(test_correct) - test_total = par_sum(test_total) - - # compute final test accuracy - test_acc = test_correct/test_total - - # checkpoint the training - checkpoint = Checkpoint.from_dict({ - "step": step, - "model_state_dict": model.state_dict(), - "optimizer_state_dict": optimizer.state_dict(), - }) - - # report the training and testing accuracy back to the head node of Ray Tune - session.report({"train_acc": train_acc.item(), "test_acc": test_acc.item()}, checkpoint=checkpoint) - - step += 1 - - - -def main(args): - """! main function - @param args input arguments - """ - - # initalize Ray with the correct adress and node ip adress - ray.init(address=os.environ['ip_head'], _node_ip_address=os.environ["head_node_ip"]) - - - # define the (original) hyperparameter search space - config = { - "batch_size": tune.choice([64, 128, 256, 512]), - "lr": tune.loguniform(10e-5, 1), - "data_dir": tune.choice([args.data_dir]), - } - - # define the mutation config - mutation_config = {"lr": tune.loguniform(10e-5, 1),} - - # select a hyperparameter optimization algorithm - - if (args.scheduler == "PBT"): - # Population Based Training - scheduler = PopulationBasedTraining( - # time attribute - time_attr="training_iteration", - # intervals at that perturbations occur, - perturbation_interval=1, - # specification of hyperparameter mutatation search space (can be different than original search space!) 
- hyperparam_mutations={"train_loop_config": mutation_config}, - # the parameters of the top quantile_fraction percentage trials are transfered to the bottom quantile_fraction percentage of trials - quantile_fraction=0.33, - # probability to resample from original hyperparameter search space - resample_probability=0, - ) - - search_alg= None - - if (args.scheduler == "RAND"): - # random scheduler - scheduler = None - search_alg = None - - # define a reporter/logger to specifify which metrics to print out during the optimization process - reporter = CLIReporter( - metric_columns=["train_acc", "test_acc", "training_iteration", "time_this_iter_s", "time_total_s"], - max_report_frequency=60) - - - # define the general RunConfig of Ray Tune - run_config = RunConfig( - # name of the training run (directory name). - name="cifar_test_training", - # directory to store the ray tune results in . - local_dir=os.path.join(os.path.abspath(os.getcwd()), "ray_results"), - # logger - progress_reporter=reporter, - # stopping criterion when to end the optimization process - stop={"training_iteration": args.max_iterations}, - #checkpointing - checkpoint_config=ray.air.CheckpointConfig( - checkpoint_score_attribute="test_acc", - ), - - ) - - # wrapping the torch training function inside a TorchTrainer logic - trainer = TorchTrainer( - # torch training function - train_loop_per_worker=train_cifar, - # default hyperparameters for the function - train_loop_config={"batch_size": 64, "lr": 0.1, "data_dir": "/"}, - # setting the default resources/workers to use for the training function, including the number of CPUs and GPUs - scaling_config=ScalingConfig(num_workers=args.par_workers, use_gpu=True, resources_per_worker={"CPU": 30, "GPU": 1}), - ) - - # defining the hyperparameter tuner - tuner = Tuner( - # function to tune - trainer, - # general hyperparameter search space - param_space={"train_loop_config": config}, - # the tuning configuration - tune_config=TuneConfig( - # define how many trials to evaluate - num_samples=args.num_samples, - # define which metric to use for measuring the performance of the trials - metric="test_acc", - # if the metric should be maximized or minimized - mode="max", - # define which scheduler to use - scheduler=scheduler, - # define which search algorithm to use - search_alg=search_alg), - run_config=run_config, - ) - - # measure the total runtime - start_time = time.time() - - # start the optimization process - result = tuner.fit() - - runtime = time.time() - start_time - - # print total runtime - print("Total runtime: ", runtime) - - # print metrics of the best trial - best_result = result.get_best_result(metric="test_acc", mode="max") - - print("Best result metrics: ", best_result) - - # print results dataframe - print("Result dataframe: ") - print(result.get_dataframe().sort_values("test_acc", ascending=False)) - - -if __name__ == "__main__": - - # get custom arguments from parser - parser = parsIni() - args = parser.parse_args() - - # call the main function to launch Ray - main(args) \ No newline at end of file diff --git a/scripts/jureca_raytune/Ray_2.4/PBT/jureca_ray_startscript.sh b/scripts/jureca_raytune/Ray_2.4/PBT/jureca_ray_startscript.sh deleted file mode 100644 index e2fbc41..0000000 --- a/scripts/jureca_raytune/Ray_2.4/PBT/jureca_ray_startscript.sh +++ /dev/null @@ -1,78 +0,0 @@ -#!/bin/bash -# shellcheck disable=SC2206 - -#SBATCH --job-name=ray_cifar_test -#SBATCH --account= -#SBATCH --output=ray_test_cifar.out -#SBATCH --error=ray_test_cifar.err -#SBATCH 
--partition=dc-gpu -#SBATCH --nodes=2 -#SBATCH --tasks-per-node=1 -#SBATCH --cpus-per-task=128 -#SBATCH --gres=gpu:4 -#SBATCH --time=00:30:00 -#SBATCH --exclusive - -ml --force purge - -ml Stages/2023 GCC/11.3.0 OpenMPI/4.1.4 PyTorch/1.12.0-CUDA-11.7 torchvision/0.13.1-CUDA-11.7 - -source ray_tune_env/bin/activate - -COMMAND="cifar_tune_pbt.py --scheduler PBT --num-samples 8 --par-workers 2 --max-iterations 5 --data-dir /p/scratch/raise-ctp2/cifar10/data " - -echo $COMMAND - -sleep 1 -# make sure CUDA devices are visible -export CUDA_VISIBLE_DEVICES="0,1,2,3" -export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK} - -num_gpus=4 - -## Limit number of max pending trials -export TUNE_MAX_PENDING_TRIALS_PG=$(($SLURM_NNODES * 4)) - -## Disable Ray Usage Stats -export RAY_USAGE_STATS_DISABLE=1 - - -####### this part is taken from the ray example slurm script ##### -set -x - -# __doc_head_address_start__ - -# Getting the node names -nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -nodes_array=($nodes) - -head_node=${nodes_array[0]} - -port=7638 - -export ip_head="$head_node"i:"$port" -export head_node_ip="$head_node"i - -echo "Starting HEAD at $head_node" -srun --nodes=1 --ntasks=1 -w "$head_node" \ - ray start --head --node-ip-address="$head_node"i --port=$port \ - --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus $num_gpus --block & - -# optional, though may be useful in certain versions of Ray < 1.0. -sleep 10 - -# number of nodes other than the head node -worker_num=$((SLURM_JOB_NUM_NODES - 1)) - -for ((i = 1; i <= worker_num; i++)); do - node_i=${nodes_array[$i]} - echo "Starting WORKER $i at $node_i" - srun --nodes=1 --ntasks=1 -w "$node_i" \ - ray start --address "$head_node"i:"$port" --redis-password='5241590000000000' \ - --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus $num_gpus --block & - sleep 5 -done - -echo "Ready" - -python -u $COMMAND diff --git a/scripts/jureca_raytune/Ray_2.4/build_ray_env.sh b/scripts/jureca_raytune/Ray_2.4/build_ray_env.sh deleted file mode 100644 index 645dea2..0000000 --- a/scripts/jureca_raytune/Ray_2.4/build_ray_env.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -ml --force purge - -ml Stages/2023 GCC/11.3.0 OpenMPI/4.1.4 PyTorch/1.12.0-CUDA-11.7 torchvision/0.13.1-CUDA-11.7 - -python3 -m venv ray_tune_env - -source ray_tune_env/bin/activate - -pip3 install ray==2.4.0 ray[tune]==2.4.0 -pip3 install python-dateutil pytz typing-extensions -pip3 install hpbandster ConfigSpace -deactivate \ No newline at end of file diff --git a/scripts/jureca_raytune/Ray_2.4/hpo.md b/scripts/jureca_raytune/Ray_2.4/hpo.md deleted file mode 100644 index ee02067..0000000 --- a/scripts/jureca_raytune/Ray_2.4/hpo.md +++ /dev/null @@ -1,58 +0,0 @@ -# Hyperparameter Optimization of Machine Learning Models with Ray Tune - -For the optimization of the hyperparameters of neural networks (such as learning rate or batch size) or machine learning models in general, the Ray Tune library (current version supported is 2.4.0) can be used. The library features a smooth integration of PyTorch-based training scripts and enables two stages of parallelism: - -- each training of a model with different hyperparameters (trial) can run in parallel on multiple GPUs (e.g. via PyTorch-DDP) -- several trials can run in parallel on an HPC machine (via Ray Tune itself) - -For installation of Ray Tune, run the installation script - -```bash -bash build_ray_env.sh -``` - -After installation, several examples are available: - -1. 
[Optimizing a ResNet18 on cifar-10 with ASHA or Random Search schedulers](https://gitlab.jsc.fz-juelich.de/CoE-RAISE/FZJ/ai-for-hpc/-/tree/main/Jureca_RayTune/Ray_2.4/ASHA) -2. [Optimizing a ResNet18 on cifar-10 with BOHB or Random Search schedulers](https://gitlab.jsc.fz-juelich.de/CoE-RAISE/FZJ/ai-for-hpc/-/tree/main/Jureca_RayTune/Ray_2.4/BOHB) -3. [Optimizing a ResNet18 on cifar-10 with PBT or Random Search schedulers (including checkpointing)](https://gitlab.jsc.fz-juelich.de/CoE-RAISE/FZJ/ai-for-hpc/-/tree/main/Jureca_RayTune/Ray_2.4/PBT) - - -The [ASHA](https://arxiv.org/pdf/1810.05934.pdf) scheduler is a variation of Random Search with early stopping of under-performing trials. The [BOHB](http://proceedings.mlr.press/v80/falkner18a/falkner18a.pdf) scheduler uses Bayesian Optimization in combination with early stopping, while the [PBT](https://arxiv.org/pdf/1711.09846.pdf) scheduler uses evolutionary optimization and is well suited for optimizing non-stationary hyperparameters (such as learning rate schedules). - -The following parameters can be set for each script: - -- num-samples: number of samples (trials) to evaluate -- max-iterations: for how long to train the trials at max -- par-workers: how many workers to allocate per trial -- scheduler: which scheduler to use -- data-dir: directory where the datasets are stored - -To submit a job to the JURECA-DC-GPU machine, use the following command: - -```bash -sbatch jureca_ray_startscript.sh -``` - -For communication via the InfiniBand network it is important to specify the node ip-address in the startscript (when launching Ray) in the following format: - -```bash ---node-ip-address="$head_node"i -``` - -and - -```bash ---address "$head_node"i:"$port" -``` - -If multiple Ray instances run on the same machine, there might be problems if all use the same port value (7638), so it is advisable to change it to a different value in that case. - - - - - - - - - diff --git a/scripts/jureca_raytune/Ray_2.4/hpo.py b/scripts/jureca_raytune/Ray_2.4/hpo.py deleted file mode 100644 index 9589493..0000000 --- a/scripts/jureca_raytune/Ray_2.4/hpo.py +++ /dev/null @@ -1,449 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -"""! @brief HPO """ - -## -# @mainpage HPO -# -# @section description_main Description -# Hyperparameter optimization of neural networks with Ray Tune library. -# -# -# -# @section notes_main Notes -# - The data directory of the CIFAR-10 dataset has to be specified in the startscript -# -# Copyright (c) 2023 RAISE, All rights reserved. - - -## -# @file hpo.py -# -# @brief Optimizing the hyperparameters of a ResNet18 trained on the CIFAR-10 dataset with Ray Tune library -# and the ASHA, BOHB, and PBT algorithms. -# -# @section hpo description -# A standard ResNet18 model is trained on the CIFAR-10 vision dataset. To optimize the performance, multiple -# training runs (trials) with different hyperparameters (changed learning rate and batch size) are performed using -# the Ray Tune library. The overall hyperparameter optimization process, as well as the single training runs, can be -# parallelized across multiple GPUs. -# For ASHA: Trials with low performance (in terms of test set accuracy) are terminated early -# with the ASHA algorithm. -# For BOHB: Trials with low performance (in terms of test set accuracy) are terminated early -# and their resources are assigned to new samples with the BOHB algorithm. 
-# For PBT: Trials with low performance (in terms of test set accuracy) copy the hyperparameters -# of better performing trials and apply mutations with the PBT algorithm. -# For RAND: Random Search termination (no algorithm) -# -# -# @section libraries_main Libraries/Modules -# - argparse standard library (https://docs.python.org/3/library/argparse.html) -# - Parse command-line options -# - sys standard library (https://docs.python.org/3/library/sys.html) -# - System commands -# - os standard library (https://docs.python.org/3/library/os.html) -# - OS commands -# - time standard library (https://docs.python.org/3/library/time.html) -# - Access timers for profilers -# - numpy library (https://numpy.org/) -# - Access numpy functions -# - random standard library (https://docs.python.org/3/library/random.html) -# - Generate random numbers -# - matplotlib library (https://matplotlib.org/) -# - Post-process data for validation -# - torch library (https://pytorch.org/) -# - ML framework -# - torchvision library (https://pypi.org/project/torchvision/) -# - Torch library additions for popular datasets and their transformations -# - ray library (https://www.ray.io/) -# - Framework for distributed computing with a focus on hyperparameter optimization -# -# @section notes_doxygen_example Notes -# - None. -# -# @section todo TODO -# - None. -# -# @section author Author(s) -# - Created by MA on 04/05/2023. -# - Modified by EI on 05/05/2023. -# -# Copyright (c) 2023 RAISE, All rights reserved. - -# load general modules -import argparse -import os -import time -import numpy as np - -# load torch and torchvision modules -import torch -import torch.nn as nn -import torch.optim as optim -import torch.distributed as dist -import torchvision -from torchvision import datasets, transforms, models - -# load ray modules -import ray -from ray import tune -from ray.tune import CLIReporter -from ray.air import session, RunConfig -import ray.train as train -from ray.train.torch import TorchTrainer -from ray.air.config import ScalingConfig -from ray.tune.tuner import Tuner, TuneConfig - -def parsIni(): - """! parse arguments - - @param --num-samples #samples - @param --max-iterations max. iteration - @param --ngpus parallel-workers per trial - @param --scheduler schedulers, ASHA, BOHB, PBT, RAND (no algorithm) - @param --data-dir dataset location - """ - parser = argparse.ArgumentParser(description='HPO Suite for AI4HPC') - parser.add_argument('--num-samples', type=int, default=24, metavar='N', - help='number of samples to train (default: 24)') - parser.add_argument('--max-iterations', type=int, default=10, metavar='N', - help='maximum iterations to train (default: 10)') - parser.add_argument('--ngpus', type=int, default=1, metavar='N', - help='number of GPUs used in a single trial (default: 1)') - parser.add_argument('--scheduler', type=str, default='RAND', - help='scheduler for tuning (default: RandomSearch)') - parser.add_argument('--data-dir', type=str, default='', - help='data directory for cifar-10 dataset') - return parser - -def accuracy(output, target): - """! 
function that computes the accuracy of an output and target vector - @param output vector that the model predicted - @param target actual vector - - @return correct number of correct predictions - @return total number of total elements - """ - # get the index of the max log-probability - pred = output.max(1, keepdim=True)[1] - - # count correct classifications - correct = pred.eq(target.view_as(pred)).cpu().float().sum() - - # count total samples - total = target.size(0) - return correct, total - -def par_sum(field): - """! function that sums a field across all workers to a worker - @param field field in worker that should be summed up - - @return sum of all fields - """ - # convert field to tensor - res = torch.Tensor([field]) - - # move field to GPU/worker - res = res.cuda() - - # AllReduce operation - dist.all_reduce(res,op=dist.ReduceOp.SUM,group=None,async_op=True).wait() - - return res - -def load_data(data_dir=None): - """! function that loads training and test set of cifar-10 - @param data_dir directory where the data is stored - - @return train_set training set of cifar-10 - @return test_set test set of cifar-10 - """ - # vision preprocessing values - mean = [x / 255 for x in [125.3, 123.0, 113.9]] - std = [x / 255 for x in [63.0, 62.1, 66.7]] - - # transformations for the training set - transform_train = transforms.Compose([ - transforms.RandomCrop(32, padding=4), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - transforms.Normalize(mean, std), - ]) - - # transformations for the testset - transform_test = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize(mean, std), - ]) - - # load the cifar-10 dataset from directory - train_set = torchvision.datasets.CIFAR10( - root=data_dir, train=True, download=False, transform=transform_train) - - test_set = torchvision.datasets.CIFAR10( - root=data_dir, train=False, download=False, transform=transform_test) - - return train_set, test_set - -def train_cifar(config): - """! function to train a ResNet on cifar-10 with different hyperparameters - @param config hyperparameter search space - """ - # load a ResNet model - model = models.resnet18() - - # define optimizer and loss function - criterion = nn.CrossEntropyLoss() - optimizer = optim.SGD(model.parameters(), lr=config["lr"]*dist.get_world_size()) - - if session.get_checkpoint() and args.scheduler == "PBT": - step = 1 - # Load model state and iteration step from checkpoint. - checkpoint_dict = session.get_checkpoint().to_dict() - model.load_state_dict(checkpoint_dict["model_state_dict"]) - # Load optimizer state (needed since we're using momentum), - # then set the `lr` and `momentum` according to the config. - optimizer.load_state_dict(checkpoint_dict["optimizer_state_dict"]) - - # Note: Make sure to increment the checkpointed step by 1 to get the current step. 
- last_step = checkpoint_dict["step"] - step = last_step + 1 - - # prepare the model for Ray Tune - model = train.torch.prepare_model(model) - - # load the training and test data - train_set, test_set = load_data(str(config["data_dir"])) - - # define the train and test dataloader - train_loader = torch.utils.data.DataLoader( - train_set, - batch_size=int(config["batch_size"]), - shuffle=True, - num_workers=30) - - test_loader = torch.utils.data.DataLoader( - test_set, - batch_size=int(config["batch_size"]), - shuffle=False, - num_workers=30) - - # prepare the dataloaders for Ray Tune - train_loader = train.torch.prepare_data_loader(train_loader) - test_loader = train.torch.prepare_data_loader(test_loader) - - # prepare metrics - train_acc = 0 - train_correct = 0 - train_total = 0 - - test_acc = 0 - test_correct = 0 - test_total = 0 - - # training and testing loop - for epoch in range(100): - # prepare model for training and loop over training dataset - model.train() - for i, (images, target) in enumerate(train_loader): - # compute output - optimizer.zero_grad() - output = model(images) - - # compute loss - loss = criterion(output, target) - - # count correct classifications - tmp_correct, tmp_total = accuracy(output, target) - train_correct +=tmp_correct - train_total +=tmp_total - - # backpropagation and optimization step - loss.backward() - optimizer.step() - - # average the train metrics over all workers - train_correct = par_sum(train_correct) - train_total = par_sum(train_total) - - # compute final training accuracy - train_acc = train_correct/train_total - - # only perform the testing loop every 10 epochs - if ((epoch+1)%10 == 0): - # prepare model for testing and loop over test dataset - model.eval() - with torch.no_grad(): - for i, (images, target) in enumerate(test_loader): - - # compute output - output = model(images) - - # count correct classifications - tmp_correct, tmp_total = accuracy(output, target) - test_correct +=tmp_correct - test_total +=tmp_total - - # average the test metrics over all workers - test_correct = par_sum(test_correct) - test_total = par_sum(test_total) - - # compute final test accuracy - test_acc = test_correct/test_total - - # report the training and testing accuracy back to the head node of Ray Tune - session.report({"train_acc": train_acc.item(), "test_acc": test_acc.item()}) - - # PBT specific - if args.scheduler == "PBT": - step += 1 - -def main(args): - """! 
main function - @param args input arguments - """ - - # initialize Ray with the correct address and node IP address - ray.init(address=os.environ['ip_head'], _node_ip_address=os.environ["head_node_ip"]) - - # define the hyperparameter search space - config = { - "batch_size": tune.choice([64, 128, 256, 512]), - "lr": tune.loguniform(10e-5, 1), - "data_dir": tune.choice([args.data_dir]), - } - - # set search algorithm - search_alg = None - - if (args.scheduler == "ASHA"): - from ray.tune.schedulers import ASHAScheduler - # Asynchronous Successive Halving Algorithm - scheduler = ASHAScheduler( - # the number of iterations to allow the trials to run at max - max_t=args.max_iterations, - # how many iterations before a bad trial gets terminated - grace_period=2, - # which percentage of trials to terminate - reduction_factor=3) - - elif (args.scheduler == "BOHB"): - from ray.tune.schedulers.hb_bohb import HyperBandForBOHB - from ray.tune.search.bohb import TuneBOHB - # Bayesian Optimization and HyperBand - scheduler = HyperBandForBOHB( - # time attribute - time_attr="training_iteration", - # the number of iterations to allow the trials to run at max - max_t=args.max_iterations, - # which percentage of trials to terminate - reduction_factor=3) - - # modify search algorithm for BOHB - search_alg = TuneBOHB(seed=42) - - elif (args.scheduler == "PBT"): - from ray.tune.schedulers import PopulationBasedTraining - from ray.air import session, Checkpoint - # define the mutation config - mutation_config = {"lr": tune.loguniform(10e-5, 1),} - - # Population Based Training - scheduler = PopulationBasedTraining( - # time attribute - time_attr="training_iteration", - # interval at which perturbations occur - perturbation_interval=1, - # specification of the hyperparameter mutation search space (can be different than the original search space!) - hyperparam_mutations={"train_loop_config": mutation_config}, - # the parameters of the top quantile_fraction percentage of trials are transferred to the bottom - # quantile_fraction percentage of trials - quantile_fraction=0.33, - # probability to resample from the original hyperparameter search space - resample_probability=0, - ) - - elif (args.scheduler == "RAND"): - # random scheduler - scheduler = None - - # define a reporter/logger to specify which metrics to print out during the optimization process - reporter = CLIReporter( - metric_columns=["train_acc", "test_acc", "training_iteration", "time_this_iter_s", "time_total_s"], - max_report_frequency=60) - - # define the general RunConfig of Ray Tune - run_config = RunConfig( - # name of the training run (directory name). - name="cifar_test_training", - # directory to store the Ray Tune results in 
- local_dir=os.path.join(os.path.abspath(os.getcwd()), "ray_results"), - # logger - progress_reporter=reporter, - # stopping criterion when to end the optimization process - stop={"training_iteration": args.max_iterations}) - - if (args.scheduler == "PBT"): - #checkpointing - run_config.checkpoint_config=ray.air.CheckpointConfig(checkpoint_score_attribute="test_acc") - - # wrapping the torch training function inside a TorchTrainer logic - trainer = TorchTrainer( - # torch training function - train_loop_per_worker=train_cifar, - # default hyperparameters for the function - train_loop_config={"batch_size": 64, "lr": 0.1, "data_dir": "/"}, - # setting the default resources/workers to use for the training function, including the number of CPUs and GPUs - scaling_config=ScalingConfig(num_workers=args.ngpus, use_gpu=True, resources_per_worker={"CPU": 30, "GPU": 1}), - ) - - # defining the hyperparameter tuner - tuner = Tuner( - # function to tune - trainer, - # hyperparameter search space - param_space={"train_loop_config": config}, - # the tuning configuration - tune_config=TuneConfig( - # define how many trials to evaluate - num_samples=args.num_samples, - # define which metric to use for measuring the performance of the trials - metric="test_acc", - # if the metric should be maximized or minimized - mode="max", - # define which scheduler to use - scheduler=scheduler, - # define which search algorithm to use - search_alg=search_alg), - run_config=run_config - ) - - # measure the total runtime - start_time = time.time() - - # start the optimization process - result = tuner.fit() - - runtime = time.time() - start_time - - # print total runtime - print("Total runtime: ", runtime) - - # print metrics of the best trial - best_result = result.get_best_result(metric="test_acc", mode="max") - - print("Best result metrics: ", best_result) - - # print results dataframe - print("Result dataframe: ") - print(result.get_dataframe().sort_values("test_acc", ascending=False)) - -if __name__ == "__main__": - # get custom arguments from parser - parser = parsIni() - args = parser.parse_args() - - # call the main function to launch Ray - main(args) - -# eof diff --git a/scripts/jureca_raytune/cifar_tune.py b/scripts/jureca_raytune/cifar_tune.py deleted file mode 100644 index 59bacbc..0000000 --- a/scripts/jureca_raytune/cifar_tune.py +++ /dev/null @@ -1,104 +0,0 @@ -from functools import partial -import numpy as np -import os -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -import torchvision -import torchvision.transforms as transforms -import torchvision.models as models -import ray -from ray import tune -from ray.tune import CLIReporter - -def load_data(data_dir=None): - transform = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) - ]) - - trainset = torchvision.datasets.CIFAR10( - root=data_dir, train=True, download=False, transform=transform) - - return trainset - - -def train_cifar(config, data_dir=None): - - net = models.resnet18() - - device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - - net.to(device) - - criterion = nn.CrossEntropyLoss() - optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9) - - - trainset = load_data(data_dir) - - - trainloader = torch.utils.data.DataLoader( - trainset, - batch_size=int(config["batch_size"]), - shuffle=True, - num_workers=0) - - - for epoch in range(10): # loop over the dataset multiple times - running_loss = 0.0 - 
epoch_steps = 0 - running_correct = 0 - for i, data in enumerate(trainloader, 0): - # get the inputs; data is a list of [inputs, labels] - inputs, labels = data - inputs, labels = inputs.to(device), labels.to(device) - - # zero the parameter gradients - optimizer.zero_grad() - - # forward + backward + optimize - outputs = net(inputs) - loss = criterion(outputs, labels) - pred = outputs.argmax(dim=1, keepdim=True) - loss.backward() - optimizer.step() - - running_correct += pred.eq(labels.view_as(pred)).sum().item() - - - tune.report(loss = loss.item(), accuracy=running_correct / len(trainset)) - - print("Finished Training") - - -def main(num_samples=10, max_num_epochs=10, gpus_per_trial=1): - ray.init(address='auto') - - - config = { - "batch_size": tune.choice([64, 128, 256, 512]), - "lr": tune.loguniform(10e-5, 1) - } - - result = tune.run( - partial(train_cifar, data_dir='/p/project/raise-ctp2/cifar10/data'), - local_dir=os.path.join(os.path.abspath(os.getcwd()), "ray_results"), - resources_per_trial={"cpu": 8, "gpu": gpus_per_trial}, - config=config, - num_samples=num_samples, - scheduler=None) - - - best_trial = result.get_best_trial("loss", "min", "last") - print("Best trial config: {}".format(best_trial.config)) - print("Best trial final validation loss: {}".format( - best_trial.last_result["loss"])) - print("Best trial final validation accuracy: {}".format( - best_trial.last_result["accuracy"])) - - -if __name__ == "__main__": - # You can change the number of GPUs per trial here: - main(num_samples=10, max_num_epochs=10, gpus_per_trial=1) \ No newline at end of file diff --git a/scripts/jureca_raytune/cifar_tune_tf.py b/scripts/jureca_raytune/cifar_tune_tf.py deleted file mode 100644 index ab9572d..0000000 --- a/scripts/jureca_raytune/cifar_tune_tf.py +++ /dev/null @@ -1,76 +0,0 @@ -from functools import partial -import os -import tensorflow as tf -from tensorflow.keras import datasets, layers, models -import tensorflow_datasets as tfds -from tensorflow.keras.applications.resnet50 import ResNet50 -import ray -from ray import tune -from ray.tune import CLIReporter -from ray.tune.integration.keras import TuneReportCallback - - -# transform functions for data preprocessing -def train_transform(inputs): - i = inputs["image"] - i = tf.cast(i, tf.float32) - i = tf.image.resize(i, size=[256,256]) - i = tf.image.random_crop(i, size=[224,224,3]) - i = tf.image.random_flip_left_right(i) - i = tf.keras.applications.resnet50.preprocess_input(i) - i = i / 255.0 - return (i, inputs["label"]) - -def val_transform(inputs): - i = inputs["image"] - i = tf.cast(i, tf.float32) - i = tf.image.resize(i, size=[256,256]) - i = tf.image.central_crop(i, 224/256) - i = tf.keras.applications.resnet50.preprocess_input(i) - i = i / 255.0 - return (i, inputs["label"]) - -# main train function -def train_cifar(config, data_dir=None): - - strategy = tf.distribute.MirroredStrategy() - - # load data - train_ds, test_ds = tfds.load('cifar10', split=['train','test'], data_dir=data_dir, download=False) - - with strategy.scope(): - # prepare data and load model - train_ds=train_ds.map(train_transform).batch(config["batch_size"]) - test_ds=test_ds.map(val_transform).batch(config["batch_size"]) - - model = ResNet50(weights=None) - - - # compile and run model - model.compile(optimizer='adam', - loss=tf.keras.losses.SparseCategoricalCrossentropy(), - metrics=['accuracy']) - - history = model.fit(train_ds,validation_data=test_ds, epochs=10, verbose=2, callbacks=[TuneReportCallback({"loss": "loss"})]) - - -def 
main(num_samples=10, max_num_epochs=10, gpus_per_trial=4): - ray.init(address='auto') - - - config = { - "batch_size": tune.choice([32, 64, 128, 256]) - } - - result = tune.run( - partial(train_cifar, data_dir='/p/project/raise-ctp2/tensorflow_datasets/'), - local_dir=os.path.join(os.path.abspath(os.getcwd()), "ray_results"), - resources_per_trial={"cpu": 8, "gpu": gpus_per_trial}, - config=config, - num_samples=num_samples, - scheduler=None) - - -if __name__ == "__main__": - # You can change the number of GPUs per trial here: - main(num_samples=10, max_num_epochs=10, gpus_per_trial=4) diff --git a/scripts/jureca_raytune/create_jureca_env.sh b/scripts/jureca_raytune/create_jureca_env.sh deleted file mode 100644 index 345b8a0..0000000 --- a/scripts/jureca_raytune/create_jureca_env.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -ml --force purge - -ml Stages/2022 GCC/11.2.0 OpenMPI/4.1.2 PyTorch/1.11-CUDA-11.5 torchvision/0.12.0-CUDA-11.5 - -python3 -m venv ray_tune_env - -source ray_tune_env/bin/activate - -pip3 install ray ray[tune] - -## optional: -## pip3 install tensorflow tensorflow-datasets - -deactivate diff --git a/scripts/jureca_raytune/jureca_run_ray.sh b/scripts/jureca_raytune/jureca_run_ray.sh deleted file mode 100644 index a983e5b..0000000 --- a/scripts/jureca_raytune/jureca_run_ray.sh +++ /dev/null @@ -1,92 +0,0 @@ -#!/bin/bash -# shellcheck disable=SC2206 -#SBATCH --job-name=RayTuneTest -#SBATCH --account=raise-ctp2 -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=ray_tune.out -#SBATCH --error=ray_tune.err - -#SBATCH --partition=dc-gpu -#SBATCH --nodes=4 -#SBATCH --tasks-per-node=1 -#SBATCH --cpus-per-task=128 -#SBATCH --gres=gpu:4 -#SBATCH --time=01:00:00 -#SBATCH --exclusive - -ml --force purge - -ml Stages/2022 GCC/11.2.0 OpenMPI/4.1.2 PyTorch/1.11-CUDA-11.5 torchvision/0.12.0-CUDA-11.5 - -source ray_tune_env/bin/activate - -sleep 1 -# make sure CUDA devices are visible -export CUDA_VISIBLE_DEVICES="0,1,2,3" - -num_gpus=4 - -## Limit number of max pending trials -export TUNE_MAX_PENDING_TRIALS_PG=$(($SLURM_NNODES * 4)) - -## Disable Ray Usage Stats -export RAY_USAGE_STATS_DISABLE=1 - -####### this part is taken from the ray example slurm script ##### -set -x - -# __doc_head_address_start__ - -# Getting the node names -nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -nodes_array=($nodes) - -head_node=${nodes_array[0]} -head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) - -# if we detect a space character in the head node IP, we'll -# convert it to an ipv4 address. This step is optional. -if [[ "$head_node_ip" == *" "* ]]; then -IFS=' ' read -ra ADDR <<<"$head_node_ip" -if [[ ${#ADDR[0]} -gt 16 ]]; then - head_node_ip=${ADDR[1]} -else - head_node_ip=${ADDR[0]} -fi -echo "IPV6 address detected. We split the IPV4 address as $head_node_ip" -fi -# __doc_head_address_end__ - -# __doc_head_ray_start__ -port=6379 -ip_head=$head_node_ip:$port -export ip_head -echo "IP Head: $ip_head" - -echo "Starting HEAD at $head_node" -srun --nodes=1 --ntasks=1 -w "$head_node" \ - ray start --head --node-ip-address="$head_node"i --port=$port \ - --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus $num_gpus --block & -# __doc_head_ray_end__ - -# __doc_worker_ray_start__ - -# optional, though may be useful in certain versions of Ray < 1.0. 
-sleep 10 - -# number of nodes other than the head node -worker_num=$((SLURM_JOB_NUM_NODES - 1)) - -for ((i = 1; i <= worker_num; i++)); do - node_i=${nodes_array[$i]} - echo "Starting WORKER $i at $node_i" - srun --nodes=1 --ntasks=1 -w "$node_i" \ - ray start --address "$head_node"i:"$port" --redis-password='5241590000000000' \ - --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus $num_gpus --block & - sleep 5 -done - -echo "Ready" - -python3 -u cifar_tune.py diff --git a/scripts/juwels_ddp/README.md b/scripts/juwels_ddp/README.md deleted file mode 100644 index fb592eb..0000000 --- a/scripts/juwels_ddp/README.md +++ /dev/null @@ -1,33 +0,0 @@ -# DL using DDP on juwels booster - -### DDP source -https://github.com/pytorch/pytorch#from-source - -### juwels documentation -https://apps.fz-juelich.de/jsc/hps/juwels/index.html - -### current isues -1. torchrun: Hostname/endpoint mismatch not handled\ -workaround is to modify torchrun and use included batch script\ -simply run `createEnv.sh` to install fixed torch\ -discussion in: https://github.com/pytorch/pytorch/issues/73656 -2. for containers, instead of #1, use `fixed_torch_run.py` -- follow usage - containers. - -### to-do -1. - -### done -1. fixed local IPs for TCP -2. tested containers \ -https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch -3. Scale up to 2400 GPUs using NCCL backend - -### usage - Python Env -1. run `./env_build.sh` to create env and install torch -2. select a case from CASES folder -3. submit `sbatch env_batch.sh` - -### usage - containers -1. run `./container_build.sh` to build .sif -2. select a case from CASES folder -3. submit `sbatch container_batch.sh` diff --git a/scripts/juwels_ddp/container_batch.sh b/scripts/juwels_ddp/container_batch.sh deleted file mode 100644 index 3aa5716..0000000 --- a/scripts/juwels_ddp/container_batch.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=TorchContTest -#SBATCH --account=slfse -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job.out -#SBATCH --error=job.err -#SBATCH --time=0-00:15:00 - -# configure node and process count on the CM -#SBATCH --partition=develbooster -#SBATCH --nodes=4 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=24 -#SBATCH --gpus-per-node=4 -#SBATCH --exclusive - -# gres options have to be disabled for deepv -#SBATCH --gres=gpu:4 - -# parameters -debug=false # do debug -dataDir='/p/scratch/raise-ctp2/inanc2/T31/' -COMMAND="DDP_ATBL_CAE_mod.py" - -EXEC="$COMMAND \ - --batch-size 1 \ - --epochs 10 \ - --lr 0.001 \ - --nworker $SLURM_CPUS_PER_TASK \ - --shuff \ - --scale-lr \ - --schedule \ - --data-dir $dataDir" - - -### do not modify below ### - - -# set modules and envs -ml GCC/11.3.0 OpenMPI/4.1.4 cuDNN/8.6.0.163-CUDA-11.7 Apptainer-Tools/2023 -source $SLURM_SUBMIT_DIR/torch_env//bin/activate - -# set env vars -export CUDA_VISIBLE_DEVICES="0,1,2,3" -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" > 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi -export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK} - -# job info -echo "DEBUG: TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_GPUS_PER_NODE: $SLURM_GPUS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: 
$SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: SLURM_NODEID: $SLURM_NODEID" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# launch container -srun --cpu-bind=none bash -c "apptainer exec --nv torch.sif \ - python -m fixed_torch_run \ - --log_dir='logs' \ - --nnodes=$SLURM_NNODES \ - --nproc_per_node=$SLURM_GPUS_PER_NODE \ - --rdzv_id=$SLURM_JOB_ID \ - --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ - --rdzv_backend=c10d \ - --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \ - $EXEC" - -#eof diff --git a/scripts/juwels_ddp/container_build.sh b/scripts/juwels_ddp/container_build.sh deleted file mode 100644 index 7c405a1..0000000 --- a/scripts/juwels_ddp/container_build.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -# -*- coding: utf-8 -*- -# author: EI -# version: 2212008a -# pull and build containers for PyTorch/NVIDIA - -# load modules -ml GCC/11.3.0 OpenMPI/4.1.4 cuDNN/8.6.0.163-CUDA-11.7 Apptainer-Tools/2023 - -# create Cache/TMP so that $HOME would not be used -mkdir -p Cache -mkdir -p TMP -export APPTAINER_CACHEDIR=$(mktemp -d -p $PWD/Cache) -export APPTAINER_TMPDIR=$(mktemp -d -p $PWD/TMP) - -# official NVIDIA NVCR container with Torch==2.0.0 -apptainer pull torch.sif docker://nvcr.io/nvidia/pytorch:23.03-py3 - -# run bash to create envs -echo "running ./container_env.sh" -apptainer exec torch.sif bash -c "./container_env.sh" - -#eof diff --git a/scripts/juwels_ddp/container_env.sh b/scripts/juwels_ddp/container_env.sh deleted file mode 100644 index 641140a..0000000 --- a/scripts/juwels_ddp/container_env.sh +++ /dev/null @@ -1,13 +0,0 @@ -nname='torch_env' - -# create env inside container -python3 -m venv $nname --system-site-packages -source ${nname}/bin/activate - -# install wheels -- from this point on, feel free to add anything -pip3 install -r reqs.txt - -# modify l.4 of /torchnlp/_third_party/weighted_random_sampler.py -var='int_classes = int' -sed -i "4s|.*|$var|" \ - $PWD/${nname}/lib/python3.8/site-packages/torchnlp/_third_party/weighted_random_sampler.py diff --git a/scripts/juwels_ddp/createEnv.sh b/scripts/juwels_ddp/createEnv.sh deleted file mode 100755 index ea826fd..0000000 --- a/scripts/juwels_ddp/createEnv.sh +++ /dev/null @@ -1,193 +0,0 @@ -#!/bin/bash -# -*- coding: utf-8 -*- -# author: EI -# version: 220328a -# creates machine specific python env - -# set modules -ml --force purge - -# get sys info -cDir=$PWD -sysN="$(uname -n | cut -f2- -d.)" -echo "system:${sysN}" -echo - -cont1=false -if [[ $sysN = 'deepv' || $sysN = 'dp-esb'* ]] ; then - sysN=deepv - ml use $OTHERSTAGES - ml Stages/2022 NVHPC/22.1 OpenMPI/4.1.2 NCCL/2.15.1-1-CUDA-11.5 cuDNN/8.3.1.22-CUDA-11.5 - ml Python/3.9.6 HDF5 CMake - ml -nvidia-driver/.default - cont1=true -elif [ "$sysN" = 'juwels' ] ; then - ml Stages/2023 StdEnv/2023 NVHPC/23.1 OpenMPI/4.1.4 cuDNN/8.6.0.163-CUDA-11.7 - ml Python/3.10.4 CMake HDF5 PnetCDF libaio/0.3.112 - cont1=true -elif [ "$sysN" = 'jureca' ] ; then - #ml Stages/2023 StdEnv/2023 NVHPC/23.1 OpenMPI/4.1.4 cuDNN/8.6.0.163-CUDA-11.7 - ml Stages/2023 StdEnv/2023 GCC/11.3.0 OpenMPI/4.1.4 cuDNN/8.6.0.163-CUDA-11.7 - ml Python/3.10.4 CMake HDF5 PnetCDF libaio/0.3.112 - cont1=true -else - echo - echo 'unknown system detected' - echo 'canceling' - echo -fi -echo "modules loaded" -echo - -# get python version -pver="$(python --version 2>&1 | awk {'print $2'} | cut -f1-2 -d.)" -echo "python version is ${pver}" 
-echo - -# create env -if [ "$cont1" = true ] ; then - if [ -d "${cDir}/envAI_${sysN}" ];then - echo 'env already exist' - echo - - source envAI_${sysN}/bin/activate - else - python3 -m venv envAI_${sysN} - - # get headers for pip - if [ -f "${cDir}/envAI_${sysN}/bin/pip3" ]; then - echo 'pip already exist' - else - cp "$(which pip3)" $cDir/envAI_${sysN}/bin/ - ln -s $cDir/envAI_${sysN}/bin/pip3 $cDir/envAI_${sysN}/bin/pip${pver} - var="#!$cDir/envAI_${sysN}/bin/python${pver}" - sed -i "1s|.*|$var|" $cDir/envAI_${sysN}/bin/pip3 - fi - - # activate env - source envAI_${sysN}/bin/activate - - echo "a new env is created in ${cDir}" - echo "activation is done via:" - echo "source ${cDir}/envAI_${sysN}/bin/activate" - fi -fi - -# set tmp dir env var -export TMPDIR=${cDir} - -# install torch -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - echo 'Torch already installed' - echo -else - # Stages/2023 - CUDA/11.7 - torch 2.0 stable - pip3 install torch torchvision torchaudio --no-cache-dir -fi - -# install horovod -if [ -f "${cDir}/envAI_${sysN}/bin/horovodrun" ]; then - echo 'Horovod already installed' - echo -else - # compiler vars - export LDSHARED="$CC -shared" && - - # CPU vars - export HOROVOD_WITH_MPI=1 - export HOROVOD_MPI_THREADS_DISABLE=1 - export HOROVOD_CPU_OPERATIONS=MPI - - # GPU vars - #export HOROVOD_GPU=CUDA - #export HOROVOD_CUDA_HOME=$EBROOTCUDA - #export HOROVOD_GPU_OPERATIONS=MPI - #export HOROVOD_GPU_OPERATIONS=NCCL - export HOROVOD_GPU_ALLREDUCE=NCCL - export HOROVOD_NCCL_LINK=SHARED - export HOROVOD_NCCL_HOME=$EBROOTNCCL - - # Host language vars - export HOROVOD_WITH_PYTORCH=1 - export HOROVOD_WITHOUT_TENSORFLOW=1 - export HOROVOD_WITHOUT_MXNET=1 - - pip3 install --no-cache-dir wheel - pip3 install --no-cache-dir horovod -fi - -# install deepspeed -if [ -f "${cDir}/envAI_${sysN}/bin/deepspeed" ]; then - echo 'DeepSpeed already installed' - echo -else - # compile all opt. stuff - not needed & not working - #export DS_BUILD_OPS=1 - # compile req. opt. 
stuff - export DS_BUILD_FUSED_ADAM=1 - export DS_BUILD_UTILS=1 - if [ "$sysN" = 'deepv' ] ; then - #fix libaio issues via: - export DS_BUILD_AIO=0 - fi - - pip3 install --no-cache-dir DeepSpeed - - # add this to .../deepspeed/launcher/launch.py l.93 - var=' args.node_rank=int(os.environ.get("SLURM_PROCID",0))' - sed -i "132s|.*|$var|" $cDir/envAI_${sysN}/lib/python${pver}/site-packages/deepspeed/launcher/launch.py -fi - -# install heat -if [ -d "${cDir}/envAI_${sysN}/lib/python${pver}/site-packages/heat" ]; then - echo 'HeAT already installed' - echo -else - export CFLAGS="-noswitcherror" - export CXXFLAGS="-noswitcherror" - - # experimental - # modify setup.py to accep torch>1.7 for heat - git clone --recursive https://github.com/helmholtz-analytics/heat.git heat - var=' "torch>=1.7.0",' - sed -i "36s|.*|$var|" heat/setup.py - - # create tar ball - tar czf heat.tar.gz - - # install experimental heat - pip3 install --no-cache-dir 'heat.tar.gz[hdf5,netcdf]' -fi - -# get rest of the libraries$ -if [ "$cont1" = true ] ; then - # install rest - pip3 install -r reqs.txt - - # modify l.4 of /torchnlp/_third_party/weighted_random_sampler.py - var='int_classes = int' - sed -i "4s|.*|$var|" \ - $cDir/envAI_${sysN}/lib/python${pver}/site-packages/torchnlp/_third_party/weighted_random_sampler.py -fi - -# fix IB IP config -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - sed -i -e '3,8s/^/#/' ${cDir}/envAI_${sysN}/bin/torchrun - echo """ -import re -import sys -from torch.distributed.run import main -from torch.distributed.elastic.agent.server import api as sapi - -def new_get_fq_hostname(): - return _orig_get_fq_hostname().replace('.', 'i.', 1) - -if __name__ == '__main__': - _orig_get_fq_hostname = sapi._get_fq_hostname - sapi._get_fq_hostname = new_get_fq_hostname - sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) - sys.exit(main()) -""" >> ${cDir}/envAI_${sysN}/bin/torchrun -fi - -#eof diff --git a/scripts/juwels_ddp/env_batch.sh b/scripts/juwels_ddp/env_batch.sh deleted file mode 100644 index 5c3b7eb..0000000 --- a/scripts/juwels_ddp/env_batch.sh +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=TorchTest -#SBATCH --account=slfse -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job.out -#SBATCH --error=job.err -#SBATCH --time=0-00:15:00 - -# configure node and process count on the CM -#SBATCH --partition=develbooster -#SBATCH --nodes=4 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=24 -#SBATCH --gpus-per-node=4 -#SBATCH --exclusive - -# gres options have to be disabled for deepv -#SBATCH --gres=gpu:4 - -# parameters -debug=false # do debug -dataDir='/p/scratch/raise-ctp2/inanc2/T31/' -COMMAND="DDP_ATBL_CAE_mod.py" - -EXEC="$COMMAND \ - --batch-size 1 \ - --epochs 10 \ - --lr 0.001 \ - --nworker $SLURM_CPUS_PER_TASK \ - --shuff \ - --scale-lr \ - --schedule \ - --data-dir $dataDir" - - -### do not modify below ### - - -# set modules -ml --force purge -ml Stages/2023 StdEnv/2023 NVHPC/23.1 OpenMPI/4.1.4 cuDNN/8.6.0.163-CUDA-11.7 -ml Python/3.10.4 HDF5 libaio/0.3.112 - -# set env -source /p/project/prcoe12/RAISE/envAI_juwels/bin/activate - -# sleep a sec -sleep 1 - -# set env vars -export CUDA_VISIBLE_DEVICES="0,1,2,3" -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" > 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi -export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK} - -# job info -echo "DEBUG: TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo 
"DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_GPUS_PER_NODE: $SLURM_GPUS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: SLURM_NODEID: $SLURM_NODEID" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# launch -srun --cpu-bind=none bash -c "torchrun \ - --log_dir='logs' \ - --nnodes=$SLURM_NNODES \ - --nproc_per_node=$SLURM_GPUS_PER_NODE \ - --rdzv_id=$SLURM_JOB_ID \ - --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ - --rdzv_backend=c10d \ - --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \ - $EXEC" - -# eof diff --git a/scripts/juwels_ddp/env_build.sh b/scripts/juwels_ddp/env_build.sh deleted file mode 100755 index b237a1b..0000000 --- a/scripts/juwels_ddp/env_build.sh +++ /dev/null @@ -1,151 +0,0 @@ -#!/bin/bash -# -*- coding: utf-8 -*- -# author: EI -# version: 220211a -# creates machine specific python env -# note: Stage 2023 has issues, this uses Stage 2022 instead - -# set modules -module --force purge - -# get sys info -cDir=$PWD -sysN="$(uname -n | cut -f2- -d.)" -echo "system:${sysN}" -echo - -cont1=false -if [ "$sysN" = 'deepv' ] ; then - module use $OTHERSTAGES - ml GCC ParaStationMPI/5.4.9-1-mt Python cuDNN NCCL Python - cont1=true -elif [ "$sysN" = 'juwels' ] ; then - ml Stages/2022 GCC ParaStationMPI Python CMake - cont1=true -elif [ "$sysN" = 'jureca' ] ; then - #ml Stages/2022 GCC ParaStationMPI Python CMake NCCL libaio # Horovod issues with pscom?? - ml Stages/2022 GCC OpenMPI Python NCCL cuDNN libaio CMake - cont1=true -else - echo - echo 'unknown system detected' - echo 'canceling' - echo -fi -echo "modules loaded" -echo - -# get python version -pver="$(python --version 2>&1 | awk {'print $2'} | cut -f1-2 -d.)" -echo "python version is ${pver}" -echo - -if [ "$cont1" = true ] ; then - if [ -d "${cDir}/envAI_${sysN}" ];then - echo 'env already exist' - echo - - source envAI_${sysN}/bin/activate - else - # create env - python3 -m venv envAI_${sysN} - - # get headers for pip - if [ -f "${cDir}/envAI_${sysN}/bin/pip3" ]; then - echo 'pip already exist' - else - cp "$(which pip3)" $cDir/envAI_${sysN}/bin/ - ln -s $cDir/envAI_${sysN}/bin/pip3 $cDir/envAI_${sysN}/bin/pip${pver} - var="#!$cDir/envAI_${sysN}/bin/python${pver}" - sed -i "1s|.*|$var|" $cDir/envAI_${sysN}/bin/pip3 - fi - - # activate env - source envAI_${sysN}/bin/activate - - echo "a new env is created in ${cDir}" - echo "activation is done via:" - echo "source ${cDir}/envAI_${sysN}/bin/activate" - fi -fi - -# install torch -if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then - echo 'Torch already installed' - echo -else - export TMPDIR=${cDir} - - pip3 install \ - torch==1.10.2+cu113 torchvision==0.11.3+cu113 torchaudio==0.10.2+cu113 \ - -f https://download.pytorch.org/whl/cu113/torch_stable.html --no-cache-dir -fi - -# install horovod -if [ -f "${cDir}/envAI_${sysN}/bin/horovodrun" ]; then - echo 'Horovod already installed' - echo -else - export HOROVOD_GPU=CUDA - export HOROVOD_GPU_OPERATIONS=NCCL - export HOROVOD_WITH_PYTORCH=1 - export TMPDIR=${cDir} - - pip3 install --no-cache-dir horovod --ignore-installed -fi - -# install deepspeed -if [ -f "${cDir}/envAI_${sysN}/bin/deepspeed" ]; then - echo 'DeepSpeed already installed' - echo -else - 
export DS_BUILD_OPS=1 - # if above not working?? recursion error use this - #export DS_BUILD_FUSED_ADAM=1 - #export DS_BUILD_UTILS=1 - export TMPDIR=${cDir} - - pip3 install --no-cache-dir DeepSpeed - - add this to .../deepspeed/launcher/launch.py l.70 - var=' argsy1.node_rank=int(os.environ.get("SLURM_PROCID",0))' - sed -i "85s|.*|$var|" $cDir/envAI_${sysN}/lib/python${pver}/site-packages/deepspeed/launcher/launch.py -fi - -# install heat -if [ -d "${cDir}/envAI_${sysN}/lib/python${pver}/site-packages/heat" ]; then - echo 'HeAT already installed' - echo -else - export TMPDIR=${cDir} - - # need to modify setup.py to accep torch>1.9 for heat - wget https://files.pythonhosted.org/packages/5d/3a/4781f1e6910753bfdfa6712c83c732c60e675d8de14983926a0d9306c7a6/heat-1.1.1.tar.gz - tar xzf heat-1.1.1.tar.gz - var=' "torch>=1.7.0",' - sed -i "36s|.*|$var|" heat-1.1.1/setup.py - var=' "torchvision>=0.8.0",' - sed -i "39s|.*|$var|" heat-1.1.1/setup.py - - # create tar again! - rm -rf heat-1.1.1.tar.gz - tar czf heat-1.1.1.tar.gz heat-1.1.1 - rm -rf heat-1.1.1 - - pip3 install --no-cache-dir 'heat-1.1.1.tar.gz[hdf5,netcdf]' - - rm -rf heat-1.1.1.tar.gz -fi - -# get rest of the libraries$ -if [ "$cont1" = true ] ; then - # install rest - pip3 install -r reqs.txt --ignore-installed - - # modify l.4 of /torchnlp/_third_party/weighted_random_sampler.py - var='int_classes = int' - sed -i "4s|.*|$var|" \ - $cDir/envAI_${sysN}/lib/python${pver}/site-packages/torchnlp/_third_party/weighted_random_sampler.py -fi - -#eof diff --git a/scripts/juwels_ddp/fixed_torch_run.py b/scripts/juwels_ddp/fixed_torch_run.py deleted file mode 100644 index cca9706..0000000 --- a/scripts/juwels_ddp/fixed_torch_run.py +++ /dev/null @@ -1,51 +0,0 @@ -from argparse import ArgumentParser -import ipaddress -import runpy -import socket - -from torch.distributed.elastic.agent.server import api as sapi - - -def parse_host(): - parser = ArgumentParser() - parser.add_argument('--rdzv_endpoint') - endpoint = parser.parse_known_args()[0].rdzv_endpoint - host = ( - endpoint.split(':', 1)[0] - if endpoint - else None - ) - return host - - -def fix_torch_run(host): - _orig_get_fq_hostname = sapi._get_fq_hostname - - if host: - try: - ipaddress.ip_address(host) - is_ip = True - except ValueError: - is_ip = False - - if is_ip: - def new_get_fq_hostname(): - return socket.gethostbyaddr(host)[0] - else: - def new_get_fq_hostname(): - return socket.getfqdn(host) - else: - new_get_fq_hostname = _orig_get_fq_hostname - - sapi._get_fq_hostname = new_get_fq_hostname - - -def main(): - host = parse_host() - fix_torch_run(host) - runpy.run_module('torch.distributed.run', run_name='__main__') - - -if __name__ == '__main__': - main() - diff --git a/scripts/juwels_ddp/install_pyDDP.sh b/scripts/juwels_ddp/install_pyDDP.sh deleted file mode 100755 index 59213f0..0000000 --- a/scripts/juwels_ddp/install_pyDDP.sh +++ /dev/null @@ -1,80 +0,0 @@ -#!/bin/sh -# author: EI -# version: 210709a - -# get dir -iDir=$PWD - -# set modules -module --force purge -module use $OTHERSTAGES -ml Stages/2020 GCC/9.3.0 ParaStationMPI/5.4.7-1-mt CMake Ninja cuDNN NCCL mpi-settings/CUDA - -# conda -if [ -d "${iDir}/miniconda3" ];then - echo "miniconda3 already installed!" - source ${iDir}/miniconda3/etc/profile.d/conda.sh - conda activate -else - echo "miniconda3 will be compiled to ${iDir}/miniconda3!" 
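# fixed_torch_run.py (listed above) wraps torch.distributed.run and swaps the
# fully-qualified-hostname lookup so that the --rdzv_endpoint host resolves
# consistently inside containers. A minimal sketch of invoking it the way
# container_batch.sh does, with the apptainer exec wrapper omitted; train.py
# is a placeholder training script and head_node is assumed to hold the first
# hostname of the allocation.
head_node="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
python -m fixed_torch_run \
    --nnodes="$SLURM_NNODES" \
    --nproc_per_node="$SLURM_GPUS_PER_NODE" \
    --rdzv_id="$SLURM_JOB_ID" \
    --rdzv_backend=c10d \
    --rdzv_endpoint="${head_node}i:29500" \
    train.py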
- wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh - bash Miniconda3-latest-Linux-x86_64.sh -p ${iDir}/miniconda3 -b - source ${iDir}/miniconda3/etc/profile.d/conda.sh - conda activate - # std libs - conda install -y astunparse numpy pyyaml mkl mkl-include setuptools cffi typing_extensions future six requests dataclasses Pillow --force-reinstall - # cuda - check version with yours - conda install -c pytorch -y magma-cuda110 --force-reinstall - conda install -y pkg-config libuv --force-reinstall - rm -f Miniconda3-latest-Linux-x86_64.sh -fi - -# torch -if [ -d "${iDir}/pytorch/build" ];then - echo 'pytorch already installed!' -else - # clone pytorch - if [ -d "${iDir}/pytorch" ];then - echo 'pytorch repo is found!' - else - git clone --recursive https://github.com/pytorch/pytorch pytorch - fi - - # update repos - cd pytorch - git submodule sync - git submodule update --init --recursive - - # install pytorch - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} - export TMPDIR=${iDir}/tmp - python setup.py clean - CMAKE_C_COMPILER=$(which mpicc) CMAKE_CXX_COMPILER=$(which mpicxx) USE_DISTRIBUTED=ON USE_MPI=ON USE_CUDA=ON NCCL_ROOT_DIR=$EBROOTNCCL USE_NCCL=ON USE_GLOO=ON CUDNN_ROOT=$EBROOTCUDNN USE_CUDNN=ON python setup.py install - cd .. -fi - -# torchvision -if [ -d "${iDir}/torchvision/build" ];then - echo 'torchvision already installed!' -else - # clone torchvision - if [ -d "${iDir}/torchvision" ];then - echo 'torchvision repo is found!' - else - git clone --recursive https://github.com/pytorch/vision.git torchvision - fi - - # update repos - cd torchvision - git submodule sync - git submodule update --init --recursive - - # install torchvision - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} - export TMPDIR=${iDir}/tmp - python setup.py clean - CMAKE_C_COMPILER=$(which mpicc) CMAKE_CXX_COMPILER=$(which mpicxx) FORCE_CUDA=ON python setup.py install -fi - -echo 'done!' -# eof diff --git a/scripts/juwels_ddp/lamec.json b/scripts/juwels_ddp/lamec.json deleted file mode 100644 index af5277f..0000000 --- a/scripts/juwels_ddp/lamec.json +++ /dev/null @@ -1 +0,0 @@ -{"startscript": "env_batch.sh"} \ No newline at end of file diff --git a/scripts/juwels_ddp/reqs.txt b/scripts/juwels_ddp/reqs.txt deleted file mode 100644 index 3db4809..0000000 --- a/scripts/juwels_ddp/reqs.txt +++ /dev/null @@ -1,12 +0,0 @@ -python-hostlist -Pillow -pyparsing -python-dateutil -matplotlib -h5py -pytorch-nlp -pyprof -filelock -scipy -perlin_noise -noise diff --git a/scripts/vega_basilisk/.gitkeep b/scripts/vega_basilisk/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/scripts/vega_basilisk/basilisk_cfd.sh b/scripts/vega_basilisk/basilisk_cfd.sh deleted file mode 100644 index faaa5c8..0000000 --- a/scripts/vega_basilisk/basilisk_cfd.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=case001 -#SBATCH --account=r2203-054-users -#SBATCH --nodes=1 -#SBATCH --ntasks=32 -#SBATCH --hint=nomultithread -#SBATCH --mem=64G -#SBATCH --time=24:00:00 -#SBATCH --output=job.%j.out -#SBATCH --error=job.%j.err -#SBATCH --partition=cpu - -module purge -module load gc -module load openmpi/gnu/4.1.2.1 - -echo "Starting at `date`" -echo "Running on hosts: $SLURM_NODELIST" -echo "Running on $SLURM_NNODES nodes." -echo "Running on $SLURM_NPROCS processors." 
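# After the from-source PyTorch build in install_pyDDP.sh above, a short
# sanity check along these lines (a sketch; run inside the activated conda
# env) confirms that the CUDA, NCCL and MPI backends were actually built in:
python <<'EOF'
import torch
import torch.distributed as dist
print("torch:", torch.__version__)
print("cuda :", torch.cuda.is_available())
print("nccl :", dist.is_nccl_available())
print("mpi  :", dist.is_mpi_available())
EOF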
-echo "Job id is $SLURM_JOBID" -pi=`echo "4*a(1)" | bc -l` -p1_max=`echo "$pi/6.0" | bc -l` -p1_min=0 -p2_max=`echo "$pi/18.0" | bc -l` -p2_min=0 -p3_max=5.0 -p3_min=5.0 -p4_max=20.0 -p4_min=0 -p5_max=20.0 -p5_min=0 -p6_max=`echo "$pi/2.0" | bc -l` -p6_min=`echo "$pi/2.0" | bc -l` -p7_max=`echo "$pi" | bc -l` -p7_min=`echo "$pi" | bc -l` -xc_max=0.6 -xc_min=0.4 -yc_max=0.6 -yc_min=0.4 - -file="params.in" - -if ! [[ -f "restart" ]] ; then - RANDOM=$(date +%s%N | cut -b10-19) # give a seed - echo "$RANDOM / 32767 * ($p1_max-$p1_min) + $p1_min" | bc -l > $file - echo "$RANDOM / 32767 * ($p2_max-$p2_min) + $p2_min" | bc -l >> $file - echo "$RANDOM / 32767 * ($p3_max+$p3_min) - $p3_min" | bc -l >> $file - echo "$RANDOM / 32767 * ($p4_max-$p4_min) + $p4_min" | bc -l >> $file - echo "$RANDOM / 32767 * ($p5_max-$p5_min) + $p5_min" | bc -l >> $file - echo "$RANDOM / 32767 * ($p6_max+$p6_min) - $p6_min" | bc -l >> $file - echo "$RANDOM / 32767 * ($p7_max+$p7_min) - $p7_min" | bc -l >> $file - echo "$RANDOM / 32767 * ($xc_max-$xc_min) + $xc_min" | bc -l >> $file - echo "$RANDOM / 32767 * ($yc_max-$yc_min) + $yc_min" | bc -l >> $file -fi - - -if ! [[ -d "output/" ]] ; then - mkdir output/ - mkdir output/wet_area/ - mkdir output/facets/ - mkdir output/my_output/ -fi - -CC99='mpicc -std=c99' qcc -O2 -Wall -D_MPI=1 drop.c -o run -lm - -srun --mpi=pmix -K1 -n $SLURM_NTASKS ./run #\ -###srun --mpi=pmix --exclusive -K1 -n $SLURM_NTASKS ./run #\ -### 2> log > out - -echo "Program finished with exit code $? at: `date`" diff --git a/scripts/vega_basilisk/basilisk_pde.sh b/scripts/vega_basilisk/basilisk_pde.sh deleted file mode 100644 index 3f7fb9e..0000000 --- a/scripts/vega_basilisk/basilisk_pde.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=jobrun -#SBATCH --nodes=1 -#SBATCH --ntasks=12 -#SBATCH --hint=nomultithread -#SBATCH --mem=10G -#SBATCH --time=0:40:00 -#SBATCH --output %j.out -#SBATCH --error %j_log -#SBATCH --partition=cpu -#SBATCH --mail-type=end # send email when job ends -##SBATCH --mail-user= - -set -e -if [ -z "$1" ]; then - echo "Missing file for compilation"; - exit 1; -fi - -#create configuration backup -cat ${0} > '.case.cfg' -cat ${1} >> '.case.cfg' - -#get WALLTIME -twLine=$(eval grep -m 1 "time=" ${PWD}/launch.sh) -WALLTIME=${twLine##*=} - -module purge -module load Bison/3.7.1-GCCcore-10.2.0 CMake/3.18.4-GCCcore-10.2.0 Python/3.8.6-GCCcore-10.2.0 flex/2.6.4-GCCcore-10.2.0 SWIG/4.0.2-GCCcore-10.3.0 -module load Mesa/20.2.1-GCCcore-10.2.0 libGLU/9.0.1-GCCcore-10.2.0 -module load OpenMPI/4.1.3-GCC-10.3.0 -module load ImageMagick/7.0.10-35-GCCcore-10.2.0 -module load FFmpeg/4.4.2-GCCcore-11.3.0 - -export BASILISK=/ceph/hpc/home/euyiannisv/basilisk/src -export PATH=$PATH:$BASILISK - - -echo "Starting at `date`" -echo "Running on hosts: $SLURM_NODELIST" -echo "Running on $SLURM_NNODES nodes." -echo "Running on $SLURM_NPROCS processors." -echo "Job id is $SLURM_JOBID" - -PWD=$(eval pwd) -echo "Executable for $1 at ${PWD}" - -GLLIBS="-L${BASILISK}/gl -lglutils -lfb_osmesa -lOSMesa -lGLU" -CC99='mpicc -std=c99' qcc -O2 -DINCLINED -DBDF2 -Wall -D_MPI=1 -I${PWD}/utils $1 -o run ${GLLIBS} -lm - -if [ $? -eq 0 ]; then - echo "Compilation Success."; -else - echo "Compilation error."; - exit 1; -fi - -srun --mpi=pmix -K1 -n $SLURM_NTASKS ${PWD}/run -m $WALLTIME >out 2>log - -echo "Program finished with exit code $? 
at: `date`" diff --git a/scripts/vega_basilisk/lamec.json b/scripts/vega_basilisk/lamec.json deleted file mode 100644 index c714558..0000000 --- a/scripts/vega_basilisk/lamec.json +++ /dev/null @@ -1 +0,0 @@ -{"startscript": "basilisk_cfd.sh"} -- GitLab
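# A condensed sketch of the Basilisk compile-and-run pattern shared by
# basilisk_cfd.sh and basilisk_pde.sh above; case.c stands in for the actual
# source file (drop.c, or the file passed as $1), everything else is taken
# from those scripts:
CC99='mpicc -std=c99' qcc -O2 -Wall -D_MPI=1 case.c -o run -lm
srun --mpi=pmix -K1 -n "$SLURM_NTASKS" ./run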