... | ... | @@ -58,10 +58,104 @@ Once the partitions are configured correctly, you should be able to start a DALL |
|
|
If everything runs fine, adjust the paths in the `sbatch` scripts to match your own directory locations.
|
|
|
|
|
|
## (Data Parallel) Training with Horovod
|
|
|
*TODO*
|
|
|
Example of a job:
|
|
|
|
|
|
```bash
|
|
|
#!/usr/bin/env bash
|
|
|
|
|
|
#SBATCH --nodes 1
|
|
|
#SBATCH --tasks-per-node 4
|
|
|
#SBATCH --gres gpu
|
|
|
#SBATCH -A cstdl
|
|
|
#SBATCH --partition develgpus
|
|
|
|
|
|
HOME_PATH=
|
|
|
CHECKPOINT_NAME=
|
|
|
LOGS_PATH=
|
|
|
|
|
|
module purge
|
|
|
module load Stages/2020 GCC OpenMPI PyTorch torchvision Horovod
|
|
|
|
|
|
DATASET_PATH=/p/scratch/ccstdl/${HOME_PATH}/LAION_SAMPLE/
|
|
|
VQGAN_MODEL_PATH=/p/scratch/ccstdl/${HOME_PATH}/vqgan_models/imagenet_16384_slim.ckpt
|
|
|
VQGAN_CONFIG_PATH=/p/scratch/ccstdl/${HOME_PATH}/vqgan_models/imagenet_16384.yaml
|
|
|
CKPT_SAVE_PATH=/p/scratch/ccstdl/${HOME_PATH}/checkpoints/${CHECKPOINT_NAME}
|
|
|
RESIZE_RATIO=1.0
|
|
|
BATCH_SIZE=32
|
|
|
DEPTH=16
|
|
|
HEADS=16
|
|
|
DIM_HEAD=64
|
|
|
FF_DROPOUT=0.1
|
|
|
ATTN_DROPOUT=0.1
|
|
|
CLIP_GRAD_NORM=0.5
|
|
|
WANDB_NAME=dalle_test_laion_develgpus
|
|
|
LOGFILE=${LOGS_PATH}/dalle_test_laion_develgpus.txt
|
|
|
DIM=512
|
|
|
TEXT_SEQ_LEN=128
|
|
|
EPOCHS=1
|
|
|
|
|
|
export CUDA_VISIBLE_DEVICES=0,1,2,3
|
|
|
srun -A cstdl --cpu-bind=none \
|
|
|
python -u train_dalle.py \
|
|
|
--epochs="$EPOCHS" \
|
|
|
--clip_grad_norm="$CLIP_GRAD_NORM" \
|
|
|
--dim="$DIM" \
|
|
|
--text_seq_len="$TEXT_SEQ_LEN" \
|
|
|
--depth="$DEPTH" \
|
|
|
--heads="$HEADS" \
|
|
|
--dim_head="$DIM_HEAD" \
|
|
|
--ff_dropout="$FF_DROPOUT" \
|
|
|
--attn_dropout="$ATTN_DROPOUT" \
|
|
|
--batch_size="$BATCH_SIZE" \
|
|
|
--random_resize_crop_lower_ratio="$RESIZE_RATIO" \
|
|
|
--vqgan_config_path="$VQGAN_CONFIG_PATH" \
|
|
|
--vqgan_model_path="$VQGAN_MODEL_PATH" \
|
|
|
--dalle_output_file_name="$CKPT_SAVE_PATH" \
|
|
|
--image_text_folder="$DATASET_PATH" \
|
|
|
--wandb_name="$WANDB_NAME" \
|
|
|
--wds=jpg,txt \
|
|
|
--reversible \
|
|
|
--lr_decay \
|
|
|
--taming \
|
|
|
--shift_tokens \
|
|
|
--rotary_emb \
|
|
|
--truncate_captions \
|
|
|
--flops_profiler \
|
|
|
--distributed_backend="horovod" | tee "$LOGFILE"
|
|
|
|
|
|
```
|
|
|
|
|
|
## Training with DeepSpeed
|
|
|
*TODO*
|
|
|
Example of a job:
|
|
|
|
|
|
```bash
|
|
|
|
|
|
#!/usr/bin/env bash

#SBATCH --nodes 1
#SBATCH --tasks-per-node 4
#SBATCH --gres gpu
#SBATCH -A cstdl
#SBATCH --partition develgpus

# Fail fast on errors, unset variables, and mid-pipeline failures.
set -euo pipefail

module purge
module load Stages/2020 GCC OpenMPI PyTorch torchvision DeepSpeed

# ...define vars (LOGFILE and the model/data options — see the Horovod
# example above for the full list).

# NOTE: extra options must be added here, ABOVE the command, not as comment
# lines between the backslash-continued arguments below. A '#' line inside
# a continuation terminates the command early and leaves the remaining
# options as a separate, invalid command.
deepspeed train_dalle.py \
  --wds=jpg,txt \
  --reversible \
  --lr_decay \
  --taming \
  --shift_tokens \
  --rotary_emb \
  --truncate_captions \
  --flops_profiler \
  --distributed_backend="deepspeed" | tee "$LOGFILE"
|
|
|
|
|
|
```
|
|
|
|
|
|
## Monitoring a Job
|
|
|
|
... | ... | |