... | ... | @@ -58,10 +58,104 @@ Once the partitions are configured correctly, you should be able to start a DALL |
|
|
If everything runs fine, adjust the paths in the `sbatch` scripts to match your own directory locations.
|
|
|
|
|
|
## (Data Parallel) Training with Horovod
|
|
|
*TODO*
|
|
|
Example of a job:
|
|
|
|
|
|
```bash
|
|
|
#!/usr/bin/env bash
|
|
|
|
|
|
#SBATCH --nodes 1
|
|
|
#SBATCH --tasks-per-node 4
|
|
|
#SBATCH --gres gpu
|
|
|
#SBATCH -A cstdl
|
|
|
#SBATCH --partition develgpus
|
|
|
|
|
|
HOME_PATH=
|
|
|
CHECKPOINT_NAME=
|
|
|
LOGS_PATH=
|
|
|
|
|
|
module purge
|
|
|
module load Stages/2020 GCC OpenMPI PyTorch torchvision Horovod
|
|
|
|
|
|
DATASET_PATH=/p/scratch/ccstdl/${HOME_PATH}/LAION_SAMPLE/
|
|
|
VQGAN_MODEL_PATH=/p/scratch/ccstdl/${HOME_PATH}/vqgan_models/imagenet_16384_slim.ckpt
|
|
|
VQGAN_CONFIG_PATH=/p/scratch/ccstdl/${HOME_PATH}/vqgan_models/imagenet_16384.yaml
|
|
|
CKPT_SAVE_PATH=/p/scratch/ccstdl/${HOME_PATH}/checkpoints/${CHECKPOINT_NAME}
|
|
|
RESIZE_RATIO=1.0
|
|
|
BATCH_SIZE=32
|
|
|
DEPTH=16
|
|
|
HEADS=16
|
|
|
DIM_HEAD=64
|
|
|
FF_DROPOUT=0.1
|
|
|
ATTN_DROPOUT=0.1
|
|
|
CLIP_GRAD_NORM=0.5
|
|
|
WANDB_NAME=dalle_test_laion_develgpus
|
|
|
LOGFILE=${LOGS_PATH}/dalle_test_laion_develgpus.txt
|
|
|
DIM=512
|
|
|
TEXT_SEQ_LEN=128
|
|
|
EPOCHS=1
|
|
|
|
|
|
export CUDA_VISIBLE_DEVICES=0,1,2,3
|
|
|
srun -A cstdl --cpu-bind=none \
|
|
|
python -u train_dalle.py \
|
|
|
--epochs="$EPOCHS" \
|
|
|
--clip_grad_norm="$CLIP_GRAD_NORM" \
|
|
|
--dim="$DIM" \
|
|
|
--text_seq_len="$TEXT_SEQ_LEN" \
|
|
|
--depth="$DEPTH" \
|
|
|
--heads="$HEADS" \
|
|
|
--dim_head="$DIM_HEAD" \
|
|
|
--ff_dropout="$FF_DROPOUT" \
|
|
|
--attn_dropout="$ATTN_DROPOUT" \
|
|
|
--batch_size="$BATCH_SIZE" \
|
|
|
--random_resize_crop_lower_ratio="$RESIZE_RATIO" \
|
|
|
--vqgan_config_path="$VQGAN_CONFIG_PATH" \
|
|
|
--vqgan_model_path="$VQGAN_MODEL_PATH" \
|
|
|
--dalle_output_file_name="$CKPT_SAVE_PATH" \
|
|
|
--image_text_folder="$DATASET_PATH" \
|
|
|
--wandb_name="$WANDB_NAME" \
|
|
|
--wds=jpg,txt \
|
|
|
--reversible \
|
|
|
--lr_decay \
|
|
|
--taming \
|
|
|
--shift_tokens \
|
|
|
--rotary_emb \
|
|
|
--truncate_captions \
|
|
|
--flops_profiler \
|
|
|
--distributed_backend="horovod" | tee "$LOGFILE"
|
|
|
|
|
|
```
|
|
|
|
|
|
## Training with DeepSpeed
|
|
|
*TODO*
|
|
|
Example of a job:
|
|
|
|
|
|
```bash
|
|
|
|
|
|
#!/usr/bin/env bash

#SBATCH --nodes 1
#SBATCH --tasks-per-node 4
#SBATCH --gres gpu
#SBATCH -A cstdl
#SBATCH --partition develgpus

# Fail fast on errors, unset variables, and mid-pipeline failures.
set -euo pipefail

module purge
module load Stages/2020 GCC OpenMPI PyTorch torchvision DeepSpeed

# ...define vars (LOGFILE and the model/data options — see the Horovod
# example above for the full list).

# NOTE: extra options must be added here, ABOVE the command, not as comment
# lines between the backslash-continued arguments below. A '#' line inside
# a continuation terminates the command early and leaves the remaining
# options as a separate, invalid command.
deepspeed train_dalle.py \
  --wds=jpg,txt \
  --reversible \
  --lr_decay \
  --taming \
  --shift_tokens \
  --rotary_emb \
  --truncate_captions \
  --flops_profiler \
  --distributed_backend="deepspeed" | tee "$LOGFILE"
|
|
|
|
|
|
```
|
|
|
|
|
|
## Monitoring a Job
|
|
|
|
... | ... | |