tr1-13B-round1-small_juwels.sbatch
    #!/bin/bash
    #SBATCH --job-name=tr1-13B-small
    #SBATCH --nodes=512
    #SBATCH --ntasks-per-node=1          # crucial - only 1 task per node; torch.distributed.launch spawns the per-GPU processes
    #SBATCH --cpus-per-task=96           # number of cores per task
    # SBATCH --hint=nomultithread         # we get physical cores not logical
    #SBATCH --gres=gpu:4                 # number of gpus
    #SBATCH --time 00:10:00              # maximum execution time (HH:MM:SS)
    #SBATCH --output=%x-%j.out           # output file name
    #SBATCH --account=opengptx-elm
    #SBATCH --partition=largebooster
    
    set -x -e
    
    echo "START TIME: $(date)"
    
    [ -x "$(command -v deactivate)" ] && deactivate
    
    module purge
    # (Most) modules in requirements.txt don't have specific versions, so
    # use our modules where possible.
    module load GCC CMake Python libaio
    module load cuDNN NCCL
    # Since this is the only optionally used module, we could nuke it in
    # favor of compatibility during setup.
    module load PyTorch
    
    VENV_DIR=/p/project/opengptx/ebert1/opengpt/bigscience/env
    CLEAN_PREV_JIT_BUILD=1
    
    if ! [ -d "$VENV_DIR" ]; then
         echo 'please execute `set_up.sbatch` before continuing'
         exit 1
    fi
    
    source "$VENV_DIR/bin/activate"
    export PYTHONPATH="$(realpath "$VENV_DIR"/lib/python*/site-packages):$PYTHONPATH"
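    # Prepending the venv's site-packages presumably makes its packages win over
    # anything the loaded modules (e.g. the system PyTorch) put on PYTHONPATH.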
    
    #ROUND=3
    
    DATA_OUTPUT_PATH=/p/scratch/opengptx/ebert1/opengpt/bigscience/output_dir
    CHECKPOINT_PATH=$DATA_OUTPUT_PATH/checkpoints
    TENSORBOARD_PATH=$DATA_OUTPUT_PATH/tensorboard
    CODECARBON_PATH=$DATA_OUTPUT_PATH/codecarbon
    LOGS_PATH=$DATA_OUTPUT_PATH/logs
    
    MEGATRON_DEEPSPEED_REPO=/p/project/opengptx/ebert1/opengpt/bigscience/Megatron-DeepSpeed
    
    VOCAB_FILE=/p/project/opengptx/ebert1/opengpt/gpt2-vocab.json
    MERGE_FILE=/p/project/opengptx/ebert1/opengpt/gpt2-merges.txt
    DATA_PATH=/p/scratch/opengptx/ebert1/opengpt/bigscience/oscar/oscar_text_document
    
    cd $MEGATRON_DEEPSPEED_REPO
    ((CLEAN_PREV_JIT_BUILD)) && rm -rf megatron/fused_kernels/{build,__pycache__}
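    # ((CLEAN_PREV_JIT_BUILD)) is bash arithmetic evaluation: any non-zero value is
    # true, so setting CLEAN_PREV_JIT_BUILD=0 above would keep previously
    # JIT-compiled fused kernels instead of rebuilding them.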
    
    MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
    # Allow communication over InfiniBand cells.
    MASTER_ADDR="${MASTER_ADDR}i"
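    # e.g. a (hypothetical) head node jwb0001 becomes jwb0001i, its
    # InfiniBand-facing hostname on JUWELS Booster.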
    MASTER_PORT=6000
    
    GPUS_PER_NODE=4
    NNODES=$SLURM_JOB_NUM_NODES   # taken from the SBATCH --nodes setting above
    TP_SIZE=1    # always fixed to the size of a single node
    PP_SIZE=1    # NLAYERS must be a multiple of PP_SIZE here
    # TP_SIZE=2    # always fixed to the size of a single node
    # PP_SIZE=4    # NLAYERS must be a multiple of PP_SIZE here
    #DP_SIZE=$NNODES*$GPUS_PER_NODE/($PP_SIZE*$TP_SIZE) # will get derived automatically by trainer
    
    export CUDA_VISIBLE_DEVICES=$(seq 0 $(($GPUS_PER_NODE - 1)) | awk '{a=a "," $1} END {print substr(a, 2)}')
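    # For GPUS_PER_NODE=4 the awk pipeline above produces "0,1,2,3";
    # `seq -s, 0 $(($GPUS_PER_NODE - 1))` would be an equivalent shorthand.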
    # Set `TORCH_CUDA_ARCH_LIST` according to A100 compute capability.
    export TORCH_CUDA_ARCH_LIST='8.0'
    export CXX=g++
    
    # GLOBAL_BATCH_SIZE has to be divisible by MICRO_BATCH_SIZE*DP_SIZE
    # GLOBAL_BATCH_SIZE=$(($MICRO_BATCH_SIZE*$GAS*$DP_SIZE)) - GAS is auto-derived by deepspeed
    MICRO_BATCH_SIZE=1
    GLOBAL_BATCH_SIZE=$((($NNODES * $GPUS_PER_NODE / ($PP_SIZE * $TP_SIZE)) * $MICRO_BATCH_SIZE))
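    # Worked example with the settings above: DP_SIZE = 512 nodes * 4 GPUs / (1 * 1)
    # = 2048, so GLOBAL_BATCH_SIZE = 2048 * 1 = 2048 samples per step.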
    
    # From Eleuther "small.yml"
    NLAYERS=12
    NHIDDEN=768
    NHEADS=12
    FFN_HIDDEN_SIZE=768
    SEQ_LEN=2048
    VOCAB_SIZE=50257
    
    # NLAYERS=40
    # NHIDDEN=5120
    # NHEADS=32
    # FFN_HIDDEN_SIZE=20480
    # SEQ_LEN=2048
    # VOCAB_SIZE=50257
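    # (The commented-out block above appears to be the full tr1-13B configuration
    # that this small run is scaled down from.)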
    
    SAVE_INTERVAL=1500
    
    OPTIMIZER_ARGS=" \
        --optimizer adam \
        --adam-beta1 0.9 \
        --adam-beta2 0.999 \
        --adam-eps 1e-8 \
        --lr 1e-4 \
        --min-lr 1e-5 \
        --lr-decay-style cosine \
        --lr-decay-samples 126_953_125 \
        --lr-warmup-samples 216_320 \
        --clip-grad 1.0 \
        --weight-decay 1e-1 \
        "
    
    EXIT_OPTS=" \
        --exit-duration-in-mins 1190 \
        "
    
        # --rampup-batch-size 16 16 5_000_000 \
    GPT_ARGS=" \
        --num-layers $NLAYERS \
        --hidden-size $NHIDDEN \
        --ffn-hidden-size $FFN_HIDDEN_SIZE \
        --num-attention-heads $NHEADS \
        --seq-length $SEQ_LEN \
        --max-position-embeddings $SEQ_LEN \
        --micro-batch-size $MICRO_BATCH_SIZE \
        --global-batch-size $GLOBAL_BATCH_SIZE \
        --train-samples 300_000_000 \
        --vocab-file $VOCAB_FILE \
        --merge-file $MERGE_FILE \
        --loss-scale 12 \
        --clip-grad 1.0 \
        --fp16 \
        --checkpoint-activations \
        --seed 42 \
        $OPTIMIZER_ARGS \
        $EXIT_OPTS \
        "
    
    OUTPUT_ARGS=" \
        --log-interval 10 \
        --save-interval $SAVE_INTERVAL \
        --eval-interval 1000 \
        --eval-iters 5 \
        --codecarbon-dir $CODECARBON_PATH \
        --tensorboard-dir $TENSORBOARD_PATH \
        --tensorboard-queue-size 5 \
        --log-timers-to-tensorboard \
        --log-batch-size-to-tensorboard \
        --log-validation-ppl-to-tensorboard \
        "
    
    ZERO_STAGE=1
    
    config_json="./ds_config.$SLURM_JOBID.json"
    
    # Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size()
    cat <<EOT > $config_json
    {
      "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
      "train_batch_size": $GLOBAL_BATCH_SIZE,
      "gradient_clipping": 1.0,
      "zero_optimization": {
        "stage": $ZERO_STAGE
      },
      "fp16": {
        "enabled": true,
        "loss_scale": 0,
        "loss_scale_window": 500,
        "hysteresis": 2,
        "min_loss_scale": 1,
        "initial_scale_power": 12
      },
      "steps_per_print": 2000,
      "wall_clock_breakdown": false
    }
    EOT
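    # With the values above DeepSpeed derives
    #   GAS = train_batch_size / (train_micro_batch_size_per_gpu * data-parallel world size)
    #       = 2048 / (1 * 2048) = 1, i.e. no gradient accumulation.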
    
    
    DEEPSPEED_ARGS=" \
        --deepspeed \
        --deepspeed_config ${config_json} \
        --zero-stage ${ZERO_STAGE} \
        --deepspeed-activation-checkpointing \
        "
    
    export LAUNCHER="python -u -m torch.distributed.launch \
        --nproc_per_node $GPUS_PER_NODE \
        --nnodes $NNODES \
        --master_addr $MASTER_ADDR \
        --master_port $MASTER_PORT \
        "
    
        # --load $CHECKPOINT_PATH \
    export CMD=" \
        $(pwd)/pretrain_gpt.py \
        --tensor-model-parallel-size $TP_SIZE \
        --pipeline-model-parallel-size $PP_SIZE \
        $GPT_ARGS \
        $OUTPUT_ARGS \
        --save $CHECKPOINT_PATH \
        --data-path $DATA_PATH \
        --data-impl mmap \
        --split 949,50,1 \
        --distributed-backend nccl \
         $DEEPSPEED_ARGS \
        "
    
    echo $CMD
    
    # to debug - add echo (it exits and prints what it would have launched)
    mkdir -p "$LOGS_PATH"  # make sure the log directory exists before tee appends to it
    clear; srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $LOGS_PATH/main_log.txt
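    # Note: the single quotes around the srun command string defer expansion of
    # $LAUNCHER, $SLURM_PROCID and $CMD to each srun task, so every node plugs in
    # its own SLURM_PROCID as --node_rank.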
    
    echo "END TIME: $(date)"
    