tr1-13B-round1_juwels_pipe.sbatch

    #!/bin/bash
    #SBATCH --job-name=tr1-13B-pipe
    #SBATCH --nodes=4
    #SBATCH --ntasks-per-node=1          # crucial - only 1 task per node for the distributed launcher!
    #SBATCH --cpus-per-task=48           # number of cores per task
    #SBATCH --hint=nomultithread         # use physical cores, not logical ones
    #SBATCH --gres=gpu:4                 # number of gpus
    #SBATCH --time=00:10:00              # maximum execution time (HH:MM:SS)
    #SBATCH --output=%x-%j.out           # output file name
    #SBATCH --account=opengptx-elm
    # Use `develbooster` for debugging, `booster` for "normal" jobs, and
    # `largebooster` for jobs on more than 256 nodes.
    #SBATCH --partition=develbooster
    
    set -x -e
    
    echo "START TIME: $(date)"
    
    CLEAN_PREV_JIT_BUILD=0   # set to 1 to wipe the previous fused-kernel JIT build artifacts below
    
    if ! [ -e activate.bash ]; then
        echo 'Please execute the sbatch script from the `run_scripts` directory.'
        exit 1
    fi
    source activate.bash
    
    #ROUND=3
    
    # The following paths might already be set by a long-running session started via StartLongRun.sh.
    [ -z "$DATA_OUTPUT_PATH" ] && DATA_OUTPUT_PATH="$ROOT_OUTPUT_DIR"/output_dir/tr1-13B
    [ -z "$CHECKPOINT_PATH" ] && CHECKPOINT_PATH=$DATA_OUTPUT_PATH/checkpoints
    [ -z "$TENSORBOARD_PATH" ] && TENSORBOARD_PATH=$DATA_OUTPUT_PATH/tensorboard
    [ -z "$CODECARBON_PATH" ] && CODECARBON_PATH=$DATA_OUTPUT_PATH/codecarbon
    [ -z "$LOGS_PATH" ] && LOGS_PATH=$DATA_OUTPUT_PATH/logs
    mkdir -p "$LOGS_PATH"
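    # Note: each guarded assignment above is equivalent to bash's default-value
    # expansion, which assigns only when the variable is unset or empty; a more
    # compact, behavior-identical form would be, e.g.:
    #   : "${LOGS_PATH:=$DATA_OUTPUT_PATH/logs}"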
    
    cd "$MEGATRON_DEEPSPEED_REPO"
    rm -f megatron/fused_kernels/build/lock   # clear a stale JIT-build lock left behind by a crashed job
    ((CLEAN_PREV_JIT_BUILD)) && rm -rf megatron/fused_kernels/{build,__pycache__}
    
    MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
    # Allow communication over InfiniBand cells.
    MASTER_ADDR="${MASTER_ADDR}i"
    MASTER_PORT=6000
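    # Optional sanity check (assumes the "<hostname>i" InfiniBand aliases are
    # resolvable on the compute nodes); uncomment to fail fast on a bad address:
    #   getent hosts "$MASTER_ADDR" > /dev/null || { echo "cannot resolve $MASTER_ADDR"; exit 1; }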
    
    GPUS_PER_NODE=4
    NNODES=$SLURM_JOB_NUM_NODES   # 4 in this debug setup; switch to 128 for the full run
    TP_SIZE=2    # tensor parallelism; must fit within a single node (GPUS_PER_NODE=4)
    PP_SIZE=4    # NLAYERS must be a multiple of PP_SIZE here
    #DP_SIZE=$NNODES*$GPUS_PER_NODE/($PP_SIZE*$TP_SIZE) # will get derived automatically by trainer
    
    export CUDA_VISIBLE_DEVICES="$(comma_range "$GPUS_PER_NODE")"
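    # comma_range is assumed to be provided by activate.bash; for GPUS_PER_NODE=4
    # it must expand to "0,1,2,3". A minimal sketch of such a helper:
    #   comma_range () { seq -s, 0 "$(($1 - 1))"; }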
    
    # GLOBAL_BATCH_SIZE has to be divisible by MICRO_BATCH_SIZE*DP_SIZE.
    # GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*GAS*DP_SIZE)); GAS is auto-derived by DeepSpeed.
    MICRO_BATCH_SIZE=1
    GLOBAL_BATCH_SIZE=$(((NNODES * GPUS_PER_NODE / (PP_SIZE * TP_SIZE)) * MICRO_BATCH_SIZE))
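    # Worked example for the settings above: 4 nodes * 4 GPUs = 16 GPUs, so
    # DP_SIZE = 16/(PP_SIZE*TP_SIZE) = 16/8 = 2 and GLOBAL_BATCH_SIZE = 2*1 = 2;
    # GAS = GBS/(MICRO_BATCH_SIZE*DP_SIZE) = 1, i.e. no gradient accumulation here.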
    
    NLAYERS=40
    NHIDDEN=5120
    NHEADS=32
    FFN_HIDDEN_SIZE=20480
    SEQ_LEN=2048
    VOCAB_SIZE=50257
    
    SAVE_INTERVAL=1500
    
    OPTIMIZER_ARGS=" \
        --optimizer adam \
        --adam-beta1 0.9 \
        --adam-beta2 0.999 \
        --adam-eps 1e-8 \
        --lr 1e-4 \
        --min-lr 1e-5 \
        --lr-decay-style cosine \
        --lr-decay-samples 126_953_125 \
        --lr-warmup-samples 216_320 \
        --clip-grad 1.0 \
        --weight-decay 1e-1 \
        "
    
    EXIT_OPTS=" \
        --exit-duration-in-mins 1190 \
        "
    
    # --rampup-batch-size 16 16 5_000_000 \
    GPT_ARGS=" \
        --num-layers $NLAYERS \
        --hidden-size $NHIDDEN \
        --ffn-hidden-size $FFN_HIDDEN_SIZE \
        --num-attention-heads $NHEADS \
        --seq-length $SEQ_LEN \
        --max-position-embeddings $SEQ_LEN \
        --micro-batch-size $MICRO_BATCH_SIZE \
        --global-batch-size $GLOBAL_BATCH_SIZE \
        --train-samples 300_000_000 \
        --vocab-file $VOCAB_FILE \
        --merge-file $MERGE_FILE \
        --loss-scale 12 \
        --clip-grad 1.0 \
        --fp16 \
        --checkpoint-activations \
        --seed 42 \
        $OPTIMIZER_ARGS \
        $EXIT_OPTS \
        "
    
    OUTPUT_ARGS=" \
        --log-interval 10 \
        --save-interval $SAVE_INTERVAL \
        --eval-interval 1000 \
        --eval-iters 5 \
        --codecarbon-dir $CODECARBON_PATH \
        --tensorboard-dir $TENSORBOARD_PATH \
        --tensorboard-queue-size 5 \
        --log-timers-to-tensorboard \
        --log-batch-size-to-tensorboard \
        --log-validation-ppl-to-tensorboard \
        "
    
    ZERO_STAGE=1
    
    config_json="./ds_config.$SLURM_JOBID.json"
    
    # Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size()
    cat <<EOT > "$config_json"
    {
      "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
      "train_batch_size": $GLOBAL_BATCH_SIZE,
      "gradient_clipping": 1.0,
      "zero_optimization": {
        "stage": $ZERO_STAGE
      },
      "fp16": {
        "enabled": true,
        "loss_scale": 0,
        "loss_scale_window": 500,
        "hysteresis": 2,
        "min_loss_scale": 1,
        "initial_scale_power": 12
      },
      "steps_per_print": 2000,
      "wall_clock_breakdown": false
    }
    EOT
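    # In this config, "loss_scale": 0 selects dynamic loss scaling, starting from
    # 2^initial_scale_power = 2^12 = 4096. A quick validity check of the generated
    # file (an optional sketch, not part of the original script):
    #   python -c "import json; json.load(open('$config_json'))" && echo "ds_config OK"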
    
    
    DEEPSPEED_ARGS=" \
        --deepspeed \
        --deepspeed_config ${config_json} \
        --zero-stage ${ZERO_STAGE} \
        --deepspeed-activation-checkpointing \
        "
    
    export LAUNCHER="python -u -m torch.distributed.launch \
        --nproc_per_node $GPUS_PER_NODE \
        --nnodes $NNODES \
        --master_addr $MASTER_ADDR \
        --master_port $MASTER_PORT \
        "
    
    export CMD=" \
        $(pwd)/pretrain_gpt.py \
        --tensor-model-parallel-size $TP_SIZE \
        --pipeline-model-parallel-size $PP_SIZE \
        $GPT_ARGS \
        $OUTPUT_ARGS \
        --save $CHECKPOINT_PATH \
        --data-path $DATA_PATH \
        --data-impl mmap \
        --split 949,50,1 \
        --distributed-backend nccl \
         $DEEPSPEED_ARGS \
        "
    
    if [ "$LOAD_CHECKPOINTS" = true ] ; then
        export CMD="$CMD\
            --load $CHECKPOINT_PATH \
            "
    fi
    
    echo $CMD
    
    # To debug, prepend `echo` inside the bash -c string below; the job then just
    # prints what it would have launched and exits.
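    # For example (dry run, hypothetical):
    #   srun --jobid $SLURM_JOBID bash -c 'echo $LAUNCHER --node_rank $SLURM_PROCID $CMD'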
    
    clear; srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a "$LOGS_PATH"/main_log.txt
    
    echo "END TIME: $(date)"
    