Commit 497618b7 authored by Carolin Penke

bugfixes

parent c2ea7f7c
# This script starts a number of slurm jobs in order to bypass the maximum allowed runtime on
# JUWELS Booster, using checkpoints. NUM_JOBS jobs are launched; each job runs for
# RUNTIME_PER_JOB and depends on the previous one having terminated. By default, a
# DATA_OUTPUT_PATH with a timestamp is generated, and checkpoints, logs, etc. are saved in
# subdirectories. When DATA_OUTPUT_PATH is set explicitly instead, a training run can be
# resumed. Furthermore, a tensorboard instance is started and the suggested port forwarding is
# printed for easier usability.
# The tr1-13B-round1_juwels_pipe.sbatch script was changed such that the path variables (for
# checkpointing etc.) are only set when they are not already set, and --load CHECKPOINT_PATH
# is added to the launcher when the flag LOAD_CHECKPOINTS=true is set.
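# A minimal sketch of how the sbatch script can honor LOAD_CHECKPOINTS (the variable name
# EXTRA_ARGS is illustrative; the actual launcher line in the sbatch script may differ):
#   EXTRA_ARGS=""
#   [ "x$LOAD_CHECKPOINTS" = xtrue ] && EXTRA_ARGS="--load $CHECKPOINT_PATH"
# and $EXTRA_ARGS is then appended to the launcher command.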
# Change the following as you like
NUM_JOBS=3
RUNTIME_PER_JOB="24:00:00"
# Make sure ROOT_OUTPUT_DIR was set in variables.bash
[ "x$ROOT_OUTPUT_DIR" = x ] && source variables.bash
# Set checkpoint directory
TIMESTAMP=`date +%Y-%m-%d_%H-%M-%S`
export DATA_OUTPUT_PATH=$ROOT_OUTPUT_DIR/$TIMESTAMP/output_dir/tr1-13B
# Or comment the two lines above, uncomment one of the lines below, and change it to the
# checkpoint directory to pick up where you left off.
# export DATA_OUTPUT_PATH=/p/project/opengptx-elm/$USER/opengpt/bigscience/2022-05-30_16-20-22/output_dir/tr1-13B
# export DATA_OUTPUT_PATH=/p/scratch/opengptx-elm/$USER/opengpt/bigscience/2022-05-30_16-20-22/output_dir/tr1-13B
# Set paths
export CHECKPOINT_PATH=$DATA_OUTPUT_PATH/checkpoints
export TENSORBOARD_PATH=$DATA_OUTPUT_PATH/tensorboard
export CODECARBON_PATH=$DATA_OUTPUT_PATH/codecarbon
export LOGS_PATH=$DATA_OUTPUT_PATH/logs
mkdir -p $LOGS_PATH
echo "DATA_OUTPUT_PATH: $DATA_OUTPUT_PATH"
echo "CHECKPOINT_PATH: $CHECKPOINT_PATH"
echo "TENSORBOARD_PATH: $TENSORBOARD_PATH"
echo "CODECARBON_PATH: $CODECARBON_PATH"
echo "LOGS_PATH: $LOGS_PATH"
# Tell the sbatch script to pass the --load option to the launcher
export LOAD_CHECKPOINTS=true
# Start NUM_JOBS jobs; each only starts when the previous one has terminated
DEPENDENT=1
re='^[0-9]+$'
if [[ $NUM_JOBS =~ $re ]] ; then
    for ((i=1; i <= $NUM_JOBS; i++))
    do
        if [ "x$RUNTIME_PER_JOB" != x ]
        then
            JOBID=$(sbatch --parsable --dependency=afterany:$DEPENDENT --time $RUNTIME_PER_JOB tr1-13B-round1_juwels_pipe.sbatch)
        else
            JOBID=$(sbatch --parsable --dependency=afterany:$DEPENDENT tr1-13B-round1_juwels_pipe.sbatch)
        fi
        echo "Submitted batch job $JOBID, dependent on termination of $DEPENDENT"
        DEPENDENT=$JOBID
    done
else
    echo "NUM_JOBS not a number"
fi
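# The chaining above relies on two sbatch features:
#   --parsable                 print only the job id, so it can be captured in JOBID
#   --dependency=afterany:ID   start only after job ID has terminated, regardless of
#                              its exit status
# Seeding DEPENDENT=1 points the first submission at a job id that terminated long ago,
# so the first job starts immediately. A minimal standalone sketch of the idiom
# (job.sbatch is a hypothetical script):
#   first=$(sbatch --parsable job.sbatch)
#   second=$(sbatch --parsable --dependency=afterany:$first job.sbatch)
#   squeue -u $USER   # the second job waits in state PD with reason (Dependency)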
echo "Starting tensorboard.."
# Grab free port
PORT=`comm -23 <(seq 49152 65535 | sort) <(ss -Htan | awk '{print $4}' | cut -d':' -f2 | sort -u) | shuf | head -n 1`
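# How the one-liner works:
#   seq 49152 65535    - all candidate ports in the dynamic/ephemeral range
#   ss -Htan           - all TCP sockets, numeric, no header; awk/cut extract the
#                        local port numbers that are currently in use
#   comm -23 A B       - lines only in the first (sorted) input, i.e. free ports
#   shuf | head -n 1   - pick one of the free ports at random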
ml GCC TensorFlow
tensorboard --port $PORT --logdir $TENSORBOARD_PATH 2>/dev/null &
pid_tb=$!
echo "Forward the port used by tensorboard, by using the following command on your local machine:"
echo -e "\033[1mssh -L $PORT:localhost:$PORT $USER@juwels-booster.fz-juelich.de -N\033[m"
echo -e "Then open dashboard at \033[4mhttp://localhost:$PORT/\033[m in your local browser."
echo -e "\033[2mtensorboard is now running in the background, use \"pkill $pid_tb\" on $(hostname) to end it.\033[m"
#!/bin/bash
#SBATCH --job-name=tr1-13B-pipe
# SBATCH --dependency=after:5011858
#SBATCH --nodes=4
#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
#SBATCH --cpus-per-task=48 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
...
source activate.bash
# The following paths might already be set in a long-running session by StartLongRun.bash
[ "x$DATA_OUTPUT_PATH" = x ] && DATA_OUTPUT_PATH="$ROOT_OUTPUT_DIR"/output_dir/tr1-13B
[ "x$CHECKPOINT_PATH" = x ] && CHECKPOINT_PATH=$DATA_OUTPUT_PATH/checkpoints
[ "x$TENSORBOARD_PATH" = x ] && TENSORBOARD_PATH=$DATA_OUTPUT_PATH/tensorboard
[ "x$CODECARBON_PATH" = x ] && CODECARBON_PATH=$DATA_OUTPUT_PATH/codecarbon
[ "x$LOGS_PATH" = x ] && LOGS_PATH=$DATA_OUTPUT_PATH/logs
mkdir -p $LOGS_PATH
cd "$MEGATRON_DEEPSPEED_REPO"
rm -f megatron/fused_kernels/build/lock
...