diff --git a/jobscript.sh b/jobscript.sh index 57c5a39a70ed84af3eb9768b26e41fe847032757..1f18b94744493fa70c5f0bda159552971ccf8d65 100644 --- a/jobscript.sh +++ b/jobscript.sh @@ -1,13 +1,13 @@ #!/bin/bash #SBATCH --account=opengptx-elm #SBATCH --partition=booster -#SBATCH --job-name=opt175b -#SBATCH --nodes=48 +#SBATCH --job-name=opt125m_test +#SBATCH --nodes=4 #SBATCH --hint=nomultithread #SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=12 # number of cores per tasks #SBATCH --gres=gpu:4 # number of gpus -#SBATCH --time=00:15:00 # maximum execution time (HH:MM:SS) +#SBATCH --time=00:10:00 # maximum execution time (HH:MM:SS) #SBATCH --output=slurmLog/%x-%j.out # output file name #SBATCH --error=slurmLog/%x-%j.err # error file name @@ -25,7 +25,7 @@ opt-baselines -n "$SLURM_NNODES" -g 4 \ --account opengptx-elm \ --partition booster \ --prefix "$SLURM_JOB_NAME" \ - --model-size 175b \ + --model-size 125m \ --juwelsbooster \ --data "$DATA_PATH" \ --ntasks-per-node 4 \ @@ -34,7 +34,7 @@ opt-baselines -n "$SLURM_NNODES" -g 4 \ --tensorboard-logdir "$TENSORBOARD_PATH" \ --no-save-dir \ --snapshot-root "$ROOT_OUTPUT_DIR" \ - --time 15 \ + --time 10 \ --no-wandb \ --cpu-bind socket \ --salloc diff --git a/setup.bash b/setup.bash index 534085235b15b6996d43351f60e692aa0c10c533..29ec0b19322083aaf4874af37ff05b241006e91e 100644 --- a/setup.bash +++ b/setup.bash @@ -32,6 +32,9 @@ source activate.bash python -m pip install --upgrade pip +# Requires numpy 1.22.0 +python -m pip install numpy==1.22.0 + #Installing PyTorch 1.10.1 version with cuda 11.3 used by metaseq # python -m pip install torch==1.10.1+cu113 torchvision==0.11.2+cu113 torchaudio==0.10.1+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html