#!/usr/bin/env bash
#SBATCH --account=opengptx-elm
#SBATCH --partition develbooster
#SBATCH --nodes 1
#SBATCH --cpus-per-task 9
#SBATCH --gres gpu:1
#SBATCH --time 01:00:00

# Slurm batch job that tokenizes a JSONL corpus with Megatron-DeepSpeed's
# `tools/preprocess_data.py`, using the GPT-2 BPE vocabulary and merge files
# configured below. The virtual environment in VENV_DIR must already exist
# (the script refers the user to `set_up.sbatch` otherwise).
#
# NOTE: `set -euo pipefail` is intentionally not enabled, because `module`
# and the venv `activate` script are sourced into this shell and may rely on
# unset variables; critical steps are checked explicitly instead.

module purge
# (Most) modules in requirements.txt don't have specific versions, so
# use our modules where possible.
module load Stages/2020 GCC Python libaio
# Since this is the only optionally used module, we could nuke it in
# favor of compatibility during setup.
module load PyTorch

export CUDA_VISIBLE_DEVICES=0

VENV_DIR=/p/project/opengptx/ebert1/opengpt/bigscience/env
CODE_DIR=/p/project/opengptx/ebert1/opengpt/bigscience/Megatron-DeepSpeed

INPUT_PATH=/p/project/opengptx/ebert1/opengpt/bigscience/oscar-1GB.jsonl
OUTPUT_PREFIX=/p/project/opengptx/ebert1/opengpt/bigscience/oscar
VOCAB_FILE=/p/project/opengptx/ebert1/opengpt/gpt2-vocab.json
MERGE_FILE=/p/project/opengptx/ebert1/opengpt/gpt2-merges.txt

if ! [ -d "$VENV_DIR" ]; then
  echo 'please execute `set_up.sbatch` before continuing' >&2
  exit 1
fi

# Leave any virtual environment inherited from the submitting shell.
# `deactivate` is a shell function, so `command -v` prints only its name;
# testing that name with `[ -x ... ]` would look for an executable file in
# the current directory and never match. Check for existence instead.
if command -v deactivate >/dev/null 2>&1; then
  deactivate
fi
source "$VENV_DIR/bin/activate"

# Prepend the venv's site-packages directory (or directories, if the glob
# matches several Python versions) to PYTHONPATH, joined with ':'.
site_packages=
for candidate in "$VENV_DIR"/lib/python*/site-packages; do
  # An unmatched glob stays literal; skip anything that is not a directory.
  [ -d "$candidate" ] || continue
  resolved=$(realpath "$candidate") || exit 1
  site_packages+="${site_packages:+:}${resolved}"
done
if [ -z "$site_packages" ]; then
  printf 'no site-packages directory found under %s/lib\n' "$VENV_DIR" >&2
  exit 1
fi
export PYTHONPATH="${site_packages}:${PYTHONPATH:-}"

# The preprocessing script is invoked via a path relative to CODE_DIR, so a
# failed `cd` must abort the job.
cd "$CODE_DIR" || {
  printf 'cannot change directory to %s\n' "$CODE_DIR" >&2
  exit 1
}

srun python ./tools/preprocess_data.py \
  --input "$INPUT_PATH" \
  --output-prefix "$OUTPUT_PREFIX" \
  --vocab "$VOCAB_FILE" \
  --dataset-impl mmap \
  --tokenizer-type GPT2BPETokenizer \
  --merge-file "$MERGE_FILE" \
  --append-eod \
  --workers 8
status=$?
# Only report success when preprocessing actually succeeded; otherwise
# propagate the failure so Slurm marks the job as failed.
if [ "$status" -ne 0 ]; then
  printf 'preprocessing failed with exit status %s\n' "$status" >&2
  exit "$status"
fi

echo 'Done!'