#!/usr/bin/env bash

#SBATCH --account=opengptx-elm
#SBATCH --partition develbooster
#SBATCH --nodes 1
#SBATCH --cpus-per-task 9
#SBATCH --gres gpu:1
#SBATCH --time 01:00:00

# Unload every currently loaded module so the job starts from a clean
# module environment.
module purge
# (Most) packages in requirements.txt are not pinned to specific
# versions, so use our modules where possible.
module load Stages/2020 GCC Python libaio
# PyTorch is the only optionally used module here; it could be dropped
# in favor of compatibility during setup.
module load PyTorch

# Make only GPU 0 visible to CUDA programs started from this script.
export CUDA_VISIBLE_DEVICES=0

# All paths used by this job live below one project directory; build
# them from shared prefixes instead of repeating the absolute path.
project_root=/p/project/opengptx/ebert1/opengpt
bigscience_root="$project_root/bigscience"

# Python virtual environment and the code checkout to run from.
VENV_DIR="$bigscience_root/env"
CODE_DIR="$bigscience_root/Megatron-DeepSpeed"

# Input dataset and the prefix under which the outputs are written.
INPUT_PATH="$bigscience_root/oscar-1GB.jsonl"
OUTPUT_PREFIX="$bigscience_root/oscar"

# GPT-2 tokenizer files.
VOCAB_FILE="$project_root/gpt2-vocab.json"
MERGE_FILE="$project_root/gpt2-merges.txt"

# Refuse to run without the virtual environment directory; the message
# points at the script that is expected to create it.
if ! [ -d "$VENV_DIR" ]; then
    echo 'please execute `set_up.sbatch` before continuing' >&2
    exit 1
fi

# Leave any virtual environment that is already active. A venv's
# `activate` script defines `deactivate` as a shell function, so check
# for a function: testing `-x` on the output of `command -v` would look
# for an executable file named `deactivate` in the current directory
# and practically never match.
if declare -F deactivate > /dev/null; then
    deactivate
fi

# Without a working environment the Python invocation later in the job
# would run against the wrong packages, so stop right away.
if ! source "$VENV_DIR/bin/activate"; then
    echo "could not activate virtual environment in $VENV_DIR" >&2
    exit 1
fi

# Prepend the environment's site-packages directory to PYTHONPATH
# (presumably so its packages take precedence over those provided by
# the loaded modules -- confirm).
export PYTHONPATH="$(realpath "$VENV_DIR"/lib/python*/site-packages):$PYTHONPATH"

# The tool is invoked via a relative path, so the job must not continue
# from the wrong directory if the checkout is missing.
if ! cd "$CODE_DIR"; then
    echo "could not change into $CODE_DIR" >&2
    exit 1
fi

# Run the preprocessing tool from the code checkout on the input file
# with the GPT-2 BPE tokenizer files. `--workers 8` is one fewer than
# the 9 CPUs requested per task in the job header.
srun python ./tools/preprocess_data.py \
    --input "$INPUT_PATH" \
    --output-prefix "$OUTPUT_PREFIX" \
    --vocab "$VOCAB_FILE" \
    --dataset-impl mmap \
    --tokenizer-type GPT2BPETokenizer \
    --merge-file "$MERGE_FILE" \
    --append-eod \
    --workers 8
srun_status=$?

# Propagate a failure instead of announcing success: the exit status of
# the batch script is what the scheduler records for the job.
if [ "$srun_status" -ne 0 ]; then
    echo "preprocessing failed with exit status $srun_status" >&2
    exit "$srun_status"
fi

echo 'Done!'