#!/usr/bin/env bash
#SBATCH --account opengptx-elm
#SBATCH --partition develbooster
#SBATCH --nodes 1
#SBATCH --gres gpu:1
#SBATCH --time 01:00:00
# It's probably better not to run this as an sbatch script, since the
# compute nodes are offline. You could work around that by setting the
# Git repositories up in advance and using `python -m pip download -r
# requirements.txt` and similar (see the sketch below).
#
# Otherwise just use `nice bash set_up.sbatch`.
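# A minimal sketch of that workaround (run on a login node with internet
# access first; the cache path is an assumption, adjust to taste):
#
#   PIP_CACHE=/p/project/opengptx/"$USER"/pip-cache
#   python -m pip download -r requirements.txt -d "$PIP_CACHE"
#   # Then, on the offline compute node:
#   python -m pip install --no-index --find-links "$PIP_CACHE" -r requirements.txt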
set -euo pipefail
[ -x "$(command -v deactivate)" ] && deactivate
module purge
# (Most) packages in requirements.txt aren't pinned to specific
# versions, so use our environment modules where possible.
module load Stages/2020 GCC CMake Ninja Python libaio
# With PyTorch 1.10, we need CUDA and NCCL modules (as we don't load a PyTorch module):
module load cuDNN NCCL
ROOT_DIR=/p/project/opengptx/"$USER"/opengpt/bigscience
VENV_DIR="$ROOT_DIR"/env
CODE_DIR="$ROOT_DIR"/Megatron-DeepSpeed
DO_PULL=1
if ! [ -L "$HOME/.cache" ]; then
echo 'Please link your `.cache` directory in order' \
'to avoid out-of-memory errors!'
exit 1
fi
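# A hedged example of setting up such a link (the scratch path is an
# assumption; use whatever large filesystem your project has):
#
#   mv "$HOME/.cache" /p/scratch/opengptx/"$USER"/cache
#   ln -s /p/scratch/opengptx/"$USER"/cache "$HOME/.cache"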
mkdir -p "$(dirname "$VENV_DIR")"
[ -d "$CODE_DIR" ] \
|| git clone \
https://github.com/bigscience-workshop/Megatron-DeepSpeed \
"$CODE_DIR"
[ -d "$VENV_DIR" ] || python -m venv --system-site-packages "$VENV_DIR"
source "$VENV_DIR/bin/activate"
# Maybe remove this later if it causes errors.
# (`${PYTHONPATH:-}` so an unset PYTHONPATH doesn't trip `set -u`.)
export PYTHONPATH="$(realpath "$VENV_DIR"/lib/python*/site-packages):${PYTHONPATH:-}"
# Set `TORCH_CUDA_ARCH_LIST` according to the A100's compute capability (8.0).
export TORCH_CUDA_ARCH_LIST='8.0+PTX'
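# If you target a different GPU, you can query its compute capability
# via PyTorch's standard API on a GPU node:
#
#   python -c 'import torch; print(torch.cuda.get_device_capability())'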
export CXX=g++
python -m pip install --upgrade pip
cd "$CODE_DIR"
((DO_PULL)) && git pull --rebase origin main
# (If the patch fails, abort the `git am` session so leftover state
# doesn't break later Git operations or re-runs.)
git am /p/project/opengptx/ebert1/opengpt/bigscience/0001-Build-fused-kernels-in-temporary-directory.patch \
    || { git am --abort >/dev/null 2>&1 || true
         echo 'WARNING: Could not apply patch for building in temp dir.'; }
# Remove previous build if any.
rm -rf megatron/fused_kernels/build
# Install PyTorch 1.10 (the CUDA 11.3 build):
python -m pip install torch==1.10.1+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
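# Optional sanity check that the CUDA build was picked up (standard
# PyTorch attributes):
#
#   python -c 'import torch; print(torch.__version__, torch.version.cuda)'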
# Following from here on the Megatron-DeepSpeed instructions at
# https://github.com/bigscience-workshop/Megatron-DeepSpeed#setup
# (permalink) https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/7ab5c05ad2e3b4ee678307260c62e89ab592505a/README.md#setup
# Install BigScience repo
# I got errors with DeepSpeed here, but those don't matter as we're
# going to install it ourselves.
# So just comment out the `deepspeed` requirement.
sed -i 's/^[Dd]eep[Ss]peed/# &/' requirements.txt
python -m pip install -r requirements.txt
sed -i 's/^# \([Dd]eep[Ss]peed\)/\1/' requirements.txt
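# The two `sed` calls above should round-trip exactly; to verify that
# requirements.txt is back to its committed state:
#
#   git diff --exit-code requirements.txt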
# python -m pip install -e .
# We now follow
# https://github.com/bigscience-workshop/bigscience/blob/master/jz/envs#building-things-from-source
# (permalink) https://github.com/bigscience-workshop/bigscience/blob/ce5115c1c12f407e864f711f865a30251013835c/jz/envs/README.md#building-things-from-source
# instead of the Megatron-DeepSpeed instructions.
# Install DeepSpeed
[ -d deepspeed ] || git clone https://github.com/microsoft/deepspeed
cd deepspeed
((DO_PULL)) && git pull
# Pre-build DeepSpeed
DS_BUILD_DIR=/tmp/deepspeed
export TMPDIR="$DS_BUILD_DIR"
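# (Redirecting TMPDIR should keep build temporaries on node-local /tmp
# instead of the shared project filesystem; adjust if your nodes lack a
# writable /tmp.)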
mkdir -p "$TMPDIR"
rm -rf build
# There are three variations of this build invocation: one in the JZ
# README linked above, one in
# https://github.com/bigscience-workshop/bigscience/blob/master/jz/envs/deepspeed/build.sh
# (permalink) https://github.com/bigscience-workshop/bigscience/blob/ce5115c1c12f407e864f711f865a30251013835c/jz/envs/deepspeed/build.sh
# and one in the original Megatron-DeepSpeed repository:
# https://github.com/bigscience-workshop/Megatron-DeepSpeed#setup
# (permalink) https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/7ab5c05ad2e3b4ee678307260c62e89ab592505a/README.md#setup
# We are _not_ going with the more recently edited version as of
# 2022-01-10 (the one from the JZ README). Instead we use the
# `build.sh` version because it's probably the one actively used.
# README version
# Needed to install triton==1.0.0 manually.
python -m pip install triton==1.0.0
# Needed to install ninja manually.
python -m pip install ninja
# `-y` skips pip's confirmation prompt, which would hang a batch job.
python -m pip uninstall -y deepspeed
# time \
# DS_BUILD_CPU_ADAM=1 \
# DS_BUILD_UTILS=1 \
# python -m pip install \
# -e . \
# --global-option="build_ext" \
# --global-option="-j8" \
# --no-cache \
# -v \
# --disable-pip-version-check \
# 2>&1 \
# | tee build.log
# `build.sh` version
time \
DS_BUILD_CPU_ADAM=1 \
DS_BUILD_FUSED_ADAM=1 \
DS_BUILD_FUSED_LAMB=1 \
DS_BUILD_TRANSFORMER=1 \
DS_BUILD_STOCHASTIC_TRANSFORMER=1 \
DS_BUILD_UTILS=1 \
python -m pip install -e . \
--global-option="build_ext" \
--global-option="-j8" \
--no-cache \
-v \
--disable-pip-version-check 2>&1 \
| tee build.log
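# Optional sanity check: `ds_report` ships with DeepSpeed and lists
# which ops were pre-compiled:
#
#   ds_report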
cd ..
# Install Apex
[ -d apex ] || git clone https://github.com/NVIDIA/apex
cd apex
((DO_PULL)) && git pull
python -m pip install \
--global-option="--cpp_ext" \
--global-option="--cuda_ext" \
--no-cache \
-v \
--disable-pip-version-check \
. \
2>&1 \
| tee build.log
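# Optional sanity check that the compiled extensions import; these
# module names are an assumption based on common Apex builds with the
# flags above:
#
#   python -c 'import amp_C, fused_layer_norm_cuda'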
cd ..
echo 'Done!'