#!/usr/bin/env bash
#SBATCH --account opengptx-elm
#SBATCH --partition develbooster
#SBATCH --nodes 1
#SBATCH --gres gpu:1
#SBATCH --time 01:00:00
# It's probably better not to run this as an sbatch script, since the
# compute nodes are offline. You could work around that by setting the
# Git repositories up in advance and using `python -m pip download -r
# requirements.txt` and similar (see the sketch below).
#
# Otherwise just use `nice bash set_up.sbatch`.
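# A minimal sketch of that workaround (run on a login node with internet
# access first; the cache path is an assumption, adjust to taste):
#
#   PIP_CACHE=/p/project/opengptx/"$USER"/pip-cache
#   python -m pip download -r requirements.txt -d "$PIP_CACHE"
#   # Then, on the offline compute node:
#   python -m pip install --no-index --find-links "$PIP_CACHE" -r requirements.txt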
set -euo pipefail
[ -x "$(command -v deactivate)" ] && deactivate
module purge
# (Most) packages in requirements.txt aren't pinned to specific
# versions, so use our environment modules where possible.
module load Stages/2020 GCC CMake Ninja Python libaio
# With PyTorch 1.10, we need CUDA and NCCL modules (as we don't load a PyTorch module):
module load cuDNN NCCL
ROOT_DIR=/p/project/opengptx/"$USER"/opengpt/bigscience
VENV_DIR="$ROOT_DIR"/env
CODE_DIR="$ROOT_DIR"/Megatron-DeepSpeed
DO_PULL=1
if ! [ -L "$HOME/.cache" ]; then
echo 'Please link your `.cache` directory in order' \
'to avoid out-of-memory errors!'
exit 1
fi
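# A hedged example of setting up such a link (the scratch path is an
# assumption; use whatever large filesystem your project has):
#
#   mv "$HOME/.cache" /p/scratch/opengptx/"$USER"/cache
#   ln -s /p/scratch/opengptx/"$USER"/cache "$HOME/.cache"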
mkdir -p "$(dirname "$VENV_DIR")"
[ -d "$CODE_DIR" ] \
|| git clone \
https://github.com/bigscience-workshop/Megatron-DeepSpeed \
"$CODE_DIR"
[ -d "$VENV_DIR" ] || python -m venv --system-site-packages "$VENV_DIR"
source "$VENV_DIR/bin/activate"
# Maybe remove this later if it causes errors.
# (`${PYTHONPATH:-}` so an unset PYTHONPATH doesn't trip `set -u`.)
export PYTHONPATH="$(realpath "$VENV_DIR"/lib/python*/site-packages):${PYTHONPATH:-}"
# Set `TORCH_CUDA_ARCH_LIST` according to the A100's compute capability (8.0).
export TORCH_CUDA_ARCH_LIST='8.0+PTX'
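# If you target a different GPU, you can query its compute capability
# via PyTorch's standard API on a GPU node:
#
#   python -c 'import torch; print(torch.cuda.get_device_capability())'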
export CXX=g++
python -m pip install --upgrade pip
cd "$CODE_DIR"
((DO_PULL)) && git pull --rebase origin main
# (If the patch fails, abort the `git am` session so leftover state
# doesn't break later Git operations or re-runs.)
git am /p/project/opengptx/ebert1/opengpt/bigscience/0001-Build-fused-kernels-in-temporary-directory.patch \
    || { git am --abort >/dev/null 2>&1 || true
         echo 'WARNING: Could not apply patch for building in temp dir.'; }
# Remove previous build if any.
rm -rf megatron/fused_kernels/build
# Install PyTorch 1.10 (the CUDA 11.3 build):
python -m pip install torch==1.10.1+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
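# Optional sanity check that the CUDA build was picked up (standard
# PyTorch attributes):
#
#   python -c 'import torch; print(torch.__version__, torch.version.cuda)'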
# Following from here on the Megatron-DeepSpeed instructions at
# https://github.com/bigscience-workshop/Megatron-DeepSpeed#setup
# (permalink) https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/7ab5c05ad2e3b4ee678307260c62e89ab592505a/README.md#setup
# Install BigScience repo
# I got errors with DeepSpeed here, but those don't matter as we're
# going to install it ourselves.
# So just comment out the `deepspeed` requirement.
sed -i 's/^[Dd]eep[Ss]peed/# &/' requirements.txt
python -m pip install -r requirements.txt
sed -i 's/^# \([Dd]eep[Ss]peed\)/\1/' requirements.txt
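# The two `sed` calls above should round-trip exactly; to verify that
# requirements.txt is back to its committed state:
#
#   git diff --exit-code requirements.txt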
# python -m pip install -e .
# We now follow
# https://github.com/bigscience-workshop/bigscience/blob/master/jz/envs#building-things-from-source
# (permalink) https://github.com/bigscience-workshop/bigscience/blob/ce5115c1c12f407e864f711f865a30251013835c/jz/envs/README.md#building-things-from-source
# instead of the Megatron-DeepSpeed instructions.
# Install DeepSpeed
[ -d deepspeed ] || git clone https://github.com/microsoft/deepspeed
cd deepspeed
((DO_PULL)) && git pull
# Pre-build DeepSpeed
DS_BUILD_DIR=/tmp/deepspeed
export TMPDIR="$DS_BUILD_DIR"
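# (Redirecting TMPDIR should keep build temporaries on node-local /tmp
# instead of the shared project filesystem; adjust if your nodes lack a
# writable /tmp.)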
mkdir -p "$TMPDIR"
rm -rf build
# There are three variations of this build invocation: one in the JZ
# README linked above, one in
# https://github.com/bigscience-workshop/bigscience/blob/master/jz/envs/deepspeed/build.sh
# (permalink) https://github.com/bigscience-workshop/bigscience/blob/ce5115c1c12f407e864f711f865a30251013835c/jz/envs/deepspeed/build.sh
# and one in the original Megatron-DeepSpeed repository:
# https://github.com/bigscience-workshop/Megatron-DeepSpeed#setup
# (permalink) https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/7ab5c05ad2e3b4ee678307260c62e89ab592505a/README.md#setup
# We are _not_ going with the more recently edited version as of
# 2022-01-10 (the one from the JZ README). Instead we use the
# `build.sh` version because it's probably the one actively used.
# README version
# Needed to install triton==1.0.0 manually.
python -m pip install triton==1.0.0
# Needed to install ninja manually.
python -m pip install ninja
# `-y` skips pip's confirmation prompt, which would hang a batch job.
python -m pip uninstall -y deepspeed
# time \
# DS_BUILD_CPU_ADAM=1 \
# DS_BUILD_UTILS=1 \
# python -m pip install \
# -e . \
# --global-option="build_ext" \
# --global-option="-j8" \
# --no-cache \
# -v \
# --disable-pip-version-check \
# 2>&1 \
# | tee build.log
# `build.sh` version
time \
DS_BUILD_CPU_ADAM=1 \
DS_BUILD_FUSED_ADAM=1 \
DS_BUILD_FUSED_LAMB=1 \
DS_BUILD_TRANSFORMER=1 \
DS_BUILD_STOCHASTIC_TRANSFORMER=1 \
DS_BUILD_UTILS=1 \
python -m pip install -e . \
--global-option="build_ext" \
--global-option="-j8" \
--no-cache \
-v \
--disable-pip-version-check 2>&1 \
| tee build.log
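# Optional sanity check: `ds_report` ships with DeepSpeed and lists
# which ops were pre-compiled:
#
#   ds_report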
cd ..
# Install Apex
[ -d apex ] || git clone https://github.com/NVIDIA/apex
cd apex
((DO_PULL)) && git pull
python -m pip install \
--global-option="--cpp_ext" \
--global-option="--cuda_ext" \
--no-cache \
-v \
--disable-pip-version-check \
. \
2>&1 \
| tee build.log
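# Optional sanity check that the compiled extensions import; these
# module names are an assumption based on common Apex builds with the
# flags above:
#
#   python -c 'import amp_C, fused_layer_norm_cuda'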
cd ..
echo 'Done!'