set_up.sbatch

    #!/usr/bin/env bash
    
    #SBATCH --account opengptx-elm
    #SBATCH --partition develbooster
    #SBATCH --nodes 1
    #SBATCH --gres gpu:1
    #SBATCH --time 01:00:00
    
    # It is probably better not to run this as an sbatch script, since the
    # compute nodes are offline (no internet access). You could work around
    # that by setting the Git repositories up in advance and pre-downloading
    # packages with `python -m pip download -r requirements.txt` and
    # similar; see the sketch below.
    #
    # Otherwise, just run it as `nice bash set_up.sbatch`.
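    # A rough sketch of that offline workaround (kept commented out so this
    # script's behavior is unchanged; the `wheelhouse` directory is a
    # hypothetical name, and `$ROOT_DIR`/`$CODE_DIR` are the directories
    # defined further below):
    #
    #   # On a login node with internet access:
    #   git clone https://github.com/bigscience-workshop/Megatron-DeepSpeed "$CODE_DIR"
    #   python -m pip download -r "$CODE_DIR"/requirements.txt -d "$ROOT_DIR"/wheelhouse
    #   # Later, on the offline compute node:
    #   python -m pip install --no-index --find-links "$ROOT_DIR"/wheelhouse \
    #       -r "$CODE_DIR"/requirements.txt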
    
    set -euo pipefail
    
    # `deactivate` is a shell function, so an `-x` file test would never find it.
    command -v deactivate > /dev/null && deactivate
    
    module purge
    # (Most) packages in requirements.txt aren't pinned to specific
    # versions, so use the system's modules where possible.
    module load Stages/2020 GCC CMake Ninja Python libaio
    # With PyTorch 1.10, we need the CUDA and NCCL modules ourselves (as we
    # don't load a PyTorch module):
    module load cuDNN NCCL
    
    ROOT_DIR=/p/project/opengptx/"$USER"/opengpt/bigscience
    VENV_DIR="$ROOT_DIR"/env
    CODE_DIR="$ROOT_DIR"/Megatron-DeepSpeed
    DO_PULL=1
    
    if ! [ -L "$HOME/.cache" ]; then
        echo 'Please link your `.cache` directory in order ' \
             'to avoid out-of-memory errors!'
        exit 1
    fi
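    # A sketch of the expected setup, kept commented out; the target path is
    # an assumption based on the project directory used below, and the point
    # is presumably to keep pip/library caches off the small `$HOME` quota:
    #
    #   mv "$HOME/.cache" /p/project/opengptx/"$USER"/.cache
    #   ln -s /p/project/opengptx/"$USER"/.cache "$HOME/.cache"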
    
    mkdir -p "$(dirname "$VENV_DIR")"
    
    [ -d "$CODE_DIR" ] \
        || git clone \
               https://github.com/bigscience-workshop/Megatron-DeepSpeed \
               "$CODE_DIR"
    
    [ -d "$VENV_DIR" ] || python -m venv --system-site-packages "$VENV_DIR"
    source "$VENV_DIR/bin/activate"
    # Prepend the venv's site-packages explicitly; revisit this if it
    # causes errors. (The `${PYTHONPATH:+...}` expansion avoids both an
    # unbound-variable error under `set -u` and a trailing `:`.)
    export PYTHONPATH="$(realpath "$VENV_DIR"/lib/python*/site-packages)${PYTHONPATH:+:$PYTHONPATH}"
    
    # Set `TORCH_CUDA_ARCH_LIST` according to A100 compute capability.
    export TORCH_CUDA_ARCH_LIST='8.0+PTX'
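    # (Compute capability 8.0 is the A100; `+PTX` additionally embeds PTX
    # so the kernels can be JIT-compiled for newer architectures.)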
    export CXX=g++
    
    python -m pip install --upgrade pip
    cd "$CODE_DIR"
    ((DO_PULL)) && git pull --rebase origin main
    git am /p/project/opengptx/ebert1/opengpt/bigscience/0001-Build-fused-kernels-in-temporary-directory.patch \
        || echo 'WARNING: Could not apply patch for building in temp dir.'
    # Remove previous build if any.
    rm -rf megatron/fused_kernels/build
    
    # For PyTorch 1.10, install it:
    python -m pip install torch==1.10.1+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
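    # Optional sanity check, kept commented out: confirm the installed wheel
    # matches the loaded CUDA stack.
    # python -c 'import torch; print(torch.__version__, torch.version.cuda)'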
    
    # From here on, we follow the Megatron-DeepSpeed setup instructions at
    # https://github.com/bigscience-workshop/Megatron-DeepSpeed#setup
    # (permalink) https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/7ab5c05ad2e3b4ee678307260c62e89ab592505a/README.md#setup
    
    # Install the BigScience Megatron-DeepSpeed requirements.
    # I got errors with DeepSpeed here, but those don't matter as we're
    # going to build it from source ourselves below. So temporarily
    # comment out the `deepspeed` requirement:
    sed -i 's/^[Dd]eep[Ss]peed/# &/' requirements.txt
    python -m pip install -r requirements.txt
    sed -i 's/^# \([Dd]eep[Ss]peed\)/\1/' requirements.txt
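    # (The first `sed` turns e.g. `deepspeed` into `# deepspeed` before the
    # install; the second one removes the `# ` prefix again afterwards.)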
    # python -m pip install -e .
    
    # We now follow
    # https://github.com/bigscience-workshop/bigscience/blob/master/jz/envs#building-things-from-source
    # (permalink) https://github.com/bigscience-workshop/bigscience/blob/ce5115c1c12f407e864f711f865a30251013835c/jz/envs/README.md#building-things-from-source
    # instead of the Megatron-DeepSpeed instructions.
    
    # Install DeepSpeed
    [ -d deepspeed ] || git clone https://github.com/microsoft/deepspeed
    cd deepspeed
    ((DO_PULL)) && git pull
    # Pre-build DeepSpeed
    DS_BUILD_DIR=/tmp/deepspeed
    export TMPDIR="$DS_BUILD_DIR"
    mkdir -p "$TMPDIR"
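    # (Assumption: pointing `TMPDIR` at `/tmp` keeps DeepSpeed's temporary
    # build files on node-local storage rather than the shared filesystem.)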
    
    rm -rf build
    # There are three variations of the install command below: one in the
    # JZ README linked above; one in
    # https://github.com/bigscience-workshop/bigscience/blob/master/jz/envs/deepspeed/build.sh
    # (permalink) https://github.com/bigscience-workshop/bigscience/blob/ce5115c1c12f407e864f711f865a30251013835c/jz/envs/deepspeed/build.sh
    # ; and one in the original Megatron-DeepSpeed repository:
    # https://github.com/bigscience-workshop/Megatron-DeepSpeed#setup
    # (permalink) https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/7ab5c05ad2e3b4ee678307260c62e89ab592505a/README.md#setup
    
    # We are _not_ going with the more recently edited version as of
    # 2022-01-10 (the one from the JZ README). Instead we use the
    # `build.sh` version because it's probably the one actively used.
    
    # README version
    # Needed to install triton==1.0.0 manually.
    python -m pip install triton==1.0.0
    # Needed to install ninja manually.
    python -m pip install ninja
    # `-y` keeps the uninstall non-interactive, which matters in batch jobs.
    python -m pip uninstall -y deepspeed
    # time \
    #     DS_BUILD_CPU_ADAM=1 \
    #     DS_BUILD_UTILS=1 \
    #     python -m pip install \
    #     -e . \
    #     --global-option="build_ext" \
    #     --global-option="-j8" \
    #     --no-cache \
    #     -v \
    #     --disable-pip-version-check \
    #     2>&1 \
    #     | tee build.log
    # `build.sh` version
    time \
        DS_BUILD_CPU_ADAM=1 \
        DS_BUILD_FUSED_ADAM=1 \
        DS_BUILD_FUSED_LAMB=1 \
        DS_BUILD_TRANSFORMER=1 \
        DS_BUILD_STOCHASTIC_TRANSFORMER=1 \
        DS_BUILD_UTILS=1 \
        python -m pip install -e . \
        --global-option="build_ext" \
        --global-option="-j8" \
        --no-cache \
        -v \
        --disable-pip-version-check 2>&1 \
        | tee build.log
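    # Optional: `ds_report` (installed together with DeepSpeed) summarizes
    # which of the fused ops were actually built.
    # ds_report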
    cd ..
    
    # Install Apex
    [ -d apex ] || git clone https://github.com/NVIDIA/apex
    cd apex
    ((DO_PULL)) && git pull
    python -m pip install \
           --global-option="--cpp_ext" \
           --global-option="--cuda_ext" \
           --no-cache \
           -v \
           --disable-pip-version-check \
           . \
           2>&1 \
        | tee build.log
    cd ..
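    # Optional sanity check, kept commented out:
    # python -c 'import apex; print(apex.__file__)'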
    
    echo 'Done!'