# Select Git revision
# Forked from
# Stefan Kesselheim / mlperf_juwelsbooster
# 8 commits behind the upstream repository.
#
# Stefan Kesselheim authored
# Dockerfile 4.18 KiB
# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Base image for the build; override with --build-arg FROM_IMAGE_NAME=<image>.
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/mxnet:20.06-py3
FROM ${FROM_IMAGE_NAME}

# Clear two paths out of the base image before MLNX_OFED is installed:
#   /usr/local/ucx         - a UCX installation that is already present in the
#                            image and is not wanted.
#   /opt/conda/bin/python  - the Mellanox NEO-Host package does not like
#                            python2, so this link has to go.
RUN rm -rf \
      /usr/local/ucx \
      /opt/conda/bin/python
# Version of the Mellanox OFED (MLNX_OFED) user-space stack to install;
# override with --build-arg MOFEDVERSION=<version>.
ARG MOFEDVERSION=5.1-2.5.8.0

# Download, unpack and install MLNX_OFED (user-space components only, no
# firmware update) in a single layer, so that neither the tarball nor the
# unpacked installer tree ends up stored in the image.
#
# The NEO-Host backend install log is always printed because it is the first
# place to look when the installer misbehaves, but printing it is best-effort:
# the RUN step finishes with the exit status of mlnxofedinstall itself, so a
# failed installation fails the build instead of being masked by `cat`.
#
# The apt package index is intentionally left in place, because later steps
# in this file install further packages.
#
# NOTE(review): the archive is fetched over plain HTTP and is not verified
# against a checksum; consider pinning a sha256 for the chosen version.
RUN set -eu; \
    apt-get update; \
    apt-get install -y --no-install-recommends libcap2; \
    mofed_dir="MLNX_OFED_LINUX-${MOFEDVERSION}-ubuntu18.04-$(uname -m)"; \
    wget "http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MOFEDVERSION}/${mofed_dir}.tgz"; \
    tar -xzf "${mofed_dir}.tgz"; \
    rm -f "${mofed_dir}.tgz"; \
    install_status=0; \
    (cd "${mofed_dir}" && ./mlnxofedinstall --force --user-space-only --without-fw-update) \
        || install_status=$?; \
    cat /tmp/MLNX_OFED_LINUX.*.logs/neohost-backend.debinstall.log || true; \
    rm -rf "${mofed_dir}"; \
    exit "${install_status}"
# The plugin is almost working
#run apt-get remove ucx && rm -rf /usr/local/ucx
#
#run git clone http://github.com/Mellanox/nccl-rdma-sharp-plugins
#workdir /workspace/nccl-rdma-sharp-plugins
#run git checkout v2.0.x
#run ./autogen.sh
#env LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda-11.0/targets/x86_64-linux/lib
#env LIBRARY_PATH=${LIBRARY_PATH}:/usr/local/cuda-11.0/targets/x86_64-linux/lib
#env CPATH=${CPATH}:/usr/local/cuda-11.0/targets/x86_64-linux/include
#run ./configure --with-ucx=/usr
#run make -j && make install
#CPATH=$CPATH:/usr/local/cuda-11.0/targets/x86_64-linux/include
#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.0/targets/x86_64-linux/lib
#WORKDIR MLNX_OFED_LINUX-5.1-2.5.8.0-ubuntu18.04-x86_64/DEBS
#run apt-get remove ibverbs-providers libibverbs-dev libibverbs2 ibverbs-utils
#run dpkg -i ibverbs-providers_51mlnx1-1.51258_amd64.deb ibverbs-utils_51mlnx1-1.51258_amd64.deb libibverbs1_51mlnx1-1.51258_amd64.deb librdmacm1_51mlnx1-1.51258_amd64.deb ucx_1.9.0-1.51258_amd64.deb libibumad3_51mlnx1-1.51258_amd64.deb sharp_2.2.2.MLNX20201102.b26a0fd-1.51258_amd64.deb
#WORKDIR /workspace
#
#ENV DEBIAN_FRONTEND=noninteractive
#RUN apt-get update && apt-get install -y nvidia-driver-460
#ENV LD_LIBRARY_PATH="/usr/local/cuda/compat/lib.real:/${LD_LIBRARY_PATH}"
#
#RUN apt install --no-install-recommends -y build-essential devscripts debhelper check libsubunit-dev fakeroot pkg-config dkms
#RUN git clone https://github.com/NVIDIA/gdrcopy
#RUN ln -s /usr/local/cuda/compat/lib.real/libcuda.so /usr/local/cuda-11.0/lib64
#RUN cd gdrcopy/packages && \
# CUDA=/usr/local/cuda-11.0 ./build-deb-packages.sh && \
# dpkg -i gdrdrv-dkms_2.2-1_amd64.deb &&\
# dpkg -i gdrcopy_2.2-1_amd64.deb
# A separate UCX build is not required, because UCX is already part of the MLNX_OFED package
#RUN apt-get update && apt-get install -y libnuma-dev
## UCX. There is a package for CentOS 8 but it is old, so build from source.
#ARG UCX_VERSION=1.9.0
#ENV CPATH=$CPATH:/usr/local/cuda-11.0/include
#RUN git clone --branch v${UCX_VERSION} --depth 1 \
# https://github.com/openucx/ucx.git \
# && cd ucx \
# && ./autogen.sh \
# && ./contrib/configure-release-mt --prefix=/usr/local --with-cuda=/usr/local/cuda-11.0/targets/x86_64-linux/ --with-verbs --without-java --disable-doxygen-doc --enable-optimizations --enable-mt --disable-debug --disable-logging --disable-assertions --disable-params-check --disable-dependency-tracking --enable-cma --with-rc --with-ud --with-dc --with-mlx5-dv --with-ib-hw-tm --with-dm --without-cm --with-avx --with-gdrcopy \
# && make -j$(getconf _NPROCESSORS_ONLN) install
# Replace the installed libnccl2 package with libnccl2/libnccl-dev taken from
# NVIDIA's CUDA apt repository for Ubuntu 18.04 (x86_64).
#
# - NVIDIA rotated the repository signing key in April 2022; the former
#   7fa2af80 key no longer validates the repository metadata, so the current
#   3bf863cc key is fetched instead.
# - The package index is refreshed inside this layer, both before installing
#   software-properties-common (provides add-apt-repository) and after the
#   repository has been added, so the step does not depend on a stale index
#   left behind by an earlier layer.
# - The apt lists are removed at the end to keep them out of the image.
RUN apt-get remove -y libnccl2 \
 && apt-get update \
 && apt-get install -y software-properties-common \
 && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \
 && add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /" \
 && apt-get update \
 && apt-get install -y libnccl2 libnccl-dev \
 && rm -rf /var/lib/apt/lists/*