Skip to content
Snippets Groups Projects
Select Git revision
  • master
1 result

Dockerfile

Blame
  • Forked from Stefan Kesselheim / mlperf_juwelsbooster
    8 commits behind the upstream repository.
    Dockerfile 4.18 KiB
    # Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
    #
    # Licensed under the Apache License, Version 2.0 (the "License");
    # you may not use this file except in compliance with the License.
    # You may obtain a copy of the License at
    #
    #     http://www.apache.org/licenses/LICENSE-2.0
    #
    # Unless required by applicable law or agreed to in writing, software
    # distributed under the License is distributed on an "AS IS" BASIS,
    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    # See the License for the specific language governing permissions and
    # limitations under the License.
    
    # Base image. Override at build time with
    #   --build-arg FROM_IMAGE_NAME=<image>
    # to build on top of a different image.
    ARG FROM_IMAGE_NAME=nvcr.io/nvidia/mxnet:20.06-py3
    FROM ${FROM_IMAGE_NAME}

    # The base image contains a UCX installation under /usr/local/ucx; remove it.
    # Mellanox neohost (installed by MLNX_OFED below) does not like python2, so the
    # /opt/conda/bin/python link must be removed as well.
    RUN rm -rf /usr/local/ucx /opt/conda/bin/python

    # MLNX_OFED release to download and install.
    ARG MOFEDVERSION=5.1-2.5.8.0

    # libcap2 is installed ahead of the OFED installer run. The package index is
    # left in place because subsequent steps install further packages.
    RUN apt-get update && apt-get install -y --no-install-recommends libcap2

    # Download, unpack and run the MLNX_OFED installer (user-space components only,
    # no firmware update) in a single layer, so the tarball and the unpacked tree
    # can be deleted again without staying behind in an earlier layer.
    #
    # The neohost backend install log is always printed to help debugging, but the
    # build result is the installer's own exit status: a failed installation fails
    # the build instead of being masked by a successful `cat`.
    #
    # NOTE(review): the tarball is not checksum-verified; consider pinning a
    # sha256 for the chosen MOFEDVERSION.
    RUN set -eu; \
        ofed_dir="MLNX_OFED_LINUX-${MOFEDVERSION}-ubuntu18.04-$(uname -m)"; \
        wget -q "https://www.mellanox.com/downloads/ofed/MLNX_OFED-${MOFEDVERSION}/${ofed_dir}.tgz"; \
        tar -xzf "${ofed_dir}.tgz"; \
        status=0; \
        ( cd "${ofed_dir}" \
          && ./mlnxofedinstall --force --user-space-only --without-fw-update ) || status=$?; \
        cat /tmp/MLNX_OFED_LINUX.*.logs/neohost-backend.debinstall.log || true; \
        rm -rf "${ofed_dir}" "${ofed_dir}.tgz"; \
        exit "${status}"
    
    # The plugin is almost working
    #run apt-get remove ucx && rm -rf /usr/local/ucx
    #
    #run git clone http://github.com/Mellanox/nccl-rdma-sharp-plugins
    #workdir /workspace/nccl-rdma-sharp-plugins
    #run git checkout v2.0.x
    #run ./autogen.sh
    #env LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda-11.0/targets/x86_64-linux/lib
    #env LIBRARY_PATH=${LIBRARY_PATH}:/usr/local/cuda-11.0/targets/x86_64-linux/lib
    #env CPATH=${CPATH}:/usr/local/cuda-11.0/targets/x86_64-linux/include
    #run ./configure --with-ucx=/usr 
    #run make -j && make install
    
    #CPATH=$CPATH:/usr/local/cuda-11.0/targets/x86_64-linux/include
    #export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.0/targets/x86_64-linux/lib
    
    #WORKDIR MLNX_OFED_LINUX-5.1-2.5.8.0-ubuntu18.04-x86_64/DEBS
    #run apt-get remove ibverbs-providers libibverbs-dev libibverbs2 ibverbs-utils
    #run dpkg -i ibverbs-providers_51mlnx1-1.51258_amd64.deb ibverbs-utils_51mlnx1-1.51258_amd64.deb libibverbs1_51mlnx1-1.51258_amd64.deb librdmacm1_51mlnx1-1.51258_amd64.deb ucx_1.9.0-1.51258_amd64.deb libibumad3_51mlnx1-1.51258_amd64.deb sharp_2.2.2.MLNX20201102.b26a0fd-1.51258_amd64.deb
    
    
    #WORKDIR /workspace
    #
    #ENV DEBIAN_FRONTEND=noninteractive
    #RUN apt-get update && apt-get install -y nvidia-driver-460
    #ENV LD_LIBRARY_PATH="/usr/local/cuda/compat/lib.real:/${LD_LIBRARY_PATH}"
    #
    #RUN apt install --no-install-recommends -y build-essential devscripts debhelper check libsubunit-dev fakeroot pkg-config dkms
    #RUN git clone https://github.com/NVIDIA/gdrcopy
    #RUN ln -s /usr/local/cuda/compat/lib.real/libcuda.so /usr/local/cuda-11.0/lib64
    #RUN cd gdrcopy/packages  && \
    #    CUDA=/usr/local/cuda-11.0 ./build-deb-packages.sh && \
    #    dpkg -i gdrdrv-dkms_2.2-1_amd64.deb &&\
    #    dpkg -i gdrcopy_2.2-1_amd64.deb
    
    # UCX is not required here, because it is already included in the MLNX_OFED package
    #RUN apt-get update && apt-get install -y libnuma-dev
    ## UCX. There is a package for CentOS 8 but it is old, so build from source.
    #ARG UCX_VERSION=1.9.0
    #ENV CPATH=$CPATH:/usr/local/cuda-11.0/include
    #RUN git clone --branch v${UCX_VERSION} --depth 1 \
    #              https://github.com/openucx/ucx.git \
    # && cd ucx \
    # && ./autogen.sh \
    # && ./contrib/configure-release-mt --prefix=/usr/local --with-cuda=/usr/local/cuda-11.0/targets/x86_64-linux/ --with-verbs --without-java --disable-doxygen-doc --enable-optimizations --enable-mt --disable-debug --disable-logging --disable-assertions --disable-params-check --disable-dependency-tracking --enable-cma --with-rc --with-ud --with-dc --with-mlx5-dv --with-ib-hw-tm --with-dm --without-cm --with-avx --with-gdrcopy  \
    # && make -j$(getconf _NPROCESSORS_ONLN) install
    
    # Replace the libnccl2 package present in the image with libnccl2/libnccl-dev
    # from NVIDIA's CUDA apt repository for Ubuntu 18.04.
    #
    # Everything happens in one layer:
    #   * the package index is refreshed in the same RUN as the installs, so the
    #     step does not depend on a stale or missing index from an earlier layer;
    #   * the repository is registered with NVIDIA's current signing key
    #     (3bf863cc), which replaced the retired 7fa2af80 key in 2022 -- with the
    #     old key the repository's packages can no longer be authenticated;
    #   * the index is refreshed once more after the repository has been added,
    #     and the downloaded package lists are removed at the end.
    #
    # NOTE(review): libnccl2/libnccl-dev are unpinned, so apt picks the newest
    # build in the repository; pin a version matching the image's CUDA release if
    # reproducibility matters.
    RUN apt-get update \
     && apt-get remove -y libnccl2 \
     && apt-get install -y software-properties-common \
     && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \
     && add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /" \
     && apt-get update \
     && apt-get install -y libnccl2 libnccl-dev \
     && rm -rf /var/lib/apt/lists/*