diff --git a/orttraining/tools/amdgpu/Dockerfile.rocm4.2.pytorch b/orttraining/tools/amdgpu/Dockerfile.rocm4.2.pytorch new file mode 100644 index 0000000000..243896ee89 --- /dev/null +++ b/orttraining/tools/amdgpu/Dockerfile.rocm4.2.pytorch @@ -0,0 +1,171 @@ +# docker build --network=host --file Dockerfile.rocm4.2.pytorch --tag ort:rocm4.2-pytorch . + +FROM rocm/pytorch:rocm4.2_ubuntu18.04_py3.6_pytorch_1.8.1 + +RUN apt-get -y install gpg-agent +RUN wget -q -O - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - +RUN echo 'deb [arch=amd64] http://repo.radeon.com/rocm/apt/4.2/ xenial main' | tee /etc/apt/sources.list.d/rocm.list + +RUN apt-get -y update +RUN apt-get -y install apt-utils +RUN apt-get -y install build-essential autotools-dev \ + make git curl vim wget rsync jq openssh-server openssh-client sudo \ + iputils-ping net-tools ethtool libcap2 \ + automake autoconf libtool flex doxygen \ + perl lsb-release iproute2 pciutils graphviz \ + bc tar git bash pbzip2 pv bzip2 unzip cabextract \ + g++ gcc \ + && apt-get autoremove + +# sh +RUN rm /bin/sh && ln -s /bin/bash /bin/sh + +# Labels for the docker +LABEL description="This docker sets up the environment to run ORT Training with AMD GPU" + +# CMake +ENV CMAKE_VERSION=3.18.2 +RUN cd /usr/local && \ + wget -q -O - https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz | tar zxf - +ENV PATH=/usr/local/cmake-${CMAKE_VERSION}-Linux-x86_64/bin:${PATH} + +ENV WORKSPACE_DIR=/workspace +RUN mkdir -p $WORKSPACE_DIR +WORKDIR $WORKSPACE_DIR + +ENV OLD_PATH=${PATH} +ENV PATH=/usr/bin:${PATH} +# Infiniband setup, openmpi installed under /usr/mpi/gcc/openmpi-4.0.4rc3 doesn't support multi-thread +ENV MOFED_VERSION=5.1-0.6.6.0 +ENV MOFED_OS=ubuntu18.04 +ENV MOFED_FILENAME=MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64 +RUN curl -fSsL https://www.mellanox.com/downloads/ofed/MLNX_OFED-${MOFED_VERSION}/${MOFED_FILENAME}.tgz | tar -zxpf - +RUN cd MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64 && \ + ./mlnxofedinstall --force --user-space-only --without-fw-update --hpc && \ + cd .. && \ + rm -r MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64 + +ENV PATH=${OLD_PATH} +ENV unset OLD_PATH + +# python env +RUN pip3 install --upgrade setuptools +ARG NUMPY_VERSION=1.18.5 +ARG ONNX_VERSION=1.7.0 +RUN pip3 install --no-cache-dir wheel tqdm boto3 requests six ipdb h5py html2text nltk progressbar pyyaml \ + git+https://github.com/NVIDIA/dllogger \ + numpy==${NUMPY_VERSION} \ + onnx=="${ONNX_VERSION}" + +ENV GITHUB_DIR=$WORKSPACE_DIR/github +RUN mkdir -p $GITHUB_DIR + +# UCX +WORKDIR $GITHUB_DIR +RUN apt-get -y update && apt-get -y --no-install-recommends install libnuma-dev +ARG UCX_VERSION=1.9.0-rc3 +ENV UCX_DIR=$WORKSPACE_DIR/ucx-$UCX_VERSION +RUN git clone https://github.com/openucx/ucx.git \ + && cd ucx \ + && git checkout v$UCX_VERSION \ + && ./autogen.sh \ + && mkdir build \ + && cd build \ + && ../contrib/configure-opt --prefix=$UCX_DIR --without-rocm --without-knem --without-cuda \ + && make -j"$(nproc)" \ + && make install \ + && cd .. \ + && rm -rf build + +# OpenMPI +# note: require --enable-orterun-prefix-by-default for Azure machine learning compute +# note: disable verbs as we use ucx middleware and don't want btl openib warnings +WORKDIR $GITHUB_DIR +ARG OPENMPI_BASEVERSION=4.0 +ARG OPENMPI_VERSION=${OPENMPI_BASEVERSION}.5 +ENV OPENMPI_DIR=$WORKSPACE_DIR/openmpi-${OPENMPI_VERSION} +RUN git clone --recursive https://github.com/open-mpi/ompi.git \ + && cd ompi \ + && git checkout v$OPENMPI_VERSION \ + && ./autogen.pl \ + && mkdir build \ + && cd build \ + && ../configure --prefix=$OPENMPI_DIR --with-ucx=$UCX_DIR --without-verbs \ + --enable-mpirun-prefix-by-default --enable-orterun-prefix-by-default \ + --enable-mca-no-build=btl-uct --disable-mpi-fortran \ + && make -j"$(nproc)" \ + && make install \ + && cd .. \ + && rm -rf build \ + && ldconfig \ + && test -f ${OPENMPI_DIR}/bin/mpic++ + +ENV PATH=$OPENMPI_DIR/bin:${PATH} +ENV LD_LIBRARY_PATH=$OPENMPI_DIR/lib:${LD_LIBRARY_PATH} + +# Create a wrapper for OpenMPI to allow running as root by default +RUN mv $OPENMPI_DIR/bin/mpirun $OPENMPI_DIR/bin/mpirun.real && \ + echo '#!/bin/bash' > $OPENMPI_DIR/bin/mpirun && \ + echo 'mpirun.real --allow-run-as-root "$@"' >> $OPENMPI_DIR/bin/mpirun && \ + chmod a+x $OPENMPI_DIR/bin/mpirun + +# install mpi4py (be sure to link existing /opt/openmpi-xxx) +RUN CC=mpicc MPICC=mpicc pip install mpi4py --no-binary mpi4py + +ARG CACHE_DATA=2021-05-18 + +# ONNX Runtime +WORKDIR $GITHUB_DIR +ENV ORT_DIR=$GITHUB_DIR/onnxruntime +RUN git clone --recursive https://github.com/microsoft/onnxruntime.git \ + && cd onnxruntime \ + && python3 tools/ci_build/build.py \ + --cmake_extra_defines ONNXRUNTIME_VERSION=`cat ./VERSION_NUMBER` \ + --build_dir build \ + --config Release \ + --parallel \ + --skip_tests \ + --build_wheel \ + --use_rocm --rocm_home /opt/rocm \ + --mpi_home $OPENMPI_DIR \ + --nccl_home /opt/rocm \ + --enable_training \ + && test -f $ORT_DIR/build/Release/onnxruntime_training_bert \ + && pip install $ORT_DIR/build/Release/dist/*.whl \ + && ldconfig + +RUN pip3 install --no-cache-dir GPUtil azureml azureml-core datasets tokenizers ninja cerberus sympy sacremoses sacrebleu + +# Huggingface Examples +WORKDIR $GITHUB_DIR +RUN git clone https://github.com/microsoft/huggingface-transformers.git + +# Enable ssh access without password needed +RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/g' /etc/ssh/sshd_config +RUN sed -i 's/#StrictModes yes/StrictModes no/g' /etc/ssh/sshd_config +RUN sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/g' /etc/ssh/sshd_config +RUN sed -i 's/#PermitEmptyPasswords no/PermitEmptyPasswords yes/g' /etc/ssh/sshd_config + +# Start or Restart sshd service +ENTRYPOINT service ssh restart && /bin/bash + +# Add model and scripts +ADD model ${WORKSPACE_DIR}/model +ADD script ${WORKSPACE_DIR}/script +RUN chmod a+x ${WORKSPACE_DIR}/script/run_bert.sh + +# add locale en_US.UTF-8 +RUN apt-get install -y locales +RUN locale-gen en_US.UTF-8 + +# Workaround an issue in AMD compiler which generates poor GPU ISA +# when the type of kernel parameter is a structure and “pass-by-value” is used +# ENV HSA_NO_SCRATCH_RECLAIM=1 + +# Distributed training related environment variables +ENV HSA_FORCE_FINE_GRAIN_PCIE=1 +# ENV NCCL_DEBUG=INFO +# ENV RCCL_ALLTOALL_KERNEL_DISABLE=1 +# ENV NCCL_DEBUG_SUBSYS=INIT,COLL + +WORKDIR ${WORKSPACE_DIR}/script