onnxruntime/orttraining/tools/amdgpu/Dockerfile.rocm4.0.pytorch

# docker build --network=host --file Dockerfile.rocm4.0.pytorch --tag ort:rocm4.0-pytorch .

FROM rocm/pytorch:rocm4.0_ubuntu18.04_py3.6_pytorch

RUN apt-get -y install gpg-agent
RUN wget -q -O - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
RUN echo 'deb [arch=amd64] http://repo.radeon.com/rocm/apt/4.0/ xenial main' | tee /etc/apt/sources.list.d/rocm.list

RUN apt-get -y update
RUN apt-get -y install apt-utils
RUN apt-get -y install build-essential autotools-dev \
    make git curl vim wget rsync jq openssh-server openssh-client sudo \
    iputils-ping net-tools ethtool libcap2 \
    automake autoconf libtool flex doxygen \
    perl lsb-release iproute2 pciutils graphviz \
    bc tar git bash pbzip2 pv bzip2 unzip cabextract \
    g++ gcc \
    && apt-get autoremove

# sh
RUN rm /bin/sh && ln -s /bin/bash /bin/sh

# Labels for the docker
LABEL description="This docker sets up the environment to run ORT Training with AMD GPU"

# CMake
ENV CMAKE_VERSION=3.18.2
RUN cd /usr/local && \
    wget -q -O - https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz | tar zxf -
ENV PATH=/usr/local/cmake-${CMAKE_VERSION}-Linux-x86_64/bin:${PATH}

ENV WORKSPACE_DIR=/workspace
RUN mkdir -p $WORKSPACE_DIR
WORKDIR $WORKSPACE_DIR

ENV OLD_PATH=${PATH}
ENV PATH=/usr/bin:${PATH}
# Infiniband setup, openmpi installed under /usr/mpi/gcc/openmpi-4.0.4rc3 doesn't support multi-thread
ENV MOFED_VERSION=5.1-0.6.6.0
ENV MOFED_OS=ubuntu18.04
ENV MOFED_FILENAME=MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64
RUN curl -fSsL https://www.mellanox.com/downloads/ofed/MLNX_OFED-${MOFED_VERSION}/${MOFED_FILENAME}.tgz | tar -zxpf -
RUN cd MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64 && \
    ./mlnxofedinstall --force --user-space-only --without-fw-update --hpc && \
    cd .. && \
    rm -r MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64

ENV PATH=${OLD_PATH}
ENV unset OLD_PATH

# python env
ARG NUMPY_VERSION=1.18.5
ARG ONNX_VERSION=1.7.0
RUN pip3 install --no-cache-dir wheel tqdm boto3 requests six ipdb h5py html2text nltk progressbar pyyaml \
        git+https://github.com/NVIDIA/dllogger \
        numpy==${NUMPY_VERSION} \
        onnx=="${ONNX_VERSION}"

ENV GITHUB_DIR=$WORKSPACE_DIR/github
RUN mkdir -p $GITHUB_DIR

# UCX
WORKDIR $GITHUB_DIR
RUN apt-get -y update && apt-get -y --no-install-recommends install libnuma-dev
ARG UCX_VERSION=1.9.0-rc3
ENV UCX_DIR=$WORKSPACE_DIR/ucx-$UCX_VERSION
RUN git clone https://github.com/openucx/ucx.git \
  && cd ucx \
  && git checkout v$UCX_VERSION \
  && ./autogen.sh \
  && mkdir build \
  && cd build \
  && ../contrib/configure-opt --prefix=$UCX_DIR --without-rocm --without-knem --without-cuda \
  && make -j"$(nproc)" \
  && make install

# OpenMPI
# note: require --enable-orterun-prefix-by-default for Azure machine learning compute
# note: disable verbs as we use ucx middleware and don't want btl openib warnings
WORKDIR $GITHUB_DIR
ARG OPENMPI_BASEVERSION=4.0
ARG OPENMPI_VERSION=${OPENMPI_BASEVERSION}.5
ENV OPENMPI_DIR=$WORKSPACE_DIR/openmpi-${OPENMPI_VERSION}
RUN git clone --recursive https://github.com/open-mpi/ompi.git \
  && cd ompi \
  && git checkout v$OPENMPI_VERSION \
  && ./autogen.pl \
  && mkdir build \
  && cd build \
  && ../configure --prefix=$OPENMPI_DIR --with-ucx=$UCX_DIR --without-verbs \
                  --enable-mpirun-prefix-by-default --enable-orterun-prefix-by-default \
                  --enable-mca-no-build=btl-uct --disable-mpi-fortran \
  && make -j"$(nproc)" \
  && make install \
  && ldconfig \
  && test -f ${OPENMPI_DIR}/bin/mpic++

ENV PATH=$OPENMPI_DIR/bin:${PATH}
ENV LD_LIBRARY_PATH=$OPENMPI_DIR/lib:${LD_LIBRARY_PATH}

# Create a wrapper for OpenMPI to allow running as root by default
RUN mv $OPENMPI_DIR/bin/mpirun $OPENMPI_DIR/bin/mpirun.real && \
    echo '#!/bin/bash' > $OPENMPI_DIR/bin/mpirun && \
    echo 'mpirun.real --allow-run-as-root "$@"' >> $OPENMPI_DIR/bin/mpirun && \
    chmod a+x $OPENMPI_DIR/bin/mpirun

# install mpi4py (be sure to link existing /opt/openmpi-xxx)
RUN CC=mpicc MPICC=mpicc pip install mpi4py --no-binary mpi4py

ARG CACHE_DATA=2020-12-06

# ONNX Runtime
WORKDIR $GITHUB_DIR
ENV ORT_DIR=$GITHUB_DIR/onnxruntime
RUN git clone --recursive https://github.com/microsoft/onnxruntime.git \
  && cd onnxruntime \
  && python3 tools/ci_build/build.py \
    --cmake_extra_defines ONNXRUNTIME_VERSION=`cat ./VERSION_NUMBER` \
    --build_dir build \
    --config RelWithDebInfo \
    --parallel \
    --skip_tests \
    --build_wheel \
    --use_rocm --rocm_home /opt/rocm \
    --mpi_home $OPENMPI_DIR \
    --nccl_home /opt/rocm \
    --enable_training \
  && test -f $ORT_DIR/build/RelWithDebInfo/onnxruntime_training_bert \
  && pip install $ORT_DIR/build/RelWithDebInfo/dist/*.whl \
  && ldconfig

# ONNX Runtime Training Examples
WORKDIR $GITHUB_DIR
ARG GPT2_DATASET=wikitext-103
RUN git clone -b wezhan/amdgpu https://github.com/microsoft/onnxruntime-training-examples.git \
  && cd onnxruntime-training-examples \
  # Nvidia BERT
  && git clone --no-checkout https://github.com/NVIDIA/DeepLearningExamples.git \
  && cd DeepLearningExamples \
  && git checkout cf54b787 \
  && cd .. \
  && mv DeepLearningExamples/PyTorch/LanguageModeling/BERT ${WORKSPACE_DIR} \
  && rm -rf DeepLearningExamples \
  && cp -r ./nvidia-bert/ort_addon/* ${WORKSPACE_DIR}/BERT \
  # GPT2 fine-tuning
  && cd huggingface-gpt2 \
  && git clone https://github.com/huggingface/transformers.git \
  && cd transformers \
  && git checkout 9a0a8c1c6f4f2f0c80ff07d36713a3ada785eec5 \
  && cd .. \
  && mkdir -p ${WORKSPACE_DIR}/GPT2 \
  && cp -r transformers ${WORKSPACE_DIR}/GPT2 \
  && cd ${WORKSPACE_DIR}/GPT2/transformers \
  && git apply $GITHUB_DIR/onnxruntime-training-examples/huggingface-gpt2/ort_addon/src_changes.patch \
  && cp -r $GITHUB_DIR/onnxruntime-training-examples/huggingface-gpt2/ort_addon/ort_supplement/* ./ \
  && python3 -m pip install --no-cache-dir -e . \
  && python3 -m pip install --no-cache-dir -r examples/requirements.txt \
  && python3 -m pip install cerberus sympy \
  && cd .. \
  && wget https://s3.amazonaws.com/research.metamind.io/wikitext/${GPT2_DATASET}-v1.zip \
  && unzip ${GPT2_DATASET}-v1.zip

ENV BERT_DIR=${WORKSPACE_DIR}/BERT
ENV GPT2_DIR=${WORKSPACE_DIR}/GPT2
ENV TRAIN_FILE=${WORKSPACE_DIR}/GPT2/${GPT2_DATASET}/wiki.train.tokens
ENV TEST_FILE=${WORKSPACE_DIR}/GPT2/${GPT2_DATASET}/wiki.test.tokens

# Enable ssh access without password needed
RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/g' /etc/ssh/sshd_config
RUN sed -i 's/#StrictModes yes/StrictModes no/g' /etc/ssh/sshd_config
RUN sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/g' /etc/ssh/sshd_config
RUN sed -i 's/#PermitEmptyPasswords no/PermitEmptyPasswords yes/g' /etc/ssh/sshd_config

# Start or Restart sshd service
ENTRYPOINT service ssh restart && /bin/bash

# Add model and scripts
ADD model ${WORKSPACE_DIR}/model
ADD script ${WORKSPACE_DIR}/script
RUN chmod a+x ${WORKSPACE_DIR}/script/run_bert.sh

# add locale en_US.UTF-8
RUN apt-get install -y locales
RUN locale-gen en_US.UTF-8

# Workaround an issue in AMD compiler which generates poor GPU ISA
# when the type of kernel parameter is a structure and “pass-by-value” is used
ENV HSA_NO_SCRATCH_RECLAIM=1

# Distributed training related environment variables
ENV HSA_FORCE_FINE_GRAIN_PCIE=1
ENV NCCL_DEBUG=INFO
ENV RCCL_ALLTOALL_KERNEL_DISABLE=1
# ENV NCCL_DEBUG_SUBSYS=INIT,COLL

WORKDIR ${WORKSPACE_DIR}/script