onnxruntime/orttraining/tools/amdgpu/Dockerfile.rocm3.7
Weixing Zhang aec4cb489e
ROCm EP for AMD GPU (#5480)
The ROCm EP is designed and implemented on top of ROCm, the AMD GPU software stack. For details about ROCm, see: https://rocmdocs.amd.com/en/latest/

ROCm EP was created based on the following things:
1. AMD GPU programming language: HIP
2. AMD GPU HIP language runtime: amdhip64
3. BLAS: rocBLAS, hipBLAS
4. DNN: miOpen
5. Collective Communication library: RCCL
6. cub: hipCub
7. …

Current status:
BERT-L and GPT2 training can be run on AMD GPUs with data parallelism.

Next:
1. Make more GPU code shareable between the ROCm EP and the CUDA EP, since the HIP language and HIP runtime API are very close to CUDA.
2. Continue improving the implementation.
3. Continue GPU kernel optimization.
4. Support model parallelism on ROCm EP.
……

The ROCm kernels have been removed from this commit and will be in a separate PR. Since the original PR was too big (~180 files), it was suggested to split it into two parts: one with the ROCm kernels and one with everything else.

Co-authored-by: Weixing Zhang <wezhan@microsoft.com>
Co-authored-by: sabreshao <sabre.shao@amd.com>
Co-authored-by: anghostcici <11013544+anghostcici@users.noreply.github.com>
Co-authored-by: Suffian Khan <sukha@microsoft.com>
Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com>
2020-10-29 17:13:04 -07:00

195 lines
7.2 KiB
Groff

# Build with:
#   docker build --network=host --file Dockerfile.rocm3.7 --tag ort:rocm3.7-ort-dev .
# Base image: the ROCm 3.7 / TensorFlow 2.1 development image.
FROM rocm/tensorflow:rocm3.7-tf2.1-dev

# Register the Bazel release signing key with apt.
RUN curl https://bazel.build/bazel-release.pub.gpg | apt-key add -

# Empty the ROCm apt source list, then point it at the ROCm 3.7 repository only.
RUN : > /etc/apt/sources.list.d/rocm.list \
    && echo 'deb [arch=amd64] http://repo.radeon.com/rocm/apt/3.7/ xenial main' | tee /etc/apt/sources.list.d/rocm.list
# Refresh the package index and install the base toolchain in ONE layer, so a
# cached index layer can never be combined with a changed package list.
# apt-utils is installed first, in its own apt-get call, as in the original.
# autoremove gets -y: without it apt-get stops at its confirmation prompt and
# aborts the non-interactive build whenever there is something to remove.
RUN apt-get -y update \
    && apt-get -y install apt-utils \
    && apt-get -y install \
        build-essential autotools-dev \
        make git curl vim wget rsync jq openssh-server openssh-client sudo \
        iputils-ping net-tools ethtool libcap2 \
        automake autoconf libtool flex doxygen \
        perl lsb-release iproute2 pciutils graphviz \
        bc tar git bash pbzip2 pv bzip2 cabextract \
        g++ gcc \
    && apt-get -y autoremove
# Make /bin/sh a symlink to bash, so later shell-form instructions run under bash.
RUN ln -sf /bin/bash /bin/sh
# Image metadata.
LABEL description="This docker sets up the environment to run ORT Training with AMD GPU"
# CMake: unpack the pinned upstream binary release under /usr/local and put
# its bin directory at the front of PATH.
ENV CMAKE_VERSION=3.18.2
RUN wget -q -O - https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz \
        | tar -xzf - -C /usr/local
ENV PATH=/usr/local/cmake-${CMAKE_VERSION}-Linux-x86_64/bin:${PATH}
# WORKSPACE_DIR: root directory for the source checkouts and locally built
# toolchains referenced by the rest of this file.
ENV WORKSPACE_DIR=/workspace
# WORKDIR creates the directory when it does not exist yet.
WORKDIR ${WORKSPACE_DIR}
# Infiniband setup: install the user-space part of Mellanox OFED.
# Note: openmpi installed under /usr/mpi/gcc/openmpi-4.0.4rc3 doesn't support multi-thread.
ENV MOFED_VERSION=5.1-0.6.6.0
ENV MOFED_OS=ubuntu18.04
# Kept in its own ENV: variables set in the same ENV instruction are not yet
# visible to each other during substitution.
ENV MOFED_FILENAME=MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64
# Download, install and delete the bundle in a single layer. Deleting it in a
# later layer would leave the unpacked tree stored in the image.
RUN curl -fSsL https://www.mellanox.com/downloads/ofed/MLNX_OFED-${MOFED_VERSION}/${MOFED_FILENAME}.tgz | tar -zxpf - \
    && cd ${MOFED_FILENAME} \
    && ./mlnxofedinstall --force --user-space-only --without-fw-update --hpc \
    && cd .. \
    && rm -r ${MOFED_FILENAME}
# Install Miniconda (comes with Python 3.7 by default) into /opt/conda.
ARG CONDA_VERSION=4.7.10
ARG CONDA_URL=https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-x86_64.sh
# The downloaded script is executed during the build, so it is fetched with
# TLS certificate verification enabled (no --insecure), like the other HTTPS
# downloads in this file. The installer is deleted in the same layer once it
# has run, so it does not stay in the image.
RUN curl -fSsL ${CONDA_URL} -o install-conda.sh \
    && /bin/bash ./install-conda.sh -b -p /opt/conda \
    && rm -f install-conda.sh \
    && /opt/conda/bin/conda clean -ya
ENV PATH=/opt/conda/bin:${PATH}
# Python packages: a pinned NumPy plus build helpers from conda, then the
# remaining libraries and a pinned ONNX from pip.
ARG NUMPY_VERSION=1.18.5
ARG ONNX_VERSION=1.7.0
RUN conda install -y \
        numpy=${NUMPY_VERSION} \
        cmake \
        ninja \
        pyyaml \
        cffi \
        setuptools \
    && pip install --no-cache-dir \
        wheel \
        tqdm \
        boto3 \
        requests \
        six \
        ipdb \
        h5py \
        html2text \
        nltk \
        progressbar \
        git+https://github.com/NVIDIA/dllogger \
        onnx=="${ONNX_VERSION}"
# GITHUB_DIR: directory that holds the source checkouts built in this file.
ENV GITHUB_DIR=${WORKSPACE_DIR}/github
# WORKDIR creates the directory when it does not exist yet.
WORKDIR ${GITHUB_DIR}

# UCX: built from the tagged source with ROCm, KNEM and CUDA support disabled,
# and installed under UCX_DIR.
RUN apt-get -y update \
    && apt-get -y --no-install-recommends install libnuma-dev
ARG UCX_VERSION=1.9.0-rc3
ENV UCX_DIR=${WORKSPACE_DIR}/ucx-${UCX_VERSION}
RUN git clone https://github.com/openucx/ucx.git \
    && git -C ucx checkout v${UCX_VERSION} \
    && cd ucx \
    && ./autogen.sh \
    && mkdir build \
    && cd build \
    && ../contrib/configure-opt --prefix=${UCX_DIR} --without-rocm --without-knem --without-cuda \
    && make -j"$(nproc)" \
    && make install
# OpenMPI: built from the tagged source against the UCX install in UCX_DIR.
# note: --enable-orterun-prefix-by-default is required for Azure machine learning compute
# note: verbs is disabled because UCX is the middleware and btl openib warnings are unwanted
WORKDIR ${GITHUB_DIR}
ARG OPENMPI_BASEVERSION=4.0
ARG OPENMPI_VERSION=${OPENMPI_BASEVERSION}.5
ENV OPENMPI_DIR=${WORKSPACE_DIR}/openmpi-${OPENMPI_VERSION}
# The trailing file check fails the build if the C++ compiler wrapper was not installed.
RUN git clone --recursive https://github.com/open-mpi/ompi.git \
    && cd ompi \
    && git checkout v${OPENMPI_VERSION} \
    && ./autogen.pl \
    && mkdir build \
    && cd build \
    && ../configure \
        --prefix=${OPENMPI_DIR} \
        --with-ucx=${UCX_DIR} \
        --without-verbs \
        --enable-mpirun-prefix-by-default \
        --enable-orterun-prefix-by-default \
        --enable-mca-no-build=btl-uct \
        --disable-mpi-fortran \
    && make -j"$(nproc)" \
    && make install \
    && ldconfig \
    && [ -f ${OPENMPI_DIR}/bin/mpic++ ]
# Both values only reference variables defined by earlier instructions, so
# they can share one ENV.
ENV PATH=${OPENMPI_DIR}/bin:${PATH} \
    LD_LIBRARY_PATH=${OPENMPI_DIR}/lib:${LD_LIBRARY_PATH}
# Replace mpirun with a two-line bash wrapper that forwards all arguments to
# the renamed real binary with --allow-run-as-root added.
RUN mv ${OPENMPI_DIR}/bin/mpirun ${OPENMPI_DIR}/bin/mpirun.real \
    && printf '%s\n' '#!/bin/bash' 'mpirun.real --allow-run-as-root "$@"' > ${OPENMPI_DIR}/bin/mpirun \
    && chmod a+x ${OPENMPI_DIR}/bin/mpirun
# Build mpi4py from source (no binary wheel) using the mpicc found on PATH.
RUN CC=mpicc MPICC=mpicc pip install --no-binary mpi4py mpi4py
# ONNX Runtime: clone the wezhan/amdgpu branch, build a Release wheel with
# ROCm, MPI and training enabled (tests skipped), then install the wheel.
WORKDIR ${GITHUB_DIR}
ENV ORT_DIR=${GITHUB_DIR}/onnxruntime
# The file check fails the build if the onnxruntime_training_bert binary was not produced.
RUN git clone --recursive -b wezhan/amdgpu https://github.com/microsoft/onnxruntime.git \
    && cd onnxruntime \
    && python3 tools/ci_build/build.py \
        --cmake_extra_defines ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) \
        --build_dir build \
        --config Release \
        --parallel \
        --skip_tests \
        --build_wheel \
        --use_rocm --rocm_home /opt/rocm \
        --mpi_home ${OPENMPI_DIR} \
        --nccl_home /opt/rocm \
        --enable_training \
    && [ -f ${ORT_DIR}/build/Release/onnxruntime_training_bert ] \
    && pip install ${ORT_DIR}/build/Release/dist/*.whl \
    && ldconfig
# ONNX Runtime training examples: move NVIDIA's PyTorch BERT sources (pinned
# to commit cf54b787) into the workspace, then copy the ORT add-on files from
# the examples repository into that BERT directory.
WORKDIR ${GITHUB_DIR}
RUN git clone -b wezhan/amdgpu https://github.com/microsoft/onnxruntime-training-examples.git \
    && cd onnxruntime-training-examples \
    && git clone --no-checkout https://github.com/NVIDIA/DeepLearningExamples.git \
    && git -C DeepLearningExamples checkout cf54b787 \
    && mv DeepLearningExamples/PyTorch/LanguageModeling/BERT/ ${WORKSPACE_DIR} \
    && rm -rf DeepLearningExamples \
    && cp -r ./nvidia-bert/ort_addon/* ${WORKSPACE_DIR}/BERT
ENV BERT_DIR=${WORKSPACE_DIR}/BERT
# OpenBLAS: built from the tagged source for the ZEN target and installed
# under OpenBLAS_DIR.
WORKDIR ${GITHUB_DIR}
ARG OpenBLAS_VERSION=0.3.10
ENV OpenBLAS_DIR=${WORKSPACE_DIR}/OpenBLAS-${OpenBLAS_VERSION}
RUN git clone https://github.com/xianyi/OpenBLAS.git \
    && cd OpenBLAS \
    && git checkout v${OpenBLAS_VERSION} \
    && make TARGET=ZEN \
    && make PREFIX=${OpenBLAS_DIR} install
# PyTorch prerequisites. The source build itself is commented out below.
RUN pip install pyyaml
# Rewrite find_dependency(hip) to find_dependency(HIP) in every *.cmake file
# under /opt/rocm/, keeping a '~'-suffixed backup of each edited file.
RUN for cmake_file in $(find /opt/rocm/ -name '*.cmake'); do \
        sed -i'~' 's/find_dependency(hip)/find_dependency(HIP)/' ${cmake_file}; \
    done
WORKDIR ${GITHUB_DIR}
# ARG PYTORCH_VERSION=1.6.0
# RUN git clone --recursive https://github.com/pytorch/pytorch.git \
# && cd pytorch \
# && git checkout v$PYTORCH_VERSION \
# && git submodule update --recursive \
# && python3 tools/amd_build/build_amd.py \
# && OpenBLAS_HOME=$OpenBLAS_DIR BLAS="OpenBLAS" RCCL_DIR=/opt/rocm/rccl/lib/cmake/rccl/ hip_DIR=/opt/rocm/hip/cmake/ PYTORCH_ROCM_ARCH=gfx906 USE_ROCM=ON USE_CUDA=OFF BUILD_CAFFE2_OPS=0 BUILD_TEST=0 python3 setup.py install
# Enable ssh access without a password.
# NOTE(review): this allows root login with empty passwords; confirm the image
# is only ever run on a trusted network.
RUN sed -i \
        -e 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/g' \
        -e 's/#StrictModes yes/StrictModes no/g' \
        -e 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/g' \
        -e 's/#PermitEmptyPasswords no/PermitEmptyPasswords yes/g' \
        /etc/ssh/sshd_config
# Start or restart the sshd service, then drop into bash. This is the explicit
# spelling of the "/bin/sh -c" invocation that the shell form produces.
ENTRYPOINT ["/bin/sh", "-c", "service ssh restart && /bin/bash"]
# Add model, scripts and the extra library from the build context.
ADD model $WORKSPACE_DIR/model
ADD script $WORKSPACE_DIR/script
RUN chmod a+x $WORKSPACE_DIR/script/run_bert.sh
ADD lib $WORKSPACE_DIR/lib
# LD_PRELOAD is the variable the dynamic linker reads. The previous spelling,
# LD_PERLOAD, is not a variable the linker knows, so libpsl.so was never preloaded.
# NOTE(review): this preloads libpsl.so into every dynamically linked process
# started in the container — confirm that is the intended scope.
ENV LD_PRELOAD=$WORKSPACE_DIR/lib/libpsl.so
ENV LOG_DIR=/data/wezhan/logs
# Containers start in the script directory.
WORKDIR $WORKSPACE_DIR/script