mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-06 00:03:22 +00:00
The ROCm EP is designed and implemented on top of the AMD GPU software stack named ROCm. Details about ROCm: https://rocmdocs.amd.com/en/latest/ The ROCm EP was created based on the following components: 1. AMD GPU programming language: HIP 2. AMD GPU HIP language runtime: amdhip64 3. BLAS: rocBLAS, hipBLAS 4. DNN: MIOpen 5. Collective communication library: RCCL 6. cub: hipCUB 7. … Current status: BERT-L and GPT2 training can be run on AMD GPUs with data parallelism. Next: 1. Make more GPU code sharable between the ROCm EP and the CUDA EP, since the HIP language and HIP runtime API are very close to CUDA. 2. Continue improving the implementation. 3. Continue GPU kernel optimization. 4. Support model parallelism on the ROCm EP. …… The ROCm kernels have been removed from this commit and will be in a separate PR. Since the original PR was too big (~180 files), it was suggested to split the PR into two parts: one with the ROCm kernels, the other with the non-ROCm-kernel changes. Co-authored-by: Weixing Zhang <wezhan@microsoft.com> Co-authored-by: sabreshao <sabre.shao@amd.com> Co-authored-by: anghostcici <11013544+anghostcici@users.noreply.github.com> Co-authored-by: Suffian Khan <sukha@microsoft.com> Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com>
195 lines
7.2 KiB
Groff
195 lines
7.2 KiB
Groff
# docker build --network=host --file Dockerfile.rocm3.7 --tag ort:rocm3.7-ort-dev .
|
|
|
|
# Base image: rocm/tensorflow at tag rocm3.7-tf2.1-dev; every layer below is built on top of it.
FROM rocm/tensorflow:rocm3.7-tf2.1-dev
|
|
|
|
# Register the Bazel apt signing key.
# The key is downloaded to a file first (curl -f fails on HTTP errors) so a
# broken download aborts the build instead of being piped into apt-key.
RUN curl -fsSL https://bazel.build/bazel-release.pub.gpg -o /tmp/bazel-release.pub.gpg \
 && apt-key add /tmp/bazel-release.pub.gpg \
 && rm -f /tmp/bazel-release.pub.gpg

# Point apt at the ROCm 3.7 repository, replacing whatever rocm.list held before.
# The '>' redirect truncates the file, so no separate truncation layer is needed.
RUN echo 'deb [arch=amd64] http://repo.radeon.com/rocm/apt/3.7/ xenial main' > /etc/apt/sources.list.d/rocm.list
|
|
|
|
# System packages used to build and run the training stack.
# - 'update' and 'install' share one layer so a cached, stale package index can
#   never be paired with a newer install command.
# - apt-utils is installed first, matching the original ordering.
# - 'autoremove' is given -y: without it apt asks for confirmation, which cannot
#   be answered during a non-interactive build.
# - The package list is the original one, de-duplicated and sorted.
# - The apt lists are intentionally not deleted here; later install steps in
#   this file may still invoke apt (NOTE(review): confirm before cleaning up).
RUN apt-get -y update \
 && apt-get -y install apt-utils \
 && apt-get -y install \
      autoconf \
      automake \
      autotools-dev \
      bash \
      bc \
      build-essential \
      bzip2 \
      cabextract \
      curl \
      doxygen \
      ethtool \
      flex \
      g++ \
      gcc \
      git \
      graphviz \
      iproute2 \
      iputils-ping \
      jq \
      libcap2 \
      libtool \
      lsb-release \
      make \
      net-tools \
      openssh-client \
      openssh-server \
      pbzip2 \
      pciutils \
      perl \
      pv \
      rsync \
      sudo \
      tar \
      vim \
      wget \
 && apt-get -y autoremove
|
|
|
|
# Make /bin/sh resolve to bash, so every later shell-form RUN executes under bash.
RUN rm /bin/sh \
 && ln -s /bin/bash /bin/sh
|
|
|
|
# Image metadata: human-readable description of what this image is for.
LABEL description="This docker sets up the environment to run ORT Training with AMD GPU"
|
|
|
|
# CMake: install the official prebuilt release under /usr/local and put it on PATH.
# The archive is downloaded to a file and extracted with 'tar -C' rather than
# piped, so a failed download stops the build at the wget step and no 'cd' is
# needed. The archive is removed in the same layer.
ENV CMAKE_VERSION=3.18.2
RUN wget -q -O /tmp/cmake.tar.gz \
      "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz" \
 && tar -xzf /tmp/cmake.tar.gz -C /usr/local \
 && rm -f /tmp/cmake.tar.gz
ENV PATH=/usr/local/cmake-${CMAKE_VERSION}-Linux-x86_64/bin:${PATH}
|
|
|
|
# WORKSPACE_DIR: root directory for everything this image builds and ships.
# WORKDIR creates the directory when it does not exist, so no separate
# 'RUN mkdir -p' layer is required.
ENV WORKSPACE_DIR=/workspace
WORKDIR ${WORKSPACE_DIR}
|
|
|
|
# Infiniband setup, openmpi installed under /usr/mpi/gcc/openmpi-4.0.4rc3 doesn't support multi-thread
#
# Installs the user-space part of Mellanox OFED (no firmware update).
# Download, install and cleanup happen in ONE layer: deleting the extracted tree
# in a later layer would not shrink the image. The archive is saved to a file
# (curl -f) so a failed download aborts the build before anything is extracted.
ENV MOFED_VERSION=5.1-0.6.6.0
ENV MOFED_OS=ubuntu18.04
ENV MOFED_FILENAME=MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64
RUN curl -fSsL "https://www.mellanox.com/downloads/ofed/MLNX_OFED-${MOFED_VERSION}/${MOFED_FILENAME}.tgz" \
      -o "/tmp/${MOFED_FILENAME}.tgz" \
 && tar -zxpf "/tmp/${MOFED_FILENAME}.tgz" \
 && rm -f "/tmp/${MOFED_FILENAME}.tgz" \
 && cd "${MOFED_FILENAME}" \
 && ./mlnxofedinstall --force --user-space-only --without-fw-update --hpc \
 && cd .. \
 && rm -r "${MOFED_FILENAME}"
|
|
|
|
# Install Miniconda into /opt/conda (original note: comes with python 3.7 by default)
# and put it first on PATH.
# - TLS verification stays enabled (no --insecure): the installer is executed as
#   root, so its origin must be authenticated. The other HTTPS downloads in this
#   file already run with verification on.
# - The installer script is removed in the same layer once it has run.
ARG CONDA_VERSION=4.7.10
ARG CONDA_URL=https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-x86_64.sh
RUN curl -fSsL "${CONDA_URL}" -o /tmp/install-conda.sh \
 && /bin/bash /tmp/install-conda.sh -b -p /opt/conda \
 && rm -f /tmp/install-conda.sh \
 && /opt/conda/bin/conda clean -ya
ENV PATH=/opt/conda/bin:${PATH}
|
|
|
|
# Python dependencies: build tooling from conda, training utilities from pip.
# numpy and onnx are pinned through build arguments; the other packages are unpinned.
# 'conda clean -ya' runs in the same layer so the conda package cache is not
# baked into the image (same cleanup the Miniconda install step performs).
# NOTE(review): this also installs cmake into /opt/conda/bin, which precedes the
# standalone CMake directory on PATH - confirm which cmake is meant to be used.
ARG NUMPY_VERSION=1.18.5
ARG ONNX_VERSION=1.7.0
RUN conda install -y \
      numpy=${NUMPY_VERSION} \
      cmake \
      ninja \
      pyyaml \
      cffi \
      setuptools \
 && conda clean -ya \
 && pip install --no-cache-dir wheel tqdm boto3 requests six ipdb h5py html2text nltk progressbar \
      git+https://github.com/NVIDIA/dllogger \
      onnx=="${ONNX_VERSION}"
|
|
|
|
# GITHUB_DIR: directory under the workspace where source repositories are cloned.
ENV GITHUB_DIR=${WORKSPACE_DIR}/github
RUN mkdir -p "${GITHUB_DIR}"
|
|
|
|
# UCX: build the pinned release tag from source and install it into UCX_DIR.
# Configured without ROCm, KNEM and CUDA support.
WORKDIR $GITHUB_DIR
# libnuma-dev is installed for the UCX build. The apt lists are removed in the
# same layer so they do not add to the image size; no later step in this file
# runs apt-get.
RUN apt-get -y update \
 && apt-get -y --no-install-recommends install libnuma-dev \
 && rm -rf /var/lib/apt/lists/*
ARG UCX_VERSION=1.9.0-rc3
ENV UCX_DIR=$WORKSPACE_DIR/ucx-$UCX_VERSION
RUN git clone https://github.com/openucx/ucx.git \
 && cd ucx \
 && git checkout "v$UCX_VERSION" \
 && ./autogen.sh \
 && mkdir build \
 && cd build \
 && ../contrib/configure-opt --prefix="$UCX_DIR" --without-rocm --without-knem --without-cuda \
 && make -j"$(nproc)" \
 && make install
|
|
|
|
# OpenMPI: build the pinned release from source against the UCX install above.
# note: --enable-orterun-prefix-by-default is required for Azure machine learning compute
# note: verbs is disabled because UCX is the transport middleware and btl openib warnings are unwanted
WORKDIR ${GITHUB_DIR}
ARG OPENMPI_BASEVERSION=4.0
ARG OPENMPI_VERSION=${OPENMPI_BASEVERSION}.5
ENV OPENMPI_DIR=${WORKSPACE_DIR}/openmpi-${OPENMPI_VERSION}
# The final 'test -f' fails the build if the C++ compiler wrapper was not installed.
RUN git clone --recursive https://github.com/open-mpi/ompi.git && \
    cd ompi && \
    git checkout v${OPENMPI_VERSION} && \
    ./autogen.pl && \
    mkdir build && \
    cd build && \
    ../configure \
        --prefix=${OPENMPI_DIR} \
        --with-ucx=${UCX_DIR} \
        --without-verbs \
        --enable-mpirun-prefix-by-default \
        --enable-orterun-prefix-by-default \
        --enable-mca-no-build=btl-uct \
        --disable-mpi-fortran && \
    make -j"$(nproc)" && \
    make install && \
    ldconfig && \
    test -f ${OPENMPI_DIR}/bin/mpic++
|
|
|
|
# Expose the OpenMPI binaries and libraries to later build steps and to the running container.
ENV PATH=$OPENMPI_DIR/bin:${PATH}
# Append the previous LD_LIBRARY_PATH only when it is non-empty. A bare trailing
# ':' would add an empty entry, which the dynamic loader treats as the current
# directory.
ENV LD_LIBRARY_PATH=$OPENMPI_DIR/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
|
|
|
|
# Replace mpirun with a two-line bash wrapper that always passes
# --allow-run-as-root to the real launcher, which is kept as mpirun.real.
RUN mv "$OPENMPI_DIR/bin/mpirun" "$OPENMPI_DIR/bin/mpirun.real" \
 && printf '%s\n' \
      '#!/bin/bash' \
      'mpirun.real --allow-run-as-root "$@"' \
      > "$OPENMPI_DIR/bin/mpirun" \
 && chmod a+x "$OPENMPI_DIR/bin/mpirun"
|
|
|
|
# Install mpi4py from source (--no-binary) with mpicc as the compiler; mpicc is
# resolved through PATH, which has $OPENMPI_DIR/bin prepended above.
# --no-cache-dir keeps pip's download/build cache out of the image, matching the
# other pip install in this file.
RUN CC=mpicc MPICC=mpicc pip install --no-cache-dir mpi4py --no-binary mpi4py
|
|
|
|
# ONNX Runtime: clone the wezhan/amdgpu branch, build a Release training wheel
# with the ROCm execution provider, then install that wheel.
WORKDIR ${GITHUB_DIR}
ENV ORT_DIR=${GITHUB_DIR}/onnxruntime
# ONNXRUNTIME_VERSION is read from the repository's VERSION_NUMBER file.
# 'test -f' fails the build if the BERT training binary was not produced.
RUN git clone --recursive --branch wezhan/amdgpu https://github.com/microsoft/onnxruntime.git && \
    cd onnxruntime && \
    python3 tools/ci_build/build.py \
        --cmake_extra_defines ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) \
        --build_dir build \
        --config Release \
        --parallel \
        --skip_tests \
        --build_wheel \
        --use_rocm --rocm_home /opt/rocm \
        --mpi_home ${OPENMPI_DIR} \
        --nccl_home /opt/rocm \
        --enable_training && \
    test -f ${ORT_DIR}/build/Release/onnxruntime_training_bert && \
    pip install ${ORT_DIR}/build/Release/dist/*.whl && \
    ldconfig
|
|
|
|
# ONNX Runtime Training Examples: take the BERT sources from NVIDIA's
# DeepLearningExamples at a pinned commit, move them to ${WORKSPACE_DIR}/BERT,
# and copy the ORT add-on files from the examples repository over them.
WORKDIR ${GITHUB_DIR}
RUN git clone --branch wezhan/amdgpu https://github.com/microsoft/onnxruntime-training-examples.git && \
    cd onnxruntime-training-examples && \
    git clone --no-checkout https://github.com/NVIDIA/DeepLearningExamples.git && \
    git -C DeepLearningExamples checkout cf54b787 && \
    mv DeepLearningExamples/PyTorch/LanguageModeling/BERT/ ${WORKSPACE_DIR} && \
    rm -rf DeepLearningExamples && \
    cp -r ./nvidia-bert/ort_addon/* ${WORKSPACE_DIR}/BERT

# Location of the assembled BERT training sources.
ENV BERT_DIR=${WORKSPACE_DIR}/BERT
|
|
|
|
# OpenBLAS: build the pinned release tag for the ZEN target and install it into OpenBLAS_DIR.
WORKDIR ${GITHUB_DIR}
ARG OpenBLAS_VERSION=0.3.10
ENV OpenBLAS_DIR=${WORKSPACE_DIR}/OpenBLAS-${OpenBLAS_VERSION}
RUN git clone https://github.com/xianyi/OpenBLAS.git && \
    cd OpenBLAS && \
    git checkout v${OpenBLAS_VERSION} && \
    make TARGET=ZEN && \
    make install PREFIX=${OpenBLAS_DIR}
|
|
|
|
# PyTorch preparation (the PyTorch build itself is commented out below).
# --no-cache-dir keeps pip's cache out of the image.
RUN pip install --no-cache-dir pyyaml
# Rewrite 'find_dependency(hip)' to 'find_dependency(HIP)' in every *.cmake file
# under /opt/rocm; sed keeps a backup of each edited file with a '~' suffix.
# 'find -exec' passes the paths to sed directly, so file names are never
# word-split by the shell the way 'for fn in $(find ...)' splits them.
RUN find /opt/rocm/ -name '*.cmake' \
      -exec sed --in-place='~' 's/find_dependency(hip)/find_dependency(HIP)/' {} +
WORKDIR $GITHUB_DIR
|
|
# ARG PYTORCH_VERSION=1.6.0
|
|
# RUN git clone --recursive https://github.com/pytorch/pytorch.git \
|
|
# && cd pytorch \
|
|
# && git checkout v$PYTORCH_VERSION \
|
|
# && git submodule update --recursive \
|
|
# && python3 tools/amd_build/build_amd.py \
|
|
# && OpenBLAS_HOME=$OpenBLAS_DIR BLAS="OpenBLAS" RCCL_DIR=/opt/rocm/rccl/lib/cmake/rccl/ hip_DIR=/opt/rocm/hip/cmake/ PYTORCH_ROCM_ARCH=gfx906 USE_ROCM=ON USE_CUDA=OFF BUILD_CAFFE2_OPS=0 BUILD_TEST=0 python3 setup.py install
|
|
|
|
# Enable ssh access without password needed.
# One sed pass applies all four substitutions to sshd_config: root login allowed,
# StrictModes off, public-key authentication on, empty passwords permitted.
# NOTE(review): these settings allow root login with an empty password - confirm
# the image is only used on trusted, isolated networks.
RUN sed -i \
      -e 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/g' \
      -e 's/#StrictModes yes/StrictModes no/g' \
      -e 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/g' \
      -e 's/#PermitEmptyPasswords no/PermitEmptyPasswords yes/g' \
      /etc/ssh/sshd_config
|
|
|
|
# Start or restart the sshd service, then hand the container over to an interactive bash.
# Exec (JSON-array) form names the interpreter explicitly instead of relying on
# the implicit '/bin/sh -c' wrapper; 'exec' replaces that wrapper shell with the
# final bash so no extra shell process stays in between.
ENTRYPOINT ["/bin/bash", "-c", "service ssh restart && exec /bin/bash"]
|
|
|
|
# Add model, scripts and the extra library from the build context.
# COPY is used instead of ADD: these are plain local paths, and COPY never
# auto-extracts archives or fetches URLs the way ADD can.
COPY model $WORKSPACE_DIR/model
COPY script $WORKSPACE_DIR/script
# Make the BERT launcher script executable for all users.
RUN chmod a+x $WORKSPACE_DIR/script/run_bert.sh

COPY lib $WORKSPACE_DIR/lib
|
|
# Preload libpsl.so into processes started in the container.
# The variable was previously spelled LD_PERLOAD, which the dynamic loader does
# not recognise, so the library was never preloaded.
ENV LD_PRELOAD=$WORKSPACE_DIR/lib/libpsl.so
|
|
|
|
# Directory exported to the container as LOG_DIR.
ENV LOG_DIR=/data/wezhan/logs

# Containers start in the scripts directory.
WORKDIR ${WORKSPACE_DIR}/script
|