mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-17 21:10:43 +00:00
### Description This PR fixes the warning `LegacyKeyValueFormat: "ENV key=value" should be used instead of legacy "ENV key value" format` in all Dockerfiles. ### Motivation and Context <!-- - Why is this change required? What problem does it solve? - If it fixes an open issue, please link to the issue here. -->
170 lines
6.2 KiB
Text
170 lines
6.2 KiB
Text
# docker build --network=host --file Dockerfile.rocm4.3.1.pytorch --tag ort:rocm4.3.1-pytorch .
|
|
|
|
# Base image: rocm/pytorch; per its tag it provides ROCm 4.3.1, Ubuntu 18.04,
# Python 3.6 and PyTorch 1.9.0.
FROM rocm/pytorch:rocm4.3.1_ubuntu18.04_py3.6_pytorch_1.9.0
|
|
|
|
# Register the ROCm 4.3.1 apt repository.
# Refresh the package index in the same layer as the install so gpg-agent can
# be resolved even when the base image ships without (or with stale) apt lists.
RUN apt-get -y update && apt-get -y install gpg-agent
# Download the signing key over HTTPS into a file instead of piping wget into
# apt-key: a pipeline only reports the exit status of its last command, so a
# failed download would not by itself stop the build.
RUN wget -q -O /tmp/rocm.gpg.key https://repo.radeon.com/rocm/rocm.gpg.key && \
    apt-key add /tmp/rocm.gpg.key && \
    rm /tmp/rocm.gpg.key
RUN echo 'deb [arch=amd64] http://repo.radeon.com/rocm/apt/4.3.1 xenial main' | tee /etc/apt/sources.list.d/rocm.list
|
|
|
|
# Base build and system tooling.
# `apt-get update` runs in the same RUN as the installs so a cached, stale
# package index layer can never be paired with a newer install step.
# apt-utils is installed first, as in the original ordering, before the main
# package set. `autoremove` gets -y: without it apt-get asks for confirmation
# whenever it finds packages to remove, which cannot be answered during a build.
RUN apt-get -y update && \
    apt-get -y install apt-utils && \
    apt-get -y install \
        autoconf \
        automake \
        autotools-dev \
        bash \
        bc \
        build-essential \
        bzip2 \
        cabextract \
        curl \
        doxygen \
        ethtool \
        flex \
        g++ \
        gcc \
        git \
        graphviz \
        iproute2 \
        iputils-ping \
        jq \
        libcap2 \
        libtool \
        lsb-release \
        make \
        net-tools \
        openssh-client \
        openssh-server \
        pbzip2 \
        pciutils \
        perl \
        pv \
        rsync \
        sudo \
        tar \
        unzip \
        vim \
        wget && \
    apt-get -y autoremove
|
|
|
|
# Make /bin/sh point at bash, so later shell-form RUN steps execute under bash
RUN ln -sf /bin/bash /bin/sh
|
|
|
|
# Image metadata
LABEL "description"="This docker sets up the environment to run ORT Training with AMD GPU"
|
|
|
|
# CMake: install the pinned binary release under /usr/local and put it on PATH.
ENV CMAKE_VERSION=3.18.2
# Download to a file and extract with `tar -C` rather than `cd ... && wget | tar`:
# each step is checked by &&, so a failed download stops the build instead of
# being hidden behind the pipeline's last command. The archive is removed in
# the same layer so it does not add to the image size.
RUN wget -q -O /tmp/cmake.tar.gz https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz && \
    tar -xzf /tmp/cmake.tar.gz -C /usr/local && \
    rm /tmp/cmake.tar.gz
ENV PATH=/usr/local/cmake-${CMAKE_VERSION}-Linux-x86_64/bin:${PATH}
|
|
|
|
# Root directory for everything built in this image.
ENV WORKSPACE_DIR=/workspace
# WORKDIR creates the directory when it does not exist, so no separate
# `RUN mkdir -p` layer is needed.
WORKDIR ${WORKSPACE_DIR}
|
|
|
|
# Infiniband setup, openmpi installed under /usr/mpi/gcc/openmpi-4.0.4rc3 doesn't support multi-thread
ENV MOFED_VERSION=5.1-0.6.6.0
ENV MOFED_OS=ubuntu18.04
ENV MOFED_FILENAME=MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64
# /usr/bin is put first on PATH only for the duration of this RUN via `export`
# (presumably so the installer picks up the system tools -- confirm).
# An ENV cannot be removed by a later instruction: the previous
# `ENV OLD_PATH=...` / `ENV unset=OLD_PATH` pair left both OLD_PATH and a
# variable literally named `unset` in the final image. Scoping the override to
# the shell leaves the image-level PATH untouched and needs no restore step.
# Download, install and cleanup share one layer so the extracted installer
# tree is not kept in the image, and each step is checked by &&.
RUN export PATH=/usr/bin:${PATH} && \
    curl -fSsL -o ${MOFED_FILENAME}.tgz https://www.mellanox.com/downloads/ofed/MLNX_OFED-${MOFED_VERSION}/${MOFED_FILENAME}.tgz && \
    tar -zxpf ${MOFED_FILENAME}.tgz && \
    rm ${MOFED_FILENAME}.tgz && \
    cd ${MOFED_FILENAME} && \
    ./mlnxofedinstall --force --user-space-only --without-fw-update --hpc && \
    cd .. && \
    rm -r ${MOFED_FILENAME}
|
|
|
|
# python env
# --no-cache-dir on every pip call keeps pip's download cache out of the layers.
RUN pip3 install --no-cache-dir --upgrade setuptools
# Versions are build arguments so they can be overridden with --build-arg.
ARG NUMPY_VERSION=1.18.5
ARG ONNX_VERSION=1.10.2
RUN pip3 install --no-cache-dir \
        wheel \
        tqdm \
        boto3 \
        requests \
        six \
        ipdb \
        h5py \
        html2text \
        nltk \
        progressbar \
        pyyaml \
        git+https://github.com/NVIDIA/dllogger \
        numpy=="${NUMPY_VERSION}" \
        onnx=="${ONNX_VERSION}"
|
|
|
|
# Directory that holds the source checkouts built below
ENV GITHUB_DIR=${WORKSPACE_DIR}/github
RUN mkdir -p ${GITHUB_DIR}

# UCX: check out the pinned tag, build it in a scratch build/ directory and
# install it under $UCX_DIR (configured without ROCm, KNEM and CUDA support)
WORKDIR ${GITHUB_DIR}
RUN apt-get -y update && apt-get -y --no-install-recommends install libnuma-dev
ARG UCX_VERSION=1.9.0-rc3
ENV UCX_DIR=${WORKSPACE_DIR}/ucx-${UCX_VERSION}
RUN git clone https://github.com/openucx/ucx.git && \
    cd ucx && \
    git checkout v${UCX_VERSION} && \
    ./autogen.sh && \
    mkdir build && \
    cd build && \
    ../contrib/configure-opt --prefix=${UCX_DIR} --without-rocm --without-knem --without-cuda && \
    make -j"$(nproc)" && \
    make install && \
    cd .. && \
    rm -rf build
|
|
|
|
# OpenMPI, built from source against the UCX install in $UCX_DIR
# note: --enable-orterun-prefix-by-default is required for Azure machine learning compute
# note: verbs is disabled because ucx is used as middleware and btl openib warnings are unwanted
WORKDIR ${GITHUB_DIR}
ARG OPENMPI_BASEVERSION=4.0
ARG OPENMPI_VERSION=${OPENMPI_BASEVERSION}.5
ENV OPENMPI_DIR=${WORKSPACE_DIR}/openmpi-${OPENMPI_VERSION}
# The trailing `test -f` fails the build if the C++ compiler wrapper was not installed.
RUN git clone --recursive https://github.com/open-mpi/ompi.git && \
    cd ompi && \
    git checkout v${OPENMPI_VERSION} && \
    ./autogen.pl && \
    mkdir build && \
    cd build && \
    ../configure --prefix=${OPENMPI_DIR} --with-ucx=${UCX_DIR} --without-verbs \
        --enable-mpirun-prefix-by-default --enable-orterun-prefix-by-default \
        --enable-mca-no-build=btl-uct --disable-mpi-fortran && \
    make -j"$(nproc)" && \
    make install && \
    cd .. && \
    rm -rf build && \
    ldconfig && \
    test -f ${OPENMPI_DIR}/bin/mpic++

# Expose the freshly built OpenMPI binaries and libraries
ENV PATH=${OPENMPI_DIR}/bin:${PATH}
ENV LD_LIBRARY_PATH=${OPENMPI_DIR}/lib:${LD_LIBRARY_PATH}
|
|
|
|
# Replace mpirun with a two-line bash wrapper that forwards all arguments to
# the original binary (kept as mpirun.real) with --allow-run-as-root added
RUN mv ${OPENMPI_DIR}/bin/mpirun ${OPENMPI_DIR}/bin/mpirun.real && \
    printf '%s\n' '#!/bin/bash' 'mpirun.real --allow-run-as-root "$@"' > ${OPENMPI_DIR}/bin/mpirun && \
    chmod a+x ${OPENMPI_DIR}/bin/mpirun
|
|
|
|
# Build mpi4py from source (--no-binary) with the mpicc found on PATH, which at
# this point has $OPENMPI_DIR/bin first, so it links against the OpenMPI built above.
# --no-cache-dir keeps pip's cache out of the layer, matching the other pip steps.
RUN CC=mpicc MPICC=mpicc pip install --no-cache-dir mpi4py --no-binary mpi4py
|
|
|
|
# NOTE(review): presumably a cache-busting date -- passing a different value
# via --build-arg forces the RUN steps below to be rebuilt; confirm intent.
ARG CACHE_DATA=2021-10-25

# ONNX Runtime: clone the wezhan/tnlrv4 branch, build a Release training wheel
# with ROCm and MPI support, then install that wheel
WORKDIR ${GITHUB_DIR}
ENV ORT_DIR=${GITHUB_DIR}/onnxruntime
# The `test -f` step fails the build if the onnxruntime_training_bert binary
# was not produced.
RUN git clone -b wezhan/tnlrv4 --recursive https://github.com/microsoft/onnxruntime.git && \
    cd onnxruntime && \
    python3 tools/ci_build/build.py \
        --cmake_extra_defines ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) \
        --build_dir build \
        --config Release \
        --parallel \
        --skip_tests \
        --build_wheel \
        --use_rocm --rocm_version=4.3.1 --rocm_home /opt/rocm \
        --mpi_home ${OPENMPI_DIR} \
        --nccl_home /opt/rocm \
        --enable_training && \
    test -f ${ORT_DIR}/build/Release/onnxruntime_training_bert && \
    pip install ${ORT_DIR}/build/Release/dist/*.whl && \
    ldconfig
|
|
|
|
# Additional Python packages used by the training scripts.
# Every pip call passes --no-cache-dir so pip's download cache is not stored in
# the image layers (previously only the first of these three did).
RUN pip3 install --no-cache-dir GPUtil azureml azureml-core datasets tokenizers ninja cerberus sympy sacremoses sacrebleu

RUN pip install --no-cache-dir transformers==2.10.0 scikit-learn tensorboardX
# torch-ort is taken as a pre-release from the ONNX Runtime nightly index.
RUN pip install --no-cache-dir --pre torch-ort -f https://download.onnxruntime.ai/torch_ort_nightly.html
RUN python -m torch_ort.configure
|
|
|
|
# Allow password-less ssh: uncomment and override the relevant sshd defaults
# (root login allowed, strict modes off, public-key auth on, empty passwords allowed).
# NOTE(review): these settings are permissive -- confirm the image is only used
# on trusted networks.
RUN sed -i \
        -e 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/g' \
        -e 's/#StrictModes yes/StrictModes no/g' \
        -e 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/g' \
        -e 's/#PermitEmptyPasswords no/PermitEmptyPasswords yes/g' \
        /etc/ssh/sshd_config
|
|
|
|
# Start or restart the sshd service, then hand over to an interactive bash.
# Exec (JSON) form makes the shell invocation explicit, and `exec` makes bash
# replace the wrapper shell so that bash itself is the container's main process
# and receives signals directly.
ENTRYPOINT ["/bin/sh", "-c", "service ssh restart && exec /bin/bash"]
|
|
|
|
# Add model and scripts
# COPY instead of ADD: `script` is a plain local directory, so none of ADD's
# extra behaviours (archive extraction, remote URLs) are wanted here.
COPY script ${WORKSPACE_DIR}/script
RUN chmod a+x ${WORKSPACE_DIR}/script/run_bert.sh
|
|
|
|
# add locale en_US.UTF-8
# The package index is refreshed in the same layer as the install so this step
# does not depend on apt lists left behind by a (possibly cached) earlier layer.
RUN apt-get -y update && \
    apt-get install -y locales && \
    locale-gen en_US.UTF-8
|
|
|
|
# Disabled workaround for an AMD compiler issue that generates poor GPU ISA
# when a kernel parameter is a structure passed by value
# ENV HSA_NO_SCRATCH_RECLAIM=1

# Environment variables related to distributed training
ENV HSA_FORCE_FINE_GRAIN_PCIE="1"
# ENV NCCL_DEBUG=INFO
# ENV RCCL_ALLTOALL_KERNEL_DISABLE=1
# ENV NCCL_DEBUG_SUBSYS=INIT,COLL

# Containers start in the directory holding the training scripts
WORKDIR ${WORKSPACE_DIR}/script
|