mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-18 21:21:17 +00:00
### Description 1. Renames all references of on-device training to training apis. This is to keep the naming general. Nothing really prevents us from using the same apis on servers/non-edge devices. 2. Update ENABLE_TRAINING option: With this PR, when this option is enabled, training apis and torch interop are also enabled. 3. Refactoring for onnxruntime_ENABLE_TRAINING_TORCH_INTEROP option: - Removed user facing option - Setting onnxruntime_ENABLE_TRAINING_TORCH_INTEROP to ON when onnxruntime_ENABLE_TRAINING is ON as we always build with torch interop. Once this PR is merged, when --enable_training is selected we will do a "FULL Build" for training (with all the training entry points and features). Training entry points include: 1. ORTModule 2. Training APIs Features include: 1. ATen Fallback 2. All Training OPs, including communication and collectives 3. Strided Tensor Support 4. Python Op (torch interop) 5. ONNXBlock (Front end tools for training artifacts prep when using training apis) ### Motivation and Context Intention is to simplify the options for building training enabled builds. This is part of the larger work item to create a dedicated build for learning on the edge scenarios with just training apis enabled.
170 lines
6.2 KiB
Text
170 lines
6.2 KiB
Text
# docker build --network=host --file Dockerfile.rocm4.3.1.pytorch --tag ort:rocm4.3.1-pytorch .
|
|
|
|
# Base image; per its tag it bundles ROCm 4.3.1, Ubuntu 18.04, Python 3.6 and PyTorch 1.9.0.
FROM rocm/pytorch:rocm4.3.1_ubuntu18.04_py3.6_pytorch_1.9.0
|
|
|
|
# Register the ROCm 4.3.1 apt repository.
# The package index is refreshed in the same layer as each install so the step
# never depends on apt lists inherited from the base image or a cached layer.
RUN apt-get -y update && apt-get -y install gpg-agent
# Download the repository signing key over HTTPS into a file (instead of piping
# it into apt-key) so a failed download stops the build at the wget step.
RUN wget -q -O /tmp/rocm.gpg.key https://repo.radeon.com/rocm/rocm.gpg.key \
 && apt-key add /tmp/rocm.gpg.key \
 && rm -f /tmp/rocm.gpg.key
RUN echo 'deb [arch=amd64] http://repo.radeon.com/rocm/apt/4.3.1 xenial main' | tee /etc/apt/sources.list.d/rocm.list
# Refresh again so packages from the newly added repository become visible.
RUN apt-get -y update && apt-get -y install apt-utils
|
|
# Build toolchain and general utilities used by the later build steps.
# - `apt-get update` runs in this layer so the install never uses a stale index.
# - DEBIAN_FRONTEND is set inline only, so it does not leak into the runtime env.
# - `autoremove` gets -y: without it apt-get asks for confirmation and aborts
#   when there is no terminal to answer.
# Packages are listed one per line, sorted, with the duplicate `git` removed.
RUN apt-get -y update \
 && DEBIAN_FRONTEND=noninteractive apt-get -y install \
      autoconf \
      automake \
      autotools-dev \
      bash \
      bc \
      build-essential \
      bzip2 \
      cabextract \
      curl \
      doxygen \
      ethtool \
      flex \
      g++ \
      gcc \
      git \
      graphviz \
      iproute2 \
      iputils-ping \
      jq \
      libcap2 \
      libtool \
      lsb-release \
      make \
      net-tools \
      openssh-client \
      openssh-server \
      pbzip2 \
      pciutils \
      perl \
      pv \
      rsync \
      sudo \
      tar \
      unzip \
      vim \
      wget \
 && apt-get -y autoremove
|
|
|
|
# Make /bin/sh resolve to bash for every later shell-form instruction.
RUN ln -sf /bin/bash /bin/sh

# Image metadata
LABEL description="This docker sets up the environment to run ORT Training with AMD GPU"
|
|
|
|
# CMake: stream the prebuilt Linux x86_64 release tarball straight into
# /usr/local, then expose its bin directory on PATH.
ENV CMAKE_VERSION=3.18.2
RUN wget -q -O - "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz" \
    | tar -zxf - -C /usr/local
ENV PATH=/usr/local/cmake-${CMAKE_VERSION}-Linux-x86_64/bin:${PATH}
|
|
|
|
# Root directory for everything built in this image.
# WORKDIR creates the directory when it does not exist yet.
ENV WORKSPACE_DIR=/workspace
WORKDIR ${WORKSPACE_DIR}
|
|
|
|
# Infiniband setup, openmpi installed under /usr/mpi/gcc/openmpi-4.0.4rc3 doesn't support multi-thread
ENV MOFED_VERSION=5.1-0.6.6.0
ENV MOFED_OS=ubuntu18.04
ENV MOFED_FILENAME=MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64
# Download, install and clean up in ONE layer so neither the tarball nor the
# extracted installer stays in the image.
# /usr/bin is put first on PATH only for the duration of this RUN. The earlier
# approach saved PATH in an OLD_PATH image variable and then ran
# `ENV unset OLD_PATH`, which does not unset anything: it defines a variable
# named `unset`. Scoping the override to the shell avoids both leftovers.
RUN export PATH=/usr/bin:${PATH} \
 && curl -fSsL -o "/tmp/${MOFED_FILENAME}.tgz" \
      "https://www.mellanox.com/downloads/ofed/MLNX_OFED-${MOFED_VERSION}/${MOFED_FILENAME}.tgz" \
 && tar -zxpf "/tmp/${MOFED_FILENAME}.tgz" \
 && rm -f "/tmp/${MOFED_FILENAME}.tgz" \
 && cd "${MOFED_FILENAME}" \
 && ./mlnxofedinstall --force --user-space-only --without-fw-update --hpc \
 && cd .. \
 && rm -r "${MOFED_FILENAME}"
|
|
|
|
# Python packages needed by the training scripts.
# The numpy and onnx versions can be overridden with --build-arg.
RUN pip3 install --upgrade setuptools
ARG NUMPY_VERSION=1.18.5
ARG ONNX_VERSION=1.10.2
RUN pip3 install --no-cache-dir \
      wheel \
      tqdm \
      boto3 \
      requests \
      six \
      ipdb \
      h5py \
      html2text \
      nltk \
      progressbar \
      pyyaml \
      git+https://github.com/NVIDIA/dllogger \
      "numpy==${NUMPY_VERSION}" \
      "onnx==${ONNX_VERSION}"
|
|
|
|
# Directory that receives the source checkouts built below.
ENV GITHUB_DIR=${WORKSPACE_DIR}/github
RUN mkdir -p "${GITHUB_DIR}"
|
|
|
|
# UCX: built from the tagged source and installed under ${UCX_DIR},
# configured without ROCm, KNEM and CUDA support.
WORKDIR ${GITHUB_DIR}
RUN apt-get -y update && apt-get -y --no-install-recommends install libnuma-dev
ARG UCX_VERSION=1.9.0-rc3
ENV UCX_DIR=${WORKSPACE_DIR}/ucx-${UCX_VERSION}
RUN git clone https://github.com/openucx/ucx.git && \
    git -C ucx checkout "v${UCX_VERSION}" && \
    cd ucx && \
    ./autogen.sh && \
    mkdir build && \
    cd build && \
    ../contrib/configure-opt --prefix="${UCX_DIR}" --without-rocm --without-knem --without-cuda && \
    make -j"$(nproc)" && \
    make install && \
    cd .. && \
    rm -rf build
|
|
|
|
# OpenMPI, built from the tagged source against the UCX install above.
# note: --enable-orterun-prefix-by-default is required for Azure machine learning compute
# note: verbs is disabled because ucx is the middleware and btl openib warnings are unwanted
WORKDIR ${GITHUB_DIR}
ARG OPENMPI_BASEVERSION=4.0
ARG OPENMPI_VERSION=${OPENMPI_BASEVERSION}.5
ENV OPENMPI_DIR=${WORKSPACE_DIR}/openmpi-${OPENMPI_VERSION}
# The final `test -f` fails the build when the C++ compiler wrapper was not installed.
RUN git clone --recursive https://github.com/open-mpi/ompi.git && \
    git -C ompi checkout "v${OPENMPI_VERSION}" && \
    cd ompi && \
    ./autogen.pl && \
    mkdir build && \
    cd build && \
    ../configure \
        --prefix="${OPENMPI_DIR}" \
        --with-ucx="${UCX_DIR}" \
        --without-verbs \
        --enable-mpirun-prefix-by-default \
        --enable-orterun-prefix-by-default \
        --enable-mca-no-build=btl-uct \
        --disable-mpi-fortran && \
    make -j"$(nproc)" && \
    make install && \
    cd .. && \
    rm -rf build && \
    ldconfig && \
    test -f "${OPENMPI_DIR}/bin/mpic++"
|
|
|
|
# Expose the OpenMPI binaries and libraries.
# The ':+' form appends the previous LD_LIBRARY_PATH only when it is set.
# Plain "$OPENMPI_DIR/lib:${LD_LIBRARY_PATH}" would end in ':' when the variable
# is unset, and the dynamic loader treats an empty entry as the current directory.
ENV PATH=${OPENMPI_DIR}/bin:${PATH}
ENV LD_LIBRARY_PATH=${OPENMPI_DIR}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
|
|
|
|
# Replace mpirun with a wrapper that always passes --allow-run-as-root.
# The original launcher is kept as mpirun.real and resolved through PATH.
# The wrapper uses `exec` so the real launcher replaces the wrapper shell:
# signals sent to `mpirun` reach it directly and no extra bash process stays
# around for the lifetime of the job.
RUN mv "${OPENMPI_DIR}/bin/mpirun" "${OPENMPI_DIR}/bin/mpirun.real" && \
    printf '%s\n' '#!/bin/bash' 'exec mpirun.real --allow-run-as-root "$@"' > "${OPENMPI_DIR}/bin/mpirun" && \
    chmod a+x "${OPENMPI_DIR}/bin/mpirun"
|
|
|
|
# Build mpi4py from source (no prebuilt wheel) with the mpicc compiler wrapper,
# so it links against an existing MPI installation.
RUN CC=mpicc MPICC=mpicc pip install --no-binary mpi4py mpi4py
|
|
|
|
# NOTE(review): CACHE_DATA is not referenced anywhere in the visible file;
# presumably it is changed via --build-arg to force the steps below to re-run - confirm.
ARG CACHE_DATA=2021-10-25

# ONNX Runtime: clone the wezhan/tnlrv4 branch, build a Release wheel with ROCm,
# MPI, NCCL and training enabled (tests skipped), then install the wheel.
WORKDIR ${GITHUB_DIR}
ENV ORT_DIR=${GITHUB_DIR}/onnxruntime
# `test -f` fails the build when the onnxruntime_training_bert binary is missing.
RUN git clone -b wezhan/tnlrv4 --recursive https://github.com/microsoft/onnxruntime.git && \
    cd onnxruntime && \
    python3 tools/ci_build/build.py \
        --cmake_extra_defines ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) \
        --build_dir build \
        --config Release \
        --parallel \
        --skip_tests \
        --build_wheel \
        --use_rocm --rocm_version=4.3.1 --rocm_home /opt/rocm \
        --mpi_home ${OPENMPI_DIR} \
        --nccl_home /opt/rocm \
        --enable_training && \
    test -f ${ORT_DIR}/build/Release/onnxruntime_training_bert && \
    pip install ${ORT_DIR}/build/Release/dist/*.whl && \
    ldconfig
|
|
|
|
# Additional Python packages for the training scripts.
RUN pip3 install --no-cache-dir \
      GPUtil \
      azureml \
      azureml-core \
      datasets \
      tokenizers \
      ninja \
      cerberus \
      sympy \
      sacremoses \
      sacrebleu

RUN pip install \
      transformers==2.10.0 \
      scikit-learn \
      tensorboardX

# torch-ort pre-release from the nightly index, followed by its configure step.
RUN pip install --pre torch-ort \
      -f https://download.onnxruntime.ai/torch_ort_nightly.html
RUN python -m torch_ort.configure
|
|
|
|
# Enable ssh access without password needed.
# All four sshd_config substitutions run in one sed pass: root login allowed,
# StrictModes off, public-key authentication on, empty passwords permitted.
RUN sed -i \
      -e 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/g' \
      -e 's/#StrictModes yes/StrictModes no/g' \
      -e 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/g' \
      -e 's/#PermitEmptyPasswords no/PermitEmptyPasswords yes/g' \
      /etc/ssh/sshd_config
|
|
|
|
# Start or restart the sshd service, then hand the container over to bash.
# Exec (JSON) form makes the invoked shell explicit, and `exec` lets bash
# replace that shell instead of running as its child.
ENTRYPOINT ["/bin/sh", "-c", "service ssh restart && exec /bin/bash"]
|
|
|
|
# Add model and scripts.
# COPY is used rather than ADD: this is a plain copy of a local directory and
# needs none of ADD's extra behaviors (archive extraction, URL fetching).
COPY script ${WORKSPACE_DIR}/script
RUN chmod a+x ${WORKSPACE_DIR}/script/run_bert.sh
|
|
|
|
# Add locale en_US.UTF-8.
# The package index is refreshed in the same layer as the install so this step
# does not depend on apt lists left behind by an earlier (possibly cached) layer.
RUN apt-get update \
 && apt-get install -y locales \
 && locale-gen en_US.UTF-8
|
|
|
|
# Workaround (currently disabled) for an AMD compiler issue that generates poor
# GPU ISA when a kernel parameter is a structure passed by value:
# ENV HSA_NO_SCRATCH_RECLAIM=1

# Environment variables related to distributed training.
# The commented-out entries are optional knobs left disabled.
ENV HSA_FORCE_FINE_GRAIN_PCIE=1
# ENV NCCL_DEBUG=INFO
# ENV RCCL_ALLTOALL_KERNEL_DISABLE=1
# ENV NCCL_DEBUG_SUBSYS=INIT,COLL

# Containers start in the scripts directory.
WORKDIR ${WORKSPACE_DIR}/script
|