onnxruntime/orttraining/tools/amdgpu/Dockerfile.rocm4.3.1.pytorch
Ashwini Khade 68b5b2d7d3
Refactor training build options (#13964)
### Description
1. Renames all references of on-device training to training APIs. This
is to keep the naming general. Nothing really prevents us from using the
same APIs on servers/non-edge devices.
2. Update ENABLE_TRAINING option: With this PR, when this option is
enabled, training APIs and torch interop are also enabled.
3. Refactoring for onnxruntime_ENABLE_TRAINING_TORCH_INTEROP option: 
   -  Removed user facing option
- Setting onnxruntime_ENABLE_TRAINING_TORCH_INTEROP to ON when
onnxruntime_ENABLE_TRAINING is ON as we always build with torch interop.

Once this PR is merged when --enable_training is selected we will do a
"FULL Build" for training (with all the training entry points and
features).
Training entry points include:
1. ORTModule
2. Training APIs

Features include:
1. ATen Fallback
2. All Training OPs includes communication and collectives
3. Strided Tensor Support
4. Python Op (torch interop)
5. ONNXBlock (Front end tools for training artifacts prep when using
training APIs)

### Motivation and Context
The intention is to simplify the options for building training-enabled builds.
This is part of the larger work item to create dedicated build for
learning on the edge scenarios with just training apis enabled.
2023-01-03 13:28:16 -08:00

170 lines
6.2 KiB
Text

# Build with:
#   docker build --network=host --file Dockerfile.rocm4.3.1.pytorch --tag ort:rocm4.3.1-pytorch .
FROM rocm/pytorch:rocm4.3.1_ubuntu18.04_py3.6_pytorch_1.9.0

# gpg-agent is installed from the package index that ships with the base
# image (no `apt-get update` yet); presumably it is needed by apt-key below.
RUN apt-get -y install gpg-agent

# Register the ROCm 4.3.1 apt repository and its signing key.
# NOTE(review): the key is fetched over plain HTTP - consider switching to HTTPS.
RUN wget -q -O - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
RUN echo 'deb [arch=amd64] http://repo.radeon.com/rocm/apt/4.3.1 xenial main' | tee /etc/apt/sources.list.d/rocm.list

# Refresh the index and install in ONE layer, so a cached `apt-get update`
# layer can never be paired with a newer package list.
# apt-utils is installed first, in its own call, as before.
# `autoremove` gets -y: without it apt-get aborts in a non-interactive build
# as soon as there is anything to remove.
# The apt lists are intentionally kept, because later steps in this file
# install further packages.
RUN apt-get -y update \
 && apt-get -y install apt-utils \
 && apt-get -y install \
        autoconf \
        automake \
        autotools-dev \
        bash \
        bc \
        build-essential \
        bzip2 \
        cabextract \
        curl \
        doxygen \
        ethtool \
        flex \
        g++ \
        gcc \
        git \
        graphviz \
        iproute2 \
        iputils-ping \
        jq \
        libcap2 \
        libtool \
        lsb-release \
        make \
        net-tools \
        openssh-client \
        openssh-server \
        pbzip2 \
        pciutils \
        perl \
        pv \
        rsync \
        sudo \
        tar \
        unzip \
        vim \
        wget \
 && apt-get -y autoremove
# Point /bin/sh at bash, so the shell-form RUN steps that follow are run by bash
RUN ln -sf /bin/bash /bin/sh
# Image metadata
LABEL description="This docker sets up the environment to run ORT Training with AMD GPU"
# CMake: unpack the prebuilt Linux x86_64 release under /usr/local and put
# its bin directory at the front of PATH
ENV CMAKE_VERSION=3.18.2
RUN wget -q -O - https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz \
    | tar -C /usr/local -zxf -
ENV PATH=/usr/local/cmake-${CMAKE_VERSION}-Linux-x86_64/bin:${PATH}
# Root directory for everything this image builds and installs.
# WORKDIR creates the directory if it does not exist yet.
ENV WORKSPACE_DIR=/workspace
WORKDIR ${WORKSPACE_DIR}
# Infiniband setup, openmpi installed under /usr/mpi/gcc/openmpi-4.0.4rc3 doesn't support multi-thread
ENV MOFED_VERSION=5.1-0.6.6.0
ENV MOFED_OS=ubuntu18.04
ENV MOFED_FILENAME=MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64
# /usr/bin is put first on PATH only for the duration of this step.
# A Dockerfile cannot unset an ENV variable: the former `ENV unset OLD_PATH`
# actually defined a variable named "unset". The save/restore through
# OLD_PATH is therefore replaced by a shell-local export, which leaves the
# image PATH untouched.
# Download, install and clean-up share one layer, so the unpacked installer
# is not kept in the image.
RUN export PATH=/usr/bin:${PATH} && \
    curl -fSsL https://www.mellanox.com/downloads/ofed/MLNX_OFED-${MOFED_VERSION}/${MOFED_FILENAME}.tgz | tar -zxpf - && \
    cd ${MOFED_FILENAME} && \
    ./mlnxofedinstall --force --user-space-only --without-fw-update --hpc && \
    cd .. && \
    rm -r ${MOFED_FILENAME}
# python env
# --no-cache-dir is passed to every pip call, so no download cache is kept
# in the image layers.
RUN pip3 install --no-cache-dir --upgrade setuptools
# Versions can be overridden with --build-arg.
ARG NUMPY_VERSION=1.18.5
ARG ONNX_VERSION=1.10.2
RUN pip3 install --no-cache-dir \
        wheel \
        tqdm \
        boto3 \
        requests \
        six \
        ipdb \
        h5py \
        html2text \
        nltk \
        progressbar \
        pyyaml \
        git+https://github.com/NVIDIA/dllogger \
        numpy==${NUMPY_VERSION} \
        onnx=="${ONNX_VERSION}"
# Directory that receives the source checkouts built below
# (created by the WORKDIR instruction if it does not exist)
ENV GITHUB_DIR=${WORKSPACE_DIR}/github
# UCX: built from the v${UCX_VERSION} tag and installed into ${UCX_DIR},
# configured without ROCm, KNEM and CUDA support
WORKDIR ${GITHUB_DIR}
RUN apt-get update -y \
 && apt-get install -y --no-install-recommends libnuma-dev
ARG UCX_VERSION=1.9.0-rc3
ENV UCX_DIR=${WORKSPACE_DIR}/ucx-${UCX_VERSION}
RUN git clone https://github.com/openucx/ucx.git \
 && cd ucx \
 && git checkout v${UCX_VERSION} \
 && ./autogen.sh \
 && mkdir build \
 && ( cd build \
      && ../contrib/configure-opt --prefix=${UCX_DIR} --without-rocm --without-knem --without-cuda \
      && make -j"$(nproc)" \
      && make install ) \
 && rm -rf build
# OpenMPI: built from the v${OPENMPI_VERSION} tag against the UCX in ${UCX_DIR}
# note: --enable-orterun-prefix-by-default is required for Azure machine learning compute
# note: verbs is disabled because ucx is the middleware and btl openib warnings are unwanted
WORKDIR ${GITHUB_DIR}
ARG OPENMPI_BASEVERSION=4.0
ARG OPENMPI_VERSION=${OPENMPI_BASEVERSION}.5
ENV OPENMPI_DIR=${WORKSPACE_DIR}/openmpi-${OPENMPI_VERSION}
RUN git clone --recursive https://github.com/open-mpi/ompi.git \
 && cd ompi \
 && git checkout v${OPENMPI_VERSION} \
 && ./autogen.pl \
 && mkdir build \
 && ( cd build \
      && ../configure \
            --prefix=${OPENMPI_DIR} \
            --with-ucx=${UCX_DIR} \
            --without-verbs \
            --enable-mpirun-prefix-by-default \
            --enable-orterun-prefix-by-default \
            --enable-mca-no-build=btl-uct \
            --disable-mpi-fortran \
      && make -j"$(nproc)" \
      && make install ) \
 && rm -rf build \
 && ldconfig \
 && test -f ${OPENMPI_DIR}/bin/mpic++
# Expose the freshly installed OpenMPI binaries and libraries
ENV PATH=${OPENMPI_DIR}/bin:${PATH}
ENV LD_LIBRARY_PATH=${OPENMPI_DIR}/lib:${LD_LIBRARY_PATH}
# Create a wrapper for OpenMPI to allow running as root by default.
# The original launcher is renamed to mpirun.real, and a small bash script
# takes its place that always passes --allow-run-as-root.
# The script exec's the real launcher, so no extra bash process stays
# between the caller and mpirun.real, and signals sent to "mpirun" reach the
# launcher itself.
RUN mv $OPENMPI_DIR/bin/mpirun $OPENMPI_DIR/bin/mpirun.real && \
    printf '%s\n' \
        '#!/bin/bash' \
        'exec mpirun.real --allow-run-as-root "$@"' \
        > $OPENMPI_DIR/bin/mpirun && \
    chmod a+x $OPENMPI_DIR/bin/mpirun
# mpi4py is compiled from source (--no-binary) with mpicc as the compiler;
# mpicc is looked up on PATH, which has ${OPENMPI_DIR}/bin prepended earlier in this file
RUN CC=mpicc MPICC=mpicc \
    pip install --no-binary mpi4py mpi4py
# NOTE(review): CACHE_DATA is not referenced by any visible instruction;
# presumably it is a cache-busting knob for the steps below - confirm.
ARG CACHE_DATA=2021-10-25
# ONNX Runtime: clone the wezhan/tnlrv4 branch, build a Release wheel with
# ROCm 4.3.1 and training enabled, check that the onnxruntime_training_bert
# binary was produced, then install the wheel
WORKDIR ${GITHUB_DIR}
ENV ORT_DIR=${GITHUB_DIR}/onnxruntime
RUN git clone -b wezhan/tnlrv4 --recursive https://github.com/microsoft/onnxruntime.git \
 && cd onnxruntime \
 && python3 tools/ci_build/build.py \
        --cmake_extra_defines ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) \
        --build_dir build \
        --config Release \
        --parallel \
        --skip_tests \
        --build_wheel \
        --use_rocm --rocm_version=4.3.1 --rocm_home /opt/rocm \
        --mpi_home ${OPENMPI_DIR} \
        --nccl_home /opt/rocm \
        --enable_training \
 && test -f ${ORT_DIR}/build/Release/onnxruntime_training_bert \
 && pip install ${ORT_DIR}/build/Release/dist/*.whl \
 && ldconfig
# Additional Python packages for the training scripts.
# --no-cache-dir is passed to every pip call, so no download cache is kept
# in the image layers.
RUN pip3 install --no-cache-dir GPUtil azureml azureml-core datasets tokenizers ninja cerberus sympy sacremoses sacrebleu
RUN pip install --no-cache-dir transformers==2.10.0 scikit-learn tensorboardX
# torch-ort pre-release from the nightly index, followed by its configure step
RUN pip install --no-cache-dir --pre torch-ort -f https://download.onnxruntime.ai/torch_ort_nightly.html
RUN python -m torch_ort.configure
# Enable ssh access without password needed: allow root login, turn
# StrictModes off, enable public-key authentication and permit empty
# passwords. All four substitutions are applied in a single sed pass (one
# layer instead of four); each only matches the commented-out default line.
RUN sed -i \
        -e 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/g' \
        -e 's/#StrictModes yes/StrictModes no/g' \
        -e 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/g' \
        -e 's/#PermitEmptyPasswords no/PermitEmptyPasswords yes/g' \
        /etc/ssh/sshd_config
# Start or Restart sshd service, then hand the container over to bash.
# Exec (JSON) form with an explicit `exec`, so bash replaces the wrapper
# shell instead of running as its child. As with the previous shell form,
# arguments given to `docker run` are not used by the command.
ENTRYPOINT ["/bin/sh", "-c", "service ssh restart && exec /bin/bash"]
# Add model and scripts.
# COPY instead of ADD: this is a plain local directory, so none of ADD's
# extra behaviors (archive extraction, URL fetch) are wanted.
COPY script ${WORKSPACE_DIR}/script
RUN chmod a+x ${WORKSPACE_DIR}/script/run_bert.sh
# add locale en_US.UTF-8
# The package index is refreshed in the same layer as the install, so this
# step does not depend on a possibly stale index left by an earlier layer.
RUN apt-get update \
 && apt-get install -y locales \
 && locale-gen en_US.UTF-8
# Workaround (currently disabled) for an AMD compiler issue that generates
# poor GPU ISA when a kernel parameter is a structure passed by value
# ENV HSA_NO_SCRATCH_RECLAIM=1
# Environment variables related to distributed training
ENV HSA_FORCE_FINE_GRAIN_PCIE=1
# Further settings kept for reference, currently disabled:
# ENV NCCL_DEBUG=INFO
# ENV RCCL_ALLTOALL_KERNEL_DISABLE=1
# ENV NCCL_DEBUG_SUBSYS=INIT,COLL
# Containers start in the directory holding the copied scripts
WORKDIR $WORKSPACE_DIR/script