onnxruntime/dockerfiles/Dockerfile.training
2021-06-03 18:51:00 -07:00

224 lines
7.7 KiB
Text

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# Build arguments shared by more than one stage. An ARG declared before the
# first FROM is not visible inside a stage, so each stage that needs one of
# these values repeats a bare `ARG NAME` to bring it into scope.

# communication stack that is built from source
ARG UCX_VERSION=1.8.0
ARG OPENMPI_VERSION=4.0.4
ARG OPENMPI_PATH=/opt/openmpi-${OPENMPI_VERSION}

# python environment
ARG CONDA_VERSION=4.7.10
ARG NUMPY_VERSION=1.18.5
ARG ONNX_VERSION=1.7.0
ARG PYTORCH_VERSION=1.6.0

# onnxruntime build: build.py --config value and the git ref to check out
ARG BUILD_CONFIG=Release
ARG COMMIT=master
# Stage "builder": CUDA 11.1 / cuDNN 8 development image in which all sources
# are compiled.
FROM nvcr.io/nvidia/cuda:11.1.1-cudnn8-devel-ubuntu18.04 AS builder

# every download and source build happens under /stage
WORKDIR /stage
# Base tooling for the build: curl and git to fetch sources, an ssh client
# (required by MPI when running ORT tests), the English language pack, and
# unattended-upgrades for the patching step right after.
RUN apt-get -y update && \
    apt-get -y --no-install-recommends install \
        curl \
        git \
        language-pack-en \
        openssh-client \
        unattended-upgrades

# bring already-installed packages up to date to minimize security vulnerabilities
RUN unattended-upgrade
# Generate the en_US.UTF-8 locale, then unpack the prebuilt CMake 3.20.3
# release into /usr and delete the tarball in the same layer.
# note: curl runs with -f so an HTTP error aborts the build at the download
#       step instead of saving the error page under the tarball's name and
#       failing later inside tar with a misleading message.
RUN locale-gen en_US.UTF-8 && \
    update-locale LANG=en_US.UTF-8 && \
    cmake_tgz=cmake-3.20.3-Linux-x86_64.tar.gz && \
    curl -f -O -L "https://github.com/Kitware/CMake/releases/download/v3.20.3/${cmake_tgz}" && \
    tar -zxf "${cmake_tgz}" --strip=1 -C /usr && \
    rm -rf "${cmake_tgz}"
# Install Miniconda (comes with python 3.7 by default) into /opt/conda and put
# it first on PATH.
# note: the installer is a shell script that is executed during the build, so
#       it is downloaded with TLS certificate verification left on (no
#       --insecure); the other HTTPS downloads in this stage already rely on
#       the image's CA certificates.
# note: the installer script is removed once it has run; conda's own caches
#       are cleaned in the same layer.
ARG CONDA_VERSION
ARG CONDA_URL=https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-x86_64.sh
RUN cd /stage && \
    curl -fSsL ${CONDA_URL} -o install-conda.sh && \
    /bin/bash ./install-conda.sh -b -p /opt/conda && \
    rm -f install-conda.sh && \
    /opt/conda/bin/conda clean -ya
ENV PATH=/opt/conda/bin:${PATH}
# Install setuptools and the pinned numpy through conda, then the pinned onnx
# through pip.
# note: /opt/conda is later copied as a whole into the runtime image, so the
#       conda package cache is cleaned in the same layer that fills it
#       (the same `conda clean -ya` the Miniconda install step uses).
ARG NUMPY_VERSION
ARG ONNX_VERSION
RUN conda install -y \
        setuptools \
        numpy=${NUMPY_VERSION} && \
    conda clean -ya && \
    pip install \
        onnx=="${ONNX_VERSION}"

# install cerberus for the new pytorch front-end
RUN pip install cerberus
# Build the ucx suite from its release tarball and install it under /opt/ucx.
# The verbs and numa development packages are installed first; configure is
# pointed at the system CUDA toolkit and at the verbs libraries.
# note: --enable-mt is required: openmpi will not select ucx without
#       multithreading enabled
ARG UCX_VERSION
ARG UCX_TARNAME=ucx-${UCX_VERSION}
ARG UCX_URL=https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}/${UCX_TARNAME}.tar.gz
RUN apt-get -y update && \
    apt-get -y --no-install-recommends install \
        libibverbs-dev \
        libnuma-dev && \
    cd /stage && \
    curl -fSsL ${UCX_URL} | tar xzf - && \
    cd ${UCX_TARNAME} && \
    ./configure \
        --prefix=/opt/ucx \
        --with-cuda=/usr/local/cuda \
        --with-verbs=/usr/lib/x86_64-linux-gnu \
        --enable-mt && \
    make -j"$(nproc)" && \
    make install
# Build openmpi against the ucx install above. The install prefix is
# /opt/openmpi-xxx (OPENMPI_PATH) so the same directory can be copied into the
# runtime image unchanged.
# note: --enable-orterun-prefix-by-default is required for Azure machine
#       learning compute
# note: verbs is disabled because ucx is the middleware and btl openib
#       warnings are unwanted
# note: the %OMPI_BASE% placeholder in OPENMPI_URL is replaced at build time
#       with OPENMPI_VERSION minus its last dot-separated component
ARG OPENMPI_VERSION
ARG OPENMPI_PATH
ARG OPENMPI_TARNAME=openmpi-${OPENMPI_VERSION}
ARG OPENMPI_URL=https://download.open-mpi.org/release/open-mpi/v%OMPI_BASE%/${OPENMPI_TARNAME}.tar.gz
RUN export OMPI_BASE=${OPENMPI_VERSION%.*} && \
    cd /stage && \
    curl -fSsL $(echo ${OPENMPI_URL} | sed s/%OMPI_BASE%/${OMPI_BASE}/) | tar xzf - && \
    cd ${OPENMPI_TARNAME} && \
    ./configure \
        --prefix=${OPENMPI_PATH} \
        --with-ucx=/opt/ucx \
        --without-verbs \
        --with-cuda=/usr/local/cuda \
        --enable-mpirun-prefix-by-default \
        --enable-orterun-prefix-by-default \
        --enable-mca-no-build=btl-uct && \
    make -j"$(nproc)" install && \
    ldconfig

# expose the freshly built MPI binaries and libraries to the following steps
ENV PATH=${OPENMPI_PATH}/bin:${PATH} \
    LD_LIBRARY_PATH=${OPENMPI_PATH}/lib:${LD_LIBRARY_PATH}
# Build mpi4py from source (--no-binary) with mpicc as the compiler so that it
# links the existing /opt/openmpi-xxx installation.
RUN CC=mpicc MPICC=mpicc pip install --no-binary mpi4py mpi4py

# install the pinned pytorch release
ARG PYTORCH_VERSION
RUN pip install "torch==${PYTORCH_VERSION}"
# in case you need to build pytorch:
# note: if you want specific branch or to link system cuda libraries or MPI
# note: recommend using many high-frequency cores (e.g. 32+ skylake cores)
# ENV CUDA_HOME="/usr/local/cuda" \
# CUDNN_LIBRARY="/usr/lib/x86_64-linux-gnu" \
# NCCL_INCLUDE_DIR="/usr/include" \
# NCCL_LIB_DIR="/usr/lib/x86_64-linux-gnu" \
# USE_SYSTEM_NCCL=1
# RUN conda install -y \
# mkl \
# mkl-include \
# ninja \
# pyyaml \
# cffi &&\
# cd /stage && git clone https://github.com/pytorch/pytorch.git &&\
# cd pytorch &&\
# git checkout v1.6.0 &&\
# git submodule update --init --recursive &&\
# python setup.py bdist_wheel -d build/wheel &&\
# pip install build/wheel/*.whl
# Build the onnxruntime training wheel with cuda and mpi support and install it
# into the conda environment.
# Steps: clone the repository, check out COMMIT, stage the third-party notices
# and image license under /stage (the final image copies /stage/*.txt), run
# build.py, then pip-install the wheel it produced.
# note: both CMake defines (ONNXRUNTIME_VERSION, read from the VERSION_NUMBER
#       file, and the CUDA architecture list) are passed under ONE
#       --cmake_extra_defines occurrence. The option used to appear twice; an
#       option parser that keeps only the last occurrence of a repeated flag
#       would silently drop the first define.
# NOTE(review): build.py's option handling is not visible from this file;
#       passing the flag once avoids depending on it either way.
ARG BUILD_CONFIG
ARG COMMIT
RUN cd /stage && \
    git clone https://github.com/microsoft/onnxruntime.git && \
    cd onnxruntime && \
    git checkout ${COMMIT} && \
    cp ThirdPartyNotices.txt /stage/ThirdPartyNotices.txt && \
    cp dockerfiles/LICENSE-IMAGE.txt /stage/LICENSE-IMAGE.txt && \
    python tools/ci_build/build.py \
        --config ${BUILD_CONFIG} \
        --enable_training \
        --mpi_home ${OPENMPI_PATH} \
        --use_cuda \
        --cuda_home /usr/local/cuda \
        --cudnn_home /usr/lib/x86_64-linux-gnu/ \
        --nccl_home /usr/lib/x86_64-linux-gnu/ \
        --update \
        --parallel \
        --build_dir build \
        --build \
        --build_wheel \
        --skip_tests \
        --cmake_extra_defines \
            ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) \
            'CMAKE_CUDA_ARCHITECTURES=35;37;50;52;60;61;70;75;80;86' && \
    pip install build/${BUILD_CONFIG}/dist/*.whl
# AzureML support plus commonly used packages, one requirement per line
RUN pip install \
        azureml-defaults \
        sentencepiece==0.1.92 \
        transformers==2.11.0 \
        msgpack==1.0.0 \
        tensorboardX==1.8 \
        tensorboard==2.3.0
# Final stage: CUDA 11.1 / cuDNN 8 runtime image that receives the artifacts
# built in the "builder" stage.
# note: launch with --gpus all or nvidia-docker
FROM nvcr.io/nvidia/cuda:11.1.1-cudnn8-runtime-ubuntu18.04
WORKDIR /stage

# bring over the ucx install and expose its binaries and libraries
# note: launch with --cap-add=sys_nice to avoid 'mbind' warnings
COPY --from=builder /opt/ucx /opt/ucx
ENV PATH=/opt/ucx/bin:${PATH} \
    LD_LIBRARY_PATH=/opt/ucx/lib:${LD_LIBRARY_PATH}
# Bring over the openmpi install (same /opt/openmpi-xxx prefix it was built
# with) and configure it through the environment.
# note: OMPI_ALLOW_RUN_AS_ROOT(+_CONFIRM) permit mpirun as root for Azure
#       cluster submissions
# note: OMPI_MCA_pml=ucx enforces that openmpi selects ucx or fails
ARG OPENMPI_VERSION
ARG OPENMPI_PATH
COPY --from=builder ${OPENMPI_PATH} ${OPENMPI_PATH}
ENV PATH=${OPENMPI_PATH}/bin:${PATH} \
    LD_LIBRARY_PATH=${OPENMPI_PATH}/lib:${LD_LIBRARY_PATH} \
    OMPI_ALLOW_RUN_AS_ROOT=1 \
    OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 \
    OMPI_MCA_pml=ucx

# ssh server/client plus the verbs and numa packages, then refresh the
# dynamic linker cache
RUN apt-get -y update && \
    apt-get -y --no-install-recommends install \
        openssh-server \
        openssh-client \
        libibverbs-dev \
        libnuma-dev && \
    ldconfig
# Bring over the complete conda environment from the builder (it includes
# numpy, mpi4py, pytorch and onnxruntime) and put it first on PATH.
COPY --from=builder /opt/conda /opt/conda
ENV PATH=/opt/conda/bin:$PATH
# Relax ssh/sshd for wiring containers together on an Azure VM scale set:
#   sshd_config: allow root login, turn StrictModes off
#   ssh_config:  turn StrictHostKeyChecking off
# and create /run/sshd (mode u=rwx,go=rx).
# note: use 'service ssh start' to launch sshd (will fail if 22 in use)
# note: can also set port != 22 and set port=X in MPI hosts file
# note: need to setup password free ssh login between MPI hosts
RUN sed -i \
        -e 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/g' \
        -e 's/#StrictModes yes/StrictModes no/g' \
        /etc/ssh/sshd_config && \
    sed -i \
        -e 's/# StrictHostKeyChecking ask/ StrictHostKeyChecking no/g' \
        /etc/ssh/ssh_config && \
    mkdir /run/sshd && \
    chmod u=rwx,go=rx /run/sshd
# Record the component versions used for this build as image labels.
# Each ARG is re-declared so its value is in scope in this stage.
ARG UCX_VERSION
ARG OPENMPI_VERSION
ARG CONDA_VERSION
ARG NUMPY_VERSION
ARG ONNX_VERSION
ARG PYTORCH_VERSION
LABEL UCX_VERSION=${UCX_VERSION} \
      OPENMPI_VERSION=${OPENMPI_VERSION} \
      CONDA_VERSION=${CONDA_VERSION} \
      NUMPY_VERSION=${NUMPY_VERSION} \
      ONNX_VERSION=${ONNX_VERSION} \
      PYTORCH_VERSION=${PYTORCH_VERSION}
# Clean up and finalize the environment: purge build-essential, autoremove
# packages that are no longer needed, and drop the temporary /stage directory.
RUN apt-get purge -y build-essential && \
    apt-get autoremove -y && \
    rm -rf /stage

# Default working directory of the image.
# note: adds onnxruntime license and third party notices (the *.txt files the
#       builder stage placed under its /stage directory)
WORKDIR /workspace
COPY --from=builder /stage/*.txt /workspace/