# onnxruntime/dockerfiles/Dockerfile.training

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

# Version pins shared by both stages. An ARG declared before the first FROM
# is not visible inside a stage until it is redeclared there with a bare
# `ARG NAME`, which is what the stages below do.
ARG BUILD_VERSION=Release
ARG CONDA_VERSION=4.7.10
ARG NUMPY_VERSION=1.18.5
ARG ONNX_VERSION=1.7.0
ARG PYTORCH_VERSION=1.6.0
ARG UCX_VERSION=1.8.0

# OPENMPI_PATH is derived from OPENMPI_VERSION, so it has to come after it.
ARG OPENMPI_VERSION=4.0.4
ARG OPENMPI_PATH=/opt/openmpi-${OPENMPI_VERSION}

# cuda development image for building sources
FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 AS build

# set location for builds; later steps download and compile sources here
WORKDIR /stage

# install curl and git
# note: ca-certificates is listed explicitly because --no-install-recommends
#       skips recommended packages and the downloads below use https
# note: the apt lists are removed in the same layer that created them; the
#       later package install in this stage runs its own `apt-get update`
RUN apt-get -y update &&\
    apt-get -y --no-install-recommends install \
        ca-certificates \
        curl \
        git &&\
    rm -rf /var/lib/apt/lists/*

# install miniconda (comes with python 3.7 default)
# note: the installer is executed as root, so it is downloaded with TLS
#       certificate verification enabled (no --insecure)
# note: the installer script is deleted once miniconda is in /opt/conda
ARG CONDA_VERSION
ARG CONDA_URL=https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-x86_64.sh
RUN cd /stage && curl -fSsL "${CONDA_URL}" -o install-conda.sh &&\
    /bin/bash ./install-conda.sh -b -p /opt/conda &&\
    rm -f install-conda.sh &&\
    /opt/conda/bin/conda clean -ya
# put conda's python/pip/conda ahead of the system tools for all later steps
ENV PATH=/opt/conda/bin:${PATH}

# install cmake, setuptools, numpy, and onnx
# note: /opt/conda is copied as a whole into the runtime stage, so conda's
#       caches are cleaned in the same layer that fills them
# note: pip's download cache is disabled; nothing later reuses it
ARG NUMPY_VERSION
ARG ONNX_VERSION
RUN conda install -y \
        cmake \
        numpy=${NUMPY_VERSION} \
        setuptools &&\
    conda clean -ya &&\
    pip install --no-cache-dir \
        onnx=="${ONNX_VERSION}"

# build ucx suite and install it under /opt/ucx
# note: openmpi will not select ucx without multithreading enabled
# note: the tarball is saved to disk before extraction so that a failed
#       download stops the build instead of being hidden behind a pipe
#       (the default /bin/sh has no pipefail)
ARG UCX_VERSION
ARG UCX_TARNAME=ucx-$UCX_VERSION
ARG UCX_URL=https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}/${UCX_TARNAME}.tar.gz
RUN apt-get -y update && apt-get -y --no-install-recommends install \
        libibverbs-dev \
        libnuma-dev &&\
    cd /stage &&\
    curl -fSsL "${UCX_URL}" -o "${UCX_TARNAME}.tar.gz" &&\
    tar xzf "${UCX_TARNAME}.tar.gz" &&\
    rm -f "${UCX_TARNAME}.tar.gz" &&\
    cd "${UCX_TARNAME}" &&\
    ./configure \
        --prefix=/opt/ucx \
        --with-cuda=/usr/local/cuda \
        --with-verbs=/usr/lib/x86_64-linux-gnu \
        --enable-mt &&\
    make -j"$(nproc)" &&\
    make install

# build openmpi (use --prefix /opt/openmpi-xxx to move to runtime image)
# note: require --enable-orterun-prefix-by-default for Azure machine learning compute
# note: disable verbs as we use ucx middleware and don't want btl openib warnings
# note: OPENMPI_URL carries a %OMPI_BASE% placeholder that is replaced at build
#       time with the version minus its last component (e.g. 4.0.4 -> 4.0)
# note: the tarball is saved to disk before extraction so that a failed
#       download stops the build instead of being hidden behind a pipe
ARG OPENMPI_VERSION
ARG OPENMPI_PATH
ARG OPENMPI_TARNAME=openmpi-${OPENMPI_VERSION}
ARG OPENMPI_URL=https://download.open-mpi.org/release/open-mpi/v%OMPI_BASE%/${OPENMPI_TARNAME}.tar.gz
RUN export OMPI_BASE="${OPENMPI_VERSION%.*}" &&\
    OMPI_SRC_URL="$(echo "${OPENMPI_URL}" | sed "s/%OMPI_BASE%/${OMPI_BASE}/")" &&\
    cd /stage &&\
    curl -fSsL "${OMPI_SRC_URL}" -o "${OPENMPI_TARNAME}.tar.gz" &&\
    tar xzf "${OPENMPI_TARNAME}.tar.gz" &&\
    rm -f "${OPENMPI_TARNAME}.tar.gz" &&\
    cd "${OPENMPI_TARNAME}" &&\
    ./configure \
        --prefix="${OPENMPI_PATH}" \
        --with-ucx=/opt/ucx \
        --without-verbs \
        --with-cuda=/usr/local/cuda \
        --enable-mpirun-prefix-by-default \
        --enable-orterun-prefix-by-default \
        --enable-mca-no-build=btl-uct &&\
    make -j"$(nproc)" install &&\
    ldconfig
# make mpicc/mpirun and the openmpi libraries visible to the remaining build steps
ENV PATH=${OPENMPI_PATH}/bin:$PATH
ENV LD_LIBRARY_PATH=${OPENMPI_PATH}/lib:$LD_LIBRARY_PATH

# install mpi4py (be sure to link existing /opt/openmpi-xxx)
# note: --no-binary forces a source build, with mpicc as the compiler
RUN env CC=mpicc MPICC=mpicc pip install --no-binary=mpi4py mpi4py

# install pytorch at the pinned version
ARG PYTORCH_VERSION
RUN pip install "torch==${PYTORCH_VERSION}"

# in case you need to build pytorch:
# note: if you want specific branch or to link system cuda libraries or MPI
# note: recommend using many high-frequency cores (e.g. 32+ skylake cores)
# ENV CUDA_HOME="/usr/local/cuda" \
#     CUDNN_LIBRARY="/usr/lib/x86_64-linux-gnu" \
#     NCCL_INCLUDE_DIR="/usr/include" \
#     NCCL_LIB_DIR="/usr/lib/x86_64-linux-gnu" \
#     USE_SYSTEM_NCCL=1
# RUN conda install -y \
#         mkl \
#         mkl-include \
#         ninja \
#         pyyaml \
#         cffi &&\
#     cd /stage && git clone https://github.com/pytorch/pytorch.git &&\
#     cd pytorch &&\
#     git checkout v1.6.0 &&\
#     git submodule update --init --recursive &&\
#     python setup.py bdist_wheel -d build/wheel &&\
#     pip install build/wheel/*.whl

# build onnxruntime wheel with cuda and mpi support
# note: ONNXRUNTIME_REPO / ONNXRUNTIME_BRANCH select the sources to build; the
#       defaults reproduce a clone of upstream master. pass
#       --build-arg ONNXRUNTIME_BRANCH=<branch|tag|commit> to pin the sources
# note: ThirdPartyNotices.txt and LICENSE-IMAGE.txt are copied to /stage so
#       they can be picked up from this stage later
# note: the resulting wheel is installed into the conda python on PATH
ARG BUILD_VERSION
ARG ONNXRUNTIME_REPO=https://github.com/microsoft/onnxruntime.git
ARG ONNXRUNTIME_BRANCH=master
RUN cd /stage && git clone "${ONNXRUNTIME_REPO}" onnxruntime &&\
    cd onnxruntime &&\
    git checkout "${ONNXRUNTIME_BRANCH}" &&\
    cp ThirdPartyNotices.txt /stage/ThirdPartyNotices.txt &&\
    cp dockerfiles/LICENSE-IMAGE.txt /stage/LICENSE-IMAGE.txt &&\
    python tools/ci_build/build.py \
        --cmake_extra_defines \
            ONNXRUNTIME_VERSION="$(cat ./VERSION_NUMBER)" \
        --config "${BUILD_VERSION}" \
        --enable_training \
        --mpi_home "${OPENMPI_PATH}" \
        --use_cuda \
        --cuda_home /usr/local/cuda \
        --cudnn_home /usr/lib/x86_64-linux-gnu/ \
        --nccl_home /usr/lib/x86_64-linux-gnu/ \
        --update \
        --parallel \
        --build_dir build \
        --build \
        --build_wheel \
        --skip_tests &&\
    pip install build/"${BUILD_VERSION}"/dist/*.whl

# final image: cuda runtime environment
# note: launch with --gpus all or nvidia-docker
FROM nvidia/cuda:10.2-cudnn7-runtime-ubuntu18.04
WORKDIR /stage

# install ucx by copying the tree built in the build stage
# note: launch with --cap-add=sys_nice to avoid 'mbind' warnings
COPY --from=build /opt/ucx /opt/ucx
ENV PATH=/opt/ucx/bin:${PATH} \
    LD_LIBRARY_PATH=/opt/ucx/lib:${LD_LIBRARY_PATH}

# install openmpi by copying the tree built in the build stage
# note: OMPI_ALLOW_RUN_AS_ROOT* permit mpirun as root for Azure cluster submissions
# note: OMPI_MCA_pml=ucx enforces that openmpi selects ucx or fails
ARG OPENMPI_VERSION
ARG OPENMPI_PATH
COPY --from=build ${OPENMPI_PATH} ${OPENMPI_PATH}
ENV PATH=${OPENMPI_PATH}/bin:${PATH} \
    LD_LIBRARY_PATH=${OPENMPI_PATH}/lib:${LD_LIBRARY_PATH} \
    OMPI_ALLOW_RUN_AS_ROOT=1 \
    OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 \
    OMPI_MCA_pml=ucx

# ssh client/server for wiring MPI hosts, plus the ibverbs/numa packages
RUN apt-get -y update &&\
    apt-get -y --no-install-recommends install \
        libibverbs-dev \
        libnuma-dev \
        openssh-client \
        openssh-server &&\
    ldconfig

# bring over the conda tree from the build stage
# (includes numpy, mpi4py, pytorch, onnxruntime) and put it first on PATH
COPY --from=build /opt/conda /opt/conda
ENV PATH="/opt/conda/bin:${PATH}"

# make ssh/sshd less strict for wiring containers on Azure VM scale set
# note: use 'service ssh start' to launch sshd (will fail if 22 in use)
# note: can also set port != 22 and set port=X in MPI hosts file
# note: need to setup password free ssh login between MPI hosts
# note: sshd_config gets root login enabled and StrictModes disabled;
#       ssh_config gets host key checking disabled
# note: mkdir -p keeps this step from failing if /run/sshd already exists
RUN sed -i \
        -e 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/g' \
        -e 's/#StrictModes yes/StrictModes no/g' \
        /etc/ssh/sshd_config &&\
    sed -i 's/#   StrictHostKeyChecking ask/    StrictHostKeyChecking no/g' \
        /etc/ssh/ssh_config &&\
    mkdir -p /run/sshd &&\
    chmod u=rwx,go=rx /run/sshd

# export versions: record the component versions as image labels
# (each ARG is redeclared so the global default is visible in this stage)
ARG UCX_VERSION
ARG OPENMPI_VERSION
ARG CONDA_VERSION
ARG NUMPY_VERSION
ARG ONNX_VERSION
ARG PYTORCH_VERSION
LABEL UCX_VERSION="${UCX_VERSION}" \
      OPENMPI_VERSION="${OPENMPI_VERSION}" \
      CONDA_VERSION="${CONDA_VERSION}" \
      NUMPY_VERSION="${NUMPY_VERSION}" \
      ONNX_VERSION="${ONNX_VERSION}" \
      PYTORCH_VERSION="${PYTORCH_VERSION}"

# clean/finalize environment: drop build-only tooling and the scratch directory
RUN conda remove --yes cmake &&\
    apt-get -y purge build-essential &&\
    apt-get -y autoremove &&\
    rm -rf /stage

# note: adds onnxruntime license and third party notices
WORKDIR /workspace
COPY --from=build /stage/*.txt /workspace/