onnxruntime/dockerfiles/Dockerfile.training
suffiank 005fa5c3ae
Add initial Dockerfile for distributed training targets (#4578)
* add training dockerfile tested for examples repo

* forgot pytorch patch for build from source

* make apt-get update -y adjacent apt-get install -y due to Docker caching rules

* comment for mellanox libraries

* mpi4py comment as I forgot where it came from

* apparently curl not included anymore

* grr.. nvidia change nccl location

* don't need findnccl.patch after nvidia changed nccl location

* pr comment /opt/ompi4 => /opt/openmpi-xxx

* switch to pip install pytorch

* use Release instead of RelWithDebInfo

* comment wording

* wording

* missed RelWithDebInfo => Release

* replace Mellanox with libibverbs

* stale comment

* ordering

* no more ninja

* add / at end of copy

* update cgmanifest.json

* pr comments

Co-authored-by: suffian khan <sukha@OrtTrainingDev1.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
2020-08-05 18:54:54 -07:00

206 lines
6.9 KiB
Text

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# Global build arguments shared by both stages.
# An ARG declared before the first FROM is not visible inside a stage until
# the stage redeclares it with a bare `ARG NAME`, so each stage repeats the
# names it needs.

# build flavour handed to the onnxruntime build (--config)
ARG BUILD_VERSION=Release

# python stack
ARG CONDA_VERSION=4.7.10
ARG NUMPY_VERSION=1.18.5
ARG ONNX_VERSION=1.7.0
ARG PYTORCH_VERSION=1.6.0

# communication stack
ARG UCX_VERSION=1.8.0
ARG OPENMPI_VERSION=4.0.4
# OpenMPI install prefix; must follow OPENMPI_VERSION because it expands it
ARG OPENMPI_PATH=/opt/openmpi-${OPENMPI_VERSION}
# cuda development image for building sources
FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 AS build
# set location for builds
WORKDIR /stage
# install curl and git
# note: ca-certificates is listed explicitly because the https downloads and
#       the git clone later in this stage need a CA bundle, and
#       --no-install-recommends does not pull it in as a recommended package
RUN apt-get -y update && \
    apt-get -y --no-install-recommends install \
        ca-certificates \
        curl \
        git
# install miniconda into /opt/conda (comes with python 3.7 default)
# note: the installer is executed as root, so it is downloaded with TLS
#       certificate verification enabled (no --insecure)
ARG CONDA_VERSION
ARG CONDA_URL=https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-x86_64.sh
RUN cd /stage && \
    curl -fSsL "${CONDA_URL}" -o install-conda.sh && \
    /bin/bash ./install-conda.sh -b -p /opt/conda && \
    rm -f install-conda.sh && \
    /opt/conda/bin/conda clean -ya
# put conda's python/pip ahead of the system ones for the rest of the stage
ENV PATH=/opt/conda/bin:${PATH}
# install cmake, setuptools, numpy, and onnx
# note: /opt/conda is later copied as a whole into the runtime image, so the
#       conda package cache is purged in the same layer that fills it
ARG NUMPY_VERSION
ARG ONNX_VERSION
RUN conda install -y \
        cmake \
        numpy=${NUMPY_VERSION} \
        setuptools && \
    conda clean -ya && \
    pip install \
        onnx=="${ONNX_VERSION}"
# build ucx suite and install it under /opt/ucx
# note: openmpi will not select ucx without multithreading enabled
# note: the tarball is saved to disk before it is unpacked; with `curl | tar`
#       the default /bin/sh (no pipefail) only reports tar's exit status
ARG UCX_VERSION
ARG UCX_TARNAME=ucx-$UCX_VERSION
ARG UCX_URL=https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}/${UCX_TARNAME}.tar.gz
RUN apt-get -y update && \
    apt-get -y --no-install-recommends install \
        libibverbs-dev \
        libnuma-dev && \
    cd /stage && \
    curl -fSsL "${UCX_URL}" -o "${UCX_TARNAME}.tar.gz" && \
    tar xzf "${UCX_TARNAME}.tar.gz" && \
    rm -f "${UCX_TARNAME}.tar.gz" && \
    cd "${UCX_TARNAME}" && \
    ./configure \
        --prefix=/opt/ucx \
        --with-cuda=/usr/local/cuda \
        --with-verbs=/usr/lib/x86_64-linux-gnu \
        --enable-mt && \
    make -j"$(nproc)" && \
    make install
# build openmpi (use --prefix /opt/openmpi-xxx to move to runtime image)
# note: require --enable-orterun-prefix-by-default for Azure machine learning compute
# note: disable verbs as we use ucx middleware and don't want btl openib warnings
# note: OPENMPI_URL carries a %OMPI_BASE% placeholder that is replaced at build
#       time with the major.minor part of OPENMPI_VERSION (e.g. 4.0.4 -> 4.0)
# note: the tarball is saved to disk before it is unpacked; with `curl | tar`
#       the default /bin/sh (no pipefail) only reports tar's exit status
ARG OPENMPI_VERSION
ARG OPENMPI_PATH
ARG OPENMPI_TARNAME=openmpi-${OPENMPI_VERSION}
ARG OPENMPI_URL=https://download.open-mpi.org/release/open-mpi/v%OMPI_BASE%/${OPENMPI_TARNAME}.tar.gz
RUN export OMPI_BASE="${OPENMPI_VERSION%.*}" && \
    OMPI_RESOLVED_URL="$(echo "${OPENMPI_URL}" | sed "s/%OMPI_BASE%/${OMPI_BASE}/")" && \
    cd /stage && \
    curl -fSsL "${OMPI_RESOLVED_URL}" -o "${OPENMPI_TARNAME}.tar.gz" && \
    tar xzf "${OPENMPI_TARNAME}.tar.gz" && \
    rm -f "${OPENMPI_TARNAME}.tar.gz" && \
    cd "${OPENMPI_TARNAME}" && \
    ./configure \
        --prefix="${OPENMPI_PATH}" \
        --with-ucx=/opt/ucx \
        --without-verbs \
        --with-cuda=/usr/local/cuda \
        --enable-mpirun-prefix-by-default \
        --enable-orterun-prefix-by-default \
        --enable-mca-no-build=btl-uct && \
    make -j"$(nproc)" install && \
    ldconfig
# expose the freshly built MPI tools and libraries to the following steps
ENV PATH=${OPENMPI_PATH}/bin:$PATH
ENV LD_LIBRARY_PATH=${OPENMPI_PATH}/lib:$LD_LIBRARY_PATH
# compile mpi4py from source (no prebuilt wheel) with mpicc as the compiler,
# so that it links the existing /opt/openmpi-xxx build
RUN env CC=mpicc MPICC=mpicc \
    pip install --no-binary mpi4py mpi4py
# install pytorch from the published wheel
# note: the requirement is quoted the same way as the onnx install above
ARG PYTORCH_VERSION
RUN pip install torch=="${PYTORCH_VERSION}"
# in case you need to build pytorch:
# note: if you want specific branch or to link system cuda libraries or MPI
# note: recommend using many high-frequency cores (e.g. 32+ skylake cores)
# note: the checkout below follows PYTORCH_VERSION instead of a fixed tag
# ENV CUDA_HOME="/usr/local/cuda" \
#     CUDNN_LIBRARY="/usr/lib/x86_64-linux-gnu" \
#     NCCL_INCLUDE_DIR="/usr/include" \
#     NCCL_LIB_DIR="/usr/lib/x86_64-linux-gnu" \
#     USE_SYSTEM_NCCL=1
# RUN conda install -y \
#         mkl \
#         mkl-include \
#         ninja \
#         pyyaml \
#         cffi &&\
#     cd /stage && git clone https://github.com/pytorch/pytorch.git &&\
#     cd pytorch &&\
#     git checkout "v${PYTORCH_VERSION}" &&\
#     git submodule update --init --recursive &&\
#     python setup.py bdist_wheel -d build/wheel &&\
#     pip install build/wheel/*.whl
# build onnxruntime wheel with cuda and mpi support
# note: ONNXRUNTIME_REPO / ONNXRUNTIME_BRANCH select the sources to build; the
#       defaults reproduce the previous hard-coded clone of upstream master.
#       Override with --build-arg to build a fork, tag, or commit.
# note: the license and third party notices are staged in /stage so the final
#       image can copy them
ARG BUILD_VERSION
ARG ONNXRUNTIME_REPO=https://github.com/microsoft/onnxruntime.git
ARG ONNXRUNTIME_BRANCH=master
RUN cd /stage && \
    git clone "${ONNXRUNTIME_REPO}" onnxruntime && \
    cd onnxruntime && \
    git checkout "${ONNXRUNTIME_BRANCH}" && \
    cp ThirdPartyNotices.txt /stage/ThirdPartyNotices.txt && \
    cp dockerfiles/LICENSE-IMAGE.txt /stage/LICENSE-IMAGE.txt && \
    python tools/ci_build/build.py \
        --cmake_extra_defines \
            ONNXRUNTIME_VERSION="$(cat ./VERSION_NUMBER)" \
        --config "${BUILD_VERSION}" \
        --enable_training \
        --mpi_home "${OPENMPI_PATH}" \
        --use_cuda \
        --cuda_home /usr/local/cuda \
        --cudnn_home /usr/lib/x86_64-linux-gnu/ \
        --nccl_home /usr/lib/x86_64-linux-gnu/ \
        --update \
        --parallel \
        --build_dir build \
        --build \
        --build_wheel \
        --skip_tests && \
    pip install build/"${BUILD_VERSION}"/dist/*.whl
# final stage: cuda runtime image
# note: launch with --gpus all or nvidia-docker
FROM nvidia/cuda:10.2-cudnn7-runtime-ubuntu18.04
WORKDIR /stage
# bring over the ucx install tree from the build stage and expose it
# note: launch with --cap-add=sys_nice to avoid 'mbind' warnings
COPY --from=build /opt/ucx /opt/ucx
ENV PATH=/opt/ucx/bin:$PATH \
    LD_LIBRARY_PATH=/opt/ucx/lib:$LD_LIBRARY_PATH
# bring over the openmpi install tree from the build stage (same prefix)
ARG OPENMPI_VERSION
ARG OPENMPI_PATH
COPY --from=build ${OPENMPI_PATH} ${OPENMPI_PATH}
ENV PATH=${OPENMPI_PATH}/bin:$PATH \
    LD_LIBRARY_PATH=${OPENMPI_PATH}/lib:$LD_LIBRARY_PATH
# note: permit mpirun as root for Azure cluster submissions
# note: enforce openmpi select ucx or fail
ENV OMPI_ALLOW_RUN_AS_ROOT=1 \
    OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 \
    OMPI_MCA_pml=ucx
# ssh client/server plus the verbs and numa packages, then refresh the
# dynamic linker cache
RUN apt-get -y update && \
    apt-get -y --no-install-recommends install \
        libibverbs-dev \
        libnuma-dev \
        openssh-client \
        openssh-server && \
    ldconfig
# bring over the conda tree from the build stage
# (includes numpy, mpi4py, pytorch, onnxruntime) and put it first on PATH
COPY --from=build /opt/conda /opt/conda
ENV PATH=/opt/conda/bin:$PATH
# make ssh/sshd less strict for wiring containers on Azure VM scale set
# note: use 'service ssh start' to launch sshd (will fail if 22 in use)
# note: can also set port != 22 and set port=X in MPI hosts file
# note: need to setup password free ssh login between MPI hosts
# note: sshd_config is edited to allow root login and turn off StrictModes;
#       ssh_config is edited to turn off host key checking
# note: mkdir -p keeps this step from failing when /run/sshd already exists
RUN sed -i \
        -e 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/g' \
        -e 's/#StrictModes yes/StrictModes no/g' \
        /etc/ssh/sshd_config && \
    sed -i \
        -e 's/# StrictHostKeyChecking ask/ StrictHostKeyChecking no/g' \
        /etc/ssh/ssh_config && \
    mkdir -p /run/sshd && \
    chmod u=rwx,go=rx /run/sshd
# record the component versions as image labels
# (each global ARG is redeclared so it is visible in this stage)
ARG UCX_VERSION
ARG OPENMPI_VERSION
ARG CONDA_VERSION
ARG NUMPY_VERSION
ARG ONNX_VERSION
ARG PYTORCH_VERSION
LABEL UCX_VERSION=${UCX_VERSION} \
      OPENMPI_VERSION=${OPENMPI_VERSION} \
      CONDA_VERSION=${CONDA_VERSION} \
      NUMPY_VERSION=${NUMPY_VERSION} \
      ONNX_VERSION=${ONNX_VERSION} \
      PYTORCH_VERSION=${PYTORCH_VERSION}
# clean/finalize environment
# NOTE(review): cmake arrived with the /opt/conda COPY layer, so removing it
# here hides it from the final filesystem but does not shrink that layer
RUN conda remove -y cmake && \
    apt-get purge -y build-essential && \
    apt-get autoremove -y && \
    rm -fr /stage
# default working directory for users of the image
WORKDIR /workspace
# note: adds onnxruntime license and third party notices
COPY --from=build /stage/*.txt /workspace/