diff --git a/ThirdPartyNotices.txt b/ThirdPartyNotices.txt index db679aadf7..9cdda90892 100644 --- a/ThirdPartyNotices.txt +++ b/ThirdPartyNotices.txt @@ -4175,4 +4175,159 @@ libprotobuf-mutator distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. \ No newline at end of file + limitations under the License. + + ----- + + openucx/ucx + https://github.com/openucx/ucx + + Copyright (c) 2014-2015 UT-Battelle, LLC. All rights reserved. + Copyright (C) 2014-2020 Mellanox Technologies Ltd. All rights reserved. + Copyright (C) 2014-2015 The University of Houston System. All rights reserved. + Copyright (C) 2015 The University of Tennessee and The University + of Tennessee Research Foundation. All rights reserved. + Copyright (C) 2016-2020 ARM Ltd. All rights reserved. + Copyright (c) 2016 Los Alamos National Security, LLC. All rights reserved. + Copyright (C) 2016-2020 Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2019 UChicago Argonne, LLC. All rights reserved. + Copyright (c) 2018-2020 NVIDIA CORPORATION. All rights reserved. + Copyright (C) 2020 Huawei Technologies Co., Ltd. All rights reserved. + Copyright (C) 2016-2020 Stony Brook University. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + 3. 
Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ----- + + From PyTorch: + + Copyright (c) 2016- Facebook, Inc (Adam Paszke) + Copyright (c) 2014- Facebook, Inc (Soumith Chintala) + Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) + Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) + Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) + Copyright (c) 2011-2013 NYU (Clement Farabet) + Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) + Copyright (c) 2006 Idiap Research Institute (Samy Bengio) + Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) + + From Caffe2: + + Copyright (c) 2016-present, Facebook Inc. All rights reserved. + + All contributions by Facebook: + Copyright (c) 2016 Facebook Inc. + + All contributions by Google: + Copyright (c) 2015 Google Inc. + All rights reserved. + + All contributions by Yangqing Jia: + Copyright (c) 2015 Yangqing Jia + All rights reserved. 
+ + All contributions from Caffe: + Copyright(c) 2013, 2014, 2015, the respective contributors + All rights reserved. + + All other contributions: + Copyright(c) 2015, 2016 the respective contributors + All rights reserved. + + Caffe2 uses a copyright model similar to Caffe: each contributor holds + copyright over their contributions to Caffe2. The project versioning records + all such contribution and copyright details. If a contributor wants to further + mark their specific copyright on a particular contribution, they should + indicate their copyright solely in the commit message of the change when it is + committed. + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America + and IDIAP Research Institute nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + + ----- + + mpi4py + https://github.com/mpi4py/mpi4py/ + + ======================= + LICENSE: MPI for Python + ======================= + + :Author: Lisandro Dalcin + :Contact: dalcinl@gmail.com + + + Copyright (c) 2019, Lisandro Dalcin. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/cgmanifests/cgmanifest.json b/cgmanifests/cgmanifest.json index eb4042a1ec..c0cc258dc3 100644 --- a/cgmanifests/cgmanifest.json +++ b/cgmanifests/cgmanifest.json @@ -126,6 +126,16 @@ } } }, + { + "component": { + "type": "git", + "git": { + "commitHash": "b31f58de6fa8bbda5353b3c77d9be4914399724d", + "repositoryUrl": "https://github.com/pytorch/pytorch.git" + }, + "comments": "pytorch 1.6 used by onnxruntime training image" + } + }, { "component": { "type": "git", @@ -277,6 +287,27 @@ } } }, + { + "component": { + "Type": "other", + "Other": { + "Name": "OpenMPI", + "Version": "4.0.4", + "DownloadUrl": "https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.4.tar.gz" + }, + "comments": "openmpi 4.0.4 used by onnxruntime training image" + } + }, + { + "component": { + "Type": "git", + "git": { + "commitHash": "7db3f9c741d3dfd8dda14ffb537ed251280d2025", + "repositoryUrl": "https://github.com/mpi4py/mpi4py" + }, + "comments": "mpi4py 3.0.3 used by onnxruntime training image" + } + }, { "component": { "Type": "other", @@ -296,6 +327,16 @@ }, "comments": "used by onnxruntime" } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "c30b7da2301202da5f9f0529966944f110e5d6e7", + "repositoryUrl": "https://github.com/openucx/ucx" + }, + "comments": "middleware between IB verbs and OpenMPI used by onnxruntime training image" + } } ], "Version": 1 diff --git a/dockerfiles/Dockerfile.training 
b/dockerfiles/Dockerfile.training
new file mode 100644
index 0000000000..be83807804
--- /dev/null
+++ b/dockerfiles/Dockerfile.training
@@ -0,0 +1,240 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+# Two-stage build of the onnxruntime training image:
+#   build stage - compiles ucx, openmpi, mpi4py and the onnxruntime wheel on
+#                 the cuda devel image
+#   final stage - copies /opt/ucx, ${OPENMPI_PATH} and /opt/conda onto the
+#                 cuda runtime image and configures ssh for mpi launches
+
+# multi-stage arguments (repeat ARG NAME below)
+ARG UCX_VERSION=1.8.0
+ARG OPENMPI_VERSION=4.0.4
+ARG CONDA_VERSION=4.7.10
+ARG NUMPY_VERSION=1.18.5
+ARG ONNX_VERSION=1.7.0
+ARG PYTORCH_VERSION=1.6.0
+# note: mpi4py is pinned so rebuilds are reproducible (3.0.3 is the release
+#       listed for this image in cgmanifests/cgmanifest.json)
+ARG MPI4PY_VERSION=3.0.3
+
+ARG BUILD_VERSION=Release
+ARG OPENMPI_PATH=/opt/openmpi-${OPENMPI_VERSION}
+
+# cuda development image for building sources
+FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 AS build
+
+# run RUN steps under bash with pipefail so a failed download in
+# `curl ... | tar ...` aborts the build instead of being masked by tar
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+# set location for builds
+WORKDIR /stage
+
+# install curl and git
+# note: ca-certificates is required for curl/git to verify https downloads
+RUN apt-get -y update &&\
+    apt-get -y --no-install-recommends install \
+        ca-certificates \
+        curl \
+        git &&\
+    rm -rf /var/lib/apt/lists/*
+
+# install miniconda (comes with python 3.7 default)
+# note: the installer is executed as root, so TLS verification stays enabled
+ARG CONDA_VERSION
+ARG CONDA_URL=https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-x86_64.sh
+RUN curl -fSsL "${CONDA_URL}" -o install-conda.sh &&\
+    /bin/bash ./install-conda.sh -b -p /opt/conda &&\
+    /opt/conda/bin/conda clean -ya
+ENV PATH=/opt/conda/bin:${PATH}
+
+# install cmake, setuptools, numpy, and onnx
+ARG NUMPY_VERSION
+ARG ONNX_VERSION
+RUN conda install -y \
+        setuptools \
+        cmake \
+        numpy="${NUMPY_VERSION}" &&\
+    pip install \
+        onnx=="${ONNX_VERSION}"
+
+# build ucx suite
+# note: openmpi will not select ucx without multithreading enabled
+ARG UCX_VERSION
+ARG UCX_TARNAME=ucx-$UCX_VERSION
+ARG UCX_URL=https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}/${UCX_TARNAME}.tar.gz
+RUN apt-get -y update &&\
+    apt-get -y --no-install-recommends install \
+        libibverbs-dev \
+        libnuma-dev &&\
+    rm -rf /var/lib/apt/lists/* &&\
+    curl -fSsL "${UCX_URL}" | tar xzf - &&\
+    cd "${UCX_TARNAME}" &&\
+    ./configure \
+        --prefix=/opt/ucx \
+        --with-cuda=/usr/local/cuda \
+        --with-verbs=/usr/lib/x86_64-linux-gnu \
+        --enable-mt &&\
+    make -j"$(nproc)" &&\
+    make install
+
+# build openmpi (use --prefix /opt/openmpi-xxx to move to runtime image)
+# note: require --enable-orterun-prefix-by-default for Azure machine learning compute
+# note: disable verbs as we use ucx middleware and don't want btl openib warnings
+# note: %OMPI_BASE% in OPENMPI_URL is a placeholder that the RUN step replaces
+#       with OPENMPI_VERSION minus its last component (4.0.4 -> 4.0)
+ARG OPENMPI_VERSION
+ARG OPENMPI_PATH
+ARG OPENMPI_TARNAME=openmpi-${OPENMPI_VERSION}
+ARG OPENMPI_URL=https://download.open-mpi.org/release/open-mpi/v%OMPI_BASE%/${OPENMPI_TARNAME}.tar.gz
+RUN OMPI_BASE="${OPENMPI_VERSION%.*}" &&\
+    curl -fSsL "$(echo "${OPENMPI_URL}" | sed "s/%OMPI_BASE%/${OMPI_BASE}/")" | tar xzf - &&\
+    cd "${OPENMPI_TARNAME}" &&\
+    ./configure \
+        --prefix="${OPENMPI_PATH}" \
+        --with-ucx=/opt/ucx \
+        --without-verbs \
+        --with-cuda=/usr/local/cuda \
+        --enable-mpirun-prefix-by-default \
+        --enable-orterun-prefix-by-default \
+        --enable-mca-no-build=btl-uct &&\
+    make -j"$(nproc)" install &&\
+    ldconfig
+ENV PATH=${OPENMPI_PATH}/bin:$PATH
+ENV LD_LIBRARY_PATH=${OPENMPI_PATH}/lib:$LD_LIBRARY_PATH
+
+# install mpi4py (be sure to link existing /opt/openmpi-xxx)
+ARG MPI4PY_VERSION
+RUN CC=mpicc MPICC=mpicc pip install --no-binary mpi4py mpi4py=="${MPI4PY_VERSION}"
+
+# install pytorch
+ARG PYTORCH_VERSION
+RUN pip install torch=="${PYTORCH_VERSION}"
+
+# in case you need to build pytorch:
+# note: if you want specific branch or to link system cuda libraries or MPI
+# note: recommend using many high-frequency cores (e.g. 32+ skylake cores)
+# ENV CUDA_HOME="/usr/local/cuda" \
+#     CUDNN_LIBRARY="/usr/lib/x86_64-linux-gnu" \
+#     NCCL_INCLUDE_DIR="/usr/include" \
+#     NCCL_LIB_DIR="/usr/lib/x86_64-linux-gnu" \
+#     USE_SYSTEM_NCCL=1
+# RUN conda install -y \
+#     mkl \
+#     mkl-include \
+#     ninja \
+#     pyyaml \
+#     cffi &&\
+#     cd /stage && git clone https://github.com/pytorch/pytorch.git &&\
+#     cd pytorch &&\
+#     git checkout v1.6.0 &&\
+#     git submodule update --init --recursive &&\
+#     python setup.py bdist_wheel -d build/wheel &&\
+#     pip install build/wheel/*.whl
+
+# build onnxruntime wheel with cuda and mpi support
+# note: set ONNXRUNTIME_REPO / ONNXRUNTIME_BRANCH to build a fork, tag or commit
+#       instead of the moving master branch
+ARG BUILD_VERSION
+ARG ONNXRUNTIME_REPO=https://github.com/microsoft/onnxruntime.git
+ARG ONNXRUNTIME_BRANCH=master
+RUN git clone "${ONNXRUNTIME_REPO}" onnxruntime &&\
+    cd onnxruntime &&\
+    git checkout "${ONNXRUNTIME_BRANCH}" &&\
+    cp ThirdPartyNotices.txt /stage/ThirdPartyNotices.txt &&\
+    cp dockerfiles/LICENSE-IMAGE.txt /stage/LICENSE-IMAGE.txt &&\
+    python tools/ci_build/build.py \
+        --cmake_extra_defines \
+            ONNXRUNTIME_VERSION="$(cat ./VERSION_NUMBER)" \
+        --config "${BUILD_VERSION}" \
+        --enable_training \
+        --mpi_home "${OPENMPI_PATH}" \
+        --use_cuda \
+        --cuda_home /usr/local/cuda \
+        --cudnn_home /usr/lib/x86_64-linux-gnu/ \
+        --nccl_home /usr/lib/x86_64-linux-gnu/ \
+        --update \
+        --parallel \
+        --build_dir build \
+        --build \
+        --build_wheel \
+        --skip_tests &&\
+    pip install build/"${BUILD_VERSION}"/dist/*.whl
+
+# remove build-only tooling from the conda environment before it is copied
+# note: deleting it in the runtime stage would leave it in the copied layer
+RUN conda remove -y cmake &&\
+    conda clean -ya
+
+# switch to cuda runtime environment
+# note: launch with --gpus all or nvidia-docker
+FROM nvidia/cuda:10.2-cudnn7-runtime-ubuntu18.04
+
+# install ucx
+# note: launch with --cap-add=sys_nice to avoid 'mbind' warnings
+COPY --from=build /opt/ucx /opt/ucx
+ENV PATH=/opt/ucx/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/ucx/lib:$LD_LIBRARY_PATH
+
+# install openmpi
+# note: permit mpirun as root for Azure cluster submissions
+# note: enforce openmpi select ucx or fail
+# note: apt lists are removed in the same layer so they do not bloat the image
+ARG OPENMPI_VERSION
+ARG OPENMPI_PATH
+COPY --from=build ${OPENMPI_PATH} ${OPENMPI_PATH}
+ENV PATH=${OPENMPI_PATH}/bin:$PATH
+ENV LD_LIBRARY_PATH=${OPENMPI_PATH}/lib:$LD_LIBRARY_PATH
+ENV OMPI_ALLOW_RUN_AS_ROOT=1
+ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
+ENV OMPI_MCA_pml=ucx
+RUN apt-get -y update &&\
+    apt-get -y --no-install-recommends install \
+        openssh-server \
+        openssh-client \
+        libibverbs-dev \
+        libnuma-dev &&\
+    apt-get autoremove -y &&\
+    rm -rf /var/lib/apt/lists/* &&\
+    ldconfig
+
+# copy conda environment (includes numpy, mpi4py, pytorch, onnxruntime)
+COPY --from=build /opt/conda /opt/conda
+ENV PATH=/opt/conda/bin:${PATH}
+
+# make ssh/sshd less strict for wiring containers on Azure VM scale set
+# note: use 'service ssh start' to launch sshd (will fail if 22 in use)
+# note: can also set port != 22 and set port=X in MPI hosts file
+# note: need to setup password free ssh login between MPI hosts
+# note: patterns accept any spacing after '#' so they match however the stock
+#       config files indent these commented-out defaults
+RUN sed -i -E 's/^#\s*PermitRootLogin\s+prohibit-password/PermitRootLogin yes/' \
+        /etc/ssh/sshd_config &&\
+    sed -i -E 's/^#\s*StrictModes\s+yes/StrictModes no/' \
+        /etc/ssh/sshd_config &&\
+    sed -i -E 's/^#\s*StrictHostKeyChecking\s+ask/    StrictHostKeyChecking no/' \
+        /etc/ssh/ssh_config &&\
+    mkdir -p /run/sshd &&\
+    chmod u=rwx,go=rx /run/sshd
+
+# export versions
+ARG UCX_VERSION
+ARG OPENMPI_VERSION
+ARG CONDA_VERSION
+ARG NUMPY_VERSION
+ARG ONNX_VERSION
+ARG PYTORCH_VERSION
+ARG MPI4PY_VERSION
+LABEL UCX_VERSION=${UCX_VERSION}
+LABEL OPENMPI_VERSION=${OPENMPI_VERSION}
+LABEL CONDA_VERSION=${CONDA_VERSION}
+LABEL NUMPY_VERSION=${NUMPY_VERSION}
+LABEL ONNX_VERSION=${ONNX_VERSION}
+LABEL PYTORCH_VERSION=${PYTORCH_VERSION}
+LABEL MPI4PY_VERSION=${MPI4PY_VERSION}
+
+# finalize environment
+# note: adds onnxruntime license and third party notices
+WORKDIR /workspace
+COPY --from=build /stage/*.txt /workspace/