2020-08-06 01:54:54 +00:00
|
|
|
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
|
|
|
# Licensed under the MIT License.
|
|
|
|
|
|
|
|
|
|
# Global build arguments. An ARG declared before the first FROM is not visible
# inside a stage, so every stage that needs one repeats `ARG NAME` below.

# component versions
ARG UCX_VERSION=1.8.0
ARG OPENMPI_VERSION=4.0.4
ARG CONDA_VERSION=4.7.10
ARG NUMPY_VERSION=1.18.5
ARG ONNX_VERSION=1.7.0
ARG PYTORCH_VERSION=1.6.0

# onnxruntime build settings (build configuration and git ref to check out)
ARG BUILD_CONFIG=Release
ARG COMMIT=master

# openmpi install prefix, derived from the openmpi version
ARG OPENMPI_PATH=/opt/openmpi-${OPENMPI_VERSION}
|
2020-08-06 01:54:54 +00:00
|
|
|
|
|
|
|
|
# cuda development image for building sources
# note: the stage is named "builder"; later COPY --from=builder lines refer to it.
#       AS is uppercase to match the casing of FROM.
FROM nvidia/cuda:10.2-cudnn8-devel-ubuntu18.04 AS builder

# set location for builds (downloads and source trees live under /stage)
WORKDIR /stage
|
|
|
|
|
|
2020-09-16 16:53:30 +00:00
|
|
|
# OS packages for the build stage: curl and git, an ssh client (required by MPI
# when running ORT tests), the English language pack and unattended-upgrades
RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
        curl \
        git \
        language-pack-en \
        openssh-client \
        unattended-upgrades
|
|
|
|
|
|
|
|
|
|
# apply pending upgrades to already-installed packages to minimize security vulnerabilities
RUN unattended-upgrade
|
2020-08-12 20:29:37 +00:00
|
|
|
|
|
|
|
|
# generate the en_US.UTF-8 locale and record it as the default LANG
RUN locale-gen en_US.UTF-8 \
    && update-locale LANG=en_US.UTF-8
|
|
|
|
|
|
2020-08-06 01:54:54 +00:00
|
|
|
# install miniconda into /opt/conda (comes with python 3.7 default)
# note: the installer is downloaded with TLS certificate verification enabled.
#       Do not add --insecure/-k: the script is executed as root during the
#       build, so an unverified download could run tampered code.
# note: this relies on CA certificates being present in the image, as the
#       other https downloads in this stage already do.
ARG CONDA_VERSION
ARG CONDA_URL=https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-x86_64.sh
RUN cd /stage \
    && curl -fSsL ${CONDA_URL} -o install-conda.sh \
    && /bin/bash ./install-conda.sh -b -p /opt/conda \
    && /opt/conda/bin/conda clean -ya

# put conda's python, pip and conda on PATH for the following steps
ENV PATH=/opt/conda/bin:${PATH}
|
|
|
|
|
|
|
|
|
|
# python-side prerequisites: cmake, setuptools and a pinned numpy come from
# conda; a pinned onnx comes from pip
ARG NUMPY_VERSION
ARG ONNX_VERSION
RUN conda install -y \
        setuptools \
        cmake \
        numpy=${NUMPY_VERSION} \
    && pip install onnx=="${ONNX_VERSION}"
|
|
|
|
|
|
Add new PyTorch front-end (#4815)
* Add ORTTrainerOptions class for the new pytorch frontend (#4382)
Add ORTTrainerOptions class and some placeholders
* Add _ORTTrainerModelDesc to perform validation for model description (#4416)
* Add Loss Scaler classes to the new frontend (#4306)
* Add TrainStepInfo used on the new frontend API (#4256)
* Add Optimizer classes to the new frontend (#4280)
* Add LRScheduler implementation (#4357)
* Add basic ORTTrainer API (#4435)
This PR presents the public API for ORTTrainer for the short term
development.
It also validates and saves input parameters, which will be used in the
next stages, such as building ONNX model, post processing the model and
configuring the training session
* Add opset_version into ORTTrainerOptions and change type of ORTTrainer.loss_fn (#4592)
* Update ModelDescription and minor fix on ORTTrainer ctor (#4605)
* Update ModelDescription and minor fix on ORTTrainer/ORTTrainerOptions
This PR keeps the public API intact, but changes how model description is stored on the backend
Currently, users create a dict with two lists of tuples.
One list called 'inputs' and each tuple has the following format tuple(name, shape).
The second list is called 'outputs' and each tuple can be either tuple(name, shape) or tuple(name, shape, is_loss).
With this PR, when this dict is passed in to ORTTrainer, it is fully validated as usual.
However, tuples are internally replaced by namedtuples and all output tuples will have
tuple(name, shape, is_loss) format instead of is_loss being optionally present.
In addition to that normalization in the internal representation (which eases coding),
two internal methods were created to replace a namedtuple(name, shape) with namedtuple(name, shape, dtype)
or namedtuple(name, shape, is_loss, dtype), depending on whether the tuple is an input or output.
This is necessary as ORTTrainer finds out data types of each input/output during model export to onnx.
Finally, a minor fix was done on ORTTrainer. It could initialize ORTTrainerOptions incorrectly when options=None
* Rename input name for test
* Add ONNX Model Export to New Frontend (#4612)
Co-authored-by: Rayan Krishnan <t-rakr@OrtDevTest2v100.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
Co-authored-by: Thiago Crepaldi <thiago.crepaldi@microsoft.com>
* Create training session + minor improvements (#4668)
Co-authored-by: Rayan Krishnan <t-rakr@OrtDevTest2v100.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
* Save ONNX model in file (#4671)
Co-authored-by: Rayan Krishnan <t-rakr@OrtDevTest2v100.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
* Add eval step (#4674)
Co-authored-by: Rayan Krishnan <t-rakr@OrtDevTest2v100.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
* Add train_step (#4677)
Co-authored-by: Rayan Krishnan <t-rakr@OrtDevTest2v100.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
* Add LR Scheduler (#4694)
Co-authored-by: Rayan Krishnan <t-rakr@OrtDevTest2v100.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
Co-authored-by: Thiago Crepaldi <thiago.crepaldi@microsoft.com>
* Add deterministic compute tests (#4716)
Co-authored-by: Rayan Krishnan <t-rakr@OrtDevTest2v100.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
Co-authored-by: Thiago Crepaldi <thiago.crepaldi@microsoft.com>
* Add legacy vs experimental ORTTrainer accuracy comparison (#4727)
Co-authored-by: Rayan Krishnan <t-rakr@OrtDevTest2v100.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
Co-authored-by: Thiago Crepaldi <thiago.crepaldi@microsoft.com>
* Add Mixed precision/LossScaler + several fixes (#4739)
In addition to the mixed precision/loss scaler code, this PR includes:
* Fix CUDA training
* Add optimization_step into TrainStepInfo class
* Refactor LRScheduler to use optimization_step instead of step
* Updated several default values at ORTTrainerOptions
* Add initial Gradient Accumulation support (untested)
* Fix ONNX model post processing
* Refactor unit tests
* Add ONNX BERT example + minor fixes (#4757)
* Fix training issue when passing ONNX file into ORTTrainer
Co-authored-by: Thiago Crepaldi <thiago.crepaldi@microsoft.com>
Co-authored-by: Rayan Krishnan <t-rakr@OrtDevTest2v100.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
* Add Dynamic Shape support (#4758)
* Update DeepSpeed Zero Stage option to a separate option group (#4772)
* Add support to fetches (#4777)
* Add Gradient Accumulation Steps support (#4793)
* Fix Dynamic Axes feature and add unit test (#4795)
* Add frozen weights test (#4807)
* Move new pytorch front-end to 'experimental' namespace (#4814)
* Fix build
Co-authored-by: Rayan-Krishnan <rayankrishnan@live.com>
Co-authored-by: Rayan Krishnan <t-rakr@OrtDevTest2v100.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
2020-08-17 16:45:25 +00:00
|
|
|
# cerberus is installed for the new pytorch front-end
RUN pip install cerberus
|
|
|
|
|
|
2020-08-06 01:54:54 +00:00
|
|
|
# build the ucx suite from its release tarball and install it under /opt/ucx
# note: --enable-mt is needed; openmpi will not select ucx without multithreading enabled
ARG UCX_VERSION
ARG UCX_TARNAME=ucx-${UCX_VERSION}
ARG UCX_URL=https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}/${UCX_TARNAME}.tar.gz
RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
        libibverbs-dev \
        libnuma-dev \
    && cd /stage \
    && curl -fSsL ${UCX_URL} | tar xzf - \
    && cd ${UCX_TARNAME} \
    && ./configure \
        --prefix=/opt/ucx \
        --with-cuda=/usr/local/cuda \
        --with-verbs=/usr/lib/x86_64-linux-gnu \
        --enable-mt \
    && make -j"$(nproc)" \
    && make install
|
|
|
|
|
|
|
|
|
|
# build openmpi from its release tarball (the --prefix /opt/openmpi-xxx install
# is what gets moved to the runtime image)
# note: --enable-orterun-prefix-by-default is required for Azure machine learning compute
# note: verbs is disabled as we use ucx middleware and don't want btl openib warnings
# note: %OMPI_BASE% in OPENMPI_URL is a placeholder; the RUN step replaces it
#       with the version minus its last dot-separated component
ARG OPENMPI_VERSION
ARG OPENMPI_PATH
ARG OPENMPI_TARNAME=openmpi-${OPENMPI_VERSION}
ARG OPENMPI_URL=https://download.open-mpi.org/release/open-mpi/v%OMPI_BASE%/${OPENMPI_TARNAME}.tar.gz
RUN export OMPI_BASE=${OPENMPI_VERSION%.*} \
    && cd /stage \
    && curl -fSsL $(echo ${OPENMPI_URL} | sed s/%OMPI_BASE%/$OMPI_BASE/) | tar xzf - \
    && cd ${OPENMPI_TARNAME} \
    && ./configure \
        --prefix=${OPENMPI_PATH} \
        --with-ucx=/opt/ucx \
        --without-verbs \
        --with-cuda=/usr/local/cuda \
        --enable-mpirun-prefix-by-default \
        --enable-orterun-prefix-by-default \
        --enable-mca-no-build=btl-uct \
    && make -j"$(nproc)" install \
    && ldconfig

# expose the freshly installed openmpi binaries and libraries
ENV PATH=${OPENMPI_PATH}/bin:$PATH \
    LD_LIBRARY_PATH=${OPENMPI_PATH}/lib:$LD_LIBRARY_PATH
|
|
|
|
|
|
|
|
|
|
# build mpi4py from source (no prebuilt binary) with mpicc as the compiler
# (be sure to link existing /opt/openmpi-xxx)
RUN CC=mpicc MPICC=mpicc pip install --no-binary mpi4py mpi4py
|
|
|
|
|
|
|
|
|
|
# install the pinned pytorch release from pip
ARG PYTORCH_VERSION
RUN pip install "torch==${PYTORCH_VERSION}"
|
|
|
|
|
|
|
|
|
|
# in case you need to build pytorch:
|
|
|
|
|
# note: if you want specific branch or to link system cuda libraries or MPI
|
|
|
|
|
# note: recommend using many high-frequency cores (e.g. 32+ skylake cores)
|
|
|
|
|
# ENV CUDA_HOME="/usr/local/cuda" \
|
|
|
|
|
# CUDNN_LIBRARY="/usr/lib/x86_64-linux-gnu" \
|
|
|
|
|
# NCCL_INCLUDE_DIR="/usr/include" \
|
|
|
|
|
# NCCL_LIB_DIR="/usr/lib/x86_64-linux-gnu" \
|
2020-08-13 23:48:48 +00:00
|
|
|
# USE_SYSTEM_NCCL=1
|
2020-08-06 01:54:54 +00:00
|
|
|
# RUN conda install -y \
|
|
|
|
|
# mkl \
|
|
|
|
|
# mkl-include \
|
|
|
|
|
# ninja \
|
|
|
|
|
# pyyaml \
|
|
|
|
|
# cffi &&\
|
|
|
|
|
# cd /stage && git clone https://github.com/pytorch/pytorch.git &&\
|
|
|
|
|
# cd pytorch &&\
|
|
|
|
|
# git checkout v1.6.0 &&\
|
|
|
|
|
# git submodule update --init --recursive &&\
|
|
|
|
|
# python setup.py bdist_wheel -d build/wheel &&\
|
|
|
|
|
# pip install build/wheel/*.whl
|
|
|
|
|
|
|
|
|
|
# clone onnxruntime at COMMIT, build the training wheel with cuda and mpi
# support, and install it
# note: ThirdPartyNotices.txt and LICENSE-IMAGE.txt are copied into /stage
ARG BUILD_CONFIG
ARG COMMIT
RUN cd /stage \
    && git clone https://github.com/microsoft/onnxruntime.git \
    && cd onnxruntime \
    && git checkout ${COMMIT} \
    && cp ThirdPartyNotices.txt /stage/ThirdPartyNotices.txt \
    && cp dockerfiles/LICENSE-IMAGE.txt /stage/LICENSE-IMAGE.txt \
    && python tools/ci_build/build.py \
        --cmake_extra_defines \
            ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) \
        --config ${BUILD_CONFIG} \
        --enable_training \
        --mpi_home ${OPENMPI_PATH} \
        --use_cuda \
        --cuda_home /usr/local/cuda \
        --cudnn_home /usr/lib/x86_64-linux-gnu/ \
        --nccl_home /usr/lib/x86_64-linux-gnu/ \
        --update \
        --parallel \
        --build_dir build \
        --build \
        --build_wheel \
        --skip_tests \
    && pip install build/${BUILD_CONFIG}/dist/*.whl
|
2020-08-06 01:54:54 +00:00
|
|
|
|
2020-08-13 23:48:48 +00:00
|
|
|
# AzureML support plus commonly used packages (one requirement per line)
RUN pip install \
        azureml-defaults \
        transformers==2.11.0 \
        msgpack==1.0.0 \
        tensorboardX==1.8 \
        tensorboard==2.3.0
|
2020-08-13 23:48:48 +00:00
|
|
|
|
2020-08-06 01:54:54 +00:00
|
|
|
# new stage: continue on the cuda runtime image
# note: launch with --gpus all or nvidia-docker
FROM nvidia/cuda:10.2-cudnn8-runtime-ubuntu18.04

WORKDIR /stage
|
|
|
|
|
|
|
|
|
|
# install ucx by copying /opt/ucx out of the builder stage
# note: launch with --cap-add=sys_nice to avoid 'mbind' warnings
COPY --from=builder /opt/ucx /opt/ucx
ENV PATH=/opt/ucx/bin:$PATH \
    LD_LIBRARY_PATH=/opt/ucx/lib:$LD_LIBRARY_PATH
|
|
|
|
|
|
|
|
|
|
# install openmpi by copying its prefix out of the builder stage
# note: OMPI_ALLOW_RUN_AS_ROOT and its _CONFIRM twin permit mpirun as root for
#       Azure cluster submissions
# note: OMPI_MCA_pml=ucx enforces that openmpi selects ucx or fails
ARG OPENMPI_VERSION
ARG OPENMPI_PATH
COPY --from=builder ${OPENMPI_PATH} ${OPENMPI_PATH}
ENV PATH=${OPENMPI_PATH}/bin:$PATH \
    LD_LIBRARY_PATH=${OPENMPI_PATH}/lib:$LD_LIBRARY_PATH \
    OMPI_ALLOW_RUN_AS_ROOT=1 \
    OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 \
    OMPI_MCA_pml=ucx

# ssh server/client and the verbs/numa packages, then refresh the linker cache
RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
        openssh-server \
        openssh-client \
        libibverbs-dev \
        libnuma-dev \
    && ldconfig
|
|
|
|
|
|
|
|
|
|
# bring the conda environment over from the builder stage
# (includes numpy, mpi4py, pytorch, onnxruntime)
COPY --from=builder /opt/conda /opt/conda
ENV PATH=/opt/conda/bin:$PATH
|
|
|
|
|
|
|
|
|
|
# make ssh/sshd less strict for wiring containers on Azure VM scale set
# note: use 'service ssh start' to launch sshd (will fail if 22 in use)
# note: can also set port != 22 and set port=X in MPI hosts file
# note: need to setup password free ssh login between MPI hosts
# note: the StrictHostKeyChecking pattern tolerates any amount of whitespace
#       after '#', so it matches however the commented default is indented
# note: mkdir -p keeps the step from failing when /run/sshd already exists
RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/g' \
        /etc/ssh/sshd_config \
    && sed -i 's/#StrictModes yes/StrictModes no/g' \
        /etc/ssh/sshd_config \
    && sed -i 's/#[[:space:]]*StrictHostKeyChecking ask/    StrictHostKeyChecking no/g' \
        /etc/ssh/ssh_config \
    && mkdir -p /run/sshd \
    && chmod u=rwx,go=rx /run/sshd
|
|
|
|
|
|
|
|
|
|
# export the component versions as image labels
ARG UCX_VERSION
ARG OPENMPI_VERSION
ARG CONDA_VERSION
ARG NUMPY_VERSION
ARG ONNX_VERSION
ARG PYTORCH_VERSION
LABEL UCX_VERSION=${UCX_VERSION} \
      OPENMPI_VERSION=${OPENMPI_VERSION} \
      CONDA_VERSION=${CONDA_VERSION} \
      NUMPY_VERSION=${NUMPY_VERSION} \
      ONNX_VERSION=${ONNX_VERSION} \
      PYTORCH_VERSION=${PYTORCH_VERSION}
|
|
|
|
|
|
|
|
|
|
# clean/finalize environment: drop cmake and build-essential, autoremove
# leftover packages, and delete the /stage scratch directory
RUN conda remove -y cmake \
    && apt-get purge -y build-essential \
    && apt-get autoremove -y \
    && rm -fr /stage

WORKDIR /workspace

# note: adds onnxruntime license and third party notices
COPY --from=builder /stage/*.txt /workspace/
|