mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-14 20:48:00 +00:00
Synchronize training dependency versions between Docker image and wheel, update docs, refactor build scripts.
223 lines
7.4 KiB
Text
223 lines
7.4 KiB
Text
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
# Licensed under the MIT License.
|
|
|
|
# Build arguments shared by both stages. These are declared before the first
# FROM, so every stage that needs one repeats `ARG NAME` (without a value) to
# bring the default below into its own scope.

# pinned component versions
ARG UCX_VERSION="1.8.0"
ARG OPENMPI_VERSION="4.0.4"
ARG CONDA_VERSION="4.7.10"
ARG NUMPY_VERSION="1.18.5"
ARG ONNX_VERSION="1.7.0"
ARG PYTORCH_VERSION="1.6.0"

# onnxruntime build configuration, OpenMPI install prefix (derived from the
# OpenMPI version above), and the onnxruntime git ref to check out
ARG BUILD_CONFIG="Release"
ARG OPENMPI_PATH="/opt/openmpi-${OPENMPI_VERSION}"
ARG COMMIT="master"
|
|
|
|
# stage "builder": cuda development image in which all sources are compiled
FROM nvidia/cuda:10.2-cudnn8-devel-ubuntu18.04 AS builder

# every download and build below happens under /stage
WORKDIR /stage
|
|
|
|
# base tooling: curl and git fetch the sources, the ssh client is required by
# MPI when running ORT tests, language-pack-en provides the locale data, and
# unattended-upgrades supplies the upgrade command used in the next step
RUN apt-get update -y \
 && apt-get install -y --no-install-recommends \
      curl \
      git \
      language-pack-en \
      openssh-client \
      unattended-upgrades

# update existing packages to minimize security vulnerabilities
RUN unattended-upgrade

# generate the en_US.UTF-8 locale and record it as LANG via update-locale
RUN locale-gen en_US.UTF-8 \
 && update-locale LANG=en_US.UTF-8
|
|
|
|
# install miniconda (comes with python 3.7 default) into /opt/conda and put it
# first on PATH
# note: the installer script is executed by this build, so it is downloaded
#       with normal TLS certificate verification (no --insecure); the ucx and
#       openmpi tarballs fetched later in this stage use verified https the
#       same way
ARG CONDA_VERSION
ARG CONDA_URL=https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-x86_64.sh
RUN cd /stage && curl -fSsL ${CONDA_URL} -o install-conda.sh &&\
    /bin/bash ./install-conda.sh -b -p /opt/conda &&\
    /opt/conda/bin/conda clean -ya
ENV PATH=/opt/conda/bin:${PATH}
|
|
|
|
# python-side prerequisites: setuptools, cmake and the pinned numpy come from
# conda; the pinned onnx comes from pip
ARG NUMPY_VERSION
ARG ONNX_VERSION
RUN conda install -y \
      setuptools \
      cmake \
      "numpy=${NUMPY_VERSION}" \
 && pip install "onnx==${ONNX_VERSION}"

# cerberus is needed by the new pytorch front-end
RUN pip install cerberus
|
|
|
|
# build the ucx suite from its release tarball and install it under /opt/ucx
# (configured against cuda and the system verbs libraries)
# note: --enable-mt is needed because openmpi will not select ucx without
#       multithreading enabled
ARG UCX_VERSION
ARG UCX_TARNAME=ucx-${UCX_VERSION}
ARG UCX_URL=https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}/${UCX_TARNAME}.tar.gz
RUN apt-get update -y \
 && apt-get install -y --no-install-recommends \
      libibverbs-dev \
      libnuma-dev \
 && cd /stage \
 && curl -fSsL ${UCX_URL} | tar -xzf - \
 && cd ${UCX_TARNAME} \
 && ./configure \
      --prefix=/opt/ucx \
      --with-cuda=/usr/local/cuda \
      --with-verbs=/usr/lib/x86_64-linux-gnu \
      --enable-mt \
 && make -j"$(nproc)" \
 && make install
|
|
|
|
# build openmpi from its release tarball (the prefix is /opt/openmpi-xxx so the
# whole install tree can be moved to the runtime image)
# note: --enable-orterun-prefix-by-default is required for Azure machine
#       learning compute
# note: verbs is disabled as we use ucx middleware and don't want btl openib
#       warnings
# note: OPENMPI_URL contains a %OMPI_BASE% placeholder; the RUN step replaces
#       it with the version stripped of its last dotted component
ARG OPENMPI_VERSION
ARG OPENMPI_PATH
ARG OPENMPI_TARNAME=openmpi-${OPENMPI_VERSION}
ARG OPENMPI_URL=https://download.open-mpi.org/release/open-mpi/v%OMPI_BASE%/${OPENMPI_TARNAME}.tar.gz
RUN export OMPI_BASE=${OPENMPI_VERSION%.*} \
 && cd /stage \
 && curl -fSsL $(echo ${OPENMPI_URL} | sed s/%OMPI_BASE%/$OMPI_BASE/) | tar -xzf - \
 && cd ${OPENMPI_TARNAME} \
 && ./configure \
      --prefix=${OPENMPI_PATH} \
      --with-ucx=/opt/ucx \
      --without-verbs \
      --with-cuda=/usr/local/cuda \
      --enable-mpirun-prefix-by-default \
      --enable-orterun-prefix-by-default \
      --enable-mca-no-build=btl-uct \
 && make -j"$(nproc)" install \
 && ldconfig

# expose the freshly installed openmpi binaries and libraries to later steps
ENV PATH=${OPENMPI_PATH}/bin:$PATH \
    LD_LIBRARY_PATH=${OPENMPI_PATH}/lib:$LD_LIBRARY_PATH
|
|
|
|
# build mpi4py from source (--no-binary) using the mpicc compiler wrapper, so
# that it links the existing /opt/openmpi-xxx
RUN CC=mpicc MPICC=mpicc pip install --no-binary mpi4py mpi4py

# install the pinned pytorch release from pip
ARG PYTORCH_VERSION
RUN pip install "torch==${PYTORCH_VERSION}"
|
|
|
|
# in case you need to build pytorch from source instead of installing the wheel
# (e.g. if you want a specific branch, or to link system cuda libraries or MPI):
# note: recommend using many high-frequency cores (e.g. 32+ skylake cores)
# ENV CUDA_HOME="/usr/local/cuda" \
#     CUDNN_LIBRARY="/usr/lib/x86_64-linux-gnu" \
#     NCCL_INCLUDE_DIR="/usr/include" \
#     NCCL_LIB_DIR="/usr/lib/x86_64-linux-gnu" \
#     USE_SYSTEM_NCCL=1
# RUN conda install -y \
#       mkl \
#       mkl-include \
#       ninja \
#       pyyaml \
#       cffi &&\
#     cd /stage && git clone https://github.com/pytorch/pytorch.git &&\
#     cd pytorch &&\
#     git checkout v1.6.0 &&\
#     git submodule update --init --recursive &&\
#     python setup.py bdist_wheel -d build/wheel &&\
#     pip install build/wheel/*.whl
|
|
|
|
# build the onnxruntime wheel with training, cuda and mpi support from the
# requested git ref, then install the resulting wheel with pip
# note: ThirdPartyNotices.txt and LICENSE-IMAGE.txt are copied to /stage so the
#       runtime stage can pick them up
# note: ONNXRUNTIME_VERSION is passed as the value of --cmake_extra_defines and
#       is read from the repository's VERSION_NUMBER file
ARG BUILD_CONFIG
ARG COMMIT
RUN cd /stage \
 && git clone https://github.com/microsoft/onnxruntime.git \
 && cd onnxruntime \
 && git checkout ${COMMIT} \
 && cp ThirdPartyNotices.txt /stage/ThirdPartyNotices.txt \
 && cp dockerfiles/LICENSE-IMAGE.txt /stage/LICENSE-IMAGE.txt \
 && python tools/ci_build/build.py \
      --cmake_extra_defines \
          ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) \
      --config ${BUILD_CONFIG} \
      --enable_training \
      --mpi_home ${OPENMPI_PATH} \
      --use_cuda \
      --cuda_home /usr/local/cuda \
      --cudnn_home /usr/lib/x86_64-linux-gnu/ \
      --nccl_home /usr/lib/x86_64-linux-gnu/ \
      --update \
      --parallel \
      --build_dir build \
      --build \
      --build_wheel \
      --skip_tests \
 && pip install build/${BUILD_CONFIG}/dist/*.whl
|
|
|
|
# AzureML support plus commonly used packages (all but azureml-defaults are
# pinned to exact versions)
RUN pip install \
      azureml-defaults \
      transformers==2.11.0 \
      msgpack==1.0.0 \
      tensorboardX==1.8 \
      tensorboard==2.3.0
|
|
|
|
# final stage: cuda runtime image that receives the artifacts built above
# note: launch with --gpus all or nvidia-docker
FROM nvidia/cuda:10.2-cudnn8-runtime-ubuntu18.04

# working directory for the setup steps of this stage
WORKDIR /stage
|
|
|
|
# bring in the ucx install tree produced by the builder stage and expose its
# binaries and libraries
# note: launch with --cap-add=sys_nice to avoid 'mbind' warnings
COPY --from=builder /opt/ucx /opt/ucx
ENV PATH=/opt/ucx/bin:$PATH \
    LD_LIBRARY_PATH=/opt/ucx/lib:$LD_LIBRARY_PATH
|
|
|
|
# bring in the openmpi install tree produced by the builder stage
ARG OPENMPI_VERSION
ARG OPENMPI_PATH
COPY --from=builder ${OPENMPI_PATH} ${OPENMPI_PATH}

# note: OMPI_ALLOW_RUN_AS_ROOT(_CONFIRM) permit mpirun as root for Azure
#       cluster submissions
# note: OMPI_MCA_pml=ucx enforces that openmpi selects ucx or fails
ENV PATH=${OPENMPI_PATH}/bin:$PATH \
    LD_LIBRARY_PATH=${OPENMPI_PATH}/lib:$LD_LIBRARY_PATH \
    OMPI_ALLOW_RUN_AS_ROOT=1 \
    OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 \
    OMPI_MCA_pml=ucx

# ssh server/client plus the verbs and numa packages, then refresh the
# dynamic linker cache
RUN apt-get update -y \
 && apt-get install -y --no-install-recommends \
      openssh-server \
      openssh-client \
      libibverbs-dev \
      libnuma-dev \
 && ldconfig
|
|
|
|
# bring in the conda environment from the builder stage (includes numpy,
# mpi4py, pytorch, onnxruntime) and put it first on PATH
COPY --from=builder /opt/conda /opt/conda
ENV PATH=/opt/conda/bin:${PATH}
|
|
|
|
# make ssh/sshd less strict for wiring containers on Azure VM scale set
# note: use 'service ssh start' to launch sshd (will fail if 22 in use)
# note: can also set port != 22 and set port=X in MPI hosts file
# note: need to setup password free ssh login between MPI hosts
# note: the StrictHostKeyChecking pattern tolerates any amount of whitespace
#       around the leading '#', so the edit still applies when the commented
#       default in ssh_config is indented differently (sed exits 0 on no match,
#       which would otherwise leave host key checking enabled unnoticed)
# note: mkdir -p keeps this step from failing if /run/sshd already exists
RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/g' \
      /etc/ssh/sshd_config &&\
    sed -i 's/#StrictModes yes/StrictModes no/g' \
      /etc/ssh/sshd_config &&\
    sed -i 's/^[[:space:]]*#[[:space:]]*StrictHostKeyChecking[[:space:]]\{1,\}ask/    StrictHostKeyChecking no/g' \
      /etc/ssh/ssh_config &&\
    mkdir -p /run/sshd &&\
    chmod u=rwx,go=rx /run/sshd
|
|
|
|
# record the component versions used for this build as image labels
ARG UCX_VERSION
ARG OPENMPI_VERSION
ARG CONDA_VERSION
ARG NUMPY_VERSION
ARG ONNX_VERSION
ARG PYTORCH_VERSION
LABEL UCX_VERSION=${UCX_VERSION} \
      OPENMPI_VERSION=${OPENMPI_VERSION} \
      CONDA_VERSION=${CONDA_VERSION} \
      NUMPY_VERSION=${NUMPY_VERSION} \
      ONNX_VERSION=${ONNX_VERSION} \
      PYTORCH_VERSION=${PYTORCH_VERSION}
|
|
|
|
# clean/finalize environment: remove cmake from the copied conda environment,
# purge build-essential and no-longer-needed packages, and delete the /stage
# scratch directory
RUN conda remove -y cmake \
 && apt-get purge -y build-essential \
 && apt-get autoremove -y \
 && rm -rf /stage

# default working directory of the image
# note: adds onnxruntime license and third party notices
WORKDIR /workspace
COPY --from=builder /stage/*.txt /workspace/
|