Docker image release pipeline (#4682)

* create orttraining-1p-linux-gpu-ci-pipeline.yml

* fix syntax

* fix file path

* fix template path

* publish docker image to test acr

* use right task name

* change parameter list

* use variables

* use python.version

* remove --enable_onnx_tests due to segfault

* add back --enable_onnx_tests

* fix docker push command line

* change docker login command

* login differently

* fix docker tag script

* create password.txt

* add ortrelease docker image

* enable test in build.sh

* add pipeline parameter

* add pipeline parameter

* change timeout

* change timeout

* fix run_dockerbuild.sh

* use PR checkin build docker

* fix strategy syntax

* fix strategy syntax

* change dockerfile

* change run_dockerbuild.sh

* change tag name

* build with root user

* use build id for docker image tag

* remove all user lines

* change docker tag

* add mpi, mellanox

* add missing args

* use release dockerfile for ci build

* remove install wheel

* use release docker image

* fix syntax

* use different pool

* add Dockerfile.training

* remove sudo to run on Linux-Multi-GPU-V100

* change docker file path

* update dockerfile

* use latest dockerfile

* change agent pool

* remove --preserve-env

* add back parameter

* Add test_flag

* use azuredevops docker

* change repository

* use cmd for docker login

* echo build script

* use ortrelease ACR

* change key vault connection

* Move --build flag

* change build command

* add parameter for image tag

* clean up for PR

* remove unnecessary changes

* whitespace changes

* whitespace changes

* change build flag

* change flag name

* change flag

* use latest dockerfile

* enable build tests

* build builder stage and run test

* Add back python.version

* change build directory

* always run build entire dockerfile

* fix yml syntax

* fix syntax

* add en-UTF8 locale

* rename

* remove unused template

* Update orttraining-linux-gpu-docker-release-pipeline.yml for Azure Pipelines

* Update orttraining-linux-gpu-docker-release-pipeline.yml for Azure Pipelines

* Test commit sha1 in pipeline

* fix parameter

* update docker file

* fix --from=build

* remove commented blocks

* PR comments

* fix syntax

* fix syntax

* use timestamp as build number

* remove latest tag

* add build_timestamp variable

* remove wrong property

* fix docker run command

* test build id

* Use datestamp build id

* change build tags

* add no-cache to docker build

* rename BUILD_VERSION -> BUILD_CONFIG

Co-authored-by: Jingyan Wang <jingywa@OrtDevTest2v100.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
Co-authored-by: Jingyan Wang <jingywa@OrtTrainingDev3.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
This commit is contained in:
jingyanwangms 2020-08-12 13:29:37 -07:00 committed by GitHub
parent 8a66ad79a6
commit adda8c66d9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 95 additions and 12 deletions

View file

@ -9,11 +9,12 @@ ARG NUMPY_VERSION=1.18.5
ARG ONNX_VERSION=1.7.0
ARG PYTORCH_VERSION=1.6.0
ARG BUILD_VERSION=Release
ARG BUILD_CONFIG=Release
ARG OPENMPI_PATH=/opt/openmpi-${OPENMPI_VERSION}
ARG COMMIT=master
# cuda development image for building sources
FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 as build
FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 as builder
# set location for builds
WORKDIR /stage
@ -22,8 +23,12 @@ WORKDIR /stage
RUN apt-get -y update &&\
apt-get -y --no-install-recommends install \
curl \
git
git \
language-pack-en
RUN locale-gen en_US.UTF-8 && \
update-locale LANG=en_US.UTF-8
# install miniconda (comes with python 3.7 default)
ARG CONDA_VERSION
ARG CONDA_URL=https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-x86_64.sh
@ -112,16 +117,17 @@ RUN pip install torch==${PYTORCH_VERSION}
# pip install build/wheel/*.whl
# build onnxruntime wheel with cuda and mpi support
ARG BUILD_VERSION
ARG BUILD_CONFIG
ARG COMMIT
RUN cd /stage && git clone https://github.com/microsoft/onnxruntime.git &&\
cd onnxruntime &&\
git checkout master &&\
git checkout ${COMMIT} &&\
cp ThirdPartyNotices.txt /stage/ThirdPartyNotices.txt &&\
cp dockerfiles/LICENSE-IMAGE.txt /stage/LICENSE-IMAGE.txt &&\
python tools/ci_build/build.py \
--cmake_extra_defines \
ONNXRUNTIME_VERSION=`cat ./VERSION_NUMBER` \
--config ${BUILD_VERSION} \
--config ${BUILD_CONFIG} \
--enable_training \
--mpi_home ${OPENMPI_PATH} \
--use_cuda \
@ -134,7 +140,7 @@ RUN cd /stage && git clone https://github.com/microsoft/onnxruntime.git &&\
--build \
--build_wheel \
--skip_tests &&\
pip install build/${BUILD_VERSION}/dist/*.whl
pip install build/${BUILD_CONFIG}/dist/*.whl
# switch to cuda runtime environment
# note: launch with --gpus all or nvidia-docker
@ -143,7 +149,7 @@ WORKDIR /stage
# install ucx
# note: launch with --cap-add=sys_nice to avoid 'mbind' warnings
COPY --from=build /opt/ucx /opt/ucx
COPY --from=builder /opt/ucx /opt/ucx
ENV PATH=/opt/ucx/bin:$PATH
ENV LD_LIBRARY_PATH=/opt/ucx/lib:$LD_LIBRARY_PATH
@ -152,7 +158,7 @@ ENV LD_LIBRARY_PATH=/opt/ucx/lib:$LD_LIBRARY_PATH
# note: enforce openmpi select ucx or fail
ARG OPENMPI_VERSION
ARG OPENMPI_PATH
COPY --from=build ${OPENMPI_PATH} ${OPENMPI_PATH}
COPY --from=builder ${OPENMPI_PATH} ${OPENMPI_PATH}
ENV PATH=${OPENMPI_PATH}/bin:$PATH
ENV LD_LIBRARY_PATH=${OPENMPI_PATH}/lib:$LD_LIBRARY_PATH
ENV OMPI_ALLOW_RUN_AS_ROOT=1
@ -166,7 +172,7 @@ RUN apt-get -y update && apt-get -y --no-install-recommends install \
ldconfig
# copy conda environment (includes numpy, mpi4py, pytorch, onnxruntime)
COPY --from=build /opt/conda /opt/conda
COPY --from=builder /opt/conda /opt/conda
ENV PATH=/opt/conda/bin:${PATH}
# make ssh/sshd less strict for wiring containers on Azure VM scale set
@ -203,4 +209,4 @@ RUN conda remove -y cmake &&\
apt-get autoremove -y &&\
rm -fr /stage
WORKDIR /workspace
COPY --from=build /stage/*.txt /workspace/
COPY --from=builder /stage/*.txt /workspace/

View file

@ -0,0 +1,77 @@
# Runtime parameters for this pipeline.
parameters:
# Git ref of onnxruntime to build. Forwarded to docker as the COMMIT
# build-arg by the build steps below.
- name: commit
  displayName: Commit (Default to master or use sha1)
  type: string
  default: master
# Extra tag applied to the built and pushed image, in addition to
# $(Build.BuildNumber).
- name: image_tag
  displayName: Image Tag
  type: string
  default: latest
variables:
  # Prefix of the local tag given to the intermediate (builder stage) image.
  docker_image_prefix: onnxruntime-training
  # Dockerfile path, relative to the directory the docker steps run in.
  linux_gpu_dockerfile: dockerfiles/Dockerfile.training
  # Passed to docker as the BUILD_CONFIG build-arg and to build.py as --config.
  build_config: Release

# Run (build) number format; the docker tasks below use $(Build.BuildNumber)
# as one of the image tags.
name: $(Date:yyyyMMdd)$(Rev:.r)
jobs:
# Single job: build the builder stage, run the tests inside it, then build
# and push the complete image.
- job: Linux_py_GPU_Build_Test_Release_Dockerfile
  timeoutInMinutes: 90
  workspace:
    clean: all
  pool: Linux-GPU-CUDA10
  steps:
  # Step 1: build only the `builder` stage (--target builder), without the
  # layer cache, and tag it locally as <docker_image_prefix>-...-stage1.
  # NOTE(review): ${{ parameters.commit }} is expanded into the script text
  # before the shell runs it; if untrusted users can queue this pipeline,
  # consider passing it through `env:` instead - confirm the threat model.
  - task: CmdLine@2
    displayName: Build builder stage of docker file
    inputs:
      script: |
        docker build \
          --pull \
          -t ${{ variables.docker_image_prefix }}-manylinux-gpu-release-stage1 \
          --target builder \
          --no-cache \
          --build-arg COMMIT="${{ parameters.commit }}" \
          --build-arg BUILD_CONFIG="${{ variables.build_config }}" \
          -f ${{ variables.linux_gpu_dockerfile }} .
      workingDirectory: $(Build.SourcesDirectory)

  # Step 2: run build.py in test mode (--test --enable_onnx_tests) inside the
  # stage-1 image with all GPUs exposed; the container is removed afterwards.
  - task: CmdLine@2
    displayName: Run tests
    inputs:
      script: |
        docker run \
          --gpus all \
          --rm \
          ${{ variables.docker_image_prefix }}-manylinux-gpu-release-stage1 \
          python onnxruntime/tools/ci_build/build.py \
            --build_dir onnxruntime/build \
            --config ${{ variables.build_config }} \
            --test \
            --enable_onnx_tests
      workingDirectory: $(Build.SourcesDirectory)

  # Step 3: build the Dockerfile with no --target, using the same COMMIT and
  # BUILD_CONFIG build-args, tagged with the build number and image_tag.
  - task: Docker@2
    displayName: Build entire docker file
    inputs:
      command: build
      containerRegistry: 'ortrelease'
      repository: 'onnxruntime-training'
      arguments: >-
        --build-arg COMMIT="${{ parameters.commit }}"
        --build-arg BUILD_CONFIG="${{ variables.build_config }}"
      Dockerfile: ${{ variables.linux_gpu_dockerfile }}
      tags: |
        $(Build.BuildNumber)
        ${{ parameters.image_tag }}

  # Step 4: push both tags produced by the previous step to the registry
  # configured under 'ortrelease'.
  - task: Docker@2
    displayName: Push docker image
    inputs:
      command: push
      containerRegistry: 'ortrelease'
      repository: 'onnxruntime-training'
      tags: |
        $(Build.BuildNumber)
        ${{ parameters.image_tag }}

  - template: templates/component-governance-component-detection-steps.yml
  - template: templates/clean-agent-build-directory-step.yml