From adda8c66d997aedaeaff7c6f9d80eed0b6edaa92 Mon Sep 17 00:00:00 2001
From: jingyanwangms <47403504+jingyanwangms@users.noreply.github.com>
Date: Wed, 12 Aug 2020 13:29:37 -0700
Subject: [PATCH] Docker image release pipeline (#4682)

* create orttraining-1p-linux-gpu-ci-pipeline.yml
* fix syntax
* fix file path
* fix template path
* publish docker image to test acr
* use right task name
* change parameter list
* use variables
* use python.version
* remove --enable_onnx_tests due to segfault
* add back --enable_onnx_tests
* fix docker push command line
* change docker login command
* login differently
* fix docker tag script
* create password.txt
* add ortrelease docker image
* enable test in build.sh
* add pipeline parameter
* add pipeline parameter
* change timeout
* change timeout
* fix run_dockerbuild.sh
* use PR checkin build docker
* fix strategy syntax
* fix strategy syntax
* change dockerfile
* change run_dockerbuild.sh
* change tag name
* build with root user
* use build id for docker image tag
* remove all user lines
* change docker tag
* add mpi, mellanox
* add missing args
* use release dockerfile for ci build
* remove install wheel
* use release docker image
* fix syntax
* use different pool
* add Dockerfile.training
* remove sudo to run on Linux-Multi-GPU-V100
* change docker file path
* update dockerfile
* use latest dockerfile
* change agent pool
* remove --preserve-env
* add back parameter
* Add test_flag
* use azuredevops docker
* change repository
* use cmd for docker login
* echo build script
* use ortrelease ACR
* change key vault connection
* Move --build flag
* change build command
* add parameter for image tag
* clean up for PR
* remove unnecessary changes
* whitespace changes
* whitespace changes
* change build flag
* change flag name
* change flag
* use latest dockerfile
* enable build tests
* build builder stage and run test
* Add back python.version
* change build directory
* always run build entire dockerfile
* fix yml syntax
* fix
syntax
* add en-UTF8 locale
* rename
* remove unused template
* Update orttraining-linux-gpu-docker-release-pipeline.yml for Azure Pipelines
* Update orttraining-linux-gpu-docker-release-pipeline.yml for Azure Pipelines
* Test commit sha1 in pipeline
* fix parameter
* update docker file
* fix --from=build
* remove commented blocks
* PR comments
* fix syntax
* fix syntax
* use timestamp as build number
* remove latest tag
* add build_timestamp variable
* remove wrong property
* fix docker run command
* test build id
* Use datestamp build id
* change build tags
* add no-cache to docker build
* rename BUILD_VERSION -> BUILD_CONFIG

Co-authored-by: Jingyan Wang
Co-authored-by: Jingyan Wang
---
 dockerfiles/Dockerfile.training               | 30 +++++---
 ...ning-linux-gpu-docker-release-pipeline.yml | 77 +++++++++++++++++++
 2 files changed, 95 insertions(+), 12 deletions(-)
 create mode 100644 tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-docker-release-pipeline.yml

diff --git a/dockerfiles/Dockerfile.training b/dockerfiles/Dockerfile.training
index be83807804..eb22fc2c56 100644
--- a/dockerfiles/Dockerfile.training
+++ b/dockerfiles/Dockerfile.training
@@ -9,11 +9,12 @@ ARG NUMPY_VERSION=1.18.5
 ARG ONNX_VERSION=1.7.0
 ARG PYTORCH_VERSION=1.6.0
 
-ARG BUILD_VERSION=Release
+ARG BUILD_CONFIG=Release
 ARG OPENMPI_PATH=/opt/openmpi-${OPENMPI_VERSION}
+ARG COMMIT=master
 
 # cuda development image for building sources
-FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 as build
+FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 as builder
 
 # set location for builds
 WORKDIR /stage
@@ -22,8 +23,12 @@ WORKDIR /stage
 RUN apt-get -y update &&\
     apt-get -y --no-install-recommends install \
         curl \
-        git
-
+        git \
+        language-pack-en
+
+RUN locale-gen en_US.UTF-8 && \
+    update-locale LANG=en_US.UTF-8
+
 # install miniconda (comes with python 3.7 default)
 ARG CONDA_VERSION
 ARG CONDA_URL=https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-x86_64.sh
@@ -112,16 +117,17 @@ RUN pip install torch==${PYTORCH_VERSION}
 # pip install build/wheel/*.whl
 
 # build onnxruntime wheel with cuda and mpi support
-ARG BUILD_VERSION
+ARG BUILD_CONFIG
+ARG COMMIT
 RUN cd /stage && git clone https://github.com/microsoft/onnxruntime.git &&\
     cd onnxruntime &&\
-    git checkout master &&\
+    git checkout ${COMMIT} &&\
     cp ThirdPartyNotices.txt /stage/ThirdPartyNotices.txt &&\
     cp dockerfiles/LICENSE-IMAGE.txt /stage/LICENSE-IMAGE.txt &&\
     python tools/ci_build/build.py \
       --cmake_extra_defines \
         ONNXRUNTIME_VERSION=`cat ./VERSION_NUMBER` \
-      --config ${BUILD_VERSION} \
+      --config ${BUILD_CONFIG} \
       --enable_training \
       --mpi_home ${OPENMPI_PATH} \
       --use_cuda \
@@ -134,7 +140,7 @@ RUN cd /stage && git clone https://github.com/microsoft/onnxruntime.git &&\
       --build \
       --build_wheel \
       --skip_tests &&\
-    pip install build/${BUILD_VERSION}/dist/*.whl
+    pip install build/${BUILD_CONFIG}/dist/*.whl
 
 # switch to cuda runtime environment
 # note: launch with --gpus all or nvidia-docker
@@ -143,7 +149,7 @@ WORKDIR /stage
 
 # install ucx
 # note: launch with --cap-add=sys_nice to avoid 'mbind' warnings
-COPY --from=build /opt/ucx /opt/ucx
+COPY --from=builder /opt/ucx /opt/ucx
 ENV PATH=/opt/ucx/bin:$PATH
 ENV LD_LIBRARY_PATH=/opt/ucx/lib:$LD_LIBRARY_PATH
 
@@ -152,7 +158,7 @@ ENV LD_LIBRARY_PATH=/opt/ucx/lib:$LD_LIBRARY_PATH
 # note: enforce openmpi select ucx or fail
 ARG OPENMPI_VERSION
 ARG OPENMPI_PATH
-COPY --from=build ${OPENMPI_PATH} ${OPENMPI_PATH}
+COPY --from=builder ${OPENMPI_PATH} ${OPENMPI_PATH}
 ENV PATH=${OPENMPI_PATH}/bin:$PATH
 ENV LD_LIBRARY_PATH=${OPENMPI_PATH}/lib:$LD_LIBRARY_PATH
 ENV OMPI_ALLOW_RUN_AS_ROOT=1
@@ -166,7 +172,7 @@ RUN apt-get -y update && apt-get -y --no-install-recommends install \
     ldconfig
 
 # copy conda environment (includes numpy, mpi4py, pytorch, onnxruntime)
-COPY --from=build /opt/conda /opt/conda
+COPY --from=builder /opt/conda /opt/conda
 ENV PATH=/opt/conda/bin:${PATH}
 
 # make ssh/sshd less strict for wiring containers on Azure VM scale set
@@ -203,4 +209,4 @@ RUN conda remove -y cmake &&\
     apt-get autoremove -y &&\
     rm -fr /stage
 WORKDIR /workspace
-COPY --from=build /stage/*.txt /workspace/
+COPY --from=builder /stage/*.txt /workspace/
diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-docker-release-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-docker-release-pipeline.yml
new file mode 100644
index 0000000000..9b34e93ca7
--- /dev/null
+++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-docker-release-pipeline.yml
@@ -0,0 +1,77 @@
+parameters:
+- name: commit
+  displayName: Commit (Default to master or use sha1)
+  type: string
+  default: master
+- name: image_tag
+  displayName: Image Tag
+  type: string
+  default: latest
+
+variables:
+  docker_image_prefix: onnxruntime-training
+  linux_gpu_dockerfile: dockerfiles/Dockerfile.training
+  build_config: Release
+
+name: $(Date:yyyyMMdd)$(Rev:.r)
+jobs:
+- job: Linux_py_GPU_Build_Test_Release_Dockerfile
+  timeoutInMinutes: 90
+  workspace:
+    clean: all
+  pool: Linux-GPU-CUDA10
+  steps:
+  - task: CmdLine@2
+    displayName: Build builder stage of docker file
+    inputs:
+      script: |
+        docker build \
+          --pull \
+          -t ${{ variables.docker_image_prefix }}-manylinux-gpu-release-stage1 \
+          --target builder \
+          --no-cache \
+          --build-arg COMMIT="${{ parameters.commit }}" \
+          --build-arg BUILD_CONFIG="${{ variables.build_config }}" \
+          -f ${{ variables.linux_gpu_dockerfile }} .
+      workingDirectory: $(Build.SourcesDirectory)
+
+  - task: CmdLine@2
+    displayName: Run tests
+    inputs:
+      script: |
+        docker run \
+          --gpus all \
+          --rm \
+          ${{ variables.docker_image_prefix }}-manylinux-gpu-release-stage1 \
+          python onnxruntime/tools/ci_build/build.py \
+            --build_dir onnxruntime/build \
+            --config ${{ variables.build_config }} \
+            --test \
+            --enable_onnx_tests
+      workingDirectory: $(Build.SourcesDirectory)
+
+  - task: Docker@2
+    displayName: Build entire docker file
+    inputs:
+      command: build
+      containerRegistry: 'ortrelease'
+      repository: 'onnxruntime-training'
+      arguments: --build-arg COMMIT="${{ parameters.commit }}" --build-arg BUILD_CONFIG="${{ variables.build_config }}"
+      Dockerfile: ${{ variables.linux_gpu_dockerfile }}
+      tags: |
+        $(Build.BuildNumber)
+        ${{ parameters.image_tag }}
+
+  - task: Docker@2
+    displayName: Push docker image
+    inputs:
+      command: push
+      containerRegistry: 'ortrelease'
+      repository: 'onnxruntime-training'
+      tags: |
+        $(Build.BuildNumber)
+        ${{ parameters.image_tag }}
+
+  - template: templates/component-governance-component-detection-steps.yml
+
+  - template: templates/clean-agent-build-directory-step.yml