From 0d1da41ca82a0e90f71e987c25ef196a97f83c51 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Tue, 6 Aug 2024 21:37:09 +0800 Subject: [PATCH] Fix docker image layer caching to avoid redundant docker building and transient connection exceptions. (#21612) ### Description Improve docker commands to make docker image layer caching works. It can make docker building faster and more stable. So far, A100 pool's system disk is too small to use docker cache. We won't use pipeline cache for docker image and remove some legacy code. ### Motivation and Context There are often an exception of ``` 64.58 + curl https://nodejs.org/dist/v18.17.1/node-v18.17.1-linux-x64.tar.gz -sSL --retry 5 --retry-delay 30 --create-dirs -o /tmp/src/node-v18.17.1-linux-x64.tar.gz --fail 286.4 curl: (92) HTTP/2 stream 0 was not closed cleanly: INTERNAL_ERROR (err 2) ``` Because Onnxruntime pipeline have been sending too many requests to download Nodejs in docker building. Which is the major reason of pipeline failing now In fact, docker image layer caching never works. We can always see the scrips are still running ``` #9 [3/5] RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts #9 0.234 /bin/sh: warning: setlocale: LC_ALL: cannot change locale (en_US.UTF-8) #9 0.235 /bin/sh: warning: setlocale: LC_ALL: cannot change locale (en_US.UTF-8) #9 0.235 /tmp/scripts/install_centos.sh: line 1: !/bin/bash: No such file or directory #9 0.235 ++ '[' '!' -f /etc/yum.repos.d/microsoft-prod.repo ']' #9 0.236 +++ tr -dc 0-9. #9 0.236 +++ cut -d . -f1 #9 0.238 ++ os_major_version=8 .... #9 60.41 + curl https://nodejs.org/dist/v18.17.1/node-v18.17.1-linux-x64.tar.gz -sSL --retry 5 --retry-delay 30 --create-dirs -o /tmp/src/node-v18.17.1-linux-x64.tar.gz --fail #9 60.59 + return 0 ... ``` This PR is improving the docker command to make image layer caching work. Thus, CI won't send so many redundant request of downloading NodeJS. ``` #9 [2/5] ADD scripts /tmp/scripts #9 CACHED #10 [3/5] RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts #10 CACHED #11 [4/5] RUN adduser --uid 1000 onnxruntimedev #11 CACHED #12 [5/5] WORKDIR /home/onnxruntimedev #12 CACHED ``` ###Reference https://docs.docker.com/build/drivers/ --------- Co-authored-by: Yi Zhang --- tools/ci_build/get_docker_image.py | 24 ++----- .../azure-pipelines/bigmodels-ci-pipeline.yml | 1 + .../templates/c-api-linux-cpu.yml | 8 +-- .../templates/get-docker-image-steps.yml | 64 ++++++------------- .../inference/aarch64/default/cpu/Dockerfile | 2 +- .../inference/x86_64/default/cpu/Dockerfile | 2 +- 6 files changed, 32 insertions(+), 69 deletions(-) diff --git a/tools/ci_build/get_docker_image.py b/tools/ci_build/get_docker_image.py index 99ecaf677f..a3f603b0be 100755 --- a/tools/ci_build/get_docker_image.py +++ b/tools/ci_build/get_docker_image.py @@ -98,17 +98,19 @@ def main(): ) if use_container_registry: + run(args.docker_path, "buildx", "create", "--driver=docker-container", "--name=container_builder") run( args.docker_path, "--log-level", "error", "buildx", "build", - "--push", + "--load", "--tag", full_image_name, - "--cache-from", - full_image_name, + "--cache-from=type=registry,ref=" + full_image_name, + "--builder", + "container_builder", "--build-arg", "BUILDKIT_INLINE_CACHE=1", *shlex.split(args.docker_build_args), @@ -116,24 +118,10 @@ def main(): args.dockerfile, args.context, ) - elif args.use_imagecache: - log.info("Building image with pipeline cache...") run( args.docker_path, - "--log-level", - "error", - "buildx", - "build", - "--tag", + "push", full_image_name, - "--cache-from", - full_image_name, - "--build-arg", - "BUILDKIT_INLINE_CACHE=1", - *shlex.split(args.docker_build_args), - "-f", - args.dockerfile, - args.context, ) else: log.info("Building image...") diff --git a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml index a66828ee5e..4a3532dd57 100644 --- a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml @@ -321,6 +321,7 @@ stages: --build-arg TRT_VERSION=${{ variables.linux_trt_version }} " Repository: onnxruntimeubi8packagestest_torch + UseImageCacheContainerRegistry: false UpdateDepsTxt: false - task: DownloadPackage@1 diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml index e2b71c5c55..0f4328f75e 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml @@ -51,15 +51,15 @@ jobs: Dockerfile: tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile Context: tools/ci_build/github/linux/docker/inference/x86_64/default/cpu DockerBuildArgs: "--build-arg BUILD_UID=$( id -u ) --build-arg BASEIMAGE=${{parameters.BaseImage}}" - Repository: onnxruntimecpubuildcentos8${{parameters.OnnxruntimeArch}} - + Repository: onnxruntimecpubuildcentos8${{parameters.OnnxruntimeArch}}_packaging + - ${{ if eq(parameters.OnnxruntimeArch, 'aarch64') }}: - template: get-docker-image-steps.yml parameters: Dockerfile: tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile Context: tools/ci_build/github/linux/docker/inference/aarch64/default/cpu DockerBuildArgs: "--build-arg BUILD_UID=$( id -u ) --build-arg BASEIMAGE=${{parameters.BaseImage}}" - Repository: onnxruntimecpubuildcentos8${{parameters.OnnxruntimeArch}} + Repository: onnxruntimecpubuildcentos8${{parameters.OnnxruntimeArch}}_packaging UpdateDepsTxt: false - task: CmdLine@2 @@ -67,7 +67,7 @@ jobs: script: | mkdir -p $HOME/.onnx docker run --rm --volume /data/onnx:/data/onnx:ro --volume $(Build.SourcesDirectory):/onnxruntime_src --volume $(Build.BinariesDirectory):/build \ - --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimecpubuildcentos8${{parameters.OnnxruntimeArch}} /bin/bash -c "python3.9 \ + --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimecpubuildcentos8${{parameters.OnnxruntimeArch}}_packaging /bin/bash -c "python3.9 \ /onnxruntime_src/tools/ci_build/build.py --enable_lto --build_java --build_nodejs --build_dir /build --config Release \ --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib ${{ parameters.AdditionalBuildFlags }} && cd /build/Release && make install DESTDIR=/build/installed" workingDirectory: $(Build.SourcesDirectory) diff --git a/tools/ci_build/github/azure-pipelines/templates/get-docker-image-steps.yml b/tools/ci_build/github/azure-pipelines/templates/get-docker-image-steps.yml index 94cdf042ec..5b6769685a 100644 --- a/tools/ci_build/github/azure-pipelines/templates/get-docker-image-steps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/get-docker-image-steps.yml @@ -53,6 +53,7 @@ steps: displayName: patch manylinux - script: | + docker version docker image ls docker system df displayName: Check Docker Images @@ -71,52 +72,25 @@ steps: displayName: "Get ${{ parameters.Repository }} image for ${{ parameters.Dockerfile }}" ContainerRegistry: onnxruntimebuildcache - ${{ if eq(parameters.UseImageCacheContainerRegistry, false) }}: - - task: Cache@2 - displayName: Cache Docker Image Task - inputs: - key: ' "${{ parameters.Repository }}" | "$(Build.SourceVersion)" ' - path: ${{ parameters.IMAGE_CACHE_DIR }} - restoreKeys: | - "${{ parameters.Repository }}" | "$(Build.SourceVersion)" - "${{ parameters.Repository }}" - cacheHitVar: CACHE_RESTORED - condition: eq('${{ parameters.UsePipelineCache }}', 'true') + # the difference is no --container-registry + - template: with-container-registry-steps.yml + parameters: + Steps: + - script: | + ${{ parameters.ScriptName }} \ + --dockerfile "${{ parameters.Dockerfile }}" \ + --context "${{ parameters.Context }}" \ + --docker-build-args "${{ parameters.DockerBuildArgs }}" \ + --repository "${{ parameters.Repository }}" + displayName: "Get ${{ parameters.Repository }} image for ${{ parameters.Dockerfile }}" + ContainerRegistry: onnxruntimebuildcache - - script: | - test -f ${{ parameters.IMAGE_CACHE_DIR }}/cache.tar && docker load -i ${{ parameters.IMAGE_CACHE_DIR }}/cache.tar - docker image ls - displayName: Docker restore - condition: eq('${{ parameters.UsePipelineCache }}', 'true') - - - script: | - if [ ${{ parameters.UsePipelineCache}} ] - then - use_imagecache="--use_imagecache" - else - use_imagecache="" - fi - ${{ parameters.ScriptName }} \ - --dockerfile "${{ parameters.Dockerfile }}" \ - --context "${{ parameters.Context }}" \ - --docker-build-args "${{ parameters.DockerBuildArgs }}" \ - --repository "${{ parameters.Repository }}" \ - $use_imagecache - displayName: "Get ${{ parameters.Repository }} image for ${{ parameters.Dockerfile }}" - - - script: | - set -ex - mkdir -p "${{ parameters.IMAGE_CACHE_DIR }}" - docker save -o "${{ parameters.IMAGE_CACHE_DIR }}/cache.tar" ${{ parameters.Repository }} - docker image ls - docker system df - displayName: Docker save - condition: eq('${{ parameters.UsePipelineCache }}', 'true') - - - script: | - echo ${{ parameters.IMAGE_CACHE_DIR }} - ls -lah ${{ parameters.IMAGE_CACHE_DIR }} - displayName: Display docker dir - condition: eq('${{ parameters.UsePipelineCache }}', 'true') +- script: | + docker version + docker image ls + docker system df + df -h + displayName: Check Docker Images - ${{ if and(eq(parameters.UpdateDepsTxt, true), or(eq(variables['System.CollectionId'], 'f3ad12f2-e480-4533-baf2-635c95467d29'),eq(variables['System.CollectionId'], 'bc038106-a83b-4dab-9dd3-5a41bc58f34c'))) }}: - task: PythonScript@0 diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile index 2cd054e624..ca00050121 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile @@ -5,7 +5,7 @@ ARG BASEIMAGE=arm64v8/almalinux:8 FROM $BASEIMAGE -ENV PATH /opt/rh/gcc-toolset-12/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin +ENV PATH=/opt/rh/gcc-toolset-12/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin ENV LANG=en_US.UTF-8 ENV LC_ALL=en_US.UTF-8 diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile index caf9583807..ef28dde676 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile @@ -5,7 +5,7 @@ ARG BASEIMAGE=amd64/almalinux:8 FROM $BASEIMAGE -ENV PATH /usr/lib/jvm/msopenjdk-11/bin:/opt/rh/gcc-toolset-12/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin +ENV PATH=/usr/lib/jvm/msopenjdk-11/bin:/opt/rh/gcc-toolset-12/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin ENV LANG=en_US.UTF-8 ENV LC_ALL=en_US.UTF-8 ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-11