From 188d5f5398936d35649c2a6cdcaa76b3735c1653 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Thu, 15 Jun 2023 16:24:46 -0700 Subject: [PATCH] Fix Linux Multi GPU build pipeline (#16368) ### Description The build pipeline runs on Azure NV12 machines that will be deprecated soon because the SKU is too old. So this PR will move the pipeline to a Windows machine with two A10 GPUs. --- onnxruntime/test/providers/cpu/model_tests.cc | 6 ++ .../linux-multi-gpu-ci-pipeline.yml | 70 ------------------- .../azure-pipelines/post-merge-jobs.yml | 21 ++++++ .../templates/jobs/win-ci-vs-2022-job.yml | 6 ++ 4 files changed, 33 insertions(+), 70 deletions(-) delete mode 100644 tools/ci_build/github/azure-pipelines/linux-multi-gpu-ci-pipeline.yml diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc index 6c5d1399f5..80b48c56b0 100644 --- a/onnxruntime/test/providers/cpu/model_tests.cc +++ b/onnxruntime/test/providers/cpu/model_tests.cc @@ -684,6 +684,12 @@ TEST_P(ModelTest, Run) { ASSERT_ORT_STATUS_OK(OrtApis::CreateCUDAProviderOptions(&cuda_options)); std::unique_ptr rel_cuda_options( cuda_options, &OrtApis::ReleaseCUDAProviderOptions); + std::vector keys{"device_id"}; + + std::vector values; + std::string device_id = Env::Default().GetEnvironmentVar("ONNXRUNTIME_TEST_GPU_DEVICE_ID"); + values.push_back(device_id.empty() ? 
"0" : device_id.c_str()); + ASSERT_ORT_STATUS_OK(OrtApis::UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), 1)); ortso.AppendExecutionProvider_CUDA_V2(*cuda_options); } else if (provider_name == "rocm") { OrtROCMProviderOptions ep_options; diff --git a/tools/ci_build/github/azure-pipelines/linux-multi-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-multi-gpu-ci-pipeline.yml deleted file mode 100644 index bf845cbf32..0000000000 --- a/tools/ci_build/github/azure-pipelines/linux-multi-gpu-ci-pipeline.yml +++ /dev/null @@ -1,70 +0,0 @@ -resources: - repositories: - - repository: manylinux # The name used to reference this repository in the checkout step - type: Github - endpoint: Microsoft - name: pypa/manylinux - ref: 5eda9aded5462201e6310105728d33016e637ea7 - -variables: - - template: templates/common-variables.yml - -jobs: -- job: Linux_Build - timeoutInMinutes: 180 - workspace: - clean: all - pool: Linux-Multi-GPU - steps: - - checkout: self - clean: true - submodules: none - - - template: templates/get-docker-image-steps.yml - parameters: - Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11 - Context: tools/ci_build/github/linux/docker - DockerBuildArgs: "--network=host --build-arg POLICY=manylinux2014 --build-arg PLATFORM=x86_64 --build-arg BASEIMAGE=${{variables.common_cuda_baseimg}} --build-arg DEVTOOLSET_ROOTPATH=/opt/rh/devtoolset-11/root --build-arg PREPEND_PATH=/opt/rh/devtoolset-11/root/usr/bin: --build-arg LD_LIBRARY_PATH_ARG=/opt/rh/devtoolset-11/root/usr/lib64:/opt/rh/devtoolset-11/root/usr/lib:/opt/rh/devtoolset-11/root/usr/lib64/dyninst:/opt/rh/devtoolset-11/root/usr/lib/dyninst:/usr/local/lib64 --build-arg BUILD_UID=$( id -u )" - Repository: onnxruntimecuda11build - - - task: CmdLine@2 - inputs: - script: | - mkdir -p $HOME/.onnx - docker run --gpus all -e CC=/opt/rh/devtoolset-11/root/usr/bin/cc -e CXX=/opt/rh/devtoolset-11/root/usr/bin/c++ -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 
-Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" --rm \ - --volume /data/onnx:/data/onnx:ro \ - --volume $(Build.SourcesDirectory):/onnxruntime_src \ - --volume $(Build.BinariesDirectory):/build \ - --volume /data/models:/build/models:ro \ - --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \ - -e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \ - -e NIGHTLY_BUILD \ - -e BUILD_BUILDNUMBER \ - onnxruntimecuda11build \ - /opt/python/cp38-cp38/bin/python3.8 /onnxruntime_src/tools/ci_build/build.py \ - --build_dir /build --cmake_generator Ninja \ - --config Release \ - --skip_submodule_sync \ - --build_shared_lib \ - --parallel \ - --build_wheel \ - --enable_onnx_tests --use_cuda --cuda_version=${{variables.common_cuda_version}} --cuda_home=/usr/local/cuda-${{variables.common_cuda_version}} --cudnn_home=/usr/local/cuda-${{variables.common_cuda_version}} \ - --enable_pybind --build_java --build_nodejs --enable_multi_device_test \ - --cmake_extra_defines CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-11/root/usr/bin/cc CMAKE_CUDA_ARCHITECTURES=52 - workingDirectory: $(Build.SourcesDirectory) - - - task: PublishTestResults@2 - displayName: 'Publish unit test results' - inputs: - testResultsFiles: '**/*.results.xml' - searchFolder: '$(Build.BinariesDirectory)' - testRunTitle: 'Unit Test Run' - condition: succeededOrFailed() - - - template: templates/component-governance-component-detection-steps.yml - parameters: - condition: 'succeeded' - - - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 - displayName: 'Clean Agent Directories' - condition: always() diff --git a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml index 4cee4def11..5880608e85 100644 --- 
a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml +++ b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml @@ -59,6 +59,27 @@ stages: buildNodejs: true ort_build_pool_name: 'onnxruntime-Win-CPU-2022' +- ${{ if or(startsWith(variables['System.CollectionUri'], 'https://dev.azure.com/aiinfra/'),startsWith(variables['System.CollectionUri'], 'https://aiinfra.visualstudio.com/')) }}: + # The settings below are the same as the Windows GPU CI pipeline's CUDA job, except that here we set OnnxruntimeTestGpuDeviceId to 1 + - stage: cuda_multi_gpu + dependsOn: [] + jobs: + - template: templates/jobs/win-ci-vs-2022-job.yml + parameters: + BuildConfig: 'RelWithDebInfo' + EnvSetupScript: setup_env_cuda_11.bat + buildArch: x64 + additionalBuildFlags: --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --enable_cuda_profiling --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + msbuildPlatform: x64 + isX86: false + job_name_suffix: x64_RelWithDebInfo + RunOnnxRuntimeTests: true + RunStaticCodeAnalysis: false + ORT_EP_NAME: CUDA + WITH_CACHE: true + MachinePool: onnxruntime-Win2022-GPU-MultiA10 + OnnxruntimeTestGpuDeviceId: 1 + - stage: Mimalloc dependsOn: [ ] jobs: diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml index 9b7c3fc327..ca118fedfb 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml @@ -55,6 +55,11 @@ parameters: type: boolean default: false +- name: OnnxruntimeTestGpuDeviceId + type: number + default: 0 + + jobs: - job: build_${{ parameters.job_name_suffix }} variables: @@ -69,6 +74,7 @@ jobs: DEPS_CACHE_DIR: $(Agent.TempDirectory)/deps_ccache ORT_CACHE_DIR: $(Agent.TempDirectory)/ort_ccache TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] + ONNXRUNTIME_TEST_GPU_DEVICE_ID: ${{ 
parameters.OnnxruntimeTestGpuDeviceId }} ${{ if eq(parameters.WITH_CACHE, true) }}: PS_CACHE_ARG: '-use_cache' PY_CACHE_ARG: '--use_cache'