Fix Linux Multi GPU build pipeline (#16368)

### Description
The build pipeline runs on Azure NV12 machines that will be deprecated
soon because the SKU is too old. So this PR will move the pipeline to a
Windows machine with two A10 GPUs.
This commit is contained in:
Changming Sun 2023-06-15 16:24:46 -07:00 committed by GitHub
parent 5754cd7d1d
commit 188d5f5398
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 33 additions and 70 deletions

View file

@ -684,6 +684,12 @@ TEST_P(ModelTest, Run) {
ASSERT_ORT_STATUS_OK(OrtApis::CreateCUDAProviderOptions(&cuda_options));
std::unique_ptr<OrtCUDAProviderOptionsV2, decltype(&OrtApis::ReleaseCUDAProviderOptions)> rel_cuda_options(
cuda_options, &OrtApis::ReleaseCUDAProviderOptions);
std::vector<const char*> keys{"device_id"};
std::vector<const char*> values;
std::string device_id = Env::Default().GetEnvironmentVar("ONNXRUNTIME_TEST_GPU_DEVICE_ID");
values.push_back(device_id.empty() ? "0" : device_id.c_str());
ASSERT_ORT_STATUS_OK(OrtApis::UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), 1));
ortso.AppendExecutionProvider_CUDA_V2(*cuda_options);
} else if (provider_name == "rocm") {
OrtROCMProviderOptions ep_options;

View file

@ -1,70 +0,0 @@
resources:
repositories:
- repository: manylinux # The name used to reference this repository in the checkout step
type: Github
endpoint: Microsoft
name: pypa/manylinux
ref: 5eda9aded5462201e6310105728d33016e637ea7
variables:
- template: templates/common-variables.yml
jobs:
- job: Linux_Build
timeoutInMinutes: 180
workspace:
clean: all
pool: Linux-Multi-GPU
steps:
- checkout: self
clean: true
submodules: none
- template: templates/get-docker-image-steps.yml
parameters:
Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11
Context: tools/ci_build/github/linux/docker
DockerBuildArgs: "--network=host --build-arg POLICY=manylinux2014 --build-arg PLATFORM=x86_64 --build-arg BASEIMAGE=${{variables.common_cuda_baseimg}} --build-arg DEVTOOLSET_ROOTPATH=/opt/rh/devtoolset-11/root --build-arg PREPEND_PATH=/opt/rh/devtoolset-11/root/usr/bin: --build-arg LD_LIBRARY_PATH_ARG=/opt/rh/devtoolset-11/root/usr/lib64:/opt/rh/devtoolset-11/root/usr/lib:/opt/rh/devtoolset-11/root/usr/lib64/dyninst:/opt/rh/devtoolset-11/root/usr/lib/dyninst:/usr/local/lib64 --build-arg BUILD_UID=$( id -u )"
Repository: onnxruntimecuda11build
- task: CmdLine@2
inputs:
script: |
mkdir -p $HOME/.onnx
docker run --gpus all -e CC=/opt/rh/devtoolset-11/root/usr/bin/cc -e CXX=/opt/rh/devtoolset-11/root/usr/bin/c++ -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" --rm \
--volume /data/onnx:/data/onnx:ro \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
--volume /data/models:/build/models:ro \
--volume $HOME/.onnx:/home/onnxruntimedev/.onnx \
-e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \
-e NIGHTLY_BUILD \
-e BUILD_BUILDNUMBER \
onnxruntimecuda11build \
/opt/python/cp38-cp38/bin/python3.8 /onnxruntime_src/tools/ci_build/build.py \
--build_dir /build --cmake_generator Ninja \
--config Release \
--skip_submodule_sync \
--build_shared_lib \
--parallel \
--build_wheel \
--enable_onnx_tests --use_cuda --cuda_version=${{variables.common_cuda_version}} --cuda_home=/usr/local/cuda-${{variables.common_cuda_version}} --cudnn_home=/usr/local/cuda-${{variables.common_cuda_version}} \
--enable_pybind --build_java --build_nodejs --enable_multi_device_test \
--cmake_extra_defines CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-11/root/usr/bin/cc CMAKE_CUDA_ARCHITECTURES=52
workingDirectory: $(Build.SourcesDirectory)
- task: PublishTestResults@2
displayName: 'Publish unit test results'
inputs:
testResultsFiles: '**/*.results.xml'
searchFolder: '$(Build.BinariesDirectory)'
testRunTitle: 'Unit Test Run'
condition: succeededOrFailed()
- template: templates/component-governance-component-detection-steps.yml
parameters:
condition: 'succeeded'
- task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
displayName: 'Clean Agent Directories'
condition: always()

View file

@ -59,6 +59,27 @@ stages:
buildNodejs: true
ort_build_pool_name: 'onnxruntime-Win-CPU-2022'
- ${{ if or(startsWith(variables['System.CollectionUri'], 'https://dev.azure.com/aiinfra/'),startsWith(variables['System.CollectionUri'], 'https://aiinfra.visualstudio.com/')) }}:
# The settings below is the same as Windows GPU CI pipeline's CUDA job except here we set OnnxruntimeTestGpuDeviceId to 1
- stage: cuda_multi_gpu
dependsOn: []
jobs:
- template: templates/jobs/win-ci-vs-2022-job.yml
parameters:
BuildConfig: 'RelWithDebInfo'
EnvSetupScript: setup_env_cuda_11.bat
buildArch: x64
additionalBuildFlags: --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --enable_cuda_profiling --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
msbuildPlatform: x64
isX86: false
job_name_suffix: x64_RelWithDebInfo
RunOnnxRuntimeTests: true
RunStaticCodeAnalysis: false
ORT_EP_NAME: CUDA
WITH_CACHE: true
MachinePool: onnxruntime-Win2022-GPU-MultiA10
OnnxruntimeTestGpuDeviceId: 1
- stage: Mimalloc
dependsOn: [ ]
jobs:

View file

@ -55,6 +55,11 @@ parameters:
type: boolean
default: false
- name: OnnxruntimeTestGpuDeviceId
type: number
default: 0
jobs:
- job: build_${{ parameters.job_name_suffix }}
variables:
@ -69,6 +74,7 @@ jobs:
DEPS_CACHE_DIR: $(Agent.TempDirectory)/deps_ccache
ORT_CACHE_DIR: $(Agent.TempDirectory)/ort_ccache
TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)]
ONNXRUNTIME_TEST_GPU_DEVICE_ID: ${{ parameters.OnnxruntimeTestGpuDeviceId }}
${{ if eq(parameters.WITH_CACHE, true) }}:
PS_CACHE_ARG: '-use_cache'
PY_CACHE_ARG: '--use_cache'