Split more Windows GPU workflows into 2 stages, building and testing, to make them more stable (#20080)

### Description
Refactor win-ci.yml to solve the random hang issue in more GPU workflows,
and move the building of the nuget-zip packages and the Python CUDA 12
packages to CPU machines.

---------

Co-authored-by: Yi Zhang <your@email.com>
This commit is contained in:
Yi Zhang 2024-03-28 12:55:44 +08:00 committed by GitHub
parent 16af7adc70
commit 8f069f81c4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 238 additions and 74 deletions

View file

@ -222,6 +222,8 @@ stages:
buildJava: true
java_artifact_id: onnxruntime_gpu
CudaVersion: 11.8
SpecificArtifact: ${{ parameters.SpecificArtifact }}
BuildId: ${{ parameters.BuildId }}
# CUDA with Tensorrt
- template: templates/win-ci.yml
@ -310,8 +312,8 @@ stages:
dependsOn:
- Linux_C_API_Packaging_GPU_x64
- Linux_C_API_Packaging_GPU_TensorRT_x64
- Windows_Packaging_gpu
- Windows_Packaging_tensorrt
- Windows_Packaging_gpu_Testing
- Windows_Packaging_tensorrt_Testing
- Download_Java_Tools
condition: succeeded()
jobs:
@ -488,8 +490,8 @@ stages:
- stage: Windows_Packaging_combined_GPU
dependsOn:
- Windows_Packaging_gpu
- Windows_Packaging_tensorrt
- Windows_Packaging_gpu_Testing
- Windows_Packaging_tensorrt_Testing
condition: succeeded()
jobs:
- job:
@ -582,9 +584,9 @@ stages:
- stage: NuGet_Packaging_GPU
dependsOn:
- Setup
- Windows_Packaging_gpu
- Windows_Packaging_gpu_Testing
- Windows_Packaging_CPU_x64_default
- Windows_Packaging_tensorrt
- Windows_Packaging_tensorrt_Testing
- Linux_C_API_Packaging_GPU_x64
- Linux_C_API_Packaging_GPU_TensorRT_x64
condition: succeeded()

View file

@ -143,6 +143,8 @@ stages:
CudaVersion: ${{ parameters.CudaVersion }}
win_trt_home: ${{ variables.win_trt_home }}
win_cuda_home: ${{ variables.win_cuda_home }}
SpecificArtifact: ${{ parameters.SpecificArtifact }}
BuildId: ${{ parameters.BuildId }}
- template: stages/nuget-combine-cuda-stage.yml
parameters:
DoCompliance: ${{ parameters.DoCompliance }}

View file

@ -17,8 +17,8 @@ stages:
- stage: NuGet_Packaging_GPU
dependsOn:
- Set_ReleaseVersionSuffix
- Windows_Packaging_gpu
- Windows_Packaging_tensorrt
- Windows_Packaging_gpu_Testing
- Windows_Packaging_tensorrt_Testing
- Linux_C_API_Packaging_CPU_x64
- Linux_C_API_Packaging_GPU_x64
- Linux_C_API_Packaging_GPU_TensorRT_x64

View file

@ -23,6 +23,16 @@ parameters:
- name: win_trt_home
type: string
- name: SpecificArtifact
displayName: Use Specific Artifact
type: boolean
default: false
- name: BuildId
displayName: Specific Artifact's BuildId
type: string
default: '0'
stages:
# Windows CUDA without TensorRT Packaging
- template: ../templates/win-ci.yml
@ -40,6 +50,8 @@ stages:
buildJava: false
java_artifact_id: onnxruntime_gpu
PublishProtoc: true
SpecificArtifact: ${{ parameters.SpecificArtifact }}
BuildId: ${{ parameters.BuildId }}
# Windows CUDA with TensorRT Packaging
- template: ../templates/win-ci.yml
parameters:
@ -56,12 +68,14 @@ stages:
buildJava: false
java_artifact_id: onnxruntime_gpu
UseIncreasedTimeoutForTests: ${{ parameters.UseIncreasedTimeoutForTests }}
SpecificArtifact: ${{ parameters.SpecificArtifact }}
BuildId: ${{ parameters.BuildId }}
# Windows CUDA Combined Testing and Publishing
- stage: Windows_Packaging_combined_GPU
dependsOn:
- Windows_Packaging_gpu
- Windows_Packaging_tensorrt
- Windows_Packaging_gpu_Testing
- Windows_Packaging_tensorrt_Testing
condition: succeeded()
jobs:
@ -159,4 +173,4 @@ stages:
displayName: 'Publish Pipeline Combined GPU Package Artifact'
inputs:
artifactName: 'onnxruntime-win-x64-gpu'
targetPath: '$(Build.ArtifactStagingDirectory)/onnxruntime-win-x64-gpu-$(OnnxRuntimeVersion).zip'
targetPath: '$(Build.ArtifactStagingDirectory)/onnxruntime-win-x64-gpu-$(OnnxRuntimeVersion).zip'

View file

@ -0,0 +1,47 @@
# Step template that packages the Windows Java binaries.
#
# Parameters:
#   msbuildPlatform  - used to form the output folder name
#                      $(Build.BinariesDirectory)\onnxruntime-java-win-<msbuildPlatform>.
#   java_artifact_id - Maven artifact id; used to locate pom.xml inside the built jar
#                      (META-INF\maven\com.microsoft.onnxruntime\<java_artifact_id>\pom.xml).
parameters:
- name: msbuildPlatform
type: string
- name: java_artifact_id
type: string
steps:
# The single script below, in order:
#   1. runs the Gradle `cmakeCheck` task against the RelWithDebInfo build directory and
#      aborts on a non-zero exit code;
#   2. copies the built jars into the output folder, then extracts pom.xml from
#      onnxruntime-$(OnnxRuntimeVersion).jar and renames it to onnxruntime-$(OnnxRuntimeVersion).pom;
#   3. stages onnxruntime.pdb, onnxruntime4j_jni.pdb, Privacy.md, ThirdPartyNotices.txt and a
#      GIT_COMMIT_ID file under `stage\` and adds them to the jar with `jar uf`;
#   4. creates testing.jar from java\build\classes\java\test, then updates it with
#      java\build\resources\test after removing ai\onnxruntime\native from those resources;
#   5. deletes the `stage\` folder and lists the final contents of the output folder.
# Only the gradlew call, the first test-classes `pushd` and the `jar cvf` are followed by an
# explicit errorlevel check; the other commands are not checked individually.
- task: CmdLine@2
displayName: 'Add symbols and notices to Java'
inputs:
script: |
@echo on
cd $(Build.SourcesDirectory)\java
call $(Build.SourcesDirectory)\java\gradlew.bat cmakeCheck -DcmakeBuildDir=$(Build.BinariesDirectory)\RelWithDebInfo
if %errorlevel% neq 0 exit /b %errorlevel%
cd $(Build.BinariesDirectory)\RelWithDebInfo
set NATIVE_FOLDER=$(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}\stage\ai\onnxruntime\native\win-x64
mkdir %NATIVE_FOLDER%
echo "Directories created"
copy .\java\build\libs\*.jar $(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}
pushd $(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}
set artifact_id=${{ parameters.java_artifact_id }}
jar xf onnxruntime-$(OnnxRuntimeVersion).jar META-INF\maven\com.microsoft.onnxruntime\%artifact_id%\pom.xml
move META-INF\maven\com.microsoft.onnxruntime\%artifact_id%\pom.xml onnxruntime-$(OnnxRuntimeVersion).pom
rd /s /q META-INF
popd
copy .\RelWithDebInfo\onnxruntime.pdb %NATIVE_FOLDER%
copy .\RelWithDebInfo\onnxruntime4j_jni.pdb %NATIVE_FOLDER%
copy $(Build.SourcesDirectory)\docs\Privacy.md $(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}\stage\Privacy.md
copy $(Build.SourcesDirectory)\ThirdPartyNotices.txt $(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}\stage\ThirdPartyNotices.txt
@echo $(OnnxRuntimeGitCommitHash) > $(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}\stage\GIT_COMMIT_ID
pushd $(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}\stage
jar uf $(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}\onnxruntime-$(OnnxRuntimeVersion).jar ai\onnxruntime\native\win-x64\onnxruntime.pdb
jar uf $(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}\onnxruntime-$(OnnxRuntimeVersion).jar ai\onnxruntime\native\win-x64\onnxruntime4j_jni.pdb
jar uf $(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}\onnxruntime-$(OnnxRuntimeVersion).jar Privacy.md ThirdPartyNotices.txt GIT_COMMIT_ID
popd
pushd $(Build.SourcesDirectory)\java\build\classes\java\test
if %errorlevel% neq 0 exit /b %errorlevel%
jar cvf $(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}\testing.jar .
if %errorlevel% neq 0 exit /b %errorlevel%
popd
pushd $(Build.SourcesDirectory)\java\build\resources\test
rd /s /q ai\onnxruntime\native
jar uvf $(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}\testing.jar .
popd
rd /s /q $(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}\stage
dir /s /b $(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}

View file

@ -71,6 +71,16 @@ parameters:
- 11.8
- 12.2
- name: SpecificArtifact
displayName: Use Specific Artifact
type: boolean
default: false
- name: BuildId
displayName: Specific Artifact's BuildId
type: string
default: '0'
stages:
- stage: Windows_Packaging_${{ parameters.stage_name_suffix }}
dependsOn: []
@ -82,13 +92,20 @@ stages:
- job:
workspace:
clean: all
pool: ${{ parameters.ort_build_pool_name }}
${{ if contains(parameters.ort_build_pool_name, 'GPU') }}:
pool: onnxruntime-Win-CPU-2022
${{ else }}:
pool: ${{ parameters.ort_build_pool_name }}
${{ if eq(parameters['UseIncreasedTimeoutForTests'], 'true') }}:
timeoutInMinutes: 1200
${{ else }}:
timeoutInMinutes: 300
steps:
- task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
displayName: 'Clean Agent Directories'
condition: always()
- checkout: self
clean: true
submodules: none
@ -155,6 +172,7 @@ stages:
arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "$(VSGenerator)" --enable_onnx_tests $(TelemetryOption) ${{ parameters.buildparameter }} '
workingDirectory: '$(Build.BinariesDirectory)'
- task: VSBuild@1
displayName: 'Build'
inputs:
@ -166,19 +184,59 @@ stages:
logProjectEvents: true
workingFolder: '$(Build.BinariesDirectory)\RelWithDebInfo'
createLogFile: true
msbuildArgs: "/p:CL_MPCount=2" # 2x cl.exe per project building.
- task: PythonScript@0
displayName: 'test'
condition: and(succeeded(), eq('${{ parameters.runTests}}', true))
inputs:
scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "$(VSGenerator)" --enable_onnx_tests $(TelemetryOption) ${{ parameters.buildparameter }}'
# For CPU job, tests are run in the same machine as building
- ${{ if contains(parameters.ort_build_pool_name, 'CPU') }}:
- ${{ if eq(parameters.buildJava, 'true') }}:
- template: make_java_win_binaries.yml
parameters:
msbuildPlatform: ${{ parameters.msbuildPlatform }}
java_artifact_id: ${{ parameters.java_artifact_id }}
- task: PublishBuildArtifacts@1
condition: and(succeeded(), eq('${{ parameters.buildJava}}', true))
displayName: 'Publish Java temp binaries'
inputs:
pathtoPublish: '$(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}'
artifactName: 'drop-onnxruntime-java-win-${{ parameters.packageName }}${{parameters.artifact_name_suffix}}'
- task: PythonScript@0
displayName: 'test'
condition: and(succeeded(), eq('${{ parameters.runTests}}', true))
inputs:
scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "$(VSGenerator)" --enable_onnx_tests $(TelemetryOption) ${{ parameters.buildparameter }}'
workingDirectory: '$(Build.BinariesDirectory)'
- ${{ else }}:
- powershell: |
New-Item $(Agent.TempDirectory)/RelWithDebInfo -Force -ItemType Directory
Copy-Item -Path "$(Build.BinariesDirectory)/RelWithDebInfo/CTestTestfile.cmake" -Destination $(Agent.TempDirectory)/RelWithDebInfo/ -Force
Copy-Item -Path "$(Build.BinariesDirectory)/RelWithDebInfo/RelWithDebInfo" -Destination $(Agent.TempDirectory)/RelWithDebInfo/ -Recurse -Force
Get-ChildItem -Path "$(Agent.TempDirectory)/RelWithDebInfo" -Include *.pdb -File -Recurse | ForEach-Object { $_.Delete() }
Get-ChildItem -Path "$(Agent.TempDirectory)/RelWithDebInfo" -Include *.lib -File -Recurse | ForEach-Object { $_.Delete() }
Copy-Item -Path $(Build.BinariesDirectory)/RelWithDebInfo/RelWithDebInfo/onnxruntime.pdb -Destination $(Agent.TempDirectory)/RelWithDebInfo/RelWithDebInfo -Force
Copy-Item -Path $(Build.BinariesDirectory)/RelWithDebInfo/RelWithDebInfo/onnxruntime4j_jni.pdb -Destination $(Agent.TempDirectory)/RelWithDebInfo/RelWithDebInfo -Force
cd $(Agent.TempDirectory)/RelWithDebInfo
tree /f
displayName: 'Copy native test needs files'
workingDirectory: '$(Build.BinariesDirectory)'
- ${{ if eq(parameters['buildJava'], 'true') }}:
- powershell: |
Copy-Item -Path "$(Build.BinariesDirectory)/RelWithDebInfo/java" -Destination $(Agent.TempDirectory)/RelWithDebInfo/ -Recurse -Force
cd $(Agent.TempDirectory)/RelWithDebInfo
tree /f
displayName: 'Copy java folder for java test'
workingDirectory: '$(Build.BinariesDirectory)'
- task: PublishPipelineArtifact@1
inputs:
targetPath: '$(Agent.TempDirectory)/RelWithDebInfo'
artifactName: 'Windows_Packaging_${{ parameters.stage_name_suffix }}_build_artifacts'
publishLocation: 'pipeline'
- script: |
dir *.dll
mkdir $(Build.ArtifactStagingDirectory)\testdata
dir *.dll
mkdir $(Build.ArtifactStagingDirectory)\testdata
workingDirectory: '$(Build.BinariesDirectory)/RelWithDebInfo/RelWithDebInfo'
displayName: 'List built DLLs'
@ -205,7 +263,6 @@ stages:
targetPath: '$(Build.BinariesDirectory)\RelWithDebInfo\installed\bin\protoc.exe'
artifactName: 'drop-extra${{ parameters.artifact_name_suffix }}'
- task: CopyFiles@2
displayName: 'Copy custom_op_library to: $(Build.ArtifactStagingDirectory)'
condition: and(succeeded(), eq('${{ parameters.packageName}}', 'x64'))
@ -214,55 +271,6 @@ stages:
Contents: 'custom_op_library.dll'
TargetFolder: '$(Build.ArtifactStagingDirectory)/testdata'
- task: CmdLine@2
condition: and(succeeded(), eq('${{ parameters.buildJava}}', true))
displayName: 'Add symbols and notices to Java'
inputs:
script: |
@echo on
cd $(Build.SourcesDirectory)\java
call $(Build.SourcesDirectory)\java\gradlew.bat cmakeCheck -DcmakeBuildDir=$(Build.BinariesDirectory)\RelWithDebInfo
if %errorlevel% neq 0 exit /b %errorlevel%
cd $(Build.BinariesDirectory)\RelWithDebInfo
set NATIVE_FOLDER=$(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}\stage\ai\onnxruntime\native\win-x64
mkdir %NATIVE_FOLDER%
echo "Directories created"
copy .\java\build\libs\*.jar $(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}
pushd $(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}
set artifact_id=${{ parameters.java_artifact_id }}
jar xf onnxruntime-$(OnnxRuntimeVersion).jar META-INF\maven\com.microsoft.onnxruntime\%artifact_id%\pom.xml
move META-INF\maven\com.microsoft.onnxruntime\%artifact_id%\pom.xml onnxruntime-$(OnnxRuntimeVersion).pom
rd /s /q META-INF
popd
copy .\RelWithDebInfo\onnxruntime.pdb %NATIVE_FOLDER%
copy .\RelWithDebInfo\onnxruntime4j_jni.pdb %NATIVE_FOLDER%
copy $(Build.SourcesDirectory)\docs\Privacy.md $(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}\stage\Privacy.md
copy $(Build.SourcesDirectory)\ThirdPartyNotices.txt $(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}\stage\ThirdPartyNotices.txt
@echo $(OnnxRuntimeGitCommitHash) > $(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}\stage\GIT_COMMIT_ID
pushd $(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}\stage
jar uf $(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}\onnxruntime-$(OnnxRuntimeVersion).jar ai\onnxruntime\native\win-x64\onnxruntime.pdb
jar uf $(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}\onnxruntime-$(OnnxRuntimeVersion).jar ai\onnxruntime\native\win-x64\onnxruntime4j_jni.pdb
jar uf $(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}\onnxruntime-$(OnnxRuntimeVersion).jar Privacy.md ThirdPartyNotices.txt GIT_COMMIT_ID
popd
pushd $(Build.SourcesDirectory)\java\build\classes\java\test
if %errorlevel% neq 0 exit /b %errorlevel%
jar cvf $(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}\testing.jar .
if %errorlevel% neq 0 exit /b %errorlevel%
popd
pushd $(Build.SourcesDirectory)\java\build\resources\test
rd /s /q ai\onnxruntime\native
jar uvf $(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}\testing.jar .
popd
rd /s /q $(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}\stage
dir /s /b $(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}
- task: PublishBuildArtifacts@1
condition: and(succeeded(), eq('${{ parameters.buildJava}}', true))
displayName: 'Publish Java temp binaries'
inputs:
pathtoPublish: '$(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}'
artifactName: 'drop-onnxruntime-java-win-${{ parameters.packageName }}${{parameters.artifact_name_suffix}}'
- ${{ if eq(parameters['DoCompliance'], 'true') }}:
- task: CredScan@3
displayName: 'Run CredScan'
@ -295,6 +303,97 @@ stages:
parameters :
condition : 'succeeded'
- task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
displayName: 'Clean Agent Directories'
condition: always()
# Second (testing) stage. It is only emitted when ort_build_pool_name contains 'GPU'.
# It depends on the Windows_Packaging_<stage_name_suffix> stage and runs on the pool
# named by ort_build_pool_name, with a 120-minute job timeout.
- ${{ if contains(parameters.ort_build_pool_name, 'GPU') }}:
- stage: Windows_Packaging_${{ parameters.stage_name_suffix }}_Testing
dependsOn: Windows_Packaging_${{ parameters.stage_name_suffix }}
variables:
# NOTE(review): presumably selects CUDA lazy module loading for the test run — confirm.
CUDA_MODULE_LOADING: 'LAZY'
jobs:
- job: Windows_Packaging_${{ parameters.stage_name_suffix }}_Testing
workspace:
clean: all
pool: ${{ parameters.ort_build_pool_name }}
timeoutInMinutes: 120
steps:
- task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
displayName: 'Clean Agent Directories'
condition: always()
- checkout: self
clean: true
submodules: none
# Download the Windows_Packaging_<stage_name_suffix>_build_artifacts pipeline artifact into
# $(Build.BinariesDirectory)/RelWithDebInfo/. SpecificArtifact and BuildId are forwarded
# unchanged to the download template.
- template: flex-downloadPipelineArtifact.yml
parameters:
ArtifactName: "Windows_Packaging_${{ parameters.stage_name_suffix }}_build_artifacts"
StepName: 'Download Pipeline Artifact - Windows GPU Packages Build'
TargetPath: '$(Build.BinariesDirectory)/RelWithDebInfo/'
SpecificArtifact: ${{ parameters.SpecificArtifact }}
BuildId: ${{ parameters.BuildId }}
# List the downloaded files, then rewrite paths in CTestTestfile.cmake that begin with
# C:\a or C:/a so they use the drive letter of the current location instead.
# NOTE(review): presumably needed because the build agent and this test agent can use
# different work drives — confirm.
- powershell: |
tree /f
$drive = (Get-Location).Drive.Name
$file = 'CTestTestfile.cmake'
(Get-Content $file ) -replace 'C:\\a', -join($drive, ':\\a') | Set-Content $file
(Get-Content $file ) -replace 'C:/a', -join($drive, ':/a') | Set-Content $file
displayName: 'List built files and update CTestTestfile.cmake drive letter'
workingDirectory: '$(Build.BinariesDirectory)/RelWithDebInfo/'
- template: telemetry-steps.yml
- template: set-version-number-variables-step.yml
# The pre-installed JDK 11 is selected only when buildJava is 'true'.
- ${{ if eq(parameters['buildJava'], 'true') }}:
- task: JavaToolInstaller@0
inputs:
versionSpec: "11"
jdkArchitectureOption: ${{ parameters.buildArch }}
jdkSourceOption: 'PreInstalled'
- task: UsePythonVersion@0
inputs:
versionSpec: '3.8'
addToPath: true
architecture: ${{ parameters.buildArch }}
- task: NodeTool@0
condition: and(succeeded(), eq('${{ parameters.buildNodejs}}', true))
inputs:
versionSpec: '18.x'
# GPU libraries are downloaded only when CudaVersion is non-empty. DownloadCUDA is set when
# buildparameter contains 'use_cuda' or 'use_tensorrt'; DownloadTRT only for 'use_tensorrt'.
- ${{ if ne(parameters.CudaVersion, '') }}:
- template: jobs/download_win_gpu_library.yml
parameters:
CudaVersion: ${{ parameters.CudaVersion }}
${{ if contains(parameters.buildparameter, 'use_cuda') }}:
DownloadCUDA: true
${{ if contains(parameters.buildparameter, 'use_tensorrt') }}:
DownloadCUDA: true
DownloadTRT: true
- powershell: |
Write-Host "##vso[task.prependpath]C:\Program Files (x86)\dotnet"
displayName: 'Append dotnet x86 Directory to PATH'
condition: and(succeeded(), eq('${{ parameters.buildArch}}', 'x86'))
# Run build.py with --test against the downloaded binaries, only when runTests is true.
# Unlike the build-stage invocation, no --cmake_generator or buildparameter is passed here.
- task: PythonScript@0
displayName: 'test'
condition: and(succeeded(), eq('${{ parameters.runTests}}', true))
inputs:
scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) '
workingDirectory: '$(Build.BinariesDirectory)'
# When buildJava is 'true', package the Java binaries via make_java_win_binaries.yml. The
# publish task that follows is guarded by a runtime condition on buildJava and uploads the
# output folder as drop-onnxruntime-java-win-<packageName><artifact_name_suffix>.
- ${{ if eq(parameters.buildJava, 'true') }}:
- template: make_java_win_binaries.yml
parameters:
msbuildPlatform: ${{ parameters.msbuildPlatform }}
java_artifact_id: ${{ parameters.java_artifact_id }}
- task: PublishBuildArtifacts@1
condition: and(succeeded(), eq('${{ parameters.buildJava}}', true))
displayName: 'Publish Java temp binaries'
inputs:
pathtoPublish: '$(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}'
artifactName: 'drop-onnxruntime-java-win-${{ parameters.packageName }}${{parameters.artifact_name_suffix}}'