Reuse T4 for Cuda12.2 training packaging pipeline. (#20244)

### Description

The CUDA 12.2 training packaging pipeline (https://dev.azure.com/aiinfra/Lotus/_build?definitionId=1308&_a=summary) has been running out of memory ever since PR #19910.

I tried other CPU agents, for example D64as_v5 (256 GB memory) and D32as_v4 (128 GB memory and 256 GB SSD temp storage), and they still run out of memory, as shown in the image below:

![image](https://github.com/microsoft/onnxruntime/assets/16190118/5acde9ef-674f-4b6d-a1b3-b54647645083)

The build does work on T4, even though T4 only has 4 vCPUs, 28 GB memory and 180 GB temp storage, and it takes much more time.

### Motivation and Context

Restore the CUDA 12.2 training packaging pipeline first. More time is needed to investigate the root cause.

### Other Clues

These 2 compilation steps each take nearly 6 minutes with CUDA 12.2 on T4, and the build runs out of memory on a CPU machine. @ajindal1

cuda12.2 on T4

```
2024-03-14T05:39:08.7726865Z [ 90%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/onnxruntime_src/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim32_fp16_sm80.cu.o
2024-03-14T05:45:01.3223393Z [ 90%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/onnxruntime_src/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim64_bf16_sm80.cu.o
2024-03-14T05:46:07.9218003Z [ 90%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/onnxruntime_src/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim96_fp16_sm80.cu.o
2024-03-14T05:52:59.2387051Z [ 90%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/onnxruntime_src/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.cu.o
```

But with CUDA 11.8 on CPU they finish in about one minute:

```
cuda11.8 on CPU
2024-04-09T11:34:35.0849836Z [ 90%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/onnxruntime_src/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim32_fp16_sm80.cu.o
2024-04-09T11:35:53.6648154Z [ 90%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/onnxruntime_src/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim64_bf16_sm80.cu.o
cuda11.8 on GPU
2024-03-13T12:16:33.4102477Z [ 90%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/onnxruntime_src/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim32_fp16_sm80.cu.o
2024-03-13T12:19:58.8268272Z [ 90%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/onnxruntime_src/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim64_bf16_sm80.cu.o
```
2026-07-16 18:31:27 +00:00 · 2024-04-10 09:21:40 +08:00 · 2024-04-10 09:21:40 +08:00 · 14d7872ce9
commit 14d7872ce9
parent 7d8dea9f10
4 changed files with 12 additions and 4 deletions
--- a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda.yml
+++ b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda.yml
@ -33,3 +33,4 @@ stages:
    debug_build: false
    SpecificArtifact: ${{ parameters.SpecificArtifact }}
    BuildId: ${{ parameters.BuildId }}
+    build_pool_name: 'onnxruntime-Ubuntu2204-AMD-CPU'
--- a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda12.yml
+++ b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda12.yml
@ -20,3 +20,4 @@ stages:
    agent_pool: Onnxruntime-Linux-GPU
    upload_wheel: 'yes'
    debug_build: false
+    build_pool_name: 'onnxruntime-Linux-GPU'
--- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage-steps.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage-steps.yml
@ -11,6 +11,7 @@ parameters:
  stage_name: ''
  SpecificArtifact: false
  BuildId: '0'
+  build_pool_name: ''

 stages:
  - stage: Build_${{ parameters.stage_name }}
@ -30,12 +31,12 @@ stages:
      - name: PythonVersion
        value: ${{ parameters.python_version }}
      - name: Repository
-        value: onnxruntimetraininggpubuild_${{ parameters.python_version }}
+        value: onnxruntimetraininggpubuild_cu${{ replace(parameters.cuda_version, '.', '') }}_py${{ replace(parameters.python_version, '.', '') }}
    dependsOn: []

    jobs:
    - job: Build
-      pool: onnxruntime-Ubuntu2204-AMD-CPU
+      pool: ${{ parameters.build_pool_name }}
      timeoutInMinutes: 180
      steps:
        - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
@ -84,7 +85,6 @@ stages:
                --volume $(Build.BinariesDirectory):/build \
                --volume /data/models:/build/models:ro \
                --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \
-                -e NVIDIA_VISIBLE_DEVICES=all \
                -e NIGHTLY_BUILD \
                -e DEFAULT_TRAINING_PACKAGE_DEVICE \
                -e BUILD_BUILDNUMBER \
@ -137,7 +137,7 @@ stages:
      - name: PythonVersion
        value: ${{ parameters.python_version }}
      - name: Repository
-        value: onnxruntimetraininggpubuild_${{ parameters.python_version }}
+        value: onnxruntimetraininggpubuild_cu${{ replace(parameters.cuda_version, '.', '') }}_py${{ replace(parameters.python_version, '.', '') }}
      - name: UploadWheel
        value: ${{ parameters.upload_wheel }}
    dependsOn: Build_${{ parameters.stage_name }}
--- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml
@ -57,6 +57,11 @@ parameters:
  type: string
  default: '0'

+- name: build_pool_name
+  displayName: >
+    build_pool_name.
+  type: string
+
 - name: PythonVersionList
  displayName: Python Version List
  type: object
@ -86,3 +91,4 @@ stages:
      python_version: ${{ python_version.version }}
      SpecificArtifact: ${{ parameters.SpecificArtifact }}
      BuildId: ${{ parameters.BuildId }}
+      build_pool_name: ${{ parameters.build_pool_name }}