From 14d7872ce97affa0ab6adf1cf4e4c2bf8ddd5c11 Mon Sep 17 00:00:00 2001
From: Yi Zhang <zhanyi@microsoft.com>
Date: Wed, 10 Apr 2024 09:21:40 +0800
Subject: [PATCH] Reuse T4 for Cuda12.2 training packaging pipeline. (#20244)

### Description
The CUDA 12.2 training packaging pipeline
https://dev.azure.com/aiinfra/Lotus/_build?definitionId=1308&_a=summary
has consistently run out of memory since PR #19910.
I tried other CPU agents, for example D64as_v5 (256 GB memory) and
D32as_v4 (128 GB memory and 256 GB SSD temp storage), but they still run
out of memory, as shown in the image below:

![image](https://github.com/microsoft/onnxruntime/assets/16190118/5acde9ef-674f-4b6d-a1b3-b54647645083)


However, the build works on T4, even though T4 has only 4 vCPUs, 28 GB
memory and 180 GB temp storage; it just takes much longer.

### Motivation and Context
Restore the CUDA 12.2 training packaging pipeline first.
More time is needed to investigate the root cause.


### Other Clues.
Each of these two compilation steps takes nearly 6 minutes with CUDA 12.2
on T4, and the build runs out of memory on CPU machines. @ajindal1
cuda12.2 on T4
```
2024-03-14T05:39:08.7726865Z [ 90%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/onnxruntime_src/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim32_fp16_sm80.cu.o
2024-03-14T05:45:01.3223393Z [ 90%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/onnxruntime_src/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim64_bf16_sm80.cu.o

2024-03-14T05:46:07.9218003Z [ 90%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/onnxruntime_src/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim96_fp16_sm80.cu.o
2024-03-14T05:52:59.2387051Z [ 90%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/onnxruntime_src/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.cu.o

```

But they could be finished in about one minute with Cuda 11.8 on CPU
```
cuda11.8 on CPU
2024-04-09T11:34:35.0849836Z [ 90%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/onnxruntime_src/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim32_fp16_sm80.cu.o
2024-04-09T11:35:53.6648154Z [ 90%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/onnxruntime_src/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim64_bf16_sm80.cu.o

cuda11.8 on GPU
2024-03-13T12:16:33.4102477Z [ 90%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/onnxruntime_src/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim32_fp16_sm80.cu.o
2024-03-13T12:19:58.8268272Z [ 90%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/onnxruntime_src/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim64_bf16_sm80.cu.o
```
---
 .../orttraining-py-packaging-pipeline-cuda.yml            | 1 +
 .../orttraining-py-packaging-pipeline-cuda12.yml          | 1 +
 .../templates/py-packaging-training-cuda-stage-steps.yml  | 8 ++++----
 .../templates/py-packaging-training-cuda-stage.yml        | 6 ++++++
 4 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda.yml b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda.yml
index 539a61c021..2c65432471 100644
--- a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda.yml
+++ b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda.yml
@@ -33,3 +33,4 @@ stages:
     debug_build: false
     SpecificArtifact: ${{ parameters.SpecificArtifact }}
     BuildId: ${{ parameters.BuildId }}
+    build_pool_name: 'onnxruntime-Ubuntu2204-AMD-CPU'
diff --git a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda12.yml b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda12.yml
index 86dce7ae46..6e02714f28 100644
--- a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda12.yml
+++ b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda12.yml
@@ -20,3 +20,4 @@ stages:
     agent_pool: Onnxruntime-Linux-GPU
     upload_wheel: 'yes'
     debug_build: false
+    build_pool_name: 'onnxruntime-Linux-GPU'
diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage-steps.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage-steps.yml
index 08f001efb6..f6b36733eb 100644
--- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage-steps.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage-steps.yml
@@ -11,6 +11,7 @@ parameters:
   stage_name: ''
   SpecificArtifact: false
   BuildId: '0'
+  build_pool_name: ''
 
 stages:
   - stage: Build_${{ parameters.stage_name }}
@@ -30,12 +31,12 @@ stages:
       - name: PythonVersion
         value: ${{ parameters.python_version }}
       - name: Repository
-        value: onnxruntimetraininggpubuild_${{ parameters.python_version }}
+        value: onnxruntimetraininggpubuild_cu${{ replace(parameters.cuda_version, '.', '') }}_py${{ replace(parameters.python_version, '.', '') }}
     dependsOn: []
 
     jobs:
     - job: Build
-      pool: onnxruntime-Ubuntu2204-AMD-CPU
+      pool: ${{ parameters.build_pool_name }}
       timeoutInMinutes: 180
       steps:
         - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
@@ -84,7 +85,6 @@ stages:
                 --volume $(Build.BinariesDirectory):/build \
                 --volume /data/models:/build/models:ro \
                 --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \
-                -e NVIDIA_VISIBLE_DEVICES=all \
                 -e NIGHTLY_BUILD \
                 -e DEFAULT_TRAINING_PACKAGE_DEVICE \
                 -e BUILD_BUILDNUMBER \
@@ -137,7 +137,7 @@ stages:
       - name: PythonVersion
         value: ${{ parameters.python_version }}
       - name: Repository
-        value: onnxruntimetraininggpubuild_${{ parameters.python_version }}
+        value: onnxruntimetraininggpubuild_cu${{ replace(parameters.cuda_version, '.', '') }}_py${{ replace(parameters.python_version, '.', '') }}
       - name: UploadWheel
         value: ${{ parameters.upload_wheel }}
     dependsOn: Build_${{ parameters.stage_name }}
diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml
index f7ecc3cf84..a1f326ebaa 100644
--- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml
@@ -57,6 +57,11 @@ parameters:
   type: string
   default: '0'
 
+- name: build_pool_name
+  displayName: >
+    build_pool_name.
+  type: string
+
 - name: PythonVersionList
   displayName: Python Version List
   type: object
@@ -86,3 +91,4 @@ stages:
       python_version: ${{ python_version.version }}
       SpecificArtifact: ${{ parameters.SpecificArtifact }}
       BuildId: ${{ parameters.BuildId }}
+      build_pool_name: ${{ parameters.build_pool_name }}