From 14d7872ce97affa0ab6adf1cf4e4c2bf8ddd5c11 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Wed, 10 Apr 2024 09:21:40 +0800 Subject: [PATCH] Reuse T4 for Cuda12.2 training packaging pipeline. (#20244) ### Description It always has been out of memory in training CUDA 12.2 packaging pipeline https://dev.azure.com/aiinfra/Lotus/_build?definitionId=1308&_a=summary since the PR #19910 I tried other CPU agents for example, D64as_v5(256G memory) and D32as_v4(128G memory and 256 G SSD temp storage), which are still out of memory like the below image ![image](https://github.com/microsoft/onnxruntime/assets/16190118/5acde9ef-674f-4b6d-a1b3-b54647645083) But it works on T4, though T4 only has 4 vCPUs, 28G memory and 180G temp storage, and it takes much more time. ### Motivation and Context Restore CUDA 12.2 training packaging pipeline first. More time is needed to investigate the root cause ### Other Clues. These 2 compilation steps take nearly 6 minutes with Cuda 12.2 on T4 And it runs out of memory on CPU machine. @ajindal1 cuda12.2 on T4 ``` 2024-03-14T05:39:08.7726865Z [ 90%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/onnxruntime_src/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim32_fp16_sm80.cu.o 2024-03-14T05:45:01.3223393Z [ 90%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/onnxruntime_src/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim64_bf16_sm80.cu.o 2024-03-14T05:46:07.9218003Z [ 90%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/onnxruntime_src/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim96_fp16_sm80.cu.o 2024-03-14T05:52:59.2387051Z [ 90%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/onnxruntime_src/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.cu.o ``` But they could be finished in about one minute with Cuda 11.8 on CPU ``` cuda11.8 on CPU 2024-04-09T11:34:35.0849836Z [ 90%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/onnxruntime_src/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim32_fp16_sm80.cu.o 2024-04-09T11:35:53.6648154Z [ 90%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/onnxruntime_src/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim64_bf16_sm80.cu.o cuda11.8 on GPU 024-03-13T12:16:33.4102477Z [ 90%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/onnxruntime_src/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim32_fp16_sm80.cu.o 2024-03-13T12:19:58.8268272Z [ 90%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/onnxruntime_src/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim64_bf16_sm80.cu.o ``` --- .../orttraining-py-packaging-pipeline-cuda.yml | 1 + .../orttraining-py-packaging-pipeline-cuda12.yml | 1 + .../templates/py-packaging-training-cuda-stage-steps.yml | 8 ++++---- .../templates/py-packaging-training-cuda-stage.yml | 6 ++++++ 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda.yml b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda.yml index 539a61c021..2c65432471 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda.yml @@ -33,3 +33,4 @@ stages: debug_build: false SpecificArtifact: ${{ parameters.SpecificArtifact }} BuildId: ${{ parameters.BuildId }} + build_pool_name: 'onnxruntime-Ubuntu2204-AMD-CPU' diff --git a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda12.yml b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda12.yml index 86dce7ae46..6e02714f28 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda12.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda12.yml @@ -20,3 +20,4 @@ stages: agent_pool: Onnxruntime-Linux-GPU upload_wheel: 'yes' debug_build: false + build_pool_name: 'onnxruntime-Linux-GPU' diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage-steps.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage-steps.yml index 08f001efb6..f6b36733eb 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage-steps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage-steps.yml @@ -11,6 +11,7 @@ parameters: stage_name: '' SpecificArtifact: false BuildId: '0' + build_pool_name: '' stages: - stage: Build_${{ parameters.stage_name }} @@ -30,12 +31,12 @@ stages: - name: PythonVersion value: ${{ parameters.python_version }} - name: Repository - value: onnxruntimetraininggpubuild_${{ parameters.python_version }} + value: onnxruntimetraininggpubuild_cu${{ replace(parameters.cuda_version, '.', '') }}_py${{ replace(parameters.python_version, '.', '') }} dependsOn: [] jobs: - job: Build - pool: onnxruntime-Ubuntu2204-AMD-CPU + pool: ${{ parameters.build_pool_name }} timeoutInMinutes: 180 steps: - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 @@ -84,7 +85,6 @@ stages: --volume $(Build.BinariesDirectory):/build \ --volume /data/models:/build/models:ro \ --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \ - -e NVIDIA_VISIBLE_DEVICES=all \ -e NIGHTLY_BUILD \ -e DEFAULT_TRAINING_PACKAGE_DEVICE \ -e BUILD_BUILDNUMBER \ @@ -137,7 +137,7 @@ stages: - name: PythonVersion value: ${{ parameters.python_version }} - name: Repository - value: onnxruntimetraininggpubuild_${{ parameters.python_version }} + value: onnxruntimetraininggpubuild_cu${{ replace(parameters.cuda_version, '.', '') }}_py${{ replace(parameters.python_version, '.', '') }} - name: UploadWheel value: ${{ parameters.upload_wheel }} dependsOn: Build_${{ parameters.stage_name }} diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml index f7ecc3cf84..a1f326ebaa 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml @@ -57,6 +57,11 @@ parameters: type: string default: '0' +- name: build_pool_name + displayName: > + build_pool_name. + type: string + - name: PythonVersionList displayName: Python Version List type: object @@ -86,3 +91,4 @@ stages: python_version: ${{ python_version.version }} SpecificArtifact: ${{ parameters.SpecificArtifact }} BuildId: ${{ parameters.BuildId }} + build_pool_name: ${{ parameters.build_pool_name }}