mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-26 22:35:43 +00:00
Reuse T4 for Cuda12.2 training packaging pipeline. (#20244)
### Description

The training CUDA 12.2 packaging pipeline (https://dev.azure.com/aiinfra/Lotus/_build?definitionId=1308&_a=summary) has been running out of memory ever since PR #19910.

I tried other CPU agents, for example D64as_v5 (256 GB memory) and D32as_v4 (128 GB memory and 256 GB SSD temp storage), and they still run out of memory, as shown in the image below.

The build does work on T4, even though T4 has only 4 vCPUs, 28 GB memory and 180 GB temp storage, and it takes much more time.

### Motivation and Context

Restore the CUDA 12.2 training packaging pipeline first. More time is needed to investigate the root cause.

### Other Clues

These 2 compilation steps take nearly 6 minutes with CUDA 12.2 on T4, and the build runs out of memory on a CPU machine. @ajindal1

cuda12.2 on T4

```
2024-03-14T05:39:08.7726865Z [ 90%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/onnxruntime_src/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim32_fp16_sm80.cu.o
2024-03-14T05:45:01.3223393Z [ 90%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/onnxruntime_src/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim64_bf16_sm80.cu.o
2024-03-14T05:46:07.9218003Z [ 90%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/onnxruntime_src/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim96_fp16_sm80.cu.o
2024-03-14T05:52:59.2387051Z [ 90%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/onnxruntime_src/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.cu.o
```

But they could be finished in about one minute with CUDA 11.8 on CPU.

```
cuda11.8 on CPU
2024-04-09T11:34:35.0849836Z [ 90%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/onnxruntime_src/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim32_fp16_sm80.cu.o
2024-04-09T11:35:53.6648154Z [ 90%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/onnxruntime_src/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim64_bf16_sm80.cu.o
cuda11.8 on GPU
2024-03-13T12:16:33.4102477Z [ 90%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/onnxruntime_src/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim32_fp16_sm80.cu.o
2024-03-13T12:19:58.8268272Z [ 90%] Building CUDA object CMakeFiles/onnxruntime_providers_cuda.dir/onnxruntime_src/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim64_bf16_sm80.cu.o
```
This commit is contained in:
parent
7d8dea9f10
commit
14d7872ce9
4 changed files with 12 additions and 4 deletions
|
|
@ -33,3 +33,4 @@ stages:
|
|||
debug_build: false
|
||||
SpecificArtifact: ${{ parameters.SpecificArtifact }}
|
||||
BuildId: ${{ parameters.BuildId }}
|
||||
build_pool_name: 'onnxruntime-Ubuntu2204-AMD-CPU'
|
||||
|
|
|
|||
|
|
@ -20,3 +20,4 @@ stages:
|
|||
agent_pool: Onnxruntime-Linux-GPU
|
||||
upload_wheel: 'yes'
|
||||
debug_build: false
|
||||
build_pool_name: 'onnxruntime-Linux-GPU'
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@ parameters:
|
|||
stage_name: ''
|
||||
SpecificArtifact: false
|
||||
BuildId: '0'
|
||||
build_pool_name: ''
|
||||
|
||||
stages:
|
||||
- stage: Build_${{ parameters.stage_name }}
|
||||
|
|
@ -30,12 +31,12 @@ stages:
|
|||
- name: PythonVersion
|
||||
value: ${{ parameters.python_version }}
|
||||
- name: Repository
|
||||
value: onnxruntimetraininggpubuild_${{ parameters.python_version }}
|
||||
value: onnxruntimetraininggpubuild_cu${{ replace(parameters.cuda_version, '.', '') }}_py${{ replace(parameters.python_version, '.', '') }}
|
||||
dependsOn: []
|
||||
|
||||
jobs:
|
||||
- job: Build
|
||||
pool: onnxruntime-Ubuntu2204-AMD-CPU
|
||||
pool: ${{ parameters.build_pool_name }}
|
||||
timeoutInMinutes: 180
|
||||
steps:
|
||||
- task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
|
||||
|
|
@ -84,7 +85,6 @@ stages:
|
|||
--volume $(Build.BinariesDirectory):/build \
|
||||
--volume /data/models:/build/models:ro \
|
||||
--volume $HOME/.onnx:/home/onnxruntimedev/.onnx \
|
||||
-e NVIDIA_VISIBLE_DEVICES=all \
|
||||
-e NIGHTLY_BUILD \
|
||||
-e DEFAULT_TRAINING_PACKAGE_DEVICE \
|
||||
-e BUILD_BUILDNUMBER \
|
||||
|
|
@ -137,7 +137,7 @@ stages:
|
|||
- name: PythonVersion
|
||||
value: ${{ parameters.python_version }}
|
||||
- name: Repository
|
||||
value: onnxruntimetraininggpubuild_${{ parameters.python_version }}
|
||||
value: onnxruntimetraininggpubuild_cu${{ replace(parameters.cuda_version, '.', '') }}_py${{ replace(parameters.python_version, '.', '') }}
|
||||
- name: UploadWheel
|
||||
value: ${{ parameters.upload_wheel }}
|
||||
dependsOn: Build_${{ parameters.stage_name }}
|
||||
|
|
|
|||
|
|
@ -57,6 +57,11 @@ parameters:
|
|||
type: string
|
||||
default: '0'
|
||||
|
||||
- name: build_pool_name
|
||||
displayName: >
|
||||
build_pool_name.
|
||||
type: string
|
||||
|
||||
- name: PythonVersionList
|
||||
displayName: Python Version List
|
||||
type: object
|
||||
|
|
@ -86,3 +91,4 @@ stages:
|
|||
python_version: ${{ python_version.version }}
|
||||
SpecificArtifact: ${{ parameters.SpecificArtifact }}
|
||||
BuildId: ${{ parameters.BuildId }}
|
||||
build_pool_name: ${{ parameters.build_pool_name }}
|
||||
|
|
|
|||
Loading…
Reference in a new issue