onnxruntime/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml
Ye Wang f35dd1407f
custom allreduce cuda kernel (#20703)
### Description
<!-- Describe your changes. -->

Conditionally route to custom AllReduce kernel when buffer size and gpu
numbers meet certain requirements. Otherwise, keep using NCCL's
AllReduce.

### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->

---------

Co-authored-by: Ye Wang <wangye@microsoft.com@h100vm-ort.kxelwkzfzxguje5bxvwxxs135a.gvxx.internal.cloudapp.net>
Co-authored-by: Your Name <you@example.com>
2024-06-13 11:09:49 -07:00

134 lines
4.6 KiB
YAML

##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py ####
trigger:
branches:
include:
- main
- rel-*
paths:
exclude:
- docs/**
- README.md
- CONTRIBUTING.md
- BUILD.md
- 'js/web'
- 'onnxruntime/core/providers/js'
pr:
branches:
include:
- main
- rel-*
paths:
exclude:
- docs/**
- README.md
- CONTRIBUTING.md
- BUILD.md
- 'js/web'
- 'onnxruntime/core/providers/js'
#### end trigger ####
stages:
- stage: ORTModuleDistributedTest
dependsOn: []
jobs:
- job: Onnxruntime_Linux_GPU_ORTModule_Distributed_Test
timeoutInMinutes: 120
pool: 'Onnxruntime-Linux-GPU-NC24sv3'
steps:
- checkout: self
clean: true
submodules: recursive
- template: templates/jobs/download_training_test_data.yml
- template: templates/run-docker-build-steps.yml
parameters:
RunDockerBuildArgs: |
-o ubuntu20.04 -d gpu \
-t onnxruntime_ortmodule_distributed_tests_image \
-x " \
--config RelWithDebInfo \
--use_cuda --cuda_version=11.8 --cuda_home=/usr/local/cuda-11.8 --cudnn_home=/usr/local/cuda-11.8 \
--enable_training \
--update --build \
--build_wheel \
" \
-m \
-u \
-e
DisplayName: 'Build'
# Entry point for all ORTModule distributed tests
# Refer to orttraining/orttraining/test/python/how_to_add_ortmodule_distributed_ci_pipeline_tests.md for guidelines on how to add new tests to this pipeline.
- script: |
docker run \
--gpus all \
--shm-size=1024m \
--rm \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
--volume $(Agent.TempDirectory)/mnist:/mnist \
onnxruntime_ortmodule_distributed_tests_image \
bash -c "rm -rf /build/RelWithDebInfo/onnxruntime/ && python3 -m pip install /build/RelWithDebInfo/dist/onnxruntime*.whl && python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install && /build/RelWithDebInfo/launch_test.py --cmd_line_with_args 'python orttraining_ortmodule_distributed_tests.py --mnist /mnist' --cwd /build/RelWithDebInfo" \
displayName: 'Run orttraining_ortmodule_distributed_tests.py'
condition: succeededOrFailed()
timeoutInMinutes: 30
- template: templates/component-governance-component-detection-steps.yml
parameters:
condition: 'succeeded'
- template: templates/clean-agent-build-directory-step.yml
- stage: DistributedInferenceTest
dependsOn: []
jobs:
- job: Onnxruntime_Linux_GPU_Inference_Distributed_Test
timeoutInMinutes: 120
pool: 'Onnxruntime-Linux-GPU-NC24sv3'
steps:
- checkout: self
clean: true
submodules: recursive
- template: templates/run-docker-build-steps.yml
parameters:
RunDockerBuildArgs: |
-o ubuntu20.04 -d gpu \
-t onnxruntime_ortmodule_distributed_tests_image \
-x " \
--config RelWithDebInfo \
--use_cuda --cuda_version=11.8 --cuda_home=/usr/local/cuda-11.8 --cudnn_home=/usr/local/cuda-11.8 \
--update --build \
--build_wheel \
--use_mpi \
--enable_nccl \
" \
-m \
-u \
-e
DisplayName: 'Build'
- script: |
docker run \
--gpus all \
--shm-size=1024m \
--rm \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
--volume /mnist:/mnist \
onnxruntime_ortmodule_distributed_tests_image \
bash -c "rm -rf /build/RelWithDebInfo/onnxruntime/ && python3 -m pip install mpi4py onnxscript && python3 -m pip install /build/RelWithDebInfo/dist/onnxruntime*.whl && mpirun -n 4 -x NCCL_DEBUG=INFO python /onnxruntime_src/onnxruntime/test/python/onnxruntime_test_collective.py && mpirun -n 2 -x NCCL_DEBUG=INFO python /onnxruntime_src/onnxruntime/test/python/onnxruntime_test_distributed.py && mpirun -n 2 -x NCCL_DEBUG=INFO python /onnxruntime_src/onnxruntime/test/python/transformers/sharded_moe/test_sharded_moe.py" \
displayName: 'Run onnxruntime_test_collective.py, onnxruntime_test_distributed.py and test_sharded_moe.py'
condition: succeededOrFailed()
timeoutInMinutes: 30
- template: templates/component-governance-component-detection-steps.yml
parameters:
condition: 'succeeded'
- template: templates/clean-agent-build-directory-step.yml