mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-31 23:27:43 +00:00
### Description Introduce collective ops into onnxruntime inference build, including 1) AllReduce and AllGather schema in contrib op, controlled by USE_MPI flag 2) AllReduce and AllGather kernel in cuda EP, controlled by ORT_USE_NCCL flag ### Motivation and Context Enable the collective ops in onnxruntime inference build so we have the ability to run distributed inference with multiple GPUs. The original ncclAllReduce ops in training build require quite complex configurations, which is not suitable for inference case, and it already broken. so we introduce a new implementation. --------- Co-authored-by: Cheng Tang <chenta@microsoft.com@orttrainingdev9.d32nl1ml4oruzj4qz3bqlggovf.px.internal.cloudapp.net>
109 lines
4 KiB
YAML
109 lines
4 KiB
YAML
trigger: none
|
|
|
|
stages:
|
|
- stage: ORTModuleDistributedTest
|
|
dependsOn: []
|
|
jobs:
|
|
- job: Onnxruntime_Linux_GPU_ORTModule_Distributed_Test
|
|
|
|
timeoutInMinutes: 120
|
|
pool: 'Onnxruntime-Linux-GPU-NC24sv3'
|
|
|
|
steps:
|
|
- checkout: self
|
|
clean: true
|
|
submodules: recursive
|
|
|
|
- template: templates/run-docker-build-steps.yml
|
|
parameters:
|
|
RunDockerBuildArgs: |
|
|
-o ubuntu20.04 -d gpu \
|
|
-t onnxruntime_ortmodule_distributed_tests_image \
|
|
-x " \
|
|
--config RelWithDebInfo \
|
|
--use_cuda --cuda_version=11.6 --cuda_home=/usr/local/cuda-11.6 --cudnn_home=/usr/local/cuda-11.6 \
|
|
--enable_training \
|
|
--update --build \
|
|
--build_wheel \
|
|
" \
|
|
-m \
|
|
-u \
|
|
-e
|
|
DisplayName: 'Build'
|
|
|
|
- bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/mnist" -d "/mnist"
|
|
displayName: 'Mount MNIST'
|
|
condition: succeededOrFailed()
|
|
|
|
# Entry point for all ORTModule distributed tests
|
|
# Refer to orttraining/orttraining/test/python/how_to_add_ortmodule_distributed_ci_pipeline_tests.md for guidelines on how to add new tests to this pipeline.
|
|
- script: |
|
|
docker run \
|
|
--gpus all \
|
|
--shm-size=1024m \
|
|
--rm \
|
|
--volume $(Build.SourcesDirectory):/onnxruntime_src \
|
|
--volume $(Build.BinariesDirectory):/build \
|
|
--volume /mnist:/mnist \
|
|
onnxruntime_ortmodule_distributed_tests_image \
|
|
bash -c "rm -rf /build/RelWithDebInfo/onnxruntime/ && python3 -m pip install /build/RelWithDebInfo/dist/onnxruntime*.whl && python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install && /build/RelWithDebInfo/launch_test.py --cmd_line_with_args 'python orttraining_ortmodule_distributed_tests.py --mnist /mnist' --cwd /build/RelWithDebInfo" \
|
|
displayName: 'Run orttraining_ortmodule_distributed_tests.py'
|
|
condition: succeededOrFailed()
|
|
timeoutInMinutes: 30
|
|
|
|
- template: templates/component-governance-component-detection-steps.yml
|
|
parameters:
|
|
condition: 'succeeded'
|
|
|
|
- template: templates/clean-agent-build-directory-step.yml
|
|
|
|
- stage: DistributedInferenceTest
|
|
dependsOn: []
|
|
jobs:
|
|
- job: Onnxruntime_Linux_GPU_Inference_Distributed_Test
|
|
|
|
timeoutInMinutes: 120
|
|
pool: 'Onnxruntime-Linux-GPU-NC24sv3'
|
|
|
|
steps:
|
|
- checkout: self
|
|
clean: true
|
|
submodules: recursive
|
|
|
|
- template: templates/run-docker-build-steps.yml
|
|
parameters:
|
|
RunDockerBuildArgs: |
|
|
-o ubuntu20.04 -d gpu \
|
|
-t onnxruntime_ortmodule_distributed_tests_image \
|
|
-x " \
|
|
--config RelWithDebInfo \
|
|
--use_cuda --cuda_version=11.6 --cuda_home=/usr/local/cuda-11.6 --cudnn_home=/usr/local/cuda-11.6 \
|
|
--update --build \
|
|
--build_wheel \
|
|
--use_mpi \
|
|
--enable_nccl \
|
|
" \
|
|
-m \
|
|
-u \
|
|
-e
|
|
DisplayName: 'Build'
|
|
|
|
- script: |
|
|
docker run \
|
|
--gpus all \
|
|
--shm-size=1024m \
|
|
--rm \
|
|
--volume $(Build.SourcesDirectory):/onnxruntime_src \
|
|
--volume $(Build.BinariesDirectory):/build \
|
|
--volume /mnist:/mnist \
|
|
onnxruntime_ortmodule_distributed_tests_image \
|
|
bash -c "rm -rf /build/RelWithDebInfo/onnxruntime/ && python3 -m pip install mpi4py && python3 -m pip install /build/RelWithDebInfo/dist/onnxruntime*.whl && mpirun -n 4 -x NCCL_DEBUG=INFO python /onnxruntime_src/onnxruntime/test/python/onnxruntime_test_collective.py" \
|
|
displayName: 'Run onnxruntime_test_collective.py'
|
|
condition: succeededOrFailed()
|
|
timeoutInMinutes: 30
|
|
|
|
- template: templates/component-governance-component-detection-steps.yml
|
|
parameters:
|
|
condition: 'succeeded'
|
|
|
|
- template: templates/clean-agent-build-directory-step.yml
|