onnxruntime/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml
Tang, Cheng 8f34c8c8ed
Introduce collective ops to ort inference build (#14399)
### Description
Introduce collective ops into onnxruntime inference build, including
1) AllReduce and AllGather schema in contrib op, controlled by USE_MPI
flag
2) AllReduce and AllGather kernel in cuda EP, controlled by ORT_USE_NCCL
flag


### Motivation and Context
Enable the collective ops in onnxruntime inference build so we have the
ability to run distributed inference with multiple GPUs.
The original ncclAllReduce ops in training build require quite complex
configurations, which is not suitable for inference case, and it already
broken. so we introduce a new implementation.

---------

Co-authored-by: Cheng Tang <chenta@microsoft.com@orttrainingdev9.d32nl1ml4oruzj4qz3bqlggovf.px.internal.cloudapp.net>
2023-02-07 13:47:48 -08:00

109 lines
4 KiB
YAML

trigger: none
stages:
- stage: ORTModuleDistributedTest
dependsOn: []
jobs:
- job: Onnxruntime_Linux_GPU_ORTModule_Distributed_Test
timeoutInMinutes: 120
pool: 'Onnxruntime-Linux-GPU-NC24sv3'
steps:
- checkout: self
clean: true
submodules: recursive
- template: templates/run-docker-build-steps.yml
parameters:
RunDockerBuildArgs: |
-o ubuntu20.04 -d gpu \
-t onnxruntime_ortmodule_distributed_tests_image \
-x " \
--config RelWithDebInfo \
--use_cuda --cuda_version=11.6 --cuda_home=/usr/local/cuda-11.6 --cudnn_home=/usr/local/cuda-11.6 \
--enable_training \
--update --build \
--build_wheel \
" \
-m \
-u \
-e
DisplayName: 'Build'
- bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/mnist" -d "/mnist"
displayName: 'Mount MNIST'
condition: succeededOrFailed()
# Entry point for all ORTModule distributed tests
# Refer to orttraining/orttraining/test/python/how_to_add_ortmodule_distributed_ci_pipeline_tests.md for guidelines on how to add new tests to this pipeline.
- script: |
docker run \
--gpus all \
--shm-size=1024m \
--rm \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
--volume /mnist:/mnist \
onnxruntime_ortmodule_distributed_tests_image \
bash -c "rm -rf /build/RelWithDebInfo/onnxruntime/ && python3 -m pip install /build/RelWithDebInfo/dist/onnxruntime*.whl && python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install && /build/RelWithDebInfo/launch_test.py --cmd_line_with_args 'python orttraining_ortmodule_distributed_tests.py --mnist /mnist' --cwd /build/RelWithDebInfo" \
displayName: 'Run orttraining_ortmodule_distributed_tests.py'
condition: succeededOrFailed()
timeoutInMinutes: 30
- template: templates/component-governance-component-detection-steps.yml
parameters:
condition: 'succeeded'
- template: templates/clean-agent-build-directory-step.yml
- stage: DistributedInferenceTest
dependsOn: []
jobs:
- job: Onnxruntime_Linux_GPU_Inference_Distributed_Test
timeoutInMinutes: 120
pool: 'Onnxruntime-Linux-GPU-NC24sv3'
steps:
- checkout: self
clean: true
submodules: recursive
- template: templates/run-docker-build-steps.yml
parameters:
RunDockerBuildArgs: |
-o ubuntu20.04 -d gpu \
-t onnxruntime_ortmodule_distributed_tests_image \
-x " \
--config RelWithDebInfo \
--use_cuda --cuda_version=11.6 --cuda_home=/usr/local/cuda-11.6 --cudnn_home=/usr/local/cuda-11.6 \
--update --build \
--build_wheel \
--use_mpi \
--enable_nccl \
" \
-m \
-u \
-e
DisplayName: 'Build'
- script: |
docker run \
--gpus all \
--shm-size=1024m \
--rm \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
--volume /mnist:/mnist \
onnxruntime_ortmodule_distributed_tests_image \
bash -c "rm -rf /build/RelWithDebInfo/onnxruntime/ && python3 -m pip install mpi4py && python3 -m pip install /build/RelWithDebInfo/dist/onnxruntime*.whl && mpirun -n 4 -x NCCL_DEBUG=INFO python /onnxruntime_src/onnxruntime/test/python/onnxruntime_test_collective.py" \
displayName: 'Run onnxruntime_test_collective.py'
condition: succeededOrFailed()
timeoutInMinutes: 30
- template: templates/component-governance-component-detection-steps.yml
parameters:
condition: 'succeeded'
- template: templates/clean-agent-build-directory-step.yml