mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-23 22:13:38 +00:00
109 lines
4 KiB
YAML
109 lines
4 KiB
YAML
# Azure Pipelines definition for the distributed GPU test stages listed
# under `stages` below.
# Automatic CI triggers are disabled for this pipeline.
trigger: none

stages:
# Stage: builds a CUDA 11.8, training-enabled ONNX Runtime wheel inside a
# Docker image and then runs the ORTModule distributed tests from that image.
- stage: ORTModuleDistributedTest
  # Empty dependency list: this stage does not wait for any other stage.
  dependsOn: []
  jobs:
  - job: Onnxruntime_Linux_GPU_ORTModule_Distributed_Test

    timeoutInMinutes: 120
    pool: 'Onnxruntime-Linux-GPU-NC24sv3'

    steps:
    - checkout: self
      clean: true
      submodules: recursive

    # Build step. The argument string is handed verbatim to the shared
    # template; everything between the quotes after -x is the set of build
    # options (RelWithDebInfo, CUDA 11.8, training enabled, wheel output).
    - template: templates/run-docker-build-steps.yml
      parameters:
        RunDockerBuildArgs: |
          -o ubuntu20.04 -d gpu \
          -t onnxruntime_ortmodule_distributed_tests_image \
          -x " \
            --config RelWithDebInfo \
            --use_cuda --cuda_version=11.8 --cuda_home=/usr/local/cuda-11.8 --cudnn_home=/usr/local/cuda-11.8 \
            --enable_training \
            --update --build \
            --build_wheel \
            " \
          -m \
          -u \
          -e
        DisplayName: 'Build'

    # Makes the MNIST file share available at /mnist on the agent; the same
    # path is bind-mounted into the test container below.
    # The pipeline variables are quoted so the expanded values reach the
    # script as single arguments. The folded scalar (>-) joins the lines
    # with single spaces into one command line.
    - bash: >-
        tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh
        -p "$(orttrainingtestdatascus-storage-key)"
        -s "//orttrainingtestdatascus.file.core.windows.net/mnist"
        -d "/mnist"
      displayName: 'Mount MNIST'
      condition: succeededOrFailed()

    # Entry point for all ORTModule distributed tests
    # Refer to orttraining/orttraining/test/python/how_to_add_ortmodule_distributed_ci_pipeline_tests.md for guidelines on how to add new tests to this pipeline.
    #
    # Inside the container the command removes /build/RelWithDebInfo/onnxruntime/,
    # installs the freshly built wheel, installs the ORTModule torch C++
    # extensions, and launches the test driver through launch_test.py.
    # NOTE(review): the rm -rf presumably keeps Python from importing the
    # in-tree build directory instead of the installed wheel - confirm.
    - script: |
        docker run \
          --gpus all \
          --shm-size=1024m \
          --rm \
          --volume "$(Build.SourcesDirectory):/onnxruntime_src" \
          --volume "$(Build.BinariesDirectory):/build" \
          --volume /mnist:/mnist \
          onnxruntime_ortmodule_distributed_tests_image \
            bash -c "rm -rf /build/RelWithDebInfo/onnxruntime/ && python3 -m pip install /build/RelWithDebInfo/dist/onnxruntime*.whl && python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install && /build/RelWithDebInfo/launch_test.py --cmd_line_with_args 'python orttraining_ortmodule_distributed_tests.py --mnist /mnist' --cwd /build/RelWithDebInfo"
      displayName: 'Run orttraining_ortmodule_distributed_tests.py'
      condition: succeededOrFailed()
      timeoutInMinutes: 30

    - template: templates/component-governance-component-detection-steps.yml
      parameters:
        condition: 'succeeded'

    - template: templates/clean-agent-build-directory-step.yml

# Stage: builds a CUDA 11.8 ONNX Runtime wheel with MPI and NCCL enabled
# inside a Docker image and then runs the collective-ops test under mpirun.
- stage: DistributedInferenceTest
  # Empty dependency list: this stage does not wait for any other stage.
  dependsOn: []
  jobs:
  - job: Onnxruntime_Linux_GPU_Inference_Distributed_Test

    timeoutInMinutes: 120
    pool: 'Onnxruntime-Linux-GPU-NC24sv3'

    steps:
    - checkout: self
      clean: true
      submodules: recursive

    # Build step. The argument string is handed verbatim to the shared
    # template; everything between the quotes after -x is the set of build
    # options (RelWithDebInfo, CUDA 11.8, wheel output, MPI and NCCL).
    - template: templates/run-docker-build-steps.yml
      parameters:
        RunDockerBuildArgs: |
          -o ubuntu20.04 -d gpu \
          -t onnxruntime_ortmodule_distributed_tests_image \
          -x " \
            --config RelWithDebInfo \
            --use_cuda --cuda_version=11.8 --cuda_home=/usr/local/cuda-11.8 --cudnn_home=/usr/local/cuda-11.8 \
            --update --build \
            --build_wheel \
            --use_mpi \
            --enable_nccl \
            " \
          -m \
          -u \
          -e
        DisplayName: 'Build'

    # Inside the container the command removes /build/RelWithDebInfo/onnxruntime/,
    # installs mpi4py and the freshly built wheel, then starts 4 MPI ranks of
    # onnxruntime_test_collective.py with NCCL_DEBUG=INFO exported to them.
    # NOTE(review): /mnist is bind-mounted although this stage has no
    # 'Mount MNIST' step and the test command never references /mnist -
    # confirm whether the mount is still needed.
    - script: |
        docker run \
          --gpus all \
          --shm-size=1024m \
          --rm \
          --volume "$(Build.SourcesDirectory):/onnxruntime_src" \
          --volume "$(Build.BinariesDirectory):/build" \
          --volume /mnist:/mnist \
          onnxruntime_ortmodule_distributed_tests_image \
            bash -c "rm -rf /build/RelWithDebInfo/onnxruntime/ && python3 -m pip install mpi4py && python3 -m pip install /build/RelWithDebInfo/dist/onnxruntime*.whl && mpirun -n 4 -x NCCL_DEBUG=INFO python /onnxruntime_src/onnxruntime/test/python/onnxruntime_test_collective.py"
      displayName: 'Run onnxruntime_test_collective.py'
      condition: succeededOrFailed()
      timeoutInMinutes: 30

    - template: templates/component-governance-component-detection-steps.yml
      parameters:
        condition: 'succeeded'

    - template: templates/clean-agent-build-directory-step.yml