mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-23 02:38:28 +00:00
### Description: orttrainingtestdatascus only stores the MNIST dataset, which is just 64 MB, in Azure Files. To meet security requirements and reduce maintenance cost, the test data is moved to lotusscus and stored in Azure Blob Storage.
134 lines
4.4 KiB
YAML
134 lines
4.4 KiB
YAML
##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py ####
# Push (CI) trigger: limited to main and rel-* branches; the listed
# documentation and JS-only paths are excluded from the path filter.
trigger:
  branches:
    include:
    - main
    - rel-*
  paths:
    exclude:
    - docs/**
    - README.md
    - CONTRIBUTING.md
    - BUILD.md
    - 'js/web'
    - 'onnxruntime/core/providers/js'
# Pull-request trigger: same branch and path filters as the push trigger.
pr:
  branches:
    include:
    - main
    - rel-*
  paths:
    exclude:
    - docs/**
    - README.md
    - CONTRIBUTING.md
    - BUILD.md
    - 'js/web'
    - 'onnxruntime/core/providers/js'
#### end trigger ####

stages:
# Stage: builds a CUDA 11.8, training-enabled wheel inside a Docker image and
# then runs the ORTModule distributed tests from that wheel on a GPU agent.
- stage: ORTModuleDistributedTest
  dependsOn: []
  jobs:
  - job: Onnxruntime_Linux_GPU_ORTModule_Distributed_Test

    timeoutInMinutes: 120
    pool: 'Onnxruntime-Linux-GPU-NC24sv3'

    steps:
    - checkout: self
      clean: true
      submodules: recursive

    # NOTE(review): presumably this template populates
    # $(Agent.TempDirectory)/mnist, which the test step below mounts as /mnist
    # -- confirm against the template.
    - template: templates/jobs/download_training_test_data.yml

    - template: templates/run-docker-build-steps.yml
      parameters:
        RunDockerBuildArgs: |
          -o ubuntu20.04 -d gpu \
          -t onnxruntime_ortmodule_distributed_tests_image \
          -x " \
            --config RelWithDebInfo \
            --use_cuda --cuda_version=11.8 --cuda_home=/usr/local/cuda-11.8 --cudnn_home=/usr/local/cuda-11.8 \
            --enable_training \
            --update --build \
            --build_wheel \
            " \
          -m \
          -u \
          -e
        DisplayName: 'Build'

    # Entry point for all ORTModule distributed tests
    # Refer to orttraining/orttraining/test/python/how_to_add_ortmodule_distributed_ci_pipeline_tests.md for guidelines on how to add new tests to this pipeline.
    #
    # The in-container command removes /build/RelWithDebInfo/onnxruntime/,
    # installs the built wheel, installs the torch cpp extensions, and then
    # launches the tests through launch_test.py with /mnist as the data path.
    # The command must end at the closing quote of `bash -c "..."`: a trailing
    # `\` there would glue any line appended to this script onto docker run.
    - script: |
        docker run \
          --gpus all \
          --shm-size=1024m \
          --rm \
          --volume $(Build.SourcesDirectory):/onnxruntime_src \
          --volume $(Build.BinariesDirectory):/build \
          --volume $(Agent.TempDirectory)/mnist:/mnist \
          onnxruntime_ortmodule_distributed_tests_image \
            bash -c "rm -rf /build/RelWithDebInfo/onnxruntime/ && python3 -m pip install /build/RelWithDebInfo/dist/onnxruntime*.whl && python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install && /build/RelWithDebInfo/launch_test.py --cmd_line_with_args 'python orttraining_ortmodule_distributed_tests.py --mnist /mnist' --cwd /build/RelWithDebInfo"
      displayName: 'Run orttraining_ortmodule_distributed_tests.py'
      condition: succeededOrFailed()
      timeoutInMinutes: 30

    - template: templates/component-governance-component-detection-steps.yml
      parameters:
        condition: 'succeeded'

    - template: templates/clean-agent-build-directory-step.yml

# Stage: builds a CUDA 11.8 inference wheel with MPI and NCCL enabled inside a
# Docker image, then runs the collective and distributed Python tests under
# mpirun on a GPU agent.
- stage: DistributedInferenceTest
  dependsOn: []
  jobs:
  - job: Onnxruntime_Linux_GPU_Inference_Distributed_Test

    timeoutInMinutes: 120
    pool: 'Onnxruntime-Linux-GPU-NC24sv3'

    steps:
    - checkout: self
      clean: true
      submodules: recursive

    - template: templates/run-docker-build-steps.yml
      parameters:
        RunDockerBuildArgs: |
          -o ubuntu20.04 -d gpu \
          -t onnxruntime_ortmodule_distributed_tests_image \
          -x " \
            --config RelWithDebInfo \
            --use_cuda --cuda_version=11.8 --cuda_home=/usr/local/cuda-11.8 --cudnn_home=/usr/local/cuda-11.8 \
            --update --build \
            --build_wheel \
            --use_mpi \
            --enable_nccl \
            " \
          -m \
          -u \
          -e
        DisplayName: 'Build'

    # The in-container command removes /build/RelWithDebInfo/onnxruntime/,
    # installs mpi4py, onnxscript and the built wheel, then runs
    # onnxruntime_test_collective.py on 4 ranks and
    # onnxruntime_test_distributed.py on 2 ranks.
    # No dataset volume is mounted: this stage has no data-download step and
    # neither command references /mnist, so the container only needs the
    # source and build directories.
    # The command must end at the closing quote of `bash -c "..."`: a trailing
    # `\` there would glue any line appended to this script onto docker run.
    - script: |
        docker run \
          --gpus all \
          --shm-size=1024m \
          --rm \
          --volume $(Build.SourcesDirectory):/onnxruntime_src \
          --volume $(Build.BinariesDirectory):/build \
          onnxruntime_ortmodule_distributed_tests_image \
            bash -c "rm -rf /build/RelWithDebInfo/onnxruntime/ && python3 -m pip install mpi4py onnxscript && python3 -m pip install /build/RelWithDebInfo/dist/onnxruntime*.whl && mpirun -n 4 -x NCCL_DEBUG=INFO python /onnxruntime_src/onnxruntime/test/python/onnxruntime_test_collective.py && mpirun -n 2 -x NCCL_DEBUG=INFO python /onnxruntime_src/onnxruntime/test/python/onnxruntime_test_distributed.py"
      displayName: 'Run onnxruntime_test_collective.py and onnxruntime_test_distributed.py'
      condition: succeededOrFailed()
      timeoutInMinutes: 30

    - template: templates/component-governance-component-detection-steps.yml
      parameters:
        condition: 'succeeded'

    - template: templates/clean-agent-build-directory-step.yml