mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-28 03:20:58 +00:00
* move e2e nightly pipeline to azure devop Co-authored-by: liqun <liqun@OrtTrainingDev4.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
255 lines
10 KiB
YAML
255 lines
10 KiB
YAML
# End-to-end test pipeline for ORT training on Linux multi-GPU agents.
# CI triggers are disabled for this pipeline.
# NOTE(review): the nightly schedule is presumably configured outside this
# file - confirm.
trigger: none

jobs:
- job: Orttraining_Linux_GPU_Training_E2E_Test_Nightly
  # Agent pool the job is queued on.
  pool: 'Linux-Multi-GPU-V100-E2E3'
  # Upper bound for the whole job, in minutes.
  timeoutInMinutes: 120

  steps:
|
|
# Check out this repository into a cleaned working directory, pulling all
# submodules recursively.
- checkout: self
  submodules: recursive
  clean: true
|
|
|
|
# Build step delegated to the shared docker build template. The image is
# tagged onnxruntime_e2e_test_image (-t), which is the image name the test
# steps in this file pass to `docker run`.
# The quoted -x string holds the build flags: RelWithDebInfo config,
# training enabled, wheel build, and both training E2E test suites.
- template: templates/run-docker-build-steps.yml
  parameters:
    DisplayName: 'Build'
    RunDockerBuildArgs: |
      -o ubuntu16.04 -d gpu -r $(Build.BinariesDirectory) \
      -t onnxruntime_e2e_test_image \
      -x " \
        --config RelWithDebInfo \
        --enable_training \
        --update --build \
        --build_wheel \
        --enable_training_python_frontend_e2e_tests \
        --enable_training_pipeline_e2e_tests \
        " \
      -m
|
|
|
|
# Download the E2E training test data archive into
# $(Build.BinariesDirectory)/training_e2e_test_data.
# Update the snapshot timestamp in the URL and the SHA-256 digest together
# whenever the E2E test data changes.
# The URL is single-quoted because it contains '?', which the shell would
# otherwise treat as a filename-expansion (glob) character.
- script: |
    orttraining/tools/ci_test/download_azure_blob_archive.py \
      --azure_blob_url 'https://onnxruntimetestdata.blob.core.windows.net/training/onnxruntime_training_data.zip?snapshot=2020-06-15T23:17:35.8314853Z' \
      --target_dir $(Build.BinariesDirectory)/training_e2e_test_data \
      --archive_sha256_digest B01C169B6550D1A0A6F1B4E2F34AE2A8714B52DBB70AC04DA85D371F691BDFF9
  displayName: 'Download onnxruntime_training_data.zip data'
|
|
|
|
# Runs the test data mount script on the agent; the storage key pipeline
# variable is passed as the script's only argument.
- displayName: 'Map test data'
  # Run even when an earlier step failed so that all tests are still run.
  condition: succeededOrFailed()
  bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh $(orttrainingtestdata-storage-key)
|
|
|
|
# run_training_pipeline_e2e_tests.py hits an OOM - it needs slightly more than the 16GB limit.
# The step is left here, commented out, for further investigation:
# https://msdata.visualstudio.com/Vienna/_workitems/edit/956642
|
|
# - script: |
|
|
# docker run \
|
|
# --gpus all \
|
|
# --shm-size=1024m \
|
|
# --rm \
|
|
# --volume $(Build.BinariesDirectory):/build \
|
|
# --volume /bert_data:/bert_data \
|
|
# --volume /bert_ort:/bert_ort \
|
|
# onnxruntime_e2e_test_image \
|
|
# /build/RelWithDebInfo/run_training_pipeline_e2e_tests.py \
|
|
# --cwd /build/RelWithDebInfo
|
|
# displayName: 'Run run_training_pipeline_e2e_tests.py'
|
|
# condition: succeededOrFailed() # ensure all tests are run
|
|
# timeoutInMinutes: 10
|
|
|
|
# Runs run_batch_size_test.py from the source tree inside the test image,
# with all GPUs exposed. The downloaded test data is mounted read-only.
- displayName: 'Run batch size test'
  # Run even when an earlier step failed so that all tests are run.
  condition: succeededOrFailed()
  script: |
    docker run \
      --gpus all \
      --rm \
      --volume $(Build.SourcesDirectory):/onnxruntime_src \
      --volume $(Build.BinariesDirectory):/build \
      --volume $(Build.BinariesDirectory)/training_e2e_test_data:/training_e2e_test_data:ro \
      onnxruntime_e2e_test_image \
        /onnxruntime_src/orttraining/tools/ci_test/run_batch_size_test.py \
          --binary_dir /build/RelWithDebInfo \
          --model_root /training_e2e_test_data/models
|
|
|
|
# Runs run_convergence_test.py from the source tree inside the test image,
# with all GPUs exposed. Models and training data both come from the
# read-only test data mount.
- displayName: 'Run convergence test'
  # Run even when an earlier step failed so that all tests are run.
  condition: succeededOrFailed()
  script: |
    docker run \
      --gpus all \
      --rm \
      --volume $(Build.SourcesDirectory):/onnxruntime_src \
      --volume $(Build.BinariesDirectory):/build \
      --volume $(Build.BinariesDirectory)/training_e2e_test_data:/training_e2e_test_data:ro \
      onnxruntime_e2e_test_image \
        /onnxruntime_src/orttraining/tools/ci_test/run_convergence_test.py \
          --binary_dir /build/RelWithDebInfo \
          --model_root /training_e2e_test_data/models \
          --training_data_root /training_e2e_test_data/data
|
|
|
|
# migrated from frontend e2e pipeline
|
|
# Launches orttraining_run_frontend_batch_size_test.py through launch_test.py
# from the build output directory, with /bert_data mounted from the host.
# launch_test.py is given `--env CUDA_VISIBLE_DEVICES 2`.
# NOTE(review): presumably this pins the test to one GPU - confirm against
# launch_test.py.
- displayName: 'Run orttraining_run_frontend_batch_size_test.py'
  # Run even when an earlier step failed so that all tests are run.
  condition: succeededOrFailed()
  timeoutInMinutes: 30
  script: |
    docker run \
      --gpus all \
      --shm-size=1024m \
      --rm \
      --volume $(Build.SourcesDirectory):/onnxruntime_src \
      --volume $(Build.BinariesDirectory):/build \
      --volume /bert_data:/bert_data \
      onnxruntime_e2e_test_image \
        /build/RelWithDebInfo/launch_test.py \
          --cmd_line_with_args "python orttraining_run_frontend_batch_size_test.py -v" \
          --cwd /build/RelWithDebInfo \
          --env CUDA_VISIBLE_DEVICES 2
|
|
|
|
# Launches orttraining_run_glue.py under mpirun with 4 processes and
# NCCL_DEBUG=INFO exported to them, through launch_test.py from the build
# output directory. /bert_data is mounted from the host.
# The display name carries the same 'Run ' prefix as the other test steps in
# this pipeline so the steps read consistently in the run summary.
- script: |
    docker run \
      --gpus all \
      --shm-size=1024m \
      --rm \
      --volume $(Build.SourcesDirectory):/onnxruntime_src \
      --volume $(Build.BinariesDirectory):/build \
      --volume /bert_data:/bert_data \
      onnxruntime_e2e_test_image \
        /build/RelWithDebInfo/launch_test.py \
          --cmd_line_with_args "mpirun -n 4 -x NCCL_DEBUG=INFO python orttraining_run_glue.py" \
          --cwd /build/RelWithDebInfo
  displayName: 'Run mpirun -n 4 -x NCCL_DEBUG=INFO python orttraining_run_glue.py'
  # Run even when an earlier step failed so that all tests are run.
  condition: succeededOrFailed()
  timeoutInMinutes: 10
|
|
|
|
# Launches the single test ORTGlueTest.test_bert_with_mrpc from
# orttraining_run_glue.py through launch_test.py, with /bert_data mounted
# from the host and `--env CUDA_VISIBLE_DEVICES 2` passed to the launcher.
- displayName: 'Run orttraining_run_glue.py ORTGlueTest.test_bert_with_mrpc'
  # Run even when an earlier step failed so that all tests are run.
  condition: succeededOrFailed()
  timeoutInMinutes: 10
  script: |
    docker run \
      --gpus all \
      --shm-size=1024m \
      --rm \
      --volume $(Build.SourcesDirectory):/onnxruntime_src \
      --volume $(Build.BinariesDirectory):/build \
      --volume /bert_data:/bert_data \
      onnxruntime_e2e_test_image \
        /build/RelWithDebInfo/launch_test.py \
          --cmd_line_with_args "python orttraining_run_glue.py ORTGlueTest.test_bert_with_mrpc -v" \
          --cwd /build/RelWithDebInfo \
          --env CUDA_VISIBLE_DEVICES 2
|
|
|
|
# Launches the single test ORTGlueTest.test_bert_fp16_with_mrpc from
# orttraining_run_glue.py through launch_test.py, with /bert_data mounted
# from the host and `--env CUDA_VISIBLE_DEVICES 2` passed to the launcher.
- displayName: 'Run orttraining_run_glue.py ORTGlueTest.test_bert_fp16_with_mrpc'
  # Run even when an earlier step failed so that all tests are run.
  condition: succeededOrFailed()
  timeoutInMinutes: 10
  script: |
    docker run \
      --gpus all \
      --shm-size=1024m \
      --rm \
      --volume $(Build.SourcesDirectory):/onnxruntime_src \
      --volume $(Build.BinariesDirectory):/build \
      --volume /bert_data:/bert_data \
      onnxruntime_e2e_test_image \
        /build/RelWithDebInfo/launch_test.py \
          --cmd_line_with_args "python orttraining_run_glue.py ORTGlueTest.test_bert_fp16_with_mrpc -v" \
          --cwd /build/RelWithDebInfo \
          --env CUDA_VISIBLE_DEVICES 2
|
|
|
|
# Launches the single test ORTGlueTest.test_roberta_with_mrpc from
# orttraining_run_glue.py through launch_test.py, with /bert_data mounted
# from the host and `--env CUDA_VISIBLE_DEVICES 2` passed to the launcher.
- displayName: 'Run orttraining_run_glue.py ORTGlueTest.test_roberta_with_mrpc'
  # Run even when an earlier step failed so that all tests are run.
  condition: succeededOrFailed()
  timeoutInMinutes: 10
  script: |
    docker run \
      --gpus all \
      --shm-size=1024m \
      --rm \
      --volume $(Build.SourcesDirectory):/onnxruntime_src \
      --volume $(Build.BinariesDirectory):/build \
      --volume /bert_data:/bert_data \
      onnxruntime_e2e_test_image \
        /build/RelWithDebInfo/launch_test.py \
          --cmd_line_with_args "python orttraining_run_glue.py ORTGlueTest.test_roberta_with_mrpc -v" \
          --cwd /build/RelWithDebInfo \
          --env CUDA_VISIBLE_DEVICES 2
|
|
|
|
# Launches the single test ORTGlueTest.test_roberta_fp16_with_mrpc from
# orttraining_run_glue.py through launch_test.py, with /bert_data mounted
# from the host and `--env CUDA_VISIBLE_DEVICES 2` passed to the launcher.
- displayName: 'Run orttraining_run_glue.py ORTGlueTest.test_roberta_fp16_with_mrpc'
  # Run even when an earlier step failed so that all tests are run.
  condition: succeededOrFailed()
  timeoutInMinutes: 10
  script: |
    docker run \
      --gpus all \
      --shm-size=1024m \
      --rm \
      --volume $(Build.SourcesDirectory):/onnxruntime_src \
      --volume $(Build.BinariesDirectory):/build \
      --volume /bert_data:/bert_data \
      onnxruntime_e2e_test_image \
        /build/RelWithDebInfo/launch_test.py \
          --cmd_line_with_args "python orttraining_run_glue.py ORTGlueTest.test_roberta_fp16_with_mrpc -v" \
          --cwd /build/RelWithDebInfo \
          --env CUDA_VISIBLE_DEVICES 2
|
|
|
|
# Launches the single test ORTMultipleChoiceTest.test_bert_fp16_with_swag
# from orttraining_run_multiple_choice.py through launch_test.py, with
# /bert_data mounted from the host and `--env CUDA_VISIBLE_DEVICES 2` passed
# to the launcher.
- displayName: 'Run orttraining_run_multiple_choice.py ORTMultipleChoiceTest.test_bert_fp16_with_swag'
  # Run even when an earlier step failed so that all tests are run.
  condition: succeededOrFailed()
  timeoutInMinutes: 30
  script: |
    docker run \
      --gpus all \
      --shm-size=1024m \
      --rm \
      --volume $(Build.SourcesDirectory):/onnxruntime_src \
      --volume $(Build.BinariesDirectory):/build \
      --volume /bert_data:/bert_data \
      onnxruntime_e2e_test_image \
        /build/RelWithDebInfo/launch_test.py \
          --cmd_line_with_args "python orttraining_run_multiple_choice.py ORTMultipleChoiceTest.test_bert_fp16_with_swag -v" \
          --cwd /build/RelWithDebInfo \
          --env CUDA_VISIBLE_DEVICES 2
|
|
|
|
# Launches onnxruntime_test_ort_trainer_with_mixed_precision.py through
# launch_test.py from the build output directory, with /bert_data mounted
# from the host. No CUDA_VISIBLE_DEVICES override is passed for this test.
- displayName: 'Run onnxruntime_test_ort_trainer_with_mixed_precision.py'
  # Run even when an earlier step failed so that all tests are run.
  condition: succeededOrFailed()
  timeoutInMinutes: 10
  script: |
    docker run \
      --gpus all \
      --shm-size=1024m \
      --rm \
      --volume $(Build.SourcesDirectory):/onnxruntime_src \
      --volume $(Build.BinariesDirectory):/build \
      --volume /bert_data:/bert_data \
      onnxruntime_e2e_test_image \
        /build/RelWithDebInfo/launch_test.py \
          --cmd_line_with_args "python onnxruntime_test_ort_trainer_with_mixed_precision.py -v" \
          --cwd /build/RelWithDebInfo
|
|
|
|
# Launches the single test BertModelTest.test_for_pretraining_mixed_precision
# from orttraining_test_transformers.py through launch_test.py.
# NOTE(review): unlike the sibling launch_test.py steps, this one does not
# pass --shm-size to docker - confirm that is intentional.
- displayName: 'Run orttraining_test_transformers.py BertModelTest.test_for_pretraining_mixed_precision'
  # Run even when an earlier step failed so that all tests are run.
  condition: succeededOrFailed()
  timeoutInMinutes: 10
  script: |
    docker run \
      --gpus all \
      --rm \
      --volume $(Build.SourcesDirectory):/onnxruntime_src \
      --volume $(Build.BinariesDirectory):/build \
      --volume /bert_data:/bert_data \
      onnxruntime_e2e_test_image \
        /build/RelWithDebInfo/launch_test.py \
          --cmd_line_with_args "python orttraining_test_transformers.py BertModelTest.test_for_pretraining_mixed_precision -v" \
          --cwd /build/RelWithDebInfo
|
|
|
|
# Launches ORTBertPretrainTest.test_pretrain_convergence from
# orttraining_run_bert_pretrain.py under mpirun with 4 processes and
# NCCL_DEBUG=INFO exported to them, through launch_test.py. /bert_data is
# mounted from the host.
- displayName: 'Run orttraining_run_bert_pretrain.py ORTBertPretrainTest.test_pretrain_convergence'
  # Run even when an earlier step failed so that all tests are run.
  condition: succeededOrFailed()
  timeoutInMinutes: 20
  script: |
    docker run \
      --gpus all \
      --shm-size=1024m \
      --rm \
      --volume $(Build.SourcesDirectory):/onnxruntime_src \
      --volume $(Build.BinariesDirectory):/build \
      --volume /bert_data:/bert_data \
      onnxruntime_e2e_test_image \
        /build/RelWithDebInfo/launch_test.py \
          --cmd_line_with_args "mpirun -n 4 -x NCCL_DEBUG=INFO python orttraining_run_bert_pretrain.py ORTBertPretrainTest.test_pretrain_convergence" \
          --cwd /build/RelWithDebInfo
|
|
|
|
# Final step: include the shared clean-agent-build-directory template.
# NOTE(review): presumably this removes build output from the agent - confirm
# against the template.
- template: templates/clean-agent-build-directory-step.yml
|