# Mirror of https://github.com/saymrwulf/onnxruntime.git
# Synced 2026-06-06 00:03:22 +00:00
#
# Change description:
#   1. Remove OpenMP-related packaging pipelines and build jobs.
#   2. Set continueOnError to true for the TSAUpload tasks; their service has
#      been unstable recently.
#   3. Update the Ubuntu 16 Docker images to Ubuntu 18, in preparation for
#      C++17 support.
#   4. Cherry-pick the changes from 1.7.1 into master: update CFLAGS/CXXFLAGS
#      to strip out debug symbols.
#
# File info: 321 lines, 13 KiB, YAML
# ORT training end-to-end test pipeline for multi-GPU Linux agents.
#
# A single job builds a Docker image tagged `onnxruntime_e2e_test_image` via
# the shared build template and then runs each test inside that image with
# `docker run`. Every test step uses `condition: succeededOrFailed()` so that
# one failing test does not prevent the remaining tests from running.

# No CI trigger: this pipeline only runs when started explicitly or by
# another mechanism configured outside this file.
trigger: none

jobs:
- job: Orttraining_Linux_GPU_Distributed_E2E_Test

  # Upper bound for the whole job; individual steps set tighter limits below.
  timeoutInMinutes: 180
  pool: 'Linux-Multi-GPU-V100-E2E3'

  steps:
  - checkout: self
    clean: true
    submodules: recursive

  # Build step. The argument string is passed through to the template
  # unchanged; the meaning of the individual flags is defined by the template
  # and its build script, not by this file.
  - template: templates/run-docker-build-steps.yml
    parameters:
      RunDockerBuildArgs: |
        -o ubuntu18.04 -d gpu -r $(Build.BinariesDirectory) \
        -t onnxruntime_e2e_test_image \
        -x " \
          --config RelWithDebInfo \
          --enable_training \
          --update --build \
          --build_wheel \
          " \
        -m
      DisplayName: 'Build'

  # update these if the E2E test data changes
  # The URL is single-quoted because it contains `?`, which the shell would
  # otherwise treat as a pathname-expansion pattern.
  - script: |
      orttraining/tools/ci_test/download_azure_blob_archive.py \
        --azure_blob_url 'https://onnxruntimetestdata.blob.core.windows.net/training/onnxruntime_training_data.zip?snapshot=2020-06-15T23:17:35.8314853Z' \
        --target_dir $(Build.BinariesDirectory)/training_e2e_test_data \
        --archive_sha256_digest B01C169B6550D1A0A6F1B4E2F34AE2A8714B52DBB70AC04DA85D371F691BDFF9
    displayName: 'Download onnxruntime_training_data.zip data'

  # NOTE(review): the storage key is passed to the script as a command-line
  # argument; confirm that this is acceptable for a secret value.
  # Later steps mount /bert_data from the host - presumably populated by this
  # script; verify against the script itself.
  - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh $(orttrainingtestdata-storage-key)
    displayName: 'Map test data'
    condition: succeededOrFailed() # ensure all tests are run

  # The last argument line deliberately has no trailing backslash: a dangling
  # line continuation would swallow any line appended after it.
  - script: |
      docker run \
        --gpus all \
        --shm-size=1024m \
        --rm \
        --volume $(Build.SourcesDirectory):/onnxruntime_src \
        --volume $(Build.BinariesDirectory):/build \
        onnxruntime_e2e_test_image \
          /build/RelWithDebInfo/launch_test.py \
            --cmd_line_with_args "python orttraining_distributed_tests.py" \
            --cwd /build/RelWithDebInfo
    displayName: 'Run orttraining_distributed_tests.py'
    condition: succeededOrFailed()
    timeoutInMinutes: 120

  # BERT training binary launched with 4 MPI ranks, pipeline parallel size 4.
  - script: |
      docker run \
        --gpus all \
        --shm-size=1024m \
        --rm \
        --volume $(Build.BinariesDirectory):/build \
        --volume /bert_data:/bert_data \
        onnxruntime_e2e_test_image \
          mpirun -n 4 \
            /build/RelWithDebInfo/onnxruntime_training_bert \
              --ort_log_severity 1 \
              --optimizer=Lamb \
              --learning_rate=3e-3 \
              --max_seq_length=128 \
              --max_predictions_per_seq=20 \
              --warmup_ratio=0.2843 \
              --warmup_mode=Poly \
              --model_name /bert_data/bert_models/nv/bert-large/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12 \
              --train_data_dir /bert_data/128/books_wiki_en_corpus/train \
              --test_data_dir /bert_data/128/books_wiki_en_corpus/test \
              --display_loss_steps 1 \
              --use_nccl \
              --use_mixed_precision \
              --allreduce_in_fp16 \
              --gradient_accumulation_steps 48 \
              --num_train_steps 96 \
              --train_batch_size 40 \
              --pipeline_parallel_size 4 \
              --cut_group_info 1149:407-1219/1341/1463/1585/1707/1829,1881:407-1951/2073/2195/2317/2439/2561,2613:407-2683/2805/2927/3049/3171/3293
    displayName: 'mpirun onnxruntime_training_bert --pipeline_parallel_size 4'
    condition: succeededOrFailed() # ensure all tests are run
    timeoutInMinutes: 10

  # Same binary and data, with data parallel size 2 and pipeline parallel
  # size 2 (and a correspondingly different --cut_group_info).
  - script: |
      docker run \
        --gpus all \
        --shm-size=1024m \
        --rm \
        --volume $(Build.BinariesDirectory):/build \
        --volume /bert_data:/bert_data \
        onnxruntime_e2e_test_image \
          mpirun -n 4 \
            /build/RelWithDebInfo/onnxruntime_training_bert \
              --ort_log_severity 1 \
              --optimizer=Lamb \
              --learning_rate=3e-3 \
              --max_seq_length=128 \
              --max_predictions_per_seq=20 \
              --warmup_ratio=0.2843 \
              --warmup_mode=Poly \
              --model_name /bert_data/bert_models/nv/bert-large/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12 \
              --train_data_dir /bert_data/128/books_wiki_en_corpus/train \
              --test_data_dir /bert_data/128/books_wiki_en_corpus/test \
              --display_loss_steps 1 \
              --use_nccl \
              --use_mixed_precision \
              --allreduce_in_fp16 \
              --gradient_accumulation_steps 48 \
              --num_train_steps 96 \
              --train_batch_size 40 \
              --data_parallel_size 2 \
              --pipeline_parallel_size 2 \
              --cut_group_info 1881:407-1951/2073/2195/2317/2439/2561/2683/2805/2927/3049/3171/3293
    displayName: 'mpirun onnxruntime_training_bert --data_parallel_size 2 --pipeline_parallel_size 2'
    condition: succeededOrFailed() # ensure all tests are run
    timeoutInMinutes: 10

  # The downloaded test data directory is mounted read-only for this step
  # and the next one.
  - script: |
      docker run \
        --gpus all \
        --rm \
        --volume $(Build.SourcesDirectory):/onnxruntime_src \
        --volume $(Build.BinariesDirectory):/build \
        --volume $(Build.BinariesDirectory)/training_e2e_test_data:/training_e2e_test_data:ro \
        onnxruntime_e2e_test_image \
          /onnxruntime_src/orttraining/tools/ci_test/run_batch_size_test.py \
            --binary_dir /build/RelWithDebInfo \
            --model_root /training_e2e_test_data/models
    displayName: 'Run batch size test'
    condition: succeededOrFailed() # ensure all tests are run

  - script: |
      docker run \
        --gpus all \
        --rm \
        --volume $(Build.SourcesDirectory):/onnxruntime_src \
        --volume $(Build.BinariesDirectory):/build \
        --volume $(Build.BinariesDirectory)/training_e2e_test_data:/training_e2e_test_data:ro \
        onnxruntime_e2e_test_image \
          /onnxruntime_src/orttraining/tools/ci_test/run_convergence_test.py \
            --binary_dir /build/RelWithDebInfo \
            --model_root /training_e2e_test_data/models \
            --training_data_root /training_e2e_test_data/data
    displayName: 'Run convergence test'
    condition: succeededOrFailed() # ensure all tests are run

  # migrated from frontend e2e pipeline
  # The single-GPU steps below pass `--env CUDA_VISIBLE_DEVICES 2` to
  # launch_test.py.
  - script: |
      docker run \
        --gpus all \
        --shm-size=1024m \
        --rm \
        --volume $(Build.SourcesDirectory):/onnxruntime_src \
        --volume $(Build.BinariesDirectory):/build \
        --volume /bert_data:/bert_data \
        onnxruntime_e2e_test_image \
          /build/RelWithDebInfo/launch_test.py \
            --cmd_line_with_args "python orttraining_run_frontend_batch_size_test.py -v" \
            --cwd /build/RelWithDebInfo \
            --env CUDA_VISIBLE_DEVICES 2
    displayName: 'Run orttraining_run_frontend_batch_size_test.py'
    condition: succeededOrFailed() # ensure all tests are run
    timeoutInMinutes: 30

  - script: |
      docker run \
        --gpus all \
        --shm-size=1024m \
        --rm \
        --volume $(Build.SourcesDirectory):/onnxruntime_src \
        --volume $(Build.BinariesDirectory):/build \
        --volume /bert_data:/bert_data \
        onnxruntime_e2e_test_image \
          /build/RelWithDebInfo/launch_test.py \
            --cmd_line_with_args "mpirun -n 4 -x NCCL_DEBUG=INFO python orttraining_run_glue.py" \
            --cwd /build/RelWithDebInfo
    displayName: 'mpirun -n 4 -x NCCL_DEBUG=INFO python orttraining_run_glue.py'
    condition: succeededOrFailed() # ensure all tests are run
    timeoutInMinutes: 10

  - script: |
      docker run \
        --gpus all \
        --shm-size=1024m \
        --rm \
        --volume $(Build.SourcesDirectory):/onnxruntime_src \
        --volume $(Build.BinariesDirectory):/build \
        --volume /bert_data:/bert_data \
        onnxruntime_e2e_test_image \
          /build/RelWithDebInfo/launch_test.py \
            --cmd_line_with_args "python orttraining_run_glue.py ORTGlueTest.test_bert_with_mrpc -v" \
            --cwd /build/RelWithDebInfo \
            --env CUDA_VISIBLE_DEVICES 2
    displayName: 'Run orttraining_run_glue.py ORTGlueTest.test_bert_with_mrpc'
    condition: succeededOrFailed() # ensure all tests are run
    timeoutInMinutes: 10

  - script: |
      docker run \
        --gpus all \
        --shm-size=1024m \
        --rm \
        --volume $(Build.SourcesDirectory):/onnxruntime_src \
        --volume $(Build.BinariesDirectory):/build \
        --volume /bert_data:/bert_data \
        onnxruntime_e2e_test_image \
          /build/RelWithDebInfo/launch_test.py \
            --cmd_line_with_args "python orttraining_run_glue.py ORTGlueTest.test_bert_fp16_with_mrpc -v" \
            --cwd /build/RelWithDebInfo \
            --env CUDA_VISIBLE_DEVICES 2
    displayName: 'Run orttraining_run_glue.py ORTGlueTest.test_bert_fp16_with_mrpc'
    condition: succeededOrFailed() # ensure all tests are run
    timeoutInMinutes: 10

  - script: |
      docker run \
        --gpus all \
        --shm-size=1024m \
        --rm \
        --volume $(Build.SourcesDirectory):/onnxruntime_src \
        --volume $(Build.BinariesDirectory):/build \
        --volume /bert_data:/bert_data \
        onnxruntime_e2e_test_image \
          /build/RelWithDebInfo/launch_test.py \
            --cmd_line_with_args "python orttraining_run_glue.py ORTGlueTest.test_roberta_with_mrpc -v" \
            --cwd /build/RelWithDebInfo \
            --env CUDA_VISIBLE_DEVICES 2
    displayName: 'Run orttraining_run_glue.py ORTGlueTest.test_roberta_with_mrpc'
    condition: succeededOrFailed() # ensure all tests are run
    timeoutInMinutes: 10

  - script: |
      docker run \
        --gpus all \
        --shm-size=1024m \
        --rm \
        --volume $(Build.SourcesDirectory):/onnxruntime_src \
        --volume $(Build.BinariesDirectory):/build \
        --volume /bert_data:/bert_data \
        onnxruntime_e2e_test_image \
          /build/RelWithDebInfo/launch_test.py \
            --cmd_line_with_args "python orttraining_run_glue.py ORTGlueTest.test_roberta_fp16_with_mrpc -v" \
            --cwd /build/RelWithDebInfo \
            --env CUDA_VISIBLE_DEVICES 2
    displayName: 'Run orttraining_run_glue.py ORTGlueTest.test_roberta_fp16_with_mrpc'
    condition: succeededOrFailed() # ensure all tests are run
    timeoutInMinutes: 10

  - script: |
      docker run \
        --gpus all \
        --shm-size=1024m \
        --rm \
        --volume $(Build.SourcesDirectory):/onnxruntime_src \
        --volume $(Build.BinariesDirectory):/build \
        --volume /bert_data:/bert_data \
        onnxruntime_e2e_test_image \
          /build/RelWithDebInfo/launch_test.py \
            --cmd_line_with_args "python orttraining_run_multiple_choice.py ORTMultipleChoiceTest.test_bert_fp16_with_swag -v" \
            --cwd /build/RelWithDebInfo \
            --env CUDA_VISIBLE_DEVICES 2
    displayName: 'Run orttraining_run_multiple_choice.py ORTMultipleChoiceTest.test_bert_fp16_with_swag'
    condition: succeededOrFailed() # ensure all tests are run
    timeoutInMinutes: 30

  - script: |
      docker run \
        --gpus all \
        --shm-size=1024m \
        --rm \
        --volume $(Build.SourcesDirectory):/onnxruntime_src \
        --volume $(Build.BinariesDirectory):/build \
        --volume /bert_data:/bert_data \
        onnxruntime_e2e_test_image \
          /build/RelWithDebInfo/launch_test.py \
            --cmd_line_with_args "python onnxruntime_test_ort_trainer_with_mixed_precision.py -v" \
            --cwd /build/RelWithDebInfo
    displayName: 'Run onnxruntime_test_ort_trainer_with_mixed_precision.py'
    condition: succeededOrFailed() # ensure all tests are run
    timeoutInMinutes: 10

  - script: |
      docker run \
        --gpus all \
        --rm \
        --volume $(Build.SourcesDirectory):/onnxruntime_src \
        --volume $(Build.BinariesDirectory):/build \
        --volume /bert_data:/bert_data \
        onnxruntime_e2e_test_image \
          /build/RelWithDebInfo/launch_test.py \
            --cmd_line_with_args "python orttraining_test_transformers.py BertModelTest.test_for_pretraining_mixed_precision -v" \
            --cwd /build/RelWithDebInfo
    displayName: 'Run orttraining_test_transformers.py BertModelTest.test_for_pretraining_mixed_precision'
    condition: succeededOrFailed() # ensure all tests are run
    timeoutInMinutes: 10

  - script: |
      docker run \
        --gpus all \
        --shm-size=1024m \
        --rm \
        --volume $(Build.SourcesDirectory):/onnxruntime_src \
        --volume $(Build.BinariesDirectory):/build \
        --volume /bert_data:/bert_data \
        onnxruntime_e2e_test_image \
          /build/RelWithDebInfo/launch_test.py \
            --cmd_line_with_args "mpirun -n 4 -x NCCL_DEBUG=INFO python orttraining_run_bert_pretrain.py ORTBertPretrainTest.test_pretrain_convergence" \
            --cwd /build/RelWithDebInfo
    displayName: 'Run orttraining_run_bert_pretrain.py ORTBertPretrainTest.test_pretrain_convergence'
    condition: succeededOrFailed() # ensure all tests are run
    timeoutInMinutes: 30

  # Unlike the test steps, this template is given `succeeded` as its
  # condition parameter.
  - template: templates/component-governance-component-detection-steps.yml
    parameters:
      condition: 'succeeded'

  - template: templates/clean-agent-build-directory-step.yml