onnxruntime/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-distributed-e2e-test-pipeline.yml
Changming Sun 4161758058
Remove openmp related packaging pipeline (#6991)
1. Remove openmp related packaging pipelines and build jobs.
2. Set continueOnError to true for the TSAUpload tasks. Their service has been unstable recently.
3. Update Ubuntu 16 docker images to Ubuntu 18, in preparation for getting C++17 support
4. Cherry-pick the changes in 1.7.1 to the master: updating CFLAGS/CXXFLAGS to strip out debug symbols
2021-03-12 10:02:59 -08:00

321 lines
13 KiB
YAML

# No continuous-integration trigger is configured for this pipeline.
trigger: none
jobs:
# The only job visible in this file; every step listed below belongs to it.
- job: Orttraining_Linux_GPU_Distributed_E2E_Test
# Upper bound for the whole job, in minutes (individual steps set their own limits).
timeoutInMinutes: 180
# Agent pool the job is scheduled on.
pool: 'Linux-Multi-GPU-V100-E2E3'
steps:
# Check out this repository into a cleaned working tree, pulling submodules recursively.
- checkout: self
  submodules: recursive
  clean: true
# Build step, delegated to the shared run-docker-build-steps template.
- template: templates/run-docker-build-steps.yml
  parameters:
    DisplayName: 'Build'
    # Arguments forwarded through the RunDockerBuildArgs template parameter.
    # The image is tagged onnxruntime_e2e_test_image, the name every later
    # `docker run` step in this job refers to.
    RunDockerBuildArgs: |
      -o ubuntu18.04 -d gpu -r $(Build.BinariesDirectory) \
      -t onnxruntime_e2e_test_image \
      -x " \
      --config RelWithDebInfo \
      --enable_training \
      --update --build \
      --build_wheel \
      " \
      -m
# Downloads the pinned E2E training data archive into
# $(Build.BinariesDirectory)/training_e2e_test_data.
# Update the snapshot URL and the SHA-256 digest together if the E2E test data changes.
- script: |
    orttraining/tools/ci_test/download_azure_blob_archive.py \
    --azure_blob_url "https://onnxruntimetestdata.blob.core.windows.net/training/onnxruntime_training_data.zip?snapshot=2020-06-15T23:17:35.8314853Z" \
    --target_dir $(Build.BinariesDirectory)/training_e2e_test_data \
    --archive_sha256_digest B01C169B6550D1A0A6F1B4E2F34AE2A8714B52DBB70AC04DA85D371F691BDFF9
  # The URL is quoted because it contains `?`, which bash otherwise treats as a
  # filename-expansion (glob) character.
  displayName: 'Download onnxruntime_training_data.zip data'
# Invokes azure_scale_set_vm_mount_test_data.sh with the
# $(orttrainingtestdata-storage-key) variable as its only argument.
# NOTE(review): presumably this provides the /bert_data directory that later
# steps mount into the container — confirm against the script.
- bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh $(orttrainingtestdata-storage-key)
  # ensure all tests are run
  condition: succeededOrFailed()
  displayName: 'Map test data'
# Runs orttraining_distributed_tests.py through launch_test.py inside the test
# image, with the source and build directories mounted into the container.
- script: |
    docker run \
    --gpus all \
    --shm-size=1024m \
    --rm \
    --volume $(Build.SourcesDirectory):/onnxruntime_src \
    --volume $(Build.BinariesDirectory):/build \
    onnxruntime_e2e_test_image \
    /build/RelWithDebInfo/launch_test.py \
    --cmd_line_with_args "python orttraining_distributed_tests.py" \
    --cwd /build/RelWithDebInfo
  # The final script line deliberately has no trailing `\`: a dangling line
  # continuation would swallow anything appended after the command.
  displayName: 'Run orttraining_distributed_tests.py'
  # Run even when an earlier step failed.
  condition: succeededOrFailed()
  timeoutInMinutes: 120
# Runs onnxruntime_training_bert under mpirun with 4 ranks and
# --pipeline_parallel_size 4, reading the model and corpus from /bert_data.
- script: |
    docker run \
    --gpus all \
    --shm-size=1024m \
    --rm \
    --volume $(Build.BinariesDirectory):/build \
    --volume /bert_data:/bert_data \
    onnxruntime_e2e_test_image \
    mpirun -n 4 \
    /build/RelWithDebInfo/onnxruntime_training_bert \
    --ort_log_severity 1 \
    --optimizer=Lamb \
    --learning_rate=3e-3 \
    --max_seq_length=128 \
    --max_predictions_per_seq=20 \
    --warmup_ratio=0.2843 \
    --warmup_mode=Poly \
    --model_name /bert_data/bert_models/nv/bert-large/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12 \
    --train_data_dir /bert_data/128/books_wiki_en_corpus/train \
    --test_data_dir /bert_data/128/books_wiki_en_corpus/test \
    --display_loss_steps 1 \
    --use_nccl \
    --use_mixed_precision \
    --allreduce_in_fp16 \
    --gradient_accumulation_steps 48 \
    --num_train_steps 96 \
    --train_batch_size 40 \
    --pipeline_parallel_size 4 \
    --cut_group_info 1149:407-1219/1341/1463/1585/1707/1829,1881:407-1951/2073/2195/2317/2439/2561,2613:407-2683/2805/2927/3049/3171/3293
  timeoutInMinutes: 10
  # ensure all tests are run
  condition: succeededOrFailed()
  displayName: 'mpirun onnxruntime_training_bert --pipeline_parallel_size 4'
# Same onnxruntime_training_bert invocation under mpirun with 4 ranks, this time
# with --data_parallel_size 2 and --pipeline_parallel_size 2 and a single cut group.
- script: |
    docker run \
    --gpus all \
    --shm-size=1024m \
    --rm \
    --volume $(Build.BinariesDirectory):/build \
    --volume /bert_data:/bert_data \
    onnxruntime_e2e_test_image \
    mpirun -n 4 \
    /build/RelWithDebInfo/onnxruntime_training_bert \
    --ort_log_severity 1 \
    --optimizer=Lamb \
    --learning_rate=3e-3 \
    --max_seq_length=128 \
    --max_predictions_per_seq=20 \
    --warmup_ratio=0.2843 \
    --warmup_mode=Poly \
    --model_name /bert_data/bert_models/nv/bert-large/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12 \
    --train_data_dir /bert_data/128/books_wiki_en_corpus/train \
    --test_data_dir /bert_data/128/books_wiki_en_corpus/test \
    --display_loss_steps 1 \
    --use_nccl \
    --use_mixed_precision \
    --allreduce_in_fp16 \
    --gradient_accumulation_steps 48 \
    --num_train_steps 96 \
    --train_batch_size 40 \
    --data_parallel_size 2 \
    --pipeline_parallel_size 2 \
    --cut_group_info 1881:407-1951/2073/2195/2317/2439/2561/2683/2805/2927/3049/3171/3293
  timeoutInMinutes: 10
  # ensure all tests are run
  condition: succeededOrFailed()
  displayName: 'mpirun onnxruntime_training_bert --data_parallel_size 2 --pipeline_parallel_size 2'
# Runs run_batch_size_test.py from the source tree against the RelWithDebInfo
# binaries; the downloaded E2E test data is mounted read-only.
- script: |
    docker run \
    --gpus all \
    --rm \
    --volume $(Build.SourcesDirectory):/onnxruntime_src \
    --volume $(Build.BinariesDirectory):/build \
    --volume $(Build.BinariesDirectory)/training_e2e_test_data:/training_e2e_test_data:ro \
    onnxruntime_e2e_test_image \
    /onnxruntime_src/orttraining/tools/ci_test/run_batch_size_test.py \
    --binary_dir /build/RelWithDebInfo \
    --model_root /training_e2e_test_data/models
  # ensure all tests are run
  condition: succeededOrFailed()
  displayName: 'Run batch size test'
# Runs run_convergence_test.py from the source tree, pointing it at the models
# and training data inside the read-only E2E test data mount.
- script: |
    docker run \
    --gpus all \
    --rm \
    --volume $(Build.SourcesDirectory):/onnxruntime_src \
    --volume $(Build.BinariesDirectory):/build \
    --volume $(Build.BinariesDirectory)/training_e2e_test_data:/training_e2e_test_data:ro \
    onnxruntime_e2e_test_image \
    /onnxruntime_src/orttraining/tools/ci_test/run_convergence_test.py \
    --binary_dir /build/RelWithDebInfo \
    --model_root /training_e2e_test_data/models \
    --training_data_root /training_e2e_test_data/data
  # ensure all tests are run
  condition: succeededOrFailed()
  displayName: 'Run convergence test'
# migrated from frontend e2e pipeline
# Runs orttraining_run_frontend_batch_size_test.py through launch_test.py,
# passing `--env CUDA_VISIBLE_DEVICES 2` to the launcher.
- script: |
    docker run \
    --gpus all \
    --shm-size=1024m \
    --rm \
    --volume $(Build.SourcesDirectory):/onnxruntime_src \
    --volume $(Build.BinariesDirectory):/build \
    --volume /bert_data:/bert_data \
    onnxruntime_e2e_test_image \
    /build/RelWithDebInfo/launch_test.py \
    --cmd_line_with_args "python orttraining_run_frontend_batch_size_test.py -v" \
    --cwd /build/RelWithDebInfo \
    --env CUDA_VISIBLE_DEVICES 2
  timeoutInMinutes: 30
  # ensure all tests are run
  condition: succeededOrFailed()
  displayName: 'Run orttraining_run_frontend_batch_size_test.py'
# Runs orttraining_run_glue.py under mpirun with 4 ranks and NCCL_DEBUG=INFO
# exported to them, launched through launch_test.py.
- script: |
    docker run \
    --gpus all \
    --shm-size=1024m \
    --rm \
    --volume $(Build.SourcesDirectory):/onnxruntime_src \
    --volume $(Build.BinariesDirectory):/build \
    --volume /bert_data:/bert_data \
    onnxruntime_e2e_test_image \
    /build/RelWithDebInfo/launch_test.py \
    --cmd_line_with_args "mpirun -n 4 -x NCCL_DEBUG=INFO python orttraining_run_glue.py" \
    --cwd /build/RelWithDebInfo
  timeoutInMinutes: 10
  # ensure all tests are run
  condition: succeededOrFailed()
  displayName: 'mpirun -n 4 -x NCCL_DEBUG=INFO python orttraining_run_glue.py'
# Runs the single test ORTGlueTest.test_bert_with_mrpc from
# orttraining_run_glue.py, with `--env CUDA_VISIBLE_DEVICES 2` passed to the launcher.
- script: |
    docker run \
    --gpus all \
    --shm-size=1024m \
    --rm \
    --volume $(Build.SourcesDirectory):/onnxruntime_src \
    --volume $(Build.BinariesDirectory):/build \
    --volume /bert_data:/bert_data \
    onnxruntime_e2e_test_image \
    /build/RelWithDebInfo/launch_test.py \
    --cmd_line_with_args "python orttraining_run_glue.py ORTGlueTest.test_bert_with_mrpc -v" \
    --cwd /build/RelWithDebInfo \
    --env CUDA_VISIBLE_DEVICES 2
  timeoutInMinutes: 10
  # ensure all tests are run
  condition: succeededOrFailed()
  displayName: 'Run orttraining_run_glue.py ORTGlueTest.test_bert_with_mrpc'
# Runs the single test ORTGlueTest.test_bert_fp16_with_mrpc from
# orttraining_run_glue.py, with `--env CUDA_VISIBLE_DEVICES 2` passed to the launcher.
- script: |
    docker run \
    --gpus all \
    --shm-size=1024m \
    --rm \
    --volume $(Build.SourcesDirectory):/onnxruntime_src \
    --volume $(Build.BinariesDirectory):/build \
    --volume /bert_data:/bert_data \
    onnxruntime_e2e_test_image \
    /build/RelWithDebInfo/launch_test.py \
    --cmd_line_with_args "python orttraining_run_glue.py ORTGlueTest.test_bert_fp16_with_mrpc -v" \
    --cwd /build/RelWithDebInfo \
    --env CUDA_VISIBLE_DEVICES 2
  timeoutInMinutes: 10
  # ensure all tests are run
  condition: succeededOrFailed()
  displayName: 'Run orttraining_run_glue.py ORTGlueTest.test_bert_fp16_with_mrpc'
# Runs the single test ORTGlueTest.test_roberta_with_mrpc from
# orttraining_run_glue.py, with `--env CUDA_VISIBLE_DEVICES 2` passed to the launcher.
- script: |
    docker run \
    --gpus all \
    --shm-size=1024m \
    --rm \
    --volume $(Build.SourcesDirectory):/onnxruntime_src \
    --volume $(Build.BinariesDirectory):/build \
    --volume /bert_data:/bert_data \
    onnxruntime_e2e_test_image \
    /build/RelWithDebInfo/launch_test.py \
    --cmd_line_with_args "python orttraining_run_glue.py ORTGlueTest.test_roberta_with_mrpc -v" \
    --cwd /build/RelWithDebInfo \
    --env CUDA_VISIBLE_DEVICES 2
  timeoutInMinutes: 10
  # ensure all tests are run
  condition: succeededOrFailed()
  displayName: 'Run orttraining_run_glue.py ORTGlueTest.test_roberta_with_mrpc'
# Runs the single test ORTGlueTest.test_roberta_fp16_with_mrpc from
# orttraining_run_glue.py, with `--env CUDA_VISIBLE_DEVICES 2` passed to the launcher.
- script: |
    docker run \
    --gpus all \
    --shm-size=1024m \
    --rm \
    --volume $(Build.SourcesDirectory):/onnxruntime_src \
    --volume $(Build.BinariesDirectory):/build \
    --volume /bert_data:/bert_data \
    onnxruntime_e2e_test_image \
    /build/RelWithDebInfo/launch_test.py \
    --cmd_line_with_args "python orttraining_run_glue.py ORTGlueTest.test_roberta_fp16_with_mrpc -v" \
    --cwd /build/RelWithDebInfo \
    --env CUDA_VISIBLE_DEVICES 2
  timeoutInMinutes: 10
  # ensure all tests are run
  condition: succeededOrFailed()
  displayName: 'Run orttraining_run_glue.py ORTGlueTest.test_roberta_fp16_with_mrpc'
# Runs the single test ORTMultipleChoiceTest.test_bert_fp16_with_swag from
# orttraining_run_multiple_choice.py, with `--env CUDA_VISIBLE_DEVICES 2`
# passed to the launcher.
- script: |
    docker run \
    --gpus all \
    --shm-size=1024m \
    --rm \
    --volume $(Build.SourcesDirectory):/onnxruntime_src \
    --volume $(Build.BinariesDirectory):/build \
    --volume /bert_data:/bert_data \
    onnxruntime_e2e_test_image \
    /build/RelWithDebInfo/launch_test.py \
    --cmd_line_with_args "python orttraining_run_multiple_choice.py ORTMultipleChoiceTest.test_bert_fp16_with_swag -v" \
    --cwd /build/RelWithDebInfo \
    --env CUDA_VISIBLE_DEVICES 2
  timeoutInMinutes: 30
  # ensure all tests are run
  condition: succeededOrFailed()
  displayName: 'Run orttraining_run_multiple_choice.py ORTMultipleChoiceTest.test_bert_fp16_with_swag'
# Runs onnxruntime_test_ort_trainer_with_mixed_precision.py through
# launch_test.py from the RelWithDebInfo build directory.
- script: |
    docker run \
    --gpus all \
    --shm-size=1024m \
    --rm \
    --volume $(Build.SourcesDirectory):/onnxruntime_src \
    --volume $(Build.BinariesDirectory):/build \
    --volume /bert_data:/bert_data \
    onnxruntime_e2e_test_image \
    /build/RelWithDebInfo/launch_test.py \
    --cmd_line_with_args "python onnxruntime_test_ort_trainer_with_mixed_precision.py -v" \
    --cwd /build/RelWithDebInfo
  timeoutInMinutes: 10
  # ensure all tests are run
  condition: succeededOrFailed()
  displayName: 'Run onnxruntime_test_ort_trainer_with_mixed_precision.py'
# Runs the single test BertModelTest.test_for_pretraining_mixed_precision from
# orttraining_test_transformers.py. Unlike most neighbouring steps, this
# docker run sets no --shm-size.
- script: |
    docker run \
    --gpus all \
    --rm \
    --volume $(Build.SourcesDirectory):/onnxruntime_src \
    --volume $(Build.BinariesDirectory):/build \
    --volume /bert_data:/bert_data \
    onnxruntime_e2e_test_image \
    /build/RelWithDebInfo/launch_test.py \
    --cmd_line_with_args "python orttraining_test_transformers.py BertModelTest.test_for_pretraining_mixed_precision -v" \
    --cwd /build/RelWithDebInfo
  timeoutInMinutes: 10
  # ensure all tests are run
  condition: succeededOrFailed()
  displayName: 'Run orttraining_test_transformers.py BertModelTest.test_for_pretraining_mixed_precision'
# Runs ORTBertPretrainTest.test_pretrain_convergence from
# orttraining_run_bert_pretrain.py under mpirun with 4 ranks and
# NCCL_DEBUG=INFO exported to them.
- script: |
    docker run \
    --gpus all \
    --shm-size=1024m \
    --rm \
    --volume $(Build.SourcesDirectory):/onnxruntime_src \
    --volume $(Build.BinariesDirectory):/build \
    --volume /bert_data:/bert_data \
    onnxruntime_e2e_test_image \
    /build/RelWithDebInfo/launch_test.py \
    --cmd_line_with_args "mpirun -n 4 -x NCCL_DEBUG=INFO python orttraining_run_bert_pretrain.py ORTBertPretrainTest.test_pretrain_convergence" \
    --cwd /build/RelWithDebInfo
  timeoutInMinutes: 30
  # ensure all tests are run
  condition: succeededOrFailed()
  displayName: 'Run orttraining_run_bert_pretrain.py ORTBertPretrainTest.test_pretrain_convergence'
# Shared closing steps: the component-governance detection template (given the
# 'succeeded' condition parameter), then the agent build-directory clean-up template.
- template: templates/component-governance-component-detection-steps.yml
  parameters:
    condition: 'succeeded'
- template: templates/clean-agent-build-directory-step.yml