onnxruntime/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-distributed-e2e-test-pipeline.yml
Changming Sun b854f2399d
Update manylinux build scripts and GPU CUDA version from 11.0 to 11.1 (#7632)
1. Update manylinux build scripts. This will add [PEP600](https://www.python.org/dev/peps/pep-0600/) (manylinux2 tags) support. numpy has adopted this new feature, so we should do the same. The old build script files were copied from https://github.com/pypa/manylinux, but they have been deleted and replaced in the upstream repo. The manylinux repo doesn't have a manylinux2014 branch anymore, so I'm removing the obsolete code and syncing the files with the latest master.
2. Update GPU CUDA version from 11.0 to 11.1(after a discussion with PMs). 
3. Delete tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda10_2.  (Merged the content to tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11)
4. Modernize the cmake code of how to locate python devel files. It was suggested in https://github.com/onnx/onnx/pull/1631 .
5. Remove `onnxruntime_MSVC_STATIC_RUNTIME` and `onnxruntime_GCC_STATIC_CPP_RUNTIME` build options. Now cmake has builtin support for it. Starting from cmake 3.15, we can use `CMAKE_MSVC_RUNTIME_LIBRARY` cmake variable to choose which MSVC runtime library we want to use. 
6. Update the Ubuntu docker images used in our CI build from Ubuntu 18.04 to Ubuntu 20.04.
7. Update GCC version in CUDA 11.1 pipelines from 8.x to 9.3.1
8. Split the Linux GPU CI pipeline into two jobs: build the code on a CPU machine, then run the tests on a separate GPU machine. In the past we didn't test our python packages; we only tested the pre-packed files, so we didn't catch the rpath issue in the CI build.
9. Add a CentOS machine pool and test our Linux GPU build on real CentOS machines. 
10. Rework the ARM64 Linux GPU python packaging pipeline. Previously it used cross-compiling, so we had to statically link to the C runtime. But now we have the pluggable EP API, which doesn't support static linking, so I changed it to use qemu emulation instead. Now the build is 10x slower than before, but it is more extensible.
2021-06-02 23:36:49 -07:00

321 lines
13 KiB
YAML

# Disable CI (push) triggers for this pipeline.
trigger: none

jobs:
- job: Orttraining_Linux_GPU_Distributed_E2E_Test
  # Upper bound for the whole job, in minutes.
  timeoutInMinutes: 180
  # Agent pool the job is scheduled on.
  pool: 'Linux-Multi-GPU-V100-E2E3'
  steps:
  # Fresh checkout of this repository, including all nested submodules.
  - checkout: self
    clean: true
    submodules: recursive
# Build step, delegated to the shared docker-build template.
# NOTE(review): DisplayName is assumed to be a template parameter (a template
# step only accepts `template` and `parameters`) — confirm against the template.
- template: templates/run-docker-build-steps.yml
  parameters:
    DisplayName: 'Build'
    # Arguments forwarded to the docker build wrapper; `-t` names the image
    # that the later `docker run` steps reference.
    RunDockerBuildArgs: |
      -o ubuntu20.04 -d gpu -r $(Build.BinariesDirectory) \
      -t onnxruntime_e2e_test_image \
      -x " \
      --config RelWithDebInfo \
      --enable_training \
      --update --build \
      --build_wheel \
      " \
      -m
# update these if the E2E test data changes
# Downloads the pinned snapshot of the training data archive into
# $(Build.BinariesDirectory)/training_e2e_test_data and passes its expected
# SHA-256 digest to the download script.
- script: |
    orttraining/tools/ci_test/download_azure_blob_archive.py \
    --azure_blob_url 'https://onnxruntimetestdata.blob.core.windows.net/training/onnxruntime_training_data.zip?snapshot=2020-06-15T23:17:35.8314853Z' \
    --target_dir $(Build.BinariesDirectory)/training_e2e_test_data \
    --archive_sha256_digest B01C169B6550D1A0A6F1B4E2F34AE2A8714B52DBB70AC04DA85D371F691BDFF9
  # The URL is single-quoted because it contains `?`, which the shell would
  # otherwise treat as a filename-expansion (glob) character.
  displayName: 'Download onnxruntime_training_data.zip data'
# Mounts the bert-data file share at /bert_data on the agent.
- displayName: 'Mount bert-data'
  # Runs even when an earlier step failed.
  condition: succeededOrFailed()
  # Folded scalar: the lines below are joined with single spaces into one
  # command line, with no trailing newline.
  bash: >-
    tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh
    -p $(orttrainingtestdata-storage-key)
    -s "//orttrainingtestdata.file.core.windows.net/bert-data"
    -d "/bert_data"
# Runs orttraining_distributed_tests.py through launch_test.py inside the
# onnxruntime_e2e_test_image container, with all GPUs exposed.
# The final script line must not end with a `\`: a dangling line continuation
# would splice any text that follows into the docker command.
- script: |
    docker run \
    --gpus all \
    --shm-size=1024m \
    --rm \
    --volume $(Build.SourcesDirectory):/onnxruntime_src \
    --volume $(Build.BinariesDirectory):/build \
    onnxruntime_e2e_test_image \
    /build/RelWithDebInfo/launch_test.py \
    --cmd_line_with_args "python orttraining_distributed_tests.py" \
    --cwd /build/RelWithDebInfo
  displayName: 'Run orttraining_distributed_tests.py'
  # Runs even when an earlier step failed.
  condition: succeededOrFailed()
  timeoutInMinutes: 120
# Runs onnxruntime_training_bert under mpirun with 4 processes and
# --pipeline_parallel_size 4. Host /bert_data is bind-mounted into the
# container (presumably the share mounted by the 'Mount bert-data' step).
- displayName: 'mpirun onnxruntime_training_bert --pipeline_parallel_size 4'
  # ensure all tests are run
  condition: succeededOrFailed()
  timeoutInMinutes: 10
  script: |
    docker run \
    --gpus all \
    --shm-size=1024m \
    --rm \
    --volume $(Build.BinariesDirectory):/build \
    --volume /bert_data:/bert_data \
    onnxruntime_e2e_test_image \
    mpirun -n 4 \
    /build/RelWithDebInfo/onnxruntime_training_bert \
    --ort_log_severity 1 \
    --optimizer=Lamb \
    --learning_rate=3e-3 \
    --max_seq_length=128 \
    --max_predictions_per_seq=20 \
    --warmup_ratio=0.2843 \
    --warmup_mode=Poly \
    --model_name /bert_data/bert_models/nv/bert-large/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12 \
    --train_data_dir /bert_data/128/books_wiki_en_corpus/train \
    --test_data_dir /bert_data/128/books_wiki_en_corpus/test \
    --display_loss_steps 1 \
    --use_nccl \
    --use_mixed_precision \
    --allreduce_in_fp16 \
    --gradient_accumulation_steps 48 \
    --num_train_steps 96 \
    --train_batch_size 40 \
    --pipeline_parallel_size 4 \
    --cut_group_info 1149:407-1219/1341/1463/1585/1707/1829,1881:407-1951/2073/2195/2317/2439/2561,2613:407-2683/2805/2927/3049/3171/3293
# Runs onnxruntime_training_bert under mpirun with 4 processes, split as
# --data_parallel_size 2 and --pipeline_parallel_size 2.
- displayName: 'mpirun onnxruntime_training_bert --data_parallel_size 2 --pipeline_parallel_size 2'
  # ensure all tests are run
  condition: succeededOrFailed()
  timeoutInMinutes: 10
  script: |
    docker run \
    --gpus all \
    --shm-size=1024m \
    --rm \
    --volume $(Build.BinariesDirectory):/build \
    --volume /bert_data:/bert_data \
    onnxruntime_e2e_test_image \
    mpirun -n 4 \
    /build/RelWithDebInfo/onnxruntime_training_bert \
    --ort_log_severity 1 \
    --optimizer=Lamb \
    --learning_rate=3e-3 \
    --max_seq_length=128 \
    --max_predictions_per_seq=20 \
    --warmup_ratio=0.2843 \
    --warmup_mode=Poly \
    --model_name /bert_data/bert_models/nv/bert-large/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12 \
    --train_data_dir /bert_data/128/books_wiki_en_corpus/train \
    --test_data_dir /bert_data/128/books_wiki_en_corpus/test \
    --display_loss_steps 1 \
    --use_nccl \
    --use_mixed_precision \
    --allreduce_in_fp16 \
    --gradient_accumulation_steps 48 \
    --num_train_steps 96 \
    --train_batch_size 40 \
    --data_parallel_size 2 \
    --pipeline_parallel_size 2 \
    --cut_group_info 1881:407-1951/2073/2195/2317/2439/2561/2683/2805/2927/3049/3171/3293
# Runs run_batch_size_test.py from the mounted source tree against the
# RelWithDebInfo binaries; the downloaded test data is mounted read-only.
- displayName: 'Run batch size test'
  # ensure all tests are run
  condition: succeededOrFailed()
  script: |
    docker run \
    --gpus all \
    --rm \
    --volume $(Build.SourcesDirectory):/onnxruntime_src \
    --volume $(Build.BinariesDirectory):/build \
    --volume $(Build.BinariesDirectory)/training_e2e_test_data:/training_e2e_test_data:ro \
    onnxruntime_e2e_test_image \
    /onnxruntime_src/orttraining/tools/ci_test/run_batch_size_test.py \
    --binary_dir /build/RelWithDebInfo \
    --model_root /training_e2e_test_data/models
# Runs run_convergence_test.py from the mounted source tree, pointing it at
# the models and training data under the read-only test-data mount.
- displayName: 'Run convergence test'
  # ensure all tests are run
  condition: succeededOrFailed()
  script: |
    docker run \
    --gpus all \
    --rm \
    --volume $(Build.SourcesDirectory):/onnxruntime_src \
    --volume $(Build.BinariesDirectory):/build \
    --volume $(Build.BinariesDirectory)/training_e2e_test_data:/training_e2e_test_data:ro \
    onnxruntime_e2e_test_image \
    /onnxruntime_src/orttraining/tools/ci_test/run_convergence_test.py \
    --binary_dir /build/RelWithDebInfo \
    --model_root /training_e2e_test_data/models \
    --training_data_root /training_e2e_test_data/data
# migrated from frontend e2e pipeline
# Runs orttraining_run_frontend_batch_size_test.py via launch_test.py, with
# CUDA_VISIBLE_DEVICES set to 2 through the launcher's --env option.
- displayName: 'Run orttraining_run_frontend_batch_size_test.py'
  # ensure all tests are run
  condition: succeededOrFailed()
  timeoutInMinutes: 30
  script: |
    docker run \
    --gpus all \
    --shm-size=1024m \
    --rm \
    --volume $(Build.SourcesDirectory):/onnxruntime_src \
    --volume $(Build.BinariesDirectory):/build \
    --volume /bert_data:/bert_data \
    onnxruntime_e2e_test_image \
    /build/RelWithDebInfo/launch_test.py \
    --cmd_line_with_args "python orttraining_run_frontend_batch_size_test.py -v" \
    --cwd /build/RelWithDebInfo \
    --env CUDA_VISIBLE_DEVICES 2
# Runs the whole orttraining_run_glue.py suite under mpirun with 4 processes;
# NCCL_DEBUG=INFO is exported to every rank via `-x`.
- displayName: 'mpirun -n 4 -x NCCL_DEBUG=INFO python orttraining_run_glue.py'
  # ensure all tests are run
  condition: succeededOrFailed()
  timeoutInMinutes: 10
  script: |
    docker run \
    --gpus all \
    --shm-size=1024m \
    --rm \
    --volume $(Build.SourcesDirectory):/onnxruntime_src \
    --volume $(Build.BinariesDirectory):/build \
    --volume /bert_data:/bert_data \
    onnxruntime_e2e_test_image \
    /build/RelWithDebInfo/launch_test.py \
    --cmd_line_with_args "mpirun -n 4 -x NCCL_DEBUG=INFO python orttraining_run_glue.py" \
    --cwd /build/RelWithDebInfo
# Runs the single test ORTGlueTest.test_bert_with_mrpc, with
# CUDA_VISIBLE_DEVICES set to 2 through the launcher's --env option.
- displayName: 'Run orttraining_run_glue.py ORTGlueTest.test_bert_with_mrpc'
  # ensure all tests are run
  condition: succeededOrFailed()
  timeoutInMinutes: 10
  script: |
    docker run \
    --gpus all \
    --shm-size=1024m \
    --rm \
    --volume $(Build.SourcesDirectory):/onnxruntime_src \
    --volume $(Build.BinariesDirectory):/build \
    --volume /bert_data:/bert_data \
    onnxruntime_e2e_test_image \
    /build/RelWithDebInfo/launch_test.py \
    --cmd_line_with_args "python orttraining_run_glue.py ORTGlueTest.test_bert_with_mrpc -v" \
    --cwd /build/RelWithDebInfo \
    --env CUDA_VISIBLE_DEVICES 2
# Runs the single test ORTGlueTest.test_bert_fp16_with_mrpc, with
# CUDA_VISIBLE_DEVICES set to 2 through the launcher's --env option.
- displayName: 'Run orttraining_run_glue.py ORTGlueTest.test_bert_fp16_with_mrpc'
  # ensure all tests are run
  condition: succeededOrFailed()
  timeoutInMinutes: 10
  script: |
    docker run \
    --gpus all \
    --shm-size=1024m \
    --rm \
    --volume $(Build.SourcesDirectory):/onnxruntime_src \
    --volume $(Build.BinariesDirectory):/build \
    --volume /bert_data:/bert_data \
    onnxruntime_e2e_test_image \
    /build/RelWithDebInfo/launch_test.py \
    --cmd_line_with_args "python orttraining_run_glue.py ORTGlueTest.test_bert_fp16_with_mrpc -v" \
    --cwd /build/RelWithDebInfo \
    --env CUDA_VISIBLE_DEVICES 2
# Runs the single test ORTGlueTest.test_roberta_with_mrpc, with
# CUDA_VISIBLE_DEVICES set to 2 through the launcher's --env option.
- displayName: 'Run orttraining_run_glue.py ORTGlueTest.test_roberta_with_mrpc'
  # ensure all tests are run
  condition: succeededOrFailed()
  timeoutInMinutes: 10
  script: |
    docker run \
    --gpus all \
    --shm-size=1024m \
    --rm \
    --volume $(Build.SourcesDirectory):/onnxruntime_src \
    --volume $(Build.BinariesDirectory):/build \
    --volume /bert_data:/bert_data \
    onnxruntime_e2e_test_image \
    /build/RelWithDebInfo/launch_test.py \
    --cmd_line_with_args "python orttraining_run_glue.py ORTGlueTest.test_roberta_with_mrpc -v" \
    --cwd /build/RelWithDebInfo \
    --env CUDA_VISIBLE_DEVICES 2
# Runs the single test ORTGlueTest.test_roberta_fp16_with_mrpc, with
# CUDA_VISIBLE_DEVICES set to 2 through the launcher's --env option.
- displayName: 'Run orttraining_run_glue.py ORTGlueTest.test_roberta_fp16_with_mrpc'
  # ensure all tests are run
  condition: succeededOrFailed()
  timeoutInMinutes: 10
  script: |
    docker run \
    --gpus all \
    --shm-size=1024m \
    --rm \
    --volume $(Build.SourcesDirectory):/onnxruntime_src \
    --volume $(Build.BinariesDirectory):/build \
    --volume /bert_data:/bert_data \
    onnxruntime_e2e_test_image \
    /build/RelWithDebInfo/launch_test.py \
    --cmd_line_with_args "python orttraining_run_glue.py ORTGlueTest.test_roberta_fp16_with_mrpc -v" \
    --cwd /build/RelWithDebInfo \
    --env CUDA_VISIBLE_DEVICES 2
# Runs the single test ORTMultipleChoiceTest.test_bert_fp16_with_swag, with
# CUDA_VISIBLE_DEVICES set to 2 through the launcher's --env option.
- displayName: 'Run orttraining_run_multiple_choice.py ORTMultipleChoiceTest.test_bert_fp16_with_swag'
  # ensure all tests are run
  condition: succeededOrFailed()
  timeoutInMinutes: 30
  script: |
    docker run \
    --gpus all \
    --shm-size=1024m \
    --rm \
    --volume $(Build.SourcesDirectory):/onnxruntime_src \
    --volume $(Build.BinariesDirectory):/build \
    --volume /bert_data:/bert_data \
    onnxruntime_e2e_test_image \
    /build/RelWithDebInfo/launch_test.py \
    --cmd_line_with_args "python orttraining_run_multiple_choice.py ORTMultipleChoiceTest.test_bert_fp16_with_swag -v" \
    --cwd /build/RelWithDebInfo \
    --env CUDA_VISIBLE_DEVICES 2
# Runs onnxruntime_test_ort_trainer_with_mixed_precision.py via launch_test.py
# from /build/RelWithDebInfo inside the test image.
- displayName: 'Run onnxruntime_test_ort_trainer_with_mixed_precision.py'
  # ensure all tests are run
  condition: succeededOrFailed()
  timeoutInMinutes: 10
  script: |
    docker run \
    --gpus all \
    --shm-size=1024m \
    --rm \
    --volume $(Build.SourcesDirectory):/onnxruntime_src \
    --volume $(Build.BinariesDirectory):/build \
    --volume /bert_data:/bert_data \
    onnxruntime_e2e_test_image \
    /build/RelWithDebInfo/launch_test.py \
    --cmd_line_with_args "python onnxruntime_test_ort_trainer_with_mixed_precision.py -v" \
    --cwd /build/RelWithDebInfo
# Runs the single test BertModelTest.test_for_pretraining_mixed_precision.
# NOTE(review): unlike the neighbouring launch_test.py steps, this one does
# not pass --shm-size — confirm that is intentional.
- displayName: 'Run orttraining_test_transformers.py BertModelTest.test_for_pretraining_mixed_precision'
  # ensure all tests are run
  condition: succeededOrFailed()
  timeoutInMinutes: 10
  script: |
    docker run \
    --gpus all \
    --rm \
    --volume $(Build.SourcesDirectory):/onnxruntime_src \
    --volume $(Build.BinariesDirectory):/build \
    --volume /bert_data:/bert_data \
    onnxruntime_e2e_test_image \
    /build/RelWithDebInfo/launch_test.py \
    --cmd_line_with_args "python orttraining_test_transformers.py BertModelTest.test_for_pretraining_mixed_precision -v" \
    --cwd /build/RelWithDebInfo
# Runs ORTBertPretrainTest.test_pretrain_convergence under mpirun with 4
# processes; NCCL_DEBUG=INFO is exported to every rank via `-x`.
- displayName: 'Run orttraining_run_bert_pretrain.py ORTBertPretrainTest.test_pretrain_convergence'
  # ensure all tests are run
  condition: succeededOrFailed()
  timeoutInMinutes: 30
  script: |
    docker run \
    --gpus all \
    --shm-size=1024m \
    --rm \
    --volume $(Build.SourcesDirectory):/onnxruntime_src \
    --volume $(Build.BinariesDirectory):/build \
    --volume /bert_data:/bert_data \
    onnxruntime_e2e_test_image \
    /build/RelWithDebInfo/launch_test.py \
    --cmd_line_with_args "mpirun -n 4 -x NCCL_DEBUG=INFO python orttraining_run_bert_pretrain.py ORTBertPretrainTest.test_pretrain_convergence" \
    --cwd /build/RelWithDebInfo
# Shared template steps that close out the job. Their bodies are not visible
# here; going by the file names they perform component-governance detection
# and clean the agent build directory — confirm against the templates.
- template: templates/component-governance-component-detection-steps.yml
  parameters:
    # Passed to the template as the string 'succeeded'.
    condition: 'succeeded'

- template: templates/clean-agent-build-directory-step.yml