# Azure Pipelines definition for the ORT training distributed end-to-end tests.
#
# A single job builds ONNX Runtime with training enabled inside a Docker image
# tagged `onnxruntime_e2e_test_image`, downloads/mounts the test data, and then
# runs each test as its own `docker run` step against that image.
#
# Most test steps use `condition: succeededOrFailed()` so that a failure in one
# test does not prevent the remaining tests from running.

# No CI trigger is configured in this file.
trigger: none

jobs:
- job: Orttraining_Linux_GPU_Distributed_E2E_Test

  timeoutInMinutes: 180
  pool: 'Onnxruntime-Linux-GPU-NC24sv3'

  steps:
  - checkout: self
    clean: true
    submodules: recursive

  # Build step. The argument string is handed to the shared template as-is;
  # the image tag given here is the one every later `docker run` step uses.
  - template: templates/run-docker-build-steps.yml
    parameters:
      RunDockerBuildArgs: |
        -o ubuntu20.04 -d gpu \
        -t onnxruntime_e2e_test_image \
        -x " \
          --config RelWithDebInfo \
          --enable_training \
          --update --build \
          --build_wheel --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=70 \
          " \
        -m
      DisplayName: 'Build'

  # update these if the E2E test data changes
  # The URL is quoted because it contains `?`, which the shell would otherwise
  # treat as a glob character.
  - script: |
      orttraining/tools/ci_test/download_azure_blob_archive.py \
        --azure_blob_url "https://onnxruntimetestdata.blob.core.windows.net/training/onnxruntime_training_data.zip?snapshot=2020-06-15T23:17:35.8314853Z" \
        --target_dir $(Build.BinariesDirectory)/training_e2e_test_data \
        --archive_sha256_digest B01C169B6550D1A0A6F1B4E2F34AE2A8714B52DBB70AC04DA85D371F691BDFF9
    displayName: 'Download onnxruntime_training_data.zip data'

  # Makes the bert-data file share available at /bert_data on the agent; the
  # test steps below bind-mount that path into the container.
  - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/bert-data" -d "/bert_data"
    displayName: 'Mount bert-data'
    condition: succeededOrFailed()

  - script: |
      docker run \
        --gpus all \
        --shm-size=1024m \
        --rm \
        --volume $(Build.SourcesDirectory):/onnxruntime_src \
        --volume $(Build.BinariesDirectory):/build \
        onnxruntime_e2e_test_image \
          /build/RelWithDebInfo/launch_test.py \
            --cmd_line_with_args "python orttraining_distributed_tests.py" \
            --cwd /build/RelWithDebInfo
    displayName: 'Run orttraining_distributed_tests.py'
    condition: succeededOrFailed()
    timeoutInMinutes: 120

  # BERT training across 4 MPI ranks with pipeline_parallel_size 4.
  - script: |
      docker run \
        --gpus all \
        --shm-size=1024m \
        --rm \
        --volume $(Build.BinariesDirectory):/build \
        --volume /bert_data:/bert_data \
        onnxruntime_e2e_test_image \
          mpirun -n 4 \
            /build/RelWithDebInfo/onnxruntime_training_bert \
              --ort_log_severity 1 \
              --optimizer=Lamb \
              --learning_rate=3e-3 \
              --max_seq_length=128 \
              --max_predictions_per_seq=20 \
              --warmup_ratio=0.2843 \
              --warmup_mode=Poly \
              --model_name /bert_data/bert_models/nv/bert-large/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12 \
              --train_data_dir /bert_data/128/books_wiki_en_corpus/train \
              --test_data_dir /bert_data/128/books_wiki_en_corpus/test \
              --display_loss_steps 1 \
              --use_nccl \
              --use_mixed_precision \
              --allreduce_in_fp16 \
              --gradient_accumulation_steps 48 \
              --num_train_steps 96 \
              --train_batch_size 40 \
              --pipeline_parallel_size 4 \
              --cut_group_info 1149:407-1219/1341/1463/1585/1707/1829,1881:407-1951/2073/2195/2317/2439/2561,2613:407-2683/2805/2927/3049/3171/3293
    displayName: 'mpirun onnxruntime_training_bert --pipeline_parallel_size 4'
    condition: succeededOrFailed() # ensure all tests are run
    timeoutInMinutes: 10

  # Same BERT run, split as data_parallel_size 2 x pipeline_parallel_size 2.
  - script: |
      docker run \
        --gpus all \
        --shm-size=1024m \
        --rm \
        --volume $(Build.BinariesDirectory):/build \
        --volume /bert_data:/bert_data \
        onnxruntime_e2e_test_image \
          mpirun -n 4 \
            /build/RelWithDebInfo/onnxruntime_training_bert \
              --ort_log_severity 1 \
              --optimizer=Lamb \
              --learning_rate=3e-3 \
              --max_seq_length=128 \
              --max_predictions_per_seq=20 \
              --warmup_ratio=0.2843 \
              --warmup_mode=Poly \
              --model_name /bert_data/bert_models/nv/bert-large/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12 \
              --train_data_dir /bert_data/128/books_wiki_en_corpus/train \
              --test_data_dir /bert_data/128/books_wiki_en_corpus/test \
              --display_loss_steps 1 \
              --use_nccl \
              --use_mixed_precision \
              --allreduce_in_fp16 \
              --gradient_accumulation_steps 48 \
              --num_train_steps 96 \
              --train_batch_size 40 \
              --data_parallel_size 2 \
              --pipeline_parallel_size 2 \
              --cut_group_info 1881:407-1951/2073/2195/2317/2439/2561/2683/2805/2927/3049/3171/3293
    displayName: 'mpirun onnxruntime_training_bert --data_parallel_size 2 --pipeline_parallel_size 2'
    condition: succeededOrFailed() # ensure all tests are run
    timeoutInMinutes: 10

  # The next two steps read the downloaded archive contents, mounted read-only.
  - script: |
      docker run \
        --gpus all \
        --rm \
        --volume $(Build.SourcesDirectory):/onnxruntime_src \
        --volume $(Build.BinariesDirectory):/build \
        --volume $(Build.BinariesDirectory)/training_e2e_test_data:/training_e2e_test_data:ro \
        onnxruntime_e2e_test_image \
          /onnxruntime_src/orttraining/tools/ci_test/run_batch_size_test.py \
            --binary_dir /build/RelWithDebInfo \
            --model_root /training_e2e_test_data/models
    displayName: 'Run batch size test'
    condition: succeededOrFailed() # ensure all tests are run

  - script: |
      docker run \
        --gpus all \
        --rm \
        --volume $(Build.SourcesDirectory):/onnxruntime_src \
        --volume $(Build.BinariesDirectory):/build \
        --volume $(Build.BinariesDirectory)/training_e2e_test_data:/training_e2e_test_data:ro \
        onnxruntime_e2e_test_image \
          /onnxruntime_src/orttraining/tools/ci_test/run_convergence_test.py \
            --binary_dir /build/RelWithDebInfo \
            --model_root /training_e2e_test_data/models \
            --training_data_root /training_e2e_test_data/data
    displayName: 'Run convergence test'
    condition: succeededOrFailed() # ensure all tests are run

  # migrated from frontend e2e pipeline
  - script: |
      docker run \
        --gpus all \
        --shm-size=1024m \
        --rm \
        --volume $(Build.SourcesDirectory):/onnxruntime_src \
        --volume $(Build.BinariesDirectory):/build \
        --volume /bert_data:/bert_data \
        onnxruntime_e2e_test_image \
          /build/RelWithDebInfo/launch_test.py \
            --cmd_line_with_args "python orttraining_run_frontend_batch_size_test.py -v" \
            --cwd /build/RelWithDebInfo \
            --env CUDA_VISIBLE_DEVICES 2
    displayName: 'Run orttraining_run_frontend_batch_size_test.py'
    condition: succeededOrFailed() # ensure all tests are run
    timeoutInMinutes: 30

  - script: |
      docker run \
        --gpus all \
        --shm-size=1024m \
        --rm \
        --volume $(Build.SourcesDirectory):/onnxruntime_src \
        --volume $(Build.BinariesDirectory):/build \
        --volume /bert_data:/bert_data \
        onnxruntime_e2e_test_image \
          /build/RelWithDebInfo/launch_test.py \
            --cmd_line_with_args "mpirun -n 4 -x NCCL_DEBUG=INFO python orttraining_run_glue.py" \
            --cwd /build/RelWithDebInfo
    displayName: 'mpirun -n 4 -x NCCL_DEBUG=INFO python orttraining_run_glue.py'
    condition: succeededOrFailed() # ensure all tests are run
    timeoutInMinutes: 10

  - script: |
      docker run \
        --gpus all \
        --shm-size=1024m \
        --rm \
        --volume $(Build.SourcesDirectory):/onnxruntime_src \
        --volume $(Build.BinariesDirectory):/build \
        --volume /bert_data:/bert_data \
        onnxruntime_e2e_test_image \
          /build/RelWithDebInfo/launch_test.py \
            --cmd_line_with_args "python orttraining_run_glue.py ORTGlueTest.test_bert_with_mrpc -v" \
            --cwd /build/RelWithDebInfo \
            --env CUDA_VISIBLE_DEVICES 2
    displayName: 'Run orttraining_run_glue.py ORTGlueTest.test_bert_with_mrpc'
    condition: succeededOrFailed() # ensure all tests are run
    timeoutInMinutes: 10

  - script: |
      docker run \
        --gpus all \
        --shm-size=1024m \
        --rm \
        --volume $(Build.SourcesDirectory):/onnxruntime_src \
        --volume $(Build.BinariesDirectory):/build \
        --volume /bert_data:/bert_data \
        onnxruntime_e2e_test_image \
          /build/RelWithDebInfo/launch_test.py \
            --cmd_line_with_args "python orttraining_run_glue.py ORTGlueTest.test_bert_fp16_with_mrpc -v" \
            --cwd /build/RelWithDebInfo \
            --env CUDA_VISIBLE_DEVICES 2
    displayName: 'Run orttraining_run_glue.py ORTGlueTest.test_bert_fp16_with_mrpc'
    condition: succeededOrFailed() # ensure all tests are run
    timeoutInMinutes: 10

  - script: |
      docker run \
        --gpus all \
        --shm-size=1024m \
        --rm \
        --volume $(Build.SourcesDirectory):/onnxruntime_src \
        --volume $(Build.BinariesDirectory):/build \
        --volume /bert_data:/bert_data \
        onnxruntime_e2e_test_image \
          /build/RelWithDebInfo/launch_test.py \
            --cmd_line_with_args "python orttraining_run_glue.py ORTGlueTest.test_roberta_with_mrpc -v" \
            --cwd /build/RelWithDebInfo \
            --env CUDA_VISIBLE_DEVICES 2
    displayName: 'Run orttraining_run_glue.py ORTGlueTest.test_roberta_with_mrpc'
    condition: succeededOrFailed() # ensure all tests are run
    timeoutInMinutes: 10

  - script: |
      docker run \
        --gpus all \
        --shm-size=1024m \
        --rm \
        --volume $(Build.SourcesDirectory):/onnxruntime_src \
        --volume $(Build.BinariesDirectory):/build \
        --volume /bert_data:/bert_data \
        onnxruntime_e2e_test_image \
          /build/RelWithDebInfo/launch_test.py \
            --cmd_line_with_args "python orttraining_run_glue.py ORTGlueTest.test_roberta_fp16_with_mrpc -v" \
            --cwd /build/RelWithDebInfo \
            --env CUDA_VISIBLE_DEVICES 2
    displayName: 'Run orttraining_run_glue.py ORTGlueTest.test_roberta_fp16_with_mrpc'
    condition: succeededOrFailed() # ensure all tests are run
    timeoutInMinutes: 10

  - script: |
      docker run \
        --gpus all \
        --shm-size=1024m \
        --rm \
        --volume $(Build.SourcesDirectory):/onnxruntime_src \
        --volume $(Build.BinariesDirectory):/build \
        --volume /bert_data:/bert_data \
        onnxruntime_e2e_test_image \
          /build/RelWithDebInfo/launch_test.py \
            --cmd_line_with_args "python orttraining_run_multiple_choice.py ORTMultipleChoiceTest.test_bert_fp16_with_swag -v" \
            --cwd /build/RelWithDebInfo \
            --env CUDA_VISIBLE_DEVICES 2
    displayName: 'Run orttraining_run_multiple_choice.py ORTMultipleChoiceTest.test_bert_fp16_with_swag'
    condition: succeededOrFailed() # ensure all tests are run
    timeoutInMinutes: 30

  - script: |
      docker run \
        --gpus all \
        --shm-size=1024m \
        --rm \
        --volume $(Build.SourcesDirectory):/onnxruntime_src \
        --volume $(Build.BinariesDirectory):/build \
        --volume /bert_data:/bert_data \
        onnxruntime_e2e_test_image \
          /build/RelWithDebInfo/launch_test.py \
            --cmd_line_with_args "python onnxruntime_test_ort_trainer_with_mixed_precision.py -v" \
            --cwd /build/RelWithDebInfo
    displayName: 'Run onnxruntime_test_ort_trainer_with_mixed_precision.py'
    condition: succeededOrFailed() # ensure all tests are run
    timeoutInMinutes: 10

  - script: |
      docker run \
        --gpus all \
        --rm \
        --volume $(Build.SourcesDirectory):/onnxruntime_src \
        --volume $(Build.BinariesDirectory):/build \
        --volume /bert_data:/bert_data \
        onnxruntime_e2e_test_image \
          /build/RelWithDebInfo/launch_test.py \
            --cmd_line_with_args "python orttraining_test_transformers.py BertModelTest.test_for_pretraining_mixed_precision -v" \
            --cwd /build/RelWithDebInfo
    displayName: 'Run orttraining_test_transformers.py BertModelTest.test_for_pretraining_mixed_precision'
    condition: succeededOrFailed() # ensure all tests are run
    timeoutInMinutes: 10

  - script: |
      docker run \
        --gpus all \
        --shm-size=1024m \
        --rm \
        --volume $(Build.SourcesDirectory):/onnxruntime_src \
        --volume $(Build.BinariesDirectory):/build \
        --volume /bert_data:/bert_data \
        onnxruntime_e2e_test_image \
          /build/RelWithDebInfo/launch_test.py \
            --cmd_line_with_args "mpirun -n 4 -x NCCL_DEBUG=INFO python orttraining_run_bert_pretrain.py ORTBertPretrainTest.test_pretrain_convergence" \
            --cwd /build/RelWithDebInfo
    displayName: 'Run orttraining_run_bert_pretrain.py ORTBertPretrainTest.test_pretrain_convergence'
    condition: succeededOrFailed() # ensure all tests are run
    timeoutInMinutes: 30

  - template: templates/component-governance-component-detection-steps.yml
    parameters:
      condition: 'succeeded'

  - template: templates/clean-agent-build-directory-step.yml