From 6ed12402a471796cf190db230e1cbbfd970efdbf Mon Sep 17 00:00:00 2001 From: liqunfu Date: Mon, 25 Jan 2021 15:15:26 -0800 Subject: [PATCH] Liqun/liqun/enable pipeline parallel test2 (#6399) * enable data and pipeline parallism test Co-authored-by: liqun --- .../python/run_training_pipeline_e2e_tests.py | 78 ------------------- tools/ci_build/build.py | 65 ---------------- ...ng-linux-gpu-e2e-test-nightly-pipeline.yml | 71 ++++++++++++----- 3 files changed, 50 insertions(+), 164 deletions(-) delete mode 100755 orttraining/orttraining/test/python/run_training_pipeline_e2e_tests.py diff --git a/orttraining/orttraining/test/python/run_training_pipeline_e2e_tests.py b/orttraining/orttraining/test/python/run_training_pipeline_e2e_tests.py deleted file mode 100755 index fd7e61a366..0000000000 --- a/orttraining/orttraining/test/python/run_training_pipeline_e2e_tests.py +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. - -import sys -import argparse - -from _test_commons import run_subprocess - -import logging - -logging.basicConfig( - format="%(asctime)s %(name)s [%(levelname)s] - %(message)s", - level=logging.DEBUG) -log = logging.getLogger("Build") - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument("--cwd", help="cwd") - return parser.parse_args() - -def main(): - import torch - ngpus = torch.cuda.device_count() - - # TODO: currently the CI machine only has 4 GPUs for parallel tests. - # Fill in more pipeline partition options when the machine has different GPUs counts. - if ngpus != 4: - return 0 - - log.info("Running pipeline e2e tests.") - - args = parse_arguments() - cwd = args.cwd - - command = ['./onnxruntime_training_bert', - '--ort_log_severity', '1', - '--optimizer=Lamb', - '--learning_rate=3e-3', - '--max_seq_length=128', - '--max_predictions_per_seq=20', - '--warmup_ratio=0.2843', - '--warmup_mode=Poly', - '--model_name', '/bert_ort/bert_models/nv/bert-large/' + - 'bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12', - '--train_data_dir', '/bert_data/128/books_wiki_en_corpus/train', - '--test_data_dir', '/bert_data/128/books_wiki_en_corpus/test', - '--display_loss_steps', '1', - '--use_nccl', - '--use_mixed_precision', - '--allreduce_in_fp16', - '--gradient_accumulation_steps', '48', - '--num_train_steps', '96', - '--train_batch_size', '50'] - - # Test 4-way pipeline parallel - pp_command = ['mpirun', '-n', str(ngpus)] + command + ['--pipeline_parallel_size', '4', '--cut_group_info', - '1149:407-1219/1341/1463/1585/1707/1829,' + - '1881:407-1951/2073/2195/2317/2439/2561,' + - '2613:407-2683/2805/2927/3049/3171/3293'] - command_str = ', '.join(pp_command) - log.debug('RUN: ' + command_str) - run_subprocess(pp_command, cwd=cwd, log=log) - - # Test 2-way data parallel + 2-way pipeline parallel - pp_dp_command = ['mpirun', '-n', str(ngpus)] - pp_dp_command = pp_dp_command + command - pp_dp_command = pp_dp_command + ['--data_parallel_size', '2', '--pipeline_parallel_size', - '2', '--cut_group_info', - '1881:407-1951/2073/2195/2317/2439/2561/2683/2805/2927/3049/3171/3293'] - command_str = ', '.join(pp_dp_command) - log.debug('RUN: ' + command_str) - run_subprocess(pp_dp_command, cwd=cwd, log=log) - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index a4fbb8f016..0e7efeaf05 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1309,64 +1309,6 @@ def run_training_python_frontend_e2e_tests(cwd): sys.executable, 'orttraining_test_transformers.py', 'BertModelTest.test_for_pretraining_mixed_precision'], cwd=cwd) - # this test is not stable. it occasionally causes segfault due to its session creation/release pattern. - # need to skip to unblock release - # run_subprocess([ - # sys.executable, 'orttraining_test_transformers.py', - # 'BertModelTest.test_for_pretraining_mixed_precision_with_gradient_accumulation'], cwd=cwd) - - -def run_training_pipeline_e2e_tests(cwd): - # pipeline tests are to be added here: - log.info("Running pipeline e2e tests.") - - import torch - ngpus = torch.cuda.device_count() - - command = ['./onnxruntime_training_bert', - '--ort_log_severity', '1', - '--optimizer=Lamb', - '--learning_rate=3e-3', - '--max_seq_length=128', - '--max_predictions_per_seq=20', - '--warmup_ratio=0.2843', - '--warmup_mode=Poly', - '--model_name', '/bert_ort/bert_models/nv/bert-large/' + - 'bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12', - '--train_data_dir', '/bert_data/128/books_wiki_en_corpus/train', - '--test_data_dir', '/bert_data/128/books_wiki_en_corpus/test', - '--display_loss_steps', '1', - '--use_nccl', - '--use_mixed_precision', - '--allreduce_in_fp16', - '--gradient_accumulation_steps', '48', - '--num_train_steps', '96', - '--train_batch_size', '50'] - - # TODO: currently the CI machine only has 4 GPUs for parallel tests. - # Fill in more pipeline partition options when the machine has different GPUs counts. - if ngpus != 4: - return - - # Test 4-way pipeline parallel - pp_command = ['mpirun', '-n', str(ngpus)] + command + ['--pipeline_parallel_size', '4', '--cut_group_info', - '1149:407-1219/1341/1463/1585/1707/1829,' + - '1881:407-1951/2073/2195/2317/2439/2561,' + - '2613:407-2683/2805/2927/3049/3171/3293'] - command_str = ', '.join(pp_command) - log.debug('RUN: ' + command_str) - run_subprocess(pp_command, cwd=cwd) - - # Test 2-way data parallel + 2-way pipeline parallel - pp_dp_command = ['mpirun', '-n', str(ngpus)] - pp_dp_command = pp_dp_command + command - pp_dp_command = pp_dp_command + ['--data_parallel_size', '2', '--pipeline_parallel_size', - '2', '--cut_group_info', - '1881:407-1951/2073/2195/2317/2439/2561/2683/2805/2927/3049/3171/3293'] - command_str = ', '.join(pp_dp_command) - log.debug('RUN: ' + command_str) - run_subprocess(pp_dp_command, cwd=cwd) - def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): for config in configs: @@ -1374,13 +1316,6 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): cwd = get_config_build_dir(build_dir, config) cwd = os.path.abspath(cwd) - # TODO: temporarily disable this test to restore pipeline health. This test fails due to - # an OOM regression. Invetigation undergoing. - # if args.enable_training and args.use_cuda and args.enable_training_pipeline_e2e_tests: - # # run distributed pipeline test on 4-GPU CI machine. - # run_training_pipeline_e2e_tests(cwd=cwd) - # continue - if args.android: run_android_tests(args, source_dir, config, cwd) continue diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-e2e-test-nightly-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-e2e-test-nightly-pipeline.yml index d332965be1..20e70cb60c 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-e2e-test-nightly-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-e2e-test-nightly-pipeline.yml @@ -47,35 +47,64 @@ jobs: --volume $(Build.BinariesDirectory):/build \ --volume /bert_data:/bert_data \ onnxruntime_e2e_test_image \ - mpirun -n 4 -x NCCL_DEBUG=INFO /build/RelWithDebInfo/onnxruntime_training_bert --ort_log_severity 1 --optimizer=Lamb --learning_rate=3e-3 \ - --max_seq_length=128 --max_predictions_per_seq=20 --warmup_ratio=0.2843 --warmup_mode=Poly \ + mpirun -n 4 \ + /build/RelWithDebInfo/onnxruntime_training_bert \ + --ort_log_severity 1 \ + --optimizer=Lamb \ + --learning_rate=3e-3 \ + --max_seq_length=128 \ + --max_predictions_per_seq=20 \ + --warmup_ratio=0.2843 \ + --warmup_mode=Poly \ --model_name /bert_data/bert_models/nv/bert-large/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12 \ --train_data_dir /bert_data/128/books_wiki_en_corpus/train \ --test_data_dir /bert_data/128/books_wiki_en_corpus/test \ - --display_loss_steps 1 --use_nccl --use_mixed_precision --allreduce_in_fp16 --gradient_accumulation_steps 48 --num_train_steps 96 \ - --train_batch_size 40 --pipeline_parallel_size 4 \ + --display_loss_steps 1 \ + --use_nccl \ + --use_mixed_precision \ + --allreduce_in_fp16 \ + --gradient_accumulation_steps 48 \ + --num_train_steps 96 \ + --train_batch_size 40 \ + --pipeline_parallel_size 4 \ --cut_group_info 1149:407-1219/1341/1463/1585/1707/1829,1881:407-1951/2073/2195/2317/2439/2561,2613:407-2683/2805/2927/3049/3171/3293 displayName: 'mpirun onnxruntime_training_bert --pipeline_parallel_size 4' condition: succeededOrFailed() # ensure all tests are run timeoutInMinutes: 10 - # Hit OOM with run_training_pipeline_e2e_tests.py - slightly above 16GB limit. - # leave this code here for further investigation. - # https://msdata.visualstudio.com/Vienna/_workitems/edit/956642 - # - script: | - # docker run \ - # --gpus all \ - # --shm-size=1024m \ - # --rm \ - # --volume $(Build.BinariesDirectory):/build \ - # --volume /bert_data:/bert_data \ - # --volume /bert_ort:/bert_ort \ - # onnxruntime_e2e_test_image \ - # /build/RelWithDebInfo/run_training_pipeline_e2e_tests.py \ - # --cwd /build/RelWithDebInfo - # displayName: 'Run run_training_pipeline_e2e_tests.py' - # condition: succeededOrFailed() # ensure all tests are run - # timeoutInMinutes: 10 + - script: | + docker run \ + --gpus all \ + --shm-size=1024m \ + --rm \ + --volume $(Build.BinariesDirectory):/build \ + --volume /bert_data:/bert_data \ + onnxruntime_e2e_test_image \ + mpirun -n 4 \ + /build/RelWithDebInfo/onnxruntime_training_bert \ + --ort_log_severity 1 \ + --optimizer=Lamb \ + --learning_rate=3e-3 \ + --max_seq_length=128 \ + --max_predictions_per_seq=20 \ + --warmup_ratio=0.2843 \ + --warmup_mode=Poly \ + --model_name /bert_data/bert_models/nv/bert-large/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12 \ + --train_data_dir /bert_data/128/books_wiki_en_corpus/train \ + --test_data_dir /bert_data/128/books_wiki_en_corpus/test \ + --display_loss_steps 1 \ + --use_nccl \ + --use_mixed_precision \ + --allreduce_in_fp16 \ + --gradient_accumulation_steps 48 \ + --num_train_steps 96 \ + --train_batch_size 40 \ + --data_parallel_size 2 \ + --pipeline_parallel_size 2 \ + --cut_group_info 1881:407-1951/2073/2195/2317/2439/2561/2683/2805/2927/3049/3171/3293 + displayName: 'mpirun onnxruntime_training_bert --data_parallel_size 2 --pipeline_parallel_size 2' + condition: succeededOrFailed() # ensure all tests are run + timeoutInMinutes: 10 - script: | docker run \