From 6ed12402a471796cf190db230e1cbbfd970efdbf Mon Sep 17 00:00:00 2001
From: liqunfu <liqfu@microsoft.com>
Date: Mon, 25 Jan 2021 15:15:26 -0800
Subject: [PATCH] Liqun/liqun/enable pipeline parallel test2 (#6399)

* enable data and pipeline parallelism test

Co-authored-by: liqun <liqun@OrtTrainingDev4.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
---
 .../python/run_training_pipeline_e2e_tests.py | 78 -------------------
 tools/ci_build/build.py                       | 65 ----------------
 ...ng-linux-gpu-e2e-test-nightly-pipeline.yml | 71 ++++++++++++-----
 3 files changed, 50 insertions(+), 164 deletions(-)
 delete mode 100755 orttraining/orttraining/test/python/run_training_pipeline_e2e_tests.py

diff --git a/orttraining/orttraining/test/python/run_training_pipeline_e2e_tests.py b/orttraining/orttraining/test/python/run_training_pipeline_e2e_tests.py
deleted file mode 100755
index fd7e61a366..0000000000
--- a/orttraining/orttraining/test/python/run_training_pipeline_e2e_tests.py
+++ /dev/null
@@ -1,78 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-
-import sys
-import argparse
-
-from _test_commons import run_subprocess
-
-import logging
-
-logging.basicConfig(
-    format="%(asctime)s %(name)s [%(levelname)s] - %(message)s",
-    level=logging.DEBUG)
-log = logging.getLogger("Build")
-
-def parse_arguments():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--cwd", help="cwd")
-    return parser.parse_args()
-
-def main():
-    import torch
-    ngpus = torch.cuda.device_count()
-
-    # TODO: currently the CI machine only has 4 GPUs for parallel tests.
-    # Fill in more pipeline partition options when the machine has different GPUs counts.
-    if ngpus != 4:
-        return 0
-
-    log.info("Running pipeline e2e tests.")
-
-    args = parse_arguments()
-    cwd = args.cwd
-
-    command = ['./onnxruntime_training_bert',
-               '--ort_log_severity', '1',
-               '--optimizer=Lamb',
-               '--learning_rate=3e-3',
-               '--max_seq_length=128',
-               '--max_predictions_per_seq=20',
-               '--warmup_ratio=0.2843',
-               '--warmup_mode=Poly',
-               '--model_name', '/bert_ort/bert_models/nv/bert-large/' +
-               'bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12',
-               '--train_data_dir', '/bert_data/128/books_wiki_en_corpus/train',
-               '--test_data_dir', '/bert_data/128/books_wiki_en_corpus/test',
-               '--display_loss_steps', '1',
-               '--use_nccl',
-               '--use_mixed_precision',
-               '--allreduce_in_fp16',
-               '--gradient_accumulation_steps', '48',
-               '--num_train_steps', '96',
-               '--train_batch_size', '50']
-
-    # Test 4-way pipeline parallel
-    pp_command = ['mpirun', '-n', str(ngpus)] + command + ['--pipeline_parallel_size', '4', '--cut_group_info',
-                                                           '1149:407-1219/1341/1463/1585/1707/1829,' +
-                                                           '1881:407-1951/2073/2195/2317/2439/2561,' +
-                                                           '2613:407-2683/2805/2927/3049/3171/3293']
-    command_str = ', '.join(pp_command)
-    log.debug('RUN: ' + command_str)
-    run_subprocess(pp_command, cwd=cwd, log=log)
-
-    # Test 2-way data parallel + 2-way pipeline parallel
-    pp_dp_command = ['mpirun', '-n', str(ngpus)]
-    pp_dp_command = pp_dp_command + command
-    pp_dp_command = pp_dp_command + ['--data_parallel_size', '2', '--pipeline_parallel_size',
-                                     '2', '--cut_group_info',
-                                     '1881:407-1951/2073/2195/2317/2439/2561/2683/2805/2927/3049/3171/3293']
-    command_str = ', '.join(pp_dp_command)
-    log.debug('RUN: ' + command_str)
-    run_subprocess(pp_dp_command, cwd=cwd, log=log)
-    return 0
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index a4fbb8f016..0e7efeaf05 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -1309,64 +1309,6 @@ def run_training_python_frontend_e2e_tests(cwd):
         sys.executable, 'orttraining_test_transformers.py',
         'BertModelTest.test_for_pretraining_mixed_precision'], cwd=cwd)
 
-    # this test is not stable. it occasionally causes segfault due to its session creation/release pattern.
-    # need to skip to unblock release
-    # run_subprocess([
-    #     sys.executable, 'orttraining_test_transformers.py',
-    #     'BertModelTest.test_for_pretraining_mixed_precision_with_gradient_accumulation'], cwd=cwd)
-
-
-def run_training_pipeline_e2e_tests(cwd):
-    # pipeline tests are to be added here:
-    log.info("Running pipeline e2e tests.")
-
-    import torch
-    ngpus = torch.cuda.device_count()
-
-    command = ['./onnxruntime_training_bert',
-               '--ort_log_severity', '1',
-               '--optimizer=Lamb',
-               '--learning_rate=3e-3',
-               '--max_seq_length=128',
-               '--max_predictions_per_seq=20',
-               '--warmup_ratio=0.2843',
-               '--warmup_mode=Poly',
-               '--model_name', '/bert_ort/bert_models/nv/bert-large/' +
-               'bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12',
-               '--train_data_dir', '/bert_data/128/books_wiki_en_corpus/train',
-               '--test_data_dir', '/bert_data/128/books_wiki_en_corpus/test',
-               '--display_loss_steps', '1',
-               '--use_nccl',
-               '--use_mixed_precision',
-               '--allreduce_in_fp16',
-               '--gradient_accumulation_steps', '48',
-               '--num_train_steps', '96',
-               '--train_batch_size', '50']
-
-    # TODO: currently the CI machine only has 4 GPUs for parallel tests.
-    # Fill in more pipeline partition options when the machine has different GPUs counts.
-    if ngpus != 4:
-        return
-
-    # Test 4-way pipeline parallel
-    pp_command = ['mpirun', '-n', str(ngpus)] + command + ['--pipeline_parallel_size', '4', '--cut_group_info',
-                                                           '1149:407-1219/1341/1463/1585/1707/1829,' +
-                                                           '1881:407-1951/2073/2195/2317/2439/2561,' +
-                                                           '2613:407-2683/2805/2927/3049/3171/3293']
-    command_str = ', '.join(pp_command)
-    log.debug('RUN: ' + command_str)
-    run_subprocess(pp_command, cwd=cwd)
-
-    # Test 2-way data parallel + 2-way pipeline parallel
-    pp_dp_command = ['mpirun', '-n', str(ngpus)]
-    pp_dp_command = pp_dp_command + command
-    pp_dp_command = pp_dp_command + ['--data_parallel_size', '2', '--pipeline_parallel_size',
-                                     '2', '--cut_group_info',
-                                     '1881:407-1951/2073/2195/2317/2439/2561/2683/2805/2927/3049/3171/3293']
-    command_str = ', '.join(pp_dp_command)
-    log.debug('RUN: ' + command_str)
-    run_subprocess(pp_dp_command, cwd=cwd)
-
 
 def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs):
     for config in configs:
@@ -1374,13 +1316,6 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs):
         cwd = get_config_build_dir(build_dir, config)
         cwd = os.path.abspath(cwd)
 
-        # TODO: temporarily disable this test to restore pipeline health. This test fails due to
-        # an OOM regression. Invetigation undergoing.
-        # if args.enable_training and args.use_cuda and args.enable_training_pipeline_e2e_tests:
-        #     # run distributed pipeline test on 4-GPU CI machine.
-        #     run_training_pipeline_e2e_tests(cwd=cwd)
-        #     continue
-
         if args.android:
             run_android_tests(args, source_dir, config, cwd)
             continue
diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-e2e-test-nightly-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-e2e-test-nightly-pipeline.yml
index d332965be1..20e70cb60c 100644
--- a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-e2e-test-nightly-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-e2e-test-nightly-pipeline.yml
@@ -47,35 +47,64 @@ jobs:
         --volume $(Build.BinariesDirectory):/build \
         --volume /bert_data:/bert_data \
         onnxruntime_e2e_test_image \
-          mpirun -n 4 -x NCCL_DEBUG=INFO /build/RelWithDebInfo/onnxruntime_training_bert --ort_log_severity 1 --optimizer=Lamb --learning_rate=3e-3 \
-          --max_seq_length=128 --max_predictions_per_seq=20 --warmup_ratio=0.2843 --warmup_mode=Poly \
+          mpirun -n 4 \
+          /build/RelWithDebInfo/onnxruntime_training_bert \
+          --ort_log_severity 1 \
+          --optimizer=Lamb \
+          --learning_rate=3e-3 \
+          --max_seq_length=128 \
+          --max_predictions_per_seq=20 \
+          --warmup_ratio=0.2843 \
+          --warmup_mode=Poly \
           --model_name /bert_data/bert_models/nv/bert-large/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12 \
           --train_data_dir /bert_data/128/books_wiki_en_corpus/train \
           --test_data_dir /bert_data/128/books_wiki_en_corpus/test \
-          --display_loss_steps 1 --use_nccl --use_mixed_precision --allreduce_in_fp16 --gradient_accumulation_steps 48 --num_train_steps 96 \
-          --train_batch_size 40 --pipeline_parallel_size 4 \
+          --display_loss_steps 1 \
+          --use_nccl \
+          --use_mixed_precision \
+          --allreduce_in_fp16 \
+          --gradient_accumulation_steps 48 \
+          --num_train_steps 96 \
+          --train_batch_size 40 \
+          --pipeline_parallel_size 4 \
           --cut_group_info 1149:407-1219/1341/1463/1585/1707/1829,1881:407-1951/2073/2195/2317/2439/2561,2613:407-2683/2805/2927/3049/3171/3293
     displayName: 'mpirun onnxruntime_training_bert --pipeline_parallel_size 4'
     condition: succeededOrFailed() # ensure all tests are run
     timeoutInMinutes: 10
         
-  # Hit OOM with run_training_pipeline_e2e_tests.py - slightly above 16GB limit.
-  # leave this code here for further investigation.
-  # https://msdata.visualstudio.com/Vienna/_workitems/edit/956642
-  # - script: |
-  #     docker run \
-  #       --gpus all \
-  #       --shm-size=1024m \
-  #       --rm \
-  #       --volume $(Build.BinariesDirectory):/build \
-  #       --volume /bert_data:/bert_data \
-  #       --volume /bert_ort:/bert_ort \
-  #       onnxruntime_e2e_test_image \
-  #         /build/RelWithDebInfo/run_training_pipeline_e2e_tests.py \
-  #           --cwd /build/RelWithDebInfo
-  #   displayName: 'Run run_training_pipeline_e2e_tests.py'
-  #   condition: succeededOrFailed() # ensure all tests are run
-  #   timeoutInMinutes: 10
+  - script: |
+      docker run \
+        --gpus all \
+        --shm-size=1024m \
+        --rm \
+        --volume $(Build.BinariesDirectory):/build \
+        --volume /bert_data:/bert_data \
+        onnxruntime_e2e_test_image \
+          mpirun -n 4 \
+          /build/RelWithDebInfo/onnxruntime_training_bert \
+          --ort_log_severity 1 \
+          --optimizer=Lamb \
+          --learning_rate=3e-3 \
+          --max_seq_length=128 \
+          --max_predictions_per_seq=20 \
+          --warmup_ratio=0.2843 \
+          --warmup_mode=Poly \
+          --model_name /bert_data/bert_models/nv/bert-large/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12 \
+          --train_data_dir /bert_data/128/books_wiki_en_corpus/train \
+          --test_data_dir /bert_data/128/books_wiki_en_corpus/test \
+          --display_loss_steps 1 \
+          --use_nccl \
+          --use_mixed_precision \
+          --allreduce_in_fp16 \
+          --gradient_accumulation_steps 48 \
+          --num_train_steps 96 \
+          --train_batch_size 40 \
+          --data_parallel_size 2 \
+          --pipeline_parallel_size 2 \
+          --cut_group_info 1881:407-1951/2073/2195/2317/2439/2561/2683/2805/2927/3049/3171/3293
+    displayName: 'mpirun onnxruntime_training_bert --data_parallel_size 2 --pipeline_parallel_size 2'
+    condition: succeededOrFailed() # ensure all tests are run
+    timeoutInMinutes: 10
 
   - script: |
       docker run \