mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-22 22:01:08 +00:00
Liqun/liqun/enable pipeline parallel test2 (#6399)
* enable data and pipeline parallelism test Co-authored-by: liqun <liqun@OrtTrainingDev4.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
This commit is contained in:
parent
24f1bd6156
commit
6ed12402a4
3 changed files with 50 additions and 164 deletions
|
|
@ -1,78 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
from _test_commons import run_subprocess
|
||||
|
||||
import logging
|
||||
|
||||
# Root logger configuration for this script: every record from DEBUG upwards
# is emitted, stamped with the time, the logger name and the level.
_LOG_FORMAT = "%(asctime)s %(name)s [%(levelname)s] - %(message)s"
logging.basicConfig(level=logging.DEBUG, format=_LOG_FORMAT)

# Logger used by the functions below; it is also handed to run_subprocess.
log = logging.getLogger("Build")
|
||||
|
||||
def parse_arguments(argv=None):
    """Parse the command-line options of this test driver.

    Args:
        argv: Optional list of argument strings to parse. When left as
            ``None`` (the default, and what existing callers rely on),
            argparse reads ``sys.argv[1:]``.

    Returns:
        The ``argparse.Namespace`` holding ``cwd`` (``None`` when ``--cwd``
        was not supplied).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--cwd",
        help="working directory in which the training executable is launched")
    return parser.parse_args(argv)
|
||||
|
||||
def main():
    """Run the distributed BERT pipeline-parallel end-to-end tests.

    Two configurations of ``./onnxruntime_training_bert`` are launched through
    ``mpirun``, one after the other, in the directory given by ``--cwd``:

    1. 4-way pipeline parallel.
    2. 2-way data parallel combined with 2-way pipeline parallel.

    The tests only run when exactly 4 CUDA devices are visible; otherwise the
    function logs the reason and exits successfully without running anything.

    Returns:
        0, both when the tests were skipped and when both launches returned
        normally. How a failing launch is reported is up to ``run_subprocess``
        (not visible here).
    """
    # torch is only needed to count the visible GPUs, so import it lazily.
    import torch
    ngpus = torch.cuda.device_count()

    # TODO: currently the CI machine only has 4 GPUs for parallel tests.
    # Fill in more pipeline partition options when the machine has different GPUs counts.
    if ngpus != 4:
        # Make the skip visible in the CI log instead of passing silently.
        log.warning(
            "Skipping pipeline e2e tests: found %d GPU(s), but the partition "
            "options are only defined for exactly 4.", ngpus)
        return 0

    log.info("Running pipeline e2e tests.")

    args = parse_arguments()
    cwd = args.cwd

    # Arguments shared by every parallel configuration.
    command = ['./onnxruntime_training_bert',
               '--ort_log_severity', '1',
               '--optimizer=Lamb',
               '--learning_rate=3e-3',
               '--max_seq_length=128',
               '--max_predictions_per_seq=20',
               '--warmup_ratio=0.2843',
               '--warmup_mode=Poly',
               '--model_name', '/bert_ort/bert_models/nv/bert-large/' +
               'bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12',
               '--train_data_dir', '/bert_data/128/books_wiki_en_corpus/train',
               '--test_data_dir', '/bert_data/128/books_wiki_en_corpus/test',
               '--display_loss_steps', '1',
               '--use_nccl',
               '--use_mixed_precision',
               '--allreduce_in_fp16',
               '--gradient_accumulation_steps', '48',
               '--num_train_steps', '96',
               '--train_batch_size', '50']

    # One MPI rank per GPU.
    mpirun_prefix = ['mpirun', '-n', str(ngpus)]

    # Extra arguments for each parallel configuration, in execution order.
    parallel_cases = [
        # Test 4-way pipeline parallel
        ['--pipeline_parallel_size', '4', '--cut_group_info',
         '1149:407-1219/1341/1463/1585/1707/1829,' +
         '1881:407-1951/2073/2195/2317/2439/2561,' +
         '2613:407-2683/2805/2927/3049/3171/3293'],
        # Test 2-way data parallel + 2-way pipeline parallel
        ['--data_parallel_size', '2', '--pipeline_parallel_size',
         '2', '--cut_group_info',
         '1881:407-1951/2073/2195/2317/2439/2561/2683/2805/2927/3049/3171/3293'],
    ]

    for extra_args in parallel_cases:
        full_command = mpirun_prefix + command + extra_args
        log.debug('RUN: %s', ', '.join(full_command))
        run_subprocess(full_command, cwd=cwd, log=log)

    return 0
|
||||
|
||||
|
||||
# Script entry point: the integer returned by main() becomes the process
# exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
|
|
@ -1309,64 +1309,6 @@ def run_training_python_frontend_e2e_tests(cwd):
|
|||
sys.executable, 'orttraining_test_transformers.py',
|
||||
'BertModelTest.test_for_pretraining_mixed_precision'], cwd=cwd)
|
||||
|
||||
# this test is not stable. it occasionally causes segfault due to its session creation/release pattern.
|
||||
# need to skip to unblock release
|
||||
# run_subprocess([
|
||||
# sys.executable, 'orttraining_test_transformers.py',
|
||||
# 'BertModelTest.test_for_pretraining_mixed_precision_with_gradient_accumulation'], cwd=cwd)
|
||||
|
||||
|
||||
def run_training_pipeline_e2e_tests(cwd):
    """Run the distributed BERT pipeline-parallel end-to-end tests.

    Launches ``./onnxruntime_training_bert`` through ``mpirun`` twice, first
    as 4-way pipeline parallel and then as 2-way data parallel combined with
    2-way pipeline parallel. Nothing is launched unless exactly 4 CUDA devices
    are visible; in that case the reason is logged and the function returns.

    Args:
        cwd: Working directory passed to ``run_subprocess`` for each launch.

    Returns:
        None.
    """
    # pipeline tests are to be added here:
    log.info("Running pipeline e2e tests.")

    # torch is only needed to count the visible GPUs, so import it lazily.
    import torch
    ngpus = torch.cuda.device_count()

    # TODO: currently the CI machine only has 4 GPUs for parallel tests.
    # Fill in more pipeline partition options when the machine has different GPUs counts.
    if ngpus != 4:
        # Make the skip visible in the build log instead of returning silently.
        log.warning(
            "Skipping pipeline e2e tests: found %d GPU(s), but the partition "
            "options are only defined for exactly 4.", ngpus)
        return

    # mpirun prefix (one rank per GPU) followed by the arguments shared by
    # every parallel configuration.
    base_command = ['mpirun', '-n', str(ngpus),
                    './onnxruntime_training_bert',
                    '--ort_log_severity', '1',
                    '--optimizer=Lamb',
                    '--learning_rate=3e-3',
                    '--max_seq_length=128',
                    '--max_predictions_per_seq=20',
                    '--warmup_ratio=0.2843',
                    '--warmup_mode=Poly',
                    '--model_name', '/bert_ort/bert_models/nv/bert-large/' +
                    'bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12',
                    '--train_data_dir', '/bert_data/128/books_wiki_en_corpus/train',
                    '--test_data_dir', '/bert_data/128/books_wiki_en_corpus/test',
                    '--display_loss_steps', '1',
                    '--use_nccl',
                    '--use_mixed_precision',
                    '--allreduce_in_fp16',
                    '--gradient_accumulation_steps', '48',
                    '--num_train_steps', '96',
                    '--train_batch_size', '50']

    # Test 4-way pipeline parallel
    pipeline_only_args = ['--pipeline_parallel_size', '4', '--cut_group_info',
                          '1149:407-1219/1341/1463/1585/1707/1829,' +
                          '1881:407-1951/2073/2195/2317/2439/2561,' +
                          '2613:407-2683/2805/2927/3049/3171/3293']

    # Test 2-way data parallel + 2-way pipeline parallel
    data_and_pipeline_args = ['--data_parallel_size', '2', '--pipeline_parallel_size',
                              '2', '--cut_group_info',
                              '1881:407-1951/2073/2195/2317/2439/2561/2683/2805/2927/3049/3171/3293']

    for extra_args in (pipeline_only_args, data_and_pipeline_args):
        full_command = base_command + extra_args
        log.debug('RUN: %s', ', '.join(full_command))
        run_subprocess(full_command, cwd=cwd)
|
||||
|
||||
|
||||
def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs):
|
||||
for config in configs:
|
||||
|
|
@ -1374,13 +1316,6 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs):
|
|||
cwd = get_config_build_dir(build_dir, config)
|
||||
cwd = os.path.abspath(cwd)
|
||||
|
||||
# TODO: temporarily disable this test to restore pipeline health. This test fails due to
|
||||
# an OOM regression. Investigation is ongoing.
|
||||
# if args.enable_training and args.use_cuda and args.enable_training_pipeline_e2e_tests:
|
||||
# # run distributed pipeline test on 4-GPU CI machine.
|
||||
# run_training_pipeline_e2e_tests(cwd=cwd)
|
||||
# continue
|
||||
|
||||
if args.android:
|
||||
run_android_tests(args, source_dir, config, cwd)
|
||||
continue
|
||||
|
|
|
|||
|
|
@ -47,35 +47,64 @@ jobs:
|
|||
--volume $(Build.BinariesDirectory):/build \
|
||||
--volume /bert_data:/bert_data \
|
||||
onnxruntime_e2e_test_image \
|
||||
mpirun -n 4 -x NCCL_DEBUG=INFO /build/RelWithDebInfo/onnxruntime_training_bert --ort_log_severity 1 --optimizer=Lamb --learning_rate=3e-3 \
|
||||
--max_seq_length=128 --max_predictions_per_seq=20 --warmup_ratio=0.2843 --warmup_mode=Poly \
|
||||
mpirun -n 4 \
|
||||
/build/RelWithDebInfo/onnxruntime_training_bert \
|
||||
--ort_log_severity 1 \
|
||||
--optimizer=Lamb \
|
||||
--learning_rate=3e-3 \
|
||||
--max_seq_length=128 \
|
||||
--max_predictions_per_seq=20 \
|
||||
--warmup_ratio=0.2843 \
|
||||
--warmup_mode=Poly \
|
||||
--model_name /bert_data/bert_models/nv/bert-large/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12 \
|
||||
--train_data_dir /bert_data/128/books_wiki_en_corpus/train \
|
||||
--test_data_dir /bert_data/128/books_wiki_en_corpus/test \
|
||||
--display_loss_steps 1 --use_nccl --use_mixed_precision --allreduce_in_fp16 --gradient_accumulation_steps 48 --num_train_steps 96 \
|
||||
--train_batch_size 40 --pipeline_parallel_size 4 \
|
||||
--display_loss_steps 1 \
|
||||
--use_nccl \
|
||||
--use_mixed_precision \
|
||||
--allreduce_in_fp16 \
|
||||
--gradient_accumulation_steps 48 \
|
||||
--num_train_steps 96 \
|
||||
--train_batch_size 40 \
|
||||
--pipeline_parallel_size 4 \
|
||||
--cut_group_info 1149:407-1219/1341/1463/1585/1707/1829,1881:407-1951/2073/2195/2317/2439/2561,2613:407-2683/2805/2927/3049/3171/3293
|
||||
displayName: 'mpirun onnxruntime_training_bert --pipeline_parallel_size 4'
|
||||
condition: succeededOrFailed() # ensure all tests are run
|
||||
timeoutInMinutes: 10
|
||||
|
||||
# Hit OOM with run_training_pipeline_e2e_tests.py - slightly above 16GB limit.
|
||||
# leave this code here for further investigation.
|
||||
# https://msdata.visualstudio.com/Vienna/_workitems/edit/956642
|
||||
# - script: |
|
||||
# docker run \
|
||||
# --gpus all \
|
||||
# --shm-size=1024m \
|
||||
# --rm \
|
||||
# --volume $(Build.BinariesDirectory):/build \
|
||||
# --volume /bert_data:/bert_data \
|
||||
# --volume /bert_ort:/bert_ort \
|
||||
# onnxruntime_e2e_test_image \
|
||||
# /build/RelWithDebInfo/run_training_pipeline_e2e_tests.py \
|
||||
# --cwd /build/RelWithDebInfo
|
||||
# displayName: 'Run run_training_pipeline_e2e_tests.py'
|
||||
# condition: succeededOrFailed() # ensure all tests are run
|
||||
# timeoutInMinutes: 10
|
||||
- script: |
|
||||
docker run \
|
||||
--gpus all \
|
||||
--shm-size=1024m \
|
||||
--rm \
|
||||
--volume $(Build.BinariesDirectory):/build \
|
||||
--volume /bert_data:/bert_data \
|
||||
onnxruntime_e2e_test_image \
|
||||
mpirun -n 4 \
|
||||
/build/RelWithDebInfo/onnxruntime_training_bert \
|
||||
--ort_log_severity 1 \
|
||||
--optimizer=Lamb \
|
||||
--learning_rate=3e-3 \
|
||||
--max_seq_length=128 \
|
||||
--max_predictions_per_seq=20 \
|
||||
--warmup_ratio=0.2843 \
|
||||
--warmup_mode=Poly \
|
||||
--model_name /bert_data/bert_models/nv/bert-large/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12 \
|
||||
--train_data_dir /bert_data/128/books_wiki_en_corpus/train \
|
||||
--test_data_dir /bert_data/128/books_wiki_en_corpus/test \
|
||||
--display_loss_steps 1 \
|
||||
--use_nccl \
|
||||
--use_mixed_precision \
|
||||
--allreduce_in_fp16 \
|
||||
--gradient_accumulation_steps 48 \
|
||||
--num_train_steps 96 \
|
||||
--train_batch_size 40 \
|
||||
--data_parallel_size 2 \
|
||||
--pipeline_parallel_size 2 \
|
||||
--cut_group_info 1881:407-1951/2073/2195/2317/2439/2561/2683/2805/2927/3049/3171/3293
|
||||
displayName: 'mpirun onnxruntime_training_bert --data_parallel_size 2 --pipeline_parallel_size 2'
|
||||
condition: succeededOrFailed() # ensure all tests are run
|
||||
timeoutInMinutes: 10
|
||||
|
||||
- script: |
|
||||
docker run \
|
||||
|
|
|
|||
Loading…
Reference in a new issue