From e0334f177cb3cc5dc7aeabe8a5ef4773cd9ea902 Mon Sep 17 00:00:00 2001 From: ytaous <4484531+ytaous@users.noreply.github.com> Date: Mon, 15 Jun 2020 14:26:53 -0700 Subject: [PATCH] Opset12 upgrade for existing models used by perf/e2e pipelines (#4238) * opset12 support * opset12 support * on comments Co-authored-by: Ethan Tao --- .../tools/ci_test/download_e2e_test_data.py | 4 +- .../bert_base.convergence.baseline.csv | 12 +- .../tools/ci_test/run_batch_size_test.py | 4 +- .../tools/ci_test/run_bert_perf_test.py | 2 +- .../tools/ci_test/run_convergence_test.py | 2 +- .../tools/ci_test/run_gpt2_perf_test.py | 2 +- .../tools/scripts/opset12_model_transform.py | 133 ++++++++++++++++++ 7 files changed, 146 insertions(+), 13 deletions(-) create mode 100644 orttraining/tools/scripts/opset12_model_transform.py diff --git a/orttraining/tools/ci_test/download_e2e_test_data.py b/orttraining/tools/ci_test/download_e2e_test_data.py index eb8e443d79..ef0e64a880 100755 --- a/orttraining/tools/ci_test/download_e2e_test_data.py +++ b/orttraining/tools/ci_test/download_e2e_test_data.py @@ -11,8 +11,8 @@ import urllib.request import zipfile # update these if the E2E test data changes -ARCHIVE_BLOB_URL = "https://onnxruntimetestdata.blob.core.windows.net/training/onnxruntime_training_data.zip?snapshot=2020-04-14T02:10:05.3158245Z" -ARCHIVE_SHA256_DIGEST = "ea4168a801ded478f4e2af08232cb1174913caac300d5bf73b2652dc6894372c" +ARCHIVE_BLOB_URL = "https://onnxruntimetestdata.blob.core.windows.net/training/onnxruntime_training_data_v12.zip?snapshot=2020-06-13T06:24:15.0833240Z" +ARCHIVE_SHA256_DIGEST = "B01C169B6550D1A0A6F1B4E2F34AE2A8714B52DBB70AC04DA85D371F691BDFF9" def _download(url, local_path): urllib.request.urlretrieve(url, local_path) diff --git a/orttraining/tools/ci_test/results/bert_base.convergence.baseline.csv b/orttraining/tools/ci_test/results/bert_base.convergence.baseline.csv index b718cdf312..cd24bfef9e 100644 --- 
a/orttraining/tools/ci_test/results/bert_base.convergence.baseline.csv +++ b/orttraining/tools/ci_test/results/bert_base.convergence.baseline.csv @@ -1,11 +1,11 @@ step,total_loss,mlm_loss,nsp_loss -0,11.2031,10.4979,0.707195 +0,11.2422,10.5228,0.717476 5,10.1875,7.75453,2.43238 -10,8.33594,7.63755,0.697193 +10,8.42188,7.63755,0.792425 15,8.35156,7.60502,0.744699 -20,8.22656,7.48076,0.749099 -25,8.27344,7.56207,0.71167 +20,8.22656,7.4854,0.749099 +25,8.29688,7.56207,0.73899 30,8.125,7.40926,0.716592 -35,7.95703,7.26281,0.694741 +35,7.99219,7.26281,0.726583 40,7.94531,7.26573,0.679934 -45,7.93359,7.27335,0.661407 +45,7.94141,7.27335,0.668663 diff --git a/orttraining/tools/ci_test/run_batch_size_test.py b/orttraining/tools/ci_test/run_batch_size_test.py index c9bdc6e9d7..a8cc92b785 100755 --- a/orttraining/tools/ci_test/run_batch_size_test.py +++ b/orttraining/tools/ci_test/run_batch_size_test.py @@ -35,7 +35,7 @@ def main(): os.path.join(args.binary_dir, "onnxruntime_training_bert"), "--model_name", os.path.join( args.model_root, - "nv/bert-large/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm"), + "nv/bert-large/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12"), "--train_batch_size", str(config.max_batch_size), "--mode", "perf", "--max_seq_length", str(config.sequence_length), @@ -57,7 +57,7 @@ def main(): if config.enable_mixed_precision: cmds.append("--use_mixed_precision"), - subprocess.run(cmds, timeout=60).check_returncode() + subprocess.run(cmds, timeout=120).check_returncode() return 0 diff --git a/orttraining/tools/ci_test/run_bert_perf_test.py b/orttraining/tools/ci_test/run_bert_perf_test.py index b11aad9127..14f72a2c92 100644 --- a/orttraining/tools/ci_test/run_bert_perf_test.py +++ b/orttraining/tools/ci_test/run_bert_perf_test.py @@ -38,7 +38,7 @@ def main(): cmds = [ os.path.join(args.binary_dir, "onnxruntime_training_bert"), "--model_name", os.path.join( - args.model_root, 
"nv/bert-large/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm"), + args.model_root, "nv/bert-large/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12"), "--train_data_dir", os.path.join( args.training_data_root, str(c.max_seq_length), "books_wiki_en_corpus/train"), "--test_data_dir", os.path.join( diff --git a/orttraining/tools/ci_test/run_convergence_test.py b/orttraining/tools/ci_test/run_convergence_test.py index 4fa3c7c312..68528e2897 100755 --- a/orttraining/tools/ci_test/run_convergence_test.py +++ b/orttraining/tools/ci_test/run_convergence_test.py @@ -33,7 +33,7 @@ def main(): subprocess.run([ os.path.join(args.binary_dir, "onnxruntime_training_bert"), "--model_name", os.path.join( - args.model_root, "nv/bert-base/bert-base-uncased_L_12_H_768_A_12_V_30528_S_512_Dp_0.1_optimized_layer_norm"), + args.model_root, "nv/bert-base/bert-base-uncased_L_12_H_768_A_12_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12"), "--train_data_dir", os.path.join( args.training_data_root, "128/books_wiki_en_corpus/train"), "--test_data_dir", os.path.join( diff --git a/orttraining/tools/ci_test/run_gpt2_perf_test.py b/orttraining/tools/ci_test/run_gpt2_perf_test.py index 8f4594a4d1..3e39ffd9a6 100644 --- a/orttraining/tools/ci_test/run_gpt2_perf_test.py +++ b/orttraining/tools/ci_test/run_gpt2_perf_test.py @@ -36,7 +36,7 @@ def main(): cmds = [ os.path.join(args.binary_dir, "onnxruntime_training_gpt2"), "--model_name", os.path.join( - args.model_root, "megatron-gpt2_hidden-size-1024_num-layers-24_vocab-size-50257_num-attention-heads-16_max-position-embeddings-1024_optimized"), + args.model_root, "megatron-gpt2_hidden-size-1024_num-layers-24_vocab-size-50257_num-attention-heads-16_max-position-embeddings-1024_optimized_opset12"), "--train_data_dir", os.path.join( args.training_data_root, "train"), "--test_data_dir", os.path.join( diff --git a/orttraining/tools/scripts/opset12_model_transform.py 
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#
# Internal utility to upgrade existing BERT/GPT-2 models — previously
# transformed/optimized from the original model — to opset 12, replacing the
# deprecated TrainableDropout node with the "Dropout" node matching the
# opset-12 spec. Typically, a model run through this script has an
# "_optimized" substring in its name, and its graph contains one or more
# "TrainableDropout" nodes.
#
# Example usage:
#   python opset12_model_transform.py bert-base-uncased_L_12_H_768_A_12_V_30528_S_512_Dp_0.1_optimized_layer_norm.onnx
# Output:
#   bert-base-uncased_L_12_H_768_A_12_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12.onnx

import sys

import numpy as np
import onnx
from onnx import AttributeProto, TensorProto, helper, numpy_helper, shape_inference

if len(sys.argv) < 2:
    print("Please give model path...")
    sys.exit(1)

input_model_name = sys.argv[1]
# "<name>.onnx" -> "<name>_opset12.onnx"
output_model_name = input_model_name[:-5] + '_opset12.onnx'

model = onnx.load(input_model_name)

def find_input_node(model, arg):
    """Return the unique node whose output is `arg`, or None if not unique."""
    producers = [node for node in model.graph.node if arg in node.output]
    return producers[0] if len(producers) == 1 else None

def find_output_node(model, arg):
    """Return the unique node that consumes `arg` as an input, or None if not unique."""
    consumers = [node for node in model.graph.node if arg in node.input]
    return consumers[0] if len(consumers) == 1 else None

def find_input(model, arg):
    """Return the graph initializer named `arg`, or None if absent."""
    for initializer in model.graph.initializer:
        if initializer.name == arg:
            return initializer
    return None

def get_node_index(model, node):
    """Return the index of `node` within model.graph.node, or None if absent."""
    for i, candidate in enumerate(model.graph.node):
        if candidate == node:
            return i
    return None

def add_const(model, name, output, t_value=None, f_value=None):
    """Append a Constant node named `name` producing `output` and return it.

    Exactly one of `t_value` (a TensorProto) or `f_value` (a float) should be
    supplied; `t_value` wins when both are given.
    """
    const_node = model.graph.node.add()
    const_node.op_type = 'Constant'
    const_node.name = name
    const_node.output.extend([output])
    attr = const_node.attribute.add()
    attr.name = 'value'
    if t_value is not None:
        attr.type = AttributeProto.TENSOR  # == 4
        attr.t.CopyFrom(t_value)
    else:
        attr.type = AttributeProto.FLOAT  # == 1
        attr.f = f_value
    return const_node

def process_trainabledropout(model):
    """Replace every TrainableDropout node with an opset-12 Dropout node.

    Opset-12 Dropout takes (data, ratio, training_mode): the old 1-D ratio
    Constant is re-emitted as a rank-0 float scalar, and a scalar boolean
    training_mode Constant (True) is added. The replaced TrainableDropout
    nodes and their old ratio Constants are deleted afterwards.
    """
    delete_nodes = []
    index = 0
    # NOTE(review): new nodes are appended to model.graph.node while it is
    # being iterated; the appended nodes are 'Dropout'/'Constant' and are
    # skipped by the op_type check below, so the loop terminates.
    for node in model.graph.node:
        if node.op_type != 'TrainableDropout':
            continue
        new_dropout = model.graph.node.add()
        new_dropout.op_type = 'Dropout'
        new_dropout.name = 'Dropout_%d' % index
        # add seed attribute
        attr = new_dropout.attribute.add()
        attr.name = 'seed'
        attr.type = AttributeProto.INT  # == 2
        # find the old ratio Constant feeding the TrainableDropout
        ratio_node = find_input_node(model, node.input[1])
        assert ratio_node.op_type == 'Constant'
        delete_nodes.append(get_node_index(model, ratio_node))
        # re-emit the ratio as a scalar (rank-0) float32 Constant
        ratio_data = numpy_helper.to_array(ratio_node.attribute[0].t)
        ratio_scalar = ratio_data.astype(np.float32).reshape(())
        ratio_value = numpy_helper.from_array(ratio_scalar, "ratio")
        new_ratio_node = add_const(
            model, 'dropout_ratio_node_%d' % index,
            'dropout_ratio_%d' % index, t_value=ratio_value)
        index += 1
        # add a scalar boolean training_mode input (always True here)
        # (plain `bool` instead of the deprecated np.bool alias, which was
        # removed in NumPy 1.24)
        mode_scalar = np.asarray([True]).astype(bool).reshape(())
        mode_value = numpy_helper.from_array(mode_scalar, "training_mode")
        training_mode_node = add_const(
            model, 'dropout_training_mode_node_%d' % index,
            'dropout_training_mode_%d' % index, t_value=mode_value)
        index += 1

        new_dropout.input.extend(
            [node.input[0], new_ratio_node.output[0], training_mode_node.output[0]])
        new_dropout.output.extend(node.output)
        delete_nodes.append(get_node_index(model, node))
        index += 1

    # delete from the back so earlier indices stay valid
    delete_nodes.sort(reverse=True)
    for d in delete_nodes:
        del model.graph.node[d]
def align_attention_mask_dim(model):
    """Make the leading (batch) dimension of the "attention_mask" graph input
    symbolic ("batch") rather than a fixed size."""
    for graph_input in model.graph.input:
        if graph_input.name != "attention_mask":
            continue
        graph_input.type.tensor_type.shape.dim[0].dim_param = "batch"


# Replace deprecated TrainableDropout nodes with opset-12 Dropout.
process_trainabledropout(model)
# Some (large) GPT-2 models still ship without a symbolic batch dimension on
# this input; correct it here.
align_attention_mask_dim(model)

# Stamp the graph as opset 12.
model.opset_import[0].version = 12

with open(output_model_name, "wb") as f:
    f.write(model.SerializeToString())

# To verify the converted model in the BERT case, refer to the code at the
# end of model_transform.py.