Opset12 upgrade for existing models used by perf/e2e pipelines (#4238)

* opset12 support

* address review comments

Co-authored-by: Ethan Tao <ettao@microsoft.com>
ytaous 2020-06-15 14:26:53 -07:00 committed by GitHub
parent 4486c66ed4
commit e0334f177c
7 changed files with 146 additions and 13 deletions

View file

@@ -11,8 +11,8 @@ import urllib.request
 import zipfile
 # update these if the E2E test data changes
-ARCHIVE_BLOB_URL = "https://onnxruntimetestdata.blob.core.windows.net/training/onnxruntime_training_data.zip?snapshot=2020-04-14T02:10:05.3158245Z"
-ARCHIVE_SHA256_DIGEST = "ea4168a801ded478f4e2af08232cb1174913caac300d5bf73b2652dc6894372c"
+ARCHIVE_BLOB_URL = "https://onnxruntimetestdata.blob.core.windows.net/training/onnxruntime_training_data_v12.zip?snapshot=2020-06-13T06:24:15.0833240Z"
+ARCHIVE_SHA256_DIGEST = "B01C169B6550D1A0A6F1B4E2F34AE2A8714B52DBB70AC04DA85D371F691BDFF9"
 def _download(url, local_path):
     urllib.request.urlretrieve(url, local_path)
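
A sanity check one might pair with the new digest (a minimal sketch, not the script's actual verification logic, which is outside this hunk; _sha256 and local_zip_path are hypothetical names; note the new digest constant is uppercase, so the comparison should be case-insensitive):

import hashlib

def _sha256(path, chunk_size=1 << 20):
    # stream the archive so large zips need not fit in memory
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

# local_zip_path: hypothetical path of the downloaded archive
assert _sha256(local_zip_path).lower() == ARCHIVE_SHA256_DIGEST.lower()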

View file

@@ -1,11 +1,11 @@
 step,total_loss,mlm_loss,nsp_loss
-0,11.2031,10.4979,0.707195
+0,11.2422,10.5228,0.717476
 5,10.1875,7.75453,2.43238
-10,8.33594,7.63755,0.697193
+10,8.42188,7.63755,0.792425
 15,8.35156,7.60502,0.744699
-20,8.22656,7.48076,0.749099
-25,8.27344,7.56207,0.71167
+20,8.22656,7.4854,0.749099
+25,8.29688,7.56207,0.73899
 30,8.125,7.40926,0.716592
-35,7.95703,7.26281,0.694741
+35,7.99219,7.26281,0.726583
 40,7.94531,7.26573,0.679934
-45,7.93359,7.27335,0.661407
+45,7.94141,7.27335,0.668663


View file

@@ -35,7 +35,7 @@ def main():
         os.path.join(args.binary_dir, "onnxruntime_training_bert"),
         "--model_name", os.path.join(
             args.model_root,
-            "nv/bert-large/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm"),
+            "nv/bert-large/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12"),
         "--train_batch_size", str(config.max_batch_size),
         "--mode", "perf",
         "--max_seq_length", str(config.sequence_length),
@@ -57,7 +57,7 @@ def main():
     if config.enable_mixed_precision:
         cmds.append("--use_mixed_precision")
-    subprocess.run(cmds, timeout=60).check_returncode()
+    subprocess.run(cmds, timeout=120).check_returncode()
     return 0

View file

@@ -38,7 +38,7 @@ def main():
     cmds = [
         os.path.join(args.binary_dir, "onnxruntime_training_bert"),
         "--model_name", os.path.join(
-            args.model_root, "nv/bert-large/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm"),
+            args.model_root, "nv/bert-large/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12"),
         "--train_data_dir", os.path.join(
             args.training_data_root, str(c.max_seq_length), "books_wiki_en_corpus/train"),
         "--test_data_dir", os.path.join(

View file

@@ -33,7 +33,7 @@ def main():
     subprocess.run([
         os.path.join(args.binary_dir, "onnxruntime_training_bert"),
         "--model_name", os.path.join(
-            args.model_root, "nv/bert-base/bert-base-uncased_L_12_H_768_A_12_V_30528_S_512_Dp_0.1_optimized_layer_norm"),
+            args.model_root, "nv/bert-base/bert-base-uncased_L_12_H_768_A_12_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12"),
         "--train_data_dir", os.path.join(
             args.training_data_root, "128/books_wiki_en_corpus/train"),
         "--test_data_dir", os.path.join(

View file

@@ -36,7 +36,7 @@ def main():
     cmds = [
         os.path.join(args.binary_dir, "onnxruntime_training_gpt2"),
         "--model_name", os.path.join(
-            args.model_root, "megatron-gpt2_hidden-size-1024_num-layers-24_vocab-size-50257_num-attention-heads-16_max-position-embeddings-1024_optimized"),
+            args.model_root, "megatron-gpt2_hidden-size-1024_num-layers-24_vocab-size-50257_num-attention-heads-16_max-position-embeddings-1024_optimized_opset12"),
         "--train_data_dir", os.path.join(
             args.training_data_root, "train"),
         "--test_data_dir", os.path.join(

View file

@@ -0,0 +1,133 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#
# This converter is an internal util to upgrade existing bert/gpt-2 models,
# which were previously transformed/optimized from the original models, to
# opset 12, replacing the deprecated TrainableDropout node with the "Dropout"
# node matching the opset 12 spec. Typically, a model to be run through this
# script has an "_optimized" substring in its name and one or more
# "TrainableDropout" nodes in its graph.
# Example usage:
# python opset12_model_transform.py bert-base-uncased_L_12_H_768_A_12_V_30528_S_512_Dp_0.1_optimized_layer_norm.onnx
# Output:
# bert-base-uncased_L_12_H_768_A_12_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12.onnx
import sys
import onnx
from onnx import helper, shape_inference
from onnx import TensorProto
import numpy as np
from onnx import numpy_helper
if len(sys.argv) < 2:
    print("Please give model path...")
    exit(1)

input_model_name = sys.argv[1]
output_model_name = input_model_name[:-5] + '_opset12.onnx'
model = onnx.load(input_model_name)

def find_input_node(model, arg):
    result = []
    for node in model.graph.node:
        for output in node.output:
            if output == arg:
                result.append(node)
    return result[0] if len(result) == 1 else None

def find_output_node(model, arg):
    result = []
    for node in model.graph.node:
        for input in node.input:
            if input == arg:
                result.append(node)
    return result[0] if len(result) == 1 else None

def find_input(model, arg):
    for initializer in model.graph.initializer:
        if initializer.name == arg:
            return initializer
    return None

def get_node_index(model, node):
    i = 0
    while i < len(model.graph.node):
        if model.graph.node[i] == node:
            break
        i += 1
    return i if i < len(model.graph.node) else None
def add_const(model, name, output, t_value=None, f_value=None):
    const_node = model.graph.node.add()
    const_node.op_type = 'Constant'
    const_node.name = name
    const_node.output.extend([output])
    attr = const_node.attribute.add()
    attr.name = 'value'
    if t_value is not None:
        attr.type = 4  # AttributeProto.TENSOR
        attr.t.CopyFrom(t_value)
    else:
        attr.type = 1  # AttributeProto.FLOAT
        attr.f = f_value
    return const_node
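
# The rewrite below maps TrainableDropout(data, ratio) onto the opset-12
# Dropout(data, ratio, training_mode): the ratio is re-emitted as a scalar
# Constant, and a scalar boolean training_mode=True Constant is added so
# dropout remains active during training.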
def process_trainabledropout(model):
    delete_nodes = []
    index = 0
    # appending to model.graph.node while iterating is safe here: the nodes
    # added below are 'Dropout'/'Constant' and never match the filter
    for node in model.graph.node:
        if node.op_type == 'TrainableDropout':
            new_dropout = model.graph.node.add()
            new_dropout.op_type = 'Dropout'
            new_dropout.name = 'Dropout_%d' % index
            # add seed attribute
            attr = new_dropout.attribute.add()
            attr.name = 'seed'
            attr.type = 2  # AttributeProto.INT (seed value defaults to 0)
            # find old ratio node
            ratio_node = find_input_node(model, node.input[1])
            assert ratio_node.op_type == 'Constant'
            delete_nodes.append(get_node_index(model, ratio_node))
            # make ratio scalar node
            ratio_attr = ratio_node.attribute
            ratio_data = numpy_helper.to_array(ratio_attr[0].t)
            ratio_scalar = ratio_data.astype(np.float32).reshape(())
            ratio_value = numpy_helper.from_array(ratio_scalar, "ratio")
            new_ratio_node = add_const(model, 'dropout_ratio_node_%d' % index, 'dropout_ratio_%d' % index, t_value=ratio_value)
            index += 1
            # add training_mode input (scalar boolean True)
            mode_scalar = np.asarray([True]).astype(bool).reshape(())
            mode_value = numpy_helper.from_array(mode_scalar, "training_mode")
            training_mode_node = add_const(model, 'dropout_training_mode_node_%d' % index, 'dropout_training_mode_%d' % index, t_value=mode_value)
            index += 1
            new_dropout.input.extend([node.input[0], new_ratio_node.output[0], training_mode_node.output[0]])
            new_dropout.output.extend(node.output)
            delete_nodes.append(get_node_index(model, node))
            index += 1
    # delete from the back so the recorded indices stay valid
    delete_nodes.sort(reverse=True)
    for d in delete_nodes:
        del model.graph.node[d]
def align_attention_mask_dim(model):
    # make the batch dimension symbolic so the model accepts any batch size
    for model_input in model.graph.input:
        if model_input.name == "attention_mask":
            model_input.type.tensor_type.shape.dim[0].dim_param = "batch"
# replace TrainableDropout with Dropout
process_trainabledropout(model)
# some gpt-2 models (large ones) still don't have this input corrected
align_attention_mask_dim(model)
# set opset version to 12
model.opset_import[0].version = 12

with open(output_model_name, "wb") as f:
    f.write(model.SerializeToString())
#
# To verify the converted model in the case of BERT, refer to the code at the end of model_transform.py
#
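
A quick spot-check of any converted model, complementing the BERT-specific verification mentioned above (a minimal sketch; the file name is just the converter's output for the bert-base example in the header):

import onnx

m = onnx.load("bert-base-uncased_L_12_H_768_A_12_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12.onnx")
# every TrainableDropout should be gone, and the default-domain opset bumped to 12
assert all(node.op_type != "TrainableDropout" for node in m.graph.node)
assert m.opset_import[0].version == 12
print("Dropout nodes:", sum(node.op_type == "Dropout" for node in m.graph.node))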