Opset12 upgrade for existing models used by perf/e2e pipelines (#4238)

* opset12 support

* address review comments

Co-authored-by: Ethan Tao <ettao@microsoft.com>
ytaous 2020-06-15 14:26:53 -07:00 committed by GitHub
parent 4486c66ed4
commit e0334f177c
7 changed files with 146 additions and 13 deletions

View file

@@ -11,8 +11,8 @@ import urllib.request
 import zipfile
 # update these if the E2E test data changes
-ARCHIVE_BLOB_URL = "https://onnxruntimetestdata.blob.core.windows.net/training/onnxruntime_training_data.zip?snapshot=2020-04-14T02:10:05.3158245Z"
-ARCHIVE_SHA256_DIGEST = "ea4168a801ded478f4e2af08232cb1174913caac300d5bf73b2652dc6894372c"
+ARCHIVE_BLOB_URL = "https://onnxruntimetestdata.blob.core.windows.net/training/onnxruntime_training_data_v12.zip?snapshot=2020-06-13T06:24:15.0833240Z"
+ARCHIVE_SHA256_DIGEST = "B01C169B6550D1A0A6F1B4E2F34AE2A8714B52DBB70AC04DA85D371F691BDFF9"
 def _download(url, local_path):
     urllib.request.urlretrieve(url, local_path)
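
A sanity check one might pair with the new digest (a minimal sketch, not the script's actual verification logic, which is outside this hunk; _sha256 and local_zip_path are hypothetical names; note the new digest constant is uppercase, so the comparison should be case-insensitive):

import hashlib

def _sha256(path, chunk_size=1 << 20):
    # stream the archive so large zips need not fit in memory
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

# local_zip_path: hypothetical path of the downloaded archive
assert _sha256(local_zip_path).lower() == ARCHIVE_SHA256_DIGEST.lower()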

View file

@@ -1,11 +1,11 @@
 step,total_loss,mlm_loss,nsp_loss
-0,11.2031,10.4979,0.707195
+0,11.2422,10.5228,0.717476
 5,10.1875,7.75453,2.43238
-10,8.33594,7.63755,0.697193
+10,8.42188,7.63755,0.792425
 15,8.35156,7.60502,0.744699
-20,8.22656,7.48076,0.749099
-25,8.27344,7.56207,0.71167
+20,8.22656,7.4854,0.749099
+25,8.29688,7.56207,0.73899
 30,8.125,7.40926,0.716592
-35,7.95703,7.26281,0.694741
+35,7.99219,7.26281,0.726583
 40,7.94531,7.26573,0.679934
-45,7.93359,7.27335,0.661407
+45,7.94141,7.27335,0.668663


View file

@@ -35,7 +35,7 @@ def main():
         os.path.join(args.binary_dir, "onnxruntime_training_bert"),
         "--model_name", os.path.join(
             args.model_root,
-            "nv/bert-large/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm"),
+            "nv/bert-large/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12"),
         "--train_batch_size", str(config.max_batch_size),
         "--mode", "perf",
         "--max_seq_length", str(config.sequence_length),
@@ -57,7 +57,7 @@ def main():
     if config.enable_mixed_precision:
         cmds.append("--use_mixed_precision")
-    subprocess.run(cmds, timeout=60).check_returncode()
+    subprocess.run(cmds, timeout=120).check_returncode()
     return 0

View file

@@ -38,7 +38,7 @@ def main():
     cmds = [
         os.path.join(args.binary_dir, "onnxruntime_training_bert"),
         "--model_name", os.path.join(
-            args.model_root, "nv/bert-large/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm"),
+            args.model_root, "nv/bert-large/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12"),
         "--train_data_dir", os.path.join(
             args.training_data_root, str(c.max_seq_length), "books_wiki_en_corpus/train"),
         "--test_data_dir", os.path.join(

View file

@@ -33,7 +33,7 @@ def main():
     subprocess.run([
         os.path.join(args.binary_dir, "onnxruntime_training_bert"),
         "--model_name", os.path.join(
-            args.model_root, "nv/bert-base/bert-base-uncased_L_12_H_768_A_12_V_30528_S_512_Dp_0.1_optimized_layer_norm"),
+            args.model_root, "nv/bert-base/bert-base-uncased_L_12_H_768_A_12_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12"),
         "--train_data_dir", os.path.join(
             args.training_data_root, "128/books_wiki_en_corpus/train"),
         "--test_data_dir", os.path.join(

View file

@@ -36,7 +36,7 @@ def main():
     cmds = [
         os.path.join(args.binary_dir, "onnxruntime_training_gpt2"),
         "--model_name", os.path.join(
-            args.model_root, "megatron-gpt2_hidden-size-1024_num-layers-24_vocab-size-50257_num-attention-heads-16_max-position-embeddings-1024_optimized"),
+            args.model_root, "megatron-gpt2_hidden-size-1024_num-layers-24_vocab-size-50257_num-attention-heads-16_max-position-embeddings-1024_optimized_opset12"),
         "--train_data_dir", os.path.join(
             args.training_data_root, "train"),
         "--test_data_dir", os.path.join(

View file

@@ -0,0 +1,133 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#
# This converter is an internal util to upgrade existing bert/gpt-2 models,
# which were previously transformed/optimized from the original models, to
# opset 12, replacing the deprecated TrainableDropout node with the "Dropout"
# node matching the opset 12 spec. Typically, a model to be run through this
# script has an "_optimized" substring in its name and one or more
# "TrainableDropout" nodes in its graph.
# Example usage:
# python opset12_model_transform.py bert-base-uncased_L_12_H_768_A_12_V_30528_S_512_Dp_0.1_optimized_layer_norm.onnx
# Output:
# bert-base-uncased_L_12_H_768_A_12_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12.onnx
import sys
import onnx
from onnx import helper, shape_inference
from onnx import TensorProto
import numpy as np
from onnx import numpy_helper
if len(sys.argv) < 2:
    print("Please give model path...")
    exit(1)

input_model_name = sys.argv[1]
output_model_name = input_model_name[:-5] + '_opset12.onnx'
model = onnx.load(input_model_name)

def find_input_node(model, arg):
    result = []
    for node in model.graph.node:
        for output in node.output:
            if output == arg:
                result.append(node)
    return result[0] if len(result) == 1 else None

def find_output_node(model, arg):
    result = []
    for node in model.graph.node:
        for input in node.input:
            if input == arg:
                result.append(node)
    return result[0] if len(result) == 1 else None

def find_input(model, arg):
    for initializer in model.graph.initializer:
        if initializer.name == arg:
            return initializer
    return None

def get_node_index(model, node):
    i = 0
    while i < len(model.graph.node):
        if model.graph.node[i] == node:
            break
        i += 1
    return i if i < len(model.graph.node) else None
def add_const(model, name, output, t_value=None, f_value=None):
    const_node = model.graph.node.add()
    const_node.op_type = 'Constant'
    const_node.name = name
    const_node.output.extend([output])
    attr = const_node.attribute.add()
    attr.name = 'value'
    if t_value is not None:
        attr.type = 4  # AttributeProto.TENSOR
        attr.t.CopyFrom(t_value)
    else:
        attr.type = 1  # AttributeProto.FLOAT
        attr.f = f_value
    return const_node
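
# The rewrite below maps TrainableDropout(data, ratio) onto the opset-12
# Dropout(data, ratio, training_mode): the ratio is re-emitted as a scalar
# Constant, and a scalar boolean training_mode=True Constant is added so
# dropout remains active during training.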
def process_trainabledropout(model):
    delete_nodes = []
    index = 0
    # appending to model.graph.node while iterating is safe here: the nodes
    # added below are 'Dropout'/'Constant' and never match the filter
    for node in model.graph.node:
        if node.op_type == 'TrainableDropout':
            new_dropout = model.graph.node.add()
            new_dropout.op_type = 'Dropout'
            new_dropout.name = 'Dropout_%d' % index
            # add seed attribute
            attr = new_dropout.attribute.add()
            attr.name = 'seed'
            attr.type = 2  # AttributeProto.INT (seed value defaults to 0)
            # find old ratio node
            ratio_node = find_input_node(model, node.input[1])
            assert ratio_node.op_type == 'Constant'
            delete_nodes.append(get_node_index(model, ratio_node))
            # make ratio scalar node
            ratio_attr = ratio_node.attribute
            ratio_data = numpy_helper.to_array(ratio_attr[0].t)
            ratio_scalar = ratio_data.astype(np.float32).reshape(())
            ratio_value = numpy_helper.from_array(ratio_scalar, "ratio")
            new_ratio_node = add_const(model, 'dropout_ratio_node_%d' % index, 'dropout_ratio_%d' % index, t_value=ratio_value)
            index += 1
            # add training_mode input (scalar boolean True)
            mode_scalar = np.asarray([True]).astype(bool).reshape(())
            mode_value = numpy_helper.from_array(mode_scalar, "training_mode")
            training_mode_node = add_const(model, 'dropout_training_mode_node_%d' % index, 'dropout_training_mode_%d' % index, t_value=mode_value)
            index += 1
            new_dropout.input.extend([node.input[0], new_ratio_node.output[0], training_mode_node.output[0]])
            new_dropout.output.extend(node.output)
            delete_nodes.append(get_node_index(model, node))
            index += 1
    # delete from the back so the recorded indices stay valid
    delete_nodes.sort(reverse=True)
    for d in delete_nodes:
        del model.graph.node[d]
def align_attention_mask_dim(model):
    # make the batch dimension symbolic so the model accepts any batch size
    for model_input in model.graph.input:
        if model_input.name == "attention_mask":
            model_input.type.tensor_type.shape.dim[0].dim_param = "batch"
# replace TrainableDropout with Dropout
process_trainabledropout(model)
# some gpt-2 models (large ones) still don't have this input corrected
align_attention_mask_dim(model)
# set opset version to 12
model.opset_import[0].version = 12

with open(output_model_name, "wb") as f:
    f.write(model.SerializeToString())
#
# To verify the converted model in the case of BERT, refer to the code at the end of model_transform.py
#
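
A quick spot-check of any converted model, complementing the BERT-specific verification mentioned above (a minimal sketch; the file name is just the converter's output for the bert-base example in the header):

import onnx

m = onnx.load("bert-base-uncased_L_12_H_768_A_12_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12.onnx")
# every TrainableDropout should be gone, and the default-domain opset bumped to 12
assert all(node.op_type != "TrainableDropout" for node in m.graph.node)
assert m.opset_import[0].version == 12
print("Dropout nodes:", sum(node.op_type == "Dropout" for node in m.graph.node))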