From e0334f177cb3cc5dc7aeabe8a5ef4773cd9ea902 Mon Sep 17 00:00:00 2001 From: ytaous <4484531+ytaous@users.noreply.github.com> Date: Mon, 15 Jun 2020 14:26:53 -0700 Subject: [PATCH] Opset12 upgrade for existing models used by perf/e2e pipelines (#4238) * opset12 support * opset12 support * on comments Co-authored-by: Ethan Tao --- .../tools/ci_test/download_e2e_test_data.py | 4 +- .../bert_base.convergence.baseline.csv | 12 +- .../tools/ci_test/run_batch_size_test.py | 4 +- .../tools/ci_test/run_bert_perf_test.py | 2 +- .../tools/ci_test/run_convergence_test.py | 2 +- .../tools/ci_test/run_gpt2_perf_test.py | 2 +- .../tools/scripts/opset12_model_transform.py | 133 ++++++++++++++++++ 7 files changed, 146 insertions(+), 13 deletions(-) create mode 100644 orttraining/tools/scripts/opset12_model_transform.py diff --git a/orttraining/tools/ci_test/download_e2e_test_data.py b/orttraining/tools/ci_test/download_e2e_test_data.py index eb8e443d79..ef0e64a880 100755 --- a/orttraining/tools/ci_test/download_e2e_test_data.py +++ b/orttraining/tools/ci_test/download_e2e_test_data.py @@ -11,8 +11,8 @@ import urllib.request import zipfile # update these if the E2E test data changes -ARCHIVE_BLOB_URL = "https://onnxruntimetestdata.blob.core.windows.net/training/onnxruntime_training_data.zip?snapshot=2020-04-14T02:10:05.3158245Z" -ARCHIVE_SHA256_DIGEST = "ea4168a801ded478f4e2af08232cb1174913caac300d5bf73b2652dc6894372c" +ARCHIVE_BLOB_URL = "https://onnxruntimetestdata.blob.core.windows.net/training/onnxruntime_training_data_v12.zip?snapshot=2020-06-13T06:24:15.0833240Z" +ARCHIVE_SHA256_DIGEST = "B01C169B6550D1A0A6F1B4E2F34AE2A8714B52DBB70AC04DA85D371F691BDFF9" def _download(url, local_path): urllib.request.urlretrieve(url, local_path) diff --git a/orttraining/tools/ci_test/results/bert_base.convergence.baseline.csv b/orttraining/tools/ci_test/results/bert_base.convergence.baseline.csv index b718cdf312..cd24bfef9e 100644 --- 
a/orttraining/tools/ci_test/results/bert_base.convergence.baseline.csv +++ b/orttraining/tools/ci_test/results/bert_base.convergence.baseline.csv @@ -1,11 +1,11 @@ step,total_loss,mlm_loss,nsp_loss -0,11.2031,10.4979,0.707195 +0,11.2422,10.5228,0.717476 5,10.1875,7.75453,2.43238 -10,8.33594,7.63755,0.697193 +10,8.42188,7.63755,0.792425 15,8.35156,7.60502,0.744699 -20,8.22656,7.48076,0.749099 -25,8.27344,7.56207,0.71167 +20,8.22656,7.4854,0.749099 +25,8.29688,7.56207,0.73899 30,8.125,7.40926,0.716592 -35,7.95703,7.26281,0.694741 +35,7.99219,7.26281,0.726583 40,7.94531,7.26573,0.679934 -45,7.93359,7.27335,0.661407 +45,7.94141,7.27335,0.668663 diff --git a/orttraining/tools/ci_test/run_batch_size_test.py b/orttraining/tools/ci_test/run_batch_size_test.py index c9bdc6e9d7..a8cc92b785 100755 --- a/orttraining/tools/ci_test/run_batch_size_test.py +++ b/orttraining/tools/ci_test/run_batch_size_test.py @@ -35,7 +35,7 @@ def main(): os.path.join(args.binary_dir, "onnxruntime_training_bert"), "--model_name", os.path.join( args.model_root, - "nv/bert-large/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm"), + "nv/bert-large/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12"), "--train_batch_size", str(config.max_batch_size), "--mode", "perf", "--max_seq_length", str(config.sequence_length), @@ -57,7 +57,7 @@ def main(): if config.enable_mixed_precision: cmds.append("--use_mixed_precision"), - subprocess.run(cmds, timeout=60).check_returncode() + subprocess.run(cmds, timeout=120).check_returncode() return 0 diff --git a/orttraining/tools/ci_test/run_bert_perf_test.py b/orttraining/tools/ci_test/run_bert_perf_test.py index b11aad9127..14f72a2c92 100644 --- a/orttraining/tools/ci_test/run_bert_perf_test.py +++ b/orttraining/tools/ci_test/run_bert_perf_test.py @@ -38,7 +38,7 @@ def main(): cmds = [ os.path.join(args.binary_dir, "onnxruntime_training_bert"), "--model_name", os.path.join( - args.model_root, 
"nv/bert-large/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm"), + args.model_root, "nv/bert-large/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12"), "--train_data_dir", os.path.join( args.training_data_root, str(c.max_seq_length), "books_wiki_en_corpus/train"), "--test_data_dir", os.path.join( diff --git a/orttraining/tools/ci_test/run_convergence_test.py b/orttraining/tools/ci_test/run_convergence_test.py index 4fa3c7c312..68528e2897 100755 --- a/orttraining/tools/ci_test/run_convergence_test.py +++ b/orttraining/tools/ci_test/run_convergence_test.py @@ -33,7 +33,7 @@ def main(): subprocess.run([ os.path.join(args.binary_dir, "onnxruntime_training_bert"), "--model_name", os.path.join( - args.model_root, "nv/bert-base/bert-base-uncased_L_12_H_768_A_12_V_30528_S_512_Dp_0.1_optimized_layer_norm"), + args.model_root, "nv/bert-base/bert-base-uncased_L_12_H_768_A_12_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12"), "--train_data_dir", os.path.join( args.training_data_root, "128/books_wiki_en_corpus/train"), "--test_data_dir", os.path.join( diff --git a/orttraining/tools/ci_test/run_gpt2_perf_test.py b/orttraining/tools/ci_test/run_gpt2_perf_test.py index 8f4594a4d1..3e39ffd9a6 100644 --- a/orttraining/tools/ci_test/run_gpt2_perf_test.py +++ b/orttraining/tools/ci_test/run_gpt2_perf_test.py @@ -36,7 +36,7 @@ def main(): cmds = [ os.path.join(args.binary_dir, "onnxruntime_training_gpt2"), "--model_name", os.path.join( - args.model_root, "megatron-gpt2_hidden-size-1024_num-layers-24_vocab-size-50257_num-attention-heads-16_max-position-embeddings-1024_optimized"), + args.model_root, "megatron-gpt2_hidden-size-1024_num-layers-24_vocab-size-50257_num-attention-heads-16_max-position-embeddings-1024_optimized_opset12"), "--train_data_dir", os.path.join( args.training_data_root, "train"), "--test_data_dir", os.path.join( diff --git a/orttraining/tools/scripts/opset12_model_transform.py 
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#
# Internal utility to upgrade existing BERT/GPT-2 models — previously
# transformed/optimized from the original model — to opset 12, replacing the
# deprecated TrainableDropout node with the "Dropout" node matching the
# opset-12 spec. Typically, a model run through this script has an
# "_optimized" substring in its name, and its graph contains one or more
# "TrainableDropout" nodes.
#
# Example usage:
#   python opset12_model_transform.py bert-base-uncased_L_12_H_768_A_12_V_30528_S_512_Dp_0.1_optimized_layer_norm.onnx
# Output:
#   bert-base-uncased_L_12_H_768_A_12_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12.onnx

import sys

import numpy as np
import onnx
from onnx import AttributeProto, TensorProto, helper, numpy_helper, shape_inference

if len(sys.argv) < 2:
    print("Please give model path...")
    sys.exit(1)

input_model_name = sys.argv[1]
# "<name>.onnx" -> "<name>_opset12.onnx"
output_model_name = input_model_name[:-5] + '_opset12.onnx'

model = onnx.load(input_model_name)

def find_input_node(model, arg):
    """Return the unique node whose output is `arg`, or None if not unique."""
    producers = [node for node in model.graph.node if arg in node.output]
    return producers[0] if len(producers) == 1 else None

def find_output_node(model, arg):
    """Return the unique node that consumes `arg` as an input, or None if not unique."""
    consumers = [node for node in model.graph.node if arg in node.input]
    return consumers[0] if len(consumers) == 1 else None

def find_input(model, arg):
    """Return the graph initializer named `arg`, or None if absent."""
    for initializer in model.graph.initializer:
        if initializer.name == arg:
            return initializer
    return None

def get_node_index(model, node):
    """Return the index of `node` within model.graph.node, or None if absent."""
    for i, candidate in enumerate(model.graph.node):
        if candidate == node:
            return i
    return None

def add_const(model, name, output, t_value=None, f_value=None):
    """Append a Constant node named `name` producing `output` and return it.

    Exactly one of `t_value` (a TensorProto) or `f_value` (a float) should be
    supplied; `t_value` wins when both are given.
    """
    const_node = model.graph.node.add()
    const_node.op_type = 'Constant'
    const_node.name = name
    const_node.output.extend([output])
    attr = const_node.attribute.add()
    attr.name = 'value'
    if t_value is not None:
        attr.type = AttributeProto.TENSOR  # == 4
        attr.t.CopyFrom(t_value)
    else:
        attr.type = AttributeProto.FLOAT  # == 1
        attr.f = f_value
    return const_node

def process_trainabledropout(model):
    """Replace every TrainableDropout node with an opset-12 Dropout node.

    Opset-12 Dropout takes (data, ratio, training_mode): the old 1-D ratio
    Constant is re-emitted as a rank-0 float scalar, and a scalar boolean
    training_mode Constant (True) is added. The replaced TrainableDropout
    nodes and their old ratio Constants are deleted afterwards.
    """
    delete_nodes = []
    index = 0
    # NOTE(review): new nodes are appended to model.graph.node while it is
    # being iterated; the appended nodes are 'Dropout'/'Constant' and are
    # skipped by the op_type check below, so the loop terminates.
    for node in model.graph.node:
        if node.op_type != 'TrainableDropout':
            continue
        new_dropout = model.graph.node.add()
        new_dropout.op_type = 'Dropout'
        new_dropout.name = 'Dropout_%d' % index
        # add seed attribute
        attr = new_dropout.attribute.add()
        attr.name = 'seed'
        attr.type = AttributeProto.INT  # == 2
        # find the old ratio Constant feeding the TrainableDropout
        ratio_node = find_input_node(model, node.input[1])
        assert ratio_node.op_type == 'Constant'
        delete_nodes.append(get_node_index(model, ratio_node))
        # re-emit the ratio as a scalar (rank-0) float32 Constant
        ratio_data = numpy_helper.to_array(ratio_node.attribute[0].t)
        ratio_scalar = ratio_data.astype(np.float32).reshape(())
        ratio_value = numpy_helper.from_array(ratio_scalar, "ratio")
        new_ratio_node = add_const(
            model, 'dropout_ratio_node_%d' % index,
            'dropout_ratio_%d' % index, t_value=ratio_value)
        index += 1
        # add a scalar boolean training_mode input (always True here)
        # (plain `bool` instead of the deprecated np.bool alias, which was
        # removed in NumPy 1.24)
        mode_scalar = np.asarray([True]).astype(bool).reshape(())
        mode_value = numpy_helper.from_array(mode_scalar, "training_mode")
        training_mode_node = add_const(
            model, 'dropout_training_mode_node_%d' % index,
            'dropout_training_mode_%d' % index, t_value=mode_value)
        index += 1

        new_dropout.input.extend(
            [node.input[0], new_ratio_node.output[0], training_mode_node.output[0]])
        new_dropout.output.extend(node.output)
        delete_nodes.append(get_node_index(model, node))
        index += 1

    # delete from the back so earlier indices stay valid
    delete_nodes.sort(reverse=True)
    for d in delete_nodes:
        del model.graph.node[d]
def align_attention_mask_dim(model):
    """Make the leading (batch) dimension of the "attention_mask" graph input
    symbolic ("batch") rather than a fixed size."""
    for graph_input in model.graph.input:
        if graph_input.name != "attention_mask":
            continue
        graph_input.type.tensor_type.shape.dim[0].dim_param = "batch"


# Replace deprecated TrainableDropout nodes with opset-12 Dropout.
process_trainabledropout(model)
# Some (large) GPT-2 models still ship without a symbolic batch dimension on
# this input; correct it here.
align_attention_mask_dim(model)

# Stamp the graph as opset 12.
model.opset_import[0].version = 12

with open(output_model_name, "wb") as f:
    f.write(model.SerializeToString())

# To verify the converted model in the BERT case, refer to the code at the
# end of model_transform.py.