From 90f976d0603eedc02f88e9ec0deea207305e6edd Mon Sep 17 00:00:00 2001
From: Ye Wang <52801275+wangyems@users.noreply.github.com>
Date: Thu, 8 Oct 2020 19:35:17 -0700
Subject: [PATCH] Some improvements on transformers tool (#5383)

* modify tensorflow benchmark gpu setting

* add export from tf choice in script

* fix typo

* match more embedlayernorm pattern

* format
---
 .../python/tools/transformers/benchmark.py    | 31 ++++++++++++++++++-
 .../tools/transformers/fusion_embedlayer.py   | 23 ++++++++------
 .../tools/transformers/onnx_exporter.py       |  9 +++++-
 .../tools/transformers/run_benchmark.sh       |  8 +++++
 4 files changed, 60 insertions(+), 11 deletions(-)

diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py
index 1cfbd73e3b..4884310a39 100644
--- a/onnxruntime/python/tools/transformers/benchmark.py
+++ b/onnxruntime/python/tools/transformers/benchmark.py
@@ -240,6 +240,31 @@ def run_pytorch(use_gpu, model_names, model_class, precision, num_threads, batch
     return results
 
 
+def run_with_tf_optimizations(do_eager_mode: bool, use_xla: bool):  # decorator factory; returns `run_func`
+    import tensorflow as tf  # imported inside the function rather than at module level
+    from functools import wraps
+
+    def run_func(func):  # the actual decorator: builds both wrappers below, returns one of them
+        @wraps(func)
+        def run_in_eager_mode(*args, **kwargs):
+            return func(*args, **kwargs)  # plain call, no tf.function involved
+
+        @wraps(func)
+        @tf.function(experimental_compile=use_xla)  # use_xla is forwarded as experimental_compile
+        def run_in_graph_mode(*args, **kwargs):
+            return func(*args, **kwargs)
+
+        if do_eager_mode is True:  # identity check: any value other than the bool True selects graph mode
+            assert (
+                use_xla is False
+            ), "Cannot run model in XLA, if `args.eager_mode` is set to `True`. Please set `args.eager_mode=False`."
+            return run_in_eager_mode
+        else:
+            return run_in_graph_mode
+
+    return run_func
+
+
 def run_tensorflow(use_gpu, model_names, model_class, precision, num_threads, batch_sizes, sequence_lengths,
                    repeat_times, cache_dir, verbose):
     results = []
@@ -258,6 +283,8 @@ def run_tensorflow(use_gpu, model_names, model_class, precision, num_threads, ba
         physical_devices = tf.config.list_physical_devices('GPU')
         try:
             tf.config.set_visible_devices(physical_devices[0], 'GPU')
+            tf.config.experimental.set_memory_growth(physical_devices[0], True)
+            tf.distribute.OneDeviceStrategy(device='/gpu:0')
         except RuntimeError as e:
             logger.exception(e)
 
@@ -295,10 +322,12 @@ def run_tensorflow(use_gpu, model_names, model_class, precision, num_threads, ba
                 input_ids = tf.constant(values, shape=(batch_size, sequence_length), dtype=tf.int32)
 
                 try:
-
+                    # Disable both eager mode and XLA (i.e. run in plain graph mode) for better inference perf
+                    @run_with_tf_optimizations(do_eager_mode=False, use_xla=False)
                     def encoder_forward():
                         return model(input_ids, training=False)
 
+                    @run_with_tf_optimizations(do_eager_mode=False, use_xla=False)
                     def encoder_decoder_forward():
                         return model(input_ids, decoder_input_ids=input_ids, training=False)
 
diff --git a/onnxruntime/python/tools/transformers/fusion_embedlayer.py b/onnxruntime/python/tools/transformers/fusion_embedlayer.py
index ff43278690..c5812caab6 100644
--- a/onnxruntime/python/tools/transformers/fusion_embedlayer.py
+++ b/onnxruntime/python/tools/transformers/fusion_embedlayer.py
@@ -130,13 +130,13 @@ class FusionEmbedLayerNoMask(Fusion):
 
         input_ids = word_embedding_gather.input[1]
 
-        position_embedding_expand = None
+        position_embedding_node_before_gather = None
         position_embedding_shape = None
 
         position_embedding_path = self.model.match_parent_path(normalize_node, ['Gather', 'Expand'],
                                                                [1, 1])  # for distill-bert
         if position_embedding_path is not None:
-            position_embedding_weight_node, position_embedding_expand = position_embedding_path
+            position_embedding_weight_node, position_embedding_node_before_gather = position_embedding_path
         else:
             position_embedding_path = self.model.match_parent_path(normalize_node, ['Reshape', 'Slice'], [1, 0])
             if position_embedding_path is not None:
@@ -145,28 +145,33 @@ class FusionEmbedLayerNoMask(Fusion):
                 position_embedding_path = self.model.match_parent_path(add_node, ['Gather', 'Expand', 'Shape'],
                                                                        [1, 1, 1])
                 if position_embedding_path is not None:
-                    position_embedding_weight_node, position_embedding_expand, position_embedding_shape = position_embedding_path
+                    position_embedding_weight_node, position_embedding_node_before_gather, position_embedding_shape = position_embedding_path
                 else:
                     position_embedding_path = self.model.match_parent_path(
                         add_node, ['Gather', 'Expand', 'Concat', 'Unsqueeze', 'Gather', 'Shape'], [1, 1, 1, 1, 0, 0])
                     if position_embedding_path is not None:
-                        position_embedding_weight_node, position_embedding_expand, _, _, _, position_embedding_shape = position_embedding_path
+                        position_embedding_weight_node, position_embedding_node_before_gather, _, _, _, position_embedding_shape = position_embedding_path
                     else:
                         # Here we will not try to get exact match. Instead, we only try identify position embedding weights.
                         position_embedding_path = self.model.match_parent_path(add_node, ['Gather', 'Expand'], [1, 1])
                         if position_embedding_path is not None:
-                            position_embedding_weight_node, position_embedding_expand = position_embedding_path
+                            position_embedding_weight_node, position_embedding_node_before_gather = position_embedding_path
                         else:
-                            logger.info("Position embedding path is not found. Embed layer cannot be fused.")
-                            return
+                            position_embedding_path = self.model.match_parent_path(add_node, ['Gather', 'Slice'],
+                                                                                   [1, 1])
+                            if position_embedding_path is not None:
+                                position_embedding_weight_node, position_embedding_node_before_gather = position_embedding_path
+                            else:
+                                logger.info("Position embedding path is not found. Embed layer cannot be fused.")
+                                return
 
                 if position_embedding_shape is not None and position_embedding_shape.input[0] != input_ids:
                     logger.info("position and word embedding is expected to be applied on same input")
                     return
 
-        if position_embedding_expand and position_embedding_shape:
+        if position_embedding_node_before_gather and position_embedding_shape:
             input_parent = self.model.get_parent(position_embedding_shape, 0, output_name_to_node)
-            subgraph_nodes = self.model.get_parent_subgraph_nodes(position_embedding_expand,
+            subgraph_nodes = self.model.get_parent_subgraph_nodes(position_embedding_node_before_gather,
                                                                   [input_parent] if input_parent else [],
                                                                   output_name_to_node)
             self.nodes_to_remove.extend(subgraph_nodes)
diff --git a/onnxruntime/python/tools/transformers/onnx_exporter.py b/onnxruntime/python/tools/transformers/onnx_exporter.py
index 145ff4b0fd..8660abf4e4 100644
--- a/onnxruntime/python/tools/transformers/onnx_exporter.py
+++ b/onnxruntime/python/tools/transformers/onnx_exporter.py
@@ -261,7 +261,11 @@ def load_pt_model(model_name, model_class, cache_dir):
 def load_tf_model(model_name, model_class, cache_dir):
     config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir)
 
-    model = load_pretrained_model(model_name, config=config, cache_dir=cache_dir, custom_model_class=model_class, is_tf_model=True)
+    model = load_pretrained_model(model_name,
+                                  config=config,
+                                  cache_dir=cache_dir,
+                                  custom_model_class=model_class,
+                                  is_tf_model=True)
 
     return config, model
 
@@ -366,6 +370,9 @@ def export_onnx_model_from_pt(model_name, opset_version, use_external_data_forma
 def export_onnx_model_from_tf(model_name, opset_version, use_external_data_format, model_type, model_class, cache_dir,
                               onnx_dir, input_names, use_gpu, precision, optimize_onnx, validate_onnx,
                               use_raw_attention_mask, overwrite, model_fusion_statistics):
+    # Use CPU to export: hide all GPU devices from TensorFlow
+    import tensorflow as tf
+    tf.config.set_visible_devices([], 'GPU')
 
     config, model = load_tf_model(model_name, model_class, cache_dir)
 
diff --git a/onnxruntime/python/tools/transformers/run_benchmark.sh b/onnxruntime/python/tools/transformers/run_benchmark.sh
index f826cd5118..51d7ead1ae 100644
--- a/onnxruntime/python/tools/transformers/run_benchmark.sh
+++ b/onnxruntime/python/tools/transformers/run_benchmark.sh
@@ -22,6 +22,9 @@ run_torch=false
 run_torchscript=true
 run_tensorflow=false
 
+# ONNX model source (default is PyTorch; set export_onnx_from_tf=true to convert from a TensorFlow model instead)
+export_onnx_from_tf=false
+
 # Devices to test (You can run either CPU or GPU, but not both: gpu need onnxruntime-gpu, and CPU need onnxruntime).
 run_gpu_fp32=true
 run_gpu_fp16=true
@@ -100,6 +103,11 @@ fi
 onnx_export_options="-i $input_counts -v -b 0 --overwrite -f fusion.csv -c $cache_dir --onnx_dir $onnx_dir"
 benchmark_options="-b $batch_sizes -s $sequence_lengths -t $average_over -f fusion.csv -r result.csv -d detail.csv -c $cache_dir --onnx_dir $onnx_dir"
 
+if [ "$export_onnx_from_tf" = true ] ; then
+  onnx_export_options="$onnx_export_options --model_source tf"
+  benchmark_options="$benchmark_options --model_source tf"
+fi
+
 if [ "$use_optimizer" = true ] ; then
   onnx_export_options="$onnx_export_options -o"
   benchmark_options="$benchmark_options -o"