Some improvements on transformers tool (#5383)

* modify tensorflow benchmark gpu setting * add export from tf choice in script * fix typo * match more embedlayernorm pattern * format
2026-07-11 17:48:34 +00:00 · 2020-10-08 19:35:17 -07:00 · 2020-10-08 19:35:17 -07:00 · 90f976d060
commit 90f976d060
parent fab7f799a7
4 changed files with 60 additions and 11 deletions
--- a/onnxruntime/python/tools/transformers/benchmark.py
+++ b/onnxruntime/python/tools/transformers/benchmark.py
@ -240,6 +240,31 @@ def run_pytorch(use_gpu, model_names, model_class, precision, num_threads, batch
    return results


+def run_with_tf_optimizations(do_eager_mode: bool, use_xla: bool):
+    import tensorflow as tf
+    from functools import wraps
+
+    def run_func(func):
+        @wraps(func)
+        def run_in_eager_mode(*args, **kwargs):
+            return func(*args, **kwargs)
+
+        @wraps(func)
+        @tf.function(experimental_compile=use_xla)
+        def run_in_graph_mode(*args, **kwargs):
+            return func(*args, **kwargs)
+
+        if do_eager_mode is True:
+            assert (
+                use_xla is False
+            ), "Cannot run model in XLA, if `args.eager_mode` is set to `True`. Please set `args.eager_mode=False`."
+            return run_in_eager_mode
+        else:
+            return run_in_graph_mode
+
+    return run_func
+
+
 def run_tensorflow(use_gpu, model_names, model_class, precision, num_threads, batch_sizes, sequence_lengths,
                   repeat_times, cache_dir, verbose):
    results = []
@ -258,6 +283,8 @@ def run_tensorflow(use_gpu, model_names, model_class, precision, num_threads, ba
        physical_devices = tf.config.list_physical_devices('GPU')
        try:
            tf.config.set_visible_devices(physical_devices[0], 'GPU')
+            tf.config.experimental.set_memory_growth(physical_devices[0], True)
+            tf.distribute.OneDeviceStrategy(device='/gpu:0')
        except RuntimeError as e:
            logger.exception(e)

@ -295,10 +322,12 @@ def run_tensorflow(use_gpu, model_names, model_class, precision, num_threads, ba
                input_ids = tf.constant(values, shape=(batch_size, sequence_length), dtype=tf.int32)

                try:
-
+                    # Disable both for better inference perf
+                    @run_with_tf_optimizations(do_eager_mode=False, use_xla=False)
                    def encoder_forward():
                        return model(input_ids, training=False)

+                    @run_with_tf_optimizations(do_eager_mode=False, use_xla=False)
                    def encoder_decoder_forward():
                        return model(input_ids, decoder_input_ids=input_ids, training=False)

--- a/onnxruntime/python/tools/transformers/fusion_embedlayer.py
+++ b/onnxruntime/python/tools/transformers/fusion_embedlayer.py
@ -130,13 +130,13 @@ class FusionEmbedLayerNoMask(Fusion):

        input_ids = word_embedding_gather.input[1]

-        position_embedding_expand = None
+        position_embedding_node_before_gather = None
        position_embedding_shape = None

        position_embedding_path = self.model.match_parent_path(normalize_node, ['Gather', 'Expand'],
                                                               [1, 1])  # for distill-bert
        if position_embedding_path is not None:
-            position_embedding_weight_node, position_embedding_expand = position_embedding_path
+            position_embedding_weight_node, position_embedding_node_before_gather = position_embedding_path
        else:
            position_embedding_path = self.model.match_parent_path(normalize_node, ['Reshape', 'Slice'], [1, 0])
            if position_embedding_path is not None:
@ -145,28 +145,33 @@ class FusionEmbedLayerNoMask(Fusion):
                position_embedding_path = self.model.match_parent_path(add_node, ['Gather', 'Expand', 'Shape'],
                                                                       [1, 1, 1])
                if position_embedding_path is not None:
-                    position_embedding_weight_node, position_embedding_expand, position_embedding_shape = position_embedding_path
+                    position_embedding_weight_node, position_embedding_node_before_gather, position_embedding_shape = position_embedding_path
                else:
                    position_embedding_path = self.model.match_parent_path(
                        add_node, ['Gather', 'Expand', 'Concat', 'Unsqueeze', 'Gather', 'Shape'], [1, 1, 1, 1, 0, 0])
                    if position_embedding_path is not None:
-                        position_embedding_weight_node, position_embedding_expand, _, _, _, position_embedding_shape = position_embedding_path
+                        position_embedding_weight_node, position_embedding_node_before_gather, _, _, _, position_embedding_shape = position_embedding_path
                    else:
                        # Here we will not try to get exact match. Instead, we only try identify position embedding weights.
                        position_embedding_path = self.model.match_parent_path(add_node, ['Gather', 'Expand'], [1, 1])
                        if position_embedding_path is not None:
-                            position_embedding_weight_node, position_embedding_expand = position_embedding_path
+                            position_embedding_weight_node, position_embedding_node_before_gather = position_embedding_path
                        else:
-                            logger.info("Position embedding path is not found. Embed layer cannot be fused.")
-                            return
+                            position_embedding_path = self.model.match_parent_path(add_node, ['Gather', 'Slice'],
+                                                                                   [1, 1])
+                            if position_embedding_path is not None:
+                                position_embedding_weight_node, position_embedding_node_before_gather = position_embedding_path
+                            else:
+                                logger.info("Position embedding path is not found. Embed layer cannot be fused.")
+                                return

                if position_embedding_shape is not None and position_embedding_shape.input[0] != input_ids:
                    logger.info("position and word embedding is expected to be applied on same input")
                    return

-        if position_embedding_expand and position_embedding_shape:
+        if position_embedding_node_before_gather and position_embedding_shape:
            input_parent = self.model.get_parent(position_embedding_shape, 0, output_name_to_node)
-            subgraph_nodes = self.model.get_parent_subgraph_nodes(position_embedding_expand,
+            subgraph_nodes = self.model.get_parent_subgraph_nodes(position_embedding_node_before_gather,
                                                                  [input_parent] if input_parent else [],
                                                                  output_name_to_node)
            self.nodes_to_remove.extend(subgraph_nodes)
--- a/onnxruntime/python/tools/transformers/onnx_exporter.py
+++ b/onnxruntime/python/tools/transformers/onnx_exporter.py
@ -261,7 +261,11 @@ def load_pt_model(model_name, model_class, cache_dir):
 def load_tf_model(model_name, model_class, cache_dir):
    config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir)

-    model = load_pretrained_model(model_name, config=config, cache_dir=cache_dir, custom_model_class=model_class, is_tf_model=True)
+    model = load_pretrained_model(model_name,
+                                  config=config,
+                                  cache_dir=cache_dir,
+                                  custom_model_class=model_class,
+                                  is_tf_model=True)

    return config, model

@ -366,6 +370,9 @@ def export_onnx_model_from_pt(model_name, opset_version, use_external_data_forma
 def export_onnx_model_from_tf(model_name, opset_version, use_external_data_format, model_type, model_class, cache_dir,
                              onnx_dir, input_names, use_gpu, precision, optimize_onnx, validate_onnx,
                              use_raw_attention_mask, overwrite, model_fusion_statistics):
+    # Use CPU to export
+    import tensorflow as tf
+    tf.config.set_visible_devices([], 'GPU')

    config, model = load_tf_model(model_name, model_class, cache_dir)

--- a/onnxruntime/python/tools/transformers/run_benchmark.sh
+++ b/onnxruntime/python/tools/transformers/run_benchmark.sh
@ -22,6 +22,9 @@ run_torch=false
 run_torchscript=true
 run_tensorflow=false

+# Onnx model source (default is from pytorch, set export_onnx_from_tf=true to convert from tensorflow model)
+export_onnx_from_tf=false
+
 # Devices to test (You can run either CPU or GPU, but not both: gpu need onnxruntime-gpu, and CPU need onnxruntime).
 run_gpu_fp32=true
 run_gpu_fp16=true
@ -100,6 +103,11 @@ fi
 onnx_export_options="-i $input_counts -v -b 0 --overwrite -f fusion.csv -c $cache_dir --onnx_dir $onnx_dir"
 benchmark_options="-b $batch_sizes -s $sequence_lengths -t $average_over -f fusion.csv -r result.csv -d detail.csv -c $cache_dir --onnx_dir $onnx_dir"

+if [ "$export_onnx_from_tf" = true ] ; then
+  onnx_export_options="$onnx_export_options --model_source tf"
+  benchmark_options="$benchmark_options --model_source tf"
+fi
+
 if [ "$use_optimizer" = true ] ; then
  onnx_export_options="$onnx_export_options -o"
  benchmark_options="$benchmark_options -o"