From 90f976d0603eedc02f88e9ec0deea207305e6edd Mon Sep 17 00:00:00 2001 From: Ye Wang <52801275+wangyems@users.noreply.github.com> Date: Thu, 8 Oct 2020 19:35:17 -0700 Subject: [PATCH] Some improvements on transformers tool (#5383) * modify tensorflow benchmark gpu setting * add export from tf choice in script * fix typo * match more embedlayernorm pattern * format --- .../python/tools/transformers/benchmark.py | 31 ++++++++++++++++++- .../tools/transformers/fusion_embedlayer.py | 23 ++++++++------ .../tools/transformers/onnx_exporter.py | 9 +++++- .../tools/transformers/run_benchmark.sh | 8 +++++ 4 files changed, 60 insertions(+), 11 deletions(-) diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py index 1cfbd73e3b..4884310a39 100644 --- a/onnxruntime/python/tools/transformers/benchmark.py +++ b/onnxruntime/python/tools/transformers/benchmark.py @@ -240,6 +240,31 @@ def run_pytorch(use_gpu, model_names, model_class, precision, num_threads, batch return results +def run_with_tf_optimizations(do_eager_mode: bool, use_xla: bool): + import tensorflow as tf + from functools import wraps + + def run_func(func): + @wraps(func) + def run_in_eager_mode(*args, **kwargs): + return func(*args, **kwargs) + + @wraps(func) + @tf.function(experimental_compile=use_xla) + def run_in_graph_mode(*args, **kwargs): + return func(*args, **kwargs) + + if do_eager_mode is True: + assert ( + use_xla is False + ), "Cannot run model in XLA, if `args.eager_mode` is set to `True`. Please set `args.eager_mode=False`." 
+ return run_in_eager_mode + else: + return run_in_graph_mode + + return run_func + + def run_tensorflow(use_gpu, model_names, model_class, precision, num_threads, batch_sizes, sequence_lengths, repeat_times, cache_dir, verbose): results = [] @@ -258,6 +283,8 @@ def run_tensorflow(use_gpu, model_names, model_class, precision, num_threads, ba physical_devices = tf.config.list_physical_devices('GPU') try: tf.config.set_visible_devices(physical_devices[0], 'GPU') + tf.config.experimental.set_memory_growth(physical_devices[0], True) + tf.distribute.OneDeviceStrategy(device='/gpu:0') except RuntimeError as e: logger.exception(e) @@ -295,10 +322,12 @@ def run_tensorflow(use_gpu, model_names, model_class, precision, num_threads, ba input_ids = tf.constant(values, shape=(batch_size, sequence_length), dtype=tf.int32) try: - + # Disable both for better inference perf + @run_with_tf_optimizations(do_eager_mode=False, use_xla=False) def encoder_forward(): return model(input_ids, training=False) + @run_with_tf_optimizations(do_eager_mode=False, use_xla=False) def encoder_decoder_forward(): return model(input_ids, decoder_input_ids=input_ids, training=False) diff --git a/onnxruntime/python/tools/transformers/fusion_embedlayer.py b/onnxruntime/python/tools/transformers/fusion_embedlayer.py index ff43278690..c5812caab6 100644 --- a/onnxruntime/python/tools/transformers/fusion_embedlayer.py +++ b/onnxruntime/python/tools/transformers/fusion_embedlayer.py @@ -130,13 +130,13 @@ class FusionEmbedLayerNoMask(Fusion): input_ids = word_embedding_gather.input[1] - position_embedding_expand = None + position_embedding_node_before_gather = None position_embedding_shape = None position_embedding_path = self.model.match_parent_path(normalize_node, ['Gather', 'Expand'], [1, 1]) # for distill-bert if position_embedding_path is not None: - position_embedding_weight_node, position_embedding_expand = position_embedding_path + position_embedding_weight_node, position_embedding_node_before_gather = 
position_embedding_path else: position_embedding_path = self.model.match_parent_path(normalize_node, ['Reshape', 'Slice'], [1, 0]) if position_embedding_path is not None: @@ -145,28 +145,33 @@ class FusionEmbedLayerNoMask(Fusion): position_embedding_path = self.model.match_parent_path(add_node, ['Gather', 'Expand', 'Shape'], [1, 1, 1]) if position_embedding_path is not None: - position_embedding_weight_node, position_embedding_expand, position_embedding_shape = position_embedding_path + position_embedding_weight_node, position_embedding_node_before_gather, position_embedding_shape = position_embedding_path else: position_embedding_path = self.model.match_parent_path( add_node, ['Gather', 'Expand', 'Concat', 'Unsqueeze', 'Gather', 'Shape'], [1, 1, 1, 1, 0, 0]) if position_embedding_path is not None: - position_embedding_weight_node, position_embedding_expand, _, _, _, position_embedding_shape = position_embedding_path + position_embedding_weight_node, position_embedding_node_before_gather, _, _, _, position_embedding_shape = position_embedding_path else: # Here we will not try to get exact match. Instead, we only try identify position embedding weights. position_embedding_path = self.model.match_parent_path(add_node, ['Gather', 'Expand'], [1, 1]) if position_embedding_path is not None: - position_embedding_weight_node, position_embedding_expand = position_embedding_path + position_embedding_weight_node, position_embedding_node_before_gather = position_embedding_path else: - logger.info("Position embedding path is not found. Embed layer cannot be fused.") - return + position_embedding_path = self.model.match_parent_path(add_node, ['Gather', 'Slice'], + [1, 1]) + if position_embedding_path is not None: + position_embedding_weight_node, position_embedding_node_before_gather = position_embedding_path + else: + logger.info("Position embedding path is not found. 
Embed layer cannot be fused.") + return if position_embedding_shape is not None and position_embedding_shape.input[0] != input_ids: logger.info("position and word embedding is expected to be applied on same input") return - if position_embedding_expand and position_embedding_shape: + if position_embedding_node_before_gather and position_embedding_shape: input_parent = self.model.get_parent(position_embedding_shape, 0, output_name_to_node) - subgraph_nodes = self.model.get_parent_subgraph_nodes(position_embedding_expand, + subgraph_nodes = self.model.get_parent_subgraph_nodes(position_embedding_node_before_gather, [input_parent] if input_parent else [], output_name_to_node) self.nodes_to_remove.extend(subgraph_nodes) diff --git a/onnxruntime/python/tools/transformers/onnx_exporter.py b/onnxruntime/python/tools/transformers/onnx_exporter.py index 145ff4b0fd..8660abf4e4 100644 --- a/onnxruntime/python/tools/transformers/onnx_exporter.py +++ b/onnxruntime/python/tools/transformers/onnx_exporter.py @@ -261,7 +261,11 @@ def load_pt_model(model_name, model_class, cache_dir): def load_tf_model(model_name, model_class, cache_dir): config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir) - model = load_pretrained_model(model_name, config=config, cache_dir=cache_dir, custom_model_class=model_class, is_tf_model=True) + model = load_pretrained_model(model_name, + config=config, + cache_dir=cache_dir, + custom_model_class=model_class, + is_tf_model=True) return config, model @@ -366,6 +370,9 @@ def export_onnx_model_from_pt(model_name, opset_version, use_external_data_forma def export_onnx_model_from_tf(model_name, opset_version, use_external_data_format, model_type, model_class, cache_dir, onnx_dir, input_names, use_gpu, precision, optimize_onnx, validate_onnx, use_raw_attention_mask, overwrite, model_fusion_statistics): + # Use CPU to export + import tensorflow as tf + tf.config.set_visible_devices([], 'GPU') config, model = load_tf_model(model_name, model_class, 
cache_dir) diff --git a/onnxruntime/python/tools/transformers/run_benchmark.sh b/onnxruntime/python/tools/transformers/run_benchmark.sh index f826cd5118..51d7ead1ae 100644 --- a/onnxruntime/python/tools/transformers/run_benchmark.sh +++ b/onnxruntime/python/tools/transformers/run_benchmark.sh @@ -22,6 +22,9 @@ run_torch=false run_torchscript=true run_tensorflow=false +# Onnx model source (default is from pytorch, set export_onnx_from_tf=true to convert from tensorflow model) +export_onnx_from_tf=false + # Devices to test (You can run either CPU or GPU, but not both: gpu need onnxruntime-gpu, and CPU need onnxruntime). run_gpu_fp32=true run_gpu_fp16=true @@ -100,6 +103,11 @@ fi onnx_export_options="-i $input_counts -v -b 0 --overwrite -f fusion.csv -c $cache_dir --onnx_dir $onnx_dir" benchmark_options="-b $batch_sizes -s $sequence_lengths -t $average_over -f fusion.csv -r result.csv -d detail.csv -c $cache_dir --onnx_dir $onnx_dir" +if [ "$export_onnx_from_tf" = true ] ; then + onnx_export_options="$onnx_export_options --model_source tf" + benchmark_options="$benchmark_options --model_source tf" +fi + if [ "$use_optimizer" = true ] ; then onnx_export_options="$onnx_export_options -o" benchmark_options="$benchmark_options -o"