From 47f136e2d3240e725b1c4b2bedeb5de0c2db240b Mon Sep 17 00:00:00 2001
From: petermcaughan <peter.mcaughan@gmail.com>
Date: Fri, 30 Jun 2023 12:22:30 -0700
Subject: [PATCH] Speed Up Whisper Export (#16504)

### Description
Add a greedy option to the initializer deduplication process in the
Whisper export.

Currently, to detect shared initializers, ORT compares every initializer
against every other initializer (n^2). In the comparison operator, if
the two initializers have different data types (e.g. raw_data and
int_64), both initializers are converted to a numpy array and the cast
result is compared. This cast is repeated in every such comparison, so
its cost grows quadratically with the number of initializers. This
cast operation is the bottleneck for the current Whisper export script.

The conversion to the numpy array is useful for detecting equal
initializer values across nodes of different data types (e.g.
recognizing a bias value of 0.0 is the same as a slice index of 0) but
isn't triggered when comparing initializers of the same data type (e.g.
weight value of 0.6 == weight value of 0.6). The latter case accounts
for most of the benefit for Whisper, so skipping the numpy-array
comparison path for initializers saves a lot of time at minimal
cost.

In other words, this PR adds an option to remove the ability to detect
shared initializers of different types (e.g. Slice Index and MatMul
Constant) while retaining the ability to deduplicate weights.


### Motivation and Context
<!-- - Why is this change required? What problem does it solve? -->
- Current time to export Whisper-large is prohibitive.

---------

Co-authored-by: Peter McAughan <petermca@microsoft.com>
---
 .../tools/transformers/convert_generation.py       | 12 +++++++-----
 .../transformers/models/whisper/whisper_chain.py   |  2 +-
 .../python/tools/transformers/onnx_model.py        | 14 +++++++++++---
 3 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/onnxruntime/python/tools/transformers/convert_generation.py b/onnxruntime/python/tools/transformers/convert_generation.py
index 162713e54b..90ae35a931 100644
--- a/onnxruntime/python/tools/transformers/convert_generation.py
+++ b/onnxruntime/python/tools/transformers/convert_generation.py
@@ -889,6 +889,7 @@ def remove_shared_initializers(
     graph2: GraphProto,
     shared_prefix: str = "shared_",
     min_elements: int = 1024,
+    require_raw_data: bool = False,
 ):
     """Remove initializers with same value from two graphs.
 
@@ -897,6 +898,7 @@ def remove_shared_initializers(
         graph2 (GraphProto): the second graph to process
         shared_prefix (str): add prefix to the shared initializers among two graphs
         min_elements (int, optional): minimal number of elements for initializers to be considered. Defaults to 1024.
+        require_raw_data (bool, optional): Only remove tensors with raw_data field to speed up method
     """
 
     mapping_initializers_1 = {}
@@ -913,7 +915,7 @@ def remove_shared_initializers(
             if not (initializer2.dims and sum(initializer2.dims) >= min_elements):
                 continue
 
-            if OnnxModel.has_same_value(initializer1, initializer2):
+            if OnnxModel.has_same_value(initializer1, initializer2, require_raw_data=True):
                 mapping_initializers_1[initializer1.name] = shared_prefix + initializer2.name
                 shared_initializers_1.append(initializer1)
 
@@ -986,14 +988,14 @@ def remove_shared_initializers(
     return shared_initializers_2
 
 
-def get_shared_initializers(encoder_model: ModelProto, decoder_model: ModelProto):
+def get_shared_initializers(encoder_model: ModelProto, decoder_model: ModelProto, require_raw_data: bool = False):
     encoder = OnnxModel(encoder_model)
     decoder = OnnxModel(decoder_model)
     encoder.add_prefix_to_names("e_")
     decoder.add_prefix_to_names("d_")
-    encoder.remove_duplicated_initializer()
-    decoder.remove_duplicated_initializer()
-    initializers = remove_shared_initializers(encoder.model.graph, decoder.model.graph, "s_")
+    encoder.remove_duplicated_initializer(require_raw_data)
+    decoder.remove_duplicated_initializer(require_raw_data)
+    initializers = remove_shared_initializers(decoder.model.graph, encoder.model.graph, "s_", require_raw_data)
     return initializers
 
 
diff --git a/onnxruntime/python/tools/transformers/models/whisper/whisper_chain.py b/onnxruntime/python/tools/transformers/models/whisper/whisper_chain.py
index d3a47200c5..1a20cbd101 100644
--- a/onnxruntime/python/tools/transformers/models/whisper/whisper_chain.py
+++ b/onnxruntime/python/tools/transformers/models/whisper/whisper_chain.py
@@ -141,7 +141,7 @@ def chain_model(args):
 
     # Initializers/opsets
     # Delete shared data between decoder/encoder and move to larger graph initializers
-    initializers = get_shared_initializers(encoder_model, decoder_model)
+    initializers = get_shared_initializers(encoder_model, decoder_model, require_raw_data=True)
     node.attribute.extend(
         [
             helper.make_attribute("decoder", decoder_model.graph),
diff --git a/onnxruntime/python/tools/transformers/onnx_model.py b/onnxruntime/python/tools/transformers/onnx_model.py
index 3b1c624720..ead61df9f3 100644
--- a/onnxruntime/python/tools/transformers/onnx_model.py
+++ b/onnxruntime/python/tools/transformers/onnx_model.py
@@ -1092,13 +1092,15 @@ class OnnxModel:
         return op_count
 
     @staticmethod
-    def has_same_value(tensor1: TensorProto, tensor2: TensorProto) -> bool:
+    def has_same_value(tensor1: TensorProto, tensor2: TensorProto, require_raw_data: bool = False) -> bool:
         """Returns True when two tensors have same value.
            Note that name can be different.
 
         Args:
             tensor1 (TensorProto): initializer 1
             tensor2 (TensorProto): initializer 2
+            require_raw_data (bool): ignore tensors without raw_data
+                Note: Flag can speed up runtime significantly
 
         Returns:
             bool: True when two intializers has same value.
@@ -1107,11 +1109,15 @@ class OnnxModel:
             return False
         if tensor1.HasField("raw_data") and tensor2.HasField("raw_data"):
             return tensor1.raw_data == tensor2.raw_data
+        if require_raw_data:
+            return False
+
         return (numpy_helper.to_array(tensor1) == numpy_helper.to_array(tensor2)).all()
 
-    def remove_duplicated_initializer(self):
+    def remove_duplicated_initializer(self, require_raw_data: bool = False):
         """Remove initializers with duplicated values, and only keep the first one.
         It could help reduce size of models (like ALBert) with shared weights.
+        If require_raw_data passed, method will only compare raw_data initializers to speed runtime
         Note: this function does not process subgraph.
         """
         if len(self.graphs()) > 1:
@@ -1124,7 +1130,9 @@ class OnnxModel:
             if same[i] >= 0:
                 continue
             for j in range(i + 1, initializer_count):
-                if OnnxModel.has_same_value(self.model.graph.initializer[i], self.model.graph.initializer[j]):
+                if OnnxModel.has_same_value(
+                    self.model.graph.initializer[i], self.model.graph.initializer[j], require_raw_data
+                ):
                     same[j] = i
 
         count = 0