Speed Up Whisper Export (#16504)

### Description Add a greedy option to the initializer deduplication process in the Whisper export. Currently, to detect shared initializers, ORT compares every initializer against every other initializer (n^2 comparisons). In the comparison operator, if the two initializers have different data types (e.g. raw_data and int_64), both initializers are converted to numpy arrays and the cast results are compared. This cast happens in every such comparison, so its cost is multiplied across the n^2 comparisons and greatly inflates the runtime of finding shared initializers. This cast operation is the bottleneck for the current Whisper export script. The conversion to a numpy array is useful for detecting equal initializer values across nodes of different data types (e.g. recognizing that a bias value of 0.0 is the same as a slice index of 0), but it isn't triggered when comparing initializers of the same data type (e.g. weight value of 0.6 == weight value of 0.6). The latter case is where most of the benefit lies for Whisper, so by eliminating the path that compares initializers as numpy arrays we save a lot of time for minimal cost. In other words, this PR adds an option to remove the ability to detect shared initializers of different types (e.g. Slice Index and MatMul Constant) while retaining the ability to deduplicate weights. ### Motivation and Context <!-- - Why is this change required? What problem does it solve? --> Current time to export Whisper-large is prohibitive. --------- Co-authored-by: Peter McAughan <petermca@microsoft.com>
2026-07-10 17:37:14 +00:00 · 2023-06-30 12:22:30 -07:00 · 2023-06-30 12:22:30 -07:00 · 47f136e2d3
commit 47f136e2d3
parent 708dec5d95
3 changed files with 19 additions and 9 deletions
--- a/onnxruntime/python/tools/transformers/convert_generation.py
+++ b/onnxruntime/python/tools/transformers/convert_generation.py
@ -889,6 +889,7 @@ def remove_shared_initializers(
    graph2: GraphProto,
    shared_prefix: str = "shared_",
    min_elements: int = 1024,
+    require_raw_data: bool = False,
 ):
    """Remove initializers with same value from two graphs.

@ -897,6 +898,7 @@ def remove_shared_initializers(
        graph2 (GraphProto): the second graph to process
        shared_prefix (str): add prefix to the shared initializers among two graphs
        min_elements (int, optional): minimal number of elements for initializers to be considered. Defaults to 1024.
+        require_raw_data (bool, optional): Only remove tensors with raw_data field to speed up method
    """

    mapping_initializers_1 = {}
@ -913,7 +915,7 @@ def remove_shared_initializers(
            if not (initializer2.dims and sum(initializer2.dims) >= min_elements):
                continue

-            if OnnxModel.has_same_value(initializer1, initializer2):
+            if OnnxModel.has_same_value(initializer1, initializer2, require_raw_data=True):
                mapping_initializers_1[initializer1.name] = shared_prefix + initializer2.name
                shared_initializers_1.append(initializer1)

@ -986,14 +988,14 @@ def remove_shared_initializers(
    return shared_initializers_2


-def get_shared_initializers(encoder_model: ModelProto, decoder_model: ModelProto):
+def get_shared_initializers(encoder_model: ModelProto, decoder_model: ModelProto, require_raw_data: bool = False):
    encoder = OnnxModel(encoder_model)
    decoder = OnnxModel(decoder_model)
    encoder.add_prefix_to_names("e_")
    decoder.add_prefix_to_names("d_")
-    encoder.remove_duplicated_initializer()
-    decoder.remove_duplicated_initializer()
-    initializers = remove_shared_initializers(encoder.model.graph, decoder.model.graph, "s_")
+    encoder.remove_duplicated_initializer(require_raw_data)
+    decoder.remove_duplicated_initializer(require_raw_data)
+    initializers = remove_shared_initializers(decoder.model.graph, encoder.model.graph, "s_", require_raw_data)
    return initializers


--- a/onnxruntime/python/tools/transformers/models/whisper/whisper_chain.py
+++ b/onnxruntime/python/tools/transformers/models/whisper/whisper_chain.py
@ -141,7 +141,7 @@ def chain_model(args):

    # Initializers/opsets
    # Delete shared data between decoder/encoder and move to larger graph initializers
-    initializers = get_shared_initializers(encoder_model, decoder_model)
+    initializers = get_shared_initializers(encoder_model, decoder_model, require_raw_data=True)
    node.attribute.extend(
        [
            helper.make_attribute("decoder", decoder_model.graph),
--- a/onnxruntime/python/tools/transformers/onnx_model.py
+++ b/onnxruntime/python/tools/transformers/onnx_model.py
@ -1092,13 +1092,15 @@ class OnnxModel:
        return op_count

    @staticmethod
-    def has_same_value(tensor1: TensorProto, tensor2: TensorProto) -> bool:
+    def has_same_value(tensor1: TensorProto, tensor2: TensorProto, require_raw_data: bool = False) -> bool:
        """Returns True when two tensors have same value.
           Note that name can be different.

        Args:
            tensor1 (TensorProto): initializer 1
            tensor2 (TensorProto): initializer 2
+            require_raw_data (bool): ignore tensors without raw_data
+                Note: Flag can speed up runtime significantly

        Returns:
            bool: True when two intializers has same value.
@ -1107,11 +1109,15 @@ class OnnxModel:
            return False
        if tensor1.HasField("raw_data") and tensor2.HasField("raw_data"):
            return tensor1.raw_data == tensor2.raw_data
+        if require_raw_data:
+            return False
+
        return (numpy_helper.to_array(tensor1) == numpy_helper.to_array(tensor2)).all()

-    def remove_duplicated_initializer(self):
+    def remove_duplicated_initializer(self, require_raw_data: bool = False):
        """Remove initializers with duplicated values, and only keep the first one.
        It could help reduce size of models (like ALBert) with shared weights.
+        If require_raw_data passed, method will only compare raw_data initializers to speed runtime
        Note: this function does not process subgraph.
        """
        if len(self.graphs()) > 1:
@ -1124,7 +1130,9 @@ class OnnxModel:
            if same[i] >= 0:
                continue
            for j in range(i + 1, initializer_count):
-                if OnnxModel.has_same_value(self.model.graph.initializer[i], self.model.graph.initializer[j]):
+                if OnnxModel.has_same_value(
+                    self.model.graph.initializer[i], self.model.graph.initializer[j], require_raw_data
+                ):
                    same[j] = i

        count = 0