Speed Up Whisper Export (#16504)

### Description
Add a greedy option to the initializer deduplication process in the
Whisper export.

Currently, to detect shared initializers, ORT compares every initializer
against every other initializer (O(n^2) comparisons). In the comparison
operator, if the two initializers have different data types (e.g. raw_data
and int64_data), both initializers are converted to a numpy array and the
cast result is compared. This cast happens in every such comparison, so its
cost is multiplied by the quadratic number of comparisons and dominates the
runtime of finding shared initializers. This cast operation is the
bottleneck for the current Whisper export script.

The conversion to a numpy array is useful for detecting equal
initializer values across nodes of different data types (e.g.
recognizing that a bias value of 0.0 is the same as a slice index of 0),
but it isn't triggered when comparing initializers of the same data type
(e.g. weight value of 0.6 == weight value of 0.6). The latter case
accounts for most of the deduplication benefit for Whisper, so by
skipping the numpy-array comparison path we save a lot of time at
minimal cost.

In other words, this PR adds an option to remove the ability to detect
shared initializers of different types (e.g. Slice Index and MatMul
Constant) while retaining the ability to deduplicate weights.



### Motivation and Context
<!-- - Why is this change required? What problem does it solve? -->
- Current time to export Whisper-large is prohibitive.

---------

Co-authored-by: Peter McAughan <petermca@microsoft.com>
This commit is contained in:
petermcaughan 2023-06-30 12:22:30 -07:00 committed by GitHub
parent 708dec5d95
commit 47f136e2d3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 19 additions and 9 deletions

View file

@ -889,6 +889,7 @@ def remove_shared_initializers(
graph2: GraphProto,
shared_prefix: str = "shared_",
min_elements: int = 1024,
require_raw_data: bool = False,
):
"""Remove initializers with same value from two graphs.
@ -897,6 +898,7 @@ def remove_shared_initializers(
graph2 (GraphProto): the second graph to process
shared_prefix (str): add prefix to the shared initializers among two graphs
min_elements (int, optional): minimal number of elements for initializers to be considered. Defaults to 1024.
require_raw_data (bool, optional): Only remove tensors with raw_data field to speed up method
"""
mapping_initializers_1 = {}
@ -913,7 +915,7 @@ def remove_shared_initializers(
if not (initializer2.dims and sum(initializer2.dims) >= min_elements):
continue
if OnnxModel.has_same_value(initializer1, initializer2):
if OnnxModel.has_same_value(initializer1, initializer2, require_raw_data=True):
mapping_initializers_1[initializer1.name] = shared_prefix + initializer2.name
shared_initializers_1.append(initializer1)
@ -986,14 +988,14 @@ def remove_shared_initializers(
return shared_initializers_2
def get_shared_initializers(encoder_model: ModelProto, decoder_model: ModelProto):
def get_shared_initializers(encoder_model: ModelProto, decoder_model: ModelProto, require_raw_data: bool = False):
encoder = OnnxModel(encoder_model)
decoder = OnnxModel(decoder_model)
encoder.add_prefix_to_names("e_")
decoder.add_prefix_to_names("d_")
encoder.remove_duplicated_initializer()
decoder.remove_duplicated_initializer()
initializers = remove_shared_initializers(encoder.model.graph, decoder.model.graph, "s_")
encoder.remove_duplicated_initializer(require_raw_data)
decoder.remove_duplicated_initializer(require_raw_data)
initializers = remove_shared_initializers(decoder.model.graph, encoder.model.graph, "s_", require_raw_data)
return initializers

View file

@ -141,7 +141,7 @@ def chain_model(args):
# Initializers/opsets
# Delete shared data between decoder/encoder and move to larger graph initializers
initializers = get_shared_initializers(encoder_model, decoder_model)
initializers = get_shared_initializers(encoder_model, decoder_model, require_raw_data=True)
node.attribute.extend(
[
helper.make_attribute("decoder", decoder_model.graph),

View file

@ -1092,13 +1092,15 @@ class OnnxModel:
return op_count
@staticmethod
def has_same_value(tensor1: TensorProto, tensor2: TensorProto) -> bool:
def has_same_value(tensor1: TensorProto, tensor2: TensorProto, require_raw_data: bool = False) -> bool:
"""Returns True when two tensors have same value.
Note that name can be different.
Args:
tensor1 (TensorProto): initializer 1
tensor2 (TensorProto): initializer 2
require_raw_data (bool): ignore tensors without raw_data
Note: Flag can speed up runtime significantly
Returns:
bool: True when two intializers has same value.
@ -1107,11 +1109,15 @@ class OnnxModel:
return False
if tensor1.HasField("raw_data") and tensor2.HasField("raw_data"):
return tensor1.raw_data == tensor2.raw_data
if require_raw_data:
return False
return (numpy_helper.to_array(tensor1) == numpy_helper.to_array(tensor2)).all()
def remove_duplicated_initializer(self):
def remove_duplicated_initializer(self, require_raw_data: bool = False):
"""Remove initializers with duplicated values, and only keep the first one.
It could help reduce size of models (like ALBert) with shared weights.
If require_raw_data passed, method will only compare raw_data initializers to speed runtime
Note: this function does not process subgraph.
"""
if len(self.graphs()) > 1:
@ -1124,7 +1130,9 @@ class OnnxModel:
if same[i] >= 0:
continue
for j in range(i + 1, initializer_count):
if OnnxModel.has_same_value(self.model.graph.initializer[i], self.model.graph.initializer[j]):
if OnnxModel.has_same_value(
self.model.graph.initializer[i], self.model.graph.initializer[j], require_raw_data
):
same[j] = i
count = 0