Revert saving optimized model as external data (#5690)

* revert and add support for saving external data * review comments * update
2026-06-08 00:23:03 +00:00 · 2020-11-06 11:54:19 -08:00 · 2020-11-06 11:54:19 -08:00 · 95e6da7957
commit 95e6da7957
parent 71f90e08f1
4 changed files with 31 additions and 21 deletions
--- a/onnxruntime/python/tools/transformers/fusion_attention.py
+++ b/onnxruntime/python/tools/transformers/fusion_attention.py
@ -131,13 +131,15 @@ class FusionAttention(Fusion):
        weight = helper.make_tensor(name=attention_node_name + '_qkv_weight',
                                    data_type=TensorProto.FLOAT,
                                    dims=[self.hidden_size, 3 * self.hidden_size],
-                                    vals=qkv_weight.flatten().tolist())
+                                    vals=bytes(qkv_weight.flatten()),
+                                    raw=True)
        self.model.add_initializer(weight)

        bias = helper.make_tensor(name=attention_node_name + '_qkv_bias',
                                  data_type=TensorProto.FLOAT,
                                  dims=[3 * self.hidden_size],
-                                  vals=qkv_bias.flatten().tolist())
+                                  vals=bytes(qkv_bias.flatten()),
+                                  raw=True)
        self.model.add_initializer(bias)

        attnetion_inputs = [input, attention_node_name + '_qkv_weight', attention_node_name + '_qkv_bias']
@ -291,9 +293,12 @@ class FusionAttention(Fusion):
            if einsum_node is not None:
                unique_index = einsum_node.input[0]
                new_edge = "edge_modified_" + unique_index
-                shape_tensor = self.model.convert_list_to_tensor(
-                    "shape_modified_tensor" + unique_index, TensorProto.INT64, [4],
-                    [0, 0, self.num_heads, int(self.hidden_size / self.num_heads)])
+                shape_tensor = helper.make_tensor(
+                    name="shape_modified_tensor" + unique_index,
+                    data_type=TensorProto.INT64,
+                    dims=[4],
+                    vals=np.int64([0, 0, self.num_heads, int(self.hidden_size / self.num_heads)]).tobytes(),
+                    raw=True)
                self.model.add_initializer(shape_tensor)
                self.model.add_node(
                    helper.make_node("Reshape", [attention_last_node.output[0], shape_tensor.name], [new_edge],
--- a/onnxruntime/python/tools/transformers/huggingface_models.py
+++ b/onnxruntime/python/tools/transformers/huggingface_models.py
@ -6,10 +6,17 @@

 # Maps model class name to a tuple of model class
 MODEL_CLASSES = [
-    'AutoModel',
-    'AutoModelWithLMHead',
-    'AutoModelForSequenceClassification',
-    'AutoModelForQuestionAnswering'
+    'AutoModel', 'AutoModelWithLMHead', 'AutoModelForSequenceClassification', 'AutoModelForQuestionAnswering'
+]
+
+# List of models that require external data saving for onnx export but do not require it when saving optimized onnx model
+# Very few models in the huggingface list require it for both: albert-xxlarge-v1, albert-xxlarge-v2
+# TODO: most of the models in the exempt list below have runtime issues when their optimized onnx models are saved
+# using external data format. Need to address the issue in the future
+EXEMPT_MODELS = [
+    "gpt2-large", "gpt2-xl", "xlm-mlm-en-2048", "xlm-mlm-17-1280", "xlm-mlm-100-1280", "ctrl", "albert-xlarge-v1",
+    "albert-xlarge-v2", "t5-large", "t5-3b", "t5-11b", "xlm-roberta-large", "microsoft/DialoGPT-large",
+    "facebook/mbart-large-en-ro"
 ]

 # List of pretrained models: https://huggingface.co/transformers/pretrained_models.html
@ -83,11 +90,11 @@ MODELS = {
    "albert-base-v1": (["input_ids"], 12, False, "bert"),
    "albert-large-v1": (["input_ids"], 12, False, "bert"),
    "albert-xlarge-v1": (["input_ids"], 12, True, "bert"),
-    "albert-xxlarge-v1": (["input_ids"], 12, True, "bert"),
+    #"albert-xxlarge-v1": (["input_ids"], 12, True, "bert"),
    "albert-base-v2": (["input_ids"], 12, False, "bert"),
    "albert-large-v2": (["input_ids"], 12, False, "bert"),
    "albert-xlarge-v2": (["input_ids"], 12, True, "bert"),
-    "albert-xxlarge-v2": (["input_ids"], 12, True, "bert"),
+    #"albert-xxlarge-v2": (["input_ids"], 12, True, "bert"),
    # T5
    "t5-small": (["input_ids"], 12, False, "bert"),
    "t5-base": (["input_ids"], 12, False, "bert"),
--- a/onnxruntime/python/tools/transformers/onnx_exporter.py
+++ b/onnxruntime/python/tools/transformers/onnx_exporter.py
@ -13,7 +13,7 @@ from transformers import AutoConfig, AutoTokenizer, AutoModel
 from benchmark_helper import create_onnxruntime_session, Precision
 from gpt2_helper import GPT2ModelNoPastState, PRETRAINED_GPT2_MODELS
 from quantize_helper import QuantizeHelper
-from huggingface_models import MODEL_CLASSES
+from huggingface_models import MODEL_CLASSES, EXEMPT_MODELS

 logger = logging.getLogger(__name__)

@ -169,8 +169,8 @@ def optimize_onnx_model_by_ort(onnx_model_path, ort_model_path, use_gpu, overwri
        logger.info(f"Skip optimization since model existed: {ort_model_path}")


-def optimize_onnx_model(onnx_model_path, optimized_model_path, model_type, num_attention_heads, hidden_size, use_gpu,
-                        precision, use_raw_attention_mask, overwrite, model_fusion_statistics,
+def optimize_onnx_model(model_name, onnx_model_path, optimized_model_path, model_type, num_attention_heads, hidden_size,
+                        use_gpu, precision, use_raw_attention_mask, overwrite, model_fusion_statistics,
                        use_external_data_format):
    if overwrite or not os.path.exists(optimized_model_path):
        Path(optimized_model_path).parent.mkdir(parents=True, exist_ok=True)
@ -202,6 +202,10 @@ def optimize_onnx_model(onnx_model_path, optimized_model_path, model_type, num_a

        if Precision.FLOAT16 == precision:
            opt_model.convert_model_float32_to_float16()
+
+        if model_name in EXEMPT_MODELS:
+            use_external_data_format = False
+
        opt_model.save_model_to_file(optimized_model_path, use_external_data_format)
    else:
        logger.info(f"Skip optimization since model existed: {optimized_model_path}")
@ -291,7 +295,7 @@ def validate_and_optimize_onnx(model_name, use_external_data_format, model_type,
    if optimize_onnx or precision == Precision.FLOAT16 or precision == Precision.INT8:  # Use script (optimizer.py) to optimize
        optimized_model_path = get_onnx_file_path(onnx_dir, model_name, len(input_names), True, use_gpu, precision,
                                                  False, use_external_data_format)
-        optimize_onnx_model(onnx_model_path, optimized_model_path, model_type, config.num_attention_heads,
+        optimize_onnx_model(model_name, onnx_model_path, optimized_model_path, model_type, config.num_attention_heads,
                            config.hidden_size, use_gpu, precision, use_raw_attention_mask, overwrite,
                            model_fusion_statistics, use_external_data_format)

@ -419,4 +423,3 @@ def export_onnx_model_from_tf(model_name, opset_version, use_external_data_forma
        example_inputs, example_outputs_flatten)

    return onnx_model_file, is_valid_onnx_model, vocab_size, max_input_size
-
--- a/onnxruntime/python/tools/transformers/onnx_model.py
+++ b/onnxruntime/python/tools/transformers/onnx_model.py
@ -368,11 +368,6 @@ class OnnxModel:
                shape_list.append("?")  # shall not happen
        return shape_list

-    def convert_list_to_tensor(self, name, type, shape, value):
-        """ Convert list to tensor
-        """
-        return helper.make_tensor(name, type, shape, value)
-
    def change_input_output_float32_to_float16(self):
        """ Change graph input and output data type from FLOAT to FLOAT16
        """