Optimize constant sharing perf (#20143)

### Optimize constant sharing perf by avoiding renaming the first time we detect a constant pattern. Currently, every time we run ConstantSharing, for each initializer whose pattern does not exist yet, we create a new NodeArg with a unique name. Later, if other initializers share the same pattern, they are replaced by that NodeArg. The problem is: when there are no real constant sharing cases, we still modify the graph for each initializer. This is not needed. ### Motivation and Context
2026-07-16 18:31:27 +00:00 · 2024-04-09 12:04:36 +08:00 · 2024-04-09 12:04:36 +08:00 · 81005e2c92
commit 81005e2c92
parent 07b5377f7c
2 changed files with 19 additions and 18 deletions
--- a/onnxruntime/core/optimizer/constant_sharing.cc
+++ b/onnxruntime/core/optimizer/constant_sharing.cc
@ -32,10 +32,9 @@ using SupportedTypeList = boost::mp11::mp_list<MLFloat16, float, double, int32_t
 // A threshold is defined here to restrict the graph transformation only applied to small tensors.
 // Be note: having a bigger threshold means more overhead when we do the graph transformations.
 // `8` is chosen to cover common constant use cases in some Reshape/Gather/Concat's inputs.
-// TODO(pengwa): we can gradually increase this threshold if we see more benefits (memory saving
+// TODO(pengwa): we can gradually increase this threshold if we see more benefits (memory-saving
 // or more CSE optimizations triggered). Should be careful to cover test cases that assume initializer
 // name did not change after transformation then.
-static constexpr char SHARED_INITIALIZER_PREFIX[] = "ortshared_";

 bool IsAllowedToShare(const ONNX_NAMESPACE::TensorShapeProto* input_shape,
                      int64_t& num_elements) {
@ -78,7 +77,7 @@ bool PrepareInputPortsToReplace(Graph& graph, const NodeArg* origin_initializer_
    }

    // Iterate all input defs to replace those that are equal to origin_initializer_node_arg,
-    // Then it would be safe to remove the consumer node afterwards.
+    // Then it would be safe to remove the consumer node afterward.
    for (int i = 0; i < static_cast<int>(const_node->InputDefs().size()); ++i) {
      if (const_node->InputDefs()[i] == origin_initializer_node_arg) {
        consumer_node_to_input_ports_map[const_node].push_back(i);
@ -233,24 +232,17 @@ Status ConstantSharing::ApplyImpl(Graph& graph, bool& modified, int /*graph_leve
    size_t value_id = GetOrAddValueInConstantStore(std::move(init_value), const_value_store, data_store_key);

    // Construct a string by data type, value, and rank. Used as a key in pattern_key_to_shared_arg_map.
-    const std::string pattern_key = MakeString(SHARED_INITIALIZER_PREFIX, data_store_key, "_", value_id);
+    const std::string pattern_key = MakeString(data_store_key, "_", value_id);

    // If there is no such existing scalar pattern, add a new one.
    if (pattern_key_to_shared_arg_map.find(pattern_key) == pattern_key_to_shared_arg_map.end()) {
-      // Do a copy and rename the TensorProto.
-      ONNX_NAMESPACE::TensorProto constant_tensor_proto_as_replacement(*tensor_proto);
-      constant_tensor_proto_as_replacement.set_name(graph.GenerateNodeArgName(pattern_key));
-      NodeArg& shared_scalar_initializer_node_arg = graph_utils::AddInitializer(graph,
-                                                                                constant_tensor_proto_as_replacement);
-      pattern_key_to_shared_arg_map[pattern_key] = &shared_scalar_initializer_node_arg;
+      pattern_key_to_shared_arg_map[pattern_key] = origin_initializer_node_arg;
    } else {
      shared_count += 1;
+      ReplaceInputsToUseSharedInitializer(graph, consumer_node_to_input_ports_map, origin_initializer_node_arg,
+                                          pattern_key_to_shared_arg_map[pattern_key]);
+      modified = true;
    }
-
-    ReplaceInputsToUseSharedInitializer(graph, consumer_node_to_input_ports_map, origin_initializer_node_arg,
-                                        pattern_key_to_shared_arg_map[pattern_key]);
-
-    modified = true;
  }
  if (shared_count > 0) {
    LOGS(logger, INFO) << "Total shared scalar initializer count: " << shared_count;
--- a/onnxruntime/core/optimizer/constant_sharing.h
+++ b/onnxruntime/core/optimizer/constant_sharing.h
@ -14,13 +14,13 @@ namespace onnxruntime {
@class ConstantSharing

 Transformer that traverses the graph top-down and performs constant sharing, i.e.,
-constant initializers having same dtype, value and shape, will be replaced by one single (newly created) initializer.
-Currently, only scalar valued initializers are handled.
+constant initializers having same data type, value and shape, will be replaced by one single initializer.
+Currently, only scalar-valued initializers are handled.
 */
 class ConstantSharing : public GraphTransformer {
 public:
  /**
-   * @param compatible_execution_providers comptatible execution provider list for considered nodes.
+   * @param compatible_execution_providers compatible execution provider list for considered nodes.
   * @param excluded_initializers explicitly excluded initializer names that should not changed.
   */
  ConstantSharing(const InlinedHashSet<std::string_view>& compatible_execution_providers = {},
@ -29,6 +29,15 @@ class ConstantSharing : public GraphTransformer {
        excluded_initializers_(excluded_initializers) {
  }

+  bool ShouldOnlyApplyOnce() const override {
+#if defined(ENABLE_TRAINING)
+    return false;
+#else
+    // Reduce model processing time by applying this optimization only once for inference.
+    return true;
+#endif
+  }
+
  static constexpr int64_t TENSOR_ELEM_COUNT_THRESHOLD = 8;

 private: