Address performance regression with const tensors shared across DML partitions

2026-07-15 18:23:41 +00:00 · 2023-05-24 18:09:28 -07:00 · 2023-05-24 18:09:28 -07:00 · 1bf441415a
commit 1bf441415a
parent 7aed386a2b
3 changed files with 21 additions and 7 deletions
--- a/onnxruntime/core/optimizer/constant_sharing.cc
+++ b/onnxruntime/core/optimizer/constant_sharing.cc
@ -35,7 +35,6 @@ using SupportedTypeList = boost::mp11::mp_list<MLFloat16, float, double, int32_t
 // TODO(pengwa): we can gradually increase this threshold if we see more benefits (memory saving
 // or more CSE optimizations triggered). Should be careful to cover test cases that assume initializer
 // name did not change after transformation then.
-static constexpr int64_t TENSOR_ELEM_COUNT_THRESHOLD = 8;
 static constexpr char SHARED_INITIALIZER_PREFIX[] = "ortshared_";

 bool IsAllowedToShare(const ONNX_NAMESPACE::TensorShapeProto* input_shape,
@ -52,12 +51,12 @@ bool IsAllowedToShare(const ONNX_NAMESPACE::TensorShapeProto* input_shape,

    int64_t dim_value = dim.dim_value();
    num_elements *= dim_value;
-    if (num_elements > TENSOR_ELEM_COUNT_THRESHOLD) {
+    if (num_elements > ConstantSharing::TENSOR_ELEM_COUNT_THRESHOLD) {
      return false;
    }
  }

-  if (num_elements > 0 && num_elements <= TENSOR_ELEM_COUNT_THRESHOLD) {
+  if (num_elements > 0 && num_elements <= ConstantSharing::TENSOR_ELEM_COUNT_THRESHOLD) {
    return true;
  }

--- a/onnxruntime/core/optimizer/constant_sharing.h
+++ b/onnxruntime/core/optimizer/constant_sharing.h
@ -29,6 +29,8 @@ class ConstantSharing : public GraphTransformer {
        excluded_initializers_(excluded_initializers) {
  }

+  static constexpr int64_t TENSOR_ELEM_COUNT_THRESHOLD = 8;
+
 private:
  Status ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const override;

--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.cpp
@ -7,6 +7,7 @@
 #include "GraphPartitioner.h"
 #include "core/framework/kernel_type_str_resolver.h"
 #include "core/framework/kernel_lookup.h"
+#include "core/optimizer/constant_sharing.h"
 #include "FusedGraphKernel.h"
 #include "MLOperatorAuthorImpl.h"
 #include "DmlGraphFusionHelper.h"
@ -87,11 +88,23 @@ namespace Dml
                        assert(iter != initializerPartitionMap.end());
                        if (iter->second.size() > 1)
                        {
-                            if (requiredInitializerMap.find(input) != requiredInitializerMap.end())
+                            // By including non-transferrable tensors in isInitializerTransferable, it causes DML to upload and preprocess them
+                            // to duplicate locations rather than treating them as being non-constant, which is helpful for optimization.
+                            // The size threshold for this should be no smaller than that used to combine initializers in the constant
+                            // sharing transform to prevent that transform from hurting performance.
+                            // If the kernel relies on this input to be initialized, it should also be small enough to copy cheaply.
+                            const uint64_t maximumElementsForDuplicationTensor = 64;
+                            static_assert(maximumElementsForDuplicationTensor >= onnxruntime::ConstantSharing::TENSOR_ELEM_COUNT_THRESHOLD);
+
+                            uint64_t totalElementCount = 1;
+                            for (int i = 0; i < tensor->dims().size(); ++i)
+                            {
+                                totalElementCount *= tensor->dims()[i];
+                            }
+
+                            if (totalElementCount <=  maximumElementsForDuplicationTensor ||
+                                requiredInitializerMap.find(input) != requiredInitializerMap.end())
                            {
-                                // The kernel relies on this input to be initialized, and it should be small enough to copy
-                                // cheaply. FusedGraphKernel only handles constant CPU inputs through transferred initializers,
-                                // rather than ORT, to avoid mismatches in policy or implementation causing failures.
                                isInitializerTransferable[input] = {tensor, false};
                            }