Fix constant folding of node assigned to CUDA (#2510)

* Constant folding bug fix/improvements: (1) handle constant folding for a node that is assigned to a non-CPU EP; (2) check for errors in optimizer execution frame setup; (3) improve CUDA partitioning to look for initializers in parent graphs; (4) add unit test. Fixes #2474
2026-07-15 18:23:41 +00:00 · 2019-12-03 16:28:44 +10:00 · 2019-12-03 16:28:44 +10:00 · e8b327d657
commit e8b327d657
parent 4354023913
5 changed files with 79 additions and 36 deletions
--- a/onnxruntime/core/optimizer/constant_folding.cc
+++ b/onnxruntime/core/optimizer/constant_folding.cc
@ -25,6 +25,17 @@ Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level,

    InitializedTensorSet constant_inputs;

+    // we currently constant fold using the CPU EP only.
+    // if the node is assigned to a different EP we can run it if it's an ONNX op as we have CPU based implementations
+    // for all ONNX ops. if it's from a different domain we can't.
+    // NOTE: This is in addition to the IsSupportedProvider check below which will optionally do further filtering
+    // on the EPs we constant fold for.
+    auto ep_type = node->GetExecutionProviderType();
+    bool cpu_ep = ep_type == kCpuExecutionProvider;
+    if (!cpu_ep && node->Domain() != kOnnxDomain) {
+      continue;
+    }
+
    // Check if constant folding can be applied on this node.
    if (!graph_utils::IsSupportedProvider(*node, GetCompatibleExecutionProviders()) ||
        excluded_op_types_.find(node->OpType()) != excluded_op_types_.end() ||
@ -36,9 +47,19 @@ Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level,
      continue;
    }

+    // override the EP while setting up OptimizerExecutionFrame::Info so that it will use the CPU kernel for Compute.
+    if (!cpu_ep) {
+      node->SetExecutionProviderType(kCpuExecutionProvider);
+    }
+
    // Create execution frame for executing constant nodes.
    OptimizerExecutionFrame::Info info({node}, constant_inputs);

+    // undo the EP change in case something fails prior to node removal
+    if (!cpu_ep) {
+      node->SetExecutionProviderType(ep_type);
+    }
+
    std::vector<int> fetch_mlvalue_idxs;
    for (const auto* node_out : node->OutputDefs()) {
      fetch_mlvalue_idxs.push_back(info.GetMLValueIndex(node_out->Name()));
@ -62,8 +83,8 @@ Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level,
      OrtValue& ort_value = fetches[fetch_idx];

      if (!ort_value.IsTensor()) {
-          LOGS(logger, WARNING) << "Unsupported output type of " << ort_value.Type()
-                                 << ". Can't constant fold " << node->OpType() << " node '" << node->Name() << "'";
+        LOGS(logger, WARNING) << "Unsupported output type of " << ort_value.Type()
+                              << ". Can't constant fold " << node->OpType() << " node '" << node->Name() << "'";
        unsupported_output_type = true;
        break;
      }
--- a/onnxruntime/core/optimizer/constant_folding.h
+++ b/onnxruntime/core/optimizer/constant_folding.h
@ -16,8 +16,8 @@ it statically computes parts of the graph that rely only on constant initializer
 */
 class ConstantFolding : public GraphTransformer {
 public:
-  ConstantFolding(const std::unordered_set<std::string>& compatible_execution_providers = {}) noexcept :
-    GraphTransformer("ConstantFolding", compatible_execution_providers) {}
+  ConstantFolding(const std::unordered_set<std::string>& compatible_execution_providers = {}) noexcept
+      : GraphTransformer("ConstantFolding", compatible_execution_providers) {}

 private:
  /** Constant folding will not be applied to nodes whose op_type is included in this set.
@ -26,11 +26,6 @@ class ConstantFolding : public GraphTransformer {
      {"RandomUniform", "RandomNormal", "RandomUniformLike", "RandomNormalLike", "Multinomial"};

  Status ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const override;
-
-  /** Create a TensorProto that has the same value as the given OrtValue
-  and the same type and dimensions as the given NodeArg. */
-  void BuildTensorProtoForInitializer(const OrtValue& ort_value, const NodeArg& constant_node_arg,
-                                      ONNX_NAMESPACE::TensorProto& tensorproto) const;
 };

 }  // namespace onnxruntime
--- a/onnxruntime/core/optimizer/optimizer_execution_frame.cc
+++ b/onnxruntime/core/optimizer/optimizer_execution_frame.cc
@ -57,8 +57,8 @@ OptimizerExecutionFrame::Info::Info(const std::vector<const Node*>& nodes,

  // TODO: node->ImplicitInputDefs() need to be added here for control flow nodes.
  for (auto* node : nodes) {
-    onnxruntime::Node::ForEachWithIndex(node->InputDefs(), initialize_maps);
-    onnxruntime::Node::ForEachWithIndex(node->OutputDefs(), initialize_maps);
+    ORT_THROW_IF_ERROR(onnxruntime::Node::ForEachWithIndex(node->InputDefs(), initialize_maps));
+    ORT_THROW_IF_ERROR(onnxruntime::Node::ForEachWithIndex(node->OutputDefs(), initialize_maps));
  }

  node_index_info_ = onnxruntime::make_unique<NodeIndexInfo>(nodes, ort_value_name_idx_map_);
@ -67,8 +67,9 @@ OptimizerExecutionFrame::Info::Info(const std::vector<const Node*>& nodes,
  for (auto* node : nodes) {
    std::unique_ptr<OpKernel> op_kernel;
    std::shared_ptr<KernelRegistry> kernel_registry = cpu_execution_provider_->GetKernelRegistry();
-    auto status = kernel_registry->TryCreateKernel(*node, *cpu_execution_provider_, initializers_,
-                                                   ort_value_name_idx_map_, FuncManager(), data_transfer_mgr_, op_kernel);
+    ORT_THROW_IF_ERROR(kernel_registry->TryCreateKernel(*node, *cpu_execution_provider_, initializers_,
+                                                        ort_value_name_idx_map_, FuncManager(), data_transfer_mgr_,
+                                                        op_kernel));
    kernels_[node->Index()] = std::move(op_kernel);
  }
 }
@ -118,8 +119,8 @@ Status OptimizerExecutionFrame::CreateNodeOutputMLValueImpl(OrtValue& ort_value,
  auto element_type = static_cast<const TensorTypeBase*>(ml_type)->GetElementType();
  AllocatorPtr allocator_ptr = info_.GetAllocator();
  std::unique_ptr<Tensor> p_tensor = onnxruntime::make_unique<Tensor>(element_type,
-                                                              *shape,
-                                                              allocator_ptr);
+                                                                      *shape,
+                                                                      allocator_ptr);

  auto ml_tensor = DataTypeImpl::GetType<Tensor>();
  ort_value.Init(p_tensor.release(), ml_tensor, ml_tensor->GetDeleteFunc());
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@ -3,11 +3,12 @@

 #include "cuda_common.h"
 #include "cuda_execution_provider.h"
-#include "core/framework/memcpy.h"
 #include "cuda_fence.h"
 #include "cuda_allocator.h"
 #include "core/framework/kernel_registry.h"
 #include "core/framework/compute_capability.h"
+#include "core/framework/memcpy.h"
+#include "core/graph/graph_utils.h"
 #include "core/providers/cuda/gpu_data_transfer.h"

 #ifndef DISABLE_CONTRIB_OPS
@ -1303,28 +1304,27 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
      // Note that nodes with only inputs from initializer would not be place on CUDA
      // Ideally, those nodes should be eliminated in constant folding
      bool should_force_outside = true;
-      bool all_input_are_initializer = true;
-      node.ForEachWithIndex(
-          node.InputDefs(),
-          [&](const NodeArg& def, size_t index) {
-            const ONNX_NAMESPACE::TensorProto* initializer = nullptr;
-            // The input is not a initializer and the input is from CPU
-            // or the input declared as CPU memory and is from CPU
-            // in that case we should still keep the node on CUDA
-            bool initializer_input = graph.GetInitializedTensor(def.Name(), initializer);
-            bool input_is_on_cpu = defs_outside_cuda.count(&def) > 0;
-            if ((!initializer_input && !input_is_on_cpu) ||
-                (input_is_on_cpu && cuda_kernel_def->kernel_def->IsInputOnCpu(index)))
-              should_force_outside = false;
+      bool all_inputs_are_initializers = true;
+      node.ForEachWithIndex(node.InputDefs(),
+                            [&](const NodeArg& def, size_t index) {
+                              // If the input is not an initializer and is not from CPU,
+                              // or the input is declared as CPU memory and is from CPU,
+                              // then we should still keep the node on CUDA
+                              bool initializer_input = graph.IsConstantInitializer(def.Name(), /*check_outer_scope*/ true);
+                              bool input_is_on_cpu = defs_outside_cuda.count(&def) > 0;
+                              if ((!initializer_input && !input_is_on_cpu) ||
+                                  (input_is_on_cpu && cuda_kernel_def->kernel_def->IsInputOnCpu(index))) {
+                                should_force_outside = false;
+                              }

-            if (!initializer_input) {
-              all_input_are_initializer = false;
-            }
-            return Status::OK();
-          });
+                              if (!initializer_input) {
+                                all_inputs_are_initializers = false;
+                              }
+                              return Status::OK();
+                            });

      // If all the inputs are initializers, we shouldn't force it to CPU
-      if (should_force_outside && !all_input_are_initializer) {
+      if (should_force_outside && !all_inputs_are_initializers) {
        force_outside = true;
      }
    }
--- a/onnxruntime/test/optimizer/graph_transform_test.cc
+++ b/onnxruntime/test/optimizer/graph_transform_test.cc
@ -131,6 +131,33 @@ TEST(GraphTransformationTests, ConstantFolding) {
  ASSERT_TRUE(op_to_count["Unsqueeze"] == 0);
 }

+TEST(GraphTransformationTests, ConstantFoldingNodesOnDifferentEP) {
+  auto model_uri = MODEL_FOLDER "fusion/fuse-conv-bn-mul-add-unsqueeze.onnx";
+  std::shared_ptr<Model> model;
+  ASSERT_TRUE(Model::Load(model_uri, model, nullptr, DefaultLoggingManager().DefaultLogger()).IsOK());
+  Graph& graph = model->MainGraph();
+  std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
+  ASSERT_TRUE(op_to_count["Unsqueeze"] == 2);
+
+  onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
+  graph_transformation_mgr.Register(onnxruntime::make_unique<ConstantFolding>(), TransformerLevel::Level1);
+
+  // assign all nodes to CUDA. the constant folding should override this to perform the constant folding on cpu
+  for (auto& node : graph.Nodes()) {
+    node.SetExecutionProviderType(kCudaExecutionProvider);
+  }
+
+  ASSERT_TRUE(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, DefaultLoggingManager().DefaultLogger()).IsOK());
+
+  op_to_count = CountOpsInGraph(graph);
+  ASSERT_TRUE(op_to_count["Unsqueeze"] == 0);
+
+  // all remaining nodes should still be on CUDA
+  for (auto& node : graph.Nodes()) {
+    EXPECT_STREQ(node.GetExecutionProviderType().c_str(), kCudaExecutionProvider);
+  }
+}
+
 TEST(GraphTransformationTests, ConstantFoldingSubgraph) {
  TensorProto value_tensor;
  value_tensor.add_dims(1);
@ -1010,7 +1037,6 @@ static void ValidateAttention(Graph& graph) {
      for (size_t i = 0; i < expected_value2.size(); i++) {
        EXPECT_EQ(data2[i], static_cast<float>(expected_value2[i]));
      }
-
    }
  }
 }