From e8b327d6573845654700c048c330c9c722f3d3f8 Mon Sep 17 00:00:00 2001
From: Scott McKay <skottmckay@gmail.com>
Date: Tue, 3 Dec 2019 16:28:44 +1000
Subject: [PATCH] Fix constant folding of node assigned to CUDA (#2510)

* Constant folding bug fix/improvements
  - Handle constant folding for node that is assigned to a non cpu EP
  - Check for errors in optimizer execution frame setup
  - Improve CUDA partitioning to look for initializers in parent graphs
  - Add unit test

Fixes #2474
---
 .../core/optimizer/constant_folding.cc        | 25 +++++++++++-
 onnxruntime/core/optimizer/constant_folding.h |  9 +----
 .../optimizer/optimizer_execution_frame.cc    | 13 +++---
 .../providers/cuda/cuda_execution_provider.cc | 40 +++++++++----------
 .../test/optimizer/graph_transform_test.cc    | 28 ++++++++++++-
 5 files changed, 79 insertions(+), 36 deletions(-)
diff --git a/onnxruntime/core/optimizer/constant_folding.cc b/onnxruntime/core/optimizer/constant_folding.cc
index 3c4fdb71bb..d00c1759ce 100644
--- a/onnxruntime/core/optimizer/constant_folding.cc
+++ b/onnxruntime/core/optimizer/constant_folding.cc
@@ -25,6 +25,17 @@ Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level,
 
     InitializedTensorSet constant_inputs;
 
+    // we currently constant fold using the CPU EP only.
+    // if the node is assigned to a different EP we can run it if it's an ONNX op as we have CPU based implementations
+    // for all ONNX ops. if it's from a different domain we can't.
+    // NOTE: This is in addition to the IsSupportedProvider check below which will optionally do further filtering
+    // on the EPs we constant fold for.
+    auto ep_type = node->GetExecutionProviderType();
+    bool cpu_ep = ep_type == kCpuExecutionProvider;
+    if (!cpu_ep && node->Domain() != kOnnxDomain) {
+      continue;
+    }
+
     // Check if constant folding can be applied on this node.
     if (!graph_utils::IsSupportedProvider(*node, GetCompatibleExecutionProviders()) ||
         excluded_op_types_.find(node->OpType()) != excluded_op_types_.end() ||
@@ -36,9 +47,19 @@ Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level,
       continue;
     }
 
+    // override the EP while setting up OptimizerExecutionFrame::Info so that it will use the CPU kernel for Compute.
+    if (!cpu_ep) {
+      node->SetExecutionProviderType(kCpuExecutionProvider);
+    }
+
     // Create execution frame for executing constant nodes.
     OptimizerExecutionFrame::Info info({node}, constant_inputs);
 
+    // undo the EP change in case something fails prior to node removal
+    if (!cpu_ep) {
+      node->SetExecutionProviderType(ep_type);
+    }
+
     std::vector<int> fetch_mlvalue_idxs;
     for (const auto* node_out : node->OutputDefs()) {
       fetch_mlvalue_idxs.push_back(info.GetMLValueIndex(node_out->Name()));
@@ -62,8 +83,8 @@ Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level,
       OrtValue& ort_value = fetches[fetch_idx];
 
       if (!ort_value.IsTensor()) {
-          LOGS(logger, WARNING) << "Unsupported output type of " << ort_value.Type()
-                                 << ". Can't constant fold " << node->OpType() << " node '" << node->Name() << "'";
+        LOGS(logger, WARNING) << "Unsupported output type of " << ort_value.Type()
+                              << ". Can't constant fold " << node->OpType() << " node '" << node->Name() << "'";
         unsupported_output_type = true;
         break;
       }
diff --git a/onnxruntime/core/optimizer/constant_folding.h b/onnxruntime/core/optimizer/constant_folding.h
index 15cc065b75..2be65a2670 100644
--- a/onnxruntime/core/optimizer/constant_folding.h
+++ b/onnxruntime/core/optimizer/constant_folding.h
@@ -16,8 +16,8 @@ it statically computes parts of the graph that rely only on constant initializer
 */
 class ConstantFolding : public GraphTransformer {
  public:
-  ConstantFolding(const std::unordered_set<std::string>& compatible_execution_providers = {}) noexcept :
-    GraphTransformer("ConstantFolding", compatible_execution_providers) {}
+  ConstantFolding(const std::unordered_set<std::string>& compatible_execution_providers = {}) noexcept
+      : GraphTransformer("ConstantFolding", compatible_execution_providers) {}
 
  private:
   /** Constant folding will not be applied to nodes whose op_type is included in this set.
@@ -26,11 +26,6 @@ class ConstantFolding : public GraphTransformer {
       {"RandomUniform", "RandomNormal", "RandomUniformLike", "RandomNormalLike", "Multinomial"};
 
   Status ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const override;
-
-  /** Create a TensorProto that has the same value as the given OrtValue
-  and the same type and dimensions as the given NodeArg. */
-  void BuildTensorProtoForInitializer(const OrtValue& ort_value, const NodeArg& constant_node_arg,
-                                      ONNX_NAMESPACE::TensorProto& tensorproto) const;
 };
 
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/optimizer/optimizer_execution_frame.cc b/onnxruntime/core/optimizer/optimizer_execution_frame.cc
index a9ba230cf3..b876521ca6 100644
--- a/onnxruntime/core/optimizer/optimizer_execution_frame.cc
+++ b/onnxruntime/core/optimizer/optimizer_execution_frame.cc
@@ -57,8 +57,8 @@ OptimizerExecutionFrame::Info::Info(const std::vector<const Node*>& nodes,
 
   // TODO: node->ImplicitInputDefs() need to be added here for control flow nodes.
   for (auto* node : nodes) {
-    onnxruntime::Node::ForEachWithIndex(node->InputDefs(), initialize_maps);
-    onnxruntime::Node::ForEachWithIndex(node->OutputDefs(), initialize_maps);
+    ORT_THROW_IF_ERROR(onnxruntime::Node::ForEachWithIndex(node->InputDefs(), initialize_maps));
+    ORT_THROW_IF_ERROR(onnxruntime::Node::ForEachWithIndex(node->OutputDefs(), initialize_maps));
   }
 
   node_index_info_ = onnxruntime::make_unique<NodeIndexInfo>(nodes, ort_value_name_idx_map_);
@@ -67,8 +67,9 @@ OptimizerExecutionFrame::Info::Info(const std::vector<const Node*>& nodes,
   for (auto* node : nodes) {
     std::unique_ptr<OpKernel> op_kernel;
     std::shared_ptr<KernelRegistry> kernel_registry = cpu_execution_provider_->GetKernelRegistry();
-    auto status = kernel_registry->TryCreateKernel(*node, *cpu_execution_provider_, initializers_,
-                                                   ort_value_name_idx_map_, FuncManager(), data_transfer_mgr_, op_kernel);
+    ORT_THROW_IF_ERROR(kernel_registry->TryCreateKernel(*node, *cpu_execution_provider_, initializers_,
+                                                        ort_value_name_idx_map_, FuncManager(), data_transfer_mgr_,
+                                                        op_kernel));
     kernels_[node->Index()] = std::move(op_kernel);
   }
 }
@@ -118,8 +119,8 @@ Status OptimizerExecutionFrame::CreateNodeOutputMLValueImpl(OrtValue& ort_value,
   auto element_type = static_cast<const TensorTypeBase*>(ml_type)->GetElementType();
   AllocatorPtr allocator_ptr = info_.GetAllocator();
   std::unique_ptr<Tensor> p_tensor = onnxruntime::make_unique<Tensor>(element_type,
-                                                              *shape,
-                                                              allocator_ptr);
+                                                                      *shape,
+                                                                      allocator_ptr);
 
   auto ml_tensor = DataTypeImpl::GetType<Tensor>();
   ort_value.Init(p_tensor.release(), ml_tensor, ml_tensor->GetDeleteFunc());
diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
index d9b8205e0c..1f900d3a7c 100644
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -3,11 +3,12 @@
 
 #include "cuda_common.h"
 #include "cuda_execution_provider.h"
-#include "core/framework/memcpy.h"
 #include "cuda_fence.h"
 #include "cuda_allocator.h"
 #include "core/framework/kernel_registry.h"
 #include "core/framework/compute_capability.h"
+#include "core/framework/memcpy.h"
+#include "core/graph/graph_utils.h"
 #include "core/providers/cuda/gpu_data_transfer.h"
 
 #ifndef DISABLE_CONTRIB_OPS
@@ -1303,28 +1304,27 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
       // Note that nodes with only inputs from initializer would not be place on CUDA
       // Ideally, those nodes should be eliminated in constant folding
       bool should_force_outside = true;
-      bool all_input_are_initializer = true;
-      node.ForEachWithIndex(
-          node.InputDefs(),
-          [&](const NodeArg& def, size_t index) {
-            const ONNX_NAMESPACE::TensorProto* initializer = nullptr;
-            // The input is not a initializer and the input is from CPU
-            // or the input declared as CPU memory and is from CPU
-            // in that case we should still keep the node on CUDA
-            bool initializer_input = graph.GetInitializedTensor(def.Name(), initializer);
-            bool input_is_on_cpu = defs_outside_cuda.count(&def) > 0;
-            if ((!initializer_input && !input_is_on_cpu) ||
-                (input_is_on_cpu && cuda_kernel_def->kernel_def->IsInputOnCpu(index)))
-              should_force_outside = false;
+      bool all_inputs_are_initializers = true;
+      node.ForEachWithIndex(node.InputDefs(),
+                            [&](const NodeArg& def, size_t index) {
+                              // The input is not a initializer and the input is from CPU
+                              // or the input declared as CPU memory and is from CPU
+                              // in that case we should still keep the node on CUDA
+                              bool initializer_input = graph.IsConstantInitializer(def.Name(), /*check_outer_scope*/ true);
+                              bool input_is_on_cpu = defs_outside_cuda.count(&def) > 0;
+                              if ((!initializer_input && !input_is_on_cpu) ||
+                                  (input_is_on_cpu && cuda_kernel_def->kernel_def->IsInputOnCpu(index))) {
+                                should_force_outside = false;
+                              }
 
-            if (!initializer_input) {
-              all_input_are_initializer = false;
-            }
-            return Status::OK();
-          });
+                              if (!initializer_input) {
+                                all_inputs_are_initializers = false;
+                              }
+                              return Status::OK();
+                            });
 
       // If all the inputs are initializers, we shouldn't force it to CPU
-      if (should_force_outside && !all_input_are_initializer) {
+      if (should_force_outside && !all_inputs_are_initializers) {
         force_outside = true;
       }
     }
diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc
index 00220740a6..ce24d9e964 100644
--- a/onnxruntime/test/optimizer/graph_transform_test.cc
+++ b/onnxruntime/test/optimizer/graph_transform_test.cc
@@ -131,6 +131,33 @@ TEST(GraphTransformationTests, ConstantFolding) {
   ASSERT_TRUE(op_to_count["Unsqueeze"] == 0);
 }
 
+TEST(GraphTransformationTests, ConstantFoldingNodesOnDifferentEP) {
+  auto model_uri = MODEL_FOLDER "fusion/fuse-conv-bn-mul-add-unsqueeze.onnx";
+  std::shared_ptr<Model> model;
+  ASSERT_TRUE(Model::Load(model_uri, model, nullptr, DefaultLoggingManager().DefaultLogger()).IsOK());
+  Graph& graph = model->MainGraph();
+  std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
+  ASSERT_TRUE(op_to_count["Unsqueeze"] == 2);
+
+  onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
+  graph_transformation_mgr.Register(onnxruntime::make_unique<ConstantFolding>(), TransformerLevel::Level1);
+
+  // assign all nodes to CUDA. the constant folding should override this to perform the constant folding on cpu
+  for (auto& node : graph.Nodes()) {
+    node.SetExecutionProviderType(kCudaExecutionProvider);
+  }
+
+  ASSERT_TRUE(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, DefaultLoggingManager().DefaultLogger()).IsOK());
+
+  op_to_count = CountOpsInGraph(graph);
+  ASSERT_TRUE(op_to_count["Unsqueeze"] == 0);
+
+  // all remaining nodes should still be on CUDA
+  for (auto& node : graph.Nodes()) {
+    EXPECT_STREQ(node.GetExecutionProviderType().c_str(), kCudaExecutionProvider);
+  }
+}
+
 TEST(GraphTransformationTests, ConstantFoldingSubgraph) {
   TensorProto value_tensor;
   value_tensor.add_dims(1);
@@ -1010,7 +1037,6 @@ static void ValidateAttention(Graph& graph) {
       for (size_t i = 0; i < expected_value2.size(); i++) {
         EXPECT_EQ(data2[i], static_cast<float>(expected_value2[i]));
       }
-
     }
   }
 }