From b85805ed013b0a9d5f9a750bf3ef4debb31ff401 Mon Sep 17 00:00:00 2001
From: Scott McKay <skottmckay@gmail.com>
Date: Sat, 30 May 2020 07:08:21 +1000
Subject: [PATCH] Handle edge case with implicit input and multiple levels of
 subgraphs (#4031)

* Handle edge case where an implicit input for a subgraph may not get wired in correctly.

Conditions required:
  - two or more levels of nested subgraph
  - an implicit input from above the bottom two levels is used in both levels of subgraph
    - this creates a NodeArg for the implicit input at both levels
  - something changes to the first level subgraph to no longer use the implicit input
    - could be constant folding, could be partitioning of nodes results in a copy of the implicit input being made to a different device

When that occurs we lose the wiring through to the second level of nested subgraph as there's a NodeArg in the first level but the implicit input is no longer used there. Fix that by doing a final check for outer scope values once we know all the outputs produced by the current graph.

Found by commenting out the CUDA implementations of the control flow nodes and running ssd_mobilenet_300 from the mlperf models.

* Add test case.
---
 onnxruntime/core/graph/graph.cc               |  24 ++++++-
 .../providers/cpu/controlflow/loop_test.cc    |  31 ++++++++
 .../test/testdata/ort_github_issue_4031.onnx  | Bin 0 -> 974 bytes
 .../test/testdata/ort_github_issue_4031.py    |  66 ++++++++++++++++++
 4 files changed, 118 insertions(+), 3 deletions(-)
 create mode 100644 onnxruntime/test/testdata/ort_github_issue_4031.onnx
 create mode 100644 onnxruntime/test/testdata/ort_github_issue_4031.py

diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc
index 9b144b89fe..89e59c0850 100644
--- a/onnxruntime/core/graph/graph.cc
+++ b/onnxruntime/core/graph/graph.cc
@@ -1124,6 +1124,8 @@ Status Graph::BuildConnections(std::unordered_set<std::string>& outer_scope_node
   const std::unordered_set<std::string>& outer_scope_node_args = resolve_context_.outer_scope_node_args;
   std::unordered_set<Node*> inner_nodes;
 
+  std::unordered_set<std::string> node_args_consumed_by_subgraphs;
+
   // recurse into subgraphs first so we can update any nodes in this graph that are used by those subgraphs
   if (!resolve_context_.nodes_with_subgraphs.empty()) {
     for (auto* node : resolve_context_.nodes_with_subgraphs) {
@@ -1157,6 +1159,12 @@ Status Graph::BuildConnections(std::unordered_set<std::string>& outer_scope_node
                   "This is an invalid model. Failed to find NodeArg in all parent graphs. Name=", node_arg_name,
                   " Graph may not conform to the ONNX spec and contain initializers that are not graph inputs.");
             }
+          } else {
+            // this value may be produced by this graph, or it could still be coming from a parent graph if it
+            // is also directly consumed at this level as we create a NodeArg for all Node inputs in this graph.
+            // due to that we need to check the outputs from this level to determine if it is an outer scope value.
+            // we don't have that info yet so store and check before returning from BuildConnections
+            ORT_IGNORE_RETURN_VALUE(node_args_consumed_by_subgraphs.insert(node_arg_name));
           }
 
           // add it to the Node's list of implicit inputs
@@ -1178,8 +1186,9 @@ Status Graph::BuildConnections(std::unordered_set<std::string>& outer_scope_node
 
             inner_nodes.insert(&output_node);
 
-            // If this Graph was built manually, remove the implicit input from the graph outputs if it is present there
-            // and not explicitly listed in the ordered graph outputs (as that implies we should leave it as an output).
+            // If this Graph was built manually, remove the implicit input from the graph outputs
+            // if it is present there and not explicitly listed in the ordered graph outputs
+            // (as that implies we should leave it as an output).
             // If the Graph was loaded from a GraphProto, honor the explicit graph outputs and leave as is.
             if (!is_loaded_from_model_file_) {
               graph_outputs_.erase(std::remove(graph_outputs_.begin(), graph_outputs_.end(), node_arg),
@@ -1252,8 +1261,17 @@ Status Graph::BuildConnections(std::unordered_set<std::string>& outer_scope_node
     }
   }
 
+  // finally check any node args consumed by subgraphs to see if they're available locally.
+  // if not we add them to the list of outer scope values consumed.
+  for (const auto& name : node_args_consumed_by_subgraphs) {
+    if (node_arg_to_producer_node_.count(name) == 0 &&
+        resolve_context_.inputs_and_initializers.find(name) == resolve_context_.inputs_and_initializers.cend()) {
+      ORT_IGNORE_RETURN_VALUE(outer_scope_node_args_consumed.insert(name));
+    }
+  }
+
   return Status::OK();
-}  // namespace onnxruntime
+}
 
 void Graph::ReverseDFSFrom(const std::vector<NodeIndex>& from,
                            const std::function<void(const Node*)>& enter,
diff --git a/onnxruntime/test/providers/cpu/controlflow/loop_test.cc b/onnxruntime/test/providers/cpu/controlflow/loop_test.cc
index 41d29d97aa..868287474f 100644
--- a/onnxruntime/test/providers/cpu/controlflow/loop_test.cc
+++ b/onnxruntime/test/providers/cpu/controlflow/loop_test.cc
@@ -929,6 +929,37 @@ TEST(Loop, PassThroughSubgraphInputNoTypeOrShape) {
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
 }
 
+TEST(Loop, BugFixIssue4031_implicit_input_handling) {
+  SessionOptions so;
+  so.graph_optimization_level = TransformerLevel::Level2;  // we need constant folding to run
+  InferenceSession session_object{so, GetEnvironment()};
+  static constexpr const ORTCHAR_T* MODEL_URI = ORT_TSTR("testdata/ort_github_issue_4031.onnx");
+
+  ASSERT_STATUS_OK(session_object.Load(MODEL_URI));
+  ASSERT_STATUS_OK(session_object.Initialize());
+
+  onnxruntime::RunOptions run_options;
+  run_options.run_tag = "BugFixIssue4031_implicit_input_handling";
+
+  // prepare inputs
+  OrtValue ml_value;
+  CreateMLValue<float>(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), {1}, {123.f},
+                       &ml_value);
+  NameMLValMap feeds;
+  feeds.insert(std::make_pair("state_var_in", ml_value));
+
+  // prepare outputs
+  std::vector<std::string> output_names{"state_var_out"};
+  std::vector<OrtValue> fetches;
+
+  // Now run
+  ASSERT_STATUS_OK(session_object.Run(run_options, feeds, output_names, &fetches));
+
+  const auto& output = fetches[0].Get<Tensor>();
+  ASSERT_TRUE(output.Shape().Size() == 1);
+  ASSERT_TRUE(output.Data<float>()[0] == 125.f);
+}
+
 #ifdef USE_CUDA
 // test that when part of the subgraph run on CUDA it executes successfully
 TEST(Loop, MixedExecutionProviders) {
diff --git a/onnxruntime/test/testdata/ort_github_issue_4031.onnx b/onnxruntime/test/testdata/ort_github_issue_4031.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..d75c755816ff0ef2104384edbf79bb8443ea43ab
GIT binary patch
literal 974
zcmd5*u~Nc75cNVrk`<7N3J%(s(Iy>~=!_#)jQRsKHy4t?jgVxLi=$SS{(}v@-{CL#
z0e*zFcLe1?(%a;6ySKaV?Y=ei?z;|eWvETb17-=0@i59l25{&5F&;%U96=>zg!%ZM
zB$$R)b$N?2*3xdGC~n)TIGj%nR5$V-J0BWc!+x5%c$AQM%;PAdWI~_(1R7I9!@qWG
zM|y<QY}!&UJ+Ez-_!rJ#-3!$TuZVLn99B)b3S1(%lLvn((BM#VBuid`OmmZ6gqQk3
z+gAF4^NL{GpQJu^6A})`ofib>s~ssMq!Z7>B3Zc~VE&(@vEj#VmANy{n5|xslsP6S
zEIKe}NU>@(Sk%L!o$ekio>~BgVOlCE2AUylj*gA}@L!^{Od*On1LxR4b!4ksThAB0
z&AJ#ILX|Q<A&f>L4zuZ;tDbWmoD_~w>WD}o$$-ySI8mO$J%yra%UyMkU`<ML65ewQ
wZh!UbG&wE3i%@TEe;ixlTOfF6m%lL`9xN@}ptUM4xwy;ett_AH>?!riHwb4S6#xJL

literal 0
HcmV?d00001

diff --git a/onnxruntime/test/testdata/ort_github_issue_4031.py b/onnxruntime/test/testdata/ort_github_issue_4031.py
new file mode 100644
index 0000000000..f991b41024
--- /dev/null
+++ b/onnxruntime/test/testdata/ort_github_issue_4031.py
@@ -0,0 +1,66 @@
+import onnx
+from onnx import helper
+from onnx import TensorProto
+
+if_body = helper.make_graph(
+    [
+        # need to use main_graph_initializer in a way that can't be constant folded
+        helper.make_node("Add", ["state_var_in", "main_graph_initializer"], ["add_out"], "If_add"),
+        helper.make_node("Cast", ["add_out"], ["output"], to=TensorProto.BOOL),
+    ],
+    "if_branch_body",
+    [
+        # no explicit inputs
+    ],
+    [
+        helper.make_tensor_value_info('output', TensorProto.BOOL, [1]), # how is this getting a type of float?
+    ])
+
+# Loop body graph with If node and usage of main_graph_initializer on this level
+body = helper.make_graph(
+    [
+        # Add node that can be constant folded. Creates NodeArg when created but that implicit usage of an outer scope
+        # value main_graph_initializer goes away after constant folding
+        helper.make_node("Add", ["sub_graph_initializer", "main_graph_initializer"], ["initializer_sum"], "Add1"),
+        helper.make_node("Add", ["initializer_sum", "loop_state_in"], ["loop_state_out"], "Add2"),
+        # If node to create usage of main_graph_initializer another level down
+        helper.make_node("If", ["subgraph_keep_going_in"], ["subgraph_keep_going_out"], "If1",
+                         then_branch=if_body, else_branch=if_body),
+    ],
+    "Loop_body",
+    [
+        helper.make_tensor_value_info('iteration_num', TensorProto.INT64, [1]),
+        helper.make_tensor_value_info('subgraph_keep_going_in', TensorProto.BOOL, [1]),
+        helper.make_tensor_value_info('loop_state_in', TensorProto.FLOAT, [1])
+    ],
+    [
+        helper.make_tensor_value_info('subgraph_keep_going_out', TensorProto.BOOL, [1]),
+        helper.make_tensor_value_info('loop_state_out', TensorProto.FLOAT, [1]),
+    ],
+    [
+        helper.make_tensor('sub_graph_initializer', TensorProto.FLOAT, [1], [1.])
+    ]
+)
+
+# Create the main graph
+graph_proto = helper.make_graph(
+    [
+        helper.make_node("Loop", ["max_trip_count", "keep_going", "state_var_in"],
+                         ["state_var_out"], "Loop1", body=body)
+    ],
+    "Main_graph",
+    [
+        helper.make_tensor_value_info('state_var_in', TensorProto.FLOAT, [1]),
+    ],
+    [
+        helper.make_tensor_value_info('state_var_out', TensorProto.FLOAT, [1]),
+    ],
+    [
+        helper.make_tensor('max_trip_count', TensorProto.INT64, [1], [1]),
+        helper.make_tensor('main_graph_initializer', TensorProto.FLOAT, [1], [1.]),
+        helper.make_tensor('keep_going', TensorProto.BOOL, [1], [True]),
+    ]
+)
+
+model = helper.make_model(graph_proto)
+onnx.save(model, 'ort_github_issue_4031.onnx')
\ No newline at end of file