From 6c33d7f5df8006a8c7cb741ef114e2ff6c3feb41 Mon Sep 17 00:00:00 2001
From: Scott McKay <skottmckay@gmail.com>
Date: Tue, 11 Aug 2020 06:31:29 +1000
Subject: [PATCH] Fix bug in Loop optimization  (#4210)

* Fix bug where an optimization to avoid a copy resulted in the iteration num for a Loop subgraph

* Update comments to clarify
---
 .../core/framework/allocation_planner.cc      | 38 +++++++---
 .../providers/cpu/controlflow/loop_test.cc    | 74 +++++++++++++++++++
 2 files changed, 103 insertions(+), 9 deletions(-)

diff --git a/onnxruntime/core/framework/allocation_planner.cc b/onnxruntime/core/framework/allocation_planner.cc
index 75298a8ee9..b39a79b2a9 100644
--- a/onnxruntime/core/framework/allocation_planner.cc
+++ b/onnxruntime/core/framework/allocation_planner.cc
@@ -580,16 +580,36 @@ class PlannerImpl {
           // node_output is graph's output, so we can't reuse intermediate buffer
           AllocPlan(current).alloc_kind = AllocKind::kAllocateOutput;
 
-          // hacky perf optimization to not copy a pre-existing value to an output if this is a Loop subgraph.
-          // ideally this is temporary, and a future ONNX change to allow empty variadic inputs means we don't
-          // have converted models that unnecessarily add loop state variables. if the value is just being
-          // passed through an implicit input should be used instead.
+          // hacky perf optimization to not copy a pre-existing value to an output if this is a Loop subgraph and
+          // the value is not being changed in the subgraph.
+          //
+          // this usage of a loop state variable has been seen in two scenarios. both have better alternatives now.
+          // we maintain the optimization for existing models.
+          //
+          // 1. a loop state variable was being provided due to ONNX not supporting empty variadic inputs.
+          //    a dummy loop state variable was required in this case.
+          //    ONNX now supports empty variadic inputs, so a new model should not add a dummy loop state variable.
+          //
+          // 2. a loop state variable was being used to explicitly pass in an outer scope value to the subgraph.
+          //    this sort of usage is automatically handled via implicit inputs and there's no need to add a
+          //    loop state variable in order to access the outer scope value.
           if (parent_node_ && pnode->OpType() == "Identity" && parent_node_->OpType() == "Loop") {
-            const auto& input_name = pnode->InputDefs()[0]->Name();
-            const auto input_index = Index(input_name);
-            const auto& alloc_plan = AllocPlan(input_index);
-            if (alloc_plan.alloc_kind == AllocKind::kPreExisting) {
-              Reuse(input_index, current, AllocKind::kShare);
+            const NodeArg* input = pnode->InputDefs()[0];
+
+            // first input to the Loop subgraph is the iteration number.
+            bool input_is_loop_iteration_number = input == graph_viewer_.GetInputs()[0];
+            if (input_is_loop_iteration_number) {
+              // as the value inside the OrtValue gets changed by the Loop implementation on each iteration
+              // (so it can re-use the OrtValue instance) if it is also a subgraph output it must be allocated
+              // so a copy of the current value is returned, so leave alloc_kind as kAllocateOutput
+            } else {
+              const auto& input_name = input->Name();
+              const auto input_index = Index(input_name);
+
+              const auto& alloc_plan = AllocPlan(input_index);
+              if (alloc_plan.alloc_kind == AllocKind::kPreExisting) {
+                Reuse(input_index, current, AllocKind::kShare);
+              }
             }
           }
         } else if (IsNonTensor(*node_output)) {
diff --git a/onnxruntime/test/providers/cpu/controlflow/loop_test.cc b/onnxruntime/test/providers/cpu/controlflow/loop_test.cc
index 868287474f..9a6ce3f7f8 100644
--- a/onnxruntime/test/providers/cpu/controlflow/loop_test.cc
+++ b/onnxruntime/test/providers/cpu/controlflow/loop_test.cc
@@ -960,6 +960,80 @@ TEST(Loop, BugFixIssue4031_implicit_input_handling) {
   ASSERT_TRUE(output.Data<float>()[0] == 125.f);
 }
 
+// check the optimization in AllocationPlanner doesn't affect the iteration count when it is passed through
+// and becomes a subgraph output. if we prevent a separate allocation for the output via the optimization
+// the final value will be repeated in the loop output instead of the value from each iteration
+TEST(Loop, IterationCountAsOutput) {
+  auto create_subgraph = []() {
+    Model model("Iter_num_in subgraph output", false, DefaultLoggingManager().DefaultLogger());
+    auto& graph = model.MainGraph();
+
+    std::vector<NodeArg*> inputs;
+    std::vector<NodeArg*> outputs;
+
+    /* Inputs: iter_num, cond_in, loop carried state variables.
+
+         iter_num_in    cond_in     
+             |             |        
+         [Identity]   [Identity]   
+             |             |        
+     loop_var_0_out    cond_out    
+    */
+
+    // graph inputs types.
+    TypeProto int64_scalar;
+    int64_scalar.mutable_tensor_type()->set_elem_type(TensorProto_DataType_INT64);
+    int64_scalar.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1);
+
+    TypeProto bool_scalar;
+    bool_scalar.mutable_tensor_type()->set_elem_type(TensorProto_DataType_BOOL);
+    bool_scalar.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1);
+
+    // graph inputs
+    auto& iter_num_in = graph.GetOrCreateNodeArg("iter_num_in", &int64_scalar);
+    auto& cond_in = graph.GetOrCreateNodeArg("cond_in", &bool_scalar);
+
+    // graph outputs
+    auto& cond_out = graph.GetOrCreateNodeArg("cond_out", &bool_scalar);
+    auto& loop_var_0_out = graph.GetOrCreateNodeArg("loop_var_0_out", &int64_scalar);
+
+    // iter_num_in -> loop_var_0_out
+    {
+      inputs = {&iter_num_in};
+      outputs = {&loop_var_0_out};
+
+      graph.AddNode("loop_var_out", "Identity", "Forward cond_in to loop_var_0_out", inputs, outputs);
+    }
+
+    // cond_in -> cond_out
+    {
+      inputs = {&cond_in};
+      outputs = {&cond_out};
+
+      graph.AddNode("cond_in_identity", "Identity", "Forward cond_in to cond_out", inputs, outputs);
+    }
+
+    graph.SetInputs({&iter_num_in, &cond_in});
+    graph.SetOutputs({&cond_out, &loop_var_0_out});
+
+    auto status = graph.Resolve();
+    EXPECT_EQ(status, Status::OK());
+
+    return graph.ToGraphProto();
+  };
+
+  OpTester test("Loop", 11);
+  auto body = create_subgraph();
+  test.AddAttribute<GraphProto>("body", body);
+  test.AddInput<int64_t>("M", {1}, {3});
+  test.AddInput<bool>("cond", {1}, {true});
+
+  test.AddOutput<int64_t>("loop_var_0_final", {3, 1}, {0, 1, 2});
+
+  // Disable TensorRT on unsupported data type BOOL
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
+}
+
 #ifdef USE_CUDA
 // test that when part of the subgraph run on CUDA it executes successfully
 TEST(Loop, MixedExecutionProviders) {