From 6c33d7f5df8006a8c7cb741ef114e2ff6c3feb41 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Tue, 11 Aug 2020 06:31:29 +1000 Subject: [PATCH] Fix bug in Loop optimization (#4210) * Fix bug where an optimization to avoid a copy resulted in the iteration num for a Loop subgraph * Update comments to clarify --- .../core/framework/allocation_planner.cc | 38 +++++++--- .../providers/cpu/controlflow/loop_test.cc | 74 +++++++++++++++++++ 2 files changed, 103 insertions(+), 9 deletions(-) diff --git a/onnxruntime/core/framework/allocation_planner.cc b/onnxruntime/core/framework/allocation_planner.cc index 75298a8ee9..b39a79b2a9 100644 --- a/onnxruntime/core/framework/allocation_planner.cc +++ b/onnxruntime/core/framework/allocation_planner.cc @@ -580,16 +580,36 @@ class PlannerImpl { // node_output is graph's output, so we can't reuse intermediate buffer AllocPlan(current).alloc_kind = AllocKind::kAllocateOutput; - // hacky perf optimization to not copy a pre-existing value to an output if this is a Loop subgraph. - // ideally this is temporary, and a future ONNX change to allow empty variadic inputs means we don't - // have converted models that unnecessarily add loop state variables. if the value is just being - // passed through an implicit input should be used instead. + // hacky perf optimization to not copy a pre-existing value to an output if this is a Loop subgraph and + // the value is not being changed in the subgraph. + // + // this usage of a loop state variable has been seen in two scenarios. both have better alternatives now. + // we maintain the optimization for existing models. + // + // 1. a loop state variable was being provided due to ONNX not supporting empty variadic inputs. + // a dummy loop state variable was required in this case. + // ONNX now supports empty variadic inputs, so a new model should not add a dummy loop state variable. + // + // 2. a loop state variable was being used to explicitly pass in an outer scope value to the subgraph. + // this sort of usage is automatically handled via implicit inputs and there's no need to add a + // loop state variable in order to access the outer scope value. if (parent_node_ && pnode->OpType() == "Identity" && parent_node_->OpType() == "Loop") { - const auto& input_name = pnode->InputDefs()[0]->Name(); - const auto input_index = Index(input_name); - const auto& alloc_plan = AllocPlan(input_index); - if (alloc_plan.alloc_kind == AllocKind::kPreExisting) { - Reuse(input_index, current, AllocKind::kShare); + const NodeArg* input = pnode->InputDefs()[0]; + + // first input to the Loop subgraph is the iteration number. + bool input_is_loop_iteration_number = input == graph_viewer_.GetInputs()[0]; + if (input_is_loop_iteration_number) { + // as the value inside the OrtValue gets changed by the Loop implementation on each iteration + // (so it can re-use the OrtValue instance) if it is also a subgraph output it must be allocated + // so a copy of the current value is returned, so leave alloc_kind as kAllocateOutput + } else { + const auto& input_name = input->Name(); + const auto input_index = Index(input_name); + + const auto& alloc_plan = AllocPlan(input_index); + if (alloc_plan.alloc_kind == AllocKind::kPreExisting) { + Reuse(input_index, current, AllocKind::kShare); + } } } } else if (IsNonTensor(*node_output)) { diff --git a/onnxruntime/test/providers/cpu/controlflow/loop_test.cc b/onnxruntime/test/providers/cpu/controlflow/loop_test.cc index 868287474f..9a6ce3f7f8 100644 --- a/onnxruntime/test/providers/cpu/controlflow/loop_test.cc +++ b/onnxruntime/test/providers/cpu/controlflow/loop_test.cc @@ -960,6 +960,80 @@ TEST(Loop, BugFixIssue4031_implicit_input_handling) { ASSERT_TRUE(output.Data()[0] == 125.f); } +// check the optimization in AllocationPlanner doesn't affect the iteration count when it is passed through +// and becomes a subgraph output. if we prevent a separate allocation for the output via the optimization +// the final value will be repeated in the loop output instead of the value from each iteration +TEST(Loop, IterationCountAsOutput) { + auto create_subgraph = []() { + Model model("Iter_num_in subgraph output", false, DefaultLoggingManager().DefaultLogger()); + auto& graph = model.MainGraph(); + + std::vector inputs; + std::vector outputs; + + /* Inputs: iter_num, cond_in, loop carried state variables. + + iter_num_in cond_in + | | + [Identity] [Identity] + | | + loop_var_0_out cond_out + */ + + // graph inputs types. + TypeProto int64_scalar; + int64_scalar.mutable_tensor_type()->set_elem_type(TensorProto_DataType_INT64); + int64_scalar.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1); + + TypeProto bool_scalar; + bool_scalar.mutable_tensor_type()->set_elem_type(TensorProto_DataType_BOOL); + bool_scalar.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1); + + // graph inputs + auto& iter_num_in = graph.GetOrCreateNodeArg("iter_num_in", &int64_scalar); + auto& cond_in = graph.GetOrCreateNodeArg("cond_in", &bool_scalar); + + // graph outputs + auto& cond_out = graph.GetOrCreateNodeArg("cond_out", &bool_scalar); + auto& loop_var_0_out = graph.GetOrCreateNodeArg("loop_var_0_out", &int64_scalar); + + // iter_num_in -> loop_var_0_out + { + inputs = {&iter_num_in}; + outputs = {&loop_var_0_out}; + + graph.AddNode("loop_var_out", "Identity", "Forward cond_in to loop_var_0_out", inputs, outputs); + } + + // cond_in -> cond_out + { + inputs = {&cond_in}; + outputs = {&cond_out}; + + graph.AddNode("cond_in_identity", "Identity", "Forward cond_in to cond_out", inputs, outputs); + } + + graph.SetInputs({&iter_num_in, &cond_in}); + graph.SetOutputs({&cond_out, &loop_var_0_out}); + + auto status = graph.Resolve(); + EXPECT_EQ(status, Status::OK()); + + return graph.ToGraphProto(); + }; + + OpTester test("Loop", 11); + auto body = create_subgraph(); + test.AddAttribute("body", body); + test.AddInput("M", {1}, {3}); + test.AddInput("cond", {1}, {true}); + + test.AddOutput("loop_var_0_final", {3, 1}, {0, 1, 2}); + + // Disable TensorRT on unsupported data type BOOL + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + #ifdef USE_CUDA // test that when part of the subgraph run on CUDA it executes successfully TEST(Loop, MixedExecutionProviders) {