Fix bug in Loop optimization (#4210)

* Fix bug where an optimization to avoid a copy resulted in the iteration num for a Loop subgraph

* Update comments to clarify
This commit is contained in:
Scott McKay 2020-08-11 06:31:29 +10:00 committed by GitHub
parent 082a741636
commit 6c33d7f5df
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 103 additions and 9 deletions

View file

@ -580,16 +580,36 @@ class PlannerImpl {
// node_output is graph's output, so we can't reuse intermediate buffer
AllocPlan(current).alloc_kind = AllocKind::kAllocateOutput;
// hacky perf optimization to not copy a pre-existing value to an output if this is a Loop subgraph.
// ideally this is temporary, and a future ONNX change to allow empty variadic inputs means we don't
// have converted models that unnecessarily add loop state variables. if the value is just being
// passed through an implicit input should be used instead.
// hacky perf optimization to not copy a pre-existing value to an output if this is a Loop subgraph and
// the value is not being changed in the subgraph.
//
// this usage of a loop state variable has been seen in two scenarios. both have better alternatives now.
// we maintain the optimization for existing models.
//
// 1. a loop state variable was being provided due to ONNX not supporting empty variadic inputs.
// a dummy loop state variable was required in this case.
// ONNX now supports empty variadic inputs, so a new model should not add a dummy loop state variable.
//
// 2. a loop state variable was being used to explicitly pass in an outer scope value to the subgraph.
// this sort of usage is automatically handled via implicit inputs and there's no need to add a
// loop state variable in order to access the outer scope value.
if (parent_node_ && pnode->OpType() == "Identity" && parent_node_->OpType() == "Loop") {
const auto& input_name = pnode->InputDefs()[0]->Name();
const auto input_index = Index(input_name);
const auto& alloc_plan = AllocPlan(input_index);
if (alloc_plan.alloc_kind == AllocKind::kPreExisting) {
Reuse(input_index, current, AllocKind::kShare);
const NodeArg* input = pnode->InputDefs()[0];
// first input to the Loop subgraph is the iteration number.
bool input_is_loop_iteration_number = input == graph_viewer_.GetInputs()[0];
if (input_is_loop_iteration_number) {
// as the value inside the OrtValue gets changed by the Loop implementation on each iteration
// (so it can re-use the OrtValue instance) if it is also a subgraph output it must be allocated
// so a copy of the current value is returned, so leave alloc_kind as kAllocateOutput
} else {
const auto& input_name = input->Name();
const auto input_index = Index(input_name);
const auto& alloc_plan = AllocPlan(input_index);
if (alloc_plan.alloc_kind == AllocKind::kPreExisting) {
Reuse(input_index, current, AllocKind::kShare);
}
}
}
} else if (IsNonTensor(*node_output)) {

View file

@ -960,6 +960,80 @@ TEST(Loop, BugFixIssue4031_implicit_input_handling) {
ASSERT_TRUE(output.Data<float>()[0] == 125.f);
}
// check the optimization in AllocationPlanner doesn't affect the iteration count when it is passed through
// and becomes a subgraph output. if we prevent a separate allocation for the output via the optimization
// the final value will be repeated in the loop output instead of the value from each iteration
TEST(Loop, IterationCountAsOutput) {
auto create_subgraph = []() {
Model model("Iter_num_in subgraph output", false, DefaultLoggingManager().DefaultLogger());
auto& graph = model.MainGraph();
std::vector<NodeArg*> inputs;
std::vector<NodeArg*> outputs;
/* Inputs: iter_num, cond_in, loop carried state variables.
iter_num_in cond_in
| |
[Identity] [Identity]
| |
loop_var_0_out cond_out
*/
// graph inputs types.
TypeProto int64_scalar;
int64_scalar.mutable_tensor_type()->set_elem_type(TensorProto_DataType_INT64);
int64_scalar.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1);
TypeProto bool_scalar;
bool_scalar.mutable_tensor_type()->set_elem_type(TensorProto_DataType_BOOL);
bool_scalar.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1);
// graph inputs
auto& iter_num_in = graph.GetOrCreateNodeArg("iter_num_in", &int64_scalar);
auto& cond_in = graph.GetOrCreateNodeArg("cond_in", &bool_scalar);
// graph outputs
auto& cond_out = graph.GetOrCreateNodeArg("cond_out", &bool_scalar);
auto& loop_var_0_out = graph.GetOrCreateNodeArg("loop_var_0_out", &int64_scalar);
// iter_num_in -> loop_var_0_out
{
inputs = {&iter_num_in};
outputs = {&loop_var_0_out};
graph.AddNode("loop_var_out", "Identity", "Forward cond_in to loop_var_0_out", inputs, outputs);
}
// cond_in -> cond_out
{
inputs = {&cond_in};
outputs = {&cond_out};
graph.AddNode("cond_in_identity", "Identity", "Forward cond_in to cond_out", inputs, outputs);
}
graph.SetInputs({&iter_num_in, &cond_in});
graph.SetOutputs({&cond_out, &loop_var_0_out});
auto status = graph.Resolve();
EXPECT_EQ(status, Status::OK());
return graph.ToGraphProto();
};
OpTester test("Loop", 11);
auto body = create_subgraph();
test.AddAttribute<GraphProto>("body", body);
test.AddInput<int64_t>("M", {1}, {3});
test.AddInput<bool>("cond", {1}, {true});
test.AddOutput<int64_t>("loop_var_0_final", {3, 1}, {0, 1, 2});
// Disable TensorRT on unsupported data type BOOL
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
}
#ifdef USE_CUDA
// test that when part of the subgraph run on CUDA it executes successfully
TEST(Loop, MixedExecutionProviders) {