mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-01 23:30:35 +00:00
Fix bug in Loop optimization (#4210)
* Fix bug where an optimization to avoid a copy resulted in the iteration num for a Loop subgraph * Update comments to clarify
This commit is contained in:
parent
082a741636
commit
6c33d7f5df
2 changed files with 103 additions and 9 deletions
|
|
@ -580,16 +580,36 @@ class PlannerImpl {
|
|||
// node_output is graph's output, so we can't reuse intermediate buffer
|
||||
AllocPlan(current).alloc_kind = AllocKind::kAllocateOutput;
|
||||
|
||||
// hacky perf optimization to not copy a pre-existing value to an output if this is a Loop subgraph.
|
||||
// ideally this is temporary, and a future ONNX change to allow empty variadic inputs means we don't
|
||||
// have converted models that unnecessarily add loop state variables. if the value is just being
|
||||
// passed through an implicit input should be used instead.
|
||||
// hacky perf optimization to not copy a pre-existing value to an output if this is a Loop subgraph and
|
||||
// the value is not being changed in the subgraph.
|
||||
//
|
||||
// this usage of a loop state variable has been seen in two scenarios. both have better alternatives now.
|
||||
// we maintain the optimization for existing models.
|
||||
//
|
||||
// 1. a loop state variable was being provided due to ONNX not supporting empty variadic inputs.
|
||||
// a dummy loop state variable was required in this case.
|
||||
// ONNX now supports empty variadic inputs, so a new model should not add a dummy loop state variable.
|
||||
//
|
||||
// 2. a loop state variable was being used to explicitly pass in an outer scope value to the subgraph.
|
||||
// this sort of usage is automatically handled via implicit inputs and there's no need to add a
|
||||
// loop state variable in order to access the outer scope value.
|
||||
if (parent_node_ && pnode->OpType() == "Identity" && parent_node_->OpType() == "Loop") {
|
||||
const auto& input_name = pnode->InputDefs()[0]->Name();
|
||||
const auto input_index = Index(input_name);
|
||||
const auto& alloc_plan = AllocPlan(input_index);
|
||||
if (alloc_plan.alloc_kind == AllocKind::kPreExisting) {
|
||||
Reuse(input_index, current, AllocKind::kShare);
|
||||
const NodeArg* input = pnode->InputDefs()[0];
|
||||
|
||||
// first input to the Loop subgraph is the iteration number.
|
||||
bool input_is_loop_iteration_number = input == graph_viewer_.GetInputs()[0];
|
||||
if (input_is_loop_iteration_number) {
|
||||
// as the value inside the OrtValue gets changed by the Loop implementation on each iteration
|
||||
// (so it can re-use the OrtValue instance) if it is also a subgraph output it must be allocated
|
||||
// so a copy of the current value is returned, so leave alloc_kind as kAllocateOutput
|
||||
} else {
|
||||
const auto& input_name = input->Name();
|
||||
const auto input_index = Index(input_name);
|
||||
|
||||
const auto& alloc_plan = AllocPlan(input_index);
|
||||
if (alloc_plan.alloc_kind == AllocKind::kPreExisting) {
|
||||
Reuse(input_index, current, AllocKind::kShare);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (IsNonTensor(*node_output)) {
|
||||
|
|
|
|||
|
|
@ -960,6 +960,80 @@ TEST(Loop, BugFixIssue4031_implicit_input_handling) {
|
|||
ASSERT_TRUE(output.Data<float>()[0] == 125.f);
|
||||
}
|
||||
|
||||
// check the optimization in AllocationPlanner doesn't affect the iteration count when it is passed through
|
||||
// and becomes a subgraph output. if we prevent a separate allocation for the output via the optimization
|
||||
// the final value will be repeated in the loop output instead of the value from each iteration
|
||||
TEST(Loop, IterationCountAsOutput) {
|
||||
auto create_subgraph = []() {
|
||||
Model model("Iter_num_in subgraph output", false, DefaultLoggingManager().DefaultLogger());
|
||||
auto& graph = model.MainGraph();
|
||||
|
||||
std::vector<NodeArg*> inputs;
|
||||
std::vector<NodeArg*> outputs;
|
||||
|
||||
/* Inputs: iter_num, cond_in, loop carried state variables.
|
||||
|
||||
iter_num_in cond_in
|
||||
| |
|
||||
[Identity] [Identity]
|
||||
| |
|
||||
loop_var_0_out cond_out
|
||||
*/
|
||||
|
||||
// graph inputs types.
|
||||
TypeProto int64_scalar;
|
||||
int64_scalar.mutable_tensor_type()->set_elem_type(TensorProto_DataType_INT64);
|
||||
int64_scalar.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1);
|
||||
|
||||
TypeProto bool_scalar;
|
||||
bool_scalar.mutable_tensor_type()->set_elem_type(TensorProto_DataType_BOOL);
|
||||
bool_scalar.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1);
|
||||
|
||||
// graph inputs
|
||||
auto& iter_num_in = graph.GetOrCreateNodeArg("iter_num_in", &int64_scalar);
|
||||
auto& cond_in = graph.GetOrCreateNodeArg("cond_in", &bool_scalar);
|
||||
|
||||
// graph outputs
|
||||
auto& cond_out = graph.GetOrCreateNodeArg("cond_out", &bool_scalar);
|
||||
auto& loop_var_0_out = graph.GetOrCreateNodeArg("loop_var_0_out", &int64_scalar);
|
||||
|
||||
// iter_num_in -> loop_var_0_out
|
||||
{
|
||||
inputs = {&iter_num_in};
|
||||
outputs = {&loop_var_0_out};
|
||||
|
||||
graph.AddNode("loop_var_out", "Identity", "Forward cond_in to loop_var_0_out", inputs, outputs);
|
||||
}
|
||||
|
||||
// cond_in -> cond_out
|
||||
{
|
||||
inputs = {&cond_in};
|
||||
outputs = {&cond_out};
|
||||
|
||||
graph.AddNode("cond_in_identity", "Identity", "Forward cond_in to cond_out", inputs, outputs);
|
||||
}
|
||||
|
||||
graph.SetInputs({&iter_num_in, &cond_in});
|
||||
graph.SetOutputs({&cond_out, &loop_var_0_out});
|
||||
|
||||
auto status = graph.Resolve();
|
||||
EXPECT_EQ(status, Status::OK());
|
||||
|
||||
return graph.ToGraphProto();
|
||||
};
|
||||
|
||||
OpTester test("Loop", 11);
|
||||
auto body = create_subgraph();
|
||||
test.AddAttribute<GraphProto>("body", body);
|
||||
test.AddInput<int64_t>("M", {1}, {3});
|
||||
test.AddInput<bool>("cond", {1}, {true});
|
||||
|
||||
test.AddOutput<int64_t>("loop_var_0_final", {3, 1}, {0, 1, 2});
|
||||
|
||||
// Disable TensorRT on unsupported data type BOOL
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
|
||||
}
|
||||
|
||||
#ifdef USE_CUDA
|
||||
// test that when part of the subgraph run on CUDA it executes successfully
|
||||
TEST(Loop, MixedExecutionProviders) {
|
||||
|
|
|
|||
Loading…
Reference in a new issue