// Patch summary (commentary only; everything from the first "diff --git" header onward is the patch itself).
//
// Files touched:
//   * onnxruntime/core/framework/allocation_planner.cc
//   * onnxruntime/test/framework/allocation_planner_test.cc
//
// allocation_planner.cc (class PlannerImpl):
//   * ExternalOutputs(const Node&) is moved, unchanged, from just before ComputeReusePlan() to just
//     before ComputeUseCounts(), so it is declared ahead of its new call site. It looks up the node's
//     KernelCreateInfo, returns false when kernel_def is null, and otherwise returns
//     kernel_def->ExternalOutputs().
//   * In the loop over a node's output defs, the use count of each existing output is now bumped by 2
//     instead of 1 when the node's kernel reports external outputs. Per the in-patch comment this
//     "ensures external outputs will not be reused"; presumably the extra count is never released, so
//     the value never looks dead to the reuse logic - TODO confirm against ComputeReusePlan().
//
// allocation_planner_test.cc (fixture PlannerTest):
//   * Adds external_outputs_kernel_, a "Tanh" CPU kernel def built with .ExternalOutputs().
//   * Adds AddExternalOutputsNode(input, output), which forwards to AddNode with that kernel.
//   * Adds ExternalOutputsTest: graph X1 -> [external-outputs node] -> X2 -> [normal] -> X3 -> [normal] -> X4.
//     Expected plan: X1 and X2 are kPreExisting, X3 is kAllocate, X4 is kAllocateOutput; nothing is
//     freed at steps 0 and 1, and only X3 is freed at step 2 (X2 is never freed).
//
// NOTE(review): in this copy the patch's line breaks are collapsed and some template argument lists
// look stripped (e.g. `static_cast(index)`, `std::unordered_set graph_inputs`); presumably an
// extraction artifact - regenerate from the upstream commit before trying to apply it.
diff --git a/onnxruntime/core/framework/allocation_planner.cc b/onnxruntime/core/framework/allocation_planner.cc index f7c3fbc76a..8c72823f88 100644 --- a/onnxruntime/core/framework/allocation_planner.cc +++ b/onnxruntime/core/framework/allocation_planner.cc @@ -427,6 +427,15 @@ class PlannerImpl { plan_.allocation_plan.resize(num_ml_values); } + bool ExternalOutputs(const Node& node) const { + const KernelCreateInfo& ci = GetKernelCreateInfo(kernel_create_info_map_, node.Index()); + if (ci.kernel_def == nullptr) { + return false; + } + + return ci.kernel_def->ExternalOutputs(); + } + Status ComputeUseCounts() { // Note: for every ml-value, its definition must appear before all its uses in a topological sort of a valid model std::unordered_set graph_inputs; @@ -511,12 +520,14 @@ class PlannerImpl { auto outputs = pnode->OutputDefs(); auto num_outputs = outputs.size(); + bool external_outputs = ExternalOutputs(*pnode); for (size_t i = 0; i < num_outputs; ++i) { auto* node_output = outputs[i]; if (!node_output->Exists()) continue; OrtValueIndex index = Index(node_output->Name()); ProcessDef(index, node_output); - ++UseCount(index); + // Ensures external outputs will not be reused. + UseCount(index) += (external_outputs ? 
2 : 1); auto allocator = exec_provider->GetAllocator(0, p_kernel_def->OutputMemoryType(i)); ORT_ENFORCE(allocator); plan_.SetLocation(static_cast(index), @@ -600,15 +611,6 @@ class PlannerImpl { return Status::OK(); } - bool ExternalOutputs(const Node& node) const { - const KernelCreateInfo& ci = GetKernelCreateInfo(kernel_create_info_map_, node.Index()); - if (ci.kernel_def == nullptr) { - return false; - } - - return ci.kernel_def->ExternalOutputs(); - } - // Should only be used after ProcessDef() Status ComputeReusePlan() { std::vector& execution_plan(plan_.execution_plan); diff --git a/onnxruntime/test/framework/allocation_planner_test.cc b/onnxruntime/test/framework/allocation_planner_test.cc index 078f284f0e..8b35ca101a 100644 --- a/onnxruntime/test/framework/allocation_planner_test.cc +++ b/onnxruntime/test/framework/allocation_planner_test.cc @@ -156,6 +156,7 @@ class PlannerTest : public ::testing::Test { std::unique_ptr<::onnxruntime::KernelDef> std_kernel_; // a unary kernel with no-aliasing and no-in-place std::unique_ptr<::onnxruntime::KernelDef> in_place_kernel_; // a unary kernel with in-place + std::unique_ptr<::onnxruntime::KernelDef> external_outputs_kernel_; // a unary kernel with external outputs std::unordered_map name_to_arg_; std::vector> nodes_; @@ -178,6 +179,8 @@ class PlannerTest : public ::testing::Test { std_kernel_ = KernelDefBuilder().SetName("Transpose").Provider(kCpuExecutionProvider).SinceVersion(1, 10).Build(); in_place_kernel_ = KernelDefBuilder().SetName("Relu").Provider(kCpuExecutionProvider).SinceVersion(1, 10).MayInplace(0, 0).Build(); + external_outputs_kernel_ = + KernelDefBuilder().SetName("Tanh").Provider(kCpuExecutionProvider).SinceVersion(1, 10).ExternalOutputs().Build(); CPUExecutionProviderInfo epi; auto execution_provider = onnxruntime::make_unique(epi); execution_providers_.Add("CPUExecutionProvider", std::move(execution_provider)); @@ -209,6 +212,10 @@ class PlannerTest : public ::testing::Test { return 
AddNode(*in_place_kernel_, input, output); } + onnxruntime::Node* AddExternalOutputsNode(std::string& input, std::string& output) { + return AddNode(*external_outputs_kernel_, input, output); + } + void BindKernel(onnxruntime::Node* p_node, ::onnxruntime::KernelDef& kernel_def, KernelRegistry* reg, std::unordered_map>& kernel_create_info_map) { const IExecutionProvider* ep = execution_providers_.Get(*p_node); @@ -403,6 +410,35 @@ TEST_F(PlannerTest, InPlaceTest) { CheckFreed(2, {X2}); } +TEST_F(PlannerTest, ExternalOutputsTest) { + // tensor variables: + std::string X1("X1"), X2("X2"), X3("X3"), X4("X4"); + + // graph structure: + AddExternalOutputsNode(X1, X2); // external-outputs operator; X1: input; X2: temporary + AddNormalNode(X2, X3); // normal operator; X3: temporary + AddNormalNode(X3, X4); // normal operator; X4: output + + // simulate shape-inference results: + Shape shape1{"M", "N"}; + auto shape = &shape1.value; + SetShape({{X1, shape}, {X2, shape}, {X3, shape}, {X4, shape}}); + + CreatePlan(); + + // check allocation kind: + CheckAllocKind(X1, AllocKind::kPreExisting); + CheckAllocKind(X2, AllocKind::kPreExisting); + CheckAllocKind(X3, AllocKind::kAllocate); + CheckAllocKind(X4, AllocKind::kAllocateOutput); + + // check each ml-value is freed at appropriate step + // X2 will not be reused and will not be freed. X3 will be allocated and will be freed. + CheckFreed(0, {}); + CheckFreed(1, {}); + CheckFreed(2, {X3}); +} + // InPlaceSizeMismatchTest: Check that Inplace reuse is not allowed when sizes don't match. // Also tests reuse of disjoint lifetime tensors. TEST_F(PlannerTest, InPlaceSizeMismatchTest) {