Add UseCount for External Outputs (#6894)

* Add use count for external outputs * Add unit test
2026-06-27 03:11:28 +00:00 · 2021-03-09 17:06:27 +08:00 · 2021-03-09 17:06:27 +08:00 · 91c6a330c0
commit 91c6a330c0
parent f1ade14e44
2 changed files with 48 additions and 10 deletions
--- a/onnxruntime/core/framework/allocation_planner.cc
+++ b/onnxruntime/core/framework/allocation_planner.cc
@ -427,6 +427,15 @@ class PlannerImpl {
    plan_.allocation_plan.resize(num_ml_values);
  }

+  bool ExternalOutputs(const Node& node) const {  // Reports whether the kernel def bound to `node` declares external outputs.
+    const KernelCreateInfo& ci = GetKernelCreateInfo(kernel_create_info_map_, node.Index());
+    if (ci.kernel_def == nullptr) {  // No kernel def recorded for this node: report "not external".
+      return false;
+    }
+
+    return ci.kernel_def->ExternalOutputs();  // Otherwise defer to the kernel def's own flag.
+  }
+
  Status ComputeUseCounts() {
    // Note: for every ml-value, its definition must appear before all its uses in a topological sort of a valid model
    std::unordered_set<std::string> graph_inputs;
@ -511,12 +520,14 @@ class PlannerImpl {

      auto outputs = pnode->OutputDefs();
      auto num_outputs = outputs.size();
+      bool external_outputs = ExternalOutputs(*pnode);
      for (size_t i = 0; i < num_outputs; ++i) {
        auto* node_output = outputs[i];
        if (!node_output->Exists()) continue;
        OrtValueIndex index = Index(node_output->Name());
        ProcessDef(index, node_output);
-        ++UseCount(index);
+        // Count one extra use for external outputs so that they will not be reused.
+        UseCount(index) += (external_outputs ? 2 : 1);
        auto allocator = exec_provider->GetAllocator(0, p_kernel_def->OutputMemoryType(i));
        ORT_ENFORCE(allocator);
        plan_.SetLocation(static_cast<size_t>(index),
@ -600,15 +611,6 @@ class PlannerImpl {
    return Status::OK();
  }

-  bool ExternalOutputs(const Node& node) const {
-    const KernelCreateInfo& ci = GetKernelCreateInfo(kernel_create_info_map_, node.Index());
-    if (ci.kernel_def == nullptr) {
-      return false;
-    }
-
-    return ci.kernel_def->ExternalOutputs();
-  }
-
  // Should only be used after ProcessDef()
  Status ComputeReusePlan() {
    std::vector<SequentialExecutionPlan::NodeExecutionPlan>& execution_plan(plan_.execution_plan);
--- a/onnxruntime/test/framework/allocation_planner_test.cc
+++ b/onnxruntime/test/framework/allocation_planner_test.cc
@ -156,6 +156,7 @@ class PlannerTest : public ::testing::Test {

  std::unique_ptr<::onnxruntime::KernelDef> std_kernel_;       // a unary kernel with no-aliasing and no-in-place
  std::unique_ptr<::onnxruntime::KernelDef> in_place_kernel_;  // a unary kernel with in-place
+  std::unique_ptr<::onnxruntime::KernelDef> external_outputs_kernel_; // a unary kernel with external outputs

  std::unordered_map<std::string, onnxruntime::NodeArg*> name_to_arg_;
  std::vector<std::unique_ptr<UnaryNode>> nodes_;
@ -178,6 +179,8 @@ class PlannerTest : public ::testing::Test {
    std_kernel_ = KernelDefBuilder().SetName("Transpose").Provider(kCpuExecutionProvider).SinceVersion(1, 10).Build();
    in_place_kernel_ =
        KernelDefBuilder().SetName("Relu").Provider(kCpuExecutionProvider).SinceVersion(1, 10).MayInplace(0, 0).Build();
+    external_outputs_kernel_ =
+        KernelDefBuilder().SetName("Tanh").Provider(kCpuExecutionProvider).SinceVersion(1, 10).ExternalOutputs().Build();
    CPUExecutionProviderInfo epi;
    auto execution_provider = onnxruntime::make_unique<CPUExecutionProvider>(epi);
    execution_providers_.Add("CPUExecutionProvider", std::move(execution_provider));
@ -209,6 +212,10 @@ class PlannerTest : public ::testing::Test {
    return AddNode(*in_place_kernel_, input, output);
  }

+  onnxruntime::Node* AddExternalOutputsNode(std::string& input, std::string& output) {
+    return AddNode(*external_outputs_kernel_, input, output);  // same as the other Add*Node helpers, but uses the external-outputs kernel def
+  }
+
  void BindKernel(onnxruntime::Node* p_node, ::onnxruntime::KernelDef& kernel_def, KernelRegistry* reg,
                  std::unordered_map<NodeIndex, gsl::not_null<const KernelCreateInfo*>>& kernel_create_info_map) {
    const IExecutionProvider* ep = execution_providers_.Get(*p_node);
@ -403,6 +410,35 @@ TEST_F(PlannerTest, InPlaceTest) {
  CheckFreed(2, {X2});
 }

+TEST_F(PlannerTest, ExternalOutputsTest) {  // Plans a three-node chain whose first node has external outputs.
+  // tensor variables (names of the values flowing through the chain):
+  std::string X1("X1"), X2("X2"), X3("X3"), X4("X4");
+
+  // graph structure: X1 -> [external-outputs op] -> X2 -> [normal op] -> X3 -> [normal op] -> X4
+  AddExternalOutputsNode(X1, X2);   // external-outputs operator; X1: input; X2: external output
+  AddNormalNode(X2, X3);  // normal operator; X3: temporary
+  AddNormalNode(X3, X4);   // normal operator; X4: output
+
+  // simulate shape-inference results: every tensor is given the same shape
+  Shape shape1{"M", "N"};
+  auto shape = &shape1.value;
+  SetShape({{X1, shape}, {X2, shape}, {X3, shape}, {X4, shape}});
+
+  CreatePlan();
+
+  // check the allocation kind planned for each tensor:
+  CheckAllocKind(X1, AllocKind::kPreExisting);
+  CheckAllocKind(X2, AllocKind::kPreExisting);  // the external output is expected to be planned as pre-existing
+  CheckAllocKind(X3, AllocKind::kAllocate);
+  CheckAllocKind(X4, AllocKind::kAllocateOutput);
+
+  // check which ml-values are freed at each step:
+  // X2 is expected to be neither reused nor freed; X3 is allocated and is the only value freed (at step 2).
+  CheckFreed(0, {});
+  CheckFreed(1, {});
+  CheckFreed(2, {X3});
+}
+
 // InPlaceSizeMismatchTest: Check that Inplace reuse is not allowed when sizes don't match.
 // Also tests reuse of disjoint lifetime tensors.
 TEST_F(PlannerTest, InPlaceSizeMismatchTest) {