Move MemcpyToHost to a separate stream for performance gain (#14487)

### Description Move MemcpyToHost to a separate stream for a performance gain in the default DeviceBasedPartitioner. ### Motivation and Context Our experiments show that placing MemcpyToHost on a separate stream lets it run in parallel with other kernels, especially compute-intensive ones. --------- Co-authored-by: Lei Cao <leca@microsoft.com>
2026-07-11 17:48:34 +00:00 · 2023-02-23 14:52:01 -08:00 · 2023-02-23 14:52:01 -08:00 · a012d60777
commit a012d60777
parent 664e296270
3 changed files with 49 additions and 54 deletions
--- a/onnxruntime/core/framework/allocation_planner.cc
+++ b/onnxruntime/core/framework/allocation_planner.cc
@ -572,15 +572,6 @@ class PlannerImpl {
  }

  Status ComputeReuseCount() {
-    // Note: for every ml-value, its definition must appear before all its uses in a topological sort of a valid model
-    using GraphInputsSet = InlinedHashSet<std::string_view>;
-    const auto& graph_inputs_nodes = graph_viewer_.GetInputsIncludingInitializers();
-    GraphInputsSet graph_inputs;
-    graph_inputs.reserve(graph_inputs_nodes.size());
-    for (auto& graph_input : graph_inputs_nodes) {
-      graph_inputs.insert(graph_input->Name());
-    }
-
    for (auto graph_input : graph_viewer_.GetInputs()) {
      OrtValueIndex index = Index(graph_input->Name());
      UseCount(index)++;  // Models caller's usage post-inference; ensures it will not be reused.
@ -1050,9 +1041,8 @@ class PlannerImpl {
    auto& allocation_plan = plan_.allocation_plan;

    // build the consumer list for each value
-    std::vector<InlinedVector<NodeIndex>> value_consumers;
    int num_ml_values = ort_value_name_idx_map_.MaxIdx() + 1;
-    value_consumers.resize(num_ml_values);
+    value_consumer_map_.reserve(num_ml_values);

    // iterate each stream from back, so the first element is the last consumer in single stream case
    for (auto& stream : stream_nodes_) {
@ -1068,7 +1058,7 @@ class PlannerImpl {
            auto origin = Buffer(value_idx);
            if (origin != -1 && plan_.allocation_plan[origin].alloc_kind == AllocKind::kAllocate) {
              // add current node as consumer for origin buffer
-              value_consumers[origin].push_back(node_index);
+              value_consumer_map_[origin].insert(node_index);
            }
          }
          return Status::OK();
@ -1119,8 +1109,8 @@ class PlannerImpl {
              auto p_input_arg = input_args[pair.first];
              if (p_input_arg->Exists()) {
                OrtValueIndex reusable_input{};
-                if (value_map.GetIdx(p_input_arg->Name(), reusable_input).IsOK() &&
-                    allocation_plan[reusable_input].alloc_kind == AllocKind::kAllocate) {
+                if (value_map.GetIdx(p_input_arg->Name(), reusable_input).IsOK() /*&&
+                    allocation_plan[reusable_input].alloc_kind == AllocKind::kAllocate*/) {
                  std::cout << p_input_arg->Name() << " reused by " << p_output_arg->Name() << " as input" << std::endl;
                  allocation_plan[output_idx_global].alloc_kind = AllocKind::kReuse;
                  allocation_plan[output_idx_global].reused_buffer = reusable_input;
@ -1152,7 +1142,6 @@ class PlannerImpl {
              OrtValueIndex reusable_input{};
              if (value_map.GetIdx(p_input_arg->Name(), reusable_input).IsOK() &&
                  allocation_plan[reusable_input].alloc_kind == AllocKind::kAllocate) {
-                std::cout << p_input_arg->Name() << " reused by " << p_output_arg->Name() << " as input" << std::endl;
                allocation_plan[output_idx_global].alloc_kind = AllocKind::kReuse;
                allocation_plan[output_idx_global].reused_buffer = reusable_input;
                value_consumer_map_[reusable_input].insert(value_consumer_map_[output_idx_global].begin(),
@ -1175,7 +1164,6 @@ class PlannerImpl {
                if (value_map.GetIdx(p_input_arg->Name(), input_arg_index).IsOK() &&
                    allocation_plan[input_arg_index].alloc_kind == AllocKind::kAllocate) {
                  if (value_consumer_map_[input_arg_index].size() == 1 && SameSize(*p_input_arg, *p_output_arg)) {
-                    std::cout << p_input_arg->Name() << " reused by " << p_output_arg->Name() << " as an input" << std::endl;
                    allocation_plan[output_idx_global].alloc_kind = AllocKind::kReuse;
                    allocation_plan[output_idx_global].reused_buffer = input_arg_index;
                    value_consumer_map_[input_arg_index].insert(value_consumer_map_[output_idx_global].begin(),
@ -1302,6 +1290,17 @@ class PlannerImpl {
        }
      }
    }
+
+    for (size_t value_index = 0; value_index < allocation_plan.size(); ++value_index) {
+      if (allocation_plan[value_index].alloc_kind == AllocKind::kReuse) {
+        while (allocation_plan[allocation_plan[value_index].reused_buffer].alloc_kind == AllocKind::kReuse &&
+               allocation_plan[value_index].reused_buffer != allocation_plan[allocation_plan[value_index].reused_buffer].reused_buffer) {
+          allocation_plan[value_index].reused_buffer = allocation_plan[allocation_plan[value_index].reused_buffer].reused_buffer;
+        }
+        ort_value_info_[value_index].reused_buffer_index = allocation_plan[value_index].reused_buffer;
+      }
+    }
+
    return Status::OK();
  }
 #endif
@ -2110,19 +2109,6 @@ Status PlannerImpl::CreatePlan(
  ORT_RETURN_IF_ERROR(BuildExecutionPlan(execution_providers_));
 #endif

-  // build value_node_map
-  for (auto node_index : graph_viewer_.GetNodesInTopologicalOrder(context_->GetExecutionOrder())) {
-    auto* node = graph_viewer_.GetNode(node_index);
-    const auto& output_defs = node->OutputDefs();
-    for (size_t output_idx_local = 0; output_idx_local < output_defs.size(); ++output_idx_local) {
-      const auto& node_output = output_defs[output_idx_local];
-      if (!node_output->Exists()) continue;
-      OrtValueIndex output_idx_global;
-      ORT_THROW_IF_ERROR(ort_value_name_idx_map_.GetIdx(node_output->Name(), output_idx_global));
-      value_node_map_[output_idx_global] = node_index;
-    }
-  }
-
  // determine sharing/reuse among ml-values
  ORT_RETURN_IF_ERROR(ComputeReusePlan());

@ -2365,7 +2351,7 @@ std::unique_ptr<IGraphPartitioner> IGraphPartitioner::CreateGraphPartitioner(con
                                                                             const PathString& config_file) {
  // use device based partitioner by default
  IGraphPartitioner::GraphPartitioningStrategy partitioner_type =
-      IGraphPartitioner::GraphPartitioningStrategy::Unknown;
+      IGraphPartitioner::GraphPartitioningStrategy::DeviceBasedPartition;
  if (!config_file.empty()) {
    std::ifstream f(config_file);
    if (f.is_open()) {
@ -2383,11 +2369,8 @@ std::unique_ptr<IGraphPartitioner> IGraphPartitioner::CreateGraphPartitioner(con
      f.close();
    }
  }
-  if (partitioner_type == IGraphPartitioner::GraphPartitioningStrategy::Unknown) {
-    partitioner_type = IGraphPartitioner::GraphPartitioningStrategy::DeviceBasedPartition;
-    LOGS(logger, INFO) << "Use DeviceBasedPartition as default";
-  }
  if (partitioner_type == IGraphPartitioner::GraphPartitioningStrategy::DeviceBasedPartition) {
+    LOGS(logger, INFO) << "Use DeviceBasedPartition as default";
    return std::make_unique<DeviceBasedPartitioner>(logger, config_file);
  }  // else if other partitioner types ...
  ORT_THROW("Failed to create partitioner");
--- a/onnxruntime/test/framework/allocation_planner_test.cc
+++ b/onnxruntime/test/framework/allocation_planner_test.cc
@ -371,6 +371,29 @@ class PlannerTest : public ::testing::Test {
  void SetNodePartitionConfigFilePath(const char* config_file_path) {
    ORT_THROW_IF_ERROR(sess_options_->config_options.AddConfigEntry(kNodePartitionConfigFile, config_file_path));
  }
+  std::unique_ptr<::onnxruntime::KernelDef>& GetStdKernel() { return std_kernel_; }
+#ifdef USE_CUDA
+  void MemcpyToHostInCuda_TransposeInCudaAndCpu(const char* partitionConfigFile = nullptr) {
+    std::unique_ptr<::onnxruntime::KernelDef> cudaKernel = KernelDefBuilder().SetName("MemcpyToHost").Provider(kCudaExecutionProvider).SetDefaultOutputMemoryType(OrtMemTypeCPUOutput).Build();
+    std::unique_ptr<::onnxruntime::KernelDef> cudaKernelTrans = KernelDefBuilder().SetName("Transpose").Provider(kCudaExecutionProvider).SinceVersion(1, 10).Build();
+    std::string Graph_input("Graph_input"), Arg1("Arg1"), Arg2("Arg2"), Arg3("Arg3"), node1("node1"), node2("node2"), node3("node3");
+    std::vector<onnxruntime::NodeArg*> input1{Arg(Graph_input)}, output1{Arg(Arg1)}, output2{Arg(Arg2)}, output3{Arg(Arg3)};
+    AddNode(*cudaKernel, node1, input1, output1);
+    AddNode(*GetStdKernel(), node2, output1, output2);
+    AddNode(*cudaKernelTrans, node3, output1, output3);
+
+    CUDAExecutionProviderInfo epi;
+    onnxruntime::ProviderInfo_CUDA& ep = onnxruntime::GetProviderInfo_CUDA();
+    auto epFactory = ep.CreateExecutionProviderFactory(epi);
+    std::unique_ptr<IExecutionProvider> execution_provider = epFactory->CreateProvider();
+    AllocatorManager am;
+    execution_provider->RegisterAllocator(am);
+    ORT_THROW_IF_ERROR(GetExecutionProviders().Add("CUDAExecutionProvider", std::move(execution_provider)));
+
+    if (partitionConfigFile != nullptr) SetNodePartitionConfigFilePath(partitionConfigFile);
+    CreatePlan({}, false);
+  }
+#endif  // USE_CUDA
 };

 TEST_F(PlannerTest, ChainTest) {
@ -1272,23 +1295,7 @@ TEST_F(PlannerTest, MultiStream1StreamWaitFor2Streams) {
 // stream 1: node2 (CPU EP)
 // node1's output, which is consumed by both node2 and node3, is in CPU.
 TEST_F(PlannerTest, MultiStreamCudaEPNodeCPUOutput) {
-  std::unique_ptr<::onnxruntime::KernelDef> cudaKernel = KernelDefBuilder().SetName("MemcpyToHost").Provider(kCudaExecutionProvider).SetDefaultOutputMemoryType(OrtMemTypeCPUOutput).Build();
-  std::unique_ptr<::onnxruntime::KernelDef> cudaKernelTrans = KernelDefBuilder().SetName("Transpose").Provider(kCudaExecutionProvider).SinceVersion(1, 10).Build();
-  std::string Graph_input("Graph_input"), Arg1("Arg1"), Arg2("Arg2"), Arg3("Arg3");
-  AddNode(*cudaKernel, Graph_input, Arg1);
-  AddNormalNode(Arg1, Arg2);
-  AddNode(*cudaKernelTrans, Arg1, Arg3);
-
-  CUDAExecutionProviderInfo epi;
-  onnxruntime::ProviderInfo_CUDA& ep = onnxruntime::GetProviderInfo_CUDA();
-  auto epFactory = ep.CreateExecutionProviderFactory(epi);
-  std::unique_ptr<IExecutionProvider> execution_provider = epFactory->CreateProvider();
-  AllocatorManager am;
-  execution_provider->RegisterAllocator(am);
-  ORT_THROW_IF_ERROR(GetExecutionProviders().Add("CUDAExecutionProvider", std::move(execution_provider)));
-
-  CreatePlan({}, false);
-
+  MemcpyToHostInCuda_TransposeInCudaAndCpu("./testdata/multi_stream_models/memcpyToHost_same_stream_with_transpose.json");
  EXPECT_EQ(GetState().GetExecutionPlan()->execution_plan.size(), 2) << "2 logic streams";
  EXPECT_EQ(GetState().GetExecutionPlan()->execution_plan[0]->steps_.size(), 5) << "stream 0 has 5 steps";
  EXPECT_NE(strstr(typeid(*GetState().GetExecutionPlan()->execution_plan[0]->steps_[0]).name(), "LaunchKernelStep"), nullptr) << "0th step: LaunchKernelStep for node 1";
@ -1814,9 +1821,9 @@ TEST_F(PlannerTest, ParaPlanCreation) {
  auto* exe_plan = const_cast<onnxruntime::SessionState&>(main_graph_session_state).GetExecutionPlan();
  auto& per_value_plans = exe_plan->GetAllocationPlan();
  InlinedHashMap<std::string, std::string> reuse_pairs;
-  reuse_pairs.emplace("conv_0_out", "maxpool_out");
-  reuse_pairs.emplace("conv_1_out", "conv_2_out");
-  reuse_pairs.emplace("relu_1_out", "relu_2_out");
+  reuse_pairs.emplace("conv_0_out", "relu_0_out");  // conv_0_out is reused by relu_0_out
+  reuse_pairs.emplace("conv_1_out", "relu_1_out");  // conv_1_out is reused by relu_1_out
+  reuse_pairs.emplace("conv_2_out", "relu_2_out");  // conv_2_out is reused by relu_2_out
  for (size_t i = 0; i < per_value_plans.size(); ++i) {
    auto& per_value_plan = per_value_plans[i];
    if (per_value_plan.alloc_kind == AllocKind::kReuse) {
--- a/onnxruntime/test/testdata/multi_stream_models/memcpyToHost_same_stream_with_transpose.json
+++ b/onnxruntime/test/testdata/multi_stream_models/memcpyToHost_same_stream_with_transpose.json
@ -0,0 +1,5 @@
+{
+"type":"DeviceBasedPartitioner",
+"streams":[["node1", "node3"],["node2"]],
+"devices":["1","0"]
+}