From a012d60777a26515efece4bcd228c3863733db5d Mon Sep 17 00:00:00 2001
From: cao lei <jslhcl@gmail.com>
Date: Thu, 23 Feb 2023 14:52:01 -0800
Subject: [PATCH] Make MemcpyToHost to a separate stream for performance gain 
 (#14487)

### Description
Assign MemcpyToHost nodes to a separate stream in the default
DeviceBasedPartitioner for a performance gain.



### Motivation and Context
Our experiments show that placing MemcpyToHost on a separate stream lets
it run in parallel with other kernels, especially compute-intensive
ones.

---------

Co-authored-by: Lei Cao <leca@microsoft.com>
---
 .../core/framework/allocation_planner.cc      | 51 +++++++------------
 .../test/framework/allocation_planner_test.cc | 47 +++++++++--------
 ...mcpyToHost_same_stream_with_transpose.json |  5 ++
 3 files changed, 49 insertions(+), 54 deletions(-)
 create mode 100644 onnxruntime/test/testdata/multi_stream_models/memcpyToHost_same_stream_with_transpose.json
diff --git a/onnxruntime/core/framework/allocation_planner.cc b/onnxruntime/core/framework/allocation_planner.cc
index 0af5924987..8cf07f566a 100644
--- a/onnxruntime/core/framework/allocation_planner.cc
+++ b/onnxruntime/core/framework/allocation_planner.cc
@@ -572,15 +572,6 @@ class PlannerImpl {
   }
 
   Status ComputeReuseCount() {
-    // Note: for every ml-value, its definition must appear before all its uses in a topological sort of a valid model
-    using GraphInputsSet = InlinedHashSet<std::string_view>;
-    const auto& graph_inputs_nodes = graph_viewer_.GetInputsIncludingInitializers();
-    GraphInputsSet graph_inputs;
-    graph_inputs.reserve(graph_inputs_nodes.size());
-    for (auto& graph_input : graph_inputs_nodes) {
-      graph_inputs.insert(graph_input->Name());
-    }
-
     for (auto graph_input : graph_viewer_.GetInputs()) {
       OrtValueIndex index = Index(graph_input->Name());
       UseCount(index)++;  // Models caller's usage post-inference; ensures it will not be reused.
@@ -1050,9 +1041,8 @@ class PlannerImpl {
     auto& allocation_plan = plan_.allocation_plan;
 
     // build the consumer list for each value
-    std::vector<InlinedVector<NodeIndex>> value_consumers;
     int num_ml_values = ort_value_name_idx_map_.MaxIdx() + 1;
-    value_consumers.resize(num_ml_values);
+    value_consumer_map_.reserve(num_ml_values);
 
     // iterate each stream from back, so the first element is the last consumer in single stream case
     for (auto& stream : stream_nodes_) {
@@ -1068,7 +1058,7 @@ class PlannerImpl {
             auto origin = Buffer(value_idx);
             if (origin != -1 && plan_.allocation_plan[origin].alloc_kind == AllocKind::kAllocate) {
               // add current node as consumer for origin buffer
-              value_consumers[origin].push_back(node_index);
+              value_consumer_map_[origin].insert(node_index);
             }
           }
           return Status::OK();
@@ -1119,8 +1109,8 @@ class PlannerImpl {
               auto p_input_arg = input_args[pair.first];
               if (p_input_arg->Exists()) {
                 OrtValueIndex reusable_input{};
-                if (value_map.GetIdx(p_input_arg->Name(), reusable_input).IsOK() &&
-                    allocation_plan[reusable_input].alloc_kind == AllocKind::kAllocate) {
+                if (value_map.GetIdx(p_input_arg->Name(), reusable_input).IsOK() /*&&
+                    allocation_plan[reusable_input].alloc_kind == AllocKind::kAllocate*/) {
                   std::cout << p_input_arg->Name() << " reused by " << p_output_arg->Name() << " as input" << std::endl;
                   allocation_plan[output_idx_global].alloc_kind = AllocKind::kReuse;
                   allocation_plan[output_idx_global].reused_buffer = reusable_input;
@@ -1152,7 +1142,6 @@ class PlannerImpl {
               OrtValueIndex reusable_input{};
               if (value_map.GetIdx(p_input_arg->Name(), reusable_input).IsOK() &&
                   allocation_plan[reusable_input].alloc_kind == AllocKind::kAllocate) {
-                std::cout << p_input_arg->Name() << " reused by " << p_output_arg->Name() << " as input" << std::endl;
                 allocation_plan[output_idx_global].alloc_kind = AllocKind::kReuse;
                 allocation_plan[output_idx_global].reused_buffer = reusable_input;
                 value_consumer_map_[reusable_input].insert(value_consumer_map_[output_idx_global].begin(),
@@ -1175,7 +1164,6 @@ class PlannerImpl {
                 if (value_map.GetIdx(p_input_arg->Name(), input_arg_index).IsOK() &&
                     allocation_plan[input_arg_index].alloc_kind == AllocKind::kAllocate) {
                   if (value_consumer_map_[input_arg_index].size() == 1 && SameSize(*p_input_arg, *p_output_arg)) {
-                    std::cout << p_input_arg->Name() << " reused by " << p_output_arg->Name() << " as an input" << std::endl;
                     allocation_plan[output_idx_global].alloc_kind = AllocKind::kReuse;
                     allocation_plan[output_idx_global].reused_buffer = input_arg_index;
                     value_consumer_map_[input_arg_index].insert(value_consumer_map_[output_idx_global].begin(),
@@ -1302,6 +1290,17 @@ class PlannerImpl {
         }
       }
     }
+
+    for (size_t value_index = 0; value_index < allocation_plan.size(); ++value_index) {
+      if (allocation_plan[value_index].alloc_kind == AllocKind::kReuse) {
+        while (allocation_plan[allocation_plan[value_index].reused_buffer].alloc_kind == AllocKind::kReuse &&
+               allocation_plan[value_index].reused_buffer != allocation_plan[allocation_plan[value_index].reused_buffer].reused_buffer) {
+          allocation_plan[value_index].reused_buffer = allocation_plan[allocation_plan[value_index].reused_buffer].reused_buffer;
+        }
+        ort_value_info_[value_index].reused_buffer_index = allocation_plan[value_index].reused_buffer;
+      }
+    }
+
     return Status::OK();
   }
 #endif
@@ -2110,19 +2109,6 @@ Status PlannerImpl::CreatePlan(
   ORT_RETURN_IF_ERROR(BuildExecutionPlan(execution_providers_));
 #endif
 
-  // build value_node_map
-  for (auto node_index : graph_viewer_.GetNodesInTopologicalOrder(context_->GetExecutionOrder())) {
-    auto* node = graph_viewer_.GetNode(node_index);
-    const auto& output_defs = node->OutputDefs();
-    for (size_t output_idx_local = 0; output_idx_local < output_defs.size(); ++output_idx_local) {
-      const auto& node_output = output_defs[output_idx_local];
-      if (!node_output->Exists()) continue;
-      OrtValueIndex output_idx_global;
-      ORT_THROW_IF_ERROR(ort_value_name_idx_map_.GetIdx(node_output->Name(), output_idx_global));
-      value_node_map_[output_idx_global] = node_index;
-    }
-  }
-
   // determine sharing/reuse among ml-values
   ORT_RETURN_IF_ERROR(ComputeReusePlan());
 
@@ -2365,7 +2351,7 @@ std::unique_ptr<IGraphPartitioner> IGraphPartitioner::CreateGraphPartitioner(con
                                                                              const PathString& config_file) {
   // use device based partitioner by default
   IGraphPartitioner::GraphPartitioningStrategy partitioner_type =
-      IGraphPartitioner::GraphPartitioningStrategy::Unknown;
+      IGraphPartitioner::GraphPartitioningStrategy::DeviceBasedPartition;
   if (!config_file.empty()) {
     std::ifstream f(config_file);
     if (f.is_open()) {
@@ -2383,11 +2369,8 @@ std::unique_ptr<IGraphPartitioner> IGraphPartitioner::CreateGraphPartitioner(con
       f.close();
     }
   }
-  if (partitioner_type == IGraphPartitioner::GraphPartitioningStrategy::Unknown) {
-    partitioner_type = IGraphPartitioner::GraphPartitioningStrategy::DeviceBasedPartition;
-    LOGS(logger, INFO) << "Use DeviceBasedPartition as default";
-  }
   if (partitioner_type == IGraphPartitioner::GraphPartitioningStrategy::DeviceBasedPartition) {
+    LOGS(logger, INFO) << "Use DeviceBasedPartition as default";
     return std::make_unique<DeviceBasedPartitioner>(logger, config_file);
   }  // else if other partitioner types ...
   ORT_THROW("Failed to create partitioner");
diff --git a/onnxruntime/test/framework/allocation_planner_test.cc b/onnxruntime/test/framework/allocation_planner_test.cc
index 3cda40dff4..6af97f23e8 100644
--- a/onnxruntime/test/framework/allocation_planner_test.cc
+++ b/onnxruntime/test/framework/allocation_planner_test.cc
@@ -371,6 +371,29 @@ class PlannerTest : public ::testing::Test {
   void SetNodePartitionConfigFilePath(const char* config_file_path) {
     ORT_THROW_IF_ERROR(sess_options_->config_options.AddConfigEntry(kNodePartitionConfigFile, config_file_path));
   }
+  std::unique_ptr<::onnxruntime::KernelDef>& GetStdKernel() { return std_kernel_; }
+#ifdef USE_CUDA
+  void MemcpyToHostInCuda_TransposeInCudaAndCpu(const char* partitionConfigFile = nullptr) {
+    std::unique_ptr<::onnxruntime::KernelDef> cudaKernel = KernelDefBuilder().SetName("MemcpyToHost").Provider(kCudaExecutionProvider).SetDefaultOutputMemoryType(OrtMemTypeCPUOutput).Build();
+    std::unique_ptr<::onnxruntime::KernelDef> cudaKernelTrans = KernelDefBuilder().SetName("Transpose").Provider(kCudaExecutionProvider).SinceVersion(1, 10).Build();
+    std::string Graph_input("Graph_input"), Arg1("Arg1"), Arg2("Arg2"), Arg3("Arg3"), node1("node1"), node2("node2"), node3("node3");
+    std::vector<onnxruntime::NodeArg*> input1{Arg(Graph_input)}, output1{Arg(Arg1)}, output2{Arg(Arg2)}, output3{Arg(Arg3)};
+    AddNode(*cudaKernel, node1, input1, output1);
+    AddNode(*GetStdKernel(), node2, output1, output2);
+    AddNode(*cudaKernelTrans, node3, output1, output3);
+
+    CUDAExecutionProviderInfo epi;
+    onnxruntime::ProviderInfo_CUDA& ep = onnxruntime::GetProviderInfo_CUDA();
+    auto epFactory = ep.CreateExecutionProviderFactory(epi);
+    std::unique_ptr<IExecutionProvider> execution_provider = epFactory->CreateProvider();
+    AllocatorManager am;
+    execution_provider->RegisterAllocator(am);
+    ORT_THROW_IF_ERROR(GetExecutionProviders().Add("CUDAExecutionProvider", std::move(execution_provider)));
+
+    if (partitionConfigFile != nullptr) SetNodePartitionConfigFilePath(partitionConfigFile);
+    CreatePlan({}, false);
+  }
+#endif  // USE_CUDA
 };
 
 TEST_F(PlannerTest, ChainTest) {
@@ -1272,23 +1295,7 @@ TEST_F(PlannerTest, MultiStream1StreamWaitFor2Streams) {
 // stream 1: node2 (CPU EP)
 // node1's output, which is consumed by both node2 and node3, is in CPU.
 TEST_F(PlannerTest, MultiStreamCudaEPNodeCPUOutput) {
-  std::unique_ptr<::onnxruntime::KernelDef> cudaKernel = KernelDefBuilder().SetName("MemcpyToHost").Provider(kCudaExecutionProvider).SetDefaultOutputMemoryType(OrtMemTypeCPUOutput).Build();
-  std::unique_ptr<::onnxruntime::KernelDef> cudaKernelTrans = KernelDefBuilder().SetName("Transpose").Provider(kCudaExecutionProvider).SinceVersion(1, 10).Build();
-  std::string Graph_input("Graph_input"), Arg1("Arg1"), Arg2("Arg2"), Arg3("Arg3");
-  AddNode(*cudaKernel, Graph_input, Arg1);
-  AddNormalNode(Arg1, Arg2);
-  AddNode(*cudaKernelTrans, Arg1, Arg3);
-
-  CUDAExecutionProviderInfo epi;
-  onnxruntime::ProviderInfo_CUDA& ep = onnxruntime::GetProviderInfo_CUDA();
-  auto epFactory = ep.CreateExecutionProviderFactory(epi);
-  std::unique_ptr<IExecutionProvider> execution_provider = epFactory->CreateProvider();
-  AllocatorManager am;
-  execution_provider->RegisterAllocator(am);
-  ORT_THROW_IF_ERROR(GetExecutionProviders().Add("CUDAExecutionProvider", std::move(execution_provider)));
-
-  CreatePlan({}, false);
-
+  MemcpyToHostInCuda_TransposeInCudaAndCpu("./testdata/multi_stream_models/memcpyToHost_same_stream_with_transpose.json");
   EXPECT_EQ(GetState().GetExecutionPlan()->execution_plan.size(), 2) << "2 logic streams";
   EXPECT_EQ(GetState().GetExecutionPlan()->execution_plan[0]->steps_.size(), 5) << "stream 0 has 5 steps";
   EXPECT_NE(strstr(typeid(*GetState().GetExecutionPlan()->execution_plan[0]->steps_[0]).name(), "LaunchKernelStep"), nullptr) << "0th step: LaunchKernelStep for node 1";
@@ -1814,9 +1821,9 @@ TEST_F(PlannerTest, ParaPlanCreation) {
   auto* exe_plan = const_cast<onnxruntime::SessionState&>(main_graph_session_state).GetExecutionPlan();
   auto& per_value_plans = exe_plan->GetAllocationPlan();
   InlinedHashMap<std::string, std::string> reuse_pairs;
-  reuse_pairs.emplace("conv_0_out", "maxpool_out");
-  reuse_pairs.emplace("conv_1_out", "conv_2_out");
-  reuse_pairs.emplace("relu_1_out", "relu_2_out");
+  reuse_pairs.emplace("conv_0_out", "relu_0_out");  // conv_0_out is reused by relu_0_out
+  reuse_pairs.emplace("conv_1_out", "relu_1_out");  // conv_1_out is reused by relu_1_out
+  reuse_pairs.emplace("conv_2_out", "relu_2_out");  // conv_2_out is reused by relu_2_out
   for (size_t i = 0; i < per_value_plans.size(); ++i) {
     auto& per_value_plan = per_value_plans[i];
     if (per_value_plan.alloc_kind == AllocKind::kReuse) {
diff --git a/onnxruntime/test/testdata/multi_stream_models/memcpyToHost_same_stream_with_transpose.json b/onnxruntime/test/testdata/multi_stream_models/memcpyToHost_same_stream_with_transpose.json
new file mode 100644
index 0000000000..ad6dd08675
--- /dev/null
+++ b/onnxruntime/test/testdata/multi_stream_models/memcpyToHost_same_stream_with_transpose.json
@@ -0,0 +1,5 @@
+{
+"type":"DeviceBasedPartitioner",
+"streams":[["node1", "node3"],["node2"]],
+"devices":["1","0"]
+}