From a012d60777a26515efece4bcd228c3863733db5d Mon Sep 17 00:00:00 2001 From: cao lei Date: Thu, 23 Feb 2023 14:52:01 -0800 Subject: [PATCH] Make MemcpyToHost to a separate stream for performance gain (#14487) ### Description Place MemcpyToHost on a separate stream for a performance gain in the default DeviceBasedPartitioner. ### Motivation and Context Our experiments show that placing MemcpyToHost on a separate stream lets it run in parallel with other kernels, especially compute-intensive ones. --------- Co-authored-by: Lei Cao --- .../core/framework/allocation_planner.cc | 51 +++++++------------ .../test/framework/allocation_planner_test.cc | 47 +++++++++-------- ...mcpyToHost_same_stream_with_transpose.json | 5 ++ 3 files changed, 49 insertions(+), 54 deletions(-) create mode 100644 onnxruntime/test/testdata/multi_stream_models/memcpyToHost_same_stream_with_transpose.json diff --git a/onnxruntime/core/framework/allocation_planner.cc b/onnxruntime/core/framework/allocation_planner.cc index 0af5924987..8cf07f566a 100644 --- a/onnxruntime/core/framework/allocation_planner.cc +++ b/onnxruntime/core/framework/allocation_planner.cc @@ -572,15 +572,6 @@ class PlannerImpl { } Status ComputeReuseCount() { - // Note: for every ml-value, its definition must appear before all its uses in a topological sort of a valid model - using GraphInputsSet = InlinedHashSet; - const auto& graph_inputs_nodes = graph_viewer_.GetInputsIncludingInitializers(); - GraphInputsSet graph_inputs; - graph_inputs.reserve(graph_inputs_nodes.size()); - for (auto& graph_input : graph_inputs_nodes) { - graph_inputs.insert(graph_input->Name()); - } - for (auto graph_input : graph_viewer_.GetInputs()) { OrtValueIndex index = Index(graph_input->Name()); UseCount(index)++; // Models caller's usage post-inference; ensures it will not be reused. 
@@ -1050,9 +1041,8 @@ class PlannerImpl { auto& allocation_plan = plan_.allocation_plan; // build the consumer list for each value - std::vector> value_consumers; int num_ml_values = ort_value_name_idx_map_.MaxIdx() + 1; - value_consumers.resize(num_ml_values); + value_consumer_map_.reserve(num_ml_values); // iterate each stream from back, so the first element is the last consumer in single stream case for (auto& stream : stream_nodes_) { @@ -1068,7 +1058,7 @@ class PlannerImpl { auto origin = Buffer(value_idx); if (origin != -1 && plan_.allocation_plan[origin].alloc_kind == AllocKind::kAllocate) { // add current node as consumer for origin buffer - value_consumers[origin].push_back(node_index); + value_consumer_map_[origin].insert(node_index); } } return Status::OK(); @@ -1119,8 +1109,8 @@ class PlannerImpl { auto p_input_arg = input_args[pair.first]; if (p_input_arg->Exists()) { OrtValueIndex reusable_input{}; - if (value_map.GetIdx(p_input_arg->Name(), reusable_input).IsOK() && - allocation_plan[reusable_input].alloc_kind == AllocKind::kAllocate) { + if (value_map.GetIdx(p_input_arg->Name(), reusable_input).IsOK() /*&& + allocation_plan[reusable_input].alloc_kind == AllocKind::kAllocate*/) { std::cout << p_input_arg->Name() << " reused by " << p_output_arg->Name() << " as input" << std::endl; allocation_plan[output_idx_global].alloc_kind = AllocKind::kReuse; allocation_plan[output_idx_global].reused_buffer = reusable_input; @@ -1152,7 +1142,6 @@ class PlannerImpl { OrtValueIndex reusable_input{}; if (value_map.GetIdx(p_input_arg->Name(), reusable_input).IsOK() && allocation_plan[reusable_input].alloc_kind == AllocKind::kAllocate) { - std::cout << p_input_arg->Name() << " reused by " << p_output_arg->Name() << " as input" << std::endl; allocation_plan[output_idx_global].alloc_kind = AllocKind::kReuse; allocation_plan[output_idx_global].reused_buffer = reusable_input; value_consumer_map_[reusable_input].insert(value_consumer_map_[output_idx_global].begin(), @@ 
-1175,7 +1164,6 @@ class PlannerImpl { if (value_map.GetIdx(p_input_arg->Name(), input_arg_index).IsOK() && allocation_plan[input_arg_index].alloc_kind == AllocKind::kAllocate) { if (value_consumer_map_[input_arg_index].size() == 1 && SameSize(*p_input_arg, *p_output_arg)) { - std::cout << p_input_arg->Name() << " reused by " << p_output_arg->Name() << " as an input" << std::endl; allocation_plan[output_idx_global].alloc_kind = AllocKind::kReuse; allocation_plan[output_idx_global].reused_buffer = input_arg_index; value_consumer_map_[input_arg_index].insert(value_consumer_map_[output_idx_global].begin(), @@ -1302,6 +1290,17 @@ class PlannerImpl { } } } + + for (size_t value_index = 0; value_index < allocation_plan.size(); ++value_index) { + if (allocation_plan[value_index].alloc_kind == AllocKind::kReuse) { + while (allocation_plan[allocation_plan[value_index].reused_buffer].alloc_kind == AllocKind::kReuse && + allocation_plan[value_index].reused_buffer != allocation_plan[allocation_plan[value_index].reused_buffer].reused_buffer) { + allocation_plan[value_index].reused_buffer = allocation_plan[allocation_plan[value_index].reused_buffer].reused_buffer; + } + ort_value_info_[value_index].reused_buffer_index = allocation_plan[value_index].reused_buffer; + } + } + return Status::OK(); } #endif @@ -2110,19 +2109,6 @@ Status PlannerImpl::CreatePlan( ORT_RETURN_IF_ERROR(BuildExecutionPlan(execution_providers_)); #endif - // build value_node_map - for (auto node_index : graph_viewer_.GetNodesInTopologicalOrder(context_->GetExecutionOrder())) { - auto* node = graph_viewer_.GetNode(node_index); - const auto& output_defs = node->OutputDefs(); - for (size_t output_idx_local = 0; output_idx_local < output_defs.size(); ++output_idx_local) { - const auto& node_output = output_defs[output_idx_local]; - if (!node_output->Exists()) continue; - OrtValueIndex output_idx_global; - ORT_THROW_IF_ERROR(ort_value_name_idx_map_.GetIdx(node_output->Name(), output_idx_global)); - 
value_node_map_[output_idx_global] = node_index; - } - } - // determine sharing/reuse among ml-values ORT_RETURN_IF_ERROR(ComputeReusePlan()); @@ -2365,7 +2351,7 @@ std::unique_ptr IGraphPartitioner::CreateGraphPartitioner(con const PathString& config_file) { // use device based partitioner by default IGraphPartitioner::GraphPartitioningStrategy partitioner_type = - IGraphPartitioner::GraphPartitioningStrategy::Unknown; + IGraphPartitioner::GraphPartitioningStrategy::DeviceBasedPartition; if (!config_file.empty()) { std::ifstream f(config_file); if (f.is_open()) { @@ -2383,11 +2369,8 @@ std::unique_ptr IGraphPartitioner::CreateGraphPartitioner(con f.close(); } } - if (partitioner_type == IGraphPartitioner::GraphPartitioningStrategy::Unknown) { - partitioner_type = IGraphPartitioner::GraphPartitioningStrategy::DeviceBasedPartition; - LOGS(logger, INFO) << "Use DeviceBasedPartition as default"; - } if (partitioner_type == IGraphPartitioner::GraphPartitioningStrategy::DeviceBasedPartition) { + LOGS(logger, INFO) << "Use DeviceBasedPartition as default"; return std::make_unique(logger, config_file); } // else if other partitioner types ... 
ORT_THROW("Failed to create partitioner"); diff --git a/onnxruntime/test/framework/allocation_planner_test.cc b/onnxruntime/test/framework/allocation_planner_test.cc index 3cda40dff4..6af97f23e8 100644 --- a/onnxruntime/test/framework/allocation_planner_test.cc +++ b/onnxruntime/test/framework/allocation_planner_test.cc @@ -371,6 +371,29 @@ class PlannerTest : public ::testing::Test { void SetNodePartitionConfigFilePath(const char* config_file_path) { ORT_THROW_IF_ERROR(sess_options_->config_options.AddConfigEntry(kNodePartitionConfigFile, config_file_path)); } + std::unique_ptr<::onnxruntime::KernelDef>& GetStdKernel() { return std_kernel_; } +#ifdef USE_CUDA + void MemcpyToHostInCuda_TransposeInCudaAndCpu(const char* partitionConfigFile = nullptr) { + std::unique_ptr<::onnxruntime::KernelDef> cudaKernel = KernelDefBuilder().SetName("MemcpyToHost").Provider(kCudaExecutionProvider).SetDefaultOutputMemoryType(OrtMemTypeCPUOutput).Build(); + std::unique_ptr<::onnxruntime::KernelDef> cudaKernelTrans = KernelDefBuilder().SetName("Transpose").Provider(kCudaExecutionProvider).SinceVersion(1, 10).Build(); + std::string Graph_input("Graph_input"), Arg1("Arg1"), Arg2("Arg2"), Arg3("Arg3"), node1("node1"), node2("node2"), node3("node3"); + std::vector input1{Arg(Graph_input)}, output1{Arg(Arg1)}, output2{Arg(Arg2)}, output3{Arg(Arg3)}; + AddNode(*cudaKernel, node1, input1, output1); + AddNode(*GetStdKernel(), node2, output1, output2); + AddNode(*cudaKernelTrans, node3, output1, output3); + + CUDAExecutionProviderInfo epi; + onnxruntime::ProviderInfo_CUDA& ep = onnxruntime::GetProviderInfo_CUDA(); + auto epFactory = ep.CreateExecutionProviderFactory(epi); + std::unique_ptr execution_provider = epFactory->CreateProvider(); + AllocatorManager am; + execution_provider->RegisterAllocator(am); + ORT_THROW_IF_ERROR(GetExecutionProviders().Add("CUDAExecutionProvider", std::move(execution_provider))); + + if (partitionConfigFile != nullptr) 
SetNodePartitionConfigFilePath(partitionConfigFile); + CreatePlan({}, false); + } +#endif // USE_CUDA }; TEST_F(PlannerTest, ChainTest) { @@ -1272,23 +1295,7 @@ TEST_F(PlannerTest, MultiStream1StreamWaitFor2Streams) { // stream 1: node2 (CPU EP) // node1's output, which is consumed by both node2 and node3, is in CPU. TEST_F(PlannerTest, MultiStreamCudaEPNodeCPUOutput) { - std::unique_ptr<::onnxruntime::KernelDef> cudaKernel = KernelDefBuilder().SetName("MemcpyToHost").Provider(kCudaExecutionProvider).SetDefaultOutputMemoryType(OrtMemTypeCPUOutput).Build(); - std::unique_ptr<::onnxruntime::KernelDef> cudaKernelTrans = KernelDefBuilder().SetName("Transpose").Provider(kCudaExecutionProvider).SinceVersion(1, 10).Build(); - std::string Graph_input("Graph_input"), Arg1("Arg1"), Arg2("Arg2"), Arg3("Arg3"); - AddNode(*cudaKernel, Graph_input, Arg1); - AddNormalNode(Arg1, Arg2); - AddNode(*cudaKernelTrans, Arg1, Arg3); - - CUDAExecutionProviderInfo epi; - onnxruntime::ProviderInfo_CUDA& ep = onnxruntime::GetProviderInfo_CUDA(); - auto epFactory = ep.CreateExecutionProviderFactory(epi); - std::unique_ptr execution_provider = epFactory->CreateProvider(); - AllocatorManager am; - execution_provider->RegisterAllocator(am); - ORT_THROW_IF_ERROR(GetExecutionProviders().Add("CUDAExecutionProvider", std::move(execution_provider))); - - CreatePlan({}, false); - + MemcpyToHostInCuda_TransposeInCudaAndCpu("./testdata/multi_stream_models/memcpyToHost_same_stream_with_transpose.json"); EXPECT_EQ(GetState().GetExecutionPlan()->execution_plan.size(), 2) << "2 logic streams"; EXPECT_EQ(GetState().GetExecutionPlan()->execution_plan[0]->steps_.size(), 5) << "stream 0 has 5 steps"; EXPECT_NE(strstr(typeid(*GetState().GetExecutionPlan()->execution_plan[0]->steps_[0]).name(), "LaunchKernelStep"), nullptr) << "0th step: LaunchKernelStep for node 1"; @@ -1814,9 +1821,9 @@ TEST_F(PlannerTest, ParaPlanCreation) { auto* exe_plan = const_cast(main_graph_session_state).GetExecutionPlan(); auto& 
per_value_plans = exe_plan->GetAllocationPlan(); InlinedHashMap reuse_pairs; - reuse_pairs.emplace("conv_0_out", "maxpool_out"); - reuse_pairs.emplace("conv_1_out", "conv_2_out"); - reuse_pairs.emplace("relu_1_out", "relu_2_out"); + reuse_pairs.emplace("conv_0_out", "relu_0_out"); // conv_0_out is reused by relu_0_out + reuse_pairs.emplace("conv_1_out", "relu_1_out"); // conv_1_out is reused by relu_1_out + reuse_pairs.emplace("conv_2_out", "relu_2_out"); // conv_2_out is reused by relu_2_out for (size_t i = 0; i < per_value_plans.size(); ++i) { auto& per_value_plan = per_value_plans[i]; if (per_value_plan.alloc_kind == AllocKind::kReuse) { diff --git a/onnxruntime/test/testdata/multi_stream_models/memcpyToHost_same_stream_with_transpose.json b/onnxruntime/test/testdata/multi_stream_models/memcpyToHost_same_stream_with_transpose.json new file mode 100644 index 0000000000..ad6dd08675 --- /dev/null +++ b/onnxruntime/test/testdata/multi_stream_models/memcpyToHost_same_stream_with_transpose.json @@ -0,0 +1,5 @@ +{ +"type":"DeviceBasedPartitioner", +"streams":[["node1", "node3"],["node2"]], +"devices":["1","0"] +}