mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-26 22:35:43 +00:00
Put MemcpyToHost on a separate stream for a performance gain (#14487)
### Description Put MemcpyToHost on a separate stream in the default DeviceBasedPartitioner for a performance gain. ### Motivation and Context Our experiments show that placing MemcpyToHost on a separate stream lets it run in parallel with other kernels, especially compute-intensive ones. --------- Co-authored-by: Lei Cao <leca@microsoft.com>
This commit is contained in:
parent
664e296270
commit
a012d60777
3 changed files with 49 additions and 54 deletions
|
|
@ -572,15 +572,6 @@ class PlannerImpl {
|
|||
}
|
||||
|
||||
Status ComputeReuseCount() {
|
||||
// Note: for every ml-value, its definition must appear before all its uses in a topological sort of a valid model
|
||||
using GraphInputsSet = InlinedHashSet<std::string_view>;
|
||||
const auto& graph_inputs_nodes = graph_viewer_.GetInputsIncludingInitializers();
|
||||
GraphInputsSet graph_inputs;
|
||||
graph_inputs.reserve(graph_inputs_nodes.size());
|
||||
for (auto& graph_input : graph_inputs_nodes) {
|
||||
graph_inputs.insert(graph_input->Name());
|
||||
}
|
||||
|
||||
for (auto graph_input : graph_viewer_.GetInputs()) {
|
||||
OrtValueIndex index = Index(graph_input->Name());
|
||||
UseCount(index)++; // Models caller's usage post-inference; ensures it will not be reused.
|
||||
|
|
@ -1050,9 +1041,8 @@ class PlannerImpl {
|
|||
auto& allocation_plan = plan_.allocation_plan;
|
||||
|
||||
// build the consumer list for each value
|
||||
std::vector<InlinedVector<NodeIndex>> value_consumers;
|
||||
int num_ml_values = ort_value_name_idx_map_.MaxIdx() + 1;
|
||||
value_consumers.resize(num_ml_values);
|
||||
value_consumer_map_.reserve(num_ml_values);
|
||||
|
||||
// iterate each stream from back, so the first element is the last consumer in single stream case
|
||||
for (auto& stream : stream_nodes_) {
|
||||
|
|
@ -1068,7 +1058,7 @@ class PlannerImpl {
|
|||
auto origin = Buffer(value_idx);
|
||||
if (origin != -1 && plan_.allocation_plan[origin].alloc_kind == AllocKind::kAllocate) {
|
||||
// add current node as consumer for origin buffer
|
||||
value_consumers[origin].push_back(node_index);
|
||||
value_consumer_map_[origin].insert(node_index);
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
|
|
@ -1119,8 +1109,8 @@ class PlannerImpl {
|
|||
auto p_input_arg = input_args[pair.first];
|
||||
if (p_input_arg->Exists()) {
|
||||
OrtValueIndex reusable_input{};
|
||||
if (value_map.GetIdx(p_input_arg->Name(), reusable_input).IsOK() &&
|
||||
allocation_plan[reusable_input].alloc_kind == AllocKind::kAllocate) {
|
||||
if (value_map.GetIdx(p_input_arg->Name(), reusable_input).IsOK() /*&&
|
||||
allocation_plan[reusable_input].alloc_kind == AllocKind::kAllocate*/) {
|
||||
std::cout << p_input_arg->Name() << " reused by " << p_output_arg->Name() << " as input" << std::endl;
|
||||
allocation_plan[output_idx_global].alloc_kind = AllocKind::kReuse;
|
||||
allocation_plan[output_idx_global].reused_buffer = reusable_input;
|
||||
|
|
@ -1152,7 +1142,6 @@ class PlannerImpl {
|
|||
OrtValueIndex reusable_input{};
|
||||
if (value_map.GetIdx(p_input_arg->Name(), reusable_input).IsOK() &&
|
||||
allocation_plan[reusable_input].alloc_kind == AllocKind::kAllocate) {
|
||||
std::cout << p_input_arg->Name() << " reused by " << p_output_arg->Name() << " as input" << std::endl;
|
||||
allocation_plan[output_idx_global].alloc_kind = AllocKind::kReuse;
|
||||
allocation_plan[output_idx_global].reused_buffer = reusable_input;
|
||||
value_consumer_map_[reusable_input].insert(value_consumer_map_[output_idx_global].begin(),
|
||||
|
|
@ -1175,7 +1164,6 @@ class PlannerImpl {
|
|||
if (value_map.GetIdx(p_input_arg->Name(), input_arg_index).IsOK() &&
|
||||
allocation_plan[input_arg_index].alloc_kind == AllocKind::kAllocate) {
|
||||
if (value_consumer_map_[input_arg_index].size() == 1 && SameSize(*p_input_arg, *p_output_arg)) {
|
||||
std::cout << p_input_arg->Name() << " reused by " << p_output_arg->Name() << " as an input" << std::endl;
|
||||
allocation_plan[output_idx_global].alloc_kind = AllocKind::kReuse;
|
||||
allocation_plan[output_idx_global].reused_buffer = input_arg_index;
|
||||
value_consumer_map_[input_arg_index].insert(value_consumer_map_[output_idx_global].begin(),
|
||||
|
|
@ -1302,6 +1290,17 @@ class PlannerImpl {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t value_index = 0; value_index < allocation_plan.size(); ++value_index) {
|
||||
if (allocation_plan[value_index].alloc_kind == AllocKind::kReuse) {
|
||||
while (allocation_plan[allocation_plan[value_index].reused_buffer].alloc_kind == AllocKind::kReuse &&
|
||||
allocation_plan[value_index].reused_buffer != allocation_plan[allocation_plan[value_index].reused_buffer].reused_buffer) {
|
||||
allocation_plan[value_index].reused_buffer = allocation_plan[allocation_plan[value_index].reused_buffer].reused_buffer;
|
||||
}
|
||||
ort_value_info_[value_index].reused_buffer_index = allocation_plan[value_index].reused_buffer;
|
||||
}
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
#endif
|
||||
|
|
@ -2110,19 +2109,6 @@ Status PlannerImpl::CreatePlan(
|
|||
ORT_RETURN_IF_ERROR(BuildExecutionPlan(execution_providers_));
|
||||
#endif
|
||||
|
||||
// build value_node_map
|
||||
for (auto node_index : graph_viewer_.GetNodesInTopologicalOrder(context_->GetExecutionOrder())) {
|
||||
auto* node = graph_viewer_.GetNode(node_index);
|
||||
const auto& output_defs = node->OutputDefs();
|
||||
for (size_t output_idx_local = 0; output_idx_local < output_defs.size(); ++output_idx_local) {
|
||||
const auto& node_output = output_defs[output_idx_local];
|
||||
if (!node_output->Exists()) continue;
|
||||
OrtValueIndex output_idx_global;
|
||||
ORT_THROW_IF_ERROR(ort_value_name_idx_map_.GetIdx(node_output->Name(), output_idx_global));
|
||||
value_node_map_[output_idx_global] = node_index;
|
||||
}
|
||||
}
|
||||
|
||||
// determine sharing/reuse among ml-values
|
||||
ORT_RETURN_IF_ERROR(ComputeReusePlan());
|
||||
|
||||
|
|
@ -2365,7 +2351,7 @@ std::unique_ptr<IGraphPartitioner> IGraphPartitioner::CreateGraphPartitioner(con
|
|||
const PathString& config_file) {
|
||||
// use device based partitioner by default
|
||||
IGraphPartitioner::GraphPartitioningStrategy partitioner_type =
|
||||
IGraphPartitioner::GraphPartitioningStrategy::Unknown;
|
||||
IGraphPartitioner::GraphPartitioningStrategy::DeviceBasedPartition;
|
||||
if (!config_file.empty()) {
|
||||
std::ifstream f(config_file);
|
||||
if (f.is_open()) {
|
||||
|
|
@ -2383,11 +2369,8 @@ std::unique_ptr<IGraphPartitioner> IGraphPartitioner::CreateGraphPartitioner(con
|
|||
f.close();
|
||||
}
|
||||
}
|
||||
if (partitioner_type == IGraphPartitioner::GraphPartitioningStrategy::Unknown) {
|
||||
partitioner_type = IGraphPartitioner::GraphPartitioningStrategy::DeviceBasedPartition;
|
||||
LOGS(logger, INFO) << "Use DeviceBasedPartition as default";
|
||||
}
|
||||
if (partitioner_type == IGraphPartitioner::GraphPartitioningStrategy::DeviceBasedPartition) {
|
||||
LOGS(logger, INFO) << "Use DeviceBasedPartition as default";
|
||||
return std::make_unique<DeviceBasedPartitioner>(logger, config_file);
|
||||
} // else if other partitioner types ...
|
||||
ORT_THROW("Failed to create partitioner");
|
||||
|
|
|
|||
|
|
@ -371,6 +371,29 @@ class PlannerTest : public ::testing::Test {
|
|||
// Stores `config_file_path` in the session options' config_options under the
// kNodePartitionConfigFile key.
// NOTE(review): ORT_THROW_IF_ERROR presumably throws when AddConfigEntry returns
// a non-OK status -- confirm against the macro definition.
void SetNodePartitionConfigFilePath(const char* config_file_path) {
|
||||
ORT_THROW_IF_ERROR(sess_options_->config_options.AddConfigEntry(kNodePartitionConfigFile, config_file_path));
|
||||
}
|
||||
// Accessor for the fixture's `std_kernel_` member; returns the owning unique_ptr
// by reference, so callers dereference it without taking ownership.
std::unique_ptr<::onnxruntime::KernelDef>& GetStdKernel() { return std_kernel_; }
|
||||
#ifdef USE_CUDA
|
||||
// Shared setup for tests that need a three-node graph mixing CUDA and non-CUDA kernels:
//   node1: "MemcpyToHost" kernel on the CUDA provider, output memory type OrtMemTypeCPUOutput
//          (Graph_input -> Arg1)
//   node2: the fixture's standard kernel from GetStdKernel() (Arg1 -> Arg2)
//   node3: "Transpose" kernel on the CUDA provider (Arg1 -> Arg3)
// Both node2 and node3 consume node1's output (Arg1).
//
// After building the graph, a CUDA execution provider is created and registered, an optional
// node-partition config file is applied, and CreatePlan({}, false) is invoked.
//
// partitionConfigFile: path to a partition config file; when nullptr (the default) no config
//                      entry is set before the plan is created.
void MemcpyToHostInCuda_TransposeInCudaAndCpu(const char* partitionConfigFile = nullptr) {
|
||||
std::unique_ptr<::onnxruntime::KernelDef> cudaKernel = KernelDefBuilder().SetName("MemcpyToHost").Provider(kCudaExecutionProvider).SetDefaultOutputMemoryType(OrtMemTypeCPUOutput).Build();
|
||||
std::unique_ptr<::onnxruntime::KernelDef> cudaKernelTrans = KernelDefBuilder().SetName("Transpose").Provider(kCudaExecutionProvider).SinceVersion(1, 10).Build();
|
||||
std::string Graph_input("Graph_input"), Arg1("Arg1"), Arg2("Arg2"), Arg3("Arg3"), node1("node1"), node2("node2"), node3("node3");
|
||||
std::vector<onnxruntime::NodeArg*> input1{Arg(Graph_input)}, output1{Arg(Arg1)}, output2{Arg(Arg2)}, output3{Arg(Arg3)};
|
||||
// Nodes are added with explicit names (node1/node2/node3).
// NOTE(review): presumably so a partition config file can refer to them by name -- confirm.
AddNode(*cudaKernel, node1, input1, output1);
|
||||
AddNode(*GetStdKernel(), node2, output1, output2);
|
||||
AddNode(*cudaKernelTrans, node3, output1, output3);
|
||||
|
||||
// Create a CUDA execution provider from default-constructed provider info and register it
// under the name "CUDAExecutionProvider".
CUDAExecutionProviderInfo epi;
|
||||
onnxruntime::ProviderInfo_CUDA& ep = onnxruntime::GetProviderInfo_CUDA();
|
||||
auto epFactory = ep.CreateExecutionProviderFactory(epi);
|
||||
std::unique_ptr<IExecutionProvider> execution_provider = epFactory->CreateProvider();
|
||||
AllocatorManager am;
|
||||
execution_provider->RegisterAllocator(am);
|
||||
ORT_THROW_IF_ERROR(GetExecutionProviders().Add("CUDAExecutionProvider", std::move(execution_provider)));
|
||||
|
||||
// The partition config must be set before CreatePlan so that it is in the session options
// when the plan is built.
if (partitionConfigFile != nullptr) SetNodePartitionConfigFilePath(partitionConfigFile);
|
||||
CreatePlan({}, false);
|
||||
}
|
||||
#endif // USE_CUDA
|
||||
};
|
||||
|
||||
TEST_F(PlannerTest, ChainTest) {
|
||||
|
|
@ -1272,23 +1295,7 @@ TEST_F(PlannerTest, MultiStream1StreamWaitFor2Streams) {
|
|||
// stream 1: node2 (CPU EP)
|
||||
// node1's output, which is consumed by both node2 and node3, is in CPU.
|
||||
TEST_F(PlannerTest, MultiStreamCudaEPNodeCPUOutput) {
|
||||
std::unique_ptr<::onnxruntime::KernelDef> cudaKernel = KernelDefBuilder().SetName("MemcpyToHost").Provider(kCudaExecutionProvider).SetDefaultOutputMemoryType(OrtMemTypeCPUOutput).Build();
|
||||
std::unique_ptr<::onnxruntime::KernelDef> cudaKernelTrans = KernelDefBuilder().SetName("Transpose").Provider(kCudaExecutionProvider).SinceVersion(1, 10).Build();
|
||||
std::string Graph_input("Graph_input"), Arg1("Arg1"), Arg2("Arg2"), Arg3("Arg3");
|
||||
AddNode(*cudaKernel, Graph_input, Arg1);
|
||||
AddNormalNode(Arg1, Arg2);
|
||||
AddNode(*cudaKernelTrans, Arg1, Arg3);
|
||||
|
||||
CUDAExecutionProviderInfo epi;
|
||||
onnxruntime::ProviderInfo_CUDA& ep = onnxruntime::GetProviderInfo_CUDA();
|
||||
auto epFactory = ep.CreateExecutionProviderFactory(epi);
|
||||
std::unique_ptr<IExecutionProvider> execution_provider = epFactory->CreateProvider();
|
||||
AllocatorManager am;
|
||||
execution_provider->RegisterAllocator(am);
|
||||
ORT_THROW_IF_ERROR(GetExecutionProviders().Add("CUDAExecutionProvider", std::move(execution_provider)));
|
||||
|
||||
CreatePlan({}, false);
|
||||
|
||||
MemcpyToHostInCuda_TransposeInCudaAndCpu("./testdata/multi_stream_models/memcpyToHost_same_stream_with_transpose.json");
|
||||
EXPECT_EQ(GetState().GetExecutionPlan()->execution_plan.size(), 2) << "2 logic streams";
|
||||
EXPECT_EQ(GetState().GetExecutionPlan()->execution_plan[0]->steps_.size(), 5) << "stream 0 has 5 steps";
|
||||
EXPECT_NE(strstr(typeid(*GetState().GetExecutionPlan()->execution_plan[0]->steps_[0]).name(), "LaunchKernelStep"), nullptr) << "0th step: LaunchKernelStep for node 1";
|
||||
|
|
@ -1814,9 +1821,9 @@ TEST_F(PlannerTest, ParaPlanCreation) {
|
|||
auto* exe_plan = const_cast<onnxruntime::SessionState&>(main_graph_session_state).GetExecutionPlan();
|
||||
auto& per_value_plans = exe_plan->GetAllocationPlan();
|
||||
InlinedHashMap<std::string, std::string> reuse_pairs;
|
||||
reuse_pairs.emplace("conv_0_out", "maxpool_out");
|
||||
reuse_pairs.emplace("conv_1_out", "conv_2_out");
|
||||
reuse_pairs.emplace("relu_1_out", "relu_2_out");
|
||||
reuse_pairs.emplace("conv_0_out", "relu_0_out"); // conv_0_out is reused by relu_0_out
|
||||
reuse_pairs.emplace("conv_1_out", "relu_1_out"); // conv_1_out is reused by relu_1_out
|
||||
reuse_pairs.emplace("conv_2_out", "relu_2_out"); // conv_2_out is reused by relu_2_out
|
||||
for (size_t i = 0; i < per_value_plans.size(); ++i) {
|
||||
auto& per_value_plan = per_value_plans[i];
|
||||
if (per_value_plan.alloc_kind == AllocKind::kReuse) {
|
||||
|
|
|
|||
5
onnxruntime/test/testdata/multi_stream_models/memcpyToHost_same_stream_with_transpose.json
vendored
Normal file
5
onnxruntime/test/testdata/multi_stream_models/memcpyToHost_same_stream_with_transpose.json
vendored
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
{
|
||||
"type":"DeviceBasedPartitioner",
|
||||
"streams":[["node1", "node3"],["node2"]],
|
||||
"devices":["1","0"]
|
||||
}
|
||||
Loading…
Reference in a new issue