Make MemcpyToHost to a separate stream for performance gain (#14487)

### Description
Move MemcpyToHost to a separate stream in the default
DeviceBasedPartitioner for a performance gain.



### Motivation and Context
Our experiments show that placing MemcpyToHost on a separate stream lets
it run in parallel with other kernels, especially compute-intensive
ones.

---------

Co-authored-by: Lei Cao <leca@microsoft.com>
This commit is contained in:
cao lei 2023-02-23 14:52:01 -08:00 committed by GitHub
parent 664e296270
commit a012d60777
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 49 additions and 54 deletions

View file

@ -572,15 +572,6 @@ class PlannerImpl {
}
Status ComputeReuseCount() {
// Note: for every ml-value, its definition must appear before all its uses in a topological sort of a valid model
using GraphInputsSet = InlinedHashSet<std::string_view>;
const auto& graph_inputs_nodes = graph_viewer_.GetInputsIncludingInitializers();
GraphInputsSet graph_inputs;
graph_inputs.reserve(graph_inputs_nodes.size());
for (auto& graph_input : graph_inputs_nodes) {
graph_inputs.insert(graph_input->Name());
}
for (auto graph_input : graph_viewer_.GetInputs()) {
OrtValueIndex index = Index(graph_input->Name());
UseCount(index)++; // Models caller's usage post-inference; ensures it will not be reused.
@ -1050,9 +1041,8 @@ class PlannerImpl {
auto& allocation_plan = plan_.allocation_plan;
// build the consumer list for each value
std::vector<InlinedVector<NodeIndex>> value_consumers;
int num_ml_values = ort_value_name_idx_map_.MaxIdx() + 1;
value_consumers.resize(num_ml_values);
value_consumer_map_.reserve(num_ml_values);
// iterate each stream from back, so the first element is the last consumer in single stream case
for (auto& stream : stream_nodes_) {
@ -1068,7 +1058,7 @@ class PlannerImpl {
auto origin = Buffer(value_idx);
if (origin != -1 && plan_.allocation_plan[origin].alloc_kind == AllocKind::kAllocate) {
// add current node as consumer for origin buffer
value_consumers[origin].push_back(node_index);
value_consumer_map_[origin].insert(node_index);
}
}
return Status::OK();
@ -1119,8 +1109,8 @@ class PlannerImpl {
auto p_input_arg = input_args[pair.first];
if (p_input_arg->Exists()) {
OrtValueIndex reusable_input{};
if (value_map.GetIdx(p_input_arg->Name(), reusable_input).IsOK() &&
allocation_plan[reusable_input].alloc_kind == AllocKind::kAllocate) {
if (value_map.GetIdx(p_input_arg->Name(), reusable_input).IsOK() /*&&
allocation_plan[reusable_input].alloc_kind == AllocKind::kAllocate*/) {
std::cout << p_input_arg->Name() << " reused by " << p_output_arg->Name() << " as input" << std::endl;
allocation_plan[output_idx_global].alloc_kind = AllocKind::kReuse;
allocation_plan[output_idx_global].reused_buffer = reusable_input;
@ -1152,7 +1142,6 @@ class PlannerImpl {
OrtValueIndex reusable_input{};
if (value_map.GetIdx(p_input_arg->Name(), reusable_input).IsOK() &&
allocation_plan[reusable_input].alloc_kind == AllocKind::kAllocate) {
std::cout << p_input_arg->Name() << " reused by " << p_output_arg->Name() << " as input" << std::endl;
allocation_plan[output_idx_global].alloc_kind = AllocKind::kReuse;
allocation_plan[output_idx_global].reused_buffer = reusable_input;
value_consumer_map_[reusable_input].insert(value_consumer_map_[output_idx_global].begin(),
@ -1175,7 +1164,6 @@ class PlannerImpl {
if (value_map.GetIdx(p_input_arg->Name(), input_arg_index).IsOK() &&
allocation_plan[input_arg_index].alloc_kind == AllocKind::kAllocate) {
if (value_consumer_map_[input_arg_index].size() == 1 && SameSize(*p_input_arg, *p_output_arg)) {
std::cout << p_input_arg->Name() << " reused by " << p_output_arg->Name() << " as an input" << std::endl;
allocation_plan[output_idx_global].alloc_kind = AllocKind::kReuse;
allocation_plan[output_idx_global].reused_buffer = input_arg_index;
value_consumer_map_[input_arg_index].insert(value_consumer_map_[output_idx_global].begin(),
@ -1302,6 +1290,17 @@ class PlannerImpl {
}
}
}
for (size_t value_index = 0; value_index < allocation_plan.size(); ++value_index) {
if (allocation_plan[value_index].alloc_kind == AllocKind::kReuse) {
while (allocation_plan[allocation_plan[value_index].reused_buffer].alloc_kind == AllocKind::kReuse &&
allocation_plan[value_index].reused_buffer != allocation_plan[allocation_plan[value_index].reused_buffer].reused_buffer) {
allocation_plan[value_index].reused_buffer = allocation_plan[allocation_plan[value_index].reused_buffer].reused_buffer;
}
ort_value_info_[value_index].reused_buffer_index = allocation_plan[value_index].reused_buffer;
}
}
return Status::OK();
}
#endif
@ -2110,19 +2109,6 @@ Status PlannerImpl::CreatePlan(
ORT_RETURN_IF_ERROR(BuildExecutionPlan(execution_providers_));
#endif
// build value_node_map
for (auto node_index : graph_viewer_.GetNodesInTopologicalOrder(context_->GetExecutionOrder())) {
auto* node = graph_viewer_.GetNode(node_index);
const auto& output_defs = node->OutputDefs();
for (size_t output_idx_local = 0; output_idx_local < output_defs.size(); ++output_idx_local) {
const auto& node_output = output_defs[output_idx_local];
if (!node_output->Exists()) continue;
OrtValueIndex output_idx_global;
ORT_THROW_IF_ERROR(ort_value_name_idx_map_.GetIdx(node_output->Name(), output_idx_global));
value_node_map_[output_idx_global] = node_index;
}
}
// determine sharing/reuse among ml-values
ORT_RETURN_IF_ERROR(ComputeReusePlan());
@ -2365,7 +2351,7 @@ std::unique_ptr<IGraphPartitioner> IGraphPartitioner::CreateGraphPartitioner(con
const PathString& config_file) {
// use device based partitioner by default
IGraphPartitioner::GraphPartitioningStrategy partitioner_type =
IGraphPartitioner::GraphPartitioningStrategy::Unknown;
IGraphPartitioner::GraphPartitioningStrategy::DeviceBasedPartition;
if (!config_file.empty()) {
std::ifstream f(config_file);
if (f.is_open()) {
@ -2383,11 +2369,8 @@ std::unique_ptr<IGraphPartitioner> IGraphPartitioner::CreateGraphPartitioner(con
f.close();
}
}
if (partitioner_type == IGraphPartitioner::GraphPartitioningStrategy::Unknown) {
partitioner_type = IGraphPartitioner::GraphPartitioningStrategy::DeviceBasedPartition;
LOGS(logger, INFO) << "Use DeviceBasedPartition as default";
}
if (partitioner_type == IGraphPartitioner::GraphPartitioningStrategy::DeviceBasedPartition) {
LOGS(logger, INFO) << "Use DeviceBasedPartition as default";
return std::make_unique<DeviceBasedPartitioner>(logger, config_file);
} // else if other partitioner types ...
ORT_THROW("Failed to create partitioner");

View file

@ -371,6 +371,29 @@ class PlannerTest : public ::testing::Test {
void SetNodePartitionConfigFilePath(const char* config_file_path) {
ORT_THROW_IF_ERROR(sess_options_->config_options.AddConfigEntry(kNodePartitionConfigFile, config_file_path));
}
std::unique_ptr<::onnxruntime::KernelDef>& GetStdKernel() { return std_kernel_; }
#ifdef USE_CUDA
void MemcpyToHostInCuda_TransposeInCudaAndCpu(const char* partitionConfigFile = nullptr) {
std::unique_ptr<::onnxruntime::KernelDef> cudaKernel = KernelDefBuilder().SetName("MemcpyToHost").Provider(kCudaExecutionProvider).SetDefaultOutputMemoryType(OrtMemTypeCPUOutput).Build();
std::unique_ptr<::onnxruntime::KernelDef> cudaKernelTrans = KernelDefBuilder().SetName("Transpose").Provider(kCudaExecutionProvider).SinceVersion(1, 10).Build();
std::string Graph_input("Graph_input"), Arg1("Arg1"), Arg2("Arg2"), Arg3("Arg3"), node1("node1"), node2("node2"), node3("node3");
std::vector<onnxruntime::NodeArg*> input1{Arg(Graph_input)}, output1{Arg(Arg1)}, output2{Arg(Arg2)}, output3{Arg(Arg3)};
AddNode(*cudaKernel, node1, input1, output1);
AddNode(*GetStdKernel(), node2, output1, output2);
AddNode(*cudaKernelTrans, node3, output1, output3);
CUDAExecutionProviderInfo epi;
onnxruntime::ProviderInfo_CUDA& ep = onnxruntime::GetProviderInfo_CUDA();
auto epFactory = ep.CreateExecutionProviderFactory(epi);
std::unique_ptr<IExecutionProvider> execution_provider = epFactory->CreateProvider();
AllocatorManager am;
execution_provider->RegisterAllocator(am);
ORT_THROW_IF_ERROR(GetExecutionProviders().Add("CUDAExecutionProvider", std::move(execution_provider)));
if (partitionConfigFile != nullptr) SetNodePartitionConfigFilePath(partitionConfigFile);
CreatePlan({}, false);
}
#endif // USE_CUDA
};
TEST_F(PlannerTest, ChainTest) {
@ -1272,23 +1295,7 @@ TEST_F(PlannerTest, MultiStream1StreamWaitFor2Streams) {
// stream 1: node2 (CPU EP)
// node1's output, which is consumed by both node2 and node3, is in CPU.
TEST_F(PlannerTest, MultiStreamCudaEPNodeCPUOutput) {
std::unique_ptr<::onnxruntime::KernelDef> cudaKernel = KernelDefBuilder().SetName("MemcpyToHost").Provider(kCudaExecutionProvider).SetDefaultOutputMemoryType(OrtMemTypeCPUOutput).Build();
std::unique_ptr<::onnxruntime::KernelDef> cudaKernelTrans = KernelDefBuilder().SetName("Transpose").Provider(kCudaExecutionProvider).SinceVersion(1, 10).Build();
std::string Graph_input("Graph_input"), Arg1("Arg1"), Arg2("Arg2"), Arg3("Arg3");
AddNode(*cudaKernel, Graph_input, Arg1);
AddNormalNode(Arg1, Arg2);
AddNode(*cudaKernelTrans, Arg1, Arg3);
CUDAExecutionProviderInfo epi;
onnxruntime::ProviderInfo_CUDA& ep = onnxruntime::GetProviderInfo_CUDA();
auto epFactory = ep.CreateExecutionProviderFactory(epi);
std::unique_ptr<IExecutionProvider> execution_provider = epFactory->CreateProvider();
AllocatorManager am;
execution_provider->RegisterAllocator(am);
ORT_THROW_IF_ERROR(GetExecutionProviders().Add("CUDAExecutionProvider", std::move(execution_provider)));
CreatePlan({}, false);
MemcpyToHostInCuda_TransposeInCudaAndCpu("./testdata/multi_stream_models/memcpyToHost_same_stream_with_transpose.json");
EXPECT_EQ(GetState().GetExecutionPlan()->execution_plan.size(), 2) << "2 logic streams";
EXPECT_EQ(GetState().GetExecutionPlan()->execution_plan[0]->steps_.size(), 5) << "stream 0 has 5 steps";
EXPECT_NE(strstr(typeid(*GetState().GetExecutionPlan()->execution_plan[0]->steps_[0]).name(), "LaunchKernelStep"), nullptr) << "0th step: LaunchKernelStep for node 1";
@ -1814,9 +1821,9 @@ TEST_F(PlannerTest, ParaPlanCreation) {
auto* exe_plan = const_cast<onnxruntime::SessionState&>(main_graph_session_state).GetExecutionPlan();
auto& per_value_plans = exe_plan->GetAllocationPlan();
InlinedHashMap<std::string, std::string> reuse_pairs;
reuse_pairs.emplace("conv_0_out", "maxpool_out");
reuse_pairs.emplace("conv_1_out", "conv_2_out");
reuse_pairs.emplace("relu_1_out", "relu_2_out");
reuse_pairs.emplace("conv_0_out", "relu_0_out"); // conv_0_out is reused by relu_0_out
reuse_pairs.emplace("conv_1_out", "relu_1_out"); // conv_1_out is reused by relu_1_out
reuse_pairs.emplace("conv_2_out", "relu_2_out"); // conv_2_out is reused by relu_2_out
for (size_t i = 0; i < per_value_plans.size(); ++i) {
auto& per_value_plan = per_value_plans[i];
if (per_value_plan.alloc_kind == AllocKind::kReuse) {

View file

@ -0,0 +1,5 @@
{
"type":"DeviceBasedPartitioner",
"streams":[["node1", "node3"],["node2"]],
"devices":["1","0"]
}