From d5c5c4fa500eaabc5e48775437c10b608a0a66e7 Mon Sep 17 00:00:00 2001 From: Hariharan Seshadri Date: Tue, 12 Oct 2021 11:21:17 -0700 Subject: [PATCH] Handle implicit subgraph inputs required on different devices in Memcpy transformer (#9299) --- .../core/optimizer/transformer_memcpy.cc | 43 ++++- .../test/framework/memcpy_transformer_test.cc | 164 +++++++++++++++++- 2 files changed, 200 insertions(+), 7 deletions(-) diff --git a/onnxruntime/core/optimizer/transformer_memcpy.cc b/onnxruntime/core/optimizer/transformer_memcpy.cc index d998f66e26..aed760aa05 100644 --- a/onnxruntime/core/optimizer/transformer_memcpy.cc +++ b/onnxruntime/core/optimizer/transformer_memcpy.cc @@ -153,6 +153,35 @@ bool TransformerMemcpyImpl::ModifyGraph(const KernelRegistryManager& kernel_regi modified = true; } + // Process implicit inputs in subgraphs that are explicitly consumed + // on both provider and non-provider nodes. This mimics the + // logic for explicit graph inputs. + if (graph_.IsSubgraph()) { + for (auto arg : graph_.ParentNode()->ImplicitInputDefs()) { + // Looking into `provider_input_defs_` and `non_provider_input_defs_` + // using NodeArg pointers from the outer scope is okay because the + // comparator is only name based (and doesn't compare raw pointers) + if (provider_input_defs_.count(arg) && non_provider_input_defs_.count(arg)) { + // There should be at least one explicit consumer of the NodeArg + // in both the provider node list and the non-provider node list. + // If there are no explicit consumers in both lists, we don't want + // to get into the business of adding copy nodes at this + // level. + // If there are explicit consumers in only one list (either provider + // or non-provider node consumers), there isn't any point in adding + // copy nodes in that case either, as subgraph copy logic will take + // it to the required device (i.e., we don't need to care about it here). 
+ + // Be sure to use the NodeArg* relevant to the current graph level + // (the name will be the same as the parent node's implicit input) + const auto* node_arg_in_current_graph_level = *provider_input_defs_.find(arg); + + AddCopyNode(const_cast(node_arg_in_current_graph_level), true); + modified = true; + } + } + } + return modified; } @@ -176,7 +205,7 @@ void TransformerMemcpyImpl::ProcessDefs(onnxruntime::Node& node, const KernelReg } // implicit inputs have no location info in the kernel def, so do nothing to them here, leaving the control - // flow op (Loop, Scan, If) to do the necessary copy if the input crosses different provider. + // flow op (Loop, Scan, If) to do the necessary copy if the input crosses different provider. // PlannerImpl::ComputeUseCounts has matching logic so the allocation plan does the same thing if (!is_implicit_input) { if (utils::IsInputOnCpu(node, kci, index)) { @@ -220,10 +249,14 @@ void TransformerMemcpyImpl::ProcessDefs(onnxruntime::Node& node, const KernelReg non_provider_input_defs_.insert(arg); } - for (const auto* arg : node.ImplicitInputDefs()) { - if (arg->Exists()) - non_provider_input_defs_.insert(arg); - } + // Never add an implicit def to provider_input_defs_ or non_provider_input_defs_. + // This is because we don't want to add copy nodes on account of implicit + // inputs to nodes. + // We will rely on utils::CopyInputsAcrossDevices() to do the job. 
+ //for (const auto* arg : node.ImplicitInputDefs()) { + // if (arg->Exists()) + // non_provider_input_defs_.insert(arg); + //} for (auto* arg : node.MutableOutputDefs()) { if (arg->Exists()) diff --git a/onnxruntime/test/framework/memcpy_transformer_test.cc b/onnxruntime/test/framework/memcpy_transformer_test.cc index 77b5feb66f..6e86e5b58a 100644 --- a/onnxruntime/test/framework/memcpy_transformer_test.cc +++ b/onnxruntime/test/framework/memcpy_transformer_test.cc @@ -182,10 +182,16 @@ TEST(TransformerTest, MemcpyTransformerTestCudaFirst) { ExpectSame(node2, node4, 0); ExpectSame(node2, node4, 1); } -TEST(TransformerTest, TestCopyNodeInsertionInitializerInSubgraph) { +TEST(TransformerTest, TestInitializerDuplicationInSubgraph) { // In this test, we are going to create a subgraph consuming an implicit input // which is an initializer in the outer scope, and this implicit input to the subgraph - // is consumed by nodes on multiple devices + // is consumed by nodes on multiple devices. + + // Since the outer scope initializer is consumed on different devices in the subgraph, + // a copy of the initializer is made in the subgraph to be provided to the provider (CUDA) node. + // No explicit copy nodes are inserted in this scenario and hence we do not check for copy nodes. + // Instead, we check whether the transformer modified the graph while processing the parent initializer + // in the subgraph. TensorProto value_tensor; value_tensor.add_dims(1); value_tensor.add_float_data(1.f); @@ -288,6 +294,160 @@ TEST(TransformerTest, TestCopyNodeInsertionInitializerInSubgraph) { EXPECT_TRUE(modified); } +TEST(TransformerTest, MemcpyTransformerTestGraphInputConsumedOnMultipleDevices) { + // In this test, a graph input is consumed by 2 nodes partitioned to different devices. + // We expect a copy node to be inserted ahead of the provider (CUDA) node that consumes + // the graph input. 
+ std::unordered_map domain_to_version; + domain_to_version[kOnnxDomain] = 7; + auto model = std::make_shared("test", false, ModelMetaData(), PathString(), + IOnnxRuntimeOpSchemaRegistryList(), + domain_to_version, std::vector(), + DefaultLoggingManager().DefaultLogger()); + onnxruntime::Graph& graph = model->MainGraph(); + + TypeProto tensor_float_type; + tensor_float_type.mutable_tensor_type()->set_elem_type(TensorProto_DataType_FLOAT); + onnxruntime::NodeArg i1_def("I1", &tensor_float_type), + o1_def("O1", &tensor_float_type), + o2_def("O2", &tensor_float_type); + + // I1 is a graph input that is consumed by 2 MatMul nodes on different devices + auto& node1 = graph.AddNode("node1", "MatMul", "cpu operator1", ArgMap{&i1_def, &i1_def}, ArgMap{&o1_def}); + node1.SetExecutionProviderType(onnxruntime::kCpuExecutionProvider); + auto& node2 = graph.AddNode("node2", "MatMul", "gpu operator1", ArgMap{&i1_def, &i1_def}, ArgMap{&o2_def}); + node2.SetExecutionProviderType(onnxruntime::kCudaExecutionProvider); + + auto status = graph.Resolve(); + ASSERT_TRUE(status.IsOK()) << status.ErrorMessage(); + + KernelRegistryManager kernel_registry_manager; + ExecutionProviders execution_providers; + ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider())); + ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, + std::make_unique(CPUExecutionProviderInfo()))); + KernelRegistryManager test_registry_manager; + ASSERT_STATUS_OK(test_registry_manager.RegisterKernels(execution_providers)); + + MemcpyTransformer transformer({onnxruntime::kCudaExecutionProvider}, test_registry_manager); + + bool modified = false; + status = transformer.Apply(graph, modified, DefaultLoggingManager().DefaultLogger()); + EXPECT_TRUE(status.IsOK()) << status.ErrorMessage(); + EXPECT_TRUE(modified); + + auto op_count_map = CountOpsInGraph(graph); + ASSERT_TRUE(op_count_map["MemcpyFromHost"] == 1); +} + +TEST(TransformerTest, 
MemcpyTransformerTestImplicitInputConsumedOnMultipleDevices) { + // In this test, an implicit input (consumed by If subgraphs) + // is consumed by 2 nodes partitioned to different devices. + // We expect a copy node to get inserted to the provider (CUDA) node while consuming + // the implicit input. + std::unordered_map domain_to_version; + domain_to_version[kOnnxDomain] = 7; + auto model = std::make_shared("test", false, ModelMetaData(), PathString(), + IOnnxRuntimeOpSchemaRegistryList(), + domain_to_version, std::vector(), + DefaultLoggingManager().DefaultLogger()); + onnxruntime::Graph& graph = model->MainGraph(); + + std::unordered_map subgraph_domain_to_version; + subgraph_domain_to_version[kOnnxDomain] = 7; + auto sub_model = std::make_shared("test_subgraph", + false, + ModelMetaData(), + PathString(), + IOnnxRuntimeOpSchemaRegistryList(), + subgraph_domain_to_version, std::vector(), + DefaultLoggingManager().DefaultLogger()); + onnxruntime::Graph& subgraph = sub_model->MainGraph(); + + TypeProto tensor_float_type; + tensor_float_type.mutable_tensor_type()->set_elem_type(TensorProto_DataType_FLOAT); + + TypeProto tensor_bool_type; + tensor_bool_type.mutable_tensor_type()->set_elem_type(TensorProto_DataType_BOOL); + + onnxruntime::NodeArg i1_def("I1", &tensor_bool_type), + i2_def("I2", &tensor_float_type), + o1_def("O1", &tensor_float_type), + o2_def("O2", &tensor_float_type); + + // I2 is an implicit (outer scope) input that is consumed by 2 MatMul nodes on different devices + auto& implicit_input_arg = graph.GetOrCreateNodeArg("I2", &tensor_float_type); + subgraph.AddNode("node1", "MatMul", "cpu operator1", ArgMap{&implicit_input_arg, &implicit_input_arg}, ArgMap{&o1_def}); + subgraph.AddNode("node2", "MatMul", "gpu operator1", ArgMap{&implicit_input_arg, &implicit_input_arg}, ArgMap{&o2_def}); + + subgraph.AddOuterScopeNodeArg("I2"); + + auto status = subgraph.Resolve(); + ASSERT_TRUE(status.IsOK()) << status.ErrorMessage(); + + // Main graph continued + TensorProto init; 
+ init.add_dims(1); + init.add_int32_data(1); + init.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_BOOL); + init.set_name("I1"); + graph.AddInitializedTensor(init); + + auto& if_node = graph.AddNode("node3", "If", "gpu operator", ArgMap{&i1_def}, ArgMap{&o1_def, &o2_def}); + + if_node.AddAttribute("then_branch", subgraph.ToGraphProto()); + if_node.AddAttribute("else_branch", subgraph.ToGraphProto()); + + graph.SetInputs({&i1_def, &i2_def}); + + onnxruntime::Graph* subgraph_1 = if_node.GetMutableGraphAttribute("then_branch"); + for (auto& node : subgraph_1->Nodes()) { + if (node.Name() == "node2") { + // only this node is on GPU + node.SetExecutionProviderType(onnxruntime::kCudaExecutionProvider); + } else { + node.SetExecutionProviderType(onnxruntime::kCpuExecutionProvider); + } + } + + onnxruntime::Graph* subgraph_2 = if_node.GetMutableGraphAttribute("else_branch"); + for (auto& node : subgraph_2->Nodes()) { + if (node.Name() == "node2") { + // only this node is on GPU + node.SetExecutionProviderType(onnxruntime::kCudaExecutionProvider); + } else { + node.SetExecutionProviderType(onnxruntime::kCpuExecutionProvider); + } + } + + status = graph.Resolve(); + ASSERT_TRUE(status.IsOK()) << status.ErrorMessage(); + + KernelRegistryManager kernel_registry_manager; + ExecutionProviders execution_providers; + ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider())); + ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, + std::make_unique(CPUExecutionProviderInfo()))); + KernelRegistryManager test_registry_manager; + ASSERT_STATUS_OK(test_registry_manager.RegisterKernels(execution_providers)); + + MemcpyTransformer transformer({onnxruntime::kCudaExecutionProvider}, test_registry_manager); + + bool modified = false; + status = transformer.Apply(graph, modified, DefaultLoggingManager().DefaultLogger()); + EXPECT_TRUE(status.IsOK()) << status.ErrorMessage(); + EXPECT_TRUE(modified); + + // We 
expect to see copy nodes inserted in each of the subgraphs + // because an implicit input is consumed both by provider (CUDA) and + // non-provider (CPU) nodes. + auto op_count_map = CountOpsInGraph(*subgraph_1); + ASSERT_TRUE(op_count_map["MemcpyFromHost"] == 1); + + op_count_map = CountOpsInGraph(*subgraph_2); + ASSERT_TRUE(op_count_map["MemcpyFromHost"] == 1); +} + #endif } // namespace test