Fix bug in TransformerMemcpy (#1413)

* Initial commit

* Add test case

* Revert unintentional change

* Update comments

* Resolve PR feedback

* Craft test casse and add more logs

* Fix build failures

* Fix minor bug in the way modified is updated

* Remove full model inference session test

* Resolve PR comments

* Resolve more PR feedback

* Resolve more PR feedback

* Resolve more PR comments

* Remove logging

* Move GetInitializer() method to memcpy_transformer scope

* Remove some unnecessary blank lines

* Make GetInitializer static
This commit is contained in:
Hariharan Seshadri 2019-07-19 13:54:08 -07:00 committed by GitHub
parent 4cbc6e1cf5
commit 751ee7bb23
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 157 additions and 21 deletions

View file

@ -18,10 +18,10 @@ class TransformerMemcpyImpl {
bool ModifyGraph(const KernelRegistryManager& schema_registries);
private:
void ProcessDefs(onnxruntime::Node& node, const KernelRegistryManager& kernel_registries);
void ProcessDefs(onnxruntime::Node& node, const KernelRegistryManager& kernel_registries, InitializedTensorSet& initializers_consumed);
void BuildDefsMapping(const onnxruntime::NodeArg* arg, const KernelRegistryManager& kernel_registries);
void AddCopyNode(onnxruntime::NodeArg* arg, bool is_input);
void ProcessInitializers(const KernelRegistryManager& kernel_registries);
bool ProcessInitializers(const KernelRegistryManager& kernel_registries, const InitializedTensorSet& initializers_consumed);
private:
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(TransformerMemcpyImpl);
@ -52,6 +52,19 @@ class TransformerMemcpyImpl {
std::string provider_;
};
/** Helper that returns a pointer to the corresponding TensorProto for a name if it is an initializer.
@param check_outer_scope If true and the graph is a subgraph, check parent graph/s for 'name' if not found in 'graph'.
*/
static const onnx::TensorProto* GetInitializer(const Graph& graph, const std::string& name, bool check_outer_scope) {
const onnx::TensorProto* initializer = nullptr;
if (graph.GetInitializedTensor(name, initializer)) {
return initializer;
} else if (check_outer_scope && graph.IsSubgraph()) {
return GetInitializer(*graph.ParentGraph(), name, check_outer_scope);
}
return initializer;
}
// very simple GraphTransformer that uses TransformerMemcpyImpl for each graph
// and mainly provides the subgraph recursion functionality
common::Status MemcpyTransformer::ApplyImpl(Graph& graph, bool& modified, int graph_level) const {
@ -63,7 +76,8 @@ common::Status MemcpyTransformer::ApplyImpl(Graph& graph, bool& modified, int gr
provider != onnxruntime::kTensorrtExecutionProvider &&
provider != onnxruntime::kOpenVINOExecutionProvider) {
TransformerMemcpyImpl copy_impl(graph, provider);
modified = copy_impl.ModifyGraph(registry_manager_);
auto current_modified = copy_impl.ModifyGraph(registry_manager_);
modified = modified || current_modified;
}
}
@ -109,14 +123,16 @@ This transformer does not currently optimize copies between, e.g., two different
bool TransformerMemcpyImpl::ModifyGraph(const KernelRegistryManager& kernel_registries) {
bool modified = false;
InitializedTensorSet initializers_consumed;
// find defs that require copy
for (auto& node : graph_.Nodes()) {
//don't need to do node placement here now, onnxruntime will do it according to registered kernels.
ProcessDefs(node, kernel_registries);
//as we process the defs, collect all the initializers consumed at the current graph level
ProcessDefs(node, kernel_registries, initializers_consumed);
}
// for initializers shared by different providers, create dups
ProcessInitializers(kernel_registries);
if (ProcessInitializers(kernel_registries, initializers_consumed))
modified = true;
for (auto arg : graph_.GetInputs())
BuildDefsMapping(arg, kernel_registries);
@ -150,21 +166,27 @@ bool TransformerMemcpyImpl::ModifyGraph(const KernelRegistryManager& kernel_regi
return modified;
}
void TransformerMemcpyImpl::ProcessDefs(onnxruntime::Node& node, const KernelRegistryManager& kernel_registries) {
void TransformerMemcpyImpl::ProcessDefs(onnxruntime::Node& node, const KernelRegistryManager& kernel_registries, InitializedTensorSet& initializers_consumed) {
if (node.GetExecutionProviderType() == provider_) {
provider_nodes_.insert(&node);
// note KernelCreateInfo might be nullptr for custom kernel
const KernelCreateInfo* kci = nullptr;
kernel_registries.SearchKernelRegistry(node, &kci);
auto status = onnxruntime::Node::ForEachWithIndex(node.InputDefs(),
[this, &kci](const onnxruntime::NodeArg& arg, size_t index) {
if (kci && kci->kernel_def->IsInputOnCpu(index))
non_provider_input_defs_.insert(&arg);
else
provider_input_defs_.insert(&arg);
return Status::OK();
});
auto status = onnxruntime::Node::ForEachWithIndex(
node.InputDefs(),
[this, &kci, &initializers_consumed](const onnxruntime::NodeArg& arg, size_t index) {
// check if this NodeArg is an initializer defined in current outer graph level
const auto* initializer_tensor_proto =
GetInitializer(graph_, arg.Name(), true);
if (initializer_tensor_proto != nullptr)
initializers_consumed[arg.Name()] = initializer_tensor_proto;
if (kci && kci->kernel_def->IsInputOnCpu(index))
non_provider_input_defs_.insert(&arg);
else
provider_input_defs_.insert(&arg);
return Status::OK();
});
ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
@ -274,9 +296,9 @@ static const onnxruntime::NodeArg* FindNodeArg(const NodeArgSetType& def_set, co
// We duplicate any initializer that is used by both provider nodes and non-provider nodes
// to ensure that provider nodes and non-provider nodes don't share initializers, as they
// need to stay in different memory locations.
void TransformerMemcpyImpl::ProcessInitializers(const KernelRegistryManager& kernel_registries) {
bool TransformerMemcpyImpl::ProcessInitializers(const KernelRegistryManager& kernel_registries, const InitializedTensorSet& initializers_consumed) {
std::map<const onnxruntime::NodeArg*, onnxruntime::NodeArg*> replacements;
for (const auto& pair : graph_.GetAllInitializedTensors()) {
for (const auto& pair : initializers_consumed) {
const auto& name = pair.first;
const onnxruntime::NodeArg* provider_def = FindNodeArg(provider_input_defs_, name);
const onnxruntime::NodeArg* non_provider_def = FindNodeArg(non_provider_input_defs_, name);
@ -284,10 +306,15 @@ void TransformerMemcpyImpl::ProcessInitializers(const KernelRegistryManager& ker
std::string new_def_name = graph_.GenerateNodeArgName(name);
auto& new_def = graph_.GetOrCreateNodeArg(new_def_name, provider_def->TypeAsProto());
const TensorProto* tensor_proto = nullptr;
bool found = graph_.GetInitializedTensor(name, tensor_proto);
ORT_ENFORCE(found, "Failed to get initialized tensor ", name);
// We make a copy of the initializer that is to be consumed by the provider Node so that
// session state initializer can copy it over to the provider device during its operation
// TODO: The copy being made is possibly redundant if this occurs in a subgraph
// When multiple subgraphs consume the same initializer as an implicit input,
// multiple copies of the initializer will be made into the provider device
// This should not directly affect runtime performance as the copies occur during initialization
// but overuse of the provider device's memory is definitely inefficient
// In future, we need to "statefully" make the copy only once and use it in all subgraphs referencing the initializer
const TensorProto* tensor_proto = pair.second;
TensorProto new_tensor_proto = *tensor_proto;
*(new_tensor_proto.mutable_name()) = new_def_name;
graph_.AddInitializedTensor(new_tensor_proto);
@ -322,6 +349,9 @@ void TransformerMemcpyImpl::ProcessInitializers(const KernelRegistryManager& ker
p_node->ReplaceDefs(dup_replacements);
}
// This denotes a modification to the graph
return !replacements.empty();
}
} // namespace onnxruntime

View file

@ -177,6 +177,112 @@ TEST(TransformerTest, MemcpyTransformerTestCudaFirst) {
ExpectSame(node2, node4, 0);
ExpectSame(node2, node4, 1);
}
TEST(TransformerTest, TestCopyNodeInsertionInitializerInSubgraph) {
// In this test, we are going to create a subgraph consuming an implicit input
// which is an initializer in the outer scope, and this implicit input to the subgraph
// is consumed by nodes on multiple devices
TensorProto value_tensor;
value_tensor.add_dims(1);
value_tensor.add_float_data(1.f);
value_tensor.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
TypeProto tensor_float_type;
tensor_float_type.mutable_tensor_type()->set_elem_type(TensorProto_DataType_FLOAT);
TypeProto tensor_bool_type;
tensor_bool_type.mutable_tensor_type()->set_elem_type(TensorProto_DataType_BOOL);
onnxruntime::NodeArg i1_def("I1", &tensor_bool_type),
o1_def("O1", &tensor_float_type),
o2_def("O2", &tensor_float_type);
// main graph
// this will only contain one 'If' node
std::unordered_map<std::string, int> domain_to_version;
domain_to_version[kOnnxDomain] = 7;
auto model = std::make_shared<onnxruntime::Model>("test",
false,
ModelMetaData(),
IOnnxRuntimeOpSchemaRegistryList(),
domain_to_version);
onnxruntime::Graph& graph = model->MainGraph();
TensorProto parent_constant(value_tensor);
parent_constant.set_name("parent_constant");
graph.AddInitializedTensor(parent_constant);
// subgraph
// this will contain 2 'Add' nodes - one on CPU and one of GPU
// one of the inputs to the 'Add' nodes is an implicit input to the subgraph
// which is an initializer in the main graph
std::unordered_map<std::string, int> subgraph_domain_to_version;
subgraph_domain_to_version[kOnnxDomain] = 7;
auto sub_model = std::make_shared<onnxruntime::Model>("test_subgraph",
false,
ModelMetaData(),
IOnnxRuntimeOpSchemaRegistryList(),
subgraph_domain_to_version);
onnxruntime::Graph& subgraph = sub_model->MainGraph();
TensorProto local_constant(value_tensor);
local_constant.set_name("local_constant");
subgraph.AddInitializedTensor(local_constant);
subgraph.AddOuterScopeNodeArg("parent_constant");
subgraph.AddNode("node1", "Add", "operator1",
ArgMap{&subgraph.GetOrCreateNodeArg("local_constant", &tensor_float_type),
&graph.GetOrCreateNodeArg("parent_constant", &tensor_float_type)},
ArgMap{&o1_def});
subgraph.AddNode("node2", "Add", "operator2",
ArgMap{&subgraph.GetOrCreateNodeArg("local_constant", &tensor_float_type),
&graph.GetOrCreateNodeArg("parent_constant", &tensor_float_type)},
ArgMap{&o2_def});
auto status = subgraph.Resolve();
ASSERT_TRUE(status.IsOK()) << status.ErrorMessage();
// main graph continued
// create the 'If' node
auto& if_node = graph.AddNode("node3", "If", "cpu operator2", ArgMap{&i1_def}, ArgMap{&o1_def, &o2_def});
if_node.AddAttribute("then_branch", {subgraph.ToGraphProto()});
if_node.AddAttribute("else_branch", {subgraph.ToGraphProto()});
onnxruntime::Graph* subgraph_1 = if_node.GetMutableGraphAttribute("then_branch");
for (auto& node : subgraph_1->Nodes()) {
if (node.Name() == "node2") {
// only this node is on GPU
node.SetExecutionProviderType(onnxruntime::kCudaExecutionProvider);
} else {
node.SetExecutionProviderType(onnxruntime::kCpuExecutionProvider);
}
}
onnxruntime::Graph* subgraph_2 = if_node.GetMutableGraphAttribute("else_branch");
for (auto& node : subgraph_2->Nodes()) {
node.SetExecutionProviderType(onnxruntime::kCpuExecutionProvider);
}
status = graph.Resolve();
ASSERT_TRUE(status.IsOK()) << status.ErrorMessage();
KernelRegistryManager kernel_registry_manager;
ExecutionProviders execution_providers;
execution_providers.Add(onnxruntime::kCudaExecutionProvider,
std::make_unique<CUDAExecutionProvider>(CUDAExecutionProviderInfo()));
execution_providers.Add(onnxruntime::kCpuExecutionProvider,
std::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo()));
KernelRegistryManager test_registry_manager;
test_registry_manager.RegisterKernels(execution_providers);
MemcpyTransformer transformer({onnxruntime::kCudaExecutionProvider}, test_registry_manager);
bool modified = false;
status = transformer.Apply(graph, modified);
EXPECT_TRUE(status.IsOK()) << status.ErrorMessage();
EXPECT_TRUE(modified);
}
#endif
} // namespace test