mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-26 22:35:43 +00:00
Co-authored-by: Weixing Zhang <wezhan@microsoft.com> Support MIGraphXEP to work with ROCMEP for inference on AMD GPU
398 lines
18 KiB
C++
398 lines
18 KiB
C++
// Copyright (c) Microsoft Corporation. All rights reserved.
|
|
// Licensed under the MIT License.
|
|
|
|
#include "transformer_memcpy.h"
|
|
#include "core/framework/kernel_registry_manager.h"
|
|
#include "core/framework/execution_providers.h"
|
|
#include "core/framework/utils.h"
|
|
|
|
using namespace ONNX_NAMESPACE;
|
|
namespace onnxruntime {
|
|
|
|
// implements MemCpy node insertion in graph transform
|
|
// note that GraphTransformer::Apply() is supposed to be stateless, so this cannot derive from GraphTranformer
|
|
class TransformerMemcpyImpl {
|
|
public:
|
|
TransformerMemcpyImpl(onnxruntime::Graph& graph, const std::string& provider)
|
|
: graph_(graph), provider_(provider) {}
|
|
|
|
bool ModifyGraph(const KernelRegistryManager& schema_registries);
|
|
|
|
private:
|
|
void ProcessDefs(onnxruntime::Node& node, const KernelRegistryManager& kernel_registries, InitializedTensorSet& initializers_consumed);
|
|
void BuildDefsMapping(const onnxruntime::NodeArg* arg, const KernelRegistryManager& kernel_registries);
|
|
void AddCopyNode(onnxruntime::NodeArg* arg, bool is_input);
|
|
bool ProcessInitializers(const KernelRegistryManager& kernel_registries, const InitializedTensorSet& initializers_consumed);
|
|
|
|
private:
|
|
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(TransformerMemcpyImpl);
|
|
|
|
// use value-based compare to make sure transformer output order is consistent
|
|
struct NodeArgCompare {
|
|
bool operator()(const onnxruntime::NodeArg* lhs, const onnxruntime::NodeArg* rhs) const {
|
|
return lhs->Name() < rhs->Name();
|
|
}
|
|
};
|
|
|
|
std::set<onnxruntime::Node*, NodeCompare> provider_nodes_;
|
|
std::set<const onnxruntime::NodeArg*, NodeArgCompare> non_provider_input_defs_; // all input defs of non-provider nodes
|
|
std::set<onnxruntime::NodeArg*, NodeArgCompare> non_provider_output_defs_; // all output defs of non-provider nodes
|
|
std::set<const onnxruntime::NodeArg*, NodeArgCompare> provider_input_defs_; // all input defs of provider nodes that should be in provider allocator
|
|
std::set<onnxruntime::NodeArg*, NodeArgCompare> provider_output_defs_; // all output defs of provider nodes that should be in provider allocator
|
|
std::map<const onnxruntime::NodeArg*, std::set<onnxruntime::Node*, NodeCompare>> provider_input_nodes_;
|
|
std::map<const onnxruntime::NodeArg*, std::set<onnxruntime::Node*, NodeCompare>> provider_output_nodes_;
|
|
|
|
onnxruntime::Graph& graph_;
|
|
std::string provider_;
|
|
};
|
|
|
|
/** Helper that returns a pointer to the corresponding TensorProto for a name if it is an initializer.
|
|
@param check_outer_scope If true and the graph is a subgraph, check parent graph/s for 'name' if not found in 'graph'.
|
|
*/
|
|
static const onnx::TensorProto* GetInitializer(const Graph& graph, const std::string& name, bool check_outer_scope) {
|
|
const onnx::TensorProto* initializer = nullptr;
|
|
if (graph.GetInitializedTensor(name, initializer)) {
|
|
return initializer;
|
|
} else if (check_outer_scope && graph.IsSubgraph()) {
|
|
return GetInitializer(*graph.ParentGraph(), name, check_outer_scope);
|
|
}
|
|
return initializer;
|
|
}
|
|
|
|
// very simple GraphTransformer that uses TransformerMemcpyImpl for each graph
|
|
// and mainly provides the subgraph recursion functionality
|
|
common::Status MemcpyTransformer::ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const {
|
|
for (auto& provider : provider_types_) {
|
|
if (!utils::ProviderIsCpuBased(provider)) {
|
|
TransformerMemcpyImpl copy_impl(graph, provider);
|
|
auto current_modified = copy_impl.ModifyGraph(registry_manager_);
|
|
modified = modified || current_modified;
|
|
break;
|
|
}
|
|
}
|
|
|
|
// TODO: We probably need to do the recursion inline when processing the main graph in order to maximize efficiency.
|
|
// e.g. data on GPU prior to an 'If' node. The 'If' must run on CPU, but if the subgraph is GPU based it could
|
|
// consume the data from GPU and we shouldn't insert a memcpy from GPU to CPU prior to the If node, and one from
|
|
// CPU back to GPU when beginning execution of the subgraph. To do that requires inspecting the subgraph (and any
|
|
// nested subgraphs) when deciding whether to insert a memcpy in the parent graph, and may need to be fully done
|
|
// within TransformerMemcpyImpl instead of via this simple GraphTransformer.
|
|
|
|
// handle any subgraphs in nodes
|
|
for (auto& node : graph.Nodes()) {
|
|
ORT_RETURN_IF_ERROR(Recurse(node, modified, graph_level, logger));
|
|
}
|
|
|
|
return Status::OK();
|
|
}
|
|
|
|
/*
|
|
|
|
Overview: The transformer transforms the input graph as follows:
|
|
|
|
(1) For every initializer W that is referenced by both provider and non-provider nodes,
|
|
we create a duplicate initializer W2 and change all provider nodes to reference this
|
|
duplicate copy.
|
|
|
|
(2) For every ml-value X that is computed by a provider node and referenced by a
|
|
non-provider node, we introduce a new ml-value X2. We replace all references to X
|
|
in provider nodes by X2, and introduce a copy from X2 to X. (All graph outputs
|
|
are considered as non-provider references here.)
|
|
|
|
(3 For every ml-value X that is computed by a non-provider node and referenced by
|
|
a provider node, we introduce a new ml-value X2. We replace all references to X in
|
|
provider nodes by X2, and introduce a copy from X to X2. (All graph inputs are
|
|
considered to be non-provider here.)
|
|
|
|
Note that every ml-value is computed at a unique point (either provider or non-provider),
|
|
but it may be referenced and used at multiple points (by both provider and non-provider).
|
|
|
|
This transformer does not currently optimize copies between, e.g., two different GPU devices, etc.
|
|
|
|
*/
|
|
|
|
bool TransformerMemcpyImpl::ModifyGraph(const KernelRegistryManager& kernel_registries) {
|
|
bool modified = false;
|
|
InitializedTensorSet initializers_consumed;
|
|
// find defs that require copy
|
|
for (auto& node : graph_.Nodes()) {
|
|
//as we process the defs, collect all the initializers consumed at the current graph level
|
|
ProcessDefs(node, kernel_registries, initializers_consumed);
|
|
}
|
|
|
|
// for initializers shared by different providers, create dups
|
|
if (ProcessInitializers(kernel_registries, initializers_consumed))
|
|
modified = true;
|
|
|
|
for (auto arg : graph_.GetInputs())
|
|
BuildDefsMapping(arg, kernel_registries);
|
|
|
|
for (auto arg : non_provider_input_defs_)
|
|
BuildDefsMapping(arg, kernel_registries);
|
|
|
|
for (auto arg : non_provider_output_defs_)
|
|
BuildDefsMapping(arg, kernel_registries);
|
|
|
|
for (auto arg : graph_.GetInputs())
|
|
// For inputs we need to create a copy node only when the input is connected to both provider
|
|
// and non-provider nodes. Otherwise utils::CopyInputsAcrossDevices() will do the job.
|
|
if (provider_input_defs_.count(arg) && non_provider_input_defs_.count(arg)) {
|
|
AddCopyNode(const_cast<onnxruntime::NodeArg*>(arg), true);
|
|
modified = true;
|
|
}
|
|
|
|
for (auto arg : non_provider_output_defs_)
|
|
if (provider_input_defs_.count(arg)) {
|
|
AddCopyNode(arg, true);
|
|
modified = true;
|
|
}
|
|
|
|
for (auto arg : provider_output_defs_)
|
|
if (non_provider_input_defs_.count(arg)) {
|
|
AddCopyNode(arg, false);
|
|
modified = true;
|
|
}
|
|
|
|
// Process implicit inputs in subgraphs that is explicitly consumed
|
|
// on both provider and non-provider nodes. This is mimicking
|
|
// logic for explicit graph inputs.
|
|
if (graph_.IsSubgraph()) {
|
|
for (auto arg : graph_.ParentNode()->ImplicitInputDefs()) {
|
|
// Looking into `provider_input_defs_` and `non_provider_input_defs_`
|
|
// using NodeArg pointers from the outer scope is okay because the
|
|
// comparator is only name based (and doesn't compare raw pointers)
|
|
if (provider_input_defs_.count(arg) && non_provider_input_defs_.count(arg)) {
|
|
// There should be at-least one explicit consumer of the NodeArg
|
|
// in both the provider node list and the non-provider node list.
|
|
// If there are no explicit consumers in both lists, we don't want
|
|
// to get into the business of adding copy nodes at this
|
|
// level.
|
|
// If there are explicit consumers in only one list (either provider
|
|
// or non-provider node consumers), there isn't any point in adding
|
|
// copy nodes in that case either as subgraph copy logic will take
|
|
// it to the required device (i.e.) we don't need to care about it here.
|
|
|
|
// Be sure to use the NodeArg* relevant to the current graph level
|
|
// (the name will be the same as the parent node's implicit input)
|
|
const auto* node_arg_in_current_graph_level = *provider_input_defs_.find(arg);
|
|
|
|
AddCopyNode(const_cast<onnxruntime::NodeArg*>(node_arg_in_current_graph_level), true);
|
|
modified = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
return modified;
|
|
}
|
|
|
|
void TransformerMemcpyImpl::ProcessDefs(onnxruntime::Node& node, const KernelRegistryManager& kernel_registries,
|
|
InitializedTensorSet& initializers_consumed) {
|
|
auto node_provider_type = node.GetExecutionProviderType();
|
|
if ((node_provider_type == provider_) ||
|
|
(node_provider_type == kCudaExecutionProvider && kTensorrtExecutionProvider == provider_)) {
|
|
provider_nodes_.insert(&node);
|
|
// note KernelCreateInfo might be nullptr for custom kernel
|
|
const KernelCreateInfo* kci = nullptr;
|
|
ORT_IGNORE_RETURN_VALUE(kernel_registries.SearchKernelRegistry(node, &kci));
|
|
|
|
bool is_implicit_input = false;
|
|
auto process_inputs =
|
|
[this, &node, &kci, &initializers_consumed, &is_implicit_input](const onnxruntime::NodeArg& arg, size_t index) {
|
|
// check if this NodeArg is an initializer defined in current outer graph level
|
|
const auto* initializer_tensor_proto = GetInitializer(graph_, arg.Name(), true);
|
|
if (initializer_tensor_proto != nullptr) {
|
|
initializers_consumed[arg.Name()] = initializer_tensor_proto;
|
|
}
|
|
|
|
// implicit inputs have no location info in the kernel def, so do nothing to them here, leaving the control
|
|
// flow op (Loop, Scan, If) to do the necessary copy if the input crosses different provider.
|
|
// PlannerImpl::ComputeUseCounts has matching logic so the allocation plan does the same thing
|
|
if (!is_implicit_input) {
|
|
if (utils::IsInputOnCpu(node, kci, index)) {
|
|
non_provider_input_defs_.insert(&arg);
|
|
} else {
|
|
provider_input_defs_.insert(&arg);
|
|
}
|
|
}
|
|
|
|
return Status::OK();
|
|
};
|
|
|
|
auto status = onnxruntime::Node::ForEachWithIndex(node.InputDefs(), process_inputs);
|
|
ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
|
|
|
|
is_implicit_input = true;
|
|
status = onnxruntime::Node::ForEachWithIndex(node.ImplicitInputDefs(), process_inputs);
|
|
ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
|
|
|
|
auto& output_defs = node.MutableOutputDefs();
|
|
for (size_t i = 0; i < output_defs.size(); ++i) {
|
|
auto arg = output_defs[i];
|
|
if (!arg->Exists())
|
|
continue;
|
|
|
|
if (kci && kci->kernel_def->IsOutputOnCpu(i))
|
|
non_provider_output_defs_.insert(arg);
|
|
else
|
|
provider_output_defs_.insert(arg);
|
|
}
|
|
} else if (node_provider_type != kCudaExecutionProvider && node_provider_type != kTensorrtExecutionProvider &&
|
|
node_provider_type != kRocmExecutionProvider && node_provider_type != kMIGraphXExecutionProvider) {
|
|
// TODO: copy between devices? i.e. multiple GPUs
|
|
if (node_provider_type != onnxruntime::kCpuExecutionProvider &&
|
|
node_provider_type != onnxruntime::kVitisAIExecutionProvider &&
|
|
!node_provider_type.empty()) {
|
|
ORT_THROW("Execution type '", node_provider_type, "' doesn't support memcpy ");
|
|
}
|
|
|
|
for (const auto* arg : node.InputDefs()) {
|
|
if (arg->Exists())
|
|
non_provider_input_defs_.insert(arg);
|
|
}
|
|
|
|
// Never add an implicit def to provider_input_defs_ or non_provider_input_defs_.
|
|
// This is because we don't want to add copy nodes on account of implicit
|
|
// inputs to nodes.
|
|
// We will rely on utils::CopyInputsAcrossDevices() to do the job.
|
|
//for (const auto* arg : node.ImplicitInputDefs()) {
|
|
// if (arg->Exists())
|
|
// non_provider_input_defs_.insert(arg);
|
|
//}
|
|
|
|
for (auto* arg : node.MutableOutputDefs()) {
|
|
if (arg->Exists())
|
|
non_provider_output_defs_.insert(arg);
|
|
}
|
|
}
|
|
}
|
|
|
|
//for non_provider defs, collect the nodes that expect it is provider tensor as input/output.
|
|
void TransformerMemcpyImpl::BuildDefsMapping(const onnxruntime::NodeArg* arg, const KernelRegistryManager& kernel_registries) {
|
|
for (auto& it : graph_.Nodes()) {
|
|
if (it.OpType() == "MemcpyFromHost" || it.OpType() == "MemcpyToHost") continue;
|
|
auto input_it =
|
|
std::find(it.MutableInputDefs().begin(), it.MutableInputDefs().end(), const_cast<onnxruntime::NodeArg*>(arg));
|
|
auto output_it =
|
|
std::find(it.MutableOutputDefs().begin(), it.MutableOutputDefs().end(), const_cast<onnxruntime::NodeArg*>(arg));
|
|
int arg_input_index =
|
|
input_it != it.MutableInputDefs().end() ? static_cast<int>(input_it - it.MutableInputDefs().begin()) : -1;
|
|
int arg_output_index =
|
|
output_it != it.MutableOutputDefs().end() ? static_cast<int>(output_it - it.MutableOutputDefs().begin()) : -1;
|
|
if (arg_input_index == -1 && arg_output_index == -1)
|
|
continue;
|
|
auto node_provider_type = it.GetExecutionProviderType();
|
|
if ((node_provider_type == provider_) || (node_provider_type == kCudaExecutionProvider && kTensorrtExecutionProvider == provider_)) {
|
|
const KernelCreateInfo* kci = nullptr;
|
|
ORT_IGNORE_RETURN_VALUE(kernel_registries.SearchKernelRegistry(it, &kci));
|
|
if (arg_input_index != -1) {
|
|
if (!kci || !utils::IsInputOnCpu(it, kci, arg_input_index)) provider_input_nodes_[arg].insert(&it);
|
|
}
|
|
if (arg_output_index != -1) {
|
|
if (!kci || !kci->kernel_def->IsOutputOnCpu(arg_output_index)) provider_output_nodes_[arg].insert(&it);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void TransformerMemcpyImpl::AddCopyNode(onnxruntime::NodeArg* arg, bool is_input) {
|
|
// create unique name for new def
|
|
std::string new_def_name = graph_.GenerateNodeArgName(arg->Name() + "_" + provider_);
|
|
|
|
auto* new_arg = &graph_.GetOrCreateNodeArg(new_def_name, arg->TypeAsProto());
|
|
auto* src_arg = is_input ? arg : new_arg;
|
|
auto* dst_arg = is_input ? new_arg : arg;
|
|
|
|
// create unique name for copy node
|
|
std::string new_node_name = graph_.GenerateNodeName("Memcpy");
|
|
|
|
const auto op_name = is_input ? "MemcpyFromHost" : "MemcpyToHost";
|
|
auto& new_node = graph_.AddNode(new_node_name, op_name, "Copy from/to host memory",
|
|
std::vector<onnxruntime::NodeArg*>{src_arg},
|
|
std::vector<onnxruntime::NodeArg*>{dst_arg});
|
|
new_node.SetExecutionProviderType(provider_);
|
|
std::map<const onnxruntime::NodeArg*, onnxruntime::NodeArg*> map = {{arg, new_arg}};
|
|
auto it = provider_input_nodes_.find(arg);
|
|
if (it != provider_input_nodes_.end()) {
|
|
for (auto* node : it->second)
|
|
node->ReplaceDefs(map);
|
|
}
|
|
it = provider_output_nodes_.find(arg);
|
|
if (it != provider_output_nodes_.end()) {
|
|
for (auto* node : it->second)
|
|
node->ReplaceDefs(map);
|
|
}
|
|
}
|
|
|
|
template <typename NodeArgSetType>
|
|
static const onnxruntime::NodeArg* FindNodeArg(const NodeArgSetType& def_set, const std::string& name) {
|
|
NodeArg def(name, nullptr);
|
|
auto it = def_set.find(&def); // this works since we use name to compare NodeArg
|
|
if (it != def_set.end())
|
|
return *it;
|
|
return nullptr;
|
|
}
|
|
|
|
// We duplicate any initializer that is used by both provider nodes and non-provider nodes
|
|
// to ensure that provider nodes and non-provider nodes don't share initializers, as they
|
|
// need to stay in different memory locations.
|
|
bool TransformerMemcpyImpl::ProcessInitializers(const KernelRegistryManager& kernel_registries, const InitializedTensorSet& initializers_consumed) {
|
|
std::map<const onnxruntime::NodeArg*, onnxruntime::NodeArg*> replacements;
|
|
for (const auto& pair : initializers_consumed) {
|
|
const auto& name = pair.first;
|
|
const onnxruntime::NodeArg* provider_def = FindNodeArg(provider_input_defs_, name);
|
|
const onnxruntime::NodeArg* non_provider_def = FindNodeArg(non_provider_input_defs_, name);
|
|
if (provider_def != nullptr && non_provider_def != nullptr) {
|
|
std::string new_def_name = graph_.GenerateNodeArgName(name);
|
|
auto& new_def = graph_.GetOrCreateNodeArg(new_def_name, provider_def->TypeAsProto());
|
|
|
|
// We make a copy of the initializer that is to be consumed by the provider Node so that
|
|
// session state initializer can copy it over to the provider device during its operation
|
|
// TODO: The copy being made is possibly redundant if this occurs in a subgraph
|
|
// When multiple subgraphs consume the same initializer as an implicit input,
|
|
// multiple copies of the initializer will be made into the provider device
|
|
// This should not directly affect runtime performance as the copies occur during initialization
|
|
// but overuse of the provider device's memory is definitely inefficient
|
|
// In future, we need to "statefully" make the copy only once and use it in all subgraphs referencing the initializer
|
|
const TensorProto* tensor_proto = pair.second;
|
|
TensorProto new_tensor_proto = *tensor_proto;
|
|
*(new_tensor_proto.mutable_name()) = new_def_name;
|
|
graph_.AddInitializedTensor(new_tensor_proto);
|
|
|
|
replacements.insert(std::make_pair(provider_def, &new_def));
|
|
}
|
|
}
|
|
|
|
for (auto p_node : provider_nodes_) {
|
|
// make a copy of replacement map as the node may exclude mapping for InputDefs with MemTypeOnCpuExplicitly
|
|
auto dup_replacements = replacements;
|
|
|
|
const KernelCreateInfo* kci = nullptr;
|
|
auto status = kernel_registries.SearchKernelRegistry(*p_node, &kci);
|
|
ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
|
|
if (kci == nullptr) continue;
|
|
if (kci->kernel_def == nullptr) continue;
|
|
ORT_THROW_IF_ERROR(Node::ForEachWithIndex(
|
|
p_node->InputDefs(),
|
|
[kci, &p_node, &dup_replacements](const onnxruntime::NodeArg& arg, size_t index) {
|
|
if (utils::IsInputOnCpu(*p_node, kci, index)) dup_replacements.erase(&arg);
|
|
return Status::OK();
|
|
}));
|
|
|
|
// normally initializers are only inputs, but things may change with ops like assign
|
|
ORT_THROW_IF_ERROR(Node::ForEachWithIndex(
|
|
p_node->OutputDefs(),
|
|
[kci, &dup_replacements](const onnxruntime::NodeArg& arg, size_t index) {
|
|
if (kci->kernel_def->IsOutputOnCpu(index)) {
|
|
ORT_ENFORCE(dup_replacements.find(&arg) == dup_replacements.end());
|
|
}
|
|
return Status::OK();
|
|
}));
|
|
|
|
p_node->ReplaceDefs(dup_replacements);
|
|
}
|
|
|
|
// This denotes a modification to the graph
|
|
return !replacements.empty();
|
|
}
|
|
|
|
} // namespace onnxruntime
|