mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-23 02:38:28 +00:00
* Place shape related nodes in CPU * visit candidates by topological order * Make CPU node placement a utility function * skip placing on CPU if the data type is float16 or bfloat16 Co-authored-by: Sherlock <baihan.huang@gmail.com>
This commit is contained in:
parent
5de47affb1
commit
38e1bbce72
3 changed files with 184 additions and 56 deletions
|
|
@ -125,6 +125,18 @@ class GraphViewer {
|
|||
/** Get the Node containing this Graph if IsSubgraph is true. Returns nullptr otherwise. */
|
||||
const Node* ParentNode() const noexcept { return graph_->ParentNode(); }
|
||||
|
||||
#if !defined(ORT_MINIMAL_BUILD)
|
||||
/** Get the consumer nodes of a node arg */
|
||||
std::vector<const Node*> GetConsumerNodes(const std::string& node_arg_name) const {
|
||||
return graph_->GetConsumerNodes(node_arg_name);
|
||||
}
|
||||
|
||||
/** Get the producer node of a node arg */
|
||||
const Node* GetProducerNode(const std::string& node_arg_name) const {
  // Pure pass-through: the wrapped Graph performs the lookup by node arg name.
  const Node* producer = graph_->GetProducerNode(node_arg_name);
  return producer;
}
|
||||
#endif
|
||||
|
||||
private:
|
||||
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(GraphViewer);
|
||||
|
||||
|
|
|
|||
153
onnxruntime/core/framework/fallback_cpu_capability.h
Normal file
153
onnxruntime/core/framework/fallback_cpu_capability.h
Normal file
|
|
@ -0,0 +1,153 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#pragma once
|
||||
#include "core/graph/graph_viewer.h"
|
||||
#include "onnx/defs/data_type_utils.h"
|
||||
#include <queue>
|
||||
|
||||
using namespace ONNX_NAMESPACE::Utils;
|
||||
|
||||
namespace onnxruntime {
|
||||
|
||||
namespace {
|
||||
const int64_t Small_Initializer_Threshold = 100;
|
||||
|
||||
// Returns true when `arg` names an initializer in `graph` whose element count
// (the product of its dims) is at most Small_Initializer_Threshold and which is
// consumed by exactly one node.
bool IsSmallInitializerWithSingleConsumer(const onnxruntime::GraphViewer& graph, const NodeArg* arg) {
  const auto& arg_name = arg->Name();

  // Not an initializer at all: nothing more to check.
  const ONNX_NAMESPACE::TensorProto* tensor_proto = nullptr;
  const bool is_initializer = graph.GetInitializedTensor(arg_name, tensor_proto);
  if (!is_initializer) {
    return false;
  }

  // Element count is the product of all dimensions (1 for a scalar with no dims).
  int64_t element_count = 1;
  const auto& dims = tensor_proto->dims();
  for (auto dim_it = dims.begin(); dim_it != dims.end(); ++dim_it) {
    element_count *= *dim_it;
  }

  if (element_count > Small_Initializer_Threshold) {
    return false;
  }

  // The consumer lookup is only performed for small initializers.
  return graph.GetConsumerNodes(arg_name).size() == 1;
}
|
||||
} // namespace
|
||||
|
||||
/**
|
||||
Returns a list of nodes that are prefered on CPU.
|
||||
They are commonly shape-related computation subgraphs.
|
||||
@param graph Graph viewer
|
||||
@param provider_type The targe execution provider type
|
||||
@param kernel_registries Kernel registies for the target EP
|
||||
@param tentative_nodes Nodes that are tentative to be placed on on target EP
|
||||
*/
|
||||
std::unordered_set<NodeIndex> GetCpuPreferedNodes(const onnxruntime::GraphViewer& graph,
|
||||
const std::string& provider_type,
|
||||
const std::vector<const KernelRegistry*>& kernel_registries,
|
||||
const std::vector<NodeIndex>& tentative_nodes) {
|
||||
const std::vector<NodeIndex>& ordered_nodes = graph.GetNodesInTopologicalOrder();
|
||||
std::vector<size_t> node_id_to_order_map(graph.MaxNodeIndex());
|
||||
for (size_t id = 0; id < ordered_nodes.size(); ++id) {
|
||||
const NodeIndex& node_id = ordered_nodes[id];
|
||||
node_id_to_order_map[node_id] = id;
|
||||
}
|
||||
|
||||
// If return false, n1 will be output first; If return true, n2 will be output first
|
||||
auto greater_order_comp = [&](const NodeIndex n1, const NodeIndex n2) {
|
||||
return node_id_to_order_map[n1] > node_id_to_order_map[n2];
|
||||
};
|
||||
|
||||
std::priority_queue<NodeIndex, std::vector<NodeIndex>, decltype(greater_order_comp)> candidates(greater_order_comp);
|
||||
std::unordered_set<NodeIndex> visited;
|
||||
|
||||
std::unordered_set<const NodeArg*> cpu_output_args;
|
||||
std::unordered_set<NodeIndex> provider_nodes;
|
||||
std::unordered_map<NodeIndex, const KernelCreateInfo*> node_to_kernel;
|
||||
|
||||
for (auto& node_id : tentative_nodes) {
|
||||
provider_nodes.insert(node_id);
|
||||
const Node* node = graph.GetNode(node_id);
|
||||
|
||||
const KernelCreateInfo* kernel_info = nullptr;
|
||||
for (auto registry : kernel_registries) {
|
||||
auto st = registry->TryFindKernel(*node, provider_type, &kernel_info);
|
||||
if (st.IsOK())
|
||||
break;
|
||||
}
|
||||
// at least one registry has a target provider's kernel for this node
|
||||
ORT_ENFORCE(kernel_info != nullptr);
|
||||
node_to_kernel.insert({node_id, kernel_info});
|
||||
|
||||
// first, find all the direct consumer of cpu tensors.
|
||||
ORT_THROW_IF_ERROR(node->ForEachWithIndex(
|
||||
node->OutputDefs(),
|
||||
[&](const NodeArg& node_arg, size_t out_index) {
|
||||
if (kernel_info->kernel_def->IsOutputOnCpu(out_index)) {
|
||||
cpu_output_args.insert(&node_arg);
|
||||
auto consumer_nodes = graph.GetConsumerNodes(node_arg.Name());
|
||||
for (auto& consumer_node : consumer_nodes) {
|
||||
candidates.push(consumer_node->Index());
|
||||
LOGS_DEFAULT(INFO) << "Canditiate for fallback CPU execution: " << consumer_node->Name();
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}));
|
||||
}
|
||||
|
||||
const std::vector<const NodeArg*>& graph_inputs = graph.GetInputs();
|
||||
std::unordered_set<NodeIndex> cpu_nodes;
|
||||
// The algo below is trying to identity a subgraph that only depends on cpu tensors.
|
||||
// Usually it is a subgraph that doing shape calculation based on a GPU tensor, then reshape it back.
|
||||
// The detail:
|
||||
// for each candidate, if one of its input is a cpu tensor and the Non-CPU kernel doesn't mark it as cpu input,
|
||||
// force the node to CPU to avoid memory cpu and add its output to the small cpu tensors.
|
||||
while (!candidates.empty()) {
|
||||
NodeIndex cur = candidates.top();
|
||||
candidates.pop();
|
||||
if (visited.count(cur) != 0)
|
||||
continue;
|
||||
visited.insert(cur);
|
||||
|
||||
if (provider_nodes.find(cur) == provider_nodes.end())
|
||||
continue;
|
||||
|
||||
auto* node = graph.GetNode(cur);
|
||||
bool place_in_cpu = true;
|
||||
for (size_t i = 0; i < node->InputDefs().size(); ++i) {
|
||||
auto* input = node->InputDefs()[i];
|
||||
|
||||
// skip placing on CPU if the data typs is float16 or bfloat16
|
||||
if (input->Type() == DataTypeUtils::ToType("float16") ||
|
||||
input->Type() == DataTypeUtils::ToType("bfloat16")) {
|
||||
place_in_cpu = false;
|
||||
break;
|
||||
}
|
||||
|
||||
// allow placing on CPU if it's a small initializer or graph input
|
||||
if (IsSmallInitializerWithSingleConsumer(graph, input) ||
|
||||
std::find(graph_inputs.begin(), graph_inputs.end(), input) != graph_inputs.end()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// the input is not a CPU tensor
|
||||
if (cpu_output_args.find(input) == cpu_output_args.end()) {
|
||||
place_in_cpu = false;
|
||||
break;
|
||||
}
|
||||
|
||||
// input is a CPU tensor, but it's intended to be consumed as CPU input by the target EP
|
||||
if (node_to_kernel[cur]->kernel_def->IsInputOnCpu(i)) {
|
||||
place_in_cpu = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (place_in_cpu) {
|
||||
cpu_nodes.insert(cur);
|
||||
LOGS_DEFAULT(WARNING) << "Force fallback to CPU execution for node: " << node->Name();
|
||||
for (auto* output : node->OutputDefs()) {
|
||||
cpu_output_args.insert(output);
|
||||
}
|
||||
for (auto it = node->OutputNodesBegin(); it != node->OutputNodesEnd(); ++it) {
|
||||
candidates.push((*it).Index());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return cpu_nodes;
|
||||
}
|
||||
|
||||
} // namespace onnxruntime
|
||||
|
|
@ -7,6 +7,7 @@
|
|||
#include "cuda_allocator.h"
|
||||
#include "core/framework/kernel_registry.h"
|
||||
#include "core/framework/compute_capability.h"
|
||||
#include "core/framework/fallback_cpu_capability.h"
|
||||
#include "core/framework/memcpy.h"
|
||||
#include "core/graph/graph_utils.h"
|
||||
#include "core/providers/cuda/gpu_data_transfer.h"
|
||||
|
|
@ -1822,9 +1823,7 @@ std::unique_ptr<onnxruntime::IDataTransfer> CUDAExecutionProvider::GetDataTransf
|
|||
std::vector<std::unique_ptr<ComputeCapability>>
|
||||
CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
|
||||
const std::vector<const KernelRegistry*>& kernel_registries) const {
|
||||
std::vector<std::unique_ptr<ComputeCapability>> result;
|
||||
std::unordered_set<const NodeArg*> defs_outside_cuda;
|
||||
|
||||
std::vector<NodeIndex> candidates;
|
||||
for (auto& node_index : graph.GetNodesInTopologicalOrder()) {
|
||||
const auto* p_node = graph.GetNode(node_index);
|
||||
if (p_node == nullptr)
|
||||
|
|
@ -1833,7 +1832,6 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
|
|||
const auto& node = *p_node;
|
||||
const KernelCreateInfo* cuda_kernel_def = nullptr;
|
||||
if (!node.GetExecutionProviderType().empty()) {
|
||||
defs_outside_cuda.insert(node.OutputDefs().cbegin(), node.OutputDefs().cend());
|
||||
continue;
|
||||
}
|
||||
|
||||
|
|
@ -1847,14 +1845,10 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
|
|||
|
||||
// none of the provided registries has a CUDA kernel for this node
|
||||
if (cuda_kernel_def == nullptr) {
|
||||
// node is not in cuda exeuction provider if no kernel def found,
|
||||
// or if other execution provider already assigned to it
|
||||
defs_outside_cuda.insert(node.OutputDefs().cbegin(), node.OutputDefs().cend());
|
||||
continue;
|
||||
}
|
||||
|
||||
bool not_supported = false;
|
||||
bool force_outside = false;
|
||||
bool force_inside = false; // for some compute heavy ops, we'll force it to run inside CUDA
|
||||
if ("LSTM" == node.OpType()) {
|
||||
// the supported activations covers the bidirectional mode
|
||||
|
|
@ -1877,60 +1871,29 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
|
|||
// cast is not compute heavy, and may be placed outside
|
||||
}
|
||||
|
||||
//Below rule only works for inference, for training, we can't do constant folding.
|
||||
//We need find a better solution.
|
||||
//Temporary disable the check here, the cost is all the cast will be on GPU now.
|
||||
#ifndef ENABLE_TRAINING
|
||||
if (!not_supported && !force_inside) {
|
||||
// Note that nodes with only inputs from initializer would not be place on CUDA
|
||||
// Ideally, those nodes should be eliminated in constant folding
|
||||
bool should_force_outside = true;
|
||||
bool all_inputs_are_initializers = true;
|
||||
ORT_THROW_IF_ERROR(node.ForEachWithIndex(node.InputDefs(),
|
||||
[&](const NodeArg& def, size_t index) {
|
||||
// The input is not a initializer and the input is from CPU
|
||||
// or the input declared as CPU memory and is from CPU
|
||||
// in that case we should still keep the node on CUDA
|
||||
bool initializer_input = graph.IsConstantInitializer(def.Name(), /*check_outer_scope*/ true);
|
||||
bool input_is_on_cpu = defs_outside_cuda.count(&def) > 0;
|
||||
if ((!initializer_input && !input_is_on_cpu) ||
|
||||
(input_is_on_cpu && cuda_kernel_def->kernel_def->IsInputOnCpu(index))) {
|
||||
should_force_outside = false;
|
||||
}
|
||||
|
||||
if (!initializer_input) {
|
||||
all_inputs_are_initializers = false;
|
||||
}
|
||||
return Status::OK();
|
||||
}));
|
||||
|
||||
// If all the inputs are initializers, we shouldn't force it to CPU
|
||||
if (should_force_outside && !all_inputs_are_initializers) {
|
||||
force_outside = true;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (!force_inside && (not_supported || force_outside)) {
|
||||
defs_outside_cuda.insert(node.OutputDefs().cbegin(), node.OutputDefs().cend());
|
||||
if (!force_inside && not_supported) {
|
||||
if (not_supported) {
|
||||
LOGS_DEFAULT(WARNING) << "CUDA kernel not supported. Fallback to CPU execution provider for Op type: " << node.OpType() << " node name: " << node.Name();
|
||||
} else if (force_outside) {
|
||||
LOGS_DEFAULT(INFO) << "Force fallback to CPU execution provider for Op type: " << node.OpType() << " node name: " << node.Name();
|
||||
}
|
||||
} else {
|
||||
// for nodes placed on CUDA, check if its output is on CPU
|
||||
ORT_THROW_IF_ERROR(node.ForEachWithIndex(
|
||||
node.OutputDefs(),
|
||||
[&](const NodeArg& def, size_t out_index) {
|
||||
if (cuda_kernel_def->kernel_def->OutputMemoryType(out_index) != OrtMemTypeDefault)
|
||||
defs_outside_cuda.insert(&def);
|
||||
return Status::OK();
|
||||
}));
|
||||
std::unique_ptr<IndexedSubGraph> sub_graph = onnxruntime::make_unique<IndexedSubGraph>();
|
||||
sub_graph->nodes.push_back(node.Index());
|
||||
result.push_back(onnxruntime::make_unique<ComputeCapability>(std::move(sub_graph)));
|
||||
candidates.push_back(node.Index());
|
||||
}
|
||||
}
|
||||
|
||||
// For CUDA EP, exclude the subgraph that is preferred to be placed in CPU
|
||||
// These are usually shape related computation subgraphs
|
||||
// Following logic can be extended for other EPs
|
||||
std::unordered_set<NodeIndex> cpu_nodes = GetCpuPreferedNodes(graph, Type(), kernel_registries, candidates);
|
||||
|
||||
std::vector<std::unique_ptr<ComputeCapability>> result;
|
||||
for (auto& node_index : candidates) {
|
||||
if (cpu_nodes.count(node_index) > 0)
|
||||
continue;
|
||||
|
||||
std::unique_ptr<IndexedSubGraph> sub_graph = onnxruntime::make_unique<IndexedSubGraph>();
|
||||
sub_graph->nodes.push_back(node_index);
|
||||
result.push_back(onnxruntime::make_unique<ComputeCapability>(std::move(sub_graph)));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue