Place shape related compute nodes in CPU (#4940) (#5350)

* Place shape related nodes in CPU
* visit candidates by topological order
* Make CPU node placement a utility function
* skip placing on CPU if the data type is float16 or bfloat16

Co-authored-by: Sherlock <baihan.huang@gmail.com>
This commit is contained in:
ashbhandare 2020-10-02 11:11:12 -07:00 committed by GitHub
parent 5de47affb1
commit 38e1bbce72
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 184 additions and 56 deletions

View file

@ -125,6 +125,18 @@ class GraphViewer {
/** Get the Node containing this Graph if IsSubgraph is true. Returns nullptr otherwise.
    Forwards directly to ParentNode() on the wrapped graph_. */
const Node* ParentNode() const noexcept { return graph_->ParentNode(); }
#if !defined(ORT_MINIMAL_BUILD)
/** Get the consumer nodes of a node arg.
    Thin forwarder to GetConsumerNodes() on the wrapped graph_; declared only when
    ORT_MINIMAL_BUILD is not defined.
    @param node_arg_name Name of the node arg to look up.
    @return The consumer nodes reported by the wrapped graph_ for that name. */
std::vector<const Node*> GetConsumerNodes(const std::string& node_arg_name) const {
return graph_->GetConsumerNodes(node_arg_name);
}
/** Get the producer node of a node arg.
    Thin forwarder to GetProducerNode() on the wrapped graph_; declared only when
    ORT_MINIMAL_BUILD is not defined.
    @param node_arg_name Name of the node arg to look up.
    @return The producer node reported by the wrapped graph_ for that name.
    NOTE(review): presumably nullptr when the arg has no producer node - confirm against Graph. */
const Node* GetProducerNode(const std::string& node_arg_name) const {
return graph_->GetProducerNode(node_arg_name);
}
#endif
private:
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(GraphViewer);

View file

@ -0,0 +1,153 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/graph/graph_viewer.h"
#include "onnx/defs/data_type_utils.h"
#include <queue>
using namespace ONNX_NAMESPACE::Utils;
namespace onnxruntime {
namespace {
// Inclusive upper bound on the element count (product of all dims) for an
// initializer to be treated as "small" by IsSmallInitializerWithSingleConsumer.
const int64_t Small_Initializer_Threshold = 100;
// Returns true when `arg` names an initializer of `graph` whose element count
// (the product of its dims) does not exceed Small_Initializer_Threshold and
// which is consumed by exactly one node. Returns false when `arg` is not an
// initializer at all.
bool IsSmallInitializerWithSingleConsumer(const onnxruntime::GraphViewer& graph, const NodeArg* arg) {
  const ONNX_NAMESPACE::TensorProto* tensor_proto;
  const bool is_initializer = graph.GetInitializedTensor(arg->Name(), tensor_proto);
  if (!is_initializer) {
    return false;
  }

  // Multiply the dims together to get the number of elements.
  int64_t element_count = 1;
  for (auto& extent : tensor_proto->dims()) {
    element_count = element_count * extent;
  }
  if (element_count > Small_Initializer_Threshold) {
    return false;
  }

  // Small enough; it only qualifies if a single node reads it.
  return graph.GetConsumerNodes(arg->Name()).size() == 1;
}
} // namespace
/**
  Returns the set of nodes that are preferred on CPU.
  They are commonly shape-related computation subgraphs: nodes whose inputs are
  all CPU tensors (outputs that the target EP's kernels produce on CPU, outputs
  of nodes already moved to CPU, graph inputs, or small single-consumer
  initializers).
  @param graph Graph viewer
  @param provider_type The target execution provider type
  @param kernel_registries Kernel registries for the target EP
  @param tentative_nodes Nodes that are tentatively placed on the target EP
  @return Indices of the nodes in tentative_nodes that should fall back to CPU.
  Fails via ORT_ENFORCE if a tentative node does not exist in the graph or if no
  registry provides a kernel for it.
  NOTE(review): this is a non-inline function defined in a header; including the
  header from more than one translation unit will produce duplicate definitions.
  Consider moving the definition into a source file.
*/
std::unordered_set<NodeIndex> GetCpuPreferedNodes(const onnxruntime::GraphViewer& graph,
                                                  const std::string& provider_type,
                                                  const std::vector<const KernelRegistry*>& kernel_registries,
                                                  const std::vector<NodeIndex>& tentative_nodes) {
  // Map each node index to its position in the topological order so candidates
  // can be visited producers-first.
  // NOTE(review): assumes every node index is < graph.MaxNodeIndex() - confirm.
  const std::vector<NodeIndex>& ordered_nodes = graph.GetNodesInTopologicalOrder();
  std::vector<size_t> node_id_to_order_map(graph.MaxNodeIndex());
  for (size_t id = 0; id < ordered_nodes.size(); ++id) {
    const NodeIndex& node_id = ordered_nodes[id];
    node_id_to_order_map[node_id] = id;
  }

  // If return false, n1 will be output first; If return true, n2 will be output first
  auto greater_order_comp = [&](const NodeIndex n1, const NodeIndex n2) {
    return node_id_to_order_map[n1] > node_id_to_order_map[n2];
  };

  std::priority_queue<NodeIndex, std::vector<NodeIndex>, decltype(greater_order_comp)> candidates(greater_order_comp);
  std::unordered_set<NodeIndex> visited;
  std::unordered_set<const NodeArg*> cpu_output_args;
  std::unordered_set<NodeIndex> provider_nodes;
  std::unordered_map<NodeIndex, const KernelCreateInfo*> node_to_kernel;

  for (auto& node_id : tentative_nodes) {
    provider_nodes.insert(node_id);
    const Node* node = graph.GetNode(node_id);
    // Fail loudly instead of dereferencing a null node below.
    ORT_ENFORCE(node != nullptr, "Tentative node does not exist in the graph");

    const KernelCreateInfo* kernel_info = nullptr;
    for (auto registry : kernel_registries) {
      auto st = registry->TryFindKernel(*node, provider_type, &kernel_info);
      if (st.IsOK())
        break;
    }
    // at least one registry has a target provider's kernel for this node
    ORT_ENFORCE(kernel_info != nullptr);
    node_to_kernel.insert({node_id, kernel_info});

    // first, find all the direct consumers of cpu tensors.
    ORT_THROW_IF_ERROR(node->ForEachWithIndex(
        node->OutputDefs(),
        [&](const NodeArg& node_arg, size_t out_index) {
          if (kernel_info->kernel_def->IsOutputOnCpu(out_index)) {
            cpu_output_args.insert(&node_arg);
            const auto consumer_nodes = graph.GetConsumerNodes(node_arg.Name());
            for (auto& consumer_node : consumer_nodes) {
              candidates.push(consumer_node->Index());
              LOGS_DEFAULT(INFO) << "Canditiate for fallback CPU execution: " << consumer_node->Name();
            }
          }
          return Status::OK();
        }));
  }

  // Hash the graph inputs once so the membership test inside the loop is O(1)
  // rather than a linear scan per node input.
  const std::vector<const NodeArg*>& graph_inputs = graph.GetInputs();
  const std::unordered_set<const NodeArg*> graph_input_set(graph_inputs.begin(), graph_inputs.end());

  // Loop-invariant type handles, resolved once instead of per input.
  // NOTE(review): confirm that NodeArg::Type() yields exactly these handles for
  // the float16/bfloat16 inputs this check is meant to catch.
  const auto float16_type = DataTypeUtils::ToType("float16");
  const auto bfloat16_type = DataTypeUtils::ToType("bfloat16");

  std::unordered_set<NodeIndex> cpu_nodes;
  // The algorithm below tries to identify a subgraph that only depends on cpu tensors.
  // Usually it is a subgraph doing shape calculation based on a GPU tensor, then reshaping it back.
  // The detail:
  // for each candidate, if all of its inputs are cpu tensors and the Non-CPU kernel doesn't mark them as cpu input,
  // force the node to CPU to avoid a memory copy and add its outputs to the cpu tensors.
  while (!candidates.empty()) {
    NodeIndex cur = candidates.top();
    candidates.pop();

    // insert() reports false in .second when cur was already visited.
    if (!visited.insert(cur).second)
      continue;

    if (provider_nodes.find(cur) == provider_nodes.end())
      continue;

    // cur is a tentative node, so both the node and its kernel info were
    // validated in the first loop; look the kernel up without inserting.
    const Node* node = graph.GetNode(cur);
    const auto kernel_it = node_to_kernel.find(cur);
    ORT_ENFORCE(kernel_it != node_to_kernel.end());
    const KernelCreateInfo* cur_kernel_info = kernel_it->second;

    bool place_in_cpu = true;
    for (size_t i = 0; i < node->InputDefs().size(); ++i) {
      auto* input = node->InputDefs()[i];

      // skip placing on CPU if the data type is float16 or bfloat16
      if (input->Type() == float16_type || input->Type() == bfloat16_type) {
        place_in_cpu = false;
        break;
      }

      // allow placing on CPU if it's a graph input or a small initializer
      if (graph_input_set.count(input) != 0 ||
          IsSmallInitializerWithSingleConsumer(graph, input)) {
        continue;
      }

      // the input is not a CPU tensor
      if (cpu_output_args.find(input) == cpu_output_args.end()) {
        place_in_cpu = false;
        break;
      }

      // input is a CPU tensor, but it's intended to be consumed as CPU input by the target EP
      if (cur_kernel_info->kernel_def->IsInputOnCpu(i)) {
        place_in_cpu = false;
        break;
      }
    }

    if (place_in_cpu) {
      cpu_nodes.insert(cur);
      LOGS_DEFAULT(WARNING) << "Force fallback to CPU execution for node: " << node->Name();
      // Outputs of a CPU node are CPU tensors; its consumers become new candidates.
      for (auto* output : node->OutputDefs()) {
        cpu_output_args.insert(output);
      }
      for (auto it = node->OutputNodesBegin(); it != node->OutputNodesEnd(); ++it) {
        candidates.push((*it).Index());
      }
    }
  }

  return cpu_nodes;
}
} // namespace onnxruntime

View file

@ -7,6 +7,7 @@
#include "cuda_allocator.h"
#include "core/framework/kernel_registry.h"
#include "core/framework/compute_capability.h"
#include "core/framework/fallback_cpu_capability.h"
#include "core/framework/memcpy.h"
#include "core/graph/graph_utils.h"
#include "core/providers/cuda/gpu_data_transfer.h"
@ -1822,9 +1823,7 @@ std::unique_ptr<onnxruntime::IDataTransfer> CUDAExecutionProvider::GetDataTransf
std::vector<std::unique_ptr<ComputeCapability>>
CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
const std::vector<const KernelRegistry*>& kernel_registries) const {
std::vector<std::unique_ptr<ComputeCapability>> result;
std::unordered_set<const NodeArg*> defs_outside_cuda;
std::vector<NodeIndex> candidates;
for (auto& node_index : graph.GetNodesInTopologicalOrder()) {
const auto* p_node = graph.GetNode(node_index);
if (p_node == nullptr)
@ -1833,7 +1832,6 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
const auto& node = *p_node;
const KernelCreateInfo* cuda_kernel_def = nullptr;
if (!node.GetExecutionProviderType().empty()) {
defs_outside_cuda.insert(node.OutputDefs().cbegin(), node.OutputDefs().cend());
continue;
}
@ -1847,14 +1845,10 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
// none of the provided registries has a CUDA kernel for this node
if (cuda_kernel_def == nullptr) {
// node is not in cuda execution provider if no kernel def found,
// or if other execution provider already assigned to it
defs_outside_cuda.insert(node.OutputDefs().cbegin(), node.OutputDefs().cend());
continue;
}
bool not_supported = false;
bool force_outside = false;
bool force_inside = false; // for some compute heavy ops, we'll force it to run inside CUDA
if ("LSTM" == node.OpType()) {
// the supported activations covers the bidirectional mode
@ -1877,60 +1871,29 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
// cast is not compute heavy, and may be placed outside
}
//Below rule only works for inference, for training, we can't do constant folding.
//We need find a better solution.
//Temporary disable the check here, the cost is all the cast will be on GPU now.
#ifndef ENABLE_TRAINING
if (!not_supported && !force_inside) {
// Note that nodes with only inputs from initializer would not be place on CUDA
// Ideally, those nodes should be eliminated in constant folding
bool should_force_outside = true;
bool all_inputs_are_initializers = true;
ORT_THROW_IF_ERROR(node.ForEachWithIndex(node.InputDefs(),
[&](const NodeArg& def, size_t index) {
// The input is not a initializer and the input is from CPU
// or the input declared as CPU memory and is from CPU
// in that case we should still keep the node on CUDA
bool initializer_input = graph.IsConstantInitializer(def.Name(), /*check_outer_scope*/ true);
bool input_is_on_cpu = defs_outside_cuda.count(&def) > 0;
if ((!initializer_input && !input_is_on_cpu) ||
(input_is_on_cpu && cuda_kernel_def->kernel_def->IsInputOnCpu(index))) {
should_force_outside = false;
}
if (!initializer_input) {
all_inputs_are_initializers = false;
}
return Status::OK();
}));
// If all the inputs are initializers, we shouldn't force it to CPU
if (should_force_outside && !all_inputs_are_initializers) {
force_outside = true;
}
}
#endif
if (!force_inside && (not_supported || force_outside)) {
defs_outside_cuda.insert(node.OutputDefs().cbegin(), node.OutputDefs().cend());
if (!force_inside && not_supported) {
if (not_supported) {
LOGS_DEFAULT(WARNING) << "CUDA kernel not supported. Fallback to CPU execution provider for Op type: " << node.OpType() << " node name: " << node.Name();
} else if (force_outside) {
LOGS_DEFAULT(INFO) << "Force fallback to CPU execution provider for Op type: " << node.OpType() << " node name: " << node.Name();
}
} else {
// for nodes placed on CUDA, check if its output is on CPU
ORT_THROW_IF_ERROR(node.ForEachWithIndex(
node.OutputDefs(),
[&](const NodeArg& def, size_t out_index) {
if (cuda_kernel_def->kernel_def->OutputMemoryType(out_index) != OrtMemTypeDefault)
defs_outside_cuda.insert(&def);
return Status::OK();
}));
std::unique_ptr<IndexedSubGraph> sub_graph = onnxruntime::make_unique<IndexedSubGraph>();
sub_graph->nodes.push_back(node.Index());
result.push_back(onnxruntime::make_unique<ComputeCapability>(std::move(sub_graph)));
candidates.push_back(node.Index());
}
}
// For CUDA EP, exclude the subgraph that is preferred to be placed in CPU
// These are usually shape related computation subgraphs
// Following logic can be extended for other EPs
std::unordered_set<NodeIndex> cpu_nodes = GetCpuPreferedNodes(graph, Type(), kernel_registries, candidates);
std::vector<std::unique_ptr<ComputeCapability>> result;
for (auto& node_index : candidates) {
if (cpu_nodes.count(node_index) > 0)
continue;
std::unique_ptr<IndexedSubGraph> sub_graph = onnxruntime::make_unique<IndexedSubGraph>();
sub_graph->nodes.push_back(node_index);
result.push_back(onnxruntime::make_unique<ComputeCapability>(std::move(sub_graph)));
}
return result;
}