This commit is contained in:
Hariharan Seshadri 2021-05-13 14:11:17 -07:00 committed by GitHub
parent 1ab8a95eb6
commit 7bb3f243ff
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 12 additions and 393 deletions

View file

@ -7,10 +7,7 @@
#include "onnx/defs/data_type_utils.h"
#include "core/framework/execution_providers.h"
#include "core/framework/kernel_registry_manager.h"
#include "core/framework/op_kernel.h"
#include "core/providers/cpu/cpu_execution_provider.h"
using namespace ONNX_NAMESPACE::Utils;
@ -48,43 +45,18 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
return node_id_to_order_map[n1] > node_id_to_order_map[n2];
};
// If return false, n2 will be output first; If return true, n1 will be output first
auto lesser_order_comp = [&](const NodeIndex n1, const NodeIndex n2) {
return node_id_to_order_map[n1] < node_id_to_order_map[n2];
};
std::priority_queue<NodeIndex, std::vector<NodeIndex>, decltype(greater_order_comp)> candidates_fw(greater_order_comp);
std::priority_queue<NodeIndex, std::vector<NodeIndex>, decltype(lesser_order_comp)> candidates_bw(lesser_order_comp);
std::priority_queue<NodeIndex, std::vector<NodeIndex>, decltype(greater_order_comp)> candidates(greater_order_comp);
std::unordered_set<NodeIndex> visited;
std::unordered_set<const NodeArg*> cpu_args;
std::unordered_set<const NodeArg*> cpu_output_args;
std::unordered_set<NodeIndex> provider_nodes;
std::unordered_map<NodeIndex, const KernelCreateInfo*> node_to_kernel;
std::unordered_set<NodeIndex> cpu_kernel_available;
// create a temp CPU kernel registry
KernelRegistryManager mgr;
ExecutionProviders cpu_ep;
CPUExecutionProviderInfo epi{false};
ORT_ENFORCE(cpu_ep.Add(kCpuExecutionProvider, std::make_unique<CPUExecutionProvider>(epi)).IsOK());
ORT_ENFORCE(mgr.RegisterKernels(cpu_ep).IsOK());
std::vector<const KernelRegistry*> cpu_kernel_registries = mgr.GetKernelRegistriesByProviderType(kCpuExecutionProvider);
for (auto& node_id : tentative_nodes) {
provider_nodes.insert(node_id);
const Node* node = graph.GetNode(node_id);
const KernelCreateInfo* kernel_info = nullptr;
// Get the CPU kernel availability for this node
for (auto registry : cpu_kernel_registries) {
auto st = registry->TryFindKernel(*node, kCpuExecutionProvider, &kernel_info);
if (st.IsOK()) {
cpu_kernel_available.insert(node_id);
break;
}
}
for (auto registry : kernel_registries) {
auto st = registry->TryFindKernel(*node, provider_type, &kernel_info);
if (st.IsOK())
@ -99,26 +71,11 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
node->OutputDefs(),
[&](const NodeArg& node_arg, size_t out_index) {
if (kernel_info->kernel_def->IsOutputOnCpu(out_index)) {
cpu_args.insert(&node_arg);
cpu_output_args.insert(&node_arg);
auto consumer_nodes = graph.GetConsumerNodes(node_arg.Name());
for (auto& consumer_node : consumer_nodes) {
candidates_fw.push(consumer_node->Index());
LOGS_DEFAULT(INFO) << "Candidate for fallback CPU execution in forward trace: " << consumer_node->Name();
}
}
return Status::OK();
}));
// then, find all the direct producers of cpu tensors.
ORT_THROW_IF_ERROR(node->ForEachWithIndex(
node->InputDefs(),
[&](const NodeArg& node_arg, size_t in_index) {
if (kernel_info->kernel_def->IsInputOnCpu(in_index)) {
cpu_args.insert(&node_arg);
auto producer_node = graph.GetProducerNode(node_arg.Name());
if (producer_node != nullptr) {
candidates_bw.push(producer_node->Index());
LOGS_DEFAULT(INFO) << "Candidate for fallback CPU execution in backward trace: " << producer_node->Name();
candidates.push(consumer_node->Index());
LOGS_DEFAULT(INFO) << "Candidate for fallback CPU execution: " << consumer_node->Name();
}
}
return Status::OK();
@ -132,9 +89,9 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
// The detail:
// for each candidate, if one of its input is a cpu tensor and the Non-CPU kernel doesn't mark it as cpu input,
// force the node to CPU to avoid memory copy and add its output to the small cpu tensors.
while (!candidates_fw.empty()) {
NodeIndex cur = candidates_fw.top();
candidates_fw.pop();
while (!candidates.empty()) {
NodeIndex cur = candidates.top();
candidates.pop();
if (visited.count(cur) != 0)
continue;
visited.insert(cur);
@ -161,7 +118,7 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
}
// the input is not a CPU tensor
if (cpu_args.find(input) == cpu_args.end()) {
if (cpu_output_args.find(input) == cpu_output_args.end()) {
place_in_cpu = false;
break;
}
@ -173,90 +130,16 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
}
}
if (place_in_cpu && cpu_kernel_available.count(cur) != 0) {
if (place_in_cpu) {
cpu_nodes.insert(cur);
LOGS_DEFAULT(INFO) << "ORT optimization- Force fallback to CPU execution for node: " << node->Name()
<< " because the CPU execution path is deemed faster than overhead involved with execution on other EPs "
<< " capable of executing this node";
for (auto* output : node->OutputDefs()) {
cpu_args.insert(output);
cpu_output_args.insert(output);
}
for (auto it = node->OutputNodesBegin(); it != node->OutputNodesEnd(); ++it) {
candidates_fw.push((*it).Index());
}
}
}
// clear the visited to prepare for backward trace
visited.clear();
// Trace the graph backwards to find additional CPU nodes
// Starting from nodes that must produce an output on CPU, trace the producer nodes
// The trace stops when we find that
// 1) The node is already picked for CPU
// 2) Input/Output type is unsupported on CPU(float16/bfloat16)
// 3) The output is not a CPU tensor
// 4) The search hits a node that produces a CPU output
while (!candidates_bw.empty()) {
NodeIndex cur = candidates_bw.top();
candidates_bw.pop();
if (visited.count(cur) != 0)
continue;
visited.insert(cur);
// node is already picked for CPU
if (cpu_nodes.count(cur) != 0)
continue;
if (provider_nodes.find(cur) == provider_nodes.end())
continue;
auto* node = graph.GetNode(cur);
bool place_in_cpu = true;
for (size_t i = 0; i < node->OutputDefs().size(); ++i) {
auto* output = node->OutputDefs()[i];
// skip placing on CPU if the data type is float16 or bfloat16
if (output->Type() == DataTypeUtils::ToType("float16") ||
output->Type() == DataTypeUtils::ToType("bfloat16")) {
place_in_cpu = false;
break;
}
// the output is not a CPU tensor
if (cpu_args.find(output) == cpu_args.end()) {
place_in_cpu = false;
break;
}
// output is a CPU tensor, but it's intended to be consumed as CPU output by the target EP
if (node_to_kernel[cur]->kernel_def->IsOutputOnCpu(i)) {
place_in_cpu = false;
break;
}
}
// Next, check if the node inputs are of supported type
if (place_in_cpu) {
for (size_t i = 0; i < node->InputDefs().size(); ++i) {
auto* input = node->InputDefs()[i];
// skip placing on CPU if the data type is float16 or bfloat16
if (input->Type() == DataTypeUtils::ToType("float16") ||
input->Type() == DataTypeUtils::ToType("bfloat16")) {
place_in_cpu = false;
break;
}
}
}
if (place_in_cpu && cpu_kernel_available.count(cur) != 0) {
cpu_nodes.insert(cur);
LOGS_DEFAULT(INFO) << "ORT optimization- Force fallback to CPU execution for node: " << node->Name()
<< " because the CPU execution path is deemed faster than overhead involved with execution on other EPs "
<< " capable of executing this node";
for (auto* input : node->InputDefs()) {
cpu_args.insert(input);
}
for (auto it = node->InputNodesBegin(); it != node->InputNodesEnd(); ++it) {
candidates_bw.push((*it).Index());
candidates.push((*it).Index());
}
}
}

View file

@ -20,12 +20,6 @@
#include "gtest/gtest.h"
#include "test/test_environment.h"
#ifdef USE_CUDA
#include "core/providers/cuda/cuda_execution_provider.h"
#elif USE_ROCM
#include "core/providers/rocm/rocm_execution_provider.h"
#endif
using namespace ONNX_NAMESPACE;
using namespace std;
namespace onnxruntime {
@ -179,94 +173,6 @@ TEST_P(SessionStateTestP, TestInitializerProcessing) {
}
}
#if defined(USE_CUDA) || defined(USE_ROCM)
// Test helper: loads the model at `model_uri`, partitions its main graph across a GPU execution
// provider (CUDA or ROCm, selected at compile time) and the CPU execution provider, and then checks
// the execution provider assigned to every node.
//
// model_uri          - path of the ONNX model to load
// expected_cpu_nodes - names of nodes that are allowed to be assigned to the CPU EP
// expected_gpu_nodes - names of nodes that are allowed to be assigned to the CUDA/ROCm EP
//
// Note: the check is one-directional. Every node in the graph must appear in the expected set that
// matches its assigned EP, but the helper does not verify that every expected name is actually
// present in the graph.
static void TestCPUNodePlacement(const std::basic_string<ORTCHAR_T>& model_uri,
const std::unordered_set<std::string>& expected_cpu_nodes,
const std::unordered_set<std::string>& expected_gpu_nodes) {
// Load the model and grab its main graph; the graph is mutated by the partitioner below
std::shared_ptr<Model> model;
ASSERT_STATUS_OK(Model::Load(model_uri, model, nullptr, DefaultLoggingManager().DefaultLogger()));
Graph& graph = model->MainGraph();
// Register the GPU EP first, then the CPU EP
ExecutionProviders execution_providers;
#if defined(USE_CUDA)
CUDAExecutionProviderInfo cuda_epi;
ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, std::make_unique<CUDAExecutionProvider>(cuda_epi)));
#elif defined(USE_ROCM)
ROCMExecutionProviderInfo rocm_epi;
ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kRocmExecutionProvider, std::make_unique<ROCMExecutionProvider>(rocm_epi)));
#endif
// add CPU EP
CPUExecutionProviderInfo epi;
ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique<CPUExecutionProvider>(epi)));
// Kernel registries for all the EPs registered above
KernelRegistryManager krm;
ASSERT_STATUS_OK(krm.RegisterKernels(execution_providers));
// The session state supplies the func manager (and ExportDll() value) that Partition() takes below
DataTransferManager dtm;
profiling::Profiler profiler;
SessionState session_state(graph, execution_providers, false, nullptr, nullptr, dtm,
DefaultLoggingManager().DefaultLogger(), profiler);
// Partition the graph. Here, the graph partitioner assigns EPs to the nodes
GraphPartitioner partitioner(krm, execution_providers);
ASSERT_STATUS_OK(partitioner.Partition(graph, session_state.ExportDll(), session_state.GetMutableFuncMgr()));
// check which nodes are assigned to CPU and GPU
for (auto& node : graph.Nodes()) {
// assert that EP is assigned
ASSERT_TRUE(!node.GetExecutionProviderType().empty());
auto& ep = node.GetExecutionProviderType();
// CUDA and ROCm are both treated as "GPU" here, so the same expectations serve either build
if (ep == onnxruntime::kCudaExecutionProvider || ep == onnxruntime::kRocmExecutionProvider) {
ASSERT_TRUE(expected_gpu_nodes.count(node.Name())) << "Node not found in expected gpu nodes: " << node.Name();
} else if (ep == onnxruntime::kCpuExecutionProvider) {
ASSERT_TRUE(expected_cpu_nodes.count(node.Name())) << "Node not found in expected cpu nodes: " << node.Name();
} else {
// Any other EP is unexpected, since only the GPU and CPU EPs were registered
ASSERT_TRUE(false) << "Invalid execution provider assigned to node: " << node.Name() << " , value: " << ep;
}
}
}
// Fallback pattern 0: "reshape", "shape1", "const1", "mul", "equal" and "where" are expected on the
// CPU EP, while "shape0" and "expand" are expected on the GPU EP.
TEST(SessionStateTest, CPUPlacementTest0) {
  const std::unordered_set<std::string> cpu_names = {"reshape", "shape1", "const1", "mul", "equal", "where"};
  const std::unordered_set<std::string> gpu_names = {"shape0", "expand"};
  TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_0.onnx"), cpu_names, gpu_names);
}
// Fallback pattern 1: only "const1" is expected on the CPU EP; "shape0" and "expand" are expected
// on the GPU EP.
TEST(SessionStateTest, CPUPlacementTest1) {
  const std::unordered_set<std::string> cpu_names = {"const1"};
  const std::unordered_set<std::string> gpu_names = {"shape0", "expand"};
  TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_1.onnx"), cpu_names, gpu_names);
}
// Fallback pattern 2: only "range" is expected on the CPU EP; "size0" and "reduce" are expected
// on the GPU EP.
TEST(SessionStateTest, CPUPlacementTest2) {
  const std::unordered_set<std::string> cpu_names = {"range"};
  const std::unordered_set<std::string> gpu_names = {"size0", "reduce"};
  TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_2.onnx"), cpu_names, gpu_names);
}
// Fallback pattern 3: "range0" and "range1" are expected on the CPU EP; all other nodes
// ("size0", "reduce0", "identity", "size1", "reduce1", "sum") are expected on the GPU EP.
TEST(SessionStateTest, CPUPlacementTest3) {
  const std::unordered_set<std::string> cpu_names = {"range0", "range1"};
  const std::unordered_set<std::string> gpu_names = {"size0", "reduce0", "identity", "size1", "reduce1", "sum"};
  TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_3.onnx"), cpu_names, gpu_names);
}
// Fallback pattern 4: the expected placement depends on the GPU EP in use.
// The ROCm EP currently lacks a valid ReduceSum kernel for the int64 type, which makes the backward
// trace in GetCpuPreferredNodes stop earlier than it does for CUDA. Once the ROCm kernel is updated,
// the ROCm expectations can be changed to match the CUDA ones.
TEST(SessionStateTest, CPUPlacementTest4) {
#if defined(USE_CUDA)
  const std::unordered_set<std::string> cpu_names = {"range", "reduce", "const1"};
  const std::unordered_set<std::string> gpu_names = {"size0", "expand"};
#elif defined(USE_ROCM)
  const std::unordered_set<std::string> cpu_names = {"const1", "reduce"};
  const std::unordered_set<std::string> gpu_names = {"size0", "expand", "range"};
#endif
  TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_4.onnx"), cpu_names, gpu_names);
}
// Fallback pattern 5: "gather0", "gather1" and "concat" are expected on the CPU EP;
// "shape0", "shape1" and "reshape" are expected on the GPU EP.
TEST(SessionStateTest, CPUPlacementTest5) {
  const std::unordered_set<std::string> cpu_names = {"gather0", "gather1", "concat"};
  const std::unordered_set<std::string> gpu_names = {"shape0", "shape1", "reshape"};
  TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_5.onnx"), cpu_names, gpu_names);
}
#endif
// Test that we allocate memory for an initializer from non-arena memory even if we provide an arena-based allocator
// if the relevant session option config flag is set
// For this test we need to enable the arena-based allocator which is not supported on x86 builds, so

View file

@ -1,170 +0,0 @@
import onnx
from onnx import helper
from onnx import TensorProto
from onnx import shape_inference
import numpy as np
# Pattern 0: Shape(A) -> Reshape -> Shape -> ConstantOfShape, whose result feeds Mul, Equal and
# Where; the Where output is the shape input of the final Expand(B, ...).
# Saved as cpu_fallback_pattern_0.onnx (opset 12).
graph_def_0 = helper.make_graph(
    name="test-model",
    nodes=[
        helper.make_node("Shape", ["A"], ["A_shape"], name="shape0"),
        helper.make_node("Reshape", ["A_shape", "shape"], ["A_reshaped"], name="reshape"),
        helper.make_node("Shape", ["A_reshaped"], ["A_shape1"], name="shape1"),
        helper.make_node(
            "ConstantOfShape",
            ["A_shape1"],
            ["const1"],
            name="const1",
            value=helper.make_tensor("val", TensorProto.INT64, [1], [1]),
        ),
        helper.make_node("Mul", ["const1", "neg_one"], ["mul"], name="mul"),
        helper.make_node("Equal", ["A_reshaped", "mul"], ["equal"], name="equal"),
        helper.make_node("Where", ["equal", "const1", "A_reshaped"], ["where"], name="where"),
        helper.make_node("Expand", ["B", "where"], ["C"], name="expand"),
    ],
    inputs=[
        # graph inputs are declared with shape=None, i.e. without fixed dims
        helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
        helper.make_tensor_value_info("B", TensorProto.FLOAT, None),
    ],
    outputs=[
        helper.make_tensor_value_info("C", TensorProto.FLOAT, None),
    ],
    initializer=[
        helper.make_tensor("shape", TensorProto.INT64, [1], [-1]),
        helper.make_tensor("neg_one", TensorProto.INT64, [1], [-1]),
    ],
)
model = helper.make_model(
    graph_def_0,
    opset_imports=[helper.make_operatorsetid("", 12)],
)
onnx.save_model(model, "cpu_fallback_pattern_0.onnx")
# Pattern 1: Shape(A) -> ConstantOfShape -> shape input of Expand(B, ...).
# Saved as cpu_fallback_pattern_1.onnx (opset 12).
graph_def_1 = helper.make_graph(
    name="test-model",
    nodes=[
        helper.make_node("Shape", ["A"], ["A_shape"], name="shape0"),
        helper.make_node(
            "ConstantOfShape",
            ["A_shape"],
            ["const1"],
            name="const1",
            value=helper.make_tensor("val", TensorProto.INT64, [1], [1]),
        ),
        helper.make_node("Expand", ["B", "const1"], ["C"], name="expand"),
    ],
    inputs=[
        # graph inputs are declared with shape=None, i.e. without fixed dims
        helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
        helper.make_tensor_value_info("B", TensorProto.FLOAT, None),
    ],
    outputs=[
        helper.make_tensor_value_info("C", TensorProto.FLOAT, None),
    ],
    initializer=[],
)
model = helper.make_model(
    graph_def_1,
    opset_imports=[helper.make_operatorsetid("", 12)],
)
onnx.save_model(model, "cpu_fallback_pattern_1.onnx")
# Pattern 2: Size(A) -> Range(zero, size, two) -> second input of ReduceSum(B, ...).
# Saved as cpu_fallback_pattern_2.onnx (opset 13).
graph_def_2 = helper.make_graph(
    name="test-model",
    nodes=[
        helper.make_node("Size", ["A"], ["A_size"], name="size0"),
        helper.make_node("Range", ["zero", "A_size", "two"], ["range"], name="range"),
        helper.make_node("ReduceSum", ["B", "range"], ["C"], name="reduce"),
    ],
    inputs=[
        # graph inputs are declared with shape=None, i.e. without fixed dims
        helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
        helper.make_tensor_value_info("B", TensorProto.FLOAT, None),
    ],
    outputs=[
        helper.make_tensor_value_info("C", TensorProto.FLOAT, None),
    ],
    initializer=[
        helper.make_tensor("zero", TensorProto.INT64, [], [0]),
        helper.make_tensor("two", TensorProto.INT64, [], [2]),
    ],
)
model = helper.make_model(
    graph_def_2,
    opset_imports=[helper.make_operatorsetid("", 13)],
)
onnx.save_model(model, "cpu_fallback_pattern_2.onnx")
# Pattern 3: two chained Size -> Range -> ReduceSum stages. The second stage takes its Size from an
# Identity copy of the first ReduceSum output, and both ReduceSum results are added by Sum.
# Saved as cpu_fallback_pattern_3.onnx (opset 13).
graph_def_3 = helper.make_graph(
    name="test-model",
    nodes=[
        helper.make_node("Size", ["A"], ["size0"], name="size0"),
        helper.make_node("Range", ["zero", "size0", "two"], ["range0"], name="range0"),
        helper.make_node("ReduceSum", ["B", "range0"], ["reduce0"], name="reduce0"),
        helper.make_node("Identity", ["reduce0"], ["reduce0_cpy"], name="identity"),
        helper.make_node("Size", ["reduce0_cpy"], ["size1"], name="size1"),
        helper.make_node("Range", ["zero", "size1", "two"], ["range1"], name="range1"),
        helper.make_node("ReduceSum", ["B", "range1"], ["reduce1"], name="reduce1"),
        helper.make_node("Sum", ["reduce0", "reduce1"], ["C"], name="sum"),
    ],
    inputs=[
        # graph inputs are declared with shape=None, i.e. without fixed dims
        helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
        helper.make_tensor_value_info("B", TensorProto.FLOAT, None),
    ],
    outputs=[
        helper.make_tensor_value_info("C", TensorProto.FLOAT, None),
    ],
    initializer=[
        helper.make_tensor("zero", TensorProto.INT64, [], [0]),
        helper.make_tensor("two", TensorProto.INT64, [], [2]),
    ],
)
model = helper.make_model(
    graph_def_3,
    opset_imports=[helper.make_operatorsetid("", 13)],
)
onnx.save_model(model, "cpu_fallback_pattern_3.onnx")
# Pattern 4: Size(A) -> Range -> ReduceSum(B, ...) -> ConstantOfShape -> shape input of
# Expand(C, ...). Unlike the other patterns, graph input B is declared as INT64.
# Saved as cpu_fallback_pattern_4.onnx (opset 13).
graph_def_4 = helper.make_graph(
    name="test-model",
    nodes=[
        helper.make_node("Size", ["A"], ["A_size"], name="size0"),
        helper.make_node("Range", ["zero", "A_size", "two"], ["range"], name="range"),
        helper.make_node("ReduceSum", ["B", "range"], ["reduce"], name="reduce"),
        helper.make_node(
            "ConstantOfShape",
            ["reduce"],
            ["const1"],
            name="const1",
            value=helper.make_tensor("val", TensorProto.INT64, [1], [1]),
        ),
        helper.make_node("Expand", ["C", "const1"], ["D"], name="expand"),
    ],
    inputs=[
        # graph inputs are declared with shape=None, i.e. without fixed dims
        helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
        helper.make_tensor_value_info("B", TensorProto.INT64, None),
        helper.make_tensor_value_info("C", TensorProto.FLOAT, None),
    ],
    outputs=[
        helper.make_tensor_value_info("D", TensorProto.FLOAT, None),
    ],
    initializer=[
        helper.make_tensor("zero", TensorProto.INT64, [], [0]),
        helper.make_tensor("two", TensorProto.INT64, [], [2]),
    ],
)
model = helper.make_model(
    graph_def_4,
    opset_imports=[helper.make_operatorsetid("", 13)],
)
onnx.save_model(model, "cpu_fallback_pattern_4.onnx")
# Pattern 5: two Shape -> Gather branches (from A and from B) are joined by Concat, and the result
# is the shape input of Reshape(C, ...).
# Saved as cpu_fallback_pattern_5.onnx (opset 13).
# NOTE(review): "concat" is listed before "shape1"/"gather1", which produce its "seq_len" input, so
# the node list is not in producer-before-consumer order. The order is kept as-is so the generated
# model stays the same - confirm whether this is intentional.
graph_def_5 = helper.make_graph(
    name="test-model",
    nodes=[
        helper.make_node("Shape", ["A"], ["A_shape"], name="shape0"),
        helper.make_node("Gather", ["A_shape", "zero"], ["batch"], name="gather0"),
        helper.make_node("Concat", ["batch", "seq_len"], ["shape"], name="concat", axis=0),
        helper.make_node("Shape", ["B"], ["B_shape"], name="shape1"),
        helper.make_node("Gather", ["B_shape", "one"], ["seq_len"], name="gather1"),
        helper.make_node("Reshape", ["C", "shape"], ["D"], name="reshape"),
    ],
    inputs=[
        # graph inputs are declared with shape=None, i.e. without fixed dims
        helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
        helper.make_tensor_value_info("B", TensorProto.INT64, None),
        helper.make_tensor_value_info("C", TensorProto.FLOAT, None),
    ],
    outputs=[
        helper.make_tensor_value_info("D", TensorProto.FLOAT, None),
    ],
    initializer=[
        helper.make_tensor("zero", TensorProto.INT64, [1], [0]),
        helper.make_tensor("one", TensorProto.INT64, [1], [1]),
    ],
)
model = helper.make_model(
    graph_def_5,
    opset_imports=[helper.make_operatorsetid("", 13)],
)
onnx.save_model(model, "cpu_fallback_pattern_5.onnx")