Merge branch 'master' of https://github.com/microsoft/onnxruntime into ryanunderhill/cuda_shared

This commit is contained in:
Ryan Hill 2021-05-06 17:00:59 -07:00
commit af3824ce25
14 changed files with 447 additions and 62 deletions

View file

@ -13,7 +13,7 @@ enum class Severity {
kINFO = 1,
kWARNING = 2,
kERROR = 3,
kFATAL = 4,
kFATAL = 4
};
constexpr const char* SEVERITY_PREFIX = "VIWEF";

View file

@ -7,7 +7,10 @@
#include "onnx/defs/data_type_utils.h"
#include "core/framework/execution_providers.h"
#include "core/framework/kernel_registry_manager.h"
#include "core/framework/op_kernel.h"
#include "core/providers/cpu/cpu_execution_provider.h"
using namespace ONNX_NAMESPACE::Utils;
@ -45,18 +48,43 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
return node_id_to_order_map[n1] > node_id_to_order_map[n2];
};
std::priority_queue<NodeIndex, std::vector<NodeIndex>, decltype(greater_order_comp)> candidates(greater_order_comp);
// If return false, n2 will be output first; If return true, n1 will be output first
auto lesser_order_comp = [&](const NodeIndex n1, const NodeIndex n2) {
return node_id_to_order_map[n1] < node_id_to_order_map[n2];
};
std::priority_queue<NodeIndex, std::vector<NodeIndex>, decltype(greater_order_comp)> candidates_fw(greater_order_comp);
std::priority_queue<NodeIndex, std::vector<NodeIndex>, decltype(lesser_order_comp)> candidates_bw(lesser_order_comp);
std::unordered_set<NodeIndex> visited;
std::unordered_set<const NodeArg*> cpu_output_args;
std::unordered_set<const NodeArg*> cpu_args;
std::unordered_set<NodeIndex> provider_nodes;
std::unordered_map<NodeIndex, const KernelCreateInfo*> node_to_kernel;
std::unordered_set<NodeIndex> cpu_kernel_available;
// create a temp CPU kernel registry
KernelRegistryManager mgr;
ExecutionProviders cpu_ep;
CPUExecutionProviderInfo epi{false};
ORT_ENFORCE(cpu_ep.Add(kCpuExecutionProvider, std::make_unique<CPUExecutionProvider>(epi)).IsOK());
ORT_ENFORCE(mgr.RegisterKernels(cpu_ep).IsOK());
std::vector<const KernelRegistry*> cpu_kernel_registries = mgr.GetKernelRegistriesByProviderType(kCpuExecutionProvider);
for (auto& node_id : tentative_nodes) {
provider_nodes.insert(node_id);
const Node* node = graph.GetNode(node_id);
const KernelCreateInfo* kernel_info = nullptr;
// Get the CPU kernel availability for this node
for (auto registry : cpu_kernel_registries) {
auto st = registry->TryFindKernel(*node, kCpuExecutionProvider, &kernel_info);
if (st.IsOK()) {
cpu_kernel_available.insert(node_id);
break;
}
}
for (auto registry : kernel_registries) {
auto st = registry->TryFindKernel(*node, provider_type, &kernel_info);
if (st.IsOK())
@ -71,11 +99,26 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
node->OutputDefs(),
[&](const NodeArg& node_arg, size_t out_index) {
if (kernel_info->kernel_def->IsOutputOnCpu(out_index)) {
cpu_output_args.insert(&node_arg);
cpu_args.insert(&node_arg);
auto consumer_nodes = graph.GetConsumerNodes(node_arg.Name());
for (auto& consumer_node : consumer_nodes) {
candidates.push(consumer_node->Index());
LOGS_DEFAULT(INFO) << "Candidate for fallback CPU execution: " << consumer_node->Name();
candidates_fw.push(consumer_node->Index());
LOGS_DEFAULT(INFO) << "Candidate for fallback CPU execution in forward trace: " << consumer_node->Name();
}
}
return Status::OK();
}));
// then, find all the direct producers of cpu tensors.
ORT_THROW_IF_ERROR(node->ForEachWithIndex(
node->InputDefs(),
[&](const NodeArg& node_arg, size_t in_index) {
if (kernel_info->kernel_def->IsInputOnCpu(in_index)) {
cpu_args.insert(&node_arg);
auto producer_node = graph.GetProducerNode(node_arg.Name());
if (producer_node != nullptr) {
candidates_bw.push(producer_node->Index());
LOGS_DEFAULT(INFO) << "Candidate for fallback CPU execution in backward trace: " << producer_node->Name();
}
}
return Status::OK();
@ -89,9 +132,9 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
// The detail:
// for each candidate, if one of its input is a cpu tensor and the Non-CPU kernel doesn't mark it as cpu input,
// force the node to CPU to avoid a memory copy, and add its outputs to the small cpu tensors.
while (!candidates.empty()) {
NodeIndex cur = candidates.top();
candidates.pop();
while (!candidates_fw.empty()) {
NodeIndex cur = candidates_fw.top();
candidates_fw.pop();
if (visited.count(cur) != 0)
continue;
visited.insert(cur);
@ -118,7 +161,7 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
}
// the input is not a CPU tensor
if (cpu_output_args.find(input) == cpu_output_args.end()) {
if (cpu_args.find(input) == cpu_args.end()) {
place_in_cpu = false;
break;
}
@ -130,16 +173,90 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
}
}
if (place_in_cpu) {
if (place_in_cpu && cpu_kernel_available.count(cur) != 0) {
cpu_nodes.insert(cur);
LOGS_DEFAULT(INFO) << "ORT optimization- Force fallback to CPU execution for node: " << node->Name()
<< " because the CPU execution path is deemed faster than overhead involved with execution on other EPs "
<< " capable of executing this node";
for (auto* output : node->OutputDefs()) {
cpu_output_args.insert(output);
cpu_args.insert(output);
}
for (auto it = node->OutputNodesBegin(); it != node->OutputNodesEnd(); ++it) {
candidates.push((*it).Index());
candidates_fw.push((*it).Index());
}
}
}
// clear the visited to prepare for backward trace
visited.clear();
// Trace the graph backwards to find additional CPU nodes
// Starting from nodes that must produce an output on CPU, trace the producer nodes
// The trace stops when we find that
// 1) The node is already picked for CPU
// 2) Input/Output type is unsupported on CPU(float16/bfloat16)
// 3) The output is not a CPU tensor
// 4) The search hits a node that produces a CPU output
while (!candidates_bw.empty()) {
NodeIndex cur = candidates_bw.top();
candidates_bw.pop();
if (visited.count(cur) != 0)
continue;
visited.insert(cur);
// node is already picked for CPU
if (cpu_nodes.count(cur) != 0)
continue;
if (provider_nodes.find(cur) == provider_nodes.end())
continue;
auto* node = graph.GetNode(cur);
bool place_in_cpu = true;
for (size_t i = 0; i < node->OutputDefs().size(); ++i) {
auto* output = node->OutputDefs()[i];
// skip placing on CPU if the data type is float16 or bfloat16
if (output->Type() == DataTypeUtils::ToType("float16") ||
output->Type() == DataTypeUtils::ToType("bfloat16")) {
place_in_cpu = false;
break;
}
// the output is not a CPU tensor
if (cpu_args.find(output) == cpu_args.end()) {
place_in_cpu = false;
break;
}
// output is a CPU tensor, but it's intended to be consumed as CPU output by the target EP
if (node_to_kernel[cur]->kernel_def->IsOutputOnCpu(i)) {
place_in_cpu = false;
break;
}
}
// Next, check if the node inputs are of supported type
if (place_in_cpu) {
for (size_t i = 0; i < node->InputDefs().size(); ++i) {
auto* input = node->InputDefs()[i];
// skip placing on CPU if the data type is float16 or bfloat16
if (input->Type() == DataTypeUtils::ToType("float16") ||
input->Type() == DataTypeUtils::ToType("bfloat16")) {
place_in_cpu = false;
break;
}
}
}
if (place_in_cpu && cpu_kernel_available.count(cur) != 0) {
cpu_nodes.insert(cur);
LOGS_DEFAULT(INFO) << "ORT optimization- Force fallback to CPU execution for node: " << node->Name()
<< " because the CPU execution path is deemed faster than overhead involved with execution on other EPs "
<< " capable of executing this node";
for (auto* input : node->InputDefs()) {
cpu_args.insert(input);
}
for (auto it = node->InputNodesBegin(); it != node->InputNodesEnd(); ++it) {
candidates_bw.push((*it).Index());
}
}
}

View file

@ -49,7 +49,6 @@
#include "core/session/onnxruntime_session_options_config_keys.h"
#include "core/util/protobuf_parsing_utils.h"
#include "core/util/thread_utils.h"
#include "onnxruntime_config.h"
// custom ops are not available in a minimal build unless ORT_MINIMAL_BUILD_CUSTOM_OPS is set
#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_MINIMAL_BUILD_CUSTOM_OPS)
@ -293,29 +292,6 @@ void InferenceSession::ConstructorCommon(const SessionOptions& session_options,
// a monotonically increasing session id for use in telemetry
session_id_ = global_session_id_.fetch_add(1);
allocator_manager_ = std::make_shared<onnxruntime::AllocatorManager>();
// Add log to allow serving platforms to quantify ORT usage.
// To avoid flooding the test logs, this is done for non-debug mode only
// TODO: plug-in a platform specific telemetry provider to send the telemetry to
#if defined(NDEBUG) && !defined(__wasm__) && !defined(ENABLE_TRAINING)
#ifdef _WIN32
std::wostringstream ostr;
#else
std::ostringstream ostr;
#endif
// Format: "ORT Telemetry: Ver = 1.7.0; Event = EventName (event_attr1: foo.onnx, event_attr2: 400us)"
// Format: "ORT Telemetry: Ver = 1.7.0; Event = SessionCreation (model: foo.onnx, ts: 400us)"
ostr << "ORT Telemetry: "
<< "Ver = " << ORT_VERSION << "; Event = SessionCreation";
if (!model_location_.empty()) {
ostr << " (model: " << model_location_ << ")";
}
#ifdef _WIN32
std::wcout << ostr.str() << "\n";
#else
std::cout << ostr.str() << "\n";
#endif
#endif
}
InferenceSession::InferenceSession(const SessionOptions& session_options, const Environment& session_env)

View file

@ -150,9 +150,9 @@ class FusionAttention(Fusion):
q_weight = self.model.get_initializer(q_matmul.input[1])
k_weight = self.model.get_initializer(k_matmul.input[1])
v_weight = self.model.get_initializer(v_matmul.input[1])
q_bias = self.model.get_initializer(q_add.input[1])
k_bias = self.model.get_initializer(k_add.input[1])
v_bias = self.model.get_initializer(v_add.input[1])
q_bias = self.model.get_initializer(q_add.input[1]) or self.model.get_initializer(q_add.input[0])
k_bias = self.model.get_initializer(k_add.input[1]) or self.model.get_initializer(k_add.input[0])
v_bias = self.model.get_initializer(v_add.input[1]) or self.model.get_initializer(v_add.input[0])
if q_weight is None:
print(f"{q_matmul.input[1]} is not initializer. Please set do_constant_folding=True in torch.onnx.export")
@ -166,14 +166,14 @@ class FusionAttention(Fusion):
# Check if all matrices have the same shape
assert qw.shape == kw.shape == vw.shape
# All the matrices have the same shape. For 2d weights, the shapes would be [in_size, out_size].
# All the matrices have the same shape. For 2d weights, the shapes would be [in_size, out_size].
# For 3d weights, shape would be [in_size, a, b] where a*b = out_size
in_size = qw.shape[0]
out_size = np.prod(qw.shape[1:])
qkv_weight = np.stack((qw, kw, vw), axis=1)
qb = NumpyHelper.to_array(q_bias)
qb = NumpyHelper.to_array(q_bias)
kb = NumpyHelper.to_array(k_bias)
vb = NumpyHelper.to_array(v_bias)
@ -233,13 +233,14 @@ class FusionAttention(Fusion):
# SkipLayerNormalization has two inputs, and one of them is the root input for attention.
qkv_nodes = self.model.match_parent_path(start_node, ['Add', 'MatMul', 'Reshape', 'Transpose', 'MatMul'],
[None, 0, 0, 0, 0])
[None, None, 0, 0, 0])
einsum_node = None
if qkv_nodes is not None:
(_, matmul_qkv, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes
else:
# Match Albert
qkv_nodes = self.model.match_parent_path(start_node, ['Add', 'Einsum', 'Transpose', 'MatMul'], [1, 0, 0, 0])
qkv_nodes = self.model.match_parent_path(start_node, ['Add', 'Einsum', 'Transpose', 'MatMul'],
[1, None, 0, 0])
if qkv_nodes is not None:
(_, einsum_node, transpose_qkv, matmul_qkv) = qkv_nodes
else:
@ -284,16 +285,16 @@ class FusionAttention(Fusion):
if children_types.count('MatMul') != 3:
return
v_nodes = self.model.match_parent_path(matmul_qkv, ['Transpose', 'Reshape', 'Add', 'MatMul'], [1, 0, 0, 0])
v_nodes = self.model.match_parent_path(matmul_qkv, ['Transpose', 'Reshape', 'Add', 'MatMul'], [1, 0, 0, None])
if v_nodes is None:
logger.debug("fuse_attention: failed to match v path")
return
(_, _, add_v, matmul_v) = v_nodes
is_distill = False
qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Add', 'Div', 'MatMul'], [0, 0, 0, 0])
qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Add', 'Div', 'MatMul'], [0, 0, None, 0])
if qk_nodes is None:
qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Add', 'Mul', 'MatMul'], [0, 0, 0, 0])
qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Add', 'Mul', 'MatMul'], [0, 0, None, 0])
if qk_nodes is None:
qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Where', 'MatMul', 'Div'], [0, 0, 2, 0])
is_distill = True
@ -309,10 +310,10 @@ class FusionAttention(Fusion):
else:
(_, add_qk, _, matmul_qk) = qk_nodes
q_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Reshape', 'Add', 'MatMul'], [0, 0, 0, 0])
q_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Reshape', 'Add', 'MatMul'], [0, 0, 0, None])
if q_nodes is None:
q_nodes = self.model.match_parent_path(matmul_qk, ['Div', 'Transpose', 'Reshape', 'Add', 'MatMul'],
[0, 0, 0, 0, 0])
[0, 0, 0, 0, None])
if q_nodes is None:
logger.debug("fuse_attention: failed to match q path")
return
@ -320,10 +321,10 @@ class FusionAttention(Fusion):
add_q = q_nodes[-2]
matmul_q = q_nodes[-1]
k_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Reshape', 'Add', 'MatMul'], [1, 0, 0, 0])
k_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Reshape', 'Add', 'MatMul'], [1, 0, 0, None])
if k_nodes is None:
k_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Transpose', 'Reshape', 'Add', 'MatMul'],
[1, 0, 0, 0, 0])
[1, 0, 0, 0, None])
if k_nodes is None:
logger.debug("fuse_attention: failed to match k path")
return
@ -339,8 +340,8 @@ class FusionAttention(Fusion):
output_name_to_node)
else:
_, mask_nodes, _ = self.model.match_parent_paths(
add_qk, [(['Mul', 'Sub', 'Cast', 'Unsqueeze', 'Unsqueeze'], [1, 0, 1, 0, 0]),
(['Mul', 'Sub', 'Unsqueeze', 'Unsqueeze'], [1, 0, 1, 0])], output_name_to_node)
add_qk, [(['Mul', 'Sub', 'Cast', 'Unsqueeze', 'Unsqueeze'], [None, 0, 1, 0, 0]),
(['Mul', 'Sub', 'Unsqueeze', 'Unsqueeze'], [None, 0, 1, 0])], output_name_to_node)
if mask_nodes is None:
logger.debug("fuse_attention: failed to match mask path")
return

View file

@ -21,7 +21,17 @@ def float_tensor(name: str, shape: List[int], random=False):
return helper.make_tensor(name, TensorProto.FLOAT, shape, weights)
def create_bert_attention(input_hidden_size=16, pruned_num_heads=2, pruned_head_size=4, use_float_mask=False):
def reverse_if(inputs, reverse=False):
    """Return ``inputs``, in reversed order when ``reverse`` is true.

    Args:
        inputs: sequence of items (the callers in this file pass lists of tensor names).
        reverse: when true, a new list holding the items in reverse order is returned;
            otherwise ``inputs`` itself is returned unchanged.

    Returns:
        ``inputs`` (same object) when ``reverse`` is false, else a new reversed list.
    """
    if reverse:
        # Build a reversed copy instead of calling inputs.reverse(), so the caller's
        # sequence is never modified behind its back (and non-list sequences work too).
        return list(reversed(inputs))
    return inputs
def create_bert_attention(input_hidden_size=16,
pruned_num_heads=2,
pruned_head_size=4,
use_float_mask=False,
switch_add_inputs=False):
# unsqueeze in opset version 13 has two inputs (axis is moved from attribute to input).
has_unsqueeze_two_inputs = (version.parse(onnx.__version__) >= version.parse('1.8.0'))
@ -36,13 +46,13 @@ def create_bert_attention(input_hidden_size=16, pruned_num_heads=2, pruned_head_
# q nodes
helper.make_node("MatMul", ["layernorm_out", "matmul_q_weight"], ["matmul_q_out"], "matmul_q"),
helper.make_node("Add", ["matmul_q_out", "add_q_weight"], ["add_q_out"], "add_q"),
helper.make_node("Add", reverse_if(["matmul_q_out", "add_q_weight"], switch_add_inputs), ["add_q_out"], "add_q"),
helper.make_node("Reshape", ["add_q_out", "reshape_weight_1"], ["reshape_q_out"], "reshape_q"),
helper.make_node("Transpose", ["reshape_q_out"], ["transpose_q_out"], "transpose_q", perm=[0, 2, 1, 3]),
# k nodes
helper.make_node("MatMul", ["layernorm_out", "matmul_k_weight"], ["matmul_k_out"], "matmul_k"),
helper.make_node("Add", ["matmul_k_out", "add_k_weight"], ["add_k_out"], "add_k"),
helper.make_node("Add", reverse_if(["matmul_k_out", "add_k_weight"], switch_add_inputs), ["add_k_out"], "add_k"),
helper.make_node("Reshape", ["add_k_out", "reshape_weight_1"], ["reshape_k_out"], "reshape_k"),
helper.make_node("Transpose", ["reshape_k_out"], ["transpose_k_out"], "transpose_k", perm=[0, 2, 3, 1]),
@ -60,7 +70,7 @@ def create_bert_attention(input_hidden_size=16, pruned_num_heads=2, pruned_head_
# qk nodes
helper.make_node("MatMul", ["transpose_q_out", "transpose_k_out"], ["matmul_qk_out"], "matmul_qk"),
helper.make_node("Div", ["matmul_qk_out", "div_weight"], ["div_qk_out"], "div_qk"),
helper.make_node("Add", ["div_qk_out", "mul_mask_out"], ["add_qk_out"], "add_qk"),
helper.make_node("Add", reverse_if(["div_qk_out", "mul_mask_out"], switch_add_inputs), ["add_qk_out"], "add_qk"),
helper.make_node("Softmax", ["add_qk_out"], ["softmax_qk_out"], "softmax_qk", axis=3),
# v nodes
@ -74,8 +84,8 @@ def create_bert_attention(input_hidden_size=16, pruned_num_heads=2, pruned_head_
helper.make_node("Transpose", ["matmul_qkv_1_out"], ["transpose_qkv_out"], "transpose_qkv", perm=[0, 2, 1, 3]),
helper.make_node("Reshape", ["transpose_qkv_out", "reshape_weight_2"], ["reshape_qkv_out"], "reshape_qkv"),
helper.make_node("MatMul", ["reshape_qkv_out", "matmul_qkv_weight"], ["matmul_qkv_2_out"], "matmul_qkv_2"),
helper.make_node("Add", ["matmul_qkv_2_out", "add_qkv_weight"], ["add_qkv_out"], "add_qkv"),
helper.make_node("Add", ["add_qkv_out", "layernorm_out"], ["skip_output"], "add_skip"),
helper.make_node("Add", reverse_if(["matmul_qkv_2_out", "add_qkv_weight"], switch_add_inputs), ["add_qkv_out"], "add_qkv"),
helper.make_node("Add", reverse_if(["add_qkv_out", "layernorm_out"], switch_add_inputs), ["skip_output"], "add_skip"),
helper.make_node("LayerNormalization", ["skip_output", "layer_norm_weight", "layer_norm_bias"], ["output"],
"layernorm2",
axis=-1,
@ -127,6 +137,7 @@ def create_bert_attention(input_hidden_size=16, pruned_num_heads=2, pruned_head_
model = helper.make_model(graph)
return model
def create_tf2onnx_attention_3d(input_hidden_size=16, num_heads=4, head_size=4, use_float_mask=False):
# unsqueeze in opset version 13 has two inputs (axis is moved from attribute to input).
has_unsqueeze_two_inputs = (version.parse(onnx.__version__) >= version.parse('1.8.0'))
@ -143,7 +154,7 @@ def create_tf2onnx_attention_3d(input_hidden_size=16, num_heads=4, head_size=4,
# q nodes
helper.make_node("Einsum", ["layernorm_out", "einsum_q_weight"], ["einsum_q_out"], "einsum_q", equation="abc,cde->abde"),
helper.make_node("Add", ["einsum_q_out", "add_q_weight"], ["add_q_out"], "add_q"),
# k nodes
helper.make_node("Einsum", ["layernorm_out", "einsum_k_weight"], ["einsum_k_out"], "einsum_k", equation="abc,cde->abde"),
helper.make_node("Add", ["einsum_k_out", "add_k_weight"], ["add_k_out"], "add_k"),
@ -229,5 +240,7 @@ def create_tf2onnx_attention_3d(input_hidden_size=16, num_heads=4, head_size=4,
if __name__ == "__main__":
model = create_bert_attention()
onnx.save(model, "pruned_bert_attention.onnx")
model = create_bert_attention(switch_add_inputs=True)
onnx.save(model, "bert_attention_reverse_add_order.onnx")
model = create_tf2onnx_attention_3d()
onnx.save(model, "bert_3d_attention.onnx")
onnx.save(model, "bert_3d_attention.onnx")

View file

@ -28,7 +28,21 @@ class TestFusion(unittest.TestCase):
'pruned_attention_opt.onnx')
expected = onnx.load(expected_model_path)
self.assertEqual(str(optimized_model.model.graph), str(expected.graph))
def test_attention_fusion_reverse_add_order(self):
    """Fusing a BERT attention model built with switch_add_inputs=True must give the reference graph.

    The model is written to a temporary .onnx file in the current directory, optimized,
    and the file is removed again before the optimized graph is compared (as text) with
    test_data/fusion/pruned_attention_opt.onnx.
    """
    input_path = os.path.join('.', "bert_attention_reverse_add_order.onnx")
    onnx.save(create_bert_attention(switch_add_inputs=True), input_path)
    fused = optimize_model(input_path)
    os.remove(input_path)

    # Swapping the Add operands is expected to yield the very same optimized model.
    baseline_path = os.path.join(os.path.dirname(__file__), 'test_data', 'fusion', 'pruned_attention_opt.onnx')
    baseline = onnx.load(baseline_path)
    self.assertEqual(str(fused.model.graph), str(baseline.graph))
def test_3d_attention_fusion_tf2onnx_model(self):
model = create_tf2onnx_attention_3d()
dir = '.'

View file

@ -20,6 +20,12 @@
#include "gtest/gtest.h"
#include "test/test_environment.h"
// Pull in the GPU execution provider header matching the build configuration.
// Both branches test whether the macro is defined (not what it expands to), so a
// value-less definition of USE_ROCM cannot turn the #elif into an empty expression.
#if defined(USE_CUDA)
#include "core/providers/cuda/cuda_execution_provider.h"
#elif defined(USE_ROCM)
#include "core/providers/rocm/rocm_execution_provider.h"
#endif
using namespace ONNX_NAMESPACE;
using namespace std;
namespace onnxruntime {
@ -173,6 +179,94 @@ TEST_P(SessionStateTestP, TestInitializerProcessing) {
}
}
#if defined(USE_CUDA) || defined(USE_ROCM)
static void TestCPUNodePlacement(const std::basic_string<ORTCHAR_T>& model_uri,
const std::unordered_set<std::string>& expected_cpu_nodes,
const std::unordered_set<std::string>& expected_gpu_nodes) {
std::shared_ptr<Model> model;
ASSERT_STATUS_OK(Model::Load(model_uri, model, nullptr, DefaultLoggingManager().DefaultLogger()));
Graph& graph = model->MainGraph();
ExecutionProviders execution_providers;
#if defined(USE_CUDA)
CUDAExecutionProviderInfo cuda_epi;
ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, std::make_unique<CUDAExecutionProvider>(cuda_epi)));
#elif defined(USE_ROCM)
ROCMExecutionProviderInfo rocm_epi;
ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kRocmExecutionProvider, std::make_unique<ROCMExecutionProvider>(rocm_epi)));
#endif
// add CPU EP
CPUExecutionProviderInfo epi;
ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique<CPUExecutionProvider>(epi)));
KernelRegistryManager krm;
ASSERT_STATUS_OK(krm.RegisterKernels(execution_providers));
DataTransferManager dtm;
profiling::Profiler profiler;
SessionState session_state(graph, execution_providers, false, nullptr, nullptr, dtm,
DefaultLoggingManager().DefaultLogger(), profiler);
// Partition the graph. Here, the graph partitioner assigns EPs to the nodes
GraphPartitioner partitioner(krm, execution_providers);
ASSERT_STATUS_OK(partitioner.Partition(graph, session_state.ExportDll(), session_state.GetMutableFuncMgr()));
// check which nodes are assigned to CPU and GPU
for (auto& node : graph.Nodes()) {
// assert that EP is assigned
ASSERT_TRUE(!node.GetExecutionProviderType().empty());
auto& ep = node.GetExecutionProviderType();
if (ep == onnxruntime::kCudaExecutionProvider || ep == onnxruntime::kRocmExecutionProvider) {
ASSERT_TRUE(expected_gpu_nodes.count(node.Name())) << "Node not found in expected gpu nodes: " << node.Name();
} else if (ep == onnxruntime::kCpuExecutionProvider) {
ASSERT_TRUE(expected_cpu_nodes.count(node.Name())) << "Node not found in expected cpu nodes: " << node.Name();
} else {
ASSERT_TRUE(false) << "Invalid execution provider assigned to node: " << node.Name() << " , value: " << ep;
}
}
}
// testdata/cpu_fallback_pattern_0.onnx: only "shape0" and "expand" are expected on the GPU EP;
// the remaining six nodes are expected on the CPU EP.
TEST(SessionStateTest, CPUPlacementTest0) {
  const std::unordered_set<std::string> cpu_side{"reshape", "shape1", "const1", "mul", "equal", "where"};
  const std::unordered_set<std::string> gpu_side{"shape0", "expand"};
  TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_0.onnx"), cpu_side, gpu_side);
}
// testdata/cpu_fallback_pattern_1.onnx: "const1" is expected on the CPU EP, while
// "shape0" and "expand" are expected on the GPU EP.
TEST(SessionStateTest, CPUPlacementTest1) {
  const std::unordered_set<std::string> cpu_side{"const1"};
  const std::unordered_set<std::string> gpu_side{"shape0", "expand"};
  TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_1.onnx"), cpu_side, gpu_side);
}
// testdata/cpu_fallback_pattern_2.onnx: "range" is expected on the CPU EP, while
// "size0" and "reduce" are expected on the GPU EP.
TEST(SessionStateTest, CPUPlacementTest2) {
  const std::unordered_set<std::string> cpu_side{"range"};
  const std::unordered_set<std::string> gpu_side{"size0", "reduce"};
  TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_2.onnx"), cpu_side, gpu_side);
}
// testdata/cpu_fallback_pattern_3.onnx: both Range nodes ("range0", "range1") are expected on
// the CPU EP; all other nodes are expected on the GPU EP.
TEST(SessionStateTest, CPUPlacementTest3) {
  const std::unordered_set<std::string> cpu_side{"range0", "range1"};
  const std::unordered_set<std::string> gpu_side{"size0", "reduce0", "identity", "size1", "reduce1", "sum"};
  TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_3.onnx"), cpu_side, gpu_side);
}
// testdata/cpu_fallback_pattern_4.onnx: the expected placement depends on the GPU EP in use.
TEST(SessionStateTest, CPUPlacementTest4) {
  // The ROCm EP currently has no valid ReduceSum kernel for int64, so the backward trace in
  // GetCpuPreferredNodes stops earlier than it does with the CUDA EP. Once the ROCm kernel
  // is added, the ROCm expectations can be aligned with the CUDA ones.
#if defined(USE_CUDA)
  const std::unordered_set<std::string> cpu_side{"range", "reduce", "const1"};
  const std::unordered_set<std::string> gpu_side{"size0", "expand"};
#elif defined(USE_ROCM)
  const std::unordered_set<std::string> cpu_side{"const1", "reduce"};
  const std::unordered_set<std::string> gpu_side{"size0", "expand", "range"};
#endif
  TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_4.onnx"), cpu_side, gpu_side);
}
// testdata/cpu_fallback_pattern_5.onnx: the two Gather nodes and "concat" are expected on the
// CPU EP, while "shape0", "shape1" and "reshape" are expected on the GPU EP.
TEST(SessionStateTest, CPUPlacementTest5) {
  const std::unordered_set<std::string> cpu_side{"gather0", "gather1", "concat"};
  const std::unordered_set<std::string> gpu_side{"shape0", "shape1", "reshape"};
  TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_5.onnx"), cpu_side, gpu_side);
}
#endif
// Test that we allocate memory for an initializer from non-arena memory even if we provide an arena-based allocator
// if the relevant session option config flag is set
// For this test we need to enable the arena-based allocator which is not supported on x86 builds, so

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -0,0 +1,170 @@
import onnx
from onnx import helper
from onnx import TensorProto
from onnx import shape_inference
import numpy as np
# Pattern 0: Shape -> Reshape -> Shape -> ConstantOfShape -> Mul/Equal/Where -> Expand.
# Saved as cpu_fallback_pattern_0.onnx (opset 12).
nodes_0 = [
    helper.make_node("Shape", ['A'], ['A_shape'], name='shape0'),
    helper.make_node("Reshape", ['A_shape', 'shape'], ['A_reshaped'], name='reshape'),
    helper.make_node("Shape", ['A_reshaped'], ['A_shape1'], name='shape1'),
    helper.make_node("ConstantOfShape", ['A_shape1'], ['const1'], name='const1',
                     value=helper.make_tensor('val', TensorProto.INT64, [1], [1])),
    helper.make_node("Mul", ['const1', 'neg_one'], ['mul'], name='mul'),
    helper.make_node("Equal", ['A_reshaped', 'mul'], ['equal'], name='equal'),
    helper.make_node("Where", ['equal', 'const1', 'A_reshaped'], ['where'], name='where'),
    helper.make_node("Expand", ['B', 'where'], ['C'], name='expand'),
]
# graph inputs/outputs are declared without a fixed shape (shape=None)
inputs_0 = [
    helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
    helper.make_tensor_value_info("B", TensorProto.FLOAT, None),
]
outputs_0 = [helper.make_tensor_value_info('C', TensorProto.FLOAT, None)]
initializers_0 = [
    helper.make_tensor('shape', TensorProto.INT64, [1], [-1]),
    helper.make_tensor('neg_one', TensorProto.INT64, [1], [-1]),
]
graph_def_0 = helper.make_graph(nodes=nodes_0,
                                name='test-model',
                                inputs=inputs_0,
                                outputs=outputs_0,
                                initializer=initializers_0)

model = helper.make_model(graph_def_0, opset_imports=[helper.make_operatorsetid("", 12)])
onnx.save_model(model, "cpu_fallback_pattern_0.onnx")
# Pattern 1: Shape -> ConstantOfShape -> Expand (shape input of Expand).
# Saved as cpu_fallback_pattern_1.onnx (opset 12).
nodes_1 = [
    helper.make_node("Shape", ['A'], ['A_shape'], name='shape0'),
    helper.make_node("ConstantOfShape", ['A_shape'], ['const1'], name='const1',
                     value=helper.make_tensor('val', TensorProto.INT64, [1], [1])),
    helper.make_node("Expand", ['B', 'const1'], ['C'], name='expand'),
]
# graph inputs/outputs are declared without a fixed shape (shape=None)
inputs_1 = [
    helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
    helper.make_tensor_value_info("B", TensorProto.FLOAT, None),
]
outputs_1 = [helper.make_tensor_value_info('C', TensorProto.FLOAT, None)]
graph_def_1 = helper.make_graph(nodes=nodes_1,
                                name='test-model',
                                inputs=inputs_1,
                                outputs=outputs_1,
                                initializer=[])

model = helper.make_model(graph_def_1, opset_imports=[helper.make_operatorsetid("", 12)])
onnx.save_model(model, "cpu_fallback_pattern_1.onnx")
# Pattern 2: Size -> Range -> ReduceSum (Range output feeds the second input of ReduceSum).
# Saved as cpu_fallback_pattern_2.onnx (opset 13).
nodes_2 = [
    helper.make_node("Size", ['A'], ['A_size'], name='size0'),
    helper.make_node("Range", ['zero', 'A_size', 'two'], ['range'], name='range'),
    helper.make_node("ReduceSum", ['B', 'range'], ['C'], name='reduce'),
]
# graph inputs/outputs are declared without a fixed shape (shape=None)
inputs_2 = [
    helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
    helper.make_tensor_value_info("B", TensorProto.FLOAT, None),
]
outputs_2 = [helper.make_tensor_value_info('C', TensorProto.FLOAT, None)]
# scalar int64 constants (empty dims list)
initializers_2 = [
    helper.make_tensor('zero', TensorProto.INT64, [], [0]),
    helper.make_tensor('two', TensorProto.INT64, [], [2]),
]
graph_def_2 = helper.make_graph(nodes=nodes_2,
                                name='test-model',
                                inputs=inputs_2,
                                outputs=outputs_2,
                                initializer=initializers_2)

model = helper.make_model(graph_def_2, opset_imports=[helper.make_operatorsetid("", 13)])
onnx.save_model(model, "cpu_fallback_pattern_2.onnx")
# Pattern 3: two chained Size -> Range -> ReduceSum groups; the second group starts from an
# Identity copy of the first ReduceSum output, and both ReduceSum results are added by Sum.
# Saved as cpu_fallback_pattern_3.onnx (opset 13).
nodes_3 = [
    helper.make_node("Size", ['A'], ['size0'], name='size0'),
    helper.make_node("Range", ['zero', 'size0', 'two'], ['range0'], name='range0'),
    helper.make_node("ReduceSum", ['B', 'range0'], ['reduce0'], name='reduce0'),
    helper.make_node("Identity", ['reduce0'], ['reduce0_cpy'], name='identity'),
    helper.make_node("Size", ['reduce0_cpy'], ['size1'], name='size1'),
    helper.make_node("Range", ['zero', 'size1', 'two'], ['range1'], name='range1'),
    helper.make_node("ReduceSum", ['B', 'range1'], ['reduce1'], name='reduce1'),
    helper.make_node("Sum", ['reduce0', 'reduce1'], ['C'], name='sum'),
]
# graph inputs/outputs are declared without a fixed shape (shape=None)
inputs_3 = [
    helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
    helper.make_tensor_value_info("B", TensorProto.FLOAT, None),
]
outputs_3 = [helper.make_tensor_value_info('C', TensorProto.FLOAT, None)]
# scalar int64 constants (empty dims list)
initializers_3 = [
    helper.make_tensor('zero', TensorProto.INT64, [], [0]),
    helper.make_tensor('two', TensorProto.INT64, [], [2]),
]
graph_def_3 = helper.make_graph(nodes=nodes_3,
                                name='test-model',
                                inputs=inputs_3,
                                outputs=outputs_3,
                                initializer=initializers_3)

model = helper.make_model(graph_def_3, opset_imports=[helper.make_operatorsetid("", 13)])
onnx.save_model(model, "cpu_fallback_pattern_3.onnx")
# Pattern 4: Size -> Range -> ReduceSum (int64 data input 'B') -> ConstantOfShape -> Expand.
# Saved as cpu_fallback_pattern_4.onnx (opset 13).
nodes_4 = [
    helper.make_node("Size", ['A'], ['A_size'], name='size0'),
    helper.make_node("Range", ['zero', 'A_size', 'two'], ['range'], name='range'),
    helper.make_node("ReduceSum", ['B', 'range'], ['reduce'], name='reduce'),
    helper.make_node("ConstantOfShape", ['reduce'], ['const1'], name='const1',
                     value=helper.make_tensor('val', TensorProto.INT64, [1], [1])),
    helper.make_node("Expand", ['C', 'const1'], ['D'], name='expand'),
]
# graph inputs/outputs are declared without a fixed shape (shape=None)
inputs_4 = [
    helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
    helper.make_tensor_value_info("B", TensorProto.INT64, None),
    helper.make_tensor_value_info("C", TensorProto.FLOAT, None),
]
outputs_4 = [helper.make_tensor_value_info('D', TensorProto.FLOAT, None)]
# scalar int64 constants (empty dims list)
initializers_4 = [
    helper.make_tensor('zero', TensorProto.INT64, [], [0]),
    helper.make_tensor('two', TensorProto.INT64, [], [2]),
]
graph_def_4 = helper.make_graph(nodes=nodes_4,
                                name='test-model',
                                inputs=inputs_4,
                                outputs=outputs_4,
                                initializer=initializers_4)

model = helper.make_model(graph_def_4, opset_imports=[helper.make_operatorsetid("", 13)])
onnx.save_model(model, "cpu_fallback_pattern_4.onnx")
# Pattern 5: two Shape -> Gather branches are concatenated into the target shape of a Reshape.
# Saved as cpu_fallback_pattern_5.onnx (opset 13).
# NOTE(review): 'concat' is listed before 'gather1', the node producing its 'seq_len' input,
# so the node list is not in topological order. The order is kept exactly as it was;
# confirm whether this is intentional.
nodes_5 = [
    helper.make_node("Shape", ['A'], ['A_shape'], name='shape0'),
    helper.make_node("Gather", ['A_shape', 'zero'], ['batch'], name='gather0'),
    helper.make_node("Concat", ['batch', 'seq_len'], ['shape'], name='concat', axis=0),
    helper.make_node("Shape", ['B'], ['B_shape'], name='shape1'),
    helper.make_node("Gather", ['B_shape', 'one'], ['seq_len'], name='gather1'),
    helper.make_node("Reshape", ['C', 'shape'], ['D'], name='reshape'),
]
# graph inputs/outputs are declared without a fixed shape (shape=None)
inputs_5 = [
    helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
    helper.make_tensor_value_info("B", TensorProto.INT64, None),
    helper.make_tensor_value_info("C", TensorProto.FLOAT, None),
]
outputs_5 = [helper.make_tensor_value_info('D', TensorProto.FLOAT, None)]
# one-element int64 index tensors used by the two Gather nodes
initializers_5 = [
    helper.make_tensor('zero', TensorProto.INT64, [1], [0]),
    helper.make_tensor('one', TensorProto.INT64, [1], [1]),
]
graph_def_5 = helper.make_graph(nodes=nodes_5,
                                name='test-model',
                                inputs=inputs_5,
                                outputs=outputs_5,
                                initializer=initializers_5)

model = helper.make_model(graph_def_5, opset_imports=[helper.make_operatorsetid("", 13)])
onnx.save_model(model, "cpu_fallback_pattern_5.onnx")