diff --git a/include/onnxruntime/core/common/logging/severity.h b/include/onnxruntime/core/common/logging/severity.h index c377ceb809..e43f192eb1 100644 --- a/include/onnxruntime/core/common/logging/severity.h +++ b/include/onnxruntime/core/common/logging/severity.h @@ -13,7 +13,7 @@ enum class Severity { kINFO = 1, kWARNING = 2, kERROR = 3, - kFATAL = 4, + kFATAL = 4 }; constexpr const char* SEVERITY_PREFIX = "VIWEF"; diff --git a/onnxruntime/core/framework/fallback_cpu_capability.cc b/onnxruntime/core/framework/fallback_cpu_capability.cc index 011eaf9edb..c23ba1bcb3 100644 --- a/onnxruntime/core/framework/fallback_cpu_capability.cc +++ b/onnxruntime/core/framework/fallback_cpu_capability.cc @@ -7,7 +7,10 @@ #include "onnx/defs/data_type_utils.h" +#include "core/framework/execution_providers.h" +#include "core/framework/kernel_registry_manager.h" #include "core/framework/op_kernel.h" +#include "core/providers/cpu/cpu_execution_provider.h" using namespace ONNX_NAMESPACE::Utils; @@ -45,18 +48,43 @@ std::unordered_set GetCpuPreferredNodes(const onnxruntime::GraphViewe return node_id_to_order_map[n1] > node_id_to_order_map[n2]; }; - std::priority_queue, decltype(greater_order_comp)> candidates(greater_order_comp); + // If return false, n2 will be output first; If return true, n1 will be output first + auto lesser_order_comp = [&](const NodeIndex n1, const NodeIndex n2) { + return node_id_to_order_map[n1] < node_id_to_order_map[n2]; + }; + + std::priority_queue, decltype(greater_order_comp)> candidates_fw(greater_order_comp); + std::priority_queue, decltype(lesser_order_comp)> candidates_bw(lesser_order_comp); std::unordered_set visited; - std::unordered_set cpu_output_args; + std::unordered_set cpu_args; std::unordered_set provider_nodes; std::unordered_map node_to_kernel; + std::unordered_set cpu_kernel_available; + + // create a temp CPU kernel registry + KernelRegistryManager mgr; + ExecutionProviders cpu_ep; + CPUExecutionProviderInfo epi{false}; + ORT_ENFORCE(cpu_ep.Add(kCpuExecutionProvider, std::make_unique(epi)).IsOK()); + ORT_ENFORCE(mgr.RegisterKernels(cpu_ep).IsOK()); + std::vector cpu_kernel_registries = mgr.GetKernelRegistriesByProviderType(kCpuExecutionProvider); for (auto& node_id : tentative_nodes) { provider_nodes.insert(node_id); const Node* node = graph.GetNode(node_id); const KernelCreateInfo* kernel_info = nullptr; + + // Get the CPU kernel availability for this node + for (auto registry : cpu_kernel_registries) { + auto st = registry->TryFindKernel(*node, kCpuExecutionProvider, &kernel_info); + if (st.IsOK()) { + cpu_kernel_available.insert(node_id); + break; + } + } + for (auto registry : kernel_registries) { auto st = registry->TryFindKernel(*node, provider_type, &kernel_info); if (st.IsOK()) @@ -71,11 +99,26 @@ std::unordered_set GetCpuPreferredNodes(const onnxruntime::GraphViewe node->OutputDefs(), [&](const NodeArg& node_arg, size_t out_index) { if (kernel_info->kernel_def->IsOutputOnCpu(out_index)) { - cpu_output_args.insert(&node_arg); + cpu_args.insert(&node_arg); auto consumer_nodes = graph.GetConsumerNodes(node_arg.Name()); for (auto& consumer_node : consumer_nodes) { - candidates.push(consumer_node->Index()); - LOGS_DEFAULT(INFO) << "Candidate for fallback CPU execution: " << consumer_node->Name(); + candidates_fw.push(consumer_node->Index()); + LOGS_DEFAULT(INFO) << "Candidate for fallback CPU execution in forward trace: " << consumer_node->Name(); + } + } + return Status::OK(); + })); + + // then, find all the direct producers of cpu tensors. + ORT_THROW_IF_ERROR(node->ForEachWithIndex( + node->InputDefs(), + [&](const NodeArg& node_arg, size_t in_index) { + if (kernel_info->kernel_def->IsInputOnCpu(in_index)) { + cpu_args.insert(&node_arg); + auto producer_node = graph.GetProducerNode(node_arg.Name()); + if (producer_node != nullptr) { + candidates_bw.push(producer_node->Index()); + LOGS_DEFAULT(INFO) << "Candidate for fallback CPU execution in backward trace: " << producer_node->Name(); } } return Status::OK(); @@ -89,9 +132,9 @@ std::unordered_set GetCpuPreferredNodes(const onnxruntime::GraphViewe // The detail: // for each candidate, if one of its input is a cpu tensor and the Non-CPU kernel doesn't mark it as cpu input, // force the node to CPU to avoid memory cpu and add its output to the small cpu tensors. - while (!candidates.empty()) { - NodeIndex cur = candidates.top(); - candidates.pop(); + while (!candidates_fw.empty()) { + NodeIndex cur = candidates_fw.top(); + candidates_fw.pop(); if (visited.count(cur) != 0) continue; visited.insert(cur); @@ -118,7 +161,7 @@ std::unordered_set GetCpuPreferredNodes(const onnxruntime::GraphViewe } // the input is not a CPU tensor - if (cpu_output_args.find(input) == cpu_output_args.end()) { + if (cpu_args.find(input) == cpu_args.end()) { place_in_cpu = false; break; } @@ -130,16 +173,90 @@ std::unordered_set GetCpuPreferredNodes(const onnxruntime::GraphViewe } } - if (place_in_cpu) { + if (place_in_cpu && cpu_kernel_available.count(cur) != 0) { cpu_nodes.insert(cur); LOGS_DEFAULT(INFO) << "ORT optimization- Force fallback to CPU execution for node: " << node->Name() << " because the CPU execution path is deemed faster than overhead involved with execution on other EPs " << " capable of executing this node"; for (auto* output : node->OutputDefs()) { - cpu_output_args.insert(output); + cpu_args.insert(output); } for (auto it = node->OutputNodesBegin(); it != node->OutputNodesEnd(); ++it) { - candidates.push((*it).Index()); + candidates_fw.push((*it).Index()); + } + } + } + // clear the visited to prepare for backward trace + visited.clear(); + // Trace the graph backwards to find additional CPU nodes + // Starting from nodes that must produce an output on CPU, trace the producer nodes + // The trace stops when we find that + // 1) The node is already picked for CPU + // 2) Input/Output type is unsupported on CPU(float16/bfloat16) + // 3) The output is not a CPU tensor + // 4) The search hits a node that produces a CPU output + while (!candidates_bw.empty()) { + NodeIndex cur = candidates_bw.top(); + candidates_bw.pop(); + if (visited.count(cur) != 0) + continue; + visited.insert(cur); + + // node is already picked for CPU + if (cpu_nodes.count(cur) != 0) + continue; + + if (provider_nodes.find(cur) == provider_nodes.end()) + continue; + + auto* node = graph.GetNode(cur); + bool place_in_cpu = true; + for (size_t i = 0; i < node->OutputDefs().size(); ++i) { + auto* output = node->OutputDefs()[i]; + + // skip placing on CPU if the data typs is float16 or bfloat16 + if (output->Type() == DataTypeUtils::ToType("float16") || + output->Type() == DataTypeUtils::ToType("bfloat16")) { + place_in_cpu = false; + break; + } + + // the output is not a CPU tensor + if (cpu_args.find(output) == cpu_args.end()) { + place_in_cpu = false; + break; + } + + // output is a CPU tensor, but it's intended to be consumed as CPU output by the target EP + if (node_to_kernel[cur]->kernel_def->IsOutputOnCpu(i)) { + place_in_cpu = false; + break; + } + } + // Next, check if the node inputs are of supported type + if (place_in_cpu) { + for (size_t i = 0; i < node->InputDefs().size(); ++i) { + auto* input = node->InputDefs()[i]; + + // skip placing on CPU if the data typs is float16 or bfloat16 + if (input->Type() == DataTypeUtils::ToType("float16") || + input->Type() == DataTypeUtils::ToType("bfloat16")) { + place_in_cpu = false; + break; + } + } + } + + if (place_in_cpu && cpu_kernel_available.count(cur) != 0) { + cpu_nodes.insert(cur); + LOGS_DEFAULT(INFO) << "ORT optimization- Force fallback to CPU execution for node: " << node->Name() + << " because the CPU execution path is deemed faster than overhead involved with execution on other EPs " + << " capable of executing this node"; + for (auto* input : node->InputDefs()) { + cpu_args.insert(input); + } + for (auto it = node->InputNodesBegin(); it != node->InputNodesEnd(); ++it) { + candidates_bw.push((*it).Index()); } } } diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index 42abdd12fa..75d258fecf 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -49,7 +49,6 @@ #include "core/session/onnxruntime_session_options_config_keys.h" #include "core/util/protobuf_parsing_utils.h" #include "core/util/thread_utils.h" -#include "onnxruntime_config.h" // custom ops are not available in a minimal build unless ORT_MINIMAL_BUILD_CUSTOM_OPS is set #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_MINIMAL_BUILD_CUSTOM_OPS) @@ -293,29 +292,6 @@ void InferenceSession::ConstructorCommon(const SessionOptions& session_options, // a monotonically increasing session id for use in telemetry session_id_ = global_session_id_.fetch_add(1); allocator_manager_ = std::make_shared(); - - // Add log to allow serving platforms to quantify ORT usage. - // To avoid flooding the test logs, this is done for non-debug mode only - // TODO: plug-in a platform specific telemetry provider to send the telemetry to -#if defined(NDEBUG) && !defined(__wasm__) && !defined(ENABLE_TRAINING) -#ifdef _WIN32 - std::wostringstream ostr; -#else - std::ostringstream ostr; -#endif - // Format: "ORT Telemetry: Ver = 1.7.0; Event = EventName (event_attr1: foo.onnx, event_attr2: 400us)" - // Format: "ORT Telemetry: Ver = 1.7.0; Event = SessionCreation (model: foo.onnx, ts: 400us)" - ostr << "ORT Telemetry: " - << "Ver = " << ORT_VERSION << "; Event = SessionCreation"; - if (!model_location_.empty()) { - ostr << " (model: " << model_location_ << ")"; - } -#ifdef _WIN32 - std::wcout << ostr.str() << "\n"; -#else - std::cout << ostr.str() << "\n"; -#endif -#endif } InferenceSession::InferenceSession(const SessionOptions& session_options, const Environment& session_env) diff --git a/onnxruntime/python/tools/transformers/fusion_attention.py b/onnxruntime/python/tools/transformers/fusion_attention.py index 657a4e717f..e6f9ec9df2 100644 --- a/onnxruntime/python/tools/transformers/fusion_attention.py +++ b/onnxruntime/python/tools/transformers/fusion_attention.py @@ -150,9 +150,9 @@ class FusionAttention(Fusion): q_weight = self.model.get_initializer(q_matmul.input[1]) k_weight = self.model.get_initializer(k_matmul.input[1]) v_weight = self.model.get_initializer(v_matmul.input[1]) - q_bias = self.model.get_initializer(q_add.input[1]) - k_bias = self.model.get_initializer(k_add.input[1]) - v_bias = self.model.get_initializer(v_add.input[1]) + q_bias = self.model.get_initializer(q_add.input[1]) or self.model.get_initializer(q_add.input[0]) + k_bias = self.model.get_initializer(k_add.input[1]) or self.model.get_initializer(k_add.input[0]) + v_bias = self.model.get_initializer(v_add.input[1]) or self.model.get_initializer(v_add.input[0]) if q_weight is None: print(f"{q_matmul.input[1]} is not initializer. Please set do_constant_folding=True in torch.onnx.export") @@ -166,14 +166,14 @@ class FusionAttention(Fusion): # Check if all matrices have the same shape assert qw.shape == kw.shape == vw.shape - # All the matrices have the same shape. For 2d weights, the shapes would be [in_size, out_size]. + # All the matrices have the same shape. For 2d weights, the shapes would be [in_size, out_size]. # For 3d weights, shape would be [in_size, a, b] where a*b = out_size in_size = qw.shape[0] out_size = np.prod(qw.shape[1:]) qkv_weight = np.stack((qw, kw, vw), axis=1) - qb = NumpyHelper.to_array(q_bias) + qb = NumpyHelper.to_array(q_bias) kb = NumpyHelper.to_array(k_bias) vb = NumpyHelper.to_array(v_bias) @@ -233,13 +233,14 @@ class FusionAttention(Fusion): # SkipLayerNormalization has two inputs, and one of them is the root input for attention. qkv_nodes = self.model.match_parent_path(start_node, ['Add', 'MatMul', 'Reshape', 'Transpose', 'MatMul'], - [None, 0, 0, 0, 0]) + [None, None, 0, 0, 0]) einsum_node = None if qkv_nodes is not None: (_, matmul_qkv, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes else: # Match Albert - qkv_nodes = self.model.match_parent_path(start_node, ['Add', 'Einsum', 'Transpose', 'MatMul'], [1, 0, 0, 0]) + qkv_nodes = self.model.match_parent_path(start_node, ['Add', 'Einsum', 'Transpose', 'MatMul'], + [1, None, 0, 0]) if qkv_nodes is not None: (_, einsum_node, transpose_qkv, matmul_qkv) = qkv_nodes else: @@ -284,16 +285,16 @@ class FusionAttention(Fusion): if children_types.count('MatMul') != 3: return - v_nodes = self.model.match_parent_path(matmul_qkv, ['Transpose', 'Reshape', 'Add', 'MatMul'], [1, 0, 0, 0]) + v_nodes = self.model.match_parent_path(matmul_qkv, ['Transpose', 'Reshape', 'Add', 'MatMul'], [1, 0, 0, None]) if v_nodes is None: logger.debug("fuse_attention: failed to match v path") return (_, _, add_v, matmul_v) = v_nodes is_distill = False - qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Add', 'Div', 'MatMul'], [0, 0, 0, 0]) + qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Add', 'Div', 'MatMul'], [0, 0, None, 0]) if qk_nodes is None: - qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Add', 'Mul', 'MatMul'], [0, 0, 0, 0]) + qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Add', 'Mul', 'MatMul'], [0, 0, None, 0]) if qk_nodes is None: qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Where', 'MatMul', 'Div'], [0, 0, 2, 0]) is_distill = True @@ -309,10 +310,10 @@ class FusionAttention(Fusion): else: (_, add_qk, _, matmul_qk) = qk_nodes - q_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Reshape', 'Add', 'MatMul'], [0, 0, 0, 0]) + q_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Reshape', 'Add', 'MatMul'], [0, 0, 0, None]) if q_nodes is None: q_nodes = self.model.match_parent_path(matmul_qk, ['Div', 'Transpose', 'Reshape', 'Add', 'MatMul'], - [0, 0, 0, 0, 0]) + [0, 0, 0, 0, None]) if q_nodes is None: logger.debug("fuse_attention: failed to match q path") return @@ -320,10 +321,10 @@ class FusionAttention(Fusion): add_q = q_nodes[-2] matmul_q = q_nodes[-1] - k_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Reshape', 'Add', 'MatMul'], [1, 0, 0, 0]) + k_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Reshape', 'Add', 'MatMul'], [1, 0, 0, None]) if k_nodes is None: k_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Transpose', 'Reshape', 'Add', 'MatMul'], - [1, 0, 0, 0, 0]) + [1, 0, 0, 0, None]) if k_nodes is None: logger.debug("fuse_attention: failed to match k path") return @@ -339,8 +340,8 @@ class FusionAttention(Fusion): output_name_to_node) else: _, mask_nodes, _ = self.model.match_parent_paths( - add_qk, [(['Mul', 'Sub', 'Cast', 'Unsqueeze', 'Unsqueeze'], [1, 0, 1, 0, 0]), - (['Mul', 'Sub', 'Unsqueeze', 'Unsqueeze'], [1, 0, 1, 0])], output_name_to_node) + add_qk, [(['Mul', 'Sub', 'Cast', 'Unsqueeze', 'Unsqueeze'], [None, 0, 1, 0, 0]), + (['Mul', 'Sub', 'Unsqueeze', 'Unsqueeze'], [None, 0, 1, 0])], output_name_to_node) if mask_nodes is None: logger.debug("fuse_attention: failed to match mask path") return diff --git a/onnxruntime/python/tools/transformers/test/bert_model_generator.py b/onnxruntime/python/tools/transformers/test/bert_model_generator.py index 5d1a65f281..79ceec701d 100644 --- a/onnxruntime/python/tools/transformers/test/bert_model_generator.py +++ b/onnxruntime/python/tools/transformers/test/bert_model_generator.py @@ -21,7 +21,17 @@ def float_tensor(name: str, shape: List[int], random=False): return helper.make_tensor(name, TensorProto.FLOAT, shape, weights) -def create_bert_attention(input_hidden_size=16, pruned_num_heads=2, pruned_head_size=4, use_float_mask=False): +def reverse_if(inputs, reverse=False): + if reverse: + inputs.reverse() + return inputs + + +def create_bert_attention(input_hidden_size=16, + pruned_num_heads=2, + pruned_head_size=4, + use_float_mask=False, + switch_add_inputs=False): # unsqueeze in opset version 13 has two inputs (axis is moved from attribute to input). has_unsqueeze_two_inputs = (version.parse(onnx.__version__) >= version.parse('1.8.0')) @@ -36,13 +46,13 @@ def create_bert_attention(input_hidden_size=16, pruned_num_heads=2, pruned_head_ # q nodes helper.make_node("MatMul", ["layernorm_out", "matmul_q_weight"], ["matmul_q_out"], "matmul_q"), - helper.make_node("Add", ["matmul_q_out", "add_q_weight"], ["add_q_out"], "add_q"), + helper.make_node("Add", reverse_if(["matmul_q_out", "add_q_weight"], switch_add_inputs), ["add_q_out"], "add_q"), helper.make_node("Reshape", ["add_q_out", "reshape_weight_1"], ["reshape_q_out"], "reshape_q"), helper.make_node("Transpose", ["reshape_q_out"], ["transpose_q_out"], "transpose_q", perm=[0, 2, 1, 3]), # k nodes helper.make_node("MatMul", ["layernorm_out", "matmul_k_weight"], ["matmul_k_out"], "matmul_k"), - helper.make_node("Add", ["matmul_k_out", "add_k_weight"], ["add_k_out"], "add_k"), + helper.make_node("Add", reverse_if(["matmul_k_out", "add_k_weight"], switch_add_inputs), ["add_k_out"], "add_k"), helper.make_node("Reshape", ["add_k_out", "reshape_weight_1"], ["reshape_k_out"], "reshape_k"), helper.make_node("Transpose", ["reshape_k_out"], ["transpose_k_out"], "transpose_k", perm=[0, 2, 3, 1]), @@ -60,7 +70,7 @@ def create_bert_attention(input_hidden_size=16, pruned_num_heads=2, pruned_head_ # qk nodes helper.make_node("MatMul", ["transpose_q_out", "transpose_k_out"], ["matmul_qk_out"], "matmul_qk"), helper.make_node("Div", ["matmul_qk_out", "div_weight"], ["div_qk_out"], "div_qk"), - helper.make_node("Add", ["div_qk_out", "mul_mask_out"], ["add_qk_out"], "add_qk"), + helper.make_node("Add", reverse_if(["div_qk_out", "mul_mask_out"], switch_add_inputs), ["add_qk_out"], "add_qk"), helper.make_node("Softmax", ["add_qk_out"], ["softmax_qk_out"], "softmax_qk", axis=3), # v nodes @@ -74,8 +84,8 @@ def create_bert_attention(input_hidden_size=16, pruned_num_heads=2, pruned_head_ helper.make_node("Transpose", ["matmul_qkv_1_out"], ["transpose_qkv_out"], "transpose_qkv", perm=[0, 2, 1, 3]), helper.make_node("Reshape", ["transpose_qkv_out", "reshape_weight_2"], ["reshape_qkv_out"], "reshape_qkv"), helper.make_node("MatMul", ["reshape_qkv_out", "matmul_qkv_weight"], ["matmul_qkv_2_out"], "matmul_qkv_2"), - helper.make_node("Add", ["matmul_qkv_2_out", "add_qkv_weight"], ["add_qkv_out"], "add_qkv"), - helper.make_node("Add", ["add_qkv_out", "layernorm_out"], ["skip_output"], "add_skip"), + helper.make_node("Add", reverse_if(["matmul_qkv_2_out", "add_qkv_weight"], switch_add_inputs), ["add_qkv_out"], "add_qkv"), + helper.make_node("Add", reverse_if(["add_qkv_out", "layernorm_out"], switch_add_inputs), ["skip_output"], "add_skip"), helper.make_node("LayerNormalization", ["skip_output", "layer_norm_weight", "layer_norm_bias"], ["output"], "layernorm2", axis=-1, @@ -127,6 +137,7 @@ def create_bert_attention(input_hidden_size=16, pruned_num_heads=2, pruned_head_ model = helper.make_model(graph) return model + def create_tf2onnx_attention_3d(input_hidden_size=16, num_heads=4, head_size=4, use_float_mask=False): # unsqueeze in opset version 13 has two inputs (axis is moved from attribute to input). has_unsqueeze_two_inputs = (version.parse(onnx.__version__) >= version.parse('1.8.0')) @@ -143,7 +154,7 @@ def create_tf2onnx_attention_3d(input_hidden_size=16, num_heads=4, head_size=4, # q nodes helper.make_node("Einsum", ["layernorm_out", "einsum_q_weight"], ["einsum_q_out"], "einsum_q", equation="abc,cde->abde"), helper.make_node("Add", ["einsum_q_out", "add_q_weight"], ["add_q_out"], "add_q"), - + # k nodes helper.make_node("Einsum", ["layernorm_out", "einsum_k_weight"], ["einsum_k_out"], "einsum_k", equation="abc,cde->abde"), helper.make_node("Add", ["einsum_k_out", "add_k_weight"], ["add_k_out"], "add_k"), @@ -229,5 +240,7 @@ def create_tf2onnx_attention_3d(input_hidden_size=16, num_heads=4, head_size=4, if __name__ == "__main__": model = create_bert_attention() onnx.save(model, "pruned_bert_attention.onnx") + model = create_bert_attention(switch_add_inputs=True) + onnx.save(model, "bert_attention_reverse_add_order.onnx") model = create_tf2onnx_attention_3d() - onnx.save(model, "bert_3d_attention.onnx") \ No newline at end of file + onnx.save(model, "bert_3d_attention.onnx") diff --git a/onnxruntime/python/tools/transformers/test/test_attention_fusion.py b/onnxruntime/python/tools/transformers/test/test_attention_fusion.py index 2543ce06ce..da70e03ac9 100644 --- a/onnxruntime/python/tools/transformers/test/test_attention_fusion.py +++ b/onnxruntime/python/tools/transformers/test/test_attention_fusion.py @@ -28,7 +28,21 @@ class TestFusion(unittest.TestCase): 'pruned_attention_opt.onnx') expected = onnx.load(expected_model_path) self.assertEqual(str(optimized_model.model.graph), str(expected.graph)) - + + def test_attention_fusion_reverse_add_order(self): + model = create_bert_attention(switch_add_inputs=True) + dir = '.' + model_path = os.path.join(dir, "bert_attention_reverse_add_order.onnx") + onnx.save(model, model_path) + optimized_model = optimize_model(model_path) + os.remove(model_path) + + # reverse add input order will get same optimized model + expected_model_path = os.path.join(os.path.dirname(__file__), 'test_data', 'fusion', + 'pruned_attention_opt.onnx') + expected = onnx.load(expected_model_path) + self.assertEqual(str(optimized_model.model.graph), str(expected.graph)) + def test_3d_attention_fusion_tf2onnx_model(self): model = create_tf2onnx_attention_3d() dir = '.' diff --git a/onnxruntime/test/framework/session_state_test.cc b/onnxruntime/test/framework/session_state_test.cc index e127a6cad8..42bd593b61 100644 --- a/onnxruntime/test/framework/session_state_test.cc +++ b/onnxruntime/test/framework/session_state_test.cc @@ -20,6 +20,12 @@ #include "gtest/gtest.h" #include "test/test_environment.h" +#ifdef USE_CUDA +#include "core/providers/cuda/cuda_execution_provider.h" +#elif USE_ROCM +#include "core/providers/rocm/rocm_execution_provider.h" +#endif + using namespace ONNX_NAMESPACE; using namespace std; namespace onnxruntime { @@ -173,6 +179,94 @@ TEST_P(SessionStateTestP, TestInitializerProcessing) { } } +#if defined(USE_CUDA) || defined(USE_ROCM) +static void TestCPUNodePlacement(const std::basic_string& model_uri, + const std::unordered_set& expected_cpu_nodes, + const std::unordered_set& expected_gpu_nodes) { + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(model_uri, model, nullptr, DefaultLoggingManager().DefaultLogger())); + Graph& graph = model->MainGraph(); + + ExecutionProviders execution_providers; +#if defined(USE_CUDA) + CUDAExecutionProviderInfo cuda_epi; + ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, std::make_unique(cuda_epi))); +#elif defined(USE_ROCM) + ROCMExecutionProviderInfo rocm_epi; + ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kRocmExecutionProvider, std::make_unique(rocm_epi))); +#endif + // add CPU EP + CPUExecutionProviderInfo epi; + ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique(epi))); + + KernelRegistryManager krm; + ASSERT_STATUS_OK(krm.RegisterKernels(execution_providers)); + + DataTransferManager dtm; + profiling::Profiler profiler; + + SessionState session_state(graph, execution_providers, false, nullptr, nullptr, dtm, + DefaultLoggingManager().DefaultLogger(), profiler); + + // Partition the graph. Here, the graph partitioner assigns EPs to the nodes + GraphPartitioner partitioner(krm, execution_providers); + ASSERT_STATUS_OK(partitioner.Partition(graph, session_state.ExportDll(), session_state.GetMutableFuncMgr())); + + // check which nodes are assigned to CPU and GPU + for (auto& node : graph.Nodes()) { + // assert that EP is assigned + ASSERT_TRUE(!node.GetExecutionProviderType().empty()); + auto& ep = node.GetExecutionProviderType(); + if (ep == onnxruntime::kCudaExecutionProvider || ep == onnxruntime::kRocmExecutionProvider) { + ASSERT_TRUE(expected_gpu_nodes.count(node.Name())) << "Node not found in expected gpu nodes: " << node.Name(); + } else if (ep == onnxruntime::kCpuExecutionProvider) { + ASSERT_TRUE(expected_cpu_nodes.count(node.Name())) << "Node not found in expected cpu nodes: " << node.Name(); + } else { + ASSERT_TRUE(false) << "Invalid execution provider assigned to node: " << node.Name() << " , value: " << ep; + } + } +} + +TEST(SessionStateTest, CPUPlacementTest0) { + std::unordered_set expected_cpu_nodes = {"reshape", "shape1", "const1", "mul", "equal", "where"}; + std::unordered_set expected_gpu_nodes = {"shape0", "expand"}; + TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_0.onnx"), expected_cpu_nodes, expected_gpu_nodes); +} +TEST(SessionStateTest, CPUPlacementTest1) { + std::unordered_set expected_cpu_nodes = {"const1"}; + std::unordered_set expected_gpu_nodes = {"shape0", "expand"}; + TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_1.onnx"), expected_cpu_nodes, expected_gpu_nodes); +} +TEST(SessionStateTest, CPUPlacementTest2) { + std::unordered_set expected_cpu_nodes = {"range"}; + std::unordered_set expected_gpu_nodes = {"size0", "reduce"}; + TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_2.onnx"), expected_cpu_nodes, expected_gpu_nodes); +} +TEST(SessionStateTest, CPUPlacementTest3) { + std::unordered_set expected_cpu_nodes = {"range0", "range1"}; + std::unordered_set expected_gpu_nodes = {"size0", "reduce0", "identity", "size1", "reduce1", "sum"}; + TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_3.onnx"), expected_cpu_nodes, expected_gpu_nodes); +} +TEST(SessionStateTest, CPUPlacementTest4) { + // Currently, the behaviour is different for RocM and CUDA EP as Rocm EP is missing a valid kernel + // for ReduceSum for int64 type. This causes the backward trace in GetCpuPreferredNodes to stop + // earlier. The expected values can be modified to match CUDA once the RocM EP kernel is updated +#if defined(USE_CUDA) + std::unordered_set expected_cpu_nodes = {"range", "reduce", "const1"}; + std::unordered_set expected_gpu_nodes = {"size0", "expand"}; +#elif defined(USE_ROCM) + std::unordered_set expected_cpu_nodes = {"const1", "reduce"}; + std::unordered_set expected_gpu_nodes = {"size0", "expand", "range"}; +#endif + TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_4.onnx"), expected_cpu_nodes, expected_gpu_nodes); +} +TEST(SessionStateTest, CPUPlacementTest5) { + std::unordered_set expected_cpu_nodes = {"gather0", "gather1", "concat"}; + std::unordered_set expected_gpu_nodes = {"shape0", "shape1", "reshape"}; + TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_5.onnx"), expected_cpu_nodes, expected_gpu_nodes); +} +#endif + // Test that we allocate memory for an initializer from non-arena memory even if we provide an arena-based allocator // if the relevant session option config flag is set // For this test we need to enable the arena-based allocator which is not supported on x86 builds, so diff --git a/onnxruntime/test/testdata/cpu_fallback_pattern_0.onnx b/onnxruntime/test/testdata/cpu_fallback_pattern_0.onnx new file mode 100644 index 0000000000..4186edb736 Binary files /dev/null and b/onnxruntime/test/testdata/cpu_fallback_pattern_0.onnx differ diff --git a/onnxruntime/test/testdata/cpu_fallback_pattern_1.onnx b/onnxruntime/test/testdata/cpu_fallback_pattern_1.onnx new file mode 100644 index 0000000000..81e7abb77c Binary files /dev/null and b/onnxruntime/test/testdata/cpu_fallback_pattern_1.onnx differ diff --git a/onnxruntime/test/testdata/cpu_fallback_pattern_2.onnx b/onnxruntime/test/testdata/cpu_fallback_pattern_2.onnx new file mode 100644 index 0000000000..6a6f25b0ba Binary files /dev/null and b/onnxruntime/test/testdata/cpu_fallback_pattern_2.onnx differ diff --git a/onnxruntime/test/testdata/cpu_fallback_pattern_3.onnx b/onnxruntime/test/testdata/cpu_fallback_pattern_3.onnx new file mode 100644 index 0000000000..5285a240d2 Binary files /dev/null and b/onnxruntime/test/testdata/cpu_fallback_pattern_3.onnx differ diff --git a/onnxruntime/test/testdata/cpu_fallback_pattern_4.onnx b/onnxruntime/test/testdata/cpu_fallback_pattern_4.onnx new file mode 100644 index 0000000000..be93737c4f Binary files /dev/null and b/onnxruntime/test/testdata/cpu_fallback_pattern_4.onnx differ diff --git a/onnxruntime/test/testdata/cpu_fallback_pattern_5.onnx b/onnxruntime/test/testdata/cpu_fallback_pattern_5.onnx new file mode 100644 index 0000000000..f2d866badb Binary files /dev/null and b/onnxruntime/test/testdata/cpu_fallback_pattern_5.onnx differ diff --git a/onnxruntime/test/testdata/cpu_fallback_test_gen.py b/onnxruntime/test/testdata/cpu_fallback_test_gen.py new file mode 100644 index 0000000000..8d8ec94639 --- /dev/null +++ b/onnxruntime/test/testdata/cpu_fallback_test_gen.py @@ -0,0 +1,170 @@ +import onnx +from onnx import helper +from onnx import TensorProto +from onnx import shape_inference +import numpy as np + +graph_def_0 = helper.make_graph( + nodes=[ + helper.make_node(op_type="Shape", inputs=['A'], outputs=['A_shape'], name='shape0'), + helper.make_node(op_type="Reshape", inputs=['A_shape', 'shape'], outputs=['A_reshaped'], name='reshape'), + helper.make_node(op_type="Shape", inputs=['A_reshaped'], outputs=['A_shape1'], name='shape1'), + helper.make_node(op_type="ConstantOfShape", inputs=['A_shape1'], outputs=['const1'], name='const1', value=helper.make_tensor('val', TensorProto.INT64, + [1], [1])), + helper.make_node(op_type="Mul", inputs=['const1', 'neg_one'], outputs=['mul'], name='mul'), + helper.make_node(op_type="Equal", inputs=['A_reshaped', 'mul'], outputs=['equal'], name='equal'), + helper.make_node(op_type="Where", inputs=['equal', 'const1', 'A_reshaped'], outputs=['where'], name='where'), + helper.make_node(op_type="Expand", inputs=['B','where'], outputs=['C'], name='expand'), + + ], + name='test-model', + inputs=[ + # create inputs with symbolic dims + helper.make_tensor_value_info("A", TensorProto.FLOAT, None), + helper.make_tensor_value_info("B", TensorProto.FLOAT, None), + ], + outputs=[ + helper.make_tensor_value_info('C', TensorProto.FLOAT, None) + ], + initializer=[ + helper.make_tensor('shape', TensorProto.INT64, [1], [-1]), + helper.make_tensor('neg_one', TensorProto.INT64, [1], [-1]), + ]) + +model = helper.make_model(graph_def_0, opset_imports=[helper.make_operatorsetid("", 12)]) +onnx.save_model(model, "cpu_fallback_pattern_0.onnx") + +graph_def_1 = helper.make_graph( + nodes=[ + helper.make_node(op_type="Shape", inputs=['A'], outputs=['A_shape'], name='shape0'), + helper.make_node(op_type="ConstantOfShape", inputs=['A_shape'], outputs=['const1'], name='const1', value=helper.make_tensor('val', TensorProto.INT64, + [1], [1])), + helper.make_node(op_type="Expand", inputs=['B','const1'], outputs=['C'], name='expand'), + + ], + name='test-model', + inputs=[ + # create inputs with symbolic dims + helper.make_tensor_value_info("A", TensorProto.FLOAT, None), + helper.make_tensor_value_info("B", TensorProto.FLOAT, None), + ], + outputs=[ + helper.make_tensor_value_info('C', TensorProto.FLOAT, None) + ], + initializer=[]) + +model = helper.make_model(graph_def_1, opset_imports=[helper.make_operatorsetid("", 12)]) +onnx.save_model(model, "cpu_fallback_pattern_1.onnx") + + +graph_def_2 = helper.make_graph( + nodes=[ + helper.make_node(op_type="Size", inputs=['A'], outputs=['A_size'], name='size0'), + helper.make_node(op_type="Range", inputs=['zero', 'A_size', 'two'], outputs=['range'], name='range'), + helper.make_node(op_type="ReduceSum", inputs=['B', 'range'], outputs=['C'], name='reduce'), + ], + name='test-model', + inputs=[ + # create inputs with symbolic dims + helper.make_tensor_value_info("A", TensorProto.FLOAT, None), + helper.make_tensor_value_info("B", TensorProto.FLOAT, None), + ], + outputs=[ + helper.make_tensor_value_info('C', TensorProto.FLOAT, None) + ], + initializer=[ + helper.make_tensor('zero', TensorProto.INT64, [], [0]), + helper.make_tensor('two', TensorProto.INT64, [], [2]), + ]) + +model = helper.make_model(graph_def_2, opset_imports=[helper.make_operatorsetid("", 13)]) +onnx.save_model(model, "cpu_fallback_pattern_2.onnx") + + +graph_def_3 = helper.make_graph( + nodes=[ + helper.make_node(op_type="Size", inputs=['A'], outputs=['size0'], name='size0'), + helper.make_node(op_type="Range", inputs=['zero', 'size0', 'two'], outputs=['range0'], name='range0'), + helper.make_node(op_type="ReduceSum", inputs=['B', 'range0'], outputs=['reduce0'], name='reduce0'), + + helper.make_node(op_type="Identity", inputs=['reduce0'], outputs=['reduce0_cpy'], name='identity'), + + helper.make_node(op_type="Size", inputs=['reduce0_cpy'], outputs=['size1'], name='size1'), + helper.make_node(op_type="Range", inputs=['zero', 'size1', 'two'], outputs=['range1'], name='range1'), + helper.make_node(op_type="ReduceSum", inputs=['B', 'range1'], outputs=['reduce1'], name='reduce1'), + + helper.make_node(op_type="Sum", inputs=['reduce0', 'reduce1'], outputs=['C'], name='sum'), + + ], + name='test-model', + inputs=[ + # create inputs with symbolic dims + helper.make_tensor_value_info("A", TensorProto.FLOAT, None), + helper.make_tensor_value_info("B", TensorProto.FLOAT, None), + ], + outputs=[ + helper.make_tensor_value_info('C', TensorProto.FLOAT, None) + ], + initializer=[ + helper.make_tensor('zero', TensorProto.INT64, [], [0]), + helper.make_tensor('two', TensorProto.INT64, [], [2]), + ]) + +model = helper.make_model(graph_def_3, opset_imports=[helper.make_operatorsetid("", 13)]) +onnx.save_model(model, "cpu_fallback_pattern_3.onnx") + +graph_def_4 = helper.make_graph( + nodes=[ + helper.make_node(op_type="Size", inputs=['A'], outputs=['A_size'], name='size0'), + helper.make_node(op_type="Range", inputs=['zero', 'A_size', 'two'], outputs=['range'], name='range'), + helper.make_node(op_type="ReduceSum", inputs=['B', 'range'], outputs=['reduce'], name='reduce'), + helper.make_node(op_type="ConstantOfShape", inputs=['reduce'], outputs=['const1'], name='const1', value=helper.make_tensor('val', TensorProto.INT64, + [1], [1])), + helper.make_node(op_type="Expand", inputs=['C','const1'], outputs=['D'], name='expand'), + + ], + name='test-model', + inputs=[ + # create inputs with symbolic dims + helper.make_tensor_value_info("A", TensorProto.FLOAT, None), + helper.make_tensor_value_info("B", TensorProto.INT64, None), + helper.make_tensor_value_info("C", TensorProto.FLOAT, None), + ], + outputs=[ + helper.make_tensor_value_info('D', TensorProto.FLOAT, None) + ], + initializer=[ + helper.make_tensor('zero', TensorProto.INT64, [], [0]), + helper.make_tensor('two', TensorProto.INT64, [], [2]), + ]) + +model = helper.make_model(graph_def_4, opset_imports=[helper.make_operatorsetid("", 13)]) +onnx.save_model(model, "cpu_fallback_pattern_4.onnx") + +graph_def_5 = helper.make_graph( + nodes=[ + helper.make_node(op_type="Shape", inputs=['A'], outputs=['A_shape'], name='shape0'), + helper.make_node(op_type="Gather", inputs=['A_shape', 'zero'], outputs=['batch'], name='gather0'), + helper.make_node(op_type="Concat", inputs=['batch', 'seq_len'], outputs=['shape'], name='concat', axis=0), + helper.make_node(op_type="Shape", inputs=['B'], outputs=['B_shape'], name='shape1'), + helper.make_node(op_type="Gather", inputs=['B_shape', 'one'], outputs=['seq_len'], name='gather1'), + helper.make_node(op_type="Reshape", inputs=['C','shape'], outputs=['D'], name='reshape'), + + ], + name='test-model', + inputs=[ + # create inputs with symbolic dims + helper.make_tensor_value_info("A", TensorProto.FLOAT, None), + helper.make_tensor_value_info("B", TensorProto.INT64, None), + helper.make_tensor_value_info("C", TensorProto.FLOAT, None), + ], + outputs=[ + helper.make_tensor_value_info('D', TensorProto.FLOAT, None) + ], + initializer=[ + helper.make_tensor('zero', TensorProto.INT64, [1], [0]), + helper.make_tensor('one', TensorProto.INT64, [1], [1]), + ]) + +model = helper.make_model(graph_def_5, opset_imports=[helper.make_operatorsetid("", 13)]) +onnx.save_model(model, "cpu_fallback_pattern_5.onnx")