mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-31 23:27:43 +00:00
Revert (#7663)
This commit is contained in:
parent
1ab8a95eb6
commit
7bb3f243ff
9 changed files with 12 additions and 393 deletions
|
|
@ -7,10 +7,7 @@
|
|||
|
||||
#include "onnx/defs/data_type_utils.h"
|
||||
|
||||
#include "core/framework/execution_providers.h"
|
||||
#include "core/framework/kernel_registry_manager.h"
|
||||
#include "core/framework/op_kernel.h"
|
||||
#include "core/providers/cpu/cpu_execution_provider.h"
|
||||
|
||||
using namespace ONNX_NAMESPACE::Utils;
|
||||
|
||||
|
|
@ -48,43 +45,18 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
|
|||
return node_id_to_order_map[n1] > node_id_to_order_map[n2];
|
||||
};
|
||||
|
||||
// If return false, n2 will be output first; If return true, n1 will be output first
|
||||
auto lesser_order_comp = [&](const NodeIndex n1, const NodeIndex n2) {
|
||||
return node_id_to_order_map[n1] < node_id_to_order_map[n2];
|
||||
};
|
||||
|
||||
std::priority_queue<NodeIndex, std::vector<NodeIndex>, decltype(greater_order_comp)> candidates_fw(greater_order_comp);
|
||||
std::priority_queue<NodeIndex, std::vector<NodeIndex>, decltype(lesser_order_comp)> candidates_bw(lesser_order_comp);
|
||||
std::priority_queue<NodeIndex, std::vector<NodeIndex>, decltype(greater_order_comp)> candidates(greater_order_comp);
|
||||
std::unordered_set<NodeIndex> visited;
|
||||
|
||||
std::unordered_set<const NodeArg*> cpu_args;
|
||||
std::unordered_set<const NodeArg*> cpu_output_args;
|
||||
std::unordered_set<NodeIndex> provider_nodes;
|
||||
std::unordered_map<NodeIndex, const KernelCreateInfo*> node_to_kernel;
|
||||
std::unordered_set<NodeIndex> cpu_kernel_available;
|
||||
|
||||
// create a temp CPU kernel registry
|
||||
KernelRegistryManager mgr;
|
||||
ExecutionProviders cpu_ep;
|
||||
CPUExecutionProviderInfo epi{false};
|
||||
ORT_ENFORCE(cpu_ep.Add(kCpuExecutionProvider, std::make_unique<CPUExecutionProvider>(epi)).IsOK());
|
||||
ORT_ENFORCE(mgr.RegisterKernels(cpu_ep).IsOK());
|
||||
std::vector<const KernelRegistry*> cpu_kernel_registries = mgr.GetKernelRegistriesByProviderType(kCpuExecutionProvider);
|
||||
|
||||
for (auto& node_id : tentative_nodes) {
|
||||
provider_nodes.insert(node_id);
|
||||
const Node* node = graph.GetNode(node_id);
|
||||
|
||||
const KernelCreateInfo* kernel_info = nullptr;
|
||||
|
||||
// Get the CPU kernel availability for this node
|
||||
for (auto registry : cpu_kernel_registries) {
|
||||
auto st = registry->TryFindKernel(*node, kCpuExecutionProvider, &kernel_info);
|
||||
if (st.IsOK()) {
|
||||
cpu_kernel_available.insert(node_id);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for (auto registry : kernel_registries) {
|
||||
auto st = registry->TryFindKernel(*node, provider_type, &kernel_info);
|
||||
if (st.IsOK())
|
||||
|
|
@ -99,26 +71,11 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
|
|||
node->OutputDefs(),
|
||||
[&](const NodeArg& node_arg, size_t out_index) {
|
||||
if (kernel_info->kernel_def->IsOutputOnCpu(out_index)) {
|
||||
cpu_args.insert(&node_arg);
|
||||
cpu_output_args.insert(&node_arg);
|
||||
auto consumer_nodes = graph.GetConsumerNodes(node_arg.Name());
|
||||
for (auto& consumer_node : consumer_nodes) {
|
||||
candidates_fw.push(consumer_node->Index());
|
||||
LOGS_DEFAULT(INFO) << "Candidate for fallback CPU execution in forward trace: " << consumer_node->Name();
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}));
|
||||
|
||||
// then, find all the direct producers of cpu tensors.
|
||||
ORT_THROW_IF_ERROR(node->ForEachWithIndex(
|
||||
node->InputDefs(),
|
||||
[&](const NodeArg& node_arg, size_t in_index) {
|
||||
if (kernel_info->kernel_def->IsInputOnCpu(in_index)) {
|
||||
cpu_args.insert(&node_arg);
|
||||
auto producer_node = graph.GetProducerNode(node_arg.Name());
|
||||
if (producer_node != nullptr) {
|
||||
candidates_bw.push(producer_node->Index());
|
||||
LOGS_DEFAULT(INFO) << "Candidate for fallback CPU execution in backward trace: " << producer_node->Name();
|
||||
candidates.push(consumer_node->Index());
|
||||
LOGS_DEFAULT(INFO) << "Candidate for fallback CPU execution: " << consumer_node->Name();
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
|
|
@ -132,9 +89,9 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
|
|||
// The detail:
|
||||
// for each candidate, if one of its inputs is a cpu tensor and the Non-CPU kernel doesn't mark it as cpu input,
|
||||
// force the node to CPU to avoid memory copy and add its output to the small cpu tensors.
|
||||
while (!candidates_fw.empty()) {
|
||||
NodeIndex cur = candidates_fw.top();
|
||||
candidates_fw.pop();
|
||||
while (!candidates.empty()) {
|
||||
NodeIndex cur = candidates.top();
|
||||
candidates.pop();
|
||||
if (visited.count(cur) != 0)
|
||||
continue;
|
||||
visited.insert(cur);
|
||||
|
|
@ -161,7 +118,7 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
|
|||
}
|
||||
|
||||
// the input is not a CPU tensor
|
||||
if (cpu_args.find(input) == cpu_args.end()) {
|
||||
if (cpu_output_args.find(input) == cpu_output_args.end()) {
|
||||
place_in_cpu = false;
|
||||
break;
|
||||
}
|
||||
|
|
@ -173,90 +130,16 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
|
|||
}
|
||||
}
|
||||
|
||||
if (place_in_cpu && cpu_kernel_available.count(cur) != 0) {
|
||||
if (place_in_cpu) {
|
||||
cpu_nodes.insert(cur);
|
||||
LOGS_DEFAULT(INFO) << "ORT optimization- Force fallback to CPU execution for node: " << node->Name()
|
||||
<< " because the CPU execution path is deemed faster than overhead involved with execution on other EPs "
|
||||
<< " capable of executing this node";
|
||||
for (auto* output : node->OutputDefs()) {
|
||||
cpu_args.insert(output);
|
||||
cpu_output_args.insert(output);
|
||||
}
|
||||
for (auto it = node->OutputNodesBegin(); it != node->OutputNodesEnd(); ++it) {
|
||||
candidates_fw.push((*it).Index());
|
||||
}
|
||||
}
|
||||
}
|
||||
// clear the visited to prepare for backward trace
|
||||
visited.clear();
|
||||
// Trace the graph backwards to find additional CPU nodes
|
||||
// Starting from nodes that must produce an output on CPU, trace the producer nodes
|
||||
// The trace stops when we find that
|
||||
// 1) The node is already picked for CPU
|
||||
// 2) Input/Output type is unsupported on CPU(float16/bfloat16)
|
||||
// 3) The output is not a CPU tensor
|
||||
// 4) The search hits a node that produces a CPU output
|
||||
while (!candidates_bw.empty()) {
|
||||
NodeIndex cur = candidates_bw.top();
|
||||
candidates_bw.pop();
|
||||
if (visited.count(cur) != 0)
|
||||
continue;
|
||||
visited.insert(cur);
|
||||
|
||||
// node is already picked for CPU
|
||||
if (cpu_nodes.count(cur) != 0)
|
||||
continue;
|
||||
|
||||
if (provider_nodes.find(cur) == provider_nodes.end())
|
||||
continue;
|
||||
|
||||
auto* node = graph.GetNode(cur);
|
||||
bool place_in_cpu = true;
|
||||
for (size_t i = 0; i < node->OutputDefs().size(); ++i) {
|
||||
auto* output = node->OutputDefs()[i];
|
||||
|
||||
// skip placing on CPU if the data type is float16 or bfloat16
|
||||
if (output->Type() == DataTypeUtils::ToType("float16") ||
|
||||
output->Type() == DataTypeUtils::ToType("bfloat16")) {
|
||||
place_in_cpu = false;
|
||||
break;
|
||||
}
|
||||
|
||||
// the output is not a CPU tensor
|
||||
if (cpu_args.find(output) == cpu_args.end()) {
|
||||
place_in_cpu = false;
|
||||
break;
|
||||
}
|
||||
|
||||
// output is a CPU tensor, but it's intended to be consumed as CPU output by the target EP
|
||||
if (node_to_kernel[cur]->kernel_def->IsOutputOnCpu(i)) {
|
||||
place_in_cpu = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Next, check if the node inputs are of supported type
|
||||
if (place_in_cpu) {
|
||||
for (size_t i = 0; i < node->InputDefs().size(); ++i) {
|
||||
auto* input = node->InputDefs()[i];
|
||||
|
||||
// skip placing on CPU if the data type is float16 or bfloat16
|
||||
if (input->Type() == DataTypeUtils::ToType("float16") ||
|
||||
input->Type() == DataTypeUtils::ToType("bfloat16")) {
|
||||
place_in_cpu = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (place_in_cpu && cpu_kernel_available.count(cur) != 0) {
|
||||
cpu_nodes.insert(cur);
|
||||
LOGS_DEFAULT(INFO) << "ORT optimization- Force fallback to CPU execution for node: " << node->Name()
|
||||
<< " because the CPU execution path is deemed faster than overhead involved with execution on other EPs "
|
||||
<< " capable of executing this node";
|
||||
for (auto* input : node->InputDefs()) {
|
||||
cpu_args.insert(input);
|
||||
}
|
||||
for (auto it = node->InputNodesBegin(); it != node->InputNodesEnd(); ++it) {
|
||||
candidates_bw.push((*it).Index());
|
||||
candidates.push((*it).Index());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -20,12 +20,6 @@
|
|||
#include "gtest/gtest.h"
|
||||
#include "test/test_environment.h"
|
||||
|
||||
#ifdef USE_CUDA
|
||||
#include "core/providers/cuda/cuda_execution_provider.h"
|
||||
#elif USE_ROCM
|
||||
#include "core/providers/rocm/rocm_execution_provider.h"
|
||||
#endif
|
||||
|
||||
using namespace ONNX_NAMESPACE;
|
||||
using namespace std;
|
||||
namespace onnxruntime {
|
||||
|
|
@ -179,94 +173,6 @@ TEST_P(SessionStateTestP, TestInitializerProcessing) {
|
|||
}
|
||||
}
|
||||
|
||||
#if defined(USE_CUDA) || defined(USE_ROCM)
|
||||
static void TestCPUNodePlacement(const std::basic_string<ORTCHAR_T>& model_uri,
|
||||
const std::unordered_set<std::string>& expected_cpu_nodes,
|
||||
const std::unordered_set<std::string>& expected_gpu_nodes) {
|
||||
std::shared_ptr<Model> model;
|
||||
ASSERT_STATUS_OK(Model::Load(model_uri, model, nullptr, DefaultLoggingManager().DefaultLogger()));
|
||||
Graph& graph = model->MainGraph();
|
||||
|
||||
ExecutionProviders execution_providers;
|
||||
#if defined(USE_CUDA)
|
||||
CUDAExecutionProviderInfo cuda_epi;
|
||||
ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, std::make_unique<CUDAExecutionProvider>(cuda_epi)));
|
||||
#elif defined(USE_ROCM)
|
||||
ROCMExecutionProviderInfo rocm_epi;
|
||||
ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kRocmExecutionProvider, std::make_unique<ROCMExecutionProvider>(rocm_epi)));
|
||||
#endif
|
||||
// add CPU EP
|
||||
CPUExecutionProviderInfo epi;
|
||||
ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique<CPUExecutionProvider>(epi)));
|
||||
|
||||
KernelRegistryManager krm;
|
||||
ASSERT_STATUS_OK(krm.RegisterKernels(execution_providers));
|
||||
|
||||
DataTransferManager dtm;
|
||||
profiling::Profiler profiler;
|
||||
|
||||
SessionState session_state(graph, execution_providers, false, nullptr, nullptr, dtm,
|
||||
DefaultLoggingManager().DefaultLogger(), profiler);
|
||||
|
||||
// Partition the graph. Here, the graph partitioner assigns EPs to the nodes
|
||||
GraphPartitioner partitioner(krm, execution_providers);
|
||||
ASSERT_STATUS_OK(partitioner.Partition(graph, session_state.ExportDll(), session_state.GetMutableFuncMgr()));
|
||||
|
||||
// check which nodes are assigned to CPU and GPU
|
||||
for (auto& node : graph.Nodes()) {
|
||||
// assert that EP is assigned
|
||||
ASSERT_TRUE(!node.GetExecutionProviderType().empty());
|
||||
auto& ep = node.GetExecutionProviderType();
|
||||
if (ep == onnxruntime::kCudaExecutionProvider || ep == onnxruntime::kRocmExecutionProvider) {
|
||||
ASSERT_TRUE(expected_gpu_nodes.count(node.Name())) << "Node not found in expected gpu nodes: " << node.Name();
|
||||
} else if (ep == onnxruntime::kCpuExecutionProvider) {
|
||||
ASSERT_TRUE(expected_cpu_nodes.count(node.Name())) << "Node not found in expected cpu nodes: " << node.Name();
|
||||
} else {
|
||||
ASSERT_TRUE(false) << "Invalid execution provider assigned to node: " << node.Name() << " , value: " << ep;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(SessionStateTest, CPUPlacementTest0) {
|
||||
std::unordered_set<std::string> expected_cpu_nodes = {"reshape", "shape1", "const1", "mul", "equal", "where"};
|
||||
std::unordered_set<std::string> expected_gpu_nodes = {"shape0", "expand"};
|
||||
TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_0.onnx"), expected_cpu_nodes, expected_gpu_nodes);
|
||||
}
|
||||
TEST(SessionStateTest, CPUPlacementTest1) {
|
||||
std::unordered_set<std::string> expected_cpu_nodes = {"const1"};
|
||||
std::unordered_set<std::string> expected_gpu_nodes = {"shape0", "expand"};
|
||||
TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_1.onnx"), expected_cpu_nodes, expected_gpu_nodes);
|
||||
}
|
||||
TEST(SessionStateTest, CPUPlacementTest2) {
|
||||
std::unordered_set<std::string> expected_cpu_nodes = {"range"};
|
||||
std::unordered_set<std::string> expected_gpu_nodes = {"size0", "reduce"};
|
||||
TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_2.onnx"), expected_cpu_nodes, expected_gpu_nodes);
|
||||
}
|
||||
TEST(SessionStateTest, CPUPlacementTest3) {
|
||||
std::unordered_set<std::string> expected_cpu_nodes = {"range0", "range1"};
|
||||
std::unordered_set<std::string> expected_gpu_nodes = {"size0", "reduce0", "identity", "size1", "reduce1", "sum"};
|
||||
TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_3.onnx"), expected_cpu_nodes, expected_gpu_nodes);
|
||||
}
|
||||
TEST(SessionStateTest, CPUPlacementTest4) {
|
||||
// Currently, the behaviour is different for the ROCm and CUDA EPs, as the ROCm EP is missing a valid kernel
|
||||
// for ReduceSum for int64 type. This causes the backward trace in GetCpuPreferredNodes to stop
|
||||
// earlier. The expected values can be modified to match CUDA once the ROCm EP kernel is updated
|
||||
#if defined(USE_CUDA)
|
||||
std::unordered_set<std::string> expected_cpu_nodes = {"range", "reduce", "const1"};
|
||||
std::unordered_set<std::string> expected_gpu_nodes = {"size0", "expand"};
|
||||
#elif defined(USE_ROCM)
|
||||
std::unordered_set<std::string> expected_cpu_nodes = {"const1", "reduce"};
|
||||
std::unordered_set<std::string> expected_gpu_nodes = {"size0", "expand", "range"};
|
||||
#endif
|
||||
TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_4.onnx"), expected_cpu_nodes, expected_gpu_nodes);
|
||||
}
|
||||
TEST(SessionStateTest, CPUPlacementTest5) {
|
||||
std::unordered_set<std::string> expected_cpu_nodes = {"gather0", "gather1", "concat"};
|
||||
std::unordered_set<std::string> expected_gpu_nodes = {"shape0", "shape1", "reshape"};
|
||||
TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_5.onnx"), expected_cpu_nodes, expected_gpu_nodes);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Test that we allocate memory for an initializer from non-arena memory even if we provide an arena-based allocator
|
||||
// if the relevant session option config flag is set
|
||||
// For this test we need to enable the arena-based allocator which is not supported on x86 builds, so
|
||||
|
|
|
|||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
170
onnxruntime/test/testdata/cpu_fallback_test_gen.py
vendored
170
onnxruntime/test/testdata/cpu_fallback_test_gen.py
vendored
|
|
@ -1,170 +0,0 @@
|
|||
import onnx
|
||||
from onnx import helper
|
||||
from onnx import TensorProto
|
||||
from onnx import shape_inference
|
||||
import numpy as np
|
||||
|
||||
graph_def_0 = helper.make_graph(
|
||||
nodes=[
|
||||
helper.make_node(op_type="Shape", inputs=['A'], outputs=['A_shape'], name='shape0'),
|
||||
helper.make_node(op_type="Reshape", inputs=['A_shape', 'shape'], outputs=['A_reshaped'], name='reshape'),
|
||||
helper.make_node(op_type="Shape", inputs=['A_reshaped'], outputs=['A_shape1'], name='shape1'),
|
||||
helper.make_node(op_type="ConstantOfShape", inputs=['A_shape1'], outputs=['const1'], name='const1', value=helper.make_tensor('val', TensorProto.INT64,
|
||||
[1], [1])),
|
||||
helper.make_node(op_type="Mul", inputs=['const1', 'neg_one'], outputs=['mul'], name='mul'),
|
||||
helper.make_node(op_type="Equal", inputs=['A_reshaped', 'mul'], outputs=['equal'], name='equal'),
|
||||
helper.make_node(op_type="Where", inputs=['equal', 'const1', 'A_reshaped'], outputs=['where'], name='where'),
|
||||
helper.make_node(op_type="Expand", inputs=['B','where'], outputs=['C'], name='expand'),
|
||||
|
||||
],
|
||||
name='test-model',
|
||||
inputs=[
|
||||
# create inputs with symbolic dims
|
||||
helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
|
||||
helper.make_tensor_value_info("B", TensorProto.FLOAT, None),
|
||||
],
|
||||
outputs=[
|
||||
helper.make_tensor_value_info('C', TensorProto.FLOAT, None)
|
||||
],
|
||||
initializer=[
|
||||
helper.make_tensor('shape', TensorProto.INT64, [1], [-1]),
|
||||
helper.make_tensor('neg_one', TensorProto.INT64, [1], [-1]),
|
||||
])
|
||||
|
||||
model = helper.make_model(graph_def_0, opset_imports=[helper.make_operatorsetid("", 12)])
|
||||
onnx.save_model(model, "cpu_fallback_pattern_0.onnx")
|
||||
|
||||
graph_def_1 = helper.make_graph(
|
||||
nodes=[
|
||||
helper.make_node(op_type="Shape", inputs=['A'], outputs=['A_shape'], name='shape0'),
|
||||
helper.make_node(op_type="ConstantOfShape", inputs=['A_shape'], outputs=['const1'], name='const1', value=helper.make_tensor('val', TensorProto.INT64,
|
||||
[1], [1])),
|
||||
helper.make_node(op_type="Expand", inputs=['B','const1'], outputs=['C'], name='expand'),
|
||||
|
||||
],
|
||||
name='test-model',
|
||||
inputs=[
|
||||
# create inputs with symbolic dims
|
||||
helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
|
||||
helper.make_tensor_value_info("B", TensorProto.FLOAT, None),
|
||||
],
|
||||
outputs=[
|
||||
helper.make_tensor_value_info('C', TensorProto.FLOAT, None)
|
||||
],
|
||||
initializer=[])
|
||||
|
||||
model = helper.make_model(graph_def_1, opset_imports=[helper.make_operatorsetid("", 12)])
|
||||
onnx.save_model(model, "cpu_fallback_pattern_1.onnx")
|
||||
|
||||
|
||||
graph_def_2 = helper.make_graph(
|
||||
nodes=[
|
||||
helper.make_node(op_type="Size", inputs=['A'], outputs=['A_size'], name='size0'),
|
||||
helper.make_node(op_type="Range", inputs=['zero', 'A_size', 'two'], outputs=['range'], name='range'),
|
||||
helper.make_node(op_type="ReduceSum", inputs=['B', 'range'], outputs=['C'], name='reduce'),
|
||||
],
|
||||
name='test-model',
|
||||
inputs=[
|
||||
# create inputs with symbolic dims
|
||||
helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
|
||||
helper.make_tensor_value_info("B", TensorProto.FLOAT, None),
|
||||
],
|
||||
outputs=[
|
||||
helper.make_tensor_value_info('C', TensorProto.FLOAT, None)
|
||||
],
|
||||
initializer=[
|
||||
helper.make_tensor('zero', TensorProto.INT64, [], [0]),
|
||||
helper.make_tensor('two', TensorProto.INT64, [], [2]),
|
||||
])
|
||||
|
||||
model = helper.make_model(graph_def_2, opset_imports=[helper.make_operatorsetid("", 13)])
|
||||
onnx.save_model(model, "cpu_fallback_pattern_2.onnx")
|
||||
|
||||
|
||||
graph_def_3 = helper.make_graph(
|
||||
nodes=[
|
||||
helper.make_node(op_type="Size", inputs=['A'], outputs=['size0'], name='size0'),
|
||||
helper.make_node(op_type="Range", inputs=['zero', 'size0', 'two'], outputs=['range0'], name='range0'),
|
||||
helper.make_node(op_type="ReduceSum", inputs=['B', 'range0'], outputs=['reduce0'], name='reduce0'),
|
||||
|
||||
helper.make_node(op_type="Identity", inputs=['reduce0'], outputs=['reduce0_cpy'], name='identity'),
|
||||
|
||||
helper.make_node(op_type="Size", inputs=['reduce0_cpy'], outputs=['size1'], name='size1'),
|
||||
helper.make_node(op_type="Range", inputs=['zero', 'size1', 'two'], outputs=['range1'], name='range1'),
|
||||
helper.make_node(op_type="ReduceSum", inputs=['B', 'range1'], outputs=['reduce1'], name='reduce1'),
|
||||
|
||||
helper.make_node(op_type="Sum", inputs=['reduce0', 'reduce1'], outputs=['C'], name='sum'),
|
||||
|
||||
],
|
||||
name='test-model',
|
||||
inputs=[
|
||||
# create inputs with symbolic dims
|
||||
helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
|
||||
helper.make_tensor_value_info("B", TensorProto.FLOAT, None),
|
||||
],
|
||||
outputs=[
|
||||
helper.make_tensor_value_info('C', TensorProto.FLOAT, None)
|
||||
],
|
||||
initializer=[
|
||||
helper.make_tensor('zero', TensorProto.INT64, [], [0]),
|
||||
helper.make_tensor('two', TensorProto.INT64, [], [2]),
|
||||
])
|
||||
|
||||
model = helper.make_model(graph_def_3, opset_imports=[helper.make_operatorsetid("", 13)])
|
||||
onnx.save_model(model, "cpu_fallback_pattern_3.onnx")
|
||||
|
||||
graph_def_4 = helper.make_graph(
|
||||
nodes=[
|
||||
helper.make_node(op_type="Size", inputs=['A'], outputs=['A_size'], name='size0'),
|
||||
helper.make_node(op_type="Range", inputs=['zero', 'A_size', 'two'], outputs=['range'], name='range'),
|
||||
helper.make_node(op_type="ReduceSum", inputs=['B', 'range'], outputs=['reduce'], name='reduce'),
|
||||
helper.make_node(op_type="ConstantOfShape", inputs=['reduce'], outputs=['const1'], name='const1', value=helper.make_tensor('val', TensorProto.INT64,
|
||||
[1], [1])),
|
||||
helper.make_node(op_type="Expand", inputs=['C','const1'], outputs=['D'], name='expand'),
|
||||
|
||||
],
|
||||
name='test-model',
|
||||
inputs=[
|
||||
# create inputs with symbolic dims
|
||||
helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
|
||||
helper.make_tensor_value_info("B", TensorProto.INT64, None),
|
||||
helper.make_tensor_value_info("C", TensorProto.FLOAT, None),
|
||||
],
|
||||
outputs=[
|
||||
helper.make_tensor_value_info('D', TensorProto.FLOAT, None)
|
||||
],
|
||||
initializer=[
|
||||
helper.make_tensor('zero', TensorProto.INT64, [], [0]),
|
||||
helper.make_tensor('two', TensorProto.INT64, [], [2]),
|
||||
])
|
||||
|
||||
model = helper.make_model(graph_def_4, opset_imports=[helper.make_operatorsetid("", 13)])
|
||||
onnx.save_model(model, "cpu_fallback_pattern_4.onnx")
|
||||
|
||||
graph_def_5 = helper.make_graph(
|
||||
nodes=[
|
||||
helper.make_node(op_type="Shape", inputs=['A'], outputs=['A_shape'], name='shape0'),
|
||||
helper.make_node(op_type="Gather", inputs=['A_shape', 'zero'], outputs=['batch'], name='gather0'),
|
||||
helper.make_node(op_type="Concat", inputs=['batch', 'seq_len'], outputs=['shape'], name='concat', axis=0),
|
||||
helper.make_node(op_type="Shape", inputs=['B'], outputs=['B_shape'], name='shape1'),
|
||||
helper.make_node(op_type="Gather", inputs=['B_shape', 'one'], outputs=['seq_len'], name='gather1'),
|
||||
helper.make_node(op_type="Reshape", inputs=['C','shape'], outputs=['D'], name='reshape'),
|
||||
|
||||
],
|
||||
name='test-model',
|
||||
inputs=[
|
||||
# create inputs with symbolic dims
|
||||
helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
|
||||
helper.make_tensor_value_info("B", TensorProto.INT64, None),
|
||||
helper.make_tensor_value_info("C", TensorProto.FLOAT, None),
|
||||
],
|
||||
outputs=[
|
||||
helper.make_tensor_value_info('D', TensorProto.FLOAT, None)
|
||||
],
|
||||
initializer=[
|
||||
helper.make_tensor('zero', TensorProto.INT64, [1], [0]),
|
||||
helper.make_tensor('one', TensorProto.INT64, [1], [1]),
|
||||
])
|
||||
|
||||
model = helper.make_model(graph_def_5, opset_imports=[helper.make_operatorsetid("", 13)])
|
||||
onnx.save_model(model, "cpu_fallback_pattern_5.onnx")
|
||||
Loading…
Reference in a new issue