Revert (#7663)

2026-07-21 19:18:55 +00:00 · 2021-05-13 14:11:17 -07:00 · 2021-05-13 14:11:17 -07:00 · 7bb3f243ff
commit 7bb3f243ff
parent 1ab8a95eb6
9 changed files with 12 additions and 393 deletions
--- a/onnxruntime/core/framework/fallback_cpu_capability.cc
+++ b/onnxruntime/core/framework/fallback_cpu_capability.cc
@ -7,10 +7,7 @@

 #include "onnx/defs/data_type_utils.h"

-#include "core/framework/execution_providers.h"
-#include "core/framework/kernel_registry_manager.h"
 #include "core/framework/op_kernel.h"
-#include "core/providers/cpu/cpu_execution_provider.h"

 using namespace ONNX_NAMESPACE::Utils;

@ -48,43 +45,18 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
    return node_id_to_order_map[n1] > node_id_to_order_map[n2];
  };

-  // If return false, n2 will be output first; If return true, n1 will be output first
-  auto lesser_order_comp = [&](const NodeIndex n1, const NodeIndex n2) {
-    return node_id_to_order_map[n1] < node_id_to_order_map[n2];
-  };
-
-  std::priority_queue<NodeIndex, std::vector<NodeIndex>, decltype(greater_order_comp)> candidates_fw(greater_order_comp);
-  std::priority_queue<NodeIndex, std::vector<NodeIndex>, decltype(lesser_order_comp)> candidates_bw(lesser_order_comp);
+  std::priority_queue<NodeIndex, std::vector<NodeIndex>, decltype(greater_order_comp)> candidates(greater_order_comp);
  std::unordered_set<NodeIndex> visited;

-  std::unordered_set<const NodeArg*> cpu_args;
+  std::unordered_set<const NodeArg*> cpu_output_args;
  std::unordered_set<NodeIndex> provider_nodes;
  std::unordered_map<NodeIndex, const KernelCreateInfo*> node_to_kernel;
-  std::unordered_set<NodeIndex> cpu_kernel_available;
-
-  // create a temp CPU kernel registry
-  KernelRegistryManager mgr;
-  ExecutionProviders cpu_ep;
-  CPUExecutionProviderInfo epi{false};
-  ORT_ENFORCE(cpu_ep.Add(kCpuExecutionProvider, std::make_unique<CPUExecutionProvider>(epi)).IsOK());
-  ORT_ENFORCE(mgr.RegisterKernels(cpu_ep).IsOK());
-  std::vector<const KernelRegistry*> cpu_kernel_registries = mgr.GetKernelRegistriesByProviderType(kCpuExecutionProvider);

  for (auto& node_id : tentative_nodes) {
    provider_nodes.insert(node_id);
    const Node* node = graph.GetNode(node_id);

    const KernelCreateInfo* kernel_info = nullptr;
-
-    // Get the CPU kernel availability for this node
-    for (auto registry : cpu_kernel_registries) {
-      auto st = registry->TryFindKernel(*node, kCpuExecutionProvider, &kernel_info);
-      if (st.IsOK()) {
-        cpu_kernel_available.insert(node_id);
-        break;
-      }
-    }
-
    for (auto registry : kernel_registries) {
      auto st = registry->TryFindKernel(*node, provider_type, &kernel_info);
      if (st.IsOK())
@ -99,26 +71,11 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
        node->OutputDefs(),
        [&](const NodeArg& node_arg, size_t out_index) {
          if (kernel_info->kernel_def->IsOutputOnCpu(out_index)) {
-            cpu_args.insert(&node_arg);
+            cpu_output_args.insert(&node_arg);
            auto consumer_nodes = graph.GetConsumerNodes(node_arg.Name());
            for (auto& consumer_node : consumer_nodes) {
-              candidates_fw.push(consumer_node->Index());
-              LOGS_DEFAULT(INFO) << "Candidate for fallback CPU execution in forward trace: " << consumer_node->Name();
-            }
-          }
-          return Status::OK();
-        }));
-
-    // then, find all the direct producers of cpu tensors.
-    ORT_THROW_IF_ERROR(node->ForEachWithIndex(
-        node->InputDefs(),
-        [&](const NodeArg& node_arg, size_t in_index) {
-          if (kernel_info->kernel_def->IsInputOnCpu(in_index)) {
-            cpu_args.insert(&node_arg);
-            auto producer_node = graph.GetProducerNode(node_arg.Name());
-            if (producer_node != nullptr) {
-              candidates_bw.push(producer_node->Index());
-              LOGS_DEFAULT(INFO) << "Candidate for fallback CPU execution in backward trace: " << producer_node->Name();
+              candidates.push(consumer_node->Index());
+              LOGS_DEFAULT(INFO) << "Candidate for fallback CPU execution: " << consumer_node->Name();
            }
          }
          return Status::OK();
@ -132,9 +89,9 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
  // The detail:
  // for each candidate, if one of its inputs is a CPU tensor and the non-CPU kernel doesn't mark it as a CPU input,
  // force the node to CPU to avoid a memory copy and add its outputs to the small CPU tensors.
-  while (!candidates_fw.empty()) {
-    NodeIndex cur = candidates_fw.top();
-    candidates_fw.pop();
+  while (!candidates.empty()) {
+    NodeIndex cur = candidates.top();
+    candidates.pop();
    if (visited.count(cur) != 0)
      continue;
    visited.insert(cur);
@ -161,7 +118,7 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
      }

      // the input is not a CPU tensor
-      if (cpu_args.find(input) == cpu_args.end()) {
+      if (cpu_output_args.find(input) == cpu_output_args.end()) {
        place_in_cpu = false;
        break;
      }
@ -173,90 +130,16 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
      }
    }

-    if (place_in_cpu && cpu_kernel_available.count(cur) != 0) {
+    if (place_in_cpu) {
      cpu_nodes.insert(cur);
      LOGS_DEFAULT(INFO) << "ORT optimization- Force fallback to CPU execution for node: " << node->Name()
                         << " because the CPU execution path is deemed faster than overhead involved with execution on other EPs "
                         << " capable of executing this node";
      for (auto* output : node->OutputDefs()) {
-        cpu_args.insert(output);
+        cpu_output_args.insert(output);
      }
      for (auto it = node->OutputNodesBegin(); it != node->OutputNodesEnd(); ++it) {
-        candidates_fw.push((*it).Index());
-      }
-    }
-  }
-  // clear the visited to prepare for backward trace
-  visited.clear();
-  // Trace the graph backwards to find additional CPU nodes
-  // Starting from nodes that must produce an output on CPU, trace the producer nodes
-  // The trace stops when we find that
-  // 1) The node is already picked for CPU
-  // 2) Input/Output type is unsupported on CPU(float16/bfloat16)
-  // 3) The output is not a CPU tensor
-  // 4) The search hits a node that produces a CPU output
-  while (!candidates_bw.empty()) {
-    NodeIndex cur = candidates_bw.top();
-    candidates_bw.pop();
-    if (visited.count(cur) != 0)
-      continue;
-    visited.insert(cur);
-
-    // node is already picked for CPU
-    if (cpu_nodes.count(cur) != 0)
-      continue;
-
-    if (provider_nodes.find(cur) == provider_nodes.end())
-      continue;
-
-    auto* node = graph.GetNode(cur);
-    bool place_in_cpu = true;
-    for (size_t i = 0; i < node->OutputDefs().size(); ++i) {
-      auto* output = node->OutputDefs()[i];
-
-      // skip placing on CPU if the data typs is float16 or bfloat16
-      if (output->Type() == DataTypeUtils::ToType("float16") ||
-          output->Type() == DataTypeUtils::ToType("bfloat16")) {
-        place_in_cpu = false;
-        break;
-      }
-
-      // the output is not a CPU tensor
-      if (cpu_args.find(output) == cpu_args.end()) {
-        place_in_cpu = false;
-        break;
-      }
-
-      // output is a CPU tensor, but it's intended to be consumed as CPU output by the target EP
-      if (node_to_kernel[cur]->kernel_def->IsOutputOnCpu(i)) {
-        place_in_cpu = false;
-        break;
-      }
-    }
-    // Next, check if the node inputs are of supported type
-    if (place_in_cpu) {
-      for (size_t i = 0; i < node->InputDefs().size(); ++i) {
-        auto* input = node->InputDefs()[i];
-
-        // skip placing on CPU if the data typs is float16 or bfloat16
-        if (input->Type() == DataTypeUtils::ToType("float16") ||
-            input->Type() == DataTypeUtils::ToType("bfloat16")) {
-          place_in_cpu = false;
-          break;
-        }
-      }
-    }
-
-    if (place_in_cpu && cpu_kernel_available.count(cur) != 0) {
-      cpu_nodes.insert(cur);
-      LOGS_DEFAULT(INFO) << "ORT optimization- Force fallback to CPU execution for node: " << node->Name()
-                         << " because the CPU execution path is deemed faster than overhead involved with execution on other EPs "
-                         << " capable of executing this node";
-      for (auto* input : node->InputDefs()) {
-        cpu_args.insert(input);
-      }
-      for (auto it = node->InputNodesBegin(); it != node->InputNodesEnd(); ++it) {
-        candidates_bw.push((*it).Index());
+        candidates.push((*it).Index());
      }
    }
  }
--- a/onnxruntime/test/framework/session_state_test.cc
+++ b/onnxruntime/test/framework/session_state_test.cc
@ -20,12 +20,6 @@
 #include "gtest/gtest.h"
 #include "test/test_environment.h"

-#ifdef USE_CUDA
-#include "core/providers/cuda/cuda_execution_provider.h"
-#elif USE_ROCM
-#include "core/providers/rocm/rocm_execution_provider.h"
-#endif
-
 using namespace ONNX_NAMESPACE;
 using namespace std;
 namespace onnxruntime {
@ -179,94 +173,6 @@ TEST_P(SessionStateTestP, TestInitializerProcessing) {
  }
 }

-#if defined(USE_CUDA) || defined(USE_ROCM)
-static void TestCPUNodePlacement(const std::basic_string<ORTCHAR_T>& model_uri,
-                                 const std::unordered_set<std::string>& expected_cpu_nodes,
-                                 const std::unordered_set<std::string>& expected_gpu_nodes) {
-  std::shared_ptr<Model> model;
-  ASSERT_STATUS_OK(Model::Load(model_uri, model, nullptr, DefaultLoggingManager().DefaultLogger()));
-  Graph& graph = model->MainGraph();
-
-  ExecutionProviders execution_providers;
-#if defined(USE_CUDA)
-  CUDAExecutionProviderInfo cuda_epi;
-  ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, std::make_unique<CUDAExecutionProvider>(cuda_epi)));
-#elif defined(USE_ROCM)
-  ROCMExecutionProviderInfo rocm_epi;
-  ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kRocmExecutionProvider, std::make_unique<ROCMExecutionProvider>(rocm_epi)));
-#endif
-  // add CPU EP
-  CPUExecutionProviderInfo epi;
-  ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique<CPUExecutionProvider>(epi)));
-
-  KernelRegistryManager krm;
-  ASSERT_STATUS_OK(krm.RegisterKernels(execution_providers));
-
-  DataTransferManager dtm;
-  profiling::Profiler profiler;
-
-  SessionState session_state(graph, execution_providers, false, nullptr, nullptr, dtm,
-                             DefaultLoggingManager().DefaultLogger(), profiler);
-
-  // Partition the graph. Here, the graph partitioner assigns EPs to the nodes
-  GraphPartitioner partitioner(krm, execution_providers);
-  ASSERT_STATUS_OK(partitioner.Partition(graph, session_state.ExportDll(), session_state.GetMutableFuncMgr()));
-
-  // check which nodes are assigned to CPU and GPU
-  for (auto& node : graph.Nodes()) {
-    // assert that EP is assigned
-    ASSERT_TRUE(!node.GetExecutionProviderType().empty());
-    auto& ep = node.GetExecutionProviderType();
-    if (ep == onnxruntime::kCudaExecutionProvider || ep == onnxruntime::kRocmExecutionProvider) {
-      ASSERT_TRUE(expected_gpu_nodes.count(node.Name())) << "Node not found in expected gpu nodes: " << node.Name();
-    } else if (ep == onnxruntime::kCpuExecutionProvider) {
-      ASSERT_TRUE(expected_cpu_nodes.count(node.Name())) << "Node not found in expected cpu nodes: " << node.Name();
-    } else {
-      ASSERT_TRUE(false) << "Invalid execution provider assigned to node: " << node.Name() << " , value: " << ep;
-    }
-  }
-}
-
-TEST(SessionStateTest, CPUPlacementTest0) {
-  std::unordered_set<std::string> expected_cpu_nodes = {"reshape", "shape1", "const1", "mul", "equal", "where"};
-  std::unordered_set<std::string> expected_gpu_nodes = {"shape0", "expand"};
-  TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_0.onnx"), expected_cpu_nodes, expected_gpu_nodes);
-}
-TEST(SessionStateTest, CPUPlacementTest1) {
-  std::unordered_set<std::string> expected_cpu_nodes = {"const1"};
-  std::unordered_set<std::string> expected_gpu_nodes = {"shape0", "expand"};
-  TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_1.onnx"), expected_cpu_nodes, expected_gpu_nodes);
-}
-TEST(SessionStateTest, CPUPlacementTest2) {
-  std::unordered_set<std::string> expected_cpu_nodes = {"range"};
-  std::unordered_set<std::string> expected_gpu_nodes = {"size0", "reduce"};
-  TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_2.onnx"), expected_cpu_nodes, expected_gpu_nodes);
-}
-TEST(SessionStateTest, CPUPlacementTest3) {
-  std::unordered_set<std::string> expected_cpu_nodes = {"range0", "range1"};
-  std::unordered_set<std::string> expected_gpu_nodes = {"size0", "reduce0", "identity", "size1", "reduce1", "sum"};
-  TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_3.onnx"), expected_cpu_nodes, expected_gpu_nodes);
-}
-TEST(SessionStateTest, CPUPlacementTest4) {
-  // Currently, the behaviour is different for RocM and CUDA EP as Rocm EP is missing a valid kernel
-  // for ReduceSum for int64 type. This causes the backward trace in GetCpuPreferredNodes to stop
-  // earlier. The expected values can be modified to match CUDA once the RocM EP kernel is updated
-#if defined(USE_CUDA)
-  std::unordered_set<std::string> expected_cpu_nodes = {"range", "reduce", "const1"};
-  std::unordered_set<std::string> expected_gpu_nodes = {"size0", "expand"};
-#elif defined(USE_ROCM)
-  std::unordered_set<std::string> expected_cpu_nodes = {"const1", "reduce"};
-  std::unordered_set<std::string> expected_gpu_nodes = {"size0", "expand", "range"};
-#endif
-  TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_4.onnx"), expected_cpu_nodes, expected_gpu_nodes);
-}
-TEST(SessionStateTest, CPUPlacementTest5) {
-  std::unordered_set<std::string> expected_cpu_nodes = {"gather0", "gather1", "concat"};
-  std::unordered_set<std::string> expected_gpu_nodes = {"shape0", "shape1", "reshape"};
-  TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_5.onnx"), expected_cpu_nodes, expected_gpu_nodes);
-}
-#endif
-
 // Test that we allocate memory for an initializer from non-arena memory even if we provide an arena-based allocator
 // if the relevant session option config flag is set
 // For this test we need to enable the arena-based allocator which is not supported on x86 builds, so
--- a/onnxruntime/test/testdata/cpu_fallback_pattern_0.onnx
+++ b/onnxruntime/test/testdata/cpu_fallback_pattern_0.onnx
--- a/onnxruntime/test/testdata/cpu_fallback_pattern_1.onnx
+++ b/onnxruntime/test/testdata/cpu_fallback_pattern_1.onnx
--- a/onnxruntime/test/testdata/cpu_fallback_pattern_2.onnx
+++ b/onnxruntime/test/testdata/cpu_fallback_pattern_2.onnx
--- a/onnxruntime/test/testdata/cpu_fallback_pattern_3.onnx
+++ b/onnxruntime/test/testdata/cpu_fallback_pattern_3.onnx
--- a/onnxruntime/test/testdata/cpu_fallback_pattern_4.onnx
+++ b/onnxruntime/test/testdata/cpu_fallback_pattern_4.onnx
--- a/onnxruntime/test/testdata/cpu_fallback_pattern_5.onnx
+++ b/onnxruntime/test/testdata/cpu_fallback_pattern_5.onnx
--- a/onnxruntime/test/testdata/cpu_fallback_test_gen.py
+++ b/onnxruntime/test/testdata/cpu_fallback_test_gen.py
@ -1,170 +0,0 @@
-import onnx
-from onnx import helper
-from onnx import TensorProto
-from onnx import shape_inference
-import numpy as np
-
-graph_def_0 = helper.make_graph(
-    nodes=[
-        helper.make_node(op_type="Shape", inputs=['A'], outputs=['A_shape'], name='shape0'),
-        helper.make_node(op_type="Reshape", inputs=['A_shape', 'shape'], outputs=['A_reshaped'], name='reshape'),
-        helper.make_node(op_type="Shape", inputs=['A_reshaped'], outputs=['A_shape1'], name='shape1'),
-        helper.make_node(op_type="ConstantOfShape", inputs=['A_shape1'], outputs=['const1'], name='const1', value=helper.make_tensor('val', TensorProto.INT64,
-            [1], [1])),
-        helper.make_node(op_type="Mul", inputs=['const1', 'neg_one'], outputs=['mul'], name='mul'),
-        helper.make_node(op_type="Equal", inputs=['A_reshaped', 'mul'], outputs=['equal'], name='equal'),
-        helper.make_node(op_type="Where", inputs=['equal', 'const1', 'A_reshaped'], outputs=['where'], name='where'),
-        helper.make_node(op_type="Expand", inputs=['B','where'], outputs=['C'], name='expand'),
-
-    ],
-    name='test-model',
-    inputs=[
-        # create inputs with symbolic dims
-        helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
-        helper.make_tensor_value_info("B", TensorProto.FLOAT, None),
-    ],
-    outputs=[
-        helper.make_tensor_value_info('C', TensorProto.FLOAT, None)
-    ],
-    initializer=[
-        helper.make_tensor('shape', TensorProto.INT64, [1], [-1]),
-        helper.make_tensor('neg_one', TensorProto.INT64, [1], [-1]),
-    ])
-
-model = helper.make_model(graph_def_0, opset_imports=[helper.make_operatorsetid("", 12)])
-onnx.save_model(model, "cpu_fallback_pattern_0.onnx")
-
-graph_def_1 = helper.make_graph(
-    nodes=[
-        helper.make_node(op_type="Shape", inputs=['A'], outputs=['A_shape'], name='shape0'),
-        helper.make_node(op_type="ConstantOfShape", inputs=['A_shape'], outputs=['const1'], name='const1', value=helper.make_tensor('val', TensorProto.INT64,
-            [1], [1])),
-        helper.make_node(op_type="Expand", inputs=['B','const1'], outputs=['C'], name='expand'),
-
-    ],
-    name='test-model',
-    inputs=[
-        # create inputs with symbolic dims
-        helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
-        helper.make_tensor_value_info("B", TensorProto.FLOAT, None),
-    ],
-    outputs=[
-        helper.make_tensor_value_info('C', TensorProto.FLOAT, None)
-    ],
-    initializer=[])
-
-model = helper.make_model(graph_def_1, opset_imports=[helper.make_operatorsetid("", 12)])
-onnx.save_model(model, "cpu_fallback_pattern_1.onnx")
-
-
-graph_def_2 = helper.make_graph(
-    nodes=[
-        helper.make_node(op_type="Size", inputs=['A'], outputs=['A_size'], name='size0'),
-        helper.make_node(op_type="Range", inputs=['zero', 'A_size', 'two'], outputs=['range'], name='range'),
-        helper.make_node(op_type="ReduceSum", inputs=['B', 'range'], outputs=['C'], name='reduce'),
-    ],
-    name='test-model',
-    inputs=[
-        # create inputs with symbolic dims
-        helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
-        helper.make_tensor_value_info("B", TensorProto.FLOAT, None),
-    ],
-    outputs=[
-        helper.make_tensor_value_info('C', TensorProto.FLOAT, None)
-    ],
-    initializer=[
-        helper.make_tensor('zero', TensorProto.INT64, [], [0]),
-        helper.make_tensor('two', TensorProto.INT64, [], [2]),
-    ])
-
-model = helper.make_model(graph_def_2, opset_imports=[helper.make_operatorsetid("", 13)])
-onnx.save_model(model, "cpu_fallback_pattern_2.onnx")
-
-
-graph_def_3 = helper.make_graph(
-    nodes=[
-        helper.make_node(op_type="Size", inputs=['A'], outputs=['size0'], name='size0'),
-        helper.make_node(op_type="Range", inputs=['zero', 'size0', 'two'], outputs=['range0'], name='range0'),
-        helper.make_node(op_type="ReduceSum", inputs=['B', 'range0'], outputs=['reduce0'], name='reduce0'),
-
-        helper.make_node(op_type="Identity", inputs=['reduce0'], outputs=['reduce0_cpy'], name='identity'),
-
-        helper.make_node(op_type="Size", inputs=['reduce0_cpy'], outputs=['size1'], name='size1'),
-        helper.make_node(op_type="Range", inputs=['zero', 'size1', 'two'], outputs=['range1'], name='range1'),
-        helper.make_node(op_type="ReduceSum", inputs=['B', 'range1'], outputs=['reduce1'], name='reduce1'),
-
-        helper.make_node(op_type="Sum", inputs=['reduce0', 'reduce1'], outputs=['C'], name='sum'),
-
-    ],
-    name='test-model',
-    inputs=[
-        # create inputs with symbolic dims
-        helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
-        helper.make_tensor_value_info("B", TensorProto.FLOAT, None),
-    ],
-    outputs=[
-        helper.make_tensor_value_info('C', TensorProto.FLOAT, None)
-    ],
-    initializer=[
-        helper.make_tensor('zero', TensorProto.INT64, [], [0]),
-        helper.make_tensor('two', TensorProto.INT64, [], [2]),
-    ])
-
-model = helper.make_model(graph_def_3, opset_imports=[helper.make_operatorsetid("", 13)])
-onnx.save_model(model, "cpu_fallback_pattern_3.onnx")
-
-graph_def_4 = helper.make_graph(
-    nodes=[
-        helper.make_node(op_type="Size", inputs=['A'], outputs=['A_size'], name='size0'),
-        helper.make_node(op_type="Range", inputs=['zero', 'A_size', 'two'], outputs=['range'], name='range'),
-        helper.make_node(op_type="ReduceSum", inputs=['B', 'range'], outputs=['reduce'], name='reduce'),
-        helper.make_node(op_type="ConstantOfShape", inputs=['reduce'], outputs=['const1'], name='const1', value=helper.make_tensor('val', TensorProto.INT64,
-            [1], [1])),
-        helper.make_node(op_type="Expand", inputs=['C','const1'], outputs=['D'], name='expand'),
-        
-    ],
-    name='test-model',
-    inputs=[
-        # create inputs with symbolic dims
-        helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
-        helper.make_tensor_value_info("B", TensorProto.INT64, None),
-        helper.make_tensor_value_info("C", TensorProto.FLOAT, None),
-    ],
-    outputs=[
-        helper.make_tensor_value_info('D', TensorProto.FLOAT, None)
-    ],
-    initializer=[
-        helper.make_tensor('zero', TensorProto.INT64, [], [0]),
-        helper.make_tensor('two', TensorProto.INT64, [], [2]),
-    ])
-
-model = helper.make_model(graph_def_4, opset_imports=[helper.make_operatorsetid("", 13)])
-onnx.save_model(model, "cpu_fallback_pattern_4.onnx")
-
-graph_def_5 = helper.make_graph(
-    nodes=[
-        helper.make_node(op_type="Shape", inputs=['A'], outputs=['A_shape'], name='shape0'),
-        helper.make_node(op_type="Gather", inputs=['A_shape', 'zero'], outputs=['batch'], name='gather0'),
-        helper.make_node(op_type="Concat", inputs=['batch', 'seq_len'], outputs=['shape'], name='concat', axis=0),
-        helper.make_node(op_type="Shape", inputs=['B'], outputs=['B_shape'], name='shape1'),
-        helper.make_node(op_type="Gather", inputs=['B_shape', 'one'], outputs=['seq_len'], name='gather1'),
-        helper.make_node(op_type="Reshape", inputs=['C','shape'], outputs=['D'], name='reshape'),
-        
-    ],
-    name='test-model',
-    inputs=[
-        # create inputs with symbolic dims
-        helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
-        helper.make_tensor_value_info("B", TensorProto.INT64, None),
-        helper.make_tensor_value_info("C", TensorProto.FLOAT, None),
-    ],
-    outputs=[
-        helper.make_tensor_value_info('D', TensorProto.FLOAT, None)
-    ],
-    initializer=[
-        helper.make_tensor('zero', TensorProto.INT64, [1], [0]),
-        helper.make_tensor('one', TensorProto.INT64, [1], [1]),
-    ])
-
-model = helper.make_model(graph_def_5, opset_imports=[helper.make_operatorsetid("", 13)])
-onnx.save_model(model, "cpu_fallback_pattern_5.onnx")