From 7bb3f243ff7de673e042ad3f4860539731a3ecb8 Mon Sep 17 00:00:00 2001
From: Hariharan Seshadri <shariharan91@gmail.com>
Date: Thu, 13 May 2021 14:11:17 -0700
Subject: [PATCH] Revert (#7663)

---
 .../core/framework/fallback_cpu_capability.cc | 141 ++-------------
 .../test/framework/session_state_test.cc      |  94 ----------
 .../test/testdata/cpu_fallback_pattern_0.onnx | Bin 450 -> 0 bytes
 .../test/testdata/cpu_fallback_pattern_1.onnx | Bin 187 -> 0 bytes
 .../test/testdata/cpu_fallback_pattern_2.onnx | Bin 183 -> 0 bytes
 .../test/testdata/cpu_fallback_pattern_3.onnx | Bin 388 -> 0 bytes
 .../test/testdata/cpu_fallback_pattern_4.onnx | Bin 300 -> 0 bytes
 .../test/testdata/cpu_fallback_pattern_5.onnx | Bin 324 -> 0 bytes
 .../test/testdata/cpu_fallback_test_gen.py    | 170 ------------------
 9 files changed, 12 insertions(+), 393 deletions(-)
 delete mode 100644 onnxruntime/test/testdata/cpu_fallback_pattern_0.onnx
 delete mode 100644 onnxruntime/test/testdata/cpu_fallback_pattern_1.onnx
 delete mode 100644 onnxruntime/test/testdata/cpu_fallback_pattern_2.onnx
 delete mode 100644 onnxruntime/test/testdata/cpu_fallback_pattern_3.onnx
 delete mode 100644 onnxruntime/test/testdata/cpu_fallback_pattern_4.onnx
 delete mode 100644 onnxruntime/test/testdata/cpu_fallback_pattern_5.onnx
 delete mode 100644 onnxruntime/test/testdata/cpu_fallback_test_gen.py
diff --git a/onnxruntime/core/framework/fallback_cpu_capability.cc b/onnxruntime/core/framework/fallback_cpu_capability.cc
index c23ba1bcb3..011eaf9edb 100644
--- a/onnxruntime/core/framework/fallback_cpu_capability.cc
+++ b/onnxruntime/core/framework/fallback_cpu_capability.cc
@@ -7,10 +7,7 @@
 
 #include "onnx/defs/data_type_utils.h"
 
-#include "core/framework/execution_providers.h"
-#include "core/framework/kernel_registry_manager.h"
 #include "core/framework/op_kernel.h"
-#include "core/providers/cpu/cpu_execution_provider.h"
 
 using namespace ONNX_NAMESPACE::Utils;
 
@@ -48,43 +45,18 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
     return node_id_to_order_map[n1] > node_id_to_order_map[n2];
   };
 
-  // If return false, n2 will be output first; If return true, n1 will be output first
-  auto lesser_order_comp = [&](const NodeIndex n1, const NodeIndex n2) {
-    return node_id_to_order_map[n1] < node_id_to_order_map[n2];
-  };
-
-  std::priority_queue<NodeIndex, std::vector<NodeIndex>, decltype(greater_order_comp)> candidates_fw(greater_order_comp);
-  std::priority_queue<NodeIndex, std::vector<NodeIndex>, decltype(lesser_order_comp)> candidates_bw(lesser_order_comp);
+  std::priority_queue<NodeIndex, std::vector<NodeIndex>, decltype(greater_order_comp)> candidates(greater_order_comp);
   std::unordered_set<NodeIndex> visited;
 
-  std::unordered_set<const NodeArg*> cpu_args;
+  std::unordered_set<const NodeArg*> cpu_output_args;
   std::unordered_set<NodeIndex> provider_nodes;
   std::unordered_map<NodeIndex, const KernelCreateInfo*> node_to_kernel;
-  std::unordered_set<NodeIndex> cpu_kernel_available;
-
-  // create a temp CPU kernel registry
-  KernelRegistryManager mgr;
-  ExecutionProviders cpu_ep;
-  CPUExecutionProviderInfo epi{false};
-  ORT_ENFORCE(cpu_ep.Add(kCpuExecutionProvider, std::make_unique<CPUExecutionProvider>(epi)).IsOK());
-  ORT_ENFORCE(mgr.RegisterKernels(cpu_ep).IsOK());
-  std::vector<const KernelRegistry*> cpu_kernel_registries = mgr.GetKernelRegistriesByProviderType(kCpuExecutionProvider);
 
   for (auto& node_id : tentative_nodes) {
     provider_nodes.insert(node_id);
     const Node* node = graph.GetNode(node_id);
 
     const KernelCreateInfo* kernel_info = nullptr;
-
-    // Get the CPU kernel availability for this node
-    for (auto registry : cpu_kernel_registries) {
-      auto st = registry->TryFindKernel(*node, kCpuExecutionProvider, &kernel_info);
-      if (st.IsOK()) {
-        cpu_kernel_available.insert(node_id);
-        break;
-      }
-    }
-
     for (auto registry : kernel_registries) {
       auto st = registry->TryFindKernel(*node, provider_type, &kernel_info);
       if (st.IsOK())
@@ -99,26 +71,11 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
         node->OutputDefs(),
         [&](const NodeArg& node_arg, size_t out_index) {
           if (kernel_info->kernel_def->IsOutputOnCpu(out_index)) {
-            cpu_args.insert(&node_arg);
+            cpu_output_args.insert(&node_arg);
             auto consumer_nodes = graph.GetConsumerNodes(node_arg.Name());
             for (auto& consumer_node : consumer_nodes) {
-              candidates_fw.push(consumer_node->Index());
-              LOGS_DEFAULT(INFO) << "Candidate for fallback CPU execution in forward trace: " << consumer_node->Name();
-            }
-          }
-          return Status::OK();
-        }));
-
-    // then, find all the direct producers of cpu tensors.
-    ORT_THROW_IF_ERROR(node->ForEachWithIndex(
-        node->InputDefs(),
-        [&](const NodeArg& node_arg, size_t in_index) {
-          if (kernel_info->kernel_def->IsInputOnCpu(in_index)) {
-            cpu_args.insert(&node_arg);
-            auto producer_node = graph.GetProducerNode(node_arg.Name());
-            if (producer_node != nullptr) {
-              candidates_bw.push(producer_node->Index());
-              LOGS_DEFAULT(INFO) << "Candidate for fallback CPU execution in backward trace: " << producer_node->Name();
+              candidates.push(consumer_node->Index());
+              LOGS_DEFAULT(INFO) << "Candidate for fallback CPU execution: " << consumer_node->Name();
             }
           }
           return Status::OK();
@@ -132,9 +89,9 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
   // The detail:
   // for each candidate, if one of its input is a cpu tensor and the Non-CPU kernel doesn't mark it as cpu input,
   // force the node to CPU to avoid memory cpu and add its output to the small cpu tensors.
-  while (!candidates_fw.empty()) {
-    NodeIndex cur = candidates_fw.top();
-    candidates_fw.pop();
+  while (!candidates.empty()) {
+    NodeIndex cur = candidates.top();
+    candidates.pop();
     if (visited.count(cur) != 0)
       continue;
     visited.insert(cur);
@@ -161,7 +118,7 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
       }
 
       // the input is not a CPU tensor
-      if (cpu_args.find(input) == cpu_args.end()) {
+      if (cpu_output_args.find(input) == cpu_output_args.end()) {
         place_in_cpu = false;
         break;
       }
@@ -173,90 +130,16 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
       }
     }
 
-    if (place_in_cpu && cpu_kernel_available.count(cur) != 0) {
+    if (place_in_cpu) {
       cpu_nodes.insert(cur);
       LOGS_DEFAULT(INFO) << "ORT optimization- Force fallback to CPU execution for node: " << node->Name()
                          << " because the CPU execution path is deemed faster than overhead involved with execution on other EPs "
                          << " capable of executing this node";
       for (auto* output : node->OutputDefs()) {
-        cpu_args.insert(output);
+        cpu_output_args.insert(output);
       }
       for (auto it = node->OutputNodesBegin(); it != node->OutputNodesEnd(); ++it) {
-        candidates_fw.push((*it).Index());
-      }
-    }
-  }
-  // clear the visited to prepare for backward trace
-  visited.clear();
-  // Trace the graph backwards to find additional CPU nodes
-  // Starting from nodes that must produce an output on CPU, trace the producer nodes
-  // The trace stops when we find that
-  // 1) The node is already picked for CPU
-  // 2) Input/Output type is unsupported on CPU(float16/bfloat16)
-  // 3) The output is not a CPU tensor
-  // 4) The search hits a node that produces a CPU output
-  while (!candidates_bw.empty()) {
-    NodeIndex cur = candidates_bw.top();
-    candidates_bw.pop();
-    if (visited.count(cur) != 0)
-      continue;
-    visited.insert(cur);
-
-    // node is already picked for CPU
-    if (cpu_nodes.count(cur) != 0)
-      continue;
-
-    if (provider_nodes.find(cur) == provider_nodes.end())
-      continue;
-
-    auto* node = graph.GetNode(cur);
-    bool place_in_cpu = true;
-    for (size_t i = 0; i < node->OutputDefs().size(); ++i) {
-      auto* output = node->OutputDefs()[i];
-
-      // skip placing on CPU if the data typs is float16 or bfloat16
-      if (output->Type() == DataTypeUtils::ToType("float16") ||
-          output->Type() == DataTypeUtils::ToType("bfloat16")) {
-        place_in_cpu = false;
-        break;
-      }
-
-      // the output is not a CPU tensor
-      if (cpu_args.find(output) == cpu_args.end()) {
-        place_in_cpu = false;
-        break;
-      }
-
-      // output is a CPU tensor, but it's intended to be consumed as CPU output by the target EP
-      if (node_to_kernel[cur]->kernel_def->IsOutputOnCpu(i)) {
-        place_in_cpu = false;
-        break;
-      }
-    }
-    // Next, check if the node inputs are of supported type
-    if (place_in_cpu) {
-      for (size_t i = 0; i < node->InputDefs().size(); ++i) {
-        auto* input = node->InputDefs()[i];
-
-        // skip placing on CPU if the data typs is float16 or bfloat16
-        if (input->Type() == DataTypeUtils::ToType("float16") ||
-            input->Type() == DataTypeUtils::ToType("bfloat16")) {
-          place_in_cpu = false;
-          break;
-        }
-      }
-    }
-
-    if (place_in_cpu && cpu_kernel_available.count(cur) != 0) {
-      cpu_nodes.insert(cur);
-      LOGS_DEFAULT(INFO) << "ORT optimization- Force fallback to CPU execution for node: " << node->Name()
-                         << " because the CPU execution path is deemed faster than overhead involved with execution on other EPs "
-                         << " capable of executing this node";
-      for (auto* input : node->InputDefs()) {
-        cpu_args.insert(input);
-      }
-      for (auto it = node->InputNodesBegin(); it != node->InputNodesEnd(); ++it) {
-        candidates_bw.push((*it).Index());
+        candidates.push((*it).Index());
       }
     }
   }
diff --git a/onnxruntime/test/framework/session_state_test.cc b/onnxruntime/test/framework/session_state_test.cc
index 4109bb8e5b..cfe37b2af4 100644
--- a/onnxruntime/test/framework/session_state_test.cc
+++ b/onnxruntime/test/framework/session_state_test.cc
@@ -20,12 +20,6 @@
 #include "gtest/gtest.h"
 #include "test/test_environment.h"
 
-#ifdef USE_CUDA
-#include "core/providers/cuda/cuda_execution_provider.h"
-#elif USE_ROCM
-#include "core/providers/rocm/rocm_execution_provider.h"
-#endif
-
 using namespace ONNX_NAMESPACE;
 using namespace std;
 namespace onnxruntime {
@@ -179,94 +173,6 @@ TEST_P(SessionStateTestP, TestInitializerProcessing) {
   }
 }
 
-#if defined(USE_CUDA) || defined(USE_ROCM)
-static void TestCPUNodePlacement(const std::basic_string<ORTCHAR_T>& model_uri,
-                                 const std::unordered_set<std::string>& expected_cpu_nodes,
-                                 const std::unordered_set<std::string>& expected_gpu_nodes) {
-  std::shared_ptr<Model> model;
-  ASSERT_STATUS_OK(Model::Load(model_uri, model, nullptr, DefaultLoggingManager().DefaultLogger()));
-  Graph& graph = model->MainGraph();
-
-  ExecutionProviders execution_providers;
-#if defined(USE_CUDA)
-  CUDAExecutionProviderInfo cuda_epi;
-  ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, std::make_unique<CUDAExecutionProvider>(cuda_epi)));
-#elif defined(USE_ROCM)
-  ROCMExecutionProviderInfo rocm_epi;
-  ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kRocmExecutionProvider, std::make_unique<ROCMExecutionProvider>(rocm_epi)));
-#endif
-  // add CPU EP
-  CPUExecutionProviderInfo epi;
-  ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique<CPUExecutionProvider>(epi)));
-
-  KernelRegistryManager krm;
-  ASSERT_STATUS_OK(krm.RegisterKernels(execution_providers));
-
-  DataTransferManager dtm;
-  profiling::Profiler profiler;
-
-  SessionState session_state(graph, execution_providers, false, nullptr, nullptr, dtm,
-                             DefaultLoggingManager().DefaultLogger(), profiler);
-
-  // Partition the graph. Here, the graph partitioner assigns EPs to the nodes
-  GraphPartitioner partitioner(krm, execution_providers);
-  ASSERT_STATUS_OK(partitioner.Partition(graph, session_state.ExportDll(), session_state.GetMutableFuncMgr()));
-
-  // check which nodes are assigned to CPU and GPU
-  for (auto& node : graph.Nodes()) {
-    // assert that EP is assigned
-    ASSERT_TRUE(!node.GetExecutionProviderType().empty());
-    auto& ep = node.GetExecutionProviderType();
-    if (ep == onnxruntime::kCudaExecutionProvider || ep == onnxruntime::kRocmExecutionProvider) {
-      ASSERT_TRUE(expected_gpu_nodes.count(node.Name())) << "Node not found in expected gpu nodes: " << node.Name();
-    } else if (ep == onnxruntime::kCpuExecutionProvider) {
-      ASSERT_TRUE(expected_cpu_nodes.count(node.Name())) << "Node not found in expected cpu nodes: " << node.Name();
-    } else {
-      ASSERT_TRUE(false) << "Invalid execution provider assigned to node: " << node.Name() << " , value: " << ep;
-    }
-  }
-}
-
-TEST(SessionStateTest, CPUPlacementTest0) {
-  std::unordered_set<std::string> expected_cpu_nodes = {"reshape", "shape1", "const1", "mul", "equal", "where"};
-  std::unordered_set<std::string> expected_gpu_nodes = {"shape0", "expand"};
-  TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_0.onnx"), expected_cpu_nodes, expected_gpu_nodes);
-}
-TEST(SessionStateTest, CPUPlacementTest1) {
-  std::unordered_set<std::string> expected_cpu_nodes = {"const1"};
-  std::unordered_set<std::string> expected_gpu_nodes = {"shape0", "expand"};
-  TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_1.onnx"), expected_cpu_nodes, expected_gpu_nodes);
-}
-TEST(SessionStateTest, CPUPlacementTest2) {
-  std::unordered_set<std::string> expected_cpu_nodes = {"range"};
-  std::unordered_set<std::string> expected_gpu_nodes = {"size0", "reduce"};
-  TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_2.onnx"), expected_cpu_nodes, expected_gpu_nodes);
-}
-TEST(SessionStateTest, CPUPlacementTest3) {
-  std::unordered_set<std::string> expected_cpu_nodes = {"range0", "range1"};
-  std::unordered_set<std::string> expected_gpu_nodes = {"size0", "reduce0", "identity", "size1", "reduce1", "sum"};
-  TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_3.onnx"), expected_cpu_nodes, expected_gpu_nodes);
-}
-TEST(SessionStateTest, CPUPlacementTest4) {
-  // Currently, the behaviour is different for RocM and CUDA EP as Rocm EP is missing a valid kernel
-  // for ReduceSum for int64 type. This causes the backward trace in GetCpuPreferredNodes to stop
-  // earlier. The expected values can be modified to match CUDA once the RocM EP kernel is updated
-#if defined(USE_CUDA)
-  std::unordered_set<std::string> expected_cpu_nodes = {"range", "reduce", "const1"};
-  std::unordered_set<std::string> expected_gpu_nodes = {"size0", "expand"};
-#elif defined(USE_ROCM)
-  std::unordered_set<std::string> expected_cpu_nodes = {"const1", "reduce"};
-  std::unordered_set<std::string> expected_gpu_nodes = {"size0", "expand", "range"};
-#endif
-  TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_4.onnx"), expected_cpu_nodes, expected_gpu_nodes);
-}
-TEST(SessionStateTest, CPUPlacementTest5) {
-  std::unordered_set<std::string> expected_cpu_nodes = {"gather0", "gather1", "concat"};
-  std::unordered_set<std::string> expected_gpu_nodes = {"shape0", "shape1", "reshape"};
-  TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_5.onnx"), expected_cpu_nodes, expected_gpu_nodes);
-}
-#endif
-
 // Test that we allocate memory for an initializer from non-arena memory even if we provide an arena-based allocator
 // if the relevant session option config flag is set
 // For this test we need to enable the arena-based allocator which is not supported on x86 builds, so
diff --git a/onnxruntime/test/testdata/cpu_fallback_pattern_0.onnx b/onnxruntime/test/testdata/cpu_fallback_pattern_0.onnx
deleted file mode 100644
index 4186edb736c7284c60d9ab8fa6fe066cf6b8c0f9..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 450
zcmZWmK}*9h7&W$O>SN%@MHdkypofC8U3Y6sk227M;H6X>-Jxse+NwXtf3csZP125n
zIpn?fz2xzIiO-gM2NxhAXJJ~ev#k=Ifw`oQTH+3a18N+OFx@FbbHREh*@G?N8Y7?k
zed?l*T@0}a4i@)btFmsoBNpXptaVm4t2fg_a1Q#J6|D*;KAEy5A(7MRAL8N){Uxwc
zy{A>FxU*>sp*QK=w*_v-{Xo|_RUd6u2n&)XT44^W4}s$`=+|29l&~P_lUBF{iC{~d
z#KKeG+pNr`ml)(|RNdTes$3Ppnduw9!y=J=%V74;80L6B(ZAy2*e9AsmcDc}wls2a
HG@bkb+_Qu0

diff --git a/onnxruntime/test/testdata/cpu_fallback_pattern_1.onnx b/onnxruntime/test/testdata/cpu_fallback_pattern_1.onnx
deleted file mode 100644
index 81e7abb77c0c7cd004bfca1393bd86e3a027c6a3..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 187
zcmd;Jx7xtSCC$a?D8%j<U!0LxkSfInrVNx=gF##_7cQ8L5L<G7UU7+`6qHutcLwnj
z^Gf{Fz-qN5xLC^)b4pXScsLjZ*sT~DotT0A1&l0Qa$Jl~Tu^;NjLuSQsTBo@c_~V4
su3%b-t0c9!L^n4-B{e6C6XYBgE+!5}5Yq|5Oad~UAxtM0E(QS}0NY0^ZvX%Q

diff --git a/onnxruntime/test/testdata/cpu_fallback_pattern_2.onnx b/onnxruntime/test/testdata/cpu_fallback_pattern_2.onnx
deleted file mode 100644
index 6a6f25b0ba0bc53d5971b5a838d1b186e99563c5..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 183
zcmd;Jw_3x<CBen$D8%L%Uz}N$D#Z#S4U|}dffSbp7fV%YQ9c(`l8d>dJYR^lC^0V`
zs1Qsku?B&7Tnb!_PFxTnAx38@wxZOO(&SVn&LA)yT$(GyRgzj<qMMtalA5E%Ex>NY
q$lwGvT#F0DVR8bQ9mNT93JVt#2P25-1Ysrtna&WV6AKrE051TqJt*b?

diff --git a/onnxruntime/test/testdata/cpu_fallback_pattern_3.onnx b/onnxruntime/test/testdata/cpu_fallback_pattern_3.onnx
deleted file mode 100644
index 5285a240d2711a53a28f7c6c38d631dde7692a4d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 388
zcmZvYF>k^!5QT9PNO%Ydc|poj)l(a(Qg3C$Osuf6R4uWpNDZkHM-}|@ZrIpH8JO<7
zr#F0eS-Sd<umoH4aaUc`d+vrv_GZH?l8b5^xDc`aYxt!7Jbjy@pJKeTJ6^yfe8$sO
zmHp9_0*LAEmhAf)1wzVazV~tbJ?B|fs*|qtS!7#Jc*d~e9LYJ-BS#uKl6RyBjuadz
zIMUGZ>FyYSINRiL*ViI8M{sD>>9@Km)lV^7q$@U(w^tRgOC;^94paNr1X0E;N{3$t
LHx6Zj(PDlDzlCK&

diff --git a/onnxruntime/test/testdata/cpu_fallback_pattern_4.onnx b/onnxruntime/test/testdata/cpu_fallback_pattern_4.onnx
deleted file mode 100644
index be93737c4f9e22c46fcd7ca0689243bbcfbeb5c0..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 300
zcmXYsKWoD<5XEzjEIZeh22H_32SK423hkOLaxxd<txHhi>mNw$V9RaEr)j@wb+Q+|
z_j{*1cxkfyi*SLkQgXHHJ};ET{61+>dcg-27it?Ehd6cLjfz`azx$6tbi8H4djx|+
zq-0A~f1qqsJ=+E@`x6#8hDsiqx^vIjsQR{GwzYeCD?jX+@&(3U_SjQ?ox(g>0+^}i
zUl4eQ;O`9k3f7sVpR=tiEjK|GI_lixsj29gU-9b1gc><AM21_u&bYL|C<V4A*o18{
Oq<e4IgEayt^P7Je1xIE8

diff --git a/onnxruntime/test/testdata/cpu_fallback_pattern_5.onnx b/onnxruntime/test/testdata/cpu_fallback_pattern_5.onnx
deleted file mode 100644
index f2d866badba91af3ab8ba22ca5c2d74c7b9ca332..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 324
zcmY+Av5LY#5Qei}lF5w<vX$hDgEUG&P_VLd6S)5Du8}o27(o+Fwh*7j*YNFhcU{41
zzL_89V<v*dt%G04mK>Jrwv<PTLsO<nu+lA#&?V?!sHu6dk*X+T*h*DWGfl!D12{*U
zAhdK^@2TR!b74_eg;YtHnVNRdm*-tO^sdaf1qbowo;mjzU4kEVMci$6^4UxNjVwd!
z<1CM%A>&Tq*8)dHt(qL_f7+-0hz(%DJYfz>d$wglJN`cPGkoksOrtOe8Z$fHJTSK?
IeDntWJ8rm60{{R3

diff --git a/onnxruntime/test/testdata/cpu_fallback_test_gen.py b/onnxruntime/test/testdata/cpu_fallback_test_gen.py
deleted file mode 100644
index 8d8ec94639..0000000000
--- a/onnxruntime/test/testdata/cpu_fallback_test_gen.py
+++ /dev/null
@@ -1,170 +0,0 @@
-import onnx
-from onnx import helper
-from onnx import TensorProto
-from onnx import shape_inference
-import numpy as np
-
-graph_def_0 = helper.make_graph(
-    nodes=[
-        helper.make_node(op_type="Shape", inputs=['A'], outputs=['A_shape'], name='shape0'),
-        helper.make_node(op_type="Reshape", inputs=['A_shape', 'shape'], outputs=['A_reshaped'], name='reshape'),
-        helper.make_node(op_type="Shape", inputs=['A_reshaped'], outputs=['A_shape1'], name='shape1'),
-        helper.make_node(op_type="ConstantOfShape", inputs=['A_shape1'], outputs=['const1'], name='const1', value=helper.make_tensor('val', TensorProto.INT64,
-            [1], [1])),
-        helper.make_node(op_type="Mul", inputs=['const1', 'neg_one'], outputs=['mul'], name='mul'),
-        helper.make_node(op_type="Equal", inputs=['A_reshaped', 'mul'], outputs=['equal'], name='equal'),
-        helper.make_node(op_type="Where", inputs=['equal', 'const1', 'A_reshaped'], outputs=['where'], name='where'),
-        helper.make_node(op_type="Expand", inputs=['B','where'], outputs=['C'], name='expand'),
-
-    ],
-    name='test-model',
-    inputs=[
-        # create inputs with symbolic dims
-        helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
-        helper.make_tensor_value_info("B", TensorProto.FLOAT, None),
-    ],
-    outputs=[
-        helper.make_tensor_value_info('C', TensorProto.FLOAT, None)
-    ],
-    initializer=[
-        helper.make_tensor('shape', TensorProto.INT64, [1], [-1]),
-        helper.make_tensor('neg_one', TensorProto.INT64, [1], [-1]),
-    ])
-
-model = helper.make_model(graph_def_0, opset_imports=[helper.make_operatorsetid("", 12)])
-onnx.save_model(model, "cpu_fallback_pattern_0.onnx")
-
-graph_def_1 = helper.make_graph(
-    nodes=[
-        helper.make_node(op_type="Shape", inputs=['A'], outputs=['A_shape'], name='shape0'),
-        helper.make_node(op_type="ConstantOfShape", inputs=['A_shape'], outputs=['const1'], name='const1', value=helper.make_tensor('val', TensorProto.INT64,
-            [1], [1])),
-        helper.make_node(op_type="Expand", inputs=['B','const1'], outputs=['C'], name='expand'),
-
-    ],
-    name='test-model',
-    inputs=[
-        # create inputs with symbolic dims
-        helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
-        helper.make_tensor_value_info("B", TensorProto.FLOAT, None),
-    ],
-    outputs=[
-        helper.make_tensor_value_info('C', TensorProto.FLOAT, None)
-    ],
-    initializer=[])
-
-model = helper.make_model(graph_def_1, opset_imports=[helper.make_operatorsetid("", 12)])
-onnx.save_model(model, "cpu_fallback_pattern_1.onnx")
-
-
-graph_def_2 = helper.make_graph(
-    nodes=[
-        helper.make_node(op_type="Size", inputs=['A'], outputs=['A_size'], name='size0'),
-        helper.make_node(op_type="Range", inputs=['zero', 'A_size', 'two'], outputs=['range'], name='range'),
-        helper.make_node(op_type="ReduceSum", inputs=['B', 'range'], outputs=['C'], name='reduce'),
-    ],
-    name='test-model',
-    inputs=[
-        # create inputs with symbolic dims
-        helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
-        helper.make_tensor_value_info("B", TensorProto.FLOAT, None),
-    ],
-    outputs=[
-        helper.make_tensor_value_info('C', TensorProto.FLOAT, None)
-    ],
-    initializer=[
-        helper.make_tensor('zero', TensorProto.INT64, [], [0]),
-        helper.make_tensor('two', TensorProto.INT64, [], [2]),
-    ])
-
-model = helper.make_model(graph_def_2, opset_imports=[helper.make_operatorsetid("", 13)])
-onnx.save_model(model, "cpu_fallback_pattern_2.onnx")
-
-
-graph_def_3 = helper.make_graph(
-    nodes=[
-        helper.make_node(op_type="Size", inputs=['A'], outputs=['size0'], name='size0'),
-        helper.make_node(op_type="Range", inputs=['zero', 'size0', 'two'], outputs=['range0'], name='range0'),
-        helper.make_node(op_type="ReduceSum", inputs=['B', 'range0'], outputs=['reduce0'], name='reduce0'),
-
-        helper.make_node(op_type="Identity", inputs=['reduce0'], outputs=['reduce0_cpy'], name='identity'),
-
-        helper.make_node(op_type="Size", inputs=['reduce0_cpy'], outputs=['size1'], name='size1'),
-        helper.make_node(op_type="Range", inputs=['zero', 'size1', 'two'], outputs=['range1'], name='range1'),
-        helper.make_node(op_type="ReduceSum", inputs=['B', 'range1'], outputs=['reduce1'], name='reduce1'),
-
-        helper.make_node(op_type="Sum", inputs=['reduce0', 'reduce1'], outputs=['C'], name='sum'),
-
-    ],
-    name='test-model',
-    inputs=[
-        # create inputs with symbolic dims
-        helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
-        helper.make_tensor_value_info("B", TensorProto.FLOAT, None),
-    ],
-    outputs=[
-        helper.make_tensor_value_info('C', TensorProto.FLOAT, None)
-    ],
-    initializer=[
-        helper.make_tensor('zero', TensorProto.INT64, [], [0]),
-        helper.make_tensor('two', TensorProto.INT64, [], [2]),
-    ])
-
-model = helper.make_model(graph_def_3, opset_imports=[helper.make_operatorsetid("", 13)])
-onnx.save_model(model, "cpu_fallback_pattern_3.onnx")
-
-graph_def_4 = helper.make_graph(
-    nodes=[
-        helper.make_node(op_type="Size", inputs=['A'], outputs=['A_size'], name='size0'),
-        helper.make_node(op_type="Range", inputs=['zero', 'A_size', 'two'], outputs=['range'], name='range'),
-        helper.make_node(op_type="ReduceSum", inputs=['B', 'range'], outputs=['reduce'], name='reduce'),
-        helper.make_node(op_type="ConstantOfShape", inputs=['reduce'], outputs=['const1'], name='const1', value=helper.make_tensor('val', TensorProto.INT64,
-            [1], [1])),
-        helper.make_node(op_type="Expand", inputs=['C','const1'], outputs=['D'], name='expand'),
-        
-    ],
-    name='test-model',
-    inputs=[
-        # create inputs with symbolic dims
-        helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
-        helper.make_tensor_value_info("B", TensorProto.INT64, None),
-        helper.make_tensor_value_info("C", TensorProto.FLOAT, None),
-    ],
-    outputs=[
-        helper.make_tensor_value_info('D', TensorProto.FLOAT, None)
-    ],
-    initializer=[
-        helper.make_tensor('zero', TensorProto.INT64, [], [0]),
-        helper.make_tensor('two', TensorProto.INT64, [], [2]),
-    ])
-
-model = helper.make_model(graph_def_4, opset_imports=[helper.make_operatorsetid("", 13)])
-onnx.save_model(model, "cpu_fallback_pattern_4.onnx")
-
-graph_def_5 = helper.make_graph(
-    nodes=[
-        helper.make_node(op_type="Shape", inputs=['A'], outputs=['A_shape'], name='shape0'),
-        helper.make_node(op_type="Gather", inputs=['A_shape', 'zero'], outputs=['batch'], name='gather0'),
-        helper.make_node(op_type="Concat", inputs=['batch', 'seq_len'], outputs=['shape'], name='concat', axis=0),
-        helper.make_node(op_type="Shape", inputs=['B'], outputs=['B_shape'], name='shape1'),
-        helper.make_node(op_type="Gather", inputs=['B_shape', 'one'], outputs=['seq_len'], name='gather1'),
-        helper.make_node(op_type="Reshape", inputs=['C','shape'], outputs=['D'], name='reshape'),
-        
-    ],
-    name='test-model',
-    inputs=[
-        # create inputs with symbolic dims
-        helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
-        helper.make_tensor_value_info("B", TensorProto.INT64, None),
-        helper.make_tensor_value_info("C", TensorProto.FLOAT, None),
-    ],
-    outputs=[
-        helper.make_tensor_value_info('D', TensorProto.FLOAT, None)
-    ],
-    initializer=[
-        helper.make_tensor('zero', TensorProto.INT64, [1], [0]),
-        helper.make_tensor('one', TensorProto.INT64, [1], [1]),
-    ])
-
-model = helper.make_model(graph_def_5, opset_imports=[helper.make_operatorsetid("", 13)])
-onnx.save_model(model, "cpu_fallback_pattern_5.onnx")