diff --git a/include/onnxruntime/core/common/logging/severity.h b/include/onnxruntime/core/common/logging/severity.h
index c377ceb809..e43f192eb1 100644
--- a/include/onnxruntime/core/common/logging/severity.h
+++ b/include/onnxruntime/core/common/logging/severity.h
@@ -13,7 +13,7 @@ enum class Severity {
   kINFO = 1,
   kWARNING = 2,
   kERROR = 3,
-  kFATAL = 4,
+  kFATAL = 4
 };
 
 constexpr const char* SEVERITY_PREFIX = "VIWEF";
diff --git a/onnxruntime/core/framework/fallback_cpu_capability.cc b/onnxruntime/core/framework/fallback_cpu_capability.cc
index 011eaf9edb..c23ba1bcb3 100644
--- a/onnxruntime/core/framework/fallback_cpu_capability.cc
+++ b/onnxruntime/core/framework/fallback_cpu_capability.cc
@@ -7,7 +7,10 @@
 
 #include "onnx/defs/data_type_utils.h"
 
+#include "core/framework/execution_providers.h"
+#include "core/framework/kernel_registry_manager.h"
 #include "core/framework/op_kernel.h"
+#include "core/providers/cpu/cpu_execution_provider.h"
 
 using namespace ONNX_NAMESPACE::Utils;
 
@@ -45,18 +48,43 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
     return node_id_to_order_map[n1] > node_id_to_order_map[n2];
   };
 
-  std::priority_queue<NodeIndex, std::vector<NodeIndex>, decltype(greater_order_comp)> candidates(greater_order_comp);
+  // Max-heap comparator: if it returns true, n2 is output before n1, so the node with the largest order is popped first
+  auto lesser_order_comp = [&](const NodeIndex n1, const NodeIndex n2) {
+    return node_id_to_order_map[n1] < node_id_to_order_map[n2];
+  };
+
+  std::priority_queue<NodeIndex, std::vector<NodeIndex>, decltype(greater_order_comp)> candidates_fw(greater_order_comp);
+  std::priority_queue<NodeIndex, std::vector<NodeIndex>, decltype(lesser_order_comp)> candidates_bw(lesser_order_comp);
   std::unordered_set<NodeIndex> visited;
 
-  std::unordered_set<const NodeArg*> cpu_output_args;
+  std::unordered_set<const NodeArg*> cpu_args;
   std::unordered_set<NodeIndex> provider_nodes;
   std::unordered_map<NodeIndex, const KernelCreateInfo*> node_to_kernel;
+  std::unordered_set<NodeIndex> cpu_kernel_available;
+
+  // create a temp CPU kernel registry
+  KernelRegistryManager mgr;
+  ExecutionProviders cpu_ep;
+  CPUExecutionProviderInfo epi{false};
+  ORT_ENFORCE(cpu_ep.Add(kCpuExecutionProvider, std::make_unique<CPUExecutionProvider>(epi)).IsOK());
+  ORT_ENFORCE(mgr.RegisterKernels(cpu_ep).IsOK());
+  std::vector<const KernelRegistry*> cpu_kernel_registries = mgr.GetKernelRegistriesByProviderType(kCpuExecutionProvider);
 
   for (auto& node_id : tentative_nodes) {
     provider_nodes.insert(node_id);
     const Node* node = graph.GetNode(node_id);
 
     const KernelCreateInfo* kernel_info = nullptr;
+
+    // Get the CPU kernel availability for this node
+    for (auto registry : cpu_kernel_registries) {
+      auto st = registry->TryFindKernel(*node, kCpuExecutionProvider, &kernel_info);
+      if (st.IsOK()) {
+        cpu_kernel_available.insert(node_id);
+        break;
+      }
+    }
+
     for (auto registry : kernel_registries) {
       auto st = registry->TryFindKernel(*node, provider_type, &kernel_info);
       if (st.IsOK())
@@ -71,11 +99,26 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
         node->OutputDefs(),
         [&](const NodeArg& node_arg, size_t out_index) {
           if (kernel_info->kernel_def->IsOutputOnCpu(out_index)) {
-            cpu_output_args.insert(&node_arg);
+            cpu_args.insert(&node_arg);
             auto consumer_nodes = graph.GetConsumerNodes(node_arg.Name());
             for (auto& consumer_node : consumer_nodes) {
-              candidates.push(consumer_node->Index());
-              LOGS_DEFAULT(INFO) << "Candidate for fallback CPU execution: " << consumer_node->Name();
+              candidates_fw.push(consumer_node->Index());
+              LOGS_DEFAULT(INFO) << "Candidate for fallback CPU execution in forward trace: " << consumer_node->Name();
+            }
+          }
+          return Status::OK();
+        }));
+
+    // then, find all the direct producers of cpu tensors.
+    ORT_THROW_IF_ERROR(node->ForEachWithIndex(
+        node->InputDefs(),
+        [&](const NodeArg& node_arg, size_t in_index) {
+          if (kernel_info->kernel_def->IsInputOnCpu(in_index)) {
+            cpu_args.insert(&node_arg);
+            auto producer_node = graph.GetProducerNode(node_arg.Name());
+            if (producer_node != nullptr) {
+              candidates_bw.push(producer_node->Index());
+              LOGS_DEFAULT(INFO) << "Candidate for fallback CPU execution in backward trace: " << producer_node->Name();
             }
           }
           return Status::OK();
@@ -89,9 +132,9 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
   // The detail:
   // for each candidate, if one of its input is a cpu tensor and the Non-CPU kernel doesn't mark it as cpu input,
   // force the node to CPU to avoid memory cpu and add its output to the small cpu tensors.
-  while (!candidates.empty()) {
-    NodeIndex cur = candidates.top();
-    candidates.pop();
+  while (!candidates_fw.empty()) {
+    NodeIndex cur = candidates_fw.top();
+    candidates_fw.pop();
     if (visited.count(cur) != 0)
       continue;
     visited.insert(cur);
@@ -118,7 +161,7 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
       }
 
       // the input is not a CPU tensor
-      if (cpu_output_args.find(input) == cpu_output_args.end()) {
+      if (cpu_args.find(input) == cpu_args.end()) {
         place_in_cpu = false;
         break;
       }
@@ -130,16 +173,90 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
       }
     }
 
-    if (place_in_cpu) {
+    if (place_in_cpu && cpu_kernel_available.count(cur) != 0) {
       cpu_nodes.insert(cur);
       LOGS_DEFAULT(INFO) << "ORT optimization- Force fallback to CPU execution for node: " << node->Name()
                          << " because the CPU execution path is deemed faster than overhead involved with execution on other EPs "
                          << " capable of executing this node";
       for (auto* output : node->OutputDefs()) {
-        cpu_output_args.insert(output);
+        cpu_args.insert(output);
       }
       for (auto it = node->OutputNodesBegin(); it != node->OutputNodesEnd(); ++it) {
-        candidates.push((*it).Index());
+        candidates_fw.push((*it).Index());
+      }
+    }
+  }
+  // Clear the visited set so that the backward trace starts from a clean state
+  visited.clear();
+  // Trace the graph backwards to find additional CPU nodes
+  // Starting from nodes that must produce an output on CPU, trace the producer nodes
+  // The trace stops when we find that
+  // 1) The node is already picked for CPU
+  // 2) An input/output type is unsupported on CPU (float16/bfloat16)
+  // 3) The output is not a CPU tensor
+  // 4) The search hits a node that produces a CPU output
+  while (!candidates_bw.empty()) {
+    NodeIndex cur = candidates_bw.top();
+    candidates_bw.pop();
+    if (visited.count(cur) != 0)
+      continue;
+    visited.insert(cur);
+
+    // node is already picked for CPU
+    if (cpu_nodes.count(cur) != 0)
+      continue;
+
+    if (provider_nodes.find(cur) == provider_nodes.end())
+      continue;
+
+    auto* node = graph.GetNode(cur);
+    bool place_in_cpu = true;
+    for (size_t i = 0; i < node->OutputDefs().size(); ++i) {
+      auto* output = node->OutputDefs()[i];
+
+      // skip placing on CPU if the data type is float16 or bfloat16
+      if (output->Type() == DataTypeUtils::ToType("float16") ||
+          output->Type() == DataTypeUtils::ToType("bfloat16")) {
+        place_in_cpu = false;
+        break;
+      }
+
+      // the output is not a CPU tensor
+      if (cpu_args.find(output) == cpu_args.end()) {
+        place_in_cpu = false;
+        break;
+      }
+
+      // output is a CPU tensor, but it's intended to be consumed as CPU output by the target EP
+      if (node_to_kernel[cur]->kernel_def->IsOutputOnCpu(i)) {
+        place_in_cpu = false;
+        break;
+      }
+    }
+    // Next, check if the node inputs are of supported type
+    if (place_in_cpu) {
+      for (size_t i = 0; i < node->InputDefs().size(); ++i) {
+        auto* input = node->InputDefs()[i];
+
+        // skip placing on CPU if the data type is float16 or bfloat16
+        if (input->Type() == DataTypeUtils::ToType("float16") ||
+            input->Type() == DataTypeUtils::ToType("bfloat16")) {
+          place_in_cpu = false;
+          break;
+        }
+      }
+    }
+
+    if (place_in_cpu && cpu_kernel_available.count(cur) != 0) {
+      cpu_nodes.insert(cur);
+      LOGS_DEFAULT(INFO) << "ORT optimization- Force fallback to CPU execution for node: " << node->Name()
+                         << " because the CPU execution path is deemed faster than overhead involved with execution on other EPs "
+                         << " capable of executing this node";
+      for (auto* input : node->InputDefs()) {
+        cpu_args.insert(input);
+      }
+      for (auto it = node->InputNodesBegin(); it != node->InputNodesEnd(); ++it) {
+        candidates_bw.push((*it).Index());
       }
     }
   }
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index 42abdd12fa..75d258fecf 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -49,7 +49,6 @@
 #include "core/session/onnxruntime_session_options_config_keys.h"
 #include "core/util/protobuf_parsing_utils.h"
 #include "core/util/thread_utils.h"
-#include "onnxruntime_config.h"
 
 // custom ops are not available in a minimal build unless ORT_MINIMAL_BUILD_CUSTOM_OPS is set
 #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_MINIMAL_BUILD_CUSTOM_OPS)
@@ -293,29 +292,6 @@ void InferenceSession::ConstructorCommon(const SessionOptions& session_options,
   // a monotonically increasing session id for use in telemetry
   session_id_ = global_session_id_.fetch_add(1);
   allocator_manager_ = std::make_shared<onnxruntime::AllocatorManager>();
-
-  // Add log to allow serving platforms to quantify ORT usage.
-  // To avoid flooding the test logs, this is done for non-debug mode only
-  // TODO: plug-in a platform specific telemetry provider to send the telemetry to
-#if defined(NDEBUG) && !defined(__wasm__) && !defined(ENABLE_TRAINING)
-#ifdef _WIN32
-  std::wostringstream ostr;
-#else
-  std::ostringstream ostr;
-#endif
-  // Format: "ORT Telemetry: Ver = 1.7.0; Event = EventName (event_attr1: foo.onnx, event_attr2: 400us)"
-  // Format: "ORT Telemetry: Ver = 1.7.0; Event = SessionCreation (model: foo.onnx, ts: 400us)"
-  ostr << "ORT Telemetry: "
-       << "Ver = " << ORT_VERSION << "; Event = SessionCreation";
-  if (!model_location_.empty()) {
-    ostr << " (model: " << model_location_ << ")";
-  }
-#ifdef _WIN32
-  std::wcout << ostr.str() << "\n";
-#else
-  std::cout << ostr.str() << "\n";
-#endif
-#endif
 }
 
 InferenceSession::InferenceSession(const SessionOptions& session_options, const Environment& session_env)
diff --git a/onnxruntime/python/tools/transformers/fusion_attention.py b/onnxruntime/python/tools/transformers/fusion_attention.py
index 657a4e717f..e6f9ec9df2 100644
--- a/onnxruntime/python/tools/transformers/fusion_attention.py
+++ b/onnxruntime/python/tools/transformers/fusion_attention.py
@@ -150,9 +150,9 @@ class FusionAttention(Fusion):
         q_weight = self.model.get_initializer(q_matmul.input[1])
         k_weight = self.model.get_initializer(k_matmul.input[1])
         v_weight = self.model.get_initializer(v_matmul.input[1])
-        q_bias = self.model.get_initializer(q_add.input[1])
-        k_bias = self.model.get_initializer(k_add.input[1])
-        v_bias = self.model.get_initializer(v_add.input[1])
+        q_bias = self.model.get_initializer(q_add.input[1]) or self.model.get_initializer(q_add.input[0])
+        k_bias = self.model.get_initializer(k_add.input[1]) or self.model.get_initializer(k_add.input[0])
+        v_bias = self.model.get_initializer(v_add.input[1]) or self.model.get_initializer(v_add.input[0])
 
         if q_weight is None:
             print(f"{q_matmul.input[1]} is not initializer. Please set do_constant_folding=True in torch.onnx.export")
@@ -166,14 +166,14 @@ class FusionAttention(Fusion):
         # Check if all matrices have the same shape
         assert qw.shape == kw.shape == vw.shape
 
-        # All the matrices have the same shape. For 2d weights, the shapes would be [in_size, out_size]. 
+        # All the matrices have the same shape. For 2d weights, the shapes would be [in_size, out_size].
         # For 3d weights, shape would be [in_size, a, b] where a*b = out_size
         in_size = qw.shape[0]
         out_size = np.prod(qw.shape[1:])
 
         qkv_weight = np.stack((qw, kw, vw), axis=1)
 
-        qb = NumpyHelper.to_array(q_bias)        
+        qb = NumpyHelper.to_array(q_bias)
         kb = NumpyHelper.to_array(k_bias)
         vb = NumpyHelper.to_array(v_bias)
 
@@ -233,13 +233,14 @@ class FusionAttention(Fusion):
 
         # SkipLayerNormalization has two inputs, and one of them is the root input for attention.
         qkv_nodes = self.model.match_parent_path(start_node, ['Add', 'MatMul', 'Reshape', 'Transpose', 'MatMul'],
-                                                 [None, 0, 0, 0, 0])
+                                                 [None, None, 0, 0, 0])
         einsum_node = None
         if qkv_nodes is not None:
             (_, matmul_qkv, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes
         else:
             # Match Albert
-            qkv_nodes = self.model.match_parent_path(start_node, ['Add', 'Einsum', 'Transpose', 'MatMul'], [1, 0, 0, 0])
+            qkv_nodes = self.model.match_parent_path(start_node, ['Add', 'Einsum', 'Transpose', 'MatMul'],
+                                                     [1, None, 0, 0])
             if qkv_nodes is not None:
                 (_, einsum_node, transpose_qkv, matmul_qkv) = qkv_nodes
             else:
@@ -284,16 +285,16 @@ class FusionAttention(Fusion):
         if children_types.count('MatMul') != 3:
             return
 
-        v_nodes = self.model.match_parent_path(matmul_qkv, ['Transpose', 'Reshape', 'Add', 'MatMul'], [1, 0, 0, 0])
+        v_nodes = self.model.match_parent_path(matmul_qkv, ['Transpose', 'Reshape', 'Add', 'MatMul'], [1, 0, 0, None])
         if v_nodes is None:
             logger.debug("fuse_attention: failed to match v path")
             return
         (_, _, add_v, matmul_v) = v_nodes
 
         is_distill = False
-        qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Add', 'Div', 'MatMul'], [0, 0, 0, 0])
+        qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Add', 'Div', 'MatMul'], [0, 0, None, 0])
         if qk_nodes is None:
-            qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Add', 'Mul', 'MatMul'], [0, 0, 0, 0])
+            qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Add', 'Mul', 'MatMul'], [0, 0, None, 0])
             if qk_nodes is None:
                 qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Where', 'MatMul', 'Div'], [0, 0, 2, 0])
                 is_distill = True
@@ -309,10 +310,10 @@ class FusionAttention(Fusion):
         else:
             (_, add_qk, _, matmul_qk) = qk_nodes
 
-        q_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Reshape', 'Add', 'MatMul'], [0, 0, 0, 0])
+        q_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Reshape', 'Add', 'MatMul'], [0, 0, 0, None])
         if q_nodes is None:
             q_nodes = self.model.match_parent_path(matmul_qk, ['Div', 'Transpose', 'Reshape', 'Add', 'MatMul'],
-                                                   [0, 0, 0, 0, 0])
+                                                   [0, 0, 0, 0, None])
             if q_nodes is None:
                 logger.debug("fuse_attention: failed to match q path")
                 return
@@ -320,10 +321,10 @@ class FusionAttention(Fusion):
         add_q = q_nodes[-2]
         matmul_q = q_nodes[-1]
 
-        k_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Reshape', 'Add', 'MatMul'], [1, 0, 0, 0])
+        k_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Reshape', 'Add', 'MatMul'], [1, 0, 0, None])
         if k_nodes is None:
             k_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Transpose', 'Reshape', 'Add', 'MatMul'],
-                                                   [1, 0, 0, 0, 0])
+                                                   [1, 0, 0, 0, None])
             if k_nodes is None:
                 logger.debug("fuse_attention: failed to match k path")
                 return
@@ -339,8 +340,8 @@ class FusionAttention(Fusion):
                                                              output_name_to_node)
         else:
             _, mask_nodes, _ = self.model.match_parent_paths(
-                add_qk, [(['Mul', 'Sub', 'Cast', 'Unsqueeze', 'Unsqueeze'], [1, 0, 1, 0, 0]),
-                         (['Mul', 'Sub', 'Unsqueeze', 'Unsqueeze'], [1, 0, 1, 0])], output_name_to_node)
+                add_qk, [(['Mul', 'Sub', 'Cast', 'Unsqueeze', 'Unsqueeze'], [None, 0, 1, 0, 0]),
+                         (['Mul', 'Sub', 'Unsqueeze', 'Unsqueeze'], [None, 0, 1, 0])], output_name_to_node)
         if mask_nodes is None:
             logger.debug("fuse_attention: failed to match mask path")
             return
diff --git a/onnxruntime/python/tools/transformers/test/bert_model_generator.py b/onnxruntime/python/tools/transformers/test/bert_model_generator.py
index 5d1a65f281..79ceec701d 100644
--- a/onnxruntime/python/tools/transformers/test/bert_model_generator.py
+++ b/onnxruntime/python/tools/transformers/test/bert_model_generator.py
@@ -21,7 +21,17 @@ def float_tensor(name: str, shape: List[int], random=False):
     return helper.make_tensor(name, TensorProto.FLOAT, shape, weights)
 
 
-def create_bert_attention(input_hidden_size=16, pruned_num_heads=2, pruned_head_size=4, use_float_mask=False):
+def reverse_if(inputs, reverse=False):
+    if reverse:
+        inputs.reverse()
+    return inputs
+
+
+def create_bert_attention(input_hidden_size=16,
+                          pruned_num_heads=2,
+                          pruned_head_size=4,
+                          use_float_mask=False,
+                          switch_add_inputs=False):
     # unsqueeze in opset version 13 has two inputs (axis is moved from attribute to input).
     has_unsqueeze_two_inputs = (version.parse(onnx.__version__) >= version.parse('1.8.0'))
 
@@ -36,13 +46,13 @@ def create_bert_attention(input_hidden_size=16, pruned_num_heads=2, pruned_head_
 
         # q nodes
         helper.make_node("MatMul", ["layernorm_out", "matmul_q_weight"], ["matmul_q_out"], "matmul_q"),
-        helper.make_node("Add", ["matmul_q_out", "add_q_weight"], ["add_q_out"], "add_q"),
+        helper.make_node("Add", reverse_if(["matmul_q_out", "add_q_weight"], switch_add_inputs), ["add_q_out"], "add_q"),
         helper.make_node("Reshape", ["add_q_out", "reshape_weight_1"], ["reshape_q_out"], "reshape_q"),
         helper.make_node("Transpose", ["reshape_q_out"], ["transpose_q_out"], "transpose_q", perm=[0, 2, 1, 3]),
 
         # k nodes
         helper.make_node("MatMul", ["layernorm_out", "matmul_k_weight"], ["matmul_k_out"], "matmul_k"),
-        helper.make_node("Add", ["matmul_k_out", "add_k_weight"], ["add_k_out"], "add_k"),
+        helper.make_node("Add", reverse_if(["matmul_k_out", "add_k_weight"], switch_add_inputs), ["add_k_out"], "add_k"),
         helper.make_node("Reshape", ["add_k_out", "reshape_weight_1"], ["reshape_k_out"], "reshape_k"),
         helper.make_node("Transpose", ["reshape_k_out"], ["transpose_k_out"], "transpose_k", perm=[0, 2, 3, 1]),
 
@@ -60,7 +70,7 @@ def create_bert_attention(input_hidden_size=16, pruned_num_heads=2, pruned_head_
         # qk nodes
         helper.make_node("MatMul", ["transpose_q_out", "transpose_k_out"], ["matmul_qk_out"], "matmul_qk"),
         helper.make_node("Div", ["matmul_qk_out", "div_weight"], ["div_qk_out"], "div_qk"),
-        helper.make_node("Add", ["div_qk_out", "mul_mask_out"], ["add_qk_out"], "add_qk"),
+        helper.make_node("Add", reverse_if(["div_qk_out", "mul_mask_out"], switch_add_inputs), ["add_qk_out"], "add_qk"),
         helper.make_node("Softmax", ["add_qk_out"], ["softmax_qk_out"], "softmax_qk", axis=3),
 
         # v nodes
@@ -74,8 +84,8 @@ def create_bert_attention(input_hidden_size=16, pruned_num_heads=2, pruned_head_
         helper.make_node("Transpose", ["matmul_qkv_1_out"], ["transpose_qkv_out"], "transpose_qkv", perm=[0, 2, 1, 3]),
         helper.make_node("Reshape", ["transpose_qkv_out", "reshape_weight_2"], ["reshape_qkv_out"], "reshape_qkv"),
         helper.make_node("MatMul", ["reshape_qkv_out", "matmul_qkv_weight"], ["matmul_qkv_2_out"], "matmul_qkv_2"),
-        helper.make_node("Add", ["matmul_qkv_2_out", "add_qkv_weight"], ["add_qkv_out"], "add_qkv"),
-        helper.make_node("Add", ["add_qkv_out", "layernorm_out"], ["skip_output"], "add_skip"),
+        helper.make_node("Add", reverse_if(["matmul_qkv_2_out", "add_qkv_weight"], switch_add_inputs), ["add_qkv_out"], "add_qkv"),
+        helper.make_node("Add", reverse_if(["add_qkv_out", "layernorm_out"], switch_add_inputs), ["skip_output"], "add_skip"),
         helper.make_node("LayerNormalization", ["skip_output", "layer_norm_weight", "layer_norm_bias"], ["output"],
                          "layernorm2",
                          axis=-1,
@@ -127,6 +137,7 @@ def create_bert_attention(input_hidden_size=16, pruned_num_heads=2, pruned_head_
     model = helper.make_model(graph)
     return model
 
+
 def create_tf2onnx_attention_3d(input_hidden_size=16, num_heads=4, head_size=4, use_float_mask=False):
     # unsqueeze in opset version 13 has two inputs (axis is moved from attribute to input).
     has_unsqueeze_two_inputs = (version.parse(onnx.__version__) >= version.parse('1.8.0'))
@@ -143,7 +154,7 @@ def create_tf2onnx_attention_3d(input_hidden_size=16, num_heads=4, head_size=4,
         # q nodes
         helper.make_node("Einsum", ["layernorm_out", "einsum_q_weight"], ["einsum_q_out"], "einsum_q", equation="abc,cde->abde"),
         helper.make_node("Add", ["einsum_q_out", "add_q_weight"], ["add_q_out"], "add_q"),
-        
+
         # k nodes
         helper.make_node("Einsum", ["layernorm_out", "einsum_k_weight"], ["einsum_k_out"], "einsum_k", equation="abc,cde->abde"),
         helper.make_node("Add", ["einsum_k_out", "add_k_weight"], ["add_k_out"], "add_k"),
@@ -229,5 +240,7 @@ def create_tf2onnx_attention_3d(input_hidden_size=16, num_heads=4, head_size=4,
 if __name__ == "__main__":
     model = create_bert_attention()
     onnx.save(model, "pruned_bert_attention.onnx")
+    model = create_bert_attention(switch_add_inputs=True)
+    onnx.save(model, "bert_attention_reverse_add_order.onnx")
     model = create_tf2onnx_attention_3d()
-    onnx.save(model, "bert_3d_attention.onnx")
\ No newline at end of file
+    onnx.save(model, "bert_3d_attention.onnx")
diff --git a/onnxruntime/python/tools/transformers/test/test_attention_fusion.py b/onnxruntime/python/tools/transformers/test/test_attention_fusion.py
index 2543ce06ce..da70e03ac9 100644
--- a/onnxruntime/python/tools/transformers/test/test_attention_fusion.py
+++ b/onnxruntime/python/tools/transformers/test/test_attention_fusion.py
@@ -28,7 +28,21 @@ class TestFusion(unittest.TestCase):
                                            'pruned_attention_opt.onnx')
         expected = onnx.load(expected_model_path)
         self.assertEqual(str(optimized_model.model.graph), str(expected.graph))
-    
+
+    def test_attention_fusion_reverse_add_order(self):
+        model = create_bert_attention(switch_add_inputs=True)
+        dir = '.'
+        model_path = os.path.join(dir, "bert_attention_reverse_add_order.onnx")
+        onnx.save(model, model_path)
+        optimized_model = optimize_model(model_path)
+        os.remove(model_path)
+
+        # Reversing the Add input order must produce the same optimized model
+        expected_model_path = os.path.join(os.path.dirname(__file__), 'test_data', 'fusion',
+                                           'pruned_attention_opt.onnx')
+        expected = onnx.load(expected_model_path)
+        self.assertEqual(str(optimized_model.model.graph), str(expected.graph))
+
     def test_3d_attention_fusion_tf2onnx_model(self):
         model = create_tf2onnx_attention_3d()
         dir = '.'
diff --git a/onnxruntime/test/framework/session_state_test.cc b/onnxruntime/test/framework/session_state_test.cc
index e127a6cad8..42bd593b61 100644
--- a/onnxruntime/test/framework/session_state_test.cc
+++ b/onnxruntime/test/framework/session_state_test.cc
@@ -20,6 +20,12 @@
 #include "gtest/gtest.h"
 #include "test/test_environment.h"
 
+#ifdef USE_CUDA
+#include "core/providers/cuda/cuda_execution_provider.h"
+#elif USE_ROCM
+#include "core/providers/rocm/rocm_execution_provider.h"
+#endif
+
 using namespace ONNX_NAMESPACE;
 using namespace std;
 namespace onnxruntime {
@@ -173,6 +179,94 @@ TEST_P(SessionStateTestP, TestInitializerProcessing) {
   }
 }
 
+#if defined(USE_CUDA) || defined(USE_ROCM)
+static void TestCPUNodePlacement(const std::basic_string<ORTCHAR_T>& model_uri,
+                                 const std::unordered_set<std::string>& expected_cpu_nodes,
+                                 const std::unordered_set<std::string>& expected_gpu_nodes) {
+  std::shared_ptr<Model> model;
+  ASSERT_STATUS_OK(Model::Load(model_uri, model, nullptr, DefaultLoggingManager().DefaultLogger()));
+  Graph& graph = model->MainGraph();
+
+  ExecutionProviders execution_providers;
+#if defined(USE_CUDA)
+  CUDAExecutionProviderInfo cuda_epi;
+  ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, std::make_unique<CUDAExecutionProvider>(cuda_epi)));
+#elif defined(USE_ROCM)
+  ROCMExecutionProviderInfo rocm_epi;
+  ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kRocmExecutionProvider, std::make_unique<ROCMExecutionProvider>(rocm_epi)));
+#endif
+  // add CPU EP
+  CPUExecutionProviderInfo epi;
+  ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique<CPUExecutionProvider>(epi)));
+
+  KernelRegistryManager krm;
+  ASSERT_STATUS_OK(krm.RegisterKernels(execution_providers));
+
+  DataTransferManager dtm;
+  profiling::Profiler profiler;
+
+  SessionState session_state(graph, execution_providers, false, nullptr, nullptr, dtm,
+                             DefaultLoggingManager().DefaultLogger(), profiler);
+
+  // Partition the graph. Here, the graph partitioner assigns EPs to the nodes
+  GraphPartitioner partitioner(krm, execution_providers);
+  ASSERT_STATUS_OK(partitioner.Partition(graph, session_state.ExportDll(), session_state.GetMutableFuncMgr()));
+
+  // check which nodes are assigned to CPU and GPU
+  for (auto& node : graph.Nodes()) {
+    // assert that EP is assigned
+    ASSERT_TRUE(!node.GetExecutionProviderType().empty());
+    auto& ep = node.GetExecutionProviderType();
+    if (ep == onnxruntime::kCudaExecutionProvider || ep == onnxruntime::kRocmExecutionProvider) {
+      ASSERT_TRUE(expected_gpu_nodes.count(node.Name())) << "Node not found in expected gpu nodes: " << node.Name();
+    } else if (ep == onnxruntime::kCpuExecutionProvider) {
+      ASSERT_TRUE(expected_cpu_nodes.count(node.Name())) << "Node not found in expected cpu nodes: " << node.Name();
+    } else {
+      ASSERT_TRUE(false) << "Invalid execution provider assigned to node: " << node.Name() << " , value: " << ep;
+    }
+  }
+}
+
+TEST(SessionStateTest, CPUPlacementTest0) {
+  std::unordered_set<std::string> expected_cpu_nodes = {"reshape", "shape1", "const1", "mul", "equal", "where"};
+  std::unordered_set<std::string> expected_gpu_nodes = {"shape0", "expand"};
+  TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_0.onnx"), expected_cpu_nodes, expected_gpu_nodes);
+}
+TEST(SessionStateTest, CPUPlacementTest1) {
+  std::unordered_set<std::string> expected_cpu_nodes = {"const1"};
+  std::unordered_set<std::string> expected_gpu_nodes = {"shape0", "expand"};
+  TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_1.onnx"), expected_cpu_nodes, expected_gpu_nodes);
+}
+TEST(SessionStateTest, CPUPlacementTest2) {
+  std::unordered_set<std::string> expected_cpu_nodes = {"range"};
+  std::unordered_set<std::string> expected_gpu_nodes = {"size0", "reduce"};
+  TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_2.onnx"), expected_cpu_nodes, expected_gpu_nodes);
+}
+TEST(SessionStateTest, CPUPlacementTest3) {
+  std::unordered_set<std::string> expected_cpu_nodes = {"range0", "range1"};
+  std::unordered_set<std::string> expected_gpu_nodes = {"size0", "reduce0", "identity", "size1", "reduce1", "sum"};
+  TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_3.onnx"), expected_cpu_nodes, expected_gpu_nodes);
+}
+TEST(SessionStateTest, CPUPlacementTest4) {
+  // Currently, the behaviour is different for the ROCm and CUDA EPs as the ROCm EP is missing a valid kernel
+  // for ReduceSum for int64 type. This causes the backward trace in GetCpuPreferredNodes to stop
+  // earlier. The expected values can be modified to match CUDA once the ROCm EP kernel is updated.
+#if defined(USE_CUDA)
+  std::unordered_set<std::string> expected_cpu_nodes = {"range", "reduce", "const1"};
+  std::unordered_set<std::string> expected_gpu_nodes = {"size0", "expand"};
+#elif defined(USE_ROCM)
+  std::unordered_set<std::string> expected_cpu_nodes = {"const1", "reduce"};
+  std::unordered_set<std::string> expected_gpu_nodes = {"size0", "expand", "range"};
+#endif
+  TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_4.onnx"), expected_cpu_nodes, expected_gpu_nodes);
+}
+TEST(SessionStateTest, CPUPlacementTest5) {
+  std::unordered_set<std::string> expected_cpu_nodes = {"gather0", "gather1", "concat"};
+  std::unordered_set<std::string> expected_gpu_nodes = {"shape0", "shape1", "reshape"};
+  TestCPUNodePlacement(ORT_TSTR("testdata/cpu_fallback_pattern_5.onnx"), expected_cpu_nodes, expected_gpu_nodes);
+}
+#endif
+
 // Test that we allocate memory for an initializer from non-arena memory even if we provide an arena-based allocator
 // if the relevant session option config flag is set
 // For this test we need to enable the arena-based allocator which is not supported on x86 builds, so
diff --git a/onnxruntime/test/testdata/cpu_fallback_pattern_0.onnx b/onnxruntime/test/testdata/cpu_fallback_pattern_0.onnx
new file mode 100644
index 0000000000..4186edb736
Binary files /dev/null and b/onnxruntime/test/testdata/cpu_fallback_pattern_0.onnx differ
diff --git a/onnxruntime/test/testdata/cpu_fallback_pattern_1.onnx b/onnxruntime/test/testdata/cpu_fallback_pattern_1.onnx
new file mode 100644
index 0000000000..81e7abb77c
Binary files /dev/null and b/onnxruntime/test/testdata/cpu_fallback_pattern_1.onnx differ
diff --git a/onnxruntime/test/testdata/cpu_fallback_pattern_2.onnx b/onnxruntime/test/testdata/cpu_fallback_pattern_2.onnx
new file mode 100644
index 0000000000..6a6f25b0ba
Binary files /dev/null and b/onnxruntime/test/testdata/cpu_fallback_pattern_2.onnx differ
diff --git a/onnxruntime/test/testdata/cpu_fallback_pattern_3.onnx b/onnxruntime/test/testdata/cpu_fallback_pattern_3.onnx
new file mode 100644
index 0000000000..5285a240d2
Binary files /dev/null and b/onnxruntime/test/testdata/cpu_fallback_pattern_3.onnx differ
diff --git a/onnxruntime/test/testdata/cpu_fallback_pattern_4.onnx b/onnxruntime/test/testdata/cpu_fallback_pattern_4.onnx
new file mode 100644
index 0000000000..be93737c4f
Binary files /dev/null and b/onnxruntime/test/testdata/cpu_fallback_pattern_4.onnx differ
diff --git a/onnxruntime/test/testdata/cpu_fallback_pattern_5.onnx b/onnxruntime/test/testdata/cpu_fallback_pattern_5.onnx
new file mode 100644
index 0000000000..f2d866badb
Binary files /dev/null and b/onnxruntime/test/testdata/cpu_fallback_pattern_5.onnx differ
diff --git a/onnxruntime/test/testdata/cpu_fallback_test_gen.py b/onnxruntime/test/testdata/cpu_fallback_test_gen.py
new file mode 100644
index 0000000000..8d8ec94639
--- /dev/null
+++ b/onnxruntime/test/testdata/cpu_fallback_test_gen.py
@@ -0,0 +1,170 @@
+import onnx
+from onnx import helper
+from onnx import TensorProto
+from onnx import shape_inference
+import numpy as np
+
+graph_def_0 = helper.make_graph(  # pattern 0: a chain computed from Shape(A) ends up as the second input of Expand(B, ...)
+    nodes=[
+        helper.make_node(op_type="Shape", inputs=['A'], outputs=['A_shape'], name='shape0'),
+        helper.make_node(op_type="Reshape", inputs=['A_shape', 'shape'], outputs=['A_reshaped'], name='reshape'),
+        helper.make_node(op_type="Shape", inputs=['A_reshaped'], outputs=['A_shape1'], name='shape1'),
+        helper.make_node(op_type="ConstantOfShape", inputs=['A_shape1'], outputs=['const1'], name='const1', value=helper.make_tensor('val', TensorProto.INT64,
+            [1], [1])),
+        helper.make_node(op_type="Mul", inputs=['const1', 'neg_one'], outputs=['mul'], name='mul'),
+        helper.make_node(op_type="Equal", inputs=['A_reshaped', 'mul'], outputs=['equal'], name='equal'),
+        helper.make_node(op_type="Where", inputs=['equal', 'const1', 'A_reshaped'], outputs=['where'], name='where'),
+        helper.make_node(op_type="Expand", inputs=['B','where'], outputs=['C'], name='expand'),
+        # 'where' is computed only from Shape(A) and the initializers below; B enters the graph solely at Expand
+    ],
+    name='test-model',
+    inputs=[
+        # inputs are declared with shape=None, i.e. no shape is specified (not even symbolic dims)
+        helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
+        helper.make_tensor_value_info("B", TensorProto.FLOAT, None),
+    ],
+    outputs=[
+        helper.make_tensor_value_info('C', TensorProto.FLOAT, None)
+    ],
+    initializer=[
+        helper.make_tensor('shape', TensorProto.INT64, [1], [-1]),  # second input of 'reshape'
+        helper.make_tensor('neg_one', TensorProto.INT64, [1], [-1]),  # multiplier used by 'mul'
+    ])
+
+model = helper.make_model(graph_def_0, opset_imports=[helper.make_operatorsetid("", 12)])  # default domain, opset 12
+onnx.save_model(model, "cpu_fallback_pattern_0.onnx")  # relative path: written to the current working directory
+
+graph_def_1 = helper.make_graph(  # pattern 1: Shape(A) -> ConstantOfShape -> second input of Expand(B, ...)
+    nodes=[
+        helper.make_node(op_type="Shape", inputs=['A'], outputs=['A_shape'], name='shape0'),
+        helper.make_node(op_type="ConstantOfShape", inputs=['A_shape'], outputs=['const1'], name='const1', value=helper.make_tensor('val', TensorProto.INT64,
+            [1], [1])),
+        helper.make_node(op_type="Expand", inputs=['B','const1'], outputs=['C'], name='expand'),
+        # 'const1' is an INT64 tensor filled with 1 whose shape is given by 'A_shape'
+    ],
+    name='test-model',
+    inputs=[
+        # inputs are declared with shape=None, i.e. no shape is specified (not even symbolic dims)
+        helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
+        helper.make_tensor_value_info("B", TensorProto.FLOAT, None),
+    ],
+    outputs=[
+        helper.make_tensor_value_info('C', TensorProto.FLOAT, None)
+    ],
+    initializer=[])  # this pattern needs no constant inputs
+
+model = helper.make_model(graph_def_1, opset_imports=[helper.make_operatorsetid("", 12)])  # default domain, opset 12
+onnx.save_model(model, "cpu_fallback_pattern_1.onnx")  # relative path: written to the current working directory
+
+
+graph_def_2 = helper.make_graph(  # pattern 2: Size(A) -> Range -> second input of ReduceSum(B, ...)
+    nodes=[
+        helper.make_node(op_type="Size", inputs=['A'], outputs=['A_size'], name='size0'),
+        helper.make_node(op_type="Range", inputs=['zero', 'A_size', 'two'], outputs=['range'], name='range'),
+        helper.make_node(op_type="ReduceSum", inputs=['B', 'range'], outputs=['C'], name='reduce'),  # 'range' is passed as an input, not an attribute
+    ],
+    name='test-model',
+    inputs=[
+        # inputs are declared with shape=None, i.e. no shape is specified (not even symbolic dims)
+        helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
+        helper.make_tensor_value_info("B", TensorProto.FLOAT, None),
+    ],
+    outputs=[
+        helper.make_tensor_value_info('C', TensorProto.FLOAT, None)
+    ],
+    initializer=[
+        helper.make_tensor('zero', TensorProto.INT64, [], [0]),  # scalar (dims=[]): first input of 'range'
+        helper.make_tensor('two', TensorProto.INT64, [], [2]),  # scalar (dims=[]): third input of 'range'
+    ])
+
+model = helper.make_model(graph_def_2, opset_imports=[helper.make_operatorsetid("", 13)])  # default domain, opset 13
+onnx.save_model(model, "cpu_fallback_pattern_2.onnx")  # relative path: written to the current working directory
+
+
+graph_def_3 = helper.make_graph(  # pattern 3: two Size -> Range -> ReduceSum chains, the second fed by the first's output
+    nodes=[
+        helper.make_node(op_type="Size", inputs=['A'], outputs=['size0'], name='size0'),
+        helper.make_node(op_type="Range", inputs=['zero', 'size0', 'two'], outputs=['range0'], name='range0'),
+        helper.make_node(op_type="ReduceSum", inputs=['B', 'range0'], outputs=['reduce0'], name='reduce0'),
+
+        helper.make_node(op_type="Identity", inputs=['reduce0'], outputs=['reduce0_cpy'], name='identity'),
+        # second chain: Size of the Identity copy of 'reduce0' drives the Range passed to another ReduceSum over B
+        helper.make_node(op_type="Size", inputs=['reduce0_cpy'], outputs=['size1'], name='size1'),
+        helper.make_node(op_type="Range", inputs=['zero', 'size1', 'two'], outputs=['range1'], name='range1'),
+        helper.make_node(op_type="ReduceSum", inputs=['B', 'range1'], outputs=['reduce1'], name='reduce1'),
+
+        helper.make_node(op_type="Sum", inputs=['reduce0', 'reduce1'], outputs=['C'], name='sum'),  # combines both chains
+
+    ],
+    name='test-model',
+    inputs=[
+        # inputs are declared with shape=None, i.e. no shape is specified (not even symbolic dims)
+        helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
+        helper.make_tensor_value_info("B", TensorProto.FLOAT, None),
+    ],
+    outputs=[
+        helper.make_tensor_value_info('C', TensorProto.FLOAT, None)
+    ],
+    initializer=[
+        helper.make_tensor('zero', TensorProto.INT64, [], [0]),  # scalar (dims=[]): first input of both Range nodes
+        helper.make_tensor('two', TensorProto.INT64, [], [2]),  # scalar (dims=[]): third input of both Range nodes
+    ])
+
+model = helper.make_model(graph_def_3, opset_imports=[helper.make_operatorsetid("", 13)])  # default domain, opset 13
+onnx.save_model(model, "cpu_fallback_pattern_3.onnx")  # relative path: written to the current working directory
+
+graph_def_4 = helper.make_graph(  # pattern 4: Size -> Range -> ReduceSum(B) -> ConstantOfShape -> second input of Expand(C, ...)
+    nodes=[
+        helper.make_node(op_type="Size", inputs=['A'], outputs=['A_size'], name='size0'),
+        helper.make_node(op_type="Range", inputs=['zero', 'A_size', 'two'], outputs=['range'], name='range'),
+        helper.make_node(op_type="ReduceSum", inputs=['B', 'range'], outputs=['reduce'], name='reduce'),
+        helper.make_node(op_type="ConstantOfShape", inputs=['reduce'], outputs=['const1'], name='const1', value=helper.make_tensor('val', TensorProto.INT64,
+            [1], [1])),
+        helper.make_node(op_type="Expand", inputs=['C','const1'], outputs=['D'], name='expand'),
+        # NOTE(review): 'B' is declared INT64 below, presumably so that 'reduce' can feed ConstantOfShape - confirm
+    ],
+    name='test-model',
+    inputs=[
+        # inputs are declared with shape=None, i.e. no shape is specified (not even symbolic dims)
+        helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
+        helper.make_tensor_value_info("B", TensorProto.INT64, None),
+        helper.make_tensor_value_info("C", TensorProto.FLOAT, None),
+    ],
+    outputs=[
+        helper.make_tensor_value_info('D', TensorProto.FLOAT, None)
+    ],
+    initializer=[
+        helper.make_tensor('zero', TensorProto.INT64, [], [0]),  # scalar (dims=[]): first input of 'range'
+        helper.make_tensor('two', TensorProto.INT64, [], [2]),  # scalar (dims=[]): third input of 'range'
+    ])
+
+model = helper.make_model(graph_def_4, opset_imports=[helper.make_operatorsetid("", 13)])  # default domain, opset 13
+onnx.save_model(model, "cpu_fallback_pattern_4.onnx")  # relative path: written to the current working directory
+
+graph_def_5 = helper.make_graph(  # pattern 5: Shape/Gather on A and on B are concatenated into the second input of Reshape(C, ...)
+    nodes=[
+        helper.make_node(op_type="Shape", inputs=['A'], outputs=['A_shape'], name='shape0'),
+        helper.make_node(op_type="Gather", inputs=['A_shape', 'zero'], outputs=['batch'], name='gather0'),
+        helper.make_node(op_type="Concat", inputs=['batch', 'seq_len'], outputs=['shape'], name='concat', axis=0),
+        helper.make_node(op_type="Shape", inputs=['B'], outputs=['B_shape'], name='shape1'),
+        helper.make_node(op_type="Gather", inputs=['B_shape', 'one'], outputs=['seq_len'], name='gather1'),
+        helper.make_node(op_type="Reshape", inputs=['C','shape'], outputs=['D'], name='reshape'),
+        # NOTE(review): 'concat' consumes 'seq_len' but is listed before 'gather1', which produces it - confirm this order is intentional
+    ],
+    name='test-model',
+    inputs=[
+        # inputs are declared with shape=None, i.e. no shape is specified (not even symbolic dims)
+        helper.make_tensor_value_info("A", TensorProto.FLOAT, None),
+        helper.make_tensor_value_info("B", TensorProto.INT64, None),
+        helper.make_tensor_value_info("C", TensorProto.FLOAT, None),
+    ],
+    outputs=[
+        helper.make_tensor_value_info('D', TensorProto.FLOAT, None)
+    ],
+    initializer=[
+        helper.make_tensor('zero', TensorProto.INT64, [1], [0]),  # one-element index tensor used by 'gather0'
+        helper.make_tensor('one', TensorProto.INT64, [1], [1]),  # one-element index tensor used by 'gather1'
+    ])
+
+model = helper.make_model(graph_def_5, opset_imports=[helper.make_operatorsetid("", 13)])  # default domain, opset 13
+onnx.save_model(model, "cpu_fallback_pattern_5.onnx")  # relative path: written to the current working directory