diff --git a/dockerfiles/Dockerfile.openvino b/dockerfiles/Dockerfile.openvino
index 7f1238818c..958d31bc15 100644
--- a/dockerfiles/Dockerfile.openvino
+++ b/dockerfiles/Dockerfile.openvino
@@ -7,7 +7,7 @@ FROM ubuntu:16.04
 
 RUN apt update && \
     apt -y install git sudo wget \
-    zip x11-apps lsb-core cpio libboost-python-dev libpng-dev zlib1g-dev libnuma1 ocl-icd-libopencl1 clinfo libboost-filesystem1.58.0 libboost-thread1.58.0 protobuf-compiler libprotoc-dev libusb-1.0-0-dev
+    zip x11-apps lsb-core cpio libboost-python-dev libpng-dev zlib1g-dev libnuma1 ocl-icd-libopencl1 clinfo libboost-filesystem1.58.0 libboost-thread1.58.0 protobuf-compiler libprotoc-dev libusb-1.0-0-dev autoconf automake libtool
 
 ARG DEVICE=CPU_FP32
 ARG ONNXRUNTIME_REPO=https://github.com/microsoft/onnxruntime
diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h
index f08df584ba..12245aeb51 100644
--- a/include/onnxruntime/core/graph/graph.h
+++ b/include/onnxruntime/core/graph/graph.h
@@ -1106,6 +1106,14 @@ class Graph {
   // Graph value_info.
   std::vector<const NodeArg*> value_info_;
 
+  // Node names that GenerateNodeName has already handed out.
+  // A newly generated node name must not collide with any entry in this set.
+  std::unordered_set<std::string> generated_node_names_;
+
+  // NodeArg names that GenerateNodeArgName has already handed out.
+  // A newly generated node_arg name must not collide with any entry in this set.
+  std::unordered_set<std::string> generated_node_arg_names_;
+
   // All node args owned by <*this> graph. Key is node arg name.
   std::unordered_map<std::string, std::unique_ptr<NodeArg>> node_args_;
 
diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc
index a5e4c3c73f..d939d25876 100644
--- a/onnxruntime/core/graph/graph.cc
+++ b/onnxruntime/core/graph/graph.cc
@@ -2378,28 +2378,57 @@ Node& Graph::AddNode(const NodeProto& node_proto,
 }
 
 std::string Graph::GenerateNodeArgName(const std::string& base_name) {
-  std::string new_name;
-  do {
+  std::string new_name = base_name;
+  // Try base_name itself first. A candidate is rejected when it is already a key
+  // of node_args_ or was previously returned by this function
+  // (generated_node_arg_names_). On rejection, build "<base_name>_token_<counter>"
+  // and retry; the accepted name is recorded below so it is never returned twice.
+  while (node_args_.find(new_name) != node_args_.end() || 
+      generated_node_arg_names_.find(new_name) != generated_node_arg_names_.end()) {
     std::ostringstream str;
-    str << base_name << "_" << name_generator_++;
+    str << base_name << "_token_" << name_generator_++;
     new_name = str.str();
-  } while (node_args_.find(new_name) != node_args_.end());
+  }
+
+  generated_node_arg_names_.insert(new_name);
   return new_name;
 }
 
 std::string Graph::GenerateNodeName(const std::string& base_name) {
-  std::string new_name;
-  bool keep_going = true;
+  // Returns true when `name` is neither the name of an existing (non-null) node in nodes_
+  // nor a name already handed out by this function. Takes a reference to avoid a copy per probe.
+  auto name_is_ok = [&] (const std::string& name) {
+    for (auto it = nodes_.begin(); it != nodes_.end(); ++it) {
+      if (*it == nullptr) {
+        continue;
+      }
+      if (it->get()->Name() != name) {
+        continue;
+      }
+      // An existing node already carries this name, so it cannot be reused.
+      return false;
+    }
 
-  do {
+    if (generated_node_names_.find(name) != generated_node_names_.end()) {
+      // This name was generated earlier, so it cannot be reused.
+      return false;
+    }
+
+    // No node and no earlier generated name matches; the name is free.
+    return true;
+  };
+
+  // Try base_name itself first; fall back to "<base_name>_token_<counter>" on conflict.
+  std::string new_name = base_name;
+
+  while (!name_is_ok(new_name)) {
     std::ostringstream str;
-    str << base_name << "_" << name_generator_++;
+    str << base_name << "_token_" << name_generator_++;
     new_name = str.str();
+  }
 
-    keep_going = std::find_if(nodes_.cbegin(), nodes_.cend(), [&new_name](const std::unique_ptr<Node>& n) {
-                   return (n != nullptr) && (n->Name() == new_name);
-                 }) != nodes_.end();
-  } while (keep_going);
+  // Record the accepted name so later calls cannot return it again.
+  generated_node_names_.insert(new_name);
 
   return new_name;
 }
diff --git a/onnxruntime/core/optimizer/fast_gelu_fusion.cc b/onnxruntime/core/optimizer/fast_gelu_fusion.cc
index ca122f79ba..c71555bb83 100644
--- a/onnxruntime/core/optimizer/fast_gelu_fusion.cc
+++ b/onnxruntime/core/optimizer/fast_gelu_fusion.cc
@@ -108,7 +108,7 @@ MatchResult FastGeluFusion::CheckFirstFormula(Graph& graph, Node& mul1_node,
 MatchResult FastGeluFusion::CheckSecondFormula(Graph& graph, Node& pow1_node,
                                                std::vector<std::reference_wrapper<Node>>& nodes_to_fuse) const {
   MatchResult matchResult{false, nullptr, nullptr};
-  if (!graph_utils::IsSupportedOptypeVersionAndDomain(pow1_node, "Pow", {7}) ||
+  if (!graph_utils::IsSupportedOptypeVersionAndDomain(pow1_node, "Pow", {7, 12}) ||
       !graph_utils::IsSupportedProvider(pow1_node, GetCompatibleExecutionProviders()) ||
       pow1_node.GetOutputEdgesCount() != 1 ||
       !IsSupportedDataType(pow1_node)) {
diff --git a/onnxruntime/core/optimizer/layer_norm_fusion.cc b/onnxruntime/core/optimizer/layer_norm_fusion.cc
index 327639f425..dddb62bf80 100644
--- a/onnxruntime/core/optimizer/layer_norm_fusion.cc
+++ b/onnxruntime/core/optimizer/layer_norm_fusion.cc
@@ -169,10 +169,10 @@ Status LayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level,
       continue;
     }
     nodes_to_remove.push_back(reduce_mean2_node);
-
+    
     // Traceback the reduceMean node to find pow --> reduceMean
     Node& pow_node = *graph.GetNode(reduce_mean2_node.InputNodesBegin()->Index());
-    if (!graph_utils::IsSupportedOptypeVersionAndDomain(pow_node, "Pow", {7}) ||
+    if (!graph_utils::IsSupportedOptypeVersionAndDomain(pow_node, "Pow", {7, 12}) ||
         pow_node.GetExecutionProviderType() != reduce_mean_node.GetExecutionProviderType() ||
         pow_node.GetOutputEdgesCount() != 1 ||
         !IsSupportedDataType(pow_node)) {
diff --git a/onnxruntime/core/optimizer/reshape_fusion.cc b/onnxruntime/core/optimizer/reshape_fusion.cc
index 5d39939d30..399dcf744c 100644
--- a/onnxruntime/core/optimizer/reshape_fusion.cc
+++ b/onnxruntime/core/optimizer/reshape_fusion.cc
@@ -46,7 +46,7 @@ each of which is a constant initializer or a Shape->Gather->Unsqueeze chain with
 index corresponding to the index of the argument.)
 
 Before fusion:
-   [Sub-graph    Root    Node ]
+   [Sub-graph    Root]
     |        /                  \
     |    Shape                   Shape
     |       |                      |
@@ -61,13 +61,14 @@ Before fusion:
          Reshape
 
 After fusion:
-    [Sub-graph Root Node]   (Constant Initializer)
+    [Sub-graph Root]   (Constant Initializer)
                   \         [0, a, 0, b]
                    \        /
                     Reshape
 */
 bool ReshapeFusion::Fuse_Subgraph1(Node& reshape, Graph& graph, const logging::Logger& logger) {
-  const Node* p_root = graph_utils::GetInputNode(reshape, 0);
+  // The root could be either a graph input or a node so use node arg to compare.
+  const NodeArg& root_input = *(reshape.InputDefs()[0]);
 
   const Node* p_concat = graph_utils::GetInputNode(reshape, 1);
   if (nullptr == p_concat) {
@@ -90,11 +91,8 @@ bool ReshapeFusion::Fuse_Subgraph1(Node& reshape, Graph& graph, const logging::L
   enum class NodeType { Unsqueeze, Gather, Shape };
   std::set<std::pair<NodeType, NodeIndex>> candidates_for_removal;
   for (int i = 0; i < concat_input_count; ++i) {
-    // First check if the i-th argument is an initializer.
-    // We do not check whether the initializer is constant.
-    // Some model uses constant initializer and some does not.
-    // Here we assume that no one will override the initializer using graph input.
-    if (optimizer_utils::AppendTensorFromInitializer(graph, *(concat.InputDefs()[i]), shape_value)) {
+    // First check if the i-th argument is a constant initializer.
+    if (optimizer_utils::AppendTensorFromInitializer(graph, *(concat.InputDefs()[i]), shape_value, true)) {
       continue;
     }
 
@@ -113,7 +111,8 @@ bool ReshapeFusion::Fuse_Subgraph1(Node& reshape, Graph& graph, const logging::L
     const Node& gather = edges[1]->GetNode();
     const Node& shape = edges[2]->GetNode();
 
-    if (graph_utils::GetInputNode(shape, 0) != p_root) {
+    const NodeArg& shape_input = *(shape.InputDefs()[0]);
+    if (shape_input.Name() != root_input.Name()) {
       return false;
     }
 
diff --git a/onnxruntime/core/optimizer/utils.cc b/onnxruntime/core/optimizer/utils.cc
index 4aff66352a..052f5ff67f 100644
--- a/onnxruntime/core/optimizer/utils.cc
+++ b/onnxruntime/core/optimizer/utils.cc
@@ -141,7 +141,11 @@ bool IsAttributeWithExpectedValues(const Node& node, const std::string& attr_nam
   return true;
 }
 
-bool AppendTensorFromInitializer(const Graph& graph, const NodeArg& input_arg, std::vector<int64_t>& data) {
+bool AppendTensorFromInitializer(const Graph& graph, const NodeArg& input_arg, std::vector<int64_t>& data, bool require_constant) {
+  if (require_constant && !graph_utils::IsConstantInitializer(graph, input_arg.Name(), true)) {
+    return false;
+  }
+
   const ONNX_NAMESPACE::TensorProto* tensor_proto = nullptr;
   if (!graph.GetInitializedTensor(input_arg.Name(), tensor_proto)) {
     return false;
diff --git a/onnxruntime/core/optimizer/utils.h b/onnxruntime/core/optimizer/utils.h
index 7887bfd360..0cafe0d49c 100644
--- a/onnxruntime/core/optimizer/utils.h
+++ b/onnxruntime/core/optimizer/utils.h
@@ -42,7 +42,7 @@ bool IsAttributeWithExpectedValues(const Node& node, const std::string& attr_nam
 /** Get values of an integer tensor from initializer, and append them to a vector.
 @remarks only support int32 and int64 tensor. This function does not clear vector before appending.
 */
-bool AppendTensorFromInitializer(const Graph& graph, const NodeArg& input_arg, std::vector<int64_t>& data);
+bool AppendTensorFromInitializer(const Graph& graph, const NodeArg& input_arg, std::vector<int64_t>& data, bool require_constant = true);
 
 /** Check Shape of node input or output.
 @remarks when expected dim value > 0, the dim is expected to known and match the dim value.
diff --git a/onnxruntime/core/providers/acl/nn/conv.cc b/onnxruntime/core/providers/acl/nn/conv.cc
index f95d6cd6ef..c7ccaf5f04 100644
--- a/onnxruntime/core/providers/acl/nn/conv.cc
+++ b/onnxruntime/core/providers/acl/nn/conv.cc
@@ -208,7 +208,7 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
       if(optimizable) {
         //optimized depthwise convolution
 #if defined(ACL_1902) || defined(ACL_1905)
-        auto layer = std::make_shared<arm_compute::NEDepthwiseConvolutionLayer>();
+        auto layer = std::make_shared<arm_compute::NEDepthwiseConvolutionLayer3x3>();
 #endif
 #ifdef ACL_1908
         auto layer = std::make_shared<arm_compute::NEDepthwiseConvolutionLayerOptimized>();
diff --git a/onnxruntime/core/providers/cpu/math/clip.cc b/onnxruntime/core/providers/cpu/math/clip.cc
index 75ae6d4cdd..92c7590d53 100644
--- a/onnxruntime/core/providers/cpu/math/clip.cc
+++ b/onnxruntime/core/providers/cpu/math/clip.cc
@@ -3,6 +3,7 @@
 
 #include "core/providers/cpu/math/clip.h"
 #include "core/framework/data_types_internal.h"
+#include "core/util/math_cpuonly.h"
 
 namespace onnxruntime {
 
@@ -31,6 +32,17 @@ ONNX_CPU_OPERATOR_VERSIONED_KERNEL(
 
 REG_KERNEL_NONTEMPL(Clip, 12, Clip, float, double, int8_t, uint8_t, int64_t, uint64_t);
 
+template <typename T>
+Status Clip_6<T>::Compute(OpKernelContext* ctx) const {
+  // Clamp every element of input 0 into [min_, max_]; output 0 takes the input's shape.
+  const auto* X = ctx->Input<Tensor>(0);
+  Tensor* Y = ctx->Output(0, X->Shape());
+  ConstEigenVectorMap<T> source(X->template Data<T>(), X->Shape().Size());
+  EigenVectorMap<T> clipped(Y->template MutableData<T>(), Y->Shape().Size());
+  clipped = source.cwiseMax(this->min_).cwiseMin(this->max_);
+  return Status::OK();
+}
+
 template <typename T>
 struct Clip::ComputeImpl {
   void operator()(const Tensor* X, const Tensor* min, const Tensor* max, Tensor* Y) const {
diff --git a/onnxruntime/core/providers/cpu/math/clip.h b/onnxruntime/core/providers/cpu/math/clip.h
index b365aa33fd..b7c6032c6b 100644
--- a/onnxruntime/core/providers/cpu/math/clip.h
+++ b/onnxruntime/core/providers/cpu/math/clip.h
@@ -5,7 +5,6 @@
 
 #include "core/common/common.h"
 #include "core/framework/op_kernel.h"
-#include "core/util/math_cpuonly.h"
 
 namespace onnxruntime {
 
@@ -33,15 +32,7 @@ class Clip_6 final : public clip_internal::Clip_6Base<T>, public OpKernel {
   explicit Clip_6(const OpKernelInfo& info) : clip_internal::Clip_6Base<T>(info), OpKernel(info) {
   }
 
-  Status Compute(OpKernelContext* ctx) const override {
-    const auto* X = ctx->Input<Tensor>(0);
-    Tensor* Y = ctx->Output(0, X->Shape());
-    EigenVectorMap<T>(Y->template MutableData<T>(), Y->Shape().Size()) =
-        ConstEigenVectorMap<T>(X->template Data<T>(), X->Shape().Size())
-            .cwiseMax(this->min_)
-            .cwiseMin(this->max_);
-    return Status::OK();
-  }
+  Status Compute(OpKernelContext* ctx) const override;
 };
 
 // Since version 11. Min and Max are inputs
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp
index f815793d97..d806303fa0 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp
@@ -14,8 +14,21 @@ namespace Dml::GraphDescBuilder
     // mismatch is fixed (WindowsAI: 21114358, Lotus: 1953), this workaround should be removed.
     static std::string GetFusedNodeArgNameMatchingGraph(const std::string& fusedNodeArgeName)
     {
-        // The suffix used when inserting mem copies is equal to the below, followed by an incrementing number.
-        const char* suffix = strstr(fusedNodeArgeName.c_str(), "_DmlExecutionProvider_");
+        const char* suffix = nullptr;
+        
+        // The suffix used when inserting mem copies is equal to the below, probably followed by an incrementing number.
+        if (!suffix) {
+            suffix = strstr(fusedNodeArgeName.c_str(), "_DmlExecutionProvider_");
+        }
+
+        // The suffix used when inserting mem copies is equal to the below, not followed by an incrementing number.
+        if (!suffix) {
+            suffix = strstr(fusedNodeArgeName.c_str(), "_DmlExecutionProvider");
+        }
+        
+        if (!suffix) {
+            suffix = strstr(fusedNodeArgeName.c_str(), "_token_");
+        }
 
         if (suffix)
         {
@@ -23,9 +36,9 @@ namespace Dml::GraphDescBuilder
                 fusedNodeArgeName.begin(),
                 fusedNodeArgeName.begin() + (suffix - fusedNodeArgeName.c_str())
             );
+        } else {
+            return fusedNodeArgeName;
         }
-
-        return fusedNodeArgeName;
     }
 
     const std::string& GetUniqueNodeName(const onnxruntime::Node& node)
diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc
index 67f8a2b9dd..92db7addb4 100644
--- a/onnxruntime/test/onnx/main.cc
+++ b/onnxruntime/test/onnx/main.cc
@@ -495,7 +495,6 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
       {"bitshift_left_uint16", "BitShift(11) uint16 support not enabled currently"},
       {"dropout_default", "result differs", {"onnxtip"}},
       {"dropout_random", "result differs", {"onnxtip"}},
-      {"celu", "invalid model", {"onnxtip"}},
       {"maxunpool_export_with_output_shape", "Invalid output in ONNX test. See https://github.com/onnx/onnx/issues/2398"}
   };
 
diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc
index 7ff3f75e3d..6aae2911c0 100644
--- a/onnxruntime/test/optimizer/graph_transform_test.cc
+++ b/onnxruntime/test/optimizer/graph_transform_test.cc
@@ -1116,6 +1116,27 @@ TEST_F(GraphTransformationTests, ReshapeFusionInternalReuseTest) {
   }
 }
 
+// ReshapeFusion is expected to leave this model unchanged (see reshape_fusion_gen.py for the graph).
+TEST_F(GraphTransformationTests, ReshapeFusionGraphInputsTest) {
+  auto model_uri = MODEL_FOLDER "fusion/reshape_fusion_with_graph_inputs.onnx";
+  std::shared_ptr<Model> p_model;
+  ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_));
+  Graph& graph = p_model->MainGraph();
+
+  onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
+  graph_transformation_mgr.Register(onnxruntime::make_unique<ReshapeFusion>(), TransformerLevel::Level1);
+  ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, *logger_));
+
+  // Exactly one node of each op type must still be present after the transformer ran.
+  std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
+  ASSERT_EQ(op_to_count["Shape"], 1);
+  ASSERT_EQ(op_to_count["Gather"], 1);
+  ASSERT_EQ(op_to_count["Unsqueeze"], 1);
+  ASSERT_EQ(op_to_count["Concat"], 1);
+  ASSERT_EQ(op_to_count["Reshape"], 1);
+}
+
+
 TEST_F(GraphTransformationTests, ExpandElimination) {
   auto model_uri = MODEL_FOLDER "expand_elimination.onnx";
   std::shared_ptr<Model> model;
diff --git a/onnxruntime/test/python/onnx_backend_test_series.py b/onnxruntime/test/python/onnx_backend_test_series.py
index de036c8b1f..5b9d8760ae 100644
--- a/onnxruntime/test/python/onnx_backend_test_series.py
+++ b/onnxruntime/test/python/onnx_backend_test_series.py
@@ -91,7 +91,6 @@ def create_backend_test(testname=None):
             '^test_batchnorm_epsilon_training_mode_cpu',
             '^test_batchnorm_example_old_cpu',
             '^test_batchnorm_example_training_mode_cpu',
-            '^test_celu_cpu',
             '^test_dropout_default_cpu',
             '^test_dropout_random_cpu',
             '^test_einsum_batch_diagonal_cpu',
diff --git a/onnxruntime/test/testdata/transform/fusion/reshape_fusion_gen.py b/onnxruntime/test/testdata/transform/fusion/reshape_fusion_gen.py
index 3f01ffea8f..156d9a2147 100644
--- a/onnxruntime/test/testdata/transform/fusion/reshape_fusion_gen.py
+++ b/onnxruntime/test/testdata/transform/fusion/reshape_fusion_gen.py
@@ -70,3 +70,30 @@ graph = helper.make_graph(
 
 save_model(graph, 'reshape_fusion_internal_node_is_graph_output.onnx')
 
+
+# Model where Shape reads graph input 'query' while Reshape reshapes a different graph input, 'doc_word_mask'.
+graph = helper.make_graph(
+    [ # nodes
+        helper.make_node("Shape", ["query"], ["shape0_out"], "shape0"),
+        helper.make_node("Gather", ["shape0_out", "indices0"], ["gather0_out"], "gather0", axis=0),
+        helper.make_node("Unsqueeze", ["gather0_out"], ["unsqueeze0_out"], "unsqueeze0", axes=[0]),
+        helper.make_node("Concat", ["a", "unsqueeze0_out"], ["concat_out"], "concat", axis=0),
+        helper.make_node("Reshape", ["doc_word_mask", "concat_out"], ["Result"], "reshape"),
+    ],
+    "Reshape_Fusion",  # graph name
+    [  # graph inputs
+        helper.make_tensor_value_info('query', TensorProto.FLOAT, [1, 50]),
+        helper.make_tensor_value_info('doc_word_mask', TensorProto.FLOAT, [1, 200, 50]),
+    ],
+    [  # graph outputs
+        helper.make_tensor_value_info('Result', TensorProto.FLOAT, [10, 20, 'unk']),
+    ],
+    [  # initializers
+        helper.make_tensor('a', TensorProto.INT64, [1], [-1]),
+        helper.make_tensor('indices0', TensorProto.INT64, [], [1]),
+    ]
+)
+
+save_model(graph, 'reshape_fusion_with_graph_inputs.onnx')
+
+
diff --git a/onnxruntime/test/testdata/transform/fusion/reshape_fusion_with_graph_inputs.onnx b/onnxruntime/test/testdata/transform/fusion/reshape_fusion_with_graph_inputs.onnx
new file mode 100644
index 0000000000..e3609a566d
Binary files /dev/null and b/onnxruntime/test/testdata/transform/fusion/reshape_fusion_with_graph_inputs.onnx differ
diff --git a/orttraining/orttraining/core/optimizer/insert_output_rewriter.cc b/orttraining/orttraining/core/optimizer/insert_output_rewriter.cc
index b1ef4b18e5..ceff7c3f77 100644
--- a/orttraining/orttraining/core/optimizer/insert_output_rewriter.cc
+++ b/orttraining/orttraining/core/optimizer/insert_output_rewriter.cc
@@ -25,7 +25,7 @@ Status InsertMaxPoolOutput::Apply(Graph& graph, Node& node, RewriteRuleEffect& r
 }
 
 bool InsertMaxPoolOutput::SatisfyCondition(const Graph& /*graph*/, const Node& node, const logging::Logger& /*logger*/) const {
-  if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "MaxPool", {8}) &&
+  if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "MaxPool", {8, 10, 11, 12}) &&
       node.OutputDefs().size() == 1) {
     return true;
   }