diff --git a/onnxruntime/core/providers/dnnl/dnnl_node_capability.cc b/onnxruntime/core/providers/dnnl/dnnl_node_capability.cc
index d4c72840dd..ab6fbb5f5d 100644
--- a/onnxruntime/core/providers/dnnl/dnnl_node_capability.cc
+++ b/onnxruntime/core/providers/dnnl/dnnl_node_capability.cc
@@ -191,34 +191,31 @@ bool DnnlBatchNormalizationNodeCapability::IsDimensionSupported(const Node* node
   return true;
 }
 
-// DnnlReduceMeanNodeCapability class
+// DnnlReduceNodeCapability class
 //-------------------------------------
-bool DnnlReduceMeanNodeCapability::Supported(const Node* node, const GraphViewer& graph_viewer) const {
-  ORT_UNUSED_PARAMETER(graph_viewer);
+bool DnnlReduceNodeCapability::Supported(const Node* node, const GraphViewer& graph_viewer) const {
+  // These reduction operators use elementwise ops so elementwise operators must also be supported.
+  if(node->OpType() == "ReduceLogSum" ||
+     node->OpType() == "ReduceLogSumExp" ||
+     node->OpType() == "ReduceSumSquare") {
+      if(!_eltwise.Supported(node, graph_viewer)) return false;
+  }
   if (!IsTypeSupported(node)) return false;
-  if (!IsAttributeSupported(node)) return false;
   if (!IsDimensionSupported(node)) return false;
   return true;
 }
 
-bool DnnlReduceMeanNodeCapability::IsAttributeSupported(const Node* node) const {
-  const NodeAttributes& attributes = node->GetAttributes();
-  auto attr = attributes.find("keepdims");
-  if (attr != attributes.end() && attr->second().i() == 0) {
-    return false;
-  }
-  return true;
-}
-
-bool DnnlReduceMeanNodeCapability::IsDimensionSupported(const Node* node) const {
+bool DnnlReduceNodeCapability::IsDimensionSupported(const Node* node) const {
   auto node_inputs = node->InputDefs();
   if (node_inputs[0]->Shape() != nullptr && node_inputs[0]->Shape()->dim_size() == 0) {
+    LOGS_DEFAULT(INFO) << "Reduction op not supported because input data is a scalar\n";
     return false;
   }
   return true;
 }
 
 // DnnlSoftmaxNodeCapability class
+//-------------------------------------
 bool DnnlSoftmaxNodeCapability::Supported(const Node* node, const GraphViewer& graph_viewer) const {
   ORT_UNUSED_PARAMETER(graph_viewer);
   if (!IsTypeSupported(node)) return false;
diff --git a/onnxruntime/core/providers/dnnl/dnnl_node_capability.h b/onnxruntime/core/providers/dnnl/dnnl_node_capability.h
index 0e3a3ef131..e81f80f320 100644
--- a/onnxruntime/core/providers/dnnl/dnnl_node_capability.h
+++ b/onnxruntime/core/providers/dnnl/dnnl_node_capability.h
@@ -145,22 +145,6 @@ class DnnlBatchNormalizationNodeCapability : public DnnlDefaultNodeCapability {
   bool IsDimensionSupported(const Node* node) const;
 };
 
-/**
- * Decide if a ReduceMean op is supported by DnnlExecutionProvider
- *
- * Dnnl does not support the "keepdims" attribute when it is `0`
- */
-class DnnlReduceMeanNodeCapability : public DnnlDefaultNodeCapability {
- public:
-  DnnlReduceMeanNodeCapability() : DnnlDefaultNodeCapability({type_float32}) {}
-
-  bool Supported(const Node* node, const GraphViewer& graph_viewer) const override;
-
- private:
-  bool IsAttributeSupported(const Node* node) const;
-  bool IsDimensionSupported(const Node* node) const;
-};
-
 /**
  * Decide if a Softmax op is supported by DnnlExecutionProvider
  *
@@ -249,6 +233,21 @@ class DnnlElementwiseCapability : public DnnlDefaultNodeCapability {
   bool IsDimensionSupported(const Node* node) const;
 };
 
+/**
+ * Decide if a Reduce op is supported by DnnlExecutionProvider
+ */
+class DnnlReduceNodeCapability : public DnnlDefaultNodeCapability {
+ public:
+  DnnlReduceNodeCapability() : DnnlDefaultNodeCapability({type_float32}) {}
+
+  bool Supported(const Node* node, const GraphViewer& graph_viewer) const override;
+
+ private:
+  bool IsDimensionSupported(const Node* node) const;
+  DnnlElementwiseCapability _eltwise;
+
+};
+
 class DnnlPowNodeCapability : public DnnlDefaultMultiInputNodeCapability {
  public:
   DnnlPowNodeCapability()
diff --git a/onnxruntime/core/providers/dnnl/dnnl_op_manager.cc b/onnxruntime/core/providers/dnnl/dnnl_op_manager.cc
index fa58933a47..43e3dc1508 100644
--- a/onnxruntime/core/providers/dnnl/dnnl_op_manager.cc
+++ b/onnxruntime/core/providers/dnnl/dnnl_op_manager.cc
@@ -31,7 +31,16 @@ DnnlOpManager::DnnlOpManager() {
   dnnl_ops_map_.emplace(std::make_pair("Mul", std::unique_ptr<DnnlNodeCapability>(new DnnlBinaryNodeCapability())));
   dnnl_ops_map_.emplace(std::make_pair("Pow", std::unique_ptr<DnnlNodeCapability>(new DnnlPowNodeCapability())));
   dnnl_ops_map_.emplace(std::make_pair("QAttention", std::unique_ptr<DnnlNodeCapability>(new DnnlQAttentionNodeCapability())));
-  dnnl_ops_map_.emplace(std::make_pair("ReduceMean", std::unique_ptr<DnnlNodeCapability>(new DnnlReduceMeanNodeCapability())));
+  dnnl_ops_map_.emplace(std::make_pair("ReduceL1", std::unique_ptr<DnnlNodeCapability>(new DnnlReduceNodeCapability())));
+  dnnl_ops_map_.emplace(std::make_pair("ReduceL2", std::unique_ptr<DnnlNodeCapability>(new DnnlReduceNodeCapability())));
+  dnnl_ops_map_.emplace(std::make_pair("ReduceLogSum", std::unique_ptr<DnnlNodeCapability>(new DnnlReduceNodeCapability())));
+  dnnl_ops_map_.emplace(std::make_pair("ReduceLogSumExp", std::unique_ptr<DnnlNodeCapability>(new DnnlReduceNodeCapability())));
+  dnnl_ops_map_.emplace(std::make_pair("ReduceMax", std::unique_ptr<DnnlNodeCapability>(new DnnlReduceNodeCapability())));
+  dnnl_ops_map_.emplace(std::make_pair("ReduceMean", std::unique_ptr<DnnlNodeCapability>(new DnnlReduceNodeCapability())));
+  dnnl_ops_map_.emplace(std::make_pair("ReduceMin", std::unique_ptr<DnnlNodeCapability>(new DnnlReduceNodeCapability())));
+  dnnl_ops_map_.emplace(std::make_pair("ReduceProd", std::unique_ptr<DnnlNodeCapability>(new DnnlReduceNodeCapability())));
+  dnnl_ops_map_.emplace(std::make_pair("ReduceSum", std::unique_ptr<DnnlNodeCapability>(new DnnlReduceNodeCapability())));
+  dnnl_ops_map_.emplace(std::make_pair("ReduceSumSquare", std::unique_ptr<DnnlNodeCapability>(new DnnlReduceNodeCapability())));
   dnnl_ops_map_.emplace(std::make_pair("Relu", std::unique_ptr<DnnlNodeCapability>(new DnnlElementwiseCapability())));
   dnnl_ops_map_.emplace(std::make_pair("Reshape", std::unique_ptr<DnnlNodeCapability>(new DnnlReshapeNodeCapability())));
   dnnl_ops_map_.emplace(std::make_pair("Round", std::unique_ptr<DnnlNodeCapability>(new DnnlElementwiseCapability())));
diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_qattention.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_qattention.cc
index d3f2361a0b..b08c1d1bb9 100644
--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_qattention.cc
+++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_qattention.cc
@@ -45,97 +45,53 @@ dnnl::memory DnnlQAttention::ComputeTotalScale(DnnlSubgraphPrimitive& sp, DnnlNo
 }
 
 /*
-input_tensor            weight_tensor
-
+   input_tensor            weight_tensor
          \                       /
-
           \                     /
-
            \                   /
-
             \                 /
-
-               matmulinteger 
+               matmulinteger
         with input and weight zero point,
         input and weight scale and bias
                     |
-
                     |
-
                     | QKV
-
                     |
-
                   slice
-
-                 /  |  \
-
                 /   |   \
-
                /    |    \
-
               /     |     \
-
+             /      |      \
             |Q      |K      |V
-
             |       |       |
-
          reshape  reshape  reshape
-
             |       |       |
-
          permute  permute  permute
-
             |       |       |
-
             |    transpose  |
-
-            \       |       |
-
              \      |       |
-
               \     |       |
-
                \    |       |
-
-                  matmul    |
-
-                    |       |
-
-                    |       |
-
- sqrt(head_dim)     |       |
-
-              \     |       |
-
-               \    |       |
-
                 \   |       |
-
+                  matmul    |
+                    |       |
+                    |       |
+   sqrt(head_dim)   |       |
+                \   |       |
+                 \  |       |
+                  \ |       |
                    div      |
-
-                    |       | 
-                  
-                  (mask)    |
-                  
-                    |       /
-
-                 softmax   /
-
-                    |    /
-
+                    |       |
+                  (mask)   /
+                    |     /
+                 softmax /
+                    |   /
                   matmul
-
                     |
-
                   permute
-
                     |
-
                   reshape
-
                     |
-
                   output
 */
 /*
diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_reduce.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_reduce.cc
new file mode 100644
index 0000000000..7be31bb49c
--- /dev/null
+++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_reduce.cc
@@ -0,0 +1,342 @@
+// Copyright(C) 2021 Intel Corporation
+// Licensed under the MIT License
+#include "dnnl_reduce.h"
+#include "dnnl_subgraph.h"
+#include "dnnl_subgraph_primitive.h"
+#include "core/providers/common.h"
+
+namespace onnxruntime {
+namespace ort_dnnl {
+
+DnnlReduce::DnnlReduce() {}
+
+// assume all dims are available
+void DnnlReduce::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+
+  using namespace dnnl;
+
+  // get the engine, currently only support either single gpu or single cpu device
+  auto dnnl_engine = sp.GetEngine();
+
+  enum ReduceOp {
+      ReduceL1,
+      ReduceL2,
+      ReduceLogSum,
+      ReduceLogSumExp,
+      ReduceMax,
+      ReduceMean,
+      ReduceMin,
+      ReduceProd,
+      ReduceSum,
+      ReduceSumSquare
+  };
+
+  ReduceOp reduce_op = ReduceSum;
+  dnnl::algorithm algo = dnnl::algorithm::reduction_sum;
+  if (node.OpType() == "ReduceL1") {
+    reduce_op = ReduceL1;
+    algo = dnnl::algorithm::reduction_norm_lp_power_p_sum;
+  } else if (node.OpType() == "ReduceL2") {
+    reduce_op = ReduceL2;
+    algo = dnnl::algorithm::reduction_norm_lp_sum;
+  } else if(node.OpType() == "ReduceLogSum") {
+    reduce_op = ReduceLogSum;
+    algo = dnnl::algorithm::reduction_sum;
+  } else if(node.OpType() == "ReduceLogSumExp") {
+    reduce_op = ReduceLogSumExp;
+    algo = dnnl::algorithm::reduction_sum;
+  } else if (node.OpType() == "ReduceMax") {
+    reduce_op = ReduceMax;
+    algo = dnnl::algorithm::reduction_max;
+  } else if (node.OpType() == "ReduceMean") {
+    reduce_op = ReduceMean;
+    algo = dnnl::algorithm::reduction_mean;
+  } else if (node.OpType() == "ReduceMin") {
+    reduce_op = ReduceMin;
+    algo = dnnl::algorithm::reduction_min;
+  } else if (node.OpType() == "ReduceProd") {
+    reduce_op = ReduceProd;
+    algo = dnnl::algorithm::reduction_mul;
+  } else if (node.OpType() == "ReduceSum") {
+    reduce_op = ReduceSum;
+    algo = dnnl::algorithm::reduction_sum;
+  } else if (node.OpType() == "ReduceSumSquare") {
+    reduce_op = ReduceSumSquare;
+    algo = dnnl::algorithm::reduction_sum;
+  }
+
+
+
+  auto opset = node.SinceVersion();
+  dnnl::memory::dims axes;
+  if (reduce_op == ReduceSum) {
+    // For ReduceSum opsets older than 13 the axes come in as the "axes" attribute;
+    // from opset 13 onward the axes are an optional tensor input (IN_AXES).
+    if (opset < 13) {
+      axes = ReadAxes(node);
+    } else {
+      if (node.Input(IN_AXES).Exists()) {
+        auto axes_mem = sp.GetMemory(node.Input(IN_AXES));
+        dnnl::memory::dims axes_dims = axes_mem.get_desc().dims();
+        int64_t* p_axes_data = (int64_t*)axes_mem.get_data_handle();
+        axes = std::vector<int64_t>(p_axes_data, p_axes_data + axes_dims[0]);
+      }
+    }
+  } else {
+    axes = ReadAxes(node);
+  }
+
+  auto src_mem = sp.GetMemoryInOrtFormat(node.Input(IN_DATA), dnnl_engine);
+  auto src_md = src_mem.get_desc();
+
+  if (reduce_op == ReduceSum) {
+    // If axes is empty and the noop_with_empty_axes != 0 return the IN_DATA as the output.
+    if (axes.empty()) {
+      if (NoOpWithEmptyAxes(node)) {
+        sp.SetMemory(node.Output(OUT_REDUCED), src_mem, true);
+        return;
+      }
+    }
+  }
+
+  //We need to calculate output tensor shape
+  //First we initialize it with input shape and then we modify it based on the attribute values
+  //This is because the DNNL primitive functionality is determined by the input and output shapes.
+  auto src_dims = src_md.dims();
+  auto ndim = src_dims.size();
+
+  // convert negative axis values to the positive axis
+  for (size_t i = 0; i < axes.size(); ++i) {
+    axes[i] = HandleNegativeAxis(axes[i], ndim);
+  }
+  // Handle out of order and repeating dims.
+  std::sort(axes.begin(), axes.end());
+  axes.erase(std::unique(axes.begin(), axes.end()), axes.end());
+
+  // if axes is empty change all non-zero shape dims to 1
+  if (axes.size() == 0) {
+    for (size_t i = 0; i < ndim; ++i) {
+      if (src_dims[i] != 0)
+        src_dims[i] = 1;
+    }
+  // If axes are given, set each listed non-zero dimension to 1, keeping the other dimension values untouched.
+  } else {
+    for (size_t i = 0; i < axes.size(); i++) {
+      if (src_dims[axes[i]] != 0)
+        src_dims[axes[i]] = 1;
+    }
+  }
+
+  auto dst_shape = TensorShape(src_dims.data(), ndim);
+  dnnl::memory::dims dst_dims_mkl(dst_shape.GetDims().begin(), dst_shape.GetDims().end());
+  auto dst_md = dnnl::memory::desc({dst_dims_mkl}, src_md.data_type(), dnnl::memory::format_tag::any);
+
+  // Check to see if the destination shape and source shape are the same.
+  bool src_and_dst_dims_equal = true;
+  if (src_md.dims().size() == dst_md.dims().size()) {
+    for (size_t i = 0; i < src_md.dims().size(); ++i) {
+      if (src_md.dims()[i] != dst_md.dims()[i]) {
+        src_and_dst_dims_equal = false;
+        break;
+      }
+    }
+  }
+
+  /*
+  * OneDNN will return an error if a reduction algorithm is called that does not result in a
+  * shape reduction. For this reason we have code paths that are taken if the source dimensions and
+  * destination dimensions are equal that will not call the reduction op.
+  *
+  * "ReduceLogSum" is equivalent to Log(ReduceSum(input))
+  *   - if the reduction op is called then the eltwise_log post op will be added to the reduction primitive.
+  *   - if the reduction op is not called then the eltwise_log primitive is added as its own primitive
+  *   - NOTE: "ReduceLogSum" follows the code flow of "All other reduce ops" with the exception of the added
+  *           post op and an extra check if src_dims == dest_dims.
+  * "ReduceLogSumExp" is equivalent to Log(ReduceSum(Exp(input)))
+  *   - if the reduction op is called then the eltwise_exp primitive is added before the reduction op
+  *     and the eltwise_log post op is added to the reduction primitive
+  *   - if the reduction op is not called then the input is not modified since Log(Exp(input)) == input
+  * "ReduceSumSquare" is equivalent to ReduceSum(Square(input))
+  *   - the eltwise_square primitive is added before the reduction op
+  *   - if the source and destination dimensions are not equal the reduction op is called
+  * All other reduce ops
+  *   - if the source and destination dimensions are not equal call the reduction op
+  *   - otherwise don't modify the input.
+  *
+  * After the Reduction check the "KeepDims" attribute
+  *  - if KeepDims == 1 the output is the result of the reduction op
+  *  - if KeepDims == 0 we perform a squeeze operation on the output of the reduction op
+  *  - NOTE: Even if reduction op is not called KeepDims attribute can result in the output being modified
+  */
+  dnnl::memory reduce_src_mem;
+  dnnl::memory reduce_dst_mem;
+  dnnl::primitive_attr dnnl_primitive_attr;
+  if ((reduce_op == ReduceLogSum || reduce_op == ReduceLogSumExp ) && !src_and_dst_dims_equal) {
+    dnnl::post_ops eltwise_post_op;
+    eltwise_post_op.append_eltwise(1.0f, dnnl::algorithm::eltwise_log, 1.0f, 1.0f);
+    dnnl_primitive_attr.set_post_ops(eltwise_post_op);
+  }
+
+  if (reduce_op == ReduceLogSumExp) {
+    if (!src_and_dst_dims_equal) {
+      auto elementwise_desc = dnnl::eltwise_forward::desc(dnnl::prop_kind::forward_inference, dnnl::algorithm::eltwise_exp, src_md);
+      auto elementwise_pd = dnnl::eltwise_forward::primitive_desc(elementwise_desc, dnnl_engine);
+
+      auto elementwise_dst_mem = dnnl::memory(elementwise_pd.dst_desc(), dnnl_engine);
+
+      auto elemenwise_primitive = dnnl::eltwise_forward(elementwise_pd);
+      sp.AddPrimitive(elemenwise_primitive, {{DNNL_ARG_SRC, src_mem},
+                                           {DNNL_ARG_DST, elementwise_dst_mem}});
+      auto reduce_desc = dnnl::reduction::desc(algo, src_md, dst_md, 0.f, 0.f);
+      auto reduce_pd = dnnl::reduction::primitive_desc(reduce_desc, dnnl_primitive_attr, dnnl_engine);
+
+      reduce_dst_mem = dnnl::memory(reduce_pd.dst_desc(), dnnl_engine);
+
+      auto reducemean_op = dnnl::reduction(reduce_pd);
+      sp.AddPrimitive(reducemean_op, {{DNNL_ARG_SRC, elementwise_dst_mem},
+                                      {DNNL_ARG_DST, reduce_dst_mem}});
+    } else {
+      reduce_dst_mem = src_mem;
+    }
+  } else if(reduce_op == ReduceSumSquare) {
+    auto elementwise_desc = dnnl::eltwise_forward::desc(dnnl::prop_kind::forward_inference, dnnl::algorithm::eltwise_square, src_md);
+    auto elementwise_pd = dnnl::eltwise_forward::primitive_desc(elementwise_desc, dnnl_engine);
+
+    auto elementwise_dst_mem = dnnl::memory(elementwise_pd.dst_desc(), dnnl_engine);
+
+    auto elemenwise_primitive = dnnl::eltwise_forward(elementwise_pd);
+    sp.AddPrimitive(elemenwise_primitive, {{DNNL_ARG_SRC, src_mem},
+                                           {DNNL_ARG_DST, elementwise_dst_mem}});
+    if (!src_and_dst_dims_equal) {
+      auto reduce_desc = dnnl::reduction::desc(algo, src_md, dst_md, 0.f, 0.f);
+      auto reduce_pd = dnnl::reduction::primitive_desc(reduce_desc, dnnl_engine);
+
+      reduce_dst_mem = dnnl::memory(reduce_pd.dst_desc(), dnnl_engine);
+
+      auto reducemean_op = dnnl::reduction(reduce_pd);
+      sp.AddPrimitive(reducemean_op, {{DNNL_ARG_SRC, elementwise_dst_mem},
+                                      {DNNL_ARG_DST, reduce_dst_mem}});
+    } else {
+      reduce_dst_mem = elementwise_dst_mem;
+    }
+  } else {
+    // If calculated source and destination shape are the same do not do the reduction operation.
+    if (!src_and_dst_dims_equal) {
+      float p_val = 0.f;
+      if (reduce_op == ReduceL1) {
+        p_val = 1.0f;
+      } else if (reduce_op == ReduceL2) {
+        p_val = 2.0f;
+      }
+
+      auto reduce_desc = dnnl::reduction::desc(algo, src_md, dst_md, p_val, 0.f);
+      auto reduce_pd = dnnl::reduction::primitive_desc(reduce_desc, dnnl_primitive_attr, dnnl_engine);
+
+      // If using GPU this will move the memory from the CPU to the GPU.
+      reduce_src_mem = sp.GetMemoryAndReshape(node.Input(IN_DATA), reduce_pd.src_desc(), dnnl_engine);
+      reduce_dst_mem = dnnl::memory(reduce_pd.dst_desc(), dnnl_engine);
+
+      auto reducemean_op = dnnl::reduction(reduce_pd);
+      sp.AddPrimitive(reducemean_op, {{DNNL_ARG_SRC, reduce_src_mem},
+                                      {DNNL_ARG_DST, reduce_dst_mem}});
+    } else {
+      if (reduce_op == ReduceLogSum) {
+        auto elementwise_desc = dnnl::eltwise_forward::desc(dnnl::prop_kind::forward_inference, dnnl::algorithm::eltwise_log, src_md);
+        auto elementwise_pd = dnnl::eltwise_forward::primitive_desc(elementwise_desc, dnnl_engine);
+
+        reduce_dst_mem = dnnl::memory(elementwise_pd.dst_desc(), dnnl_engine);
+
+        auto elemenwise_primitive = dnnl::eltwise_forward(elementwise_pd);
+        sp.AddPrimitive(elemenwise_primitive, {{DNNL_ARG_SRC, src_mem},
+                                               {DNNL_ARG_DST, reduce_dst_mem}});
+      } else {
+        reduce_dst_mem = src_mem;
+      }
+    }
+  }
+
+
+  // If keepdims != 0 set the output to the reduce op results
+  auto keepdims = Keepdims(node);
+  if (keepdims) {
+    if (src_and_dst_dims_equal) {
+      sp.SetMemory(node.Output(OUT_REDUCED), reduce_dst_mem, true);
+    } else {
+      sp.SetMemory(node.Output(OUT_REDUCED), reduce_dst_mem);
+    }
+  // if keepdims == 0 we do a squeeze operation on reduce output shape.
+  } else {
+    std::vector<int64_t> output_shape;
+    size_t j = 0;
+    for (size_t i = 0; i < ndim; ++i) {
+      if ((j < axes.size() && axes[j] == static_cast<int64_t>(i)) ||
+          (axes.size() == 0 && src_dims[i] == 1)) {
+        ORT_ENFORCE(src_dims[i] == 1, "Dimension of input ", i, " must be 1 instead of ", src_dims[i],
+                    ". shape=", src_dims);
+        ++j;
+        continue;
+      }
+
+      if ((j < axes.size() && axes[j] == static_cast<int64_t>(i) && src_dims[i] == 0) ||
+          (axes.size() == 0 && src_dims[i] == 0)) {
+          ORT_ENFORCE(keepdims,
+              "Can't reduce on dim with value of 0 if 'keepdims' is false. "
+              "Invalid output shape would be produced. input_shape:",
+              TensorShape(src_md.dims()));
+      }
+      output_shape.push_back(src_dims[i]);
+    }
+
+    // OneDNN does not support scalar output; if the output shape is {} change it to {1}.
+    bool is_scalar_output = false;
+    if (output_shape.empty()) {
+      output_shape.push_back(1);
+      is_scalar_output = true;
+    }
+    dnnl::memory::desc squeeze_md(output_shape, node.Input(IN_DATA).Type(), sp.GetDnnlFormat(output_shape.size()));
+    dnnl::memory squeeze_mem = dnnl::memory(squeeze_md, dnnl_engine, nullptr);
+    // If the src and dst dims differ, reduce_dst_mem was created above for the reduction, so its data handle is read here.
+    // Otherwise the data handle is obtained at runtime using the AddReshape function.
+    // Reading the data handle directly is more efficient when it is possible.
+    if (!src_and_dst_dims_equal) {
+      squeeze_mem.set_data_handle(reduce_dst_mem.get_data_handle());
+    } else {
+      sp.AddReshape(reduce_dst_mem, squeeze_mem);
+    }
+    sp.SetMemory(node.Output(OUT_REDUCED), squeeze_mem, true, is_scalar_output);
+  }
+}
+
+std::vector<int64_t> DnnlReduce::ReadAxes(DnnlNode& node) {
+  auto attr = node.Attributes().find("axes");
+  std::vector<int64_t> axes;
+  if (attr != node.Attributes().end()) {
+    auto& proto = attr->second();
+    axes.reserve(proto.ints_size());
+    for (int i = 0; i < proto.ints_size(); i++) {
+      axes.push_back(proto.ints(i));
+    }
+  }
+  return axes;
+}
+
+bool DnnlReduce::Keepdims(DnnlNode& node) {
+  auto attr = node.Attributes().find("keepdims");
+  if (attr != node.Attributes().end() && 
+      attr->second().i() == 0) {
+    return false;
+  }
+  return true;
+}
+
+bool DnnlReduce::NoOpWithEmptyAxes(DnnlNode& node) {
+  auto attr = node.Attributes().find("noop_with_empty_axes");
+  if (attr != node.Attributes().end() &&
+      attr->second().i() != 0) {
+    return true;
+  }
+  return false;
+}
+
+}  // namespace ort_dnnl
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_reducemean.h b/onnxruntime/core/providers/dnnl/subgraph/dnnl_reduce.h
similarity index 73%
rename from onnxruntime/core/providers/dnnl/subgraph/dnnl_reducemean.h
rename to onnxruntime/core/providers/dnnl/subgraph/dnnl_reduce.h
index 675fa0f0ec..b5a89fd53a 100644
--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_reducemean.h
+++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_reduce.h
@@ -8,18 +8,21 @@
 namespace onnxruntime {
 namespace ort_dnnl {
 
-class DnnlReduceMean {
+class DnnlReduce {
  public:
   enum InputTensors : int {
-    IN_X = 0
+    IN_DATA = 0,
+    IN_AXES = 1
   };
 
   enum OutputTensors : int {
-    OUT_Y = 0
+    OUT_REDUCED = 0
   };
-  DnnlReduceMean();
+  DnnlReduce();
   void CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node);
   std::vector<int64_t> ReadAxes(DnnlNode& node);
+  bool Keepdims(DnnlNode& node);
+  bool NoOpWithEmptyAxes(DnnlNode& node);
 };
 
 }  // namespace ort_dnnl
diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_reducemean.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_reducemean.cc
deleted file mode 100644
index 8f6f7c12c6..0000000000
--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_reducemean.cc
+++ /dev/null
@@ -1,75 +0,0 @@
-// Copyright(C) 2021 Intel Corporation
-// Licensed under the MIT License
-#include "dnnl_reducemean.h"
-#include "dnnl_subgraph.h"
-#include "dnnl_subgraph_primitive.h"
-
-namespace onnxruntime {
-namespace ort_dnnl {
-
-
-DnnlReduceMean::DnnlReduceMean() {}
-
-// assume all dims are available
-void DnnlReduceMean::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
-
-  using namespace dnnl;
-
-  // get the engine, currently only support either single gpu or single cpu device
-  auto dnnl_engine = sp.GetEngine();
-
-  auto axes = ReadAxes(node);
-  
-  auto reducemean_src_mem = sp.GetMemory(node.Input(IN_X));
-  auto src_md = reducemean_src_mem.get_desc();
-
-  //We need to calculate output tensor shape
-  //First we initialize it with input shape and then we modify it based on the attribute values
-  //This is because the DNNL primitive functionality is determined by the input and output shapes.
-  
-  auto src_dims = src_md.dims();
-  auto ndim = src_dims.size();
-  for (unsigned long int i = 0; i < ndim; i++) {
-    if (axes.size() == 0)
-      src_dims[i] = 1;  //If no axis is specified, then output shape is just all 1's
-    else if (i < axes.size()) {
-      if (axes[i] < 0)
-        src_dims[ndim + axes[i]] = 1;
-      else
-        src_dims[axes[i]] = 1;
-    }  //If there is axis, then make the respective dimensions 1, keeping the other dimension values untouched.
-  }
-
-  auto dst_shape = TensorShape(src_dims.data(), ndim);
-  dnnl::memory::dims dst_dims_mkl(dst_shape.GetDims().begin(), dst_shape.GetDims().end());
-  auto dst_md = dnnl::memory::desc({dst_dims_mkl}, src_md.data_type(), dnnl::memory::format_tag::any);
-
-  auto reducemean_desc = dnnl::reduction::desc(dnnl::algorithm::reduction_mean, src_md, dst_md, 0.f, 0.f);
-  auto reducemean_pd = dnnl::reduction::primitive_desc(reducemean_desc, dnnl_engine);
-
-  // If using GPU this will move the memory from the CPU to the GPU.
-  reducemean_src_mem = sp.GetMemoryAndReshape(node.Input(IN_X), reducemean_pd.src_desc(), dnnl_engine);
-  auto reducemean_dst_mem = dnnl::memory(reducemean_pd.dst_desc(), dnnl_engine);
-
-  auto reducemean_op = dnnl::reduction(reducemean_pd);
-  sp.AddPrimitive(reducemean_op, {{DNNL_ARG_SRC, reducemean_src_mem},
-                                  {DNNL_ARG_DST, reducemean_dst_mem}});
-
-  sp.SetMemory(node.Output(OUT_Y), reducemean_dst_mem);
-}
-
-std::vector<int64_t> DnnlReduceMean::ReadAxes(DnnlNode& node) {
-  auto attr = node.Attributes().find("axes");
-  std::vector<int64_t> axes;
-  if (attr != node.Attributes().end()) {
-    auto& proto = attr->second();
-    axes.reserve(proto.ints_size());
-    for (int i = 0; i < proto.ints_size(); i++) {
-      axes.push_back(proto.ints(i));
-    }
-  }
-  return axes;
-}
-
-}  // namespace ort_dnnl
-}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph.cc
index 2b56943bdc..b32402f1d0 100644
--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph.cc
+++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph.cc
@@ -175,6 +175,10 @@ NodeAttributes& DnnlNode::Attributes() {
   return *attr_;
 }
 
+int DnnlNode::SinceVersion() {
+  return onnx_node_->SinceVersion();
+}
+
 DnnlSubgraph::DnnlSubgraph(const GraphViewer& graph_viewer) : graph_viewer_(graph_viewer) {
   Build();
   is_dynamic_ = false;
diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph.h b/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph.h
index 6283bcda5d..963ed815c3 100644
--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph.h
+++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph.h
@@ -76,6 +76,7 @@ class DnnlNode {
   NodeAttributes& Attributes();
   std::vector<DnnlTensor*>& Inputs();
   std::vector<DnnlTensor*>& Outputs();
+  int SinceVersion();
 
  private:
   const Node* onnx_node_ = nullptr;
diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.cc
index 583e6151b5..6860b1039d 100644
--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.cc
+++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.cc
@@ -16,7 +16,7 @@
 #include "dnnl_pool.h"
 #include "dnnl_pow.h"
 #include "dnnl_qattention.h"
-#include "dnnl_reducemean.h"
+#include "dnnl_reduce.h"
 #include "dnnl_reshape.h"
 #include "dnnl_softmax.h"
 #include "dnnl_softmaxgrad.h"
@@ -126,6 +126,7 @@ void DnnlSubgraphPrimitive::AddKernels() {
   std::unordered_set<std::string> binary_ops = {"Add", "Div", "Mul", "Sub"};
   std::unordered_set<std::string> elementwise_ops = {"Abs", "Elu", "Exp", "LeakyRelu", "Log", "Relu", "Round", "Sigmoid", "Softplus", "Sqrt", "Tanh"};
   std::unordered_set<std::string> pool_ops = {"AveragePool", "GlobalAveragePool", "GlobalMaxPool", "MaxPool"};
+  std::unordered_set<std::string> reduce_ops = {"ReduceL1", "ReduceL2", "ReduceLogSum", "ReduceLogSumExp", "ReduceMax", "ReduceMean", "ReduceMin", "ReduceProd", "ReduceSum", "ReduceSumSquare"};
 
   auto indices = subgraph_->GetDnnlNodesInTopologicalOrder();
   for (auto index : indices) {
@@ -158,8 +159,8 @@ void DnnlSubgraphPrimitive::AddKernels() {
       DnnlPow().CreatePrimitive(*this, node);
     } else if (node.OpType() == "QAttention") {
       DnnlQAttention().CreatePrimitive(*this, node);
-    } else if (node.OpType() == "ReduceMean") {
-      DnnlReduceMean().CreatePrimitive(*this, node);
+    } else if (reduce_ops.count(node.OpType())) {
+      DnnlReduce().CreatePrimitive(*this, node);
     } else if (node.OpType() == "Reshape") {
       DnnlReshape().CreatePrimitive(*this, node);
     } else if (node.OpType() == "Softmax") {
diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
index c841f28f79..55f516bed7 100644
--- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
+++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
@@ -557,6 +557,28 @@ TEST(ReductionOpTest, ReduceLogSumExp_double) {
   test.Run();
 }
 
+TEST(ReductionOpTest, ReduceLogSumExp_float_no_reduction) {
+  OpTester test("ReduceLogSumExp");
+  test.AddAttribute("axes", std::vector<int64_t>{0});
+  test.AddAttribute("keepdims", (int64_t)0);
+  test.AddInput<float>("data", {1, 2, 2},
+                        {1.0f, 2.0f,
+                         3.0f, 4.0f});
+  test.AddOutput<float>("reduced", {2, 2}, {1.f, 2.f, 3.f, 4.f});
+  test.Run();
+}
+
+TEST(ReductionOpTest, ReduceLogSumExp_float_no_reduction_keepdims) {
+  OpTester test("ReduceLogSumExp");
+  test.AddAttribute("axes", std::vector<int64_t>{0});
+  test.AddAttribute("keepdims", (int64_t)1);
+  test.AddInput<float>("data", {1, 2, 2},
+                       {1.0f, 2.0f,
+                        3.0f, 4.0f});
+  test.AddOutput<float>("reduced", {1, 2, 2}, {1.f, 2.f, 3.f, 4.f});
+  test.Run();
+}
+
 #if defined(USE_CUDA) || defined(USE_ROCM)
 TEST(ReductionOpTest, ReduceLogSumExp_half) {
   OpTester test("ReduceLogSumExp");
@@ -1082,6 +1104,24 @@ TEST(ReductionOpTest, ReduceMean0DTensor_double) {
 }
 #endif  // !(defined USE_TVM)
 
+TEST(ReductionOpTest, ReduceMean_keepdims_results_in_noop) {
+  OpTester test("ReduceMean");
+  test.AddAttribute("axes", std::vector<int64_t>{0});
+  test.AddAttribute("keepdims", (int64_t)1);
+  test.AddInput<float>("data", {1, 3}, {1.0, 2.0, 3.0});
+  test.AddOutput<float>("reduced", {1, 3}, {1.0, 2.0, 3.0});
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
+}
+
+TEST(ReductionOpTest, ReduceMean_keepdims_results_in_shape_change) {
+  OpTester test("ReduceMean");
+  test.AddAttribute("axes", std::vector<int64_t>{0});
+  test.AddAttribute("keepdims", (int64_t)0);
+  test.AddInput<float>("data", {1, 3}, {1.0, 2.0, 3.0});
+  test.AddOutput<float>("reduced", {3}, {1.0, 2.0, 3.0});
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
+}
+
 TEST(ReductionOpTest, ReduceMin_default_axes_keepdims) {
   OpTester test("ReduceMin");
   test.AddAttribute("keepdims", (int64_t)1);
@@ -2450,7 +2490,7 @@ TEST(ReductionOpTest, OptimizeShapeForFastReduce_ReduceDimWithZero3) {
   ASSERT_EQ(fast_kind, FastReduceKind::kKR);
 }
 
-TEST(ReductionOpTest, ReduceDimWithZero3) {
+TEST(ReductionOpTest, ReduceSum_ReduceDimWithZero3) {
   auto run = [](OpTester& tester, const std::string& error_msg = "") {
     auto expect = error_msg.empty() ? OpTester::ExpectResult::kExpectSuccess
                                     : OpTester::ExpectResult::kExpectFailure;
diff --git a/orttraining/orttraining/test/gradient/gradient_ops_test.cc b/orttraining/orttraining/test/gradient/gradient_ops_test.cc
index 84e35add8c..9b95925410 100644
--- a/orttraining/orttraining/test/gradient/gradient_ops_test.cc
+++ b/orttraining/orttraining/test/gradient/gradient_ops_test.cc
@@ -443,7 +443,7 @@ TEST(GradientCheckerTest, LogGrad) {
 
   float max_error;
 #ifdef USE_DNNL
-  float error_tolerance = 3e-3f;
+  float error_tolerance = 4e-3f;
 #else
   float error_tolerance = 1e-3f;
 #endif