Improves ReduceSum performance by removing transposition. (#5370)

* Improves ReduceSum performance * Add min, max, L1, L2, logsum, sumsquare * remove all reduce implementation including transpose
2026-06-24 02:47:54 +00:00 · 2020-10-20 10:36:31 +02:00 · 2020-10-20 10:36:31 +02:00 · 66c8a441e0
commit 66c8a441e0
parent 682898ae2b
3 changed files with 628 additions and 594 deletions
--- a/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc
+++ b/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc
@ -151,23 +151,13 @@ REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ArgMin, 1, 10);
 REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ArgMin, 11, 12);
 REGISTER_UNARY_ELEMENTWISE_KERNEL(ArgMin, 13);

-// When all reduce axes are located at the tail of the dims, quite general cases, transpose and extra
-// copy could be skipped to improve performance. If required by check_no_transpose = true, then
-// the calling code will check if the data was transposed and act accordingly.
-// return value: true means transposedInputData is not created/copied, input tensor data could
-// be directly used as row major matrix [block_size, blocks], where blocks is the
-// size of each reduce.
-// `input_shape_override` overrides the shape of `input` for compute purposes.
-template <typename T>
-bool PrepareForReduce(const Tensor* input_tensor_ptr,
-                      FastAllocVector<T>& transposed_input_data,
-                      int64_t& block_size,
-                      int64_t& blocks,
-                      const std::vector<int64_t>& axes_,
-                      bool keepdims_,
-                      /*out*/ std::vector<int64_t>& reduced_dims,
-                      bool check_no_transpose,
-                      const TensorShape* input_shape_override) {
+bool SetupForReduce(const Tensor* input_tensor_ptr,
+                    const std::vector<int64_t>& axes_,
+                    std::vector<int64_t>& axes,
+                    TensorShape& new_input_shape,
+                    std::vector<int64_t>& output_shape,
+                    bool& empty_reduce,
+                    const TensorShape* input_shape_override) {
  ORT_ENFORCE(input_tensor_ptr != nullptr, "Input to be reduced is null");

  if (input_shape_override) {
@ -175,25 +165,13 @@ bool PrepareForReduce(const Tensor* input_tensor_ptr,
                "The input shape override's size does not match the input tensor's shape size");
  }

-  const Tensor& input = *input_tensor_ptr;
-  const auto& input_shape = input_shape_override ? *input_shape_override : input.Shape();
-
-  size_t ndim = input_shape.NumDimensions();
-
-  // Scalar tensor
+  new_input_shape = input_shape_override ? *input_shape_override : input_tensor_ptr->Shape();
+  size_t ndim = new_input_shape.NumDimensions();
  if (ndim == 0) {
-    if (!check_no_transpose) {
-      auto size = input_shape.Size();
-      assert(size == 1);
-      transposed_input_data.resize(size, 0);
-      T* to_data = &transposed_input_data[0];
-      *to_data = *input.Data<T>();
-    }
-    block_size = blocks = 1;
-    return true;
+    empty_reduce = true;
+    return false;
  }

-  std::vector<int64_t> axes;
  axes.reserve(axes_.size());
  for (int64_t axis : axes_) {
    axes.push_back(HandleNegativeAxis(axis, static_cast<int64_t>(ndim)));
@ -210,598 +188,411 @@ bool PrepareForReduce(const Tensor* input_tensor_ptr,

  // If all reduced axes are located at the tail of the input shape, then the copy can be skipped if required
  bool need_copy = true;
-  if (axes.size() <= ndim &&
+  if (axes.size() <= ndim && ndim > 0 &&
      axes.front() == static_cast<int64_t>(ndim - axes.size()) &&
      axes.back() == static_cast<int64_t>(ndim) - 1) {
    need_copy = false;
  }

-  std::vector<bool> keep_axis(ndim, true);
-  for (auto i : axes) {
-    keep_axis[i] = false;
+  empty_reduce = false;
+  output_shape = new_input_shape.GetDims();
+  for (auto a : axes) {
+    output_shape[a] = new_input_shape[a] > 0 ? 1 : 0;
+    empty_reduce |= output_shape[a] == 0;
+  }
+  return need_copy;
+}
+void NoTransposePrepareForReduce(const TensorShape& new_input_shape,
+                                 const std::vector<int64_t>& reduced_axes,
+                                 ResultsNoTransposePrepareForReduce& results) {
+  // Common initialisation for the indices.
+  std::vector<int64_t> cumulative_shape = new_input_shape.GetDims();
+  cumulative_shape[cumulative_shape.size() - 1] = 1;
+  for (int i = static_cast<int>(cumulative_shape.size()) - 2; i >= 0; --i) {
+    cumulative_shape[i] = cumulative_shape[i + 1] * new_input_shape[i + 1];
+  }
+  int64_t projection_size = 1;
+  for (auto a : reduced_axes) {
+    projection_size *= new_input_shape[a];
  }

-  //transpose the input so that all to-be-reduced axes are at the head
-  std::vector<int64_t> transposed_axes(axes.begin(), axes.end());
-  for (size_t i = 0; i < ndim; ++i) {
-    if (keep_axis[i]) {
-      transposed_axes.push_back(i);
-    }
-  }
-
-  std::vector<int64_t> new_dims(transposed_axes.size());
-  for (size_t i = 0; i < transposed_axes.size(); ++i) {
-    new_dims[i] = input_shape.GetDims().at(transposed_axes[i]);
-  }
-
-  int num_axes = static_cast<int>(transposed_axes.size());
-  auto in_dims = input_shape.GetDims();
-
-  // Measure amount of contiguous data we can copy at once
-  int64_t blocksize = 1;
-  int n_shared_idxs = 0;
-  for (int i = num_axes - 1; i >= 0; --i) {
-    if (transposed_axes[i] == i) {
-      blocksize *= new_dims[i];
-      ++n_shared_idxs;
-    } else {
+  int last_reduced_axis = static_cast<int>(reduced_axes.size()) - 1;
+  int loop_reduced_axis = 1;
+  results.last_loop_red_size = new_input_shape[reduced_axes[last_reduced_axis]];
+  results.last_loop_red_inc = cumulative_shape[reduced_axes[last_reduced_axis]];
+  projection_size /= new_input_shape[reduced_axes[last_reduced_axis]];
+  --last_reduced_axis;
+  while (last_reduced_axis >= 0) {
+    if (reduced_axes[last_reduced_axis] != reduced_axes[last_reduced_axis + 1] - 1)
      break;
-    }
+    results.last_loop_red_size *= new_input_shape[reduced_axes[last_reduced_axis]];
+    projection_size /= new_input_shape[reduced_axes[last_reduced_axis]];
+    --last_reduced_axis;
+    ++loop_reduced_axis;
  }

-  const T* from_data = input.template Data<T>();
-  size_t count = input_shape.Size();
-
-  //set to-be-reduced axes to one. squeeze is keepdims_ is false
-  int64_t first_dim = 1;
-  reduced_dims.reserve(in_dims.size());
-
-  for (size_t i = 0; i < in_dims.size(); i++) {
-    const auto in_dim = in_dims[i];
-    if (keep_axis[i]) {
-      reduced_dims.push_back(in_dim);
-    } else {
-      first_dim *= in_dim;
-      if (keepdims_) {
-        reduced_dims.push_back(in_dim == 0 ? 0 : 1);
-      } else {
-        // as we are reducing on this axis and not keeping a dim for it, we can't drop a dim value of 0.
-        // e.g. if input was {3, 0, 2} and we reduced on axis 1 without keeping it, the output shape would be
-        // {3, 2} which is invalid given the input was empty.
-        // note that if we do keep the dim the output shape will have a 0 in it,
-        // which is still valid for an empty tensor, so allow that.
-        ORT_ENFORCE(in_dim != 0,
-                    "Can't reduce on dim with value of 0 if 'keepdims' is false. "
-                    "Invalid output shape would be produced. input_shape:",
-                    input_shape);
-      }
-    }
-  }
-
-  auto num_elements = input_shape.Size();
-
-  // edge case. one or more input dims with value of 0.
-  if (num_elements == 0) {
-    block_size = blocks = 0;
-    return true;
-  }
-
-  if (0 == first_dim) {
-    return false;
-  }
-
-  block_size = num_elements / first_dim;
-  blocks = first_dim;
-
-  if (!need_copy && check_no_transpose) {
-    return true;
-  }
-
-  transposed_input_data.resize(input_shape.Size(), 0);
-  T* to_data = &transposed_input_data[0];
-  if (num_axes < 2 || n_shared_idxs == num_axes) {
-    memcpy(to_data, from_data, count * sizeof(T));
-    return false;
-  }
-
-  int itr_axes = num_axes - n_shared_idxs;
-
-  // Calculate strides
-  std::vector<int64_t> stride_x(itr_axes, 0);
-  for (size_t i = 0; static_cast<int>(i) < itr_axes; i++) {
-    stride_x[i] = 1;
-    for (size_t j = transposed_axes[i] + 1; static_cast<int>(j) < itr_axes; j++) {
-      stride_x[i] *= in_dims[j];
-    }
-  }
-
-  std::vector<int64_t> itr_idxs(itr_axes, 0);
-
-  // Branch here to avoid branching within the loop
-  if (blocksize > 1) {
-    for (size_t index = 0; index < (count / blocksize); index++) {
-      int64_t from_index = 0;
-      for (int i = 0; i < itr_axes; ++i) {
-        from_index += stride_x[i] * itr_idxs[i];
-      }
-
-      memcpy(
-          to_data + blocksize * index,
-          from_data + blocksize * from_index,
-          blocksize * sizeof(T));
-
-      ++itr_idxs[itr_axes - 1];
-      for (int i = itr_axes - 1; i >= 1; --i) {
-        auto expected_dim = new_dims[i];
-        if (itr_idxs[i] < expected_dim) {
-          break;
-        }
-        itr_idxs[i] %= expected_dim;
-        ++itr_idxs[i - 1];
-      }
-    }
+  // Builds the list of indices projected into the same sum.
+  int reduced_axes_size = static_cast<int>(reduced_axes.size()) - loop_reduced_axis;
+  if (reduced_axes_size == 0) {
+    results.projected_index.resize(1, 0);
  } else {
-    for (size_t index = 0; index < count; index++) {
-      int64_t from_index = 0;
-      for (int i = 0; i < itr_axes; ++i) {
-        from_index += stride_x[i] * itr_idxs[i];
-      }
-
-      *(to_data + index) = *(from_data + from_index);
-
-      ++itr_idxs[itr_axes - 1];
-      for (int i = itr_axes - 1; i >= 1; --i) {
-        auto expected_dim = new_dims[i];
-        if (itr_idxs[i] < expected_dim) {
+    results.projected_index.resize(projection_size);
+    std::vector<int64_t> projected_indices(reduced_axes_size, 0);
+    int64_t current_index = 0;
+    size_t current_pos = 0;
+    int j;
+    for (current_pos = 0; current_pos < results.projected_index.size(); ++current_pos) {
+      results.projected_index[current_pos] = current_index;
+      ++projected_indices[projected_indices.size() - 1];
+      current_index += cumulative_shape[reduced_axes[reduced_axes_size - 1]];
+      for (j = reduced_axes_size - 1; j > 0; --j) {
+        if (projected_indices[j] < new_input_shape[reduced_axes[j]])
          break;
-        }
-        itr_idxs[i] %= expected_dim;
-        ++itr_idxs[i - 1];
+        projected_indices[j] -= new_input_shape[reduced_axes[j]];
+        current_index -= new_input_shape[reduced_axes[j]] * cumulative_shape[reduced_axes[j]];
+        ++projected_indices[j - 1];
+        current_index += cumulative_shape[reduced_axes[j - 1]];
      }
    }
  }
-  return false;
+
+  // Builds the list of indices for the unprojected sum.
+  std::vector<int64_t> unreduced_axes;
+  for (int64_t i = 0; i < static_cast<int64_t>(cumulative_shape.size()); ++i) {
+    if (std::find(reduced_axes.begin(), reduced_axes.end(), i) != reduced_axes.end())
+      continue;
+    unreduced_axes.push_back(i);
+  }
+  int64_t unprojection_size = 1;
+  for (auto a : unreduced_axes) {
+    unprojection_size *= new_input_shape[a];
+  }
+  if (unprojection_size == 0) {
+    return;
+  }
+  std::vector<int64_t> unprojected_indices(unreduced_axes.size(), 0);
+
+  // The last index is usually an image size.
+  // We differently process the last unprojected dimension.
+  results.last_loop_size = new_input_shape[unreduced_axes[unreduced_axes.size() - 1]];
+  int64_t unprojection_size_before_last = unprojection_size / results.last_loop_size;
+  results.unprojected_index.reserve(unprojection_size_before_last);
+  results.last_loop_inc = cumulative_shape[unreduced_axes[unreduced_axes.size() - 1]];
+  if (unprojected_indices.size() <= 1) {
+    results.unprojected_index.push_back(0);
+  } else {
+    int64_t current_index = 0;
+    int j;
+    for (int64_t pos = 0; pos < unprojection_size_before_last; ++pos) {
+      results.unprojected_index.push_back(current_index);
+      ++unprojected_indices[unprojected_indices.size() - 2];
+      current_index += cumulative_shape[unreduced_axes[unreduced_axes.size() - 2]];
+      for (j = static_cast<int>(unreduced_axes.size()) - 2; j > 0; --j) {
+        if (unprojected_indices[j] < new_input_shape[unreduced_axes[j]])
+          break;
+        unprojected_indices[j] -= new_input_shape[unreduced_axes[j]];
+        current_index -= new_input_shape[unreduced_axes[j]] * cumulative_shape[unreduced_axes[j]];
+        ++unprojected_indices[j - 1];
+        current_index += cumulative_shape[unreduced_axes[j - 1]];
+      }
+    }
+  }
+}
+
+template <typename T, typename AGG>  // Aggregates `input` along `reduced_axes` with AGG and writes the result into `output`; the input is indexed in place.
+void NoTransposeReduce(Tensor* output, const TensorShape& new_input_shape, const Tensor& input,
+                       const std::vector<int64_t>& reduced_axes, concurrency::ThreadPool* tp,
+                       ResultsNoTransposePrepareForReduce& last_results) {  // last_results holds the index tables; it is rebuilt below only when equal() rejects it.
+  auto output_shape = output->Shape();
+  const T* from_data = input.template Data<T>();
+  typename AGG::value_type* to_data = output->template MutableData<typename AGG::value_type>();  // Output element type is chosen by the aggregator.
+  int64_t count = output_shape.Size();  // Number of output elements.
+
+  if (reduced_axes.size() == 0 || reduced_axes.size() == new_input_shape.NumDimensions()) {  // No axes listed, or every axis listed: the whole input collapses to one value.
+    ORT_ENFORCE(count == 1, "Reduction on all axes, output size should be 1.");
+    int64_t input_size = new_input_shape.Size();
+    to_data[0] = AGG(input_size, from_data[0]).aggall(from_data);  // AGG is seeded with the element count and the first element.
+    return;
+  }
+
+  if (!last_results.equal(new_input_shape.GetDims(), reduced_axes)) {  // Recompute the index tables only for a shape/axes pair that differs from the cached one.
+    NoTransposePrepareForReduce(new_input_shape, reduced_axes, last_results);
+    if (last_results.last_loop_red_size == 0 || last_results.last_loop_size == 0)  // NOTE(review): this guard runs only after a rebuild; the division by last_loop_size further down is unguarded when cached results are reused - confirm.
+      return;
+  }
+  int64_t denominator = last_results.last_loop_red_size * last_results.projected_index.size();  // Number of input elements visited per output element by the loops below.
+
+  if (AGG::two_loops()) {  // Two-pass aggregators: update0 over every element first, then update over the same elements.
+    auto fn = [&](std::ptrdiff_t first, std::ptrdiff_t end) {
+      int64_t loop;
+      const T* loop_red_ptr;
+      const T* loop_red_ptr_end;
+      int64_t current_index = first * last_results.last_loop_size;  // Flat output position of the first element written by this chunk.
+      for (int64_t main_index = first; main_index < end; ++main_index) {
+        for (loop = 0; loop < last_results.last_loop_size; ++loop, ++current_index) {
+          int64_t origin = last_results.unprojected_index[main_index] + loop * last_results.last_loop_inc;  // Base input offset for the current output element.
+          AGG accumulator(denominator, from_data[origin + last_results.projected_index[0]]);
+          for (auto it = last_results.projected_index.begin(); it != last_results.projected_index.end(); ++it) {
+            loop_red_ptr = from_data + (origin + *it);
+            loop_red_ptr_end = loop_red_ptr + last_results.last_loop_red_size * last_results.last_loop_red_inc;  // Visits last_loop_red_size elements spaced last_loop_red_inc apart.
+            for (; loop_red_ptr != loop_red_ptr_end; loop_red_ptr += last_results.last_loop_red_inc) {
+              accumulator.update0(*loop_red_ptr);
+            }
+          }
+          for (auto it = last_results.projected_index.begin(); it != last_results.projected_index.end(); ++it) {  // Second sweep over exactly the same elements.
+            loop_red_ptr = from_data + (origin + *it);
+            loop_red_ptr_end = loop_red_ptr + last_results.last_loop_red_size * last_results.last_loop_red_inc;
+            for (; loop_red_ptr != loop_red_ptr_end; loop_red_ptr += last_results.last_loop_red_inc) {
+              accumulator.update(*loop_red_ptr);
+            }
+          }
+          to_data[current_index] = accumulator.get_value();
+        }
+      }
+    };
+
+    auto cost = TensorOpCost{(double)(last_results.projected_index.size() * sizeof(T) * last_results.last_loop_size * last_results.last_loop_red_size),
+                             (double)last_results.last_loop_size * last_results.last_loop_red_size,
+                             (double)last_results.projected_index.size() * last_results.last_loop_size * last_results.last_loop_red_size * 2};  // Third term is doubled relative to the single-pass branch, presumably for the two sweeps.
+    concurrency::ThreadPool::TryParallelFor(tp, count / last_results.last_loop_size, cost, fn);  // Work items: output count divided by the inner loop size.
+  } else {  // Single-pass aggregators: one sweep calling update.
+    auto fn = [&](std::ptrdiff_t first, std::ptrdiff_t end) {
+      int64_t loop;
+      const T* loop_red_ptr;
+      const T* loop_red_ptr_end;
+      int64_t current_index = first * last_results.last_loop_size;  // Flat output position of the first element written by this chunk.
+      for (int64_t main_index = first; main_index < end; ++main_index) {
+        for (loop = 0; loop < last_results.last_loop_size; ++loop, ++current_index) {
+          int64_t origin = last_results.unprojected_index[main_index] + loop * last_results.last_loop_inc;  // Base input offset for the current output element.
+          AGG accumulator(denominator, from_data[origin + last_results.projected_index[0]]);
+          for (auto it = last_results.projected_index.begin(); it != last_results.projected_index.end(); ++it) {
+            loop_red_ptr = from_data + (origin + *it);
+            loop_red_ptr_end = loop_red_ptr + last_results.last_loop_red_size * last_results.last_loop_red_inc;
+            for (; loop_red_ptr != loop_red_ptr_end; loop_red_ptr += last_results.last_loop_red_inc) {
+              accumulator.update(*loop_red_ptr);
+            }
+          }
+          to_data[current_index] = accumulator.get_value();
+        }
+      }
+    };
+
+    auto cost = TensorOpCost{(double)(last_results.projected_index.size() * sizeof(T) * last_results.last_loop_size * last_results.last_loop_red_size),
+                             (double)last_results.last_loop_size * last_results.last_loop_red_size,
+                             (double)last_results.projected_index.size() * last_results.last_loop_size * last_results.last_loop_red_size};
+    concurrency::ThreadPool::TryParallelFor(tp, count / last_results.last_loop_size, cost, fn);  // Work items: output count divided by the inner loop size.
+  }
+}
+
+void DropDimensions(const std::vector<int64_t>& input_shape, const std::vector<int64_t>& axes, std::vector<int64_t>& dropped_axes) {  // Appends to dropped_axes every entry of input_shape whose index is not listed in axes; dropped_axes is not cleared first.
+  auto dropped_dims = input_shape;  // Working copy so the removal marker can be written into it.
+  for (auto i : axes) {
+    dropped_dims[i] = -1;  // -1 marks an entry for removal; no bounds check is made on i.
+  }
+  for (auto it = dropped_dims.begin(); it != dropped_dims.end(); ++it) {
+    if (*it != -1) {  // NOTE(review): an input entry that is already -1 is dropped as well.
+      dropped_axes.push_back(*it);
+    }
+  }
+}
+
+template <typename T, typename AGG>  // Shared driver for the Reduce* Compute methods: resolves the axes, allocates output 0 and aggregates input 0 with AGG.
+void CommonReduce(OpKernelContext* ctx,
+                  const std::vector<int64_t> axes_, int64_t keepdims_,  // NOTE(review): axes_ is taken by value, so the vector is copied on every call.
+                  ResultsNoTransposePrepareForReduce& last_results,
+                  bool noop_with_empty_axes) {
+  std::vector<int64_t> axes;
+  const Tensor* input = ctx->Input<Tensor>(0);
+  auto reduced_dims = input->Shape().GetDims();  // NOTE(review): not referenced again in this function.
+  std::vector<int64_t> output_shape;
+  bool empty_reduce;
+  TensorShape new_input_shape;
+
+  if (ctx->InputCount() == 2) {  // With two inputs the axes are read from the second tensor and axes_ is ignored.
+    // second input holds the axes.
+    const Tensor* axes_tensor = ctx->Input<Tensor>(1);
+    ORT_ENFORCE(axes_tensor != nullptr, "Axes input is null");
+    ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1,
+                "An axes tensor must be a vector tensor.");
+    auto nDims = static_cast<size_t>(axes_tensor->Shape()[0]);  // Length of the 1-D axes tensor.
+    const auto* data = axes_tensor->template Data<int64_t>();
+    std::vector<int64_t> input_axes(data, data + nDims);
+    if (input_axes.empty() && noop_with_empty_axes) {  // Empty axes with the flag set: the output is a byte copy of the input.
+      auto* output = ctx->Output(0, input->Shape());
+      memcpy(output->template MutableData<typename AGG::value_type>(), input->template Data<T>(), input->SizeInBytes());
+      return;
+    }
+    SetupForReduce(input, input_axes, axes, new_input_shape, output_shape, empty_reduce, nullptr);  // The bool returned by SetupForReduce is not used here.
+  } else {
+    SetupForReduce(input, axes_, axes, new_input_shape, output_shape, empty_reduce, nullptr);  // The bool returned by SetupForReduce is not used here.
+  }
+
+  if (empty_reduce) {  // Flag set by SetupForReduce; this path returns without calling NoTransposeReduce.
+    Tensor* output = ctx->Output(0, keepdims_ ? output_shape : std::vector<int64_t>());  // Without keepdims_ the output is requested with an empty shape vector.
+    if (new_input_shape.Size() == 1) {  // Exactly one input element: run it through AGG directly.
+      const T* from_data = input->template Data<T>();
+      typename AGG::value_type* to_data = output->template MutableData<typename AGG::value_type>();
+      AGG agg(1, *from_data);
+      if (agg.two_loops()) {  // Two-pass aggregators get update0 before update.
+        agg.update0(*from_data);
+        agg.update(*from_data);
+      } else {
+        agg.update(*from_data);
+      }
+      *to_data = agg.get_value();
+    } else {  // Otherwise nothing is written to the output; keepdims_ must be set or the enforce below fails.
+      ORT_ENFORCE(keepdims_,
+                  "Can't reduce on dim with value of 0 if 'keepdims' is false. "
+                  "Invalid output shape would be produced. input_shape:",
+                  input->Shape());
+    }
+    return;
+  }
+
+  Tensor* output;
+  if (keepdims_) {  // Keep the reduced axes in the output, using the shape filled in by SetupForReduce.
+    output = ctx->Output(0, output_shape);
+  } else {  // Otherwise the entries at the reduced axes are removed from the output shape.
+    std::vector<int64_t> dropped_axes;
+    DropDimensions(output_shape, axes, dropped_axes);
+    output = ctx->Output(0, dropped_axes);
+  }
+  NoTransposeReduce<T, AGG>(output, new_input_shape, *input, axes, ctx->GetOperatorThreadPool(), last_results);
 }

 template <typename T>
 Status ReduceL1<T>::Compute(OpKernelContext* ctx) const {
-  FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
-  int64_t block_size;
-  int64_t blocks;
-  std::vector<int64_t> reduced_dims;
-
-  const Tensor* input = ctx->Input<Tensor>(0);
-
-  bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true);
-
-  Tensor* reduced = ctx->Output(0, reduced_dims);
-
-  T* output_data = reduced->template MutableData<T>();
-
-  if (no_transpose) {
-    const T* input_data = input->template Data<T>();
-
-    for (int64_t i = 0; i < block_size; ++i) {
-      output_data[i] = ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).cwiseAbs().sum();
-    }
-  } else {
-    EigenVectorMap<T> out_vec(output_data, block_size);
-    out_vec = ConstEigenMatrixMap<T>(&transposed_input_data[0], block_size, blocks).cwiseAbs().rowwise().sum();
-  }
-
+  // The following variable does not change if the input tensor and the
+  // axes do not either. It could be either cached in ctx or precomputed
+  // in the constructor if shape and axes are known at this stage.
+  ResultsNoTransposePrepareForReduce last_results;
+  CommonReduce<T, ReduceAggregatorL1<T>>(ctx, axes_, keepdims_, last_results);
  return Status::OK();
 }

 template <typename T>
 Status ReduceL2<T>::Compute(OpKernelContext* ctx) const {
-  FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
-  int64_t block_size;
-  int64_t blocks;
-  std::vector<int64_t> reduced_dims;
-
-  const Tensor* input = ctx->Input<Tensor>(0);
-
-  bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true);
-
-  Tensor* reduced = ctx->Output(0, reduced_dims);
-
-  T* output_data = reduced->template MutableData<T>();
-
-  if (no_transpose) {
-    const T* input_data = input->template Data<T>();
-
-    for (int64_t i = 0; i < block_size; ++i) {
-      output_data[i] = ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).norm();
-    }
-  } else {
-    EigenVectorMap<T> out_vec(output_data, block_size);
-    out_vec = ConstEigenMatrixMap<T>(&transposed_input_data[0], block_size, blocks).rowwise().norm();
-  }
-
+  ResultsNoTransposePrepareForReduce last_results;
+  CommonReduce<T, ReduceAggregatorL2<T>>(ctx, axes_, keepdims_, last_results);
  return Status::OK();
 }

 template <typename T>
 Status ReduceLogSum<T>::Compute(OpKernelContext* ctx) const {
-  FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
-  int64_t block_size;
-  int64_t blocks;
-  std::vector<int64_t> reduced_dims;
-
-  const Tensor* input = ctx->Input<Tensor>(0);
-
-  bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true);
-
-  Tensor* reduced = ctx->Output(0, reduced_dims);
-
-  T* output_data = reduced->template MutableData<T>();
-
-  if (no_transpose) {
-    const T* input_data = input->template Data<T>();
-
-    for (int64_t i = 0; i < block_size; ++i) {
-      output_data[i] = ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).sum();
-    }
-  } else {
-    EigenVectorMap<T> out_vec(output_data, block_size);
-    out_vec = ConstEigenMatrixMap<T>(&transposed_input_data[0], block_size, blocks).rowwise().sum();
-  }
-
-  for (int j = 0; j < block_size; ++j) {
-    *(output_data) = static_cast<T>(std::log(*(output_data)));
-    ++output_data;
-  }
-
+  ResultsNoTransposePrepareForReduce last_results;
+  CommonReduce<T, ReduceAggregatorLogSum<T>>(ctx, axes_, keepdims_, last_results);
  return Status::OK();
 }

 template <typename T>
 Status ReduceLogSumExp<T>::Compute(OpKernelContext* ctx) const {
-  FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
-  int64_t block_size;
-  int64_t blocks;
-  std::vector<int64_t> reduced_dims;
-  const Tensor* input = ctx->Input<Tensor>(0);
-
-  PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims);
-
-  Tensor* reduced = ctx->Output(0, reduced_dims);
-
-  T* output_data = reduced->template MutableData<T>();
-
-  for (int j = 0; j < block_size; ++j) {
-    T max_value = std::numeric_limits<T>::lowest();
-    for (int i = 0; i < blocks; ++i) {
-      max_value = std::max(max_value, transposed_input_data[i * block_size + j]);
-    }
-    T scaled_exp_sum = 0;
-    for (int i = 0; i < blocks; ++i) {
-      scaled_exp_sum += static_cast<T>(std::exp(transposed_input_data[i * block_size + j] - max_value));
-    }
-    *(output_data++) = static_cast<T>(std::log(scaled_exp_sum) + max_value);
-  }
+  ResultsNoTransposePrepareForReduce last_results;
+  CommonReduce<T, ReduceAggregatorLogSumExp<T>>(ctx, axes_, keepdims_, last_results);
  return Status::OK();
 }

 template <typename T>
 Status ReduceMax<T>::Compute(OpKernelContext* ctx) const {
-  FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
-  int64_t block_size;
-  int64_t blocks;
-  std::vector<int64_t> reduced_dims;
-  const Tensor* input = ctx->Input<Tensor>(0);
-
-  bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true);
-
-  Tensor* reduced = ctx->Output(0, reduced_dims);
-
-  T* output_data = reduced->template MutableData<T>();
-
-  if (no_transpose) {
-    const T* input_data = input->template Data<T>();
-
-    for (int64_t i = 0; i < block_size; ++i) {
-      output_data[i] = ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).maxCoeff();
-    }
-  } else {
-    EigenVectorMap<T> out_vec(output_data, block_size);
-    out_vec = ConstEigenMatrixMap<T>(&transposed_input_data[0], block_size, blocks).rowwise().maxCoeff();
-  }
-
+  ResultsNoTransposePrepareForReduce last_results;
+  CommonReduce<T, ReduceAggregatorMax<T>>(ctx, axes_, keepdims_, last_results);
  return Status::OK();
 }

 template <typename T>
 Status ReduceMean<T>::Compute(OpKernelContext* ctx) const {
-  FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
-  int64_t block_size;
-  int64_t blocks;
-  std::vector<int64_t> reduced_dims;
-  const Tensor* input = ctx->Input<Tensor>(0);
-
-  bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true);
-
-  Tensor* reduced = ctx->Output(0, reduced_dims);
-
-  T* output_data = reduced->template MutableData<T>();
-
-  if (no_transpose) {
-    const T* input_data = ctx->Input<Tensor>(0)->template Data<T>();
-    auto lambda = [input_data, blocks, output_data](ptrdiff_t i) {
-      output_data[i] = ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).mean();
-    };
-    concurrency::ThreadPool::TryBatchParallelFor(ctx->GetOperatorThreadPool(), block_size, lambda, 0);
-  } else {
-    EigenVectorMap<T> out_vec(output_data, block_size);
-    out_vec = ConstEigenMatrixMap<T>(&transposed_input_data[0], block_size, blocks).rowwise().mean();
-  }
-
+  ResultsNoTransposePrepareForReduce last_results;
+  CommonReduce<T, ReduceAggregatorMean<T>>(ctx, axes_, keepdims_, last_results);
  return Status::OK();
 }

 template <typename T>
 Status ReduceMin<T>::Compute(OpKernelContext* ctx) const {
-  FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
-  int64_t block_size;
-  int64_t blocks;
-  std::vector<int64_t> reduced_dims;
-  const Tensor* input = ctx->Input<Tensor>(0);
-
-  bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true);
-
-  Tensor* reduced = ctx->Output(0, reduced_dims);
-
-  T* output_data = reduced->template MutableData<T>();
-
-  if (no_transpose) {
-    const T* input_data = input->template Data<T>();
-
-    for (int64_t i = 0; i < block_size; ++i) {
-      output_data[i] = ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).minCoeff();
-    }
-  } else {
-    EigenVectorMap<T> out_vec(output_data, block_size);
-    out_vec = ConstEigenMatrixMap<T>(&transposed_input_data[0], block_size, blocks).rowwise().minCoeff();
-  }
-
+  ResultsNoTransposePrepareForReduce last_results;
+  CommonReduce<T, ReduceAggregatorMin<T>>(ctx, axes_, keepdims_, last_results);
  return Status::OK();
 }

 template <typename T>
 Status ReduceProd<T>::Compute(OpKernelContext* ctx) const {
-  FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
-  int64_t block_size;
-  int64_t blocks;
-  std::vector<int64_t> reduced_dims;
-  const Tensor* input = ctx->Input<Tensor>(0);
-
-  bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true);
-
-  Tensor* reduced = ctx->Output(0, reduced_dims);
-
-  T* output_data = reduced->template MutableData<T>();
-
-  if (no_transpose) {
-    const T* input_data = input->template Data<T>();
-
-    for (int64_t i = 0; i < block_size; ++i) {
-      output_data[i] = ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).prod();
-    }
-  } else {
-    EigenVectorMap<T> out_vec(output_data, block_size);
-    out_vec = ConstEigenMatrixMap<T>(&transposed_input_data[0], block_size, blocks).rowwise().prod();
-  }
-
+  ResultsNoTransposePrepareForReduce last_results;
+  CommonReduce<T, ReduceAggregatorProd<T>>(ctx, axes_, keepdims_, last_results);
  return Status::OK();
 }

 template <typename T>
-void ReduceSumCore(const T* input_data, T* output_data, bool no_transpose,
-                   int64_t blocks, int64_t block_size, FastAllocVector<T>& transposed_input_data,
-                   concurrency::ThreadPool* tp) {
-  if (no_transpose) {
-    auto lambda = [input_data, blocks, output_data](ptrdiff_t i) {
-      // The ConstEigenMatrixMap type is expanded to work around a MS compiler issue
-      output_data[i] = Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(input_data + (i * blocks), blocks).sum();
-    };
-    concurrency::ThreadPool::TryBatchParallelFor(tp, block_size, lambda, 0);
-  } else {
-    EigenVectorMap<T> out_vec(output_data, block_size);
-    out_vec = ConstEigenMatrixMap<T>(&transposed_input_data[0], block_size, blocks).rowwise().sum();
-  }
+Status ReduceSum<T>::Compute(OpKernelContext* ctx) const {
+  ResultsNoTransposePrepareForReduce last_results;
+  CommonReduce<T, ReduceAggregatorSum<T>>(ctx, axes_, keepdims_, last_results);
+  return Status::OK();
 }

 template <typename T>
 Tensor ReduceSum<T>::Impl(const Tensor& input, const std::vector<int64_t>& reduce_axes,
                           AllocatorPtr allocator, concurrency::ThreadPool* tp, bool keep_dims,
                           const TensorShape* input_shape_override) {
-  FastAllocVector<T> transposed_input_data(allocator);
-  int64_t block_size;
-  int64_t blocks;
-  std::vector<int64_t> reduced_dims;
+  // Sums `input` over `reduce_axes` and returns the result in a new Tensor allocated
+  // with `allocator`. `keep_dims` selects between the output shape reported by
+  // SetupForReduce and that shape after DropDimensions has been applied to it.
+  // The locals below are out-parameters populated by SetupForReduce.
+  std::vector<int64_t> axes;
+  std::vector<int64_t> output_shape;
+  TensorShape new_input_shape;
+  bool empty_reduce = false;
+  SetupForReduce(&input, reduce_axes, axes, new_input_shape, output_shape, empty_reduce, input_shape_override);

-  bool no_transpose = PrepareForReduce<T>(&input, transposed_input_data, block_size, blocks,
-                                          reduce_axes, keep_dims, reduced_dims, true, input_shape_override);
+  if (empty_reduce) {
+    // No aggregation is run on this path: when new_input_shape.Size() is 1 the single
+    // input value is copied through, otherwise the request is only accepted with keep_dims.
+    Tensor output(input.DataType(), keep_dims ? output_shape : std::vector<int64_t>(), allocator);
+    if (new_input_shape.Size() == 1) {
+      const T* from_data = input.template Data<T>();
+      T* to_data = output.template MutableData<T>();
+      *to_data = *from_data;
+    } else {
+      ORT_ENFORCE(keep_dims,
+                  "Can't reduce on dim with value of 0 if 'keepdims' is false. "
+                  "Invalid output shape would be produced. input_shape:",
+                  new_input_shape);
+    }
+    return output;
+  }

-  Tensor output(input.DataType(), reduced_dims, allocator);
-
-  ReduceSumCore(input.template Data<T>(), output.template MutableData<T>(),
-                no_transpose, blocks, block_size, transposed_input_data, tp);
-
-  return output;
-}
-
-template <typename T>
-Status ReduceSum<T>::Compute(OpKernelContext* ctx) const {
-  FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
-  int64_t block_size;
-  int64_t blocks;
-  std::vector<int64_t> reduced_dims;
-  const Tensor* input = ctx->Input<Tensor>(0);
-
-  bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true);
-
-  auto* output = ctx->Output(0, reduced_dims);
-
-  ReduceSumCore(input->template Data<T>(), output->template MutableData<T>(),
-                no_transpose, blocks, block_size, transposed_input_data, ctx->GetOperatorThreadPool());
-
-  return Status::OK();
+  // Both keep_dims modes run the same reduction; only the dimensions given to the
+  // result tensor differ.
+  std::vector<int64_t> result_dims;
+  if (keep_dims) {
+    result_dims = output_shape;
+  } else {
+    DropDimensions(output_shape, axes, result_dims);
+  }
+
+  ResultsNoTransposePrepareForReduce last_results;
+  Tensor output(input.DataType(), result_dims, allocator);
+  NoTransposeReduce<T, ReduceAggregatorSum<T>>(&output, new_input_shape, input, axes, tp, last_results);
+  return output;
 }

 template <typename T>
 Status ReduceSumSquare<T>::Compute(OpKernelContext* ctx) const {
-  FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
-  int64_t block_size;
-  int64_t blocks;
-  std::vector<int64_t> reduced_dims;
-  const Tensor* input = ctx->Input<Tensor>(0);
-
-  bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true);
-
-  Tensor* reduced = ctx->Output(0, reduced_dims);
-
-  T* output_data = reduced->template MutableData<T>();
-
-  if (no_transpose) {
-    const T* input_data = input->template Data<T>();
-
-    for (int64_t i = 0; i < block_size; ++i) {
-      output_data[i] = ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).squaredNorm();
-    }
-  } else {
-    EigenVectorMap<T> out_vec(output_data, block_size);
-    out_vec = ConstEigenMatrixMap<T>(&transposed_input_data[0], block_size, blocks).rowwise().squaredNorm();
-  }
-
+  // Delegates to the shared CommonReduce driver, selecting the sum-of-squares aggregator
+  // and forwarding this kernel's axes_ and keepdims_ members.
+  // `last_results` is a fresh scratch object created for this call only.
+  ResultsNoTransposePrepareForReduce last_results;
+  CommonReduce<T, ReduceAggregatorSumSquare<T>>(ctx, axes_, keepdims_, last_results);
   return Status::OK();
 }

 template <typename T>
 Status ArgMax<T>::Compute(OpKernelContext* ctx) const {
-  FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
-  int64_t block_size;
-  int64_t blocks;
-
-  std::vector<int64_t> reduced_dims;
-  const Tensor* input = ctx->Input<Tensor>(0);
-
-  bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true);
-
-  Tensor* reduced = ctx->Output(0, reduced_dims);
-  int64_t* output_data = reduced->template MutableData<int64_t>();
-  Eigen::MatrixXf::Index maxIndex;
-
-  if (no_transpose) {
-    const T* input_data = ctx->Input<Tensor>(0)->template Data<T>();
-    if (select_last_index_) {
-      assert(blocks > 0);
-      for (int64_t i = 0; i < block_size; ++i) {
-        gsl::span<const T> row(input_data, blocks);
-        auto first = row.cbegin();
-        auto const end = row.cend();
-        auto max_el = first;
-        while (++first < end) {
-          if (*first >= *max_el) {
-            max_el = first;
-          }
-        }
-        *(output_data++) = max_el - row.cbegin();
-        input_data += blocks;
-      }
-    } else {
-      for (int64_t i = 0; i < block_size; ++i) {
-        ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).maxCoeff(&maxIndex);
-        *(output_data++) = maxIndex;
-      }
-    }
+  // Delegates to the shared CommonReduce driver; `last_results` is a fresh scratch
+  // object created for this call only.
+  ResultsNoTransposePrepareForReduce last_results;
+  // select_last_index_ picks the aggregator whose update() compares with `>=` rather
+  // than `>`, so a later value equal to the current maximum replaces it.
+  if (select_last_index_) {
+    CommonReduce<T, ReduceAggregatorArgMaxLastIndex<T>>(ctx, axes_, keepdims_, last_results);
   } else {
-    auto matrixData = ConstEigenMatrixMap<T>(&transposed_input_data[0], block_size, blocks);
-    if (select_last_index_) {
-      for (int i = 0; i < block_size; ++i) {
-        int idx = 0;
-        T max_val = matrixData(i, 0);
-        for (int c = 1; c < blocks; ++c) {
-          auto val = matrixData(i, c);
-          if (val >= max_val) {
-            idx = c;
-            max_val = val;
-          }
-        }
-        *(output_data++) = idx;
-      }
-    } else {
-      for (int i = 0; i < block_size; ++i) {
-        matrixData.row(i).maxCoeff(&maxIndex);
-        *(output_data++) = maxIndex;
-      }
-    }
+    CommonReduce<T, ReduceAggregatorArgMax<T>>(ctx, axes_, keepdims_, last_results);
   }
-
   return Status::OK();
 }

 template <typename T>
 Status ArgMin<T>::Compute(OpKernelContext* ctx) const {
-  FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
-  int64_t block_size;
-  int64_t blocks;
-
-  std::vector<int64_t> reduced_dims;
-  const Tensor* input = ctx->Input<Tensor>(0);
-
-  bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true);
-
-  Tensor* reduced = ctx->Output(0, reduced_dims);
-  int64_t* output_data = reduced->template MutableData<int64_t>();
-  Eigen::MatrixXf::Index minIndex;
-
-  if (no_transpose) {
-    const T* input_data = ctx->Input<Tensor>(0)->template Data<T>();
-    if (select_last_index_) {
-      assert(blocks > 0);
-      for (int64_t i = 0; i < block_size; ++i) {
-        gsl::span<const T> row(input_data, blocks);
-        auto first = row.cbegin();
-        auto const end = row.cend();
-        auto min_el = first;
-        while (++first < end) {
-          if (*first <= *min_el) {
-            min_el = first;
-          }
-        }
-        *(output_data++) = min_el - row.cbegin();
-        input_data += blocks;
-      }
-    } else {
-      for (int64_t i = 0; i < block_size; ++i) {
-        ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).minCoeff(&minIndex);
-        *(output_data++) = minIndex;
-      }
-    }
+  // Delegates to the shared CommonReduce driver; `last_results` is a fresh scratch
+  // object created for this call only.
+  ResultsNoTransposePrepareForReduce last_results;
+  // select_last_index_ picks the aggregator whose update() compares with `<=` rather
+  // than `<`, so a later value equal to the current minimum replaces it.
+  if (select_last_index_) {
+    CommonReduce<T, ReduceAggregatorArgMinLastIndex<T>>(ctx, axes_, keepdims_, last_results);
   } else {
-    auto matrixData = ConstEigenMatrixMap<T>(&transposed_input_data[0], block_size, blocks);
-    if (select_last_index_) {
-      for (int i = 0; i < block_size; ++i) {
-        int idx = 0;
-        T min_val = matrixData(i, 0);
-        for (int c = 1; c < blocks; ++c) {
-          auto val = matrixData(i, c);
-          if (val <= min_val) {
-            idx = c;
-            min_val = val;
-          }
-        }
-        *(output_data++) = idx;
-      }
-    } else {
-      for (int i = 0; i < block_size; ++i) {
-        matrixData.row(i).minCoeff(&minIndex);
-        *(output_data++) = minIndex;
-      }
-    }
+    CommonReduce<T, ReduceAggregatorArgMin<T>>(ctx, axes_, keepdims_, last_results);
   }
-
   return Status::OK();
 }

@ -814,14 +605,4 @@ template class ReduceSum<int32_t>;
 template class ReduceSum<double>;
 template class ReduceSum<int64_t>;

-#define REGISTER_REDUCESUMCORE_TYPED(T)                                                                         \
-  template void ReduceSumCore<T>(const T* input_data, T* output_data, bool no_transpose,                        \
-                                 int64_t blocks, int64_t block_size, FastAllocVector<T>& transposed_input_data, \
-                                 concurrency::ThreadPool* tp);
-
-REGISTER_REDUCESUMCORE_TYPED(float)
-REGISTER_REDUCESUMCORE_TYPED(double)
-REGISTER_REDUCESUMCORE_TYPED(int32_t)
-REGISTER_REDUCESUMCORE_TYPED(int64_t)
-
 }  // namespace onnxruntime
--- a/onnxruntime/core/providers/cpu/reduction/reduction_ops.h
+++ b/onnxruntime/core/providers/cpu/reduction/reduction_ops.h
@ -10,24 +10,305 @@
 #include "core/providers/cpu/containers.h"
 #include "core/util/math_cpuonly.h"
 #include "core/platform/threadpool.h"
+#include <cmath>

 namespace onnxruntime {

-template <typename T>
-bool PrepareForReduce(const Tensor* input_tensor_ptr,
-                      FastAllocVector<T>& transposed_input_data,
-                      int64_t& block_size,
-                      int64_t& blocks,
-                      const std::vector<int64_t>& axes_,
-                      bool keepdims_,
-                      /*out*/ std::vector<int64_t>& reduced_dims,
-                      bool check_no_transpose = false,
-                      const TensorShape* input_shape_override = nullptr);
+// Plain data holder that is passed by reference to NoTransposePrepareForReduce,
+// NoTransposeReduce and CommonReduce. It records an input shape and a set of reduced
+// axes together with index vectors and loop sizes/increments.
+// NOTE(review): presumably this caches the loop layout computed for one
+// (shape, axes) pair so it can be reused; confirm against the .cc implementation.
+class ResultsNoTransposePrepareForReduce {
+ public:
+  std::vector<int64_t> input_shape;
+  std::vector<int64_t> reduced_axes;
+  std::vector<int64_t> projected_index;
+  // Scalars are zero-initialized so a default-constructed object never exposes
+  // indeterminate values.
+  int64_t last_loop_red_size = 0;
+  int64_t last_loop_red_inc = 0;
+  std::vector<int64_t> unprojected_index;
+  int64_t last_loop_size = 0;
+  int64_t last_loop_inc = 0;
+
+  // Returns true when the stored input_shape and reduced_axes match the given vectors
+  // exactly (same length and same values in the same order).
+  bool equal(const std::vector<int64_t>& local_input_shape,
+             const std::vector<int64_t>& local_reduced_axes) const {
+    return input_shape == local_input_shape && reduced_axes == local_reduced_axes;
+  }
+};

 template <typename T>
-void ReduceSumCore(const T* input_data, T* output_data, bool no_transpose,
-                   int64_t blocks, int64_t block_size, FastAllocVector<T>& transposed_input_data,
-                   concurrency::ThreadPool* tp);
+inline T reduce_sqrt(T value) {
+  // Generic square root: forwards directly to std::sqrt.
+  return std::sqrt(value);
+}
+
+// int64_t overload: the root is evaluated in double precision and the cast back to
+// int64_t discards the fractional part.
+template <>
+inline int64_t reduce_sqrt<int64_t>(int64_t value) {
+  const double root = std::sqrt(static_cast<double>(value));
+  return static_cast<int64_t>(root);
+}
+
+// int32_t overload: same approach as the int64_t one.
+template <>
+inline int32_t reduce_sqrt<int32_t>(int32_t value) {
+  const double root = std::sqrt(static_cast<double>(value));
+  return static_cast<int32_t>(root);
+}
+
+// Natural logarithm helper for the reductions; the result is cast back to T.
+template <typename T>
+inline T reduce_log(T value) {
+  return static_cast<T>(std::log(value));
+}
+
+// int64_t overload: the logarithm is evaluated in double precision and the cast back
+// to int64_t discards the fractional part.
+template <>
+inline int64_t reduce_log<int64_t>(int64_t value) {
+  const double natural_log = std::log(static_cast<double>(value));
+  return static_cast<int64_t>(natural_log);
+}
+
+// int32_t overload: same approach as the int64_t one.
+template <>
+inline int32_t reduce_log<int32_t>(int32_t value) {
+  const double natural_log = std::log(static_cast<double>(value));
+  return static_cast<int32_t>(natural_log);
+}
+
+// Exponential helper for the reductions; the result of std::exp is cast back to T.
+template <typename T>
+inline T reduce_exp(T value) {
+  const auto exponential = std::exp(value);
+  return static_cast<T>(exponential);
+}
+
+// Common base of the reduction aggregators. It stores the number of reduced elements
+// (N_) and a running accumulator. T is the input element type and TVAL the type of the
+// produced value. The methods are not virtual: derived classes shadow them.
+// NOTE(review): presumably the aggregator is picked statically through the AGG template
+// parameter of NoTransposeReduce/CommonReduce; confirm in the .cc.
+template <typename T, typename TVAL = T>
+class ReduceAggregator {
+ public:
+  using value_type = TVAL;
+
+ protected:
+  int64_t N_;
+  T accumulator_;
+
+ public:
+  inline ReduceAggregator(int64_t N, const T& init) : N_(N), accumulator_(init) {}
+
+  // The three entry points below always fail here; a concrete aggregator is expected
+  // to provide its own versions.
+  inline void update(const T&) { ORT_ENFORCE(false, "must be overloaded."); }
+  inline void update0(const T&) { ORT_ENFORCE(false, "must be overloaded."); }
+  inline TVAL aggall(const T*) { ORT_ENFORCE(false, "must be overloaded."); }
+
+  // Current accumulator, converted to the output type.
+  inline TVAL get_value() { return accumulator_; }
+
+  // Validation hook; the base accepts every prepared reduction.
+  inline void enforce(const ResultsNoTransposePrepareForReduce&) {}
+
+  // False by default; ReduceAggregatorLogSumExp returns true and also defines update0.
+  static inline bool two_loops() { return false; }
+};
+
+// Aggregator producing the plain sum of the reduced values.
+template <typename T, typename TVAL = T>
+class ReduceAggregatorSum : public ReduceAggregator<T, TVAL> {
+ public:
+  // The caller-supplied initial value is ignored; the running sum starts at 0.
+  inline ReduceAggregatorSum(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 0) {}
+
+  // Sums the N_ values starting at from_data in a single Eigen reduction.
+  inline TVAL aggall(const T* from_data) {
+    const Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>> values(from_data, this->N_);
+    return values.sum();
+  }
+
+  // Incremental path: adds one value to the running sum.
+  inline void update(const T& v) {
+    this->accumulator_ += v;
+  }
+};
+
+// Aggregator producing the sum of the squares of the reduced values.
+template <typename T, typename TVAL = T>
+class ReduceAggregatorSumSquare : public ReduceAggregator<T, TVAL> {
+ public:
+  // The caller-supplied initial value is ignored; the running sum starts at 0.
+  inline ReduceAggregatorSumSquare(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 0) {}
+
+  // Incremental path: adds v squared to the running sum.
+  inline void update(const T& v) {
+    this->accumulator_ += v * v;
+  }
+
+  // Squared norm of the N_ values starting at from_data, computed by Eigen.
+  inline TVAL aggall(const T* from_data) {
+    const Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>> values(from_data, this->N_);
+    return values.squaredNorm();
+  }
+};
+
+// Aggregator producing the arithmetic mean. The incremental update() is inherited from
+// ReduceAggregatorSum; only the final value differs.
+template <typename T, typename TVAL = T>
+class ReduceAggregatorMean : public ReduceAggregatorSum<T, TVAL> {
+ public:
+  // The caller-supplied initial value is ignored; the running sum starts at 0.
+  inline ReduceAggregatorMean(int64_t N, const T&) : ReduceAggregatorSum<T, TVAL>(N, 0) {}
+
+  // Incremental path result: accumulated sum divided by the element count N_.
+  inline T get_value() {
+    const T count = static_cast<T>(this->N_);
+    return this->accumulator_ / count;
+  }
+
+  // Mean of the N_ values starting at from_data, computed by Eigen.
+  inline T aggall(const T* from_data) {
+    const Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>> values(from_data, this->N_);
+    return values.mean();
+  }
+};
+
+// Aggregator producing the maximum of the reduced values. The running maximum starts
+// from the caller-supplied `init`.
+template <typename T, typename TVAL = T>
+class ReduceAggregatorMax : public ReduceAggregator<T, TVAL> {
+ public:
+  inline ReduceAggregatorMax(int64_t N, const T& init) : ReduceAggregator<T, TVAL>(N, init) {}
+
+  // Incremental path: keeps v only when it is strictly greater than the current maximum.
+  inline void update(const T& v) {
+    if (v > this->accumulator_) {
+      this->accumulator_ = v;
+    }
+  }
+
+  // Largest of the N_ values starting at from_data, computed by Eigen.
+  inline TVAL aggall(const T* from_data) {
+    const Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>> values(from_data, this->N_);
+    return values.maxCoeff();
+  }
+};
+
+// Shared state for the ArgMin/ArgMax aggregators: `arg_` is the position reported as
+// the result and `index_` is the counter the derived update() methods advance once per
+// value. The produced value type defaults to int64_t.
+template <typename T, typename TVAL = int64_t>
+class ReduceAggregatorArgMinMax : public ReduceAggregator<T, TVAL> {
+ protected:
+  int64_t arg_;
+  int64_t index_;
+
+ public:
+  inline ReduceAggregatorArgMinMax(int64_t N, const T& init)
+      : ReduceAggregator<T, TVAL>(N, init), arg_(0), index_(0) {}
+
+  // Rejects prepared reductions whose projected_index is not empty.
+  inline void enforce(const ResultsNoTransposePrepareForReduce& res) {
+    ORT_ENFORCE(res.projected_index.size() == 0, "Only one axis is allowed for reduction.");
+  }
+
+  // The result is the recorded position, not the accumulated value.
+  inline TVAL get_value() { return arg_; }
+};
+
+// Aggregator producing the position of the maximum value. A later value replaces the
+// current maximum only when it is strictly greater.
+template <typename T, typename TVAL = int64_t>
+class ReduceAggregatorArgMax : public ReduceAggregatorArgMinMax<T, TVAL> {
+ public:
+  inline ReduceAggregatorArgMax(int64_t N, const T& init) : ReduceAggregatorArgMinMax<T, TVAL>(N, init) {}
+
+  // Incremental path: records the current position when v beats the maximum, then
+  // advances the position counter.
+  inline void update(const T& v) {
+    const int64_t position = this->index_++;
+    if (v > this->accumulator_) {
+      this->accumulator_ = v;
+      this->arg_ = position;
+    }
+  }
+
+  // Lets Eigen locate the maximum among the N_ values starting at from_data and
+  // stores its position in arg_.
+  inline TVAL aggall(const T* from_data) {
+    const Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>> values(from_data, this->N_);
+    values.maxCoeff(&this->arg_);
+    return this->get_value();
+  }
+};
+
+// ArgMax variant in which a later value equal to the current maximum replaces it
+// (comparison with `>=`).
+template <typename T, typename TVAL = int64_t>
+class ReduceAggregatorArgMaxLastIndex : public ReduceAggregatorArgMax<T, TVAL> {
+ public:
+  inline ReduceAggregatorArgMaxLastIndex(int64_t N, const T& init) : ReduceAggregatorArgMax<T, TVAL>(N, init) {}
+
+  // Incremental path: records the current position when v is at least the maximum,
+  // then advances the position counter.
+  inline void update(const T& v) {
+    const int64_t position = this->index_++;
+    if (v >= this->accumulator_) {
+      this->accumulator_ = v;
+      this->arg_ = position;
+    }
+  }
+
+  // Feeds the N_ values starting at from_data through update() one by one.
+  inline TVAL aggall(const T* from_data) {
+    int64_t i = 0;
+    while (i < this->N_) {
+      update(from_data[i]);
+      ++i;
+    }
+    return this->get_value();
+  }
+};
+
+// Aggregator producing the position of the minimum value. A later value replaces the
+// current minimum only when it is strictly smaller.
+template <typename T, typename TVAL = int64_t>
+class ReduceAggregatorArgMin : public ReduceAggregatorArgMinMax<T, TVAL> {
+ public:
+  inline ReduceAggregatorArgMin(int64_t N, const T& init) : ReduceAggregatorArgMinMax<T, TVAL>(N, init) {}
+
+  // Incremental path: records the current position when v beats the minimum, then
+  // advances the position counter.
+  inline void update(const T& v) {
+    const int64_t position = this->index_++;
+    if (v < this->accumulator_) {
+      this->accumulator_ = v;
+      this->arg_ = position;
+    }
+  }
+
+  // Lets Eigen locate the minimum among the N_ values starting at from_data and
+  // stores its position in arg_.
+  inline TVAL aggall(const T* from_data) {
+    const Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>> values(from_data, this->N_);
+    values.minCoeff(&this->arg_);
+    return this->get_value();
+  }
+};
+
+// ArgMin variant in which a later value equal to the current minimum replaces it
+// (comparison with `<=`).
+template <typename T, typename TVAL = int64_t>
+class ReduceAggregatorArgMinLastIndex : public ReduceAggregatorArgMin<T, TVAL> {
+ public:
+  inline ReduceAggregatorArgMinLastIndex(int64_t N, const T& init) : ReduceAggregatorArgMin<T, TVAL>(N, init) {}
+
+  // Incremental path: records the current position when v is at most the minimum,
+  // then advances the position counter.
+  inline void update(const T& v) {
+    const int64_t position = this->index_++;
+    if (v <= this->accumulator_) {
+      this->accumulator_ = v;
+      this->arg_ = position;
+    }
+  }
+
+  // Feeds the N_ values starting at from_data through update() one by one.
+  inline TVAL aggall(const T* from_data) {
+    int64_t i = 0;
+    while (i < this->N_) {
+      update(from_data[i]);
+      ++i;
+    }
+    return this->get_value();
+  }
+};
+
+// Aggregator producing the minimum of the reduced values. The running minimum starts
+// from the caller-supplied `init`.
+template <typename T, typename TVAL = T>
+class ReduceAggregatorMin : public ReduceAggregator<T, TVAL> {
+ public:
+  inline ReduceAggregatorMin(int64_t N, const T& init) : ReduceAggregator<T, TVAL>(N, init) {}
+
+  // Incremental path: keeps v only when it is strictly smaller than the current minimum.
+  inline void update(const T& v) {
+    if (v < this->accumulator_) {
+      this->accumulator_ = v;
+    }
+  }
+
+  // Smallest of the N_ values starting at from_data, computed by Eigen.
+  inline TVAL aggall(const T* from_data) {
+    const Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>> values(from_data, this->N_);
+    return values.minCoeff();
+  }
+};
+
+// Aggregator producing the product of the reduced values.
+template <typename T, typename TVAL = T>
+class ReduceAggregatorProd : public ReduceAggregator<T, TVAL> {
+ public:
+  // The caller-supplied initial value is ignored; the running product starts at 1.
+  inline ReduceAggregatorProd(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 1) {}
+
+  // Incremental path: multiplies the running product by v.
+  inline void update(const T& v) {
+    this->accumulator_ *= v;
+  }
+
+  // Product of the N_ values starting at from_data, computed by Eigen.
+  inline TVAL aggall(const T* from_data) {
+    const Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>> values(from_data, this->N_);
+    return values.prod();
+  }
+};
+
+// Aggregator producing the L1 norm: the sum of the absolute values.
+template <typename T, typename TVAL = T>
+class ReduceAggregatorL1 : public ReduceAggregator<T, TVAL> {
+ public:
+  // The caller-supplied initial value is ignored; the running sum starts at 0.
+  inline ReduceAggregatorL1(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 0) {}
+
+  // Incremental path: adds v when it is positive and its negation otherwise.
+  inline void update(const T& v) {
+    if (v > 0) {
+      this->accumulator_ += v;
+    } else {
+      this->accumulator_ += -v;
+    }
+  }
+
+  // Sum of the coefficient-wise absolute values of the N_ values starting at from_data.
+  inline TVAL aggall(const T* from_data) {
+    const Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>> values(from_data, this->N_);
+    return values.cwiseAbs().sum();
+  }
+};
+
+// Aggregator producing the L2 norm: the square root of the sum of squares.
+template <typename T, typename TVAL = T>
+class ReduceAggregatorL2 : public ReduceAggregator<T, TVAL> {
+ public:
+  // The caller-supplied initial value is ignored; the running sum of squares starts at 0.
+  inline ReduceAggregatorL2(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 0) {}
+
+  // Incremental path: accumulates v squared; the root is applied in get_value().
+  inline void update(const T& v) {
+    this->accumulator_ += v * v;
+  }
+
+  // Square root of the accumulated sum of squares.
+  inline TVAL get_value() {
+    return reduce_sqrt<T>(this->accumulator_);
+  }
+
+  // Norm of the N_ values starting at from_data, computed by Eigen.
+  inline TVAL aggall(const T* from_data) {
+    const Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>> values(from_data, this->N_);
+    return values.norm();
+  }
+};
+
+// Aggregator producing the natural logarithm of the sum of the reduced values.
+template <typename T, typename TVAL = T>
+class ReduceAggregatorLogSum : public ReduceAggregator<T, TVAL> {
+ public:
+  // The caller-supplied initial value is ignored; the running sum starts at 0.
+  inline ReduceAggregatorLogSum(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 0) {}
+
+  // Incremental path: accumulates the plain sum; the logarithm is applied in get_value().
+  inline void update(const T& v) {
+    this->accumulator_ += v;
+  }
+
+  // Logarithm of the accumulated sum.
+  inline TVAL get_value() {
+    return reduce_log<T>(this->accumulator_);
+  }
+
+  // Logarithm of the Eigen-computed sum of the N_ values starting at from_data.
+  inline T aggall(const T* from_data) {
+    const Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>> values(from_data, this->N_);
+    return reduce_log<T>(values.sum());
+  }
+};
+
+// Aggregator producing log(sum(exp(x))) in the shifted form
+// log(sum(exp(x - max))) + max, which keeps exp() in range when max is close to the
+// largest reduced value. It is the only aggregator reporting two_loops() == true:
+// update0() tracks the maximum and update() accumulates the shifted exponentials.
+template <typename T, typename TVAL = T>
+class ReduceAggregatorLogSumExp : public ReduceAggregator<T, TVAL> {
+ protected:
+  T max_;
+
+  // The value is widened to double first so that integer instantiations of T reach
+  // the floating-point overload of std::isinf.
+  static inline bool is_inf(const T& v) { return std::isinf(static_cast<double>(v)); }
+
+ public:
+  // The running sum starts at 0. The running maximum is seeded from `init` instead of
+  // 0: with a 0 seed, inputs that are all very negative leave the shift at 0, every
+  // exp() underflows and the result degenerates to -inf.
+  // Assumes `init` is one of the values being reduced, the same assumption
+  // ReduceAggregatorMax relies on - TODO confirm against the caller in the .cc.
+  // An infinite `init` falls back to 0 so the shift stays finite.
+  inline ReduceAggregatorLogSumExp(int64_t N, const T& init) : ReduceAggregator<T, TVAL>(N, 0) {
+    max_ = is_inf(init) ? this->accumulator_ : init;
+  }
+
+  // Single-call path: takes the maximum from Eigen, then accumulates the shifted
+  // exponentials of the N_ values starting at from_data.
+  inline TVAL aggall(const T* from_data) {
+    max_ = Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).maxCoeff();
+    for (int64_t i = 0; i < this->N_; ++i) {
+      update(from_data[i]);
+    }
+    return get_value();
+  }
+
+  // First pass: tracks the largest finite value. Infinite values are skipped so that
+  // the shift used by update() never becomes infinite.
+  inline void update0(const T& v) {
+    if (!is_inf(v) && v > max_) {
+      max_ = v;
+    }
+  }
+
+  // Second pass: accumulates exp(v - max_).
+  inline void update(const T& v) { this->accumulator_ += reduce_exp(v - max_); }
+
+  // Undoes the shift: log(accumulated sum) + max_.
+  inline TVAL get_value() { return reduce_log<T>(this->accumulator_) + max_; }
+
+  static inline bool two_loops() { return true; }
+};
+
+// Prepares a reduction of `input_tensor_ptr` over the requested `axes_`. Results are
+// written to the non-const reference parameters `axes`, `new_input_shape`,
+// `output_shape` and `empty_reduce`.
+// ReduceSum<T>::Impl uses `output_shape` as the keep-dims result shape and takes a
+// separate early-return path when `empty_reduce` is set.
+// `input_shape_override` is optional; presumably it replaces the input's own shape for
+// the computation - TODO confirm in the .cc.
+// NOTE(review): the meaning of the bool return value is not visible from this header.
+bool SetupForReduce(const Tensor* input_tensor_ptr,
+                    const std::vector<int64_t>& axes_,
+                    std::vector<int64_t>& axes,
+                    TensorShape& new_input_shape,
+                    std::vector<int64_t>& output_shape,
+                    bool& empty_reduce,
+                    const TensorShape* input_shape_override);
+
+// Fills `results` for a reduction of a tensor of shape `new_input_shape` over
+// `reduced_axes`.
+// NOTE(review): presumably computes the index vectors and loop sizes stored in
+// ResultsNoTransposePrepareForReduce - confirm in the .cc.
+void NoTransposePrepareForReduce(const TensorShape& new_input_shape,
+                                 const std::vector<int64_t>& reduced_axes,
+                                 ResultsNoTransposePrepareForReduce& results);
+
+// Reduces `input`, interpreted with shape `new_input_shape`, over `reduced_axes` using
+// the aggregator type AGG and writes the result into the already allocated `output`.
+// `tp` is the thread pool handed in by the caller; `last_results` is a
+// ResultsNoTransposePrepareForReduce passed by non-const reference.
+// NOTE(review): how `tp` and `last_results` are used is not visible from this header.
+template <typename T, typename AGG>
+void NoTransposeReduce(Tensor* output, const TensorShape& new_input_shape, const Tensor& input,
+                       const std::vector<int64_t>& reduced_axes, concurrency::ThreadPool* tp,
+                       ResultsNoTransposePrepareForReduce& last_results);
+
+// Shared entry point used by the reduction kernels' Compute() methods: runs the
+// reduction described by the aggregator type AGG for the kernel context `ctx`, using
+// the kernel's `axes_` and `keepdims_` settings.
+// `noop_with_empty_axes` defaults to false; ReduceSumTraining forwards its
+// noop_with_empty_axes_ member here.
+// NOTE(review): `axes_` is taken by value, so the vector is copied on every call; a
+// const reference would avoid that but needs the definition in the .cc changed in step.
+template <typename T, typename AGG>
+void CommonReduce(OpKernelContext* ctx,
+                  const std::vector<int64_t> axes_, int64_t keepdims_,
+                  ResultsNoTransposePrepareForReduce& last_results,
+                  bool noop_with_empty_axes = false);

 template <bool allow_multi_axes>
 class ReduceKernelBase {
--- a/orttraining/orttraining/training_ops/cpu/reduction/reduction_ops.cc
+++ b/orttraining/orttraining/training_ops/cpu/reduction/reduction_ops.cc
@ -27,38 +27,10 @@ REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(double)
 REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(int32_t)
 REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(int64_t)

-
 template <typename T>
 Status ReduceSumTraining<T>::Compute(OpKernelContext* ctx) const {
-  FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
-  int64_t block_size;
-  int64_t blocks;
-  std::vector<int64_t> reduced_dims;
-  const Tensor* input = ctx->Input<Tensor>(0);
-
-  //override the attribute value with the input value for reduction_axes
-  const Tensor* axes_tensor = ctx->Input<Tensor>(1);
-  ORT_ENFORCE(axes_tensor != nullptr, "Axes input is null");
-  ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1,
-              "An axes tensor must be a vector tensor.");
-  auto nDims = static_cast<size_t>(axes_tensor->Shape()[0]);
-  const auto* data = axes_tensor->template Data<int64_t>();
-  std::vector<int64_t> axes(data, data + nDims);
-
-  // empty axes and no-op
-  if (axes.empty() && noop_with_empty_axes_) {
-    auto* output = ctx->Output(0, input->Shape());
-    memcpy(output->template MutableData<T>(), input->template Data<T>(), input->SizeInBytes());
-    return Status::OK();
-  }
-
-  bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes, keepdims_, reduced_dims, true);
-
-  auto* output = ctx->Output(0, reduced_dims);
-
-  ReduceSumCore(input->template Data<T>(), output->template MutableData<T>(),
-                no_transpose, blocks, block_size, transposed_input_data, ctx->GetOperatorThreadPool());
-
+  // Delegates to the shared CommonReduce driver with the sum aggregator, forwarding
+  // axes_, keepdims_ and noop_with_empty_axes_.
+  // NOTE(review): the removed implementation read the axes from input 1 and copied the
+  // input through when the axes were empty and noop_with_empty_axes_ was set. This
+  // call only forwards the axes_ member, so CommonReduce presumably covers both
+  // cases - confirm.
+  ResultsNoTransposePrepareForReduce last_results;
+  CommonReduce<T, ReduceAggregatorSum<T>>(ctx, axes_, keepdims_, last_results, noop_with_empty_axes_);
   return Status::OK();
 }