Optimize ReduceSum, ReduceMean, ReduceMin, ReduceMax (#10280)

* Optimize ReduceSum, ReduceMean, ReduceMin, ReduceMax * improve reducemax, reducemin * faster, smaller * replace std::vector by gsl::span for shapes * fix merging issues
2026-07-13 18:08:13 +00:00 · 2022-02-18 12:51:01 +01:00 · 2022-02-18 12:51:01 +01:00 · 6f0640a57f
commit 6f0640a57f
parent df841ee87d
3 changed files with 520 additions and 56 deletions
--- a/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc
+++ b/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc
@ -259,10 +259,15 @@ static void ValidateFastReduceRK(const gsl::span<const int64_t>& fast_shape, con
 }

 static void ValidateFastReduceKRK(const gsl::span<const int64_t>& fast_shape, const Tensor& output) {
-  ORT_ENFORCE(fast_shape.size() == 3, "Only works on matrices with two dimensions.");
+  ORT_ENFORCE(fast_shape.size() == 3, "Only works on matrices with three dimensions.");
  ORT_ENFORCE(fast_shape[0] * fast_shape[2] == output.Shape().Size(), "Output size mismatch.");
 }

+static void ValidateFastReduceRKR(const gsl::span<const int64_t>& fast_shape, const Tensor& output) {
+  ORT_ENFORCE(fast_shape.size() == 3, "Only works on matrices with three dimensions.");  // RKR layout needs exactly three collapsed dims
+  ORT_ENFORCE(fast_shape[1] == output.Shape().Size(), "Output size mismatch.");  // only the middle (kept) dim contributes output elements
+}
+
 void ReduceAggregatorBase::FastReduceKR(const Tensor&, const gsl::span<const int64_t>&, Tensor&, concurrency::ThreadPool*) {
  ValidateMustBeOverloaded();
 }
@ -272,6 +277,9 @@ void ReduceAggregatorBase::FastReduceRK(const Tensor&, const gsl::span<const int
 void ReduceAggregatorBase::FastReduceKRK(const Tensor&, const gsl::span<const int64_t>&, Tensor&, concurrency::ThreadPool*) {
  ValidateMustBeOverloaded();
 }
+void ReduceAggregatorBase::FastReduceRKR(const Tensor&, const gsl::span<const int64_t>&, Tensor&, concurrency::ThreadPool*) {
+  ValidateMustBeOverloaded();  // base-class placeholder; presumably fails unless the aggregator supplies its own FastReduceRKR -- confirm
+}

 void NoTransposePrepareForReduce(const TensorShape& new_input_shape,
                                 gsl::span<const int64_t> reduced_axes,
@ -624,8 +632,8 @@ FastReduceKind OptimizeShapeForFastReduce(gsl::span<const int64_t> input_shape,
  if (fast_shape.size() == 2) {
    return reduce[0] ? FastReduceKind::kRK : FastReduceKind::kKR;
  }
-  if (fast_shape.size() == 3 && !reduce[0]) {
-    return FastReduceKind::kKRK;
+  if (fast_shape.size() == 3) {
+    return reduce[0] ? FastReduceKind::kRKR : FastReduceKind::kKRK;
  }
  return FastReduceKind::kNone;
 }
@ -671,7 +679,8 @@ bool CommonFastReduceSwitch(OpKernelContext* ctx,
                            FastReduceKind which_fast_reduce,
                            fast_reduce_fct* case_kr,
                            fast_reduce_fct* case_rk,
-                            fast_reduce_fct* case_krk) {
+                            fast_reduce_fct* case_krk,
+                            fast_reduce_fct* case_rkr) {
  TensorShapeVector axes;
  const Tensor* input = ctx->Input<Tensor>(0);
  auto reduced_dims = input->Shape().GetDims();
@ -715,6 +724,14 @@ bool CommonFastReduceSwitch(OpKernelContext* ctx,
          } else {
            break;
          }
+        case FastReduceKind::kRKR:
+          ValidateFastReduceRKR(fast_shape, *output);
+          if (fast_shape[1] >= std::max(2, concurrency::ThreadPool::DegreeOfParallelism(ctx->GetOperatorThreadPool()))) {
+            case_rkr(*input, fast_shape, *output, ctx->GetOperatorThreadPool());
+            return true;
+          } else {
+            break;
+          }
        case FastReduceKind::kR:
        case FastReduceKind::kK:
        case FastReduceKind::kNone:
@ -738,7 +755,8 @@ bool CommonFastReduce(OpKernelContext* ctx,
                      TensorShapeVector& fast_axes) {
  return CommonFastReduceSwitch(ctx, axes_, keepdims_, noop_with_empty_axes,
                                fast_kind, fast_shape, output_shape, fast_axes,
-                                AGG::WhichFastReduce(), &AGG::FastReduceKR, &AGG::FastReduceRK, &AGG::FastReduceKRK);
+                                AGG::WhichFastReduce(), &AGG::FastReduceKR, &AGG::FastReduceRK,
+                                &AGG::FastReduceKRK, &AGG::FastReduceRKR);
 }

 static void ValidateKeepDims(const TensorShape& shape, int64_t keepdims) {
@ -925,6 +943,14 @@ std::unique_ptr<Tensor> ReduceSum<T>::Impl(const Tensor& input, gsl::span<const
        } else {
          break;
        }
+      case FastReduceKind::kRKR:
+        ValidateFastReduceRKR(fast_shape, *output);
+        // FastReduceRKR distributes work over the kept (middle) dimension, so the
+        // parallelism threshold is taken on fast_shape[1], as in CommonFastReduceSwitch.
+        if (fast_shape[1] >= std::max(2, concurrency::ThreadPool::DegreeOfParallelism(tp))) {
+          ReduceAggregatorSum<T>::FastReduceRKR(input, fast_shape, *output, tp);
+          return output;
+        } else {
+          break;  // too few kept entries to split across threads: use the generic path
+        }
      case FastReduceKind::kR:
      case FastReduceKind::kK:
      case FastReduceKind::kNone:
--- a/onnxruntime/core/providers/cpu/reduction/reduction_ops.h
+++ b/onnxruntime/core/providers/cpu/reduction/reduction_ops.h
@ -25,7 +25,8 @@ enum FastReduceKind {
  kKR = 4,     // kept dim, reduced dim
  kRK = 8,     // reduced dim, kept dim
  kKRK = 16,   // kept dim, reduced dim, kept dim
-  kEmpty = 32  // empty reduce
+  kRKR = 32,   // reduced dim, kept dim, reduced dim
+  kEmpty = 64  // empty reduce
 };

 FastReduceKind operator|(FastReduceKind a, FastReduceKind b);
@ -54,6 +55,7 @@ constexpr TensorOpCost ParallelReduceFastCost(int64_t n_row, int64_t n_col, int6
  *  KR - reduction on the last dimensions
  *  RK - reduction on the first dimensions
  *  KRK - reduction on the middle dimensions.
+  *  RKR - reduction on all dimensions but the middle ones

  For these three configuration, the reduction may be optimized
  with vectors operations. Method WhichFastReduce() returns which case
@ -154,6 +156,7 @@ class ReduceAggregatorBase {
  static void FastReduceKR(const Tensor&, const gsl::span<const int64_t>&, Tensor&, concurrency::ThreadPool*);
  static void FastReduceRK(const Tensor&, const gsl::span<const int64_t>&, Tensor&, concurrency::ThreadPool*);
  static void FastReduceKRK(const Tensor&, const gsl::span<const int64_t>&, Tensor&, concurrency::ThreadPool*);
+  static void FastReduceRKR(const Tensor&, const gsl::span<const int64_t>&, Tensor&, concurrency::ThreadPool*);
 };

 template <typename T, typename TVAL = T>
@ -175,20 +178,48 @@ class ReduceAggregator : public ReduceAggregatorBase {
  inline void update0(const T&) {}
  inline TVAL aggall(const T*) {}
  inline TVAL get_value() { return accumulator_; }
+
+ protected:
+  // Shared driver for the RKR layout (reduced, kept, reduced) with fast_shape = {d0, d1, d2}.
+  // For every kept index d in [0, d1): out[d] is seeded with f_init(first slice of d), then
+  // each of the d0 contiguous slices of d2 elements (spaced d1 * d2 apart) is folded into
+  // out[d] through f_update(out[d], slice, d2).
+  // The callables are accepted by const reference so that no std::function copy is made on
+  // entry; the worker lambda keeps its own copies.
+  static void CommonFastReduceRKR(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
+                                  Tensor& output, concurrency::ThreadPool* tp,
+                                  const std::function<TVAL(const T*)>& f_init,
+                                  const std::function<void(TVAL&, const T*, int64_t)>& f_update) {
+    const T* data = input.Data<T>();
+    TVAL* out = output.MutableData<TVAL>();
+    const int64_t d0 = fast_shape[0];
+    const int64_t d2 = fast_shape[2];
+    const int64_t inc = d2 * fast_shape[1];  // distance between two slices of the same kept index
+
+    concurrency::ThreadPool::TryParallelFor(
+        tp, fast_shape[1], ParallelReduceFastCost(fast_shape[1], fast_shape[0] * fast_shape[2], sizeof(T), 6),
+        [data, out, d0, d2, inc, f_init, f_update](ptrdiff_t begin, ptrdiff_t last) {
+          for (ptrdiff_t d = begin; d < last; ++d) {
+            const T* p = data + d * d2;
+            out[d] = f_init(p);
+            for (int64_t i = 0; i < d0; ++i, p += inc) {
+              f_update(out[d], p, d2);
+            }
+          }
+        });
+  }
 };

-template <typename T, typename TVAL = T>
-class ReduceAggregatorSum : public ReduceAggregator<T, TVAL> {
+template <typename T>
+class ReduceAggregatorSum : public ReduceAggregator<T, T> {
 public:
-  inline ReduceAggregatorSum(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 0) {}
+  inline ReduceAggregatorSum(int64_t N, const T&) : ReduceAggregator<T, T>(N, 0) {}
  inline void update(const T& v) { this->accumulator_ += v; }
-  inline TVAL aggall(const T* from_data) {
-    return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).sum();
+  static T aggall(const T* from_data, int64_t size) {
+    return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, size).sum();
+  }
+  inline T aggall(const T* from_data) {
+    return aggall(from_data, this->N_);
  }

  // Fast reduction
  static inline FastReduceKind WhichFastReduce() {
-    return FastReduceKind::kKR | FastReduceKind::kRK | FastReduceKind::kKRK;
+    return FastReduceKind::kKR | FastReduceKind::kRK | FastReduceKind::kKRK | FastReduceKind::kRKR;
  }

  static void FastReduceKR(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
@ -200,7 +231,7 @@ class ReduceAggregatorSum : public ReduceAggregator<T, TVAL> {
        tp, fast_shape[0], ParallelReduceFastCost(1, stridei, sizeof(T), 6),
        [data, stridei, out](ptrdiff_t first, ptrdiff_t last) {
          for (ptrdiff_t d = first; d < last; ++d) {
-            out[d] = ConstEigenVectorArrayMap<T>(data + d * stridei, stridei).sum();
+            out[d] = aggall(data + d * stridei, stridei);
          }
        });
  }
@ -239,6 +270,16 @@ class ReduceAggregatorSum : public ReduceAggregator<T, TVAL> {
          }
        });
  }
+
+  // RKR sum: each kept (middle) entry starts from zero and accumulates the sum of
+  // every contiguous slice handed over by CommonFastReduceRKR.
+  static void FastReduceRKR(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
+                            Tensor& output, concurrency::ThreadPool* tp) {
+    auto zero_seed = [](const T*) -> T { return 0; };
+    auto add_slice = [](T& acc, const T* slice, int64_t count) {
+      acc += aggall(slice, count);
+    };
+    ReduceAggregator<T, T>::CommonFastReduceRKR(input, fast_shape, output, tp, zero_seed, add_slice);
+  }
 };

 template <typename T, typename TVAL = T>
@ -251,12 +292,15 @@ class ReduceAggregatorSumSquare : public ReduceAggregator<T, TVAL> {
  inline void update(const T& v) { this->accumulator_ += v * v; }
 };

-template <typename T, typename TVAL = T>
-class ReduceAggregatorMean : public ReduceAggregatorSum<T, TVAL> {
+template <typename T>
+class ReduceAggregatorMean : public ReduceAggregatorSum<T> {
 public:
-  inline ReduceAggregatorMean(int64_t N, const T&) : ReduceAggregatorSum<T, TVAL>(N, 0) {}
+  inline ReduceAggregatorMean(int64_t N, const T&) : ReduceAggregatorSum<T>(N, 0) {}
+  static T aggall(const T* from_data, int64_t size) {
+    return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, size).mean();
+  }
  inline T aggall(const T* from_data) {
-    return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).mean();
+    return aggall(from_data, this->N_);
  }
  inline T get_value() { return this->accumulator_ / static_cast<T>(this->N_); }

@ -265,7 +309,7 @@ class ReduceAggregatorMean : public ReduceAggregatorSum<T, TVAL> {

  static void FastReduceKR(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
                           Tensor& output, concurrency::ThreadPool* tp) {
-    ReduceAggregatorSum<T, TVAL>::FastReduceKR(input, fast_shape, output, tp);
+    ReduceAggregatorSum<T>::FastReduceKR(input, fast_shape, output, tp);
    // TODO: use MLAS or BLAS
    T* out = output.MutableData<T>();
    T* end = out + fast_shape[0];
@ -276,7 +320,7 @@ class ReduceAggregatorMean : public ReduceAggregatorSum<T, TVAL> {

  static void FastReduceRK(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
                           Tensor& output, concurrency::ThreadPool* tp) {
-    ReduceAggregatorSum<T, TVAL>::FastReduceRK(input, fast_shape, output, tp);
+    ReduceAggregatorSum<T>::FastReduceRK(input, fast_shape, output, tp);
    // TODO: use MLAS or BLAS
    T* out = output.MutableData<T>();
    T* end = out + fast_shape[1];
@ -287,7 +331,7 @@ class ReduceAggregatorMean : public ReduceAggregatorSum<T, TVAL> {

  static void FastReduceKRK(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
                            Tensor& output, concurrency::ThreadPool* tp) {
-    ReduceAggregatorSum<T, TVAL>::FastReduceKRK(input, fast_shape, output, tp);
+    ReduceAggregatorSum<T>::FastReduceKRK(input, fast_shape, output, tp);
    int64_t strideo = fast_shape[2];
    T* out = output.MutableData<T>();
    T* begin;
@ -301,20 +345,34 @@ class ReduceAggregatorMean : public ReduceAggregatorSum<T, TVAL> {
      }
    }
  }
+
+  // RKR mean: compute the RKR sum first, then divide every kept entry by the number
+  // of reduced elements (fast_shape[0] * fast_shape[2]).
+  static void FastReduceRKR(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
+                            Tensor& output, concurrency::ThreadPool* tp) {
+    ReduceAggregatorSum<T>::FastReduceRKR(input, fast_shape, output, tp);
+    const T reduced_count = static_cast<T>(fast_shape[0] * fast_shape[2]);
+    T* values = output.MutableData<T>();
+    for (int64_t k = 0; k < fast_shape[1]; ++k) {
+      values[k] /= reduced_count;
+    }
+  }
 };

-template <typename T, typename TVAL = T>
-class ReduceAggregatorMax : public ReduceAggregator<T, TVAL> {
+template <typename T>
+class ReduceAggregatorMax : public ReduceAggregator<T> {
 public:
-  inline ReduceAggregatorMax(int64_t N, const T& init) : ReduceAggregator<T, TVAL>(N, init) {}
-  inline TVAL aggall(const T* from_data) {
-    return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).maxCoeff();
+  inline ReduceAggregatorMax(int64_t N, const T& init) : ReduceAggregator<T, T>(N, init) {}
+  static T aggall(const T* from_data, int64_t size) {
+    return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, size).maxCoeff();
+  }
+  inline T aggall(const T* from_data) {
+    return aggall(from_data, this->N_);
  }
  inline void update(const T& v) { this->accumulator_ = v > this->accumulator_ ? v : this->accumulator_; }

  // Fast reduction
  static inline FastReduceKind WhichFastReduce() {
-    return FastReduceKind::kKR | FastReduceKind::kRK | FastReduceKind::kKRK;
+    return FastReduceKind::kKR | FastReduceKind::kRK | FastReduceKind::kKRK | FastReduceKind::kRKR;
  }

  static void FastReduceKR(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
@ -347,7 +405,8 @@ class ReduceAggregatorMax : public ReduceAggregator<T, TVAL> {
          for (int64_t row = 1; row < n_rows; ++row) {
            p = data + row * N;
            for (int64_t j = begin; j < end; ++j) {
-              out[j] = out[j] > p[j] ? out[j] : p[j];
+              if (out[j] < p[j])
+                out[j] = p[j];
            }
          }
        });
@ -371,6 +430,18 @@ class ReduceAggregatorMax : public ReduceAggregator<T, TVAL> {
          }
        });
  }
+
+  // RKR max: seed each kept entry with the first element of its first slice, then keep
+  // the larger of the running value and each slice's maximum.
+  static void FastReduceRKR(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
+                            Tensor& output, concurrency::ThreadPool* tp) {
+    auto first_element = [](const T* slice) -> T { return slice[0]; };
+    auto keep_larger = [](T& best, const T* slice, int64_t count) {
+      const T candidate = aggall(slice, count);
+      if (candidate > best) {
+        best = candidate;
+      }
+    };
+    ReduceAggregator<T, T>::CommonFastReduceRKR(input, fast_shape, output, tp, first_element, keep_larger);
+  }
 };

 template <typename T, typename TVAL = int64_t>
@ -462,18 +533,21 @@ class ReduceAggregatorArgMinLastIndex : public ReduceAggregatorArgMin<T, TVAL> {
  }
 };

-template <typename T, typename TVAL = T>
-class ReduceAggregatorMin : public ReduceAggregator<T, TVAL> {
+template <typename T>
+class ReduceAggregatorMin : public ReduceAggregator<T, T> {
 public:
-  inline ReduceAggregatorMin(int64_t N, const T& init) : ReduceAggregator<T, TVAL>(N, init) {}
-  inline TVAL aggall(const T* from_data) {
-    return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).minCoeff();
+  inline ReduceAggregatorMin(int64_t N, const T& init) : ReduceAggregator<T, T>(N, init) {}
+  static T aggall(const T* from_data, int64_t size) {
+    return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, size).minCoeff();
+  }
+  inline T aggall(const T* from_data) {
+    return aggall(from_data, this->N_);
  }
  inline void update(const T& v) { this->accumulator_ = v < this->accumulator_ ? v : this->accumulator_; }

  // Fast reduction
  static inline FastReduceKind WhichFastReduce() {
-    return FastReduceKind::kKR | FastReduceKind::kRK | FastReduceKind::kKRK;
+    return FastReduceKind::kKR | FastReduceKind::kRK | FastReduceKind::kKRK | FastReduceKind::kRKR;
  }

  static void FastReduceKR(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
@ -506,7 +580,8 @@ class ReduceAggregatorMin : public ReduceAggregator<T, TVAL> {
          for (int64_t row = 1; row < n_rows; ++row) {
            p = data + row * N;
            for (int64_t j = begin; j < end; ++j) {
-              out[j] = out[j] < p[j] ? out[j] : p[j];
+              if (out[j] > p[j])
+                out[j] = p[j];
            }
          }
        });
@ -530,60 +605,72 @@ class ReduceAggregatorMin : public ReduceAggregator<T, TVAL> {
          }
        });
  }
+
+  // RKR min: seed each kept entry with the first element of its first slice, then keep
+  // the smaller of the running value and each slice's minimum.
+  static void FastReduceRKR(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
+                            Tensor& output, concurrency::ThreadPool* tp) {
+    auto first_element = [](const T* slice) -> T { return slice[0]; };
+    auto keep_smaller = [](T& best, const T* slice, int64_t count) {
+      const T candidate = aggall(slice, count);
+      if (candidate < best) {
+        best = candidate;
+      }
+    };
+    ReduceAggregator<T, T>::CommonFastReduceRKR(input, fast_shape, output, tp, first_element, keep_smaller);
+  }
 };

-template <typename T, typename TVAL = T>
-class ReduceAggregatorProd : public ReduceAggregator<T, TVAL> {
+template <typename T>
+class ReduceAggregatorProd : public ReduceAggregator<T, T> {
 public:
-  inline ReduceAggregatorProd(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 1) {}
-  inline TVAL aggall(const T* from_data) {
+  inline ReduceAggregatorProd(int64_t N, const T&) : ReduceAggregator<T, T>(N, 1) {}
+  inline T aggall(const T* from_data) {
    return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).prod();
  }
  inline void update(const T& v) { this->accumulator_ *= v; }
 };

-template <typename T, typename TVAL = T>
-class ReduceAggregatorL1 : public ReduceAggregator<T, TVAL> {
+template <typename T>
+class ReduceAggregatorL1 : public ReduceAggregator<T, T> {
 public:
-  inline ReduceAggregatorL1(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 0) {}
-  inline TVAL aggall(const T* from_data) {
+  inline ReduceAggregatorL1(int64_t N, const T&) : ReduceAggregator<T, T>(N, 0) {}
+  inline T aggall(const T* from_data) {
    return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).cwiseAbs().sum();
  }
  inline void update(const T& v) { this->accumulator_ += v > 0 ? v : -v; }
 };

-template <typename T, typename TVAL = T>
-class ReduceAggregatorL2 : public ReduceAggregator<T, TVAL> {
+template <typename T>
+class ReduceAggregatorL2 : public ReduceAggregator<T, T> {
 public:
-  inline ReduceAggregatorL2(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 0) {}
-  inline TVAL aggall(const T* from_data) {
+  inline ReduceAggregatorL2(int64_t N, const T&) : ReduceAggregator<T, T>(N, 0) {}
+  inline T aggall(const T* from_data) {
    return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).norm();
  }
  inline void update(const T& v) { this->accumulator_ += v * v; }
-  inline TVAL get_value() { return reduce_sqrt<T>(this->accumulator_); }
+  inline T get_value() { return reduce_sqrt<T>(this->accumulator_); }
 };

-template <typename T, typename TVAL = T>
-class ReduceAggregatorLogSum : public ReduceAggregator<T, TVAL> {
+template <typename T>
+class ReduceAggregatorLogSum : public ReduceAggregator<T, T> {
 public:
-  inline ReduceAggregatorLogSum(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 0) {}
+  inline ReduceAggregatorLogSum(int64_t N, const T&) : ReduceAggregator<T, T>(N, 0) {}
  inline T aggall(const T* from_data) {
    return reduce_log<T>(Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).sum());
  }
  inline void update(const T& v) { this->accumulator_ += v; }
-  inline TVAL get_value() { return reduce_log<T>(this->accumulator_); }
+  inline T get_value() { return reduce_log<T>(this->accumulator_); }
 };

-template <typename T, typename TVAL = T>
-class ReduceAggregatorLogSumExp : public ReduceAggregator<T, TVAL> {
+template <typename T>
+class ReduceAggregatorLogSumExp : public ReduceAggregator<T, T> {
 protected:
  T max_;

 public:
-  inline ReduceAggregatorLogSumExp(int64_t N, const T& init) : ReduceAggregator<T, TVAL>(N, 0) {
+  inline ReduceAggregatorLogSumExp(int64_t N, const T& init) : ReduceAggregator<T, T>(N, 0) {
    max_ = reduce_isinf(init) ? this->accumulator_ : init;
  }
-  inline TVAL aggall(const T* from_data) {
+  inline T aggall(const T* from_data) {
    max_ = Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).maxCoeff();
    for (int64_t i = 0; i < this->N_; ++i) {
      update(from_data[i]);
@ -594,7 +681,7 @@ class ReduceAggregatorLogSumExp : public ReduceAggregator<T, TVAL> {
    max_ = (reduce_isinf(v) || reduce_isnan(v) || v < max_) ? max_ : v;
  }
  inline void update(const T& v) { this->accumulator_ += reduce_exp(v - max_); }
-  inline TVAL get_value() { return reduce_log<T>(this->accumulator_) + max_; }
+  inline T get_value() { return reduce_log<T>(this->accumulator_) + max_; }
 };

 void NoTransposePrepareForReduce(const TensorShape& new_input_shape,
--- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
+++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
@ -3165,6 +3165,96 @@ TEST(ReductionOpTest, OptimizeShapeForFastReduce_KRK) {
  ASSERT_EQ(fast_axes, expected_fast_axes);
 }

+TEST(ReductionOpTest, OptimizeShapeForFastReduce_RKR) {
+  FastReduceKind fast_kind;
+  TensorShapeVector fast_shape, fast_output_shape, fast_axes;
+  TensorShapeVector expected_fast_shape, expected_fast_output_shape, expected_fast_axes;
+
+  // RKR - keep_dims=1: reduced axes stay in the output shape with size 1
+  fast_kind = OptimizeShapeForFastReduce(
+      std::vector<int64_t>{9, 10, 11}, std::vector<int64_t>{0, 2},
+      fast_shape, fast_output_shape, fast_axes, true);
+  expected_fast_shape = TensorShapeVector{9, 10, 11};
+  expected_fast_output_shape = TensorShapeVector{1, 10, 1};
+  expected_fast_axes = TensorShapeVector{0, 2};
+  ASSERT_EQ(fast_kind, FastReduceKind::kRKR);
+  ASSERT_EQ(fast_shape, expected_fast_shape);
+  ASSERT_EQ(fast_output_shape, expected_fast_output_shape);
+  ASSERT_EQ(fast_axes, expected_fast_axes);
+
+  fast_kind = OptimizeShapeForFastReduce(
+      std::vector<int64_t>{7, 9, 10, 11}, std::vector<int64_t>{0, 3},
+      fast_shape, fast_output_shape, fast_axes, true);
+  expected_fast_shape = TensorShapeVector{7, 90, 11};  // kept axes 1 and 2 collapse into 9 * 10
+  expected_fast_output_shape = TensorShapeVector{1, 9, 10, 1};
+  ASSERT_EQ(fast_kind, FastReduceKind::kRKR);
+  ASSERT_EQ(fast_shape, expected_fast_shape);
+  ASSERT_EQ(fast_output_shape, expected_fast_output_shape);
+  ASSERT_EQ(fast_axes, expected_fast_axes);  // expected_fast_axes is still {0, 2} from the first case
+
+  fast_kind = OptimizeShapeForFastReduce(
+      std::vector<int64_t>{7, 9, 10, 11}, std::vector<int64_t>{0, 2, 3},
+      fast_shape, fast_output_shape, fast_axes, true);
+  expected_fast_shape = TensorShapeVector{7, 9, 110};  // reduced axes 2 and 3 collapse into 10 * 11
+  expected_fast_output_shape = TensorShapeVector{1, 9, 1, 1};
+  ASSERT_EQ(fast_kind, FastReduceKind::kRKR);
+  ASSERT_EQ(fast_shape, expected_fast_shape);
+  ASSERT_EQ(fast_output_shape, expected_fast_output_shape);
+  ASSERT_EQ(fast_axes, expected_fast_axes);
+
+  fast_kind = OptimizeShapeForFastReduce(
+      std::vector<int64_t>{7, 9, 10, 11}, std::vector<int64_t>{0, 1, 3},
+      fast_shape, fast_output_shape, fast_axes, true);
+  expected_fast_shape = TensorShapeVector{63, 10, 11};  // reduced axes 0 and 1 collapse into 7 * 9
+  expected_fast_output_shape = TensorShapeVector{1, 1, 10, 1};
+  ASSERT_EQ(fast_kind, FastReduceKind::kRKR);
+  ASSERT_EQ(fast_shape, expected_fast_shape);
+  ASSERT_EQ(fast_output_shape, expected_fast_output_shape);
+  ASSERT_EQ(fast_axes, expected_fast_axes);
+
+  // RKR - keep_dims=0: reduced axes are removed from the output shape
+  fast_kind = OptimizeShapeForFastReduce(
+      std::vector<int64_t>{9, 10, 11}, std::vector<int64_t>{0, 2},
+      fast_shape, fast_output_shape, fast_axes, false);
+  expected_fast_shape = TensorShapeVector{9, 10, 11};
+  expected_fast_output_shape = TensorShapeVector{10};
+  expected_fast_axes = TensorShapeVector{0, 2};
+  ASSERT_EQ(fast_kind, FastReduceKind::kRKR);
+  ASSERT_EQ(fast_shape, expected_fast_shape);
+  ASSERT_EQ(fast_output_shape, expected_fast_output_shape);
+  ASSERT_EQ(fast_axes, expected_fast_axes);
+
+  fast_kind = OptimizeShapeForFastReduce(
+      std::vector<int64_t>{7, 9, 10, 11}, std::vector<int64_t>{0, 3},
+      fast_shape, fast_output_shape, fast_axes, false);
+  expected_fast_shape = TensorShapeVector{7, 90, 11};
+  expected_fast_output_shape = TensorShapeVector{9, 10};
+  ASSERT_EQ(fast_kind, FastReduceKind::kRKR);
+  ASSERT_EQ(fast_shape, expected_fast_shape);
+  ASSERT_EQ(fast_output_shape, expected_fast_output_shape);
+  ASSERT_EQ(fast_axes, expected_fast_axes);
+
+  fast_kind = OptimizeShapeForFastReduce(
+      std::vector<int64_t>{7, 9, 10, 11}, std::vector<int64_t>{0, 2, 3},
+      fast_shape, fast_output_shape, fast_axes, false);
+  expected_fast_shape = TensorShapeVector{7, 9, 110};
+  expected_fast_output_shape = TensorShapeVector{9};
+  ASSERT_EQ(fast_kind, FastReduceKind::kRKR);
+  ASSERT_EQ(fast_shape, expected_fast_shape);
+  ASSERT_EQ(fast_output_shape, expected_fast_output_shape);
+  ASSERT_EQ(fast_axes, expected_fast_axes);
+
+  fast_kind = OptimizeShapeForFastReduce(
+      std::vector<int64_t>{7, 9, 10, 11}, std::vector<int64_t>{0, 1, 3},
+      fast_shape, fast_output_shape, fast_axes, false);
+  expected_fast_shape = TensorShapeVector{63, 10, 11};
+  expected_fast_output_shape = TensorShapeVector{10};
+  ASSERT_EQ(fast_kind, FastReduceKind::kRKR);
+  ASSERT_EQ(fast_shape, expected_fast_shape);
+  ASSERT_EQ(fast_output_shape, expected_fast_output_shape);
+  ASSERT_EQ(fast_axes, expected_fast_axes);
+}
+
 TEST(ReductionOpTest, OptimizeShapeForFastReduce_NONE) {
  FastReduceKind fast_kind;
  TensorShapeVector fast_shape, fast_output_shape, fast_axes;
@ -3427,6 +3517,53 @@ TEST(ReductionOpTest, ReduceMax_KRK_keepdims) {
  test.Run();
 }

+TEST(ReductionOpTest, ReduceMax_RKR) {
+  OpTester test("ReduceMax");
+  test.AddAttribute("axes", std::vector<int64_t>{0, 2});  // reduce the first and last axes, keep the middle one
+  test.AddAttribute("keepdims", (int64_t)0);
+  test.AddInput<float>("data", {3, 2, 2},
+                       {1.0f, 2.0f,
+                        3.0f, 4.0f,
+
+                        5.0f, 6.0f,
+                        7.0f, 8.0f,
+
+                        9.0f, 10.0f,
+                        11.0f, 12.0f});
+  test.AddOutput<float>("reduced", {2}, {10.f, 12.f});  // max of {1,2,5,6,9,10} and of {3,4,7,8,11,12}
+  test.Run();
+}
+
+TEST(ReductionOpTest, ReduceMax_RKR_parallel) {
+  // Input {2, 16, 2} holds 0..63 in row-major order; 16 kept entries remain after reducing axes 0 and 2.
+  std::vector<float> in_values(64);
+  for (size_t idx = 0; idx < in_values.size(); ++idx) {
+    in_values[idx] = static_cast<float>(idx);
+  }
+
+  OpTester test("ReduceMax");
+  test.AddAttribute("axes", std::vector<int64_t>{0, 2});
+  test.AddAttribute("keepdims", (int64_t)0);
+  test.AddInput<float>("data", {2, 16, 2}, in_values);
+  // Kept index d sees {2d, 2d+1, 32+2d, 33+2d}; the maximum is 33 + 2d.
+  test.AddOutput<float>("reduced", {16},
+                        {33.0f, 35.0f, 37.0f, 39.0f, 41.0f, 43.0f, 45.0f, 47.0f,
+                         49.0f, 51.0f, 53.0f, 55.0f, 57.0f, 59.0f, 61.0f, 63.0f});
+  test.Run();
+}
+
+TEST(ReductionOpTest, ReduceMax_RKR_keepdims) {
+  OpTester test("ReduceMax");
+  test.AddAttribute("axes", std::vector<int64_t>{0, 2});  // reduce the first and last axes, keep the middle one
+  test.AddAttribute("keepdims", (int64_t)1);
+  test.AddInput<float>("data", {3, 2, 2},
+                       {1.0f, 2.0f,
+                        3.0f, 4.0f,
+
+                        5.0f, 6.0f,
+                        7.0f, 8.0f,
+
+                        9.0f, 10.0f,
+                        11.0f, 12.0f});
+  test.AddOutput<float>("reduced", {1, 2, 1}, {10.f, 12.f});  // same values as without keepdims; reduced axes kept with size 1
+  test.Run();
+}
+
 TEST(ReductionOpTest, ReduceMax_RKRK) {
  OpTester test("ReduceMax");
  test.AddAttribute("axes", std::vector<int64_t>{0, 2});
@ -3581,6 +3718,53 @@ TEST(ReductionOpTest, ReduceMean_KRK_keepdims) {
  test.Run();
 }

+TEST(ReductionOpTest, ReduceMean_RKR) {
+  OpTester test("ReduceMean");
+  test.AddAttribute("axes", std::vector<int64_t>{0, 2});  // reduce the first and last axes, keep the middle one
+  test.AddAttribute("keepdims", (int64_t)0);
+  test.AddInput<float>("data", {3, 2, 2},
+                       {1.0f, 2.0f,
+                        3.0f, 4.0f,
+
+                        5.0f, 6.0f,
+                        7.0f, 8.0f,
+
+                        9.0f, 10.0f,
+                        11.0f, 12.0f});
+  test.AddOutput<float>("reduced", {2}, {5.5f, 7.5f});  // 33 / 6 and 45 / 6
+  test.Run();
+}
+
+TEST(ReductionOpTest, ReduceMean_RKR_parallel) {
+  // Input {2, 16, 2} holds 0..63 in row-major order; 16 kept entries remain after reducing axes 0 and 2.
+  std::vector<float> in_values(64);
+  for (size_t idx = 0; idx < in_values.size(); ++idx) {
+    in_values[idx] = static_cast<float>(idx);
+  }
+
+  OpTester test("ReduceMean");
+  test.AddAttribute("axes", std::vector<int64_t>{0, 2});
+  test.AddAttribute("keepdims", (int64_t)0);
+  test.AddInput<float>("data", {2, 16, 2}, in_values);
+  // Kept index d sees {2d, 2d+1, 32+2d, 33+2d}; the mean is 16.5 + 2d.
+  test.AddOutput<float>("reduced", {16},
+                        {16.5f, 18.5f, 20.5f, 22.5f, 24.5f, 26.5f, 28.5f, 30.5f,
+                         32.5f, 34.5f, 36.5f, 38.5f, 40.5f, 42.5f, 44.5f, 46.5f});
+  test.Run();
+}
+
+TEST(ReductionOpTest, ReduceMean_RKR_keepdims) {
+  OpTester test("ReduceMean");
+  test.AddAttribute("axes", std::vector<int64_t>{0, 2});  // reduce the first and last axes, keep the middle one
+  test.AddAttribute("keepdims", (int64_t)1);
+  test.AddInput<float>("data", {3, 2, 2},
+                       {1.0f, 2.0f,
+                        3.0f, 4.0f,
+
+                        5.0f, 6.0f,
+                        7.0f, 8.0f,
+
+                        9.0f, 10.0f,
+                        11.0f, 12.0f});
+  test.AddOutput<float>("reduced", {1, 2, 1}, {5.5f, 7.5f});  // same values as without keepdims; reduced axes kept with size 1
+  test.Run();
+}
+
 TEST(ReductionOpTest, ReduceMean_RKRK) {
  OpTester test("ReduceMean");
  test.AddAttribute("axes", std::vector<int64_t>{0, 2});
@ -3771,6 +3955,53 @@ TEST(ReductionOpTest, ReduceMin_KRK_keepdims) {
  test.Run();
 }

+TEST(ReductionOpTest, ReduceMin_RKR) {
+  OpTester test("ReduceMin");
+  test.AddAttribute("axes", std::vector<int64_t>{0, 2});  // reduce the first and last axes, keep the middle one
+  test.AddAttribute("keepdims", (int64_t)0);
+  test.AddInput<float>("data", {3, 2, 2},
+                       {11.0f, 12.0f,
+                        13.0f, 14.0f,
+
+                        15.0f, 16.0f,
+                        17.0f, 18.0f,
+
+                        19.0f, 20.0f,
+                        21.0f, 22.0f});
+  test.AddOutput<float>("reduced", {2}, {11.f, 13.f});  // min of {11,12,15,16,19,20} and of {13,14,17,18,21,22}
+  test.Run();
+}
+
+TEST(ReductionOpTest, ReduceMin_RKR_parallel) {
+  // Input {2, 16, 2} holds 0..63 in row-major order; 16 kept entries remain after reducing axes 0 and 2.
+  std::vector<float> in_values(64);
+  for (size_t idx = 0; idx < in_values.size(); ++idx) {
+    in_values[idx] = static_cast<float>(idx);
+  }
+
+  OpTester test("ReduceMin");
+  test.AddAttribute("axes", std::vector<int64_t>{0, 2});
+  test.AddAttribute("keepdims", (int64_t)0);
+  test.AddInput<float>("data", {2, 16, 2}, in_values);
+  // Kept index d sees {2d, 2d+1, 32+2d, 33+2d}; the minimum is 2d.
+  test.AddOutput<float>("reduced", {16},
+                        {0.0f, 2.0f, 4.0f, 6.0f, 8.0f, 10.0f, 12.0f, 14.0f,
+                         16.0f, 18.0f, 20.0f, 22.0f, 24.0f, 26.0f, 28.0f, 30.0f});
+  test.Run();
+}
+
+TEST(ReductionOpTest, ReduceMin_RKR_keepdims) {
+  OpTester test("ReduceMin");
+  test.AddAttribute("axes", std::vector<int64_t>{0, 2});  // reduce the first and last axes, keep the middle one
+  test.AddAttribute("keepdims", (int64_t)1);
+  test.AddInput<float>("data", {3, 2, 2},
+                       {11.0f, 12.0f,
+                        13.0f, 14.0f,
+
+                        15.0f, 16.0f,
+                        17.0f, 18.0f,
+
+                        19.0f, 20.0f,
+                        21.0f, 22.0f});
+  test.AddOutput<float>("reduced", {1, 2, 1}, {11.f, 13.f});  // same values as without keepdims; reduced axes kept with size 1
+  test.Run();
+}
+
 TEST(ReductionOpTest, ReduceMin_RKRK) {
  OpTester test("ReduceMin");
  test.AddAttribute("axes", std::vector<int64_t>{0, 2});
@ -4102,6 +4333,126 @@ TEST(ReductionOpTest, ReduceSum_KRK2_keepdims) {
  test.Run();
 }

+TEST(ReductionOpTest, ReduceSum_RKR) {  // ReduceSum over axes {0, 2} of a 3x2x2 input, dropping the reduced axes.
+  OpTester test("ReduceSum");
+  test.AddAttribute("axes", std::vector<int64_t>{0, 2});  // reduce the first and last axes; the middle axis is kept
+  test.AddAttribute("keepdims", (int64_t)0);  // reduced axes are dropped from the output shape
+  test.AddInput<float>("data", {3, 2, 2},  // 12 values, 1 through 12 in flat order
+                       {1.0f, 2.0f,  // slice [0]
+                        3.0f, 4.0f,
+
+                        5.0f, 6.0f,  // slice [1]
+                        7.0f, 8.0f,
+
+                        9.0f, 10.0f,  // slice [2]
+                        11.0f, 12.0f});
+  test.AddOutput<float>("reduced", {2}, {33.f, 45.f});  // 1+2+5+6+9+10 = 33 and 3+4+7+8+11+12 = 45
+  test.Run();
+}
+
+TEST(ReductionOpTest, ReduceSum_RKR_parallel) {  // ReduceSum over axes {0, 2} of a 2x16x2 input. NOTE(review): "parallel" presumably means this size is meant to reach a threaded path - confirm.
+  OpTester test("ReduceSum");
+  test.AddAttribute("axes", std::vector<int64_t>{0, 2});  // reduce the first and last axes; the middle axis is kept
+  test.AddAttribute("keepdims", (int64_t)0);  // reduced axes are dropped from the output shape
+  test.AddInput<float>("data", {2, 16, 2},  // 64 values, 0 through 63 in flat order
+                       {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f,
+                        17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f, 31.0f, 32.0f,
+                        33.0f, 34.0f, 35.0f, 36.0f, 37.0f, 38.0f, 39.0f, 40.0f, 41.0f, 42.0f, 43.0f, 44.0f, 45.0f, 46.0f, 47.0f, 48.0f,
+                        49.0f, 50.0f, 51.0f, 52.0f, 53.0f, 54.0f, 55.0f, 56.0f, 57.0f, 58.0f, 59.0f, 60.0f, 61.0f, 62.0f, 63.0f});
+  test.AddOutput<float>("reduced", {16}, {66.0f, 74.0f, 82.0f, 90.0f, 98.0f, 106.0f, 114.0f, 122.0f, 130.0f, 138.0f, 146.0f, 154.0f, 162.0f, 170.0f, 178.0f, 186.0f});  // sum for kept index j is (4*j + 1) + (4*j + 65) = 8*j + 66
+  test.Run();
+}
+
+TEST(ReductionOpTest, ReduceSum_RKR_parallel_bigger) {  // ReduceSum over axes {0, 2} of a 2x128x2 input, checked against a reference loop.
+  OpTester test("ReduceSum");
+  test.AddAttribute("axes", std::vector<int64_t>{0, 2});  // reduce the first and last axes; the middle axis is kept
+  test.AddAttribute("keepdims", (int64_t)0);  // reduced axes are dropped from the output shape
+  std::vector<float> in_data(512);  // 2 * 128 * 2 elements
+  for (size_t i = 0; i < in_data.size(); ++i)
+    in_data[i] = (float)i;  // each element holds its own flat index, 0 through 511
+  test.AddInput<float>("data", {2, 128, 2}, in_data);
+  std::vector<float> expected(128);  // one sum per index of the kept middle axis
+  for (size_t j = 0; j < 128; ++j) {  // j walks the kept axis
+    expected[j] = 0;
+    for (size_t i = 0; i < 2; ++i) {  // i walks reduced axis 0
+      for (size_t k = 0; k < 2; ++k) {  // k walks reduced axis 2
+        expected[j] += in_data[i * 256 + j * 2 + k];  // flat offset of element [i][j][k] in a 2x128x2 layout
+      }
+    }
+  }
+  test.AddOutput<float>("reduced", {128}, expected);
+  test.Run();
+}
+
+TEST(ReductionOpTest, ReduceSum_RKR_keepdims) {  // ReduceSum over axes {0, 2} of a 3x2x2 input, keeping reduced axes as size 1.
+  OpTester test("ReduceSum");
+  test.AddAttribute("axes", std::vector<int64_t>{0, 2});  // reduce the first and last axes; the middle axis is kept
+  test.AddAttribute("keepdims", (int64_t)1);  // reduced axes stay in the output shape with extent 1
+  test.AddInput<float>("data", {3, 2, 2},  // 12 values, 1 through 12 in flat order
+                       {1.0f, 2.0f,  // slice [0]
+                        3.0f, 4.0f,
+
+                        5.0f, 6.0f,  // slice [1]
+                        7.0f, 8.0f,
+
+                        9.0f, 10.0f,  // slice [2]
+                        11.0f, 12.0f});
+  test.AddOutput<float>("reduced", {1, 2, 1}, {33.f, 45.f});  // 1+2+5+6+9+10 = 33 and 3+4+7+8+11+12 = 45
+  test.Run();
+}
+
+TEST(ReductionOpTest, ReduceSum_RKR2) {  // ReduceSum over axes {0, 3} of a 4-D 3x2x2x2 input; the two middle axes are kept.
+  OpTester test("ReduceSum");
+  test.AddAttribute("axes", std::vector<int64_t>{0, 3});  // reduce the first and last axes. NOTE(review): presumably the kept axes 1 and 2 are merged into one for the RKR fast path - confirm.
+  test.AddAttribute("keepdims", (int64_t)0);  // reduced axes are dropped from the output shape
+  test.AddInput<float>("data", {3, 2, 2, 2},  // 24 values, 1 through 24 in flat order
+                       {1.0f, 2.0f,
+                        3.0f, 4.0f,
+
+                        5.0f, 6.0f,
+                        7.0f, 8.0f,
+
+                        9.0f, 10.0f,
+                        11.0f, 12.0f,
+
+                        13.0f, 14.0f,
+                        15.0f, 16.0f,
+
+                        17.0f, 18.0f,
+                        19.0f, 20.0f,
+
+                        21.0f, 22.0f,
+                        23.0f, 24.0f});
+  test.AddOutput<float>("reduced", {2, 2}, {57.0f, 69.0f, 81.0f, 93.0f});  // each output sums 6 inputs, e.g. [0][0] = 1+2+9+10+17+18 = 57
+  test.Run();
+}
+
+TEST(ReductionOpTest, ReduceSum_RKR2_keepdims) {  // ReduceSum over axes {0, 3} of a 4-D 3x2x2x2 input, keeping reduced axes as size 1.
+  OpTester test("ReduceSum");
+  test.AddAttribute("axes", std::vector<int64_t>{0, 3});  // reduce the first and last axes; the two middle axes are kept
+  test.AddAttribute("keepdims", (int64_t)1);  // reduced axes stay in the output shape with extent 1
+  test.AddInput<float>("data", {3, 2, 2, 2},  // 24 values, 1 through 24 in flat order
+                       {1.0f, 2.0f,
+                        3.0f, 4.0f,
+
+                        5.0f, 6.0f,
+                        7.0f, 8.0f,
+
+                        9.0f, 10.0f,
+                        11.0f, 12.0f,
+
+                        13.0f, 14.0f,
+                        15.0f, 16.0f,
+
+                        17.0f, 18.0f,
+                        19.0f, 20.0f,
+
+                        21.0f, 22.0f,
+                        23.0f, 24.0f});
+  test.AddOutput<float>("reduced", {1, 2, 2, 1}, {57.0f, 69.0f, 81.0f, 93.0f});  // each output sums 6 inputs, e.g. [0][0][0][0] = 1+2+9+10+17+18 = 57
+  test.Run();
+}
+
 TEST(ReductionOpTest, ReduceSum_RKRK) {
  OpTester test("ReduceSum");
  test.AddAttribute("axes", std::vector<int64_t>{0, 2});