diff --git a/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc b/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc index 97db23c50a..46014d3101 100644 --- a/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc @@ -259,10 +259,15 @@ static void ValidateFastReduceRK(const gsl::span& fast_shape, con } static void ValidateFastReduceKRK(const gsl::span& fast_shape, const Tensor& output) { - ORT_ENFORCE(fast_shape.size() == 3, "Only works on matrices with two dimensions."); + ORT_ENFORCE(fast_shape.size() == 3, "Only works on matrices with three dimensions."); ORT_ENFORCE(fast_shape[0] * fast_shape[2] == output.Shape().Size(), "Output size mismatch."); } +static void ValidateFastReduceRKR(const gsl::span& fast_shape, const Tensor& output) { + ORT_ENFORCE(fast_shape.size() == 3, "Only works on matrices with three dimensions."); + ORT_ENFORCE(fast_shape[1] == output.Shape().Size(), "Output size mismatch."); +} + void ReduceAggregatorBase::FastReduceKR(const Tensor&, const gsl::span&, Tensor&, concurrency::ThreadPool*) { ValidateMustBeOverloaded(); } @@ -272,6 +277,9 @@ void ReduceAggregatorBase::FastReduceRK(const Tensor&, const gsl::span&, Tensor&, concurrency::ThreadPool*) { ValidateMustBeOverloaded(); } +void ReduceAggregatorBase::FastReduceRKR(const Tensor&, const gsl::span&, Tensor&, concurrency::ThreadPool*) { + ValidateMustBeOverloaded(); +} void NoTransposePrepareForReduce(const TensorShape& new_input_shape, gsl::span reduced_axes, @@ -624,8 +632,8 @@ FastReduceKind OptimizeShapeForFastReduce(gsl::span input_shape, if (fast_shape.size() == 2) { return reduce[0] ? FastReduceKind::kRK : FastReduceKind::kKR; } - if (fast_shape.size() == 3 && !reduce[0]) { - return FastReduceKind::kKRK; + if (fast_shape.size() == 3) { + return reduce[0] ? 
FastReduceKind::kRKR : FastReduceKind::kKRK; } return FastReduceKind::kNone; } @@ -671,7 +679,8 @@ bool CommonFastReduceSwitch(OpKernelContext* ctx, FastReduceKind which_fast_reduce, fast_reduce_fct* case_kr, fast_reduce_fct* case_rk, - fast_reduce_fct* case_krk) { + fast_reduce_fct* case_krk, + fast_reduce_fct* case_rkr) { TensorShapeVector axes; const Tensor* input = ctx->Input(0); auto reduced_dims = input->Shape().GetDims(); @@ -715,6 +724,14 @@ bool CommonFastReduceSwitch(OpKernelContext* ctx, } else { break; } + case FastReduceKind::kRKR: + ValidateFastReduceRKR(fast_shape, *output); + if (fast_shape[1] >= std::max(2, concurrency::ThreadPool::DegreeOfParallelism(ctx->GetOperatorThreadPool()))) { + case_rkr(*input, fast_shape, *output, ctx->GetOperatorThreadPool()); + return true; + } else { + break; + } case FastReduceKind::kR: case FastReduceKind::kK: case FastReduceKind::kNone: @@ -738,7 +755,8 @@ bool CommonFastReduce(OpKernelContext* ctx, TensorShapeVector& fast_axes) { return CommonFastReduceSwitch(ctx, axes_, keepdims_, noop_with_empty_axes, fast_kind, fast_shape, output_shape, fast_axes, - AGG::WhichFastReduce(), &AGG::FastReduceKR, &AGG::FastReduceRK, &AGG::FastReduceKRK); + AGG::WhichFastReduce(), &AGG::FastReduceKR, &AGG::FastReduceRK, + &AGG::FastReduceKRK, &AGG::FastReduceRKR); } static void ValidateKeepDims(const TensorShape& shape, int64_t keepdims) { @@ -925,6 +943,14 @@ std::unique_ptr ReduceSum::Impl(const Tensor& input, gsl::span= std::max(2, concurrency::ThreadPool::DegreeOfParallelism(tp))) { + ReduceAggregatorSum::FastReduceRKR(input, fast_shape, *output, tp); + return output; + } else { + break; + } case FastReduceKind::kR: case FastReduceKind::kK: case FastReduceKind::kNone: diff --git a/onnxruntime/core/providers/cpu/reduction/reduction_ops.h b/onnxruntime/core/providers/cpu/reduction/reduction_ops.h index ebfa9f36e0..a0fd97fc4d 100644 --- a/onnxruntime/core/providers/cpu/reduction/reduction_ops.h +++ 
b/onnxruntime/core/providers/cpu/reduction/reduction_ops.h @@ -25,7 +25,8 @@ enum FastReduceKind { kKR = 4, // kept dim, reduced dim kRK = 8, // reduced dim, kept dim kKRK = 16, // kept dim, reduced dim, kept dim - kEmpty = 32 // empty reduce + kRKR = 32, // reduced dim, kept dim, reduced dim + kEmpty = 64 // empty reduce }; FastReduceKind operator|(FastReduceKind a, FastReduceKind b); @@ -54,6 +55,7 @@ constexpr TensorOpCost ParallelReduceFastCost(int64_t n_row, int64_t n_col, int6 * KR - reduction on the last dimensions * RK - reduction on the first dimensions * KRK - reduction on the middle dimensions. + * RKR - reduction on all dimensions but the middle ones For these three configuration, the reduction may be optimized with vectors operations. Method WhichFastReduce() returns which case @@ -154,6 +156,7 @@ class ReduceAggregatorBase { static void FastReduceKR(const Tensor&, const gsl::span&, Tensor&, concurrency::ThreadPool*); static void FastReduceRK(const Tensor&, const gsl::span&, Tensor&, concurrency::ThreadPool*); static void FastReduceKRK(const Tensor&, const gsl::span&, Tensor&, concurrency::ThreadPool*); + static void FastReduceRKR(const Tensor&, const gsl::span&, Tensor&, concurrency::ThreadPool*); }; template @@ -175,20 +178,48 @@ class ReduceAggregator : public ReduceAggregatorBase { inline void update0(const T&) {} inline TVAL aggall(const T*) {} inline TVAL get_value() { return accumulator_; } + + protected: + static void CommonFastReduceRKR(const Tensor& input, const gsl::span& fast_shape, + Tensor& output, concurrency::ThreadPool* tp, + std::function f_init, + std::function f_update) { + const T* data = input.Data(); + TVAL* out = output.MutableData(); + int64_t d0 = fast_shape[0]; + int64_t d2 = fast_shape[2]; + int64_t inc = d2 * fast_shape[1]; + + concurrency::ThreadPool::TryParallelFor( + tp, fast_shape[1], ParallelReduceFastCost(fast_shape[1], fast_shape[0] * fast_shape[2], sizeof(T), 6), + [data, out, d0, d2, inc, f_init, 
f_update](ptrdiff_t begin, ptrdiff_t last) { + const T* p; + for (ptrdiff_t d = begin; d < last; ++d) { + p = data + d * d2; + out[d] = f_init(p); + for (int64_t i = 0; i < d0; ++i, p += inc) { + f_update(out[d], p, d2); + } + } + }); + } }; -template -class ReduceAggregatorSum : public ReduceAggregator { +template +class ReduceAggregatorSum : public ReduceAggregator { public: - inline ReduceAggregatorSum(int64_t N, const T&) : ReduceAggregator(N, 0) {} + inline ReduceAggregatorSum(int64_t N, const T&) : ReduceAggregator(N, 0) {} inline void update(const T& v) { this->accumulator_ += v; } - inline TVAL aggall(const T* from_data) { - return Eigen::Map>(from_data, this->N_).sum(); + static T aggall(const T* from_data, int64_t size) { + return Eigen::Map>(from_data, size).sum(); + } + inline T aggall(const T* from_data) { + return aggall(from_data, this->N_); } // Fast reduction static inline FastReduceKind WhichFastReduce() { - return FastReduceKind::kKR | FastReduceKind::kRK | FastReduceKind::kKRK; + return FastReduceKind::kKR | FastReduceKind::kRK | FastReduceKind::kKRK | FastReduceKind::kRKR; } static void FastReduceKR(const Tensor& input, const gsl::span& fast_shape, @@ -200,7 +231,7 @@ class ReduceAggregatorSum : public ReduceAggregator { tp, fast_shape[0], ParallelReduceFastCost(1, stridei, sizeof(T), 6), [data, stridei, out](ptrdiff_t first, ptrdiff_t last) { for (ptrdiff_t d = first; d < last; ++d) { - out[d] = ConstEigenVectorArrayMap(data + d * stridei, stridei).sum(); + out[d] = aggall(data + d * stridei, stridei); } }); } @@ -239,6 +270,16 @@ class ReduceAggregatorSum : public ReduceAggregator { } }); } + + static void FastReduceRKR(const Tensor& input, const gsl::span& fast_shape, + Tensor& output, concurrency::ThreadPool* tp) { + ReduceAggregator::CommonFastReduceRKR( + input, fast_shape, output, tp, + [=](const T*) -> T { return 0; }, + [=](T& value, const T* p, int64_t size) { + value += aggall(p, size); + }); + } }; template @@ -251,12 +292,15 @@ 
class ReduceAggregatorSumSquare : public ReduceAggregator { inline void update(const T& v) { this->accumulator_ += v * v; } }; -template -class ReduceAggregatorMean : public ReduceAggregatorSum { +template +class ReduceAggregatorMean : public ReduceAggregatorSum { public: - inline ReduceAggregatorMean(int64_t N, const T&) : ReduceAggregatorSum(N, 0) {} + inline ReduceAggregatorMean(int64_t N, const T&) : ReduceAggregatorSum(N, 0) {} + static T aggall(const T* from_data, int64_t size) { + return Eigen::Map>(from_data, size).mean(); + } inline T aggall(const T* from_data) { - return Eigen::Map>(from_data, this->N_).mean(); + return aggall(from_data, this->N_); } inline T get_value() { return this->accumulator_ / static_cast(this->N_); } @@ -265,7 +309,7 @@ class ReduceAggregatorMean : public ReduceAggregatorSum { static void FastReduceKR(const Tensor& input, const gsl::span& fast_shape, Tensor& output, concurrency::ThreadPool* tp) { - ReduceAggregatorSum::FastReduceKR(input, fast_shape, output, tp); + ReduceAggregatorSum::FastReduceKR(input, fast_shape, output, tp); // TODO: use MLAS or BLAS T* out = output.MutableData(); T* end = out + fast_shape[0]; @@ -276,7 +320,7 @@ class ReduceAggregatorMean : public ReduceAggregatorSum { static void FastReduceRK(const Tensor& input, const gsl::span& fast_shape, Tensor& output, concurrency::ThreadPool* tp) { - ReduceAggregatorSum::FastReduceRK(input, fast_shape, output, tp); + ReduceAggregatorSum::FastReduceRK(input, fast_shape, output, tp); // TODO: use MLAS or BLAS T* out = output.MutableData(); T* end = out + fast_shape[1]; @@ -287,7 +331,7 @@ class ReduceAggregatorMean : public ReduceAggregatorSum { static void FastReduceKRK(const Tensor& input, const gsl::span& fast_shape, Tensor& output, concurrency::ThreadPool* tp) { - ReduceAggregatorSum::FastReduceKRK(input, fast_shape, output, tp); + ReduceAggregatorSum::FastReduceKRK(input, fast_shape, output, tp); int64_t strideo = fast_shape[2]; T* out = output.MutableData(); T* 
begin; @@ -301,20 +345,34 @@ class ReduceAggregatorMean : public ReduceAggregatorSum { } } } + + static void FastReduceRKR(const Tensor& input, const gsl::span& fast_shape, + Tensor& output, concurrency::ThreadPool* tp) { + ReduceAggregatorSum::FastReduceRKR(input, fast_shape, output, tp); + T* out = output.MutableData(); + T div = static_cast(fast_shape[0] * fast_shape[2]); + T* end = out + fast_shape[1]; + for (; out != end; ++out) { + *out /= div; + } + } }; -template -class ReduceAggregatorMax : public ReduceAggregator { +template +class ReduceAggregatorMax : public ReduceAggregator { public: - inline ReduceAggregatorMax(int64_t N, const T& init) : ReduceAggregator(N, init) {} - inline TVAL aggall(const T* from_data) { - return Eigen::Map>(from_data, this->N_).maxCoeff(); + inline ReduceAggregatorMax(int64_t N, const T& init) : ReduceAggregator(N, init) {} + static T aggall(const T* from_data, int64_t size) { + return Eigen::Map>(from_data, size).maxCoeff(); + } + inline T aggall(const T* from_data) { + return aggall(from_data, this->N_); } inline void update(const T& v) { this->accumulator_ = v > this->accumulator_ ? v : this->accumulator_; } // Fast reduction static inline FastReduceKind WhichFastReduce() { - return FastReduceKind::kKR | FastReduceKind::kRK | FastReduceKind::kKRK; + return FastReduceKind::kKR | FastReduceKind::kRK | FastReduceKind::kKRK | FastReduceKind::kRKR; } static void FastReduceKR(const Tensor& input, const gsl::span& fast_shape, @@ -347,7 +405,8 @@ class ReduceAggregatorMax : public ReduceAggregator { for (int64_t row = 1; row < n_rows; ++row) { p = data + row * N; for (int64_t j = begin; j < end; ++j) { - out[j] = out[j] > p[j] ? 
out[j] : p[j]; + if (out[j] < p[j]) + out[j] = p[j]; } } }); @@ -371,6 +430,18 @@ class ReduceAggregatorMax : public ReduceAggregator { } }); } + + static void FastReduceRKR(const Tensor& input, const gsl::span& fast_shape, + Tensor& output, concurrency::ThreadPool* tp) { + ReduceAggregator::CommonFastReduceRKR( + input, fast_shape, output, tp, + [=](const T* p) -> T { return p[0]; }, + [=](T& value, const T* p, int64_t size) { + T v = aggall(p, size); + if (v > value) + value = v; + }); + } }; template @@ -462,18 +533,21 @@ class ReduceAggregatorArgMinLastIndex : public ReduceAggregatorArgMin { } }; -template -class ReduceAggregatorMin : public ReduceAggregator { +template +class ReduceAggregatorMin : public ReduceAggregator { public: - inline ReduceAggregatorMin(int64_t N, const T& init) : ReduceAggregator(N, init) {} - inline TVAL aggall(const T* from_data) { - return Eigen::Map>(from_data, this->N_).minCoeff(); + inline ReduceAggregatorMin(int64_t N, const T& init) : ReduceAggregator(N, init) {} + static T aggall(const T* from_data, int64_t size) { + return Eigen::Map>(from_data, size).minCoeff(); + } + inline T aggall(const T* from_data) { + return aggall(from_data, this->N_); } inline void update(const T& v) { this->accumulator_ = v < this->accumulator_ ? v : this->accumulator_; } // Fast reduction static inline FastReduceKind WhichFastReduce() { - return FastReduceKind::kKR | FastReduceKind::kRK | FastReduceKind::kKRK; + return FastReduceKind::kKR | FastReduceKind::kRK | FastReduceKind::kKRK | FastReduceKind::kRKR; } static void FastReduceKR(const Tensor& input, const gsl::span& fast_shape, @@ -506,7 +580,8 @@ class ReduceAggregatorMin : public ReduceAggregator { for (int64_t row = 1; row < n_rows; ++row) { p = data + row * N; for (int64_t j = begin; j < end; ++j) { - out[j] = out[j] < p[j] ? 
out[j] : p[j]; + if (out[j] > p[j]) + out[j] = p[j]; } } }); @@ -530,60 +605,72 @@ class ReduceAggregatorMin : public ReduceAggregator { } }); } + + static void FastReduceRKR(const Tensor& input, const gsl::span& fast_shape, + Tensor& output, concurrency::ThreadPool* tp) { + ReduceAggregator::CommonFastReduceRKR( + input, fast_shape, output, tp, + [=](const T* p) -> T { return p[0]; }, + [=](T& value, const T* p, int64_t size) { + T v = aggall(p, size); + if (v < value) + value = v; + }); + } }; -template -class ReduceAggregatorProd : public ReduceAggregator { +template +class ReduceAggregatorProd : public ReduceAggregator { public: - inline ReduceAggregatorProd(int64_t N, const T&) : ReduceAggregator(N, 1) {} - inline TVAL aggall(const T* from_data) { + inline ReduceAggregatorProd(int64_t N, const T&) : ReduceAggregator(N, 1) {} + inline T aggall(const T* from_data) { return Eigen::Map>(from_data, this->N_).prod(); } inline void update(const T& v) { this->accumulator_ *= v; } }; -template -class ReduceAggregatorL1 : public ReduceAggregator { +template +class ReduceAggregatorL1 : public ReduceAggregator { public: - inline ReduceAggregatorL1(int64_t N, const T&) : ReduceAggregator(N, 0) {} - inline TVAL aggall(const T* from_data) { + inline ReduceAggregatorL1(int64_t N, const T&) : ReduceAggregator(N, 0) {} + inline T aggall(const T* from_data) { return Eigen::Map>(from_data, this->N_).cwiseAbs().sum(); } inline void update(const T& v) { this->accumulator_ += v > 0 ? 
v : -v; } }; -template -class ReduceAggregatorL2 : public ReduceAggregator { +template +class ReduceAggregatorL2 : public ReduceAggregator { public: - inline ReduceAggregatorL2(int64_t N, const T&) : ReduceAggregator(N, 0) {} - inline TVAL aggall(const T* from_data) { + inline ReduceAggregatorL2(int64_t N, const T&) : ReduceAggregator(N, 0) {} + inline T aggall(const T* from_data) { return Eigen::Map>(from_data, this->N_).norm(); } inline void update(const T& v) { this->accumulator_ += v * v; } - inline TVAL get_value() { return reduce_sqrt(this->accumulator_); } + inline T get_value() { return reduce_sqrt(this->accumulator_); } }; -template -class ReduceAggregatorLogSum : public ReduceAggregator { +template +class ReduceAggregatorLogSum : public ReduceAggregator { public: - inline ReduceAggregatorLogSum(int64_t N, const T&) : ReduceAggregator(N, 0) {} + inline ReduceAggregatorLogSum(int64_t N, const T&) : ReduceAggregator(N, 0) {} inline T aggall(const T* from_data) { return reduce_log(Eigen::Map>(from_data, this->N_).sum()); } inline void update(const T& v) { this->accumulator_ += v; } - inline TVAL get_value() { return reduce_log(this->accumulator_); } + inline T get_value() { return reduce_log(this->accumulator_); } }; -template -class ReduceAggregatorLogSumExp : public ReduceAggregator { +template +class ReduceAggregatorLogSumExp : public ReduceAggregator { protected: T max_; public: - inline ReduceAggregatorLogSumExp(int64_t N, const T& init) : ReduceAggregator(N, 0) { + inline ReduceAggregatorLogSumExp(int64_t N, const T& init) : ReduceAggregator(N, 0) { max_ = reduce_isinf(init) ? this->accumulator_ : init; } - inline TVAL aggall(const T* from_data) { + inline T aggall(const T* from_data) { max_ = Eigen::Map>(from_data, this->N_).maxCoeff(); for (int64_t i = 0; i < this->N_; ++i) { update(from_data[i]); @@ -594,7 +681,7 @@ class ReduceAggregatorLogSumExp : public ReduceAggregator { max_ = (reduce_isinf(v) || reduce_isnan(v) || v < max_) ? 
max_ : v; } inline void update(const T& v) { this->accumulator_ += reduce_exp(v - max_); } - inline TVAL get_value() { return reduce_log(this->accumulator_) + max_; } + inline T get_value() { return reduce_log(this->accumulator_) + max_; } }; void NoTransposePrepareForReduce(const TensorShape& new_input_shape, diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc index e7de0ecc7e..633b9becdd 100644 --- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc +++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc @@ -3165,6 +3165,96 @@ TEST(ReductionOpTest, OptimizeShapeForFastReduce_KRK) { ASSERT_EQ(fast_axes, expected_fast_axes); } +TEST(ReductionOpTest, OptimizeShapeForFastReduce_RKR) { + FastReduceKind fast_kind; + TensorShapeVector fast_shape, fast_output_shape, fast_axes; + TensorShapeVector expected_fast_shape, expected_fast_output_shape, expected_fast_axes; + + // RKR - keep_dims=1 + fast_kind = OptimizeShapeForFastReduce( + std::vector{9, 10, 11}, std::vector{0, 2}, + fast_shape, fast_output_shape, fast_axes, true); + expected_fast_shape = TensorShapeVector{9, 10, 11}; + expected_fast_output_shape = TensorShapeVector{1, 10, 1}; + expected_fast_axes = TensorShapeVector{0, 2}; + ASSERT_EQ(fast_kind, FastReduceKind::kRKR); + ASSERT_EQ(fast_shape, expected_fast_shape); + ASSERT_EQ(fast_output_shape, expected_fast_output_shape); + ASSERT_EQ(fast_axes, expected_fast_axes); + + fast_kind = OptimizeShapeForFastReduce( + std::vector{7, 9, 10, 11}, std::vector{0, 3}, + fast_shape, fast_output_shape, fast_axes, true); + expected_fast_shape = TensorShapeVector{7, 90, 11}; + expected_fast_output_shape = TensorShapeVector{1, 9, 10, 1}; + ASSERT_EQ(fast_kind, FastReduceKind::kRKR); + ASSERT_EQ(fast_shape, expected_fast_shape); + ASSERT_EQ(fast_output_shape, expected_fast_output_shape); + ASSERT_EQ(fast_axes, expected_fast_axes); + + fast_kind = 
OptimizeShapeForFastReduce( + std::vector{7, 9, 10, 11}, std::vector{0, 2, 3}, + fast_shape, fast_output_shape, fast_axes, true); + expected_fast_shape = TensorShapeVector{7, 9, 110}; + expected_fast_output_shape = TensorShapeVector{1, 9, 1, 1}; + ASSERT_EQ(fast_kind, FastReduceKind::kRKR); + ASSERT_EQ(fast_shape, expected_fast_shape); + ASSERT_EQ(fast_output_shape, expected_fast_output_shape); + ASSERT_EQ(fast_axes, expected_fast_axes); + + fast_kind = OptimizeShapeForFastReduce( + std::vector{7, 9, 10, 11}, std::vector{0, 1, 3}, + fast_shape, fast_output_shape, fast_axes, true); + expected_fast_shape = TensorShapeVector{63, 10, 11}; + expected_fast_output_shape = TensorShapeVector{1, 1, 10, 1}; + ASSERT_EQ(fast_kind, FastReduceKind::kRKR); + ASSERT_EQ(fast_shape, expected_fast_shape); + ASSERT_EQ(fast_output_shape, expected_fast_output_shape); + ASSERT_EQ(fast_axes, expected_fast_axes); + + // RKR - keep_dims=0 + fast_kind = OptimizeShapeForFastReduce( + std::vector{9, 10, 11}, std::vector{0, 2}, + fast_shape, fast_output_shape, fast_axes, false); + expected_fast_shape = TensorShapeVector{9, 10, 11}; + expected_fast_output_shape = TensorShapeVector{10}; + expected_fast_axes = TensorShapeVector{0, 2}; + ASSERT_EQ(fast_kind, FastReduceKind::kRKR); + ASSERT_EQ(fast_shape, expected_fast_shape); + ASSERT_EQ(fast_output_shape, expected_fast_output_shape); + ASSERT_EQ(fast_axes, expected_fast_axes); + + fast_kind = OptimizeShapeForFastReduce( + std::vector{7, 9, 10, 11}, std::vector{0, 3}, + fast_shape, fast_output_shape, fast_axes, false); + expected_fast_shape = TensorShapeVector{7, 90, 11}; + expected_fast_output_shape = TensorShapeVector{9, 10}; + ASSERT_EQ(fast_kind, FastReduceKind::kRKR); + ASSERT_EQ(fast_shape, expected_fast_shape); + ASSERT_EQ(fast_output_shape, expected_fast_output_shape); + ASSERT_EQ(fast_axes, expected_fast_axes); + + fast_kind = OptimizeShapeForFastReduce( + std::vector{7, 9, 10, 11}, std::vector{0, 2, 3}, + fast_shape, fast_output_shape, 
fast_axes, false); + expected_fast_shape = TensorShapeVector{7, 9, 110}; + expected_fast_output_shape = TensorShapeVector{9}; + ASSERT_EQ(fast_kind, FastReduceKind::kRKR); + ASSERT_EQ(fast_shape, expected_fast_shape); + ASSERT_EQ(fast_output_shape, expected_fast_output_shape); + ASSERT_EQ(fast_axes, expected_fast_axes); + + fast_kind = OptimizeShapeForFastReduce( + std::vector{7, 9, 10, 11}, std::vector{0, 1, 3}, + fast_shape, fast_output_shape, fast_axes, false); + expected_fast_shape = TensorShapeVector{63, 10, 11}; + expected_fast_output_shape = TensorShapeVector{10}; + ASSERT_EQ(fast_kind, FastReduceKind::kRKR); + ASSERT_EQ(fast_shape, expected_fast_shape); + ASSERT_EQ(fast_output_shape, expected_fast_output_shape); + ASSERT_EQ(fast_axes, expected_fast_axes); +} + TEST(ReductionOpTest, OptimizeShapeForFastReduce_NONE) { FastReduceKind fast_kind; TensorShapeVector fast_shape, fast_output_shape, fast_axes; @@ -3427,6 +3517,53 @@ TEST(ReductionOpTest, ReduceMax_KRK_keepdims) { test.Run(); } +TEST(ReductionOpTest, ReduceMax_RKR) { + OpTester test("ReduceMax"); + test.AddAttribute("axes", std::vector{0, 2}); + test.AddAttribute("keepdims", (int64_t)0); + test.AddInput("data", {3, 2, 2}, + {1.0f, 2.0f, + 3.0f, 4.0f, + + 5.0f, 6.0f, + 7.0f, 8.0f, + + 9.0f, 10.0f, + 11.0f, 12.0f}); + test.AddOutput("reduced", {2}, {10.f, 12.f}); + test.Run(); +} + +TEST(ReductionOpTest, ReduceMax_RKR_parallel) { + OpTester test("ReduceMax"); + test.AddAttribute("axes", std::vector{0, 2}); + test.AddAttribute("keepdims", (int64_t)0); + test.AddInput("data", {2, 16, 2}, + {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, + 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f, 31.0f, 32.0f, + 33.0f, 34.0f, 35.0f, 36.0f, 37.0f, 38.0f, 39.0f, 40.0f, 41.0f, 42.0f, 43.0f, 44.0f, 45.0f, 46.0f, 47.0f, 48.0f, + 49.0f, 50.0f, 51.0f, 52.0f, 53.0f, 54.0f, 55.0f, 56.0f, 57.0f, 58.0f, 59.0f, 60.0f, 
61.0f, 62.0f, 63.0f}); + test.AddOutput("reduced", {16}, {33.0f, 35.0f, 37.0f, 39.0f, 41.0f, 43.0f, 45.0f, 47.0f, 49.0f, 51.0f, 53.0f, 55.0f, 57.0f, 59.0f, 61.0f, 63.0f}); + test.Run(); +} + +TEST(ReductionOpTest, ReduceMax_RKR_keepdims) { + OpTester test("ReduceMax"); + test.AddAttribute("axes", std::vector{0, 2}); + test.AddAttribute("keepdims", (int64_t)1); + test.AddInput("data", {3, 2, 2}, + {1.0f, 2.0f, + 3.0f, 4.0f, + + 5.0f, 6.0f, + 7.0f, 8.0f, + + 9.0f, 10.0f, + 11.0f, 12.0f}); + test.AddOutput("reduced", {1, 2, 1}, {10.f, 12.f}); + test.Run(); +} + TEST(ReductionOpTest, ReduceMax_RKRK) { OpTester test("ReduceMax"); test.AddAttribute("axes", std::vector{0, 2}); @@ -3581,6 +3718,53 @@ TEST(ReductionOpTest, ReduceMean_KRK_keepdims) { test.Run(); } +TEST(ReductionOpTest, ReduceMean_RKR) { + OpTester test("ReduceMean"); + test.AddAttribute("axes", std::vector{0, 2}); + test.AddAttribute("keepdims", (int64_t)0); + test.AddInput("data", {3, 2, 2}, + {1.0f, 2.0f, + 3.0f, 4.0f, + + 5.0f, 6.0f, + 7.0f, 8.0f, + + 9.0f, 10.0f, + 11.0f, 12.0f}); + test.AddOutput("reduced", {2}, {5.5f, 7.5f}); + test.Run(); +} + +TEST(ReductionOpTest, ReduceMean_RKR_parallel) { + OpTester test("ReduceMean"); + test.AddAttribute("axes", std::vector{0, 2}); + test.AddAttribute("keepdims", (int64_t)0); + test.AddInput("data", {2, 16, 2}, + {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, + 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f, 31.0f, 32.0f, + 33.0f, 34.0f, 35.0f, 36.0f, 37.0f, 38.0f, 39.0f, 40.0f, 41.0f, 42.0f, 43.0f, 44.0f, 45.0f, 46.0f, 47.0f, 48.0f, + 49.0f, 50.0f, 51.0f, 52.0f, 53.0f, 54.0f, 55.0f, 56.0f, 57.0f, 58.0f, 59.0f, 60.0f, 61.0f, 62.0f, 63.0f}); + test.AddOutput("reduced", {16}, {16.5f, 18.5f, 20.5f, 22.5f, 24.5f, 26.5f, 28.5f, 30.5f, 32.5f, 34.5f, 36.5f, 38.5f, 40.5f, 42.5f, 44.5f, 46.5f}); + test.Run(); +} + +TEST(ReductionOpTest, 
ReduceMean_RKR_keepdims) { + OpTester test("ReduceMean"); + test.AddAttribute("axes", std::vector{0, 2}); + test.AddAttribute("keepdims", (int64_t)1); + test.AddInput("data", {3, 2, 2}, + {1.0f, 2.0f, + 3.0f, 4.0f, + + 5.0f, 6.0f, + 7.0f, 8.0f, + + 9.0f, 10.0f, + 11.0f, 12.0f}); + test.AddOutput("reduced", {1, 2, 1}, {5.5f, 7.5f}); + test.Run(); +} + TEST(ReductionOpTest, ReduceMean_RKRK) { OpTester test("ReduceMean"); test.AddAttribute("axes", std::vector{0, 2}); @@ -3771,6 +3955,53 @@ TEST(ReductionOpTest, ReduceMin_KRK_keepdims) { test.Run(); } +TEST(ReductionOpTest, ReduceMin_RKR) { + OpTester test("ReduceMin"); + test.AddAttribute("axes", std::vector{0, 2}); + test.AddAttribute("keepdims", (int64_t)0); + test.AddInput("data", {3, 2, 2}, + {11.0f, 12.0f, + 13.0f, 14.0f, + + 15.0f, 16.0f, + 17.0f, 18.0f, + + 19.0f, 20.0f, + 21.0f, 22.0f}); + test.AddOutput("reduced", {2}, {11.f, 13.f}); + test.Run(); +} + +TEST(ReductionOpTest, ReduceMin_RKR_parallel) { + OpTester test("ReduceMin"); + test.AddAttribute("axes", std::vector{0, 2}); + test.AddAttribute("keepdims", (int64_t)0); + test.AddInput("data", {2, 16, 2}, + {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, + 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f, 31.0f, 32.0f, + 33.0f, 34.0f, 35.0f, 36.0f, 37.0f, 38.0f, 39.0f, 40.0f, 41.0f, 42.0f, 43.0f, 44.0f, 45.0f, 46.0f, 47.0f, 48.0f, + 49.0f, 50.0f, 51.0f, 52.0f, 53.0f, 54.0f, 55.0f, 56.0f, 57.0f, 58.0f, 59.0f, 60.0f, 61.0f, 62.0f, 63.0f}); + test.AddOutput("reduced", {16}, {0.0f, 2.0f, 4.0f, 6.0f, 8.0f, 10.0f, 12.0f, 14.0f, 16.0f, 18.0f, 20.0f, 22.0f, 24.0f, 26.0f, 28.0f, 30.0f}); + test.Run(); +} + +TEST(ReductionOpTest, ReduceMin_RKR_keepdims) { + OpTester test("ReduceMin"); + test.AddAttribute("axes", std::vector{0, 2}); + test.AddAttribute("keepdims", (int64_t)1); + test.AddInput("data", {3, 2, 2}, + {11.0f, 12.0f, + 13.0f, 14.0f, + + 15.0f, 16.0f, 
+ 17.0f, 18.0f, + + 19.0f, 20.0f, + 21.0f, 22.0f}); + test.AddOutput("reduced", {1, 2, 1}, {11.f, 13.f}); + test.Run(); +} + TEST(ReductionOpTest, ReduceMin_RKRK) { OpTester test("ReduceMin"); test.AddAttribute("axes", std::vector{0, 2}); @@ -4102,6 +4333,126 @@ TEST(ReductionOpTest, ReduceSum_KRK2_keepdims) { test.Run(); } +TEST(ReductionOpTest, ReduceSum_RKR) { + OpTester test("ReduceSum"); + test.AddAttribute("axes", std::vector{0, 2}); + test.AddAttribute("keepdims", (int64_t)0); + test.AddInput("data", {3, 2, 2}, + {1.0f, 2.0f, + 3.0f, 4.0f, + + 5.0f, 6.0f, + 7.0f, 8.0f, + + 9.0f, 10.0f, + 11.0f, 12.0f}); + test.AddOutput("reduced", {2}, {33.f, 45.f}); + test.Run(); +} + +TEST(ReductionOpTest, ReduceSum_RKR_parallel) { + OpTester test("ReduceSum"); + test.AddAttribute("axes", std::vector{0, 2}); + test.AddAttribute("keepdims", (int64_t)0); + test.AddInput("data", {2, 16, 2}, + {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, + 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f, 31.0f, 32.0f, + 33.0f, 34.0f, 35.0f, 36.0f, 37.0f, 38.0f, 39.0f, 40.0f, 41.0f, 42.0f, 43.0f, 44.0f, 45.0f, 46.0f, 47.0f, 48.0f, + 49.0f, 50.0f, 51.0f, 52.0f, 53.0f, 54.0f, 55.0f, 56.0f, 57.0f, 58.0f, 59.0f, 60.0f, 61.0f, 62.0f, 63.0f}); + test.AddOutput("reduced", {16}, {66.0f, 74.0f, 82.0f, 90.0f, 98.0f, 106.0f, 114.0f, 122.0f, 130.0f, 138.0f, 146.0f, 154.0f, 162.0f, 170.0f, 178.0f, 186.0f}); + test.Run(); +} + +TEST(ReductionOpTest, ReduceSum_RKR_parallel_bigger) { + OpTester test("ReduceSum"); + test.AddAttribute("axes", std::vector{0, 2}); + test.AddAttribute("keepdims", (int64_t)0); + std::vector in_data(512); + for (size_t i = 0; i < in_data.size(); ++i) + in_data[i] = (float)i; + test.AddInput("data", {2, 128, 2}, in_data); + std::vector expected(128); + for (size_t j = 0; j < 128; ++j) { + expected[j] = 0; + for (size_t i = 0; i < 2; ++i) { + for (size_t k = 0; k < 2; ++k) 
{ + expected[j] += in_data[i * 256 + j * 2 + k]; + } + } + } + test.AddOutput("reduced", {128}, expected); + test.Run(); +} + +TEST(ReductionOpTest, ReduceSum_RKR_keepdims) { + OpTester test("ReduceSum"); + test.AddAttribute("axes", std::vector{0, 2}); + test.AddAttribute("keepdims", (int64_t)1); + test.AddInput("data", {3, 2, 2}, + {1.0f, 2.0f, + 3.0f, 4.0f, + + 5.0f, 6.0f, + 7.0f, 8.0f, + + 9.0f, 10.0f, + 11.0f, 12.0f}); + test.AddOutput("reduced", {1, 2, 1}, {33.f, 45.f}); + test.Run(); +} + +TEST(ReductionOpTest, ReduceSum_RKR2) { + OpTester test("ReduceSum"); + test.AddAttribute("axes", std::vector{0, 3}); + test.AddAttribute("keepdims", (int64_t)0); + test.AddInput("data", {3, 2, 2, 2}, + {1.0f, 2.0f, + 3.0f, 4.0f, + + 5.0f, 6.0f, + 7.0f, 8.0f, + + 9.0f, 10.0f, + 11.0f, 12.0f, + + 13.0f, 14.0f, + 15.0f, 16.0f, + + 17.0f, 18.0f, + 19.0f, 20.0f, + + 21.0f, 22.0f, + 23.0f, 24.0f}); + test.AddOutput("reduced", {2, 2}, {57.0f, 69.0f, 81.0f, 93.0f}); + test.Run(); +} + +TEST(ReductionOpTest, ReduceSum_RKR2_keepdims) { + OpTester test("ReduceSum"); + test.AddAttribute("axes", std::vector{0, 3}); + test.AddAttribute("keepdims", (int64_t)1); + test.AddInput("data", {3, 2, 2, 2}, + {1.0f, 2.0f, + 3.0f, 4.0f, + + 5.0f, 6.0f, + 7.0f, 8.0f, + + 9.0f, 10.0f, + 11.0f, 12.0f, + + 13.0f, 14.0f, + 15.0f, 16.0f, + + 17.0f, 18.0f, + 19.0f, 20.0f, + + 21.0f, 22.0f, + 23.0f, 24.0f}); + test.AddOutput("reduced", {1, 2, 2, 1}, {57.0f, 69.0f, 81.0f, 93.0f}); + test.Run(); +} + TEST(ReductionOpTest, ReduceSum_RKRK) { OpTester test("ReduceSum"); test.AddAttribute("axes", std::vector{0, 2});