Optimize ReduceSum, ReduceMean, ReduceMin, ReduceMax (#10280)

* Optimize ReduceSum, ReduceMean, ReduceMin, ReduceMax
* improve reducemax, reducemin
* faster, smaller
* replace std::vector by gsl::span for shapes
* fix merging issues
This commit is contained in:
Xavier Dupré 2022-02-18 12:51:01 +01:00 committed by GitHub
parent df841ee87d
commit 6f0640a57f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 520 additions and 56 deletions

View file

@ -259,10 +259,15 @@ static void ValidateFastReduceRK(const gsl::span<const int64_t>& fast_shape, con
}
// Validates the arguments of a KRK (kept dim, reduced dim, kept dim) fast reduction:
// `fast_shape` must have exactly three entries and `output` must hold
// fast_shape[0] * fast_shape[2] elements. Both checks go through ORT_ENFORCE.
static void ValidateFastReduceKRK(const gsl::span<const int64_t>& fast_shape, const Tensor& output) {
  ORT_ENFORCE(fast_shape.size() == 3, "Only works on matrices with three dimensions.");
  ORT_ENFORCE(fast_shape[0] * fast_shape[2] == output.Shape().Size(), "Output size mismatch.");
}
// Validates the arguments of an RKR (reduced dim, kept dim, reduced dim) fast reduction:
// `fast_shape` must have exactly three entries and `output` must hold fast_shape[1]
// elements, i.e. one value per index of the middle dimension. Both checks go through
// ORT_ENFORCE.
static void ValidateFastReduceRKR(const gsl::span<const int64_t>& fast_shape, const Tensor& output) {
ORT_ENFORCE(fast_shape.size() == 3, "Only works on matrices with three dimensions.");
ORT_ENFORCE(fast_shape[1] == output.Shape().Size(), "Output size mismatch.");
}
// Base-class placeholder for the KR fast path: ignores all of its arguments and only
// calls ValidateMustBeOverloaded().
// NOTE(review): presumably that helper reports an error, so an aggregator advertising
// kKR has to provide its own FastReduceKR -- confirm against its definition.
void ReduceAggregatorBase::FastReduceKR(const Tensor&, const gsl::span<const int64_t>&, Tensor&, concurrency::ThreadPool*) {
ValidateMustBeOverloaded();
}
@ -272,6 +277,9 @@ void ReduceAggregatorBase::FastReduceRK(const Tensor&, const gsl::span<const int
// Base-class placeholder for the KRK fast path: ignores all of its arguments and only
// calls ValidateMustBeOverloaded().
// NOTE(review): presumably that helper reports an error, so an aggregator advertising
// kKRK has to provide its own FastReduceKRK -- confirm against its definition.
void ReduceAggregatorBase::FastReduceKRK(const Tensor&, const gsl::span<const int64_t>&, Tensor&, concurrency::ThreadPool*) {
ValidateMustBeOverloaded();
}
// Base-class placeholder for the RKR fast path: ignores all of its arguments and only
// calls ValidateMustBeOverloaded().
// NOTE(review): presumably that helper reports an error, so an aggregator advertising
// kRKR has to provide its own FastReduceRKR -- confirm against its definition.
void ReduceAggregatorBase::FastReduceRKR(const Tensor&, const gsl::span<const int64_t>&, Tensor&, concurrency::ThreadPool*) {
ValidateMustBeOverloaded();
}
void NoTransposePrepareForReduce(const TensorShape& new_input_shape,
gsl::span<const int64_t> reduced_axes,
@ -624,8 +632,8 @@ FastReduceKind OptimizeShapeForFastReduce(gsl::span<const int64_t> input_shape,
if (fast_shape.size() == 2) {
return reduce[0] ? FastReduceKind::kRK : FastReduceKind::kKR;
}
if (fast_shape.size() == 3 && !reduce[0]) {
return FastReduceKind::kKRK;
if (fast_shape.size() == 3) {
return reduce[0] ? FastReduceKind::kRKR : FastReduceKind::kKRK;
}
return FastReduceKind::kNone;
}
@ -671,7 +679,8 @@ bool CommonFastReduceSwitch(OpKernelContext* ctx,
FastReduceKind which_fast_reduce,
fast_reduce_fct* case_kr,
fast_reduce_fct* case_rk,
fast_reduce_fct* case_krk) {
fast_reduce_fct* case_krk,
fast_reduce_fct* case_rkr) {
TensorShapeVector axes;
const Tensor* input = ctx->Input<Tensor>(0);
auto reduced_dims = input->Shape().GetDims();
@ -715,6 +724,14 @@ bool CommonFastReduceSwitch(OpKernelContext* ctx,
} else {
break;
}
case FastReduceKind::kRKR:
ValidateFastReduceRKR(fast_shape, *output);
if (fast_shape[1] >= std::max(2, concurrency::ThreadPool::DegreeOfParallelism(ctx->GetOperatorThreadPool()))) {
case_rkr(*input, fast_shape, *output, ctx->GetOperatorThreadPool());
return true;
} else {
break;
}
case FastReduceKind::kR:
case FastReduceKind::kK:
case FastReduceKind::kNone:
@ -738,7 +755,8 @@ bool CommonFastReduce(OpKernelContext* ctx,
TensorShapeVector& fast_axes) {
return CommonFastReduceSwitch(ctx, axes_, keepdims_, noop_with_empty_axes,
fast_kind, fast_shape, output_shape, fast_axes,
AGG::WhichFastReduce(), &AGG::FastReduceKR, &AGG::FastReduceRK, &AGG::FastReduceKRK);
AGG::WhichFastReduce(), &AGG::FastReduceKR, &AGG::FastReduceRK,
&AGG::FastReduceKRK, &AGG::FastReduceRKR);
}
static void ValidateKeepDims(const TensorShape& shape, int64_t keepdims) {
@ -925,6 +943,14 @@ std::unique_ptr<Tensor> ReduceSum<T>::Impl(const Tensor& input, gsl::span<const
} else {
break;
}
case FastReduceKind::kRKR:
ValidateFastReduceRKR(fast_shape, *output);
if (fast_shape[0] >= std::max(2, concurrency::ThreadPool::DegreeOfParallelism(tp))) {
ReduceAggregatorSum<T>::FastReduceRKR(input, fast_shape, *output, tp);
return output;
} else {
break;
}
case FastReduceKind::kR:
case FastReduceKind::kK:
case FastReduceKind::kNone:

View file

@ -25,7 +25,8 @@ enum FastReduceKind {
kKR = 4, // kept dim, reduced dim
kRK = 8, // reduced dim, kept dim
kKRK = 16, // kept dim, reduced dim, kept dim
kEmpty = 32 // empty reduce
kRKR = 32, // reduced dim, kept dim, reduced dim
kEmpty = 64 // empty reduce
};
FastReduceKind operator|(FastReduceKind a, FastReduceKind b);
@ -54,6 +55,7 @@ constexpr TensorOpCost ParallelReduceFastCost(int64_t n_row, int64_t n_col, int6
* KR - reduction on the last dimensions
* RK - reduction on the first dimensions
* KRK - reduction on the middle dimensions.
* RKR - reduction on all dimensions but the middle ones
For these four configurations, the reduction may be optimized
with vector operations. Method WhichFastReduce() returns which case
@ -154,6 +156,7 @@ class ReduceAggregatorBase {
static void FastReduceKR(const Tensor&, const gsl::span<const int64_t>&, Tensor&, concurrency::ThreadPool*);
static void FastReduceRK(const Tensor&, const gsl::span<const int64_t>&, Tensor&, concurrency::ThreadPool*);
static void FastReduceKRK(const Tensor&, const gsl::span<const int64_t>&, Tensor&, concurrency::ThreadPool*);
static void FastReduceRKR(const Tensor&, const gsl::span<const int64_t>&, Tensor&, concurrency::ThreadPool*);
};
template <typename T, typename TVAL = T>
@ -175,20 +178,48 @@ class ReduceAggregator : public ReduceAggregatorBase {
inline void update0(const T&) {}
inline TVAL aggall(const T*) {}
inline TVAL get_value() { return accumulator_; }
protected:
// Shared driver for RKR reductions. `fast_shape` is read as (d0, d1, d2) and the input
// buffer is indexed as a dense row-major d0 x d1 x d2 array; one value is written to
// `output` for every index k of the middle dimension.
//
// For each k, the accumulator is created by `f_init` from the first run (outer index 0),
// then `f_update(acc, run, d2)` is called once per outer index, where `run` points at
// the d2 contiguous values sharing that outer index and k. Note that the first run is
// therefore seen by both `f_init` and `f_update`.
//
// The range [0, d1) is handed to ThreadPool::TryParallelFor, so different k may be
// processed by different invocations of the worker lambda.
static void CommonFastReduceRKR(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
                                Tensor& output, concurrency::ThreadPool* tp,
                                std::function<TVAL(const T*)> f_init,
                                std::function<void(TVAL&, const T*, int64_t)> f_update) {
  const T* src = input.Data<T>();
  TVAL* dst = output.MutableData<TVAL>();
  const int64_t n_outer = fast_shape[0];
  const int64_t n_inner = fast_shape[2];
  const int64_t n_kept = fast_shape[1];
  // Distance, in elements, between two consecutive outer slices.
  const int64_t outer_stride = n_inner * n_kept;
  concurrency::ThreadPool::TryParallelFor(
      tp, n_kept, ParallelReduceFastCost(n_kept, n_outer * n_inner, sizeof(T), 6),
      [src, dst, n_outer, n_inner, outer_stride, f_init, f_update](ptrdiff_t begin, ptrdiff_t last) {
        for (ptrdiff_t k = begin; k < last; ++k) {
          const T* first_run = src + k * n_inner;
          dst[k] = f_init(first_run);
          for (int64_t outer = 0; outer < n_outer; ++outer) {
            f_update(dst[k], first_run + outer * outer_stride, n_inner);
          }
        }
      });
}
};
template <typename T, typename TVAL = T>
class ReduceAggregatorSum : public ReduceAggregator<T, TVAL> {
template <typename T>
class ReduceAggregatorSum : public ReduceAggregator<T, T> {
public:
inline ReduceAggregatorSum(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 0) {}
inline ReduceAggregatorSum(int64_t N, const T&) : ReduceAggregator<T, T>(N, 0) {}
inline void update(const T& v) { this->accumulator_ += v; }
inline TVAL aggall(const T* from_data) {
return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).sum();
static T aggall(const T* from_data, int64_t size) {
return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, size).sum();
}
inline T aggall(const T* from_data) {
return aggall(from_data, this->N_);
}
// Fast reduction
// Returns the OR-ed FastReduceKind flags of the layouts this aggregator has a
// dedicated FastReduceXXX implementation for. kRKR is part of the set.
static inline FastReduceKind WhichFastReduce() {
  return FastReduceKind::kKR | FastReduceKind::kRK | FastReduceKind::kKRK | FastReduceKind::kRKR;
}
static void FastReduceKR(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
@ -200,7 +231,7 @@ class ReduceAggregatorSum : public ReduceAggregator<T, TVAL> {
tp, fast_shape[0], ParallelReduceFastCost(1, stridei, sizeof(T), 6),
[data, stridei, out](ptrdiff_t first, ptrdiff_t last) {
for (ptrdiff_t d = first; d < last; ++d) {
out[d] = ConstEigenVectorArrayMap<T>(data + d * stridei, stridei).sum();
out[d] = aggall(data + d * stridei, stridei);
}
});
}
@ -239,6 +270,16 @@ class ReduceAggregatorSum : public ReduceAggregator<T, TVAL> {
}
});
}
// RKR fast path for sums: for every index of the middle dimension of `fast_shape`,
// accumulates the input over the first and the last dimension. The traversal itself is
// delegated to CommonFastReduceRKR.
static void FastReduceRKR(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
                          Tensor& output, concurrency::ThreadPool* tp) {
  // Each accumulator starts at zero; the pointed-to data is not needed for that.
  const auto start_at_zero = [](const T*) -> T { return 0; };
  // Adds the sum of one contiguous run of `count` values to the accumulator.
  const auto add_run = [](T& acc, const T* run, int64_t count) {
    acc += aggall(run, count);
  };
  ReduceAggregator<T, T>::CommonFastReduceRKR(input, fast_shape, output, tp, start_at_zero, add_run);
}
};
template <typename T, typename TVAL = T>
@ -251,12 +292,15 @@ class ReduceAggregatorSumSquare : public ReduceAggregator<T, TVAL> {
inline void update(const T& v) { this->accumulator_ += v * v; }
};
template <typename T, typename TVAL = T>
class ReduceAggregatorMean : public ReduceAggregatorSum<T, TVAL> {
template <typename T>
class ReduceAggregatorMean : public ReduceAggregatorSum<T> {
public:
inline ReduceAggregatorMean(int64_t N, const T&) : ReduceAggregatorSum<T, TVAL>(N, 0) {}
inline ReduceAggregatorMean(int64_t N, const T&) : ReduceAggregatorSum<T>(N, 0) {}
static T aggall(const T* from_data, int64_t size) {
return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, size).mean();
}
inline T aggall(const T* from_data) {
return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).mean();
return aggall(from_data, this->N_);
}
inline T get_value() { return this->accumulator_ / static_cast<T>(this->N_); }
@ -265,7 +309,7 @@ class ReduceAggregatorMean : public ReduceAggregatorSum<T, TVAL> {
static void FastReduceKR(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
Tensor& output, concurrency::ThreadPool* tp) {
ReduceAggregatorSum<T, TVAL>::FastReduceKR(input, fast_shape, output, tp);
ReduceAggregatorSum<T>::FastReduceKR(input, fast_shape, output, tp);
// TODO: use MLAS or BLAS
T* out = output.MutableData<T>();
T* end = out + fast_shape[0];
@ -276,7 +320,7 @@ class ReduceAggregatorMean : public ReduceAggregatorSum<T, TVAL> {
static void FastReduceRK(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
Tensor& output, concurrency::ThreadPool* tp) {
ReduceAggregatorSum<T, TVAL>::FastReduceRK(input, fast_shape, output, tp);
ReduceAggregatorSum<T>::FastReduceRK(input, fast_shape, output, tp);
// TODO: use MLAS or BLAS
T* out = output.MutableData<T>();
T* end = out + fast_shape[1];
@ -287,7 +331,7 @@ class ReduceAggregatorMean : public ReduceAggregatorSum<T, TVAL> {
static void FastReduceKRK(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
Tensor& output, concurrency::ThreadPool* tp) {
ReduceAggregatorSum<T, TVAL>::FastReduceKRK(input, fast_shape, output, tp);
ReduceAggregatorSum<T>::FastReduceKRK(input, fast_shape, output, tp);
int64_t strideo = fast_shape[2];
T* out = output.MutableData<T>();
T* begin;
@ -301,20 +345,34 @@ class ReduceAggregatorMean : public ReduceAggregatorSum<T, TVAL> {
}
}
}
// RKR fast path for means: computes the sums with ReduceAggregatorSum<T>::FastReduceRKR
// and then divides each of the fast_shape[1] results in place by the number of reduced
// elements, fast_shape[0] * fast_shape[2].
static void FastReduceRKR(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
                          Tensor& output, concurrency::ThreadPool* tp) {
  ReduceAggregatorSum<T>::FastReduceRKR(input, fast_shape, output, tp);
  T* means = output.MutableData<T>();
  // Count of input values folded into every output value.
  const T n_reduced = static_cast<T>(fast_shape[0] * fast_shape[2]);
  const int64_t n_kept = fast_shape[1];
  for (int64_t k = 0; k < n_kept; ++k) {
    means[k] /= n_reduced;
  }
}
};
template <typename T, typename TVAL = T>
class ReduceAggregatorMax : public ReduceAggregator<T, TVAL> {
template <typename T>
class ReduceAggregatorMax : public ReduceAggregator<T> {
public:
inline ReduceAggregatorMax(int64_t N, const T& init) : ReduceAggregator<T, TVAL>(N, init) {}
inline TVAL aggall(const T* from_data) {
return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).maxCoeff();
inline ReduceAggregatorMax(int64_t N, const T& init) : ReduceAggregator<T, T>(N, init) {}
static T aggall(const T* from_data, int64_t size) {
return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, size).maxCoeff();
}
inline T aggall(const T* from_data) {
return aggall(from_data, this->N_);
}
inline void update(const T& v) { this->accumulator_ = v > this->accumulator_ ? v : this->accumulator_; }
// Fast reduction
// Returns the OR-ed FastReduceKind flags of the layouts this aggregator has a
// dedicated FastReduceXXX implementation for. kRKR is part of the set.
static inline FastReduceKind WhichFastReduce() {
  return FastReduceKind::kKR | FastReduceKind::kRK | FastReduceKind::kKRK | FastReduceKind::kRKR;
}
static void FastReduceKR(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
@ -347,7 +405,8 @@ class ReduceAggregatorMax : public ReduceAggregator<T, TVAL> {
for (int64_t row = 1; row < n_rows; ++row) {
p = data + row * N;
for (int64_t j = begin; j < end; ++j) {
out[j] = out[j] > p[j] ? out[j] : p[j];
if (out[j] < p[j])
out[j] = p[j];
}
}
});
@ -371,6 +430,18 @@ class ReduceAggregatorMax : public ReduceAggregator<T, TVAL> {
}
});
}
// RKR fast path for maxima: for every index of the middle dimension of `fast_shape`,
// keeps the largest input value found over the first and the last dimension. The
// traversal itself is delegated to CommonFastReduceRKR.
static void FastReduceRKR(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
                          Tensor& output, concurrency::ThreadPool* tp) {
  // The accumulator is seeded with the first value of the first run.
  const auto seed_with_first = [](const T* run) -> T { return run[0]; };
  // Replaces the accumulator when the maximum of the run is strictly greater.
  const auto keep_larger = [](T& best, const T* run, int64_t count) {
    const T candidate = aggall(run, count);
    if (candidate > best) {
      best = candidate;
    }
  };
  ReduceAggregator<T, T>::CommonFastReduceRKR(input, fast_shape, output, tp, seed_with_first, keep_larger);
}
};
template <typename T, typename TVAL = int64_t>
@ -462,18 +533,21 @@ class ReduceAggregatorArgMinLastIndex : public ReduceAggregatorArgMin<T, TVAL> {
}
};
template <typename T, typename TVAL = T>
class ReduceAggregatorMin : public ReduceAggregator<T, TVAL> {
template <typename T>
class ReduceAggregatorMin : public ReduceAggregator<T, T> {
public:
inline ReduceAggregatorMin(int64_t N, const T& init) : ReduceAggregator<T, TVAL>(N, init) {}
inline TVAL aggall(const T* from_data) {
return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).minCoeff();
inline ReduceAggregatorMin(int64_t N, const T& init) : ReduceAggregator<T, T>(N, init) {}
static T aggall(const T* from_data, int64_t size) {
return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, size).minCoeff();
}
inline T aggall(const T* from_data) {
return aggall(from_data, this->N_);
}
inline void update(const T& v) { this->accumulator_ = v < this->accumulator_ ? v : this->accumulator_; }
// Fast reduction
// Returns the OR-ed FastReduceKind flags of the layouts this aggregator has a
// dedicated FastReduceXXX implementation for. kRKR is part of the set.
static inline FastReduceKind WhichFastReduce() {
  return FastReduceKind::kKR | FastReduceKind::kRK | FastReduceKind::kKRK | FastReduceKind::kRKR;
}
static void FastReduceKR(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
@ -506,7 +580,8 @@ class ReduceAggregatorMin : public ReduceAggregator<T, TVAL> {
for (int64_t row = 1; row < n_rows; ++row) {
p = data + row * N;
for (int64_t j = begin; j < end; ++j) {
out[j] = out[j] < p[j] ? out[j] : p[j];
if (out[j] > p[j])
out[j] = p[j];
}
}
});
@ -530,60 +605,72 @@ class ReduceAggregatorMin : public ReduceAggregator<T, TVAL> {
}
});
}
// RKR fast path for minima: for every index of the middle dimension of `fast_shape`,
// keeps the smallest input value found over the first and the last dimension. The
// traversal itself is delegated to CommonFastReduceRKR.
static void FastReduceRKR(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
                          Tensor& output, concurrency::ThreadPool* tp) {
  // The accumulator is seeded with the first value of the first run.
  const auto seed_with_first = [](const T* run) -> T { return run[0]; };
  // Replaces the accumulator when the minimum of the run is strictly smaller.
  const auto keep_smaller = [](T& best, const T* run, int64_t count) {
    const T candidate = aggall(run, count);
    if (candidate < best) {
      best = candidate;
    }
  };
  ReduceAggregator<T, T>::CommonFastReduceRKR(input, fast_shape, output, tp, seed_with_first, keep_smaller);
}
};
template <typename T, typename TVAL = T>
class ReduceAggregatorProd : public ReduceAggregator<T, TVAL> {
template <typename T>
class ReduceAggregatorProd : public ReduceAggregator<T, T> {
public:
inline ReduceAggregatorProd(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 1) {}
inline TVAL aggall(const T* from_data) {
inline ReduceAggregatorProd(int64_t N, const T&) : ReduceAggregator<T, T>(N, 1) {}
inline T aggall(const T* from_data) {
return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).prod();
}
inline void update(const T& v) { this->accumulator_ *= v; }
};
template <typename T, typename TVAL = T>
class ReduceAggregatorL1 : public ReduceAggregator<T, TVAL> {
template <typename T>
class ReduceAggregatorL1 : public ReduceAggregator<T, T> {
public:
inline ReduceAggregatorL1(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 0) {}
inline TVAL aggall(const T* from_data) {
inline ReduceAggregatorL1(int64_t N, const T&) : ReduceAggregator<T, T>(N, 0) {}
inline T aggall(const T* from_data) {
return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).cwiseAbs().sum();
}
inline void update(const T& v) { this->accumulator_ += v > 0 ? v : -v; }
};
template <typename T, typename TVAL = T>
class ReduceAggregatorL2 : public ReduceAggregator<T, TVAL> {
template <typename T>
class ReduceAggregatorL2 : public ReduceAggregator<T, T> {
public:
inline ReduceAggregatorL2(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 0) {}
inline TVAL aggall(const T* from_data) {
inline ReduceAggregatorL2(int64_t N, const T&) : ReduceAggregator<T, T>(N, 0) {}
inline T aggall(const T* from_data) {
return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).norm();
}
inline void update(const T& v) { this->accumulator_ += v * v; }
inline TVAL get_value() { return reduce_sqrt<T>(this->accumulator_); }
inline T get_value() { return reduce_sqrt<T>(this->accumulator_); }
};
template <typename T, typename TVAL = T>
class ReduceAggregatorLogSum : public ReduceAggregator<T, TVAL> {
template <typename T>
class ReduceAggregatorLogSum : public ReduceAggregator<T, T> {
public:
inline ReduceAggregatorLogSum(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 0) {}
inline ReduceAggregatorLogSum(int64_t N, const T&) : ReduceAggregator<T, T>(N, 0) {}
inline T aggall(const T* from_data) {
return reduce_log<T>(Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).sum());
}
inline void update(const T& v) { this->accumulator_ += v; }
inline TVAL get_value() { return reduce_log<T>(this->accumulator_); }
inline T get_value() { return reduce_log<T>(this->accumulator_); }
};
template <typename T, typename TVAL = T>
class ReduceAggregatorLogSumExp : public ReduceAggregator<T, TVAL> {
template <typename T>
class ReduceAggregatorLogSumExp : public ReduceAggregator<T, T> {
protected:
T max_;
public:
inline ReduceAggregatorLogSumExp(int64_t N, const T& init) : ReduceAggregator<T, TVAL>(N, 0) {
inline ReduceAggregatorLogSumExp(int64_t N, const T& init) : ReduceAggregator<T, T>(N, 0) {
max_ = reduce_isinf(init) ? this->accumulator_ : init;
}
inline TVAL aggall(const T* from_data) {
inline T aggall(const T* from_data) {
max_ = Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).maxCoeff();
for (int64_t i = 0; i < this->N_; ++i) {
update(from_data[i]);
@ -594,7 +681,7 @@ class ReduceAggregatorLogSumExp : public ReduceAggregator<T, TVAL> {
max_ = (reduce_isinf(v) || reduce_isnan(v) || v < max_) ? max_ : v;
}
inline void update(const T& v) { this->accumulator_ += reduce_exp(v - max_); }
inline TVAL get_value() { return reduce_log<T>(this->accumulator_) + max_; }
inline T get_value() { return reduce_log<T>(this->accumulator_) + max_; }
};
void NoTransposePrepareForReduce(const TensorShape& new_input_shape,

View file

@ -3165,6 +3165,96 @@ TEST(ReductionOpTest, OptimizeShapeForFastReduce_KRK) {
ASSERT_EQ(fast_axes, expected_fast_axes);
}
// Checks that OptimizeShapeForFastReduce classifies reductions over the first and last
// axes (with the middle axes kept) as kRKR, and that it reports the expected merged
// 3-entry fast shape, output shape and fast axes. The same four input/axes combinations
// are run once with keep_dims=true and once with keep_dims=false.
TEST(ReductionOpTest, OptimizeShapeForFastReduce_RKR) {
FastReduceKind fast_kind;
TensorShapeVector fast_shape, fast_output_shape, fast_axes;
TensorShapeVector expected_fast_shape, expected_fast_output_shape, expected_fast_axes;
// RKR - keep_dims=1
fast_kind = OptimizeShapeForFastReduce(
std::vector<int64_t>{9, 10, 11}, std::vector<int64_t>{0, 2},
fast_shape, fast_output_shape, fast_axes, true);
expected_fast_shape = TensorShapeVector{9, 10, 11};
expected_fast_output_shape = TensorShapeVector{1, 10, 1};
expected_fast_axes = TensorShapeVector{0, 2};
ASSERT_EQ(fast_kind, FastReduceKind::kRKR);
ASSERT_EQ(fast_shape, expected_fast_shape);
ASSERT_EQ(fast_output_shape, expected_fast_output_shape);
ASSERT_EQ(fast_axes, expected_fast_axes);
// Two kept middle axes (9, 10) are expected to merge into a single kept dim of 90.
fast_kind = OptimizeShapeForFastReduce(
std::vector<int64_t>{7, 9, 10, 11}, std::vector<int64_t>{0, 3},
fast_shape, fast_output_shape, fast_axes, true);
expected_fast_shape = TensorShapeVector{7, 90, 11};
expected_fast_output_shape = TensorShapeVector{1, 9, 10, 1};
ASSERT_EQ(fast_kind, FastReduceKind::kRKR);
ASSERT_EQ(fast_shape, expected_fast_shape);
ASSERT_EQ(fast_output_shape, expected_fast_output_shape);
ASSERT_EQ(fast_axes, expected_fast_axes);
// Two trailing reduced axes (10, 11) are expected to merge into a single reduced dim of 110.
fast_kind = OptimizeShapeForFastReduce(
std::vector<int64_t>{7, 9, 10, 11}, std::vector<int64_t>{0, 2, 3},
fast_shape, fast_output_shape, fast_axes, true);
expected_fast_shape = TensorShapeVector{7, 9, 110};
expected_fast_output_shape = TensorShapeVector{1, 9, 1, 1};
ASSERT_EQ(fast_kind, FastReduceKind::kRKR);
ASSERT_EQ(fast_shape, expected_fast_shape);
ASSERT_EQ(fast_output_shape, expected_fast_output_shape);
ASSERT_EQ(fast_axes, expected_fast_axes);
// Two leading reduced axes (7, 9) are expected to merge into a single reduced dim of 63.
fast_kind = OptimizeShapeForFastReduce(
std::vector<int64_t>{7, 9, 10, 11}, std::vector<int64_t>{0, 1, 3},
fast_shape, fast_output_shape, fast_axes, true);
expected_fast_shape = TensorShapeVector{63, 10, 11};
expected_fast_output_shape = TensorShapeVector{1, 1, 10, 1};
ASSERT_EQ(fast_kind, FastReduceKind::kRKR);
ASSERT_EQ(fast_shape, expected_fast_shape);
ASSERT_EQ(fast_output_shape, expected_fast_output_shape);
ASSERT_EQ(fast_axes, expected_fast_axes);
// RKR - keep_dims=0
fast_kind = OptimizeShapeForFastReduce(
std::vector<int64_t>{9, 10, 11}, std::vector<int64_t>{0, 2},
fast_shape, fast_output_shape, fast_axes, false);
expected_fast_shape = TensorShapeVector{9, 10, 11};
expected_fast_output_shape = TensorShapeVector{10};
expected_fast_axes = TensorShapeVector{0, 2};
ASSERT_EQ(fast_kind, FastReduceKind::kRKR);
ASSERT_EQ(fast_shape, expected_fast_shape);
ASSERT_EQ(fast_output_shape, expected_fast_output_shape);
ASSERT_EQ(fast_axes, expected_fast_axes);
fast_kind = OptimizeShapeForFastReduce(
std::vector<int64_t>{7, 9, 10, 11}, std::vector<int64_t>{0, 3},
fast_shape, fast_output_shape, fast_axes, false);
expected_fast_shape = TensorShapeVector{7, 90, 11};
expected_fast_output_shape = TensorShapeVector{9, 10};
ASSERT_EQ(fast_kind, FastReduceKind::kRKR);
ASSERT_EQ(fast_shape, expected_fast_shape);
ASSERT_EQ(fast_output_shape, expected_fast_output_shape);
ASSERT_EQ(fast_axes, expected_fast_axes);
fast_kind = OptimizeShapeForFastReduce(
std::vector<int64_t>{7, 9, 10, 11}, std::vector<int64_t>{0, 2, 3},
fast_shape, fast_output_shape, fast_axes, false);
expected_fast_shape = TensorShapeVector{7, 9, 110};
expected_fast_output_shape = TensorShapeVector{9};
ASSERT_EQ(fast_kind, FastReduceKind::kRKR);
ASSERT_EQ(fast_shape, expected_fast_shape);
ASSERT_EQ(fast_output_shape, expected_fast_output_shape);
ASSERT_EQ(fast_axes, expected_fast_axes);
fast_kind = OptimizeShapeForFastReduce(
std::vector<int64_t>{7, 9, 10, 11}, std::vector<int64_t>{0, 1, 3},
fast_shape, fast_output_shape, fast_axes, false);
expected_fast_shape = TensorShapeVector{63, 10, 11};
expected_fast_output_shape = TensorShapeVector{10};
ASSERT_EQ(fast_kind, FastReduceKind::kRKR);
ASSERT_EQ(fast_shape, expected_fast_shape);
ASSERT_EQ(fast_output_shape, expected_fast_output_shape);
ASSERT_EQ(fast_axes, expected_fast_axes);
}
TEST(ReductionOpTest, OptimizeShapeForFastReduce_NONE) {
FastReduceKind fast_kind;
TensorShapeVector fast_shape, fast_output_shape, fast_axes;
@ -3427,6 +3517,53 @@ TEST(ReductionOpTest, ReduceMax_KRK_keepdims) {
test.Run();
}
// ReduceMax over axes {0, 2} of a 3x2x2 tensor holding 1..12, keepdims=0.
// One maximum is expected per index of the kept axis 1: 10 and 12.
TEST(ReductionOpTest, ReduceMax_RKR) {
OpTester test("ReduceMax");
test.AddAttribute("axes", std::vector<int64_t>{0, 2});
test.AddAttribute("keepdims", (int64_t)0);
test.AddInput<float>("data", {3, 2, 2},
{1.0f, 2.0f,
3.0f, 4.0f,
5.0f, 6.0f,
7.0f, 8.0f,
9.0f, 10.0f,
11.0f, 12.0f});
test.AddOutput<float>("reduced", {2}, {10.f, 12.f});
test.Run();
}
// ReduceMax over axes {0, 2} of a 2x16x2 tensor holding 0..63, keepdims=0.
// For kept index j the largest element is 33 + 2*j.
// NOTE(review): the name suggests the kept dimension (16) is sized to reach the
// multi-threaded RKR path; whether it does depends on the thread pool -- confirm.
TEST(ReductionOpTest, ReduceMax_RKR_parallel) {
OpTester test("ReduceMax");
test.AddAttribute("axes", std::vector<int64_t>{0, 2});
test.AddAttribute("keepdims", (int64_t)0);
test.AddInput<float>("data", {2, 16, 2},
{0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f,
17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f, 31.0f, 32.0f,
33.0f, 34.0f, 35.0f, 36.0f, 37.0f, 38.0f, 39.0f, 40.0f, 41.0f, 42.0f, 43.0f, 44.0f, 45.0f, 46.0f, 47.0f, 48.0f,
49.0f, 50.0f, 51.0f, 52.0f, 53.0f, 54.0f, 55.0f, 56.0f, 57.0f, 58.0f, 59.0f, 60.0f, 61.0f, 62.0f, 63.0f});
test.AddOutput<float>("reduced", {16}, {33.0f, 35.0f, 37.0f, 39.0f, 41.0f, 43.0f, 45.0f, 47.0f, 49.0f, 51.0f, 53.0f, 55.0f, 57.0f, 59.0f, 61.0f, 63.0f});
test.Run();
}
// Same data and axes as ReduceMax_RKR, but with keepdims=1: the reduced axes stay in
// the output as size-1 dimensions, so the expected shape is {1, 2, 1}.
TEST(ReductionOpTest, ReduceMax_RKR_keepdims) {
OpTester test("ReduceMax");
test.AddAttribute("axes", std::vector<int64_t>{0, 2});
test.AddAttribute("keepdims", (int64_t)1);
test.AddInput<float>("data", {3, 2, 2},
{1.0f, 2.0f,
3.0f, 4.0f,
5.0f, 6.0f,
7.0f, 8.0f,
9.0f, 10.0f,
11.0f, 12.0f});
test.AddOutput<float>("reduced", {1, 2, 1}, {10.f, 12.f});
test.Run();
}
TEST(ReductionOpTest, ReduceMax_RKRK) {
OpTester test("ReduceMax");
test.AddAttribute("axes", std::vector<int64_t>{0, 2});
@ -3581,6 +3718,53 @@ TEST(ReductionOpTest, ReduceMean_KRK_keepdims) {
test.Run();
}
// ReduceMean over axes {0, 2} of a 3x2x2 tensor holding 1..12, keepdims=0.
// Each output averages six values: 33 / 6 = 5.5 and 45 / 6 = 7.5.
TEST(ReductionOpTest, ReduceMean_RKR) {
OpTester test("ReduceMean");
test.AddAttribute("axes", std::vector<int64_t>{0, 2});
test.AddAttribute("keepdims", (int64_t)0);
test.AddInput<float>("data", {3, 2, 2},
{1.0f, 2.0f,
3.0f, 4.0f,
5.0f, 6.0f,
7.0f, 8.0f,
9.0f, 10.0f,
11.0f, 12.0f});
test.AddOutput<float>("reduced", {2}, {5.5f, 7.5f});
test.Run();
}
// ReduceMean over axes {0, 2} of a 2x16x2 tensor holding 0..63, keepdims=0.
// For kept index j the four reduced values average to 16.5 + 2*j.
// NOTE(review): the name suggests the kept dimension (16) is sized to reach the
// multi-threaded RKR path; whether it does depends on the thread pool -- confirm.
TEST(ReductionOpTest, ReduceMean_RKR_parallel) {
OpTester test("ReduceMean");
test.AddAttribute("axes", std::vector<int64_t>{0, 2});
test.AddAttribute("keepdims", (int64_t)0);
test.AddInput<float>("data", {2, 16, 2},
{0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f,
17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f, 31.0f, 32.0f,
33.0f, 34.0f, 35.0f, 36.0f, 37.0f, 38.0f, 39.0f, 40.0f, 41.0f, 42.0f, 43.0f, 44.0f, 45.0f, 46.0f, 47.0f, 48.0f,
49.0f, 50.0f, 51.0f, 52.0f, 53.0f, 54.0f, 55.0f, 56.0f, 57.0f, 58.0f, 59.0f, 60.0f, 61.0f, 62.0f, 63.0f});
test.AddOutput<float>("reduced", {16}, {16.5f, 18.5f, 20.5f, 22.5f, 24.5f, 26.5f, 28.5f, 30.5f, 32.5f, 34.5f, 36.5f, 38.5f, 40.5f, 42.5f, 44.5f, 46.5f});
test.Run();
}
// Same data and axes as ReduceMean_RKR, but with keepdims=1: the reduced axes stay in
// the output as size-1 dimensions, so the expected shape is {1, 2, 1}.
TEST(ReductionOpTest, ReduceMean_RKR_keepdims) {
OpTester test("ReduceMean");
test.AddAttribute("axes", std::vector<int64_t>{0, 2});
test.AddAttribute("keepdims", (int64_t)1);
test.AddInput<float>("data", {3, 2, 2},
{1.0f, 2.0f,
3.0f, 4.0f,
5.0f, 6.0f,
7.0f, 8.0f,
9.0f, 10.0f,
11.0f, 12.0f});
test.AddOutput<float>("reduced", {1, 2, 1}, {5.5f, 7.5f});
test.Run();
}
TEST(ReductionOpTest, ReduceMean_RKRK) {
OpTester test("ReduceMean");
test.AddAttribute("axes", std::vector<int64_t>{0, 2});
@ -3771,6 +3955,53 @@ TEST(ReductionOpTest, ReduceMin_KRK_keepdims) {
test.Run();
}
// ReduceMin over axes {0, 2} of a 3x2x2 tensor holding 11..22, keepdims=0.
// One minimum is expected per index of the kept axis 1: 11 and 13.
TEST(ReductionOpTest, ReduceMin_RKR) {
OpTester test("ReduceMin");
test.AddAttribute("axes", std::vector<int64_t>{0, 2});
test.AddAttribute("keepdims", (int64_t)0);
test.AddInput<float>("data", {3, 2, 2},
{11.0f, 12.0f,
13.0f, 14.0f,
15.0f, 16.0f,
17.0f, 18.0f,
19.0f, 20.0f,
21.0f, 22.0f});
test.AddOutput<float>("reduced", {2}, {11.f, 13.f});
test.Run();
}
// ReduceMin over axes {0, 2} of a 2x16x2 tensor holding 0..63, keepdims=0.
// For kept index j the smallest element is 2*j.
// NOTE(review): the name suggests the kept dimension (16) is sized to reach the
// multi-threaded RKR path; whether it does depends on the thread pool -- confirm.
TEST(ReductionOpTest, ReduceMin_RKR_parallel) {
OpTester test("ReduceMin");
test.AddAttribute("axes", std::vector<int64_t>{0, 2});
test.AddAttribute("keepdims", (int64_t)0);
test.AddInput<float>("data", {2, 16, 2},
{0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f,
17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f, 31.0f, 32.0f,
33.0f, 34.0f, 35.0f, 36.0f, 37.0f, 38.0f, 39.0f, 40.0f, 41.0f, 42.0f, 43.0f, 44.0f, 45.0f, 46.0f, 47.0f, 48.0f,
49.0f, 50.0f, 51.0f, 52.0f, 53.0f, 54.0f, 55.0f, 56.0f, 57.0f, 58.0f, 59.0f, 60.0f, 61.0f, 62.0f, 63.0f});
test.AddOutput<float>("reduced", {16}, {0.0f, 2.0f, 4.0f, 6.0f, 8.0f, 10.0f, 12.0f, 14.0f, 16.0f, 18.0f, 20.0f, 22.0f, 24.0f, 26.0f, 28.0f, 30.0f});
test.Run();
}
// Same data and axes as ReduceMin_RKR, but with keepdims=1: the reduced axes stay in
// the output as size-1 dimensions, so the expected shape is {1, 2, 1}.
TEST(ReductionOpTest, ReduceMin_RKR_keepdims) {
OpTester test("ReduceMin");
test.AddAttribute("axes", std::vector<int64_t>{0, 2});
test.AddAttribute("keepdims", (int64_t)1);
test.AddInput<float>("data", {3, 2, 2},
{11.0f, 12.0f,
13.0f, 14.0f,
15.0f, 16.0f,
17.0f, 18.0f,
19.0f, 20.0f,
21.0f, 22.0f});
test.AddOutput<float>("reduced", {1, 2, 1}, {11.f, 13.f});
test.Run();
}
TEST(ReductionOpTest, ReduceMin_RKRK) {
OpTester test("ReduceMin");
test.AddAttribute("axes", std::vector<int64_t>{0, 2});
@ -4102,6 +4333,126 @@ TEST(ReductionOpTest, ReduceSum_KRK2_keepdims) {
test.Run();
}
// ReduceSum over axes {0, 2} of a 3x2x2 tensor holding 1..12, keepdims=0.
// Expected sums per kept index: 1+2+5+6+9+10 = 33 and 3+4+7+8+11+12 = 45.
TEST(ReductionOpTest, ReduceSum_RKR) {
OpTester test("ReduceSum");
test.AddAttribute("axes", std::vector<int64_t>{0, 2});
test.AddAttribute("keepdims", (int64_t)0);
test.AddInput<float>("data", {3, 2, 2},
{1.0f, 2.0f,
3.0f, 4.0f,
5.0f, 6.0f,
7.0f, 8.0f,
9.0f, 10.0f,
11.0f, 12.0f});
test.AddOutput<float>("reduced", {2}, {33.f, 45.f});
test.Run();
}
// ReduceSum over axes {0, 2} of a 2x16x2 tensor holding 0..63, keepdims=0.
// For kept index j the four reduced values add up to 66 + 8*j.
// NOTE(review): the name suggests the kept dimension (16) is sized to reach the
// multi-threaded RKR path; whether it does depends on the thread pool -- confirm.
TEST(ReductionOpTest, ReduceSum_RKR_parallel) {
OpTester test("ReduceSum");
test.AddAttribute("axes", std::vector<int64_t>{0, 2});
test.AddAttribute("keepdims", (int64_t)0);
test.AddInput<float>("data", {2, 16, 2},
{0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f,
17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f, 31.0f, 32.0f,
33.0f, 34.0f, 35.0f, 36.0f, 37.0f, 38.0f, 39.0f, 40.0f, 41.0f, 42.0f, 43.0f, 44.0f, 45.0f, 46.0f, 47.0f, 48.0f,
49.0f, 50.0f, 51.0f, 52.0f, 53.0f, 54.0f, 55.0f, 56.0f, 57.0f, 58.0f, 59.0f, 60.0f, 61.0f, 62.0f, 63.0f});
test.AddOutput<float>("reduced", {16}, {66.0f, 74.0f, 82.0f, 90.0f, 98.0f, 106.0f, 114.0f, 122.0f, 130.0f, 138.0f, 146.0f, 154.0f, 162.0f, 170.0f, 178.0f, 186.0f});
test.Run();
}
// ReduceSum over axes {0, 2} of a 2x128x2 tensor filled with 0..511, keepdims=0.
// The expected output is not hard-coded: it is computed below with a plain triple loop
// over the same data.
TEST(ReductionOpTest, ReduceSum_RKR_parallel_bigger) {
OpTester test("ReduceSum");
test.AddAttribute("axes", std::vector<int64_t>{0, 2});
test.AddAttribute("keepdims", (int64_t)0);
std::vector<float> in_data(512);
for (size_t i = 0; i < in_data.size(); ++i)
in_data[i] = (float)i;
test.AddInput<float>("data", {2, 128, 2}, in_data);
std::vector<float> expected(128);
// Reference result: element (i, j, k) lives at flat index i * 256 + j * 2 + k.
for (size_t j = 0; j < 128; ++j) {
expected[j] = 0;
for (size_t i = 0; i < 2; ++i) {
for (size_t k = 0; k < 2; ++k) {
expected[j] += in_data[i * 256 + j * 2 + k];
}
}
}
test.AddOutput<float>("reduced", {128}, expected);
test.Run();
}
// Same data and axes as ReduceSum_RKR, but with keepdims=1: the reduced axes stay in
// the output as size-1 dimensions, so the expected shape is {1, 2, 1}.
TEST(ReductionOpTest, ReduceSum_RKR_keepdims) {
OpTester test("ReduceSum");
test.AddAttribute("axes", std::vector<int64_t>{0, 2});
test.AddAttribute("keepdims", (int64_t)1);
test.AddInput<float>("data", {3, 2, 2},
{1.0f, 2.0f,
3.0f, 4.0f,
5.0f, 6.0f,
7.0f, 8.0f,
9.0f, 10.0f,
11.0f, 12.0f});
test.AddOutput<float>("reduced", {1, 2, 1}, {33.f, 45.f});
test.Run();
}
// ReduceSum over axes {0, 3} of a 4-D 3x2x2x2 tensor holding 1..24, keepdims=0.
// The two middle axes are kept, giving a 2x2 output with sums 57, 69, 81 and 93.
// NOTE(review): presumably the two kept middle axes are merged so that this case also
// takes the RKR layout -- confirm against OptimizeShapeForFastReduce.
TEST(ReductionOpTest, ReduceSum_RKR2) {
OpTester test("ReduceSum");
test.AddAttribute("axes", std::vector<int64_t>{0, 3});
test.AddAttribute("keepdims", (int64_t)0);
test.AddInput<float>("data", {3, 2, 2, 2},
{1.0f, 2.0f,
3.0f, 4.0f,
5.0f, 6.0f,
7.0f, 8.0f,
9.0f, 10.0f,
11.0f, 12.0f,
13.0f, 14.0f,
15.0f, 16.0f,
17.0f, 18.0f,
19.0f, 20.0f,
21.0f, 22.0f,
23.0f, 24.0f});
test.AddOutput<float>("reduced", {2, 2}, {57.0f, 69.0f, 81.0f, 93.0f});
test.Run();
}
// Same data and axes as ReduceSum_RKR2, but with keepdims=1: the reduced axes stay in
// the output as size-1 dimensions, so the expected shape is {1, 2, 2, 1}.
TEST(ReductionOpTest, ReduceSum_RKR2_keepdims) {
OpTester test("ReduceSum");
test.AddAttribute("axes", std::vector<int64_t>{0, 3});
test.AddAttribute("keepdims", (int64_t)1);
test.AddInput<float>("data", {3, 2, 2, 2},
{1.0f, 2.0f,
3.0f, 4.0f,
5.0f, 6.0f,
7.0f, 8.0f,
9.0f, 10.0f,
11.0f, 12.0f,
13.0f, 14.0f,
15.0f, 16.0f,
17.0f, 18.0f,
19.0f, 20.0f,
21.0f, 22.0f,
23.0f, 24.0f});
test.AddOutput<float>("reduced", {1, 2, 2, 1}, {57.0f, 69.0f, 81.0f, 93.0f});
test.Run();
}
TEST(ReductionOpTest, ReduceSum_RKRK) {
OpTester test("ReduceSum");
test.AddAttribute("axes", std::vector<int64_t>{0, 2});