mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-25 22:26:24 +00:00
Optimize ReduceSum, ReduceMean, ReduceMin, ReduceMax (#10280)
* Optimize ReduceSum, ReduceMean, ReduceMin, ReduceMax * improve reducemax, reducemin * faster, smaller * replace std::vector by gsl::span for shapes * fix merging issues
This commit is contained in:
parent
df841ee87d
commit
6f0640a57f
3 changed files with 520 additions and 56 deletions
|
|
@ -259,10 +259,15 @@ static void ValidateFastReduceRK(const gsl::span<const int64_t>& fast_shape, con
|
|||
}
|
||||
|
||||
// Checks the preconditions of the KRK fast path (kept dim, reduced dim, kept dim).
//   fast_shape: the collapsed input shape; must contain exactly three dimensions.
//   output:     the destination tensor; must hold fast_shape[0] * fast_shape[2] elements.
// Violations are reported through ORT_ENFORCE.
static void ValidateFastReduceKRK(const gsl::span<const int64_t>& fast_shape, const Tensor& output) {
  // The rank is checked once, with a message that matches the three-dimension requirement.
  ORT_ENFORCE(fast_shape.size() == 3, "Only works on matrices with three dimensions.");
  // Only the two kept dimensions (first and last) contribute to the output size.
  ORT_ENFORCE(fast_shape[0] * fast_shape[2] == output.Shape().Size(), "Output size mismatch.");
}
|
||||
|
||||
// Checks the preconditions of the RKR fast path (reduced dim, kept dim, reduced dim).
//   fast_shape: the collapsed input shape; must contain exactly three dimensions.
//   output:     the destination tensor; must hold fast_shape[1] elements, i.e. one per
//               index of the middle dimension.
// Violations are reported through ORT_ENFORCE.
static void ValidateFastReduceRKR(const gsl::span<const int64_t>& fast_shape,
                                  const Tensor& output) {
  ORT_ENFORCE(fast_shape.size() == 3, "Only works on matrices with three dimensions.");
  ORT_ENFORCE(fast_shape[1] == output.Shape().Size(), "Output size mismatch.");
}
|
||||
|
||||
// Base-class placeholder for the KR fast path. All arguments are ignored; the body only
// calls ValidateMustBeOverloaded() (presumably an error path signalling that a derived
// aggregator must supply its own FastReduceKR -- confirm against its definition).
void ReduceAggregatorBase::FastReduceKR(const Tensor& /*input*/,
                                        const gsl::span<const int64_t>& /*fast_shape*/,
                                        Tensor& /*output*/,
                                        concurrency::ThreadPool* /*tp*/) {
  ValidateMustBeOverloaded();
}
|
||||
|
|
@ -272,6 +277,9 @@ void ReduceAggregatorBase::FastReduceRK(const Tensor&, const gsl::span<const int
|
|||
// Base-class placeholder for the KRK fast path. All arguments are ignored; the body only
// calls ValidateMustBeOverloaded() (presumably an error path signalling that a derived
// aggregator must supply its own FastReduceKRK -- confirm against its definition).
void ReduceAggregatorBase::FastReduceKRK(const Tensor& /*input*/,
                                         const gsl::span<const int64_t>& /*fast_shape*/,
                                         Tensor& /*output*/,
                                         concurrency::ThreadPool* /*tp*/) {
  ValidateMustBeOverloaded();
}
|
||||
// Base-class placeholder for the RKR fast path. All arguments are ignored; the body only
// calls ValidateMustBeOverloaded() (presumably an error path signalling that a derived
// aggregator must supply its own FastReduceRKR -- confirm against its definition).
void ReduceAggregatorBase::FastReduceRKR(const Tensor& /*input*/,
                                         const gsl::span<const int64_t>& /*fast_shape*/,
                                         Tensor& /*output*/,
                                         concurrency::ThreadPool* /*tp*/) {
  ValidateMustBeOverloaded();
}
|
||||
|
||||
void NoTransposePrepareForReduce(const TensorShape& new_input_shape,
|
||||
gsl::span<const int64_t> reduced_axes,
|
||||
|
|
@ -624,8 +632,8 @@ FastReduceKind OptimizeShapeForFastReduce(gsl::span<const int64_t> input_shape,
|
|||
if (fast_shape.size() == 2) {
|
||||
return reduce[0] ? FastReduceKind::kRK : FastReduceKind::kKR;
|
||||
}
|
||||
if (fast_shape.size() == 3 && !reduce[0]) {
|
||||
return FastReduceKind::kKRK;
|
||||
if (fast_shape.size() == 3) {
|
||||
return reduce[0] ? FastReduceKind::kRKR : FastReduceKind::kKRK;
|
||||
}
|
||||
return FastReduceKind::kNone;
|
||||
}
|
||||
|
|
@ -671,7 +679,8 @@ bool CommonFastReduceSwitch(OpKernelContext* ctx,
|
|||
FastReduceKind which_fast_reduce,
|
||||
fast_reduce_fct* case_kr,
|
||||
fast_reduce_fct* case_rk,
|
||||
fast_reduce_fct* case_krk) {
|
||||
fast_reduce_fct* case_krk,
|
||||
fast_reduce_fct* case_rkr) {
|
||||
TensorShapeVector axes;
|
||||
const Tensor* input = ctx->Input<Tensor>(0);
|
||||
auto reduced_dims = input->Shape().GetDims();
|
||||
|
|
@ -715,6 +724,14 @@ bool CommonFastReduceSwitch(OpKernelContext* ctx,
|
|||
} else {
|
||||
break;
|
||||
}
|
||||
case FastReduceKind::kRKR:
|
||||
ValidateFastReduceRKR(fast_shape, *output);
|
||||
if (fast_shape[1] >= std::max(2, concurrency::ThreadPool::DegreeOfParallelism(ctx->GetOperatorThreadPool()))) {
|
||||
case_rkr(*input, fast_shape, *output, ctx->GetOperatorThreadPool());
|
||||
return true;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
case FastReduceKind::kR:
|
||||
case FastReduceKind::kK:
|
||||
case FastReduceKind::kNone:
|
||||
|
|
@ -738,7 +755,8 @@ bool CommonFastReduce(OpKernelContext* ctx,
|
|||
TensorShapeVector& fast_axes) {
|
||||
return CommonFastReduceSwitch(ctx, axes_, keepdims_, noop_with_empty_axes,
|
||||
fast_kind, fast_shape, output_shape, fast_axes,
|
||||
AGG::WhichFastReduce(), &AGG::FastReduceKR, &AGG::FastReduceRK, &AGG::FastReduceKRK);
|
||||
AGG::WhichFastReduce(), &AGG::FastReduceKR, &AGG::FastReduceRK,
|
||||
&AGG::FastReduceKRK, &AGG::FastReduceRKR);
|
||||
}
|
||||
|
||||
static void ValidateKeepDims(const TensorShape& shape, int64_t keepdims) {
|
||||
|
|
@ -925,6 +943,14 @@ std::unique_ptr<Tensor> ReduceSum<T>::Impl(const Tensor& input, gsl::span<const
|
|||
} else {
|
||||
break;
|
||||
}
|
||||
case FastReduceKind::kRKR:
|
||||
ValidateFastReduceRKR(fast_shape, *output);
|
||||
if (fast_shape[0] >= std::max(2, concurrency::ThreadPool::DegreeOfParallelism(tp))) {
|
||||
ReduceAggregatorSum<T>::FastReduceRKR(input, fast_shape, *output, tp);
|
||||
return output;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
case FastReduceKind::kR:
|
||||
case FastReduceKind::kK:
|
||||
case FastReduceKind::kNone:
|
||||
|
|
|
|||
|
|
@ -25,7 +25,8 @@ enum FastReduceKind {
|
|||
kKR = 4, // kept dim, reduced dim
|
||||
kRK = 8, // reduced dim, kept dim
|
||||
kKRK = 16, // kept dim, reduced dim, kept dim
|
||||
kEmpty = 32 // empty reduce
|
||||
kRKR = 32, // reduced dim, kept dim, reduced dim
|
||||
kEmpty = 64 // empty reduce
|
||||
};
|
||||
|
||||
FastReduceKind operator|(FastReduceKind a, FastReduceKind b);
|
||||
|
|
@ -54,6 +55,7 @@ constexpr TensorOpCost ParallelReduceFastCost(int64_t n_row, int64_t n_col, int6
|
|||
* KR - reduction on the last dimensions
|
||||
* RK - reduction on the first dimensions
|
||||
* KRK - reduction on the middle dimensions.
|
||||
* RKR - reduction on all dimensions but the middle ones
|
||||
|
||||
For these configurations, the reduction may be optimized
|
||||
with vectors operations. Method WhichFastReduce() returns which case
|
||||
|
|
@ -154,6 +156,7 @@ class ReduceAggregatorBase {
|
|||
static void FastReduceKR(const Tensor&, const gsl::span<const int64_t>&, Tensor&, concurrency::ThreadPool*);
|
||||
static void FastReduceRK(const Tensor&, const gsl::span<const int64_t>&, Tensor&, concurrency::ThreadPool*);
|
||||
static void FastReduceKRK(const Tensor&, const gsl::span<const int64_t>&, Tensor&, concurrency::ThreadPool*);
|
||||
static void FastReduceRKR(const Tensor&, const gsl::span<const int64_t>&, Tensor&, concurrency::ThreadPool*);
|
||||
};
|
||||
|
||||
template <typename T, typename TVAL = T>
|
||||
|
|
@ -175,20 +178,48 @@ class ReduceAggregator : public ReduceAggregatorBase {
|
|||
inline void update0(const T&) {}
|
||||
inline TVAL aggall(const T*) {}
|
||||
inline TVAL get_value() { return accumulator_; }
|
||||
|
||||
protected:
|
||||
// Shared driver for the RKR fast path (reduced dim, kept dim, reduced dim).
//
// The input is read as a [fast_shape[0], fast_shape[1], fast_shape[2]] array and one
// output value is produced per index k of the middle dimension:
//   1. out[k] is seeded with f_init(p), where p points at the first contiguous run of
//      fast_shape[2] elements belonging to k;
//   2. f_update(out[k], run, fast_shape[2]) is then invoked once per index of the first
//      dimension, advancing the run pointer by fast_shape[1] * fast_shape[2] each time.
//
// Parameters:
//   input      tensor read through Data<T>().
//   fast_shape collapsed three-dimensional shape.
//   output     tensor written through MutableData<TVAL>(), one element per k.
//   tp         thread pool handed to TryParallelFor; the work is split over k.
//   f_init     returns the starting accumulator value for a k.
//   f_update   folds one contiguous run of elements into the accumulator.
static void CommonFastReduceRKR(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
                                Tensor& output, concurrency::ThreadPool* tp,
                                std::function<TVAL(const T*)> f_init,
                                std::function<void(TVAL&, const T*, int64_t)> f_update) {
  const T* const src = input.Data<T>();
  TVAL* const dst = output.MutableData<TVAL>();
  const int64_t n_outer = fast_shape[0];
  const int64_t n_inner = fast_shape[2];
  // Distance between two consecutive runs that belong to the same kept index.
  const int64_t outer_stride = n_inner * fast_shape[1];

  concurrency::ThreadPool::TryParallelFor(
      tp, fast_shape[1],
      ParallelReduceFastCost(fast_shape[1], fast_shape[0] * fast_shape[2], sizeof(T), 6),
      [src, dst, n_outer, n_inner, outer_stride, f_init, f_update](ptrdiff_t first, ptrdiff_t end) {
        for (ptrdiff_t k = first; k < end; ++k) {
          const T* run = src + k * n_inner;
          TVAL& acc = dst[k];
          acc = f_init(run);
          for (int64_t i = 0; i < n_outer; ++i) {
            f_update(acc, run, n_inner);
            run += outer_stride;
          }
        }
      });
}
|
||||
};
|
||||
|
||||
template <typename T, typename TVAL = T>
|
||||
class ReduceAggregatorSum : public ReduceAggregator<T, TVAL> {
|
||||
template <typename T>
|
||||
class ReduceAggregatorSum : public ReduceAggregator<T, T> {
|
||||
public:
|
||||
inline ReduceAggregatorSum(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 0) {}
|
||||
inline ReduceAggregatorSum(int64_t N, const T&) : ReduceAggregator<T, T>(N, 0) {}
|
||||
inline void update(const T& v) { this->accumulator_ += v; }
|
||||
inline TVAL aggall(const T* from_data) {
|
||||
return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).sum();
|
||||
static T aggall(const T* from_data, int64_t size) {
|
||||
return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, size).sum();
|
||||
}
|
||||
inline T aggall(const T* from_data) {
|
||||
return aggall(from_data, this->N_);
|
||||
}
|
||||
|
||||
// Fast reduction
|
||||
// Reports which fast-reduce layouts this aggregator implements, as a bitwise OR of
// FastReduceKind flags. kRKR is part of the set because the class provides
// FastReduceRKR; a single return statement keeps that flag from being shadowed by an
// earlier return that omits it.
static inline FastReduceKind WhichFastReduce() {
  return FastReduceKind::kKR | FastReduceKind::kRK | FastReduceKind::kKRK | FastReduceKind::kRKR;
}
|
||||
|
||||
static void FastReduceKR(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
|
||||
|
|
@ -200,7 +231,7 @@ class ReduceAggregatorSum : public ReduceAggregator<T, TVAL> {
|
|||
tp, fast_shape[0], ParallelReduceFastCost(1, stridei, sizeof(T), 6),
|
||||
[data, stridei, out](ptrdiff_t first, ptrdiff_t last) {
|
||||
for (ptrdiff_t d = first; d < last; ++d) {
|
||||
out[d] = ConstEigenVectorArrayMap<T>(data + d * stridei, stridei).sum();
|
||||
out[d] = aggall(data + d * stridei, stridei);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
|
@ -239,6 +270,16 @@ class ReduceAggregatorSum : public ReduceAggregator<T, TVAL> {
|
|||
}
|
||||
});
|
||||
}
|
||||
|
||||
// RKR fast path for sums: every output element starts at zero and accumulates the sum of
// each contiguous run handed over by CommonFastReduceRKR.
//   input / fast_shape / output / tp are forwarded unchanged to CommonFastReduceRKR.
static void FastReduceRKR(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
                          Tensor& output, concurrency::ThreadPool* tp) {
  // The seed ignores the data pointer: a sum always starts from 0.
  const auto start_at_zero = [](const T*) -> T { return 0; };
  // aggall(p, size) is the static overload that sums `size` contiguous elements.
  const auto add_run = [](T& value, const T* p, int64_t size) {
    value += aggall(p, size);
  };
  ReduceAggregator<T, T>::CommonFastReduceRKR(input, fast_shape, output, tp,
                                              start_at_zero, add_run);
}
|
||||
};
|
||||
|
||||
template <typename T, typename TVAL = T>
|
||||
|
|
@ -251,12 +292,15 @@ class ReduceAggregatorSumSquare : public ReduceAggregator<T, TVAL> {
|
|||
inline void update(const T& v) { this->accumulator_ += v * v; }
|
||||
};
|
||||
|
||||
template <typename T, typename TVAL = T>
|
||||
class ReduceAggregatorMean : public ReduceAggregatorSum<T, TVAL> {
|
||||
template <typename T>
|
||||
class ReduceAggregatorMean : public ReduceAggregatorSum<T> {
|
||||
public:
|
||||
inline ReduceAggregatorMean(int64_t N, const T&) : ReduceAggregatorSum<T, TVAL>(N, 0) {}
|
||||
inline ReduceAggregatorMean(int64_t N, const T&) : ReduceAggregatorSum<T>(N, 0) {}
|
||||
static T aggall(const T* from_data, int64_t size) {
|
||||
return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, size).mean();
|
||||
}
|
||||
inline T aggall(const T* from_data) {
|
||||
return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).mean();
|
||||
return aggall(from_data, this->N_);
|
||||
}
|
||||
inline T get_value() { return this->accumulator_ / static_cast<T>(this->N_); }
|
||||
|
||||
|
|
@ -265,7 +309,7 @@ class ReduceAggregatorMean : public ReduceAggregatorSum<T, TVAL> {
|
|||
|
||||
static void FastReduceKR(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
|
||||
Tensor& output, concurrency::ThreadPool* tp) {
|
||||
ReduceAggregatorSum<T, TVAL>::FastReduceKR(input, fast_shape, output, tp);
|
||||
ReduceAggregatorSum<T>::FastReduceKR(input, fast_shape, output, tp);
|
||||
// TODO: use MLAS or BLAS
|
||||
T* out = output.MutableData<T>();
|
||||
T* end = out + fast_shape[0];
|
||||
|
|
@ -276,7 +320,7 @@ class ReduceAggregatorMean : public ReduceAggregatorSum<T, TVAL> {
|
|||
|
||||
static void FastReduceRK(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
|
||||
Tensor& output, concurrency::ThreadPool* tp) {
|
||||
ReduceAggregatorSum<T, TVAL>::FastReduceRK(input, fast_shape, output, tp);
|
||||
ReduceAggregatorSum<T>::FastReduceRK(input, fast_shape, output, tp);
|
||||
// TODO: use MLAS or BLAS
|
||||
T* out = output.MutableData<T>();
|
||||
T* end = out + fast_shape[1];
|
||||
|
|
@ -287,7 +331,7 @@ class ReduceAggregatorMean : public ReduceAggregatorSum<T, TVAL> {
|
|||
|
||||
static void FastReduceKRK(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
|
||||
Tensor& output, concurrency::ThreadPool* tp) {
|
||||
ReduceAggregatorSum<T, TVAL>::FastReduceKRK(input, fast_shape, output, tp);
|
||||
ReduceAggregatorSum<T>::FastReduceKRK(input, fast_shape, output, tp);
|
||||
int64_t strideo = fast_shape[2];
|
||||
T* out = output.MutableData<T>();
|
||||
T* begin;
|
||||
|
|
@ -301,20 +345,34 @@ class ReduceAggregatorMean : public ReduceAggregatorSum<T, TVAL> {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// RKR fast path for means: delegates to the sum implementation, then divides each of the
// fast_shape[1] output elements by the number of reduced elements
// (fast_shape[0] * fast_shape[2]).
static void FastReduceRKR(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
                          Tensor& output, concurrency::ThreadPool* tp) {
  ReduceAggregatorSum<T>::FastReduceRKR(input, fast_shape, output, tp);

  const T reduced_count = static_cast<T>(fast_shape[0] * fast_shape[2]);
  T* const values = output.MutableData<T>();
  const int64_t n_kept = fast_shape[1];
  for (int64_t k = 0; k < n_kept; ++k) {
    values[k] /= reduced_count;
  }
}
|
||||
};
|
||||
|
||||
template <typename T, typename TVAL = T>
|
||||
class ReduceAggregatorMax : public ReduceAggregator<T, TVAL> {
|
||||
template <typename T>
|
||||
class ReduceAggregatorMax : public ReduceAggregator<T> {
|
||||
public:
|
||||
inline ReduceAggregatorMax(int64_t N, const T& init) : ReduceAggregator<T, TVAL>(N, init) {}
|
||||
inline TVAL aggall(const T* from_data) {
|
||||
return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).maxCoeff();
|
||||
inline ReduceAggregatorMax(int64_t N, const T& init) : ReduceAggregator<T, T>(N, init) {}
|
||||
static T aggall(const T* from_data, int64_t size) {
|
||||
return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, size).maxCoeff();
|
||||
}
|
||||
inline T aggall(const T* from_data) {
|
||||
return aggall(from_data, this->N_);
|
||||
}
|
||||
inline void update(const T& v) { this->accumulator_ = v > this->accumulator_ ? v : this->accumulator_; }
|
||||
|
||||
// Fast reduction
|
||||
// Reports which fast-reduce layouts this aggregator implements, as a bitwise OR of
// FastReduceKind flags. kRKR is part of the set because the class provides
// FastReduceRKR; a single return statement keeps that flag from being shadowed by an
// earlier return that omits it.
static inline FastReduceKind WhichFastReduce() {
  return FastReduceKind::kKR | FastReduceKind::kRK | FastReduceKind::kKRK | FastReduceKind::kRKR;
}
|
||||
|
||||
static void FastReduceKR(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
|
||||
|
|
@ -347,7 +405,8 @@ class ReduceAggregatorMax : public ReduceAggregator<T, TVAL> {
|
|||
for (int64_t row = 1; row < n_rows; ++row) {
|
||||
p = data + row * N;
|
||||
for (int64_t j = begin; j < end; ++j) {
|
||||
out[j] = out[j] > p[j] ? out[j] : p[j];
|
||||
if (out[j] < p[j])
|
||||
out[j] = p[j];
|
||||
}
|
||||
}
|
||||
});
|
||||
|
|
@ -371,6 +430,18 @@ class ReduceAggregatorMax : public ReduceAggregator<T, TVAL> {
|
|||
}
|
||||
});
|
||||
}
|
||||
|
||||
// RKR fast path for maxima: every output element is seeded with the first element of its
// first run, then raised to the maximum of each contiguous run handed over by
// CommonFastReduceRKR.
//   input / fast_shape / output / tp are forwarded unchanged to CommonFastReduceRKR.
static void FastReduceRKR(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
                          Tensor& output, concurrency::ThreadPool* tp) {
  const auto first_element = [](const T* p) -> T { return p[0]; };
  // aggall(p, size) is the static overload returning the largest of `size` elements.
  const auto keep_larger = [](T& value, const T* p, int64_t size) {
    const T run_max = aggall(p, size);
    if (run_max > value) {
      value = run_max;
    }
  };
  ReduceAggregator<T, T>::CommonFastReduceRKR(input, fast_shape, output, tp,
                                              first_element, keep_larger);
}
|
||||
};
|
||||
|
||||
template <typename T, typename TVAL = int64_t>
|
||||
|
|
@ -462,18 +533,21 @@ class ReduceAggregatorArgMinLastIndex : public ReduceAggregatorArgMin<T, TVAL> {
|
|||
}
|
||||
};
|
||||
|
||||
template <typename T, typename TVAL = T>
|
||||
class ReduceAggregatorMin : public ReduceAggregator<T, TVAL> {
|
||||
template <typename T>
|
||||
class ReduceAggregatorMin : public ReduceAggregator<T, T> {
|
||||
public:
|
||||
inline ReduceAggregatorMin(int64_t N, const T& init) : ReduceAggregator<T, TVAL>(N, init) {}
|
||||
inline TVAL aggall(const T* from_data) {
|
||||
return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).minCoeff();
|
||||
inline ReduceAggregatorMin(int64_t N, const T& init) : ReduceAggregator<T, T>(N, init) {}
|
||||
static T aggall(const T* from_data, int64_t size) {
|
||||
return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, size).minCoeff();
|
||||
}
|
||||
inline T aggall(const T* from_data) {
|
||||
return aggall(from_data, this->N_);
|
||||
}
|
||||
inline void update(const T& v) { this->accumulator_ = v < this->accumulator_ ? v : this->accumulator_; }
|
||||
|
||||
// Fast reduction
|
||||
// Reports which fast-reduce layouts this aggregator implements, as a bitwise OR of
// FastReduceKind flags. kRKR is part of the set because the class provides
// FastReduceRKR; a single return statement keeps that flag from being shadowed by an
// earlier return that omits it.
static inline FastReduceKind WhichFastReduce() {
  return FastReduceKind::kKR | FastReduceKind::kRK | FastReduceKind::kKRK | FastReduceKind::kRKR;
}
|
||||
|
||||
static void FastReduceKR(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
|
||||
|
|
@ -506,7 +580,8 @@ class ReduceAggregatorMin : public ReduceAggregator<T, TVAL> {
|
|||
for (int64_t row = 1; row < n_rows; ++row) {
|
||||
p = data + row * N;
|
||||
for (int64_t j = begin; j < end; ++j) {
|
||||
out[j] = out[j] < p[j] ? out[j] : p[j];
|
||||
if (out[j] > p[j])
|
||||
out[j] = p[j];
|
||||
}
|
||||
}
|
||||
});
|
||||
|
|
@ -530,60 +605,72 @@ class ReduceAggregatorMin : public ReduceAggregator<T, TVAL> {
|
|||
}
|
||||
});
|
||||
}
|
||||
|
||||
// RKR fast path for minima: every output element is seeded with the first element of its
// first run, then lowered to the minimum of each contiguous run handed over by
// CommonFastReduceRKR.
//   input / fast_shape / output / tp are forwarded unchanged to CommonFastReduceRKR.
static void FastReduceRKR(const Tensor& input, const gsl::span<const int64_t>& fast_shape,
                          Tensor& output, concurrency::ThreadPool* tp) {
  const auto first_element = [](const T* p) -> T { return p[0]; };
  // aggall(p, size) is the static overload returning the smallest of `size` elements.
  const auto keep_smaller = [](T& value, const T* p, int64_t size) {
    const T run_min = aggall(p, size);
    if (run_min < value) {
      value = run_min;
    }
  };
  ReduceAggregator<T, T>::CommonFastReduceRKR(input, fast_shape, output, tp,
                                              first_element, keep_smaller);
}
|
||||
};
|
||||
|
||||
template <typename T, typename TVAL = T>
|
||||
class ReduceAggregatorProd : public ReduceAggregator<T, TVAL> {
|
||||
template <typename T>
|
||||
class ReduceAggregatorProd : public ReduceAggregator<T, T> {
|
||||
public:
|
||||
inline ReduceAggregatorProd(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 1) {}
|
||||
inline TVAL aggall(const T* from_data) {
|
||||
inline ReduceAggregatorProd(int64_t N, const T&) : ReduceAggregator<T, T>(N, 1) {}
|
||||
inline T aggall(const T* from_data) {
|
||||
return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).prod();
|
||||
}
|
||||
inline void update(const T& v) { this->accumulator_ *= v; }
|
||||
};
|
||||
|
||||
template <typename T, typename TVAL = T>
|
||||
class ReduceAggregatorL1 : public ReduceAggregator<T, TVAL> {
|
||||
template <typename T>
|
||||
class ReduceAggregatorL1 : public ReduceAggregator<T, T> {
|
||||
public:
|
||||
inline ReduceAggregatorL1(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 0) {}
|
||||
inline TVAL aggall(const T* from_data) {
|
||||
inline ReduceAggregatorL1(int64_t N, const T&) : ReduceAggregator<T, T>(N, 0) {}
|
||||
inline T aggall(const T* from_data) {
|
||||
return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).cwiseAbs().sum();
|
||||
}
|
||||
inline void update(const T& v) { this->accumulator_ += v > 0 ? v : -v; }
|
||||
};
|
||||
|
||||
template <typename T, typename TVAL = T>
|
||||
class ReduceAggregatorL2 : public ReduceAggregator<T, TVAL> {
|
||||
template <typename T>
|
||||
class ReduceAggregatorL2 : public ReduceAggregator<T, T> {
|
||||
public:
|
||||
inline ReduceAggregatorL2(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 0) {}
|
||||
inline TVAL aggall(const T* from_data) {
|
||||
inline ReduceAggregatorL2(int64_t N, const T&) : ReduceAggregator<T, T>(N, 0) {}
|
||||
inline T aggall(const T* from_data) {
|
||||
return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).norm();
|
||||
}
|
||||
inline void update(const T& v) { this->accumulator_ += v * v; }
|
||||
inline TVAL get_value() { return reduce_sqrt<T>(this->accumulator_); }
|
||||
inline T get_value() { return reduce_sqrt<T>(this->accumulator_); }
|
||||
};
|
||||
|
||||
template <typename T, typename TVAL = T>
|
||||
class ReduceAggregatorLogSum : public ReduceAggregator<T, TVAL> {
|
||||
template <typename T>
|
||||
class ReduceAggregatorLogSum : public ReduceAggregator<T, T> {
|
||||
public:
|
||||
inline ReduceAggregatorLogSum(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 0) {}
|
||||
inline ReduceAggregatorLogSum(int64_t N, const T&) : ReduceAggregator<T, T>(N, 0) {}
|
||||
inline T aggall(const T* from_data) {
|
||||
return reduce_log<T>(Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).sum());
|
||||
}
|
||||
inline void update(const T& v) { this->accumulator_ += v; }
|
||||
inline TVAL get_value() { return reduce_log<T>(this->accumulator_); }
|
||||
inline T get_value() { return reduce_log<T>(this->accumulator_); }
|
||||
};
|
||||
|
||||
template <typename T, typename TVAL = T>
|
||||
class ReduceAggregatorLogSumExp : public ReduceAggregator<T, TVAL> {
|
||||
template <typename T>
|
||||
class ReduceAggregatorLogSumExp : public ReduceAggregator<T, T> {
|
||||
protected:
|
||||
T max_;
|
||||
|
||||
public:
|
||||
inline ReduceAggregatorLogSumExp(int64_t N, const T& init) : ReduceAggregator<T, TVAL>(N, 0) {
|
||||
inline ReduceAggregatorLogSumExp(int64_t N, const T& init) : ReduceAggregator<T, T>(N, 0) {
|
||||
max_ = reduce_isinf(init) ? this->accumulator_ : init;
|
||||
}
|
||||
inline TVAL aggall(const T* from_data) {
|
||||
inline T aggall(const T* from_data) {
|
||||
max_ = Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).maxCoeff();
|
||||
for (int64_t i = 0; i < this->N_; ++i) {
|
||||
update(from_data[i]);
|
||||
|
|
@ -594,7 +681,7 @@ class ReduceAggregatorLogSumExp : public ReduceAggregator<T, TVAL> {
|
|||
max_ = (reduce_isinf(v) || reduce_isnan(v) || v < max_) ? max_ : v;
|
||||
}
|
||||
inline void update(const T& v) { this->accumulator_ += reduce_exp(v - max_); }
|
||||
inline TVAL get_value() { return reduce_log<T>(this->accumulator_) + max_; }
|
||||
inline T get_value() { return reduce_log<T>(this->accumulator_) + max_; }
|
||||
};
|
||||
|
||||
void NoTransposePrepareForReduce(const TensorShape& new_input_shape,
|
||||
|
|
|
|||
|
|
@ -3165,6 +3165,96 @@ TEST(ReductionOpTest, OptimizeShapeForFastReduce_KRK) {
|
|||
ASSERT_EQ(fast_axes, expected_fast_axes);
|
||||
}
|
||||
|
||||
// Verifies that OptimizeShapeForFastReduce classifies "reduced, kept, reduced" axis
// layouts as kRKR and collapses the shape to three dimensions, both with keep_dims on
// and off. In every case the collapsed reduced axes are expected to be {0, 2}.
TEST(ReductionOpTest, OptimizeShapeForFastReduce_RKR) {
  struct RkrCase {
    std::vector<int64_t> input_shape;
    std::vector<int64_t> reduced_axes;
    bool keep_dims;
    TensorShapeVector expected_fast_shape;
    TensorShapeVector expected_fast_output_shape;
  };

  const RkrCase cases[] = {
      // RKR - keep_dims=1
      {{9, 10, 11}, {0, 2}, true, {9, 10, 11}, {1, 10, 1}},
      {{7, 9, 10, 11}, {0, 3}, true, {7, 90, 11}, {1, 9, 10, 1}},
      {{7, 9, 10, 11}, {0, 2, 3}, true, {7, 9, 110}, {1, 9, 1, 1}},
      {{7, 9, 10, 11}, {0, 1, 3}, true, {63, 10, 11}, {1, 1, 10, 1}},
      // RKR - keep_dims=0
      {{9, 10, 11}, {0, 2}, false, {9, 10, 11}, {10}},
      {{7, 9, 10, 11}, {0, 3}, false, {7, 90, 11}, {9, 10}},
      {{7, 9, 10, 11}, {0, 2, 3}, false, {7, 9, 110}, {9}},
      {{7, 9, 10, 11}, {0, 1, 3}, false, {63, 10, 11}, {10}},
  };
  const TensorShapeVector expected_fast_axes{0, 2};

  // The output containers are shared across cases, as each call overwrites them.
  TensorShapeVector fast_shape, fast_output_shape, fast_axes;
  for (const RkrCase& c : cases) {
    const FastReduceKind fast_kind = OptimizeShapeForFastReduce(
        c.input_shape, c.reduced_axes,
        fast_shape, fast_output_shape, fast_axes, c.keep_dims);
    ASSERT_EQ(fast_kind, FastReduceKind::kRKR);
    ASSERT_EQ(fast_shape, c.expected_fast_shape);
    ASSERT_EQ(fast_output_shape, c.expected_fast_output_shape);
    ASSERT_EQ(fast_axes, expected_fast_axes);
  }
}
|
||||
|
||||
TEST(ReductionOpTest, OptimizeShapeForFastReduce_NONE) {
|
||||
FastReduceKind fast_kind;
|
||||
TensorShapeVector fast_shape, fast_output_shape, fast_axes;
|
||||
|
|
@ -3427,6 +3517,53 @@ TEST(ReductionOpTest, ReduceMax_KRK_keepdims) {
|
|||
test.Run();
|
||||
}
|
||||
|
||||
// ReduceMax over axes {0, 2} of a 3x2x2 tensor, dropping the reduced dimensions:
// one maximum per index of the middle axis.
TEST(ReductionOpTest, ReduceMax_RKR) {
  const std::vector<int64_t> reduced_axes{0, 2};

  OpTester test("ReduceMax");
  test.AddAttribute("axes", reduced_axes);
  test.AddAttribute("keepdims", static_cast<int64_t>(0));
  test.AddInput<float>("data", {3, 2, 2},
                       {1.0f, 2.0f, 3.0f, 4.0f,
                        5.0f, 6.0f, 7.0f, 8.0f,
                        9.0f, 10.0f, 11.0f, 12.0f});
  test.AddOutput<float>("reduced", {2}, {10.f, 12.f});
  test.Run();
}
|
||||
|
||||
// ReduceMax over axes {0, 2} of a 2x16x2 tensor holding 0..63; the kept axis has 16
// entries, and entry k is expected to be 33 + 2k.
TEST(ReductionOpTest, ReduceMax_RKR_parallel) {
  const std::vector<int64_t> reduced_axes{0, 2};

  OpTester test("ReduceMax");
  test.AddAttribute("axes", reduced_axes);
  test.AddAttribute("keepdims", static_cast<int64_t>(0));
  test.AddInput<float>(
      "data", {2, 16, 2},
      {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f,
       16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f, 31.0f,
       32.0f, 33.0f, 34.0f, 35.0f, 36.0f, 37.0f, 38.0f, 39.0f, 40.0f, 41.0f, 42.0f, 43.0f, 44.0f, 45.0f, 46.0f, 47.0f,
       48.0f, 49.0f, 50.0f, 51.0f, 52.0f, 53.0f, 54.0f, 55.0f, 56.0f, 57.0f, 58.0f, 59.0f, 60.0f, 61.0f, 62.0f, 63.0f});
  test.AddOutput<float>(
      "reduced", {16},
      {33.0f, 35.0f, 37.0f, 39.0f, 41.0f, 43.0f, 45.0f, 47.0f,
       49.0f, 51.0f, 53.0f, 55.0f, 57.0f, 59.0f, 61.0f, 63.0f});
  test.Run();
}
|
||||
|
||||
// ReduceMax over axes {0, 2} of a 3x2x2 tensor with keepdims=1: the reduced axes are
// retained with extent 1, giving a {1, 2, 1} output.
TEST(ReductionOpTest, ReduceMax_RKR_keepdims) {
  const std::vector<int64_t> reduced_axes{0, 2};

  OpTester test("ReduceMax");
  test.AddAttribute("axes", reduced_axes);
  test.AddAttribute("keepdims", static_cast<int64_t>(1));
  test.AddInput<float>("data", {3, 2, 2},
                       {1.0f, 2.0f, 3.0f, 4.0f,
                        5.0f, 6.0f, 7.0f, 8.0f,
                        9.0f, 10.0f, 11.0f, 12.0f});
  test.AddOutput<float>("reduced", {1, 2, 1}, {10.f, 12.f});
  test.Run();
}
|
||||
|
||||
TEST(ReductionOpTest, ReduceMax_RKRK) {
|
||||
OpTester test("ReduceMax");
|
||||
test.AddAttribute("axes", std::vector<int64_t>{0, 2});
|
||||
|
|
@ -3581,6 +3718,53 @@ TEST(ReductionOpTest, ReduceMean_KRK_keepdims) {
|
|||
test.Run();
|
||||
}
|
||||
|
||||
// ReduceMean over axes {0, 2} of a 3x2x2 tensor, dropping the reduced dimensions:
// one mean (over six elements) per index of the middle axis.
TEST(ReductionOpTest, ReduceMean_RKR) {
  const std::vector<int64_t> reduced_axes{0, 2};

  OpTester test("ReduceMean");
  test.AddAttribute("axes", reduced_axes);
  test.AddAttribute("keepdims", static_cast<int64_t>(0));
  test.AddInput<float>("data", {3, 2, 2},
                       {1.0f, 2.0f, 3.0f, 4.0f,
                        5.0f, 6.0f, 7.0f, 8.0f,
                        9.0f, 10.0f, 11.0f, 12.0f});
  test.AddOutput<float>("reduced", {2}, {5.5f, 7.5f});
  test.Run();
}
|
||||
|
||||
// ReduceMean over axes {0, 2} of a 2x16x2 tensor holding 0..63; the kept axis has 16
// entries, and entry k is expected to be 16.5 + 2k.
TEST(ReductionOpTest, ReduceMean_RKR_parallel) {
  const std::vector<int64_t> reduced_axes{0, 2};

  OpTester test("ReduceMean");
  test.AddAttribute("axes", reduced_axes);
  test.AddAttribute("keepdims", static_cast<int64_t>(0));
  test.AddInput<float>(
      "data", {2, 16, 2},
      {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f,
       16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f, 31.0f,
       32.0f, 33.0f, 34.0f, 35.0f, 36.0f, 37.0f, 38.0f, 39.0f, 40.0f, 41.0f, 42.0f, 43.0f, 44.0f, 45.0f, 46.0f, 47.0f,
       48.0f, 49.0f, 50.0f, 51.0f, 52.0f, 53.0f, 54.0f, 55.0f, 56.0f, 57.0f, 58.0f, 59.0f, 60.0f, 61.0f, 62.0f, 63.0f});
  test.AddOutput<float>(
      "reduced", {16},
      {16.5f, 18.5f, 20.5f, 22.5f, 24.5f, 26.5f, 28.5f, 30.5f,
       32.5f, 34.5f, 36.5f, 38.5f, 40.5f, 42.5f, 44.5f, 46.5f});
  test.Run();
}
|
||||
|
||||
// ReduceMean over axes {0, 2} of a 3x2x2 tensor with keepdims=1: the reduced axes are
// retained with extent 1, giving a {1, 2, 1} output.
TEST(ReductionOpTest, ReduceMean_RKR_keepdims) {
  const std::vector<int64_t> reduced_axes{0, 2};

  OpTester test("ReduceMean");
  test.AddAttribute("axes", reduced_axes);
  test.AddAttribute("keepdims", static_cast<int64_t>(1));
  test.AddInput<float>("data", {3, 2, 2},
                       {1.0f, 2.0f, 3.0f, 4.0f,
                        5.0f, 6.0f, 7.0f, 8.0f,
                        9.0f, 10.0f, 11.0f, 12.0f});
  test.AddOutput<float>("reduced", {1, 2, 1}, {5.5f, 7.5f});
  test.Run();
}
|
||||
|
||||
TEST(ReductionOpTest, ReduceMean_RKRK) {
|
||||
OpTester test("ReduceMean");
|
||||
test.AddAttribute("axes", std::vector<int64_t>{0, 2});
|
||||
|
|
@ -3771,6 +3955,53 @@ TEST(ReductionOpTest, ReduceMin_KRK_keepdims) {
|
|||
test.Run();
|
||||
}
|
||||
|
||||
// ReduceMin over axes 0 and 2 of a {3, 2, 2} tensor with keepdims=0:
// only the middle axis survives, so the result has shape {2}.
TEST(ReductionOpTest, ReduceMin_RKR) {
  OpTester tester("ReduceMin");
  tester.AddAttribute("axes", std::vector<int64_t>{0, 2});
  tester.AddAttribute("keepdims", static_cast<int64_t>(0));

  // One row per slice along axis 0.
  const std::vector<float> input_values{11.0f, 12.0f, 13.0f, 14.0f,
                                        15.0f, 16.0f, 17.0f, 18.0f,
                                        19.0f, 20.0f, 21.0f, 22.0f};
  tester.AddInput<float>("data", {3, 2, 2}, input_values);

  // reduced[j] is the smallest data[i][j][k] over every i and k.
  const std::vector<float> expected_minima{11.0f, 13.0f};
  tester.AddOutput<float>("reduced", {2}, expected_minima);

  tester.Run();
}
|
||||
|
||||
// ReduceMin over axes 0 and 2 of a {2, 16, 2} tensor with keepdims=0.
// NOTE(review): the 16-wide kept axis is presumably meant to let the kernel
// split work across threads -- confirm against the implementation.
TEST(ReductionOpTest, ReduceMin_RKR_parallel) {
  OpTester tester("ReduceMin");
  tester.AddAttribute("axes", std::vector<int64_t>{0, 2});
  tester.AddAttribute("keepdims", static_cast<int64_t>(0));

  // Input holds the values 0, 1, ..., 63 in row-major order.
  std::vector<float> input_values(64);
  for (size_t idx = 0; idx < input_values.size(); ++idx) {
    input_values[idx] = static_cast<float>(idx);
  }
  tester.AddInput<float>("data", {2, 16, 2}, input_values);

  // reduced[j] is the smallest data[i][j][k] over every i and k.
  const std::vector<float> expected_minima{0.0f, 2.0f, 4.0f, 6.0f,
                                           8.0f, 10.0f, 12.0f, 14.0f,
                                           16.0f, 18.0f, 20.0f, 22.0f,
                                           24.0f, 26.0f, 28.0f, 30.0f};
  tester.AddOutput<float>("reduced", {16}, expected_minima);

  tester.Run();
}
|
||||
|
||||
// ReduceMin over axes 0 and 2 of a {3, 2, 2} tensor with keepdims=1:
// the reduced axes stay in the output with extent 1, giving shape {1, 2, 1}.
TEST(ReductionOpTest, ReduceMin_RKR_keepdims) {
  OpTester tester("ReduceMin");
  tester.AddAttribute("axes", std::vector<int64_t>{0, 2});
  tester.AddAttribute("keepdims", static_cast<int64_t>(1));

  // One row per slice along axis 0.
  const std::vector<float> input_values{11.0f, 12.0f, 13.0f, 14.0f,
                                        15.0f, 16.0f, 17.0f, 18.0f,
                                        19.0f, 20.0f, 21.0f, 22.0f};
  tester.AddInput<float>("data", {3, 2, 2}, input_values);

  // reduced[0][j][0] is the smallest data[i][j][k] over every i and k.
  const std::vector<float> expected_minima{11.0f, 13.0f};
  tester.AddOutput<float>("reduced", {1, 2, 1}, expected_minima);

  tester.Run();
}
|
||||
|
||||
TEST(ReductionOpTest, ReduceMin_RKRK) {
|
||||
OpTester test("ReduceMin");
|
||||
test.AddAttribute("axes", std::vector<int64_t>{0, 2});
|
||||
|
|
@ -4102,6 +4333,126 @@ TEST(ReductionOpTest, ReduceSum_KRK2_keepdims) {
|
|||
test.Run();
|
||||
}
|
||||
|
||||
// ReduceSum over axes 0 and 2 of a {3, 2, 2} tensor with keepdims=0:
// only the middle axis survives, so the result has shape {2}.
TEST(ReductionOpTest, ReduceSum_RKR) {
  OpTester tester("ReduceSum");
  tester.AddAttribute("axes", std::vector<int64_t>{0, 2});
  tester.AddAttribute("keepdims", static_cast<int64_t>(0));

  // One row per slice along axis 0.
  const std::vector<float> input_values{1.0f, 2.0f, 3.0f, 4.0f,
                                        5.0f, 6.0f, 7.0f, 8.0f,
                                        9.0f, 10.0f, 11.0f, 12.0f};
  tester.AddInput<float>("data", {3, 2, 2}, input_values);

  // reduced[j] is the sum of data[i][j][k] over every i and k.
  const std::vector<float> expected_sums{33.0f, 45.0f};
  tester.AddOutput<float>("reduced", {2}, expected_sums);

  tester.Run();
}
|
||||
|
||||
// ReduceSum over axes 0 and 2 of a {2, 16, 2} tensor with keepdims=0.
// NOTE(review): the 16-wide kept axis is presumably meant to let the kernel
// split work across threads -- confirm against the implementation.
TEST(ReductionOpTest, ReduceSum_RKR_parallel) {
  OpTester tester("ReduceSum");
  tester.AddAttribute("axes", std::vector<int64_t>{0, 2});
  tester.AddAttribute("keepdims", static_cast<int64_t>(0));

  // Input holds the values 0, 1, ..., 63 in row-major order.
  std::vector<float> input_values(64);
  for (size_t idx = 0; idx < input_values.size(); ++idx) {
    input_values[idx] = static_cast<float>(idx);
  }
  tester.AddInput<float>("data", {2, 16, 2}, input_values);

  // reduced[j] is the sum of data[i][j][k] over every i and k.
  const std::vector<float> expected_sums{66.0f, 74.0f, 82.0f, 90.0f,
                                         98.0f, 106.0f, 114.0f, 122.0f,
                                         130.0f, 138.0f, 146.0f, 154.0f,
                                         162.0f, 170.0f, 178.0f, 186.0f};
  tester.AddOutput<float>("reduced", {16}, expected_sums);

  tester.Run();
}
|
||||
|
||||
// ReduceSum over axes 0 and 2 of a {2, 128, 2} tensor with keepdims=0.
// Both the input and the reference output are computed in the test rather
// than spelled out as literals.
TEST(ReductionOpTest, ReduceSum_RKR_parallel_bigger) {
  OpTester tester("ReduceSum");
  tester.AddAttribute("axes", std::vector<int64_t>{0, 2});
  tester.AddAttribute("keepdims", static_cast<int64_t>(0));

  // Input holds the values 0, 1, ..., 511 in row-major order.
  std::vector<float> input_values(512);
  for (size_t idx = 0; idx < input_values.size(); ++idx) {
    input_values[idx] = static_cast<float>(idx);
  }
  tester.AddInput<float>("data", {2, 128, 2}, input_values);

  // Reference result: for each kept index, add up the elements over the
  // outer axis (stride 256) and the inner axis (stride 1).
  std::vector<float> expected_sums(128);
  for (size_t kept = 0; kept < 128; ++kept) {
    float total = 0.0f;
    for (size_t outer = 0; outer < 2; ++outer) {
      for (size_t inner = 0; inner < 2; ++inner) {
        total += input_values[outer * 256 + kept * 2 + inner];
      }
    }
    expected_sums[kept] = total;
  }
  tester.AddOutput<float>("reduced", {128}, expected_sums);

  tester.Run();
}
|
||||
|
||||
// ReduceSum over axes 0 and 2 of a {3, 2, 2} tensor with keepdims=1:
// the reduced axes stay in the output with extent 1, giving shape {1, 2, 1}.
TEST(ReductionOpTest, ReduceSum_RKR_keepdims) {
  OpTester tester("ReduceSum");
  tester.AddAttribute("axes", std::vector<int64_t>{0, 2});
  tester.AddAttribute("keepdims", static_cast<int64_t>(1));

  // One row per slice along axis 0.
  const std::vector<float> input_values{1.0f, 2.0f, 3.0f, 4.0f,
                                        5.0f, 6.0f, 7.0f, 8.0f,
                                        9.0f, 10.0f, 11.0f, 12.0f};
  tester.AddInput<float>("data", {3, 2, 2}, input_values);

  // reduced[0][j][0] is the sum of data[i][j][k] over every i and k.
  const std::vector<float> expected_sums{33.0f, 45.0f};
  tester.AddOutput<float>("reduced", {1, 2, 1}, expected_sums);

  tester.Run();
}
|
||||
|
||||
// ReduceSum over axes 0 and 3 of a {3, 2, 2, 2} tensor with keepdims=0:
// the two adjacent middle axes are kept, so the result has shape {2, 2}.
TEST(ReductionOpTest, ReduceSum_RKR2) {
  OpTester tester("ReduceSum");
  tester.AddAttribute("axes", std::vector<int64_t>{0, 3});
  tester.AddAttribute("keepdims", static_cast<int64_t>(0));

  // One row per slice along axis 0 (8 elements each).
  const std::vector<float> input_values{
      1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
      9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f,
      17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f};
  tester.AddInput<float>("data", {3, 2, 2, 2}, input_values);

  // reduced[j][k] is the sum of data[i][j][k][l] over every i and l.
  const std::vector<float> expected_sums{57.0f, 69.0f, 81.0f, 93.0f};
  tester.AddOutput<float>("reduced", {2, 2}, expected_sums);

  tester.Run();
}
|
||||
|
||||
// ReduceSum over axes 0 and 3 of a {3, 2, 2, 2} tensor with keepdims=1:
// the reduced axes stay in the output with extent 1, giving shape
// {1, 2, 2, 1}.
TEST(ReductionOpTest, ReduceSum_RKR2_keepdims) {
  OpTester tester("ReduceSum");
  tester.AddAttribute("axes", std::vector<int64_t>{0, 3});
  tester.AddAttribute("keepdims", static_cast<int64_t>(1));

  // One row per slice along axis 0 (8 elements each).
  const std::vector<float> input_values{
      1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
      9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f,
      17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f};
  tester.AddInput<float>("data", {3, 2, 2, 2}, input_values);

  // reduced[0][j][k][0] is the sum of data[i][j][k][l] over every i and l.
  const std::vector<float> expected_sums{57.0f, 69.0f, 81.0f, 93.0f};
  tester.AddOutput<float>("reduced", {1, 2, 2, 1}, expected_sums);

  tester.Run();
}
|
||||
|
||||
TEST(ReductionOpTest, ReduceSum_RKRK) {
|
||||
OpTester test("ReduceSum");
|
||||
test.AddAttribute("axes", std::vector<int64_t>{0, 2});
|
||||
|
|
|
|||
Loading…
Reference in a new issue