Improves ReduceSum performance by removing transposition. (#5370)

* Improves ReduceSum performance
* Add min, max, L1, L2, logsum, sumsquare
* Remove all reduce implementations that include a transpose
This commit is contained in:
Xavier Dupré 2020-10-20 10:36:31 +02:00 committed by GitHub
parent 682898ae2b
commit 66c8a441e0
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 628 additions and 594 deletions

View file

@ -151,23 +151,13 @@ REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ArgMin, 1, 10);
REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ArgMin, 11, 12);
REGISTER_UNARY_ELEMENTWISE_KERNEL(ArgMin, 13);
// When all reduce axes are located at the tail of the dims, quite general cases, transpose and extra
// copy could be skipped to improve performance. If required by check_no_transpose = true, then
// the calling code will check if the data was transposed and act accordingly.
// return value: true means transposedInputData is not created/copied, input tensor data could
// be directly used as row major matrix [block_size, blocks], where blocks is the
// size of each reduce.
// `input_shape_override` overrides the shape of `input` for compute purposes.
template <typename T>
bool PrepareForReduce(const Tensor* input_tensor_ptr,
FastAllocVector<T>& transposed_input_data,
int64_t& block_size,
int64_t& blocks,
const std::vector<int64_t>& axes_,
bool keepdims_,
/*out*/ std::vector<int64_t>& reduced_dims,
bool check_no_transpose,
const TensorShape* input_shape_override) {
bool SetupForReduce(const Tensor* input_tensor_ptr,
const std::vector<int64_t>& axes_,
std::vector<int64_t>& axes,
TensorShape& new_input_shape,
std::vector<int64_t>& output_shape,
bool& empty_reduce,
const TensorShape* input_shape_override) {
ORT_ENFORCE(input_tensor_ptr != nullptr, "Input to be reduced is null");
if (input_shape_override) {
@ -175,25 +165,13 @@ bool PrepareForReduce(const Tensor* input_tensor_ptr,
"The input shape override's size does not match the input tensor's shape size");
}
const Tensor& input = *input_tensor_ptr;
const auto& input_shape = input_shape_override ? *input_shape_override : input.Shape();
size_t ndim = input_shape.NumDimensions();
// Scalar tensor
new_input_shape = input_shape_override ? *input_shape_override : input_tensor_ptr->Shape();
size_t ndim = new_input_shape.NumDimensions();
if (ndim == 0) {
if (!check_no_transpose) {
auto size = input_shape.Size();
assert(size == 1);
transposed_input_data.resize(size, 0);
T* to_data = &transposed_input_data[0];
*to_data = *input.Data<T>();
}
block_size = blocks = 1;
return true;
empty_reduce = true;
return false;
}
std::vector<int64_t> axes;
axes.reserve(axes_.size());
for (int64_t axis : axes_) {
axes.push_back(HandleNegativeAxis(axis, static_cast<int64_t>(ndim)));
@ -210,598 +188,411 @@ bool PrepareForReduce(const Tensor* input_tensor_ptr,
// If all reduced axes are located at the tail of the input shape, then the copy could be skipped if required
bool need_copy = true;
if (axes.size() <= ndim &&
if (axes.size() <= ndim && ndim > 0 &&
axes.front() == static_cast<int64_t>(ndim - axes.size()) &&
axes.back() == static_cast<int64_t>(ndim) - 1) {
need_copy = false;
}
std::vector<bool> keep_axis(ndim, true);
for (auto i : axes) {
keep_axis[i] = false;
empty_reduce = false;
output_shape = new_input_shape.GetDims();
for (auto a : axes) {
output_shape[a] = new_input_shape[a] > 0 ? 1 : 0;
empty_reduce |= output_shape[a] == 0;
}
return need_copy;
}
void NoTransposePrepareForReduce(const TensorShape& new_input_shape,
const std::vector<int64_t>& reduced_axes,
ResultsNoTransposePrepareForReduce& results) {
// Common initialisation for the indices.
std::vector<int64_t> cumulative_shape = new_input_shape.GetDims();
cumulative_shape[cumulative_shape.size() - 1] = 1;
for (int i = static_cast<int>(cumulative_shape.size()) - 2; i >= 0; --i) {
cumulative_shape[i] = cumulative_shape[i + 1] * new_input_shape[i + 1];
}
int64_t projection_size = 1;
for (auto a : reduced_axes) {
projection_size *= new_input_shape[a];
}
//transpose the input so that all to-be-reduced axes are at the head
std::vector<int64_t> transposed_axes(axes.begin(), axes.end());
for (size_t i = 0; i < ndim; ++i) {
if (keep_axis[i]) {
transposed_axes.push_back(i);
}
}
std::vector<int64_t> new_dims(transposed_axes.size());
for (size_t i = 0; i < transposed_axes.size(); ++i) {
new_dims[i] = input_shape.GetDims().at(transposed_axes[i]);
}
int num_axes = static_cast<int>(transposed_axes.size());
auto in_dims = input_shape.GetDims();
// Measure amount of contiguous data we can copy at once
int64_t blocksize = 1;
int n_shared_idxs = 0;
for (int i = num_axes - 1; i >= 0; --i) {
if (transposed_axes[i] == i) {
blocksize *= new_dims[i];
++n_shared_idxs;
} else {
int last_reduced_axis = static_cast<int>(reduced_axes.size()) - 1;
int loop_reduced_axis = 1;
results.last_loop_red_size = new_input_shape[reduced_axes[last_reduced_axis]];
results.last_loop_red_inc = cumulative_shape[reduced_axes[last_reduced_axis]];
projection_size /= new_input_shape[reduced_axes[last_reduced_axis]];
--last_reduced_axis;
while (last_reduced_axis >= 0) {
if (reduced_axes[last_reduced_axis] != reduced_axes[last_reduced_axis + 1] - 1)
break;
}
results.last_loop_red_size *= new_input_shape[reduced_axes[last_reduced_axis]];
projection_size /= new_input_shape[reduced_axes[last_reduced_axis]];
--last_reduced_axis;
++loop_reduced_axis;
}
const T* from_data = input.template Data<T>();
size_t count = input_shape.Size();
//set to-be-reduced axes to one. squeeze is keepdims_ is false
int64_t first_dim = 1;
reduced_dims.reserve(in_dims.size());
for (size_t i = 0; i < in_dims.size(); i++) {
const auto in_dim = in_dims[i];
if (keep_axis[i]) {
reduced_dims.push_back(in_dim);
} else {
first_dim *= in_dim;
if (keepdims_) {
reduced_dims.push_back(in_dim == 0 ? 0 : 1);
} else {
// as we are reducing on this axis and not keeping a dim for it, we can't drop a dim value of 0.
// e.g. if input was {3, 0, 2} and we reduced on axis 1 without keeping it, the output shape would be
// {3, 2} which is invalid given the input was empty.
// note that if we do keep the dim the output shape will have a 0 in it,
// which is still valid for an empty tensor, so allow that.
ORT_ENFORCE(in_dim != 0,
"Can't reduce on dim with value of 0 if 'keepdims' is false. "
"Invalid output shape would be produced. input_shape:",
input_shape);
}
}
}
auto num_elements = input_shape.Size();
// edge case. one or more input dims with value of 0.
if (num_elements == 0) {
block_size = blocks = 0;
return true;
}
if (0 == first_dim) {
return false;
}
block_size = num_elements / first_dim;
blocks = first_dim;
if (!need_copy && check_no_transpose) {
return true;
}
transposed_input_data.resize(input_shape.Size(), 0);
T* to_data = &transposed_input_data[0];
if (num_axes < 2 || n_shared_idxs == num_axes) {
memcpy(to_data, from_data, count * sizeof(T));
return false;
}
int itr_axes = num_axes - n_shared_idxs;
// Calculate strides
std::vector<int64_t> stride_x(itr_axes, 0);
for (size_t i = 0; static_cast<int>(i) < itr_axes; i++) {
stride_x[i] = 1;
for (size_t j = transposed_axes[i] + 1; static_cast<int>(j) < itr_axes; j++) {
stride_x[i] *= in_dims[j];
}
}
std::vector<int64_t> itr_idxs(itr_axes, 0);
// Branch here to avoid branching within the loop
if (blocksize > 1) {
for (size_t index = 0; index < (count / blocksize); index++) {
int64_t from_index = 0;
for (int i = 0; i < itr_axes; ++i) {
from_index += stride_x[i] * itr_idxs[i];
}
memcpy(
to_data + blocksize * index,
from_data + blocksize * from_index,
blocksize * sizeof(T));
++itr_idxs[itr_axes - 1];
for (int i = itr_axes - 1; i >= 1; --i) {
auto expected_dim = new_dims[i];
if (itr_idxs[i] < expected_dim) {
break;
}
itr_idxs[i] %= expected_dim;
++itr_idxs[i - 1];
}
}
// Builds the list of indices projected into the same sum.
int reduced_axes_size = static_cast<int>(reduced_axes.size()) - loop_reduced_axis;
if (reduced_axes_size == 0) {
results.projected_index.resize(1, 0);
} else {
for (size_t index = 0; index < count; index++) {
int64_t from_index = 0;
for (int i = 0; i < itr_axes; ++i) {
from_index += stride_x[i] * itr_idxs[i];
}
*(to_data + index) = *(from_data + from_index);
++itr_idxs[itr_axes - 1];
for (int i = itr_axes - 1; i >= 1; --i) {
auto expected_dim = new_dims[i];
if (itr_idxs[i] < expected_dim) {
results.projected_index.resize(projection_size);
std::vector<int64_t> projected_indices(reduced_axes_size, 0);
int64_t current_index = 0;
size_t current_pos = 0;
int j;
for (current_pos = 0; current_pos < results.projected_index.size(); ++current_pos) {
results.projected_index[current_pos] = current_index;
++projected_indices[projected_indices.size() - 1];
current_index += cumulative_shape[reduced_axes[reduced_axes_size - 1]];
for (j = reduced_axes_size - 1; j > 0; --j) {
if (projected_indices[j] < new_input_shape[reduced_axes[j]])
break;
}
itr_idxs[i] %= expected_dim;
++itr_idxs[i - 1];
projected_indices[j] -= new_input_shape[reduced_axes[j]];
current_index -= new_input_shape[reduced_axes[j]] * cumulative_shape[reduced_axes[j]];
++projected_indices[j - 1];
current_index += cumulative_shape[reduced_axes[j - 1]];
}
}
}
return false;
// Builds the list of indices for the unprojected sum.
std::vector<int64_t> unreduced_axes;
for (int64_t i = 0; i < static_cast<int64_t>(cumulative_shape.size()); ++i) {
if (std::find(reduced_axes.begin(), reduced_axes.end(), i) != reduced_axes.end())
continue;
unreduced_axes.push_back(i);
}
int64_t unprojection_size = 1;
for (auto a : unreduced_axes) {
unprojection_size *= new_input_shape[a];
}
if (unprojection_size == 0) {
return;
}
std::vector<int64_t> unprojected_indices(unreduced_axes.size(), 0);
// The last index is usually an image size.
// The last unprojected dimension is processed differently from the others.
results.last_loop_size = new_input_shape[unreduced_axes[unreduced_axes.size() - 1]];
int64_t unprojection_size_before_last = unprojection_size / results.last_loop_size;
results.unprojected_index.reserve(unprojection_size_before_last);
results.last_loop_inc = cumulative_shape[unreduced_axes[unreduced_axes.size() - 1]];
if (unprojected_indices.size() <= 1) {
results.unprojected_index.push_back(0);
} else {
int64_t current_index = 0;
int j;
for (int64_t pos = 0; pos < unprojection_size_before_last; ++pos) {
results.unprojected_index.push_back(current_index);
++unprojected_indices[unprojected_indices.size() - 2];
current_index += cumulative_shape[unreduced_axes[unreduced_axes.size() - 2]];
for (j = static_cast<int>(unreduced_axes.size()) - 2; j > 0; --j) {
if (unprojected_indices[j] < new_input_shape[unreduced_axes[j]])
break;
unprojected_indices[j] -= new_input_shape[unreduced_axes[j]];
current_index -= new_input_shape[unreduced_axes[j]] * cumulative_shape[unreduced_axes[j]];
++unprojected_indices[j - 1];
current_index += cumulative_shape[unreduced_axes[j - 1]];
}
}
}
}
// Reduces `input`, viewed with shape `new_input_shape`, over `reduced_axes` and writes
// one aggregated value per element of `output`. The input is read in place through
// precomputed offsets; no transposed copy is made.
//
// T             element type read from `input`.
// AGG           aggregator; this function uses AGG::value_type, AGG::two_loops(),
//               the (count, first_value) constructor, update0(), update(),
//               get_value() and aggall().
// output        destination tensor, already allocated with the reduced shape.
// reduced_axes  axes to reduce. An empty list, or one naming every dimension,
//               reduces the whole input to a single value.
// tp            thread pool handed to ThreadPool::TryParallelFor.
// last_results  offset tables; recomputed by NoTransposePrepareForReduce when
//               last_results.equal(...) reports they do not match this shape/axes.
template <typename T, typename AGG>
void NoTransposeReduce(Tensor* output, const TensorShape& new_input_shape, const Tensor& input,
                       const std::vector<int64_t>& reduced_axes, concurrency::ThreadPool* tp,
                       ResultsNoTransposePrepareForReduce& last_results) {
  const auto& output_shape = output->Shape();
  const T* from_data = input.template Data<T>();
  typename AGG::value_type* to_data = output->template MutableData<typename AGG::value_type>();
  const int64_t count = output_shape.Size();

  // Full reduction: the aggregator consumes the whole buffer in one call.
  if (reduced_axes.empty() || reduced_axes.size() == new_input_shape.NumDimensions()) {
    ORT_ENFORCE(count == 1, "Reduction on all axes, output size should be 1.");
    const int64_t input_size = new_input_shape.Size();
    to_data[0] = AGG(input_size, from_data[0]).aggall(from_data);
    return;
  }

  if (!last_results.equal(new_input_shape.GetDims(), reduced_axes)) {
    NoTransposePrepareForReduce(new_input_shape, reduced_axes, last_results);
  }

  // This guard must also run when the tables were reused: `last_loop_size` is used as a
  // divisor below and `projected_index[0]` is read for every output element.
  if (last_results.last_loop_red_size == 0 || last_results.last_loop_size == 0 ||
      last_results.projected_index.empty()) {
    return;
  }

  // Number of input values folded into each output value.
  const int64_t denominator =
      last_results.last_loop_red_size * static_cast<int64_t>(last_results.projected_index.size());

  auto fn = [&](std::ptrdiff_t first, std::ptrdiff_t end) {
    // Distance, in elements, covered by one walk along the innermost reduced run.
    const int64_t red_span = last_results.last_loop_red_size * last_results.last_loop_red_inc;
    int64_t current_index = first * last_results.last_loop_size;
    for (int64_t main_index = first; main_index < end; ++main_index) {
      for (int64_t loop = 0; loop < last_results.last_loop_size; ++loop, ++current_index) {
        const int64_t origin = last_results.unprojected_index[main_index] + loop * last_results.last_loop_inc;
        AGG accumulator(denominator, from_data[origin + last_results.projected_index[0]]);

        // Aggregators that report two_loops() get a preliminary update0() pass over the
        // same values before the update() pass.
        if (AGG::two_loops()) {
          for (const auto offset : last_results.projected_index) {
            const T* red_ptr = from_data + (origin + offset);
            const T* const red_end = red_ptr + red_span;
            for (; red_ptr != red_end; red_ptr += last_results.last_loop_red_inc) {
              accumulator.update0(*red_ptr);
            }
          }
        }

        for (const auto offset : last_results.projected_index) {
          const T* red_ptr = from_data + (origin + offset);
          const T* const red_end = red_ptr + red_span;
          for (; red_ptr != red_end; red_ptr += last_results.last_loop_red_inc) {
            accumulator.update(*red_ptr);
          }
        }

        to_data[current_index] = accumulator.get_value();
      }
    }
  };

  // The compute estimate doubles when the aggregator makes two passes over the data.
  const double passes = AGG::two_loops() ? 2.0 : 1.0;
  const auto cost = TensorOpCost{
      static_cast<double>(last_results.projected_index.size() * sizeof(T) * last_results.last_loop_size * last_results.last_loop_red_size),
      static_cast<double>(last_results.last_loop_size) * last_results.last_loop_red_size,
      static_cast<double>(last_results.projected_index.size()) * last_results.last_loop_size * last_results.last_loop_red_size * passes};
  concurrency::ThreadPool::TryParallelFor(tp, count / last_results.last_loop_size, cost, fn);
}
// Appends to `dropped_axes` every entry of `input_shape` whose position is not listed
// in `axes`, preserving order. `dropped_axes` is appended to, not cleared.
// Positions in `axes` must be valid indices into `input_shape`.
// An entry equal to -1 is treated as dropped, because -1 is the internal marker.
void DropDimensions(const std::vector<int64_t>& input_shape, const std::vector<int64_t>& axes, std::vector<int64_t>& dropped_axes) {
  constexpr int64_t kDroppedMarker = -1;

  std::vector<int64_t> marked(input_shape);
  for (const int64_t axis : axes) {
    marked[axis] = kDroppedMarker;
  }

  for (const int64_t dim : marked) {
    if (dim == kDroppedMarker) {
      continue;
    }
    dropped_axes.push_back(dim);
  }
}
// Shared driver for the reduction kernels: resolves the axes, allocates output 0 on
// `ctx` and fills it using aggregator AGG.
//
// ctx                   kernel context; input 0 is the data tensor. When the kernel has
//                       exactly two inputs, input 1 is a 1-D int64 tensor of axes and
//                       is used instead of `axes_`.
// axes_                 axes to reduce when no axes input is present.
// keepdims_             non-zero keeps reduced dimensions in the output shape;
//                       zero drops them.
// last_results          offset tables forwarded to NoTransposeReduce.
// noop_with_empty_axes  when true and the axes input is empty, the input bytes are
//                       copied to the output unchanged.
template <typename T, typename AGG>
void CommonReduce(OpKernelContext* ctx,
                  const std::vector<int64_t> axes_, int64_t keepdims_,
                  ResultsNoTransposePrepareForReduce& last_results,
                  bool noop_with_empty_axes) {
  const Tensor* input = ctx->Input<Tensor>(0);
  // Checked here because the no-op path below dereferences `input` before SetupForReduce.
  ORT_ENFORCE(input != nullptr, "Input to be reduced is null");

  std::vector<int64_t> axes;
  std::vector<int64_t> output_shape;
  TensorShape new_input_shape;
  bool empty_reduce = false;

  if (ctx->InputCount() == 2) {
    // second input holds the axes.
    const Tensor* axes_tensor = ctx->Input<Tensor>(1);
    ORT_ENFORCE(axes_tensor != nullptr, "Axes input is null");
    ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1,
                "An axes tensor must be a vector tensor.");
    const auto nDims = static_cast<size_t>(axes_tensor->Shape()[0]);
    const auto* data = axes_tensor->template Data<int64_t>();
    std::vector<int64_t> input_axes(data, data + nDims);
    if (input_axes.empty() && noop_with_empty_axes) {
      // NOTE(review): copies input->SizeInBytes() bytes, which presumes
      // AGG::value_type and T have the same size; confirm for every caller that
      // enables noop_with_empty_axes.
      auto* output = ctx->Output(0, input->Shape());
      memcpy(output->template MutableData<typename AGG::value_type>(), input->template Data<T>(), input->SizeInBytes());
      return;
    }
    SetupForReduce(input, input_axes, axes, new_input_shape, output_shape, empty_reduce, nullptr);
  } else {
    SetupForReduce(input, axes_, axes, new_input_shape, output_shape, empty_reduce, nullptr);
  }

  if (empty_reduce) {
    Tensor* output = ctx->Output(0, keepdims_ ? output_shape : std::vector<int64_t>());
    if (new_input_shape.Size() == 1) {
      // Single-element input: run the aggregator over that one value.
      const T* from_data = input->template Data<T>();
      typename AGG::value_type* to_data = output->template MutableData<typename AGG::value_type>();
      AGG agg(1, *from_data);
      if (agg.two_loops()) {
        agg.update0(*from_data);
      }
      agg.update(*from_data);
      *to_data = agg.get_value();
    } else {
      ORT_ENFORCE(keepdims_,
                  "Can't reduce on dim with value of 0 if 'keepdims' is false. "
                  "Invalid output shape would be produced. input_shape:",
                  input->Shape());
    }
    return;
  }

  Tensor* output;
  if (keepdims_) {
    output = ctx->Output(0, output_shape);
  } else {
    // Remove the reduced axes from the output shape.
    std::vector<int64_t> dropped_axes;
    DropDimensions(output_shape, axes, dropped_axes);
    output = ctx->Output(0, dropped_axes);
  }
  NoTransposeReduce<T, AGG>(output, new_input_shape, *input, axes, ctx->GetOperatorThreadPool(), last_results);
}
template <typename T>
Status ReduceL1<T>::Compute(OpKernelContext* ctx) const {
FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
int64_t block_size;
int64_t blocks;
std::vector<int64_t> reduced_dims;
const Tensor* input = ctx->Input<Tensor>(0);
bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true);
Tensor* reduced = ctx->Output(0, reduced_dims);
T* output_data = reduced->template MutableData<T>();
if (no_transpose) {
const T* input_data = input->template Data<T>();
for (int64_t i = 0; i < block_size; ++i) {
output_data[i] = ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).cwiseAbs().sum();
}
} else {
EigenVectorMap<T> out_vec(output_data, block_size);
out_vec = ConstEigenMatrixMap<T>(&transposed_input_data[0], block_size, blocks).cwiseAbs().rowwise().sum();
}
// The following variable does not change if the input tensor and the
// axes do not either. It could be either cached in ctx or precomputed
// in the constructor if shape and axes are known at this stage.
ResultsNoTransposePrepareForReduce last_results;
CommonReduce<T, ReduceAggregatorL1<T>>(ctx, axes_, keepdims_, last_results);
return Status::OK();
}
template <typename T>
Status ReduceL2<T>::Compute(OpKernelContext* ctx) const {
FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
int64_t block_size;
int64_t blocks;
std::vector<int64_t> reduced_dims;
const Tensor* input = ctx->Input<Tensor>(0);
bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true);
Tensor* reduced = ctx->Output(0, reduced_dims);
T* output_data = reduced->template MutableData<T>();
if (no_transpose) {
const T* input_data = input->template Data<T>();
for (int64_t i = 0; i < block_size; ++i) {
output_data[i] = ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).norm();
}
} else {
EigenVectorMap<T> out_vec(output_data, block_size);
out_vec = ConstEigenMatrixMap<T>(&transposed_input_data[0], block_size, blocks).rowwise().norm();
}
ResultsNoTransposePrepareForReduce last_results;
CommonReduce<T, ReduceAggregatorL2<T>>(ctx, axes_, keepdims_, last_results);
return Status::OK();
}
template <typename T>
Status ReduceLogSum<T>::Compute(OpKernelContext* ctx) const {
FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
int64_t block_size;
int64_t blocks;
std::vector<int64_t> reduced_dims;
const Tensor* input = ctx->Input<Tensor>(0);
bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true);
Tensor* reduced = ctx->Output(0, reduced_dims);
T* output_data = reduced->template MutableData<T>();
if (no_transpose) {
const T* input_data = input->template Data<T>();
for (int64_t i = 0; i < block_size; ++i) {
output_data[i] = ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).sum();
}
} else {
EigenVectorMap<T> out_vec(output_data, block_size);
out_vec = ConstEigenMatrixMap<T>(&transposed_input_data[0], block_size, blocks).rowwise().sum();
}
for (int j = 0; j < block_size; ++j) {
*(output_data) = static_cast<T>(std::log(*(output_data)));
++output_data;
}
ResultsNoTransposePrepareForReduce last_results;
CommonReduce<T, ReduceAggregatorLogSum<T>>(ctx, axes_, keepdims_, last_results);
return Status::OK();
}
template <typename T>
Status ReduceLogSumExp<T>::Compute(OpKernelContext* ctx) const {
FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
int64_t block_size;
int64_t blocks;
std::vector<int64_t> reduced_dims;
const Tensor* input = ctx->Input<Tensor>(0);
PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims);
Tensor* reduced = ctx->Output(0, reduced_dims);
T* output_data = reduced->template MutableData<T>();
for (int j = 0; j < block_size; ++j) {
T max_value = std::numeric_limits<T>::lowest();
for (int i = 0; i < blocks; ++i) {
max_value = std::max(max_value, transposed_input_data[i * block_size + j]);
}
T scaled_exp_sum = 0;
for (int i = 0; i < blocks; ++i) {
scaled_exp_sum += static_cast<T>(std::exp(transposed_input_data[i * block_size + j] - max_value));
}
*(output_data++) = static_cast<T>(std::log(scaled_exp_sum) + max_value);
}
ResultsNoTransposePrepareForReduce last_results;
CommonReduce<T, ReduceAggregatorLogSumExp<T>>(ctx, axes_, keepdims_, last_results);
return Status::OK();
}
template <typename T>
Status ReduceMax<T>::Compute(OpKernelContext* ctx) const {
FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
int64_t block_size;
int64_t blocks;
std::vector<int64_t> reduced_dims;
const Tensor* input = ctx->Input<Tensor>(0);
bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true);
Tensor* reduced = ctx->Output(0, reduced_dims);
T* output_data = reduced->template MutableData<T>();
if (no_transpose) {
const T* input_data = input->template Data<T>();
for (int64_t i = 0; i < block_size; ++i) {
output_data[i] = ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).maxCoeff();
}
} else {
EigenVectorMap<T> out_vec(output_data, block_size);
out_vec = ConstEigenMatrixMap<T>(&transposed_input_data[0], block_size, blocks).rowwise().maxCoeff();
}
ResultsNoTransposePrepareForReduce last_results;
CommonReduce<T, ReduceAggregatorMax<T>>(ctx, axes_, keepdims_, last_results);
return Status::OK();
}
template <typename T>
Status ReduceMean<T>::Compute(OpKernelContext* ctx) const {
FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
int64_t block_size;
int64_t blocks;
std::vector<int64_t> reduced_dims;
const Tensor* input = ctx->Input<Tensor>(0);
bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true);
Tensor* reduced = ctx->Output(0, reduced_dims);
T* output_data = reduced->template MutableData<T>();
if (no_transpose) {
const T* input_data = ctx->Input<Tensor>(0)->template Data<T>();
auto lambda = [input_data, blocks, output_data](ptrdiff_t i) {
output_data[i] = ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).mean();
};
concurrency::ThreadPool::TryBatchParallelFor(ctx->GetOperatorThreadPool(), block_size, lambda, 0);
} else {
EigenVectorMap<T> out_vec(output_data, block_size);
out_vec = ConstEigenMatrixMap<T>(&transposed_input_data[0], block_size, blocks).rowwise().mean();
}
ResultsNoTransposePrepareForReduce last_results;
CommonReduce<T, ReduceAggregatorMean<T>>(ctx, axes_, keepdims_, last_results);
return Status::OK();
}
template <typename T>
Status ReduceMin<T>::Compute(OpKernelContext* ctx) const {
FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
int64_t block_size;
int64_t blocks;
std::vector<int64_t> reduced_dims;
const Tensor* input = ctx->Input<Tensor>(0);
bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true);
Tensor* reduced = ctx->Output(0, reduced_dims);
T* output_data = reduced->template MutableData<T>();
if (no_transpose) {
const T* input_data = input->template Data<T>();
for (int64_t i = 0; i < block_size; ++i) {
output_data[i] = ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).minCoeff();
}
} else {
EigenVectorMap<T> out_vec(output_data, block_size);
out_vec = ConstEigenMatrixMap<T>(&transposed_input_data[0], block_size, blocks).rowwise().minCoeff();
}
ResultsNoTransposePrepareForReduce last_results;
CommonReduce<T, ReduceAggregatorMin<T>>(ctx, axes_, keepdims_, last_results);
return Status::OK();
}
template <typename T>
Status ReduceProd<T>::Compute(OpKernelContext* ctx) const {
FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
int64_t block_size;
int64_t blocks;
std::vector<int64_t> reduced_dims;
const Tensor* input = ctx->Input<Tensor>(0);
bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true);
Tensor* reduced = ctx->Output(0, reduced_dims);
T* output_data = reduced->template MutableData<T>();
if (no_transpose) {
const T* input_data = input->template Data<T>();
for (int64_t i = 0; i < block_size; ++i) {
output_data[i] = ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).prod();
}
} else {
EigenVectorMap<T> out_vec(output_data, block_size);
out_vec = ConstEigenMatrixMap<T>(&transposed_input_data[0], block_size, blocks).rowwise().prod();
}
ResultsNoTransposePrepareForReduce last_results;
CommonReduce<T, ReduceAggregatorProd<T>>(ctx, axes_, keepdims_, last_results);
return Status::OK();
}
template <typename T>
void ReduceSumCore(const T* input_data, T* output_data, bool no_transpose,
int64_t blocks, int64_t block_size, FastAllocVector<T>& transposed_input_data,
concurrency::ThreadPool* tp) {
if (no_transpose) {
auto lambda = [input_data, blocks, output_data](ptrdiff_t i) {
// The ConstEigenMatrixMap type is expanded to work around a MS compiler issue
output_data[i] = Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(input_data + (i * blocks), blocks).sum();
};
concurrency::ThreadPool::TryBatchParallelFor(tp, block_size, lambda, 0);
} else {
EigenVectorMap<T> out_vec(output_data, block_size);
out_vec = ConstEigenMatrixMap<T>(&transposed_input_data[0], block_size, blocks).rowwise().sum();
}
Status ReduceSum<T>::Compute(OpKernelContext* ctx) const {
ResultsNoTransposePrepareForReduce last_results;
CommonReduce<T, ReduceAggregatorSum<T>>(ctx, axes_, keepdims_, last_results);
return Status::OK();
}
template <typename T>
Tensor ReduceSum<T>::Impl(const Tensor& input, const std::vector<int64_t>& reduce_axes,
AllocatorPtr allocator, concurrency::ThreadPool* tp, bool keep_dims,
const TensorShape* input_shape_override) {
FastAllocVector<T> transposed_input_data(allocator);
int64_t block_size;
int64_t blocks;
std::vector<int64_t> reduced_dims;
std::vector<int64_t> axes;
auto reduced_dims = input.Shape().GetDims();
std::vector<int64_t> output_shape;
TensorShape new_input_shape;
bool empty_reduce;
SetupForReduce(&input, reduce_axes, axes, new_input_shape, output_shape, empty_reduce, input_shape_override);
bool no_transpose = PrepareForReduce<T>(&input, transposed_input_data, block_size, blocks,
reduce_axes, keep_dims, reduced_dims, true, input_shape_override);
if (empty_reduce) {
Tensor output(input.DataType(), keep_dims ? output_shape : std::vector<int64_t>(), allocator);
if (new_input_shape.Size() == 1) {
const T* from_data = input.template Data<T>();
T* to_data = output.template MutableData<T>();
*to_data = *from_data;
} else {
ORT_ENFORCE(keep_dims,
"Can't reduce on dim with value of 0 if 'keepdims' is false. "
"Invalid output shape would be produced. input_shape:",
new_input_shape);
}
return output;
}
Tensor output(input.DataType(), reduced_dims, allocator);
ReduceSumCore(input.template Data<T>(), output.template MutableData<T>(),
no_transpose, blocks, block_size, transposed_input_data, tp);
return output;
}
template <typename T>
Status ReduceSum<T>::Compute(OpKernelContext* ctx) const {
FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
int64_t block_size;
int64_t blocks;
std::vector<int64_t> reduced_dims;
const Tensor* input = ctx->Input<Tensor>(0);
bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true);
auto* output = ctx->Output(0, reduced_dims);
ReduceSumCore(input->template Data<T>(), output->template MutableData<T>(),
no_transpose, blocks, block_size, transposed_input_data, ctx->GetOperatorThreadPool());
return Status::OK();
if (keep_dims) {
ResultsNoTransposePrepareForReduce last_results;
Tensor output(input.DataType(), output_shape, allocator);
NoTransposeReduce<T, ReduceAggregatorSum<T>>(&output, new_input_shape, input, axes, tp, last_results);
return output;
} else {
ResultsNoTransposePrepareForReduce last_results;
std::vector<int64_t> dropped_axes;
DropDimensions(output_shape, axes, dropped_axes);
Tensor output(input.DataType(), dropped_axes, allocator);
NoTransposeReduce<T, ReduceAggregatorSum<T>>(&output, new_input_shape, input, axes, tp, last_results);
return output;
}
}
template <typename T>
Status ReduceSumSquare<T>::Compute(OpKernelContext* ctx) const {
FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
int64_t block_size;
int64_t blocks;
std::vector<int64_t> reduced_dims;
const Tensor* input = ctx->Input<Tensor>(0);
bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true);
Tensor* reduced = ctx->Output(0, reduced_dims);
T* output_data = reduced->template MutableData<T>();
if (no_transpose) {
const T* input_data = input->template Data<T>();
for (int64_t i = 0; i < block_size; ++i) {
output_data[i] = ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).squaredNorm();
}
} else {
EigenVectorMap<T> out_vec(output_data, block_size);
out_vec = ConstEigenMatrixMap<T>(&transposed_input_data[0], block_size, blocks).rowwise().squaredNorm();
}
ResultsNoTransposePrepareForReduce last_results;
CommonReduce<T, ReduceAggregatorSumSquare<T>>(ctx, axes_, keepdims_, last_results);
return Status::OK();
}
// Computes ArgMax for the kernel's configured axes by delegating to CommonReduce.
//
// The aggregator is chosen from select_last_index_:
//   - ReduceAggregatorArgMaxLastIndex updates on `>=`, so ties resolve to the last index;
//   - ReduceAggregatorArgMax updates on `>`, so ties resolve to the first index.
//
// CommonReduce is the only reduction pass. The hand-written PrepareForReduce pass that
// used to precede it is gone: it produced no result when the input needed a transpose,
// and otherwise wrote through an output pointer that had already been advanced past the
// end of the output buffer.
//
// Returns Status::OK(); failures inside CommonReduce are reported by that function.
template <typename T>
Status ArgMax<T>::Compute(OpKernelContext* ctx) const {
  // Scratch state required by CommonReduce; it is local, so nothing is reused across calls.
  ResultsNoTransposePrepareForReduce last_results;
  if (select_last_index_) {
    CommonReduce<T, ReduceAggregatorArgMaxLastIndex<T>>(ctx, axes_, keepdims_, last_results);
  } else {
    CommonReduce<T, ReduceAggregatorArgMax<T>>(ctx, axes_, keepdims_, last_results);
  }
  return Status::OK();
}
// Computes ArgMin for the kernel's configured axes by delegating to CommonReduce.
//
// The aggregator is chosen from select_last_index_:
//   - ReduceAggregatorArgMinLastIndex updates on `<=`, so ties resolve to the last index;
//   - ReduceAggregatorArgMin updates on `<`, so ties resolve to the first index.
//
// CommonReduce is the only reduction pass. The hand-written PrepareForReduce pass that
// used to precede it is gone: it produced no result when the input needed a transpose,
// and otherwise wrote through an output pointer that had already been advanced past the
// end of the output buffer.
//
// Returns Status::OK(); failures inside CommonReduce are reported by that function.
template <typename T>
Status ArgMin<T>::Compute(OpKernelContext* ctx) const {
  // Scratch state required by CommonReduce; it is local, so nothing is reused across calls.
  ResultsNoTransposePrepareForReduce last_results;
  if (select_last_index_) {
    CommonReduce<T, ReduceAggregatorArgMinLastIndex<T>>(ctx, axes_, keepdims_, last_results);
  } else {
    CommonReduce<T, ReduceAggregatorArgMin<T>>(ctx, axes_, keepdims_, last_results);
  }
  return Status::OK();
}
@ -814,14 +605,4 @@ template class ReduceSum<int32_t>;
// Explicit instantiation definitions of the ReduceSum kernel class template for the
// element types listed here.
template class ReduceSum<double>;
template class ReduceSum<int64_t>;
// Expands to an explicit instantiation of the ReduceSumCore<T> function template so that
// other translation units can link against it for element type T. The macro body already
// ends with ';', so the invocations below take no trailing semicolon.
#define REGISTER_REDUCESUMCORE_TYPED(T) \
template void ReduceSumCore<T>(const T* input_data, T* output_data, bool no_transpose, \
int64_t blocks, int64_t block_size, FastAllocVector<T>& transposed_input_data, \
concurrency::ThreadPool* tp);
// Element types for which ReduceSumCore is instantiated.
REGISTER_REDUCESUMCORE_TYPED(float)
REGISTER_REDUCESUMCORE_TYPED(double)
REGISTER_REDUCESUMCORE_TYPED(int32_t)
REGISTER_REDUCESUMCORE_TYPED(int64_t)
} // namespace onnxruntime

View file

@ -10,24 +10,305 @@
#include "core/providers/cpu/containers.h"
#include "core/util/math_cpuonly.h"
#include "core/platform/threadpool.h"
#include <cmath>
namespace onnxruntime {
// Prepares `input_tensor_ptr` for a reduction over `axes_` (declaration only; the
// definition lives in the implementation file).
//
// When all reduce axes are located at the tail of the dims, the transpose and the extra
// copy can be skipped. With `check_no_transpose` = true the caller is expected to inspect
// the return value and act accordingly.
//
// Returns true when `transposed_input_data` was NOT created/copied; in that case the input
// tensor data can be used directly as a row-major matrix [block_size, blocks], where
// `blocks` is the size of each reduce.
//
// `block_size`, `blocks` and `reduced_dims` are outputs. `input_shape_override`, when
// non-null, overrides the shape of the input for compute purposes.
// NOTE(review): the exact role of `keepdims_` in shaping `reduced_dims` is not visible
// from this declaration - confirm against the definition.
template <typename T>
bool PrepareForReduce(const Tensor* input_tensor_ptr,
FastAllocVector<T>& transposed_input_data,
int64_t& block_size,
int64_t& blocks,
const std::vector<int64_t>& axes_,
bool keepdims_,
/*out*/ std::vector<int64_t>& reduced_dims,
bool check_no_transpose = false,
const TensorShape* input_shape_override = nullptr);
// Plain data holder passed by reference to NoTransposePrepareForReduce,
// NoTransposeReduce and CommonReduce.
//
// It records the input shape and reduced axes it was built for, together with index
// vectors and loop size/increment values.
// NOTE(review): the precise meaning of the index vectors and loop fields is defined by the
// code that fills them, which is not part of this header - confirm there.
//
// Scalar members are zero-initialised so a default-constructed instance never exposes
// indeterminate values.
class ResultsNoTransposePrepareForReduce {
 public:
  std::vector<int64_t> input_shape;
  std::vector<int64_t> reduced_axes;
  std::vector<int64_t> projected_index;
  int64_t last_loop_red_size{0};
  int64_t last_loop_red_inc{0};
  std::vector<int64_t> unprojected_index;
  int64_t last_loop_size{0};
  int64_t last_loop_inc{0};

  // Returns true when the stored input shape and reduced axes match the given ones
  // element for element (and in length). Does not modify the object.
  bool equal(const std::vector<int64_t>& local_input_shape,
             const std::vector<int64_t>& local_reduced_axes) const {
    // std::vector::operator== compares sizes first, then elements pairwise.
    return input_shape == local_input_shape && reduced_axes == local_reduced_axes;
  }
};
// Declaration of the sum-reduction core routine; the definition is not in this header.
// Reads from `input_data` / `transposed_input_data` and writes to `output_data`, with the
// work optionally scheduled on thread pool `tp`.
// NOTE(review): `no_transpose`, `blocks` and `block_size` presumably carry the values
// produced by PrepareForReduce (true return value, size of each reduce, number of
// outputs) - confirm against the definition.
template <typename T>
void ReduceSumCore(const T* input_data, T* output_data, bool no_transpose,
int64_t blocks, int64_t block_size, FastAllocVector<T>& transposed_input_data,
concurrency::ThreadPool* tp);
// Square root helper used by the reduction aggregators.
// The primary template forwards to std::sqrt on T directly. The integer specialisations
// compute the root in double precision and truncate the result back to the integer type.
template <typename T>
inline T reduce_sqrt(T value) { return std::sqrt(value); }

template <>
inline int64_t reduce_sqrt<int64_t>(int64_t value) {
  return static_cast<int64_t>(std::sqrt(static_cast<double>(value)));
}

template <>
inline int32_t reduce_sqrt<int32_t>(int32_t value) {
  return static_cast<int32_t>(std::sqrt(static_cast<double>(value)));
}
// Natural logarithm helper used by the reduction aggregators.
// The generic version calls std::log on the value and casts the result to T. The integer
// specialisations widen to double first and truncate the logarithm back to the integer type.
template <typename T>
inline T reduce_log(T value) {
  const auto natural_log = std::log(value);
  return static_cast<T>(natural_log);
}

template <>
inline int64_t reduce_log<int64_t>(int64_t value) {
  const double widened = static_cast<double>(value);
  return static_cast<int64_t>(std::log(widened));
}

template <>
inline int32_t reduce_log<int32_t>(int32_t value) {
  const double widened = static_cast<double>(value);
  return static_cast<int32_t>(std::log(widened));
}
// Exponential helper used by the reduction aggregators: evaluates std::exp on the value
// and casts the result back to T.
template <typename T>
inline T reduce_exp(T value) {
  const auto exponential = std::exp(value);
  return static_cast<T>(exponential);
}
// Common base of the reduction aggregators.
//
// T is the element type being reduced and TVAL the type of the produced value (exposed as
// value_type). An aggregator stores the element count N_ and a running accumulator_.
// The base versions of update / update0 / aggall only raise an error: concrete
// aggregators are expected to provide their own.
template <typename T, typename TVAL = T>
class ReduceAggregator {
 public:
  using value_type = TVAL;

 protected:
  int64_t N_;      // number of elements handled by one reduction
  T accumulator_;  // running state, seeded from `init`

 public:
  inline ReduceAggregator(int64_t N, const T& init) : N_(N), accumulator_(init) {}

  // Per-element accumulation step; must be provided by the concrete aggregator.
  inline void update(const T&) { ORT_ENFORCE(false, "must be overloaded."); }
  // Extra per-element step for aggregators that need one (see two_loops()).
  inline void update0(const T&) { ORT_ENFORCE(false, "must be overloaded."); }
  // Whole-buffer reduction; must be provided by the concrete aggregator.
  inline TVAL aggall(const T*) { ORT_ENFORCE(false, "must be overloaded."); }
  // Final value of the reduction; by default the accumulator itself.
  inline TVAL get_value() { return accumulator_; }
  // Hook for validating a prepared reduction; the base accepts everything.
  inline void enforce(const ResultsNoTransposePrepareForReduce&) {}
  // Whether the aggregator uses update0 in addition to update; false by default.
  static inline bool two_loops() { return false; }
};
// Aggregator computing the sum of the reduced elements. The accumulator always starts at
// zero; the `init` argument is ignored.
template <typename T, typename TVAL = T>
class ReduceAggregatorSum : public ReduceAggregator<T, TVAL> {
 public:
  inline ReduceAggregatorSum(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 0) {}

  // Sums the N_ contiguous values starting at from_data in one Eigen call.
  inline TVAL aggall(const T* from_data) {
    using Column = Eigen::Matrix<T, Eigen::Dynamic, 1>;
    const Eigen::Map<const Column> values(from_data, this->N_);
    return values.sum();
  }

  // Adds one element to the running sum.
  inline void update(const T& v) { this->accumulator_ += v; }
};
// Aggregator computing the sum of squares of the reduced elements. The accumulator always
// starts at zero; the `init` argument is ignored.
template <typename T, typename TVAL = T>
class ReduceAggregatorSumSquare : public ReduceAggregator<T, TVAL> {
 public:
  inline ReduceAggregatorSumSquare(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 0) {}

  // Adds the square of one element to the running total.
  inline void update(const T& v) {
    const T squared = v * v;
    this->accumulator_ += squared;
  }

  // Squared norm of the N_ contiguous values starting at from_data.
  inline TVAL aggall(const T* from_data) {
    using Column = Eigen::Matrix<T, Eigen::Dynamic, 1>;
    const Eigen::Map<const Column> values(from_data, this->N_);
    return values.squaredNorm();
  }
};
// Aggregator computing the arithmetic mean: it accumulates like ReduceAggregatorSum and
// divides by the element count N_ when the value is read.
template <typename T, typename TVAL = T>
class ReduceAggregatorMean : public ReduceAggregatorSum<T, TVAL> {
 public:
  inline ReduceAggregatorMean(int64_t N, const T&) : ReduceAggregatorSum<T, TVAL>(N, 0) {}

  // Mean of the N_ contiguous values starting at from_data.
  inline T aggall(const T* from_data) {
    using Column = Eigen::Matrix<T, Eigen::Dynamic, 1>;
    const Eigen::Map<const Column> values(from_data, this->N_);
    return values.mean();
  }

  // Running sum divided by N_ (N_ converted to T first).
  inline T get_value() {
    const T count = static_cast<T>(this->N_);
    return this->accumulator_ / count;
  }
};
// Aggregator computing the maximum. The accumulator is seeded with `init`, so the result
// is the largest of `init` and every value passed to update().
template <typename T, typename TVAL = T>
class ReduceAggregatorMax : public ReduceAggregator<T, TVAL> {
 public:
  inline ReduceAggregatorMax(int64_t N, const T& init) : ReduceAggregator<T, TVAL>(N, init) {}

  // Keeps v only when it is strictly greater than the current maximum.
  inline void update(const T& v) {
    if (v > this->accumulator_) {
      this->accumulator_ = v;
    }
  }

  // Largest of the N_ contiguous values starting at from_data.
  inline TVAL aggall(const T* from_data) {
    using Column = Eigen::Matrix<T, Eigen::Dynamic, 1>;
    const Eigen::Map<const Column> values(from_data, this->N_);
    return values.maxCoeff();
  }
};
// Shared base of the ArgMin / ArgMax aggregators.
// In addition to the accumulator (best value seen so far) it tracks arg_, the position of
// that value, and index_, the position of the next element fed to update(). The reported
// value is arg_.
template <typename T, typename TVAL = int64_t>
class ReduceAggregatorArgMinMax : public ReduceAggregator<T, TVAL> {
 protected:
  int64_t arg_;    // position of the current best element
  int64_t index_;  // position of the next element to be visited

 public:
  inline ReduceAggregatorArgMinMax(int64_t N, const T& init)
      : ReduceAggregator<T, TVAL>(N, init), arg_(0), index_(0) {}

  inline TVAL get_value() { return arg_; }

  // Rejects prepared reductions whose projected_index is not empty.
  inline void enforce(const ResultsNoTransposePrepareForReduce& res) {
    ORT_ENFORCE(res.projected_index.size() == 0, "Only one axis is allowed for reduction.");
  }
};
// ArgMax aggregator: reports the position of the largest element. update() replaces the
// best value only on a strict `>`, so the first of several equal maxima wins.
template <typename T, typename TVAL = int64_t>
class ReduceAggregatorArgMax : public ReduceAggregatorArgMinMax<T, TVAL> {
 public:
  inline ReduceAggregatorArgMax(int64_t N, const T& init) : ReduceAggregatorArgMinMax<T, TVAL>(N, init) {}

  // Visits one element; the visit counter advances whether or not the best changes.
  inline void update(const T& v) {
    const bool is_new_best = v > this->accumulator_;
    if (is_new_best) {
      this->accumulator_ = v;
      this->arg_ = this->index_;
    }
    ++this->index_;
  }

  // Lets Eigen locate the maximum of the N_ contiguous values and stores its position.
  inline TVAL aggall(const T* from_data) {
    using Column = Eigen::Matrix<T, Eigen::Dynamic, 1>;
    Eigen::Map<const Column>(from_data, this->N_).maxCoeff(&this->arg_);
    return this->get_value();
  }
};
// ArgMax aggregator that prefers the last occurrence: update() replaces the best value on
// `>=`, so among equal maxima the latest position is reported.
template <typename T, typename TVAL = int64_t>
class ReduceAggregatorArgMaxLastIndex : public ReduceAggregatorArgMax<T, TVAL> {
 public:
  inline ReduceAggregatorArgMaxLastIndex(int64_t N, const T& init) : ReduceAggregatorArgMax<T, TVAL>(N, init) {}

  // Visits one element; the visit counter advances whether or not the best changes.
  inline void update(const T& v) {
    const bool at_least_as_large = v >= this->accumulator_;
    if (at_least_as_large) {
      this->accumulator_ = v;
      this->arg_ = this->index_;
    }
    ++this->index_;
  }

  // Feeds the N_ contiguous values through update() in order, then reports the position.
  inline TVAL aggall(const T* from_data) {
    int64_t pos = 0;
    while (pos < this->N_) {
      this->update(from_data[pos]);
      ++pos;
    }
    return this->get_value();
  }
};
// ArgMin aggregator: reports the position of the smallest element. update() replaces the
// best value only on a strict `<`, so the first of several equal minima wins.
template <typename T, typename TVAL = int64_t>
class ReduceAggregatorArgMin : public ReduceAggregatorArgMinMax<T, TVAL> {
 public:
  inline ReduceAggregatorArgMin(int64_t N, const T& init) : ReduceAggregatorArgMinMax<T, TVAL>(N, init) {}

  // Visits one element; the visit counter advances whether or not the best changes.
  inline void update(const T& v) {
    const bool is_new_best = v < this->accumulator_;
    if (is_new_best) {
      this->accumulator_ = v;
      this->arg_ = this->index_;
    }
    ++this->index_;
  }

  // Lets Eigen locate the minimum of the N_ contiguous values and stores its position.
  inline TVAL aggall(const T* from_data) {
    using Column = Eigen::Matrix<T, Eigen::Dynamic, 1>;
    Eigen::Map<const Column>(from_data, this->N_).minCoeff(&this->arg_);
    return this->get_value();
  }
};
// ArgMin aggregator that prefers the last occurrence: update() replaces the best value on
// `<=`, so among equal minima the latest position is reported.
template <typename T, typename TVAL = int64_t>
class ReduceAggregatorArgMinLastIndex : public ReduceAggregatorArgMin<T, TVAL> {
 public:
  inline ReduceAggregatorArgMinLastIndex(int64_t N, const T& init) : ReduceAggregatorArgMin<T, TVAL>(N, init) {}

  // Visits one element; the visit counter advances whether or not the best changes.
  inline void update(const T& v) {
    const bool at_most_as_large = v <= this->accumulator_;
    if (at_most_as_large) {
      this->accumulator_ = v;
      this->arg_ = this->index_;
    }
    ++this->index_;
  }

  // Feeds the N_ contiguous values through update() in order, then reports the position.
  inline TVAL aggall(const T* from_data) {
    int64_t pos = 0;
    while (pos < this->N_) {
      this->update(from_data[pos]);
      ++pos;
    }
    return this->get_value();
  }
};
// Aggregator computing the minimum. The accumulator is seeded with `init`, so the result
// is the smallest of `init` and every value passed to update().
template <typename T, typename TVAL = T>
class ReduceAggregatorMin : public ReduceAggregator<T, TVAL> {
 public:
  inline ReduceAggregatorMin(int64_t N, const T& init) : ReduceAggregator<T, TVAL>(N, init) {}

  // Keeps v only when it is strictly smaller than the current minimum.
  inline void update(const T& v) {
    if (v < this->accumulator_) {
      this->accumulator_ = v;
    }
  }

  // Smallest of the N_ contiguous values starting at from_data.
  inline TVAL aggall(const T* from_data) {
    using Column = Eigen::Matrix<T, Eigen::Dynamic, 1>;
    const Eigen::Map<const Column> values(from_data, this->N_);
    return values.minCoeff();
  }
};
// Aggregator computing the product of the reduced elements. The accumulator always starts
// at one; the `init` argument is ignored.
template <typename T, typename TVAL = T>
class ReduceAggregatorProd : public ReduceAggregator<T, TVAL> {
 public:
  inline ReduceAggregatorProd(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 1) {}

  // Multiplies one element into the running product.
  inline void update(const T& v) { this->accumulator_ *= v; }

  // Product of the N_ contiguous values starting at from_data.
  inline TVAL aggall(const T* from_data) {
    using Column = Eigen::Matrix<T, Eigen::Dynamic, 1>;
    const Eigen::Map<const Column> values(from_data, this->N_);
    return values.prod();
  }
};
// Aggregator computing the L1 norm (sum of absolute values). The accumulator always starts
// at zero; the `init` argument is ignored.
template <typename T, typename TVAL = T>
class ReduceAggregatorL1 : public ReduceAggregator<T, TVAL> {
 public:
  inline ReduceAggregatorL1(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 0) {}

  // Adds the magnitude of one element: v itself when positive, its negation otherwise.
  inline void update(const T& v) {
    if (v > 0) {
      this->accumulator_ += v;
    } else {
      this->accumulator_ += -v;
    }
  }

  // Sum of absolute values of the N_ contiguous values starting at from_data.
  inline TVAL aggall(const T* from_data) {
    using Column = Eigen::Matrix<T, Eigen::Dynamic, 1>;
    const Eigen::Map<const Column> values(from_data, this->N_);
    return values.cwiseAbs().sum();
  }
};
// Aggregator computing the L2 norm: squares are accumulated and the square root is taken
// when the value is read. The accumulator always starts at zero; `init` is ignored.
template <typename T, typename TVAL = T>
class ReduceAggregatorL2 : public ReduceAggregator<T, TVAL> {
 public:
  inline ReduceAggregatorL2(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 0) {}

  // Adds the square of one element to the running total.
  inline void update(const T& v) {
    const T squared = v * v;
    this->accumulator_ += squared;
  }

  // Square root (via reduce_sqrt) of the accumulated sum of squares.
  inline TVAL get_value() {
    const T sum_of_squares = this->accumulator_;
    return reduce_sqrt<T>(sum_of_squares);
  }

  // Euclidean norm of the N_ contiguous values starting at from_data.
  inline TVAL aggall(const T* from_data) {
    using Column = Eigen::Matrix<T, Eigen::Dynamic, 1>;
    const Eigen::Map<const Column> values(from_data, this->N_);
    return values.norm();
  }
};
// Aggregator computing log(sum(x)): elements are summed and the natural logarithm is
// applied when the value is read. The accumulator always starts at zero; `init` is ignored.
template <typename T, typename TVAL = T>
class ReduceAggregatorLogSum : public ReduceAggregator<T, TVAL> {
 public:
  inline ReduceAggregatorLogSum(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 0) {}

  // Adds one element to the running sum.
  inline void update(const T& v) { this->accumulator_ += v; }

  // Logarithm (via reduce_log) of the accumulated sum.
  inline TVAL get_value() {
    const T total = this->accumulator_;
    return reduce_log<T>(total);
  }

  // Logarithm of the sum of the N_ contiguous values starting at from_data.
  inline T aggall(const T* from_data) {
    using Column = Eigen::Matrix<T, Eigen::Dynamic, 1>;
    const Eigen::Map<const Column> values(from_data, this->N_);
    return reduce_log<T>(values.sum());
  }
};
// Aggregator computing log(sum(exp(x))) with the usual max-shift for numerical stability:
// the result is log(sum(exp(x - max_))) + max_.
//
// It works in two passes (two_loops() returns true): update0() raises max_ to the largest
// value seen, then update() accumulates exp(v - max_). aggall() does both on a contiguous
// buffer.
//
// max_ is seeded from `init` rather than from zero. Starting at zero left the shift at 0
// whenever every input was negative, so strongly negative inputs underflowed to log(0),
// and the update0/update path disagreed with aggall(), which already shifts by the true
// maximum. Any finite shift yields the same mathematical result, so seeding from `init`
// cannot make the value wrong; a non-finite `init` falls back to the previous zero seed.
// NOTE(review): this assumes `init` is one of the values being reduced, which is what the
// Min/Max aggregators already rely on - confirm against the caller.
template <typename T, typename TVAL = T>
class ReduceAggregatorLogSumExp : public ReduceAggregator<T, TVAL> {
 protected:
  T max_;  // shift subtracted before exponentiation

 public:
  inline ReduceAggregatorLogSumExp(int64_t N, const T& init) : ReduceAggregator<T, TVAL>(N, 0) {
    // The accumulator is 0 at this point; it doubles as the fallback seed for the shift.
    max_ = std::isfinite(static_cast<double>(init)) ? init : this->accumulator_;
  }

  // Reduces the N_ contiguous values starting at from_data using their exact maximum as shift.
  inline TVAL aggall(const T* from_data) {
    max_ = Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).maxCoeff();
    for (int64_t i = 0; i < this->N_; ++i) {
      update(from_data[i]);
    }
    return get_value();
  }

  // First pass: track the largest value seen.
  inline void update0(const T& v) { max_ = v > max_ ? v : max_; }
  // Second pass: accumulate the shifted exponentials.
  inline void update(const T& v) { this->accumulator_ += reduce_exp(v - max_); }
  // Undo the shift after taking the logarithm.
  inline TVAL get_value() { return reduce_log<T>(this->accumulator_) + max_; }
  static inline bool two_loops() { return true; }
};
// Declaration of the reduction setup routine; the definition lives in the implementation
// file. That definition enforces that `input_tensor_ptr` is non-null and, when
// `input_shape_override` is given, that its size matches the input tensor's shape size.
// `axes_` is the requested axes list; `axes`, `new_input_shape`, `output_shape` and
// `empty_reduce` are non-const references written by the routine.
// NOTE(review): the meaning of the boolean return value and of `empty_reduce` is not
// visible from this declaration - confirm against the definition.
bool SetupForReduce(const Tensor* input_tensor_ptr,
const std::vector<int64_t>& axes_,
std::vector<int64_t>& axes,
TensorShape& new_input_shape,
std::vector<int64_t>& output_shape,
bool& empty_reduce,
const TensorShape* input_shape_override);
// Declaration only. Takes the input shape and the reduced axes and writes into `results`
// (a ResultsNoTransposePrepareForReduce).
// NOTE(review): presumably precomputes the index/loop data later consumed by
// NoTransposeReduce - confirm against the definition.
void NoTransposePrepareForReduce(const TensorShape& new_input_shape,
const std::vector<int64_t>& reduced_axes,
ResultsNoTransposePrepareForReduce& results);
// Declaration only. Reduces `input` (viewed with shape `new_input_shape`) over
// `reduced_axes` into `output`, using aggregator type AGG on elements of type T and the
// optional thread pool `tp`. `last_results` is passed by non-const reference.
// NOTE(review): whether `last_results` is reused as a cache between calls (see
// ResultsNoTransposePrepareForReduce::equal) is not visible here - confirm.
template <typename T, typename AGG>
void NoTransposeReduce(Tensor* output, const TensorShape& new_input_shape, const Tensor& input,
const std::vector<int64_t>& reduced_axes, concurrency::ThreadPool* tp,
ResultsNoTransposePrepareForReduce& last_results);
// Declaration of the shared entry point used by the reduction kernels' Compute methods:
// it receives the kernel context, the axes and keepdims settings, a scratch
// ResultsNoTransposePrepareForReduce and the aggregator type AGG.
// `noop_with_empty_axes` defaults to false.
// NOTE(review): `axes_` is taken by value (a copy per call); changing it to a const
// reference would require updating the definition and its instantiations in lockstep.
template <typename T, typename AGG>
void CommonReduce(OpKernelContext* ctx,
const std::vector<int64_t> axes_, int64_t keepdims_,
ResultsNoTransposePrepareForReduce& last_results,
bool noop_with_empty_axes = false);
template <bool allow_multi_axes>
class ReduceKernelBase {

View file

@ -27,38 +27,10 @@ REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(double)
// Invoke the REGISTER_REDUCESUMTRAINING_KERNEL_TYPED macro (defined outside this chunk)
// for the 32-bit and 64-bit signed integer element types.
REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(int32_t)
REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(int64_t)
// Computes ReduceSum where the reduction axes come from the second input tensor rather
// than from the kernel attribute.
//
// Steps:
//   1. Read the axes from input 1, which must be present and one-dimensional.
//   2. If the axes list is empty and noop_with_empty_axes_ is set, copy the input to the
//      output unchanged.
//   3. Otherwise run a single CommonReduce pass with the sum aggregator.
//
// The output is produced exactly once, and CommonReduce is given the axes read from the
// input tensor (not the attribute axes_), so the tensor-provided axes are always honoured.
//
// Returns Status::OK(); violated preconditions are reported through ORT_ENFORCE.
template <typename T>
Status ReduceSumTraining<T>::Compute(OpKernelContext* ctx) const {
  const Tensor* input = ctx->Input<Tensor>(0);

  // Override the attribute value with the input value for the reduction axes.
  const Tensor* axes_tensor = ctx->Input<Tensor>(1);
  ORT_ENFORCE(axes_tensor != nullptr, "Axes input is null");
  ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1,
              "An axes tensor must be a vector tensor.");
  const auto axes_count = static_cast<size_t>(axes_tensor->Shape()[0]);
  const auto* axes_data = axes_tensor->template Data<int64_t>();
  const std::vector<int64_t> axes(axes_data, axes_data + axes_count);

  // Empty axes and no-op: the output is a plain copy of the input.
  if (axes.empty() && noop_with_empty_axes_) {
    auto* output = ctx->Output(0, input->Shape());
    memcpy(output->template MutableData<T>(), input->template Data<T>(), input->SizeInBytes());
    return Status::OK();
  }

  // Scratch state required by CommonReduce; local, so nothing is reused across calls.
  ResultsNoTransposePrepareForReduce last_results;
  CommonReduce<T, ReduceAggregatorSum<T>>(ctx, axes, keepdims_, last_results, noop_with_empty_axes_);
  return Status::OK();
}