mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-24 02:47:54 +00:00
Improves ReduceSum performance by removing transposition. (#5370)
* Improves ReduceSum performance * Add min, max, L1, L2, logsum, sumsquare * remove all reduce implementation including transpose
This commit is contained in:
parent
682898ae2b
commit
66c8a441e0
3 changed files with 628 additions and 594 deletions
|
|
@ -151,23 +151,13 @@ REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ArgMin, 1, 10);
|
|||
REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ArgMin, 11, 12);
|
||||
REGISTER_UNARY_ELEMENTWISE_KERNEL(ArgMin, 13);
|
||||
|
||||
// When all reduce axes are located at the tail of the dims (a fairly common case), the transpose and extra
|
||||
// copy could be skipped to improve performance. If required by check_no_transpose = true, then
|
||||
// the calling code will check if the data was transposed and act accordingly.
|
||||
// return value: true means transposedInputData is not created/copied, input tensor data could
|
||||
// be directly used as row major matrix [block_size, blocks], where blocks is the
|
||||
// size of each reduce.
|
||||
// `input_shape_override` overrides the shape of `input` for compute purposes.
|
||||
template <typename T>
|
||||
bool PrepareForReduce(const Tensor* input_tensor_ptr,
|
||||
FastAllocVector<T>& transposed_input_data,
|
||||
int64_t& block_size,
|
||||
int64_t& blocks,
|
||||
const std::vector<int64_t>& axes_,
|
||||
bool keepdims_,
|
||||
/*out*/ std::vector<int64_t>& reduced_dims,
|
||||
bool check_no_transpose,
|
||||
const TensorShape* input_shape_override) {
|
||||
bool SetupForReduce(const Tensor* input_tensor_ptr,
|
||||
const std::vector<int64_t>& axes_,
|
||||
std::vector<int64_t>& axes,
|
||||
TensorShape& new_input_shape,
|
||||
std::vector<int64_t>& output_shape,
|
||||
bool& empty_reduce,
|
||||
const TensorShape* input_shape_override) {
|
||||
ORT_ENFORCE(input_tensor_ptr != nullptr, "Input to be reduced is null");
|
||||
|
||||
if (input_shape_override) {
|
||||
|
|
@ -175,25 +165,13 @@ bool PrepareForReduce(const Tensor* input_tensor_ptr,
|
|||
"The input shape override's size does not match the input tensor's shape size");
|
||||
}
|
||||
|
||||
const Tensor& input = *input_tensor_ptr;
|
||||
const auto& input_shape = input_shape_override ? *input_shape_override : input.Shape();
|
||||
|
||||
size_t ndim = input_shape.NumDimensions();
|
||||
|
||||
// Scalar tensor
|
||||
new_input_shape = input_shape_override ? *input_shape_override : input_tensor_ptr->Shape();
|
||||
size_t ndim = new_input_shape.NumDimensions();
|
||||
if (ndim == 0) {
|
||||
if (!check_no_transpose) {
|
||||
auto size = input_shape.Size();
|
||||
assert(size == 1);
|
||||
transposed_input_data.resize(size, 0);
|
||||
T* to_data = &transposed_input_data[0];
|
||||
*to_data = *input.Data<T>();
|
||||
}
|
||||
block_size = blocks = 1;
|
||||
return true;
|
||||
empty_reduce = true;
|
||||
return false;
|
||||
}
|
||||
|
||||
std::vector<int64_t> axes;
|
||||
axes.reserve(axes_.size());
|
||||
for (int64_t axis : axes_) {
|
||||
axes.push_back(HandleNegativeAxis(axis, static_cast<int64_t>(ndim)));
|
||||
|
|
@ -210,598 +188,411 @@ bool PrepareForReduce(const Tensor* input_tensor_ptr,
|
|||
|
||||
// If all reduced axes are located at the tail of the input shape, the copy can be skipped.
|
||||
bool need_copy = true;
|
||||
if (axes.size() <= ndim &&
|
||||
if (axes.size() <= ndim && ndim > 0 &&
|
||||
axes.front() == static_cast<int64_t>(ndim - axes.size()) &&
|
||||
axes.back() == static_cast<int64_t>(ndim) - 1) {
|
||||
need_copy = false;
|
||||
}
|
||||
|
||||
std::vector<bool> keep_axis(ndim, true);
|
||||
for (auto i : axes) {
|
||||
keep_axis[i] = false;
|
||||
empty_reduce = false;
|
||||
output_shape = new_input_shape.GetDims();
|
||||
for (auto a : axes) {
|
||||
output_shape[a] = new_input_shape[a] > 0 ? 1 : 0;
|
||||
empty_reduce |= output_shape[a] == 0;
|
||||
}
|
||||
return need_copy;
|
||||
}
|
||||
void NoTransposePrepareForReduce(const TensorShape& new_input_shape,
|
||||
const std::vector<int64_t>& reduced_axes,
|
||||
ResultsNoTransposePrepareForReduce& results) {
|
||||
// Common initialisation for the indices.
|
||||
std::vector<int64_t> cumulative_shape = new_input_shape.GetDims();
|
||||
cumulative_shape[cumulative_shape.size() - 1] = 1;
|
||||
for (int i = static_cast<int>(cumulative_shape.size()) - 2; i >= 0; --i) {
|
||||
cumulative_shape[i] = cumulative_shape[i + 1] * new_input_shape[i + 1];
|
||||
}
|
||||
int64_t projection_size = 1;
|
||||
for (auto a : reduced_axes) {
|
||||
projection_size *= new_input_shape[a];
|
||||
}
|
||||
|
||||
//transpose the input so that all to-be-reduced axes are at the head
|
||||
std::vector<int64_t> transposed_axes(axes.begin(), axes.end());
|
||||
for (size_t i = 0; i < ndim; ++i) {
|
||||
if (keep_axis[i]) {
|
||||
transposed_axes.push_back(i);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<int64_t> new_dims(transposed_axes.size());
|
||||
for (size_t i = 0; i < transposed_axes.size(); ++i) {
|
||||
new_dims[i] = input_shape.GetDims().at(transposed_axes[i]);
|
||||
}
|
||||
|
||||
int num_axes = static_cast<int>(transposed_axes.size());
|
||||
auto in_dims = input_shape.GetDims();
|
||||
|
||||
// Measure amount of contiguous data we can copy at once
|
||||
int64_t blocksize = 1;
|
||||
int n_shared_idxs = 0;
|
||||
for (int i = num_axes - 1; i >= 0; --i) {
|
||||
if (transposed_axes[i] == i) {
|
||||
blocksize *= new_dims[i];
|
||||
++n_shared_idxs;
|
||||
} else {
|
||||
int last_reduced_axis = static_cast<int>(reduced_axes.size()) - 1;
|
||||
int loop_reduced_axis = 1;
|
||||
results.last_loop_red_size = new_input_shape[reduced_axes[last_reduced_axis]];
|
||||
results.last_loop_red_inc = cumulative_shape[reduced_axes[last_reduced_axis]];
|
||||
projection_size /= new_input_shape[reduced_axes[last_reduced_axis]];
|
||||
--last_reduced_axis;
|
||||
while (last_reduced_axis >= 0) {
|
||||
if (reduced_axes[last_reduced_axis] != reduced_axes[last_reduced_axis + 1] - 1)
|
||||
break;
|
||||
}
|
||||
results.last_loop_red_size *= new_input_shape[reduced_axes[last_reduced_axis]];
|
||||
projection_size /= new_input_shape[reduced_axes[last_reduced_axis]];
|
||||
--last_reduced_axis;
|
||||
++loop_reduced_axis;
|
||||
}
|
||||
|
||||
const T* from_data = input.template Data<T>();
|
||||
size_t count = input_shape.Size();
|
||||
|
||||
//set to-be-reduced axes to one. squeeze is keepdims_ is false
|
||||
int64_t first_dim = 1;
|
||||
reduced_dims.reserve(in_dims.size());
|
||||
|
||||
for (size_t i = 0; i < in_dims.size(); i++) {
|
||||
const auto in_dim = in_dims[i];
|
||||
if (keep_axis[i]) {
|
||||
reduced_dims.push_back(in_dim);
|
||||
} else {
|
||||
first_dim *= in_dim;
|
||||
if (keepdims_) {
|
||||
reduced_dims.push_back(in_dim == 0 ? 0 : 1);
|
||||
} else {
|
||||
// as we are reducing on this axis and not keeping a dim for it, we can't drop a dim value of 0.
|
||||
// e.g. if input was {3, 0, 2} and we reduced on axis 1 without keeping it, the output shape would be
|
||||
// {3, 2} which is invalid given the input was empty.
|
||||
// note that if we do keep the dim the output shape will have a 0 in it,
|
||||
// which is still valid for an empty tensor, so allow that.
|
||||
ORT_ENFORCE(in_dim != 0,
|
||||
"Can't reduce on dim with value of 0 if 'keepdims' is false. "
|
||||
"Invalid output shape would be produced. input_shape:",
|
||||
input_shape);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto num_elements = input_shape.Size();
|
||||
|
||||
// edge case. one or more input dims with value of 0.
|
||||
if (num_elements == 0) {
|
||||
block_size = blocks = 0;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (0 == first_dim) {
|
||||
return false;
|
||||
}
|
||||
|
||||
block_size = num_elements / first_dim;
|
||||
blocks = first_dim;
|
||||
|
||||
if (!need_copy && check_no_transpose) {
|
||||
return true;
|
||||
}
|
||||
|
||||
transposed_input_data.resize(input_shape.Size(), 0);
|
||||
T* to_data = &transposed_input_data[0];
|
||||
if (num_axes < 2 || n_shared_idxs == num_axes) {
|
||||
memcpy(to_data, from_data, count * sizeof(T));
|
||||
return false;
|
||||
}
|
||||
|
||||
int itr_axes = num_axes - n_shared_idxs;
|
||||
|
||||
// Calculate strides
|
||||
std::vector<int64_t> stride_x(itr_axes, 0);
|
||||
for (size_t i = 0; static_cast<int>(i) < itr_axes; i++) {
|
||||
stride_x[i] = 1;
|
||||
for (size_t j = transposed_axes[i] + 1; static_cast<int>(j) < itr_axes; j++) {
|
||||
stride_x[i] *= in_dims[j];
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<int64_t> itr_idxs(itr_axes, 0);
|
||||
|
||||
// Branch here to avoid branching within the loop
|
||||
if (blocksize > 1) {
|
||||
for (size_t index = 0; index < (count / blocksize); index++) {
|
||||
int64_t from_index = 0;
|
||||
for (int i = 0; i < itr_axes; ++i) {
|
||||
from_index += stride_x[i] * itr_idxs[i];
|
||||
}
|
||||
|
||||
memcpy(
|
||||
to_data + blocksize * index,
|
||||
from_data + blocksize * from_index,
|
||||
blocksize * sizeof(T));
|
||||
|
||||
++itr_idxs[itr_axes - 1];
|
||||
for (int i = itr_axes - 1; i >= 1; --i) {
|
||||
auto expected_dim = new_dims[i];
|
||||
if (itr_idxs[i] < expected_dim) {
|
||||
break;
|
||||
}
|
||||
itr_idxs[i] %= expected_dim;
|
||||
++itr_idxs[i - 1];
|
||||
}
|
||||
}
|
||||
// Builds the list of indices projected into the same sum.
|
||||
int reduced_axes_size = static_cast<int>(reduced_axes.size()) - loop_reduced_axis;
|
||||
if (reduced_axes_size == 0) {
|
||||
results.projected_index.resize(1, 0);
|
||||
} else {
|
||||
for (size_t index = 0; index < count; index++) {
|
||||
int64_t from_index = 0;
|
||||
for (int i = 0; i < itr_axes; ++i) {
|
||||
from_index += stride_x[i] * itr_idxs[i];
|
||||
}
|
||||
|
||||
*(to_data + index) = *(from_data + from_index);
|
||||
|
||||
++itr_idxs[itr_axes - 1];
|
||||
for (int i = itr_axes - 1; i >= 1; --i) {
|
||||
auto expected_dim = new_dims[i];
|
||||
if (itr_idxs[i] < expected_dim) {
|
||||
results.projected_index.resize(projection_size);
|
||||
std::vector<int64_t> projected_indices(reduced_axes_size, 0);
|
||||
int64_t current_index = 0;
|
||||
size_t current_pos = 0;
|
||||
int j;
|
||||
for (current_pos = 0; current_pos < results.projected_index.size(); ++current_pos) {
|
||||
results.projected_index[current_pos] = current_index;
|
||||
++projected_indices[projected_indices.size() - 1];
|
||||
current_index += cumulative_shape[reduced_axes[reduced_axes_size - 1]];
|
||||
for (j = reduced_axes_size - 1; j > 0; --j) {
|
||||
if (projected_indices[j] < new_input_shape[reduced_axes[j]])
|
||||
break;
|
||||
}
|
||||
itr_idxs[i] %= expected_dim;
|
||||
++itr_idxs[i - 1];
|
||||
projected_indices[j] -= new_input_shape[reduced_axes[j]];
|
||||
current_index -= new_input_shape[reduced_axes[j]] * cumulative_shape[reduced_axes[j]];
|
||||
++projected_indices[j - 1];
|
||||
current_index += cumulative_shape[reduced_axes[j - 1]];
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
|
||||
// Builds the list of indices for the unprojected sum.
|
||||
std::vector<int64_t> unreduced_axes;
|
||||
for (int64_t i = 0; i < static_cast<int64_t>(cumulative_shape.size()); ++i) {
|
||||
if (std::find(reduced_axes.begin(), reduced_axes.end(), i) != reduced_axes.end())
|
||||
continue;
|
||||
unreduced_axes.push_back(i);
|
||||
}
|
||||
int64_t unprojection_size = 1;
|
||||
for (auto a : unreduced_axes) {
|
||||
unprojection_size *= new_input_shape[a];
|
||||
}
|
||||
if (unprojection_size == 0) {
|
||||
return;
|
||||
}
|
||||
std::vector<int64_t> unprojected_indices(unreduced_axes.size(), 0);
|
||||
|
||||
// The last index is usually an image size.
|
||||
// We differently process the last unprojected dimension.
|
||||
results.last_loop_size = new_input_shape[unreduced_axes[unreduced_axes.size() - 1]];
|
||||
int64_t unprojection_size_before_last = unprojection_size / results.last_loop_size;
|
||||
results.unprojected_index.reserve(unprojection_size_before_last);
|
||||
results.last_loop_inc = cumulative_shape[unreduced_axes[unreduced_axes.size() - 1]];
|
||||
if (unprojected_indices.size() <= 1) {
|
||||
results.unprojected_index.push_back(0);
|
||||
} else {
|
||||
int64_t current_index = 0;
|
||||
int j;
|
||||
for (int64_t pos = 0; pos < unprojection_size_before_last; ++pos) {
|
||||
results.unprojected_index.push_back(current_index);
|
||||
++unprojected_indices[unprojected_indices.size() - 2];
|
||||
current_index += cumulative_shape[unreduced_axes[unreduced_axes.size() - 2]];
|
||||
for (j = static_cast<int>(unreduced_axes.size()) - 2; j > 0; --j) {
|
||||
if (unprojected_indices[j] < new_input_shape[unreduced_axes[j]])
|
||||
break;
|
||||
unprojected_indices[j] -= new_input_shape[unreduced_axes[j]];
|
||||
current_index -= new_input_shape[unreduced_axes[j]] * cumulative_shape[unreduced_axes[j]];
|
||||
++unprojected_indices[j - 1];
|
||||
current_index += cumulative_shape[unreduced_axes[j - 1]];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, typename AGG>
|
||||
void NoTransposeReduce(Tensor* output, const TensorShape& new_input_shape, const Tensor& input,
|
||||
const std::vector<int64_t>& reduced_axes, concurrency::ThreadPool* tp,
|
||||
ResultsNoTransposePrepareForReduce& last_results) {
|
||||
auto output_shape = output->Shape();
|
||||
const T* from_data = input.template Data<T>();
|
||||
typename AGG::value_type* to_data = output->template MutableData<typename AGG::value_type>();
|
||||
int64_t count = output_shape.Size();
|
||||
|
||||
if (reduced_axes.size() == 0 || reduced_axes.size() == new_input_shape.NumDimensions()) {
|
||||
ORT_ENFORCE(count == 1, "Reduction on all axes, output size should be 1.");
|
||||
int64_t input_size = new_input_shape.Size();
|
||||
to_data[0] = AGG(input_size, from_data[0]).aggall(from_data);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!last_results.equal(new_input_shape.GetDims(), reduced_axes)) {
|
||||
NoTransposePrepareForReduce(new_input_shape, reduced_axes, last_results);
|
||||
if (last_results.last_loop_red_size == 0 || last_results.last_loop_size == 0)
|
||||
return;
|
||||
}
|
||||
int64_t denominator = last_results.last_loop_red_size * last_results.projected_index.size();
|
||||
|
||||
if (AGG::two_loops()) {
|
||||
auto fn = [&](std::ptrdiff_t first, std::ptrdiff_t end) {
|
||||
int64_t loop;
|
||||
const T* loop_red_ptr;
|
||||
const T* loop_red_ptr_end;
|
||||
int64_t current_index = first * last_results.last_loop_size;
|
||||
for (int64_t main_index = first; main_index < end; ++main_index) {
|
||||
for (loop = 0; loop < last_results.last_loop_size; ++loop, ++current_index) {
|
||||
int64_t origin = last_results.unprojected_index[main_index] + loop * last_results.last_loop_inc;
|
||||
AGG accumulator(denominator, from_data[origin + last_results.projected_index[0]]);
|
||||
for (auto it = last_results.projected_index.begin(); it != last_results.projected_index.end(); ++it) {
|
||||
loop_red_ptr = from_data + (origin + *it);
|
||||
loop_red_ptr_end = loop_red_ptr + last_results.last_loop_red_size * last_results.last_loop_red_inc;
|
||||
for (; loop_red_ptr != loop_red_ptr_end; loop_red_ptr += last_results.last_loop_red_inc) {
|
||||
accumulator.update0(*loop_red_ptr);
|
||||
}
|
||||
}
|
||||
for (auto it = last_results.projected_index.begin(); it != last_results.projected_index.end(); ++it) {
|
||||
loop_red_ptr = from_data + (origin + *it);
|
||||
loop_red_ptr_end = loop_red_ptr + last_results.last_loop_red_size * last_results.last_loop_red_inc;
|
||||
for (; loop_red_ptr != loop_red_ptr_end; loop_red_ptr += last_results.last_loop_red_inc) {
|
||||
accumulator.update(*loop_red_ptr);
|
||||
}
|
||||
}
|
||||
to_data[current_index] = accumulator.get_value();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
auto cost = TensorOpCost{(double)(last_results.projected_index.size() * sizeof(T) * last_results.last_loop_size * last_results.last_loop_red_size),
|
||||
(double)last_results.last_loop_size * last_results.last_loop_red_size,
|
||||
(double)last_results.projected_index.size() * last_results.last_loop_size * last_results.last_loop_red_size * 2};
|
||||
concurrency::ThreadPool::TryParallelFor(tp, count / last_results.last_loop_size, cost, fn);
|
||||
} else {
|
||||
auto fn = [&](std::ptrdiff_t first, std::ptrdiff_t end) {
|
||||
int64_t loop;
|
||||
const T* loop_red_ptr;
|
||||
const T* loop_red_ptr_end;
|
||||
int64_t current_index = first * last_results.last_loop_size;
|
||||
for (int64_t main_index = first; main_index < end; ++main_index) {
|
||||
for (loop = 0; loop < last_results.last_loop_size; ++loop, ++current_index) {
|
||||
int64_t origin = last_results.unprojected_index[main_index] + loop * last_results.last_loop_inc;
|
||||
AGG accumulator(denominator, from_data[origin + last_results.projected_index[0]]);
|
||||
for (auto it = last_results.projected_index.begin(); it != last_results.projected_index.end(); ++it) {
|
||||
loop_red_ptr = from_data + (origin + *it);
|
||||
loop_red_ptr_end = loop_red_ptr + last_results.last_loop_red_size * last_results.last_loop_red_inc;
|
||||
for (; loop_red_ptr != loop_red_ptr_end; loop_red_ptr += last_results.last_loop_red_inc) {
|
||||
accumulator.update(*loop_red_ptr);
|
||||
}
|
||||
}
|
||||
to_data[current_index] = accumulator.get_value();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
auto cost = TensorOpCost{(double)(last_results.projected_index.size() * sizeof(T) * last_results.last_loop_size * last_results.last_loop_red_size),
|
||||
(double)last_results.last_loop_size * last_results.last_loop_red_size,
|
||||
(double)last_results.projected_index.size() * last_results.last_loop_size * last_results.last_loop_red_size};
|
||||
concurrency::ThreadPool::TryParallelFor(tp, count / last_results.last_loop_size, cost, fn);
|
||||
}
|
||||
}
|
||||
|
||||
// Appends to `dropped_axes` every entry of `input_shape` whose position is NOT listed in `axes`,
// keeping the original order. Despite its name, `dropped_axes` receives the dimension values
// that remain after the listed axes are removed. Existing content of `dropped_axes` is kept.
//
// A separate mask records the positions to remove, so a dimension whose value happens to be -1
// is preserved. Axes outside [0, input_shape.size()) are ignored instead of being written
// through out of bounds.
void DropDimensions(const std::vector<int64_t>& input_shape, const std::vector<int64_t>& axes, std::vector<int64_t>& dropped_axes) {
  const size_t rank = input_shape.size();
  std::vector<char> removed(rank, 0);
  for (int64_t axis : axes) {
    if (axis >= 0 && static_cast<size_t>(axis) < rank) {
      removed[static_cast<size_t>(axis)] = 1;
    }
  }
  dropped_axes.reserve(dropped_axes.size() + rank);
  for (size_t i = 0; i < rank; ++i) {
    if (!removed[i]) {
      dropped_axes.push_back(input_shape[i]);
    }
  }
}
|
||||
|
||||
template <typename T, typename AGG>
|
||||
void CommonReduce(OpKernelContext* ctx,
|
||||
const std::vector<int64_t> axes_, int64_t keepdims_,
|
||||
ResultsNoTransposePrepareForReduce& last_results,
|
||||
bool noop_with_empty_axes) {
|
||||
std::vector<int64_t> axes;
|
||||
const Tensor* input = ctx->Input<Tensor>(0);
|
||||
auto reduced_dims = input->Shape().GetDims();
|
||||
std::vector<int64_t> output_shape;
|
||||
bool empty_reduce;
|
||||
TensorShape new_input_shape;
|
||||
|
||||
if (ctx->InputCount() == 2) {
|
||||
// second input holds the axes.
|
||||
const Tensor* axes_tensor = ctx->Input<Tensor>(1);
|
||||
ORT_ENFORCE(axes_tensor != nullptr, "Axes input is null");
|
||||
ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1,
|
||||
"An axes tensor must be a vector tensor.");
|
||||
auto nDims = static_cast<size_t>(axes_tensor->Shape()[0]);
|
||||
const auto* data = axes_tensor->template Data<int64_t>();
|
||||
std::vector<int64_t> input_axes(data, data + nDims);
|
||||
if (input_axes.empty() && noop_with_empty_axes) {
|
||||
auto* output = ctx->Output(0, input->Shape());
|
||||
memcpy(output->template MutableData<typename AGG::value_type>(), input->template Data<T>(), input->SizeInBytes());
|
||||
return;
|
||||
}
|
||||
SetupForReduce(input, input_axes, axes, new_input_shape, output_shape, empty_reduce, nullptr);
|
||||
} else {
|
||||
SetupForReduce(input, axes_, axes, new_input_shape, output_shape, empty_reduce, nullptr);
|
||||
}
|
||||
|
||||
if (empty_reduce) {
|
||||
Tensor* output = ctx->Output(0, keepdims_ ? output_shape : std::vector<int64_t>());
|
||||
if (new_input_shape.Size() == 1) {
|
||||
const T* from_data = input->template Data<T>();
|
||||
typename AGG::value_type* to_data = output->template MutableData<typename AGG::value_type>();
|
||||
AGG agg(1, *from_data);
|
||||
if (agg.two_loops()) {
|
||||
agg.update0(*from_data);
|
||||
agg.update(*from_data);
|
||||
} else {
|
||||
agg.update(*from_data);
|
||||
}
|
||||
*to_data = agg.get_value();
|
||||
} else {
|
||||
ORT_ENFORCE(keepdims_,
|
||||
"Can't reduce on dim with value of 0 if 'keepdims' is false. "
|
||||
"Invalid output shape would be produced. input_shape:",
|
||||
input->Shape());
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
Tensor* output;
|
||||
if (keepdims_) {
|
||||
output = ctx->Output(0, output_shape);
|
||||
} else {
|
||||
std::vector<int64_t> dropped_axes;
|
||||
DropDimensions(output_shape, axes, dropped_axes);
|
||||
output = ctx->Output(0, dropped_axes);
|
||||
}
|
||||
NoTransposeReduce<T, AGG>(output, new_input_shape, *input, axes, ctx->GetOperatorThreadPool(), last_results);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Status ReduceL1<T>::Compute(OpKernelContext* ctx) const {
|
||||
FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
|
||||
int64_t block_size;
|
||||
int64_t blocks;
|
||||
std::vector<int64_t> reduced_dims;
|
||||
|
||||
const Tensor* input = ctx->Input<Tensor>(0);
|
||||
|
||||
bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true);
|
||||
|
||||
Tensor* reduced = ctx->Output(0, reduced_dims);
|
||||
|
||||
T* output_data = reduced->template MutableData<T>();
|
||||
|
||||
if (no_transpose) {
|
||||
const T* input_data = input->template Data<T>();
|
||||
|
||||
for (int64_t i = 0; i < block_size; ++i) {
|
||||
output_data[i] = ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).cwiseAbs().sum();
|
||||
}
|
||||
} else {
|
||||
EigenVectorMap<T> out_vec(output_data, block_size);
|
||||
out_vec = ConstEigenMatrixMap<T>(&transposed_input_data[0], block_size, blocks).cwiseAbs().rowwise().sum();
|
||||
}
|
||||
|
||||
// The following variable does not change if the input tensor and the
|
||||
// axes do not either. It could be either cached in ctx or precomputed
|
||||
// in the constructor if shape and axes are known at this stage.
|
||||
ResultsNoTransposePrepareForReduce last_results;
|
||||
CommonReduce<T, ReduceAggregatorL1<T>>(ctx, axes_, keepdims_, last_results);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Status ReduceL2<T>::Compute(OpKernelContext* ctx) const {
|
||||
FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
|
||||
int64_t block_size;
|
||||
int64_t blocks;
|
||||
std::vector<int64_t> reduced_dims;
|
||||
|
||||
const Tensor* input = ctx->Input<Tensor>(0);
|
||||
|
||||
bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true);
|
||||
|
||||
Tensor* reduced = ctx->Output(0, reduced_dims);
|
||||
|
||||
T* output_data = reduced->template MutableData<T>();
|
||||
|
||||
if (no_transpose) {
|
||||
const T* input_data = input->template Data<T>();
|
||||
|
||||
for (int64_t i = 0; i < block_size; ++i) {
|
||||
output_data[i] = ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).norm();
|
||||
}
|
||||
} else {
|
||||
EigenVectorMap<T> out_vec(output_data, block_size);
|
||||
out_vec = ConstEigenMatrixMap<T>(&transposed_input_data[0], block_size, blocks).rowwise().norm();
|
||||
}
|
||||
|
||||
ResultsNoTransposePrepareForReduce last_results;
|
||||
CommonReduce<T, ReduceAggregatorL2<T>>(ctx, axes_, keepdims_, last_results);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Status ReduceLogSum<T>::Compute(OpKernelContext* ctx) const {
|
||||
FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
|
||||
int64_t block_size;
|
||||
int64_t blocks;
|
||||
std::vector<int64_t> reduced_dims;
|
||||
|
||||
const Tensor* input = ctx->Input<Tensor>(0);
|
||||
|
||||
bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true);
|
||||
|
||||
Tensor* reduced = ctx->Output(0, reduced_dims);
|
||||
|
||||
T* output_data = reduced->template MutableData<T>();
|
||||
|
||||
if (no_transpose) {
|
||||
const T* input_data = input->template Data<T>();
|
||||
|
||||
for (int64_t i = 0; i < block_size; ++i) {
|
||||
output_data[i] = ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).sum();
|
||||
}
|
||||
} else {
|
||||
EigenVectorMap<T> out_vec(output_data, block_size);
|
||||
out_vec = ConstEigenMatrixMap<T>(&transposed_input_data[0], block_size, blocks).rowwise().sum();
|
||||
}
|
||||
|
||||
for (int j = 0; j < block_size; ++j) {
|
||||
*(output_data) = static_cast<T>(std::log(*(output_data)));
|
||||
++output_data;
|
||||
}
|
||||
|
||||
ResultsNoTransposePrepareForReduce last_results;
|
||||
CommonReduce<T, ReduceAggregatorLogSum<T>>(ctx, axes_, keepdims_, last_results);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Status ReduceLogSumExp<T>::Compute(OpKernelContext* ctx) const {
|
||||
FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
|
||||
int64_t block_size;
|
||||
int64_t blocks;
|
||||
std::vector<int64_t> reduced_dims;
|
||||
const Tensor* input = ctx->Input<Tensor>(0);
|
||||
|
||||
PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims);
|
||||
|
||||
Tensor* reduced = ctx->Output(0, reduced_dims);
|
||||
|
||||
T* output_data = reduced->template MutableData<T>();
|
||||
|
||||
for (int j = 0; j < block_size; ++j) {
|
||||
T max_value = std::numeric_limits<T>::lowest();
|
||||
for (int i = 0; i < blocks; ++i) {
|
||||
max_value = std::max(max_value, transposed_input_data[i * block_size + j]);
|
||||
}
|
||||
T scaled_exp_sum = 0;
|
||||
for (int i = 0; i < blocks; ++i) {
|
||||
scaled_exp_sum += static_cast<T>(std::exp(transposed_input_data[i * block_size + j] - max_value));
|
||||
}
|
||||
*(output_data++) = static_cast<T>(std::log(scaled_exp_sum) + max_value);
|
||||
}
|
||||
ResultsNoTransposePrepareForReduce last_results;
|
||||
CommonReduce<T, ReduceAggregatorLogSumExp<T>>(ctx, axes_, keepdims_, last_results);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Status ReduceMax<T>::Compute(OpKernelContext* ctx) const {
|
||||
FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
|
||||
int64_t block_size;
|
||||
int64_t blocks;
|
||||
std::vector<int64_t> reduced_dims;
|
||||
const Tensor* input = ctx->Input<Tensor>(0);
|
||||
|
||||
bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true);
|
||||
|
||||
Tensor* reduced = ctx->Output(0, reduced_dims);
|
||||
|
||||
T* output_data = reduced->template MutableData<T>();
|
||||
|
||||
if (no_transpose) {
|
||||
const T* input_data = input->template Data<T>();
|
||||
|
||||
for (int64_t i = 0; i < block_size; ++i) {
|
||||
output_data[i] = ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).maxCoeff();
|
||||
}
|
||||
} else {
|
||||
EigenVectorMap<T> out_vec(output_data, block_size);
|
||||
out_vec = ConstEigenMatrixMap<T>(&transposed_input_data[0], block_size, blocks).rowwise().maxCoeff();
|
||||
}
|
||||
|
||||
ResultsNoTransposePrepareForReduce last_results;
|
||||
CommonReduce<T, ReduceAggregatorMax<T>>(ctx, axes_, keepdims_, last_results);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Status ReduceMean<T>::Compute(OpKernelContext* ctx) const {
|
||||
FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
|
||||
int64_t block_size;
|
||||
int64_t blocks;
|
||||
std::vector<int64_t> reduced_dims;
|
||||
const Tensor* input = ctx->Input<Tensor>(0);
|
||||
|
||||
bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true);
|
||||
|
||||
Tensor* reduced = ctx->Output(0, reduced_dims);
|
||||
|
||||
T* output_data = reduced->template MutableData<T>();
|
||||
|
||||
if (no_transpose) {
|
||||
const T* input_data = ctx->Input<Tensor>(0)->template Data<T>();
|
||||
auto lambda = [input_data, blocks, output_data](ptrdiff_t i) {
|
||||
output_data[i] = ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).mean();
|
||||
};
|
||||
concurrency::ThreadPool::TryBatchParallelFor(ctx->GetOperatorThreadPool(), block_size, lambda, 0);
|
||||
} else {
|
||||
EigenVectorMap<T> out_vec(output_data, block_size);
|
||||
out_vec = ConstEigenMatrixMap<T>(&transposed_input_data[0], block_size, blocks).rowwise().mean();
|
||||
}
|
||||
|
||||
ResultsNoTransposePrepareForReduce last_results;
|
||||
CommonReduce<T, ReduceAggregatorMean<T>>(ctx, axes_, keepdims_, last_results);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Status ReduceMin<T>::Compute(OpKernelContext* ctx) const {
|
||||
FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
|
||||
int64_t block_size;
|
||||
int64_t blocks;
|
||||
std::vector<int64_t> reduced_dims;
|
||||
const Tensor* input = ctx->Input<Tensor>(0);
|
||||
|
||||
bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true);
|
||||
|
||||
Tensor* reduced = ctx->Output(0, reduced_dims);
|
||||
|
||||
T* output_data = reduced->template MutableData<T>();
|
||||
|
||||
if (no_transpose) {
|
||||
const T* input_data = input->template Data<T>();
|
||||
|
||||
for (int64_t i = 0; i < block_size; ++i) {
|
||||
output_data[i] = ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).minCoeff();
|
||||
}
|
||||
} else {
|
||||
EigenVectorMap<T> out_vec(output_data, block_size);
|
||||
out_vec = ConstEigenMatrixMap<T>(&transposed_input_data[0], block_size, blocks).rowwise().minCoeff();
|
||||
}
|
||||
|
||||
ResultsNoTransposePrepareForReduce last_results;
|
||||
CommonReduce<T, ReduceAggregatorMin<T>>(ctx, axes_, keepdims_, last_results);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Status ReduceProd<T>::Compute(OpKernelContext* ctx) const {
|
||||
FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
|
||||
int64_t block_size;
|
||||
int64_t blocks;
|
||||
std::vector<int64_t> reduced_dims;
|
||||
const Tensor* input = ctx->Input<Tensor>(0);
|
||||
|
||||
bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true);
|
||||
|
||||
Tensor* reduced = ctx->Output(0, reduced_dims);
|
||||
|
||||
T* output_data = reduced->template MutableData<T>();
|
||||
|
||||
if (no_transpose) {
|
||||
const T* input_data = input->template Data<T>();
|
||||
|
||||
for (int64_t i = 0; i < block_size; ++i) {
|
||||
output_data[i] = ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).prod();
|
||||
}
|
||||
} else {
|
||||
EigenVectorMap<T> out_vec(output_data, block_size);
|
||||
out_vec = ConstEigenMatrixMap<T>(&transposed_input_data[0], block_size, blocks).rowwise().prod();
|
||||
}
|
||||
|
||||
ResultsNoTransposePrepareForReduce last_results;
|
||||
CommonReduce<T, ReduceAggregatorProd<T>>(ctx, axes_, keepdims_, last_results);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void ReduceSumCore(const T* input_data, T* output_data, bool no_transpose,
|
||||
int64_t blocks, int64_t block_size, FastAllocVector<T>& transposed_input_data,
|
||||
concurrency::ThreadPool* tp) {
|
||||
if (no_transpose) {
|
||||
auto lambda = [input_data, blocks, output_data](ptrdiff_t i) {
|
||||
// The ConstEigenMatrixMap type is expanded to work around a MS compiler issue
|
||||
output_data[i] = Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(input_data + (i * blocks), blocks).sum();
|
||||
};
|
||||
concurrency::ThreadPool::TryBatchParallelFor(tp, block_size, lambda, 0);
|
||||
} else {
|
||||
EigenVectorMap<T> out_vec(output_data, block_size);
|
||||
out_vec = ConstEigenMatrixMap<T>(&transposed_input_data[0], block_size, blocks).rowwise().sum();
|
||||
}
|
||||
Status ReduceSum<T>::Compute(OpKernelContext* ctx) const {
|
||||
ResultsNoTransposePrepareForReduce last_results;
|
||||
CommonReduce<T, ReduceAggregatorSum<T>>(ctx, axes_, keepdims_, last_results);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Tensor ReduceSum<T>::Impl(const Tensor& input, const std::vector<int64_t>& reduce_axes,
|
||||
AllocatorPtr allocator, concurrency::ThreadPool* tp, bool keep_dims,
|
||||
const TensorShape* input_shape_override) {
|
||||
FastAllocVector<T> transposed_input_data(allocator);
|
||||
int64_t block_size;
|
||||
int64_t blocks;
|
||||
std::vector<int64_t> reduced_dims;
|
||||
std::vector<int64_t> axes;
|
||||
auto reduced_dims = input.Shape().GetDims();
|
||||
std::vector<int64_t> output_shape;
|
||||
TensorShape new_input_shape;
|
||||
bool empty_reduce;
|
||||
SetupForReduce(&input, reduce_axes, axes, new_input_shape, output_shape, empty_reduce, input_shape_override);
|
||||
|
||||
bool no_transpose = PrepareForReduce<T>(&input, transposed_input_data, block_size, blocks,
|
||||
reduce_axes, keep_dims, reduced_dims, true, input_shape_override);
|
||||
if (empty_reduce) {
|
||||
Tensor output(input.DataType(), keep_dims ? output_shape : std::vector<int64_t>(), allocator);
|
||||
if (new_input_shape.Size() == 1) {
|
||||
const T* from_data = input.template Data<T>();
|
||||
T* to_data = output.template MutableData<T>();
|
||||
*to_data = *from_data;
|
||||
} else {
|
||||
ORT_ENFORCE(keep_dims,
|
||||
"Can't reduce on dim with value of 0 if 'keepdims' is false. "
|
||||
"Invalid output shape would be produced. input_shape:",
|
||||
new_input_shape);
|
||||
}
|
||||
return output;
|
||||
}
|
||||
|
||||
Tensor output(input.DataType(), reduced_dims, allocator);
|
||||
|
||||
ReduceSumCore(input.template Data<T>(), output.template MutableData<T>(),
|
||||
no_transpose, blocks, block_size, transposed_input_data, tp);
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Status ReduceSum<T>::Compute(OpKernelContext* ctx) const {
|
||||
FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
|
||||
int64_t block_size;
|
||||
int64_t blocks;
|
||||
std::vector<int64_t> reduced_dims;
|
||||
const Tensor* input = ctx->Input<Tensor>(0);
|
||||
|
||||
bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true);
|
||||
|
||||
auto* output = ctx->Output(0, reduced_dims);
|
||||
|
||||
ReduceSumCore(input->template Data<T>(), output->template MutableData<T>(),
|
||||
no_transpose, blocks, block_size, transposed_input_data, ctx->GetOperatorThreadPool());
|
||||
|
||||
return Status::OK();
|
||||
if (keep_dims) {
|
||||
ResultsNoTransposePrepareForReduce last_results;
|
||||
Tensor output(input.DataType(), output_shape, allocator);
|
||||
NoTransposeReduce<T, ReduceAggregatorSum<T>>(&output, new_input_shape, input, axes, tp, last_results);
|
||||
return output;
|
||||
} else {
|
||||
ResultsNoTransposePrepareForReduce last_results;
|
||||
std::vector<int64_t> dropped_axes;
|
||||
DropDimensions(output_shape, axes, dropped_axes);
|
||||
Tensor output(input.DataType(), dropped_axes, allocator);
|
||||
NoTransposeReduce<T, ReduceAggregatorSum<T>>(&output, new_input_shape, input, axes, tp, last_results);
|
||||
return output;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Status ReduceSumSquare<T>::Compute(OpKernelContext* ctx) const {
|
||||
FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
|
||||
int64_t block_size;
|
||||
int64_t blocks;
|
||||
std::vector<int64_t> reduced_dims;
|
||||
const Tensor* input = ctx->Input<Tensor>(0);
|
||||
|
||||
bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true);
|
||||
|
||||
Tensor* reduced = ctx->Output(0, reduced_dims);
|
||||
|
||||
T* output_data = reduced->template MutableData<T>();
|
||||
|
||||
if (no_transpose) {
|
||||
const T* input_data = input->template Data<T>();
|
||||
|
||||
for (int64_t i = 0; i < block_size; ++i) {
|
||||
output_data[i] = ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).squaredNorm();
|
||||
}
|
||||
} else {
|
||||
EigenVectorMap<T> out_vec(output_data, block_size);
|
||||
out_vec = ConstEigenMatrixMap<T>(&transposed_input_data[0], block_size, blocks).rowwise().squaredNorm();
|
||||
}
|
||||
|
||||
ResultsNoTransposePrepareForReduce last_results;
|
||||
CommonReduce<T, ReduceAggregatorSumSquare<T>>(ctx, axes_, keepdims_, last_results);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Status ArgMax<T>::Compute(OpKernelContext* ctx) const {
|
||||
FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
|
||||
int64_t block_size;
|
||||
int64_t blocks;
|
||||
|
||||
std::vector<int64_t> reduced_dims;
|
||||
const Tensor* input = ctx->Input<Tensor>(0);
|
||||
|
||||
bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true);
|
||||
|
||||
Tensor* reduced = ctx->Output(0, reduced_dims);
|
||||
int64_t* output_data = reduced->template MutableData<int64_t>();
|
||||
Eigen::MatrixXf::Index maxIndex;
|
||||
|
||||
if (no_transpose) {
|
||||
const T* input_data = ctx->Input<Tensor>(0)->template Data<T>();
|
||||
if (select_last_index_) {
|
||||
assert(blocks > 0);
|
||||
for (int64_t i = 0; i < block_size; ++i) {
|
||||
gsl::span<const T> row(input_data, blocks);
|
||||
auto first = row.cbegin();
|
||||
auto const end = row.cend();
|
||||
auto max_el = first;
|
||||
while (++first < end) {
|
||||
if (*first >= *max_el) {
|
||||
max_el = first;
|
||||
}
|
||||
}
|
||||
*(output_data++) = max_el - row.cbegin();
|
||||
input_data += blocks;
|
||||
}
|
||||
} else {
|
||||
for (int64_t i = 0; i < block_size; ++i) {
|
||||
ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).maxCoeff(&maxIndex);
|
||||
*(output_data++) = maxIndex;
|
||||
}
|
||||
}
|
||||
ResultsNoTransposePrepareForReduce last_results;
|
||||
if (select_last_index_) {
|
||||
CommonReduce<T, ReduceAggregatorArgMaxLastIndex<T>>(ctx, axes_, keepdims_, last_results);
|
||||
} else {
|
||||
auto matrixData = ConstEigenMatrixMap<T>(&transposed_input_data[0], block_size, blocks);
|
||||
if (select_last_index_) {
|
||||
for (int i = 0; i < block_size; ++i) {
|
||||
int idx = 0;
|
||||
T max_val = matrixData(i, 0);
|
||||
for (int c = 1; c < blocks; ++c) {
|
||||
auto val = matrixData(i, c);
|
||||
if (val >= max_val) {
|
||||
idx = c;
|
||||
max_val = val;
|
||||
}
|
||||
}
|
||||
*(output_data++) = idx;
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < block_size; ++i) {
|
||||
matrixData.row(i).maxCoeff(&maxIndex);
|
||||
*(output_data++) = maxIndex;
|
||||
}
|
||||
}
|
||||
CommonReduce<T, ReduceAggregatorArgMax<T>>(ctx, axes_, keepdims_, last_results);
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Status ArgMin<T>::Compute(OpKernelContext* ctx) const {
|
||||
FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
|
||||
int64_t block_size;
|
||||
int64_t blocks;
|
||||
|
||||
std::vector<int64_t> reduced_dims;
|
||||
const Tensor* input = ctx->Input<Tensor>(0);
|
||||
|
||||
bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true);
|
||||
|
||||
Tensor* reduced = ctx->Output(0, reduced_dims);
|
||||
int64_t* output_data = reduced->template MutableData<int64_t>();
|
||||
Eigen::MatrixXf::Index minIndex;
|
||||
|
||||
if (no_transpose) {
|
||||
const T* input_data = ctx->Input<Tensor>(0)->template Data<T>();
|
||||
if (select_last_index_) {
|
||||
assert(blocks > 0);
|
||||
for (int64_t i = 0; i < block_size; ++i) {
|
||||
gsl::span<const T> row(input_data, blocks);
|
||||
auto first = row.cbegin();
|
||||
auto const end = row.cend();
|
||||
auto min_el = first;
|
||||
while (++first < end) {
|
||||
if (*first <= *min_el) {
|
||||
min_el = first;
|
||||
}
|
||||
}
|
||||
*(output_data++) = min_el - row.cbegin();
|
||||
input_data += blocks;
|
||||
}
|
||||
} else {
|
||||
for (int64_t i = 0; i < block_size; ++i) {
|
||||
ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).minCoeff(&minIndex);
|
||||
*(output_data++) = minIndex;
|
||||
}
|
||||
}
|
||||
ResultsNoTransposePrepareForReduce last_results;
|
||||
if (select_last_index_) {
|
||||
CommonReduce<T, ReduceAggregatorArgMinLastIndex<T>>(ctx, axes_, keepdims_, last_results);
|
||||
} else {
|
||||
auto matrixData = ConstEigenMatrixMap<T>(&transposed_input_data[0], block_size, blocks);
|
||||
if (select_last_index_) {
|
||||
for (int i = 0; i < block_size; ++i) {
|
||||
int idx = 0;
|
||||
T min_val = matrixData(i, 0);
|
||||
for (int c = 1; c < blocks; ++c) {
|
||||
auto val = matrixData(i, c);
|
||||
if (val <= min_val) {
|
||||
idx = c;
|
||||
min_val = val;
|
||||
}
|
||||
}
|
||||
*(output_data++) = idx;
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < block_size; ++i) {
|
||||
matrixData.row(i).minCoeff(&minIndex);
|
||||
*(output_data++) = minIndex;
|
||||
}
|
||||
}
|
||||
CommonReduce<T, ReduceAggregatorArgMin<T>>(ctx, axes_, keepdims_, last_results);
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
|
|
@ -814,14 +605,4 @@ template class ReduceSum<int32_t>;
|
|||
template class ReduceSum<double>;
|
||||
template class ReduceSum<int64_t>;
|
||||
|
||||
#define REGISTER_REDUCESUMCORE_TYPED(T) \
|
||||
template void ReduceSumCore<T>(const T* input_data, T* output_data, bool no_transpose, \
|
||||
int64_t blocks, int64_t block_size, FastAllocVector<T>& transposed_input_data, \
|
||||
concurrency::ThreadPool* tp);
|
||||
|
||||
REGISTER_REDUCESUMCORE_TYPED(float)
|
||||
REGISTER_REDUCESUMCORE_TYPED(double)
|
||||
REGISTER_REDUCESUMCORE_TYPED(int32_t)
|
||||
REGISTER_REDUCESUMCORE_TYPED(int64_t)
|
||||
|
||||
} // namespace onnxruntime
|
||||
|
|
|
|||
|
|
@ -10,24 +10,305 @@
|
|||
#include "core/providers/cpu/containers.h"
|
||||
#include "core/util/math_cpuonly.h"
|
||||
#include "core/platform/threadpool.h"
|
||||
#include <cmath>
|
||||
|
||||
namespace onnxruntime {
|
||||
|
||||
template <typename T>
|
||||
bool PrepareForReduce(const Tensor* input_tensor_ptr,
|
||||
FastAllocVector<T>& transposed_input_data,
|
||||
int64_t& block_size,
|
||||
int64_t& blocks,
|
||||
const std::vector<int64_t>& axes_,
|
||||
bool keepdims_,
|
||||
/*out*/ std::vector<int64_t>& reduced_dims,
|
||||
bool check_no_transpose = false,
|
||||
const TensorShape* input_shape_override = nullptr);
|
||||
// Scratch results of the no-transpose reduction preparation step.
//
// `input_shape` and `reduced_axes` record the configuration the remaining
// fields were computed for; `equal()` lets a caller test whether previously
// computed results match a new configuration.
class ResultsNoTransposePrepareForReduce {
 public:
  std::vector<int64_t> input_shape;
  std::vector<int64_t> reduced_axes;
  std::vector<int64_t> projected_index;
  // Scalars are zero-initialised so a default-constructed object never exposes
  // indeterminate values.
  int64_t last_loop_red_size = 0;
  int64_t last_loop_red_inc = 0;
  std::vector<int64_t> unprojected_index;
  int64_t last_loop_size = 0;
  int64_t last_loop_inc = 0;

  // Returns true when both the stored input shape and the stored reduced axes
  // are element-wise identical to the given ones (sizes included).
  bool equal(const std::vector<int64_t>& local_input_shape, const std::vector<int64_t>& local_reduced_axes) const {
    return input_shape == local_input_shape && reduced_axes == local_reduced_axes;
  }
};
|
||||
|
||||
template <typename T>
|
||||
void ReduceSumCore(const T* input_data, T* output_data, bool no_transpose,
|
||||
int64_t blocks, int64_t block_size, FastAllocVector<T>& transposed_input_data,
|
||||
concurrency::ThreadPool* tp);
|
||||
// Scalar math helpers used by the reduce aggregators. The integer
// specialisations compute in double precision and truncate the result back to
// the integer type.

// Square root of `value`. The primary template needs its own
// `template <typename T>` header; without it `T` is undeclared and the
// specialisations below have no primary template to specialise.
template <typename T>
inline T reduce_sqrt(T value) { return std::sqrt(value); }

template <>
inline int64_t reduce_sqrt<int64_t>(int64_t value) { return static_cast<int64_t>(std::sqrt(static_cast<double>(value))); }

template <>
inline int32_t reduce_sqrt<int32_t>(int32_t value) { return static_cast<int32_t>(std::sqrt(static_cast<double>(value))); }

// Natural logarithm of `value`.
template <typename T>
inline T reduce_log(T value) { return static_cast<T>(std::log(value)); }

template <>
inline int64_t reduce_log<int64_t>(int64_t value) { return static_cast<int64_t>(std::log(static_cast<double>(value))); }

template <>
inline int32_t reduce_log<int32_t>(int32_t value) { return static_cast<int32_t>(std::log(static_cast<double>(value))); }

// e raised to `value`, converted back to T.
template <typename T>
inline T reduce_exp(T value) { return static_cast<T>(std::exp(value)); }
|
||||
|
||||
template <typename T, typename TVAL = T>
|
||||
class ReduceAggregator {
|
||||
public:
|
||||
typedef TVAL value_type;
|
||||
|
||||
protected:
|
||||
int64_t N_;
|
||||
T accumulator_;
|
||||
|
||||
public:
|
||||
inline ReduceAggregator(int64_t N, const T& init) {
|
||||
N_ = N;
|
||||
accumulator_ = init;
|
||||
}
|
||||
inline void update(const T&) { ORT_ENFORCE(false, "must be overloaded."); }
|
||||
inline void update0(const T&) { ORT_ENFORCE(false, "must be overloaded."); }
|
||||
inline TVAL aggall(const T*) { ORT_ENFORCE(false, "must be overloaded."); }
|
||||
inline TVAL get_value() { return accumulator_; }
|
||||
inline void enforce(const ResultsNoTransposePrepareForReduce&) {}
|
||||
static inline bool two_loops() { return false; }
|
||||
};
|
||||
|
||||
template <typename T, typename TVAL = T>
|
||||
class ReduceAggregatorSum : public ReduceAggregator<T, TVAL> {
|
||||
public:
|
||||
inline ReduceAggregatorSum(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 0) {}
|
||||
inline void update(const T& v) { this->accumulator_ += v; }
|
||||
inline TVAL aggall(const T* from_data) {
|
||||
return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).sum();
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, typename TVAL = T>
|
||||
class ReduceAggregatorSumSquare : public ReduceAggregator<T, TVAL> {
|
||||
public:
|
||||
inline ReduceAggregatorSumSquare(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 0) {}
|
||||
inline TVAL aggall(const T* from_data) {
|
||||
return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).squaredNorm();
|
||||
}
|
||||
inline void update(const T& v) { this->accumulator_ += v * v; }
|
||||
};
|
||||
|
||||
template <typename T, typename TVAL = T>
|
||||
class ReduceAggregatorMean : public ReduceAggregatorSum<T, TVAL> {
|
||||
public:
|
||||
inline ReduceAggregatorMean(int64_t N, const T&) : ReduceAggregatorSum<T, TVAL>(N, 0) {}
|
||||
inline T aggall(const T* from_data) {
|
||||
return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).mean();
|
||||
}
|
||||
inline T get_value() { return this->accumulator_ / static_cast<T>(this->N_); }
|
||||
};
|
||||
|
||||
template <typename T, typename TVAL = T>
|
||||
class ReduceAggregatorMax : public ReduceAggregator<T, TVAL> {
|
||||
public:
|
||||
inline ReduceAggregatorMax(int64_t N, const T& init) : ReduceAggregator<T, TVAL>(N, init) {}
|
||||
inline TVAL aggall(const T* from_data) {
|
||||
return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).maxCoeff();
|
||||
}
|
||||
inline void update(const T& v) { this->accumulator_ = v > this->accumulator_ ? v : this->accumulator_; }
|
||||
};
|
||||
|
||||
template <typename T, typename TVAL = int64_t>
|
||||
class ReduceAggregatorArgMinMax : public ReduceAggregator<T, TVAL> {
|
||||
protected:
|
||||
int64_t arg_;
|
||||
int64_t index_;
|
||||
|
||||
public:
|
||||
inline ReduceAggregatorArgMinMax(int64_t N, const T& init) : ReduceAggregator<T, TVAL>(N, init) {
|
||||
arg_ = 0;
|
||||
index_ = 0;
|
||||
}
|
||||
inline TVAL get_value() { return arg_; }
|
||||
inline void enforce(const ResultsNoTransposePrepareForReduce& res) {
|
||||
ORT_ENFORCE(res.projected_index.size() == 0, "Only one axis is allowed for reduction.");
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, typename TVAL = int64_t>
|
||||
class ReduceAggregatorArgMax : public ReduceAggregatorArgMinMax<T, TVAL> {
|
||||
public:
|
||||
inline ReduceAggregatorArgMax(int64_t N, const T& init) : ReduceAggregatorArgMinMax<T, TVAL>(N, init) {}
|
||||
inline TVAL aggall(const T* from_data) {
|
||||
Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).maxCoeff(&this->arg_);
|
||||
return this->get_value();
|
||||
}
|
||||
inline void update(const T& v) {
|
||||
if (v > this->accumulator_) {
|
||||
this->accumulator_ = v;
|
||||
this->arg_ = this->index_;
|
||||
}
|
||||
++this->index_;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, typename TVAL = int64_t>
|
||||
class ReduceAggregatorArgMaxLastIndex : public ReduceAggregatorArgMax<T, TVAL> {
|
||||
public:
|
||||
inline ReduceAggregatorArgMaxLastIndex(int64_t N, const T& init) : ReduceAggregatorArgMax<T, TVAL>(N, init) {}
|
||||
inline TVAL aggall(const T* from_data) {
|
||||
for (int64_t i = 0; i < this->N_; ++i) {
|
||||
update(from_data[i]);
|
||||
}
|
||||
return this->get_value();
|
||||
}
|
||||
inline void update(const T& v) {
|
||||
if (v >= this->accumulator_) {
|
||||
this->accumulator_ = v;
|
||||
this->arg_ = this->index_;
|
||||
}
|
||||
++this->index_;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, typename TVAL = int64_t>
|
||||
class ReduceAggregatorArgMin : public ReduceAggregatorArgMinMax<T, TVAL> {
|
||||
public:
|
||||
inline ReduceAggregatorArgMin(int64_t N, const T& init) : ReduceAggregatorArgMinMax<T, TVAL>(N, init) {}
|
||||
inline TVAL aggall(const T* from_data) {
|
||||
Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).minCoeff(&this->arg_);
|
||||
return this->get_value();
|
||||
}
|
||||
inline void update(const T& v) {
|
||||
if (v < this->accumulator_) {
|
||||
this->accumulator_ = v;
|
||||
this->arg_ = this->index_;
|
||||
}
|
||||
++this->index_;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, typename TVAL = int64_t>
|
||||
class ReduceAggregatorArgMinLastIndex : public ReduceAggregatorArgMin<T, TVAL> {
|
||||
public:
|
||||
inline ReduceAggregatorArgMinLastIndex(int64_t N, const T& init) : ReduceAggregatorArgMin<T, TVAL>(N, init) {}
|
||||
inline TVAL aggall(const T* from_data) {
|
||||
for (int64_t i = 0; i < this->N_; ++i) {
|
||||
update(from_data[i]);
|
||||
}
|
||||
return this->get_value();
|
||||
}
|
||||
inline void update(const T& v) {
|
||||
if (v <= this->accumulator_) {
|
||||
this->accumulator_ = v;
|
||||
this->arg_ = this->index_;
|
||||
}
|
||||
++this->index_;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, typename TVAL = T>
|
||||
class ReduceAggregatorMin : public ReduceAggregator<T, TVAL> {
|
||||
public:
|
||||
inline ReduceAggregatorMin(int64_t N, const T& init) : ReduceAggregator<T, TVAL>(N, init) {}
|
||||
inline TVAL aggall(const T* from_data) {
|
||||
return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).minCoeff();
|
||||
}
|
||||
inline void update(const T& v) { this->accumulator_ = v < this->accumulator_ ? v : this->accumulator_; }
|
||||
};
|
||||
|
||||
template <typename T, typename TVAL = T>
|
||||
class ReduceAggregatorProd : public ReduceAggregator<T, TVAL> {
|
||||
public:
|
||||
inline ReduceAggregatorProd(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 1) {}
|
||||
inline TVAL aggall(const T* from_data) {
|
||||
return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).prod();
|
||||
}
|
||||
inline void update(const T& v) { this->accumulator_ *= v; }
|
||||
};
|
||||
|
||||
template <typename T, typename TVAL = T>
|
||||
class ReduceAggregatorL1 : public ReduceAggregator<T, TVAL> {
|
||||
public:
|
||||
inline ReduceAggregatorL1(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 0) {}
|
||||
inline TVAL aggall(const T* from_data) {
|
||||
return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).cwiseAbs().sum();
|
||||
}
|
||||
inline void update(const T& v) { this->accumulator_ += v > 0 ? v : -v; }
|
||||
};
|
||||
|
||||
template <typename T, typename TVAL = T>
|
||||
class ReduceAggregatorL2 : public ReduceAggregator<T, TVAL> {
|
||||
public:
|
||||
inline ReduceAggregatorL2(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 0) {}
|
||||
inline TVAL aggall(const T* from_data) {
|
||||
return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).norm();
|
||||
}
|
||||
inline void update(const T& v) { this->accumulator_ += v * v; }
|
||||
inline TVAL get_value() { return reduce_sqrt<T>(this->accumulator_); }
|
||||
};
|
||||
|
||||
template <typename T, typename TVAL = T>
|
||||
class ReduceAggregatorLogSum : public ReduceAggregator<T, TVAL> {
|
||||
public:
|
||||
inline ReduceAggregatorLogSum(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 0) {}
|
||||
inline T aggall(const T* from_data) {
|
||||
return reduce_log<T>(Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).sum());
|
||||
}
|
||||
inline void update(const T& v) { this->accumulator_ += v; }
|
||||
inline TVAL get_value() { return reduce_log<T>(this->accumulator_); }
|
||||
};
|
||||
|
||||
template <typename T, typename TVAL = T>
|
||||
class ReduceAggregatorLogSumExp : public ReduceAggregator<T, TVAL> {
|
||||
protected:
|
||||
T max_;
|
||||
|
||||
public:
|
||||
inline ReduceAggregatorLogSumExp(int64_t N, const T&) : ReduceAggregator<T, TVAL>(N, 0) { max_ = this->accumulator_; }
|
||||
inline TVAL aggall(const T* from_data) {
|
||||
max_ = Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, this->N_).maxCoeff();
|
||||
for (int64_t i = 0; i < this->N_; ++i) {
|
||||
update(from_data[i]);
|
||||
}
|
||||
return get_value();
|
||||
}
|
||||
inline void update0(const T& v) { max_ = v > max_ ? v : max_; }
|
||||
inline void update(const T& v) { this->accumulator_ += reduce_exp(v - max_); }
|
||||
inline TVAL get_value() { return reduce_log<T>(this->accumulator_) + max_; }
|
||||
static inline bool two_loops() { return true; }
|
||||
};
|
||||
|
||||
bool SetupForReduce(const Tensor* input_tensor_ptr,
|
||||
const std::vector<int64_t>& axes_,
|
||||
std::vector<int64_t>& axes,
|
||||
TensorShape& new_input_shape,
|
||||
std::vector<int64_t>& output_shape,
|
||||
bool& empty_reduce,
|
||||
const TensorShape* input_shape_override);
|
||||
|
||||
void NoTransposePrepareForReduce(const TensorShape& new_input_shape,
|
||||
const std::vector<int64_t>& reduced_axes,
|
||||
ResultsNoTransposePrepareForReduce& results);
|
||||
|
||||
template <typename T, typename AGG>
|
||||
void NoTransposeReduce(Tensor* output, const TensorShape& new_input_shape, const Tensor& input,
|
||||
const std::vector<int64_t>& reduced_axes, concurrency::ThreadPool* tp,
|
||||
ResultsNoTransposePrepareForReduce& last_results);
|
||||
|
||||
template <typename T, typename AGG>
|
||||
void CommonReduce(OpKernelContext* ctx,
|
||||
const std::vector<int64_t> axes_, int64_t keepdims_,
|
||||
ResultsNoTransposePrepareForReduce& last_results,
|
||||
bool noop_with_empty_axes = false);
|
||||
|
||||
template <bool allow_multi_axes>
|
||||
class ReduceKernelBase {
|
||||
|
|
|
|||
|
|
@ -27,38 +27,10 @@ REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(double)
|
|||
REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(int32_t)
|
||||
REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(int64_t)
|
||||
|
||||
|
||||
template <typename T>
|
||||
Status ReduceSumTraining<T>::Compute(OpKernelContext* ctx) const {
|
||||
FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
|
||||
int64_t block_size;
|
||||
int64_t blocks;
|
||||
std::vector<int64_t> reduced_dims;
|
||||
const Tensor* input = ctx->Input<Tensor>(0);
|
||||
|
||||
//override the attribute value with the input value for reduction_axes
|
||||
const Tensor* axes_tensor = ctx->Input<Tensor>(1);
|
||||
ORT_ENFORCE(axes_tensor != nullptr, "Axes input is null");
|
||||
ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1,
|
||||
"An axes tensor must be a vector tensor.");
|
||||
auto nDims = static_cast<size_t>(axes_tensor->Shape()[0]);
|
||||
const auto* data = axes_tensor->template Data<int64_t>();
|
||||
std::vector<int64_t> axes(data, data + nDims);
|
||||
|
||||
// empty axes and no-op
|
||||
if (axes.empty() && noop_with_empty_axes_) {
|
||||
auto* output = ctx->Output(0, input->Shape());
|
||||
memcpy(output->template MutableData<T>(), input->template Data<T>(), input->SizeInBytes());
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes, keepdims_, reduced_dims, true);
|
||||
|
||||
auto* output = ctx->Output(0, reduced_dims);
|
||||
|
||||
ReduceSumCore(input->template Data<T>(), output->template MutableData<T>(),
|
||||
no_transpose, blocks, block_size, transposed_input_data, ctx->GetOperatorThreadPool());
|
||||
|
||||
ResultsNoTransposePrepareForReduce last_results;
|
||||
CommonReduce<T, ReduceAggregatorSum<T>>(ctx, axes_, keepdims_, last_results, noop_with_empty_axes_);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue