diff --git a/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc b/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc index e6355d9b46..4ffdf5c458 100644 --- a/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc @@ -151,23 +151,13 @@ REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ArgMin, 1, 10); REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ArgMin, 11, 12); REGISTER_UNARY_ELEMENTWISE_KERNEL(ArgMin, 13); -// When all reduce axes are located at the tail of the dims, quite general cases, transpose and extra -// copy could be skipped to improve performance. If required by check_no_transpose = true, then -// the calling code will check if the data was transposed and act accordingly. -// return value: true means transposedInputData is not created/copied, input tensor data could -// be directly used as row major matrix [block_size, blocks], where blocks is the -// size of each reduce. -// `input_shape_override` overrides the shape of `input` for compute purposes. -template -bool PrepareForReduce(const Tensor* input_tensor_ptr, - FastAllocVector& transposed_input_data, - int64_t& block_size, - int64_t& blocks, - const std::vector& axes_, - bool keepdims_, - /*out*/ std::vector& reduced_dims, - bool check_no_transpose, - const TensorShape* input_shape_override) { +bool SetupForReduce(const Tensor* input_tensor_ptr, + const std::vector& axes_, + std::vector& axes, + TensorShape& new_input_shape, + std::vector& output_shape, + bool& empty_reduce, + const TensorShape* input_shape_override) { ORT_ENFORCE(input_tensor_ptr != nullptr, "Input to be reduced is null"); if (input_shape_override) { @@ -175,25 +165,13 @@ bool PrepareForReduce(const Tensor* input_tensor_ptr, "The input shape override's size does not match the input tensor's shape size"); } - const Tensor& input = *input_tensor_ptr; - const auto& input_shape = input_shape_override ? *input_shape_override : input.Shape(); - - size_t ndim = input_shape.NumDimensions(); - - // Scalar tensor + new_input_shape = input_shape_override ? *input_shape_override : input_tensor_ptr->Shape(); + size_t ndim = new_input_shape.NumDimensions(); if (ndim == 0) { - if (!check_no_transpose) { - auto size = input_shape.Size(); - assert(size == 1); - transposed_input_data.resize(size, 0); - T* to_data = &transposed_input_data[0]; - *to_data = *input.Data(); - } - block_size = blocks = 1; - return true; + empty_reduce = true; + return false; } - std::vector axes; axes.reserve(axes_.size()); for (int64_t axis : axes_) { axes.push_back(HandleNegativeAxis(axis, static_cast(ndim))); @@ -210,598 +188,411 @@ bool PrepareForReduce(const Tensor* input_tensor_ptr, // If all reduced axes are located at the tail of the input shape, then copy could be skipped is required bool need_copy = true; - if (axes.size() <= ndim && + if (axes.size() <= ndim && ndim > 0 && axes.front() == static_cast(ndim - axes.size()) && axes.back() == static_cast(ndim) - 1) { need_copy = false; } - std::vector keep_axis(ndim, true); - for (auto i : axes) { - keep_axis[i] = false; + empty_reduce = false; + output_shape = new_input_shape.GetDims(); + for (auto a : axes) { + output_shape[a] = new_input_shape[a] > 0 ? 1 : 0; + empty_reduce |= output_shape[a] == 0; + } + return need_copy; +} +void NoTransposePrepareForReduce(const TensorShape& new_input_shape, + const std::vector& reduced_axes, + ResultsNoTransposePrepareForReduce& results) { + // Common initialisation for the indices. + std::vector cumulative_shape = new_input_shape.GetDims(); + cumulative_shape[cumulative_shape.size() - 1] = 1; + for (int i = static_cast(cumulative_shape.size()) - 2; i >= 0; --i) { + cumulative_shape[i] = cumulative_shape[i + 1] * new_input_shape[i + 1]; + } + int64_t projection_size = 1; + for (auto a : reduced_axes) { + projection_size *= new_input_shape[a]; } - //transpose the input so that all to-be-reduced axes are at the head - std::vector transposed_axes(axes.begin(), axes.end()); - for (size_t i = 0; i < ndim; ++i) { - if (keep_axis[i]) { - transposed_axes.push_back(i); - } - } - - std::vector new_dims(transposed_axes.size()); - for (size_t i = 0; i < transposed_axes.size(); ++i) { - new_dims[i] = input_shape.GetDims().at(transposed_axes[i]); - } - - int num_axes = static_cast(transposed_axes.size()); - auto in_dims = input_shape.GetDims(); - - // Measure amount of contiguous data we can copy at once - int64_t blocksize = 1; - int n_shared_idxs = 0; - for (int i = num_axes - 1; i >= 0; --i) { - if (transposed_axes[i] == i) { - blocksize *= new_dims[i]; - ++n_shared_idxs; - } else { + int last_reduced_axis = static_cast(reduced_axes.size()) - 1; + int loop_reduced_axis = 1; + results.last_loop_red_size = new_input_shape[reduced_axes[last_reduced_axis]]; + results.last_loop_red_inc = cumulative_shape[reduced_axes[last_reduced_axis]]; + projection_size /= new_input_shape[reduced_axes[last_reduced_axis]]; + --last_reduced_axis; + while (last_reduced_axis >= 0) { + if (reduced_axes[last_reduced_axis] != reduced_axes[last_reduced_axis + 1] - 1) break; - } + results.last_loop_red_size *= new_input_shape[reduced_axes[last_reduced_axis]]; + projection_size /= new_input_shape[reduced_axes[last_reduced_axis]]; + --last_reduced_axis; + ++loop_reduced_axis; } - const T* from_data = input.template Data(); - size_t count = input_shape.Size(); - - //set to-be-reduced axes to one. squeeze is keepdims_ is false - int64_t first_dim = 1; - reduced_dims.reserve(in_dims.size()); - - for (size_t i = 0; i < in_dims.size(); i++) { - const auto in_dim = in_dims[i]; - if (keep_axis[i]) { - reduced_dims.push_back(in_dim); - } else { - first_dim *= in_dim; - if (keepdims_) { - reduced_dims.push_back(in_dim == 0 ? 0 : 1); - } else { - // as we are reducing on this axis and not keeping a dim for it, we can't drop a dim value of 0. - // e.g. if input was {3, 0, 2} and we reduced on axis 1 without keeping it, the output shape would be - // {3, 2} which is invalid given the input was empty. - // note that if we do keep the dim the output shape will have a 0 in it, - // which is still valid for an empty tensor, so allow that. - ORT_ENFORCE(in_dim != 0, - "Can't reduce on dim with value of 0 if 'keepdims' is false. " - "Invalid output shape would be produced. input_shape:", - input_shape); - } - } - } - - auto num_elements = input_shape.Size(); - - // edge case. one or more input dims with value of 0. - if (num_elements == 0) { - block_size = blocks = 0; - return true; - } - - if (0 == first_dim) { - return false; - } - - block_size = num_elements / first_dim; - blocks = first_dim; - - if (!need_copy && check_no_transpose) { - return true; - } - - transposed_input_data.resize(input_shape.Size(), 0); - T* to_data = &transposed_input_data[0]; - if (num_axes < 2 || n_shared_idxs == num_axes) { - memcpy(to_data, from_data, count * sizeof(T)); - return false; - } - - int itr_axes = num_axes - n_shared_idxs; - - // Calculate strides - std::vector stride_x(itr_axes, 0); - for (size_t i = 0; static_cast(i) < itr_axes; i++) { - stride_x[i] = 1; - for (size_t j = transposed_axes[i] + 1; static_cast(j) < itr_axes; j++) { - stride_x[i] *= in_dims[j]; - } - } - - std::vector itr_idxs(itr_axes, 0); - - // Branch here to avoid branching within the loop - if (blocksize > 1) { - for (size_t index = 0; index < (count / blocksize); index++) { - int64_t from_index = 0; - for (int i = 0; i < itr_axes; ++i) { - from_index += stride_x[i] * itr_idxs[i]; - } - - memcpy( - to_data + blocksize * index, - from_data + blocksize * from_index, - blocksize * sizeof(T)); - - ++itr_idxs[itr_axes - 1]; - for (int i = itr_axes - 1; i >= 1; --i) { - auto expected_dim = new_dims[i]; - if (itr_idxs[i] < expected_dim) { - break; - } - itr_idxs[i] %= expected_dim; - ++itr_idxs[i - 1]; - } - } + // Builds the list of indices projected into the same sum. + int reduced_axes_size = static_cast(reduced_axes.size()) - loop_reduced_axis; + if (reduced_axes_size == 0) { + results.projected_index.resize(1, 0); } else { - for (size_t index = 0; index < count; index++) { - int64_t from_index = 0; - for (int i = 0; i < itr_axes; ++i) { - from_index += stride_x[i] * itr_idxs[i]; - } - - *(to_data + index) = *(from_data + from_index); - - ++itr_idxs[itr_axes - 1]; - for (int i = itr_axes - 1; i >= 1; --i) { - auto expected_dim = new_dims[i]; - if (itr_idxs[i] < expected_dim) { + results.projected_index.resize(projection_size); + std::vector projected_indices(reduced_axes_size, 0); + int64_t current_index = 0; + size_t current_pos = 0; + int j; + for (current_pos = 0; current_pos < results.projected_index.size(); ++current_pos) { + results.projected_index[current_pos] = current_index; + ++projected_indices[projected_indices.size() - 1]; + current_index += cumulative_shape[reduced_axes[reduced_axes_size - 1]]; + for (j = reduced_axes_size - 1; j > 0; --j) { + if (projected_indices[j] < new_input_shape[reduced_axes[j]]) break; - } - itr_idxs[i] %= expected_dim; - ++itr_idxs[i - 1]; + projected_indices[j] -= new_input_shape[reduced_axes[j]]; + current_index -= new_input_shape[reduced_axes[j]] * cumulative_shape[reduced_axes[j]]; + ++projected_indices[j - 1]; + current_index += cumulative_shape[reduced_axes[j - 1]]; } } } - return false; + + // Builds the list of indices for the unprojected sum. + std::vector unreduced_axes; + for (int64_t i = 0; i < static_cast(cumulative_shape.size()); ++i) { + if (std::find(reduced_axes.begin(), reduced_axes.end(), i) != reduced_axes.end()) + continue; + unreduced_axes.push_back(i); + } + int64_t unprojection_size = 1; + for (auto a : unreduced_axes) { + unprojection_size *= new_input_shape[a]; + } + if (unprojection_size == 0) { + return; + } + std::vector unprojected_indices(unreduced_axes.size(), 0); + + // The last index is usually an image size. + // We differently process the last unprojected dimension. + results.last_loop_size = new_input_shape[unreduced_axes[unreduced_axes.size() - 1]]; + int64_t unprojection_size_before_last = unprojection_size / results.last_loop_size; + results.unprojected_index.reserve(unprojection_size_before_last); + results.last_loop_inc = cumulative_shape[unreduced_axes[unreduced_axes.size() - 1]]; + if (unprojected_indices.size() <= 1) { + results.unprojected_index.push_back(0); + } else { + int64_t current_index = 0; + int j; + for (int64_t pos = 0; pos < unprojection_size_before_last; ++pos) { + results.unprojected_index.push_back(current_index); + ++unprojected_indices[unprojected_indices.size() - 2]; + current_index += cumulative_shape[unreduced_axes[unreduced_axes.size() - 2]]; + for (j = static_cast(unreduced_axes.size()) - 2; j > 0; --j) { + if (unprojected_indices[j] < new_input_shape[unreduced_axes[j]]) + break; + unprojected_indices[j] -= new_input_shape[unreduced_axes[j]]; + current_index -= new_input_shape[unreduced_axes[j]] * cumulative_shape[unreduced_axes[j]]; + ++unprojected_indices[j - 1]; + current_index += cumulative_shape[unreduced_axes[j - 1]]; + } + } + } +} + +template +void NoTransposeReduce(Tensor* output, const TensorShape& new_input_shape, const Tensor& input, + const std::vector& reduced_axes, concurrency::ThreadPool* tp, + ResultsNoTransposePrepareForReduce& last_results) { + auto output_shape = output->Shape(); + const T* from_data = input.template Data(); + typename AGG::value_type* to_data = output->template MutableData(); + int64_t count = output_shape.Size(); + + if (reduced_axes.size() == 0 || reduced_axes.size() == new_input_shape.NumDimensions()) { + ORT_ENFORCE(count == 1, "Reduction on all axes, output size should be 1."); + int64_t input_size = new_input_shape.Size(); + to_data[0] = AGG(input_size, from_data[0]).aggall(from_data); + return; + } + + if (!last_results.equal(new_input_shape.GetDims(), reduced_axes)) { + NoTransposePrepareForReduce(new_input_shape, reduced_axes, last_results); + if (last_results.last_loop_red_size == 0 || last_results.last_loop_size == 0) + return; + } + int64_t denominator = last_results.last_loop_red_size * last_results.projected_index.size(); + + if (AGG::two_loops()) { + auto fn = [&](std::ptrdiff_t first, std::ptrdiff_t end) { + int64_t loop; + const T* loop_red_ptr; + const T* loop_red_ptr_end; + int64_t current_index = first * last_results.last_loop_size; + for (int64_t main_index = first; main_index < end; ++main_index) { + for (loop = 0; loop < last_results.last_loop_size; ++loop, ++current_index) { + int64_t origin = last_results.unprojected_index[main_index] + loop * last_results.last_loop_inc; + AGG accumulator(denominator, from_data[origin + last_results.projected_index[0]]); + for (auto it = last_results.projected_index.begin(); it != last_results.projected_index.end(); ++it) { + loop_red_ptr = from_data + (origin + *it); + loop_red_ptr_end = loop_red_ptr + last_results.last_loop_red_size * last_results.last_loop_red_inc; + for (; loop_red_ptr != loop_red_ptr_end; loop_red_ptr += last_results.last_loop_red_inc) { + accumulator.update0(*loop_red_ptr); + } + } + for (auto it = last_results.projected_index.begin(); it != last_results.projected_index.end(); ++it) { + loop_red_ptr = from_data + (origin + *it); + loop_red_ptr_end = loop_red_ptr + last_results.last_loop_red_size * last_results.last_loop_red_inc; + for (; loop_red_ptr != loop_red_ptr_end; loop_red_ptr += last_results.last_loop_red_inc) { + accumulator.update(*loop_red_ptr); + } + } + to_data[current_index] = accumulator.get_value(); + } + } + }; + + auto cost = TensorOpCost{(double)(last_results.projected_index.size() * sizeof(T) * last_results.last_loop_size * last_results.last_loop_red_size), + (double)last_results.last_loop_size * last_results.last_loop_red_size, + (double)last_results.projected_index.size() * last_results.last_loop_size * last_results.last_loop_red_size * 2}; + concurrency::ThreadPool::TryParallelFor(tp, count / last_results.last_loop_size, cost, fn); + } else { + auto fn = [&](std::ptrdiff_t first, std::ptrdiff_t end) { + int64_t loop; + const T* loop_red_ptr; + const T* loop_red_ptr_end; + int64_t current_index = first * last_results.last_loop_size; + for (int64_t main_index = first; main_index < end; ++main_index) { + for (loop = 0; loop < last_results.last_loop_size; ++loop, ++current_index) { + int64_t origin = last_results.unprojected_index[main_index] + loop * last_results.last_loop_inc; + AGG accumulator(denominator, from_data[origin + last_results.projected_index[0]]); + for (auto it = last_results.projected_index.begin(); it != last_results.projected_index.end(); ++it) { + loop_red_ptr = from_data + (origin + *it); + loop_red_ptr_end = loop_red_ptr + last_results.last_loop_red_size * last_results.last_loop_red_inc; + for (; loop_red_ptr != loop_red_ptr_end; loop_red_ptr += last_results.last_loop_red_inc) { + accumulator.update(*loop_red_ptr); + } + } + to_data[current_index] = accumulator.get_value(); + } + } + }; + + auto cost = TensorOpCost{(double)(last_results.projected_index.size() * sizeof(T) * last_results.last_loop_size * last_results.last_loop_red_size), + (double)last_results.last_loop_size * last_results.last_loop_red_size, + (double)last_results.projected_index.size() * last_results.last_loop_size * last_results.last_loop_red_size}; + concurrency::ThreadPool::TryParallelFor(tp, count / last_results.last_loop_size, cost, fn); + } +} + +void DropDimensions(const std::vector& input_shape, const std::vector& axes, std::vector& dropped_axes) { + auto dropped_dims = input_shape; + for (auto i : axes) { + dropped_dims[i] = -1; + } + for (auto it = dropped_dims.begin(); it != dropped_dims.end(); ++it) { + if (*it != -1) { + dropped_axes.push_back(*it); + } + } +} + +template +void CommonReduce(OpKernelContext* ctx, + const std::vector axes_, int64_t keepdims_, + ResultsNoTransposePrepareForReduce& last_results, + bool noop_with_empty_axes) { + std::vector axes; + const Tensor* input = ctx->Input(0); + auto reduced_dims = input->Shape().GetDims(); + std::vector output_shape; + bool empty_reduce; + TensorShape new_input_shape; + + if (ctx->InputCount() == 2) { + // second input holds the axes. + const Tensor* axes_tensor = ctx->Input(1); + ORT_ENFORCE(axes_tensor != nullptr, "Axes input is null"); + ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1, + "An axes tensor must be a vector tensor."); + auto nDims = static_cast(axes_tensor->Shape()[0]); + const auto* data = axes_tensor->template Data(); + std::vector input_axes(data, data + nDims); + if (input_axes.empty() && noop_with_empty_axes) { + auto* output = ctx->Output(0, input->Shape()); + memcpy(output->template MutableData(), input->template Data(), input->SizeInBytes()); + return; + } + SetupForReduce(input, input_axes, axes, new_input_shape, output_shape, empty_reduce, nullptr); + } else { + SetupForReduce(input, axes_, axes, new_input_shape, output_shape, empty_reduce, nullptr); + } + + if (empty_reduce) { + Tensor* output = ctx->Output(0, keepdims_ ? output_shape : std::vector()); + if (new_input_shape.Size() == 1) { + const T* from_data = input->template Data(); + typename AGG::value_type* to_data = output->template MutableData(); + AGG agg(1, *from_data); + if (agg.two_loops()) { + agg.update0(*from_data); + agg.update(*from_data); + } else { + agg.update(*from_data); + } + *to_data = agg.get_value(); + } else { + ORT_ENFORCE(keepdims_, + "Can't reduce on dim with value of 0 if 'keepdims' is false. " + "Invalid output shape would be produced. input_shape:", + input->Shape()); + } + return; + } + + Tensor* output; + if (keepdims_) { + output = ctx->Output(0, output_shape); + } else { + std::vector dropped_axes; + DropDimensions(output_shape, axes, dropped_axes); + output = ctx->Output(0, dropped_axes); + } + NoTransposeReduce(output, new_input_shape, *input, axes, ctx->GetOperatorThreadPool(), last_results); } template Status ReduceL1::Compute(OpKernelContext* ctx) const { - FastAllocVector transposed_input_data(GetAllocator(*ctx)); - int64_t block_size; - int64_t blocks; - std::vector reduced_dims; - - const Tensor* input = ctx->Input(0); - - bool no_transpose = PrepareForReduce(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true); - - Tensor* reduced = ctx->Output(0, reduced_dims); - - T* output_data = reduced->template MutableData(); - - if (no_transpose) { - const T* input_data = input->template Data(); - - for (int64_t i = 0; i < block_size; ++i) { - output_data[i] = ConstEigenVectorMap(input_data + (i * blocks), blocks).cwiseAbs().sum(); - } - } else { - EigenVectorMap out_vec(output_data, block_size); - out_vec = ConstEigenMatrixMap(&transposed_input_data[0], block_size, blocks).cwiseAbs().rowwise().sum(); - } - + // The following variable does not change if the input tensor and the + // axes do not either. It could be either cached in ctx or precomputed + // in the constructor if shape and axes are known at this stage. + ResultsNoTransposePrepareForReduce last_results; + CommonReduce>(ctx, axes_, keepdims_, last_results); return Status::OK(); } template Status ReduceL2::Compute(OpKernelContext* ctx) const { - FastAllocVector transposed_input_data(GetAllocator(*ctx)); - int64_t block_size; - int64_t blocks; - std::vector reduced_dims; - - const Tensor* input = ctx->Input(0); - - bool no_transpose = PrepareForReduce(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true); - - Tensor* reduced = ctx->Output(0, reduced_dims); - - T* output_data = reduced->template MutableData(); - - if (no_transpose) { - const T* input_data = input->template Data(); - - for (int64_t i = 0; i < block_size; ++i) { - output_data[i] = ConstEigenVectorMap(input_data + (i * blocks), blocks).norm(); - } - } else { - EigenVectorMap out_vec(output_data, block_size); - out_vec = ConstEigenMatrixMap(&transposed_input_data[0], block_size, blocks).rowwise().norm(); - } - + ResultsNoTransposePrepareForReduce last_results; + CommonReduce>(ctx, axes_, keepdims_, last_results); return Status::OK(); } template Status ReduceLogSum::Compute(OpKernelContext* ctx) const { - FastAllocVector transposed_input_data(GetAllocator(*ctx)); - int64_t block_size; - int64_t blocks; - std::vector reduced_dims; - - const Tensor* input = ctx->Input(0); - - bool no_transpose = PrepareForReduce(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true); - - Tensor* reduced = ctx->Output(0, reduced_dims); - - T* output_data = reduced->template MutableData(); - - if (no_transpose) { - const T* input_data = input->template Data(); - - for (int64_t i = 0; i < block_size; ++i) { - output_data[i] = ConstEigenVectorMap(input_data + (i * blocks), blocks).sum(); - } - } else { - EigenVectorMap out_vec(output_data, block_size); - out_vec = ConstEigenMatrixMap(&transposed_input_data[0], block_size, blocks).rowwise().sum(); - } - - for (int j = 0; j < block_size; ++j) { - *(output_data) = static_cast(std::log(*(output_data))); - ++output_data; - } - + ResultsNoTransposePrepareForReduce last_results; + CommonReduce>(ctx, axes_, keepdims_, last_results); return Status::OK(); } template Status ReduceLogSumExp::Compute(OpKernelContext* ctx) const { - FastAllocVector transposed_input_data(GetAllocator(*ctx)); - int64_t block_size; - int64_t blocks; - std::vector reduced_dims; - const Tensor* input = ctx->Input(0); - - PrepareForReduce(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims); - - Tensor* reduced = ctx->Output(0, reduced_dims); - - T* output_data = reduced->template MutableData(); - - for (int j = 0; j < block_size; ++j) { - T max_value = std::numeric_limits::lowest(); - for (int i = 0; i < blocks; ++i) { - max_value = std::max(max_value, transposed_input_data[i * block_size + j]); - } - T scaled_exp_sum = 0; - for (int i = 0; i < blocks; ++i) { - scaled_exp_sum += static_cast(std::exp(transposed_input_data[i * block_size + j] - max_value)); - } - *(output_data++) = static_cast(std::log(scaled_exp_sum) + max_value); - } + ResultsNoTransposePrepareForReduce last_results; + CommonReduce>(ctx, axes_, keepdims_, last_results); return Status::OK(); } template Status ReduceMax::Compute(OpKernelContext* ctx) const { - FastAllocVector transposed_input_data(GetAllocator(*ctx)); - int64_t block_size; - int64_t blocks; - std::vector reduced_dims; - const Tensor* input = ctx->Input(0); - - bool no_transpose = PrepareForReduce(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true); - - Tensor* reduced = ctx->Output(0, reduced_dims); - - T* output_data = reduced->template MutableData(); - - if (no_transpose) { - const T* input_data = input->template Data(); - - for (int64_t i = 0; i < block_size; ++i) { - output_data[i] = ConstEigenVectorMap(input_data + (i * blocks), blocks).maxCoeff(); - } - } else { - EigenVectorMap out_vec(output_data, block_size); - out_vec = ConstEigenMatrixMap(&transposed_input_data[0], block_size, blocks).rowwise().maxCoeff(); - } - + ResultsNoTransposePrepareForReduce last_results; + CommonReduce>(ctx, axes_, keepdims_, last_results); return Status::OK(); } template Status ReduceMean::Compute(OpKernelContext* ctx) const { - FastAllocVector transposed_input_data(GetAllocator(*ctx)); - int64_t block_size; - int64_t blocks; - std::vector reduced_dims; - const Tensor* input = ctx->Input(0); - - bool no_transpose = PrepareForReduce(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true); - - Tensor* reduced = ctx->Output(0, reduced_dims); - - T* output_data = reduced->template MutableData(); - - if (no_transpose) { - const T* input_data = ctx->Input(0)->template Data(); - auto lambda = [input_data, blocks, output_data](ptrdiff_t i) { - output_data[i] = ConstEigenVectorMap(input_data + (i * blocks), blocks).mean(); - }; - concurrency::ThreadPool::TryBatchParallelFor(ctx->GetOperatorThreadPool(), block_size, lambda, 0); - } else { - EigenVectorMap out_vec(output_data, block_size); - out_vec = ConstEigenMatrixMap(&transposed_input_data[0], block_size, blocks).rowwise().mean(); - } - + ResultsNoTransposePrepareForReduce last_results; + CommonReduce>(ctx, axes_, keepdims_, last_results); return Status::OK(); } template Status ReduceMin::Compute(OpKernelContext* ctx) const { - FastAllocVector transposed_input_data(GetAllocator(*ctx)); - int64_t block_size; - int64_t blocks; - std::vector reduced_dims; - const Tensor* input = ctx->Input(0); - - bool no_transpose = PrepareForReduce(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true); - - Tensor* reduced = ctx->Output(0, reduced_dims); - - T* output_data = reduced->template MutableData(); - - if (no_transpose) { - const T* input_data = input->template Data(); - - for (int64_t i = 0; i < block_size; ++i) { - output_data[i] = ConstEigenVectorMap(input_data + (i * blocks), blocks).minCoeff(); - } - } else { - EigenVectorMap out_vec(output_data, block_size); - out_vec = ConstEigenMatrixMap(&transposed_input_data[0], block_size, blocks).rowwise().minCoeff(); - } - + ResultsNoTransposePrepareForReduce last_results; + CommonReduce>(ctx, axes_, keepdims_, last_results); return Status::OK(); } template Status ReduceProd::Compute(OpKernelContext* ctx) const { - FastAllocVector transposed_input_data(GetAllocator(*ctx)); - int64_t block_size; - int64_t blocks; - std::vector reduced_dims; - const Tensor* input = ctx->Input(0); - - bool no_transpose = PrepareForReduce(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true); - - Tensor* reduced = ctx->Output(0, reduced_dims); - - T* output_data = reduced->template MutableData(); - - if (no_transpose) { - const T* input_data = input->template Data(); - - for (int64_t i = 0; i < block_size; ++i) { - output_data[i] = ConstEigenVectorMap(input_data + (i * blocks), blocks).prod(); - } - } else { - EigenVectorMap out_vec(output_data, block_size); - out_vec = ConstEigenMatrixMap(&transposed_input_data[0], block_size, blocks).rowwise().prod(); - } - + ResultsNoTransposePrepareForReduce last_results; + CommonReduce>(ctx, axes_, keepdims_, last_results); return Status::OK(); } template -void ReduceSumCore(const T* input_data, T* output_data, bool no_transpose, - int64_t blocks, int64_t block_size, FastAllocVector& transposed_input_data, - concurrency::ThreadPool* tp) { - if (no_transpose) { - auto lambda = [input_data, blocks, output_data](ptrdiff_t i) { - // The ConstEigenMatrixMap type is expanded to work around a MS compiler issue - output_data[i] = Eigen::Map>(input_data + (i * blocks), blocks).sum(); - }; - concurrency::ThreadPool::TryBatchParallelFor(tp, block_size, lambda, 0); - } else { - EigenVectorMap out_vec(output_data, block_size); - out_vec = ConstEigenMatrixMap(&transposed_input_data[0], block_size, blocks).rowwise().sum(); - } +Status ReduceSum::Compute(OpKernelContext* ctx) const { + ResultsNoTransposePrepareForReduce last_results; + CommonReduce>(ctx, axes_, keepdims_, last_results); + return Status::OK(); } template Tensor ReduceSum::Impl(const Tensor& input, const std::vector& reduce_axes, AllocatorPtr allocator, concurrency::ThreadPool* tp, bool keep_dims, const TensorShape* input_shape_override) { - FastAllocVector transposed_input_data(allocator); - int64_t block_size; - int64_t blocks; - std::vector reduced_dims; + std::vector axes; + auto reduced_dims = input.Shape().GetDims(); + std::vector output_shape; + TensorShape new_input_shape; + bool empty_reduce; + SetupForReduce(&input, reduce_axes, axes, new_input_shape, output_shape, empty_reduce, input_shape_override); - bool no_transpose = PrepareForReduce(&input, transposed_input_data, block_size, blocks, - reduce_axes, keep_dims, reduced_dims, true, input_shape_override); + if (empty_reduce) { + Tensor output(input.DataType(), keep_dims ? output_shape : std::vector(), allocator); + if (new_input_shape.Size() == 1) { + const T* from_data = input.template Data(); + T* to_data = output.template MutableData(); + *to_data = *from_data; + } else { + ORT_ENFORCE(keep_dims, + "Can't reduce on dim with value of 0 if 'keepdims' is false. " + "Invalid output shape would be produced. input_shape:", + new_input_shape); + } + return output; + } - Tensor output(input.DataType(), reduced_dims, allocator); - - ReduceSumCore(input.template Data(), output.template MutableData(), - no_transpose, blocks, block_size, transposed_input_data, tp); - - return output; -} - -template -Status ReduceSum::Compute(OpKernelContext* ctx) const { - FastAllocVector transposed_input_data(GetAllocator(*ctx)); - int64_t block_size; - int64_t blocks; - std::vector reduced_dims; - const Tensor* input = ctx->Input(0); - - bool no_transpose = PrepareForReduce(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true); - - auto* output = ctx->Output(0, reduced_dims); - - ReduceSumCore(input->template Data(), output->template MutableData(), - no_transpose, blocks, block_size, transposed_input_data, ctx->GetOperatorThreadPool()); - - return Status::OK(); + if (keep_dims) { + ResultsNoTransposePrepareForReduce last_results; + Tensor output(input.DataType(), output_shape, allocator); + NoTransposeReduce>(&output, new_input_shape, input, axes, tp, last_results); + return output; + } else { + ResultsNoTransposePrepareForReduce last_results; + std::vector dropped_axes; + DropDimensions(output_shape, axes, dropped_axes); + Tensor output(input.DataType(), dropped_axes, allocator); + NoTransposeReduce>(&output, new_input_shape, input, axes, tp, last_results); + return output; + } } template Status ReduceSumSquare::Compute(OpKernelContext* ctx) const { - FastAllocVector transposed_input_data(GetAllocator(*ctx)); - int64_t block_size; - int64_t blocks; - std::vector reduced_dims; - const Tensor* input = ctx->Input(0); - - bool no_transpose = PrepareForReduce(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true); - - Tensor* reduced = ctx->Output(0, reduced_dims); - - T* output_data = reduced->template MutableData(); - - if (no_transpose) { - const T* input_data = input->template Data(); - - for (int64_t i = 0; i < block_size; ++i) { - output_data[i] = ConstEigenVectorMap(input_data + (i * blocks), blocks).squaredNorm(); - } - } else { - EigenVectorMap out_vec(output_data, block_size); - out_vec = ConstEigenMatrixMap(&transposed_input_data[0], block_size, blocks).rowwise().squaredNorm(); - } - + ResultsNoTransposePrepareForReduce last_results; + CommonReduce>(ctx, axes_, keepdims_, last_results); return Status::OK(); } template Status ArgMax::Compute(OpKernelContext* ctx) const { - FastAllocVector transposed_input_data(GetAllocator(*ctx)); - int64_t block_size; - int64_t blocks; - - std::vector reduced_dims; - const Tensor* input = ctx->Input(0); - - bool no_transpose = PrepareForReduce(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true); - - Tensor* reduced = ctx->Output(0, reduced_dims); - int64_t* output_data = reduced->template MutableData(); - Eigen::MatrixXf::Index maxIndex; - - if (no_transpose) { - const T* input_data = ctx->Input(0)->template Data(); - if (select_last_index_) { - assert(blocks > 0); - for (int64_t i = 0; i < block_size; ++i) { - gsl::span row(input_data, blocks); - auto first = row.cbegin(); - auto const end = row.cend(); - auto max_el = first; - while (++first < end) { - if (*first >= *max_el) { - max_el = first; - } - } - *(output_data++) = max_el - row.cbegin(); - input_data += blocks; - } - } else { - for (int64_t i = 0; i < block_size; ++i) { - ConstEigenVectorMap(input_data + (i * blocks), blocks).maxCoeff(&maxIndex); - *(output_data++) = maxIndex; - } - } + ResultsNoTransposePrepareForReduce last_results; + if (select_last_index_) { + CommonReduce>(ctx, axes_, keepdims_, last_results); } else { - auto matrixData = ConstEigenMatrixMap(&transposed_input_data[0], block_size, blocks); - if (select_last_index_) { - for (int i = 0; i < block_size; ++i) { - int idx = 0; - T max_val = matrixData(i, 0); - for (int c = 1; c < blocks; ++c) { - auto val = matrixData(i, c); - if (val >= max_val) { - idx = c; - max_val = val; - } - } - *(output_data++) = idx; - } - } else { - for (int i = 0; i < block_size; ++i) { - matrixData.row(i).maxCoeff(&maxIndex); - *(output_data++) = maxIndex; - } - } + CommonReduce>(ctx, axes_, keepdims_, last_results); } - return Status::OK(); } template Status ArgMin::Compute(OpKernelContext* ctx) const { - FastAllocVector transposed_input_data(GetAllocator(*ctx)); - int64_t block_size; - int64_t blocks; - - std::vector reduced_dims; - const Tensor* input = ctx->Input(0); - - bool no_transpose = PrepareForReduce(input, transposed_input_data, block_size, blocks, axes_, keepdims_, reduced_dims, true); - - Tensor* reduced = ctx->Output(0, reduced_dims); - int64_t* output_data = reduced->template MutableData(); - Eigen::MatrixXf::Index minIndex; - - if (no_transpose) { - const T* input_data = ctx->Input(0)->template Data(); - if (select_last_index_) { - assert(blocks > 0); - for (int64_t i = 0; i < block_size; ++i) { - gsl::span row(input_data, blocks); - auto first = row.cbegin(); - auto const end = row.cend(); - auto min_el = first; - while (++first < end) { - if (*first <= *min_el) { - min_el = first; - } - } - *(output_data++) = min_el - row.cbegin(); - input_data += blocks; - } - } else { - for (int64_t i = 0; i < block_size; ++i) { - ConstEigenVectorMap(input_data + (i * blocks), blocks).minCoeff(&minIndex); - *(output_data++) = minIndex; - } - } + ResultsNoTransposePrepareForReduce last_results; + if (select_last_index_) { + CommonReduce>(ctx, axes_, keepdims_, last_results); } else { - auto matrixData = ConstEigenMatrixMap(&transposed_input_data[0], block_size, blocks); - if (select_last_index_) { - for (int i = 0; i < block_size; ++i) { - int idx = 0; - T min_val = matrixData(i, 0); - for (int c = 1; c < blocks; ++c) { - auto val = matrixData(i, c); - if (val <= min_val) { - idx = c; - min_val = val; - } - } - *(output_data++) = idx; - } - } else { - for (int i = 0; i < block_size; ++i) { - matrixData.row(i).minCoeff(&minIndex); - *(output_data++) = minIndex; - } - } + CommonReduce>(ctx, axes_, keepdims_, last_results); } - return Status::OK(); } @@ -814,14 +605,4 @@ template class ReduceSum; template class ReduceSum; template class ReduceSum; -#define REGISTER_REDUCESUMCORE_TYPED(T) \ - template void ReduceSumCore(const T* input_data, T* output_data, bool no_transpose, \ - int64_t blocks, int64_t block_size, FastAllocVector& transposed_input_data, \ - concurrency::ThreadPool* tp); - -REGISTER_REDUCESUMCORE_TYPED(float) -REGISTER_REDUCESUMCORE_TYPED(double) -REGISTER_REDUCESUMCORE_TYPED(int32_t) -REGISTER_REDUCESUMCORE_TYPED(int64_t) - } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/reduction/reduction_ops.h b/onnxruntime/core/providers/cpu/reduction/reduction_ops.h index 9ef1b43a40..0f5366cfc0 100644 --- a/onnxruntime/core/providers/cpu/reduction/reduction_ops.h +++ b/onnxruntime/core/providers/cpu/reduction/reduction_ops.h @@ -10,24 +10,305 @@ #include "core/providers/cpu/containers.h" #include "core/util/math_cpuonly.h" #include "core/platform/threadpool.h" +#include namespace onnxruntime { -template -bool PrepareForReduce(const Tensor* input_tensor_ptr, - FastAllocVector& transposed_input_data, - int64_t& block_size, - int64_t& blocks, - const std::vector& axes_, - bool keepdims_, - /*out*/ std::vector& reduced_dims, - bool check_no_transpose = false, - const TensorShape* input_shape_override = nullptr); +class ResultsNoTransposePrepareForReduce { + public: + std::vector input_shape; + std::vector reduced_axes; + std::vector projected_index; + int64_t last_loop_red_size; + int64_t last_loop_red_inc; + std::vector unprojected_index; + int64_t last_loop_size; + int64_t last_loop_inc; + bool equal(const std::vector& local_input_shape, const std::vector& local_reduced_axes) { + if (input_shape.size() != local_input_shape.size()) + return false; + if (reduced_axes.size() != local_reduced_axes.size()) + return false; + for (std::vector::const_iterator it1 = input_shape.begin(), it2 = local_input_shape.begin(); + it1 != input_shape.end(); ++it1, ++it2) { + if (*it1 != *it2) + return false; + } + for (std::vector::const_iterator it1 = reduced_axes.begin(), it2 = local_reduced_axes.begin(); + it1 != reduced_axes.end(); ++it1, ++it2) { + if (*it1 != *it2) + return false; + } + return true; + } +}; template -void ReduceSumCore(const T* input_data, T* output_data, bool no_transpose, - int64_t blocks, int64_t block_size, FastAllocVector& transposed_input_data, - concurrency::ThreadPool* tp); +inline T reduce_sqrt(T value) { return std::sqrt(value); } + +template <> +inline int64_t reduce_sqrt(int64_t value) { return static_cast(std::sqrt(static_cast(value))); } + +template <> +inline int32_t reduce_sqrt(int32_t value) { return static_cast(std::sqrt(static_cast(value))); } + +template +inline T reduce_log(T value) { return static_cast(std::log(value)); } + +template <> +inline int64_t reduce_log(int64_t value) { return static_cast(std::log(static_cast(value))); } + +template <> +inline int32_t reduce_log(int32_t value) { return static_cast(std::log(static_cast(value))); } + +template +inline T reduce_exp(T value) { return static_cast(std::exp(value)); } + +template +class ReduceAggregator { + public: + typedef TVAL value_type; + + protected: + int64_t N_; + T accumulator_; + + public: + inline ReduceAggregator(int64_t N, const T& init) { + N_ = N; + accumulator_ = init; + } + inline void update(const T&) { ORT_ENFORCE(false, "must be overloaded."); } + inline void update0(const T&) { ORT_ENFORCE(false, "must be overloaded."); } + inline TVAL aggall(const T*) { ORT_ENFORCE(false, "must be overloaded."); } + inline TVAL get_value() { return accumulator_; } + inline void enforce(const ResultsNoTransposePrepareForReduce&) {} + static inline bool two_loops() { return false; } +}; + +template +class ReduceAggregatorSum : public ReduceAggregator { + public: + inline ReduceAggregatorSum(int64_t N, const T&) : ReduceAggregator(N, 0) {} + inline void update(const T& v) { this->accumulator_ += v; } + inline TVAL aggall(const T* from_data) { + return Eigen::Map>(from_data, this->N_).sum(); + } +}; + +template +class ReduceAggregatorSumSquare : public ReduceAggregator { + public: + inline ReduceAggregatorSumSquare(int64_t N, const T&) : ReduceAggregator(N, 0) {} + inline TVAL aggall(const T* from_data) { + return Eigen::Map>(from_data, this->N_).squaredNorm(); + } + inline void update(const T& v) { this->accumulator_ += v * v; } +}; + +template +class ReduceAggregatorMean : public ReduceAggregatorSum { + public: + inline ReduceAggregatorMean(int64_t N, const T&) : ReduceAggregatorSum(N, 0) {} + inline T aggall(const T* from_data) { + return Eigen::Map>(from_data, this->N_).mean(); + } + inline T get_value() { return this->accumulator_ / static_cast(this->N_); } +}; + +template +class ReduceAggregatorMax : public ReduceAggregator { + public: + inline ReduceAggregatorMax(int64_t N, const T& init) : ReduceAggregator(N, init) {} + inline TVAL aggall(const T* from_data) { + return Eigen::Map>(from_data, this->N_).maxCoeff(); + } + inline void update(const T& v) { this->accumulator_ = v > this->accumulator_ ? v : this->accumulator_; } +}; + +template +class ReduceAggregatorArgMinMax : public ReduceAggregator { + protected: + int64_t arg_; + int64_t index_; + + public: + inline ReduceAggregatorArgMinMax(int64_t N, const T& init) : ReduceAggregator(N, init) { + arg_ = 0; + index_ = 0; + } + inline TVAL get_value() { return arg_; } + inline void enforce(const ResultsNoTransposePrepareForReduce& res) { + ORT_ENFORCE(res.projected_index.size() == 0, "Only one axis is allowed for reduction."); + } +}; + +template +class ReduceAggregatorArgMax : public ReduceAggregatorArgMinMax { + public: + inline ReduceAggregatorArgMax(int64_t N, const T& init) : ReduceAggregatorArgMinMax(N, init) {} + inline TVAL aggall(const T* from_data) { + Eigen::Map>(from_data, this->N_).maxCoeff(&this->arg_); + return this->get_value(); + } + inline void update(const T& v) { + if (v > this->accumulator_) { + this->accumulator_ = v; + this->arg_ = this->index_; + } + ++this->index_; + } +}; + +template +class ReduceAggregatorArgMaxLastIndex : public ReduceAggregatorArgMax { + public: + inline ReduceAggregatorArgMaxLastIndex(int64_t N, const T& init) : ReduceAggregatorArgMax(N, init) {} + inline TVAL aggall(const T* from_data) { + for (int64_t i = 0; i < this->N_; ++i) { + update(from_data[i]); + } + return this->get_value(); + } + inline void update(const T& v) { + if (v >= this->accumulator_) { + this->accumulator_ = v; + this->arg_ = this->index_; + } + ++this->index_; + } +}; + +template +class ReduceAggregatorArgMin : public ReduceAggregatorArgMinMax { + public: + inline ReduceAggregatorArgMin(int64_t N, const T& init) : ReduceAggregatorArgMinMax(N, init) {} + inline TVAL aggall(const T* from_data) { + Eigen::Map>(from_data, this->N_).minCoeff(&this->arg_); + return this->get_value(); + } + inline void update(const T& v) { + if (v < this->accumulator_) { + this->accumulator_ = v; + this->arg_ = this->index_; + } + ++this->index_; + } +}; + +template +class ReduceAggregatorArgMinLastIndex : public ReduceAggregatorArgMin { + public: + inline ReduceAggregatorArgMinLastIndex(int64_t N, const T& init) : ReduceAggregatorArgMin(N, init) {} + inline TVAL aggall(const T* from_data) { + for (int64_t i = 0; i < this->N_; ++i) { + update(from_data[i]); + } + return this->get_value(); + } + inline void update(const T& v) { + if (v <= this->accumulator_) { + this->accumulator_ = v; + this->arg_ = this->index_; + } + ++this->index_; + } +}; + +template +class ReduceAggregatorMin : public ReduceAggregator { + public: + inline ReduceAggregatorMin(int64_t N, const T& init) : ReduceAggregator(N, init) {} + inline TVAL aggall(const T* from_data) { + return Eigen::Map>(from_data, this->N_).minCoeff(); + } + inline void update(const T& v) { this->accumulator_ = v < this->accumulator_ ? v : this->accumulator_; } +}; + +template +class ReduceAggregatorProd : public ReduceAggregator { + public: + inline ReduceAggregatorProd(int64_t N, const T&) : ReduceAggregator(N, 1) {} + inline TVAL aggall(const T* from_data) { + return Eigen::Map>(from_data, this->N_).prod(); + } + inline void update(const T& v) { this->accumulator_ *= v; } +}; + +template +class ReduceAggregatorL1 : public ReduceAggregator { + public: + inline ReduceAggregatorL1(int64_t N, const T&) : ReduceAggregator(N, 0) {} + inline TVAL aggall(const T* from_data) { + return Eigen::Map>(from_data, this->N_).cwiseAbs().sum(); + } + inline void update(const T& v) { this->accumulator_ += v > 0 ? v : -v; } +}; + +template +class ReduceAggregatorL2 : public ReduceAggregator { + public: + inline ReduceAggregatorL2(int64_t N, const T&) : ReduceAggregator(N, 0) {} + inline TVAL aggall(const T* from_data) { + return Eigen::Map>(from_data, this->N_).norm(); + } + inline void update(const T& v) { this->accumulator_ += v * v; } + inline TVAL get_value() { return reduce_sqrt(this->accumulator_); } +}; + +template +class ReduceAggregatorLogSum : public ReduceAggregator { + public: + inline ReduceAggregatorLogSum(int64_t N, const T&) : ReduceAggregator(N, 0) {} + inline T aggall(const T* from_data) { + return reduce_log(Eigen::Map>(from_data, this->N_).sum()); + } + inline void update(const T& v) { this->accumulator_ += v; } + inline TVAL get_value() { return reduce_log(this->accumulator_); } +}; + +template +class ReduceAggregatorLogSumExp : public ReduceAggregator { + protected: + T max_; + + public: + inline ReduceAggregatorLogSumExp(int64_t N, const T&) : ReduceAggregator(N, 0) { max_ = this->accumulator_; } + inline TVAL aggall(const T* from_data) { + max_ = Eigen::Map>(from_data, this->N_).maxCoeff(); + for (int64_t i = 0; i < this->N_; ++i) { + update(from_data[i]); + } + return get_value(); + } + inline void update0(const T& v) { max_ = v > max_ ? v : max_; } + inline void update(const T& v) { this->accumulator_ += reduce_exp(v - max_); } + inline TVAL get_value() { return reduce_log(this->accumulator_) + max_; } + static inline bool two_loops() { return true; } +}; + +bool SetupForReduce(const Tensor* input_tensor_ptr, + const std::vector& axes_, + std::vector& axes, + TensorShape& new_input_shape, + std::vector& output_shape, + bool& empty_reduce, + const TensorShape* input_shape_override); + +void NoTransposePrepareForReduce(const TensorShape& new_input_shape, + const std::vector& reduced_axes, + ResultsNoTransposePrepareForReduce& results); + +template +void NoTransposeReduce(Tensor* output, const TensorShape& new_input_shape, const Tensor& input, + const std::vector& reduced_axes, concurrency::ThreadPool* tp, + ResultsNoTransposePrepareForReduce& last_results); + +template +void CommonReduce(OpKernelContext* ctx, + const std::vector axes_, int64_t keepdims_, + ResultsNoTransposePrepareForReduce& last_results, + bool noop_with_empty_axes = false); template class ReduceKernelBase { diff --git a/orttraining/orttraining/training_ops/cpu/reduction/reduction_ops.cc b/orttraining/orttraining/training_ops/cpu/reduction/reduction_ops.cc index 0732a43441..400f7688c1 100644 --- a/orttraining/orttraining/training_ops/cpu/reduction/reduction_ops.cc +++ b/orttraining/orttraining/training_ops/cpu/reduction/reduction_ops.cc @@ -27,38 +27,10 @@ REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(double) REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(int32_t) REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(int64_t) - template Status ReduceSumTraining::Compute(OpKernelContext* ctx) const { - FastAllocVector transposed_input_data(GetAllocator(*ctx)); - int64_t block_size; - int64_t blocks; - std::vector reduced_dims; - const Tensor* input = ctx->Input(0); - - //override the attribute value with the input value for reduction_axes - const Tensor* axes_tensor = ctx->Input(1); - ORT_ENFORCE(axes_tensor != nullptr, "Axes input is null"); - ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1, - "An axes tensor must be a vector tensor."); - auto nDims = static_cast(axes_tensor->Shape()[0]); - const auto* data = axes_tensor->template Data(); - std::vector axes(data, data + nDims); - - // empty axes and no-op - if (axes.empty() && noop_with_empty_axes_) { - auto* output = ctx->Output(0, input->Shape()); - memcpy(output->template MutableData(), input->template Data(), input->SizeInBytes()); - return Status::OK(); - } - - bool no_transpose = PrepareForReduce(input, transposed_input_data, block_size, blocks, axes, keepdims_, reduced_dims, true); - - auto* output = ctx->Output(0, reduced_dims); - - ReduceSumCore(input->template Data(), output->template MutableData(), - no_transpose, blocks, block_size, transposed_input_data, ctx->GetOperatorThreadPool()); - + ResultsNoTransposePrepareForReduce last_results; + CommonReduce>(ctx, axes_, keepdims_, last_results, noop_with_empty_axes_); return Status::OK(); }