From 275eb404bfa3b6f87692ff54a88d65613eeda0c8 Mon Sep 17 00:00:00 2001 From: Atanas Dimitrov <70822030+neNasko1@users.noreply.github.com> Date: Wed, 18 Sep 2024 01:53:07 +0300 Subject: [PATCH] Speedup `CumSum` for large arrays (#22048) ### Description This PR refactors the `CPU` kernel for the `CumSum` operator. The new implementation strives to have as little indirection as possible. ### Motivation and Context Currently the `CumSum` operator perform very poorly in the case of 1D tensors(it was slower than a python loop). This is caused by the extensive use of the `SliceIterator`-s. Here is a relevant snippet: ```python import time import ndonnx as ndx import onnxruntime as ort import numpy as np import onnx def test_cumsum(sz): a = ndx.array(shape=(sz,), dtype=ndx.int64) b = ndx.cumsum(a) model = ndx.build({'a': a}, {'b': b}) onnx.save(model, "model.onnx") input = np.ones(sz, np.int64) start = time.time() result = ort.InferenceSession(model.SerializeToString()).run(None, {'a': input}) end = time.time() return end - start def test_cumsum_by_hand(sz): input = np.ones(sz, np.int64) start = time.time() answer = [0] for i in input: answer.append(answer[-1] + i) end = time.time() return end - start print(test_cumsum(int(1e7))) print(test_cumsum_by_hand(int(1e7))) ``` Before ```console 0.9794480800628662 0.4518160820007324 ``` After ```console 0.02483987808227539 0.5496008396148682 ``` The `model.onnx`: image The flame graph: ![profile-3](https://github.com/user-attachments/assets/c7418a05-cb65-4d72-a76d-6a6b05b4ba4d) --- onnxruntime/core/providers/cpu/math/cumsum.cc | 169 ++++++++---------- .../test/providers/cpu/math/cumsum_test.cc | 10 ++ 2 files changed, 86 insertions(+), 93 deletions(-) diff --git a/onnxruntime/core/providers/cpu/math/cumsum.cc b/onnxruntime/core/providers/cpu/math/cumsum.cc index 083cdfbbaf..8321b81021 100644 --- a/onnxruntime/core/providers/cpu/math/cumsum.cc +++ b/onnxruntime/core/providers/cpu/math/cumsum.cc @@ -1,6 +1,8 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include + #include "cumsum.h" #include "core/providers/common.h" #include "core/providers/cpu/tensor/utils.h" @@ -9,46 +11,6 @@ using namespace onnxruntime; -namespace { -// static section -std::vector GetStarts(int64_t rank, int64_t axis, int64_t index) { - std::vector starts(onnxruntime::narrow(rank), 0); - starts[onnxruntime::narrow(axis)] = index; - return starts; -} -template -void ZeroOutSliceAtIndex(Tensor& output, int64_t rank, int64_t axis, int64_t index, - gsl::span slice_dims, const std::vector& steps, const int64_t slice_size) { - T zero{}; - auto output_starts(GetStarts(rank, axis, index)); - WritableSliceIterator output_iterator(output, output_starts, slice_dims, steps); - for (int64_t k = 0; k < slice_size; ++k, ++output_iterator) { - *output_iterator = zero; - } -} -template -void CopySlices(const Tensor& input, Tensor& output, - const std::vector& input_starts, const std::vector& output_starts, - gsl::span slice_dims, const std::vector& steps, const int64_t slice_size) { - SliceIterator input_iterator(input, input_starts, slice_dims, steps); - WritableSliceIterator output_iterator(output, output_starts, slice_dims, steps); - for (int64_t k = 0; k < slice_size; ++k, ++output_iterator, ++input_iterator) { - *output_iterator = *input_iterator; - } -} -template -void SumSlices(const Tensor& input, Tensor& output, - const std::vector& input_starts, const std::vector& output_starts, const std::vector& previous_output_starts, - gsl::span slice_dims, const std::vector& steps, const int64_t slice_size) { - SliceIterator input_iterator(input, input_starts, slice_dims, steps); - WritableSliceIterator output_iterator(output, output_starts, slice_dims, steps); - SliceIterator previous_output_iterator(output, previous_output_starts, slice_dims, steps); - for (int64_t k = 0; k < slice_size; ++k, ++output_iterator, ++input_iterator, ++previous_output_iterator) { - *output_iterator = *input_iterator + *previous_output_iterator; - } -} -} // namespace - namespace onnxruntime { namespace cumsum_op { @@ -183,8 +145,8 @@ CumSum::CumSum(const OpKernelInfo& info) : OpKernel(info), exclusive_(), reve template Status CumSum::Compute(OpKernelContext* ctx) const { - const Tensor* input = ctx->Input(0); // input tensor - auto rank = static_cast(input->Shape().NumDimensions()); // the rank of the input/output + const Tensor* input = ctx->Input(0); // input tensor + size_t rank = input->Shape().NumDimensions(); // the rank of the input/output if (rank == 0) return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Cannot apply CumSum operator on a scalar"); @@ -197,65 +159,86 @@ Status CumSum::Compute(OpKernelContext* ctx) const { if (output_shape.Size() == 0) return Status::OK(); - int64_t axis = 0; - ORT_THROW_IF_ERROR(cumsum_op::GetAxis(axis_tensor, rank, axis)); + int64_t axis_input = 0; + ORT_THROW_IF_ERROR(cumsum_op::GetAxis(axis_tensor, rank, axis_input)); - auto dim(output_tensor.Shape()[onnxruntime::narrow(axis)]); // dimension size for the axis - TensorShape slice_shape(input->Shape()); // the shape of one slice of input/output for the given value of the axis - slice_shape[onnxruntime::narrow(axis)] = 1; - auto slice_size(slice_shape.Size()); // total number of elements in each slice - auto slice_dims(slice_shape.GetDims()); // dim array for the slice + // we solve the problem by using the identity that(in the case of exclusive) + // 1) out[upper_dims...][0][lower_dims...] = 0 + // 2) out[upper_dims...][i][lower_dims...] = + // in[upper_dims...][i-1][lower_dims...] + out[upper_dims...][i-1][lower_dims...] + // we loop through the [upper_dims...] and start applying the identity in each slice + // since the [lower_dims...] are adjecent in memory, so we can add them like vectors - std::vector steps(onnxruntime::narrow(rank), 1); // steps for the slice -- always set to 1 + const auto input_shape = input->Shape().GetDims(); + const size_t axis = onnxruntime::narrow(axis_input); + const int64_t dim = input->Shape()[axis]; // dimension size for the axis + const int64_t upper_dim_count = // number of slices we can walk through iteratively + std::accumulate(input_shape.begin(), input_shape.begin() + axis, static_cast(1), std::multiplies()); + const int64_t lower_dim_size = // sizes of the slices we can treat as 1D arrays + std::accumulate(input_shape.begin() + axis + 1, input_shape.end(), static_cast(1), std::multiplies()); if (!reverse_) { - int64_t index(0); // the index we use as we walkthrough the given axis - // If (exclusive == true) the first slice is always 0 + const auto* input_iter = input->Data(); + auto* output_iter = output_tensor.MutableData(); + const auto* prev_output_iter = output_iter; + if (exclusive_) { - ::ZeroOutSliceAtIndex(output_tensor, rank, axis, index, slice_dims, steps, slice_size); - ++index; - } - - if (index < dim) { - // The next slice is a copy of the input (if exclusive == false then this is the first slice) - auto input_starts(::GetStarts(rank, axis, 0)); - auto output_starts(::GetStarts(rank, axis, index)); - ::CopySlices(*input, output_tensor, input_starts, output_starts, slice_dims, steps, slice_size); - ++index; - } - - for (; index < dim; ++index) { - // Each output slice is the sum of corresponding input slice and the previous output slice - auto input_starts(::GetStarts(rank, axis, exclusive_ ? index - 1 : index)); - auto output_starts(::GetStarts(rank, axis, index)); - auto previous_starts(::GetStarts(rank, axis, index - 1)); - ::SumSlices(*input, output_tensor, input_starts, output_starts, previous_starts, - slice_dims, steps, slice_size); + for (int64_t outer = 0; outer < upper_dim_count; outer++) { + prev_output_iter = output_iter; + for (int64_t inner = 0; inner < lower_dim_size; inner++) { + *(output_iter++) = 0; + } + for (int64_t cum_axis = 1; cum_axis < dim; cum_axis++) { + for (int64_t inner = 0; inner < lower_dim_size; inner++) { + *(output_iter++) = *(prev_output_iter++) + *(input_iter++); + } + } + input_iter += lower_dim_size; + } + } else { + for (int64_t outer = 0; outer < upper_dim_count; outer++) { + prev_output_iter = output_iter; + for (int64_t inner = 0; inner < lower_dim_size; inner++) { + *(output_iter++) = *(input_iter++); + } + for (int64_t cum_axis = 1; cum_axis < dim; cum_axis++) { + for (int64_t inner = 0; inner < lower_dim_size; inner++) { + *(output_iter++) = *(prev_output_iter++) + *(input_iter++); + } + } + } } } else { - //_reverse == true - int64_t index(dim - 1); // the index we use as we walkthrough the given axis - // If (exclusive == true) the first slice is always 0 + // in this case the logic is mostly the same, but we start from the end + const auto* input_iter = input->Data() + input->Shape().Size(); + auto* output_iter = output_tensor.MutableData() + output_shape.Size(); + const auto* prev_output_iter = output_iter; + if (exclusive_) { - ::ZeroOutSliceAtIndex(output_tensor, rank, axis, index, slice_dims, steps, slice_size); - --index; - } - - if (index >= 0) { - // The next slice is a copy of the input (if exclusive == false then this is the first slice) - auto input_starts(::GetStarts(rank, axis, dim - 1)); - auto output_starts(::GetStarts(rank, axis, index)); - ::CopySlices(*input, output_tensor, input_starts, output_starts, slice_dims, steps, slice_size); - --index; - } - - for (; index >= 0; --index) { - // Each output slice is the sum of corresponding input slice and the previous output slice - auto input_starts(::GetStarts(rank, axis, exclusive_ ? index + 1 : index)); - auto output_starts(::GetStarts(rank, axis, index)); - auto previous_starts(::GetStarts(rank, axis, index + 1)); - ::SumSlices(*input, output_tensor, input_starts, output_starts, previous_starts, - slice_dims, steps, slice_size); + for (int64_t outer = upper_dim_count - 1; outer >= 0; outer--) { + prev_output_iter = output_iter; + for (int64_t inner = lower_dim_size - 1; inner >= 0; inner--) { + *(--output_iter) = 0; + } + for (int64_t cum_axis = dim - 1; cum_axis > 0; cum_axis--) { + for (int64_t inner = lower_dim_size - 1; inner >= 0; inner--) { + *(--output_iter) = *(--prev_output_iter) + *(--input_iter); + } + } + input_iter -= lower_dim_size; + } + } else { + for (int64_t outer = upper_dim_count - 1; outer >= 0; outer--) { + prev_output_iter = output_iter; + for (int64_t inner = lower_dim_size - 1; inner >= 0; inner--) { + *(--output_iter) = *(--input_iter); + } + for (int64_t cum_axis = dim - 1; cum_axis > 0; cum_axis--) { + for (int64_t inner = lower_dim_size - 1; inner >= 0; inner--) { + *(--output_iter) = *(--prev_output_iter) + *(--input_iter); + } + } + } } } diff --git a/onnxruntime/test/providers/cpu/math/cumsum_test.cc b/onnxruntime/test/providers/cpu/math/cumsum_test.cc index 845530c588..16563794b0 100644 --- a/onnxruntime/test/providers/cpu/math/cumsum_test.cc +++ b/onnxruntime/test/providers/cpu/math/cumsum_test.cc @@ -247,5 +247,15 @@ TEST(CumSumTest, _1DTestdouble_WithInt64Axis) { test.AddOutput("y", {5}, {1., 3., 6., 10., 15.}); test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } +TEST(CumSumTest, _1DTestLong) { + OpTester test("CumSum", 11, onnxruntime::kOnnxDomain); + constexpr int N = 100000; + std::vector output_value(N); + std::iota(output_value.begin(), output_value.end(), 1); + test.AddInput("x", {N}, std::vector(N, 1)); + test.AddInput("axis", {}, {0}); + test.AddOutput("y", {N}, output_value); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} } // namespace test } // namespace onnxruntime