mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-26 03:00:54 +00:00
Update get_applicable_matrix_reduction() to combine dimensions of 1 with the given reduction axes. (#5734)
This commit is contained in:
parent
63b85fc696
commit
4c6118eb49
3 changed files with 192 additions and 88 deletions
|
|
@ -4,6 +4,8 @@
|
|||
#include "core/providers/cuda/reduction/reduction_functions.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <iterator>
|
||||
#include <utility>
|
||||
|
||||
#include "core/common/optional.h"
|
||||
|
|
@ -13,6 +15,88 @@
|
|||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
||||
namespace {
|
||||
// std::make_reverse_iterator is not implemented in older versions of GCC
|
||||
#if !defined(__GNUC__) || __GNUC__ >= 5
|
||||
using std::make_reverse_iterator;
|
||||
#else
|
||||
template <typename It>
|
||||
std::reverse_iterator<It> make_reverse_iterator(It it) {
|
||||
return std::reverse_iterator<It>(it);
|
||||
}
|
||||
#endif
|
||||
|
||||
// gets min and max of single contiguous range of axes if available
|
||||
optional<std::pair<int64_t, int64_t>> GetMinAndMaxContiguousAxes(
|
||||
int64_t rank,
|
||||
const std::vector<int64_t>& dims,
|
||||
const std::vector<int64_t>& original_axes) {
|
||||
assert(rank == static_cast<int64_t>(dims.size()));
|
||||
|
||||
// empty axes means reduce all dimensions
|
||||
if (original_axes.empty()) {
|
||||
return std::make_pair(int64_t{0}, rank - 1);
|
||||
}
|
||||
|
||||
// normalize axis values and sort
|
||||
const std::vector<int64_t> axes = [&original_axes, rank]() {
|
||||
std::vector<int64_t> result(original_axes);
|
||||
std::for_each(
|
||||
result.begin(), result.end(),
|
||||
[rank](int64_t& axis) { axis = HandleNegativeAxis(axis, rank); });
|
||||
std::sort(result.begin(), result.end());
|
||||
return result;
|
||||
}();
|
||||
|
||||
assert(!axes.empty());
|
||||
|
||||
const auto is_dim_one = [](int64_t dim) { return dim == 1; };
|
||||
|
||||
for (auto a = axes.begin(), b = axes.begin() + 1;
|
||||
b != axes.end();
|
||||
++a, ++b) {
|
||||
ORT_ENFORCE(*a != *b, "axes must not contain duplicate values");
|
||||
|
||||
// if axis values are adjacent, the axes are contiguous
|
||||
if (*a + 1 == *b) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// if all dimension values between adjacent axes are 1,
|
||||
// treat the axes as contiguous
|
||||
if (std::all_of(dims.begin() + *a + 1, dims.begin() + *b, is_dim_one)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// otherwise, not contiguous
|
||||
return nullopt;
|
||||
}
|
||||
|
||||
// expand axes over surrounding dimensions with value of 1
|
||||
const int64_t min_axis = [&dims, &axes, &is_dim_one]() {
|
||||
const auto& min_given_axis = axes.front();
|
||||
// note that std::reverse_iterator(it) refers to the element at (it-1)
|
||||
// it -> reverse it: element offset of -1
|
||||
const auto before_min_given_axis_rit =
|
||||
make_reverse_iterator(dims.begin() + min_given_axis);
|
||||
const auto before_min_axis_rit =
|
||||
std::find_if_not(before_min_given_axis_rit, dims.rend(), is_dim_one);
|
||||
// reverse it -> it: element offset of +1
|
||||
return std::distance(dims.begin(), before_min_axis_rit.base());
|
||||
}();
|
||||
|
||||
const int64_t max_axis = [&dims, &axes, &is_dim_one]() {
|
||||
const auto& max_given_axis = axes.back();
|
||||
const auto after_max_given_axis_it = dims.begin() + max_given_axis + 1;
|
||||
const auto after_max_axis_it =
|
||||
std::find_if_not(after_max_given_axis_it, dims.end(), is_dim_one);
|
||||
return std::distance(dims.begin(), after_max_axis_it - 1);
|
||||
}();
|
||||
|
||||
return std::make_pair(min_axis, max_axis);
|
||||
}
|
||||
} // namespace
|
||||
|
||||
ApplicableMatrixReduction get_applicable_matrix_reduction(
|
||||
const cudnnReduceTensorOp_t cudnn_reduce_op,
|
||||
const std::vector<int64_t>& dims, const std::vector<int64_t>& original_axes,
|
||||
|
|
@ -22,53 +106,25 @@ ApplicableMatrixReduction get_applicable_matrix_reduction(
|
|||
}
|
||||
|
||||
const auto rank = gsl::narrow<int64_t>(dims.size());
|
||||
|
||||
// min and max of single contiguous range of axes
|
||||
const auto minmax_axes = [&]() -> optional<std::pair<int64_t, int64_t>> {
|
||||
// empty axes means reduce all dimensions
|
||||
if (original_axes.empty()) {
|
||||
return std::make_pair(int64_t{0}, rank - 1);
|
||||
}
|
||||
|
||||
// normalize axis values and sort
|
||||
const std::vector<int64_t> axes = [&original_axes, rank]() {
|
||||
std::vector<int64_t> result(original_axes);
|
||||
std::for_each(
|
||||
result.begin(), result.end(),
|
||||
[rank](int64_t& axis) { axis = HandleNegativeAxis(axis, rank); });
|
||||
std::sort(result.begin(), result.end());
|
||||
return result;
|
||||
}();
|
||||
|
||||
for (auto a = axes.begin(), b = axes.begin() + 1;
|
||||
b != axes.end();
|
||||
++a, ++b) {
|
||||
ORT_ENFORCE(*a != *b, "axes must not contain duplicate values");
|
||||
if (*a + 1 != *b) { // not contiguous
|
||||
return {};
|
||||
}
|
||||
}
|
||||
|
||||
return std::make_pair(axes.front(), axes.back());
|
||||
}();
|
||||
|
||||
if (!minmax_axes.has_value()) {
|
||||
const auto min_and_max_axes = GetMinAndMaxContiguousAxes(rank, dims, original_axes);
|
||||
if (!min_and_max_axes.has_value()) {
|
||||
return ApplicableMatrixReduction::None;
|
||||
}
|
||||
|
||||
const auto& min_axis = minmax_axes.value().first;
|
||||
const auto& max_axis = minmax_axes.value().second;
|
||||
const auto& min_axis = min_and_max_axes.value().first;
|
||||
const auto& max_axis = min_and_max_axes.value().second;
|
||||
|
||||
// axes from beginning means row reduction, axes to end means column reduction
|
||||
// currently we don't handle axes from beginning to end, but that could be either
|
||||
// for axes from beginning to end, either works and we do row reduction
|
||||
const bool axes_from_beginning = min_axis == 0;
|
||||
const bool axes_to_end = max_axis == rank - 1;
|
||||
|
||||
// handle axes anchored to one of beginning or end, not both
|
||||
if (axes_from_beginning == axes_to_end) {
|
||||
// handle axes anchored to beginning or end
|
||||
if (!axes_from_beginning && !axes_to_end) {
|
||||
return ApplicableMatrixReduction::None;
|
||||
}
|
||||
|
||||
// the axis index right after the last flattened into matrix rows
|
||||
const int64_t m_end_axis = axes_from_beginning ? max_axis + 1 : min_axis;
|
||||
|
||||
const TensorShape& shape = TensorShape::ReinterpretBaseType(dims);
|
||||
|
|
|
|||
|
|
@ -663,7 +663,7 @@ Status ReduceKernel<allow_multi_axes>::ComputeImpl(OpKernelContext* ctx, cudnnRe
|
|||
Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims);
|
||||
const bool fast_reduction = fast_reduction_ && !ctx->GetUseDeterministicCompute();
|
||||
|
||||
return ReduceComputeCore<T, ReduceTensorIndices>(*cuda_ep_, *X, prepare_reduce_metadata, *Y, cudnn_reduce_op, axes_,
|
||||
return ReduceComputeCore<T, ReduceTensorIndices>(*cuda_ep_, *X, prepare_reduce_metadata, *Y, cudnn_reduce_op, axes,
|
||||
calculate_log_, calculate_sqt_, log_sum_exp_, fast_reduction);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -7,11 +7,15 @@
|
|||
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
#include "core/common/optional.h"
|
||||
#include "core/providers/cuda/reduction/reduction_functions.h"
|
||||
#include "test/common/tensor_op_test_utils.h"
|
||||
#include "test/util/include/asserts.h"
|
||||
|
||||
using onnxruntime::test::RandomValueGenerator;
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
namespace test {
|
||||
|
||||
namespace {
|
||||
|
|
@ -55,7 +59,7 @@ void TestReduceRowToScalarApis(int size, float relative_error_tolerance = 1e-4f)
|
|||
expected_output_mean += input_value / float(size);
|
||||
}
|
||||
const auto buffer_size_in_bytes =
|
||||
cuda::compute_reduction_buffer_size<float>(size);
|
||||
compute_reduction_buffer_size<float>(size);
|
||||
|
||||
auto device_input = AllocateDeviceMemory<float>(size);
|
||||
auto device_output_sum = AllocateDeviceMemory<float>();
|
||||
|
|
@ -65,19 +69,19 @@ void TestReduceRowToScalarApis(int size, float relative_error_tolerance = 1e-4f)
|
|||
|
||||
cudaMemcpy(device_input.get(), input.data(), size * sizeof(float), cudaMemcpyHostToDevice);
|
||||
|
||||
ASSERT_STATUS_OK(cuda::reduce_sum(
|
||||
ASSERT_STATUS_OK(reduce_sum(
|
||||
device_input.get(),
|
||||
device_output_sum.get(),
|
||||
size,
|
||||
buffer.get(),
|
||||
buffer_size_in_bytes));
|
||||
ASSERT_STATUS_OK(cuda::reduce_square_sum(
|
||||
ASSERT_STATUS_OK(reduce_square_sum(
|
||||
device_input.get(),
|
||||
device_output_square_sum.get(),
|
||||
size,
|
||||
buffer.get(),
|
||||
buffer_size_in_bytes));
|
||||
ASSERT_STATUS_OK(cuda::reduce_mean(
|
||||
ASSERT_STATUS_OK(reduce_mean(
|
||||
device_input.get(),
|
||||
device_output_mean.get(),
|
||||
size,
|
||||
|
|
@ -116,10 +120,10 @@ void TestReduceRowsToRow(int m, int n, bool reset_initial_output, float relative
|
|||
|
||||
if (!reset_initial_output) {
|
||||
// manually initialize output data
|
||||
cuda::Fill(d_out.get(), initial_value, n);
|
||||
Fill(d_out.get(), initial_value, n);
|
||||
}
|
||||
|
||||
ASSERT_STATUS_OK(cuda::reduce_matrix_rows(
|
||||
ASSERT_STATUS_OK(reduce_matrix_rows(
|
||||
d_in.get(), d_out.get(),
|
||||
m, n,
|
||||
reset_initial_output));
|
||||
|
|
@ -155,10 +159,10 @@ void TestReduceColumnsToColumn(int m, int n, float relative_error_tolerance = 1e
|
|||
cudaMemcpy(d_in.get(), values.data(), m * n * sizeof(float), cudaMemcpyHostToDevice);
|
||||
|
||||
size_t buffer_size_in_bytes =
|
||||
cuda::compute_reduce_matrix_columns_buffer_size<float>(m, n);
|
||||
compute_reduce_matrix_columns_buffer_size<float>(m, n);
|
||||
auto d_buffer = AllocateDeviceMemory<char>(buffer_size_in_bytes);
|
||||
|
||||
ASSERT_STATUS_OK(cuda::reduce_matrix_columns(
|
||||
ASSERT_STATUS_OK(reduce_matrix_columns(
|
||||
d_in.get(), d_out.get(),
|
||||
m, n,
|
||||
d_buffer.get(), buffer_size_in_bytes));
|
||||
|
|
@ -202,7 +206,7 @@ TEST(ReductionFunctionsTest, BufferOffsets) {
|
|||
const size_t max_buffer_offset = 15;
|
||||
|
||||
const size_t buffer_size_in_bytes =
|
||||
cuda::compute_reduce_matrix_columns_buffer_size<double>(m, n) + max_buffer_offset;
|
||||
compute_reduce_matrix_columns_buffer_size<double>(m, n) + max_buffer_offset;
|
||||
|
||||
auto d_input = AllocateDeviceMemory<double>(m * n);
|
||||
auto d_output = AllocateDeviceMemory<double>(m);
|
||||
|
|
@ -217,7 +221,7 @@ TEST(ReductionFunctionsTest, BufferOffsets) {
|
|||
const auto input = random.Uniform<double>({m, n}, 1.0, 10.0);
|
||||
cudaMemcpy(d_input.get(), input.data(), m * n * sizeof(double), cudaMemcpyHostToDevice);
|
||||
|
||||
ASSERT_STATUS_OK(cuda::reduce_matrix_columns(
|
||||
ASSERT_STATUS_OK(reduce_matrix_columns(
|
||||
d_input.get(), d_output.get(),
|
||||
m, n,
|
||||
d_buffer.get() + buffer_offset,
|
||||
|
|
@ -234,7 +238,7 @@ TEST(ReductionFunctionsTest, InvalidBufferSize) {
|
|||
|
||||
// this should be too small
|
||||
const size_t buffer_size_in_bytes =
|
||||
cuda::compute_reduce_matrix_columns_buffer_size<float>(m, n) / 10;
|
||||
compute_reduce_matrix_columns_buffer_size<float>(m, n) / 10;
|
||||
|
||||
auto d_input = AllocateDeviceMemory<float>(m * n);
|
||||
auto d_output = AllocateDeviceMemory<float>(m);
|
||||
|
|
@ -245,64 +249,108 @@ TEST(ReductionFunctionsTest, InvalidBufferSize) {
|
|||
cudaMemcpy(d_input.get(), input.data(), m * n * sizeof(float), cudaMemcpyHostToDevice);
|
||||
|
||||
const auto status =
|
||||
cuda::reduce_matrix_columns(d_input.get(), d_output.get(), m, n, d_buffer.get(), buffer_size_in_bytes);
|
||||
reduce_matrix_columns(d_input.get(), d_output.get(), m, n, d_buffer.get(), buffer_size_in_bytes);
|
||||
ASSERT_FALSE(status.IsOK());
|
||||
}
|
||||
|
||||
TEST(ReductionFunctionsTest, GetApplicableMatrixReduction) {
|
||||
auto test_get_applicable_matrix_reduction =
|
||||
[](cudnnReduceTensorOp_t cudnn_op,
|
||||
const std::vector<int64_t>& dims, const std::vector<int64_t>& axes,
|
||||
ApplicableMatrixReduction expected_reduction,
|
||||
const optional<int>& expected_m = nullopt,
|
||||
const optional<int>& expected_n = nullopt) {
|
||||
SCOPED_TRACE(MakeString(
|
||||
"cudnn_op: ", cudnn_op,
|
||||
", dims: ", TensorShape::ReinterpretBaseType(dims),
|
||||
", axes: ", TensorShape::ReinterpretBaseType(axes)));
|
||||
int m{}, n{};
|
||||
EXPECT_EQ(
|
||||
static_cast<int>(get_applicable_matrix_reduction(cudnn_op, dims, axes, m, n)),
|
||||
static_cast<int>(expected_reduction));
|
||||
if (expected_m) {
|
||||
EXPECT_EQ(m, *expected_m);
|
||||
}
|
||||
if (expected_n) {
|
||||
EXPECT_EQ(n, *expected_n);
|
||||
}
|
||||
};
|
||||
|
||||
const cudnnReduceTensorOp_t valid_op_type = CUDNN_REDUCE_TENSOR_ADD;
|
||||
int m{}, n{};
|
||||
|
||||
// contiguous axes from beginning
|
||||
EXPECT_EQ(
|
||||
cuda::get_applicable_matrix_reduction(
|
||||
valid_op_type, {2, 4, 8, 16}, {0, 1}, m, n),
|
||||
cuda::ApplicableMatrixReduction::Rows);
|
||||
EXPECT_EQ(m, 2 * 4);
|
||||
EXPECT_EQ(n, 8 * 16);
|
||||
test_get_applicable_matrix_reduction(
|
||||
valid_op_type, {2, 4, 8, 16}, {0, 1},
|
||||
ApplicableMatrixReduction::Rows, 2 * 4, 8 * 16);
|
||||
|
||||
// contiguous axes to end
|
||||
EXPECT_EQ(
|
||||
cuda::get_applicable_matrix_reduction(
|
||||
valid_op_type, {2, 4, 8, 16}, {1, 2, 3}, m, n),
|
||||
cuda::ApplicableMatrixReduction::Columns);
|
||||
EXPECT_EQ(m, 2);
|
||||
EXPECT_EQ(n, 4 * 8 * 16);
|
||||
test_get_applicable_matrix_reduction(
|
||||
valid_op_type, {2, 4, 8, 16}, {1, 2, 3},
|
||||
ApplicableMatrixReduction::Columns, 2, 4 * 8 * 16);
|
||||
|
||||
// single axis
|
||||
EXPECT_EQ(
|
||||
cuda::get_applicable_matrix_reduction(
|
||||
valid_op_type, {2, 4, 8, 16}, {3}, m, n),
|
||||
cuda::ApplicableMatrixReduction::Columns);
|
||||
EXPECT_EQ(m, 2 * 4 * 8);
|
||||
EXPECT_EQ(n, 16);
|
||||
test_get_applicable_matrix_reduction(
|
||||
valid_op_type, {2, 4, 8, 16}, {3},
|
||||
ApplicableMatrixReduction::Columns, 2 * 4 * 8, 16);
|
||||
|
||||
// empty axes
|
||||
test_get_applicable_matrix_reduction(
|
||||
valid_op_type, {2, 4, 8, 16}, {},
|
||||
ApplicableMatrixReduction::Rows, 2 * 4 * 8 * 16, 1);
|
||||
|
||||
// all axes
|
||||
test_get_applicable_matrix_reduction(
|
||||
valid_op_type, {2, 4, 8, 16}, {0, 1, 2, 3},
|
||||
ApplicableMatrixReduction::Rows, 2 * 4 * 8 * 16, 1);
|
||||
|
||||
// handle ones
|
||||
test_get_applicable_matrix_reduction(
|
||||
valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {0},
|
||||
ApplicableMatrixReduction::Rows, 1, 2 * 4 * 8);
|
||||
test_get_applicable_matrix_reduction(
|
||||
valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {1},
|
||||
ApplicableMatrixReduction::Rows, 2, 4 * 8);
|
||||
test_get_applicable_matrix_reduction(
|
||||
valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {1, 3},
|
||||
ApplicableMatrixReduction::Rows, 2, 4 * 8);
|
||||
test_get_applicable_matrix_reduction(
|
||||
valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {1, 3, 4},
|
||||
ApplicableMatrixReduction::Rows, 2 * 4, 8);
|
||||
test_get_applicable_matrix_reduction(
|
||||
valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {1, 3, 4, 6},
|
||||
ApplicableMatrixReduction::Rows, 2 * 4 * 8, 1);
|
||||
test_get_applicable_matrix_reduction(
|
||||
valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {3, 4, 6},
|
||||
ApplicableMatrixReduction::Columns, 2, 4 * 8);
|
||||
test_get_applicable_matrix_reduction(
|
||||
valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {4, 6},
|
||||
ApplicableMatrixReduction::Columns, 2, 4 * 8);
|
||||
test_get_applicable_matrix_reduction(
|
||||
valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {6},
|
||||
ApplicableMatrixReduction::Columns, 2 * 4, 8);
|
||||
test_get_applicable_matrix_reduction(
|
||||
valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {7},
|
||||
ApplicableMatrixReduction::Columns, 2 * 4 * 8, 1);
|
||||
|
||||
// unsupported axes
|
||||
EXPECT_EQ(
|
||||
cuda::get_applicable_matrix_reduction(
|
||||
valid_op_type, {2, 4, 8, 16}, {0, 1, 2, 3}, m, n),
|
||||
cuda::ApplicableMatrixReduction::None);
|
||||
EXPECT_EQ(
|
||||
cuda::get_applicable_matrix_reduction(
|
||||
valid_op_type, {2, 4, 8, 16}, {}, m, n),
|
||||
cuda::ApplicableMatrixReduction::None);
|
||||
EXPECT_EQ(
|
||||
cuda::get_applicable_matrix_reduction(
|
||||
valid_op_type, {2, 4, 8, 16, 32, 64}, {0, 1, 3, 4}, m, n),
|
||||
cuda::ApplicableMatrixReduction::None);
|
||||
EXPECT_EQ(
|
||||
cuda::get_applicable_matrix_reduction(
|
||||
valid_op_type, {2, 4, 8, 16}, {1, 2}, m, n),
|
||||
cuda::ApplicableMatrixReduction::None);
|
||||
test_get_applicable_matrix_reduction(
|
||||
valid_op_type, {2, 4, 8, 16, 32, 64}, {0, 1, 3, 4},
|
||||
ApplicableMatrixReduction::None);
|
||||
test_get_applicable_matrix_reduction(
|
||||
valid_op_type, {2, 4, 8, 16}, {1, 2},
|
||||
ApplicableMatrixReduction::None);
|
||||
test_get_applicable_matrix_reduction(
|
||||
valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {3, 6},
|
||||
ApplicableMatrixReduction::None);
|
||||
|
||||
// invalid op type
|
||||
EXPECT_EQ(
|
||||
cuda::get_applicable_matrix_reduction(
|
||||
CUDNN_REDUCE_TENSOR_MAX, {2, 4, 8, 16}, {0, 1}, m, n),
|
||||
cuda::ApplicableMatrixReduction::None);
|
||||
test_get_applicable_matrix_reduction(
|
||||
CUDNN_REDUCE_TENSOR_MAX, {2, 4, 8, 16}, {0, 1},
|
||||
ApplicableMatrixReduction::None);
|
||||
}
|
||||
|
||||
} // namespace test
|
||||
} // namespace cuda
|
||||
} // namespace onnxruntime
|
||||
|
||||
#endif
|
||||
|
|
|
|||
Loading…
Reference in a new issue