Update get_applicable_matrix_reduction() to combine dimensions of 1 with the given reduction axes. (#5734)

This commit is contained in:
edgchen1 2020-11-10 10:32:50 -08:00 committed by GitHub
parent 63b85fc696
commit 4c6118eb49
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 192 additions and 88 deletions

View file

@ -4,6 +4,8 @@
#include "core/providers/cuda/reduction/reduction_functions.h"
#include <algorithm>
#include <cassert>
#include <iterator>
#include <utility>
#include "core/common/optional.h"
@ -13,6 +15,88 @@
namespace onnxruntime {
namespace cuda {
namespace {
// std::make_reverse_iterator is not implemented in older versions of GCC
#if !defined(__GNUC__) || __GNUC__ >= 5
using std::make_reverse_iterator;
#else
template <typename It>
std::reverse_iterator<It> make_reverse_iterator(It it) {
return std::reverse_iterator<It>(it);
}
#endif
// gets min and max of single contiguous range of axes if available
optional<std::pair<int64_t, int64_t>> GetMinAndMaxContiguousAxes(
int64_t rank,
const std::vector<int64_t>& dims,
const std::vector<int64_t>& original_axes) {
assert(rank == static_cast<int64_t>(dims.size()));
// empty axes means reduce all dimensions
if (original_axes.empty()) {
return std::make_pair(int64_t{0}, rank - 1);
}
// normalize axis values and sort
const std::vector<int64_t> axes = [&original_axes, rank]() {
std::vector<int64_t> result(original_axes);
std::for_each(
result.begin(), result.end(),
[rank](int64_t& axis) { axis = HandleNegativeAxis(axis, rank); });
std::sort(result.begin(), result.end());
return result;
}();
assert(!axes.empty());
const auto is_dim_one = [](int64_t dim) { return dim == 1; };
for (auto a = axes.begin(), b = axes.begin() + 1;
b != axes.end();
++a, ++b) {
ORT_ENFORCE(*a != *b, "axes must not contain duplicate values");
// if axis values are adjacent, the axes are contiguous
if (*a + 1 == *b) {
continue;
}
// if all dimension values between adjacent axes are 1,
// treat the axes as contiguous
if (std::all_of(dims.begin() + *a + 1, dims.begin() + *b, is_dim_one)) {
continue;
}
// otherwise, not contiguous
return nullopt;
}
// expand axes over surrounding dimensions with value of 1
const int64_t min_axis = [&dims, &axes, &is_dim_one]() {
const auto& min_given_axis = axes.front();
// note that std::reverse_iterator(it) refers to the element at (it-1)
// it -> reverse it: element offset of -1
const auto before_min_given_axis_rit =
make_reverse_iterator(dims.begin() + min_given_axis);
const auto before_min_axis_rit =
std::find_if_not(before_min_given_axis_rit, dims.rend(), is_dim_one);
// reverse it -> it: element offset of +1
return std::distance(dims.begin(), before_min_axis_rit.base());
}();
const int64_t max_axis = [&dims, &axes, &is_dim_one]() {
const auto& max_given_axis = axes.back();
const auto after_max_given_axis_it = dims.begin() + max_given_axis + 1;
const auto after_max_axis_it =
std::find_if_not(after_max_given_axis_it, dims.end(), is_dim_one);
return std::distance(dims.begin(), after_max_axis_it - 1);
}();
return std::make_pair(min_axis, max_axis);
}
} // namespace
ApplicableMatrixReduction get_applicable_matrix_reduction(
const cudnnReduceTensorOp_t cudnn_reduce_op,
const std::vector<int64_t>& dims, const std::vector<int64_t>& original_axes,
@ -22,53 +106,25 @@ ApplicableMatrixReduction get_applicable_matrix_reduction(
}
const auto rank = gsl::narrow<int64_t>(dims.size());
// min and max of single contiguous range of axes
const auto minmax_axes = [&]() -> optional<std::pair<int64_t, int64_t>> {
// empty axes means reduce all dimensions
if (original_axes.empty()) {
return std::make_pair(int64_t{0}, rank - 1);
}
// normalize axis values and sort
const std::vector<int64_t> axes = [&original_axes, rank]() {
std::vector<int64_t> result(original_axes);
std::for_each(
result.begin(), result.end(),
[rank](int64_t& axis) { axis = HandleNegativeAxis(axis, rank); });
std::sort(result.begin(), result.end());
return result;
}();
for (auto a = axes.begin(), b = axes.begin() + 1;
b != axes.end();
++a, ++b) {
ORT_ENFORCE(*a != *b, "axes must not contain duplicate values");
if (*a + 1 != *b) { // not contiguous
return {};
}
}
return std::make_pair(axes.front(), axes.back());
}();
if (!minmax_axes.has_value()) {
const auto min_and_max_axes = GetMinAndMaxContiguousAxes(rank, dims, original_axes);
if (!min_and_max_axes.has_value()) {
return ApplicableMatrixReduction::None;
}
const auto& min_axis = minmax_axes.value().first;
const auto& max_axis = minmax_axes.value().second;
const auto& min_axis = min_and_max_axes.value().first;
const auto& max_axis = min_and_max_axes.value().second;
// axes from beginning means row reduction, axes to end means column reduction
// currently we don't handle axes from beginning to end, but that could be either
// for axes from beginning to end, either works and we do row reduction
const bool axes_from_beginning = min_axis == 0;
const bool axes_to_end = max_axis == rank - 1;
// handle axes anchored to one of beginning or end, not both
if (axes_from_beginning == axes_to_end) {
// handle axes anchored to beginning or end
if (!axes_from_beginning && !axes_to_end) {
return ApplicableMatrixReduction::None;
}
// the axis index right after the last flattened into matrix rows
const int64_t m_end_axis = axes_from_beginning ? max_axis + 1 : min_axis;
const TensorShape& shape = TensorShape::ReinterpretBaseType(dims);

View file

@ -663,7 +663,7 @@ Status ReduceKernel<allow_multi_axes>::ComputeImpl(OpKernelContext* ctx, cudnnRe
Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims);
const bool fast_reduction = fast_reduction_ && !ctx->GetUseDeterministicCompute();
return ReduceComputeCore<T, ReduceTensorIndices>(*cuda_ep_, *X, prepare_reduce_metadata, *Y, cudnn_reduce_op, axes_,
return ReduceComputeCore<T, ReduceTensorIndices>(*cuda_ep_, *X, prepare_reduce_metadata, *Y, cudnn_reduce_op, axes,
calculate_log_, calculate_sqt_, log_sum_exp_, fast_reduction);
}

View file

@ -7,11 +7,15 @@
#include "gtest/gtest.h"
#include "core/common/optional.h"
#include "core/providers/cuda/reduction/reduction_functions.h"
#include "test/common/tensor_op_test_utils.h"
#include "test/util/include/asserts.h"
using onnxruntime::test::RandomValueGenerator;
namespace onnxruntime {
namespace cuda {
namespace test {
namespace {
@ -55,7 +59,7 @@ void TestReduceRowToScalarApis(int size, float relative_error_tolerance = 1e-4f)
expected_output_mean += input_value / float(size);
}
const auto buffer_size_in_bytes =
cuda::compute_reduction_buffer_size<float>(size);
compute_reduction_buffer_size<float>(size);
auto device_input = AllocateDeviceMemory<float>(size);
auto device_output_sum = AllocateDeviceMemory<float>();
@ -65,19 +69,19 @@ void TestReduceRowToScalarApis(int size, float relative_error_tolerance = 1e-4f)
cudaMemcpy(device_input.get(), input.data(), size * sizeof(float), cudaMemcpyHostToDevice);
ASSERT_STATUS_OK(cuda::reduce_sum(
ASSERT_STATUS_OK(reduce_sum(
device_input.get(),
device_output_sum.get(),
size,
buffer.get(),
buffer_size_in_bytes));
ASSERT_STATUS_OK(cuda::reduce_square_sum(
ASSERT_STATUS_OK(reduce_square_sum(
device_input.get(),
device_output_square_sum.get(),
size,
buffer.get(),
buffer_size_in_bytes));
ASSERT_STATUS_OK(cuda::reduce_mean(
ASSERT_STATUS_OK(reduce_mean(
device_input.get(),
device_output_mean.get(),
size,
@ -116,10 +120,10 @@ void TestReduceRowsToRow(int m, int n, bool reset_initial_output, float relative
if (!reset_initial_output) {
// manually initialize output data
cuda::Fill(d_out.get(), initial_value, n);
Fill(d_out.get(), initial_value, n);
}
ASSERT_STATUS_OK(cuda::reduce_matrix_rows(
ASSERT_STATUS_OK(reduce_matrix_rows(
d_in.get(), d_out.get(),
m, n,
reset_initial_output));
@ -155,10 +159,10 @@ void TestReduceColumnsToColumn(int m, int n, float relative_error_tolerance = 1e
cudaMemcpy(d_in.get(), values.data(), m * n * sizeof(float), cudaMemcpyHostToDevice);
size_t buffer_size_in_bytes =
cuda::compute_reduce_matrix_columns_buffer_size<float>(m, n);
compute_reduce_matrix_columns_buffer_size<float>(m, n);
auto d_buffer = AllocateDeviceMemory<char>(buffer_size_in_bytes);
ASSERT_STATUS_OK(cuda::reduce_matrix_columns(
ASSERT_STATUS_OK(reduce_matrix_columns(
d_in.get(), d_out.get(),
m, n,
d_buffer.get(), buffer_size_in_bytes));
@ -202,7 +206,7 @@ TEST(ReductionFunctionsTest, BufferOffsets) {
const size_t max_buffer_offset = 15;
const size_t buffer_size_in_bytes =
cuda::compute_reduce_matrix_columns_buffer_size<double>(m, n) + max_buffer_offset;
compute_reduce_matrix_columns_buffer_size<double>(m, n) + max_buffer_offset;
auto d_input = AllocateDeviceMemory<double>(m * n);
auto d_output = AllocateDeviceMemory<double>(m);
@ -217,7 +221,7 @@ TEST(ReductionFunctionsTest, BufferOffsets) {
const auto input = random.Uniform<double>({m, n}, 1.0, 10.0);
cudaMemcpy(d_input.get(), input.data(), m * n * sizeof(double), cudaMemcpyHostToDevice);
ASSERT_STATUS_OK(cuda::reduce_matrix_columns(
ASSERT_STATUS_OK(reduce_matrix_columns(
d_input.get(), d_output.get(),
m, n,
d_buffer.get() + buffer_offset,
@ -234,7 +238,7 @@ TEST(ReductionFunctionsTest, InvalidBufferSize) {
// this should be too small
const size_t buffer_size_in_bytes =
cuda::compute_reduce_matrix_columns_buffer_size<float>(m, n) / 10;
compute_reduce_matrix_columns_buffer_size<float>(m, n) / 10;
auto d_input = AllocateDeviceMemory<float>(m * n);
auto d_output = AllocateDeviceMemory<float>(m);
@ -245,64 +249,108 @@ TEST(ReductionFunctionsTest, InvalidBufferSize) {
cudaMemcpy(d_input.get(), input.data(), m * n * sizeof(float), cudaMemcpyHostToDevice);
const auto status =
cuda::reduce_matrix_columns(d_input.get(), d_output.get(), m, n, d_buffer.get(), buffer_size_in_bytes);
reduce_matrix_columns(d_input.get(), d_output.get(), m, n, d_buffer.get(), buffer_size_in_bytes);
ASSERT_FALSE(status.IsOK());
}
TEST(ReductionFunctionsTest, GetApplicableMatrixReduction) {
auto test_get_applicable_matrix_reduction =
[](cudnnReduceTensorOp_t cudnn_op,
const std::vector<int64_t>& dims, const std::vector<int64_t>& axes,
ApplicableMatrixReduction expected_reduction,
const optional<int>& expected_m = nullopt,
const optional<int>& expected_n = nullopt) {
SCOPED_TRACE(MakeString(
"cudnn_op: ", cudnn_op,
", dims: ", TensorShape::ReinterpretBaseType(dims),
", axes: ", TensorShape::ReinterpretBaseType(axes)));
int m{}, n{};
EXPECT_EQ(
static_cast<int>(get_applicable_matrix_reduction(cudnn_op, dims, axes, m, n)),
static_cast<int>(expected_reduction));
if (expected_m) {
EXPECT_EQ(m, *expected_m);
}
if (expected_n) {
EXPECT_EQ(n, *expected_n);
}
};
const cudnnReduceTensorOp_t valid_op_type = CUDNN_REDUCE_TENSOR_ADD;
int m{}, n{};
// contiguous axes from beginning
EXPECT_EQ(
cuda::get_applicable_matrix_reduction(
valid_op_type, {2, 4, 8, 16}, {0, 1}, m, n),
cuda::ApplicableMatrixReduction::Rows);
EXPECT_EQ(m, 2 * 4);
EXPECT_EQ(n, 8 * 16);
test_get_applicable_matrix_reduction(
valid_op_type, {2, 4, 8, 16}, {0, 1},
ApplicableMatrixReduction::Rows, 2 * 4, 8 * 16);
// contiguous axes to end
EXPECT_EQ(
cuda::get_applicable_matrix_reduction(
valid_op_type, {2, 4, 8, 16}, {1, 2, 3}, m, n),
cuda::ApplicableMatrixReduction::Columns);
EXPECT_EQ(m, 2);
EXPECT_EQ(n, 4 * 8 * 16);
test_get_applicable_matrix_reduction(
valid_op_type, {2, 4, 8, 16}, {1, 2, 3},
ApplicableMatrixReduction::Columns, 2, 4 * 8 * 16);
// single axis
EXPECT_EQ(
cuda::get_applicable_matrix_reduction(
valid_op_type, {2, 4, 8, 16}, {3}, m, n),
cuda::ApplicableMatrixReduction::Columns);
EXPECT_EQ(m, 2 * 4 * 8);
EXPECT_EQ(n, 16);
test_get_applicable_matrix_reduction(
valid_op_type, {2, 4, 8, 16}, {3},
ApplicableMatrixReduction::Columns, 2 * 4 * 8, 16);
// empty axes
test_get_applicable_matrix_reduction(
valid_op_type, {2, 4, 8, 16}, {},
ApplicableMatrixReduction::Rows, 2 * 4 * 8 * 16, 1);
// all axes
test_get_applicable_matrix_reduction(
valid_op_type, {2, 4, 8, 16}, {0, 1, 2, 3},
ApplicableMatrixReduction::Rows, 2 * 4 * 8 * 16, 1);
// handle ones
test_get_applicable_matrix_reduction(
valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {0},
ApplicableMatrixReduction::Rows, 1, 2 * 4 * 8);
test_get_applicable_matrix_reduction(
valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {1},
ApplicableMatrixReduction::Rows, 2, 4 * 8);
test_get_applicable_matrix_reduction(
valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {1, 3},
ApplicableMatrixReduction::Rows, 2, 4 * 8);
test_get_applicable_matrix_reduction(
valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {1, 3, 4},
ApplicableMatrixReduction::Rows, 2 * 4, 8);
test_get_applicable_matrix_reduction(
valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {1, 3, 4, 6},
ApplicableMatrixReduction::Rows, 2 * 4 * 8, 1);
test_get_applicable_matrix_reduction(
valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {3, 4, 6},
ApplicableMatrixReduction::Columns, 2, 4 * 8);
test_get_applicable_matrix_reduction(
valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {4, 6},
ApplicableMatrixReduction::Columns, 2, 4 * 8);
test_get_applicable_matrix_reduction(
valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {6},
ApplicableMatrixReduction::Columns, 2 * 4, 8);
test_get_applicable_matrix_reduction(
valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {7},
ApplicableMatrixReduction::Columns, 2 * 4 * 8, 1);
// unsupported axes
EXPECT_EQ(
cuda::get_applicable_matrix_reduction(
valid_op_type, {2, 4, 8, 16}, {0, 1, 2, 3}, m, n),
cuda::ApplicableMatrixReduction::None);
EXPECT_EQ(
cuda::get_applicable_matrix_reduction(
valid_op_type, {2, 4, 8, 16}, {}, m, n),
cuda::ApplicableMatrixReduction::None);
EXPECT_EQ(
cuda::get_applicable_matrix_reduction(
valid_op_type, {2, 4, 8, 16, 32, 64}, {0, 1, 3, 4}, m, n),
cuda::ApplicableMatrixReduction::None);
EXPECT_EQ(
cuda::get_applicable_matrix_reduction(
valid_op_type, {2, 4, 8, 16}, {1, 2}, m, n),
cuda::ApplicableMatrixReduction::None);
test_get_applicable_matrix_reduction(
valid_op_type, {2, 4, 8, 16, 32, 64}, {0, 1, 3, 4},
ApplicableMatrixReduction::None);
test_get_applicable_matrix_reduction(
valid_op_type, {2, 4, 8, 16}, {1, 2},
ApplicableMatrixReduction::None);
test_get_applicable_matrix_reduction(
valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {3, 6},
ApplicableMatrixReduction::None);
// invalid op type
EXPECT_EQ(
cuda::get_applicable_matrix_reduction(
CUDNN_REDUCE_TENSOR_MAX, {2, 4, 8, 16}, {0, 1}, m, n),
cuda::ApplicableMatrixReduction::None);
test_get_applicable_matrix_reduction(
CUDNN_REDUCE_TENSOR_MAX, {2, 4, 8, 16}, {0, 1},
ApplicableMatrixReduction::None);
}
} // namespace test
} // namespace cuda
} // namespace onnxruntime
#endif