diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_functions.cc b/onnxruntime/core/providers/cuda/reduction/reduction_functions.cc index cb58191c1d..6889fcfbb1 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_functions.cc +++ b/onnxruntime/core/providers/cuda/reduction/reduction_functions.cc @@ -4,6 +4,8 @@ #include "core/providers/cuda/reduction/reduction_functions.h" #include +#include +#include #include #include "core/common/optional.h" @@ -13,6 +15,88 @@ namespace onnxruntime { namespace cuda { +namespace { +// std::make_reverse_iterator is not implemented in older versions of GCC +#if !defined(__GNUC__) || __GNUC__ >= 5 +using std::make_reverse_iterator; +#else +template +std::reverse_iterator make_reverse_iterator(It it) { + return std::reverse_iterator(it); +} +#endif + +// gets min and max of single contiguous range of axes if available +optional> GetMinAndMaxContiguousAxes( + int64_t rank, + const std::vector& dims, + const std::vector& original_axes) { + assert(rank == static_cast(dims.size())); + + // empty axes means reduce all dimensions + if (original_axes.empty()) { + return std::make_pair(int64_t{0}, rank - 1); + } + + // normalize axis values and sort + const std::vector axes = [&original_axes, rank]() { + std::vector result(original_axes); + std::for_each( + result.begin(), result.end(), + [rank](int64_t& axis) { axis = HandleNegativeAxis(axis, rank); }); + std::sort(result.begin(), result.end()); + return result; + }(); + + assert(!axes.empty()); + + const auto is_dim_one = [](int64_t dim) { return dim == 1; }; + + for (auto a = axes.begin(), b = axes.begin() + 1; + b != axes.end(); + ++a, ++b) { + ORT_ENFORCE(*a != *b, "axes must not contain duplicate values"); + + // if axis values are adjacent, the axes are contiguous + if (*a + 1 == *b) { + continue; + } + + // if all dimension values between adjacent axes are 1, + // treat the axes as contiguous + if (std::all_of(dims.begin() + *a + 1, dims.begin() + *b, is_dim_one)) { + continue; + } + + // otherwise, not contiguous + return nullopt; + } + + // expand axes over surrounding dimensions with value of 1 + const int64_t min_axis = [&dims, &axes, &is_dim_one]() { + const auto& min_given_axis = axes.front(); + // note that std::reverse_iterator(it) refers to the element at (it-1) + // it -> reverse it: element offset of -1 + const auto before_min_given_axis_rit = + make_reverse_iterator(dims.begin() + min_given_axis); + const auto before_min_axis_rit = + std::find_if_not(before_min_given_axis_rit, dims.rend(), is_dim_one); + // reverse it -> it: element offset of +1 + return std::distance(dims.begin(), before_min_axis_rit.base()); + }(); + + const int64_t max_axis = [&dims, &axes, &is_dim_one]() { + const auto& max_given_axis = axes.back(); + const auto after_max_given_axis_it = dims.begin() + max_given_axis + 1; + const auto after_max_axis_it = + std::find_if_not(after_max_given_axis_it, dims.end(), is_dim_one); + return std::distance(dims.begin(), after_max_axis_it - 1); + }(); + + return std::make_pair(min_axis, max_axis); +} +} // namespace + ApplicableMatrixReduction get_applicable_matrix_reduction( const cudnnReduceTensorOp_t cudnn_reduce_op, const std::vector& dims, const std::vector& original_axes, @@ -22,53 +106,25 @@ ApplicableMatrixReduction get_applicable_matrix_reduction( } const auto rank = gsl::narrow(dims.size()); - - // min and max of single contiguous range of axes - const auto minmax_axes = [&]() -> optional> { - // empty axes means reduce all dimensions - if (original_axes.empty()) { - return std::make_pair(int64_t{0}, rank - 1); - } - - // normalize axis values and sort - const std::vector axes = [&original_axes, rank]() { - std::vector result(original_axes); - std::for_each( - result.begin(), result.end(), - [rank](int64_t& axis) { axis = HandleNegativeAxis(axis, rank); }); - std::sort(result.begin(), result.end()); - return result; - }(); - - for (auto a = axes.begin(), b = axes.begin() + 1; - b != axes.end(); - ++a, ++b) { - ORT_ENFORCE(*a != *b, "axes must not contain duplicate values"); - if (*a + 1 != *b) { // not contiguous - return {}; - } - } - - return std::make_pair(axes.front(), axes.back()); - }(); - - if (!minmax_axes.has_value()) { + const auto min_and_max_axes = GetMinAndMaxContiguousAxes(rank, dims, original_axes); + if (!min_and_max_axes.has_value()) { return ApplicableMatrixReduction::None; } - const auto& min_axis = minmax_axes.value().first; - const auto& max_axis = minmax_axes.value().second; + const auto& min_axis = min_and_max_axes.value().first; + const auto& max_axis = min_and_max_axes.value().second; // axes from beginning means row reduction, axes to end means column reduction - // currently we don't handle axes from beginning to end, but that could be either + // for axes from beginning to end, either works and we do row reduction const bool axes_from_beginning = min_axis == 0; const bool axes_to_end = max_axis == rank - 1; - // handle axes anchored to one of beginning or end, not both - if (axes_from_beginning == axes_to_end) { + // handle axes anchored to beginning or end + if (!axes_from_beginning && !axes_to_end) { return ApplicableMatrixReduction::None; } + // the axis index right after the last flattened into matrix rows const int64_t m_end_axis = axes_from_beginning ? max_axis + 1 : min_axis; const TensorShape& shape = TensorShape::ReinterpretBaseType(dims); diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc index d85e414b5d..8f5b1bb45a 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc @@ -663,7 +663,7 @@ Status ReduceKernel::ComputeImpl(OpKernelContext* ctx, cudnnRe Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims); const bool fast_reduction = fast_reduction_ && !ctx->GetUseDeterministicCompute(); - return ReduceComputeCore(*cuda_ep_, *X, prepare_reduce_metadata, *Y, cudnn_reduce_op, axes_, + return ReduceComputeCore(*cuda_ep_, *X, prepare_reduce_metadata, *Y, cudnn_reduce_op, axes, calculate_log_, calculate_sqt_, log_sum_exp_, fast_reduction); } diff --git a/onnxruntime/test/providers/cuda/reduction_functions_test.cc b/onnxruntime/test/providers/cuda/reduction_functions_test.cc index f504cf1161..e7aebfe1b3 100644 --- a/onnxruntime/test/providers/cuda/reduction_functions_test.cc +++ b/onnxruntime/test/providers/cuda/reduction_functions_test.cc @@ -7,11 +7,15 @@ #include "gtest/gtest.h" +#include "core/common/optional.h" #include "core/providers/cuda/reduction/reduction_functions.h" #include "test/common/tensor_op_test_utils.h" #include "test/util/include/asserts.h" +using onnxruntime::test::RandomValueGenerator; + namespace onnxruntime { +namespace cuda { namespace test { namespace { @@ -55,7 +59,7 @@ void TestReduceRowToScalarApis(int size, float relative_error_tolerance = 1e-4f) expected_output_mean += input_value / float(size); } const auto buffer_size_in_bytes = - cuda::compute_reduction_buffer_size(size); + compute_reduction_buffer_size(size); auto device_input = AllocateDeviceMemory(size); auto device_output_sum = AllocateDeviceMemory(); @@ -65,19 +69,19 @@ void TestReduceRowToScalarApis(int size, float relative_error_tolerance = 1e-4f) cudaMemcpy(device_input.get(), input.data(), size * sizeof(float), cudaMemcpyHostToDevice); - ASSERT_STATUS_OK(cuda::reduce_sum( + ASSERT_STATUS_OK(reduce_sum( device_input.get(), device_output_sum.get(), size, buffer.get(), buffer_size_in_bytes)); - ASSERT_STATUS_OK(cuda::reduce_square_sum( + ASSERT_STATUS_OK(reduce_square_sum( device_input.get(), device_output_square_sum.get(), size, buffer.get(), buffer_size_in_bytes)); - ASSERT_STATUS_OK(cuda::reduce_mean( + ASSERT_STATUS_OK(reduce_mean( device_input.get(), device_output_mean.get(), size, @@ -116,10 +120,10 @@ void TestReduceRowsToRow(int m, int n, bool reset_initial_output, float relative if (!reset_initial_output) { // manually initialize output data - cuda::Fill(d_out.get(), initial_value, n); + Fill(d_out.get(), initial_value, n); } - ASSERT_STATUS_OK(cuda::reduce_matrix_rows( + ASSERT_STATUS_OK(reduce_matrix_rows( d_in.get(), d_out.get(), m, n, reset_initial_output)); @@ -155,10 +159,10 @@ void TestReduceColumnsToColumn(int m, int n, float relative_error_tolerance = 1e cudaMemcpy(d_in.get(), values.data(), m * n * sizeof(float), cudaMemcpyHostToDevice); size_t buffer_size_in_bytes = - cuda::compute_reduce_matrix_columns_buffer_size(m, n); + compute_reduce_matrix_columns_buffer_size(m, n); auto d_buffer = AllocateDeviceMemory(buffer_size_in_bytes); - ASSERT_STATUS_OK(cuda::reduce_matrix_columns( + ASSERT_STATUS_OK(reduce_matrix_columns( d_in.get(), d_out.get(), m, n, d_buffer.get(), buffer_size_in_bytes)); @@ -202,7 +206,7 @@ TEST(ReductionFunctionsTest, BufferOffsets) { const size_t max_buffer_offset = 15; const size_t buffer_size_in_bytes = - cuda::compute_reduce_matrix_columns_buffer_size(m, n) + max_buffer_offset; + compute_reduce_matrix_columns_buffer_size(m, n) + max_buffer_offset; auto d_input = AllocateDeviceMemory(m * n); auto d_output = AllocateDeviceMemory(m); @@ -217,7 +221,7 @@ TEST(ReductionFunctionsTest, BufferOffsets) { const auto input = random.Uniform({m, n}, 1.0, 10.0); cudaMemcpy(d_input.get(), input.data(), m * n * sizeof(double), cudaMemcpyHostToDevice); - ASSERT_STATUS_OK(cuda::reduce_matrix_columns( + ASSERT_STATUS_OK(reduce_matrix_columns( d_input.get(), d_output.get(), m, n, d_buffer.get() + buffer_offset, @@ -234,7 +238,7 @@ TEST(ReductionFunctionsTest, InvalidBufferSize) { // this should be too small const size_t buffer_size_in_bytes = - cuda::compute_reduce_matrix_columns_buffer_size(m, n) / 10; + compute_reduce_matrix_columns_buffer_size(m, n) / 10; auto d_input = AllocateDeviceMemory(m * n); auto d_output = AllocateDeviceMemory(m); @@ -245,64 +249,108 @@ TEST(ReductionFunctionsTest, InvalidBufferSize) { cudaMemcpy(d_input.get(), input.data(), m * n * sizeof(float), cudaMemcpyHostToDevice); const auto status = - cuda::reduce_matrix_columns(d_input.get(), d_output.get(), m, n, d_buffer.get(), buffer_size_in_bytes); + reduce_matrix_columns(d_input.get(), d_output.get(), m, n, d_buffer.get(), buffer_size_in_bytes); ASSERT_FALSE(status.IsOK()); } TEST(ReductionFunctionsTest, GetApplicableMatrixReduction) { + auto test_get_applicable_matrix_reduction = + [](cudnnReduceTensorOp_t cudnn_op, + const std::vector& dims, const std::vector& axes, + ApplicableMatrixReduction expected_reduction, + const optional& expected_m = nullopt, + const optional& expected_n = nullopt) { + SCOPED_TRACE(MakeString( + "cudnn_op: ", cudnn_op, + ", dims: ", TensorShape::ReinterpretBaseType(dims), + ", axes: ", TensorShape::ReinterpretBaseType(axes))); + int m{}, n{}; + EXPECT_EQ( + static_cast(get_applicable_matrix_reduction(cudnn_op, dims, axes, m, n)), + static_cast(expected_reduction)); + if (expected_m) { + EXPECT_EQ(m, *expected_m); + } + if (expected_n) { + EXPECT_EQ(n, *expected_n); + } + }; + const cudnnReduceTensorOp_t valid_op_type = CUDNN_REDUCE_TENSOR_ADD; - int m{}, n{}; // contiguous axes from beginning - EXPECT_EQ( - cuda::get_applicable_matrix_reduction( - valid_op_type, {2, 4, 8, 16}, {0, 1}, m, n), - cuda::ApplicableMatrixReduction::Rows); - EXPECT_EQ(m, 2 * 4); - EXPECT_EQ(n, 8 * 16); + test_get_applicable_matrix_reduction( + valid_op_type, {2, 4, 8, 16}, {0, 1}, + ApplicableMatrixReduction::Rows, 2 * 4, 8 * 16); // contiguous axes to end - EXPECT_EQ( - cuda::get_applicable_matrix_reduction( - valid_op_type, {2, 4, 8, 16}, {1, 2, 3}, m, n), - cuda::ApplicableMatrixReduction::Columns); - EXPECT_EQ(m, 2); - EXPECT_EQ(n, 4 * 8 * 16); + test_get_applicable_matrix_reduction( + valid_op_type, {2, 4, 8, 16}, {1, 2, 3}, + ApplicableMatrixReduction::Columns, 2, 4 * 8 * 16); // single axis - EXPECT_EQ( - cuda::get_applicable_matrix_reduction( - valid_op_type, {2, 4, 8, 16}, {3}, m, n), - cuda::ApplicableMatrixReduction::Columns); - EXPECT_EQ(m, 2 * 4 * 8); - EXPECT_EQ(n, 16); + test_get_applicable_matrix_reduction( + valid_op_type, {2, 4, 8, 16}, {3}, + ApplicableMatrixReduction::Columns, 2 * 4 * 8, 16); + + // empty axes + test_get_applicable_matrix_reduction( + valid_op_type, {2, 4, 8, 16}, {}, + ApplicableMatrixReduction::Rows, 2 * 4 * 8 * 16, 1); + + // all axes + test_get_applicable_matrix_reduction( + valid_op_type, {2, 4, 8, 16}, {0, 1, 2, 3}, + ApplicableMatrixReduction::Rows, 2 * 4 * 8 * 16, 1); + + // handle ones + test_get_applicable_matrix_reduction( + valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {0}, + ApplicableMatrixReduction::Rows, 1, 2 * 4 * 8); + test_get_applicable_matrix_reduction( + valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {1}, + ApplicableMatrixReduction::Rows, 2, 4 * 8); + test_get_applicable_matrix_reduction( + valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {1, 3}, + ApplicableMatrixReduction::Rows, 2, 4 * 8); + test_get_applicable_matrix_reduction( + valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {1, 3, 4}, + ApplicableMatrixReduction::Rows, 2 * 4, 8); + test_get_applicable_matrix_reduction( + valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {1, 3, 4, 6}, + ApplicableMatrixReduction::Rows, 2 * 4 * 8, 1); + test_get_applicable_matrix_reduction( + valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {3, 4, 6}, + ApplicableMatrixReduction::Columns, 2, 4 * 8); + test_get_applicable_matrix_reduction( + valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {4, 6}, + ApplicableMatrixReduction::Columns, 2, 4 * 8); + test_get_applicable_matrix_reduction( + valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {6}, + ApplicableMatrixReduction::Columns, 2 * 4, 8); + test_get_applicable_matrix_reduction( + valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {7}, + ApplicableMatrixReduction::Columns, 2 * 4 * 8, 1); // unsupported axes - EXPECT_EQ( - cuda::get_applicable_matrix_reduction( - valid_op_type, {2, 4, 8, 16}, {0, 1, 2, 3}, m, n), - cuda::ApplicableMatrixReduction::None); - EXPECT_EQ( - cuda::get_applicable_matrix_reduction( - valid_op_type, {2, 4, 8, 16}, {}, m, n), - cuda::ApplicableMatrixReduction::None); - EXPECT_EQ( - cuda::get_applicable_matrix_reduction( - valid_op_type, {2, 4, 8, 16, 32, 64}, {0, 1, 3, 4}, m, n), - cuda::ApplicableMatrixReduction::None); - EXPECT_EQ( - cuda::get_applicable_matrix_reduction( - valid_op_type, {2, 4, 8, 16}, {1, 2}, m, n), - cuda::ApplicableMatrixReduction::None); + test_get_applicable_matrix_reduction( + valid_op_type, {2, 4, 8, 16, 32, 64}, {0, 1, 3, 4}, + ApplicableMatrixReduction::None); + test_get_applicable_matrix_reduction( + valid_op_type, {2, 4, 8, 16}, {1, 2}, + ApplicableMatrixReduction::None); + test_get_applicable_matrix_reduction( + valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {3, 6}, + ApplicableMatrixReduction::None); // invalid op type - EXPECT_EQ( - cuda::get_applicable_matrix_reduction( - CUDNN_REDUCE_TENSOR_MAX, {2, 4, 8, 16}, {0, 1}, m, n), - cuda::ApplicableMatrixReduction::None); + test_get_applicable_matrix_reduction( + CUDNN_REDUCE_TENSOR_MAX, {2, 4, 8, 16}, {0, 1}, + ApplicableMatrixReduction::None); } } // namespace test +} // namespace cuda } // namespace onnxruntime #endif