Reduce Kernel Optimization (#8067)

* reduce optimization

* bug fix

* add a check

* add ut

* refactor

* add ut cases for keepdims=true
This commit is contained in:
Vincent Wang 2021-06-16 19:53:46 +08:00 committed by GitHub
parent 0ebaa71f49
commit de8f2ecda9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 75 additions and 28 deletions

View file

@ -105,8 +105,42 @@ ApplicableMatrixReduction get_applicable_matrix_reduction(
return ApplicableMatrixReduction::None;
}
const auto rank = gsl::narrow<int64_t>(dims.size());
const auto min_and_max_axes = GetMinAndMaxContiguousAxes(rank, dims, original_axes);
// Remove all dims with value 1. This can help to optimize case like:
// dims=[2,3,1,4,1,5] and axes=[0,2,4], which is same as dims=[2,3,4,5] and axes=[0].
std::vector<int64_t> new_dims;
std::vector<int64_t> new_axes;
const auto original_rank = gsl::narrow<int64_t>(dims.size());
std::set<int64_t> original_axes_set;
for (const auto axis : original_axes) {
original_axes_set.insert(HandleNegativeAxis(axis, original_rank));
}
int64_t new_axis = 0;
for (size_t i = 0; i < dims.size(); i++) {
if (dims[i] != 1) {
new_dims.emplace_back(dims[i]);
if (original_axes_set.find(gsl::narrow<int64_t>(i)) != original_axes_set.end()) {
new_axes.emplace_back(new_axis);
}
new_axis++;
}
}
// Empty axes means reduce all dimensions, which has different meaning,
// so add a new dim to the end if all original axes are on dims with value 1.
if (!original_axes.empty() && new_axes.empty()) {
new_dims.emplace_back(1);
new_axes.emplace_back(new_axis);
}
// If all dims are value 1, make sure it's not empty by adding a new dim.
if (!dims.empty() && new_dims.empty()) {
new_dims.emplace_back(1);
}
const auto rank = gsl::narrow<int64_t>(new_dims.size());
const auto min_and_max_axes = GetMinAndMaxContiguousAxes(rank, new_dims, new_axes);
if (!min_and_max_axes.has_value()) {
return ApplicableMatrixReduction::None;
}
@ -127,7 +161,7 @@ ApplicableMatrixReduction get_applicable_matrix_reduction(
// the axis index right after the last flattened into matrix rows
const int64_t m_end_axis = axes_from_beginning ? max_axis + 1 : min_axis;
const TensorShape& shape = TensorShape::ReinterpretBaseType(dims);
const TensorShape& shape = TensorShape::ReinterpretBaseType(new_dims);
const auto m = shape.SizeToDimension(m_end_axis);
const auto n = shape.SizeFromDimension(m_end_axis);

View file

@ -19,7 +19,7 @@ static void TestReduceSum(const std::vector<int64_t>& X_dims,
double per_sample_tolerance = 2e-4,
double relative_per_sample_tolerance = 2e-4) {
CompareOpTester test("ReduceSum");
test.AddAttribute("axes", axes);
if (!axes.empty()) test.AddAttribute("axes", axes);
test.AddAttribute("keepdims", int64_t(keepdims));
// create rand inputs
@ -38,66 +38,79 @@ static void TestReduceSum(const std::vector<int64_t>& X_dims,
TEST(CudaKernelTest, ReduceSum_Scalar) {
std::vector<int64_t> X_dims{1};
std::vector<int64_t> Y_dims{};
std::vector<int64_t> axes{0};
bool keepdims = false;
TestReduceSum(X_dims, Y_dims, axes, keepdims);
TestReduceSum(X_dims, {}, axes, false);
TestReduceSum(X_dims, {1}, axes, true);
}
TEST(CudaKernelTest, ReduceSum_2DtoLastDim) {
std::vector<int64_t> X_dims{16, 2};
std::vector<int64_t> Y_dims{2};
std::vector<int64_t> axes{0};
bool keepdims = false;
TestReduceSum(X_dims, Y_dims, axes, keepdims);
TestReduceSum(X_dims, {2}, axes, false);
TestReduceSum(X_dims, {1, 2}, axes, true);
}
TEST(CudaKernelTest, ReduceSum_SmallTensor) {
std::vector<int64_t> X_dims{2, 128, 128};
std::vector<int64_t> Y_dims{128};
std::vector<int64_t> axes{0, 1};
bool keepdims = false;
TestReduceSum(X_dims, Y_dims, axes, keepdims);
TestReduceSum(X_dims, {128}, axes, false);
TestReduceSum(X_dims, {1, 1, 128}, axes, true);
}
TEST(CudaKernelTest, ReduceSum_MidTensor) {
std::vector<int64_t> X_dims{2, 512, 3072};
std::vector<int64_t> Y_dims{3072};
std::vector<int64_t> axes{0, 1};
bool keepdims = false;
TestReduceSum(X_dims, Y_dims, axes, keepdims);
TestReduceSum(X_dims, {3072}, axes, false);
TestReduceSum(X_dims, {1, 1, 3072}, axes, true);
}
TEST(CudaKernelTest, ReduceSum_LargeTensor) {
std::vector<int64_t> X_dims{4, 512, 30528};
std::vector<int64_t> Y_dims{30528};
std::vector<int64_t> axes{0, 1};
bool keepdims = false;
TestReduceSum(X_dims, Y_dims, axes, keepdims);
TestReduceSum(X_dims, {30528}, axes, false);
TestReduceSum(X_dims, {1, 1, 30528}, axes, true);
}
TEST(CudaKernelTest, ReduceSum_SmallTensorTrailingAxes) {
std::vector<int64_t> X_dims{128, 2, 128};
std::vector<int64_t> Y_dims{128};
std::vector<int64_t> axes{1, 2};
bool keepdims = false;
TestReduceSum(X_dims, Y_dims, axes, keepdims);
TestReduceSum(X_dims, {128}, axes, false);
TestReduceSum(X_dims, {128, 1, 1}, axes, true);
}
TEST(CudaKernelTest, ReduceSum_MidTensorTrailingAxes) {
std::vector<int64_t> X_dims{3072, 2, 512};
std::vector<int64_t> Y_dims{3072};
std::vector<int64_t> axes{1, 2};
bool keepdims = false;
TestReduceSum(X_dims, Y_dims, axes, keepdims);
TestReduceSum(X_dims, {3072}, axes, false);
TestReduceSum(X_dims, {3072, 1, 1}, axes, true);
}
TEST(CudaKernelTest, ReduceSum_LargeTensorTrailingAxes) {
std::vector<int64_t> X_dims{30528, 4, 512};
std::vector<int64_t> Y_dims{30528};
std::vector<int64_t> axes{1, 2};
bool keepdims = false;
TestReduceSum(X_dims, Y_dims, axes, keepdims);
TestReduceSum(X_dims, {30528}, axes, false);
TestReduceSum(X_dims, {30528, 1, 1}, axes, true);
}
TEST(CudaKernelTest, ReduceSum_OneDimsOptimization) {
std::vector<int64_t> X_dims{2, 3, 1, 4, 1, 5};
std::vector<int64_t> axes{0, 2, 4};
TestReduceSum(X_dims, {3, 4, 5}, axes, false);
TestReduceSum(X_dims, {1, 3, 1, 4, 1, 5}, axes, true);
}
TEST(CudaKernelTest, ReduceSum_ReduceOnOneDims) {
std::vector<int64_t> X_dims{2, 1, 1};
std::vector<int64_t> axes{1, 2};
TestReduceSum(X_dims, {2}, axes, false);
TestReduceSum(X_dims, {2, 1, 1}, axes, true);
}
TEST(CudaKernelTest, ReduceSum_AllOneDims) {
std::vector<int64_t> X_dims{1, 1};
std::vector<int64_t> axes{};
TestReduceSum(X_dims, {}, axes, false);
TestReduceSum(X_dims, {1, 1}, axes, true);
}
} // namespace test