From 2b1a59f01abd38d5fd40d75b3f41547791980fbc Mon Sep 17 00:00:00 2001 From: liqun Fu Date: Wed, 25 Jan 2023 23:14:56 -0800 Subject: [PATCH] cpu support of LpPool(18) (#14205) Signed-off-by: Liqun Fu ### Description To support LpPool (18) ### Motivation and Context for Ort 1.14 release Signed-off-by: Liqun Fu --- docs/OperatorKernels.md | 3 +- .../providers/cpu/cpu_execution_provider.cc | 6 +- onnxruntime/core/providers/cpu/nn/pool.cc | 87 ++++++++- onnxruntime/core/providers/cpu/nn/pool.h | 16 ++ .../core/providers/cpu/nn/pool_functors.h | 166 ++++++++++++++++++ .../cpu/nn/lp_pool_test_generator.py | 62 +++++++ .../test/providers/cpu/nn/pool_op_test.cc | 82 +++++++++ 7 files changed, 417 insertions(+), 5 deletions(-) create mode 100644 onnxruntime/test/providers/cpu/nn/lp_pool_test_generator.py diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index ad571dacb2..41c0da6fa0 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -175,7 +175,8 @@ Do not modify directly.* |||[11, 12]|**B** = tensor(bool)
**I** = tensor(int64)
**V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[1, 10]|**B** = tensor(bool)
**I** = tensor(int64)
**V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |LpNormalization|*in* input:**T**
*out* output:**T**|1+|**T** = tensor(double), tensor(float)| -|LpPool|*in* X:**T**
*out* Y:**T**|11+|**T** = tensor(float)| +|LpPool|*in* X:**T**
*out* Y:**T**|18+|**T** = tensor(float)| +|||[11, 17]|**T** = tensor(float)| |||[2, 10]|**T** = tensor(float)| |MatMul|*in* A:**T**
*in* B:**T**
*out* Y:**T**|13+|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| |||[9, 12]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc index 75060fbf9f..caba009075 100644 --- a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc +++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc @@ -439,7 +439,7 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDoma class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, NonMaxSuppression); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, AveragePool); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, MaxUnpool); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, LpPool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, 17, LpPool); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, Conv); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, ConvTranspose); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, 12, If); @@ -830,6 +830,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, float, ReduceSumSquare); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, double, ReduceSumSquare); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, int32_t, ReduceSumSquare); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, LpPool); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, Col2Im); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, int8_t, BitwiseAnd); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, int16_t, BitwiseAnd); @@ -1471,7 +1472,7 @@ Status RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) { NonMaxSuppression)>, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -2164,6 +2165,7 @@ Status RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) { ReduceSumSquare)>, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/cpu/nn/pool.cc b/onnxruntime/core/providers/cpu/nn/pool.cc index 53c24e7e3d..0f3681d64c 100644 --- a/onnxruntime/core/providers/cpu/nn/pool.cc +++ b/onnxruntime/core/providers/cpu/nn/pool.cc @@ -249,6 +249,81 @@ Status MaxPoolV8::ComputeImpl(OpKernelContext* context) const { return Status::OK(); } +template +Status LpPoolV18::Compute(OpKernelContext* context) const { + concurrency::ThreadPool* tp = context->GetOperatorThreadPool(); + bool need_dilation = false; + for (auto n : pool_attrs_.dilations) { + need_dilation |= n > 1; + } + + const auto* X = context->Input(0); + const TensorShape& x_shape = X->Shape(); + + ORT_RETURN_IF_NOT(x_shape.NumDimensions() >= 3, "Input dimension cannot be less than 3."); + + auto pads = pool_attrs_.pads; + auto kernel_shape = pool_attrs_.kernel_shape; + + auto output_dims = pool_attrs_.SetOutputSize(x_shape, x_shape[1], &pads); + Tensor* Y = context->Output(0, output_dims); + + const auto* X_data = X->Data(); + auto* Y_data = Y->MutableData(); + + // The main loop + int64_t channels = x_shape[1]; + int64_t height = x_shape[2]; + int64_t width = kernel_shape.size() > 1 ? x_shape[3] : 1; + int64_t depth = kernel_shape.size() > 2 ? x_shape[4] : 1; + int64_t pooled_height = output_dims[2]; + int64_t pooled_width = kernel_shape.size() > 1 ? output_dims[3] : 1; + int64_t pooled_depth = kernel_shape.size() > 2 ? output_dims[4] : 1; + const int64_t total_channels = x_shape[0] * channels; + + switch (kernel_shape.size()) { + case 1: { + int64_t x_step = height; + int64_t y_step = pooled_height; + const int64_t dilation_h = pool_attrs_.dilations[0]; + + RunLoop>(tp, onnxruntime::narrow(total_channels), + {X_data, Y_data, x_step, y_step, dilation_h, pooled_height, stride_h(), + height, kernel_shape, pads, p_}); + break; + } + + case 2: { + int64_t x_step = height * width; + int64_t y_step = pooled_height * pooled_width; + const int64_t dilation_h = pool_attrs_.dilations[0]; + const int64_t dilation_w = pool_attrs_.dilations[1]; + RunLoop>( + tp, onnxruntime::narrow(total_channels), + {X_data, Y_data, x_step, y_step, dilation_h, dilation_w, pooled_height, pooled_width, stride_h(), + stride_w(), height, width, kernel_shape, pads, p_}); + break; + } + case 3: { + int64_t x_step = height * width * depth; + int64_t y_step = pooled_height * pooled_width * pooled_depth; + const int64_t dilation_h = pool_attrs_.dilations[0]; + const int64_t dilation_w = pool_attrs_.dilations[1]; + const int64_t dilation_d = pool_attrs_.dilations[2]; + RunLoop>(tp, onnxruntime::narrow(total_channels), + {X_data, Y_data, x_step, y_step, + dilation_h, dilation_w, dilation_d, pooled_height, pooled_width, + pooled_depth, stride_h(), stride_w(), stride_d(), height, + width, depth, kernel_shape, pads, p_}); + break; + } + default: + return Status(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported kernel dimension : " + std::to_string(kernel_shape.size())); + } + + return Status::OK(); +} + ONNX_CPU_OPERATOR_VERSIONED_KERNEL(AveragePool, 7, 9, KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), Pool); @@ -284,8 +359,16 @@ ONNX_CPU_OPERATOR_VERSIONED_KERNEL(LpPool, 2, 10, KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), Pool); -ONNX_CPU_OPERATOR_KERNEL(LpPool, 11, KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Pool); +ONNX_CPU_OPERATOR_VERSIONED_KERNEL(LpPool, 11, 17, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + Pool); + +ONNX_CPU_OPERATOR_KERNEL(LpPool, 18, + KernelDefBuilder() + .TypeConstraint( + "T", + DataTypeImpl::GetTensorType()), + LpPoolV18); ONNX_CPU_OPERATOR_KERNEL(GlobalLpPool, 2, KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), Pool); diff --git a/onnxruntime/core/providers/cpu/nn/pool.h b/onnxruntime/core/providers/cpu/nn/pool.h index 5458c5ba6c..7e4899ea1d 100644 --- a/onnxruntime/core/providers/cpu/nn/pool.h +++ b/onnxruntime/core/providers/cpu/nn/pool.h @@ -46,4 +46,20 @@ class MaxPoolV8 : public OpKernel, public PoolBase { template Status ComputeImpl(OpKernelContext* context) const; }; + +// For lppool v18 and beyond +// version 18: Added ceil_mode and dilations +template +class LpPoolV18 : public OpKernel, public PoolBase { + public: + LpPoolV18(const OpKernelInfo& info) : OpKernel(info), PoolBase(info) { + ORT_ENFORCE(info.GetAttr("p", &p_).IsOK()); + } + + Status Compute(OpKernelContext* context) const override; + +private: + int64_t p_; +}; + } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/nn/pool_functors.h b/onnxruntime/core/providers/cpu/nn/pool_functors.h index 19ffb8f2d5..9948e1d809 100644 --- a/onnxruntime/core/providers/cpu/nn/pool_functors.h +++ b/onnxruntime/core/providers/cpu/nn/pool_functors.h @@ -377,4 +377,170 @@ struct MaxPool3DTask { } }; +template +struct LpPool1DTask final { + const T* X_data; + T* Y_data; + int64_t x_step; + int64_t y_step; + int64_t dilation_h; + int64_t pooled_height; + int64_t stride_h; + int64_t height; + gsl::span kernel_shape; + gsl::span pads; + int64_t p; + TensorOpCost Cost() { + double loop_count = static_cast(pooled_height * kernel_shape[0]); + return TensorOpCost{loop_count, loop_count, loop_count}; + } + + void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const { + for (std::ptrdiff_t c = begin; c < end; ++c) { + operator()(c); + } + } + void operator()(std::ptrdiff_t c) const { + const T* x_d = X_data + c * x_step; + T* y_d = Y_data + c * y_step; + for (int64_t ph = 0; ph < pooled_height; ++ph) { + int64_t hstart = ph * stride_h - pads[0]; + int64_t hend = hstart + kernel_shape[0] * dilation_h; + y_d[ph] = 0; + for (int64_t h = hstart; h < hend; h += dilation_h) { + if (math::is_a_ge_zero_and_a_lt_b(h, height)) { + y_d[ph] += static_cast(std::pow(std::abs(x_d[h]), p)); + } + } + y_d[ph] = static_cast(std::pow(y_d[ph], 1.0f / p)); + } + } +}; + +template +struct LpPool2DTask final { + const T* X_data; + T* Y_data; + int64_t x_step; + int64_t y_step; + int64_t dilation_h; + int64_t dilation_w; + int64_t pooled_height; + int64_t pooled_width; + int64_t stride_h; + int64_t stride_w; + int64_t height; + int64_t width; + gsl::span kernel_shape; + gsl::span pads; + int64_t p; + + TensorOpCost Cost() { + double loop_count = static_cast(pooled_height * pooled_width * kernel_shape[0] * kernel_shape[1]); + return TensorOpCost{loop_count, loop_count, loop_count}; + } + + void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const { + for (std::ptrdiff_t c = begin; c < end; ++c) { + operator()(c); + } + } + + void operator()(std::ptrdiff_t c) const { + const T* x_d = X_data + c * x_step; + T* y_d = Y_data + c * y_step; + for (int64_t ph = 0; ph < pooled_height; ++ph) { + int64_t hstart = ph * stride_h - pads[0]; + int64_t hend = hstart + kernel_shape[0] * dilation_h; + for (int64_t pw = 0; pw < pooled_width; ++pw) { + int64_t wstart = pw * stride_w - pads[1]; + int64_t wend = wstart + kernel_shape[1] * dilation_w; + const int64_t pool_index = ph * pooled_width + pw; + y_d[pool_index] = 0; + for (int64_t h = hstart; h < hend; h += dilation_h) { + if (math::is_a_ge_zero_and_a_lt_b(h, height)) { + for (int64_t w = wstart; w < wend; w += dilation_w) { + if (math::is_a_ge_zero_and_a_lt_b(w, width)) { + const int64_t input_index = h * width + w; + y_d[pool_index] += static_cast(std::pow(std::abs(x_d[input_index]), p)); + } + } + } + } + y_d[pool_index] = static_cast(std::pow(y_d[pool_index], 1.0f / p)); + } + } + } +}; + +template +struct LpPool3DTask { + const T* X_data; + T* Y_data; + int64_t x_step; + int64_t y_step; + int64_t dilation_h; + int64_t dilation_w; + int64_t dilation_d; + int64_t pooled_height; + int64_t pooled_width; + int64_t pooled_depth; + int64_t stride_h; + int64_t stride_w; + int64_t stride_d; + int64_t height; + int64_t width; + int64_t depth; + gsl::span kernel_shape; + gsl::span pads; + int64_t p; + + void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const { + for (std::ptrdiff_t c = begin; c < end; ++c) { + operator()(c); + } + } + + TensorOpCost Cost() { + double loop_count = static_cast(pooled_height * pooled_width * pooled_depth * kernel_shape[0] * + kernel_shape[1] * kernel_shape[2]); + return TensorOpCost{loop_count, loop_count, loop_count}; + } + + void operator()(std::ptrdiff_t c) const { + const T* x_d = X_data + c * x_step; + T* y_d = Y_data + c * y_step; + + for (int64_t ph = 0; ph < pooled_height; ++ph) { + int64_t hstart = ph * stride_h - pads[0]; + int64_t hend = hstart + kernel_shape[0] * dilation_h; + for (int64_t pw = 0; pw < pooled_width; ++pw) { + int64_t wstart = pw * stride_w - pads[1]; + int64_t wend = wstart + kernel_shape[1] * dilation_w; + for (int64_t pd = 0; pd < pooled_depth; ++pd) { + int64_t dstart = pd * stride_d - pads[2]; + int64_t dend = dstart + kernel_shape[2] * dilation_d; + const int64_t pool_index = ph * pooled_width * pooled_depth + pw * pooled_depth + pd; + y_d[pool_index] = 0; + for (int64_t h = hstart; h < hend; h += dilation_h) { + if (math::is_a_ge_zero_and_a_lt_b(h, height)) { + for (int64_t w = wstart; w < wend; w += dilation_w) { + if (math::is_a_ge_zero_and_a_lt_b(w, width)) { + for (int64_t d = dstart; d < dend; d += dilation_d) { + if (math::is_a_ge_zero_and_a_lt_b(d, depth)) { + const int64_t input_index = h * width * depth + w * depth + d; + y_d[pool_index] += static_cast(std::pow(std::abs(x_d[input_index]), p)); + } + } + } + } + } + } + y_d[pool_index] = static_cast(std::pow(y_d[pool_index], 1.0f / p)); + } + } + } + } +}; + } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/nn/lp_pool_test_generator.py b/onnxruntime/test/providers/cpu/nn/lp_pool_test_generator.py new file mode 100644 index 0000000000..e068784557 --- /dev/null +++ b/onnxruntime/test/providers/cpu/nn/lp_pool_test_generator.py @@ -0,0 +1,62 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +import numpy as np +import torch +from torch import nn + + +# use this code to generate test data for PoolTest.LpPool1d and PoolTest.LpPool2d +def generate_lppool_1d_test_cases() -> None: + p = 2 + x = np.array( + [ + [ + [1, 2, 3, 4], + ] + ] + ).astype(np.float32) + + print(x) + kernel_sizes = [2, 3] + strides = [[1], [2]] + for kernel_size in kernel_sizes: + for stride in strides: + print(kernel_size) + print(stride) + model = nn.LPPool1d(p, kernel_size=kernel_size, stride=stride) + pt_y = model(torch.from_numpy(x)) + print(torch.flatten(pt_y)) + print(pt_y.shape) + + +def generate_lppool_2d_test_cases() -> None: + p = 2 + x = np.array( + [ + [ + [ + [1, 2, 3, 4], + [5, 6, 7, 8], + [9, 10, 11, 12], + [13, 14, 15, 16], + ] + ] + ] + ).astype(np.float32) + + print(x) + kernel_sizes = [[2, 2], [3, 3]] + strides = [[1, 1], [2, 2]] + for kernel_size in kernel_sizes: + for stride in strides: + model = nn.LPPool2d(p, kernel_size=kernel_size, stride=stride) + pt_y = model(torch.from_numpy(x)) + print(kernel_size) + print(stride) + print(torch.flatten(pt_y)) + print(pt_y.shape) + + +generate_lppool_1d_test_cases() +generate_lppool_2d_test_cases() diff --git a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc index c68d9839d2..44f81df407 100644 --- a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc @@ -1331,6 +1331,88 @@ TEST(PoolTest, LpPool) { test.Run(); } +// test data generated with lp_pool_test_generator.py +TEST(PoolTest, LpPool1d) { + std::vector kernel_sizes[2] = {{2}, {3}}; + std::vector strides[2] = {{1}, {2}}; + std::vector ys[4] = { + {2.2361f, 3.6056f, 5.0000f}, + {2.2361f, 5.0000f}, + {3.7417f, 5.3852f}, + {3.7417f}}; + std::vector y_sizes[4] = { + {1, 1, 3}, + {1, 1, 2}, + {1, 1, 2}, + {1, 1, 1}, + }; + int y_count = 0; + for (int kernel_size_count = 0; kernel_size_count < 2; kernel_size_count++) + for (int stride_count = 0; stride_count < 2; stride_count++) { + OpTester test("LpPool", 18); + test.AddAttribute("auto_pad", ""); + test.AddAttribute("p", static_cast(2)); + test.AddInput("X", {1, 1, 4}, {1, 2, 3, 4}); + test.AddAttribute("strides", strides[stride_count]); + test.AddAttribute("kernel_shape", kernel_sizes[kernel_size_count]); + + test.AddOutput("Y", y_sizes[y_count], ys[y_count]); + + // https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/classnvinfer1_1_1_i_network_definition.html#a94f434942252e6d98ac17705c06ce060 + // TensorRT does not support 1d pooling + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); + y_count++; + } +} + +// test data generated with lp_pool_test_generator.py +TEST(PoolTest, LpPool2d) { + std::vector kernel_sizes[2] = {{2, 2}, {3, 3}}; + std::vector strides[2] = {{1, 1}, {2, 2}}; + std::vector ys[4] = { + {8.1240f, 9.8995f, 11.7473f, 15.5563f, 17.4929f, 19.4422f, 23.3666f, 25.3377f, 27.3130f}, + {8.1240f, 11.7473f, 23.3666f, 27.3130f}, + {20.6398f, 23.3024f, 31.6544f, 34.5109f}, + {20.6398f}}; + std::vector y_sizes[4] = { + {1, 1, 3, 3}, + {1, 1, 2, 2}, + {1, 1, 2, 2}, + {1, 1, 1, 1}, + }; + int y_count = 0; + for (int kernel_size_count = 0; kernel_size_count < 2; kernel_size_count++) + for (int stride_count = 0; stride_count < 2; stride_count++) { + OpTester test("LpPool", 18); + test.AddAttribute("auto_pad", ""); + test.AddAttribute("p", static_cast(2)); + test.AddInput("X", {1, 1, 4, 4}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + test.AddAttribute("strides", strides[stride_count]); + test.AddAttribute("kernel_shape", kernel_sizes[kernel_size_count]); + + test.AddOutput("Y", y_sizes[y_count], ys[y_count]); + test.Run(); + y_count++; + } +} + +TEST(PoolTest, LpPoolCeilMode) { + OpTester test("LpPool", 18); + + test.AddAttribute("auto_pad", ""); + test.AddAttribute("strides", std::vector{2}); + test.AddAttribute("kernel_shape", vector{3}); + test.AddAttribute("ceil_mode", static_cast(1)); + test.AddAttribute("p", static_cast(1)); + test.AddInput("X", {1, 1, 4}, {1, 2, 3, 4}); + test.AddOutput("Y", {1, 1, 2}, {6, 7}); + + // https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/classnvinfer1_1_1_i_network_definition.html#a94f434942252e6d98ac17705c06ce060 + // TensorRT does not support 1d pooling + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + TEST(PoolTest, GlobalLpPool) { OpTester test("GlobalLpPool"); test.AddAttribute("p", static_cast(3));