cpu support of LpPool(18) (#14205)

Signed-off-by: Liqun Fu <liqfu@microsoft.com>

### Description
Add CPU execution provider support for the LpPool operator at ONNX opset 18.



### Motivation and Context
Needed for the ORT 1.14 release.

Signed-off-by: Liqun Fu <liqfu@microsoft.com>
This commit is contained in:
liqun Fu 2023-01-25 23:14:56 -08:00 committed by GitHub
parent edb377f2cb
commit 2b1a59f01a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 417 additions and 5 deletions

View file

@ -175,7 +175,8 @@ Do not modify directly.*
|||[11, 12]|**B** = tensor(bool)<br/> **I** = tensor(int64)<br/> **V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[1, 10]|**B** = tensor(bool)<br/> **I** = tensor(int64)<br/> **V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|LpNormalization|*in* input:**T**<br> *out* output:**T**|1+|**T** = tensor(double), tensor(float)|
|LpPool|*in* X:**T**<br> *out* Y:**T**|11+|**T** = tensor(float)|
|LpPool|*in* X:**T**<br> *out* Y:**T**|18+|**T** = tensor(float)|
|||[11, 17]|**T** = tensor(float)|
|||[2, 10]|**T** = tensor(float)|
|MatMul|*in* A:**T**<br> *in* B:**T**<br> *out* Y:**T**|13+|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)|
|||[9, 12]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)|

View file

@ -439,7 +439,7 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDoma
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, NonMaxSuppression);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, AveragePool);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, MaxUnpool);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, LpPool);
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, 17, LpPool);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, Conv);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, ConvTranspose);
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, 12, If);
@ -830,6 +830,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain,
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, float, ReduceSumSquare);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, double, ReduceSumSquare);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, int32_t, ReduceSumSquare);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, LpPool);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, Col2Im);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, int8_t, BitwiseAnd);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, int16_t, BitwiseAnd);
@ -1471,7 +1472,7 @@ Status RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) {
NonMaxSuppression)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, AveragePool)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, MaxUnpool)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, LpPool)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, 17, LpPool)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, Conv)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, ConvTranspose)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, 12, If)>,
@ -2164,6 +2165,7 @@ Status RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) {
ReduceSumSquare)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, double,
ReduceSumSquare)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, LpPool)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, Col2Im)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, int8_t, BitwiseAnd)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, int16_t, BitwiseAnd)>,

View file

@ -249,6 +249,81 @@ Status MaxPoolV8::ComputeImpl(OpKernelContext* context) const {
return Status::OK();
}
// Computes LpPool (opset 18+) for 1-D, 2-D or 3-D kernels.
//
// The input is read as (N, C, spatial...): dimension 1 is the channel count and every
// (batch, channel) plane is pooled independently. The N * C planes are handed to RunLoop
// together with the operator thread pool, using the LpPool{1,2,3}DTask functor that
// matches the kernel rank.
//
// Returns:
//   - a failure status when the input has fewer than 3 dimensions,
//   - INVALID_ARGUMENT when the kernel rank is not 1, 2 or 3,
//   - Status::OK() otherwise.
template <typename T>
Status LpPoolV18<T>::Compute(OpKernelContext* context) const {
  concurrency::ThreadPool* tp = context->GetOperatorThreadPool();

  const auto* X = context->Input<Tensor>(0);
  const TensorShape& x_shape = X->Shape();
  ORT_RETURN_IF_NOT(x_shape.NumDimensions() >= 3, "Input dimension cannot be less than 3.");

  // Work on a local copy of the pads: SetOutputSize receives it by pointer
  // (presumably so it can adjust the values), and the kernel attributes must stay intact.
  auto pads = pool_attrs_.pads;
  auto kernel_shape = pool_attrs_.kernel_shape;
  auto output_dims = pool_attrs_.SetOutputSize(x_shape, x_shape[1], &pads);
  Tensor* Y = context->Output(0, output_dims);

  const auto* X_data = X->Data<T>();
  auto* Y_data = Y->MutableData<T>();

  // Spatial extents; dimensions the kernel does not cover collapse to 1.
  int64_t channels = x_shape[1];
  int64_t height = x_shape[2];
  int64_t width = kernel_shape.size() > 1 ? x_shape[3] : 1;
  int64_t depth = kernel_shape.size() > 2 ? x_shape[4] : 1;
  int64_t pooled_height = output_dims[2];
  int64_t pooled_width = kernel_shape.size() > 1 ? output_dims[3] : 1;
  int64_t pooled_depth = kernel_shape.size() > 2 ? output_dims[4] : 1;

  // One unit of work per (batch, channel) plane.
  const int64_t total_channels = x_shape[0] * channels;

  switch (kernel_shape.size()) {
    case 1: {
      int64_t x_step = height;
      int64_t y_step = pooled_height;
      const int64_t dilation_h = pool_attrs_.dilations[0];

      RunLoop<LpPool1DTask<T>>(tp, onnxruntime::narrow<size_t>(total_channels),
                               {X_data, Y_data, x_step, y_step, dilation_h, pooled_height, stride_h(),
                                height, kernel_shape, pads, p_});
      break;
    }

    case 2: {
      int64_t x_step = height * width;
      int64_t y_step = pooled_height * pooled_width;
      const int64_t dilation_h = pool_attrs_.dilations[0];
      const int64_t dilation_w = pool_attrs_.dilations[1];

      RunLoop<LpPool2DTask<T>>(
          tp, onnxruntime::narrow<size_t>(total_channels),
          {X_data, Y_data, x_step, y_step, dilation_h, dilation_w, pooled_height, pooled_width, stride_h(),
           stride_w(), height, width, kernel_shape, pads, p_});
      break;
    }

    case 3: {
      int64_t x_step = height * width * depth;
      int64_t y_step = pooled_height * pooled_width * pooled_depth;
      const int64_t dilation_h = pool_attrs_.dilations[0];
      const int64_t dilation_w = pool_attrs_.dilations[1];
      const int64_t dilation_d = pool_attrs_.dilations[2];

      RunLoop<LpPool3DTask<T>>(tp, onnxruntime::narrow<size_t>(total_channels),
                               {X_data, Y_data, x_step, y_step,
                                dilation_h, dilation_w, dilation_d, pooled_height, pooled_width,
                                pooled_depth, stride_h(), stride_w(), stride_d(), height,
                                width, depth, kernel_shape, pads, p_});
      break;
    }

    default:
      return Status(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported kernel dimension : " + std::to_string(kernel_shape.size()));
  }

  return Status::OK();
}
ONNX_CPU_OPERATOR_VERSIONED_KERNEL(AveragePool, 7, 9,
KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
Pool<float, AveragePool>);
@ -284,8 +359,16 @@ ONNX_CPU_OPERATOR_VERSIONED_KERNEL(LpPool, 2, 10,
KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
Pool<float, LpPool>);
ONNX_CPU_OPERATOR_KERNEL(LpPool, 11, KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
Pool<float, LpPool>);
// LpPool, opsets 11 through 17 (float only): served by the generic Pool<float, LpPool> kernel.
ONNX_CPU_OPERATOR_VERSIONED_KERNEL(LpPool, 11, 17,
KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
Pool<float, LpPool>);
// LpPool, opset 18 and later (float only): served by the dedicated LpPoolV18<float> kernel,
// whose Compute passes the dilations attribute to the pooling tasks.
ONNX_CPU_OPERATOR_KERNEL(LpPool, 18,
KernelDefBuilder()
.TypeConstraint(
"T",
DataTypeImpl::GetTensorType<float>()),
LpPoolV18<float>);
ONNX_CPU_OPERATOR_KERNEL(GlobalLpPool, 2, KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
Pool<float, LpPool>);

View file

@ -46,4 +46,20 @@ class MaxPoolV8 : public OpKernel, public PoolBase {
template <typename T>
Status ComputeImpl(OpKernelContext* context) const;
};
// CPU kernel for LpPool at opset 18 and beyond.
// version 18: Added ceil_mode and dilations
template <typename T>
class LpPoolV18 : public OpKernel, public PoolBase {
 public:
  // Builds the kernel from the node description: the pooling attributes are read by
  // PoolBase, the norm order "p" is read here. ORT_ENFORCE fires when "p" cannot be read.
  // `explicit` keeps an OpKernelInfo from converting silently into a kernel object.
  explicit LpPoolV18(const OpKernelInfo& info) : OpKernel(info), PoolBase(info) {
    ORT_ENFORCE(info.GetAttr<int64_t>("p", &p_).IsOK(), "LpPool: failed to read the 'p' attribute.");
  }

  // Pools input 0 into output 0; see the definition for the supported kernel ranks.
  Status Compute(OpKernelContext* context) const override;

 private:
  // Order of the Lp norm. Initialized so the member never holds an indeterminate value;
  // 2 is the default the ONNX specification gives for this attribute.
  int64_t p_{2};
};
} // namespace onnxruntime

View file

@ -377,4 +377,170 @@ struct MaxPool3DTask {
}
};
// Work item for 1-D Lp pooling. One instance describes the whole tensor; calling it with a
// plane index c pools the c-th input plane (x_step elements) into the c-th output plane
// (y_step elements). Members are public and ordered for aggregate initialization.
template <typename T>
struct LpPool1DTask final {
  const T* X_data;
  T* Y_data;
  int64_t x_step;
  int64_t y_step;
  int64_t dilation_h;
  int64_t pooled_height;
  int64_t stride_h;
  int64_t height;
  gsl::span<const int64_t> kernel_shape;
  gsl::span<const int64_t> pads;
  int64_t p;

  // Per-plane cost estimate: output length times kernel length, reported for all three
  // TensorOpCost fields.
  TensorOpCost Cost() {
    const double work = static_cast<double>(pooled_height * kernel_shape[0]);
    return TensorOpCost{work, work, work};
  }

  // Pools every plane in the half-open range [begin, end).
  void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const {
    std::ptrdiff_t plane = begin;
    while (plane < end) {
      operator()(plane);
      ++plane;
    }
  }

  // Pools plane c: each output element is (sum of |x|^p over its window)^(1/p).
  void operator()(std::ptrdiff_t c) const {
    const T* in_plane = X_data + c * x_step;
    T* out_plane = Y_data + c * y_step;

    for (int64_t out_h = 0; out_h < pooled_height; ++out_h) {
      const int64_t h_first = out_h * stride_h - pads[0];
      const int64_t h_limit = h_first + kernel_shape[0] * dilation_h;

      T& acc = out_plane[out_h];
      acc = 0;
      for (int64_t h = h_first; h < h_limit; h += dilation_h) {
        // Positions rejected by the range check (e.g. inside the padding) add nothing.
        if (!math::is_a_ge_zero_and_a_lt_b(h, height)) {
          continue;
        }
        acc += static_cast<T>(std::pow(std::abs(in_plane[h]), p));
      }
      acc = static_cast<T>(std::pow(acc, 1.0f / p));
    }
  }
};
// Work item for 2-D Lp pooling. Calling it with a plane index c pools the c-th
// height x width input plane into the c-th pooled_height x pooled_width output plane,
// both stored row by row. Members are public and ordered for aggregate initialization.
template <typename T>
struct LpPool2DTask final {
  const T* X_data;
  T* Y_data;
  int64_t x_step;
  int64_t y_step;
  int64_t dilation_h;
  int64_t dilation_w;
  int64_t pooled_height;
  int64_t pooled_width;
  int64_t stride_h;
  int64_t stride_w;
  int64_t height;
  int64_t width;
  gsl::span<const int64_t> kernel_shape;
  gsl::span<const int64_t> pads;
  int64_t p;

  // Per-plane cost estimate: number of outputs times kernel area, reported for all three
  // TensorOpCost fields.
  TensorOpCost Cost() {
    const double work = static_cast<double>(pooled_height * pooled_width * kernel_shape[0] * kernel_shape[1]);
    return TensorOpCost{work, work, work};
  }

  // Pools every plane in the half-open range [begin, end).
  void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const {
    std::ptrdiff_t plane = begin;
    while (plane < end) {
      operator()(plane);
      ++plane;
    }
  }

  // Pools plane c: each output element is (sum of |x|^p over its window)^(1/p).
  void operator()(std::ptrdiff_t c) const {
    const T* in_plane = X_data + c * x_step;
    T* out_plane = Y_data + c * y_step;

    for (int64_t out_h = 0; out_h < pooled_height; ++out_h) {
      const int64_t h_first = out_h * stride_h - pads[0];
      const int64_t h_limit = h_first + kernel_shape[0] * dilation_h;

      for (int64_t out_w = 0; out_w < pooled_width; ++out_w) {
        const int64_t w_first = out_w * stride_w - pads[1];
        const int64_t w_limit = w_first + kernel_shape[1] * dilation_w;

        T& acc = out_plane[out_h * pooled_width + out_w];
        acc = 0;
        for (int64_t h = h_first; h < h_limit; h += dilation_h) {
          // Rows rejected by the range check contribute nothing.
          if (!math::is_a_ge_zero_and_a_lt_b(h, height)) {
            continue;
          }
          for (int64_t w = w_first; w < w_limit; w += dilation_w) {
            // Same for columns.
            if (!math::is_a_ge_zero_and_a_lt_b(w, width)) {
              continue;
            }
            acc += static_cast<T>(std::pow(std::abs(in_plane[h * width + w]), p));
          }
        }
        acc = static_cast<T>(std::pow(acc, 1.0f / p));
      }
    }
  }
};
// Work item for 3-D Lp pooling. Calling it with a plane index c pools the c-th
// height x width x depth input volume into the c-th
// pooled_height x pooled_width x pooled_depth output volume; depth is the fastest-varying
// index in both. Members are public and ordered for aggregate initialization.
//
// Marked `final` like LpPool1DTask and LpPool2DTask.
template <typename T>
struct LpPool3DTask final {
  const T* X_data;
  T* Y_data;
  int64_t x_step;
  int64_t y_step;
  int64_t dilation_h;
  int64_t dilation_w;
  int64_t dilation_d;
  int64_t pooled_height;
  int64_t pooled_width;
  int64_t pooled_depth;
  int64_t stride_h;
  int64_t stride_w;
  int64_t stride_d;
  int64_t height;
  int64_t width;
  int64_t depth;
  gsl::span<const int64_t> kernel_shape;
  gsl::span<const int64_t> pads;
  int64_t p;

  // Per-plane cost estimate: number of outputs times kernel volume, reported for all three
  // TensorOpCost fields.
  TensorOpCost Cost() {
    double loop_count = static_cast<double>(pooled_height * pooled_width * pooled_depth * kernel_shape[0] *
                                            kernel_shape[1] * kernel_shape[2]);
    return TensorOpCost{loop_count, loop_count, loop_count};
  }

  // Pools every plane in the half-open range [begin, end).
  void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const {
    for (std::ptrdiff_t c = begin; c < end; ++c) {
      operator()(c);
    }
  }

  // Pools plane c: each output element is (sum of |x|^p over its window)^(1/p).
  void operator()(std::ptrdiff_t c) const {
    const T* x_d = X_data + c * x_step;
    T* y_d = Y_data + c * y_step;

    for (int64_t ph = 0; ph < pooled_height; ++ph) {
      int64_t hstart = ph * stride_h - pads[0];
      int64_t hend = hstart + kernel_shape[0] * dilation_h;

      for (int64_t pw = 0; pw < pooled_width; ++pw) {
        int64_t wstart = pw * stride_w - pads[1];
        int64_t wend = wstart + kernel_shape[1] * dilation_w;

        for (int64_t pd = 0; pd < pooled_depth; ++pd) {
          int64_t dstart = pd * stride_d - pads[2];
          int64_t dend = dstart + kernel_shape[2] * dilation_d;
          const int64_t pool_index = ph * pooled_width * pooled_depth + pw * pooled_depth + pd;

          y_d[pool_index] = 0;
          // Positions rejected by a range check along any axis contribute nothing.
          for (int64_t h = hstart; h < hend; h += dilation_h) {
            if (math::is_a_ge_zero_and_a_lt_b(h, height)) {
              for (int64_t w = wstart; w < wend; w += dilation_w) {
                if (math::is_a_ge_zero_and_a_lt_b(w, width)) {
                  for (int64_t d = dstart; d < dend; d += dilation_d) {
                    if (math::is_a_ge_zero_and_a_lt_b(d, depth)) {
                      const int64_t input_index = h * width * depth + w * depth + d;
                      y_d[pool_index] += static_cast<T>(std::pow(std::abs(x_d[input_index]), p));
                    }
                  }
                }
              }
            }
          }
          y_d[pool_index] = static_cast<T>(std::pow(y_d[pool_index], 1.0f / p));
        }
      }
    }
  }
};
} // namespace onnxruntime

View file

@ -0,0 +1,62 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import numpy as np
import torch
from torch import nn
# Use this script to generate the expected outputs for PoolTest.LpPool1d and PoolTest.LpPool2d.
def generate_lppool_1d_test_cases() -> None:
    """Print reference LPPool1d results (p = 2) for the input [[[1, 2, 3, 4]]].

    The input is printed first. Then, for each kernel size in (2, 3) and each stride in
    ([1], [2]), the function prints the kernel size, the stride, the flattened output of
    torch.nn.LPPool1d and the output shape.
    """
    norm_order = 2
    data = np.arange(1, 5).reshape(1, 1, 4).astype(np.float32)
    print(data)

    cases = [(kernel_size, stride) for kernel_size in [2, 3] for stride in [[1], [2]]]
    for kernel_size, stride in cases:
        print(kernel_size)
        print(stride)
        pool = nn.LPPool1d(norm_order, kernel_size=kernel_size, stride=stride)
        pooled = pool(torch.from_numpy(data))
        print(torch.flatten(pooled))
        print(pooled.shape)
def generate_lppool_2d_test_cases() -> None:
    """Print reference LPPool2d results (p = 2) for a 1x1x4x4 input holding 1..16.

    The input is printed first. Then, for each kernel size in ([2, 2], [3, 3]) and each
    stride in ([1, 1], [2, 2]), the function prints the kernel size, the stride, the
    flattened output of torch.nn.LPPool2d and the output shape.
    """
    norm_order = 2
    data = np.arange(1, 17).reshape(1, 1, 4, 4).astype(np.float32)
    print(data)

    cases = [(kernel_size, stride) for kernel_size in [[2, 2], [3, 3]] for stride in [[1, 1], [2, 2]]]
    for kernel_size, stride in cases:
        pool = nn.LPPool2d(norm_order, kernel_size=kernel_size, stride=stride)
        pooled = pool(torch.from_numpy(data))
        print(kernel_size)
        print(stride)
        print(torch.flatten(pooled))
        print(pooled.shape)
# Run the generators only when this file is executed as a script, so that importing it
# does not print anything.
if __name__ == "__main__":
    generate_lppool_1d_test_cases()
    generate_lppool_2d_test_cases()

View file

@ -1331,6 +1331,88 @@ TEST(PoolTest, LpPool) {
test.Run();
}
// test data generated with lp_pool_test_generator.py
TEST(PoolTest, LpPool1d) {
  // p = 2 pooling of {1, 2, 3, 4}. Expected outputs are stored kernel-major, stride-minor:
  // index = kernel_index * 2 + stride_index.
  std::vector<int64_t> kernel_sizes[2] = {{2}, {3}};
  std::vector<int64_t> strides[2] = {{1}, {2}};
  std::vector<float> expected_values[4] = {
      {2.2361f, 3.6056f, 5.0000f},
      {2.2361f, 5.0000f},
      {3.7417f, 5.3852f},
      {3.7417f}};
  std::vector<int64_t> expected_dims[4] = {
      {1, 1, 3},
      {1, 1, 2},
      {1, 1, 2},
      {1, 1, 1},
  };

  for (int kernel_index = 0; kernel_index < 2; ++kernel_index) {
    for (int stride_index = 0; stride_index < 2; ++stride_index) {
      const int case_index = kernel_index * 2 + stride_index;

      OpTester test("LpPool", 18);
      test.AddAttribute("auto_pad", "");
      test.AddAttribute("p", static_cast<int64_t>(2));
      test.AddInput<float>("X", {1, 1, 4}, {1, 2, 3, 4});
      test.AddAttribute("strides", strides[stride_index]);
      test.AddAttribute("kernel_shape", kernel_sizes[kernel_index]);
      test.AddOutput<float>("Y", expected_dims[case_index], expected_values[case_index]);

      // https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/classnvinfer1_1_1_i_network_definition.html#a94f434942252e6d98ac17705c06ce060
      // TensorRT does not support 1d pooling
      test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
    }
  }
}
// test data generated with lp_pool_test_generator.py
TEST(PoolTest, LpPool2d) {
  // p = 2 pooling of a 4x4 plane holding 1..16. Expected outputs are stored kernel-major,
  // stride-minor: index = kernel_index * 2 + stride_index.
  std::vector<int64_t> kernel_sizes[2] = {{2, 2}, {3, 3}};
  std::vector<int64_t> strides[2] = {{1, 1}, {2, 2}};
  std::vector<float> expected_values[4] = {
      {8.1240f, 9.8995f, 11.7473f, 15.5563f, 17.4929f, 19.4422f, 23.3666f, 25.3377f, 27.3130f},
      {8.1240f, 11.7473f, 23.3666f, 27.3130f},
      {20.6398f, 23.3024f, 31.6544f, 34.5109f},
      {20.6398f}};
  std::vector<int64_t> expected_dims[4] = {
      {1, 1, 3, 3},
      {1, 1, 2, 2},
      {1, 1, 2, 2},
      {1, 1, 1, 1},
  };

  for (int kernel_index = 0; kernel_index < 2; ++kernel_index) {
    for (int stride_index = 0; stride_index < 2; ++stride_index) {
      const int case_index = kernel_index * 2 + stride_index;

      OpTester test("LpPool", 18);
      test.AddAttribute("auto_pad", "");
      test.AddAttribute("p", static_cast<int64_t>(2));
      test.AddInput<float>("X", {1, 1, 4, 4},
                           {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
      test.AddAttribute("strides", strides[stride_index]);
      test.AddAttribute("kernel_shape", kernel_sizes[kernel_index]);
      test.AddOutput<float>("Y", expected_dims[case_index], expected_values[case_index]);
      test.Run();
    }
  }
}
// Exercises ceil_mode = 1 on a 1-D input: with kernel 3 and stride 2 over 4 elements the
// output has ceil((4 - 3) / 2) + 1 = 2 elements. With p = 1 the first window {1, 2, 3}
// sums to 6; the second starts at index 2 and only {3, 4} lie inside the input, giving 7.
TEST(PoolTest, LpPoolCeilMode) {
  OpTester test("LpPool", 18);

  test.AddAttribute("auto_pad", "");
  test.AddAttribute("strides", std::vector<int64_t>{2});
  test.AddAttribute("kernel_shape", std::vector<int64_t>{3});
  test.AddAttribute("ceil_mode", static_cast<int64_t>(1));
  test.AddAttribute("p", static_cast<int64_t>(1));
  test.AddInput<float>("X", {1, 1, 4}, {1, 2, 3, 4});
  test.AddOutput<float>("Y", {1, 1, 2}, {6, 7});

  // https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/classnvinfer1_1_1_i_network_definition.html#a94f434942252e6d98ac17705c06ce060
  // TensorRT does not support 1d pooling
  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
}
TEST(PoolTest, GlobalLpPool) {
OpTester test("GlobalLpPool");
test.AddAttribute("p", static_cast<int64_t>(3));