From 2b1a59f01abd38d5fd40d75b3f41547791980fbc Mon Sep 17 00:00:00 2001
From: liqun Fu <liqfu@microsoft.com>
Date: Wed, 25 Jan 2023 23:14:56 -0800
Subject: [PATCH] cpu support of LpPool(18) (#14205)

Signed-off-by: Liqun Fu <liqfu@microsoft.com>

### Description
Add a CPU kernel for the LpPool operator at opset 18, which adds the ceil_mode and dilations attributes.



### Motivation and Context
Needed for the ORT 1.14 release.

Signed-off-by: Liqun Fu <liqfu@microsoft.com>
---
 docs/OperatorKernels.md                       |   3 +-
 .../providers/cpu/cpu_execution_provider.cc   |   6 +-
 onnxruntime/core/providers/cpu/nn/pool.cc     |  87 ++++++++-
 onnxruntime/core/providers/cpu/nn/pool.h      |  16 ++
 .../core/providers/cpu/nn/pool_functors.h     | 166 ++++++++++++++++++
 .../cpu/nn/lp_pool_test_generator.py          |  62 +++++++
 .../test/providers/cpu/nn/pool_op_test.cc     |  82 +++++++++
 7 files changed, 417 insertions(+), 5 deletions(-)
 create mode 100644 onnxruntime/test/providers/cpu/nn/lp_pool_test_generator.py
diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
index ad571dacb2..41c0da6fa0 100644
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@@ -175,7 +175,8 @@ Do not modify directly.*
 |||[11, 12]|**B** = tensor(bool)<br/> **I** = tensor(int64)<br/> **V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
 |||[1, 10]|**B** = tensor(bool)<br/> **I** = tensor(int64)<br/> **V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
 |LpNormalization|*in* input:**T**<br> *out* output:**T**|1+|**T** = tensor(double), tensor(float)|
-|LpPool|*in* X:**T**<br> *out* Y:**T**|11+|**T** = tensor(float)|
+|LpPool|*in* X:**T**<br> *out* Y:**T**|18+|**T** = tensor(float)|
+|||[11, 17]|**T** = tensor(float)|
 |||[2, 10]|**T** = tensor(float)|
 |MatMul|*in* A:**T**<br> *in* B:**T**<br> *out* Y:**T**|13+|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)|
 |||[9, 12]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)|
diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
index 75060fbf9f..caba009075 100644
--- a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
+++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
@@ -439,7 +439,7 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDoma
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, NonMaxSuppression);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, AveragePool);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, MaxUnpool);
-class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, LpPool);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, 17, LpPool);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, Conv);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, ConvTranspose);
 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, 12, If);
@@ -830,6 +830,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain,
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, float, ReduceSumSquare);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, double, ReduceSumSquare);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, int32_t, ReduceSumSquare);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, LpPool);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, Col2Im);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, int8_t, BitwiseAnd);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, int16_t, BitwiseAnd);
@@ -1471,7 +1472,7 @@ Status RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) {
                                                           NonMaxSuppression)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, AveragePool)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, MaxUnpool)>,
-    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, LpPool)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, 17, LpPool)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, Conv)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, ConvTranspose)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, 12, If)>,
@@ -2164,6 +2165,7 @@ Status RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) {
                                                                 ReduceSumSquare)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, double,
                                                                 ReduceSumSquare)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, LpPool)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, Col2Im)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, int8_t, BitwiseAnd)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, int16_t, BitwiseAnd)>,
diff --git a/onnxruntime/core/providers/cpu/nn/pool.cc b/onnxruntime/core/providers/cpu/nn/pool.cc
index 53c24e7e3d..0f3681d64c 100644
--- a/onnxruntime/core/providers/cpu/nn/pool.cc
+++ b/onnxruntime/core/providers/cpu/nn/pool.cc
@@ -249,6 +249,81 @@ Status MaxPoolV8::ComputeImpl(OpKernelContext* context) const {
   return Status::OK();
 }
 
+// Computes LpPool (opset 18+): each output element is the p-norm of its pooling window,
+// honouring strides, pads and dilations. Work is split over the x_shape[0] * x_shape[1] planes.
+template <typename T>
+Status LpPoolV18<T>::Compute(OpKernelContext* context) const {
+  concurrency::ThreadPool* tp = context->GetOperatorThreadPool();
+
+  const auto* X = context->Input<Tensor>(0);
+  const TensorShape& x_shape = X->Shape();
+
+  ORT_RETURN_IF_NOT(x_shape.NumDimensions() >= 3, "Input dimension cannot be less than 3.");
+  ORT_RETURN_IF_NOT(x_shape.NumDimensions() == pool_attrs_.kernel_shape.size() + 2,
+                    "kernel_shape must have one entry per spatial dimension of the input.");
+
+  auto pads = pool_attrs_.pads;
+  auto kernel_shape = pool_attrs_.kernel_shape;
+
+  auto output_dims = pool_attrs_.SetOutputSize(x_shape, x_shape[1], &pads);
+  Tensor* Y = context->Output(0, output_dims);
+
+  const auto* X_data = X->Data<T>();
+  auto* Y_data = Y->MutableData<T>();
+
+  // Spatial axes beyond the kernel rank collapse to extent 1 so the step products stay valid.
+  int64_t channels = x_shape[1];
+  int64_t height = x_shape[2];
+  int64_t width = kernel_shape.size() > 1 ? x_shape[3] : 1;
+  int64_t depth = kernel_shape.size() > 2 ? x_shape[4] : 1;
+  int64_t pooled_height = output_dims[2];
+  int64_t pooled_width = kernel_shape.size() > 1 ? output_dims[3] : 1;
+  int64_t pooled_depth = kernel_shape.size() > 2 ? output_dims[4] : 1;
+  const int64_t total_channels = x_shape[0] * channels;
+
+  switch (kernel_shape.size()) {
+    case 1: {
+      int64_t x_step = height;
+      int64_t y_step = pooled_height;
+      const int64_t dilation_h = pool_attrs_.dilations[0];
+
+      RunLoop<LpPool1DTask<T>>(tp, onnxruntime::narrow<size_t>(total_channels),
+                               {X_data, Y_data, x_step, y_step, dilation_h, pooled_height, stride_h(),
+                                height, kernel_shape, pads, p_});
+      break;
+    }
+
+    case 2: {
+      int64_t x_step = height * width;
+      int64_t y_step = pooled_height * pooled_width;
+      const int64_t dilation_h = pool_attrs_.dilations[0];
+      const int64_t dilation_w = pool_attrs_.dilations[1];
+      RunLoop<LpPool2DTask<T>>(
+          tp, onnxruntime::narrow<size_t>(total_channels),
+          {X_data, Y_data, x_step, y_step, dilation_h, dilation_w, pooled_height, pooled_width, stride_h(),
+           stride_w(), height, width, kernel_shape, pads, p_});
+      break;
+    }
+    case 3: {
+      int64_t x_step = height * width * depth;
+      int64_t y_step = pooled_height * pooled_width * pooled_depth;
+      const int64_t dilation_h = pool_attrs_.dilations[0];
+      const int64_t dilation_w = pool_attrs_.dilations[1];
+      const int64_t dilation_d = pool_attrs_.dilations[2];
+      RunLoop<LpPool3DTask<T>>(tp, onnxruntime::narrow<size_t>(total_channels),
+                               {X_data, Y_data, x_step, y_step,
+                                dilation_h, dilation_w, dilation_d, pooled_height, pooled_width,
+                                pooled_depth, stride_h(), stride_w(), stride_d(), height,
+                                width, depth, kernel_shape, pads, p_});
+      break;
+    }
+    default:
+      return Status(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported kernel dimension : " + std::to_string(kernel_shape.size()));
+  }
+
+  return Status::OK();
+}
+
 ONNX_CPU_OPERATOR_VERSIONED_KERNEL(AveragePool, 7, 9,
                                    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
                                    Pool<float, AveragePool>);
@@ -284,8 +359,16 @@ ONNX_CPU_OPERATOR_VERSIONED_KERNEL(LpPool, 2, 10,
                                    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
                                    Pool<float, LpPool>);
 
-ONNX_CPU_OPERATOR_KERNEL(LpPool, 11, KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
-                         Pool<float, LpPool>);
+ONNX_CPU_OPERATOR_VERSIONED_KERNEL(LpPool, 11, 17,
+                                   KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
+                                   Pool<float, LpPool>);
+
+// LpPool opset 18 and later is served by the dedicated LpPoolV18 kernel,
+// which reads the dilations attribute (float tensors only).
+ONNX_CPU_OPERATOR_KERNEL(
+    LpPool, 18,
+    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
+    LpPoolV18<float>);
 
 ONNX_CPU_OPERATOR_KERNEL(GlobalLpPool, 2, KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
                          Pool<float, LpPool>);
diff --git a/onnxruntime/core/providers/cpu/nn/pool.h b/onnxruntime/core/providers/cpu/nn/pool.h
index 5458c5ba6c..7e4899ea1d 100644
--- a/onnxruntime/core/providers/cpu/nn/pool.h
+++ b/onnxruntime/core/providers/cpu/nn/pool.h
@@ -46,4 +46,20 @@ class MaxPoolV8 : public OpKernel, public PoolBase {
   template <typename T>
   Status ComputeImpl(OpKernelContext* context) const;
 };
+
+// CPU kernel for LpPool opset 18 and later (opset 18 added ceil_mode and dilations).
+// Construction fails via ORT_ENFORCE when the "p" attribute cannot be read.
+template <typename T>
+class LpPoolV18 : public OpKernel, public PoolBase {
+ public:
+  explicit LpPoolV18(const OpKernelInfo& info) : OpKernel(info), PoolBase(info) {
+    ORT_ENFORCE(info.GetAttr<int64_t>("p", &p_).IsOK());
+  }
+
+  Status Compute(OpKernelContext* context) const override;
+
+ private:
+  int64_t p_{2};  // norm order; replaced by the "p" attribute in the constructor
+};
+
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cpu/nn/pool_functors.h b/onnxruntime/core/providers/cpu/nn/pool_functors.h
index 19ffb8f2d5..9948e1d809 100644
--- a/onnxruntime/core/providers/cpu/nn/pool_functors.h
+++ b/onnxruntime/core/providers/cpu/nn/pool_functors.h
@@ -377,4 +377,170 @@ struct MaxPool3DTask {
   }
 };
 
+template <typename T>
+struct LpPool1DTask final {
+  const T* X_data;
+  T* Y_data;
+  int64_t x_step;
+  int64_t y_step;
+  int64_t dilation_h;
+  int64_t pooled_height;
+  int64_t stride_h;
+  int64_t height;
+  gsl::span<const int64_t> kernel_shape;
+  gsl::span<const int64_t> pads;
+  int64_t p;
+  TensorOpCost Cost() {
+    const auto work = static_cast<double>(pooled_height * kernel_shape[0]);
+    return TensorOpCost{work, work, work};
+  }
+
+  // Runs the single-plane overload for every plane index in [begin, end).
+  void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const {
+    for (std::ptrdiff_t plane = begin; plane < end; ++plane) (*this)(plane);
+  }
+  // Writes, for plane c, the p-norm of each dilated window; taps outside [0, height) are skipped.
+  void operator()(std::ptrdiff_t c) const {
+    const T* src = X_data + c * x_step;
+    T* dst = Y_data + c * y_step;
+    for (int64_t out_h = 0; out_h < pooled_height; ++out_h) {
+      const int64_t h_first = out_h * stride_h - pads[0];
+      const int64_t h_limit = h_first + kernel_shape[0] * dilation_h;
+      T& acc = dst[out_h];
+      acc = 0;
+      for (int64_t h = h_first; h < h_limit; h += dilation_h) {
+        if (!math::is_a_ge_zero_and_a_lt_b(h, height)) continue;
+        acc += static_cast<T>(std::pow(std::abs(src[h]), p));
+      }
+      acc = static_cast<T>(std::pow(acc, 1.0f / p));
+    }
+  }
+};
+
+// Work item computing 2-D LpPool for a single plane of x_step input values.
+template <typename T>
+struct LpPool2DTask final {
+  const T* X_data;
+  T* Y_data;
+  int64_t x_step;
+  int64_t y_step;
+  int64_t dilation_h;
+  int64_t dilation_w;
+  int64_t pooled_height;
+  int64_t pooled_width;
+  int64_t stride_h;
+  int64_t stride_w;
+  int64_t height;
+  int64_t width;
+  gsl::span<const int64_t> kernel_shape;
+  gsl::span<const int64_t> pads;
+  int64_t p;
+
+  TensorOpCost Cost() {
+    const auto work = static_cast<double>(pooled_height * pooled_width * kernel_shape[0] * kernel_shape[1]);
+    return TensorOpCost{work, work, work};
+  }
+
+  // Runs the single-plane overload for every plane index in [begin, end).
+  void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const {
+    for (std::ptrdiff_t plane = begin; plane < end; ++plane) (*this)(plane);
+  }
+
+  // Writes, for plane c, the p-norm of each dilated window; taps outside the input are skipped.
+  void operator()(std::ptrdiff_t c) const {
+    const T* src = X_data + c * x_step;
+    T* dst = Y_data + c * y_step;
+    for (int64_t out_h = 0; out_h < pooled_height; ++out_h) {
+      const int64_t h_first = out_h * stride_h - pads[0];
+      const int64_t h_limit = h_first + kernel_shape[0] * dilation_h;
+      for (int64_t out_w = 0; out_w < pooled_width; ++out_w) {
+        const int64_t w_first = out_w * stride_w - pads[1];
+        const int64_t w_limit = w_first + kernel_shape[1] * dilation_w;
+        // Accumulate sum(|x|^p) directly in the output slot, then take the p-th root.
+        T& acc = dst[out_h * pooled_width + out_w];
+        acc = 0;
+        for (int64_t h = h_first; h < h_limit; h += dilation_h) {
+          if (!math::is_a_ge_zero_and_a_lt_b(h, height)) continue;
+          const T* row = src + h * width;
+          for (int64_t w = w_first; w < w_limit; w += dilation_w) {
+            if (!math::is_a_ge_zero_and_a_lt_b(w, width)) continue;
+            acc += static_cast<T>(std::pow(std::abs(row[w]), p));
+          }
+        }
+        acc = static_cast<T>(std::pow(acc, 1.0f / p));
+      }
+    }
+  }
+};
+
+// Work item computing 3-D LpPool for a single plane of x_step input values.
+template <typename T>
+struct LpPool3DTask final {
+  const T* X_data;
+  T* Y_data;
+  int64_t x_step;
+  int64_t y_step;
+  int64_t dilation_h;
+  int64_t dilation_w;
+  int64_t dilation_d;
+  int64_t pooled_height;
+  int64_t pooled_width;
+  int64_t pooled_depth;
+  int64_t stride_h;
+  int64_t stride_w;
+  int64_t stride_d;
+  int64_t height;
+  int64_t width;
+  int64_t depth;
+  gsl::span<const int64_t> kernel_shape;
+  gsl::span<const int64_t> pads;
+  int64_t p;
+
+  // Runs the single-plane overload for every plane index in [begin, end).
+  void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const {
+    for (std::ptrdiff_t plane = begin; plane < end; ++plane) {
+      (*this)(plane);
+    }
+  }
+
+  TensorOpCost Cost() {
+    const auto work = static_cast<double>(pooled_height * pooled_width * pooled_depth * kernel_shape[0] *
+                                          kernel_shape[1] * kernel_shape[2]);
+    return TensorOpCost{work, work, work};
+  }
+
+  // Writes, for plane c, the p-norm of each dilated window; taps outside the input are skipped.
+  void operator()(std::ptrdiff_t c) const {
+    const T* src = X_data + c * x_step;
+    T* dst = Y_data + c * y_step;
+    const float inv_p = 1.0f / p;  // loop-invariant exponent for the final p-th root
+    for (int64_t out_h = 0; out_h < pooled_height; ++out_h) {
+      const int64_t h_first = out_h * stride_h - pads[0];
+      const int64_t h_limit = h_first + kernel_shape[0] * dilation_h;
+      for (int64_t out_w = 0; out_w < pooled_width; ++out_w) {
+        const int64_t w_first = out_w * stride_w - pads[1];
+        const int64_t w_limit = w_first + kernel_shape[1] * dilation_w;
+        for (int64_t out_d = 0; out_d < pooled_depth; ++out_d) {
+          const int64_t d_first = out_d * stride_d - pads[2];
+          const int64_t d_limit = d_first + kernel_shape[2] * dilation_d;
+          T& acc = dst[(out_h * pooled_width + out_w) * pooled_depth + out_d];
+          acc = 0;
+          for (int64_t h = h_first; h < h_limit; h += dilation_h) {
+            if (!math::is_a_ge_zero_and_a_lt_b(h, height)) continue;
+            for (int64_t w = w_first; w < w_limit; w += dilation_w) {
+              if (!math::is_a_ge_zero_and_a_lt_b(w, width)) continue;
+              const T* lane = src + (h * width + w) * depth;
+              for (int64_t d = d_first; d < d_limit; d += dilation_d) {
+                if (!math::is_a_ge_zero_and_a_lt_b(d, depth)) continue;
+                acc += static_cast<T>(std::pow(std::abs(lane[d]), p));
+              }
+            }
+          }
+          acc = static_cast<T>(std::pow(acc, inv_p));
+        }
+      }
+    }
+  }
+};
+
 }  // namespace onnxruntime
diff --git a/onnxruntime/test/providers/cpu/nn/lp_pool_test_generator.py b/onnxruntime/test/providers/cpu/nn/lp_pool_test_generator.py
new file mode 100644
index 0000000000..e068784557
--- /dev/null
+++ b/onnxruntime/test/providers/cpu/nn/lp_pool_test_generator.py
@@ -0,0 +1,62 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import numpy as np
+import torch
+from torch import nn
+
+
+# use this code to generate test data for PoolTest.LpPool1d and PoolTest.LpPool2d
+def generate_lppool_1d_test_cases() -> None:
+    """Print PyTorch reference outputs for LpPool over a fixed 1x1x4 input.
+
+    Each kernel size (2, 3) is paired with each stride ([1], [2]) using p=2; the kernel
+    size, stride, flattened result and result shape are printed for every combination.
+    """
+    norm_order = 2
+    data = np.array([[[1, 2, 3, 4]]]).astype(np.float32)
+
+    print(data)
+    kernel_sizes = [2, 3]
+    strides = [[1], [2]]
+    for kernel_size in kernel_sizes:
+        for stride in strides:
+            print(kernel_size)
+            print(stride)
+            # torch's LPPool1d supplies the reference values.
+            pool = nn.LPPool1d(norm_order, kernel_size=kernel_size, stride=stride)
+            result = pool(torch.from_numpy(data))
+            print(torch.flatten(result))
+            print(result.shape)
+
+
+def generate_lppool_2d_test_cases() -> None:
+    """Print PyTorch reference outputs for LpPool over a fixed 1x1x4x4 input.
+
+    The input holds 1..16 in row-major order. Each kernel size ([2, 2], [3, 3]) is
+    paired with each stride ([1, 1], [2, 2]) using p=2; the kernel size, stride,
+    flattened result and result shape are printed for every combination.
+    """
+    norm_order = 2
+    # Same tensor as spelling out the 4x4 grid of 1..16 row by row.
+    grid = np.arange(1, 17).reshape(1, 1, 4, 4)
+    data = grid.astype(np.float32)
+
+    print(data)
+    kernel_sizes = [[2, 2], [3, 3]]
+    strides = [[1, 1], [2, 2]]
+    # Kernel size is the outer loop, so results are printed kernel-major.
+    for kernel_size in kernel_sizes:
+        for stride in strides:
+            # torch's LPPool2d supplies the reference values.
+            pool = nn.LPPool2d(norm_order, kernel_size=kernel_size, stride=stride)
+            result = pool(torch.from_numpy(data))
+            print(kernel_size)
+            print(stride)
+            print(torch.flatten(result))
+            print(result.shape)
+
+
+generate_lppool_1d_test_cases()  # module-level call: prints the 1-D reference data when the file runs
+generate_lppool_2d_test_cases()  # module-level call: prints the 2-D reference data when the file runs
diff --git a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc
index c68d9839d2..44f81df407 100644
--- a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc
@@ -1331,6 +1331,88 @@ TEST(PoolTest, LpPool) {
   test.Run();
 }
 
+// test data generated with lp_pool_test_generator.py
+TEST(PoolTest, LpPool1d) {
+  std::vector<int64_t> kernel_sizes[2] = {{2}, {3}};
+  std::vector<int64_t> strides[2] = {{1}, {2}};
+  // Expected values are stored kernel-major: (k=2,s=1), (k=2,s=2), (k=3,s=1), (k=3,s=2).
+  std::vector<float> ys[4] = {
+      {2.2361f, 3.6056f, 5.0000f},
+      {2.2361f, 5.0000f},
+      {3.7417f, 5.3852f},
+      {3.7417f}};
+  std::vector<int64_t> y_sizes[4] = {
+      {1, 1, 3},
+      {1, 1, 2},
+      {1, 1, 2},
+      {1, 1, 1},
+  };
+  for (int kernel_idx = 0; kernel_idx < 2; ++kernel_idx) {
+    for (int stride_idx = 0; stride_idx < 2; ++stride_idx) {
+      const int case_idx = kernel_idx * 2 + stride_idx;
+      OpTester test("LpPool", 18);
+      test.AddAttribute("auto_pad", "");
+      test.AddAttribute("p", static_cast<int64_t>(2));
+      test.AddInput<float>("X", {1, 1, 4}, {1, 2, 3, 4});
+      test.AddAttribute("strides", strides[stride_idx]);
+      test.AddAttribute("kernel_shape", kernel_sizes[kernel_idx]);
+      test.AddOutput<float>("Y", y_sizes[case_idx], ys[case_idx]);
+
+      // https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/classnvinfer1_1_1_i_network_definition.html#a94f434942252e6d98ac17705c06ce060
+      // TensorRT does not support 1d pooling
+      test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
+    }
+  }
+}
+
+// test data generated with lp_pool_test_generator.py
+TEST(PoolTest, LpPool2d) {
+  std::vector<int64_t> kernel_sizes[2] = {{2, 2}, {3, 3}};
+  std::vector<int64_t> strides[2] = {{1, 1}, {2, 2}};
+  // Expected values are stored kernel-major: (k=2,s=1), (k=2,s=2), (k=3,s=1), (k=3,s=2).
+  std::vector<float> ys[4] = {
+      {8.1240f, 9.8995f, 11.7473f, 15.5563f, 17.4929f, 19.4422f, 23.3666f, 25.3377f, 27.3130f},
+      {8.1240f, 11.7473f, 23.3666f, 27.3130f},
+      {20.6398f, 23.3024f, 31.6544f, 34.5109f},
+      {20.6398f}};
+  std::vector<int64_t> y_sizes[4] = {
+      {1, 1, 3, 3},
+      {1, 1, 2, 2},
+      {1, 1, 2, 2},
+      {1, 1, 1, 1},
+  };
+  for (int kernel_idx = 0; kernel_idx < 2; ++kernel_idx) {
+    for (int stride_idx = 0; stride_idx < 2; ++stride_idx) {
+      const int case_idx = kernel_idx * 2 + stride_idx;
+      OpTester test("LpPool", 18);
+      test.AddAttribute("auto_pad", "");
+      test.AddAttribute("p", static_cast<int64_t>(2));
+      test.AddInput<float>("X", {1, 1, 4, 4},
+                           {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+      test.AddAttribute("strides", strides[stride_idx]);
+      test.AddAttribute("kernel_shape", kernel_sizes[kernel_idx]);
+      test.AddOutput<float>("Y", y_sizes[case_idx], ys[case_idx]);
+      test.Run();
+    }
+  }
+}
+
+TEST(PoolTest, LpPoolCeilMode) {
+  OpTester test("LpPool", 18);
+  // ceil_mode=1 yields a second window starting at index 2 that overhangs the input; p=1 sums 3+4.
+  test.AddAttribute("auto_pad", "");
+  test.AddAttribute("strides", std::vector<int64_t>{2});
+  test.AddAttribute("kernel_shape", std::vector<int64_t>{3});
+  test.AddAttribute("ceil_mode", static_cast<int64_t>(1));
+  test.AddAttribute("p", static_cast<int64_t>(1));
+  test.AddInput<float>("X", {1, 1, 4}, {1, 2, 3, 4});
+  test.AddOutput<float>("Y", {1, 1, 2}, {6, 7});
+
+  // https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/classnvinfer1_1_1_i_network_definition.html#a94f434942252e6d98ac17705c06ce060
+  // TensorRT does not support 1d pooling
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
+}
+
 TEST(PoolTest, GlobalLpPool) {
   OpTester test("GlobalLpPool");
   test.AddAttribute("p", static_cast<int64_t>(3));