diff --git a/onnxruntime/core/providers/cpu/nn/pool.cc b/onnxruntime/core/providers/cpu/nn/pool.cc
index d37424c3ed..367a9256a0 100644
--- a/onnxruntime/core/providers/cpu/nn/pool.cc
+++ b/onnxruntime/core/providers/cpu/nn/pool.cc
@@ -3,7 +3,7 @@
 
 #include "core/framework/op_kernel_context_internal.h"
 #include "core/providers/cpu/nn/pool.h"
-#include <cmath>
+
 using namespace ::onnxruntime::common;
 
 namespace onnxruntime {
@@ -25,7 +25,7 @@ Status Pool<T, PoolType>::Compute(OpKernelContext* context) const {
   }
 
   std::vector<int64_t> output_dims = PoolBase::SetOutputSize(x_shape, x_shape[1], &pads, dilations_, ceil_mode_);
-  Tensor* Y = context->Output(0, TensorShape(output_dims));
+  Tensor* Y = context->Output(0, output_dims);
 
   const auto* X_data = X->template Data<float>();
   auto* Y_data = Y->template MutableData<float>();
@@ -185,7 +185,7 @@ Status PoolBase::Compute(OpKernelContext* context, MLAS_POOLING_KIND kind) const
 
   std::vector<int64_t> pads = pads_;
   std::vector<int64_t> output_dims = PoolBase::SetOutputSize(x_shape, x_shape[1], &pads, dilations_, ceil_mode_);
-  Tensor* Y = context->Output(0, TensorShape(output_dims));
+  Tensor* Y = context->Output(0, output_dims);
 
   // Get access to the internal threadpool
   // Temporarily derive concurrency parameters without access to session state
@@ -222,8 +222,9 @@ Status Pool<float, MaxPool<8 /*VERSION*/>>::Compute(OpKernelContext* context) co
   // and also if dilation is not required
 
   bool need_dilation = false;
-  for (auto n : dilations_)
+  for (auto n : dilations_) {
     need_dilation |= n > 1;
+  }
 
   if (OpKernel::Node().OutputDefs().size() == 1 && !need_dilation) {
     return PoolBase::Compute(context, MlasMaximumPooling);
@@ -238,8 +239,8 @@ Status Pool<float, MaxPool<8 /*VERSION*/>>::Compute(OpKernelContext* context) co
   std::vector<int64_t> kernel_shape = kernel_shape_;
 
   std::vector<int64_t> output_dims = PoolBase::SetOutputSize(x_shape, x_shape[1], &pads, dilations_, ceil_mode_);
-  Tensor* Y = context->Output(0, TensorShape(output_dims));
-  Tensor* I = context->Output(1, TensorShape(output_dims));
+  Tensor* Y = context->Output(0, output_dims);
+  Tensor* I = context->Output(1, output_dims);
 
   const auto* X_data = X->template Data<float>();
   auto* Y_data = Y->template MutableData<float>();
@@ -270,14 +271,15 @@ Status Pool<float, MaxPool<8 /*VERSION*/>>::Compute(OpKernelContext* context) co
         int64_t* i_d = I_data ? I_data + c * y_step : nullptr;
         for (int64_t ph = 0; ph < pooled_height; ++ph) {
           int64_t hstart = ph * stride_h() - pads[0];
-          int64_t hend = std::min(hstart + kernel_shape[0] * dilation_h - dilation_h + 1, height);
-          hstart = std::max(hstart, static_cast<int64_t>(0));
+          int64_t hend = hstart + kernel_shape[0] * dilation_h;
           float Yh = std::numeric_limits<float>::lowest();
           int64_t h_index = -1;
           for (int64_t h = hstart; h < hend; h += dilation_h) {
-            if (x_d[h] > Yh) {
-              Yh = x_d[h];
-              h_index = h;
+            if (math::is_a_ge_zero_and_a_lt_b(h, height)) {
+              if (x_d[h] > Yh) {
+                Yh = x_d[h];
+                h_index = h;
+              }
             }
           }
           y_d[ph] = Yh;
@@ -305,23 +307,25 @@ Status Pool<float, MaxPool<8 /*VERSION*/>>::Compute(OpKernelContext* context) co
 
         for (int64_t ph = 0; ph < pooled_height; ++ph) {
           int64_t hstart = ph * stride_h() - pads[0];
-          int64_t hend = std::min(hstart + kernel_shape[0] * dilation_h - dilation_h + 1, height);
-          hstart = std::max(hstart, static_cast<int64_t>(0));
+          int64_t hend = hstart + kernel_shape[0] * dilation_h;
           for (int64_t pw = 0; pw < pooled_width; ++pw) {
             int64_t wstart = pw * stride_w() - pads[1];
-            int64_t wend = std::min(wstart + kernel_shape[1] * dilation_w - dilation_w + 1, width);
-            wstart = std::max(wstart, static_cast<int64_t>(0));
+            int64_t wend = wstart + kernel_shape[1] * dilation_w;
             const int64_t pool_index = ph * pooled_width + pw;
             float Yh = std::numeric_limits<float>::lowest();
             int64_t h_index = -1;
             int64_t w_index = -1;
             for (int64_t h = hstart; h < hend; h += dilation_h) {
-              for (int64_t w = wstart; w < wend; w += dilation_w) {
-                const int64_t input_index = h * width + w;
-                if (x_d[input_index] > Yh) {
-                  Yh = x_d[input_index];
-                  h_index = h;
-                  w_index = w;
+              if (math::is_a_ge_zero_and_a_lt_b(h, height)) {
+                for (int64_t w = wstart; w < wend; w += dilation_w) {
+                  if (math::is_a_ge_zero_and_a_lt_b(w, width)) {
+                    const int64_t input_index = h * width + w;
+                    if (x_d[input_index] > Yh) {
+                      Yh = x_d[input_index];
+                      h_index = h;
+                      w_index = w;
+                    }
+                  }
                 }
               }
             }
@@ -353,16 +357,13 @@ Status Pool<float, MaxPool<8 /*VERSION*/>>::Compute(OpKernelContext* context) co
 
         for (int64_t ph = 0; ph < pooled_height; ++ph) {
           int64_t hstart = ph * stride_h() - pads[0];
-          int64_t hend = std::min(hstart + kernel_shape[0] * dilation_h - dilation_h + 1, height);
-          hstart = std::max(hstart, static_cast<int64_t>(0));
+          int64_t hend = hstart + kernel_shape[0] * dilation_h;
           for (int64_t pw = 0; pw < pooled_width; ++pw) {
             int64_t wstart = pw * stride_w() - pads[1];
-            int64_t wend = std::min(wstart + kernel_shape[1] * dilation_w - dilation_w + 1, width);
-            wstart = std::max(wstart, static_cast<int64_t>(0));
+            int64_t wend = wstart + kernel_shape[1] * dilation_w;
             for (int64_t pd = 0; pd < pooled_depth; ++pd) {
               int64_t dstart = pd * stride_d() - pads[2];
-              int64_t dend = std::min(dstart + kernel_shape[2] * dilation_d - dilation_d + 1, depth);
-              dstart = std::max(dstart, static_cast<int64_t>(0));
+              int64_t dend = dstart + kernel_shape[2] * dilation_d;
               const int64_t pool_index =
                   ph * pooled_width * pooled_depth + pw * pooled_depth + pd;
               float Yh = std::numeric_limits<float>::lowest();
@@ -370,14 +371,20 @@ Status Pool<float, MaxPool<8 /*VERSION*/>>::Compute(OpKernelContext* context) co
               int64_t w_index = -1;
               int64_t d_index = -1;
               for (int64_t h = hstart; h < hend; h += dilation_h) {
-                for (int64_t w = wstart; w < wend; w += dilation_w) {
-                  for (int64_t d = dstart; d < dend; d += dilation_d) {
-                    const int64_t input_index = h * width * depth + w * depth + d;
-                    if (x_d[input_index] > Yh) {
-                      Yh = x_d[input_index];
-                      h_index = h;
-                      w_index = w;
-                      d_index = d;
+                if (math::is_a_ge_zero_and_a_lt_b(h, height)) {
+                  for (int64_t w = wstart; w < wend; w += dilation_w) {
+                    if (math::is_a_ge_zero_and_a_lt_b(w, width)) {
+                      for (int64_t d = dstart; d < dend; d += dilation_d) {
+                        if (math::is_a_ge_zero_and_a_lt_b(d, depth)) {
+                          const int64_t input_index = h * width * depth + w * depth + d;
+                          if (x_d[input_index] > Yh) {
+                            Yh = x_d[input_index];
+                            h_index = h;
+                            w_index = w;
+                            d_index = d;
+                          }
+                        }
+                      }
                     }
                   }
                 }
diff --git a/onnxruntime/core/providers/cpu/nn/pool_base.h b/onnxruntime/core/providers/cpu/nn/pool_base.h
index 11b70ac364..43f81982dd 100644
--- a/onnxruntime/core/providers/cpu/nn/pool_base.h
+++ b/onnxruntime/core/providers/cpu/nn/pool_base.h
@@ -7,6 +7,7 @@
 #include "core/common/common.h"
 #include "core/framework/op_kernel.h"
 #include "core/providers/cpu/nn/autopad_type.h"
+#include "core/util/math.h"
 #include "core/mlas/inc/mlas.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc
index 27658f1cc7..73f0ca2eb9 100644
--- a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc
@@ -51,7 +51,7 @@ TEST(PoolTest, MaxPool) {
 
   test.AddInput<float>("X", x_dims, x_vals);
   test.AddOutput<float>("Y", expected_dims, expected_vals);
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: result differs
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: result differs
 }
 
 // Only CUDA kernel has float 16 support
@@ -104,11 +104,11 @@ TEST(PoolTest, MaxPool_F16) {
 
   test.AddInput<MLFloat16>("X", x_dims, f_X);
   test.AddOutput<MLFloat16>("Y", expected_dims, f_Y);
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: Assertion `!attrs.count("pads")' failed
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: Assertion `!attrs.count("pads")' failed
 }
 #endif
 
-static void MaxPool_8_WithIndexTest(bool has_index, int64_t storage_order=0) {
+static void MaxPool_8_WithIndexTest(bool has_index, int64_t storage_order = 0) {
   OpTester test("MaxPool", 8);
 
   test.AddAttribute("auto_pad", "");
@@ -160,7 +160,7 @@ static void MaxPool_8_WithIndexTest(bool has_index, int64_t storage_order=0) {
 }
 
 TEST(PoolTest, MaxPool_8_With_Index) {
-  MaxPool_8_WithIndexTest(false);  // row major
+  MaxPool_8_WithIndexTest(false);                      // row major
   MaxPool_8_WithIndexTest(true, 0 /*storage_order*/);  // row major
   MaxPool_8_WithIndexTest(true, 1 /*storage_order*/);  // col major
 }
@@ -229,6 +229,26 @@ TEST(PoolTest, MaxPool_10_Dilation_1d) {
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
 }
 
+TEST(PoolTest, MaxPool_10_DilationPadding_1d) {  // opset-10 MaxPool on a 1-D signal with dilation and explicit padding combined
+  OpTester test("MaxPool", 10);
+
+  test.AddAttribute("auto_pad", "");
+  test.AddAttribute("strides", std::vector<int64_t>{1});
+  test.AddAttribute("pads", vector<int64_t>{1, 1});  // one padded position at each end of the axis
+  test.AddAttribute("kernel_shape", vector<int64_t>{3});
+  test.AddAttribute("dilations", vector<int64_t>{3});  // window taps are 3 apart: offsets 0, 3 and 6 from the window start
+
+  std::vector<float> x_vals = {
+      1, 3, 2, 4, -1, -3, -2, -4, -6, -5, -4, -2};
+  std::vector<int64_t> x_dims = {1, 1, 12};
+  std::vector<int64_t> expected_dims = {1, 1, 8};  // (12 + 1 + 1 - ((3 - 1) * 3 + 1)) / 1 + 1 = 8
+  std::vector<float> expected_vals = {2, 4, 3, 2, 4, -1, -2, -2};  // output i = max of the in-range values among x[i - 1], x[i + 2], x[i + 5]
+
+  test.AddInput<float>("X", x_dims, x_vals);
+  test.AddOutput<float>("Y", expected_dims, expected_vals);
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kTensorrtExecutionProvider});  // NOTE(review): presumably the list of providers to skip - confirm why CUDA is listed
+}
+
 TEST(PoolTest, MaxPool_10_Dilation_2d) {
   OpTester test("MaxPool", 10);
 
@@ -239,11 +259,10 @@ TEST(PoolTest, MaxPool_10_Dilation_2d) {
   test.AddAttribute("dilations", vector<int64_t>{2, 2});
 
   std::vector<float> x_vals = {
-	  1,  3,  2,  4, -1,
-	  5,  7,  6,  8, -2,
-	  9,  11, 10, 12, -3,
-	  13, 15, 14, 16, -4,
-      };
+      1, 3, 2, 4, -1,
+      5, 7, 6, 8, -2,
+      9, 11, 10, 12, -3,
+      13, 15, 14, 16, -4};
   std::vector<int64_t> x_dims = {1, 1, 4, 5};
   std::vector<int64_t> expected_dims = {1, 1, 2, 3};
   std::vector<float> expected_vals = {10, 12, 10, 14, 16, 14};
@@ -253,6 +272,33 @@ TEST(PoolTest, MaxPool_10_Dilation_2d) {
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
 }
 
+TEST(PoolTest, MaxPool_10_DilationPadding_2d) {  // opset-10 MaxPool on a 4x5 image with dilation and explicit padding combined
+  OpTester test("MaxPool", 10);
+
+  test.AddAttribute("auto_pad", "");
+  test.AddAttribute("strides", std::vector<int64_t>{1, 1});
+  test.AddAttribute("pads", vector<int64_t>{1, 1, 1, 1});  // one padded row/column on every side
+  test.AddAttribute("kernel_shape", vector<int64_t>{2, 2});
+  test.AddAttribute("dilations", vector<int64_t>{2, 2});  // each window samples offsets 0 and 2 along both axes
+
+  std::vector<float> x_vals = {
+      1, 3, 2, 4, -1,
+      5, 7, 6, 8, -2,
+      9, 11, 10, 12, -3,
+      13, 15, 14, 16, -4};
+  std::vector<int64_t> x_dims = {1, 1, 4, 5};
+  std::vector<int64_t> expected_dims = {1, 1, 4, 5};  // per axis: (size + 2 - ((2 - 1) * 2 + 1)) / 1 + 1, i.e. unchanged
+  std::vector<float> expected_vals = {  // output (r, c) = max of the in-range values at rows {r - 1, r + 1} x columns {c - 1, c + 1}
+      7, 6, 8, 6, 8,
+      11, 10, 12, 10, 12,
+      15, 14, 16, 14, 16,
+      11, 10, 12, 10, 12};
+
+  test.AddInput<float>("X", x_dims, x_vals);
+  test.AddOutput<float>("Y", expected_dims, expected_vals);
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kTensorrtExecutionProvider});  // NOTE(review): presumably the list of providers to skip - confirm why CUDA is listed
+}
+
 TEST(PoolTest, MaxPool_10_Dilation_Ceil0_2d) {
   OpTester test("MaxPool", 10);
 
@@ -263,11 +309,10 @@ TEST(PoolTest, MaxPool_10_Dilation_Ceil0_2d) {
   test.AddAttribute("dilations", vector<int64_t>{2, 2});
 
   std::vector<float> x_vals = {
-	  1,  3,  2,  4, -1,
-	  5,  7,  6,  8, -2,
-	  9,  11, 10, 12, -3,
-	  13, 15, 14, 16, -4,
-      };
+      1, 3, 2, 4, -1,
+      5, 7, 6, 8, -2,
+      9, 11, 10, 12, -3,
+      13, 15, 14, 16, -4};
   std::vector<int64_t> x_dims = {1, 1, 4, 5};
   std::vector<int64_t> expected_dims = {1, 1, 1, 3};
   std::vector<float> expected_vals = {10, 12, 10};
@@ -288,11 +333,10 @@ TEST(PoolTest, MaxPool_10_Dilation_Ceil1_2d) {
   test.AddAttribute("ceil_mode", (int64_t)1);
 
   std::vector<float> x_vals = {
-	  1,  3,  2,  4, -1,
-	  5,  7,  6,  8, -2,
-	  9,  11, 10, 12, -3,
-	  13, 15, 14, 16, -4,
-      };
+      1, 3, 2, 4, -1,
+      5, 7, 6, 8, -2,
+      9, 11, 10, 12, -3,
+      13, 15, 14, 16, -4};
   std::vector<int64_t> x_dims = {1, 1, 4, 5};
   std::vector<int64_t> expected_dims = {1, 1, 2, 3};
   std::vector<float> expected_vals = {10, 12, 10, 10, 12, 10};
@@ -302,6 +346,41 @@ TEST(PoolTest, MaxPool_10_Dilation_Ceil1_2d) {
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
 }
 
+TEST(PoolTest, MaxPool_10_DilationPadding_3d) {  // opset-10 MaxPool on a 2x4x5 volume with dilation and explicit padding combined
+  OpTester test("MaxPool", 10);
+
+  test.AddAttribute("auto_pad", "");
+  test.AddAttribute("strides", std::vector<int64_t>{1, 1, 1});
+  test.AddAttribute("pads", vector<int64_t>{1, 1, 1, 1, 1, 1});  // one padded position on both ends of all three spatial axes
+  test.AddAttribute("kernel_shape", vector<int64_t>{2, 2, 2});
+  test.AddAttribute("dilations", vector<int64_t>{2, 2, 2});  // each window samples offsets 0 and 2 along every axis
+
+  std::vector<float> x_vals = {  // two identical 4x5 slices
+      1, 3, 2, 4, -1,
+      5, 7, 6, 8, -2,
+      9, 11, 10, 12, -3,
+      13, 15, 14, 16, -4,
+      1, 3, 2, 4, -1,
+      5, 7, 6, 8, -2,
+      9, 11, 10, 12, -3,
+      13, 15, 14, 16, -4};
+  std::vector<int64_t> x_dims = {1, 1, 2, 4, 5};
+  std::vector<int64_t> expected_dims = {1, 1, 2, 4, 5};  // per axis: (size + 2 - ((2 - 1) * 2 + 1)) / 1 + 1, i.e. unchanged
+  std::vector<float> expected_vals = {  // along the first spatial axis only one tap is in range, and both input slices match, so both output slices are equal
+      7, 6, 8, 6, 8,
+      11, 10, 12, 10, 12,
+      15, 14, 16, 14, 16,
+      11, 10, 12, 10, 12,
+      7, 6, 8, 6, 8,
+      11, 10, 12, 10, 12,
+      15, 14, 16, 14, 16,
+      11, 10, 12, 10, 12};
+
+  test.AddInput<float>("X", x_dims, x_vals);
+  test.AddOutput<float>("Y", expected_dims, expected_vals);
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kTensorrtExecutionProvider});  // NOTE(review): presumably the list of providers to skip - confirm why CUDA is listed
+}
+
 TEST(PoolTest, GlobalMaxPool) {
   OpTester test("GlobalMaxPool");
 
@@ -566,17 +645,16 @@ TEST(PoolTest, AveragePool_10_ceil1_2d) {
   test.AddAttribute("strides", std::vector<int64_t>{3, 1});
   test.AddAttribute("pads", vector<int64_t>{0, 0, 0, 0});
   test.AddAttribute("kernel_shape", vector<int64_t>{2, 2});
-  test.AddAttribute("ceil_mode", (int64_t) 1);
+  test.AddAttribute("ceil_mode", (int64_t)1);
 
   std::vector<float> x_vals = {
-	  1,  3,  2,  4,
-	  5,  7,  6,  8,
-	  9,  11, 10, 12,
-	  13, 15, 14, 16,
-      };
+      1, 3, 2, 4,
+      5, 7, 6, 8,
+      9, 11, 10, 12,
+      13, 15, 14, 16};
   std::vector<int64_t> x_dims = {1, 1, 4, 4};
   std::vector<int64_t> expected_dims = {1, 1, 2, 3};
-  std::vector<float> expected_vals = {4.0f, 4.5f, 5.0f , 14.0f, 14.5f, 15.0f};
+  std::vector<float> expected_vals = {4.0f, 4.5f, 5.0f, 14.0f, 14.5f, 15.0f};
 
   test.AddInput<float>("X", x_dims, x_vals);
   test.AddOutput<float>("Y", expected_dims, expected_vals);