From acfe7ac4cebf313662f396faacd93e2115bad8fb Mon Sep 17 00:00:00 2001
From: Zhang Lei <zhang.huanning@hotmail.com>
Date: Wed, 10 Mar 2021 10:02:01 -0800
Subject: [PATCH] Implement QLinearAveragePool with unit tests. (#6896)

Implement QLinearAveragePool with unit tests.
---
 .../contrib_ops/cpu/cpu_contrib_kernels.cc    |   2 +
 onnxruntime/contrib_ops/cpu/qlinear_pool.cc   | 327 ++++++++++++++++++
 onnxruntime/contrib_ops/cpu/qlinear_pool.h    |  27 ++
 onnxruntime/core/providers/cpu/nn/pool_base.h |   3 +-
 .../test/contrib_ops/qlinear_pool_test.cc     | 247 +++++++++++++
 5 files changed, 605 insertions(+), 1 deletion(-)
 create mode 100644 onnxruntime/contrib_ops/cpu/qlinear_pool.cc
 create mode 100644 onnxruntime/contrib_ops/cpu/qlinear_pool.h
 create mode 100644 onnxruntime/test/contrib_ops/qlinear_pool_test.cc
diff --git a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc
index 534d9c3243..89fb51fd2a 100644
--- a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc
+++ b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc
@@ -47,6 +47,7 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSExperimentalDoma
 // ******** Start: Quantization ******************* //
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, MatMulInteger16);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, QLinearGlobalAveragePool);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, QLinearAveragePool);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, DequantizeLinear);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, DequantizeLinear);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, QuantizeLinear);
@@ -131,6 +132,7 @@ Status RegisterQuantizationKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo<void>,  //default entry to avoid the list become empty after ops-reducing
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, MatMulInteger16)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, QLinearGlobalAveragePool)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, QLinearAveragePool)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, DequantizeLinear)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, DequantizeLinear)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, QuantizeLinear)>,
diff --git a/onnxruntime/contrib_ops/cpu/qlinear_pool.cc b/onnxruntime/contrib_ops/cpu/qlinear_pool.cc
new file mode 100644
index 0000000000..9923de913a
--- /dev/null
+++ b/onnxruntime/contrib_ops/cpu/qlinear_pool.cc
@@ -0,0 +1,327 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "qlinear_pool.h"
+
+#include "core/util/math_cpuonly.h"
+#include "core/providers/common.h"
+#include "core/platform/threadpool.h"
+#include "core/util/math.h"
+#include "core/mlas/inc/mlas.h"
+
+#include <functional>
+
+namespace onnxruntime {
+
+using concurrency::ThreadPool;
+
+namespace contrib {
+
+// Scalar linear-quantization helpers; only uint8_t specializations are provided here.
+template <typename T8Bits>
+static inline float dequantize_value(T8Bits x, float x_scale, T8Bits x_zero_point);
+template <typename T8Bits>
+static inline T8Bits quantize_value(float y, float y_scale, T8Bits y_zero_point);
+
+template <>  // real value = x_scale * (x - x_zero_point)
+inline float dequantize_value<uint8_t>(uint8_t x, float x_scale, uint8_t x_zero_point) {
+  return x_scale * static_cast<float>(static_cast<int>(x) - static_cast<int>(x_zero_point));
+}
+template <>  // rounds y / y_scale + y_zero_point with nearbyintf, then saturates to [0, 255]
+inline uint8_t quantize_value<uint8_t>(float y, float y_scale, uint8_t y_zero_point) {
+  const float rounded = std::nearbyintf(y / y_scale + y_zero_point);
+  return static_cast<uint8_t>(std::max(0.0f, std::min(rounded, 255.0f)));
+}
+
+// Pools one 1-D (batch, channel) slice of dequantized input at a time and requantizes each result.
+template <typename T8Bits, typename PoolType>
+struct QLinearPool1DTask final {
+  const float* X_data;                       // fp32 input, x_step elements per slice
+  T8Bits* Y_data;                            // quantized output, y_step elements per slice
+  float y_scale;                             // output quantization scale
+  T8Bits y_zero_point;                       // output quantization zero point
+  int64_t x_step;
+  int64_t y_step;
+  int64_t pooled_height;                     // number of output positions per slice
+  int64_t stride_h;
+  int64_t height;                            // input length per slice
+  const std::vector<int64_t>& kernel_shape;  // referenced, not copied
+  const std::vector<int64_t>& pads;          // pads[0] is the leading pad
+  const PoolProcessContext& pool_context_;
+  const PoolAttributes& pool_attrs_;
+
+  // The same estimate is used for every cost field: output positions times kernel size.
+  TensorOpCost Cost() {
+    const double work = static_cast<double>(pooled_height * kernel_shape[0]);
+    return TensorOpCost{work, work, work};
+  }
+
+  // Range entry point: handles slices [begin, end).
+  void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const {
+    for (int64_t slice = begin; slice < end; ++slice) {
+      (*this)(slice);
+    }
+  }
+
+  void operator()(std::ptrdiff_t c) const {  // pools slice c
+    const float* src = X_data + c * x_step;
+    T8Bits* dst = Y_data + c * y_step;
+    for (int64_t ph = 0; ph < pooled_height; ++ph) {
+      const int64_t raw_start = ph * stride_h - pads[0];  // negative while inside the leading pad
+      const int64_t hend = std::min(raw_start + kernel_shape[0], height);
+      const int64_t hstart = std::max(raw_start, static_cast<int64_t>(0));
+      float acc = PoolType::Initialize();
+      for (int64_t h = hstart; h < hend; ++h) {
+        PoolType::Process(src[h], acc, pool_context_);
+      }
+      // Finalize gets the full kernel size when padding is counted, else the in-bounds element count.
+      const int64_t count = pool_attrs_.count_include_pad ? kernel_shape[0] : hend - hstart;
+      PoolType::Finalize(count, acc, pool_context_);
+      dst[ph] = quantize_value(acc, y_scale, y_zero_point);
+    }
+  }
+};
+
+
+// Pools one 2-D (batch, channel) plane of dequantized input at a time and requantizes each result.
+template <typename T8Bits, typename PoolType>
+struct QLinearPool2DTask final {
+  const float* X_data;    // fp32 input, x_step elements per plane
+  T8Bits* Y_data;         // quantized output, y_step elements per plane
+  float y_scale;          // output quantization scale
+  T8Bits y_zero_point;    // output quantization zero point
+  int64_t x_step;
+  int64_t y_step;
+  int64_t pooled_height;  // output extents
+  int64_t pooled_width;
+  int64_t stride_h;
+  int64_t stride_w;
+  int64_t height;         // input extents
+  int64_t width;
+  const std::vector<int64_t>& kernel_shape;  // referenced, not copied
+  const std::vector<int64_t>& pads;          // pads[0] / pads[1]: leading pads of height / width
+  const PoolProcessContext& pool_context_;
+  const PoolAttributes& pool_attrs_;
+
+  // The same estimate is used for every cost field: output positions times kernel size.
+  TensorOpCost Cost() {
+    const double work = static_cast<double>(pooled_height * pooled_width * kernel_shape[0] * kernel_shape[1]);
+    return TensorOpCost{work, work, work};
+  }
+
+  // Range entry point: handles planes [begin, end).
+  void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const {
+    for (int64_t plane = begin; plane < end; ++plane) {
+      (*this)(plane);
+    }
+  }
+
+  void operator()(std::ptrdiff_t c) const {  // pools plane c
+    const float* src = X_data + c * x_step;
+    T8Bits* dst = Y_data + c * y_step;
+    for (int64_t ph = 0; ph < pooled_height; ++ph) {
+      const int64_t raw_hstart = ph * stride_h - pads[0];  // negative while inside the leading pad
+      const int64_t hend = std::min(raw_hstart + kernel_shape[0], height);
+      const int64_t hstart = std::max(raw_hstart, static_cast<int64_t>(0));
+      for (int64_t pw = 0; pw < pooled_width; ++pw) {
+        const int64_t raw_wstart = pw * stride_w - pads[1];
+        const int64_t wend = std::min(raw_wstart + kernel_shape[1], width);
+        const int64_t wstart = std::max(raw_wstart, static_cast<int64_t>(0));
+        float acc = PoolType::Initialize();
+        for (int64_t h = hstart; h < hend; ++h) {
+          const float* row = src + h * width;
+          for (int64_t w = wstart; w < wend; ++w) {
+            PoolType::Process(row[w], acc, pool_context_);
+          }
+        }
+        // Finalize gets the full kernel size when padding is counted, else the in-bounds element count.
+        const int64_t count = pool_attrs_.count_include_pad ? kernel_shape[0] * kernel_shape[1]
+                                                            : (hend - hstart) * (wend - wstart);
+        PoolType::Finalize(count, acc, pool_context_);
+        dst[ph * pooled_width + pw] = quantize_value(acc, y_scale, y_zero_point);
+      }
+    }
+  }
+};
+
+// Pools one 3-D (batch, channel) volume of dequantized input at a time and requantizes each result.
+template <typename T8Bits, typename PoolType>
+struct QLinearPool3DTask final {
+  const float* X_data;    // fp32 input, x_step elements per volume
+  T8Bits* Y_data;         // quantized output, y_step elements per volume
+  float y_scale;          // output quantization scale
+  T8Bits y_zero_point;    // output quantization zero point
+  int64_t x_step;
+  int64_t y_step;
+  int64_t pooled_height;  // output extents
+  int64_t pooled_width;
+  int64_t pooled_depth;
+  int64_t stride_h;
+  int64_t stride_w;
+  int64_t stride_d;
+  int64_t height;         // input extents
+  int64_t width;
+  int64_t depth;
+  const std::vector<int64_t>& kernel_shape;  // referenced, not copied
+  const std::vector<int64_t>& pads;          // pads[0..2]: leading pads of height / width / depth
+  const PoolProcessContext& pool_context_;
+  const PoolAttributes& pool_attrs_;
+
+  // The same estimate is used for every cost field: output positions times kernel size.
+  TensorOpCost Cost() {
+    const double work = static_cast<double>(pooled_height * pooled_width * pooled_depth * kernel_shape[0] *
+                                            kernel_shape[1] * kernel_shape[2]);
+    return TensorOpCost{work, work, work};
+  }
+
+  // Range entry point: handles volumes [begin, end).
+  void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const {
+    for (int64_t volume = begin; volume < end; ++volume) {
+      (*this)(volume);
+    }
+  }
+
+  void operator()(std::ptrdiff_t c) const {  // pools volume c
+    const float* src = X_data + c * x_step;
+    T8Bits* dst = Y_data + c * y_step;
+    for (int64_t ph = 0; ph < pooled_height; ++ph) {
+      const int64_t raw_hstart = ph * stride_h - pads[0];  // negative while inside the leading pad
+      const int64_t hend = std::min(raw_hstart + kernel_shape[0], height);
+      const int64_t hstart = std::max(raw_hstart, static_cast<int64_t>(0));
+      for (int64_t pw = 0; pw < pooled_width; ++pw) {
+        const int64_t raw_wstart = pw * stride_w - pads[1];
+        const int64_t wend = std::min(raw_wstart + kernel_shape[1], width);
+        const int64_t wstart = std::max(raw_wstart, static_cast<int64_t>(0));
+        for (int64_t pd = 0; pd < pooled_depth; ++pd) {
+          const int64_t raw_dstart = pd * stride_d - pads[2];
+          const int64_t dend = std::min(raw_dstart + kernel_shape[2], depth);
+          const int64_t dstart = std::max(raw_dstart, static_cast<int64_t>(0));
+          const int64_t pool_index = ph * pooled_width * pooled_depth + pw * pooled_depth + pd;
+          float acc = PoolType::Initialize();
+          for (int64_t h = hstart; h < hend; ++h) {
+            for (int64_t w = wstart; w < wend; ++w) {
+              const float* line = src + (h * width + w) * depth;
+              for (int64_t d = dstart; d < dend; ++d) {
+                PoolType::Process(line[d], acc, pool_context_);
+              }
+            }
+          }
+          // Finalize gets the full kernel size when padding is counted, else the in-bounds element count.
+          const int64_t count = pool_attrs_.count_include_pad
+                                    ? kernel_shape[0] * kernel_shape[1] * kernel_shape[2]
+                                    : (hend - hstart) * (wend - wstart) * (dend - dstart);
+          PoolType::Finalize(count, acc, pool_context_);
+          dst[pool_index] = quantize_value(acc, y_scale, y_zero_point);
+        }
+      }
+    }
+  }
+};
+
+// Runs QLinearAveragePool on a uint8 input: X is dequantized to fp32 once, each (batch, channel) slice
+// is pooled in parallel, and every result is requantized with y_scale / y_zero_point.
+// Inputs: X (0), x_scale (1), x_zero_point (2), y_scale (3), y_zero_point (4); an absent zero point is 0.
+// Returns an error Status if the input rank does not fit kernel_shape or the pooling is not 1-D to 3-D.
+Status QLinearAveragePool::Compute(OpKernelContext* context) const {
+  const auto* tensor_x_scale = context->Input<Tensor>(1);
+  const auto* tensor_x_zero_point = context->Input<Tensor>(2);
+  const auto* tensor_y_scale = context->Input<Tensor>(3);
+  const auto* tensor_y_zero_point = context->Input<Tensor>(4);
+
+  ORT_ENFORCE(IsScalarOr1ElementVector(tensor_x_scale),
+              "Input x_scale must be a scalar or 1D tensor of size 1");
+  ORT_ENFORCE(tensor_x_zero_point == nullptr || IsScalarOr1ElementVector(tensor_x_zero_point),
+              "input x_zero_point must be a scalar or 1D tensor of size 1 if given");
+  ORT_ENFORCE(IsScalarOr1ElementVector(tensor_y_scale),
+              "input y_scale must be a scalar or 1D tensor of size 1");
+  ORT_ENFORCE(tensor_y_zero_point == nullptr || IsScalarOr1ElementVector(tensor_y_zero_point),
+              "input y_zero_point must be a scalar or 1D tensor of size 1 if given");
+
+  const auto* X = context->Input<Tensor>(0);
+  const auto dtype = X->GetElementType();
+  if (dtype != ONNX_NAMESPACE::TensorProto_DataType_UINT8) {
+    ORT_THROW("Unsupported 'dtype' in QLinear Pooling:", dtype);
+  }
+  const TensorShape& x_shape = X->Shape();
+  const float x_scale = *(tensor_x_scale->Data<float>());
+  const float y_scale = *(tensor_y_scale->Data<float>());
+  const uint8_t x_zero_point = tensor_x_zero_point ? *(tensor_x_zero_point->Data<uint8_t>()) : uint8_t{0};
+  const uint8_t y_zero_point = tensor_y_zero_point ? *(tensor_y_zero_point->Data<uint8_t>()) : uint8_t{0};
+
+  ORT_RETURN_IF_NOT(x_shape.NumDimensions() >= 3, "Input dimension cannot be less than 3.");
+  std::vector<int64_t> pads = pool_attrs_.pads;  // copied: SetOutputSize is given a pointer to it
+  const std::vector<int64_t>& strides = pool_attrs_.strides;
+  const std::vector<int64_t>& kernel_shape = pool_attrs_.kernel_shape;
+  // x_shape[3..4] and output_dims[3..4] are indexed below according to kernel_shape.size(), so the input
+  // must have exactly one batch, one channel and kernel_shape.size() spatial dimensions.
+  ORT_RETURN_IF_NOT(x_shape.NumDimensions() == kernel_shape.size() + 2,
+                    "Input rank must equal the kernel_shape rank plus 2.");
+
+  std::vector<int64_t> output_dims = pool_attrs_.SetOutputSize(x_shape, x_shape[1], &pads);
+  Tensor* Y = context->Output(0, output_dims);
+
+  const auto* X_data = X->Data<uint8_t>();
+  auto* Y_data = Y->MutableData<uint8_t>();
+
+  const size_t spatial_rank = kernel_shape.size();
+  const int64_t channels = x_shape[1];
+  const int64_t height = x_shape[2];
+  const int64_t width = spatial_rank > 1 ? x_shape[3] : 1;
+  const int64_t depth = spatial_rank > 2 ? x_shape[4] : 1;
+  const int64_t pooled_height = output_dims[2];
+  const int64_t pooled_width = spatial_rank > 1 ? output_dims[3] : 1;
+  const int64_t pooled_depth = spatial_rank > 2 ? output_dims[4] : 1;
+  const int64_t total_channels = x_shape[0] * channels;  // one work item per (batch, channel) slice
+  const int64_t x_step = height * width * depth;
+  const int64_t y_step = pooled_height * pooled_width * pooled_depth;
+
+  ThreadPool* tp = context->GetOperatorThreadPool();
+  // Dequantize the whole input up front so the pooling tasks accumulate in fp32.
+  std::vector<float> x_data_fp32;
+  if (spatial_rank <= 3) {
+    x_data_fp32.resize(x_shape.Size());
+    ThreadPool::TryParallelFor(tp, x_shape.Size(), 1.0f, [=, &x_data_fp32](ptrdiff_t first, ptrdiff_t last) {
+      const auto* x8 = X_data + first;
+      float* x32 = x_data_fp32.data() + first;
+      for (ptrdiff_t i = 0, sz = last - first; i < sz; ++i) {
+        *x32++ = dequantize_value(x8[i], x_scale, x_zero_point);
+      }
+    });
+  }
+
+  switch (spatial_rank) {
+    case 1: {
+      QLinearPool1DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_1d = {
+          x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_step, y_step,
+          pooled_height, strides[0], height, kernel_shape, pads, pool_context_, pool_attrs_};
+      ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_1d.Cost(), avg_pool_task_1d);
+      break;
+    }
+    case 2: {
+      QLinearPool2DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_2d = {
+          x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_step, y_step,
+          pooled_height, pooled_width, strides[0], strides[1], height, width, kernel_shape, pads, pool_context_, pool_attrs_};
+      ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_2d.Cost(), avg_pool_task_2d);
+      break;
+    }
+    case 3: {
+      QLinearPool3DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_3d = {
+          x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_step, y_step,
+          pooled_height, pooled_width, pooled_depth, strides[0], strides[1], strides[2], height, width, depth,
+          kernel_shape, pads, pool_context_, pool_attrs_};
+      ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_3d.Cost(), avg_pool_task_3d);
+      break;
+    }
+    default:
+      return onnxruntime::common::Status(onnxruntime::common::ONNXRUNTIME, onnxruntime::common::INVALID_ARGUMENT,
+                                         "QLinear Pooling unsupported pooling size!");
+  }
+
+  return Status::OK();
+}
+
+ONNX_OPERATOR_KERNEL_EX(QLinearAveragePool, kMSDomain, 1, kCpuExecutionProvider, KernelDefBuilder(), QLinearAveragePool);  // no type constraints are set on the builder; Compute() rejects non-uint8 input
+
+}  // namespace contrib
+
+}  // namespace onnxruntime
diff --git a/onnxruntime/contrib_ops/cpu/qlinear_pool.h b/onnxruntime/contrib_ops/cpu/qlinear_pool.h
new file mode 100644
index 0000000000..13175052f0
--- /dev/null
+++ b/onnxruntime/contrib_ops/cpu/qlinear_pool.h
@@ -0,0 +1,27 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/common/common.h"
+#include "core/framework/op_kernel.h"
+#include "core/providers/cpu/nn/pool_base.h"
+
+namespace onnxruntime {
+namespace contrib {
+
+// CPU kernel for the QLinearAveragePool contrib op; pooling attributes are parsed by PoolBase.
+class QLinearAveragePool final : public OpKernel, public PoolBase {
+ public:
+  explicit QLinearAveragePool(const OpKernelInfo& info) : OpKernel(info), PoolBase(info) {}
+  ~QLinearAveragePool() override = default;
+
+  // Dequantizes the input, average-pools it and requantizes the result into output 0.
+  Status Compute(OpKernelContext* context) const override;
+
+ private:
+  PoolProcessContext pool_context_;  // default-constructed; handed to the pooling tasks in Compute
+};
+
+}  // namespace contrib
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cpu/nn/pool_base.h b/onnxruntime/core/providers/cpu/nn/pool_base.h
index 3d1edfbdaf..4e2c9910b9 100644
--- a/onnxruntime/core/providers/cpu/nn/pool_base.h
+++ b/onnxruntime/core/providers/cpu/nn/pool_base.h
@@ -106,7 +106,8 @@ class PoolBase {
 
  protected:
   PoolBase(const OpKernelInfo& info)
-      : op_name_(info.GetKernelDef().OpName()),
+      : op_name_(info.GetKernelDef().OpName().rfind("QLinear", 0) != 0 ?
+                     info.GetKernelDef().OpName() : info.GetKernelDef().OpName().substr(7)),
         pool_attrs_(info, op_name_, GetStartVersion(info)) {
   }
 
diff --git a/onnxruntime/test/contrib_ops/qlinear_pool_test.cc b/onnxruntime/test/contrib_ops/qlinear_pool_test.cc
new file mode 100644
index 0000000000..f248df5ca0
--- /dev/null
+++ b/onnxruntime/test/contrib_ops/qlinear_pool_test.cc
@@ -0,0 +1,247 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "gtest/gtest.h"
+#include "test/common/tensor_op_test_utils.h"
+#include "test/providers/provider_test_utils.h"
+#include "core/providers/common.h"
+
+namespace onnxruntime {
+namespace test {
+
+static inline float dequantize_u8(uint8_t x, float x_scale, uint8_t x_zero_point) {  // x_scale * (x - x_zero_point)
+  return x_scale * static_cast<float>(static_cast<int>(x) - static_cast<int>(x_zero_point));
+}
+static inline uint8_t quantize_u8(float y, float y_scale, uint8_t y_zero_point) {  // round, then saturate to [0, 255]
+  const float rounded = std::nearbyintf(y / y_scale + y_zero_point);
+  return static_cast<uint8_t>(std::max(0.0f, std::min(rounded, 255.0f)));
+}
+
+// Row-major counter over `dims`: pos_ is the current multi-dimensional position, index_ its flat offset.
+struct DimIterator {
+  DimIterator(const std::vector<int64_t>& dims) : dims_(dims) {
+    size_ = std::accumulate(dims_.begin(), dims_.end(), 1LL, std::multiplies<int64_t>());
+    restart();
+  }
+
+  // Rewinds to the first position; assign() overwrites existing entries, which resize() would keep.
+  void restart() {
+    pos_.assign(dims_.size(), 0LL);
+    index_ = 0LL;
+  }
+
+  bool has_next() { return index_ < size_; }
+
+  // Returns the flat index of the current position and advances pos_; returns -1 when exhausted.
+  int64_t next() {
+    if (!has_next()) {
+      return -1L;
+    }
+    for (size_t i = dims_.size(); i > 0;) {
+      --i;
+      if (++pos_[i] < dims_[i]) {
+        break;
+      }
+      pos_[i] = 0;  // wrapped: carry into the next outer dimension
+    }
+    return index_++;
+  }
+
+  const std::vector<int64_t> dims_;
+  std::vector<int64_t> pos_;
+  int64_t size_;
+  int64_t index_;
+};
+
+// Reference NCHW average pooling used to compute the expected output: every in-bounds window element
+// is dequantized and summed in fp32, and the mean is requantized. `pads[i]` is the leading pad of
+// spatial dimension i; padding positions add no value and enter the divisor only when
+// `count_include_pad` is non-zero. Vectors are taken by const reference to avoid copying them per call.
+static void CalculateAvgPoolNchwU8(
+    const uint8_t* x, const std::vector<int64_t>& x_dims, float x_scale, int x_zero_point,
+    uint8_t* y, const std::vector<int64_t>& y_dims, float y_scale, int y_zero_point,
+    const std::vector<int64_t>& kernel_shape, const std::vector<int64_t>& strides,
+    const std::vector<int64_t>& pads, const int64_t count_include_pad) {
+  const int64_t batch = y_dims[0];
+  const int64_t channel = y_dims[1];
+
+  const std::vector<int64_t> y_img_dims(y_dims.begin() + 2, y_dims.end());
+  const std::vector<int64_t> x_img_dims(x_dims.begin() + 2, x_dims.end());
+  // Row-major strides of the spatial dims; counting down from size() keeps the unsigned index from
+  // wrapping when there are no spatial dims.
+  std::vector<int64_t> x_img_strides(x_img_dims.size(), 1LL);
+  for (size_t i = x_img_dims.size(); i > 1;) {
+    --i;
+    x_img_strides[i - 1] = x_img_strides[i] * x_img_dims[i];
+  }
+
+  const int64_t y_step = std::accumulate(y_img_dims.begin(), y_img_dims.end(), 1LL, std::multiplies<int64_t>());
+  const int64_t x_step = std::accumulate(x_img_dims.begin(), x_img_dims.end(), 1LL, std::multiplies<int64_t>());
+  for (int64_t b = 0; b < batch; ++b) {
+    for (int64_t c = 0; c < channel; ++c) {
+      uint8_t* ybc = y + (b * channel + c) * y_step;
+      const uint8_t* xbc = x + (b * channel + c) * x_step;
+
+      DimIterator yit(y_img_dims);
+      while (yit.has_next()) {
+        // yit.pos_ is the output position being produced; next() is only called after it has been used.
+        std::vector<int64_t> kernel_topleft(y_img_dims.size(), 0);
+        for (size_t i = 0; i < y_img_dims.size(); ++i) {
+          kernel_topleft[i] = yit.pos_[i] * strides[i];
+        }
+
+        // Visit every kernel tap; taps that fall into the padding carry no value.
+        float y_value_sum = 0.0f;
+        int count = 0;
+        for (DimIterator kit(kernel_shape); kit.has_next(); kit.next()) {
+          int64_t kernel_offset = 0;  // flat input offset, or -1 once a coordinate lies in the padding
+          for (size_t i = 0; kernel_offset >= 0 && i < kernel_shape.size(); ++i) {
+            const int64_t x_real_dim = kernel_topleft[i] + kit.pos_[i] - pads[i];
+            if (x_real_dim >= 0 && x_real_dim < x_img_dims[i]) {
+              kernel_offset += x_real_dim * x_img_strides[i];
+            } else {
+              kernel_offset = -1LL;  // padding element
+            }
+          }
+          if (kernel_offset >= 0) {
+            y_value_sum += dequantize_u8(xbc[kernel_offset], x_scale, static_cast<uint8_t>(x_zero_point));
+            ++count;
+          } else if (count_include_pad) {
+            ++count;
+          }
+        }
+        // Store the mean at the flat index of the position just processed.
+        const int64_t y_offset = yit.next();
+        const uint8_t y_u8 = quantize_u8(y_value_sum / count, y_scale, static_cast<uint8_t>(y_zero_point));
+        ybc[y_offset] = y_u8;
+      }
+    }
+  }
+}
+
+// Quantizes random fp32 input, computes the expected output with CalculateAvgPoolNchwU8, then runs
+// the QLinearAveragePool contrib op and accepts results that differ from the reference by at most 1.
+void RunQLinearAveragePoolNchwU8(
+    const std::vector<int64_t>& x_dims, const std::vector<int64_t>& y_dims,
+    const std::vector<int64_t>& kernel_shape, const std::vector<int64_t>& strides,
+    const std::vector<int64_t>& pads, const int64_t count_include_pad = 0) {
+  float x_scale = 1.0f / 255.0f;
+  uint8_t x_zero_point = 128;
+  RandomValueGenerator random{};
+  std::vector<float> x_data_fp32 = random.Uniform<float>(x_dims, -0.5f, 0.5f);
+  std::vector<uint8_t> x_data(x_data_fp32.size());
+  for (size_t i = 0; i < x_data.size(); ++i) {
+    x_data[i] = quantize_u8(x_data_fp32[i], x_scale, x_zero_point);
+  }
+
+  float y_scale = 1.0f / 255.0f;
+  uint8_t y_zero_point = 100;
+  int64_t y_size = std::accumulate(y_dims.begin(), y_dims.end(), 1LL, std::multiplies<int64_t>());
+  std::vector<uint8_t> y_data(y_size);
+  CalculateAvgPoolNchwU8(
+      x_data.data(), x_dims, x_scale, x_zero_point,
+      y_data.data(), y_dims, y_scale, y_zero_point,
+      kernel_shape, strides, pads, count_include_pad);
+
+  OpTester test("QLinearAveragePool", 1, onnxruntime::kMSDomain);
+
+  test.AddAttribute("auto_pad", "");
+  test.AddAttribute("strides", strides);
+  test.AddAttribute("pads", pads);
+  test.AddAttribute("kernel_shape", kernel_shape);
+  test.AddAttribute("count_include_pad", count_include_pad);
+
+  test.AddInput<uint8_t>("X", x_dims, x_data);
+  test.AddInput<float>("x_scale", {}, {x_scale});
+  test.AddInput<uint8_t>("x_zero_point", {}, {x_zero_point});
+  test.AddInput<float>("y_scale", {}, {y_scale});
+  test.AddInput<uint8_t>("y_zero_point", {}, {y_zero_point});
+  test.AddOutput<uint8_t>("Y", y_dims, y_data);
+
+  // Custom verifier: the shape must match exactly, each element may be off by one quantization step.
+  auto q8checker = [&](const std::vector<OrtValue>& fetches, const std::string& provider_type) {
+    const OrtValue& ort_value = fetches[0];
+    if (ort_value.Fence()) {
+      ort_value.Fence()->BeforeUsingAsInput(onnxruntime::kCpuExecutionProvider, 0);
+    }
+
+    auto y_shape = TensorShape(y_dims);
+    const Tensor& output_tensor = ort_value.Get<Tensor>();
+    ORT_ENFORCE(y_shape == output_tensor.Shape(),
+                "Expected output shape [" + y_shape.ToString() + "] did not match run output shape [" +
+                    output_tensor.Shape().ToString() + "] for Y @" + provider_type);
+    auto* output = output_tensor.Data<uint8_t>();
+    auto size = static_cast<int>(output_tensor.Shape().Size());
+    for (int i = 0; i < size; ++i) {
+      int diff = abs(y_data[i] - output[i]);
+      EXPECT_LE(diff, 1) << "i:" << i << " expected:" << static_cast<int>(y_data[i])
+                         << ", got:" << static_cast<int>(output[i]) << ", provider_type: " << provider_type;
+    }
+  };
+  test.SetCustomOutputVerifier(q8checker);
+
+  test.Run();
+}
+
+TEST(QLinearPoolTest, AveragePool1D_ExcludePadPixel) {
+  RunQLinearAveragePoolNchwU8(
+      {1, 1, 5},  // x shape (N, C, H)
+      {1, 1, 6},  // expected y shape: H_out = (5 + 1 + 2 - 3) / 1 + 1 = 6
+      {3},        // kernel shape
+      {1},        // strides
+      {1, 2},     // pads: leading 1, trailing 2
+      0);         // count_include_pad: padding is left out of the divisor
+}
+
+TEST(QLinearPoolTest, AveragePool1D_IncludePadPixel) {
+  RunQLinearAveragePoolNchwU8(
+      {1, 1, 5},  // x shape (N, C, H)
+      {1, 1, 6},  // expected y shape: H_out = (5 + 1 + 2 - 3) / 1 + 1 = 6
+      {3},        // kernel shape
+      {1},        // strides
+      {1, 2},     // pads: leading 1, trailing 2
+      1);         // count_include_pad: padding positions count toward the divisor
+}
+
+TEST(QLinearPoolTest, AveragePool2D_ExcludePadPixel) {
+  RunQLinearAveragePoolNchwU8(
+      {1, 1, 5, 7},  // x shape (N, C, H, W)
+      {1, 1, 6, 4},  // expected y shape: H (5 + 1 + 2 - 3) / 1 + 1 = 6, W (7 + 3 + 1 - 4) / 2 + 1 = 4
+      {3, 4},        // kernel shape
+      {1, 2},        // strides
+      {1, 3, 2, 1},  // pads: leading (H 1, W 3), then trailing (H 2, W 1)
+      0);            // count_include_pad: padding is left out of the divisor
+}
+
+TEST(QLinearPoolTest, AveragePool2D_IncludePadPixel) {
+  RunQLinearAveragePoolNchwU8(
+      {1, 1, 5, 7},  // x shape (N, C, H, W)
+      {1, 1, 6, 4},  // expected y shape: H (5 + 1 + 2 - 3) / 1 + 1 = 6, W (7 + 3 + 1 - 4) / 2 + 1 = 4
+      {3, 4},        // kernel shape
+      {1, 2},        // strides
+      {1, 3, 2, 1},  // pads: leading (H 1, W 3), then trailing (H 2, W 1)
+      1);            // count_include_pad: padding positions count toward the divisor
+}
+
+TEST(QLinearPoolTest, AveragePool3D_ExcludePadPixel) {
+  RunQLinearAveragePoolNchwU8(
+      {1, 1, 5, 7, 9},     // x shape (N, C, H, W, D)
+      {1, 1, 6, 4, 3},     // expected y shape: H 6, W 4 as in the 2-D case, D (9 + 2 + 2 - 5) / 3 + 1 = 3
+      {3, 4, 5},           // kernel shape
+      {1, 2, 3},           // strides
+      {1, 3, 2, 2, 1, 2},  // pads: leading (H 1, W 3, D 2), then trailing (H 2, W 1, D 2)
+      0);                  // count_include_pad: padding is left out of the divisor
+}
+
+TEST(QLinearPoolTest, AveragePool3D_IncludePadPixel) {
+  RunQLinearAveragePoolNchwU8(
+      {1, 1, 5, 7, 9},     // x shape (N, C, H, W, D)
+      {1, 1, 6, 4, 3},     // expected y shape: H 6, W 4 as in the 2-D case, D (9 + 2 + 2 - 5) / 3 + 1 = 3
+      {3, 4, 5},           // kernel shape
+      {1, 2, 3},           // strides
+      {1, 3, 2, 2, 1, 2},  // pads: leading (H 1, W 3, D 2), then trailing (H 2, W 1, D 2)
+      1);                  // count_include_pad: padding positions count toward the divisor
+}
+
+}  // namespace test
+}  // namespace onnxruntime