From 50c5edcf13ac6db8e6bfcaf415fee87715db70ef Mon Sep 17 00:00:00 2001
From: Zhang Lei <zhang.huanning@hotmail.com>
Date: Thu, 13 May 2021 22:05:30 -0700
Subject: [PATCH] Add nhwc support for QLinearAveragePool operator (#7656)

* Add nhwc support for QLinearAveragePool operator

* Update ContribOperators.md

* Update OperatorKernels.md with cpu, dnnl and cuda enabled.
---
 docs/ContribOperators.md                      |   2 +
 docs/OperatorKernels.md                       |  10 +
 onnxruntime/contrib_ops/cpu/qlinear_pool.cc   | 388 ++++++++++++++++--
 onnxruntime/contrib_ops/cpu/qlinear_pool.h    |   8 +-
 .../graph/contrib_ops/nhwc_schema_defs.cc     | 152 ++++++-
 .../graph/contrib_ops/quantization_defs.cc    | 149 +------
 .../graph/contrib_ops/quantization_defs.h     |  19 +
 .../core/optimizer/nhwc_transformer.cc        |   3 +-
 .../test/contrib_ops/qlinear_pool_test.cc     | 162 ++++++++
 .../test/optimizer/nhwc_transformer_test.cc   |  45 ++
 10 files changed, 741 insertions(+), 197 deletions(-)
 create mode 100644 onnxruntime/core/graph/contrib_ops/quantization_defs.h
diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md
index 4e18737a31..f317e45b0a 100644
--- a/docs/ContribOperators.md
+++ b/docs/ContribOperators.md
@@ -1784,6 +1784,8 @@ This version of the operator has been available since version 1 of the 'com.micr
 <dd>auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where default value is NOTSET, which means explicit padding is used. SAME_UPPER or SAME_LOWER mean pad the input so that the output spatial size match the input.In case of odd number add the extra padding at the end for SAME_UPPER and at the beginning for SAME_LOWER. VALID mean no padding.</dd>
 <dt><tt>ceil_mode</tt> : int</dt>
 <dd>Whether to use ceil or floor (default) to compute the output shape.</dd>
+<dt><tt>channels_last</tt> : int</dt>
+<dd>Works on NHWC layout or not? Default not.</dd>
 <dt><tt>count_include_pad</tt> : int</dt>
 <dd>Whether include pad pixels when calculating values for the edges. Default is 0, doesn't count include pad.</dd>
 <dt><tt>kernel_shape</tt> : list of ints (required)</dt>
diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
index 7ad397f6ef..2b2fa79019 100644
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@@ -687,3 +687,13 @@
 |TransposeMatMul|(*in* A:**T**, *in* B:**T**, *out* Y:**T**)|1+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
 | |
 | |
+
+
+## Operators implemented by DnnlExecutionProvider
+
+| Op Name | Parameters | OpSet Version | Types Supported |
+|---------|------------|---------------|-----------------|
+|**Operator Domain:** *ai.onnx.ml*||||
+|Gemm|(*in* A:**T**, *in* B:**T**, *in* C:**T**, *out* Y:**T**)|7+|**T** = tensor(float)|
+| |
+| |
diff --git a/onnxruntime/contrib_ops/cpu/qlinear_pool.cc b/onnxruntime/contrib_ops/cpu/qlinear_pool.cc
index 9923de913a..0d39e85797 100644
--- a/onnxruntime/contrib_ops/cpu/qlinear_pool.cc
+++ b/onnxruntime/contrib_ops/cpu/qlinear_pool.cc
@@ -25,22 +25,33 @@ static inline T8Bits quantize_value(float y, float y_scale, T8Bits y_zero_point)
 
 template <>
 inline float dequantize_value<uint8_t>(uint8_t x, float x_scale, uint8_t x_zero_point) {
-    return x_scale * (static_cast<int>(x) - x_zero_point);
+  return x_scale * (static_cast<int>(x) - x_zero_point);
 }
 
 template <>
 inline uint8_t quantize_value<uint8_t>(float y, float y_scale, uint8_t y_zero_point) {
-    return static_cast<uint8_t>(std::max(0.0f, std::min(std::nearbyintf(y / y_scale + y_zero_point), 255.0f)));
+  return static_cast<uint8_t>(std::max(0.0f, std::min(std::nearbyintf(y / y_scale + y_zero_point), 255.0f)));
 }
 
+static void SwitchDimsNchwNhwc(std::vector<int64_t>& dims, bool from_nchw_to_nhwc) {  // In-place move of the channel dim: index 1 -> last when true, last -> index 1 when false.
+  if (from_nchw_to_nhwc) {
+    int64_t channel = dims[1];  // unchecked read; assumes dims.size() >= 2 - TODO confirm callers guarantee this
+    dims.erase(dims.begin() + 1);  // the dims after index 1 shift left by one
+    dims.push_back(channel);
+  } else {
+    int64_t channel = dims.back();  // unchecked read; assumes dims is non-empty - TODO confirm
+    dims.insert(dims.begin() + 1, channel);  // a copy of the channel dim goes to index 1 first...
+    dims.pop_back();  // ...then the original trailing entry is dropped
+  }
+}
 template <typename T8Bits, typename PoolType>
 struct QLinearPool1DTask final {
   const float* X_data;
   T8Bits* Y_data;
   float y_scale;
   T8Bits y_zero_point;
-  int64_t x_step;
-  int64_t y_step;
+  int64_t x_image_size;
+  int64_t y_image_size;
   int64_t pooled_height;
   int64_t stride_h;
   int64_t height;
@@ -61,8 +72,8 @@ struct QLinearPool1DTask final {
   }
 
   void operator()(std::ptrdiff_t c) const {
-    const float* x_d = X_data + c * x_step;
-    T8Bits* y_d = Y_data + c * y_step;
+    const float* x_d = X_data + c * x_image_size;
+    T8Bits* y_d = Y_data + c * y_image_size;
 
     for (int64_t ph = 0; ph < pooled_height; ++ph) {
       int64_t hstart = ph * stride_h - pads[0];
@@ -82,6 +93,67 @@ struct QLinearPool1DTask final {
   }
 };
 
+template <typename T8Bits, typename PoolType>
+struct QLinearPoolNhwc1DTask final {  // 1-D pooling task over channel-innermost data: reads float X_data, writes quantized T8Bits Y_data.
+  const float* X_data;
+  T8Bits* Y_data;
+  float y_scale;
+  T8Bits y_zero_point;
+  int64_t channels;
+  int64_t pooled_height;
+  int64_t stride_h;
+  int64_t height;
+  const std::vector<int64_t>& kernel_shape;  // held by reference: the referenced vector must outlive the task
+  const std::vector<int64_t>& pads;
+  const PoolProcessContext& pool_context_;
+  const PoolAttributes& pool_attrs_;
+
+  TensorOpCost Cost() {
+    double loop_count = static_cast<double>(channels * kernel_shape[0]);  // work per output position: one kernel window over every channel
+    return TensorOpCost{loop_count, loop_count, loop_count};  // the same estimate is used for all three TensorOpCost fields
+  }
+
+  void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const {  // [begin, end) is a range of flattened (batch, output position) indices
+    int64_t y_image_size = pooled_height;  // output positions per batch item
+    int64_t batch = begin / y_image_size;
+    int64_t offset = begin % y_image_size;
+
+    for (int64_t remains = end - begin; remains > 0; offset = 0, batch++) {  // split the range at batch boundaries; later batches start at offset 0
+      if (offset + remains <= y_image_size) {
+        operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(offset + remains));
+        remains = 0;
+      } else {
+        operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(y_image_size));
+        remains -= (y_image_size - offset);
+      }
+    }
+  }
+
+  void operator()(std::ptrdiff_t batch, std::ptrdiff_t begin, std::ptrdiff_t end) const {  // pools output positions [begin, end) of one batch item
+    const float* x_d = X_data + batch * height * channels;  // each batch item spans height * channels input floats
+    T8Bits* y_d = Y_data + batch * pooled_height * channels;
+    std::vector<float> Yh(channels, PoolType::Initialize());  // one accumulator per channel; re-filled at the top of every iteration below
+
+    for (int64_t ph = begin, phc = begin * channels; ph < end; ++ph, phc += channels) {  // phc == ph * channels, the output element offset
+      int64_t hstart = ph * stride_h - pads[0];  // may be negative when the window overlaps the leading pad
+      int64_t hend = std::min(hstart + kernel_shape[0], height);  // clip the window end to the input
+      hstart = std::max(hstart, static_cast<int64_t>(0));  // clip the window start to the input
+
+      std::fill(Yh.begin(), Yh.end(), PoolType::Initialize());
+      for (int64_t h = hstart, hc = hstart * channels; h < hend; ++h, hc += channels) {  // hc == h * channels, the input element offset
+        for (int64_t c = 0; c < channels; ++c) {
+          PoolType::Process(x_d[hc + c], Yh[c], pool_context_);
+        }
+      }
+
+      int64_t element_count = (pool_attrs_.count_include_pad) ? kernel_shape[0] : hend - hstart;  // full kernel length, or only the clipped (unpadded) window length
+      for (int64_t c = 0; c < channels; ++c) {
+        PoolType::Finalize(element_count, Yh[c], pool_context_);
+        y_d[phc + c] = quantize_value(Yh[c], y_scale, y_zero_point);  // store the finalized value quantized with the output scale / zero point
+      }
+    }
+  }
+};
 
 template <typename T8Bits, typename PoolType>
 struct QLinearPool2DTask final {
@@ -89,8 +161,8 @@ struct QLinearPool2DTask final {
   T8Bits* Y_data;
   float y_scale;
   T8Bits y_zero_point;
-  int64_t x_step;
-  int64_t y_step;
+  int64_t x_image_size;
+  int64_t y_image_size;
   int64_t pooled_height;
   int64_t pooled_width;
   int64_t stride_h;
@@ -114,8 +186,8 @@ struct QLinearPool2DTask final {
   }
 
   void operator()(std::ptrdiff_t c) const {
-    const float* x_d = X_data + c * x_step;
-    T8Bits* y_d = Y_data + c * y_step;
+    const float* x_d = X_data + c * x_image_size;
+    T8Bits* y_d = Y_data + c * y_image_size;
 
     for (int64_t ph = 0; ph < pooled_height; ++ph) {
       int64_t hstart = ph * stride_h - pads[0];
@@ -144,14 +216,105 @@ struct QLinearPool2DTask final {
   }
 };
 
+template <typename T8Bits, typename PoolType>
+struct QLinearPoolNhwc2DTask final {  // 2-D pooling task over channel-innermost data: reads float X_data, writes quantized T8Bits Y_data.
+  const float* X_data;
+  T8Bits* Y_data;
+  float y_scale;
+  T8Bits y_zero_point;
+  int64_t x_image_size;  // multiplied by channels below to step one batch item through X_data
+  int64_t y_image_size;  // output positions per batch item; also the batch stride (times channels) in Y_data
+  int64_t kernel_size;  // element count used when count_include_pad is set
+  int64_t channels;
+  int64_t pooled_height;
+  int64_t pooled_width;
+  int64_t stride_h;
+  int64_t stride_w;
+  int64_t height;
+  int64_t width;
+  const std::vector<int64_t>& kernel_shape;  // held by reference: the referenced vector must outlive the task
+  const std::vector<int64_t>& pads;
+  const PoolProcessContext& pool_context_;
+  const PoolAttributes& pool_attrs_;
+
+  TensorOpCost Cost() {
+    double loop_count = static_cast<double>(channels * kernel_size);  // work per output position: one kernel window over every channel
+    return TensorOpCost{loop_count, loop_count, loop_count};  // the same estimate is used for all three TensorOpCost fields
+  }
+
+  void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const {  // [begin, end) is a range of flattened (batch, output position) indices
+    int64_t batch = begin / y_image_size;
+    int64_t offset = begin % y_image_size;
+
+    for (int64_t remains = end - begin; remains > 0; offset = 0, batch++) {  // split the range at batch boundaries; later batches start at offset 0
+      if (offset + remains <= y_image_size) {
+        operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(offset + remains));
+        remains = 0;
+      } else {
+        operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(y_image_size));
+        remains -= (y_image_size - offset);
+      }
+    }
+  }
+
+  void operator()(std::ptrdiff_t batch, std::ptrdiff_t begin, std::ptrdiff_t end) const {  // pools output positions [begin, end) of one batch item
+    const float* x_d = X_data + batch * x_image_size * channels;
+    T8Bits* y_d = Y_data + batch * y_image_size * channels;
+
+    // Calculate starting pooled_h and pooled_w from the flattened output position
+    int64_t start_pw = begin;
+    int64_t start_ph = start_pw / pooled_width;
+    start_pw -= (start_ph * pooled_width);
+
+    int64_t pool_index = channels * begin;  // element offset of the first output position written
+    int64_t remains = end - begin;  // output positions still to produce
+    std::vector<float> Yh(channels);  // one accumulator per channel
+
+    for (int64_t ph = start_ph; remains > 0 && ph < pooled_height; ++ph) {
+      int64_t hstart = ph * stride_h - pads[0];  // may be negative when the window overlaps the leading pad
+      int64_t hend = std::min(hstart + kernel_shape[0], height);
+      hstart = std::max(hstart, static_cast<int64_t>(0));
+      for (int64_t pw = start_pw; remains > 0 && pw < pooled_width; ++pw) {
+        int64_t wstart = pw * stride_w - pads[1];
+        int64_t wend = std::min(wstart + kernel_shape[1], width);
+        wstart = std::max(wstart, static_cast<int64_t>(0));
+
+        // do the pooling here
+        float pool_init_value = PoolType::Initialize();
+        std::fill(Yh.data(), Yh.data() + channels, pool_init_value);
+        for (int64_t h = hstart; h < hend; ++h) {
+          int64_t input_index = channels * (h * width + wstart);  // input is indexed as (h, w, c) with c innermost
+          for (int64_t w = wstart; w < wend; ++w) {
+            for (int64_t c = 0; c < channels; c++) {
+              PoolType::Process(x_d[input_index + c], Yh[c], pool_context_);
+            }
+            input_index += channels;
+          }
+        }
+
+        int64_t elements_count = (pool_attrs_.count_include_pad) ? kernel_size : (hend - hstart) * (wend - wstart);  // full kernel size, or only the clipped (unpadded) window area
+        for (int64_t c = 0; c < channels; c++) {
+          PoolType::Finalize(elements_count, Yh[c], pool_context_);
+          auto y_value = quantize_value(Yh[c], y_scale, y_zero_point);
+          y_d[pool_index + c] = y_value;
+        }
+
+        pool_index += channels;
+        remains--;
+      }
+      start_pw = 0;  // only the first row starts mid-row; later rows start at column 0
+    }
+  }
+};
+
 template <typename T8Bits, typename PoolType>
 struct QLinearPool3DTask final {
   const float* X_data;
   T8Bits* Y_data;
   float y_scale;
   T8Bits y_zero_point;
-  int64_t x_step;
-  int64_t y_step;
+  int64_t x_image_size;
+  int64_t y_image_size;
   int64_t pooled_height;
   int64_t pooled_width;
   int64_t pooled_depth;
@@ -179,8 +342,8 @@ struct QLinearPool3DTask final {
   }
 
   void operator()(std::ptrdiff_t c) const {
-    const float* x_d = X_data + c * x_step;
-    T8Bits* y_d = Y_data + c * y_step;
+    const float* x_d = X_data + c * x_image_size;
+    T8Bits* y_d = Y_data + c * y_image_size;
 
     for (int64_t ph = 0; ph < pooled_height; ++ph) {
       int64_t hstart = ph * stride_h - pads[0];
@@ -218,6 +381,110 @@ struct QLinearPool3DTask final {
   }
 };
 
+template <typename T8Bits, typename PoolType>
+struct QLinearPoolNhwc3DTask final {  // 3-D pooling task over channel-innermost data: reads float X_data, writes quantized T8Bits Y_data.
+  const float* X_data;
+  T8Bits* Y_data;
+  float y_scale;
+  T8Bits y_zero_point;
+  int64_t x_image_size;  // multiplied by channels below to step one batch item through X_data
+  int64_t y_image_size;  // output positions per batch item; also the batch stride (times channels) in Y_data
+  int64_t kernel_size;  // element count used when count_include_pad is set
+  int64_t channels;
+  int64_t pooled_height;
+  int64_t pooled_width;
+  int64_t pooled_depth;
+  int64_t stride_h;
+  int64_t stride_w;
+  int64_t stride_d;
+  int64_t height;
+  int64_t width;
+  int64_t depth;
+  const std::vector<int64_t>& kernel_shape;  // held by reference: the referenced vector must outlive the task
+  const std::vector<int64_t>& pads;
+  const PoolProcessContext& pool_context_;
+  const PoolAttributes& pool_attrs_;
+
+  TensorOpCost Cost() {
+    double loop_count = static_cast<double>(channels * kernel_size);  // work per output position: one kernel window over every channel
+    return TensorOpCost{loop_count, loop_count, loop_count};  // the same estimate is used for all three TensorOpCost fields
+  }
+
+  void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const {  // [begin, end) is a range of flattened (batch, output position) indices
+    int64_t batch = begin / y_image_size;
+    int64_t offset = begin % y_image_size;
+
+    for (int64_t remains = end - begin; remains > 0; offset = 0, batch++) {  // split the range at batch boundaries; later batches start at offset 0
+      if (offset + remains <= y_image_size) {
+        operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(offset + remains));
+        remains = 0;
+      } else {
+        operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(y_image_size));
+        remains -= (y_image_size - offset);
+      }
+    }
+  }
+
+  void operator()(std::ptrdiff_t batch, std::ptrdiff_t begin, std::ptrdiff_t end) const {  // pools output positions [begin, end) of one batch item
+    const float* x_d = X_data + batch * x_image_size * channels;
+    T8Bits* y_d = Y_data + batch * y_image_size * channels;
+
+    // Calculate starting pooled_h, pooled_w, pooled_d
+    int64_t start_pd = begin;
+    int64_t start_ph = start_pd / (pooled_width * pooled_depth);
+    start_pd = start_pd - (start_ph * pooled_width * pooled_depth);
+    int64_t start_pw = start_pd / pooled_depth;
+    start_pd = start_pd - start_pw * pooled_depth;
+    int64_t pool_index = channels * begin;  // element offset of the first output position written
+    int64_t remains = end - begin;  // output positions still to produce
+
+    std::vector<float> Yh(channels);  // one accumulator per channel
+
+    for (int64_t ph = start_ph; remains > 0 && ph < pooled_height; ++ph) {
+      int64_t hstart = ph * stride_h - pads[0];  // may be negative when the window overlaps the leading pad
+      int64_t hend = std::min(hstart + kernel_shape[0], height);
+      hstart = std::max(hstart, static_cast<int64_t>(0));
+      for (int64_t pw = start_pw; remains > 0 && pw < pooled_width; ++pw) {
+        int64_t wstart = pw * stride_w - pads[1];
+        int64_t wend = std::min(wstart + kernel_shape[1], width);
+        wstart = std::max(wstart, static_cast<int64_t>(0));
+        for (int64_t pd = start_pd; remains > 0 && pd < pooled_depth; ++pd) {
+          int64_t dstart = pd * stride_d - pads[2];
+          int64_t dend = std::min(dstart + kernel_shape[2], depth);
+          dstart = std::max(dstart, static_cast<int64_t>(0));
+
+          // do the pooling here
+          std::fill(Yh.begin(), Yh.end(), PoolType::Initialize());
+          for (int64_t h = hstart; h < hend; ++h) {
+            const int64_t input_index_h = h * width * depth;
+            for (int64_t w = wstart; w < wend; ++w) {
+              int64_t input_index = channels * (input_index_h + w * depth + dstart);  // input is indexed as (h, w, d, c) with c innermost
+              for (int64_t d = dstart; d < dend; ++d) {
+                for (int64_t c = 0; c < channels; c++) {
+                  PoolType::Process(x_d[input_index + c], Yh[c], pool_context_);
+                }
+                input_index += channels;
+              }
+            }
+          }
+
+          int64_t elements_count = (pool_attrs_.count_include_pad) ? kernel_size : (hend - hstart) * (wend - wstart) * (dend - dstart);  // full kernel size, or only the clipped (unpadded) window volume
+          for (int64_t c = 0; c < channels; c++) {
+            PoolType::Finalize(elements_count, Yh[c], pool_context_);
+            auto y_value = quantize_value(Yh[c], y_scale, y_zero_point);
+            y_d[pool_index + c] = y_value;
+          }
+
+          pool_index += channels;
+          remains--;
+        }
+        start_pd = 0;  // only the first (ph, pw) line starts mid-way along depth
+      }
+      start_pw = 0;  // only the first row starts mid-row; later rows start at column 0
+    }
+  }
+};
+
 Status QLinearAveragePool::Compute(OpKernelContext* context) const {
   const auto tensor_x_scale = context->Input<Tensor>(1);
   const auto tensor_x_zero_point = context->Input<Tensor>(2);
@@ -236,9 +503,10 @@ Status QLinearAveragePool::Compute(OpKernelContext* context) const {
   const auto* X = context->Input<Tensor>(0);
   auto dtype = X->GetElementType();
   if (dtype != ONNX_NAMESPACE::TensorProto_DataType_UINT8) {
-      ORT_THROW("Unsupported 'dtype' in QLinear Pooling:", dtype);
+    ORT_THROW("Unsupported 'dtype' in QLinear Pooling:", dtype);
   }
-  const TensorShape& x_shape = X->Shape();
+
+  TensorShape x_shape = X->Shape();
   const float x_scale = *(tensor_x_scale->Data<float>());
   const float y_scale = *(tensor_y_scale->Data<float>());
   uint8_t x_zero_point = (tensor_x_zero_point ? *(tensor_x_zero_point->Data<uint8_t>()) : (uint8_t)0);
@@ -249,12 +517,14 @@ Status QLinearAveragePool::Compute(OpKernelContext* context) const {
   std::vector<int64_t> strides = pool_attrs_.strides;
   std::vector<int64_t> kernel_shape = pool_attrs_.kernel_shape;
 
+  if (channels_last_) {
+    std::vector<int64_t> x_dims = x_shape.GetDims();
+    SwitchDimsNchwNhwc(x_dims, false);
+    x_shape = TensorShape(x_dims);
+  }
   std::vector<int64_t> output_dims = pool_attrs_.SetOutputSize(x_shape, x_shape[1], &pads);
-  Tensor* Y = context->Output(0, output_dims);
-
-  const auto* X_data = X->Data<uint8_t>();
-  auto* Y_data = Y->MutableData<uint8_t>();
 
+  int64_t batch_count = x_shape[0];
   const int64_t channels = x_shape[1];
   const int64_t height = x_shape[2];
   const int64_t width = kernel_shape.size() > 1 ? x_shape[3] : 1;
@@ -262,9 +532,17 @@ Status QLinearAveragePool::Compute(OpKernelContext* context) const {
   const int64_t pooled_height = output_dims[2];
   const int64_t pooled_width = kernel_shape.size() > 1 ? output_dims[3] : 1;
   const int64_t pooled_depth = kernel_shape.size() > 2 ? output_dims[4] : 1;
-  const int64_t total_channels = x_shape[0] * channels;
-  const int64_t x_step = height * width * depth;
-  const int64_t y_step = pooled_height * pooled_width * pooled_depth;
+  const int64_t total_channels = batch_count * channels;
+  const int64_t x_image_size = height * width * depth;
+  const int64_t y_image_size = pooled_height * pooled_width * pooled_depth;
+  const int64_t kernel_size = std::accumulate(kernel_shape.begin(), kernel_shape.end(), 1LL, std::multiplies<int64_t>());
+
+  if (channels_last_) {
+    SwitchDimsNchwNhwc(output_dims, true);
+  }
+  Tensor* Y = context->Output(0, output_dims);
+  const auto* X_data = X->Data<uint8_t>();
+  auto* Y_data = Y->MutableData<uint8_t>();
 
   ThreadPool* tp = context->GetOperatorThreadPool();
   std::vector<float> x_data_fp32;
@@ -274,42 +552,62 @@ Status QLinearAveragePool::Compute(OpKernelContext* context) const {
       const auto* x8 = X_data + first;
       float* x32 = x_data_fp32.data() + first;
       for (ptrdiff_t i = 0, sz = last - first; i < sz; ++i) {
-          *x32++ = dequantize_value(x8[i], x_scale, x_zero_point);
+        *x32++ = dequantize_value(x8[i], x_scale, x_zero_point);
       }
     });
   }
 
   switch (kernel_shape.size()) {
-    case 1:
-    {
-      QLinearPool1DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_1d = {
-          x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_step, y_step,
-          pooled_height, strides[0], height, kernel_shape, pads, pool_context_, pool_attrs_};
-      ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_1d.Cost(), avg_pool_task_1d);
+    case 1: {
+      if (channels_last_) {
+        QLinearPoolNhwc1DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_1d = {
+            x_data_fp32.data(), Y_data, y_scale, y_zero_point, channels,
+            pooled_height, strides[0], height, kernel_shape, pads, pool_context_, pool_attrs_};
+        ThreadPool::TryParallelFor(tp, y_image_size * batch_count, avg_pool_task_1d.Cost(), avg_pool_task_1d);
+      } else {
+        QLinearPool1DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_1d = {
+            x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_image_size, y_image_size,
+            pooled_height, strides[0], height, kernel_shape, pads, pool_context_, pool_attrs_};
+        ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_1d.Cost(), avg_pool_task_1d);
+      }
       break;
     }
 
-    case 2:
-    {
-      QLinearPool2DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_2d = {
-          x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_step, y_step,
-          pooled_height, pooled_width, strides[0], strides[1], height, width, kernel_shape, pads, pool_context_, pool_attrs_};
-      ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_2d.Cost(), avg_pool_task_2d);
+    case 2: {
+      if (channels_last_) {
+        QLinearPoolNhwc2DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_2d = {
+            x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_image_size, y_image_size, kernel_size, channels,
+            pooled_height, pooled_width, strides[0], strides[1], height, width, kernel_shape, pads, pool_context_, pool_attrs_};
+        ThreadPool::TryParallelFor(tp, y_image_size * batch_count, avg_pool_task_2d.Cost(), avg_pool_task_2d);
+
+      } else {
+        QLinearPool2DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_2d = {
+            x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_image_size, y_image_size,
+            pooled_height, pooled_width, strides[0], strides[1], height, width, kernel_shape, pads, pool_context_, pool_attrs_};
+        ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_2d.Cost(), avg_pool_task_2d);
+      }
       break;
     }
 
-    case 3:
-    {
-      QLinearPool3DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_3d = {
-          x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_step, y_step,
-          pooled_height, pooled_width, pooled_depth, strides[0], strides[1], strides[2], height, width, depth,
-          kernel_shape, pads, pool_context_, pool_attrs_};
-      ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_3d.Cost(), avg_pool_task_3d);
+    case 3: {
+      if (channels_last_) {
+        QLinearPoolNhwc3DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_3d = {
+            x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_image_size, y_image_size, kernel_size, channels,
+            pooled_height, pooled_width, pooled_depth, strides[0], strides[1], strides[2], height, width, depth,
+            kernel_shape, pads, pool_context_, pool_attrs_};
+        ThreadPool::TryParallelFor(tp, y_image_size * batch_count, avg_pool_task_3d.Cost(), avg_pool_task_3d);
+
+      } else {
+        QLinearPool3DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_3d = {
+            x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_image_size, y_image_size,
+            pooled_height, pooled_width, pooled_depth, strides[0], strides[1], strides[2], height, width, depth,
+            kernel_shape, pads, pool_context_, pool_attrs_};
+        ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_3d.Cost(), avg_pool_task_3d);
+      }
       break;
     }
 
-    default:
-    {
+    default: {
       return onnxruntime::common::Status(
           onnxruntime::common::ONNXRUNTIME,
           onnxruntime::common::INVALID_ARGUMENT,
diff --git a/onnxruntime/contrib_ops/cpu/qlinear_pool.h b/onnxruntime/contrib_ops/cpu/qlinear_pool.h
index 13175052f0..92285e4f78 100644
--- a/onnxruntime/contrib_ops/cpu/qlinear_pool.h
+++ b/onnxruntime/contrib_ops/cpu/qlinear_pool.h
@@ -12,15 +12,17 @@ namespace contrib {
 
 class QLinearAveragePool final : public OpKernel, public PoolBase {
  public:
-  QLinearAveragePool(const OpKernelInfo& info) : OpKernel(info), PoolBase(info) { }
+  QLinearAveragePool(const OpKernelInfo& info) : OpKernel(info), PoolBase(info) {
+    channels_last_ = (info.GetAttrOrDefault<int64_t>("channels_last", static_cast<int64_t>(0)) != 0);  // attribute defaults to 0; any non-zero value enables channels-last handling
+  }
 
   ~QLinearAveragePool() override = default;
 
   Status Compute(OpKernelContext* context) const override;
 
-private:
+ private:
   PoolProcessContext pool_context_;
-
+  bool channels_last_;  // true when the "channels_last" attribute is non-zero
 };
 
 }  // namespace contrib
diff --git a/onnxruntime/core/graph/contrib_ops/nhwc_schema_defs.cc b/onnxruntime/core/graph/contrib_ops/nhwc_schema_defs.cc
index fa38df6ea6..b06d9ad72d 100644
--- a/onnxruntime/core/graph/contrib_ops/nhwc_schema_defs.cc
+++ b/onnxruntime/core/graph/contrib_ops/nhwc_schema_defs.cc
@@ -4,6 +4,7 @@
 #include "core/framework/tensorprotoutils.h"
 #include "core/graph/constants.h"
 #include "core/graph/contrib_ops/contrib_defs.h"
+#include "core/graph/contrib_ops/quantization_defs.h"
 
 namespace ONNX_NAMESPACE {
 void convPoolShapeInference(
@@ -18,7 +19,6 @@ using namespace ONNX_NAMESPACE;
 
 namespace onnxruntime {
 namespace contrib {
-
 class NhwcInferenceContext : public InferenceContext {
  public:
   NhwcInferenceContext(InferenceContext& ctx) : ctx_(ctx) {
@@ -263,6 +263,156 @@ equal to the spatial dimension of input tensor. Input is of type uint8_t or int8
           ++image_dim_index;
         }
       });
+
+  const char* QLinearAveragePoolDoc_ver1 = R"DOC(
+ QLinearAveragePool consumes an input tensor X and applies average pooling across
+ the tensor according to kernel sizes, stride sizes, and pad lengths.
+ average pooling consisting of computing the average on all values of a
+ subset of the input tensor according to the kernel size and downsampling the
+ data into the output tensor Y for further processing. The output spatial shape will be following:
+ ```
+ output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - kernel_spatial_shape[i]) / strides_spatial_shape[i] + 1)
+ ```
+ or
+ ```
+ output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - kernel_spatial_shape[i]) / strides_spatial_shape[i] + 1)
+ ```
+ if ceil_mode is enabled
+
+ ```
+ * pad_shape[i] is sum of pads along axis i
+ ```
+
+ `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following:
+ ```
+ VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - kernel_spatial_shape[i] + 1) / strides_spatial_shape[i])
+ SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i])
+ ```
+ And pad shape will be following if `SAME_UPPER` or `SAME_LOWER`:
+ ```
+ pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + kernel_spatial_shape[i] - input_spatial_shape[i]
+ ```
+
+The output of each pooling window is divided by the number of elements (exclude pad when attribute count_include_pad is zero).
+
+Input and output scales and zero points are used to convert the output to a new quantization range.
+Output = Dequantize(Input) -> AveragePool on fp32 data -> Quantize(output)
+)DOC";
+
+  static const char* contrib_ops_pads_doc =
+      "Padding for the beginning and ending along each spatial axis, it can take any value greater "
+      "than or equal to 0. The value represent the number of pixels added to the beginning "
+      "and end part of the corresponding axis. `pads` format should be as follow "
+      "[x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels "
+      "added at the beginning of axis `i` and xi_end, the number of pixels added at "
+      "the end of axis `i`. This attribute cannot be used simultaneously with "
+      "auto_pad attribute. If not present, the padding defaults to 0 along start and end of each spatial axis.";
+  static const char* contrib_ops_auto_pad_doc =
+      "auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where "
+      "default value is NOTSET, which means explicit padding is used. "
+      "SAME_UPPER or SAME_LOWER mean pad the input so that the output spatial size match the input."
+      "In case of odd number add the extra padding at the end for SAME_UPPER and at the "
+      "beginning for SAME_LOWER. VALID mean no padding.";
+
+  ONNX_CONTRIB_OPERATOR_SCHEMA(QLinearAveragePool)
+      .SetDomain(kMSDomain)
+      .SinceVersion(1)
+      .SetDoc(QLinearAveragePoolDoc_ver1)
+      .Attr(
+          "count_include_pad",
+          "Whether include pad pixels when calculating values for the edges. Default is 0, doesn't count include pad.",
+          AttributeProto::INT,
+          static_cast<int64_t>(0))
+      .Attr(
+          "kernel_shape",
+          "The size of the kernel along each axis.",
+          AttributeProto::INTS)
+      .Attr(
+          "strides",
+          "Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.",
+          AttributeProto::INTS,
+          OPTIONAL_VALUE)
+      .Attr(
+          "auto_pad",
+          contrib_ops_auto_pad_doc,
+          AttributeProto::STRING,
+          std::string("NOTSET"))
+      .Attr("pads", contrib_ops_pads_doc, AttributeProto::INTS, OPTIONAL_VALUE)
+      .Attr(
+          "ceil_mode",
+          "Whether to use ceil or floor (default) to compute the output shape.",
+          AttributeProto::INT,
+          static_cast<int64_t>(0))
+      .Attr("channels_last", "Works on NHWC layout or not? Default not.", AttributeProto::INT, static_cast<int64_t>(0))
+      .Input(
+          0,
+          "X",
+          "Input data tensor from the previous operator; "
+          "dimensions for image case are (N x C x H x W), "
+          "where N is the batch size, C is the number of "
+          "channels, and H and W are the height and the "
+          "width of the data. For non image case, the "
+          "dimensions are in the form of "
+          "(N x C x D1 x D2 ... Dn), where N is the batch "
+          "size. Optionally, if dimension denotation is "
+          "in effect, the operation expects the input "
+          "data tensor to arrive with the dimension denotation "
+          "of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].",
+          "T")
+      .Input(
+          1,
+          "x_scale",
+          "Input scale. It's a scalar, which means a per-tensor/layer quantization.",
+          "tensor(float)")
+      .Input(
+          2,
+          "x_zero_point",
+          "Input zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
+          "T",
+          OpSchema::Optional)
+      .Input(
+          3,
+          "y_scale",
+          "Output scale. It's a scalar, which means a per-tensor/layer quantization.",
+          "tensor(float)")
+      .Input(
+          4,
+          "y_zero_point",
+          "Output zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
+          "T",
+          OpSchema::Optional)
+      .Output(
+          0,
+          "Y",
+          "Output data tensor from average or max pooling across "
+          "the input tensor. Dimensions will vary based "
+          "on various kernel, stride, and pad sizes. Floor value of "
+          "the dimension is used",
+          "T")
+      .TypeConstraint(
+          "T",
+          {"tensor(uint8)", "tensor(int8)"},
+          "Constrain input and output types to 8 bit tensors.")
+      .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
+        ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 0, 0);  // by its name, gives output 0 the element type of input 0 (helper is defined in ONNX)
+
+        auto data_type = ctx.getInputType(0);
+        if (nullptr == data_type || data_type->value_case() != ONNX_NAMESPACE::TypeProto::kTensorType) {  // input 0 (X) must have a known tensor type
+          fail_type_inference("inputs are expected to have tensor type.");
+        }
+
+        // validate scale and zero points
+        ValidateTypeAndShapeForScaleAndZP(ctx, 1, ONNX_NAMESPACE::TensorProto::FLOAT, true);  // x_scale: float, isScalar = true
+        ValidateTypeAndShapeForScaleAndZP(ctx, 2, data_type->tensor_type().elem_type(), true);  // x_zero_point: same element type as X, isScalar = true
+        ValidateTypeAndShapeForScaleAndZP(ctx, 3, ONNX_NAMESPACE::TensorProto::FLOAT, true);  // y_scale: float, isScalar = true
+        ValidateTypeAndShapeForScaleAndZP(ctx, 4, data_type->tensor_type().elem_type(), true);  // y_zero_point: same element type as X, isScalar = true
+
+        if (getAttribute(ctx, "channels_last", 0) == 0) {  // 0 is the attribute's default
+          ONNX_NAMESPACE::convPoolShapeInference(ctx, false, true, 0, 5);  // NOTE(review): meaning of (false, true, 0, 5) is set by the ONNX helper's signature, not visible here - confirm
+        } else {
+          convPoolShapeInferenceNhwc(ctx, false, true, 0, 5);  // channels-last variant, called with the same arguments
+        }
+      });
 }
 
 }  // namespace contrib
diff --git a/onnxruntime/core/graph/contrib_ops/quantization_defs.cc b/onnxruntime/core/graph/contrib_ops/quantization_defs.cc
index ef3a91727c..2202b640fc 100644
--- a/onnxruntime/core/graph/contrib_ops/quantization_defs.cc
+++ b/onnxruntime/core/graph/contrib_ops/quantization_defs.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/framework/tensorprotoutils.h"
+#include "core/graph/contrib_ops/quantization_defs.h"
 #include "core/graph/constants.h"
 #include "core/graph/contrib_ops/contrib_defs.h"
 
@@ -28,7 +28,7 @@ using ONNX_NAMESPACE::InferenceContext;
 using ONNX_NAMESPACE::OpSchema;
 using ONNX_NAMESPACE::OPTIONAL_VALUE;
 
-void ValidateTypeAndShapeForScaleAndZP(ONNX_NAMESPACE::InferenceContext& ctx, int index, ::google::protobuf::int32 expectedType, bool isScalar, int expectedTensorSize = 0) {
+void ValidateTypeAndShapeForScaleAndZP(ONNX_NAMESPACE::InferenceContext& ctx, int index, ::google::protobuf::int32 expectedType, bool isScalar, int expectedTensorSize) {
   if (ctx.getNumInputs() > static_cast<size_t>(index)) {
     auto data_type = ctx.getInputType(index);
     if (nullptr == data_type) {
@@ -546,151 +546,6 @@ This helps to improve accuracy as after ReduceMean operation the range of the ou
         }
       });
 
-  const char* QLinearAveragePoolDoc_ver1 = R"DOC(
- QLinearAveragePool consumes an input tensor X and applies average pooling across
- the tensor according to kernel sizes, stride sizes, and pad lengths.
- average pooling consisting of computing the average on all values of a
- subset of the input tensor according to the kernel size and downsampling the
- data into the output tensor Y for further processing. The output spatial shape will be following:
- ```
- output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - kernel_spatial_shape[i]) / strides_spatial_shape[i] + 1)
- ```
- or
- ```
- output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - kernel_spatial_shape[i]) / strides_spatial_shape[i] + 1)
- ```
- if ceil_mode is enabled
-
- ```
- * pad_shape[i] is sum of pads along axis i
- ```
-
- `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following:
- ```
- VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - kernel_spatial_shape[i] + 1) / strides_spatial_shape[i])
- SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i])
- ```
- And pad shape will be following if `SAME_UPPER` or `SAME_LOWER`:
- ```
- pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + kernel_spatial_shape[i] - input_spatial_shape[i]
- ```
-
-The output of each pooling window is divided by the number of elements (exclude pad when attribute count_include_pad is zero).
-
-Input and output scales and zero points are used to convert the output to a new quantization range.
-Output = Dequantize(Input) -> AveragePool on fp32 data -> Quantize(output)
-)DOC";
-
-  static const char* contrib_ops_pads_doc =
-      "Padding for the beginning and ending along each spatial axis, it can take any value greater "
-      "than or equal to 0. The value represent the number of pixels added to the beginning "
-      "and end part of the corresponding axis. `pads` format should be as follow "
-      "[x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels "
-      "added at the beginning of axis `i` and xi_end, the number of pixels added at "
-      "the end of axis `i`. This attribute cannot be used simultaneously with "
-      "auto_pad attribute. If not present, the padding defaults to 0 along start and end of each spatial axis.";
-  static const char* contrib_ops_auto_pad_doc =
-      "auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where "
-      "default value is NOTSET, which means explicit padding is used. "
-      "SAME_UPPER or SAME_LOWER mean pad the input so that the output spatial size match the input."
-      "In case of odd number add the extra padding at the end for SAME_UPPER and at the "
-      "beginning for SAME_LOWER. VALID mean no padding.";
-
-  ONNX_CONTRIB_OPERATOR_SCHEMA(QLinearAveragePool)
-      .SetDomain(kMSDomain)
-      .SinceVersion(1)
-      .SetDoc(QLinearAveragePoolDoc_ver1)
-      .Attr(
-          "count_include_pad",
-          "Whether include pad pixels when calculating values for the edges. Default is 0, doesn't count include pad.",
-          AttributeProto::INT,
-          static_cast<int64_t>(0))
-      .Attr(
-          "kernel_shape",
-          "The size of the kernel along each axis.",
-          AttributeProto::INTS)
-      .Attr(
-          "strides",
-          "Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.",
-          AttributeProto::INTS,
-          OPTIONAL_VALUE)
-      .Attr(
-          "auto_pad",
-          contrib_ops_auto_pad_doc,
-          AttributeProto::STRING,
-          std::string("NOTSET"))
-      .Attr("pads", contrib_ops_pads_doc, AttributeProto::INTS, OPTIONAL_VALUE)
-      .Attr(
-          "ceil_mode",
-          "Whether to use ceil or floor (default) to compute the output shape.",
-          AttributeProto::INT,
-          static_cast<int64_t>(0))
-      .Input(
-          0,
-          "X",
-          "Input data tensor from the previous operator; "
-          "dimensions for image case are (N x C x H x W), "
-          "where N is the batch size, C is the number of "
-          "channels, and H and W are the height and the "
-          "width of the data. For non image case, the "
-          "dimensions are in the form of "
-          "(N x C x D1 x D2 ... Dn), where N is the batch "
-          "size. Optionally, if dimension denotation is "
-          "in effect, the operation expects the input "
-          "data tensor to arrive with the dimension denotation "
-          "of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].",
-          "T")
-      .Input(
-          1,
-          "x_scale",
-          "Input scale. It's a scalar, which means a per-tensor/layer quantization.",
-          "tensor(float)")
-      .Input(
-          2,
-          "x_zero_point",
-          "Input zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
-          "T",
-          OpSchema::Optional)
-      .Input(
-          3,
-          "y_scale",
-          "Output scale. It's a scalar, which means a per-tensor/layer quantization.",
-          "tensor(float)")
-      .Input(
-          4,
-          "y_zero_point",
-          "Output zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
-          "T",
-          OpSchema::Optional)
-      .Output(
-          0,
-          "Y",
-          "Output data tensor from average or max pooling across "
-          "the input tensor. Dimensions will vary based "
-          "on various kernel, stride, and pad sizes. Floor value of "
-          "the dimension is used",
-          "T")
-      .TypeConstraint(
-          "T",
-          {"tensor(uint8)", "tensor(int8)"},
-          "Constrain input and output types to 8 bit tensors.")
-      .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
-        ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 0, 0);
-
-        auto data_type = ctx.getInputType(0);
-        if (nullptr == data_type || data_type->value_case() != ONNX_NAMESPACE::TypeProto::kTensorType) {
-          fail_type_inference("inputs are expected to have tensor type.");
-        }
-
-        // validate scale and zero points
-        ValidateTypeAndShapeForScaleAndZP(ctx, 1, ONNX_NAMESPACE::TensorProto::FLOAT, true);
-        ValidateTypeAndShapeForScaleAndZP(ctx, 2, data_type->tensor_type().elem_type(), true);
-        ValidateTypeAndShapeForScaleAndZP(ctx, 3, ONNX_NAMESPACE::TensorProto::FLOAT, true);
-        ValidateTypeAndShapeForScaleAndZP(ctx, 4, data_type->tensor_type().elem_type(), true);
-
-        ONNX_NAMESPACE::convPoolShapeInference(ctx, false, true, 0, 5);
-      });
-
   const char* QLinearLeakyReluDoc_ver1 = R"DOC(
 QLinearLeakyRelu takes quantized input data (Tensor), an argument alpha, and quantize parameter for output,
 and produces one output data (Tensor<T>) where the function `f(x) = quantize(alpha * dequantize(x)) for dequantize(x) < 0`,
diff --git a/onnxruntime/core/graph/contrib_ops/quantization_defs.h b/onnxruntime/core/graph/contrib_ops/quantization_defs.h
new file mode 100644
index 0000000000..44ab4b0147
--- /dev/null
+++ b/onnxruntime/core/graph/contrib_ops/quantization_defs.h
@@ -0,0 +1,19 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+#pragma once
+
+#include "core/graph/onnx_protobuf.h"
+#include "core/framework/tensorprotoutils.h"
+
+namespace onnxruntime {
+namespace contrib {
+// Shape-inference check for the scale / zero-point input at `index`; defined in quantization_defs.cc.
+void ValidateTypeAndShapeForScaleAndZP(
+    ONNX_NAMESPACE::InferenceContext& ctx,
+    int index,
+    ::google::protobuf::int32 expectedType,
+    bool isScalar,
+    int expectedTensorSize = 0);
+
+}  // namespace contrib
+}  // namespace onnxruntime
\ No newline at end of file
diff --git a/onnxruntime/core/optimizer/nhwc_transformer.cc b/onnxruntime/core/optimizer/nhwc_transformer.cc
index 26829f71fc..a05d00c290 100644
--- a/onnxruntime/core/optimizer/nhwc_transformer.cc
+++ b/onnxruntime/core/optimizer/nhwc_transformer.cc
@@ -427,7 +427,8 @@ void NhwcTransformerImpl::Transform(Node& node) {
   } else if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearLeakyRelu", {1}, kMSDomain) ||
              graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearSigmoid", {1}, kMSDomain)) {
     TransformQLinearActivation(node);
-  } else if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearGlobalAveragePool", {1}, kMSDomain)) {
+  } else if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearGlobalAveragePool", {1}, kMSDomain) ||
+             graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearAveragePool", {1}, kMSDomain)) {
     TransformQLinearGlobalAveragePool(node);
   } else if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearConcat", {1}, kMSDomain)) {
     TransformQLinearConcat(node);
diff --git a/onnxruntime/test/contrib_ops/qlinear_pool_test.cc b/onnxruntime/test/contrib_ops/qlinear_pool_test.cc
index 7678887fa0..94916ebec3 100644
--- a/onnxruntime/test/contrib_ops/qlinear_pool_test.cc
+++ b/onnxruntime/test/contrib_ops/qlinear_pool_test.cc
@@ -192,6 +192,105 @@ void RunQLinearAveragePoolNchwU8(
   run_test(true /* only_x_not_initializer */, true /* x_y_same_zero_point */);
 }
 
+static std::vector<int64_t> dims_to_nhwc(const std::vector<int64_t>& nchw) {  // {N, C, d...} -> {N, d..., C}
+  std::vector<int64_t> channels_last{nchw[0]};
+  channels_last.insert(channels_last.end(), nchw.begin() + 2, nchw.end());
+  channels_last.push_back(nchw[1]);
+  return channels_last;
+}
+
+static std::vector<uint8_t> transpose_to_nhwc(const std::vector<uint8_t>& nchw_data, const std::vector<int64_t>& nchw_dims) {
+  // Repacks {N, C, spatial...} data as {N, spatial..., C}; nchw_dims describes the layout of nchw_data.
+  std::vector<uint8_t> nhwc_data(nchw_data.size());
+  const int64_t batch_count = nchw_dims[0];
+  const int64_t channels = nchw_dims[1];
+  const int64_t image_size = std::accumulate(nchw_dims.begin() + 2, nchw_dims.end(), 1LL, std::multiplies<int64_t>());
+  const int64_t batch_stride = channels * image_size;  // one batch entry holds `channels` planes of image_size
+  for (int64_t b = 0; b < batch_count; b++) {
+    const uint8_t* nchw_image = nchw_data.data() + (b * batch_stride);
+    uint8_t* nhwc_image = nhwc_data.data() + (b * batch_stride);
+    for (int64_t img_index = 0; img_index < image_size; ++img_index) {
+      for (int64_t c = 0; c < channels; c++) {
+        *nhwc_image++ = nchw_image[c * image_size + img_index];
+      }
+    }
+  }
+  return nhwc_data;
+}
+
+void RunQLinearAveragePoolNhwcU8(
+    const std::vector<int64_t>& x_dims,
+    const std::vector<int64_t>& y_dims,
+    const std::vector<int64_t>& kernel_shape,
+    const std::vector<int64_t>& strides,
+    const std::vector<int64_t>& pads,
+    const int64_t count_include_pad = 0) {
+  // Runs QLinearAveragePool with channels_last=1 on random uint8 input and compares it with the NCHW reference
+  // (CalculateAvgPoolNchwU8) transposed to NHWC. x_dims and y_dims are passed in NCHW order.
+  float x_scale = 1.0f / 255.0f;
+  uint8_t x_zero_point = 128;
+  RandomValueGenerator random{};
+  std::vector<float> x_data_fp32 = random.Uniform<float>(x_dims, -0.5f, 0.5f);
+  std::vector<uint8_t> x_data(x_data_fp32.size());
+  for (size_t i = 0; i < x_data.size(); ++i) {
+    x_data[i] = quantize_u8(x_data_fp32[i], x_scale, x_zero_point);
+  }
+
+  float y_scale = 1.0f / 255.0f;
+  uint8_t y_zero_point = 100;
+  int64_t y_size = std::accumulate(y_dims.begin(), y_dims.end(), 1LL, std::multiplies<int64_t>());
+  std::vector<uint8_t> y_data(y_size);
+  CalculateAvgPoolNchwU8(
+      x_data.data(), x_dims, x_scale, x_zero_point,
+      y_data.data(), y_dims, y_scale, y_zero_point,
+      kernel_shape, strides, pads, count_include_pad);
+
+  // transpose the reference result and the input data (and their shapes) into channels-last order
+  std::vector<uint8_t> y_data_nhwc = transpose_to_nhwc(y_data, y_dims);
+  std::vector<uint8_t> x_data_nhwc = transpose_to_nhwc(x_data, x_dims);
+  auto x_dims_nhwc = dims_to_nhwc(x_dims);
+  auto y_dims_nhwc = dims_to_nhwc(y_dims);
+
+  OpTester test("QLinearAveragePool", 1, onnxruntime::kMSDomain);
+  test.AddAttribute("auto_pad", "");
+  test.AddAttribute("strides", strides);
+  test.AddAttribute("pads", pads);
+  test.AddAttribute("kernel_shape", kernel_shape);
+  test.AddAttribute("count_include_pad", count_include_pad);
+  test.AddAttribute("channels_last", static_cast<int64_t>(1));
+
+  test.AddInput<uint8_t>("X", x_dims_nhwc, x_data_nhwc);
+  test.AddInput<float>("x_scale", {}, {x_scale});
+  test.AddInput<uint8_t>("x_zero_point", {}, {x_zero_point});
+  test.AddInput<float>("y_scale", {}, {y_scale});
+  test.AddInput<uint8_t>("y_zero_point", {}, {y_zero_point});
+  test.AddOutput<uint8_t>("Y", y_dims_nhwc, y_data_nhwc);
+
+  // Custom verifier: the shape must match exactly, each output byte may differ from the reference by at most 1.
+  auto q8checker = [&](const std::vector<OrtValue>& fetches, const std::string& provider_type) {
+    const OrtValue& ort_value = fetches[0];
+    if (ort_value.Fence()) {
+      ort_value.Fence()->BeforeUsingAsInput(onnxruntime::kCpuExecutionProvider, 0);
+    }
+    auto y_shape = TensorShape(y_dims_nhwc);
+    const Tensor& output_tensor = ort_value.Get<Tensor>();
+    ORT_ENFORCE(y_shape == output_tensor.Shape(),
+                "Expected output shape [" + y_shape.ToString() + "] did not match run output shape [" +
+                    output_tensor.Shape().ToString() + "] for Y @" + provider_type);
+    auto* output = output_tensor.Data<uint8_t>();
+    auto size = static_cast<int>(output_tensor.Shape().Size());
+    for (int i = 0; i < size; ++i) {
+      int diff = abs(y_data_nhwc[i] - output[i]);
+      EXPECT_LE(diff, 1) << "i:" << i << " expected:" << y_data_nhwc[i] << " " << static_cast<int>(y_data_nhwc[i])
+                         << ", got:" << output[i] << " " << static_cast<int>(output[i]) << ", provider_type: " << provider_type;
+    }
+  };
+  test.SetCustomOutputVerifier(q8checker);
+
+  static std::unordered_set<std::string> excluded_providers = {kNnapiExecutionProvider};
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", excluded_providers);
+}
+
 TEST(QLinearPoolTest, AveragePool1D_ExcludePadPixel) {
   RunQLinearAveragePoolNchwU8(
       {1, 1, 5},  // x shape
@@ -252,5 +351,68 @@ TEST(QLinearPoolTest, AveragePool3D_IncludePadPixel) {
       1);                  // count_include_pad
 }
 
+/*************************************************
+* Channels last test
+**************************************************/
+TEST(QLinearPoolTest, AveragePool1D_ExcludePadPixel_nhwc) {
+  // NOTE(review): N == C == 1, so NHWC and NCHW layouts coincide here; dims are listed in NCHW order.
+  const std::vector<int64_t> x_shape{1, 1, 5};
+  const std::vector<int64_t> expected_y_shape{1, 1, 6};
+  const std::vector<int64_t> kernel_shape{3};
+  const std::vector<int64_t> strides{1};
+  const std::vector<int64_t> pads{1, 2};
+  RunQLinearAveragePoolNhwcU8(x_shape, expected_y_shape, kernel_shape, strides, pads, 0 /* count_include_pad */);
+}
+
+TEST(QLinearPoolTest, AveragePool1D_IncludePadPixel_nhwc) {
+  // NOTE(review): N == C == 1, so NHWC and NCHW layouts coincide here; dims are listed in NCHW order.
+  const std::vector<int64_t> x_shape{1, 1, 5};
+  const std::vector<int64_t> expected_y_shape{1, 1, 6};
+  const std::vector<int64_t> kernel_shape{3};
+  const std::vector<int64_t> strides{1};
+  const std::vector<int64_t> pads{1, 2};
+  RunQLinearAveragePoolNhwcU8(x_shape, expected_y_shape, kernel_shape, strides, pads, 1 /* count_include_pad */);
+}
+
+TEST(QLinearPoolTest, AveragePool2D_ExcludePadPixel_nhwc) {
+  // NOTE(review): N == C == 1, so NHWC and NCHW layouts coincide here; dims are listed in NCHW order.
+  const std::vector<int64_t> x_shape{1, 1, 5, 7};
+  const std::vector<int64_t> expected_y_shape{1, 1, 6, 4};
+  const std::vector<int64_t> kernel_shape{3, 4};
+  const std::vector<int64_t> strides{1, 2};
+  const std::vector<int64_t> pads{1, 3, 2, 1};
+  RunQLinearAveragePoolNhwcU8(x_shape, expected_y_shape, kernel_shape, strides, pads, 0 /* count_include_pad */);
+}
+
+TEST(QLinearPoolTest, AveragePool2D_IncludePadPixel_nhwc) {
+  // NOTE(review): N == C == 1, so NHWC and NCHW layouts coincide here; dims are listed in NCHW order.
+  const std::vector<int64_t> x_shape{1, 1, 5, 7};
+  const std::vector<int64_t> expected_y_shape{1, 1, 6, 4};
+  const std::vector<int64_t> kernel_shape{3, 4};
+  const std::vector<int64_t> strides{1, 2};
+  const std::vector<int64_t> pads{1, 3, 2, 1};
+  RunQLinearAveragePoolNhwcU8(x_shape, expected_y_shape, kernel_shape, strides, pads, 1 /* count_include_pad */);
+}
+
+TEST(QLinearPoolTest, AveragePool3D_ExcludePadPixel_nhwc) {
+  // NOTE(review): N == C == 1, so NHWC and NCHW layouts coincide here; dims are listed in NCHW order.
+  const std::vector<int64_t> x_shape{1, 1, 5, 7, 9};
+  const std::vector<int64_t> expected_y_shape{1, 1, 6, 4, 3};
+  const std::vector<int64_t> kernel_shape{3, 4, 5};
+  const std::vector<int64_t> strides{1, 2, 3};
+  const std::vector<int64_t> pads{1, 3, 2, 2, 1, 2};
+  RunQLinearAveragePoolNhwcU8(x_shape, expected_y_shape, kernel_shape, strides, pads, 0 /* count_include_pad */);
+}
+
+TEST(QLinearPoolTest, AveragePool3D_IncludePadPixel_nhwc) {
+  // NOTE(review): N == C == 1, so NHWC and NCHW layouts coincide here; dims are listed in NCHW order.
+  const std::vector<int64_t> x_shape{1, 1, 5, 7, 9};
+  const std::vector<int64_t> expected_y_shape{1, 1, 6, 4, 3};
+  const std::vector<int64_t> kernel_shape{3, 4, 5};
+  const std::vector<int64_t> strides{1, 2, 3};
+  const std::vector<int64_t> pads{1, 3, 2, 2, 1, 2};
+  RunQLinearAveragePoolNhwcU8(x_shape, expected_y_shape, kernel_shape, strides, pads, 1 /* count_include_pad */);
+}
+
 }  // namespace test
 }  // namespace onnxruntime
diff --git a/onnxruntime/test/optimizer/nhwc_transformer_test.cc b/onnxruntime/test/optimizer/nhwc_transformer_test.cc
index 55d3dd2f4b..f5824de82f 100644
--- a/onnxruntime/test/optimizer/nhwc_transformer_test.cc
+++ b/onnxruntime/test/optimizer/nhwc_transformer_test.cc
@@ -245,6 +245,51 @@ TEST(NhwcTransformerTests, ConvGlobalAveragePool) {
                     TransformerLevel::Level3);
 }
 
+TEST(NhwcTransformerTests, ConvAveragePool) {
+  // Graph under test: QLinearConv -> QLinearAveragePool -> QLinearConv -> QLinearAveragePool -> DequantizeLinear.
+  auto build_test_case = [&](ModelTestBuilder& builder) {
+    auto* x_arg = builder.MakeInput<uint8_t>({1, 23, 13, 13}, 0, 31);
+    auto* conv1_y_arg = builder.MakeIntermediate();
+    auto* conv2_y_arg = builder.MakeIntermediate();
+    auto* pool1_y_arg = builder.MakeIntermediate();
+    auto* pool2_y_arg = builder.MakeIntermediate();
+    auto* y_arg = builder.MakeOutput();
+    auto* conv1_w_arg = NhwcMakeInitializer<uint8_t>(builder, {30, 23, 3, 3});
+    auto* conv2_w_arg = NhwcMakeInitializer<uint8_t>(builder, {16, 30, 3, 3});
+
+    Node& first_conv = builder.AddQLinearConvNode<uint8_t>(x_arg, .01f, 135,
+                                                           conv1_w_arg, .02f, 126,
+                                                           conv1_y_arg, .37f, 131);
+    first_conv.AddAttribute("pads", std::vector<int64_t>{1, 1, 1, 1});
+    Node& first_pool = builder.AddQLinearActivationNode("QLinearAveragePool",
+                                                        conv1_y_arg, .37f, 131,
+                                                        pool1_y_arg, .43f, 111);
+    first_pool.AddAttribute("kernel_shape", std::vector<int64_t>{3, 3});
+    first_pool.AddAttribute("pads", std::vector<int64_t>{1, 1, 1, 1});
+
+    builder.AddQLinearConvNode<uint8_t>(pool1_y_arg, .43f, 111,
+                                        conv2_w_arg, .015f, 129,
+                                        conv2_y_arg, .37f, 131);
+    Node& second_pool = builder.AddQLinearActivationNode("QLinearAveragePool",
+                                                         conv2_y_arg, .37f, 131,
+                                                         pool2_y_arg, .37f, 131);
+    second_pool.AddAttribute("kernel_shape", std::vector<int64_t>{3, 3});
+    second_pool.AddAttribute("pads", std::vector<int64_t>{1, 1, 1, 1});
+    builder.AddDequantizeLinearNode<uint8_t>(pool2_y_arg, .37f, 131, y_arg);
+  };
+
+  auto check_nhwc_graph = [&](InferenceSessionWrapper& session) {
+    auto op_counts = CountOpsInGraph(session.GetGraph());
+    EXPECT_EQ(op_counts["com.microsoft.QLinearConv"], 2);
+    EXPECT_EQ(op_counts["Transpose"], 2);  // presumably one into NHWC and one back out - TODO confirm
+  };
+
+  TransformerTester(build_test_case,
+                    check_nhwc_graph,
+                    TransformerLevel::Level2,
+                    TransformerLevel::Level3);
+}
+
 TEST(NhwcTransformerTests, ConvSplit) {
   for (int64_t axis = -4LL; axis < 4; axis++) {
     auto build_test_case = [&, axis](ModelTestBuilder& builder) {