From acfe7ac4cebf313662f396faacd93e2115bad8fb Mon Sep 17 00:00:00 2001 From: Zhang Lei Date: Wed, 10 Mar 2021 10:02:01 -0800 Subject: [PATCH] Implement QLinearAveragePool with unit tests. (#6896) Implement QLinearAveragePool with unit tests. --- .../contrib_ops/cpu/cpu_contrib_kernels.cc | 2 + onnxruntime/contrib_ops/cpu/qlinear_pool.cc | 327 ++++++++++++++++++ onnxruntime/contrib_ops/cpu/qlinear_pool.h | 27 ++ onnxruntime/core/providers/cpu/nn/pool_base.h | 3 +- .../test/contrib_ops/qlinear_pool_test.cc | 247 +++++++++++++ 5 files changed, 605 insertions(+), 1 deletion(-) create mode 100644 onnxruntime/contrib_ops/cpu/qlinear_pool.cc create mode 100644 onnxruntime/contrib_ops/cpu/qlinear_pool.h create mode 100644 onnxruntime/test/contrib_ops/qlinear_pool_test.cc diff --git a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc index 534d9c3243..89fb51fd2a 100644 --- a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc @@ -47,6 +47,7 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSExperimentalDoma // ******** Start: Quantization ******************* // class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, MatMulInteger16); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, QLinearGlobalAveragePool); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, QLinearAveragePool); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, DequantizeLinear); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, DequantizeLinear); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, QuantizeLinear); @@ -131,6 +132,7 @@ Status RegisterQuantizationKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, //default entry to avoid the list become empty after ops-reducing BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/contrib_ops/cpu/qlinear_pool.cc b/onnxruntime/contrib_ops/cpu/qlinear_pool.cc new file mode 100644 index 0000000000..9923de913a --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/qlinear_pool.cc @@ -0,0 +1,327 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "qlinear_pool.h" + +#include "core/util/math_cpuonly.h" +#include "core/providers/common.h" +#include "core/platform/threadpool.h" +#include "core/util/math.h" +#include "core/mlas/inc/mlas.h" + +#include + +namespace onnxruntime { + +using concurrency::ThreadPool; + +namespace contrib { + +template +static inline float dequantize_value(T8Bits x, float x_scale, T8Bits x_zero_point); + +template +static inline T8Bits quantize_value(float y, float y_scale, T8Bits y_zero_point); + +template <> +inline float dequantize_value(uint8_t x, float x_scale, uint8_t x_zero_point) { + return x_scale * (static_cast(x) - x_zero_point); +} + +template <> +inline uint8_t quantize_value(float y, float y_scale, uint8_t y_zero_point) { + return static_cast(std::max(0.0f, std::min(std::nearbyintf(y / y_scale + y_zero_point), 255.0f))); +} + +template +struct QLinearPool1DTask final { + const float* X_data; + T8Bits* Y_data; + float y_scale; + T8Bits y_zero_point; + int64_t x_step; + int64_t y_step; + int64_t pooled_height; + int64_t stride_h; + int64_t height; + const std::vector& kernel_shape; + const std::vector& pads; + const PoolProcessContext& pool_context_; + const PoolAttributes& pool_attrs_; + + TensorOpCost Cost() { + double loop_count = static_cast(pooled_height * kernel_shape[0]); + return TensorOpCost{loop_count, loop_count, loop_count}; + } + + void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const { + for (int64_t c = begin; c < end; ++c) { + operator()(c); + } + } + + void operator()(std::ptrdiff_t c) const { + const float* x_d = X_data + c * x_step; + T8Bits* y_d = Y_data + c * y_step; + + for (int64_t ph = 0; ph < pooled_height; ++ph) { + int64_t hstart = ph * stride_h - pads[0]; + int64_t hend = std::min(hstart + kernel_shape[0], height); + hstart = std::max(hstart, static_cast(0)); + float Yh = PoolType::Initialize(); + for (int64_t h = hstart; h < hend; ++h) { + PoolType::Process(x_d[h], Yh, pool_context_); + } + if (pool_attrs_.count_include_pad) { + PoolType::Finalize(kernel_shape[0], Yh, pool_context_); + } else { + PoolType::Finalize(hend - hstart, Yh, pool_context_); + } + y_d[ph] = quantize_value(Yh, y_scale, y_zero_point); + } + } +}; + + +template +struct QLinearPool2DTask final { + const float* X_data; + T8Bits* Y_data; + float y_scale; + T8Bits y_zero_point; + int64_t x_step; + int64_t y_step; + int64_t pooled_height; + int64_t pooled_width; + int64_t stride_h; + int64_t stride_w; + int64_t height; + int64_t width; + const std::vector& kernel_shape; + const std::vector& pads; + const PoolProcessContext& pool_context_; + const PoolAttributes& pool_attrs_; + + TensorOpCost Cost() { + double loop_count = static_cast(pooled_height * pooled_width * kernel_shape[0] * kernel_shape[1]); + return TensorOpCost{loop_count, loop_count, loop_count}; + } + + void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const { + for (int64_t c = begin; c < end; ++c) { + operator()(c); + } + } + + void operator()(std::ptrdiff_t c) const { + const float* x_d = X_data + c * x_step; + T8Bits* y_d = Y_data + c * y_step; + + for (int64_t ph = 0; ph < pooled_height; ++ph) { + int64_t hstart = ph * stride_h - pads[0]; + int64_t hend = std::min(hstart + kernel_shape[0], height); + hstart = std::max(hstart, static_cast(0)); + for (int64_t pw = 0; pw < pooled_width; ++pw) { + int64_t wstart = pw * stride_w - pads[1]; + int64_t wend = std::min(wstart + kernel_shape[1], width); + wstart = std::max(wstart, static_cast(0)); + const int64_t pool_index = ph * pooled_width + pw; + float Yh = PoolType::Initialize(); + for (int64_t h = hstart; h < hend; ++h) { + int64_t input_index = h * width + wstart; + for (int64_t w = wstart; w < wend; ++w) { + PoolType::Process(x_d[input_index++], Yh, pool_context_); + } + } + if (pool_attrs_.count_include_pad) { + PoolType::Finalize(kernel_shape[0] * kernel_shape[1], Yh, pool_context_); + } else { + PoolType::Finalize((hend - hstart) * (wend - wstart), Yh, pool_context_); + } + y_d[pool_index] = quantize_value(Yh, y_scale, y_zero_point); + } + } + } +}; + +template +struct QLinearPool3DTask final { + const float* X_data; + T8Bits* Y_data; + float y_scale; + T8Bits y_zero_point; + int64_t x_step; + int64_t y_step; + int64_t pooled_height; + int64_t pooled_width; + int64_t pooled_depth; + int64_t stride_h; + int64_t stride_w; + int64_t stride_d; + int64_t height; + int64_t width; + int64_t depth; + const std::vector& kernel_shape; + const std::vector& pads; + const PoolProcessContext& pool_context_; + const PoolAttributes& pool_attrs_; + + TensorOpCost Cost() { + double loop_count = static_cast(pooled_height * pooled_width * pooled_depth * kernel_shape[0] * + kernel_shape[1] * kernel_shape[2]); + return TensorOpCost{loop_count, loop_count, loop_count}; + } + + void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const { + for (int64_t c = begin; c < end; ++c) { + operator()(c); + } + } + + void operator()(std::ptrdiff_t c) const { + const float* x_d = X_data + c * x_step; + T8Bits* y_d = Y_data + c * y_step; + + for (int64_t ph = 0; ph < pooled_height; ++ph) { + int64_t hstart = ph * stride_h - pads[0]; + int64_t hend = std::min(hstart + kernel_shape[0], height); + hstart = std::max(hstart, static_cast(0)); + for (int64_t pw = 0; pw < pooled_width; ++pw) { + int64_t wstart = pw * stride_w - pads[1]; + int64_t wend = std::min(wstart + kernel_shape[1], width); + wstart = std::max(wstart, static_cast(0)); + for (int64_t pd = 0; pd < pooled_depth; ++pd) { + int64_t dstart = pd * stride_d - pads[2]; + int64_t dend = std::min(dstart + kernel_shape[2], depth); + dstart = std::max(dstart, static_cast(0)); + const int64_t pool_index = ph * pooled_width * pooled_depth + pw * pooled_depth + pd; + float Yh = PoolType::Initialize(); + for (int64_t h = hstart; h < hend; ++h) { + const int64_t input_index_h = h * width * depth; + for (int64_t w = wstart; w < wend; ++w) { + int64_t input_index = input_index_h + w * depth + dstart; + for (int64_t d = dstart; d < dend; ++d) { + PoolType::Process(x_d[input_index++], Yh, pool_context_); + } + } + } + if (pool_attrs_.count_include_pad) { + PoolType::Finalize(kernel_shape[0] * kernel_shape[1] * kernel_shape[2], Yh, pool_context_); + } else { + PoolType::Finalize((hend - hstart) * (wend - wstart) * (dend - dstart), Yh, pool_context_); + } + auto y_value = quantize_value(Yh, y_scale, y_zero_point); + y_d[pool_index] = y_value; + } + } + } + } +}; + +Status QLinearAveragePool::Compute(OpKernelContext* context) const { + const auto tensor_x_scale = context->Input(1); + const auto tensor_x_zero_point = context->Input(2); + const auto tensor_y_scale = context->Input(3); + const auto tensor_y_zero_point = context->Input(4); + + ORT_ENFORCE(IsScalarOr1ElementVector(tensor_x_scale), + "Input x_scale must be a scalar or 1D tensor of size 1"); + ORT_ENFORCE(tensor_x_zero_point == nullptr || IsScalarOr1ElementVector(tensor_x_zero_point), + "input x_zero_point must be a scalar or 1D tensor of size 1 if given"); + ORT_ENFORCE(IsScalarOr1ElementVector(tensor_y_scale), + "input y_scale must be a scalar or 1D tensor of size 1"); + ORT_ENFORCE(tensor_y_zero_point == nullptr || IsScalarOr1ElementVector(tensor_y_zero_point), + "input y_zero_point must be a scalar or 1D tensor of size 1 if given"); + + const auto* X = context->Input(0); + auto dtype = X->GetElementType(); + if (dtype != ONNX_NAMESPACE::TensorProto_DataType_UINT8) { + ORT_THROW("Unsupported 'dtype' in QLinear Pooling:", dtype); + } + const TensorShape& x_shape = X->Shape(); + const float x_scale = *(tensor_x_scale->Data()); + const float y_scale = *(tensor_y_scale->Data()); + uint8_t x_zero_point = (tensor_x_zero_point ? *(tensor_x_zero_point->Data()) : (uint8_t)0); + uint8_t y_zero_point = (tensor_y_zero_point ? *(tensor_y_zero_point->Data()) : (uint8_t)0); + + ORT_RETURN_IF_NOT(x_shape.NumDimensions() >= 3, "Input dimension cannot be less than 3."); + std::vector pads = pool_attrs_.pads; + std::vector strides = pool_attrs_.strides; + std::vector kernel_shape = pool_attrs_.kernel_shape; + + std::vector output_dims = pool_attrs_.SetOutputSize(x_shape, x_shape[1], &pads); + Tensor* Y = context->Output(0, output_dims); + + const auto* X_data = X->Data(); + auto* Y_data = Y->MutableData(); + + const int64_t channels = x_shape[1]; + const int64_t height = x_shape[2]; + const int64_t width = kernel_shape.size() > 1 ? x_shape[3] : 1; + const int64_t depth = kernel_shape.size() > 2 ? x_shape[4] : 1; + const int64_t pooled_height = output_dims[2]; + const int64_t pooled_width = kernel_shape.size() > 1 ? output_dims[3] : 1; + const int64_t pooled_depth = kernel_shape.size() > 2 ? output_dims[4] : 1; + const int64_t total_channels = x_shape[0] * channels; + const int64_t x_step = height * width * depth; + const int64_t y_step = pooled_height * pooled_width * pooled_depth; + + ThreadPool* tp = context->GetOperatorThreadPool(); + std::vector x_data_fp32; + if (kernel_shape.size() <= 3) { + x_data_fp32.resize(x_shape.Size()); + ThreadPool::TryParallelFor(tp, x_shape.Size(), 1.0f, [=, &x_data_fp32](ptrdiff_t first, ptrdiff_t last) { + const auto* x8 = X_data + first; + float* x32 = x_data_fp32.data() + first; + for (ptrdiff_t i = 0, sz = last - first; i < sz; ++i) { + *x32++ = dequantize_value(x8[i], x_scale, x_zero_point); + } + }); + } + + switch (kernel_shape.size()) { + case 1: + { + QLinearPool1DTask avg_pool_task_1d = { + x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_step, y_step, + pooled_height, strides[0], height, kernel_shape, pads, pool_context_, pool_attrs_}; + ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_1d.Cost(), avg_pool_task_1d); + break; + } + + case 2: + { + QLinearPool2DTask avg_pool_task_2d = { + x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_step, y_step, + pooled_height, pooled_width, strides[0], strides[1], height, width, kernel_shape, pads, pool_context_, pool_attrs_}; + ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_2d.Cost(), avg_pool_task_2d); + break; + } + + case 3: + { + QLinearPool3DTask avg_pool_task_3d = { + x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_step, y_step, + pooled_height, pooled_width, pooled_depth, strides[0], strides[1], strides[2], height, width, depth, + kernel_shape, pads, pool_context_, pool_attrs_}; + ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_3d.Cost(), avg_pool_task_3d); + break; + } + + default: + { + return onnxruntime::common::Status( + onnxruntime::common::ONNXRUNTIME, + onnxruntime::common::INVALID_ARGUMENT, + "QLinear Pooling unsupported pooling size!"); + } + } + + return Status::OK(); +} + +ONNX_OPERATOR_KERNEL_EX(QLinearAveragePool, kMSDomain, 1, kCpuExecutionProvider, KernelDefBuilder(), QLinearAveragePool); + +} // namespace contrib + +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/qlinear_pool.h b/onnxruntime/contrib_ops/cpu/qlinear_pool.h new file mode 100644 index 0000000000..13175052f0 --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/qlinear_pool.h @@ -0,0 +1,27 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/common/common.h" +#include "core/framework/op_kernel.h" +#include "core/providers/cpu/nn/pool_base.h" + +namespace onnxruntime { +namespace contrib { + +class QLinearAveragePool final : public OpKernel, public PoolBase { + public: + QLinearAveragePool(const OpKernelInfo& info) : OpKernel(info), PoolBase(info) { } + + ~QLinearAveragePool() override = default; + + Status Compute(OpKernelContext* context) const override; + +private: + PoolProcessContext pool_context_; + +}; + +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/nn/pool_base.h b/onnxruntime/core/providers/cpu/nn/pool_base.h index 3d1edfbdaf..4e2c9910b9 100644 --- a/onnxruntime/core/providers/cpu/nn/pool_base.h +++ b/onnxruntime/core/providers/cpu/nn/pool_base.h @@ -106,7 +106,8 @@ class PoolBase { protected: PoolBase(const OpKernelInfo& info) - : op_name_(info.GetKernelDef().OpName()), + : op_name_(info.GetKernelDef().OpName().rfind("QLinear", 0) != 0 ? + info.GetKernelDef().OpName() : info.GetKernelDef().OpName().substr(7)), pool_attrs_(info, op_name_, GetStartVersion(info)) { } diff --git a/onnxruntime/test/contrib_ops/qlinear_pool_test.cc b/onnxruntime/test/contrib_ops/qlinear_pool_test.cc new file mode 100644 index 0000000000..f248df5ca0 --- /dev/null +++ b/onnxruntime/test/contrib_ops/qlinear_pool_test.cc @@ -0,0 +1,247 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "gtest/gtest.h" +#include "test/common/tensor_op_test_utils.h" +#include "test/providers/provider_test_utils.h" +#include "core/providers/common.h" + +namespace onnxruntime { +namespace test { + +static inline float dequantize_u8(uint8_t x, float x_scale, uint8_t x_zero_point) { + return x_scale * (static_cast(x) - x_zero_point); +} + +static inline uint8_t quantize_u8(float y, float y_scale, uint8_t y_zero_point) { + return static_cast(std::max(0.0f, std::min(std::nearbyintf(y / y_scale + y_zero_point), 255.0f))); +} + +struct DimIterator { + DimIterator(const std::vector& dims) : dims_(dims) { + size_ = std::accumulate(dims_.begin(), dims_.end(), 1LL, std::multiplies()); + restart(); + } + + void restart() { + pos_.resize(dims_.size(), 0LL); + index_ = 0LL; + } + + bool has_next() { return index_ < size_; } + + // if has more data return current data ptr and iterator to next pos_ + // otherwise return -1 + int64_t next() { + if (has_next()) { + for (size_t i = dims_.size(); i > 0;) { + i--; + ++pos_[i]; + if (pos_[i] < dims_[i]) { + break; + } + pos_[i] = 0; + } + return index_++; + } + return -1L; + } + + const std::vector dims_; + std::vector pos_; + int64_t size_; + int64_t index_; +}; + +static void +CalculateAvgPoolNchwU8( + uint8_t* x, + const std::vector x_dims, + float x_scale, + int x_zero_point, + uint8_t* y, + const std::vector y_dims, + float y_scale, + int y_zero_point, + const std::vector kernel_shape, + const std::vector strides, + const std::vector pads, + const int64_t count_include_pad) { + int64_t batch = y_dims[0]; + int64_t channel = y_dims[1]; + + std::vector y_img_dims(y_dims.begin() + 2, y_dims.end()); + std::vector x_img_dims(x_dims.begin() + 2, x_dims.end()); + std::vector x_img_strides(x_img_dims.size(), 1LL); + for (size_t i = x_img_dims.size() - 1; i > 0;) { + i--; + x_img_strides[i] = x_img_strides[i + 1] * x_img_dims[i + 1]; + } + + int64_t y_step = std::accumulate(y_img_dims.begin(), y_img_dims.end(), 1LL, std::multiplies()); + int64_t x_step = std::accumulate(x_img_dims.begin(), x_img_dims.end(), 1LL, std::multiplies()); + for (int64_t b = 0; b < batch; ++b) { + for (int64_t c = 0; c < channel; ++c) { + uint8_t* ybc = y + (b * channel + c) * y_step; + uint8_t* xbc = x + (b * channel + c) * x_step; + + DimIterator yit(y_img_dims); + while (yit.has_next()) { + std::vector kernel_topleft(y_img_dims.size(), 0); + for (size_t i = 0; i < y_img_dims.size(); ++i) { + kernel_topleft[i] = yit.pos_[i] * strides[i]; + } + + float y_value_sum = 0.0f; + int count = 0; + for (DimIterator kit(kernel_shape); kit.has_next(); kit.next()) { + int64_t kernel_offset = 0; + for (size_t i = 0; kernel_offset >= 0 && i < kernel_shape.size(); ++i) { + int64_t x_real_dim = kernel_topleft[i] + kit.pos_[i] - pads[i]; + if (x_real_dim >= 0 && x_real_dim < x_img_dims[i]) { + kernel_offset += x_real_dim * x_img_strides[i]; + } else { + kernel_offset = -1LL; // padding element + } + } + if (kernel_offset >= 0) { + y_value_sum += dequantize_u8(xbc[kernel_offset], x_scale, static_cast(x_zero_point)); + ++count; + } else { + count += count_include_pad ? 1 : 0; + } + } + auto y_offset = yit.next(); + auto y_u8 = quantize_u8(y_value_sum / count, y_scale, static_cast(y_zero_point)); + ybc[y_offset] = y_u8; + } + } + } +} + +void RunQLinearAveragePoolNchwU8( + const std::vector x_dims, + const std::vector y_dims, + const std::vector kernel_shape, + const std::vector strides, + const std::vector pads, + const int64_t count_include_pad = 0) { + float x_scale = 1.0f / 255.0f; + uint8_t x_zero_point = 128; + RandomValueGenerator random{}; + std::vector x_data_fp32 = random.Uniform(x_dims, -0.5f, 0.5f); + std::vector x_data(x_data_fp32.size()); + for (size_t i = 0; i < x_data.size(); ++i) { + x_data[i] = quantize_u8(x_data_fp32[i], x_scale, x_zero_point); + } + + float y_scale = 1.0f / 255.0f; + uint8_t y_zero_point = 100; + int64_t y_size = std::accumulate(y_dims.begin(), y_dims.end(), 1LL, std::multiplies()); + std::vector y_data(y_size); + CalculateAvgPoolNchwU8( + x_data.data(), x_dims, x_scale, x_zero_point, + y_data.data(), y_dims, y_scale, y_zero_point, + kernel_shape, strides, pads, count_include_pad); + + OpTester test("QLinearAveragePool", 1, onnxruntime::kMSDomain); + + test.AddAttribute("auto_pad", ""); + test.AddAttribute("strides", strides); + test.AddAttribute("pads", pads); + test.AddAttribute("kernel_shape", kernel_shape); + test.AddAttribute("count_include_pad", count_include_pad); + + test.AddInput("X", x_dims, x_data); + test.AddInput("x_scale", {}, {x_scale}); + test.AddInput("x_zero_point", {}, {x_zero_point}); + test.AddInput("y_scale", {}, {y_scale}); + test.AddInput("y_zero_point", {}, {y_zero_point}); + test.AddOutput("Y", y_dims, y_data); + + auto q8checker = [&](const std::vector& fetches, const std::string& provider_type) { + const OrtValue& ort_value = fetches[0]; + if (ort_value.Fence()) { + ort_value.Fence()->BeforeUsingAsInput(onnxruntime::kCpuExecutionProvider, 0); + } + + auto y_shape = TensorShape(y_dims); + const Tensor& output_tensor = ort_value.Get(); + ORT_ENFORCE(y_shape == output_tensor.Shape(), + "Expected output shape [" + y_shape.ToString() + "] did not match run output shape [" + + output_tensor.Shape().ToString() + "] for Y @" + provider_type); + auto* output = output_tensor.Data(); + auto size = static_cast(output_tensor.Shape().Size()); + for (int i = 0; i < size; ++i) { + int diff = abs(y_data[i] - output[i]); + EXPECT_LE(diff, 1) << "i:" << i << " expected:" << y_data[i] << " " << (int)y_data[i] + << ", got:" << output[i] << " " << (int)output[i] << ", provider_type: " << provider_type; + } + }; + test.SetCustomOutputVerifier(q8checker); + + test.Run(); +} + +TEST(QLinearPoolTest, AveragePool1D_ExcludePadPixel) { + RunQLinearAveragePoolNchwU8( + {1, 1, 5}, // x shape + {1, 1, 6}, // expected y shape + {3}, // kernel shape + {1}, // strides + {1, 2}, // pads + 0); // count_include_pad +} + +TEST(QLinearPoolTest, AveragePool1D_IncludePadPixel) { + RunQLinearAveragePoolNchwU8( + {1, 1, 5}, // x shape + {1, 1, 6}, // expected y shape + {3}, // kernel shape + {1}, // strides + {1, 2}, // pads + 1); // count_include_pad +} + +TEST(QLinearPoolTest, AveragePool2D_ExcludePadPixel) { + RunQLinearAveragePoolNchwU8( + {1, 1, 5, 7}, // x shape + {1, 1, 6, 4}, // expected y shape + {3, 4}, // kernel shape + {1, 2}, // strides + {1, 3, 2, 1}, // pads + 0); // count_include_pad +} + +TEST(QLinearPoolTest, AveragePool2D_IncludePadPixel) { + RunQLinearAveragePoolNchwU8( + {1, 1, 5, 7}, // x shape + {1, 1, 6, 4}, // expected y shape + {3, 4}, // kernel shape + {1, 2}, // strides + {1, 3, 2, 1}, // pads + 1); // count_include_pad +} + +TEST(QLinearPoolTest, AveragePool3D_ExcludePadPixel) { + RunQLinearAveragePoolNchwU8( + {1, 1, 5, 7, 9}, // x shape + {1, 1, 6, 4, 3}, // expected y shape + {3, 4, 5}, // kernel shape + {1, 2, 3}, // strides + {1, 3, 2, 2, 1, 2}, // pads + 0); // count_include_pad +} + +TEST(QLinearPoolTest, AveragePool3D_IncludePadPixel) { + RunQLinearAveragePoolNchwU8( + {1, 1, 5, 7, 9}, // x shape + {1, 1, 6, 4, 3}, // expected y shape + {3, 4, 5}, // kernel shape + {1, 2, 3}, // strides + {1, 3, 2, 2, 1, 2}, // pads + 1); // count_include_pad +} + +} // namespace test +} // namespace onnxruntime