diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md
index 4e18737a31..f317e45b0a 100644
--- a/docs/ContribOperators.md
+++ b/docs/ContribOperators.md
@@ -1784,6 +1784,8 @@ This version of the operator has been available since version 1 of the 'com.micr
auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where default value is NOTSET, which means explicit padding is used. SAME_UPPER or SAME_LOWER mean pad the input so that the output spatial size match the input.In case of odd number add the extra padding at the end for SAME_UPPER and at the beginning for SAME_LOWER. VALID mean no padding.
ceil_mode : int
Whether to use ceil or floor (default) to compute the output shape.
+channels_last : int
+Works on NHWC layout or not? Default not.
count_include_pad : int
Whether include pad pixels when calculating values for the edges. Default is 0, doesn't count include pad.
kernel_shape : list of ints (required)
diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
index 7ad397f6ef..2b2fa79019 100644
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@@ -687,3 +687,13 @@
|TransposeMatMul|(*in* A:**T**, *in* B:**T**, *out* Y:**T**)|1+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
| |
| |
+
+
+## Operators implemented by DnnlExecutionProvider
+
+| Op Name | Parameters | OpSet Version | Types Supported |
+|---------|------------|---------------|-----------------|
+|**Operator Domain:** *ai.onnx.ml*||||
+|Gemm|(*in* A:**T**, *in* B:**T**, *in* C:**T**, *out* Y:**T**)|7+|**T** = tensor(float)|
+| |
+| |
diff --git a/onnxruntime/contrib_ops/cpu/qlinear_pool.cc b/onnxruntime/contrib_ops/cpu/qlinear_pool.cc
index 9923de913a..0d39e85797 100644
--- a/onnxruntime/contrib_ops/cpu/qlinear_pool.cc
+++ b/onnxruntime/contrib_ops/cpu/qlinear_pool.cc
@@ -25,22 +25,33 @@ static inline T8Bits quantize_value(float y, float y_scale, T8Bits y_zero_point)
template <>
inline float dequantize_value(uint8_t x, float x_scale, uint8_t x_zero_point) {
- return x_scale * (static_cast(x) - x_zero_point);
+ return x_scale * (static_cast(x) - x_zero_point);
}
template <>
inline uint8_t quantize_value(float y, float y_scale, uint8_t y_zero_point) {
- return static_cast(std::max(0.0f, std::min(std::nearbyintf(y / y_scale + y_zero_point), 255.0f)));
+ return static_cast(std::max(0.0f, std::min(std::nearbyintf(y / y_scale + y_zero_point), 255.0f)));
}
+// Moves the channel dimension of `dims` in place between the NCHW position (index 1)
+// and the NHWC position (last index).
+//   from_nchw_to_nhwc == true : {N, C, d1, ..., dk} -> {N, d1, ..., dk, C}
+//   from_nchw_to_nhwc == false: {N, d1, ..., dk, C} -> {N, C, d1, ..., dk}
+// Shapes of rank < 3 have no spatial axis to move the channel across and are left
+// untouched; this also keeps dims[1] / dims.back() from being read on rank 0/1 shapes.
+static void SwitchDimsNchwNhwc(std::vector<int64_t>& dims, bool from_nchw_to_nhwc) {
+  if (dims.size() < 3) {
+    return;
+  }
+  if (from_nchw_to_nhwc) {
+    const int64_t channel = dims[1];
+    dims.erase(dims.begin() + 1);
+    dims.push_back(channel);
+  } else {
+    // Copy the value first: insert() may reallocate and invalidate back().
+    const int64_t channel = dims.back();
+    dims.insert(dims.begin() + 1, channel);
+    dims.pop_back();
+  }
+}
template
struct QLinearPool1DTask final {
const float* X_data;
T8Bits* Y_data;
float y_scale;
T8Bits y_zero_point;
- int64_t x_step;
- int64_t y_step;
+ int64_t x_image_size;
+ int64_t y_image_size;
int64_t pooled_height;
int64_t stride_h;
int64_t height;
@@ -61,8 +72,8 @@ struct QLinearPool1DTask final {
}
void operator()(std::ptrdiff_t c) const {
- const float* x_d = X_data + c * x_step;
- T8Bits* y_d = Y_data + c * y_step;
+ const float* x_d = X_data + c * x_image_size;
+ T8Bits* y_d = Y_data + c * y_image_size;
for (int64_t ph = 0; ph < pooled_height; ++ph) {
int64_t hstart = ph * stride_h - pads[0];
@@ -82,6 +93,67 @@ struct QLinearPool1DTask final {
}
};
+// Channels-last (NHWC) 1-D pooling task, intended to be run through ThreadPool::TryParallelFor
+// over the flattened range of output positions (batch * pooled_height).
+// X_data is indexed as [batch][h][channel] and Y_data as [batch][ph][channel].
+// PoolType supplies Initialize(), Process(x, acc, ctx) and Finalize(count, acc, ctx).
+template <typename T8Bits, typename PoolType>
+struct QLinearPoolNhwc1DTask final {
+  const float* X_data;
+  T8Bits* Y_data;
+  float y_scale;
+  T8Bits y_zero_point;
+  int64_t channels;
+  int64_t pooled_height;
+  int64_t stride_h;
+  int64_t height;
+  const std::vector<int64_t>& kernel_shape;
+  const std::vector<int64_t>& pads;
+  const PoolProcessContext& pool_context_;
+  const PoolAttributes& pool_attrs_;
+
+  // Per-output-position cost estimate: every channel is visited for each kernel element.
+  TensorOpCost Cost() {
+    double loop_count = static_cast<double>(channels * kernel_shape[0]);
+    return TensorOpCost{loop_count, loop_count, loop_count};
+  }
+
+  // Handles the flat output-position range [begin, end), where a flat index equals
+  // batch * pooled_height + ph. The range is cut at batch boundaries and every piece is
+  // forwarded to the per-batch overload below.
+  void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const {
+    int64_t y_image_size = pooled_height;
+    int64_t batch = begin / y_image_size;
+    int64_t offset = begin % y_image_size;
+
+    // After the first (possibly partial) batch, every following batch starts at offset 0.
+    for (int64_t remains = end - begin; remains > 0; offset = 0, batch++) {
+      if (offset + remains <= y_image_size) {
+        operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(offset + remains));
+        remains = 0;
+      } else {
+        operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(y_image_size));
+        remains -= (y_image_size - offset);
+      }
+    }
+  }
+
+  // Pools output rows [begin, end) of a single batch image.
+  void operator()(std::ptrdiff_t batch, std::ptrdiff_t begin, std::ptrdiff_t end) const {
+    const float* x_d = X_data + batch * height * channels;
+    T8Bits* y_d = Y_data + batch * pooled_height * channels;
+    // One accumulator per channel, reused for every output position.
+    std::vector<float> Yh(channels, PoolType::Initialize());
+
+    for (int64_t ph = begin, phc = begin * channels; ph < end; ++ph, phc += channels) {
+      // Clamp the pooling window to the input; the padded part contributes no samples.
+      int64_t hstart = ph * stride_h - pads[0];
+      int64_t hend = std::min(hstart + kernel_shape[0], height);
+      hstart = std::max(hstart, static_cast<int64_t>(0));
+
+      std::fill(Yh.begin(), Yh.end(), PoolType::Initialize());
+      for (int64_t h = hstart, hc = hstart * channels; h < hend; ++h, hc += channels) {
+        for (int64_t c = 0; c < channels; ++c) {
+          PoolType::Process(x_d[hc + c], Yh[c], pool_context_);
+        }
+      }
+
+      // With count_include_pad the divisor is the full kernel size, otherwise only the
+      // number of in-bounds samples.
+      int64_t element_count = (pool_attrs_.count_include_pad) ? kernel_shape[0] : hend - hstart;
+      for (int64_t c = 0; c < channels; ++c) {
+        PoolType::Finalize(element_count, Yh[c], pool_context_);
+        y_d[phc + c] = quantize_value(Yh[c], y_scale, y_zero_point);
+      }
+    }
+  }
+};
template
struct QLinearPool2DTask final {
@@ -89,8 +161,8 @@ struct QLinearPool2DTask final {
T8Bits* Y_data;
float y_scale;
T8Bits y_zero_point;
- int64_t x_step;
- int64_t y_step;
+ int64_t x_image_size;
+ int64_t y_image_size;
int64_t pooled_height;
int64_t pooled_width;
int64_t stride_h;
@@ -114,8 +186,8 @@ struct QLinearPool2DTask final {
}
void operator()(std::ptrdiff_t c) const {
- const float* x_d = X_data + c * x_step;
- T8Bits* y_d = Y_data + c * y_step;
+ const float* x_d = X_data + c * x_image_size;
+ T8Bits* y_d = Y_data + c * y_image_size;
for (int64_t ph = 0; ph < pooled_height; ++ph) {
int64_t hstart = ph * stride_h - pads[0];
@@ -144,14 +216,105 @@ struct QLinearPool2DTask final {
}
};
+// Channels-last (NHWC) 2-D pooling task, intended to be run through ThreadPool::TryParallelFor
+// over the flattened range of output positions (batch * y_image_size).
+// X_data is indexed as [batch][h][w][channel] and Y_data as [batch][ph][pw][channel];
+// x_image_size / y_image_size are the per-batch position counts used to step between batches.
+// PoolType supplies Initialize(), Process(x, acc, ctx) and Finalize(count, acc, ctx).
+template <typename T8Bits, typename PoolType>
+struct QLinearPoolNhwc2DTask final {
+  const float* X_data;
+  T8Bits* Y_data;
+  float y_scale;
+  T8Bits y_zero_point;
+  int64_t x_image_size;
+  int64_t y_image_size;
+  int64_t kernel_size;
+  int64_t channels;
+  int64_t pooled_height;
+  int64_t pooled_width;
+  int64_t stride_h;
+  int64_t stride_w;
+  int64_t height;
+  int64_t width;
+  const std::vector<int64_t>& kernel_shape;
+  const std::vector<int64_t>& pads;
+  const PoolProcessContext& pool_context_;
+  const PoolAttributes& pool_attrs_;
+
+  // Per-output-position cost estimate: every channel is visited for each kernel element.
+  TensorOpCost Cost() {
+    double loop_count = static_cast<double>(channels * kernel_size);
+    return TensorOpCost{loop_count, loop_count, loop_count};
+  }
+
+  // Handles the flat output-position range [begin, end), where a flat index equals
+  // batch * y_image_size + position. The range is cut at batch boundaries and every piece
+  // is forwarded to the per-batch overload below.
+  void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const {
+    int64_t batch = begin / y_image_size;
+    int64_t offset = begin % y_image_size;
+
+    // After the first (possibly partial) batch, every following batch starts at offset 0.
+    for (int64_t remains = end - begin; remains > 0; offset = 0, batch++) {
+      if (offset + remains <= y_image_size) {
+        operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(offset + remains));
+        remains = 0;
+      } else {
+        operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(y_image_size));
+        remains -= (y_image_size - offset);
+      }
+    }
+  }
+
+  // Pools output positions [begin, end) (row-major over ph, pw) of a single batch image.
+  void operator()(std::ptrdiff_t batch, std::ptrdiff_t begin, std::ptrdiff_t end) const {
+    const float* x_d = X_data + batch * x_image_size * channels;
+    T8Bits* y_d = Y_data + batch * y_image_size * channels;
+
+    // Calculate starting pooled_h, pooled_w from the flat position `begin`
+    int64_t start_pw = begin;
+    int64_t start_ph = start_pw / pooled_width;
+    start_pw -= (start_ph * pooled_width);
+
+    int64_t pool_index = channels * begin;
+    int64_t remains = end - begin;
+    // One accumulator per channel, reused for every output position.
+    std::vector<float> Yh(channels);
+
+    for (int64_t ph = start_ph; remains > 0 && ph < pooled_height; ++ph) {
+      // Clamp the pooling window to the input; the padded part contributes no samples.
+      int64_t hstart = ph * stride_h - pads[0];
+      int64_t hend = std::min(hstart + kernel_shape[0], height);
+      hstart = std::max(hstart, static_cast<int64_t>(0));
+      for (int64_t pw = start_pw; remains > 0 && pw < pooled_width; ++pw) {
+        int64_t wstart = pw * stride_w - pads[1];
+        int64_t wend = std::min(wstart + kernel_shape[1], width);
+        wstart = std::max(wstart, static_cast<int64_t>(0));
+
+        // do the pooling here
+        float pool_init_value = PoolType::Initialize();
+        std::fill(Yh.data(), Yh.data() + channels, pool_init_value);
+        for (int64_t h = hstart; h < hend; ++h) {
+          int64_t input_index = channels * (h * width + wstart);
+          for (int64_t w = wstart; w < wend; ++w) {
+            for (int64_t c = 0; c < channels; c++) {
+              PoolType::Process(x_d[input_index + c], Yh[c], pool_context_);
+            }
+            input_index += channels;
+          }
+        }
+
+        // With count_include_pad the divisor is the full kernel size, otherwise only the
+        // number of in-bounds samples.
+        int64_t elements_count = (pool_attrs_.count_include_pad) ? kernel_size : (hend - hstart) * (wend - wstart);
+        for (int64_t c = 0; c < channels; c++) {
+          PoolType::Finalize(elements_count, Yh[c], pool_context_);
+          auto y_value = quantize_value(Yh[c], y_scale, y_zero_point);
+          y_d[pool_index + c] = y_value;
+        }
+
+        pool_index += channels;
+        remains--;
+      }
+      // Only the first row may start mid-row; later rows start at column 0.
+      start_pw = 0;
+    }
+  }
+};
+
template
struct QLinearPool3DTask final {
const float* X_data;
T8Bits* Y_data;
float y_scale;
T8Bits y_zero_point;
- int64_t x_step;
- int64_t y_step;
+ int64_t x_image_size;
+ int64_t y_image_size;
int64_t pooled_height;
int64_t pooled_width;
int64_t pooled_depth;
@@ -179,8 +342,8 @@ struct QLinearPool3DTask final {
}
void operator()(std::ptrdiff_t c) const {
- const float* x_d = X_data + c * x_step;
- T8Bits* y_d = Y_data + c * y_step;
+ const float* x_d = X_data + c * x_image_size;
+ T8Bits* y_d = Y_data + c * y_image_size;
for (int64_t ph = 0; ph < pooled_height; ++ph) {
int64_t hstart = ph * stride_h - pads[0];
@@ -218,6 +381,110 @@ struct QLinearPool3DTask final {
}
};
+// Channels-last (NHWC-style) 3-D pooling task, intended to be run through
+// ThreadPool::TryParallelFor over the flattened range of output positions
+// (batch * y_image_size).
+// X_data is indexed as [batch][h][w][d][channel] and Y_data as [batch][ph][pw][pd][channel];
+// x_image_size / y_image_size are the per-batch position counts used to step between batches.
+// PoolType supplies Initialize(), Process(x, acc, ctx) and Finalize(count, acc, ctx).
+template <typename T8Bits, typename PoolType>
+struct QLinearPoolNhwc3DTask final {
+  const float* X_data;
+  T8Bits* Y_data;
+  float y_scale;
+  T8Bits y_zero_point;
+  int64_t x_image_size;
+  int64_t y_image_size;
+  int64_t kernel_size;
+  int64_t channels;
+  int64_t pooled_height;
+  int64_t pooled_width;
+  int64_t pooled_depth;
+  int64_t stride_h;
+  int64_t stride_w;
+  int64_t stride_d;
+  int64_t height;
+  int64_t width;
+  int64_t depth;
+  const std::vector<int64_t>& kernel_shape;
+  const std::vector<int64_t>& pads;
+  const PoolProcessContext& pool_context_;
+  const PoolAttributes& pool_attrs_;
+
+  // Per-output-position cost estimate: every channel is visited for each kernel element.
+  TensorOpCost Cost() {
+    double loop_count = static_cast<double>(channels * kernel_size);
+    return TensorOpCost{loop_count, loop_count, loop_count};
+  }
+
+  // Handles the flat output-position range [begin, end), where a flat index equals
+  // batch * y_image_size + position. The range is cut at batch boundaries and every piece
+  // is forwarded to the per-batch overload below.
+  void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const {
+    int64_t batch = begin / y_image_size;
+    int64_t offset = begin % y_image_size;
+
+    // After the first (possibly partial) batch, every following batch starts at offset 0.
+    for (int64_t remains = end - begin; remains > 0; offset = 0, batch++) {
+      if (offset + remains <= y_image_size) {
+        operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(offset + remains));
+        remains = 0;
+      } else {
+        operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(y_image_size));
+        remains -= (y_image_size - offset);
+      }
+    }
+  }
+
+  // Pools output positions [begin, end) (row-major over ph, pw, pd) of a single batch image.
+  void operator()(std::ptrdiff_t batch, std::ptrdiff_t begin, std::ptrdiff_t end) const {
+    const float* x_d = X_data + batch * x_image_size * channels;
+    T8Bits* y_d = Y_data + batch * y_image_size * channels;
+
+    // Calculate starting pooled_h, pooled_w, pooled_d from the flat position `begin`
+    int64_t start_pd = begin;
+    int64_t start_ph = start_pd / (pooled_width * pooled_depth);
+    start_pd = start_pd - (start_ph * pooled_width * pooled_depth);
+    int64_t start_pw = start_pd / pooled_depth;
+    start_pd = start_pd - start_pw * pooled_depth;
+    int64_t pool_index = channels * begin;
+    int64_t remains = end - begin;
+
+    // One accumulator per channel, reused for every output position.
+    std::vector<float> Yh(channels);
+
+    for (int64_t ph = start_ph; remains > 0 && ph < pooled_height; ++ph) {
+      // Clamp each pooling window to the input; the padded part contributes no samples.
+      int64_t hstart = ph * stride_h - pads[0];
+      int64_t hend = std::min(hstart + kernel_shape[0], height);
+      hstart = std::max(hstart, static_cast<int64_t>(0));
+      for (int64_t pw = start_pw; remains > 0 && pw < pooled_width; ++pw) {
+        int64_t wstart = pw * stride_w - pads[1];
+        int64_t wend = std::min(wstart + kernel_shape[1], width);
+        wstart = std::max(wstart, static_cast<int64_t>(0));
+        for (int64_t pd = start_pd; remains > 0 && pd < pooled_depth; ++pd) {
+          int64_t dstart = pd * stride_d - pads[2];
+          int64_t dend = std::min(dstart + kernel_shape[2], depth);
+          dstart = std::max(dstart, static_cast<int64_t>(0));
+
+          // do the pooling here
+          std::fill(Yh.begin(), Yh.end(), PoolType::Initialize());
+          for (int64_t h = hstart; h < hend; ++h) {
+            const int64_t input_index_h = h * width * depth;
+            for (int64_t w = wstart; w < wend; ++w) {
+              int64_t input_index = channels * (input_index_h + w * depth + dstart);
+              for (int64_t d = dstart; d < dend; ++d) {
+                for (int64_t c = 0; c < channels; c++) {
+                  PoolType::Process(x_d[input_index + c], Yh[c], pool_context_);
+                }
+                input_index += channels;
+              }
+            }
+          }
+
+          // With count_include_pad the divisor is the full kernel size, otherwise only the
+          // number of in-bounds samples.
+          int64_t elements_count = (pool_attrs_.count_include_pad) ? kernel_size : (hend - hstart) * (wend - wstart) * (dend - dstart);
+          for (int64_t c = 0; c < channels; c++) {
+            PoolType::Finalize(elements_count, Yh[c], pool_context_);
+            auto y_value = quantize_value(Yh[c], y_scale, y_zero_point);
+            y_d[pool_index + c] = y_value;
+          }
+
+          pool_index += channels;
+          remains--;
+        }
+        // Only the first (ph, pw) line may start mid-depth; later ones start at depth 0.
+        start_pd = 0;
+      }
+      // Only the first ph plane may start mid-row; later ones start at column 0.
+      start_pw = 0;
+    }
+  }
+};
+
Status QLinearAveragePool::Compute(OpKernelContext* context) const {
const auto tensor_x_scale = context->Input(1);
const auto tensor_x_zero_point = context->Input(2);
@@ -236,9 +503,10 @@ Status QLinearAveragePool::Compute(OpKernelContext* context) const {
const auto* X = context->Input(0);
auto dtype = X->GetElementType();
if (dtype != ONNX_NAMESPACE::TensorProto_DataType_UINT8) {
- ORT_THROW("Unsupported 'dtype' in QLinear Pooling:", dtype);
+ ORT_THROW("Unsupported 'dtype' in QLinear Pooling:", dtype);
}
- const TensorShape& x_shape = X->Shape();
+
+ TensorShape x_shape = X->Shape();
const float x_scale = *(tensor_x_scale->Data());
const float y_scale = *(tensor_y_scale->Data());
uint8_t x_zero_point = (tensor_x_zero_point ? *(tensor_x_zero_point->Data()) : (uint8_t)0);
@@ -249,12 +517,14 @@ Status QLinearAveragePool::Compute(OpKernelContext* context) const {
std::vector strides = pool_attrs_.strides;
std::vector kernel_shape = pool_attrs_.kernel_shape;
+ if (channels_last_) {
+ std::vector x_dims = x_shape.GetDims();
+ SwitchDimsNchwNhwc(x_dims, false);
+ x_shape = TensorShape(x_dims);
+ }
std::vector output_dims = pool_attrs_.SetOutputSize(x_shape, x_shape[1], &pads);
- Tensor* Y = context->Output(0, output_dims);
-
- const auto* X_data = X->Data();
- auto* Y_data = Y->MutableData();
+ int64_t batch_count = x_shape[0];
const int64_t channels = x_shape[1];
const int64_t height = x_shape[2];
const int64_t width = kernel_shape.size() > 1 ? x_shape[3] : 1;
@@ -262,9 +532,17 @@ Status QLinearAveragePool::Compute(OpKernelContext* context) const {
const int64_t pooled_height = output_dims[2];
const int64_t pooled_width = kernel_shape.size() > 1 ? output_dims[3] : 1;
const int64_t pooled_depth = kernel_shape.size() > 2 ? output_dims[4] : 1;
- const int64_t total_channels = x_shape[0] * channels;
- const int64_t x_step = height * width * depth;
- const int64_t y_step = pooled_height * pooled_width * pooled_depth;
+ const int64_t total_channels = batch_count * channels;
+ const int64_t x_image_size = height * width * depth;
+ const int64_t y_image_size = pooled_height * pooled_width * pooled_depth;
+ const int64_t kernel_size = std::accumulate(kernel_shape.begin(), kernel_shape.end(), 1LL, std::multiplies());
+
+ if (channels_last_) {
+ SwitchDimsNchwNhwc(output_dims, true);
+ }
+ Tensor* Y = context->Output(0, output_dims);
+ const auto* X_data = X->Data();
+ auto* Y_data = Y->MutableData();
ThreadPool* tp = context->GetOperatorThreadPool();
std::vector x_data_fp32;
@@ -274,42 +552,62 @@ Status QLinearAveragePool::Compute(OpKernelContext* context) const {
const auto* x8 = X_data + first;
float* x32 = x_data_fp32.data() + first;
for (ptrdiff_t i = 0, sz = last - first; i < sz; ++i) {
- *x32++ = dequantize_value(x8[i], x_scale, x_zero_point);
+ *x32++ = dequantize_value(x8[i], x_scale, x_zero_point);
}
});
}
switch (kernel_shape.size()) {
- case 1:
- {
- QLinearPool1DTask avg_pool_task_1d = {
- x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_step, y_step,
- pooled_height, strides[0], height, kernel_shape, pads, pool_context_, pool_attrs_};
- ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_1d.Cost(), avg_pool_task_1d);
+ case 1: {
+ if (channels_last_) {
+ QLinearPoolNhwc1DTask avg_pool_task_1d = {
+ x_data_fp32.data(), Y_data, y_scale, y_zero_point, channels,
+ pooled_height, strides[0], height, kernel_shape, pads, pool_context_, pool_attrs_};
+ ThreadPool::TryParallelFor(tp, y_image_size * batch_count, avg_pool_task_1d.Cost(), avg_pool_task_1d);
+ } else {
+ QLinearPool1DTask avg_pool_task_1d = {
+ x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_image_size, y_image_size,
+ pooled_height, strides[0], height, kernel_shape, pads, pool_context_, pool_attrs_};
+ ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_1d.Cost(), avg_pool_task_1d);
+ }
break;
}
- case 2:
- {
- QLinearPool2DTask avg_pool_task_2d = {
- x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_step, y_step,
- pooled_height, pooled_width, strides[0], strides[1], height, width, kernel_shape, pads, pool_context_, pool_attrs_};
- ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_2d.Cost(), avg_pool_task_2d);
+ case 2: {
+ if (channels_last_) {
+ QLinearPoolNhwc2DTask avg_pool_task_2d = {
+ x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_image_size, y_image_size, kernel_size, channels,
+ pooled_height, pooled_width, strides[0], strides[1], height, width, kernel_shape, pads, pool_context_, pool_attrs_};
+ ThreadPool::TryParallelFor(tp, y_image_size * batch_count, avg_pool_task_2d.Cost(), avg_pool_task_2d);
+
+ } else {
+ QLinearPool2DTask avg_pool_task_2d = {
+ x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_image_size, y_image_size,
+ pooled_height, pooled_width, strides[0], strides[1], height, width, kernel_shape, pads, pool_context_, pool_attrs_};
+ ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_2d.Cost(), avg_pool_task_2d);
+ }
break;
}
- case 3:
- {
- QLinearPool3DTask avg_pool_task_3d = {
- x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_step, y_step,
- pooled_height, pooled_width, pooled_depth, strides[0], strides[1], strides[2], height, width, depth,
- kernel_shape, pads, pool_context_, pool_attrs_};
- ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_3d.Cost(), avg_pool_task_3d);
+ case 3: {
+ if (channels_last_) {
+ QLinearPoolNhwc3DTask avg_pool_task_3d = {
+ x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_image_size, y_image_size, kernel_size, channels,
+ pooled_height, pooled_width, pooled_depth, strides[0], strides[1], strides[2], height, width, depth,
+ kernel_shape, pads, pool_context_, pool_attrs_};
+ ThreadPool::TryParallelFor(tp, y_image_size * batch_count, avg_pool_task_3d.Cost(), avg_pool_task_3d);
+
+ } else {
+ QLinearPool3DTask avg_pool_task_3d = {
+ x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_image_size, y_image_size,
+ pooled_height, pooled_width, pooled_depth, strides[0], strides[1], strides[2], height, width, depth,
+ kernel_shape, pads, pool_context_, pool_attrs_};
+ ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_3d.Cost(), avg_pool_task_3d);
+ }
break;
}
- default:
- {
+ default: {
return onnxruntime::common::Status(
onnxruntime::common::ONNXRUNTIME,
onnxruntime::common::INVALID_ARGUMENT,
diff --git a/onnxruntime/contrib_ops/cpu/qlinear_pool.h b/onnxruntime/contrib_ops/cpu/qlinear_pool.h
index 13175052f0..92285e4f78 100644
--- a/onnxruntime/contrib_ops/cpu/qlinear_pool.h
+++ b/onnxruntime/contrib_ops/cpu/qlinear_pool.h
@@ -12,15 +12,17 @@ namespace contrib {
class QLinearAveragePool final : public OpKernel, public PoolBase {
public:
- QLinearAveragePool(const OpKernelInfo& info) : OpKernel(info), PoolBase(info) { }
+  QLinearAveragePool(const OpKernelInfo& info) : OpKernel(info), PoolBase(info) {
+    // "channels_last" is read as an integer attribute defaulting to 0; any non-zero value enables it.
+    channels_last_ = (info.GetAttrOrDefault<int64_t>("channels_last", static_cast<int64_t>(0)) != 0);
+  }
~QLinearAveragePool() override = default;
Status Compute(OpKernelContext* context) const override;
-private:
+ private:
PoolProcessContext pool_context_;
-
+ bool channels_last_;
};
} // namespace contrib
diff --git a/onnxruntime/core/graph/contrib_ops/nhwc_schema_defs.cc b/onnxruntime/core/graph/contrib_ops/nhwc_schema_defs.cc
index fa38df6ea6..b06d9ad72d 100644
--- a/onnxruntime/core/graph/contrib_ops/nhwc_schema_defs.cc
+++ b/onnxruntime/core/graph/contrib_ops/nhwc_schema_defs.cc
@@ -4,6 +4,7 @@
#include "core/framework/tensorprotoutils.h"
#include "core/graph/constants.h"
#include "core/graph/contrib_ops/contrib_defs.h"
+#include "core/graph/contrib_ops/quantization_defs.h"
namespace ONNX_NAMESPACE {
void convPoolShapeInference(
@@ -18,7 +19,6 @@ using namespace ONNX_NAMESPACE;
namespace onnxruntime {
namespace contrib {
-
class NhwcInferenceContext : public InferenceContext {
public:
NhwcInferenceContext(InferenceContext& ctx) : ctx_(ctx) {
@@ -263,6 +263,156 @@ equal to the spatial dimension of input tensor. Input is of type uint8_t or int8
++image_dim_index;
}
});
+
+ // Operator description text; passed to SetDoc() when the QLinearAveragePool schema is registered.
+ const char* QLinearAveragePoolDoc_ver1 = R"DOC(
+ QLinearAveragePool consumes an input tensor X and applies average pooling across
+ the tensor according to kernel sizes, stride sizes, and pad lengths.
+ average pooling consisting of computing the average on all values of a
+ subset of the input tensor according to the kernel size and downsampling the
+ data into the output tensor Y for further processing. The output spatial shape will be following:
+ ```
+ output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - kernel_spatial_shape[i]) / strides_spatial_shape[i] + 1)
+ ```
+ or
+ ```
+ output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - kernel_spatial_shape[i]) / strides_spatial_shape[i] + 1)
+ ```
+ if ceil_mode is enabled
+
+ ```
+ * pad_shape[i] is sum of pads along axis i
+ ```
+
+ `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following:
+ ```
+ VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - kernel_spatial_shape[i] + 1) / strides_spatial_shape[i])
+ SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i])
+ ```
+ And pad shape will be following if `SAME_UPPER` or `SAME_LOWER`:
+ ```
+ pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + kernel_spatial_shape[i] - input_spatial_shape[i]
+ ```
+
+The output of each pooling window is divided by the number of elements (exclude pad when attribute count_include_pad is zero).
+
+Input and output scales and zero points are used to convert the output to a new quantization range.
+Output = Dequantize(Input) -> AveragePool on fp32 data -> Quantize(output)
+)DOC";
+
+ // Description text for the "pads" attribute of the QLinearAveragePool schema.
+ static const char* contrib_ops_pads_doc =
+ "Padding for the beginning and ending along each spatial axis, it can take any value greater "
+ "than or equal to 0. The value represent the number of pixels added to the beginning "
+ "and end part of the corresponding axis. `pads` format should be as follow "
+ "[x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels "
+ "added at the beginning of axis `i` and xi_end, the number of pixels added at "
+ "the end of axis `i`. This attribute cannot be used simultaneously with "
+ "auto_pad attribute. If not present, the padding defaults to 0 along start and end of each spatial axis.";
+ // Description text for the "auto_pad" attribute of the QLinearAveragePool schema.
+ static const char* contrib_ops_auto_pad_doc =
+ "auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where "
+ "default value is NOTSET, which means explicit padding is used. "
+ "SAME_UPPER or SAME_LOWER mean pad the input so that the output spatial size match the input."
+ "In case of odd number add the extra padding at the end for SAME_UPPER and at the "
+ "beginning for SAME_LOWER. VALID mean no padding.";
+
+  // Schema for QLinearAveragePool in kMSDomain, version 1.
+  // Inputs: X, x_scale, x_zero_point (optional), y_scale, y_zero_point (optional); output: Y.
+  // The "channels_last" attribute selects which shape-inference helper is used.
+  ONNX_CONTRIB_OPERATOR_SCHEMA(QLinearAveragePool)
+      .SetDomain(kMSDomain)
+      .SinceVersion(1)
+      .SetDoc(QLinearAveragePoolDoc_ver1)
+      .Attr(
+          "count_include_pad",
+          "Whether include pad pixels when calculating values for the edges. Default is 0, doesn't count include pad.",
+          AttributeProto::INT,
+          static_cast<int64_t>(0))
+      .Attr(
+          "kernel_shape",
+          "The size of the kernel along each axis.",
+          AttributeProto::INTS)
+      .Attr(
+          "strides",
+          "Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.",
+          AttributeProto::INTS,
+          OPTIONAL_VALUE)
+      .Attr(
+          "auto_pad",
+          contrib_ops_auto_pad_doc,
+          AttributeProto::STRING,
+          std::string("NOTSET"))
+      .Attr("pads", contrib_ops_pads_doc, AttributeProto::INTS, OPTIONAL_VALUE)
+      .Attr(
+          "ceil_mode",
+          "Whether to use ceil or floor (default) to compute the output shape.",
+          AttributeProto::INT,
+          static_cast<int64_t>(0))
+      .Attr("channels_last", "Works on NHWC layout or not? Default not.", AttributeProto::INT, static_cast<int64_t>(0))
+      .Input(
+          0,
+          "X",
+          "Input data tensor from the previous operator; "
+          "dimensions for image case are (N x C x H x W), "
+          "where N is the batch size, C is the number of "
+          "channels, and H and W are the height and the "
+          "width of the data. For non image case, the "
+          "dimensions are in the form of "
+          "(N x C x D1 x D2 ... Dn), where N is the batch "
+          "size. Optionally, if dimension denotation is "
+          "in effect, the operation expects the input "
+          "data tensor to arrive with the dimension denotation "
+          "of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].",
+          "T")
+      .Input(
+          1,
+          "x_scale",
+          "Input scale. It's a scalar, which means a per-tensor/layer quantization.",
+          "tensor(float)")
+      .Input(
+          2,
+          "x_zero_point",
+          "Input zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
+          "T",
+          OpSchema::Optional)
+      .Input(
+          3,
+          "y_scale",
+          "Output scale. It's a scalar, which means a per-tensor/layer quantization.",
+          "tensor(float)")
+      .Input(
+          4,
+          "y_zero_point",
+          "Output zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
+          "T",
+          OpSchema::Optional)
+      .Output(
+          0,
+          "Y",
+          "Output data tensor from average or max pooling across "
+          "the input tensor. Dimensions will vary based "
+          "on various kernel, stride, and pad sizes. Floor value of "
+          "the dimension is used",
+          "T")
+      .TypeConstraint(
+          "T",
+          {"tensor(uint8)", "tensor(int8)"},
+          "Constrain input and output types to 8 bit tensors.")
+      .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
+        // The output element type follows input 0.
+        ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 0, 0);
+
+        auto data_type = ctx.getInputType(0);
+        if (nullptr == data_type || data_type->value_case() != ONNX_NAMESPACE::TypeProto::kTensorType) {
+          fail_type_inference("inputs are expected to have tensor type.");
+        }
+
+        // validate scale and zero points
+        ValidateTypeAndShapeForScaleAndZP(ctx, 1, ONNX_NAMESPACE::TensorProto::FLOAT, true);
+        ValidateTypeAndShapeForScaleAndZP(ctx, 2, data_type->tensor_type().elem_type(), true);
+        ValidateTypeAndShapeForScaleAndZP(ctx, 3, ONNX_NAMESPACE::TensorProto::FLOAT, true);
+        ValidateTypeAndShapeForScaleAndZP(ctx, 4, data_type->tensor_type().elem_type(), true);
+
+        // channels_last == 0 uses the stock ONNX conv/pool inference; otherwise the NHWC variant.
+        if (getAttribute(ctx, "channels_last", 0) == 0) {
+          ONNX_NAMESPACE::convPoolShapeInference(ctx, false, true, 0, 5);
+        } else {
+          convPoolShapeInferenceNhwc(ctx, false, true, 0, 5);
+        }
+      });
}
} // namespace contrib
diff --git a/onnxruntime/core/graph/contrib_ops/quantization_defs.cc b/onnxruntime/core/graph/contrib_ops/quantization_defs.cc
index ef3a91727c..2202b640fc 100644
--- a/onnxruntime/core/graph/contrib_ops/quantization_defs.cc
+++ b/onnxruntime/core/graph/contrib_ops/quantization_defs.cc
@@ -1,7 +1,7 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/framework/tensorprotoutils.h"
+#include "core/graph/contrib_ops/quantization_defs.h"
#include "core/graph/constants.h"
#include "core/graph/contrib_ops/contrib_defs.h"
@@ -28,7 +28,7 @@ using ONNX_NAMESPACE::InferenceContext;
using ONNX_NAMESPACE::OpSchema;
using ONNX_NAMESPACE::OPTIONAL_VALUE;
-void ValidateTypeAndShapeForScaleAndZP(ONNX_NAMESPACE::InferenceContext& ctx, int index, ::google::protobuf::int32 expectedType, bool isScalar, int expectedTensorSize = 0) {
+void ValidateTypeAndShapeForScaleAndZP(ONNX_NAMESPACE::InferenceContext& ctx, int index, ::google::protobuf::int32 expectedType, bool isScalar, int expectedTensorSize) {
if (ctx.getNumInputs() > static_cast(index)) {
auto data_type = ctx.getInputType(index);
if (nullptr == data_type) {
@@ -546,151 +546,6 @@ This helps to improve accuracy as after ReduceMean operation the range of the ou
}
});
- const char* QLinearAveragePoolDoc_ver1 = R"DOC(
- QLinearAveragePool consumes an input tensor X and applies average pooling across
- the tensor according to kernel sizes, stride sizes, and pad lengths.
- average pooling consisting of computing the average on all values of a
- subset of the input tensor according to the kernel size and downsampling the
- data into the output tensor Y for further processing. The output spatial shape will be following:
- ```
- output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - kernel_spatial_shape[i]) / strides_spatial_shape[i] + 1)
- ```
- or
- ```
- output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - kernel_spatial_shape[i]) / strides_spatial_shape[i] + 1)
- ```
- if ceil_mode is enabled
-
- ```
- * pad_shape[i] is sum of pads along axis i
- ```
-
- `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following:
- ```
- VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - kernel_spatial_shape[i] + 1) / strides_spatial_shape[i])
- SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i])
- ```
- And pad shape will be following if `SAME_UPPER` or `SAME_LOWER`:
- ```
- pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + kernel_spatial_shape[i] - input_spatial_shape[i]
- ```
-
-The output of each pooling window is divided by the number of elements (exclude pad when attribute count_include_pad is zero).
-
-Input and output scales and zero points are used to convert the output to a new quantization range.
-Output = Dequantize(Input) -> AveragePool on fp32 data -> Quantize(output)
-)DOC";
-
- static const char* contrib_ops_pads_doc =
- "Padding for the beginning and ending along each spatial axis, it can take any value greater "
- "than or equal to 0. The value represent the number of pixels added to the beginning "
- "and end part of the corresponding axis. `pads` format should be as follow "
- "[x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels "
- "added at the beginning of axis `i` and xi_end, the number of pixels added at "
- "the end of axis `i`. This attribute cannot be used simultaneously with "
- "auto_pad attribute. If not present, the padding defaults to 0 along start and end of each spatial axis.";
- static const char* contrib_ops_auto_pad_doc =
- "auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where "
- "default value is NOTSET, which means explicit padding is used. "
- "SAME_UPPER or SAME_LOWER mean pad the input so that the output spatial size match the input."
- "In case of odd number add the extra padding at the end for SAME_UPPER and at the "
- "beginning for SAME_LOWER. VALID mean no padding.";
-
- ONNX_CONTRIB_OPERATOR_SCHEMA(QLinearAveragePool)
- .SetDomain(kMSDomain)
- .SinceVersion(1)
- .SetDoc(QLinearAveragePoolDoc_ver1)
- .Attr(
- "count_include_pad",
- "Whether include pad pixels when calculating values for the edges. Default is 0, doesn't count include pad.",
- AttributeProto::INT,
- static_cast(0))
- .Attr(
- "kernel_shape",
- "The size of the kernel along each axis.",
- AttributeProto::INTS)
- .Attr(
- "strides",
- "Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.",
- AttributeProto::INTS,
- OPTIONAL_VALUE)
- .Attr(
- "auto_pad",
- contrib_ops_auto_pad_doc,
- AttributeProto::STRING,
- std::string("NOTSET"))
- .Attr("pads", contrib_ops_pads_doc, AttributeProto::INTS, OPTIONAL_VALUE)
- .Attr(
- "ceil_mode",
- "Whether to use ceil or floor (default) to compute the output shape.",
- AttributeProto::INT,
- static_cast(0))
- .Input(
- 0,
- "X",
- "Input data tensor from the previous operator; "
- "dimensions for image case are (N x C x H x W), "
- "where N is the batch size, C is the number of "
- "channels, and H and W are the height and the "
- "width of the data. For non image case, the "
- "dimensions are in the form of "
- "(N x C x D1 x D2 ... Dn), where N is the batch "
- "size. Optionally, if dimension denotation is "
- "in effect, the operation expects the input "
- "data tensor to arrive with the dimension denotation "
- "of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].",
- "T")
- .Input(
- 1,
- "x_scale",
- "Input scale. It's a scalar, which means a per-tensor/layer quantization.",
- "tensor(float)")
- .Input(
- 2,
- "x_zero_point",
- "Input zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
- "T",
- OpSchema::Optional)
- .Input(
- 3,
- "y_scale",
- "Output scale. It's a scalar, which means a per-tensor/layer quantization.",
- "tensor(float)")
- .Input(
- 4,
- "y_zero_point",
- "Output zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
- "T",
- OpSchema::Optional)
- .Output(
- 0,
- "Y",
- "Output data tensor from average or max pooling across "
- "the input tensor. Dimensions will vary based "
- "on various kernel, stride, and pad sizes. Floor value of "
- "the dimension is used",
- "T")
- .TypeConstraint(
- "T",
- {"tensor(uint8)", "tensor(int8)"},
- "Constrain input and output types to 8 bit tensors.")
- .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
- ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 0, 0);
-
- auto data_type = ctx.getInputType(0);
- if (nullptr == data_type || data_type->value_case() != ONNX_NAMESPACE::TypeProto::kTensorType) {
- fail_type_inference("inputs are expected to have tensor type.");
- }
-
- // validate scale and zero points
- ValidateTypeAndShapeForScaleAndZP(ctx, 1, ONNX_NAMESPACE::TensorProto::FLOAT, true);
- ValidateTypeAndShapeForScaleAndZP(ctx, 2, data_type->tensor_type().elem_type(), true);
- ValidateTypeAndShapeForScaleAndZP(ctx, 3, ONNX_NAMESPACE::TensorProto::FLOAT, true);
- ValidateTypeAndShapeForScaleAndZP(ctx, 4, data_type->tensor_type().elem_type(), true);
-
- ONNX_NAMESPACE::convPoolShapeInference(ctx, false, true, 0, 5);
- });
-
const char* QLinearLeakyReluDoc_ver1 = R"DOC(
QLinearLeakyRelu takes quantized input data (Tensor), an argument alpha, and quantize parameter for output,
and produces one output data (Tensor) where the function `f(x) = quantize(alpha * dequantize(x)) for dequantize(x) < 0`,
diff --git a/onnxruntime/core/graph/contrib_ops/quantization_defs.h b/onnxruntime/core/graph/contrib_ops/quantization_defs.h
new file mode 100644
index 0000000000..44ab4b0147
--- /dev/null
+++ b/onnxruntime/core/graph/contrib_ops/quantization_defs.h
@@ -0,0 +1,19 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+#pragma once
+
+#include "core/graph/onnx_protobuf.h"
+#include "core/framework/tensorprotoutils.h"
+
+namespace onnxruntime {
+namespace contrib {
+// Declared here so schema code outside quantization_defs.cc (where it is defined) can call it on scale / zero-point inputs.
+void ValidateTypeAndShapeForScaleAndZP(
+ ONNX_NAMESPACE::InferenceContext& ctx,
+ int index,  // position of the input to check; the definition only looks at it when ctx.getNumInputs() > index
+ ::google::protobuf::int32 expectedType,  // callers pass a TensorProto element type (e.g. FLOAT for scales)
+ bool isScalar,
+ int expectedTensorSize = 0);  // the default lives on this declaration; the definition must not repeat it
+
+}  // namespace contrib
+} // namespace onnxruntime
\ No newline at end of file
diff --git a/onnxruntime/core/optimizer/nhwc_transformer.cc b/onnxruntime/core/optimizer/nhwc_transformer.cc
index 26829f71fc..a05d00c290 100644
--- a/onnxruntime/core/optimizer/nhwc_transformer.cc
+++ b/onnxruntime/core/optimizer/nhwc_transformer.cc
@@ -427,7 +427,8 @@ void NhwcTransformerImpl::Transform(Node& node) {
} else if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearLeakyRelu", {1}, kMSDomain) ||
graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearSigmoid", {1}, kMSDomain)) {
TransformQLinearActivation(node);
- } else if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearGlobalAveragePool", {1}, kMSDomain)) {
+ } else if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearGlobalAveragePool", {1}, kMSDomain) ||
+ graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearAveragePool", {1}, kMSDomain)) {
TransformQLinearGlobalAveragePool(node);
} else if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearConcat", {1}, kMSDomain)) {
TransformQLinearConcat(node);
diff --git a/onnxruntime/test/contrib_ops/qlinear_pool_test.cc b/onnxruntime/test/contrib_ops/qlinear_pool_test.cc
index 7678887fa0..94916ebec3 100644
--- a/onnxruntime/test/contrib_ops/qlinear_pool_test.cc
+++ b/onnxruntime/test/contrib_ops/qlinear_pool_test.cc
@@ -192,6 +192,105 @@ void RunQLinearAveragePoolNchwU8(
run_test(true /* only_x_not_initializer */, true /* x_y_same_zero_point */);
}
+static std::vector dims_to_nhwc(const std::vector& nchw) {  // {N, C, d1, ...} -> {N, d1, ..., C}
+ std::vector nhwc(nchw);
+ nhwc.erase(nhwc.begin() + 1);  // drop the channel extent (index 1); needs at least two dims
+ nhwc.push_back(nchw[1]);  // ...and re-append it as the last extent
+ return nhwc;
+}
+
+static std::vector transpose_to_nhwc(const std::vector& nchw_data, const std::vector& nchw_dims) {
+  // Test helper: repack channels-first data (nchw_dims = {N, C, spatial...}) so the channel index varies fastest.
+  std::vector nhwc_data(nchw_data.size());
+  auto batch_count = nchw_dims[0];
+  auto channels = nchw_dims[1];
+  int64_t image_size = std::accumulate(nchw_dims.begin() + 2, nchw_dims.end(), 1LL, std::multiplies());
+  for (int64_t b = 0; b < batch_count; b++) {
+    // One batch entry holds channels * image_size elements; leaving out `channels` makes batches overlap when N > 1 and C > 1.
+    const uint8_t* nchw_image = nchw_data.data() + (b * channels * image_size);
+    uint8_t* nhwc_image = nhwc_data.data() + (b * channels * image_size);
+    for (int64_t img_index = 0; img_index < image_size; ++img_index) {
+      for (int64_t c = 0; c < channels; c++) {
+        *nhwc_image++ = nchw_image[c * image_size + img_index];  // NHWC offset img_index * C + c <- NCHW offset c * S + img_index
+      }
+    }
+  }
+  return nhwc_data;
+}
+
+void RunQLinearAveragePoolNhwcU8(  // runs QLinearAveragePool with channels_last=1 and compares against the NCHW helper's result transposed to NHWC
+ const std::vector x_dims,  // input shape, given in NCHW order
+ const std::vector y_dims,  // expected output shape, given in NCHW order
+ const std::vector kernel_shape,
+ const std::vector strides,
+ const std::vector pads,
+ const int64_t count_include_pad = 0) {
+ float x_scale = 1.0f / 255.0f;
+ uint8_t x_zero_point = 128;
+ RandomValueGenerator random{};
+ std::vector x_data_fp32 = random.Uniform(x_dims, -0.5f, 0.5f);
+ std::vector x_data(x_data_fp32.size());
+ for (size_t i = 0; i < x_data.size(); ++i) {
+ x_data[i] = quantize_u8(x_data_fp32[i], x_scale, x_zero_point);  // quantize the random fp32 input to uint8
+ }
+
+ float y_scale = 1.0f / 255.0f;
+ uint8_t y_zero_point = 100;  // differs from x_zero_point (128), so input and output use different quantization parameters
+ int64_t y_size = std::accumulate(y_dims.begin(), y_dims.end(), 1LL, std::multiplies());
+ std::vector y_data(y_size);
+ CalculateAvgPoolNchwU8(  // fills y_data with the expected output, still in NCHW order
+ x_data.data(), x_dims, x_scale, x_zero_point,
+ y_data.data(), y_dims, y_scale, y_zero_point,
+ kernel_shape, strides, pads, count_include_pad);
+
+ // transpose the expected result and the input (data and shapes) to channels-last
+ std::vector y_data_nhwc = transpose_to_nhwc(y_data, y_dims);
+ std::vector x_data_nhwc = transpose_to_nhwc(x_data, x_dims);
+ auto x_dims_nhwc = dims_to_nhwc(x_dims);
+ auto y_dims_nhwc = dims_to_nhwc(y_dims);
+
+ OpTester test("QLinearAveragePool", 1, onnxruntime::kMSDomain);
+
+ test.AddAttribute("auto_pad", "");
+ test.AddAttribute("strides", strides);
+ test.AddAttribute("pads", pads);
+ test.AddAttribute("kernel_shape", kernel_shape);
+ test.AddAttribute("count_include_pad", count_include_pad);
+ test.AddAttribute("channels_last", (int64_t)1LL);  // non-zero: the operator treats X and Y as NHWC
+
+ test.AddInput("X", x_dims_nhwc, x_data_nhwc);
+ test.AddInput("x_scale", {}, {x_scale});
+ test.AddInput("x_zero_point", {}, {x_zero_point});
+ test.AddInput("y_scale", {}, {y_scale});
+ test.AddInput("y_zero_point", {}, {y_zero_point});
+ test.AddOutput("Y", y_dims_nhwc, y_data_nhwc);
+
+ auto q8checker = [&](const std::vector& fetches, const std::string& provider_type) {  // custom verifier: shape must match, values may be off by one
+ const OrtValue& ort_value = fetches[0];
+ if (ort_value.Fence()) {
+ ort_value.Fence()->BeforeUsingAsInput(onnxruntime::kCpuExecutionProvider, 0);
+ }
+
+ auto y_shape = TensorShape(y_dims_nhwc);
+ const Tensor& output_tensor = ort_value.Get();
+ ORT_ENFORCE(y_shape == output_tensor.Shape(),
+ "Expected output shape [" + y_shape.ToString() + "] did not match run output shape [" +
+ output_tensor.Shape().ToString() + "] for Y @" + provider_type);
+ auto* output = output_tensor.Data();
+ auto size = static_cast(output_tensor.Shape().Size());
+ for (int i = 0; i < size; ++i) {
+ int diff = abs(y_data_nhwc[i] - output[i]);
+ EXPECT_LE(diff, 1) << "i:" << i << " expected:" << y_data_nhwc[i] << " " << (int)y_data_nhwc[i]  // at most one quantization step of difference
+ << ", got:" << output[i] << " " << (int)output[i] << ", provider_type: " << provider_type;
+ }
+ };
+ test.SetCustomOutputVerifier(q8checker);
+
+ static std::unordered_set excluded_providers = {kNnapiExecutionProvider};  // the NNAPI provider is left out of this run
+
+ test.Run(OpTester::ExpectResult::kExpectSuccess, "", excluded_providers);
+}
+
TEST(QLinearPoolTest, AveragePool1D_ExcludePadPixel) {
RunQLinearAveragePoolNchwU8(
{1, 1, 5}, // x shape
@@ -252,5 +351,68 @@ TEST(QLinearPoolTest, AveragePool3D_IncludePadPixel) {
1); // count_include_pad
}
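+/*************************************************
+* Channels last tests. NOTE(review): every case below uses N == 1 and C == 1, where NCHW and NHWC buffers are identical; consider adding a C > 1 (and N > 1) case.
+**************************************************/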
+TEST(QLinearPoolTest, AveragePool1D_ExcludePadPixel_nhwc) {
+ RunQLinearAveragePoolNhwcU8(
+ {1, 1, 5}, // x shape (NCHW order; the helper converts it to NHWC)
+ {1, 1, 6}, // expected y shape (NCHW order)
+ {3}, // kernel shape
+ {1}, // strides
+ {1, 2}, // pads: {begin, end}
+ 0); // count_include_pad: 0 = pad pixels are not counted in the average
+}
+
+TEST(QLinearPoolTest, AveragePool1D_IncludePadPixel_nhwc) {
+ RunQLinearAveragePoolNhwcU8(
+ {1, 1, 5}, // x shape (NCHW order; the helper converts it to NHWC)
+ {1, 1, 6}, // expected y shape (NCHW order)
+ {3}, // kernel shape
+ {1}, // strides
+ {1, 2}, // pads: {begin, end}
+ 1); // count_include_pad: 1 = pad pixels are counted in the average
+}
+
+TEST(QLinearPoolTest, AveragePool2D_ExcludePadPixel_nhwc) {
+ RunQLinearAveragePoolNhwcU8(
+ {1, 1, 5, 7}, // x shape (NCHW order; the helper converts it to NHWC)
+ {1, 1, 6, 4}, // expected y shape (NCHW order)
+ {3, 4}, // kernel shape
+ {1, 2}, // strides
+ {1, 3, 2, 1}, // pads: {x1_begin, x2_begin, x1_end, x2_end}
+ 0); // count_include_pad: 0 = pad pixels are not counted in the average
+}
+
+TEST(QLinearPoolTest, AveragePool2D_IncludePadPixel_nhwc) {
+ RunQLinearAveragePoolNhwcU8(
+ {1, 1, 5, 7}, // x shape (NCHW order; the helper converts it to NHWC)
+ {1, 1, 6, 4}, // expected y shape (NCHW order)
+ {3, 4}, // kernel shape
+ {1, 2}, // strides
+ {1, 3, 2, 1}, // pads: {x1_begin, x2_begin, x1_end, x2_end}
+ 1); // count_include_pad: 1 = pad pixels are counted in the average
+}
+
+TEST(QLinearPoolTest, AveragePool3D_ExcludePadPixel_nhwc) {
+ RunQLinearAveragePoolNhwcU8(
+ {1, 1, 5, 7, 9}, // x shape (NCHW order; the helper converts it to NHWC)
+ {1, 1, 6, 4, 3}, // expected y shape (NCHW order)
+ {3, 4, 5}, // kernel shape
+ {1, 2, 3}, // strides
+ {1, 3, 2, 2, 1, 2}, // pads: {x1_begin, x2_begin, x3_begin, x1_end, x2_end, x3_end}
+ 0); // count_include_pad: 0 = pad pixels are not counted in the average
+}
+
+TEST(QLinearPoolTest, AveragePool3D_IncludePadPixel_nhwc) {
+ RunQLinearAveragePoolNhwcU8(
+ {1, 1, 5, 7, 9}, // x shape (NCHW order; the helper converts it to NHWC)
+ {1, 1, 6, 4, 3}, // expected y shape (NCHW order)
+ {3, 4, 5}, // kernel shape
+ {1, 2, 3}, // strides
+ {1, 3, 2, 2, 1, 2}, // pads: {x1_begin, x2_begin, x3_begin, x1_end, x2_end, x3_end}
+ 1); // count_include_pad: 1 = pad pixels are counted in the average
+}
+
} // namespace test
} // namespace onnxruntime
diff --git a/onnxruntime/test/optimizer/nhwc_transformer_test.cc b/onnxruntime/test/optimizer/nhwc_transformer_test.cc
index 55d3dd2f4b..f5824de82f 100644
--- a/onnxruntime/test/optimizer/nhwc_transformer_test.cc
+++ b/onnxruntime/test/optimizer/nhwc_transformer_test.cc
@@ -245,6 +245,51 @@ TEST(NhwcTransformerTests, ConvGlobalAveragePool) {
TransformerLevel::Level3);
}
+TEST(NhwcTransformerTests, ConvAveragePool) {  // graph: QLinearConv -> QLinearAveragePool -> QLinearConv -> QLinearAveragePool -> DequantizeLinear
+ auto build_test_case = [&](ModelTestBuilder& builder) {
+ auto* input_arg = builder.MakeInput({1, 23, 13, 13}, 0, 31);
+ auto* conv1_output_arg = builder.MakeIntermediate();
+ auto* conv2_output_arg = builder.MakeIntermediate();
+ auto* avgpool1_output_arg = builder.MakeIntermediate();
+ auto* avgpool2_output_arg = builder.MakeIntermediate();
+ auto* output_arg = builder.MakeOutput();
+ auto* conv1_weight_arg = NhwcMakeInitializer(builder, {30, 23, 3, 3});
+ auto* conv2_weight_arg = NhwcMakeInitializer(builder, {16, 30, 3, 3});
+
+ Node& conv1_node = builder.AddQLinearConvNode(input_arg, .01f, 135,
+ conv1_weight_arg, .02f, 126,
+ conv1_output_arg, .37f, 131);
+ conv1_node.AddAttribute("pads", std::vector{1, 1, 1, 1});
+ Node& avgpool_node1 = builder.AddQLinearActivationNode("QLinearAveragePool",  // built with the activation-node helper; pooling attributes are added below
+ conv1_output_arg, .37f, 131,
+ avgpool1_output_arg, .43f, 111);
+ avgpool_node1.AddAttribute("kernel_shape", std::vector{3, 3});
+ avgpool_node1.AddAttribute("pads", std::vector{1, 1, 1, 1});
+
+ builder.AddQLinearConvNode(avgpool1_output_arg, .43f, 111,
+ conv2_weight_arg, .015f, 129,
+ conv2_output_arg, .37f, 131);
+ Node& avgpool_node2 = builder.AddQLinearActivationNode("QLinearAveragePool",
+ conv2_output_arg, .37f, 131,
+ avgpool2_output_arg, .37f, 131);
+ avgpool_node2.AddAttribute("kernel_shape", std::vector{3, 3});
+ avgpool_node2.AddAttribute("pads", std::vector{1, 1, 1, 1});
+
+ builder.AddDequantizeLinearNode(avgpool2_output_arg, .37f, 131, output_arg);
+ };
+
+ auto check_nhwc_graph = [&](InferenceSessionWrapper& session) {  // assertions on the graph after the transformers have run
+ auto op_to_count = CountOpsInGraph(session.GetGraph());
+ EXPECT_EQ(op_to_count["com.microsoft.QLinearConv"], 2);
+ EXPECT_EQ(op_to_count["Transpose"], 2);  // NOTE(review): presumably one Transpose into NHWC and one back out for the whole chain - confirm
+ };
+
+ TransformerTester(build_test_case,
+ check_nhwc_graph,
+ TransformerLevel::Level2,
+ TransformerLevel::Level3);
+}
+
TEST(NhwcTransformerTests, ConvSplit) {
for (int64_t axis = -4LL; axis < 4; axis++) {
auto build_test_case = [&, axis](ModelTestBuilder& builder) {