From 50c5edcf13ac6db8e6bfcaf415fee87715db70ef Mon Sep 17 00:00:00 2001 From: Zhang Lei Date: Thu, 13 May 2021 22:05:30 -0700 Subject: [PATCH] Add nhwc support for QLinearAveragePool operator (#7656) * Add nhwc support for QLinearAveragePool operator * Update ContribOperators.md * Update OperatorKernels.md with cpu,dnnl and cuda enabled. --- docs/ContribOperators.md | 2 + docs/OperatorKernels.md | 10 + onnxruntime/contrib_ops/cpu/qlinear_pool.cc | 388 ++++++++++++++++-- onnxruntime/contrib_ops/cpu/qlinear_pool.h | 8 +- .../graph/contrib_ops/nhwc_schema_defs.cc | 152 ++++++- .../graph/contrib_ops/quantization_defs.cc | 149 +------ .../graph/contrib_ops/quantization_defs.h | 19 + .../core/optimizer/nhwc_transformer.cc | 3 +- .../test/contrib_ops/qlinear_pool_test.cc | 162 ++++++++ .../test/optimizer/nhwc_transformer_test.cc | 45 ++ 10 files changed, 741 insertions(+), 197 deletions(-) create mode 100644 onnxruntime/core/graph/contrib_ops/quantization_defs.h diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md index 4e18737a31..f317e45b0a 100644 --- a/docs/ContribOperators.md +++ b/docs/ContribOperators.md @@ -1784,6 +1784,8 @@ This version of the operator has been available since version 1 of the 'com.micr
auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where default value is NOTSET, which means explicit padding is used. SAME_UPPER or SAME_LOWER mean pad the input so that the output spatial size match the input.In case of odd number add the extra padding at the end for SAME_UPPER and at the beginning for SAME_LOWER. VALID mean no padding.
ceil_mode : int
Whether to use ceil or floor (default) to compute the output shape.
+
channels_last : int
+
Works on NHWC layout or not? Default not.
count_include_pad : int
Whether include pad pixels when calculating values for the edges. Default is 0, doesn't count include pad.
kernel_shape : list of ints (required)
diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index 7ad397f6ef..2b2fa79019 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -687,3 +687,13 @@ |TransposeMatMul|(*in* A:**T**, *in* B:**T**, *out* Y:**T**)|1+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)| | | | | + + +## Operators implemented by DnnlExecutionProvider + +| Op Name | Parameters | OpSet Version | Types Supported | +|---------|------------|---------------|-----------------| +|**Operator Domain:** *ai.onnx.ml*|||| +|Gemm|(*in* A:**T**, *in* B:**T**, *in* C:**T**, *out* Y:**T**)|7+|**T** = tensor(float)| +| | +| | diff --git a/onnxruntime/contrib_ops/cpu/qlinear_pool.cc b/onnxruntime/contrib_ops/cpu/qlinear_pool.cc index 9923de913a..0d39e85797 100644 --- a/onnxruntime/contrib_ops/cpu/qlinear_pool.cc +++ b/onnxruntime/contrib_ops/cpu/qlinear_pool.cc @@ -25,22 +25,33 @@ static inline T8Bits quantize_value(float y, float y_scale, T8Bits y_zero_point) template <> inline float dequantize_value(uint8_t x, float x_scale, uint8_t x_zero_point) { - return x_scale * (static_cast(x) - x_zero_point); + return x_scale * (static_cast(x) - x_zero_point); } template <> inline uint8_t quantize_value(float y, float y_scale, uint8_t y_zero_point) { - return static_cast(std::max(0.0f, std::min(std::nearbyintf(y / y_scale + y_zero_point), 255.0f))); + return static_cast(std::max(0.0f, std::min(std::nearbyintf(y / y_scale + y_zero_point), 255.0f))); } +static void SwitchDimsNchwNhwc(std::vector& dims, bool from_nchw_to_nhwc) { + if (from_nchw_to_nhwc) { + int64_t channel = dims[1]; + dims.erase(dims.begin() + 1); + dims.push_back(channel); + } else { + int64_t channel = dims.back(); + dims.insert(dims.begin() + 1, channel); + dims.pop_back(); + } +} template struct QLinearPool1DTask final { const float* X_data; T8Bits* Y_data; float y_scale; T8Bits y_zero_point; - int64_t x_step; - int64_t y_step; + int64_t x_image_size; + int64_t y_image_size; int64_t 
pooled_height; int64_t stride_h; int64_t height; @@ -61,8 +72,8 @@ struct QLinearPool1DTask final { } void operator()(std::ptrdiff_t c) const { - const float* x_d = X_data + c * x_step; - T8Bits* y_d = Y_data + c * y_step; + const float* x_d = X_data + c * x_image_size; + T8Bits* y_d = Y_data + c * y_image_size; for (int64_t ph = 0; ph < pooled_height; ++ph) { int64_t hstart = ph * stride_h - pads[0]; @@ -82,6 +93,67 @@ struct QLinearPool1DTask final { } }; +template +struct QLinearPoolNhwc1DTask final { + const float* X_data; + T8Bits* Y_data; + float y_scale; + T8Bits y_zero_point; + int64_t channels; + int64_t pooled_height; + int64_t stride_h; + int64_t height; + const std::vector& kernel_shape; + const std::vector& pads; + const PoolProcessContext& pool_context_; + const PoolAttributes& pool_attrs_; + + TensorOpCost Cost() { + double loop_count = static_cast(channels * kernel_shape[0]); + return TensorOpCost{loop_count, loop_count, loop_count}; + } + + void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const { + int64_t y_image_size = pooled_height; + int64_t batch = begin / y_image_size; + int64_t offset = begin % y_image_size; + + for (int64_t remains = end - begin; remains > 0; offset = 0, batch++) { + if (offset + remains <= y_image_size) { + operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(offset + remains)); + remains = 0; + } else { + operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(y_image_size)); + remains -= (y_image_size - offset); + } + } + } + + void operator()(std::ptrdiff_t batch, std::ptrdiff_t begin, std::ptrdiff_t end) const { + const float* x_d = X_data + batch * height * channels; + T8Bits* y_d = Y_data + batch * pooled_height * channels; + std::vector Yh(channels, PoolType::Initialize()); + + for (int64_t ph = begin, phc = begin * channels; ph < end; ++ph, phc += channels) { + int64_t hstart = ph * stride_h - pads[0]; + int64_t hend = std::min(hstart + kernel_shape[0], height); + 
hstart = std::max(hstart, static_cast(0)); + + std::fill(Yh.begin(), Yh.end(), PoolType::Initialize()); + for (int64_t h = hstart, hc = hstart * channels; h < hend; ++h, hc += channels) { + for (int64_t c = 0; c < channels; ++c) { + PoolType::Process(x_d[hc + c], Yh[c], pool_context_); + } + } + + int64_t element_count = (pool_attrs_.count_include_pad) ? kernel_shape[0] : hend - hstart; + for (int64_t c = 0; c < channels; ++c) { + PoolType::Finalize(element_count, Yh[c], pool_context_); + y_d[phc + c] = quantize_value(Yh[c], y_scale, y_zero_point); + } + } + } +}; template struct QLinearPool2DTask final { @@ -89,8 +161,8 @@ struct QLinearPool2DTask final { T8Bits* Y_data; float y_scale; T8Bits y_zero_point; - int64_t x_step; - int64_t y_step; + int64_t x_image_size; + int64_t y_image_size; int64_t pooled_height; int64_t pooled_width; int64_t stride_h; @@ -114,8 +186,8 @@ struct QLinearPool2DTask final { } void operator()(std::ptrdiff_t c) const { - const float* x_d = X_data + c * x_step; - T8Bits* y_d = Y_data + c * y_step; + const float* x_d = X_data + c * x_image_size; + T8Bits* y_d = Y_data + c * y_image_size; for (int64_t ph = 0; ph < pooled_height; ++ph) { int64_t hstart = ph * stride_h - pads[0]; @@ -144,14 +216,105 @@ struct QLinearPool2DTask final { } }; +template +struct QLinearPoolNhwc2DTask final { + const float* X_data; + T8Bits* Y_data; + float y_scale; + T8Bits y_zero_point; + int64_t x_image_size; + int64_t y_image_size; + int64_t kernel_size; + int64_t channels; + int64_t pooled_height; + int64_t pooled_width; + int64_t stride_h; + int64_t stride_w; + int64_t height; + int64_t width; + const std::vector& kernel_shape; + const std::vector& pads; + const PoolProcessContext& pool_context_; + const PoolAttributes& pool_attrs_; + + TensorOpCost Cost() { + double loop_count = static_cast(channels * kernel_size); + return TensorOpCost{loop_count, loop_count, loop_count}; + } + + void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const { + int64_t 
batch = begin / y_image_size; + int64_t offset = begin % y_image_size; + + for (int64_t remains = end - begin; remains > 0; offset = 0, batch++) { + if (offset + remains <= y_image_size) { + operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(offset + remains)); + remains = 0; + } else { + operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(y_image_size)); + remains -= (y_image_size - offset); + } + } + } + + void operator()(std::ptrdiff_t batch, std::ptrdiff_t begin, std::ptrdiff_t end) const { + const float* x_d = X_data + batch * x_image_size * channels; + T8Bits* y_d = Y_data + batch * y_image_size * channels; + + // Calculate starting pooled_h, pooled_w, pooled_d + int64_t start_pw = begin; + int64_t start_ph = start_pw / pooled_width; + start_pw -= (start_ph * pooled_width); + + int64_t pool_index = channels * begin; + int64_t remains = end - begin; + std::vector Yh(channels); + + for (int64_t ph = start_ph; remains > 0 && ph < pooled_height; ++ph) { + int64_t hstart = ph * stride_h - pads[0]; + int64_t hend = std::min(hstart + kernel_shape[0], height); + hstart = std::max(hstart, static_cast(0)); + for (int64_t pw = start_pw; remains > 0 && pw < pooled_width; ++pw) { + int64_t wstart = pw * stride_w - pads[1]; + int64_t wend = std::min(wstart + kernel_shape[1], width); + wstart = std::max(wstart, static_cast(0)); + + // do the pooling here + float pool_init_value = PoolType::Initialize(); + std::fill(Yh.data(), Yh.data() + channels, pool_init_value); + for (int64_t h = hstart; h < hend; ++h) { + int64_t input_index = channels * (h * width + wstart); + for (int64_t w = wstart; w < wend; ++w) { + for (int64_t c = 0; c < channels; c++) { + PoolType::Process(x_d[input_index + c], Yh[c], pool_context_); + } + input_index += channels; + } + } + + int64_t elements_count = (pool_attrs_.count_include_pad) ? 
kernel_size : (hend - hstart) * (wend - wstart); + for (int64_t c = 0; c < channels; c++) { + PoolType::Finalize(elements_count, Yh[c], pool_context_); + auto y_value = quantize_value(Yh[c], y_scale, y_zero_point); + y_d[pool_index + c] = y_value; + } + + pool_index += channels; + remains--; + } + start_pw = 0; + } + } +}; + template struct QLinearPool3DTask final { const float* X_data; T8Bits* Y_data; float y_scale; T8Bits y_zero_point; - int64_t x_step; - int64_t y_step; + int64_t x_image_size; + int64_t y_image_size; int64_t pooled_height; int64_t pooled_width; int64_t pooled_depth; @@ -179,8 +342,8 @@ struct QLinearPool3DTask final { } void operator()(std::ptrdiff_t c) const { - const float* x_d = X_data + c * x_step; - T8Bits* y_d = Y_data + c * y_step; + const float* x_d = X_data + c * x_image_size; + T8Bits* y_d = Y_data + c * y_image_size; for (int64_t ph = 0; ph < pooled_height; ++ph) { int64_t hstart = ph * stride_h - pads[0]; @@ -218,6 +381,110 @@ struct QLinearPool3DTask final { } }; +template +struct QLinearPoolNhwc3DTask final { + const float* X_data; + T8Bits* Y_data; + float y_scale; + T8Bits y_zero_point; + int64_t x_image_size; + int64_t y_image_size; + int64_t kernel_size; + int64_t channels; + int64_t pooled_height; + int64_t pooled_width; + int64_t pooled_depth; + int64_t stride_h; + int64_t stride_w; + int64_t stride_d; + int64_t height; + int64_t width; + int64_t depth; + const std::vector& kernel_shape; + const std::vector& pads; + const PoolProcessContext& pool_context_; + const PoolAttributes& pool_attrs_; + + TensorOpCost Cost() { + double loop_count = static_cast(channels * kernel_size); + return TensorOpCost{loop_count, loop_count, loop_count}; + } + + void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const { + int64_t batch = begin / y_image_size; + int64_t offset = begin % y_image_size; + + for (int64_t remains = end - begin; remains > 0; offset = 0, batch++) { + if (offset + remains <= y_image_size) { + 
operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(offset + remains)); + remains = 0; + } else { + operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(y_image_size)); + remains -= (y_image_size - offset); + } + } + } + + void operator()(std::ptrdiff_t batch, std::ptrdiff_t begin, std::ptrdiff_t end) const { + const float* x_d = X_data + batch * x_image_size * channels; + T8Bits* y_d = Y_data + batch * y_image_size * channels; + + // Calculate starting pooled_h, pooled_w, pooled_d + int64_t start_pd = begin; + int64_t start_ph = start_pd / (pooled_width * pooled_depth); + start_pd = start_pd - (start_ph * pooled_width * pooled_depth); + int64_t start_pw = start_pd / pooled_depth; + start_pd = start_pd - start_pw * pooled_depth; + int64_t pool_index = channels * begin; + int64_t remains = end - begin; + + std::vector Yh(channels); + + for (int64_t ph = start_ph; remains > 0 && ph < pooled_height; ++ph) { + int64_t hstart = ph * stride_h - pads[0]; + int64_t hend = std::min(hstart + kernel_shape[0], height); + hstart = std::max(hstart, static_cast(0)); + for (int64_t pw = start_pw; remains > 0 && pw < pooled_width; ++pw) { + int64_t wstart = pw * stride_w - pads[1]; + int64_t wend = std::min(wstart + kernel_shape[1], width); + wstart = std::max(wstart, static_cast(0)); + for (int64_t pd = start_pd; remains > 0 && pd < pooled_depth; ++pd) { + int64_t dstart = pd * stride_d - pads[2]; + int64_t dend = std::min(dstart + kernel_shape[2], depth); + dstart = std::max(dstart, static_cast(0)); + + // do the pooling here + std::fill(Yh.begin(), Yh.end(), PoolType::Initialize()); + for (int64_t h = hstart; h < hend; ++h) { + const int64_t input_index_h = h * width * depth; + for (int64_t w = wstart; w < wend; ++w) { + int64_t input_index = channels * (input_index_h + w * depth + dstart); + for (int64_t d = dstart; d < dend; ++d) { + for (int64_t c = 0; c < channels; c++) { + PoolType::Process(x_d[input_index + c], Yh[c], 
pool_context_); + } + input_index += channels; + } + } + } + + int64_t elements_count = (pool_attrs_.count_include_pad) ? kernel_size : (hend - hstart) * (wend - wstart) * (dend - dstart); + for (int64_t c = 0; c < channels; c++) { + PoolType::Finalize(elements_count, Yh[c], pool_context_); + auto y_value = quantize_value(Yh[c], y_scale, y_zero_point); + y_d[pool_index + c] = y_value; + } + + pool_index += channels; + remains--; + } + start_pd = 0; + } + start_pw = 0; + } + } +}; + Status QLinearAveragePool::Compute(OpKernelContext* context) const { const auto tensor_x_scale = context->Input(1); const auto tensor_x_zero_point = context->Input(2); @@ -236,9 +503,10 @@ Status QLinearAveragePool::Compute(OpKernelContext* context) const { const auto* X = context->Input(0); auto dtype = X->GetElementType(); if (dtype != ONNX_NAMESPACE::TensorProto_DataType_UINT8) { - ORT_THROW("Unsupported 'dtype' in QLinear Pooling:", dtype); + ORT_THROW("Unsupported 'dtype' in QLinear Pooling:", dtype); } - const TensorShape& x_shape = X->Shape(); + + TensorShape x_shape = X->Shape(); const float x_scale = *(tensor_x_scale->Data()); const float y_scale = *(tensor_y_scale->Data()); uint8_t x_zero_point = (tensor_x_zero_point ? *(tensor_x_zero_point->Data()) : (uint8_t)0); @@ -249,12 +517,14 @@ Status QLinearAveragePool::Compute(OpKernelContext* context) const { std::vector strides = pool_attrs_.strides; std::vector kernel_shape = pool_attrs_.kernel_shape; + if (channels_last_) { + std::vector x_dims = x_shape.GetDims(); + SwitchDimsNchwNhwc(x_dims, false); + x_shape = TensorShape(x_dims); + } std::vector output_dims = pool_attrs_.SetOutputSize(x_shape, x_shape[1], &pads); - Tensor* Y = context->Output(0, output_dims); - - const auto* X_data = X->Data(); - auto* Y_data = Y->MutableData(); + int64_t batch_count = x_shape[0]; const int64_t channels = x_shape[1]; const int64_t height = x_shape[2]; const int64_t width = kernel_shape.size() > 1 ? 
x_shape[3] : 1; @@ -262,9 +532,17 @@ Status QLinearAveragePool::Compute(OpKernelContext* context) const { const int64_t pooled_height = output_dims[2]; const int64_t pooled_width = kernel_shape.size() > 1 ? output_dims[3] : 1; const int64_t pooled_depth = kernel_shape.size() > 2 ? output_dims[4] : 1; - const int64_t total_channels = x_shape[0] * channels; - const int64_t x_step = height * width * depth; - const int64_t y_step = pooled_height * pooled_width * pooled_depth; + const int64_t total_channels = batch_count * channels; + const int64_t x_image_size = height * width * depth; + const int64_t y_image_size = pooled_height * pooled_width * pooled_depth; + const int64_t kernel_size = std::accumulate(kernel_shape.begin(), kernel_shape.end(), 1LL, std::multiplies()); + + if (channels_last_) { + SwitchDimsNchwNhwc(output_dims, true); + } + Tensor* Y = context->Output(0, output_dims); + const auto* X_data = X->Data(); + auto* Y_data = Y->MutableData(); ThreadPool* tp = context->GetOperatorThreadPool(); std::vector x_data_fp32; @@ -274,42 +552,62 @@ Status QLinearAveragePool::Compute(OpKernelContext* context) const { const auto* x8 = X_data + first; float* x32 = x_data_fp32.data() + first; for (ptrdiff_t i = 0, sz = last - first; i < sz; ++i) { - *x32++ = dequantize_value(x8[i], x_scale, x_zero_point); + *x32++ = dequantize_value(x8[i], x_scale, x_zero_point); } }); } switch (kernel_shape.size()) { - case 1: - { - QLinearPool1DTask avg_pool_task_1d = { - x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_step, y_step, - pooled_height, strides[0], height, kernel_shape, pads, pool_context_, pool_attrs_}; - ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_1d.Cost(), avg_pool_task_1d); + case 1: { + if (channels_last_) { + QLinearPoolNhwc1DTask avg_pool_task_1d = { + x_data_fp32.data(), Y_data, y_scale, y_zero_point, channels, + pooled_height, strides[0], height, kernel_shape, pads, pool_context_, pool_attrs_}; + ThreadPool::TryParallelFor(tp, y_image_size * 
batch_count, avg_pool_task_1d.Cost(), avg_pool_task_1d); + } else { + QLinearPool1DTask avg_pool_task_1d = { + x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_image_size, y_image_size, + pooled_height, strides[0], height, kernel_shape, pads, pool_context_, pool_attrs_}; + ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_1d.Cost(), avg_pool_task_1d); + } break; } - case 2: - { - QLinearPool2DTask avg_pool_task_2d = { - x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_step, y_step, - pooled_height, pooled_width, strides[0], strides[1], height, width, kernel_shape, pads, pool_context_, pool_attrs_}; - ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_2d.Cost(), avg_pool_task_2d); + case 2: { + if (channels_last_) { + QLinearPoolNhwc2DTask avg_pool_task_2d = { + x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_image_size, y_image_size, kernel_size, channels, + pooled_height, pooled_width, strides[0], strides[1], height, width, kernel_shape, pads, pool_context_, pool_attrs_}; + ThreadPool::TryParallelFor(tp, y_image_size * batch_count, avg_pool_task_2d.Cost(), avg_pool_task_2d); + + } else { + QLinearPool2DTask avg_pool_task_2d = { + x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_image_size, y_image_size, + pooled_height, pooled_width, strides[0], strides[1], height, width, kernel_shape, pads, pool_context_, pool_attrs_}; + ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_2d.Cost(), avg_pool_task_2d); + } break; } - case 3: - { - QLinearPool3DTask avg_pool_task_3d = { - x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_step, y_step, - pooled_height, pooled_width, pooled_depth, strides[0], strides[1], strides[2], height, width, depth, - kernel_shape, pads, pool_context_, pool_attrs_}; - ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_3d.Cost(), avg_pool_task_3d); + case 3: { + if (channels_last_) { + QLinearPoolNhwc3DTask avg_pool_task_3d = { + x_data_fp32.data(), Y_data, y_scale, y_zero_point, 
x_image_size, y_image_size, kernel_size, channels, + pooled_height, pooled_width, pooled_depth, strides[0], strides[1], strides[2], height, width, depth, + kernel_shape, pads, pool_context_, pool_attrs_}; + ThreadPool::TryParallelFor(tp, y_image_size * batch_count, avg_pool_task_3d.Cost(), avg_pool_task_3d); + + } else { + QLinearPool3DTask avg_pool_task_3d = { + x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_image_size, y_image_size, + pooled_height, pooled_width, pooled_depth, strides[0], strides[1], strides[2], height, width, depth, + kernel_shape, pads, pool_context_, pool_attrs_}; + ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_3d.Cost(), avg_pool_task_3d); + } break; } - default: - { + default: { return onnxruntime::common::Status( onnxruntime::common::ONNXRUNTIME, onnxruntime::common::INVALID_ARGUMENT, diff --git a/onnxruntime/contrib_ops/cpu/qlinear_pool.h b/onnxruntime/contrib_ops/cpu/qlinear_pool.h index 13175052f0..92285e4f78 100644 --- a/onnxruntime/contrib_ops/cpu/qlinear_pool.h +++ b/onnxruntime/contrib_ops/cpu/qlinear_pool.h @@ -12,15 +12,17 @@ namespace contrib { class QLinearAveragePool final : public OpKernel, public PoolBase { public: - QLinearAveragePool(const OpKernelInfo& info) : OpKernel(info), PoolBase(info) { } + QLinearAveragePool(const OpKernelInfo& info) : OpKernel(info), PoolBase(info) { + channels_last_ = (info.GetAttrOrDefault("channels_last", static_cast(0)) != 0); + } ~QLinearAveragePool() override = default; Status Compute(OpKernelContext* context) const override; -private: + private: PoolProcessContext pool_context_; - + bool channels_last_; }; } // namespace contrib diff --git a/onnxruntime/core/graph/contrib_ops/nhwc_schema_defs.cc b/onnxruntime/core/graph/contrib_ops/nhwc_schema_defs.cc index fa38df6ea6..b06d9ad72d 100644 --- a/onnxruntime/core/graph/contrib_ops/nhwc_schema_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/nhwc_schema_defs.cc @@ -4,6 +4,7 @@ #include "core/framework/tensorprotoutils.h" 
#include "core/graph/constants.h" #include "core/graph/contrib_ops/contrib_defs.h" +#include "core/graph/contrib_ops/quantization_defs.h" namespace ONNX_NAMESPACE { void convPoolShapeInference( @@ -18,7 +19,6 @@ using namespace ONNX_NAMESPACE; namespace onnxruntime { namespace contrib { - class NhwcInferenceContext : public InferenceContext { public: NhwcInferenceContext(InferenceContext& ctx) : ctx_(ctx) { @@ -263,6 +263,156 @@ equal to the spatial dimension of input tensor. Input is of type uint8_t or int8 ++image_dim_index; } }); + + const char* QLinearAveragePoolDoc_ver1 = R"DOC( + QLinearAveragePool consumes an input tensor X and applies average pooling across + the tensor according to kernel sizes, stride sizes, and pad lengths. + average pooling consisting of computing the average on all values of a + subset of the input tensor according to the kernel size and downsampling the + data into the output tensor Y for further processing. The output spatial shape will be following: + ``` + output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - kernel_spatial_shape[i]) / strides_spatial_shape[i] + 1) + ``` + or + ``` + output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - kernel_spatial_shape[i]) / strides_spatial_shape[i] + 1) + ``` + if ceil_mode is enabled + + ``` + * pad_shape[i] is sum of pads along axis i + ``` + + `auto_pad` is a DEPRECATED attribute. 
If you are using them currently, the output spatial shape will be following: + ``` + VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - kernel_spatial_shape[i] + 1) / strides_spatial_shape[i]) + SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i]) + ``` + And pad shape will be following if `SAME_UPPER` or `SAME_LOWER`: + ``` + pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + kernel_spatial_shape[i] - input_spatial_shape[i] + ``` + +The output of each pooling window is divided by the number of elements (exclude pad when attribute count_include_pad is zero). + +Input and output scales and zero points are used to convert the output to a new quantization range. +Output = Dequantize(Input) -> AveragePool on fp32 data -> Quantize(output) +)DOC"; + + static const char* contrib_ops_pads_doc = + "Padding for the beginning and ending along each spatial axis, it can take any value greater " + "than or equal to 0. The value represent the number of pixels added to the beginning " + "and end part of the corresponding axis. `pads` format should be as follow " + "[x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels " + "added at the beginning of axis `i` and xi_end, the number of pixels added at " + "the end of axis `i`. This attribute cannot be used simultaneously with " + "auto_pad attribute. If not present, the padding defaults to 0 along start and end of each spatial axis."; + static const char* contrib_ops_auto_pad_doc = + "auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where " + "default value is NOTSET, which means explicit padding is used. " + "SAME_UPPER or SAME_LOWER mean pad the input so that the output spatial size match the input." + "In case of odd number add the extra padding at the end for SAME_UPPER and at the " + "beginning for SAME_LOWER. 
VALID mean no padding."; + + ONNX_CONTRIB_OPERATOR_SCHEMA(QLinearAveragePool) + .SetDomain(kMSDomain) + .SinceVersion(1) + .SetDoc(QLinearAveragePoolDoc_ver1) + .Attr( + "count_include_pad", + "Whether include pad pixels when calculating values for the edges. Default is 0, doesn't count include pad.", + AttributeProto::INT, + static_cast(0)) + .Attr( + "kernel_shape", + "The size of the kernel along each axis.", + AttributeProto::INTS) + .Attr( + "strides", + "Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.", + AttributeProto::INTS, + OPTIONAL_VALUE) + .Attr( + "auto_pad", + contrib_ops_auto_pad_doc, + AttributeProto::STRING, + std::string("NOTSET")) + .Attr("pads", contrib_ops_pads_doc, AttributeProto::INTS, OPTIONAL_VALUE) + .Attr( + "ceil_mode", + "Whether to use ceil or floor (default) to compute the output shape.", + AttributeProto::INT, + static_cast(0)) + .Attr("channels_last", "Works on NHWC layout or not? Default not.", AttributeProto::INT, static_cast(0)) + .Input( + 0, + "X", + "Input data tensor from the previous operator; " + "dimensions for image case are (N x C x H x W), " + "where N is the batch size, C is the number of " + "channels, and H and W are the height and the " + "width of the data. For non image case, the " + "dimensions are in the form of " + "(N x C x D1 x D2 ... Dn), where N is the batch " + "size. Optionally, if dimension denotation is " + "in effect, the operation expects the input " + "data tensor to arrive with the dimension denotation " + "of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].", + "T") + .Input( + 1, + "x_scale", + "Input scale. It's a scalar, which means a per-tensor/layer quantization.", + "tensor(float)") + .Input( + 2, + "x_zero_point", + "Input zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.", + "T", + OpSchema::Optional) + .Input( + 3, + "y_scale", + "Output scale. 
It's a scalar, which means a per-tensor/layer quantization.", + "tensor(float)") + .Input( + 4, + "y_zero_point", + "Output zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.", + "T", + OpSchema::Optional) + .Output( + 0, + "Y", + "Output data tensor from average or max pooling across " + "the input tensor. Dimensions will vary based " + "on various kernel, stride, and pad sizes. Floor value of " + "the dimension is used", + "T") + .TypeConstraint( + "T", + {"tensor(uint8)", "tensor(int8)"}, + "Constrain input and output types to 8 bit tensors.") + .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { + ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 0, 0); + + auto data_type = ctx.getInputType(0); + if (nullptr == data_type || data_type->value_case() != ONNX_NAMESPACE::TypeProto::kTensorType) { + fail_type_inference("inputs are expected to have tensor type."); + } + + // validate scale and zero points + ValidateTypeAndShapeForScaleAndZP(ctx, 1, ONNX_NAMESPACE::TensorProto::FLOAT, true); + ValidateTypeAndShapeForScaleAndZP(ctx, 2, data_type->tensor_type().elem_type(), true); + ValidateTypeAndShapeForScaleAndZP(ctx, 3, ONNX_NAMESPACE::TensorProto::FLOAT, true); + ValidateTypeAndShapeForScaleAndZP(ctx, 4, data_type->tensor_type().elem_type(), true); + + if (getAttribute(ctx, "channels_last", 0) == 0) { + ONNX_NAMESPACE::convPoolShapeInference(ctx, false, true, 0, 5); + } else { + convPoolShapeInferenceNhwc(ctx, false, true, 0, 5); + } + }); } } // namespace contrib diff --git a/onnxruntime/core/graph/contrib_ops/quantization_defs.cc b/onnxruntime/core/graph/contrib_ops/quantization_defs.cc index ef3a91727c..2202b640fc 100644 --- a/onnxruntime/core/graph/contrib_ops/quantization_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/quantization_defs.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#include "core/framework/tensorprotoutils.h" +#include "core/graph/contrib_ops/quantization_defs.h" #include "core/graph/constants.h" #include "core/graph/contrib_ops/contrib_defs.h" @@ -28,7 +28,7 @@ using ONNX_NAMESPACE::InferenceContext; using ONNX_NAMESPACE::OpSchema; using ONNX_NAMESPACE::OPTIONAL_VALUE; -void ValidateTypeAndShapeForScaleAndZP(ONNX_NAMESPACE::InferenceContext& ctx, int index, ::google::protobuf::int32 expectedType, bool isScalar, int expectedTensorSize = 0) { +void ValidateTypeAndShapeForScaleAndZP(ONNX_NAMESPACE::InferenceContext& ctx, int index, ::google::protobuf::int32 expectedType, bool isScalar, int expectedTensorSize) { if (ctx.getNumInputs() > static_cast(index)) { auto data_type = ctx.getInputType(index); if (nullptr == data_type) { @@ -546,151 +546,6 @@ This helps to improve accuracy as after ReduceMean operation the range of the ou } }); - const char* QLinearAveragePoolDoc_ver1 = R"DOC( - QLinearAveragePool consumes an input tensor X and applies average pooling across - the tensor according to kernel sizes, stride sizes, and pad lengths. - average pooling consisting of computing the average on all values of a - subset of the input tensor according to the kernel size and downsampling the - data into the output tensor Y for further processing. The output spatial shape will be following: - ``` - output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - kernel_spatial_shape[i]) / strides_spatial_shape[i] + 1) - ``` - or - ``` - output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - kernel_spatial_shape[i]) / strides_spatial_shape[i] + 1) - ``` - if ceil_mode is enabled - - ``` - * pad_shape[i] is sum of pads along axis i - ``` - - `auto_pad` is a DEPRECATED attribute. 
If you are using them currently, the output spatial shape will be following: - ``` - VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - kernel_spatial_shape[i] + 1) / strides_spatial_shape[i]) - SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i]) - ``` - And pad shape will be following if `SAME_UPPER` or `SAME_LOWER`: - ``` - pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + kernel_spatial_shape[i] - input_spatial_shape[i] - ``` - -The output of each pooling window is divided by the number of elements (exclude pad when attribute count_include_pad is zero). - -Input and output scales and zero points are used to convert the output to a new quantization range. -Output = Dequantize(Input) -> AveragePool on fp32 data -> Quantize(output) -)DOC"; - - static const char* contrib_ops_pads_doc = - "Padding for the beginning and ending along each spatial axis, it can take any value greater " - "than or equal to 0. The value represent the number of pixels added to the beginning " - "and end part of the corresponding axis. `pads` format should be as follow " - "[x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels " - "added at the beginning of axis `i` and xi_end, the number of pixels added at " - "the end of axis `i`. This attribute cannot be used simultaneously with " - "auto_pad attribute. If not present, the padding defaults to 0 along start and end of each spatial axis."; - static const char* contrib_ops_auto_pad_doc = - "auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where " - "default value is NOTSET, which means explicit padding is used. " - "SAME_UPPER or SAME_LOWER mean pad the input so that the output spatial size match the input." - "In case of odd number add the extra padding at the end for SAME_UPPER and at the " - "beginning for SAME_LOWER. 
VALID mean no padding."; - - ONNX_CONTRIB_OPERATOR_SCHEMA(QLinearAveragePool) - .SetDomain(kMSDomain) - .SinceVersion(1) - .SetDoc(QLinearAveragePoolDoc_ver1) - .Attr( - "count_include_pad", - "Whether include pad pixels when calculating values for the edges. Default is 0, doesn't count include pad.", - AttributeProto::INT, - static_cast(0)) - .Attr( - "kernel_shape", - "The size of the kernel along each axis.", - AttributeProto::INTS) - .Attr( - "strides", - "Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.", - AttributeProto::INTS, - OPTIONAL_VALUE) - .Attr( - "auto_pad", - contrib_ops_auto_pad_doc, - AttributeProto::STRING, - std::string("NOTSET")) - .Attr("pads", contrib_ops_pads_doc, AttributeProto::INTS, OPTIONAL_VALUE) - .Attr( - "ceil_mode", - "Whether to use ceil or floor (default) to compute the output shape.", - AttributeProto::INT, - static_cast(0)) - .Input( - 0, - "X", - "Input data tensor from the previous operator; " - "dimensions for image case are (N x C x H x W), " - "where N is the batch size, C is the number of " - "channels, and H and W are the height and the " - "width of the data. For non image case, the " - "dimensions are in the form of " - "(N x C x D1 x D2 ... Dn), where N is the batch " - "size. Optionally, if dimension denotation is " - "in effect, the operation expects the input " - "data tensor to arrive with the dimension denotation " - "of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].", - "T") - .Input( - 1, - "x_scale", - "Input scale. It's a scalar, which means a per-tensor/layer quantization.", - "tensor(float)") - .Input( - 2, - "x_zero_point", - "Input zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.", - "T", - OpSchema::Optional) - .Input( - 3, - "y_scale", - "Output scale. 
It's a scalar, which means a per-tensor/layer quantization.", - "tensor(float)") - .Input( - 4, - "y_zero_point", - "Output zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.", - "T", - OpSchema::Optional) - .Output( - 0, - "Y", - "Output data tensor from average or max pooling across " - "the input tensor. Dimensions will vary based " - "on various kernel, stride, and pad sizes. Floor value of " - "the dimension is used", - "T") - .TypeConstraint( - "T", - {"tensor(uint8)", "tensor(int8)"}, - "Constrain input and output types to 8 bit tensors.") - .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { - ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 0, 0); - - auto data_type = ctx.getInputType(0); - if (nullptr == data_type || data_type->value_case() != ONNX_NAMESPACE::TypeProto::kTensorType) { - fail_type_inference("inputs are expected to have tensor type."); - } - - // validate scale and zero points - ValidateTypeAndShapeForScaleAndZP(ctx, 1, ONNX_NAMESPACE::TensorProto::FLOAT, true); - ValidateTypeAndShapeForScaleAndZP(ctx, 2, data_type->tensor_type().elem_type(), true); - ValidateTypeAndShapeForScaleAndZP(ctx, 3, ONNX_NAMESPACE::TensorProto::FLOAT, true); - ValidateTypeAndShapeForScaleAndZP(ctx, 4, data_type->tensor_type().elem_type(), true); - - ONNX_NAMESPACE::convPoolShapeInference(ctx, false, true, 0, 5); - }); - const char* QLinearLeakyReluDoc_ver1 = R"DOC( QLinearLeakyRelu takes quantized input data (Tensor), an argument alpha, and quantize parameter for output, and produces one output data (Tensor) where the function `f(x) = quantize(alpha * dequantize(x)) for dequantize(x) < 0`, diff --git a/onnxruntime/core/graph/contrib_ops/quantization_defs.h b/onnxruntime/core/graph/contrib_ops/quantization_defs.h new file mode 100644 index 0000000000..44ab4b0147 --- /dev/null +++ b/onnxruntime/core/graph/contrib_ops/quantization_defs.h @@ -0,0 +1,19 @@ +// Copyright (c) 
Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +#pragma once + +#include "core/graph/onnx_protobuf.h" +#include "core/framework/tensorprotoutils.h" + +namespace onnxruntime { +namespace contrib { + +void ValidateTypeAndShapeForScaleAndZP( + ONNX_NAMESPACE::InferenceContext& ctx, + int index, + ::google::protobuf::int32 expectedType, + bool isScalar, + int expectedTensorSize = 0); + +} +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/optimizer/nhwc_transformer.cc b/onnxruntime/core/optimizer/nhwc_transformer.cc index 26829f71fc..a05d00c290 100644 --- a/onnxruntime/core/optimizer/nhwc_transformer.cc +++ b/onnxruntime/core/optimizer/nhwc_transformer.cc @@ -427,7 +427,8 @@ void NhwcTransformerImpl::Transform(Node& node) { } else if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearLeakyRelu", {1}, kMSDomain) || graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearSigmoid", {1}, kMSDomain)) { TransformQLinearActivation(node); - } else if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearGlobalAveragePool", {1}, kMSDomain)) { + } else if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearGlobalAveragePool", {1}, kMSDomain) || + graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearAveragePool", {1}, kMSDomain)) { TransformQLinearGlobalAveragePool(node); } else if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearConcat", {1}, kMSDomain)) { TransformQLinearConcat(node); diff --git a/onnxruntime/test/contrib_ops/qlinear_pool_test.cc b/onnxruntime/test/contrib_ops/qlinear_pool_test.cc index 7678887fa0..94916ebec3 100644 --- a/onnxruntime/test/contrib_ops/qlinear_pool_test.cc +++ b/onnxruntime/test/contrib_ops/qlinear_pool_test.cc @@ -192,6 +192,105 @@ void RunQLinearAveragePoolNchwU8( run_test(true /* only_x_not_initializer */, true /* x_y_same_zero_point */); } +static std::vector dims_to_nhwc(const std::vector& nchw) { + std::vector 
// Reorders a flat uint8 buffer from NCHW layout (N x C x D1 x ... x Dn) into
// NHWC layout (N x D1 x ... x Dn x C), i.e. the channel index becomes the
// fastest-varying one inside every batch entry.
//
// nchw_data: source elements, batch-major, then channel, then spatial.
// nchw_dims: shape of nchw_data as {N, C, D1, ..., Dn}. It must contain at
//            least the N and C entries, and nchw_data must hold
//            N * C * D1 * ... * Dn elements; neither is checked here.
// Returns:   a new buffer of the same size in NHWC order.
static std::vector<uint8_t> transpose_to_nhwc(const std::vector<uint8_t>& nchw_data,
                                              const std::vector<int64_t>& nchw_dims) {
  std::vector<uint8_t> nhwc_data(nchw_data.size());

  const int64_t batch_count = nchw_dims[0];
  const int64_t channels = nchw_dims[1];
  // Number of spatial positions in a single channel plane.
  const int64_t image_size = std::accumulate(nchw_dims.begin() + 2, nchw_dims.end(),
                                             int64_t{1}, std::multiplies<int64_t>());
  // One batch entry spans every channel plane, so consecutive batch entries are
  // channels * image_size elements apart. Stepping by image_size alone would
  // read the wrong planes and make the destination ranges of successive
  // batches overlap whenever N > 1 and C > 1.
  const int64_t batch_stride = channels * image_size;

  for (int64_t b = 0; b < batch_count; b++) {
    const uint8_t* nchw_image = nchw_data.data() + (b * batch_stride);
    uint8_t* nhwc_image = nhwc_data.data() + (b * batch_stride);
    for (int64_t img_index = 0; img_index < image_size; ++img_index) {
      // Gather the value of every channel at this spatial position and store
      // them contiguously.
      for (int64_t c = 0; c < channels; c++) {
        *nhwc_image++ = nchw_image[c * image_size + img_index];
      }
    }
  }

  return nhwc_data;
}
+ test.AddAttribute("auto_pad", ""); + test.AddAttribute("strides", strides); + test.AddAttribute("pads", pads); + test.AddAttribute("kernel_shape", kernel_shape); + test.AddAttribute("count_include_pad", count_include_pad); + test.AddAttribute("channels_last", (int64_t)1LL); + + test.AddInput("X", x_dims_nhwc, x_data_nhwc); + test.AddInput("x_scale", {}, {x_scale}); + test.AddInput("x_zero_point", {}, {x_zero_point}); + test.AddInput("y_scale", {}, {y_scale}); + test.AddInput("y_zero_point", {}, {y_zero_point}); + test.AddOutput("Y", y_dims_nhwc, y_data_nhwc); + + auto q8checker = [&](const std::vector& fetches, const std::string& provider_type) { + const OrtValue& ort_value = fetches[0]; + if (ort_value.Fence()) { + ort_value.Fence()->BeforeUsingAsInput(onnxruntime::kCpuExecutionProvider, 0); + } + + auto y_shape = TensorShape(y_dims_nhwc); + const Tensor& output_tensor = ort_value.Get(); + ORT_ENFORCE(y_shape == output_tensor.Shape(), + "Expected output shape [" + y_shape.ToString() + "] did not match run output shape [" + + output_tensor.Shape().ToString() + "] for Y @" + provider_type); + auto* output = output_tensor.Data(); + auto size = static_cast(output_tensor.Shape().Size()); + for (int i = 0; i < size; ++i) { + int diff = abs(y_data_nhwc[i] - output[i]); + EXPECT_LE(diff, 1) << "i:" << i << " expected:" << y_data_nhwc[i] << " " << (int)y_data_nhwc[i] + << ", got:" << output[i] << " " << (int)output[i] << ", provider_type: " << provider_type; + } + }; + test.SetCustomOutputVerifier(q8checker); + + static std::unordered_set excluded_providers = {kNnapiExecutionProvider}; + + test.Run(OpTester::ExpectResult::kExpectSuccess, "", excluded_providers); +} + TEST(QLinearPoolTest, AveragePool1D_ExcludePadPixel) { RunQLinearAveragePoolNchwU8( {1, 1, 5}, // x shape @@ -252,5 +351,68 @@ TEST(QLinearPoolTest, AveragePool3D_IncludePadPixel) { 1); // count_include_pad } +/************************************************* +* Channels last test 
+**************************************************/ +TEST(QLinearPoolTest, AveragePool1D_ExcludePadPixel_nhwc) { + RunQLinearAveragePoolNhwcU8( + {1, 1, 5}, // x shape + {1, 1, 6}, // expected y shape + {3}, // kernel shape + {1}, // strides + {1, 2}, // pads + 0); // count_include_pad +} + +TEST(QLinearPoolTest, AveragePool1D_IncludePadPixel_nhwc) { + RunQLinearAveragePoolNhwcU8( + {1, 1, 5}, // x shape + {1, 1, 6}, // expected y shape + {3}, // kernel shape + {1}, // strides + {1, 2}, // pads + 1); // count_include_pad +} + +TEST(QLinearPoolTest, AveragePool2D_ExcludePadPixel_nhwc) { + RunQLinearAveragePoolNhwcU8( + {1, 1, 5, 7}, // x shape + {1, 1, 6, 4}, // expected y shape + {3, 4}, // kernel shape + {1, 2}, // strides + {1, 3, 2, 1}, // pads + 0); // count_include_pad +} + +TEST(QLinearPoolTest, AveragePool2D_IncludePadPixel_nhwc) { + RunQLinearAveragePoolNhwcU8( + {1, 1, 5, 7}, // x shape + {1, 1, 6, 4}, // expected y shape + {3, 4}, // kernel shape + {1, 2}, // strides + {1, 3, 2, 1}, // pads + 1); // count_include_pad +} + +TEST(QLinearPoolTest, AveragePool3D_ExcludePadPixel_nhwc) { + RunQLinearAveragePoolNhwcU8( + {1, 1, 5, 7, 9}, // x shape + {1, 1, 6, 4, 3}, // expected y shape + {3, 4, 5}, // kernel shape + {1, 2, 3}, // strides + {1, 3, 2, 2, 1, 2}, // pads + 0); // count_include_pad +} + +TEST(QLinearPoolTest, AveragePool3D_IncludePadPixel_nhwc) { + RunQLinearAveragePoolNhwcU8( + {1, 1, 5, 7, 9}, // x shape + {1, 1, 6, 4, 3}, // expected y shape + {3, 4, 5}, // kernel shape + {1, 2, 3}, // strides + {1, 3, 2, 2, 1, 2}, // pads + 1); // count_include_pad +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/optimizer/nhwc_transformer_test.cc b/onnxruntime/test/optimizer/nhwc_transformer_test.cc index 55d3dd2f4b..f5824de82f 100644 --- a/onnxruntime/test/optimizer/nhwc_transformer_test.cc +++ b/onnxruntime/test/optimizer/nhwc_transformer_test.cc @@ -245,6 +245,51 @@ TEST(NhwcTransformerTests, ConvGlobalAveragePool) { 
TransformerLevel::Level3); } +TEST(NhwcTransformerTests, ConvAveragePool) { + auto build_test_case = [&](ModelTestBuilder& builder) { + auto* input_arg = builder.MakeInput({1, 23, 13, 13}, 0, 31); + auto* conv1_output_arg = builder.MakeIntermediate(); + auto* conv2_output_arg = builder.MakeIntermediate(); + auto* avgpool1_output_arg = builder.MakeIntermediate(); + auto* avgpool2_output_arg = builder.MakeIntermediate(); + auto* output_arg = builder.MakeOutput(); + auto* conv1_weight_arg = NhwcMakeInitializer(builder, {30, 23, 3, 3}); + auto* conv2_weight_arg = NhwcMakeInitializer(builder, {16, 30, 3, 3}); + + Node& conv1_node = builder.AddQLinearConvNode(input_arg, .01f, 135, + conv1_weight_arg, .02f, 126, + conv1_output_arg, .37f, 131); + conv1_node.AddAttribute("pads", std::vector{1, 1, 1, 1}); + Node& avgpool_node1 = builder.AddQLinearActivationNode("QLinearAveragePool", + conv1_output_arg, .37f, 131, + avgpool1_output_arg, .43f, 111); + avgpool_node1.AddAttribute("kernel_shape", std::vector{3, 3}); + avgpool_node1.AddAttribute("pads", std::vector{1, 1, 1, 1}); + + builder.AddQLinearConvNode(avgpool1_output_arg, .43f, 111, + conv2_weight_arg, .015f, 129, + conv2_output_arg, .37f, 131); + Node& avgpool_node2 = builder.AddQLinearActivationNode("QLinearAveragePool", + conv2_output_arg, .37f, 131, + avgpool2_output_arg, .37f, 131); + avgpool_node2.AddAttribute("kernel_shape", std::vector{3, 3}); + avgpool_node2.AddAttribute("pads", std::vector{1, 1, 1, 1}); + + builder.AddDequantizeLinearNode(avgpool2_output_arg, .37f, 131, output_arg); + }; + + auto check_nhwc_graph = [&](InferenceSessionWrapper& session) { + auto op_to_count = CountOpsInGraph(session.GetGraph()); + EXPECT_EQ(op_to_count["com.microsoft.QLinearConv"], 2); + EXPECT_EQ(op_to_count["Transpose"], 2); + }; + + TransformerTester(build_test_case, + check_nhwc_graph, + TransformerLevel::Level2, + TransformerLevel::Level3); +} + TEST(NhwcTransformerTests, ConvSplit) { for (int64_t axis = -4LL; axis < 4; 
axis++) { auto build_test_case = [&, axis](ModelTestBuilder& builder) {