Add nhwc support for QLinearAveragePool operator (#7656)

* Add nhwc support for QLinearAveragePool operator

* Update ContribOperators.md

* Update OperatorKernels.md with cpu,dnnl and cuda enabled.
This commit is contained in:
Zhang Lei 2021-05-13 22:05:30 -07:00 committed by GitHub
parent 37f69fcee5
commit 50c5edcf13
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 741 additions and 197 deletions

View file

@ -1784,6 +1784,8 @@ This version of the operator has been available since version 1 of the 'com.micr
<dd>auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where default value is NOTSET, which means explicit padding is used. SAME_UPPER or SAME_LOWER mean pad the input so that the output spatial size match the input.In case of odd number add the extra padding at the end for SAME_UPPER and at the beginning for SAME_LOWER. VALID mean no padding.</dd>
<dt><tt>ceil_mode</tt> : int</dt>
<dd>Whether to use ceil or floor (default) to compute the output shape.</dd>
<dt><tt>channels_last</tt> : int</dt>
<dd>Works on NHWC layout or not? Default not.</dd>
<dt><tt>count_include_pad</tt> : int</dt>
<dd>Whether include pad pixels when calculating values for the edges. Default is 0, doesn't count include pad.</dd>
<dt><tt>kernel_shape</tt> : list of ints (required)</dt>

View file

@ -687,3 +687,13 @@
|TransposeMatMul|(*in* A:**T**, *in* B:**T**, *out* Y:**T**)|1+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
| |
| |
## Operators implemented by DnnlExecutionProvider
| Op Name | Parameters | OpSet Version | Types Supported |
|---------|------------|---------------|-----------------|
|**Operator Domain:** *ai.onnx.ml*||||
|Gemm|(*in* A:**T**, *in* B:**T**, *in* C:**T**, *out* Y:**T**)|7+|**T** = tensor(float)|
| |
| |

View file

@ -25,22 +25,33 @@ static inline T8Bits quantize_value(float y, float y_scale, T8Bits y_zero_point)
template <>
inline float dequantize_value<uint8_t>(uint8_t x, float x_scale, uint8_t x_zero_point) {
return x_scale * (static_cast<int>(x) - x_zero_point);
return x_scale * (static_cast<int>(x) - x_zero_point);
}
template <>
inline uint8_t quantize_value<uint8_t>(float y, float y_scale, uint8_t y_zero_point) {
return static_cast<uint8_t>(std::max(0.0f, std::min(std::nearbyintf(y / y_scale + y_zero_point), 255.0f)));
return static_cast<uint8_t>(std::max(0.0f, std::min(std::nearbyintf(y / y_scale + y_zero_point), 255.0f)));
}
// Moves the channel dimension of `dims` in place between the channels-first
// ordering (N, C, D1, ..., Dn) and the channels-last ordering (N, D1, ..., Dn, C).
//
//   from_nchw_to_nhwc == true  : element 1 is removed and appended at the back.
//   from_nchw_to_nhwc == false : the last element is removed and re-inserted at
//                                position 1.
//
// Shapes with fewer than three entries are returned unchanged. Such a shape has
// no spatial dimension to move the channel across, and indexing dims[1] or
// calling dims.back() on an empty / one-element vector is undefined behaviour.
// Leaving the shape alone lets the caller's later shape validation report the
// bad rank instead.
static void SwitchDimsNchwNhwc(std::vector<int64_t>& dims, bool from_nchw_to_nhwc) {
  if (dims.size() < 3) {
    return;
  }
  if (from_nchw_to_nhwc) {
    const int64_t channel = dims[1];
    dims.erase(dims.begin() + 1);
    dims.push_back(channel);
  } else {
    const int64_t channel = dims.back();
    dims.pop_back();
    dims.insert(dims.begin() + 1, channel);
  }
}
template <typename T8Bits, typename PoolType>
struct QLinearPool1DTask final {
const float* X_data;
T8Bits* Y_data;
float y_scale;
T8Bits y_zero_point;
int64_t x_step;
int64_t y_step;
int64_t x_image_size;
int64_t y_image_size;
int64_t pooled_height;
int64_t stride_h;
int64_t height;
@ -61,8 +72,8 @@ struct QLinearPool1DTask final {
}
void operator()(std::ptrdiff_t c) const {
const float* x_d = X_data + c * x_step;
T8Bits* y_d = Y_data + c * y_step;
const float* x_d = X_data + c * x_image_size;
T8Bits* y_d = Y_data + c * y_image_size;
for (int64_t ph = 0; ph < pooled_height; ++ph) {
int64_t hstart = ph * stride_h - pads[0];
@ -82,6 +93,67 @@ struct QLinearPool1DTask final {
}
};
// Pooling task for a 1-D spatial input whose channel axis is innermost: the
// fp32 input is read as [batch][h][c] and the quantized output is written as
// [batch][pooled_h][c]. The member order is significant because the struct is
// brace-initialized by its user.
template <typename T8Bits, typename PoolType>
struct QLinearPoolNhwc1DTask final {
  const float* X_data;
  T8Bits* Y_data;
  float y_scale;
  T8Bits y_zero_point;
  int64_t channels;
  int64_t pooled_height;
  int64_t stride_h;
  int64_t height;
  const std::vector<int64_t>& kernel_shape;
  const std::vector<int64_t>& pads;
  const PoolProcessContext& pool_context_;
  const PoolAttributes& pool_attrs_;

  // Cost estimate: channels * kernel_shape[0], reported in all three fields.
  TensorOpCost Cost() {
    const double work = static_cast<double>(channels * kernel_shape[0]);
    return TensorOpCost{work, work, work};
  }

  // Handles the flat range [begin, end) of output positions, where a flat
  // position is batch * pooled_height + ph. The range is cut at batch
  // boundaries and each piece is forwarded to the per-batch overload.
  void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const {
    const int64_t outputs_per_batch = pooled_height;
    int64_t batch_index = begin / outputs_per_batch;
    int64_t first = begin % outputs_per_batch;
    int64_t todo = end - begin;
    while (todo > 0) {
      // Only the first piece can start in the middle of a batch.
      const int64_t chunk = std::min<int64_t>(todo, outputs_per_batch - first);
      (*this)(static_cast<std::ptrdiff_t>(batch_index),
              static_cast<std::ptrdiff_t>(first),
              static_cast<std::ptrdiff_t>(first + chunk));
      todo -= chunk;
      first = 0;
      ++batch_index;
    }
  }

  // Computes output rows [begin, end) of one batch.
  void operator()(std::ptrdiff_t batch, std::ptrdiff_t begin, std::ptrdiff_t end) const {
    const float* batch_input = X_data + batch * height * channels;
    T8Bits* batch_output = Y_data + batch * pooled_height * channels;

    // One accumulator per channel, reset for every output row.
    std::vector<float> acc(channels, PoolType::Initialize());

    for (int64_t ph = begin; ph < end; ++ph) {
      // Window along H, clipped to the valid input range [0, height).
      const int64_t window_origin = ph * stride_h - pads[0];
      const int64_t h_last = std::min(window_origin + kernel_shape[0], height);
      const int64_t h_first = std::max(window_origin, static_cast<int64_t>(0));

      std::fill(acc.begin(), acc.end(), PoolType::Initialize());
      for (int64_t h = h_first; h < h_last; ++h) {
        const float* row = batch_input + h * channels;
        for (int64_t c = 0; c < channels; ++c) {
          PoolType::Process(row[c], acc[c], pool_context_);
        }
      }

      // With count_include_pad the full kernel length is used, otherwise only
      // the number of input rows actually covered by the clipped window.
      const int64_t divisor = pool_attrs_.count_include_pad ? kernel_shape[0] : h_last - h_first;
      T8Bits* out = batch_output + ph * channels;
      for (int64_t c = 0; c < channels; ++c) {
        PoolType::Finalize(divisor, acc[c], pool_context_);
        out[c] = quantize_value(acc[c], y_scale, y_zero_point);
      }
    }
  }
};
template <typename T8Bits, typename PoolType>
struct QLinearPool2DTask final {
@ -89,8 +161,8 @@ struct QLinearPool2DTask final {
T8Bits* Y_data;
float y_scale;
T8Bits y_zero_point;
int64_t x_step;
int64_t y_step;
int64_t x_image_size;
int64_t y_image_size;
int64_t pooled_height;
int64_t pooled_width;
int64_t stride_h;
@ -114,8 +186,8 @@ struct QLinearPool2DTask final {
}
void operator()(std::ptrdiff_t c) const {
const float* x_d = X_data + c * x_step;
T8Bits* y_d = Y_data + c * y_step;
const float* x_d = X_data + c * x_image_size;
T8Bits* y_d = Y_data + c * y_image_size;
for (int64_t ph = 0; ph < pooled_height; ++ph) {
int64_t hstart = ph * stride_h - pads[0];
@ -144,14 +216,105 @@ struct QLinearPool2DTask final {
}
};
template <typename T8Bits, typename PoolType>
struct QLinearPoolNhwc2DTask final {
const float* X_data;
T8Bits* Y_data;
float y_scale;
T8Bits y_zero_point;
int64_t x_image_size;
int64_t y_image_size;
int64_t kernel_size;
int64_t channels;
int64_t pooled_height;
int64_t pooled_width;
int64_t stride_h;
int64_t stride_w;
int64_t height;
int64_t width;
const std::vector<int64_t>& kernel_shape;
const std::vector<int64_t>& pads;
const PoolProcessContext& pool_context_;
const PoolAttributes& pool_attrs_;
TensorOpCost Cost() {
double loop_count = static_cast<double>(channels * kernel_size);
return TensorOpCost{loop_count, loop_count, loop_count};
}
void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const {
int64_t batch = begin / y_image_size;
int64_t offset = begin % y_image_size;
for (int64_t remains = end - begin; remains > 0; offset = 0, batch++) {
if (offset + remains <= y_image_size) {
operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(offset + remains));
remains = 0;
} else {
operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(y_image_size));
remains -= (y_image_size - offset);
}
}
}
void operator()(std::ptrdiff_t batch, std::ptrdiff_t begin, std::ptrdiff_t end) const {
const float* x_d = X_data + batch * x_image_size * channels;
T8Bits* y_d = Y_data + batch * y_image_size * channels;
// Calculate starting pooled_h, pooled_w, pooled_d
int64_t start_pw = begin;
int64_t start_ph = start_pw / pooled_width;
start_pw -= (start_ph * pooled_width);
int64_t pool_index = channels * begin;
int64_t remains = end - begin;
std::vector<float> Yh(channels);
for (int64_t ph = start_ph; remains > 0 && ph < pooled_height; ++ph) {
int64_t hstart = ph * stride_h - pads[0];
int64_t hend = std::min(hstart + kernel_shape[0], height);
hstart = std::max(hstart, static_cast<int64_t>(0));
for (int64_t pw = start_pw; remains > 0 && pw < pooled_width; ++pw) {
int64_t wstart = pw * stride_w - pads[1];
int64_t wend = std::min(wstart + kernel_shape[1], width);
wstart = std::max(wstart, static_cast<int64_t>(0));
// do the pooling here
float pool_init_value = PoolType::Initialize();
std::fill(Yh.data(), Yh.data() + channels, pool_init_value);
for (int64_t h = hstart; h < hend; ++h) {
int64_t input_index = channels * (h * width + wstart);
for (int64_t w = wstart; w < wend; ++w) {
for (int64_t c = 0; c < channels; c++) {
PoolType::Process(x_d[input_index + c], Yh[c], pool_context_);
}
input_index += channels;
}
}
int64_t elements_count = (pool_attrs_.count_include_pad) ? kernel_size : (hend - hstart) * (wend - wstart);
for (int64_t c = 0; c < channels; c++) {
PoolType::Finalize(elements_count, Yh[c], pool_context_);
auto y_value = quantize_value(Yh[c], y_scale, y_zero_point);
y_d[pool_index + c] = y_value;
}
pool_index += channels;
remains--;
}
start_pw = 0;
}
}
};
template <typename T8Bits, typename PoolType>
struct QLinearPool3DTask final {
const float* X_data;
T8Bits* Y_data;
float y_scale;
T8Bits y_zero_point;
int64_t x_step;
int64_t y_step;
int64_t x_image_size;
int64_t y_image_size;
int64_t pooled_height;
int64_t pooled_width;
int64_t pooled_depth;
@ -179,8 +342,8 @@ struct QLinearPool3DTask final {
}
void operator()(std::ptrdiff_t c) const {
const float* x_d = X_data + c * x_step;
T8Bits* y_d = Y_data + c * y_step;
const float* x_d = X_data + c * x_image_size;
T8Bits* y_d = Y_data + c * y_image_size;
for (int64_t ph = 0; ph < pooled_height; ++ph) {
int64_t hstart = ph * stride_h - pads[0];
@ -218,6 +381,110 @@ struct QLinearPool3DTask final {
}
};
// Pooling task for a 3-D spatial input whose channel axis is innermost.
// Input elements are addressed as channels * (h * width * depth + w * depth + d) + c
// relative to the start of a batch, and output elements are written in the
// same channel-innermost order over (pooled_h, pooled_w, pooled_d).
// The struct is an aggregate; the member order below is part of its interface.
template <typename T8Bits, typename PoolType>
struct QLinearPoolNhwc3DTask final {
// fp32 input buffer and quantized output buffer.
const float* X_data;
T8Bits* Y_data;
// Output quantization parameters passed to quantize_value().
float y_scale;
T8Bits y_zero_point;
// Per-batch strides counted in spatial positions; both are multiplied by
// `channels` when stepping from one batch to the next.
int64_t x_image_size;
int64_t y_image_size;
// Divisor used when count_include_pad is set; also feeds the cost estimate.
int64_t kernel_size;
int64_t channels;
int64_t pooled_height;
int64_t pooled_width;
int64_t pooled_depth;
int64_t stride_h;
int64_t stride_w;
int64_t stride_d;
int64_t height;
int64_t width;
int64_t depth;
// kernel_shape[0..2] are the window extents along h, w, d; pads[0..2] are the
// leading pads subtracted from the window start along h, w, d.
const std::vector<int64_t>& kernel_shape;
const std::vector<int64_t>& pads;
const PoolProcessContext& pool_context_;
const PoolAttributes& pool_attrs_;
// Cost estimate: channels * kernel_size, reported in all three fields.
TensorOpCost Cost() {
double loop_count = static_cast<double>(channels * kernel_size);
return TensorOpCost{loop_count, loop_count, loop_count};
}
// Handles the flat range [begin, end) of output positions, where a flat
// position is batch * y_image_size + offset. The range is split at batch
// boundaries and each piece is forwarded to the per-batch overload below.
void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const {
int64_t batch = begin / y_image_size;
int64_t offset = begin % y_image_size;
// Only the first piece can start mid-batch; `offset` is reset to 0 afterwards.
for (int64_t remains = end - begin; remains > 0; offset = 0, batch++) {
if (offset + remains <= y_image_size) {
// The rest of the range fits inside the current batch.
operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(offset + remains));
remains = 0;
} else {
// Finish the current batch and carry the remainder into the next one.
operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(y_image_size));
remains -= (y_image_size - offset);
}
}
}
// Computes output positions [begin, end) of one batch, where a position is
// (ph * pooled_width + pw) * pooled_depth + pd.
void operator()(std::ptrdiff_t batch, std::ptrdiff_t begin, std::ptrdiff_t end) const {
const float* x_d = X_data + batch * x_image_size * channels;
T8Bits* y_d = Y_data + batch * y_image_size * channels;
// Calculate starting pooled_h, pooled_w, pooled_d by peeling the flat
// position apart from the slowest-varying axis (h) to the fastest (d).
int64_t start_pd = begin;
int64_t start_ph = start_pd / (pooled_width * pooled_depth);
start_pd = start_pd - (start_ph * pooled_width * pooled_depth);
int64_t start_pw = start_pd / pooled_depth;
start_pd = start_pd - start_pw * pooled_depth;
// pool_index is the write cursor into y_d; remains counts positions left.
int64_t pool_index = channels * begin;
int64_t remains = end - begin;
// One accumulator per channel, reset for every output position.
std::vector<float> Yh(channels);
for (int64_t ph = start_ph; remains > 0 && ph < pooled_height; ++ph) {
// Window along h, clipped to [0, height).
int64_t hstart = ph * stride_h - pads[0];
int64_t hend = std::min(hstart + kernel_shape[0], height);
hstart = std::max(hstart, static_cast<int64_t>(0));
for (int64_t pw = start_pw; remains > 0 && pw < pooled_width; ++pw) {
// Window along w, clipped to [0, width).
int64_t wstart = pw * stride_w - pads[1];
int64_t wend = std::min(wstart + kernel_shape[1], width);
wstart = std::max(wstart, static_cast<int64_t>(0));
for (int64_t pd = start_pd; remains > 0 && pd < pooled_depth; ++pd) {
// Window along d, clipped to [0, depth).
int64_t dstart = pd * stride_d - pads[2];
int64_t dend = std::min(dstart + kernel_shape[2], depth);
dstart = std::max(dstart, static_cast<int64_t>(0));
// do the pooling here
std::fill(Yh.begin(), Yh.end(), PoolType::Initialize());
for (int64_t h = hstart; h < hend; ++h) {
const int64_t input_index_h = h * width * depth;
for (int64_t w = wstart; w < wend; ++w) {
// Index of channel 0 at (h, w, dstart); advanced by `channels` per d step.
int64_t input_index = channels * (input_index_h + w * depth + dstart);
for (int64_t d = dstart; d < dend; ++d) {
for (int64_t c = 0; c < channels; c++) {
PoolType::Process(x_d[input_index + c], Yh[c], pool_context_);
}
input_index += channels;
}
}
}
// With count_include_pad the full kernel size is used, otherwise only the
// number of input elements covered by the clipped window.
int64_t elements_count = (pool_attrs_.count_include_pad) ? kernel_size : (hend - hstart) * (wend - wstart) * (dend - dstart);
for (int64_t c = 0; c < channels; c++) {
PoolType::Finalize(elements_count, Yh[c], pool_context_);
auto y_value = quantize_value(Yh[c], y_scale, y_zero_point);
y_d[pool_index + c] = y_value;
}
pool_index += channels;
remains--;
}
// Only the first (ph, pw) pair resumes mid-depth; later ones start at d = 0.
start_pd = 0;
}
// Only the first ph resumes mid-row; later ones start at w = 0.
start_pw = 0;
}
}
};
Status QLinearAveragePool::Compute(OpKernelContext* context) const {
const auto tensor_x_scale = context->Input<Tensor>(1);
const auto tensor_x_zero_point = context->Input<Tensor>(2);
@ -236,9 +503,10 @@ Status QLinearAveragePool::Compute(OpKernelContext* context) const {
const auto* X = context->Input<Tensor>(0);
auto dtype = X->GetElementType();
if (dtype != ONNX_NAMESPACE::TensorProto_DataType_UINT8) {
ORT_THROW("Unsupported 'dtype' in QLinear Pooling:", dtype);
ORT_THROW("Unsupported 'dtype' in QLinear Pooling:", dtype);
}
const TensorShape& x_shape = X->Shape();
TensorShape x_shape = X->Shape();
const float x_scale = *(tensor_x_scale->Data<float>());
const float y_scale = *(tensor_y_scale->Data<float>());
uint8_t x_zero_point = (tensor_x_zero_point ? *(tensor_x_zero_point->Data<uint8_t>()) : (uint8_t)0);
@ -249,12 +517,14 @@ Status QLinearAveragePool::Compute(OpKernelContext* context) const {
std::vector<int64_t> strides = pool_attrs_.strides;
std::vector<int64_t> kernel_shape = pool_attrs_.kernel_shape;
if (channels_last_) {
std::vector<int64_t> x_dims = x_shape.GetDims();
SwitchDimsNchwNhwc(x_dims, false);
x_shape = TensorShape(x_dims);
}
std::vector<int64_t> output_dims = pool_attrs_.SetOutputSize(x_shape, x_shape[1], &pads);
Tensor* Y = context->Output(0, output_dims);
const auto* X_data = X->Data<uint8_t>();
auto* Y_data = Y->MutableData<uint8_t>();
int64_t batch_count = x_shape[0];
const int64_t channels = x_shape[1];
const int64_t height = x_shape[2];
const int64_t width = kernel_shape.size() > 1 ? x_shape[3] : 1;
@ -262,9 +532,17 @@ Status QLinearAveragePool::Compute(OpKernelContext* context) const {
const int64_t pooled_height = output_dims[2];
const int64_t pooled_width = kernel_shape.size() > 1 ? output_dims[3] : 1;
const int64_t pooled_depth = kernel_shape.size() > 2 ? output_dims[4] : 1;
const int64_t total_channels = x_shape[0] * channels;
const int64_t x_step = height * width * depth;
const int64_t y_step = pooled_height * pooled_width * pooled_depth;
const int64_t total_channels = batch_count * channels;
const int64_t x_image_size = height * width * depth;
const int64_t y_image_size = pooled_height * pooled_width * pooled_depth;
const int64_t kernel_size = std::accumulate(kernel_shape.begin(), kernel_shape.end(), 1LL, std::multiplies<int64_t>());
if (channels_last_) {
SwitchDimsNchwNhwc(output_dims, true);
}
Tensor* Y = context->Output(0, output_dims);
const auto* X_data = X->Data<uint8_t>();
auto* Y_data = Y->MutableData<uint8_t>();
ThreadPool* tp = context->GetOperatorThreadPool();
std::vector<float> x_data_fp32;
@ -274,42 +552,62 @@ Status QLinearAveragePool::Compute(OpKernelContext* context) const {
const auto* x8 = X_data + first;
float* x32 = x_data_fp32.data() + first;
for (ptrdiff_t i = 0, sz = last - first; i < sz; ++i) {
*x32++ = dequantize_value(x8[i], x_scale, x_zero_point);
*x32++ = dequantize_value(x8[i], x_scale, x_zero_point);
}
});
}
switch (kernel_shape.size()) {
case 1:
{
QLinearPool1DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_1d = {
x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_step, y_step,
pooled_height, strides[0], height, kernel_shape, pads, pool_context_, pool_attrs_};
ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_1d.Cost(), avg_pool_task_1d);
case 1: {
if (channels_last_) {
QLinearPoolNhwc1DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_1d = {
x_data_fp32.data(), Y_data, y_scale, y_zero_point, channels,
pooled_height, strides[0], height, kernel_shape, pads, pool_context_, pool_attrs_};
ThreadPool::TryParallelFor(tp, y_image_size * batch_count, avg_pool_task_1d.Cost(), avg_pool_task_1d);
} else {
QLinearPool1DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_1d = {
x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_image_size, y_image_size,
pooled_height, strides[0], height, kernel_shape, pads, pool_context_, pool_attrs_};
ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_1d.Cost(), avg_pool_task_1d);
}
break;
}
case 2:
{
QLinearPool2DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_2d = {
x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_step, y_step,
pooled_height, pooled_width, strides[0], strides[1], height, width, kernel_shape, pads, pool_context_, pool_attrs_};
ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_2d.Cost(), avg_pool_task_2d);
case 2: {
if (channels_last_) {
QLinearPoolNhwc2DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_2d = {
x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_image_size, y_image_size, kernel_size, channels,
pooled_height, pooled_width, strides[0], strides[1], height, width, kernel_shape, pads, pool_context_, pool_attrs_};
ThreadPool::TryParallelFor(tp, y_image_size * batch_count, avg_pool_task_2d.Cost(), avg_pool_task_2d);
} else {
QLinearPool2DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_2d = {
x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_image_size, y_image_size,
pooled_height, pooled_width, strides[0], strides[1], height, width, kernel_shape, pads, pool_context_, pool_attrs_};
ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_2d.Cost(), avg_pool_task_2d);
}
break;
}
case 3:
{
QLinearPool3DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_3d = {
x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_step, y_step,
pooled_height, pooled_width, pooled_depth, strides[0], strides[1], strides[2], height, width, depth,
kernel_shape, pads, pool_context_, pool_attrs_};
ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_3d.Cost(), avg_pool_task_3d);
case 3: {
if (channels_last_) {
QLinearPoolNhwc3DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_3d = {
x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_image_size, y_image_size, kernel_size, channels,
pooled_height, pooled_width, pooled_depth, strides[0], strides[1], strides[2], height, width, depth,
kernel_shape, pads, pool_context_, pool_attrs_};
ThreadPool::TryParallelFor(tp, y_image_size * batch_count, avg_pool_task_3d.Cost(), avg_pool_task_3d);
} else {
QLinearPool3DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_3d = {
x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_image_size, y_image_size,
pooled_height, pooled_width, pooled_depth, strides[0], strides[1], strides[2], height, width, depth,
kernel_shape, pads, pool_context_, pool_attrs_};
ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_3d.Cost(), avg_pool_task_3d);
}
break;
}
default:
{
default: {
return onnxruntime::common::Status(
onnxruntime::common::ONNXRUNTIME,
onnxruntime::common::INVALID_ARGUMENT,

View file

@ -12,15 +12,17 @@ namespace contrib {
class QLinearAveragePool final : public OpKernel, public PoolBase {
public:
QLinearAveragePool(const OpKernelInfo& info) : OpKernel(info), PoolBase(info) { }
QLinearAveragePool(const OpKernelInfo& info) : OpKernel(info), PoolBase(info) {
channels_last_ = (info.GetAttrOrDefault<int64_t>("channels_last", static_cast<int64_t>(0)) != 0);
}
~QLinearAveragePool() override = default;
Status Compute(OpKernelContext* context) const override;
private:
private:
PoolProcessContext pool_context_;
bool channels_last_;
};
} // namespace contrib

View file

@ -4,6 +4,7 @@
#include "core/framework/tensorprotoutils.h"
#include "core/graph/constants.h"
#include "core/graph/contrib_ops/contrib_defs.h"
#include "core/graph/contrib_ops/quantization_defs.h"
namespace ONNX_NAMESPACE {
void convPoolShapeInference(
@ -18,7 +19,6 @@ using namespace ONNX_NAMESPACE;
namespace onnxruntime {
namespace contrib {
class NhwcInferenceContext : public InferenceContext {
public:
NhwcInferenceContext(InferenceContext& ctx) : ctx_(ctx) {
@ -263,6 +263,156 @@ equal to the spatial dimension of input tensor. Input is of type uint8_t or int8
++image_dim_index;
}
});
const char* QLinearAveragePoolDoc_ver1 = R"DOC(
QLinearAveragePool consumes an input tensor X and applies average pooling across
the tensor according to kernel sizes, stride sizes, and pad lengths.
average pooling consisting of computing the average on all values of a
subset of the input tensor according to the kernel size and downsampling the
data into the output tensor Y for further processing. The output spatial shape will be following:
```
output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - kernel_spatial_shape[i]) / strides_spatial_shape[i] + 1)
```
or
```
output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - kernel_spatial_shape[i]) / strides_spatial_shape[i] + 1)
```
if ceil_mode is enabled
```
* pad_shape[i] is sum of pads along axis i
```
`auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following:
```
VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - kernel_spatial_shape[i] + 1) / strides_spatial_shape[i])
SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i])
```
And pad shape will be following if `SAME_UPPER` or `SAME_LOWER`:
```
pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + kernel_spatial_shape[i] - input_spatial_shape[i]
```
The output of each pooling window is divided by the number of elements (exclude pad when attribute count_include_pad is zero).
Input and output scales and zero points are used to convert the output to a new quantization range.
Output = Dequantize(Input) -> AveragePool on fp32 data -> Quantize(output)
)DOC";
static const char* contrib_ops_pads_doc =
"Padding for the beginning and ending along each spatial axis, it can take any value greater "
"than or equal to 0. The value represent the number of pixels added to the beginning "
"and end part of the corresponding axis. `pads` format should be as follow "
"[x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels "
"added at the beginning of axis `i` and xi_end, the number of pixels added at "
"the end of axis `i`. This attribute cannot be used simultaneously with "
"auto_pad attribute. If not present, the padding defaults to 0 along start and end of each spatial axis.";
static const char* contrib_ops_auto_pad_doc =
"auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where "
"default value is NOTSET, which means explicit padding is used. "
"SAME_UPPER or SAME_LOWER mean pad the input so that the output spatial size match the input."
"In case of odd number add the extra padding at the end for SAME_UPPER and at the "
"beginning for SAME_LOWER. VALID mean no padding.";
ONNX_CONTRIB_OPERATOR_SCHEMA(QLinearAveragePool)
.SetDomain(kMSDomain)
.SinceVersion(1)
.SetDoc(QLinearAveragePoolDoc_ver1)
.Attr(
"count_include_pad",
"Whether include pad pixels when calculating values for the edges. Default is 0, doesn't count include pad.",
AttributeProto::INT,
static_cast<int64_t>(0))
.Attr(
"kernel_shape",
"The size of the kernel along each axis.",
AttributeProto::INTS)
.Attr(
"strides",
"Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.",
AttributeProto::INTS,
OPTIONAL_VALUE)
.Attr(
"auto_pad",
contrib_ops_auto_pad_doc,
AttributeProto::STRING,
std::string("NOTSET"))
.Attr("pads", contrib_ops_pads_doc, AttributeProto::INTS, OPTIONAL_VALUE)
.Attr(
"ceil_mode",
"Whether to use ceil or floor (default) to compute the output shape.",
AttributeProto::INT,
static_cast<int64_t>(0))
.Attr("channels_last", "Works on NHWC layout or not? Default not.", AttributeProto::INT, static_cast<int64_t>(0))
.Input(
0,
"X",
"Input data tensor from the previous operator; "
"dimensions for image case are (N x C x H x W), "
"where N is the batch size, C is the number of "
"channels, and H and W are the height and the "
"width of the data. For non image case, the "
"dimensions are in the form of "
"(N x C x D1 x D2 ... Dn), where N is the batch "
"size. Optionally, if dimension denotation is "
"in effect, the operation expects the input "
"data tensor to arrive with the dimension denotation "
"of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].",
"T")
.Input(
1,
"x_scale",
"Input scale. It's a scalar, which means a per-tensor/layer quantization.",
"tensor(float)")
.Input(
2,
"x_zero_point",
"Input zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
"T",
OpSchema::Optional)
.Input(
3,
"y_scale",
"Output scale. It's a scalar, which means a per-tensor/layer quantization.",
"tensor(float)")
.Input(
4,
"y_zero_point",
"Output zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
"T",
OpSchema::Optional)
.Output(
0,
"Y",
"Output data tensor from average or max pooling across "
"the input tensor. Dimensions will vary based "
"on various kernel, stride, and pad sizes. Floor value of "
"the dimension is used",
"T")
.TypeConstraint(
"T",
{"tensor(uint8)", "tensor(int8)"},
"Constrain input and output types to 8 bit tensors.")
.TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 0, 0);
auto data_type = ctx.getInputType(0);
if (nullptr == data_type || data_type->value_case() != ONNX_NAMESPACE::TypeProto::kTensorType) {
fail_type_inference("inputs are expected to have tensor type.");
}
// validate scale and zero points
ValidateTypeAndShapeForScaleAndZP(ctx, 1, ONNX_NAMESPACE::TensorProto::FLOAT, true);
ValidateTypeAndShapeForScaleAndZP(ctx, 2, data_type->tensor_type().elem_type(), true);
ValidateTypeAndShapeForScaleAndZP(ctx, 3, ONNX_NAMESPACE::TensorProto::FLOAT, true);
ValidateTypeAndShapeForScaleAndZP(ctx, 4, data_type->tensor_type().elem_type(), true);
if (getAttribute(ctx, "channels_last", 0) == 0) {
ONNX_NAMESPACE::convPoolShapeInference(ctx, false, true, 0, 5);
} else {
convPoolShapeInferenceNhwc(ctx, false, true, 0, 5);
}
});
}
} // namespace contrib

View file

@ -1,7 +1,7 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/framework/tensorprotoutils.h"
#include "core/graph/contrib_ops/quantization_defs.h"
#include "core/graph/constants.h"
#include "core/graph/contrib_ops/contrib_defs.h"
@ -28,7 +28,7 @@ using ONNX_NAMESPACE::InferenceContext;
using ONNX_NAMESPACE::OpSchema;
using ONNX_NAMESPACE::OPTIONAL_VALUE;
void ValidateTypeAndShapeForScaleAndZP(ONNX_NAMESPACE::InferenceContext& ctx, int index, ::google::protobuf::int32 expectedType, bool isScalar, int expectedTensorSize = 0) {
void ValidateTypeAndShapeForScaleAndZP(ONNX_NAMESPACE::InferenceContext& ctx, int index, ::google::protobuf::int32 expectedType, bool isScalar, int expectedTensorSize) {
if (ctx.getNumInputs() > static_cast<size_t>(index)) {
auto data_type = ctx.getInputType(index);
if (nullptr == data_type) {
@ -546,151 +546,6 @@ This helps to improve accuracy as after ReduceMean operation the range of the ou
}
});
const char* QLinearAveragePoolDoc_ver1 = R"DOC(
QLinearAveragePool consumes an input tensor X and applies average pooling across
the tensor according to kernel sizes, stride sizes, and pad lengths.
average pooling consisting of computing the average on all values of a
subset of the input tensor according to the kernel size and downsampling the
data into the output tensor Y for further processing. The output spatial shape will be following:
```
output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - kernel_spatial_shape[i]) / strides_spatial_shape[i] + 1)
```
or
```
output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - kernel_spatial_shape[i]) / strides_spatial_shape[i] + 1)
```
if ceil_mode is enabled
```
* pad_shape[i] is sum of pads along axis i
```
`auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following:
```
VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - kernel_spatial_shape[i] + 1) / strides_spatial_shape[i])
SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i])
```
And pad shape will be following if `SAME_UPPER` or `SAME_LOWER`:
```
pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + kernel_spatial_shape[i] - input_spatial_shape[i]
```
The output of each pooling window is divided by the number of elements (exclude pad when attribute count_include_pad is zero).
Input and output scales and zero points are used to convert the output to a new quantization range.
Output = Dequantize(Input) -> AveragePool on fp32 data -> Quantize(output)
)DOC";
static const char* contrib_ops_pads_doc =
"Padding for the beginning and ending along each spatial axis, it can take any value greater "
"than or equal to 0. The value represent the number of pixels added to the beginning "
"and end part of the corresponding axis. `pads` format should be as follow "
"[x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels "
"added at the beginning of axis `i` and xi_end, the number of pixels added at "
"the end of axis `i`. This attribute cannot be used simultaneously with "
"auto_pad attribute. If not present, the padding defaults to 0 along start and end of each spatial axis.";
static const char* contrib_ops_auto_pad_doc =
"auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where "
"default value is NOTSET, which means explicit padding is used. "
"SAME_UPPER or SAME_LOWER mean pad the input so that the output spatial size match the input."
"In case of odd number add the extra padding at the end for SAME_UPPER and at the "
"beginning for SAME_LOWER. VALID mean no padding.";
ONNX_CONTRIB_OPERATOR_SCHEMA(QLinearAveragePool)
.SetDomain(kMSDomain)
.SinceVersion(1)
.SetDoc(QLinearAveragePoolDoc_ver1)
.Attr(
"count_include_pad",
"Whether include pad pixels when calculating values for the edges. Default is 0, doesn't count include pad.",
AttributeProto::INT,
static_cast<int64_t>(0))
.Attr(
"kernel_shape",
"The size of the kernel along each axis.",
AttributeProto::INTS)
.Attr(
"strides",
"Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.",
AttributeProto::INTS,
OPTIONAL_VALUE)
.Attr(
"auto_pad",
contrib_ops_auto_pad_doc,
AttributeProto::STRING,
std::string("NOTSET"))
.Attr("pads", contrib_ops_pads_doc, AttributeProto::INTS, OPTIONAL_VALUE)
.Attr(
"ceil_mode",
"Whether to use ceil or floor (default) to compute the output shape.",
AttributeProto::INT,
static_cast<int64_t>(0))
.Input(
0,
"X",
"Input data tensor from the previous operator; "
"dimensions for image case are (N x C x H x W), "
"where N is the batch size, C is the number of "
"channels, and H and W are the height and the "
"width of the data. For non image case, the "
"dimensions are in the form of "
"(N x C x D1 x D2 ... Dn), where N is the batch "
"size. Optionally, if dimension denotation is "
"in effect, the operation expects the input "
"data tensor to arrive with the dimension denotation "
"of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].",
"T")
.Input(
1,
"x_scale",
"Input scale. It's a scalar, which means a per-tensor/layer quantization.",
"tensor(float)")
.Input(
2,
"x_zero_point",
"Input zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
"T",
OpSchema::Optional)
.Input(
3,
"y_scale",
"Output scale. It's a scalar, which means a per-tensor/layer quantization.",
"tensor(float)")
.Input(
4,
"y_zero_point",
"Output zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
"T",
OpSchema::Optional)
.Output(
0,
"Y",
"Output data tensor from average or max pooling across "
"the input tensor. Dimensions will vary based "
"on various kernel, stride, and pad sizes. Floor value of "
"the dimension is used",
"T")
.TypeConstraint(
"T",
{"tensor(uint8)", "tensor(int8)"},
"Constrain input and output types to 8 bit tensors.")
.TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 0, 0);
auto data_type = ctx.getInputType(0);
if (nullptr == data_type || data_type->value_case() != ONNX_NAMESPACE::TypeProto::kTensorType) {
fail_type_inference("inputs are expected to have tensor type.");
}
// validate scale and zero points
ValidateTypeAndShapeForScaleAndZP(ctx, 1, ONNX_NAMESPACE::TensorProto::FLOAT, true);
ValidateTypeAndShapeForScaleAndZP(ctx, 2, data_type->tensor_type().elem_type(), true);
ValidateTypeAndShapeForScaleAndZP(ctx, 3, ONNX_NAMESPACE::TensorProto::FLOAT, true);
ValidateTypeAndShapeForScaleAndZP(ctx, 4, data_type->tensor_type().elem_type(), true);
ONNX_NAMESPACE::convPoolShapeInference(ctx, false, true, 0, 5);
});
const char* QLinearLeakyReluDoc_ver1 = R"DOC(
QLinearLeakyRelu takes quantized input data (Tensor), an argument alpha, and quantize parameter for output,
and produces one output data (Tensor<T>) where the function `f(x) = quantize(alpha * dequantize(x)) for dequantize(x) < 0`,

View file

@ -0,0 +1,19 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/graph/onnx_protobuf.h"
#include "core/framework/tensorprotoutils.h"
namespace onnxruntime {
namespace contrib {
// Shape-inference helper used by quantized contrib-op schemas to validate their
// scale and zero-point inputs. Only the declaration lives in this header.
//
// Parameters:
//   ctx                 - inference context of the node being checked.
//   index               - position of the input to validate.
//   expectedType        - element type the input is expected to have; visible call
//                         sites pass an ONNX_NAMESPACE::TensorProto data-type value
//                         (e.g. FLOAT for scales, the data tensor's elem_type for
//                         zero points).
//   isScalar            - NOTE(review): presumably requires the input to be a scalar
//                         when true; confirm against the definition.
//   expectedTensorSize  - defaults to 0. NOTE(review): presumably the required element
//                         count when isScalar is false; confirm against the definition.
void ValidateTypeAndShapeForScaleAndZP(
ONNX_NAMESPACE::InferenceContext& ctx,
int index,
::google::protobuf::int32 expectedType,
bool isScalar,
int expectedTensorSize = 0);
}
} // namespace onnxruntime

View file

@ -427,7 +427,8 @@ void NhwcTransformerImpl::Transform(Node& node) {
} else if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearLeakyRelu", {1}, kMSDomain) ||
graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearSigmoid", {1}, kMSDomain)) {
TransformQLinearActivation(node);
} else if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearGlobalAveragePool", {1}, kMSDomain)) {
} else if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearGlobalAveragePool", {1}, kMSDomain) ||
graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearAveragePool", {1}, kMSDomain)) {
TransformQLinearGlobalAveragePool(node);
} else if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearConcat", {1}, kMSDomain)) {
TransformQLinearConcat(node);

View file

@ -192,6 +192,105 @@ void RunQLinearAveragePoolNchwU8(
run_test(true /* only_x_not_initializer */, true /* x_y_same_zero_point */);
}
// Converts a channels-first shape (N, C, D1, ..., Dk) into the matching
// channels-last shape (N, D1, ..., Dk, C).
//
// Shapes with fewer than two dimensions have no channel axis to move and are
// returned unchanged (the previous implementation indexed nchw[1] and erased
// begin() + 1 unconditionally, which is undefined behaviour for such input).
static std::vector<int64_t> dims_to_nhwc(const std::vector<int64_t>& nchw) {
  if (nchw.size() < 2) {
    return nchw;
  }
  std::vector<int64_t> nhwc;
  nhwc.reserve(nchw.size());
  nhwc.push_back(nchw[0]);                                // batch stays first
  nhwc.insert(nhwc.end(), nchw.begin() + 2, nchw.end());  // spatial dims keep their order
  nhwc.push_back(nchw[1]);                                // channel moves to the end
  return nhwc;
}
// Re-lays out a channels-first buffer (N, C, D1, ..., Dk) as channels-last
// (N, D1, ..., Dk, C) and returns the result as a new vector of the same size.
//
// nchw_data - source elements in channels-first order.
// nchw_dims - shape of nchw_data; must contain at least the N and C dimensions.
//
// Fix: each batch occupies channels * image_size elements, so the per-batch
// offset must include the channel count. The previous offset (b * image_size)
// made consecutive batches overlap whenever N > 1 and C > 1.
static std::vector<uint8_t> transpose_to_nhwc(const std::vector<uint8_t>& nchw_data, const std::vector<int64_t>& nchw_dims) {
  std::vector<uint8_t> nhwc_data(nchw_data.size());
  const int64_t batch_count = nchw_dims[0];
  const int64_t channels = nchw_dims[1];
  // Number of spatial positions per channel plane.
  const int64_t image_size = std::accumulate(nchw_dims.begin() + 2, nchw_dims.end(), 1LL, std::multiplies<int64_t>());
  const int64_t batch_stride = channels * image_size;
  for (int64_t b = 0; b < batch_count; b++) {
    const uint8_t* nchw_image = nchw_data.data() + (b * batch_stride);
    uint8_t* nhwc_image = nhwc_data.data() + (b * batch_stride);
    // Output is written sequentially: for every spatial position, all channels.
    for (int64_t img_index = 0; img_index < image_size; ++img_index) {
      for (int64_t c = 0; c < channels; c++) {
        *nhwc_image++ = nchw_image[c * image_size + img_index];
      }
    }
  }
  return nhwc_data;
}
void RunQLinearAveragePoolNhwcU8(
const std::vector<int64_t> x_dims,
const std::vector<int64_t> y_dims,
const std::vector<int64_t> kernel_shape,
const std::vector<int64_t> strides,
const std::vector<int64_t> pads,
const int64_t count_include_pad = 0) {
float x_scale = 1.0f / 255.0f;
uint8_t x_zero_point = 128;
RandomValueGenerator random{};
std::vector<float> x_data_fp32 = random.Uniform<float>(x_dims, -0.5f, 0.5f);
std::vector<uint8_t> x_data(x_data_fp32.size());
for (size_t i = 0; i < x_data.size(); ++i) {
x_data[i] = quantize_u8(x_data_fp32[i], x_scale, x_zero_point);
}
float y_scale = 1.0f / 255.0f;
uint8_t y_zero_point = 100;
int64_t y_size = std::accumulate(y_dims.begin(), y_dims.end(), 1LL, std::multiplies<int64_t>());
std::vector<uint8_t> y_data(y_size);
CalculateAvgPoolNchwU8(
x_data.data(), x_dims, x_scale, x_zero_point,
y_data.data(), y_dims, y_scale, y_zero_point,
kernel_shape, strides, pads, count_include_pad);
// transpose the result
std::vector<uint8_t> y_data_nhwc = transpose_to_nhwc(y_data, y_dims);
std::vector<uint8_t> x_data_nhwc = transpose_to_nhwc(x_data, x_dims);
auto x_dims_nhwc = dims_to_nhwc(x_dims);
auto y_dims_nhwc = dims_to_nhwc(y_dims);
OpTester test("QLinearAveragePool", 1, onnxruntime::kMSDomain);
test.AddAttribute("auto_pad", "");
test.AddAttribute("strides", strides);
test.AddAttribute("pads", pads);
test.AddAttribute("kernel_shape", kernel_shape);
test.AddAttribute("count_include_pad", count_include_pad);
test.AddAttribute("channels_last", (int64_t)1LL);
test.AddInput<uint8_t>("X", x_dims_nhwc, x_data_nhwc);
test.AddInput<float>("x_scale", {}, {x_scale});
test.AddInput<uint8_t>("x_zero_point", {}, {x_zero_point});
test.AddInput<float>("y_scale", {}, {y_scale});
test.AddInput<uint8_t>("y_zero_point", {}, {y_zero_point});
test.AddOutput<uint8_t>("Y", y_dims_nhwc, y_data_nhwc);
auto q8checker = [&](const std::vector<OrtValue>& fetches, const std::string& provider_type) {
const OrtValue& ort_value = fetches[0];
if (ort_value.Fence()) {
ort_value.Fence()->BeforeUsingAsInput(onnxruntime::kCpuExecutionProvider, 0);
}
auto y_shape = TensorShape(y_dims_nhwc);
const Tensor& output_tensor = ort_value.Get<Tensor>();
ORT_ENFORCE(y_shape == output_tensor.Shape(),
"Expected output shape [" + y_shape.ToString() + "] did not match run output shape [" +
output_tensor.Shape().ToString() + "] for Y @" + provider_type);
auto* output = output_tensor.Data<uint8_t>();
auto size = static_cast<int>(output_tensor.Shape().Size());
for (int i = 0; i < size; ++i) {
int diff = abs(y_data_nhwc[i] - output[i]);
EXPECT_LE(diff, 1) << "i:" << i << " expected:" << y_data_nhwc[i] << " " << (int)y_data_nhwc[i]
<< ", got:" << output[i] << " " << (int)output[i] << ", provider_type: " << provider_type;
}
};
test.SetCustomOutputVerifier(q8checker);
static std::unordered_set<std::string> excluded_providers = {kNnapiExecutionProvider};
test.Run(OpTester::ExpectResult::kExpectSuccess, "", excluded_providers);
}
TEST(QLinearPoolTest, AveragePool1D_ExcludePadPixel) {
RunQLinearAveragePoolNchwU8(
{1, 1, 5}, // x shape
@ -252,5 +351,68 @@ TEST(QLinearPoolTest, AveragePool3D_IncludePadPixel) {
1); // count_include_pad
}
/*************************************************
* Channels last test
**************************************************/
// 1-D channels-last average pool; pad pixels are not counted (count_include_pad = 0).
// Shapes are given in channels-first order and converted by the helper.
// Note: with N = 1 and C = 1 the NCHW and NHWC buffers hold the same element order,
// so this case does not exercise channel interleaving.
TEST(QLinearPoolTest, AveragePool1D_ExcludePadPixel_nhwc) {
  const std::vector<int64_t> x_shape{1, 1, 5};
  const std::vector<int64_t> expected_y_shape{1, 1, 6};
  const std::vector<int64_t> kernel_shape{3};
  const std::vector<int64_t> strides{1};
  const std::vector<int64_t> pads{1, 2};
  const int64_t count_include_pad = 0;
  RunQLinearAveragePoolNhwcU8(x_shape, expected_y_shape, kernel_shape, strides, pads, count_include_pad);
}
// 1-D channels-last average pool; pad pixels are counted (count_include_pad = 1).
// Shapes are given in channels-first order and converted by the helper.
// Note: with N = 1 and C = 1 the NCHW and NHWC buffers hold the same element order,
// so this case does not exercise channel interleaving.
TEST(QLinearPoolTest, AveragePool1D_IncludePadPixel_nhwc) {
  const std::vector<int64_t> x_shape{1, 1, 5};
  const std::vector<int64_t> expected_y_shape{1, 1, 6};
  const std::vector<int64_t> kernel_shape{3};
  const std::vector<int64_t> strides{1};
  const std::vector<int64_t> pads{1, 2};
  const int64_t count_include_pad = 1;
  RunQLinearAveragePoolNhwcU8(x_shape, expected_y_shape, kernel_shape, strides, pads, count_include_pad);
}
// 2-D channels-last average pool; pad pixels are not counted (count_include_pad = 0).
// Shapes are given in channels-first order and converted by the helper.
// Note: with N = 1 and C = 1 the NCHW and NHWC buffers hold the same element order,
// so this case does not exercise channel interleaving.
TEST(QLinearPoolTest, AveragePool2D_ExcludePadPixel_nhwc) {
  const std::vector<int64_t> x_shape{1, 1, 5, 7};
  const std::vector<int64_t> expected_y_shape{1, 1, 6, 4};
  const std::vector<int64_t> kernel_shape{3, 4};
  const std::vector<int64_t> strides{1, 2};
  const std::vector<int64_t> pads{1, 3, 2, 1};
  const int64_t count_include_pad = 0;
  RunQLinearAveragePoolNhwcU8(x_shape, expected_y_shape, kernel_shape, strides, pads, count_include_pad);
}
// 2-D channels-last average pool; pad pixels are counted (count_include_pad = 1).
// Shapes are given in channels-first order and converted by the helper.
// Note: with N = 1 and C = 1 the NCHW and NHWC buffers hold the same element order,
// so this case does not exercise channel interleaving.
TEST(QLinearPoolTest, AveragePool2D_IncludePadPixel_nhwc) {
  const std::vector<int64_t> x_shape{1, 1, 5, 7};
  const std::vector<int64_t> expected_y_shape{1, 1, 6, 4};
  const std::vector<int64_t> kernel_shape{3, 4};
  const std::vector<int64_t> strides{1, 2};
  const std::vector<int64_t> pads{1, 3, 2, 1};
  const int64_t count_include_pad = 1;
  RunQLinearAveragePoolNhwcU8(x_shape, expected_y_shape, kernel_shape, strides, pads, count_include_pad);
}
// 3-D channels-last average pool; pad pixels are not counted (count_include_pad = 0).
// Shapes are given in channels-first order and converted by the helper.
// Note: with N = 1 and C = 1 the NCHW and NHWC buffers hold the same element order,
// so this case does not exercise channel interleaving.
TEST(QLinearPoolTest, AveragePool3D_ExcludePadPixel_nhwc) {
  const std::vector<int64_t> x_shape{1, 1, 5, 7, 9};
  const std::vector<int64_t> expected_y_shape{1, 1, 6, 4, 3};
  const std::vector<int64_t> kernel_shape{3, 4, 5};
  const std::vector<int64_t> strides{1, 2, 3};
  const std::vector<int64_t> pads{1, 3, 2, 2, 1, 2};
  const int64_t count_include_pad = 0;
  RunQLinearAveragePoolNhwcU8(x_shape, expected_y_shape, kernel_shape, strides, pads, count_include_pad);
}
// 3-D channels-last average pool; pad pixels are counted (count_include_pad = 1).
// Shapes are given in channels-first order and converted by the helper.
// Note: with N = 1 and C = 1 the NCHW and NHWC buffers hold the same element order,
// so this case does not exercise channel interleaving.
TEST(QLinearPoolTest, AveragePool3D_IncludePadPixel_nhwc) {
  const std::vector<int64_t> x_shape{1, 1, 5, 7, 9};
  const std::vector<int64_t> expected_y_shape{1, 1, 6, 4, 3};
  const std::vector<int64_t> kernel_shape{3, 4, 5};
  const std::vector<int64_t> strides{1, 2, 3};
  const std::vector<int64_t> pads{1, 3, 2, 2, 1, 2};
  const int64_t count_include_pad = 1;
  RunQLinearAveragePoolNhwcU8(x_shape, expected_y_shape, kernel_shape, strides, pads, count_include_pad);
}
} // namespace test
} // namespace onnxruntime

View file

@ -245,6 +245,51 @@ TEST(NhwcTransformerTests, ConvGlobalAveragePool) {
TransformerLevel::Level3);
}
// Checks the NHWC transformer on a graph that interleaves QLinearConv with
// QLinearAveragePool:
//   input -> QLinearConv -> QLinearAveragePool -> QLinearConv -> QLinearAveragePool
//         -> DequantizeLinear -> output
// After optimization the graph must still hold two QLinearConv nodes and exactly
// two Transpose nodes.
TEST(NhwcTransformerTests, ConvAveragePool) {
// Builds the unoptimized model. Statement order is kept as-is because it fixes the
// order in which args and nodes are created.
auto build_test_case = [&](ModelTestBuilder& builder) {
// uint8 input of shape {1, 23, 13, 13}; NOTE(review): the trailing 0, 31 look like
// the value range for generated data - confirm against ModelTestBuilder::MakeInput.
auto* input_arg = builder.MakeInput<uint8_t>({1, 23, 13, 13}, 0, 31);
auto* conv1_output_arg = builder.MakeIntermediate();
auto* conv2_output_arg = builder.MakeIntermediate();
auto* avgpool1_output_arg = builder.MakeIntermediate();
auto* avgpool2_output_arg = builder.MakeIntermediate();
auto* output_arg = builder.MakeOutput();
// Convolution weights: 23 -> 30 channels, then 30 -> 16 channels, both 3x3.
auto* conv1_weight_arg = NhwcMakeInitializer<uint8_t>(builder, {30, 23, 3, 3});
auto* conv2_weight_arg = NhwcMakeInitializer<uint8_t>(builder, {16, 30, 3, 3});
// First convolution; the only one that gets an explicit "pads" attribute.
Node& conv1_node = builder.AddQLinearConvNode<uint8_t>(input_arg, .01f, 135,
conv1_weight_arg, .02f, 126,
conv1_output_arg, .37f, 131);
conv1_node.AddAttribute("pads", std::vector<int64_t>{1, 1, 1, 1});
// The QLinearAveragePool nodes are created through the activation-node helper with
// the op type overridden. NOTE(review): presumably this works because both op kinds
// take the same (X, x_scale, x_zero_point, y_scale, y_zero_point) inputs - confirm.
Node& avgpool_node1 = builder.AddQLinearActivationNode("QLinearAveragePool",
conv1_output_arg, .37f, 131,
avgpool1_output_arg, .43f, 111);
avgpool_node1.AddAttribute("kernel_shape", std::vector<int64_t>{3, 3});
avgpool_node1.AddAttribute("pads", std::vector<int64_t>{1, 1, 1, 1});
// Second convolution consumes the first pool's output; no attributes are set on it here.
builder.AddQLinearConvNode<uint8_t>(avgpool1_output_arg, .43f, 111,
conv2_weight_arg, .015f, 129,
conv2_output_arg, .37f, 131);
// Second pool keeps the same scale / zero point on input and output (.37f, 131).
Node& avgpool_node2 = builder.AddQLinearActivationNode("QLinearAveragePool",
conv2_output_arg, .37f, 131,
avgpool2_output_arg, .37f, 131);
avgpool_node2.AddAttribute("kernel_shape", std::vector<int64_t>{3, 3});
avgpool_node2.AddAttribute("pads", std::vector<int64_t>{1, 1, 1, 1});
builder.AddDequantizeLinearNode<uint8_t>(avgpool2_output_arg, .37f, 131, output_arg);
};
// Verifies the optimized graph by op counts only.
// NOTE(review): two Transposes presumably means one entering and one leaving a single
// NHWC region, i.e. no layout round-trip around the pools - confirm. The
// QLinearAveragePool nodes themselves (count, channels_last attribute) are not asserted.
auto check_nhwc_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
EXPECT_EQ(op_to_count["com.microsoft.QLinearConv"], 2);
EXPECT_EQ(op_to_count["Transpose"], 2);
};
// NOTE(review): Level2 / Level3 are presumably the baseline and target optimization
// levels compared by TransformerTester - confirm against its declaration.
TransformerTester(build_test_case,
check_nhwc_graph,
TransformerLevel::Level2,
TransformerLevel::Level3);
}
TEST(NhwcTransformerTests, ConvSplit) {
for (int64_t axis = -4LL; axis < 4; axis++) {
auto build_test_case = [&, axis](ModelTestBuilder& builder) {