mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-07-03 03:58:54 +00:00
Add nhwc support for QLinearAveragePool operator (#7656)
* Add nhwc support for QLinearAveragePool operator * Update ContribOperators.md * Update OperatorKernels.md with cpu,dnnl and cuda enabled.
This commit is contained in:
parent
37f69fcee5
commit
50c5edcf13
10 changed files with 741 additions and 197 deletions
|
|
@ -1784,6 +1784,8 @@ This version of the operator has been available since version 1 of the 'com.micr
|
|||
<dd>auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where default value is NOTSET, which means explicit padding is used. SAME_UPPER or SAME_LOWER mean pad the input so that the output spatial size match the input.In case of odd number add the extra padding at the end for SAME_UPPER and at the beginning for SAME_LOWER. VALID mean no padding.</dd>
|
||||
<dt><tt>ceil_mode</tt> : int</dt>
|
||||
<dd>Whether to use ceil or floor (default) to compute the output shape.</dd>
|
||||
<dt><tt>channels_last</tt> : int</dt>
|
||||
<dd>Works on NHWC layout or not? Default not.</dd>
|
||||
<dt><tt>count_include_pad</tt> : int</dt>
|
||||
<dd>Whether include pad pixels when calculating values for the edges. Default is 0, doesn't count include pad.</dd>
|
||||
<dt><tt>kernel_shape</tt> : list of ints (required)</dt>
|
||||
|
|
|
|||
|
|
@ -687,3 +687,13 @@
|
|||
|TransposeMatMul|(*in* A:**T**, *in* B:**T**, *out* Y:**T**)|1+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
|
||||
| |
|
||||
| |
|
||||
|
||||
|
||||
## Operators implemented by DnnlExecutionProvider
|
||||
|
||||
| Op Name | Parameters | OpSet Version | Types Supported |
|
||||
|---------|------------|---------------|-----------------|
|
||||
|**Operator Domain:** *ai.onnx.ml*||||
|
||||
|Gemm|(*in* A:**T**, *in* B:**T**, *in* C:**T**, *out* Y:**T**)|7+|**T** = tensor(float)|
|
||||
| |
|
||||
| |
|
||||
|
|
|
|||
|
|
@ -25,22 +25,33 @@ static inline T8Bits quantize_value(float y, float y_scale, T8Bits y_zero_point)
|
|||
|
||||
template <>
|
||||
inline float dequantize_value<uint8_t>(uint8_t x, float x_scale, uint8_t x_zero_point) {
|
||||
return x_scale * (static_cast<int>(x) - x_zero_point);
|
||||
return x_scale * (static_cast<int>(x) - x_zero_point);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline uint8_t quantize_value<uint8_t>(float y, float y_scale, uint8_t y_zero_point) {
|
||||
return static_cast<uint8_t>(std::max(0.0f, std::min(std::nearbyintf(y / y_scale + y_zero_point), 255.0f)));
|
||||
return static_cast<uint8_t>(std::max(0.0f, std::min(std::nearbyintf(y / y_scale + y_zero_point), 255.0f)));
|
||||
}
|
||||
|
||||
// Reorders a dimension list in place between channels-first (N, C, D1, ..., Dn)
// and channels-last (N, D1, ..., Dn, C) layouts.
//
// dims              : dimension list to reorder; modified in place.
// from_nchw_to_nhwc : true moves the entry at index 1 to the back,
//                     false moves the last entry to index 1.
//
// Lists with fewer than three entries are left untouched: with two entries the
// move is a no-op, and with fewer there is no channel entry to move (indexing
// dims[1] or calling back() on such a list would be undefined behaviour).
static void SwitchDimsNchwNhwc(std::vector<int64_t>& dims, bool from_nchw_to_nhwc) {
  if (dims.size() < 3) {
    return;
  }

  const auto first_after_batch = dims.begin() + 1;
  if (from_nchw_to_nhwc) {
    // [N, C, D1..Dn] -> [N, D1..Dn, C]: rotate left by one, skipping the batch entry.
    std::rotate(first_after_batch, first_after_batch + 1, dims.end());
  } else {
    // [N, D1..Dn, C] -> [N, C, D1..Dn]: rotate right by one, skipping the batch entry.
    std::rotate(first_after_batch, dims.end() - 1, dims.end());
  }
}
|
||||
template <typename T8Bits, typename PoolType>
|
||||
struct QLinearPool1DTask final {
|
||||
const float* X_data;
|
||||
T8Bits* Y_data;
|
||||
float y_scale;
|
||||
T8Bits y_zero_point;
|
||||
int64_t x_step;
|
||||
int64_t y_step;
|
||||
int64_t x_image_size;
|
||||
int64_t y_image_size;
|
||||
int64_t pooled_height;
|
||||
int64_t stride_h;
|
||||
int64_t height;
|
||||
|
|
@ -61,8 +72,8 @@ struct QLinearPool1DTask final {
|
|||
}
|
||||
|
||||
void operator()(std::ptrdiff_t c) const {
|
||||
const float* x_d = X_data + c * x_step;
|
||||
T8Bits* y_d = Y_data + c * y_step;
|
||||
const float* x_d = X_data + c * x_image_size;
|
||||
T8Bits* y_d = Y_data + c * y_image_size;
|
||||
|
||||
for (int64_t ph = 0; ph < pooled_height; ++ph) {
|
||||
int64_t hstart = ph * stride_h - pads[0];
|
||||
|
|
@ -82,6 +93,67 @@ struct QLinearPool1DTask final {
|
|||
}
|
||||
};
|
||||
|
||||
template <typename T8Bits, typename PoolType>
|
||||
struct QLinearPoolNhwc1DTask final {
|
||||
const float* X_data;
|
||||
T8Bits* Y_data;
|
||||
float y_scale;
|
||||
T8Bits y_zero_point;
|
||||
int64_t channels;
|
||||
int64_t pooled_height;
|
||||
int64_t stride_h;
|
||||
int64_t height;
|
||||
const std::vector<int64_t>& kernel_shape;
|
||||
const std::vector<int64_t>& pads;
|
||||
const PoolProcessContext& pool_context_;
|
||||
const PoolAttributes& pool_attrs_;
|
||||
|
||||
TensorOpCost Cost() {
|
||||
double loop_count = static_cast<double>(channels * kernel_shape[0]);
|
||||
return TensorOpCost{loop_count, loop_count, loop_count};
|
||||
}
|
||||
|
||||
void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const {
|
||||
int64_t y_image_size = pooled_height;
|
||||
int64_t batch = begin / y_image_size;
|
||||
int64_t offset = begin % y_image_size;
|
||||
|
||||
for (int64_t remains = end - begin; remains > 0; offset = 0, batch++) {
|
||||
if (offset + remains <= y_image_size) {
|
||||
operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(offset + remains));
|
||||
remains = 0;
|
||||
} else {
|
||||
operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(y_image_size));
|
||||
remains -= (y_image_size - offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void operator()(std::ptrdiff_t batch, std::ptrdiff_t begin, std::ptrdiff_t end) const {
|
||||
const float* x_d = X_data + batch * height * channels;
|
||||
T8Bits* y_d = Y_data + batch * pooled_height * channels;
|
||||
std::vector<float> Yh(channels, PoolType::Initialize());
|
||||
|
||||
for (int64_t ph = begin, phc = begin * channels; ph < end; ++ph, phc += channels) {
|
||||
int64_t hstart = ph * stride_h - pads[0];
|
||||
int64_t hend = std::min(hstart + kernel_shape[0], height);
|
||||
hstart = std::max(hstart, static_cast<int64_t>(0));
|
||||
|
||||
std::fill(Yh.begin(), Yh.end(), PoolType::Initialize());
|
||||
for (int64_t h = hstart, hc = hstart * channels; h < hend; ++h, hc += channels) {
|
||||
for (int64_t c = 0; c < channels; ++c) {
|
||||
PoolType::Process(x_d[hc + c], Yh[c], pool_context_);
|
||||
}
|
||||
}
|
||||
|
||||
int64_t element_count = (pool_attrs_.count_include_pad) ? kernel_shape[0] : hend - hstart;
|
||||
for (int64_t c = 0; c < channels; ++c) {
|
||||
PoolType::Finalize(element_count, Yh[c], pool_context_);
|
||||
y_d[phc + c] = quantize_value(Yh[c], y_scale, y_zero_point);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T8Bits, typename PoolType>
|
||||
struct QLinearPool2DTask final {
|
||||
|
|
@ -89,8 +161,8 @@ struct QLinearPool2DTask final {
|
|||
T8Bits* Y_data;
|
||||
float y_scale;
|
||||
T8Bits y_zero_point;
|
||||
int64_t x_step;
|
||||
int64_t y_step;
|
||||
int64_t x_image_size;
|
||||
int64_t y_image_size;
|
||||
int64_t pooled_height;
|
||||
int64_t pooled_width;
|
||||
int64_t stride_h;
|
||||
|
|
@ -114,8 +186,8 @@ struct QLinearPool2DTask final {
|
|||
}
|
||||
|
||||
void operator()(std::ptrdiff_t c) const {
|
||||
const float* x_d = X_data + c * x_step;
|
||||
T8Bits* y_d = Y_data + c * y_step;
|
||||
const float* x_d = X_data + c * x_image_size;
|
||||
T8Bits* y_d = Y_data + c * y_image_size;
|
||||
|
||||
for (int64_t ph = 0; ph < pooled_height; ++ph) {
|
||||
int64_t hstart = ph * stride_h - pads[0];
|
||||
|
|
@ -144,14 +216,105 @@ struct QLinearPool2DTask final {
|
|||
}
|
||||
};
|
||||
|
||||
template <typename T8Bits, typename PoolType>
|
||||
struct QLinearPoolNhwc2DTask final {
|
||||
const float* X_data;
|
||||
T8Bits* Y_data;
|
||||
float y_scale;
|
||||
T8Bits y_zero_point;
|
||||
int64_t x_image_size;
|
||||
int64_t y_image_size;
|
||||
int64_t kernel_size;
|
||||
int64_t channels;
|
||||
int64_t pooled_height;
|
||||
int64_t pooled_width;
|
||||
int64_t stride_h;
|
||||
int64_t stride_w;
|
||||
int64_t height;
|
||||
int64_t width;
|
||||
const std::vector<int64_t>& kernel_shape;
|
||||
const std::vector<int64_t>& pads;
|
||||
const PoolProcessContext& pool_context_;
|
||||
const PoolAttributes& pool_attrs_;
|
||||
|
||||
TensorOpCost Cost() {
|
||||
double loop_count = static_cast<double>(channels * kernel_size);
|
||||
return TensorOpCost{loop_count, loop_count, loop_count};
|
||||
}
|
||||
|
||||
void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const {
|
||||
int64_t batch = begin / y_image_size;
|
||||
int64_t offset = begin % y_image_size;
|
||||
|
||||
for (int64_t remains = end - begin; remains > 0; offset = 0, batch++) {
|
||||
if (offset + remains <= y_image_size) {
|
||||
operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(offset + remains));
|
||||
remains = 0;
|
||||
} else {
|
||||
operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(y_image_size));
|
||||
remains -= (y_image_size - offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void operator()(std::ptrdiff_t batch, std::ptrdiff_t begin, std::ptrdiff_t end) const {
|
||||
const float* x_d = X_data + batch * x_image_size * channels;
|
||||
T8Bits* y_d = Y_data + batch * y_image_size * channels;
|
||||
|
||||
// Calculate starting pooled_h, pooled_w, pooled_d
|
||||
int64_t start_pw = begin;
|
||||
int64_t start_ph = start_pw / pooled_width;
|
||||
start_pw -= (start_ph * pooled_width);
|
||||
|
||||
int64_t pool_index = channels * begin;
|
||||
int64_t remains = end - begin;
|
||||
std::vector<float> Yh(channels);
|
||||
|
||||
for (int64_t ph = start_ph; remains > 0 && ph < pooled_height; ++ph) {
|
||||
int64_t hstart = ph * stride_h - pads[0];
|
||||
int64_t hend = std::min(hstart + kernel_shape[0], height);
|
||||
hstart = std::max(hstart, static_cast<int64_t>(0));
|
||||
for (int64_t pw = start_pw; remains > 0 && pw < pooled_width; ++pw) {
|
||||
int64_t wstart = pw * stride_w - pads[1];
|
||||
int64_t wend = std::min(wstart + kernel_shape[1], width);
|
||||
wstart = std::max(wstart, static_cast<int64_t>(0));
|
||||
|
||||
// do the pooling here
|
||||
float pool_init_value = PoolType::Initialize();
|
||||
std::fill(Yh.data(), Yh.data() + channels, pool_init_value);
|
||||
for (int64_t h = hstart; h < hend; ++h) {
|
||||
int64_t input_index = channels * (h * width + wstart);
|
||||
for (int64_t w = wstart; w < wend; ++w) {
|
||||
for (int64_t c = 0; c < channels; c++) {
|
||||
PoolType::Process(x_d[input_index + c], Yh[c], pool_context_);
|
||||
}
|
||||
input_index += channels;
|
||||
}
|
||||
}
|
||||
|
||||
int64_t elements_count = (pool_attrs_.count_include_pad) ? kernel_size : (hend - hstart) * (wend - wstart);
|
||||
for (int64_t c = 0; c < channels; c++) {
|
||||
PoolType::Finalize(elements_count, Yh[c], pool_context_);
|
||||
auto y_value = quantize_value(Yh[c], y_scale, y_zero_point);
|
||||
y_d[pool_index + c] = y_value;
|
||||
}
|
||||
|
||||
pool_index += channels;
|
||||
remains--;
|
||||
}
|
||||
start_pw = 0;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T8Bits, typename PoolType>
|
||||
struct QLinearPool3DTask final {
|
||||
const float* X_data;
|
||||
T8Bits* Y_data;
|
||||
float y_scale;
|
||||
T8Bits y_zero_point;
|
||||
int64_t x_step;
|
||||
int64_t y_step;
|
||||
int64_t x_image_size;
|
||||
int64_t y_image_size;
|
||||
int64_t pooled_height;
|
||||
int64_t pooled_width;
|
||||
int64_t pooled_depth;
|
||||
|
|
@ -179,8 +342,8 @@ struct QLinearPool3DTask final {
|
|||
}
|
||||
|
||||
void operator()(std::ptrdiff_t c) const {
|
||||
const float* x_d = X_data + c * x_step;
|
||||
T8Bits* y_d = Y_data + c * y_step;
|
||||
const float* x_d = X_data + c * x_image_size;
|
||||
T8Bits* y_d = Y_data + c * y_image_size;
|
||||
|
||||
for (int64_t ph = 0; ph < pooled_height; ++ph) {
|
||||
int64_t hstart = ph * stride_h - pads[0];
|
||||
|
|
@ -218,6 +381,110 @@ struct QLinearPool3DTask final {
|
|||
}
|
||||
};
|
||||
|
||||
template <typename T8Bits, typename PoolType>
|
||||
struct QLinearPoolNhwc3DTask final {
|
||||
const float* X_data;
|
||||
T8Bits* Y_data;
|
||||
float y_scale;
|
||||
T8Bits y_zero_point;
|
||||
int64_t x_image_size;
|
||||
int64_t y_image_size;
|
||||
int64_t kernel_size;
|
||||
int64_t channels;
|
||||
int64_t pooled_height;
|
||||
int64_t pooled_width;
|
||||
int64_t pooled_depth;
|
||||
int64_t stride_h;
|
||||
int64_t stride_w;
|
||||
int64_t stride_d;
|
||||
int64_t height;
|
||||
int64_t width;
|
||||
int64_t depth;
|
||||
const std::vector<int64_t>& kernel_shape;
|
||||
const std::vector<int64_t>& pads;
|
||||
const PoolProcessContext& pool_context_;
|
||||
const PoolAttributes& pool_attrs_;
|
||||
|
||||
TensorOpCost Cost() {
|
||||
double loop_count = static_cast<double>(channels * kernel_size);
|
||||
return TensorOpCost{loop_count, loop_count, loop_count};
|
||||
}
|
||||
|
||||
void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const {
|
||||
int64_t batch = begin / y_image_size;
|
||||
int64_t offset = begin % y_image_size;
|
||||
|
||||
for (int64_t remains = end - begin; remains > 0; offset = 0, batch++) {
|
||||
if (offset + remains <= y_image_size) {
|
||||
operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(offset + remains));
|
||||
remains = 0;
|
||||
} else {
|
||||
operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(y_image_size));
|
||||
remains -= (y_image_size - offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void operator()(std::ptrdiff_t batch, std::ptrdiff_t begin, std::ptrdiff_t end) const {
|
||||
const float* x_d = X_data + batch * x_image_size * channels;
|
||||
T8Bits* y_d = Y_data + batch * y_image_size * channels;
|
||||
|
||||
// Calculate starting pooled_h, pooled_w, pooled_d
|
||||
int64_t start_pd = begin;
|
||||
int64_t start_ph = start_pd / (pooled_width * pooled_depth);
|
||||
start_pd = start_pd - (start_ph * pooled_width * pooled_depth);
|
||||
int64_t start_pw = start_pd / pooled_depth;
|
||||
start_pd = start_pd - start_pw * pooled_depth;
|
||||
int64_t pool_index = channels * begin;
|
||||
int64_t remains = end - begin;
|
||||
|
||||
std::vector<float> Yh(channels);
|
||||
|
||||
for (int64_t ph = start_ph; remains > 0 && ph < pooled_height; ++ph) {
|
||||
int64_t hstart = ph * stride_h - pads[0];
|
||||
int64_t hend = std::min(hstart + kernel_shape[0], height);
|
||||
hstart = std::max(hstart, static_cast<int64_t>(0));
|
||||
for (int64_t pw = start_pw; remains > 0 && pw < pooled_width; ++pw) {
|
||||
int64_t wstart = pw * stride_w - pads[1];
|
||||
int64_t wend = std::min(wstart + kernel_shape[1], width);
|
||||
wstart = std::max(wstart, static_cast<int64_t>(0));
|
||||
for (int64_t pd = start_pd; remains > 0 && pd < pooled_depth; ++pd) {
|
||||
int64_t dstart = pd * stride_d - pads[2];
|
||||
int64_t dend = std::min(dstart + kernel_shape[2], depth);
|
||||
dstart = std::max(dstart, static_cast<int64_t>(0));
|
||||
|
||||
// do the pooling here
|
||||
std::fill(Yh.begin(), Yh.end(), PoolType::Initialize());
|
||||
for (int64_t h = hstart; h < hend; ++h) {
|
||||
const int64_t input_index_h = h * width * depth;
|
||||
for (int64_t w = wstart; w < wend; ++w) {
|
||||
int64_t input_index = channels * (input_index_h + w * depth + dstart);
|
||||
for (int64_t d = dstart; d < dend; ++d) {
|
||||
for (int64_t c = 0; c < channels; c++) {
|
||||
PoolType::Process(x_d[input_index + c], Yh[c], pool_context_);
|
||||
}
|
||||
input_index += channels;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int64_t elements_count = (pool_attrs_.count_include_pad) ? kernel_size : (hend - hstart) * (wend - wstart) * (dend - dstart);
|
||||
for (int64_t c = 0; c < channels; c++) {
|
||||
PoolType::Finalize(elements_count, Yh[c], pool_context_);
|
||||
auto y_value = quantize_value(Yh[c], y_scale, y_zero_point);
|
||||
y_d[pool_index + c] = y_value;
|
||||
}
|
||||
|
||||
pool_index += channels;
|
||||
remains--;
|
||||
}
|
||||
start_pd = 0;
|
||||
}
|
||||
start_pw = 0;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Status QLinearAveragePool::Compute(OpKernelContext* context) const {
|
||||
const auto tensor_x_scale = context->Input<Tensor>(1);
|
||||
const auto tensor_x_zero_point = context->Input<Tensor>(2);
|
||||
|
|
@ -236,9 +503,10 @@ Status QLinearAveragePool::Compute(OpKernelContext* context) const {
|
|||
const auto* X = context->Input<Tensor>(0);
|
||||
auto dtype = X->GetElementType();
|
||||
if (dtype != ONNX_NAMESPACE::TensorProto_DataType_UINT8) {
|
||||
ORT_THROW("Unsupported 'dtype' in QLinear Pooling:", dtype);
|
||||
ORT_THROW("Unsupported 'dtype' in QLinear Pooling:", dtype);
|
||||
}
|
||||
const TensorShape& x_shape = X->Shape();
|
||||
|
||||
TensorShape x_shape = X->Shape();
|
||||
const float x_scale = *(tensor_x_scale->Data<float>());
|
||||
const float y_scale = *(tensor_y_scale->Data<float>());
|
||||
uint8_t x_zero_point = (tensor_x_zero_point ? *(tensor_x_zero_point->Data<uint8_t>()) : (uint8_t)0);
|
||||
|
|
@ -249,12 +517,14 @@ Status QLinearAveragePool::Compute(OpKernelContext* context) const {
|
|||
std::vector<int64_t> strides = pool_attrs_.strides;
|
||||
std::vector<int64_t> kernel_shape = pool_attrs_.kernel_shape;
|
||||
|
||||
if (channels_last_) {
|
||||
std::vector<int64_t> x_dims = x_shape.GetDims();
|
||||
SwitchDimsNchwNhwc(x_dims, false);
|
||||
x_shape = TensorShape(x_dims);
|
||||
}
|
||||
std::vector<int64_t> output_dims = pool_attrs_.SetOutputSize(x_shape, x_shape[1], &pads);
|
||||
Tensor* Y = context->Output(0, output_dims);
|
||||
|
||||
const auto* X_data = X->Data<uint8_t>();
|
||||
auto* Y_data = Y->MutableData<uint8_t>();
|
||||
|
||||
int64_t batch_count = x_shape[0];
|
||||
const int64_t channels = x_shape[1];
|
||||
const int64_t height = x_shape[2];
|
||||
const int64_t width = kernel_shape.size() > 1 ? x_shape[3] : 1;
|
||||
|
|
@ -262,9 +532,17 @@ Status QLinearAveragePool::Compute(OpKernelContext* context) const {
|
|||
const int64_t pooled_height = output_dims[2];
|
||||
const int64_t pooled_width = kernel_shape.size() > 1 ? output_dims[3] : 1;
|
||||
const int64_t pooled_depth = kernel_shape.size() > 2 ? output_dims[4] : 1;
|
||||
const int64_t total_channels = x_shape[0] * channels;
|
||||
const int64_t x_step = height * width * depth;
|
||||
const int64_t y_step = pooled_height * pooled_width * pooled_depth;
|
||||
const int64_t total_channels = batch_count * channels;
|
||||
const int64_t x_image_size = height * width * depth;
|
||||
const int64_t y_image_size = pooled_height * pooled_width * pooled_depth;
|
||||
const int64_t kernel_size = std::accumulate(kernel_shape.begin(), kernel_shape.end(), 1LL, std::multiplies<int64_t>());
|
||||
|
||||
if (channels_last_) {
|
||||
SwitchDimsNchwNhwc(output_dims, true);
|
||||
}
|
||||
Tensor* Y = context->Output(0, output_dims);
|
||||
const auto* X_data = X->Data<uint8_t>();
|
||||
auto* Y_data = Y->MutableData<uint8_t>();
|
||||
|
||||
ThreadPool* tp = context->GetOperatorThreadPool();
|
||||
std::vector<float> x_data_fp32;
|
||||
|
|
@ -274,42 +552,62 @@ Status QLinearAveragePool::Compute(OpKernelContext* context) const {
|
|||
const auto* x8 = X_data + first;
|
||||
float* x32 = x_data_fp32.data() + first;
|
||||
for (ptrdiff_t i = 0, sz = last - first; i < sz; ++i) {
|
||||
*x32++ = dequantize_value(x8[i], x_scale, x_zero_point);
|
||||
*x32++ = dequantize_value(x8[i], x_scale, x_zero_point);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
switch (kernel_shape.size()) {
|
||||
case 1:
|
||||
{
|
||||
QLinearPool1DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_1d = {
|
||||
x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_step, y_step,
|
||||
pooled_height, strides[0], height, kernel_shape, pads, pool_context_, pool_attrs_};
|
||||
ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_1d.Cost(), avg_pool_task_1d);
|
||||
case 1: {
|
||||
if (channels_last_) {
|
||||
QLinearPoolNhwc1DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_1d = {
|
||||
x_data_fp32.data(), Y_data, y_scale, y_zero_point, channels,
|
||||
pooled_height, strides[0], height, kernel_shape, pads, pool_context_, pool_attrs_};
|
||||
ThreadPool::TryParallelFor(tp, y_image_size * batch_count, avg_pool_task_1d.Cost(), avg_pool_task_1d);
|
||||
} else {
|
||||
QLinearPool1DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_1d = {
|
||||
x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_image_size, y_image_size,
|
||||
pooled_height, strides[0], height, kernel_shape, pads, pool_context_, pool_attrs_};
|
||||
ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_1d.Cost(), avg_pool_task_1d);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case 2:
|
||||
{
|
||||
QLinearPool2DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_2d = {
|
||||
x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_step, y_step,
|
||||
pooled_height, pooled_width, strides[0], strides[1], height, width, kernel_shape, pads, pool_context_, pool_attrs_};
|
||||
ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_2d.Cost(), avg_pool_task_2d);
|
||||
case 2: {
|
||||
if (channels_last_) {
|
||||
QLinearPoolNhwc2DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_2d = {
|
||||
x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_image_size, y_image_size, kernel_size, channels,
|
||||
pooled_height, pooled_width, strides[0], strides[1], height, width, kernel_shape, pads, pool_context_, pool_attrs_};
|
||||
ThreadPool::TryParallelFor(tp, y_image_size * batch_count, avg_pool_task_2d.Cost(), avg_pool_task_2d);
|
||||
|
||||
} else {
|
||||
QLinearPool2DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_2d = {
|
||||
x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_image_size, y_image_size,
|
||||
pooled_height, pooled_width, strides[0], strides[1], height, width, kernel_shape, pads, pool_context_, pool_attrs_};
|
||||
ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_2d.Cost(), avg_pool_task_2d);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case 3:
|
||||
{
|
||||
QLinearPool3DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_3d = {
|
||||
x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_step, y_step,
|
||||
pooled_height, pooled_width, pooled_depth, strides[0], strides[1], strides[2], height, width, depth,
|
||||
kernel_shape, pads, pool_context_, pool_attrs_};
|
||||
ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_3d.Cost(), avg_pool_task_3d);
|
||||
case 3: {
|
||||
if (channels_last_) {
|
||||
QLinearPoolNhwc3DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_3d = {
|
||||
x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_image_size, y_image_size, kernel_size, channels,
|
||||
pooled_height, pooled_width, pooled_depth, strides[0], strides[1], strides[2], height, width, depth,
|
||||
kernel_shape, pads, pool_context_, pool_attrs_};
|
||||
ThreadPool::TryParallelFor(tp, y_image_size * batch_count, avg_pool_task_3d.Cost(), avg_pool_task_3d);
|
||||
|
||||
} else {
|
||||
QLinearPool3DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_3d = {
|
||||
x_data_fp32.data(), Y_data, y_scale, y_zero_point, x_image_size, y_image_size,
|
||||
pooled_height, pooled_width, pooled_depth, strides[0], strides[1], strides[2], height, width, depth,
|
||||
kernel_shape, pads, pool_context_, pool_attrs_};
|
||||
ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_3d.Cost(), avg_pool_task_3d);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
{
|
||||
default: {
|
||||
return onnxruntime::common::Status(
|
||||
onnxruntime::common::ONNXRUNTIME,
|
||||
onnxruntime::common::INVALID_ARGUMENT,
|
||||
|
|
|
|||
|
|
@ -12,15 +12,17 @@ namespace contrib {
|
|||
|
||||
class QLinearAveragePool final : public OpKernel, public PoolBase {
|
||||
public:
|
||||
QLinearAveragePool(const OpKernelInfo& info) : OpKernel(info), PoolBase(info) { }
|
||||
QLinearAveragePool(const OpKernelInfo& info) : OpKernel(info), PoolBase(info) {
|
||||
channels_last_ = (info.GetAttrOrDefault<int64_t>("channels_last", static_cast<int64_t>(0)) != 0);
|
||||
}
|
||||
|
||||
~QLinearAveragePool() override = default;
|
||||
|
||||
Status Compute(OpKernelContext* context) const override;
|
||||
|
||||
private:
|
||||
private:
|
||||
PoolProcessContext pool_context_;
|
||||
|
||||
bool channels_last_;
|
||||
};
|
||||
|
||||
} // namespace contrib
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@
|
|||
#include "core/framework/tensorprotoutils.h"
|
||||
#include "core/graph/constants.h"
|
||||
#include "core/graph/contrib_ops/contrib_defs.h"
|
||||
#include "core/graph/contrib_ops/quantization_defs.h"
|
||||
|
||||
namespace ONNX_NAMESPACE {
|
||||
void convPoolShapeInference(
|
||||
|
|
@ -18,7 +19,6 @@ using namespace ONNX_NAMESPACE;
|
|||
|
||||
namespace onnxruntime {
|
||||
namespace contrib {
|
||||
|
||||
class NhwcInferenceContext : public InferenceContext {
|
||||
public:
|
||||
NhwcInferenceContext(InferenceContext& ctx) : ctx_(ctx) {
|
||||
|
|
@ -263,6 +263,156 @@ equal to the spatial dimension of input tensor. Input is of type uint8_t or int8
|
|||
++image_dim_index;
|
||||
}
|
||||
});
|
||||
|
||||
const char* QLinearAveragePoolDoc_ver1 = R"DOC(
|
||||
QLinearAveragePool consumes an input tensor X and applies average pooling across
|
||||
the tensor according to kernel sizes, stride sizes, and pad lengths.
|
||||
average pooling consisting of computing the average on all values of a
|
||||
subset of the input tensor according to the kernel size and downsampling the
|
||||
data into the output tensor Y for further processing. The output spatial shape will be following:
|
||||
```
|
||||
output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - kernel_spatial_shape[i]) / strides_spatial_shape[i] + 1)
|
||||
```
|
||||
or
|
||||
```
|
||||
output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - kernel_spatial_shape[i]) / strides_spatial_shape[i] + 1)
|
||||
```
|
||||
if ceil_mode is enabled
|
||||
|
||||
```
|
||||
* pad_shape[i] is sum of pads along axis i
|
||||
```
|
||||
|
||||
`auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following:
|
||||
```
|
||||
VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - kernel_spatial_shape[i] + 1) / strides_spatial_shape[i])
|
||||
SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i])
|
||||
```
|
||||
And pad shape will be following if `SAME_UPPER` or `SAME_LOWER`:
|
||||
```
|
||||
pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + kernel_spatial_shape[i] - input_spatial_shape[i]
|
||||
```
|
||||
|
||||
The output of each pooling window is divided by the number of elements (exclude pad when attribute count_include_pad is zero).
|
||||
|
||||
Input and output scales and zero points are used to convert the output to a new quantization range.
|
||||
Output = Dequantize(Input) -> AveragePool on fp32 data -> Quantize(output)
|
||||
)DOC";
|
||||
|
||||
static const char* contrib_ops_pads_doc =
|
||||
"Padding for the beginning and ending along each spatial axis, it can take any value greater "
|
||||
"than or equal to 0. The value represent the number of pixels added to the beginning "
|
||||
"and end part of the corresponding axis. `pads` format should be as follow "
|
||||
"[x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels "
|
||||
"added at the beginning of axis `i` and xi_end, the number of pixels added at "
|
||||
"the end of axis `i`. This attribute cannot be used simultaneously with "
|
||||
"auto_pad attribute. If not present, the padding defaults to 0 along start and end of each spatial axis.";
|
||||
static const char* contrib_ops_auto_pad_doc =
|
||||
"auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where "
|
||||
"default value is NOTSET, which means explicit padding is used. "
|
||||
"SAME_UPPER or SAME_LOWER mean pad the input so that the output spatial size match the input."
|
||||
"In case of odd number add the extra padding at the end for SAME_UPPER and at the "
|
||||
"beginning for SAME_LOWER. VALID mean no padding.";
|
||||
|
||||
ONNX_CONTRIB_OPERATOR_SCHEMA(QLinearAveragePool)
|
||||
.SetDomain(kMSDomain)
|
||||
.SinceVersion(1)
|
||||
.SetDoc(QLinearAveragePoolDoc_ver1)
|
||||
.Attr(
|
||||
"count_include_pad",
|
||||
"Whether include pad pixels when calculating values for the edges. Default is 0, doesn't count include pad.",
|
||||
AttributeProto::INT,
|
||||
static_cast<int64_t>(0))
|
||||
.Attr(
|
||||
"kernel_shape",
|
||||
"The size of the kernel along each axis.",
|
||||
AttributeProto::INTS)
|
||||
.Attr(
|
||||
"strides",
|
||||
"Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.",
|
||||
AttributeProto::INTS,
|
||||
OPTIONAL_VALUE)
|
||||
.Attr(
|
||||
"auto_pad",
|
||||
contrib_ops_auto_pad_doc,
|
||||
AttributeProto::STRING,
|
||||
std::string("NOTSET"))
|
||||
.Attr("pads", contrib_ops_pads_doc, AttributeProto::INTS, OPTIONAL_VALUE)
|
||||
.Attr(
|
||||
"ceil_mode",
|
||||
"Whether to use ceil or floor (default) to compute the output shape.",
|
||||
AttributeProto::INT,
|
||||
static_cast<int64_t>(0))
|
||||
.Attr("channels_last", "Works on NHWC layout or not? Default not.", AttributeProto::INT, static_cast<int64_t>(0))
|
||||
.Input(
|
||||
0,
|
||||
"X",
|
||||
"Input data tensor from the previous operator; "
|
||||
"dimensions for image case are (N x C x H x W), "
|
||||
"where N is the batch size, C is the number of "
|
||||
"channels, and H and W are the height and the "
|
||||
"width of the data. For non image case, the "
|
||||
"dimensions are in the form of "
|
||||
"(N x C x D1 x D2 ... Dn), where N is the batch "
|
||||
"size. Optionally, if dimension denotation is "
|
||||
"in effect, the operation expects the input "
|
||||
"data tensor to arrive with the dimension denotation "
|
||||
"of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].",
|
||||
"T")
|
||||
.Input(
|
||||
1,
|
||||
"x_scale",
|
||||
"Input scale. It's a scalar, which means a per-tensor/layer quantization.",
|
||||
"tensor(float)")
|
||||
.Input(
|
||||
2,
|
||||
"x_zero_point",
|
||||
"Input zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
|
||||
"T",
|
||||
OpSchema::Optional)
|
||||
.Input(
|
||||
3,
|
||||
"y_scale",
|
||||
"Output scale. It's a scalar, which means a per-tensor/layer quantization.",
|
||||
"tensor(float)")
|
||||
.Input(
|
||||
4,
|
||||
"y_zero_point",
|
||||
"Output zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
|
||||
"T",
|
||||
OpSchema::Optional)
|
||||
.Output(
|
||||
0,
|
||||
"Y",
|
||||
"Output data tensor from average or max pooling across "
|
||||
"the input tensor. Dimensions will vary based "
|
||||
"on various kernel, stride, and pad sizes. Floor value of "
|
||||
"the dimension is used",
|
||||
"T")
|
||||
.TypeConstraint(
|
||||
"T",
|
||||
{"tensor(uint8)", "tensor(int8)"},
|
||||
"Constrain input and output types to 8 bit tensors.")
|
||||
.TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
|
||||
ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 0, 0);
|
||||
|
||||
auto data_type = ctx.getInputType(0);
|
||||
if (nullptr == data_type || data_type->value_case() != ONNX_NAMESPACE::TypeProto::kTensorType) {
|
||||
fail_type_inference("inputs are expected to have tensor type.");
|
||||
}
|
||||
|
||||
// validate scale and zero points
|
||||
ValidateTypeAndShapeForScaleAndZP(ctx, 1, ONNX_NAMESPACE::TensorProto::FLOAT, true);
|
||||
ValidateTypeAndShapeForScaleAndZP(ctx, 2, data_type->tensor_type().elem_type(), true);
|
||||
ValidateTypeAndShapeForScaleAndZP(ctx, 3, ONNX_NAMESPACE::TensorProto::FLOAT, true);
|
||||
ValidateTypeAndShapeForScaleAndZP(ctx, 4, data_type->tensor_type().elem_type(), true);
|
||||
|
||||
if (getAttribute(ctx, "channels_last", 0) == 0) {
|
||||
ONNX_NAMESPACE::convPoolShapeInference(ctx, false, true, 0, 5);
|
||||
} else {
|
||||
convPoolShapeInferenceNhwc(ctx, false, true, 0, 5);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
} // namespace contrib
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include "core/framework/tensorprotoutils.h"
|
||||
#include "core/graph/contrib_ops/quantization_defs.h"
|
||||
#include "core/graph/constants.h"
|
||||
#include "core/graph/contrib_ops/contrib_defs.h"
|
||||
|
||||
|
|
@ -28,7 +28,7 @@ using ONNX_NAMESPACE::InferenceContext;
|
|||
using ONNX_NAMESPACE::OpSchema;
|
||||
using ONNX_NAMESPACE::OPTIONAL_VALUE;
|
||||
|
||||
void ValidateTypeAndShapeForScaleAndZP(ONNX_NAMESPACE::InferenceContext& ctx, int index, ::google::protobuf::int32 expectedType, bool isScalar, int expectedTensorSize = 0) {
|
||||
void ValidateTypeAndShapeForScaleAndZP(ONNX_NAMESPACE::InferenceContext& ctx, int index, ::google::protobuf::int32 expectedType, bool isScalar, int expectedTensorSize) {
|
||||
if (ctx.getNumInputs() > static_cast<size_t>(index)) {
|
||||
auto data_type = ctx.getInputType(index);
|
||||
if (nullptr == data_type) {
|
||||
|
|
@ -546,151 +546,6 @@ This helps to improve accuracy as after ReduceMean operation the range of the ou
|
|||
}
|
||||
});
|
||||
|
||||
// Operator documentation text for QLinearAveragePool (version 1). It is attached to the
// operator schema through .SetDoc(QLinearAveragePoolDoc_ver1). The text is a raw string
// literal, so every character between R"DOC( and )DOC" is emitted verbatim.
const char* QLinearAveragePoolDoc_ver1 = R"DOC(
|
||||
QLinearAveragePool consumes an input tensor X and applies average pooling across
|
||||
the tensor according to kernel sizes, stride sizes, and pad lengths.
|
||||
average pooling consisting of computing the average on all values of a
|
||||
subset of the input tensor according to the kernel size and downsampling the
|
||||
data into the output tensor Y for further processing. The output spatial shape will be following:
|
||||
```
|
||||
output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - kernel_spatial_shape[i]) / strides_spatial_shape[i] + 1)
|
||||
```
|
||||
or
|
||||
```
|
||||
output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - kernel_spatial_shape[i]) / strides_spatial_shape[i] + 1)
|
||||
```
|
||||
if ceil_mode is enabled
|
||||
|
||||
```
|
||||
* pad_shape[i] is sum of pads along axis i
|
||||
```
|
||||
|
||||
`auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following:
|
||||
```
|
||||
VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - kernel_spatial_shape[i] + 1) / strides_spatial_shape[i])
|
||||
SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i])
|
||||
```
|
||||
And pad shape will be following if `SAME_UPPER` or `SAME_LOWER`:
|
||||
```
|
||||
pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + kernel_spatial_shape[i] - input_spatial_shape[i]
|
||||
```
|
||||
|
||||
The output of each pooling window is divided by the number of elements (exclude pad when attribute count_include_pad is zero).
|
||||
|
||||
Input and output scales and zero points are used to convert the output to a new quantization range.
|
||||
Output = Dequantize(Input) -> AveragePool on fp32 data -> Quantize(output)
|
||||
)DOC";
|
||||
|
||||
// Description text for the `pads` attribute; passed to .Attr("pads", ...) of the
// QLinearAveragePool schema. The adjacent string literals are concatenated by the
// compiler, so each piece ends with a trailing space where a word break is needed.
static const char* contrib_ops_pads_doc =
|
||||
"Padding for the beginning and ending along each spatial axis, it can take any value greater "
|
||||
"than or equal to 0. The value represent the number of pixels added to the beginning "
|
||||
"and end part of the corresponding axis. `pads` format should be as follow "
|
||||
"[x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels "
|
||||
"added at the beginning of axis `i` and xi_end, the number of pixels added at "
|
||||
"the end of axis `i`. This attribute cannot be used simultaneously with "
|
||||
"auto_pad attribute. If not present, the padding defaults to 0 along start and end of each spatial axis.";
|
||||
// Description text for the `auto_pad` attribute; used as the doc string of the
// "auto_pad" .Attr(...) entry of the QLinearAveragePool schema (default "NOTSET").
static const char* contrib_ops_auto_pad_doc =
|
||||
"auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where "
|
||||
"default value is NOTSET, which means explicit padding is used. "
|
||||
"SAME_UPPER or SAME_LOWER mean pad the input so that the output spatial size match the input."
|
||||
"In case of odd number add the extra padding at the end for SAME_UPPER and at the "
|
||||
"beginning for SAME_LOWER. VALID mean no padding.";
|
||||
|
||||
// Schema registration for the QLinearAveragePool contrib operator (kMSDomain, since version 1).
//   Attributes: count_include_pad (default 0), kernel_shape, strides (optional),
//               auto_pad (default "NOTSET"), pads (optional), ceil_mode (default 0).
//   Inputs:     0 X, 1 x_scale, 2 x_zero_point (optional), 3 y_scale, 4 y_zero_point (optional).
//   Output:     0 Y, same type constraint T as X (uint8 or int8 tensors).
ONNX_CONTRIB_OPERATOR_SCHEMA(QLinearAveragePool)
|
||||
.SetDomain(kMSDomain)
|
||||
.SinceVersion(1)
|
||||
.SetDoc(QLinearAveragePoolDoc_ver1)
|
||||
.Attr(
|
||||
"count_include_pad",
|
||||
"Whether include pad pixels when calculating values for the edges. Default is 0, doesn't count include pad.",
|
||||
AttributeProto::INT,
|
||||
static_cast<int64_t>(0))
|
||||
.Attr(
|
||||
"kernel_shape",
|
||||
"The size of the kernel along each axis.",
|
||||
AttributeProto::INTS)
|
||||
.Attr(
|
||||
"strides",
|
||||
"Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.",
|
||||
AttributeProto::INTS,
|
||||
OPTIONAL_VALUE)
|
||||
.Attr(
|
||||
"auto_pad",
|
||||
contrib_ops_auto_pad_doc,
|
||||
AttributeProto::STRING,
|
||||
std::string("NOTSET"))
|
||||
.Attr("pads", contrib_ops_pads_doc, AttributeProto::INTS, OPTIONAL_VALUE)
|
||||
.Attr(
|
||||
"ceil_mode",
|
||||
"Whether to use ceil or floor (default) to compute the output shape.",
|
||||
AttributeProto::INT,
|
||||
static_cast<int64_t>(0))
|
||||
.Input(
|
||||
0,
|
||||
"X",
|
||||
"Input data tensor from the previous operator; "
|
||||
"dimensions for image case are (N x C x H x W), "
|
||||
"where N is the batch size, C is the number of "
|
||||
"channels, and H and W are the height and the "
|
||||
"width of the data. For non image case, the "
|
||||
"dimensions are in the form of "
|
||||
"(N x C x D1 x D2 ... Dn), where N is the batch "
|
||||
"size. Optionally, if dimension denotation is "
|
||||
"in effect, the operation expects the input "
|
||||
"data tensor to arrive with the dimension denotation "
|
||||
"of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].",
|
||||
"T")
|
||||
.Input(
|
||||
1,
|
||||
"x_scale",
|
||||
"Input scale. It's a scalar, which means a per-tensor/layer quantization.",
|
||||
"tensor(float)")
|
||||
.Input(
|
||||
2,
|
||||
"x_zero_point",
|
||||
"Input zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
|
||||
"T",
|
||||
OpSchema::Optional)
|
||||
.Input(
|
||||
3,
|
||||
"y_scale",
|
||||
"Output scale. It's a scalar, which means a per-tensor/layer quantization.",
|
||||
"tensor(float)")
|
||||
.Input(
|
||||
4,
|
||||
"y_zero_point",
|
||||
"Output zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
|
||||
"T",
|
||||
OpSchema::Optional)
|
||||
.Output(
|
||||
0,
|
||||
"Y",
|
||||
"Output data tensor from average or max pooling across "
|
||||
"the input tensor. Dimensions will vary based "
|
||||
"on various kernel, stride, and pad sizes. Floor value of "
|
||||
"the dimension is used",
|
||||
"T")
|
||||
.TypeConstraint(
|
||||
"T",
|
||||
{"tensor(uint8)", "tensor(int8)"},
|
||||
"Constrain input and output types to 8 bit tensors.")
|
||||
.TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
|
||||
// Y takes its element type from X (input 0 -> output 0).
ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 0, 0);
|
||||
|
||||
auto data_type = ctx.getInputType(0);
|
||||
// X must carry tensor type information; its element type is reused below for the zero points.
if (nullptr == data_type || data_type->value_case() != ONNX_NAMESPACE::TypeProto::kTensorType) {
|
||||
fail_type_inference("inputs are expected to have tensor type.");
|
||||
}
|
||||
|
||||
// validate scale and zero points
|
||||
ValidateTypeAndShapeForScaleAndZP(ctx, 1, ONNX_NAMESPACE::TensorProto::FLOAT, true);
|
||||
ValidateTypeAndShapeForScaleAndZP(ctx, 2, data_type->tensor_type().elem_type(), true);
|
||||
ValidateTypeAndShapeForScaleAndZP(ctx, 3, ONNX_NAMESPACE::TensorProto::FLOAT, true);
|
||||
ValidateTypeAndShapeForScaleAndZP(ctx, 4, data_type->tensor_type().elem_type(), true);
|
||||
|
||||
// NOTE(review): arguments are presumably (use_dilation=false, require_kernel_shape=true,
// data input index 0, second-input index 5); index 5 is past the last input (4), i.e. there is
// no weight tensor to consult — confirm against ONNX's convPoolShapeInference.
ONNX_NAMESPACE::convPoolShapeInference(ctx, false, true, 0, 5);
|
||||
});
|
||||
|
||||
const char* QLinearLeakyReluDoc_ver1 = R"DOC(
|
||||
QLinearLeakyRelu takes quantized input data (Tensor), an argument alpha, and quantize parameter for output,
|
||||
and produces one output data (Tensor<T>) where the function `f(x) = quantize(alpha * dequantize(x)) for dequantize(x) < 0`,
|
||||
|
|
|
|||
19
onnxruntime/core/graph/contrib_ops/quantization_defs.h
Normal file
19
onnxruntime/core/graph/contrib_ops/quantization_defs.h
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
#pragma once
|
||||
|
||||
#include "core/graph/onnx_protobuf.h"
|
||||
#include "core/framework/tensorprotoutils.h"
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace contrib {
|
||||
|
||||
// Validates a quantization scale or zero-point input during ONNX type/shape inference.
//   ctx                - inference context of the node being checked.
//   index              - position of the input to validate. The definition only inspects the
//                        input when ctx has more than `index` inputs.
//   expectedType       - element type the input is expected to have. Callers pass
//                        TensorProto::FLOAT for scales and the data tensor's element type
//                        for zero points.
//   isScalar           - NOTE(review): presumably requires the input to be a scalar when true —
//                        confirm against the definition.
//   expectedTensorSize - NOTE(review): presumably the expected element count when the input is
//                        not a scalar — confirm against the definition.
// The default for expectedTensorSize is declared here only; the definition does not repeat it.
void ValidateTypeAndShapeForScaleAndZP(
|
||||
ONNX_NAMESPACE::InferenceContext& ctx,
|
||||
int index,
|
||||
::google::protobuf::int32 expectedType,
|
||||
bool isScalar,
|
||||
int expectedTensorSize = 0);
|
||||
|
||||
}
|
||||
} // namespace onnxruntime
|
||||
|
|
@ -427,7 +427,8 @@ void NhwcTransformerImpl::Transform(Node& node) {
|
|||
} else if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearLeakyRelu", {1}, kMSDomain) ||
|
||||
graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearSigmoid", {1}, kMSDomain)) {
|
||||
TransformQLinearActivation(node);
|
||||
} else if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearGlobalAveragePool", {1}, kMSDomain)) {
|
||||
} else if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearGlobalAveragePool", {1}, kMSDomain) ||
|
||||
graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearAveragePool", {1}, kMSDomain)) {
|
||||
TransformQLinearGlobalAveragePool(node);
|
||||
} else if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearConcat", {1}, kMSDomain)) {
|
||||
TransformQLinearConcat(node);
|
||||
|
|
|
|||
|
|
@ -192,6 +192,105 @@ void RunQLinearAveragePoolNchwU8(
|
|||
run_test(true /* only_x_not_initializer */, true /* x_y_same_zero_point */);
|
||||
}
|
||||
|
||||
// Converts a channels-first shape (N, C, D1, ..., Dk) into its channels-last
// counterpart (N, D1, ..., Dk, C).
//
// Shapes of rank 0 or 1 have no channel axis to move and are returned unchanged;
// erasing/indexing element 1 of such a vector would be undefined behaviour.
static std::vector<int64_t> dims_to_nhwc(const std::vector<int64_t>& nchw) {
  if (nchw.size() < 2) {
    return nchw;
  }

  std::vector<int64_t> nhwc;
  nhwc.reserve(nchw.size());
  nhwc.push_back(nchw[0]);                                // batch
  nhwc.insert(nhwc.end(), nchw.begin() + 2, nchw.end());  // spatial dims, order preserved
  nhwc.push_back(nchw[1]);                                // channels go last
  return nhwc;
}
|
||||
|
||||
// Reorders uint8 tensor data from channels-first (N, C, D1..Dk) to channels-last
// (N, D1..Dk, C) layout.
//
//   nchw_data - flattened tensor contents in channels-first order.
//   nchw_dims - the channels-first shape describing nchw_data.
// Returns a new buffer of the same size holding the channels-last ordering.
//
// Each batch occupies channels * image_size elements, so that is the stride used
// to step from one batch to the next in both layouts.
static std::vector<uint8_t> transpose_to_nhwc(const std::vector<uint8_t>& nchw_data, const std::vector<int64_t>& nchw_dims) {
  // Without a channel axis both layouts coincide; also avoids reading nchw_dims[1] out of range.
  if (nchw_dims.size() < 2) {
    return nchw_data;
  }

  std::vector<uint8_t> nhwc_data(nchw_data.size());

  const int64_t batch_count = nchw_dims[0];
  const int64_t channels = nchw_dims[1];
  const int64_t image_size = std::accumulate(nchw_dims.begin() + 2, nchw_dims.end(), int64_t{1}, std::multiplies<int64_t>());
  const int64_t batch_stride = channels * image_size;

  for (int64_t b = 0; b < batch_count; b++) {
    const uint8_t* nchw_image = nchw_data.data() + (b * batch_stride);
    uint8_t* nhwc_image = nhwc_data.data() + (b * batch_stride);
    // For every spatial position emit all channel values contiguously.
    for (int64_t img_index = 0; img_index < image_size; ++img_index) {
      for (int64_t c = 0; c < channels; c++) {
        *nhwc_image++ = nchw_image[c * image_size + img_index];
      }
    }
  }

  return nhwc_data;
}
|
||||
|
||||
void RunQLinearAveragePoolNhwcU8(
|
||||
const std::vector<int64_t> x_dims,
|
||||
const std::vector<int64_t> y_dims,
|
||||
const std::vector<int64_t> kernel_shape,
|
||||
const std::vector<int64_t> strides,
|
||||
const std::vector<int64_t> pads,
|
||||
const int64_t count_include_pad = 0) {
|
||||
float x_scale = 1.0f / 255.0f;
|
||||
uint8_t x_zero_point = 128;
|
||||
RandomValueGenerator random{};
|
||||
std::vector<float> x_data_fp32 = random.Uniform<float>(x_dims, -0.5f, 0.5f);
|
||||
std::vector<uint8_t> x_data(x_data_fp32.size());
|
||||
for (size_t i = 0; i < x_data.size(); ++i) {
|
||||
x_data[i] = quantize_u8(x_data_fp32[i], x_scale, x_zero_point);
|
||||
}
|
||||
|
||||
float y_scale = 1.0f / 255.0f;
|
||||
uint8_t y_zero_point = 100;
|
||||
int64_t y_size = std::accumulate(y_dims.begin(), y_dims.end(), 1LL, std::multiplies<int64_t>());
|
||||
std::vector<uint8_t> y_data(y_size);
|
||||
CalculateAvgPoolNchwU8(
|
||||
x_data.data(), x_dims, x_scale, x_zero_point,
|
||||
y_data.data(), y_dims, y_scale, y_zero_point,
|
||||
kernel_shape, strides, pads, count_include_pad);
|
||||
|
||||
// transpose the result
|
||||
std::vector<uint8_t> y_data_nhwc = transpose_to_nhwc(y_data, y_dims);
|
||||
std::vector<uint8_t> x_data_nhwc = transpose_to_nhwc(x_data, x_dims);
|
||||
auto x_dims_nhwc = dims_to_nhwc(x_dims);
|
||||
auto y_dims_nhwc = dims_to_nhwc(y_dims);
|
||||
|
||||
OpTester test("QLinearAveragePool", 1, onnxruntime::kMSDomain);
|
||||
|
||||
test.AddAttribute("auto_pad", "");
|
||||
test.AddAttribute("strides", strides);
|
||||
test.AddAttribute("pads", pads);
|
||||
test.AddAttribute("kernel_shape", kernel_shape);
|
||||
test.AddAttribute("count_include_pad", count_include_pad);
|
||||
test.AddAttribute("channels_last", (int64_t)1LL);
|
||||
|
||||
test.AddInput<uint8_t>("X", x_dims_nhwc, x_data_nhwc);
|
||||
test.AddInput<float>("x_scale", {}, {x_scale});
|
||||
test.AddInput<uint8_t>("x_zero_point", {}, {x_zero_point});
|
||||
test.AddInput<float>("y_scale", {}, {y_scale});
|
||||
test.AddInput<uint8_t>("y_zero_point", {}, {y_zero_point});
|
||||
test.AddOutput<uint8_t>("Y", y_dims_nhwc, y_data_nhwc);
|
||||
|
||||
auto q8checker = [&](const std::vector<OrtValue>& fetches, const std::string& provider_type) {
|
||||
const OrtValue& ort_value = fetches[0];
|
||||
if (ort_value.Fence()) {
|
||||
ort_value.Fence()->BeforeUsingAsInput(onnxruntime::kCpuExecutionProvider, 0);
|
||||
}
|
||||
|
||||
auto y_shape = TensorShape(y_dims_nhwc);
|
||||
const Tensor& output_tensor = ort_value.Get<Tensor>();
|
||||
ORT_ENFORCE(y_shape == output_tensor.Shape(),
|
||||
"Expected output shape [" + y_shape.ToString() + "] did not match run output shape [" +
|
||||
output_tensor.Shape().ToString() + "] for Y @" + provider_type);
|
||||
auto* output = output_tensor.Data<uint8_t>();
|
||||
auto size = static_cast<int>(output_tensor.Shape().Size());
|
||||
for (int i = 0; i < size; ++i) {
|
||||
int diff = abs(y_data_nhwc[i] - output[i]);
|
||||
EXPECT_LE(diff, 1) << "i:" << i << " expected:" << y_data_nhwc[i] << " " << (int)y_data_nhwc[i]
|
||||
<< ", got:" << output[i] << " " << (int)output[i] << ", provider_type: " << provider_type;
|
||||
}
|
||||
};
|
||||
test.SetCustomOutputVerifier(q8checker);
|
||||
|
||||
static std::unordered_set<std::string> excluded_providers = {kNnapiExecutionProvider};
|
||||
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess, "", excluded_providers);
|
||||
}
|
||||
|
||||
TEST(QLinearPoolTest, AveragePool1D_ExcludePadPixel) {
|
||||
RunQLinearAveragePoolNchwU8(
|
||||
{1, 1, 5}, // x shape
|
||||
|
|
@ -252,5 +351,68 @@ TEST(QLinearPoolTest, AveragePool3D_IncludePadPixel) {
|
|||
1); // count_include_pad
|
||||
}
|
||||
|
||||
/*************************************************
|
||||
* Channels last test
|
||||
**************************************************/
|
||||
// Channels-last 1D average pool with count_include_pad = 0.
// Shapes are written channels-first; the helper converts them to channels-last.
// NOTE(review): N = 1 and C = 1 here, so the NHWC and NCHW layouts are identical;
// a multi-channel case would exercise the channels-last path more thoroughly.
TEST(QLinearPoolTest, AveragePool1D_ExcludePadPixel_nhwc) {
  const std::vector<int64_t> x_shape{1, 1, 5};
  const std::vector<int64_t> expected_y_shape{1, 1, 6};
  const std::vector<int64_t> kernel_shape{3};
  const std::vector<int64_t> strides{1};
  const std::vector<int64_t> pads{1, 2};
  const int64_t count_include_pad = 0;

  RunQLinearAveragePoolNhwcU8(x_shape, expected_y_shape, kernel_shape, strides, pads, count_include_pad);
}
|
||||
|
||||
// Channels-last 1D average pool with count_include_pad = 1.
// Shapes are written channels-first; the helper converts them to channels-last.
// NOTE(review): N = 1 and C = 1 here, so the NHWC and NCHW layouts are identical;
// a multi-channel case would exercise the channels-last path more thoroughly.
TEST(QLinearPoolTest, AveragePool1D_IncludePadPixel_nhwc) {
  const std::vector<int64_t> x_shape{1, 1, 5};
  const std::vector<int64_t> expected_y_shape{1, 1, 6};
  const std::vector<int64_t> kernel_shape{3};
  const std::vector<int64_t> strides{1};
  const std::vector<int64_t> pads{1, 2};
  const int64_t count_include_pad = 1;

  RunQLinearAveragePoolNhwcU8(x_shape, expected_y_shape, kernel_shape, strides, pads, count_include_pad);
}
|
||||
|
||||
// Channels-last 2D average pool with count_include_pad = 0.
// Shapes are written channels-first; the helper converts them to channels-last.
// NOTE(review): N = 1 and C = 1 here, so the NHWC and NCHW layouts are identical;
// a multi-channel case would exercise the channels-last path more thoroughly.
TEST(QLinearPoolTest, AveragePool2D_ExcludePadPixel_nhwc) {
  const std::vector<int64_t> x_shape{1, 1, 5, 7};
  const std::vector<int64_t> expected_y_shape{1, 1, 6, 4};
  const std::vector<int64_t> kernel_shape{3, 4};
  const std::vector<int64_t> strides{1, 2};
  const std::vector<int64_t> pads{1, 3, 2, 1};
  const int64_t count_include_pad = 0;

  RunQLinearAveragePoolNhwcU8(x_shape, expected_y_shape, kernel_shape, strides, pads, count_include_pad);
}
|
||||
|
||||
// Channels-last 2D average pool with count_include_pad = 1.
// Shapes are written channels-first; the helper converts them to channels-last.
// NOTE(review): N = 1 and C = 1 here, so the NHWC and NCHW layouts are identical;
// a multi-channel case would exercise the channels-last path more thoroughly.
TEST(QLinearPoolTest, AveragePool2D_IncludePadPixel_nhwc) {
  const std::vector<int64_t> x_shape{1, 1, 5, 7};
  const std::vector<int64_t> expected_y_shape{1, 1, 6, 4};
  const std::vector<int64_t> kernel_shape{3, 4};
  const std::vector<int64_t> strides{1, 2};
  const std::vector<int64_t> pads{1, 3, 2, 1};
  const int64_t count_include_pad = 1;

  RunQLinearAveragePoolNhwcU8(x_shape, expected_y_shape, kernel_shape, strides, pads, count_include_pad);
}
|
||||
|
||||
// Channels-last 3D average pool with count_include_pad = 0.
// Shapes are written channels-first; the helper converts them to channels-last.
// NOTE(review): N = 1 and C = 1 here, so the NHWC and NCHW layouts are identical;
// a multi-channel case would exercise the channels-last path more thoroughly.
TEST(QLinearPoolTest, AveragePool3D_ExcludePadPixel_nhwc) {
  const std::vector<int64_t> x_shape{1, 1, 5, 7, 9};
  const std::vector<int64_t> expected_y_shape{1, 1, 6, 4, 3};
  const std::vector<int64_t> kernel_shape{3, 4, 5};
  const std::vector<int64_t> strides{1, 2, 3};
  const std::vector<int64_t> pads{1, 3, 2, 2, 1, 2};
  const int64_t count_include_pad = 0;

  RunQLinearAveragePoolNhwcU8(x_shape, expected_y_shape, kernel_shape, strides, pads, count_include_pad);
}
|
||||
|
||||
// Channels-last 3D average pool with count_include_pad = 1.
// Shapes are written channels-first; the helper converts them to channels-last.
// NOTE(review): N = 1 and C = 1 here, so the NHWC and NCHW layouts are identical;
// a multi-channel case would exercise the channels-last path more thoroughly.
TEST(QLinearPoolTest, AveragePool3D_IncludePadPixel_nhwc) {
  const std::vector<int64_t> x_shape{1, 1, 5, 7, 9};
  const std::vector<int64_t> expected_y_shape{1, 1, 6, 4, 3};
  const std::vector<int64_t> kernel_shape{3, 4, 5};
  const std::vector<int64_t> strides{1, 2, 3};
  const std::vector<int64_t> pads{1, 3, 2, 2, 1, 2};
  const int64_t count_include_pad = 1;

  RunQLinearAveragePoolNhwcU8(x_shape, expected_y_shape, kernel_shape, strides, pads, count_include_pad);
}
|
||||
|
||||
} // namespace test
|
||||
} // namespace onnxruntime
|
||||
|
|
|
|||
|
|
@ -245,6 +245,51 @@ TEST(NhwcTransformerTests, ConvGlobalAveragePool) {
|
|||
TransformerLevel::Level3);
|
||||
}
|
||||
|
||||
// Builds QLinearConv -> QLinearAveragePool -> QLinearConv -> QLinearAveragePool ->
// DequantizeLinear and checks that, after the transformer run, the graph still holds
// two QLinearConv nodes and exactly two Transpose nodes.
TEST(NhwcTransformerTests, ConvAveragePool) {
  auto build_test_case = [&](ModelTestBuilder& builder) {
    auto* graph_input = builder.MakeInput<uint8_t>({1, 23, 13, 13}, 0, 31);
    auto* conv1_out = builder.MakeIntermediate();
    auto* conv2_out = builder.MakeIntermediate();
    auto* pool1_out = builder.MakeIntermediate();
    auto* pool2_out = builder.MakeIntermediate();
    auto* graph_output = builder.MakeOutput();
    auto* conv1_weights = NhwcMakeInitializer<uint8_t>(builder, {30, 23, 3, 3});
    auto* conv2_weights = NhwcMakeInitializer<uint8_t>(builder, {16, 30, 3, 3});

    // Both pooling nodes share the same kernel_shape and pads attributes.
    auto configure_pool = [](Node& pool_node) {
      pool_node.AddAttribute("kernel_shape", std::vector<int64_t>{3, 3});
      pool_node.AddAttribute("pads", std::vector<int64_t>{1, 1, 1, 1});
    };

    // First conv (padded) followed by the first pool.
    Node& first_conv = builder.AddQLinearConvNode<uint8_t>(graph_input, .01f, 135,
                                                           conv1_weights, .02f, 126,
                                                           conv1_out, .37f, 131);
    first_conv.AddAttribute("pads", std::vector<int64_t>{1, 1, 1, 1});
    Node& first_pool = builder.AddQLinearActivationNode("QLinearAveragePool",
                                                        conv1_out, .37f, 131,
                                                        pool1_out, .43f, 111);
    configure_pool(first_pool);

    // Second conv (no pads attribute) followed by the second pool.
    builder.AddQLinearConvNode<uint8_t>(pool1_out, .43f, 111,
                                        conv2_weights, .015f, 129,
                                        conv2_out, .37f, 131);
    Node& second_pool = builder.AddQLinearActivationNode("QLinearAveragePool",
                                                         conv2_out, .37f, 131,
                                                         pool2_out, .37f, 131);
    configure_pool(second_pool);

    builder.AddDequantizeLinearNode<uint8_t>(pool2_out, .37f, 131, graph_output);
  };

  auto check_nhwc_graph = [&](InferenceSessionWrapper& session) {
    auto op_counts = CountOpsInGraph(session.GetGraph());
    EXPECT_EQ(op_counts["com.microsoft.QLinearConv"], 2);
    EXPECT_EQ(op_counts["Transpose"], 2);
  };

  TransformerTester(build_test_case,
                    check_nhwc_graph,
                    TransformerLevel::Level2,
                    TransformerLevel::Level3);
}
|
||||
|
||||
TEST(NhwcTransformerTests, ConvSplit) {
|
||||
for (int64_t axis = -4LL; axis < 4; axis++) {
|
||||
auto build_test_case = [&, axis](ModelTestBuilder& builder) {
|
||||
|
|
|
|||
Loading…
Reference in a new issue