mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-18 21:21:17 +00:00
Implement QLinearAveragePool with unit tests. (#6896)
Implement QLinearAveragePool with unit tests.
This commit is contained in:
parent
a8b897f710
commit
acfe7ac4ce
5 changed files with 605 additions and 1 deletions
|
|
@ -47,6 +47,7 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSExperimentalDoma
|
|||
// ******** Start: Quantization ******************* //
|
||||
// Forward declarations of the kMSDomain quantization kernel classes for the CPU execution provider.
// Each class named here has a matching BuildKernelCreateInfo<...> entry in RegisterQuantizationKernels().
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, MatMulInteger16);
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, QLinearGlobalAveragePool);
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, QLinearAveragePool);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, DequantizeLinear);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, DequantizeLinear);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, QuantizeLinear);
|
||||
|
|
@ -131,6 +132,7 @@ Status RegisterQuantizationKernels(KernelRegistry& kernel_registry) {
|
|||
BuildKernelCreateInfo<void>, //default entry to avoid the list become empty after ops-reducing
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, MatMulInteger16)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, QLinearGlobalAveragePool)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, QLinearAveragePool)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, QuantizeLinear)>,
|
||||
|
|
|
|||
327
onnxruntime/contrib_ops/cpu/qlinear_pool.cc
Normal file
327
onnxruntime/contrib_ops/cpu/qlinear_pool.cc
Normal file
|
|
@ -0,0 +1,327 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include "qlinear_pool.h"
|
||||
|
||||
#include "core/util/math_cpuonly.h"
|
||||
#include "core/providers/common.h"
|
||||
#include "core/platform/threadpool.h"
|
||||
#include "core/util/math.h"
|
||||
#include "core/mlas/inc/mlas.h"
|
||||
|
||||
#include <functional>
|
||||
|
||||
namespace onnxruntime {
|
||||
|
||||
using concurrency::ThreadPool;
|
||||
|
||||
namespace contrib {
|
||||
|
||||
// Scalar (de)quantization helpers shared by the QLinear pooling tasks below.
// Only the uint8_t specializations are defined in this file.

// Converts a quantized value to float: x_scale * (x - x_zero_point).
template <typename T8Bits>
static inline float dequantize_value(T8Bits x, float x_scale, T8Bits x_zero_point);

// Converts a float to its quantized representation using y_scale and y_zero_point.
template <typename T8Bits>
static inline T8Bits quantize_value(float y, float y_scale, T8Bits y_zero_point);

template <>
inline float dequantize_value<uint8_t>(uint8_t x, float x_scale, uint8_t x_zero_point) {
  // Subtract in int so that x < x_zero_point produces a negative offset rather than wrapping.
  const int offset_from_zero = static_cast<int>(x) - static_cast<int>(x_zero_point);
  return x_scale * offset_from_zero;
}

template <>
inline uint8_t quantize_value<uint8_t>(float y, float y_scale, uint8_t y_zero_point) {
  // nearbyintf rounds according to the current floating-point rounding mode.
  const float rounded = std::nearbyintf(y / y_scale + y_zero_point);
  // Saturate to [0, 255] before narrowing to uint8_t.
  const float capped = std::min(rounded, 255.0f);
  const float saturated = std::max(0.0f, capped);
  return static_cast<uint8_t>(saturated);
}
|
||||
|
||||
template <typename T8Bits, typename PoolType>
|
||||
struct QLinearPool1DTask final {
|
||||
const float* X_data;
|
||||
T8Bits* Y_data;
|
||||
float y_scale;
|
||||
T8Bits y_zero_point;
|
||||
int64_t x_step;
|
||||
int64_t y_step;
|
||||
int64_t pooled_height;
|
||||
int64_t stride_h;
|
||||
int64_t height;
|
||||
const std::vector<int64_t>& kernel_shape;
|
||||
const std::vector<int64_t>& pads;
|
||||
const PoolProcessContext& pool_context_;
|
||||
const PoolAttributes& pool_attrs_;
|
||||
|
||||
TensorOpCost Cost() {
|
||||
double loop_count = static_cast<double>(pooled_height * kernel_shape[0]);
|
||||
return TensorOpCost{loop_count, loop_count, loop_count};
|
||||
}
|
||||
|
||||
void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const {
|
||||
for (int64_t c = begin; c < end; ++c) {
|
||||
operator()(c);
|
||||
}
|
||||
}
|
||||
|
||||
void operator()(std::ptrdiff_t c) const {
|
||||
const float* x_d = X_data + c * x_step;
|
||||
T8Bits* y_d = Y_data + c * y_step;
|
||||
|
||||
for (int64_t ph = 0; ph < pooled_height; ++ph) {
|
||||
int64_t hstart = ph * stride_h - pads[0];
|
||||
int64_t hend = std::min(hstart + kernel_shape[0], height);
|
||||
hstart = std::max(hstart, static_cast<int64_t>(0));
|
||||
float Yh = PoolType::Initialize();
|
||||
for (int64_t h = hstart; h < hend; ++h) {
|
||||
PoolType::Process(x_d[h], Yh, pool_context_);
|
||||
}
|
||||
if (pool_attrs_.count_include_pad) {
|
||||
PoolType::Finalize(kernel_shape[0], Yh, pool_context_);
|
||||
} else {
|
||||
PoolType::Finalize(hend - hstart, Yh, pool_context_);
|
||||
}
|
||||
y_d[ph] = quantize_value(Yh, y_scale, y_zero_point);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template <typename T8Bits, typename PoolType>
|
||||
struct QLinearPool2DTask final {
|
||||
const float* X_data;
|
||||
T8Bits* Y_data;
|
||||
float y_scale;
|
||||
T8Bits y_zero_point;
|
||||
int64_t x_step;
|
||||
int64_t y_step;
|
||||
int64_t pooled_height;
|
||||
int64_t pooled_width;
|
||||
int64_t stride_h;
|
||||
int64_t stride_w;
|
||||
int64_t height;
|
||||
int64_t width;
|
||||
const std::vector<int64_t>& kernel_shape;
|
||||
const std::vector<int64_t>& pads;
|
||||
const PoolProcessContext& pool_context_;
|
||||
const PoolAttributes& pool_attrs_;
|
||||
|
||||
TensorOpCost Cost() {
|
||||
double loop_count = static_cast<double>(pooled_height * pooled_width * kernel_shape[0] * kernel_shape[1]);
|
||||
return TensorOpCost{loop_count, loop_count, loop_count};
|
||||
}
|
||||
|
||||
void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const {
|
||||
for (int64_t c = begin; c < end; ++c) {
|
||||
operator()(c);
|
||||
}
|
||||
}
|
||||
|
||||
void operator()(std::ptrdiff_t c) const {
|
||||
const float* x_d = X_data + c * x_step;
|
||||
T8Bits* y_d = Y_data + c * y_step;
|
||||
|
||||
for (int64_t ph = 0; ph < pooled_height; ++ph) {
|
||||
int64_t hstart = ph * stride_h - pads[0];
|
||||
int64_t hend = std::min(hstart + kernel_shape[0], height);
|
||||
hstart = std::max(hstart, static_cast<int64_t>(0));
|
||||
for (int64_t pw = 0; pw < pooled_width; ++pw) {
|
||||
int64_t wstart = pw * stride_w - pads[1];
|
||||
int64_t wend = std::min(wstart + kernel_shape[1], width);
|
||||
wstart = std::max(wstart, static_cast<int64_t>(0));
|
||||
const int64_t pool_index = ph * pooled_width + pw;
|
||||
float Yh = PoolType::Initialize();
|
||||
for (int64_t h = hstart; h < hend; ++h) {
|
||||
int64_t input_index = h * width + wstart;
|
||||
for (int64_t w = wstart; w < wend; ++w) {
|
||||
PoolType::Process(x_d[input_index++], Yh, pool_context_);
|
||||
}
|
||||
}
|
||||
if (pool_attrs_.count_include_pad) {
|
||||
PoolType::Finalize(kernel_shape[0] * kernel_shape[1], Yh, pool_context_);
|
||||
} else {
|
||||
PoolType::Finalize((hend - hstart) * (wend - wstart), Yh, pool_context_);
|
||||
}
|
||||
y_d[pool_index] = quantize_value(Yh, y_scale, y_zero_point);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T8Bits, typename PoolType>
|
||||
struct QLinearPool3DTask final {
|
||||
const float* X_data;
|
||||
T8Bits* Y_data;
|
||||
float y_scale;
|
||||
T8Bits y_zero_point;
|
||||
int64_t x_step;
|
||||
int64_t y_step;
|
||||
int64_t pooled_height;
|
||||
int64_t pooled_width;
|
||||
int64_t pooled_depth;
|
||||
int64_t stride_h;
|
||||
int64_t stride_w;
|
||||
int64_t stride_d;
|
||||
int64_t height;
|
||||
int64_t width;
|
||||
int64_t depth;
|
||||
const std::vector<int64_t>& kernel_shape;
|
||||
const std::vector<int64_t>& pads;
|
||||
const PoolProcessContext& pool_context_;
|
||||
const PoolAttributes& pool_attrs_;
|
||||
|
||||
TensorOpCost Cost() {
|
||||
double loop_count = static_cast<double>(pooled_height * pooled_width * pooled_depth * kernel_shape[0] *
|
||||
kernel_shape[1] * kernel_shape[2]);
|
||||
return TensorOpCost{loop_count, loop_count, loop_count};
|
||||
}
|
||||
|
||||
void operator()(std::ptrdiff_t begin, std::ptrdiff_t end) const {
|
||||
for (int64_t c = begin; c < end; ++c) {
|
||||
operator()(c);
|
||||
}
|
||||
}
|
||||
|
||||
void operator()(std::ptrdiff_t c) const {
|
||||
const float* x_d = X_data + c * x_step;
|
||||
T8Bits* y_d = Y_data + c * y_step;
|
||||
|
||||
for (int64_t ph = 0; ph < pooled_height; ++ph) {
|
||||
int64_t hstart = ph * stride_h - pads[0];
|
||||
int64_t hend = std::min(hstart + kernel_shape[0], height);
|
||||
hstart = std::max(hstart, static_cast<int64_t>(0));
|
||||
for (int64_t pw = 0; pw < pooled_width; ++pw) {
|
||||
int64_t wstart = pw * stride_w - pads[1];
|
||||
int64_t wend = std::min(wstart + kernel_shape[1], width);
|
||||
wstart = std::max(wstart, static_cast<int64_t>(0));
|
||||
for (int64_t pd = 0; pd < pooled_depth; ++pd) {
|
||||
int64_t dstart = pd * stride_d - pads[2];
|
||||
int64_t dend = std::min(dstart + kernel_shape[2], depth);
|
||||
dstart = std::max(dstart, static_cast<int64_t>(0));
|
||||
const int64_t pool_index = ph * pooled_width * pooled_depth + pw * pooled_depth + pd;
|
||||
float Yh = PoolType::Initialize();
|
||||
for (int64_t h = hstart; h < hend; ++h) {
|
||||
const int64_t input_index_h = h * width * depth;
|
||||
for (int64_t w = wstart; w < wend; ++w) {
|
||||
int64_t input_index = input_index_h + w * depth + dstart;
|
||||
for (int64_t d = dstart; d < dend; ++d) {
|
||||
PoolType::Process(x_d[input_index++], Yh, pool_context_);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (pool_attrs_.count_include_pad) {
|
||||
PoolType::Finalize(kernel_shape[0] * kernel_shape[1] * kernel_shape[2], Yh, pool_context_);
|
||||
} else {
|
||||
PoolType::Finalize((hend - hstart) * (wend - wstart) * (dend - dstart), Yh, pool_context_);
|
||||
}
|
||||
auto y_value = quantize_value(Yh, y_scale, y_zero_point);
|
||||
y_d[pool_index] = y_value;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Computes quantized average pooling for a uint8 input.
//
// Inputs: 0 = X, 1 = x_scale, 2 = x_zero_point, 3 = y_scale, 4 = y_zero_point.
// A missing zero point is treated as 0. The whole input is first dequantized to float,
// then pooled per (batch, channel) slice by the 1-D/2-D/3-D task matching the kernel rank,
// and each pooled value is re-quantized with y_scale / y_zero_point.
// Returns INVALID_ARGUMENT when the kernel rank is not 1, 2 or 3.
Status QLinearAveragePool::Compute(OpKernelContext* context) const {
  const auto tensor_x_scale = context->Input<Tensor>(1);
  const auto tensor_x_zero_point = context->Input<Tensor>(2);
  const auto tensor_y_scale = context->Input<Tensor>(3);
  const auto tensor_y_zero_point = context->Input<Tensor>(4);

  ORT_ENFORCE(IsScalarOr1ElementVector(tensor_x_scale),
              "Input x_scale must be a scalar or 1D tensor of size 1");
  ORT_ENFORCE(tensor_x_zero_point == nullptr || IsScalarOr1ElementVector(tensor_x_zero_point),
              "input x_zero_point must be a scalar or 1D tensor of size 1 if given");
  ORT_ENFORCE(IsScalarOr1ElementVector(tensor_y_scale),
              "input y_scale must be a scalar or 1D tensor of size 1");
  ORT_ENFORCE(tensor_y_zero_point == nullptr || IsScalarOr1ElementVector(tensor_y_zero_point),
              "input y_zero_point must be a scalar or 1D tensor of size 1 if given");

  const auto* X = context->Input<Tensor>(0);
  auto dtype = X->GetElementType();
  if (dtype != ONNX_NAMESPACE::TensorProto_DataType_UINT8) {
    ORT_THROW("Unsupported 'dtype' in QLinear Pooling:", dtype);
  }

  const TensorShape& x_shape = X->Shape();
  const float x_scale = *(tensor_x_scale->Data<float>());
  const float y_scale = *(tensor_y_scale->Data<float>());

  // Zero points default to 0 when the corresponding input is absent.
  uint8_t x_zero_point = static_cast<uint8_t>(0);
  if (tensor_x_zero_point) {
    x_zero_point = *(tensor_x_zero_point->Data<uint8_t>());
  }
  uint8_t y_zero_point = static_cast<uint8_t>(0);
  if (tensor_y_zero_point) {
    y_zero_point = *(tensor_y_zero_point->Data<uint8_t>());
  }

  ORT_RETURN_IF_NOT(x_shape.NumDimensions() >= 3, "Input dimension cannot be less than 3.");

  // Local copies: SetOutputSize() receives a pointer to pads, and the tasks hold references to these vectors.
  std::vector<int64_t> pads = pool_attrs_.pads;
  std::vector<int64_t> strides = pool_attrs_.strides;
  std::vector<int64_t> kernel_shape = pool_attrs_.kernel_shape;

  std::vector<int64_t> output_dims = pool_attrs_.SetOutputSize(x_shape, x_shape[1], &pads);
  Tensor* Y = context->Output(0, output_dims);

  const auto* input_u8 = X->Data<uint8_t>();
  auto* output_u8 = Y->MutableData<uint8_t>();

  const size_t pool_rank = kernel_shape.size();

  // Spatial extents; dimensions beyond the kernel rank collapse to 1.
  const int64_t channels = x_shape[1];
  const int64_t height = x_shape[2];
  const int64_t width = pool_rank > 1 ? x_shape[3] : 1;
  const int64_t depth = pool_rank > 2 ? x_shape[4] : 1;
  const int64_t pooled_height = output_dims[2];
  const int64_t pooled_width = pool_rank > 1 ? output_dims[3] : 1;
  const int64_t pooled_depth = pool_rank > 2 ? output_dims[4] : 1;

  // One task invocation per (batch, channel) slice.
  const int64_t total_channels = x_shape[0] * channels;
  const int64_t x_step = height * width * depth;
  const int64_t y_step = pooled_height * pooled_width * pooled_depth;

  ThreadPool* tp = context->GetOperatorThreadPool();

  // Dequantize the entire input up front so the pooling tasks operate on floats.
  std::vector<float> x_data_fp32;
  if (pool_rank <= 3) {
    const int64_t element_count = x_shape.Size();
    x_data_fp32.resize(element_count);
    ThreadPool::TryParallelFor(
        tp, element_count, 1.0f,
        [input_u8, x_scale, x_zero_point, &x_data_fp32](ptrdiff_t first, ptrdiff_t last) {
          float* dequantized = x_data_fp32.data();
          for (ptrdiff_t i = first; i < last; ++i) {
            dequantized[i] = dequantize_value(input_u8[i], x_scale, x_zero_point);
          }
        });
  }

  if (pool_rank == 1) {
    QLinearPool1DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_1d = {
        x_data_fp32.data(), output_u8, y_scale, y_zero_point, x_step, y_step,
        pooled_height, strides[0], height, kernel_shape, pads, pool_context_, pool_attrs_};
    ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_1d.Cost(), avg_pool_task_1d);
  } else if (pool_rank == 2) {
    QLinearPool2DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_2d = {
        x_data_fp32.data(), output_u8, y_scale, y_zero_point, x_step, y_step,
        pooled_height, pooled_width, strides[0], strides[1], height, width,
        kernel_shape, pads, pool_context_, pool_attrs_};
    ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_2d.Cost(), avg_pool_task_2d);
  } else if (pool_rank == 3) {
    QLinearPool3DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_3d = {
        x_data_fp32.data(), output_u8, y_scale, y_zero_point, x_step, y_step,
        pooled_height, pooled_width, pooled_depth, strides[0], strides[1], strides[2],
        height, width, depth, kernel_shape, pads, pool_context_, pool_attrs_};
    ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_3d.Cost(), avg_pool_task_3d);
  } else {
    return onnxruntime::common::Status(
        onnxruntime::common::ONNXRUNTIME,
        onnxruntime::common::INVALID_ARGUMENT,
        "QLinear Pooling unsupported pooling size!");
  }

  return Status::OK();
}
|
||||
|
||||
// Registers the QLinearAveragePool class as the CPU execution provider kernel for the
// "QLinearAveragePool" op in kMSDomain. The KernelDefBuilder carries no type constraints;
// Compute() itself rejects any input whose element type is not uint8.
ONNX_OPERATOR_KERNEL_EX(QLinearAveragePool, kMSDomain, 1, kCpuExecutionProvider, KernelDefBuilder(), QLinearAveragePool);
|
||||
|
||||
} // namespace contrib
|
||||
|
||||
} // namespace onnxruntime
|
||||
27
onnxruntime/contrib_ops/cpu/qlinear_pool.h
Normal file
27
onnxruntime/contrib_ops/cpu/qlinear_pool.h
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "core/common/common.h"
|
||||
#include "core/framework/op_kernel.h"
|
||||
#include "core/providers/cpu/nn/pool_base.h"
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace contrib {
|
||||
|
||||
class QLinearAveragePool final : public OpKernel, public PoolBase {
|
||||
public:
|
||||
QLinearAveragePool(const OpKernelInfo& info) : OpKernel(info), PoolBase(info) { }
|
||||
|
||||
~QLinearAveragePool() override = default;
|
||||
|
||||
Status Compute(OpKernelContext* context) const override;
|
||||
|
||||
private:
|
||||
PoolProcessContext pool_context_;
|
||||
|
||||
};
|
||||
|
||||
} // namespace contrib
|
||||
} // namespace onnxruntime
|
||||
|
|
@ -106,7 +106,8 @@ class PoolBase {
|
|||
|
||||
protected:
|
||||
PoolBase(const OpKernelInfo& info)
|
||||
: op_name_(info.GetKernelDef().OpName()),
|
||||
: op_name_(info.GetKernelDef().OpName().rfind("QLinear", 0) != 0 ?
|
||||
info.GetKernelDef().OpName() : info.GetKernelDef().OpName().substr(7)),
|
||||
pool_attrs_(info, op_name_, GetStartVersion(info)) {
|
||||
}
|
||||
|
||||
|
|
|
|||
247
onnxruntime/test/contrib_ops/qlinear_pool_test.cc
Normal file
247
onnxruntime/test/contrib_ops/qlinear_pool_test.cc
Normal file
|
|
@ -0,0 +1,247 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "test/common/tensor_op_test_utils.h"
|
||||
#include "test/providers/provider_test_utils.h"
|
||||
#include "core/providers/common.h"
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace test {
|
||||
|
||||
// Reference dequantization for the tests: x_scale * (x - x_zero_point).
static inline float dequantize_u8(uint8_t x, float x_scale, uint8_t x_zero_point) {
  // Subtract in int so values below the zero point become negative instead of wrapping.
  const int offset_from_zero = static_cast<int>(x) - static_cast<int>(x_zero_point);
  return x_scale * offset_from_zero;
}
|
||||
|
||||
// Reference quantization for the tests: round(y / y_scale + y_zero_point), saturated to [0, 255].
static inline uint8_t quantize_u8(float y, float y_scale, uint8_t y_zero_point) {
  // nearbyintf rounds according to the current floating-point rounding mode.
  const float rounded = std::nearbyintf(y / y_scale + y_zero_point);
  const float capped = std::min(rounded, 255.0f);
  const float saturated = std::max(0.0f, capped);
  return static_cast<uint8_t>(saturated);
}
|
||||
|
||||
// Walks an N-dimensional index space in row-major order (last dimension varies fastest).
//
// pos_ holds the coordinates of the current element and index_ its flat offset;
// size_ is the product of all dimensions (1 for an empty dims list).
struct DimIterator {
  explicit DimIterator(const std::vector<int64_t>& dims) : dims_(dims) {
    size_ = std::accumulate(dims_.begin(), dims_.end(), static_cast<int64_t>(1), std::multiplies<int64_t>());
    restart();
  }

  // Rewinds to the first element. assign() (rather than resize()) is required so that
  // coordinates left over from a partially completed walk are actually zeroed.
  void restart() {
    pos_.assign(dims_.size(), static_cast<int64_t>(0));
    index_ = 0;
  }

  // True while the current position refers to a valid element.
  bool has_next() const { return index_ < size_; }

  // Returns the flat index of the current element and advances pos_/index_ to the
  // following element. Returns -1 once every element has been visited.
  int64_t next() {
    if (!has_next()) {
      return -1;
    }
    // Odometer-style increment: bump the last coordinate and carry into earlier ones on wrap.
    for (size_t i = dims_.size(); i > 0;) {
      --i;
      ++pos_[i];
      if (pos_[i] < dims_[i]) {
        break;
      }
      pos_[i] = 0;
    }
    return index_++;
  }

  const std::vector<int64_t> dims_;
  std::vector<int64_t> pos_;
  int64_t size_;
  int64_t index_;
};
|
||||
|
||||
static void
|
||||
CalculateAvgPoolNchwU8(
|
||||
uint8_t* x,
|
||||
const std::vector<int64_t> x_dims,
|
||||
float x_scale,
|
||||
int x_zero_point,
|
||||
uint8_t* y,
|
||||
const std::vector<int64_t> y_dims,
|
||||
float y_scale,
|
||||
int y_zero_point,
|
||||
const std::vector<int64_t> kernel_shape,
|
||||
const std::vector<int64_t> strides,
|
||||
const std::vector<int64_t> pads,
|
||||
const int64_t count_include_pad) {
|
||||
int64_t batch = y_dims[0];
|
||||
int64_t channel = y_dims[1];
|
||||
|
||||
std::vector<int64_t> y_img_dims(y_dims.begin() + 2, y_dims.end());
|
||||
std::vector<int64_t> x_img_dims(x_dims.begin() + 2, x_dims.end());
|
||||
std::vector<int64_t> x_img_strides(x_img_dims.size(), 1LL);
|
||||
for (size_t i = x_img_dims.size() - 1; i > 0;) {
|
||||
i--;
|
||||
x_img_strides[i] = x_img_strides[i + 1] * x_img_dims[i + 1];
|
||||
}
|
||||
|
||||
int64_t y_step = std::accumulate(y_img_dims.begin(), y_img_dims.end(), 1LL, std::multiplies<int64_t>());
|
||||
int64_t x_step = std::accumulate(x_img_dims.begin(), x_img_dims.end(), 1LL, std::multiplies<int64_t>());
|
||||
for (int64_t b = 0; b < batch; ++b) {
|
||||
for (int64_t c = 0; c < channel; ++c) {
|
||||
uint8_t* ybc = y + (b * channel + c) * y_step;
|
||||
uint8_t* xbc = x + (b * channel + c) * x_step;
|
||||
|
||||
DimIterator yit(y_img_dims);
|
||||
while (yit.has_next()) {
|
||||
std::vector<int64_t> kernel_topleft(y_img_dims.size(), 0);
|
||||
for (size_t i = 0; i < y_img_dims.size(); ++i) {
|
||||
kernel_topleft[i] = yit.pos_[i] * strides[i];
|
||||
}
|
||||
|
||||
float y_value_sum = 0.0f;
|
||||
int count = 0;
|
||||
for (DimIterator kit(kernel_shape); kit.has_next(); kit.next()) {
|
||||
int64_t kernel_offset = 0;
|
||||
for (size_t i = 0; kernel_offset >= 0 && i < kernel_shape.size(); ++i) {
|
||||
int64_t x_real_dim = kernel_topleft[i] + kit.pos_[i] - pads[i];
|
||||
if (x_real_dim >= 0 && x_real_dim < x_img_dims[i]) {
|
||||
kernel_offset += x_real_dim * x_img_strides[i];
|
||||
} else {
|
||||
kernel_offset = -1LL; // padding element
|
||||
}
|
||||
}
|
||||
if (kernel_offset >= 0) {
|
||||
y_value_sum += dequantize_u8(xbc[kernel_offset], x_scale, static_cast<uint8_t>(x_zero_point));
|
||||
++count;
|
||||
} else {
|
||||
count += count_include_pad ? 1 : 0;
|
||||
}
|
||||
}
|
||||
auto y_offset = yit.next();
|
||||
auto y_u8 = quantize_u8(y_value_sum / count, y_scale, static_cast<uint8_t>(y_zero_point));
|
||||
ybc[y_offset] = y_u8;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void RunQLinearAveragePoolNchwU8(
|
||||
const std::vector<int64_t> x_dims,
|
||||
const std::vector<int64_t> y_dims,
|
||||
const std::vector<int64_t> kernel_shape,
|
||||
const std::vector<int64_t> strides,
|
||||
const std::vector<int64_t> pads,
|
||||
const int64_t count_include_pad = 0) {
|
||||
float x_scale = 1.0f / 255.0f;
|
||||
uint8_t x_zero_point = 128;
|
||||
RandomValueGenerator random{};
|
||||
std::vector<float> x_data_fp32 = random.Uniform<float>(x_dims, -0.5f, 0.5f);
|
||||
std::vector<uint8_t> x_data(x_data_fp32.size());
|
||||
for (size_t i = 0; i < x_data.size(); ++i) {
|
||||
x_data[i] = quantize_u8(x_data_fp32[i], x_scale, x_zero_point);
|
||||
}
|
||||
|
||||
float y_scale = 1.0f / 255.0f;
|
||||
uint8_t y_zero_point = 100;
|
||||
int64_t y_size = std::accumulate(y_dims.begin(), y_dims.end(), 1LL, std::multiplies<int64_t>());
|
||||
std::vector<uint8_t> y_data(y_size);
|
||||
CalculateAvgPoolNchwU8(
|
||||
x_data.data(), x_dims, x_scale, x_zero_point,
|
||||
y_data.data(), y_dims, y_scale, y_zero_point,
|
||||
kernel_shape, strides, pads, count_include_pad);
|
||||
|
||||
OpTester test("QLinearAveragePool", 1, onnxruntime::kMSDomain);
|
||||
|
||||
test.AddAttribute("auto_pad", "");
|
||||
test.AddAttribute("strides", strides);
|
||||
test.AddAttribute("pads", pads);
|
||||
test.AddAttribute("kernel_shape", kernel_shape);
|
||||
test.AddAttribute("count_include_pad", count_include_pad);
|
||||
|
||||
test.AddInput<uint8_t>("X", x_dims, x_data);
|
||||
test.AddInput<float>("x_scale", {}, {x_scale});
|
||||
test.AddInput<uint8_t>("x_zero_point", {}, {x_zero_point});
|
||||
test.AddInput<float>("y_scale", {}, {y_scale});
|
||||
test.AddInput<uint8_t>("y_zero_point", {}, {y_zero_point});
|
||||
test.AddOutput<uint8_t>("Y", y_dims, y_data);
|
||||
|
||||
auto q8checker = [&](const std::vector<OrtValue>& fetches, const std::string& provider_type) {
|
||||
const OrtValue& ort_value = fetches[0];
|
||||
if (ort_value.Fence()) {
|
||||
ort_value.Fence()->BeforeUsingAsInput(onnxruntime::kCpuExecutionProvider, 0);
|
||||
}
|
||||
|
||||
auto y_shape = TensorShape(y_dims);
|
||||
const Tensor& output_tensor = ort_value.Get<Tensor>();
|
||||
ORT_ENFORCE(y_shape == output_tensor.Shape(),
|
||||
"Expected output shape [" + y_shape.ToString() + "] did not match run output shape [" +
|
||||
output_tensor.Shape().ToString() + "] for Y @" + provider_type);
|
||||
auto* output = output_tensor.Data<uint8_t>();
|
||||
auto size = static_cast<int>(output_tensor.Shape().Size());
|
||||
for (int i = 0; i < size; ++i) {
|
||||
int diff = abs(y_data[i] - output[i]);
|
||||
EXPECT_LE(diff, 1) << "i:" << i << " expected:" << y_data[i] << " " << (int)y_data[i]
|
||||
<< ", got:" << output[i] << " " << (int)output[i] << ", provider_type: " << provider_type;
|
||||
}
|
||||
};
|
||||
test.SetCustomOutputVerifier(q8checker);
|
||||
|
||||
test.Run();
|
||||
}
|
||||
|
||||
// 1-D pooling with padding; count_include_pad = 0.
TEST(QLinearPoolTest, AveragePool1D_ExcludePadPixel) {
  const std::vector<int64_t> x_shape{1, 1, 5};
  const std::vector<int64_t> expected_y_shape{1, 1, 6};
  const std::vector<int64_t> kernel_shape{3};
  const std::vector<int64_t> strides{1};
  const std::vector<int64_t> pads{1, 2};
  const int64_t count_include_pad = 0;

  RunQLinearAveragePoolNchwU8(x_shape, expected_y_shape, kernel_shape, strides, pads, count_include_pad);
}
|
||||
|
||||
// 1-D pooling with padding; count_include_pad = 1.
TEST(QLinearPoolTest, AveragePool1D_IncludePadPixel) {
  const std::vector<int64_t> x_shape{1, 1, 5};
  const std::vector<int64_t> expected_y_shape{1, 1, 6};
  const std::vector<int64_t> kernel_shape{3};
  const std::vector<int64_t> strides{1};
  const std::vector<int64_t> pads{1, 2};
  const int64_t count_include_pad = 1;

  RunQLinearAveragePoolNchwU8(x_shape, expected_y_shape, kernel_shape, strides, pads, count_include_pad);
}
|
||||
|
||||
// 2-D pooling with padding and non-unit stride; count_include_pad = 0.
TEST(QLinearPoolTest, AveragePool2D_ExcludePadPixel) {
  const std::vector<int64_t> x_shape{1, 1, 5, 7};
  const std::vector<int64_t> expected_y_shape{1, 1, 6, 4};
  const std::vector<int64_t> kernel_shape{3, 4};
  const std::vector<int64_t> strides{1, 2};
  const std::vector<int64_t> pads{1, 3, 2, 1};
  const int64_t count_include_pad = 0;

  RunQLinearAveragePoolNchwU8(x_shape, expected_y_shape, kernel_shape, strides, pads, count_include_pad);
}
|
||||
|
||||
// 2-D pooling with padding and non-unit stride; count_include_pad = 1.
TEST(QLinearPoolTest, AveragePool2D_IncludePadPixel) {
  const std::vector<int64_t> x_shape{1, 1, 5, 7};
  const std::vector<int64_t> expected_y_shape{1, 1, 6, 4};
  const std::vector<int64_t> kernel_shape{3, 4};
  const std::vector<int64_t> strides{1, 2};
  const std::vector<int64_t> pads{1, 3, 2, 1};
  const int64_t count_include_pad = 1;

  RunQLinearAveragePoolNchwU8(x_shape, expected_y_shape, kernel_shape, strides, pads, count_include_pad);
}
|
||||
|
||||
// 3-D pooling with padding and non-unit strides; count_include_pad = 0.
TEST(QLinearPoolTest, AveragePool3D_ExcludePadPixel) {
  const std::vector<int64_t> x_shape{1, 1, 5, 7, 9};
  const std::vector<int64_t> expected_y_shape{1, 1, 6, 4, 3};
  const std::vector<int64_t> kernel_shape{3, 4, 5};
  const std::vector<int64_t> strides{1, 2, 3};
  const std::vector<int64_t> pads{1, 3, 2, 2, 1, 2};
  const int64_t count_include_pad = 0;

  RunQLinearAveragePoolNchwU8(x_shape, expected_y_shape, kernel_shape, strides, pads, count_include_pad);
}
|
||||
|
||||
// 3-D pooling with padding and non-unit strides; count_include_pad = 1.
TEST(QLinearPoolTest, AveragePool3D_IncludePadPixel) {
  const std::vector<int64_t> x_shape{1, 1, 5, 7, 9};
  const std::vector<int64_t> expected_y_shape{1, 1, 6, 4, 3};
  const std::vector<int64_t> kernel_shape{3, 4, 5};
  const std::vector<int64_t> strides{1, 2, 3};
  const std::vector<int64_t> pads{1, 3, 2, 2, 1, 2};
  const int64_t count_include_pad = 1;

  RunQLinearAveragePoolNchwU8(x_shape, expected_y_shape, kernel_shape, strides, pads, count_include_pad);
}
|
||||
|
||||
} // namespace test
|
||||
} // namespace onnxruntime
|
||||
Loading…
Reference in a new issue