mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-23 02:38:28 +00:00
[NNAPI EP] Enable per-channel quantization for QlinearConv (#6155)
* Enable qlinearconv per-channel quantization * Fix the android CI test failure * Add Android Version Check for Per-Channel Quant * Address PR comments * Fix some minor issues * Add verification of per-channel zero points * Make the error tolerance configurable
This commit is contained in:
parent
39aedbc97f
commit
bbb52e9274
10 changed files with 410 additions and 79 deletions
|
|
@ -7,6 +7,7 @@
|
|||
|
||||
#include <core/common/safeint.h>
|
||||
#include <core/common/logging/logging.h>
|
||||
#include <core/framework/tensorprotoutils.h>
|
||||
#include <core/graph/graph.h>
|
||||
#include <core/graph/graph_viewer.h>
|
||||
#include <core/providers/common.h>
|
||||
|
|
@ -64,6 +65,32 @@ QLinearOpType GetQLinearOpType(const onnxruntime::Node& node) {
|
|||
return QLinearOpType::Unknown;
|
||||
}
|
||||
|
||||
// Classifies a Conv / QLinearConv node into the convolution flavor used when lowering.
// ONNX has a single conv op while NNAPI has three. With input laid out as (N, C, H, W):
//   group == 1                                         --> ConvType::Regular
//   group != 1 and weight is (M, 1, kH, kW)            --> ConvType::Depthwise
//   group != 1 and weight is (M, C/group, kH, kW)      --> ConvType::Grouped
//
// node:         must have op type "Conv" or "QLinearConv" (enforced).
// initializers: must contain the weight input (input 1 for Conv, input 3 for QLinearConv);
//               the lookup uses at(), so a missing weight is not tolerated.
ConvType GetConvType(const onnxruntime::Node& node, const InitializedTensorSet& initializers) {
  const auto& op_type = node.OpType();
  const bool is_qlinear_conv = (op_type == "QLinearConv");
  ORT_ENFORCE(op_type == "Conv" || is_qlinear_conv);

  NodeAttrHelper attrs(node);
  const auto group = attrs.Get("group", 1);

  // QLinearConv interleaves scale/zero-point inputs, which pushes the weight to slot 3
  const size_t weight_input_idx = is_qlinear_conv ? 3 : 1;
  const auto& weight_name = node.InputDefs()[weight_input_idx]->Name();
  const auto& weight_tensor = *initializers.at(weight_name);

  if (group == 1) {
    return ConvType::Regular;
  }

  // Second weight dimension is the per-group input channel count
  const bool one_input_channel_per_filter = (weight_tensor.dims()[1] == 1);
  return one_input_channel_per_filter ? ConvType::Depthwise : ConvType::Grouped;
}
|
||||
|
||||
bool IsQLinearBinaryOp(QLinearOpType qlinear_op_type) {
|
||||
return qlinear_op_type == QLinearOpType::QLinearConv ||
|
||||
qlinear_op_type == QLinearOpType::QLinearMatMul ||
|
||||
|
|
@ -71,8 +98,9 @@ bool IsQLinearBinaryOp(QLinearOpType qlinear_op_type) {
|
|||
}
|
||||
|
||||
bool HasValidBinaryOpQuantizedInputs(const Node& node) {
|
||||
auto op_type = GetQLinearOpType(node);
|
||||
int32_t a_input_type, b_input_type;
|
||||
if (!IsQLinearBinaryOp(GetQLinearOpType(node))) {
|
||||
if (!IsQLinearBinaryOp(op_type)) {
|
||||
LOGS_DEFAULT(VERBOSE) << "[" << node.OpType() << "] is not a binary qlinear op";
|
||||
return false;
|
||||
}
|
||||
|
|
@ -83,7 +111,16 @@ bool HasValidBinaryOpQuantizedInputs(const Node& node) {
|
|||
if (!GetType(*input_defs[3], b_input_type))
|
||||
return false;
|
||||
|
||||
if (a_input_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8 || a_input_type != b_input_type) {
|
||||
// QlinearConv supports u8u8 or u8s8
|
||||
// QLinearMatMul/Add only support u8u8
|
||||
bool is_qlinear_conv = op_type == QLinearOpType::QLinearConv;
|
||||
bool has_valid_qlinear_conv_weight =
|
||||
(b_input_type == ONNX_NAMESPACE::TensorProto_DataType_UINT8 ||
|
||||
b_input_type == ONNX_NAMESPACE::TensorProto_DataType_INT8);
|
||||
|
||||
if (a_input_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8 ||
|
||||
(!is_qlinear_conv && a_input_type != b_input_type) ||
|
||||
(is_qlinear_conv && !has_valid_qlinear_conv_weight)) {
|
||||
LOGS_DEFAULT(VERBOSE) << "[" << node.OpType()
|
||||
<< "] A Input type: [" << a_input_type
|
||||
<< "] B Input type: [" << b_input_type
|
||||
|
|
@ -95,8 +132,9 @@ bool HasValidBinaryOpQuantizedInputs(const Node& node) {
|
|||
}
|
||||
|
||||
bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const Node& node,
|
||||
const std::vector<size_t>& indices) {
|
||||
const auto& op = node.OpType();
|
||||
const std::vector<size_t>& indices, const OpSupportCheckParams& params) {
|
||||
const auto& op_type = node.OpType();
|
||||
bool is_qlinear_conv = (op_type == "QLinearConv");
|
||||
const auto input_defs(node.InputDefs());
|
||||
for (const auto idx : indices) {
|
||||
if (idx >= input_defs.size()) {
|
||||
|
|
@ -106,13 +144,42 @@ bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const
|
|||
}
|
||||
const auto scale_name = input_defs[idx]->Name();
|
||||
if (Contains(initializers, scale_name)) {
|
||||
const auto& tensor = *initializers.at(scale_name);
|
||||
if (!tensor.dims().empty() && tensor.dims()[0] != 1) {
|
||||
LOGS_DEFAULT(VERBOSE) << op << " does not support per-channel quantization";
|
||||
return false;
|
||||
const auto& scale_tensor = *initializers.at(scale_name);
|
||||
int64_t scales_dim = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0];
|
||||
bool is_conv_weight = is_qlinear_conv && idx == 4;
|
||||
bool is_conv_u8s8_weight = false;
|
||||
|
||||
if (is_conv_weight) {
|
||||
const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
|
||||
is_conv_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8;
|
||||
}
|
||||
|
||||
// We need to check the per-channel quantization scales dimensions for u8s8 QlinearConv
|
||||
// We only support per-channel quantization for u8s8
|
||||
// For all other cases, the scales should be a scalar
|
||||
if (is_conv_u8s8_weight) {
|
||||
if (params.android_sdk_ver < 29) {
|
||||
LOGS_DEFAULT(VERBOSE) << op_type << " only supports per-channel quantization on Android API 29+, "
|
||||
<< "system API level: " << params.android_sdk_ver;
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
|
||||
if (weight_tensor.dims()[0] != scales_dim) {
|
||||
LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight,"
|
||||
<< " weight dimension[0] " << weight_tensor.dims()[0]
|
||||
<< " scale dimension " << scales_dim;
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
if (scales_dim != 1) {
|
||||
LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, "
|
||||
<< " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+";
|
||||
return false;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
LOGS_DEFAULT(VERBOSE) << "The scale of " << op << " must be known";
|
||||
LOGS_DEFAULT(VERBOSE) << "The scale of " << op_type << " must be known";
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
@ -122,7 +189,8 @@ bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const
|
|||
|
||||
bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, const Node& node,
|
||||
const std::vector<size_t>& indices) {
|
||||
const auto& op = node.OpType();
|
||||
const auto& op_type = node.OpType();
|
||||
bool is_qlinear_conv = (op_type == "QLinearConv");
|
||||
const auto input_defs(node.InputDefs());
|
||||
for (const auto idx : indices) {
|
||||
if (idx >= input_defs.size()) {
|
||||
|
|
@ -130,20 +198,63 @@ bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, co
|
|||
<< " >= input number, " << input_defs.size();
|
||||
return false;
|
||||
}
|
||||
const auto zero_point_name = node.InputDefs()[idx]->Name();
|
||||
|
||||
const auto zero_point_name = input_defs[idx]->Name();
|
||||
if (Contains(initializers, zero_point_name)) {
|
||||
const auto& tensor = *initializers.at(zero_point_name);
|
||||
if (!tensor.dims().empty() && tensor.dims()[0] != 1) {
|
||||
LOGS_DEFAULT(VERBOSE) << op << " does not support per-channel quantization";
|
||||
return false;
|
||||
bool is_conv_weight = is_qlinear_conv && idx == 5;
|
||||
bool is_conv_u8s8_weight = false;
|
||||
if (is_conv_weight) {
|
||||
const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
|
||||
is_conv_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8;
|
||||
}
|
||||
if (tensor.data_type() != ONNX_NAMESPACE::TensorProto_DataType_UINT8) {
|
||||
LOGS_DEFAULT(VERBOSE) << op << " does not support zero point data type "
|
||||
<< std::to_string(tensor.data_type());
|
||||
return false;
|
||||
|
||||
const auto& zero_tensor = *initializers.at(zero_point_name);
|
||||
int64_t zero_dim = zero_tensor.dims().empty() ? 1 : zero_tensor.dims()[0];
|
||||
if (is_conv_u8s8_weight) {
|
||||
if (zero_tensor.data_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8) {
|
||||
LOGS_DEFAULT(VERBOSE) << "u8s8 QlinearConv only supports int8 zero point for weight, "
|
||||
<< "actual zero point type: [" << zero_tensor.data_type() << "]";
|
||||
return false;
|
||||
}
|
||||
|
||||
// For onnx, u8s8 QlinearConv, the weight zero point can be a scalar,
|
||||
// or a tensor with the same channel count as the weight; for NNAPI we only support it being
|
||||
// 0 (scalar) or all 0 (tensor), NNAPI will assume the zero point for per-channel
|
||||
// quantization is 0, since there is no input for it
|
||||
const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
|
||||
if (weight_tensor.dims()[0] != zero_dim && zero_dim != 1) {
|
||||
LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight,"
|
||||
<< " weight dimension[0] " << weight_tensor.dims()[0]
|
||||
<< " zero point dimension " << zero_dim;
|
||||
return false;
|
||||
}
|
||||
|
||||
std::unique_ptr<uint8_t[]> unpacked_tensor;
|
||||
size_t tensor_byte_size;
|
||||
auto status = onnxruntime::utils::UnpackInitializerData(zero_tensor, unpacked_tensor, tensor_byte_size);
|
||||
if (!status.IsOK()) {
|
||||
LOGS_DEFAULT(ERROR) << "QLinearConv erro when unpack zero tensor:" << status.ErrorMessage();
|
||||
return false;
|
||||
}
|
||||
|
||||
// Verify all onnx weight zero point(s) are 0(s)
|
||||
const int8_t* zero_points = reinterpret_cast<const int8_t*>(unpacked_tensor.get());
|
||||
for (size_t i = 0; i < tensor_byte_size; i++) {
|
||||
if (zero_points[i] != 0) {
|
||||
LOGS_DEFAULT(VERBOSE) << "QLinearConv only support 0 as zero point, "
|
||||
<< "zero_points[" << i << "] has value: " << zero_points[i];
|
||||
return false;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (zero_dim != 1) {
|
||||
LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, "
|
||||
<< " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+";
|
||||
return false;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
LOGS_DEFAULT(VERBOSE) << "The zero point of " << op << " must be known";
|
||||
LOGS_DEFAULT(VERBOSE) << "The zero point of " << op_type << " must be known";
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -87,8 +87,18 @@ enum class QLinearOpType : uint8_t {
|
|||
// QLinearReduceMean,
|
||||
};
|
||||
|
||||
// Convolution flavor a Conv/QLinearConv node maps to (see GetConvType).
enum class ConvType : uint8_t {
  Regular = 0,    // group == 1
  Depthwise = 1,  // group != 1 and weight is (M, 1, kH, kW)
  Grouped = 2,    // group != 1 and weight is (M, C/group, kH, kW)
};
|
||||
|
||||
QLinearOpType GetQLinearOpType(const onnxruntime::Node& node);
|
||||
|
||||
// Return the type of the conv ops,
|
||||
// This function assumes the input is a 2d conv node
|
||||
ConvType GetConvType(const onnxruntime::Node& node, const InitializedTensorSet& initializers);
|
||||
|
||||
// This qlinear op is an operator takes 2 input and produces 1 output
|
||||
// Such as QLinearConv, QLinearMatMul, QLinearAdd, ...
|
||||
bool IsQLinearBinaryOp(QLinearOpType qlinear_op_type);
|
||||
|
|
@ -97,7 +107,7 @@ bool IsQLinearBinaryOp(QLinearOpType qlinear_op_type);
|
|||
bool HasValidBinaryOpQuantizedInputs(const Node& node);
|
||||
// Check if a qlinear op has valid scales for given indices
|
||||
bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const Node& node,
|
||||
const std::vector<size_t>& indices);
|
||||
const std::vector<size_t>& indices, const OpSupportCheckParams& params);
|
||||
// Check if a qlinear op has valid zero points for given indices
|
||||
bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, const Node& node,
|
||||
const std::vector<size_t>& indices);
|
||||
|
|
|
|||
|
|
@ -26,7 +26,7 @@ int32_t ModelBuilder::GetAndroidSdkVer() const {
|
|||
// Scalar operand is copied into the model, no need to persist
|
||||
#define DEFINE_ADD_OPERAND_FROM_SCALAR(scalar_type, op_type) \
|
||||
Status ModelBuilder::AddOperandFromScalar(scalar_type value, uint32_t& index) { \
|
||||
OperandType operandType(Type::op_type); \
|
||||
OperandType operandType(Type::op_type, vector<uint32_t>{}); \
|
||||
ORT_RETURN_IF_ERROR(AddNewNNAPIOperand(operandType, index)); \
|
||||
RETURN_STATUS_ON_ERROR_WITH_NOTE( \
|
||||
nnapi_->ANeuralNetworksModel_setOperandValue( \
|
||||
|
|
@ -377,6 +377,18 @@ Status ModelBuilder::AddNewNNAPIOperand(const OperandType& operand_type, uint32_
|
|||
RETURN_STATUS_ON_ERROR(
|
||||
nnapi_->ANeuralNetworksModel_addOperand(nnapi_model_->model_, &operand_type.operandType));
|
||||
index = next_index_++;
|
||||
|
||||
if (operand_type.channelQuant) {
|
||||
if (GetAndroidSdkVer() < 29) {
|
||||
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
|
||||
"Per-channel quantization is only supported on Android API level 29+,",
|
||||
" system API level: ", GetAndroidSdkVer());
|
||||
}
|
||||
|
||||
RETURN_STATUS_ON_ERROR(nnapi_->ANeuralNetworksModel_setOperandSymmPerChannelQuantParams(
|
||||
nnapi_model_->model_, index, &operand_type.channelQuant->params));
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -458,6 +458,7 @@ static Status HandleAutoPad(const Shape& input_shape,
|
|||
vector<int32_t>& onnx_pads,
|
||||
int32_t& nnapi_padding_code,
|
||||
bool& use_auto_pad) {
|
||||
use_auto_pad = false;
|
||||
if (auto_pad_type != AutoPadType::NOTSET) {
|
||||
ORT_RETURN_IF_ERROR(ComputeConvPads(input_shape, weight_size_y, weight_size_x,
|
||||
onnx_pads, onnx_strides, onnx_dilations,
|
||||
|
|
@ -524,6 +525,47 @@ static Status GetBinaryOpQuantizationScaleAndZeroPoint(
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
static Status GetConvOpQuantizationScaleAndZeroPoint(
    const ModelBuilder& model_builder, const Node& node,
    float& a_scale, float& w_scale, float& y_scale,
    int32_t& a_zero_point, int32_t& w_zero_point, int32_t& y_zero_point,
    optional<vector<float>>& w_scales) ORT_MUST_USE_RESULT;

// Reads the quantization parameters of a QLinearConv node.
//
// The scalar scales / zero points for input (a), weight (w) and output (y) are first
// fetched through GetBinaryOpQuantizationScaleAndZeroPoint.
//  - uint8 weight (u8u8): those scalar values are final and w_scales is left untouched.
//  - otherwise (u8s8): w_scale and w_zero_point are reset to 0 because they are not used,
//    and the 1d weight scale array (input 4) is copied into w_scales for per-channel
//    quantization. A scale tensor with empty dims is treated as a single scale.
//
// Returns the error from the scalar lookup, otherwise Status::OK().
static Status GetConvOpQuantizationScaleAndZeroPoint(
    const ModelBuilder& model_builder, const Node& node,
    float& a_scale, float& w_scale, float& y_scale,
    int32_t& a_zero_point, int32_t& w_zero_point, int32_t& y_zero_point,
    optional<vector<float>>& w_scales) {
  // Scalar scale and zero points first; per-channel weight data is handled below
  ORT_RETURN_IF_ERROR(
      GetBinaryOpQuantizationScaleAndZeroPoint(model_builder, node,
                                               a_scale, w_scale, y_scale,
                                               a_zero_point, w_zero_point, y_zero_point));

  const auto input_defs = node.InputDefs();
  const auto& initializers(model_builder.GetInitializerTensors());
  const auto& weight_tensor = *initializers.at(input_defs[3]->Name());

  const bool is_u8u8 = (weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_UINT8);
  if (!is_u8u8) {
    // u8s8 QLinearConv: the weight zero point is always 0 and the scalar w_scale is
    // not used, so both are reset to 0
    w_scale = 0.0f;
    w_zero_point = 0;

    // Copy the 1d scales array used for per-channel quantization
    const auto& scale_tensor = *initializers.at(input_defs[4]->Name());
    const auto* scale_data = GetTensorFloatData(scale_tensor);
    const size_t scale_count = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0];
    vector<float> per_channel_scales(scale_data, scale_data + scale_count);
    w_scales = onnxruntime::make_optional(std::move(per_channel_scales));
  }

  return Status::OK();
}
|
||||
|
||||
// NNAPI has the quantization scale and zero point embedded in the ANeuralNetworksOperandType
|
||||
// ONNX has the quantization scale and zero point as the inputs of the qlinear operators
|
||||
// We want to verify the scale and zeropoint of the ONNX inputs matches the values embedded in the NNAPI inputs
|
||||
|
|
@ -553,6 +595,35 @@ static Status IsValidInputQuantizedType(const ModelBuilder& model_builder,
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
static Status IsValidConvWeightQuantizedType(const ModelBuilder& model_builder,
                                             const std::string& input_name,
                                             float scale,
                                             int32_t zero_point,
                                             const optional<vector<float>>& scales) ORT_MUST_USE_RESULT;

// Verifies that the quantization parameters of a conv weight taken from the ONNX node
// agree with the ones recorded on the already-registered NNAPI operand `input_name`.
//
// scale / zero_point: checked through IsValidInputQuantizedType in every case.
// scales:             when set, the operand must also carry channelQuant whose scales
//                     vector compares equal to it.
//
// Returns INVALID_ARGUMENT when channelQuant is absent or its scales differ.
static Status IsValidConvWeightQuantizedType(const ModelBuilder& model_builder,
                                             const std::string& input_name,
                                             float scale,
                                             int32_t zero_point,
                                             const optional<vector<float>>& scales) {
  // The scalar check applies whether or not the weight is per-channel quantized
  ORT_RETURN_IF_ERROR(IsValidInputQuantizedType(model_builder, input_name, scale, zero_point));

  // Nothing more to verify without per-channel scales
  if (!scales.has_value()) {
    return Status::OK();
  }

  const OperandType& weight_operand_type = model_builder.GetOperandTypes().at(input_name);
  const auto& nnapi_channel_quant = weight_operand_type.channelQuant;

  if (!nnapi_channel_quant.has_value()) {
    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
                           "Input [", input_name, "] has no channelQuant");
  }

  if (nnapi_channel_quant.value().scales != scales.value()) {
    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
                           "Input [", input_name, "] has mismatch scales between onnx and NNAPI");
  }

  return Status::OK();
}
|
||||
|
||||
static void AddBinaryOpQuantizationScaleAndZeroPointToSkip(ModelBuilder& model_builder, const Node& node) {
|
||||
const auto input_defs(node.InputDefs());
|
||||
model_builder.AddInitializerToSkip(input_defs[1]->Name()); // a_scale
|
||||
|
|
@ -1253,6 +1324,13 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
|
|||
}
|
||||
}
|
||||
|
||||
const auto& weight = input_defs[w_idx]->Name();
|
||||
const auto& weight_tensor = *initializers.at(weight);
|
||||
auto conv_type = GetConvType(node, model_builder.GetGraphViewer().GetAllInitializedTensors());
|
||||
bool conv_2d = (conv_type == ConvType::Regular),
|
||||
depthwise_conv_2d = (conv_type == ConvType::Depthwise),
|
||||
grouped_conv_2d = (conv_type == ConvType::Grouped);
|
||||
|
||||
float x_scale = 0.0f,
|
||||
w_scale = 0.0f,
|
||||
y_scale = 0.0f;
|
||||
|
|
@ -1260,31 +1338,16 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
|
|||
w_zero_point = 0,
|
||||
y_zero_point = 0;
|
||||
|
||||
// this is for per-channel quantization weights
|
||||
optional<vector<float>> w_scales;
|
||||
|
||||
if (is_qlinear_conv) {
|
||||
ORT_RETURN_IF_ERROR(GetBinaryOpQuantizationScaleAndZeroPoint(model_builder, node,
|
||||
x_scale, w_scale, y_scale,
|
||||
x_zero_point, w_zero_point, y_zero_point));
|
||||
ORT_RETURN_IF_ERROR(GetConvOpQuantizationScaleAndZeroPoint(model_builder, node,
|
||||
x_scale, w_scale, y_scale,
|
||||
x_zero_point, w_zero_point, y_zero_point,
|
||||
w_scales));
|
||||
}
|
||||
|
||||
const auto& weight = input_defs[w_idx]->Name();
|
||||
const auto& weight_tensor = *initializers.at(weight);
|
||||
bool conv_2d = false,
|
||||
depthwise_conv_2d = false,
|
||||
grouped_conv_2d = false;
|
||||
|
||||
// For ONNX we only have 1 conv ops
|
||||
// For NNAPI we have 3
|
||||
// Input is (N, C, H, W)
|
||||
// group == 1, --> regular conv
|
||||
// group != 1 && weight is (M, 1, kH, kW), --> depthwise conv
|
||||
// group != 1 && weight is (M, C/group, kH, kW), --> grouped conv
|
||||
if (group == 1)
|
||||
conv_2d = true;
|
||||
else if ((weight_tensor.dims()[1] == 1))
|
||||
depthwise_conv_2d = true;
|
||||
else
|
||||
grouped_conv_2d = true;
|
||||
|
||||
Shape onnx_weight_shape;
|
||||
for (auto dim : weight_tensor.dims())
|
||||
onnx_weight_shape.push_back(SafeInt<uint32_t>(dim));
|
||||
|
|
@ -1297,12 +1360,22 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
|
|||
case ONNX_NAMESPACE::TensorProto_DataType_UINT8:
|
||||
onnx_weight_type = Type::TENSOR_QUANT8_ASYMM;
|
||||
break;
|
||||
case ONNX_NAMESPACE::TensorProto_DataType_INT8:
|
||||
onnx_weight_type = Type::TENSOR_QUANT8_SYMM_PER_CHANNEL;
|
||||
break;
|
||||
default:
|
||||
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
|
||||
"The initializer of graph ", weight, " doesn't have valid type: ", weight_tensor.data_type());
|
||||
}
|
||||
|
||||
OperandType onnx_weight_operand_type(onnx_weight_type, onnx_weight_shape, w_scale, w_zero_point);
|
||||
// Get weight operand type
|
||||
// Per-channel quantized weight is handled differently
|
||||
OperandType onnx_weight_operand_type =
|
||||
(is_qlinear_conv && w_scales.has_value())
|
||||
? OperandType{onnx_weight_type, onnx_weight_shape,
|
||||
SymmPerChannelQuantParams{w_scales.value(),
|
||||
depthwise_conv_2d ? 3u : 0u}} // channelDim is 3 for depthwise-conv
|
||||
: OperandType{onnx_weight_type, onnx_weight_shape, w_scale, w_zero_point};
|
||||
|
||||
// Pre-process weights
|
||||
if (conv_2d || grouped_conv_2d) {
|
||||
|
|
@ -1314,7 +1387,7 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
|
|||
if (is_qlinear_conv) {
|
||||
// Verify that the scale and zero point of the onnx input/weight match those of the nnapi input/weight
|
||||
ORT_RETURN_IF_ERROR(IsValidInputQuantizedType(model_builder, input, x_scale, x_zero_point));
|
||||
ORT_RETURN_IF_ERROR(IsValidInputQuantizedType(model_builder, weight, w_scale, w_zero_point));
|
||||
ORT_RETURN_IF_ERROR(IsValidConvWeightQuantizedType(model_builder, weight, w_scale, w_zero_point, w_scales));
|
||||
}
|
||||
|
||||
bool hasBias = (input_defs.size() > b_idx);
|
||||
|
|
@ -1332,14 +1405,15 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
|
|||
vector<float> buffer(bias_dimen[0], 0.0f);
|
||||
OperandType bias_operand_type(Type::TENSOR_FLOAT32, bias_dimen, x_scale * w_scale);
|
||||
ORT_RETURN_IF_ERROR(model_builder.AddOperandFromPersistMemoryBuffer(bias, buffer.data(), bias_operand_type));
|
||||
} else if (weight_type == Type::TENSOR_QUANT8_ASYMM) {
|
||||
} else if (weight_type == Type::TENSOR_QUANT8_ASYMM || weight_type == Type::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
|
||||
vector<int32_t> buffer(bias_dimen[0], 0);
|
||||
OperandType bias_operand_type(Type::TENSOR_INT32, bias_dimen, x_scale * w_scale);
|
||||
ORT_RETURN_IF_ERROR(model_builder.AddOperandFromPersistMemoryBuffer(bias, buffer.data(), bias_operand_type));
|
||||
} else {
|
||||
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unknown weight type ", TypeToStr(weight_type));
|
||||
}
|
||||
} else if (is_qlinear_conv) { // QLinearConv's bias type need special handling
|
||||
} else if (is_qlinear_conv) {
|
||||
// QLinearConv's bias type needs special handling to add the scale for the quantized input
|
||||
const auto& bias_tensor = *model_builder.GetInitializerTensors().at(bias);
|
||||
ORT_RETURN_IF_NOT(bias_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT32,
|
||||
"bias of QLinearConv should be int32, actual type: ", bias_tensor.data_type());
|
||||
|
|
|
|||
|
|
@ -228,7 +228,7 @@ bool BinaryOpSupportChecker::HasSupportedInputsImpl(const Node& node) const {
|
|||
}
|
||||
|
||||
bool BinaryOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node,
|
||||
const OpSupportCheckParams& /* params */) const {
|
||||
const OpSupportCheckParams& params) const {
|
||||
const auto& op_type(node.OpType());
|
||||
const auto input_defs(node.InputDefs());
|
||||
bool op_is_qlinear = op_type == "QLinearAdd";
|
||||
|
|
@ -265,7 +265,7 @@ bool BinaryOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initi
|
|||
|
||||
// All scale/zero points are initializer scalars
|
||||
// a/b/y_scale
|
||||
if (!HasValidQuantizationScales(initializers, node, {1, 4, 6}))
|
||||
if (!HasValidQuantizationScales(initializers, node, {1, 4, 6}, params))
|
||||
return false;
|
||||
|
||||
// a/b/y_zero_point
|
||||
|
|
@ -599,7 +599,7 @@ bool ConvOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial
|
|||
}
|
||||
|
||||
// a/b/y_scale
|
||||
if (!HasValidQuantizationScales(initializers, node, {1, 4, 6}))
|
||||
if (!HasValidQuantizationScales(initializers, node, {1, 4, 6}, params))
|
||||
return false;
|
||||
|
||||
// a/b/y_zero_point
|
||||
|
|
@ -860,7 +860,7 @@ bool GemmOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial
|
|||
|
||||
// All scale/zero points are initializer scalars
|
||||
// a/b/y_scale
|
||||
if (!HasValidQuantizationScales(initializers, node, {1, 4, 6}))
|
||||
if (!HasValidQuantizationScales(initializers, node, {1, 4, 6}, params))
|
||||
return false;
|
||||
|
||||
// a/b/y_zero_point
|
||||
|
|
@ -1003,7 +1003,7 @@ class QuantizeLinearOpSupportChecker : public BaseOpSupportChecker {
|
|||
};
|
||||
|
||||
bool QuantizeLinearOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node,
|
||||
const OpSupportCheckParams& /* params */) const {
|
||||
const OpSupportCheckParams& params) const {
|
||||
const auto input_defs(node.InputDefs());
|
||||
const auto output_defs(node.OutputDefs());
|
||||
|
||||
|
|
@ -1018,7 +1018,7 @@ bool QuantizeLinearOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSe
|
|||
return false;
|
||||
}
|
||||
|
||||
if (!HasValidQuantizationScales(initializers, node, {1}))
|
||||
if (!HasValidQuantizationScales(initializers, node, {1}, params))
|
||||
return false;
|
||||
|
||||
if (input_defs.size() == 3) { // has zero_point input
|
||||
|
|
@ -1045,9 +1045,9 @@ class DequantizeLinearOpSupportChecker : public BaseOpSupportChecker {
|
|||
};
|
||||
|
||||
bool DequantizeLinearOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node,
|
||||
const OpSupportCheckParams& /* params */) const {
|
||||
const OpSupportCheckParams& params) const {
|
||||
const auto input_defs(node.InputDefs());
|
||||
if (!HasValidQuantizationScales(initializers, node, {1}))
|
||||
if (!HasValidQuantizationScales(initializers, node, {1}, params))
|
||||
return false;
|
||||
|
||||
if (input_defs.size() == 3) { // has zero_point input
|
||||
|
|
|
|||
|
|
@ -32,10 +32,22 @@ OperandType::OperandType(Type type, const std::vector<uint32_t>& d, float scale,
|
|||
};
|
||||
}
|
||||
|
||||
OperandType::OperandType(const OperandType& other) {
|
||||
type = other.type;
|
||||
dimensions = other.dimensions;
|
||||
operandType = other.operandType;
|
||||
// Constructs the operand type of a per-channel quantized tensor.
// type:         operand type stored in both this object and the NNAPI descriptor.
// d:            tensor dimensions, copied into `dimensions`.
// channelQuant: per-channel quantization parameters, stored in the `channelQuant` member.
// The NNAPI descriptor's scale and zeroPoint are both 0 here, and its dimensions
// pointer refers to this object's own `dimensions` storage (nullptr when empty).
OperandType::OperandType(Type type, const std::vector<uint32_t>& d, SymmPerChannelQuantParams&& channelQuant)
    : type(type), dimensions(d), channelQuant(std::move(channelQuant)) {
  operandType = {};
  operandType.type = static_cast<int32_t>(type);
  operandType.dimensionCount = static_cast<uint32_t>(dimensions.size());
  operandType.dimensions = dimensions.empty() ? nullptr : dimensions.data();
  operandType.scale = 0.0f;
  operandType.zeroPoint = 0;
}
|
||||
|
||||
// Copy constructor. Copies every member, then re-points the NNAPI descriptor's
// dimensions pointer at this object's own `dimensions` vector, since the copied
// descriptor still refers to the storage owned by `other`.
OperandType::OperandType(const OperandType& other)
    : operandType(other.operandType),
      type(other.type),
      dimensions(other.dimensions),
      channelQuant(other.channelQuant) {
  if (dimensions.empty()) {
    operandType.dimensions = nullptr;
  } else {
    operandType.dimensions = dimensions.data();
  }
}
|
||||
|
||||
|
|
@ -44,6 +56,7 @@ OperandType& OperandType::operator=(const OperandType& other) {
|
|||
type = other.type;
|
||||
dimensions = other.dimensions;
|
||||
operandType = other.operandType;
|
||||
channelQuant = other.channelQuant;
|
||||
operandType.dimensions = dimensions.size() > 0 ? dimensions.data() : nullptr;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -21,6 +21,9 @@
|
|||
|
||||
#include "NeuralNetworksTypes.h"
|
||||
|
||||
// Move to std::optional when we switch to c++ 17
|
||||
#include "core/common/optional.h"
|
||||
|
||||
template <typename T>
|
||||
T Product(const std::vector<T>& v) {
|
||||
return static_cast<T>(
|
||||
|
|
@ -99,12 +102,40 @@ inline std::string TypeToStr(const Type& type) {
|
|||
}
|
||||
}
|
||||
|
||||
struct SymmPerChannelQuantParams {
|
||||
ANeuralNetworksSymmPerChannelQuantParams params;
|
||||
std::vector<float> scales;
|
||||
SymmPerChannelQuantParams(std::vector<float> scalesVec, uint32_t channelDim)
|
||||
: scales(std::move(scalesVec)) {
|
||||
params = {
|
||||
.channelDim = channelDim,
|
||||
.scaleCount = static_cast<uint32_t>(scales.size()),
|
||||
.scales = scales.size() > 0 ? scales.data() : nullptr,
|
||||
};
|
||||
}
|
||||
SymmPerChannelQuantParams(const SymmPerChannelQuantParams& other)
|
||||
: params(other.params), scales(other.scales) {
|
||||
params.scales = scales.size() > 0 ? scales.data() : nullptr;
|
||||
}
|
||||
SymmPerChannelQuantParams& operator=(const SymmPerChannelQuantParams& other) {
|
||||
if (this != &other) {
|
||||
params = other.params;
|
||||
scales = other.scales;
|
||||
params.scales = scales.size() > 0 ? scales.data() : nullptr;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
struct OperandType {
|
||||
ANeuralNetworksOperandType operandType;
|
||||
Type type;
|
||||
std::vector<uint32_t> dimensions;
|
||||
onnxruntime::optional<SymmPerChannelQuantParams> channelQuant;
|
||||
|
||||
explicit OperandType(Type type, const std::vector<uint32_t>& d, float scale = 0.0f, int32_t zeroPoint = 0);
|
||||
explicit OperandType(Type type, const std::vector<uint32_t>& d, SymmPerChannelQuantParams&& channelQuant);
|
||||
|
||||
explicit OperandType(Type type, const std::vector<uint32_t>& d = {}, float scale = 0.0f, int32_t zeroPoint = 0);
|
||||
OperandType(const OperandType& other);
|
||||
OperandType& operator=(const OperandType& other);
|
||||
|
||||
|
|
|
|||
|
|
@ -389,7 +389,9 @@ class QLinearConvOpTester {
|
|||
Y_shape.push_back(output_channels);
|
||||
for (size_t n = 0; n < kernel_rank; n++) {
|
||||
Y_shape.push_back(((input_shape[n] + pads[n] + pads[kernel_rank + n]) -
|
||||
(dilations[n] * (kernel_shape[n] - 1) + 1)) / strides[n] + 1);
|
||||
(dilations[n] * (kernel_shape[n] - 1) + 1)) /
|
||||
strides[n] +
|
||||
1);
|
||||
}
|
||||
const int64_t* output_shape = Y_shape.data() + 2;
|
||||
Y_data.resize(ShapeSize(Y_shape));
|
||||
|
|
@ -464,22 +466,38 @@ class QLinearConvOpTester {
|
|||
|
||||
test.AddInput<T1>("x", X_.shape_, X_.data_);
|
||||
test.AddInput<float>("x_scale", {}, X_.scale_, all_input_initializer_except_x);
|
||||
test.AddInput<T1>("x_zero_point", {}, {X_.zero_point_});
|
||||
test.AddInput<T1>("x_zero_point", {}, {X_.zero_point_}, all_input_initializer_except_x);
|
||||
|
||||
const std::vector<int64_t> W_scale_shape{static_cast<int64_t>(W_.scale_.size())};
|
||||
test.AddInput<T2>("w", W_.shape_, W_.data_, all_input_initializer_except_x);
|
||||
test.AddInput<float>("w_scale", W_scale_shape, W_.scale_, all_input_initializer_except_x);
|
||||
test.AddInput<T2>("w_zero_point", {}, {W_.zero_point_});
|
||||
test.AddInput<T2>("w_zero_point", {}, {W_.zero_point_}, all_input_initializer_except_x);
|
||||
|
||||
test.AddInput<float>("y_scale", {}, {output_scale_}, all_input_initializer_except_x);
|
||||
test.AddInput<T1>("y_zero_point", {}, {output_zero_point_});
|
||||
test.AddInput<T1>("y_zero_point", {}, {output_zero_point_}, all_input_initializer_except_x);
|
||||
|
||||
if (!B_.empty()) {
|
||||
const std::vector<int64_t> B_shape{static_cast<int64_t>(B_.size())};
|
||||
test.AddInput<int32_t>("b", B_shape, B_);
|
||||
test.AddInput<int32_t>("b", B_shape, B_, all_input_initializer_except_x);
|
||||
}
|
||||
|
||||
test.AddOutput<uint8_t>("y", Y_shape, Y_data);
|
||||
float abs_error = 0.0f;
|
||||
|
||||
// For quantized models, NNAPI's rounding is different than CPU provider
|
||||
// Sometimes the result is within +/-1 of result of CPU provider
|
||||
// For ONNX, we use rounding to nearest ties to even.
|
||||
// For NNAPI, it is using std::round which is HALF_AWAY_FROM_ZERO, see
|
||||
// https://android.googlesource.com/platform/frameworks/ml/+/refs/heads/master/nn/common/operations/Quantize.cpp
|
||||
// Use 1 as abs_error which is the smallest possible for uint8_t
|
||||
//
|
||||
// NOTE, for now the tolerance will only apply if the NNAPI is actually used,
|
||||
// if for any reason the execution falls back to CPU, we still expect an exact match
|
||||
// See, 'void Check<uint8_t>(...' in onnxruntime/test/providers/provider_test_utils.cc
|
||||
#ifdef USE_NNAPI
|
||||
abs_error = 1.0f;
|
||||
#endif
|
||||
|
||||
test.AddOutput<uint8_t>("y", Y_shape, Y_data, false /* sort_output */, 0.0f /* rel_error */, abs_error);
|
||||
|
||||
if (!pads_.empty()) {
|
||||
test.AddAttribute("pads", pads_);
|
||||
|
|
|
|||
|
|
@ -68,6 +68,53 @@ void Check(const OpTester::Data& expected_data, const Tensor& output_tensor,
|
|||
}
|
||||
}
|
||||
|
||||
// Specialization of Check for uint8_t tensors.
//
// Compares the expected tensor stored in `expected_data` against `output_tensor`
// element by element and reports mismatches through EXPECT_NEAR / EXPECT_EQ,
// tagging each failure with the element index and `provider_type`.
//
// - If `expected_data.sort_output_` is set, both buffers are passed to
//   sort_expected_and_actual_buffers before the comparison.
// - A tolerance is honoured only when `provider_type` is kNnapiExecutionProvider
//   and an absolute or relative error has been set on `expected_data`.
// - In every other case the values must match exactly.
template <>
|
||||
void Check<uint8_t>(const OpTester::Data& expected_data,
|
||||
const Tensor& output_tensor,
|
||||
const std::string& provider_type) {
|
||||
auto& expected_tensor = expected_data.data_.Get<Tensor>();
|
||||
auto* expected = expected_tensor.template Data<uint8_t>();
|
||||
auto* output = output_tensor.template Data<uint8_t>();
|
||||
auto size = output_tensor.Shape().Size();
|
||||
|
||||
bool has_abs_err = expected_data.absolute_error_.has_value();
|
||||
bool has_rel_err = expected_data.relative_error_.has_value();
|
||||
|
||||
if (expected_data.sort_output_) {
|
||||
// if order can be jumbled in the output of an operator, sort both the
|
||||
// expected and output buffers prior to
|
||||
// comparison this is a "best-effort" algo and should satisfy the
|
||||
// requirement for the few ops that do require this
|
||||
// support without investing in a more sophisticated infrastructure for the
|
||||
// same
|
||||
sort_expected_and_actual_buffers<uint8_t>(expected, output, size);
|
||||
}
|
||||
|
||||
// For uint8_t results, we only allow NNAPI EP to have an error tolerance, see below for the reason
|
||||
// For any other EPs, we still expect an exact match for the results
|
||||
if (provider_type == kNnapiExecutionProvider && (has_abs_err || has_rel_err)) {
|
||||
double threshold = has_abs_err
|
||||
? expected_data.absolute_error_.value()
|
||||
: 0.0;
|
||||
|
||||
for (int i = 0; i < size; ++i) {
|
||||
// When both tolerances are set, the relative error is the one that is applied.
if (has_rel_err) {
|
||||
EXPECT_NEAR(expected[i], output[i],
|
||||
expected_data.relative_error_.value() * expected[i]) // expected[i] is unsigned, can't be negative
|
||||
<< "i:" << i << ", provider_type: " << provider_type;
|
||||
} else { // has_abs_err
|
||||
EXPECT_NEAR(expected[i], output[i], threshold)
|
||||
<< "i:" << i << ", provider_type: " << provider_type;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < size; ++i) {
|
||||
EXPECT_EQ(expected[i], output[i]) << "i:" << i
|
||||
<< ", provider_type: " << provider_type;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
void Check<double>(const OpTester::Data& expected_data,
|
||||
const Tensor& output_tensor,
|
||||
|
|
@ -747,8 +794,7 @@ void OpTester::Run(
|
|||
kAclExecutionProvider,
|
||||
kArmNNExecutionProvider,
|
||||
kNnapiExecutionProvider,
|
||||
kRocmExecutionProvider
|
||||
};
|
||||
kRocmExecutionProvider};
|
||||
|
||||
bool has_run = false;
|
||||
|
||||
|
|
@ -844,8 +890,7 @@ void OpTester::Run(
|
|||
}
|
||||
}
|
||||
|
||||
if (!valid)
|
||||
{
|
||||
if (!valid) {
|
||||
std::cerr << "No kernel registered from EP: " << provider_type << "for node: " << node.OpType() << std::endl;
|
||||
break;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -324,20 +324,24 @@ class OpTester {
|
|||
|
||||
template <typename T>
|
||||
void AddOutput(const char* name, const std::vector<int64_t>& dims, const std::initializer_list<T>& expected_values,
|
||||
bool sort_output = false) {
|
||||
AddData(output_data_, name, dims, expected_values.begin(), expected_values.size(), false, sort_output);
|
||||
bool sort_output = false, float rel_error = 0.0f, float abs_error = 0.0f) {
|
||||
AddData(output_data_, name, dims, expected_values.begin(), expected_values.size(), false,
|
||||
sort_output, nullptr /* dim_params */, rel_error, abs_error);
|
||||
}
|
||||
|
||||
// This function doesn't work for vector<bool> because const vector<bool> cannot invoke its data().
|
||||
template <typename T>
|
||||
void AddOutput(const char* name, const std::vector<int64_t>& dims, const std::vector<T>& expected_values,
|
||||
bool sort_output = false) {
|
||||
AddData(output_data_, name, dims, expected_values.data(), expected_values.size(), false, sort_output);
|
||||
bool sort_output = false, float rel_error = 0.0f, float abs_error = 0.0f) {
|
||||
AddData(output_data_, name, dims, expected_values.data(), expected_values.size(), false,
|
||||
sort_output, nullptr /* dim_params */, rel_error, abs_error);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void AddOutput(const char* name, const std::vector<int64_t>& dims, const T* p_values, const size_t size) {
|
||||
AddData(output_data_, name, dims, p_values, size);
|
||||
void AddOutput(const char* name, const std::vector<int64_t>& dims, const T* p_values, const size_t size,
|
||||
bool sort_output = false, float rel_error = 0.0f, float abs_error = 0.0f) {
|
||||
AddData(output_data_, name, dims, p_values, size, false,
|
||||
sort_output, nullptr /* dim_params */, rel_error, abs_error);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
|
@ -521,7 +525,8 @@ class OpTester {
|
|||
template <typename T>
|
||||
void AddData(std::vector<Data>& data, const char* name, const std::vector<int64_t>& dims, const T* values,
|
||||
int64_t values_count, bool is_initializer = false, bool sort_output = false,
|
||||
const std::vector<std::string>* dim_params = nullptr) {
|
||||
const std::vector<std::string>* dim_params = nullptr,
|
||||
float rel_error = 0.0f, float abs_error = 0.0f) {
|
||||
ORT_TRY {
|
||||
TensorShape shape{dims};
|
||||
ORT_ENFORCE(shape.Size() == values_count, values_count, " input values doesn't match tensor size of ",
|
||||
|
|
@ -565,7 +570,19 @@ class OpTester {
|
|||
}
|
||||
node_arg.SetShape(new_shape);
|
||||
}
|
||||
data.push_back(Data(std::move(node_arg), std::move(value), optional<float>(), optional<float>(), sort_output));
|
||||
|
||||
optional<float> rel;
|
||||
optional<float> abs;
|
||||
|
||||
if (rel_error != 0.0f) {
|
||||
rel = rel_error;
|
||||
}
|
||||
|
||||
if (abs_error != 0.0f) {
|
||||
abs = abs_error;
|
||||
}
|
||||
|
||||
data.push_back(Data(std::move(node_arg), std::move(value), std::move(rel), std::move(abs), sort_output));
|
||||
if (is_initializer) initializer_index_.push_back(data.size() - 1);
|
||||
}
|
||||
ORT_CATCH(const std::exception& ex) {
|
||||
|
|
|
|||
Loading…
Reference in a new issue