[NNAPI EP] Enable per-channel quantization for QlinearConv (#6155)

* Enable qlinearconv per-channel quantization

* Fix the android CI test failure

* Add Android Version Check for Per-Channel Quant

* Address PR comments

* Fix some minor issues

* Add verification of per-channel zero points

* Make the error tolerance configurable
This commit is contained in:
Guoyu Wang 2020-12-18 16:13:22 -08:00 committed by GitHub
parent 39aedbc97f
commit bbb52e9274
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 410 additions and 79 deletions

View file

@ -7,6 +7,7 @@
#include <core/common/safeint.h>
#include <core/common/logging/logging.h>
#include <core/framework/tensorprotoutils.h>
#include <core/graph/graph.h>
#include <core/graph/graph_viewer.h>
#include <core/providers/common.h>
@ -64,6 +65,32 @@ QLinearOpType GetQLinearOpType(const onnxruntime::Node& node) {
return QLinearOpType::Unknown;
}
// Classifies a Conv / QLinearConv node into the NNAPI convolution flavour it maps to.
//
// ONNX has a single conv op, NNAPI has three. With input laid out as (N, C, H, W):
//   group == 1                                      --> regular conv
//   group != 1 && weight is (M, 1, kH, kW)          --> depthwise conv
//   group != 1 && weight is (M, C/group, kH, kW)    --> grouped conv
//
// The node must be a Conv or QLinearConv (enforced) and its weight must be present in
// `initializers`; the weight lookup happens regardless of the group value.
ConvType GetConvType(const onnxruntime::Node& node, const InitializedTensorSet& initializers) {
  const auto& op_type = node.OpType();
  bool is_qlinear_conv = (op_type == "QLinearConv");
  ORT_ENFORCE(op_type == "Conv" || is_qlinear_conv);

  NodeAttrHelper attr_helper(node);
  const auto group = attr_helper.Get("group", 1);

  // The weight is input #3 of QLinearConv and input #1 of Conv
  const size_t weight_input_idx = is_qlinear_conv ? 3 : 1;
  const auto& weight_name = node.InputDefs()[weight_input_idx]->Name();
  const auto& weight_tensor = *initializers.at(weight_name);

  if (group == 1) {
    return ConvType::Regular;
  }

  // For group != 1, dims()[1] of the weight separates depthwise (== 1) from grouped
  const bool weight_dim1_is_one = (weight_tensor.dims()[1] == 1);
  return weight_dim1_is_one ? ConvType::Depthwise : ConvType::Grouped;
}
bool IsQLinearBinaryOp(QLinearOpType qlinear_op_type) {
return qlinear_op_type == QLinearOpType::QLinearConv ||
qlinear_op_type == QLinearOpType::QLinearMatMul ||
@ -71,8 +98,9 @@ bool IsQLinearBinaryOp(QLinearOpType qlinear_op_type) {
}
bool HasValidBinaryOpQuantizedInputs(const Node& node) {
auto op_type = GetQLinearOpType(node);
int32_t a_input_type, b_input_type;
if (!IsQLinearBinaryOp(GetQLinearOpType(node))) {
if (!IsQLinearBinaryOp(op_type)) {
LOGS_DEFAULT(VERBOSE) << "[" << node.OpType() << "] is not a binary qlinear op";
return false;
}
@ -83,7 +111,16 @@ bool HasValidBinaryOpQuantizedInputs(const Node& node) {
if (!GetType(*input_defs[3], b_input_type))
return false;
if (a_input_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8 || a_input_type != b_input_type) {
// QlinearConv supports u8u8 or u8s8
// QLinearMatMul/Add only support u8u8
bool is_qlinear_conv = op_type == QLinearOpType::QLinearConv;
bool has_valid_qlinear_conv_weight =
(b_input_type == ONNX_NAMESPACE::TensorProto_DataType_UINT8 ||
b_input_type == ONNX_NAMESPACE::TensorProto_DataType_INT8);
if (a_input_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8 ||
(!is_qlinear_conv && a_input_type != b_input_type) ||
(is_qlinear_conv && !has_valid_qlinear_conv_weight)) {
LOGS_DEFAULT(VERBOSE) << "[" << node.OpType()
<< "] A Input type: [" << a_input_type
<< "] B Input type: [" << b_input_type
@ -95,8 +132,9 @@ bool HasValidBinaryOpQuantizedInputs(const Node& node) {
}
bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const Node& node,
const std::vector<size_t>& indices) {
const auto& op = node.OpType();
const std::vector<size_t>& indices, const OpSupportCheckParams& params) {
const auto& op_type = node.OpType();
bool is_qlinear_conv = (op_type == "QLinearConv");
const auto input_defs(node.InputDefs());
for (const auto idx : indices) {
if (idx >= input_defs.size()) {
@ -106,13 +144,42 @@ bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const
}
const auto scale_name = input_defs[idx]->Name();
if (Contains(initializers, scale_name)) {
const auto& tensor = *initializers.at(scale_name);
if (!tensor.dims().empty() && tensor.dims()[0] != 1) {
LOGS_DEFAULT(VERBOSE) << op << " does not support per-channel quantization";
return false;
const auto& scale_tensor = *initializers.at(scale_name);
int64_t scales_dim = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0];
bool is_conv_weight = is_qlinear_conv && idx == 4;
bool is_conv_u8s8_weight = false;
if (is_conv_weight) {
const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
is_conv_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8;
}
// We need to check the per-channel quantization scales dimensions for u8s8 QlinearConv
// We only support per-channel quantization for u8s8
// For all other cases, the scales should be a scalar
if (is_conv_u8s8_weight) {
if (params.android_sdk_ver < 29) {
LOGS_DEFAULT(VERBOSE) << op_type << " only supports per-channel quantization on Android API 29+, "
<< "system API level: " << params.android_sdk_ver;
return false;
}
const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
if (weight_tensor.dims()[0] != scales_dim) {
LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight,"
<< " weight dimension[0] " << weight_tensor.dims()[0]
<< " scale dimension " << scales_dim;
return false;
}
} else {
if (scales_dim != 1) {
LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, "
<< " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+";
return false;
}
}
} else {
LOGS_DEFAULT(VERBOSE) << "The scale of " << op << " must be known";
LOGS_DEFAULT(VERBOSE) << "The scale of " << op_type << " must be known";
return false;
}
}
@ -122,7 +189,8 @@ bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const
bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, const Node& node,
const std::vector<size_t>& indices) {
const auto& op = node.OpType();
const auto& op_type = node.OpType();
bool is_qlinear_conv = (op_type == "QLinearConv");
const auto input_defs(node.InputDefs());
for (const auto idx : indices) {
if (idx >= input_defs.size()) {
@ -130,20 +198,63 @@ bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, co
<< " >= input number, " << input_defs.size();
return false;
}
const auto zero_point_name = node.InputDefs()[idx]->Name();
const auto zero_point_name = input_defs[idx]->Name();
if (Contains(initializers, zero_point_name)) {
const auto& tensor = *initializers.at(zero_point_name);
if (!tensor.dims().empty() && tensor.dims()[0] != 1) {
LOGS_DEFAULT(VERBOSE) << op << " does not support per-channel quantization";
return false;
bool is_conv_weight = is_qlinear_conv && idx == 5;
bool is_conv_u8s8_weight = false;
if (is_conv_weight) {
const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
is_conv_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8;
}
if (tensor.data_type() != ONNX_NAMESPACE::TensorProto_DataType_UINT8) {
LOGS_DEFAULT(VERBOSE) << op << " does not support zero point data type "
<< std::to_string(tensor.data_type());
return false;
const auto& zero_tensor = *initializers.at(zero_point_name);
int64_t zero_dim = zero_tensor.dims().empty() ? 1 : zero_tensor.dims()[0];
if (is_conv_u8s8_weight) {
if (zero_tensor.data_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8) {
LOGS_DEFAULT(VERBOSE) << "u8s8 QlinearConv only supports int8 zero point for weight, "
<< "actual zero point type: [" << zero_tensor.data_type() << "]";
return false;
}
// For onnx, u8s8 QlinearConv, the weight zero point can be a scalar,
// or a tensor with same channel as weight, for NNAPI we only support it be
// 0 (scalar) or all 0 (tensor), NNAPI will assume the zero point for per-channel
// quantization is 0 there is no input for it
const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
if (weight_tensor.dims()[0] != zero_dim && zero_dim != 1) {
LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight,"
<< " weight dimension[0] " << weight_tensor.dims()[0]
<< " zero point dimension " << zero_dim;
return false;
}
std::unique_ptr<uint8_t[]> unpacked_tensor;
size_t tensor_byte_size;
auto status = onnxruntime::utils::UnpackInitializerData(zero_tensor, unpacked_tensor, tensor_byte_size);
if (!status.IsOK()) {
LOGS_DEFAULT(ERROR) << "QLinearConv erro when unpack zero tensor:" << status.ErrorMessage();
return false;
}
// Verify all onnx weight zero point(s) are 0(s)
const int8_t* zero_points = reinterpret_cast<const int8_t*>(unpacked_tensor.get());
for (size_t i = 0; i < tensor_byte_size; i++) {
if (zero_points[i] != 0) {
LOGS_DEFAULT(VERBOSE) << "QLinearConv only support 0 as zero point, "
<< "zero_points[" << i << "] has value: " << zero_points[i];
return false;
}
}
} else {
if (zero_dim != 1) {
LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, "
<< " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+";
return false;
}
}
} else {
LOGS_DEFAULT(VERBOSE) << "The zero point of " << op << " must be known";
LOGS_DEFAULT(VERBOSE) << "The zero point of " << op_type << " must be known";
return false;
}
}

View file

@ -87,8 +87,18 @@ enum class QLinearOpType : uint8_t {
// QLinearReduceMean,
};
// The kind of convolution a Conv/QLinearConv node maps to, as returned by GetConvType.
// ONNX has a single conv op, while NNAPI distinguishes the three cases below.
enum class ConvType : uint8_t {
// group == 1
Regular,
// group != 1 and the weight is (M, 1, kH, kW)
Depthwise,
// group != 1 and the weight is (M, C/group, kH, kW)
Grouped,
};
QLinearOpType GetQLinearOpType(const onnxruntime::Node& node);
// Return the type of the conv ops,
// This function assumes the input is a 2d conv node
ConvType GetConvType(const onnxruntime::Node& node, const InitializedTensorSet& initializers);
// This qlinear op is an operator takes 2 input and produces 1 output
// Such as QLinearConv, QLinearMatMul, QLinearAdd, ...
bool IsQLinearBinaryOp(QLinearOpType qlinear_op_type);
@ -97,7 +107,7 @@ bool IsQLinearBinaryOp(QLinearOpType qlinear_op_type);
bool HasValidBinaryOpQuantizedInputs(const Node& node);
// Check if a qlinear op has valid scales for given indices
bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const Node& node,
const std::vector<size_t>& indices);
const std::vector<size_t>& indices, const OpSupportCheckParams& params);
// Check if a qlinear op has valid zero points for given indices
bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, const Node& node,
const std::vector<size_t>& indices);

View file

@ -26,7 +26,7 @@ int32_t ModelBuilder::GetAndroidSdkVer() const {
// Scalar operand is copied into the model, no need to persist
#define DEFINE_ADD_OPERAND_FROM_SCALAR(scalar_type, op_type) \
Status ModelBuilder::AddOperandFromScalar(scalar_type value, uint32_t& index) { \
OperandType operandType(Type::op_type); \
OperandType operandType(Type::op_type, vector<uint32_t>{}); \
ORT_RETURN_IF_ERROR(AddNewNNAPIOperand(operandType, index)); \
RETURN_STATUS_ON_ERROR_WITH_NOTE( \
nnapi_->ANeuralNetworksModel_setOperandValue( \
@ -377,6 +377,18 @@ Status ModelBuilder::AddNewNNAPIOperand(const OperandType& operand_type, uint32_
RETURN_STATUS_ON_ERROR(
nnapi_->ANeuralNetworksModel_addOperand(nnapi_model_->model_, &operand_type.operandType));
index = next_index_++;
if (operand_type.channelQuant) {
if (GetAndroidSdkVer() < 29) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"Per-channel quantization is only supported on Android API level 29+,",
" system API level: ", GetAndroidSdkVer());
}
RETURN_STATUS_ON_ERROR(nnapi_->ANeuralNetworksModel_setOperandSymmPerChannelQuantParams(
nnapi_model_->model_, index, &operand_type.channelQuant->params));
}
return Status::OK();
}

View file

@ -458,6 +458,7 @@ static Status HandleAutoPad(const Shape& input_shape,
vector<int32_t>& onnx_pads,
int32_t& nnapi_padding_code,
bool& use_auto_pad) {
use_auto_pad = false;
if (auto_pad_type != AutoPadType::NOTSET) {
ORT_RETURN_IF_ERROR(ComputeConvPads(input_shape, weight_size_y, weight_size_x,
onnx_pads, onnx_strides, onnx_dilations,
@ -524,6 +525,47 @@ static Status GetBinaryOpQuantizationScaleAndZeroPoint(
return Status::OK();
}
static Status GetConvOpQuantizationScaleAndZeroPoint(
    const ModelBuilder& model_builder, const Node& node,
    float& a_scale, float& w_scale, float& y_scale,
    int32_t& a_zero_point, int32_t& w_zero_point, int32_t& y_zero_point,
    optional<vector<float>>& w_scales) ORT_MUST_USE_RESULT;

// Reads the quantization scales and zero points of a QLinearConv node.
//
// The scalar values are fetched through GetBinaryOpQuantizationScaleAndZeroPoint first.
// If the weight (input 3) is uint8 (u8u8), nothing else is done and w_scales is left untouched.
// Otherwise (u8s8) w_scale and w_zero_point are reset to 0 and w_scales receives a copy
// of the weight scale tensor (input 4), with one entry if that tensor has no dims.
static Status GetConvOpQuantizationScaleAndZeroPoint(
    const ModelBuilder& model_builder, const Node& node,
    float& a_scale, float& w_scale, float& y_scale,
    int32_t& a_zero_point, int32_t& w_zero_point, int32_t& y_zero_point,
    optional<vector<float>>& w_scales) {
  // Start with the scalar scale and zero points,
  // the per-channel weight scales are handled further down
  ORT_RETURN_IF_ERROR(
      GetBinaryOpQuantizationScaleAndZeroPoint(model_builder, node,
                                               a_scale, w_scale, y_scale,
                                               a_zero_point, w_zero_point, y_zero_point));

  const auto input_defs = node.InputDefs();
  const auto& initializers(model_builder.GetInitializerTensors());

  // Nothing more to do if this is a u8u8 QLinearConv
  const auto& weight_tensor = *initializers.at(input_defs[3]->Name());
  const bool is_u8u8 = (weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_UINT8);
  if (is_u8u8) {
    return Status::OK();
  }

  // From here on this is a u8s8 QLinearConv.
  // Its weight zero point is always 0 and the scalar w_scale is not used,
  // so both are reset to 0
  w_scale = 0.0f;
  w_zero_point = 0;

  // Take a copy of the 1d scales array for per-channel quantization
  const auto& scale_tensor = *initializers.at(input_defs[4]->Name());
  const auto* scales_begin = GetTensorFloatData(scale_tensor);
  const size_t scales_count = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0];
  vector<float> per_channel_scales(scales_begin, scales_begin + scales_count);
  w_scales = onnxruntime::make_optional(std::move(per_channel_scales));
  return Status::OK();
}
// NNAPI has the quantization scale and zero point embedded in the ANeuralNetworksOperandType
// ONNX has the quantization scale and zero point as the inputs of the qlinear operators
// We want to verify the scale and zeropoint of the ONNX inputs matches the values embedded in the NNAPI inputs
@ -553,6 +595,35 @@ static Status IsValidInputQuantizedType(const ModelBuilder& model_builder,
return Status::OK();
}
static Status IsValidConvWeightQuantizedType(const ModelBuilder& model_builder,
                                             const std::string& input_name,
                                             float scale,
                                             int32_t zero_point,
                                             const optional<vector<float>>& scales) ORT_MUST_USE_RESULT;

// Verifies that the quantization parameters of a conv weight taken from the ONNX node
// match the ones recorded on the corresponding NNAPI operand.
//
// The scalar scale/zero point are always checked through IsValidInputQuantizedType.
// When `scales` holds a value, the operand must additionally carry channelQuant
// whose scales compare equal to `scales`.
static Status IsValidConvWeightQuantizedType(const ModelBuilder& model_builder,
                                             const std::string& input_name,
                                             float scale,
                                             int32_t zero_point,
                                             const optional<vector<float>>& scales) {
  // Check the weight the same way as an input without per-channel quantization first
  ORT_RETURN_IF_ERROR(IsValidInputQuantizedType(model_builder, input_name, scale, zero_point));

  // No per-channel scales were supplied, the scalar check above is all there is
  if (!scales.has_value()) {
    return Status::OK();
  }

  const OperandType& weight_operand_type = model_builder.GetOperandTypes().at(input_name);
  const auto& channel_quant = weight_operand_type.channelQuant;
  if (!channel_quant.has_value()) {
    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
                           "Input [", input_name, "] has no channelQuant");
  }

  if (channel_quant.value().scales != scales.value()) {
    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
                           "Input [", input_name, "] has mismatch scales between onnx and NNAPI");
  }

  return Status::OK();
}
static void AddBinaryOpQuantizationScaleAndZeroPointToSkip(ModelBuilder& model_builder, const Node& node) {
const auto input_defs(node.InputDefs());
model_builder.AddInitializerToSkip(input_defs[1]->Name()); // a_scale
@ -1253,6 +1324,13 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
}
}
const auto& weight = input_defs[w_idx]->Name();
const auto& weight_tensor = *initializers.at(weight);
auto conv_type = GetConvType(node, model_builder.GetGraphViewer().GetAllInitializedTensors());
bool conv_2d = (conv_type == ConvType::Regular),
depthwise_conv_2d = (conv_type == ConvType::Depthwise),
grouped_conv_2d = (conv_type == ConvType::Grouped);
float x_scale = 0.0f,
w_scale = 0.0f,
y_scale = 0.0f;
@ -1260,31 +1338,16 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
w_zero_point = 0,
y_zero_point = 0;
// this is for per-channel quantization weights
optional<vector<float>> w_scales;
if (is_qlinear_conv) {
ORT_RETURN_IF_ERROR(GetBinaryOpQuantizationScaleAndZeroPoint(model_builder, node,
x_scale, w_scale, y_scale,
x_zero_point, w_zero_point, y_zero_point));
ORT_RETURN_IF_ERROR(GetConvOpQuantizationScaleAndZeroPoint(model_builder, node,
x_scale, w_scale, y_scale,
x_zero_point, w_zero_point, y_zero_point,
w_scales));
}
const auto& weight = input_defs[w_idx]->Name();
const auto& weight_tensor = *initializers.at(weight);
bool conv_2d = false,
depthwise_conv_2d = false,
grouped_conv_2d = false;
// For ONNX we only have 1 conv ops
// For NNAPI we have 3
// Input is (N, C, H, W)
// group == 1, --> regular conv
// group != 1 && weight is (M, 1, kH, kW), --> depthwise conv
// group != 1 && weight is (M, C/group, kH, kW), --> grouped conv
if (group == 1)
conv_2d = true;
else if ((weight_tensor.dims()[1] == 1))
depthwise_conv_2d = true;
else
grouped_conv_2d = true;
Shape onnx_weight_shape;
for (auto dim : weight_tensor.dims())
onnx_weight_shape.push_back(SafeInt<uint32_t>(dim));
@ -1297,12 +1360,22 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
case ONNX_NAMESPACE::TensorProto_DataType_UINT8:
onnx_weight_type = Type::TENSOR_QUANT8_ASYMM;
break;
case ONNX_NAMESPACE::TensorProto_DataType_INT8:
onnx_weight_type = Type::TENSOR_QUANT8_SYMM_PER_CHANNEL;
break;
default:
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"The initializer of graph ", weight, " doesn't have valid type: ", weight_tensor.data_type());
}
OperandType onnx_weight_operand_type(onnx_weight_type, onnx_weight_shape, w_scale, w_zero_point);
// Get weight operand type
// Per-channel quantized weight is handled differently
OperandType onnx_weight_operand_type =
(is_qlinear_conv && w_scales.has_value())
? OperandType{onnx_weight_type, onnx_weight_shape,
SymmPerChannelQuantParams{w_scales.value(),
depthwise_conv_2d ? 3u : 0u}} // channelDim is 3 for depthwise-conv
: OperandType{onnx_weight_type, onnx_weight_shape, w_scale, w_zero_point};
// Pre-process weights
if (conv_2d || grouped_conv_2d) {
@ -1314,7 +1387,7 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
if (is_qlinear_conv) {
// Verify if the scale and zero point matchs from onnx input/weight and nnapi input/weight
ORT_RETURN_IF_ERROR(IsValidInputQuantizedType(model_builder, input, x_scale, x_zero_point));
ORT_RETURN_IF_ERROR(IsValidInputQuantizedType(model_builder, weight, w_scale, w_zero_point));
ORT_RETURN_IF_ERROR(IsValidConvWeightQuantizedType(model_builder, weight, w_scale, w_zero_point, w_scales));
}
bool hasBias = (input_defs.size() > b_idx);
@ -1332,14 +1405,15 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
vector<float> buffer(bias_dimen[0], 0.0f);
OperandType bias_operand_type(Type::TENSOR_FLOAT32, bias_dimen, x_scale * w_scale);
ORT_RETURN_IF_ERROR(model_builder.AddOperandFromPersistMemoryBuffer(bias, buffer.data(), bias_operand_type));
} else if (weight_type == Type::TENSOR_QUANT8_ASYMM) {
} else if (weight_type == Type::TENSOR_QUANT8_ASYMM || weight_type == Type::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
vector<int32_t> buffer(bias_dimen[0], 0);
OperandType bias_operand_type(Type::TENSOR_INT32, bias_dimen, x_scale * w_scale);
ORT_RETURN_IF_ERROR(model_builder.AddOperandFromPersistMemoryBuffer(bias, buffer.data(), bias_operand_type));
} else {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unknown weight type ", TypeToStr(weight_type));
}
} else if (is_qlinear_conv) { // QLinearConv's bias type need special handling
} else if (is_qlinear_conv) {
// QLinearConv's bias type need special handling to add scale for quantization input
const auto& bias_tensor = *model_builder.GetInitializerTensors().at(bias);
ORT_RETURN_IF_NOT(bias_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT32,
"bias of QLinearConv should be int32, actual type: ", bias_tensor.data_type());

View file

@ -228,7 +228,7 @@ bool BinaryOpSupportChecker::HasSupportedInputsImpl(const Node& node) const {
}
bool BinaryOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node,
const OpSupportCheckParams& /* params */) const {
const OpSupportCheckParams& params) const {
const auto& op_type(node.OpType());
const auto input_defs(node.InputDefs());
bool op_is_qlinear = op_type == "QLinearAdd";
@ -265,7 +265,7 @@ bool BinaryOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initi
// All scale/zero points are initializer scalars
// a/b/y_scale
if (!HasValidQuantizationScales(initializers, node, {1, 4, 6}))
if (!HasValidQuantizationScales(initializers, node, {1, 4, 6}, params))
return false;
// a/b/y_zero_point
@ -599,7 +599,7 @@ bool ConvOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial
}
// a/b/y_scale
if (!HasValidQuantizationScales(initializers, node, {1, 4, 6}))
if (!HasValidQuantizationScales(initializers, node, {1, 4, 6}, params))
return false;
// a/b/y_zero_point
@ -860,7 +860,7 @@ bool GemmOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial
// All scale/zero points are initializer scalars
// a/b/y_scale
if (!HasValidQuantizationScales(initializers, node, {1, 4, 6}))
if (!HasValidQuantizationScales(initializers, node, {1, 4, 6}, params))
return false;
// a/b/y_zero_point
@ -1003,7 +1003,7 @@ class QuantizeLinearOpSupportChecker : public BaseOpSupportChecker {
};
bool QuantizeLinearOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node,
const OpSupportCheckParams& /* params */) const {
const OpSupportCheckParams& params) const {
const auto input_defs(node.InputDefs());
const auto output_defs(node.OutputDefs());
@ -1018,7 +1018,7 @@ bool QuantizeLinearOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSe
return false;
}
if (!HasValidQuantizationScales(initializers, node, {1}))
if (!HasValidQuantizationScales(initializers, node, {1}, params))
return false;
if (input_defs.size() == 3) { // has zero_point input
@ -1045,9 +1045,9 @@ class DequantizeLinearOpSupportChecker : public BaseOpSupportChecker {
};
bool DequantizeLinearOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node,
const OpSupportCheckParams& /* params */) const {
const OpSupportCheckParams& params) const {
const auto input_defs(node.InputDefs());
if (!HasValidQuantizationScales(initializers, node, {1}))
if (!HasValidQuantizationScales(initializers, node, {1}, params))
return false;
if (input_defs.size() == 3) { // has zero_point input

View file

@ -32,10 +32,22 @@ OperandType::OperandType(Type type, const std::vector<uint32_t>& d, float scale,
};
}
OperandType::OperandType(const OperandType& other) {
type = other.type;
dimensions = other.dimensions;
operandType = other.operandType;
// Constructs the operand type of a per-channel quantized tensor.
// The per-channel parameters are stored in channelQuant; the scalar scale and zero point
// of the underlying ANeuralNetworksOperandType are set to 0.
OperandType::OperandType(Type type, const std::vector<uint32_t>& d, SymmPerChannelQuantParams&& channelQuant)
    : type(type), dimensions(d), channelQuant(std::move(channelQuant)) {
  // operandType refers to the storage of this object's own `dimensions` member,
  // or to nothing at all when there are no dimensions
  const auto dims_count = static_cast<uint32_t>(dimensions.size());
  const uint32_t* dims_ptr = dimensions.size() > 0 ? dimensions.data() : nullptr;
  operandType = {
      .type = static_cast<int32_t>(type),
      .dimensionCount = dims_count,
      .dimensions = dims_ptr,
      .scale = 0.0f,
      .zeroPoint = 0,
  };
}
// Copy constructor: copies every member, then re-points operandType.dimensions at this
// object's own `dimensions` storage, since the copied struct still refers to `other`'s.
OperandType::OperandType(const OperandType& other)
    : operandType(other.operandType),
      type(other.type),
      dimensions(other.dimensions),
      channelQuant(other.channelQuant) {
  if (dimensions.size() > 0) {
    operandType.dimensions = dimensions.data();
  } else {
    operandType.dimensions = nullptr;
  }
}
@ -44,6 +56,7 @@ OperandType& OperandType::operator=(const OperandType& other) {
type = other.type;
dimensions = other.dimensions;
operandType = other.operandType;
channelQuant = other.channelQuant;
operandType.dimensions = dimensions.size() > 0 ? dimensions.data() : nullptr;
}

View file

@ -21,6 +21,9 @@
#include "NeuralNetworksTypes.h"
// Move to std::optional when we switch to c++ 17
#include "core/common/optional.h"
template <typename T>
T Product(const std::vector<T>& v) {
return static_cast<T>(
@ -99,12 +102,40 @@ inline std::string TypeToStr(const Type& type) {
}
}
struct SymmPerChannelQuantParams {
ANeuralNetworksSymmPerChannelQuantParams params;
std::vector<float> scales;
SymmPerChannelQuantParams(std::vector<float> scalesVec, uint32_t channelDim)
: scales(std::move(scalesVec)) {
params = {
.channelDim = channelDim,
.scaleCount = static_cast<uint32_t>(scales.size()),
.scales = scales.size() > 0 ? scales.data() : nullptr,
};
}
SymmPerChannelQuantParams(const SymmPerChannelQuantParams& other)
: params(other.params), scales(other.scales) {
params.scales = scales.size() > 0 ? scales.data() : nullptr;
}
SymmPerChannelQuantParams& operator=(const SymmPerChannelQuantParams& other) {
if (this != &other) {
params = other.params;
scales = other.scales;
params.scales = scales.size() > 0 ? scales.data() : nullptr;
}
return *this;
}
};
struct OperandType {
ANeuralNetworksOperandType operandType;
Type type;
std::vector<uint32_t> dimensions;
onnxruntime::optional<SymmPerChannelQuantParams> channelQuant;
explicit OperandType(Type type, const std::vector<uint32_t>& d, float scale = 0.0f, int32_t zeroPoint = 0);
explicit OperandType(Type type, const std::vector<uint32_t>& d, SymmPerChannelQuantParams&& channelQuant);
explicit OperandType(Type type, const std::vector<uint32_t>& d = {}, float scale = 0.0f, int32_t zeroPoint = 0);
OperandType(const OperandType& other);
OperandType& operator=(const OperandType& other);

View file

@ -389,7 +389,9 @@ class QLinearConvOpTester {
Y_shape.push_back(output_channels);
for (size_t n = 0; n < kernel_rank; n++) {
Y_shape.push_back(((input_shape[n] + pads[n] + pads[kernel_rank + n]) -
(dilations[n] * (kernel_shape[n] - 1) + 1)) / strides[n] + 1);
(dilations[n] * (kernel_shape[n] - 1) + 1)) /
strides[n] +
1);
}
const int64_t* output_shape = Y_shape.data() + 2;
Y_data.resize(ShapeSize(Y_shape));
@ -464,22 +466,38 @@ class QLinearConvOpTester {
test.AddInput<T1>("x", X_.shape_, X_.data_);
test.AddInput<float>("x_scale", {}, X_.scale_, all_input_initializer_except_x);
test.AddInput<T1>("x_zero_point", {}, {X_.zero_point_});
test.AddInput<T1>("x_zero_point", {}, {X_.zero_point_}, all_input_initializer_except_x);
const std::vector<int64_t> W_scale_shape{static_cast<int64_t>(W_.scale_.size())};
test.AddInput<T2>("w", W_.shape_, W_.data_, all_input_initializer_except_x);
test.AddInput<float>("w_scale", W_scale_shape, W_.scale_, all_input_initializer_except_x);
test.AddInput<T2>("w_zero_point", {}, {W_.zero_point_});
test.AddInput<T2>("w_zero_point", {}, {W_.zero_point_}, all_input_initializer_except_x);
test.AddInput<float>("y_scale", {}, {output_scale_}, all_input_initializer_except_x);
test.AddInput<T1>("y_zero_point", {}, {output_zero_point_});
test.AddInput<T1>("y_zero_point", {}, {output_zero_point_}, all_input_initializer_except_x);
if (!B_.empty()) {
const std::vector<int64_t> B_shape{static_cast<int64_t>(B_.size())};
test.AddInput<int32_t>("b", B_shape, B_);
test.AddInput<int32_t>("b", B_shape, B_, all_input_initializer_except_x);
}
test.AddOutput<uint8_t>("y", Y_shape, Y_data);
float abs_error = 0.0f;
// For quantized models, NNAPI's rounding is different than CPU provider
// Sometimes the result is within +/-1 of result of CPU provider
// For ONNX, we use rounding to nearest ties to even.
// For NNAPI, it is using std::round which is HALF_AWAY_FROM_ZERO, see
// https://android.googlesource.com/platform/frameworks/ml/+/refs/heads/master/nn/common/operations/Quantize.cpp
// Use 1 as abs_error which is the smallest possible for uint8_t
//
// NOTE, for now the tolerance will only apply if the NNAPI is actually used,
// if for any reason the execution falls back to CPU, we still expect an exact match
// See, 'void Check<uint8_t>(...' in onnxruntime/test/providers/provider_test_utils.cc
#ifdef USE_NNAPI
abs_error = 1.0f;
#endif
test.AddOutput<uint8_t>("y", Y_shape, Y_data, false /* sort_output */, 0.0f /* rel_error */, abs_error);
if (!pads_.empty()) {
test.AddAttribute("pads", pads_);

View file

@ -68,6 +68,53 @@ void Check(const OpTester::Data& expected_data, const Tensor& output_tensor,
}
}
// Compares a uint8_t output tensor against the expected values.
//
// A tolerance is applied only when the provider is the NNAPI EP and the expected data
// carries an absolute and/or relative error; every other case requires an exact match.
// When both errors are present the relative error is the one that is used.
template <>
void Check<uint8_t>(const OpTester::Data& expected_data,
                    const Tensor& output_tensor,
                    const std::string& provider_type) {
  auto& expected_tensor = expected_data.data_.Get<Tensor>();
  auto* expected = expected_tensor.template Data<uint8_t>();
  auto* output = output_tensor.template Data<uint8_t>();
  auto size = output_tensor.Shape().Size();

  bool has_abs_err = expected_data.absolute_error_.has_value();
  bool has_rel_err = expected_data.relative_error_.has_value();

  if (expected_data.sort_output_) {
    // if order can be jumbled in the output of an operator, sort both the
    // expected and output buffers prior to
    // comparison this is a "best-effort" algo and should satisfy the
    // requirement for the few ops that do require this
    // support without investing in a more sophisticated infrastructure for the
    // same
    sort_expected_and_actual_buffers<uint8_t>(expected, output, size);
  }

  // For uint8_t results, only the NNAPI EP is allowed an error tolerance,
  // for any other EP an exact match of the results is still expected
  const bool use_tolerance =
      provider_type == kNnapiExecutionProvider && (has_abs_err || has_rel_err);

  // The loop index uses the same type as the element count, so there is no
  // signed/width mismatch in the comparison and no overflow for very large tensors
  if (!use_tolerance) {
    for (decltype(size) i = 0; i < size; ++i) {
      EXPECT_EQ(expected[i], output[i]) << "i:" << i
                                        << ", provider_type: " << provider_type;
    }
    return;
  }

  double threshold = has_abs_err
                         ? expected_data.absolute_error_.value()
                         : 0.0;

  for (decltype(size) i = 0; i < size; ++i) {
    if (has_rel_err) {
      EXPECT_NEAR(expected[i], output[i],
                  expected_data.relative_error_.value() * expected[i])  // expected[i] is unsigned, can't be negative
          << "i:" << i << ", provider_type: " << provider_type;
    } else {  // has_abs_err
      EXPECT_NEAR(expected[i], output[i], threshold)
          << "i:" << i << ", provider_type: " << provider_type;
    }
  }
}
template <>
void Check<double>(const OpTester::Data& expected_data,
const Tensor& output_tensor,
@ -747,8 +794,7 @@ void OpTester::Run(
kAclExecutionProvider,
kArmNNExecutionProvider,
kNnapiExecutionProvider,
kRocmExecutionProvider
};
kRocmExecutionProvider};
bool has_run = false;
@ -844,8 +890,7 @@ void OpTester::Run(
}
}
if (!valid)
{
if (!valid) {
std::cerr << "No kernel registered from EP: " << provider_type << "for node: " << node.OpType() << std::endl;
break;
}

View file

@ -324,20 +324,24 @@ class OpTester {
template <typename T>
void AddOutput(const char* name, const std::vector<int64_t>& dims, const std::initializer_list<T>& expected_values,
bool sort_output = false) {
AddData(output_data_, name, dims, expected_values.begin(), expected_values.size(), false, sort_output);
bool sort_output = false, float rel_error = 0.0f, float abs_error = 0.0f) {
AddData(output_data_, name, dims, expected_values.begin(), expected_values.size(), false,
sort_output, nullptr /* dim_params */, rel_error, abs_error);
}
// This function doesn't work for vector<bool> because const vector<bool> cannot invoke its data().
template <typename T>
void AddOutput(const char* name, const std::vector<int64_t>& dims, const std::vector<T>& expected_values,
bool sort_output = false) {
AddData(output_data_, name, dims, expected_values.data(), expected_values.size(), false, sort_output);
bool sort_output = false, float rel_error = 0.0f, float abs_error = 0.0f) {
AddData(output_data_, name, dims, expected_values.data(), expected_values.size(), false,
sort_output, nullptr /* dim_params */, rel_error, abs_error);
}
template <typename T>
void AddOutput(const char* name, const std::vector<int64_t>& dims, const T* p_values, const size_t size) {
AddData(output_data_, name, dims, p_values, size);
void AddOutput(const char* name, const std::vector<int64_t>& dims, const T* p_values, const size_t size,
bool sort_output = false, float rel_error = 0.0f, float abs_error = 0.0f) {
AddData(output_data_, name, dims, p_values, size, false,
sort_output, nullptr /* dim_params */, rel_error, abs_error);
}
template <typename T>
@ -521,7 +525,8 @@ class OpTester {
template <typename T>
void AddData(std::vector<Data>& data, const char* name, const std::vector<int64_t>& dims, const T* values,
int64_t values_count, bool is_initializer = false, bool sort_output = false,
const std::vector<std::string>* dim_params = nullptr) {
const std::vector<std::string>* dim_params = nullptr,
float rel_error = 0.0f, float abs_error = 0.0f) {
ORT_TRY {
TensorShape shape{dims};
ORT_ENFORCE(shape.Size() == values_count, values_count, " input values doesn't match tensor size of ",
@ -565,7 +570,19 @@ class OpTester {
}
node_arg.SetShape(new_shape);
}
data.push_back(Data(std::move(node_arg), std::move(value), optional<float>(), optional<float>(), sort_output));
optional<float> rel;
optional<float> abs;
if (rel_error != 0.0f) {
rel = rel_error;
}
if (abs_error != 0.0f) {
abs = abs_error;
}
data.push_back(Data(std::move(node_arg), std::move(value), std::move(rel), std::move(abs), sort_output));
if (is_initializer) initializer_index_.push_back(data.size() - 1);
}
ORT_CATCH(const std::exception& ex) {