[NNAPI QDQ] AddQDQAdd/Mul, update to NNAPI QDQ handling, update some test settings (#10483)

* Squashed commit of the following:

commit 12380491a9
Author: Guoyu Wang <wanggy@outlook.com>
Date:   Mon Feb 7 12:59:04 2022 -0800

    Add qdq mul support

commit 9cadda7f2c
Merge: 7a32847761 0f5d0a091a
Author: Guoyu Wang <wanggy@outlook.com>
Date:   Mon Feb 7 11:24:47 2022 -0800

    Merge remote-tracking branch 'origin/master' into gwang-msft/qdq_mul

commit 7a32847761
Author: Guoyu Wang <wanggy@outlook.com>
Date:   Mon Feb 7 00:41:30 2022 -0800

    move test case to util

commit c1a8f0d81e
Author: Guoyu Wang <wanggy@outlook.com>
Date:   Fri Feb 4 13:04:26 2022 -0800

    update input/output check

commit a6f0a0d504
Author: Guoyu Wang <wanggy@outlook.com>
Date:   Thu Feb 3 18:37:21 2022 -0800

    update quantized io check functions

commit 87f4d1dcfe
Merge: 7849f07109 97b8f6f394
Author: Guoyu Wang <wanggy@outlook.com>
Date:   Wed Feb 2 17:22:58 2022 -0800

    Merge remote-tracking branch 'origin/master' into gwang-msft/qdq_mul

commit 7849f07109
Author: Guoyu Wang <wanggy@outlook.com>
Date:   Wed Feb 2 17:22:55 2022 -0800

    minor update

commit 7196cdf419
Author: Guoyu Wang <wanggy@outlook.com>
Date:   Wed Feb 2 10:50:10 2022 -0800

    init change

commit 84c00772a1
Merge: a8c7dce22f 7318361645
Author: Guoyu Wang <wanggy@outlook.com>
Date:   Tue Feb 1 18:21:17 2022 -0800

    Merge remote-tracking branch 'origin/master' into gwang-msft/qdq_mul

commit a8c7dce22f
Merge: 55e536c182 ef7b4dc05c
Author: Guoyu Wang <wanggy@outlook.com>
Date:   Tue Feb 1 13:51:04 2022 -0800

    Merge remote-tracking branch 'origin/master' into gwang-msft/qdq_mul

commit 55e536c182
Author: Guoyu Wang <wanggy@outlook.com>
Date:   Tue Feb 1 11:44:34 2022 -0800

    address cr comments

commit d460f5b776
Author: Guoyu Wang <wanggy@outlook.com>
Date:   Tue Feb 1 00:33:54 2022 -0800

    fix android UT failure

commit 52146cf06f
Author: Guoyu Wang <wanggy@outlook.com>
Date:   Mon Jan 31 16:01:13 2022 -0800

    fix build break

commit ec6d07df8b
Author: Guoyu Wang <wanggy@outlook.com>
Date:   Mon Jan 31 15:41:52 2022 -0800

    minor update to UT

commit 8ec8490b4f
Author: Guoyu Wang <wanggy@outlook.com>
Date:   Mon Jan 31 15:01:30 2022 -0800

    Add NNAPI support of QDQ Resize

* Update qdq add/mul test case, fix build break

* Address CR comments

* Add QLinearMul support

* remove unused params

* Address CR comments
This commit is contained in:
Guoyu Wang 2022-02-08 20:44:15 -08:00 committed by GitHub
parent 655f490c95
commit e4dc4e4d3c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 712 additions and 747 deletions

View file

@ -61,6 +61,8 @@ QuantizedOpType GetQuantizedOpType(const NodeUnit& node_unit) {
return QuantizedOpType::QLinearMatMul;
else if (op_type == "QLinearAdd")
return QuantizedOpType::QLinearAdd;
else if (op_type == "QLinearMul")
return QuantizedOpType::QLinearMul;
else if (op_type == "QLinearSigmoid")
return QuantizedOpType::QLinearSigmoid;
else if (op_type == "QLinearAveragePool")
@ -72,6 +74,10 @@ QuantizedOpType GetQuantizedOpType(const NodeUnit& node_unit) {
return QuantizedOpType::QDQResize;
else if (op_type == "AveragePool")
return QuantizedOpType::QDQAveragePool;
else if (op_type == "Add")
return QuantizedOpType::QDQAdd;
else if (op_type == "Mul")
return QuantizedOpType::QDQMul;
} else {
// throw?
// Do we want to throw here? This seems to have been neglected last time
@ -114,25 +120,13 @@ bool IsQuantizedPool(QuantizedOpType quant_op_type) {
// Returns true if the given quantized op type is treated as a quantized binary op:
// QLinearMatMul, QLinearAdd, QLinearMul, QDQAdd, QDQMul, or anything IsQuantizedConv accepts.
bool IsQuantizedBinaryOp(QuantizedOpType quant_op_type) {
  switch (quant_op_type) {
    case QuantizedOpType::QLinearMatMul:
    case QuantizedOpType::QLinearAdd:
    case QuantizedOpType::QLinearMul:
    case QuantizedOpType::QDQAdd:
    case QuantizedOpType::QDQMul:
      return true;
    default:
      // Conv variants are classified by their own predicate
      return IsQuantizedConv(quant_op_type);
  }
}
// Checks that input 0 of the given NodeUnit is a uint8 tensor.
// Returns false if GetType fails for input 0, or if the reported type is not uint8
// (the latter case is logged at VERBOSE level).
bool HasValidUnaryOpQuantizedInputs(const NodeUnit& node_unit) {
  int32_t type_of_input0;
  const bool type_known = GetType(node_unit.Inputs()[0].node_arg, type_of_input0);
  if (!type_known)
    return false;

  const bool is_uint8 = (type_of_input0 == ONNX_NAMESPACE::TensorProto_DataType_UINT8);
  if (is_uint8)
    return true;

  LOGS_DEFAULT(VERBOSE) << "[" << node_unit.OpType()
                        << "] Input type: [" << type_of_input0
                        << "] is not supported for now";
  return false;
}
bool HasValidBinaryOpQuantizedInputs(const NodeUnit& node_unit) {
bool HasValidBinaryOpQuantizedInputTypes(const NodeUnit& node_unit) {
auto quant_op_type = GetQuantizedOpType(node_unit);
int32_t a_input_type, b_input_type;
if (!IsQuantizedBinaryOp(quant_op_type)) {
@ -146,16 +140,17 @@ bool HasValidBinaryOpQuantizedInputs(const NodeUnit& node_unit) {
if (!GetType(inputs[1].node_arg, b_input_type))
return false;
// QlinearConv supports u8u8 or u8s8
// QLinearMatMul/Add only support u8u8
bool is_quant_conv = IsQuantizedConv(quant_op_type);
// QlinearConv/MatMul supports u8u8 or u8s8
// QLinearAdd/QLinearMul only support u8u8
bool is_quant_conv_or_matmul = IsQuantizedConv(quant_op_type) || (quant_op_type == QuantizedOpType::QLinearMatMul);
bool has_valid_qlinear_conv_weight =
(b_input_type == ONNX_NAMESPACE::TensorProto_DataType_UINT8 ||
b_input_type == ONNX_NAMESPACE::TensorProto_DataType_INT8);
if (a_input_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8 ||
(!is_quant_conv && a_input_type != b_input_type) ||
(is_quant_conv && !has_valid_qlinear_conv_weight)) {
(!is_quant_conv_or_matmul && a_input_type != b_input_type) ||
(is_quant_conv_or_matmul && !has_valid_qlinear_conv_weight)) {
LOGS_DEFAULT(VERBOSE) << "[" << node_unit.OpType()
<< "] A Input type: [" << a_input_type
<< "] B Input type: [" << b_input_type
@ -166,182 +161,6 @@ bool HasValidBinaryOpQuantizedInputs(const NodeUnit& node_unit) {
return true;
}
bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
const std::vector<size_t>& indices, const OpSupportCheckParams& params, bool is_input) {
const auto& op_type = node_unit.OpType();
auto quant_op_type = GetQuantizedOpType(node_unit);
bool is_quant_conv = IsQuantizedConv(quant_op_type);
bool is_quant_matmul = (quant_op_type == QuantizedOpType::QLinearMatMul);
const auto& io_defs = is_input ? node_unit.Inputs() : node_unit.Outputs();
for (const auto idx : indices) {
if (idx >= io_defs.size()) {
LOGS_DEFAULT(VERBOSE) << (is_input ? "Input" : "Output") << " index, " << idx
<< " >= size, " << io_defs.size()
<< " of NodeUnit: " << node_unit.Name();
return false;
}
const auto& io_def = io_defs[idx];
if (!io_def.quant_param.has_value()) {
LOGS_DEFAULT(VERBOSE) << "HasValidQuantizationZeroPoints, Input index, " << idx
<< " has no quant_param";
return false;
}
const auto scale_name = io_def.quant_param->scale.Name();
if (!Contains(initializers, scale_name)) {
LOGS_DEFAULT(VERBOSE) << "The scale of " << op_type << " must be an initializer tensor";
return false;
}
// If this op is Qlinear[Conv/MatMul], we want to check u8s8 support for weight tensor (or B tensor for QlinearMatMul)
bool is_conv_matmul_weight = is_input && (is_quant_conv || is_quant_matmul) && idx == 1;
bool is_conv_matmul_u8s8_weight = false;
if (is_conv_matmul_weight) {
const auto& weight_tensor = *initializers.at(io_def.node_arg.Name());
is_conv_matmul_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8;
}
const auto& scale_tensor = *initializers.at(scale_name);
int64_t scales_dim = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0];
if (!is_conv_matmul_u8s8_weight) {
if (scales_dim != 1) {
LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, "
<< " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+";
return false;
}
} else if (scales_dim != 1) {
// For u8s8 Qlinear[Conv/MatMul], we support
// 1. Per-tensor, the weight will be transformed to uint8 later
// 2. Per-channel, only from Android API level 29
if (is_quant_matmul) {
LOGS_DEFAULT(VERBOSE) << "QLinearMatMul does not support per-channel quantization";
return false;
}
if (params.android_feature_level < ANEURALNETWORKS_FEATURE_LEVEL_3) {
LOGS_DEFAULT(VERBOSE) << op_type << " only supports per-channel quantization on Android API 29+, "
<< "system NNAPI feature level: " << params.android_feature_level;
return false;
}
const auto& weight_tensor = *initializers.at(io_def.node_arg.Name());
if (weight_tensor.dims()[0] != scales_dim) {
LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight,"
<< " weight dimension[0] " << weight_tensor.dims()[0]
<< " scale dimension " << scales_dim;
return false;
}
}
}
return true;
}
bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
const std::vector<size_t>& indices, bool is_input) {
const auto& op_type = node_unit.OpType();
auto quant_op_type = GetQuantizedOpType(node_unit);
bool is_quant_conv = IsQuantizedConv(quant_op_type);
bool is_quant_matmul = (quant_op_type == QuantizedOpType::QLinearMatMul);
const auto& io_defs = is_input ? node_unit.Inputs() : node_unit.Outputs();
for (const auto idx : indices) {
if (idx >= io_defs.size()) {
LOGS_DEFAULT(VERBOSE) << "HasValidQuantizationZeroPoints, "
<< (is_input ? "Input" : "Output") << " index, " << idx
<< " >= size, " << io_defs.size();
return false;
}
const auto& io_def = io_defs[idx];
if (!io_def.quant_param.has_value()) {
LOGS_DEFAULT(VERBOSE) << "HasValidQuantizationZeroPoints, Input index, " << idx
<< " has no quant_param";
return false;
}
// zero point is optional here
if (!io_def.quant_param->zero_point)
return true;
const auto& zero_point_name = io_def.quant_param->zero_point->Name();
if (!Contains(initializers, zero_point_name)) {
LOGS_DEFAULT(VERBOSE) << "The zero point of " << op_type << " must be an initializer tensor";
return false;
}
bool is_conv_matmul_weight = is_input && (is_quant_conv || is_quant_matmul) && idx == 1;
bool is_conv_matmul_u8s8_weight = false;
if (is_conv_matmul_weight) {
const auto& weight_tensor = *initializers.at(io_def.node_arg.Name());
is_conv_matmul_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8;
}
const auto& zero_tensor = *initializers.at(zero_point_name);
int64_t zero_dim = zero_tensor.dims().empty() ? 1 : zero_tensor.dims()[0];
if (!is_conv_matmul_u8s8_weight) {
if (zero_dim != 1) {
LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, "
<< " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+";
return false;
}
} else {
// For u8s8 Qlinear[Conv/MatMul], we support
// 1. Per-tensor, the weight will be transformed to uint8 later
// 2. Per-channel, only from Android API level 29
if (zero_tensor.data_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8) {
LOGS_DEFAULT(VERBOSE) << "u8s8 Qlinear[Conv/MatMul] only supports int8 zero point for weight, "
<< "actual zero point type: [" << zero_tensor.data_type() << "]";
return false;
}
if (zero_dim != 1) {
if (is_quant_matmul) {
LOGS_DEFAULT(VERBOSE) << "QLinearMatMul does not support per-channel quantization";
return false;
}
}
// For onnx, u8s8 QlinearConv, the weight zero point can be a scalar,
// or a tensor with same channel as weight, for NNAPI we only support it be
// 0 (scalar) or all 0 (tensor), NNAPI will assume the zero point for per-channel
// quantization is 0 there is no input for it
const auto& weight_tensor = *initializers.at(io_def.node_arg.Name());
if (weight_tensor.dims()[0] != zero_dim && zero_dim != 1) {
LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight,"
<< " weight dimension[0] " << weight_tensor.dims()[0]
<< " zero point dimension " << zero_dim;
return false;
}
std::vector<uint8_t> unpacked_tensor;
auto status = onnxruntime::utils::UnpackInitializerData(zero_tensor, node_unit.ModelPath(), unpacked_tensor);
if (!status.IsOK()) {
LOGS_DEFAULT(ERROR) << "Qlinear[Conv/MatMul] error when unpack zero tensor: " << zero_point_name
<< ", error msg: " << status.ErrorMessage();
return false;
}
// Verify all onnx weight zero point(s) are 0(s)
const int8_t* zero_points = reinterpret_cast<const int8_t*>(unpacked_tensor.data());
for (size_t i = 0; i < unpacked_tensor.size(); i++) {
if (zero_points[i] != 0) {
LOGS_DEFAULT(VERBOSE) << "u8s8 Qlinear[Conv/MatMul] only support 0 as zero point, "
<< "zero_points[" << i << "] has value: " << zero_points[i];
return false;
}
}
}
}
return true;
}
common::Status GetQuantizationScaleAndZeroPoint(
const InitializedTensorSet& initializers, const NodeUnitIODef& io_def, const Path& model_path,
float& scale, int32_t& zero_point) {
@ -387,8 +206,8 @@ common::Status GetQuantizationScaleAndZeroPoint(
common::Status GetQuantizationScaleAndZeroPoint(
const InitializedTensorSet& initializers, const NodeUnit& node_unit, const std::string& name,
float& scale, int32_t& zero_point, bool is_input) {
const auto& io_defs = is_input ? node_unit.Inputs() : node_unit.Outputs();
float& scale, int32_t& zero_point, IOKind io_kind) {
const auto& io_defs = io_kind == IOKind::Input ? node_unit.Inputs() : node_unit.Outputs();
for (const auto& io_def : io_defs) {
if (io_def.node_arg.Name() == name)
return GetQuantizationScaleAndZeroPoint(initializers, io_def, node_unit.ModelPath(),

View file

@ -82,12 +82,14 @@ enum class QuantizedOpType : uint8_t {
QLinearAdd,
QLinearSigmoid,
QLinearAveragePool,
QLinearMul,
// Not yet supported
// QLinearMul,
// QLinearReduceMean,
QDQConv,
QDQResize,
QDQAveragePool,
QDQAdd,
QDQMul,
// TODO, add other QDQ NodeUnit types
};
@ -97,6 +99,11 @@ enum class ConvType : uint8_t {
Grouped,
};
// Selects whether a helper should operate on a NodeUnit's inputs or on its outputs
// (used in place of a bare `bool is_input` argument).
enum class IOKind : uint8_t {
// Refers to node_unit.Inputs()
Input,
// Refers to node_unit.Outputs()
Output,
};
QuantizedOpType GetQuantizedOpType(const NodeUnit& node_unit);
// Return the type of the conv ops,
@ -113,18 +120,8 @@ bool IsQuantizedPool(QuantizedOpType quant_op_type);
// Such as QLinearConv, QLinearMatMul, QLinearAdd, QDQConv,...
bool IsQuantizedBinaryOp(QuantizedOpType quant_op_type);
// Check if a qlinear unary op has valid inputs, Qlinear[Sigmoid/AveragePool]
bool HasValidUnaryOpQuantizedInputs(const NodeUnit& node_unit);
// Check if a qlinear binary op has valid inputs, Qlinear[Conv/MatMul/Add]
bool HasValidBinaryOpQuantizedInputs(const NodeUnit& node_unit);
// Check if a qlinear op has valid scales for given indices
bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
const std::vector<size_t>& indices, const OpSupportCheckParams& params, bool is_input);
// Check if a qlinear op has valid zero points for given indices
bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
const std::vector<size_t>& indices, bool is_input);
bool HasValidBinaryOpQuantizedInputTypes(const NodeUnit& node_unit);
common::Status GetQuantizationScaleAndZeroPoint(
const InitializedTensorSet& initializers, const NodeUnitIODef& io_def, const Path& model_path,
@ -132,7 +129,7 @@ common::Status GetQuantizationScaleAndZeroPoint(
common::Status GetQuantizationScaleAndZeroPoint(
const InitializedTensorSet& initializers, const NodeUnit& node_unit, const std::string& name,
float& scale, int32_t& zero_point, bool is_input = true);
float& scale, int32_t& zero_point, IOKind io_kind = IOKind::Input);
// Get Shape/Type of a NodeArg
// TODO, move to shared_utils

View file

@ -210,7 +210,7 @@ static Status GetInputDataType(
// TODO, verify the scale and zero point match if there are multiple op using same input
const auto* node_unit = all_quantized_op_inputs.at(name)[0];
ORT_RETURN_IF_ERROR(GetQuantizationScaleAndZeroPoint(
initializers, *node_unit, name, scale, zero_point, true /* is_input */));
initializers, *node_unit, name, scale, zero_point, IOKind::Input));
break;
}
// case ONNX_NAMESPACE::TensorProto_DataType_INT8:

View file

@ -452,7 +452,7 @@ static Status HandleAutoPad(const Shape& input_shape,
}
// Get scales and zero points for the qlinear binary ops (which has 2 input and 1 output)
// QLinearConv, QLinearMatmul, QLinearAdd
// QLinearConv, QLinearMatmul, QLinearAdd, QLinearMul
// a, b are inputs, and y is output
static Status GetBinaryOpQuantizationScaleAndZeroPoint(
const InitializedTensorSet& initializers, const NodeUnit& node_unit,
@ -656,8 +656,11 @@ class BinaryOpBuilder : public BaseOpBuilder {
};
/* static */ bool BinaryOpBuilder::IsQuantizedOp(const NodeUnit& node_unit) {
// TODO, add support for QDQ NodeUnit
return node_unit.OpType() == "QLinearAdd";
const auto quant_type = GetQuantizedOpType(node_unit);
return quant_type == QuantizedOpType::QLinearAdd ||
quant_type == QuantizedOpType::QLinearMul ||
quant_type == QuantizedOpType::QDQAdd ||
quant_type == QuantizedOpType::QDQMul;
}
void BinaryOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const {
@ -680,6 +683,7 @@ void BinaryOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const N
"Mul",
"Div",
"QLinearAdd",
"QLinearMul",
"Pow",
});
}
@ -690,12 +694,12 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
int32_t op_code;
bool add_activation = true;
bool op_is_qlinear = op_type == "QLinearAdd";
if (op_type == "Add" || op_is_qlinear) {
bool is_quant_op = IsQuantizedOp(node_unit);
if (op_type == "Add" || op_type == "QLinearAdd") { // Add/QLinearAdd/QDQAdd
op_code = ANEURALNETWORKS_ADD;
} else if (op_type == "Sub") {
op_code = ANEURALNETWORKS_SUB;
} else if (op_type == "Mul") {
} else if (op_type == "Mul" || op_type == "QLinearMul") { // Mul/QLinearMul/QDQMul
op_code = ANEURALNETWORKS_MUL;
} else if (op_type == "Div") {
op_code = ANEURALNETWORKS_DIV;
@ -721,7 +725,7 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
b_zero_point = 0,
y_zero_point = 0;
if (op_is_qlinear) {
if (is_quant_op) {
ORT_RETURN_IF_ERROR(GetBinaryOpQuantizationScaleAndZeroPoint(
model_builder.GetInitializerTensors(), node_unit,
a_scale, b_scale, y_scale,
@ -729,7 +733,7 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
}
// Verify that the scale and zero point of the ONNX input match those of the NNAPI input
if (op_is_qlinear) {
if (is_quant_op) {
ORT_RETURN_IF_ERROR(IsValidInputQuantizedType(model_builder, input1, a_scale, a_zero_point));
ORT_RETURN_IF_ERROR(IsValidInputQuantizedType(model_builder, input2, b_scale, b_zero_point));
}
@ -2717,6 +2721,7 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() {
NNAPI_EP_ADD_SHARED_OP_BUILDER("Mul", BinaryOpBuilder);
NNAPI_EP_ADD_SHARED_OP_BUILDER("Pow", BinaryOpBuilder);
NNAPI_EP_ADD_SHARED_OP_BUILDER("QLinearAdd", BinaryOpBuilder);
NNAPI_EP_ADD_SHARED_OP_BUILDER("QLinearMul", BinaryOpBuilder);
NNAPI_EP_ADD_SHARED_OP_BUILDER("Sub", BinaryOpBuilder);
}

View file

@ -22,7 +22,21 @@ struct OpSupportCheckerRegistrations {
std::unordered_map<std::string, const IOpSupportChecker*> op_support_checker_map;
};
bool HasExternalInitializer(const InitializedTensorSet& initializers, const NodeUnit& node_unit) {
// Registers one shared support checker of type T for every op type in `op_types`.
// Does nothing if `op_type` already has an entry in the registration map, which means
// the shared checker was created by an earlier call.
template <class T>
void CreateSharedOpSupportCheckerImpl(const std::string& op_type,
                                      OpSupportCheckerRegistrations& op_registrations,
                                      const std::vector<std::string>& op_types) {
  auto& checker_map = op_registrations.op_support_checker_map;

  // The shared OpSupportChecker is already in the OpSupportCheckerRegistrations
  if (checker_map.count(op_type) != 0)
    return;

  // support_checkers owns the instance; the map only stores a non-owning pointer to it
  op_registrations.support_checkers.push_back(std::make_unique<T>());
  const auto* shared_checker = op_registrations.support_checkers.back().get();
  for (const auto& registered_op : op_types) {
    checker_map.emplace(registered_op, shared_checker);
  }
}
static bool HasExternalInitializer(const InitializedTensorSet& initializers, const NodeUnit& node_unit) {
const auto is_ext_initializer =
[&](const NodeArg& node_arg) {
const auto& input_name(node_arg.Name());
@ -58,18 +72,200 @@ bool HasExternalInitializer(const InitializedTensorSet& initializers, const Node
return false;
}
template <class T>
void CreateSharedOpSupportCheckerImpl(const std::string& op_type,
OpSupportCheckerRegistrations& op_registrations,
const std::vector<std::string>& op_types) {
// The shared OpSupportChecker is already in the OpSupportCheckerRegistrations
if (op_registrations.op_support_checker_map.find(op_type) != op_registrations.op_support_checker_map.cend())
return;
op_registrations.support_checkers.push_back(std::make_unique<T>());
for (const auto& op : op_types) {
op_registrations.op_support_checker_map.emplace(op, op_registrations.support_checkers.back().get());
// Checks whether the quantization scale of `io_def` is supported.
// The scale must be an initializer. A single (per-tensor) scale is always accepted.
// Multiple (per-channel) scales are accepted only for an int8 conv weight
// (is_conv_matmul_u8s8_weight && !is_quant_matmul) when params.android_feature_level is at
// least ANEURALNETWORKS_FEATURE_LEVEL_3 and the scale count equals weight dimension[0].
// Expects io_def.quant_param to hold a value; it is dereferenced without a check.
static bool IsQuantizationScaleSupported(const InitializedTensorSet& initializers,
                                         const NodeUnitIODef& io_def,
                                         const OpSupportCheckParams& params,
                                         const std::string& op_type,
                                         bool is_quant_matmul,
                                         bool is_conv_matmul_u8s8_weight) {
  const auto scale_name = io_def.quant_param->scale.Name();
  const auto scale_entry = initializers.find(scale_name);
  if (scale_entry == initializers.cend()) {
    LOGS_DEFAULT(VERBOSE) << "The scale of " << op_type << " must be an initializer tensor";
    return false;
  }

  // A scale tensor without dims counts as one scale
  const auto& scale_tensor = *scale_entry->second;
  const int64_t scales_dim = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0];

  // Per-tensor quantization needs no further checks
  if (scales_dim == 1)
    return true;

  // Everything below deals with per-channel scales
  if (!is_conv_matmul_u8s8_weight) {
    LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, "
                          << " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+";
    return false;
  }

  // For u8s8 Qlinear[Conv/MatMul], we support
  // 1. Per-tensor, the weight will be transformed to uint8 later
  // 2. Per-channel, only from Android API level 29
  if (is_quant_matmul) {
    LOGS_DEFAULT(VERBOSE) << "QLinearMatMul does not support per-channel quantization";
    return false;
  }

  if (params.android_feature_level < ANEURALNETWORKS_FEATURE_LEVEL_3) {
    LOGS_DEFAULT(VERBOSE) << op_type << " only supports per-channel quantization on Android API 29+, "
                          << "system NNAPI feature level: " << params.android_feature_level;
    return false;
  }

  Shape weight_shape;
  if (!GetShape(io_def.node_arg, weight_shape))
    return false;

  // One scale per entry along weight dimension 0 is required
  if (weight_shape[0] != scales_dim) {
    LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight,"
                          << " weight dimension[0] " << weight_shape[0]
                          << " scale dimension " << scales_dim;
    return false;
  }

  return true;
}
static bool IsQuantizationZeroPointSupported(const InitializedTensorSet& initializers,
const NodeUnitIODef& io_def,
const std::string& op_type,
const Path& model_path,
bool is_quant_matmul,
bool is_conv_matmul_u8s8_weight) {
// zero point is optional here
if (!io_def.quant_param->zero_point)
return true;
const auto& zero_point_name = io_def.quant_param->zero_point->Name();
if (!Contains(initializers, zero_point_name)) {
LOGS_DEFAULT(VERBOSE) << "The zero point of " << op_type << " must be an initializer tensor";
return false;
}
const auto& zero_tensor = *initializers.at(zero_point_name);
int64_t zero_dim = zero_tensor.dims().empty() ? 1 : zero_tensor.dims()[0];
if (!is_conv_matmul_u8s8_weight) {
if (zero_dim != 1) {
LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, "
<< " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+";
return false;
}
} else {
// For u8s8 Qlinear[Conv/MatMul], we support
// 1. Per-tensor, the weight will be transformed to uint8 later
// 2. Per-channel, only from Android API level 29
if (zero_tensor.data_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8) {
LOGS_DEFAULT(VERBOSE) << "u8s8 Qlinear[Conv/MatMul] only supports int8 zero point for weight, "
<< "actual zero point type: [" << zero_tensor.data_type() << "]";
return false;
}
if (zero_dim != 1) {
if (is_quant_matmul) {
LOGS_DEFAULT(VERBOSE) << "QLinearMatMul does not support per-channel quantization";
return false;
}
}
// For onnx, u8s8 QlinearConv, the weight zero point can be a scalar,
// or a tensor with same channel as weight, for NNAPI we only support it be
// 0 (scalar) or all 0 (tensor), NNAPI will assume the zero point for per-channel
// quantization is 0 there is no input for it
Shape weight_shape;
if (!GetShape(io_def.node_arg, weight_shape))
return false;
if (weight_shape[0] != zero_dim && zero_dim != 1) {
LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight,"
<< " weight dimension[0] " << weight_shape[0]
<< " zero point dimension " << zero_dim;
return false;
}
std::vector<uint8_t> unpacked_tensor;
auto status = onnxruntime::utils::UnpackInitializerData(zero_tensor, model_path, unpacked_tensor);
if (!status.IsOK()) {
LOGS_DEFAULT(ERROR) << "Qlinear[Conv/MatMul] error when unpack zero tensor: " << zero_point_name
<< ", error msg: " << status.ErrorMessage();
return false;
}
// Verify all onnx weight zero point(s) are 0(s)
const int8_t* zero_points = reinterpret_cast<const int8_t*>(unpacked_tensor.data());
for (size_t i = 0; i < unpacked_tensor.size(); i++) {
if (zero_points[i] != 0) {
LOGS_DEFAULT(VERBOSE) << "u8s8 Qlinear[Conv/MatMul] only support 0 as zero point, "
<< "zero_points[" << i << "] has value: " << zero_points[i];
return false;
}
}
}
return true;
}
// Check if the given quantized input(s) or output(s) is supported
//
// `indices` index into node_unit.Inputs() when io_kind is IOKind::Input and into
// node_unit.Outputs() otherwise. For each index this verifies, in order:
//   1. the index is in range,
//   2. the tensor type is uint8, or int8 for the weight (input index 1) of a quantized Conv/MatMul,
//   3. the scale is supported (IsQuantizationScaleSupported),
//   4. the zero point is supported (IsQuantizationZeroPointSupported).
// Returns false on the first failed check. Enforces (ORT_ENFORCE) that node_unit is a quantized
// op and that every checked input/output carries a quant_param.
static bool IsQuantizedIOSupported(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
                                   const std::vector<size_t>& indices, const OpSupportCheckParams& params, IOKind io_kind) {
  const auto& op_type = node_unit.OpType();
  const auto quant_op_type = GetQuantizedOpType(node_unit);

  ORT_ENFORCE(quant_op_type != QuantizedOpType::Unknown, "[", op_type, "] is not a quantized op");

  const bool is_input = io_kind == IOKind::Input;
  const bool is_quant_conv = IsQuantizedConv(quant_op_type);
  const bool is_quant_matmul = (quant_op_type == QuantizedOpType::QLinearMatMul);
  const auto& io_defs = is_input ? node_unit.Inputs() : node_unit.Outputs();
  const char* const io_kind_str = is_input ? "Input" : "Output";

  for (const auto idx : indices) {
    if (idx >= io_defs.size()) {
      LOGS_DEFAULT(VERBOSE) << io_kind_str << " index, " << idx
                            << " >= size, " << io_defs.size()
                            << " of NodeUnit: " << node_unit.Name();
      return false;
    }

    const auto& io_def = io_defs[idx];
    // The message previously always said "Input", even when checking outputs
    ORT_ENFORCE(io_def.quant_param.has_value(), io_kind_str, " index, ", idx, " has no quant_param");

    // The type is queried once and reused for both the u8s8 weight detection and the type check
    int32_t io_type;
    if (!GetType(io_def.node_arg, io_type))
      return false;

    // If this op is Qlinear[Conv/MatMul], we want to check u8s8 support for weight tensor (or B tensor for QlinearMatMul)
    const bool is_conv_matmul_weight = is_input && (is_quant_conv || is_quant_matmul) && idx == 1;
    const bool is_conv_matmul_u8s8_weight =
        is_conv_matmul_weight && io_type == ONNX_NAMESPACE::TensorProto_DataType_INT8;

    // We only support u8 for most of the inputs and all outputs, with the exception for Quantized MatMul and Conv,
    // which allows s8 weight (u8s8)
    // TODO, add support of s8s8
    if (io_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8 && !is_conv_matmul_u8s8_weight) {
      // The message previously glued op_type onto "NodeUnit [" and then repeated the type
      LOGS_DEFAULT(VERBOSE) << "NodeUnit [" << node_unit.Name()
                            << "], type [" << op_type << "]'s "
                            << io_kind_str << " index [" << idx
                            << "] has unsupported type [" << io_type << "]";
      return false;
    }

    // Check scale and zero point
    if (!IsQuantizationScaleSupported(initializers, io_def, params, op_type,
                                      is_quant_matmul, is_conv_matmul_u8s8_weight)) {
      return false;
    }

    if (!IsQuantizationZeroPointSupported(initializers, io_def, op_type, node_unit.ModelPath(),
                                          is_quant_matmul, is_conv_matmul_u8s8_weight)) {
      return false;
    }
  }

  return true;
}
#pragma endregion helpers
@ -100,7 +296,9 @@ class BaseOpSupportChecker : public IOpSupportChecker {
return ANEURALNETWORKS_FEATURE_LEVEL_1;
}
virtual bool HasSupportedInputsImpl(const NodeUnit& node_unit) const;
virtual bool HasSupportedInputOutputsImpl(
const InitializedTensorSet& initializers, const NodeUnit& node_unit,
const OpSupportCheckParams& params) const;
virtual int GetMinSupportedOpSet(const NodeUnit& /* node_unit */) const { return 1; }
virtual int GetMaxSupportedOpSet(const NodeUnit& /* node_unit */) const { return 15; }
@ -112,7 +310,8 @@ class BaseOpSupportChecker : public IOpSupportChecker {
private:
bool HasSupportedOpSet(const NodeUnit& node_unit) const;
bool HasSupportedInputs(const NodeUnit& node_unit) const;
bool HasSupportedInputOutputs(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
const OpSupportCheckParams& params) const;
};
/* static */ void BaseOpSupportChecker::CreateSharedOpSupportChecker(
@ -138,7 +337,7 @@ bool BaseOpSupportChecker::IsOpSupported(const InitializedTensorSet& initializer
if (!IsNodeUnitTypeSupported(node_unit))
return false;
if (!HasSupportedInputs(node_unit))
if (!HasSupportedInputOutputs(initializers, node_unit, params))
return false;
// We do not support external initializers for now
@ -151,7 +350,8 @@ bool BaseOpSupportChecker::IsOpSupported(const InitializedTensorSet& initializer
return IsOpSupportedImpl(initializers, node_unit, params);
}
bool BaseOpSupportChecker::HasSupportedInputs(const NodeUnit& node_unit) const {
bool BaseOpSupportChecker::HasSupportedInputOutputs(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
const OpSupportCheckParams& params) const {
// We do not support unknown(null) input shape
auto has_supported_shape = [](const NodeArg& node_arg, const std::string& name, const std::string op_type) {
const auto* shape_proto = node_arg.Shape();
@ -185,10 +385,12 @@ bool BaseOpSupportChecker::HasSupportedInputs(const NodeUnit& node_unit) const {
return false;
}
}
return HasSupportedInputsImpl(node_unit);
return HasSupportedInputOutputsImpl(initializers, node_unit, params);
}
bool BaseOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const {
bool BaseOpSupportChecker::HasSupportedInputOutputsImpl(
const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit,
const OpSupportCheckParams& /* params */) const {
// We only check the type of input 0 by default
// specific op builder can override this
const auto& input = node_unit.Inputs()[0].node_arg;
@ -245,8 +447,13 @@ class BinaryOpSupportChecker : public BaseOpSupportChecker {
const OpSupportCheckParams& params) const override;
bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
const OpSupportCheckParams& params) const override;
bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override;
bool HasSupportedInputOutputsImpl(
const InitializedTensorSet& initializers, const NodeUnit& node_unit,
const OpSupportCheckParams& params) const override;
int GetMinSupportedOpSet(const NodeUnit& node_unit) const override;
bool IsNodeUnitTypeSupported(const NodeUnit& node_unit) const override;
static bool IsQuantizedOp(const NodeUnit& node_unit);
};
/* static */ void BinaryOpSupportChecker::CreateSharedOpSupportChecker(
@ -259,10 +466,29 @@ class BinaryOpSupportChecker : public BaseOpSupportChecker {
"Mul",
"Div",
"QLinearAdd",
"QLinearMul",
"Pow",
});
}
// Any NodeUnit that is not a QDQGroup is accepted. A QDQGroup is accepted only when it
// resolves to QDQAdd or QDQMul.
bool BinaryOpSupportChecker::IsNodeUnitTypeSupported(const NodeUnit& node_unit) const {
  if (node_unit.UnitType() != NodeUnit::Type::QDQGroup)
    return true;

  switch (GetQuantizedOpType(node_unit)) {
    case QuantizedOpType::QDQAdd:
    case QuantizedOpType::QDQMul:
      return true;
    default:
      return false;
  }
}
// Returns true when the NodeUnit resolves to one of the quantized binary ops handled by this
// checker: QLinearAdd, QLinearMul, QDQAdd or QDQMul.
/* static */ bool BinaryOpSupportChecker::IsQuantizedOp(const NodeUnit& node_unit) {
  switch (GetQuantizedOpType(node_unit)) {
    case QuantizedOpType::QLinearAdd:
    case QuantizedOpType::QLinearMul:
    case QuantizedOpType::QDQAdd:
    case QuantizedOpType::QDQMul:
      return true;
    default:
      return false;
  }
}
int32_t BinaryOpSupportChecker::GetMinSupportedNNAPIFeatureLevel(
const NodeUnit& node_unit, const OpSupportCheckParams& /* params */) const {
const auto& op(node_unit.OpType());
@ -281,21 +507,29 @@ int BinaryOpSupportChecker::GetMinSupportedOpSet(const NodeUnit& node_unit) cons
const auto& op(node_unit.OpType());
// Add/Sub/Mul/Div/Pow opset 6- has broadcast attributes we do not support now
if (op != "QLinearAdd")
if (op != "QLinearAdd" && op != "QLinearMul")
return 7;
return 1;
}
bool BinaryOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const {
bool is_qlinear_add = node_unit.OpType() == "QLinearAdd";
bool BinaryOpSupportChecker::HasSupportedInputOutputsImpl(
const InitializedTensorSet& initializers, const NodeUnit& node_unit,
const OpSupportCheckParams& params) const {
bool is_quantized_op = IsQuantizedOp(node_unit);
bool is_pow = node_unit.OpType() == "Pow";
if (!is_qlinear_add && !is_pow)
return BaseOpSupportChecker::HasSupportedInputsImpl(node_unit);
if (!is_quantized_op && !is_pow)
return BaseOpSupportChecker::HasSupportedInputOutputsImpl(initializers, node_unit, params);
if (is_qlinear_add) {
// QLinearAdd
if (!HasValidBinaryOpQuantizedInputs(node_unit))
if (is_quantized_op) {
// QLinearAdd/QDQAdd/QLinearMul/QDQMul
if (!HasValidBinaryOpQuantizedInputTypes(node_unit))
return false;
if (!IsQuantizedIOSupported(initializers, node_unit, {0, 1}, params, IOKind::Input))
return false;
if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Output))
return false;
}
@ -320,11 +554,10 @@ bool BinaryOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) c
return true;
}
bool BinaryOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
const OpSupportCheckParams& params) const {
bool BinaryOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit,
const OpSupportCheckParams& /* params */) const {
const auto& op_type(node_unit.OpType());
const auto& inputs = node_unit.Inputs();
bool op_is_qlinear = op_type == "QLinearAdd";
Shape input1_shape, input2_shape;
if (!GetShape(inputs[0].node_arg, input1_shape) ||
!GetShape(inputs[1].node_arg, input2_shape))
@ -339,32 +572,6 @@ bool BinaryOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initi
return false;
}
if (op_is_qlinear) {
// For QLinearAdd, we only support uint8 output now
int32_t output_type;
if (!GetType(node_unit.Outputs()[0].node_arg, output_type))
return false;
if (output_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) {
LOGS_DEFAULT(VERBOSE) << "[" << op_type
<< "] output type: [" << output_type
<< "] is not supported for now";
return false;
}
// Check input scales and ZPs
if (!HasValidQuantizationScales(initializers, node_unit, {0, 1}, params, true /* is_input */))
return false;
if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0, 1}, true /* is_input */))
return false;
// Check output scale and ZP
if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */))
return false;
if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */))
return false;
}
return true;
}
@ -382,7 +589,9 @@ class TransposeOpSupportChecker : public BaseOpSupportChecker {
return ANEURALNETWORKS_FEATURE_LEVEL_2;
}
bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override;
bool HasSupportedInputOutputsImpl(
const InitializedTensorSet& initializers, const NodeUnit& node_unit,
const OpSupportCheckParams& params) const override;
};
bool TransposeOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit,
@ -401,7 +610,9 @@ bool TransposeOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /*
return true;
}
bool TransposeOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const {
bool TransposeOpSupportChecker::HasSupportedInputOutputsImpl(
const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit,
const OpSupportCheckParams& /* params */) const {
int32_t input_type;
if (!GetType(node_unit.Inputs()[0].node_arg, input_type))
return false;
@ -561,8 +772,10 @@ class PoolOpSupportChecker : public BaseOpSupportChecker {
return params.use_nchw ? ANEURALNETWORKS_FEATURE_LEVEL_3 : ANEURALNETWORKS_FEATURE_LEVEL_2;
}
bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override;
bool IsNodeUnitTypeSupported(const NodeUnit& /* node_unit */) const override { return true; }
bool HasSupportedInputOutputsImpl(
const InitializedTensorSet& initializers, const NodeUnit& node_unit,
const OpSupportCheckParams& params) const override;
bool IsNodeUnitTypeSupported(const NodeUnit& /* node_unit */) const override;
static bool IsQuantizedOp(const NodeUnit& node_unit);
};
@ -579,12 +792,21 @@ class PoolOpSupportChecker : public BaseOpSupportChecker {
});
}
// Reports whether this checker accepts the kind of node unit it was handed.
// Units that are not QDQ groups are always accepted; a QDQ group is accepted
// only when GetQuantizedOpType() classifies it as QDQAveragePool.
bool PoolOpSupportChecker::IsNodeUnitTypeSupported(const NodeUnit& node_unit) const {
  const bool is_qdq_group = node_unit.UnitType() == NodeUnit::Type::QDQGroup;
  if (!is_qdq_group)
    return true;

  const auto qdq_op_type = GetQuantizedOpType(node_unit);
  return qdq_op_type == QuantizedOpType::QDQAveragePool;
}
// Returns true when IsQuantizedPool() accepts the quantized op type that
// GetQuantizedOpType() reports for this node unit.
/* static */ bool PoolOpSupportChecker::IsQuantizedOp(const NodeUnit& node_unit) {
  const auto op_kind = GetQuantizedOpType(node_unit);
  return IsQuantizedPool(op_kind);
}
bool PoolOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
const OpSupportCheckParams& params) const {
const OpSupportCheckParams& /* params */) const {
const auto& op_name = node_unit.Name();
const auto& op_type = node_unit.OpType();
const auto& inputs = node_unit.Inputs();
@ -601,7 +823,8 @@ bool PoolOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial
}
bool is_quant_pool = IsQuantizedOp(node_unit);
if (op_type == "AveragePool" || op_type == "MaxPool" || op_type == "QLinearAveragePool") {
bool is_average_pool = op_type == "AveragePool" || op_type == "QLinearAveragePool";
if (is_average_pool || op_type == "MaxPool") {
NodeAttrHelper helper(node_unit);
const auto count_include_pad = helper.Get("count_include_pad", 0);
@ -642,20 +865,7 @@ bool PoolOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial
}
// We need to check if we have valid scales and zero points for QLinearAveragePool
if (is_quant_pool) {
// Check input scales and ZPs
if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, true /* is_input */))
return false;
if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, true /* is_input */))
return false;
// Check output scale and ZP
if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */))
return false;
if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */))
return false;
if (is_average_pool && is_quant_pool) {
// NNAPI requires Quantized Average Pool has same scale and zero point for both input and output
float input_scale = 0.0f;
int32_t input_zp = 0;
@ -697,14 +907,23 @@ bool PoolOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial
return true;
}
bool PoolOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const {
bool is_max_pool = node_unit.OpType() == "MaxPool";
bool PoolOpSupportChecker::HasSupportedInputOutputsImpl(
const InitializedTensorSet& initializers, const NodeUnit& node_unit,
const OpSupportCheckParams& params) const {
const auto& op_type = node_unit.OpType();
bool is_quant_pool = IsQuantizedOp(node_unit);
if (!is_max_pool && !is_quant_pool)
return BaseOpSupportChecker::HasSupportedInputsImpl(node_unit);
bool is_max_pool = op_type == "MaxPool";
bool is_average_pool = op_type == "AveragePool" || op_type == "QLinearAveragePool";
bool is_quant_average_pool = is_quant_pool && is_average_pool;
if (!is_max_pool && !is_quant_average_pool)
return BaseOpSupportChecker::HasSupportedInputOutputsImpl(initializers, node_unit, params);
if (is_quant_pool) {
return HasValidUnaryOpQuantizedInputs(node_unit);
if (is_quant_average_pool) {
if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Input))
return false;
if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Output))
return false;
}
// is_max_pool
@ -742,7 +961,9 @@ class ConvOpSupportChecker : public BaseOpSupportChecker {
return params.use_nchw ? ANEURALNETWORKS_FEATURE_LEVEL_3 : ANEURALNETWORKS_FEATURE_LEVEL_2;
}
bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override;
bool HasSupportedInputOutputsImpl(
const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit,
const OpSupportCheckParams& /* params */) const override;
bool IsNodeUnitTypeSupported(const NodeUnit& /* node_unit */) const override { return true; }
static bool IsQuantizedOp(const NodeUnit& node_unit);
};
@ -761,12 +982,20 @@ class ConvOpSupportChecker : public BaseOpSupportChecker {
return IsQuantizedConv(GetQuantizedOpType(node_unit));
}
bool ConvOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const {
bool ConvOpSupportChecker::HasSupportedInputOutputsImpl(
const InitializedTensorSet& initializers, const NodeUnit& node_unit,
const OpSupportCheckParams& params) const {
if (!IsQuantizedOp(node_unit))
return BaseOpSupportChecker::HasSupportedInputsImpl(node_unit);
return BaseOpSupportChecker::HasSupportedInputOutputsImpl(initializers, node_unit, params);
// QLinearConv only supports input of uint8 for now
if (!HasValidBinaryOpQuantizedInputs(node_unit))
if (!HasValidBinaryOpQuantizedInputTypes(node_unit))
return false;
if (!IsQuantizedIOSupported(initializers, node_unit, {0, 1}, params, IOKind::Input))
return false;
if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Output))
return false;
return true;
@ -813,34 +1042,10 @@ bool ConvOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial
}
if (is_quant_conv) {
// For QLinearConv, we only support uint8 output now
int32_t output_type;
if (!GetType(node_unit.Outputs()[0].node_arg, output_type))
return false;
if (output_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) {
LOGS_DEFAULT(VERBOSE) << "[" << op_type
<< "] output type: [" << output_type
<< "] is not supported for now";
return false;
}
if (inputs.size() > 2 && !Contains(initializers, inputs[2].node_arg.Name())) {
LOGS_DEFAULT(VERBOSE) << "Bias of QLinearConv must be known";
return false;
}
// Check input scales and ZPs
if (!HasValidQuantizationScales(initializers, node_unit, {0, 1}, params, true /* is_input */))
return false;
if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0, 1}, true /* is_input */))
return false;
// Check output scale and ZP
if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */))
return false;
if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */))
return false;
}
return true;
@ -931,16 +1136,26 @@ class GemmOpSupportChecker : public BaseOpSupportChecker {
private:
bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
const OpSupportCheckParams& params) const override;
bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override;
bool HasSupportedInputOutputsImpl(
const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit,
const OpSupportCheckParams& /* params */) const override;
int GetMinSupportedOpSet(const NodeUnit& node_unit) const override;
};
bool GemmOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const {
bool GemmOpSupportChecker::HasSupportedInputOutputsImpl(
const InitializedTensorSet& initializers, const NodeUnit& node_unit,
const OpSupportCheckParams& params) const {
if (node_unit.OpType() != "QLinearMatMul")
return BaseOpSupportChecker::HasSupportedInputsImpl(node_unit);
return BaseOpSupportChecker::HasSupportedInputOutputsImpl(initializers, node_unit, params);
// QLinearMatMul
if (!HasValidBinaryOpQuantizedInputs(node_unit))
if (!HasValidBinaryOpQuantizedInputTypes(node_unit))
return false;
if (!IsQuantizedIOSupported(initializers, node_unit, {0, 1}, params, IOKind::Input))
return false;
if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Output))
return false;
return true;
@ -1077,33 +1292,6 @@ bool GemmOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial
LOGS_DEFAULT(VERBOSE) << "B of MatMul must be known";
return false;
}
if (is_qlinear_matmul) {
// For QLinearMatMul, we only support uint8 output now
int32_t output_type;
if (!GetType(node_unit.Outputs()[0].node_arg, output_type))
return false;
if (output_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) {
LOGS_DEFAULT(VERBOSE) << "[" << op_type
<< "] output type: [" << output_type
<< "] is not supported for now";
return false;
}
// All scale/zero points are initializer scalars
// Check input scales and ZPs
if (!HasValidQuantizationScales(initializers, node_unit, {0, 1}, params, true /* is_input */))
return false;
if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0, 1}, true /* is_input */))
return false;
// Check output scale and ZP
if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */))
return false;
if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */))
return false;
}
} else {
LOGS_DEFAULT(VERBOSE) << "GemmOpSupportChecker, unknown op: " << op_type;
}
@ -1127,7 +1315,9 @@ class UnaryOpSupportChecker : public BaseOpSupportChecker {
int32_t GetMinSupportedNNAPIFeatureLevel(const NodeUnit& /* node_unit */,
const OpSupportCheckParams& params) const override;
bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override;
bool HasSupportedInputOutputsImpl(
const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit,
const OpSupportCheckParams& /* params */) const override;
int GetMinSupportedOpSet(const NodeUnit& node_unit) const override;
@ -1176,12 +1366,20 @@ int32_t UnaryOpSupportChecker::GetMinSupportedNNAPIFeatureLevel(const NodeUnit&
return ANEURALNETWORKS_FEATURE_LEVEL_1;
}
bool UnaryOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const {
bool UnaryOpSupportChecker::HasSupportedInputOutputsImpl(
const InitializedTensorSet& initializers, const NodeUnit& node_unit,
const OpSupportCheckParams& params) const {
// We only need to override input check for QLinearSigmoid
if (node_unit.OpType() != "QLinearSigmoid")
return BaseOpSupportChecker::HasSupportedInputsImpl(node_unit);
return BaseOpSupportChecker::HasSupportedInputOutputsImpl(initializers, node_unit, params);
return HasValidUnaryOpQuantizedInputs(node_unit);
if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Input))
return false;
if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Output))
return false;
return true;
}
// All ops except "Sin" opset 5- uses consumed_inputs attribute which is not supported for now
@ -1195,24 +1393,11 @@ int UnaryOpSupportChecker::GetMinSupportedOpSet(const NodeUnit& node_unit) const
}
/* static */ bool UnaryOpSupportChecker::IsQuantizedOpSupported(
const InitializedTensorSet& initializers, const NodeUnit& node_unit, const OpSupportCheckParams& params) {
const InitializedTensorSet& initializers, const NodeUnit& node_unit, const OpSupportCheckParams& /* params */) {
const auto& op_type = node_unit.OpType();
ORT_ENFORCE(op_type == "QLinearSigmoid");
const auto& op_name = node_unit.Name();
// Check input scales and ZPs
if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, true /* is_input */))
return false;
if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, true /* is_input */))
return false;
// Check output scale and ZP
if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */))
return false;
if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */))
return false;
// NNAPI requires the scale be 1.f/256 and zero point to be 0
// See https://android.googlesource.com/platform/frameworks/ml/+/refs/heads/android10-c2f2-release/nn/common/operations/Activation.cpp#180
float output_scale = 0.0f;
@ -1249,7 +1434,9 @@ class ConcatOpSupportChecker : public BaseOpSupportChecker {
bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
const OpSupportCheckParams& params) const override;
bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override;
bool HasSupportedInputOutputsImpl(
const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit,
const OpSupportCheckParams& /* params */) const override;
};
bool ConcatOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit,
@ -1268,7 +1455,9 @@ bool ConcatOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /* in
return true;
}
bool ConcatOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const {
bool ConcatOpSupportChecker::HasSupportedInputOutputsImpl(
const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit,
const OpSupportCheckParams& /* params */) const {
int32_t input_type;
if (!GetType(node_unit.Inputs()[0].node_arg, input_type))
return false;
@ -1331,37 +1520,17 @@ bool SqueezeOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& init
class QuantizeLinearOpSupportChecker : public BaseOpSupportChecker {
private:
bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
const OpSupportCheckParams& params) const override;
int32_t GetMinSupportedNNAPIFeatureLevel(const NodeUnit& /* node_unit */,
const OpSupportCheckParams& /* params */) const override {
return ANEURALNETWORKS_FEATURE_LEVEL_3;
}
};
bool QuantizeLinearOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
const OpSupportCheckParams& params) const {
int32_t output_type;
if (!GetType(node_unit.Outputs()[0].node_arg, output_type))
return false;
if (output_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) {
LOGS_DEFAULT(VERBOSE) << "[" << node_unit.OpType()
<< "] output type: [" << output_type
<< "] is not supported for now";
return false;
bool HasSupportedInputOutputsImpl(
const InitializedTensorSet& initializers, const NodeUnit& node_unit,
const OpSupportCheckParams& params) const override {
return IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Output);
}
// For QuantizeLinear only output is quantized
// Check output scale and ZP
if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */))
return false;
if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */))
return false;
return true;
}
};
#pragma endregion
@ -1369,42 +1538,17 @@ bool QuantizeLinearOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSe
class DequantizeLinearOpSupportChecker : public BaseOpSupportChecker {
private:
bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
const OpSupportCheckParams& params) const override;
int32_t GetMinSupportedNNAPIFeatureLevel(const NodeUnit& /* node_unit */,
const OpSupportCheckParams& /* params */) const override {
return ANEURALNETWORKS_FEATURE_LEVEL_1;
}
bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override;
};
bool DequantizeLinearOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
const OpSupportCheckParams& params) const {
// For DequantizeLinear only input is quantized
// Check input scale and ZP
if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, true /* is_input */))
return false;
if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, true /* is_input */))
return false;
return true;
}
bool DequantizeLinearOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const {
int32_t input_type;
if (!GetType(node_unit.Inputs()[0].node_arg, input_type))
return false;
if (input_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) {
LOGS_DEFAULT(VERBOSE) << "[" << node_unit.OpType()
<< "] Input type: [" << input_type
<< "] is not supported for now";
return false;
bool HasSupportedInputOutputsImpl(
const InitializedTensorSet& initializers, const NodeUnit& node_unit,
const OpSupportCheckParams& params) const override {
return IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Input);
}
return true;
}
};
#pragma endregion
@ -1480,7 +1624,9 @@ class ResizeOpSupportChecker : public BaseOpSupportChecker {
// We only support Resize opset 11+ here
int GetMinSupportedOpSet(const NodeUnit& /* node_unit */) const override { return 11; }
bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override;
bool HasSupportedInputOutputsImpl(
const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit,
const OpSupportCheckParams& /* params */) const override;
bool IsNodeUnitTypeSupported(const NodeUnit& /* node_unit */) const override { return true; }
static bool IsQuantizedOp(const NodeUnit& node_unit) ORT_MUST_USE_RESULT; // TODO, see if we want to move this to BaseOpBuilder
};
@ -1609,33 +1755,6 @@ bool ResizeOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initi
}
}
if (IsQuantizedOp(node_unit)) {
// For QDQResize, we only support uint8 output now
// TODO, add int8 support to NNAPI, and maybe move all the output type check into a virtual function
// similar to HasSupportedInputsImpl
int32_t output_type;
if (!GetType(node_unit.Outputs()[0].node_arg, output_type))
return false;
if (output_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) {
LOGS_DEFAULT(VERBOSE) << "[Resize] output type: [" << output_type
<< "] is not supported for now";
return false;
}
// Check input scales and ZPs
if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, true /* is_input */))
return false;
if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, true /* is_input */))
return false;
// Check output scale and ZP
if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */))
return false;
if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */))
return false;
}
return true;
}
@ -1653,7 +1772,9 @@ int32_t ResizeOpSupportChecker::GetMinSupportedNNAPIFeatureLevel(const NodeUnit&
return ANEURALNETWORKS_FEATURE_LEVEL_2;
}
bool ResizeOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const {
bool ResizeOpSupportChecker::HasSupportedInputOutputsImpl(
const InitializedTensorSet& initializers, const NodeUnit& node_unit,
const OpSupportCheckParams& params) const {
int32_t input_type;
if (!GetType(node_unit.Inputs()[0].node_arg, input_type))
return false;
@ -1666,6 +1787,14 @@ bool ResizeOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) c
return false;
}
if (IsQuantizedOp(node_unit)) {
if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Input))
return false;
if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Output))
return false;
}
return true;
}
@ -1870,6 +1999,7 @@ static OpSupportCheckerRegistrations CreateOpSupportCheckerRegistrations() {
NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Mul", BinaryOpSupportChecker);
NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Pow", BinaryOpSupportChecker);
NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("QLinearAdd", BinaryOpSupportChecker);
NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("QLinearMul", BinaryOpSupportChecker);
NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Sub", BinaryOpSupportChecker);
}

View file

@ -43,61 +43,87 @@ void RunQLinearMathTestFromFloat(
const quantization::Params<T>& a_params,
const std::vector<float>& b, const std::vector<int64_t>& b_shape_origin,
const quantization::Params<T>& b_params,
const quantization::Params<T>& c_params,
bool input_b_is_initializer = false,
bool all_initializer_scale_zero_point = false) {
size_t number_dims = std::max(a_shape_origin.size(), b_shape_origin.size());
std::vector<int64_t> a_shape = PrefixingDims(a_shape_origin, number_dims);
std::vector<int64_t> b_shape = PrefixingDims(b_shape_origin, number_dims);
// calc broadcasting shaped
std::vector<int64_t> c_shape(number_dims, 1);
for (size_t axis = 0; axis < number_dims; ++axis) {
if (a_shape[axis] != b_shape[axis] && (a_shape[axis] != 1 && b_shape[axis] != 1)) {
ORT_THROW("Shapes can not be broadcasted");
}
c_shape[axis] = std::max(a_shape[axis], b_shape[axis]);
}
std::vector<int64_t> a_strides, b_strides, c_strides;
auto c_size = CalcStrides(c_shape, c_strides, false);
auto a_size = CalcStrides(a_shape, a_strides, true);
auto b_size = CalcStrides(b_shape, b_strides, true);
if (a_size != static_cast<int64_t>(a.size()) || b_size != static_cast<int64_t>(b.size())) {
ORT_THROW("Input size not match input shape!");
}
constexpr int qmax = std::numeric_limits<T>::max();
constexpr int qmin = std::numeric_limits<T>::min();
OpTester test(op_name, 1, onnxruntime::kMSDomain);
std::vector<T> a_quantized = QuantizeTestVector<T>(a, a_params);
test.template AddInput<T>("A", a_shape_origin, a_quantized);
test.AddInput<float>("A_scale", {}, {a_params.scale}, all_initializer_scale_zero_point);
test.template AddInput<T>("A_zero_point", {}, {a_params.zero_point}, all_initializer_scale_zero_point);
std::vector<T> b_quantized = QuantizeTestVector<T>(b, b_params);
test.template AddInput<T>("B", b_shape_origin, b_quantized, input_b_is_initializer);
test.AddInput<float>("B_scale", {}, {b_params.scale}, all_initializer_scale_zero_point);
test.template AddInput<T>("B_zero_point", {}, {b_params.zero_point}, all_initializer_scale_zero_point);
test.AddInput<float>("C_scale", {}, {c_params.scale}, all_initializer_scale_zero_point);
test.template AddInput<T>("C_zero_point", {}, {c_params.zero_point}, all_initializer_scale_zero_point);
std::vector<T> c(c_size);
for (int64_t offset = 0; offset < c_size; ++offset) {
int64_t remain = offset, a_offset = 0, b_offset = 0;
const quantization::Params<T>& c_params) {
const auto run_test = [&](bool input_b_is_initializer,
bool all_initializer_scale_zero_point) {
size_t number_dims = std::max(a_shape_origin.size(), b_shape_origin.size());
std::vector<int64_t> a_shape = PrefixingDims(a_shape_origin, number_dims);
std::vector<int64_t> b_shape = PrefixingDims(b_shape_origin, number_dims);
// Compute the broadcast output shape
std::vector<int64_t> c_shape(number_dims, 1);
for (size_t axis = 0; axis < number_dims; ++axis) {
int64_t index = remain / c_strides[axis];
remain = remain % c_strides[axis];
a_offset += index * a_strides[axis];
b_offset += index * b_strides[axis];
if (a_shape[axis] != b_shape[axis] && (a_shape[axis] != 1 && b_shape[axis] != 1)) {
ORT_THROW("Shapes can not be broadcasted");
}
c_shape[axis] = std::max(a_shape[axis], b_shape[axis]);
}
float a_dequantized = quantization::Dequantize(a_quantized[a_offset], a_params);
float b_dequantized = quantization::Dequantize(b_quantized[b_offset], b_params);
c[offset] = clampi<T>(static_cast<int>(std::nearbyintf(calc(a_dequantized, b_dequantized) / c_params.scale)) + c_params.zero_point, qmin, qmax);
}
test.template AddOutput<T>("C", c_shape, c);
std::vector<int64_t> a_strides, b_strides, c_strides;
auto c_size = CalcStrides(c_shape, c_strides, false);
auto a_size = CalcStrides(a_shape, a_strides, true);
auto b_size = CalcStrides(b_shape, b_strides, true);
if (a_size != static_cast<int64_t>(a.size()) || b_size != static_cast<int64_t>(b.size())) {
ORT_THROW("Input size not match input shape!");
}
constexpr int qmax = std::numeric_limits<T>::max();
constexpr int qmin = std::numeric_limits<T>::min();
test.Run();
OpTester test(op_name, 1, onnxruntime::kMSDomain);
std::vector<T> a_quantized = QuantizeTestVector<T>(a, a_params);
test.template AddInput<T>("A", a_shape_origin, a_quantized);
test.AddInput<float>("A_scale", {}, {a_params.scale}, all_initializer_scale_zero_point);
test.template AddInput<T>("A_zero_point", {}, {a_params.zero_point}, all_initializer_scale_zero_point);
std::vector<T> b_quantized = QuantizeTestVector<T>(b, b_params);
test.template AddInput<T>("B", b_shape_origin, b_quantized, input_b_is_initializer);
test.AddInput<float>("B_scale", {}, {b_params.scale}, all_initializer_scale_zero_point);
test.template AddInput<T>("B_zero_point", {}, {b_params.zero_point}, all_initializer_scale_zero_point);
test.AddInput<float>("C_scale", {}, {c_params.scale}, all_initializer_scale_zero_point);
test.template AddInput<T>("C_zero_point", {}, {c_params.zero_point}, all_initializer_scale_zero_point);
std::vector<T> c(c_size);
for (int64_t offset = 0; offset < c_size; ++offset) {
int64_t remain = offset, a_offset = 0, b_offset = 0;
for (size_t axis = 0; axis < number_dims; ++axis) {
int64_t index = remain / c_strides[axis];
remain = remain % c_strides[axis];
a_offset += index * a_strides[axis];
b_offset += index * b_strides[axis];
}
float a_dequantized = quantization::Dequantize(a_quantized[a_offset], a_params);
float b_dequantized = quantization::Dequantize(b_quantized[b_offset], b_params);
c[offset] = clampi<T>(static_cast<int>(std::nearbyintf(calc(a_dequantized, b_dequantized) / c_params.scale)) + c_params.zero_point, qmin, qmax);
}
float abs_error = 0.0f;
// For quantized models, NNAPI's rounding is different than CPU provider
// Sometimes the result is within +/-1 of result of CPU provider
// For ONNX, we use rounding to nearest ties to even.
// For NNAPI, it is using std::round which is HALF_AWAY_FROM_ZERO, see
// https://android.googlesource.com/platform/frameworks/ml/+/refs/heads/master/nn/common/operations/Quantize.cpp
// Use 1 as abs_error which is the smallest possible for uint8_t
//
// NOTE, for now the tolerance will only apply if the NNAPI is actually used,
// if for any reason the execution falls back to CPU, we still expect an exact match
// See, 'void Check<uint8_t>(...' in onnxruntime/test/providers/provider_test_utils.cc
#ifdef USE_NNAPI
abs_error = 1.0f;
#endif
test.template AddOutput<T>("C", c_shape, c, false /* sort_output */, 0.0f /* rel_error */, abs_error);
test.Run();
};
run_test(false /* input_b_is_initializer */, false /* all_initializer_scale_zero_point */);
// NNAPI will require all the scales and zero points be initializers
run_test(false /* input_b_is_initializer */, true /* all_initializer_scale_zero_point */);
// We also want to test the case input B is an initializer
run_test(true /* input_b_is_initializer */, true /* all_initializer_scale_zero_point */);
}
// total 32 + 31 elements to cover all path
@ -145,22 +171,6 @@ TEST(QLinearBinaryOpTest, AddU8VectorVectorFull) {
A, {63}, A_params,
B, {63}, B_params,
C_params);
// NNAPI will require all the scales and zero points be initializers
// We also want to test the case input B is an initializer
RunQLinearMathTestFromFloat("QLinearAdd", add_function,
A, {63}, A_params,
B, {63}, B_params,
C_params,
false /* input_b_is_initializer */,
true /* all_initializer_scale_zero_point */);
RunQLinearMathTestFromFloat("QLinearAdd", add_function,
A, {63}, A_params,
B, {63}, B_params,
C_params,
true /* input_b_is_initializer */,
true /* all_initializer_scale_zero_point */);
}
TEST(QLinearBinaryOpTest, AddU8VectorVectorBroadcast) {
@ -180,22 +190,6 @@ TEST(QLinearBinaryOpTest, AddU8VectorVectorBroadcast) {
A, {3, 3, 7}, A_params,
B, {3, 1, 7}, B_params,
C_params);
// NNAPI will require all the scales and zero points be initializers
// We also want to test the case input B is an initializer
RunQLinearMathTestFromFloat("QLinearAdd", add_function,
A, {3, 3, 7}, A_params,
B, {3, 1, 7}, B_params,
C_params,
false /* input_b_is_initializer */,
true /* all_initializer_scale_zero_point */);
RunQLinearMathTestFromFloat("QLinearAdd", add_function,
A, {3, 3, 7}, A_params,
B, {3, 1, 7}, B_params,
C_params,
true /* input_b_is_initializer */,
true /* all_initializer_scale_zero_point */);
}
TEST(QLinearBinaryOpTest, AddU8ScalarVectorFull) {
@ -212,22 +206,6 @@ TEST(QLinearBinaryOpTest, AddU8ScalarVectorFull) {
B, {1}, B_params,
A, {63}, A_params,
C_params);
// NNAPI will require all the scales and zero points be initializers
// We also want to test the case input B is an initializer
RunQLinearMathTestFromFloat("QLinearAdd", add_function,
B, {1}, B_params,
A, {63}, A_params,
C_params,
false /* input_b_is_initializer */,
true /* all_initializer_scale_zero_point */);
RunQLinearMathTestFromFloat("QLinearAdd", add_function,
B, {1}, B_params,
A, {63}, A_params,
C_params,
true /* input_b_is_initializer */,
true /* all_initializer_scale_zero_point */);
}
TEST(QLinearBinaryOpTest, AddU8ScalarVectorBroadcast) {
@ -244,22 +222,6 @@ TEST(QLinearBinaryOpTest, AddU8ScalarVectorBroadcast) {
B, {3, 1, 1}, B_params,
A, {3, 7, 3}, A_params,
C_params);
// NNAPI will require all the scales and zero points be initializers
// We also want to test the case input B is an initializer
RunQLinearMathTestFromFloat("QLinearAdd", add_function,
B, {3, 1, 1}, B_params,
A, {3, 7, 3}, A_params,
C_params,
false /* input_b_is_initializer */,
true /* all_initializer_scale_zero_point */);
RunQLinearMathTestFromFloat("QLinearAdd", add_function,
B, {3, 1, 1}, B_params,
A, {3, 7, 3}, A_params,
C_params,
true /* input_b_is_initializer */,
true /* all_initializer_scale_zero_point */);
}
TEST(QLinearBinaryOpTest, AddU8VectorScalarFull) {
@ -276,22 +238,6 @@ TEST(QLinearBinaryOpTest, AddU8VectorScalarFull) {
A, {63}, A_params,
B, {1}, B_params,
C_params);
// NNAPI will require all the scales and zero points be initializers
// We also want to test the case input B is an initializer
RunQLinearMathTestFromFloat("QLinearAdd", add_function,
A, {63}, A_params,
B, {1}, B_params,
C_params,
false /* input_b_is_initializer */,
true /* all_initializer_scale_zero_point */);
RunQLinearMathTestFromFloat("QLinearAdd", add_function,
A, {63}, A_params,
B, {1}, B_params,
C_params,
true /* input_b_is_initializer */,
true /* all_initializer_scale_zero_point */);
}
TEST(QLinearBinaryOpTest, AddU8VectorScalarBroadcast) {
@ -308,22 +254,6 @@ TEST(QLinearBinaryOpTest, AddU8VectorScalarBroadcast) {
A, {3, 7, 3}, A_params,
B, {1, 1, 3}, B_params,
C_params);
// NNAPI will require all the scales and zero points be initializers
// We also want to test the case input B is an initializer
RunQLinearMathTestFromFloat("QLinearAdd", add_function,
A, {3, 7, 3}, A_params,
B, {1, 1, 3}, B_params,
C_params,
false /* input_b_is_initializer */,
true /* all_initializer_scale_zero_point */);
RunQLinearMathTestFromFloat("QLinearAdd", add_function,
A, {3, 7, 3}, A_params,
B, {1, 1, 3}, B_params,
C_params,
true /* input_b_is_initializer */,
true /* all_initializer_scale_zero_point */);
}
TEST(QLinearBinaryOpTest, AddS8VectorVectorFull) {

View file

@ -81,10 +81,27 @@ GetQDQTestCaseFn BuildQDQConvTestCase(const std::vector<int64_t>& input_shape, c
template <typename InputType, typename OutputType>
GetQDQTestCaseFn BuildQDQAveragePoolTestCase(const std::vector<int64_t>& input_shape) {
return [input_shape](ModelTestBuilder& builder) {
#ifdef USE_NNAPI // NNAPI require consistent scales/ZPs for DQ -> Pool -> Q
float dq_scale = 0.0038f;
float pool_output_scale = 0.0038f;
float q_scale = 0.0038f;
InputType dq_zp = std::numeric_limits<OutputType>::max() / 2;
InputType pool_output_zp = std::numeric_limits<OutputType>::max() / 2;
InputType q_zp = std::numeric_limits<OutputType>::max() / 2;
#else
float dq_scale = 0.0035f;
float pool_output_scale = 0.0038f;
float q_scale = 0.0039f;
InputType dq_zp = 7;
InputType pool_output_zp = std::numeric_limits<OutputType>::max() / 2;
InputType q_zp = std::numeric_limits<OutputType>::max() / 2;
#endif
auto* input_arg = builder.MakeInput<float>(input_shape, -1.f, 1.f);
auto* output_arg = builder.MakeOutput();
// add QDQ + AveragePool
auto* dq_output = AddQDQNodePair<InputType>(builder, input_arg, .0035f, 7);
auto* dq_output = AddQDQNodePair<InputType>(builder, input_arg, dq_scale, dq_zp);
auto* averagepool_output = builder.MakeIntermediate();
Node& pool_node = builder.AddNode("AveragePool", {dq_output}, {averagepool_output});
std::vector<int64_t> pads((input_shape.size() - 2) * 2, 1);
@ -95,12 +112,12 @@ GetQDQTestCaseFn BuildQDQAveragePoolTestCase(const std::vector<int64_t>& input_s
// add QDQ output
auto* q_output = builder.MakeIntermediate();
builder.AddQuantizeLinearNode<OutputType>(averagepool_output,
.0038f,
std::numeric_limits<OutputType>::max() / 2,
pool_output_scale,
pool_output_zp,
q_output);
builder.AddDequantizeLinearNode<OutputType>(q_output,
.0039f,
std::numeric_limits<OutputType>::max() / 2,
q_scale,
q_zp,
output_arg);
};
}
@ -110,5 +127,65 @@ GetQDQTestCaseFn BuildQDQResizeTestCase(const std::vector<int64_t>& input_shape,
const std::string& mode = "nearest",
const std::string& coordinate_transformation_mode = "half_pixel");
// Returns a test-case builder for a QDQ binary-op graph:
//   two float inputs of shape input_shape each pass through a Q -> DQ pair,
//   feed a single op_type node, and the result passes through a final Q -> DQ pair.
// Input1Type / Input2Type / OutputType select the quantized element type of the
// first input pair, the second input pair, and the output pair respectively.
// Every Q/DQ node uses max()/2 of its element type as the zero point.
template <typename Input1Type, typename Input2Type, typename OutputType>
GetQDQTestCaseFn BuildBinaryOpTestCase(const std::vector<int64_t>& input_shape,
                                       const std::string& op_type) {
  return [input_shape, op_type](ModelTestBuilder& builder) {
    // These two scales are the same in every build configuration.
    const float q_scale = 0.008f;
    const float op_output_scale = 0.0076f;
#ifdef USE_NNAPI
    // NNAPI requires consistent scales for DQ -> op input and op output -> Q,
    // so each Q/DQ pair shares one scale in NNAPI builds.
    const float op_input_scale = q_scale;
    const float dq_scale = op_output_scale;
#else
    // Otherwise use slightly different scales within each Q/DQ pair.
    const float op_input_scale = 0.0079f;
    const float dq_scale = 0.0078f;
#endif

    // Mid-range zero points, one per quantized element type.
    constexpr auto lhs_zero_point = static_cast<Input1Type>(std::numeric_limits<Input1Type>::max() / 2);
    constexpr auto rhs_zero_point = static_cast<Input2Type>(std::numeric_limits<Input2Type>::max() / 2);
    constexpr auto out_zero_point = static_cast<OutputType>(std::numeric_limits<OutputType>::max() / 2);

    auto* lhs_input = builder.MakeInput<float>(input_shape, -1.f, 1.f);
    auto* rhs_input = builder.MakeInput<float>(input_shape, -1.f, 1.f);
    auto* graph_output = builder.MakeOutput();

    // Q -> DQ on the first input
    auto* lhs_quantized = builder.MakeIntermediate();
    auto* lhs_dequantized = builder.MakeIntermediate();
    builder.AddQuantizeLinearNode<Input1Type>(lhs_input, q_scale, lhs_zero_point, lhs_quantized);
    builder.AddDequantizeLinearNode<Input1Type>(lhs_quantized, op_input_scale, lhs_zero_point, lhs_dequantized);

    // Q -> DQ on the second input
    auto* rhs_quantized = builder.MakeIntermediate();
    auto* rhs_dequantized = builder.MakeIntermediate();
    builder.AddQuantizeLinearNode<Input2Type>(rhs_input, q_scale, rhs_zero_point, rhs_quantized);
    builder.AddDequantizeLinearNode<Input2Type>(rhs_quantized, op_input_scale, rhs_zero_point, rhs_dequantized);

    // the binary operator under test
    auto* op_result = builder.MakeIntermediate();
    builder.AddNode(op_type, {lhs_dequantized, rhs_dequantized}, {op_result});

    // Q -> DQ on the operator output
    auto* out_quantized = builder.MakeIntermediate();
    builder.AddQuantizeLinearNode<OutputType>(op_result, op_output_scale, out_zero_point, out_quantized);
    builder.AddDequantizeLinearNode<OutputType>(out_quantized, dq_scale, out_zero_point, graph_output);
  };
}
} // namespace test
} // namespace onnxruntime

View file

@ -39,7 +39,7 @@ namespace test {
template <typename InputType, typename WeightType, typename BiasType, typename OutputType>
void QDQTransformerConvTests() {
auto test_case = [&](const std::vector<int64_t>& input_shape, const std::vector<int64_t>& weights_shape) {
auto check_conv_graph = [&](InferenceSessionWrapper& session) {
auto check_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
if constexpr (std::is_same<InputType, OutputType>::value &&
std::is_same<BiasType, int32_t>::value &&
@ -57,7 +57,7 @@ void QDQTransformerConvTests() {
};
TransformerTester(BuildQDQConvTestCase<InputType, WeightType, BiasType, OutputType>(input_shape, weights_shape),
check_conv_graph,
check_graph,
TransformerLevel::Level1,
TransformerLevel::Level2,
12 /*opset_version*/,
@ -136,7 +136,7 @@ TEST(QDQTransformerTests, ConvMaxPoolReshape_UInt8) {
builder.AddQuantizeLinearNode<uint8_t>(reshape_output, .0039f, 135, output_arg);
};
auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
auto check_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
EXPECT_EQ(op_to_count["QLinearConv"], 1);
EXPECT_EQ(op_to_count["MaxPool"], 1);
@ -146,7 +146,7 @@ TEST(QDQTransformerTests, ConvMaxPoolReshape_UInt8) {
};
TransformerTester(build_test_case,
check_mp_reshape_graph,
check_graph,
TransformerLevel::Level1,
TransformerLevel::Level2,
opset_version);
@ -197,7 +197,7 @@ TEST(QDQTransformerTests, ConvMaxPoolReshape_Int8) {
}
};
auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
auto check_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
EXPECT_EQ(op_to_count["QLinearConv"], 1);
EXPECT_EQ(op_to_count["MaxPool"], 1);
@ -206,7 +206,7 @@ TEST(QDQTransformerTests, ConvMaxPoolReshape_Int8) {
EXPECT_EQ(op_to_count["DequantizeLinear"], 0);
};
TransformerTester(build_test_case, check_mp_reshape_graph, TransformerLevel::Level1, TransformerLevel::Level2);
TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2);
};
test_case({1, 12, 37}, {32, 12, 5});
@ -217,7 +217,7 @@ TEST(QDQTransformerTests, ConvMaxPoolReshape_Int8) {
template <typename InputType, typename OutputType>
void QDQTransformerAveragePoolTests() {
auto test_case = [&](const std::vector<int64_t>& input_shape) {
auto check_averagepool_op_graph = [&](InferenceSessionWrapper& session) {
auto check_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
if constexpr (std::is_same<InputType, OutputType>::value) {
EXPECT_EQ(op_to_count["com.microsoft.QLinearAveragePool"], 1);
@ -233,7 +233,7 @@ void QDQTransformerAveragePoolTests() {
};
TransformerTester(BuildQDQAveragePoolTestCase<InputType, OutputType>(input_shape),
check_averagepool_op_graph,
check_graph,
TransformerLevel::Level1,
TransformerLevel::Level2,
12 /*opset_version*/,
@ -266,52 +266,7 @@ TEST(QDQTransformerTests, AveragePool_U8S8) {
template <typename Input1Type, typename Input2Type, typename OutputType>
void QDQTransformerBinaryOpTests(const std::string& op_type) {
auto test_case = [&](const std::vector<int64_t>& input_shape) {
auto build_test_case = [&](ModelTestBuilder& builder) {
auto* input1_arg = builder.MakeInput<float>(input_shape, -1.f, 1.f);
auto* input2_arg = builder.MakeInput<float>(input_shape, -1.f, 1.f);
auto* output_arg = builder.MakeOutput();
// add QDQ 1
auto* q1_output = builder.MakeIntermediate();
auto* dq1_output = builder.MakeIntermediate();
builder.AddQuantizeLinearNode<Input1Type>(input1_arg,
.004f,
std::numeric_limits<Input1Type>::max() / 2,
q1_output);
builder.AddDequantizeLinearNode<Input1Type>(q1_output,
.0039f,
std::numeric_limits<Input1Type>::max() / 2,
dq1_output);
// add QDQ 2
auto* q2_output = builder.MakeIntermediate();
auto* dq2_output = builder.MakeIntermediate();
builder.AddQuantizeLinearNode<Input2Type>(input2_arg,
.004f,
std::numeric_limits<Input2Type>::max() / 2,
q2_output);
builder.AddDequantizeLinearNode<Input2Type>(q2_output,
.0039f,
std::numeric_limits<Input2Type>::max() / 2,
dq2_output);
// add binary operator
auto* binary_op_output = builder.MakeIntermediate();
builder.AddNode(op_type, {dq1_output, dq2_output}, {binary_op_output});
// add QDQ output
auto* q3_output = builder.MakeIntermediate();
builder.AddQuantizeLinearNode<OutputType>(binary_op_output,
.0038f,
std::numeric_limits<OutputType>::max() / 2,
q3_output);
builder.AddDequantizeLinearNode<OutputType>(q3_output,
.0039f,
std::numeric_limits<OutputType>::max() / 2,
output_arg);
};
auto check_binary_op_graph = [&](InferenceSessionWrapper& session) {
auto check_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
if (std::is_same<Input1Type, Input2Type>::value &&
std::is_same<Input1Type, OutputType>::value) {
@ -327,8 +282,8 @@ void QDQTransformerBinaryOpTests(const std::string& op_type) {
}
};
TransformerTester(build_test_case,
check_binary_op_graph,
TransformerTester(BuildBinaryOpTestCase<Input1Type, Input2Type, OutputType>(input_shape, op_type),
check_graph,
TransformerLevel::Level1,
TransformerLevel::Level2,
12 /*opset_version*/,
@ -426,7 +381,7 @@ void QDQTransformerMatMulTests(bool has_output_q) {
}
};
auto check_binary_op_graph = [&](InferenceSessionWrapper& session) {
auto check_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
if (has_output_q) {
if constexpr (std::is_same<Input1Type, OutputType>::value &&
@ -459,7 +414,7 @@ void QDQTransformerMatMulTests(bool has_output_q) {
};
TransformerTester(build_test_case,
check_binary_op_graph,
check_graph,
TransformerLevel::Level1,
TransformerLevel::Level2,
12 /*opset_version*/,
@ -696,14 +651,14 @@ TEST(QDQTransformerTests, Gather) {
builder.AddQuantizeLinearNode<int8_t>(gather_output, .003f, 1, output_arg);
};
auto check_matmul_graph = [&](InferenceSessionWrapper& session) {
auto check_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
EXPECT_EQ(op_to_count["Gather"], 1);
EXPECT_EQ(op_to_count["QuantizeLinear"], 0);
EXPECT_EQ(op_to_count["DequantizeLinear"], 0);
};
TransformerTester(build_test_case, check_matmul_graph, TransformerLevel::Level1, TransformerLevel::Level2);
TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2);
};
test_case({12, 37}, {24, 12});
@ -728,14 +683,14 @@ TEST(QDQTransformerTests, Transpose) {
builder.AddQuantizeLinearNode<int8_t>(transpose_output, .003f, 1, output_arg);
};
auto check_matmul_graph = [&](InferenceSessionWrapper& session) {
auto check_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
EXPECT_EQ(op_to_count["Transpose"], 1);
EXPECT_EQ(op_to_count["QuantizeLinear"], 0);
EXPECT_EQ(op_to_count["DequantizeLinear"], 0);
};
TransformerTester(build_test_case, check_matmul_graph, TransformerLevel::Level1, TransformerLevel::Level2);
TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2);
};
test_case({2, 13, 12, 37}, {0, 3, 1, 2});
@ -760,13 +715,13 @@ TEST(QDQTransformerTests, Transpose_No_Fusion) {
builder.AddQuantizeLinearNode<int8_t>(transpose_output, .003f, 1, output_arg);
};
auto check_matmul_graph = [&](InferenceSessionWrapper& session) {
auto check_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
EXPECT_EQ(op_to_count["QuantizeLinear"], 1);
EXPECT_EQ(op_to_count["DequantizeLinear"], 1);
};
TransformerTester(build_test_case, check_matmul_graph, TransformerLevel::Level1, TransformerLevel::Level2);
TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2);
};
test_case({2, 13, 12, 37}, {0, 3, 1, 2});
@ -775,7 +730,7 @@ TEST(QDQTransformerTests, Transpose_No_Fusion) {
TEST(QDQTransformerTests, Resize) {
auto test_case = [&](const std::vector<int64_t>& input1_shape,
const std::vector<int64_t>& sizes_shape) {
auto check_resize_graph = [&](InferenceSessionWrapper& session) {
auto check_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
EXPECT_EQ(op_to_count["Resize"], 1);
EXPECT_EQ(op_to_count["QuantizeLinear"], 0);
@ -783,7 +738,7 @@ TEST(QDQTransformerTests, Resize) {
};
TransformerTester(BuildQDQResizeTestCase(input1_shape, sizes_shape),
check_resize_graph,
check_graph,
TransformerLevel::Level1,
TransformerLevel::Level2);
};
@ -828,7 +783,7 @@ TEST(QDQTransformerTests, Resize_No_Fusion) {
builder.AddQuantizeLinearNode<uint8_t>(resize_output, .003f, 1, output_arg);
};
auto check_qdq_graph = [&](InferenceSessionWrapper& session) {
auto check_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
EXPECT_EQ(op_to_count["Resize"], 1);
EXPECT_EQ(op_to_count["Concat"], 1);
@ -836,7 +791,7 @@ TEST(QDQTransformerTests, Resize_No_Fusion) {
EXPECT_EQ(op_to_count["DequantizeLinear"], 1);
};
TransformerTester(build_test_case, check_qdq_graph,
TransformerTester(build_test_case, check_graph,
TransformerLevel::Level1,
TransformerLevel::Level2);
};
@ -867,7 +822,7 @@ TEST(QDQTransformerTests, ResizeReshape) {
builder.AddNode("Reshape", {qdq_resize_output, reshape_shape}, {output_arg});
};
auto check_qdq_graph = [&](InferenceSessionWrapper& session) {
auto check_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
EXPECT_EQ(op_to_count["Resize"], 1);
EXPECT_EQ(op_to_count["Reshape"], 1);
@ -875,7 +830,7 @@ TEST(QDQTransformerTests, ResizeReshape) {
EXPECT_EQ(op_to_count["DequantizeLinear"], 1);
};
TransformerTester(build_test_case, check_qdq_graph,
TransformerTester(build_test_case, check_graph,
TransformerLevel::Level1,
TransformerLevel::Level2);
};
@ -905,13 +860,13 @@ TEST(QDQTransformerTests, ArgMax) {
argmax_node.AddAttribute("select_last_index", static_cast<int64_t>(select_last_index));
};
auto check_argmax_graph = [&](InferenceSessionWrapper& session) {
auto check_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
EXPECT_EQ(op_to_count["ArgMax"], 1);
EXPECT_EQ(op_to_count["DequantizeLinear"], 0);
};
TransformerTester(build_test_case, check_argmax_graph,
TransformerTester(build_test_case, check_graph,
TransformerLevel::Level1,
TransformerLevel::Level2,
/* opset_version */ 13);
@ -939,14 +894,14 @@ TEST(QDQTransformerTests, QLinearMatMul) {
builder.AddQuantizeLinearNode<uint8_t>(matmul_output, .0039f, 135, output_arg);
};
auto check_matmul_graph = [&](InferenceSessionWrapper& session) {
auto check_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
EXPECT_EQ(op_to_count["QLinearMatMul"], 1);
EXPECT_EQ(op_to_count["QuantizeLinear"], 2);
EXPECT_EQ(op_to_count["DequantizeLinear"], 0);
};
TransformerTester(build_test_case, check_matmul_graph, TransformerLevel::Level1, TransformerLevel::Level2);
TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2);
};
test_case({12, 37}, {37, 12});
@ -970,7 +925,7 @@ TEST(QDQTransformerTests, MatMul_No_Fusion) {
builder.AddQuantizeLinearNode<uint8_t>(matmul_output, .0039f, 135, output_arg);
};
auto check_matmul_graph = [&](InferenceSessionWrapper& session) {
auto check_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
EXPECT_EQ(op_to_count["MatMul"], 1);
EXPECT_EQ(op_to_count["QLinearMatMul"], 0);
@ -978,7 +933,7 @@ TEST(QDQTransformerTests, MatMul_No_Fusion) {
EXPECT_EQ(op_to_count["DequantizeLinear"], 1);
};
TransformerTester(build_test_case, check_matmul_graph, TransformerLevel::Level1, TransformerLevel::Level2);
TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2);
};
test_case({12, 37}, {37, 12});
@ -1006,7 +961,7 @@ TEST(QDQTransformerTests, MatMul_1st_Input_Int8) {
builder.AddQuantizeLinearNode<uint8_t>(matmul_output, .0039f, 135, output_arg);
};
auto check_matmul_graph = [&](InferenceSessionWrapper& session) {
auto check_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
EXPECT_EQ(op_to_count["MatMul"], 1);
EXPECT_EQ(op_to_count["QLinearMatMul"], 0);
@ -1014,7 +969,7 @@ TEST(QDQTransformerTests, MatMul_1st_Input_Int8) {
EXPECT_EQ(op_to_count["DequantizeLinear"], 2);
};
TransformerTester(build_test_case, check_matmul_graph, TransformerLevel::Level1, TransformerLevel::Level2);
TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2);
};
test_case({12, 37}, {37, 12});
@ -1043,7 +998,7 @@ TEST(QDQTransformerTests, MatMulIntegerToFloat) {
builder.AddNode("MatMul", {dq_output_1, dq_output_2}, {output_arg});
};
auto check_matmul_graph = [&](InferenceSessionWrapper& session) {
auto check_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
EXPECT_EQ(op_to_count["com.microsoft.MatMulIntegerToFloat"], 1);
EXPECT_EQ(op_to_count["QuantizeLinear"], 0);
@ -1051,7 +1006,7 @@ TEST(QDQTransformerTests, MatMulIntegerToFloat) {
};
TransformerTester(build_test_case,
check_matmul_graph,
check_graph,
TransformerLevel::Level1,
TransformerLevel::Level2,
12 /*opset_version*/,
@ -1086,7 +1041,7 @@ TEST(QDQTransformerTests, ConvRelu) {
builder.AddQuantizeLinearNode<uint8_t>(relu_output, .0039f, is_zp_zero ? 0 : 1, output_arg);
};
auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
auto check_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
if (is_zp_zero) {
EXPECT_EQ(op_to_count["QLinearConv"], 1);
@ -1104,7 +1059,7 @@ TEST(QDQTransformerTests, ConvRelu) {
}
};
TransformerTester(build_test_case, check_mp_reshape_graph, TransformerLevel::Level1, TransformerLevel::Level2);
TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2);
};
test_case({1, 12, 37}, {32, 12, 5}, true);
@ -1150,7 +1105,7 @@ TEST(QDQTransformerTests, ConvAveragePoolReshape_UInt8) {
builder.AddDequantizeLinearNode<uint8_t>(q_output, .0035f, 135, output_arg);
};
auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
auto check_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
EXPECT_EQ(op_to_count["QLinearConv"], 1);
EXPECT_EQ(op_to_count["com.microsoft.QLinearAveragePool"], 1);
@ -1160,7 +1115,7 @@ TEST(QDQTransformerTests, ConvAveragePoolReshape_UInt8) {
};
TransformerTester(build_test_case,
check_mp_reshape_graph,
check_graph,
TransformerLevel::Level1,
TransformerLevel::Level2,
12 /*opset_version*/,
@ -1213,7 +1168,7 @@ TEST(QDQTransformerTests, ConvAveragePoolReshape_Int8) {
}
};
auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
auto check_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
EXPECT_EQ(op_to_count["QLinearConv"], 1);
EXPECT_EQ(op_to_count["com.microsoft.QLinearAveragePool"], 1);
@ -1223,7 +1178,7 @@ TEST(QDQTransformerTests, ConvAveragePoolReshape_Int8) {
};
TransformerTester(build_test_case,
check_mp_reshape_graph,
check_graph,
TransformerLevel::Level1,
TransformerLevel::Level2,
12 /*opset_version*/,
@ -1277,7 +1232,7 @@ TEST(QDQTransformerTests, ConvAveragePoolReshape_Int8_Fail) {
}
};
auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
auto check_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
EXPECT_EQ(op_to_count["Conv"], 1);
EXPECT_EQ(op_to_count["QLinearConv"], 0);
@ -1288,7 +1243,7 @@ TEST(QDQTransformerTests, ConvAveragePoolReshape_Int8_Fail) {
};
TransformerTester(build_test_case,
check_mp_reshape_graph,
check_graph,
TransformerLevel::Level1,
TransformerLevel::Level2,
12 /*opset_version*/,
@ -1325,7 +1280,7 @@ void QDQTransformerLeakyReluTests() {
output_arg);
};
auto check_binary_op_graph = [&](InferenceSessionWrapper& session) {
auto check_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
if constexpr (std::is_same<InputType, OutputType>::value) {
EXPECT_EQ(op_to_count["com.microsoft.QLinearLeakyRelu"], 1);
@ -1341,7 +1296,7 @@ void QDQTransformerLeakyReluTests() {
};
TransformerTester(build_test_case,
check_binary_op_graph,
check_graph,
TransformerLevel::Level1,
TransformerLevel::Level2,
12 /*opset_version*/,
@ -1401,7 +1356,7 @@ TEST(QDQTransformerTests, ConvTranspose_QBackward) {
}
};
auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
auto check_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
EXPECT_EQ(op_to_count["QLinearConv"], 1);
EXPECT_EQ(op_to_count["Transpose"], 1);
@ -1410,7 +1365,7 @@ TEST(QDQTransformerTests, ConvTranspose_QBackward) {
};
TransformerTester(build_test_case,
check_mp_reshape_graph,
check_graph,
TransformerLevel::Level1,
TransformerLevel::Level2);
};
@ -1461,7 +1416,7 @@ TEST(QDQTransformerTests, QBackward_MutilpleSteps) {
}
};
auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
auto check_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
EXPECT_EQ(op_to_count["QLinearConv"], 1);
EXPECT_EQ(op_to_count["MaxPool"], 1);
@ -1472,7 +1427,7 @@ TEST(QDQTransformerTests, QBackward_MutilpleSteps) {
};
TransformerTester(build_test_case,
check_mp_reshape_graph,
check_graph,
TransformerLevel::Level1,
TransformerLevel::Level2);
};
@ -1512,7 +1467,7 @@ TEST(QDQTransformerTests, ConvTranspose_DQForward) {
}
};
auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
auto check_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
EXPECT_EQ(op_to_count["QLinearConv"], 1);
EXPECT_EQ(op_to_count["Transpose"], 1);
@ -1521,7 +1476,7 @@ TEST(QDQTransformerTests, ConvTranspose_DQForward) {
};
TransformerTester(build_test_case,
check_mp_reshape_graph,
check_graph,
TransformerLevel::Level1,
TransformerLevel::Level2);
};
@ -1572,7 +1527,7 @@ TEST(QDQTransformerTests, DQForward_MutilpleSteps) {
}
};
auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
auto check_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
EXPECT_EQ(op_to_count["QLinearConv"], 1);
EXPECT_EQ(op_to_count["MaxPool"], 1);
@ -1583,7 +1538,7 @@ TEST(QDQTransformerTests, DQForward_MutilpleSteps) {
};
TransformerTester(build_test_case,
check_mp_reshape_graph,
check_graph,
TransformerLevel::Level1,
TransformerLevel::Level2);
};
@ -1704,7 +1659,7 @@ TEST(QDQTransformerTests, Concat) {
}
};
auto check_mp_reshape_graph = [&input_shapes, &has_input_float, &has_input_int8, &has_output_int8](InferenceSessionWrapper& session) {
auto check_graph = [&input_shapes, &has_input_float, &has_input_int8, &has_output_int8](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
if (has_input_float || has_input_int8 || has_output_int8) {
EXPECT_EQ(op_to_count["com.microsoft.QLinearConcat"], 0);
@ -1716,7 +1671,7 @@ TEST(QDQTransformerTests, Concat) {
};
TransformerTester(build_test_case,
check_mp_reshape_graph,
check_graph,
TransformerLevel::Level1,
TransformerLevel::Level2,
12 /*opset_version*/,
@ -1763,7 +1718,7 @@ TEST(QDQTransformerTests, QDQPropagation_QDQCancelOut) {
builder.AddNode("Reshape", {maxpool_output, reshape_shape}, {output_arg});
};
auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
auto check_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
EXPECT_EQ(op_to_count["MaxPool"], 1);
EXPECT_EQ(op_to_count["Reshape"], 1);
@ -1773,7 +1728,7 @@ TEST(QDQTransformerTests, QDQPropagation_QDQCancelOut) {
};
TransformerTester(build_test_case,
check_mp_reshape_graph,
check_graph,
TransformerLevel::Level1,
TransformerLevel::Level2);
};
@ -1799,7 +1754,7 @@ TEST(QDQTransformerTests, QDQPropagation_QDQ_CancelOut_More) {
builder.AddQuantizeLinearNode<uint8_t>(reshape_output, same_scale ? .004f : .0039f, same_zp ? 129 : 128, output_arg);
};
auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
auto check_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
EXPECT_EQ(op_to_count["Reshape"], 1);
EXPECT_EQ(op_to_count["QuantizeLinear"], same_scale && same_zp ? 1 : 2);
@ -1807,7 +1762,7 @@ TEST(QDQTransformerTests, QDQPropagation_QDQ_CancelOut_More) {
};
TransformerTester(build_test_case,
check_mp_reshape_graph,
check_graph,
TransformerLevel::Level1,
TransformerLevel::Level2);
};
@ -1833,7 +1788,7 @@ TEST(QDQTransformerTests, QDQPropagation_Q_No_Parent) {
builder.AddQuantizeLinearNode<uint8_t>(transpose_output, .0035f, 135, output_arg);
};
auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
auto check_graph = [&](InferenceSessionWrapper& session) {
GraphViewer graph_viewer(session.GetGraph());
const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder();
EXPECT_EQ(graph_viewer.GetNode(node_topology_list[0])->OpType(), "QuantizeLinear");
@ -1841,7 +1796,7 @@ TEST(QDQTransformerTests, QDQPropagation_Q_No_Parent) {
};
TransformerTester(build_test_case,
check_mp_reshape_graph,
check_graph,
TransformerLevel::Level1,
TransformerLevel::Level2);
};
@ -1866,7 +1821,7 @@ TEST(QDQTransformerTests, QDQPropagation_DQ_No_Children) {
transpose_node.AddAttribute("perm", perms);
};
auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
auto check_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
GraphViewer graph_viewer(session.GetGraph());
const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder();
@ -1875,7 +1830,7 @@ TEST(QDQTransformerTests, QDQPropagation_DQ_No_Children) {
};
TransformerTester(build_test_case,
check_mp_reshape_graph,
check_graph,
TransformerLevel::Level1,
TransformerLevel::Level2);
};
@ -1902,7 +1857,7 @@ TEST(QDQTransformerTests, QDQPropagation_Per_Layer_No_Propagation) {
transpose_node.AddAttribute("perm", perms);
};
auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
auto check_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
GraphViewer graph_viewer(session.GetGraph());
const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder();
@ -1911,7 +1866,7 @@ TEST(QDQTransformerTests, QDQPropagation_Per_Layer_No_Propagation) {
};
TransformerTester(build_test_case,
check_mp_reshape_graph,
check_graph,
TransformerLevel::Level1,
TransformerLevel::Level2);
};
@ -1935,14 +1890,14 @@ TEST(QDQTransformerTests, QDQPropagation_DQ_Q) {
builder.AddQuantizeLinearNode<uint8_t>(dq_output, .0035f, 135, output_arg);
};
auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
auto check_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
EXPECT_EQ(op_to_count["QuantizeLinear"], 1);
EXPECT_EQ(op_to_count["DequantizeLinear"], 1);
};
TransformerTester(build_test_case,
check_mp_reshape_graph,
check_graph,
TransformerLevel::Level1,
TransformerLevel::Level2);
};

View file

@ -271,7 +271,9 @@ TEST(NnapiExecutionProviderTest, TestNoShapeInputModel) {
<< "No node should be taken by the NNAPI EP";
}
static void RunQDQModelTest(const GetQDQTestCaseFn& build_test_case, const char* test_description) {
static void RunQDQModelTest(const GetQDQTestCaseFn& build_test_case,
const char* test_description,
const EPVerificationParams& params = EPVerificationParams()) {
onnxruntime::Model model(test_description, false, DefaultLoggingManager().DefaultLogger());
Graph& graph = model.MainGraph();
ModelTestBuilder helper(graph);
@ -286,7 +288,7 @@ static void RunQDQModelTest(const GetQDQTestCaseFn& build_test_case, const char*
#if defined(__ANDROID__)
RunAndVerifyOutputsWithEP(model_data, "NnapiExecutionProviderTest.TestQDQModel",
std::make_unique<NnapiExecutionProvider>(0),
helper.feeds_);
helper.feeds_, params);
#else
// test load only
SessionOptions so;
@ -306,7 +308,8 @@ TEST(NnapiExecutionProviderTest, TestQDQConv) {
uint8_t /* OutputType */>(
{1, 1, 5, 5} /*input_shape*/,
{1, 1, 3, 3} /*weights_shape*/),
"nnapi_qdq_test_graph_conv");
"nnapi_qdq_test_graph_conv",
{true /* verify_entire_graph_use_ep */});
}
TEST(NnapiExecutionProviderTest, TestQDQResize) {
@ -316,14 +319,44 @@ TEST(NnapiExecutionProviderTest, TestQDQResize) {
{1, 3, 32, 32} /* sizes_data */,
"linear" /* mode */,
"asymmetric" /* coordinate_transformation_mode */),
"nnapi_qdq_test_graph_resize");
"nnapi_qdq_test_graph_resize",
{true /* verify_entire_graph_use_ep */});
}
TEST(NnapiExecutionProviderTest, TestQDQAveragePool) {
// NNAPI uses different rounding, which may cause ~1% difference in the result
RunQDQModelTest(BuildQDQAveragePoolTestCase<uint8_t /* InputType */,
uint8_t /* OutputType */>(
{1, 3, 32, 32} /* input_shape */),
"nnapi_qdq_test_graph_averagepool");
"nnapi_qdq_test_graph_averagepool",
{
true /* verify_entire_graph_use_ep */,
1e-2f /* fp32_abs_err */,
});
}
// Runs the uint8 QDQ "Add" graph through RunQDQModelTest, asking for the whole
// graph to be verified as using the EP; the output tolerance stays at its default.
TEST(NnapiExecutionProviderTest, TestQDQAdd) {
  const std::vector<int64_t> input_shape{1, 23, 13, 13};
  const auto build_test_case =
      BuildBinaryOpTestCase<uint8_t /* Input1Type */,
                            uint8_t /* Input2Type */,
                            uint8_t /* OutputType */>(input_shape, "Add" /* op_type */);

  EPVerificationParams verification_params;
  verification_params.verify_entire_graph_use_ep = true;

  RunQDQModelTest(build_test_case, "nnapi_qdq_test_graph_add", verification_params);
}
// Runs the uint8 QDQ "Mul" graph through RunQDQModelTest, asking for the whole
// graph to be verified as using the EP.
TEST(NnapiExecutionProviderTest, TestQDQMul) {
  const std::vector<int64_t> input_shape{1, 23, 13, 13};
  const auto build_test_case =
      BuildBinaryOpTestCase<uint8_t /* Input1Type */,
                            uint8_t /* Input2Type */,
                            uint8_t /* OutputType */>(input_shape, "Mul" /* op_type */);

  EPVerificationParams verification_params;
  verification_params.verify_entire_graph_use_ep = true;
  // NNAPI uses different rounding, which may cause ~1% difference in the result,
  // so the absolute fp32 tolerance is loosened from its default.
  verification_params.fp32_abs_err = 1e-2f;

  RunQDQModelTest(build_test_case, "nnapi_qdq_test_graph_mul", verification_params);
}
#endif // !(ORT_MINIMAL_BUILD)

View file

@ -15,6 +15,18 @@ class Graph;
namespace test {
// Options that control how RunAndVerifyOutputsWithEP checks a run against an execution provider.
struct EPVerificationParams {
  // When true, require that every node of the graph is assigned to the EP.
  // When false, only require that at least one node is assigned to 'execution_provider'.
  bool verify_entire_graph_use_ep = false;

  // Absolute tolerance used when comparing fp32 outputs.
  // An EP may round differently from the ORT CPU EP, which can produce an abs error larger
  // than the 1e-5f default, especially for scenarios such as [Q -> Quantized op -> DQ].
  // Raise this only when it is really necessary.
  float fp32_abs_err{1e-5f};
};
// return number of nodes in the Graph and any subgraphs that are assigned to the specified execution provider
int CountAssignedNodes(const Graph& current_graph, const std::string& ep_type);
@ -23,13 +35,14 @@ int CountAssignedNodes(const Graph& current_graph, const std::string& ep_type);
void RunAndVerifyOutputsWithEP(const ORTCHAR_T* model_path,
const char* log_id,
std::unique_ptr<IExecutionProvider> execution_provider,
const NameMLValMap& feeds);
const NameMLValMap& feeds,
const EPVerificationParams& params = EPVerificationParams());
// helper function that takes in model_data
// used in nnapi qdq model tests
void RunAndVerifyOutputsWithEP(const std::string& model_data,
const char* log_id,
std::unique_ptr<IExecutionProvider> execution_provider,
const NameMLValMap& feeds);
const NameMLValMap& feeds,
const EPVerificationParams& params = EPVerificationParams());
} // namespace test
} // namespace onnxruntime

View file

@ -18,7 +18,8 @@ namespace onnxruntime {
namespace test {
static void VerifyOutputs(const std::vector<std::string>& output_names,
const std::vector<OrtValue>& expected_fetches,
const std::vector<OrtValue>& fetches) {
const std::vector<OrtValue>& fetches,
const EPVerificationParams& params) {
ASSERT_EQ(expected_fetches.size(), fetches.size());
for (size_t i = 0, end = expected_fetches.size(); i < end; ++i) {
@ -40,10 +41,8 @@ static void VerifyOutputs(const std::vector<std::string>& output_names,
<< " mismatch for " << output_names[i];
break;
case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: {
constexpr float abs_err = 1e-5f;
EXPECT_THAT(ltensor.DataAsSpan<float>(),
::testing::Pointwise(::testing::FloatNear(abs_err), rtensor.DataAsSpan<float>()));
::testing::Pointwise(::testing::FloatNear(params.fp32_abs_err), rtensor.DataAsSpan<float>()));
break;
}
default:
@ -72,16 +71,18 @@ int CountAssignedNodes(const Graph& current_graph, const std::string& ep_type) {
void RunAndVerifyOutputsWithEP(const ORTCHAR_T* model_path, const char* log_id,
std::unique_ptr<IExecutionProvider> execution_provider,
const NameMLValMap& feeds) {
const NameMLValMap& feeds,
const EPVerificationParams& params) {
// read raw data from model provided by the model_path
std::ifstream stream(model_path, std::ios::in | std::ios::binary);
std::string model_data((std::istreambuf_iterator<char>(stream)), std::istreambuf_iterator<char>());
RunAndVerifyOutputsWithEP(model_data, log_id, std::move(execution_provider), feeds);
RunAndVerifyOutputsWithEP(model_data, log_id, std::move(execution_provider), feeds, params);
}
void RunAndVerifyOutputsWithEP(const std::string& model_data, const char* log_id,
std::unique_ptr<IExecutionProvider> execution_provider,
const NameMLValMap& feeds) {
const NameMLValMap& feeds,
const EPVerificationParams& params) {
SessionOptions so;
so.session_logid = log_id;
RunOptions run_options;
@ -122,12 +123,17 @@ void RunAndVerifyOutputsWithEP(const std::string& model_data, const char* log_id
// make sure that some nodes are assigned to the EP, otherwise this test is pointless...
const auto& graph2 = session_object2.GetGraph();
auto ep_nodes = CountAssignedNodes(graph2, provider_type);
ASSERT_GT(ep_nodes, 0) << "No nodes were assigned to " << provider_type;
if (params.verify_entire_graph_use_ep) {
// Verify the entire graph is assigned to the EP
ASSERT_EQ(ep_nodes, graph2.NumberOfNodes()) << "Not all nodes were assigned to " << provider_type;
} else {
ASSERT_GT(ep_nodes, 0) << "No nodes were assigned to " << provider_type;
}
// Run with EP and verify the result
std::vector<OrtValue> fetches;
ASSERT_STATUS_OK(session_object2.Run(run_options, feeds, output_names, &fetches));
VerifyOutputs(output_names, expected_fetches, fetches);
VerifyOutputs(output_names, expected_fetches, fetches, params);
}
#if !defined(DISABLE_SPARSE_TENSORS)