mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-14 20:48:00 +00:00
[QNN EP] Apply workaround for Conv validation bug when bias input is implicit (#21764)
### Description
- Adds a dummy bias of all zeros when translating a Conv without an explicit bias input. This is a workaround for a QNN validation issue that fails when the optional bias input is not provided.
- Corrects the logic for unpacking **non-zero int4** zero-points. The bug does not impact models today because only int4 zero-points equal to 0 (symmetric quantization) are currently supported, but it would become an issue if/when QNN supports non-zero int4 zero-points, so it is fixed now.

### Motivation and Context
Support Conv operators without a bias input on QNN EP with the latest QNN SDK.
This commit is contained in:
parent
6c1a3f85a6
commit
514b4699b4
8 changed files with 207 additions and 59 deletions
|
|
@ -80,6 +80,64 @@ Status BaseOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
// Adds an explicit, all-zero quantized int32 bias as a static QNN tensor and appends its name to
// `input_names`. Used when an operator's optional bias input is absent.
//
// Parameters:
//   qnn_model_wrapper: receives the new static tensor via AddTensorWrapper().
//   input0_qparams:    quantization params of input[0]; must be per-tensor.
//   input1_qparams:    quantization params of input[1]; must be quantized (per-tensor or per-channel).
//   bias_shape:        shape of the bias tensor to create; consumed (moved from) by this call.
//   bias_name:         name given to the new tensor and pushed onto `input_names`.
//   logger:            currently unused.
//   input_names:       list of the operator's QNN input names; `bias_name` is appended on success.
//
// Returns an error Status (and adds nothing) if the quantization params are unsupported or their
// scales cannot be retrieved.
Status BaseOpBuilder::AddZeroBiasInput(QnnModelWrapper& qnn_model_wrapper,
                                       const QnnQuantParamsWrapper& input0_qparams,
                                       const QnnQuantParamsWrapper& input1_qparams,
                                       std::vector<uint32_t>&& bias_shape,
                                       const std::string& bias_name,
                                       const logging::Logger& logger,
                                       std::vector<std::string>& input_names) const {
  ORT_UNUSED_PARAMETER(logger);

  // For now, only handle case where input0 is per-tensor quantized and input1 is either per-tensor
  // or per-channel quantized.
  ORT_RETURN_IF_NOT(input0_qparams.IsPerTensor(/*include_bw*/ true) && input1_qparams.IsQuantized(),
                    "QNN EP currently only supports adding a dummy zero bias input for per-tensor ",
                    "input[0] and per-tensor/per-channel input[1]");

  size_t num_bias_elems = 1;
  for (const uint32_t dim : bias_shape) {
    num_bias_elems *= static_cast<size_t>(dim);
  }

  // Bias static input should be all zeros (each element is stored as an int32).
  std::vector<uint8_t> bias_bytes(num_bias_elems * sizeof(int32_t), 0);

  // Bias's quantization scale(s) should be the product of the other inputs' quantization scales.
  // Input[0] is expected to have one scale (per-tensor).
  // If input[1] is per-channel (many scales), then the dummy bias also needs to be per-channel.
  std::vector<float> input0_quant_scales;
  std::vector<float> input1_quant_scales;
  ORT_RETURN_IF_ERROR(input0_qparams.GetScales(input0_quant_scales));
  ORT_RETURN_IF_ERROR(input1_qparams.GetScales(input1_quant_scales));

  // Checked at runtime (not via assert) so that release builds never index into an empty
  // scale vector below.
  ORT_RETURN_IF_NOT(input0_quant_scales.size() == 1,
                    "Input[0] should have exactly 1 quantization scale value (per-tensor)");

  const size_t num_bias_scales_offsets = input1_quant_scales.size();
  ORT_RETURN_IF_NOT(num_bias_scales_offsets >= 1,
                    "Input[1] should have >= 1 quantization scale values");

  std::vector<float> bias_scales(num_bias_scales_offsets);
  for (size_t i = 0; i < num_bias_scales_offsets; i++) {
    bias_scales[i] = input0_quant_scales[0] * input1_quant_scales[i];
  }

  std::vector<int32_t> bias_offsets(num_bias_scales_offsets, 0);  // Bias's zero-points should be all zeros.
  QnnQuantParamsWrapper bias_qparams;

  if (input1_qparams.IsPerChannel()) {
    bias_qparams = QnnQuantParamsWrapper(bias_scales, bias_offsets, /*axis*/ 0, /*is_int4*/ false);
  } else {
    bias_qparams = QnnQuantParamsWrapper(bias_scales[0], bias_offsets[0]);
  }

  auto tensor_wrapper = QnnTensorWrapper(bias_name, QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_SFIXED_POINT_32,
                                         std::move(bias_qparams), std::move(bias_shape), std::move(bias_bytes));

  qnn_model_wrapper.AddTensorWrapper(std::move(tensor_wrapper));
  input_names.push_back(bias_name);

  return Status::OK();
}
|
||||
|
||||
Status BaseOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper,
|
||||
const NodeUnit& node_unit,
|
||||
std::vector<std::string>&& input_names,
|
||||
|
|
|
|||
|
|
@ -95,6 +95,14 @@ class BaseOpBuilder : public IOpBuilder {
|
|||
const logging::Logger& logger,
|
||||
std::vector<std::string>& input_names) const ORT_MUST_USE_RESULT;
|
||||
|
||||
// Adds a static, all-zero quantized int32 (QNN_DATATYPE_SFIXED_POINT_32) bias tensor named `bias_name`
// to `qnn_model_wrapper` and appends `bias_name` to `input_names`.
// The bias scale(s) are the product of input[0]'s single scale and input[1]'s scale(s); zero-points are 0.
// If input[1] is per-channel quantized, the bias is per-channel on axis 0.
// Returns an error if input[0] is not per-tensor quantized or input[1] is not quantized.
// `bias_shape` is moved from; `logger` is currently unused.
Status AddZeroBiasInput(QnnModelWrapper& qnn_model_wrapper,
|
||||
const QnnQuantParamsWrapper& input0_qparams,
|
||||
const QnnQuantParamsWrapper& input1_qparams,
|
||||
std::vector<uint32_t>&& bias_shape,
|
||||
const std::string& bias_name,
|
||||
const logging::Logger& logger,
|
||||
std::vector<std::string>& input_names) const ORT_MUST_USE_RESULT;
|
||||
|
||||
Status SetOutputQParamEqualToInputIfNearlyEqual(QnnModelWrapper& qnn_model_wrapper,
|
||||
const NodeUnit& node_unit,
|
||||
const logging::Logger& logger,
|
||||
|
|
|
|||
|
|
@ -289,10 +289,30 @@ Status ConvOpBuilder::ProcessConv2D3DInputs(QnnModelWrapper& qnn_model_wrapper,
|
|||
//
|
||||
// Input 2: bias
|
||||
//
|
||||
if (num_inputs == 3) {
|
||||
const bool has_bias_input = num_inputs == 3;
|
||||
if (has_bias_input) {
|
||||
ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[2], logger, input_names));
|
||||
}
|
||||
|
||||
#if QNN_API_VERSION_MAJOR == 2 && (QNN_API_VERSION_MINOR >= 16 && QNN_API_VERSION_MINOR <= 18)
|
||||
if (!has_bias_input && IsNpuBackend(qnn_model_wrapper.GetQnnBackendType())) {
|
||||
// Bias is implicit. QNN SDK 2.23/2.24/2.25 (QNN API version 2.16/2.17/2.18) has a validation bug for
|
||||
// implicit bias inputs, so provide an explicit bias of all 0 (quantized int32).
|
||||
TensorInfo input0_info = {};
|
||||
ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[0], input0_info));
|
||||
|
||||
TensorInfo input1_info = {};
|
||||
ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[1], input1_info));
|
||||
|
||||
if (input0_info.quant_param.IsPerTensor(/*include_bw*/ true) && input1_info.quant_param.IsQuantized()) {
|
||||
const std::string bias_name = qnn::utils::GetNodeName(node_unit) + "_implicit_bias_ort_qnn_ep";
|
||||
std::vector<uint32_t> bias_shape = {input1_info.shape[0]};
|
||||
ORT_RETURN_IF_ERROR(AddZeroBiasInput(qnn_model_wrapper, input0_info.quant_param, input1_info.quant_param,
|
||||
std::move(bias_shape), bias_name, logger, input_names));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -99,47 +99,9 @@ Status LayerNormOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
|
|||
|
||||
if (x_input_info.quant_param.IsPerTensor(/*include_bw*/ true) && scale_input_info.quant_param.IsQuantized()) {
|
||||
const std::string bias_name = qnn::utils::GetNodeName(node_unit) + "_implicit_bias_ort_qnn_ep";
|
||||
|
||||
// Make dummy bias input have the same shape as the scale input.
|
||||
std::vector<uint32_t> bias_shape = scale_input_info.shape;
|
||||
size_t num_bias_elems = 1;
|
||||
for (size_t i = 0; i < bias_shape.size(); i++) {
|
||||
num_bias_elems *= static_cast<size_t>(bias_shape[i]);
|
||||
}
|
||||
|
||||
// Bias static input should be all zeros.
|
||||
std::vector<uint8_t> bias_bytes(num_bias_elems * sizeof(int32_t), 0);
|
||||
|
||||
// Bias's quantization scale should be the product of the other inputs' quantization scales.
|
||||
std::vector<float> input0_quant_scales;
|
||||
std::vector<float> input1_quant_scales;
|
||||
ORT_RETURN_IF_ERROR(x_input_info.quant_param.GetScales(input0_quant_scales));
|
||||
ORT_RETURN_IF_ERROR(scale_input_info.quant_param.GetScales(input1_quant_scales));
|
||||
|
||||
const size_t num_bias_scales_offsets = input1_quant_scales.size();
|
||||
assert(input0_quant_scales.size() == 1); // Expected for per-tensor.
|
||||
ORT_RETURN_IF_NOT(num_bias_scales_offsets >= input0_quant_scales.size(),
|
||||
"Input[1] should have >= 1 quantization scale values");
|
||||
|
||||
std::vector<float> bias_scales(num_bias_scales_offsets);
|
||||
for (size_t i = 0; i < num_bias_scales_offsets; i++) {
|
||||
bias_scales[i] = input0_quant_scales[0] * input1_quant_scales[i];
|
||||
}
|
||||
|
||||
std::vector<int32_t> bias_offsets(num_bias_scales_offsets, 0); // Bias's zero-points should be all zeros.
|
||||
QnnQuantParamsWrapper bias_qparams;
|
||||
|
||||
if (scale_input_info.quant_param.IsPerChannel()) {
|
||||
bias_qparams = QnnQuantParamsWrapper(bias_scales, bias_offsets, /*axis*/ 0, /*is_int4*/ false);
|
||||
} else {
|
||||
bias_qparams = QnnQuantParamsWrapper(bias_scales[0], bias_offsets[0]);
|
||||
}
|
||||
|
||||
auto tensor_wrapper = QnnTensorWrapper(bias_name, QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_SFIXED_POINT_32,
|
||||
std::move(bias_qparams), std::move(bias_shape), std::move(bias_bytes));
|
||||
|
||||
qnn_model_wrapper.AddTensorWrapper(std::move(tensor_wrapper));
|
||||
input_names.push_back(bias_name);
|
||||
ORT_RETURN_IF_ERROR(AddZeroBiasInput(qnn_model_wrapper, x_input_info.quant_param, scale_input_info.quant_param,
|
||||
std::move(bias_shape), bias_name, logger, input_names));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -335,7 +335,18 @@ Status QnnModelWrapper::UnpackZeroPoints(const std::string& initializer_name,
|
|||
|
||||
switch (onnx_data_type) {
|
||||
// QNN use -offset for some reason
|
||||
case ONNX_NAMESPACE::TensorProto_DataType_INT4: // INT4 zero-points are unpacked as 8-bit values for QNN
|
||||
case ONNX_NAMESPACE::TensorProto_DataType_INT4: { // INT4 zero-points are unpacked as 8-bit values for QNN
|
||||
auto int8_span = ReinterpretAsSpan<const int8_t>(gsl::make_span(initializer_bytes));
|
||||
std::transform(int8_span.begin(), int8_span.end(), std::back_inserter(zero_points),
|
||||
[](int8_t masked_zp) -> int32_t {
|
||||
// We currently unpack int4 as int8 but with the top 4-bits masked off due to QNN bug.
|
||||
// Need to undo the masking so that the zero-point value is correct.
|
||||
// (Not really a problem yet because QNN only supports symmetric INT4 quantization with zp == 0).
|
||||
int8_t zp = Int4x2::SignExtendLower4Bits(std::byte(masked_zp));
|
||||
return -static_cast<int32_t>(zp);
|
||||
});
|
||||
break;
|
||||
}
|
||||
case ONNX_NAMESPACE::TensorProto_DataType_INT8: {
|
||||
auto int8_span = ReinterpretAsSpan<const int8_t>(gsl::make_span(initializer_bytes));
|
||||
std::transform(int8_span.begin(), int8_span.end(), std::back_inserter(zero_points),
|
||||
|
|
|
|||
|
|
@ -231,6 +231,8 @@ std::ostream& operator<<(std::ostream& out, const Qnn_QuantizeParams_t& quantize
|
|||
} else if (quantize_params.quantizationEncoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) {
|
||||
out << " axis=" << quantize_params.axisScaleOffsetEncoding.axis;
|
||||
size_t num_elems = quantize_params.axisScaleOffsetEncoding.numScaleOffsets;
|
||||
bool truncate = num_elems > 20;
|
||||
num_elems = truncate ? 20 : num_elems;
|
||||
out << " scales=(";
|
||||
for (size_t i = 0; i < num_elems; i++) {
|
||||
out << quantize_params.axisScaleOffsetEncoding.scaleOffset[i].scale << (i == num_elems - 1 ? "" : " ");
|
||||
|
|
@ -239,11 +241,13 @@ std::ostream& operator<<(std::ostream& out, const Qnn_QuantizeParams_t& quantize
|
|||
for (size_t i = 0; i < num_elems; i++) {
|
||||
out << quantize_params.axisScaleOffsetEncoding.scaleOffset[i].offset << (i == num_elems - 1 ? "" : " ");
|
||||
}
|
||||
out << ")";
|
||||
out << (truncate ? "...)" : ")");
|
||||
} else if (quantize_params.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) {
|
||||
out << " axis=" << quantize_params.bwAxisScaleOffsetEncoding.axis;
|
||||
out << " bw=" << quantize_params.bwAxisScaleOffsetEncoding.bitwidth;
|
||||
size_t num_elems = quantize_params.bwAxisScaleOffsetEncoding.numElements;
|
||||
bool truncate = num_elems > 20;
|
||||
num_elems = truncate ? 20 : num_elems;
|
||||
out << " scales=(";
|
||||
for (size_t i = 0; i < num_elems; i++) {
|
||||
out << quantize_params.bwAxisScaleOffsetEncoding.scales[i] << (i == num_elems - 1 ? "" : " ");
|
||||
|
|
@ -252,7 +256,7 @@ std::ostream& operator<<(std::ostream& out, const Qnn_QuantizeParams_t& quantize
|
|||
for (size_t i = 0; i < num_elems; i++) {
|
||||
out << quantize_params.bwAxisScaleOffsetEncoding.offsets[i] << (i == num_elems - 1 ? "" : " ");
|
||||
}
|
||||
out << ")";
|
||||
out << (truncate ? "...)" : ")");
|
||||
} else {
|
||||
out << " encoding not supported.";
|
||||
}
|
||||
|
|
|
|||
|
|
@ -423,14 +423,14 @@ static void LogNodeSupport(const logging::Logger& logger,
|
|||
return;
|
||||
}
|
||||
|
||||
size_t num_nodes = 0;
|
||||
std::ostringstream oss;
|
||||
oss << (support_status.IsOK() ? "Validation PASSED " : "Validation FAILED ") << "for nodes ("
|
||||
<< qnn_node_group.Type() << "):" << std::endl;
|
||||
for (const NodeUnit* node_unit : qnn_node_group.GetNodeUnits()) {
|
||||
for (const Node* node : node_unit->GetAllNodesInGroup()) {
|
||||
oss << "\tOperator type: " << node->OpType()
|
||||
<< " Node name: " << node->Name()
|
||||
<< " Node index: " << node->Index() << std::endl;
|
||||
num_nodes += 1;
|
||||
}
|
||||
}
|
||||
if (!support_status.IsOK()) {
|
||||
|
|
@ -440,6 +440,9 @@ static void LogNodeSupport(const logging::Logger& logger,
|
|||
logging::Capture(logger, log_severity, logging::Category::onnxruntime,
|
||||
log_data_type, call_site)
|
||||
.Stream()
|
||||
<< (support_status.IsOK() ? "Validation PASSED " : "Validation FAILED ") << "for " << num_nodes
|
||||
<< " nodes in " << qnn_node_group.Type() << " (" << qnn_node_group.GetTargetNodeUnit()->OpType() << ") :"
|
||||
<< std::endl
|
||||
<< oss.str();
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -793,19 +793,101 @@ TEST_F(QnnHTPBackendTests, ConvU16S4S32_PerChannel) {
|
|||
TestInputDef<float> bias_def(bias_shape, true,
|
||||
GetFloatDataInRange(-1.0f, 1.0f, TensorShape(bias_shape).Size()));
|
||||
|
||||
RunHTPConvOpPerChannelTest<uint8_t, Int4x2>("Conv",
|
||||
input_def,
|
||||
weight_def,
|
||||
bias_def,
|
||||
0, // weight quant axis
|
||||
{1, 1}, // Strides
|
||||
{0, 0, 0, 0}, // Pads
|
||||
{1, 1}, // Dilations
|
||||
1, // default group
|
||||
"NOTSET",
|
||||
ExpectedEPNodeAssignment::All,
|
||||
false, // use_qdq_contrib_ops
|
||||
21); // opset
|
||||
RunHTPConvOpPerChannelTest<uint16_t, Int4x2>("Conv",
|
||||
input_def,
|
||||
weight_def,
|
||||
bias_def,
|
||||
0, // weight quant axis
|
||||
{1, 1}, // Strides
|
||||
{0, 0, 0, 0}, // Pads
|
||||
{1, 1}, // Dilations
|
||||
1, // default group
|
||||
"NOTSET",
|
||||
ExpectedEPNodeAssignment::All,
|
||||
false, // use_qdq_contrib_ops
|
||||
21); // opset
|
||||
}
|
||||
|
||||
// Test per-channel QDQ Conv with INT4 weights and no bias.
|
||||
// in0: u16, in1 (weight): s4, out: u8
|
||||
// Tests bug in QNN SDK 2.25 when validating Conv without a bias (QNN EP adds a dummy bias).
|
||||
TEST_F(QnnHTPBackendTests, ConvU16S4_PerChannel_NoBias) {
|
||||
std::vector<int64_t> input_shape = {1, 2, 4, 4};
|
||||
std::vector<int64_t> weight_shape = {3, 2, 2, 2};
|
||||
|
||||
TestInputDef<float> input_def(input_shape, false,
|
||||
GetFloatDataInRange(0.0f, 1.0f, TensorShape(input_shape).Size()));
|
||||
TestInputDef<float> weight_def(weight_shape, true,
|
||||
GetFloatDataInRange(-1.0f, 5.0f, TensorShape(weight_shape).Size()));
|
||||
|
||||
RunHTPConvOpPerChannelTest<uint16_t, Int4x2>("Conv",
|
||||
input_def,
|
||||
weight_def,
|
||||
TestInputDef<float>(),
|
||||
0, // weight quant axis
|
||||
{1, 1}, // Strides
|
||||
{0, 0, 0, 0}, // Pads
|
||||
{1, 1}, // Dilations
|
||||
1, // default group
|
||||
"NOTSET",
|
||||
ExpectedEPNodeAssignment::All,
|
||||
false, // use_qdq_contrib_ops
|
||||
21); // opset
|
||||
}
|
||||
|
||||
// Test per-channel QDQ Conv with uint16 input[0], uint8 weights, and no bias.
|
||||
// in0: u16, in1 (weight): s4, out: u8
|
||||
// Tests bug in QNN SDK 2.25 when validating Conv without a bias (QNN EP adds a dummy bias).
|
||||
TEST_F(QnnHTPBackendTests, ConvU16U8_PerTensor_NoBias) {
|
||||
std::vector<int64_t> input_shape = {1, 2, 4, 4};
|
||||
std::vector<int64_t> weight_shape = {3, 2, 2, 2};
|
||||
|
||||
TestInputDef<float> input_def(input_shape, false,
|
||||
GetFloatDataInRange(0.0f, 1.0f, TensorShape(input_shape).Size()));
|
||||
TestInputDef<float> weight_def(weight_shape, true,
|
||||
GetFloatDataInRange(-1.0f, 5.0f, TensorShape(weight_shape).Size()));
|
||||
|
||||
RunHTPConvOpTest<uint16_t, uint8_t>("Conv",
|
||||
input_def,
|
||||
weight_def,
|
||||
TestInputDef<float>(),
|
||||
{1, 1}, // Strides
|
||||
{0, 0, 0, 0}, // Pads
|
||||
{1, 1}, // Dilations
|
||||
1, // default group
|
||||
"NOTSET",
|
||||
ExpectedEPNodeAssignment::All,
|
||||
false, // use_qdq_contrib_ops
|
||||
21); // opset
|
||||
}
|
||||
|
||||
// Per-channel QDQ Conv with uint16 activations, a large INT4 weight tensor (9216 x 3072 x 1 x 1),
// and no explicit bias input.
TEST_F(QnnHTPBackendTests, ConvU16S4_PerChannel_NoBias_LargeINT4Weight) {
  std::vector<int64_t> activation_dims = {1, 3072, 1, 512};
  std::vector<int64_t> filter_dims = {9216, 3072, 1, 1};

  // Activation: constant 0.1 everywhere except the very first element.
  std::vector<float> activation_values(TensorShape(activation_dims).Size(), 0.1f);
  activation_values[0] = 0.2f;

  // Weights: constant -0.1, with the first element of every output channel set to +0.1.
  std::vector<float> filter_values(TensorShape(filter_dims).Size(), -0.1f);
  const size_t num_out_channels = static_cast<size_t>(filter_dims[0]);
  size_t channel_start = 0;
  for (size_t channel = 0; channel < num_out_channels; ++channel, channel_start += 3072) {
    filter_values[channel_start] = 0.1f;
  }

  TestInputDef<float> activation_def(activation_dims, false, activation_values);
  TestInputDef<float> filter_def(filter_dims, true, filter_values);

  RunHTPConvOpPerChannelTest<uint16_t, Int4x2>("Conv",
                                               activation_def,
                                               filter_def,
                                               TestInputDef<float>(),  // No bias input
                                               0,                      // weight quant axis
                                               {1, 1},                 // Strides
                                               {0, 0, 0, 0},           // Pads
                                               {1, 1},                 // Dilations
                                               1,                      // default group
                                               "NOTSET",
                                               ExpectedEPNodeAssignment::All,
                                               false,  // use_qdq_contrib_ops
                                               21);    // opset
}
|
||||
|
||||
// Test fusion of DQs -> Conv -> Relu/Clip -> Q.
|
||||
|
|
|
|||
Loading…
Reference in a new issue