[QNN EP] Apply workaround for Conv validation bug when bias input is implicit (#21764)

### Description
- Adds a dummy bias of all zeros when translating a Conv without an
explicit bias input. This is a workaround for a QNN validation issue
that fails when the optional bias input is not provided.
- Corrects logic for unpacking of **non-zero int4** zero-points. Bug
does not impact models because we currently only support int4
zero-points equal to 0 (symmetric quant). But this would become an issue
in the future if/when QNN supports non-zero int4 zero-points (so good to
fix now).



### Motivation and Context
Support Conv operators without a bias input on QNN EP with the latest
QNN SDK.
This commit is contained in:
Adrian Lizarraga 2024-08-22 10:38:03 -07:00 committed by GitHub
parent 6c1a3f85a6
commit 514b4699b4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 207 additions and 59 deletions

View file

@ -80,6 +80,64 @@ Status BaseOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
return Status::OK();
}
// Adds a static, all-zero bias input tensor (quantized int32) to `input_names`.
// Used as a workaround for QNN SDK validation failures on ops whose optional bias
// input is omitted. The bias scale(s) are the product of input[0]'s (per-tensor)
// scale and each of input[1]'s scale(s); all bias zero-points are 0.
Status BaseOpBuilder::AddZeroBiasInput(QnnModelWrapper& qnn_model_wrapper,
                                       const QnnQuantParamsWrapper& input0_qparams,
                                       const QnnQuantParamsWrapper& input1_qparams,
                                       std::vector<uint32_t>&& bias_shape,
                                       const std::string& bias_name,
                                       const logging::Logger& logger,
                                       std::vector<std::string>& input_names) const {
  ORT_UNUSED_PARAMETER(logger);

  // Only handle a per-tensor quantized input[0] combined with a quantized
  // (per-tensor or per-channel) input[1] for now.
  ORT_RETURN_IF_NOT(input0_qparams.IsPerTensor(/*include_bw*/ true) && input1_qparams.IsQuantized(),
                    "QNN EP currently only supports adding a dummy zero bias input for per-tensor ",
                    "input[0] and per-tensor/per-channel input[1]");

  // Total number of elements in the bias tensor.
  size_t bias_elem_count = 1;
  for (uint32_t dim : bias_shape) {
    bias_elem_count *= static_cast<size_t>(dim);
  }

  // The static bias data is all zeros, stored as int32 values.
  std::vector<uint8_t> zero_bias_bytes(bias_elem_count * sizeof(int32_t), 0);

  // Bias quantization scale(s) = input[0] scale * input[1] scale(s).
  // Input[0] is per-tensor, so it contributes exactly one scale. If input[1] is
  // per-channel (many scales), the dummy bias must also be per-channel.
  std::vector<float> input0_scales;
  std::vector<float> input1_scales;
  ORT_RETURN_IF_ERROR(input0_qparams.GetScales(input0_scales));
  ORT_RETURN_IF_ERROR(input1_qparams.GetScales(input1_scales));

  const size_t num_bias_scales_offsets = input1_scales.size();
  assert(input0_scales.size() == 1);  // Expected for per-tensor.
  ORT_RETURN_IF_NOT(num_bias_scales_offsets >= input0_scales.size(),
                    "Input[1] should have >= 1 quantization scale values");

  std::vector<float> bias_scales;
  bias_scales.reserve(num_bias_scales_offsets);
  for (float input1_scale : input1_scales) {
    bias_scales.push_back(input0_scales[0] * input1_scale);
  }

  // The bias's zero-points are all zeros.
  std::vector<int32_t> bias_offsets(num_bias_scales_offsets, 0);

  QnnQuantParamsWrapper bias_qparams =
      input1_qparams.IsPerChannel()
          ? QnnQuantParamsWrapper(bias_scales, bias_offsets, /*axis*/ 0, /*is_int4*/ false)
          : QnnQuantParamsWrapper(bias_scales[0], bias_offsets[0]);

  qnn_model_wrapper.AddTensorWrapper(QnnTensorWrapper(bias_name, QNN_TENSOR_TYPE_STATIC,
                                                      QNN_DATATYPE_SFIXED_POINT_32,
                                                      std::move(bias_qparams), std::move(bias_shape),
                                                      std::move(zero_bias_bytes)));
  input_names.push_back(bias_name);

  return Status::OK();
}
Status BaseOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit,
std::vector<std::string>&& input_names,

View file

@ -95,6 +95,14 @@ class BaseOpBuilder : public IOpBuilder {
const logging::Logger& logger,
std::vector<std::string>& input_names) const ORT_MUST_USE_RESULT;
Status AddZeroBiasInput(QnnModelWrapper& qnn_model_wrapper,
const QnnQuantParamsWrapper& input0_qparams,
const QnnQuantParamsWrapper& input1_qparams,
std::vector<uint32_t>&& bias_shape,
const std::string& bias_name,
const logging::Logger& logger,
std::vector<std::string>& input_names) const ORT_MUST_USE_RESULT;
Status SetOutputQParamEqualToInputIfNearlyEqual(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit,
const logging::Logger& logger,

View file

@ -289,10 +289,30 @@ Status ConvOpBuilder::ProcessConv2D3DInputs(QnnModelWrapper& qnn_model_wrapper,
//
// Input 2: bias
//
if (num_inputs == 3) {
const bool has_bias_input = num_inputs == 3;
if (has_bias_input) {
ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[2], logger, input_names));
}
#if QNN_API_VERSION_MAJOR == 2 && (QNN_API_VERSION_MINOR >= 16 && QNN_API_VERSION_MINOR <= 18)
if (!has_bias_input && IsNpuBackend(qnn_model_wrapper.GetQnnBackendType())) {
// Bias is implicit. QNN SDK 2.23/2.24/2.25 (QNN API version 2.16/2.17/2.18) has a validation bug for
// implicit bias inputs, so provide an explicit bias of all 0 (quantized int32).
TensorInfo input0_info = {};
ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[0], input0_info));
TensorInfo input1_info = {};
ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[1], input1_info));
if (input0_info.quant_param.IsPerTensor(/*include_bw*/ true) && input1_info.quant_param.IsQuantized()) {
const std::string bias_name = qnn::utils::GetNodeName(node_unit) + "_implicit_bias_ort_qnn_ep";
std::vector<uint32_t> bias_shape = {input1_info.shape[0]};
ORT_RETURN_IF_ERROR(AddZeroBiasInput(qnn_model_wrapper, input0_info.quant_param, input1_info.quant_param,
std::move(bias_shape), bias_name, logger, input_names));
}
}
#endif
return Status::OK();
}

View file

@ -99,47 +99,9 @@ Status LayerNormOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
if (x_input_info.quant_param.IsPerTensor(/*include_bw*/ true) && scale_input_info.quant_param.IsQuantized()) {
const std::string bias_name = qnn::utils::GetNodeName(node_unit) + "_implicit_bias_ort_qnn_ep";
// Make dummy bias input have the same shape as the scale input.
std::vector<uint32_t> bias_shape = scale_input_info.shape;
size_t num_bias_elems = 1;
for (size_t i = 0; i < bias_shape.size(); i++) {
num_bias_elems *= static_cast<size_t>(bias_shape[i]);
}
// Bias static input should be all zeros.
std::vector<uint8_t> bias_bytes(num_bias_elems * sizeof(int32_t), 0);
// Bias's quantization scale should be the product of the other inputs' quantization scales.
std::vector<float> input0_quant_scales;
std::vector<float> input1_quant_scales;
ORT_RETURN_IF_ERROR(x_input_info.quant_param.GetScales(input0_quant_scales));
ORT_RETURN_IF_ERROR(scale_input_info.quant_param.GetScales(input1_quant_scales));
const size_t num_bias_scales_offsets = input1_quant_scales.size();
assert(input0_quant_scales.size() == 1); // Expected for per-tensor.
ORT_RETURN_IF_NOT(num_bias_scales_offsets >= input0_quant_scales.size(),
"Input[1] should have >= 1 quantization scale values");
std::vector<float> bias_scales(num_bias_scales_offsets);
for (size_t i = 0; i < num_bias_scales_offsets; i++) {
bias_scales[i] = input0_quant_scales[0] * input1_quant_scales[i];
}
std::vector<int32_t> bias_offsets(num_bias_scales_offsets, 0); // Bias's zero-points should be all zeros.
QnnQuantParamsWrapper bias_qparams;
if (scale_input_info.quant_param.IsPerChannel()) {
bias_qparams = QnnQuantParamsWrapper(bias_scales, bias_offsets, /*axis*/ 0, /*is_int4*/ false);
} else {
bias_qparams = QnnQuantParamsWrapper(bias_scales[0], bias_offsets[0]);
}
auto tensor_wrapper = QnnTensorWrapper(bias_name, QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_SFIXED_POINT_32,
std::move(bias_qparams), std::move(bias_shape), std::move(bias_bytes));
qnn_model_wrapper.AddTensorWrapper(std::move(tensor_wrapper));
input_names.push_back(bias_name);
ORT_RETURN_IF_ERROR(AddZeroBiasInput(qnn_model_wrapper, x_input_info.quant_param, scale_input_info.quant_param,
std::move(bias_shape), bias_name, logger, input_names));
}
}
#endif

View file

@ -335,7 +335,18 @@ Status QnnModelWrapper::UnpackZeroPoints(const std::string& initializer_name,
switch (onnx_data_type) {
// QNN use -offset for some reason
case ONNX_NAMESPACE::TensorProto_DataType_INT4: // INT4 zero-points are unpacked as 8-bit values for QNN
case ONNX_NAMESPACE::TensorProto_DataType_INT4: { // INT4 zero-points are unpacked as 8-bit values for QNN
auto int8_span = ReinterpretAsSpan<const int8_t>(gsl::make_span(initializer_bytes));
std::transform(int8_span.begin(), int8_span.end(), std::back_inserter(zero_points),
[](int8_t masked_zp) -> int32_t {
// We currently unpack int4 as int8 but with the top 4-bits masked off due to QNN bug.
// Need to undo the masking so that the zero-point value is correct.
// (Not really a problem yet because QNN only supports symmetric INT4 quantization with zp == 0).
int8_t zp = Int4x2::SignExtendLower4Bits(std::byte(masked_zp));
return -static_cast<int32_t>(zp);
});
break;
}
case ONNX_NAMESPACE::TensorProto_DataType_INT8: {
auto int8_span = ReinterpretAsSpan<const int8_t>(gsl::make_span(initializer_bytes));
std::transform(int8_span.begin(), int8_span.end(), std::back_inserter(zero_points),

View file

@ -231,6 +231,8 @@ std::ostream& operator<<(std::ostream& out, const Qnn_QuantizeParams_t& quantize
} else if (quantize_params.quantizationEncoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) {
out << " axis=" << quantize_params.axisScaleOffsetEncoding.axis;
size_t num_elems = quantize_params.axisScaleOffsetEncoding.numScaleOffsets;
bool truncate = num_elems > 20;
num_elems = truncate ? 20 : num_elems;
out << " scales=(";
for (size_t i = 0; i < num_elems; i++) {
out << quantize_params.axisScaleOffsetEncoding.scaleOffset[i].scale << (i == num_elems - 1 ? "" : " ");
@ -239,11 +241,13 @@ std::ostream& operator<<(std::ostream& out, const Qnn_QuantizeParams_t& quantize
for (size_t i = 0; i < num_elems; i++) {
out << quantize_params.axisScaleOffsetEncoding.scaleOffset[i].offset << (i == num_elems - 1 ? "" : " ");
}
out << ")";
out << (truncate ? "...)" : ")");
} else if (quantize_params.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) {
out << " axis=" << quantize_params.bwAxisScaleOffsetEncoding.axis;
out << " bw=" << quantize_params.bwAxisScaleOffsetEncoding.bitwidth;
size_t num_elems = quantize_params.bwAxisScaleOffsetEncoding.numElements;
bool truncate = num_elems > 20;
num_elems = truncate ? 20 : num_elems;
out << " scales=(";
for (size_t i = 0; i < num_elems; i++) {
out << quantize_params.bwAxisScaleOffsetEncoding.scales[i] << (i == num_elems - 1 ? "" : " ");
@ -252,7 +256,7 @@ std::ostream& operator<<(std::ostream& out, const Qnn_QuantizeParams_t& quantize
for (size_t i = 0; i < num_elems; i++) {
out << quantize_params.bwAxisScaleOffsetEncoding.offsets[i] << (i == num_elems - 1 ? "" : " ");
}
out << ")";
out << (truncate ? "...)" : ")");
} else {
out << " encoding not supported.";
}

View file

@ -423,14 +423,14 @@ static void LogNodeSupport(const logging::Logger& logger,
return;
}
size_t num_nodes = 0;
std::ostringstream oss;
oss << (support_status.IsOK() ? "Validation PASSED " : "Validation FAILED ") << "for nodes ("
<< qnn_node_group.Type() << "):" << std::endl;
for (const NodeUnit* node_unit : qnn_node_group.GetNodeUnits()) {
for (const Node* node : node_unit->GetAllNodesInGroup()) {
oss << "\tOperator type: " << node->OpType()
<< " Node name: " << node->Name()
<< " Node index: " << node->Index() << std::endl;
num_nodes += 1;
}
}
if (!support_status.IsOK()) {
@ -440,6 +440,9 @@ static void LogNodeSupport(const logging::Logger& logger,
logging::Capture(logger, log_severity, logging::Category::onnxruntime,
log_data_type, call_site)
.Stream()
<< (support_status.IsOK() ? "Validation PASSED " : "Validation FAILED ") << "for " << num_nodes
<< " nodes in " << qnn_node_group.Type() << " (" << qnn_node_group.GetTargetNodeUnit()->OpType() << ") :"
<< std::endl
<< oss.str();
}

View file

@ -793,19 +793,101 @@ TEST_F(QnnHTPBackendTests, ConvU16S4S32_PerChannel) {
TestInputDef<float> bias_def(bias_shape, true,
GetFloatDataInRange(-1.0f, 1.0f, TensorShape(bias_shape).Size()));
RunHTPConvOpPerChannelTest<uint8_t, Int4x2>("Conv",
input_def,
weight_def,
bias_def,
0, // weight quant axis
{1, 1}, // Strides
{0, 0, 0, 0}, // Pads
{1, 1}, // Dilations
1, // default group
"NOTSET",
ExpectedEPNodeAssignment::All,
false, // use_qdq_contrib_ops
21); // opset
RunHTPConvOpPerChannelTest<uint16_t, Int4x2>("Conv",
input_def,
weight_def,
bias_def,
0, // weight quant axis
{1, 1}, // Strides
{0, 0, 0, 0}, // Pads
{1, 1}, // Dilations
1, // default group
"NOTSET",
ExpectedEPNodeAssignment::All,
false, // use_qdq_contrib_ops
21); // opset
}
// Test per-channel QDQ Conv with INT4 weights and no bias.
// in0: u16, in1 (weight): s4, out: u8
// Tests bug in QNN SDK 2.25 when validating Conv without a bias (QNN EP adds a dummy bias).
TEST_F(QnnHTPBackendTests, ConvU16S4_PerChannel_NoBias) {
  const std::vector<int64_t> input_shape = {1, 2, 4, 4};
  const std::vector<int64_t> weight_shape = {3, 2, 2, 2};

  TestInputDef<float> input_def(input_shape, false,
                                GetFloatDataInRange(0.0f, 1.0f, TensorShape(input_shape).Size()));
  TestInputDef<float> weight_def(weight_shape, true,
                                 GetFloatDataInRange(-1.0f, 5.0f, TensorShape(weight_shape).Size()));
  TestInputDef<float> bias_def;  // Empty: Conv has no explicit bias input.

  RunHTPConvOpPerChannelTest<uint16_t, Int4x2>("Conv",
                                               input_def,
                                               weight_def,
                                               bias_def,
                                               0,             // weight quant axis
                                               {1, 1},        // Strides
                                               {0, 0, 0, 0},  // Pads
                                               {1, 1},        // Dilations
                                               1,             // default group
                                               "NOTSET",
                                               ExpectedEPNodeAssignment::All,
                                               false,  // use_qdq_contrib_ops
                                               21);    // opset
}
// Test per-tensor QDQ Conv with uint16 input[0], uint8 weights, and no bias.
// in0: u16, in1 (weight): u8
// Tests bug in QNN SDK 2.25 when validating Conv without a bias (QNN EP adds a dummy bias).
TEST_F(QnnHTPBackendTests, ConvU16U8_PerTensor_NoBias) {
std::vector<int64_t> input_shape = {1, 2, 4, 4};
std::vector<int64_t> weight_shape = {3, 2, 2, 2};
TestInputDef<float> input_def(input_shape, false,
GetFloatDataInRange(0.0f, 1.0f, TensorShape(input_shape).Size()));
TestInputDef<float> weight_def(weight_shape, true,
GetFloatDataInRange(-1.0f, 5.0f, TensorShape(weight_shape).Size()));
RunHTPConvOpTest<uint16_t, uint8_t>("Conv",
input_def,
weight_def,
TestInputDef<float>(),  // Empty bias input: Conv has no explicit bias.
{1, 1}, // Strides
{0, 0, 0, 0}, // Pads
{1, 1}, // Dilations
1, // default group
"NOTSET",
ExpectedEPNodeAssignment::All,
false, // use_qdq_contrib_ops
21); // opset
}
// Test per-channel QDQ Conv with a large INT4 weight and no bias.
// in0: u16, in1 (weight): s4
// Tests bug in QNN SDK 2.25 when validating Conv without a bias (QNN EP adds a dummy bias).
TEST_F(QnnHTPBackendTests, ConvU16S4_PerChannel_NoBias_LargeINT4Weight) {
  std::vector<int64_t> input_shape = {1, 3072, 1, 512};
  std::vector<int64_t> weight_shape = {9216, 3072, 1, 1};

  // Mostly-constant input with a single distinct element so the output is non-trivial.
  std::vector<float> input_data(TensorShape(input_shape).Size(), 0.1f);
  input_data[0] = 0.2f;

  // For each output channel, make the channel's first weight element differ from
  // the rest. Compute the per-channel stride (C_in * kH * kW) from weight_shape
  // instead of hard-coding 3072, so the data stays correct if the shape changes.
  std::vector<float> weight_data(TensorShape(weight_shape).Size(), -0.1f);
  const size_t channel_stride = static_cast<size_t>(weight_shape[1] * weight_shape[2] * weight_shape[3]);
  for (size_t c = 0; c < static_cast<size_t>(weight_shape[0]); c++) {
    weight_data[c * channel_stride] = 0.1f;
  }

  TestInputDef<float> input_def(input_shape, false, input_data);
  TestInputDef<float> weight_def(weight_shape, true, weight_data);

  RunHTPConvOpPerChannelTest<uint16_t, Int4x2>("Conv",
                                               input_def,
                                               weight_def,
                                               TestInputDef<float>(),  // no bias input
                                               0,             // weight quant axis
                                               {1, 1},        // Strides
                                               {0, 0, 0, 0},  // Pads
                                               {1, 1},        // Dilations
                                               1,             // default group
                                               "NOTSET",
                                               ExpectedEPNodeAssignment::All,
                                               false,  // use_qdq_contrib_ops
                                               21);    // opset
}
// Test fusion of DQs -> Conv -> Relu/Clip -> Q.