[QNN EP] Fix regression for MatMul with two quantized/dynamic uint16 inputs (#23419)

### Description
- Fixes regression for MatMul with two quantized/dynamic uint16 inputs.
We need to convert input[1] to uint8 to pass QNN validation.
- Splits the translation of `ONNX MatMul -> QNN MatMul` and `ONNX MatMul
-> QNN FullyConnected` into separate functions to make the code more
readable.


### Motivation and Context
The following PR updated the handling of MatMul. The logic to handle
MatMul with two non-const uint16 inputs was not ported from
[simple_op_builder.cc](c64fa18834/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc (L107))
to the new
[matmul_op_builder.cc](c64fa18834/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc (L57)).

https://github.com/microsoft/onnxruntime/pull/22639
This commit is contained in:
Adrian Lizarraga 2025-01-17 15:45:49 -08:00 committed by GitHub
parent d461ca9dcd
commit a9bf0bedd8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 359 additions and 170 deletions

View file

@ -13,11 +13,12 @@ namespace onnxruntime {
namespace qnn {
/**
* ONNX's MatMul supports 1D tensor as input on both size, but neither QNN's MatMul nor FullyConnected supports it.
* So we need to add Reshape Ops if necessary.
* An ONNX MatMul can be translated to either a QNN MatMul or a QNN FullyConnected.
* ONNX's MatMul supports inputs of rank 1, but neither QNN's MatMul nor FullyConnected support two rank 1 inputs.
* So, we need to add Reshape Ops if necessary.
* In two cases, FullyConnected (input_1's shape is [n, k]) is used instead of MatMul without extra Transpose Op:
* 1. input_1 is 2D initializer.
* 2. input_1 is 1D tensor.
* 1. input_1 is a rank 2 initializer.
* 2. input_1 is a rank 1 tensor.
*/
class MatMulOpBuilder : public BaseOpBuilder {
public:
@ -31,29 +32,149 @@ class MatMulOpBuilder : public BaseOpBuilder {
Status ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit,
std::vector<std::string>&& input_names, const logging::Logger& logger,
bool do_op_validation) const override ORT_MUST_USE_RESULT;
private:
Status ProcessInputsForQnnMatMul(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit,
const TensorInfo& input_info_0,
const TensorInfo& input_info_1,
const logging::Logger& logger,
std::vector<std::string>& input_names,
bool do_op_validation) const ORT_MUST_USE_RESULT;
Status ProcessInputsForQnnFullyConnected(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit,
const TensorInfo& input_info_0,
const TensorInfo& input_info_1,
const logging::Logger& logger,
std::vector<std::string>& input_names,
bool do_op_validation) const ORT_MUST_USE_RESULT;
};
namespace {
// Adds a QNN Convert node that converts a tensor from one quantization type (e.g., uint16) to another
// (e.g., uint8).
//
// The output quantization parameters are derived from the input's: the input's quantized min/max are
// dequantized with (input_offset, input_scale) and the resulting range is used to compute the scale/offset
// for output_qnn_data_type.
//
// convert_input_name:  name of the tensor fed into the Convert node. It is not added here.
// convert_output_name: name of the tensor produced by the Convert node. Also used as the node name.
// output_shape:        shape given to the newly added output tensor.
// Returns a non-OK status if computing quantization parameters, adding the tensor, or creating the node fails.
Status InsertConvertOp(QnnModelWrapper& qnn_model_wrapper,
                       const std::string& convert_input_name,
                       const std::string& convert_output_name,
                       Qnn_DataType_t input_qnn_data_type,
                       Qnn_DataType_t output_qnn_data_type,
                       int32_t input_offset,
                       float input_scale,
                       const std::vector<uint32_t>& output_shape,
                       bool do_op_validation) {
  // Assume input is already handled.
  float quant_min = 0.0f;
  float quant_max = 255.0f;
  ORT_RETURN_IF_ERROR(qnn::utils::GetQminQmax(input_qnn_data_type, quant_min, quant_max));

  // Range covered by the input's quantization parameters.
  const double real_min = qnn::utils::Dequantize(input_offset, input_scale, quant_min);
  const double real_max = qnn::utils::Dequantize(input_offset, input_scale, quant_max);

  // Quantization parameters for the same range expressed in the output data type.
  float output_scale = 0.0f;
  int32_t output_offset = 0;
  ORT_RETURN_IF_ERROR(qnn::utils::GetQuantParams(static_cast<float>(real_min),
                                                 static_cast<float>(real_max),
                                                 output_qnn_data_type,
                                                 output_scale,
                                                 output_offset));

  QnnTensorWrapper output_tensor(convert_output_name,
                                 QNN_TENSOR_TYPE_NATIVE,
                                 output_qnn_data_type,
                                 QnnQuantParamsWrapper(output_scale, output_offset),
                                 std::vector<uint32_t>(output_shape));
  ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(output_tensor)), "Failed to add tensor.");

  ORT_RETURN_IF_NOT(qnn_model_wrapper.CreateQnnNode(convert_output_name,
                                                    QNN_OP_PACKAGE_NAME_QTI_AISW,
                                                    "Convert",
                                                    {convert_input_name},
                                                    {convert_output_name},
                                                    {},
                                                    do_op_validation),
                    "Failed to add node.");
  return Status::OK();
}
// Returns true if the QNN data type is a 16-bit fixed-point (quantized) type, signed or unsigned.
inline bool IsQuant16bit(Qnn_DataType_t qnn_data_type) {
  switch (qnn_data_type) {
    case QNN_DATATYPE_UFIXED_POINT_16:
    case QNN_DATATYPE_SFIXED_POINT_16:
      return true;
    default:
      return false;
  }
}
// Retrieves the tensor info of both MatMul inputs and decides which QNN operator the ONNX MatMul is translated to.
//
// input_def_0 / input_def_1:   the ONNX MatMul's two inputs.
// input_info_0 / input_info_1: [out] filled in via QnnModelWrapper::GetTensorInfo().
// use_fully_connected:         [out] true to translate to QNN FullyConnected, false to translate to QNN MatMul.
// Returns a non-OK status if the tensor info of either input cannot be retrieved.
Status CheckInputs(const QnnModelWrapper& qnn_model_wrapper, const NodeUnitIODef& input_def_0,
                   const NodeUnitIODef& input_def_1, TensorInfo& input_info_0, TensorInfo& input_info_1,
                   bool& use_fully_connected) {
  ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(input_def_0, input_info_0));
  ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(input_def_1, input_info_1));

  // The major version is compared with `==` rather than `>=`: a later major API version whose minor number
  // happens to be <= 20 (e.g., 3.0) must not be mistaken for one of the older SDKs handled here.
  // NOTE(review): presumably QNN API version 2.20 corresponds to QNN SDK 2.27 -- confirm against the SDK headers.
#if QNN_API_VERSION_MAJOR == 2 && QNN_API_VERSION_MINOR <= 20
  // Validation crashes if QNN FullyConnected is used with QNN SDK versions 2.26 - 2.27.
  // Just use QNN MatMul for these older QNN SDK versions.
  use_fully_connected = false;
#else
  // Use FullyConnected if 2nd input is a rank 2 initializer or a rank 1 tensor.
  // FullyConnected cannot pass the Op validation if keep_dims is true, so if input_0 is per-channel quantized tensor
  // with rank > 2, it's not easy to set the quantization parameters for the output reshaped rank 2 tensor.
  // In this case, we will not use FullyConnected.
  use_fully_connected =
      (input_info_1.shape.size() == 2 && input_info_1.is_initializer) || input_info_1.shape.size() == 1;
  use_fully_connected =
      use_fully_connected && !(input_info_0.quant_param.IsPerChannel() && input_info_0.shape.size() > 2);

  // Don't use FullyConnected if both inputs are dynamic and 16-bit quantized.
  use_fully_connected = use_fully_connected && !(IsQuant16bit(input_info_0.qnn_data_type) &&
                                                 !input_info_0.is_initializer &&
                                                 IsQuant16bit(input_info_1.qnn_data_type) &&
                                                 !input_info_1.is_initializer);
#endif
  return Status::OK();
}
// Handles input[0] of an ONNX MatMul. Shared by the translation to QNN MatMul and to QNN FullyConnected.
//
// - An input that is not rank 1 is added to the model under its original name, unless a tensor wrapper with
//   that name already exists.
// - A rank 1 input of shape [k] is exposed as a [1, k] tensor named "<original name>_ort_qnn_ep_reshape":
//   an initializer is re-added as a static tensor with the new shape and quantization parameters, while a
//   non-initializer gets a Reshape node.
//
// The name of the tensor the QNN operator should consume is appended to input_names.
Status ProcessInput0(QnnModelWrapper& qnn_model_wrapper,
                     const TensorInfo& input_0_info,
                     const std::string& original_input_0_name,
                     std::vector<std::string>& input_names,
                     const logging::Logger& logger,
                     bool do_op_validation) {
  // No reshape needed: use the tensor as-is.
  if (input_0_info.shape.size() != 1) {
    if (!qnn_model_wrapper.IsQnnTensorWrapperExist(original_input_0_name)) {
      QnnTensorWrapper input_0_tensor;
      ORT_RETURN_IF_ERROR(qnn_model_wrapper.MakeTensorWrapper(input_0_info, original_input_0_name, input_0_tensor));
      ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(input_0_tensor)), "Failed to add tensor.");
    } else {
      LOGS(logger, VERBOSE) << "Tensor already added, skip it: " << original_input_0_name;
    }

    input_names.emplace_back(original_input_0_name);
    return Status::OK();
  }

  // Rank 1 input: [k] -> [1, k].
  const std::string reshaped_name = original_input_0_name + "_ort_qnn_ep_reshape";
  std::vector<uint32_t> reshaped_dims{1, input_0_info.shape[0]};
  QnnQuantParamsWrapper reshaped_quant_param = input_0_info.quant_param.Copy();
  ORT_RETURN_IF_ERROR(reshaped_quant_param.HandleUnsqueeze<uint32_t>(input_0_info.shape, reshaped_dims));

  if (!input_0_info.is_initializer) {
    // Dynamic input: insert a Reshape node that produces the [1, k] tensor.
    ORT_RETURN_IF_ERROR(qnn_model_wrapper.AddReshapeNode(original_input_0_name, reshaped_name,
                                                         input_0_info.shape, reshaped_dims,
                                                         input_0_info.qnn_data_type, input_0_info.quant_param,
                                                         reshaped_quant_param, do_op_validation,
                                                         qnn_model_wrapper.IsGraphInput(original_input_0_name), false));
  } else {
    // Initializer: unpack its data and add it directly with the new shape and quantization parameters.
    std::vector<uint8_t> initializer_bytes;
    ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_0_info.initializer_tensor, initializer_bytes));
    QnnTensorWrapper reshaped_tensor(reshaped_name, QNN_TENSOR_TYPE_STATIC, input_0_info.qnn_data_type,
                                     std::move(reshaped_quant_param), std::move(reshaped_dims),
                                     std::move(initializer_bytes));
    ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(reshaped_tensor)), "Failed to add tensor.");
  }

  input_names.emplace_back(reshaped_name);
  return Status::OK();
}
} // namespace
// Process operator inputs. Dispatches to other processing functions depending on whether we're
// translating an ONNX MatMul to a QNN MatMul or a QNN FullyConnected.
Status MatMulOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit,
const logging::Logger& logger, std::vector<std::string>& input_names,
bool do_op_validation) const {
@ -63,77 +184,55 @@ Status MatMulOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, const
bool use_fully_connected = false;
ORT_RETURN_IF_ERROR(
CheckInputs(qnn_model_wrapper, inputs[0], inputs[1], input_info_0, input_info_1, use_fully_connected));
bool reshape_input_0 = input_info_0.shape.size() == 1;
bool reshape_input_1 = input_info_1.shape.size() == 1;
// Process input 0.
const std::string& org_input_0_name = inputs[0].node_arg.Name();
std::string input_0_name = org_input_0_name;
if (reshape_input_0) {
input_0_name = org_input_0_name + "_ort_qnn_ep_reshape";
std::vector<uint32_t> shape_2d{1, input_info_0.shape[0]};
QnnQuantParamsWrapper quant_param_2d = input_info_0.quant_param.Copy();
ORT_RETURN_IF_ERROR(quant_param_2d.HandleUnsqueeze<uint32_t>(input_info_0.shape, shape_2d));
// If input_0 is initializer, unpack it and add the tensor with new quantization parameter and shape.
// Otherwise, add a Reshape node.
if (input_info_0.is_initializer) {
std::vector<uint8_t> unpacked_tensor;
ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_info_0.initializer_tensor, unpacked_tensor));
Qnn_TensorType_t tensor_type = qnn_model_wrapper.GetTensorType(org_input_0_name);
QnnTensorWrapper input_tensorwrapper(input_0_name, tensor_type, input_info_0.qnn_data_type,
std::move(quant_param_2d), std::move(shape_2d), std::move(unpacked_tensor));
ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(input_tensorwrapper)), "Failed to add tensor.");
} else {
ORT_RETURN_IF_ERROR(qnn_model_wrapper.AddReshapeNode(org_input_0_name, input_0_name, input_info_0.shape, shape_2d,
input_info_0.qnn_data_type, input_info_0.quant_param,
quant_param_2d, do_op_validation,
qnn_model_wrapper.IsGraphInput(org_input_0_name), false));
}
} else {
if (qnn_model_wrapper.IsQnnTensorWrapperExist(input_0_name)) {
LOGS(logger, VERBOSE) << "Tensor already added, skip it: " << input_0_name;
} else {
QnnTensorWrapper input_0_tensor;
ORT_RETURN_IF_ERROR(qnn_model_wrapper.MakeTensorWrapper(inputs[0], input_0_tensor));
ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(input_0_tensor)), "Failed to add tensor.");
}
if (use_fully_connected) {
return ProcessInputsForQnnFullyConnected(qnn_model_wrapper,
node_unit,
input_info_0,
input_info_1,
logger,
input_names,
do_op_validation);
}
input_names.emplace_back(input_0_name);
return ProcessInputsForQnnMatMul(qnn_model_wrapper,
node_unit,
input_info_0,
input_info_1,
logger,
input_names,
do_op_validation);
}
Status MatMulOpBuilder::ProcessInputsForQnnMatMul(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit,
const TensorInfo& input_info_0,
const TensorInfo& input_info_1,
const logging::Logger& logger,
std::vector<std::string>& input_names,
bool do_op_validation) const {
const auto& inputs = node_unit.Inputs();
const bool reshape_input_1 = input_info_1.shape.size() == 1;
const std::string& org_input_0_name = inputs[0].node_arg.Name();
ORT_RETURN_IF_ERROR(ProcessInput0(qnn_model_wrapper, input_info_0, org_input_0_name, input_names,
logger, do_op_validation));
// Process input 1.
const std::string& org_input_1_name = inputs[1].node_arg.Name();
std::string input_1_name = org_input_1_name;
if (reshape_input_1 || use_fully_connected) {
if (reshape_input_1) {
// Input[1] is a rank 1 tensor that needs to be reshaped.
std::vector<uint32_t> shape_2d;
QnnQuantParamsWrapper quant_param_2d = input_info_1.quant_param.Copy();
if (reshape_input_1) {
// Input is 1D tensor.
input_1_name = org_input_1_name + "_ort_qnn_ep_reshape";
if (use_fully_connected) {
// FullyConnected requires input_1's shape to be [n, k].
shape_2d = {1, input_info_1.shape[0]};
} else {
shape_2d = {input_info_1.shape[0], 1};
}
ORT_RETURN_IF_ERROR(quant_param_2d.HandleUnsqueeze<uint32_t>(input_info_1.shape, shape_2d));
} else {
input_1_name = org_input_1_name + "_ort_qnn_ep_transpose";
shape_2d = {input_info_1.shape[1], input_info_1.shape[0]};
ORT_RETURN_IF_ERROR(quant_param_2d.HandleTranspose<uint32_t>(std::vector<uint32_t>({1, 0})));
}
input_1_name = org_input_1_name + "_ort_qnn_ep_reshape";
shape_2d = {input_info_1.shape[0], 1};
ORT_RETURN_IF_ERROR(quant_param_2d.HandleUnsqueeze<uint32_t>(input_info_1.shape, shape_2d));
// If input_1 is initializer, unpack it and add the tensor with new quantization parameter and shape.
// Otherwise, add a Reshape node.
if (input_info_1.is_initializer) {
std::vector<uint8_t> unpacked_tensor;
if (use_fully_connected && !reshape_input_1) {
// 2D initializer should be transposed to [n, k].
ORT_RETURN_IF_ERROR(TwoDimensionTranspose(qnn_model_wrapper, input_info_1.shape,
*input_info_1.initializer_tensor, unpacked_tensor));
} else {
ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_info_1.initializer_tensor, unpacked_tensor));
}
ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_info_1.initializer_tensor, unpacked_tensor));
Qnn_TensorType_t tensor_type = qnn_model_wrapper.GetTensorType(org_input_1_name);
QnnTensorWrapper input_tensorwrapper(input_1_name, tensor_type, input_info_1.qnn_data_type,
@ -156,6 +255,108 @@ Status MatMulOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, const
}
input_names.emplace_back(input_1_name);
// Workaround that inserts a QNN Convert op before input[1] (converts from quantized uint16 to quantized uint8)
// to avoid a QNN validation failure.
//
// QNN graph WITHOUT workaround (fails validation):
// input_0_uint16 ---> MatMul ---> output_uint16
// ^
// |
// input_1_uint16 -----+
//
// QNN graph WITH workaround (passes validation):
// input_0_uint16 ----------------------> MatMul ---> output_uint16
// ^
// |
// input_1_uint16 --> Convert(to uint8) --+
if (!input_info_0.is_initializer && !input_info_1.is_initializer &&
input_info_0.qnn_data_type == input_info_1.qnn_data_type &&
input_info_0.qnn_data_type == QNN_DATATYPE_UFIXED_POINT_16) {
ORT_RETURN_IF_NOT(input_info_1.quant_param.IsPerTensor(),
"MatMul's activation inputs only support per-tensor quantization");
const Qnn_QuantizeParams_t& quant_param = input_info_1.quant_param.Get();
// insert Convert op after input1
std::string convert_input_name = input_names.back();
input_names.pop_back();
const std::string& matmul_output_name = node_unit.Outputs()[0].node_arg.Name();
std::string convert_output_name = convert_input_name + "_convert_" + matmul_output_name;
std::vector<uint32_t> input_1_shape = input_info_1.shape;
if (reshape_input_1) {
input_1_shape = {input_info_1.shape[0], 1};
}
ORT_RETURN_IF_ERROR(InsertConvertOp(qnn_model_wrapper,
convert_input_name,
convert_output_name,
input_info_1.qnn_data_type,
QNN_DATATYPE_UFIXED_POINT_8,
quant_param.scaleOffsetEncoding.offset,
quant_param.scaleOffsetEncoding.scale,
input_1_shape,
do_op_validation));
input_names.push_back(convert_output_name);
}
return Status::OK();
}
// Processes both inputs of an ONNX MatMul that is being translated to a QNN FullyConnected.
// Input[0] is delegated to ProcessInput0(). Input[1] is brought into the [n, k] layout that FullyConnected
// requires: a rank 1 input of shape [k] is reshaped to [1, k], and a rank 2 input is transposed.
// The names of the tensors the QNN operator should consume are appended to input_names (input[0] first).
// Returns a non-OK status if any tensor or node could not be created.
Status MatMulOpBuilder::ProcessInputsForQnnFullyConnected(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit,
const TensorInfo& input_info_0,
const TensorInfo& input_info_1,
const logging::Logger& logger,
std::vector<std::string>& input_names,
bool do_op_validation) const {
const auto& inputs = node_unit.Inputs();
const bool reshape_input_1 = input_info_1.shape.size() == 1;
const std::string& org_input_0_name = inputs[0].node_arg.Name();
ORT_RETURN_IF_ERROR(ProcessInput0(qnn_model_wrapper, input_info_0, org_input_0_name, input_names,
logger, do_op_validation));
// Process input 1.
const std::string& org_input_1_name = inputs[1].node_arg.Name();
std::string input_1_name = org_input_1_name;
std::vector<uint32_t> shape_2d;
QnnQuantParamsWrapper quant_param_2d = input_info_1.quant_param.Copy();
if (reshape_input_1) {
// Input[1] is a rank 1 tensor that needs to be reshaped.
input_1_name = org_input_1_name + "_ort_qnn_ep_reshape";
// FullyConnected requires input_1's shape to be [n, k].
shape_2d = {1, input_info_1.shape[0]};
ORT_RETURN_IF_ERROR(quant_param_2d.HandleUnsqueeze<uint32_t>(input_info_1.shape, shape_2d));
} else {
// CheckInputs() only selects FullyConnected for a rank 1 input[1] or a rank 2 initializer, so a
// non-rank-1 input[1] is expected to be rank 2 here. The assert is not checked in builds that define NDEBUG.
assert(input_info_1.shape.size() == 2);
input_1_name = org_input_1_name + "_ort_qnn_ep_transpose";
// Swap the two dimensions and update the quantization parameters to match the transposed layout.
shape_2d = {input_info_1.shape[1], input_info_1.shape[0]};
ORT_RETURN_IF_ERROR(quant_param_2d.HandleTranspose<uint32_t>(std::vector<uint32_t>({1, 0})));
}
// If input_1 is initializer, unpack it and add the tensor with new quantization parameter and shape.
// Otherwise, add a Reshape node.
if (input_info_1.is_initializer) {
std::vector<uint8_t> unpacked_tensor;
if (!reshape_input_1) {
// 2D initializer should be transposed to [n, k].
// TwoDimensionTranspose is given a copy of the shape because input_info_1 is const.
std::vector<uint32_t> original_shape_copy = input_info_1.shape;
ORT_RETURN_IF_ERROR(TwoDimensionTranspose(qnn_model_wrapper,
original_shape_copy, // Will be modified to new shape (unnecessary)
*input_info_1.initializer_tensor,
unpacked_tensor));
} else {
// Rank 1 initializer: [k] and [1, k] hold the same data, so it is only unpacked.
ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_info_1.initializer_tensor, unpacked_tensor));
}
Qnn_TensorType_t tensor_type = qnn_model_wrapper.GetTensorType(org_input_1_name);
QnnTensorWrapper input_tensorwrapper(input_1_name, tensor_type, input_info_1.qnn_data_type,
std::move(quant_param_2d), std::move(shape_2d), std::move(unpacked_tensor));
ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(input_tensorwrapper)), "Failed to add tensor.");
} else {
// Non-initializer input[1]: insert a Reshape node from the original shape to shape_2d.
ORT_RETURN_IF_ERROR(qnn_model_wrapper.AddReshapeNode(org_input_1_name, input_1_name, input_info_1.shape, shape_2d,
input_info_1.qnn_data_type, input_info_1.quant_param,
quant_param_2d, do_op_validation,
qnn_model_wrapper.IsGraphInput(org_input_1_name), false));
}
input_names.emplace_back(input_1_name);
return Status::OK();
}
@ -172,6 +373,24 @@ Status MatMulOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_w
bool reshape_input_1 = input_info_1.shape.size() == 1;
bool reshape_output = reshape_input_0 || reshape_input_1 || (use_fully_connected && input_info_0.shape.size() > 2);
// For QNN MatMul: set the input transpose parameters to their default values of 0. These parameters should be
// optional, but older versions of QNN SDK failed validation if not explicitly provided.
std::vector<std::string> param_tensor_names;
if (!use_fully_connected) {
Qnn_Scalar_t scalar_param = QNN_SCALAR_INIT;
scalar_param.dataType = QNN_DATATYPE_BOOL_8;
scalar_param.bool8Value = 0;
QnnParamWrapper transpose_in0_param(node_unit.Index(), node_unit.Name(), QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0,
scalar_param);
param_tensor_names.push_back(transpose_in0_param.GetParamTensorName());
qnn_model_wrapper.AddParamWrapper(std::move(transpose_in0_param));
QnnParamWrapper transpose_in1_param(node_unit.Index(), node_unit.Name(), QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1,
scalar_param);
param_tensor_names.push_back(transpose_in1_param.GetParamTensorName());
qnn_model_wrapper.AddParamWrapper(std::move(transpose_in1_param));
}
const std::string& org_output_name = node_unit.Outputs()[0].node_arg.Name();
std::string op_output_name = org_output_name;
TensorInfo output_info{};
@ -207,7 +426,8 @@ Status MatMulOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_w
"Failed to add output tensor.");
ORT_RETURN_IF_NOT(qnn_model_wrapper.CreateQnnNode(utils::GetNodeName(node_unit), QNN_OP_PACKAGE_NAME_QTI_AISW,
use_fully_connected ? QNN_OP_FULLY_CONNECTED : QNN_OP_MAT_MUL,
std::move(input_names), {op_output_name}, {}, do_op_validation),
std::move(input_names), {op_output_name},
std::move(param_tensor_names), do_op_validation),
"Failed to add fused Matmul node.");
if (reshape_output) {

View file

@ -22,11 +22,6 @@ class SimpleOpBuilder : public BaseOpBuilder {
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(SimpleOpBuilder);
protected:
Status ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit,
const logging::Logger& logger,
std::vector<std::string>& input_names,
bool do_op_validation) const override ORT_MUST_USE_RESULT;
Status ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit,
std::vector<std::string>&& input_names,
@ -53,91 +48,6 @@ class SimpleOpBuilder : public BaseOpBuilder {
static constexpr std::array<std::string_view, 3> gridsample_supported_padding_modes = {"zeros", "border", "reflection"};
};
// Move to qnn_utils if it's re-usable
// Adds a QNN "Convert" node that takes convert_input_name and produces a new tensor convert_output_name of
// type output_qnn_data_type and shape output_shape.
// The output's scale/offset are computed from the range obtained by dequantizing the input type's
// quantized min/max with (input_offset, input_scale).
// Returns a non-OK status if quantization parameters cannot be computed, or the tensor/node cannot be added.
Status InsertConvertOp(QnnModelWrapper& qnn_model_wrapper,
const std::string& convert_input_name,
const std::string& convert_output_name,
Qnn_DataType_t input_qnn_data_type,
Qnn_DataType_t output_qnn_data_type,
int32_t input_offset,
float input_scale,
const std::vector<uint32_t>& output_shape,
bool do_op_validation) {
// Assume input is already handled.
float qmin = 0.0f;
float qmax = 255.0f;
ORT_RETURN_IF_ERROR(qnn::utils::GetQminQmax(input_qnn_data_type, qmin, qmax));
// Range covered by the input's quantization parameters.
double value_min = qnn::utils::Dequantize(input_offset, input_scale, qmin);
double value_max = qnn::utils::Dequantize(input_offset, input_scale, qmax);
// Quantization parameters for the same range in the output data type.
float scale = 0.0f;
int32_t offset = 0;
ORT_RETURN_IF_ERROR(qnn::utils::GetQuantParams(static_cast<float>(value_min),
static_cast<float>(value_max),
output_qnn_data_type,
scale,
offset));
// The shape is copied because it is moved into the tensor wrapper.
std::vector<uint32_t> output_shape_copy = output_shape;
QnnTensorWrapper convert_output_tensorwrapper(convert_output_name,
QNN_TENSOR_TYPE_NATIVE,
output_qnn_data_type,
QnnQuantParamsWrapper(scale, offset),
std::move(output_shape_copy));
ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(convert_output_tensorwrapper)), "Failed to add tensor.");
// The Convert node is named after its output tensor.
ORT_RETURN_IF_NOT(qnn_model_wrapper.CreateQnnNode(convert_output_name,
QNN_OP_PACKAGE_NAME_QTI_AISW,
"Convert",
{convert_input_name},
{convert_output_name},
{},
do_op_validation),
"Failed to add node.");
return Status::OK();
}
// Processes the node's inputs via BaseOpBuilder::ProcessInputs() and then applies a MatMul-specific step:
// when both MatMul inputs are non-initializers of type QNN_DATATYPE_UFIXED_POINT_16, the last entry of
// input_names is replaced by the output of a Convert op that converts it to QNN_DATATYPE_UFIXED_POINT_8.
// Returns a non-OK status if base processing fails, tensor info cannot be retrieved, input[1] is not
// per-tensor quantized, or the Convert op cannot be inserted.
Status SimpleOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit,
const logging::Logger& logger,
std::vector<std::string>& input_names,
bool do_op_validation) const {
const std::string& op_type = node_unit.OpType();
ORT_RETURN_IF_ERROR(BaseOpBuilder::ProcessInputs(qnn_model_wrapper, node_unit, logger, input_names, do_op_validation));
if (op_type == "MatMul") {
const auto& inputs = node_unit.Inputs();
TensorInfo input0_info = {};
TensorInfo input1_info = {};
ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[0], input0_info));
ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[1], input1_info));
// Need to insert Convert op if both inputs are dynamic inputs and are ufixed_16
if (!input0_info.is_initializer && !input1_info.is_initializer &&
input0_info.qnn_data_type == input1_info.qnn_data_type &&
input0_info.qnn_data_type == QNN_DATATYPE_UFIXED_POINT_16) {
ORT_RETURN_IF_NOT(input1_info.quant_param.IsPerTensor(),
"MatMul's activation inputs only support per-tensor quantization");
const Qnn_QuantizeParams_t& quant_param = input1_info.quant_param.Get();
// insert Convert op after input1
// The last name in input_names is taken as input[1] and swapped for the Convert op's output below.
std::string convert_input_name = input_names.back();
input_names.pop_back();
const std::string& matmul_output_name = node_unit.Outputs()[0].node_arg.Name();
// The MatMul output name is appended to the Convert output name.
// NOTE(review): presumably to keep the name unique when several MatMuls share input[1] -- confirm.
std::string convert_output_name = convert_input_name + "_convert_" + matmul_output_name;
ORT_RETURN_IF_ERROR(InsertConvertOp(qnn_model_wrapper,
convert_input_name,
convert_output_name,
input1_info.qnn_data_type,
QNN_DATATYPE_UFIXED_POINT_8,
quant_param.scaleOffsetEncoding.offset,
quant_param.scaleOffsetEncoding.scale,
input1_info.shape,
do_op_validation));
input_names.push_back(convert_output_name);
}
}
return Status::OK();
}
Status SimpleOpBuilder::ExplicitOpCheck(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit) const {
const std::string& op_type = node_unit.OpType();
@ -378,19 +288,6 @@ Status SimpleOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_w
ORT_RETURN_IF(norm_p_order != 2, "QNN EP only supports LpNormalization with 'p' attribute equal to 2.");
}
if (op_type == "MatMul") {
Qnn_Scalar_t scalar_param = QNN_SCALAR_INIT;
scalar_param.dataType = QNN_DATATYPE_BOOL_8;
scalar_param.bool8Value = 0;
QnnParamWrapper transpose_in0_param(node_unit.Index(), node_unit.Name(), QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0, scalar_param);
param_tensor_names.push_back(transpose_in0_param.GetParamTensorName());
qnn_model_wrapper.AddParamWrapper(std::move(transpose_in0_param));
QnnParamWrapper transpose_in1_param(node_unit.Index(), node_unit.Name(), QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, scalar_param);
param_tensor_names.push_back(transpose_in1_param.GetParamTensorName());
qnn_model_wrapper.AddParamWrapper(std::move(transpose_in1_param));
}
if (op_type == "LeakyRelu") {
std::string input_name = "alpha";
ORT_RETURN_IF_ERROR(ProcessAlphaAttributeAsInput(qnn_model_wrapper, node_unit, input_name));

View file

@ -75,6 +75,20 @@ Status QnnModelWrapper::MakeTensorWrapper(const NodeUnitIODef& tensor, QnnTensor
return Status::OK();
}
// Builds a QnnTensorWrapper named tensor_name from already-retrieved tensor info.
//
// The data type, quantization parameters and shape are taken from tensor_info; the tensor type is looked up
// with GetTensorType(tensor_name). If tensor_info describes an initializer, its data is unpacked and attached
// to the wrapper; otherwise the wrapper is created with an empty data buffer.
// Returns a non-OK status if the initializer data cannot be unpacked.
Status QnnModelWrapper::MakeTensorWrapper(const TensorInfo& tensor_info,
                                          const std::string& tensor_name,
                                          QnnTensorWrapper& tensor_wrapper) const {
  std::vector<uint8_t> initializer_bytes;
  if (tensor_info.is_initializer) {
    ORT_RETURN_IF_ERROR(UnpackInitializerData(*tensor_info.initializer_tensor, initializer_bytes));
  }

  const auto tensor_type = GetTensorType(tensor_name);
  std::vector<uint32_t> shape_copy(tensor_info.shape);

  tensor_wrapper = QnnTensorWrapper(tensor_name,
                                    tensor_type,
                                    tensor_info.qnn_data_type,
                                    tensor_info.quant_param.Copy(),
                                    std::move(shape_copy),
                                    std::move(initializer_bytes));
  return Status::OK();
}
bool QnnModelWrapper::AddTensorWrapper(QnnTensorWrapper&& tensor_wrapper) {
// Keep a copy of tensor name since it will be moved with the wrapper into model_tensors_map_
std::string tensor_name = tensor_wrapper.GetName();

View file

@ -66,6 +66,9 @@ class QnnModelWrapper {
// Make a QnnTensorWrapper from an onnx input or output.
Status MakeTensorWrapper(const NodeUnitIODef& tensor, QnnTensorWrapper& tensor_wrapper) const;
Status MakeTensorWrapper(const TensorInfo& tensor_info,
const std::string& tensor_name,
QnnTensorWrapper& tensor_wrapper) const;
// Add to internal tensor wrapper table
bool AddTensorWrapper(QnnTensorWrapper&& tensor_wrapper);

View file

@ -290,10 +290,65 @@ TEST_F(QnnHTPBackendTests, MatMulOp_QDQ) {
RunQDQPerChannelMatMulOpTest<uint16_t, Int4x2, uint16_t>({2, 3, 3, 3}, {3, 2}, -1, QDQTolerance(),
ExpectedEPNodeAssignment::All, 18, true);
// // UINT16, per-channel INT8 weight
// UINT16, per-channel INT8 weight
RunQDQPerChannelMatMulOpTest<uint16_t, int8_t, uint16_t>({2, 3}, {3, 2}, 1, QDQTolerance(),
ExpectedEPNodeAssignment::All, 21, false, false);
RunQDQPerChannelMatMulOpTest<uint16_t, int8_t, uint16_t>({2, 3, 3}, {3}, -1);
RunQDQPerChannelMatMulOpTest<uint16_t, int8_t, uint16_t>({2, 3, 3}, {3}, -1, QDQTolerance(0.0041f));
}
// Regression test: MatMul whose two inputs are both dynamic (non-initializer) and uint16 quantized.
// This exercises a workaround in QNN EP that inserts a QNN Convert op before input[1] (converts from uint16 to
// uint8), which prevents a validation error for this specific MatMul configuration.
// The shapes and input ranges (quant params) come from a customer model.
TEST_F(QnnHTPBackendTests, MatMulOp_QDQ_Regression_uint16_dynamic_inputs) {
  ProviderOptions provider_options;
#if defined(_WIN32)
  provider_options["backend_path"] = "QnnHtp.dll";
#else
  provider_options["backend_path"] = "libQnnHtp.so";
#endif

  // Number of elements in a tensor of the given shape.
  const auto element_count = [](const std::vector<int64_t>& shape) {
    return static_cast<size_t>(std::accumulate(shape.begin(), shape.end(), static_cast<int64_t>(1),
                                               std::multiplies<int64_t>()));
  };

  // Runs the float model and the uint16 QDQ model for the given pair of dynamic input shapes.
  const auto run_case = [&](const std::vector<int64_t>& shape_0, const std::vector<int64_t>& shape_1) {
    TestInputDef<float> input0_def(shape_0, false,
                                   GetFloatDataInRange(-5.087f, 4.992f, element_count(shape_0)));
    TestInputDef<float> input1_def(shape_1, false,
                                   GetFloatDataInRange(-6.772f, 7.258f, element_count(shape_1)));

    TestQDQModelAccuracy(
        BuildMatMulOpTestCase(input0_def, input1_def),
        BuildMatMulOpQDQTestCase<uint16_t, uint16_t, uint16_t>(input0_def, input1_def, false),
        provider_options, 21, ExpectedEPNodeAssignment::All, QDQTolerance());
  };

  // Test with rank 4 inputs
  run_case({1, 12, 512, 96}, {1, 12, 96, 512});

  // Test with input[1] as rank 1
  run_case({1, 12, 512, 96}, {96});
}
#endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)