mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-31 23:27:43 +00:00
[QNN EP] Fix regression for MatMul with two quantized/dynamic uint16 inputs (#23419)
### Description - Fixes regression for MatMul with two quantized/dynamic uint16 inputs. We need to convert input[1] to uint8 to pass QNN validation. - Separates translation of `ONNX MatMul -> QNN MatMul` and `ONNX MatMul -> QNN FullyConnected` to separate functions to make the code more readable. ### Motivation and Context The following PR updated the handling of MatMul. The logic to handle MatMul with two non-const uint16 inputs was not ported from [simple_op_builder.cc](c64fa18834/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc (L107)) to the new [matmul_op_builder.cc](c64fa18834/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc (L57)). https://github.com/microsoft/onnxruntime/pull/22639
This commit is contained in:
parent
d461ca9dcd
commit
a9bf0bedd8
5 changed files with 359 additions and 170 deletions
|
|
@ -13,11 +13,12 @@ namespace onnxruntime {
|
|||
namespace qnn {
|
||||
|
||||
/**
|
||||
* ONNX's MatMul supports a 1D tensor as input on both sides, but neither QNN's MatMul nor FullyConnected supports it.
|
||||
* So we need to add Reshape Ops if necessary.
|
||||
* An ONNX MatMul can be translated to either a QNN MatMul or a QNN FullyConnected.
|
||||
* ONNX's MatMul supports inputs of rank 1, but neither QNN's MatMul nor FullyConnected support two rank 1 inputs.
|
||||
* So, we need to add Reshape Ops if necessary.
|
||||
* In two cases, FullyConnected (input_1's shape is [n, k]) is used instead of MatMul without extra Transpose Op:
|
||||
* 1. input_1 is 2D initializer.
|
||||
* 2. input_1 is 1D tensor.
|
||||
* 1. input_1 is a rank 2 initializer.
|
||||
* 2. input_1 is a rank 1 tensor.
|
||||
*/
|
||||
class MatMulOpBuilder : public BaseOpBuilder {
|
||||
public:
|
||||
|
|
@ -31,29 +32,149 @@ class MatMulOpBuilder : public BaseOpBuilder {
|
|||
Status ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit,
|
||||
std::vector<std::string>&& input_names, const logging::Logger& logger,
|
||||
bool do_op_validation) const override ORT_MUST_USE_RESULT;
|
||||
|
||||
private:
|
||||
Status ProcessInputsForQnnMatMul(QnnModelWrapper& qnn_model_wrapper,
|
||||
const NodeUnit& node_unit,
|
||||
const TensorInfo& input_info_0,
|
||||
const TensorInfo& input_info_1,
|
||||
const logging::Logger& logger,
|
||||
std::vector<std::string>& input_names,
|
||||
bool do_op_validation) const ORT_MUST_USE_RESULT;
|
||||
Status ProcessInputsForQnnFullyConnected(QnnModelWrapper& qnn_model_wrapper,
|
||||
const NodeUnit& node_unit,
|
||||
const TensorInfo& input_info_0,
|
||||
const TensorInfo& input_info_1,
|
||||
const logging::Logger& logger,
|
||||
std::vector<std::string>& input_names,
|
||||
bool do_op_validation) const ORT_MUST_USE_RESULT;
|
||||
};
|
||||
|
||||
namespace {
|
||||
|
||||
// Inserts a QNN Convert operator to convert from one quantization type (e.g., uint16) to another (e.g., uint8).
|
||||
Status InsertConvertOp(QnnModelWrapper& qnn_model_wrapper,
|
||||
const std::string& convert_input_name,
|
||||
const std::string& convert_output_name,
|
||||
Qnn_DataType_t input_qnn_data_type,
|
||||
Qnn_DataType_t output_qnn_data_type,
|
||||
int32_t input_offset,
|
||||
float input_scale,
|
||||
const std::vector<uint32_t>& output_shape,
|
||||
bool do_op_validation) {
|
||||
// Assume input is already handled.
|
||||
float qmin = 0.0f;
|
||||
float qmax = 255.0f;
|
||||
ORT_RETURN_IF_ERROR(qnn::utils::GetQminQmax(input_qnn_data_type, qmin, qmax));
|
||||
double value_min = qnn::utils::Dequantize(input_offset, input_scale, qmin);
|
||||
double value_max = qnn::utils::Dequantize(input_offset, input_scale, qmax);
|
||||
float scale = 0.0f;
|
||||
int32_t offset = 0;
|
||||
ORT_RETURN_IF_ERROR(qnn::utils::GetQuantParams(static_cast<float>(value_min),
|
||||
static_cast<float>(value_max),
|
||||
output_qnn_data_type,
|
||||
scale,
|
||||
offset));
|
||||
|
||||
std::vector<uint32_t> output_shape_copy = output_shape;
|
||||
QnnTensorWrapper convert_output_tensorwrapper(convert_output_name,
|
||||
QNN_TENSOR_TYPE_NATIVE,
|
||||
output_qnn_data_type,
|
||||
QnnQuantParamsWrapper(scale, offset),
|
||||
std::move(output_shape_copy));
|
||||
ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(convert_output_tensorwrapper)), "Failed to add tensor.");
|
||||
|
||||
ORT_RETURN_IF_NOT(qnn_model_wrapper.CreateQnnNode(convert_output_name,
|
||||
QNN_OP_PACKAGE_NAME_QTI_AISW,
|
||||
"Convert",
|
||||
{convert_input_name},
|
||||
{convert_output_name},
|
||||
{},
|
||||
do_op_validation),
|
||||
"Failed to add node.");
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Returns true if the QNN data type is a 16-bit fixed-point (quantized) type, signed or unsigned.
inline bool IsQuant16bit(Qnn_DataType_t qnn_data_type) {
  const bool is_unsigned_16bit = (qnn_data_type == QNN_DATATYPE_UFIXED_POINT_16);
  const bool is_signed_16bit = (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_16);
  return is_unsigned_16bit || is_signed_16bit;
}
|
||||
|
||||
// Retrieves the tensor info for both MatMul inputs and decides whether the ONNX MatMul should be
// translated to a QNN FullyConnected (use_fully_connected == true) or a QNN MatMul.
//
// Outputs: input_info_0, input_info_1 and use_fully_connected are filled in on success.
// Returns an error Status if either input's tensor info cannot be retrieved.
Status CheckInputs(const QnnModelWrapper& qnn_model_wrapper, const NodeUnitIODef& input_def_0,
                   const NodeUnitIODef& input_def_1, TensorInfo& input_info_0, TensorInfo& input_info_1,
                   bool& use_fully_connected) {
  ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(input_def_0, input_info_0));
  ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(input_def_1, input_info_1));

#if QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR <= 20
  // Validation crashes if use QNN FullyConnected in QNN SDK versions 2.26 - 2.27
  // Just use QNN MatMul for these older QNN SDK versions.
  // NOTE(review): the guard tests the QNN API version while the comment quotes SDK versions;
  // confirm the mapping between the two.
  use_fully_connected = false;
#else
  // Use FullyConnected if the 2nd input is a rank 2 initializer or a rank 1 tensor.
  const auto input_1_rank = input_info_1.shape.size();
  bool fully_connected_ok = (input_1_rank == 1) || (input_1_rank == 2 && input_info_1.is_initializer);

  // FullyConnected cannot pass the Op validation if keep_dims is true, so if input_0 is a per-channel
  // quantized tensor with rank > 2, it's not easy to set the quantization parameters for the output
  // reshaped rank 2 tensor. In this case, we will not use FullyConnected.
  if (fully_connected_ok && input_info_0.quant_param.IsPerChannel() && input_info_0.shape.size() > 2) {
    fully_connected_ok = false;
  }

  // Don't use FullyConnected if both inputs are dynamic and 16-bit quantized.
  if (fully_connected_ok) {
    const bool input_0_dynamic_16bit = IsQuant16bit(input_info_0.qnn_data_type) && !input_info_0.is_initializer;
    const bool input_1_dynamic_16bit = IsQuant16bit(input_info_1.qnn_data_type) && !input_info_1.is_initializer;
    if (input_0_dynamic_16bit && input_1_dynamic_16bit) {
      fully_connected_ok = false;
    }
  }

  use_fully_connected = fully_connected_ok;
#endif
  return Status::OK();
}
|
||||
|
||||
// Process input[0] for ONNX MatMul that can be translated to either a QNN MatMul or a QNN FullyConnected.
|
||||
Status ProcessInput0(QnnModelWrapper& qnn_model_wrapper,
|
||||
const TensorInfo& input_0_info,
|
||||
const std::string& original_input_0_name,
|
||||
std::vector<std::string>& input_names,
|
||||
const logging::Logger& logger,
|
||||
bool do_op_validation) {
|
||||
bool reshape_input_0 = input_0_info.shape.size() == 1;
|
||||
std::string actual_input_0_name = original_input_0_name;
|
||||
|
||||
if (reshape_input_0) {
|
||||
actual_input_0_name = original_input_0_name + "_ort_qnn_ep_reshape";
|
||||
std::vector<uint32_t> shape_2d{1, input_0_info.shape[0]};
|
||||
QnnQuantParamsWrapper quant_param_2d = input_0_info.quant_param.Copy();
|
||||
ORT_RETURN_IF_ERROR(quant_param_2d.HandleUnsqueeze<uint32_t>(input_0_info.shape, shape_2d));
|
||||
|
||||
// If input_0 is initializer, unpack it and add the tensor with new quantization parameter and shape.
|
||||
// Otherwise, add a Reshape node.
|
||||
if (input_0_info.is_initializer) {
|
||||
std::vector<uint8_t> unpacked_tensor;
|
||||
ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_0_info.initializer_tensor, unpacked_tensor));
|
||||
QnnTensorWrapper input_tensorwrapper(actual_input_0_name, QNN_TENSOR_TYPE_STATIC, input_0_info.qnn_data_type,
|
||||
std::move(quant_param_2d), std::move(shape_2d), std::move(unpacked_tensor));
|
||||
ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(input_tensorwrapper)), "Failed to add tensor.");
|
||||
} else {
|
||||
ORT_RETURN_IF_ERROR(qnn_model_wrapper.AddReshapeNode(original_input_0_name, actual_input_0_name,
|
||||
input_0_info.shape, shape_2d,
|
||||
input_0_info.qnn_data_type, input_0_info.quant_param,
|
||||
quant_param_2d, do_op_validation,
|
||||
qnn_model_wrapper.IsGraphInput(original_input_0_name), false));
|
||||
}
|
||||
} else {
|
||||
if (qnn_model_wrapper.IsQnnTensorWrapperExist(actual_input_0_name)) {
|
||||
LOGS(logger, VERBOSE) << "Tensor already added, skip it: " << actual_input_0_name;
|
||||
} else {
|
||||
QnnTensorWrapper input_0_tensor;
|
||||
ORT_RETURN_IF_ERROR(qnn_model_wrapper.MakeTensorWrapper(input_0_info, actual_input_0_name, input_0_tensor));
|
||||
ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(input_0_tensor)), "Failed to add tensor.");
|
||||
}
|
||||
}
|
||||
input_names.emplace_back(actual_input_0_name);
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
} // namespace
|
||||
|
||||
// Process operator inputs. Dispatches to other processing functions depending on whether we're
|
||||
// translating an ONNX MatMul to a QNN MatMul or a QNN FullyConnected.
|
||||
Status MatMulOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit,
|
||||
const logging::Logger& logger, std::vector<std::string>& input_names,
|
||||
bool do_op_validation) const {
|
||||
|
|
@ -63,77 +184,55 @@ Status MatMulOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, const
|
|||
bool use_fully_connected = false;
|
||||
ORT_RETURN_IF_ERROR(
|
||||
CheckInputs(qnn_model_wrapper, inputs[0], inputs[1], input_info_0, input_info_1, use_fully_connected));
|
||||
bool reshape_input_0 = input_info_0.shape.size() == 1;
|
||||
bool reshape_input_1 = input_info_1.shape.size() == 1;
|
||||
|
||||
// Process input 0.
|
||||
const std::string& org_input_0_name = inputs[0].node_arg.Name();
|
||||
std::string input_0_name = org_input_0_name;
|
||||
if (reshape_input_0) {
|
||||
input_0_name = org_input_0_name + "_ort_qnn_ep_reshape";
|
||||
std::vector<uint32_t> shape_2d{1, input_info_0.shape[0]};
|
||||
QnnQuantParamsWrapper quant_param_2d = input_info_0.quant_param.Copy();
|
||||
ORT_RETURN_IF_ERROR(quant_param_2d.HandleUnsqueeze<uint32_t>(input_info_0.shape, shape_2d));
|
||||
|
||||
// If input_0 is initializer, unpack it and add the tensor with new quantization parameter and shape.
|
||||
// Otherwise, add a Reshape node.
|
||||
if (input_info_0.is_initializer) {
|
||||
std::vector<uint8_t> unpacked_tensor;
|
||||
ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_info_0.initializer_tensor, unpacked_tensor));
|
||||
Qnn_TensorType_t tensor_type = qnn_model_wrapper.GetTensorType(org_input_0_name);
|
||||
QnnTensorWrapper input_tensorwrapper(input_0_name, tensor_type, input_info_0.qnn_data_type,
|
||||
std::move(quant_param_2d), std::move(shape_2d), std::move(unpacked_tensor));
|
||||
ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(input_tensorwrapper)), "Failed to add tensor.");
|
||||
} else {
|
||||
ORT_RETURN_IF_ERROR(qnn_model_wrapper.AddReshapeNode(org_input_0_name, input_0_name, input_info_0.shape, shape_2d,
|
||||
input_info_0.qnn_data_type, input_info_0.quant_param,
|
||||
quant_param_2d, do_op_validation,
|
||||
qnn_model_wrapper.IsGraphInput(org_input_0_name), false));
|
||||
}
|
||||
} else {
|
||||
if (qnn_model_wrapper.IsQnnTensorWrapperExist(input_0_name)) {
|
||||
LOGS(logger, VERBOSE) << "Tensor already added, skip it: " << input_0_name;
|
||||
} else {
|
||||
QnnTensorWrapper input_0_tensor;
|
||||
ORT_RETURN_IF_ERROR(qnn_model_wrapper.MakeTensorWrapper(inputs[0], input_0_tensor));
|
||||
ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(input_0_tensor)), "Failed to add tensor.");
|
||||
}
|
||||
if (use_fully_connected) {
|
||||
return ProcessInputsForQnnFullyConnected(qnn_model_wrapper,
|
||||
node_unit,
|
||||
input_info_0,
|
||||
input_info_1,
|
||||
logger,
|
||||
input_names,
|
||||
do_op_validation);
|
||||
}
|
||||
input_names.emplace_back(input_0_name);
|
||||
return ProcessInputsForQnnMatMul(qnn_model_wrapper,
|
||||
node_unit,
|
||||
input_info_0,
|
||||
input_info_1,
|
||||
logger,
|
||||
input_names,
|
||||
do_op_validation);
|
||||
}
|
||||
|
||||
Status MatMulOpBuilder::ProcessInputsForQnnMatMul(QnnModelWrapper& qnn_model_wrapper,
|
||||
const NodeUnit& node_unit,
|
||||
const TensorInfo& input_info_0,
|
||||
const TensorInfo& input_info_1,
|
||||
const logging::Logger& logger,
|
||||
std::vector<std::string>& input_names,
|
||||
bool do_op_validation) const {
|
||||
const auto& inputs = node_unit.Inputs();
|
||||
const bool reshape_input_1 = input_info_1.shape.size() == 1;
|
||||
|
||||
const std::string& org_input_0_name = inputs[0].node_arg.Name();
|
||||
ORT_RETURN_IF_ERROR(ProcessInput0(qnn_model_wrapper, input_info_0, org_input_0_name, input_names,
|
||||
logger, do_op_validation));
|
||||
|
||||
// Process input 1.
|
||||
const std::string& org_input_1_name = inputs[1].node_arg.Name();
|
||||
std::string input_1_name = org_input_1_name;
|
||||
if (reshape_input_1 || use_fully_connected) {
|
||||
if (reshape_input_1) {
|
||||
// Input[1] is a rank 1 tensor that needs to be reshaped.
|
||||
std::vector<uint32_t> shape_2d;
|
||||
QnnQuantParamsWrapper quant_param_2d = input_info_1.quant_param.Copy();
|
||||
if (reshape_input_1) {
|
||||
// Input is 1D tensor.
|
||||
input_1_name = org_input_1_name + "_ort_qnn_ep_reshape";
|
||||
if (use_fully_connected) {
|
||||
// FullyConnected requires input_1's shape to be [n, k].
|
||||
shape_2d = {1, input_info_1.shape[0]};
|
||||
} else {
|
||||
shape_2d = {input_info_1.shape[0], 1};
|
||||
}
|
||||
ORT_RETURN_IF_ERROR(quant_param_2d.HandleUnsqueeze<uint32_t>(input_info_1.shape, shape_2d));
|
||||
} else {
|
||||
input_1_name = org_input_1_name + "_ort_qnn_ep_transpose";
|
||||
shape_2d = {input_info_1.shape[1], input_info_1.shape[0]};
|
||||
ORT_RETURN_IF_ERROR(quant_param_2d.HandleTranspose<uint32_t>(std::vector<uint32_t>({1, 0})));
|
||||
}
|
||||
input_1_name = org_input_1_name + "_ort_qnn_ep_reshape";
|
||||
shape_2d = {input_info_1.shape[0], 1};
|
||||
ORT_RETURN_IF_ERROR(quant_param_2d.HandleUnsqueeze<uint32_t>(input_info_1.shape, shape_2d));
|
||||
|
||||
// If input_1 is initializer, unpack it and add the tensor with new quantization parameter and shape.
|
||||
// Otherwise, add a Reshape node.
|
||||
if (input_info_1.is_initializer) {
|
||||
std::vector<uint8_t> unpacked_tensor;
|
||||
if (use_fully_connected && !reshape_input_1) {
|
||||
// 2D initializer should be transposed to [n, k].
|
||||
ORT_RETURN_IF_ERROR(TwoDimensionTranspose(qnn_model_wrapper, input_info_1.shape,
|
||||
*input_info_1.initializer_tensor, unpacked_tensor));
|
||||
} else {
|
||||
ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_info_1.initializer_tensor, unpacked_tensor));
|
||||
}
|
||||
ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_info_1.initializer_tensor, unpacked_tensor));
|
||||
|
||||
Qnn_TensorType_t tensor_type = qnn_model_wrapper.GetTensorType(org_input_1_name);
|
||||
QnnTensorWrapper input_tensorwrapper(input_1_name, tensor_type, input_info_1.qnn_data_type,
|
||||
|
|
@ -156,6 +255,108 @@ Status MatMulOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, const
|
|||
}
|
||||
input_names.emplace_back(input_1_name);
|
||||
|
||||
// Workaround that inserts a QNN Convert op before input[1] (converts from quantized uint16 to quantized uint8)
|
||||
// to avoid a QNN validation failure.
|
||||
//
|
||||
// QNN graph WITHOUT workaround (fails validation):
|
||||
// input_0_uint16 ---> MatMul ---> output_uint16
|
||||
// ^
|
||||
// |
|
||||
// input_1_uint16 -----+
|
||||
//
|
||||
// QNN graph WITH workaround (passes validation):
|
||||
// input_0_uint16 ----------------------> MatMul ---> output_uint16
|
||||
// ^
|
||||
// |
|
||||
// input_1_uint16 --> Convert(to uint8) --+
|
||||
if (!input_info_0.is_initializer && !input_info_1.is_initializer &&
|
||||
input_info_0.qnn_data_type == input_info_1.qnn_data_type &&
|
||||
input_info_0.qnn_data_type == QNN_DATATYPE_UFIXED_POINT_16) {
|
||||
ORT_RETURN_IF_NOT(input_info_1.quant_param.IsPerTensor(),
|
||||
"MatMul's activation inputs only support per-tensor quantization");
|
||||
const Qnn_QuantizeParams_t& quant_param = input_info_1.quant_param.Get();
|
||||
// insert Convert op after input1
|
||||
std::string convert_input_name = input_names.back();
|
||||
input_names.pop_back();
|
||||
const std::string& matmul_output_name = node_unit.Outputs()[0].node_arg.Name();
|
||||
std::string convert_output_name = convert_input_name + "_convert_" + matmul_output_name;
|
||||
std::vector<uint32_t> input_1_shape = input_info_1.shape;
|
||||
if (reshape_input_1) {
|
||||
input_1_shape = {input_info_1.shape[0], 1};
|
||||
}
|
||||
ORT_RETURN_IF_ERROR(InsertConvertOp(qnn_model_wrapper,
|
||||
convert_input_name,
|
||||
convert_output_name,
|
||||
input_info_1.qnn_data_type,
|
||||
QNN_DATATYPE_UFIXED_POINT_8,
|
||||
quant_param.scaleOffsetEncoding.offset,
|
||||
quant_param.scaleOffsetEncoding.scale,
|
||||
input_1_shape,
|
||||
do_op_validation));
|
||||
input_names.push_back(convert_output_name);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Processes the inputs of an ONNX MatMul that is being translated to a QNN FullyConnected op.
//
// input[0] is handled by ProcessInput0. input[1] must be given to FullyConnected with shape [n, k]:
//   - rank 1 input[1] of shape [k]  -> reshaped to [1, k] ("<name>_ort_qnn_ep_reshape").
//   - rank 2 initializer of [k, n]  -> data transposed to [n, k] ("<name>_ort_qnn_ep_transpose").
// Any other input[1] is rejected with an error Status (it cannot be expressed as FullyConnected here).
//
// The names of the tensors the QNN op should consume are appended to input_names (input 0, then input 1).
Status MatMulOpBuilder::ProcessInputsForQnnFullyConnected(QnnModelWrapper& qnn_model_wrapper,
                                                          const NodeUnit& node_unit,
                                                          const TensorInfo& input_info_0,
                                                          const TensorInfo& input_info_1,
                                                          const logging::Logger& logger,
                                                          std::vector<std::string>& input_names,
                                                          bool do_op_validation) const {
  const auto& inputs = node_unit.Inputs();
  const bool reshape_input_1 = input_info_1.shape.size() == 1;

  const std::string& org_input_0_name = inputs[0].node_arg.Name();
  ORT_RETURN_IF_ERROR(ProcessInput0(qnn_model_wrapper, input_info_0, org_input_0_name, input_names,
                                    logger, do_op_validation));

  // Process input 1.
  const std::string& org_input_1_name = inputs[1].node_arg.Name();
  std::string input_1_name = org_input_1_name;
  std::vector<uint32_t> shape_2d;
  QnnQuantParamsWrapper quant_param_2d = input_info_1.quant_param.Copy();
  if (reshape_input_1) {
    // Input[1] is a rank 1 tensor that needs to be reshaped.
    input_1_name = org_input_1_name + "_ort_qnn_ep_reshape";

    // FullyConnected requires input_1's shape to be [n, k].
    shape_2d = {1, input_info_1.shape[0]};
    ORT_RETURN_IF_ERROR(quant_param_2d.HandleUnsqueeze<uint32_t>(input_info_1.shape, shape_2d));
  } else {
    // Enforce the precondition at runtime instead of with assert(): assert is compiled out in
    // release builds, after which shape[1] would be read without a rank check. A dynamic rank 2
    // input is also rejected because only initializer data is transposed below; a Reshape node
    // cannot stand in for a transpose.
    ORT_RETURN_IF_NOT(input_info_1.shape.size() == 2 && input_info_1.is_initializer,
                      "QNN FullyConnected requires MatMul input[1] to be a rank 1 tensor or a rank 2 initializer.");
    input_1_name = org_input_1_name + "_ort_qnn_ep_transpose";
    shape_2d = {input_info_1.shape[1], input_info_1.shape[0]};
    ORT_RETURN_IF_ERROR(quant_param_2d.HandleTranspose<uint32_t>(std::vector<uint32_t>({1, 0})));
  }

  // If input_1 is initializer, unpack it and add the tensor with new quantization parameter and shape.
  // Otherwise (rank 1 dynamic input), add a Reshape node.
  if (input_info_1.is_initializer) {
    std::vector<uint8_t> unpacked_tensor;
    if (!reshape_input_1) {
      // 2D initializer should be transposed to [n, k].
      std::vector<uint32_t> original_shape_copy = input_info_1.shape;
      ORT_RETURN_IF_ERROR(TwoDimensionTranspose(qnn_model_wrapper,
                                                original_shape_copy,  // Will be modified to new shape (unnecessary)
                                                *input_info_1.initializer_tensor,
                                                unpacked_tensor));
    } else {
      ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_info_1.initializer_tensor, unpacked_tensor));
    }

    Qnn_TensorType_t tensor_type = qnn_model_wrapper.GetTensorType(org_input_1_name);
    QnnTensorWrapper input_tensorwrapper(input_1_name, tensor_type, input_info_1.qnn_data_type,
                                         std::move(quant_param_2d), std::move(shape_2d), std::move(unpacked_tensor));
    ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(input_tensorwrapper)), "Failed to add tensor.");
  } else {
    ORT_RETURN_IF_ERROR(qnn_model_wrapper.AddReshapeNode(org_input_1_name, input_1_name, input_info_1.shape, shape_2d,
                                                         input_info_1.qnn_data_type, input_info_1.quant_param,
                                                         quant_param_2d, do_op_validation,
                                                         qnn_model_wrapper.IsGraphInput(org_input_1_name), false));
  }
  input_names.emplace_back(input_1_name);
  return Status::OK();
}
|
||||
|
||||
|
|
@ -172,6 +373,24 @@ Status MatMulOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_w
|
|||
bool reshape_input_1 = input_info_1.shape.size() == 1;
|
||||
bool reshape_output = reshape_input_0 || reshape_input_1 || (use_fully_connected && input_info_0.shape.size() > 2);
|
||||
|
||||
// For QNN MatMul: set the input transpose parameters to their default values of 0. These parameters should be
|
||||
// optional, but older versions of QNN SDK failed validation if not explicitly provided.
|
||||
std::vector<std::string> param_tensor_names;
|
||||
if (!use_fully_connected) {
|
||||
Qnn_Scalar_t scalar_param = QNN_SCALAR_INIT;
|
||||
scalar_param.dataType = QNN_DATATYPE_BOOL_8;
|
||||
scalar_param.bool8Value = 0;
|
||||
QnnParamWrapper transpose_in0_param(node_unit.Index(), node_unit.Name(), QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0,
|
||||
scalar_param);
|
||||
param_tensor_names.push_back(transpose_in0_param.GetParamTensorName());
|
||||
qnn_model_wrapper.AddParamWrapper(std::move(transpose_in0_param));
|
||||
|
||||
QnnParamWrapper transpose_in1_param(node_unit.Index(), node_unit.Name(), QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1,
|
||||
scalar_param);
|
||||
param_tensor_names.push_back(transpose_in1_param.GetParamTensorName());
|
||||
qnn_model_wrapper.AddParamWrapper(std::move(transpose_in1_param));
|
||||
}
|
||||
|
||||
const std::string& org_output_name = node_unit.Outputs()[0].node_arg.Name();
|
||||
std::string op_output_name = org_output_name;
|
||||
TensorInfo output_info{};
|
||||
|
|
@ -207,7 +426,8 @@ Status MatMulOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_w
|
|||
"Failed to add output tensor.");
|
||||
ORT_RETURN_IF_NOT(qnn_model_wrapper.CreateQnnNode(utils::GetNodeName(node_unit), QNN_OP_PACKAGE_NAME_QTI_AISW,
|
||||
use_fully_connected ? QNN_OP_FULLY_CONNECTED : QNN_OP_MAT_MUL,
|
||||
std::move(input_names), {op_output_name}, {}, do_op_validation),
|
||||
std::move(input_names), {op_output_name},
|
||||
std::move(param_tensor_names), do_op_validation),
|
||||
"Failed to add fused Matmul node.");
|
||||
|
||||
if (reshape_output) {
|
||||
|
|
|
|||
|
|
@ -22,11 +22,6 @@ class SimpleOpBuilder : public BaseOpBuilder {
|
|||
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(SimpleOpBuilder);
|
||||
|
||||
protected:
|
||||
Status ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
|
||||
const NodeUnit& node_unit,
|
||||
const logging::Logger& logger,
|
||||
std::vector<std::string>& input_names,
|
||||
bool do_op_validation) const override ORT_MUST_USE_RESULT;
|
||||
Status ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper,
|
||||
const NodeUnit& node_unit,
|
||||
std::vector<std::string>&& input_names,
|
||||
|
|
@ -53,91 +48,6 @@ class SimpleOpBuilder : public BaseOpBuilder {
|
|||
static constexpr std::array<std::string_view, 3> gridsample_supported_padding_modes = {"zeros", "border", "reflection"};
|
||||
};
|
||||
|
||||
// Move to qnn_utils if it's re-usable
|
||||
Status InsertConvertOp(QnnModelWrapper& qnn_model_wrapper,
|
||||
const std::string& convert_input_name,
|
||||
const std::string& convert_output_name,
|
||||
Qnn_DataType_t input_qnn_data_type,
|
||||
Qnn_DataType_t output_qnn_data_type,
|
||||
int32_t input_offset,
|
||||
float input_scale,
|
||||
const std::vector<uint32_t>& output_shape,
|
||||
bool do_op_validation) {
|
||||
// Assume input is already handled.
|
||||
float qmin = 0.0f;
|
||||
float qmax = 255.0f;
|
||||
ORT_RETURN_IF_ERROR(qnn::utils::GetQminQmax(input_qnn_data_type, qmin, qmax));
|
||||
double value_min = qnn::utils::Dequantize(input_offset, input_scale, qmin);
|
||||
double value_max = qnn::utils::Dequantize(input_offset, input_scale, qmax);
|
||||
float scale = 0.0f;
|
||||
int32_t offset = 0;
|
||||
ORT_RETURN_IF_ERROR(qnn::utils::GetQuantParams(static_cast<float>(value_min),
|
||||
static_cast<float>(value_max),
|
||||
output_qnn_data_type,
|
||||
scale,
|
||||
offset));
|
||||
|
||||
std::vector<uint32_t> output_shape_copy = output_shape;
|
||||
QnnTensorWrapper convert_output_tensorwrapper(convert_output_name,
|
||||
QNN_TENSOR_TYPE_NATIVE,
|
||||
output_qnn_data_type,
|
||||
QnnQuantParamsWrapper(scale, offset),
|
||||
std::move(output_shape_copy));
|
||||
ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(convert_output_tensorwrapper)), "Failed to add tensor.");
|
||||
|
||||
ORT_RETURN_IF_NOT(qnn_model_wrapper.CreateQnnNode(convert_output_name,
|
||||
QNN_OP_PACKAGE_NAME_QTI_AISW,
|
||||
"Convert",
|
||||
{convert_input_name},
|
||||
{convert_output_name},
|
||||
{},
|
||||
do_op_validation),
|
||||
"Failed to add node.");
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Processes the node's inputs via BaseOpBuilder, then applies a MatMul-specific workaround:
// when both MatMul inputs are dynamic (non-initializer) uint16 quantized tensors, a QNN Convert
// op (uint16 -> uint8) is inserted after input[1], and the last entry of input_names is replaced
// with the Convert output.
Status SimpleOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
                                      const NodeUnit& node_unit,
                                      const logging::Logger& logger,
                                      std::vector<std::string>& input_names,
                                      bool do_op_validation) const {
  ORT_RETURN_IF_ERROR(BaseOpBuilder::ProcessInputs(qnn_model_wrapper, node_unit, logger, input_names, do_op_validation));

  // Only MatMul needs extra handling.
  if (node_unit.OpType() != "MatMul") {
    return Status::OK();
  }

  const auto& matmul_inputs = node_unit.Inputs();
  TensorInfo lhs_info = {};
  TensorInfo rhs_info = {};
  ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(matmul_inputs[0], lhs_info));
  ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(matmul_inputs[1], rhs_info));

  // Need to insert Convert op only if both inputs are dynamic inputs and are ufixed_16.
  const bool both_dynamic = !lhs_info.is_initializer && !rhs_info.is_initializer;
  const bool both_ufixed_16 = lhs_info.qnn_data_type == rhs_info.qnn_data_type &&
                              lhs_info.qnn_data_type == QNN_DATATYPE_UFIXED_POINT_16;
  if (!both_dynamic || !both_ufixed_16) {
    return Status::OK();
  }

  ORT_RETURN_IF_NOT(rhs_info.quant_param.IsPerTensor(),
                    "MatMul's activation inputs only support per-tensor quantization");
  const Qnn_QuantizeParams_t& rhs_quant = rhs_info.quant_param.Get();

  // Take input[1]'s current name off the list; it becomes the Convert op's input.
  std::string convert_input_name = input_names.back();
  input_names.pop_back();

  const std::string& matmul_output_name = node_unit.Outputs()[0].node_arg.Name();
  const std::string convert_output_name = convert_input_name + "_convert_" + matmul_output_name;
  ORT_RETURN_IF_ERROR(InsertConvertOp(qnn_model_wrapper,
                                      convert_input_name,
                                      convert_output_name,
                                      rhs_info.qnn_data_type,
                                      QNN_DATATYPE_UFIXED_POINT_8,
                                      rhs_quant.scaleOffsetEncoding.offset,
                                      rhs_quant.scaleOffsetEncoding.scale,
                                      rhs_info.shape,
                                      do_op_validation));

  // The MatMul now consumes the converted (uint8) tensor as input[1].
  input_names.push_back(convert_output_name);
  return Status::OK();
}
|
||||
|
||||
Status SimpleOpBuilder::ExplicitOpCheck(QnnModelWrapper& qnn_model_wrapper,
|
||||
const NodeUnit& node_unit) const {
|
||||
const std::string& op_type = node_unit.OpType();
|
||||
|
|
@ -378,19 +288,6 @@ Status SimpleOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_w
|
|||
ORT_RETURN_IF(norm_p_order != 2, "QNN EP only supports LpNormalization with 'p' attribute equal to 2.");
|
||||
}
|
||||
|
||||
if (op_type == "MatMul") {
|
||||
Qnn_Scalar_t scalar_param = QNN_SCALAR_INIT;
|
||||
scalar_param.dataType = QNN_DATATYPE_BOOL_8;
|
||||
scalar_param.bool8Value = 0;
|
||||
QnnParamWrapper transpose_in0_param(node_unit.Index(), node_unit.Name(), QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0, scalar_param);
|
||||
param_tensor_names.push_back(transpose_in0_param.GetParamTensorName());
|
||||
qnn_model_wrapper.AddParamWrapper(std::move(transpose_in0_param));
|
||||
|
||||
QnnParamWrapper transpose_in1_param(node_unit.Index(), node_unit.Name(), QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, scalar_param);
|
||||
param_tensor_names.push_back(transpose_in1_param.GetParamTensorName());
|
||||
qnn_model_wrapper.AddParamWrapper(std::move(transpose_in1_param));
|
||||
}
|
||||
|
||||
if (op_type == "LeakyRelu") {
|
||||
std::string input_name = "alpha";
|
||||
ORT_RETURN_IF_ERROR(ProcessAlphaAttributeAsInput(qnn_model_wrapper, node_unit, input_name));
|
||||
|
|
|
|||
|
|
@ -75,6 +75,20 @@ Status QnnModelWrapper::MakeTensorWrapper(const NodeUnitIODef& tensor, QnnTensor
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
// Builds a QnnTensorWrapper named tensor_name from already-retrieved tensor info.
// The wrapper gets the tensor type reported by GetTensorType(tensor_name), plus copies of the
// quantization parameters and shape from tensor_info. For initializers, the unpacked raw data
// is attached as well; otherwise the data buffer is left empty.
// Returns an error Status if the initializer data cannot be unpacked.
Status QnnModelWrapper::MakeTensorWrapper(const TensorInfo& tensor_info,
                                          const std::string& tensor_name,
                                          QnnTensorWrapper& tensor_wrapper) const {
  std::vector<uint8_t> initializer_bytes;  // stays empty for non-initializer tensors
  if (tensor_info.is_initializer) {
    ORT_RETURN_IF_ERROR(UnpackInitializerData(*tensor_info.initializer_tensor, initializer_bytes));
  }

  const Qnn_TensorType_t qnn_tensor_type = GetTensorType(tensor_name);
  std::vector<uint32_t> shape_copy(tensor_info.shape);

  tensor_wrapper = QnnTensorWrapper(tensor_name, qnn_tensor_type, tensor_info.qnn_data_type,
                                    tensor_info.quant_param.Copy(), std::move(shape_copy),
                                    std::move(initializer_bytes));
  return Status::OK();
}
|
||||
|
||||
bool QnnModelWrapper::AddTensorWrapper(QnnTensorWrapper&& tensor_wrapper) {
|
||||
// Keep a copy of tensor name since it will be moved with the wrapper into model_tensors_map_
|
||||
std::string tensor_name = tensor_wrapper.GetName();
|
||||
|
|
|
|||
|
|
@ -66,6 +66,9 @@ class QnnModelWrapper {
|
|||
|
||||
// Make a QnnTensorWrapper from an onnx input or output.
|
||||
Status MakeTensorWrapper(const NodeUnitIODef& tensor, QnnTensorWrapper& tensor_wrapper) const;
|
||||
Status MakeTensorWrapper(const TensorInfo& tensor_info,
|
||||
const std::string& tensor_name,
|
||||
QnnTensorWrapper& tensor_wrapper) const;
|
||||
|
||||
// Add to internal tensor wrapper table
|
||||
bool AddTensorWrapper(QnnTensorWrapper&& tensor_wrapper);
|
||||
|
|
|
|||
|
|
@ -290,10 +290,65 @@ TEST_F(QnnHTPBackendTests, MatMulOp_QDQ) {
|
|||
RunQDQPerChannelMatMulOpTest<uint16_t, Int4x2, uint16_t>({2, 3, 3, 3}, {3, 2}, -1, QDQTolerance(),
|
||||
ExpectedEPNodeAssignment::All, 18, true);
|
||||
|
||||
// // UINT16, per-channel INT8 weight
|
||||
// UINT16, per-channel INT8 weight
|
||||
RunQDQPerChannelMatMulOpTest<uint16_t, int8_t, uint16_t>({2, 3}, {3, 2}, 1, QDQTolerance(),
|
||||
ExpectedEPNodeAssignment::All, 21, false, false);
|
||||
RunQDQPerChannelMatMulOpTest<uint16_t, int8_t, uint16_t>({2, 3, 3}, {3}, -1);
|
||||
RunQDQPerChannelMatMulOpTest<uint16_t, int8_t, uint16_t>({2, 3, 3}, {3}, -1, QDQTolerance(0.0041f));
|
||||
}
|
||||
|
||||
// Tests MatMul with two uint16 (quantized) inputs that are both dynamic.
|
||||
// This exercises a workaround in QNN EP that inserts a QNN Convert op before input[1] (converts from uint16 to uint8).
|
||||
// This workaround prevents a validation error for this specific MatMul configuration.
|
||||
// Got specific shapes and input ranges (quant params) from customer model.
|
||||
TEST_F(QnnHTPBackendTests, MatMulOp_QDQ_Regression_uint16_dynamic_inputs) {
|
||||
ProviderOptions provider_options;
|
||||
#if defined(_WIN32)
|
||||
provider_options["backend_path"] = "QnnHtp.dll";
|
||||
#else
|
||||
provider_options["backend_path"] = "libQnnHtp.so";
|
||||
#endif
|
||||
|
||||
// Test with rank 4 inputs
|
||||
{
|
||||
std::vector<int64_t> shape_0 = {1, 12, 512, 96};
|
||||
TestInputDef<float> input0_def(
|
||||
{1, 12, 512, 96}, false,
|
||||
GetFloatDataInRange(-5.087f, 4.992f,
|
||||
static_cast<size_t>(std::accumulate(shape_0.begin(), shape_0.end(), static_cast<int64_t>(1),
|
||||
std::multiplies<int64_t>()))));
|
||||
std::vector<int64_t> shape_1 = {1, 12, 96, 512};
|
||||
TestInputDef<float> input1_def(
|
||||
shape_1, false,
|
||||
GetFloatDataInRange(-6.772f, 7.258f,
|
||||
static_cast<size_t>(std::accumulate(shape_1.begin(), shape_1.end(), static_cast<int64_t>(1),
|
||||
std::multiplies<int64_t>()))));
|
||||
|
||||
TestQDQModelAccuracy(
|
||||
BuildMatMulOpTestCase(input0_def, input1_def),
|
||||
BuildMatMulOpQDQTestCase<uint16_t, uint16_t, uint16_t>(input0_def, input1_def, false),
|
||||
provider_options, 21, ExpectedEPNodeAssignment::All, QDQTolerance());
|
||||
}
|
||||
|
||||
// Test with input[1] as rank 1
|
||||
{
|
||||
std::vector<int64_t> shape_0 = {1, 12, 512, 96};
|
||||
TestInputDef<float> input0_def(
|
||||
{1, 12, 512, 96}, false,
|
||||
GetFloatDataInRange(-5.087f, 4.992f,
|
||||
static_cast<size_t>(std::accumulate(shape_0.begin(), shape_0.end(), static_cast<int64_t>(1),
|
||||
std::multiplies<int64_t>()))));
|
||||
std::vector<int64_t> shape_1 = {96};
|
||||
TestInputDef<float> input1_def(
|
||||
shape_1, false,
|
||||
GetFloatDataInRange(-6.772f, 7.258f,
|
||||
static_cast<size_t>(std::accumulate(shape_1.begin(), shape_1.end(), static_cast<int64_t>(1),
|
||||
std::multiplies<int64_t>()))));
|
||||
|
||||
TestQDQModelAccuracy(
|
||||
BuildMatMulOpTestCase(input0_def, input1_def),
|
||||
BuildMatMulOpQDQTestCase<uint16_t, uint16_t, uint16_t>(input0_def, input1_def, false),
|
||||
provider_options, 21, ExpectedEPNodeAssignment::All, QDQTolerance());
|
||||
}
|
||||
}
|
||||
|
||||
#endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
|
||||
|
|
|
|||
Loading…
Reference in a new issue