mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-16 21:00:14 +00:00
[QNN EP] Support Qnn MatMul with 2 dynamic inputs which are uint16 quantized (#18469)
### Description QNN can't run MatMul on v68 if both inputs are dynamic and uint16-quantized. Make it run by inserting a Convert op to requantize one input to uint8.
This commit is contained in:
parent
e7a524fea9
commit
6a4e4488da
3 changed files with 125 additions and 8 deletions
|
|
@ -443,7 +443,6 @@ bool InstanceAndLayerNormalizationNodeGroupSelector::Check(const GraphViewer& gr
|
|||
}
|
||||
|
||||
int32_t dt_input = dq_nodes[0]->InputDefs()[0]->TypeAsProto()->tensor_type().elem_type();
|
||||
int32_t dt_scale = dq_nodes[1]->InputDefs()[0]->TypeAsProto()->tensor_type().elem_type();
|
||||
int32_t dt_bias = 0;
|
||||
bool has_bias = false;
|
||||
// bias is optional for LayerNorm
|
||||
|
|
@ -453,9 +452,9 @@ bool InstanceAndLayerNormalizationNodeGroupSelector::Check(const GraphViewer& gr
|
|||
}
|
||||
int32_t dt_output = q_nodes[0]->OutputDefs()[0]->TypeAsProto()->tensor_type().elem_type();
|
||||
|
||||
// Input, output, and scale need to be the same type. The bias is int32.
|
||||
// Input, output, need to be the same type. The bias is int32.
|
||||
// Scale can be different with input for a16w8 case
|
||||
return (dt_input == dt_output) &&
|
||||
(dt_input == dt_scale) &&
|
||||
(has_bias ? dt_bias == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT32 : true);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -22,6 +22,11 @@ class SimpleOpBuilder : public BaseOpBuilder {
|
|||
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(SimpleOpBuilder);
|
||||
|
||||
protected:
|
||||
Status ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
|
||||
const NodeUnit& node_unit,
|
||||
const logging::Logger& logger,
|
||||
std::vector<std::string>& input_names,
|
||||
bool do_op_validation) const override ORT_MUST_USE_RESULT;
|
||||
Status ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper,
|
||||
const NodeUnit& node_unit,
|
||||
std::vector<std::string>&& input_names,
|
||||
|
|
@ -48,6 +53,90 @@ class SimpleOpBuilder : public BaseOpBuilder {
|
|||
static constexpr std::array<std::string_view, 3> gridsample_supported_padding_modes = {"zeros", "border", "reflection"};
|
||||
};
|
||||
|
||||
// Move to qnn_utils if it's re-usable
|
||||
Status InsertConvertOp(QnnModelWrapper& qnn_model_wrapper,
|
||||
const std::string& convert_input_name,
|
||||
const std::string& convert_output_name,
|
||||
Qnn_DataType_t input_qnn_data_type,
|
||||
Qnn_DataType_t output_qnn_data_type,
|
||||
int32_t input_offset,
|
||||
float input_scale,
|
||||
const std::vector<uint32_t>& output_shape,
|
||||
bool do_op_validation) {
|
||||
// Assume input is already handled.
|
||||
float qmin = 0.0f;
|
||||
float qmax = 255.0f;
|
||||
ORT_RETURN_IF_ERROR(qnn::utils::GetQminQmax(input_qnn_data_type, qmin, qmax));
|
||||
double value_min = qnn::utils::Dequantize(input_offset, input_scale, qmin);
|
||||
double value_max = qnn::utils::Dequantize(input_offset, input_scale, qmax);
|
||||
|
||||
Qnn_QuantizeParams_t convert_output_quant_param = QNN_QUANTIZE_PARAMS_INIT;
|
||||
convert_output_quant_param.encodingDefinition = QNN_DEFINITION_DEFINED;
|
||||
convert_output_quant_param.quantizationEncoding = QNN_QUANTIZATION_ENCODING_SCALE_OFFSET;
|
||||
ORT_RETURN_IF_ERROR(qnn::utils::GetQuantParams(static_cast<float>(value_min),
|
||||
static_cast<float>(value_max),
|
||||
output_qnn_data_type,
|
||||
convert_output_quant_param.scaleOffsetEncoding.scale,
|
||||
convert_output_quant_param.scaleOffsetEncoding.offset));
|
||||
|
||||
std::vector<uint32_t> output_shape_copy = output_shape;
|
||||
QnnTensorWrapper convert_output_tensorwrapper(convert_output_name,
|
||||
QNN_TENSOR_TYPE_NATIVE,
|
||||
output_qnn_data_type,
|
||||
convert_output_quant_param,
|
||||
std::move(output_shape_copy));
|
||||
ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(convert_output_tensorwrapper)), "Failed to add tensor.");
|
||||
|
||||
ORT_RETURN_IF_NOT(qnn_model_wrapper.CreateQnnNode(convert_output_name,
|
||||
QNN_OP_PACKAGE_NAME_QTI_AISW,
|
||||
"Convert",
|
||||
{convert_input_name},
|
||||
{convert_output_name},
|
||||
{},
|
||||
do_op_validation),
|
||||
"Failed to add node.");
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status SimpleOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
|
||||
const NodeUnit& node_unit,
|
||||
const logging::Logger& logger,
|
||||
std::vector<std::string>& input_names,
|
||||
bool do_op_validation) const {
|
||||
const std::string& op_type = node_unit.OpType();
|
||||
ORT_RETURN_IF_ERROR(BaseOpBuilder::ProcessInputs(qnn_model_wrapper, node_unit, logger, input_names, do_op_validation));
|
||||
|
||||
if (op_type == "MatMul") {
|
||||
const auto& inputs = node_unit.Inputs();
|
||||
TensorInfo input0_info = {};
|
||||
TensorInfo input1_info = {};
|
||||
ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[0], input0_info));
|
||||
ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[1], input1_info));
|
||||
// Need to insert Convert op if both inputs are dynamic inputs and are ufixed_16
|
||||
if (!input0_info.is_initializer && !input1_info.is_initializer &&
|
||||
input0_info.qnn_data_type == input1_info.qnn_data_type &&
|
||||
input0_info.qnn_data_type == QNN_DATATYPE_UFIXED_POINT_16) {
|
||||
// insert Convert op after input1
|
||||
std::string convert_input_name = input_names.back();
|
||||
input_names.pop_back();
|
||||
const std::string& matmul_output_name = node_unit.Outputs()[0].node_arg.Name();
|
||||
std::string convert_output_name = convert_input_name + "_convert_" + matmul_output_name;
|
||||
ORT_RETURN_IF_ERROR(InsertConvertOp(qnn_model_wrapper,
|
||||
convert_input_name,
|
||||
convert_output_name,
|
||||
input1_info.qnn_data_type,
|
||||
QNN_DATATYPE_UFIXED_POINT_8,
|
||||
input1_info.quant_param.scaleOffsetEncoding.offset,
|
||||
input1_info.quant_param.scaleOffsetEncoding.scale,
|
||||
input1_info.shape,
|
||||
do_op_validation));
|
||||
input_names.push_back(convert_output_name);
|
||||
}
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status SimpleOpBuilder::ExplicitOpCheck(const NodeUnit& node_unit) const {
|
||||
const std::string& op_type = node_unit.OpType();
|
||||
|
||||
|
|
|
|||
|
|
@ -142,11 +142,6 @@ TEST_F(QnnHTPBackendTests, MatMulOp_HTP_u8) {
|
|||
}
|
||||
|
||||
// Test QDQ MatMul with 16-bit act, 8-bit weights (static)
|
||||
// TODO: (SLIGHT) Inaccuracy detected for output 'output', element 0.
|
||||
// Output quant params: scale=0.0015259021893143654, zero_point=0.
|
||||
// Expected val: 98
|
||||
// QNN QDQ val: 97.720298767089844 (err 0.27970123291015625)
|
||||
// CPU QDQ val: 97.726402282714844 (err 0.27359771728515625)
|
||||
TEST_F(QnnHTPBackendTests, MatMulOp_HTP_A16_W8Static) {
|
||||
std::vector<float> input0_data = {-10.0f, -4.0f, -2.0f, 0.0f, 5.0f, 10.0f};
|
||||
std::vector<float> input1_data = {-10.0f, -6.0f, -1.0f, 0.0f, 3.0f, 10.0f};
|
||||
|
|
@ -158,6 +153,40 @@ TEST_F(QnnHTPBackendTests, MatMulOp_HTP_A16_W8Static) {
|
|||
7e-3f);
|
||||
}
|
||||
|
||||
// Test QDQ MatMul with uint16 activation uint16 weights, both dynamic
|
||||
// Inaccuracy detected for output 'output_0', element 1.
|
||||
// Output quant params: scale=0.0015259021893143654, zero_point=0.
|
||||
// Expected val: 40
|
||||
// QNN QDQ val: 39.681087493896484 (err 0.31891250610351562)
|
||||
// CPU QDQ val: 39.99847412109375 (err 0.00152587890625)
|
||||
TEST_F(QnnHTPBackendTests, DISABLED_MatMulOp_HTP_A16_W16Dynamic) {
|
||||
std::vector<float> input0_data = {-10.0f, -4.0f, -2.0f, 0.0f, 5.0f, 10.0f};
|
||||
std::vector<float> input1_data = {-10.0f, -6.0f, -1.0f, 0.0f, 3.0f, 10.0f};
|
||||
RunQDQMatMulOpOpTest<uint16_t, uint16_t, uint16_t>(TestInputDef<float>({2, 3}, false, input0_data),
|
||||
TestInputDef<float>({3, 2}, false, input1_data),
|
||||
ExpectedEPNodeAssignment::All,
|
||||
18,
|
||||
true, // Use com.microsoft Q/DQ ops
|
||||
7e-3f);
|
||||
}
|
||||
|
||||
// Test QDQ MatMul with uint16 activation uint16 weights, both dynamic
|
||||
// Inaccuracy detected for output 'output_0', element 1.
|
||||
// Output quant params: scale=0.71908456087112427, zero_point=1.
|
||||
// Expected val: 46848.41015625
|
||||
// QNN QDQ val: 46844.04296875 (err 4.3671875)
|
||||
// CPU QDQ val: 46848.359375 (err 0.05078125)
|
||||
TEST_F(QnnHTPBackendTests, DISABLED_MatMulOp_HTP_A16_W16DynamicLarge) {
|
||||
std::vector<float> input0_data = GetFloatDataInRange(-10.0f, 10.0f, 12 * 96 * 512);
|
||||
std::vector<float> input1_data = GetFloatDataInRange(-10.0f, 10.0f, 12 * 96 * 512);
|
||||
RunQDQMatMulOpOpTest<uint16_t, uint16_t, uint16_t>(TestInputDef<float>({1, 12, 96, 512}, false, input0_data),
|
||||
TestInputDef<float>({1, 12, 512, 96}, false, input1_data),
|
||||
ExpectedEPNodeAssignment::All,
|
||||
18,
|
||||
true, // Use com.microsoft Q/DQ ops
|
||||
7e-3f);
|
||||
}
|
||||
|
||||
// Test 16-bit QDQ MatMul with static weights
|
||||
// TODO: Inaccuracy detected for output 'output', element 0.
|
||||
// Output quant params: scale=0.0015259021893143654, zero_point=0.
|
||||
|
|
|
|||
Loading…
Reference in a new issue