diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc index bac08f1993..850fd28758 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc @@ -13,11 +13,12 @@ namespace onnxruntime { namespace qnn { /** - * ONNX's MatMul supports 1D tensor as input on both size, but neither QNN's MatMul nor FullyConnected supports it. - * So we need to add Reshape Ops if necessary. + * An ONNX MatMul can be translated to either a QNN MatMul or a QNN FullyConnected. + * ONNX's MatMul supports inputs of rank 1, but neither QNN's MatMul nor FullyConnected supports two rank 1 inputs. + * So, we need to add Reshape Ops if necessary. * In two cases, FullyConnected (input_1's shape is [n, k]) is used instead of MatMul without extra Transpose Op: - * 1. input_1 is 2D initializer. - * 2. input_1 is 1D tensor. + * 1. input_1 is a rank 2 initializer. + * 2. input_1 is a rank 1 tensor. 
*/ class MatMulOpBuilder : public BaseOpBuilder { public: @@ -31,29 +32,149 @@ class MatMulOpBuilder : public BaseOpBuilder { Status ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, std::vector&& input_names, const logging::Logger& logger, bool do_op_validation) const override ORT_MUST_USE_RESULT; + + private: + Status ProcessInputsForQnnMatMul(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const TensorInfo& input_info_0, + const TensorInfo& input_info_1, + const logging::Logger& logger, + std::vector& input_names, + bool do_op_validation) const ORT_MUST_USE_RESULT; + Status ProcessInputsForQnnFullyConnected(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const TensorInfo& input_info_0, + const TensorInfo& input_info_1, + const logging::Logger& logger, + std::vector& input_names, + bool do_op_validation) const ORT_MUST_USE_RESULT; }; namespace { +// Inserts a QNN Convert operator to convert from one quantization type (e.g., uint16) to another (e.g., uint8). +Status InsertConvertOp(QnnModelWrapper& qnn_model_wrapper, + const std::string& convert_input_name, + const std::string& convert_output_name, + Qnn_DataType_t input_qnn_data_type, + Qnn_DataType_t output_qnn_data_type, + int32_t input_offset, + float input_scale, + const std::vector& output_shape, + bool do_op_validation) { + // Assume input is already handled. 
+ float qmin = 0.0f; + float qmax = 255.0f; + ORT_RETURN_IF_ERROR(qnn::utils::GetQminQmax(input_qnn_data_type, qmin, qmax)); + double value_min = qnn::utils::Dequantize(input_offset, input_scale, qmin); + double value_max = qnn::utils::Dequantize(input_offset, input_scale, qmax); + float scale = 0.0f; + int32_t offset = 0; + ORT_RETURN_IF_ERROR(qnn::utils::GetQuantParams(static_cast(value_min), + static_cast(value_max), + output_qnn_data_type, + scale, + offset)); + + std::vector output_shape_copy = output_shape; + QnnTensorWrapper convert_output_tensorwrapper(convert_output_name, + QNN_TENSOR_TYPE_NATIVE, + output_qnn_data_type, + QnnQuantParamsWrapper(scale, offset), + std::move(output_shape_copy)); + ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(convert_output_tensorwrapper)), "Failed to add tensor."); + + ORT_RETURN_IF_NOT(qnn_model_wrapper.CreateQnnNode(convert_output_name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + "Convert", + {convert_input_name}, + {convert_output_name}, + {}, + do_op_validation), + "Failed to add node."); + return Status::OK(); +} + +inline bool IsQuant16bit(Qnn_DataType_t qnn_data_type) { + return qnn_data_type == QNN_DATATYPE_UFIXED_POINT_16 || qnn_data_type == QNN_DATATYPE_SFIXED_POINT_16; +} + Status CheckInputs(const QnnModelWrapper& qnn_model_wrapper, const NodeUnitIODef& input_def_0, const NodeUnitIODef& input_def_1, TensorInfo& input_info_0, TensorInfo& input_info_1, bool& use_fully_connected) { ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(input_def_0, input_info_0)); ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(input_def_1, input_info_1)); - // Use FullyConnected if 2nd input is 2D initializer or 1D tensor. +#if QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR <= 20 + // Validation crashes when using QNN FullyConnected with QNN SDK versions 2.26 - 2.27. + // Just use QNN MatMul for these older QNN SDK versions. 
+ use_fully_connected = false; +#else + // Use FullyConnected if 2nd input is a rank 2 initializer or a rank 1 tensor. // FullyConnected cannot pass the Op validation if keep_dims is true, so if input_0 is per-channel quantized tensor - // with rank > 2, it's not easy to set the quantization parameters for the output reshaped 2D tensor. + // with rank > 2, it's not easy to set the quantization parameters for the output reshaped rank 2 tensor. // In this case, we will not use FullyConnected. use_fully_connected = (input_info_1.shape.size() == 2 && input_info_1.is_initializer) || input_info_1.shape.size() == 1; use_fully_connected = use_fully_connected && !(input_info_0.quant_param.IsPerChannel() && input_info_0.shape.size() > 2); + // Don't use FullyConnected if both inputs are dynamic and uint16 (quantized) + use_fully_connected = use_fully_connected && !(IsQuant16bit(input_info_0.qnn_data_type) && + !input_info_0.is_initializer && + IsQuant16bit(input_info_1.qnn_data_type) && + !input_info_1.is_initializer); +#endif return Status::OK(); } +// Process input[0] for ONNX MatMul that can be translated to either a QNN MatMul or a QNN FullyConnected. +Status ProcessInput0(QnnModelWrapper& qnn_model_wrapper, + const TensorInfo& input_0_info, + const std::string& original_input_0_name, + std::vector& input_names, + const logging::Logger& logger, + bool do_op_validation) { + bool reshape_input_0 = input_0_info.shape.size() == 1; + std::string actual_input_0_name = original_input_0_name; + + if (reshape_input_0) { + actual_input_0_name = original_input_0_name + "_ort_qnn_ep_reshape"; + std::vector shape_2d{1, input_0_info.shape[0]}; + QnnQuantParamsWrapper quant_param_2d = input_0_info.quant_param.Copy(); + ORT_RETURN_IF_ERROR(quant_param_2d.HandleUnsqueeze(input_0_info.shape, shape_2d)); + + // If input_0 is initializer, unpack it and add the tensor with new quantization parameter and shape. + // Otherwise, add a Reshape node. 
+ if (input_0_info.is_initializer) { + std::vector unpacked_tensor; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_0_info.initializer_tensor, unpacked_tensor)); + QnnTensorWrapper input_tensorwrapper(actual_input_0_name, QNN_TENSOR_TYPE_STATIC, input_0_info.qnn_data_type, + std::move(quant_param_2d), std::move(shape_2d), std::move(unpacked_tensor)); + ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(input_tensorwrapper)), "Failed to add tensor."); + } else { + ORT_RETURN_IF_ERROR(qnn_model_wrapper.AddReshapeNode(original_input_0_name, actual_input_0_name, + input_0_info.shape, shape_2d, + input_0_info.qnn_data_type, input_0_info.quant_param, + quant_param_2d, do_op_validation, + qnn_model_wrapper.IsGraphInput(original_input_0_name), false)); + } + } else { + if (qnn_model_wrapper.IsQnnTensorWrapperExist(actual_input_0_name)) { + LOGS(logger, VERBOSE) << "Tensor already added, skip it: " << actual_input_0_name; + } else { + QnnTensorWrapper input_0_tensor; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.MakeTensorWrapper(input_0_info, actual_input_0_name, input_0_tensor)); + ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(input_0_tensor)), "Failed to add tensor."); + } + } + input_names.emplace_back(actual_input_0_name); + + return Status::OK(); +} } // namespace +// Process operator inputs. Dispatches to other processing functions depending on whether we're +// translating an ONNX MatMul to a QNN MatMul or a QNN FullyConnected. 
Status MatMulOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, const logging::Logger& logger, std::vector& input_names, bool do_op_validation) const { @@ -63,77 +184,55 @@ Status MatMulOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, const bool use_fully_connected = false; ORT_RETURN_IF_ERROR( CheckInputs(qnn_model_wrapper, inputs[0], inputs[1], input_info_0, input_info_1, use_fully_connected)); - bool reshape_input_0 = input_info_0.shape.size() == 1; - bool reshape_input_1 = input_info_1.shape.size() == 1; - // Process input 0. - const std::string& org_input_0_name = inputs[0].node_arg.Name(); - std::string input_0_name = org_input_0_name; - if (reshape_input_0) { - input_0_name = org_input_0_name + "_ort_qnn_ep_reshape"; - std::vector shape_2d{1, input_info_0.shape[0]}; - QnnQuantParamsWrapper quant_param_2d = input_info_0.quant_param.Copy(); - ORT_RETURN_IF_ERROR(quant_param_2d.HandleUnsqueeze(input_info_0.shape, shape_2d)); - - // If input_0 is initializer, unpack it and add the tensor with new quantization parameter and shape. - // Otherwise, add a Reshape node. 
- if (input_info_0.is_initializer) { - std::vector unpacked_tensor; - ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_info_0.initializer_tensor, unpacked_tensor)); - Qnn_TensorType_t tensor_type = qnn_model_wrapper.GetTensorType(org_input_0_name); - QnnTensorWrapper input_tensorwrapper(input_0_name, tensor_type, input_info_0.qnn_data_type, - std::move(quant_param_2d), std::move(shape_2d), std::move(unpacked_tensor)); - ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(input_tensorwrapper)), "Failed to add tensor."); - } else { - ORT_RETURN_IF_ERROR(qnn_model_wrapper.AddReshapeNode(org_input_0_name, input_0_name, input_info_0.shape, shape_2d, - input_info_0.qnn_data_type, input_info_0.quant_param, - quant_param_2d, do_op_validation, - qnn_model_wrapper.IsGraphInput(org_input_0_name), false)); - } - } else { - if (qnn_model_wrapper.IsQnnTensorWrapperExist(input_0_name)) { - LOGS(logger, VERBOSE) << "Tensor already added, skip it: " << input_0_name; - } else { - QnnTensorWrapper input_0_tensor; - ORT_RETURN_IF_ERROR(qnn_model_wrapper.MakeTensorWrapper(inputs[0], input_0_tensor)); - ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(input_0_tensor)), "Failed to add tensor."); - } + if (use_fully_connected) { + return ProcessInputsForQnnFullyConnected(qnn_model_wrapper, + node_unit, + input_info_0, + input_info_1, + logger, + input_names, + do_op_validation); } - input_names.emplace_back(input_0_name); + return ProcessInputsForQnnMatMul(qnn_model_wrapper, + node_unit, + input_info_0, + input_info_1, + logger, + input_names, + do_op_validation); +} + +Status MatMulOpBuilder::ProcessInputsForQnnMatMul(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const TensorInfo& input_info_0, + const TensorInfo& input_info_1, + const logging::Logger& logger, + std::vector& input_names, + bool do_op_validation) const { + const auto& inputs = node_unit.Inputs(); + const bool reshape_input_1 = input_info_1.shape.size() == 1; 
+ + const std::string& org_input_0_name = inputs[0].node_arg.Name(); + ORT_RETURN_IF_ERROR(ProcessInput0(qnn_model_wrapper, input_info_0, org_input_0_name, input_names, + logger, do_op_validation)); // Process input 1. const std::string& org_input_1_name = inputs[1].node_arg.Name(); std::string input_1_name = org_input_1_name; - if (reshape_input_1 || use_fully_connected) { + if (reshape_input_1) { + // Input[1] is a rank 1 tensor that needs to be reshaped. std::vector shape_2d; QnnQuantParamsWrapper quant_param_2d = input_info_1.quant_param.Copy(); - if (reshape_input_1) { - // Input is 1D tensor. - input_1_name = org_input_1_name + "_ort_qnn_ep_reshape"; - if (use_fully_connected) { - // FullyConnected requires input_1's shape to be [n, k]. - shape_2d = {1, input_info_1.shape[0]}; - } else { - shape_2d = {input_info_1.shape[0], 1}; - } - ORT_RETURN_IF_ERROR(quant_param_2d.HandleUnsqueeze(input_info_1.shape, shape_2d)); - } else { - input_1_name = org_input_1_name + "_ort_qnn_ep_transpose"; - shape_2d = {input_info_1.shape[1], input_info_1.shape[0]}; - ORT_RETURN_IF_ERROR(quant_param_2d.HandleTranspose(std::vector({1, 0}))); - } + input_1_name = org_input_1_name + "_ort_qnn_ep_reshape"; + shape_2d = {input_info_1.shape[0], 1}; + ORT_RETURN_IF_ERROR(quant_param_2d.HandleUnsqueeze(input_info_1.shape, shape_2d)); // If input_1 is initializer, unpack it and add the tensor with new quantization parameter and shape. // Otherwise, add a Reshape node. if (input_info_1.is_initializer) { std::vector unpacked_tensor; - if (use_fully_connected && !reshape_input_1) { - // 2D initializer should be transposed to [n, k]. 
- ORT_RETURN_IF_ERROR(TwoDimensionTranspose(qnn_model_wrapper, input_info_1.shape, - *input_info_1.initializer_tensor, unpacked_tensor)); - } else { - ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_info_1.initializer_tensor, unpacked_tensor)); - } + ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_info_1.initializer_tensor, unpacked_tensor)); Qnn_TensorType_t tensor_type = qnn_model_wrapper.GetTensorType(org_input_1_name); QnnTensorWrapper input_tensorwrapper(input_1_name, tensor_type, input_info_1.qnn_data_type, @@ -156,6 +255,108 @@ Status MatMulOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, const } input_names.emplace_back(input_1_name); + // Workaround that inserts a QNN Convert op before input[1] (converts from quantized uint16 to quantized uint8) + // to avoid a QNN validation failure. + // + // QNN graph WITHOUT workaround (fails validation): + // input_0_uint16 ---> MatMul ---> output_uint16 + // ^ + // | + // input_1_uint16 -----+ + // + // QNN graph WITH workaround (passes validation): + // input_0_uint16 ----------------------> MatMul ---> output_uint16 + // ^ + // | + // input_1_uint16 --> Convert(to uint8) --+ + if (!input_info_0.is_initializer && !input_info_1.is_initializer && + input_info_0.qnn_data_type == input_info_1.qnn_data_type && + input_info_0.qnn_data_type == QNN_DATATYPE_UFIXED_POINT_16) { + ORT_RETURN_IF_NOT(input_info_1.quant_param.IsPerTensor(), + "MatMul's activation inputs only support per-tensor quantization"); + const Qnn_QuantizeParams_t& quant_param = input_info_1.quant_param.Get(); + // insert Convert op after input1 + std::string convert_input_name = input_names.back(); + input_names.pop_back(); + const std::string& matmul_output_name = node_unit.Outputs()[0].node_arg.Name(); + std::string convert_output_name = convert_input_name + "_convert_" + matmul_output_name; + std::vector input_1_shape = input_info_1.shape; + if (reshape_input_1) { + input_1_shape = 
{input_info_1.shape[0], 1}; + } + ORT_RETURN_IF_ERROR(InsertConvertOp(qnn_model_wrapper, + convert_input_name, + convert_output_name, + input_info_1.qnn_data_type, + QNN_DATATYPE_UFIXED_POINT_8, + quant_param.scaleOffsetEncoding.offset, + quant_param.scaleOffsetEncoding.scale, + input_1_shape, + do_op_validation)); + input_names.push_back(convert_output_name); + } + return Status::OK(); +} + +Status MatMulOpBuilder::ProcessInputsForQnnFullyConnected(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const TensorInfo& input_info_0, + const TensorInfo& input_info_1, + const logging::Logger& logger, + std::vector& input_names, + bool do_op_validation) const { + const auto& inputs = node_unit.Inputs(); + const bool reshape_input_1 = input_info_1.shape.size() == 1; + + const std::string& org_input_0_name = inputs[0].node_arg.Name(); + ORT_RETURN_IF_ERROR(ProcessInput0(qnn_model_wrapper, input_info_0, org_input_0_name, input_names, + logger, do_op_validation)); + + // Process input 1. + const std::string& org_input_1_name = inputs[1].node_arg.Name(); + std::string input_1_name = org_input_1_name; + std::vector shape_2d; + QnnQuantParamsWrapper quant_param_2d = input_info_1.quant_param.Copy(); + if (reshape_input_1) { + // Input[1] is a rank 1 tensor that needs to be reshaped. + input_1_name = org_input_1_name + "_ort_qnn_ep_reshape"; + + // FullyConnected requires input_1's shape to be [n, k]. + shape_2d = {1, input_info_1.shape[0]}; + ORT_RETURN_IF_ERROR(quant_param_2d.HandleUnsqueeze(input_info_1.shape, shape_2d)); + } else { + assert(input_info_1.shape.size() == 2); + input_1_name = org_input_1_name + "_ort_qnn_ep_transpose"; + shape_2d = {input_info_1.shape[1], input_info_1.shape[0]}; + ORT_RETURN_IF_ERROR(quant_param_2d.HandleTranspose(std::vector({1, 0}))); + } + + // If input_1 is initializer, unpack it and add the tensor with new quantization parameter and shape. + // Otherwise, add a Reshape node. 
+ if (input_info_1.is_initializer) { + std::vector unpacked_tensor; + if (!reshape_input_1) { + // 2D initializer should be transposed to [n, k]. + std::vector original_shape_copy = input_info_1.shape; + ORT_RETURN_IF_ERROR(TwoDimensionTranspose(qnn_model_wrapper, + original_shape_copy, // Will be modified to new shape (unnecessary) + *input_info_1.initializer_tensor, + unpacked_tensor)); + } else { + ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_info_1.initializer_tensor, unpacked_tensor)); + } + + Qnn_TensorType_t tensor_type = qnn_model_wrapper.GetTensorType(org_input_1_name); + QnnTensorWrapper input_tensorwrapper(input_1_name, tensor_type, input_info_1.qnn_data_type, + std::move(quant_param_2d), std::move(shape_2d), std::move(unpacked_tensor)); + ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(input_tensorwrapper)), "Failed to add tensor."); + } else { + ORT_RETURN_IF_ERROR(qnn_model_wrapper.AddReshapeNode(org_input_1_name, input_1_name, input_info_1.shape, shape_2d, + input_info_1.qnn_data_type, input_info_1.quant_param, + quant_param_2d, do_op_validation, + qnn_model_wrapper.IsGraphInput(org_input_1_name), false)); + } + input_names.emplace_back(input_1_name); return Status::OK(); } @@ -172,6 +373,24 @@ Status MatMulOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_w bool reshape_input_1 = input_info_1.shape.size() == 1; bool reshape_output = reshape_input_0 || reshape_input_1 || (use_fully_connected && input_info_0.shape.size() > 2); + // For QNN MatMul: set the input transpose parameters to their default values of 0. These parameters should be + // optional, but older versions of QNN SDK failed validation if not explicitly provided. 
+ std::vector param_tensor_names; + if (!use_fully_connected) { + Qnn_Scalar_t scalar_param = QNN_SCALAR_INIT; + scalar_param.dataType = QNN_DATATYPE_BOOL_8; + scalar_param.bool8Value = 0; + QnnParamWrapper transpose_in0_param(node_unit.Index(), node_unit.Name(), QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0, + scalar_param); + param_tensor_names.push_back(transpose_in0_param.GetParamTensorName()); + qnn_model_wrapper.AddParamWrapper(std::move(transpose_in0_param)); + + QnnParamWrapper transpose_in1_param(node_unit.Index(), node_unit.Name(), QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, + scalar_param); + param_tensor_names.push_back(transpose_in1_param.GetParamTensorName()); + qnn_model_wrapper.AddParamWrapper(std::move(transpose_in1_param)); + } + const std::string& org_output_name = node_unit.Outputs()[0].node_arg.Name(); std::string op_output_name = org_output_name; TensorInfo output_info{}; @@ -207,7 +426,8 @@ Status MatMulOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_w "Failed to add output tensor."); ORT_RETURN_IF_NOT(qnn_model_wrapper.CreateQnnNode(utils::GetNodeName(node_unit), QNN_OP_PACKAGE_NAME_QTI_AISW, use_fully_connected ? 
QNN_OP_FULLY_CONNECTED : QNN_OP_MAT_MUL, - std::move(input_names), {op_output_name}, {}, do_op_validation), + std::move(input_names), {op_output_name}, + std::move(param_tensor_names), do_op_validation), "Failed to add fused Matmul node."); if (reshape_output) { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc index a6c4203ad9..9902617b71 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc @@ -22,11 +22,6 @@ class SimpleOpBuilder : public BaseOpBuilder { ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(SimpleOpBuilder); protected: - Status ProcessInputs(QnnModelWrapper& qnn_model_wrapper, - const NodeUnit& node_unit, - const logging::Logger& logger, - std::vector& input_names, - bool do_op_validation) const override ORT_MUST_USE_RESULT; Status ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, std::vector&& input_names, @@ -53,91 +48,6 @@ class SimpleOpBuilder : public BaseOpBuilder { static constexpr std::array gridsample_supported_padding_modes = {"zeros", "border", "reflection"}; }; -// Move to qnn_utils if it's re-usable -Status InsertConvertOp(QnnModelWrapper& qnn_model_wrapper, - const std::string& convert_input_name, - const std::string& convert_output_name, - Qnn_DataType_t input_qnn_data_type, - Qnn_DataType_t output_qnn_data_type, - int32_t input_offset, - float input_scale, - const std::vector& output_shape, - bool do_op_validation) { - // Assume input is already handled. 
- float qmin = 0.0f; - float qmax = 255.0f; - ORT_RETURN_IF_ERROR(qnn::utils::GetQminQmax(input_qnn_data_type, qmin, qmax)); - double value_min = qnn::utils::Dequantize(input_offset, input_scale, qmin); - double value_max = qnn::utils::Dequantize(input_offset, input_scale, qmax); - float scale = 0.0f; - int32_t offset = 0; - ORT_RETURN_IF_ERROR(qnn::utils::GetQuantParams(static_cast(value_min), - static_cast(value_max), - output_qnn_data_type, - scale, - offset)); - - std::vector output_shape_copy = output_shape; - QnnTensorWrapper convert_output_tensorwrapper(convert_output_name, - QNN_TENSOR_TYPE_NATIVE, - output_qnn_data_type, - QnnQuantParamsWrapper(scale, offset), - std::move(output_shape_copy)); - ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(convert_output_tensorwrapper)), "Failed to add tensor."); - - ORT_RETURN_IF_NOT(qnn_model_wrapper.CreateQnnNode(convert_output_name, - QNN_OP_PACKAGE_NAME_QTI_AISW, - "Convert", - {convert_input_name}, - {convert_output_name}, - {}, - do_op_validation), - "Failed to add node."); - return Status::OK(); -} - -Status SimpleOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, - const NodeUnit& node_unit, - const logging::Logger& logger, - std::vector& input_names, - bool do_op_validation) const { - const std::string& op_type = node_unit.OpType(); - ORT_RETURN_IF_ERROR(BaseOpBuilder::ProcessInputs(qnn_model_wrapper, node_unit, logger, input_names, do_op_validation)); - - if (op_type == "MatMul") { - const auto& inputs = node_unit.Inputs(); - TensorInfo input0_info = {}; - TensorInfo input1_info = {}; - ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[0], input0_info)); - ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[1], input1_info)); - // Need to insert Convert op if both inputs are dynamic inputs and are ufixed_16 - if (!input0_info.is_initializer && !input1_info.is_initializer && - input0_info.qnn_data_type == input1_info.qnn_data_type && - input0_info.qnn_data_type == 
QNN_DATATYPE_UFIXED_POINT_16) { - ORT_RETURN_IF_NOT(input1_info.quant_param.IsPerTensor(), - "MatMul's activation inputs only support per-tensor quantization"); - const Qnn_QuantizeParams_t& quant_param = input1_info.quant_param.Get(); - // insert Convert op after input1 - std::string convert_input_name = input_names.back(); - input_names.pop_back(); - const std::string& matmul_output_name = node_unit.Outputs()[0].node_arg.Name(); - std::string convert_output_name = convert_input_name + "_convert_" + matmul_output_name; - ORT_RETURN_IF_ERROR(InsertConvertOp(qnn_model_wrapper, - convert_input_name, - convert_output_name, - input1_info.qnn_data_type, - QNN_DATATYPE_UFIXED_POINT_8, - quant_param.scaleOffsetEncoding.offset, - quant_param.scaleOffsetEncoding.scale, - input1_info.shape, - do_op_validation)); - input_names.push_back(convert_output_name); - } - } - - return Status::OK(); -} - Status SimpleOpBuilder::ExplicitOpCheck(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit) const { const std::string& op_type = node_unit.OpType(); @@ -378,19 +288,6 @@ Status SimpleOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_w ORT_RETURN_IF(norm_p_order != 2, "QNN EP only supports LpNormalization with 'p' attribute equal to 2."); } - if (op_type == "MatMul") { - Qnn_Scalar_t scalar_param = QNN_SCALAR_INIT; - scalar_param.dataType = QNN_DATATYPE_BOOL_8; - scalar_param.bool8Value = 0; - QnnParamWrapper transpose_in0_param(node_unit.Index(), node_unit.Name(), QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0, scalar_param); - param_tensor_names.push_back(transpose_in0_param.GetParamTensorName()); - qnn_model_wrapper.AddParamWrapper(std::move(transpose_in0_param)); - - QnnParamWrapper transpose_in1_param(node_unit.Index(), node_unit.Name(), QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, scalar_param); - param_tensor_names.push_back(transpose_in1_param.GetParamTensorName()); - qnn_model_wrapper.AddParamWrapper(std::move(transpose_in1_param)); - } - if (op_type == "LeakyRelu") { 
std::string input_name = "alpha"; ORT_RETURN_IF_ERROR(ProcessAlphaAttributeAsInput(qnn_model_wrapper, node_unit, input_name)); diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc index 9147bade3b..79f8f176a2 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc @@ -75,6 +75,20 @@ Status QnnModelWrapper::MakeTensorWrapper(const NodeUnitIODef& tensor, QnnTensor return Status::OK(); } +Status QnnModelWrapper::MakeTensorWrapper(const TensorInfo& tensor_info, + const std::string& tensor_name, + QnnTensorWrapper& tensor_wrapper) const { + std::vector unpacked_tensor; + if (tensor_info.is_initializer) { + ORT_RETURN_IF_ERROR(UnpackInitializerData(*tensor_info.initializer_tensor, unpacked_tensor)); + } + + tensor_wrapper = QnnTensorWrapper(tensor_name, GetTensorType(tensor_name), tensor_info.qnn_data_type, + tensor_info.quant_param.Copy(), std::vector(tensor_info.shape), + std::move(unpacked_tensor)); + return Status::OK(); +} + bool QnnModelWrapper::AddTensorWrapper(QnnTensorWrapper&& tensor_wrapper) { // Keep a copy of tensor name sine it will be moved with the wrapper into model_tensors_map_ std::string tensor_name = tensor_wrapper.GetName(); diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h index e24d4939dd..d018ca12d6 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h @@ -66,6 +66,9 @@ class QnnModelWrapper { // Make a QnnTensorWrapper from an onnx input or output. 
Status MakeTensorWrapper(const NodeUnitIODef& tensor, QnnTensorWrapper& tensor_wrapper) const; + Status MakeTensorWrapper(const TensorInfo& tensor_info, + const std::string& tensor_name, + QnnTensorWrapper& tensor_wrapper) const; // Add to internal tensor wrapper table bool AddTensorWrapper(QnnTensorWrapper&& tensor_wrapper); diff --git a/onnxruntime/test/providers/qnn/matmul_test.cpp b/onnxruntime/test/providers/qnn/matmul_test.cpp index c6cbf0236c..4b2aee5f6a 100644 --- a/onnxruntime/test/providers/qnn/matmul_test.cpp +++ b/onnxruntime/test/providers/qnn/matmul_test.cpp @@ -290,10 +290,65 @@ TEST_F(QnnHTPBackendTests, MatMulOp_QDQ) { RunQDQPerChannelMatMulOpTest({2, 3, 3, 3}, {3, 2}, -1, QDQTolerance(), ExpectedEPNodeAssignment::All, 18, true); - // // UINT16, per-channel INT8 weight + // UINT16, per-channel INT8 weight RunQDQPerChannelMatMulOpTest({2, 3}, {3, 2}, 1, QDQTolerance(), ExpectedEPNodeAssignment::All, 21, false, false); - RunQDQPerChannelMatMulOpTest({2, 3, 3}, {3}, -1); + RunQDQPerChannelMatMulOpTest({2, 3, 3}, {3}, -1, QDQTolerance(0.0041f)); +} + +// Tests MatMul with two uint16 (quantized) inputs that are both dynamic. +// This exercises a workaround in QNN EP that inserts a QNN Convert op before input[1] (converts from uint16 to uint8). +// This workaround prevents a validation error for this specific MatMul configuration. +// Got specific shapes and input ranges (quant params) from customer model. 
+TEST_F(QnnHTPBackendTests, MatMulOp_QDQ_Regression_uint16_dynamic_inputs) { + ProviderOptions provider_options; +#if defined(_WIN32) + provider_options["backend_path"] = "QnnHtp.dll"; +#else + provider_options["backend_path"] = "libQnnHtp.so"; +#endif + + // Test with rank 4 inputs + { + std::vector shape_0 = {1, 12, 512, 96}; + TestInputDef input0_def( + {1, 12, 512, 96}, false, + GetFloatDataInRange(-5.087f, 4.992f, + static_cast(std::accumulate(shape_0.begin(), shape_0.end(), static_cast(1), + std::multiplies())))); + std::vector shape_1 = {1, 12, 96, 512}; + TestInputDef input1_def( + shape_1, false, + GetFloatDataInRange(-6.772f, 7.258f, + static_cast(std::accumulate(shape_1.begin(), shape_1.end(), static_cast(1), + std::multiplies())))); + + TestQDQModelAccuracy( + BuildMatMulOpTestCase(input0_def, input1_def), + BuildMatMulOpQDQTestCase(input0_def, input1_def, false), + provider_options, 21, ExpectedEPNodeAssignment::All, QDQTolerance()); + } + + // Test with input[1] as rank 1 + { + std::vector shape_0 = {1, 12, 512, 96}; + TestInputDef input0_def( + {1, 12, 512, 96}, false, + GetFloatDataInRange(-5.087f, 4.992f, + static_cast(std::accumulate(shape_0.begin(), shape_0.end(), static_cast(1), + std::multiplies())))); + std::vector shape_1 = {96}; + TestInputDef input1_def( + shape_1, false, + GetFloatDataInRange(-6.772f, 7.258f, + static_cast(std::accumulate(shape_1.begin(), shape_1.end(), static_cast(1), + std::multiplies())))); + + TestQDQModelAccuracy( + BuildMatMulOpTestCase(input0_def, input1_def), + BuildMatMulOpQDQTestCase(input0_def, input1_def, false), + provider_options, 21, ExpectedEPNodeAssignment::All, QDQTolerance()); + } } #endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)