From a6c18ae9df741ea661c3abc2f7bcce80e2924c77 Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Mon, 17 Jun 2024 09:46:14 -0700 Subject: [PATCH] [QNN EP] Add quantization axis checks for Conv/ConvTranspose/Q/DQ ops (#21016) ### Description Updates QNN EP to reject Conv/ConvTranspose/Q/DQ ops with unsupported quantization axis values. ### Motivation and Context Allows these unsupported operators to be handled by the CPU EP. Fixes errors like the following: > Node 'ConvTranspose' OpType:ConvTranspose with domain:com.ms.internal.nhwc was inserted using the NHWC format as requested by QNNExecutionProvider, but was not selected by that EP. This means the graph is now invalid as there will not be an EP able to run the node. This could be a bug in layout transformer, or in the GetCapability implementation of the EP. --------- Signed-off-by: adrianlizarraga --- onnxruntime/core/framework/node_unit.cc | 29 +++++-- .../qnn/builder/opbuilder/conv_op_builder.cc | 10 ++- .../builder/opbuilder/simple_op_builder.cc | 21 ++++- .../qnn/builder/qnn_model_wrapper.cc | 12 ++- .../providers/qnn/builder/qnn_model_wrapper.h | 6 +- onnxruntime/test/providers/qnn/conv_test.cc | 81 +++++++++++++++++-- 6 files changed, 139 insertions(+), 20 deletions(-) diff --git a/onnxruntime/core/framework/node_unit.cc b/onnxruntime/core/framework/node_unit.cc index 174942b903..4e2f22dea1 100644 --- a/onnxruntime/core/framework/node_unit.cc +++ b/onnxruntime/core/framework/node_unit.cc @@ -283,6 +283,7 @@ ProviderType NodeUnit::GetExecutionProviderType() const noexcept { return target void NodeUnit::InitForSingleNode() { const auto& input_defs = target_node_.InputDefs(); const auto& output_defs = target_node_.OutputDefs(); + const auto& node_attrs = target_node_.GetAttributes(); auto qlinear_type = GetQLinearOpType(target_node_); if (qlinear_type == QLinearOpType::Unknown || IsVariadicQLinearOp(qlinear_type)) { // TODO, add variadic support // Not a Qlinear op, add all inputs / outputs @@ -321,19 +322,35 @@ void NodeUnit::InitForSingleNode() { // DequantizeLinear has 3 inputs // x, x_scale, x_zp // output is not quantized - inputs_.push_back(NodeUnitIODef{*input_defs[0], NodeUnitIODef::QuantParam{*input_defs[1], input_defs.size() == 3 - ? input_defs[2] - : nullptr}}); + + // Get the DQ axis attribute if available. + std::optional axis; + if (auto entry = node_attrs.find("axis"); entry != node_attrs.end()) { + axis = entry->second.i(); + } + + inputs_.push_back(NodeUnitIODef{*input_defs[0], + NodeUnitIODef::QuantParam{*input_defs[1], + input_defs.size() == 3 ? input_defs[2] : nullptr, + axis}}); outputs_.push_back(NodeUnitIODef{*output_defs[0], std::nullopt}); } else if (qlinear_type == QLinearOpType::QuantizeLinear) { // QuantizeLinear the input is not quantized and has 3 inputs // x, y_scale, y_zp (optional) // The output is quantized + + // Get the Q axis attribute if available. + std::optional axis; + if (auto entry = node_attrs.find("axis"); entry != node_attrs.end()) { + axis = entry->second.i(); + } + inputs_.push_back(NodeUnitIODef{*input_defs[0], std::nullopt}); - outputs_.push_back(NodeUnitIODef{*output_defs[0], NodeUnitIODef::QuantParam{*input_defs[1], input_defs.size() == 3 - ? input_defs[2] - : nullptr}}); + outputs_.push_back(NodeUnitIODef{*output_defs[0], + NodeUnitIODef::QuantParam{*input_defs[1], + input_defs.size() == 3 ? input_defs[2] : nullptr, + axis}}); } else { ORT_THROW("The QLinear op [", static_cast(qlinear_type), "] is not supported"); } diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc index 4eeca98454..1713f201c9 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc @@ -120,7 +120,8 @@ Status ConvOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper, if (is_npu_backend) { const auto& input_1 = inputs[1]; // weight bool is_per_axis_quant = false; - ORT_RETURN_IF_ERROR(qnn_model_wrapper.IsPerChannelQuantized(input_1, is_per_axis_quant)); + int64_t quant_axis = 0; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.IsPerChannelQuantized(input_1, is_per_axis_quant, quant_axis)); if (is_per_axis_quant) { int32_t elem_data_type = 0; @@ -129,6 +130,13 @@ Status ConvOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper, const bool is_signed_type = (elem_data_type == ONNX_NAMESPACE::TensorProto_DataType_INT8) || (elem_data_type == ONNX_NAMESPACE::TensorProto_DataType_INT16); ORT_RETURN_IF_NOT(is_signed_type, "Conv weights must be of a signed quantized type if quantized per-channel"); + + if (conv_type == OnnxConvType::kConvTranspose) { + ORT_RETURN_IF_NOT(quant_axis == 1, + "ConvTranspose's input[1] must be use axis == 1 for per-channel quantization"); + } else { + ORT_RETURN_IF_NOT(quant_axis == 0, "Conv's input[1] must be use axis == 0 for per-channel quantization"); + } } } diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc index 7e2d1ef05b..285781aaa3 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc @@ -41,7 +41,7 @@ class SimpleOpBuilder : public BaseOpBuilder { QnnQuantParamsWrapper& quant_param) const override ORT_MUST_USE_RESULT; private: - Status ExplicitOpCheck(const NodeUnit& node_unit) const; + Status ExplicitOpCheck(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit) const; Status ProcessSigmoidOrTanhOutput(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, std::vector&& input_names, @@ -138,7 +138,8 @@ Status SimpleOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, return Status::OK(); } -Status SimpleOpBuilder::ExplicitOpCheck(const NodeUnit& node_unit) const { +Status SimpleOpBuilder::ExplicitOpCheck(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit) const { const std::string& op_type = node_unit.OpType(); if (op_type == "GridSample") { @@ -158,6 +159,20 @@ Status SimpleOpBuilder::ExplicitOpCheck(const NodeUnit& node_unit) const { "QNN EP only supports Min and Max operators with exactly 2 inputs."); } + if (op_type == "DequantizeLinear") { + bool is_per_chan_quant = false; + int64_t quant_axis = 0; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.IsPerChannelQuantized(node_unit.Inputs()[0], is_per_chan_quant, quant_axis)); + ORT_RETURN_IF(is_per_chan_quant, "QNN EP does not support a standalone DQ op with per-channel quantization"); + } + + if (op_type == "QuantizeLinear") { + bool is_per_chan_quant = false; + int64_t quant_axis = 0; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.IsPerChannelQuantized(node_unit.Outputs()[0], is_per_chan_quant, quant_axis)); + ORT_RETURN_IF(is_per_chan_quant, "QNN EP does not support a standalone Q op with per-channel quantization"); + } + return Status::OK(); } @@ -475,7 +490,7 @@ Status SimpleOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_w const std::string& op_type = node_unit.OpType(); if (do_op_validation) { - ORT_RETURN_IF_ERROR(ExplicitOpCheck(node_unit)); + ORT_RETURN_IF_ERROR(ExplicitOpCheck(qnn_model_wrapper, node_unit)); // Skip the op validation for DepthToSpace & SpaceToDepth if it's not NHWC data layout if (node_unit.Domain() != kMSInternalNHWCDomain && (op_type == "DepthToSpace" || op_type == "SpaceToDepth" || op_type == "GridSample")) { return Status::OK(); diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc index 6f9ac20285..3a8a8af17b 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc @@ -412,9 +412,10 @@ Status QnnModelWrapper::UnpackScales(const std::string& initializer_name, std::v // Checks if a tensor in the ONNX graph is per-channel quantized. Status QnnModelWrapper::IsPerChannelQuantized(const onnxruntime::NodeUnitIODef& io_def, - /*out*/ bool& is_per_axis) const { + /*out*/ bool& is_per_channel, + /*out*/ int64_t& axis) const { if (!io_def.quant_param) { - is_per_axis = false; + is_per_channel = false; return Status::OK(); } @@ -432,7 +433,12 @@ Status QnnModelWrapper::IsPerChannelQuantized(const onnxruntime::NodeUnitIODef& const bool is_scalar_or_1_elem_vector = scale_shape.NumDimensions() == 0 || (scale_shape.NumDimensions() == 1 && scale_shape.Size() == 1); - is_per_axis = !is_scalar_or_1_elem_vector; + is_per_channel = !is_scalar_or_1_elem_vector; + + if (is_per_channel) { + axis = io_def.quant_param->axis.value_or(1); // 1 is default axis for Q/DQ ops. + } + return Status::OK(); } diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h index 3fd268d1b3..0705a1d1b8 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h @@ -218,8 +218,10 @@ class QnnModelWrapper { // Unpack zero-points from initializer and convert to int32_t (1 zero-point for per-tensor, > 1 for per-channel). Status UnpackZeroPoints(const std::string& initializer_name, std::vector& zero_points) const; - // Checks if a tensor in the ONNX graph is per-axis quantized. - Status IsPerChannelQuantized(const onnxruntime::NodeUnitIODef& io_def, /*out*/ bool& is_per_axis) const; + // Checks if a tensor in the ONNX graph is per-channel quantized. + Status IsPerChannelQuantized(const onnxruntime::NodeUnitIODef& io_def, + /*out*/ bool& is_per_channel, + /*out*/ int64_t& axis) const; private: bool CreateQnnInputOutputTensors(const std::string& qnn_node_name, diff --git a/onnxruntime/test/providers/qnn/conv_test.cc b/onnxruntime/test/providers/qnn/conv_test.cc index b88578a915..5177a629ce 100644 --- a/onnxruntime/test/providers/qnn/conv_test.cc +++ b/onnxruntime/test/providers/qnn/conv_test.cc @@ -154,6 +154,7 @@ static GetTestQDQModelFn BuildQDQPerChannelConvTestCase(const s const TestInputDef& input_def, const TestInputDef& weights_def, const TestInputDef& bias_def, + int64_t weight_quant_axis, const std::vector& strides, const std::vector& pads, const std::vector& dilations, @@ -161,8 +162,9 @@ static GetTestQDQModelFn BuildQDQPerChannelConvTestCase(const s const std::string& auto_pad = "NOTSET", bool use_contrib_qdq = false) { return [conv_op_type, input_def, weights_def, bias_def, strides, pads, - dilations, group, auto_pad, use_contrib_qdq](ModelTestBuilder& builder, - std::vector>& output_qparams) { + dilations, group, auto_pad, use_contrib_qdq, + weight_quant_axis](ModelTestBuilder& builder, + std::vector>& output_qparams) { std::vector conv_inputs; // input -> Q/DQ -> @@ -174,7 +176,6 @@ static GetTestQDQModelFn BuildQDQPerChannelConvTestCase(const s // Quantized(weights) -> DQ -> ORT_ENFORCE(weights_def.IsInitializer() && weights_def.IsRawData()); - int64_t weight_quant_axis = conv_op_type == "Conv" ? 0 : 1; // 0 for Conv, 1 for ConvTranspose std::vector weight_scales; std::vector weight_zero_points; GetTestInputQuantParamsPerChannel(weights_def, weight_scales, weight_zero_points, @@ -283,6 +284,7 @@ template static void RunHTPConvOpPerChannelTest(const std::string& conv_op_type, const TestInputDef& input_def, const TestInputDef& weights_def, const TestInputDef& bias_def, + int64_t weight_quant_axis, const std::vector& strides, const std::vector& pads, const std::vector& dilations, @@ -303,8 +305,9 @@ static void RunHTPConvOpPerChannelTest(const std::string& conv_op_type, const Te auto f32_fn = BuildF32ConvTestCase(conv_op_type, input_def, weights_def, bias_def, strides, pads, dilations, group, auto_pad); auto qdq_fn = BuildQDQPerChannelConvTestCase(conv_op_type, input_def, weights_def, - bias_def, strides, pads, dilations, - group, auto_pad, use_contrib_qdq); + bias_def, weight_quant_axis, strides, + pads, dilations, group, auto_pad, + use_contrib_qdq); TestQDQModelAccuracy(f32_fn, qdq_fn, provider_options, opset, expected_ep_assignment, tolerance); } @@ -713,6 +716,7 @@ TEST_F(QnnHTPBackendTests, ConvU8S8S32_PerChannel) { input_def, weight_def, bias_def, + 0, // weight quant axis {1, 1}, // Strides {0, 0, 0, 0}, // Pads {1, 1}, // Dilations @@ -723,6 +727,34 @@ TEST_F(QnnHTPBackendTests, ConvU8S8S32_PerChannel) { 13); // opset } +// Test per-channel QDQ Conv is rejected with weight axis != 0 +TEST_F(QnnHTPBackendTests, Conv_PerChannel_UnsupportedAxis) { + std::vector input_shape = {1, 2, 4, 4}; + std::vector weight_shape = {3, 2, 3, 3}; + std::vector bias_shape = {3}; + + TestInputDef input_def(input_shape, false, + GetFloatDataInRange(-10.0f, 10.0f, TensorShape(input_shape).Size())); + TestInputDef weight_def(weight_shape, true, + GetFloatDataInRange(-1.0f, 5.0f, TensorShape(weight_shape).Size())); + TestInputDef bias_def(bias_shape, true, + GetFloatDataInRange(-1.0f, 1.0f, TensorShape(bias_shape).Size())); + + RunHTPConvOpPerChannelTest("Conv", + input_def, + weight_def, + bias_def, + 2, // weight quant axis + {1, 1}, // Strides + {0, 0, 0, 0}, // Pads + {1, 1}, // Dilations + 1, // default group + "NOTSET", + ExpectedEPNodeAssignment::None, + false, // use_qdq_contrib_ops + 13); // opset +} + // Test per-channel QDQ Conv. in0: u8, in1 (weight): s8, in2 (bias): s32, out: u8 // \QNN\HTP\HTP\src\hexagon\prepare\graph_prepare.cc:203:ERROR:could not create op: q::QNN_Conv3d_w_scale // \QNN\HTP\HTP\src\hexagon\prepare\graph_prepare.cc:1187:ERROR:Op 0x1a preparation failed with err:-1 @@ -748,6 +780,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_Conv3D_U8S8S32_PerChannel) { input_def, weight_def, bias_def, + 0, // weight quant axis {1, 1, 1}, // Strides {0, 0, 0, 0, 0, 0}, // Pads {1, 1, 1}, // Dilations @@ -776,6 +809,7 @@ TEST_F(QnnHTPBackendTests, ConvDepthwiseU8S8S32_PerChannel) { input_def, weight_def, bias_def, + 0, // weight quant axis {1, 1}, // Strides {0, 0, 0, 0}, // Pads {1, 1}, // Dilations @@ -811,6 +845,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_Conv3D_U8S8S32_PerChannel2) { input_def, weight_def, bias_def, + 0, // weight quant axis {1, 1, 1}, // Strides {0, 0, 0, 0, 0, 0}, // Pads {1, 1, 1}, // Dilations @@ -838,6 +873,7 @@ TEST_F(QnnHTPBackendTests, ConvTransposeU8S8S32_PerChannel) { input_def, weight_def, bias_def, + 1, // weight quant axis {1, 1}, // Strides {0, 0, 0, 0}, // Pads {1, 1}, // Dilations @@ -848,6 +884,34 @@ TEST_F(QnnHTPBackendTests, ConvTransposeU8S8S32_PerChannel) { 13); // opset } +// Test per-channel QDQ ConvTranspose is unsupported with weight axis != 1. +TEST_F(QnnHTPBackendTests, ConvTranspose_PerChannel_UnsupportedAxis) { + std::vector input_shape = {1, 2, 4, 4}; + std::vector weight_shape = {2, 3, 3, 3}; + std::vector bias_shape = {3}; + + TestInputDef input_def(input_shape, false, + GetFloatDataInRange(-10.0f, 10.0f, TensorShape(input_shape).Size())); + TestInputDef weight_def(weight_shape, true, + GetFloatDataInRange(-1.0f, 5.0f, TensorShape(weight_shape).Size())); + TestInputDef bias_def(bias_shape, true, + GetFloatDataInRange(-1.0f, 1.0f, TensorShape(bias_shape).Size())); + + RunHTPConvOpPerChannelTest("ConvTranspose", + input_def, + weight_def, + bias_def, + 2, // weight quant axis + {1, 1}, // Strides + {0, 0, 0, 0}, // Pads + {1, 1}, // Dilations + 1, // default group + "NOTSET", + ExpectedEPNodeAssignment::None, + false, // use_qdq_contrib_ops + 13); // opset +} + // ConvTranspose3D per-channel // Disable it for 2.21 since it failed, re-enabled it for 2.22 TEST_F(QnnHTPBackendTests, DISABLED_ConvTranspose3D_U8S8S32_PerChannel) { @@ -866,6 +930,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_ConvTranspose3D_U8S8S32_PerChannel) { input_def, weight_def, bias_def, + 1, // weight quant axis {1, 1, 1}, // Strides {0, 0, 0, 0, 0, 0}, // Pads {1, 1, 1}, // Dilations @@ -893,6 +958,7 @@ TEST_F(QnnHTPBackendTests, ConvU16S8S32_PerChannel) { input_def, weight_def, bias_def, + 0, // weight quant axis {1, 1}, // Strides {0, 0, 0, 0}, // Pads {1, 1}, // Dilations @@ -928,6 +994,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_Conv3D_U16S8S32_PerChannel) { input_def, weight_def, bias_def, + 0, // weight quant axis {1, 1, 1}, // Strides {0, 0, 0, 0, 0, 0}, // Pads {1, 1, 1}, // Dilations @@ -955,6 +1022,7 @@ TEST_F(QnnHTPBackendTests, ConvTransposeU16S8S32_PerChannel) { input_def, weight_def, bias_def, + 1, // weight quant axis {1, 1}, // Strides {0, 0, 0, 0}, // Pads {1, 1}, // Dilations @@ -982,6 +1050,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_ConvTranspose3D_U16S8S32_PerChannel) { input_def, weight_def, bias_def, + 1, // weight quant axis {1, 1, 1}, // Strides {0, 0, 0, 0, 0, 0}, // Pads {1, 1, 1}, // Dilations @@ -1010,6 +1079,7 @@ TEST_F(QnnHTPBackendTests, ConvDepthwiseU16S8S32_PerChannel) { input_def, weight_def, bias_def, + 0, // weight quant axis {1, 1}, // Strides {0, 0, 0, 0}, // Pads {1, 1}, // Dilations @@ -1045,6 +1115,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_Conv3D_U16S8S32_PerChannel2) { input_def, weight_def, bias_def, + 0, // weight quant axis {1, 1, 1}, // Strides {0, 0, 0, 0, 0, 0}, // Pads {1, 1, 1}, // Dilations