diff --git a/onnxruntime/core/framework/node_unit.cc b/onnxruntime/core/framework/node_unit.cc index 174942b903..4e2f22dea1 100644 --- a/onnxruntime/core/framework/node_unit.cc +++ b/onnxruntime/core/framework/node_unit.cc @@ -283,6 +283,7 @@ ProviderType NodeUnit::GetExecutionProviderType() const noexcept { return target void NodeUnit::InitForSingleNode() { const auto& input_defs = target_node_.InputDefs(); const auto& output_defs = target_node_.OutputDefs(); + const auto& node_attrs = target_node_.GetAttributes(); auto qlinear_type = GetQLinearOpType(target_node_); if (qlinear_type == QLinearOpType::Unknown || IsVariadicQLinearOp(qlinear_type)) { // TODO, add variadic support // Not a Qlinear op, add all inputs / outputs @@ -321,19 +322,35 @@ void NodeUnit::InitForSingleNode() { // DequantizeLinear has 3 inputs // x, x_scale, x_zp // output is not quantized - inputs_.push_back(NodeUnitIODef{*input_defs[0], NodeUnitIODef::QuantParam{*input_defs[1], input_defs.size() == 3 - ? input_defs[2] - : nullptr}}); + + // Get the DQ axis attribute if available. + std::optional axis; + if (auto entry = node_attrs.find("axis"); entry != node_attrs.end()) { + axis = entry->second.i(); + } + + inputs_.push_back(NodeUnitIODef{*input_defs[0], + NodeUnitIODef::QuantParam{*input_defs[1], + input_defs.size() == 3 ? input_defs[2] : nullptr, + axis}}); outputs_.push_back(NodeUnitIODef{*output_defs[0], std::nullopt}); } else if (qlinear_type == QLinearOpType::QuantizeLinear) { // QuantizeLinear the input is not quantized and has 3 inputs // x, y_scale, y_zp (optional) // The output is quantized + + // Get the Q axis attribute if available. + std::optional axis; + if (auto entry = node_attrs.find("axis"); entry != node_attrs.end()) { + axis = entry->second.i(); + } + inputs_.push_back(NodeUnitIODef{*input_defs[0], std::nullopt}); - outputs_.push_back(NodeUnitIODef{*output_defs[0], NodeUnitIODef::QuantParam{*input_defs[1], input_defs.size() == 3 - ? input_defs[2] - : nullptr}}); + outputs_.push_back(NodeUnitIODef{*output_defs[0], + NodeUnitIODef::QuantParam{*input_defs[1], + input_defs.size() == 3 ? input_defs[2] : nullptr, + axis}}); } else { ORT_THROW("The QLinear op [", static_cast(qlinear_type), "] is not supported"); } diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc index 4eeca98454..1713f201c9 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc @@ -120,7 +120,8 @@ Status ConvOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper, if (is_npu_backend) { const auto& input_1 = inputs[1]; // weight bool is_per_axis_quant = false; - ORT_RETURN_IF_ERROR(qnn_model_wrapper.IsPerChannelQuantized(input_1, is_per_axis_quant)); + int64_t quant_axis = 0; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.IsPerChannelQuantized(input_1, is_per_axis_quant, quant_axis)); if (is_per_axis_quant) { int32_t elem_data_type = 0; @@ -129,6 +130,13 @@ Status ConvOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper, const bool is_signed_type = (elem_data_type == ONNX_NAMESPACE::TensorProto_DataType_INT8) || (elem_data_type == ONNX_NAMESPACE::TensorProto_DataType_INT16); ORT_RETURN_IF_NOT(is_signed_type, "Conv weights must be of a signed quantized type if quantized per-channel"); + + if (conv_type == OnnxConvType::kConvTranspose) { + ORT_RETURN_IF_NOT(quant_axis == 1, + "ConvTranspose's input[1] must be use axis == 1 for per-channel quantization"); + } else { + ORT_RETURN_IF_NOT(quant_axis == 0, "Conv's input[1] must be use axis == 0 for per-channel quantization"); + } } } diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc index 7e2d1ef05b..285781aaa3 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc @@ -41,7 +41,7 @@ class SimpleOpBuilder : public BaseOpBuilder { QnnQuantParamsWrapper& quant_param) const override ORT_MUST_USE_RESULT; private: - Status ExplicitOpCheck(const NodeUnit& node_unit) const; + Status ExplicitOpCheck(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit) const; Status ProcessSigmoidOrTanhOutput(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, std::vector&& input_names, @@ -138,7 +138,8 @@ Status SimpleOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, return Status::OK(); } -Status SimpleOpBuilder::ExplicitOpCheck(const NodeUnit& node_unit) const { +Status SimpleOpBuilder::ExplicitOpCheck(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit) const { const std::string& op_type = node_unit.OpType(); if (op_type == "GridSample") { @@ -158,6 +159,20 @@ Status SimpleOpBuilder::ExplicitOpCheck(const NodeUnit& node_unit) const { "QNN EP only supports Min and Max operators with exactly 2 inputs."); } + if (op_type == "DequantizeLinear") { + bool is_per_chan_quant = false; + int64_t quant_axis = 0; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.IsPerChannelQuantized(node_unit.Inputs()[0], is_per_chan_quant, quant_axis)); + ORT_RETURN_IF(is_per_chan_quant, "QNN EP does not support a standalone DQ op with per-channel quantization"); + } + + if (op_type == "QuantizeLinear") { + bool is_per_chan_quant = false; + int64_t quant_axis = 0; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.IsPerChannelQuantized(node_unit.Outputs()[0], is_per_chan_quant, quant_axis)); + ORT_RETURN_IF(is_per_chan_quant, "QNN EP does not support a standalone Q op with per-channel quantization"); + } + return Status::OK(); } @@ -475,7 +490,7 @@ Status SimpleOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_w const std::string& op_type = node_unit.OpType(); if (do_op_validation) { - ORT_RETURN_IF_ERROR(ExplicitOpCheck(node_unit)); + ORT_RETURN_IF_ERROR(ExplicitOpCheck(qnn_model_wrapper, node_unit)); // Skip the op validation for DepthToSpace & SpaceToDepth if it's not NHWC data layout if (node_unit.Domain() != kMSInternalNHWCDomain && (op_type == "DepthToSpace" || op_type == "SpaceToDepth" || op_type == "GridSample")) { return Status::OK(); diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc index 6f9ac20285..3a8a8af17b 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc @@ -412,9 +412,10 @@ Status QnnModelWrapper::UnpackScales(const std::string& initializer_name, std::v // Checks if a tensor in the ONNX graph is per-channel quantized. Status QnnModelWrapper::IsPerChannelQuantized(const onnxruntime::NodeUnitIODef& io_def, - /*out*/ bool& is_per_axis) const { + /*out*/ bool& is_per_channel, + /*out*/ int64_t& axis) const { if (!io_def.quant_param) { - is_per_axis = false; + is_per_channel = false; return Status::OK(); } @@ -432,7 +433,12 @@ Status QnnModelWrapper::IsPerChannelQuantized(const onnxruntime::NodeUnitIODef& const bool is_scalar_or_1_elem_vector = scale_shape.NumDimensions() == 0 || (scale_shape.NumDimensions() == 1 && scale_shape.Size() == 1); - is_per_axis = !is_scalar_or_1_elem_vector; + is_per_channel = !is_scalar_or_1_elem_vector; + + if (is_per_channel) { + axis = io_def.quant_param->axis.value_or(1); // 1 is default axis for Q/DQ ops. + } + return Status::OK(); } diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h index 3fd268d1b3..0705a1d1b8 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h @@ -218,8 +218,10 @@ class QnnModelWrapper { // Unpack zero-points from initializer and convert to int32_t (1 zero-point for per-tensor, > 1 for per-channel). Status UnpackZeroPoints(const std::string& initializer_name, std::vector& zero_points) const; - // Checks if a tensor in the ONNX graph is per-axis quantized. - Status IsPerChannelQuantized(const onnxruntime::NodeUnitIODef& io_def, /*out*/ bool& is_per_axis) const; + // Checks if a tensor in the ONNX graph is per-channel quantized. + Status IsPerChannelQuantized(const onnxruntime::NodeUnitIODef& io_def, + /*out*/ bool& is_per_channel, + /*out*/ int64_t& axis) const; private: bool CreateQnnInputOutputTensors(const std::string& qnn_node_name, diff --git a/onnxruntime/test/providers/qnn/conv_test.cc b/onnxruntime/test/providers/qnn/conv_test.cc index b88578a915..5177a629ce 100644 --- a/onnxruntime/test/providers/qnn/conv_test.cc +++ b/onnxruntime/test/providers/qnn/conv_test.cc @@ -154,6 +154,7 @@ static GetTestQDQModelFn BuildQDQPerChannelConvTestCase(const s const TestInputDef& input_def, const TestInputDef& weights_def, const TestInputDef& bias_def, + int64_t weight_quant_axis, const std::vector& strides, const std::vector& pads, const std::vector& dilations, @@ -161,8 +162,9 @@ static GetTestQDQModelFn BuildQDQPerChannelConvTestCase(const s const std::string& auto_pad = "NOTSET", bool use_contrib_qdq = false) { return [conv_op_type, input_def, weights_def, bias_def, strides, pads, - dilations, group, auto_pad, use_contrib_qdq](ModelTestBuilder& builder, - std::vector>& output_qparams) { + dilations, group, auto_pad, use_contrib_qdq, + weight_quant_axis](ModelTestBuilder& builder, + std::vector>& output_qparams) { std::vector conv_inputs; // input -> Q/DQ -> @@ -174,7 +176,6 @@ static GetTestQDQModelFn BuildQDQPerChannelConvTestCase(const s // Quantized(weights) -> DQ -> ORT_ENFORCE(weights_def.IsInitializer() && weights_def.IsRawData()); - int64_t weight_quant_axis = conv_op_type == "Conv" ? 0 : 1; // 0 for Conv, 1 for ConvTranspose std::vector weight_scales; std::vector weight_zero_points; GetTestInputQuantParamsPerChannel(weights_def, weight_scales, weight_zero_points, @@ -283,6 +284,7 @@ template static void RunHTPConvOpPerChannelTest(const std::string& conv_op_type, const TestInputDef& input_def, const TestInputDef& weights_def, const TestInputDef& bias_def, + int64_t weight_quant_axis, const std::vector& strides, const std::vector& pads, const std::vector& dilations, @@ -303,8 +305,9 @@ static void RunHTPConvOpPerChannelTest(const std::string& conv_op_type, const Te auto f32_fn = BuildF32ConvTestCase(conv_op_type, input_def, weights_def, bias_def, strides, pads, dilations, group, auto_pad); auto qdq_fn = BuildQDQPerChannelConvTestCase(conv_op_type, input_def, weights_def, - bias_def, strides, pads, dilations, - group, auto_pad, use_contrib_qdq); + bias_def, weight_quant_axis, strides, + pads, dilations, group, auto_pad, + use_contrib_qdq); TestQDQModelAccuracy(f32_fn, qdq_fn, provider_options, opset, expected_ep_assignment, tolerance); } @@ -713,6 +716,7 @@ TEST_F(QnnHTPBackendTests, ConvU8S8S32_PerChannel) { input_def, weight_def, bias_def, + 0, // weight quant axis {1, 1}, // Strides {0, 0, 0, 0}, // Pads {1, 1}, // Dilations @@ -723,6 +727,34 @@ TEST_F(QnnHTPBackendTests, ConvU8S8S32_PerChannel) { 13); // opset } +// Test per-channel QDQ Conv is rejected with weight axis != 0 +TEST_F(QnnHTPBackendTests, Conv_PerChannel_UnsupportedAxis) { + std::vector input_shape = {1, 2, 4, 4}; + std::vector weight_shape = {3, 2, 3, 3}; + std::vector bias_shape = {3}; + + TestInputDef input_def(input_shape, false, + GetFloatDataInRange(-10.0f, 10.0f, TensorShape(input_shape).Size())); + TestInputDef weight_def(weight_shape, true, + GetFloatDataInRange(-1.0f, 5.0f, TensorShape(weight_shape).Size())); + TestInputDef bias_def(bias_shape, true, + GetFloatDataInRange(-1.0f, 1.0f, TensorShape(bias_shape).Size())); + + RunHTPConvOpPerChannelTest("Conv", + input_def, + weight_def, + bias_def, + 2, // weight quant axis + {1, 1}, // Strides + {0, 0, 0, 0}, // Pads + {1, 1}, // Dilations + 1, // default group + "NOTSET", + ExpectedEPNodeAssignment::None, + false, // use_qdq_contrib_ops + 13); // opset +} + // Test per-channel QDQ Conv. in0: u8, in1 (weight): s8, in2 (bias): s32, out: u8 // \QNN\HTP\HTP\src\hexagon\prepare\graph_prepare.cc:203:ERROR:could not create op: q::QNN_Conv3d_w_scale // \QNN\HTP\HTP\src\hexagon\prepare\graph_prepare.cc:1187:ERROR:Op 0x1a preparation failed with err:-1 @@ -748,6 +780,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_Conv3D_U8S8S32_PerChannel) { input_def, weight_def, bias_def, + 0, // weight quant axis {1, 1, 1}, // Strides {0, 0, 0, 0, 0, 0}, // Pads {1, 1, 1}, // Dilations @@ -776,6 +809,7 @@ TEST_F(QnnHTPBackendTests, ConvDepthwiseU8S8S32_PerChannel) { input_def, weight_def, bias_def, + 0, // weight quant axis {1, 1}, // Strides {0, 0, 0, 0}, // Pads {1, 1}, // Dilations @@ -811,6 +845,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_Conv3D_U8S8S32_PerChannel2) { input_def, weight_def, bias_def, + 0, // weight quant axis {1, 1, 1}, // Strides {0, 0, 0, 0, 0, 0}, // Pads {1, 1, 1}, // Dilations @@ -838,6 +873,7 @@ TEST_F(QnnHTPBackendTests, ConvTransposeU8S8S32_PerChannel) { input_def, weight_def, bias_def, + 1, // weight quant axis {1, 1}, // Strides {0, 0, 0, 0}, // Pads {1, 1}, // Dilations @@ -848,6 +884,34 @@ TEST_F(QnnHTPBackendTests, ConvTransposeU8S8S32_PerChannel) { 13); // opset } +// Test per-channel QDQ ConvTranspose is unsupported with weight axis != 1. +TEST_F(QnnHTPBackendTests, ConvTranspose_PerChannel_UnsupportedAxis) { + std::vector input_shape = {1, 2, 4, 4}; + std::vector weight_shape = {2, 3, 3, 3}; + std::vector bias_shape = {3}; + + TestInputDef input_def(input_shape, false, + GetFloatDataInRange(-10.0f, 10.0f, TensorShape(input_shape).Size())); + TestInputDef weight_def(weight_shape, true, + GetFloatDataInRange(-1.0f, 5.0f, TensorShape(weight_shape).Size())); + TestInputDef bias_def(bias_shape, true, + GetFloatDataInRange(-1.0f, 1.0f, TensorShape(bias_shape).Size())); + + RunHTPConvOpPerChannelTest("ConvTranspose", + input_def, + weight_def, + bias_def, + 2, // weight quant axis + {1, 1}, // Strides + {0, 0, 0, 0}, // Pads + {1, 1}, // Dilations + 1, // default group + "NOTSET", + ExpectedEPNodeAssignment::None, + false, // use_qdq_contrib_ops + 13); // opset +} + // ConvTranspose3D per-channel // Disable it for 2.21 since it failed, re-enabled it for 2.22 TEST_F(QnnHTPBackendTests, DISABLED_ConvTranspose3D_U8S8S32_PerChannel) { @@ -866,6 +930,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_ConvTranspose3D_U8S8S32_PerChannel) { input_def, weight_def, bias_def, + 1, // weight quant axis {1, 1, 1}, // Strides {0, 0, 0, 0, 0, 0}, // Pads {1, 1, 1}, // Dilations @@ -893,6 +958,7 @@ TEST_F(QnnHTPBackendTests, ConvU16S8S32_PerChannel) { input_def, weight_def, bias_def, + 0, // weight quant axis {1, 1}, // Strides {0, 0, 0, 0}, // Pads {1, 1}, // Dilations @@ -928,6 +994,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_Conv3D_U16S8S32_PerChannel) { input_def, weight_def, bias_def, + 0, // weight quant axis {1, 1, 1}, // Strides {0, 0, 0, 0, 0, 0}, // Pads {1, 1, 1}, // Dilations @@ -955,6 +1022,7 @@ TEST_F(QnnHTPBackendTests, ConvTransposeU16S8S32_PerChannel) { input_def, weight_def, bias_def, + 1, // weight quant axis {1, 1}, // Strides {0, 0, 0, 0}, // Pads {1, 1}, // Dilations @@ -982,6 +1050,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_ConvTranspose3D_U16S8S32_PerChannel) { input_def, weight_def, bias_def, + 1, // weight quant axis {1, 1, 1}, // Strides {0, 0, 0, 0, 0, 0}, // Pads {1, 1, 1}, // Dilations @@ -1010,6 +1079,7 @@ TEST_F(QnnHTPBackendTests, ConvDepthwiseU16S8S32_PerChannel) { input_def, weight_def, bias_def, + 0, // weight quant axis {1, 1}, // Strides {0, 0, 0, 0}, // Pads {1, 1}, // Dilations @@ -1045,6 +1115,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_Conv3D_U16S8S32_PerChannel2) { input_def, weight_def, bias_def, + 0, // weight quant axis {1, 1, 1}, // Strides {0, 0, 0, 0, 0, 0}, // Pads {1, 1, 1}, // Dilations