[QNN EP] Add quantization axis checks for Conv/ConvTranspose/Q/DQ ops (#21016)

### Description

Updates QNN EP to reject Conv/ConvTranspose/Q/DQ ops with unsupported quantization axis values.

### Motivation and Context

Allows these unsupported operators to be handled by the CPU EP. Fixes errors like the following:

> Node 'ConvTranspose' OpType:ConvTranspose with domain:com.ms.internal.nhwc was inserted using the NHWC format as requested by QNNExecutionProvider, but was not selected by that EP. This means the graph is now invalid as there will not be an EP able to run the node. This could be a bug in layout transformer, or in the GetCapability implementation of the EP.

---------

Signed-off-by: adrianlizarraga <adlizarraga@microsoft.com>
2026-07-10 17:37:14 +00:00 · 2024-06-17 09:46:14 -07:00 · 2024-06-17 09:46:14 -07:00 · a6c18ae9df
commit a6c18ae9df
parent c501c6ffaf
6 changed files with 139 additions and 20 deletions
--- a/onnxruntime/core/framework/node_unit.cc
+++ b/onnxruntime/core/framework/node_unit.cc
@ -283,6 +283,7 @@ ProviderType NodeUnit::GetExecutionProviderType() const noexcept { return target
 void NodeUnit::InitForSingleNode() {
  const auto& input_defs = target_node_.InputDefs();
  const auto& output_defs = target_node_.OutputDefs();
+  const auto& node_attrs = target_node_.GetAttributes();
  auto qlinear_type = GetQLinearOpType(target_node_);
  if (qlinear_type == QLinearOpType::Unknown || IsVariadicQLinearOp(qlinear_type)) {  // TODO, add variadic support
    // Not a Qlinear op, add all inputs / outputs
@ -321,19 +322,35 @@ void NodeUnit::InitForSingleNode() {
    // DequantizeLinear has 3 inputs
    // x, x_scale, x_zp
    // output is not quantized
-    inputs_.push_back(NodeUnitIODef{*input_defs[0], NodeUnitIODef::QuantParam{*input_defs[1], input_defs.size() == 3
-                                                                                                  ? input_defs[2]
-                                                                                                  : nullptr}});
+
+    // Get the DQ axis attribute if available.
+    std::optional<int64_t> axis;
+    if (auto entry = node_attrs.find("axis"); entry != node_attrs.end()) {
+      axis = entry->second.i();
+    }
+
+    inputs_.push_back(NodeUnitIODef{*input_defs[0],
+                                    NodeUnitIODef::QuantParam{*input_defs[1],
+                                                              input_defs.size() == 3 ? input_defs[2] : nullptr,
+                                                              axis}});
    outputs_.push_back(NodeUnitIODef{*output_defs[0], std::nullopt});

  } else if (qlinear_type == QLinearOpType::QuantizeLinear) {
    // QuantizeLinear the input is not quantized and has 3 inputs
    // x, y_scale, y_zp (optional)
    // The output is quantized
+
+    // Get the Q axis attribute if available.
+    std::optional<int64_t> axis;
+    if (auto entry = node_attrs.find("axis"); entry != node_attrs.end()) {
+      axis = entry->second.i();
+    }
+
    inputs_.push_back(NodeUnitIODef{*input_defs[0], std::nullopt});
-    outputs_.push_back(NodeUnitIODef{*output_defs[0], NodeUnitIODef::QuantParam{*input_defs[1], input_defs.size() == 3
-                                                                                                    ? input_defs[2]
-                                                                                                    : nullptr}});
+    outputs_.push_back(NodeUnitIODef{*output_defs[0],
+                                     NodeUnitIODef::QuantParam{*input_defs[1],
+                                                               input_defs.size() == 3 ? input_defs[2] : nullptr,
+                                                               axis}});
  } else {
    ORT_THROW("The QLinear op [", static_cast<uint8_t>(qlinear_type), "] is not supported");
  }
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc
@ -120,7 +120,8 @@ Status ConvOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper,
  if (is_npu_backend) {
    const auto& input_1 = inputs[1];  // weight
    bool is_per_axis_quant = false;
-    ORT_RETURN_IF_ERROR(qnn_model_wrapper.IsPerChannelQuantized(input_1, is_per_axis_quant));
+    int64_t quant_axis = 0;
+    ORT_RETURN_IF_ERROR(qnn_model_wrapper.IsPerChannelQuantized(input_1, is_per_axis_quant, quant_axis));

    if (is_per_axis_quant) {
      int32_t elem_data_type = 0;
@ -129,6 +130,13 @@ Status ConvOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper,
      const bool is_signed_type = (elem_data_type == ONNX_NAMESPACE::TensorProto_DataType_INT8) ||
                                  (elem_data_type == ONNX_NAMESPACE::TensorProto_DataType_INT16);
      ORT_RETURN_IF_NOT(is_signed_type, "Conv weights must be of a signed quantized type if quantized per-channel");
+
+      if (conv_type == OnnxConvType::kConvTranspose) {
+        ORT_RETURN_IF_NOT(quant_axis == 1,
+                          "ConvTranspose's input[1] must be use axis == 1 for per-channel quantization");
+      } else {
+        ORT_RETURN_IF_NOT(quant_axis == 0, "Conv's input[1] must be use axis == 0 for per-channel quantization");
+      }
    }
  }

--- a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc
@ -41,7 +41,7 @@ class SimpleOpBuilder : public BaseOpBuilder {
                                  QnnQuantParamsWrapper& quant_param) const override ORT_MUST_USE_RESULT;

 private:
-  Status ExplicitOpCheck(const NodeUnit& node_unit) const;
+  Status ExplicitOpCheck(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit) const;
  Status ProcessSigmoidOrTanhOutput(QnnModelWrapper& qnn_model_wrapper,
                                    const NodeUnit& node_unit,
                                    std::vector<std::string>&& input_names,
@ -138,7 +138,8 @@ Status SimpleOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
  return Status::OK();
 }

-Status SimpleOpBuilder::ExplicitOpCheck(const NodeUnit& node_unit) const {
+Status SimpleOpBuilder::ExplicitOpCheck(QnnModelWrapper& qnn_model_wrapper,
+                                        const NodeUnit& node_unit) const {
  const std::string& op_type = node_unit.OpType();

  if (op_type == "GridSample") {
@ -158,6 +159,20 @@ Status SimpleOpBuilder::ExplicitOpCheck(const NodeUnit& node_unit) const {
                      "QNN EP only supports Min and Max operators with exactly 2 inputs.");
  }

+  if (op_type == "DequantizeLinear") {
+    bool is_per_chan_quant = false;
+    int64_t quant_axis = 0;
+    ORT_RETURN_IF_ERROR(qnn_model_wrapper.IsPerChannelQuantized(node_unit.Inputs()[0], is_per_chan_quant, quant_axis));
+    ORT_RETURN_IF(is_per_chan_quant, "QNN EP does not support a standalone DQ op with per-channel quantization");
+  }
+
+  if (op_type == "QuantizeLinear") {
+    bool is_per_chan_quant = false;
+    int64_t quant_axis = 0;
+    ORT_RETURN_IF_ERROR(qnn_model_wrapper.IsPerChannelQuantized(node_unit.Outputs()[0], is_per_chan_quant, quant_axis));
+    ORT_RETURN_IF(is_per_chan_quant, "QNN EP does not support a standalone Q op with per-channel quantization");
+  }
+
  return Status::OK();
 }

@ -475,7 +490,7 @@ Status SimpleOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_w
  const std::string& op_type = node_unit.OpType();

  if (do_op_validation) {
-    ORT_RETURN_IF_ERROR(ExplicitOpCheck(node_unit));
+    ORT_RETURN_IF_ERROR(ExplicitOpCheck(qnn_model_wrapper, node_unit));
    // Skip the op validation for DepthToSpace & SpaceToDepth if it's not NHWC data layout
    if (node_unit.Domain() != kMSInternalNHWCDomain && (op_type == "DepthToSpace" || op_type == "SpaceToDepth" || op_type == "GridSample")) {
      return Status::OK();
--- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc
@ -412,9 +412,10 @@ Status QnnModelWrapper::UnpackScales(const std::string& initializer_name, std::v

 // Checks if a tensor in the ONNX graph is per-channel quantized.
 Status QnnModelWrapper::IsPerChannelQuantized(const onnxruntime::NodeUnitIODef& io_def,
-                                              /*out*/ bool& is_per_axis) const {
+                                              /*out*/ bool& is_per_channel,
+                                              /*out*/ int64_t& axis) const {
  if (!io_def.quant_param) {
-    is_per_axis = false;
+    is_per_channel = false;
    return Status::OK();
  }

@ -432,7 +433,12 @@ Status QnnModelWrapper::IsPerChannelQuantized(const onnxruntime::NodeUnitIODef&
  const bool is_scalar_or_1_elem_vector = scale_shape.NumDimensions() == 0 ||
                                          (scale_shape.NumDimensions() == 1 && scale_shape.Size() == 1);

-  is_per_axis = !is_scalar_or_1_elem_vector;
+  is_per_channel = !is_scalar_or_1_elem_vector;
+
+  if (is_per_channel) {
+    axis = io_def.quant_param->axis.value_or(1);  // 1 is default axis for Q/DQ ops.
+  }
+
  return Status::OK();
 }

--- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h
@ -218,8 +218,10 @@ class QnnModelWrapper {
  // Unpack zero-points from initializer and convert to int32_t (1 zero-point for per-tensor, > 1 for per-channel).
  Status UnpackZeroPoints(const std::string& initializer_name, std::vector<int32_t>& zero_points) const;

-  // Checks if a tensor in the ONNX graph is per-axis quantized.
-  Status IsPerChannelQuantized(const onnxruntime::NodeUnitIODef& io_def, /*out*/ bool& is_per_axis) const;
+  // Checks if a tensor in the ONNX graph is per-channel quantized.
+  Status IsPerChannelQuantized(const onnxruntime::NodeUnitIODef& io_def,
+                               /*out*/ bool& is_per_channel,
+                               /*out*/ int64_t& axis) const;

 private:
  bool CreateQnnInputOutputTensors(const std::string& qnn_node_name,
--- a/onnxruntime/test/providers/qnn/conv_test.cc
+++ b/onnxruntime/test/providers/qnn/conv_test.cc
@ -154,6 +154,7 @@ static GetTestQDQModelFn<ActivationQType> BuildQDQPerChannelConvTestCase(const s
                                                                         const TestInputDef<float>& input_def,
                                                                         const TestInputDef<float>& weights_def,
                                                                         const TestInputDef<float>& bias_def,
+                                                                         int64_t weight_quant_axis,
                                                                         const std::vector<int64_t>& strides,
                                                                         const std::vector<int64_t>& pads,
                                                                         const std::vector<int64_t>& dilations,
@ -161,8 +162,9 @@ static GetTestQDQModelFn<ActivationQType> BuildQDQPerChannelConvTestCase(const s
                                                                         const std::string& auto_pad = "NOTSET",
                                                                         bool use_contrib_qdq = false) {
  return [conv_op_type, input_def, weights_def, bias_def, strides, pads,
-          dilations, group, auto_pad, use_contrib_qdq](ModelTestBuilder& builder,
-                                                       std::vector<QuantParams<ActivationQType>>& output_qparams) {
+          dilations, group, auto_pad, use_contrib_qdq,
+          weight_quant_axis](ModelTestBuilder& builder,
+                             std::vector<QuantParams<ActivationQType>>& output_qparams) {
    std::vector<NodeArg*> conv_inputs;

    // input -> Q/DQ ->
@ -174,7 +176,6 @@ static GetTestQDQModelFn<ActivationQType> BuildQDQPerChannelConvTestCase(const s

    // Quantized(weights) -> DQ ->
    ORT_ENFORCE(weights_def.IsInitializer() && weights_def.IsRawData());
-    int64_t weight_quant_axis = conv_op_type == "Conv" ? 0 : 1;  // 0 for Conv, 1 for ConvTranspose
    std::vector<float> weight_scales;
    std::vector<WeightQType> weight_zero_points;
    GetTestInputQuantParamsPerChannel<WeightQType>(weights_def, weight_scales, weight_zero_points,
@ -283,6 +284,7 @@ template <typename ActivationQType, typename WeightQType>
 static void RunHTPConvOpPerChannelTest(const std::string& conv_op_type, const TestInputDef<float>& input_def,
                                       const TestInputDef<float>& weights_def,
                                       const TestInputDef<float>& bias_def,
+                                       int64_t weight_quant_axis,
                                       const std::vector<int64_t>& strides,
                                       const std::vector<int64_t>& pads,
                                       const std::vector<int64_t>& dilations,
@ -303,8 +305,9 @@ static void RunHTPConvOpPerChannelTest(const std::string& conv_op_type, const Te
  auto f32_fn = BuildF32ConvTestCase(conv_op_type, input_def, weights_def, bias_def, strides, pads, dilations,
                                     group, auto_pad);
  auto qdq_fn = BuildQDQPerChannelConvTestCase<ActivationQType, WeightQType>(conv_op_type, input_def, weights_def,
-                                                                             bias_def, strides, pads, dilations,
-                                                                             group, auto_pad, use_contrib_qdq);
+                                                                             bias_def, weight_quant_axis, strides,
+                                                                             pads, dilations, group, auto_pad,
+                                                                             use_contrib_qdq);
  TestQDQModelAccuracy(f32_fn, qdq_fn, provider_options, opset, expected_ep_assignment, tolerance);
 }

@ -713,6 +716,7 @@ TEST_F(QnnHTPBackendTests, ConvU8S8S32_PerChannel) {
                                              input_def,
                                              weight_def,
                                              bias_def,
+                                              0,             // weight quant axis
                                              {1, 1},        // Strides
                                              {0, 0, 0, 0},  // Pads
                                              {1, 1},        // Dilations
@ -723,6 +727,34 @@ TEST_F(QnnHTPBackendTests, ConvU8S8S32_PerChannel) {
                                              13);    // opset
 }

+// Test per-channel QDQ Conv is rejected with weight axis != 0
+TEST_F(QnnHTPBackendTests, Conv_PerChannel_UnsupportedAxis) {
+  std::vector<int64_t> input_shape = {1, 2, 4, 4};
+  std::vector<int64_t> weight_shape = {3, 2, 3, 3};
+  std::vector<int64_t> bias_shape = {3};
+
+  TestInputDef<float> input_def(input_shape, false,
+                                GetFloatDataInRange(-10.0f, 10.0f, TensorShape(input_shape).Size()));
+  TestInputDef<float> weight_def(weight_shape, true,
+                                 GetFloatDataInRange(-1.0f, 5.0f, TensorShape(weight_shape).Size()));
+  TestInputDef<float> bias_def(bias_shape, true,
+                               GetFloatDataInRange(-1.0f, 1.0f, TensorShape(bias_shape).Size()));
+
+  RunHTPConvOpPerChannelTest<uint8_t, int8_t>("Conv",
+                                              input_def,
+                                              weight_def,
+                                              bias_def,
+                                              2,             // weight quant axis
+                                              {1, 1},        // Strides
+                                              {0, 0, 0, 0},  // Pads
+                                              {1, 1},        // Dilations
+                                              1,             // default group
+                                              "NOTSET",
+                                              ExpectedEPNodeAssignment::None,
+                                              false,  // use_qdq_contrib_ops
+                                              13);    // opset
+}
+
 // Test per-channel QDQ Conv. in0: u8, in1 (weight): s8, in2 (bias): s32, out: u8
 // \QNN\HTP\HTP\src\hexagon\prepare\graph_prepare.cc:203:ERROR:could not create op: q::QNN_Conv3d_w_scale
 // \QNN\HTP\HTP\src\hexagon\prepare\graph_prepare.cc:1187:ERROR:Op 0x1a preparation failed with err:-1
@ -748,6 +780,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_Conv3D_U8S8S32_PerChannel) {
                                              input_def,
                                              weight_def,
                                              bias_def,
+                                              0,                   // weight quant axis
                                              {1, 1, 1},           // Strides
                                              {0, 0, 0, 0, 0, 0},  // Pads
                                              {1, 1, 1},           // Dilations
@ -776,6 +809,7 @@ TEST_F(QnnHTPBackendTests, ConvDepthwiseU8S8S32_PerChannel) {
                                              input_def,
                                              weight_def,
                                              bias_def,
+                                              0,             // weight quant axis
                                              {1, 1},        // Strides
                                              {0, 0, 0, 0},  // Pads
                                              {1, 1},        // Dilations
@ -811,6 +845,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_Conv3D_U8S8S32_PerChannel2) {
                                              input_def,
                                              weight_def,
                                              bias_def,
+                                              0,                   // weight quant axis
                                              {1, 1, 1},           // Strides
                                              {0, 0, 0, 0, 0, 0},  // Pads
                                              {1, 1, 1},           // Dilations
@ -838,6 +873,7 @@ TEST_F(QnnHTPBackendTests, ConvTransposeU8S8S32_PerChannel) {
                                              input_def,
                                              weight_def,
                                              bias_def,
+                                              1,             // weight quant axis
                                              {1, 1},        // Strides
                                              {0, 0, 0, 0},  // Pads
                                              {1, 1},        // Dilations
@ -848,6 +884,34 @@ TEST_F(QnnHTPBackendTests, ConvTransposeU8S8S32_PerChannel) {
                                              13);    // opset
 }

+// Test per-channel QDQ ConvTranspose is unsupported with weight axis != 1.
+TEST_F(QnnHTPBackendTests, ConvTranspose_PerChannel_UnsupportedAxis) {
+  std::vector<int64_t> input_shape = {1, 2, 4, 4};
+  std::vector<int64_t> weight_shape = {2, 3, 3, 3};
+  std::vector<int64_t> bias_shape = {3};
+
+  TestInputDef<float> input_def(input_shape, false,
+                                GetFloatDataInRange(-10.0f, 10.0f, TensorShape(input_shape).Size()));
+  TestInputDef<float> weight_def(weight_shape, true,
+                                 GetFloatDataInRange(-1.0f, 5.0f, TensorShape(weight_shape).Size()));
+  TestInputDef<float> bias_def(bias_shape, true,
+                               GetFloatDataInRange(-1.0f, 1.0f, TensorShape(bias_shape).Size()));
+
+  RunHTPConvOpPerChannelTest<uint8_t, int8_t>("ConvTranspose",
+                                              input_def,
+                                              weight_def,
+                                              bias_def,
+                                              2,             // weight quant axis
+                                              {1, 1},        // Strides
+                                              {0, 0, 0, 0},  // Pads
+                                              {1, 1},        // Dilations
+                                              1,             // default group
+                                              "NOTSET",
+                                              ExpectedEPNodeAssignment::None,
+                                              false,  // use_qdq_contrib_ops
+                                              13);    // opset
+}
+
 // ConvTranspose3D per-channel
 // Disable it for 2.21 since it failed, re-enabled it for 2.22
 TEST_F(QnnHTPBackendTests, DISABLED_ConvTranspose3D_U8S8S32_PerChannel) {
@ -866,6 +930,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_ConvTranspose3D_U8S8S32_PerChannel) {
                                              input_def,
                                              weight_def,
                                              bias_def,
+                                              1,                   // weight quant axis
                                              {1, 1, 1},           // Strides
                                              {0, 0, 0, 0, 0, 0},  // Pads
                                              {1, 1, 1},           // Dilations
@ -893,6 +958,7 @@ TEST_F(QnnHTPBackendTests, ConvU16S8S32_PerChannel) {
                                               input_def,
                                               weight_def,
                                               bias_def,
+                                               0,             // weight quant axis
                                               {1, 1},        // Strides
                                               {0, 0, 0, 0},  // Pads
                                               {1, 1},        // Dilations
@ -928,6 +994,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_Conv3D_U16S8S32_PerChannel) {
                                               input_def,
                                               weight_def,
                                               bias_def,
+                                               0,                   // weight quant axis
                                               {1, 1, 1},           // Strides
                                               {0, 0, 0, 0, 0, 0},  // Pads
                                               {1, 1, 1},           // Dilations
@ -955,6 +1022,7 @@ TEST_F(QnnHTPBackendTests, ConvTransposeU16S8S32_PerChannel) {
                                               input_def,
                                               weight_def,
                                               bias_def,
+                                               1,             // weight quant axis
                                               {1, 1},        // Strides
                                               {0, 0, 0, 0},  // Pads
                                               {1, 1},        // Dilations
@ -982,6 +1050,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_ConvTranspose3D_U16S8S32_PerChannel) {
                                               input_def,
                                               weight_def,
                                               bias_def,
+                                               1,                   // weight quant axis
                                               {1, 1, 1},           // Strides
                                               {0, 0, 0, 0, 0, 0},  // Pads
                                               {1, 1, 1},           // Dilations
@ -1010,6 +1079,7 @@ TEST_F(QnnHTPBackendTests, ConvDepthwiseU16S8S32_PerChannel) {
                                               input_def,
                                               weight_def,
                                               bias_def,
+                                               0,             // weight quant axis
                                               {1, 1},        // Strides
                                               {0, 0, 0, 0},  // Pads
                                               {1, 1},        // Dilations
@ -1045,6 +1115,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_Conv3D_U16S8S32_PerChannel2) {
                                               input_def,
                                               weight_def,
                                               bias_def,
+                                               0,                   // weight quant axis
                                               {1, 1, 1},           // Strides
                                               {0, 0, 0, 0, 0, 0},  // Pads
                                               {1, 1, 1},           // Dilations