From a6c18ae9df741ea661c3abc2f7bcce80e2924c77 Mon Sep 17 00:00:00 2001
From: Adrian Lizarraga <adlizarraga@microsoft.com>
Date: Mon, 17 Jun 2024 09:46:14 -0700
Subject: [PATCH] [QNN EP] Add quantization axis checks for
 Conv/ConvTranspose/Q/DQ ops (#21016)

### Description
Updates QNN EP to reject Conv/ConvTranspose/Q/DQ ops with unsupported
quantization axis values.


### Motivation and Context
Allows these unsupported operators to be handled by the CPU EP.

Fixes errors like the following:

> Node 'ConvTranspose' OpType:ConvTranspose with
domain:com.ms.internal.nhwc was inserted using the NHWC format as
requested by QNNExecutionProvider, but was not selected by that EP. This
means the graph is now invalid as there will not be an EP able to run
the node. This could be a bug in layout transformer, or in the
GetCapability implementation of the EP.

---------

Signed-off-by: adrianlizarraga <adlizarraga@microsoft.com>
---
 onnxruntime/core/framework/node_unit.cc       | 29 +++++--
 .../qnn/builder/opbuilder/conv_op_builder.cc  | 10 ++-
 .../builder/opbuilder/simple_op_builder.cc    | 21 ++++-
 .../qnn/builder/qnn_model_wrapper.cc          | 12 ++-
 .../providers/qnn/builder/qnn_model_wrapper.h |  6 +-
 onnxruntime/test/providers/qnn/conv_test.cc   | 81 +++++++++++++++++--
 6 files changed, 139 insertions(+), 20 deletions(-)

diff --git a/onnxruntime/core/framework/node_unit.cc b/onnxruntime/core/framework/node_unit.cc
index 174942b903..4e2f22dea1 100644
--- a/onnxruntime/core/framework/node_unit.cc
+++ b/onnxruntime/core/framework/node_unit.cc
@@ -283,6 +283,7 @@ ProviderType NodeUnit::GetExecutionProviderType() const noexcept { return target
 void NodeUnit::InitForSingleNode() {
   const auto& input_defs = target_node_.InputDefs();
   const auto& output_defs = target_node_.OutputDefs();
+  const auto& node_attrs = target_node_.GetAttributes();
   auto qlinear_type = GetQLinearOpType(target_node_);
   if (qlinear_type == QLinearOpType::Unknown || IsVariadicQLinearOp(qlinear_type)) {  // TODO, add variadic support
     // Not a Qlinear op, add all inputs / outputs
@@ -321,19 +322,35 @@ void NodeUnit::InitForSingleNode() {
     // DequantizeLinear has 3 inputs
     // x, x_scale, x_zp
     // output is not quantized
-    inputs_.push_back(NodeUnitIODef{*input_defs[0], NodeUnitIODef::QuantParam{*input_defs[1], input_defs.size() == 3
-                                                                                                  ? input_defs[2]
-                                                                                                  : nullptr}});
+
+    // Get the DQ axis attribute if available.
+    std::optional<int64_t> axis;
+    if (auto entry = node_attrs.find("axis"); entry != node_attrs.end()) {
+      axis = entry->second.i();
+    }
+
+    inputs_.push_back(NodeUnitIODef{*input_defs[0],
+                                    NodeUnitIODef::QuantParam{*input_defs[1],
+                                                              input_defs.size() == 3 ? input_defs[2] : nullptr,
+                                                              axis}});
     outputs_.push_back(NodeUnitIODef{*output_defs[0], std::nullopt});
 
   } else if (qlinear_type == QLinearOpType::QuantizeLinear) {
     // QuantizeLinear the input is not quantized and has 3 inputs
     // x, y_scale, y_zp (optional)
     // The output is quantized
+
+    // Get the Q axis attribute if available.
+    std::optional<int64_t> axis;
+    if (auto entry = node_attrs.find("axis"); entry != node_attrs.end()) {
+      axis = entry->second.i();
+    }
+
     inputs_.push_back(NodeUnitIODef{*input_defs[0], std::nullopt});
-    outputs_.push_back(NodeUnitIODef{*output_defs[0], NodeUnitIODef::QuantParam{*input_defs[1], input_defs.size() == 3
-                                                                                                    ? input_defs[2]
-                                                                                                    : nullptr}});
+    outputs_.push_back(NodeUnitIODef{*output_defs[0],
+                                     NodeUnitIODef::QuantParam{*input_defs[1],
+                                                               input_defs.size() == 3 ? input_defs[2] : nullptr,
+                                                               axis}});
   } else {
     ORT_THROW("The QLinear op [", static_cast<uint8_t>(qlinear_type), "] is not supported");
   }
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc
index 4eeca98454..1713f201c9 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc
@@ -120,7 +120,8 @@ Status ConvOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper,
   if (is_npu_backend) {
     const auto& input_1 = inputs[1];  // weight
     bool is_per_axis_quant = false;
-    ORT_RETURN_IF_ERROR(qnn_model_wrapper.IsPerChannelQuantized(input_1, is_per_axis_quant));
+    int64_t quant_axis = 0;
+    ORT_RETURN_IF_ERROR(qnn_model_wrapper.IsPerChannelQuantized(input_1, is_per_axis_quant, quant_axis));
 
     if (is_per_axis_quant) {
       int32_t elem_data_type = 0;
@@ -129,6 +130,13 @@ Status ConvOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper,
       const bool is_signed_type = (elem_data_type == ONNX_NAMESPACE::TensorProto_DataType_INT8) ||
                                   (elem_data_type == ONNX_NAMESPACE::TensorProto_DataType_INT16);
       ORT_RETURN_IF_NOT(is_signed_type, "Conv weights must be of a signed quantized type if quantized per-channel");
+
+      if (conv_type == OnnxConvType::kConvTranspose) {
+        ORT_RETURN_IF_NOT(quant_axis == 1,
+                          "ConvTranspose's input[1] must be use axis == 1 for per-channel quantization");
+      } else {
+        ORT_RETURN_IF_NOT(quant_axis == 0, "Conv's input[1] must be use axis == 0 for per-channel quantization");
+      }
     }
   }
 
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc
index 7e2d1ef05b..285781aaa3 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc
@@ -41,7 +41,7 @@ class SimpleOpBuilder : public BaseOpBuilder {
                                   QnnQuantParamsWrapper& quant_param) const override ORT_MUST_USE_RESULT;
 
  private:
-  Status ExplicitOpCheck(const NodeUnit& node_unit) const;
+  Status ExplicitOpCheck(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit) const;
   Status ProcessSigmoidOrTanhOutput(QnnModelWrapper& qnn_model_wrapper,
                                     const NodeUnit& node_unit,
                                     std::vector<std::string>&& input_names,
@@ -138,7 +138,8 @@ Status SimpleOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
   return Status::OK();
 }
 
-Status SimpleOpBuilder::ExplicitOpCheck(const NodeUnit& node_unit) const {
+Status SimpleOpBuilder::ExplicitOpCheck(QnnModelWrapper& qnn_model_wrapper,
+                                        const NodeUnit& node_unit) const {
   const std::string& op_type = node_unit.OpType();
 
   if (op_type == "GridSample") {
@@ -158,6 +159,20 @@ Status SimpleOpBuilder::ExplicitOpCheck(const NodeUnit& node_unit) const {
                       "QNN EP only supports Min and Max operators with exactly 2 inputs.");
   }
 
+  if (op_type == "DequantizeLinear") {
+    bool is_per_chan_quant = false;
+    int64_t quant_axis = 0;
+    ORT_RETURN_IF_ERROR(qnn_model_wrapper.IsPerChannelQuantized(node_unit.Inputs()[0], is_per_chan_quant, quant_axis));
+    ORT_RETURN_IF(is_per_chan_quant, "QNN EP does not support a standalone DQ op with per-channel quantization");
+  }
+
+  if (op_type == "QuantizeLinear") {
+    bool is_per_chan_quant = false;
+    int64_t quant_axis = 0;
+    ORT_RETURN_IF_ERROR(qnn_model_wrapper.IsPerChannelQuantized(node_unit.Outputs()[0], is_per_chan_quant, quant_axis));
+    ORT_RETURN_IF(is_per_chan_quant, "QNN EP does not support a standalone Q op with per-channel quantization");
+  }
+
   return Status::OK();
 }
 
@@ -475,7 +490,7 @@ Status SimpleOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_w
   const std::string& op_type = node_unit.OpType();
 
   if (do_op_validation) {
-    ORT_RETURN_IF_ERROR(ExplicitOpCheck(node_unit));
+    ORT_RETURN_IF_ERROR(ExplicitOpCheck(qnn_model_wrapper, node_unit));
     // Skip the op validation for DepthToSpace & SpaceToDepth if it's not NHWC data layout
     if (node_unit.Domain() != kMSInternalNHWCDomain && (op_type == "DepthToSpace" || op_type == "SpaceToDepth" || op_type == "GridSample")) {
       return Status::OK();
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc
index 6f9ac20285..3a8a8af17b 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc
@@ -412,9 +412,10 @@ Status QnnModelWrapper::UnpackScales(const std::string& initializer_name, std::v
 
 // Checks if a tensor in the ONNX graph is per-channel quantized.
 Status QnnModelWrapper::IsPerChannelQuantized(const onnxruntime::NodeUnitIODef& io_def,
-                                              /*out*/ bool& is_per_axis) const {
+                                              /*out*/ bool& is_per_channel,
+                                              /*out*/ int64_t& axis) const {
   if (!io_def.quant_param) {
-    is_per_axis = false;
+    is_per_channel = false;
     return Status::OK();
   }
 
@@ -432,7 +433,12 @@ Status QnnModelWrapper::IsPerChannelQuantized(const onnxruntime::NodeUnitIODef&
   const bool is_scalar_or_1_elem_vector = scale_shape.NumDimensions() == 0 ||
                                           (scale_shape.NumDimensions() == 1 && scale_shape.Size() == 1);
 
-  is_per_axis = !is_scalar_or_1_elem_vector;
+  is_per_channel = !is_scalar_or_1_elem_vector;
+
+  if (is_per_channel) {
+    axis = io_def.quant_param->axis.value_or(1);  // 1 is default axis for Q/DQ ops.
+  }
+
   return Status::OK();
 }
 
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h
index 3fd268d1b3..0705a1d1b8 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h
@@ -218,8 +218,10 @@ class QnnModelWrapper {
   // Unpack zero-points from initializer and convert to int32_t (1 zero-point for per-tensor, > 1 for per-channel).
   Status UnpackZeroPoints(const std::string& initializer_name, std::vector<int32_t>& zero_points) const;
 
-  // Checks if a tensor in the ONNX graph is per-axis quantized.
-  Status IsPerChannelQuantized(const onnxruntime::NodeUnitIODef& io_def, /*out*/ bool& is_per_axis) const;
+  // Checks if a tensor in the ONNX graph is per-channel quantized.
+  Status IsPerChannelQuantized(const onnxruntime::NodeUnitIODef& io_def,
+                               /*out*/ bool& is_per_channel,
+                               /*out*/ int64_t& axis) const;
 
  private:
   bool CreateQnnInputOutputTensors(const std::string& qnn_node_name,
diff --git a/onnxruntime/test/providers/qnn/conv_test.cc b/onnxruntime/test/providers/qnn/conv_test.cc
index b88578a915..5177a629ce 100644
--- a/onnxruntime/test/providers/qnn/conv_test.cc
+++ b/onnxruntime/test/providers/qnn/conv_test.cc
@@ -154,6 +154,7 @@ static GetTestQDQModelFn<ActivationQType> BuildQDQPerChannelConvTestCase(const s
                                                                          const TestInputDef<float>& input_def,
                                                                          const TestInputDef<float>& weights_def,
                                                                          const TestInputDef<float>& bias_def,
+                                                                         int64_t weight_quant_axis,
                                                                          const std::vector<int64_t>& strides,
                                                                          const std::vector<int64_t>& pads,
                                                                          const std::vector<int64_t>& dilations,
@@ -161,8 +162,9 @@ static GetTestQDQModelFn<ActivationQType> BuildQDQPerChannelConvTestCase(const s
                                                                          const std::string& auto_pad = "NOTSET",
                                                                          bool use_contrib_qdq = false) {
   return [conv_op_type, input_def, weights_def, bias_def, strides, pads,
-          dilations, group, auto_pad, use_contrib_qdq](ModelTestBuilder& builder,
-                                                       std::vector<QuantParams<ActivationQType>>& output_qparams) {
+          dilations, group, auto_pad, use_contrib_qdq,
+          weight_quant_axis](ModelTestBuilder& builder,
+                             std::vector<QuantParams<ActivationQType>>& output_qparams) {
     std::vector<NodeArg*> conv_inputs;
 
     // input -> Q/DQ ->
@@ -174,7 +176,6 @@ static GetTestQDQModelFn<ActivationQType> BuildQDQPerChannelConvTestCase(const s
 
     // Quantized(weights) -> DQ ->
     ORT_ENFORCE(weights_def.IsInitializer() && weights_def.IsRawData());
-    int64_t weight_quant_axis = conv_op_type == "Conv" ? 0 : 1;  // 0 for Conv, 1 for ConvTranspose
     std::vector<float> weight_scales;
     std::vector<WeightQType> weight_zero_points;
     GetTestInputQuantParamsPerChannel<WeightQType>(weights_def, weight_scales, weight_zero_points,
@@ -283,6 +284,7 @@ template <typename ActivationQType, typename WeightQType>
 static void RunHTPConvOpPerChannelTest(const std::string& conv_op_type, const TestInputDef<float>& input_def,
                                        const TestInputDef<float>& weights_def,
                                        const TestInputDef<float>& bias_def,
+                                       int64_t weight_quant_axis,
                                        const std::vector<int64_t>& strides,
                                        const std::vector<int64_t>& pads,
                                        const std::vector<int64_t>& dilations,
@@ -303,8 +305,9 @@ static void RunHTPConvOpPerChannelTest(const std::string& conv_op_type, const Te
   auto f32_fn = BuildF32ConvTestCase(conv_op_type, input_def, weights_def, bias_def, strides, pads, dilations,
                                      group, auto_pad);
   auto qdq_fn = BuildQDQPerChannelConvTestCase<ActivationQType, WeightQType>(conv_op_type, input_def, weights_def,
-                                                                             bias_def, strides, pads, dilations,
-                                                                             group, auto_pad, use_contrib_qdq);
+                                                                             bias_def, weight_quant_axis, strides,
+                                                                             pads, dilations, group, auto_pad,
+                                                                             use_contrib_qdq);
   TestQDQModelAccuracy(f32_fn, qdq_fn, provider_options, opset, expected_ep_assignment, tolerance);
 }
 
@@ -713,6 +716,7 @@ TEST_F(QnnHTPBackendTests, ConvU8S8S32_PerChannel) {
                                               input_def,
                                               weight_def,
                                               bias_def,
+                                              0,             // weight quant axis
                                               {1, 1},        // Strides
                                               {0, 0, 0, 0},  // Pads
                                               {1, 1},        // Dilations
@@ -723,6 +727,34 @@ TEST_F(QnnHTPBackendTests, ConvU8S8S32_PerChannel) {
                                               13);    // opset
 }
 
+// Test per-channel QDQ Conv is rejected with weight axis != 0
+TEST_F(QnnHTPBackendTests, Conv_PerChannel_UnsupportedAxis) {
+  std::vector<int64_t> input_shape = {1, 2, 4, 4};
+  std::vector<int64_t> weight_shape = {3, 2, 3, 3};
+  std::vector<int64_t> bias_shape = {3};
+
+  TestInputDef<float> input_def(input_shape, false,
+                                GetFloatDataInRange(-10.0f, 10.0f, TensorShape(input_shape).Size()));
+  TestInputDef<float> weight_def(weight_shape, true,
+                                 GetFloatDataInRange(-1.0f, 5.0f, TensorShape(weight_shape).Size()));
+  TestInputDef<float> bias_def(bias_shape, true,
+                               GetFloatDataInRange(-1.0f, 1.0f, TensorShape(bias_shape).Size()));
+
+  RunHTPConvOpPerChannelTest<uint8_t, int8_t>("Conv",
+                                              input_def,
+                                              weight_def,
+                                              bias_def,
+                                              2,             // weight quant axis
+                                              {1, 1},        // Strides
+                                              {0, 0, 0, 0},  // Pads
+                                              {1, 1},        // Dilations
+                                              1,             // default group
+                                              "NOTSET",
+                                              ExpectedEPNodeAssignment::None,
+                                              false,  // use_qdq_contrib_ops
+                                              13);    // opset
+}
+
 // Test per-channel QDQ Conv. in0: u8, in1 (weight): s8, in2 (bias): s32, out: u8
 // \QNN\HTP\HTP\src\hexagon\prepare\graph_prepare.cc:203:ERROR:could not create op: q::QNN_Conv3d_w_scale
 // \QNN\HTP\HTP\src\hexagon\prepare\graph_prepare.cc:1187:ERROR:Op 0x1a preparation failed with err:-1
@@ -748,6 +780,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_Conv3D_U8S8S32_PerChannel) {
                                               input_def,
                                               weight_def,
                                               bias_def,
+                                              0,                   // weight quant axis
                                               {1, 1, 1},           // Strides
                                               {0, 0, 0, 0, 0, 0},  // Pads
                                               {1, 1, 1},           // Dilations
@@ -776,6 +809,7 @@ TEST_F(QnnHTPBackendTests, ConvDepthwiseU8S8S32_PerChannel) {
                                               input_def,
                                               weight_def,
                                               bias_def,
+                                              0,             // weight quant axis
                                               {1, 1},        // Strides
                                               {0, 0, 0, 0},  // Pads
                                               {1, 1},        // Dilations
@@ -811,6 +845,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_Conv3D_U8S8S32_PerChannel2) {
                                               input_def,
                                               weight_def,
                                               bias_def,
+                                              0,                   // weight quant axis
                                               {1, 1, 1},           // Strides
                                               {0, 0, 0, 0, 0, 0},  // Pads
                                               {1, 1, 1},           // Dilations
@@ -838,6 +873,7 @@ TEST_F(QnnHTPBackendTests, ConvTransposeU8S8S32_PerChannel) {
                                               input_def,
                                               weight_def,
                                               bias_def,
+                                              1,             // weight quant axis
                                               {1, 1},        // Strides
                                               {0, 0, 0, 0},  // Pads
                                               {1, 1},        // Dilations
@@ -848,6 +884,34 @@ TEST_F(QnnHTPBackendTests, ConvTransposeU8S8S32_PerChannel) {
                                               13);    // opset
 }
 
+// Test per-channel QDQ ConvTranspose is unsupported with weight axis != 1.
+TEST_F(QnnHTPBackendTests, ConvTranspose_PerChannel_UnsupportedAxis) {
+  std::vector<int64_t> input_shape = {1, 2, 4, 4};
+  std::vector<int64_t> weight_shape = {2, 3, 3, 3};
+  std::vector<int64_t> bias_shape = {3};
+
+  TestInputDef<float> input_def(input_shape, false,
+                                GetFloatDataInRange(-10.0f, 10.0f, TensorShape(input_shape).Size()));
+  TestInputDef<float> weight_def(weight_shape, true,
+                                 GetFloatDataInRange(-1.0f, 5.0f, TensorShape(weight_shape).Size()));
+  TestInputDef<float> bias_def(bias_shape, true,
+                               GetFloatDataInRange(-1.0f, 1.0f, TensorShape(bias_shape).Size()));
+
+  RunHTPConvOpPerChannelTest<uint8_t, int8_t>("ConvTranspose",
+                                              input_def,
+                                              weight_def,
+                                              bias_def,
+                                              2,             // weight quant axis
+                                              {1, 1},        // Strides
+                                              {0, 0, 0, 0},  // Pads
+                                              {1, 1},        // Dilations
+                                              1,             // default group
+                                              "NOTSET",
+                                              ExpectedEPNodeAssignment::None,
+                                              false,  // use_qdq_contrib_ops
+                                              13);    // opset
+}
+
 // ConvTranspose3D per-channel
 // Disable it for 2.21 since it failed, re-enabled it for 2.22
 TEST_F(QnnHTPBackendTests, DISABLED_ConvTranspose3D_U8S8S32_PerChannel) {
@@ -866,6 +930,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_ConvTranspose3D_U8S8S32_PerChannel) {
                                               input_def,
                                               weight_def,
                                               bias_def,
+                                              1,                   // weight quant axis
                                               {1, 1, 1},           // Strides
                                               {0, 0, 0, 0, 0, 0},  // Pads
                                               {1, 1, 1},           // Dilations
@@ -893,6 +958,7 @@ TEST_F(QnnHTPBackendTests, ConvU16S8S32_PerChannel) {
                                                input_def,
                                                weight_def,
                                                bias_def,
+                                               0,             // weight quant axis
                                                {1, 1},        // Strides
                                                {0, 0, 0, 0},  // Pads
                                                {1, 1},        // Dilations
@@ -928,6 +994,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_Conv3D_U16S8S32_PerChannel) {
                                                input_def,
                                                weight_def,
                                                bias_def,
+                                               0,                   // weight quant axis
                                                {1, 1, 1},           // Strides
                                                {0, 0, 0, 0, 0, 0},  // Pads
                                                {1, 1, 1},           // Dilations
@@ -955,6 +1022,7 @@ TEST_F(QnnHTPBackendTests, ConvTransposeU16S8S32_PerChannel) {
                                                input_def,
                                                weight_def,
                                                bias_def,
+                                               1,             // weight quant axis
                                                {1, 1},        // Strides
                                                {0, 0, 0, 0},  // Pads
                                                {1, 1},        // Dilations
@@ -982,6 +1050,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_ConvTranspose3D_U16S8S32_PerChannel) {
                                                input_def,
                                                weight_def,
                                                bias_def,
+                                               1,                   // weight quant axis
                                                {1, 1, 1},           // Strides
                                                {0, 0, 0, 0, 0, 0},  // Pads
                                                {1, 1, 1},           // Dilations
@@ -1010,6 +1079,7 @@ TEST_F(QnnHTPBackendTests, ConvDepthwiseU16S8S32_PerChannel) {
                                                input_def,
                                                weight_def,
                                                bias_def,
+                                               0,             // weight quant axis
                                                {1, 1},        // Strides
                                                {0, 0, 0, 0},  // Pads
                                                {1, 1},        // Dilations
@@ -1045,6 +1115,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_Conv3D_U16S8S32_PerChannel2) {
                                                input_def,
                                                weight_def,
                                                bias_def,
+                                               0,                   // weight quant axis
                                                {1, 1, 1},           // Strides
                                                {0, 0, 0, 0, 0, 0},  // Pads
                                                {1, 1, 1},           // Dilations