[NNAPI EP] Enable per-channel quantization for QlinearConv (#6155)

* Enable qlinearconv per-channel quantization
* Fix the android CI test failure
* Add Android Version Check for Per-Channel Quant
* Address PR comments
* Fix some minor issues
* Add verification of per-channel zero points
* Make the error tolerance configurable
2026-06-23 02:38:28 +00:00 · 2020-12-18 16:13:22 -08:00 · 2020-12-18 16:13:22 -08:00 · bbb52e9274
commit bbb52e9274
parent 39aedbc97f
10 changed files with 410 additions and 79 deletions
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc
@ -7,6 +7,7 @@

 #include <core/common/safeint.h>
 #include <core/common/logging/logging.h>
+#include <core/framework/tensorprotoutils.h>
 #include <core/graph/graph.h>
 #include <core/graph/graph_viewer.h>
 #include <core/providers/common.h>
@ -64,6 +65,32 @@ QLinearOpType GetQLinearOpType(const onnxruntime::Node& node) {
  return QLinearOpType::Unknown;
 }

+ConvType GetConvType(const onnxruntime::Node& node, const InitializedTensorSet& initializers) {
+  const auto& op_type = node.OpType();
+  bool is_qlinear_conv = (op_type == "QLinearConv");
+  ORT_ENFORCE(op_type == "Conv" || is_qlinear_conv);
+
+  NodeAttrHelper helper(node);
+  const auto group = helper.Get("group", 1);
+
+  size_t w_idx = is_qlinear_conv ? 3 : 1;
+  const auto& weight = node.InputDefs()[w_idx]->Name();
+  const auto& weight_tensor = *initializers.at(weight);
+
+  // For ONNX we only have 1 conv ops
+  // For NNAPI we have 3
+  // Input is (N, C, H, W)
+  // group == 1,                                   --> regular conv
+  // group != 1 && weight is (M, 1, kH, kW),       --> depthwise conv
+  // group != 1 && weight is (M, C/group, kH, kW), --> grouped conv
+  if (group == 1)
+    return ConvType::Regular;
+  else if ((weight_tensor.dims()[1] == 1))
+    return ConvType::Depthwise;
+  else
+    return ConvType::Grouped;
+}
+
 bool IsQLinearBinaryOp(QLinearOpType qlinear_op_type) {
  return qlinear_op_type == QLinearOpType::QLinearConv ||
         qlinear_op_type == QLinearOpType::QLinearMatMul ||
@ -71,8 +98,9 @@ bool IsQLinearBinaryOp(QLinearOpType qlinear_op_type) {
 }

 bool HasValidBinaryOpQuantizedInputs(const Node& node) {
+  auto op_type = GetQLinearOpType(node);
  int32_t a_input_type, b_input_type;
-  if (!IsQLinearBinaryOp(GetQLinearOpType(node))) {
+  if (!IsQLinearBinaryOp(op_type)) {
    LOGS_DEFAULT(VERBOSE) << "[" << node.OpType() << "] is not a binary qlinear op";
    return false;
  }
@ -83,7 +111,16 @@ bool HasValidBinaryOpQuantizedInputs(const Node& node) {
  if (!GetType(*input_defs[3], b_input_type))
    return false;

-  if (a_input_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8 || a_input_type != b_input_type) {
+  // QlinearConv supports u8u8 or u8s8
+  // QLinearMatMul/Add only support u8u8
+  bool is_qlinear_conv = op_type == QLinearOpType::QLinearConv;
+  bool has_valid_qlinear_conv_weight =
+      (b_input_type == ONNX_NAMESPACE::TensorProto_DataType_UINT8 ||
+       b_input_type == ONNX_NAMESPACE::TensorProto_DataType_INT8);
+
+  if (a_input_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8 ||
+      (!is_qlinear_conv && a_input_type != b_input_type) ||
+      (is_qlinear_conv && !has_valid_qlinear_conv_weight)) {
    LOGS_DEFAULT(VERBOSE) << "[" << node.OpType()
                          << "] A Input type: [" << a_input_type
                          << "] B Input type: [" << b_input_type
@ -95,8 +132,9 @@ bool HasValidBinaryOpQuantizedInputs(const Node& node) {
 }

 bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const Node& node,
-                                const std::vector<size_t>& indices) {
-  const auto& op = node.OpType();
+                                const std::vector<size_t>& indices, const OpSupportCheckParams& params) {
+  const auto& op_type = node.OpType();
+  bool is_qlinear_conv = (op_type == "QLinearConv");
  const auto input_defs(node.InputDefs());
  for (const auto idx : indices) {
    if (idx >= input_defs.size()) {
@ -106,13 +144,42 @@ bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const
    }
    const auto scale_name = input_defs[idx]->Name();
    if (Contains(initializers, scale_name)) {
-      const auto& tensor = *initializers.at(scale_name);
-      if (!tensor.dims().empty() && tensor.dims()[0] != 1) {
-        LOGS_DEFAULT(VERBOSE) << op << " does not support per-channel quantization";
-        return false;
+      const auto& scale_tensor = *initializers.at(scale_name);
+      int64_t scales_dim = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0];
+      bool is_conv_weight = is_qlinear_conv && idx == 4;
+      bool is_conv_u8s8_weight = false;
+
+      if (is_conv_weight) {
+        const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
+        is_conv_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8;
+      }
+
+      // We need to check the per-channel quantization scales dimensions for u8s8 QlinearConv
+      // We only support per-channel quantization for u8s8
+      // For all other cases, the scales should be a scalar
+      if (is_conv_u8s8_weight) {
+        if (params.android_sdk_ver < 29) {
+          LOGS_DEFAULT(VERBOSE) << op_type << " only supports per-channel quantization on Android API 29+, "
+                                << "system API level: " << params.android_sdk_ver;
+          return false;
+        }
+
+        const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
+        if (weight_tensor.dims()[0] != scales_dim) {
+          LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight,"
+                                << " weight dimension[0] " << weight_tensor.dims()[0]
+                                << " scale dimension " << scales_dim;
+          return false;
+        }
+      } else {
+        if (scales_dim != 1) {
+          LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, "
+                                << " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+";
+          return false;
+        }
      }
    } else {
-      LOGS_DEFAULT(VERBOSE) << "The scale of " << op << " must be known";
+      LOGS_DEFAULT(VERBOSE) << "The scale of " << op_type << " must be known";
      return false;
    }
  }
@ -122,7 +189,8 @@ bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const

 bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, const Node& node,
                                    const std::vector<size_t>& indices) {
-  const auto& op = node.OpType();
+  const auto& op_type = node.OpType();
+  bool is_qlinear_conv = (op_type == "QLinearConv");
  const auto input_defs(node.InputDefs());
  for (const auto idx : indices) {
    if (idx >= input_defs.size()) {
@ -130,20 +198,63 @@ bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, co
                            << " >= input number, " << input_defs.size();
      return false;
    }
-    const auto zero_point_name = node.InputDefs()[idx]->Name();
+
+    const auto zero_point_name = input_defs[idx]->Name();
    if (Contains(initializers, zero_point_name)) {
-      const auto& tensor = *initializers.at(zero_point_name);
-      if (!tensor.dims().empty() && tensor.dims()[0] != 1) {
-        LOGS_DEFAULT(VERBOSE) << op << " does not support per-channel quantization";
-        return false;
+      bool is_conv_weight = is_qlinear_conv && idx == 5;
+      bool is_conv_u8s8_weight = false;
+      if (is_conv_weight) {
+        const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
+        is_conv_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8;
      }
-      if (tensor.data_type() != ONNX_NAMESPACE::TensorProto_DataType_UINT8) {
-        LOGS_DEFAULT(VERBOSE) << op << " does not support zero point data type "
-                              << std::to_string(tensor.data_type());
-        return false;
+
+      const auto& zero_tensor = *initializers.at(zero_point_name);
+      int64_t zero_dim = zero_tensor.dims().empty() ? 1 : zero_tensor.dims()[0];
+      if (is_conv_u8s8_weight) {
+        if (zero_tensor.data_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8) {
+          LOGS_DEFAULT(VERBOSE) << "u8s8 QlinearConv only supports int8 zero point for weight, "
+                                << "actual zero point type: [" << zero_tensor.data_type() << "]";
+          return false;
+        }
+
+        // In ONNX, the weight zero point of a u8s8 QLinearConv can be a scalar or a tensor
+        // with the same number of channels as the weight (dimension 0). For NNAPI we only
+        // support a zero point of 0 (scalar) or all 0s (tensor): NNAPI assumes the zero
+        // point for per-channel quantization is 0, and there is no input for it.
+        const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
+        if (weight_tensor.dims()[0] != zero_dim && zero_dim != 1) {
+          LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight,"
+                                << " weight dimension[0] " << weight_tensor.dims()[0]
+                                << " zero point dimension " << zero_dim;
+          return false;
+        }
+
+        std::unique_ptr<uint8_t[]> unpacked_tensor;
+        size_t tensor_byte_size;
+        auto status = onnxruntime::utils::UnpackInitializerData(zero_tensor, unpacked_tensor, tensor_byte_size);
+        if (!status.IsOK()) {
+          LOGS_DEFAULT(ERROR) << "QLinearConv erro when unpack zero tensor:" << status.ErrorMessage();
+          return false;
+        }
+
+        // Verify all onnx weight zero point(s) are 0(s)
+        const int8_t* zero_points = reinterpret_cast<const int8_t*>(unpacked_tensor.get());
+        for (size_t i = 0; i < tensor_byte_size; i++) {
+          if (zero_points[i] != 0) {
+            LOGS_DEFAULT(VERBOSE) << "QLinearConv only support 0 as zero point, "
+                                  << "zero_points[" << i << "] has value: " << zero_points[i];
+            return false;
+          }
+        }
+      } else {
+        if (zero_dim != 1) {
+          LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, "
+                                << " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+";
+          return false;
+        }
      }
    } else {
-      LOGS_DEFAULT(VERBOSE) << "The zero point of " << op << " must be known";
+      LOGS_DEFAULT(VERBOSE) << "The zero point of " << op_type << " must be known";
      return false;
    }
  }
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h
@ -87,8 +87,18 @@ enum class QLinearOpType : uint8_t {
  // QLinearReduceMean,
 };

+enum class ConvType : uint8_t {
+  Regular,
+  Depthwise,
+  Grouped,
+};
+
 QLinearOpType GetQLinearOpType(const onnxruntime::Node& node);

+// Return the type of the conv ops,
+// This function assumes the input is a 2d conv node
+ConvType GetConvType(const onnxruntime::Node& node, const InitializedTensorSet& initializers);
+
 // This qlinear op is an operator takes 2 input and produces 1 output
 // Such as QLinearConv, QLinearMatMul, QLinearAdd, ...
 bool IsQLinearBinaryOp(QLinearOpType qlinear_op_type);
@ -97,7 +107,7 @@ bool IsQLinearBinaryOp(QLinearOpType qlinear_op_type);
 bool HasValidBinaryOpQuantizedInputs(const Node& node);
 // Check if a qlinear op has valid scales for given indices
 bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const Node& node,
-                                const std::vector<size_t>& indices);
+                                const std::vector<size_t>& indices, const OpSupportCheckParams& params);
 // Check if a qlinear op has valid zero points for given indices
 bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, const Node& node,
                                    const std::vector<size_t>& indices);
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc
@ -26,7 +26,7 @@ int32_t ModelBuilder::GetAndroidSdkVer() const {
 // Scalar operand is copied into the model, no need to persist
 #define DEFINE_ADD_OPERAND_FROM_SCALAR(scalar_type, op_type)                      \
  Status ModelBuilder::AddOperandFromScalar(scalar_type value, uint32_t& index) { \
-    OperandType operandType(Type::op_type);                                       \
+    OperandType operandType(Type::op_type, vector<uint32_t>{});                   \
    ORT_RETURN_IF_ERROR(AddNewNNAPIOperand(operandType, index));                  \
    RETURN_STATUS_ON_ERROR_WITH_NOTE(                                             \
        nnapi_->ANeuralNetworksModel_setOperandValue(                             \
@ -377,6 +377,18 @@ Status ModelBuilder::AddNewNNAPIOperand(const OperandType& operand_type, uint32_
  RETURN_STATUS_ON_ERROR(
      nnapi_->ANeuralNetworksModel_addOperand(nnapi_model_->model_, &operand_type.operandType));
  index = next_index_++;
+
+  if (operand_type.channelQuant) {
+    if (GetAndroidSdkVer() < 29) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                             "Per-channel quantization is only supported on Android API level 29+,",
+                             " system API level: ", GetAndroidSdkVer());
+    }
+
+    RETURN_STATUS_ON_ERROR(nnapi_->ANeuralNetworksModel_setOperandSymmPerChannelQuantParams(
+        nnapi_model_->model_, index, &operand_type.channelQuant->params));
+  }
+
  return Status::OK();
 }

--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc
@ -458,6 +458,7 @@ static Status HandleAutoPad(const Shape& input_shape,
                            vector<int32_t>& onnx_pads,
                            int32_t& nnapi_padding_code,
                            bool& use_auto_pad) {
+  use_auto_pad = false;
  if (auto_pad_type != AutoPadType::NOTSET) {
    ORT_RETURN_IF_ERROR(ComputeConvPads(input_shape, weight_size_y, weight_size_x,
                                        onnx_pads, onnx_strides, onnx_dilations,
@ -524,6 +525,47 @@ static Status GetBinaryOpQuantizationScaleAndZeroPoint(
  return Status::OK();
 }

+static Status GetConvOpQuantizationScaleAndZeroPoint(
+    const ModelBuilder& model_builder, const Node& node,
+    float& a_scale, float& w_scale, float& y_scale,
+    int32_t& a_zero_point, int32_t& w_zero_point, int32_t& y_zero_point,
+    optional<vector<float>>& w_scales) ORT_MUST_USE_RESULT;
+static Status GetConvOpQuantizationScaleAndZeroPoint(
+    const ModelBuilder& model_builder, const Node& node,
+    float& a_scale, float& w_scale, float& y_scale,
+    int32_t& a_zero_point, int32_t& w_zero_point, int32_t& y_zero_point,
+    optional<vector<float>>& w_scales) {
+  // Get scale and zero points
+  // We will handle per-channel weight scale and zero point later
+  ORT_RETURN_IF_ERROR(
+      GetBinaryOpQuantizationScaleAndZeroPoint(model_builder, node,
+                                               a_scale, w_scale, y_scale,
+                                               a_zero_point, w_zero_point, y_zero_point));
+
+  const auto input_defs = node.InputDefs();
+  const auto& initializers(model_builder.GetInitializerTensors());
+  const auto& weight_tensor = *initializers.at(input_defs[3]->Name());
+
+  // We are done here if this is u8u8 QLinearConv
+  if (weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_UINT8)
+    return Status::OK();
+
+  // Now we have u8s8 QLinearConv
+  // The weight zero point of u8s8 QLinearConv is always 0, so we do not read it here,
+  // and w_scale is not used either, so we reset both back to 0
+  w_scale = 0.0f;
+  w_zero_point = 0;
+
+  // We need to copy the 1d scales array for per-channel quantization
+  const auto& scale_tensor = *initializers.at(input_defs[4]->Name());
+  const auto* scales = GetTensorFloatData(scale_tensor);
+  size_t scales_size = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0];
+  vector<float> scales_vec(scales_size, 0.0f);
+  memcpy(scales_vec.data(), scales, sizeof(float) * scales_size);
+  w_scales = onnxruntime::make_optional(std::move(scales_vec));
+  return Status::OK();
+}
+
 // NNAPI has the quantization scale and zero point embedded in the ANeuralNetworksOperandType
 // ONNX has the quantization scale and zero point as the inputs of the qlinear operators
 // We want to verify the scale and zeropoint of the ONNX inputs matches the values embedded in the NNAPI inputs
@ -553,6 +595,35 @@ static Status IsValidInputQuantizedType(const ModelBuilder& model_builder,
  return Status::OK();
 }

+static Status IsValidConvWeightQuantizedType(const ModelBuilder& model_builder,
+                                             const std::string& input_name,
+                                             float scale,
+                                             int32_t zero_point,
+                                             const optional<vector<float>>& scales) ORT_MUST_USE_RESULT;
+static Status IsValidConvWeightQuantizedType(const ModelBuilder& model_builder,
+                                             const std::string& input_name,
+                                             float scale,
+                                             int32_t zero_point,
+                                             const optional<vector<float>>& scales) {
+  // First verify the weight as if it had no per-channel quantization
+  ORT_RETURN_IF_ERROR(IsValidInputQuantizedType(model_builder, input_name, scale, zero_point));
+
+  if (scales) {
+    const OperandType& input_operand_type = model_builder.GetOperandTypes().at(input_name);
+    if (!input_operand_type.channelQuant) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                             "Input [", input_name, "] has no channelQuant");
+    }
+
+    if (input_operand_type.channelQuant.value().scales != scales.value()) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                             "Input [", input_name, "] has mismatch scales between onnx and NNAPI");
+    }
+  }
+
+  return Status::OK();
+}
+
 static void AddBinaryOpQuantizationScaleAndZeroPointToSkip(ModelBuilder& model_builder, const Node& node) {
  const auto input_defs(node.InputDefs());
  model_builder.AddInitializerToSkip(input_defs[1]->Name());  // a_scale
@ -1253,6 +1324,13 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
    }
  }

+  const auto& weight = input_defs[w_idx]->Name();
+  const auto& weight_tensor = *initializers.at(weight);
+  auto conv_type = GetConvType(node, model_builder.GetGraphViewer().GetAllInitializedTensors());
+  bool conv_2d = (conv_type == ConvType::Regular),
+       depthwise_conv_2d = (conv_type == ConvType::Depthwise),
+       grouped_conv_2d = (conv_type == ConvType::Grouped);
+
  float x_scale = 0.0f,
        w_scale = 0.0f,
        y_scale = 0.0f;
@ -1260,31 +1338,16 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
          w_zero_point = 0,
          y_zero_point = 0;

+  // this is for per-channel quantization weights
+  optional<vector<float>> w_scales;
+
  if (is_qlinear_conv) {
-    ORT_RETURN_IF_ERROR(GetBinaryOpQuantizationScaleAndZeroPoint(model_builder, node,
-                                                                 x_scale, w_scale, y_scale,
-                                                                 x_zero_point, w_zero_point, y_zero_point));
+    ORT_RETURN_IF_ERROR(GetConvOpQuantizationScaleAndZeroPoint(model_builder, node,
+                                                               x_scale, w_scale, y_scale,
+                                                               x_zero_point, w_zero_point, y_zero_point,
+                                                               w_scales));
  }

-  const auto& weight = input_defs[w_idx]->Name();
-  const auto& weight_tensor = *initializers.at(weight);
-  bool conv_2d = false,
-       depthwise_conv_2d = false,
-       grouped_conv_2d = false;
-
-  // For ONNX we only have 1 conv ops
-  // For NNAPI we have 3
-  // Input is (N, C, H, W)
-  // group == 1,                                   --> regular conv
-  // group != 1 && weight is (M, 1, kH, kW),       --> depthwise conv
-  // group != 1 && weight is (M, C/group, kH, kW), --> grouped conv
-  if (group == 1)
-    conv_2d = true;
-  else if ((weight_tensor.dims()[1] == 1))
-    depthwise_conv_2d = true;
-  else
-    grouped_conv_2d = true;
-
  Shape onnx_weight_shape;
  for (auto dim : weight_tensor.dims())
    onnx_weight_shape.push_back(SafeInt<uint32_t>(dim));
@ -1297,12 +1360,22 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
    case ONNX_NAMESPACE::TensorProto_DataType_UINT8:
      onnx_weight_type = Type::TENSOR_QUANT8_ASYMM;
      break;
+    case ONNX_NAMESPACE::TensorProto_DataType_INT8:
+      onnx_weight_type = Type::TENSOR_QUANT8_SYMM_PER_CHANNEL;
+      break;
    default:
      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
                             "The initializer of graph ", weight, " doesn't have valid type: ", weight_tensor.data_type());
  }

-  OperandType onnx_weight_operand_type(onnx_weight_type, onnx_weight_shape, w_scale, w_zero_point);
+  // Get weight operand type
+  // Per-channel quantized weight is handled differently
+  OperandType onnx_weight_operand_type =
+      (is_qlinear_conv && w_scales.has_value())
+          ? OperandType{onnx_weight_type, onnx_weight_shape,
+                        SymmPerChannelQuantParams{w_scales.value(),
+                                                  depthwise_conv_2d ? 3u : 0u}}  // channelDim is 3 for depthwise-conv
+          : OperandType{onnx_weight_type, onnx_weight_shape, w_scale, w_zero_point};

  // Pre-process weights
  if (conv_2d || grouped_conv_2d) {
@ -1314,7 +1387,7 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
  if (is_qlinear_conv) {
    // Verify if the scale and zero point matchs from onnx input/weight and nnapi input/weight
    ORT_RETURN_IF_ERROR(IsValidInputQuantizedType(model_builder, input, x_scale, x_zero_point));
-    ORT_RETURN_IF_ERROR(IsValidInputQuantizedType(model_builder, weight, w_scale, w_zero_point));
+    ORT_RETURN_IF_ERROR(IsValidConvWeightQuantizedType(model_builder, weight, w_scale, w_zero_point, w_scales));
  }

  bool hasBias = (input_defs.size() > b_idx);
@ -1332,14 +1405,15 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
      vector<float> buffer(bias_dimen[0], 0.0f);
      OperandType bias_operand_type(Type::TENSOR_FLOAT32, bias_dimen, x_scale * w_scale);
      ORT_RETURN_IF_ERROR(model_builder.AddOperandFromPersistMemoryBuffer(bias, buffer.data(), bias_operand_type));
-    } else if (weight_type == Type::TENSOR_QUANT8_ASYMM) {
+    } else if (weight_type == Type::TENSOR_QUANT8_ASYMM || weight_type == Type::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
      vector<int32_t> buffer(bias_dimen[0], 0);
      OperandType bias_operand_type(Type::TENSOR_INT32, bias_dimen, x_scale * w_scale);
      ORT_RETURN_IF_ERROR(model_builder.AddOperandFromPersistMemoryBuffer(bias, buffer.data(), bias_operand_type));
    } else {
      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unknown weight type ", TypeToStr(weight_type));
    }
-  } else if (is_qlinear_conv) {  // QLinearConv's bias type need special handling
+  } else if (is_qlinear_conv) {
+    // QLinearConv's bias type needs special handling to add scale for quantization input
    const auto& bias_tensor = *model_builder.GetInitializerTensors().at(bias);
    ORT_RETURN_IF_NOT(bias_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT32,
                      "bias of QLinearConv should be int32, actual type: ", bias_tensor.data_type());
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc
@ -228,7 +228,7 @@ bool BinaryOpSupportChecker::HasSupportedInputsImpl(const Node& node) const {
 }

 bool BinaryOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node,
-                                               const OpSupportCheckParams& /* params */) const {
+                                               const OpSupportCheckParams& params) const {
  const auto& op_type(node.OpType());
  const auto input_defs(node.InputDefs());
  bool op_is_qlinear = op_type == "QLinearAdd";
@ -265,7 +265,7 @@ bool BinaryOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initi

    // All scale/zero points are initializer scalars
    // a/b/y_scale
-    if (!HasValidQuantizationScales(initializers, node, {1, 4, 6}))
+    if (!HasValidQuantizationScales(initializers, node, {1, 4, 6}, params))
      return false;

    // a/b/y_zero_point
@ -599,7 +599,7 @@ bool ConvOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial
    }

    // a/b/y_scale
-    if (!HasValidQuantizationScales(initializers, node, {1, 4, 6}))
+    if (!HasValidQuantizationScales(initializers, node, {1, 4, 6}, params))
      return false;

    // a/b/y_zero_point
@ -860,7 +860,7 @@ bool GemmOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial

      // All scale/zero points are initializer scalars
      // a/b/y_scale
-      if (!HasValidQuantizationScales(initializers, node, {1, 4, 6}))
+      if (!HasValidQuantizationScales(initializers, node, {1, 4, 6}, params))
        return false;

      // a/b/y_zero_point
@ -1003,7 +1003,7 @@ class QuantizeLinearOpSupportChecker : public BaseOpSupportChecker {
 };

 bool QuantizeLinearOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node,
-                                                       const OpSupportCheckParams& /* params */) const {
+                                                       const OpSupportCheckParams& params) const {
  const auto input_defs(node.InputDefs());
  const auto output_defs(node.OutputDefs());

@ -1018,7 +1018,7 @@ bool QuantizeLinearOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSe
    return false;
  }

-  if (!HasValidQuantizationScales(initializers, node, {1}))
+  if (!HasValidQuantizationScales(initializers, node, {1}, params))
    return false;

  if (input_defs.size() == 3) {  // has zero_point input
@ -1045,9 +1045,9 @@ class DequantizeLinearOpSupportChecker : public BaseOpSupportChecker {
 };

 bool DequantizeLinearOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node,
-                                                         const OpSupportCheckParams& /* params */) const {
+                                                         const OpSupportCheckParams& params) const {
  const auto input_defs(node.InputDefs());
-  if (!HasValidQuantizationScales(initializers, node, {1}))
+  if (!HasValidQuantizationScales(initializers, node, {1}, params))
    return false;

  if (input_defs.size() == 3) {  // has zero_point input
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_lib/NeuralNetworksWrapper.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_lib/NeuralNetworksWrapper.cc
@ -32,10 +32,22 @@ OperandType::OperandType(Type type, const std::vector<uint32_t>& d, float scale,
  };
 }

-OperandType::OperandType(const OperandType& other) {
-  type = other.type;
-  dimensions = other.dimensions;
-  operandType = other.operandType;
+OperandType::OperandType(Type type, const std::vector<uint32_t>& d, SymmPerChannelQuantParams&& channelQuant)
+    : type(type), dimensions(d), channelQuant(std::move(channelQuant)) {
+  operandType = {
+      .type = static_cast<int32_t>(type),
+      .dimensionCount = static_cast<uint32_t>(dimensions.size()),
+      .dimensions = dimensions.size() > 0 ? dimensions.data() : nullptr,
+      .scale = 0.0f,
+      .zeroPoint = 0,
+  };
+}
+
+OperandType::OperandType(const OperandType& other)
+    : operandType(other.operandType),
+      type(other.type),
+      dimensions(other.dimensions),
+      channelQuant(other.channelQuant) {
  operandType.dimensions = dimensions.size() > 0 ? dimensions.data() : nullptr;
 }

@ -44,6 +56,7 @@ OperandType& OperandType::operator=(const OperandType& other) {
    type = other.type;
    dimensions = other.dimensions;
    operandType = other.operandType;
+    channelQuant = other.channelQuant;
    operandType.dimensions = dimensions.size() > 0 ? dimensions.data() : nullptr;
  }

--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_lib/NeuralNetworksWrapper.h
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_lib/NeuralNetworksWrapper.h
@ -21,6 +21,9 @@

 #include "NeuralNetworksTypes.h"

+// Move to std::optional when we switch to c++ 17
+#include "core/common/optional.h"
+
 template <typename T>
 T Product(const std::vector<T>& v) {
  return static_cast<T>(
@ -99,12 +102,40 @@ inline std::string TypeToStr(const Type& type) {
  }
 }

+struct SymmPerChannelQuantParams {
+  ANeuralNetworksSymmPerChannelQuantParams params;
+  std::vector<float> scales;
+  SymmPerChannelQuantParams(std::vector<float> scalesVec, uint32_t channelDim)
+      : scales(std::move(scalesVec)) {
+    params = {
+        .channelDim = channelDim,
+        .scaleCount = static_cast<uint32_t>(scales.size()),
+        .scales = scales.size() > 0 ? scales.data() : nullptr,
+    };
+  }
+  SymmPerChannelQuantParams(const SymmPerChannelQuantParams& other)
+      : params(other.params), scales(other.scales) {
+    params.scales = scales.size() > 0 ? scales.data() : nullptr;
+  }
+  SymmPerChannelQuantParams& operator=(const SymmPerChannelQuantParams& other) {
+    if (this != &other) {
+      params = other.params;
+      scales = other.scales;
+      params.scales = scales.size() > 0 ? scales.data() : nullptr;
+    }
+    return *this;
+  }
+};
+
 struct OperandType {
  ANeuralNetworksOperandType operandType;
  Type type;
  std::vector<uint32_t> dimensions;
+  onnxruntime::optional<SymmPerChannelQuantParams> channelQuant;
+
+  explicit OperandType(Type type, const std::vector<uint32_t>& d, float scale = 0.0f, int32_t zeroPoint = 0);
+  explicit OperandType(Type type, const std::vector<uint32_t>& d, SymmPerChannelQuantParams&& channelQuant);

-  explicit OperandType(Type type, const std::vector<uint32_t>& d = {}, float scale = 0.0f, int32_t zeroPoint = 0);
  OperandType(const OperandType& other);
  OperandType& operator=(const OperandType& other);

--- a/onnxruntime/test/providers/cpu/nn/qlinearconv_op_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/qlinearconv_op_test.cc
@ -389,7 +389,9 @@ class QLinearConvOpTester {
    Y_shape.push_back(output_channels);
    for (size_t n = 0; n < kernel_rank; n++) {
      Y_shape.push_back(((input_shape[n] + pads[n] + pads[kernel_rank + n]) -
-                        (dilations[n] * (kernel_shape[n] - 1) + 1)) / strides[n] + 1);
+                         (dilations[n] * (kernel_shape[n] - 1) + 1)) /
+                            strides[n] +
+                        1);
    }
    const int64_t* output_shape = Y_shape.data() + 2;
    Y_data.resize(ShapeSize(Y_shape));
@ -464,22 +466,38 @@ class QLinearConvOpTester {

    test.AddInput<T1>("x", X_.shape_, X_.data_);
    test.AddInput<float>("x_scale", {}, X_.scale_, all_input_initializer_except_x);
-    test.AddInput<T1>("x_zero_point", {}, {X_.zero_point_});
+    test.AddInput<T1>("x_zero_point", {}, {X_.zero_point_}, all_input_initializer_except_x);

    const std::vector<int64_t> W_scale_shape{static_cast<int64_t>(W_.scale_.size())};
    test.AddInput<T2>("w", W_.shape_, W_.data_, all_input_initializer_except_x);
    test.AddInput<float>("w_scale", W_scale_shape, W_.scale_, all_input_initializer_except_x);
-    test.AddInput<T2>("w_zero_point", {}, {W_.zero_point_});
+    test.AddInput<T2>("w_zero_point", {}, {W_.zero_point_}, all_input_initializer_except_x);

    test.AddInput<float>("y_scale", {}, {output_scale_}, all_input_initializer_except_x);
-    test.AddInput<T1>("y_zero_point", {}, {output_zero_point_});
+    test.AddInput<T1>("y_zero_point", {}, {output_zero_point_}, all_input_initializer_except_x);

    if (!B_.empty()) {
      const std::vector<int64_t> B_shape{static_cast<int64_t>(B_.size())};
-      test.AddInput<int32_t>("b", B_shape, B_);
+      test.AddInput<int32_t>("b", B_shape, B_, all_input_initializer_except_x);
    }

-    test.AddOutput<uint8_t>("y", Y_shape, Y_data);
+    float abs_error = 0.0f;
+
+    // For quantized models, NNAPI's rounding differs from the CPU provider's,
+    // so the NNAPI result is sometimes within +/-1 of the CPU provider's result.
+    // ONNX uses round-to-nearest, ties-to-even.
+    // NNAPI uses std::round, which is HALF_AWAY_FROM_ZERO, see
+    // https://android.googlesource.com/platform/frameworks/ml/+/refs/heads/master/nn/common/operations/Quantize.cpp
+    // Use 1 as abs_error, which is the smallest possible tolerance for uint8_t.
+    //
+    // NOTE: for now the tolerance only applies if NNAPI is actually used;
+    // if for any reason the execution falls back to CPU, we still expect an exact match.
+    // See 'void Check<uint8_t>(...' in onnxruntime/test/providers/provider_test_utils.cc
+#ifdef USE_NNAPI
+    abs_error = 1.0f;
+#endif
+
+    test.AddOutput<uint8_t>("y", Y_shape, Y_data, false /* sort_output */, 0.0f /* rel_error */, abs_error);

    if (!pads_.empty()) {
      test.AddAttribute("pads", pads_);
--- a/onnxruntime/test/providers/provider_test_utils.cc
+++ b/onnxruntime/test/providers/provider_test_utils.cc
@ -68,6 +68,53 @@ void Check(const OpTester::Data& expected_data, const Tensor& output_tensor,
  }
 }

+// Check specialization for uint8_t outputs.
+//
+// Compares every element of output_tensor with the expected tensor stored in
+// expected_data and reports mismatches through gtest EXPECT_* macros, tagging
+// each failure with the element index and provider_type.
+//
+// An exact match is required unless provider_type is the NNAPI EP and
+// expected_data carries an absolute and/or relative error; in that case
+// EXPECT_NEAR is used. When both errors are set, only the relative error is applied.
+template <>
+void Check<uint8_t>(const OpTester::Data& expected_data,
+                    const Tensor& output_tensor,
+                    const std::string& provider_type) {
+  auto& expected_tensor = expected_data.data_.Get<Tensor>();
+  auto* expected = expected_tensor.template Data<uint8_t>();
+  auto* output = output_tensor.template Data<uint8_t>();
+  auto size = output_tensor.Shape().Size();
+
+  // Index with the same type as the element count, so the loop counter cannot
+  // overflow when the count does not fit in an int.
+  using Index = decltype(size);
+
+  const bool has_abs_err = expected_data.absolute_error_.has_value();
+  const bool has_rel_err = expected_data.relative_error_.has_value();
+
+  if (expected_data.sort_output_) {
+    // if order can be jumbled in the output of an operator, sort both the
+    // expected and output buffers prior to comparison.
+    // This is a "best-effort" algo and should satisfy the requirement for the
+    // few ops that do require this support without investing in a more
+    // sophisticated infrastructure for the same
+    sort_expected_and_actual_buffers<uint8_t>(expected, output, size);
+  }
+
+  // For uint8_t results, only the NNAPI EP is allowed an error tolerance, because
+  // NNAPI rounds quantized results differently from the CPU provider.
+  // For any other EP, we still expect an exact match for the results.
+  if (provider_type == kNnapiExecutionProvider && (has_abs_err || has_rel_err)) {
+    const double threshold = has_abs_err
+                                 ? expected_data.absolute_error_.value()
+                                 : 0.0;
+
+    for (Index i = 0; i < size; ++i) {
+      if (has_rel_err) {
+        EXPECT_NEAR(expected[i], output[i],
+                    expected_data.relative_error_.value() * expected[i])  // expected[i] is unsigned, can't be negative
+            << "i:" << i << ", provider_type: " << provider_type;
+      } else {  // has_abs_err
+        EXPECT_NEAR(expected[i], output[i], threshold)
+            << "i:" << i << ", provider_type: " << provider_type;
+      }
+    }
+  } else {
+    for (Index i = 0; i < size; ++i) {
+      EXPECT_EQ(expected[i], output[i]) << "i:" << i
+                                        << ", provider_type: " << provider_type;
+    }
+  }
+}
+
 template <>
 void Check<double>(const OpTester::Data& expected_data,
                   const Tensor& output_tensor,
@ -747,8 +794,7 @@ void OpTester::Run(
        kAclExecutionProvider,
        kArmNNExecutionProvider,
        kNnapiExecutionProvider,
-        kRocmExecutionProvider
-    };
+        kRocmExecutionProvider};

    bool has_run = false;

@ -844,8 +890,7 @@ void OpTester::Run(
              }
            }

-            if (!valid)
-            {
+            if (!valid) {
              std::cerr << "No kernel registered from EP: " << provider_type << "for node: " << node.OpType() << std::endl;
              break;
            }
--- a/onnxruntime/test/providers/provider_test_utils.h
+++ b/onnxruntime/test/providers/provider_test_utils.h
@ -324,20 +324,24 @@ class OpTester {

  template <typename T>
  void AddOutput(const char* name, const std::vector<int64_t>& dims, const std::initializer_list<T>& expected_values,
-                 bool sort_output = false) {
-    AddData(output_data_, name, dims, expected_values.begin(), expected_values.size(), false, sort_output);
+                 bool sort_output = false, float rel_error = 0.0f, float abs_error = 0.0f) {
+    AddData(output_data_, name, dims, expected_values.begin(), expected_values.size(), false,
+            sort_output, nullptr /* dim_params */, rel_error, abs_error);
  }

  // This function doesn't work for vector<bool> because const vector<bool> cannot invoke its data().
  template <typename T>
  void AddOutput(const char* name, const std::vector<int64_t>& dims, const std::vector<T>& expected_values,
-                 bool sort_output = false) {
-    AddData(output_data_, name, dims, expected_values.data(), expected_values.size(), false, sort_output);
+                 bool sort_output = false, float rel_error = 0.0f, float abs_error = 0.0f) {
+    AddData(output_data_, name, dims, expected_values.data(), expected_values.size(), false,
+            sort_output, nullptr /* dim_params */, rel_error, abs_error);
  }

  template <typename T>
-  void AddOutput(const char* name, const std::vector<int64_t>& dims, const T* p_values, const size_t size) {
-    AddData(output_data_, name, dims, p_values, size);
+  void AddOutput(const char* name, const std::vector<int64_t>& dims, const T* p_values, const size_t size,
+                 bool sort_output = false, float rel_error = 0.0f, float abs_error = 0.0f) {
+    AddData(output_data_, name, dims, p_values, size, false,
+            sort_output, nullptr /* dim_params */, rel_error, abs_error);
  }

  template <typename T>
@ -521,7 +525,8 @@ class OpTester {
  template <typename T>
  void AddData(std::vector<Data>& data, const char* name, const std::vector<int64_t>& dims, const T* values,
               int64_t values_count, bool is_initializer = false, bool sort_output = false,
-               const std::vector<std::string>* dim_params = nullptr) {
+               const std::vector<std::string>* dim_params = nullptr,
+               float rel_error = 0.0f, float abs_error = 0.0f) {
    ORT_TRY {
      TensorShape shape{dims};
      ORT_ENFORCE(shape.Size() == values_count, values_count, " input values doesn't match tensor size of ",
@ -565,7 +570,19 @@ class OpTester {
        }
        node_arg.SetShape(new_shape);
      }
-      data.push_back(Data(std::move(node_arg), std::move(value), optional<float>(), optional<float>(), sort_output));
+
+      optional<float> rel;
+      optional<float> abs;
+
+      if (rel_error != 0.0f) {
+        rel = rel_error;
+      }
+
+      if (abs_error != 0.0f) {
+        abs = abs_error;
+      }
+
+      data.push_back(Data(std::move(node_arg), std::move(value), std::move(rel), std::move(abs), sort_output));
      if (is_initializer) initializer_index_.push_back(data.size() - 1);
    }
    ORT_CATCH(const std::exception& ex) {