[NNAPI QDQ] Add QDQAdd/Mul, update to NNAPI QDQ handling, update some test settings (#10483)

* Squashed commit of the following: commit 12380491a9 Author: Guoyu Wang <wanggy@outlook.com> Date: Mon Feb 7 12:59:04 2022 -0800 Add qdq mul support commit 9cadda7f2c Merge: 7a32847761 0f5d0a091a Author: Guoyu Wang <wanggy@outlook.com> Date: Mon Feb 7 11:24:47 2022 -0800 Merge remote-tracking branch 'origin/master' into gwang-msft/qdq_mul commit 7a32847761 Author: Guoyu Wang <wanggy@outlook.com> Date: Mon Feb 7 00:41:30 2022 -0800 move test case to util commit c1a8f0d81e Author: Guoyu Wang <wanggy@outlook.com> Date: Fri Feb 4 13:04:26 2022 -0800 update input/output check commit a6f0a0d504 Author: Guoyu Wang <wanggy@outlook.com> Date: Thu Feb 3 18:37:21 2022 -0800 update quantized io check functions commit 87f4d1dcfe Merge: 7849f07109 97b8f6f394 Author: Guoyu Wang <wanggy@outlook.com> Date: Wed Feb 2 17:22:58 2022 -0800 Merge remote-tracking branch 'origin/master' into gwang-msft/qdq_mul commit 7849f07109 Author: Guoyu Wang <wanggy@outlook.com> Date: Wed Feb 2 17:22:55 2022 -0800 minor update commit 7196cdf419 Author: Guoyu Wang <wanggy@outlook.com> Date: Wed Feb 2 10:50:10 2022 -0800 init change commit 84c00772a1 Merge: a8c7dce22f 7318361645 Author: Guoyu Wang <wanggy@outlook.com> Date: Tue Feb 1 18:21:17 2022 -0800 Merge remote-tracking branch 'origin/master' into gwang-msft/qdq_mul commit a8c7dce22f Merge: 55e536c182 ef7b4dc05c Author: Guoyu Wang <wanggy@outlook.com> Date: Tue Feb 1 13:51:04 2022 -0800 Merge remote-tracking branch 'origin/master' into gwang-msft/qdq_mul commit 55e536c182 Author: Guoyu Wang <wanggy@outlook.com> Date: Tue Feb 1 11:44:34 2022 -0800 address cr comments commit d460f5b776 Author: Guoyu Wang <wanggy@outlook.com> Date: Tue Feb 1 00:33:54 2022 -0800 fix android UT failure commit 52146cf06f Author: Guoyu Wang <wanggy@outlook.com> Date: Mon Jan 31 16:01:13 2022 -0800 fix build break commit ec6d07df8b Author: Guoyu Wang <wanggy@outlook.com> Date: Mon Jan 31 15:41:52 2022 -0800 minor update to UT commit 8ec8490b4f Author: Guoyu Wang 
<wanggy@outlook.com> Date: Mon Jan 31 15:01:30 2022 -0800 Add NNAPI support of QDQ Resize * Update qdq add/mul test case, fix build break * Address CR comments * Add QLinearMul support * remove unused params * Address CR comments
2026-05-23 22:13:38 +00:00 · 2022-02-08 20:44:15 -08:00 · 2022-02-08 20:44:15 -08:00 · e4dc4e4d3c
commit e4dc4e4d3c
parent 655f490c95
11 changed files with 712 additions and 747 deletions
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc
@ -61,6 +61,8 @@ QuantizedOpType GetQuantizedOpType(const NodeUnit& node_unit) {
      return QuantizedOpType::QLinearMatMul;
    else if (op_type == "QLinearAdd")
      return QuantizedOpType::QLinearAdd;
+    else if (op_type == "QLinearMul")
+      return QuantizedOpType::QLinearMul;
    else if (op_type == "QLinearSigmoid")
      return QuantizedOpType::QLinearSigmoid;
    else if (op_type == "QLinearAveragePool")
@ -72,6 +74,10 @@ QuantizedOpType GetQuantizedOpType(const NodeUnit& node_unit) {
      return QuantizedOpType::QDQResize;
    else if (op_type == "AveragePool")
      return QuantizedOpType::QDQAveragePool;
+    else if (op_type == "Add")
+      return QuantizedOpType::QDQAdd;
+    else if (op_type == "Mul")
+      return QuantizedOpType::QDQMul;
  } else {
    // throw?
    // Do we want to throw here? seems got neglected last time
@ -114,25 +120,13 @@ bool IsQuantizedPool(QuantizedOpType quant_op_type) {
 bool IsQuantizedBinaryOp(QuantizedOpType quant_op_type) {
  return quant_op_type == QuantizedOpType::QLinearMatMul ||
         quant_op_type == QuantizedOpType::QLinearAdd ||
+         quant_op_type == QuantizedOpType::QLinearMul ||
+         quant_op_type == QuantizedOpType::QDQAdd ||
+         quant_op_type == QuantizedOpType::QDQMul ||
         IsQuantizedConv(quant_op_type);
 }

-bool HasValidUnaryOpQuantizedInputs(const NodeUnit& node_unit) {
-  int32_t input_type;
-  if (!GetType(node_unit.Inputs()[0].node_arg, input_type))
-    return false;
-
-  if (input_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) {
-    LOGS_DEFAULT(VERBOSE) << "[" << node_unit.OpType()
-                          << "] Input type: [" << input_type
-                          << "] is not supported for now";
-    return false;
-  }
-
-  return true;
-}
-
-bool HasValidBinaryOpQuantizedInputs(const NodeUnit& node_unit) {
+bool HasValidBinaryOpQuantizedInputTypes(const NodeUnit& node_unit) {
  auto quant_op_type = GetQuantizedOpType(node_unit);
  int32_t a_input_type, b_input_type;
  if (!IsQuantizedBinaryOp(quant_op_type)) {
@ -146,16 +140,17 @@ bool HasValidBinaryOpQuantizedInputs(const NodeUnit& node_unit) {
  if (!GetType(inputs[1].node_arg, b_input_type))
    return false;

-  // QlinearConv supports u8u8 or u8s8
-  // QLinearMatMul/Add only support u8u8
-  bool is_quant_conv = IsQuantizedConv(quant_op_type);
+  // QlinearConv/MatMul supports u8u8 or u8s8
+  // QLinearAdd/QLinearMul only support u8u8
+  bool is_quant_conv_or_matmul = IsQuantizedConv(quant_op_type) || (quant_op_type == QuantizedOpType::QLinearMatMul);
+
  bool has_valid_qlinear_conv_weight =
      (b_input_type == ONNX_NAMESPACE::TensorProto_DataType_UINT8 ||
       b_input_type == ONNX_NAMESPACE::TensorProto_DataType_INT8);

  if (a_input_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8 ||
-      (!is_quant_conv && a_input_type != b_input_type) ||
-      (is_quant_conv && !has_valid_qlinear_conv_weight)) {
+      (!is_quant_conv_or_matmul && a_input_type != b_input_type) ||
+      (is_quant_conv_or_matmul && !has_valid_qlinear_conv_weight)) {
    LOGS_DEFAULT(VERBOSE) << "[" << node_unit.OpType()
                          << "] A Input type: [" << a_input_type
                          << "] B Input type: [" << b_input_type
@ -166,182 +161,6 @@ bool HasValidBinaryOpQuantizedInputs(const NodeUnit& node_unit) {
  return true;
 }

-bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
-                                const std::vector<size_t>& indices, const OpSupportCheckParams& params, bool is_input) {
-  const auto& op_type = node_unit.OpType();
-  auto quant_op_type = GetQuantizedOpType(node_unit);
-  bool is_quant_conv = IsQuantizedConv(quant_op_type);
-  bool is_quant_matmul = (quant_op_type == QuantizedOpType::QLinearMatMul);
-  const auto& io_defs = is_input ? node_unit.Inputs() : node_unit.Outputs();
-  for (const auto idx : indices) {
-    if (idx >= io_defs.size()) {
-      LOGS_DEFAULT(VERBOSE) << (is_input ? "Input" : "Output") << " index,  " << idx
-                            << " >= size, " << io_defs.size()
-                            << " of NodeUnit: " << node_unit.Name();
-      return false;
-    }
-
-    const auto& io_def = io_defs[idx];
-    if (!io_def.quant_param.has_value()) {
-      LOGS_DEFAULT(VERBOSE) << "HasValidQuantizationZeroPoints, Input index,  " << idx
-                            << " has no quant_param";
-      return false;
-    }
-
-    const auto scale_name = io_def.quant_param->scale.Name();
-
-    if (!Contains(initializers, scale_name)) {
-      LOGS_DEFAULT(VERBOSE) << "The scale of " << op_type << " must be an initializer tensor";
-      return false;
-    }
-
-    // If this op is Qlinear[Conv/MatMul], we want to check u8s8 support for weight tensor (or B tensor for QlinearMatMul)
-    bool is_conv_matmul_weight = is_input && (is_quant_conv || is_quant_matmul) && idx == 1;
-    bool is_conv_matmul_u8s8_weight = false;
-
-    if (is_conv_matmul_weight) {
-      const auto& weight_tensor = *initializers.at(io_def.node_arg.Name());
-      is_conv_matmul_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8;
-    }
-
-    const auto& scale_tensor = *initializers.at(scale_name);
-    int64_t scales_dim = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0];
-    if (!is_conv_matmul_u8s8_weight) {
-      if (scales_dim != 1) {
-        LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, "
-                              << " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+";
-        return false;
-      }
-    } else if (scales_dim != 1) {
-      // For u8s8 Qlinear[Conv/MatMul], we support
-      // 1. Per-tensor, the weight will be transformed to uint8 later
-      // 2. Per-channel, only from Android API level 29
-      if (is_quant_matmul) {
-        LOGS_DEFAULT(VERBOSE) << "QLinearMatMul does not support per-channel quantization";
-        return false;
-      }
-
-      if (params.android_feature_level < ANEURALNETWORKS_FEATURE_LEVEL_3) {
-        LOGS_DEFAULT(VERBOSE) << op_type << " only supports per-channel quantization on Android API 29+, "
-                              << "system NNAPI feature level: " << params.android_feature_level;
-        return false;
-      }
-
-      const auto& weight_tensor = *initializers.at(io_def.node_arg.Name());
-      if (weight_tensor.dims()[0] != scales_dim) {
-        LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight,"
-                              << " weight dimension[0] " << weight_tensor.dims()[0]
-                              << " scale dimension " << scales_dim;
-        return false;
-      }
-    }
-  }
-
-  return true;
-}
-
-bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
-                                    const std::vector<size_t>& indices, bool is_input) {
-  const auto& op_type = node_unit.OpType();
-  auto quant_op_type = GetQuantizedOpType(node_unit);
-  bool is_quant_conv = IsQuantizedConv(quant_op_type);
-  bool is_quant_matmul = (quant_op_type == QuantizedOpType::QLinearMatMul);
-
-  const auto& io_defs = is_input ? node_unit.Inputs() : node_unit.Outputs();
-  for (const auto idx : indices) {
-    if (idx >= io_defs.size()) {
-      LOGS_DEFAULT(VERBOSE) << "HasValidQuantizationZeroPoints, "
-                            << (is_input ? "Input" : "Output") << " index,  " << idx
-                            << " >= size, " << io_defs.size();
-      return false;
-    }
-
-    const auto& io_def = io_defs[idx];
-    if (!io_def.quant_param.has_value()) {
-      LOGS_DEFAULT(VERBOSE) << "HasValidQuantizationZeroPoints, Input index,  " << idx
-                            << " has no quant_param";
-      return false;
-    }
-
-    // zero point is optional here
-    if (!io_def.quant_param->zero_point)
-      return true;
-
-    const auto& zero_point_name = io_def.quant_param->zero_point->Name();
-    if (!Contains(initializers, zero_point_name)) {
-      LOGS_DEFAULT(VERBOSE) << "The zero point of " << op_type << " must be an initializer tensor";
-      return false;
-    }
-
-    bool is_conv_matmul_weight = is_input && (is_quant_conv || is_quant_matmul) && idx == 1;
-    bool is_conv_matmul_u8s8_weight = false;
-
-    if (is_conv_matmul_weight) {
-      const auto& weight_tensor = *initializers.at(io_def.node_arg.Name());
-      is_conv_matmul_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8;
-    }
-
-    const auto& zero_tensor = *initializers.at(zero_point_name);
-    int64_t zero_dim = zero_tensor.dims().empty() ? 1 : zero_tensor.dims()[0];
-
-    if (!is_conv_matmul_u8s8_weight) {
-      if (zero_dim != 1) {
-        LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, "
-                              << " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+";
-        return false;
-      }
-    } else {
-      // For u8s8 Qlinear[Conv/MatMul], we support
-      // 1. Per-tensor, the weight will be transformed to uint8 later
-      // 2. Per-channel, only from Android API level 29
-      if (zero_tensor.data_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8) {
-        LOGS_DEFAULT(VERBOSE) << "u8s8 Qlinear[Conv/MatMul] only supports int8 zero point for weight, "
-                              << "actual zero point type: [" << zero_tensor.data_type() << "]";
-        return false;
-      }
-
-      if (zero_dim != 1) {
-        if (is_quant_matmul) {
-          LOGS_DEFAULT(VERBOSE) << "QLinearMatMul does not support per-channel quantization";
-          return false;
-        }
-      }
-
-      // For onnx, u8s8 QlinearConv, the weight zero point can be a scalar,
-      // or a tensor with same channel as weight, for NNAPI we only support it be
-      // 0 (scalar) or all 0 (tensor), NNAPI will assume the zero point for per-channel
-      // quantization is 0 there is no input for it
-      const auto& weight_tensor = *initializers.at(io_def.node_arg.Name());
-      if (weight_tensor.dims()[0] != zero_dim && zero_dim != 1) {
-        LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight,"
-                              << " weight dimension[0] " << weight_tensor.dims()[0]
-                              << " zero point dimension " << zero_dim;
-        return false;
-      }
-
-      std::vector<uint8_t> unpacked_tensor;
-      auto status = onnxruntime::utils::UnpackInitializerData(zero_tensor, node_unit.ModelPath(), unpacked_tensor);
-      if (!status.IsOK()) {
-        LOGS_DEFAULT(ERROR) << "Qlinear[Conv/MatMul] error when unpack zero tensor: " << zero_point_name
-                            << ", error msg: " << status.ErrorMessage();
-        return false;
-      }
-
-      // Verify all onnx weight zero point(s) are 0(s)
-      const int8_t* zero_points = reinterpret_cast<const int8_t*>(unpacked_tensor.data());
-      for (size_t i = 0; i < unpacked_tensor.size(); i++) {
-        if (zero_points[i] != 0) {
-          LOGS_DEFAULT(VERBOSE) << "u8s8 Qlinear[Conv/MatMul]  only support 0 as zero point, "
-                                << "zero_points[" << i << "] has value: " << zero_points[i];
-          return false;
-        }
-      }
-    }
-  }
-
-  return true;
-}
-
 common::Status GetQuantizationScaleAndZeroPoint(
    const InitializedTensorSet& initializers, const NodeUnitIODef& io_def, const Path& model_path,
    float& scale, int32_t& zero_point) {
@ -387,8 +206,8 @@ common::Status GetQuantizationScaleAndZeroPoint(

 common::Status GetQuantizationScaleAndZeroPoint(
    const InitializedTensorSet& initializers, const NodeUnit& node_unit, const std::string& name,
-    float& scale, int32_t& zero_point, bool is_input) {
-  const auto& io_defs = is_input ? node_unit.Inputs() : node_unit.Outputs();
+    float& scale, int32_t& zero_point, IOKind io_kind) {
+  const auto& io_defs = io_kind == IOKind::Input ? node_unit.Inputs() : node_unit.Outputs();
  for (const auto& io_def : io_defs) {
    if (io_def.node_arg.Name() == name)
      return GetQuantizationScaleAndZeroPoint(initializers, io_def, node_unit.ModelPath(),
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h
@ -82,12 +82,14 @@ enum class QuantizedOpType : uint8_t {
  QLinearAdd,
  QLinearSigmoid,
  QLinearAveragePool,
+  QLinearMul,
  // Not yet supported
-  // QLinearMul,
  // QLinearReduceMean,
  QDQConv,
  QDQResize,
  QDQAveragePool,
+  QDQAdd,
+  QDQMul,
  // TODO, add other QDQ NodeUnit types
 };

@ -97,6 +99,11 @@ enum class ConvType : uint8_t {
  Grouped,
 };

+// Selects which side of a NodeUnit an I/O lookup refers to:
+// Input addresses NodeUnit::Inputs(), Output addresses NodeUnit::Outputs().
+enum class IOKind : uint8_t {
+  Input,
+  Output,
+};
+
 QuantizedOpType GetQuantizedOpType(const NodeUnit& node_unit);

 // Return the type of the conv ops,
@ -113,18 +120,8 @@ bool IsQuantizedPool(QuantizedOpType quant_op_type);
 // Such as QLinearConv, QLinearMatMul, QLinearAdd, QDQConv,...
 bool IsQuantizedBinaryOp(QuantizedOpType quant_op_type);

-// Check if a qlinear unary op has valid inputs, Qlinear[Sigmoid/AveragePool]
-bool HasValidUnaryOpQuantizedInputs(const NodeUnit& node_unit);
 // Check if a qlinear binary op has valid inputs, Qlinear[Conv/MatMul/Add]
-bool HasValidBinaryOpQuantizedInputs(const NodeUnit& node_unit);
-
-// Check if a qlinear op has valid scales for given indices
-bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
-                                const std::vector<size_t>& indices, const OpSupportCheckParams& params, bool is_input);
-
-// Check if a qlinear op has valid zero points for given indices
-bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
-                                    const std::vector<size_t>& indices, bool is_input);
+bool HasValidBinaryOpQuantizedInputTypes(const NodeUnit& node_unit);

 common::Status GetQuantizationScaleAndZeroPoint(
    const InitializedTensorSet& initializers, const NodeUnitIODef& io_def, const Path& model_path,
@ -132,7 +129,7 @@ common::Status GetQuantizationScaleAndZeroPoint(

 common::Status GetQuantizationScaleAndZeroPoint(
    const InitializedTensorSet& initializers, const NodeUnit& node_unit, const std::string& name,
-    float& scale, int32_t& zero_point, bool is_input = true);
+    float& scale, int32_t& zero_point, IOKind io_kind = IOKind::Input);

 // Get Shape/Type of a NodeArg
 // TODO, move to shared_utils
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc
@ -210,7 +210,7 @@ static Status GetInputDataType(
      // TODO, verify the scale and zero point match if there are multiple op using same input
      const auto* node_unit = all_quantized_op_inputs.at(name)[0];
      ORT_RETURN_IF_ERROR(GetQuantizationScaleAndZeroPoint(
-          initializers, *node_unit, name, scale, zero_point, true /* is_input */));
+          initializers, *node_unit, name, scale, zero_point, IOKind::Input));
      break;
    }
      // case ONNX_NAMESPACE::TensorProto_DataType_INT8:
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc
@ -452,7 +452,7 @@ static Status HandleAutoPad(const Shape& input_shape,
 }

 // Get scales and zero points for the qlinear binary ops (which has 2 input and 1 output)
-// QLinearConv, QLinearMatmul, QLinearAdd
+// QLinearConv, QLinearMatmul, QLinearAdd, QLinearMul
 // a, b are inputs, and y is output
 static Status GetBinaryOpQuantizationScaleAndZeroPoint(
    const InitializedTensorSet& initializers, const NodeUnit& node_unit,
@ -656,8 +656,11 @@ class BinaryOpBuilder : public BaseOpBuilder {
 };

 /* static */ bool BinaryOpBuilder::IsQuantizedOp(const NodeUnit& node_unit) {
-  // TODO, add support for QDQ NodeUnit
-  return node_unit.OpType() == "QLinearAdd";
+  const auto quant_type = GetQuantizedOpType(node_unit);
+  return quant_type == QuantizedOpType::QLinearAdd ||
+         quant_type == QuantizedOpType::QLinearMul ||
+         quant_type == QuantizedOpType::QDQAdd ||
+         quant_type == QuantizedOpType::QDQMul;
 }

 void BinaryOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const {
@ -680,6 +683,7 @@ void BinaryOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const N
          "Mul",
          "Div",
          "QLinearAdd",
+          "QLinearMul",
          "Pow",
      });
 }
@ -690,12 +694,12 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const

  int32_t op_code;
  bool add_activation = true;
-  bool op_is_qlinear = op_type == "QLinearAdd";
-  if (op_type == "Add" || op_is_qlinear) {
+  bool is_quant_op = IsQuantizedOp(node_unit);
+  if (op_type == "Add" || op_type == "QLinearAdd") {  // Add/QLinearAdd/QDQAdd
    op_code = ANEURALNETWORKS_ADD;
  } else if (op_type == "Sub") {
    op_code = ANEURALNETWORKS_SUB;
-  } else if (op_type == "Mul") {
+  } else if (op_type == "Mul" || op_type == "QLinearMul") {  // Mul/QLinearMul/QDQMul
    op_code = ANEURALNETWORKS_MUL;
  } else if (op_type == "Div") {
    op_code = ANEURALNETWORKS_DIV;
@ -721,7 +725,7 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
          b_zero_point = 0,
          y_zero_point = 0;

-  if (op_is_qlinear) {
+  if (is_quant_op) {
    ORT_RETURN_IF_ERROR(GetBinaryOpQuantizationScaleAndZeroPoint(
        model_builder.GetInitializerTensors(), node_unit,
        a_scale, b_scale, y_scale,
@ -729,7 +733,7 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
  }

  // Verify if the scale and zero point matchs from onnx input and nnapi input match
-  if (op_is_qlinear) {
+  if (is_quant_op) {
    ORT_RETURN_IF_ERROR(IsValidInputQuantizedType(model_builder, input1, a_scale, a_zero_point));
    ORT_RETURN_IF_ERROR(IsValidInputQuantizedType(model_builder, input2, b_scale, b_zero_point));
  }
@ -2717,6 +2721,7 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() {
    NNAPI_EP_ADD_SHARED_OP_BUILDER("Mul", BinaryOpBuilder);
    NNAPI_EP_ADD_SHARED_OP_BUILDER("Pow", BinaryOpBuilder);
    NNAPI_EP_ADD_SHARED_OP_BUILDER("QLinearAdd", BinaryOpBuilder);
+    NNAPI_EP_ADD_SHARED_OP_BUILDER("QLinearMul", BinaryOpBuilder);
    NNAPI_EP_ADD_SHARED_OP_BUILDER("Sub", BinaryOpBuilder);
  }

--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc
@ -22,7 +22,21 @@ struct OpSupportCheckerRegistrations {
  std::unordered_map<std::string, const IOpSupportChecker*> op_support_checker_map;
 };

-bool HasExternalInitializer(const InitializedTensorSet& initializers, const NodeUnit& node_unit) {
+// Create a single checker of type T and register it under every name in op_types.
+// Does nothing if op_type already has an entry in the registration map.
+template <class T>
+void CreateSharedOpSupportCheckerImpl(const std::string& op_type,
+                                      OpSupportCheckerRegistrations& op_registrations,
+                                      const std::vector<std::string>& op_types) {
+  auto& checker_map = op_registrations.op_support_checker_map;
+  if (checker_map.count(op_type) > 0) {
+    // The shared OpSupportChecker is already in the OpSupportCheckerRegistrations
+    return;
+  }
+
+  op_registrations.support_checkers.push_back(std::make_unique<T>());
+  const auto* shared_checker = op_registrations.support_checkers.back().get();
+  for (const auto& shared_op_type : op_types) {
+    checker_map.emplace(shared_op_type, shared_checker);
+  }
+}
+
+static bool HasExternalInitializer(const InitializedTensorSet& initializers, const NodeUnit& node_unit) {
  const auto is_ext_initializer =
      [&](const NodeArg& node_arg) {
        const auto& input_name(node_arg.Name());
@ -58,18 +72,200 @@ bool HasExternalInitializer(const InitializedTensorSet& initializers, const Node
  return false;
 }

-template <class T>
-void CreateSharedOpSupportCheckerImpl(const std::string& op_type,
-                                      OpSupportCheckerRegistrations& op_registrations,
-                                      const std::vector<std::string>& op_types) {
-  // The shared OpSupportChecker is already in the OpSupportCheckerRegistrations
-  if (op_registrations.op_support_checker_map.find(op_type) != op_registrations.op_support_checker_map.cend())
-    return;
-
-  op_registrations.support_checkers.push_back(std::make_unique<T>());
-  for (const auto& op : op_types) {
-    op_registrations.op_support_checker_map.emplace(op, op_registrations.support_checkers.back().get());
+// Check whether the quantization scale of io_def can be handled.
+//
+// io_def.quant_param must hold a value, it is dereferenced without a check.
+// The scale has to be an initializer. A scale with more than one element (per-channel)
+// is accepted only when is_conv_matmul_u8s8_weight is set, the op is not a quantized MatMul,
+// params.android_feature_level is at least ANEURALNETWORKS_FEATURE_LEVEL_3 and the
+// number of scales equals dimension 0 of the weight.
+// Returns false (after a VERBOSE log where applicable) for anything unsupported.
+static bool IsQuantizationScaleSupported(const InitializedTensorSet& initializers,
+                                         const NodeUnitIODef& io_def,
+                                         const OpSupportCheckParams& params,
+                                         const std::string& op_type,
+                                         bool is_quant_matmul,
+                                         bool is_conv_matmul_u8s8_weight) {
+  const auto& scale_name = io_def.quant_param->scale.Name();
+  auto it = initializers.find(scale_name);
+  if (it == initializers.cend()) {
+    LOGS_DEFAULT(VERBOSE) << "The scale of " << op_type << " must be an initializer tensor";
+    return false;
   }
+
+  const auto& scale_tensor = *it->second;
+  const int64_t scales_dim = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0];
+
+  // Per-tensor quantization needs no further checks
+  if (scales_dim == 1)
+    return true;
+
+  if (!is_conv_matmul_u8s8_weight) {
+    LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, "
+                          << " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+";
+    return false;
+  }
+
+  // For u8s8 Qlinear[Conv/MatMul], we support
+  // 1. Per-tensor, the weight will be transformed to uint8 later
+  // 2. Per-channel, only from Android API level 29
+  if (is_quant_matmul) {
+    LOGS_DEFAULT(VERBOSE) << "QLinearMatMul does not support per-channel quantization";
+    return false;
+  }
+
+  if (params.android_feature_level < ANEURALNETWORKS_FEATURE_LEVEL_3) {
+    LOGS_DEFAULT(VERBOSE) << op_type << " only supports per-channel quantization on Android API 29+, "
+                          << "system NNAPI feature level: " << params.android_feature_level;
+    return false;
+  }
+
+  Shape weight_shape;
+  if (!GetShape(io_def.node_arg, weight_shape))
+    return false;
+
+  // A weight without any dimension has no channel axis, do not index into it
+  if (weight_shape.empty()) {
+    LOGS_DEFAULT(VERBOSE) << op_type << " per-channel quantization requires a weight with at least 1 dimension";
+    return false;
+  }
+
+  if (weight_shape[0] != scales_dim) {
+    LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight,"
+                          << " weight dimension[0] " << weight_shape[0]
+                          << " scale dimension " << scales_dim;
+    return false;
+  }
+
+  return true;
+}
+
+// Check whether the quantization zero point of io_def can be handled.
+//
+// io_def.quant_param must hold a value, it is dereferenced without a check.
+// A missing zero point is accepted. Otherwise the zero point has to be an initializer, and
+// - when is_conv_matmul_u8s8_weight is false, it must have a single element;
+// - when is_conv_matmul_u8s8_weight is true, it must be int8, all of its values must be 0, and
+//   if it has more than one element the op must not be a quantized MatMul and the element
+//   count must equal dimension 0 of the weight.
+// model_path is forwarded to UnpackInitializerData when reading the zero point values.
+// Returns false (after a log where applicable) for anything unsupported.
+static bool IsQuantizationZeroPointSupported(const InitializedTensorSet& initializers,
+                                             const NodeUnitIODef& io_def,
+                                             const std::string& op_type,
+                                             const Path& model_path,
+                                             bool is_quant_matmul,
+                                             bool is_conv_matmul_u8s8_weight) {
+  // zero point is optional here
+  if (!io_def.quant_param->zero_point)
+    return true;
+
+  const auto& zero_point_name = io_def.quant_param->zero_point->Name();
+  const auto it = initializers.find(zero_point_name);
+  if (it == initializers.cend()) {
+    LOGS_DEFAULT(VERBOSE) << "The zero point of " << op_type << " must be an initializer tensor";
+    return false;
+  }
+
+  const auto& zero_tensor = *it->second;
+  const int64_t zero_dim = zero_tensor.dims().empty() ? 1 : zero_tensor.dims()[0];
+
+  if (!is_conv_matmul_u8s8_weight) {
+    if (zero_dim != 1) {
+      LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, "
+                            << " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+";
+      return false;
+    }
+
+    return true;
+  }
+
+  // For u8s8 Qlinear[Conv/MatMul], we support
+  // 1. Per-tensor, the weight will be transformed to uint8 later
+  // 2. Per-channel, only from Android API level 29
+  if (zero_tensor.data_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8) {
+    LOGS_DEFAULT(VERBOSE) << "u8s8 Qlinear[Conv/MatMul] only supports int8 zero point for weight, "
+                          << "actual zero point type: [" << zero_tensor.data_type() << "]";
+    return false;
+  }
+
+  if (zero_dim != 1 && is_quant_matmul) {
+    LOGS_DEFAULT(VERBOSE) << "QLinearMatMul does not support per-channel quantization";
+    return false;
+  }
+
+  // For onnx, u8s8 QlinearConv, the weight zero point can be a scalar,
+  // or a tensor with same channel as weight, for NNAPI we only support it be
+  // 0 (scalar) or all 0 (tensor), NNAPI will assume the zero point for per-channel
+  // quantization is 0 there is no input for it
+  Shape weight_shape;
+  if (!GetShape(io_def.node_arg, weight_shape))
+    return false;
+
+  // Only a per-channel zero point has to line up with the weight,
+  // so dimension 0 of the weight is read only in that case
+  if (zero_dim != 1) {
+    if (weight_shape.empty()) {
+      LOGS_DEFAULT(VERBOSE) << op_type << " per-channel quantization requires a weight with at least 1 dimension";
+      return false;
+    }
+
+    if (weight_shape[0] != zero_dim) {
+      LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight,"
+                            << " weight dimension[0] " << weight_shape[0]
+                            << " zero point dimension " << zero_dim;
+      return false;
+    }
+  }
+
+  std::vector<uint8_t> unpacked_tensor;
+  auto status = onnxruntime::utils::UnpackInitializerData(zero_tensor, model_path, unpacked_tensor);
+  if (!status.IsOK()) {
+    LOGS_DEFAULT(ERROR) << "Qlinear[Conv/MatMul] error when unpack zero tensor: " << zero_point_name
+                        << ", error msg: " << status.ErrorMessage();
+    return false;
+  }
+
+  // Verify all onnx weight zero point(s) are 0(s)
+  const int8_t* zero_points = reinterpret_cast<const int8_t*>(unpacked_tensor.data());
+  for (size_t i = 0; i < unpacked_tensor.size(); i++) {
+    if (zero_points[i] != 0) {
+      // int8_t is a character type for operator<<, widen it so the numeric value gets logged
+      LOGS_DEFAULT(VERBOSE) << "u8s8 Qlinear[Conv/MatMul]  only support 0 as zero point, "
+                            << "zero_points[" << i << "] has value: " << static_cast<int>(zero_points[i]);
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Check if the given quantized input(s) or output(s) is supported
+//
+// indices selects entries of node_unit.Inputs() or node_unit.Outputs(), depending on io_kind.
+// For every selected entry the element type, the scale and the zero point are checked:
+// the type must be uint8, except for input 1 of a quantized Conv/MatMul which may also be int8.
+// An out-of-range index or any unsupported property makes the function return false.
+// ORT_ENFORCE guards the two caller errors: node_unit is not a quantized op,
+// or a selected entry has no quant_param.
+static bool IsQuantizedIOSupported(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
+                                   const std::vector<size_t>& indices, const OpSupportCheckParams& params, IOKind io_kind) {
+  const auto& op_type = node_unit.OpType();
+  auto quant_op_type = GetQuantizedOpType(node_unit);
+
+  ORT_ENFORCE(quant_op_type != QuantizedOpType::Unknown, "[", op_type, "] is not a quantized op");
+
+  const bool is_input = io_kind == IOKind::Input;
+  const char* io_kind_name = is_input ? "Input" : "Output";
+  const bool is_quant_conv = IsQuantizedConv(quant_op_type);
+  const bool is_quant_matmul = (quant_op_type == QuantizedOpType::QLinearMatMul);
+  const auto& io_defs = is_input ? node_unit.Inputs() : node_unit.Outputs();
+
+  for (const auto idx : indices) {
+    if (idx >= io_defs.size()) {
+      LOGS_DEFAULT(VERBOSE) << io_kind_name << " index,  " << idx
+                            << " >= size, " << io_defs.size()
+                            << " of NodeUnit: " << node_unit.Name();
+      return false;
+    }
+
+    const auto& io_def = io_defs[idx];
+    ORT_ENFORCE(io_def.quant_param.has_value(), io_kind_name, " index,  ", idx, " has no quant_param");
+
+    int32_t io_type;
+    if (!GetType(io_def.node_arg, io_type))
+      return false;
+
+    // If this op is Qlinear[Conv/MatMul], we want to check u8s8 support for weight tensor (or B tensor for QlinearMatMul)
+    const bool is_conv_matmul_weight = is_input && (is_quant_conv || is_quant_matmul) && idx == 1;
+    const bool is_conv_matmul_u8s8_weight =
+        is_conv_matmul_weight && io_type == ONNX_NAMESPACE::TensorProto_DataType_INT8;
+
+    // We only support u8 for most of the inputs and all outputs, with the exception for Quantized MatMul and Conv,
+    // which allows s8 weight (u8s8)
+    // TODO, add support of s8s8
+    if (io_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8 && !is_conv_matmul_u8s8_weight) {
+      LOGS_DEFAULT(VERBOSE) << "NodeUnit [" << node_unit.Name()
+                            << "], type [" << op_type << "]'s "
+                            << io_kind_name << " index  [" << idx
+                            << "] has unsupported type [" << io_type << "]";
+      return false;
+    }
+
+    // Check scale and zero point
+    if (!IsQuantizationScaleSupported(initializers, io_def, params, op_type,
+                                      is_quant_matmul, is_conv_matmul_u8s8_weight)) {
+      return false;
+    }
+
+    if (!IsQuantizationZeroPointSupported(initializers, io_def, op_type, node_unit.ModelPath(),
+                                          is_quant_matmul, is_conv_matmul_u8s8_weight)) {
+      return false;
+    }
+  }
+
+  return true;
 }

 #pragma endregion helpers
@ -100,7 +296,9 @@ class BaseOpSupportChecker : public IOpSupportChecker {
    return ANEURALNETWORKS_FEATURE_LEVEL_1;
  }

-  virtual bool HasSupportedInputsImpl(const NodeUnit& node_unit) const;
+  virtual bool HasSupportedInputOutputsImpl(
+      const InitializedTensorSet& initializers, const NodeUnit& node_unit,
+      const OpSupportCheckParams& params) const;

  virtual int GetMinSupportedOpSet(const NodeUnit& /* node_unit */) const { return 1; }
  virtual int GetMaxSupportedOpSet(const NodeUnit& /* node_unit */) const { return 15; }
@ -112,7 +310,8 @@ class BaseOpSupportChecker : public IOpSupportChecker {

 private:
  bool HasSupportedOpSet(const NodeUnit& node_unit) const;
-  bool HasSupportedInputs(const NodeUnit& node_unit) const;
+  bool HasSupportedInputOutputs(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
+                                const OpSupportCheckParams& params) const;
 };

 /* static */ void BaseOpSupportChecker::CreateSharedOpSupportChecker(
@ -138,7 +337,7 @@ bool BaseOpSupportChecker::IsOpSupported(const InitializedTensorSet& initializer
  if (!IsNodeUnitTypeSupported(node_unit))
    return false;

-  if (!HasSupportedInputs(node_unit))
+  if (!HasSupportedInputOutputs(initializers, node_unit, params))
    return false;

  // We do not support external initializers for now
@ -151,7 +350,8 @@ bool BaseOpSupportChecker::IsOpSupported(const InitializedTensorSet& initializer
  return IsOpSupportedImpl(initializers, node_unit, params);
 }

-bool BaseOpSupportChecker::HasSupportedInputs(const NodeUnit& node_unit) const {
+bool BaseOpSupportChecker::HasSupportedInputOutputs(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
+                                                    const OpSupportCheckParams& params) const {
  // We do not support unknown(null) input shape
  auto has_supported_shape = [](const NodeArg& node_arg, const std::string& name, const std::string op_type) {
    const auto* shape_proto = node_arg.Shape();
@ -185,10 +385,12 @@ bool BaseOpSupportChecker::HasSupportedInputs(const NodeUnit& node_unit) const {
        return false;
    }
  }
-  return HasSupportedInputsImpl(node_unit);
+  return HasSupportedInputOutputsImpl(initializers, node_unit, params);
 }

-bool BaseOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const {
+bool BaseOpSupportChecker::HasSupportedInputOutputsImpl(
+    const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit,
+    const OpSupportCheckParams& /* params */) const {
  // We only check the type of input 0 by default
  // specific op builder can override this
  const auto& input = node_unit.Inputs()[0].node_arg;
@ -245,8 +447,13 @@ class BinaryOpSupportChecker : public BaseOpSupportChecker {
                                           const OpSupportCheckParams& params) const override;
  bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
                         const OpSupportCheckParams& params) const override;
-  bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override;
+  bool HasSupportedInputOutputsImpl(
+      const InitializedTensorSet& initializers, const NodeUnit& node_unit,
+      const OpSupportCheckParams& params) const override;
  int GetMinSupportedOpSet(const NodeUnit& node_unit) const override;
+
+  bool IsNodeUnitTypeSupported(const NodeUnit& node_unit) const override;
+  static bool IsQuantizedOp(const NodeUnit& node_unit);
 };

 /* static */ void BinaryOpSupportChecker::CreateSharedOpSupportChecker(
@ -259,10 +466,29 @@ class BinaryOpSupportChecker : public BaseOpSupportChecker {
          "Mul",
          "Div",
          "QLinearAdd",
+          "QLinearMul",
          "Pow",
      });
 }

+bool BinaryOpSupportChecker::IsNodeUnitTypeSupported(const NodeUnit& node_unit) const {
+  if (node_unit.UnitType() == NodeUnit::Type::QDQGroup) {
+    const auto quant_type = GetQuantizedOpType(node_unit);
+    return quant_type == QuantizedOpType::QDQAdd ||
+           quant_type == QuantizedOpType::QDQMul;
+  }
+
+  return true;
+}
+
+/* static */ bool BinaryOpSupportChecker::IsQuantizedOp(const NodeUnit& node_unit) {
+  const auto quant_type = GetQuantizedOpType(node_unit);
+  return quant_type == QuantizedOpType::QLinearAdd ||
+         quant_type == QuantizedOpType::QLinearMul ||
+         quant_type == QuantizedOpType::QDQAdd ||
+         quant_type == QuantizedOpType::QDQMul;
+}
+
 int32_t BinaryOpSupportChecker::GetMinSupportedNNAPIFeatureLevel(
    const NodeUnit& node_unit, const OpSupportCheckParams& /* params */) const {
  const auto& op(node_unit.OpType());
@ -281,21 +507,29 @@ int BinaryOpSupportChecker::GetMinSupportedOpSet(const NodeUnit& node_unit) cons
  const auto& op(node_unit.OpType());

  // Add/Sub/Mul/Div/Pow opset 6- has broadcast attributes we do not support now
-  if (op != "QLinearAdd")
+  if (op != "QLinearAdd" && op != "QLinearMul")
    return 7;

  return 1;
 }

-bool BinaryOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const {
-  bool is_qlinear_add = node_unit.OpType() == "QLinearAdd";
+bool BinaryOpSupportChecker::HasSupportedInputOutputsImpl(
+    const InitializedTensorSet& initializers, const NodeUnit& node_unit,
+    const OpSupportCheckParams& params) const {
+  bool is_quantized_op = IsQuantizedOp(node_unit);
  bool is_pow = node_unit.OpType() == "Pow";
-  if (!is_qlinear_add && !is_pow)
-    return BaseOpSupportChecker::HasSupportedInputsImpl(node_unit);
+  if (!is_quantized_op && !is_pow)
+    return BaseOpSupportChecker::HasSupportedInputOutputsImpl(initializers, node_unit, params);

-  if (is_qlinear_add) {
-    // QLinearAdd
-    if (!HasValidBinaryOpQuantizedInputs(node_unit))
+  if (is_quantized_op) {
+    // QLinearAdd/QDQAdd/QLinearMul/QDQMul
+    if (!HasValidBinaryOpQuantizedInputTypes(node_unit))
+      return false;
+
+    if (!IsQuantizedIOSupported(initializers, node_unit, {0, 1}, params, IOKind::Input))
+      return false;
+
+    if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Output))
      return false;
  }

@ -320,11 +554,10 @@ bool BinaryOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) c
  return true;
 }

-bool BinaryOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
-                                               const OpSupportCheckParams& params) const {
+bool BinaryOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit,
+                                               const OpSupportCheckParams& /* params */) const {
  const auto& op_type(node_unit.OpType());
  const auto& inputs = node_unit.Inputs();
-  bool op_is_qlinear = op_type == "QLinearAdd";
  Shape input1_shape, input2_shape;
  if (!GetShape(inputs[0].node_arg, input1_shape) ||
      !GetShape(inputs[1].node_arg, input2_shape))
@ -339,32 +572,6 @@ bool BinaryOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initi
    return false;
  }

-  if (op_is_qlinear) {
-    // For QLinearAdd, we only support uint8 output now
-    int32_t output_type;
-    if (!GetType(node_unit.Outputs()[0].node_arg, output_type))
-      return false;
-
-    if (output_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) {
-      LOGS_DEFAULT(VERBOSE) << "[" << op_type
-                            << "] output type: [" << output_type
-                            << "] is not supported for now";
-      return false;
-    }
-
-    // Check input scales and ZPs
-    if (!HasValidQuantizationScales(initializers, node_unit, {0, 1}, params, true /* is_input */))
-      return false;
-    if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0, 1}, true /* is_input */))
-      return false;
-
-    // Check output scale and ZP
-    if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */))
-      return false;
-    if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */))
-      return false;
-  }
-
  return true;
 }

@ -382,7 +589,9 @@ class TransposeOpSupportChecker : public BaseOpSupportChecker {
    return ANEURALNETWORKS_FEATURE_LEVEL_2;
  }

-  bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override;
+  bool HasSupportedInputOutputsImpl(
+      const InitializedTensorSet& initializers, const NodeUnit& node_unit,
+      const OpSupportCheckParams& params) const override;
 };

 bool TransposeOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit,
@ -401,7 +610,9 @@ bool TransposeOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /*
  return true;
 }

-bool TransposeOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const {
+bool TransposeOpSupportChecker::HasSupportedInputOutputsImpl(
+    const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit,
+    const OpSupportCheckParams& /* params */) const {
  int32_t input_type;
  if (!GetType(node_unit.Inputs()[0].node_arg, input_type))
    return false;
@ -561,8 +772,10 @@ class PoolOpSupportChecker : public BaseOpSupportChecker {
    return params.use_nchw ? ANEURALNETWORKS_FEATURE_LEVEL_3 : ANEURALNETWORKS_FEATURE_LEVEL_2;
  }

-  bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override;
-  bool IsNodeUnitTypeSupported(const NodeUnit& /* node_unit */) const override { return true; }
+  bool HasSupportedInputOutputsImpl(
+      const InitializedTensorSet& initializers, const NodeUnit& node_unit,
+      const OpSupportCheckParams& params) const override;
+  bool IsNodeUnitTypeSupported(const NodeUnit& /* node_unit */) const override;
  static bool IsQuantizedOp(const NodeUnit& node_unit);
 };

@ -579,12 +792,21 @@ class PoolOpSupportChecker : public BaseOpSupportChecker {
      });
 }

+bool PoolOpSupportChecker::IsNodeUnitTypeSupported(const NodeUnit& node_unit) const {
+  if (node_unit.UnitType() == NodeUnit::Type::QDQGroup) {
+    const auto quant_type = GetQuantizedOpType(node_unit);
+    return quant_type == QuantizedOpType::QDQAveragePool;
+  }
+
+  return true;
+}
+
 /* static */ bool PoolOpSupportChecker::IsQuantizedOp(const NodeUnit& node_unit) {
  return IsQuantizedPool(GetQuantizedOpType(node_unit));
 }

 bool PoolOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
-                                             const OpSupportCheckParams& params) const {
+                                             const OpSupportCheckParams& /* params */) const {
  const auto& op_name = node_unit.Name();
  const auto& op_type = node_unit.OpType();
  const auto& inputs = node_unit.Inputs();
@ -601,7 +823,8 @@ bool PoolOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial
  }

  bool is_quant_pool = IsQuantizedOp(node_unit);
-  if (op_type == "AveragePool" || op_type == "MaxPool" || op_type == "QLinearAveragePool") {
+  bool is_average_pool = op_type == "AveragePool" || op_type == "QLinearAveragePool";
+  if (is_average_pool || op_type == "MaxPool") {
    NodeAttrHelper helper(node_unit);

    const auto count_include_pad = helper.Get("count_include_pad", 0);
@ -642,20 +865,7 @@ bool PoolOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial
  }

  // We need to check if we have valid scales and zero points for QLinearAveragePool
-  if (is_quant_pool) {
-    // Check input scales and ZPs
-    if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, true /* is_input */))
-      return false;
-    if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, true /* is_input */))
-      return false;
-
-    // Check output scale and ZP
-
-    if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */))
-      return false;
-    if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */))
-      return false;
-
+  if (is_average_pool && is_quant_pool) {
    // NNAPI requires Quantized Average Pool has same scale and zero point for both input and output
    float input_scale = 0.0f;
    int32_t input_zp = 0;
@ -697,14 +907,23 @@ bool PoolOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial
  return true;
 }

-bool PoolOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const {
-  bool is_max_pool = node_unit.OpType() == "MaxPool";
+bool PoolOpSupportChecker::HasSupportedInputOutputsImpl(
+    const InitializedTensorSet& initializers, const NodeUnit& node_unit,
+    const OpSupportCheckParams& params) const {
+  const auto& op_type = node_unit.OpType();
  bool is_quant_pool = IsQuantizedOp(node_unit);
-  if (!is_max_pool && !is_quant_pool)
-    return BaseOpSupportChecker::HasSupportedInputsImpl(node_unit);
+  bool is_max_pool = op_type == "MaxPool";
+  bool is_average_pool = op_type == "AveragePool" || op_type == "QLinearAveragePool";
+  bool is_quant_average_pool = is_quant_pool && is_average_pool;
+  if (!is_max_pool && !is_quant_average_pool)
+    return BaseOpSupportChecker::HasSupportedInputOutputsImpl(initializers, node_unit, params);

-  if (is_quant_pool) {
-    return HasValidUnaryOpQuantizedInputs(node_unit);
+  if (is_quant_average_pool) {
+    if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Input))
+      return false;
+
+    if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Output))
+      return false;
  }

  // is_max_pool
@ -742,7 +961,9 @@ class ConvOpSupportChecker : public BaseOpSupportChecker {
    return params.use_nchw ? ANEURALNETWORKS_FEATURE_LEVEL_3 : ANEURALNETWORKS_FEATURE_LEVEL_2;
  }

-  bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override;
+  bool HasSupportedInputOutputsImpl(
+      const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit,
+      const OpSupportCheckParams& /* params */) const override;
  bool IsNodeUnitTypeSupported(const NodeUnit& /* node_unit */) const override { return true; }
  static bool IsQuantizedOp(const NodeUnit& node_unit);
 };
@ -761,12 +982,20 @@ class ConvOpSupportChecker : public BaseOpSupportChecker {
  return IsQuantizedConv(GetQuantizedOpType(node_unit));
 }

-bool ConvOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const {
+bool ConvOpSupportChecker::HasSupportedInputOutputsImpl(
+    const InitializedTensorSet& initializers, const NodeUnit& node_unit,
+    const OpSupportCheckParams& params) const {
  if (!IsQuantizedOp(node_unit))
-    return BaseOpSupportChecker::HasSupportedInputsImpl(node_unit);
+    return BaseOpSupportChecker::HasSupportedInputOutputsImpl(initializers, node_unit, params);

  // QLinearConv only supports input of uint8 for now
-  if (!HasValidBinaryOpQuantizedInputs(node_unit))
+  if (!HasValidBinaryOpQuantizedInputTypes(node_unit))
+    return false;
+
+  if (!IsQuantizedIOSupported(initializers, node_unit, {0, 1}, params, IOKind::Input))
+    return false;
+
+  if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Output))
    return false;

  return true;
@ -813,34 +1042,10 @@ bool ConvOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial
  }

  if (is_quant_conv) {
-    // For QLinearConv, we only support uint8 output now
-    int32_t output_type;
-    if (!GetType(node_unit.Outputs()[0].node_arg, output_type))
-      return false;
-
-    if (output_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) {
-      LOGS_DEFAULT(VERBOSE) << "[" << op_type
-                            << "] output type: [" << output_type
-                            << "] is not supported for now";
-      return false;
-    }
-
    if (inputs.size() > 2 && !Contains(initializers, inputs[2].node_arg.Name())) {
      LOGS_DEFAULT(VERBOSE) << "Bias of QLinearConv must be known";
      return false;
    }
-
-    // Check input scales and ZPs
-    if (!HasValidQuantizationScales(initializers, node_unit, {0, 1}, params, true /* is_input */))
-      return false;
-    if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0, 1}, true /* is_input */))
-      return false;
-
-    // Check output scale and ZP
-    if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */))
-      return false;
-    if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */))
-      return false;
  }

  return true;
@ -931,16 +1136,26 @@ class GemmOpSupportChecker : public BaseOpSupportChecker {
 private:
  bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
                         const OpSupportCheckParams& params) const override;
-  bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override;
+  bool HasSupportedInputOutputsImpl(
+      const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit,
+      const OpSupportCheckParams& /* params */) const override;
  int GetMinSupportedOpSet(const NodeUnit& node_unit) const override;
 };

-bool GemmOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const {
+bool GemmOpSupportChecker::HasSupportedInputOutputsImpl(
+    const InitializedTensorSet& initializers, const NodeUnit& node_unit,
+    const OpSupportCheckParams& params) const {
  if (node_unit.OpType() != "QLinearMatMul")
-    return BaseOpSupportChecker::HasSupportedInputsImpl(node_unit);
+    return BaseOpSupportChecker::HasSupportedInputOutputsImpl(initializers, node_unit, params);

  // QLinearMatMul
-  if (!HasValidBinaryOpQuantizedInputs(node_unit))
+  if (!HasValidBinaryOpQuantizedInputTypes(node_unit))
+    return false;
+
+  if (!IsQuantizedIOSupported(initializers, node_unit, {0, 1}, params, IOKind::Input))
+    return false;
+
+  if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Output))
    return false;

  return true;
@ -1077,33 +1292,6 @@ bool GemmOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial
      LOGS_DEFAULT(VERBOSE) << "B of MatMul must be known";
      return false;
    }
-
-    if (is_qlinear_matmul) {
-      // For QLinearMatMul, we only support uint8 output now
-      int32_t output_type;
-      if (!GetType(node_unit.Outputs()[0].node_arg, output_type))
-        return false;
-
-      if (output_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) {
-        LOGS_DEFAULT(VERBOSE) << "[" << op_type
-                              << "] output type: [" << output_type
-                              << "] is not supported for now";
-        return false;
-      }
-
-      // All scale/zero points are initializer scalars
-      // Check input scales and ZPs
-      if (!HasValidQuantizationScales(initializers, node_unit, {0, 1}, params, true /* is_input */))
-        return false;
-      if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0, 1}, true /* is_input */))
-        return false;
-
-      // Check output scale and ZP
-      if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */))
-        return false;
-      if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */))
-        return false;
-    }
  } else {
    LOGS_DEFAULT(VERBOSE) << "GemmOpSupportChecker, unknown op: " << op_type;
  }
@ -1127,7 +1315,9 @@ class UnaryOpSupportChecker : public BaseOpSupportChecker {
  int32_t GetMinSupportedNNAPIFeatureLevel(const NodeUnit& /* node_unit */,
                                           const OpSupportCheckParams& params) const override;

-  bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override;
+  bool HasSupportedInputOutputsImpl(
+      const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit,
+      const OpSupportCheckParams& /* params */) const override;

  int GetMinSupportedOpSet(const NodeUnit& node_unit) const override;

@ -1176,12 +1366,20 @@ int32_t UnaryOpSupportChecker::GetMinSupportedNNAPIFeatureLevel(const NodeUnit&
  return ANEURALNETWORKS_FEATURE_LEVEL_1;
 }

-bool UnaryOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const {
+bool UnaryOpSupportChecker::HasSupportedInputOutputsImpl(
+    const InitializedTensorSet& initializers, const NodeUnit& node_unit,
+    const OpSupportCheckParams& params) const {
  // We only need to override input check for QLinearSigmoid
  if (node_unit.OpType() != "QLinearSigmoid")
-    return BaseOpSupportChecker::HasSupportedInputsImpl(node_unit);
+    return BaseOpSupportChecker::HasSupportedInputOutputsImpl(initializers, node_unit, params);

-  return HasValidUnaryOpQuantizedInputs(node_unit);
+  if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Input))
+    return false;
+
+  if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Output))
+    return false;
+
+  return true;
 }

 // All ops except "Sin" opset 5- uses consumed_inputs attribute which is not supported for now
@ -1195,24 +1393,11 @@ int UnaryOpSupportChecker::GetMinSupportedOpSet(const NodeUnit& node_unit) const
 }

 /* static */ bool UnaryOpSupportChecker::IsQuantizedOpSupported(
-    const InitializedTensorSet& initializers, const NodeUnit& node_unit, const OpSupportCheckParams& params) {
+    const InitializedTensorSet& initializers, const NodeUnit& node_unit, const OpSupportCheckParams& /* params */) {
  const auto& op_type = node_unit.OpType();
  ORT_ENFORCE(op_type == "QLinearSigmoid");
-
  const auto& op_name = node_unit.Name();

-  // Check input scales and ZPs
-  if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, true /* is_input */))
-    return false;
-  if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, true /* is_input */))
-    return false;
-
-  // Check output scale and ZP
-  if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */))
-    return false;
-  if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */))
-    return false;
-
  // NNAPI requires the scale be 1.f/256 and zero point to be 0
  // See https://android.googlesource.com/platform/frameworks/ml/+/refs/heads/android10-c2f2-release/nn/common/operations/Activation.cpp#180
  float output_scale = 0.0f;
@ -1249,7 +1434,9 @@ class ConcatOpSupportChecker : public BaseOpSupportChecker {
  bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
                         const OpSupportCheckParams& params) const override;

-  bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override;
+  bool HasSupportedInputOutputsImpl(
+      const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit,
+      const OpSupportCheckParams& /* params */) const override;
 };

 bool ConcatOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit,
@ -1268,7 +1455,9 @@ bool ConcatOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /* in
  return true;
 }

-bool ConcatOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const {
+bool ConcatOpSupportChecker::HasSupportedInputOutputsImpl(
+    const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit,
+    const OpSupportCheckParams& /* params */) const {
  int32_t input_type;
  if (!GetType(node_unit.Inputs()[0].node_arg, input_type))
    return false;
@ -1331,37 +1520,17 @@ bool SqueezeOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& init

 class QuantizeLinearOpSupportChecker : public BaseOpSupportChecker {
 private:
-  bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
-                         const OpSupportCheckParams& params) const override;
-
  int32_t GetMinSupportedNNAPIFeatureLevel(const NodeUnit& /* node_unit */,
                                           const OpSupportCheckParams& /* params */) const override {
    return ANEURALNETWORKS_FEATURE_LEVEL_3;
  }
-};

-bool QuantizeLinearOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
-                                                       const OpSupportCheckParams& params) const {
-  int32_t output_type;
-  if (!GetType(node_unit.Outputs()[0].node_arg, output_type))
-    return false;
-
-  if (output_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) {
-    LOGS_DEFAULT(VERBOSE) << "[" << node_unit.OpType()
-                          << "] output type: [" << output_type
-                          << "] is not supported for now";
-    return false;
+  bool HasSupportedInputOutputsImpl(
+      const InitializedTensorSet& initializers, const NodeUnit& node_unit,
+      const OpSupportCheckParams& params) const override {
+    return IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Output);
  }
-
-  // For QuantizeLinear only output is quantized
-  // Check output scale and ZP
-  if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */))
-    return false;
-  if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */))
-    return false;
-
-  return true;
-}
+};

 #pragma endregion

@ -1369,42 +1538,17 @@ bool QuantizeLinearOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSe

 class DequantizeLinearOpSupportChecker : public BaseOpSupportChecker {
 private:
-  bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
-                         const OpSupportCheckParams& params) const override;
-
  int32_t GetMinSupportedNNAPIFeatureLevel(const NodeUnit& /* node_unit */,
                                           const OpSupportCheckParams& /* params */) const override {
    return ANEURALNETWORKS_FEATURE_LEVEL_1;
  }
-  bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override;
-};

-bool DequantizeLinearOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
-                                                         const OpSupportCheckParams& params) const {
-  // For DequantizeLinear only input is quantized
-  // Check input scale and ZP
-  if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, true /* is_input */))
-    return false;
-  if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, true /* is_input */))
-    return false;
-
-  return true;
-}
-
-bool DequantizeLinearOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const {
-  int32_t input_type;
-  if (!GetType(node_unit.Inputs()[0].node_arg, input_type))
-    return false;
-
-  if (input_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) {
-    LOGS_DEFAULT(VERBOSE) << "[" << node_unit.OpType()
-                          << "] Input type: [" << input_type
-                          << "] is not supported for now";
-    return false;
+  bool HasSupportedInputOutputsImpl(
+      const InitializedTensorSet& initializers, const NodeUnit& node_unit,
+      const OpSupportCheckParams& params) const override {
+    return IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Input);
  }
-
-  return true;
-}
+};

 #pragma endregion

@ -1480,7 +1624,9 @@ class ResizeOpSupportChecker : public BaseOpSupportChecker {
  // We only support Resize opset 11+ here
  int GetMinSupportedOpSet(const NodeUnit& /* node_unit */) const override { return 11; }

-  bool HasSupportedInputsImpl(const NodeUnit& node_unit) const override;
+  bool HasSupportedInputOutputsImpl(
+      const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit,
+      const OpSupportCheckParams& /* params */) const override;
  bool IsNodeUnitTypeSupported(const NodeUnit& /* node_unit */) const override { return true; }
  static bool IsQuantizedOp(const NodeUnit& node_unit) ORT_MUST_USE_RESULT;  // TODO, see if we want to move this to BaseOpBuilder
 };
@ -1609,33 +1755,6 @@ bool ResizeOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initi
    }
  }

-  if (IsQuantizedOp(node_unit)) {
-    // For QDQResize, we only support uint8 output now
-    // TODO, add int8 support to NNAPI, and maybe move all the output type check into a virtual function
-    // similar to HasSupportedInputsImpl
-    int32_t output_type;
-    if (!GetType(node_unit.Outputs()[0].node_arg, output_type))
-      return false;
-
-    if (output_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) {
-      LOGS_DEFAULT(VERBOSE) << "[Resize] output type: [" << output_type
-                            << "] is not supported for now";
-      return false;
-    }
-
-    // Check input scales and ZPs
-    if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, true /* is_input */))
-      return false;
-    if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, true /* is_input */))
-      return false;
-
-    // Check output scale and ZP
-    if (!HasValidQuantizationScales(initializers, node_unit, {0}, params, false /* is_input */))
-      return false;
-    if (!HasValidQuantizationZeroPoints(initializers, node_unit, {0}, false /* is_input */))
-      return false;
-  }
-
  return true;
 }

@ -1653,7 +1772,9 @@ int32_t ResizeOpSupportChecker::GetMinSupportedNNAPIFeatureLevel(const NodeUnit&
  return ANEURALNETWORKS_FEATURE_LEVEL_2;
 }

-bool ResizeOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) const {
+bool ResizeOpSupportChecker::HasSupportedInputOutputsImpl(
+    const InitializedTensorSet& initializers, const NodeUnit& node_unit,
+    const OpSupportCheckParams& params) const {
  int32_t input_type;
  if (!GetType(node_unit.Inputs()[0].node_arg, input_type))
    return false;
@ -1666,6 +1787,14 @@ bool ResizeOpSupportChecker::HasSupportedInputsImpl(const NodeUnit& node_unit) c
    return false;
  }

+  if (IsQuantizedOp(node_unit)) {
+    if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Input))
+      return false;
+
+    if (!IsQuantizedIOSupported(initializers, node_unit, {0}, params, IOKind::Output))
+      return false;
+  }
+
  return true;
 }

@ -1870,6 +1999,7 @@ static OpSupportCheckerRegistrations CreateOpSupportCheckerRegistrations() {
    NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Mul", BinaryOpSupportChecker);
    NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Pow", BinaryOpSupportChecker);
    NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("QLinearAdd", BinaryOpSupportChecker);
+    NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("QLinearMul", BinaryOpSupportChecker);
    NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Sub", BinaryOpSupportChecker);
  }

--- a/onnxruntime/test/contrib_ops/qlinear_binary_op_test.cc
+++ b/onnxruntime/test/contrib_ops/qlinear_binary_op_test.cc
@ -43,61 +43,87 @@ void RunQLinearMathTestFromFloat(
    const quantization::Params<T>& a_params,
    const std::vector<float>& b, const std::vector<int64_t>& b_shape_origin,
    const quantization::Params<T>& b_params,
-    const quantization::Params<T>& c_params,
-    bool input_b_is_initializer = false,
-    bool all_initializer_scale_zero_point = false) {
-  size_t number_dims = std::max(a_shape_origin.size(), b_shape_origin.size());
-  std::vector<int64_t> a_shape = PrefixingDims(a_shape_origin, number_dims);
-  std::vector<int64_t> b_shape = PrefixingDims(b_shape_origin, number_dims);
-  // calc broadcasting shaped
-  std::vector<int64_t> c_shape(number_dims, 1);
-  for (size_t axis = 0; axis < number_dims; ++axis) {
-    if (a_shape[axis] != b_shape[axis] && (a_shape[axis] != 1 && b_shape[axis] != 1)) {
-      ORT_THROW("Shapes can not be broadcasted");
-    }
-    c_shape[axis] = std::max(a_shape[axis], b_shape[axis]);
-  }
-
-  std::vector<int64_t> a_strides, b_strides, c_strides;
-  auto c_size = CalcStrides(c_shape, c_strides, false);
-  auto a_size = CalcStrides(a_shape, a_strides, true);
-  auto b_size = CalcStrides(b_shape, b_strides, true);
-  if (a_size != static_cast<int64_t>(a.size()) || b_size != static_cast<int64_t>(b.size())) {
-    ORT_THROW("Input size not match input shape!");
-  }
-  constexpr int qmax = std::numeric_limits<T>::max();
-  constexpr int qmin = std::numeric_limits<T>::min();
-
-  OpTester test(op_name, 1, onnxruntime::kMSDomain);
-  std::vector<T> a_quantized = QuantizeTestVector<T>(a, a_params);
-  test.template AddInput<T>("A", a_shape_origin, a_quantized);
-  test.AddInput<float>("A_scale", {}, {a_params.scale}, all_initializer_scale_zero_point);
-  test.template AddInput<T>("A_zero_point", {}, {a_params.zero_point}, all_initializer_scale_zero_point);
-
-  std::vector<T> b_quantized = QuantizeTestVector<T>(b, b_params);
-  test.template AddInput<T>("B", b_shape_origin, b_quantized, input_b_is_initializer);
-  test.AddInput<float>("B_scale", {}, {b_params.scale}, all_initializer_scale_zero_point);
-  test.template AddInput<T>("B_zero_point", {}, {b_params.zero_point}, all_initializer_scale_zero_point);
-
-  test.AddInput<float>("C_scale", {}, {c_params.scale}, all_initializer_scale_zero_point);
-  test.template AddInput<T>("C_zero_point", {}, {c_params.zero_point}, all_initializer_scale_zero_point);
-  std::vector<T> c(c_size);
-  for (int64_t offset = 0; offset < c_size; ++offset) {
-    int64_t remain = offset, a_offset = 0, b_offset = 0;
+    const quantization::Params<T>& c_params) {
+  const auto run_test = [&](bool input_b_is_initializer,
+                            bool all_initializer_scale_zero_point) {
+    size_t number_dims = std::max(a_shape_origin.size(), b_shape_origin.size());
+    std::vector<int64_t> a_shape = PrefixingDims(a_shape_origin, number_dims);
+    std::vector<int64_t> b_shape = PrefixingDims(b_shape_origin, number_dims);
+    // Compute the broadcast output shape
+    std::vector<int64_t> c_shape(number_dims, 1);
    for (size_t axis = 0; axis < number_dims; ++axis) {
-      int64_t index = remain / c_strides[axis];
-      remain = remain % c_strides[axis];
-      a_offset += index * a_strides[axis];
-      b_offset += index * b_strides[axis];
+      if (a_shape[axis] != b_shape[axis] && (a_shape[axis] != 1 && b_shape[axis] != 1)) {
+        ORT_THROW("Shapes can not be broadcasted");
+      }
+      c_shape[axis] = std::max(a_shape[axis], b_shape[axis]);
    }

-    float a_dequantized = quantization::Dequantize(a_quantized[a_offset], a_params);
-    float b_dequantized = quantization::Dequantize(b_quantized[b_offset], b_params);
-    c[offset] = clampi<T>(static_cast<int>(std::nearbyintf(calc(a_dequantized, b_dequantized) / c_params.scale)) + c_params.zero_point, qmin, qmax);
-  }
-  test.template AddOutput<T>("C", c_shape, c);
+    std::vector<int64_t> a_strides, b_strides, c_strides;
+    auto c_size = CalcStrides(c_shape, c_strides, false);
+    auto a_size = CalcStrides(a_shape, a_strides, true);
+    auto b_size = CalcStrides(b_shape, b_strides, true);
+    if (a_size != static_cast<int64_t>(a.size()) || b_size != static_cast<int64_t>(b.size())) {
+      ORT_THROW("Input size not match input shape!");
+    }
+    constexpr int qmax = std::numeric_limits<T>::max();
+    constexpr int qmin = std::numeric_limits<T>::min();

-  test.Run();
+    OpTester test(op_name, 1, onnxruntime::kMSDomain);
+    std::vector<T> a_quantized = QuantizeTestVector<T>(a, a_params);
+    test.template AddInput<T>("A", a_shape_origin, a_quantized);
+    test.AddInput<float>("A_scale", {}, {a_params.scale}, all_initializer_scale_zero_point);
+    test.template AddInput<T>("A_zero_point", {}, {a_params.zero_point}, all_initializer_scale_zero_point);
+
+    std::vector<T> b_quantized = QuantizeTestVector<T>(b, b_params);
+    test.template AddInput<T>("B", b_shape_origin, b_quantized, input_b_is_initializer);
+    test.AddInput<float>("B_scale", {}, {b_params.scale}, all_initializer_scale_zero_point);
+    test.template AddInput<T>("B_zero_point", {}, {b_params.zero_point}, all_initializer_scale_zero_point);
+
+    test.AddInput<float>("C_scale", {}, {c_params.scale}, all_initializer_scale_zero_point);
+    test.template AddInput<T>("C_zero_point", {}, {c_params.zero_point}, all_initializer_scale_zero_point);
+    std::vector<T> c(c_size);
+    for (int64_t offset = 0; offset < c_size; ++offset) {
+      int64_t remain = offset, a_offset = 0, b_offset = 0;
+      for (size_t axis = 0; axis < number_dims; ++axis) {
+        int64_t index = remain / c_strides[axis];
+        remain = remain % c_strides[axis];
+        a_offset += index * a_strides[axis];
+        b_offset += index * b_strides[axis];
+      }
+
+      float a_dequantized = quantization::Dequantize(a_quantized[a_offset], a_params);
+      float b_dequantized = quantization::Dequantize(b_quantized[b_offset], b_params);
+      c[offset] = clampi<T>(static_cast<int>(std::nearbyintf(calc(a_dequantized, b_dequantized) / c_params.scale)) + c_params.zero_point, qmin, qmax);
+    }
+
+    float abs_error = 0.0f;
+
+    // For quantized models, NNAPI's rounding differs from the CPU provider's,
+    // so the result is sometimes within +/-1 of the CPU provider's result.
+    // For ONNX, we use rounding to nearest ties to even.
+    // For NNAPI, it is using std::round which is HALF_AWAY_FROM_ZERO, see
+    // https://android.googlesource.com/platform/frameworks/ml/+/refs/heads/master/nn/common/operations/Quantize.cpp
+    // Use 1 as abs_error, which is the smallest possible for uint8_t
+    //
+    // NOTE, for now the tolerance will only apply if the NNAPI is actually used,
+    // if for any reason the execution falls back to CPU, we still expect an exact match
+    // See, 'void Check<uint8_t>(...' in onnxruntime/test/providers/provider_test_utils.cc
+#ifdef USE_NNAPI
+    abs_error = 1.0f;
+#endif
+
+    test.template AddOutput<T>("C", c_shape, c, false /* sort_output */, 0.0f /* rel_error */, abs_error);
+
+    test.Run();
+  };
+
+  run_test(false /* input_b_is_initializer */, false /* all_initializer_scale_zero_point */);
+
+  // NNAPI requires all the scales and zero points to be initializers
+  run_test(false /* input_b_is_initializer */, true /* all_initializer_scale_zero_point */);
+
+  // We also want to test the case input B is an initializer
+  run_test(true /* input_b_is_initializer */, true /* all_initializer_scale_zero_point */);
 }

 // total 32 + 31 elements to cover all path
@ -145,22 +171,6 @@ TEST(QLinearBinaryOpTest, AddU8VectorVectorFull) {
                              A, {63}, A_params,
                              B, {63}, B_params,
                              C_params);
-
-  // NNAPI will require all the scales and zero points be initializers
-  // We also want to test the case input B is an initializer
-  RunQLinearMathTestFromFloat("QLinearAdd", add_function,
-                              A, {63}, A_params,
-                              B, {63}, B_params,
-                              C_params,
-                              false /* input_b_is_initializer */,
-                              true /* all_initializer_scale_zero_point */);
-
-  RunQLinearMathTestFromFloat("QLinearAdd", add_function,
-                              A, {63}, A_params,
-                              B, {63}, B_params,
-                              C_params,
-                              true /* input_b_is_initializer */,
-                              true /* all_initializer_scale_zero_point */);
 }

 TEST(QLinearBinaryOpTest, AddU8VectorVectorBroadcast) {
@ -180,22 +190,6 @@ TEST(QLinearBinaryOpTest, AddU8VectorVectorBroadcast) {
                              A, {3, 3, 7}, A_params,
                              B, {3, 1, 7}, B_params,
                              C_params);
-
-  // NNAPI will require all the scales and zero points be initializers
-  // We also want to test the case input B is an initializer
-  RunQLinearMathTestFromFloat("QLinearAdd", add_function,
-                              A, {3, 3, 7}, A_params,
-                              B, {3, 1, 7}, B_params,
-                              C_params,
-                              false /* input_b_is_initializer */,
-                              true /* all_initializer_scale_zero_point */);
-
-  RunQLinearMathTestFromFloat("QLinearAdd", add_function,
-                              A, {3, 3, 7}, A_params,
-                              B, {3, 1, 7}, B_params,
-                              C_params,
-                              true /* input_b_is_initializer */,
-                              true /* all_initializer_scale_zero_point */);
 }

 TEST(QLinearBinaryOpTest, AddU8ScalarVectorFull) {
@ -212,22 +206,6 @@ TEST(QLinearBinaryOpTest, AddU8ScalarVectorFull) {
                              B, {1}, B_params,
                              A, {63}, A_params,
                              C_params);
-
-  // NNAPI will require all the scales and zero points be initializers
-  // We also want to test the case input B is an initializer
-  RunQLinearMathTestFromFloat("QLinearAdd", add_function,
-                              B, {1}, B_params,
-                              A, {63}, A_params,
-                              C_params,
-                              false /* input_b_is_initializer */,
-                              true /* all_initializer_scale_zero_point */);
-
-  RunQLinearMathTestFromFloat("QLinearAdd", add_function,
-                              B, {1}, B_params,
-                              A, {63}, A_params,
-                              C_params,
-                              true /* input_b_is_initializer */,
-                              true /* all_initializer_scale_zero_point */);
 }

 TEST(QLinearBinaryOpTest, AddU8ScalarVectorBroadcast) {
@ -244,22 +222,6 @@ TEST(QLinearBinaryOpTest, AddU8ScalarVectorBroadcast) {
                              B, {3, 1, 1}, B_params,
                              A, {3, 7, 3}, A_params,
                              C_params);
-
-  // NNAPI will require all the scales and zero points be initializers
-  // We also want to test the case input B is an initializer
-  RunQLinearMathTestFromFloat("QLinearAdd", add_function,
-                              B, {3, 1, 1}, B_params,
-                              A, {3, 7, 3}, A_params,
-                              C_params,
-                              false /* input_b_is_initializer */,
-                              true /* all_initializer_scale_zero_point */);
-
-  RunQLinearMathTestFromFloat("QLinearAdd", add_function,
-                              B, {3, 1, 1}, B_params,
-                              A, {3, 7, 3}, A_params,
-                              C_params,
-                              true /* input_b_is_initializer */,
-                              true /* all_initializer_scale_zero_point */);
 }

 TEST(QLinearBinaryOpTest, AddU8VectorScalarFull) {
@ -276,22 +238,6 @@ TEST(QLinearBinaryOpTest, AddU8VectorScalarFull) {
                              A, {63}, A_params,
                              B, {1}, B_params,
                              C_params);
-
-  // NNAPI will require all the scales and zero points be initializers
-  // We also want to test the case input B is an initializer
-  RunQLinearMathTestFromFloat("QLinearAdd", add_function,
-                              A, {63}, A_params,
-                              B, {1}, B_params,
-                              C_params,
-                              false /* input_b_is_initializer */,
-                              true /* all_initializer_scale_zero_point */);
-
-  RunQLinearMathTestFromFloat("QLinearAdd", add_function,
-                              A, {63}, A_params,
-                              B, {1}, B_params,
-                              C_params,
-                              true /* input_b_is_initializer */,
-                              true /* all_initializer_scale_zero_point */);
 }

 TEST(QLinearBinaryOpTest, AddU8VectorScalarBroadcast) {
@ -308,22 +254,6 @@ TEST(QLinearBinaryOpTest, AddU8VectorScalarBroadcast) {
                              A, {3, 7, 3}, A_params,
                              B, {1, 1, 3}, B_params,
                              C_params);
-
-  // NNAPI will require all the scales and zero points be initializers
-  // We also want to test the case input B is an initializer
-  RunQLinearMathTestFromFloat("QLinearAdd", add_function,
-                              A, {3, 7, 3}, A_params,
-                              B, {1, 1, 3}, B_params,
-                              C_params,
-                              false /* input_b_is_initializer */,
-                              true /* all_initializer_scale_zero_point */);
-
-  RunQLinearMathTestFromFloat("QLinearAdd", add_function,
-                              A, {3, 7, 3}, A_params,
-                              B, {1, 1, 3}, B_params,
-                              C_params,
-                              true /* input_b_is_initializer */,
-                              true /* all_initializer_scale_zero_point */);
 }

 TEST(QLinearBinaryOpTest, AddS8VectorVectorFull) {
--- a/onnxruntime/test/optimizer/qdq_test_utils.h
+++ b/onnxruntime/test/optimizer/qdq_test_utils.h
@ -81,10 +81,27 @@ GetQDQTestCaseFn BuildQDQConvTestCase(const std::vector<int64_t>& input_shape, c
 template <typename InputType, typename OutputType>
 GetQDQTestCaseFn BuildQDQAveragePoolTestCase(const std::vector<int64_t>& input_shape) {
  return [input_shape](ModelTestBuilder& builder) {
+
+#ifdef USE_NNAPI  // NNAPI requires consistent scales/ZPs for DQ -> Pool -> Q
+    float dq_scale = 0.0038f;
+    float pool_output_scale = 0.0038f;
+    float q_scale = 0.0038f;
+    InputType dq_zp = std::numeric_limits<OutputType>::max() / 2;
+    InputType pool_output_zp = std::numeric_limits<OutputType>::max() / 2;
+    InputType q_zp = std::numeric_limits<OutputType>::max() / 2;
+#else
+    float dq_scale = 0.0035f;
+    float pool_output_scale = 0.0038f;
+    float q_scale = 0.0039f;
+    InputType dq_zp = 7;
+    InputType pool_output_zp = std::numeric_limits<OutputType>::max() / 2;
+    InputType q_zp = std::numeric_limits<OutputType>::max() / 2;
+#endif
+
    auto* input_arg = builder.MakeInput<float>(input_shape, -1.f, 1.f);
    auto* output_arg = builder.MakeOutput();
    // add QDQ + AveragePool
-    auto* dq_output = AddQDQNodePair<InputType>(builder, input_arg, .0035f, 7);
+    auto* dq_output = AddQDQNodePair<InputType>(builder, input_arg, dq_scale, dq_zp);
    auto* averagepool_output = builder.MakeIntermediate();
    Node& pool_node = builder.AddNode("AveragePool", {dq_output}, {averagepool_output});
    std::vector<int64_t> pads((input_shape.size() - 2) * 2, 1);
@ -95,12 +112,12 @@ GetQDQTestCaseFn BuildQDQAveragePoolTestCase(const std::vector<int64_t>& input_s
    // add QDQ output
    auto* q_output = builder.MakeIntermediate();
    builder.AddQuantizeLinearNode<OutputType>(averagepool_output,
-                                              .0038f,
-                                              std::numeric_limits<OutputType>::max() / 2,
+                                              pool_output_scale,
+                                              pool_output_zp,
                                              q_output);
    builder.AddDequantizeLinearNode<OutputType>(q_output,
-                                                .0039f,
-                                                std::numeric_limits<OutputType>::max() / 2,
+                                                q_scale,
+                                                q_zp,
                                                output_arg);
  };
 }
@ -110,5 +127,65 @@ GetQDQTestCaseFn BuildQDQResizeTestCase(const std::vector<int64_t>& input_shape,
                                        const std::string& mode = "nearest",
                                        const std::string& coordinate_transformation_mode = "half_pixel");

+template <typename Input1Type, typename Input2Type, typename OutputType>
+GetQDQTestCaseFn BuildBinaryOpTestCase(const std::vector<int64_t>& input_shape,
+                                       const std::string& op_type) {
+  return [input_shape, op_type](ModelTestBuilder& builder) {
+    auto* input1_arg = builder.MakeInput<float>(input_shape, -1.f, 1.f);
+    auto* input2_arg = builder.MakeInput<float>(input_shape, -1.f, 1.f);
+    auto* output_arg = builder.MakeOutput();
+
+#ifdef USE_NNAPI  // NNAPI requires consistent scales for DQ -> bin_op_input and bin_op_output -> Q
+    float q_scale = 0.008f;
+    float op_input_scale = 0.008f;
+    float op_output_scale = 0.0076f;
+    float dq_scale = 0.0076f;
+#else
+    float q_scale = 0.008f;
+    float op_input_scale = 0.0079f;
+    float op_output_scale = 0.0076f;
+    float dq_scale = 0.0078f;
+#endif
+
+    // add QDQ 1
+    auto* q1_output = builder.MakeIntermediate();
+    auto* dq1_output = builder.MakeIntermediate();
+    builder.AddQuantizeLinearNode<Input1Type>(input1_arg,
+                                              q_scale,
+                                              std::numeric_limits<Input1Type>::max() / 2,
+                                              q1_output);
+    builder.AddDequantizeLinearNode<Input1Type>(q1_output,
+                                                op_input_scale,
+                                                std::numeric_limits<Input1Type>::max() / 2,
+                                                dq1_output);
+
+    // add QDQ 2
+    auto* q2_output = builder.MakeIntermediate();
+    auto* dq2_output = builder.MakeIntermediate();
+    builder.AddQuantizeLinearNode<Input2Type>(input2_arg,
+                                              q_scale,
+                                              std::numeric_limits<Input2Type>::max() / 2,
+                                              q2_output);
+    builder.AddDequantizeLinearNode<Input2Type>(q2_output,
+                                                op_input_scale,
+                                                std::numeric_limits<Input2Type>::max() / 2,
+                                                dq2_output);
+
+    // add binary operator
+    auto* binary_op_output = builder.MakeIntermediate();
+    builder.AddNode(op_type, {dq1_output, dq2_output}, {binary_op_output});
+
+    // add QDQ output
+    auto* q3_output = builder.MakeIntermediate();
+    builder.AddQuantizeLinearNode<OutputType>(binary_op_output,
+                                              op_output_scale,
+                                              std::numeric_limits<OutputType>::max() / 2,
+                                              q3_output);
+    builder.AddDequantizeLinearNode<OutputType>(q3_output,
+                                                dq_scale,
+                                                std::numeric_limits<OutputType>::max() / 2,
+                                                output_arg);
+  };
+}
 }  // namespace test
 }  // namespace onnxruntime
--- a/onnxruntime/test/optimizer/qdq_transformer_test.cc
+++ b/onnxruntime/test/optimizer/qdq_transformer_test.cc
@ -39,7 +39,7 @@ namespace test {
 template <typename InputType, typename WeightType, typename BiasType, typename OutputType>
 void QDQTransformerConvTests() {
  auto test_case = [&](const std::vector<int64_t>& input_shape, const std::vector<int64_t>& weights_shape) {
-    auto check_conv_graph = [&](InferenceSessionWrapper& session) {
+    auto check_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      if constexpr (std::is_same<InputType, OutputType>::value &&
                    std::is_same<BiasType, int32_t>::value &&
@ -57,7 +57,7 @@ void QDQTransformerConvTests() {
    };

    TransformerTester(BuildQDQConvTestCase<InputType, WeightType, BiasType, OutputType>(input_shape, weights_shape),
-                      check_conv_graph,
+                      check_graph,
                      TransformerLevel::Level1,
                      TransformerLevel::Level2,
                      12 /*opset_version*/,
@ -136,7 +136,7 @@ TEST(QDQTransformerTests, ConvMaxPoolReshape_UInt8) {
      builder.AddQuantizeLinearNode<uint8_t>(reshape_output, .0039f, 135, output_arg);
    };

-    auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
+    auto check_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      EXPECT_EQ(op_to_count["QLinearConv"], 1);
      EXPECT_EQ(op_to_count["MaxPool"], 1);
@ -146,7 +146,7 @@ TEST(QDQTransformerTests, ConvMaxPoolReshape_UInt8) {
    };

    TransformerTester(build_test_case,
-                      check_mp_reshape_graph,
+                      check_graph,
                      TransformerLevel::Level1,
                      TransformerLevel::Level2,
                      opset_version);
@ -197,7 +197,7 @@ TEST(QDQTransformerTests, ConvMaxPoolReshape_Int8) {
      }
    };

-    auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
+    auto check_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      EXPECT_EQ(op_to_count["QLinearConv"], 1);
      EXPECT_EQ(op_to_count["MaxPool"], 1);
@ -206,7 +206,7 @@ TEST(QDQTransformerTests, ConvMaxPoolReshape_Int8) {
      EXPECT_EQ(op_to_count["DequantizeLinear"], 0);
    };

-    TransformerTester(build_test_case, check_mp_reshape_graph, TransformerLevel::Level1, TransformerLevel::Level2);
+    TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2);
  };

  test_case({1, 12, 37}, {32, 12, 5});
@ -217,7 +217,7 @@ TEST(QDQTransformerTests, ConvMaxPoolReshape_Int8) {
 template <typename InputType, typename OutputType>
 void QDQTransformerAveragePoolTests() {
  auto test_case = [&](const std::vector<int64_t>& input_shape) {
-    auto check_averagepool_op_graph = [&](InferenceSessionWrapper& session) {
+    auto check_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      if constexpr (std::is_same<InputType, OutputType>::value) {
        EXPECT_EQ(op_to_count["com.microsoft.QLinearAveragePool"], 1);
@ -233,7 +233,7 @@ void QDQTransformerAveragePoolTests() {
    };

    TransformerTester(BuildQDQAveragePoolTestCase<InputType, OutputType>(input_shape),
-                      check_averagepool_op_graph,
+                      check_graph,
                      TransformerLevel::Level1,
                      TransformerLevel::Level2,
                      12 /*opset_version*/,
@ -266,52 +266,7 @@ TEST(QDQTransformerTests, AveragePool_U8S8) {
 template <typename Input1Type, typename Input2Type, typename OutputType>
 void QDQTransformerBinaryOpTests(const std::string& op_type) {
  auto test_case = [&](const std::vector<int64_t>& input_shape) {
-    auto build_test_case = [&](ModelTestBuilder& builder) {
-      auto* input1_arg = builder.MakeInput<float>(input_shape, -1.f, 1.f);
-      auto* input2_arg = builder.MakeInput<float>(input_shape, -1.f, 1.f);
-      auto* output_arg = builder.MakeOutput();
-
-      // add QDQ 1
-      auto* q1_output = builder.MakeIntermediate();
-      auto* dq1_output = builder.MakeIntermediate();
-      builder.AddQuantizeLinearNode<Input1Type>(input1_arg,
-                                                .004f,
-                                                std::numeric_limits<Input1Type>::max() / 2,
-                                                q1_output);
-      builder.AddDequantizeLinearNode<Input1Type>(q1_output,
-                                                  .0039f,
-                                                  std::numeric_limits<Input1Type>::max() / 2,
-                                                  dq1_output);
-
-      // add QDQ 2
-      auto* q2_output = builder.MakeIntermediate();
-      auto* dq2_output = builder.MakeIntermediate();
-      builder.AddQuantizeLinearNode<Input2Type>(input2_arg,
-                                                .004f,
-                                                std::numeric_limits<Input2Type>::max() / 2,
-                                                q2_output);
-      builder.AddDequantizeLinearNode<Input2Type>(q2_output,
-                                                  .0039f,
-                                                  std::numeric_limits<Input2Type>::max() / 2,
-                                                  dq2_output);
-
-      // add binary operator
-      auto* binary_op_output = builder.MakeIntermediate();
-      builder.AddNode(op_type, {dq1_output, dq2_output}, {binary_op_output});
-
-      // add QDQ output
-      auto* q3_output = builder.MakeIntermediate();
-      builder.AddQuantizeLinearNode<OutputType>(binary_op_output,
-                                                .0038f,
-                                                std::numeric_limits<OutputType>::max() / 2,
-                                                q3_output);
-      builder.AddDequantizeLinearNode<OutputType>(q3_output,
-                                                  .0039f,
-                                                  std::numeric_limits<OutputType>::max() / 2,
-                                                  output_arg);
-    };
-
-    auto check_binary_op_graph = [&](InferenceSessionWrapper& session) {
+    auto check_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      if (std::is_same<Input1Type, Input2Type>::value &&
          std::is_same<Input1Type, OutputType>::value) {
@ -327,8 +282,8 @@ void QDQTransformerBinaryOpTests(const std::string& op_type) {
      }
    };

-    TransformerTester(build_test_case,
-                      check_binary_op_graph,
+    TransformerTester(BuildBinaryOpTestCase<Input1Type, Input2Type, OutputType>(input_shape, op_type),
+                      check_graph,
                      TransformerLevel::Level1,
                      TransformerLevel::Level2,
                      12 /*opset_version*/,
@ -426,7 +381,7 @@ void QDQTransformerMatMulTests(bool has_output_q) {
      }
    };

-    auto check_binary_op_graph = [&](InferenceSessionWrapper& session) {
+    auto check_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      if (has_output_q) {
        if constexpr (std::is_same<Input1Type, OutputType>::value &&
@ -459,7 +414,7 @@ void QDQTransformerMatMulTests(bool has_output_q) {
    };

    TransformerTester(build_test_case,
-                      check_binary_op_graph,
+                      check_graph,
                      TransformerLevel::Level1,
                      TransformerLevel::Level2,
                      12 /*opset_version*/,
@ -696,14 +651,14 @@ TEST(QDQTransformerTests, Gather) {
      builder.AddQuantizeLinearNode<int8_t>(gather_output, .003f, 1, output_arg);
    };

-    auto check_matmul_graph = [&](InferenceSessionWrapper& session) {
+    auto check_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      EXPECT_EQ(op_to_count["Gather"], 1);
      EXPECT_EQ(op_to_count["QuantizeLinear"], 0);
      EXPECT_EQ(op_to_count["DequantizeLinear"], 0);
    };

-    TransformerTester(build_test_case, check_matmul_graph, TransformerLevel::Level1, TransformerLevel::Level2);
+    TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2);
  };

  test_case({12, 37}, {24, 12});
@ -728,14 +683,14 @@ TEST(QDQTransformerTests, Transpose) {
      builder.AddQuantizeLinearNode<int8_t>(transpose_output, .003f, 1, output_arg);
    };

-    auto check_matmul_graph = [&](InferenceSessionWrapper& session) {
+    auto check_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      EXPECT_EQ(op_to_count["Transpose"], 1);
      EXPECT_EQ(op_to_count["QuantizeLinear"], 0);
      EXPECT_EQ(op_to_count["DequantizeLinear"], 0);
    };

-    TransformerTester(build_test_case, check_matmul_graph, TransformerLevel::Level1, TransformerLevel::Level2);
+    TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2);
  };

  test_case({2, 13, 12, 37}, {0, 3, 1, 2});
@ -760,13 +715,13 @@ TEST(QDQTransformerTests, Transpose_No_Fusion) {
      builder.AddQuantizeLinearNode<int8_t>(transpose_output, .003f, 1, output_arg);
    };

-    auto check_matmul_graph = [&](InferenceSessionWrapper& session) {
+    auto check_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      EXPECT_EQ(op_to_count["QuantizeLinear"], 1);
      EXPECT_EQ(op_to_count["DequantizeLinear"], 1);
    };

-    TransformerTester(build_test_case, check_matmul_graph, TransformerLevel::Level1, TransformerLevel::Level2);
+    TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2);
  };

  test_case({2, 13, 12, 37}, {0, 3, 1, 2});
@ -775,7 +730,7 @@ TEST(QDQTransformerTests, Transpose_No_Fusion) {
 TEST(QDQTransformerTests, Resize) {
  auto test_case = [&](const std::vector<int64_t>& input1_shape,
                       const std::vector<int64_t>& sizes_shape) {
-    auto check_resize_graph = [&](InferenceSessionWrapper& session) {
+    auto check_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      EXPECT_EQ(op_to_count["Resize"], 1);
      EXPECT_EQ(op_to_count["QuantizeLinear"], 0);
@ -783,7 +738,7 @@ TEST(QDQTransformerTests, Resize) {
    };

    TransformerTester(BuildQDQResizeTestCase(input1_shape, sizes_shape),
-                      check_resize_graph,
+                      check_graph,
                      TransformerLevel::Level1,
                      TransformerLevel::Level2);
  };
@ -828,7 +783,7 @@ TEST(QDQTransformerTests, Resize_No_Fusion) {
      builder.AddQuantizeLinearNode<uint8_t>(resize_output, .003f, 1, output_arg);
    };

-    auto check_qdq_graph = [&](InferenceSessionWrapper& session) {
+    auto check_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      EXPECT_EQ(op_to_count["Resize"], 1);
      EXPECT_EQ(op_to_count["Concat"], 1);
@ -836,7 +791,7 @@ TEST(QDQTransformerTests, Resize_No_Fusion) {
      EXPECT_EQ(op_to_count["DequantizeLinear"], 1);
    };

-    TransformerTester(build_test_case, check_qdq_graph,
+    TransformerTester(build_test_case, check_graph,
                      TransformerLevel::Level1,
                      TransformerLevel::Level2);
  };
@ -867,7 +822,7 @@ TEST(QDQTransformerTests, ResizeReshape) {
      builder.AddNode("Reshape", {qdq_resize_output, reshape_shape}, {output_arg});
    };

-    auto check_qdq_graph = [&](InferenceSessionWrapper& session) {
+    auto check_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      EXPECT_EQ(op_to_count["Resize"], 1);
      EXPECT_EQ(op_to_count["Reshape"], 1);
@ -875,7 +830,7 @@ TEST(QDQTransformerTests, ResizeReshape) {
      EXPECT_EQ(op_to_count["DequantizeLinear"], 1);
    };

-    TransformerTester(build_test_case, check_qdq_graph,
+    TransformerTester(build_test_case, check_graph,
                      TransformerLevel::Level1,
                      TransformerLevel::Level2);
  };
@ -905,13 +860,13 @@ TEST(QDQTransformerTests, ArgMax) {
      argmax_node.AddAttribute("select_last_index", static_cast<int64_t>(select_last_index));
    };

-    auto check_argmax_graph = [&](InferenceSessionWrapper& session) {
+    auto check_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      EXPECT_EQ(op_to_count["ArgMax"], 1);
      EXPECT_EQ(op_to_count["DequantizeLinear"], 0);
    };

-    TransformerTester(build_test_case, check_argmax_graph,
+    TransformerTester(build_test_case, check_graph,
                      TransformerLevel::Level1,
                      TransformerLevel::Level2,
                      /* opset_version */ 13);
@ -939,14 +894,14 @@ TEST(QDQTransformerTests, QLinearMatMul) {
      builder.AddQuantizeLinearNode<uint8_t>(matmul_output, .0039f, 135, output_arg);
    };

-    auto check_matmul_graph = [&](InferenceSessionWrapper& session) {
+    auto check_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      EXPECT_EQ(op_to_count["QLinearMatMul"], 1);
      EXPECT_EQ(op_to_count["QuantizeLinear"], 2);
      EXPECT_EQ(op_to_count["DequantizeLinear"], 0);
    };

-    TransformerTester(build_test_case, check_matmul_graph, TransformerLevel::Level1, TransformerLevel::Level2);
+    TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2);
  };

  test_case({12, 37}, {37, 12});
@ -970,7 +925,7 @@ TEST(QDQTransformerTests, MatMul_No_Fusion) {
      builder.AddQuantizeLinearNode<uint8_t>(matmul_output, .0039f, 135, output_arg);
    };

-    auto check_matmul_graph = [&](InferenceSessionWrapper& session) {
+    auto check_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      EXPECT_EQ(op_to_count["MatMul"], 1);
      EXPECT_EQ(op_to_count["QLinearMatMul"], 0);
@ -978,7 +933,7 @@ TEST(QDQTransformerTests, MatMul_No_Fusion) {
      EXPECT_EQ(op_to_count["DequantizeLinear"], 1);
    };

-    TransformerTester(build_test_case, check_matmul_graph, TransformerLevel::Level1, TransformerLevel::Level2);
+    TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2);
  };

  test_case({12, 37}, {37, 12});
@ -1006,7 +961,7 @@ TEST(QDQTransformerTests, MatMul_1st_Input_Int8) {
      builder.AddQuantizeLinearNode<uint8_t>(matmul_output, .0039f, 135, output_arg);
    };

-    auto check_matmul_graph = [&](InferenceSessionWrapper& session) {
+    auto check_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      EXPECT_EQ(op_to_count["MatMul"], 1);
      EXPECT_EQ(op_to_count["QLinearMatMul"], 0);
@ -1014,7 +969,7 @@ TEST(QDQTransformerTests, MatMul_1st_Input_Int8) {
      EXPECT_EQ(op_to_count["DequantizeLinear"], 2);
    };

-    TransformerTester(build_test_case, check_matmul_graph, TransformerLevel::Level1, TransformerLevel::Level2);
+    TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2);
  };

  test_case({12, 37}, {37, 12});
@ -1043,7 +998,7 @@ TEST(QDQTransformerTests, MatMulIntegerToFloat) {
      builder.AddNode("MatMul", {dq_output_1, dq_output_2}, {output_arg});
    };

-    auto check_matmul_graph = [&](InferenceSessionWrapper& session) {
+    auto check_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      EXPECT_EQ(op_to_count["com.microsoft.MatMulIntegerToFloat"], 1);
      EXPECT_EQ(op_to_count["QuantizeLinear"], 0);
@ -1051,7 +1006,7 @@ TEST(QDQTransformerTests, MatMulIntegerToFloat) {
    };

    TransformerTester(build_test_case,
-                      check_matmul_graph,
+                      check_graph,
                      TransformerLevel::Level1,
                      TransformerLevel::Level2,
                      12 /*opset_version*/,
@ -1086,7 +1041,7 @@ TEST(QDQTransformerTests, ConvRelu) {
      builder.AddQuantizeLinearNode<uint8_t>(relu_output, .0039f, is_zp_zero ? 0 : 1, output_arg);
    };

-    auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
+    auto check_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      if (is_zp_zero) {
        EXPECT_EQ(op_to_count["QLinearConv"], 1);
@ -1104,7 +1059,7 @@ TEST(QDQTransformerTests, ConvRelu) {
      }
    };

-    TransformerTester(build_test_case, check_mp_reshape_graph, TransformerLevel::Level1, TransformerLevel::Level2);
+    TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2);
  };

  test_case({1, 12, 37}, {32, 12, 5}, true);
@ -1150,7 +1105,7 @@ TEST(QDQTransformerTests, ConvAveragePoolReshape_UInt8) {
      builder.AddDequantizeLinearNode<uint8_t>(q_output, .0035f, 135, output_arg);
    };

-    auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
+    auto check_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      EXPECT_EQ(op_to_count["QLinearConv"], 1);
      EXPECT_EQ(op_to_count["com.microsoft.QLinearAveragePool"], 1);
@ -1160,7 +1115,7 @@ TEST(QDQTransformerTests, ConvAveragePoolReshape_UInt8) {
    };

    TransformerTester(build_test_case,
-                      check_mp_reshape_graph,
+                      check_graph,
                      TransformerLevel::Level1,
                      TransformerLevel::Level2,
                      12 /*opset_version*/,
@ -1213,7 +1168,7 @@ TEST(QDQTransformerTests, ConvAveragePoolReshape_Int8) {
      }
    };

-    auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
+    auto check_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      EXPECT_EQ(op_to_count["QLinearConv"], 1);
      EXPECT_EQ(op_to_count["com.microsoft.QLinearAveragePool"], 1);
@ -1223,7 +1178,7 @@ TEST(QDQTransformerTests, ConvAveragePoolReshape_Int8) {
    };

    TransformerTester(build_test_case,
-                      check_mp_reshape_graph,
+                      check_graph,
                      TransformerLevel::Level1,
                      TransformerLevel::Level2,
                      12 /*opset_version*/,
@ -1277,7 +1232,7 @@ TEST(QDQTransformerTests, ConvAveragePoolReshape_Int8_Fail) {
      }
    };

-    auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
+    auto check_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      EXPECT_EQ(op_to_count["Conv"], 1);
      EXPECT_EQ(op_to_count["QLinearConv"], 0);
@ -1288,7 +1243,7 @@ TEST(QDQTransformerTests, ConvAveragePoolReshape_Int8_Fail) {
    };

    TransformerTester(build_test_case,
-                      check_mp_reshape_graph,
+                      check_graph,
                      TransformerLevel::Level1,
                      TransformerLevel::Level2,
                      12 /*opset_version*/,
@ -1325,7 +1280,7 @@ void QDQTransformerLeakyReluTests() {
                                                  output_arg);
    };

-    auto check_binary_op_graph = [&](InferenceSessionWrapper& session) {
+    auto check_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      if constexpr (std::is_same<InputType, OutputType>::value) {
        EXPECT_EQ(op_to_count["com.microsoft.QLinearLeakyRelu"], 1);
@ -1341,7 +1296,7 @@ void QDQTransformerLeakyReluTests() {
    };

    TransformerTester(build_test_case,
-                      check_binary_op_graph,
+                      check_graph,
                      TransformerLevel::Level1,
                      TransformerLevel::Level2,
                      12 /*opset_version*/,
@ -1401,7 +1356,7 @@ TEST(QDQTransformerTests, ConvTranspose_QBackward) {
      }
    };

-    auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
+    auto check_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      EXPECT_EQ(op_to_count["QLinearConv"], 1);
      EXPECT_EQ(op_to_count["Transpose"], 1);
@ -1410,7 +1365,7 @@ TEST(QDQTransformerTests, ConvTranspose_QBackward) {
    };

    TransformerTester(build_test_case,
-                      check_mp_reshape_graph,
+                      check_graph,
                      TransformerLevel::Level1,
                      TransformerLevel::Level2);
  };
@ -1461,7 +1416,7 @@ TEST(QDQTransformerTests, QBackward_MutilpleSteps) {
      }
    };

-    auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
+    auto check_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      EXPECT_EQ(op_to_count["QLinearConv"], 1);
      EXPECT_EQ(op_to_count["MaxPool"], 1);
@ -1472,7 +1427,7 @@ TEST(QDQTransformerTests, QBackward_MutilpleSteps) {
    };

    TransformerTester(build_test_case,
-                      check_mp_reshape_graph,
+                      check_graph,
                      TransformerLevel::Level1,
                      TransformerLevel::Level2);
  };
@ -1512,7 +1467,7 @@ TEST(QDQTransformerTests, ConvTranspose_DQForward) {
      }
    };

-    auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
+    auto check_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      EXPECT_EQ(op_to_count["QLinearConv"], 1);
      EXPECT_EQ(op_to_count["Transpose"], 1);
@ -1521,7 +1476,7 @@ TEST(QDQTransformerTests, ConvTranspose_DQForward) {
    };

    TransformerTester(build_test_case,
-                      check_mp_reshape_graph,
+                      check_graph,
                      TransformerLevel::Level1,
                      TransformerLevel::Level2);
  };
@ -1572,7 +1527,7 @@ TEST(QDQTransformerTests, DQForward_MutilpleSteps) {
      }
    };

-    auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
+    auto check_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      EXPECT_EQ(op_to_count["QLinearConv"], 1);
      EXPECT_EQ(op_to_count["MaxPool"], 1);
@ -1583,7 +1538,7 @@ TEST(QDQTransformerTests, DQForward_MutilpleSteps) {
    };

    TransformerTester(build_test_case,
-                      check_mp_reshape_graph,
+                      check_graph,
                      TransformerLevel::Level1,
                      TransformerLevel::Level2);
  };
@ -1704,7 +1659,7 @@ TEST(QDQTransformerTests, Concat) {
      }
    };

-    auto check_mp_reshape_graph = [&input_shapes, &has_input_float, &has_input_int8, &has_output_int8](InferenceSessionWrapper& session) {
+    auto check_graph = [&input_shapes, &has_input_float, &has_input_int8, &has_output_int8](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      if (has_input_float || has_input_int8 || has_output_int8) {
        EXPECT_EQ(op_to_count["com.microsoft.QLinearConcat"], 0);
@ -1716,7 +1671,7 @@ TEST(QDQTransformerTests, Concat) {
    };

    TransformerTester(build_test_case,
-                      check_mp_reshape_graph,
+                      check_graph,
                      TransformerLevel::Level1,
                      TransformerLevel::Level2,
                      12 /*opset_version*/,
@ -1763,7 +1718,7 @@ TEST(QDQTransformerTests, QDQPropagation_QDQCancelOut) {
      builder.AddNode("Reshape", {maxpool_output, reshape_shape}, {output_arg});
    };

-    auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
+    auto check_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      EXPECT_EQ(op_to_count["MaxPool"], 1);
      EXPECT_EQ(op_to_count["Reshape"], 1);
@ -1773,7 +1728,7 @@ TEST(QDQTransformerTests, QDQPropagation_QDQCancelOut) {
    };

    TransformerTester(build_test_case,
-                      check_mp_reshape_graph,
+                      check_graph,
                      TransformerLevel::Level1,
                      TransformerLevel::Level2);
  };
@ -1799,7 +1754,7 @@ TEST(QDQTransformerTests, QDQPropagation_QDQ_CancelOut_More) {
      builder.AddQuantizeLinearNode<uint8_t>(reshape_output, same_scale ? .004f : .0039f, same_zp ? 129 : 128, output_arg);
    };

-    auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
+    auto check_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      EXPECT_EQ(op_to_count["Reshape"], 1);
      EXPECT_EQ(op_to_count["QuantizeLinear"], same_scale && same_zp ? 1 : 2);
@ -1807,7 +1762,7 @@ TEST(QDQTransformerTests, QDQPropagation_QDQ_CancelOut_More) {
    };

    TransformerTester(build_test_case,
-                      check_mp_reshape_graph,
+                      check_graph,
                      TransformerLevel::Level1,
                      TransformerLevel::Level2);
  };
@ -1833,7 +1788,7 @@ TEST(QDQTransformerTests, QDQPropagation_Q_No_Parent) {
      builder.AddQuantizeLinearNode<uint8_t>(transpose_output, .0035f, 135, output_arg);
    };

-    auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
+    auto check_graph = [&](InferenceSessionWrapper& session) {
      GraphViewer graph_viewer(session.GetGraph());
      const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder();
      EXPECT_EQ(graph_viewer.GetNode(node_topology_list[0])->OpType(), "QuantizeLinear");
@ -1841,7 +1796,7 @@ TEST(QDQTransformerTests, QDQPropagation_Q_No_Parent) {
    };

    TransformerTester(build_test_case,
-                      check_mp_reshape_graph,
+                      check_graph,
                      TransformerLevel::Level1,
                      TransformerLevel::Level2);
  };
@ -1866,7 +1821,7 @@ TEST(QDQTransformerTests, QDQPropagation_DQ_No_Children) {
      transpose_node.AddAttribute("perm", perms);
    };

-    auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
+    auto check_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      GraphViewer graph_viewer(session.GetGraph());
      const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder();
@ -1875,7 +1830,7 @@ TEST(QDQTransformerTests, QDQPropagation_DQ_No_Children) {
    };

    TransformerTester(build_test_case,
-                      check_mp_reshape_graph,
+                      check_graph,
                      TransformerLevel::Level1,
                      TransformerLevel::Level2);
  };
@ -1902,7 +1857,7 @@ TEST(QDQTransformerTests, QDQPropagation_Per_Layer_No_Propagation) {
      transpose_node.AddAttribute("perm", perms);
    };

-    auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
+    auto check_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      GraphViewer graph_viewer(session.GetGraph());
      const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder();
@ -1911,7 +1866,7 @@ TEST(QDQTransformerTests, QDQPropagation_Per_Layer_No_Propagation) {
    };

    TransformerTester(build_test_case,
-                      check_mp_reshape_graph,
+                      check_graph,
                      TransformerLevel::Level1,
                      TransformerLevel::Level2);
  };
@ -1935,14 +1890,14 @@ TEST(QDQTransformerTests, QDQPropagation_DQ_Q) {
      builder.AddQuantizeLinearNode<uint8_t>(dq_output, .0035f, 135, output_arg);
    };

-    auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
+    auto check_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      EXPECT_EQ(op_to_count["QuantizeLinear"], 1);
      EXPECT_EQ(op_to_count["DequantizeLinear"], 1);
    };

    TransformerTester(build_test_case,
-                      check_mp_reshape_graph,
+                      check_graph,
                      TransformerLevel::Level1,
                      TransformerLevel::Level2);
  };
--- a/onnxruntime/test/providers/nnapi/nnapi_basic_test.cc
+++ b/onnxruntime/test/providers/nnapi/nnapi_basic_test.cc
@ -271,7 +271,9 @@ TEST(NnapiExecutionProviderTest, TestNoShapeInputModel) {
      << "No node should be taken by the NNAPI EP";
 }

-static void RunQDQModelTest(const GetQDQTestCaseFn& build_test_case, const char* test_description) {
+static void RunQDQModelTest(const GetQDQTestCaseFn& build_test_case,
+                            const char* test_description,
+                            const EPVerificationParams& params = EPVerificationParams()) {
  onnxruntime::Model model(test_description, false, DefaultLoggingManager().DefaultLogger());
  Graph& graph = model.MainGraph();
  ModelTestBuilder helper(graph);
@ -286,7 +288,7 @@ static void RunQDQModelTest(const GetQDQTestCaseFn& build_test_case, const char*
 #if defined(__ANDROID__)
  RunAndVerifyOutputsWithEP(model_data, "NnapiExecutionProviderTest.TestQDQModel",
                            std::make_unique<NnapiExecutionProvider>(0),
-                            helper.feeds_);
+                            helper.feeds_, params);
 #else
  // test load only
  SessionOptions so;
@ -306,7 +308,8 @@ TEST(NnapiExecutionProviderTest, TestQDQConv) {
                                       uint8_t /* OutputType */>(
                      {1, 1, 5, 5} /*input_shape*/,
                      {1, 1, 3, 3} /*weights_shape*/),
-                  "nnapi_qdq_test_graph_conv");
+                  "nnapi_qdq_test_graph_conv",
+                  {true /* verify_entire_graph_use_ep */});
 }

 TEST(NnapiExecutionProviderTest, TestQDQResize) {
@ -316,14 +319,44 @@ TEST(NnapiExecutionProviderTest, TestQDQResize) {
                                         {1, 3, 32, 32} /* sizes_data */,
                                         "linear" /* mode */,
                                         "asymmetric" /* coordinate_transformation_mode */),
-                  "nnapi_qdq_test_graph_resize");
+                  "nnapi_qdq_test_graph_resize",
+                  {true /* verify_entire_graph_use_ep */});
 }

 TEST(NnapiExecutionProviderTest, TestQDQAveragePool) {
+  // NNAPI uses different rounding, which may cause ~1% difference in the result
  RunQDQModelTest(BuildQDQAveragePoolTestCase<uint8_t /* InputType */,
                                              uint8_t /* OutputType */>(
                      {1, 3, 32, 32} /* input_shape */),
-                  "nnapi_qdq_test_graph_averagepool");
+                  "nnapi_qdq_test_graph_averagepool",
+                  {
+                      true /* verify_entire_graph_use_ep */,
+                      1e-2f /* fp32_abs_err */,
+                  });
+}
+
+TEST(NnapiExecutionProviderTest, TestQDQAdd) {
+  RunQDQModelTest(BuildBinaryOpTestCase<uint8_t /* Input1Type */,
+                                        uint8_t /* Input2Type */,
+                                        uint8_t /* OutputType */>(
+                      {1, 23, 13, 13} /* input_shape */,
+                      "Add" /* op_type */),
+                  "nnapi_qdq_test_graph_add",
+                  {true /* verify_entire_graph_use_ep */});
+}
+
+TEST(NnapiExecutionProviderTest, TestQDQMul) {
+  // NNAPI uses different rounding, which may cause ~1% difference in the result
+  RunQDQModelTest(BuildBinaryOpTestCase<uint8_t /* Input1Type */,
+                                        uint8_t /* Input2Type */,
+                                        uint8_t /* OutputType */>(
+                      {1, 23, 13, 13} /* input_shape */,
+                      "Mul" /* op_type */),
+                  "nnapi_qdq_test_graph_mul",
+                  {
+                      true /* verify_entire_graph_use_ep */,
+                      1e-2f /* fp32_abs_err */,
+                  });
 }

 #endif  // !(ORT_MINIMAL_BUILD)
--- a/onnxruntime/test/util/include/test_utils.h
+++ b/onnxruntime/test/util/include/test_utils.h
@ -15,6 +15,18 @@ class Graph;

 namespace test {

+// struct to hold some verification params for RunAndVerifyOutputsWithEP
+struct EPVerificationParams {
+  // If true, verify that the entire graph is taken by the EP.
+  // If false, only verify that at least one node is assigned to 'execution_provider'.
+  bool verify_entire_graph_use_ep{false};
+
+  // Some EPs may use different rounding than the ORT CPU EP, which may cause a bigger abs error than
+  // the default of 1e-5f, especially for scenarios such as [Q -> Quantized op -> DQ].
+  // Override this only when necessary.
+  float fp32_abs_err = 1e-5f;
+};
+
 // return number of nodes in the Graph and any subgraphs that are assigned to the specified execution provider
 int CountAssignedNodes(const Graph& current_graph, const std::string& ep_type);

@ -23,13 +35,14 @@ int CountAssignedNodes(const Graph& current_graph, const std::string& ep_type);
 void RunAndVerifyOutputsWithEP(const ORTCHAR_T* model_path,
                               const char* log_id,
                               std::unique_ptr<IExecutionProvider> execution_provider,
-                               const NameMLValMap& feeds);
+                               const NameMLValMap& feeds,
+                               const EPVerificationParams& params = EPVerificationParams());

 // helper function that takes in model_data
-// used in nnapi qdq model tests
 void RunAndVerifyOutputsWithEP(const std::string& model_data,
                               const char* log_id,
                               std::unique_ptr<IExecutionProvider> execution_provider,
-                               const NameMLValMap& feeds);
+                               const NameMLValMap& feeds,
+                               const EPVerificationParams& params = EPVerificationParams());
 }  // namespace test
 }  // namespace onnxruntime
--- a/onnxruntime/test/util/test_utils.cc
+++ b/onnxruntime/test/util/test_utils.cc
@ -18,7 +18,8 @@ namespace onnxruntime {
 namespace test {
 static void VerifyOutputs(const std::vector<std::string>& output_names,
                          const std::vector<OrtValue>& expected_fetches,
-                          const std::vector<OrtValue>& fetches) {
+                          const std::vector<OrtValue>& fetches,
+                          const EPVerificationParams& params) {
  ASSERT_EQ(expected_fetches.size(), fetches.size());

  for (size_t i = 0, end = expected_fetches.size(); i < end; ++i) {
@ -40,10 +41,8 @@ static void VerifyOutputs(const std::vector<std::string>& output_names,
            << " mismatch for " << output_names[i];
        break;
      case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: {
-        constexpr float abs_err = 1e-5f;
-
        EXPECT_THAT(ltensor.DataAsSpan<float>(),
-                    ::testing::Pointwise(::testing::FloatNear(abs_err), rtensor.DataAsSpan<float>()));
+                    ::testing::Pointwise(::testing::FloatNear(params.fp32_abs_err), rtensor.DataAsSpan<float>()));
        break;
      }
      default:
@ -72,16 +71,18 @@ int CountAssignedNodes(const Graph& current_graph, const std::string& ep_type) {

 void RunAndVerifyOutputsWithEP(const ORTCHAR_T* model_path, const char* log_id,
                               std::unique_ptr<IExecutionProvider> execution_provider,
-                               const NameMLValMap& feeds) {
+                               const NameMLValMap& feeds,
+                               const EPVerificationParams& params) {
  // read raw data from model provided by the model_path
  std::ifstream stream(model_path, std::ios::in | std::ios::binary);
  std::string model_data((std::istreambuf_iterator<char>(stream)), std::istreambuf_iterator<char>());
-  RunAndVerifyOutputsWithEP(model_data, log_id, std::move(execution_provider), feeds);
+  RunAndVerifyOutputsWithEP(model_data, log_id, std::move(execution_provider), feeds, params);
 }

 void RunAndVerifyOutputsWithEP(const std::string& model_data, const char* log_id,
                               std::unique_ptr<IExecutionProvider> execution_provider,
-                               const NameMLValMap& feeds) {
+                               const NameMLValMap& feeds,
+                               const EPVerificationParams& params) {
  SessionOptions so;
  so.session_logid = log_id;
  RunOptions run_options;
@ -122,12 +123,17 @@ void RunAndVerifyOutputsWithEP(const std::string& model_data, const char* log_id
  // make sure that some nodes are assigned to the EP, otherwise this test is pointless...
  const auto& graph2 = session_object2.GetGraph();
  auto ep_nodes = CountAssignedNodes(graph2, provider_type);
-  ASSERT_GT(ep_nodes, 0) << "No nodes were assigned to " << provider_type;
+  if (params.verify_entire_graph_use_ep) {
+    // Verify the entire graph is assigned to the EP
+    ASSERT_EQ(ep_nodes, graph2.NumberOfNodes()) << "Not all nodes were assigned to " << provider_type;
+  } else {
+    ASSERT_GT(ep_nodes, 0) << "No nodes were assigned to " << provider_type;
+  }

  // Run with EP and verify the result
  std::vector<OrtValue> fetches;
  ASSERT_STATUS_OK(session_object2.Run(run_options, feeds, output_names, &fetches));
-  VerifyOutputs(output_names, expected_fetches, fetches);
+  VerifyOutputs(output_names, expected_fetches, fetches, params);
 }

 #if !defined(DISABLE_SPARSE_TENSORS)