[Android NNAPI EP] Add QLinearAdd op Support, move some throw with return status (#4607)

* remove dependency of external jd-dnnlibrary * add qlinearadd support * combine some qlinear ops logics, move some throw into return status * merge master * minor bug fixes * addressed comments
2026-07-19 19:00:47 +00:00 · 2020-07-30 11:45:11 -07:00 · 2020-07-30 11:45:11 -07:00 · 282975aefb
commit 282975aefb
parent 51332e3c81
6 changed files with 391 additions and 193 deletions
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc
@ -9,6 +9,9 @@

 #include "helper.h"

+namespace onnxruntime {
+namespace nnapi {
+
 using std::string;
 using std::vector;

@ -40,6 +43,28 @@ std::string GetErrorCause(int error_code) {
  }
 }

+QLinearOpType GetQLinearOpType(const onnxruntime::Node& node) {
+  const auto& op_type = node.OpType();
+  if (op_type == "DequantizeLinear")
+    return QLinearOpType::DequantizeLinear;
+  else if (op_type == "QuantizeLinear")
+    return QLinearOpType::QuantizeLinear;
+  else if (op_type == "QLinearConv")
+    return QLinearOpType::QLinearConv;
+  else if (op_type == "QLinearMatMul")
+    return QLinearOpType::QLinearMatMul;
+  else if (op_type == "QLinearAdd")
+    return QLinearOpType::QLinearAdd;
+
+  return QLinearOpType::Unknown;
+}
+
+bool IsQLinearBinaryOp(QLinearOpType qlinear_op_type) {
+  return qlinear_op_type == QLinearOpType::QLinearConv ||
+         qlinear_op_type == QLinearOpType::QLinearMatMul ||
+         qlinear_op_type == QLinearOpType::QLinearAdd;
+}
+
 NodeAttrHelper::NodeAttrHelper(const onnxruntime::Node& node)
    : node_attributes_(node.GetAttributes()) {}

@ -97,3 +122,6 @@ vector<float> NodeAttrHelper::Get(const std::string& key, const vector<float>& d
 bool NodeAttrHelper::HasAttr(const std::string& key) const {
  return Contains(node_attributes_, key);
 }
+
+}  // namespace nnapi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h
@ -8,6 +8,9 @@

 #include "core/providers/nnapi/nnapi_builtin/nnapi_lib/NeuralNetworksTypes.h"

+namespace onnxruntime {
+namespace nnapi {
+
 #define THROW_ON_ERROR(val)                                               \
  {                                                                       \
    const auto ret = (val);                                               \
@ -36,12 +39,31 @@ inline bool Contains(const Map& map, const Key& key) {

 std::string GetErrorCause(int error_code);

+enum class QLinearOpType : uint8_t {
+  Unknown,  // Unknown or not a linear quantized op
+  DequantizeLinear,
+  QuantizeLinear,
+  QLinearConv,
+  QLinearMatMul,
+  QLinearAdd,
+  // Not yet supported
+  // QLinearAveragePool,
+  // QLinearMul,
+  // QLinearReduceMean,
+};
+
+QLinearOpType GetQLinearOpType(const onnxruntime::Node& node);
+
+// Returns true if this qlinear op is an operator that takes 2 inputs and produces 1 output
+// Such as QLinearConv, QLinearMatMul, QLinearAdd, ...
+bool IsQLinearBinaryOp(QLinearOpType qlinear_op_type);
+
 /**
 * Wrapping onnxruntime::Node for retrieving attribute values
 */
 class NodeAttrHelper {
 public:
-  NodeAttrHelper(const onnxruntime::Node& proto);
+  NodeAttrHelper(const onnxruntime::Node& node);

  float Get(const std::string& key, float def_val) const;
  int32_t Get(const std::string& key, int32_t def_val) const;
@ -54,3 +76,6 @@ class NodeAttrHelper {
 private:
  const onnxruntime::NodeAttributes& node_attributes_;
 };
+
+}  // namespace nnapi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc
@ -192,8 +192,8 @@ std::unordered_map<std::string, vector<const Node*>> GetAllQuantizedOpInputs(con
  const auto& node_indices = graph_view.GetNodesInTopologicalOrder();
  for (const auto& node_idx : node_indices) {
    const auto* node(graph_view.GetNode(node_idx));
-    const auto& op_type = node->OpType();
-    if (op_type == "DequantizeLinear" || op_type == "QLinearMatMul" || op_type == "QLinearConv") {
+    auto qlinear_op_type = GetQLinearOpType(*node);
+    if (qlinear_op_type == QLinearOpType::DequantizeLinear || IsQLinearBinaryOp(qlinear_op_type)) {
      const auto& input_name = node->InputDefs()[0]->Name();
      if (Contains(all_quantized_op_inputs, input_name))
        all_quantized_op_inputs.at(input_name).push_back(node);
@ -201,7 +201,7 @@ std::unordered_map<std::string, vector<const Node*>> GetAllQuantizedOpInputs(con
        all_quantized_op_inputs.emplace(input_name, vector<const Node*>{node});
    }

-    if (op_type == "QLinearMatMul" || op_type == "QLinearConv") {
+    if (IsQLinearBinaryOp(qlinear_op_type)) {
      const auto& input_name = node->InputDefs()[3]->Name();
      if (Contains(all_quantized_op_inputs, input_name))
        all_quantized_op_inputs.at(input_name).push_back(node);
@ -328,8 +328,8 @@ void ModelBuilder::RegisterModelInputs() {
          }

          // TODO, verify the scale and zero point match if there are multiple op using same input
-          std::tie(scale, zero_point) =
-              GetQuantizedInputScaleAndZeroPoint(*this, *all_quantized_op_inputs.at(input_name)[0], input_name);
+          ORT_THROW_IF_ERROR(GetQuantizedInputScaleAndZeroPoint(
+              *this, *all_quantized_op_inputs.at(input_name)[0], input_name, scale, zero_point));
          break;
        }
        default:
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc
@ -155,7 +155,9 @@ static void AddBinaryOperator(int32_t op_type,
                              const std::string& input2,
                              int32_t fuse_code,
                              const std::string& output,
-                              bool output_is_nhwc) {
+                              bool output_is_nhwc,
+                              float output_scale = 0.0f,
+                              int32_t output_zero_point = 0) {
  auto& shaper(model_builder.GetShaper());
  const auto& operand_indices(model_builder.GetOperandIndices());
  const auto& operand_types(model_builder.GetOperandTypes());
@ -165,7 +167,7 @@ static void AddBinaryOperator(int32_t op_type,
  input_indices.push_back(operand_indices.at(input2));  // input 2
  input_indices.push_back(model_builder.AddOperandFromScalar(fuse_code));
  shaper.Eltwise(input1, input2, output);
-  const OperandType output_operand_type(operand_types.at(input1).type, shaper[output]);
+  const OperandType output_operand_type(operand_types.at(input1).type, shaper[output], output_scale, output_zero_point);
  model_builder.AddOperation(op_type, input_indices, {output}, {output_operand_type}, {output_is_nhwc});
 }

@ -441,39 +443,101 @@ static float GetQuantizationScale(const ModelBuilder& model_builder, const Node&
  return GetTensorFloatData(scale_tensor)[0];
 }

-static int32_t GetQuantizationZeroPoint(const ModelBuilder& model_builder, const Node& node, size_t idx) {
+static Status GetQuantizationZeroPoint(const ModelBuilder& model_builder, const Node& node, size_t idx, int32_t& zero_point) {
  std::unique_ptr<uint8_t[]> unpacked_tensor;
  size_t tensor_byte_size;
  const auto& zero_point_tensor = model_builder.GetInitializerTensors().at(node.InputDefs()[idx]->Name());
-  ORT_THROW_IF_ERROR(
+  ORT_RETURN_IF_ERROR(
      UnpackInitializerTensor(zero_point_tensor, unpacked_tensor, tensor_byte_size));
-  return static_cast<int32_t>(unpacked_tensor.get()[0]);
+  zero_point = static_cast<int32_t>(unpacked_tensor.get()[0]);
+  return Status::OK();
 }

-static void VerifyValidInputQuantizedType(const std::string& input_name,
-                                          const OperandType& input_operand_type,
-                                          float scale, int32_t zero_point) {
-  ORT_ENFORCE(input_operand_type.operandType.scale == scale,
-              "Input [" + input_name + "] NNAPI input: " + " scale: " +
-                  std::to_string(input_operand_type.operandType.scale) +
-                  ", ONNX input scale: " + std::to_string(scale));
+// Get scales and zero points for the qlinear binary ops (which have 2 inputs and 1 output)
+// QLinearConv, QLinearMatMul, QLinearAdd
+// a, b are inputs, and y is output
+static Status GetBinaryOpQuantizationScaleAndZeroPoint(const ModelBuilder& model_builder, const Node& node,
+                                                       float& a_scale, float& b_scale, float& y_scale,
+                                                       int32_t& a_zero_point, int32_t& b_zero_point, int32_t& y_zero_point) {
+  a_scale = GetQuantizationScale(model_builder, node, 1);
+  b_scale = GetQuantizationScale(model_builder, node, 4);
+  y_scale = GetQuantizationScale(model_builder, node, 6);

-  ORT_ENFORCE(input_operand_type.operandType.zeroPoint == zero_point,
-              "Input [" + input_name + "] NNNAPI input zero point: " +
-                  std::to_string(input_operand_type.operandType.zeroPoint) +
-                  ", ONNX input zero point: " + std::to_string(zero_point));
+  ORT_RETURN_IF_ERROR(GetQuantizationZeroPoint(model_builder, node, 2, a_zero_point));
+  ORT_RETURN_IF_ERROR(GetQuantizationZeroPoint(model_builder, node, 5, b_zero_point));
+  ORT_RETURN_IF_ERROR(GetQuantizationZeroPoint(model_builder, node, 7, y_zero_point));
+
+  return Status::OK();
 }

-std::pair<float, int32_t> GetQuantizedInputScaleAndZeroPoint(const ModelBuilder& model_builder,
-                                                             const Node& node,
-                                                             const std::string& input_name) {
+// NNAPI has the quantization scale and zero point embedded in the ANeuralNetworksOperandType
+// ONNX has the quantization scale and zero point as the inputs of the qlinear operators
+// We want to verify that the scale and zero point of the ONNX inputs match the values embedded in the NNAPI inputs
+static Status IsValidInputQuantizedType(const ModelBuilder& model_builder,
+                                        const std::string& input_name,
+                                        float scale,
+                                        int32_t zero_point) {
+  const OperandType& input_operand_type = model_builder.GetOperandTypes().at(input_name);
+  if (input_operand_type.operandType.scale != scale) {
+    return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT,
+                  "Input [" + input_name + "] NNAPI input scale: " +
+                      std::to_string(input_operand_type.operandType.scale) +
+                      ", ONNX input scale: " + std::to_string(scale));
+  }
+
+  if (input_operand_type.operandType.zeroPoint != zero_point) {
+    return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT,
+                  "Input [" + input_name + "] NNNAPI input zero point: " +
+                      std::to_string(input_operand_type.operandType.zeroPoint) +
+                      ", ONNX input zero point: " + std::to_string(zero_point));
+  }
+
+  return Status::OK();
+}
+
+static void AddBinaryOpQuantizationScaleAndZeroPointToSkip(ModelBuilder& model_builder, const Node& node) {
+  const auto input_defs(node.InputDefs());
+  model_builder.AddInitializerToSkip(input_defs[1]->Name());  // a_scale
+  model_builder.AddInitializerToSkip(input_defs[2]->Name());  // a_zero_point
+  model_builder.AddInitializerToSkip(input_defs[4]->Name());  // b_scale
+  model_builder.AddInitializerToSkip(input_defs[5]->Name());  // b_zero_point
+  model_builder.AddInitializerToSkip(input_defs[6]->Name());  // y_scale
+  model_builder.AddInitializerToSkip(input_defs[7]->Name());  // y_zero_point
+}
+
+static bool IsBinaryOpQuantizedInputsSupported(const Node& node) {
+  int32_t a_input_type, b_input_type;
+  if (!GetType(*node.InputDefs()[0], a_input_type))
+    return false;
+  if (!GetType(*node.InputDefs()[3], b_input_type))
+    return false;
+
+  if (a_input_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8 || a_input_type != b_input_type) {
+    LOGS_DEFAULT(VERBOSE) << "[" << node.OpType()
+                          << "] A Input type: [" << a_input_type
+                          << "] B Input type: [" << b_input_type
+                          << "] is not supported for now";
+    return false;
+  }
+
+  return true;
+}
+
+Status GetQuantizedInputScaleAndZeroPoint(const ModelBuilder& model_builder,
+                                          const Node& node,
+                                          const std::string& input_name,
+                                          float& scale,
+                                          int32_t& zero_point) {
  const auto& op_type = node.OpType();
-  assert(op_type == "QLinearMatMul" || op_type == "QLinearConv" || op_type == "DequantizeLinear");
+  auto qlinear_op_type = GetQLinearOpType(node);
+  assert(qlinear_op_type != QLinearOpType::Unknown &&
+         qlinear_op_type != QLinearOpType::QuantizeLinear);
+
  size_t scale_idx, zero_point_idx;
-  if (op_type == "DequantizeLinear") {
+  if (qlinear_op_type == QLinearOpType::DequantizeLinear) {
    scale_idx = 1;
    zero_point_idx = 2;
-  } else if (op_type == "QLinearMatMul" || op_type == "QLinearConv") {
+  } else if (IsQLinearBinaryOp(qlinear_op_type)) {
    const auto input_defs(node.InputDefs());
    if (input_name == input_defs[0]->Name()) {
      scale_idx = 1;
@ -482,19 +546,20 @@ std::pair<float, int32_t> GetQuantizedInputScaleAndZeroPoint(const ModelBuilder&
      scale_idx = 4;
      zero_point_idx = 5;
    } else {
-      ORT_THROW("Unknown input: " + input_name + ", for op: " + op_type);
+      return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT,
+                    "Unknown input: " + input_name + ", for op: " + op_type);
    }
  } else {
-    ORT_THROW("Unsupported op: " + op_type);
+    return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Unsupported op: " + op_type);
  }

-  float scale = GetQuantizationScale(model_builder, node, scale_idx);
-  int32_t zero_point = 0;
+  scale = GetQuantizationScale(model_builder, node, scale_idx);
+  zero_point = 0;
  if (node.InputDefs().size() > 2) {
-    zero_point = GetQuantizationZeroPoint(model_builder, node, zero_point_idx);
+    ORT_RETURN_IF_ERROR(GetQuantizationZeroPoint(model_builder, node, zero_point_idx, zero_point));
  }

-  return std::make_pair(scale, zero_point);
+  return Status::OK();
 }

 #pragma endregion helpers
@ -599,14 +664,23 @@ bool BaseOpBuilder::HasExternalInitializer(ModelBuilder& model_builder, const No
 #pragma region op_binary

 class BinaryOpBuilder : public BaseOpBuilder {
- private:
-  int32_t GetMinSupportedSdkVer(ModelBuilder& model_builder, const Node& node) const override;
+ public:
+  void AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) override;

 private:
-  bool IsOpSupportedImpl(ModelBuilder& /* model_builder */, const Node& node) override;
+  int32_t GetMinSupportedSdkVer(ModelBuilder& model_builder, const Node& node) const override;
+  bool IsOpSupportedImpl(ModelBuilder& model_builder, const Node& node) override;
+  bool HasSupportedInputs(const Node& node) override;
  void AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node) override;
 };

+void BinaryOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) {
+  const auto& op = node.OpType();
+  if (op == "QLinearAdd") {
+    AddBinaryOpQuantizationScaleAndZeroPointToSkip(model_builder, node);
+  }
+}
+
 int32_t BinaryOpBuilder::GetMinSupportedSdkVer(ModelBuilder& /* model_builder */, const Node& node) const {
  const auto& op(node.OpType());
  if (op == "Sub" || op == "Div") {
@ -616,10 +690,28 @@ int32_t BinaryOpBuilder::GetMinSupportedSdkVer(ModelBuilder& /* model_builder */
  return 27;
 }

-bool BinaryOpBuilder::IsOpSupportedImpl(ModelBuilder& /* model_builder */, const Node& node) {
+bool BinaryOpBuilder::HasSupportedInputs(const Node& node) {
+  if (node.OpType() != "QLinearAdd")
+    return BaseOpBuilder::HasSupportedInputs(node);
+
+  // QLinearAdd
+  if (!IsBinaryOpQuantizedInputsSupported(node))
+    return false;
+
+  return true;
+}
+
+bool BinaryOpBuilder::IsOpSupportedImpl(ModelBuilder& model_builder, const Node& node) {
+  const auto& op_type(node.OpType());
+  const auto input_defs(node.InputDefs());
+  bool op_is_qlinear = op_type == "QLinearAdd";
+  size_t a_idx = 0, b_idx = 1;
+  if (op_is_qlinear) {
+    b_idx = 3;
+  }
  Shape input1_shape, input2_shape;
-  if (!GetShape(*node.InputDefs()[0], input1_shape) ||
-      !GetShape(*node.InputDefs()[1], input2_shape))
+  if (!GetShape(*input_defs[a_idx], input1_shape) ||
+      !GetShape(*input_defs[b_idx], input2_shape))
    return false;

  const auto input1_size = input1_shape.size();
@ -631,25 +723,57 @@ bool BinaryOpBuilder::IsOpSupportedImpl(ModelBuilder& /* model_builder */, const
    return false;
  }

+  if (op_is_qlinear) {
+    // For QLinearAdd, we only support uint8 output now
+    int32_t output_type;
+    if (!GetType(*node.OutputDefs()[0], output_type))
+      return false;
+
+    if (output_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8) {
+      LOGS_DEFAULT(VERBOSE) << "[" << op_type
+                            << "] output type: [" << output_type
+                            << "] is not supported for now";
+      return false;
+    }
+
+    // All scale/zero points are initializer scalars
+    // a/b/y_scale
+    if (!IsQuantizationScaleSupported(model_builder, node, {1, 4, 6}))
+      return false;
+
+    // a/b/y_zero_point
+    if (!IsQuantizationZeroPointSupported(model_builder, node, {2, 5, 7}))
+      return false;
+  }
+
  return true;
 }

 void BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node) {
-  const auto& op(node.OpType());
+  const auto& op_type(node.OpType());
+  const auto input_defs(node.InputDefs());
+
  int32_t op_code;
-  if (op == "Add")
+  bool op_is_qlinear = op_type == "QLinearAdd";
+  if (op_type == "Add" || op_is_qlinear)
    op_code = ANEURALNETWORKS_ADD;
-  else if (op == "Sub")
+  else if (op_type == "Sub")
    op_code = ANEURALNETWORKS_SUB;
-  else if (op == "Mul")
+  else if (op_type == "Mul")
    op_code = ANEURALNETWORKS_MUL;
-  else if (op == "Div")
+  else if (op_type == "Div")
    op_code = ANEURALNETWORKS_DIV;
  else {
-    ORT_THROW("UnaryOpBuilder, unknown op: " + op);
+    ORT_THROW("UnaryOpBuilder, unknown op: " + op_type);
  }
-  std::string input1 = node.InputDefs()[0]->Name();
-  std::string input2 = node.InputDefs()[1]->Name();
+
+  size_t a_idx = 0, b_idx = 1;
+  if (op_is_qlinear) {
+    b_idx = 3;
+  }
+
+  std::string input1 = input_defs[a_idx]->Name();
+  std::string input2 = input_defs[b_idx]->Name();
  const auto& output = node.OutputDefs()[0]->Name();

  bool input1_is_nhwc = model_builder.IsOperandNHWC(input1);
@ -660,22 +784,42 @@ void BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
    output_is_nhwc = input1_is_nhwc;
  } else if (input1_is_nhwc) {
    // need transpsoe input1 back to nchw
-    const auto& nhwc_input = node.InputDefs()[0]->Name();
+    const auto& nhwc_input = input_defs[a_idx]->Name();
    if (!model_builder.GetNCHWOperand(nhwc_input, input1)) {
      input1 = model_builder.GetUniqueName(nhwc_input + "_nhwc_to_nchw");
      TransposeNHWCToNCHW(model_builder, nhwc_input, input1);
    }
  } else {  // input2_is_nhwc
    // need transpsoe input2 back to nchw
-    const auto& nhwc_input = node.InputDefs()[1]->Name();
+    const auto& nhwc_input = input_defs[b_idx]->Name();
    if (!model_builder.GetNCHWOperand(nhwc_input, input2)) {
      input2 = model_builder.GetUniqueName(nhwc_input + "_nhwc_to_nchw");
      TransposeNHWCToNCHW(model_builder, nhwc_input, input2);
    }
  }

+  float a_scale = 0.0f,
+        b_scale = 0.0f,
+        y_scale = 0.0f;
+  int32_t a_zero_point = 0,
+          b_zero_point = 0,
+          y_zero_point = 0;
+
+  if (op_is_qlinear) {
+    ORT_THROW_IF_ERROR(
+        GetBinaryOpQuantizationScaleAndZeroPoint(model_builder, node,
+                                                 a_scale, b_scale, y_scale,
+                                                 a_zero_point, b_zero_point, y_zero_point));
+  }
+
+  // Verify that the scale and zero point from the onnx input match those of the nnapi input
+  if (op_is_qlinear) {
+    ORT_THROW_IF_ERROR(IsValidInputQuantizedType(model_builder, input1, a_scale, a_zero_point));
+    ORT_THROW_IF_ERROR(IsValidInputQuantizedType(model_builder, input2, b_scale, b_zero_point));
+  }
+
  int32_t fuse_code = model_builder.FindActivation(node, *node.OutputDefs()[0]);
-  AddBinaryOperator(op_code, model_builder, input1, input2, fuse_code, output, output_is_nhwc);
+  AddBinaryOperator(op_code, model_builder, input1, input2, fuse_code, output, output_is_nhwc, y_scale, y_zero_point);
 }

 #pragma endregion
@ -1181,21 +1325,9 @@ bool ConvOpBuilder::HasSupportedInputs(const Node& node) {
    return BaseOpBuilder::HasSupportedInputs(node);

  // QLinearConv only supports input of uint8 for now
-  int32_t x_input_type, w_input_type;
-  if (!GetType(*node.InputDefs()[0], x_input_type))
+  if (!IsBinaryOpQuantizedInputsSupported(node))
    return false;

-  if (!GetType(*node.InputDefs()[3], w_input_type))
-    return false;
-
-  if (x_input_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8 || x_input_type != w_input_type) {
-    LOGS_DEFAULT(VERBOSE) << "[" << node.OpType()
-                          << "] x Input type: [" << x_input_type
-                          << "] w Input type: [" << w_input_type
-                          << "] is not supported for now";
-    return false;
-  }
-
  return true;
 }

@ -1205,13 +1337,8 @@ void ConvOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Nod

  // skip the weight for conv as we need to transpose
  if (op == "QLinearConv") {
-    model_builder.AddInitializerToSkip(input_defs[1]->Name());  // a_scale
-    model_builder.AddInitializerToSkip(input_defs[2]->Name());  // x_zero_point
+    AddBinaryOpQuantizationScaleAndZeroPointToSkip(model_builder, node);
    model_builder.AddInitializerToSkip(input_defs[3]->Name());  // w
-    model_builder.AddInitializerToSkip(input_defs[4]->Name());  // w_scale
-    model_builder.AddInitializerToSkip(input_defs[5]->Name());  // w_zero_point
-    model_builder.AddInitializerToSkip(input_defs[6]->Name());  // y_scale
-    model_builder.AddInitializerToSkip(input_defs[7]->Name());  // y_zero_point
    if (input_defs.size() > 8)
      model_builder.AddInitializerToSkip(input_defs[8]->Name());  // B
  } else {
@ -1337,17 +1464,13 @@ void ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Nod
          y_zero_point = 0;

  if (is_qlinear_conv) {
-    x_scale = GetQuantizationScale(model_builder, node, 1);
-    w_scale = GetQuantizationScale(model_builder, node, 4);
-    y_scale = GetQuantizationScale(model_builder, node, 6);
-
-    x_zero_point = GetQuantizationZeroPoint(model_builder, node, 2);
-    w_zero_point = GetQuantizationZeroPoint(model_builder, node, 5);
-    y_zero_point = GetQuantizationZeroPoint(model_builder, node, 7);
+    ORT_THROW_IF_ERROR(
+        GetBinaryOpQuantizationScaleAndZeroPoint(model_builder, node,
+                                                 x_scale, w_scale, y_scale,
+                                                 x_zero_point, w_zero_point, y_zero_point));
  }

  const auto& weight = input_defs[w_idx]->Name();
-
  const auto& weight_tensor = initializers.at(weight);
  bool conv_2d = false,
       depthwise_conv_2d = false,
@ -1394,15 +1517,8 @@ void ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Nod

  if (is_qlinear_conv) {
    // Verify if the scale and zero point matchs from onnx input/weight and nnapi input/weight
-    const OperandType& x_operand_type = operand_types.at(input);
-    ORT_ENFORCE(x_operand_type.type == Type::TENSOR_QUANT8_ASYMM,
-                "input type is " + TypeToStr(x_operand_type.type));
-    VerifyValidInputQuantizedType(input, x_operand_type, x_scale, x_zero_point);
-
-    const OperandType& w_operand_type = operand_types.at(weight);
-    ORT_ENFORCE(w_operand_type.type == Type::TENSOR_QUANT8_ASYMM,
-                "input type is " + TypeToStr(w_operand_type.type));
-    VerifyValidInputQuantizedType(weight, w_operand_type, w_scale, w_zero_point);
+    ORT_THROW_IF_ERROR(IsValidInputQuantizedType(model_builder, input, x_scale, x_zero_point));
+    ORT_THROW_IF_ERROR(IsValidInputQuantizedType(model_builder, weight, w_scale, w_zero_point));
  }

  bool hasBias = (input_defs.size() > b_idx);
@ -1708,19 +1824,8 @@ bool GemmOpBuilder::HasSupportedInputs(const Node& node) {
    return BaseOpBuilder::HasSupportedInputs(node);

  // QLinearMatMul
-  int32_t a_input_type, b_input_type;
-  if (!GetType(*node.InputDefs()[0], a_input_type))
+  if (!IsBinaryOpQuantizedInputsSupported(node))
    return false;
-  if (!GetType(*node.InputDefs()[3], b_input_type))
-    return false;
-
-  if (a_input_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8 || a_input_type != b_input_type) {
-    LOGS_DEFAULT(VERBOSE) << "[" << node.OpType()
-                          << "] A Input type: [" << a_input_type
-                          << "] B Input type: [" << b_input_type
-                          << "] is not supported for now";
-    return false;
-  }

  return true;
 }
@ -1840,13 +1945,8 @@ void GemmOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Nod
    if (transB == 0)
      model_builder.AddInitializerToSkip(input_defs[1]->Name());
  } else if (op == "QLinearMatMul") {
-    model_builder.AddInitializerToSkip(input_defs[1]->Name());  // a_scale
-    model_builder.AddInitializerToSkip(input_defs[2]->Name());  // a_zero_point
+    AddBinaryOpQuantizationScaleAndZeroPointToSkip(model_builder, node);
    model_builder.AddInitializerToSkip(input_defs[3]->Name());  // b
-    model_builder.AddInitializerToSkip(input_defs[4]->Name());  // b_scale
-    model_builder.AddInitializerToSkip(input_defs[5]->Name());  // b_zero_point
-    model_builder.AddInitializerToSkip(input_defs[6]->Name());  // y_scale
-    model_builder.AddInitializerToSkip(input_defs[7]->Name());  // y_zero_point
  }
 }

@ -1878,13 +1978,10 @@ void GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Nod
          y_zero_point = 0;

  if (is_qlinear_matmul) {
-    a_scale = GetQuantizationScale(model_builder, node, 1);
-    b_scale = GetQuantizationScale(model_builder, node, 4);
-    y_scale = GetQuantizationScale(model_builder, node, 6);
-
-    a_zero_point = GetQuantizationZeroPoint(model_builder, node, 2);
-    b_zero_point = GetQuantizationZeroPoint(model_builder, node, 5);
-    y_zero_point = GetQuantizationZeroPoint(model_builder, node, 7);
+    ORT_THROW_IF_ERROR(
+        GetBinaryOpQuantizationScaleAndZeroPoint(model_builder, node,
+                                                 a_scale, b_scale, y_scale,
+                                                 a_zero_point, b_zero_point, y_zero_point));
  }

  uint32_t input_2_idx;
@ -1908,15 +2005,8 @@ void GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Nod

  // Verify if the scale and zero point matchs from onnx input and nnapi input
  if (is_qlinear_matmul) {
-    const OperandType& a_operand_type = operand_types.at(input1);
-    ORT_ENFORCE(a_operand_type.type == Type::TENSOR_QUANT8_ASYMM,
-                "input type is " + TypeToStr(a_operand_type.type));
-    VerifyValidInputQuantizedType(input1, a_operand_type, a_scale, a_zero_point);
-
-    const OperandType& b_operand_type = operand_types.at(input2);
-    ORT_ENFORCE(b_operand_type.type == Type::TENSOR_QUANT8_ASYMM,
-                "input type is " + TypeToStr(b_operand_type.type));
-    VerifyValidInputQuantizedType(input2, b_operand_type, b_scale, b_zero_point);
+    ORT_THROW_IF_ERROR(IsValidInputQuantizedType(model_builder, input1, a_scale, a_zero_point));
+    ORT_THROW_IF_ERROR(IsValidInputQuantizedType(model_builder, input2, b_scale, b_zero_point));
  }

  uint32_t bias_idx;
@ -2260,7 +2350,7 @@ void QuantizeLinearOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
  Type output_type = Type::TENSOR_QUANT8_ASYMM;

  if (input_defs.size() == 3) {  // Get zero point
-    zero_point = GetQuantizationZeroPoint(model_builder, node, 2);
+    ORT_THROW_IF_ERROR(GetQuantizationZeroPoint(model_builder, node, 2, zero_point));
  }

  LOGS_DEFAULT(VERBOSE) << "scale: " << scale << " zp: " << zero_point;
@ -2332,7 +2422,6 @@ bool DequantizeLinearOpBuilder::IsOpSupportedImpl(ModelBuilder& model_builder, c
 void DequantizeLinearOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node) {
  auto& shaper(model_builder.GetShaper());
  const auto& operand_indices(model_builder.GetOperandIndices());
-  const auto& operand_types(model_builder.GetOperandTypes());
  const auto input_defs(node.InputDefs());

  const auto& input = input_defs[0]->Name();
@ -2342,14 +2431,10 @@ void DequantizeLinearOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builde
  float scale = GetQuantizationScale(model_builder, node, 1);
  int32_t zero_point = 0;
  if (input_defs.size() == 3) {  // Get zero point
-    zero_point = GetQuantizationZeroPoint(model_builder, node, 2);
+    ORT_THROW_IF_ERROR(GetQuantizationZeroPoint(model_builder, node, 2, zero_point));
  }

-  const OperandType& input_operand_type = operand_types.at(input);
-  ORT_ENFORCE(input_operand_type.type == Type::TENSOR_QUANT8_ASYMM,
-              "input type is " + TypeToStr(input_operand_type.type));
-
-  VerifyValidInputQuantizedType(input, input_operand_type, scale, zero_point);
+  ORT_THROW_IF_ERROR(IsValidInputQuantizedType(model_builder, input, scale, zero_point));

  shaper.Identity(input, output);
  const OperandType output_operand_type(Type::TENSOR_FLOAT32, shaper[output]);
@ -2455,6 +2540,7 @@ CreateOpBuilders() {
    op_map.emplace("Sub", binary_op_builder);
    op_map.emplace("Mul", binary_op_builder);
    op_map.emplace("Div", binary_op_builder);
+    op_map.emplace("QLinearAdd", binary_op_builder);
  }

  op_map.emplace("Relu", std::make_shared<ReluOpBuilder>());
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.h
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.h
@ -31,8 +31,9 @@ std::unordered_map<std::string, std::shared_ptr<IOpBuilder>> CreateOpBuilders();
 void TransposeNHWCToNCHW(ModelBuilder& model_builder, const std::string& input, const std::string& output);

 // Get the quantized input's scale and zero point for the given input
-std::pair<float, int32_t> GetQuantizedInputScaleAndZeroPoint(const ModelBuilder& model_builder,
-                                                             const Node& node, const std::string& input_name);
+Status GetQuantizedInputScaleAndZeroPoint(const ModelBuilder& model_builder,
+                                          const Node& node, const std::string& input_name,
+                                          float& scale, int32_t& zero_point);

 }  // namespace nnapi
 }  // namespace onnxruntime
--- a/onnxruntime/test/contrib_ops/qlinear_binary_op_test.cc
+++ b/onnxruntime/test/contrib_ops/qlinear_binary_op_test.cc
@ -8,16 +8,14 @@
 namespace onnxruntime {
 namespace test {

-static std::vector<int64_t> PrefixingDims(const std::vector<int64_t>& dims, size_t number_dims)
-{
+static std::vector<int64_t> PrefixingDims(const std::vector<int64_t>& dims, size_t number_dims) {
  std::vector<int64_t> prefixed_dims;
  if (number_dims > dims.size()) prefixed_dims.resize(number_dims - dims.size(), 1);
  prefixed_dims.insert(prefixed_dims.end(), dims.begin(), dims.end());
  return prefixed_dims;
 }

-static int64_t CalcStrides(const std::vector<int64_t>& dims, std::vector<int64_t>& strides, bool clear1 = false)
-{
+static int64_t CalcStrides(const std::vector<int64_t>& dims, std::vector<int64_t>& strides, bool clear1 = false) {
  strides.clear();
  strides.resize(dims.size(), 1);
  for (int i = (int)dims.size() - 2; i >= 0; --i) {
@ -38,13 +36,12 @@ static T clampi(int a, int min_value, int max_value) {
 }

 template <typename T>
-void
-RunQLinearMathTestFromFloat(
+void RunQLinearMathTestFromFloat(
    const char* op_name, std::function<float(float, float)> calc,
    const std::vector<float>& a, const std::vector<int64_t>& a_shape_origin, float A_scale, T A_zero_point,
    const std::vector<float>& b, const std::vector<int64_t>& b_shape_origin, float B_scale, T B_zero_point,
-    float C_scale, T C_zero_point)
-{
+    float C_scale, T C_zero_point,
+    bool all_initializer_scale_zero_point = false) {
  size_t number_dims = std::max(a_shape_origin.size(), b_shape_origin.size());
  std::vector<int64_t> a_shape = PrefixingDims(a_shape_origin, number_dims);
  std::vector<int64_t> b_shape = PrefixingDims(b_shape_origin, number_dims);
@ -61,7 +58,7 @@ RunQLinearMathTestFromFloat(
  auto c_size = CalcStrides(c_shape, c_strides, false);
  auto a_size = CalcStrides(a_shape, a_strides, true);
  auto b_size = CalcStrides(b_shape, b_strides, true);
-  if (a_size != static_cast<int64_t>(a.size()) || b_size != static_cast<int64_t>(b.size())){
+  if (a_size != static_cast<int64_t>(a.size()) || b_size != static_cast<int64_t>(b.size())) {
    throw std::runtime_error("Input size not match input shape!");
  }
  constexpr int qmax = std::numeric_limits<T>::max();
@ -73,19 +70,19 @@ RunQLinearMathTestFromFloat(
    a_quantized[i] = clampi<T>(static_cast<int>(std::nearbyintf(a[i] / A_scale)) + A_zero_point, qmin, qmax);
  }
  test.template AddInput<T>("A", a_shape_origin, a_quantized);
-  test.AddInput<float>("A_scale", {},  {A_scale});
-  test.template AddInput<T>("A_zero_point", {}, {A_zero_point});
+  test.AddInput<float>("A_scale", {}, {A_scale}, all_initializer_scale_zero_point);
+  test.template AddInput<T>("A_zero_point", {}, {A_zero_point}, all_initializer_scale_zero_point);

  std::vector<T> b_quantized(b.size());
  for (size_t i = 0, sz = b.size(); i < sz; ++i) {
    b_quantized[i] = clampi<T>(static_cast<int>(std::nearbyintf(b[i] / B_scale)) + B_zero_point, qmin, qmax);
  }
  test.template AddInput<T>("B", b_shape_origin, b_quantized);
-  test.AddInput<float>("B_scale", {}, {B_scale});
-  test.template AddInput<T>("B_zero_point", {}, {B_zero_point});
+  test.AddInput<float>("B_scale", {}, {B_scale}, all_initializer_scale_zero_point);
+  test.template AddInput<T>("B_zero_point", {}, {B_zero_point}, all_initializer_scale_zero_point);

-  test.AddInput<float>("C_scale", {}, {C_scale});
-  test.template AddInput<T>("C_zero_point", {}, {C_zero_point});
+  test.AddInput<float>("C_scale", {}, {C_scale}, all_initializer_scale_zero_point);
+  test.template AddInput<T>("C_zero_point", {}, {C_zero_point}, all_initializer_scale_zero_point);
  std::vector<T> c(c_size);
  for (int64_t offset = 0; offset < c_size; ++offset) {
    int64_t remain = offset, a_offset = 0, b_offset = 0;
@ -107,27 +104,25 @@ RunQLinearMathTestFromFloat(
 // total 32 + 31 elements to cover all path
 // for add() usage tensor A
 static std::vector<float> A4Add = {
-  0.00f,  0.25f,  0.50f,  0.75f,  1.00f,  1.25f,  1.50f,  1.75f,
-  2.00f,  2.25f,  2.50f,  2.75f,  3.00f,  3.50f,  3.75f,  4.00f,
- -0.00f, -0.25f, -0.50f, -0.75f, -1.00f, -1.25f, -1.50f, -1.75f,
- -2.00f, -2.25f, -2.50f, -2.75f, -3.00f, -4.00f, -3.75f, -3.50f,
-  0.00f,  0.25f,  0.50f,  0.75f,  1.00f,  1.25f,  1.50f,  1.75f,
-  2.00f,  2.25f,  2.50f,  2.75f,  3.00f,  3.75f,  4.25f,  4.50f,
- -0.00f, -0.25f, -0.50f, -0.75f, -1.00f, -1.25f, -1.50f, -1.75f,
- -2.00f, -2.25f, -2.50f, -2.75f, -3.00f,  3.75f,  3.00f
-};
+    0.00f, 0.25f, 0.50f, 0.75f, 1.00f, 1.25f, 1.50f, 1.75f,
+    2.00f, 2.25f, 2.50f, 2.75f, 3.00f, 3.50f, 3.75f, 4.00f,
+    -0.00f, -0.25f, -0.50f, -0.75f, -1.00f, -1.25f, -1.50f, -1.75f,
+    -2.00f, -2.25f, -2.50f, -2.75f, -3.00f, -4.00f, -3.75f, -3.50f,
+    0.00f, 0.25f, 0.50f, 0.75f, 1.00f, 1.25f, 1.50f, 1.75f,
+    2.00f, 2.25f, 2.50f, 2.75f, 3.00f, 3.75f, 4.25f, 4.50f,
+    -0.00f, -0.25f, -0.50f, -0.75f, -1.00f, -1.25f, -1.50f, -1.75f,
+    -2.00f, -2.25f, -2.50f, -2.75f, -3.00f, 3.75f, 3.00f};

 // for add() usage tensor B
 static std::vector<float> B4Add = {
-  4.00f,  0.25f,  0.00f, -0.25f,  0.50f, -0.25f, -0.00f,  0.25f,
- -1.50f, -2.25f,  2.50f,  3.75f, -3.75f, -4.00f,  5.00f,  5.50f,
-  4.00f,  0.25f,  0.00f, -0.25f,  0.50f, -0.25f, -0.00f,  0.25f,
- -1.50f, -2.25f,  2.50f,  3.75f, -3.75f, -4.00f,  5.00f,  5.50f,
-  4.00f,  0.25f,  0.00f, -0.25f,  0.50f, -0.25f, -0.00f,  0.25f,
- -1.50f, -2.25f,  2.50f,  3.75f, -3.75f, -4.00f,  5.00f,  5.50f,
-  4.00f,  0.25f,  0.00f, -0.25f,  0.50f, -0.25f, -0.00f,  0.25f,
- -1.50f, -2.25f,  2.50f,  3.75f, -3.75f, -3.75f, -4.00f
-};
+    4.00f, 0.25f, 0.00f, -0.25f, 0.50f, -0.25f, -0.00f, 0.25f,
+    -1.50f, -2.25f, 2.50f, 3.75f, -3.75f, -4.00f, 5.00f, 5.50f,
+    4.00f, 0.25f, 0.00f, -0.25f, 0.50f, -0.25f, -0.00f, 0.25f,
+    -1.50f, -2.25f, 2.50f, 3.75f, -3.75f, -4.00f, 5.00f, 5.50f,
+    4.00f, 0.25f, 0.00f, -0.25f, 0.50f, -0.25f, -0.00f, 0.25f,
+    -1.50f, -2.25f, 2.50f, 3.75f, -3.75f, -4.00f, 5.00f, 5.50f,
+    4.00f, 0.25f, 0.00f, -0.25f, 0.50f, -0.25f, -0.00f, 0.25f,
+    -1.50f, -2.25f, 2.50f, 3.75f, -3.75f, -3.75f, -4.00f};

 static auto add_function = [](float a_dequantized, float b_dequantized) {
  return a_dequantized + b_dequantized;
@ -144,7 +139,16 @@ TEST(QLinearBinaryOpTest, AddU8VectorVectorFull) {
  uint8_t C_zero_point = 128;

  RunQLinearMathTestFromFloat("QLinearAdd", add_function,
-    A, {63}, A_scale, A_zero_point, B, {63}, B_scale, B_zero_point, C_scale, C_zero_point);
+                              A, {63}, A_scale, A_zero_point,
+                              B, {63}, B_scale, B_zero_point,
+                              C_scale, C_zero_point);
+
+  // NNAPI requires all the scales and zero points to be initializers
+  RunQLinearMathTestFromFloat("QLinearAdd", add_function,
+                              A, {63}, A_scale, A_zero_point,
+                              B, {63}, B_scale, B_zero_point,
+                              C_scale, C_zero_point,
+                              true);
 }

 TEST(QLinearBinaryOpTest, AddU8VectorVectorBroadcast) {
@ -152,73 +156,117 @@ TEST(QLinearBinaryOpTest, AddU8VectorVectorBroadcast) {
  float A_scale = 8.0f / 256.0f;
  uint8_t A_zero_point = 128;
  std::vector<float> B = {
-    4.00f,  0.25f,  0.00f, -0.25f,  0.50f, -0.25f, -0.00f,  0.25f,
-   -1.50f, -2.25f,  2.50f,  3.75f, -3.75f, -4.00f,  5.00f,  5.50f,
-   -0.50f, -1.25f,  0.75f,  1.25f,  2.25f
-  };
+      4.00f, 0.25f, 0.00f, -0.25f, 0.50f, -0.25f, -0.00f, 0.25f,
+      -1.50f, -2.25f, 2.50f, 3.75f, -3.75f, -4.00f, 5.00f, 5.50f,
+      -0.50f, -1.25f, 0.75f, 1.25f, 2.25f};
  float B_scale = 8.0f / 256.0f;
  uint8_t B_zero_point = 128;
  float C_scale = 16.0f / 256.0f;
  uint8_t C_zero_point = 128;

  RunQLinearMathTestFromFloat("QLinearAdd", add_function,
-    A, {3, 3, 7}, A_scale, A_zero_point, B, {3, 1, 7}, B_scale, B_zero_point, C_scale, C_zero_point);
+                              A, {3, 3, 7}, A_scale, A_zero_point,
+                              B, {3, 1, 7}, B_scale, B_zero_point,
+                              C_scale, C_zero_point);
+
+  // NNAPI requires all the scales and zero points to be initializers
+  RunQLinearMathTestFromFloat("QLinearAdd", add_function,
+                              A, {3, 3, 7}, A_scale, A_zero_point,
+                              B, {3, 1, 7}, B_scale, B_zero_point,
+                              C_scale, C_zero_point,
+                              true);
 }

 TEST(QLinearBinaryOpTest, AddU8ScalarVectorFull) {
  const std::vector<float>& A(A4Add);
  float A_scale = 8.0f / 256.0f;
  uint8_t A_zero_point = 128;
-  std::vector<float> B = { 0.25f };
+  std::vector<float> B = {0.25f};
  float B_scale = 8.0f / 256.0f;
  uint8_t B_zero_point = 96;
  float C_scale = 8.0f / 256.0f;
  uint8_t C_zero_point = 100;

  RunQLinearMathTestFromFloat("QLinearAdd", add_function,
-    B, {1}, B_scale, B_zero_point, A, {63}, A_scale, A_zero_point, C_scale, C_zero_point);
+                              B, {1}, B_scale, B_zero_point,
+                              A, {63}, A_scale, A_zero_point,
+                              C_scale, C_zero_point);
+
+  // NNAPI requires all the scales and zero points to be initializers
+  RunQLinearMathTestFromFloat("QLinearAdd", add_function,
+                              B, {1}, B_scale, B_zero_point,
+                              A, {63}, A_scale, A_zero_point,
+                              C_scale, C_zero_point,
+                              true);
 }

 TEST(QLinearBinaryOpTest, AddU8ScalarVectorBroadcast) {
  const std::vector<float>& A(A4Add);
  float A_scale = 8.0f / 256.0f;
  uint8_t A_zero_point = 128;
-  std::vector<float> B = { 0.25f, -0.25f, -0.00f };
+  std::vector<float> B = {0.25f, -0.25f, -0.00f};
  float B_scale = 8.0f / 256.0f;
  uint8_t B_zero_point = 96;
  float C_scale = 8.0f / 256.0f;
  uint8_t C_zero_point = 100;

  RunQLinearMathTestFromFloat("QLinearAdd", add_function,
-    B, {3, 1, 1}, B_scale, B_zero_point, A, {3, 7, 3}, A_scale, A_zero_point, C_scale, C_zero_point);
+                              B, {3, 1, 1}, B_scale, B_zero_point,
+                              A, {3, 7, 3}, A_scale, A_zero_point,
+                              C_scale, C_zero_point);
+
+  // NNAPI requires all the scales and zero points to be initializers
+  RunQLinearMathTestFromFloat("QLinearAdd", add_function,
+                              B, {3, 1, 1}, B_scale, B_zero_point,
+                              A, {3, 7, 3}, A_scale, A_zero_point,
+                              C_scale, C_zero_point,
+                              true);
 }

 TEST(QLinearBinaryOpTest, AddU8VectorScalarFull) {
  const std::vector<float>& A(A4Add);
  float A_scale = 8.0f / 256.0f;
  uint8_t A_zero_point = 128;
-  std::vector<float> B = { 0.25f };
+  std::vector<float> B = {0.25f};
  float B_scale = 8.0f / 256.0f;
  uint8_t B_zero_point = 96;
  float C_scale = 16.0f / 256.0f;
  uint8_t C_zero_point = 128;

  RunQLinearMathTestFromFloat("QLinearAdd", add_function,
-    A, {63}, A_scale, A_zero_point, B, {1}, B_scale, B_zero_point, C_scale, C_zero_point);
+                              A, {63}, A_scale, A_zero_point,
+                              B, {1}, B_scale, B_zero_point,
+                              C_scale, C_zero_point);
+
+  // NNAPI requires all the scales and zero points to be initializers
+  RunQLinearMathTestFromFloat("QLinearAdd", add_function,
+                              A, {63}, A_scale, A_zero_point,
+                              B, {1}, B_scale, B_zero_point,
+                              C_scale, C_zero_point,
+                              true);
 }

 TEST(QLinearBinaryOpTest, AddU8VectorScalarBroadcast) {
  const std::vector<float>& A(A4Add);
  float A_scale = 8.0f / 256.0f;
  uint8_t A_zero_point = 128;
-  std::vector<float> B = { 0.25f, -0.25f, -0.00f };
+  std::vector<float> B = {0.25f, -0.25f, -0.00f};
  float B_scale = 8.0f / 256.0f;
  uint8_t B_zero_point = 96;
  float C_scale = 16.0f / 256.0f;
  uint8_t C_zero_point = 128;

  RunQLinearMathTestFromFloat("QLinearAdd", add_function,
-    A, {3, 7, 3}, A_scale, A_zero_point, B, {1, 1, 3}, B_scale, B_zero_point, C_scale, C_zero_point);
+                              A, {3, 7, 3}, A_scale, A_zero_point,
+                              B, {1, 1, 3}, B_scale, B_zero_point,
+                              C_scale, C_zero_point);
+
+  // NNAPI requires all the scales and zero points to be initializers
+  RunQLinearMathTestFromFloat("QLinearAdd", add_function,
+                              A, {3, 7, 3}, A_scale, A_zero_point,
+                              B, {1, 1, 3}, B_scale, B_zero_point,
+                              C_scale, C_zero_point,
+                              true);
 }

 TEST(QLinearBinaryOpTest, AddS8VectorVectorFull) {
@ -232,7 +280,9 @@ TEST(QLinearBinaryOpTest, AddS8VectorVectorFull) {
  int8_t C_zero_point = -16;

  RunQLinearMathTestFromFloat("QLinearAdd", add_function,
-    A, {63}, A_scale, A_zero_point, B, {63}, B_scale, B_zero_point, C_scale, C_zero_point);
+                              A, {63}, A_scale, A_zero_point,
+                              B, {63}, B_scale, B_zero_point,
+                              C_scale, C_zero_point);
 }

 TEST(QLinearBinaryOpTest, AddS8VectorVectorBroadcast) {
@ -240,75 +290,83 @@ TEST(QLinearBinaryOpTest, AddS8VectorVectorBroadcast) {
  float A_scale = 8.0f / 256.0f;
  int8_t A_zero_point = 0;
  std::vector<float> B = {
-    4.00f,  0.25f,  0.00f, -0.25f,  0.50f, -0.25f, -0.00f,  0.25f,
-   -1.50f, -2.25f,  2.50f,  3.75f, -3.75f, -4.00f,  5.00f,  5.50f,
-   -0.50f, -1.25f,  0.75f,  1.25f,  2.25f
-  };
+      4.00f, 0.25f, 0.00f, -0.25f, 0.50f, -0.25f, -0.00f, 0.25f,
+      -1.50f, -2.25f, 2.50f, 3.75f, -3.75f, -4.00f, 5.00f, 5.50f,
+      -0.50f, -1.25f, 0.75f, 1.25f, 2.25f};
  float B_scale = 8.0f / 256.0f;
  int8_t B_zero_point = 0;
  float C_scale = 16.0f / 256.0f;
  int8_t C_zero_point = -16;

  RunQLinearMathTestFromFloat("QLinearAdd", add_function,
-    A, {3, 3, 7}, A_scale, A_zero_point, B, {3, 1, 7}, B_scale, B_zero_point, C_scale, C_zero_point);
+                              A, {3, 3, 7}, A_scale, A_zero_point,
+                              B, {3, 1, 7}, B_scale, B_zero_point,
+                              C_scale, C_zero_point);
 }

 TEST(QLinearBinaryOpTest, AddS8ScalarVectorFull) {
  const std::vector<float>& A(A4Add);
  float A_scale = 8.0f / 256.0f;
  int8_t A_zero_point = 0;
-  std::vector<float> B = { 0.25f };
+  std::vector<float> B = {0.25f};
  float B_scale = 2.0f / 256.0f;
  int8_t B_zero_point = 16;
  float C_scale = 8.0f / 256.0f;
  int8_t C_zero_point = 10;

  RunQLinearMathTestFromFloat("QLinearAdd", add_function,
-    B, {1}, B_scale, B_zero_point, A, {63}, A_scale, A_zero_point, C_scale, C_zero_point);
+                              B, {1}, B_scale, B_zero_point,
+                              A, {63}, A_scale, A_zero_point,
+                              C_scale, C_zero_point);
 }

 TEST(QLinearBinaryOpTest, AddS8ScalarVectorBroadcast) {
  const std::vector<float>& A(A4Add);
  float A_scale = 8.0f / 256.0f;
  int8_t A_zero_point = 0;
-  std::vector<float> B = { 0.25f, -0.25f, -0.00f };
+  std::vector<float> B = {0.25f, -0.25f, -0.00f};
  float B_scale = 2.0f / 256.0f;
  int8_t B_zero_point = 16;
  float C_scale = 8.0f / 256.0f;
  int8_t C_zero_point = 10;

  RunQLinearMathTestFromFloat("QLinearAdd", add_function,
-    B, {3, 1, 1}, B_scale, B_zero_point, A, {3, 7, 3}, A_scale, A_zero_point, C_scale, C_zero_point);
+                              B, {3, 1, 1}, B_scale, B_zero_point,
+                              A, {3, 7, 3}, A_scale, A_zero_point,
+                              C_scale, C_zero_point);
 }

 TEST(QLinearBinaryOpTest, AddS8VectorScalarFull) {
  const std::vector<float>& A(A4Add);
  float A_scale = 8.0f / 256.0f;
  int8_t A_zero_point = 0;
-  std::vector<float> B = { 0.25f };
+  std::vector<float> B = {0.25f};
  float B_scale = 2.0f / 256.0f;
  int8_t B_zero_point = 16;
  float C_scale = 8.0f / 256.0f;
  int8_t C_zero_point = 10;

  RunQLinearMathTestFromFloat("QLinearAdd", add_function,
-    A, {63}, A_scale, A_zero_point, B, {1}, B_scale, B_zero_point, C_scale, C_zero_point);
+                              A, {63}, A_scale, A_zero_point,
+                              B, {1}, B_scale, B_zero_point,
+                              C_scale, C_zero_point);
 }

 TEST(QLinearBinaryOpTest, AddS8VectorScalarBroadcast) {
  const std::vector<float>& A(A4Add);
  float A_scale = 8.0f / 256.0f;
  int8_t A_zero_point = 0;
-  std::vector<float> B = { 0.25f, -0.25f, -0.00f };
+  std::vector<float> B = {0.25f, -0.25f, -0.00f};
  float B_scale = 2.0f / 256.0f;
  int8_t B_zero_point = 16;
  float C_scale = 8.0f / 256.0f;
  int8_t C_zero_point = 10;

  RunQLinearMathTestFromFloat("QLinearAdd", add_function,
-    A, {3, 7, 3}, A_scale, A_zero_point, B, {1, 1, 3}, B_scale, B_zero_point, C_scale, C_zero_point);
+                              A, {3, 7, 3}, A_scale, A_zero_point,
+                              B, {1, 1, 3}, B_scale, B_zero_point,
+                              C_scale, C_zero_point);
 }

 }  // namespace test
 }  // namespace onnxruntime
-