[NNAPI EP] Add per-tensor u8s8 support for Qlinear[Conv/MatMul] (#6818)

* NNAPI Add per-tensor u8s8 support * Update some comments * Address CR comments * Address CR comments
2026-07-25 19:48:11 +00:00 · 2021-03-03 15:44:49 -08:00 · 2021-03-03 15:44:49 -08:00 · fedb68429c
commit fedb68429c
parent 3c5d811e77
3 changed files with 273 additions and 127 deletions
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc
@ -137,7 +137,9 @@ bool HasValidBinaryOpQuantizedInputs(const Node& node) {
 bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const Node& node,
                                const std::vector<size_t>& indices, const OpSupportCheckParams& params) {
  const auto& op_type = node.OpType();
-  bool is_qlinear_conv = (op_type == "QLinearConv");
+  auto qlinear_op_type = GetQLinearOpType(node);
+  bool is_qlinear_conv = (qlinear_op_type == QLinearOpType::QLinearConv);
+  bool is_qlinear_matmul = (qlinear_op_type == QLinearOpType::QLinearMatMul);
  const auto input_defs(node.InputDefs());
  for (const auto idx : indices) {
    if (idx >= input_defs.size()) {
@ -145,46 +147,53 @@ bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const
                            << " >= input number, " << input_defs.size();
      return false;
    }
+
    const auto scale_name = input_defs[idx]->Name();
-    if (Contains(initializers, scale_name)) {
-      const auto& scale_tensor = *initializers.at(scale_name);
-      int64_t scales_dim = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0];
-      bool is_conv_weight = is_qlinear_conv && idx == 4;
-      bool is_conv_u8s8_weight = false;
-
-      if (is_conv_weight) {
-        const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
-        is_conv_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8;
-      }
-
-      // We need to check the per-channel quantization scales dimensions for u8s8 QlinearConv
-      // We only support per-channel quantization for u8s8
-      // For all other cases, the scales should be a scalar
-      if (is_conv_u8s8_weight) {
-        if (params.android_sdk_ver < 29) {
-          LOGS_DEFAULT(VERBOSE) << op_type << " only supports per-channel quantization on Android API 29+, "
-                                << "system API level: " << params.android_sdk_ver;
-          return false;
-        }
-
-        const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
-        if (weight_tensor.dims()[0] != scales_dim) {
-          LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight,"
-                                << " weight dimension[0] " << weight_tensor.dims()[0]
-                                << " scale dimension " << scales_dim;
-          return false;
-        }
-      } else {
-        if (scales_dim != 1) {
-          LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, "
-                                << " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+";
-          return false;
-        }
-      }
-    } else {
-      LOGS_DEFAULT(VERBOSE) << "The scale of " << op_type << " must be known";
+    if (!Contains(initializers, scale_name)) {
+      LOGS_DEFAULT(VERBOSE) << "The scale of " << op_type << " must be an initializer tensor";
      return false;
    }
+
+    // If this op is Qlinear[Conv/MatMul], we want to check u8s8 support for weight tensor (or B tensor for QlinearMatMul)
+    bool is_conv_matmul_weight = (is_qlinear_conv || is_qlinear_matmul) && idx == 4;
+    bool is_conv_matmul_u8s8_weight = false;
+
+    if (is_conv_matmul_weight) {
+      const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
+      is_conv_matmul_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8;
+    }
+
+    const auto& scale_tensor = *initializers.at(scale_name);
+    int64_t scales_dim = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0];
+    if (!is_conv_matmul_u8s8_weight) {
+      if (scales_dim != 1) {
+        LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, "
+                              << " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+";
+        return false;
+      }
+    } else if (scales_dim != 1) {
+      // For u8s8 Qlinear[Conv/MatMul], we support
+      // 1. Per-tensor, the weight will be transformed to uint8 later
+      // 2. Per-channel, only from Android API level 29
+      if (is_qlinear_matmul) {
+        LOGS_DEFAULT(VERBOSE) << "QLinearMatMul does not support per-channel quantization";
+        return false;
+      }
+
+      if (params.android_sdk_ver < 29) {
+        LOGS_DEFAULT(VERBOSE) << op_type << " only supports per-channel quantization on Android API 29+, "
+                              << "system API level: " << params.android_sdk_ver;
+        return false;
+      }
+
+      const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
+      if (weight_tensor.dims()[0] != scales_dim) {
+        LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight,"
+                              << " weight dimension[0] " << weight_tensor.dims()[0]
+                              << " scale dimension " << scales_dim;
+        return false;
+      }
+    }
  }

  return true;
@ -193,7 +202,9 @@ bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const
 bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, const Node& node,
                                    const std::vector<size_t>& indices) {
  const auto& op_type = node.OpType();
-  bool is_qlinear_conv = (op_type == "QLinearConv");
+  auto qlinear_op_type = GetQLinearOpType(node);
+  bool is_qlinear_conv = (qlinear_op_type == QLinearOpType::QLinearConv);
+  bool is_qlinear_matmul = (qlinear_op_type == QLinearOpType::QLinearMatMul);
  const auto input_defs(node.InputDefs());
  for (const auto idx : indices) {
    if (idx >= input_defs.size()) {
@ -203,65 +214,77 @@ bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, co
    }

    const auto zero_point_name = input_defs[idx]->Name();
-    if (Contains(initializers, zero_point_name)) {
-      bool is_conv_weight = is_qlinear_conv && idx == 5;
-      bool is_conv_u8s8_weight = false;
-      if (is_conv_weight) {
-        const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
-        is_conv_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8;
-      }
+    if (!Contains(initializers, zero_point_name)) {
+      LOGS_DEFAULT(VERBOSE) << "The zero point of " << op_type << " must be an initializer tensor";
+      return false;
+    }

-      const auto& zero_tensor = *initializers.at(zero_point_name);
-      int64_t zero_dim = zero_tensor.dims().empty() ? 1 : zero_tensor.dims()[0];
-      if (is_conv_u8s8_weight) {
-        if (zero_tensor.data_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8) {
-          LOGS_DEFAULT(VERBOSE) << "u8s8 QlinearConv only supports int8 zero point for weight, "
-                                << "actual zero point type: [" << zero_tensor.data_type() << "]";
-          return false;
-        }
+    bool is_conv_matmul_weight = (is_qlinear_conv || is_qlinear_matmul) && idx == 5;  // input 5 is the W/B zero point; mirrors the scale check
+    bool is_conv_matmul_u8s8_weight = false;
+    if (is_conv_matmul_weight) {
+      const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
+      is_conv_matmul_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8;
+    }

-        // For onnx, u8s8 QlinearConv, the weight zero point can be a scalar,
-        // or a tensor with same channel as weight, for NNAPI we only support it be
-        // 0 (scalar) or all 0 (tensor), NNAPI will assume the zero point for per-channel
-        // quantization is 0 there is no input for it
-        const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
-        if (weight_tensor.dims()[0] != zero_dim && zero_dim != 1) {
-          LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight,"
-                                << " weight dimension[0] " << weight_tensor.dims()[0]
-                                << " zero point dimension " << zero_dim;
-          return false;
-        }
+    const auto& zero_tensor = *initializers.at(zero_point_name);
+    int64_t zero_dim = zero_tensor.dims().empty() ? 1 : zero_tensor.dims()[0];

-        std::unique_ptr<uint8_t[]> unpacked_tensor;
-        size_t tensor_byte_size;
-        auto status = onnxruntime::utils::UnpackInitializerData(
-            zero_tensor,
-            node.ModelPath(),
-            unpacked_tensor, tensor_byte_size);
-        if (!status.IsOK()) {
-          LOGS_DEFAULT(ERROR) << "QLinearConv erro when unpack zero tensor:" << status.ErrorMessage();
-          return false;
-        }
-
-        // Verify all onnx weight zero point(s) are 0(s)
-        const int8_t* zero_points = reinterpret_cast<const int8_t*>(unpacked_tensor.get());
-        for (size_t i = 0; i < tensor_byte_size; i++) {
-          if (zero_points[i] != 0) {
-            LOGS_DEFAULT(VERBOSE) << "QLinearConv only support 0 as zero point, "
-                                  << "zero_points[" << i << "] has value: " << zero_points[i];
-            return false;
-          }
-        }
-      } else {
-        if (zero_dim != 1) {
-          LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, "
-                                << " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+";
-          return false;
-        }
+    if (!is_conv_matmul_u8s8_weight) {
+      if (zero_dim != 1) {
+        LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, "
+                              << " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+";
+        return false;
      }
    } else {
-      LOGS_DEFAULT(VERBOSE) << "The zero point of " << op_type << " must be known";
-      return false;
+      // For u8s8 Qlinear[Conv/MatMul], we support
+      // 1. Per-tensor, the weight will be transformed to uint8 later
+      // 2. Per-channel, only from Android API level 29
+      if (zero_tensor.data_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8) {
+        LOGS_DEFAULT(VERBOSE) << "u8s8 Qlinear[Conv/MatMul] only supports int8 zero point for weight, "
+                              << "actual zero point type: [" << zero_tensor.data_type() << "]";
+        return false;
+      }
+
+      if (zero_dim != 1) {
+        if (is_qlinear_matmul) {
+          LOGS_DEFAULT(VERBOSE) << "QLinearMatMul does not support per-channel quantization";
+          return false;
+        }
+      }
+
+      // For ONNX u8s8 QlinearConv, the weight zero point can be a scalar
+      // or a tensor with the same number of channels as the weight. For NNAPI we only
+      // support 0 (scalar) or all 0s (tensor): NNAPI assumes the zero point for per-channel
+      // quantization is 0, and there is no input for it
+      const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
+      if (weight_tensor.dims()[0] != zero_dim && zero_dim != 1) {
+        LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight,"
+                              << " weight dimension[0] " << weight_tensor.dims()[0]
+                              << " zero point dimension " << zero_dim;
+        return false;
+      }
+
+      std::unique_ptr<uint8_t[]> unpacked_tensor;
+      size_t tensor_byte_size;
+      auto status = onnxruntime::utils::UnpackInitializerData(
+          zero_tensor,
+          node.ModelPath(),
+          unpacked_tensor, tensor_byte_size);
+      if (!status.IsOK()) {
+        LOGS_DEFAULT(ERROR) << "Qlinear[Conv/MatMul] error when unpack zero tensor: " << zero_point_name
+                            << ", error msg: " << status.ErrorMessage();
+        return false;
+      }
+
+      // Verify all onnx weight zero point(s) are 0(s)
+      const int8_t* zero_points = reinterpret_cast<const int8_t*>(unpacked_tensor.get());
+      for (size_t i = 0; i < tensor_byte_size; i++) {
+        if (zero_points[i] != 0) {
+          LOGS_DEFAULT(VERBOSE) << "u8s8 Qlinear[Conv/MatMul]  only support 0 as zero point, "
+                                << "zero_points[" << i << "] has value: " << zero_points[i];
+          return false;
+        }
+      }
    }
  }

--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc
@ -272,15 +272,23 @@ enum DataLayout {
  L_1230 = 1,
 };

-// TODO, replace this with more efficient code in optimizers
+// This is primarily used for adding the weight (an initializer) of Conv/QlinearConv
+// and performing the layout change from ONNX -> NNAPI
+// If is_per_tensor_u8s8 is true, the QlinearConv is per-tensor u8s8 (input X is unsigned int8
+// and weight W is signed int8 and it is per-tensor (NOT per-channel) quantized). In this case,
+// since NNAPI requires X and W to be the same type for per-tensor quantization,
+// the initializer tensor W will be converted from int8 to uint8 by flipping the sign bit of
+// each byte with XOR 0x80, i.e. uint8(byte ^ 0x80) == int8(byte) + 128
 static Status AddInitializerInNewLayout(ModelBuilder& model_builder,
                                        const std::string& name,
                                        const OperandType& source_operand_type,
-                                        DataLayout new_layout) ORT_MUST_USE_RESULT;
+                                        DataLayout new_layout,
+                                        bool is_per_tensor_u8s8) ORT_MUST_USE_RESULT;
 static Status AddInitializerInNewLayout(ModelBuilder& model_builder,
                                        const std::string& name,
                                        const OperandType& source_operand_type,
-                                        DataLayout new_layout) {
+                                        DataLayout new_layout,
+                                        bool is_per_tensor_u8s8) {
  const auto& tensor = *model_builder.GetInitializerTensors().at(name);
  const Shape& shape = source_operand_type.dimensions;
  ORT_RETURN_IF_NOT(shape.size() == 4,
@ -322,6 +330,8 @@ static Status AddInitializerInNewLayout(ModelBuilder& model_builder,
  std::unique_ptr<uint8_t[]> buffer_holder(new uint8_t[operand_type.GetOperandBlobByteSize()]);
  uint8_t* buffer = buffer_holder.get();
  size_t element_size = operand_type.GetElementByteSize();
+
+  uint8_t bit_flip_val = is_per_tensor_u8s8 ? 0x80 : 0;
  for (uint32_t out = 0; out < out_t; out++) {
    for (uint32_t in = 0; in < in_t; in++) {
      for (uint32_t h = 0; h < h_t; h++) {
@ -345,7 +355,7 @@ static Status AddInitializerInNewLayout(ModelBuilder& model_builder,
          }

          for (size_t i = 0; i < element_size; i++) {
-            buffer[element_size * nnapi_idx + i] = src[element_size * onnx_idx + i];
+            buffer[element_size * nnapi_idx + i] = src[element_size * onnx_idx + i] ^ bit_flip_val;
          }
        }
      }
@ -355,13 +365,21 @@ static Status AddInitializerInNewLayout(ModelBuilder& model_builder,
  return model_builder.AddOperandFromPersistMemoryBuffer(name, &buffer[0], operand_type);
 }

-// TODO, replace this with more efficient code in optimizers
+// This is primarily used for adding the input B (an initializer) of MatMul/QlinearMatMul/Gemm (not transposed)
+// and transposing it, since NNAPI only supports A*B'
+//
+// If is_per_tensor_u8s8 is true, the QlinearMatMul is per-tensor u8s8 (input A is unsigned int8
+// and input B is signed int8). In this case, since NNAPI requires A and B to be the same type,
+// the initializer tensor B will be converted from int8 to uint8 by flipping the sign bit of
+// each byte with XOR 0x80, i.e. uint8(byte ^ 0x80) == int8(byte) + 128
 static Status AddInitializerTransposed(ModelBuilder& model_builder,
                                       const OperandType& source_operand_type,
-                                       const std::string& name) ORT_MUST_USE_RESULT;
+                                       const std::string& name,
+                                       bool is_per_tensor_u8s8) ORT_MUST_USE_RESULT;
 static Status AddInitializerTransposed(ModelBuilder& model_builder,
                                       const OperandType& source_operand_type,
-                                       const std::string& name) {
+                                       const std::string& name,
+                                       bool is_per_tensor_u8s8) {
  const auto& tensor = *model_builder.GetInitializerTensors().at(name);
  const Shape& shape = source_operand_type.dimensions;

@ -397,10 +415,11 @@ static Status AddInitializerTransposed(ModelBuilder& model_builder,
  std::unique_ptr<uint8_t[]> buffer_holder(new uint8_t[operand_type.GetOperandBlobByteSize()]);
  uint8_t* buffer = buffer_holder.get();
  size_t element_size = operand_type.GetElementByteSize();
+  uint8_t bit_flip_val = is_per_tensor_u8s8 ? 0x80 : 0;
  for (uint32_t x = 0; x < x_t; x++) {
    for (uint32_t y = 0; y < y_t; y++) {
      for (size_t i = 0; i < element_size; i++) {
-        buffer[element_size * (y * x_t + x) + i] = src[element_size * (x * y_t + y) + i];
+        buffer[element_size * (y * x_t + x) + i] = src[element_size * (x * y_t + y) + i] ^ bit_flip_val;
      }
    }
  }
@ -518,16 +537,26 @@ static Status GetBinaryOpQuantizationScaleAndZeroPoint(
  return Status::OK();
 }

-static Status GetConvOpQuantizationScaleAndZeroPoint(
+// Get scale and zero point for
+// [QlinearConv] input, weight, output
+// [QlinearMatMul] A, B, Y
+//
+// In case of u8s8 (input/A is uint8 and weight/B is int8):
+// If the QlinearConv is using per-channel u8s8, return the scales vector
+// If the Qlinear[Conv/MatMul] is using per-tensor u8s8, the weight/B tensor
+// will be converted to uint8 later; return the same scale with 128 as the zero point,
+// and set is_per_tensor_u8s8 to true so it can be used later for that conversion
+static Status GetConvMatMulOpQuantizationScaleAndZeroPoint(
    const ModelBuilder& model_builder, const Node& node,
    float& a_scale, float& w_scale, float& y_scale,
    int32_t& a_zero_point, int32_t& w_zero_point, int32_t& y_zero_point,
-    optional<vector<float>>& w_scales) ORT_MUST_USE_RESULT;
-static Status GetConvOpQuantizationScaleAndZeroPoint(
+    optional<vector<float>>& w_scales, bool& is_per_tensor_u8s8) ORT_MUST_USE_RESULT;
+static Status GetConvMatMulOpQuantizationScaleAndZeroPoint(
    const ModelBuilder& model_builder, const Node& node,
    float& a_scale, float& w_scale, float& y_scale,
    int32_t& a_zero_point, int32_t& w_zero_point, int32_t& y_zero_point,
-    optional<vector<float>>& w_scales) {
+    optional<vector<float>>& w_scales, bool& is_per_tensor_u8s8) {
+  is_per_tensor_u8s8 = false;
  // Get scale and zero points
  // We will handle per-channel weight scale and zero point later
  ORT_RETURN_IF_ERROR(
@ -543,14 +572,26 @@ static Status GetConvOpQuantizationScaleAndZeroPoint(
  if (weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_UINT8)
    return Status::OK();

-  // Now we have u8s8 QlinearConv
+  // This is per-tensor u8s8
+  // NNAPI does not support per-tensor u8s8
+  // For this case we will need to convert the int8 weight tensor to uint8
+  // And have same scale and 128 as zero point
+  // The conversion of the weight tensor itself will be done in the OpBuilder
+  const auto& scale_tensor = *initializers.at(input_defs[4]->Name());
+  int64_t scale_dim = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0];
+  if (scale_dim == 1) {
+    w_zero_point = 128;
+    is_per_tensor_u8s8 = true;
+    return Status::OK();
+  }
+
+  // Now we have u8s8 per-channel QlinearConv
  // u8s8 QlinearConv always have 0 as zero point so we are not getting it here
  // and we do not use w_scale here, so we reset them back to 0
  w_scale = 0.0f;
  w_zero_point = 0;

  // We need to copy the 1d scales array for per-channel quantization
-  const auto& scale_tensor = *initializers.at(input_defs[4]->Name());
  const auto* scales = GetTensorFloatData(scale_tensor);
  size_t scales_size = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0];
  vector<float> scales_vec(scales_size, 0.0f);
@ -1345,12 +1386,12 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N

  // this is for per-channel quantization weights
  optional<vector<float>> w_scales;
-
+  bool is_per_tensor_u8s8 = false;
  if (is_qlinear_conv) {
-    ORT_RETURN_IF_ERROR(GetConvOpQuantizationScaleAndZeroPoint(model_builder, node,
-                                                               x_scale, w_scale, y_scale,
-                                                               x_zero_point, w_zero_point, y_zero_point,
-                                                               w_scales));
+    ORT_RETURN_IF_ERROR(GetConvMatMulOpQuantizationScaleAndZeroPoint(model_builder, node,
+                                                                     x_scale, w_scale, y_scale,
+                                                                     x_zero_point, w_zero_point, y_zero_point,
+                                                                     w_scales, is_per_tensor_u8s8));
  }

  Shape onnx_weight_shape;
@ -1366,7 +1407,15 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
      onnx_weight_type = Type::TENSOR_QUANT8_ASYMM;
      break;
    case ONNX_NAMESPACE::TensorProto_DataType_INT8:
-      onnx_weight_type = Type::TENSOR_QUANT8_SYMM_PER_CHANNEL;
+      // We support both per-tensor and per-channel u8s8
+      // For per-tensor u8s8 we will convert the int8 weight to uint8
+      if (is_per_tensor_u8s8) {
+        // Per-Tensor u8s8
+        onnx_weight_type = Type::TENSOR_QUANT8_ASYMM;
+      } else {
+        // Per-Channel u8s8
+        onnx_weight_type = Type::TENSOR_QUANT8_SYMM_PER_CHANNEL;
+      }
      break;
    default:
      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
@ -1384,9 +1433,9 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N

  // Pre-process weights
  if (conv_2d || grouped_conv_2d) {
-    ORT_RETURN_IF_ERROR(AddInitializerInNewLayout(model_builder, weight, onnx_weight_operand_type, L_0231));
+    ORT_RETURN_IF_ERROR(AddInitializerInNewLayout(model_builder, weight, onnx_weight_operand_type, L_0231, is_per_tensor_u8s8));
  } else {  // depthwise_conv_2d
-    ORT_RETURN_IF_ERROR(AddInitializerInNewLayout(model_builder, weight, onnx_weight_operand_type, L_1230));
+    ORT_RETURN_IF_ERROR(AddInitializerInNewLayout(model_builder, weight, onnx_weight_operand_type, L_1230, is_per_tensor_u8s8));
  }

  if (is_qlinear_conv) {
@ -1697,10 +1746,14 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
          b_zero_point = 0,
          y_zero_point = 0;

+  bool is_per_tensor_u8s8 = false;
  if (is_qlinear_matmul) {
-    ORT_RETURN_IF_ERROR(GetBinaryOpQuantizationScaleAndZeroPoint(model_builder, node,
-                                                                 a_scale, b_scale, y_scale,
-                                                                 a_zero_point, b_zero_point, y_zero_point));
+    optional<vector<float>> w_scales;
+    ORT_RETURN_IF_ERROR(
+        GetConvMatMulOpQuantizationScaleAndZeroPoint(model_builder, node,
+                                                     a_scale, b_scale, y_scale,
+                                                     a_zero_point, b_zero_point, y_zero_point,
+                                                     w_scales, is_per_tensor_u8s8));
  }

  uint32_t input_2_idx;
@ -1717,7 +1770,7 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
      onnx_mat_b_shape.push_back(SafeInt<uint32_t>(dim));

    const OperandType onnx_mat_b_operand_type(onnx_mat_b_type, onnx_mat_b_shape, b_scale, b_zero_point);
-    ORT_RETURN_IF_ERROR(AddInitializerTransposed(model_builder, onnx_mat_b_operand_type, input2));
+    ORT_RETURN_IF_ERROR(AddInitializerTransposed(model_builder, onnx_mat_b_operand_type, input2, is_per_tensor_u8s8));
  }

  input_2_idx = operand_indices.at(input2);
--- a/onnxruntime/test/providers/cpu/math/quantize_linear_matmul_test.cc
+++ b/onnxruntime/test/providers/cpu/math/quantize_linear_matmul_test.cc
@ -58,15 +58,15 @@ TEST(QuantizeLinearMatmulOpTest, QLinearMatMul3D_U8S8) {
  test.AddInput<uint8_t>("a_zero_point", {}, {113});

  test.AddInput<int8_t>("T2", {2, 4, 3},
-                         {-43, 51, -34,
-                          60, 26, -17,
-                          0, 63, -55,
-                          47, -29, -31,
+                        {-43, 51, -34,
+                         60, 26, -17,
+                         0, 63, -55,
+                         47, -29, -31,

-                          -62, 51, -42,
-                          60, 26, -22,
-                          0, -8, -19,
-                          37, -2, -47});
+                         -62, 51, -42,
+                         60, 26, -22,
+                         0, -8, -19,
+                         37, -2, -47});

  test.AddInput<float>("b_scale", {}, {0.00802f});
  test.AddInput<int8_t>("b_zero_point", {}, {-2});
@ -83,6 +83,76 @@ TEST(QuantizeLinearMatmulOpTest, QLinearMatMul3D_U8S8) {
  test.Run();
 }

+TEST(QuantizeLinearMatmulOpTest, QLinearMatMul2D_U8U8) {
+  auto run_test = [](bool only_t1_not_initializer) {
+    OpTester test("QLinearMatMul", 10);
+    test.AddInput<uint8_t>("T1", {2, 4},
+                           {208, 236, 0, 238,
+                            3, 214, 255, 29});
+
+    test.AddInput<float>("a_scale", {}, {0.0066f}, only_t1_not_initializer);
+    test.AddInput<uint8_t>("a_zero_point", {}, {113}, only_t1_not_initializer);
+
+    test.AddInput<uint8_t>("T2", {4, 3},
+                           {152, 51, 244,
+                            60, 26, 255,
+                            0, 127, 246,
+                            127, 254, 247},
+                           only_t1_not_initializer);
+
+    test.AddInput<float>("b_scale", {}, {0.00705f}, only_t1_not_initializer);
+    test.AddInput<uint8_t>("b_zero_point", {}, {114}, only_t1_not_initializer);
+
+    test.AddInput<float>("y_scale", {}, {0.0107f}, only_t1_not_initializer);
+    test.AddInput<uint8_t>("y_zero_point", {}, {118}, only_t1_not_initializer);
+    test.AddOutput<uint8_t>("T3", {2, 3},
+                            {168, 115, 255,
+                             1, 66, 151});
+
+    test.Run();
+  };
+
+  run_test(false);
+
+  // NNAPI will require all inputs except T1 to be initializers
+  run_test(true);
+}
+
+TEST(QuantizeLinearMatmulOpTest, QLinearMatMul2D_U8S8) {
+  auto run_test = [](bool only_t1_not_initializer) {
+    OpTester test("QLinearMatMul", 10);
+    test.AddInput<uint8_t>("T1", {2, 4},
+                           {208, 126, 0, 238,
+                            3, 214, 255, 29});
+
+    test.AddInput<float>("a_scale", {}, {0.0066f}, only_t1_not_initializer);
+    test.AddInput<uint8_t>("a_zero_point", {}, {113}, only_t1_not_initializer);
+
+    test.AddInput<int8_t>("T2", {4, 3},
+                          {-43, 51, -34,
+                           60, 26, -17,
+                           0, 63, -55,
+                           47, -29, -31},
+                          only_t1_not_initializer);
+
+    test.AddInput<float>("b_scale", {}, {0.00802f}, only_t1_not_initializer);
+    test.AddInput<int8_t>("b_zero_point", {}, {0}, only_t1_not_initializer);
+
+    test.AddInput<float>("y_scale", {}, {0.0123f}, only_t1_not_initializer);
+    test.AddInput<uint8_t>("y_zero_point", {}, {118}, only_t1_not_initializer);
+    test.AddOutput<uint8_t>("T3", {2, 3},
+                            {129, 94, 113,
+                             147, 154, 104});
+
+    test.Run();
+  };
+
+  run_test(false);
+
+  // NNAPI will require all inputs except T1 to be initializers
+  run_test(true);
+}
+
 static void QLinearMatMul2DTest(bool only_t1_not_initializer) {
  // Test non-empty inputs
  OpTester test_non_empty("QLinearMatMul", 10);