From fedb68429cb219d8651e1c20f53d7f314154b033 Mon Sep 17 00:00:00 2001
From: Guoyu Wang <62914304+gwang-msft@users.noreply.github.com>
Date: Wed, 3 Mar 2021 15:44:49 -0800
Subject: [PATCH] [NNAPI EP] Add per-tensor u8s8 support for
 Qlinear[Conv/MatMul] (#6818)

* NNAPI Add per-tensor u8s8 support

* Update some comments

* Address CR comments

* Address CR comments
---
 .../nnapi/nnapi_builtin/builders/helper.cc    | 209 ++++++++++--------
 .../nnapi_builtin/builders/op_builder.cc      | 105 ++++++---
 .../cpu/math/quantize_linear_matmul_test.cc   |  86 ++++++-
 3 files changed, 273 insertions(+), 127 deletions(-)

diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc
index ce1e8c359e..c947b96569 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc
@@ -137,7 +137,9 @@ bool HasValidBinaryOpQuantizedInputs(const Node& node) {
 bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const Node& node,
                                 const std::vector<size_t>& indices, const OpSupportCheckParams& params) {
   const auto& op_type = node.OpType();
-  bool is_qlinear_conv = (op_type == "QLinearConv");
+  auto qlinear_op_type = GetQLinearOpType(node);
+  bool is_qlinear_conv = (qlinear_op_type == QLinearOpType::QLinearConv);
+  bool is_qlinear_matmul = (qlinear_op_type == QLinearOpType::QLinearMatMul);
   const auto input_defs(node.InputDefs());
   for (const auto idx : indices) {
     if (idx >= input_defs.size()) {
@@ -145,46 +147,53 @@ bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const
                             << " >= input number, " << input_defs.size();
       return false;
     }
+
     const auto scale_name = input_defs[idx]->Name();
-    if (Contains(initializers, scale_name)) {
-      const auto& scale_tensor = *initializers.at(scale_name);
-      int64_t scales_dim = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0];
-      bool is_conv_weight = is_qlinear_conv && idx == 4;
-      bool is_conv_u8s8_weight = false;
-
-      if (is_conv_weight) {
-        const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
-        is_conv_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8;
-      }
-
-      // We need to check the per-channel quantization scales dimensions for u8s8 QlinearConv
-      // We only support per-channel quantization for u8s8
-      // For all other cases, the scales should be a scalar
-      if (is_conv_u8s8_weight) {
-        if (params.android_sdk_ver < 29) {
-          LOGS_DEFAULT(VERBOSE) << op_type << " only supports per-channel quantization on Android API 29+, "
-                                << "system API level: " << params.android_sdk_ver;
-          return false;
-        }
-
-        const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
-        if (weight_tensor.dims()[0] != scales_dim) {
-          LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight,"
-                                << " weight dimension[0] " << weight_tensor.dims()[0]
-                                << " scale dimension " << scales_dim;
-          return false;
-        }
-      } else {
-        if (scales_dim != 1) {
-          LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, "
-                                << " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+";
-          return false;
-        }
-      }
-    } else {
-      LOGS_DEFAULT(VERBOSE) << "The scale of " << op_type << " must be known";
+    if (!Contains(initializers, scale_name)) {
+      LOGS_DEFAULT(VERBOSE) << "The scale of " << op_type << " must be an initializer tensor";
       return false;
     }
+
+    // If this op is Qlinear[Conv/MatMul], we want to check u8s8 support for weight tensor (or B tensor for QlinearMatMul)
+    bool is_conv_matmul_weight = (is_qlinear_conv || is_qlinear_matmul) && idx == 4;
+    bool is_conv_matmul_u8s8_weight = false;
+
+    if (is_conv_matmul_weight) {
+      const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
+      is_conv_matmul_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8;
+    }
+
+    const auto& scale_tensor = *initializers.at(scale_name);
+    int64_t scales_dim = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0];
+    if (!is_conv_matmul_u8s8_weight) {
+      if (scales_dim != 1) {
+        LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, "
+                              << " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+";
+        return false;
+      }
+    } else if (scales_dim != 1) {
+      // For u8s8 Qlinear[Conv/MatMul], we support
+      // 1. Per-tensor, the weight will be transformed to uint8 later
+      // 2. Per-channel, only from Android API level 29
+      if (is_qlinear_matmul) {
+        LOGS_DEFAULT(VERBOSE) << "QLinearMatMul does not support per-channel quantization";
+        return false;
+      }
+
+      if (params.android_sdk_ver < 29) {
+        LOGS_DEFAULT(VERBOSE) << op_type << " only supports per-channel quantization on Android API 29+, "
+                              << "system API level: " << params.android_sdk_ver;
+        return false;
+      }
+
+      const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
+      if (weight_tensor.dims()[0] != scales_dim) {
+        LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight,"
+                              << " weight dimension[0] " << weight_tensor.dims()[0]
+                              << " scale dimension " << scales_dim;
+        return false;
+      }
+    }
   }
 
   return true;
@@ -193,7 +202,9 @@ bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const
 bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, const Node& node,
                                     const std::vector<size_t>& indices) {
   const auto& op_type = node.OpType();
-  bool is_qlinear_conv = (op_type == "QLinearConv");
+  auto qlinear_op_type = GetQLinearOpType(node);
+  bool is_qlinear_conv = (qlinear_op_type == QLinearOpType::QLinearConv);
+  bool is_qlinear_matmul = (qlinear_op_type == QLinearOpType::QLinearMatMul);
   const auto input_defs(node.InputDefs());
   for (const auto idx : indices) {
     if (idx >= input_defs.size()) {
@@ -203,65 +214,77 @@ bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, co
     }
 
     const auto zero_point_name = input_defs[idx]->Name();
-    if (Contains(initializers, zero_point_name)) {
-      bool is_conv_weight = is_qlinear_conv && idx == 5;
-      bool is_conv_u8s8_weight = false;
-      if (is_conv_weight) {
-        const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
-        is_conv_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8;
-      }
+    if (!Contains(initializers, zero_point_name)) {
+      LOGS_DEFAULT(VERBOSE) << "The zero point of " << op_type << " must be an initializer tensor";
+      return false;
+    }
 
-      const auto& zero_tensor = *initializers.at(zero_point_name);
-      int64_t zero_dim = zero_tensor.dims().empty() ? 1 : zero_tensor.dims()[0];
-      if (is_conv_u8s8_weight) {
-        if (zero_tensor.data_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8) {
-          LOGS_DEFAULT(VERBOSE) << "u8s8 QlinearConv only supports int8 zero point for weight, "
-                                << "actual zero point type: [" << zero_tensor.data_type() << "]";
-          return false;
-        }
+    // Input 5 is the zero point of the weight (QLinearConv) or of B (QLinearMatMul); both can be
+    // u8s8, so QLinearMatMul must get the same int8 zero point checks below as QLinearConv
+    bool is_conv_matmul_weight = (is_qlinear_conv || is_qlinear_matmul) && idx == 5;
+    bool is_conv_matmul_u8s8_weight =
+        is_conv_matmul_weight &&
+        initializers.at(node.InputDefs()[3]->Name())->data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8;
 
-        // For onnx, u8s8 QlinearConv, the weight zero point can be a scalar,
-        // or a tensor with same channel as weight, for NNAPI we only support it be
-        // 0 (scalar) or all 0 (tensor), NNAPI will assume the zero point for per-channel
-        // quantization is 0 there is no input for it
-        const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
-        if (weight_tensor.dims()[0] != zero_dim && zero_dim != 1) {
-          LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight,"
-                                << " weight dimension[0] " << weight_tensor.dims()[0]
-                                << " zero point dimension " << zero_dim;
-          return false;
-        }
+    const auto& zero_tensor = *initializers.at(zero_point_name);
+    int64_t zero_dim = zero_tensor.dims().empty() ? 1 : zero_tensor.dims()[0];
 
-        std::unique_ptr<uint8_t[]> unpacked_tensor;
-        size_t tensor_byte_size;
-        auto status = onnxruntime::utils::UnpackInitializerData(
-            zero_tensor,
-            node.ModelPath(),
-            unpacked_tensor, tensor_byte_size);
-        if (!status.IsOK()) {
-          LOGS_DEFAULT(ERROR) << "QLinearConv erro when unpack zero tensor:" << status.ErrorMessage();
-          return false;
-        }
-
-        // Verify all onnx weight zero point(s) are 0(s)
-        const int8_t* zero_points = reinterpret_cast<const int8_t*>(unpacked_tensor.get());
-        for (size_t i = 0; i < tensor_byte_size; i++) {
-          if (zero_points[i] != 0) {
-            LOGS_DEFAULT(VERBOSE) << "QLinearConv only support 0 as zero point, "
-                                  << "zero_points[" << i << "] has value: " << zero_points[i];
-            return false;
-          }
-        }
-      } else {
-        if (zero_dim != 1) {
-          LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, "
-                                << " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+";
-          return false;
-        }
+    if (!is_conv_matmul_u8s8_weight) {
+      if (zero_dim != 1) {
+        LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, "
+                              << " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+";
+        return false;
       }
     } else {
-      LOGS_DEFAULT(VERBOSE) << "The zero point of " << op_type << " must be known";
-      return false;
+      // For u8s8 Qlinear[Conv/MatMul], we support
+      // 1. Per-tensor, the weight will be transformed to uint8 later
+      // 2. Per-channel, only from Android API level 29
+      if (zero_tensor.data_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8) {
+        LOGS_DEFAULT(VERBOSE) << "u8s8 Qlinear[Conv/MatMul] only supports int8 zero point for weight, "
+                              << "actual zero point type: [" << zero_tensor.data_type() << "]";
+        return false;
+      }
+
+      if (zero_dim != 1) {
+        if (is_qlinear_matmul) {
+          LOGS_DEFAULT(VERBOSE) << "QLinearMatMul does not support per-channel quantization";
+          return false;
+        }
+      }
+
+      // For onnx, u8s8 QlinearConv, the weight zero point can be a scalar,
+      // or a tensor with same channel as weight, for NNAPI we only support it be
+      // 0 (scalar) or all 0 (tensor), NNAPI will assume the zero point for per-channel
+      // quantization is 0 there is no input for it
+      const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
+      if (weight_tensor.dims()[0] != zero_dim && zero_dim != 1) {
+        LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight,"
+                              << " weight dimension[0] " << weight_tensor.dims()[0]
+                              << " zero point dimension " << zero_dim;
+        return false;
+      }
+
+      std::unique_ptr<uint8_t[]> unpacked_tensor;
+      size_t tensor_byte_size;
+      auto status = onnxruntime::utils::UnpackInitializerData(
+          zero_tensor,
+          node.ModelPath(),
+          unpacked_tensor, tensor_byte_size);
+      if (!status.IsOK()) {
+        LOGS_DEFAULT(ERROR) << "Qlinear[Conv/MatMul] error when unpack zero tensor: " << zero_point_name
+                            << ", error msg: " << status.ErrorMessage();
+        return false;
+      }
+
+      // Verify all onnx weight zero point(s) are 0(s). When logging, the int8_t value is
+      // cast to int, otherwise the stream would print it as a character
+      const int8_t* zero_points = reinterpret_cast<const int8_t*>(unpacked_tensor.get());
+      for (size_t i = 0; i < tensor_byte_size; i++) {
+        if (zero_points[i] == 0) continue;
+        LOGS_DEFAULT(VERBOSE) << "u8s8 Qlinear[Conv/MatMul]  only support 0 as zero point, "
+                              << "zero_points[" << i << "] has value: " << static_cast<int>(zero_points[i]);
+        return false;
+      }
     }
   }
 
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc
index e1a438262c..a26a8f1c4a 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc
@@ -272,15 +272,23 @@ enum DataLayout {
   L_1230 = 1,
 };
 
-// TODO, replace this with more efficient code in optimizers
+// This is primarily used for adding the weight (an initializer) of Conv/QlinearConv
+// and performing the layout change from ONNX -> NNAPI.
+// If is_per_tensor_u8s8 is true, the QlinearConv is per-tensor u8s8 (input X is unsigned int8,
+// weight W is signed int8, and W is per-tensor (NOT per-channel) quantized). In this case,
+// since NNAPI requires X and W to be the same type for per-tensor quantization,
+// the initializer tensor W will be converted from int8 to uint8 by XORing each byte with 0x80
+// (byte ^ 0x80 == byte + 128, modulo 256)
 static Status AddInitializerInNewLayout(ModelBuilder& model_builder,
                                         const std::string& name,
                                         const OperandType& source_operand_type,
-                                        DataLayout new_layout) ORT_MUST_USE_RESULT;
+                                        DataLayout new_layout,
+                                        bool is_per_tensor_u8s8) ORT_MUST_USE_RESULT;
 static Status AddInitializerInNewLayout(ModelBuilder& model_builder,
                                         const std::string& name,
                                         const OperandType& source_operand_type,
-                                        DataLayout new_layout) {
+                                        DataLayout new_layout,
+                                        bool is_per_tensor_u8s8) {
   const auto& tensor = *model_builder.GetInitializerTensors().at(name);
   const Shape& shape = source_operand_type.dimensions;
   ORT_RETURN_IF_NOT(shape.size() == 4,
@@ -322,6 +330,8 @@ static Status AddInitializerInNewLayout(ModelBuilder& model_builder,
   std::unique_ptr<uint8_t[]> buffer_holder(new uint8_t[operand_type.GetOperandBlobByteSize()]);
   uint8_t* buffer = buffer_holder.get();
   size_t element_size = operand_type.GetElementByteSize();
+
+  uint8_t bit_flip_val = is_per_tensor_u8s8 ? 0x80 : 0;
   for (uint32_t out = 0; out < out_t; out++) {
     for (uint32_t in = 0; in < in_t; in++) {
       for (uint32_t h = 0; h < h_t; h++) {
@@ -345,7 +355,7 @@ static Status AddInitializerInNewLayout(ModelBuilder& model_builder,
           }
 
           for (size_t i = 0; i < element_size; i++) {
-            buffer[element_size * nnapi_idx + i] = src[element_size * onnx_idx + i];
+            buffer[element_size * nnapi_idx + i] = src[element_size * onnx_idx + i] ^ bit_flip_val;
           }
         }
       }
@@ -355,13 +365,21 @@ static Status AddInitializerInNewLayout(ModelBuilder& model_builder,
   return model_builder.AddOperandFromPersistMemoryBuffer(name, &buffer[0], operand_type);
 }
 
-// TODO, replace this with more efficient code in optimizers
+// This is primarily used for adding the input B (an initializer) of MatMul/QlinearMatMul/Gemm (not transposed)
+// and transposing it, since NNAPI only supports A*B'
+//
+// If is_per_tensor_u8s8 is true, the QlinearMatMul is per-tensor u8s8 (input A is unsigned int8
+// and input B is signed int8). In this case, since NNAPI requires A and B to be the same type,
+// the initializer tensor B will be converted from int8 to uint8 by XORing each byte with 0x80
+// (byte ^ 0x80 == byte + 128, modulo 256)
 static Status AddInitializerTransposed(ModelBuilder& model_builder,
                                        const OperandType& source_operand_type,
-                                       const std::string& name) ORT_MUST_USE_RESULT;
+                                       const std::string& name,
+                                       bool is_per_tensor_u8s8) ORT_MUST_USE_RESULT;
 static Status AddInitializerTransposed(ModelBuilder& model_builder,
                                        const OperandType& source_operand_type,
-                                       const std::string& name) {
+                                       const std::string& name,
+                                       bool is_per_tensor_u8s8) {
   const auto& tensor = *model_builder.GetInitializerTensors().at(name);
   const Shape& shape = source_operand_type.dimensions;
 
@@ -397,10 +415,11 @@ static Status AddInitializerTransposed(ModelBuilder& model_builder,
   std::unique_ptr<uint8_t[]> buffer_holder(new uint8_t[operand_type.GetOperandBlobByteSize()]);
   uint8_t* buffer = buffer_holder.get();
   size_t element_size = operand_type.GetElementByteSize();
+  uint8_t bit_flip_val = is_per_tensor_u8s8 ? 0x80 : 0;
   for (uint32_t x = 0; x < x_t; x++) {
     for (uint32_t y = 0; y < y_t; y++) {
       for (size_t i = 0; i < element_size; i++) {
-        buffer[element_size * (y * x_t + x) + i] = src[element_size * (x * y_t + y) + i];
+        buffer[element_size * (y * x_t + x) + i] = src[element_size * (x * y_t + y) + i] ^ bit_flip_val;
       }
     }
   }
@@ -518,16 +537,26 @@ static Status GetBinaryOpQuantizationScaleAndZeroPoint(
   return Status::OK();
 }
 
-static Status GetConvOpQuantizationScaleAndZeroPoint(
+// Get scale and zero point for
+// [QlinearConv] input, weight, output
+// [QlinearMatMul] A, B, Y
+//
+// In case of u8s8 (input/A is uint8 and weight/B is int8):
+// If the QlinearConv is using per-channel u8s8, return the scales vector.
+// If the Qlinear[Conv/MatMul] is using per-tensor u8s8, the weight/B tensor
+// will be converted to uint8 later; this returns the same scale and 128 as the zero point,
+// and sets is_per_tensor_u8s8 to true to be used later
+static Status GetConvMatMulOpQuantizationScaleAndZeroPoint(
     const ModelBuilder& model_builder, const Node& node,
     float& a_scale, float& w_scale, float& y_scale,
     int32_t& a_zero_point, int32_t& w_zero_point, int32_t& y_zero_point,
-    optional<vector<float>>& w_scales) ORT_MUST_USE_RESULT;
-static Status GetConvOpQuantizationScaleAndZeroPoint(
+    optional<vector<float>>& w_scales, bool& is_per_tensor_u8s8) ORT_MUST_USE_RESULT;
+static Status GetConvMatMulOpQuantizationScaleAndZeroPoint(
     const ModelBuilder& model_builder, const Node& node,
     float& a_scale, float& w_scale, float& y_scale,
     int32_t& a_zero_point, int32_t& w_zero_point, int32_t& y_zero_point,
-    optional<vector<float>>& w_scales) {
+    optional<vector<float>>& w_scales, bool& is_per_tensor_u8s8) {
+  is_per_tensor_u8s8 = false;
   // Get scale and zero points
   // We will handle per-channel weight scale and zero point later
   ORT_RETURN_IF_ERROR(
@@ -543,14 +572,26 @@ static Status GetConvOpQuantizationScaleAndZeroPoint(
   if (weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_UINT8)
     return Status::OK();
 
-  // Now we have u8s8 QlinearConv
+  // This is per-tensor u8s8
+  // NNAPI does not support per-tensor u8s8
+  // For this case we will need to convert the int8 weight tensor to uint8
+  // And have same scale and 128 as zero point
+  // The conversion of the weight tensor itself will be done in the OpBuilder
+  const auto& scale_tensor = *initializers.at(input_defs[4]->Name());
+  int64_t scale_dim = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0];
+  if (scale_dim == 1) {
+    w_zero_point = 128;
+    is_per_tensor_u8s8 = true;
+    return Status::OK();
+  }
+
+  // Now we have u8s8 per-channel QlinearConv
   // u8s8 QlinearConv always have 0 as zero point so we are not getting it here
   // and we do not use w_scale here, so we reset them back to 0
   w_scale = 0.0f;
   w_zero_point = 0;
 
   // We need to copy the 1d scales array for per-channel quantization
-  const auto& scale_tensor = *initializers.at(input_defs[4]->Name());
   const auto* scales = GetTensorFloatData(scale_tensor);
   size_t scales_size = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0];
   vector<float> scales_vec(scales_size, 0.0f);
@@ -1345,12 +1386,12 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
 
   // this is for per-channel quantization weights
   optional<vector<float>> w_scales;
-
+  bool is_per_tensor_u8s8 = false;
   if (is_qlinear_conv) {
-    ORT_RETURN_IF_ERROR(GetConvOpQuantizationScaleAndZeroPoint(model_builder, node,
-                                                               x_scale, w_scale, y_scale,
-                                                               x_zero_point, w_zero_point, y_zero_point,
-                                                               w_scales));
+    ORT_RETURN_IF_ERROR(GetConvMatMulOpQuantizationScaleAndZeroPoint(model_builder, node,
+                                                                     x_scale, w_scale, y_scale,
+                                                                     x_zero_point, w_zero_point, y_zero_point,
+                                                                     w_scales, is_per_tensor_u8s8));
   }
 
   Shape onnx_weight_shape;
@@ -1366,7 +1407,15 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
       onnx_weight_type = Type::TENSOR_QUANT8_ASYMM;
       break;
     case ONNX_NAMESPACE::TensorProto_DataType_INT8:
-      onnx_weight_type = Type::TENSOR_QUANT8_SYMM_PER_CHANNEL;
+      // We support both per-tensor and per-channel u8s8
+      // For per-tensor u8s8 we will convert the int8 weight to uint8
+      if (is_per_tensor_u8s8) {
+        // Per-Tensor u8s8
+        onnx_weight_type = Type::TENSOR_QUANT8_ASYMM;
+      } else {
+        // Per-Channel u8s8
+        onnx_weight_type = Type::TENSOR_QUANT8_SYMM_PER_CHANNEL;
+      }
       break;
     default:
       return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
@@ -1384,9 +1433,9 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
 
   // Pre-process weights
   if (conv_2d || grouped_conv_2d) {
-    ORT_RETURN_IF_ERROR(AddInitializerInNewLayout(model_builder, weight, onnx_weight_operand_type, L_0231));
+    ORT_RETURN_IF_ERROR(AddInitializerInNewLayout(model_builder, weight, onnx_weight_operand_type, L_0231, is_per_tensor_u8s8));
   } else {  // depthwise_conv_2d
-    ORT_RETURN_IF_ERROR(AddInitializerInNewLayout(model_builder, weight, onnx_weight_operand_type, L_1230));
+    ORT_RETURN_IF_ERROR(AddInitializerInNewLayout(model_builder, weight, onnx_weight_operand_type, L_1230, is_per_tensor_u8s8));
   }
 
   if (is_qlinear_conv) {
@@ -1697,10 +1746,14 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
           b_zero_point = 0,
           y_zero_point = 0;
 
+  bool is_per_tensor_u8s8 = false;
   if (is_qlinear_matmul) {
-    ORT_RETURN_IF_ERROR(GetBinaryOpQuantizationScaleAndZeroPoint(model_builder, node,
-                                                                 a_scale, b_scale, y_scale,
-                                                                 a_zero_point, b_zero_point, y_zero_point));
+    optional<vector<float>> w_scales;
+    ORT_RETURN_IF_ERROR(
+        GetConvMatMulOpQuantizationScaleAndZeroPoint(model_builder, node,
+                                                     a_scale, b_scale, y_scale,
+                                                     a_zero_point, b_zero_point, y_zero_point,
+                                                     w_scales, is_per_tensor_u8s8));
   }
 
   uint32_t input_2_idx;
@@ -1717,7 +1770,7 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
       onnx_mat_b_shape.push_back(SafeInt<uint32_t>(dim));
 
     const OperandType onnx_mat_b_operand_type(onnx_mat_b_type, onnx_mat_b_shape, b_scale, b_zero_point);
-    ORT_RETURN_IF_ERROR(AddInitializerTransposed(model_builder, onnx_mat_b_operand_type, input2));
+    ORT_RETURN_IF_ERROR(AddInitializerTransposed(model_builder, onnx_mat_b_operand_type, input2, is_per_tensor_u8s8));
   }
 
   input_2_idx = operand_indices.at(input2);
diff --git a/onnxruntime/test/providers/cpu/math/quantize_linear_matmul_test.cc b/onnxruntime/test/providers/cpu/math/quantize_linear_matmul_test.cc
index 0c95ff70e1..bda483061b 100644
--- a/onnxruntime/test/providers/cpu/math/quantize_linear_matmul_test.cc
+++ b/onnxruntime/test/providers/cpu/math/quantize_linear_matmul_test.cc
@@ -58,15 +58,15 @@ TEST(QuantizeLinearMatmulOpTest, QLinearMatMul3D_U8S8) {
   test.AddInput<uint8_t>("a_zero_point", {}, {113});
 
   test.AddInput<int8_t>("T2", {2, 4, 3},
-                         {-43, 51, -34,
-                          60, 26, -17,
-                          0, 63, -55,
-                          47, -29, -31,
+                        {-43, 51, -34,
+                         60, 26, -17,
+                         0, 63, -55,
+                         47, -29, -31,
 
-                          -62, 51, -42,
-                          60, 26, -22,
-                          0, -8, -19,
-                          37, -2, -47});
+                         -62, 51, -42,
+                         60, 26, -22,
+                         0, -8, -19,
+                         37, -2, -47});
 
   test.AddInput<float>("b_scale", {}, {0.00802f});
   test.AddInput<int8_t>("b_zero_point", {}, {-2});
@@ -83,6 +83,76 @@ TEST(QuantizeLinearMatmulOpTest, QLinearMatMul3D_U8S8) {
   test.Run();
 }
 
+TEST(QuantizeLinearMatmulOpTest, QLinearMatMul2D_U8U8) {
+  auto run_test = [](bool only_t1_not_initializer) {
+    OpTester test("QLinearMatMul", 10);
+    test.AddInput<uint8_t>("T1", {2, 4},
+                           {208, 236, 0, 238,
+                            3, 214, 255, 29});
+
+    test.AddInput<float>("a_scale", {}, {0.0066f}, only_t1_not_initializer);
+    test.AddInput<uint8_t>("a_zero_point", {}, {113}, only_t1_not_initializer);
+
+    test.AddInput<uint8_t>("T2", {4, 3},
+                           {152, 51, 244,
+                            60, 26, 255,
+                            0, 127, 246,
+                            127, 254, 247},
+                           only_t1_not_initializer);
+
+    test.AddInput<float>("b_scale", {}, {0.00705f}, only_t1_not_initializer);
+    test.AddInput<uint8_t>("b_zero_point", {}, {114}, only_t1_not_initializer);
+
+    test.AddInput<float>("y_scale", {}, {0.0107f}, only_t1_not_initializer);
+    test.AddInput<uint8_t>("y_zero_point", {}, {118}, only_t1_not_initializer);
+    test.AddOutput<uint8_t>("T3", {2, 3},
+                            {168, 115, 255,
+                             1, 66, 151});
+
+    test.Run();
+  };
+
+  run_test(false);
+
+  // NNAPI will require all inputs except T1 to be initializers
+  run_test(true);
+}
+
+TEST(QuantizeLinearMatmulOpTest, QLinearMatMul2D_U8S8) {
+  auto run_test = [](bool only_t1_not_initializer) {
+    OpTester test("QLinearMatMul", 10);
+    test.AddInput<uint8_t>("T1", {2, 4},
+                           {208, 126, 0, 238,
+                            3, 214, 255, 29});
+
+    test.AddInput<float>("a_scale", {}, {0.0066f}, only_t1_not_initializer);
+    test.AddInput<uint8_t>("a_zero_point", {}, {113}, only_t1_not_initializer);
+
+    test.AddInput<int8_t>("T2", {4, 3},
+                          {-43, 51, -34,
+                           60, 26, -17,
+                           0, 63, -55,
+                           47, -29, -31},
+                          only_t1_not_initializer);
+
+    test.AddInput<float>("b_scale", {}, {0.00802f}, only_t1_not_initializer);
+    test.AddInput<int8_t>("b_zero_point", {}, {0}, only_t1_not_initializer);
+
+    test.AddInput<float>("y_scale", {}, {0.0123f}, only_t1_not_initializer);
+    test.AddInput<uint8_t>("y_zero_point", {}, {118}, only_t1_not_initializer);
+    test.AddOutput<uint8_t>("T3", {2, 3},
+                            {129, 94, 113,
+                             147, 154, 104});
+
+    test.Run();
+  };
+
+  run_test(false);
+
+  // NNAPI will require all inputs except T1 to be initializers
+  run_test(true);
+}
+
 static void QLinearMatMul2DTest(bool only_t1_not_initializer) {
   // Test non-empty inputs
   OpTester test_non_empty("QLinearMatMul", 10);