Merge pull request #59 from Microsoft/kezhan/qlinearmatmul

add QLinearMatMul
2026-07-04 04:07:22 +00:00 · 2018-11-29 11:36:52 -08:00 · 2018-11-29 11:36:52 -08:00 · fafc48bf94
commit fafc48bf94
parent e8db06ed44 91c860bec4
1 changed file with 27 additions and 2 deletions
--- a/onnxruntime/contrib_ops/contrib_ops.cc
+++ b/onnxruntime/contrib_ops/contrib_ops.cc
@@ -12,8 +12,8 @@
 namespace onnxruntime {
 namespace contrib {
 using ::ONNX_NAMESPACE::AttributeProto;
-using ::ONNX_NAMESPACE::OpSchema;
 using ::ONNX_NAMESPACE::OPTIONAL;
+using ::ONNX_NAMESPACE::OpSchema;

 void RegisterContribSchemas() {
  ONNX_CONTRIB_OPERATOR_SCHEMA(SampleOp)
@@ -135,6 +135,31 @@ The linear de-quantization operator. It consumes a quantized data, a scale, a ze
 The dequantization formula is y = (x - x_zero_point) * x_scale.
 Scale and zero point must have same shape. They must be either scalar (per tensor) or 1-D tensor (per 'axis').)DOC");

+  ONNX_CONTRIB_OPERATOR_SCHEMA(QLinearMatMul)
+      .SetDomain(kMSDomain)
+      .SinceVersion(1)
+      .SetDoc(R"DOC(
+Matrix product that behaves like numpy.matmul: https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html.
+It consumes two quantized input tensors, their scales and zero points, and output's scale and zero point, and computes
+the quantized output. The quantization formula is x_quantized = (x_fp32 / x_scale) + x_zero_point. For (x_fp32 / x_scale),
+it computes the nearest integer value to arg (in floating-point format), rounding halfway cases away from zero.
+Scale and zero point must have same shape. They must be either scalar (per tensor) or 1-D tensor (per row for a and per column for b).
+If scale and zero point are 1D tensor, the number of elements of scale and zero point tensor of input 'a' and output 'y'
+should be equal to the number of rows of input 'a', and the number of elements of scale and zero point tensor of input 'b'
+should be equal to the number of columns of input 'b'.)DOC")
+      .Input(0, "a", "N-dimensional quantized matrix a", "T1")
+      .Input(1, "a_scale", "scale of quantized input a", "tensor(float)")
+      .Input(2, "a_zero_point", "zero point of quantized input a", "T1")
+      .Input(3, "b", "N-dimensional quantized matrix b", "T2")
+      .Input(4, "b_scale", "scale of quantized input b", "tensor(float)")
+      .Input(5, "b_zero_point", "zero point of quantized input b", "T2")
+      .Input(6, "y_scale", "scale of quantized output y", "tensor(float)")
+      .Input(7, "y_zero_point", "zero point of quantized output y", "T3")
+      .Output(0, "y", "Quantized matrix multiply results from a * b", "T3")
+      .TypeConstraint("T1", {"tensor(int8)", "tensor(uint8)"}, "Constrain input a and its zero point data types as 8-bit integer tensor")
+      .TypeConstraint("T2", {"tensor(int8)", "tensor(uint8)"}, "Constrain input b and its zero point data types as 8-bit integer tensor")
+      .TypeConstraint("T3", {"tensor(int8)", "tensor(uint8)"}, "Constrain output y and its zero point data types as 8-bit integer tensor.");
+
  const char* auto_pad_doc =
      "auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where "
      "default value is NOTSET, which means explicit padding is used. "
@@ -323,7 +348,7 @@ The integer convolution operator consumes an input tensor, a filter, and a paddi
 Matrix product that behaves like numpy.matmul: https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html.
 The production MUST never overflow. The accumulation may overflow if and only if in 32 bits.)DOC")
      .Input(0, "A", "N-dimensional matrix A", "T1")
-      .Input(0, "B", "N-dimensional matrix B", "T2")
+      .Input(1, "B", "N-dimensional matrix B", "T2")
      .Output(0, "Y", "Matrix multiply results from A * B", "T3")
      .TypeConstraint("T1", {"tensor(int8)", "tensor(uint8)"}, "Constrain input A data types as 8-bit integer tensor")
      .TypeConstraint("T2", {"tensor(int8)", "tensor(uint8)"}, "Constrain input B data types as 8-bit integer tensor")