From a82a0907e0afda1b0bd623792699e2a7cd0dc7eb Mon Sep 17 00:00:00 2001
From: linkerzhang <kezhan@microsoft.com>
Date: Fri, 23 Nov 2018 11:56:14 -0800
Subject: [PATCH] add ops for quantization support.

---
 onnxruntime/contrib_ops/contrib_ops.cc | 223 +++++++++++++++++++++++++
 1 file changed, 223 insertions(+)

diff --git a/onnxruntime/contrib_ops/contrib_ops.cc b/onnxruntime/contrib_ops/contrib_ops.cc
index 49c8a133ea..d698cdcbd3 100644
--- a/onnxruntime/contrib_ops/contrib_ops.cc
+++ b/onnxruntime/contrib_ops/contrib_ops.cc
@@ -104,6 +104,229 @@ The quantization formula is y = (x / y_scale) + y_zero_point. For (x / y_scale),
 The linear de-quantization operator. It consumes a quantized data, a scale, a zero point and computes the full precision data.
 The dequantization formula is y = (x - x_zero_point) * x_scale.
  Scale and zero point must have same shape. They must be either scalar (per tensor) or 1-D tensor (per 'axis').)DOC");
+
+  // Shared description of the `auto_pad` attribute, reused by the QLinearConv and ConvInteger schemas.
+  const char* auto_pad_doc =
+      "auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where default value is NOTSET, "
+      "which means explicit padding is used. SAME_UPPER or SAME_LOWER mean pad the input so that the "
+      "output size match the input. In case of odd number add the extra padding at the end for "
+      "SAME_UPPER and at the beginning for SAME_LOWER. VALID mean no padding.";
+
+  // QLinearConv: contrib-op schema (kMSDomain, since version 1) for convolution over 8-bit quantized tensors.
+  // Operands come in (tensor, scale, zero point) groups: inputs 0-2 describe x, inputs 3-5 describe w,
+  // inputs 6-7 carry the scale / zero point of output y, and input 8 is the optional bias B.
+  ONNX_CONTRIB_OPERATOR_SCHEMA(QLinearConv)
+      .SetDomain(kMSDomain)
+      .SinceVersion(1)
+      .SetDoc(R"DOC(
+The convolution operator consumes a quantized input tensor, its scale and zero point, 
+a quantized filter, its scale and zero point, and output's scale and zero point, 
+and computes the quantized output. Each scale and zero point pair must have same shape.
+It means they must be either scalars (per tensor) or 1-D tensors (per channel).)DOC")
+      .Input(
+          0,
+          "x",
+          "Input data tensor from previous layer; "
+          "has size (N x C x H x W), where N is the batch size, "
+          "C is the number of channels, and H and W are the "
+          "height and width. Note that this is for the 2D image. "
+          "Otherwise the size is (N x C x D1 x D2 ... x Dn). "
+          "Optionally, if dimension denotation is "
+          "in effect, the operation expects input data tensor "
+          "to arrive with the dimension denotation of [DATA_BATCH, "
+          "DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].",
+          "T1")
+      .Input(1, "x_scale", "Scale tensor for input 'x'. It could be a scalar or a 1-D tensor, which means a per-tensor or per-channel quantization. If it's a 1-D tensor, its number of elements should be equal to the number of channels of input 'x'.", "T3")
+      .Input(2, "x_zero_point", "Zero point tensor for input 'x'. It could be a scalar or a 1-D tensor, which means a per-tensor or per-channel quantization. If it's a 1-D tensor, its number of elements should be equal to the number of channels of input 'x'.", "T1")
+      .Input(
+          3,
+          "w",
+          "The weight tensor that will be used in the "
+          "convolutions; has size (M x C/group x kH x kW), where C "
+          "is the number of channels, and kH and kW are the "
+          "height and width of the kernel, and M is the number "
+          "of feature maps. For more than 2 dimensions, the "
+          "kernel shape will be (M x C/group x k1 x k2 x ... x kn), "
+          "where (k1 x k2 x ... kn) is the dimension of the kernel. "
+          "Optionally, if dimension denotation is in effect, "
+          "the operation expects the weight tensor to arrive "
+          "with the dimension denotation of [FILTER_OUT_CHANNEL, "
+          "FILTER_IN_CHANNEL, FILTER_SPATIAL, FILTER_SPATIAL ...]. "
+          "X.shape[1] == (W.shape[1] * group) == C "
+          "(assuming zero based indices for the shape array). "
+          "Or in other words FILTER_IN_CHANNEL should be equal to DATA_CHANNEL. ",
+          "T1")
+      .Input(4, "w_scale", "Scale tensor for input 'w'. It could be a scalar or a 1-D tensor, which means a per-tensor or per-channel quantization. If it's a 1-D tensor, its number of elements should be equal to the number of channels of input 'w'.", "T3")
+      .Input(5, "w_zero_point", "Zero point tensor for input 'w'. It could be a scalar or a 1-D tensor, which means a per-tensor or per-channel quantization. If it's a 1-D tensor, its number of elements should be equal to the number of channels of input 'w'.", "T1")
+      .Input(6, "y_scale", "Scale tensor for output 'y'. It could be a scalar or a 1-D tensor, which means a per-tensor or per-channel quantization. If it's a 1-D tensor, its number of elements should be equal to the number of channels of output 'y'.", "T3")
+      .Input(7, "y_zero_point", "Zero point tensor for output 'y'. It could be a scalar or a 1-D tensor, which means a per-tensor or per-channel quantization. If it's a 1-D tensor, its number of elements should be equal to the number of channels of output 'y'.", "T1")
+      .Input(8, "B", "Optional 1D bias to be added to the convolution, has size of M.", "T2", OpSchema::Optional)
+      .Output(
+          0,
+          "y",
+          "Output data tensor that contains the result of the "
+          "convolution. The output dimensions are functions "
+          "of the kernel size, stride size, and pad lengths.",
+          "T1")
+      .TypeConstraint(
+          "T1",
+          {"tensor(int8)", "tensor(uint8)"},
+          "Constrain input, filter, and output types to 8-bit integer tensors.")
+      .TypeConstraint("T2", {"tensor(int32)", "tensor(uint32)"}, "Constrain bias type to 32-bit integer tensor.")
+      .TypeConstraint("T3", {"tensor(float)"}, "Constrain scale of input, filter and output to float tensor.")
+      .Attr("auto_pad", auto_pad_doc, AttributeProto::STRING, std::string("NOTSET"))
+      .Attr(
+          "kernel_shape",
+          "The shape of the convolution kernel. If not present, should be inferred from input 'w'.",
+          AttributeProto::INTS,
+          OPTIONAL)
+      .Attr(
+          "dilations",
+          "dilation value along each axis of the filter. If not present, the dilation defaults to 1 along each axis.",
+          AttributeProto::INTS,
+          OPTIONAL)
+      .Attr(
+          "strides", "Stride along each axis. If not present, the stride defaults to 1 along each axis.", AttributeProto::INTS, OPTIONAL)
+      // Adjacent literals are concatenated verbatim, so every piece but the last ends with a space.
+      .Attr("pads",
+            "Padding for the beginning and ending along each axis, it can take any value greater than or equal to 0. "
+            "The value represent the number of pixels added to the beginning and end part of the corresponding axis. "
+            "`pads` format should be as follow [x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of "
+            "pixels added at the beginning of axis `i` and xi_end, the number of pixels added at the end of axis `i`. "
+            "This attribute cannot be used simultaneously with auto_pad attribute. If not present, the padding defaults "
+            "to 0 along start and end of each axis.",
+            AttributeProto::INTS, OPTIONAL)
+      .Attr(
+          "group",
+          "number of groups input channels and output channels are divided into. default is 1.",
+          AttributeProto::INT,
+          static_cast<int64_t>(1));
+
+  // ConvInteger: contrib-op schema (kMSDomain, since version 1) for integer convolution.
+  // Takes 8-bit x (T1) and w (T2) plus an optional padding value z (T1). The output y is bound to T3,
+  // the 32-bit constraint whose description covers output Y; T1 would restrict the result to 8 bits.
+  ONNX_CONTRIB_OPERATOR_SCHEMA(ConvInteger)
+      .SetDomain(kMSDomain)
+      .SinceVersion(1)
+      .SetDoc(R"DOC(
+The integer convolution operator consumes an input tensor, a filter, and a padding value,
+ and computes the output. The production MUST never overflow. The accumulation may overflow
+ if and only if in 32 bits.)DOC")
+      .Input(
+          0,
+          "x",
+          "Input data tensor from previous layer; "
+          "has size (N x C x H x W), where N is the batch size, "
+          "C is the number of channels, and H and W are the "
+          "height and width. Note that this is for the 2D image. "
+          "Otherwise the size is (N x C x D1 x D2 ... x Dn). "
+          "Optionally, if dimension denotation is "
+          "in effect, the operation expects input data tensor "
+          "to arrive with the dimension denotation of [DATA_BATCH, "
+          "DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].",
+          "T1")
+      .Input(
+          1,
+          "w",
+          "The weight tensor that will be used in the "
+          "convolutions; has size (M x C/group x kH x kW), where C "
+          "is the number of channels, and kH and kW are the "
+          "height and width of the kernel, and M is the number "
+          "of feature maps. For more than 2 dimensions, the "
+          "kernel shape will be (M x C/group x k1 x k2 x ... x kn), "
+          "where (k1 x k2 x ... kn) is the dimension of the kernel. "
+          "Optionally, if dimension denotation is in effect, "
+          "the operation expects the weight tensor to arrive "
+          "with the dimension denotation of [FILTER_OUT_CHANNEL, "
+          "FILTER_IN_CHANNEL, FILTER_SPATIAL, FILTER_SPATIAL ...]. "
+          "X.shape[1] == (W.shape[1] * group) == C "
+          "(assuming zero based indices for the shape array). "
+          "Or in other words FILTER_IN_CHANNEL should be equal to DATA_CHANNEL. ",
+          "T2")
+      .Input(2, "z", "Padding value (zero_point normally), it's optional and default value is 0.", "T1", OpSchema::Optional)
+      .Output(
+          0,
+          "y",
+          "Output data tensor that contains the result of the "
+          "convolution. The output dimensions are functions "
+          "of the kernel size, stride size, and pad lengths.",
+          "T3")
+      .TypeConstraint("T1", {"tensor(int8)", "tensor(uint8)"}, "Constrain input X and Z data types as 8-bit integer tensors")
+      .TypeConstraint("T2", {"tensor(int8)", "tensor(uint8)"}, "Constrain input W data types as 8-bit integer tensor")
+      .TypeConstraint("T3",
+                      {"tensor(int32)", "tensor(uint32)"},
+                      "Constrain output Y data types as 32-bits integer tensors. "
+                      "T3 must be tensor(uint32) when both T1 and T2 are tensor(uint8), "
+                      "or must be tensor(int32) when either T1 or T2 is tensor(int8).")
+      .Attr("auto_pad", auto_pad_doc, AttributeProto::STRING, std::string("NOTSET"))
+      .Attr(
+          "kernel_shape",
+          "The shape of the convolution kernel. If not present, should be inferred from input 'w'.",
+          AttributeProto::INTS,
+          OPTIONAL)
+      .Attr(
+          "dilations",
+          "dilation value along each axis of the filter. If not present, the dilation defaults to 1 along each axis.",
+          AttributeProto::INTS,
+          OPTIONAL)
+      .Attr(
+          "strides", "Stride along each axis. If not present, the stride defaults to 1 along each axis.", AttributeProto::INTS, OPTIONAL)
+      // Adjacent literals are concatenated verbatim, so every piece but the last ends with a space.
+      .Attr("pads",
+            "Padding for the beginning and ending along each axis, it can take any value greater than or equal to 0. "
+            "The value represent the number of pixels added to the beginning and end part of the corresponding axis. "
+            "`pads` format should be as follow [x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of "
+            "pixels added at the beginning of axis `i` and xi_end, the number of pixels added at the end of axis `i`. "
+            "This attribute cannot be used simultaneously with auto_pad attribute. If not present, the padding defaults "
+            "to 0 along start and end of each axis.",
+            AttributeProto::INTS, OPTIONAL)
+      .Attr(
+          "group",
+          "number of groups input channels and output channels are divided into. default is 1.",
+          AttributeProto::INT,
+          static_cast<int64_t>(1));
+
+  // MatMulInteger: contrib-op schema (kMSDomain, since version 1); 8-bit A (input 0) times 8-bit B (input 1) -> 32-bit Y.
+  ONNX_CONTRIB_OPERATOR_SCHEMA(MatMulInteger)
+      .SetDomain(kMSDomain)
+      .SinceVersion(1)
+      .SetDoc(R"DOC(
+Matrix product that behaves like numpy.matmul: https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html.
+ The production MUST never overflow. The accumulation may overflow if and only if in 32 bits.)DOC")
+      .Input(0, "A", "N-dimensional matrix A", "T1")
+      .Input(1, "B", "N-dimensional matrix B", "T2")
+      .Output(0, "Y", "Matrix multiply results from A * B", "T3")
+      .TypeConstraint("T1", {"tensor(int8)", "tensor(uint8)"}, "Constrain input A data types as 8-bit integer tensor")
+      .TypeConstraint("T2", {"tensor(int8)", "tensor(uint8)"}, "Constrain input B data types as 8-bit integer tensor")
+      .TypeConstraint("T3", {"tensor(int32)", "tensor(uint32)"},
+                      "Constrain output Y data types as 32-bit integer tensor. "
+                      "T3 must be tensor(uint32) when both T1 and T2 are tensor(uint8), "
+                      "or must be tensor(int32) when either T1 or T2 is tensor(int8).");
+
+  // ReduceSumInteger: contrib-op schema (kMSDomain, since version 1) for summing an 8-bit tensor
+  // along the given axes into a 32-bit result.
+  ONNX_CONTRIB_OPERATOR_SCHEMA(ReduceSumInteger)
+      .SetDomain(kMSDomain)
+      .SinceVersion(1)
+      .SetDoc(R"DOC(
+Computes the sum of the low-precision input tensor's element along the provided axes.
+The resulting tensor has the same rank as the input if keepdims equal 1. If keepdims equal 0,
+then the resulting tensor have the reduced dimension pruned. The above behavior is similar to numpy,
+with the exception that numpy default keepdims to False instead of True.)DOC")
+      .Input(0, "data", "An input tensor.", "T1")
+      .Output(0, "reduced", "Reduced output tensor.", "T2")
+      .TypeConstraint("T1", {"tensor(int8)", "tensor(uint8)"}, "Constrain input type to 8-bit integer tensor.")
+      .TypeConstraint("T2", {"tensor(int32)", "tensor(uint32)"},
+                      "Constrain output data type to 32-bit integer tensor. "
+                      "T2 must be tensor(uint32) when T1 is tensor(uint8), "
+                      "or must be tensor(int32) when T1 is tensor(int8).")
+      // Both attributes are declared non-required so the defaults their descriptions promise can apply.
+      .Attr("axes",
+            "A list of integers, along which to reduce. The default is to reduce over all the dimensions of the input tensor.",
+            AttributeProto::INTS, OPTIONAL)
+      .Attr("keepdims",
+            "Keep the reduced dimension or not, default 1 mean keep reduced dimension.",
+            AttributeProto::INT, static_cast<int64_t>(1));
 }
 
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, SampleOp);