Add schema for new Qops (#2611)

* Add schema for new Qops * adding shape inference + qlinearaveragepool * plus review comments * plus review comments * updates per review comments * plus review comments
2026-07-13 18:08:13 +00:00 · 2019-12-23 10:19:28 -08:00 · 2019-12-23 10:19:28 -08:00 · cd6ff42442
commit cd6ff42442
parent 71940c0915
1 changed files with 421 additions and 1 deletions
--- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
+++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
@ -162,7 +162,6 @@ void convTransposeWithDynamicPadsShapeInference(InferenceContext& ctx) {
    return;
  }
 }
-
 }  // namespace ONNX_NAMESPACE

 namespace onnxruntime {
@ -202,6 +201,126 @@ void NchwcGlobalPoolOpSchemaGenerator(OpSchema& schema) {
  });
 }

+void ValidateTypeAndShapeForScaleAndZP(ONNX_NAMESPACE::InferenceContext& ctx, int index, ::google::protobuf::int32 expectedType, bool isScalar, int expectedTensorSize = 0) {
+  if (ctx.getNumInputs() > static_cast<size_t>(index)) {
+    auto data_type = ctx.getInputType(index);
+    if (nullptr == data_type || data_type->value_case() != ONNX_NAMESPACE::TypeProto::kTensorType ||
+        data_type->tensor_type().elem_type() != expectedType) {
+      fail_type_inference(
+          "Input data type does not match the expected data type. Current data type is ", data_type->tensor_type().elem_type());
+    }
+  }
+
+  if (hasInputShape(ctx, index)) {
+    ONNX_NAMESPACE::TensorShapeProto shape = ctx.getInputType(index)->tensor_type().shape();
+    if (isScalar) {
+      if (shape.dim_size() != 0) {
+        fail_type_inference("Scale and Zero-point must be a scalar");
+      }
+    } else {
+      if (shape.dim_size() != 1) {
+        fail_type_inference("Scale and Zero-point must be of rank 1");
+      }
+
+      if (shape.dim((int)0).has_dim_value() && shape.dim((int)0).dim_value() != expectedTensorSize) {
+        fail_type_inference(
+            "Scale and Zero-point must be of rank 1 and the number of elements should be equal to the number of rows of the corresponding input.");
+      }
+    }
+  }
+}
+
+std::function<void(OpSchema&)> QLinearMathDocGenerator(const char* name, const char* additionalDocumentation) {
+  return [=](OpSchema& schema) {
+    std::string doc = R"DOC(
+Performs element-wise binary {name} on 8 bit data types (with Numpy-style broadcasting support).
+
+{additionalDocumentation}
+)DOC";
+    ONNX_NAMESPACE::ReplaceAll(doc, "{name}", name);
+    ONNX_NAMESPACE::ReplaceAll(doc, "{additionalDocumentation}", additionalDocumentation);
+    schema.SetDoc(doc);
+    schema.Input(0, "A", "First operand.", "T");
+    schema.Input(
+        1,
+        "A_scale",
+        "Input A's scale. It's a scalar, which means a per-tensor/layer quantization.",
+        "tensor(float)");
+    schema.Input(
+        2,
+        "A_zero_point",
+        "Input A zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
+        "T",
+        OpSchema::Optional);
+    schema.Input(3, "B", "Second operand.", "T");
+    schema.Input(
+        4,
+        "B_scale",
+        "Input B's scale. It's a scalar, which means a per-tensor/layer quantization.",
+        "tensor(float)");
+    schema.Input(
+        5,
+        "B_zero_point",
+        "Input B zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
+        "T",
+        OpSchema::Optional);
+    schema.Input(
+        6,
+        "C_scale",
+        "Output scale. It's a scalar, which means a per-tensor/layer quantization.",
+        "tensor(float)");
+    schema.Input(
+        7,
+        "C_zero_point",
+        "Output zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
+        "T",
+        OpSchema::Optional);
+    schema.Output(0, "C", "Result, has same element type as two inputs", "T");
+    schema.TypeConstraint("T", {"tensor(uint8)", "tensor(int8)"}, "Constrain input and output types to 8 bit signed and unsigned tensors.");
+    schema.TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
+      propagateElemTypeFromInputToOutput(ctx, 0, 0);
+
+      auto a_type = ctx.getInputType(0);
+      auto b_type = ctx.getInputType(3);
+
+      if (nullptr == a_type || nullptr == b_type ||
+          a_type->value_case() != ONNX_NAMESPACE::TypeProto::kTensorType ||
+          b_type->value_case() != ONNX_NAMESPACE::TypeProto::kTensorType) {
+        fail_type_inference("inputs are expected to have tensor type.");
+      }
+
+      // validate scale and zero points
+      ValidateTypeAndShapeForScaleAndZP(ctx, 1, ONNX_NAMESPACE::TensorProto::FLOAT, true);
+      ValidateTypeAndShapeForScaleAndZP(ctx, 2, a_type->tensor_type().elem_type(), true);
+      ValidateTypeAndShapeForScaleAndZP(ctx, 4, ONNX_NAMESPACE::TensorProto::FLOAT, true);
+      ValidateTypeAndShapeForScaleAndZP(ctx, 5, b_type->tensor_type().elem_type(), true);
+      ValidateTypeAndShapeForScaleAndZP(ctx, 6, ONNX_NAMESPACE::TensorProto::FLOAT, true);
+      ValidateTypeAndShapeForScaleAndZP(ctx, 7, a_type->tensor_type().elem_type(), true);
+
+      if (hasInputShape(ctx, 0) && hasInputShape(ctx, 3))
+        bidirectionalBroadcastShapeInference(
+            ctx.getInputType(0)->tensor_type().shape(),
+            ctx.getInputType(3)->tensor_type().shape(),
+            *ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape());
+    });
+  };
+}
+
+const char* contrib_ops_pads_doc =
+    "Padding for the beginning and ending along each spatial axis, it can take any value greater "
+    "than or equal to 0. The value represent the number of pixels added to the beginning "
+    "and end part of the corresponding axis. `pads` format should be as follow "
+    "[x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels "
+    "added at the beginning of axis `i` and xi_end, the number of pixels added at "
+    "the end of axis `i`. This attribute cannot be used simultaneously with "
+    "auto_pad attribute. If not present, the padding defaults to 0 along start and end of each spatial axis.";
+const char* contrib_ops_auto_pad_doc =
+    "auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where "
+    "default value is NOTSET, which means explicit padding is used. "
+    "SAME_UPPER or SAME_LOWER mean pad the input so that the output spatial size match the input."
+    "In case of odd number add the extra padding at the end for SAME_UPPER and at the "
+    "beginning for SAME_LOWER. VALID mean no padding.";
+
 void RegisterNchwcSchemas() {
  ONNX_CONTRIB_OPERATOR_SCHEMA(ReorderInput)
      .SetDomain(kMSNchwcDomain)
@ -1541,6 +1660,307 @@ with the exception that numpy default keepdims to False instead of True.)DOC")
          "Keep the reduced dimension or not, default 1 mean keep reduced dimension.",
          AttributeProto::INT);

+  ONNX_CONTRIB_OPERATOR_SCHEMA(QLinearAdd)
+      .SetDomain(kMSDomain)
+      .SinceVersion(1)
+      .FillUsing(QLinearMathDocGenerator("addition",
+                                         "C = (A_scale * (A - A_zero_point) + B_scale * (B - B_zero_point))/C_scale + C_zero_point"));
+
+  ONNX_CONTRIB_OPERATOR_SCHEMA(QLinearMul)
+      .SetDomain(kMSDomain)
+      .SinceVersion(1)
+      .FillUsing(QLinearMathDocGenerator("multiplication",
+                                         "C = ((A - A_zero_point) * (B - B_zero_point)) * (A_scale * B_scale)/C_scale + C_zero_point"));
+
+  ONNX_CONTRIB_OPERATOR_SCHEMA(QLinearReduceMean)
+      .SetDomain(kMSDomain)
+      .SinceVersion(1)
+      .SetDoc(R"DOC(
+Computes the mean of the low-precision input tensor's element along the provided axes.
+The resulting tensor has the same rank as the input if keepdims equal 1. If keepdims equal 0,
+then the resulting tensor have the reduced dimension pruned. The above behavior is similar to numpy,
+with the exception that numpy default keepdims to False instead of True.
+Input and Output scales and zero points are used to requantize the output in a new range. 
+This helps to improve accuracy as after ReduceMean operation the range of the output is expected to decrease.
+
+```
+"Output = Dequantize(Input) -> ReduceMean on fp32 data -> Quantize(output)",
+
+```
+)DOC")
+      .Input(0, "data", "An input tensor.", "T")
+      .Input(
+          1,
+          "data_scale",
+          "Input scale. It's a scalar, which means a per-tensor/layer quantization.",
+          "tensor(float)")
+      .Input(
+          2,
+          "data_zero_point",
+          "Input zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
+          "T",
+          OpSchema::Optional)
+      .Input(
+          3,
+          "reduced_scale",
+          "Output scale. It's a scalar, which means a per-tensor/layer quantization.",
+          "tensor(float)")
+      .Input(
+          4,
+          "reduced_zero_point",
+          "Output zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
+          "T",
+          OpSchema::Optional)
+      .Output(0, "reduced", "Reduced output tensor.", "T")
+      .TypeConstraint("T", {"tensor(uint8)", "tensor(int8)"}, "Constrain input types to 8 bit signed and unsigned tensors.")
+      .Attr(
+          "axes",
+          "A list of integers, along which to reduce. The default is to reduce over all the dimensions of the input tensor.",
+          AttributeProto::INTS)
+      .Attr(
+          "keepdims",
+          "Keep the reduced dimension or not, default 1 mean keep reduced dimension.",
+          AttributeProto::INT)
+      .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
+        propagateElemTypeFromInputToOutput(ctx, 0, 0);
+
+        if (!hasNInputShapes(ctx, 1)) {
+          return;
+        }
+
+        auto data_type = ctx.getInputType(0);
+        if (nullptr == data_type || data_type->value_case() != ONNX_NAMESPACE::TypeProto::kTensorType) {
+          fail_type_inference("inputs are expected to have tensor type.");
+        }
+
+        // validate scale and zero points
+        ValidateTypeAndShapeForScaleAndZP(ctx, 1, ONNX_NAMESPACE::TensorProto::FLOAT, true);
+        ValidateTypeAndShapeForScaleAndZP(ctx, 2, data_type->tensor_type().elem_type(), true);
+        ValidateTypeAndShapeForScaleAndZP(ctx, 3, ONNX_NAMESPACE::TensorProto::FLOAT, true);
+        ValidateTypeAndShapeForScaleAndZP(ctx, 4, data_type->tensor_type().elem_type(), true);
+
+        int64_t keep_dims = 1;
+        auto attr_proto = ctx.getAttribute("keepdims");
+        if (attr_proto) {
+          keep_dims = attr_proto->i();
+        }
+
+        auto& input_shape = ctx.getInputType(0)->tensor_type().shape();
+        int64_t input_ndim = input_shape.dim_size();
+        auto output_shape =
+            ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape();
+        std::vector<int64_t> axes;
+        auto axes_proto = ctx.getAttribute("axes");
+        if (axes_proto)
+          axes.assign(axes_proto->ints().begin(), axes_proto->ints().end());
+
+        for (size_t i = 0; i < axes.size(); ++i) {
+          if (axes[i] < -input_ndim || axes[i] >= input_ndim) {
+            fail_shape_inference(
+                "axis must be in [-rank, rank-1]. input rank was ", input_ndim);
+          }
+          if (axes[i] < 0)
+            axes[i] += input_ndim;
+        }
+        // do we need to handle negative axis?
+        for (int i = 0; i < input_ndim; ++i) {
+          // axes empty means reduce all dim
+          if (!axes.empty() &&
+              std::find(axes.begin(), axes.end(), i) == axes.end()) {
+            auto dim = output_shape->add_dim();
+            dim->CopyFrom(input_shape.dim(i));
+          } else {
+            if (keep_dims == 1) {
+              auto dim = output_shape->add_dim();
+              dim->set_dim_value(1);
+            }
+          }
+        }
+      });
+
+  ONNX_CONTRIB_OPERATOR_SCHEMA(MulInteger)
+      .SetDomain(kMSDomain)
+      .SinceVersion(1)
+      .SetDoc(R"DOC(Performs element-wise binary quantized multiplication (with Numpy-style broadcasting support).
+"This operator supports **multidirectional (i.e., Numpy-style) broadcasting**"
+The output of this op is the int32 accumulated result of the mul operation
+
+```
+C (int32) = (A - A_zero_point) * (B - B_zero_point)
+```
+
+)DOC")
+      .Input(0, "A", "First operand.", "T")
+      .Input(
+          1,
+          "A_zero_point",
+          "Input A zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
+          "T",
+          OpSchema::Optional)
+      .Input(2, "B", "Second operand.", "T")
+      .Input(
+          3,
+          "B_zero_point",
+          "Input B zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
+          "T",
+          OpSchema::Optional)
+      .Output(0, "C", "Constrain output to 32 bit tensor", "T1")
+      .TypeConstraint("T", {"tensor(uint8)", "tensor(int8)"}, "Constrain input types to 8 bit signed and unsigned tensors.")
+      .TypeConstraint("T1", {"tensor(int32)"}, "Constrain output types to 32 bit tensors.")
+      .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
+        auto c_type = ctx.getOutputType(0);
+        c_type->mutable_tensor_type()->set_elem_type(
+            ONNX_NAMESPACE::TensorProto::INT32);
+
+        auto a_type = ctx.getInputType(0);
+        auto b_type = ctx.getInputType(3);
+        if (nullptr == a_type || nullptr == b_type ||
+            a_type->value_case() != ONNX_NAMESPACE::TypeProto::kTensorType ||
+            b_type->value_case() != ONNX_NAMESPACE::TypeProto::kTensorType) {
+          fail_type_inference("inputs are expected to have tensor type.");
+        }
+
+        ValidateTypeAndShapeForScaleAndZP(ctx, 1, a_type->tensor_type().elem_type(), true);
+        ValidateTypeAndShapeForScaleAndZP(ctx, 3, b_type->tensor_type().elem_type(), true);
+
+        if (hasInputShape(ctx, 0) && hasInputShape(ctx, 2)) {
+          bidirectionalBroadcastShapeInference(
+              ctx.getInputType(0)->tensor_type().shape(),
+              ctx.getInputType(2)->tensor_type().shape(),
+              *ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape());
+        }
+      });
+
+  const char* QLinearAveragePoolDoc_ver1 = R"DOC(
+ QLinearAveragePool consumes an input tensor X and applies average pooling across
+ the tensor according to kernel sizes, stride sizes, and pad lengths.
+ average pooling consisting of computing the average on all values of a
+ subset of the input tensor according to the kernel size and downsampling the
+ data into the output tensor Y for further processing. The output spatial shape will be following:
+ ```
+ output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - kernel_spatial_shape[i]) / strides_spatial_shape[i] + 1)
+ ```
+ or
+ ```
+ output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - kernel_spatial_shape[i]) / strides_spatial_shape[i] + 1)
+ ```
+ if ceil_mode is enabled
+
+ ```
+ * pad_shape[i] is sum of pads along axis i
+ ```
+
+ `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following:
+ ```
+ VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - kernel_spatial_shape[i] + 1) / strides_spatial_shape[i])
+ SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i])
+ ```
+ And pad shape will be following if `SAME_UPPER` or `SAME_LOWER`:
+ ```
+ pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + kernel_spatial_shape[i] - input_spatial_shape[i]
+ ```
+ 
+The output of each pooling window is divided by the number of elements (exclude pad when attribute count_include_pad is zero).
+
+Input and output scales and zero points are used to convert the output to a new quantization range.
+Output = Dequantize(Input) -> AveragePool on fp32 data -> Quantize(output)
+)DOC";
+
+  ONNX_CONTRIB_OPERATOR_SCHEMA(QLinearAveragePool)
+      .SetDomain(kMSDomain)
+      .SinceVersion(1)
+      .SetDoc(QLinearAveragePoolDoc_ver1)
+      .Attr(
+          "count_include_pad",
+          "Whether include pad pixels when calculating values for the edges. Default is 0, doesn't count include pad.",
+          AttributeProto::INT,
+          static_cast<int64_t>(0))
+      .Attr(
+          "kernel_shape",
+          "The size of the kernel along each axis.",
+          AttributeProto::INTS)
+      .Attr(
+          "strides",
+          "Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.",
+          AttributeProto::INTS,
+          OPTIONAL)
+      .Attr(
+          "auto_pad",
+          contrib_ops_auto_pad_doc,
+          AttributeProto::STRING,
+          std::string("NOTSET"))
+      .Attr("pads", contrib_ops_pads_doc, AttributeProto::INTS, OPTIONAL)
+      .Attr(
+          "ceil_mode",
+          "Whether to use ceil or floor (default) to compute the output shape.",
+          AttributeProto::INT,
+          static_cast<int64_t>(0))
+      .Input(
+          0,
+          "X",
+          "Input data tensor from the previous operator; "
+          "dimensions for image case are (N x C x H x W), "
+          "where N is the batch size, C is the number of "
+          "channels, and H and W are the height and the "
+          "width of the data. For non image case, the "
+          "dimensions are in the form of "
+          "(N x C x D1 x D2 ... Dn), where N is the batch "
+          "size. Optionally, if dimension denotation is "
+          "in effect, the operation expects the input "
+          "data tensor to arrive with the dimension denotation "
+          "of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].",
+          "T")
+      .Input(
+          1,
+          "x_scale",
+          "Input scale. It's a scalar, which means a per-tensor/layer quantization.",
+          "tensor(float)")
+      .Input(
+          2,
+          "x_zero_point",
+          "Input zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
+          "T",
+          OpSchema::Optional)
+      .Input(
+          3,
+          "y_scale",
+          "Output scale. It's a scalar, which means a per-tensor/layer quantization.",
+          "tensor(float)")
+      .Input(
+          4,
+          "y_zero_point",
+          "Output zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
+          "T",
+          OpSchema::Optional)
+      .Output(
+          0,
+          "Y",
+          "Output data tensor from average or max pooling across "
+          "the input tensor. Dimensions will vary based "
+          "on various kernel, stride, and pad sizes. Floor value of "
+          "the dimension is used",
+          "T")
+      .TypeConstraint(
+          "T",
+          {"tensor(uint8)", "tensor(int8)"},
+          "Constrain input and output types to 8 bit tensors.")
+      .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
+        ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 0, 0);
+
+        auto data_type = ctx.getInputType(0);
+        if (nullptr == data_type || data_type->value_case() != ONNX_NAMESPACE::TypeProto::kTensorType) {
+          fail_type_inference("inputs are expected to have tensor type.");
+        }
+
+        // validate scale and zero points
+        ValidateTypeAndShapeForScaleAndZP(ctx, 1, ONNX_NAMESPACE::TensorProto::FLOAT, true);
+        ValidateTypeAndShapeForScaleAndZP(ctx, 2, data_type->tensor_type().elem_type(), true);
+        ValidateTypeAndShapeForScaleAndZP(ctx, 3, ONNX_NAMESPACE::TensorProto::FLOAT, true);
+        ValidateTypeAndShapeForScaleAndZP(ctx, 4, data_type->tensor_type().elem_type(), true);
+
+        ONNX_NAMESPACE::convPoolShapeInference(ctx, false, true, 0, 5);
+      });
+
  ONNX_CONTRIB_OPERATOR_SCHEMA(MurmurHash3)
      .SetDomain(kMSDomain)
      .SinceVersion(1)