diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
index 422c1aad31..ccfe61344b 100644
--- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
+++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
@@ -162,7 +162,6 @@ void convTransposeWithDynamicPadsShapeInference(InferenceContext& ctx) {
     return;
   }
 }
-
 }  // namespace ONNX_NAMESPACE
 
 namespace onnxruntime {
@@ -202,6 +201,151 @@ void NchwcGlobalPoolOpSchemaGenerator(OpSchema& schema) {
   });
 }
 
+// Checks the scale or zero-point input at `index`: its element type and, when its
+// shape is known, its rank (plus its length for rank-1 inputs).
+//
+// ctx                - inference context of the node being validated.
+// index              - position of the scale / zero-point input.
+// expectedType       - TensorProto element type the input has to carry.
+// isScalar           - true: the input must be rank 0; false: it must be rank 1.
+// expectedTensorSize - required length of a rank-1 input whose dim value is known.
+//
+// Violations are reported through fail_type_inference. The type check only runs
+// when the node has more than `index` inputs.
+void ValidateTypeAndShapeForScaleAndZP(ONNX_NAMESPACE::InferenceContext& ctx, int index, ::google::protobuf::int32 expectedType, bool isScalar, int expectedTensorSize = 0) {
+  if (ctx.getNumInputs() > static_cast<size_t>(index)) {
+    auto data_type = ctx.getInputType(index);
+    if (nullptr == data_type) {
+      // No type information at all: report it without reading through data_type,
+      // which the detailed message below has to dereference.
+      fail_type_inference("Input data type does not match the expected data type.");
+    } else if (data_type->value_case() != ONNX_NAMESPACE::TypeProto::kTensorType ||
+               data_type->tensor_type().elem_type() != expectedType) {
+      fail_type_inference(
+          "Input data type does not match the expected data type. Current data type is ", data_type->tensor_type().elem_type());
+    }
+  }
+
+  if (hasInputShape(ctx, index)) {
+    const auto& shape = ctx.getInputType(index)->tensor_type().shape();
+    if (isScalar) {
+      if (shape.dim_size() != 0) {
+        fail_type_inference("Scale and Zero-point must be a scalar");
+      }
+    } else {
+      if (shape.dim_size() != 1) {
+        fail_type_inference("Scale and Zero-point must be of rank 1");
+      }
+
+      if (shape.dim(0).has_dim_value() && shape.dim(0).dim_value() != expectedTensorSize) {
+        fail_type_inference(
+            "Scale and Zero-point must be of rank 1 and the number of elements should be equal to the number of rows of the corresponding input.");
+      }
+    }
+  }
+}
+
+// Builds the schema filler shared by the element-wise binary QLinear operators
+// (it is used for QLinearAdd and QLinearMul).
+//
+// name                    - text substituted for {name} in the operator doc.
+// additionalDocumentation - text substituted for {additionalDocumentation}.
+//
+// The returned callable sets the doc string, the eight inputs, the output, the type
+// constraint and the type/shape inference function on the schema it is given.
+std::function<void(OpSchema&)> QLinearMathDocGenerator(const char* name, const char* additionalDocumentation) {
+  return [=](OpSchema& schema) {
+    std::string doc = R"DOC(
+Performs element-wise binary {name} on 8 bit data types (with Numpy-style broadcasting support).
+
+{additionalDocumentation}
+)DOC";
+    ONNX_NAMESPACE::ReplaceAll(doc, "{name}", name);
+    ONNX_NAMESPACE::ReplaceAll(doc, "{additionalDocumentation}", additionalDocumentation);
+    schema.SetDoc(doc);
+    schema.Input(0, "A", "First operand.", "T");
+    schema.Input(
+        1,
+        "A_scale",
+        "Input A's scale. It's a scalar, which means a per-tensor/layer quantization.",
+        "tensor(float)");
+    schema.Input(
+        2,
+        "A_zero_point",
+        "Input A zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
+        "T",
+        OpSchema::Optional);
+    schema.Input(3, "B", "Second operand.", "T");
+    schema.Input(
+        4,
+        "B_scale",
+        "Input B's scale. It's a scalar, which means a per-tensor/layer quantization.",
+        "tensor(float)");
+    schema.Input(
+        5,
+        "B_zero_point",
+        "Input B zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
+        "T",
+        OpSchema::Optional);
+    schema.Input(
+        6,
+        "C_scale",
+        "Output scale. It's a scalar, which means a per-tensor/layer quantization.",
+        "tensor(float)");
+    schema.Input(
+        7,
+        "C_zero_point",
+        "Output zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
+        "T",
+        OpSchema::Optional);
+    schema.Output(0, "C", "Result, has same element type as two inputs", "T");
+    schema.TypeConstraint("T", {"tensor(uint8)", "tensor(int8)"}, "Constrain input and output types to 8 bit signed and unsigned tensors.");
+    schema.TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
+      propagateElemTypeFromInputToOutput(ctx, 0, 0);
+
+      auto a_type = ctx.getInputType(0);  // operand A
+      auto b_type = ctx.getInputType(3);  // operand B
+
+      if (nullptr == a_type || nullptr == b_type ||
+          a_type->value_case() != ONNX_NAMESPACE::TypeProto::kTensorType ||
+          b_type->value_case() != ONNX_NAMESPACE::TypeProto::kTensorType) {
+        fail_type_inference("inputs are expected to have tensor type.");
+      }
+
+      // validate scale and zero points
+      ValidateTypeAndShapeForScaleAndZP(ctx, 1, ONNX_NAMESPACE::TensorProto::FLOAT, true);
+      ValidateTypeAndShapeForScaleAndZP(ctx, 2, a_type->tensor_type().elem_type(), true);
+      ValidateTypeAndShapeForScaleAndZP(ctx, 4, ONNX_NAMESPACE::TensorProto::FLOAT, true);
+      ValidateTypeAndShapeForScaleAndZP(ctx, 5, b_type->tensor_type().elem_type(), true);
+      ValidateTypeAndShapeForScaleAndZP(ctx, 6, ONNX_NAMESPACE::TensorProto::FLOAT, true);
+      ValidateTypeAndShapeForScaleAndZP(ctx, 7, a_type->tensor_type().elem_type(), true);  // checked against A's type
+
+      if (hasInputShape(ctx, 0) && hasInputShape(ctx, 3))
+        bidirectionalBroadcastShapeInference(
+            ctx.getInputType(0)->tensor_type().shape(),
+            ctx.getInputType(3)->tensor_type().shape(),
+            *ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape());
+    });
+  };
+}
+
+// Description text for the `pads` attribute (used by QLinearAveragePool).
+const char* contrib_ops_pads_doc =
+    "Padding for the beginning and ending along each spatial axis, it can take any value greater "
+    "than or equal to 0. The value represent the number of pixels added to the beginning "
+    "and end part of the corresponding axis. `pads` format should be as follow "
+    "[x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels "
+    "added at the beginning of axis `i` and xi_end, the number of pixels added at "
+    "the end of axis `i`. This attribute cannot be used simultaneously with "
+    "auto_pad attribute. If not present, the padding defaults to 0 along start and end of each spatial axis.";
+// Description text for the `auto_pad` attribute (used by QLinearAveragePool).
+const char* contrib_ops_auto_pad_doc =
+    "auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where "
+    "default value is NOTSET, which means explicit padding is used. "
+    "SAME_UPPER or SAME_LOWER mean pad the input so that the output spatial size match the input. "
+    "In case of odd number add the extra padding at the end for SAME_UPPER and at the "
+    "beginning for SAME_LOWER. VALID mean no padding.";
+
 void RegisterNchwcSchemas() {
   ONNX_CONTRIB_OPERATOR_SCHEMA(ReorderInput)
       .SetDomain(kMSNchwcDomain)
@@ -1541,6 +1685,316 @@ with the exception that numpy default keepdims to False instead of True.)DOC")
           "Keep the reduced dimension or not, default 1 mean keep reduced dimension.",
           AttributeProto::INT);
 
+  // Quantized element-wise addition; the schema body is filled in by QLinearMathDocGenerator.
+  ONNX_CONTRIB_OPERATOR_SCHEMA(QLinearAdd)
+      .SetDomain(kMSDomain)
+      .SinceVersion(1)
+      .FillUsing(QLinearMathDocGenerator("addition",
+                                         "C = (A_scale * (A - A_zero_point) + B_scale * (B - B_zero_point))/C_scale + C_zero_point"));
+
+  // Quantized element-wise multiplication; the schema body is filled in by QLinearMathDocGenerator.
+  ONNX_CONTRIB_OPERATOR_SCHEMA(QLinearMul)
+      .SetDomain(kMSDomain)
+      .SinceVersion(1)
+      .FillUsing(QLinearMathDocGenerator("multiplication",
+                                         "C = ((A - A_zero_point) * (B - B_zero_point)) * (A_scale * B_scale)/C_scale + C_zero_point"));
+
+  // Quantized ReduceMean over the `axes` attribute, with separate input and output scale / zero point.
+  ONNX_CONTRIB_OPERATOR_SCHEMA(QLinearReduceMean)
+      .SetDomain(kMSDomain)
+      .SinceVersion(1)
+      .SetDoc(R"DOC(
+Computes the mean of the low-precision input tensor's element along the provided axes.
+The resulting tensor has the same rank as the input if keepdims equal 1. If keepdims equal 0,
+then the resulting tensor have the reduced dimension pruned. The above behavior is similar to numpy,
+with the exception that numpy default keepdims to False instead of True.
+Input and Output scales and zero points are used to requantize the output in a new range.
+This helps to improve accuracy as after ReduceMean operation the range of the output is expected to decrease.
+
+```
+"Output = Dequantize(Input) -> ReduceMean on fp32 data -> Quantize(output)",
+
+```
+)DOC")
+      .Input(0, "data", "An input tensor.", "T")
+      .Input(
+          1,
+          "data_scale",
+          "Input scale. It's a scalar, which means a per-tensor/layer quantization.",
+          "tensor(float)")
+      .Input(
+          2,
+          "data_zero_point",
+          "Input zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
+          "T",
+          OpSchema::Optional)
+      .Input(
+          3,
+          "reduced_scale",
+          "Output scale. It's a scalar, which means a per-tensor/layer quantization.",
+          "tensor(float)")
+      .Input(
+          4,
+          "reduced_zero_point",
+          "Output zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
+          "T",
+          OpSchema::Optional)
+      .Output(0, "reduced", "Reduced output tensor.", "T")
+      .TypeConstraint("T", {"tensor(uint8)", "tensor(int8)"}, "Constrain input types to 8 bit signed and unsigned tensors.")
+      .Attr(
+          "axes",
+          "A list of integers, along which to reduce. The default is to reduce over all the dimensions of the input tensor.",
+          AttributeProto::INTS)
+      .Attr(
+          "keepdims",
+          "Keep the reduced dimension or not, default 1 mean keep reduced dimension.",
+          AttributeProto::INT)
+      .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
+        propagateElemTypeFromInputToOutput(ctx, 0, 0);
+
+        // The output shape is derived from the input shape, so stop here when it is unknown.
+        if (!hasNInputShapes(ctx, 1)) {
+          return;
+        }
+
+        auto data_type = ctx.getInputType(0);
+        if (nullptr == data_type || data_type->value_case() != ONNX_NAMESPACE::TypeProto::kTensorType) {
+          fail_type_inference("inputs are expected to have tensor type.");
+        }
+
+        // validate scale and zero points
+        ValidateTypeAndShapeForScaleAndZP(ctx, 1, ONNX_NAMESPACE::TensorProto::FLOAT, true);
+        ValidateTypeAndShapeForScaleAndZP(ctx, 2, data_type->tensor_type().elem_type(), true);
+        ValidateTypeAndShapeForScaleAndZP(ctx, 3, ONNX_NAMESPACE::TensorProto::FLOAT, true);
+        ValidateTypeAndShapeForScaleAndZP(ctx, 4, data_type->tensor_type().elem_type(), true);
+
+        int64_t keep_dims = 1;
+        auto attr_proto = ctx.getAttribute("keepdims");
+        if (attr_proto) {
+          keep_dims = attr_proto->i();
+        }
+
+        auto& input_shape = ctx.getInputType(0)->tensor_type().shape();
+        int64_t input_ndim = input_shape.dim_size();
+        auto output_shape =
+            ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape();
+        std::vector<int64_t> axes;
+        auto axes_proto = ctx.getAttribute("axes");
+        if (axes_proto)
+          axes.assign(axes_proto->ints().begin(), axes_proto->ints().end());
+
+        // Reject out-of-range axes and map negative ones onto [0, rank).
+        for (size_t i = 0; i < axes.size(); ++i) {
+          if (axes[i] < -input_ndim || axes[i] >= input_ndim) {
+            fail_shape_inference(
+                "axis must be in [-rank, rank-1]. input rank was ", input_ndim);
+          }
+          if (axes[i] < 0)
+            axes[i] += input_ndim;
+        }
+        // Negative axes were normalized above, so a plain index comparison is enough here.
+        for (int i = 0; i < input_ndim; ++i) {
+          // axes empty means reduce all dim
+          if (!axes.empty() &&
+              std::find(axes.begin(), axes.end(), i) == axes.end()) {
+            auto dim = output_shape->add_dim();
+            dim->CopyFrom(input_shape.dim(i));
+          } else {
+            if (keep_dims == 1) {
+              auto dim = output_shape->add_dim();
+              dim->set_dim_value(1);
+            }
+          }
+        }
+      });
+
+  // Element-wise multiplication of zero-point-adjusted 8 bit inputs with an int32 output.
+  ONNX_CONTRIB_OPERATOR_SCHEMA(MulInteger)
+      .SetDomain(kMSDomain)
+      .SinceVersion(1)
+      .SetDoc(R"DOC(Performs element-wise binary quantized multiplication (with Numpy-style broadcasting support).
+"This operator supports **multidirectional (i.e., Numpy-style) broadcasting**"
+The output of this op is the int32 accumulated result of the mul operation
+
+```
+C (int32) = (A - A_zero_point) * (B - B_zero_point)
+```
+
+)DOC")
+      .Input(0, "A", "First operand.", "T")
+      .Input(
+          1,
+          "A_zero_point",
+          "Input A zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
+          "T",
+          OpSchema::Optional)
+      .Input(2, "B", "Second operand.", "T")
+      .Input(
+          3,
+          "B_zero_point",
+          "Input B zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
+          "T",
+          OpSchema::Optional)
+      .Output(0, "C", "Constrain output to 32 bit tensor", "T1")
+      .TypeConstraint("T", {"tensor(uint8)", "tensor(int8)"}, "Constrain input types to 8 bit signed and unsigned tensors.")
+      .TypeConstraint("T1", {"tensor(int32)"}, "Constrain output types to 32 bit tensors.")
+      .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
+        auto c_type = ctx.getOutputType(0);
+        c_type->mutable_tensor_type()->set_elem_type(
+            ONNX_NAMESPACE::TensorProto::INT32);
+
+        // Operands are inputs 0 (A) and 2 (B); inputs 1 and 3 are their optional zero points.
+        auto a_type = ctx.getInputType(0);
+        auto b_type = ctx.getInputType(2);
+        if (nullptr == a_type || nullptr == b_type ||
+            a_type->value_case() != ONNX_NAMESPACE::TypeProto::kTensorType ||
+            b_type->value_case() != ONNX_NAMESPACE::TypeProto::kTensorType) {
+          fail_type_inference("inputs are expected to have tensor type.");
+        }
+
+        ValidateTypeAndShapeForScaleAndZP(ctx, 1, a_type->tensor_type().elem_type(), true);
+        ValidateTypeAndShapeForScaleAndZP(ctx, 3, b_type->tensor_type().elem_type(), true);
+
+        if (hasInputShape(ctx, 0) && hasInputShape(ctx, 2)) {
+          bidirectionalBroadcastShapeInference(
+              ctx.getInputType(0)->tensor_type().shape(),
+              ctx.getInputType(2)->tensor_type().shape(),
+              *ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape());
+        }
+      });
+
+  // Documentation string for the QLinearAveragePool schema registered below.
+  const char* QLinearAveragePoolDoc_ver1 = R"DOC(
+ QLinearAveragePool consumes an input tensor X and applies average pooling across
+ the tensor according to kernel sizes, stride sizes, and pad lengths.
+ average pooling consisting of computing the average on all values of a
+ subset of the input tensor according to the kernel size and downsampling the
+ data into the output tensor Y for further processing. The output spatial shape will be following:
+ ```
+ output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - kernel_spatial_shape[i]) / strides_spatial_shape[i] + 1)
+ ```
+ or
+ ```
+ output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - kernel_spatial_shape[i]) / strides_spatial_shape[i] + 1)
+ ```
+ if ceil_mode is enabled
+
+ ```
+ * pad_shape[i] is sum of pads along axis i
+ ```
+
+ `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following:
+ ```
+ VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - kernel_spatial_shape[i] + 1) / strides_spatial_shape[i])
+ SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i])
+ ```
+ And pad shape will be following if `SAME_UPPER` or `SAME_LOWER`:
+ ```
+ pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + kernel_spatial_shape[i] - input_spatial_shape[i]
+ ```
+
+The output of each pooling window is divided by the number of elements (exclude pad when attribute count_include_pad is zero).
+
+Input and output scales and zero points are used to convert the output to a new quantization range.
+Output = Dequantize(Input) -> AveragePool on fp32 data -> Quantize(output)
+)DOC";
+
+  // Quantized average pooling; the output shape is inferred by convPoolShapeInference.
+  ONNX_CONTRIB_OPERATOR_SCHEMA(QLinearAveragePool)
+      .SetDomain(kMSDomain)
+      .SinceVersion(1)
+      .SetDoc(QLinearAveragePoolDoc_ver1)
+      .Attr(
+          "count_include_pad",
+          "Whether include pad pixels when calculating values for the edges. Default is 0, doesn't count include pad.",
+          AttributeProto::INT,
+          static_cast<int64_t>(0))
+      .Attr(
+          "kernel_shape",
+          "The size of the kernel along each axis.",
+          AttributeProto::INTS)
+      .Attr(
+          "strides",
+          "Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.",
+          AttributeProto::INTS,
+          OPTIONAL)
+      .Attr(
+          "auto_pad",
+          contrib_ops_auto_pad_doc,
+          AttributeProto::STRING,
+          std::string("NOTSET"))
+      .Attr("pads", contrib_ops_pads_doc, AttributeProto::INTS, OPTIONAL)
+      .Attr(
+          "ceil_mode",
+          "Whether to use ceil or floor (default) to compute the output shape.",
+          AttributeProto::INT,
+          static_cast<int64_t>(0))
+      .Input(
+          0,
+          "X",
+          "Input data tensor from the previous operator; "
+          "dimensions for image case are (N x C x H x W), "
+          "where N is the batch size, C is the number of "
+          "channels, and H and W are the height and the "
+          "width of the data. For non image case, the "
+          "dimensions are in the form of "
+          "(N x C x D1 x D2 ... Dn), where N is the batch "
+          "size. Optionally, if dimension denotation is "
+          "in effect, the operation expects the input "
+          "data tensor to arrive with the dimension denotation "
+          "of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].",
+          "T")
+      .Input(
+          1,
+          "x_scale",
+          "Input scale. It's a scalar, which means a per-tensor/layer quantization.",
+          "tensor(float)")
+      .Input(
+          2,
+          "x_zero_point",
+          "Input zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
+          "T",
+          OpSchema::Optional)
+      .Input(
+          3,
+          "y_scale",
+          "Output scale. It's a scalar, which means a per-tensor/layer quantization.",
+          "tensor(float)")
+      .Input(
+          4,
+          "y_zero_point",
+          "Output zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
+          "T",
+          OpSchema::Optional)
+      .Output(
+          0,
+          "Y",
+          "Output data tensor from average or max pooling across "
+          "the input tensor. Dimensions will vary based "
+          "on various kernel, stride, and pad sizes. Floor value of "
+          "the dimension is used",
+          "T")
+      .TypeConstraint(
+          "T",
+          {"tensor(uint8)", "tensor(int8)"},
+          "Constrain input and output types to 8 bit tensors.")
+      .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
+        ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 0, 0);
+
+        auto data_type = ctx.getInputType(0);
+        if (nullptr == data_type || data_type->value_case() != ONNX_NAMESPACE::TypeProto::kTensorType) {
+          fail_type_inference("inputs are expected to have tensor type.");
+        }
+
+        // validate scale and zero points
+        ValidateTypeAndShapeForScaleAndZP(ctx, 1, ONNX_NAMESPACE::TensorProto::FLOAT, true);
+        ValidateTypeAndShapeForScaleAndZP(ctx, 2, data_type->tensor_type().elem_type(), true);
+        ValidateTypeAndShapeForScaleAndZP(ctx, 3, ONNX_NAMESPACE::TensorProto::FLOAT, true);
+        ValidateTypeAndShapeForScaleAndZP(ctx, 4, data_type->tensor_type().elem_type(), true);
+
+        ONNX_NAMESPACE::convPoolShapeInference(ctx, false, true, 0, 5);
+      });
+
   ONNX_CONTRIB_OPERATOR_SCHEMA(MurmurHash3)
       .SetDomain(kMSDomain)
       .SinceVersion(1)