From 64e991a9fc044004859913a7e5d0940e93f703ee Mon Sep 17 00:00:00 2001 From: Cheng Date: Wed, 10 Aug 2022 10:52:02 +0800 Subject: [PATCH] [Qlinearsoftmax] contrib cpu (#12177) * [Qlinearsoftmax] contrib cpu * int8 implementation * contrib operator md * qdq transformer test * new attribute: opset * doc * quantized tool * remove template to reduce Binary size * doc of contribe operators * enforce x_shape is valid * fix reduce_size if input-shape is dynamic * add UT * register one op for reducing binarysize * kernel hash update * docs/ContribOperators.md --- docs/ContribOperators.md | 55 ++- docs/OperatorKernels.md | 1 + .../onnxruntime/core/framework/op_kernel.h | 3 + .../contrib_ops/cpu/cpu_contrib_kernels.cc | 2 + .../cpu/quantization/qlinear_softmax.cc | 343 ++++++++++++++++++ .../cpu/quantization/qlinear_softmax.h | 34 ++ onnxruntime/core/graph/contrib_ops/ms_opset.h | 2 + .../graph/contrib_ops/quantization_defs.cc | 58 ++- .../selectors_actions/qdq_actions.cc | 11 +- .../selectors_actions/qdq_actions.h | 3 + .../qdq_selector_action_transformer.cc | 3 +- .../tools/quantization/onnx_quantizer.py | 14 + .../tools/quantization/operators/softmax.py | 86 +++++ .../tools/quantization/qdq_quantizer.py | 5 +- .../python/tools/quantization/registry.py | 3 + .../contrib_ops/qlinear_lookup_table_test.cc | 148 ++++++++ .../test/optimizer/qdq_transformer_test.cc | 61 ++++ .../test/python/quantization/op_test_utils.py | 2 + .../python/quantization/test_op_softmax.py | 180 +++++++++ .../kernel_def_hashes/contrib.cpu.json | 6 +- 20 files changed, 1014 insertions(+), 6 deletions(-) create mode 100644 onnxruntime/contrib_ops/cpu/quantization/qlinear_softmax.cc create mode 100644 onnxruntime/contrib_ops/cpu/quantization/qlinear_softmax.h create mode 100644 onnxruntime/python/tools/quantization/operators/softmax.py create mode 100644 onnxruntime/test/python/quantization/test_op_softmax.py diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md index 714a265a52..2dca2eca8b 
100644 --- a/docs/ContribOperators.md +++ b/docs/ContribOperators.md @@ -55,6 +55,7 @@ Do not modify directly.* * com.microsoft.QLinearMul * com.microsoft.QLinearReduceMean * com.microsoft.QLinearSigmoid + * com.microsoft.QLinearSoftmax * com.microsoft.QuantizeLinear * com.microsoft.Range * com.microsoft.ReduceSumInteger @@ -2771,7 +2772,7 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.QLinearSigmoid** - QLinearSigmoid takes quantized input data (Tensor), and quantize parameter for output, and produces one output data + QLinearSigmoid takes quantized input data (Tensor), and quantize parameter for output, and produces one output data (Tensor) where the function `f(x) = quantize(Sigmoid(dequantize(x)))`, is applied to the data tensor elementwise. Wwhere the function `Sigmoid(x) = 1 / (1 + exp(-x))` @@ -2809,6 +2810,58 @@ This version of the operator has been available since version 1 of the 'com.micr +### **com.microsoft.QLinearSoftmax** + + QLinearSoftmax computes the normalized exponential values for the given input: + Softmax(input, axis) = Exp(input) / ReduceSum(Exp(input), axis=axis, keepdims=1) + The input does not need to explicitly be a 2D vector. The "axis" attribute + indicates the dimension along which QLinearSoftmax will be performed for onnx v.13+. + or the dimension coerced to NxD Matrix for onnx v.12-. + The output tensor has the same shape. + +#### Version + +This version of the operator has been available since version 1 of the 'com.microsoft' operator set. + +#### Attributes + +
+
axis : int
+
Apply softmax along dimension axis (corresponding Softmax opset 13 and above), or over all dimensions from axis onward (opset 12 and below), according to the op-version.
+
opset : int (required)
+
Opset version of the corresponding Softmax operator.
+
+ +#### Inputs + +
+
X : T
+
The input tensor
+
X_scale : tensor(float)
+
Scale of quantized input 'X'. It must be a scalar.
+
x_zero_point (optional) : T
+
Zero point tensor for input 'X'. It must be a scalar.
+
y_scale : tensor(float)
+
Scale of quantized output 'Y'. It must be a scalar.
+
y_zero_point : T
+
Zero point tensor for output 'Y'. It must be a scalar.
+
+ +#### Outputs + +
+
Y : T
+
Output data tensor holding the quantized softmax of the input tensor. The output tensor has the same shape as the input.
+
+ +#### Type Constraints + +
+
T : tensor(uint8), tensor(int8)
+
Constrain input and output types to signed/unsigned int8 tensors.
+
+ + ### **com.microsoft.QuantizeLinear** The linear quantization operator. It consumes a full precision data, a scale, a zero point to compute the low precision / quantized tensor. diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index e1b5253548..f50b920234 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -430,6 +430,7 @@ Do not modify directly.* |QLinearLeakyRelu|*in* X:**T**
*in* X_scale:**tensor(float)**
*in* X_zero_point:**T**
*in* Y_scale:**tensor(float)**
*in* Y_zero_point:**T**
*out* Y:**T**|1+|**T** = tensor(int8), tensor(uint8)| |QLinearMul|*in* A:**T**
*in* A_scale:**tensor(float)**
*in* A_zero_point:**T**
*in* B:**T**
*in* B_scale:**tensor(float)**
*in* B_zero_point:**T**
*in* C_scale:**tensor(float)**
*in* C_zero_point:**T**
*out* C:**T**|1+|**T** = tensor(int8), tensor(uint8)| |QLinearSigmoid|*in* X:**T**
*in* X_scale:**tensor(float)**
*in* X_zero_point:**T**
*in* Y_scale:**tensor(float)**
*in* Y_zero_point:**T**
*out* Y:**T**|1+|**T** = tensor(int8), tensor(uint8)| +|QLinearSoftmax|*in* X:**T**
*in* X_scale:**tensor(float)**
*in* x_zero_point:**T**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T**
*out* Y:**T**|1+|**T** = tensor(int8), tensor(uint8)| |QuantizeLinear|*in* x:**T1**
*in* y_scale:**T1**
*in* y_zero_point:**T2**
*out* y:**T2**|1+|**T1** = tensor(float)
**T2** = tensor(int8), tensor(uint8)| |Range|*in* start:**T**
*in* limit:**T**
*in* delta:**T**
*out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(int16), tensor(int32), tensor(int64)| |SampleOp|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(float)| diff --git a/include/onnxruntime/core/framework/op_kernel.h b/include/onnxruntime/core/framework/op_kernel.h index 34ed1b4863..0f5425315b 100644 --- a/include/onnxruntime/core/framework/op_kernel.h +++ b/include/onnxruntime/core/framework/op_kernel.h @@ -193,6 +193,9 @@ using BuildKernelCreateInfoFn = KernelCreateInfo (*)(); #define ONNX_CPU_OPERATOR_ML_KERNEL(name, ver, builder, ...) \ ONNX_OPERATOR_KERNEL_EX(name, kMLDomain, ver, kCpuExecutionProvider, builder, __VA_ARGS__) +#define ONNX_CPU_OPERATOR_MS_KERNEL(name, ver, builder, ...) \ + ONNX_OPERATOR_KERNEL_EX(name, kMSDomain, ver, kCpuExecutionProvider, builder, __VA_ARGS__) + #define ONNX_OPERATOR_KERNEL_EX(name, domain, ver, provider, builder, ...) \ class ONNX_OPERATOR_KERNEL_CLASS_NAME(provider, domain, ver, name); \ template <> \ diff --git a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc index 04fa2b9b9f..0de091a9a4 100644 --- a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc @@ -55,6 +55,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, QLinearLeakyRelu); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, QLinearSigmoid); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, QLinearSigmoid); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, QLinearSoftmax); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, QLinearAdd); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, QLinearAdd); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, QLinearMul); @@ -151,6 +152,7 @@ Status RegisterQuantizationKernels(KernelRegistry& 
kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/contrib_ops/cpu/quantization/qlinear_softmax.cc b/onnxruntime/contrib_ops/cpu/quantization/qlinear_softmax.cc new file mode 100644 index 0000000000..89cbe521a5 --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/quantization/qlinear_softmax.cc @@ -0,0 +1,343 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "contrib_ops/cpu/quantization/qlinear_softmax.h" + +#include +#include +#include + +#include "core/common/common.h" +#include "core/framework/tensorprotoutils.h" +#include "core/providers/common.h" +#include "core/providers/cpu/tensor/transpose.h" + +#include "core/mlas/inc/mlas.h" +#include "core/platform/threadpool.h" +#include "gsl/gsl-lite.hpp" + +namespace onnxruntime { +namespace contrib { + +constexpr int OPSET13 = 13; + +namespace { + +void QlinearBuildLookupTableUint32(gsl::span table, + const float x_scale, + size_t reduce_len, bool is_signed) { + const double qscale = + fmin(static_cast(UINT32_MAX) / static_cast(reduce_len), static_cast(0x7fffff)); + for (int32_t i = 0; i < 256; i++) { + double scaled_exp_xi = qscale * exp(static_cast(i - 255) * static_cast(x_scale)); + // we can't get the real max value of input tensor here, so we just assume 255. + // in the function of `QlinearSoftmaxCPU`, + // all numbers will have a shift (255-max_value) if its max value is not 255 + // + // if is_signed index = [1 2 3 ......126 127 -128 -127 ..... -3 -2 -1] + // else [0 1 2 3 4 ..... 256] + uint8_t index = static_cast(is_signed ? 
i - 128 : i); + table[index] = static_cast(lrint(scaled_exp_xi)); + } +} + +void BuildLookupTableIfFixed(const OpKernelInfo& info, std::vector& fixed_lookup_table, + size_t reduce_len, bool is_signed) { + const Tensor* tensor_x_scale = nullptr; + + bool get_x_scale = info.TryGetConstantInput(1, &tensor_x_scale); + ORT_ENFORCE(tensor_x_scale == nullptr || IsScalarOr1ElementVector(tensor_x_scale), + "QlinearBuildLookupTable : input X_scale must be a scalar or 1D tensor of size 1"); + bool is_fixed_parameters = get_x_scale; + + if (is_fixed_parameters) { + fixed_lookup_table.resize(256); + const float X_scale = *(tensor_x_scale->Data()); + QlinearBuildLookupTableUint32(fixed_lookup_table, X_scale, reduce_len, is_signed); + } +} +} // namespace + +QLinearSoftmax::QLinearSoftmax(const OpKernelInfo& info) + : OpKernel(info) { + const auto& node = info.node(); + auto input_defs = node.InputDefs(); + auto input_type = input_defs[0]->TypeAsProto()->tensor_type().elem_type(); + is_signed_ = (input_type == ONNX_NAMESPACE::TensorProto_DataType_INT8); + const auto* x_shape = input_defs[0]->Shape(); + ORT_ENFORCE(x_shape != nullptr && x_shape->dim_size() > 0, "input_shape of QLinearSoftmax must be existed"); + int rank = x_shape->dim_size(); + + int64_t opset = -1; + Status status = info.GetAttr("opset", &opset); + ORT_ENFORCE(status.IsOK(), "opset must be existed in attributes of QlinearSoftmax"); + opset_ = gsl::narrow_cast(opset); + + int64_t axis = -1; + status = info.GetAttr("axis", &axis); + if (status.IsOK()) { + axis_ = gsl::narrow_cast(axis); + } else { + // opset-12 and below, the default axis value is 1 + // opset-13, the default axis value is -1 + axis_ = opset_ < OPSET13 ? 1 : -1; + } + + axis_ = static_cast(HandleNegativeAxis(axis_, int64_t(rank))); + auto input_shape = utils::GetTensorShapeFromTensorShapeProto(*x_shape); + int64_t reduce_size = opset_ < OPSET13 ? 
input_shape.SizeFromDimension(axis_) : input_shape[axis_]; + // reduce_size could be negative if input-shape has a dynamic axis + if (reduce_size > 0) { + BuildLookupTableIfFixed(info, fixed_lookup_table_, reduce_size, is_signed_); + } +} + +// compute method of Softmax +Status QLinearSoftmax::Compute(OpKernelContext* ctx) const { + const auto* X = ctx->Input(0); + const auto& X_shape = X->Shape(); + auto* Y = ctx->Output(0, X_shape); + + // edge case. one or more dims with value of 0. nothing to do + if (X_shape.Size() == 0) { + return Status::OK(); + } + concurrency::ThreadPool* thread_pool = ctx->GetOperatorThreadPool(); + const size_t D = opset_ < OPSET13 ? X_shape.SizeFromDimension(axis_): X_shape[axis_]; + uint32_t tmp_lookup_table[256]; + gsl::span lookup_table = GetLookupTable(ctx, tmp_lookup_table, D); + + if (opset_ < OPSET13) { + return ComputeInternal(ctx, *X, *Y, lookup_table, axis_, thread_pool); + } else { + return ComputeImplOpset13(ctx, *X, *Y, lookup_table, thread_pool); + } +} + +template +common::Status QlinearSoftmaxCPU(size_t N, + size_t D, + const T* x_data, + T* y_data, + const uint32_t* lookup_table, + uint32_t y_scale, + T yzp, + onnxruntime::concurrency::ThreadPool* thread_pool); + +template <> +common::Status QlinearSoftmaxCPU(size_t N, + size_t D, + const uint8_t* x_data, + uint8_t* y_data, + const uint32_t* lookup_table, + uint32_t y_scale, + uint8_t yzp, + onnxruntime::concurrency::ThreadPool* thread_pool) { + using onnxruntime::TensorOpCost; + using onnxruntime::concurrency::ThreadPool; + ThreadPool::TryParallelFor( + thread_pool, N, + // Read 3*N (max,sum,div) write N (div), computation=Read + TensorOpCost{static_cast(D * 3), + static_cast(D), + static_cast(D * 3)}, + [x_data, y_data, D, y_scale, yzp, &lookup_table](std::ptrdiff_t first, std::ptrdiff_t last) { + const auto c_y_scale = y_scale; + const auto c_y_zp = yzp; + const uint8_t* x_t = x_data + first * D; + uint8_t* y_t = y_data + first * D; + for (; first < last; first++) { 
+ // reduceMaxUint8 + uint8_t xmax = *std::max_element(x_t, x_t + D); + // we want the xmas to align with 255 for higher precision. + // as we build a lookup table with X-255. So we could use the adjustment here + // to let all numbers have a shift in the lookup table. + // 1 2 3 4 5 ...........................254 255 + // 1 3 5 ... 10 + // after the shift ---> + // 235 237 239 .. 255 + const uint32_t* shifted_lookuptable = lookup_table + 255 - xmax; + size_t elements_n = D; + // reduceSumUin8ToUint32: need speedup + // vsum = \sum_i{e^x_i} + uint32_t vsum = 0; + const uint8_t* x_t_cur = x_t; + do { + const size_t vx = *x_t_cur++; + vsum += shifted_lookuptable[vx]; + } while (--elements_n != 0); + if (vsum == 0) { + return; + } + elements_n = D; + x_t_cur = x_t; + // elementwise div, y_i=\frac{x_i}{vsum} + const uint32_t vrounding = (vsum >> 1); + do { + const size_t vx = *x_t_cur++; + const uint32_t vt = shifted_lookuptable[vx]; + // simulate round function, and re-quant to uint8 + const uint32_t vq = ((vt * c_y_scale) + vrounding) / vsum + c_y_zp; + const uint8_t vy = vq > 255 ? 
static_cast(255) : static_cast(vq); + *y_t++ = vy; + } while (--elements_n != 0); + x_t = x_t_cur; + } + }); + + return Status::OK(); +} + +template <> +common::Status QlinearSoftmaxCPU(size_t N, + size_t D, + const int8_t* x_data, + int8_t* y_data, + const uint32_t* lookup_table, + uint32_t y_scale, + int8_t yzp, + onnxruntime::concurrency::ThreadPool* thread_pool) { + using onnxruntime::TensorOpCost; + using onnxruntime::concurrency::ThreadPool; + ThreadPool::TryParallelFor( + thread_pool, N, + // Read 3*N (max,sum,div) write N (div), computation=Read + TensorOpCost{static_cast(D * 3), + static_cast(D), + static_cast(D * 3)}, + [x_data, y_data, D, y_scale, yzp, &lookup_table](std::ptrdiff_t first, std::ptrdiff_t last) { + const auto c_y_scale = y_scale; + const auto c_y_zp = yzp; + + const int8_t* x_t = x_data + first * D; + int8_t* y_t = y_data + first * D; + for (; first < last; first++) { + // reduceMaxInt8 + int8_t xmax = *std::max_element(x_t, x_t + D); + const size_t adjustment = 127 - xmax; + const uint32_t* shifted_lookuptable = lookup_table; + size_t elements_n = D; + // reduceSumUin8ToUint32: need speedup + uint32_t vsum = 0; + const int8_t* x_t_cur = x_t; + do { + const size_t vx = uint8_t(adjustment + (*x_t_cur++)); + vsum += shifted_lookuptable[vx]; + } while (--elements_n != 0); + if (vsum == 0) { + return; + } + elements_n = D; + x_t_cur = x_t; + // elementwise div + const uint32_t vrounding = (vsum >> 1); + do { + const size_t vx = uint8_t(adjustment + (*x_t_cur++)); + const uint32_t vt = shifted_lookuptable[vx]; + // simulate round function, and re-quant to Int8 + const uint32_t vq = ((vt * c_y_scale) + vrounding) / vsum + c_y_zp; + const int8_t vy = static_cast(vq) > 255 ? 
static_cast(255) : static_cast(vq); + *y_t++ = vy; + } while (--elements_n != 0); + x_t = x_t_cur; + } + }); + + return Status::OK(); +} + +gsl::span QLinearSoftmax::GetLookupTable(OpKernelContext* context, + gsl::span lookup_table_span, + size_t reduce_len) const { + gsl::span lookup_table = fixed_lookup_table_; + if (fixed_lookup_table_.size() == 0) { + lookup_table = lookup_table_span; + const float X_scale = *(context->Input(1)->Data()); + QlinearBuildLookupTableUint32(lookup_table_span, X_scale, reduce_len, is_signed_); + } + return lookup_table; +} + +// opset-12 and below +Status QLinearSoftmax::ComputeInternal(OpKernelContext* context, const Tensor& input, Tensor& output, + gsl::span lookup_table, int axis, + concurrency::ThreadPool* thread_pool) const { + const auto* Y_scale_tensor = context->Input(3); + const auto* Y_zp_tensor = context->Input(4); + const auto Y_scale = gsl::narrow_cast(1.0F / (*(Y_scale_tensor->Data()))); + const auto& X_shape = input.Shape(); + const size_t N = X_shape.SizeToDimension(axis); + const size_t D = X_shape.SizeFromDimension(axis); + common::Status status; + if (is_signed_) { + using T = int8_t; + const T Y_zp = Y_zp_tensor ? *(Y_zp_tensor->Data()) : 0; + status = QlinearSoftmaxCPU(N, D, input.Data(), output.MutableData(), + lookup_table.data(), Y_scale, Y_zp, thread_pool); + } else { + using T = uint8_t; + const T Y_zp = Y_zp_tensor ? 
*(Y_zp_tensor->Data()) : 0; + status = QlinearSoftmaxCPU(N, D, input.Data(), output.MutableData(), + lookup_table.data(), Y_scale, Y_zp, thread_pool); + } + return status; +} + +// opset-13 and above +Status QLinearSoftmax::ComputeImplOpset13(OpKernelContext* context, + const Tensor& input, Tensor& output, + gsl::span lookup_table, + concurrency::ThreadPool* thread_pool) const { + const auto& X_shape = input.Shape(); + size_t rank = X_shape.NumDimensions(); + + bool is_transpose_required = (size_t(axis_) != (rank - 1)); + Tensor transposed_input; + Tensor intermediate_output; // output that the softmax implementation will write into while using transposed input + std::vector permutation(rank); + + if (is_transpose_required) { + AllocatorPtr alloc; + ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc)); + std::iota(std::begin(permutation), std::end(permutation), 0); + + // swap the innermost dim with the dim corresponding to axis + permutation[axis_] = rank - 1; + permutation[rank - 1] = axis_; + std::vector transposed_input_dims(rank); + std::transform(permutation.cbegin(), permutation.cend(), + transposed_input_dims.begin(), [&X_shape](size_t e) { return X_shape[e]; }); + + // Allocate a temporary tensor to hold transposed input + transposed_input = Tensor(input.DataType(), TensorShape(transposed_input_dims), alloc); + // Perform the transpose + ORT_RETURN_IF_ERROR(TransposeBase::DoTranspose(permutation, input, transposed_input)); + // Allocate memory for the intermediate output + intermediate_output = Tensor(output.DataType(), TensorShape(transposed_input_dims), alloc); + } + + common::Status status; + + const auto& input_tensor = is_transpose_required ? transposed_input : input; + auto& output_tensor = is_transpose_required ? 
intermediate_output : output; + + ORT_RETURN_IF_ERROR(ComputeInternal(context, input_tensor, output_tensor, lookup_table, int(rank - 1), thread_pool)); + + if (is_transpose_required) { + // Perform the transpose to get the axes back to the original ordering + status = (TransposeBase::DoTranspose(permutation, intermediate_output, output)); + } + return status; +} + +ONNX_CPU_OPERATOR_MS_KERNEL( + QLinearSoftmax, + 1, + KernelDefBuilder().TypeConstraint( + "T", + {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), + QLinearSoftmax) + +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/quantization/qlinear_softmax.h b/onnxruntime/contrib_ops/cpu/quantization/qlinear_softmax.h new file mode 100644 index 0000000000..a90083cd40 --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/quantization/qlinear_softmax.h @@ -0,0 +1,34 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include + +#include "core/framework/op_kernel.h" + +namespace onnxruntime { +namespace contrib { + +class QLinearSoftmax final : public OpKernel { + public: + QLinearSoftmax(const OpKernelInfo& info); + Status Compute(OpKernelContext* context) const override; + + private: + gsl::span GetLookupTable(OpKernelContext* context, gsl::span lookup_table_span, size_t reduce_len) const; + + Status ComputeInternal(OpKernelContext* context, const Tensor& input, Tensor& output, gsl::span lookup_table, int axis, concurrency::ThreadPool* thread_pool) const; + + Status ComputeImplOpset13(OpKernelContext* context, const Tensor& input, Tensor& output, + gsl::span lookup_table, concurrency::ThreadPool* thread_pool) const; + + private: + std::vector fixed_lookup_table_; + int axis_ = -1; + int opset_ = 1; + bool is_signed_{false}; +}; + +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/core/graph/contrib_ops/ms_opset.h b/onnxruntime/core/graph/contrib_ops/ms_opset.h index 
4d1234988a..c6850eb8e3 100644 --- a/onnxruntime/core/graph/contrib_ops/ms_opset.h +++ b/onnxruntime/core/graph/contrib_ops/ms_opset.h @@ -29,6 +29,7 @@ class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QLinearLeakyRelu); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QLinearMul); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QLinearReduceMean); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QLinearSigmoid); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QLinearSoftmax); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QuantizeLinear); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, ReduceSumInteger); @@ -98,6 +99,7 @@ class OpSet_Microsoft_ver1 { fn(GetOpSchema()); fn(GetOpSchema()); fn(GetOpSchema()); + fn(GetOpSchema()); fn(GetOpSchema()); fn(GetOpSchema()); diff --git a/onnxruntime/core/graph/contrib_ops/quantization_defs.cc b/onnxruntime/core/graph/contrib_ops/quantization_defs.cc index 7e9995aff5..34fb5a16ad 100644 --- a/onnxruntime/core/graph/contrib_ops/quantization_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/quantization_defs.cc @@ -559,7 +559,7 @@ and produces one output data (Tensor) where the function `f(x) = quantize(alp .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput)); const char* QLinearSigmoidDoc_ver1 = R"DOC( -QLinearSigmoid takes quantized input data (Tensor), and quantize parameter for output, and produces one output data +QLinearSigmoid takes quantized input data (Tensor), and quantize parameter for output, and produces one output data (Tensor) where the function `f(x) = quantize(Sigmoid(dequantize(x)))`, is applied to the data tensor elementwise. 
Wwhere the function `Sigmoid(x) = 1 / (1 + exp(-x))` )DOC"; @@ -585,6 +585,62 @@ Wwhere the function `Sigmoid(x) = 1 / (1 + exp(-x))` )DOC"; "Constrain input and output types to 8 bit tensors.") .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput)); + ONNX_MS_OPERATOR_SET_SCHEMA(QLinearSoftmax, 1, OpSchema().SetDoc(R"DOC( +QLinearSoftmax computes the normalized exponential values for the given input: +Softmax(input, axis) = Exp(input) / ReduceSum(Exp(input), axis=axis, keepdims=1) +The input does not need to explicitly be a 2D vector. The "axis" attribute +indicates the dimension along which QLinearSoftmax will be performed for onnx v.13+. +or the dimension coerced to NxD Matrix for onnx v.12-. +The output tensor has the same shape. +)DOC") + .Attr("axis", "apply softmax to elements for dimensions axis," + "or all dims along with axis according to op-version", AttributeProto::INT, static_cast(-1)) + .Attr("opset", "opset version of corresponding SoftMax.", AttributeProto::INT) + .Input(0, "X", "The input tensor", "T") + .Input(1, "X_scale", "Scale of quantized input 'X'. It must be a scalar.", "tensor(float)") + .Input(2, "x_zero_point", + "Zero point tensor for input 'X'." + "It must be a scalar.", + "T", OpSchema::Optional) + .Input(3, "y_scale", "Scale of quantized output 'Y'. It must be a scalar.", "tensor(float)") + .Input(4, "y_zero_point", + "Zero point tensor for output 'Y'. " + "It must be a scalar.", + "T") + .Output(0, "Y", + "Output data tensor from pooling across the input " + "tensor. The output tensor has the same rank as the input. 
", + "T") + .TypeConstraint("T", {"tensor(uint8)", "tensor(int8)"}, + "Constrain input and output types to singed/unsigned int8 tensors.") + .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { + // Type inference + propagateElemTypeFromInputToOutput(ctx, 0, 0); + + // Shape inference starts + if (!hasNInputShapes(ctx, 1)) { + return; + } + + // Validate the value of 'axis' + const ONNX_NAMESPACE::TensorShapeProto& input_shape = + ctx.getInputType(0)->tensor_type().shape(); + int r = input_shape.dim_size(); + int axis = static_cast(getAttribute(ctx, "axis", -1)); + if (axis < -r || axis >= r) { + fail_shape_inference( + "'axis' must be in [", + -r, + " , ", + (r - 1), + "]. Its actual value is: ", + axis); + } + + // Shape inference + propagateShapeFromInputToOutput(ctx, 0, 0); + })); + ONNX_MS_OPERATOR_SET_SCHEMA(DynamicQuantizeLSTM, 1, OpSchema() .Attr( "direction", diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc index 0cbd499f45..379f90a4f0 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc @@ -4,7 +4,7 @@ #include "core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h" #include "core/optimizer/qdq_transformer/qdq_util.h" - +#include "core/graph/node_attr_utils.h" namespace onnxruntime { namespace QDQ { @@ -195,6 +195,15 @@ UnaryReplaceWithQLinear::UnaryReplaceWithQLinear(std::string domain) : ReplaceWithQLinear(std::move(domain), UnaryMoves()) { } +NodeAttributes UnaryReplaceWithQLinear::ExtraAttributes(const RuntimeState& state) const { + const auto& target = state.selected_nodes.Target(); + NodeAttributes attr; + if (target.OpType() == "Softmax") { + attr["opset"] = utils::MakeAttribute(std::string("opset"), int64_t(target.SinceVersion())); + } + return attr; +} + 
BinaryReplaceWithQLinear::BinaryReplaceWithQLinear(std::string domain) : ReplaceWithQLinear(std::move(domain), BinaryMoves()) { } diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h index c9f889ede4..b8e371ecc8 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h @@ -43,6 +43,9 @@ struct ReplaceWithQLinear : public QDQReplaceWithNew { struct UnaryReplaceWithQLinear : ReplaceWithQLinear { UnaryReplaceWithQLinear(std::string domain); + + private: + NodeAttributes ExtraAttributes(const RuntimeState& state) const override; }; struct BinaryReplaceWithQLinear : ReplaceWithQLinear { diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc index 9e75a0fbad..a2ea67813f 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc @@ -82,7 +82,8 @@ void UnaryOpQDQRules(SelectorActionRegistry& qdq_selector_action_registry) { {{"AveragePool", {}}, {"LeakyRelu", {}}, {"GlobalAveragePool", {}}, - {"Sigmoid", {}}}, + {"Sigmoid", {}}, + {"Softmax", {}}}, std::move(selector), std::move(action)); #else diff --git a/onnxruntime/python/tools/quantization/onnx_quantizer.py b/onnxruntime/python/tools/quantization/onnx_quantizer.py index 6ce15593cb..1c1ade6948 100644 --- a/onnxruntime/python/tools/quantization/onnx_quantizer.py +++ b/onnxruntime/python/tools/quantization/onnx_quantizer.py @@ -132,6 +132,8 @@ class ONNXQuantizer: # some output from nodes will be quantized, yet itself should be treat as existing so # no dequantized will be applied when needed later 
self.generated_value_names = self.model.get_non_initializer_inputs() + # to store specified scale and zeropoint instead of calculated value, tensor_name->(scale, zeropoint) + self.used_scale_zp_map = {} # routines for subgraph support def quantize_subgraph(self, subgraph, graph_key): @@ -625,6 +627,18 @@ class ONNXQuantizer: self.quantized_value_map[input_name] = QuantizedValue(input_name, output_name, scale_name, zp_name, qType) return nodes + [qlinear_node] + def set_quant_scale_zp(self, tensor_name, value): + assert isinstance(value, tuple) and len(value) == 2, "value must be scale(float) and zeropoint" + assert tensor_name not in self.used_scale_zp_map, f"{tensor_name} has been setted before" + self.used_scale_zp_map[tensor_name] = value + + def find_quant_scale_zp(self, input_name): + if input_name in self.used_scale_zp_map: + return self.used_scale_zp_map[input_name] + if self.parent is not None: + return self.parent.find_quantized_value(input_name) + return (None, None) + def find_quantized_value(self, input_name): if input_name in self.quantized_value_map: return self.quantized_value_map[input_name] diff --git a/onnxruntime/python/tools/quantization/operators/softmax.py b/onnxruntime/python/tools/quantization/operators/softmax.py new file mode 100644 index 0000000000..8855859236 --- /dev/null +++ b/onnxruntime/python/tools/quantization/operators/softmax.py @@ -0,0 +1,86 @@ +import onnx + +from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain +from .base_operator import QuantOperatorBase +from .qdq_base_operator import QDQOperatorBase + + +class QLinearSoftmax(QuantOperatorBase): + def quantize(self): + node = self.node + # set limitations for softmax output scale and zp, because the output of softmax is always 0-1 + if self.quantizer.input_qType == onnx.onnx_pb.TensorProto.UINT8: + out_scale = 1 / 256.0 + out_zero_point = 0 + else: + out_scale = 1 / 256.0 + out_zero_point = -128 + # only try to quantize when given 
quantization parameters for it + ( + data_found, + output_scale_name, + output_zp_name, + _, + _, + ) = self.quantizer._get_quantization_params(node.output[0], out_scale, out_zero_point) + + # get quantized input tensor names, quantize input if needed + ( + quantized_input_names, + input_zero_point_names, + input_scale_names, + nodes, + ) = self.quantizer.quantize_inputs(node, [0]) + + if not data_found or quantized_input_names is None: + return super().quantize() + + # Create an entry for output quantized value. + qlinear_output_name = node.output[0] + "_quantized" + quantized_output_value = QuantizedValue( + node.output[0], + qlinear_output_name, + output_scale_name, + output_zp_name, + QuantizedValueType.Input, + ) + self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value + + # Create qlinear softmax node for given type + kwargs = {} + for attribute in node.attribute: + kwargs.update(attribute_to_kwarg(attribute)) + kwargs["domain"] = ms_domain + # make qlinearsoft has the real opset_version, its default SinceVersion would be 1 + kwargs["opset"] = self.quantizer.opset_version + qlinear_node_name = node.name + "_quant" if node.name != "" else "" + qnode = onnx.helper.make_node( + "QLinear" + node.op_type, + [ + quantized_input_names[0], + input_scale_names[0], + input_zero_point_names[0], + output_scale_name, + output_zp_name, + ], + [qlinear_output_name], + qlinear_node_name, + **kwargs, + ) + + # add all newly created nodes + nodes.append(qnode) + self.quantizer.new_nodes += nodes + return None + + +class QDQSoftmax(QDQOperatorBase): + def quantize(self): + super().quantize() + if self.quantizer.input_qType == onnx.onnx_pb.TensorProto.UINT8: + out_scale = 1 / 256.0 + out_zero_point = 0 + else: + out_scale = 1 / 256.0 + out_zero_point = -128 + self.quantizer.set_quant_scale_zp(self.node.output[0], (out_scale, out_zero_point)) diff --git a/onnxruntime/python/tools/quantization/qdq_quantizer.py 
b/onnxruntime/python/tools/quantization/qdq_quantizer.py index b397d0728c..8957b7144f 100644 --- a/onnxruntime/python/tools/quantization/qdq_quantizer.py +++ b/onnxruntime/python/tools/quantization/qdq_quantizer.py @@ -340,7 +340,10 @@ class QDQQuantizer(ONNXQuantizer): if initializer: self._add_qdq_pair_for_weight(initializer, tensor_info.axis) else: - data_found, scale_name, zp_name, _, _ = self._get_quantization_params(tensor_name) + used_scale, used_zp = self.find_quant_scale_zp(tensor_name) + data_found, scale_name, zp_name, _, _ = self._get_quantization_params( + tensor_name, used_scale, used_zp + ) if not data_found: raise ValueError( diff --git a/onnxruntime/python/tools/quantization/registry.py b/onnxruntime/python/tools/quantization/registry.py index 59f42dc1cc..0227da5e24 100644 --- a/onnxruntime/python/tools/quantization/registry.py +++ b/onnxruntime/python/tools/quantization/registry.py @@ -17,6 +17,7 @@ from .operators.pad import QPad from .operators.pooling import QLinearPool from .operators.qdq_base_operator import QDQOperatorBase from .operators.resize import QDQResize, QResize +from .operators.softmax import QDQSoftmax, QLinearSoftmax from .operators.split import QDQSplit, QSplit from .quant_utils import QuantizationMode @@ -55,6 +56,7 @@ QLinearOpsRegistry = { "Resize": QResize, "AveragePool": QLinearPool, "Concat": QLinearConcat, + "Softmax": QLinearSoftmax, } QLinearOpsRegistry.update(CommonOpsRegistry) @@ -73,6 +75,7 @@ QDQRegistry = { "MatMul": QDQMatMul, "Split": QDQSplit, "Gather": QDQGather, + "Softmax": QDQSoftmax, } diff --git a/onnxruntime/test/contrib_ops/qlinear_lookup_table_test.cc b/onnxruntime/test/contrib_ops/qlinear_lookup_table_test.cc index 4a8a861b61..cdec3cd5a2 100644 --- a/onnxruntime/test/contrib_ops/qlinear_lookup_table_test.cc +++ b/onnxruntime/test/contrib_ops/qlinear_lookup_table_test.cc @@ -115,5 +115,153 @@ TEST(QLinearLookupTableBasedOperatorTests, QLinearSigmoid_UInt8_0_Y_ZP) { run_test(true); } +/* +\brief data is 
generated by pytorch script
+\details model defines
+```
+    input(int8/uint8)
+    x = self.dequant(x)
+    x = self.softmax(x)
+    x = self.quant2(x)
+    output(int8/uint8)
+```
+\see then followed by the [DOC](https://pytorch.org/docs/stable/quantization.html)
+*/
+TEST(QLinearLookupTableBasedOperatorTests, QLinearSoftmax_UInt8_v12) {
+  OpTester test("QLinearSoftmax", 1, onnxruntime::kMSDomain);
+  test.AddAttribute("axis", -2);
+  test.AddAttribute("opset", 12);
+  float X_scale = 0.166099221f;
+  // input quantization parameters (scale above, zero point below)
+  uint8_t X_zero_point = 128;
+  float Y_scale = 1.0f / 256.0f;
+  uint8_t Y_zero_point = 0;
+  // output quantization parameters end here
+
+  std::vector dims = {2, 4, 5};
+  auto x_in = std::vector{50, 67, 58, 68, 46, 69, 77, 91, 62, 74, 67, 72, 71, 70, 83, 88, 75, 54, 74, 88};
+  auto y_out = std::vector { 0, 2, 0, 2, 0, 2, 8, 86, 1, 5, 2, 4, 3, 3, 23, 52, 6, 0, 5, 52 };
+  for (int64_t i = 1; i < dims[0]; i++) {  // repeat the first dims[1]*dims[2] values for every further dims[0] slice
+    for (int64_t j = 0; j < dims[1] * dims[2]; j++) {
+      x_in.push_back(x_in[j]);
+      y_out.push_back(y_out[j]);
+    }
+  }
+
+  test.AddInput("X", dims, x_in);
+  test.AddInput("X_scale", {}, {X_scale});
+  test.AddInput("X_zero_point", {}, {X_zero_point});
+  test.AddInput("Y_scale", {}, {Y_scale});
+  test.AddInput("Y_zero_point", {}, {Y_zero_point});
+  test.AddOutput("Y", dims, y_out);
+  auto origin_round_mode = std::fegetround();
+  std::fesetround(FE_TONEAREST);  // run the op with round-to-nearest
+  test.Run();
+  std::fesetround(origin_round_mode);  // restore the rounding mode saved above
+}
+
+TEST(QLinearLookupTableBasedOperatorTests, QLinearSoftmax_UInt8_v13) {
+  OpTester test("QLinearSoftmax", 1, onnxruntime::kMSDomain);
+  test.AddAttribute("axis", -2);
+  test.AddAttribute("opset", 13);
+  float X_scale = 0.0304f;
+  // input quantization parameters (scale above, zero point below)
+  uint8_t X_zero_point = 128;
+  float Y_scale = 0.0059f;
+  uint8_t Y_zero_point = 0;
+  // output quantization parameters end here
+
+  std::vector dims = {4, 4, 4};
+  auto x_in = std::vector {
+      62, 50, 71, 37, 68, 88, 64, 51, 59, 95, 41, 54, 55, 20, 77, 32, 92,
+      63, 43, 13, 76, 82, 53, 43, 60, 18, 73, 74, 22, 89, 44, 106, 17,
+      95, 27, 35, 47, 57, 0, 78, 97, 66, 56, 28, 127, 33, 106, 71, 119,
+      64, 16, 0, 16, 79, 27, 89, 110, 126, 88, 90, 67, 11, 4, 90};
+  auto y_out = std::vector {
+      43, 20, 50, 33, 52, 63, 40, 51, 39, 78,
+      20, 56, 35, 8, 59, 29, 80, 32, 29, 6, 49, 57, 39, 16, 30, 8, 72, 40,
+      10, 71, 30, 107, 4, 90, 11, 20, 10, 28, 5, 74, 45, 37, 27, 16, 111, 14,
+      125, 59, 84, 18, 14, 4, 4, 28, 20, 54, 64, 119, 126, 56, 17, 4, 10, 56};
+
+  test.AddInput("X", dims, x_in);
+  test.AddInput("X_scale", {}, {X_scale});
+  test.AddInput("X_zero_point", {}, {X_zero_point});
+  test.AddInput("Y_scale", {}, {Y_scale});
+  test.AddInput("Y_zero_point", {}, {Y_zero_point});
+  test.AddOutput("Y", dims, y_out);
+  auto origin_round_mode = std::fegetround();
+  std::fesetround(FE_TONEAREST);  // run the op with round-to-nearest
+  test.Run();
+  std::fesetround(origin_round_mode);  // restore the rounding mode saved above
+}
+
+TEST(QLinearLookupTableBasedOperatorTests, QLinearSoftmax_Int8_v13) {
+  OpTester test("QLinearSoftmax", 1, onnxruntime::kMSDomain);
+  test.AddAttribute("axis", -2);
+  test.AddAttribute("opset", 13);
+  float X_scale = 0.0304F;
+  // input quantization parameters (scale above, zero point below)
+  int8_t X_zero_point = 0;
+  float Y_scale = 0.0059F;
+  int8_t Y_zero_point = -128;
+  // output quantization parameters end here
+
+  std::vector dims = {4, 4, 4};
+  auto x_in = std::vector {
+      -4, -16, 5, -29, 2, 22, -2, -15, -7, 29, -25, -12, -11, -46, 11, -34, 26,
+      -3, -23, -53, 10, 16, -13, -23, -6, -48, 7, 8, -44, 23, -22, 40, -49, 29, -39, -31, -19, -9,
+      -72, 12, 31, 0, -10, -38, 61, -33, 40, 5, 53, -2, -50, -66, -50, 13, -39, 23, 44, 60, 22, 24,
+      1, -55, -62, 24};
+  auto y_out = std::vector {
+      -85, -108, -78, -95, -76, -65, -88, -77, -89, -50, -108, -72, -93,
+      -120, -69, -99, -48, -96, -99, -122, -79, -71, -89, -112, -98, -120, -56, -88, -118, -57, -98,
+      -21, -124, -38, -117, -108, -118, -100, -124, -54, -83, -91, -100, -112, -17, -114, -2, -69, -44,
+      -110, -114, -124, -124, -100, -108, -74, -64, -9, -2, -72, -111, -124, -118, -72};
+
+  test.AddInput("X", dims, x_in);
+  test.AddInput("X_scale", {}, {X_scale});
+  test.AddInput("X_zero_point", {}, {X_zero_point});
+  test.AddInput("Y_scale", {}, {Y_scale});
+  test.AddInput("Y_zero_point", {}, {Y_zero_point});
+  test.AddOutput("Y", dims, y_out);
+  auto origin_round_mode = std::fegetround();
+  std::fesetround(FE_TONEAREST);  // run the op with round-to-nearest
+  test.Run();
+  std::fesetround(origin_round_mode);  // restore the rounding mode saved above
+}
+
+TEST(QLinearLookupTableBasedOperatorTests, QLinearSoftmax_Int8_v12) {
+  OpTester test("QLinearSoftmax", 1, onnxruntime::kMSDomain);
+  test.AddAttribute("axis", -2);
+  test.AddAttribute("opset", 12);
+  float X_scale = 0.166099221f;
+  // input quantization parameters (scale above, zero point below)
+  int8_t X_zero_point = 0;
+  float Y_scale = 1.0f / 128.0f;
+  int8_t Y_zero_point = 0;
+  // output quantization parameters end here
+
+  std::vector dims = {2, 4, 5};
+  auto x_in = std::vector{-28, -4, -4, -7, 3, -26, 4, -16, 23, 14, -7, 26, -8, 19, -16, -13, 7, 17, 27, 5};
+  auto y_out = std::vector{0, 0, 0, 0, 1, 0, 1, 0, 22, 5, 0, 35, 0, 11, 0, 0, 2, 8, 42, 1};
+  for (int64_t i = 1; i < dims[0]; i++) {  // repeat the first dims[1]*dims[2] values for every further dims[0] slice
+    for (int64_t j = 0; j < dims[1] * dims[2]; j++) {
+      x_in.push_back(x_in[j]);
+      y_out.push_back(y_out[j]);
+    }
+  }
+
+  test.AddInput("X", dims, x_in);
+  test.AddInput("X_scale", {}, {X_scale});
+  test.AddInput("X_zero_point", {}, {X_zero_point});
+  test.AddInput("Y_scale", {}, {Y_scale});
+  test.AddInput("Y_zero_point", {}, {Y_zero_point});
+  test.AddOutput("Y", dims, y_out);
+  auto origin_round_mode = std::fegetround();
+  std::fesetround(FE_TONEAREST);  // run the op with round-to-nearest
+  test.Run();
+  std::fesetround(origin_round_mode);  // restore the rounding mode saved above
+}
+
 }  // namespace test
 }  // namespace onnxruntime
diff --git a/onnxruntime/test/optimizer/qdq_transformer_test.cc b/onnxruntime/test/optimizer/qdq_transformer_test.cc
index 4131cbe497..56ae5aeafb 100644
--- a/onnxruntime/test/optimizer/qdq_transformer_test.cc
+++ b/onnxruntime/test/optimizer/qdq_transformer_test.cc
@@ -1857,6 +1857,67 @@ TEST(QDQTransformerTests, Concat) {
   test_case({{1, 6, 36}, {1, 6, 8}, {1, 6, 2}}, 2, false, false, true);
 }
 
+template  // NOTE(review): the template parameter list (and the <...> arguments below) look stripped in this copy - confirm against upstream
+void QDQTransformerSoftmaxTests() {
+  auto test_case = [&](const std::vector& input_shape, int64_t axis) {
+    auto build_test_case = [&](ModelTestBuilder& builder) {
+      auto* input_arg = builder.MakeInput(input_shape, -5.f, 5.f);
+      auto* output_arg = builder.MakeOutput();
+      // add QDQ + Softmax: a Q/DQ pair on the input feeds the Softmax node
+      auto* dq_output = AddQDQNodePair(builder, input_arg, .105f,
+                                       (std::numeric_limits::max() / 255 * 255) / 2);
+      auto* softmax_output = builder.MakeIntermediate();
+      auto& softmax_node = builder.AddNode("Softmax", {dq_output}, {softmax_output});
+      softmax_node.AddAttribute("axis", axis);
+      // add QDQ output: Q then DQ on the Softmax result, same scale and zero point 0 on both
+      auto* q_output = builder.MakeIntermediate();
+      builder.AddQuantizeLinearNode(softmax_output,
+                                    1.0f / (std::numeric_limits::max() + 1),
+                                    0,
+                                    q_output);
+      builder.AddDequantizeLinearNode(q_output,
+                                      1.0f / (std::numeric_limits::max() + 1),
+                                      0,
+                                      output_arg);
+    };
+
+    auto check_graph = [&](InferenceSessionWrapper& session) {
+      auto op_to_count = CountOpsInGraph(session.GetGraph());
+      if constexpr (std::is_same::value) {  // expect Softmax replaced by one QLinearSoftmax, leaving a single Q and a single DQ
+        EXPECT_EQ(op_to_count["com.microsoft.QLinearSoftmax"], 1);
+        EXPECT_EQ(op_to_count["Softmax"], 0);
+        EXPECT_EQ(op_to_count["QuantizeLinear"], 1);
+        EXPECT_EQ(op_to_count["DequantizeLinear"], 1);
+      } else {  // expect the graph unchanged: Softmax plus both Q/DQ pairs
+        EXPECT_EQ(op_to_count["com.microsoft.QLinearSoftmax"], 0);
+        EXPECT_EQ(op_to_count["Softmax"], 1);
+        EXPECT_EQ(op_to_count["QuantizeLinear"], 2);
+        EXPECT_EQ(op_to_count["DequantizeLinear"], 2);
+      }
+    };
+
+    TransformerTester(build_test_case,
+                      check_graph,
+                      TransformerLevel::Level1,
+                      TransformerLevel::Level2,
+                      12 /*opset_version*/,
+                      0.01 /*per_sample_tolerance*/,
+                      0.01 /*relative_per_sample_tolerance*/,
+                      std::make_unique(QDQIsInt8Allowed()));
+  };
+
+  test_case({1, 12, 37}, -1);  // softmax over the last axis
+  test_case({1, 23, 13, 13}, -2);  // softmax over the second-to-last axis
+}
+
+TEST(QDQTransformerTests, Softmax_S8S8) {
+  QDQTransformerSoftmaxTests();
+}
+
+TEST(QDQTransformerTests, Softmax_U8U8) {
+  QDQTransformerSoftmaxTests();
+}
+
 #endif  // !defined(DISABLE_CONTRIB_OPS)
 
 TEST(QDQTransformerTests, QDQPropagation_QBackward) {
diff --git a/onnxruntime/test/python/quantization/op_test_utils.py b/onnxruntime/test/python/quantization/op_test_utils.py
index efb4c8fbc3..251fd12d06 100644
---
a/onnxruntime/test/python/quantization/op_test_utils.py
+++ b/onnxruntime/test/python/quantization/op_test_utils.py
@@ -74,6 +74,8 @@ def check_model_correctness(testcase, model_path_origin, model_path_to_check, in
         model_path_origin, sess_options=sess_options, providers=["CPUExecutionProvider"]
     )
     origin_results = origin_sess.run([], inputs)
+    # enable QDQ transformers
+    # sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
     target_sess = onnxruntime.InferenceSession(
         model_path_to_check,
         sess_options=sess_options,
diff --git a/onnxruntime/test/python/quantization/test_op_softmax.py b/onnxruntime/test/python/quantization/test_op_softmax.py
new file mode 100644
index 0000000000..add97f9ebc
--- /dev/null
+++ b/onnxruntime/test/python/quantization/test_op_softmax.py
@@ -0,0 +1,191 @@
+#!/usr/bin/env python
+"""
+Softmax quantization test case
+"""
+# coding: utf-8
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See License.txt in the project root for
+# license information.
+# --------------------------------------------------------------------------
+
+import unittest
+
+import numpy as np
+import onnx
+from onnx import TensorProto, helper
+from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_qtype_by_node_type
+
+from onnxruntime.quantization import QuantFormat, QuantType, quantize_static
+
+
+class TestOpSoftmax(unittest.TestCase):
+    """Softmax quantization tests: QDQ and QOperator formats with u8 and s8 types."""
+
+    def input_feeds(self, n_repeat, name2shape):
+        """Build a TestDataFeeds of ``n_repeat`` random feeds with values drawn from {-1, 0, 1}.
+
+        ``name2shape`` maps each input name to the shape of the float32 array generated for it.
+        """
+        input_data_list = []
+        for _ in range(n_repeat):
+            inputs = {}
+            for name, shape in name2shape.items():
+                inputs[name] = np.random.randint(-1, 2, shape).astype(np.float32)
+            input_data_list.append(inputs)
+        return TestDataFeeds(input_data_list)
+
+    def construct_model_conv_softmax(
+        self,
+        output_model_path,
+        conv_input_shape,
+        conv_weight_shape,
+        softmax_input_shape,
+        softmax_attributes,
+        output_shape,
+    ):
+        """Save a float32 Conv -> {Identity, Softmax} model (opset 13) to ``output_model_path``."""
+        #      (input)
+        #          \
+        #         Conv
+        #        /    \
+        #   Identity   Softmax
+        #    /            \
+        # (identity_out)  (output)
+        input_tensor = helper.make_tensor_value_info("input", TensorProto.FLOAT, conv_input_shape)
+
+        conv_weight_arr = np.random.randint(-1, 2, conv_weight_shape).astype(np.float32)
+        conv_weight_initializer = onnx.numpy_helper.from_array(conv_weight_arr, name="conv1_weight")
+        conv_node = onnx.helper.make_node("Conv", ["input", "conv1_weight"], ["conv_output"], name="conv_node")
+
+        identity_out = helper.make_tensor_value_info("identity_out", TensorProto.FLOAT, softmax_input_shape)
+        identity_node = helper.make_node("Identity", ["conv_output"], ["identity_out"], name="IdentityNode")
+
+        initializers = [conv_weight_initializer]
+
+        output_tensor = helper.make_tensor_value_info("output", TensorProto.FLOAT, output_shape)
+        softmax_node = helper.make_node(
+            "Softmax", ["conv_output"], ["output"], name="softmax_node", **softmax_attributes
+        )
+
+        graph = helper.make_graph(
+            [conv_node, identity_node, softmax_node],
+            "TestOpQuantizersoftmax_test_model",
+            [input_tensor],
+            [identity_out, output_tensor],
+            initializer=initializers,
+        )
+        model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
+        model.ir_version = 7  # use stable onnx ir version
+        onnx.save(model, output_model_path)
+
+    def quantize_softmax_test(self, activation_type, weight_type, extra_options=None):
+        """Quantize the Conv+Softmax model in QOperator and QDQ formats and check both results.
+
+        For each format the quantized model's op counts and tensor types are checked,
+        then the model is passed to ``check_model_correctness`` together with the float32 one.
+        """
+        # A fresh dict per call; a ``{}`` default would be shared between calls.
+        extra_options = {} if extra_options is None else extra_options
+        np.random.seed(1)
+        model_fp32_path = "softmax_fp32.onnx"
+        self.construct_model_conv_softmax(
+            model_fp32_path,
+            [1, 2, 26, 42],
+            [3, 2, 3, 3],
+            [1, 3, 24, 40],
+            {"axis": -2},
+            [1, 3, 24, 40],
+        )
+        data_reader = self.input_feeds(1, {"input": [1, 2, 26, 42]})
+
+        activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8
+        activation_type_str = "u8" if (activation_type == QuantType.QUInt8) else "s8"
+        weight_type_str = "u8" if (weight_type == QuantType.QUInt8) else "s8"
+        model_q8_path = f"softmax_{activation_type_str}{weight_type_str}.onnx"
+        model_q8_qdq_path = f"softmax_qdq_{activation_type_str}{weight_type_str}.onnx"
+
+        # Verify QOperator mode
+        data_reader.rewind()
+        quantize_static(
+            model_fp32_path,
+            model_q8_path,
+            data_reader,
+            quant_format=QuantFormat.QOperator,
+            activation_type=activation_type,
+            weight_type=weight_type,
+            extra_options=extra_options,
+        )
+        qnode_counts = {
+            "QLinearConv": 1,
+            "QuantizeLinear": 1,
+            "DequantizeLinear": 2,
+            "QLinearSoftmax": 1,
+            "Softmax": 0,
+        }
+        check_op_type_count(self, model_q8_path, **qnode_counts)
+        qnode_io_qtypes = {
+            "QuantizeLinear": [
+                ["i", 2, activation_proto_qtype],
+                ["o", 0, activation_proto_qtype],
+            ]
+        }
+        qnode_io_qtypes.update(
+            {
+                "QLinearConv": [
+                    ["i", 2, activation_proto_qtype],
+                    ["i", 7, activation_proto_qtype],
+                    ["o", 0, activation_proto_qtype],
+                ]
+            }
+        )
+        qnode_io_qtypes.update(
+            {"QLinearSoftmax": [["i", 4, activation_proto_qtype]]}
+        )  # shape info is not working on custom ops
+        check_qtype_by_node_type(self, model_q8_path, qnode_io_qtypes)
+        data_reader.rewind()
+        check_model_correctness(self, model_fp32_path, model_q8_path, data_reader.get_next())
+
+        # Verify QDQ mode
+        data_reader.rewind()
+        quantize_static(
+            model_fp32_path,
+            model_q8_qdq_path,
+            data_reader,
+            quant_format=QuantFormat.QDQ,
+            activation_type=activation_type,
+            weight_type=weight_type,
+            extra_options=extra_options,
+        )
+        qdqnode_counts = {
+            "Conv": 1,
+            "QuantizeLinear": 3,
+            "DequantizeLinear": 4,
+            "Softmax": 1,
+        }
+        check_op_type_count(self, model_q8_qdq_path, **qdqnode_counts)
+        qnode_io_qtypes = {
+            "QuantizeLinear": [
+                ["i", 2, activation_proto_qtype],
+                ["o", 0, activation_proto_qtype],
+            ]
+        }
+        check_qtype_by_node_type(self, model_q8_qdq_path, qnode_io_qtypes)
+        data_reader.rewind()
+        check_model_correctness(self, model_fp32_path, model_q8_qdq_path, data_reader.get_next())
+
+    def test_quantize_softmax(self):
+        """u8 activations with u8 weights."""
+        self.quantize_softmax_test(QuantType.QUInt8, QuantType.QUInt8)
+
+    def test_quantize_softmax_s8s8(self):
+        """s8 activations with s8 weights, with the ActivationSymmetric option enabled."""
+        self.quantize_softmax_test(
+            QuantType.QInt8,
+            QuantType.QInt8,
+            extra_options={"ActivationSymmetric": True},
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/onnxruntime/test/testdata/kernel_def_hashes/contrib.cpu.json b/onnxruntime/test/testdata/kernel_def_hashes/contrib.cpu.json
index 0d624bf0a5..5fb55faa14 100644
--- a/onnxruntime/test/testdata/kernel_def_hashes/contrib.cpu.json
+++ b/onnxruntime/test/testdata/kernel_def_hashes/contrib.cpu.json
@@ -298,5 +298,9 @@
   [
     "QGemm com.microsoft CPUExecutionProvider",
     13737193491843065240
+  ],
+  [
+    "QLinearSoftmax com.microsoft CPUExecutionProvider",
+    10339195975968977840
   ]
-]
\ No newline at end of file
+]