diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md
index 714a265a52..2dca2eca8b 100644
--- a/docs/ContribOperators.md
+++ b/docs/ContribOperators.md
@@ -55,6 +55,7 @@ Do not modify directly.*
* com.microsoft.QLinearMul
* com.microsoft.QLinearReduceMean
* com.microsoft.QLinearSigmoid
+ * com.microsoft.QLinearSoftmax
* com.microsoft.QuantizeLinear
* com.microsoft.Range
* com.microsoft.ReduceSumInteger
@@ -2771,7 +2772,7 @@ This version of the operator has been available since version 1 of the 'com.micr
### **com.microsoft.QLinearSigmoid**
- QLinearSigmoid takes quantized input data (Tensor), and quantize parameter for output, and produces one output data
+ QLinearSigmoid takes quantized input data (Tensor), and quantize parameter for output, and produces one output data
(Tensor) where the function `f(x) = quantize(Sigmoid(dequantize(x)))`, is applied to the data tensor elementwise.
Wwhere the function `Sigmoid(x) = 1 / (1 + exp(-x))`
@@ -2809,6 +2810,58 @@ This version of the operator has been available since version 1 of the 'com.micr
+### **com.microsoft.QLinearSoftmax**
+
+ QLinearSoftmax computes the normalized exponential values for the given input:
+ Softmax(input, axis) = Exp(input) / ReduceSum(Exp(input), axis=axis, keepdims=1)
+ The input does not need to explicitly be a 2D vector. The "axis" attribute
+ indicates the dimension along which QLinearSoftmax will be performed for onnx v.13+,
+ or the dimension at which the input is coerced to an NxD matrix for onnx v.12-.
+ The output tensor has the same shape as the input.
+
+#### Version
+
+This version of the operator has been available since version 1 of the 'com.microsoft' operator set.
+
+#### Attributes
+
+
+- axis : int
+- apply softmax to elements for dimensions axis, or all dims along with axis according to op-version
+- opset : int (required)
+- opset version of corresponding SoftMax.
+
+
+#### Inputs
+
+
+- X : T
+- The input tensor
+- X_scale : tensor(float)
+- Scale of quantized input 'X'. It must be a scalar.
+- x_zero_point (optional) : T
+- Zero point tensor for input 'X'. It must be a scalar.
+- y_scale : tensor(float)
+- Scale of quantized output 'Y'. It must be a scalar.
+- y_zero_point : T
+- Zero point tensor for output 'Y'. It must be a scalar.
+
+
+#### Outputs
+
+
+- Y : T
+- Output data tensor with the softmax values of the input. The output tensor has the same shape as the input.
+
+
+#### Type Constraints
+
+
+- T : tensor(uint8), tensor(int8)
+- Constrain input and output types to signed/unsigned int8 tensors.
+
+
+
### **com.microsoft.QuantizeLinear**
The linear quantization operator. It consumes a full precision data, a scale, a zero point to compute the low precision / quantized tensor.
diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
index e1b5253548..f50b920234 100644
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@@ -430,6 +430,7 @@ Do not modify directly.*
|QLinearLeakyRelu|*in* X:**T**
*in* X_scale:**tensor(float)**
*in* X_zero_point:**T**
*in* Y_scale:**tensor(float)**
*in* Y_zero_point:**T**
*out* Y:**T**|1+|**T** = tensor(int8), tensor(uint8)|
|QLinearMul|*in* A:**T**
*in* A_scale:**tensor(float)**
*in* A_zero_point:**T**
*in* B:**T**
*in* B_scale:**tensor(float)**
*in* B_zero_point:**T**
*in* C_scale:**tensor(float)**
*in* C_zero_point:**T**
*out* C:**T**|1+|**T** = tensor(int8), tensor(uint8)|
|QLinearSigmoid|*in* X:**T**
*in* X_scale:**tensor(float)**
*in* X_zero_point:**T**
*in* Y_scale:**tensor(float)**
*in* Y_zero_point:**T**
*out* Y:**T**|1+|**T** = tensor(int8), tensor(uint8)|
+|QLinearSoftmax|*in* X:**T**
*in* X_scale:**tensor(float)**
*in* x_zero_point:**T**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T**
*out* Y:**T**|1+|**T** = tensor(int8), tensor(uint8)|
|QuantizeLinear|*in* x:**T1**
*in* y_scale:**T1**
*in* y_zero_point:**T2**
*out* y:**T2**|1+|**T1** = tensor(float)
**T2** = tensor(int8), tensor(uint8)|
|Range|*in* start:**T**
*in* limit:**T**
*in* delta:**T**
*out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(int16), tensor(int32), tensor(int64)|
|SampleOp|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(float)|
diff --git a/include/onnxruntime/core/framework/op_kernel.h b/include/onnxruntime/core/framework/op_kernel.h
index 34ed1b4863..0f5425315b 100644
--- a/include/onnxruntime/core/framework/op_kernel.h
+++ b/include/onnxruntime/core/framework/op_kernel.h
@@ -193,6 +193,9 @@ using BuildKernelCreateInfoFn = KernelCreateInfo (*)();
#define ONNX_CPU_OPERATOR_ML_KERNEL(name, ver, builder, ...) \
ONNX_OPERATOR_KERNEL_EX(name, kMLDomain, ver, kCpuExecutionProvider, builder, __VA_ARGS__)
+// Declares a CPU execution-provider kernel for operator `name` in kMSDomain by forwarding to
+// ONNX_OPERATOR_KERNEL_EX with the domain and provider arguments filled in.
+#define ONNX_CPU_OPERATOR_MS_KERNEL(name, ver, builder, ...) \
+  ONNX_OPERATOR_KERNEL_EX(name, kMSDomain, ver, kCpuExecutionProvider, builder, __VA_ARGS__)
+
#define ONNX_OPERATOR_KERNEL_EX(name, domain, ver, provider, builder, ...) \
class ONNX_OPERATOR_KERNEL_CLASS_NAME(provider, domain, ver, name); \
template <> \
diff --git a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc
index 04fa2b9b9f..0de091a9a4 100644
--- a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc
+++ b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc
@@ -55,6 +55,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1,
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, QLinearLeakyRelu);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, QLinearSigmoid);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, QLinearSigmoid);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, QLinearSoftmax);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, QLinearAdd);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, QLinearAdd);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, QLinearMul);
@@ -151,6 +152,7 @@ Status RegisterQuantizationKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo,
BuildKernelCreateInfo,
BuildKernelCreateInfo,
+ BuildKernelCreateInfo,
BuildKernelCreateInfo,
BuildKernelCreateInfo,
BuildKernelCreateInfo,
diff --git a/onnxruntime/contrib_ops/cpu/quantization/qlinear_softmax.cc b/onnxruntime/contrib_ops/cpu/quantization/qlinear_softmax.cc
new file mode 100644
index 0000000000..89cbe521a5
--- /dev/null
+++ b/onnxruntime/contrib_ops/cpu/quantization/qlinear_softmax.cc
@@ -0,0 +1,343 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "contrib_ops/cpu/quantization/qlinear_softmax.h"
+
+#include
+#include
+#include
+
+#include "core/common/common.h"
+#include "core/framework/tensorprotoutils.h"
+#include "core/providers/common.h"
+#include "core/providers/cpu/tensor/transpose.h"
+
+#include "core/mlas/inc/mlas.h"
+#include "core/platform/threadpool.h"
+#include "gsl/gsl-lite.hpp"
+
+namespace onnxruntime {
+namespace contrib {
+
+// Softmax opset threshold: below it the reduction covers every dimension from `axis` onward,
+// from it on the reduction covers only the `axis` dimension.
+constexpr int OPSET13 = 13;
+
+namespace {
+
+// Writes 256 fixed-point exponentials into `table`: step i stores qscale * exp((i - 255) * x_scale),
+// rounded with lrint.
+//   x_scale    - scale of the quantized input.
+//   reduce_len - number of elements summed per softmax row; it bounds qscale.
+//   is_signed  - selects where each step is stored (see the comment inside the loop).
+void QlinearBuildLookupTableUint32(gsl::span table,
+                                   const float x_scale,
+                                   size_t reduce_len, bool is_signed) {
+  // Fixed-point multiplier, capped by both UINT32_MAX / reduce_len and 0x7fffff.
+  const double qscale =
+      fmin(static_cast(UINT32_MAX) / static_cast(reduce_len), static_cast(0x7fffff));
+  for (int32_t i = 0; i < 256; i++) {
+    double scaled_exp_xi = qscale * exp(static_cast(i - 255) * static_cast(x_scale));
+    // we can't get the real max value of input tensor here, so we just assume 255.
+    // in the function of `QlinearSoftmaxCPU`,
+    // all numbers will have a shift (255-max_value) if its max value is not 255
+    //
+    // Slot written for step i:
+    //   is_signed: (i - 128) stored in a uint8_t, i.e. slots 128..255 first, then 0..127
+    //              (the int8 codes -128..-1, 0..127 viewed as unsigned bytes)
+    //   otherwise: i itself, i.e. slots 0..255
+    uint8_t index = static_cast(is_signed ? i - 128 : i);
+    table[index] = static_cast(lrint(scaled_exp_xi));
+  }
+}
+
+// Builds the lookup table up front when input 1 (X_scale) can be fetched as a constant input.
+// When it cannot, `fixed_lookup_table` is left untouched (callers check its size to detect this).
+//   info               - kernel info used to query the constant input.
+//   fixed_lookup_table - resized to 256 entries and filled on success.
+//   reduce_len         - forwarded to QlinearBuildLookupTableUint32.
+//   is_signed          - forwarded to QlinearBuildLookupTableUint32.
+void BuildLookupTableIfFixed(const OpKernelInfo& info, std::vector& fixed_lookup_table,
+                             size_t reduce_len, bool is_signed) {
+  const Tensor* tensor_x_scale = nullptr;
+
+  // The return value tells whether input 1 was available as a constant.
+  bool get_x_scale = info.TryGetConstantInput(1, &tensor_x_scale);
+  ORT_ENFORCE(tensor_x_scale == nullptr || IsScalarOr1ElementVector(tensor_x_scale),
+              "QlinearBuildLookupTable : input X_scale must be a scalar or 1D tensor of size 1");
+  bool is_fixed_parameters = get_x_scale;
+
+  if (is_fixed_parameters) {
+    fixed_lookup_table.resize(256);
+    const float X_scale = *(tensor_x_scale->Data());
+    QlinearBuildLookupTableUint32(fixed_lookup_table, X_scale, reduce_len, is_signed);
+  }
+}
+} // namespace
+
+// Reads the input element type, the mandatory "opset" attribute and the optional "axis" attribute,
+// then tries to pre-build the exp lookup table when the reduction length is known and positive.
+QLinearSoftmax::QLinearSoftmax(const OpKernelInfo& info)
+    : OpKernel(info) {
+  const auto& node = info.node();
+  auto input_defs = node.InputDefs();
+  auto input_type = input_defs[0]->TypeAsProto()->tensor_type().elem_type();
+  // INT8 input selects the signed code path; every other element type is handled as unsigned.
+  is_signed_ = (input_type == ONNX_NAMESPACE::TensorProto_DataType_INT8);
+  const auto* x_shape = input_defs[0]->Shape();
+  // The first input must carry shape information with at least one dimension.
+  ORT_ENFORCE(x_shape != nullptr && x_shape->dim_size() > 0, "input_shape of QLinearSoftmax must be existed");
+  int rank = x_shape->dim_size();
+
+  // "opset" is required: it decides both the default axis and the reduction semantics.
+  int64_t opset = -1;
+  Status status = info.GetAttr("opset", &opset);
+  ORT_ENFORCE(status.IsOK(), "opset must be existed in attributes of QlinearSoftmax");
+  opset_ = gsl::narrow_cast(opset);
+
+  int64_t axis = -1;
+  status = info.GetAttr("axis", &axis);
+  if (status.IsOK()) {
+    axis_ = gsl::narrow_cast(axis);
+  } else {
+    // opset-12 and below, the default axis value is 1
+    // opset-13, the default axis value is -1
+    axis_ = opset_ < OPSET13 ? 1 : -1;
+  }
+
+  // Normalize a negative axis against the input rank.
+  axis_ = static_cast(HandleNegativeAxis(axis_, int64_t(rank)));
+  auto input_shape = utils::GetTensorShapeFromTensorShapeProto(*x_shape);
+  // Reduction length: all dimensions from axis_ onward before opset 13, only dimension axis_ otherwise.
+  int64_t reduce_size = opset_ < OPSET13 ? input_shape.SizeFromDimension(axis_) : input_shape[axis_];
+  // reduce_size could be negative if input-shape has a dynamic axis
+  if (reduce_size > 0) {
+    BuildLookupTableIfFixed(info, fixed_lookup_table_, reduce_size, is_signed_);
+  }
+}
+
+// compute method of Softmax
+// compute method of Softmax
+// Allocates the output with the input's shape, obtains the lookup table (cached or built for this
+// call) and dispatches to the pre-opset-13 or the opset-13 implementation.
+Status QLinearSoftmax::Compute(OpKernelContext* ctx) const {
+  const auto* X = ctx->Input(0);
+  const auto& X_shape = X->Shape();
+  auto* Y = ctx->Output(0, X_shape);
+
+  // edge case. one or more dims with value of 0. nothing to do
+  if (X_shape.Size() == 0) {
+    return Status::OK();
+  }
+  concurrency::ThreadPool* thread_pool = ctx->GetOperatorThreadPool();
+  // D is the reduction length, computed the same way as in the constructor but from the runtime shape.
+  const size_t D = opset_ < OPSET13 ? X_shape.SizeFromDimension(axis_): X_shape[axis_];
+  // Stack storage handed to GetLookupTable for the case where no table was cached at construction.
+  uint32_t tmp_lookup_table[256];
+  gsl::span lookup_table = GetLookupTable(ctx, tmp_lookup_table, D);
+
+  if (opset_ < OPSET13) {
+    return ComputeInternal(ctx, *X, *Y, lookup_table, axis_, thread_pool);
+  } else {
+    return ComputeImplOpset13(ctx, *X, *Y, lookup_table, thread_pool);
+  }
+}
+
+// Quantized softmax over N rows of D contiguous elements each; specialized below for uint8_t and int8_t.
+//   x_data / y_data - input and output buffers.
+//   lookup_table    - the fixed-point exp values built by QlinearBuildLookupTableUint32.
+//   y_scale         - integer multiplier applied before dividing by the row sum.
+//   yzp             - output zero point added after the division.
+//   thread_pool     - pool over which the rows are distributed.
+template
+common::Status QlinearSoftmaxCPU(size_t N,
+                                 size_t D,
+                                 const T* x_data,
+                                 T* y_data,
+                                 const uint32_t* lookup_table,
+                                 uint32_t y_scale,
+                                 T yzp,
+                                 onnxruntime::concurrency::ThreadPool* thread_pool);
+
+// uint8 specialization: quantized softmax over N rows of D contiguous elements.
+//
+// For each row the maximum code is aligned with table slot 255 (the table stores exp((q - 255) * x_scale)),
+// the shifted table entries are summed, and each entry is scaled by `y_scale`, divided by the sum with
+// round-to-nearest, offset by `yzp` and saturated to [0, 255].
+//
+// Rows are processed independently on `thread_pool`; always returns Status::OK().
+template <>
+common::Status QlinearSoftmaxCPU<uint8_t>(size_t N,
+                                          size_t D,
+                                          const uint8_t* x_data,
+                                          uint8_t* y_data,
+                                          const uint32_t* lookup_table,
+                                          uint32_t y_scale,
+                                          uint8_t yzp,
+                                          onnxruntime::concurrency::ThreadPool* thread_pool) {
+  using onnxruntime::TensorOpCost;
+  using onnxruntime::concurrency::ThreadPool;
+  ThreadPool::TryParallelFor(
+      thread_pool, N,
+      // Per row: D elements are read three times (max, sum, div) and written once (div).
+      TensorOpCost{static_cast<double>(D * 3),
+                   static_cast<double>(D),
+                   static_cast<double>(D * 3)},
+      // The table pointer is captured by value; there is no need to reference the parameter itself.
+      [x_data, y_data, D, y_scale, yzp, lookup_table](std::ptrdiff_t first, std::ptrdiff_t last) {
+        for (std::ptrdiff_t row = first; row < last; ++row) {
+          const uint8_t* x_row = x_data + row * D;
+          uint8_t* y_row = y_data + row * D;
+
+          // Align the row maximum with slot 255 for the best precision: the table was built for
+          // (q - 255), so offsetting the base pointer by (255 - xmax) shifts every lookup accordingly.
+          const uint8_t xmax = *std::max_element(x_row, x_row + D);
+          const uint32_t* shifted_lookuptable = lookup_table + 255 - xmax;
+
+          // vsum = \sum_i{e^x_i}
+          uint32_t vsum = 0;
+          for (size_t i = 0; i < D; ++i) {
+            vsum += shifted_lookuptable[x_row[i]];
+          }
+          if (vsum == 0) {
+            // Nothing to normalize by. Emit the zero point (real value 0) for this row and keep going,
+            // rather than abandoning the remaining rows of this shard with unwritten output.
+            std::fill_n(y_row, D, yzp);
+            continue;
+          }
+
+          // elementwise div, y_i = e^x_i / vsum, rounded to nearest and re-quantized to uint8
+          const uint32_t vrounding = (vsum >> 1);
+          for (size_t i = 0; i < D; ++i) {
+            const uint32_t vt = shifted_lookuptable[x_row[i]];
+            // 64-bit intermediate: vt * y_scale may not fit in 32 bits for small output scales.
+            const uint64_t vq = (static_cast<uint64_t>(vt) * y_scale + vrounding) / vsum + yzp;
+            y_row[i] = vq > 255 ? static_cast<uint8_t>(255) : static_cast<uint8_t>(vq);
+          }
+        }
+      });
+
+  return Status::OK();
+}
+
+// int8 specialization: quantized softmax over N rows of D contiguous elements.
+//
+// For each row the maximum code is aligned with 127 (the signed table stores exp((s - 127) * x_scale)
+// at slot uint8(s)), the table entries are summed, and each entry is scaled by `y_scale`, divided by the
+// sum with round-to-nearest, offset by `yzp` and saturated to the int8 range [-128, 127].
+//
+// Rows are processed independently on `thread_pool`; always returns Status::OK().
+template <>
+common::Status QlinearSoftmaxCPU<int8_t>(size_t N,
+                                         size_t D,
+                                         const int8_t* x_data,
+                                         int8_t* y_data,
+                                         const uint32_t* lookup_table,
+                                         uint32_t y_scale,
+                                         int8_t yzp,
+                                         onnxruntime::concurrency::ThreadPool* thread_pool) {
+  using onnxruntime::TensorOpCost;
+  using onnxruntime::concurrency::ThreadPool;
+  ThreadPool::TryParallelFor(
+      thread_pool, N,
+      // Per row: D elements are read three times (max, sum, div) and written once (div).
+      TensorOpCost{static_cast<double>(D * 3),
+                   static_cast<double>(D),
+                   static_cast<double>(D * 3)},
+      // The table pointer is captured by value; there is no need to reference the parameter itself.
+      [x_data, y_data, D, y_scale, yzp, lookup_table](std::ptrdiff_t first, std::ptrdiff_t last) {
+        for (std::ptrdiff_t row = first; row < last; ++row) {
+          const int8_t* x_row = x_data + row * D;
+          int8_t* y_row = y_data + row * D;
+
+          // Shift every code so that the row maximum lands on 127. The shifted value lies in
+          // [-128, 127]; converting it to uint8_t gives the slot used by the signed table layout.
+          const int8_t xmax = *std::max_element(x_row, x_row + D);
+          const int adjustment = 127 - xmax;
+
+          // vsum = \sum_i{e^x_i}
+          uint32_t vsum = 0;
+          for (size_t i = 0; i < D; ++i) {
+            vsum += lookup_table[static_cast<uint8_t>(adjustment + x_row[i])];
+          }
+          if (vsum == 0) {
+            // Nothing to normalize by. Emit the zero point (real value 0) for this row and keep going,
+            // rather than abandoning the remaining rows of this shard with unwritten output.
+            std::fill_n(y_row, D, yzp);
+            continue;
+          }
+
+          // elementwise div, y_i = e^x_i / vsum, rounded to nearest and re-quantized to int8
+          const uint32_t vrounding = (vsum >> 1);
+          for (size_t i = 0; i < D; ++i) {
+            const uint32_t vt = lookup_table[static_cast<uint8_t>(adjustment + x_row[i])];
+            // Signed 64-bit intermediate: the zero point may be negative, and vt * y_scale may not
+            // fit in 32 bits for small output scales.
+            const int64_t vq =
+                static_cast<int64_t>((static_cast<uint64_t>(vt) * y_scale + vrounding) / vsum) + yzp;
+            // Saturate to the int8 range. Comparing against 255 here would let e.g. 256 + (-128) = 128
+            // wrap to -128, turning a probability close to 1 into 0.
+            const int64_t saturated = vq > 127 ? 127 : (vq < -128 ? -128 : vq);
+            y_row[i] = static_cast<int8_t>(saturated);
+          }
+        }
+      });
+
+  return Status::OK();
+}
+
+// Returns the table to use for this call: the one cached at construction when it is non-empty,
+// otherwise `lookup_table_span` after filling it from the X_scale value read from input 1.
+//   context           - used to read X_scale when no cached table exists.
+//   lookup_table_span - caller-provided storage for the per-call table.
+//   reduce_len        - reduction length forwarded to the table builder.
+gsl::span QLinearSoftmax::GetLookupTable(OpKernelContext* context,
+                                         gsl::span lookup_table_span,
+                                         size_t reduce_len) const {
+  gsl::span lookup_table = fixed_lookup_table_;
+  // An empty cached table means it could not be built in the constructor.
+  if (fixed_lookup_table_.size() == 0) {
+    lookup_table = lookup_table_span;
+    const float X_scale = *(context->Input(1)->Data());
+    QlinearBuildLookupTableUint32(lookup_table_span, X_scale, reduce_len, is_signed_);
+  }
+  return lookup_table;
+}
+
+// opset-12 and below
+Status QLinearSoftmax::ComputeInternal(OpKernelContext* context, const Tensor& input, Tensor& output,
+ gsl::span lookup_table, int axis,
+ concurrency::ThreadPool* thread_pool) const {
+ const auto* Y_scale_tensor = context->Input(3);
+ const auto* Y_zp_tensor = context->Input(4);
+ const auto Y_scale = gsl::narrow_cast(1.0F / (*(Y_scale_tensor->Data())));
+ const auto& X_shape = input.Shape();
+ const size_t N = X_shape.SizeToDimension(axis);
+ const size_t D = X_shape.SizeFromDimension(axis);
+ common::Status status;
+ if (is_signed_) {
+ using T = int8_t;
+ const T Y_zp = Y_zp_tensor ? *(Y_zp_tensor->Data()) : 0;
+ status = QlinearSoftmaxCPU(N, D, input.Data(), output.MutableData(),
+ lookup_table.data(), Y_scale, Y_zp, thread_pool);
+ } else {
+ using T = uint8_t;
+ const T Y_zp = Y_zp_tensor ? *(Y_zp_tensor->Data()) : 0;
+ status = QlinearSoftmaxCPU(N, D, input.Data(), output.MutableData(),
+ lookup_table.data(), Y_scale, Y_zp, thread_pool);
+ }
+ return status;
+}
+
+// opset-13 and above
+Status QLinearSoftmax::ComputeImplOpset13(OpKernelContext* context,
+ const Tensor& input, Tensor& output,
+ gsl::span lookup_table,
+ concurrency::ThreadPool* thread_pool) const {
+ const auto& X_shape = input.Shape();
+ size_t rank = X_shape.NumDimensions();
+
+ bool is_transpose_required = (size_t(axis_) != (rank - 1));
+ Tensor transposed_input;
+ Tensor intermediate_output; // output that the softmax implementation will write into while using transposed input
+ std::vector permutation(rank);
+
+ if (is_transpose_required) {
+ AllocatorPtr alloc;
+ ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc));
+ std::iota(std::begin(permutation), std::end(permutation), 0);
+
+ // swap the innermost dim with the dim corresponding to axis
+ permutation[axis_] = rank - 1;
+ permutation[rank - 1] = axis_;
+ std::vector transposed_input_dims(rank);
+ std::transform(permutation.cbegin(), permutation.cend(),
+ transposed_input_dims.begin(), [&X_shape](size_t e) { return X_shape[e]; });
+
+ // Allocate a temporary tensor to hold transposed input
+ transposed_input = Tensor(input.DataType(), TensorShape(transposed_input_dims), alloc);
+ // Perform the transpose
+ ORT_RETURN_IF_ERROR(TransposeBase::DoTranspose(permutation, input, transposed_input));
+ // Allocate memory for the intermediate output
+ intermediate_output = Tensor(output.DataType(), TensorShape(transposed_input_dims), alloc);
+ }
+
+ common::Status status;
+
+ const auto& input_tensor = is_transpose_required ? transposed_input : input;
+ auto& output_tensor = is_transpose_required ? intermediate_output : output;
+
+ ORT_RETURN_IF_ERROR(ComputeInternal(context, input_tensor, output_tensor, lookup_table, int(rank - 1), thread_pool));
+
+ if (is_transpose_required) {
+ // Perform the transpose to get the axes back to the original ordering
+ status = (TransposeBase::DoTranspose(permutation, intermediate_output, output));
+ }
+ return status;
+}
+
+// Registers QLinearSoftmax (version 1) as a CPU kernel through ONNX_CPU_OPERATOR_MS_KERNEL,
+// with type constraint "T" covering the two listed tensor types.
+ONNX_CPU_OPERATOR_MS_KERNEL(
+    QLinearSoftmax,
+    1,
+    KernelDefBuilder().TypeConstraint(
+        "T",
+        {DataTypeImpl::GetTensorType(),
+         DataTypeImpl::GetTensorType()}),
+    QLinearSoftmax)
+
+} // namespace contrib
+} // namespace onnxruntime
diff --git a/onnxruntime/contrib_ops/cpu/quantization/qlinear_softmax.h b/onnxruntime/contrib_ops/cpu/quantization/qlinear_softmax.h
new file mode 100644
index 0000000000..a90083cd40
--- /dev/null
+++ b/onnxruntime/contrib_ops/cpu/quantization/qlinear_softmax.h
@@ -0,0 +1,34 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include
+
+#include "core/framework/op_kernel.h"
+
+namespace onnxruntime {
+namespace contrib {
+
+// CPU kernel for the QLinearSoftmax contrib operator: softmax on 8-bit quantized tensors, computed
+// with a 256-entry lookup table of fixed-point exponentials.
+class QLinearSoftmax final : public OpKernel {
+ public:
+  QLinearSoftmax(const OpKernelInfo& info);
+  Status Compute(OpKernelContext* context) const override;
+
+ private:
+  // Returns the cached table, or fills `lookup_table_span` for this call when none was cached.
+  gsl::span GetLookupTable(OpKernelContext* context, gsl::span lookup_table_span, size_t reduce_len) const;
+
+  // Softmax over the input viewed as rows split at `axis` (semantics used before opset 13).
+  Status ComputeInternal(OpKernelContext* context, const Tensor& input, Tensor& output, gsl::span lookup_table, int axis, concurrency::ThreadPool* thread_pool) const;
+
+  // Softmax along axis_ only (opset 13 and above); transposes when axis_ is not the innermost dimension.
+  Status ComputeImplOpset13(OpKernelContext* context, const Tensor& input, Tensor& output,
+                            gsl::span lookup_table, concurrency::ThreadPool* thread_pool) const;
+
+ private:
+  // Table built in the constructor when possible; empty otherwise.
+  std::vector fixed_lookup_table_;
+  // Reduction axis, normalized to a non-negative value in the constructor.
+  int axis_ = -1;
+  // Value of the "opset" attribute.
+  int opset_ = 1;
+  // True when the input element type is int8.
+  bool is_signed_{false};
+};
+
+} // namespace contrib
+} // namespace onnxruntime
diff --git a/onnxruntime/core/graph/contrib_ops/ms_opset.h b/onnxruntime/core/graph/contrib_ops/ms_opset.h
index 4d1234988a..c6850eb8e3 100644
--- a/onnxruntime/core/graph/contrib_ops/ms_opset.h
+++ b/onnxruntime/core/graph/contrib_ops/ms_opset.h
@@ -29,6 +29,7 @@ class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QLinearLeakyRelu);
class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QLinearMul);
class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QLinearReduceMean);
class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QLinearSigmoid);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QLinearSoftmax);
class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QuantizeLinear);
class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, ReduceSumInteger);
@@ -98,6 +99,7 @@ class OpSet_Microsoft_ver1 {
fn(GetOpSchema());
fn(GetOpSchema());
fn(GetOpSchema());
+ fn(GetOpSchema());
fn(GetOpSchema());
fn(GetOpSchema());
diff --git a/onnxruntime/core/graph/contrib_ops/quantization_defs.cc b/onnxruntime/core/graph/contrib_ops/quantization_defs.cc
index 7e9995aff5..34fb5a16ad 100644
--- a/onnxruntime/core/graph/contrib_ops/quantization_defs.cc
+++ b/onnxruntime/core/graph/contrib_ops/quantization_defs.cc
@@ -559,7 +559,7 @@ and produces one output data (Tensor) where the function `f(x) = quantize(alp
.TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput));
const char* QLinearSigmoidDoc_ver1 = R"DOC(
-QLinearSigmoid takes quantized input data (Tensor), and quantize parameter for output, and produces one output data
+QLinearSigmoid takes quantized input data (Tensor), and quantize parameter for output, and produces one output data
(Tensor) where the function `f(x) = quantize(Sigmoid(dequantize(x)))`, is applied to the data tensor elementwise.
Wwhere the function `Sigmoid(x) = 1 / (1 + exp(-x))` )DOC";
@@ -585,6 +585,62 @@ Wwhere the function `Sigmoid(x) = 1 / (1 + exp(-x))` )DOC";
"Constrain input and output types to 8 bit tensors.")
.TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput));
+  // Schema of com.microsoft.QLinearSoftmax (version 1): five inputs (X, X_scale, optional x_zero_point,
+  // y_scale, y_zero_point), one output with the element type and shape of X, an optional "axis"
+  // attribute (default -1) and a required "opset" attribute.
+  ONNX_MS_OPERATOR_SET_SCHEMA(QLinearSoftmax, 1, OpSchema().SetDoc(R"DOC(
+QLinearSoftmax computes the normalized exponential values for the given input:
+Softmax(input, axis) = Exp(input) / ReduceSum(Exp(input), axis=axis, keepdims=1)
+The input does not need to explicitly be a 2D vector. The "axis" attribute
+indicates the dimension along which QLinearSoftmax will be performed for onnx v.13+,
+or the dimension at which the input is coerced to an NxD matrix for onnx v.12-.
+The output tensor has the same shape as the input.
+)DOC")
+      .Attr("axis", "apply softmax to elements for dimensions axis, "
+                    "or all dims along with axis according to op-version", AttributeProto::INT, static_cast<int64_t>(-1))
+      .Attr("opset", "opset version of corresponding SoftMax.", AttributeProto::INT)
+      .Input(0, "X", "The input tensor", "T")
+      .Input(1, "X_scale", "Scale of quantized input 'X'. It must be a scalar.", "tensor(float)")
+      .Input(2, "x_zero_point",
+             "Zero point tensor for input 'X'. "
+             "It must be a scalar.",
+             "T", OpSchema::Optional)
+      .Input(3, "y_scale", "Scale of quantized output 'Y'. It must be a scalar.", "tensor(float)")
+      .Input(4, "y_zero_point",
+             "Zero point tensor for output 'Y'. "
+             "It must be a scalar.",
+             "T")
+      .Output(0, "Y",
+              "Output data tensor with the softmax values of the input. "
+              "The output tensor has the same shape as the input.",
+              "T")
+      .TypeConstraint("T", {"tensor(uint8)", "tensor(int8)"},
+                      "Constrain input and output types to signed/unsigned int8 tensors.")
+      .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
+        // Type inference
+        propagateElemTypeFromInputToOutput(ctx, 0, 0);
+
+        // Shape inference starts
+        if (!hasNInputShapes(ctx, 1)) {
+          return;
+        }
+
+        // Validate the value of 'axis'
+        const ONNX_NAMESPACE::TensorShapeProto& input_shape =
+            ctx.getInputType(0)->tensor_type().shape();
+        int r = input_shape.dim_size();
+        int axis = static_cast<int>(getAttribute(ctx, "axis", -1));
+        if (axis < -r || axis >= r) {
+          fail_shape_inference(
+              "'axis' must be in [",
+              -r,
+              " , ",
+              (r - 1),
+              "]. Its actual value is: ",
+              axis);
+        }
+
+        // Shape inference
+        propagateShapeFromInputToOutput(ctx, 0, 0);
+      }));
+
ONNX_MS_OPERATOR_SET_SCHEMA(DynamicQuantizeLSTM, 1, OpSchema()
.Attr(
"direction",
diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc
index 0cbd499f45..379f90a4f0 100644
--- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc
+++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc
@@ -4,7 +4,7 @@
#include "core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h"
#include "core/optimizer/qdq_transformer/qdq_util.h"
-
+#include "core/graph/node_attr_utils.h"
namespace onnxruntime {
namespace QDQ {
@@ -195,6 +195,15 @@ UnaryReplaceWithQLinear::UnaryReplaceWithQLinear(std::string domain)
: ReplaceWithQLinear(std::move(domain), UnaryMoves()) {
}
+// Supplies the extra attributes for the replacement node. Only a Softmax target contributes one:
+// "opset", set to the target node's SinceVersion(). Every other target yields an empty set.
+NodeAttributes UnaryReplaceWithQLinear::ExtraAttributes(const RuntimeState& state) const {
+  NodeAttributes extra_attributes;
+  const auto& target_node = state.selected_nodes.Target();
+  if (target_node.OpType() != "Softmax") {
+    return extra_attributes;
+  }
+
+  extra_attributes["opset"] = utils::MakeAttribute(std::string("opset"), int64_t(target_node.SinceVersion()));
+  return extra_attributes;
+}
+
BinaryReplaceWithQLinear::BinaryReplaceWithQLinear(std::string domain)
: ReplaceWithQLinear(std::move(domain), BinaryMoves()) {
}
diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h
index c9f889ede4..b8e371ecc8 100644
--- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h
+++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h
@@ -43,6 +43,9 @@ struct ReplaceWithQLinear : public QDQReplaceWithNew {
struct UnaryReplaceWithQLinear : ReplaceWithQLinear {
UnaryReplaceWithQLinear(std::string domain);
+
+ private:
+  // Adds the "opset" attribute when the replaced target node is a Softmax.
+  NodeAttributes ExtraAttributes(const RuntimeState& state) const override;
};
struct BinaryReplaceWithQLinear : ReplaceWithQLinear {
diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc
index 9e75a0fbad..a2ea67813f 100644
--- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc
+++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc
@@ -82,7 +82,8 @@ void UnaryOpQDQRules(SelectorActionRegistry& qdq_selector_action_registry) {
{{"AveragePool", {}},
{"LeakyRelu", {}},
{"GlobalAveragePool", {}},
- {"Sigmoid", {}}},
+ {"Sigmoid", {}},
+ {"Softmax", {}}},
std::move(selector),
std::move(action));
#else
diff --git a/onnxruntime/python/tools/quantization/onnx_quantizer.py b/onnxruntime/python/tools/quantization/onnx_quantizer.py
index 6ce15593cb..1c1ade6948 100644
--- a/onnxruntime/python/tools/quantization/onnx_quantizer.py
+++ b/onnxruntime/python/tools/quantization/onnx_quantizer.py
@@ -132,6 +132,8 @@ class ONNXQuantizer:
# some output from nodes will be quantized, yet itself should be treat as existing so
# no dequantized will be applied when needed later
self.generated_value_names = self.model.get_non_initializer_inputs()
+ # to store specified scale and zeropoint instead of calculated value, tensor_name->(scale, zeropoint)
+ self.used_scale_zp_map = {}
# routines for subgraph support
def quantize_subgraph(self, subgraph, graph_key):
@@ -625,6 +627,18 @@ class ONNXQuantizer:
self.quantized_value_map[input_name] = QuantizedValue(input_name, output_name, scale_name, zp_name, qType)
return nodes + [qlinear_node]
+    def set_quant_scale_zp(self, tensor_name, value):
+        """Pin the (scale, zero_point) pair to use for `tensor_name` instead of a computed one.
+
+        `value` must be a 2-tuple of (scale, zero_point). A tensor can be pinned only once;
+        both conditions are checked with assertions.
+        """
+        assert isinstance(value, tuple) and len(value) == 2, "value must be scale(float) and zeropoint"
+        assert tensor_name not in self.used_scale_zp_map, f"{tensor_name} has already been set"
+        self.used_scale_zp_map[tensor_name] = value
+
+    def find_quant_scale_zp(self, input_name):
+        """Return the pinned (scale, zero_point) for `input_name`, consulting parent quantizers too.
+
+        Returns (None, None) when neither this quantizer nor any parent has pinned values
+        for the tensor.
+        """
+        if input_name in self.used_scale_zp_map:
+            return self.used_scale_zp_map[input_name]
+        if self.parent is not None:
+            # Ask the parent the same question. find_quantized_value() looks in quantized_value_map,
+            # whose entries are not (scale, zero_point) pairs and cannot be unpacked by callers.
+            return self.parent.find_quant_scale_zp(input_name)
+        return (None, None)
+
def find_quantized_value(self, input_name):
if input_name in self.quantized_value_map:
return self.quantized_value_map[input_name]
diff --git a/onnxruntime/python/tools/quantization/operators/softmax.py b/onnxruntime/python/tools/quantization/operators/softmax.py
new file mode 100644
index 0000000000..8855859236
--- /dev/null
+++ b/onnxruntime/python/tools/quantization/operators/softmax.py
@@ -0,0 +1,86 @@
+import onnx
+
+from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
+from .base_operator import QuantOperatorBase
+from .qdq_base_operator import QDQOperatorBase
+
+
+class QLinearSoftmax(QuantOperatorBase):
+    """Quantizes a Softmax node into a single "QLinear" + op_type node in `ms_domain`."""
+
+    def quantize(self):
+        softmax_node = self.node
+        quantizer = self.quantizer
+
+        # The output of softmax always lies in 0-1, so its scale and zero point are fixed
+        # instead of being calibrated: scale 1/256 with zero point 0 (uint8) or -128 (otherwise).
+        out_scale = 1 / 256.0
+        out_zero_point = 0 if quantizer.input_qType == onnx.onnx_pb.TensorProto.UINT8 else -128
+
+        # Quantization is only attempted when parameters for the output are available.
+        data_found, output_scale_name, output_zp_name, _, _ = quantizer._get_quantization_params(
+            softmax_node.output[0], out_scale, out_zero_point
+        )
+
+        # Names of the quantized input tensors; inputs are quantized here if needed.
+        quantized_input_names, input_zero_point_names, input_scale_names, nodes = quantizer.quantize_inputs(
+            softmax_node, [0]
+        )
+
+        if not data_found or quantized_input_names is None:
+            return super().quantize()
+
+        # Record the quantized counterpart of the output.
+        output_name = softmax_node.output[0]
+        qlinear_output_name = output_name + "_quantized"
+        quantizer.quantized_value_map[output_name] = QuantizedValue(
+            output_name,
+            qlinear_output_name,
+            output_scale_name,
+            output_zp_name,
+            QuantizedValueType.Input,
+        )
+
+        # Carry over the attributes of the original node, then add the domain and the real
+        # opset version (the new node's own SinceVersion would default to 1).
+        kwargs = {}
+        for attr in softmax_node.attribute:
+            kwargs.update(attribute_to_kwarg(attr))
+        kwargs["domain"] = ms_domain
+        kwargs["opset"] = quantizer.opset_version
+
+        if softmax_node.name != "":
+            qlinear_node_name = softmax_node.name + "_quant"
+        else:
+            qlinear_node_name = ""
+
+        qlinear_inputs = [
+            quantized_input_names[0],
+            input_scale_names[0],
+            input_zero_point_names[0],
+            output_scale_name,
+            output_zp_name,
+        ]
+        qnode = onnx.helper.make_node(
+            "QLinear" + softmax_node.op_type,
+            qlinear_inputs,
+            [qlinear_output_name],
+            qlinear_node_name,
+            **kwargs,
+        )
+
+        # Hand every newly created node to the quantizer.
+        nodes.append(qnode)
+        quantizer.new_nodes += nodes
+        return None
+
+
+class QDQSoftmax(QDQOperatorBase):
+    """QDQ handling for Softmax that pins the output's scale and zero point."""
+
+    def quantize(self):
+        super().quantize()
+        # Scale is 1/256 in both cases; the zero point is 0 for uint8 and -128 otherwise.
+        is_uint8 = self.quantizer.input_qType == onnx.onnx_pb.TensorProto.UINT8
+        out_scale = 1 / 256.0
+        out_zero_point = 0 if is_uint8 else -128
+        self.quantizer.set_quant_scale_zp(self.node.output[0], (out_scale, out_zero_point))
diff --git a/onnxruntime/python/tools/quantization/qdq_quantizer.py b/onnxruntime/python/tools/quantization/qdq_quantizer.py
index b397d0728c..8957b7144f 100644
--- a/onnxruntime/python/tools/quantization/qdq_quantizer.py
+++ b/onnxruntime/python/tools/quantization/qdq_quantizer.py
@@ -340,7 +340,10 @@ class QDQQuantizer(ONNXQuantizer):
if initializer:
self._add_qdq_pair_for_weight(initializer, tensor_info.axis)
else:
- data_found, scale_name, zp_name, _, _ = self._get_quantization_params(tensor_name)
+ used_scale, used_zp = self.find_quant_scale_zp(tensor_name)
+ data_found, scale_name, zp_name, _, _ = self._get_quantization_params(
+ tensor_name, used_scale, used_zp
+ )
if not data_found:
raise ValueError(
diff --git a/onnxruntime/python/tools/quantization/registry.py b/onnxruntime/python/tools/quantization/registry.py
index 59f42dc1cc..0227da5e24 100644
--- a/onnxruntime/python/tools/quantization/registry.py
+++ b/onnxruntime/python/tools/quantization/registry.py
@@ -17,6 +17,7 @@ from .operators.pad import QPad
from .operators.pooling import QLinearPool
from .operators.qdq_base_operator import QDQOperatorBase
from .operators.resize import QDQResize, QResize
+from .operators.softmax import QDQSoftmax, QLinearSoftmax
from .operators.split import QDQSplit, QSplit
from .quant_utils import QuantizationMode
@@ -55,6 +56,7 @@ QLinearOpsRegistry = {
"Resize": QResize,
"AveragePool": QLinearPool,
"Concat": QLinearConcat,
+ "Softmax": QLinearSoftmax,
}
QLinearOpsRegistry.update(CommonOpsRegistry)
@@ -73,6 +75,7 @@ QDQRegistry = {
"MatMul": QDQMatMul,
"Split": QDQSplit,
"Gather": QDQGather,
+ "Softmax": QDQSoftmax,
}
diff --git a/onnxruntime/test/contrib_ops/qlinear_lookup_table_test.cc b/onnxruntime/test/contrib_ops/qlinear_lookup_table_test.cc
index 4a8a861b61..cdec3cd5a2 100644
--- a/onnxruntime/test/contrib_ops/qlinear_lookup_table_test.cc
+++ b/onnxruntime/test/contrib_ops/qlinear_lookup_table_test.cc
@@ -115,5 +115,153 @@ TEST(QLinearLookupTableBasedOperatorTests, QLinearSigmoid_UInt8_0_Y_ZP) {
run_test(true);
}
+/*
+\brief The test data below was generated by a PyTorch script.
+\details The reference model is defined as:
+```
+ input(int8/uint8)
+ x = self.dequant(x)
+ x = self.softmax(x)
+ x = self.quant2(x)
+ output(int8/uint8)
+```
+\see the PyTorch quantization [DOC](https://pytorch.org/docs/stable/quantization.html) that the script follows
+*/
+TEST(QLinearLookupTableBasedOperatorTests, QLinearSoftmax_UInt8_v12) {
+ OpTester test("QLinearSoftmax", 1, onnxruntime::kMSDomain);
+ test.AddAttribute("axis", -2);
+ test.AddAttribute("opset", 12);
+ float X_scale = 0.166099221f;
+ //
+ uint8_t X_zero_point = 128;
+ float Y_scale = 1.0f / 256.0f;
+ uint8_t Y_zero_point = 0;
+ //
+
+ std::vector dims = {2, 4, 5};
+ auto x_in = std::vector{50, 67, 58, 68, 46, 69, 77, 91, 62, 74, 67, 72, 71, 70, 83, 88, 75, 54, 74, 88};
+ auto y_out = std::vector { 0, 2, 0, 2, 0, 2, 8, 86, 1, 5, 2, 4, 3, 3, 23, 52, 6, 0, 5, 52 };
+ for (int64_t i = 1; i < dims[0]; i++) {
+ for (int64_t j = 0; j < dims[1] * dims[2]; j++) {
+ x_in.push_back(x_in[j]);
+ y_out.push_back(y_out[j]);
+ }
+ }
+
+ test.AddInput("X", dims, x_in);
+ test.AddInput("X_scale", {}, {X_scale});
+ test.AddInput("X_zero_point", {}, {X_zero_point});
+ test.AddInput("Y_scale", {}, {Y_scale});
+ test.AddInput("Y_zero_point", {}, {Y_zero_point});
+ test.AddOutput("Y", dims, y_out);
+ auto origin_round_mode = std::fegetround();
+ std::fesetround(FE_TONEAREST);
+ test.Run();
+ std::fesetround(origin_round_mode);
+}
+
+TEST(QLinearLookupTableBasedOperatorTests, QLinearSoftmax_UInt8_v13) {
+ OpTester test("QLinearSoftmax", 1, onnxruntime::kMSDomain);
+ test.AddAttribute("axis", -2);
+ test.AddAttribute("opset", 13);
+ float X_scale = 0.0304f;
+ //
+ uint8_t X_zero_point = 128;
+ float Y_scale = 0.0059f;
+ uint8_t Y_zero_point = 0;
+ //
+
+ std::vector dims = {4, 4, 4};
+ auto x_in = std::vector {
+ 62, 50, 71, 37, 68, 88, 64, 51, 59, 95, 41, 54, 55, 20, 77, 32, 92,
+ 63, 43, 13, 76, 82, 53, 43, 60, 18, 73, 74, 22, 89, 44, 106, 17,
+ 95, 27, 35, 47, 57, 0, 78, 97, 66, 56, 28, 127, 33, 106, 71, 119,
+ 64, 16, 0, 16, 79, 27, 89, 110, 126, 88, 90, 67, 11, 4, 90};
+ auto y_out = std::vector {
+ 43, 20, 50, 33, 52, 63, 40, 51, 39, 78,
+ 20, 56, 35, 8, 59, 29, 80, 32, 29, 6, 49, 57, 39, 16, 30, 8, 72, 40,
+ 10, 71, 30, 107, 4, 90, 11, 20, 10, 28, 5, 74, 45, 37, 27, 16, 111, 14,
+ 125, 59, 84, 18, 14, 4, 4, 28, 20, 54, 64, 119, 126, 56, 17, 4, 10, 56};
+
+ test.AddInput("X", dims, x_in);
+ test.AddInput("X_scale", {}, {X_scale});
+ test.AddInput("X_zero_point", {}, {X_zero_point});
+ test.AddInput("Y_scale", {}, {Y_scale});
+ test.AddInput("Y_zero_point", {}, {Y_zero_point});
+ test.AddOutput("Y", dims, y_out);
+ auto origin_round_mode = std::fegetround();
+ std::fesetround(FE_TONEAREST);
+ test.Run();
+ std::fesetround(origin_round_mode);
+}
+
+TEST(QLinearLookupTableBasedOperatorTests, QLinearSoftmax_Int8_v13) {
+ OpTester test("QLinearSoftmax", 1, onnxruntime::kMSDomain);
+ test.AddAttribute("axis", -2);
+ test.AddAttribute("opset", 13);
+ float X_scale = 0.0304F;
+ //
+ int8_t X_zero_point = 0;
+ float Y_scale = 0.0059F;
+ int8_t Y_zero_point = -128;
+ //
+
+ std::vector dims = {4, 4, 4};
+ auto x_in = std::vector {
+ -4, -16, 5, -29, 2, 22, -2, -15, -7, 29, -25, -12, -11, -46, 11, -34, 26,
+ -3, -23, -53, 10, 16, -13, -23, -6, -48, 7, 8, -44, 23, -22, 40, -49, 29, -39, -31, -19, -9,
+ -72, 12, 31, 0, -10, -38, 61, -33, 40, 5, 53, -2, -50, -66, -50, 13, -39, 23, 44, 60, 22, 24,
+ 1, -55, -62, 24};
+ auto y_out = std::vector {
+ -85, -108, -78, -95, -76, -65, -88, -77, -89, -50, -108, -72, -93,
+ -120, -69, -99, -48, -96, -99, -122, -79, -71, -89, -112, -98, -120, -56, -88, -118, -57, -98,
+ -21, -124, -38, -117, -108, -118, -100, -124, -54, -83, -91, -100, -112, -17, -114, -2, -69, -44,
+ -110, -114, -124, -124, -100, -108, -74, -64, -9, -2, -72, -111, -124, -118, -72};
+
+ test.AddInput("X", dims, x_in);
+ test.AddInput("X_scale", {}, {X_scale});
+ test.AddInput("X_zero_point", {}, {X_zero_point});
+ test.AddInput("Y_scale", {}, {Y_scale});
+ test.AddInput("Y_zero_point", {}, {Y_zero_point});
+ test.AddOutput("Y", dims, y_out);
+ auto origin_round_mode = std::fegetround();
+ std::fesetround(FE_TONEAREST);
+ test.Run();
+ std::fesetround(origin_round_mode);
+}
+
+TEST(QLinearLookupTableBasedOperatorTests, QLinearSoftmax_Int8_v12) {
+ OpTester test("QLinearSoftmax", 1, onnxruntime::kMSDomain);
+ test.AddAttribute("axis", -2);
+ test.AddAttribute("opset", 12);
+ float X_scale = 0.166099221f;
+ //
+ int8_t X_zero_point = 0;
+ float Y_scale = 1.0f / 128.0f;
+ int8_t Y_zero_point = 0;
+ //
+
+ std::vector dims = {2, 4, 5};
+ auto x_in = std::vector{-28, -4, -4, -7, 3, -26, 4, -16, 23, 14, -7, 26, -8, 19, -16, -13, 7, 17, 27, 5};
+ auto y_out = std::vector{0, 0, 0, 0, 1, 0, 1, 0, 22, 5, 0, 35, 0, 11, 0, 0, 2, 8, 42, 1};
+ for (int64_t i = 1; i < dims[0]; i++) {
+ for (int64_t j = 0; j < dims[1] * dims[2]; j++) {
+ x_in.push_back(x_in[j]);
+ y_out.push_back(y_out[j]);
+ }
+ }
+
+ test.AddInput("X", dims, x_in);
+ test.AddInput("X_scale", {}, {X_scale});
+ test.AddInput("X_zero_point", {}, {X_zero_point});
+ test.AddInput("Y_scale", {}, {Y_scale});
+ test.AddInput("Y_zero_point", {}, {Y_zero_point});
+ test.AddOutput("Y", dims, y_out);
+ auto origin_round_mode = std::fegetround();
+ std::fesetround(FE_TONEAREST);
+ test.Run();
+ std::fesetround(origin_round_mode);
+}
+
} // namespace test
} // namespace onnxruntime
diff --git a/onnxruntime/test/optimizer/qdq_transformer_test.cc b/onnxruntime/test/optimizer/qdq_transformer_test.cc
index 4131cbe497..56ae5aeafb 100644
--- a/onnxruntime/test/optimizer/qdq_transformer_test.cc
+++ b/onnxruntime/test/optimizer/qdq_transformer_test.cc
@@ -1857,6 +1857,67 @@ TEST(QDQTransformerTests, Concat) {
test_case({{1, 6, 36}, {1, 6, 8}, {1, 6, 2}}, 2, false, false, true);
}
+template
+void QDQTransformerSoftmaxTests() {
+ auto test_case = [&](const std::vector& input_shape, int64_t axis) {
+ auto build_test_case = [&](ModelTestBuilder& builder) {
+ auto* input_arg = builder.MakeInput(input_shape, -5.f, 5.f);
+ auto* output_arg = builder.MakeOutput();
+ // add QDQ + Softmax
+ auto* dq_output = AddQDQNodePair(builder, input_arg, .105f,
+ (std::numeric_limits::max() / 255 * 255) / 2);
+ auto* softmax_output = builder.MakeIntermediate();
+ auto& softmax_node = builder.AddNode("Softmax", {dq_output}, {softmax_output});
+ softmax_node.AddAttribute("axis", axis);
+ // add QDQ output
+ auto* q_output = builder.MakeIntermediate();
+ builder.AddQuantizeLinearNode(softmax_output,
+ 1.0f / (std::numeric_limits::max() + 1),
+ 0,
+ q_output);
+ builder.AddDequantizeLinearNode(q_output,
+ 1.0f / (std::numeric_limits::max() + 1),
+ 0,
+ output_arg);
+ };
+
+ auto check_graph = [&](InferenceSessionWrapper& session) {
+ auto op_to_count = CountOpsInGraph(session.GetGraph());
+ if constexpr (std::is_same::value) {
+ EXPECT_EQ(op_to_count["com.microsoft.QLinearSoftmax"], 1);
+ EXPECT_EQ(op_to_count["Softmax"], 0);
+ EXPECT_EQ(op_to_count["QuantizeLinear"], 1);
+ EXPECT_EQ(op_to_count["DequantizeLinear"], 1);
+ } else {
+ EXPECT_EQ(op_to_count["com.microsoft.QLinearSoftmax"], 0);
+ EXPECT_EQ(op_to_count["Softmax"], 1);
+ EXPECT_EQ(op_to_count["QuantizeLinear"], 2);
+ EXPECT_EQ(op_to_count["DequantizeLinear"], 2);
+ }
+ };
+
+ TransformerTester(build_test_case,
+ check_graph,
+ TransformerLevel::Level1,
+ TransformerLevel::Level2,
+ 12 /*opset_version*/,
+ 0.01 /*per_sample_tolerance*/,
+ 0.01 /*relative_per_sample_tolerance*/,
+ std::make_unique(QDQIsInt8Allowed()));
+ };
+
+ test_case({1, 12, 37}, -1);
+ test_case({1, 23, 13, 13}, -2);
+}
+
+TEST(QDQTransformerTests, Softmax_S8S8) {
+ QDQTransformerSoftmaxTests();
+}
+
+TEST(QDQTransformerTests, Softmax_U8U8) {
+ QDQTransformerSoftmaxTests();
+}
+
#endif // !defined(DISABLE_CONTRIB_OPS)
TEST(QDQTransformerTests, QDQPropagation_QBackward) {
diff --git a/onnxruntime/test/python/quantization/op_test_utils.py b/onnxruntime/test/python/quantization/op_test_utils.py
index efb4c8fbc3..251fd12d06 100644
--- a/onnxruntime/test/python/quantization/op_test_utils.py
+++ b/onnxruntime/test/python/quantization/op_test_utils.py
@@ -74,6 +74,8 @@ def check_model_correctness(testcase, model_path_origin, model_path_to_check, in
model_path_origin, sess_options=sess_options, providers=["CPUExecutionProvider"]
)
origin_results = origin_sess.run([], inputs)
+ # enable QDQ transformers
+ # sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
target_sess = onnxruntime.InferenceSession(
model_path_to_check,
sess_options=sess_options,
diff --git a/onnxruntime/test/python/quantization/test_op_softmax.py b/onnxruntime/test/python/quantization/test_op_softmax.py
new file mode 100644
index 0000000000..add97f9ebc
--- /dev/null
+++ b/onnxruntime/test/python/quantization/test_op_softmax.py
@@ -0,0 +1,180 @@
+#!/usr/bin/env python
+"""
+Softmax quantization test case
+"""
+# coding: utf-8
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See License.txt in the project root for
+# license information.
+# --------------------------------------------------------------------------
+
+import unittest
+
+import numpy as np
+import onnx
+from onnx import TensorProto, helper
+from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_qtype_by_node_type
+
+from onnxruntime.quantization import QuantFormat, QuantType, quantize_static
+
+
+class TestOpSoftmax(unittest.TestCase):
+ """Softmax quantization unit tests.
+ Covers quantization in the QDQ and QOperator formats with u8 and s8 types.
+ """
+
+ def input_feeds(self, n_repeat, name2shape):
+ input_data_list = []
+ for _ in range(n_repeat):
+ inputs = {}
+ for name, shape in name2shape.items():
+ inputs.update({name: np.random.randint(-1, 2, shape).astype(np.float32)})
+ input_data_list.extend([inputs])
+ data_r = TestDataFeeds(input_data_list)
+ return data_r
+
+ def construct_model_conv_softmax(
+ self,
+ output_model_path,
+ conv_input_shape,
+ conv_weight_shape,
+ softmax_input_shape,
+ softmax_attributes,
+ output_shape,
+ ):
+ # (input)
+ # \
+ # Conv
+ # / \
+ # Identity Softmax
+ # / \
+ # (identity_out) (output)
+ input_tensor = helper.make_tensor_value_info("input", TensorProto.FLOAT, conv_input_shape)
+
+ conv_weight_arr = np.random.randint(-1, 2, conv_weight_shape).astype(np.float32)
+ conv_weight_initializer = onnx.numpy_helper.from_array(conv_weight_arr, name="conv1_weight")
+ conv_node = onnx.helper.make_node("Conv", ["input", "conv1_weight"], ["conv_output"], name="conv_node")
+
+ identity_out = helper.make_tensor_value_info("identity_out", TensorProto.FLOAT, softmax_input_shape)
+ identity_node = helper.make_node("Identity", ["conv_output"], ["identity_out"], name="IdentityNode")
+
+ initializers = [conv_weight_initializer]
+
+ output_tensor = helper.make_tensor_value_info("output", TensorProto.FLOAT, output_shape)
+ softmax_node = helper.make_node(
+ "Softmax", ["conv_output"], ["output"], name="softmax_node", **softmax_attributes
+ )
+
+ graph = helper.make_graph(
+ [conv_node, identity_node, softmax_node],
+ "TestOpQuantizersoftmax_test_model",
+ [input_tensor],
+ [identity_out, output_tensor],
+ initializer=initializers,
+ )
+ model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
+ model.ir_version = 7 # use stable onnx ir version
+ onnx.save(model, output_model_path)
+
+ def quantize_softmax_test(self, activation_type, weight_type, extra_options={}):
+ np.random.seed(1)
+ model_fp32_path = "softmax_fp32.onnx"
+ self.construct_model_conv_softmax(
+ model_fp32_path,
+ [1, 2, 26, 42],
+ [3, 2, 3, 3],
+ [1, 3, 24, 40],
+ {"axis": -2},
+ [1, 3, 24, 40],
+ )
+ data_reader = self.input_feeds(1, {"input": [1, 2, 26, 42]})
+
+ activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8
+ activation_type_str = "u8" if (activation_type == QuantType.QUInt8) else "s8"
+ weight_type_str = "u8" if (weight_type == QuantType.QUInt8) else "s8"
+ model_q8_path = f"softmax_{activation_type_str}{weight_type_str}.onnx"
+ model_q8_qdq_path = f"softmax_qdq_{activation_type_str}{weight_type_str}.onnx"
+
+ # Verify QOperator mode
+ data_reader.rewind()
+ quantize_static(
+ model_fp32_path,
+ model_q8_path,
+ data_reader,
+ quant_format=QuantFormat.QOperator,
+ activation_type=activation_type,
+ weight_type=weight_type,
+ extra_options=extra_options,
+ )
+ qnode_counts = {
+ "QLinearConv": 1,
+ "QuantizeLinear": 1,
+ "DequantizeLinear": 2,
+ "QLinearSoftmax": 1,
+ "Softmax": 0,
+ }
+ check_op_type_count(self, model_q8_path, **qnode_counts)
+ qnode_io_qtypes = {
+ "QuantizeLinear": [
+ ["i", 2, activation_proto_qtype],
+ ["o", 0, activation_proto_qtype],
+ ]
+ }
+ qnode_io_qtypes.update(
+ {
+ "QLinearConv": [
+ ["i", 2, activation_proto_qtype],
+ ["i", 7, activation_proto_qtype],
+ ["o", 0, activation_proto_qtype],
+ ]
+ }
+ )
+ qnode_io_qtypes.update(
+ {"QLinearSoftmax": [["i", 4, activation_proto_qtype]]}
+ ) # shape info not working on custom ops
+ check_qtype_by_node_type(self, model_q8_path, qnode_io_qtypes)
+ data_reader.rewind()
+ check_model_correctness(self, model_fp32_path, model_q8_path, data_reader.get_next())
+
+ # Verify QDQ mode
+ data_reader.rewind()
+ quantize_static(
+ model_fp32_path,
+ model_q8_qdq_path,
+ data_reader,
+ quant_format=QuantFormat.QDQ,
+ activation_type=activation_type,
+ weight_type=weight_type,
+ extra_options=extra_options,
+ )
+ qdqnode_counts = {
+ "Conv": 1,
+ "QuantizeLinear": 3,
+ "DequantizeLinear": 4,
+ "Softmax": 1,
+ }
+ check_op_type_count(self, model_q8_qdq_path, **qdqnode_counts)
+ qnode_io_qtypes = {
+ "QuantizeLinear": [
+ ["i", 2, activation_proto_qtype],
+ ["o", 0, activation_proto_qtype],
+ ]
+ }
+ check_qtype_by_node_type(self, model_q8_qdq_path, qnode_io_qtypes)
+ data_reader.rewind()
+ check_model_correctness(self, model_fp32_path, model_q8_qdq_path, data_reader.get_next())
+
+ def test_quantize_softmax(self):
+ self.quantize_softmax_test(QuantType.QUInt8, QuantType.QUInt8)
+
+ def test_quantize_softmax_s8s8(self):
+ self.quantize_softmax_test(
+ QuantType.QInt8,
+ QuantType.QInt8,
+ extra_options={"ActivationSymmetric": True},
+ )
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/onnxruntime/test/testdata/kernel_def_hashes/contrib.cpu.json b/onnxruntime/test/testdata/kernel_def_hashes/contrib.cpu.json
index 0d624bf0a5..5fb55faa14 100644
--- a/onnxruntime/test/testdata/kernel_def_hashes/contrib.cpu.json
+++ b/onnxruntime/test/testdata/kernel_def_hashes/contrib.cpu.json
@@ -298,5 +298,9 @@
[
"QGemm com.microsoft CPUExecutionProvider",
13737193491843065240
+ ],
+ [
+ "QLinearSoftmax com.microsoft CPUExecutionProvider",
+ 10339195975968977840
]
-]
\ No newline at end of file
+]