fix crash in dequantizelinear/quantizelinear for optional zero point (#4047)

fix the issue #4032 and #3802 in OnnxRuntime side. For the quantizeLinear, there also needs a fix in ONNX type inference. Will do that in ONNX repo.
2026-07-12 17:57:38 +00:00 · 2020-05-27 17:11:55 -07:00 · 2020-05-27 17:11:55 -07:00 · 23c313cb73
commit 23c313cb73
parent 6665d5e2bc
4 changed files with 91 additions and 93 deletions
--- a/onnxruntime/core/providers/cpu/tensor/quantize_linear.cc
+++ b/onnxruntime/core/providers/cpu/tensor/quantize_linear.cc
@ -28,7 +28,8 @@ template <typename T>
 Status DequantizeLinear<T>::Compute(OpKernelContext* ctx) const {
  auto& x = *ctx->Input<Tensor>(0);
  auto& x_scale = *ctx->Input<Tensor>(1);
-  auto& x_zero_point = *ctx->Input<Tensor>(2);
+  auto* x_zero_point = ctx->Input<Tensor>(2);
+
  const auto& x_shape = x.Shape();
  auto& y = *ctx->Output(0, x_shape);

@ -36,15 +37,19 @@ Status DequantizeLinear<T>::Compute(OpKernelContext* ctx) const {
  int64_t broadcast_dim;
  int64_t block_size;

-  if (has_axis_) {
+  if (has_axis_) {  // custom DequantizeLinear only
    const int64_t axis = HandleNegativeAxis(axis_, x_shape.NumDimensions());
    N = x_shape.SizeToDimension(axis);
    broadcast_dim = x_shape[axis];
    block_size = x_shape.SizeFromDimension(axis + 1);

    // if an axis was specified, ensure the scale and zero point are compatible
-    ORT_ENFORCE(x_scale.Shape().NumDimensions() == 1 && x_scale.Shape().Size() == broadcast_dim, "x_scale must be 1D tensor with size ", broadcast_dim);
-    ORT_ENFORCE(x_zero_point.Shape().NumDimensions() == 1 && x_zero_point.Shape().Size() == broadcast_dim, "x_zero_point must be 1D tensor with size ", broadcast_dim);
+    ORT_ENFORCE(x_scale.Shape().NumDimensions() == 1 && x_scale.Shape().Size() == broadcast_dim,
+                "x_scale must be 1D tensor with size ",
+                broadcast_dim);
+    ORT_ENFORCE(x_zero_point != nullptr && x_zero_point->Shape().NumDimensions() == 1 && x_zero_point->Shape().Size() == broadcast_dim,
+                "x_zero_point must be 1D tensor with size ",
+                broadcast_dim);
  } else {
    N = 1;
    broadcast_dim = 1;
@ -52,17 +57,17 @@ Status DequantizeLinear<T>::Compute(OpKernelContext* ctx) const {

    // if no axis, enforce that scale and zero point are scalars
    ORT_ENFORCE(IsScalarOr1ElementVector(&x_scale), "x_scale must be a scalar or 1D tensor or size 1.");
-    ORT_ENFORCE(IsScalarOr1ElementVector(&x_zero_point), "x_zero_point must be a scalar or 1D tensor or size 1.");
+    ORT_ENFORCE(x_zero_point == nullptr || IsScalarOr1ElementVector(x_zero_point), "x_zero_point must be a scalar or 1D tensor or size 1.");
  }

-  const T* zero_point = x_zero_point.template Data<T>();
+  const T* zero_point = x_zero_point ? x_zero_point->template Data<T>() : nullptr;
  const float* scale = x_scale.template Data<float>();
  const T* input = x.template Data<T>();
  float* output = y.template MutableData<float>();

  for (size_t n = 0; n < static_cast<size_t>(N); n++) {
    for (size_t bd = 0; bd < static_cast<size_t>(broadcast_dim); bd++) {
-      auto zp = static_cast<int32_t>(zero_point[bd]);
+      auto zp = zero_point ? static_cast<int32_t>(zero_point[bd]) : 0;
      auto sc = scale[bd];

      for (size_t bs = 0; bs < static_cast<size_t>(block_size); bs++) {
--- a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc
+++ b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc
@ -13,26 +13,23 @@ template <class T, class U>
 Status QuantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {
  typedef typename ToCudaType<U>::MappedType CudaU;

-  auto x = ctx->Input<Tensor>(0);
-  auto y_scale = ctx->Input<Tensor>(1);
-  auto y_zero_point = ctx->Input<Tensor>(2);
-  ORT_ENFORCE(x != nullptr &&
-              y_scale != nullptr &&
-              y_zero_point != nullptr);
-  auto y = ctx->Output(0, x->Shape());
-  ORT_ENFORCE(y != nullptr);
+  auto& x = *ctx->Input<Tensor>(0);
+  auto& y_scale = *ctx->Input<Tensor>(1);
+  auto* y_zero_point = ctx->Input<Tensor>(2);

-  const auto& x_shape = x->Shape();
+  auto& y = *ctx->Output(0, x.Shape());

-  const CudaU* input = reinterpret_cast<const CudaU*>(x->template Data<U>());
-  T* output = y->template MutableData<T>();
+  const auto& x_shape = x.Shape();
+
+  const CudaU* input = reinterpret_cast<const CudaU*>(x.template Data<U>());
+  T* output = y.template MutableData<T>();

  // TO DO: support per-channel
-  ORT_ENFORCE(IsScalarOr1ElementVector(y_scale), "y_scale must be a scalar or 1D tensor of size 1.");
-  ORT_ENFORCE(IsScalarOr1ElementVector(y_zero_point), "y_zero_point must be a scalar or 1D tensor of size 1.");
+  ORT_ENFORCE(IsScalarOr1ElementVector(&y_scale), "y_scale must be a scalar or 1D tensor of size 1.");
+  ORT_ENFORCE(y_zero_point == nullptr || IsScalarOr1ElementVector(y_zero_point), "y_zero_point must be a scalar or 1D tensor of size 1.");

-  const T* zero_point = y_zero_point->template Data<T>();
-  const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale->template Data<U>());
+  const T* zero_point = y_zero_point != nullptr ? y_zero_point->template Data<T>() : nullptr;
+  const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale.template Data<U>());
  const auto num_of_elements = x_shape.Size();

  CudaQuantizeLinear(input, output, scale, zero_point, num_of_elements);
@ -44,26 +41,22 @@ template <class T, class U>
 Status DequantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {
  typedef typename ToCudaType<U>::MappedType CudaU;

-  auto x = ctx->Input<Tensor>(0);
-  auto y_scale = ctx->Input<Tensor>(1);
-  auto y_zero_point = ctx->Input<Tensor>(2);
-  ORT_ENFORCE(x != nullptr &&
-              y_scale != nullptr &&
-              y_zero_point != nullptr);
+  auto& x = *ctx->Input<Tensor>(0);
+  auto& y_scale = *ctx->Input<Tensor>(1);
+  auto* y_zero_point = ctx->Input<Tensor>(2);

-  const auto& x_shape = x->Shape();
+  const auto& x_shape = x.Shape();

-  auto y = ctx->Output(0, x_shape);
-  ORT_ENFORCE(y != nullptr);
+  auto& y = *ctx->Output(0, x_shape);

-  const T* input = x->template Data<T>();
-  CudaU* output = reinterpret_cast<CudaU*>(y->template MutableData<U>());
+  const T* input = x.template Data<T>();
+  CudaU* output = reinterpret_cast<CudaU*>(y.template MutableData<U>());

-  ORT_ENFORCE(IsScalarOr1ElementVector(y_scale), "y_scale must be a scalar or 1D tensor of size 1.");
-  ORT_ENFORCE(IsScalarOr1ElementVector(y_zero_point), "y_zero_point must be a scalar or 1D tensor of size 1.");
+  ORT_ENFORCE(IsScalarOr1ElementVector(&y_scale), "y_scale must be a scalar or 1D tensor of size 1.");
+  ORT_ENFORCE(y_zero_point == nullptr || IsScalarOr1ElementVector(y_zero_point), "y_zero_point must be a scalar or 1D tensor of size 1.");

-  const T* zero_point = y_zero_point->template Data<T>();
-  const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale->template Data<U>());
+  const T* zero_point = y_zero_point != nullptr ? y_zero_point->template Data<T>() : nullptr;
+  const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale.template Data<U>());
  const auto num_of_elements = x_shape.Size();

  CudaDequantizeLinear(input, output, scale, zero_point, num_of_elements);
--- a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cu
+++ b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cu
@ -3,102 +3,84 @@

 #include "quantize_linear.cuh"

+#include <limits>
+
 #include "core/providers/cuda/cu_inc/common.cuh"

 namespace onnxruntime {
 namespace cuda {

-template <int NumThreadsPerBlock, int NumElementsPerThread>
-__global__ void QuantizeLinearKernel(const half* input, int8_t* output, const half* scale, const int8_t* zero_point, CUDA_LONG N) {
+template <typename T>
+struct Round;
+
+template <>
+struct Round<float> {
+  __device__ __forceinline__ int operator()(float v) const {
+    return __float2int_rn(v);
+  }
+};
+
+template <>
+struct Round<half> {
+  __device__ __forceinline__ int operator()(half v) const {
+    return __half2int_rn(v);
+  }
+};
+
+template <int NumThreadsPerBlock, int NumElementsPerThread, typename OutT, typename InT>
+__global__ void QuantizeLinearKernel(const InT* input, OutT* output, const InT* scale_ptr, const OutT* zero_point_ptr, CUDA_LONG N, Round<InT> round) {
  CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;

+  InT scale = *scale_ptr;
+  OutT zero_point = zero_point_ptr != nullptr ? *zero_point_ptr : 0;
 #pragma unroll
  for (int i = 0; i < NumElementsPerThread; i++) {
    if (id < N) {
-      int value = __half2int_rn(input[id] / (*scale)) + *zero_point;
-      output[id] = static_cast<int8_t>(max(-128, min(127, value)));
+      int value = round(input[id] / scale) + zero_point;
+      output[id] = static_cast<OutT>(max(std::numeric_limits<OutT>::min(), min(std::numeric_limits<OutT>::max(), value)));
      id += NumThreadsPerBlock;
    }
  }
 }

-template <int NumThreadsPerBlock, int NumElementsPerThread>
-__global__ void QuantizeLinearKernel(const float* input, int8_t* output, const float* scale, const int8_t* zero_point, CUDA_LONG N) {
-  CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
-
-#pragma unroll
-  for (int i = 0; i < NumElementsPerThread; i++) {
-    if (id < N) {
-      int value = __float2int_rn(input[id] / (*scale)) + *zero_point;
-      output[id] = static_cast<int8_t>(max(-128, min(127, value)));
-      id += NumThreadsPerBlock;
-    }
-  }
-}
-
-template <int NumThreadsPerBlock, int NumElementsPerThread>
-__global__ void QuantizeLinearKernel(const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, CUDA_LONG N) {
-  CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
-
-#pragma unroll
-  for (int i = 0; i < NumElementsPerThread; i++) {
-    if (id < N) {
-      int value = __float2int_rn(input[id] / (*scale)) + *zero_point;
-      output[id] = static_cast<uint8_t>(max(0, min(255, value)));
-      id += NumThreadsPerBlock;
-    }
-  }
-}
-
-template <int NumThreadsPerBlock, int NumElementsPerThread>
-__global__ void QuantizeLinearKernel(const half* input, uint8_t* output, const half* scale, const uint8_t* zero_point, CUDA_LONG N) {
-  CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
-
-#pragma unroll
-  for (int i = 0; i < NumElementsPerThread; i++) {
-    if (id < N) {
-      int value = __half2int_rn(input[id] / (*scale)) + *zero_point;
-      output[id] = static_cast<uint8_t>(max(0, min(255, value)));
-      id += NumThreadsPerBlock;
-    }
-  }
-}
-
-template <class T, class U>
-Status CudaQuantizeLinear(const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element) {
+template <class OutT, class InT>
+Status CudaQuantizeLinear(const InT* input, OutT* output, const InT* scale, const OutT* zero_point, size_t num_of_element) {
  if (num_of_element <= 0)
    return Status::OK();

  int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
-  QuantizeLinearKernel<GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
+  QuantizeLinearKernel<GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread, OutT, InT><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
      input,
      output,
      scale,
      zero_point,
-      num_of_element);
+      num_of_element,
+      Round<InT>());
  return Status::OK();
 }

-template <class T, class U, int NumThreadsPerBlock, int NumElementsPerThread>
-__global__ void DequantizeLinearKernel(const T* input, U* output, const U* scale, const T* zero_point, CUDA_LONG N) {
+template <class InT, class OutT, int NumThreadsPerBlock, int NumElementsPerThread>
+__global__ void DequantizeLinearKernel(const InT* input, OutT* output, const OutT* scale_ptr, const InT* zero_point_ptr, CUDA_LONG N) {
  CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;

+  OutT scale = *scale_ptr;
+  InT zero_point = zero_point_ptr != nullptr ? *zero_point_ptr : 0;
 #pragma unroll
  for (int i = 0; i < NumElementsPerThread; i++) {
    if (id < N) {
-      output[id] = static_cast<U>((input[id] - *zero_point)) * (*scale);
+      output[id] = static_cast<OutT>(input[id] - zero_point) * scale;
      id += NumThreadsPerBlock;
    }
  }
 }

-template <class T, class U>
-Status CudaDequantizeLinear(const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element) {
+template <class InT, class OutT>
+Status CudaDequantizeLinear(const InT* input, OutT* output, const OutT* scale, const InT* zero_point, size_t num_of_element) {
  if (num_of_element <= 0)
    return Status::OK();

  int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
-  DequantizeLinearKernel<T, U, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
+  DequantizeLinearKernel<InT, OutT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
      input,
      output,
      scale,
--- a/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc
@ -55,6 +55,15 @@ TEST(DequantizeLinearOpTest, DequantizeLinear_Scalar) {
  test.Run();
 }

+// dequantize without zero point
+TEST(DequantizeLinearOpTest, DequantizeLinear_Without_Zero_Point) {
+  OpTester test("DequantizeLinear", 10);
+  test.AddInput<int8_t>("x", {}, {100});
+  test.AddInput<float>("x_scale", {}, {2.0f});
+  test.AddOutput<float>("y", {}, {200.0f});
+  test.Run();
+}
+
 // quantize with scalar zero point and scale
 TEST(QuantizeLinearOpTest, QuantizeLinear_uint8) {
  OpTester test("QuantizeLinear", 10);
@ -126,5 +135,14 @@ TEST(QuantizeLinearOpTest, QuantizeLinear_Scalar) {
  test.Run();
 }

+// quantize with scalar data
+TEST(QuantizeLinearOpTest, DISABLED_QuantizeLinear_With_Zero_Point) {
+  OpTester test("QuantizeLinear", 10);
+  test.AddInput<float>("x", {}, {3});
+  test.AddInput<float>("y_scale", {}, {2.0f});
+  test.AddOutput<uint8_t>("y", {}, {2});
+  test.Run();
+}
+
 }  // namespace test
 }  // namespace onnxruntime