From 23c313cb73e838f6c35052a7fd05ca8131dbfbbd Mon Sep 17 00:00:00 2001
From: Yufeng Li <liyufeng1987@gmail.com>
Date: Wed, 27 May 2020 17:11:55 -0700
Subject: [PATCH] fix crash in dequantizelinear/quantizelinear for optional
 zero point (#4047)

fix the issue #4032 and #3802 in OnnxRuntime side. For the quantizeLinear, there also needs a fix in ONNX type inference. Will do that in ONNX repo.
---
 .../providers/cpu/tensor/quantize_linear.cc   | 19 ++--
 .../providers/cuda/tensor/quantize_linear.cc  | 53 +++++------
 .../providers/cuda/tensor/quantize_linear.cu  | 94 ++++++++-----------
 .../cpu/tensor/quantize_linear_test.cc        | 18 ++++
 4 files changed, 91 insertions(+), 93 deletions(-)
diff --git a/onnxruntime/core/providers/cpu/tensor/quantize_linear.cc b/onnxruntime/core/providers/cpu/tensor/quantize_linear.cc
index 3ba3ba6837..1474213563 100644
--- a/onnxruntime/core/providers/cpu/tensor/quantize_linear.cc
+++ b/onnxruntime/core/providers/cpu/tensor/quantize_linear.cc
@@ -28,7 +28,8 @@ template <typename T>
 Status DequantizeLinear<T>::Compute(OpKernelContext* ctx) const {
   auto& x = *ctx->Input<Tensor>(0);
   auto& x_scale = *ctx->Input<Tensor>(1);
-  auto& x_zero_point = *ctx->Input<Tensor>(2);
+  auto* x_zero_point = ctx->Input<Tensor>(2);
+
   const auto& x_shape = x.Shape();
   auto& y = *ctx->Output(0, x_shape);
 
@@ -36,15 +37,19 @@ Status DequantizeLinear<T>::Compute(OpKernelContext* ctx) const {
   int64_t broadcast_dim;
   int64_t block_size;
 
-  if (has_axis_) {
+  if (has_axis_) {  // custom DequantizeLinear only
     const int64_t axis = HandleNegativeAxis(axis_, x_shape.NumDimensions());
     N = x_shape.SizeToDimension(axis);
     broadcast_dim = x_shape[axis];
     block_size = x_shape.SizeFromDimension(axis + 1);
 
     // if an axis was specified, ensure the scale and zero point are compatible
-    ORT_ENFORCE(x_scale.Shape().NumDimensions() == 1 && x_scale.Shape().Size() == broadcast_dim, "x_scale must be 1D tensor with size ", broadcast_dim);
-    ORT_ENFORCE(x_zero_point.Shape().NumDimensions() == 1 && x_zero_point.Shape().Size() == broadcast_dim, "x_zero_point must be 1D tensor with size ", broadcast_dim);
+    ORT_ENFORCE(x_scale.Shape().NumDimensions() == 1 && x_scale.Shape().Size() == broadcast_dim,
+                "x_scale must be 1D tensor with size ",
+                broadcast_dim);
+    ORT_ENFORCE(x_zero_point != nullptr && x_zero_point->Shape().NumDimensions() == 1 && x_zero_point->Shape().Size() == broadcast_dim,
+                "x_zero_point must be 1D tensor with size ",
+                broadcast_dim);
   } else {
     N = 1;
     broadcast_dim = 1;
@@ -52,17 +57,17 @@ Status DequantizeLinear<T>::Compute(OpKernelContext* ctx) const {
 
     // if no axis, enforce that scale and zero point are scalars
     ORT_ENFORCE(IsScalarOr1ElementVector(&x_scale), "x_scale must be a scalar or 1D tensor or size 1.");
-    ORT_ENFORCE(IsScalarOr1ElementVector(&x_zero_point), "x_zero_point must be a scalar or 1D tensor or size 1.");
+    ORT_ENFORCE(x_zero_point == nullptr || IsScalarOr1ElementVector(x_zero_point), "x_zero_point must be a scalar or 1D tensor or size 1.");
   }
 
-  const T* zero_point = x_zero_point.template Data<T>();
+  const T* zero_point = x_zero_point ? x_zero_point->template Data<T>() : nullptr;
   const float* scale = x_scale.template Data<float>();
   const T* input = x.template Data<T>();
   float* output = y.template MutableData<float>();
 
   for (size_t n = 0; n < static_cast<size_t>(N); n++) {
     for (size_t bd = 0; bd < static_cast<size_t>(broadcast_dim); bd++) {
-      auto zp = static_cast<int32_t>(zero_point[bd]);
+      auto zp = zero_point ? static_cast<int32_t>(zero_point[bd]) : 0;
       auto sc = scale[bd];
 
       for (size_t bs = 0; bs < static_cast<size_t>(block_size); bs++) {
diff --git a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc
index 6cc9b6e57d..3f321fadb0 100644
--- a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc
+++ b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc
@@ -13,26 +13,23 @@ template <class T, class U>
 Status QuantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {
   typedef typename ToCudaType<U>::MappedType CudaU;
 
-  auto x = ctx->Input<Tensor>(0);
-  auto y_scale = ctx->Input<Tensor>(1);
-  auto y_zero_point = ctx->Input<Tensor>(2);
-  ORT_ENFORCE(x != nullptr &&
-              y_scale != nullptr &&
-              y_zero_point != nullptr);
-  auto y = ctx->Output(0, x->Shape());
-  ORT_ENFORCE(y != nullptr);
+  auto& x = *ctx->Input<Tensor>(0);
+  auto& y_scale = *ctx->Input<Tensor>(1);
+  auto* y_zero_point = ctx->Input<Tensor>(2);
 
-  const auto& x_shape = x->Shape();
+  auto& y = *ctx->Output(0, x.Shape());
 
-  const CudaU* input = reinterpret_cast<const CudaU*>(x->template Data<U>());
-  T* output = y->template MutableData<T>();
+  const auto& x_shape = x.Shape();
+
+  const CudaU* input = reinterpret_cast<const CudaU*>(x.template Data<U>());
+  T* output = y.template MutableData<T>();
 
   // TO DO: support per-channel
-  ORT_ENFORCE(IsScalarOr1ElementVector(y_scale), "y_scale must be a scalar or 1D tensor of size 1.");
-  ORT_ENFORCE(IsScalarOr1ElementVector(y_zero_point), "y_zero_point must be a scalar or 1D tensor of size 1.");
+  ORT_ENFORCE(IsScalarOr1ElementVector(&y_scale), "y_scale must be a scalar or 1D tensor of size 1.");
+  ORT_ENFORCE(y_zero_point == nullptr || IsScalarOr1ElementVector(y_zero_point), "y_zero_point must be a scalar or 1D tensor of size 1.");
 
-  const T* zero_point = y_zero_point->template Data<T>();
-  const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale->template Data<U>());
+  const T* zero_point = y_zero_point != nullptr ? y_zero_point->template Data<T>() : nullptr;
+  const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale.template Data<U>());
   const auto num_of_elements = x_shape.Size();
 
   CudaQuantizeLinear(input, output, scale, zero_point, num_of_elements);
@@ -44,26 +41,22 @@ template <class T, class U>
 Status DequantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {
   typedef typename ToCudaType<U>::MappedType CudaU;
 
-  auto x = ctx->Input<Tensor>(0);
-  auto y_scale = ctx->Input<Tensor>(1);
-  auto y_zero_point = ctx->Input<Tensor>(2);
-  ORT_ENFORCE(x != nullptr &&
-              y_scale != nullptr &&
-              y_zero_point != nullptr);
+  auto& x = *ctx->Input<Tensor>(0);
+  auto& y_scale = *ctx->Input<Tensor>(1);
+  auto* y_zero_point = ctx->Input<Tensor>(2);
 
-  const auto& x_shape = x->Shape();
+  const auto& x_shape = x.Shape();
 
-  auto y = ctx->Output(0, x_shape);
-  ORT_ENFORCE(y != nullptr);
+  auto& y = *ctx->Output(0, x_shape);
 
-  const T* input = x->template Data<T>();
-  CudaU* output = reinterpret_cast<CudaU*>(y->template MutableData<U>());
+  const T* input = x.template Data<T>();
+  CudaU* output = reinterpret_cast<CudaU*>(y.template MutableData<U>());
 
-  ORT_ENFORCE(IsScalarOr1ElementVector(y_scale), "y_scale must be a scalar or 1D tensor of size 1.");
-  ORT_ENFORCE(IsScalarOr1ElementVector(y_zero_point), "y_zero_point must be a scalar or 1D tensor of size 1.");
+  ORT_ENFORCE(IsScalarOr1ElementVector(&y_scale), "y_scale must be a scalar or 1D tensor of size 1.");
+  ORT_ENFORCE(y_zero_point == nullptr || IsScalarOr1ElementVector(y_zero_point), "y_zero_point must be a scalar or 1D tensor of size 1.");
 
-  const T* zero_point = y_zero_point->template Data<T>();
-  const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale->template Data<U>());
+  const T* zero_point = y_zero_point != nullptr ? y_zero_point->template Data<T>() : nullptr;
+  const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale.template Data<U>());
   const auto num_of_elements = x_shape.Size();
 
   CudaDequantizeLinear(input, output, scale, zero_point, num_of_elements);
diff --git a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cu b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cu
index b5f97f8e99..341f1ffc31 100644
--- a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cu
+++ b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cu
@@ -3,102 +3,84 @@
 
 #include "quantize_linear.cuh"
 
+#include <limits>
+
 #include "core/providers/cuda/cu_inc/common.cuh"
 
 namespace onnxruntime {
 namespace cuda {
 
-template <int NumThreadsPerBlock, int NumElementsPerThread>
-__global__ void QuantizeLinearKernel(const half* input, int8_t* output, const half* scale, const int8_t* zero_point, CUDA_LONG N) {
+template <typename T>
+struct Round;
+
+template <>
+struct Round<float> {
+  __device__ __forceinline__ int operator()(float v) const {
+    return __float2int_rn(v);
+  }
+};
+
+template <>
+struct Round<half> {
+  __device__ __forceinline__ int operator()(half v) const {
+    return __half2int_rn(v);
+  }
+};
+
+template <int NumThreadsPerBlock, int NumElementsPerThread, typename OutT, typename InT>
+__global__ void QuantizeLinearKernel(const InT* input, OutT* output, const InT* scale_ptr, const OutT* zero_point_ptr, CUDA_LONG N, Round<InT> round) {
   CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
 
+  InT scale = *scale_ptr;
+  OutT zero_point = zero_point_ptr != nullptr ? *zero_point_ptr : 0;
 #pragma unroll
   for (int i = 0; i < NumElementsPerThread; i++) {
     if (id < N) {
-      int value = __half2int_rn(input[id] / (*scale)) + *zero_point;
-      output[id] = static_cast<int8_t>(max(-128, min(127, value)));
+      int value = round(input[id] / scale) + zero_point;
+      output[id] = static_cast<OutT>(max(std::numeric_limits<OutT>::min(), min(std::numeric_limits<OutT>::max(), value)));
       id += NumThreadsPerBlock;
     }
   }
 }
 
-template <int NumThreadsPerBlock, int NumElementsPerThread>
-__global__ void QuantizeLinearKernel(const float* input, int8_t* output, const float* scale, const int8_t* zero_point, CUDA_LONG N) {
-  CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
-
-#pragma unroll
-  for (int i = 0; i < NumElementsPerThread; i++) {
-    if (id < N) {
-      int value = __float2int_rn(input[id] / (*scale)) + *zero_point;
-      output[id] = static_cast<int8_t>(max(-128, min(127, value)));
-      id += NumThreadsPerBlock;
-    }
-  }
-}
-
-template <int NumThreadsPerBlock, int NumElementsPerThread>
-__global__ void QuantizeLinearKernel(const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, CUDA_LONG N) {
-  CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
-
-#pragma unroll
-  for (int i = 0; i < NumElementsPerThread; i++) {
-    if (id < N) {
-      int value = __float2int_rn(input[id] / (*scale)) + *zero_point;
-      output[id] = static_cast<uint8_t>(max(0, min(255, value)));
-      id += NumThreadsPerBlock;
-    }
-  }
-}
-
-template <int NumThreadsPerBlock, int NumElementsPerThread>
-__global__ void QuantizeLinearKernel(const half* input, uint8_t* output, const half* scale, const uint8_t* zero_point, CUDA_LONG N) {
-  CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
-
-#pragma unroll
-  for (int i = 0; i < NumElementsPerThread; i++) {
-    if (id < N) {
-      int value = __half2int_rn(input[id] / (*scale)) + *zero_point;
-      output[id] = static_cast<uint8_t>(max(0, min(255, value)));
-      id += NumThreadsPerBlock;
-    }
-  }
-}
-
-template <class T, class U>
-Status CudaQuantizeLinear(const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element) {
+template <class OutT, class InT>
+Status CudaQuantizeLinear(const InT* input, OutT* output, const InT* scale, const OutT* zero_point, size_t num_of_element) {
   if (num_of_element <= 0)
     return Status::OK();
 
   int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
-  QuantizeLinearKernel<GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
+  QuantizeLinearKernel<GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread, OutT, InT><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
       input,
       output,
       scale,
       zero_point,
-      num_of_element);
+      num_of_element,
+      Round<InT>());
   return Status::OK();
 }
 
-template <class T, class U, int NumThreadsPerBlock, int NumElementsPerThread>
-__global__ void DequantizeLinearKernel(const T* input, U* output, const U* scale, const T* zero_point, CUDA_LONG N) {
+template <class InT, class OutT, int NumThreadsPerBlock, int NumElementsPerThread>
+__global__ void DequantizeLinearKernel(const InT* input, OutT* output, const OutT* scale_ptr, const InT* zero_point_ptr, CUDA_LONG N) {
   CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
 
+  OutT scale = *scale_ptr;
+  InT zero_point = zero_point_ptr != nullptr ? *zero_point_ptr : 0;
 #pragma unroll
   for (int i = 0; i < NumElementsPerThread; i++) {
     if (id < N) {
-      output[id] = static_cast<U>((input[id] - *zero_point)) * (*scale);
+      output[id] = static_cast<OutT>(input[id] - zero_point) * scale;
       id += NumThreadsPerBlock;
     }
   }
 }
 
-template <class T, class U>
-Status CudaDequantizeLinear(const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element) {
+template <class InT, class OutT>
+Status CudaDequantizeLinear(const InT* input, OutT* output, const OutT* scale, const InT* zero_point, size_t num_of_element) {
   if (num_of_element <= 0)
     return Status::OK();
 
   int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
-  DequantizeLinearKernel<T, U, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
+  DequantizeLinearKernel<InT, OutT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
       input,
       output,
       scale,
diff --git a/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc b/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc
index db4554aeb6..739d0e9181 100644
--- a/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc
@@ -55,6 +55,15 @@ TEST(DequantizeLinearOpTest, DequantizeLinear_Scalar) {
   test.Run();
 }
 
+// dequantize without zero point
+TEST(DequantizeLinearOpTest, DequantizeLinear_Without_Zero_Point) {
+  OpTester test("DequantizeLinear", 10);
+  test.AddInput<int8_t>("x", {}, {100});
+  test.AddInput<float>("x_scale", {}, {2.0f});
+  test.AddOutput<float>("y", {}, {200.0f});
+  test.Run();
+}
+
 // quantize with scalar zero point and scale
 TEST(QuantizeLinearOpTest, QuantizeLinear_uint8) {
   OpTester test("QuantizeLinear", 10);
@@ -126,5 +135,14 @@ TEST(QuantizeLinearOpTest, QuantizeLinear_Scalar) {
   test.Run();
 }
 
+// quantize with scalar data
+TEST(QuantizeLinearOpTest, DISABLED_QuantizeLinear_With_Zero_Point) {
+  OpTester test("QuantizeLinear", 10);
+  test.AddInput<float>("x", {}, {3});
+  test.AddInput<float>("y_scale", {}, {2.0f});
+  test.AddOutput<uint8_t>("y", {}, {2});
+  test.Run();
+}
+
 }  // namespace test
 }  // namespace onnxruntime