From 23c313cb73e838f6c35052a7fd05ca8131dbfbbd Mon Sep 17 00:00:00 2001 From: Yufeng Li Date: Wed, 27 May 2020 17:11:55 -0700 Subject: [PATCH] fix crash in dequantizelinear/quantizelinear for optional zero point (#4047) fix the issue #4032 and #3802 in OnnxRuntime side. For the quantizeLinear, there also needs a fix in ONNX type inference. Will do that in ONNX repo. --- .../providers/cpu/tensor/quantize_linear.cc | 19 ++-- .../providers/cuda/tensor/quantize_linear.cc | 53 +++++------ .../providers/cuda/tensor/quantize_linear.cu | 94 ++++++++----------- .../cpu/tensor/quantize_linear_test.cc | 18 ++++ 4 files changed, 91 insertions(+), 93 deletions(-) diff --git a/onnxruntime/core/providers/cpu/tensor/quantize_linear.cc b/onnxruntime/core/providers/cpu/tensor/quantize_linear.cc index 3ba3ba6837..1474213563 100644 --- a/onnxruntime/core/providers/cpu/tensor/quantize_linear.cc +++ b/onnxruntime/core/providers/cpu/tensor/quantize_linear.cc @@ -28,7 +28,8 @@ template Status DequantizeLinear::Compute(OpKernelContext* ctx) const { auto& x = *ctx->Input(0); auto& x_scale = *ctx->Input(1); - auto& x_zero_point = *ctx->Input(2); + auto* x_zero_point = ctx->Input(2); + const auto& x_shape = x.Shape(); auto& y = *ctx->Output(0, x_shape); @@ -36,15 +37,19 @@ Status DequantizeLinear::Compute(OpKernelContext* ctx) const { int64_t broadcast_dim; int64_t block_size; - if (has_axis_) { + if (has_axis_) { // custom DequantizeLinear only const int64_t axis = HandleNegativeAxis(axis_, x_shape.NumDimensions()); N = x_shape.SizeToDimension(axis); broadcast_dim = x_shape[axis]; block_size = x_shape.SizeFromDimension(axis + 1); // if an axis was specified, ensure the scale and zero point are compatible - ORT_ENFORCE(x_scale.Shape().NumDimensions() == 1 && x_scale.Shape().Size() == broadcast_dim, "x_scale must be 1D tensor with size ", broadcast_dim); - ORT_ENFORCE(x_zero_point.Shape().NumDimensions() == 1 && x_zero_point.Shape().Size() == broadcast_dim, "x_zero_point must be 1D tensor with size ", broadcast_dim); + ORT_ENFORCE(x_scale.Shape().NumDimensions() == 1 && x_scale.Shape().Size() == broadcast_dim, + "x_scale must be 1D tensor with size ", + broadcast_dim); + ORT_ENFORCE(x_zero_point != nullptr && x_zero_point->Shape().NumDimensions() == 1 && x_zero_point->Shape().Size() == broadcast_dim, + "x_zero_point must be 1D tensor with size ", + broadcast_dim); } else { N = 1; broadcast_dim = 1; @@ -52,17 +57,17 @@ Status DequantizeLinear::Compute(OpKernelContext* ctx) const { // if no axis, enforce that scale and zero point are scalars ORT_ENFORCE(IsScalarOr1ElementVector(&x_scale), "x_scale must be a scalar or 1D tensor or size 1."); - ORT_ENFORCE(IsScalarOr1ElementVector(&x_zero_point), "x_zero_point must be a scalar or 1D tensor or size 1."); + ORT_ENFORCE(x_zero_point == nullptr || IsScalarOr1ElementVector(x_zero_point), "x_zero_point must be a scalar or 1D tensor or size 1."); } - const T* zero_point = x_zero_point.template Data(); + const T* zero_point = x_zero_point ? x_zero_point->template Data() : nullptr; const float* scale = x_scale.template Data(); const T* input = x.template Data(); float* output = y.template MutableData(); for (size_t n = 0; n < static_cast(N); n++) { for (size_t bd = 0; bd < static_cast(broadcast_dim); bd++) { - auto zp = static_cast(zero_point[bd]); + auto zp = zero_point ? static_cast(zero_point[bd]) : 0; auto sc = scale[bd]; for (size_t bs = 0; bs < static_cast(block_size); bs++) { diff --git a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc index 6cc9b6e57d..3f321fadb0 100644 --- a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc +++ b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc @@ -13,26 +13,23 @@ template Status QuantizeLinear::ComputeInternal(OpKernelContext* ctx) const { typedef typename ToCudaType::MappedType CudaU; - auto x = ctx->Input(0); - auto y_scale = ctx->Input(1); - auto y_zero_point = ctx->Input(2); - ORT_ENFORCE(x != nullptr && - y_scale != nullptr && - y_zero_point != nullptr); - auto y = ctx->Output(0, x->Shape()); - ORT_ENFORCE(y != nullptr); + auto& x = *ctx->Input(0); + auto& y_scale = *ctx->Input(1); + auto* y_zero_point = ctx->Input(2); - const auto& x_shape = x->Shape(); + auto& y = *ctx->Output(0, x.Shape()); - const CudaU* input = reinterpret_cast(x->template Data()); - T* output = y->template MutableData(); + const auto& x_shape = x.Shape(); + + const CudaU* input = reinterpret_cast(x.template Data()); + T* output = y.template MutableData(); // TO DO: support per-channel - ORT_ENFORCE(IsScalarOr1ElementVector(y_scale), "y_scale must be a scalar or 1D tensor of size 1."); - ORT_ENFORCE(IsScalarOr1ElementVector(y_zero_point), "y_zero_point must be a scalar or 1D tensor of size 1."); + ORT_ENFORCE(IsScalarOr1ElementVector(&y_scale), "y_scale must be a scalar or 1D tensor of size 1."); + ORT_ENFORCE(y_zero_point == nullptr || IsScalarOr1ElementVector(y_zero_point), "y_zero_point must be a scalar or 1D tensor of size 1."); - const T* zero_point = y_zero_point->template Data(); - const CudaU* scale = reinterpret_cast(y_scale->template Data()); + const T* zero_point = y_zero_point != nullptr ? y_zero_point->template Data() : nullptr; + const CudaU* scale = reinterpret_cast(y_scale.template Data()); const auto num_of_elements = x_shape.Size(); CudaQuantizeLinear(input, output, scale, zero_point, num_of_elements); @@ -44,26 +41,22 @@ template Status DequantizeLinear::ComputeInternal(OpKernelContext* ctx) const { typedef typename ToCudaType::MappedType CudaU; - auto x = ctx->Input(0); - auto y_scale = ctx->Input(1); - auto y_zero_point = ctx->Input(2); - ORT_ENFORCE(x != nullptr && - y_scale != nullptr && - y_zero_point != nullptr); + auto& x = *ctx->Input(0); + auto& y_scale = *ctx->Input(1); + auto* y_zero_point = ctx->Input(2); - const auto& x_shape = x->Shape(); + const auto& x_shape = x.Shape(); - auto y = ctx->Output(0, x_shape); - ORT_ENFORCE(y != nullptr); + auto& y = *ctx->Output(0, x_shape); - const T* input = x->template Data(); - CudaU* output = reinterpret_cast(y->template MutableData()); + const T* input = x.template Data(); + CudaU* output = reinterpret_cast(y.template MutableData()); - ORT_ENFORCE(IsScalarOr1ElementVector(y_scale), "y_scale must be a scalar or 1D tensor of size 1."); - ORT_ENFORCE(IsScalarOr1ElementVector(y_zero_point), "y_zero_point must be a scalar or 1D tensor of size 1."); + ORT_ENFORCE(IsScalarOr1ElementVector(&y_scale), "y_scale must be a scalar or 1D tensor of size 1."); + ORT_ENFORCE(y_zero_point == nullptr || IsScalarOr1ElementVector(y_zero_point), "y_zero_point must be a scalar or 1D tensor of size 1."); - const T* zero_point = y_zero_point->template Data(); - const CudaU* scale = reinterpret_cast(y_scale->template Data()); + const T* zero_point = y_zero_point != nullptr ? y_zero_point->template Data() : nullptr; + const CudaU* scale = reinterpret_cast(y_scale.template Data()); const auto num_of_elements = x_shape.Size(); CudaDequantizeLinear(input, output, scale, zero_point, num_of_elements); diff --git a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cu b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cu index b5f97f8e99..341f1ffc31 100644 --- a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cu +++ b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cu @@ -3,102 +3,84 @@ #include "quantize_linear.cuh" +#include + #include "core/providers/cuda/cu_inc/common.cuh" namespace onnxruntime { namespace cuda { -template -__global__ void QuantizeLinearKernel(const half* input, int8_t* output, const half* scale, const int8_t* zero_point, CUDA_LONG N) { +template +struct Round; + +template <> +struct Round { + __device__ __forceinline__ int operator()(float v) const { + return __float2int_rn(v); + } +}; + +template <> +struct Round { + __device__ __forceinline__ int operator()(half v) const { + return __half2int_rn(v); + } +}; + +template +__global__ void QuantizeLinearKernel(const InT* input, OutT* output, const InT* scale_ptr, const OutT* zero_point_ptr, CUDA_LONG N, Round round) { CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; + InT scale = *scale_ptr; + OutT zero_point = zero_point_ptr != nullptr ? *zero_point_ptr : 0; #pragma unroll for (int i = 0; i < NumElementsPerThread; i++) { if (id < N) { - int value = __half2int_rn(input[id] / (*scale)) + *zero_point; - output[id] = static_cast(max(-128, min(127, value))); + int value = round(input[id] / scale) + zero_point; + output[id] = static_cast(max(std::numeric_limits::min(), min(std::numeric_limits::max(), value))); id += NumThreadsPerBlock; } } } -template -__global__ void QuantizeLinearKernel(const float* input, int8_t* output, const float* scale, const int8_t* zero_point, CUDA_LONG N) { - CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; - -#pragma unroll - for (int i = 0; i < NumElementsPerThread; i++) { - if (id < N) { - int value = __float2int_rn(input[id] / (*scale)) + *zero_point; - output[id] = static_cast(max(-128, min(127, value))); - id += NumThreadsPerBlock; - } - } -} - -template -__global__ void QuantizeLinearKernel(const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, CUDA_LONG N) { - CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; - -#pragma unroll - for (int i = 0; i < NumElementsPerThread; i++) { - if (id < N) { - int value = __float2int_rn(input[id] / (*scale)) + *zero_point; - output[id] = static_cast(max(0, min(255, value))); - id += NumThreadsPerBlock; - } - } -} - -template -__global__ void QuantizeLinearKernel(const half* input, uint8_t* output, const half* scale, const uint8_t* zero_point, CUDA_LONG N) { - CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; - -#pragma unroll - for (int i = 0; i < NumElementsPerThread; i++) { - if (id < N) { - int value = __half2int_rn(input[id] / (*scale)) + *zero_point; - output[id] = static_cast(max(0, min(255, value))); - id += NumThreadsPerBlock; - } - } -} - -template -Status CudaQuantizeLinear(const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element) { +template +Status CudaQuantizeLinear(const InT* input, OutT* output, const InT* scale, const OutT* zero_point, size_t num_of_element) { if (num_of_element <= 0) return Status::OK(); int blocksPerGrid = static_cast(CeilDiv(num_of_element, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); - QuantizeLinearKernel<<>>( + QuantizeLinearKernel<<>>( input, output, scale, zero_point, - num_of_element); + num_of_element, + Round()); return Status::OK(); } -template -__global__ void DequantizeLinearKernel(const T* input, U* output, const U* scale, const T* zero_point, CUDA_LONG N) { +template +__global__ void DequantizeLinearKernel(const InT* input, OutT* output, const OutT* scale_ptr, const InT* zero_point_ptr, CUDA_LONG N) { CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; + OutT scale = *scale_ptr; + InT zero_point = zero_point_ptr != nullptr ? *zero_point_ptr : 0; #pragma unroll for (int i = 0; i < NumElementsPerThread; i++) { if (id < N) { - output[id] = static_cast((input[id] - *zero_point)) * (*scale); + output[id] = static_cast(input[id] - zero_point) * scale; id += NumThreadsPerBlock; } } } -template -Status CudaDequantizeLinear(const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element) { +template +Status CudaDequantizeLinear(const InT* input, OutT* output, const OutT* scale, const InT* zero_point, size_t num_of_element) { if (num_of_element <= 0) return Status::OK(); int blocksPerGrid = static_cast(CeilDiv(num_of_element, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); - DequantizeLinearKernel<<>>( + DequantizeLinearKernel<<>>( input, output, scale, diff --git a/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc b/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc index db4554aeb6..739d0e9181 100644 --- a/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc @@ -55,6 +55,15 @@ TEST(DequantizeLinearOpTest, DequantizeLinear_Scalar) { test.Run(); } +// dequantize without zero point +TEST(DequantizeLinearOpTest, DequantizeLinear_Without_Zero_Point) { + OpTester test("DequantizeLinear", 10); + test.AddInput("x", {}, {100}); + test.AddInput("x_scale", {}, {2.0f}); + test.AddOutput("y", {}, {200.0f}); + test.Run(); +} + // quantize with scalar zero point and scale TEST(QuantizeLinearOpTest, QuantizeLinear_uint8) { OpTester test("QuantizeLinear", 10); @@ -126,5 +135,14 @@ TEST(QuantizeLinearOpTest, QuantizeLinear_Scalar) { test.Run(); } +// quantize with scalar data +TEST(QuantizeLinearOpTest, DISABLED_QuantizeLinear_With_Zero_Point) { + OpTester test("QuantizeLinear", 10); + test.AddInput("x", {}, {3}); + test.AddInput("y_scale", {}, {2.0f}); + test.AddOutput("y", {}, {2}); + test.Run(); +} + } // namespace test } // namespace onnxruntime