fix crash in dequantizelinear/quantizelinear for optional zero point (#4047)

fix the issue #4032 and #3802 in OnnxRuntime side. For the quantizeLinear, there also needs a fix in ONNX type inference. Will do that in ONNX repo.
This commit is contained in:
Yufeng Li 2020-05-27 17:11:55 -07:00 committed by GitHub
parent 6665d5e2bc
commit 23c313cb73
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 91 additions and 93 deletions

View file

@ -28,7 +28,8 @@ template <typename T>
Status DequantizeLinear<T>::Compute(OpKernelContext* ctx) const {
auto& x = *ctx->Input<Tensor>(0);
auto& x_scale = *ctx->Input<Tensor>(1);
auto& x_zero_point = *ctx->Input<Tensor>(2);
auto* x_zero_point = ctx->Input<Tensor>(2);
const auto& x_shape = x.Shape();
auto& y = *ctx->Output(0, x_shape);
@ -36,15 +37,19 @@ Status DequantizeLinear<T>::Compute(OpKernelContext* ctx) const {
int64_t broadcast_dim;
int64_t block_size;
if (has_axis_) {
if (has_axis_) { // custom DequantizeLinear only
const int64_t axis = HandleNegativeAxis(axis_, x_shape.NumDimensions());
N = x_shape.SizeToDimension(axis);
broadcast_dim = x_shape[axis];
block_size = x_shape.SizeFromDimension(axis + 1);
// if an axis was specified, ensure the scale and zero point are compatible
ORT_ENFORCE(x_scale.Shape().NumDimensions() == 1 && x_scale.Shape().Size() == broadcast_dim, "x_scale must be 1D tensor with size ", broadcast_dim);
ORT_ENFORCE(x_zero_point.Shape().NumDimensions() == 1 && x_zero_point.Shape().Size() == broadcast_dim, "x_zero_point must be 1D tensor with size ", broadcast_dim);
ORT_ENFORCE(x_scale.Shape().NumDimensions() == 1 && x_scale.Shape().Size() == broadcast_dim,
"x_scale must be 1D tensor with size ",
broadcast_dim);
ORT_ENFORCE(x_zero_point != nullptr && x_zero_point->Shape().NumDimensions() == 1 && x_zero_point->Shape().Size() == broadcast_dim,
"x_zero_point must be 1D tensor with size ",
broadcast_dim);
} else {
N = 1;
broadcast_dim = 1;
@ -52,17 +57,17 @@ Status DequantizeLinear<T>::Compute(OpKernelContext* ctx) const {
// if no axis, enforce that scale and zero point are scalars
ORT_ENFORCE(IsScalarOr1ElementVector(&x_scale), "x_scale must be a scalar or 1D tensor or size 1.");
ORT_ENFORCE(IsScalarOr1ElementVector(&x_zero_point), "x_zero_point must be a scalar or 1D tensor or size 1.");
ORT_ENFORCE(x_zero_point == nullptr || IsScalarOr1ElementVector(x_zero_point), "x_zero_point must be a scalar or 1D tensor or size 1.");
}
const T* zero_point = x_zero_point.template Data<T>();
const T* zero_point = x_zero_point ? x_zero_point->template Data<T>() : nullptr;
const float* scale = x_scale.template Data<float>();
const T* input = x.template Data<T>();
float* output = y.template MutableData<float>();
for (size_t n = 0; n < static_cast<size_t>(N); n++) {
for (size_t bd = 0; bd < static_cast<size_t>(broadcast_dim); bd++) {
auto zp = static_cast<int32_t>(zero_point[bd]);
auto zp = zero_point ? static_cast<int32_t>(zero_point[bd]) : 0;
auto sc = scale[bd];
for (size_t bs = 0; bs < static_cast<size_t>(block_size); bs++) {

View file

@ -13,26 +13,23 @@ template <class T, class U>
Status QuantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {
typedef typename ToCudaType<U>::MappedType CudaU;
auto x = ctx->Input<Tensor>(0);
auto y_scale = ctx->Input<Tensor>(1);
auto y_zero_point = ctx->Input<Tensor>(2);
ORT_ENFORCE(x != nullptr &&
y_scale != nullptr &&
y_zero_point != nullptr);
auto y = ctx->Output(0, x->Shape());
ORT_ENFORCE(y != nullptr);
auto& x = *ctx->Input<Tensor>(0);
auto& y_scale = *ctx->Input<Tensor>(1);
auto* y_zero_point = ctx->Input<Tensor>(2);
const auto& x_shape = x->Shape();
auto& y = *ctx->Output(0, x.Shape());
const CudaU* input = reinterpret_cast<const CudaU*>(x->template Data<U>());
T* output = y->template MutableData<T>();
const auto& x_shape = x.Shape();
const CudaU* input = reinterpret_cast<const CudaU*>(x.template Data<U>());
T* output = y.template MutableData<T>();
// TO DO: support per-channel
ORT_ENFORCE(IsScalarOr1ElementVector(y_scale), "y_scale must be a scalar or 1D tensor of size 1.");
ORT_ENFORCE(IsScalarOr1ElementVector(y_zero_point), "y_zero_point must be a scalar or 1D tensor of size 1.");
ORT_ENFORCE(IsScalarOr1ElementVector(&y_scale), "y_scale must be a scalar or 1D tensor of size 1.");
ORT_ENFORCE(y_zero_point == nullptr || IsScalarOr1ElementVector(y_zero_point), "y_zero_point must be a scalar or 1D tensor of size 1.");
const T* zero_point = y_zero_point->template Data<T>();
const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale->template Data<U>());
const T* zero_point = y_zero_point != nullptr ? y_zero_point->template Data<T>() : nullptr;
const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale.template Data<U>());
const auto num_of_elements = x_shape.Size();
CudaQuantizeLinear(input, output, scale, zero_point, num_of_elements);
@ -44,26 +41,22 @@ template <class T, class U>
Status DequantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {
typedef typename ToCudaType<U>::MappedType CudaU;
auto x = ctx->Input<Tensor>(0);
auto y_scale = ctx->Input<Tensor>(1);
auto y_zero_point = ctx->Input<Tensor>(2);
ORT_ENFORCE(x != nullptr &&
y_scale != nullptr &&
y_zero_point != nullptr);
auto& x = *ctx->Input<Tensor>(0);
auto& y_scale = *ctx->Input<Tensor>(1);
auto* y_zero_point = ctx->Input<Tensor>(2);
const auto& x_shape = x->Shape();
const auto& x_shape = x.Shape();
auto y = ctx->Output(0, x_shape);
ORT_ENFORCE(y != nullptr);
auto& y = *ctx->Output(0, x_shape);
const T* input = x->template Data<T>();
CudaU* output = reinterpret_cast<CudaU*>(y->template MutableData<U>());
const T* input = x.template Data<T>();
CudaU* output = reinterpret_cast<CudaU*>(y.template MutableData<U>());
ORT_ENFORCE(IsScalarOr1ElementVector(y_scale), "y_scale must be a scalar or 1D tensor of size 1.");
ORT_ENFORCE(IsScalarOr1ElementVector(y_zero_point), "y_zero_point must be a scalar or 1D tensor of size 1.");
ORT_ENFORCE(IsScalarOr1ElementVector(&y_scale), "y_scale must be a scalar or 1D tensor of size 1.");
ORT_ENFORCE(y_zero_point == nullptr || IsScalarOr1ElementVector(y_zero_point), "y_zero_point must be a scalar or 1D tensor of size 1.");
const T* zero_point = y_zero_point->template Data<T>();
const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale->template Data<U>());
const T* zero_point = y_zero_point != nullptr ? y_zero_point->template Data<T>() : nullptr;
const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale.template Data<U>());
const auto num_of_elements = x_shape.Size();
CudaDequantizeLinear(input, output, scale, zero_point, num_of_elements);

View file

@ -3,102 +3,84 @@
#include "quantize_linear.cuh"
#include <limits>
#include "core/providers/cuda/cu_inc/common.cuh"
namespace onnxruntime {
namespace cuda {
template <int NumThreadsPerBlock, int NumElementsPerThread>
__global__ void QuantizeLinearKernel(const half* input, int8_t* output, const half* scale, const int8_t* zero_point, CUDA_LONG N) {
template <typename T>
struct Round;
template <>
struct Round<float> {
__device__ __forceinline__ int operator()(float v) const {
return __float2int_rn(v);
}
};
template <>
struct Round<half> {
__device__ __forceinline__ int operator()(half v) const {
return __half2int_rn(v);
}
};
template <int NumThreadsPerBlock, int NumElementsPerThread, typename OutT, typename InT>
__global__ void QuantizeLinearKernel(const InT* input, OutT* output, const InT* scale_ptr, const OutT* zero_point_ptr, CUDA_LONG N, Round<InT> round) {
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
InT scale = *scale_ptr;
OutT zero_point = zero_point_ptr != nullptr ? *zero_point_ptr : 0;
#pragma unroll
for (int i = 0; i < NumElementsPerThread; i++) {
if (id < N) {
int value = __half2int_rn(input[id] / (*scale)) + *zero_point;
output[id] = static_cast<int8_t>(max(-128, min(127, value)));
int value = round(input[id] / scale) + zero_point;
output[id] = static_cast<OutT>(max(std::numeric_limits<OutT>::min(), min(std::numeric_limits<OutT>::max(), value)));
id += NumThreadsPerBlock;
}
}
}
template <int NumThreadsPerBlock, int NumElementsPerThread>
__global__ void QuantizeLinearKernel(const float* input, int8_t* output, const float* scale, const int8_t* zero_point, CUDA_LONG N) {
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
#pragma unroll
for (int i = 0; i < NumElementsPerThread; i++) {
if (id < N) {
int value = __float2int_rn(input[id] / (*scale)) + *zero_point;
output[id] = static_cast<int8_t>(max(-128, min(127, value)));
id += NumThreadsPerBlock;
}
}
}
template <int NumThreadsPerBlock, int NumElementsPerThread>
__global__ void QuantizeLinearKernel(const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, CUDA_LONG N) {
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
#pragma unroll
for (int i = 0; i < NumElementsPerThread; i++) {
if (id < N) {
int value = __float2int_rn(input[id] / (*scale)) + *zero_point;
output[id] = static_cast<uint8_t>(max(0, min(255, value)));
id += NumThreadsPerBlock;
}
}
}
template <int NumThreadsPerBlock, int NumElementsPerThread>
__global__ void QuantizeLinearKernel(const half* input, uint8_t* output, const half* scale, const uint8_t* zero_point, CUDA_LONG N) {
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
#pragma unroll
for (int i = 0; i < NumElementsPerThread; i++) {
if (id < N) {
int value = __half2int_rn(input[id] / (*scale)) + *zero_point;
output[id] = static_cast<uint8_t>(max(0, min(255, value)));
id += NumThreadsPerBlock;
}
}
}
template <class T, class U>
Status CudaQuantizeLinear(const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element) {
template <class OutT, class InT>
Status CudaQuantizeLinear(const InT* input, OutT* output, const InT* scale, const OutT* zero_point, size_t num_of_element) {
if (num_of_element <= 0)
return Status::OK();
int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
QuantizeLinearKernel<GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
QuantizeLinearKernel<GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread, OutT, InT><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
input,
output,
scale,
zero_point,
num_of_element);
num_of_element,
Round<InT>());
return Status::OK();
}
template <class T, class U, int NumThreadsPerBlock, int NumElementsPerThread>
__global__ void DequantizeLinearKernel(const T* input, U* output, const U* scale, const T* zero_point, CUDA_LONG N) {
template <class InT, class OutT, int NumThreadsPerBlock, int NumElementsPerThread>
__global__ void DequantizeLinearKernel(const InT* input, OutT* output, const OutT* scale_ptr, const InT* zero_point_ptr, CUDA_LONG N) {
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
OutT scale = *scale_ptr;
InT zero_point = zero_point_ptr != nullptr ? *zero_point_ptr : 0;
#pragma unroll
for (int i = 0; i < NumElementsPerThread; i++) {
if (id < N) {
output[id] = static_cast<U>((input[id] - *zero_point)) * (*scale);
output[id] = static_cast<OutT>(input[id] - zero_point) * scale;
id += NumThreadsPerBlock;
}
}
}
template <class T, class U>
Status CudaDequantizeLinear(const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element) {
template <class InT, class OutT>
Status CudaDequantizeLinear(const InT* input, OutT* output, const OutT* scale, const InT* zero_point, size_t num_of_element) {
if (num_of_element <= 0)
return Status::OK();
int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
DequantizeLinearKernel<T, U, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
DequantizeLinearKernel<InT, OutT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
input,
output,
scale,

View file

@ -55,6 +55,15 @@ TEST(DequantizeLinearOpTest, DequantizeLinear_Scalar) {
test.Run();
}
// dequantize without zero point
TEST(DequantizeLinearOpTest, DequantizeLinear_Without_Zero_Point) {
OpTester test("DequantizeLinear", 10);
test.AddInput<int8_t>("x", {}, {100});
test.AddInput<float>("x_scale", {}, {2.0f});
test.AddOutput<float>("y", {}, {200.0f});
test.Run();
}
// quantize with scalar zero point and scale
TEST(QuantizeLinearOpTest, QuantizeLinear_uint8) {
OpTester test("QuantizeLinear", 10);
@ -126,5 +135,14 @@ TEST(QuantizeLinearOpTest, QuantizeLinear_Scalar) {
test.Run();
}
// quantize with scalar data
TEST(QuantizeLinearOpTest, DISABLED_QuantizeLinear_With_Zero_Point) {
OpTester test("QuantizeLinear", 10);
test.AddInput<float>("x", {}, {3});
test.AddInput<float>("y_scale", {}, {2.0f});
test.AddOutput<uint8_t>("y", {}, {2});
test.Run();
}
} // namespace test
} // namespace onnxruntime