mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-27 22:45:57 +00:00
fix crash in dequantizelinear/quantizelinear for optional zero point (#4047)
fix the issue #4032 and #3802 in OnnxRuntime side. For the quantizeLinear, there also needs a fix in ONNX type inference. Will do that in ONNX repo.
This commit is contained in:
parent
6665d5e2bc
commit
23c313cb73
4 changed files with 91 additions and 93 deletions
|
|
@ -28,7 +28,8 @@ template <typename T>
|
|||
Status DequantizeLinear<T>::Compute(OpKernelContext* ctx) const {
|
||||
auto& x = *ctx->Input<Tensor>(0);
|
||||
auto& x_scale = *ctx->Input<Tensor>(1);
|
||||
auto& x_zero_point = *ctx->Input<Tensor>(2);
|
||||
auto* x_zero_point = ctx->Input<Tensor>(2);
|
||||
|
||||
const auto& x_shape = x.Shape();
|
||||
auto& y = *ctx->Output(0, x_shape);
|
||||
|
||||
|
|
@ -36,15 +37,19 @@ Status DequantizeLinear<T>::Compute(OpKernelContext* ctx) const {
|
|||
int64_t broadcast_dim;
|
||||
int64_t block_size;
|
||||
|
||||
if (has_axis_) {
|
||||
if (has_axis_) { // custom DequantizeLinear only
|
||||
const int64_t axis = HandleNegativeAxis(axis_, x_shape.NumDimensions());
|
||||
N = x_shape.SizeToDimension(axis);
|
||||
broadcast_dim = x_shape[axis];
|
||||
block_size = x_shape.SizeFromDimension(axis + 1);
|
||||
|
||||
// if an axis was specified, ensure the scale and zero point are compatible
|
||||
ORT_ENFORCE(x_scale.Shape().NumDimensions() == 1 && x_scale.Shape().Size() == broadcast_dim, "x_scale must be 1D tensor with size ", broadcast_dim);
|
||||
ORT_ENFORCE(x_zero_point.Shape().NumDimensions() == 1 && x_zero_point.Shape().Size() == broadcast_dim, "x_zero_point must be 1D tensor with size ", broadcast_dim);
|
||||
ORT_ENFORCE(x_scale.Shape().NumDimensions() == 1 && x_scale.Shape().Size() == broadcast_dim,
|
||||
"x_scale must be 1D tensor with size ",
|
||||
broadcast_dim);
|
||||
ORT_ENFORCE(x_zero_point != nullptr && x_zero_point->Shape().NumDimensions() == 1 && x_zero_point->Shape().Size() == broadcast_dim,
|
||||
"x_zero_point must be 1D tensor with size ",
|
||||
broadcast_dim);
|
||||
} else {
|
||||
N = 1;
|
||||
broadcast_dim = 1;
|
||||
|
|
@ -52,17 +57,17 @@ Status DequantizeLinear<T>::Compute(OpKernelContext* ctx) const {
|
|||
|
||||
// if no axis, enforce that scale and zero point are scalars
|
||||
ORT_ENFORCE(IsScalarOr1ElementVector(&x_scale), "x_scale must be a scalar or 1D tensor or size 1.");
|
||||
ORT_ENFORCE(IsScalarOr1ElementVector(&x_zero_point), "x_zero_point must be a scalar or 1D tensor or size 1.");
|
||||
ORT_ENFORCE(x_zero_point == nullptr || IsScalarOr1ElementVector(x_zero_point), "x_zero_point must be a scalar or 1D tensor or size 1.");
|
||||
}
|
||||
|
||||
const T* zero_point = x_zero_point.template Data<T>();
|
||||
const T* zero_point = x_zero_point ? x_zero_point->template Data<T>() : nullptr;
|
||||
const float* scale = x_scale.template Data<float>();
|
||||
const T* input = x.template Data<T>();
|
||||
float* output = y.template MutableData<float>();
|
||||
|
||||
for (size_t n = 0; n < static_cast<size_t>(N); n++) {
|
||||
for (size_t bd = 0; bd < static_cast<size_t>(broadcast_dim); bd++) {
|
||||
auto zp = static_cast<int32_t>(zero_point[bd]);
|
||||
auto zp = zero_point ? static_cast<int32_t>(zero_point[bd]) : 0;
|
||||
auto sc = scale[bd];
|
||||
|
||||
for (size_t bs = 0; bs < static_cast<size_t>(block_size); bs++) {
|
||||
|
|
|
|||
|
|
@ -13,26 +13,23 @@ template <class T, class U>
|
|||
Status QuantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {
|
||||
typedef typename ToCudaType<U>::MappedType CudaU;
|
||||
|
||||
auto x = ctx->Input<Tensor>(0);
|
||||
auto y_scale = ctx->Input<Tensor>(1);
|
||||
auto y_zero_point = ctx->Input<Tensor>(2);
|
||||
ORT_ENFORCE(x != nullptr &&
|
||||
y_scale != nullptr &&
|
||||
y_zero_point != nullptr);
|
||||
auto y = ctx->Output(0, x->Shape());
|
||||
ORT_ENFORCE(y != nullptr);
|
||||
auto& x = *ctx->Input<Tensor>(0);
|
||||
auto& y_scale = *ctx->Input<Tensor>(1);
|
||||
auto* y_zero_point = ctx->Input<Tensor>(2);
|
||||
|
||||
const auto& x_shape = x->Shape();
|
||||
auto& y = *ctx->Output(0, x.Shape());
|
||||
|
||||
const CudaU* input = reinterpret_cast<const CudaU*>(x->template Data<U>());
|
||||
T* output = y->template MutableData<T>();
|
||||
const auto& x_shape = x.Shape();
|
||||
|
||||
const CudaU* input = reinterpret_cast<const CudaU*>(x.template Data<U>());
|
||||
T* output = y.template MutableData<T>();
|
||||
|
||||
// TO DO: support per-channel
|
||||
ORT_ENFORCE(IsScalarOr1ElementVector(y_scale), "y_scale must be a scalar or 1D tensor of size 1.");
|
||||
ORT_ENFORCE(IsScalarOr1ElementVector(y_zero_point), "y_zero_point must be a scalar or 1D tensor of size 1.");
|
||||
ORT_ENFORCE(IsScalarOr1ElementVector(&y_scale), "y_scale must be a scalar or 1D tensor of size 1.");
|
||||
ORT_ENFORCE(y_zero_point == nullptr || IsScalarOr1ElementVector(y_zero_point), "y_zero_point must be a scalar or 1D tensor of size 1.");
|
||||
|
||||
const T* zero_point = y_zero_point->template Data<T>();
|
||||
const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale->template Data<U>());
|
||||
const T* zero_point = y_zero_point != nullptr ? y_zero_point->template Data<T>() : nullptr;
|
||||
const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale.template Data<U>());
|
||||
const auto num_of_elements = x_shape.Size();
|
||||
|
||||
CudaQuantizeLinear(input, output, scale, zero_point, num_of_elements);
|
||||
|
|
@ -44,26 +41,22 @@ template <class T, class U>
|
|||
Status DequantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {
|
||||
typedef typename ToCudaType<U>::MappedType CudaU;
|
||||
|
||||
auto x = ctx->Input<Tensor>(0);
|
||||
auto y_scale = ctx->Input<Tensor>(1);
|
||||
auto y_zero_point = ctx->Input<Tensor>(2);
|
||||
ORT_ENFORCE(x != nullptr &&
|
||||
y_scale != nullptr &&
|
||||
y_zero_point != nullptr);
|
||||
auto& x = *ctx->Input<Tensor>(0);
|
||||
auto& y_scale = *ctx->Input<Tensor>(1);
|
||||
auto* y_zero_point = ctx->Input<Tensor>(2);
|
||||
|
||||
const auto& x_shape = x->Shape();
|
||||
const auto& x_shape = x.Shape();
|
||||
|
||||
auto y = ctx->Output(0, x_shape);
|
||||
ORT_ENFORCE(y != nullptr);
|
||||
auto& y = *ctx->Output(0, x_shape);
|
||||
|
||||
const T* input = x->template Data<T>();
|
||||
CudaU* output = reinterpret_cast<CudaU*>(y->template MutableData<U>());
|
||||
const T* input = x.template Data<T>();
|
||||
CudaU* output = reinterpret_cast<CudaU*>(y.template MutableData<U>());
|
||||
|
||||
ORT_ENFORCE(IsScalarOr1ElementVector(y_scale), "y_scale must be a scalar or 1D tensor of size 1.");
|
||||
ORT_ENFORCE(IsScalarOr1ElementVector(y_zero_point), "y_zero_point must be a scalar or 1D tensor of size 1.");
|
||||
ORT_ENFORCE(IsScalarOr1ElementVector(&y_scale), "y_scale must be a scalar or 1D tensor of size 1.");
|
||||
ORT_ENFORCE(y_zero_point == nullptr || IsScalarOr1ElementVector(y_zero_point), "y_zero_point must be a scalar or 1D tensor of size 1.");
|
||||
|
||||
const T* zero_point = y_zero_point->template Data<T>();
|
||||
const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale->template Data<U>());
|
||||
const T* zero_point = y_zero_point != nullptr ? y_zero_point->template Data<T>() : nullptr;
|
||||
const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale.template Data<U>());
|
||||
const auto num_of_elements = x_shape.Size();
|
||||
|
||||
CudaDequantizeLinear(input, output, scale, zero_point, num_of_elements);
|
||||
|
|
|
|||
|
|
@ -3,102 +3,84 @@
|
|||
|
||||
#include "quantize_linear.cuh"
|
||||
|
||||
#include <limits>
|
||||
|
||||
#include "core/providers/cuda/cu_inc/common.cuh"
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
||||
template <int NumThreadsPerBlock, int NumElementsPerThread>
|
||||
__global__ void QuantizeLinearKernel(const half* input, int8_t* output, const half* scale, const int8_t* zero_point, CUDA_LONG N) {
|
||||
template <typename T>
|
||||
struct Round;
|
||||
|
||||
template <>
|
||||
struct Round<float> {
|
||||
__device__ __forceinline__ int operator()(float v) const {
|
||||
return __float2int_rn(v);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct Round<half> {
|
||||
__device__ __forceinline__ int operator()(half v) const {
|
||||
return __half2int_rn(v);
|
||||
}
|
||||
};
|
||||
|
||||
template <int NumThreadsPerBlock, int NumElementsPerThread, typename OutT, typename InT>
|
||||
__global__ void QuantizeLinearKernel(const InT* input, OutT* output, const InT* scale_ptr, const OutT* zero_point_ptr, CUDA_LONG N, Round<InT> round) {
|
||||
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
|
||||
|
||||
InT scale = *scale_ptr;
|
||||
OutT zero_point = zero_point_ptr != nullptr ? *zero_point_ptr : 0;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < NumElementsPerThread; i++) {
|
||||
if (id < N) {
|
||||
int value = __half2int_rn(input[id] / (*scale)) + *zero_point;
|
||||
output[id] = static_cast<int8_t>(max(-128, min(127, value)));
|
||||
int value = round(input[id] / scale) + zero_point;
|
||||
output[id] = static_cast<OutT>(max(std::numeric_limits<OutT>::min(), min(std::numeric_limits<OutT>::max(), value)));
|
||||
id += NumThreadsPerBlock;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int NumThreadsPerBlock, int NumElementsPerThread>
|
||||
__global__ void QuantizeLinearKernel(const float* input, int8_t* output, const float* scale, const int8_t* zero_point, CUDA_LONG N) {
|
||||
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < NumElementsPerThread; i++) {
|
||||
if (id < N) {
|
||||
int value = __float2int_rn(input[id] / (*scale)) + *zero_point;
|
||||
output[id] = static_cast<int8_t>(max(-128, min(127, value)));
|
||||
id += NumThreadsPerBlock;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int NumThreadsPerBlock, int NumElementsPerThread>
|
||||
__global__ void QuantizeLinearKernel(const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, CUDA_LONG N) {
|
||||
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < NumElementsPerThread; i++) {
|
||||
if (id < N) {
|
||||
int value = __float2int_rn(input[id] / (*scale)) + *zero_point;
|
||||
output[id] = static_cast<uint8_t>(max(0, min(255, value)));
|
||||
id += NumThreadsPerBlock;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int NumThreadsPerBlock, int NumElementsPerThread>
|
||||
__global__ void QuantizeLinearKernel(const half* input, uint8_t* output, const half* scale, const uint8_t* zero_point, CUDA_LONG N) {
|
||||
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < NumElementsPerThread; i++) {
|
||||
if (id < N) {
|
||||
int value = __half2int_rn(input[id] / (*scale)) + *zero_point;
|
||||
output[id] = static_cast<uint8_t>(max(0, min(255, value)));
|
||||
id += NumThreadsPerBlock;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, class U>
|
||||
Status CudaQuantizeLinear(const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element) {
|
||||
template <class OutT, class InT>
|
||||
Status CudaQuantizeLinear(const InT* input, OutT* output, const InT* scale, const OutT* zero_point, size_t num_of_element) {
|
||||
if (num_of_element <= 0)
|
||||
return Status::OK();
|
||||
|
||||
int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
|
||||
QuantizeLinearKernel<GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
|
||||
QuantizeLinearKernel<GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread, OutT, InT><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
|
||||
input,
|
||||
output,
|
||||
scale,
|
||||
zero_point,
|
||||
num_of_element);
|
||||
num_of_element,
|
||||
Round<InT>());
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <class T, class U, int NumThreadsPerBlock, int NumElementsPerThread>
|
||||
__global__ void DequantizeLinearKernel(const T* input, U* output, const U* scale, const T* zero_point, CUDA_LONG N) {
|
||||
template <class InT, class OutT, int NumThreadsPerBlock, int NumElementsPerThread>
|
||||
__global__ void DequantizeLinearKernel(const InT* input, OutT* output, const OutT* scale_ptr, const InT* zero_point_ptr, CUDA_LONG N) {
|
||||
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
|
||||
|
||||
OutT scale = *scale_ptr;
|
||||
InT zero_point = zero_point_ptr != nullptr ? *zero_point_ptr : 0;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < NumElementsPerThread; i++) {
|
||||
if (id < N) {
|
||||
output[id] = static_cast<U>((input[id] - *zero_point)) * (*scale);
|
||||
output[id] = static_cast<OutT>(input[id] - zero_point) * scale;
|
||||
id += NumThreadsPerBlock;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, class U>
|
||||
Status CudaDequantizeLinear(const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element) {
|
||||
template <class InT, class OutT>
|
||||
Status CudaDequantizeLinear(const InT* input, OutT* output, const OutT* scale, const InT* zero_point, size_t num_of_element) {
|
||||
if (num_of_element <= 0)
|
||||
return Status::OK();
|
||||
|
||||
int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
|
||||
DequantizeLinearKernel<T, U, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
|
||||
DequantizeLinearKernel<InT, OutT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
|
||||
input,
|
||||
output,
|
||||
scale,
|
||||
|
|
|
|||
|
|
@ -55,6 +55,15 @@ TEST(DequantizeLinearOpTest, DequantizeLinear_Scalar) {
|
|||
test.Run();
|
||||
}
|
||||
|
||||
// dequantize without zero point
|
||||
TEST(DequantizeLinearOpTest, DequantizeLinear_Without_Zero_Point) {
|
||||
OpTester test("DequantizeLinear", 10);
|
||||
test.AddInput<int8_t>("x", {}, {100});
|
||||
test.AddInput<float>("x_scale", {}, {2.0f});
|
||||
test.AddOutput<float>("y", {}, {200.0f});
|
||||
test.Run();
|
||||
}
|
||||
|
||||
// quantize with scalar zero point and scale
|
||||
TEST(QuantizeLinearOpTest, QuantizeLinear_uint8) {
|
||||
OpTester test("QuantizeLinear", 10);
|
||||
|
|
@ -126,5 +135,14 @@ TEST(QuantizeLinearOpTest, QuantizeLinear_Scalar) {
|
|||
test.Run();
|
||||
}
|
||||
|
||||
// quantize with scalar data
|
||||
TEST(QuantizeLinearOpTest, DISABLED_QuantizeLinear_With_Zero_Point) {
|
||||
OpTester test("QuantizeLinear", 10);
|
||||
test.AddInput<float>("x", {}, {3});
|
||||
test.AddInput<float>("y_scale", {}, {2.0f});
|
||||
test.AddOutput<uint8_t>("y", {}, {2});
|
||||
test.Run();
|
||||
}
|
||||
|
||||
} // namespace test
|
||||
} // namespace onnxruntime
|
||||
|
|
|
|||
Loading…
Reference in a new issue