mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-24 22:17:32 +00:00
Implement QuantizeLinear and DequantizeLinear (#3098)
* Implement QuantizeLinear and DequantizeLinear
This commit is contained in:
parent
83753bcbe3
commit
fbb658e603
5 changed files with 261 additions and 0 deletions
|
|
@ -700,6 +700,10 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
|
|||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, Round);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, Round);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, Round);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, int8_t, QuantizeLinear);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, uint8_t, QuantizeLinear);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, int8_t, DequantizeLinear);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, uint8_t, DequantizeLinear);
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, CumSum);
|
||||
|
||||
static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
|
||||
|
|
@ -1167,6 +1171,10 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
|
|||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, Round)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, Round)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, Round)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, int8_t, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, uint8_t, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, int8_t, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, uint8_t, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, CumSum)>,
|
||||
};
|
||||
|
||||
|
|
|
|||
108
onnxruntime/core/providers/cuda/tensor/quantize_linear.cc
Normal file
108
onnxruntime/core/providers/cuda/tensor/quantize_linear.cc
Normal file
|
|
@ -0,0 +1,108 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include "quantize_linear.h"
|
||||
#include "quantize_linear.cuh"
|
||||
|
||||
#include "core/providers/common.h"
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
||||
ONNX_OPERATOR_TYPED_KERNEL_EX(QuantizeLinear,
|
||||
kOnnxDomain,
|
||||
10,
|
||||
uint8_t,
|
||||
kCudaExecutionProvider,
|
||||
KernelDefBuilder()
|
||||
.TypeConstraint("T1", DataTypeImpl::GetTensorType<float>())
|
||||
.TypeConstraint("T2", DataTypeImpl::GetTensorType<uint8_t>()),
|
||||
QuantizeLinear<uint8_t>);
|
||||
|
||||
ONNX_OPERATOR_TYPED_KERNEL_EX(QuantizeLinear,
|
||||
kOnnxDomain,
|
||||
10,
|
||||
int8_t,
|
||||
kCudaExecutionProvider,
|
||||
KernelDefBuilder()
|
||||
.TypeConstraint("T1", DataTypeImpl::GetTensorType<float>())
|
||||
.TypeConstraint("T2", DataTypeImpl::GetTensorType<int8_t>()),
|
||||
QuantizeLinear<int8_t>);
|
||||
|
||||
template <class T>
|
||||
Status QuantizeLinear<T>::ComputeInternal(OpKernelContext* ctx) const {
|
||||
auto x = ctx->Input<Tensor>(0);
|
||||
auto y_scale = ctx->Input<Tensor>(1);
|
||||
auto y_zero_point = ctx->Input<Tensor>(2);
|
||||
ORT_ENFORCE(x != nullptr &&
|
||||
y_scale != nullptr &&
|
||||
y_zero_point != nullptr);
|
||||
auto y = ctx->Output(0, x->Shape());
|
||||
ORT_ENFORCE(y != nullptr);
|
||||
|
||||
const auto& x_shape = x->Shape();
|
||||
|
||||
const float* input = x->template Data<float>();
|
||||
T* output = y->template MutableData<T>();
|
||||
|
||||
ORT_ENFORCE(IsScalarOr1ElementVector(y_scale), "x_scale must be a scalar or 1D tensor of size 1.");
|
||||
ORT_ENFORCE(IsScalarOr1ElementVector(y_zero_point), "x_zero_point must be a scalar or 1D tensor of size 1.");
|
||||
|
||||
const T* zero_point = y_zero_point->template Data<T>();
|
||||
const float* scale = y_scale->template Data<float>();
|
||||
const auto num_of_elements = x_shape.Size();
|
||||
|
||||
CudaQuantizeLinear(input, output, scale, zero_point, num_of_elements);
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
ONNX_OPERATOR_TYPED_KERNEL_EX(DequantizeLinear,
|
||||
kOnnxDomain,
|
||||
10,
|
||||
uint8_t,
|
||||
kCudaExecutionProvider,
|
||||
KernelDefBuilder()
|
||||
.TypeConstraint("T", DataTypeImpl::GetTensorType<uint8_t>()),
|
||||
DequantizeLinear<uint8_t>);
|
||||
|
||||
ONNX_OPERATOR_TYPED_KERNEL_EX(DequantizeLinear,
|
||||
kOnnxDomain,
|
||||
10,
|
||||
int8_t,
|
||||
kCudaExecutionProvider,
|
||||
KernelDefBuilder()
|
||||
.TypeConstraint("T", DataTypeImpl::GetTensorType<int8_t>()),
|
||||
DequantizeLinear<int8_t>);
|
||||
|
||||
template <class T>
|
||||
Status DequantizeLinear<T>::ComputeInternal(OpKernelContext* ctx) const {
|
||||
auto x = ctx->Input<Tensor>(0);
|
||||
auto y_scale = ctx->Input<Tensor>(1);
|
||||
auto y_zero_point = ctx->Input<Tensor>(2);
|
||||
ORT_ENFORCE(x != nullptr &&
|
||||
y_scale != nullptr &&
|
||||
y_zero_point != nullptr);
|
||||
|
||||
const auto& x_shape = x->Shape();
|
||||
|
||||
auto y = ctx->Output(0, x_shape);
|
||||
ORT_ENFORCE(y != nullptr);
|
||||
|
||||
const T* input = x->template Data<T>();
|
||||
float* output = y->template MutableData<float>();
|
||||
|
||||
ORT_ENFORCE(IsScalarOr1ElementVector(y_scale), "x_scale must be a scalar or 1D tensor of size 1.");
|
||||
ORT_ENFORCE(IsScalarOr1ElementVector(y_zero_point), "x_zero_point must be a scalar or 1D tensor of size 1.");
|
||||
|
||||
const T* zero_point = y_zero_point->template Data<T>();
|
||||
const float* scale = y_scale->template Data<float>();
|
||||
const auto num_of_elements = x_shape.Size();
|
||||
|
||||
CudaDequantizeLinear(input, output, scale, zero_point, num_of_elements);
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace onnxruntime
|
||||
93
onnxruntime/core/providers/cuda/tensor/quantize_linear.cu
Normal file
93
onnxruntime/core/providers/cuda/tensor/quantize_linear.cu
Normal file
|
|
@ -0,0 +1,93 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include "quantize_linear.cuh"
|
||||
|
||||
#include "core/providers/cuda/cu_inc/common.cuh"
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
||||
template <int NumThreadsPerBlock, int NumElementsPerThread>
|
||||
__global__ void QuantizeLinearKernel(const float* input, int8_t* output, const float* scale, const int8_t* zero_point, CUDA_LONG N) {
|
||||
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < NumElementsPerThread; i++) {
|
||||
if (id < N) {
|
||||
int value = __float2int_rn(input[id] / (*scale)) + *zero_point;
|
||||
output[id] = static_cast<int8_t>(max(-127, min(127, value)));
|
||||
id += NumThreadsPerBlock;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int NumThreadsPerBlock, int NumElementsPerThread>
|
||||
__global__ void QuantizeLinearKernel(const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, CUDA_LONG N) {
|
||||
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < NumElementsPerThread; i++) {
|
||||
if (id < N) {
|
||||
int value = __float2int_rn(input[id] / (*scale)) + *zero_point;
|
||||
output[id] = static_cast<uint8_t>(max(0, min(255, value)));
|
||||
id += NumThreadsPerBlock;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
Status CudaQuantizeLinear(const float* input, T* output, const float* scale, const T* zero_point, size_t num_of_element) {
|
||||
if (num_of_element <= 0)
|
||||
return Status::OK();
|
||||
|
||||
int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
|
||||
CUDA_LONG N = static_cast<CUDA_LONG>(num_of_element);
|
||||
QuantizeLinearKernel<GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
|
||||
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
|
||||
input,
|
||||
output,
|
||||
scale,
|
||||
zero_point,
|
||||
num_of_element);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <class T, int NumThreadsPerBlock, int NumElementsPerThread>
|
||||
__global__ void DequantizeLinearKernel(const T* input, float* output, const float* scale, const T* zero_point, CUDA_LONG N) {
|
||||
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < NumElementsPerThread; i++) {
|
||||
if (id < N) {
|
||||
output[id] = (input[id] - *zero_point) * (*scale);
|
||||
id += NumThreadsPerBlock;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
Status CudaDequantizeLinear(const T* input, float* output, const float* scale, const T* zero_point, size_t num_of_element) {
|
||||
if (num_of_element <= 0)
|
||||
return Status::OK();
|
||||
|
||||
int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
|
||||
CUDA_LONG N = static_cast<CUDA_LONG>(num_of_element);
|
||||
DequantizeLinearKernel<T, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
|
||||
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
|
||||
input,
|
||||
output,
|
||||
scale,
|
||||
zero_point,
|
||||
num_of_element);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template Status CudaQuantizeLinear<int8_t>(const float* input, int8_t* output, const float* scale, const int8_t* zero_point, size_t num_of_element);
|
||||
template Status CudaQuantizeLinear<uint8_t>(const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, size_t num_of_element);
|
||||
|
||||
template Status CudaDequantizeLinear<int8_t>(const int8_t* input, float* output, const float* scale, const int8_t* zero_point, size_t num_of_element);
|
||||
template Status CudaDequantizeLinear<uint8_t>(const uint8_t* input, float* output, const float* scale, const uint8_t* zero_point, size_t num_of_element);
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace onnxruntime
|
||||
21
onnxruntime/core/providers/cuda/tensor/quantize_linear.cuh
Normal file
21
onnxruntime/core/providers/cuda/tensor/quantize_linear.cuh
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "quantize_linear.h"
|
||||
#include "core/providers/cpu/math/matmul_helper.h"
|
||||
#include "core/providers/cuda/cuda_common.h"
|
||||
#include "core/providers/cuda/shared_inc/cuda_utils.h"
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
||||
template <class T>
|
||||
Status CudaQuantizeLinear(const float* input, T* output, const float* scale, const T* zero_point, size_t num_of_element);
|
||||
|
||||
template <class T>
|
||||
Status CudaDequantizeLinear(const T* input, float* output, const float* scale, const T* zero_point, size_t num_of_element);
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace onnxruntime
|
||||
31
onnxruntime/core/providers/cuda/tensor/quantize_linear.h
Normal file
31
onnxruntime/core/providers/cuda/tensor/quantize_linear.h
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "core/common/common.h"
|
||||
#include "core/framework/op_kernel.h"
|
||||
#include "core/providers/cuda/cuda_common.h"
|
||||
#include "gsl/gsl"
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
||||
template <class T>
|
||||
class QuantizeLinear final : public CudaKernel {
|
||||
public:
|
||||
QuantizeLinear(const OpKernelInfo& info) : CudaKernel(info) {}
|
||||
|
||||
Status ComputeInternal(OpKernelContext* p_op_kernel_context) const override;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
class DequantizeLinear final : public CudaKernel {
|
||||
public:
|
||||
DequantizeLinear(const OpKernelInfo& info) : CudaKernel(info) {}
|
||||
|
||||
Status ComputeInternal(OpKernelContext* p_op_kernel_context) const override;
|
||||
};
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace onnxruntime
|
||||
Loading…
Reference in a new issue