Implement QuantizeLinear and DequantizeLinear (#3098)

* Implement QuantizeLinear and DequantizeLinear
This commit is contained in:
Yufeng Li 2020-03-04 13:30:20 -08:00 committed by GitHub
parent 83753bcbe3
commit fbb658e603
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 261 additions and 0 deletions

View file

@ -700,6 +700,10 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, Round);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, Round);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, Round);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, int8_t, QuantizeLinear);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, uint8_t, QuantizeLinear);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, int8_t, DequantizeLinear);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, uint8_t, DequantizeLinear);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, CumSum);
static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
@ -1167,6 +1171,10 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, Round)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, Round)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, Round)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, int8_t, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, uint8_t, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, int8_t, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, uint8_t, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, CumSum)>,
};

View file

@ -0,0 +1,108 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "quantize_linear.h"
#include "quantize_linear.cuh"
#include "core/providers/common.h"
namespace onnxruntime {
namespace cuda {
ONNX_OPERATOR_TYPED_KERNEL_EX(QuantizeLinear,
kOnnxDomain,
10,
uint8_t,
kCudaExecutionProvider,
KernelDefBuilder()
.TypeConstraint("T1", DataTypeImpl::GetTensorType<float>())
.TypeConstraint("T2", DataTypeImpl::GetTensorType<uint8_t>()),
QuantizeLinear<uint8_t>);
ONNX_OPERATOR_TYPED_KERNEL_EX(QuantizeLinear,
kOnnxDomain,
10,
int8_t,
kCudaExecutionProvider,
KernelDefBuilder()
.TypeConstraint("T1", DataTypeImpl::GetTensorType<float>())
.TypeConstraint("T2", DataTypeImpl::GetTensorType<int8_t>()),
QuantizeLinear<int8_t>);
template <class T>
Status QuantizeLinear<T>::ComputeInternal(OpKernelContext* ctx) const {
auto x = ctx->Input<Tensor>(0);
auto y_scale = ctx->Input<Tensor>(1);
auto y_zero_point = ctx->Input<Tensor>(2);
ORT_ENFORCE(x != nullptr &&
y_scale != nullptr &&
y_zero_point != nullptr);
auto y = ctx->Output(0, x->Shape());
ORT_ENFORCE(y != nullptr);
const auto& x_shape = x->Shape();
const float* input = x->template Data<float>();
T* output = y->template MutableData<T>();
ORT_ENFORCE(IsScalarOr1ElementVector(y_scale), "x_scale must be a scalar or 1D tensor of size 1.");
ORT_ENFORCE(IsScalarOr1ElementVector(y_zero_point), "x_zero_point must be a scalar or 1D tensor of size 1.");
const T* zero_point = y_zero_point->template Data<T>();
const float* scale = y_scale->template Data<float>();
const auto num_of_elements = x_shape.Size();
CudaQuantizeLinear(input, output, scale, zero_point, num_of_elements);
return Status::OK();
}
ONNX_OPERATOR_TYPED_KERNEL_EX(DequantizeLinear,
kOnnxDomain,
10,
uint8_t,
kCudaExecutionProvider,
KernelDefBuilder()
.TypeConstraint("T", DataTypeImpl::GetTensorType<uint8_t>()),
DequantizeLinear<uint8_t>);
ONNX_OPERATOR_TYPED_KERNEL_EX(DequantizeLinear,
kOnnxDomain,
10,
int8_t,
kCudaExecutionProvider,
KernelDefBuilder()
.TypeConstraint("T", DataTypeImpl::GetTensorType<int8_t>()),
DequantizeLinear<int8_t>);
template <class T>
Status DequantizeLinear<T>::ComputeInternal(OpKernelContext* ctx) const {
auto x = ctx->Input<Tensor>(0);
auto y_scale = ctx->Input<Tensor>(1);
auto y_zero_point = ctx->Input<Tensor>(2);
ORT_ENFORCE(x != nullptr &&
y_scale != nullptr &&
y_zero_point != nullptr);
const auto& x_shape = x->Shape();
auto y = ctx->Output(0, x_shape);
ORT_ENFORCE(y != nullptr);
const T* input = x->template Data<T>();
float* output = y->template MutableData<float>();
ORT_ENFORCE(IsScalarOr1ElementVector(y_scale), "x_scale must be a scalar or 1D tensor of size 1.");
ORT_ENFORCE(IsScalarOr1ElementVector(y_zero_point), "x_zero_point must be a scalar or 1D tensor of size 1.");
const T* zero_point = y_zero_point->template Data<T>();
const float* scale = y_scale->template Data<float>();
const auto num_of_elements = x_shape.Size();
CudaDequantizeLinear(input, output, scale, zero_point, num_of_elements);
return Status::OK();
}
} // namespace cuda
} // namespace onnxruntime

View file

@ -0,0 +1,93 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "quantize_linear.cuh"
#include "core/providers/cuda/cu_inc/common.cuh"
namespace onnxruntime {
namespace cuda {
template <int NumThreadsPerBlock, int NumElementsPerThread>
__global__ void QuantizeLinearKernel(const float* input, int8_t* output, const float* scale, const int8_t* zero_point, CUDA_LONG N) {
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
#pragma unroll
for (int i = 0; i < NumElementsPerThread; i++) {
if (id < N) {
int value = __float2int_rn(input[id] / (*scale)) + *zero_point;
output[id] = static_cast<int8_t>(max(-127, min(127, value)));
id += NumThreadsPerBlock;
}
}
}
template <int NumThreadsPerBlock, int NumElementsPerThread>
__global__ void QuantizeLinearKernel(const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, CUDA_LONG N) {
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
#pragma unroll
for (int i = 0; i < NumElementsPerThread; i++) {
if (id < N) {
int value = __float2int_rn(input[id] / (*scale)) + *zero_point;
output[id] = static_cast<uint8_t>(max(0, min(255, value)));
id += NumThreadsPerBlock;
}
}
}
template <class T>
Status CudaQuantizeLinear(const float* input, T* output, const float* scale, const T* zero_point, size_t num_of_element) {
if (num_of_element <= 0)
return Status::OK();
int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
CUDA_LONG N = static_cast<CUDA_LONG>(num_of_element);
QuantizeLinearKernel<GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
input,
output,
scale,
zero_point,
num_of_element);
return Status::OK();
}
template <class T, int NumThreadsPerBlock, int NumElementsPerThread>
__global__ void DequantizeLinearKernel(const T* input, float* output, const float* scale, const T* zero_point, CUDA_LONG N) {
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
#pragma unroll
for (int i = 0; i < NumElementsPerThread; i++) {
if (id < N) {
output[id] = (input[id] - *zero_point) * (*scale);
id += NumThreadsPerBlock;
}
}
}
template <class T>
Status CudaDequantizeLinear(const T* input, float* output, const float* scale, const T* zero_point, size_t num_of_element) {
if (num_of_element <= 0)
return Status::OK();
int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
CUDA_LONG N = static_cast<CUDA_LONG>(num_of_element);
DequantizeLinearKernel<T, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
input,
output,
scale,
zero_point,
num_of_element);
return Status::OK();
}
template Status CudaQuantizeLinear<int8_t>(const float* input, int8_t* output, const float* scale, const int8_t* zero_point, size_t num_of_element);
template Status CudaQuantizeLinear<uint8_t>(const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, size_t num_of_element);
template Status CudaDequantizeLinear<int8_t>(const int8_t* input, float* output, const float* scale, const int8_t* zero_point, size_t num_of_element);
template Status CudaDequantizeLinear<uint8_t>(const uint8_t* input, float* output, const float* scale, const uint8_t* zero_point, size_t num_of_element);
} // namespace cuda
} // namespace onnxruntime

View file

@ -0,0 +1,21 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "quantize_linear.h"
#include "core/providers/cpu/math/matmul_helper.h"
#include "core/providers/cuda/cuda_common.h"
#include "core/providers/cuda/shared_inc/cuda_utils.h"
namespace onnxruntime {
namespace cuda {
template <class T>
Status CudaQuantizeLinear(const float* input, T* output, const float* scale, const T* zero_point, size_t num_of_element);
template <class T>
Status CudaDequantizeLinear(const T* input, float* output, const float* scale, const T* zero_point, size_t num_of_element);
} // namespace cuda
} // namespace onnxruntime

View file

@ -0,0 +1,31 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/common/common.h"
#include "core/framework/op_kernel.h"
#include "core/providers/cuda/cuda_common.h"
#include "gsl/gsl"
namespace onnxruntime {
namespace cuda {
template <class T>
class QuantizeLinear final : public CudaKernel {
public:
QuantizeLinear(const OpKernelInfo& info) : CudaKernel(info) {}
Status ComputeInternal(OpKernelContext* p_op_kernel_context) const override;
};
template <class T>
class DequantizeLinear final : public CudaKernel {
public:
DequantizeLinear(const OpKernelInfo& info) : CudaKernel(info) {}
Status ComputeInternal(OpKernelContext* p_op_kernel_context) const override;
};
} // namespace cuda
} // namespace onnxruntime