Implement QuantizeLinear and DequantizeLinear (#3098)

* Implement QuantizeLinear and DequantizeLinear
2026-07-12 17:57:38 +00:00 · 2020-03-04 13:30:20 -08:00 · 2020-03-04 13:30:20 -08:00 · fbb658e603
commit fbb658e603
parent 83753bcbe3
5 changed files with 261 additions and 0 deletions
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@ -700,6 +700,10 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, Round);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, Round);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, Round);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, int8_t, QuantizeLinear);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, uint8_t, QuantizeLinear);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, int8_t, DequantizeLinear);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, uint8_t, DequantizeLinear);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, CumSum);

 static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
@ -1167,6 +1171,10 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, Round)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, Round)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, Round)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, int8_t, QuantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, uint8_t, QuantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, int8_t, DequantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, uint8_t, DequantizeLinear)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, CumSum)>,
  };

--- a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc
+++ b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc
@ -0,0 +1,108 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "quantize_linear.h"
+#include "quantize_linear.cuh"
+
+#include "core/providers/common.h"
+
+namespace onnxruntime {
+namespace cuda {
+
+ONNX_OPERATOR_TYPED_KERNEL_EX(QuantizeLinear,
+                              kOnnxDomain,
+                              10,
+                              uint8_t,
+                              kCudaExecutionProvider,
+                              KernelDefBuilder()
+                                  .TypeConstraint("T1", DataTypeImpl::GetTensorType<float>())
+                                  .TypeConstraint("T2", DataTypeImpl::GetTensorType<uint8_t>()),
+                              QuantizeLinear<uint8_t>);
+
+ONNX_OPERATOR_TYPED_KERNEL_EX(QuantizeLinear,
+                              kOnnxDomain,
+                              10,
+                              int8_t,
+                              kCudaExecutionProvider,
+                              KernelDefBuilder()
+                                  .TypeConstraint("T1", DataTypeImpl::GetTensorType<float>())
+                                  .TypeConstraint("T2", DataTypeImpl::GetTensorType<int8_t>()),
+                              QuantizeLinear<int8_t>);
+
+template <class T>
+Status QuantizeLinear<T>::ComputeInternal(OpKernelContext* ctx) const {
+  auto x = ctx->Input<Tensor>(0);
+  auto y_scale = ctx->Input<Tensor>(1);
+  auto y_zero_point = ctx->Input<Tensor>(2);
+  ORT_ENFORCE(x != nullptr &&
+              y_scale != nullptr &&
+              y_zero_point != nullptr);
+  auto y = ctx->Output(0, x->Shape());
+  ORT_ENFORCE(y != nullptr);
+
+  const auto& x_shape = x->Shape();
+
+  const float* input = x->template Data<float>();
+  T* output = y->template MutableData<T>();
+
+  ORT_ENFORCE(IsScalarOr1ElementVector(y_scale), "x_scale must be a scalar or 1D tensor of size 1.");
+  ORT_ENFORCE(IsScalarOr1ElementVector(y_zero_point), "x_zero_point must be a scalar or 1D tensor of size 1.");
+
+  const T* zero_point = y_zero_point->template Data<T>();
+  const float* scale = y_scale->template Data<float>();
+  const auto num_of_elements = x_shape.Size();
+
+  CudaQuantizeLinear(input, output, scale, zero_point, num_of_elements);
+
+  return Status::OK();
+}
+
+ONNX_OPERATOR_TYPED_KERNEL_EX(DequantizeLinear,
+                              kOnnxDomain,
+                              10,
+                              uint8_t,
+                              kCudaExecutionProvider,
+                              KernelDefBuilder()
+                                  .TypeConstraint("T", DataTypeImpl::GetTensorType<uint8_t>()),
+                              DequantizeLinear<uint8_t>);
+
+ONNX_OPERATOR_TYPED_KERNEL_EX(DequantizeLinear,
+                              kOnnxDomain,
+                              10,
+                              int8_t,
+                              kCudaExecutionProvider,
+                              KernelDefBuilder()
+                                  .TypeConstraint("T", DataTypeImpl::GetTensorType<int8_t>()),
+                              DequantizeLinear<int8_t>);
+
+template <class T>
+Status DequantizeLinear<T>::ComputeInternal(OpKernelContext* ctx) const {
+  auto x = ctx->Input<Tensor>(0);
+  auto y_scale = ctx->Input<Tensor>(1);
+  auto y_zero_point = ctx->Input<Tensor>(2);
+  ORT_ENFORCE(x != nullptr &&
+              y_scale != nullptr &&
+              y_zero_point != nullptr);
+
+  const auto& x_shape = x->Shape();
+
+  auto y = ctx->Output(0, x_shape);
+  ORT_ENFORCE(y != nullptr);
+
+  const T* input = x->template Data<T>();
+  float* output = y->template MutableData<float>();
+
+  ORT_ENFORCE(IsScalarOr1ElementVector(y_scale), "x_scale must be a scalar or 1D tensor of size 1.");
+  ORT_ENFORCE(IsScalarOr1ElementVector(y_zero_point), "x_zero_point must be a scalar or 1D tensor of size 1.");
+
+  const T* zero_point = y_zero_point->template Data<T>();
+  const float* scale = y_scale->template Data<float>();
+  const auto num_of_elements = x_shape.Size();
+
+  CudaDequantizeLinear(input, output, scale, zero_point, num_of_elements);
+
+  return Status::OK();
+}
+
+}  // namespace cuda
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cu
+++ b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cu
@ -0,0 +1,93 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "quantize_linear.cuh"
+
+#include "core/providers/cuda/cu_inc/common.cuh"
+
+namespace onnxruntime {
+namespace cuda {
+
+template <int NumThreadsPerBlock, int NumElementsPerThread>
+__global__ void QuantizeLinearKernel(const float* input, int8_t* output, const float* scale, const int8_t* zero_point, CUDA_LONG N) {
+  CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
+
+#pragma unroll
+  for (int i = 0; i < NumElementsPerThread; i++) {
+    if (id < N) {
+      int value = __float2int_rn(input[id] / (*scale)) + *zero_point;
+      output[id] = static_cast<int8_t>(max(-127, min(127, value)));
+      id += NumThreadsPerBlock;
+    }
+  }
+}
+
+template <int NumThreadsPerBlock, int NumElementsPerThread>
+__global__ void QuantizeLinearKernel(const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, CUDA_LONG N) {
+  CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
+
+#pragma unroll
+  for (int i = 0; i < NumElementsPerThread; i++) {
+    if (id < N) {
+      int value = __float2int_rn(input[id] / (*scale)) + *zero_point;
+      output[id] = static_cast<uint8_t>(max(0, min(255, value)));
+      id += NumThreadsPerBlock;
+    }
+  }
+}
+
+template <class T>
+Status CudaQuantizeLinear(const float* input, T* output, const float* scale, const T* zero_point, size_t num_of_element) {
+  if (num_of_element <= 0)
+    return Status::OK();
+
+  int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
+  CUDA_LONG N = static_cast<CUDA_LONG>(num_of_element);
+  QuantizeLinearKernel<GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
+      <<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
+          input,
+          output,
+          scale,
+          zero_point,
+          num_of_element);
+  return Status::OK();
+}
+
+template <class T, int NumThreadsPerBlock, int NumElementsPerThread>
+__global__ void DequantizeLinearKernel(const T* input, float* output, const float* scale, const T* zero_point, CUDA_LONG N) {
+  CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
+
+#pragma unroll
+  for (int i = 0; i < NumElementsPerThread; i++) {
+    if (id < N) {
+      output[id] = (input[id] - *zero_point) * (*scale);
+      id += NumThreadsPerBlock;
+    }
+  }
+}
+
+template <class T>
+Status CudaDequantizeLinear(const T* input, float* output, const float* scale, const T* zero_point, size_t num_of_element) {
+  if (num_of_element <= 0)
+    return Status::OK();
+
+  int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
+  CUDA_LONG N = static_cast<CUDA_LONG>(num_of_element);
+  DequantizeLinearKernel<T, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
+      <<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
+          input,
+          output,
+          scale,
+          zero_point,
+          num_of_element);
+  return Status::OK();
+}
+
+template Status CudaQuantizeLinear<int8_t>(const float* input, int8_t* output, const float* scale, const int8_t* zero_point, size_t num_of_element);
+template Status CudaQuantizeLinear<uint8_t>(const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, size_t num_of_element);
+
+template Status CudaDequantizeLinear<int8_t>(const int8_t* input, float* output, const float* scale, const int8_t* zero_point, size_t num_of_element);
+template Status CudaDequantizeLinear<uint8_t>(const uint8_t* input, float* output, const float* scale, const uint8_t* zero_point, size_t num_of_element);
+
+}  // namespace cuda
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cuh
+++ b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cuh
@ -0,0 +1,21 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "quantize_linear.h"
+#include "core/providers/cpu/math/matmul_helper.h"
+#include "core/providers/cuda/cuda_common.h"
+#include "core/providers/cuda/shared_inc/cuda_utils.h"
+
+namespace onnxruntime {
+namespace cuda {
+
+template <class T>
+Status CudaQuantizeLinear(const float* input, T* output, const float* scale, const T* zero_point, size_t num_of_element);
+
+template <class T>
+Status CudaDequantizeLinear(const T* input, float* output, const float* scale, const T* zero_point, size_t num_of_element);
+
+}  // namespace cuda
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/cuda/tensor/quantize_linear.h
+++ b/onnxruntime/core/providers/cuda/tensor/quantize_linear.h
@ -0,0 +1,31 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/common/common.h"
+#include "core/framework/op_kernel.h"
+#include "core/providers/cuda/cuda_common.h"
+#include "gsl/gsl"
+
+namespace onnxruntime {
+namespace cuda {
+
+template <class T>
+class QuantizeLinear final : public CudaKernel {
+ public:
+  QuantizeLinear(const OpKernelInfo& info) : CudaKernel(info) {}
+
+  Status ComputeInternal(OpKernelContext* p_op_kernel_context) const override;
+};
+
+template <class T>
+class DequantizeLinear final : public CudaKernel {
+ public:
+  DequantizeLinear(const OpKernelInfo& info) : CudaKernel(info) {}
+
+  Status ComputeInternal(OpKernelContext* p_op_kernel_context) const override;
+};
+
+}  // namespace cuda
+}  // namespace onnxruntime