diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 0dc18a50cc..46dbd26087 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -700,6 +700,10 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, Round); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, Round); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, Round); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, int8_t, QuantizeLinear); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, uint8_t, QuantizeLinear); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, int8_t, DequantizeLinear); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, uint8_t, DequantizeLinear); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, CumSum); static void RegisterCudaKernels(KernelRegistry& kernel_registry) { @@ -1167,6 +1171,10 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, }; diff --git a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc new file mode 100644 index 0000000000..626313096e --- /dev/null +++ b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc @@ -0,0 +1,108 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "quantize_linear.h" +#include "quantize_linear.cuh" + +#include "core/providers/common.h" + +namespace onnxruntime { +namespace cuda { + +ONNX_OPERATOR_TYPED_KERNEL_EX(QuantizeLinear, + kOnnxDomain, + 10, + uint8_t, + kCudaExecutionProvider, + KernelDefBuilder() + .TypeConstraint("T1", DataTypeImpl::GetTensorType()) + .TypeConstraint("T2", DataTypeImpl::GetTensorType()), + QuantizeLinear); + +ONNX_OPERATOR_TYPED_KERNEL_EX(QuantizeLinear, + kOnnxDomain, + 10, + int8_t, + kCudaExecutionProvider, + KernelDefBuilder() + .TypeConstraint("T1", DataTypeImpl::GetTensorType()) + .TypeConstraint("T2", DataTypeImpl::GetTensorType()), + QuantizeLinear); + +template +Status QuantizeLinear::ComputeInternal(OpKernelContext* ctx) const { + auto x = ctx->Input(0); + auto y_scale = ctx->Input(1); + auto y_zero_point = ctx->Input(2); + ORT_ENFORCE(x != nullptr && + y_scale != nullptr && + y_zero_point != nullptr); + auto y = ctx->Output(0, x->Shape()); + ORT_ENFORCE(y != nullptr); + + const auto& x_shape = x->Shape(); + + const float* input = x->template Data(); + T* output = y->template MutableData(); + + ORT_ENFORCE(IsScalarOr1ElementVector(y_scale), "x_scale must be a scalar or 1D tensor of size 1."); + ORT_ENFORCE(IsScalarOr1ElementVector(y_zero_point), "x_zero_point must be a scalar or 1D tensor of size 1."); + + const T* zero_point = y_zero_point->template Data(); + const float* scale = y_scale->template Data(); + const auto num_of_elements = x_shape.Size(); + + CudaQuantizeLinear(input, output, scale, zero_point, num_of_elements); + + return Status::OK(); +} + +ONNX_OPERATOR_TYPED_KERNEL_EX(DequantizeLinear, + kOnnxDomain, + 10, + uint8_t, + kCudaExecutionProvider, + KernelDefBuilder() + .TypeConstraint("T", DataTypeImpl::GetTensorType()), + DequantizeLinear); + +ONNX_OPERATOR_TYPED_KERNEL_EX(DequantizeLinear, + kOnnxDomain, + 10, + int8_t, + kCudaExecutionProvider, + KernelDefBuilder() + .TypeConstraint("T", DataTypeImpl::GetTensorType()), + DequantizeLinear); + +template +Status DequantizeLinear::ComputeInternal(OpKernelContext* ctx) const { + auto x = ctx->Input(0); + auto y_scale = ctx->Input(1); + auto y_zero_point = ctx->Input(2); + ORT_ENFORCE(x != nullptr && + y_scale != nullptr && + y_zero_point != nullptr); + + const auto& x_shape = x->Shape(); + + auto y = ctx->Output(0, x_shape); + ORT_ENFORCE(y != nullptr); + + const T* input = x->template Data(); + float* output = y->template MutableData(); + + ORT_ENFORCE(IsScalarOr1ElementVector(y_scale), "x_scale must be a scalar or 1D tensor of size 1."); + ORT_ENFORCE(IsScalarOr1ElementVector(y_zero_point), "x_zero_point must be a scalar or 1D tensor of size 1."); + + const T* zero_point = y_zero_point->template Data(); + const float* scale = y_scale->template Data(); + const auto num_of_elements = x_shape.Size(); + + CudaDequantizeLinear(input, output, scale, zero_point, num_of_elements); + + return Status::OK(); +} + +} // namespace cuda +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cu b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cu new file mode 100644 index 0000000000..f380fa69f8 --- /dev/null +++ b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cu @@ -0,0 +1,93 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "quantize_linear.cuh" + +#include "core/providers/cuda/cu_inc/common.cuh" + +namespace onnxruntime { +namespace cuda { + +template +__global__ void QuantizeLinearKernel(const float* input, int8_t* output, const float* scale, const int8_t* zero_point, CUDA_LONG N) { + CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; + +#pragma unroll + for (int i = 0; i < NumElementsPerThread; i++) { + if (id < N) { + int value = __float2int_rn(input[id] / (*scale)) + *zero_point; + output[id] = static_cast(max(-127, min(127, value))); + id += NumThreadsPerBlock; + } + } +} + +template +__global__ void QuantizeLinearKernel(const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, CUDA_LONG N) { + CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; + +#pragma unroll + for (int i = 0; i < NumElementsPerThread; i++) { + if (id < N) { + int value = __float2int_rn(input[id] / (*scale)) + *zero_point; + output[id] = static_cast(max(0, min(255, value))); + id += NumThreadsPerBlock; + } + } +} + +template +Status CudaQuantizeLinear(const float* input, T* output, const float* scale, const T* zero_point, size_t num_of_element) { + if (num_of_element <= 0) + return Status::OK(); + + int blocksPerGrid = static_cast(CeilDiv(num_of_element, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); + CUDA_LONG N = static_cast(num_of_element); + QuantizeLinearKernel + <<>>( + input, + output, + scale, + zero_point, + num_of_element); + return Status::OK(); +} + +template +__global__ void DequantizeLinearKernel(const T* input, float* output, const float* scale, const T* zero_point, CUDA_LONG N) { + CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; + +#pragma unroll + for (int i = 0; i < NumElementsPerThread; i++) { + if (id < N) { + output[id] = (input[id] - *zero_point) * (*scale); + id += NumThreadsPerBlock; + } + } +} + +template +Status CudaDequantizeLinear(const T* input, float* output, const float* scale, const T* zero_point, size_t num_of_element) { + if (num_of_element <= 0) + return Status::OK(); + + int blocksPerGrid = static_cast(CeilDiv(num_of_element, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); + CUDA_LONG N = static_cast(num_of_element); + DequantizeLinearKernel + <<>>( + input, + output, + scale, + zero_point, + num_of_element); + return Status::OK(); +} + +template Status CudaQuantizeLinear(const float* input, int8_t* output, const float* scale, const int8_t* zero_point, size_t num_of_element); +template Status CudaQuantizeLinear(const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, size_t num_of_element); + +template Status CudaDequantizeLinear(const int8_t* input, float* output, const float* scale, const int8_t* zero_point, size_t num_of_element); +template Status CudaDequantizeLinear(const uint8_t* input, float* output, const float* scale, const uint8_t* zero_point, size_t num_of_element); + +} // namespace cuda +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cuh b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cuh new file mode 100644 index 0000000000..a5d42d1180 --- /dev/null +++ b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cuh @@ -0,0 +1,21 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "quantize_linear.h" +#include "core/providers/cpu/math/matmul_helper.h" +#include "core/providers/cuda/cuda_common.h" +#include "core/providers/cuda/shared_inc/cuda_utils.h" + +namespace onnxruntime { +namespace cuda { + +template +Status CudaQuantizeLinear(const float* input, T* output, const float* scale, const T* zero_point, size_t num_of_element); + +template +Status CudaDequantizeLinear(const T* input, float* output, const float* scale, const T* zero_point, size_t num_of_element); + +} // namespace cuda +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/quantize_linear.h b/onnxruntime/core/providers/cuda/tensor/quantize_linear.h new file mode 100644 index 0000000000..7c964dc4ee --- /dev/null +++ b/onnxruntime/core/providers/cuda/tensor/quantize_linear.h @@ -0,0 +1,31 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/common/common.h" +#include "core/framework/op_kernel.h" +#include "core/providers/cuda/cuda_common.h" +#include "gsl/gsl" + +namespace onnxruntime { +namespace cuda { + +template +class QuantizeLinear final : public CudaKernel { + public: + QuantizeLinear(const OpKernelInfo& info) : CudaKernel(info) {} + + Status ComputeInternal(OpKernelContext* p_op_kernel_context) const override; +}; + +template +class DequantizeLinear final : public CudaKernel { + public: + DequantizeLinear(const OpKernelInfo& info) : CudaKernel(info) {} + + Status ComputeInternal(OpKernelContext* p_op_kernel_context) const override; +}; + +} // namespace cuda +} // namespace onnxruntime