mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-07-03 03:58:54 +00:00
Enable running some ops on CUDA (#6572)
This commit is contained in:
parent
505c1f30b5
commit
8f0b877a1d
9 changed files with 172 additions and 30 deletions
|
|
@ -133,7 +133,7 @@ template <>
|
|||
__device__ __inline__ double _Round(double a) { return rint(a); }
|
||||
|
||||
template <>
|
||||
__device__ __inline__ half _Round(half a) {
|
||||
__device__ __inline__ half _Round(half a) {
|
||||
#if __CUDA_ARCH__ < 530
|
||||
return half(rintf((float)a));
|
||||
#else
|
||||
|
|
@ -141,6 +141,42 @@ __device__ __inline__ half _Round(half a) {
|
|||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __inline__ T _Cos(T a);
|
||||
|
||||
template <>
|
||||
__device__ __inline__ float _Cos(float a) { return cosf(a); }
|
||||
|
||||
template <>
|
||||
__device__ __inline__ double _Cos(double a) { return cos(a); }
|
||||
|
||||
template <>
|
||||
__device__ __inline__ half _Cos(half a) {
|
||||
#if __CUDA_ARCH__ < 530
|
||||
return half(cosf((float)a));
|
||||
#else
|
||||
return hcos(a);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __inline__ T _Sin(T a);
|
||||
|
||||
template <>
|
||||
__device__ __inline__ float _Sin(float a) { return sinf(a); }
|
||||
|
||||
template <>
|
||||
__device__ __inline__ double _Sin(double a) { return sin(a); }
|
||||
|
||||
template <>
|
||||
__device__ __inline__ half _Sin(half a) {
|
||||
#if __CUDA_ARCH__ < 530
|
||||
return half(sinf((float)a));
|
||||
#else
|
||||
return hsin(a);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __inline__ T _Exp(T a);
|
||||
|
||||
|
|
@ -270,10 +306,9 @@ struct GridDim {
|
|||
};
|
||||
};
|
||||
|
||||
|
||||
#define CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N) \
|
||||
CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x; \
|
||||
if (id >= N) \
|
||||
#define CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N) \
|
||||
CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x; \
|
||||
if (id >= N) \
|
||||
return;
|
||||
|
||||
// CUDA_KERNEL_ASSERT is a macro that wraps an assert() call inside cuda kernels.
|
||||
|
|
@ -281,16 +316,15 @@ struct GridDim {
|
|||
// See http://docs.nvidia.com/cuda/cuda-c-programming-guide/#assertion
|
||||
#if defined(__APPLE__) || defined(__HIP_PLATFORM_HCC__)
|
||||
#define CUDA_KERNEL_ASSERT(...)
|
||||
#else // __APPLE__
|
||||
#else // __APPLE__
|
||||
#define CUDA_KERNEL_ASSERT(...) assert(__VA_ARGS__)
|
||||
#endif // __APPLE__
|
||||
#endif // __APPLE__
|
||||
|
||||
// WARP related definitions and functions
|
||||
constexpr int GPU_WARP_SIZE = 32;
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T WARP_SHFL(T value, int srcLane, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff)
|
||||
{
|
||||
__device__ __forceinline__ T WARP_SHFL(T value, int srcLane, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff) {
|
||||
#if CUDA_VERSION >= 9000
|
||||
return __shfl_sync(mask, value, srcLane, width);
|
||||
#else
|
||||
|
|
@ -299,8 +333,7 @@ __device__ __forceinline__ T WARP_SHFL(T value, int srcLane, int width = GPU_WAR
|
|||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff)
|
||||
{
|
||||
__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff) {
|
||||
#if CUDA_VERSION >= 9000
|
||||
return __shfl_xor_sync(mask, value, laneMask, width);
|
||||
#else
|
||||
|
|
@ -309,8 +342,7 @@ __device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = GP
|
|||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T WARP_SHFL_UP(T value, unsigned int delta, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff)
|
||||
{
|
||||
__device__ __forceinline__ T WARP_SHFL_UP(T value, unsigned int delta, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff) {
|
||||
#if CUDA_VERSION >= 9000
|
||||
return __shfl_up_sync(mask, value, delta, width);
|
||||
#else
|
||||
|
|
@ -319,8 +351,7 @@ __device__ __forceinline__ T WARP_SHFL_UP(T value, unsigned int delta, int width
|
|||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff)
|
||||
{
|
||||
__device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff) {
|
||||
#if CUDA_VERSION >= 9000
|
||||
return __shfl_down_sync(mask, value, delta, width);
|
||||
#else
|
||||
|
|
|
|||
|
|
@ -294,6 +294,12 @@ namespace cuda {
|
|||
// opset 1 to 9
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MemcpyFromHost);
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MemcpyToHost);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, float, Cos);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, double, Cos);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, MLFloat16, Cos);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, float, Sin);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, double, Sin);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, MLFloat16, Sin);
|
||||
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 4, 10, Concat);
|
||||
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, Unsqueeze);
|
||||
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 8, Flatten);
|
||||
|
|
@ -496,6 +502,7 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kO
|
|||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, double, ReduceMax);
|
||||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, MLFloat16, ReduceMax);
|
||||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, int32_t, ReduceMax);
|
||||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, int64_t, ReduceMax);
|
||||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, float, ReduceMean);
|
||||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, double, ReduceMean);
|
||||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, MLFloat16, ReduceMean);
|
||||
|
|
@ -512,6 +519,7 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kO
|
|||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, double, ReduceSum);
|
||||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, MLFloat16, ReduceSum);
|
||||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, int32_t, ReduceSum);
|
||||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, int64_t, ReduceSum);
|
||||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, float, ReduceLogSum);
|
||||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, double, ReduceLogSum);
|
||||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, MLFloat16, ReduceLogSum);
|
||||
|
|
@ -685,6 +693,7 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kO
|
|||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 11, double, ReduceMax);
|
||||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 11, MLFloat16, ReduceMax);
|
||||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 11, int32_t, ReduceMax);
|
||||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 11, int64_t, ReduceMax);
|
||||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 12, float, ReduceMean);
|
||||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 12, double, ReduceMean);
|
||||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 12, MLFloat16, ReduceMean);
|
||||
|
|
@ -701,6 +710,7 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kO
|
|||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 12, double, ReduceSum);
|
||||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 12, MLFloat16, ReduceSum);
|
||||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 12, int32_t, ReduceSum);
|
||||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 12, int64_t, ReduceSum);
|
||||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 12, float, ReduceSumSquare);
|
||||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 12, double, ReduceSumSquare);
|
||||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 12, MLFloat16, ReduceSumSquare);
|
||||
|
|
@ -778,6 +788,7 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kO
|
|||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, 12, double, ReduceMax);
|
||||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, 12, MLFloat16, ReduceMax);
|
||||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, 12, int32_t, ReduceMax);
|
||||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, 12, int64_t, ReduceMax);
|
||||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, 12, int8_t, ReduceMax);
|
||||
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, 12, uint8_t, ReduceMax);
|
||||
|
||||
|
|
@ -959,6 +970,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
|
|||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, double, ReduceMax);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, MLFloat16, ReduceMax);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int32_t, ReduceMax);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int64_t, ReduceMax);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int8_t, ReduceMax);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, uint8_t, ReduceMax);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, float, ReduceMean);
|
||||
|
|
@ -979,6 +991,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
|
|||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, double, ReduceSum);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, MLFloat16, ReduceSum);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int32_t, ReduceSum);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int64_t, ReduceSum);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, float, ReduceSumSquare);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, double, ReduceSumSquare);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, MLFloat16, ReduceSumSquare);
|
||||
|
|
@ -1033,6 +1046,12 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
|
|||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, Squeeze)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 12, Identity)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 9, Dropout)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, float, Cos)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, double, Cos)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, MLFloat16, Cos)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, float, Sin)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, double, Sin)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, MLFloat16, Sin)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, Gather)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 8, float, Gemm)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 8, double, Gemm)>,
|
||||
|
|
@ -1229,6 +1248,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
|
|||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, double, ReduceMax)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, MLFloat16, ReduceMax)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, int32_t, ReduceMax)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, int64_t, ReduceMax)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, float, ReduceMean)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, double, ReduceMean)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, MLFloat16, ReduceMean)>,
|
||||
|
|
@ -1245,6 +1265,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
|
|||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, double, ReduceSum)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, MLFloat16, ReduceSum)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, int32_t, ReduceSum)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, int64_t, ReduceSum)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, float, ReduceLogSum)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, double, ReduceLogSum)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, MLFloat16, ReduceLogSum)>,
|
||||
|
|
@ -1422,6 +1443,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
|
|||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 11, double, ReduceMax)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 11, MLFloat16, ReduceMax)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 11, int32_t, ReduceMax)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 11, int64_t, ReduceMax)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 12, float, ReduceMean)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 12, double, ReduceMean)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 12, MLFloat16, ReduceMean)>,
|
||||
|
|
@ -1438,6 +1460,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
|
|||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 12, double, ReduceSum)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 12, MLFloat16, ReduceSum)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 12, int32_t, ReduceSum)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 12, int64_t, ReduceSum)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 12, float, ReduceSumSquare)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 12, double, ReduceSumSquare)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 12, MLFloat16, ReduceSumSquare)>,
|
||||
|
|
@ -1511,6 +1534,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
|
|||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, 12, double, ReduceMax)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, 12, MLFloat16, ReduceMax)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, 12, int32_t, ReduceMax)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, 12, int64_t, ReduceMax)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, 12, int8_t, ReduceMax)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, 12, uint8_t, ReduceMax)>,
|
||||
|
||||
|
|
@ -1692,6 +1716,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
|
|||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, double, ReduceMax)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, MLFloat16, ReduceMax)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int32_t, ReduceMax)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int64_t, ReduceMax)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int8_t, ReduceMax)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, uint8_t, ReduceMax)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, float, ReduceMean)>,
|
||||
|
|
@ -1712,6 +1737,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
|
|||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, double, ReduceSum)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, MLFloat16, ReduceSum)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int32_t, ReduceSum)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int64_t, ReduceSum)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, float, ReduceSumSquare)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, double, ReduceSumSquare)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, MLFloat16, ReduceSumSquare)>,
|
||||
|
|
@ -2044,4 +2070,4 @@ void CUDAExecutionProvider::RegisterAllocator(std::shared_ptr<AllocatorManager>
|
|||
TryInsertAllocator(cuda_cpu_alloc);
|
||||
}
|
||||
|
||||
} // namespace onnxruntime
|
||||
} // namespace onnxruntime
|
||||
|
|
@ -58,7 +58,7 @@ Status UnaryElementwise::Prepare(OpKernelContext* context, UnaryElementwisePrepa
|
|||
return Status::OK(); \
|
||||
}
|
||||
|
||||
#define UNARY_OP_VERSIONED_TYPED(name, startver, endver, T) \
|
||||
#define UNARY_OP_VERSIONED_TYPED(name, startver, endver, T) \
|
||||
UNARY_ELEMENTWISE_REGISTER_VERSIONED_KERNEL(name, startver, endver, T)
|
||||
|
||||
#define UNARY_OP_TYPED(name, ver, T) \
|
||||
|
|
@ -143,6 +143,8 @@ UNARY_OP_HFD(Erf, 13)
|
|||
|
||||
UNARY_LOGICALOP_TYPED(Not, 1, bool)
|
||||
UNARY_OP_HFD(Round, 11)
|
||||
UNARY_OP_HFD(Cos, 7)
|
||||
UNARY_OP_HFD(Sin, 7)
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace onnxruntime
|
||||
|
|
|
|||
|
|
@ -98,5 +98,19 @@ class Round final : public UnaryElementwise {
|
|||
Status ComputeInternal(OpKernelContext* context) const override;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
class Sin final : public UnaryElementwise {
|
||||
public:
|
||||
Sin(const OpKernelInfo& info) : UnaryElementwise(info) {}
|
||||
Status ComputeInternal(OpKernelContext* context) const override;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
class Cos final : public UnaryElementwise {
|
||||
public:
|
||||
Cos(const OpKernelInfo& info) : UnaryElementwise(info) {}
|
||||
Status ComputeInternal(OpKernelContext* context) const override;
|
||||
};
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace onnxruntime
|
||||
|
|
|
|||
|
|
@ -79,6 +79,8 @@ SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFD(Log)
|
|||
SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFD(Exp)
|
||||
SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFD(Erf)
|
||||
SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFD(Round)
|
||||
SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFD(Sin)
|
||||
SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFD(Cos)
|
||||
SPECIALIZED_UNARY_ELEMENTWISE_IMPL(Not, bool)
|
||||
|
||||
// When casting, half needs to be converted via float type from most other types
|
||||
|
|
@ -121,11 +123,11 @@ void Impl_Cast(
|
|||
const InT* input_data,
|
||||
OutT* output_data,
|
||||
size_t count) {
|
||||
UnaryElementWiseImpl(stream,
|
||||
input_data,
|
||||
output_data,
|
||||
OP_Cast<InT, OutT>(),
|
||||
count);
|
||||
UnaryElementWiseImpl(stream,
|
||||
input_data,
|
||||
output_data,
|
||||
OP_Cast<InT, OutT>(),
|
||||
count);
|
||||
}
|
||||
|
||||
#define SPECIALIZED_CAST_IMPL2(InT, OutT) \
|
||||
|
|
|
|||
|
|
@ -24,7 +24,9 @@ namespace cuda {
|
|||
UNARY_OP_NAME_EXPR(Log, _Log(a)) \
|
||||
UNARY_OP_NAME_EXPR(Erf, _Erf(a)) \
|
||||
UNARY_OP_NAME_EXPR(Not, !a) \
|
||||
UNARY_OP_NAME_EXPR(Round, _Round(a))
|
||||
UNARY_OP_NAME_EXPR(Round, _Round(a)) \
|
||||
UNARY_OP_NAME_EXPR(Sin, _Sin(a)) \
|
||||
UNARY_OP_NAME_EXPR(Cos, _Cos(a))
|
||||
|
||||
#define UNARY_ELEMENTWISE_IMPL_DECLARATION(name) \
|
||||
template <typename T> \
|
||||
|
|
|
|||
|
|
@ -771,6 +771,7 @@ Status ReduceKernel<allow_multi_axes>::ComputeImpl(OpKernelContext* ctx, cudnnRe
|
|||
}
|
||||
|
||||
SPECIALIZED_REDUCEKERNEL_COMPUTEIMPL(int32_t)
|
||||
SPECIALIZED_REDUCEKERNEL_COMPUTEIMPL(int64_t)
|
||||
SPECIALIZED_REDUCEKERNEL_COMPUTEIMPL(int8_t)
|
||||
SPECIALIZED_REDUCEKERNEL_COMPUTEIMPL(uint8_t)
|
||||
|
||||
|
|
@ -955,6 +956,7 @@ REGISTER_KERNEL_TYPED_12(ReduceMax, MLFloat16)
|
|||
REGISTER_KERNEL_TYPED_12(ReduceMax, float)
|
||||
REGISTER_KERNEL_TYPED_12(ReduceMax, double)
|
||||
REGISTER_KERNEL_TYPED_12(ReduceMax, int32_t)
|
||||
REGISTER_KERNEL_TYPED_12(ReduceMax, int64_t)
|
||||
REGISTER_KERNEL_TYPED_12(ReduceMax, int8_t)
|
||||
REGISTER_KERNEL_TYPED_12(ReduceMax, uint8_t)
|
||||
|
||||
|
|
@ -973,6 +975,7 @@ REGISTER_KERNEL_TYPED_13(ReduceSum, MLFloat16)
|
|||
REGISTER_KERNEL_TYPED_13(ReduceSum, float)
|
||||
REGISTER_KERNEL_TYPED_13(ReduceSum, double)
|
||||
REGISTER_KERNEL_TYPED_13(ReduceSum, int32_t)
|
||||
REGISTER_KERNEL_TYPED_13(ReduceSum, int64_t)
|
||||
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
|
||||
REGISTER_KERNEL_TYPED_13(ReduceSum, BFloat16)
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -76,6 +76,30 @@ __device__ __inline__ half _Round(half a) {
|
|||
return hrint(a);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __inline__ T _Cos(T a);
|
||||
|
||||
template <>
|
||||
__device__ __inline__ float _Cos(float a) { return cosf(a); }
|
||||
|
||||
template <>
|
||||
__device__ __inline__ double _Cos(double a) { return cos(a); }
|
||||
|
||||
template <>
|
||||
__device__ __inline__ half _Cos(half a) { return hcos(a); }
|
||||
|
||||
template <typename T>
|
||||
__device__ __inline__ T _Sin(T a);
|
||||
|
||||
template <>
|
||||
__device__ __inline__ float _Sin(float a) { return sinf(a); }
|
||||
|
||||
template <>
|
||||
__device__ __inline__ double _Sin(double a) { return sin(a); }
|
||||
|
||||
template <>
|
||||
__device__ __inline__ half _Sin(half a) { return hsin(a); }
|
||||
|
||||
template <typename T>
|
||||
__device__ __inline__ T _Exp(T a);
|
||||
|
||||
|
|
|
|||
|
|
@ -203,9 +203,9 @@ TEST(MathOpTest, Add_Broadcast_0x1) {
|
|||
test.AddInput<float>("B", {1}, {2.0f});
|
||||
test.AddOutput<float>("C", {1}, {12.0f});
|
||||
#if defined(OPENVINO_CONFIG_MYRIAD)
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider}); // OpenVINO: disabled temporarily on MYRIADX due to a bug
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider}); // OpenVINO: disabled temporarily on MYRIADX due to a bug
|
||||
#else
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess, "");
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess, "");
|
||||
#endif
|
||||
};
|
||||
|
||||
|
|
@ -221,9 +221,9 @@ TEST(MathOpTest, Add_Broadcast_1x0) {
|
|||
test.AddInput<float>("B", {}, {2.0f}, scalar_as_initializer);
|
||||
test.AddOutput<float>("C", {1}, {12.0f});
|
||||
#if defined(OPENVINO_CONFIG_MYRIAD)
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider}); // OpenVINO: disabled temporarily on MYRIADX due to a bug
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider}); // OpenVINO: disabled temporarily on MYRIADX due to a bug
|
||||
#else
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess, "");
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess, "");
|
||||
#endif
|
||||
};
|
||||
|
||||
|
|
@ -1864,7 +1864,8 @@ void TrigFloatTest(OpTester& test, std::initializer_list<float> input) {
|
|||
}
|
||||
|
||||
template <double (&op)(double value)>
|
||||
void TrigDoubleTest(OpTester& test, std::initializer_list<double> input) {
|
||||
void TrigDoubleTest(OpTester& test, std::initializer_list<double> input,
|
||||
const std::unordered_set<std::string> excluded_provider_types = {}) {
|
||||
std::vector<int64_t> dims{static_cast<int64_t>(input.size())};
|
||||
|
||||
std::vector<double> output;
|
||||
|
|
@ -1873,9 +1874,24 @@ void TrigDoubleTest(OpTester& test, std::initializer_list<double> input) {
|
|||
|
||||
test.AddInput<double>("X", dims, input);
|
||||
test.AddOutput<double>("Y", dims, output);
|
||||
test.Run();
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess, "", excluded_provider_types);
|
||||
}
|
||||
|
||||
template <float (&op)(float value)>
|
||||
void TrigFloat16Test(OpTester& test, std::initializer_list<float> input) {
|
||||
std::vector<int64_t> dims{static_cast<int64_t>(input.size())};
|
||||
|
||||
std::vector<MLFloat16> float16_input;
|
||||
std::vector<MLFloat16> float16_output;
|
||||
for (auto v : input) {
|
||||
float16_input.push_back(MLFloat16(math::floatToHalf(v)));
|
||||
float16_output.push_back(MLFloat16(math::floatToHalf(op(v))));
|
||||
}
|
||||
|
||||
test.AddInput<MLFloat16>("X", dims, float16_input);
|
||||
test.AddOutput<MLFloat16>("Y", dims, float16_output);
|
||||
test.Run();
|
||||
}
|
||||
TEST(MathOpTest, SinFloat) {
|
||||
OpTester test("Sin");
|
||||
TrigFloatTest<std::sin>(test, {1.1f, -1.1f, 2.2f, -2.2f});
|
||||
|
|
@ -1886,11 +1902,33 @@ TEST(MathOpTest, SinDouble) {
|
|||
TrigDoubleTest<std::sin>(test, {1.1, -1.1, 2.2, -2.2});
|
||||
}
|
||||
|
||||
TEST(MathOpTest, Cos) {
|
||||
TEST(MathOpTest, SinFloat16) {
|
||||
if (DefaultCudaExecutionProvider().get() != nullptr) { // MLFloat16 type not supported on CPU
|
||||
OpTester test("Sin");
|
||||
TrigFloat16Test<std::sin>(test, {1.1f, -1.1f, 2.2f, -2.2f});
|
||||
}
|
||||
}
|
||||
|
||||
TEST(MathOpTest, CosFloat) {
|
||||
OpTester test("Cos");
|
||||
TrigFloatTest<std::cos>(test, {1.1f, -1.1f, 2.2f, -2.2f});
|
||||
}
|
||||
|
||||
TEST(MathOpTest, CosDouble) {
|
||||
if (DefaultCudaExecutionProvider().get() != nullptr) { // double type not supported on CPU
|
||||
OpTester test("Cos");
|
||||
TrigDoubleTest<std::cos>(test, {1.1, -1.1, 2.2, -2.2}, {kTensorrtExecutionProvider});
|
||||
// Fails TensorRT unit-test because the unit tests only test one EP at a time and the TensorRT EP will not be able to find an implementation in the fall-back CPU EP,
|
||||
// so skip it
|
||||
}
|
||||
}
|
||||
|
||||
TEST(MathOpTest, CosFloat16) {
|
||||
if (DefaultCudaExecutionProvider().get() != nullptr) { // MLFloat16 type not supported on CPU
|
||||
OpTester test("Cos");
|
||||
TrigFloat16Test<std::cos>(test, {1.1f, -1.1f, 2.2f, -2.2f});
|
||||
}
|
||||
}
|
||||
TEST(MathOpTest, Tan) {
|
||||
OpTester test("Tan");
|
||||
TrigFloatTest<std::tan>(test, {-100.0f, -50.0f, 0.0f, 50.0f, 100.0f});
|
||||
|
|
|
|||
Loading…
Reference in a new issue