diff --git a/onnxruntime/core/providers/cuda/cu_inc/common.cuh b/onnxruntime/core/providers/cuda/cu_inc/common.cuh index 7d5cd9f397..e4c0b5f6fb 100644 --- a/onnxruntime/core/providers/cuda/cu_inc/common.cuh +++ b/onnxruntime/core/providers/cuda/cu_inc/common.cuh @@ -133,7 +133,7 @@ template <> __device__ __inline__ double _Round(double a) { return rint(a); } template <> -__device__ __inline__ half _Round(half a) { +__device__ __inline__ half _Round(half a) { #if __CUDA_ARCH__ < 530 return half(rintf((float)a)); #else @@ -141,6 +141,42 @@ __device__ __inline__ half _Round(half a) { #endif } +template +__device__ __inline__ T _Cos(T a); + +template <> +__device__ __inline__ float _Cos(float a) { return cosf(a); } + +template <> +__device__ __inline__ double _Cos(double a) { return cos(a); } + +template <> +__device__ __inline__ half _Cos(half a) { +#if __CUDA_ARCH__ < 530 + return half(cosf((float)a)); +#else + return hcos(a); +#endif +} + +template +__device__ __inline__ T _Sin(T a); + +template <> +__device__ __inline__ float _Sin(float a) { return sinf(a); } + +template <> +__device__ __inline__ double _Sin(double a) { return sin(a); } + +template <> +__device__ __inline__ half _Sin(half a) { +#if __CUDA_ARCH__ < 530 + return half(sinf((float)a)); +#else + return hsin(a); +#endif +} + template __device__ __inline__ T _Exp(T a); @@ -270,10 +306,9 @@ struct GridDim { }; }; - -#define CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N) \ - CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x; \ - if (id >= N) \ +#define CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N) \ + CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x; \ + if (id >= N) \ return; // CUDA_KERNEL_ASSERT is a macro that wraps an assert() call inside cuda kernels. @@ -281,16 +316,15 @@ struct GridDim { // See http://docs.nvidia.com/cuda/cuda-c-programming-guide/#assertion #if defined(__APPLE__) || defined(__HIP_PLATFORM_HCC__) #define CUDA_KERNEL_ASSERT(...) -#else // __APPLE__ +#else // __APPLE__ #define CUDA_KERNEL_ASSERT(...) assert(__VA_ARGS__) -#endif // __APPLE__ +#endif // __APPLE__ // WARP related definitions and functions constexpr int GPU_WARP_SIZE = 32; template -__device__ __forceinline__ T WARP_SHFL(T value, int srcLane, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff) -{ +__device__ __forceinline__ T WARP_SHFL(T value, int srcLane, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff) { #if CUDA_VERSION >= 9000 return __shfl_sync(mask, value, srcLane, width); #else @@ -299,8 +333,7 @@ __device__ __forceinline__ T WARP_SHFL(T value, int srcLane, int width = GPU_WAR } template -__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff) -{ +__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff) { #if CUDA_VERSION >= 9000 return __shfl_xor_sync(mask, value, laneMask, width); #else @@ -309,8 +342,7 @@ __device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = GP } template -__device__ __forceinline__ T WARP_SHFL_UP(T value, unsigned int delta, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff) -{ +__device__ __forceinline__ T WARP_SHFL_UP(T value, unsigned int delta, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff) { #if CUDA_VERSION >= 9000 return __shfl_up_sync(mask, value, delta, width); #else @@ -319,8 +351,7 @@ __device__ __forceinline__ T WARP_SHFL_UP(T value, unsigned int delta, int width } template -__device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff) -{ +__device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff) { #if CUDA_VERSION >= 9000 return __shfl_down_sync(mask, value, delta, width); #else diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index ff542b78cb..782a46fc32 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -294,6 +294,12 @@ namespace cuda { // opset 1 to 9 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MemcpyFromHost); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MemcpyToHost); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, float, Cos); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, double, Cos); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, MLFloat16, Cos); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, float, Sin); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, double, Sin); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, MLFloat16, Sin); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 4, 10, Concat); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, Unsqueeze); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 8, Flatten); @@ -496,6 +502,7 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kO class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, double, ReduceMax); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, MLFloat16, ReduceMax); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, int32_t, ReduceMax); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, int64_t, ReduceMax); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, float, ReduceMean); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, double, ReduceMean); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, MLFloat16, ReduceMean); @@ -512,6 +519,7 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kO class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, double, ReduceSum); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, MLFloat16, ReduceSum); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, int32_t, ReduceSum); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, int64_t, ReduceSum); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, float, ReduceLogSum); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, double, ReduceLogSum); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, MLFloat16, ReduceLogSum); @@ -685,6 +693,7 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kO class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 11, double, ReduceMax); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 11, MLFloat16, ReduceMax); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 11, int32_t, ReduceMax); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 11, int64_t, ReduceMax); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 12, float, ReduceMean); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 12, double, ReduceMean); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 12, MLFloat16, ReduceMean); @@ -701,6 +710,7 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kO class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 12, double, ReduceSum); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 12, MLFloat16, ReduceSum); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 12, int32_t, ReduceSum); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 12, int64_t, ReduceSum); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 12, float, ReduceSumSquare); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 12, double, ReduceSumSquare); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, 12, MLFloat16, ReduceSumSquare); @@ -778,6 +788,7 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kO class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, 12, double, ReduceMax); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, 12, MLFloat16, ReduceMax); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, 12, int32_t, ReduceMax); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, 12, int64_t, ReduceMax); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, 12, int8_t, ReduceMax); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, 12, uint8_t, ReduceMax); @@ -959,6 +970,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, double, ReduceMax); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, MLFloat16, ReduceMax); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int32_t, ReduceMax); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int64_t, ReduceMax); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int8_t, ReduceMax); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, uint8_t, ReduceMax); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, float, ReduceMean); @@ -979,6 +991,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, double, ReduceSum); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, MLFloat16, ReduceSum); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int32_t, ReduceSum); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int64_t, ReduceSum); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, float, ReduceSumSquare); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, double, ReduceSumSquare); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, MLFloat16, ReduceSumSquare); @@ -1033,6 +1046,12 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -1229,6 +1248,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -1245,6 +1265,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -1422,6 +1443,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -1438,6 +1460,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -1511,6 +1534,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -1692,6 +1716,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -1712,6 +1737,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -2044,4 +2070,4 @@ void CUDAExecutionProvider::RegisterAllocator(std::shared_ptr TryInsertAllocator(cuda_cpu_alloc); } -} // namespace onnxruntime +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc index 53220ae131..5a235749a9 100644 --- a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc +++ b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc @@ -58,7 +58,7 @@ Status UnaryElementwise::Prepare(OpKernelContext* context, UnaryElementwisePrepa return Status::OK(); \ } -#define UNARY_OP_VERSIONED_TYPED(name, startver, endver, T) \ +#define UNARY_OP_VERSIONED_TYPED(name, startver, endver, T) \ UNARY_ELEMENTWISE_REGISTER_VERSIONED_KERNEL(name, startver, endver, T) #define UNARY_OP_TYPED(name, ver, T) \ @@ -143,6 +143,8 @@ UNARY_OP_HFD(Erf, 13) UNARY_LOGICALOP_TYPED(Not, 1, bool) UNARY_OP_HFD(Round, 11) +UNARY_OP_HFD(Cos, 7) +UNARY_OP_HFD(Sin, 7) } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.h b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.h index 28fb96db4a..3ff97a6011 100644 --- a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.h +++ b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.h @@ -98,5 +98,19 @@ class Round final : public UnaryElementwise { Status ComputeInternal(OpKernelContext* context) const override; }; +template +class Sin final : public UnaryElementwise { + public: + Sin(const OpKernelInfo& info) : UnaryElementwise(info) {} + Status ComputeInternal(OpKernelContext* context) const override; +}; + +template +class Cos final : public UnaryElementwise { + public: + Cos(const OpKernelInfo& info) : UnaryElementwise(info) {} + Status ComputeInternal(OpKernelContext* context) const override; +}; + } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.cu b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.cu index 5b5102938d..66d50dafa5 100644 --- a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.cu +++ b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.cu @@ -79,6 +79,8 @@ SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFD(Log) SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFD(Exp) SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFD(Erf) SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFD(Round) +SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFD(Sin) +SPECIALIZED_UNARY_ELEMENTWISE_IMPL_HFD(Cos) SPECIALIZED_UNARY_ELEMENTWISE_IMPL(Not, bool) // When casting, half needs to be converted via float type from most other types @@ -121,11 +123,11 @@ void Impl_Cast( const InT* input_data, OutT* output_data, size_t count) { - UnaryElementWiseImpl(stream, - input_data, - output_data, - OP_Cast(), - count); + UnaryElementWiseImpl(stream, + input_data, + output_data, + OP_Cast(), + count); } #define SPECIALIZED_CAST_IMPL2(InT, OutT) \ diff --git a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.h b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.h index 2b28886386..5467d81f15 100644 --- a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.h +++ b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.h @@ -24,7 +24,9 @@ namespace cuda { UNARY_OP_NAME_EXPR(Log, _Log(a)) \ UNARY_OP_NAME_EXPR(Erf, _Erf(a)) \ UNARY_OP_NAME_EXPR(Not, !a) \ - UNARY_OP_NAME_EXPR(Round, _Round(a)) + UNARY_OP_NAME_EXPR(Round, _Round(a)) \ + UNARY_OP_NAME_EXPR(Sin, _Sin(a)) \ + UNARY_OP_NAME_EXPR(Cos, _Cos(a)) #define UNARY_ELEMENTWISE_IMPL_DECLARATION(name) \ template \ diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc index 612dee590a..214d274baa 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc @@ -771,6 +771,7 @@ Status ReduceKernel::ComputeImpl(OpKernelContext* ctx, cudnnRe } SPECIALIZED_REDUCEKERNEL_COMPUTEIMPL(int32_t) +SPECIALIZED_REDUCEKERNEL_COMPUTEIMPL(int64_t) SPECIALIZED_REDUCEKERNEL_COMPUTEIMPL(int8_t) SPECIALIZED_REDUCEKERNEL_COMPUTEIMPL(uint8_t) @@ -955,6 +956,7 @@ REGISTER_KERNEL_TYPED_12(ReduceMax, MLFloat16) REGISTER_KERNEL_TYPED_12(ReduceMax, float) REGISTER_KERNEL_TYPED_12(ReduceMax, double) REGISTER_KERNEL_TYPED_12(ReduceMax, int32_t) +REGISTER_KERNEL_TYPED_12(ReduceMax, int64_t) REGISTER_KERNEL_TYPED_12(ReduceMax, int8_t) REGISTER_KERNEL_TYPED_12(ReduceMax, uint8_t) @@ -973,6 +975,7 @@ REGISTER_KERNEL_TYPED_13(ReduceSum, MLFloat16) REGISTER_KERNEL_TYPED_13(ReduceSum, float) REGISTER_KERNEL_TYPED_13(ReduceSum, double) REGISTER_KERNEL_TYPED_13(ReduceSum, int32_t) +REGISTER_KERNEL_TYPED_13(ReduceSum, int64_t) #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 REGISTER_KERNEL_TYPED_13(ReduceSum, BFloat16) #endif diff --git a/onnxruntime/core/providers/rocm/cu_inc/common.cuh b/onnxruntime/core/providers/rocm/cu_inc/common.cuh index 36df3a5022..0d5ffee4ef 100644 --- a/onnxruntime/core/providers/rocm/cu_inc/common.cuh +++ b/onnxruntime/core/providers/rocm/cu_inc/common.cuh @@ -76,6 +76,30 @@ __device__ __inline__ half _Round(half a) { return hrint(a); } +template +__device__ __inline__ T _Cos(T a); + +template <> +__device__ __inline__ float _Cos(float a) { return cosf(a); } + +template <> +__device__ __inline__ double _Cos(double a) { return cos(a); } + +template <> +__device__ __inline__ half _Cos(half a) { return hcos(a); } + +template +__device__ __inline__ T _Sin(T a); + +template <> +__device__ __inline__ float _Sin(float a) { return sinf(a); } + +template <> +__device__ __inline__ double _Sin(double a) { return sin(a); } + +template <> +__device__ __inline__ half _Sin(half a) { return hsin(a); } + template __device__ __inline__ T _Exp(T a); diff --git a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc index c373ed45a0..7d713357f5 100644 --- a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc +++ b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc @@ -203,9 +203,9 @@ TEST(MathOpTest, Add_Broadcast_0x1) { test.AddInput("B", {1}, {2.0f}); test.AddOutput("C", {1}, {12.0f}); #if defined(OPENVINO_CONFIG_MYRIAD) - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider}); // OpenVINO: disabled temporarily on MYRIADX due to a bug + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider}); // OpenVINO: disabled temporarily on MYRIADX due to a bug #else - test.Run(OpTester::ExpectResult::kExpectSuccess, ""); + test.Run(OpTester::ExpectResult::kExpectSuccess, ""); #endif }; @@ -221,9 +221,9 @@ TEST(MathOpTest, Add_Broadcast_1x0) { test.AddInput("B", {}, {2.0f}, scalar_as_initializer); test.AddOutput("C", {1}, {12.0f}); #if defined(OPENVINO_CONFIG_MYRIAD) - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider}); // OpenVINO: disabled temporarily on MYRIADX due to a bug + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider}); // OpenVINO: disabled temporarily on MYRIADX due to a bug #else - test.Run(OpTester::ExpectResult::kExpectSuccess, ""); + test.Run(OpTester::ExpectResult::kExpectSuccess, ""); #endif }; @@ -1864,7 +1864,8 @@ void TrigFloatTest(OpTester& test, std::initializer_list input) { } template -void TrigDoubleTest(OpTester& test, std::initializer_list input) { +void TrigDoubleTest(OpTester& test, std::initializer_list input, + const std::unordered_set excluded_provider_types = {}) { std::vector dims{static_cast(input.size())}; std::vector output; @@ -1873,9 +1874,24 @@ void TrigDoubleTest(OpTester& test, std::initializer_list input) { test.AddInput("X", dims, input); test.AddOutput("Y", dims, output); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", excluded_provider_types); } +template +void TrigFloat16Test(OpTester& test, std::initializer_list input) { + std::vector dims{static_cast(input.size())}; + + std::vector float16_input; + std::vector float16_output; + for (auto v : input) { + float16_input.push_back(MLFloat16(math::floatToHalf(v))); + float16_output.push_back(MLFloat16(math::floatToHalf(op(v)))); + } + + test.AddInput("X", dims, float16_input); + test.AddOutput("Y", dims, float16_output); + test.Run(); +} TEST(MathOpTest, SinFloat) { OpTester test("Sin"); TrigFloatTest(test, {1.1f, -1.1f, 2.2f, -2.2f}); @@ -1886,11 +1902,33 @@ TEST(MathOpTest, SinDouble) { TrigDoubleTest(test, {1.1, -1.1, 2.2, -2.2}); } -TEST(MathOpTest, Cos) { +TEST(MathOpTest, SinFloat16) { + if (DefaultCudaExecutionProvider().get() != nullptr) { // MLFloat16 type not supported on CPU + OpTester test("Sin"); + TrigFloat16Test(test, {1.1f, -1.1f, 2.2f, -2.2f}); + } +} + +TEST(MathOpTest, CosFloat) { OpTester test("Cos"); TrigFloatTest(test, {1.1f, -1.1f, 2.2f, -2.2f}); } +TEST(MathOpTest, CosDouble) { + if (DefaultCudaExecutionProvider().get() != nullptr) { // double type not supported on CPU + OpTester test("Cos"); + TrigDoubleTest(test, {1.1, -1.1, 2.2, -2.2}, {kTensorrtExecutionProvider}); + // Fails TensorRT unit-test because the unit tests only test one EP at a time and the TensorRT EP will not be able to find an implementation in the fall-back CPU EP, + // so skip it + } +} + +TEST(MathOpTest, CosFloat16) { + if (DefaultCudaExecutionProvider().get() != nullptr) { // MLFloat16 type not supported on CPU + OpTester test("Cos"); + TrigFloat16Test(test, {1.1f, -1.1f, 2.2f, -2.2f}); + } +} TEST(MathOpTest, Tan) { OpTester test("Tan"); TrigFloatTest(test, {-100.0f, -50.0f, 0.0f, 50.0f, 100.0f});