diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index f4edd3b787..cc548f6c4f 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -543,6 +543,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, float, Less); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, double, Less); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, MLFloat16, Less); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, EyeLike); // opset 10 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, Dropout); @@ -899,6 +900,7 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, // opset 11 BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/cuda/tensor/eye_like.cc b/onnxruntime/core/providers/cuda/tensor/eye_like.cc new file mode 100644 index 0000000000..9d78a67d80 --- /dev/null +++ b/onnxruntime/core/providers/cuda/tensor/eye_like.cc @@ -0,0 +1,93 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "eye_like.h" +#include "eye_like_impl.h" +#include "core/providers/cpu/tensor/utils.h" +#include "core/framework/tensorprotoutils.h" +#include "core/providers/cuda/shared_inc/fast_divmod.h" + +namespace onnxruntime { +namespace cuda { + +ONNX_OPERATOR_KERNEL_EX( + EyeLike, + kOnnxDomain, + 9, + kCudaExecutionProvider, + KernelDefBuilder().TypeConstraint("T1", + std::vector{ + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType() + }) + .TypeConstraint("T2", + std::vector{ + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType() + }), + EyeLike); + +#define TYPED_FUNCTION_CALL(T) \ + EyeLikeImpl::MappedType>( \ + offset, \ + dim1 + 1, \ + reinterpret_cast::MappedType *>(T2->template MutableData()), \ + diag_count); \ + break; + +Status EyeLike::ComputeInternal(OpKernelContext* context) const { + const auto* T1 = context->Input(0); + ORT_ENFORCE(T1 != nullptr); + + const std::vector& input_dims = T1->Shape().GetDims(); + if (input_dims.size() != 2) { + return Status(ONNXRUNTIME, INVALID_ARGUMENT, "EyeLike : Input tensor dimension is not 2"); + } + + // set output tensor shape same as input tensor and set all values to zero + auto* T2 = context->Output(0, input_dims); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(T2->MutableDataRaw(), 0, T2->SizeInBytes())); + auto dim0 = input_dims[0]; + auto dim1 = input_dims[1]; + + if ((k_ >= 0 && k_ >= dim1) || (k_ < 0 && (std::abs(k_)) >= dim0)) { + return Status::OK(); + } + + // Calculate the start offset and total number of elements in diagnal. + size_t offset, diag_count; + if (k_ >= 0) { + offset = k_; + diag_count = std::min(dim1 - k_, dim0); + } else { + offset = (-k_) * dim1; + diag_count = std::min(dim0 + k_, dim1); + } + + auto output_tensor_dtype = has_dtype_ ? static_cast(dtype_) : utils::GetTensorProtoType(*T1); + switch (output_tensor_dtype) { + case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: + TYPED_FUNCTION_CALL(float) + case ONNX_NAMESPACE::TensorProto_DataType_DOUBLE: + TYPED_FUNCTION_CALL(double) + case ONNX_NAMESPACE::TensorProto_DataType_INT32: + TYPED_FUNCTION_CALL(int32_t) + case ONNX_NAMESPACE::TensorProto_DataType_UINT64: + TYPED_FUNCTION_CALL(uint64_t) + case ONNX_NAMESPACE::TensorProto_DataType_INT64: + TYPED_FUNCTION_CALL(int64_t) + default: + ORT_THROW("Unsupported 'dtype' value: ", output_tensor_dtype); + } + + return Status::OK(); +} + +} // namespace cuda +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/eye_like.h b/onnxruntime/core/providers/cuda/tensor/eye_like.h new file mode 100644 index 0000000000..ed198ce62c --- /dev/null +++ b/onnxruntime/core/providers/cuda/tensor/eye_like.h @@ -0,0 +1,30 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/common/common.h" +#include "core/framework/op_kernel.h" +#include "core/providers/cuda/cuda_common.h" + +namespace onnxruntime { +namespace cuda { + +class EyeLike final : public CudaKernel { + public: + EyeLike(const OpKernelInfo& info) : CudaKernel(info) { + if (!info.GetAttr("k", &k_).IsOK()) { + k_ = 0; + } + + has_dtype_ = info.GetAttr("dtype", &dtype_).IsOK(); + } + + Status ComputeInternal(OpKernelContext* context) const override; + + private: + bool has_dtype_; + int64_t dtype_; + int64_t k_; +}; + +} // namespace cuda +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/eye_like_impl.cu b/onnxruntime/core/providers/cuda/tensor/eye_like_impl.cu new file mode 100644 index 0000000000..a20d0ea5e0 --- /dev/null +++ b/onnxruntime/core/providers/cuda/tensor/eye_like_impl.cu @@ -0,0 +1,52 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include +#include "core/providers/cuda/shared_inc/cuda_utils.h" +#include "core/providers/cuda/cu_inc/common.cuh" +#include "eye_like_impl.h" + +namespace onnxruntime { +namespace cuda { + +template +__global__ void _EyeLikeKernel( + size_t offset, + size_t stripe, + T* output_data, + CUDA_LONG N) { + CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); + + // offset is the first elements, stripe is width + 1. + output_data[offset + id * stripe] = static_cast(1); +} + +template +void EyeLikeImpl( + size_t offset, + size_t stripe, + T* output_data, + size_t diag_count) { + constexpr int block_size = 256; + int blocksPerGrid = (int)(ceil(static_cast(diag_count) / block_size)); + CUDA_LONG N = static_cast(diag_count); + + _EyeLikeKernel<<>>(offset, stripe, output_data, N); +} + +#define SPECIALIZED_IMPL(T) \ + template void EyeLikeImpl( \ + size_t offset, \ + size_t stripe, \ + T* output_data, \ + size_t diag_count); + +SPECIALIZED_IMPL(int32_t) +SPECIALIZED_IMPL(int64_t) +SPECIALIZED_IMPL(uint64_t) +SPECIALIZED_IMPL(float) +SPECIALIZED_IMPL(double) + +} // namespace cuda +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/providers/cuda/tensor/eye_like_impl.h b/onnxruntime/core/providers/cuda/tensor/eye_like_impl.h new file mode 100644 index 0000000000..f95ca63782 --- /dev/null +++ b/onnxruntime/core/providers/cuda/tensor/eye_like_impl.h @@ -0,0 +1,22 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include +#include "core/providers/cuda/shared_inc/cuda_utils.h" +#include "core/providers/cuda/shared_inc/fast_divmod.h" +#include "core/common/common.h" + +namespace onnxruntime { +namespace cuda { + +template +void EyeLikeImpl( + size_t offset, // offset of first element in diagnal + size_t stripe, // stripe, here it's width + 1 + T* output_data, // output buffer + size_t diag_count // total number of elements in diagnal +); + +} // namespace cuda +} // namespace onnxruntime