Implement EyeLike Cuda Kernel to improve performance (#2139)

2026-07-18 18:52:16 +00:00 · 2019-10-23 13:48:39 -07:00 · 2019-10-23 13:48:39 -07:00 · 90b6ff927f
commit 90b6ff927f
parent ac9425c119
5 changed files with 199 additions and 0 deletions
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@ -543,6 +543,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, float, Less);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, double, Less);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, MLFloat16, Less);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, EyeLike);

 // opset 10
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, Dropout);
@ -899,6 +900,7 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, MLFloat16, Less)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, float, RoiAlign)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, double, RoiAlign)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, EyeLike)>,

      // opset 11
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, Gemm)>,
--- a/onnxruntime/core/providers/cuda/tensor/eye_like.cc
+++ b/onnxruntime/core/providers/cuda/tensor/eye_like.cc
@ -0,0 +1,93 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "eye_like.h"
+#include "eye_like_impl.h"
+#include "core/providers/cpu/tensor/utils.h"
+#include "core/framework/tensorprotoutils.h"
+#include "core/providers/cuda/shared_inc/fast_divmod.h"
+
+namespace onnxruntime {
+namespace cuda {
+
+ONNX_OPERATOR_KERNEL_EX(
+    EyeLike,
+    kOnnxDomain,
+    9,
+    kCudaExecutionProvider,
+    KernelDefBuilder().TypeConstraint("T1",
+                                      std::vector<MLDataType>{
+                                          DataTypeImpl::GetTensorType<float>(),
+                                          DataTypeImpl::GetTensorType<double>(),
+                                          DataTypeImpl::GetTensorType<uint64_t>(),
+                                          DataTypeImpl::GetTensorType<int64_t>(),
+                                          DataTypeImpl::GetTensorType<int32_t>()
+                                      })
+                        .TypeConstraint("T2",
+                                        std::vector<MLDataType>{
+                                            DataTypeImpl::GetTensorType<float>(),
+                                            DataTypeImpl::GetTensorType<double>(),
+                                            DataTypeImpl::GetTensorType<uint64_t>(),
+                                            DataTypeImpl::GetTensorType<int64_t>(),
+                                            DataTypeImpl::GetTensorType<int32_t>()
+                                        }),
+    EyeLike);
+
+#define TYPED_FUNCTION_CALL(T)                                                                 \
+    EyeLikeImpl<typename ToCudaType<T>::MappedType>(                                           \
+      offset,                                                                                  \
+      dim1 + 1,                                                                                \
+      reinterpret_cast<typename ToCudaType<T>::MappedType *>(T2->template MutableData<T>()),   \
+      diag_count);                                                                             \
+      break;
+
+Status EyeLike::ComputeInternal(OpKernelContext* context) const {
+  const auto* T1 = context->Input<Tensor>(0);
+  ORT_ENFORCE(T1 != nullptr);
+
+  const std::vector<int64_t>& input_dims = T1->Shape().GetDims();
+  if (input_dims.size() != 2) {
+    return Status(ONNXRUNTIME, INVALID_ARGUMENT, "EyeLike : Input tensor dimension is not 2");
+  }
+
+  // set output tensor shape same as input tensor and set all values to zero
+  auto* T2 = context->Output(0, input_dims);
+  CUDA_RETURN_IF_ERROR(cudaMemsetAsync(T2->MutableDataRaw(), 0, T2->SizeInBytes()));
+  auto dim0 = input_dims[0];
+  auto dim1 = input_dims[1];
+
+  if ((k_ >= 0 && k_ >= dim1) || (k_ < 0 && (std::abs(k_)) >= dim0)) {
+    return Status::OK();
+  }
+
+  // Calculate the start offset and total number of elements in diagnal.
+  size_t offset, diag_count;
+  if (k_ >= 0) {
+    offset = k_;
+    diag_count = std::min(dim1 - k_, dim0);
+  } else {
+    offset = (-k_) * dim1;
+    diag_count = std::min(dim0 + k_, dim1);
+  }
+
+  auto output_tensor_dtype = has_dtype_ ? static_cast<ONNX_NAMESPACE::TensorProto::DataType>(dtype_) : utils::GetTensorProtoType(*T1);
+  switch (output_tensor_dtype) {
+    case ONNX_NAMESPACE::TensorProto_DataType_FLOAT:
+      TYPED_FUNCTION_CALL(float)
+    case ONNX_NAMESPACE::TensorProto_DataType_DOUBLE:
+      TYPED_FUNCTION_CALL(double)
+    case ONNX_NAMESPACE::TensorProto_DataType_INT32:
+      TYPED_FUNCTION_CALL(int32_t)
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT64:
+      TYPED_FUNCTION_CALL(uint64_t)
+    case ONNX_NAMESPACE::TensorProto_DataType_INT64:
+      TYPED_FUNCTION_CALL(int64_t)
+    default:
+      ORT_THROW("Unsupported 'dtype' value: ", output_tensor_dtype);
+  }
+
+  return Status::OK();
+}
+
+}  // namespace cuda
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/cuda/tensor/eye_like.h
+++ b/onnxruntime/core/providers/cuda/tensor/eye_like.h
@ -0,0 +1,30 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/common/common.h"
+#include "core/framework/op_kernel.h"
+#include "core/providers/cuda/cuda_common.h"
+
+namespace onnxruntime {
+namespace cuda {
+
+class EyeLike final : public CudaKernel {
+ public:
+  EyeLike(const OpKernelInfo& info) : CudaKernel(info) {
+    if (!info.GetAttr("k", &k_).IsOK()) {
+      k_ = 0;
+    }
+
+    has_dtype_ = info.GetAttr("dtype", &dtype_).IsOK();
+  }
+
+  Status ComputeInternal(OpKernelContext* context) const override;
+
+ private:
+  bool has_dtype_;
+  int64_t dtype_;
+  int64_t k_;
+};
+
+}  // namespace cuda
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/cuda/tensor/eye_like_impl.cu
+++ b/onnxruntime/core/providers/cuda/tensor/eye_like_impl.cu
@ -0,0 +1,52 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+#include <stdint.h>
+#include "core/providers/cuda/shared_inc/cuda_utils.h"
+#include "core/providers/cuda/cu_inc/common.cuh"
+#include "eye_like_impl.h"
+
+namespace onnxruntime {
+namespace cuda {
+
+template <typename T>
+__global__ void _EyeLikeKernel(
+    size_t offset,
+    size_t stripe,
+    T* output_data,
+    CUDA_LONG N) {
+  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
+
+  // offset is the first elements, stripe is width + 1.
+  output_data[offset + id * stripe] = static_cast<T>(1);
+}
+
+template <typename T>
+void EyeLikeImpl(
+    size_t offset,
+    size_t stripe,
+    T* output_data,
+    size_t diag_count) {
+  constexpr int block_size = 256;
+  int blocksPerGrid = (int)(ceil(static_cast<float>(diag_count) / block_size));
+  CUDA_LONG N = static_cast<CUDA_LONG>(diag_count);
+
+  _EyeLikeKernel<<<blocksPerGrid, block_size, 0>>>(offset, stripe, output_data, N);
+}
+
+#define SPECIALIZED_IMPL(T)                                          \
+  template void EyeLikeImpl<T>(                                      \
+    size_t offset,                                                   \
+    size_t stripe,                                                   \
+    T* output_data,                                                  \
+    size_t diag_count);
+
+SPECIALIZED_IMPL(int32_t)
+SPECIALIZED_IMPL(int64_t)
+SPECIALIZED_IMPL(uint64_t)
+SPECIALIZED_IMPL(float)
+SPECIALIZED_IMPL(double)
+
+}  // namespace cuda
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/cuda/tensor/eye_like_impl.h
+++ b/onnxruntime/core/providers/cuda/tensor/eye_like_impl.h
@ -0,0 +1,22 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+#include <stdint.h>
+#include "core/providers/cuda/shared_inc/cuda_utils.h"
+#include "core/providers/cuda/shared_inc/fast_divmod.h"
+#include "core/common/common.h"
+
+namespace onnxruntime {
+namespace cuda {
+
+template <typename T>
+void EyeLikeImpl(
+    size_t offset,                  // offset of first element in diagnal
+    size_t stripe,                  // stripe, here it's width + 1
+    T* output_data,                 // output buffer
+    size_t diag_count               // total number of elements in diagnal
+);
+
+}  // namespace cuda
+}  // namespace onnxruntime