mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-29 23:06:41 +00:00
Implement EyeLike Cuda Kernel to improve performance (#2139)
This commit is contained in:
parent
ac9425c119
commit
90b6ff927f
5 changed files with 199 additions and 0 deletions
|
|
@ -543,6 +543,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
|
|||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, float, Less);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, double, Less);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, MLFloat16, Less);
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, EyeLike);
|
||||
|
||||
// opset 10
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, Dropout);
|
||||
|
|
@ -899,6 +900,7 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
|
|||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, MLFloat16, Less)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, float, RoiAlign)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, double, RoiAlign)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, EyeLike)>,
|
||||
|
||||
// opset 11
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, Gemm)>,
|
||||
|
|
|
|||
93
onnxruntime/core/providers/cuda/tensor/eye_like.cc
Normal file
93
onnxruntime/core/providers/cuda/tensor/eye_like.cc
Normal file
|
|
@ -0,0 +1,93 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include "eye_like.h"
|
||||
#include "eye_like_impl.h"
|
||||
#include "core/providers/cpu/tensor/utils.h"
|
||||
#include "core/framework/tensorprotoutils.h"
|
||||
#include "core/providers/cuda/shared_inc/fast_divmod.h"
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
||||
ONNX_OPERATOR_KERNEL_EX(
|
||||
EyeLike,
|
||||
kOnnxDomain,
|
||||
9,
|
||||
kCudaExecutionProvider,
|
||||
KernelDefBuilder().TypeConstraint("T1",
|
||||
std::vector<MLDataType>{
|
||||
DataTypeImpl::GetTensorType<float>(),
|
||||
DataTypeImpl::GetTensorType<double>(),
|
||||
DataTypeImpl::GetTensorType<uint64_t>(),
|
||||
DataTypeImpl::GetTensorType<int64_t>(),
|
||||
DataTypeImpl::GetTensorType<int32_t>()
|
||||
})
|
||||
.TypeConstraint("T2",
|
||||
std::vector<MLDataType>{
|
||||
DataTypeImpl::GetTensorType<float>(),
|
||||
DataTypeImpl::GetTensorType<double>(),
|
||||
DataTypeImpl::GetTensorType<uint64_t>(),
|
||||
DataTypeImpl::GetTensorType<int64_t>(),
|
||||
DataTypeImpl::GetTensorType<int32_t>()
|
||||
}),
|
||||
EyeLike);
|
||||
|
||||
#define TYPED_FUNCTION_CALL(T) \
|
||||
EyeLikeImpl<typename ToCudaType<T>::MappedType>( \
|
||||
offset, \
|
||||
dim1 + 1, \
|
||||
reinterpret_cast<typename ToCudaType<T>::MappedType *>(T2->template MutableData<T>()), \
|
||||
diag_count); \
|
||||
break;
|
||||
|
||||
Status EyeLike::ComputeInternal(OpKernelContext* context) const {
|
||||
const auto* T1 = context->Input<Tensor>(0);
|
||||
ORT_ENFORCE(T1 != nullptr);
|
||||
|
||||
const std::vector<int64_t>& input_dims = T1->Shape().GetDims();
|
||||
if (input_dims.size() != 2) {
|
||||
return Status(ONNXRUNTIME, INVALID_ARGUMENT, "EyeLike : Input tensor dimension is not 2");
|
||||
}
|
||||
|
||||
// set output tensor shape same as input tensor and set all values to zero
|
||||
auto* T2 = context->Output(0, input_dims);
|
||||
CUDA_RETURN_IF_ERROR(cudaMemsetAsync(T2->MutableDataRaw(), 0, T2->SizeInBytes()));
|
||||
auto dim0 = input_dims[0];
|
||||
auto dim1 = input_dims[1];
|
||||
|
||||
if ((k_ >= 0 && k_ >= dim1) || (k_ < 0 && (std::abs(k_)) >= dim0)) {
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Calculate the start offset and total number of elements in diagnal.
|
||||
size_t offset, diag_count;
|
||||
if (k_ >= 0) {
|
||||
offset = k_;
|
||||
diag_count = std::min(dim1 - k_, dim0);
|
||||
} else {
|
||||
offset = (-k_) * dim1;
|
||||
diag_count = std::min(dim0 + k_, dim1);
|
||||
}
|
||||
|
||||
auto output_tensor_dtype = has_dtype_ ? static_cast<ONNX_NAMESPACE::TensorProto::DataType>(dtype_) : utils::GetTensorProtoType(*T1);
|
||||
switch (output_tensor_dtype) {
|
||||
case ONNX_NAMESPACE::TensorProto_DataType_FLOAT:
|
||||
TYPED_FUNCTION_CALL(float)
|
||||
case ONNX_NAMESPACE::TensorProto_DataType_DOUBLE:
|
||||
TYPED_FUNCTION_CALL(double)
|
||||
case ONNX_NAMESPACE::TensorProto_DataType_INT32:
|
||||
TYPED_FUNCTION_CALL(int32_t)
|
||||
case ONNX_NAMESPACE::TensorProto_DataType_UINT64:
|
||||
TYPED_FUNCTION_CALL(uint64_t)
|
||||
case ONNX_NAMESPACE::TensorProto_DataType_INT64:
|
||||
TYPED_FUNCTION_CALL(int64_t)
|
||||
default:
|
||||
ORT_THROW("Unsupported 'dtype' value: ", output_tensor_dtype);
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace onnxruntime
|
||||
30
onnxruntime/core/providers/cuda/tensor/eye_like.h
Normal file
30
onnxruntime/core/providers/cuda/tensor/eye_like.h
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include "core/common/common.h"
|
||||
#include "core/framework/op_kernel.h"
|
||||
#include "core/providers/cuda/cuda_common.h"
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
||||
class EyeLike final : public CudaKernel {
|
||||
public:
|
||||
EyeLike(const OpKernelInfo& info) : CudaKernel(info) {
|
||||
if (!info.GetAttr("k", &k_).IsOK()) {
|
||||
k_ = 0;
|
||||
}
|
||||
|
||||
has_dtype_ = info.GetAttr("dtype", &dtype_).IsOK();
|
||||
}
|
||||
|
||||
Status ComputeInternal(OpKernelContext* context) const override;
|
||||
|
||||
private:
|
||||
bool has_dtype_;
|
||||
int64_t dtype_;
|
||||
int64_t k_;
|
||||
};
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace onnxruntime
|
||||
52
onnxruntime/core/providers/cuda/tensor/eye_like_impl.cu
Normal file
52
onnxruntime/core/providers/cuda/tensor/eye_like_impl.cu
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#pragma once
|
||||
#include <stdint.h>
|
||||
#include "core/providers/cuda/shared_inc/cuda_utils.h"
|
||||
#include "core/providers/cuda/cu_inc/common.cuh"
|
||||
#include "eye_like_impl.h"
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
||||
template <typename T>
|
||||
__global__ void _EyeLikeKernel(
|
||||
size_t offset,
|
||||
size_t stripe,
|
||||
T* output_data,
|
||||
CUDA_LONG N) {
|
||||
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
|
||||
|
||||
// offset is the first elements, stripe is width + 1.
|
||||
output_data[offset + id * stripe] = static_cast<T>(1);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void EyeLikeImpl(
|
||||
size_t offset,
|
||||
size_t stripe,
|
||||
T* output_data,
|
||||
size_t diag_count) {
|
||||
constexpr int block_size = 256;
|
||||
int blocksPerGrid = (int)(ceil(static_cast<float>(diag_count) / block_size));
|
||||
CUDA_LONG N = static_cast<CUDA_LONG>(diag_count);
|
||||
|
||||
_EyeLikeKernel<<<blocksPerGrid, block_size, 0>>>(offset, stripe, output_data, N);
|
||||
}
|
||||
|
||||
#define SPECIALIZED_IMPL(T) \
|
||||
template void EyeLikeImpl<T>( \
|
||||
size_t offset, \
|
||||
size_t stripe, \
|
||||
T* output_data, \
|
||||
size_t diag_count);
|
||||
|
||||
SPECIALIZED_IMPL(int32_t)
|
||||
SPECIALIZED_IMPL(int64_t)
|
||||
SPECIALIZED_IMPL(uint64_t)
|
||||
SPECIALIZED_IMPL(float)
|
||||
SPECIALIZED_IMPL(double)
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace onnxruntime
|
||||
22
onnxruntime/core/providers/cuda/tensor/eye_like_impl.h
Normal file
22
onnxruntime/core/providers/cuda/tensor/eye_like_impl.h
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#pragma once
|
||||
#include <stdint.h>
|
||||
#include "core/providers/cuda/shared_inc/cuda_utils.h"
|
||||
#include "core/providers/cuda/shared_inc/fast_divmod.h"
|
||||
#include "core/common/common.h"
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
||||
template <typename T>
|
||||
void EyeLikeImpl(
|
||||
size_t offset, // offset of first element in diagnal
|
||||
size_t stripe, // stripe, here it's width + 1
|
||||
T* output_data, // output buffer
|
||||
size_t diag_count // total number of elements in diagnal
|
||||
);
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace onnxruntime
|
||||
Loading…
Reference in a new issue