Implement EyeLike Cuda Kernel to improve performance (#2139)

This commit is contained in:
Nathan 2019-10-23 13:48:39 -07:00 committed by Changming Sun
parent ac9425c119
commit 90b6ff927f
5 changed files with 199 additions and 0 deletions

View file

@ -543,6 +543,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, float, Less);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, double, Less);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, MLFloat16, Less);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, EyeLike);
// opset 10
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, Dropout);
@ -899,6 +900,7 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, MLFloat16, Less)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, float, RoiAlign)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, double, RoiAlign)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, EyeLike)>,
// opset 11
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, Gemm)>,

View file

@ -0,0 +1,93 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "eye_like.h"
#include "eye_like_impl.h"
#include "core/providers/cpu/tensor/utils.h"
#include "core/framework/tensorprotoutils.h"
#include "core/providers/cuda/shared_inc/fast_divmod.h"
namespace onnxruntime {
namespace cuda {
ONNX_OPERATOR_KERNEL_EX(
EyeLike,
kOnnxDomain,
9,
kCudaExecutionProvider,
KernelDefBuilder().TypeConstraint("T1",
std::vector<MLDataType>{
DataTypeImpl::GetTensorType<float>(),
DataTypeImpl::GetTensorType<double>(),
DataTypeImpl::GetTensorType<uint64_t>(),
DataTypeImpl::GetTensorType<int64_t>(),
DataTypeImpl::GetTensorType<int32_t>()
})
.TypeConstraint("T2",
std::vector<MLDataType>{
DataTypeImpl::GetTensorType<float>(),
DataTypeImpl::GetTensorType<double>(),
DataTypeImpl::GetTensorType<uint64_t>(),
DataTypeImpl::GetTensorType<int64_t>(),
DataTypeImpl::GetTensorType<int32_t>()
}),
EyeLike);
#define TYPED_FUNCTION_CALL(T) \
EyeLikeImpl<typename ToCudaType<T>::MappedType>( \
offset, \
dim1 + 1, \
reinterpret_cast<typename ToCudaType<T>::MappedType *>(T2->template MutableData<T>()), \
diag_count); \
break;
Status EyeLike::ComputeInternal(OpKernelContext* context) const {
const auto* T1 = context->Input<Tensor>(0);
ORT_ENFORCE(T1 != nullptr);
const std::vector<int64_t>& input_dims = T1->Shape().GetDims();
if (input_dims.size() != 2) {
return Status(ONNXRUNTIME, INVALID_ARGUMENT, "EyeLike : Input tensor dimension is not 2");
}
// set output tensor shape same as input tensor and set all values to zero
auto* T2 = context->Output(0, input_dims);
CUDA_RETURN_IF_ERROR(cudaMemsetAsync(T2->MutableDataRaw(), 0, T2->SizeInBytes()));
auto dim0 = input_dims[0];
auto dim1 = input_dims[1];
if ((k_ >= 0 && k_ >= dim1) || (k_ < 0 && (std::abs(k_)) >= dim0)) {
return Status::OK();
}
// Calculate the start offset and total number of elements in diagnal.
size_t offset, diag_count;
if (k_ >= 0) {
offset = k_;
diag_count = std::min(dim1 - k_, dim0);
} else {
offset = (-k_) * dim1;
diag_count = std::min(dim0 + k_, dim1);
}
auto output_tensor_dtype = has_dtype_ ? static_cast<ONNX_NAMESPACE::TensorProto::DataType>(dtype_) : utils::GetTensorProtoType(*T1);
switch (output_tensor_dtype) {
case ONNX_NAMESPACE::TensorProto_DataType_FLOAT:
TYPED_FUNCTION_CALL(float)
case ONNX_NAMESPACE::TensorProto_DataType_DOUBLE:
TYPED_FUNCTION_CALL(double)
case ONNX_NAMESPACE::TensorProto_DataType_INT32:
TYPED_FUNCTION_CALL(int32_t)
case ONNX_NAMESPACE::TensorProto_DataType_UINT64:
TYPED_FUNCTION_CALL(uint64_t)
case ONNX_NAMESPACE::TensorProto_DataType_INT64:
TYPED_FUNCTION_CALL(int64_t)
default:
ORT_THROW("Unsupported 'dtype' value: ", output_tensor_dtype);
}
return Status::OK();
}
} // namespace cuda
} // namespace onnxruntime

View file

@ -0,0 +1,30 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/common/common.h"
#include "core/framework/op_kernel.h"
#include "core/providers/cuda/cuda_common.h"
namespace onnxruntime {
namespace cuda {
class EyeLike final : public CudaKernel {
public:
EyeLike(const OpKernelInfo& info) : CudaKernel(info) {
if (!info.GetAttr("k", &k_).IsOK()) {
k_ = 0;
}
has_dtype_ = info.GetAttr("dtype", &dtype_).IsOK();
}
Status ComputeInternal(OpKernelContext* context) const override;
private:
bool has_dtype_;
int64_t dtype_;
int64_t k_;
};
} // namespace cuda
} // namespace onnxruntime

View file

@ -0,0 +1,52 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/cuda/shared_inc/cuda_utils.h"
#include "core/providers/cuda/cu_inc/common.cuh"
#include "eye_like_impl.h"
namespace onnxruntime {
namespace cuda {
template <typename T>
__global__ void _EyeLikeKernel(
size_t offset,
size_t stripe,
T* output_data,
CUDA_LONG N) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
// offset is the first elements, stripe is width + 1.
output_data[offset + id * stripe] = static_cast<T>(1);
}
template <typename T>
void EyeLikeImpl(
size_t offset,
size_t stripe,
T* output_data,
size_t diag_count) {
constexpr int block_size = 256;
int blocksPerGrid = (int)(ceil(static_cast<float>(diag_count) / block_size));
CUDA_LONG N = static_cast<CUDA_LONG>(diag_count);
_EyeLikeKernel<<<blocksPerGrid, block_size, 0>>>(offset, stripe, output_data, N);
}
#define SPECIALIZED_IMPL(T) \
template void EyeLikeImpl<T>( \
size_t offset, \
size_t stripe, \
T* output_data, \
size_t diag_count);
SPECIALIZED_IMPL(int32_t)
SPECIALIZED_IMPL(int64_t)
SPECIALIZED_IMPL(uint64_t)
SPECIALIZED_IMPL(float)
SPECIALIZED_IMPL(double)
} // namespace cuda
} // namespace onnxruntime

View file

@ -0,0 +1,22 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/cuda/shared_inc/cuda_utils.h"
#include "core/providers/cuda/shared_inc/fast_divmod.h"
#include "core/common/common.h"
namespace onnxruntime {
namespace cuda {
template <typename T>
void EyeLikeImpl(
size_t offset, // offset of first element in diagnal
size_t stripe, // stripe, here it's width + 1
T* output_data, // output buffer
size_t diag_count // total number of elements in diagnal
);
} // namespace cuda
} // namespace onnxruntime