From 18b192a45bf7ca3752cecd074786e58d8f4d3719 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Sat, 19 Oct 2019 18:28:52 -0700 Subject: [PATCH] Implement Range Cuda Kernel to improve performance (#2148) --- .../providers/cuda/cuda_execution_provider.cc | 2 + .../core/providers/cuda/generator/range.cc | 107 ++++++++++++++++++ .../core/providers/cuda/generator/range.h | 22 ++++ .../providers/cuda/generator/range_impl.cu | 42 +++++++ .../providers/cuda/generator/range_impl.h | 16 +++ 5 files changed, 189 insertions(+) create mode 100644 onnxruntime/core/providers/cuda/generator/range.cc create mode 100644 onnxruntime/core/providers/cuda/generator/range.h create mode 100644 onnxruntime/core/providers/cuda/generator/range_impl.cu create mode 100644 onnxruntime/core/providers/cuda/generator/range_impl.h diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 6420e8aebb..b22304fd1f 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -546,6 +546,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, Gemm); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, Gemm); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, Gemm); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, Range); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, 10, Scatter); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, ScatterElements); @@ -894,6 +895,7 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/cuda/generator/range.cc b/onnxruntime/core/providers/cuda/generator/range.cc new file mode 100644 index 0000000000..6fa25b14d3 --- /dev/null +++ b/onnxruntime/core/providers/cuda/generator/range.cc @@ -0,0 +1,107 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/framework/tensorprotoutils.h" +#include "core/providers/cuda/cuda_common.h" +#include "range.h" +#include "range_impl.h" + +using namespace onnxruntime::cuda; +using namespace ::onnxruntime::common; +using namespace ONNX_NAMESPACE; + +namespace onnxruntime { +namespace cuda { + +ONNX_OPERATOR_KERNEL_EX( + Range, + kOnnxDomain, + 11, + kCudaExecutionProvider, + KernelDefBuilder().TypeConstraint("T", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), + Range); + +template +static Status ComputeRange(OpKernelContext* ctx) { + const auto& start_tensor = *ctx->Input(0); + const auto& limit_tensor = *ctx->Input(1); + const auto* delta_tensor_ptr = ctx->Input(2); + + if (!start_tensor.Shape().IsScalar()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "start in Range operator should be scalar like tensor, yet got shape:", + start_tensor.Shape()); + } + if (!limit_tensor.Shape().IsScalar()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "limit in Range operator should be scalar like tensor, yet got shape:", + limit_tensor.Shape()); + } + if (delta_tensor_ptr != nullptr && !delta_tensor_ptr->Shape().IsScalar()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "delta in Range operator should be scalar like tensor, yet got shape:", + delta_tensor_ptr->Shape()); + } + + // Start, Limit and Delta are stored in GPU. So we need copy it to CPU to read. + // It is better to store these tensors in pinned memory or CPU for better performance. + T start; + CUDA_RETURN_IF_ERROR(cudaMemcpy(&start, start_tensor.template Data(), sizeof(T), cudaMemcpyDeviceToHost)); + + T limit; + CUDA_RETURN_IF_ERROR(cudaMemcpy(&limit, limit_tensor.template Data(), sizeof(T), cudaMemcpyDeviceToHost)); + + T delta = T(1); + if (delta_tensor_ptr != nullptr) { + CUDA_RETURN_IF_ERROR(cudaMemcpy(&delta, delta_tensor_ptr->template Data(), sizeof(T), cudaMemcpyDeviceToHost)); + } + + if (delta == T(0)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "delta in Range operator can not be zero!"); + } + + int count = static_cast(ceil(1.0 * (limit - start) / delta)); + if (count <= 0) + count = 0; + TensorShape shape = {static_cast(count)}; + T* y = ctx->Output(0, shape)->template MutableData(); + + if (count > 0) { + if (!RangeImpl(start, delta, count, y)) { + CUDA_CALL(cudaGetLastError()); + return Status(common::ONNXRUNTIME, common::FAIL); + } + } + + return Status::OK(); +} + +Status Range::ComputeInternal(OpKernelContext* ctx) const { + const auto* input_tensor = ctx->Input(0); + if (input_tensor == nullptr) { + return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch"); + } + + auto data_type = input_tensor->DataType(); + if (data_type == DataTypeImpl::GetType()) { + return ComputeRange(ctx); + } else if (data_type == DataTypeImpl::GetType()) { + return ComputeRange(ctx); + } else if (data_type == DataTypeImpl::GetType()) { + return ComputeRange(ctx); + } else if (data_type == DataTypeImpl::GetType()) { + return ComputeRange(ctx); + } else if (data_type == DataTypeImpl::GetType()) { + return ComputeRange(ctx); + } + + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Range op: Unsupported tensor data type:", data_type); +} + +} // namespace cuda +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/generator/range.h b/onnxruntime/core/providers/cuda/generator/range.h new file mode 100644 index 0000000000..c25983ca91 --- /dev/null +++ b/onnxruntime/core/providers/cuda/generator/range.h @@ -0,0 +1,22 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/common/common.h" +#include "core/framework/op_kernel.h" + +namespace onnxruntime { +namespace cuda { + +using namespace onnxruntime::cuda; + +class Range final : public CudaKernel { + public: + explicit Range(const OpKernelInfo& info) : CudaKernel(info) {} + + Status ComputeInternal(OpKernelContext* context) const override; +}; + +} // namespace cuda +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/generator/range_impl.cu b/onnxruntime/core/providers/cuda/generator/range_impl.cu new file mode 100644 index 0000000000..4756350392 --- /dev/null +++ b/onnxruntime/core/providers/cuda/generator/range_impl.cu @@ -0,0 +1,42 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include +#include +#include +#include "core/providers/cuda/cu_inc/common.cuh" +#include "core/providers/cuda/cuda_common.h" +#include "range_impl.h" + +using namespace onnxruntime::cuda; + +namespace onnxruntime { +namespace cuda { + +template +__global__ void RangeKernel(const T start, const T delta, const int count, T* output) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < count) { + output[index] = start + delta * index; + } +} + +template +bool RangeImpl(const T start, const T delta, const int count, T* output) { + constexpr int block_size = 256; + int grid_size = (count + block_size - 1) / block_size; + RangeKernel<<>>(start, delta, count, output); + return CUDA_CALL(cudaPeekAtLastError()); +} + +#define SPECIALIZED_IMPL(T) \ + template bool RangeImpl(const T start, const T delta, const int count, T* output); + +SPECIALIZED_IMPL(int16_t) +SPECIALIZED_IMPL(int32_t) +SPECIALIZED_IMPL(int64_t) +SPECIALIZED_IMPL(float) +SPECIALIZED_IMPL(double) + +} // namespace cuda +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/generator/range_impl.h b/onnxruntime/core/providers/cuda/generator/range_impl.h new file mode 100644 index 0000000000..dfcfa992ba --- /dev/null +++ b/onnxruntime/core/providers/cuda/generator/range_impl.h @@ -0,0 +1,16 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include "core/providers/cuda/shared_inc/cuda_utils.h" + +namespace onnxruntime { +namespace cuda { + +using namespace onnxruntime::cuda; + +template +bool RangeImpl(const T start, const T delta, const int count, T* output); + +} // namespace cuda +} // namespace onnxruntime