From d6849bd26cf54bfb95f2e1301b291e2128f96f19 Mon Sep 17 00:00:00 2001 From: RandySheriffH <48490400+RandySheriffH@users.noreply.github.com> Date: Thu, 31 Oct 2019 10:26:00 -0700 Subject: [PATCH] Rashuai/cuda top k (#1919) * implement cuda topk * implement heap * add type support * refactor interface * add support for sorting by index * add test case * use cub device radix sort * register for opset 9 and 10 * add opset 9/10 delaration * refactor code * refactor code * fix comment * fix comment * switch to scratched mem --- .../providers/cuda/cuda_execution_provider.cc | 9 ++ onnxruntime/core/providers/cuda/math/topk.cc | 101 ++++++++++++++++++ onnxruntime/core/providers/cuda/math/topk.h | 23 ++++ .../core/providers/cuda/math/topk_impl.cu | 94 ++++++++++++++++ .../core/providers/cuda/math/topk_impl.h | 17 +++ 5 files changed, 244 insertions(+) create mode 100644 onnxruntime/core/providers/cuda/math/topk.cc create mode 100644 onnxruntime/core/providers/cuda/math/topk.h create mode 100644 onnxruntime/core/providers/cuda/math/topk_impl.cu create mode 100644 onnxruntime/core/providers/cuda/math/topk_impl.h diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 0e17bb6a62..17b3d68477 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -564,6 +564,10 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, R class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, 10, Scatter); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, ScatterElements); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 9, TopK); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, TopK); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, TopK); + static void RegisterCudaKernels(KernelRegistry& kernel_registry) { static const BuildKernelCreateInfoFn function_table[] = { BuildKernelCreateInfo, @@ -919,6 +923,11 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + }; for (auto& function_table_entry : function_table) { diff --git a/onnxruntime/core/providers/cuda/math/topk.cc b/onnxruntime/core/providers/cuda/math/topk.cc new file mode 100644 index 0000000000..0b7b2577cc --- /dev/null +++ b/onnxruntime/core/providers/cuda/math/topk.cc @@ -0,0 +1,101 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "topk.h" +#include "topk_impl.h" + +namespace onnxruntime { +namespace cuda { + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + TopK, + kOnnxDomain, + 1,9, + kCudaExecutionProvider, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()), + TopK); + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + TopK, + kOnnxDomain, + 10,10, + kCudaExecutionProvider, + KernelDefBuilder().InputMemoryType(1).TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()), + TopK); + +ONNX_OPERATOR_KERNEL_EX( + TopK, + kOnnxDomain, + 11, + kCudaExecutionProvider, + KernelDefBuilder().InputMemoryType(1).TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()), + TopK); + +template +TopK::TopK(const OpKernelInfo& info) : CudaKernel(info) { + info.GetAttrOrDefault("axis", &axis_, -1); + info.GetAttrOrDefault("largest", &largest_, 1); + info.GetAttrOrDefault("sorted", &sorted_, 1); + if (!inputk) { + info.GetAttrOrDefault("k", &K_, 0); + } +} + +#define ISTYPE(T) tensor_X->DataType() == DataTypeImpl::GetType() +#define TOPKIMPL(T) TopKImpl(this, tensor_X->Data(), \ + static_cast(tensor_V->MutableDataRaw()), \ + static_cast(tensor_I->MutableDataRaw()), \ + elem_nums_cuda.GpuPtr(), \ + elem_nums.size(), \ + axis, K_, largest_, sorted_, N, dimension) + +template +Status TopK::ComputeInternal(OpKernelContext* ctx) const { + auto tensor_X = ctx->Input(0); + ORT_ENFORCE(nullptr != tensor_X); + auto rank = static_cast(tensor_X->Shape().NumDimensions()); + auto axis = axis_ < 0 ? rank + axis_ : axis_; + ORT_ENFORCE(axis > -1 && axis < rank); + + if (inputk) { + auto tensor_K = ctx->Input(1); + ORT_ENFORCE(nullptr != tensor_K); + K_ = *tensor_K->Data(); + ORT_ENFORCE(K_ >= 0 && K_ <= tensor_X->Shape().GetDims()[axis]); + } + + auto output_shape = tensor_X->Shape(); + output_shape[axis] = K_; + auto tensor_V = ctx->Output(0, output_shape); + auto tensor_I = ctx->Output(1, output_shape); + + if (0 == K_) { + return Status::OK(); + } + + auto elem_nums = tensor_X->Shape().GetDims(); + auto dimension = elem_nums[axis]; + for (auto i = static_cast(elem_nums.size()) - 2; i >= 0; --i) { + elem_nums[i] *= elem_nums[i + 1]; + } + + auto N = elem_nums[0] / dimension; + CudaAsyncBuffer elem_nums_cuda(this, elem_nums); + ORT_RETURN_IF_ERROR(elem_nums_cuda.CopyToGpu()); + + if (ISTYPE(uint8_t)) return TOPKIMPL(uint8_t); + if (ISTYPE(uint16_t)) return TOPKIMPL(uint16_t); + if (ISTYPE(uint32_t)) return TOPKIMPL(uint32_t); + if (ISTYPE(uint64_t)) return TOPKIMPL(uint64_t); + if (ISTYPE(int8_t)) return TOPKIMPL(int8_t); + if (ISTYPE(int16_t)) return TOPKIMPL(int16_t); + if (ISTYPE(int32_t)) return TOPKIMPL(int32_t); + if (ISTYPE(int64_t)) return TOPKIMPL(int64_t); + if (ISTYPE(float)) return TOPKIMPL(float); + if (ISTYPE(double)) return TOPKIMPL(double); + if (ISTYPE(uint8_t)) return TOPKIMPL(uint8_t); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for TopK operator"); +} + +} // namespace cuda +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/math/topk.h b/onnxruntime/core/providers/cuda/math/topk.h new file mode 100644 index 0000000000..944c95e369 --- /dev/null +++ b/onnxruntime/core/providers/cuda/math/topk.h @@ -0,0 +1,23 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include "core/providers/cuda/cuda_common.h" + +namespace onnxruntime { +namespace cuda { + +template +class TopK final : public CudaKernel { + public: + TopK(const OpKernelInfo&); + Status ComputeInternal(OpKernelContext*) const override; + + private: + int64_t axis_; + int64_t largest_; + int64_t sorted_; + mutable int64_t K_; +}; +} // namespace cuda +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/math/topk_impl.cu b/onnxruntime/core/providers/cuda/math/topk_impl.cu new file mode 100644 index 0000000000..018b223156 --- /dev/null +++ b/onnxruntime/core/providers/cuda/math/topk_impl.cu @@ -0,0 +1,94 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "topk_impl.h" +#include "core/providers/cuda/cu_inc/common.cuh" +#include "cub/cub.cuh" +#include + +namespace onnxruntime { +namespace cuda { + +template +__global__ void FillInput(const T* input_x, T* output_v, int64_t* output_i, const int64_t* elem_nums, size_t size, int64_t axis, int64_t K, int64_t offset, int64_t dimension) { + CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, dimension); + auto left = offset / (axis == size - 1 ? 1 : elem_nums[axis + 1]) * elem_nums[axis]; + auto right = axis == size - 1 ? 0 : offset % elem_nums[axis + 1]; + auto input_offset = left + id * (axis == size - 1 ? 1 : elem_nums[axis + 1]) + right; + output_v[id] = input_x[input_offset]; + output_i[id] = id; +} + +template +__global__ void FillOutput(const T* input_v, const int64_t* input_i, T* output_v, int64_t* output_i, const int64_t* elem_nums, size_t size, int64_t axis, int64_t K, int64_t offset, int64_t dimension) { + CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, dimension); + auto left = offset / (axis == size - 1 ? 1 : elem_nums[axis + 1]) * elem_nums[axis] * K / dimension; + auto right = axis == size - 1 ? 0 : offset % elem_nums[axis + 1]; + auto output_offset = left + id * (axis == size - 1 ? 1 : elem_nums[axis + 1]) + right; + output_v[output_offset] = input_v[id]; + output_i[output_offset] = input_i[id]; +} + +__global__ void ExcludeOutput(int64_t* output_i, int64_t K, int64_t dimension) { + CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, dimension); + if (id >= K) { + output_i[id] = dimension; + } +} + +template +Status TopKImpl(const CudaKernel* kernel, const T* input_x, T* output_v, int64_t* output_i, const int64_t* elem_nums, size_t size, int64_t axis, int64_t K, int64_t largest, int64_t sorted, int64_t N, int64_t dimension) { + auto input_key_buffer = kernel->GetScratchBuffer(dimension); + auto output_key_buffer = kernel->GetScratchBuffer(dimension); + auto input_value_buffer = kernel->GetScratchBuffer(dimension); + auto output_value_buffer = kernel->GetScratchBuffer(dimension); + auto input_key = input_key_buffer.get(); + auto output_key = output_key_buffer.get(); + auto input_value = input_value_buffer.get(); + auto output_value = output_value_buffer.get(); + size_t temp_bytes = 0; + CUDA_RETURN_IF_ERROR(cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, input_key, output_key, input_value, output_value, dimension)); + auto temp_storage_buffer = kernel->GetScratchBuffer(temp_bytes); + auto temp_storage = temp_storage_buffer.get(); + auto blocksPerGridD = (int)(ceil(static_cast(dimension) / GridDim::maxThreadsPerBlock)); + auto blocksPerGridK = (int)(ceil(static_cast(K) / GridDim::maxThreadsPerBlock)); + for (int64_t i = 0; i < N; i++) { + FillInput<<>>(input_x, input_key, input_value, elem_nums, size, axis, K, i, dimension); + CUDA_RETURN_IF_ERROR(1 == largest ? cub::DeviceRadixSort::SortPairsDescending(temp_storage, temp_bytes, input_key, output_key, input_value, output_value, dimension) : cub::DeviceRadixSort::SortPairs(temp_storage, temp_bytes, input_key, output_key, input_value, output_value, dimension)); + if (1 == sorted) { + FillOutput<<>>(output_key, output_value, output_v, output_i, elem_nums, size, axis, K, i, dimension); + } else { //reorder by ascending index + ExcludeOutput<<>>(output_value, K, dimension); + CUDA_RETURN_IF_ERROR(cub::DeviceRadixSort::SortPairs(temp_storage, temp_bytes, output_value, input_value, output_key, input_key, dimension)); + FillOutput<<>>(input_key, input_value, output_v, output_i, elem_nums, size, axis, K, i, dimension); + } + } + return Status::OK(); +} + +#define TOPKIMPLE(T) template Status TopKImpl(const CudaKernel* kernel, \ + const T* input_x, \ + T* output_v, \ + int64_t* output_i, \ + const int64_t* elem_nums, \ + size_t size, \ + int64_t axis, \ + int64_t K, \ + int64_t largest, \ + int64_t sorted, \ + int64_t N, \ + int64_t dimension) + +TOPKIMPLE(uint8_t); +TOPKIMPLE(uint16_t); +TOPKIMPLE(uint32_t); +TOPKIMPLE(uint64_t); +TOPKIMPLE(int8_t); +TOPKIMPLE(int16_t); +TOPKIMPLE(int32_t); +TOPKIMPLE(int64_t); +TOPKIMPLE(float); +TOPKIMPLE(double); + +} // namespace cuda +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/providers/cuda/math/topk_impl.h b/onnxruntime/core/providers/cuda/math/topk_impl.h new file mode 100644 index 0000000000..ddb66606c2 --- /dev/null +++ b/onnxruntime/core/providers/cuda/math/topk_impl.h @@ -0,0 +1,17 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include +#include "core/providers/cuda/shared_inc/cuda_utils.h" +#include "core/providers/cuda/cuda_common.h" +#include "core/common/common.h" + +namespace onnxruntime { +namespace cuda { + +template +Status TopKImpl(const CudaKernel* kernel, const T* input_x, T* output_v, int64_t* output_i, const int64_t* elem_nums, size_t size, int64_t axis, int64_t K, int64_t largest, int64_t sorted, int64_t N, int64_t dimension); + +} // namespace cuda +} // namespace onnxruntime \ No newline at end of file