diff --git a/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.cc b/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.cc index 87d21ab321..0b5f78919b 100644 --- a/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.cc +++ b/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.cc @@ -35,29 +35,8 @@ ONNX_OPERATOR_KERNEL_EX( using namespace nms_helpers; -// CPU version -namespace nms_helpers { -Status GetThresholdsFromInputs(const PrepareContext& pc, - int64_t& max_output_boxes_per_class, - float& iou_threshold, - float& score_threshold) { - if (pc.max_output_boxes_per_class_ != nullptr) { - max_output_boxes_per_class = std::max(*pc.max_output_boxes_per_class_, 0); - } - - if (pc.iou_threshold_ != nullptr) { - iou_threshold = *pc.iou_threshold_; - ORT_RETURN_IF_NOT((iou_threshold >= 0 && iou_threshold <= 1.f), "iou_threshold must be in range [0, 1]."); - } - - if (pc.score_threshold_ != nullptr) { - score_threshold = *pc.score_threshold_; - } - - return Status::OK(); -} -} // namespace nms_helpers - +// This works for both CPU and GPU. +// CUDA kernel declare OrtMemTypeCPUInput for max_output_boxes_per_class(2), iou_threshold(3) and score_threshold(4) Status NonMaxSuppressionBase::PrepareCompute(OpKernelContext* ctx, PrepareContext& pc) { const auto* boxes_tensor = ctx->Input(0); ORT_ENFORCE(boxes_tensor); @@ -111,6 +90,26 @@ Status NonMaxSuppressionBase::PrepareCompute(OpKernelContext* ctx, PrepareContex return Status::OK(); } +Status NonMaxSuppressionBase::GetThresholdsFromInputs(const PrepareContext& pc, + int64_t& max_output_boxes_per_class, + float& iou_threshold, + float& score_threshold) { + if (pc.max_output_boxes_per_class_ != nullptr) { + max_output_boxes_per_class = std::max(*pc.max_output_boxes_per_class_, 0); + } + + if (pc.iou_threshold_ != nullptr) { + iou_threshold = *pc.iou_threshold_; + ORT_RETURN_IF_NOT((iou_threshold >= 0 && iou_threshold <= 1.f), "iou_threshold must be in range [0, 1]."); + } + + if (pc.score_threshold_ != nullptr) { + score_threshold = *pc.score_threshold_; + } + + return Status::OK(); +} + Status NonMaxSuppression::Compute(OpKernelContext* ctx) const { PrepareContext pc; auto ret = PrepareCompute(ctx, pc); diff --git a/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.h b/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.h index 37578539cd..f6465ea3ee 100644 --- a/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.h +++ b/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.h @@ -18,6 +18,10 @@ class NonMaxSuppressionBase { } static Status PrepareCompute(OpKernelContext* ctx, PrepareContext& pc); + static Status GetThresholdsFromInputs(const PrepareContext& pc, + int64_t& max_output_boxes_per_class, + float& iou_threshold, + float& score_threshold); int64_t GetCenterPointBox() const { return center_point_box_; diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 17b3d68477..f964a98c0b 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -544,24 +544,24 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, double, Less); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, MLFloat16, Less); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, EyeLike); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, 10, Scatter); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, MLFloat16, Where); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, float, Where); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, int32_t, Where); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, int64_t, Where); - // opset 10 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, Dropout); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, float, RoiAlign); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, double, RoiAlign); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, NonMaxSuppression); // opset 11 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, Gemm); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, Gemm); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, Gemm); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, NonMaxSuppression); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, Range); - -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, 10, Scatter); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, ScatterElements); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 9, TopK); @@ -909,7 +909,9 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -919,9 +921,8 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, - - BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/cuda/object_detection/non_max_suppression.cc b/onnxruntime/core/providers/cuda/object_detection/non_max_suppression.cc new file mode 100644 index 0000000000..039221bdf0 --- /dev/null +++ b/onnxruntime/core/providers/cuda/object_detection/non_max_suppression.cc @@ -0,0 +1,134 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "non_max_suppression.h" +#include "core/providers/cpu/object_detection/non_max_suppression_helper.h" +#include "non_max_suppression_impl.h" +#include "core/providers/cuda/tensor/concat_impl.h" + +namespace onnxruntime { +namespace cuda { + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + NonMaxSuppression, + kOnnxDomain, + 10, 10, + kCudaExecutionProvider, + KernelDefBuilder() + .InputMemoryType(2) + .InputMemoryType(3) + .InputMemoryType(4), + NonMaxSuppression); + +ONNX_OPERATOR_KERNEL_EX( + NonMaxSuppression, + kOnnxDomain, + 11, + kCudaExecutionProvider, + KernelDefBuilder() + .InputMemoryType(2) + .InputMemoryType(3) + .InputMemoryType(4), + NonMaxSuppression); + +Status NonMaxSuppression::ComputeInternal(OpKernelContext* ctx) const { + PrepareContext pc; + auto ret = PrepareCompute(ctx, pc); + ORT_RETURN_IF_NOT(ret.IsOK(), ret.ErrorMessage()); + + int64_t max_output_boxes_per_class = 0; + float iou_threshold = .0f; + float score_threshold = .0f; + + ret = GetThresholdsFromInputs(pc, max_output_boxes_per_class, iou_threshold, score_threshold); + ORT_RETURN_IF_NOT(ret.IsOK(), ret.ErrorMessage()); + + if (0 == pc.num_boxes_ || 0 == max_output_boxes_per_class) { + ctx->Output(0, {0, 3}); + return Status::OK(); + } + + // TODO: use cub::DeviceSegmentedRadixSort::SortPairsDescending instead of cub::DeviceRadixSort::SortPairsDescending + // to deal with multi batch/class parallelly + + std::vector, int>> all_selected_indices; + int total_num_saved_outputs = 0; + + for (int64_t batch_index = 0; batch_index < pc.num_batches_; ++batch_index) { + for (int64_t class_index = 0; class_index < pc.num_classes_; ++class_index) { + IAllocatorUniquePtr d_selected_indices{}; + IAllocatorUniquePtr h_number_selected_ptr{AllocateBufferOnCPUPinned(sizeof(int))}; + auto* h_number_selected = static_cast(h_number_selected_ptr.get()); + + ORT_RETURN_IF_ERROR(NonMaxSuppressionImpl( + [this](size_t bytes) { return GetScratchBuffer(bytes); }, + pc, + GetCenterPointBox(), + batch_index, + class_index, + max_output_boxes_per_class, + iou_threshold, + score_threshold, + d_selected_indices, + h_number_selected)); + + int num_saved_outputs = *h_number_selected; + if (num_saved_outputs > 0) { + all_selected_indices.emplace_back(std::move(d_selected_indices), num_saved_outputs); + total_num_saved_outputs += num_saved_outputs; + } + } + } + + if (total_num_saved_outputs == 0) { + ctx->Output(0, {0, 3}); + } else { + // concatenate outputs + const int last_dim = 3; + const int num_elements = last_dim * total_num_saved_outputs; + Tensor* output = ctx->Output(0, {static_cast(total_num_saved_outputs), last_dim}); + ORT_ENFORCE(output != nullptr); + int64_t* dst = output->MutableData(); + size_t count = all_selected_indices.size(); + + CudaAsyncBuffer input_ptr(this, count); + CudaAsyncBuffer concat_sizes_gpu(this, count); + CudaAsyncBuffer concat_sizes_range_gpu(this, count); + CudaAsyncBuffer axis_dimension_input_output_mapping_gpu(this, total_num_saved_outputs); + + int index = 0; + for (size_t i = 0; i < count; i++) { + auto& it = all_selected_indices[i]; + auto src = std::get<0>(it).get(); + auto size = std::get<1>(it); + + input_ptr.CpuPtr()[i] = src; + concat_sizes_gpu.CpuPtr()[i] = size; + concat_sizes_range_gpu.CpuPtr()[i] = (i == 0) ? size : size + concat_sizes_range_gpu.CpuPtr()[i - 1]; + for (int j = 0; j < size; j++) { + axis_dimension_input_output_mapping_gpu.CpuPtr()[index++] = i; + } + } + + concat_sizes_gpu.CopyToGpu(); + axis_dimension_input_output_mapping_gpu.CopyToGpu(); + concat_sizes_range_gpu.CopyToGpu(); + input_ptr.CopyToGpu(); + + ORT_RETURN_IF_ERROR(ConcatImpl(sizeof(int64_t), + num_elements, + last_dim, + concat_sizes_gpu.GpuPtr(), + concat_sizes_range_gpu.GpuPtr(), + axis_dimension_input_output_mapping_gpu.GpuPtr(), + static_cast(count), + dst, + input_ptr.GpuPtr(), + static_cast(num_elements))); + } + + return Status::OK(); +} + +} // namespace cuda +}; // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/object_detection/non_max_suppression.h b/onnxruntime/core/providers/cuda/object_detection/non_max_suppression.h new file mode 100644 index 0000000000..7a03126294 --- /dev/null +++ b/onnxruntime/core/providers/cuda/object_detection/non_max_suppression.h @@ -0,0 +1,24 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/common/common.h" +#include "core/framework/op_kernel.h" +#include "core/providers/cuda/cuda_common.h" +#include "core/providers/cpu/object_detection/non_max_suppression.h" + +namespace onnxruntime { +namespace cuda { + +struct NonMaxSuppression final : public CudaKernel, public NonMaxSuppressionBase { + explicit NonMaxSuppression(const OpKernelInfo& info) : CudaKernel(info), NonMaxSuppressionBase(info) { + } + + Status ComputeInternal(OpKernelContext* context) const override; + + private: + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(NonMaxSuppression); +}; +} // namespace cuda +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/object_detection/non_max_suppression_impl.cu b/onnxruntime/core/providers/cuda/object_detection/non_max_suppression_impl.cu new file mode 100644 index 0000000000..8d04fc2d4d --- /dev/null +++ b/onnxruntime/core/providers/cuda/object_detection/non_max_suppression_impl.cu @@ -0,0 +1,450 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +/* Modifications Copyright (c) Microsoft. */ + +#include "non_max_suppression_impl.h" +#include "core/providers/cpu/object_detection/non_max_suppression_helper.h" +#include "core/providers/cuda/cu_inc/common.cuh" +#include "core/providers/cuda/cuda_common.h" + +#include "core/framework/tensor.h" + +#include +#include + +#include + +namespace onnxruntime { +namespace cuda { + +using namespace nms_helpers; + +namespace { + +struct __align__(16) Box { + float x1, y1, x2, y2; +}; + +// This is the width of the bitmask for masking boxes for each thread. +// This needs to be a multiple of 2(a POD width usually) so that division and +// modulo can be implemented as bit operations during host selection. +constexpr int kNmsBoxesPerThread = 8 * sizeof(int); +// Helper to calculate modulo mask and shift bits. +// For kNmsBoxesPerThread=32 ModuloMask will be 31, i.e 0x1F thus +// i % 32 == i & 31. Similarly ShiftBits will be 5 so that +// i / 32 == i >> 5. Using these bit operations should reduce the stall on host +// thread. +__device__ constexpr int NumBits(int n) { return (n == 0) ? 0 : NumBits(n >> 1) + 1; } +constexpr int kNmsBoxesPerThreadModuloMask = kNmsBoxesPerThread - 1; +constexpr int kNmsBoxesPerThreadShiftBits = + NumBits(kNmsBoxesPerThreadModuloMask); + +constexpr int kNmsBlockDim = 16; +constexpr int kNmsBlockDimMax = 128; +constexpr int kNmsChunkSize = 2000; + +template +__device__ inline void Swap(T& a, T& b) { + T c(a); + a = b; + b = c; +} + +// Check whether two boxes have an IoU greater than threshold. +template +__device__ inline bool OverThreshold(const Box* a, const Box* b, + const float a_area, + const T iou_threshold) { + const float b_area = (b->x2 - b->x1) * (b->y2 - b->y1); + if (a_area == 0.0f || b_area == 0.0f) return false; + const float xx1 = fmaxf(a->x1, b->x1); + const float yy1 = fmaxf(a->y1, b->y1); + const float xx2 = fminf(a->x2, b->x2); + const float yy2 = fminf(a->y2, b->y2); + + // fdimf computes the positive difference between xx2+1 and xx1. + const float w = fdimf(xx2, xx1); + const float h = fdimf(yy2, yy1); + const float intersection = w * h; + + // Testing for aa/bb > t + // eq with aa > bb*t (b is !=0) + // avoiding divisions. + const float aa = intersection; + const float bb = a_area + b_area - intersection; + const float bt = bb * iou_threshold; + return aa >= bt; +} + +__device__ inline void Flipped(Box& box) { + if (box.x1 > box.x2) Swap(box.x1, box.x2); + if (box.y1 > box.y2) Swap(box.y1, box.y2); +} +template +__device__ inline bool CheckBit(T* bit_mask, int bit) { + constexpr int kShiftLen = NumBits(8 * sizeof(T)) - 1; + constexpr int kRemainderMask = 8 * sizeof(T) - 1; + int bin = bit >> kShiftLen; + return (bit_mask[bin] >> (bit & kRemainderMask)) & 1; +} + +// Produce a global bitmask (result_mask) of selected boxes from bitmask +// generated by NMSKernel Abort early if max_boxes boxes are selected. Bitmask +// is num_boxes*bit_mask_len bits indicating whether to keep or remove a box. +__global__ void NMSReduce(const int* bitmask, const int bit_mask_len, + const int num_boxes, const int64_t max_boxes, + char* result_mask) { + extern __shared__ int local[]; + + // set global mask to accept all boxes + for (int box = blockIdx.x * blockDim.x + threadIdx.x; box < bit_mask_len; box += blockDim.x * gridDim.x) { + local[box] = 0xFFFFFFFF; + } + __syncthreads(); + int accepted_boxes = 0; + for (int box = 0; box < num_boxes - 1; ++box) { + // if current box is masked by an earlier box, skip it. + if (!CheckBit(local, box)) { + continue; + } + accepted_boxes += 1; + int offset = box * bit_mask_len; + // update global mask with current box's mask + for (int b = blockIdx.x * blockDim.x + threadIdx.x; b < bit_mask_len; b += blockDim.x * gridDim.x) { + local[b] &= ~bitmask[offset + b]; + } + __syncthreads(); + if (accepted_boxes > max_boxes) break; + } + // copy global mask to result_max char array. char array is needed for + // cub::DeviceSelect later. + for (int box = blockIdx.x * blockDim.x + threadIdx.x; box < num_boxes; box += blockDim.x * gridDim.x) { + result_mask[box] = CheckBit(local, box); + } +} + +// For each box, compute a bitmask of boxes which has an overlap with given box +// above threshold. +// +// Starting from highes scoring box, mark any box which has IoU>threshold with +// given box. Each thread processes a kNmsBoxesPerThread boxes per stride, and +// each box has bitmask of overlaps of length bit_mask_len. +// +__launch_bounds__(kNmsBlockDim* kNmsBlockDim, 4) __global__ + void NMSKernel( + const int64_t center_point_box, + const Box* d_desc_sorted_boxes, + const int num_boxes, + const float iou_threshold, + const int bit_mask_len, + int* d_delete_mask) { + for (int i_block_offset = blockIdx.x * blockDim.x; i_block_offset < num_boxes; + i_block_offset += blockDim.x * gridDim.x) { + const int i = i_block_offset + threadIdx.x; + if (i < num_boxes) { + for (int j_thread_offset = + kNmsBoxesPerThread * (blockIdx.y * blockDim.y + threadIdx.y); + j_thread_offset < num_boxes; + j_thread_offset += kNmsBoxesPerThread * blockDim.y * gridDim.y) { + // Note : We can do everything using multiplication, + // and use fp16 - we are comparing against a low precision + // threshold. + int above_threshold = 0; + // Make sure that threads are within valid domain. + bool valid = false; + // Loop over the next kNmsBoxesPerThread boxes and set corresponding bit + // if it is overlapping with current box + for (int ib = 0; ib < kNmsBoxesPerThread; ++ib) { + // This thread will compare Box i and Box j. + const int j = j_thread_offset + ib; + if (i >= j || i >= num_boxes || j >= num_boxes) continue; + valid = true; + if (SuppressByIOU(reinterpret_cast(d_desc_sorted_boxes), + i, j, center_point_box, iou_threshold)) { + // we have score[j] <= score[i]. + above_threshold |= (1U << ib); + } + } + if (valid) { + d_delete_mask[i * bit_mask_len + j_thread_offset / kNmsBoxesPerThread] = + above_threshold; + } + } + } + } +} +// Variadic template helpers for Index selecting multiple arrays at the same +// time +template +__device__ inline void SelectHelper(const Index i_selected, + const Index i_original) {} + +template +__device__ inline void SelectHelper(const Index i_selected, + const Index i_original, + const T* original, T* selected, + Args... args) { + selected[i_selected] = original[i_original]; + SelectHelper(i_selected, i_original, args...); +} + +// Helper template to select elements from original arrays using the index +// mapping and store into selected array. Each array sharing same mapping need +// to be passed as pairs of pointers to original and selected arrays. For +// selecting 2 arrays call would be +// IndexMultiSelect(num_elements, indices, original1 ,selected1, original2, +// selected2). +template +__global__ void IndexMultiSelect(const int num_elements, const Index* indices, + const T* original, T* selected, Args... args) { + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num_elements; idx += blockDim.x * gridDim.x) { + SelectHelper(idx, indices[idx], original, selected, args...); + } +} + +template +__global__ void SetZero(const int count, T* __restrict__ ptr) { + // Check that the grid is one dimensional and index doesn't overflow. + assert(blockDim.y == 1); + assert(blockDim.z == 1); + assert(blockDim.x * gridDim.x / blockDim.x == gridDim.x); + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x) { + ptr[i] = T(0); + } +} + +template +__global__ void Iota(const int num_elements, const T offset, T* to_fill) { + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num_elements; idx += blockDim.x * gridDim.x) { + to_fill[idx] = static_cast(idx) + offset; + } +} + +__global__ void NormalizeOutput(const int num_elements, const int* original, int64_t* to_normalize, int64_t batch_index, int64_t class_index) { + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num_elements; idx += blockDim.x * gridDim.x) { + to_normalize[idx * 3] = batch_index; + to_normalize[idx * 3 + 1] = class_index; + to_normalize[idx * 3 + 2] = static_cast(original[idx]); + } +} + +Status NmsGpu(std::function(size_t)> allocator, + const int64_t center_point_box, + const float* d_sorted_boxes_float_ptr, + const int num_boxes, + const float iou_threshold, + int* d_selected_indices, + int* h_nkeep, + const int64_t max_boxes) { + // Making sure we respect the __align(16)__ + // we promised to the compiler. + auto iptr = reinterpret_cast(d_sorted_boxes_float_ptr); + ORT_ENFORCE((iptr & 15) == 0); + + const int bit_mask_len = + (num_boxes + kNmsBoxesPerThread - 1) / kNmsBoxesPerThread; + int max_nms_mask_size = num_boxes * bit_mask_len; + + IAllocatorUniquePtr d_nms_mask_ptr{allocator(max_nms_mask_size * sizeof(int))}; + auto* d_nms_mask = static_cast(d_nms_mask_ptr.get()); + + int blocksPerGrid = (int)(ceil(static_cast(max_nms_mask_size) / GridDim::maxThreadsPerBlock)); + SetZero<<>>(max_nms_mask_size, d_nms_mask); + + int* d_delete_mask = d_nms_mask; + int* h_selected_count = h_nkeep; + const Box* d_sorted_boxes = + reinterpret_cast(d_sorted_boxes_float_ptr); + dim3 block_dim, thread_block; + int num_blocks = (num_boxes + kNmsBlockDim - 1) / kNmsBlockDim; + num_blocks = std::max(std::min(num_blocks, kNmsBlockDimMax), 1); + block_dim.x = num_blocks; + block_dim.y = num_blocks; + block_dim.z = 1; + thread_block.x = kNmsBlockDim; + thread_block.y = kNmsBlockDim; + thread_block.z = 1; + NMSKernel<<>>(center_point_box, + d_sorted_boxes, + num_boxes, + iou_threshold, + bit_mask_len, + d_delete_mask); + + IAllocatorUniquePtr d_selected_boxes_ptr{allocator(num_boxes * sizeof(char))}; + auto* d_selected_boxes = static_cast(d_selected_boxes_ptr.get()); + IAllocatorUniquePtr d_indices_ptr{allocator(num_boxes * sizeof(int))}; + auto* d_indices = static_cast(d_indices_ptr.get()); + + blocksPerGrid = (int)(ceil(static_cast(num_boxes) / GridDim::maxThreadsPerBlock)); + Iota<<>>(num_boxes, 0, d_indices); + + NMSReduce<<<1, 1024, bit_mask_len * sizeof(int)>>>(d_delete_mask, bit_mask_len, num_boxes, max_boxes, d_selected_boxes); + + size_t flagged_buffer_size = 0; + CUDA_RETURN_IF_ERROR(cub::DeviceSelect::Flagged(static_cast(nullptr), // temp_storage + flagged_buffer_size, + static_cast(nullptr), // input + static_cast(nullptr), // selection flag + static_cast(nullptr), // selected items + static_cast(nullptr), // num_selected + num_boxes)); + + IAllocatorUniquePtr d_cub_scratch_buffer_ptr{allocator(flagged_buffer_size)}; + auto* d_cub_scratch_buffer = static_cast(d_cub_scratch_buffer_ptr.get()); + IAllocatorUniquePtr d_num_selected_ptr{allocator(sizeof(int))}; + auto* d_num_selected = static_cast(d_num_selected_ptr.get()); + + CUDA_RETURN_IF_ERROR(cub::DeviceSelect::Flagged( + d_cub_scratch_buffer, // temp_storage + flagged_buffer_size, + d_indices, // input + d_selected_boxes, // selection flag + d_selected_indices, // selected items + d_num_selected, num_boxes)); + CUDA_RETURN_IF_ERROR(cudaMemcpy(h_selected_count, d_num_selected, sizeof(int), cudaMemcpyDeviceToHost)); + + return Status::OK(); +} + +struct DeviceGreaterThan { + float threshold_; + __host__ __device__ __forceinline__ DeviceGreaterThan(float threshold) + : threshold_(threshold) {} + + __host__ __device__ __forceinline__ bool operator()(const float& val) const { + return (val > threshold_); + } +}; + +} // namespace + +Status NonMaxSuppressionImpl( + std::function(size_t)> allocator, + const PrepareContext& pc, + const int64_t center_point_box, + int64_t batch_index, + int64_t class_index, + int64_t max_output_boxes_per_class, + float iou_threshold, + float score_threshold, + IAllocatorUniquePtr& selected_indices, + int* h_number_selected) { + // STEP 1. Prepare data + int num_boxes = pc.num_boxes_; + const float* boxes_data = pc.boxes_data_ + batch_index * num_boxes * 4; + const float* scores_data = pc.scores_data_ + (batch_index * pc.num_classes_ + class_index) * num_boxes; + + // prepare temporary memory for sorting scores + + // calculate temporary size that used for sorting + size_t cub_sort_temp_storage_bytes = 0; + CUDA_RETURN_IF_ERROR(cub::DeviceRadixSort::SortPairsDescending( + nullptr, cub_sort_temp_storage_bytes, + static_cast(nullptr), // scores + static_cast(nullptr), // sorted scores + static_cast(nullptr), // input indices + static_cast(nullptr), // sorted indices + num_boxes, // num items + 0, 8 * sizeof(float) // sort all bits + )); + + // allocate temporary memory + IAllocatorUniquePtr d_cub_sort_buffer_ptr{allocator(cub_sort_temp_storage_bytes)}; + auto* d_cub_sort_buffer = static_cast(d_cub_sort_buffer_ptr.get()); + IAllocatorUniquePtr d_indices_ptr{allocator(num_boxes * sizeof(int))}; + auto* d_indices = static_cast(d_indices_ptr.get()); + IAllocatorUniquePtr d_sorted_indices_ptr{allocator(num_boxes * sizeof(int))}; + auto* d_sorted_indices = static_cast(d_sorted_indices_ptr.get()); + IAllocatorUniquePtr d_selected_indices_ptr{allocator(num_boxes * sizeof(int))}; + auto* d_selected_indices = static_cast(d_selected_indices_ptr.get()); + IAllocatorUniquePtr d_sorted_scores_ptr{allocator(num_boxes * sizeof(float))}; + auto* d_sorted_scores = static_cast(d_sorted_scores_ptr.get()); + IAllocatorUniquePtr d_sorted_boxes_ptr{allocator(num_boxes * 4 * sizeof(float))}; + auto* d_sorted_boxes = static_cast(d_sorted_boxes_ptr.get()); + + // create sequense of indices + int blocksPerGrid = (int)(ceil(static_cast(num_boxes) / GridDim::maxThreadsPerBlock)); + Iota<<>>(num_boxes, 0, d_indices); + CUDA_RETURN_IF_ERROR(cudaGetLastError()); + + // sort scores + CUDA_RETURN_IF_ERROR(cub::DeviceRadixSort::SortPairsDescending( + d_cub_sort_buffer, + cub_sort_temp_storage_bytes, + scores_data, + d_sorted_scores, + d_indices, + d_sorted_indices, + num_boxes, + 0, + 8 * sizeof(float) // sort all bits + )); + + // pick sorted scores + const Box* original_boxes = reinterpret_cast(boxes_data); + Box* sorted_boxes = reinterpret_cast(d_sorted_boxes); + IndexMultiSelect<<>>(num_boxes, d_sorted_indices, original_boxes, sorted_boxes); + CUDA_RETURN_IF_ERROR(cudaGetLastError()); + + // STEP 2. filter boxes by scores + int limited_num_boxes = num_boxes; + if (pc.score_threshold_ != nullptr) { + thrust::device_ptr sorted_scores_device_ptr(d_sorted_scores); + limited_num_boxes = thrust::count_if( + sorted_scores_device_ptr, + sorted_scores_device_ptr + num_boxes, + DeviceGreaterThan(score_threshold)); + CUDA_RETURN_IF_ERROR(cudaGetLastError()); + + if (limited_num_boxes == 0) { + *h_number_selected = 0; + return Status::OK(); + } + } + + // STEP 3. launch NMS kernels + ORT_RETURN_IF_ERROR(NmsGpu(allocator, + center_point_box, + d_sorted_boxes, + limited_num_boxes, + iou_threshold, + d_selected_indices, + h_number_selected, + max_output_boxes_per_class)); + CUDA_RETURN_IF_ERROR(cudaGetLastError()); + + // STEP 4. map back to sorted indices + *h_number_selected = std::min(*h_number_selected, (int)max_output_boxes_per_class); + int num_to_keep = *h_number_selected; + if (num_to_keep > 0) { + IAllocatorUniquePtr d_output_indices_ptr{allocator(num_to_keep * sizeof(int))}; + auto* d_output_indices = static_cast(d_output_indices_ptr.get()); + IAllocatorUniquePtr d_normalized_output_indices_ptr{allocator(num_to_keep * 3 * sizeof(int64_t))}; + auto* d_normalized_output_indices = static_cast(d_normalized_output_indices_ptr.get()); + + int blocksPerGrid = (int)(ceil(static_cast(num_to_keep) / GridDim::maxThreadsPerBlock)); + IndexMultiSelect<<>>(num_to_keep, d_selected_indices, d_sorted_indices, d_output_indices); + NormalizeOutput<<>>(num_to_keep, d_output_indices, d_normalized_output_indices, batch_index, class_index); + CUDA_RETURN_IF_ERROR(cudaGetLastError()); + + selected_indices = std::move(d_normalized_output_indices_ptr); + } + + return Status::OK(); +} + +} // namespace cuda +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/object_detection/non_max_suppression_impl.h b/onnxruntime/core/providers/cuda/object_detection/non_max_suppression_impl.h new file mode 100644 index 0000000000..c10c508377 --- /dev/null +++ b/onnxruntime/core/providers/cuda/object_detection/non_max_suppression_impl.h @@ -0,0 +1,29 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include +#include "core/providers/cuda/shared_inc/cuda_utils.h" +#include "core/common/common.h" +#include "core/framework/allocator.h" +#include "core/framework/data_types.h" +#include +#include "core/providers/cpu/object_detection/non_max_suppression_helper.h" + +namespace onnxruntime { +namespace cuda { + +Status NonMaxSuppressionImpl( + std::function(size_t)> allocator, + const PrepareContext& pc, + const int64_t center_point_box, + int64_t batch_index, + int64_t class_index, + int64_t max_output_boxes_per_class, + float iou_threshold, + float score_threshold, + IAllocatorUniquePtr& selected_indices, + int* h_number_selected); + +} // namespace cuda +} // namespace onnxruntime