mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-26 22:35:43 +00:00
NonMaxSuppression cuda implementation (#2082)
This commit is contained in:
parent
67755adfd8
commit
bf7fa091cc
7 changed files with 669 additions and 28 deletions
|
|
@ -35,29 +35,8 @@ ONNX_OPERATOR_KERNEL_EX(
|
|||
|
||||
using namespace nms_helpers;
|
||||
|
||||
// CPU version
|
||||
namespace nms_helpers {
|
||||
Status GetThresholdsFromInputs(const PrepareContext& pc,
|
||||
int64_t& max_output_boxes_per_class,
|
||||
float& iou_threshold,
|
||||
float& score_threshold) {
|
||||
if (pc.max_output_boxes_per_class_ != nullptr) {
|
||||
max_output_boxes_per_class = std::max<int64_t>(*pc.max_output_boxes_per_class_, 0);
|
||||
}
|
||||
|
||||
if (pc.iou_threshold_ != nullptr) {
|
||||
iou_threshold = *pc.iou_threshold_;
|
||||
ORT_RETURN_IF_NOT((iou_threshold >= 0 && iou_threshold <= 1.f), "iou_threshold must be in range [0, 1].");
|
||||
}
|
||||
|
||||
if (pc.score_threshold_ != nullptr) {
|
||||
score_threshold = *pc.score_threshold_;
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
} // namespace nms_helpers
|
||||
|
||||
// This works for both CPU and GPU.
|
||||
// CUDA kernel declare OrtMemTypeCPUInput for max_output_boxes_per_class(2), iou_threshold(3) and score_threshold(4)
|
||||
Status NonMaxSuppressionBase::PrepareCompute(OpKernelContext* ctx, PrepareContext& pc) {
|
||||
const auto* boxes_tensor = ctx->Input<Tensor>(0);
|
||||
ORT_ENFORCE(boxes_tensor);
|
||||
|
|
@ -111,6 +90,26 @@ Status NonMaxSuppressionBase::PrepareCompute(OpKernelContext* ctx, PrepareContex
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
Status NonMaxSuppressionBase::GetThresholdsFromInputs(const PrepareContext& pc,
|
||||
int64_t& max_output_boxes_per_class,
|
||||
float& iou_threshold,
|
||||
float& score_threshold) {
|
||||
if (pc.max_output_boxes_per_class_ != nullptr) {
|
||||
max_output_boxes_per_class = std::max<int64_t>(*pc.max_output_boxes_per_class_, 0);
|
||||
}
|
||||
|
||||
if (pc.iou_threshold_ != nullptr) {
|
||||
iou_threshold = *pc.iou_threshold_;
|
||||
ORT_RETURN_IF_NOT((iou_threshold >= 0 && iou_threshold <= 1.f), "iou_threshold must be in range [0, 1].");
|
||||
}
|
||||
|
||||
if (pc.score_threshold_ != nullptr) {
|
||||
score_threshold = *pc.score_threshold_;
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status NonMaxSuppression::Compute(OpKernelContext* ctx) const {
|
||||
PrepareContext pc;
|
||||
auto ret = PrepareCompute(ctx, pc);
|
||||
|
|
|
|||
|
|
@ -18,6 +18,10 @@ class NonMaxSuppressionBase {
|
|||
}
|
||||
|
||||
static Status PrepareCompute(OpKernelContext* ctx, PrepareContext& pc);
|
||||
static Status GetThresholdsFromInputs(const PrepareContext& pc,
|
||||
int64_t& max_output_boxes_per_class,
|
||||
float& iou_threshold,
|
||||
float& score_threshold);
|
||||
|
||||
int64_t GetCenterPointBox() const {
|
||||
return center_point_box_;
|
||||
|
|
|
|||
|
|
@ -544,24 +544,24 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
|
|||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, double, Less);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, MLFloat16, Less);
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, EyeLike);
|
||||
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, 10, Scatter);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, MLFloat16, Where);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, float, Where);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, int32_t, Where);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, int64_t, Where);
|
||||
|
||||
|
||||
// opset 10
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, Dropout);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, float, RoiAlign);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, double, RoiAlign);
|
||||
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, NonMaxSuppression);
|
||||
|
||||
// opset 11
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, Gemm);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, Gemm);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, Gemm);
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, NonMaxSuppression);
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, Range);
|
||||
|
||||
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, 10, Scatter);
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, ScatterElements);
|
||||
|
||||
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 9, TopK);
|
||||
|
|
@ -909,7 +909,9 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
|
|||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, MLFloat16, Less)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, float, RoiAlign)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, double, RoiAlign)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, NonMaxSuppression)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, EyeLike)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, 10, Scatter)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, MLFloat16, Where)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, float, Where)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, int32_t, Where)>,
|
||||
|
|
@ -919,9 +921,8 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
|
|||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, Gemm)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, Gemm)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, Gemm)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, NonMaxSuppression)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, Range)>,
|
||||
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, 10, Scatter)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, ScatterElements)>,
|
||||
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 9, TopK)>,
|
||||
|
|
|
|||
|
|
@ -0,0 +1,134 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include "non_max_suppression.h"
|
||||
#include "core/providers/cpu/object_detection/non_max_suppression_helper.h"
|
||||
#include "non_max_suppression_impl.h"
|
||||
#include "core/providers/cuda/tensor/concat_impl.h"
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
||||
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
|
||||
NonMaxSuppression,
|
||||
kOnnxDomain,
|
||||
10, 10,
|
||||
kCudaExecutionProvider,
|
||||
KernelDefBuilder()
|
||||
.InputMemoryType<OrtMemTypeCPUInput>(2)
|
||||
.InputMemoryType<OrtMemTypeCPUInput>(3)
|
||||
.InputMemoryType<OrtMemTypeCPUInput>(4),
|
||||
NonMaxSuppression);
|
||||
|
||||
ONNX_OPERATOR_KERNEL_EX(
|
||||
NonMaxSuppression,
|
||||
kOnnxDomain,
|
||||
11,
|
||||
kCudaExecutionProvider,
|
||||
KernelDefBuilder()
|
||||
.InputMemoryType<OrtMemTypeCPUInput>(2)
|
||||
.InputMemoryType<OrtMemTypeCPUInput>(3)
|
||||
.InputMemoryType<OrtMemTypeCPUInput>(4),
|
||||
NonMaxSuppression);
|
||||
|
||||
Status NonMaxSuppression::ComputeInternal(OpKernelContext* ctx) const {
|
||||
PrepareContext pc;
|
||||
auto ret = PrepareCompute(ctx, pc);
|
||||
ORT_RETURN_IF_NOT(ret.IsOK(), ret.ErrorMessage());
|
||||
|
||||
int64_t max_output_boxes_per_class = 0;
|
||||
float iou_threshold = .0f;
|
||||
float score_threshold = .0f;
|
||||
|
||||
ret = GetThresholdsFromInputs(pc, max_output_boxes_per_class, iou_threshold, score_threshold);
|
||||
ORT_RETURN_IF_NOT(ret.IsOK(), ret.ErrorMessage());
|
||||
|
||||
if (0 == pc.num_boxes_ || 0 == max_output_boxes_per_class) {
|
||||
ctx->Output(0, {0, 3});
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// TODO: use cub::DeviceSegmentedRadixSort::SortPairsDescending instead of cub::DeviceRadixSort::SortPairsDescending
|
||||
// to deal with multi batch/class parallelly
|
||||
|
||||
std::vector<std::tuple<IAllocatorUniquePtr<void>, int>> all_selected_indices;
|
||||
int total_num_saved_outputs = 0;
|
||||
|
||||
for (int64_t batch_index = 0; batch_index < pc.num_batches_; ++batch_index) {
|
||||
for (int64_t class_index = 0; class_index < pc.num_classes_; ++class_index) {
|
||||
IAllocatorUniquePtr<void> d_selected_indices{};
|
||||
IAllocatorUniquePtr<void> h_number_selected_ptr{AllocateBufferOnCPUPinned<void>(sizeof(int))};
|
||||
auto* h_number_selected = static_cast<int*>(h_number_selected_ptr.get());
|
||||
|
||||
ORT_RETURN_IF_ERROR(NonMaxSuppressionImpl(
|
||||
[this](size_t bytes) { return GetScratchBuffer<void>(bytes); },
|
||||
pc,
|
||||
GetCenterPointBox(),
|
||||
batch_index,
|
||||
class_index,
|
||||
max_output_boxes_per_class,
|
||||
iou_threshold,
|
||||
score_threshold,
|
||||
d_selected_indices,
|
||||
h_number_selected));
|
||||
|
||||
int num_saved_outputs = *h_number_selected;
|
||||
if (num_saved_outputs > 0) {
|
||||
all_selected_indices.emplace_back(std::move(d_selected_indices), num_saved_outputs);
|
||||
total_num_saved_outputs += num_saved_outputs;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (total_num_saved_outputs == 0) {
|
||||
ctx->Output(0, {0, 3});
|
||||
} else {
|
||||
// concatenate outputs
|
||||
const int last_dim = 3;
|
||||
const int num_elements = last_dim * total_num_saved_outputs;
|
||||
Tensor* output = ctx->Output(0, {static_cast<int64_t>(total_num_saved_outputs), last_dim});
|
||||
ORT_ENFORCE(output != nullptr);
|
||||
int64_t* dst = output->MutableData<int64_t>();
|
||||
size_t count = all_selected_indices.size();
|
||||
|
||||
CudaAsyncBuffer<const void*> input_ptr(this, count);
|
||||
CudaAsyncBuffer<int64_t> concat_sizes_gpu(this, count);
|
||||
CudaAsyncBuffer<int64_t> concat_sizes_range_gpu(this, count);
|
||||
CudaAsyncBuffer<int64_t> axis_dimension_input_output_mapping_gpu(this, total_num_saved_outputs);
|
||||
|
||||
int index = 0;
|
||||
for (size_t i = 0; i < count; i++) {
|
||||
auto& it = all_selected_indices[i];
|
||||
auto src = std::get<0>(it).get();
|
||||
auto size = std::get<1>(it);
|
||||
|
||||
input_ptr.CpuPtr()[i] = src;
|
||||
concat_sizes_gpu.CpuPtr()[i] = size;
|
||||
concat_sizes_range_gpu.CpuPtr()[i] = (i == 0) ? size : size + concat_sizes_range_gpu.CpuPtr()[i - 1];
|
||||
for (int j = 0; j < size; j++) {
|
||||
axis_dimension_input_output_mapping_gpu.CpuPtr()[index++] = i;
|
||||
}
|
||||
}
|
||||
|
||||
concat_sizes_gpu.CopyToGpu();
|
||||
axis_dimension_input_output_mapping_gpu.CopyToGpu();
|
||||
concat_sizes_range_gpu.CopyToGpu();
|
||||
input_ptr.CopyToGpu();
|
||||
|
||||
ORT_RETURN_IF_ERROR(ConcatImpl(sizeof(int64_t),
|
||||
num_elements,
|
||||
last_dim,
|
||||
concat_sizes_gpu.GpuPtr(),
|
||||
concat_sizes_range_gpu.GpuPtr(),
|
||||
axis_dimension_input_output_mapping_gpu.GpuPtr(),
|
||||
static_cast<int>(count),
|
||||
dst,
|
||||
input_ptr.GpuPtr(),
|
||||
static_cast<size_t>(num_elements)));
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
} // namespace cuda
|
||||
}; // namespace onnxruntime
|
||||
|
|
@ -0,0 +1,24 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "core/common/common.h"
|
||||
#include "core/framework/op_kernel.h"
|
||||
#include "core/providers/cuda/cuda_common.h"
|
||||
#include "core/providers/cpu/object_detection/non_max_suppression.h"
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
||||
struct NonMaxSuppression final : public CudaKernel, public NonMaxSuppressionBase {
|
||||
explicit NonMaxSuppression(const OpKernelInfo& info) : CudaKernel(info), NonMaxSuppressionBase(info) {
|
||||
}
|
||||
|
||||
Status ComputeInternal(OpKernelContext* context) const override;
|
||||
|
||||
private:
|
||||
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(NonMaxSuppression);
|
||||
};
|
||||
} // namespace cuda
|
||||
} // namespace onnxruntime
|
||||
|
|
@ -0,0 +1,450 @@
|
|||
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
/* Modifications Copyright (c) Microsoft. */
|
||||
|
||||
#include "non_max_suppression_impl.h"
|
||||
#include "core/providers/cpu/object_detection/non_max_suppression_helper.h"
|
||||
#include "core/providers/cuda/cu_inc/common.cuh"
|
||||
#include "core/providers/cuda/cuda_common.h"
|
||||
|
||||
#include "core/framework/tensor.h"
|
||||
|
||||
#include <thrust/device_vector.h>
|
||||
#include <thrust/execution_policy.h>
|
||||
|
||||
#include <cub/cub.cuh>
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
||||
using namespace nms_helpers;
|
||||
|
||||
namespace {
|
||||
|
||||
struct __align__(16) Box {
|
||||
float x1, y1, x2, y2;
|
||||
};
|
||||
|
||||
// This is the width of the bitmask for masking boxes for each thread.
|
||||
// This needs to be a multiple of 2(a POD width usually) so that division and
|
||||
// modulo can be implemented as bit operations during host selection.
|
||||
constexpr int kNmsBoxesPerThread = 8 * sizeof(int);
|
||||
// Helper to calculate modulo mask and shift bits.
|
||||
// For kNmsBoxesPerThread=32 ModuloMask will be 31, i.e 0x1F thus
|
||||
// i % 32 == i & 31. Similarly ShiftBits will be 5 so that
|
||||
// i / 32 == i >> 5. Using these bit operations should reduce the stall on host
|
||||
// thread.
|
||||
__device__ constexpr int NumBits(int n) { return (n == 0) ? 0 : NumBits(n >> 1) + 1; }
|
||||
constexpr int kNmsBoxesPerThreadModuloMask = kNmsBoxesPerThread - 1;
|
||||
constexpr int kNmsBoxesPerThreadShiftBits =
|
||||
NumBits(kNmsBoxesPerThreadModuloMask);
|
||||
|
||||
constexpr int kNmsBlockDim = 16;
|
||||
constexpr int kNmsBlockDimMax = 128;
|
||||
constexpr int kNmsChunkSize = 2000;
|
||||
|
||||
template <typename T>
|
||||
__device__ inline void Swap(T& a, T& b) {
|
||||
T c(a);
|
||||
a = b;
|
||||
b = c;
|
||||
}
|
||||
|
||||
// Check whether two boxes have an IoU greater than threshold.
|
||||
template <typename T>
|
||||
__device__ inline bool OverThreshold(const Box* a, const Box* b,
|
||||
const float a_area,
|
||||
const T iou_threshold) {
|
||||
const float b_area = (b->x2 - b->x1) * (b->y2 - b->y1);
|
||||
if (a_area == 0.0f || b_area == 0.0f) return false;
|
||||
const float xx1 = fmaxf(a->x1, b->x1);
|
||||
const float yy1 = fmaxf(a->y1, b->y1);
|
||||
const float xx2 = fminf(a->x2, b->x2);
|
||||
const float yy2 = fminf(a->y2, b->y2);
|
||||
|
||||
// fdimf computes the positive difference between xx2+1 and xx1.
|
||||
const float w = fdimf(xx2, xx1);
|
||||
const float h = fdimf(yy2, yy1);
|
||||
const float intersection = w * h;
|
||||
|
||||
// Testing for aa/bb > t
|
||||
// eq with aa > bb*t (b is !=0)
|
||||
// avoiding divisions.
|
||||
const float aa = intersection;
|
||||
const float bb = a_area + b_area - intersection;
|
||||
const float bt = bb * iou_threshold;
|
||||
return aa >= bt;
|
||||
}
|
||||
|
||||
__device__ inline void Flipped(Box& box) {
|
||||
if (box.x1 > box.x2) Swap(box.x1, box.x2);
|
||||
if (box.y1 > box.y2) Swap(box.y1, box.y2);
|
||||
}
|
||||
template <typename T>
|
||||
__device__ inline bool CheckBit(T* bit_mask, int bit) {
|
||||
constexpr int kShiftLen = NumBits(8 * sizeof(T)) - 1;
|
||||
constexpr int kRemainderMask = 8 * sizeof(T) - 1;
|
||||
int bin = bit >> kShiftLen;
|
||||
return (bit_mask[bin] >> (bit & kRemainderMask)) & 1;
|
||||
}
|
||||
|
||||
// Produce a global bitmask (result_mask) of selected boxes from bitmask
|
||||
// generated by NMSKernel Abort early if max_boxes boxes are selected. Bitmask
|
||||
// is num_boxes*bit_mask_len bits indicating whether to keep or remove a box.
|
||||
__global__ void NMSReduce(const int* bitmask, const int bit_mask_len,
|
||||
const int num_boxes, const int64_t max_boxes,
|
||||
char* result_mask) {
|
||||
extern __shared__ int local[];
|
||||
|
||||
// set global mask to accept all boxes
|
||||
for (int box = blockIdx.x * blockDim.x + threadIdx.x; box < bit_mask_len; box += blockDim.x * gridDim.x) {
|
||||
local[box] = 0xFFFFFFFF;
|
||||
}
|
||||
__syncthreads();
|
||||
int accepted_boxes = 0;
|
||||
for (int box = 0; box < num_boxes - 1; ++box) {
|
||||
// if current box is masked by an earlier box, skip it.
|
||||
if (!CheckBit(local, box)) {
|
||||
continue;
|
||||
}
|
||||
accepted_boxes += 1;
|
||||
int offset = box * bit_mask_len;
|
||||
// update global mask with current box's mask
|
||||
for (int b = blockIdx.x * blockDim.x + threadIdx.x; b < bit_mask_len; b += blockDim.x * gridDim.x) {
|
||||
local[b] &= ~bitmask[offset + b];
|
||||
}
|
||||
__syncthreads();
|
||||
if (accepted_boxes > max_boxes) break;
|
||||
}
|
||||
// copy global mask to result_max char array. char array is needed for
|
||||
// cub::DeviceSelect later.
|
||||
for (int box = blockIdx.x * blockDim.x + threadIdx.x; box < num_boxes; box += blockDim.x * gridDim.x) {
|
||||
result_mask[box] = CheckBit(local, box);
|
||||
}
|
||||
}
|
||||
|
||||
// For each box, compute a bitmask of boxes which has an overlap with given box
|
||||
// above threshold.
|
||||
//
|
||||
// Starting from highes scoring box, mark any box which has IoU>threshold with
|
||||
// given box. Each thread processes a kNmsBoxesPerThread boxes per stride, and
|
||||
// each box has bitmask of overlaps of length bit_mask_len.
|
||||
//
|
||||
__launch_bounds__(kNmsBlockDim* kNmsBlockDim, 4) __global__
|
||||
void NMSKernel(
|
||||
const int64_t center_point_box,
|
||||
const Box* d_desc_sorted_boxes,
|
||||
const int num_boxes,
|
||||
const float iou_threshold,
|
||||
const int bit_mask_len,
|
||||
int* d_delete_mask) {
|
||||
for (int i_block_offset = blockIdx.x * blockDim.x; i_block_offset < num_boxes;
|
||||
i_block_offset += blockDim.x * gridDim.x) {
|
||||
const int i = i_block_offset + threadIdx.x;
|
||||
if (i < num_boxes) {
|
||||
for (int j_thread_offset =
|
||||
kNmsBoxesPerThread * (blockIdx.y * blockDim.y + threadIdx.y);
|
||||
j_thread_offset < num_boxes;
|
||||
j_thread_offset += kNmsBoxesPerThread * blockDim.y * gridDim.y) {
|
||||
// Note : We can do everything using multiplication,
|
||||
// and use fp16 - we are comparing against a low precision
|
||||
// threshold.
|
||||
int above_threshold = 0;
|
||||
// Make sure that threads are within valid domain.
|
||||
bool valid = false;
|
||||
// Loop over the next kNmsBoxesPerThread boxes and set corresponding bit
|
||||
// if it is overlapping with current box
|
||||
for (int ib = 0; ib < kNmsBoxesPerThread; ++ib) {
|
||||
// This thread will compare Box i and Box j.
|
||||
const int j = j_thread_offset + ib;
|
||||
if (i >= j || i >= num_boxes || j >= num_boxes) continue;
|
||||
valid = true;
|
||||
if (SuppressByIOU(reinterpret_cast<const float*>(d_desc_sorted_boxes),
|
||||
i, j, center_point_box, iou_threshold)) {
|
||||
// we have score[j] <= score[i].
|
||||
above_threshold |= (1U << ib);
|
||||
}
|
||||
}
|
||||
if (valid) {
|
||||
d_delete_mask[i * bit_mask_len + j_thread_offset / kNmsBoxesPerThread] =
|
||||
above_threshold;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Variadic template helpers for Index selecting multiple arrays at the same
|
||||
// time
|
||||
template <typename Index>
|
||||
__device__ inline void SelectHelper(const Index i_selected,
|
||||
const Index i_original) {}
|
||||
|
||||
template <typename Index, typename T, typename... Args>
|
||||
__device__ inline void SelectHelper(const Index i_selected,
|
||||
const Index i_original,
|
||||
const T* original, T* selected,
|
||||
Args... args) {
|
||||
selected[i_selected] = original[i_original];
|
||||
SelectHelper(i_selected, i_original, args...);
|
||||
}
|
||||
|
||||
// Helper template to select elements from original arrays using the index
|
||||
// mapping and store into selected array. Each array sharing same mapping need
|
||||
// to be passed as pairs of pointers to original and selected arrays. For
|
||||
// selecting 2 arrays call would be
|
||||
// IndexMultiSelect(num_elements, indices, original1 ,selected1, original2,
|
||||
// selected2).
|
||||
template <typename Index, typename T, typename... Args>
|
||||
__global__ void IndexMultiSelect(const int num_elements, const Index* indices,
|
||||
const T* original, T* selected, Args... args) {
|
||||
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num_elements; idx += blockDim.x * gridDim.x) {
|
||||
SelectHelper(idx, indices[idx], original, selected, args...);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void SetZero(const int count, T* __restrict__ ptr) {
|
||||
// Check that the grid is one dimensional and index doesn't overflow.
|
||||
assert(blockDim.y == 1);
|
||||
assert(blockDim.z == 1);
|
||||
assert(blockDim.x * gridDim.x / blockDim.x == gridDim.x);
|
||||
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x) {
|
||||
ptr[i] = T(0);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void Iota(const int num_elements, const T offset, T* to_fill) {
|
||||
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num_elements; idx += blockDim.x * gridDim.x) {
|
||||
to_fill[idx] = static_cast<T>(idx) + offset;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void NormalizeOutput(const int num_elements, const int* original, int64_t* to_normalize, int64_t batch_index, int64_t class_index) {
|
||||
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num_elements; idx += blockDim.x * gridDim.x) {
|
||||
to_normalize[idx * 3] = batch_index;
|
||||
to_normalize[idx * 3 + 1] = class_index;
|
||||
to_normalize[idx * 3 + 2] = static_cast<int64_t>(original[idx]);
|
||||
}
|
||||
}
|
||||
|
||||
Status NmsGpu(std::function<IAllocatorUniquePtr<void>(size_t)> allocator,
|
||||
const int64_t center_point_box,
|
||||
const float* d_sorted_boxes_float_ptr,
|
||||
const int num_boxes,
|
||||
const float iou_threshold,
|
||||
int* d_selected_indices,
|
||||
int* h_nkeep,
|
||||
const int64_t max_boxes) {
|
||||
// Making sure we respect the __align(16)__
|
||||
// we promised to the compiler.
|
||||
auto iptr = reinterpret_cast<std::uintptr_t>(d_sorted_boxes_float_ptr);
|
||||
ORT_ENFORCE((iptr & 15) == 0);
|
||||
|
||||
const int bit_mask_len =
|
||||
(num_boxes + kNmsBoxesPerThread - 1) / kNmsBoxesPerThread;
|
||||
int max_nms_mask_size = num_boxes * bit_mask_len;
|
||||
|
||||
IAllocatorUniquePtr<void> d_nms_mask_ptr{allocator(max_nms_mask_size * sizeof(int))};
|
||||
auto* d_nms_mask = static_cast<int*>(d_nms_mask_ptr.get());
|
||||
|
||||
int blocksPerGrid = (int)(ceil(static_cast<float>(max_nms_mask_size) / GridDim::maxThreadsPerBlock));
|
||||
SetZero<int><<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(max_nms_mask_size, d_nms_mask);
|
||||
|
||||
int* d_delete_mask = d_nms_mask;
|
||||
int* h_selected_count = h_nkeep;
|
||||
const Box* d_sorted_boxes =
|
||||
reinterpret_cast<const Box*>(d_sorted_boxes_float_ptr);
|
||||
dim3 block_dim, thread_block;
|
||||
int num_blocks = (num_boxes + kNmsBlockDim - 1) / kNmsBlockDim;
|
||||
num_blocks = std::max(std::min(num_blocks, kNmsBlockDimMax), 1);
|
||||
block_dim.x = num_blocks;
|
||||
block_dim.y = num_blocks;
|
||||
block_dim.z = 1;
|
||||
thread_block.x = kNmsBlockDim;
|
||||
thread_block.y = kNmsBlockDim;
|
||||
thread_block.z = 1;
|
||||
NMSKernel<<<block_dim, thread_block>>>(center_point_box,
|
||||
d_sorted_boxes,
|
||||
num_boxes,
|
||||
iou_threshold,
|
||||
bit_mask_len,
|
||||
d_delete_mask);
|
||||
|
||||
IAllocatorUniquePtr<void> d_selected_boxes_ptr{allocator(num_boxes * sizeof(char))};
|
||||
auto* d_selected_boxes = static_cast<char*>(d_selected_boxes_ptr.get());
|
||||
IAllocatorUniquePtr<void> d_indices_ptr{allocator(num_boxes * sizeof(int))};
|
||||
auto* d_indices = static_cast<int*>(d_indices_ptr.get());
|
||||
|
||||
blocksPerGrid = (int)(ceil(static_cast<float>(num_boxes) / GridDim::maxThreadsPerBlock));
|
||||
Iota<int><<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(num_boxes, 0, d_indices);
|
||||
|
||||
NMSReduce<<<1, 1024, bit_mask_len * sizeof(int)>>>(d_delete_mask, bit_mask_len, num_boxes, max_boxes, d_selected_boxes);
|
||||
|
||||
size_t flagged_buffer_size = 0;
|
||||
CUDA_RETURN_IF_ERROR(cub::DeviceSelect::Flagged(static_cast<void*>(nullptr), // temp_storage
|
||||
flagged_buffer_size,
|
||||
static_cast<int*>(nullptr), // input
|
||||
static_cast<char*>(nullptr), // selection flag
|
||||
static_cast<int*>(nullptr), // selected items
|
||||
static_cast<int*>(nullptr), // num_selected
|
||||
num_boxes));
|
||||
|
||||
IAllocatorUniquePtr<void> d_cub_scratch_buffer_ptr{allocator(flagged_buffer_size)};
|
||||
auto* d_cub_scratch_buffer = static_cast<uint8_t*>(d_cub_scratch_buffer_ptr.get());
|
||||
IAllocatorUniquePtr<void> d_num_selected_ptr{allocator(sizeof(int))};
|
||||
auto* d_num_selected = static_cast<int*>(d_num_selected_ptr.get());
|
||||
|
||||
CUDA_RETURN_IF_ERROR(cub::DeviceSelect::Flagged(
|
||||
d_cub_scratch_buffer, // temp_storage
|
||||
flagged_buffer_size,
|
||||
d_indices, // input
|
||||
d_selected_boxes, // selection flag
|
||||
d_selected_indices, // selected items
|
||||
d_num_selected, num_boxes));
|
||||
CUDA_RETURN_IF_ERROR(cudaMemcpy(h_selected_count, d_num_selected, sizeof(int), cudaMemcpyDeviceToHost));
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
struct DeviceGreaterThan {
|
||||
float threshold_;
|
||||
__host__ __device__ __forceinline__ DeviceGreaterThan(float threshold)
|
||||
: threshold_(threshold) {}
|
||||
|
||||
__host__ __device__ __forceinline__ bool operator()(const float& val) const {
|
||||
return (val > threshold_);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
Status NonMaxSuppressionImpl(
|
||||
std::function<IAllocatorUniquePtr<void>(size_t)> allocator,
|
||||
const PrepareContext& pc,
|
||||
const int64_t center_point_box,
|
||||
int64_t batch_index,
|
||||
int64_t class_index,
|
||||
int64_t max_output_boxes_per_class,
|
||||
float iou_threshold,
|
||||
float score_threshold,
|
||||
IAllocatorUniquePtr<void>& selected_indices,
|
||||
int* h_number_selected) {
|
||||
// STEP 1. Prepare data
|
||||
int num_boxes = pc.num_boxes_;
|
||||
const float* boxes_data = pc.boxes_data_ + batch_index * num_boxes * 4;
|
||||
const float* scores_data = pc.scores_data_ + (batch_index * pc.num_classes_ + class_index) * num_boxes;
|
||||
|
||||
// prepare temporary memory for sorting scores
|
||||
|
||||
// calculate temporary size that used for sorting
|
||||
size_t cub_sort_temp_storage_bytes = 0;
|
||||
CUDA_RETURN_IF_ERROR(cub::DeviceRadixSort::SortPairsDescending(
|
||||
nullptr, cub_sort_temp_storage_bytes,
|
||||
static_cast<float*>(nullptr), // scores
|
||||
static_cast<float*>(nullptr), // sorted scores
|
||||
static_cast<int*>(nullptr), // input indices
|
||||
static_cast<int*>(nullptr), // sorted indices
|
||||
num_boxes, // num items
|
||||
0, 8 * sizeof(float) // sort all bits
|
||||
));
|
||||
|
||||
// allocate temporary memory
|
||||
IAllocatorUniquePtr<void> d_cub_sort_buffer_ptr{allocator(cub_sort_temp_storage_bytes)};
|
||||
auto* d_cub_sort_buffer = static_cast<uint8_t*>(d_cub_sort_buffer_ptr.get());
|
||||
IAllocatorUniquePtr<void> d_indices_ptr{allocator(num_boxes * sizeof(int))};
|
||||
auto* d_indices = static_cast<int*>(d_indices_ptr.get());
|
||||
IAllocatorUniquePtr<void> d_sorted_indices_ptr{allocator(num_boxes * sizeof(int))};
|
||||
auto* d_sorted_indices = static_cast<int*>(d_sorted_indices_ptr.get());
|
||||
IAllocatorUniquePtr<void> d_selected_indices_ptr{allocator(num_boxes * sizeof(int))};
|
||||
auto* d_selected_indices = static_cast<int*>(d_selected_indices_ptr.get());
|
||||
IAllocatorUniquePtr<void> d_sorted_scores_ptr{allocator(num_boxes * sizeof(float))};
|
||||
auto* d_sorted_scores = static_cast<float*>(d_sorted_scores_ptr.get());
|
||||
IAllocatorUniquePtr<void> d_sorted_boxes_ptr{allocator(num_boxes * 4 * sizeof(float))};
|
||||
auto* d_sorted_boxes = static_cast<float*>(d_sorted_boxes_ptr.get());
|
||||
|
||||
// create sequense of indices
|
||||
int blocksPerGrid = (int)(ceil(static_cast<float>(num_boxes) / GridDim::maxThreadsPerBlock));
|
||||
Iota<int><<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(num_boxes, 0, d_indices);
|
||||
CUDA_RETURN_IF_ERROR(cudaGetLastError());
|
||||
|
||||
// sort scores
|
||||
CUDA_RETURN_IF_ERROR(cub::DeviceRadixSort::SortPairsDescending(
|
||||
d_cub_sort_buffer,
|
||||
cub_sort_temp_storage_bytes,
|
||||
scores_data,
|
||||
d_sorted_scores,
|
||||
d_indices,
|
||||
d_sorted_indices,
|
||||
num_boxes,
|
||||
0,
|
||||
8 * sizeof(float) // sort all bits
|
||||
));
|
||||
|
||||
// pick sorted scores
|
||||
const Box* original_boxes = reinterpret_cast<const Box*>(boxes_data);
|
||||
Box* sorted_boxes = reinterpret_cast<Box*>(d_sorted_boxes);
|
||||
IndexMultiSelect<int, Box><<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(num_boxes, d_sorted_indices, original_boxes, sorted_boxes);
|
||||
CUDA_RETURN_IF_ERROR(cudaGetLastError());
|
||||
|
||||
// STEP 2. filter boxes by scores
|
||||
int limited_num_boxes = num_boxes;
|
||||
if (pc.score_threshold_ != nullptr) {
|
||||
thrust::device_ptr<float> sorted_scores_device_ptr(d_sorted_scores);
|
||||
limited_num_boxes = thrust::count_if(
|
||||
sorted_scores_device_ptr,
|
||||
sorted_scores_device_ptr + num_boxes,
|
||||
DeviceGreaterThan(score_threshold));
|
||||
CUDA_RETURN_IF_ERROR(cudaGetLastError());
|
||||
|
||||
if (limited_num_boxes == 0) {
|
||||
*h_number_selected = 0;
|
||||
return Status::OK();
|
||||
}
|
||||
}
|
||||
|
||||
// STEP 3. launch NMS kernels
|
||||
ORT_RETURN_IF_ERROR(NmsGpu(allocator,
|
||||
center_point_box,
|
||||
d_sorted_boxes,
|
||||
limited_num_boxes,
|
||||
iou_threshold,
|
||||
d_selected_indices,
|
||||
h_number_selected,
|
||||
max_output_boxes_per_class));
|
||||
CUDA_RETURN_IF_ERROR(cudaGetLastError());
|
||||
|
||||
// STEP 4. map back to sorted indices
|
||||
*h_number_selected = std::min(*h_number_selected, (int)max_output_boxes_per_class);
|
||||
int num_to_keep = *h_number_selected;
|
||||
if (num_to_keep > 0) {
|
||||
IAllocatorUniquePtr<void> d_output_indices_ptr{allocator(num_to_keep * sizeof(int))};
|
||||
auto* d_output_indices = static_cast<int*>(d_output_indices_ptr.get());
|
||||
IAllocatorUniquePtr<void> d_normalized_output_indices_ptr{allocator(num_to_keep * 3 * sizeof(int64_t))};
|
||||
auto* d_normalized_output_indices = static_cast<int64_t*>(d_normalized_output_indices_ptr.get());
|
||||
|
||||
int blocksPerGrid = (int)(ceil(static_cast<float>(num_to_keep) / GridDim::maxThreadsPerBlock));
|
||||
IndexMultiSelect<int, int><<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(num_to_keep, d_selected_indices, d_sorted_indices, d_output_indices);
|
||||
NormalizeOutput<<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(num_to_keep, d_output_indices, d_normalized_output_indices, batch_index, class_index);
|
||||
CUDA_RETURN_IF_ERROR(cudaGetLastError());
|
||||
|
||||
selected_indices = std::move(d_normalized_output_indices_ptr);
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace onnxruntime
|
||||
|
|
@ -0,0 +1,29 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#pragma once
|
||||
#include <stdint.h>
|
||||
#include "core/providers/cuda/shared_inc/cuda_utils.h"
|
||||
#include "core/common/common.h"
|
||||
#include "core/framework/allocator.h"
|
||||
#include "core/framework/data_types.h"
|
||||
#include <functional>
|
||||
#include "core/providers/cpu/object_detection/non_max_suppression_helper.h"
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
||||
Status NonMaxSuppressionImpl(
|
||||
std::function<IAllocatorUniquePtr<void>(size_t)> allocator,
|
||||
const PrepareContext& pc,
|
||||
const int64_t center_point_box,
|
||||
int64_t batch_index,
|
||||
int64_t class_index,
|
||||
int64_t max_output_boxes_per_class,
|
||||
float iou_threshold,
|
||||
float score_threshold,
|
||||
IAllocatorUniquePtr<void>& selected_indices,
|
||||
int* h_number_selected);
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace onnxruntime
|
||||
Loading…
Reference in a new issue