NonMaxSuppression cuda implementation (#2082)

This commit is contained in:
Yulong Wang 2019-10-31 11:53:22 -07:00 committed by GitHub
parent 67755adfd8
commit bf7fa091cc
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 669 additions and 28 deletions

View file

@ -35,29 +35,8 @@ ONNX_OPERATOR_KERNEL_EX(
using namespace nms_helpers;
// CPU version
namespace nms_helpers {
Status GetThresholdsFromInputs(const PrepareContext& pc,
int64_t& max_output_boxes_per_class,
float& iou_threshold,
float& score_threshold) {
if (pc.max_output_boxes_per_class_ != nullptr) {
max_output_boxes_per_class = std::max<int64_t>(*pc.max_output_boxes_per_class_, 0);
}
if (pc.iou_threshold_ != nullptr) {
iou_threshold = *pc.iou_threshold_;
ORT_RETURN_IF_NOT((iou_threshold >= 0 && iou_threshold <= 1.f), "iou_threshold must be in range [0, 1].");
}
if (pc.score_threshold_ != nullptr) {
score_threshold = *pc.score_threshold_;
}
return Status::OK();
}
} // namespace nms_helpers
// This works for both CPU and GPU.
// CUDA kernel declare OrtMemTypeCPUInput for max_output_boxes_per_class(2), iou_threshold(3) and score_threshold(4)
Status NonMaxSuppressionBase::PrepareCompute(OpKernelContext* ctx, PrepareContext& pc) {
const auto* boxes_tensor = ctx->Input<Tensor>(0);
ORT_ENFORCE(boxes_tensor);
@ -111,6 +90,26 @@ Status NonMaxSuppressionBase::PrepareCompute(OpKernelContext* ctx, PrepareContex
return Status::OK();
}
Status NonMaxSuppressionBase::GetThresholdsFromInputs(const PrepareContext& pc,
int64_t& max_output_boxes_per_class,
float& iou_threshold,
float& score_threshold) {
if (pc.max_output_boxes_per_class_ != nullptr) {
max_output_boxes_per_class = std::max<int64_t>(*pc.max_output_boxes_per_class_, 0);
}
if (pc.iou_threshold_ != nullptr) {
iou_threshold = *pc.iou_threshold_;
ORT_RETURN_IF_NOT((iou_threshold >= 0 && iou_threshold <= 1.f), "iou_threshold must be in range [0, 1].");
}
if (pc.score_threshold_ != nullptr) {
score_threshold = *pc.score_threshold_;
}
return Status::OK();
}
Status NonMaxSuppression::Compute(OpKernelContext* ctx) const {
PrepareContext pc;
auto ret = PrepareCompute(ctx, pc);

View file

@ -18,6 +18,10 @@ class NonMaxSuppressionBase {
}
static Status PrepareCompute(OpKernelContext* ctx, PrepareContext& pc);
static Status GetThresholdsFromInputs(const PrepareContext& pc,
int64_t& max_output_boxes_per_class,
float& iou_threshold,
float& score_threshold);
int64_t GetCenterPointBox() const {
return center_point_box_;

View file

@ -544,24 +544,24 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, double, Less);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, MLFloat16, Less);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, EyeLike);
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, 10, Scatter);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, MLFloat16, Where);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, float, Where);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, int32_t, Where);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, int64_t, Where);
// opset 10
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, Dropout);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, float, RoiAlign);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, double, RoiAlign);
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, NonMaxSuppression);
// opset 11
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, Gemm);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, Gemm);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, Gemm);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, NonMaxSuppression);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, Range);
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, 10, Scatter);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, ScatterElements);
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 9, TopK);
@ -909,7 +909,9 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, MLFloat16, Less)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, float, RoiAlign)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, double, RoiAlign)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, NonMaxSuppression)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, EyeLike)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, 10, Scatter)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, MLFloat16, Where)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, float, Where)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, int32_t, Where)>,
@ -919,9 +921,8 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, Gemm)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, Gemm)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, Gemm)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, NonMaxSuppression)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, Range)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, 10, Scatter)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, ScatterElements)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 9, TopK)>,

View file

@ -0,0 +1,134 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "non_max_suppression.h"
#include "core/providers/cpu/object_detection/non_max_suppression_helper.h"
#include "non_max_suppression_impl.h"
#include "core/providers/cuda/tensor/concat_impl.h"
namespace onnxruntime {
namespace cuda {
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
NonMaxSuppression,
kOnnxDomain,
10, 10,
kCudaExecutionProvider,
KernelDefBuilder()
.InputMemoryType<OrtMemTypeCPUInput>(2)
.InputMemoryType<OrtMemTypeCPUInput>(3)
.InputMemoryType<OrtMemTypeCPUInput>(4),
NonMaxSuppression);
ONNX_OPERATOR_KERNEL_EX(
NonMaxSuppression,
kOnnxDomain,
11,
kCudaExecutionProvider,
KernelDefBuilder()
.InputMemoryType<OrtMemTypeCPUInput>(2)
.InputMemoryType<OrtMemTypeCPUInput>(3)
.InputMemoryType<OrtMemTypeCPUInput>(4),
NonMaxSuppression);
Status NonMaxSuppression::ComputeInternal(OpKernelContext* ctx) const {
PrepareContext pc;
auto ret = PrepareCompute(ctx, pc);
ORT_RETURN_IF_NOT(ret.IsOK(), ret.ErrorMessage());
int64_t max_output_boxes_per_class = 0;
float iou_threshold = .0f;
float score_threshold = .0f;
ret = GetThresholdsFromInputs(pc, max_output_boxes_per_class, iou_threshold, score_threshold);
ORT_RETURN_IF_NOT(ret.IsOK(), ret.ErrorMessage());
if (0 == pc.num_boxes_ || 0 == max_output_boxes_per_class) {
ctx->Output(0, {0, 3});
return Status::OK();
}
// TODO: use cub::DeviceSegmentedRadixSort::SortPairsDescending instead of cub::DeviceRadixSort::SortPairsDescending
// to deal with multi batch/class parallelly
std::vector<std::tuple<IAllocatorUniquePtr<void>, int>> all_selected_indices;
int total_num_saved_outputs = 0;
for (int64_t batch_index = 0; batch_index < pc.num_batches_; ++batch_index) {
for (int64_t class_index = 0; class_index < pc.num_classes_; ++class_index) {
IAllocatorUniquePtr<void> d_selected_indices{};
IAllocatorUniquePtr<void> h_number_selected_ptr{AllocateBufferOnCPUPinned<void>(sizeof(int))};
auto* h_number_selected = static_cast<int*>(h_number_selected_ptr.get());
ORT_RETURN_IF_ERROR(NonMaxSuppressionImpl(
[this](size_t bytes) { return GetScratchBuffer<void>(bytes); },
pc,
GetCenterPointBox(),
batch_index,
class_index,
max_output_boxes_per_class,
iou_threshold,
score_threshold,
d_selected_indices,
h_number_selected));
int num_saved_outputs = *h_number_selected;
if (num_saved_outputs > 0) {
all_selected_indices.emplace_back(std::move(d_selected_indices), num_saved_outputs);
total_num_saved_outputs += num_saved_outputs;
}
}
}
if (total_num_saved_outputs == 0) {
ctx->Output(0, {0, 3});
} else {
// concatenate outputs
const int last_dim = 3;
const int num_elements = last_dim * total_num_saved_outputs;
Tensor* output = ctx->Output(0, {static_cast<int64_t>(total_num_saved_outputs), last_dim});
ORT_ENFORCE(output != nullptr);
int64_t* dst = output->MutableData<int64_t>();
size_t count = all_selected_indices.size();
CudaAsyncBuffer<const void*> input_ptr(this, count);
CudaAsyncBuffer<int64_t> concat_sizes_gpu(this, count);
CudaAsyncBuffer<int64_t> concat_sizes_range_gpu(this, count);
CudaAsyncBuffer<int64_t> axis_dimension_input_output_mapping_gpu(this, total_num_saved_outputs);
int index = 0;
for (size_t i = 0; i < count; i++) {
auto& it = all_selected_indices[i];
auto src = std::get<0>(it).get();
auto size = std::get<1>(it);
input_ptr.CpuPtr()[i] = src;
concat_sizes_gpu.CpuPtr()[i] = size;
concat_sizes_range_gpu.CpuPtr()[i] = (i == 0) ? size : size + concat_sizes_range_gpu.CpuPtr()[i - 1];
for (int j = 0; j < size; j++) {
axis_dimension_input_output_mapping_gpu.CpuPtr()[index++] = i;
}
}
concat_sizes_gpu.CopyToGpu();
axis_dimension_input_output_mapping_gpu.CopyToGpu();
concat_sizes_range_gpu.CopyToGpu();
input_ptr.CopyToGpu();
ORT_RETURN_IF_ERROR(ConcatImpl(sizeof(int64_t),
num_elements,
last_dim,
concat_sizes_gpu.GpuPtr(),
concat_sizes_range_gpu.GpuPtr(),
axis_dimension_input_output_mapping_gpu.GpuPtr(),
static_cast<int>(count),
dst,
input_ptr.GpuPtr(),
static_cast<size_t>(num_elements)));
}
return Status::OK();
}
} // namespace cuda
}; // namespace onnxruntime

View file

@ -0,0 +1,24 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/common/common.h"
#include "core/framework/op_kernel.h"
#include "core/providers/cuda/cuda_common.h"
#include "core/providers/cpu/object_detection/non_max_suppression.h"
namespace onnxruntime {
namespace cuda {
struct NonMaxSuppression final : public CudaKernel, public NonMaxSuppressionBase {
explicit NonMaxSuppression(const OpKernelInfo& info) : CudaKernel(info), NonMaxSuppressionBase(info) {
}
Status ComputeInternal(OpKernelContext* context) const override;
private:
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(NonMaxSuppression);
};
} // namespace cuda
} // namespace onnxruntime

View file

@ -0,0 +1,450 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
/* Modifications Copyright (c) Microsoft. */
#include "non_max_suppression_impl.h"
#include "core/providers/cpu/object_detection/non_max_suppression_helper.h"
#include "core/providers/cuda/cu_inc/common.cuh"
#include "core/providers/cuda/cuda_common.h"
#include "core/framework/tensor.h"
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <cub/cub.cuh>
namespace onnxruntime {
namespace cuda {
using namespace nms_helpers;
namespace {
struct __align__(16) Box {
float x1, y1, x2, y2;
};
// This is the width of the bitmask for masking boxes for each thread.
// This needs to be a multiple of 2(a POD width usually) so that division and
// modulo can be implemented as bit operations during host selection.
constexpr int kNmsBoxesPerThread = 8 * sizeof(int);
// Helper to calculate modulo mask and shift bits.
// For kNmsBoxesPerThread=32 ModuloMask will be 31, i.e 0x1F thus
// i % 32 == i & 31. Similarly ShiftBits will be 5 so that
// i / 32 == i >> 5. Using these bit operations should reduce the stall on host
// thread.
__device__ constexpr int NumBits(int n) { return (n == 0) ? 0 : NumBits(n >> 1) + 1; }
constexpr int kNmsBoxesPerThreadModuloMask = kNmsBoxesPerThread - 1;
constexpr int kNmsBoxesPerThreadShiftBits =
NumBits(kNmsBoxesPerThreadModuloMask);
constexpr int kNmsBlockDim = 16;
constexpr int kNmsBlockDimMax = 128;
constexpr int kNmsChunkSize = 2000;
template <typename T>
__device__ inline void Swap(T& a, T& b) {
T c(a);
a = b;
b = c;
}
// Check whether two boxes have an IoU greater than threshold.
template <typename T>
__device__ inline bool OverThreshold(const Box* a, const Box* b,
const float a_area,
const T iou_threshold) {
const float b_area = (b->x2 - b->x1) * (b->y2 - b->y1);
if (a_area == 0.0f || b_area == 0.0f) return false;
const float xx1 = fmaxf(a->x1, b->x1);
const float yy1 = fmaxf(a->y1, b->y1);
const float xx2 = fminf(a->x2, b->x2);
const float yy2 = fminf(a->y2, b->y2);
// fdimf computes the positive difference between xx2+1 and xx1.
const float w = fdimf(xx2, xx1);
const float h = fdimf(yy2, yy1);
const float intersection = w * h;
// Testing for aa/bb > t
// eq with aa > bb*t (b is !=0)
// avoiding divisions.
const float aa = intersection;
const float bb = a_area + b_area - intersection;
const float bt = bb * iou_threshold;
return aa >= bt;
}
__device__ inline void Flipped(Box& box) {
if (box.x1 > box.x2) Swap(box.x1, box.x2);
if (box.y1 > box.y2) Swap(box.y1, box.y2);
}
template <typename T>
__device__ inline bool CheckBit(T* bit_mask, int bit) {
constexpr int kShiftLen = NumBits(8 * sizeof(T)) - 1;
constexpr int kRemainderMask = 8 * sizeof(T) - 1;
int bin = bit >> kShiftLen;
return (bit_mask[bin] >> (bit & kRemainderMask)) & 1;
}
// Produce a global bitmask (result_mask) of selected boxes from bitmask
// generated by NMSKernel Abort early if max_boxes boxes are selected. Bitmask
// is num_boxes*bit_mask_len bits indicating whether to keep or remove a box.
__global__ void NMSReduce(const int* bitmask, const int bit_mask_len,
const int num_boxes, const int64_t max_boxes,
char* result_mask) {
extern __shared__ int local[];
// set global mask to accept all boxes
for (int box = blockIdx.x * blockDim.x + threadIdx.x; box < bit_mask_len; box += blockDim.x * gridDim.x) {
local[box] = 0xFFFFFFFF;
}
__syncthreads();
int accepted_boxes = 0;
for (int box = 0; box < num_boxes - 1; ++box) {
// if current box is masked by an earlier box, skip it.
if (!CheckBit(local, box)) {
continue;
}
accepted_boxes += 1;
int offset = box * bit_mask_len;
// update global mask with current box's mask
for (int b = blockIdx.x * blockDim.x + threadIdx.x; b < bit_mask_len; b += blockDim.x * gridDim.x) {
local[b] &= ~bitmask[offset + b];
}
__syncthreads();
if (accepted_boxes > max_boxes) break;
}
// copy global mask to result_max char array. char array is needed for
// cub::DeviceSelect later.
for (int box = blockIdx.x * blockDim.x + threadIdx.x; box < num_boxes; box += blockDim.x * gridDim.x) {
result_mask[box] = CheckBit(local, box);
}
}
// For each box, compute a bitmask of boxes which has an overlap with given box
// above threshold.
//
// Starting from highes scoring box, mark any box which has IoU>threshold with
// given box. Each thread processes a kNmsBoxesPerThread boxes per stride, and
// each box has bitmask of overlaps of length bit_mask_len.
//
__launch_bounds__(kNmsBlockDim* kNmsBlockDim, 4) __global__
void NMSKernel(
const int64_t center_point_box,
const Box* d_desc_sorted_boxes,
const int num_boxes,
const float iou_threshold,
const int bit_mask_len,
int* d_delete_mask) {
for (int i_block_offset = blockIdx.x * blockDim.x; i_block_offset < num_boxes;
i_block_offset += blockDim.x * gridDim.x) {
const int i = i_block_offset + threadIdx.x;
if (i < num_boxes) {
for (int j_thread_offset =
kNmsBoxesPerThread * (blockIdx.y * blockDim.y + threadIdx.y);
j_thread_offset < num_boxes;
j_thread_offset += kNmsBoxesPerThread * blockDim.y * gridDim.y) {
// Note : We can do everything using multiplication,
// and use fp16 - we are comparing against a low precision
// threshold.
int above_threshold = 0;
// Make sure that threads are within valid domain.
bool valid = false;
// Loop over the next kNmsBoxesPerThread boxes and set corresponding bit
// if it is overlapping with current box
for (int ib = 0; ib < kNmsBoxesPerThread; ++ib) {
// This thread will compare Box i and Box j.
const int j = j_thread_offset + ib;
if (i >= j || i >= num_boxes || j >= num_boxes) continue;
valid = true;
if (SuppressByIOU(reinterpret_cast<const float*>(d_desc_sorted_boxes),
i, j, center_point_box, iou_threshold)) {
// we have score[j] <= score[i].
above_threshold |= (1U << ib);
}
}
if (valid) {
d_delete_mask[i * bit_mask_len + j_thread_offset / kNmsBoxesPerThread] =
above_threshold;
}
}
}
}
}
// Variadic template helpers for Index selecting multiple arrays at the same
// time
template <typename Index>
__device__ inline void SelectHelper(const Index i_selected,
const Index i_original) {}
template <typename Index, typename T, typename... Args>
__device__ inline void SelectHelper(const Index i_selected,
const Index i_original,
const T* original, T* selected,
Args... args) {
selected[i_selected] = original[i_original];
SelectHelper(i_selected, i_original, args...);
}
// Helper template to select elements from original arrays using the index
// mapping and store into selected array. Each array sharing same mapping need
// to be passed as pairs of pointers to original and selected arrays. For
// selecting 2 arrays call would be
// IndexMultiSelect(num_elements, indices, original1 ,selected1, original2,
// selected2).
template <typename Index, typename T, typename... Args>
__global__ void IndexMultiSelect(const int num_elements, const Index* indices,
const T* original, T* selected, Args... args) {
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num_elements; idx += blockDim.x * gridDim.x) {
SelectHelper(idx, indices[idx], original, selected, args...);
}
}
template <typename T>
__global__ void SetZero(const int count, T* __restrict__ ptr) {
// Check that the grid is one dimensional and index doesn't overflow.
assert(blockDim.y == 1);
assert(blockDim.z == 1);
assert(blockDim.x * gridDim.x / blockDim.x == gridDim.x);
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x) {
ptr[i] = T(0);
}
}
template <typename T>
__global__ void Iota(const int num_elements, const T offset, T* to_fill) {
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num_elements; idx += blockDim.x * gridDim.x) {
to_fill[idx] = static_cast<T>(idx) + offset;
}
}
__global__ void NormalizeOutput(const int num_elements, const int* original, int64_t* to_normalize, int64_t batch_index, int64_t class_index) {
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num_elements; idx += blockDim.x * gridDim.x) {
to_normalize[idx * 3] = batch_index;
to_normalize[idx * 3 + 1] = class_index;
to_normalize[idx * 3 + 2] = static_cast<int64_t>(original[idx]);
}
}
Status NmsGpu(std::function<IAllocatorUniquePtr<void>(size_t)> allocator,
const int64_t center_point_box,
const float* d_sorted_boxes_float_ptr,
const int num_boxes,
const float iou_threshold,
int* d_selected_indices,
int* h_nkeep,
const int64_t max_boxes) {
// Making sure we respect the __align(16)__
// we promised to the compiler.
auto iptr = reinterpret_cast<std::uintptr_t>(d_sorted_boxes_float_ptr);
ORT_ENFORCE((iptr & 15) == 0);
const int bit_mask_len =
(num_boxes + kNmsBoxesPerThread - 1) / kNmsBoxesPerThread;
int max_nms_mask_size = num_boxes * bit_mask_len;
IAllocatorUniquePtr<void> d_nms_mask_ptr{allocator(max_nms_mask_size * sizeof(int))};
auto* d_nms_mask = static_cast<int*>(d_nms_mask_ptr.get());
int blocksPerGrid = (int)(ceil(static_cast<float>(max_nms_mask_size) / GridDim::maxThreadsPerBlock));
SetZero<int><<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(max_nms_mask_size, d_nms_mask);
int* d_delete_mask = d_nms_mask;
int* h_selected_count = h_nkeep;
const Box* d_sorted_boxes =
reinterpret_cast<const Box*>(d_sorted_boxes_float_ptr);
dim3 block_dim, thread_block;
int num_blocks = (num_boxes + kNmsBlockDim - 1) / kNmsBlockDim;
num_blocks = std::max(std::min(num_blocks, kNmsBlockDimMax), 1);
block_dim.x = num_blocks;
block_dim.y = num_blocks;
block_dim.z = 1;
thread_block.x = kNmsBlockDim;
thread_block.y = kNmsBlockDim;
thread_block.z = 1;
NMSKernel<<<block_dim, thread_block>>>(center_point_box,
d_sorted_boxes,
num_boxes,
iou_threshold,
bit_mask_len,
d_delete_mask);
IAllocatorUniquePtr<void> d_selected_boxes_ptr{allocator(num_boxes * sizeof(char))};
auto* d_selected_boxes = static_cast<char*>(d_selected_boxes_ptr.get());
IAllocatorUniquePtr<void> d_indices_ptr{allocator(num_boxes * sizeof(int))};
auto* d_indices = static_cast<int*>(d_indices_ptr.get());
blocksPerGrid = (int)(ceil(static_cast<float>(num_boxes) / GridDim::maxThreadsPerBlock));
Iota<int><<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(num_boxes, 0, d_indices);
NMSReduce<<<1, 1024, bit_mask_len * sizeof(int)>>>(d_delete_mask, bit_mask_len, num_boxes, max_boxes, d_selected_boxes);
size_t flagged_buffer_size = 0;
CUDA_RETURN_IF_ERROR(cub::DeviceSelect::Flagged(static_cast<void*>(nullptr), // temp_storage
flagged_buffer_size,
static_cast<int*>(nullptr), // input
static_cast<char*>(nullptr), // selection flag
static_cast<int*>(nullptr), // selected items
static_cast<int*>(nullptr), // num_selected
num_boxes));
IAllocatorUniquePtr<void> d_cub_scratch_buffer_ptr{allocator(flagged_buffer_size)};
auto* d_cub_scratch_buffer = static_cast<uint8_t*>(d_cub_scratch_buffer_ptr.get());
IAllocatorUniquePtr<void> d_num_selected_ptr{allocator(sizeof(int))};
auto* d_num_selected = static_cast<int*>(d_num_selected_ptr.get());
CUDA_RETURN_IF_ERROR(cub::DeviceSelect::Flagged(
d_cub_scratch_buffer, // temp_storage
flagged_buffer_size,
d_indices, // input
d_selected_boxes, // selection flag
d_selected_indices, // selected items
d_num_selected, num_boxes));
CUDA_RETURN_IF_ERROR(cudaMemcpy(h_selected_count, d_num_selected, sizeof(int), cudaMemcpyDeviceToHost));
return Status::OK();
}
struct DeviceGreaterThan {
float threshold_;
__host__ __device__ __forceinline__ DeviceGreaterThan(float threshold)
: threshold_(threshold) {}
__host__ __device__ __forceinline__ bool operator()(const float& val) const {
return (val > threshold_);
}
};
} // namespace
Status NonMaxSuppressionImpl(
std::function<IAllocatorUniquePtr<void>(size_t)> allocator,
const PrepareContext& pc,
const int64_t center_point_box,
int64_t batch_index,
int64_t class_index,
int64_t max_output_boxes_per_class,
float iou_threshold,
float score_threshold,
IAllocatorUniquePtr<void>& selected_indices,
int* h_number_selected) {
// STEP 1. Prepare data
int num_boxes = pc.num_boxes_;
const float* boxes_data = pc.boxes_data_ + batch_index * num_boxes * 4;
const float* scores_data = pc.scores_data_ + (batch_index * pc.num_classes_ + class_index) * num_boxes;
// prepare temporary memory for sorting scores
// calculate temporary size that used for sorting
size_t cub_sort_temp_storage_bytes = 0;
CUDA_RETURN_IF_ERROR(cub::DeviceRadixSort::SortPairsDescending(
nullptr, cub_sort_temp_storage_bytes,
static_cast<float*>(nullptr), // scores
static_cast<float*>(nullptr), // sorted scores
static_cast<int*>(nullptr), // input indices
static_cast<int*>(nullptr), // sorted indices
num_boxes, // num items
0, 8 * sizeof(float) // sort all bits
));
// allocate temporary memory
IAllocatorUniquePtr<void> d_cub_sort_buffer_ptr{allocator(cub_sort_temp_storage_bytes)};
auto* d_cub_sort_buffer = static_cast<uint8_t*>(d_cub_sort_buffer_ptr.get());
IAllocatorUniquePtr<void> d_indices_ptr{allocator(num_boxes * sizeof(int))};
auto* d_indices = static_cast<int*>(d_indices_ptr.get());
IAllocatorUniquePtr<void> d_sorted_indices_ptr{allocator(num_boxes * sizeof(int))};
auto* d_sorted_indices = static_cast<int*>(d_sorted_indices_ptr.get());
IAllocatorUniquePtr<void> d_selected_indices_ptr{allocator(num_boxes * sizeof(int))};
auto* d_selected_indices = static_cast<int*>(d_selected_indices_ptr.get());
IAllocatorUniquePtr<void> d_sorted_scores_ptr{allocator(num_boxes * sizeof(float))};
auto* d_sorted_scores = static_cast<float*>(d_sorted_scores_ptr.get());
IAllocatorUniquePtr<void> d_sorted_boxes_ptr{allocator(num_boxes * 4 * sizeof(float))};
auto* d_sorted_boxes = static_cast<float*>(d_sorted_boxes_ptr.get());
// create sequense of indices
int blocksPerGrid = (int)(ceil(static_cast<float>(num_boxes) / GridDim::maxThreadsPerBlock));
Iota<int><<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(num_boxes, 0, d_indices);
CUDA_RETURN_IF_ERROR(cudaGetLastError());
// sort scores
CUDA_RETURN_IF_ERROR(cub::DeviceRadixSort::SortPairsDescending(
d_cub_sort_buffer,
cub_sort_temp_storage_bytes,
scores_data,
d_sorted_scores,
d_indices,
d_sorted_indices,
num_boxes,
0,
8 * sizeof(float) // sort all bits
));
// pick sorted scores
const Box* original_boxes = reinterpret_cast<const Box*>(boxes_data);
Box* sorted_boxes = reinterpret_cast<Box*>(d_sorted_boxes);
IndexMultiSelect<int, Box><<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(num_boxes, d_sorted_indices, original_boxes, sorted_boxes);
CUDA_RETURN_IF_ERROR(cudaGetLastError());
// STEP 2. filter boxes by scores
int limited_num_boxes = num_boxes;
if (pc.score_threshold_ != nullptr) {
thrust::device_ptr<float> sorted_scores_device_ptr(d_sorted_scores);
limited_num_boxes = thrust::count_if(
sorted_scores_device_ptr,
sorted_scores_device_ptr + num_boxes,
DeviceGreaterThan(score_threshold));
CUDA_RETURN_IF_ERROR(cudaGetLastError());
if (limited_num_boxes == 0) {
*h_number_selected = 0;
return Status::OK();
}
}
// STEP 3. launch NMS kernels
ORT_RETURN_IF_ERROR(NmsGpu(allocator,
center_point_box,
d_sorted_boxes,
limited_num_boxes,
iou_threshold,
d_selected_indices,
h_number_selected,
max_output_boxes_per_class));
CUDA_RETURN_IF_ERROR(cudaGetLastError());
// STEP 4. map back to sorted indices
*h_number_selected = std::min(*h_number_selected, (int)max_output_boxes_per_class);
int num_to_keep = *h_number_selected;
if (num_to_keep > 0) {
IAllocatorUniquePtr<void> d_output_indices_ptr{allocator(num_to_keep * sizeof(int))};
auto* d_output_indices = static_cast<int*>(d_output_indices_ptr.get());
IAllocatorUniquePtr<void> d_normalized_output_indices_ptr{allocator(num_to_keep * 3 * sizeof(int64_t))};
auto* d_normalized_output_indices = static_cast<int64_t*>(d_normalized_output_indices_ptr.get());
int blocksPerGrid = (int)(ceil(static_cast<float>(num_to_keep) / GridDim::maxThreadsPerBlock));
IndexMultiSelect<int, int><<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(num_to_keep, d_selected_indices, d_sorted_indices, d_output_indices);
NormalizeOutput<<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(num_to_keep, d_output_indices, d_normalized_output_indices, batch_index, class_index);
CUDA_RETURN_IF_ERROR(cudaGetLastError());
selected_indices = std::move(d_normalized_output_indices_ptr);
}
return Status::OK();
}
} // namespace cuda
} // namespace onnxruntime

View file

@ -0,0 +1,29 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/cuda/shared_inc/cuda_utils.h"
#include "core/common/common.h"
#include "core/framework/allocator.h"
#include "core/framework/data_types.h"
#include <functional>
#include "core/providers/cpu/object_detection/non_max_suppression_helper.h"
namespace onnxruntime {
namespace cuda {
Status NonMaxSuppressionImpl(
std::function<IAllocatorUniquePtr<void>(size_t)> allocator,
const PrepareContext& pc,
const int64_t center_point_box,
int64_t batch_index,
int64_t class_index,
int64_t max_output_boxes_per_class,
float iou_threshold,
float score_threshold,
IAllocatorUniquePtr<void>& selected_indices,
int* h_number_selected);
} // namespace cuda
} // namespace onnxruntime