diff --git a/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.cc b/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.cc
index 87d21ab321..0b5f78919b 100644
--- a/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.cc
+++ b/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.cc
@@ -35,29 +35,8 @@ ONNX_OPERATOR_KERNEL_EX(
 
 using namespace nms_helpers;
 
-// CPU version
-namespace nms_helpers {
-Status GetThresholdsFromInputs(const PrepareContext& pc,
-                               int64_t& max_output_boxes_per_class,
-                               float& iou_threshold,
-                               float& score_threshold) {
-  if (pc.max_output_boxes_per_class_ != nullptr) {
-    max_output_boxes_per_class = std::max<int64_t>(*pc.max_output_boxes_per_class_, 0);
-  }
-
-  if (pc.iou_threshold_ != nullptr) {
-    iou_threshold = *pc.iou_threshold_;
-    ORT_RETURN_IF_NOT((iou_threshold >= 0 && iou_threshold <= 1.f), "iou_threshold must be in range [0, 1].");
-  }
-
-  if (pc.score_threshold_ != nullptr) {
-    score_threshold = *pc.score_threshold_;
-  }
-
-  return Status::OK();
-}
-}  // namespace nms_helpers
-
+// This works for both CPU and GPU.
+// CUDA kernel declare OrtMemTypeCPUInput for max_output_boxes_per_class(2), iou_threshold(3) and score_threshold(4)
 Status NonMaxSuppressionBase::PrepareCompute(OpKernelContext* ctx, PrepareContext& pc) {
   const auto* boxes_tensor = ctx->Input<Tensor>(0);
   ORT_ENFORCE(boxes_tensor);
@@ -111,6 +90,26 @@ Status NonMaxSuppressionBase::PrepareCompute(OpKernelContext* ctx, PrepareContex
   return Status::OK();
 }
 
+Status NonMaxSuppressionBase::GetThresholdsFromInputs(const PrepareContext& pc,
+                                                      int64_t& max_output_boxes_per_class,
+                                                      float& iou_threshold,
+                                                      float& score_threshold) {
+  if (pc.max_output_boxes_per_class_ != nullptr) {
+    max_output_boxes_per_class = std::max<int64_t>(*pc.max_output_boxes_per_class_, 0);
+  }
+
+  if (pc.iou_threshold_ != nullptr) {
+    iou_threshold = *pc.iou_threshold_;
+    ORT_RETURN_IF_NOT((iou_threshold >= 0 && iou_threshold <= 1.f), "iou_threshold must be in range [0, 1].");
+  }
+
+  if (pc.score_threshold_ != nullptr) {
+    score_threshold = *pc.score_threshold_;
+  }
+
+  return Status::OK();
+}
+
 Status NonMaxSuppression::Compute(OpKernelContext* ctx) const {
   PrepareContext pc;
   auto ret = PrepareCompute(ctx, pc);
diff --git a/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.h b/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.h
index 37578539cd..f6465ea3ee 100644
--- a/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.h
+++ b/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.h
@@ -18,6 +18,10 @@ class NonMaxSuppressionBase {
   }
 
   static Status PrepareCompute(OpKernelContext* ctx, PrepareContext& pc);
+  static Status GetThresholdsFromInputs(const PrepareContext& pc,
+                                        int64_t& max_output_boxes_per_class,
+                                        float& iou_threshold,
+                                        float& score_threshold);
 
   int64_t GetCenterPointBox() const {
     return center_point_box_;
diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
index 17b3d68477..f964a98c0b 100644
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -544,24 +544,24 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, double, Less);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, MLFloat16, Less);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, EyeLike);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, 10, Scatter);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, MLFloat16, Where);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, float, Where);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, int32_t, Where);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, int64_t, Where);
 
-
 // opset 10
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, Dropout);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, float, RoiAlign);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, double, RoiAlign);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, NonMaxSuppression);
 
 // opset 11
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, Gemm);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, Gemm);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, Gemm);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, NonMaxSuppression);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, Range);
-
-class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, 10, Scatter);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, ScatterElements);
 
 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 9, TopK);
@@ -909,7 +909,9 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, MLFloat16, Less)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, float, RoiAlign)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, double, RoiAlign)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, NonMaxSuppression)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, EyeLike)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, 10, Scatter)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, MLFloat16, Where)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, float, Where)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, int32_t, Where)>,
@@ -919,9 +921,8 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, Gemm)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, Gemm)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, Gemm)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, NonMaxSuppression)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, Range)>,
-
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, 10, Scatter)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, ScatterElements)>,
 
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 9, TopK)>,
diff --git a/onnxruntime/core/providers/cuda/object_detection/non_max_suppression.cc b/onnxruntime/core/providers/cuda/object_detection/non_max_suppression.cc
new file mode 100644
index 0000000000..039221bdf0
--- /dev/null
+++ b/onnxruntime/core/providers/cuda/object_detection/non_max_suppression.cc
@@ -0,0 +1,134 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "non_max_suppression.h"
+#include "core/providers/cpu/object_detection/non_max_suppression_helper.h"
+#include "non_max_suppression_impl.h"
+#include "core/providers/cuda/tensor/concat_impl.h"
+
+namespace onnxruntime {
+namespace cuda {
+
+ONNX_OPERATOR_VERSIONED_KERNEL_EX(
+    NonMaxSuppression,
+    kOnnxDomain,
+    10, 10,
+    kCudaExecutionProvider,
+    KernelDefBuilder()
+        .InputMemoryType<OrtMemTypeCPUInput>(2)
+        .InputMemoryType<OrtMemTypeCPUInput>(3)
+        .InputMemoryType<OrtMemTypeCPUInput>(4),
+    NonMaxSuppression);
+
+ONNX_OPERATOR_KERNEL_EX(
+    NonMaxSuppression,
+    kOnnxDomain,
+    11,
+    kCudaExecutionProvider,
+    KernelDefBuilder()
+        .InputMemoryType<OrtMemTypeCPUInput>(2)
+        .InputMemoryType<OrtMemTypeCPUInput>(3)
+        .InputMemoryType<OrtMemTypeCPUInput>(4),
+    NonMaxSuppression);
+
+Status NonMaxSuppression::ComputeInternal(OpKernelContext* ctx) const {
+  PrepareContext pc;
+  auto ret = PrepareCompute(ctx, pc);
+  ORT_RETURN_IF_NOT(ret.IsOK(), ret.ErrorMessage());
+
+  int64_t max_output_boxes_per_class = 0;
+  float iou_threshold = .0f;
+  float score_threshold = .0f;
+
+  ret = GetThresholdsFromInputs(pc, max_output_boxes_per_class, iou_threshold, score_threshold);
+  ORT_RETURN_IF_NOT(ret.IsOK(), ret.ErrorMessage());
+
+  if (0 == pc.num_boxes_ || 0 == max_output_boxes_per_class) {
+    ctx->Output(0, {0, 3});
+    return Status::OK();
+  }
+
+  // TODO: use cub::DeviceSegmentedRadixSort::SortPairsDescending instead of cub::DeviceRadixSort::SortPairsDescending
+  //       to deal with multi batch/class parallelly
+
+  std::vector<std::tuple<IAllocatorUniquePtr<void>, int>> all_selected_indices;
+  int total_num_saved_outputs = 0;
+
+  for (int64_t batch_index = 0; batch_index < pc.num_batches_; ++batch_index) {
+    for (int64_t class_index = 0; class_index < pc.num_classes_; ++class_index) {
+      IAllocatorUniquePtr<void> d_selected_indices{};
+      IAllocatorUniquePtr<void> h_number_selected_ptr{AllocateBufferOnCPUPinned<void>(sizeof(int))};
+      auto* h_number_selected = static_cast<int*>(h_number_selected_ptr.get());
+
+      ORT_RETURN_IF_ERROR(NonMaxSuppressionImpl(
+          [this](size_t bytes) { return GetScratchBuffer<void>(bytes); },
+          pc,
+          GetCenterPointBox(),
+          batch_index,
+          class_index,
+          max_output_boxes_per_class,
+          iou_threshold,
+          score_threshold,
+          d_selected_indices,
+          h_number_selected));
+
+      int num_saved_outputs = *h_number_selected;
+      if (num_saved_outputs > 0) {
+        all_selected_indices.emplace_back(std::move(d_selected_indices), num_saved_outputs);
+        total_num_saved_outputs += num_saved_outputs;
+      }
+    }
+  }
+
+  if (total_num_saved_outputs == 0) {
+    ctx->Output(0, {0, 3});
+  } else {
+    // concatenate outputs
+    const int last_dim = 3;
+    const int num_elements = last_dim * total_num_saved_outputs;
+    Tensor* output = ctx->Output(0, {static_cast<int64_t>(total_num_saved_outputs), last_dim});
+    ORT_ENFORCE(output != nullptr);
+    int64_t* dst = output->MutableData<int64_t>();
+    size_t count = all_selected_indices.size();
+
+    CudaAsyncBuffer<const void*> input_ptr(this, count);
+    CudaAsyncBuffer<int64_t> concat_sizes_gpu(this, count);
+    CudaAsyncBuffer<int64_t> concat_sizes_range_gpu(this, count);
+    CudaAsyncBuffer<int64_t> axis_dimension_input_output_mapping_gpu(this, total_num_saved_outputs);
+
+    int index = 0;
+    for (size_t i = 0; i < count; i++) {
+      auto& it = all_selected_indices[i];
+      auto src = std::get<0>(it).get();
+      auto size = std::get<1>(it);
+
+      input_ptr.CpuPtr()[i] = src;
+      concat_sizes_gpu.CpuPtr()[i] = size;
+      concat_sizes_range_gpu.CpuPtr()[i] = (i == 0) ? size : size + concat_sizes_range_gpu.CpuPtr()[i - 1];
+      for (int j = 0; j < size; j++) {
+        axis_dimension_input_output_mapping_gpu.CpuPtr()[index++] = i;
+      }
+    }
+
+    concat_sizes_gpu.CopyToGpu();
+    axis_dimension_input_output_mapping_gpu.CopyToGpu();
+    concat_sizes_range_gpu.CopyToGpu();
+    input_ptr.CopyToGpu();
+
+    ORT_RETURN_IF_ERROR(ConcatImpl(sizeof(int64_t),
+                                   num_elements,
+                                   last_dim,
+                                   concat_sizes_gpu.GpuPtr(),
+                                   concat_sizes_range_gpu.GpuPtr(),
+                                   axis_dimension_input_output_mapping_gpu.GpuPtr(),
+                                   static_cast<int>(count),
+                                   dst,
+                                   input_ptr.GpuPtr(),
+                                   static_cast<size_t>(num_elements)));
+  }
+
+  return Status::OK();
+}
+
+}  // namespace cuda
+};  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cuda/object_detection/non_max_suppression.h b/onnxruntime/core/providers/cuda/object_detection/non_max_suppression.h
new file mode 100644
index 0000000000..7a03126294
--- /dev/null
+++ b/onnxruntime/core/providers/cuda/object_detection/non_max_suppression.h
@@ -0,0 +1,24 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/common/common.h"
+#include "core/framework/op_kernel.h"
+#include "core/providers/cuda/cuda_common.h"
+#include "core/providers/cpu/object_detection/non_max_suppression.h"
+
+namespace onnxruntime {
+namespace cuda {
+
+struct NonMaxSuppression final : public CudaKernel, public NonMaxSuppressionBase {
+  explicit NonMaxSuppression(const OpKernelInfo& info) : CudaKernel(info), NonMaxSuppressionBase(info) {
+  }
+
+  Status ComputeInternal(OpKernelContext* context) const override;
+
+ private:
+  ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(NonMaxSuppression);
+};
+}  // namespace cuda
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cuda/object_detection/non_max_suppression_impl.cu b/onnxruntime/core/providers/cuda/object_detection/non_max_suppression_impl.cu
new file mode 100644
index 0000000000..8d04fc2d4d
--- /dev/null
+++ b/onnxruntime/core/providers/cuda/object_detection/non_max_suppression_impl.cu
@@ -0,0 +1,450 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+/* Modifications Copyright (c) Microsoft. */
+
+#include "non_max_suppression_impl.h"
+#include "core/providers/cpu/object_detection/non_max_suppression_helper.h"
+#include "core/providers/cuda/cu_inc/common.cuh"
+#include "core/providers/cuda/cuda_common.h"
+
+#include "core/framework/tensor.h"
+
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+
+#include <cub/cub.cuh>
+
+namespace onnxruntime {
+namespace cuda {
+
+using namespace nms_helpers;
+
+namespace {
+
+struct __align__(16) Box {
+  float x1, y1, x2, y2;
+};
+
+// This is the width of the bitmask for masking boxes for each thread.
+// This needs to be a multiple of 2(a POD width usually) so that division and
+// modulo can be implemented as bit operations during host selection.
+constexpr int kNmsBoxesPerThread = 8 * sizeof(int);
+// Helper to calculate modulo mask and shift bits.
+// For kNmsBoxesPerThread=32 ModuloMask will be 31, i.e 0x1F thus
+// i % 32 == i & 31. Similarly ShiftBits will be 5 so that
+// i / 32 == i >> 5. Using these bit operations should reduce the stall on host
+// thread.
+__device__ constexpr int NumBits(int n) { return (n == 0) ? 0 : NumBits(n >> 1) + 1; }
+constexpr int kNmsBoxesPerThreadModuloMask = kNmsBoxesPerThread - 1;
+constexpr int kNmsBoxesPerThreadShiftBits =
+    NumBits(kNmsBoxesPerThreadModuloMask);
+
+constexpr int kNmsBlockDim = 16;
+constexpr int kNmsBlockDimMax = 128;
+constexpr int kNmsChunkSize = 2000;
+
+template <typename T>
+__device__ inline void Swap(T& a, T& b) {
+  T c(a);
+  a = b;
+  b = c;
+}
+
+// Check whether two boxes have an IoU greater than threshold.
+template <typename T>
+__device__ inline bool OverThreshold(const Box* a, const Box* b,
+                                     const float a_area,
+                                     const T iou_threshold) {
+  const float b_area = (b->x2 - b->x1) * (b->y2 - b->y1);
+  if (a_area == 0.0f || b_area == 0.0f) return false;
+  const float xx1 = fmaxf(a->x1, b->x1);
+  const float yy1 = fmaxf(a->y1, b->y1);
+  const float xx2 = fminf(a->x2, b->x2);
+  const float yy2 = fminf(a->y2, b->y2);
+
+  // fdimf computes the positive difference between xx2+1 and xx1.
+  const float w = fdimf(xx2, xx1);
+  const float h = fdimf(yy2, yy1);
+  const float intersection = w * h;
+
+  // Testing for aa/bb > t
+  // eq with aa > bb*t (b is !=0)
+  // avoiding divisions.
+  const float aa = intersection;
+  const float bb = a_area + b_area - intersection;
+  const float bt = bb * iou_threshold;
+  return aa >= bt;
+}
+
+__device__ inline void Flipped(Box& box) {
+  if (box.x1 > box.x2) Swap(box.x1, box.x2);
+  if (box.y1 > box.y2) Swap(box.y1, box.y2);
+}
+template <typename T>
+__device__ inline bool CheckBit(T* bit_mask, int bit) {
+  constexpr int kShiftLen = NumBits(8 * sizeof(T)) - 1;
+  constexpr int kRemainderMask = 8 * sizeof(T) - 1;
+  int bin = bit >> kShiftLen;
+  return (bit_mask[bin] >> (bit & kRemainderMask)) & 1;
+}
+
+// Produce a global bitmask (result_mask) of selected boxes from bitmask
+// generated by NMSKernel Abort early if max_boxes boxes are selected. Bitmask
+// is num_boxes*bit_mask_len bits indicating whether to keep or remove a box.
+__global__ void NMSReduce(const int* bitmask, const int bit_mask_len,
+                          const int num_boxes, const int64_t max_boxes,
+                          char* result_mask) {
+  extern __shared__ int local[];
+
+  // set global mask to accept all boxes
+  for (int box = blockIdx.x * blockDim.x + threadIdx.x; box < bit_mask_len; box += blockDim.x * gridDim.x) {
+    local[box] = 0xFFFFFFFF;
+  }
+  __syncthreads();
+  int accepted_boxes = 0;
+  for (int box = 0; box < num_boxes - 1; ++box) {
+    // if current box is masked by an earlier box, skip it.
+    if (!CheckBit(local, box)) {
+      continue;
+    }
+    accepted_boxes += 1;
+    int offset = box * bit_mask_len;
+    // update global mask with current box's mask
+    for (int b = blockIdx.x * blockDim.x + threadIdx.x; b < bit_mask_len; b += blockDim.x * gridDim.x) {
+      local[b] &= ~bitmask[offset + b];
+    }
+    __syncthreads();
+    if (accepted_boxes > max_boxes) break;
+  }
+  // copy global mask to result_max char array. char array is needed for
+  // cub::DeviceSelect later.
+  for (int box = blockIdx.x * blockDim.x + threadIdx.x; box < num_boxes; box += blockDim.x * gridDim.x) {
+    result_mask[box] = CheckBit(local, box);
+  }
+}
+
+// For each box, compute a bitmask of boxes which has an overlap with given box
+// above threshold.
+//
+// Starting from highes scoring box, mark any box which has IoU>threshold with
+// given box. Each thread processes a kNmsBoxesPerThread boxes per stride, and
+// each box has bitmask of overlaps of length bit_mask_len.
+//
+__launch_bounds__(kNmsBlockDim* kNmsBlockDim, 4) __global__
+    void NMSKernel(
+        const int64_t center_point_box,
+        const Box* d_desc_sorted_boxes,
+        const int num_boxes,
+        const float iou_threshold,
+        const int bit_mask_len,
+        int* d_delete_mask) {
+  for (int i_block_offset = blockIdx.x * blockDim.x; i_block_offset < num_boxes;
+       i_block_offset += blockDim.x * gridDim.x) {
+    const int i = i_block_offset + threadIdx.x;
+    if (i < num_boxes) {
+      for (int j_thread_offset =
+               kNmsBoxesPerThread * (blockIdx.y * blockDim.y + threadIdx.y);
+           j_thread_offset < num_boxes;
+           j_thread_offset += kNmsBoxesPerThread * blockDim.y * gridDim.y) {
+        // Note : We can do everything using multiplication,
+        // and use fp16 - we are comparing against a low precision
+        // threshold.
+        int above_threshold = 0;
+        // Make sure that threads are within valid domain.
+        bool valid = false;
+        // Loop over the next kNmsBoxesPerThread boxes and set corresponding bit
+        // if it is overlapping with current box
+        for (int ib = 0; ib < kNmsBoxesPerThread; ++ib) {
+          // This thread will compare Box i and Box j.
+          const int j = j_thread_offset + ib;
+          if (i >= j || i >= num_boxes || j >= num_boxes) continue;
+          valid = true;
+          if (SuppressByIOU(reinterpret_cast<const float*>(d_desc_sorted_boxes),
+                            i, j, center_point_box, iou_threshold)) {
+            // we have score[j] <= score[i].
+            above_threshold |= (1U << ib);
+          }
+        }
+        if (valid) {
+          d_delete_mask[i * bit_mask_len + j_thread_offset / kNmsBoxesPerThread] =
+              above_threshold;
+        }
+      }
+    }
+  }
+}
+// Variadic template helpers for Index selecting multiple arrays at the same
+// time
+template <typename Index>
+__device__ inline void SelectHelper(const Index i_selected,
+                                    const Index i_original) {}
+
+template <typename Index, typename T, typename... Args>
+__device__ inline void SelectHelper(const Index i_selected,
+                                    const Index i_original,
+                                    const T* original, T* selected,
+                                    Args... args) {
+  selected[i_selected] = original[i_original];
+  SelectHelper(i_selected, i_original, args...);
+}
+
+// Helper template to select elements from original arrays using the index
+// mapping and store into selected array. Each array sharing same mapping need
+// to be passed as pairs of pointers to original and selected arrays. For
+// selecting 2 arrays call would be
+// IndexMultiSelect(num_elements, indices, original1 ,selected1, original2,
+// selected2).
+template <typename Index, typename T, typename... Args>
+__global__ void IndexMultiSelect(const int num_elements, const Index* indices,
+                                 const T* original, T* selected, Args... args) {
+  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num_elements; idx += blockDim.x * gridDim.x) {
+    SelectHelper(idx, indices[idx], original, selected, args...);
+  }
+}
+
+template <typename T>
+__global__ void SetZero(const int count, T* __restrict__ ptr) {
+  // Check that the grid is one dimensional and index doesn't overflow.
+  assert(blockDim.y == 1);
+  assert(blockDim.z == 1);
+  assert(blockDim.x * gridDim.x / blockDim.x == gridDim.x);
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x) {
+    ptr[i] = T(0);
+  }
+}
+
+template <typename T>
+__global__ void Iota(const int num_elements, const T offset, T* to_fill) {
+  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num_elements; idx += blockDim.x * gridDim.x) {
+    to_fill[idx] = static_cast<T>(idx) + offset;
+  }
+}
+
+__global__ void NormalizeOutput(const int num_elements, const int* original, int64_t* to_normalize, int64_t batch_index, int64_t class_index) {
+  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num_elements; idx += blockDim.x * gridDim.x) {
+    to_normalize[idx * 3] = batch_index;
+    to_normalize[idx * 3 + 1] = class_index;
+    to_normalize[idx * 3 + 2] = static_cast<int64_t>(original[idx]);
+  }
+}
+
+Status NmsGpu(std::function<IAllocatorUniquePtr<void>(size_t)> allocator,
+              const int64_t center_point_box,
+              const float* d_sorted_boxes_float_ptr,
+              const int num_boxes,
+              const float iou_threshold,
+              int* d_selected_indices,
+              int* h_nkeep,
+              const int64_t max_boxes) {
+  // Making sure we respect the __align(16)__
+  // we promised to the compiler.
+  auto iptr = reinterpret_cast<std::uintptr_t>(d_sorted_boxes_float_ptr);
+  ORT_ENFORCE((iptr & 15) == 0);
+
+  const int bit_mask_len =
+      (num_boxes + kNmsBoxesPerThread - 1) / kNmsBoxesPerThread;
+  int max_nms_mask_size = num_boxes * bit_mask_len;
+
+  IAllocatorUniquePtr<void> d_nms_mask_ptr{allocator(max_nms_mask_size * sizeof(int))};
+  auto* d_nms_mask = static_cast<int*>(d_nms_mask_ptr.get());
+
+  int blocksPerGrid = (int)(ceil(static_cast<float>(max_nms_mask_size) / GridDim::maxThreadsPerBlock));
+  SetZero<int><<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(max_nms_mask_size, d_nms_mask);
+
+  int* d_delete_mask = d_nms_mask;
+  int* h_selected_count = h_nkeep;
+  const Box* d_sorted_boxes =
+      reinterpret_cast<const Box*>(d_sorted_boxes_float_ptr);
+  dim3 block_dim, thread_block;
+  int num_blocks = (num_boxes + kNmsBlockDim - 1) / kNmsBlockDim;
+  num_blocks = std::max(std::min(num_blocks, kNmsBlockDimMax), 1);
+  block_dim.x = num_blocks;
+  block_dim.y = num_blocks;
+  block_dim.z = 1;
+  thread_block.x = kNmsBlockDim;
+  thread_block.y = kNmsBlockDim;
+  thread_block.z = 1;
+  NMSKernel<<<block_dim, thread_block>>>(center_point_box,
+                                         d_sorted_boxes,
+                                         num_boxes,
+                                         iou_threshold,
+                                         bit_mask_len,
+                                         d_delete_mask);
+
+  IAllocatorUniquePtr<void> d_selected_boxes_ptr{allocator(num_boxes * sizeof(char))};
+  auto* d_selected_boxes = static_cast<char*>(d_selected_boxes_ptr.get());
+  IAllocatorUniquePtr<void> d_indices_ptr{allocator(num_boxes * sizeof(int))};
+  auto* d_indices = static_cast<int*>(d_indices_ptr.get());
+
+  blocksPerGrid = (int)(ceil(static_cast<float>(num_boxes) / GridDim::maxThreadsPerBlock));
+  Iota<int><<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(num_boxes, 0, d_indices);
+
+  NMSReduce<<<1, 1024, bit_mask_len * sizeof(int)>>>(d_delete_mask, bit_mask_len, num_boxes, max_boxes, d_selected_boxes);
+
+  size_t flagged_buffer_size = 0;
+  CUDA_RETURN_IF_ERROR(cub::DeviceSelect::Flagged(static_cast<void*>(nullptr),  // temp_storage
+                                                  flagged_buffer_size,
+                                                  static_cast<int*>(nullptr),   // input
+                                                  static_cast<char*>(nullptr),  // selection flag
+                                                  static_cast<int*>(nullptr),   // selected items
+                                                  static_cast<int*>(nullptr),   // num_selected
+                                                  num_boxes));
+
+  IAllocatorUniquePtr<void> d_cub_scratch_buffer_ptr{allocator(flagged_buffer_size)};
+  auto* d_cub_scratch_buffer = static_cast<uint8_t*>(d_cub_scratch_buffer_ptr.get());
+  IAllocatorUniquePtr<void> d_num_selected_ptr{allocator(sizeof(int))};
+  auto* d_num_selected = static_cast<int*>(d_num_selected_ptr.get());
+
+  CUDA_RETURN_IF_ERROR(cub::DeviceSelect::Flagged(
+      d_cub_scratch_buffer,  // temp_storage
+      flagged_buffer_size,
+      d_indices,           // input
+      d_selected_boxes,    // selection flag
+      d_selected_indices,  // selected items
+      d_num_selected, num_boxes));
+  CUDA_RETURN_IF_ERROR(cudaMemcpy(h_selected_count, d_num_selected, sizeof(int), cudaMemcpyDeviceToHost));
+
+  return Status::OK();
+}
+
+struct DeviceGreaterThan {
+  float threshold_;
+  __host__ __device__ __forceinline__ DeviceGreaterThan(float threshold)
+      : threshold_(threshold) {}
+
+  __host__ __device__ __forceinline__ bool operator()(const float& val) const {
+    return (val > threshold_);
+  }
+};
+
+}  // namespace
+
+Status NonMaxSuppressionImpl(
+    std::function<IAllocatorUniquePtr<void>(size_t)> allocator,
+    const PrepareContext& pc,
+    const int64_t center_point_box,
+    int64_t batch_index,
+    int64_t class_index,
+    int64_t max_output_boxes_per_class,
+    float iou_threshold,
+    float score_threshold,
+    IAllocatorUniquePtr<void>& selected_indices,
+    int* h_number_selected) {
+  // STEP 1. Prepare data
+  int num_boxes = pc.num_boxes_;
+  const float* boxes_data = pc.boxes_data_ + batch_index * num_boxes * 4;
+  const float* scores_data = pc.scores_data_ + (batch_index * pc.num_classes_ + class_index) * num_boxes;
+
+  // prepare temporary memory for sorting scores
+
+  // calculate temporary size that used for sorting
+  size_t cub_sort_temp_storage_bytes = 0;
+  CUDA_RETURN_IF_ERROR(cub::DeviceRadixSort::SortPairsDescending(
+      nullptr, cub_sort_temp_storage_bytes,
+      static_cast<float*>(nullptr),  // scores
+      static_cast<float*>(nullptr),  // sorted scores
+      static_cast<int*>(nullptr),    // input indices
+      static_cast<int*>(nullptr),    // sorted indices
+      num_boxes,                     // num items
+      0, 8 * sizeof(float)           // sort all bits
+      ));
+
+  // allocate temporary memory
+  IAllocatorUniquePtr<void> d_cub_sort_buffer_ptr{allocator(cub_sort_temp_storage_bytes)};
+  auto* d_cub_sort_buffer = static_cast<uint8_t*>(d_cub_sort_buffer_ptr.get());
+  IAllocatorUniquePtr<void> d_indices_ptr{allocator(num_boxes * sizeof(int))};
+  auto* d_indices = static_cast<int*>(d_indices_ptr.get());
+  IAllocatorUniquePtr<void> d_sorted_indices_ptr{allocator(num_boxes * sizeof(int))};
+  auto* d_sorted_indices = static_cast<int*>(d_sorted_indices_ptr.get());
+  IAllocatorUniquePtr<void> d_selected_indices_ptr{allocator(num_boxes * sizeof(int))};
+  auto* d_selected_indices = static_cast<int*>(d_selected_indices_ptr.get());
+  IAllocatorUniquePtr<void> d_sorted_scores_ptr{allocator(num_boxes * sizeof(float))};
+  auto* d_sorted_scores = static_cast<float*>(d_sorted_scores_ptr.get());
+  IAllocatorUniquePtr<void> d_sorted_boxes_ptr{allocator(num_boxes * 4 * sizeof(float))};
+  auto* d_sorted_boxes = static_cast<float*>(d_sorted_boxes_ptr.get());
+
+  // create sequense of indices
+  int blocksPerGrid = (int)(ceil(static_cast<float>(num_boxes) / GridDim::maxThreadsPerBlock));
+  Iota<int><<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(num_boxes, 0, d_indices);
+  CUDA_RETURN_IF_ERROR(cudaGetLastError());
+
+  // sort scores
+  CUDA_RETURN_IF_ERROR(cub::DeviceRadixSort::SortPairsDescending(
+      d_cub_sort_buffer,
+      cub_sort_temp_storage_bytes,
+      scores_data,
+      d_sorted_scores,
+      d_indices,
+      d_sorted_indices,
+      num_boxes,
+      0,
+      8 * sizeof(float)  // sort all bits
+      ));
+
+  // pick sorted scores
+  const Box* original_boxes = reinterpret_cast<const Box*>(boxes_data);
+  Box* sorted_boxes = reinterpret_cast<Box*>(d_sorted_boxes);
+  IndexMultiSelect<int, Box><<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(num_boxes, d_sorted_indices, original_boxes, sorted_boxes);
+  CUDA_RETURN_IF_ERROR(cudaGetLastError());
+
+  // STEP 2. filter boxes by scores
+  int limited_num_boxes = num_boxes;
+  if (pc.score_threshold_ != nullptr) {
+    thrust::device_ptr<float> sorted_scores_device_ptr(d_sorted_scores);
+    limited_num_boxes = thrust::count_if(
+        sorted_scores_device_ptr,
+        sorted_scores_device_ptr + num_boxes,
+        DeviceGreaterThan(score_threshold));
+    CUDA_RETURN_IF_ERROR(cudaGetLastError());
+
+    if (limited_num_boxes == 0) {
+      *h_number_selected = 0;
+      return Status::OK();
+    }
+  }
+
+  // STEP 3. launch NMS kernels
+  ORT_RETURN_IF_ERROR(NmsGpu(allocator,
+                             center_point_box,
+                             d_sorted_boxes,
+                             limited_num_boxes,
+                             iou_threshold,
+                             d_selected_indices,
+                             h_number_selected,
+                             max_output_boxes_per_class));
+  CUDA_RETURN_IF_ERROR(cudaGetLastError());
+
+  // STEP 4. map back to sorted indices
+  *h_number_selected = std::min(*h_number_selected, (int)max_output_boxes_per_class);
+  int num_to_keep = *h_number_selected;
+  if (num_to_keep > 0) {
+    IAllocatorUniquePtr<void> d_output_indices_ptr{allocator(num_to_keep * sizeof(int))};
+    auto* d_output_indices = static_cast<int*>(d_output_indices_ptr.get());
+    IAllocatorUniquePtr<void> d_normalized_output_indices_ptr{allocator(num_to_keep * 3 * sizeof(int64_t))};
+    auto* d_normalized_output_indices = static_cast<int64_t*>(d_normalized_output_indices_ptr.get());
+
+    int blocksPerGrid = (int)(ceil(static_cast<float>(num_to_keep) / GridDim::maxThreadsPerBlock));
+    IndexMultiSelect<int, int><<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(num_to_keep, d_selected_indices, d_sorted_indices, d_output_indices);
+    NormalizeOutput<<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(num_to_keep, d_output_indices, d_normalized_output_indices, batch_index, class_index);
+    CUDA_RETURN_IF_ERROR(cudaGetLastError());
+
+    selected_indices = std::move(d_normalized_output_indices_ptr);
+  }
+
+  return Status::OK();
+}
+
+}  // namespace cuda
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cuda/object_detection/non_max_suppression_impl.h b/onnxruntime/core/providers/cuda/object_detection/non_max_suppression_impl.h
new file mode 100644
index 0000000000..c10c508377
--- /dev/null
+++ b/onnxruntime/core/providers/cuda/object_detection/non_max_suppression_impl.h
@@ -0,0 +1,29 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+#include <stdint.h>
+#include "core/providers/cuda/shared_inc/cuda_utils.h"
+#include "core/common/common.h"
+#include "core/framework/allocator.h"
+#include "core/framework/data_types.h"
+#include <functional>
+#include "core/providers/cpu/object_detection/non_max_suppression_helper.h"
+
+namespace onnxruntime {
+namespace cuda {
+
+Status NonMaxSuppressionImpl(
+    std::function<IAllocatorUniquePtr<void>(size_t)> allocator,
+    const PrepareContext& pc,
+    const int64_t center_point_box,
+    int64_t batch_index,
+    int64_t class_index,
+    int64_t max_output_boxes_per_class,
+    float iou_threshold,
+    float score_threshold,
+    IAllocatorUniquePtr<void>& selected_indices,
+    int* h_number_selected);
+
+}  // namespace cuda
+}  // namespace onnxruntime