From 9707b39a67cbe34a00d8a9917eea3577d66e70c1 Mon Sep 17 00:00:00 2001
From: Du Li <duli1@microsoft.com>
Date: Fri, 20 Sep 2019 16:31:47 -0700
Subject: [PATCH] Duli/clip cuda (#1677)

Add Cuda Kernel for Clip operator

* Add Cuda Kernel for Clip operator
* Register Clip CUDA Kernel
---
 .../providers/cuda/cuda_execution_provider.cc |  2 +
 onnxruntime/core/providers/cuda/math/clip.cc  | 42 +++++++++++++++++++
 onnxruntime/core/providers/cuda/math/clip.h   | 31 ++++++++++++++
 .../core/providers/cuda/math/clip_impl.cu     | 33 +++++++++++++++
 .../core/providers/cuda/math/clip_impl.h      | 16 +++++++
 5 files changed, 124 insertions(+)
 create mode 100644 onnxruntime/core/providers/cuda/math/clip.cc
 create mode 100644 onnxruntime/core/providers/cuda/math/clip.h
 create mode 100644 onnxruntime/core/providers/cuda/math/clip_impl.cu
 create mode 100644 onnxruntime/core/providers/cuda/math/clip_impl.h

diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
index 38082c0676..68083477d4 100644
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -360,6 +360,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, float, Ceil);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, double, Ceil);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, MLFloat16, Ceil);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, float, Clip);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, float, Reciprocal);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, double, Reciprocal);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, MLFloat16, Reciprocal);
@@ -563,6 +564,7 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, float, MatMul)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, double, MatMul)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, MLFloat16, MatMul)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, float, Clip)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, float, Tile)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, double, Tile)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, MLFloat16, Tile)>,
diff --git a/onnxruntime/core/providers/cuda/math/clip.cc b/onnxruntime/core/providers/cuda/math/clip.cc
new file mode 100644
index 0000000000..7f8f03aebf
--- /dev/null
+++ b/onnxruntime/core/providers/cuda/math/clip.cc
@@ -0,0 +1,42 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/providers/common.h"
+#include "core/providers/cuda/math/clip.h"
+#include "core/providers/cuda/math/clip_impl.h"
+
+namespace onnxruntime {
+namespace cuda {
+
+#define REGISTER_KERNEL_TYPED(T)                                                \
+  ONNX_OPERATOR_TYPED_KERNEL_EX(                                                \
+      Clip,                                                                     \
+      kOnnxDomain,                                                              \
+      6,                                                                        \
+      T,                                                                        \
+      kCudaExecutionProvider,                                                   \
+      KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
+      Clip<T>);
+
+template <typename T>
+Status Clip<T>::ComputeInternal(OpKernelContext* ctx) const {
+  const Tensor& X = *ctx->Input<Tensor>(0);
+  const TensorShape input_shape{X.Shape()};
+  Tensor* Y = ctx->Output(0, input_shape);
+
+  size_t count = input_shape.Size();
+
+  auto* y_data = Y->template MutableData<T>();
+  const auto* x_data = X.template Data<T>();
+  ClipImpl<T>(x_data, y_data, min_, max_, count);
+  return Status::OK();
+}
+
+#define SPECIALIZED_COMPUTE(T) \
+  REGISTER_KERNEL_TYPED(T)     \
+  template Status Clip<T>::ComputeInternal(OpKernelContext* ctx) const;
+
+SPECIALIZED_COMPUTE(float)
+
+}  // namespace cuda
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cuda/math/clip.h b/onnxruntime/core/providers/cuda/math/clip.h
new file mode 100644
index 0000000000..1f748990c1
--- /dev/null
+++ b/onnxruntime/core/providers/cuda/math/clip.h
@@ -0,0 +1,31 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+#include "core/providers/cuda/cuda_common.h"
+
+namespace onnxruntime {
+namespace cuda {
+
+template <typename T>
+class Clip final : public CudaKernel {
+ public:
+  Clip(const OpKernelInfo& info) : CudaKernel{info} {
+    auto min_val = -std::numeric_limits<T>::infinity();
+    auto max_val = std::numeric_limits<T>::infinity();
+
+    info.GetAttrOrDefault("min", &min_, min_val);
+    info.GetAttrOrDefault("max", &max_, max_val);
+
+    // Make sure the range of interval is sensible 
+    ORT_ENFORCE(min_val <= max_val);
+  }
+
+  Status ComputeInternal(OpKernelContext* context) const override;
+
+ private:
+  T min_, max_;
+};
+
+}  // namespace cuda
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cuda/math/clip_impl.cu b/onnxruntime/core/providers/cuda/math/clip_impl.cu
new file mode 100644
index 0000000000..42d415e946
--- /dev/null
+++ b/onnxruntime/core/providers/cuda/math/clip_impl.cu
@@ -0,0 +1,33 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+#include "core/providers/cuda/math/clip_impl.h"
+#include "core/providers/cuda/cu_inc/common.cuh"
+
+namespace onnxruntime {
+namespace cuda {
+template <typename T>
+__global__ void _Clip(const T* input, T* output, T min, T max, size_t N) {
+  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
+  output[id] = (input[id] < min) ? min : ((input[id] > max) ? max : input[id]);
+}
+
+template <typename T>
+void ClipImpl(const T* input_data, T* output_data, T min, T max, size_t count) {
+  typedef typename ToCudaType<T>::MappedType CudaT;
+
+  int blocksPerGrid = (int)(ceil(static_cast<float>(count) / GridDim::maxThreadsPerBlock));
+  _Clip<CudaT><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(reinterpret_cast<const CudaT*>(input_data),
+                                                                  reinterpret_cast<CudaT*>(output_data),
+                                                                  *reinterpret_cast<CudaT*>(&min),
+                                                                  *reinterpret_cast<CudaT*>(&max),
+                                                                  count);
+}
+
+template void ClipImpl<float>(const float* input_data, float* output_data, float min, float max, size_t count);
+template void ClipImpl<double>(const double* input_data, double* output_data, double min, double max, size_t count);
+template void ClipImpl<MLFloat16>(const MLFloat16* input_data, MLFloat16* output_data, MLFloat16 min, MLFloat16 max, size_t count);
+
+}  // namespace cuda
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cuda/math/clip_impl.h b/onnxruntime/core/providers/cuda/math/clip_impl.h
new file mode 100644
index 0000000000..b497a44888
--- /dev/null
+++ b/onnxruntime/core/providers/cuda/math/clip_impl.h
@@ -0,0 +1,16 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/providers/cuda/math/clip.h"
+#include "core/providers/cuda/cuda_common.h"
+#include "core/providers/cuda/shared_inc/cuda_utils.h"
+
+namespace onnxruntime {
+namespace cuda {
+template <typename T>
+void ClipImpl(const T* input_data, T* output_data, T min, T max, size_t count);
+
+}  // namespace cuda
+}  // namespace onnxruntime