diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc index 44f310aeed..06c7ba271a 100644 --- a/onnxruntime/core/providers/cpu/tensor/pad.cc +++ b/onnxruntime/core/providers/cpu/tensor/pad.cc @@ -390,7 +390,7 @@ Status Pad::Compute(OpKernelContext* ctx) const { } } - T value = 0; + T value = static_cast(0); const Tensor* value_tensor = ctx->Input(2); if (nullptr != value_tensor) { ORT_ENFORCE(utils::IsPrimitiveDataType(value_tensor->DataType()) && diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index a6535c7eeb..d9b8205e0c 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -660,6 +660,9 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, int32_t, Resize); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, uint8_t, Resize); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, Clip); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, Pad); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, Pad); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, Pad); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, bool, Equal); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, int32_t, Equal); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, int64_t, Equal); @@ -1118,6 +1121,9 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/cuda/tensor/pad.cc b/onnxruntime/core/providers/cuda/tensor/pad.cc index a74af54ca2..ac22a8c57e 100644 --- a/onnxruntime/core/providers/cuda/tensor/pad.cc +++ b/onnxruntime/core/providers/cuda/tensor/pad.cc @@ -18,13 +18,82 @@ namespace cuda { kCudaExecutionProvider, \ KernelDefBuilder() \ .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + Pad); \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + Pad, \ + kOnnxDomain, \ + 11, \ + T, \ + kCudaExecutionProvider, \ + KernelDefBuilder() \ + .InputMemoryType(1) \ + .InputMemoryType(2) \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ Pad); +template +typename ToCudaType::MappedType ToCudaValue(const T& value) { + return value; +} + +template<> +typename ToCudaType::MappedType ToCudaValue(const MLFloat16& value) { + return *reinterpret_cast::MappedType *>(&value.val); +} + template Status Pad::ComputeInternal(OpKernelContext* ctx) const { + typedef typename ToCudaType::MappedType CudaT; const auto& input_tensor = *ctx->Input(0); auto const& input_shape = input_tensor.Shape(); auto dimension_count = input_shape.NumDimensions(); + + const std::vector* p_pads = &pads_; + const std::vector* p_slices = &slices_; + CudaT value = ToCudaType::FromFloat(value_); + + // kOnnxDomain Pad opset >= 11 (Or) kMsDomain opset == 1 + std::vector pads; + std::vector slices; + if (is_dynamic_) { + const Tensor& pads_tensor = *ctx->Input(1); + const std::vector& pads_tensor_dims = pads_tensor.Shape().GetDims(); + ORT_ENFORCE(utils::IsPrimitiveDataType(pads_tensor.DataType()), + "Pads tensor should be an INT64 tensor"); + ORT_ENFORCE(pads_tensor_dims.size() == 1 || (pads_tensor_dims.size() == 2 && pads_tensor_dims[0] == 1), + "Pads tensor should be a 1D tensor of shape [2 * input_rank] or a 2D tensor of shape [1, 2 * input_rank]"); + + const int64_t* pads_tensor_raw_data = pads_tensor.template Data(); + size_t pads_size = static_cast(pads_tensor.Shape().Size()); + ORT_ENFORCE(pads_size == 2 * dimension_count, + "Pads tensor size should be equal to twice the input dimension count "); + + pads.reserve(2 * dimension_count); + for (size_t i = 0; i < pads_size; ++i) { + pads.push_back(pads_tensor_raw_data[i]); + } + // Separate out any negative pads into the slices array + slices.resize(pads.size(), 0); + for (size_t index = 0; index < pads.size(); index++) { + if (pads[index] < 0) { + slices[index] = pads[index]; + pads[index] = 0; + } + } + + T raw_value(0); + const Tensor* value_tensor = ctx->Input(2); + if (nullptr != value_tensor) { + ORT_ENFORCE(utils::IsPrimitiveDataType(value_tensor->DataType()) && + value_tensor->Shape().Size() == 1, + "Value tensor should be a 1D tensor of size 1 with the same type as that of the input tensor"); + raw_value = value_tensor->template Data()[0]; + value = ToCudaValue(raw_value); + } + p_pads = &pads; + p_slices = &slices; + } + CudaAsyncBuffer input_dims(this, input_shape.GetDims()); CudaAsyncBuffer input_strides(this, dimension_count); CudaAsyncBuffer lower_pads(this, dimension_count); @@ -33,15 +102,14 @@ Status Pad::ComputeInternal(OpKernelContext* ctx) const { TensorPitches::Calculate(input_strides.CpuSpan(), input_shape.GetDims()); std::vector output_dims(input_shape.GetDims()); - - ORT_ENFORCE(dimension_count * 2 == pads_.size(), "'pads' attribute has wrong number of values"); + ORT_ENFORCE(dimension_count * 2 == p_pads->size(), "'pads' attribute has wrong number of values"); // Calculate output dimensions, and handle any negative padding auto lower_pads_span = lower_pads.CpuSpan(); auto upper_pads_span = upper_pads.CpuSpan(); for (size_t i = 0; i < dimension_count; i++) { - lower_pads_span[i] = pads_[i] + slices_[i]; - upper_pads_span[i] = pads_[i + dimension_count] + slices_[i + dimension_count]; + lower_pads_span[i] = (*p_pads)[i] + (*p_slices)[i]; + upper_pads_span[i] = (*p_pads)[i + dimension_count] + (*p_slices)[i + dimension_count]; output_dims[i] += lower_pads_span[i] + upper_pads_span[i]; } TensorShape output_shape(output_dims); @@ -65,7 +133,7 @@ Status Pad::ComputeInternal(OpKernelContext* ctx) const { input_strides.GpuPtr(), lower_pads.GpuPtr(), upper_pads.GpuPtr(), - value_, + value, static_cast(mode_), reinterpret_cast::MappedType*>(input_tensor.template Data()), fdm_output_strides.GpuPtr(), diff --git a/onnxruntime/core/providers/cuda/tensor/pad.h b/onnxruntime/core/providers/cuda/tensor/pad.h index 6c0a360cee..7682355d08 100644 --- a/onnxruntime/core/providers/cuda/tensor/pad.h +++ b/onnxruntime/core/providers/cuda/tensor/pad.h @@ -6,6 +6,8 @@ #include "core/providers/cuda/cuda_common.h" #include "core/providers/cpu/tensor/pad.h" +using onnxruntime::PadBase; + namespace onnxruntime { namespace cuda { diff --git a/onnxruntime/core/providers/cuda/tensor/pad_impl.cu b/onnxruntime/core/providers/cuda/tensor/pad_impl.cu index c45f820795..2f0d0d3288 100644 --- a/onnxruntime/core/providers/cuda/tensor/pad_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/pad_impl.cu @@ -21,7 +21,7 @@ __global__ void _PadKernel( const int64_t* input_strides, const int64_t* lower_pads, const int64_t* upper_pads, - const float pad_value, + const T pad_value, const T* input_data, const fast_divmod* fdm_output_strides, T* output_data, @@ -74,7 +74,7 @@ void PadImpl( const int64_t* input_strides, const int64_t* lower_pads, const int64_t* upper_pads, - const float pad_value, + const T pad_value, const int pad_mode, const T* input_data, const fast_divmod* fdm_output_strides, @@ -104,7 +104,7 @@ void PadImpl( } #define SPECIALIZED_IMPL(T) \ - template void PadImpl(const size_t shape_rank, const int64_t* input_dims, const int64_t* input_strides, const int64_t* lower_pads, const int64_t* upper_pads, const float pad_value, const int pad_mode, const T* input_data, const fast_divmod* fdm_output_strides, T* output_data, const size_t N); + template void PadImpl(const size_t shape_rank, const int64_t* input_dims, const int64_t* input_strides, const int64_t* lower_pads, const int64_t* upper_pads, const T pad_value, const int pad_mode, const T* input_data, const fast_divmod* fdm_output_strides, T* output_data, const size_t N); SPECIALIZED_IMPL(float) SPECIALIZED_IMPL(double) diff --git a/onnxruntime/core/providers/cuda/tensor/pad_impl.h b/onnxruntime/core/providers/cuda/tensor/pad_impl.h index d886f1fdc5..95bf8335a3 100644 --- a/onnxruntime/core/providers/cuda/tensor/pad_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/pad_impl.h @@ -15,7 +15,7 @@ void PadImpl( const int64_t* input_strides, const int64_t* lower_pads, const int64_t* upper_pads, - const float pad_value, + const T pad_value, const int pad_mode, const T* input_data, const fast_divmod* fdm_output_strides,