diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index b4cc23d893..a2854e3b63 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -344,6 +344,7 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kO class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, bool, Equal); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, int32_t, Equal); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, int64_t, Equal); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 8, Expand); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, int32_t, Greater); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, int64_t, Greater); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, uint32_t, Greater); @@ -668,6 +669,7 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.cc b/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.cc index 1aee5bff52..16f6246b3d 100644 --- a/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.cc +++ b/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.cc @@ -20,7 +20,7 @@ Status BinaryElementwise::Prepare(OpKernelContext* context, return Status::OK(); } -static Status ComputeOutputShape(const std::string& node_name, const TensorShape& lhs_shape, const TensorShape& rhs_shape, TensorShape& out_shape) { + Status ComputeOutputShape(const std::string& node_name, const TensorShape& lhs_shape, const TensorShape& rhs_shape, TensorShape& out_shape) { size_t lhs_rank = lhs_shape.NumDimensions(); size_t rhs_rank = rhs_shape.NumDimensions(); size_t out_rank = std::max(lhs_rank, rhs_rank); diff --git a/onnxruntime/core/providers/cuda/tensor/expand.cc b/onnxruntime/core/providers/cuda/tensor/expand.cc new file mode 100644 index 0000000000..599fa39f40 --- /dev/null +++ b/onnxruntime/core/providers/cuda/tensor/expand.cc @@ -0,0 +1,76 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "expand.h" +#include "expand_impl.h" +#include "core/providers/cpu/tensor/utils.h" + +namespace onnxruntime { +namespace cuda { + +Status Expand::ComputeInternal(OpKernelContext* ctx) const { + const auto& input0 = *ctx->Input(0); + const auto& input1 = *ctx->Input(1); + int device_id = GetDeviceId(); + + // new shape to be expanded to + const auto* p_shape = input1.template Data(); + std::vector output_dims{p_shape, p_shape + input1.Shape().Size()}; + TensorShape output_shape(output_dims); + + ORT_RETURN_IF_ERROR(ComputeOutputShape(Node().Name(), input0.Shape(), output_dims, output_shape)); + auto rank = output_shape.NumDimensions(); + auto& output_tensor = *ctx->Output(0, output_shape); + auto input_shape = input0.Shape().GetDims(); + + // pad input_dims with 1 to make ranks match + for (int i = 0; i < rank - input_shape.size(); i++) { + input_shape.insert(input_shape.begin(), 1); + } + + // create fast_divmod using dimension values + CudaAsyncBuffer fdm_input_dims(this, device_id, rank); + CudaAsyncBuffer fdm_output_dims(this, device_id, rank); + CudaAsyncBuffer fdm_output_subdim_size(this, device_id, rank); + { + auto in_span = fdm_input_dims.CpuSpan(); + auto out_span = fdm_output_dims.CpuSpan(); + auto sdm_span = fdm_output_subdim_size.CpuSpan(); + auto subdim_size = output_shape.Size(); + for (auto i = 0; i < rank; i++) { + in_span[i] = fast_divmod(static_cast(input_shape[i])); + out_span[i] = fast_divmod(static_cast(output_shape[i])); + subdim_size /= output_shape[i]; + sdm_span[i] = static_cast(subdim_size); + } + } + ORT_RETURN_IF_ERROR(fdm_input_dims.CopyToGpu()); + ORT_RETURN_IF_ERROR(fdm_output_dims.CopyToGpu()); + ORT_RETURN_IF_ERROR(fdm_output_subdim_size.CopyToGpu()); + + ExpandImpl( + input0.DataType()->Size(), + output_shape.NumDimensions(), + output_shape.Size(), + input0.Shape().Size(), + input0.DataRaw(), + output_tensor.MutableDataRaw(), + fdm_input_dims.GpuPtr(), + fdm_output_dims.GpuPtr(), + fdm_output_subdim_size.GpuPtr()); + + return Status::OK(); +} + +ONNX_OPERATOR_KERNEL_EX( + Expand, + kOnnxDomain, + 8, + kCudaExecutionProvider, + KernelDefBuilder() + .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()) + .InputMemoryType(1), + Expand); + +} // namespace cuda +}; // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/expand.h b/onnxruntime/core/providers/cuda/tensor/expand.h new file mode 100644 index 0000000000..708fc965db --- /dev/null +++ b/onnxruntime/core/providers/cuda/tensor/expand.h @@ -0,0 +1,25 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/common/common.h" +#include "core/framework/op_kernel.h" +#include "core/providers/cuda/cuda_common.h" + +namespace onnxruntime { +namespace cuda { + +class Expand final : public CudaKernel { + public: + Expand(const OpKernelInfo& info) : CudaKernel(info) {} + + Status ComputeInternal(OpKernelContext* context) const override; +}; + +Status ComputeOutputShape( + const std::string& node_name, + const TensorShape& lhs_shape, + const TensorShape& rhs_shape, + TensorShape& out_shape); + +} // namespace cuda +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/expand_impl.cu b/onnxruntime/core/providers/cuda/tensor/expand_impl.cu new file mode 100644 index 0000000000..bbad602ec0 --- /dev/null +++ b/onnxruntime/core/providers/cuda/tensor/expand_impl.cu @@ -0,0 +1,96 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/cuda/cu_inc/common.cuh" +#include "expand_impl.h" +#include "core/providers/cuda/cuda_common.h" + +namespace onnxruntime { +namespace cuda { + +template +__global__ void ExpandKernel( + const size_t rank, + const size_t N, + const size_t N_input, + const T* input_data, + T* output_data, + const fast_divmod* fdm_input_dims, + const fast_divmod* fdm_output_dims, + const fast_divmod* fdm_output_subdim_size) { + CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); + + // initialize + auto output_index = id; + auto input_index = 0; + auto input_subdim_size = N_input; + auto out_coord = output_index; + // use striding when tensor is larger than grid + int stride = blockDim.x * gridDim.x; + + // translate indices to coordinates. copy expanded dims from source + while (output_index < N) { + for (int64_t i = 0; i < rank; i++) { + input_subdim_size = fdm_input_dims[i].div(input_subdim_size); + auto new_out_coord = fdm_output_subdim_size[i].div(out_coord); + auto in_coord = (new_out_coord > (fdm_input_dims[i].d_ - 1)) ? fdm_input_dims[i].d_ - 1 : new_out_coord; + input_index += input_subdim_size * in_coord; + out_coord -= new_out_coord * fdm_output_subdim_size[i].d_; + } + output_data[output_index] = input_data[input_index]; + output_index += stride; + out_coord = output_index; + input_subdim_size = N_input; + input_index = 0; + } +} + +Status ExpandImpl( + const size_t element_size, + const size_t rank, + const size_t N, + const size_t N_input, + const void* input_data, + void* output_data, + const fast_divmod* fdm_input_dims, + const fast_divmod* fdm_output_dims, + const fast_divmod* fdm_output_subdim_size) { + int blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); + + switch (element_size) { + case sizeof(uint8_t): + ExpandKernel<<>>( + rank, N, N_input, + reinterpret_cast::MappedType*>(input_data), + reinterpret_cast::MappedType*>(output_data), + fdm_input_dims, fdm_output_dims, fdm_output_subdim_size); + break; + case sizeof(uint16_t): + ExpandKernel<<>>( + rank, N, N_input, + reinterpret_cast::MappedType*>(input_data), + reinterpret_cast::MappedType*>(output_data), + fdm_input_dims, fdm_output_dims, fdm_output_subdim_size); + break; + case sizeof(uint32_t): + ExpandKernel<<>>( + rank, N, N_input, + reinterpret_cast::MappedType*>(input_data), + reinterpret_cast::MappedType*>(output_data), + fdm_input_dims, fdm_output_dims, fdm_output_subdim_size); + break; + case sizeof(uint64_t): + ExpandKernel<<>>( + rank, N, N_input, + reinterpret_cast::MappedType*>(input_data), + reinterpret_cast::MappedType*>(output_data), + fdm_input_dims, fdm_output_dims, fdm_output_subdim_size); + break; + default: + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for Expand operator"); + } + return Status::OK(); +} + +} // namespace cuda +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/expand_impl.h b/onnxruntime/core/providers/cuda/tensor/expand_impl.h new file mode 100644 index 0000000000..c127d72477 --- /dev/null +++ b/onnxruntime/core/providers/cuda/tensor/expand_impl.h @@ -0,0 +1,25 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include +#include "core/providers/cuda/shared_inc/cuda_utils.h" +#include "core/framework/data_types.h" +#include "core/common/common.h" + +namespace onnxruntime { +namespace cuda { + +Status ExpandImpl( + const size_t element_size, + const size_t shape_rank, + const size_t N, + const size_t N_input, + const void* input_data, + void* output_data, + const fast_divmod* fdm_input_dims, + const fast_divmod* fdm_output_dims, + const fast_divmod* fdm_output_subdim_size); + +} // namespace cuda +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/slice_impl.cu b/onnxruntime/core/providers/cuda/tensor/slice_impl.cu index e157ded702..e9d88bf0f6 100644 --- a/onnxruntime/core/providers/cuda/tensor/slice_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/slice_impl.cu @@ -8,7 +8,7 @@ namespace onnxruntime { namespace cuda { -template +template __global__ void _SliceKernel(const int32_t dimension_count, const int64_t* starts, const int64_t* steps, diff --git a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc index 922a7c30ce..d917e57576 100644 --- a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc +++ b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc @@ -950,6 +950,17 @@ TEST(MathOpTest, Expand_8_1x3_int64) { test.Run(); } +TEST(MathOpTest, Expand_8_3x1x3x1_int64) { + OpTester test("Expand", 8); + test.AddInput("data_0", {1, 3, 1, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); + test.AddInput("data_1", {4}, {3, 1, 3, 1}); + test.AddOutput("result", {3, 3, 3, 3}, + {1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 6, 4, 5, 6, 4, 5, 6, 7, 8, 9, 7, 8, 9, 7, 8, 9, + 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 6, 4, 5, 6, 4, 5, 6, 7, 8, 9, 7, 8, 9, 7, 8, 9, + 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 6, 4, 5, 6, 4, 5, 6, 7, 8, 9, 7, 8, 9, 7, 8, 9,}); + test.Run(); +} + TEST(MathOpTest, Expand_8_3x3_float16) { OpTester test("Expand", 8); test.AddInput("data_0", {1}, {MLFloat16(math::floatToHalf(1.0f))});