diff --git a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.cc b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.cc index af83133cee..ab9f9e1c35 100644 --- a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.cc +++ b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.cc @@ -456,5 +456,18 @@ template std::unique_ptr ReduceSum( const std::vector& reduce_axes, AllocatorPtr allocator, concurrency::ThreadPool* tp, void* einsum_cuda_assets, const DeviceHelpers::ReduceSum& reduce_sum_func); +// MLFloat16 +template std::unique_ptr MatMul( + const Tensor& input_1, const std::vector& input_shape_1_override, + const Tensor& input_2, const std::vector& input_shape_2_override, + AllocatorPtr allocator, concurrency::ThreadPool* tp, void* einsum_cuda_assets, + const DeviceHelpers::MatMul& device_matmul_func); + +template std::unique_ptr ReduceSum( + const Tensor& input, const std::vector& input_shape_override, + const std::vector& reduce_axes, AllocatorPtr allocator, + concurrency::ThreadPool* tp, void* einsum_cuda_assets, + const DeviceHelpers::ReduceSum& device_reduce_sum_func); + } // namespace EinsumOp } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_typed_compute_processor.cc b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_typed_compute_processor.cc index 0d6bb37ba3..39b15ce661 100644 --- a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_typed_compute_processor.cc +++ b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_typed_compute_processor.cc @@ -367,5 +367,6 @@ template class EinsumTypedComputeProcessor; template class EinsumTypedComputeProcessor; template class EinsumTypedComputeProcessor; template class EinsumTypedComputeProcessor; +template class EinsumTypedComputeProcessor; } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/math/einsum.cc b/onnxruntime/core/providers/cuda/math/einsum.cc index 661b91db63..8333561fac 100644 --- a/onnxruntime/core/providers/cuda/math/einsum.cc +++ b/onnxruntime/core/providers/cuda/math/einsum.cc @@ -15,7 +15,8 @@ ONNX_OPERATOR_KERNEL_EX( KernelDefBuilder().TypeConstraint("T", std::vector{ DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType()}), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), Einsum); Status Einsum::Compute(OpKernelContext* context) const { @@ -59,6 +60,16 @@ Status Einsum::DeviceCompute(OpKernelContext* context, const std::vector, EinsumOp::DeviceHelpers::CudaDeviceHelpers::DataCopy); return einsum_compute_processor.Run(); + } else if (inputs[0]->IsDataType()) { + auto einsum_compute_processor = EinsumTypedComputeProcessor(context, allocator, tp, + einsum_compute_preprocessor, + &einsum_cuda_assets); + + einsum_compute_processor.SetDeviceHelpers(EinsumOp::DeviceHelpers::CudaDeviceHelpers::Transpose, + EinsumOp::DeviceHelpers::CudaDeviceHelpers::MatMul, + EinsumOp::DeviceHelpers::CudaDeviceHelpers::ReduceSum, + EinsumOp::DeviceHelpers::CudaDeviceHelpers::DataCopy); + return einsum_compute_processor.Run(); } return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, diff --git a/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops.cc b/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops.cc index 4d3fd9d83b..79070a392d 100644 --- a/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops.cc +++ b/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops.cc @@ -165,6 +165,19 @@ template Tensor DeviceHelpers::CudaDeviceHelpers::ReduceSum( const TensorShape* input_shape_override, concurrency::ThreadPool* tp, void* einsum_cuda_assets); +// MLFloat16 +template Status DeviceHelpers::CudaDeviceHelpers::MatMul( + const MLFloat16* input_1_data, const MLFloat16* input_2_data, MLFloat16* output_data, + size_t left_stride, size_t right_stride, size_t output_stride, + size_t num_batches, size_t M, size_t K, size_t N, concurrency::ThreadPool* tp, + void* einsum_cuda_assets); + +template Tensor DeviceHelpers::CudaDeviceHelpers::ReduceSum( + const Tensor& input, const std::vector& reduce_axes, + bool keep_dims, AllocatorPtr allocator, + const TensorShape* input_shape_override, + concurrency::ThreadPool* tp, void* einsum_cuda_assets); + } // namespace EinsumOp } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops_diagonal.cu b/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops_diagonal.cu index d84396cc21..6f31e29091 100644 --- a/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops_diagonal.cu +++ b/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops_diagonal.cu @@ -30,8 +30,8 @@ __global__ void _DiagonalKernel( if (i == dim_1) { // Process dim_2 as dim_2 needs to have the same dim value as dim_1 // For example: given a tensor of shape [2, 3, 3] and parsing the diagonal along axes `1` and `2` - // we need to parse elements in input[j, i, i] (j -> 0 to 1; and i -> 0 to 2) - // and place them in output[j, i] and by definition of diagonal parsing dim_1 has to be equal to + // we need to parse elements in input[j, i, i] (j -> 0 to 1; and i -> 0 to 2) + // and place them in output[j, i] and by definition of diagonal parsing dim_1 has to be equal to // dim_2 input_idx += input_strides[dim_2] * dim; } @@ -75,6 +75,13 @@ void DiagonalImpl( output_size); break; + case sizeof(int16_t): + _DiagonalKernel<<>>( + reinterpret_cast(input_data), input_rank, dim_1, dim_2, + input_strides, reinterpret_cast(output_data), output_strides, + output_size); + break; + // Should not hit this as we do not register kernel support for types that will run into this default: ORT_THROW("Einsum Op: Diagonal parsing unsupported"); diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc index 214d274baa..827aeca791 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc @@ -610,7 +610,7 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr &zero, output_tensor, reinterpret_cast(output.template MutableData()))); } } - } else { + } else { // For ArgMax & ArgMin ops, use the indicies as the output with int64 type // cudnnReduceTensor has issue if input and output has same size, which will happen if the axis to be reduced has dim value of 1. // the output is zeros of the output size @@ -928,6 +928,13 @@ template Tensor ReduceCompute( bool keep_dims, bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction, const TensorShape* input_shape_override); +template Tensor ReduceCompute( + CUDAExecutionProvider& cuda_ep, cudnnReduceTensorOp_t cudnn_reduce_op, + AllocatorPtr allocator, + const Tensor& input, const std::vector& axes, + bool keep_dims, bool calculate_log, bool calculate_sqt, bool log_sum_exp, + bool fast_reduction, const TensorShape* input_shape_override); + } // namespace ReductionOps #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 diff --git a/onnxruntime/test/providers/cpu/math/einsum_test.cc b/onnxruntime/test/providers/cpu/math/einsum_test.cc index 0d22ba6f14..c1f4fe79ab 100644 --- a/onnxruntime/test/providers/cpu/math/einsum_test.cc +++ b/onnxruntime/test/providers/cpu/math/einsum_test.cc @@ -3,6 +3,7 @@ #include "gtest/gtest.h" #include "test/providers/provider_test_utils.h" +#include "test/common/cuda_op_test_utils.h" #include "core/framework/data_types.h" #include "core/util/math.h" @@ -519,5 +520,180 @@ TEST(Einsum, ImplicitEinsumAsTensorContraction) { test.Run(); } + +// Test each theme for half support +TEST(Einsum, ExplicitEinsumAsIdentity_1D_input_Half) { + if (!HasCudaEnvironment(600)) { + return; + } + OpTester test("Einsum", 12, onnxruntime::kOnnxDomain); + test.AddAttribute("equation", "i->i"); + std::vector input_x_f = {0.9f, 2.5f, 2.3f, 1.5f, -4.5f}; + std::vector output_f = {0.9f, 2.5f, 2.3f, 1.5f, -4.5f}; + std::vector input_x(5); + std::vector output(5); + ConvertFloatToMLFloat16(input_x_f.data(), input_x.data(), 5); + ConvertFloatToMLFloat16(output_f.data(), output.data(), 5); + test.AddInput("x", {5}, input_x); + test.AddOutput("y", {5}, output); + test.Run(); +} + +TEST(Einsum, ExplicitEinsumAsTransposeOp_2D_input_Half) { + if (!HasCudaEnvironment(600)) { + return; + } + OpTester test("Einsum", 12, onnxruntime::kOnnxDomain); + test.AddAttribute("equation", "ji->ij"); + std::vector input_x_f = {1.f, 2.f, 3.f, 4.f}; + std::vector output_f = {1.f, 3.f, 2.f, 4.f}; + std::vector input_x(4); + std::vector output(4); + ConvertFloatToMLFloat16(input_x_f.data(), input_x.data(), 4); + ConvertFloatToMLFloat16(output_f.data(), output.data(), 4); + test.AddInput("x", {2, 2}, input_x); + test.AddOutput("y", {2, 2}, output); + test.Run(); +} + +TEST(Einsum, ExplicitEinsumAsReduceOp_2D_input_0_Half) { + if (!HasCudaEnvironment(600)) { + return; + } + OpTester test("Einsum", 12, onnxruntime::kOnnxDomain); + test.AddAttribute("equation", "ij->i"); + std::vector input_x_f = {1.f, 2.f, 3.f, 4.f}; + std::vector output_f = {3.f, 7.f}; + std::vector input_x(4); + std::vector output(2); + ConvertFloatToMLFloat16(input_x_f.data(), input_x.data(), 4); + ConvertFloatToMLFloat16(output_f.data(), output.data(), 2); + test.AddInput("x", {2, 2}, input_x); + test.AddOutput("y", {2}, output); + test.Run(); +} + +TEST(Einsum, ExplicitEinsumAsOuterProductOp_2D_input_Half) { + if (!HasCudaEnvironment(600)) { + return; + } + OpTester test("Einsum", 12, onnxruntime::kOnnxDomain); + test.AddAttribute("equation", "i,j->ij"); + std::vector input_x_f = {1.f, 2.f}; + std::vector input_y_f = {3.f, 4.f}; + std::vector output_f = {3.f, 4.f, 6.f, 8.f}; + std::vector input_x(2); + std::vector input_y(2); + std::vector output(4); + ConvertFloatToMLFloat16(input_x_f.data(), input_x.data(), 2); + ConvertFloatToMLFloat16(input_y_f.data(), input_y.data(), 2); + ConvertFloatToMLFloat16(output_f.data(), output.data(), 4); + test.AddInput("x", {2}, input_x); + test.AddInput("y", {2}, input_y); + test.AddOutput("o", {2, 2}, output); + test.Run(); +} + +TEST(Einsum, ExplicitEinsumAsMatmul_Half) { + if (!HasCudaEnvironment(600)) { + return; + } + OpTester test("Einsum", 12, onnxruntime::kOnnxDomain); + test.AddAttribute("equation", "ij,jk->ik"); + std::vector input_x_f = {1.f, 2.f, 3.f, 4.f}; + std::vector input_y_f = {1.f, 2.f, 3.f, 4.f}; + std::vector output_f = {7.f, 10.f, 15.f, 22.f}; + std::vector input_x(4); + std::vector input_y(4); + std::vector output(4); + ConvertFloatToMLFloat16(input_x_f.data(), input_x.data(), 4); + ConvertFloatToMLFloat16(input_y_f.data(), input_y.data(), 4); + ConvertFloatToMLFloat16(output_f.data(), output.data(), 4); + test.AddInput("x", {2, 2}, input_x); + test.AddInput("y", {2, 2}, input_y); + test.AddOutput("o", {2, 2}, output); + test.Run(); +} + +TEST(Einsum, ExplicitEinsumAsBatchedMatmul_Half) { + if (!HasCudaEnvironment(600)) { + return; + } + OpTester test("Einsum", 12, onnxruntime::kOnnxDomain); + test.AddAttribute("equation", "bij,bjk->bik"); + std::vector input_x_f = {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}; + std::vector input_y_f = {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}; + std::vector output_f = {7.f, 10.f, 15.f, 22.f, 7.f, 10.f, 15.f, 22.f}; + std::vector input_x(8); + std::vector input_y(8); + std::vector output(8); + ConvertFloatToMLFloat16(input_x_f.data(), input_x.data(), 8); + ConvertFloatToMLFloat16(input_y_f.data(), input_y.data(), 8); + ConvertFloatToMLFloat16(output_f.data(), output.data(), 8); + test.AddInput("x", {2, 2, 2}, input_x); + test.AddInput("y", {2, 2, 2}, input_y); + test.AddOutput("o", {2, 2, 2}, output); + test.Run(); +} + +TEST(Einsum, ExplicitEinsumAsDiagonalOp_Half) { + if (!HasCudaEnvironment(600)) { + return; + } + OpTester test("Einsum", 12, onnxruntime::kOnnxDomain); + test.AddAttribute("equation", "ii->i"); + std::vector input_x_f = {1.f, 2.f, 3.f, 4.f}; + std::vector output_f = {1.f, 4.f}; + std::vector input_x(4); + std::vector output(2); + ConvertFloatToMLFloat16(input_x_f.data(), input_x.data(), 4); + ConvertFloatToMLFloat16(output_f.data(), output.data(), 2); + test.AddInput("x", {2, 2}, input_x); + test.AddOutput("o", {2}, output); + test.Run(); +} + +TEST(Einsum, ExplicitEinsumAsElementwiseMulOpWithOneScalar_Half) { + if (!HasCudaEnvironment(600)) { + return; + } + OpTester test("Einsum", 12, onnxruntime::kOnnxDomain); + test.AddAttribute("equation", ",...i->...i"); + std::vector input_x_f = {10.f}; + std::vector input_y_f = {1.f, 2.f, 3.f, 4.f}; + std::vector output_f = {10.f, 20.f, 30.f, 40.f}; + std::vector input_x(1); + std::vector input_y(4); + std::vector output(4); + ConvertFloatToMLFloat16(input_x_f.data(), input_x.data(), 1); + ConvertFloatToMLFloat16(input_y_f.data(), input_y.data(), 4); + ConvertFloatToMLFloat16(output_f.data(), output.data(), 4); + test.AddInput("x", {}, input_x); + test.AddInput("y", {2, 2}, input_y); + test.AddOutput("o", {2, 2}, output); + test.Run(); +} + +TEST(Einsum, ExplicitEinsumAsTensorContraction_Half) { + if (!HasCudaEnvironment(600)) { + return; + } + OpTester test("Einsum", 12, onnxruntime::kOnnxDomain); + test.AddAttribute("equation", "abcd,ea->bcde"); + std::vector input_x_f = {1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f}; + std::vector input_y_f = {1.f, 2.f, 1.f, 2.f}; + std::vector output_f = {3.f, 3.f, 6.f, 6.f, 3.f, 3.f, 6.f, 6.f, 3.f, 3.f, 6.f, 6.f, 3.f, 3.f, 6.f, 6.f}; + std::vector input_x(16); + std::vector input_y(4); + std::vector output(16); + ConvertFloatToMLFloat16(input_x_f.data(), input_x.data(), 16); + ConvertFloatToMLFloat16(input_y_f.data(), input_y.data(), 4); + ConvertFloatToMLFloat16(output_f.data(), output.data(), 16); + test.AddInput("x", {2, 2, 2, 2}, input_x); + test.AddInput("y", {2, 2}, input_y); + test.AddOutput("o", {2, 2, 2, 2}, output); + test.Run(); +} + } // namespace test } // namespace onnxruntime