diff --git a/onnxruntime/core/providers/cpu/math/matmul.cc b/onnxruntime/core/providers/cpu/math/matmul.cc index 583ee759cc..16bb1ddfce 100644 --- a/onnxruntime/core/providers/cpu/math/matmul.cc +++ b/onnxruntime/core/providers/cpu/math/matmul.cc @@ -103,6 +103,13 @@ Status MatMul::Compute(OpKernelContext* ctx) const { if (y->Shape().Size() == 0) return Status::OK(); + if (helper.K() == 0) { + // K == 0 (dims are M, 0, N): both inputs are empty, yet the output is not + // (its size was checked above), so it must be filled with zeros. + memset(y->MutableDataRaw(), 0, y->SizeInBytes()); + return Status::OK(); + } + // Using DataRaw as int32_t/uint32_t and int64_t/uint64_t share a common // operator body. const auto* a_data = reinterpret_cast(a->DataRaw()); diff --git a/onnxruntime/core/providers/cuda/math/matmul.cc b/onnxruntime/core/providers/cuda/math/matmul.cc index 6e126fbead..04ffa875c1 100644 --- a/onnxruntime/core/providers/cuda/math/matmul.cc +++ b/onnxruntime/core/providers/cuda/math/matmul.cc @@ -110,7 +110,16 @@ Status MatMul::ComputeInternal(OpKernelContext* ctx) const { Tensor* Y = ctx->Output(0, helper.OutputShape()); // Bail out early if the output is going to be empty - if (Y->Shape().Size() == 0) return Status::OK(); + const auto output_size = Y->Shape().Size(); + if (output_size == 0) return Status::OK(); + + if (helper.K() == 0) { + // K == 0 (dims are M, 0, N): both inputs are empty, yet the output is not + // (its size was checked above), so it must be filled with zeros. 
+ using CudaT = typename ToCudaType::MappedType; + Fill(Stream(ctx), reinterpret_cast(Y->MutableData()), CudaT(0.f), narrow(output_size)); + return Status::OK(); + } if (GetTuningContext()->IsTunableOpEnabled()) { return tunable::TunableMatMul(alpha_, trans_a, trans_b, trans_batch_a_, trans_batch_b_, helper, this, ctx); diff --git a/onnxruntime/test/providers/cpu/math/matmul_test.cc b/onnxruntime/test/providers/cpu/math/matmul_test.cc index 82f6914d08..b7ae0a9f0d 100644 --- a/onnxruntime/test/providers/cpu/math/matmul_test.cc +++ b/onnxruntime/test/providers/cpu/math/matmul_test.cc @@ -219,6 +219,27 @@ TEST(MathOpTest, MatMulUint64Type) { RunMatMulTest(9); } +TEST(MathOpTest, MatMul_ZeroK) { + // K == 0: A is {4, 0} and B is {0, 4} (both empty); Y is {4, 4}, all zeros. + constexpr const std::array empty_input{}; + const std::vector expected_output{0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0}; + OpTester test("MatMul", 14); + + test.AddInput("A", {4, 0}, empty_input); + test.AddInput("B", {0, 4}, empty_input); + test.AddOutput("Y", {4, 4}, expected_output); + + // Exclude EPs for which no K == 0 special case is implemented. + test.ConfigExcludeEps({kCoreMLExecutionProvider, kNnapiExecutionProvider, + kDmlExecutionProvider, kDnnlExecutionProvider, kQnnExecutionProvider, + kOpenVINOExecutionProvider}) + .Config(run_with_tunable_op) + .RunWithConfig(); +} + #if defined(USE_CUDA) || defined(USE_ROCM) TEST(MathOpTest, MatMul_Float16) { #ifdef USE_CUDA