[CUDA] Special case for K==0 in CUDA MatMul (#21525)

### Description This change addresses the case where we multiply two matrices whose inner dimension (K) is 0. numpy and Eigen (which is used in our CPU EP implementation) both handle this case correctly and output an [M, N] matrix filled with zeros. ### Motivation and Context This is required to support the GenAI empty-input LoRA implementation. Addresses: https://github.com/microsoft/onnxruntime/issues/21483
2026-07-24 19:43:35 +00:00 · 2024-08-13 11:27:05 -07:00 · 2024-08-13 11:27:05 -07:00 · c2911bbb1c
commit c2911bbb1c
parent 6af5394bd7
3 changed files with 38 additions and 1 deletions
--- a/onnxruntime/core/providers/cpu/math/matmul.cc
+++ b/onnxruntime/core/providers/cpu/math/matmul.cc
@ -103,6 +103,13 @@ Status MatMul<T>::Compute(OpKernelContext* ctx) const {
  if (y->Shape().Size() == 0)
    return Status::OK();

+  if (helper.K() == 0) {
+    // When K == 0 (A is [M, 0] and B is [0, N]) the inputs are empty, but the
+    // [M, N] output must still be filled with zeros.
+    memset(y->MutableDataRaw(), 0, y->SizeInBytes());
+    return Status::OK();
+  }
+
  // Using DataRaw as int32_t/uint32_t and int64_t/uint64_t share a common
  // operator body.
  const auto* a_data = reinterpret_cast<const T*>(a->DataRaw());
--- a/onnxruntime/core/providers/cuda/math/matmul.cc
+++ b/onnxruntime/core/providers/cuda/math/matmul.cc
@ -110,7 +110,16 @@ Status MatMul<T>::ComputeInternal(OpKernelContext* ctx) const {

  Tensor* Y = ctx->Output(0, helper.OutputShape());
  // Bail out early if the output is going to be empty
-  if (Y->Shape().Size() == 0) return Status::OK();
+  const auto output_size = Y->Shape().Size();
+  if (output_size == 0) return Status::OK();
+
+  if (helper.K() == 0) {
+    // When K == 0 (A is [M, 0] and B is [0, N]) the inputs are empty, but the
+    // [M, N] output must still be filled with zeros.
+    using CudaT = typename ToCudaType<T>::MappedType;
+    Fill<CudaT>(Stream(ctx), reinterpret_cast<CudaT*>(Y->MutableData<T>()), CudaT(0.f), narrow<int64_t>(output_size));
+    return Status::OK();
+  }

  if (GetTuningContext()->IsTunableOpEnabled()) {
    return tunable::TunableMatMul<T>(alpha_, trans_a, trans_b, trans_batch_a_, trans_batch_b_, helper, this, ctx);
--- a/onnxruntime/test/providers/cpu/math/matmul_test.cc
+++ b/onnxruntime/test/providers/cpu/math/matmul_test.cc
@ -219,6 +219,27 @@ TEST(MathOpTest, MatMulUint64Type) {
  RunMatMulTest<uint64_t>(9);
 }

+TEST(MathOpTest, MatMul_ZeroK) {
+  // K == 0: A is [4, 0] and B is [0, 4], so both inputs are empty and Y must be a zero-filled [4, 4].
+  constexpr const std::array<float, 0> empty_input{};
+  const std::vector<float> expected_output{0, 0, 0, 0,
+                                           0, 0, 0, 0,
+                                           0, 0, 0, 0,
+                                           0, 0, 0, 0};
+  OpTester test("MatMul", 14);

+  test.AddInput<float>("A", {4, 0}, empty_input);
+  test.AddInput<float>("B", {0, 4}, empty_input);
+  test.AddOutput<float>("Y", {4, 4}, expected_output);

+  // Skip the EPs listed below: no K == 0 special case is implemented for them.
+  test.ConfigExcludeEps({kCoreMLExecutionProvider, kNnapiExecutionProvider,
+                         kDmlExecutionProvider, kDnnlExecutionProvider, kQnnExecutionProvider,
+                         kOpenVINOExecutionProvider})
+      .Config(run_with_tunable_op)
+      .RunWithConfig();
+}
+
 #if defined(USE_CUDA) || defined(USE_ROCM)
 TEST(MathOpTest, MatMul_Float16) {
 #ifdef USE_CUDA