[CUDA] Special case for K==0 in CUDA MatMul (#21525)

### Description
This change addresses the case where we multiply two matrices whose
inner dimension (K) is 0.
numpy and Eigen (the latter is used in our CPU EP implementation)
correctly handle this case
and output an [M, N] matrix filled with zeros.

### Motivation and Context
This is required to support the GenAI LoRA implementation with empty inputs.

Addresses: https://github.com/microsoft/onnxruntime/issues/21483
This commit is contained in:
Dmitri Smirnov 2024-08-13 11:27:05 -07:00 committed by GitHub
parent 6af5394bd7
commit c2911bbb1c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 38 additions and 1 deletions

View file

@ -103,6 +103,13 @@ Status MatMul<T>::Compute(OpKernelContext* ctx) const {
if (y->Shape().Size() == 0)
return Status::OK();
if (helper.K() == 0) {
// When we have (M, 0, N) then the inputs are empty, but the output should
// be filled out with zeros.
memset(y->MutableDataRaw(), 0, y->SizeInBytes());
return Status::OK();
}
// Using DataRaw as int32_t/uint32_t and int64_t/uint64_t share a common
// operator body.
const auto* a_data = reinterpret_cast<const T*>(a->DataRaw());

View file

@ -110,7 +110,16 @@ Status MatMul<T>::ComputeInternal(OpKernelContext* ctx) const {
Tensor* Y = ctx->Output(0, helper.OutputShape());
// Bail out early if the output is going to be empty
if (Y->Shape().Size() == 0) return Status::OK();
const auto output_size = Y->Shape().Size();
if (output_size == 0) return Status::OK();
if (helper.K() == 0) {
// When we have (M, 0, N) then the inputs are empty, but the output should
// be filled out with zeros.
using CudaT = typename ToCudaType<T>::MappedType;
Fill<CudaT>(Stream(ctx), reinterpret_cast<CudaT*>(Y->MutableData<T>()), CudaT(0.f), narrow<int64_t>(output_size));
return Status::OK();
}
if (GetTuningContext()->IsTunableOpEnabled()) {
return tunable::TunableMatMul<T>(alpha_, trans_a, trans_b, trans_batch_a_, trans_batch_b_, helper, this, ctx);

View file

@ -219,6 +219,27 @@ TEST(MathOpTest, MatMulUint64Type) {
RunMatMulTest<uint64_t>(9);
}
TEST(MathOpTest, MatMul_ZeroK) {
  // A is [4, 0] and B is [0, 4]: the shared inner dimension is 0, so both
  // inputs carry no elements, while the [4, 4] output must be all zeros.
  constexpr std::array<float, 0> no_data{};
  const std::vector<float> all_zeros(16, 0.0f);

  OpTester tester("MatMul", 14);
  tester.AddInput<float>("A", {4, 0}, no_data);
  tester.AddInput<float>("B", {0, 4}, no_data);
  tester.AddOutput<float>("Y", {4, 4}, all_zeros);

  // The execution providers excluded below have no special case implemented
  // (presumably for the zero inner dimension -- confirm per EP before re-enabling).
  tester
      .ConfigExcludeEps({kCoreMLExecutionProvider, kNnapiExecutionProvider,
                         kDmlExecutionProvider, kDnnlExecutionProvider,
                         kQnnExecutionProvider, kOpenVINOExecutionProvider})
      .Config(run_with_tunable_op)
      .RunWithConfig();
}
#if defined(USE_CUDA) || defined(USE_ROCM)
TEST(MathOpTest, MatMul_Float16) {
#ifdef USE_CUDA