mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-07-03 03:58:54 +00:00
[CUDA] Special case for K==0 in CUDA MatMul (#21525)
### Description
This change addresses the case where we multiply two matrices whose inner dimension is 0. NumPy and Eigen (the latter is used in our CPU EP implementation) handle this case correctly and output an [M, N] matrix filled with zeros.
### Motivation and Context
This is required to support the GenAI LoRA implementation with empty inputs. Addresses: https://github.com/microsoft/onnxruntime/issues/21483
This commit is contained in:
parent
6af5394bd7
commit
c2911bbb1c
3 changed files with 38 additions and 1 deletions
|
|
@ -103,6 +103,13 @@ Status MatMul<T>::Compute(OpKernelContext* ctx) const {
|
|||
if (y->Shape().Size() == 0)
|
||||
return Status::OK();
|
||||
|
||||
if (helper.K() == 0) {
|
||||
// When we have (M, 0, N) then the inputs are empty, but the output should
|
||||
// be filled out with zeros.
|
||||
memset(y->MutableDataRaw(), 0, y->SizeInBytes());
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Using DataRaw as int32_t/uint32_t and int64_t/uint64_t share a common
|
||||
// operator body.
|
||||
const auto* a_data = reinterpret_cast<const T*>(a->DataRaw());
|
||||
|
|
|
|||
|
|
@ -110,7 +110,16 @@ Status MatMul<T>::ComputeInternal(OpKernelContext* ctx) const {
|
|||
|
||||
Tensor* Y = ctx->Output(0, helper.OutputShape());
|
||||
// Bail out early if the output is going to be empty
|
||||
if (Y->Shape().Size() == 0) return Status::OK();
|
||||
const auto output_size = Y->Shape().Size();
|
||||
if (output_size == 0) return Status::OK();
|
||||
|
||||
if (helper.K() == 0) {
|
||||
// When we have (M, 0, N) then the inputs are empty, but the output should
|
||||
// be filled out with zeros.
|
||||
using CudaT = typename ToCudaType<T>::MappedType;
|
||||
Fill<CudaT>(Stream(ctx), reinterpret_cast<CudaT*>(Y->MutableData<T>()), CudaT(0.f), narrow<int64_t>(output_size));
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
if (GetTuningContext()->IsTunableOpEnabled()) {
|
||||
return tunable::TunableMatMul<T>(alpha_, trans_a, trans_b, trans_batch_a_, trans_batch_b_, helper, this, ctx);
|
||||
|
|
|
|||
|
|
@ -219,6 +219,27 @@ TEST(MathOpTest, MatMulUint64Type) {
|
|||
RunMatMulTest<uint64_t>(9);
|
||||
}
|
||||
|
||||
// Checks MatMul when the shared inner dimension K is zero: A is 4x0 and B is
// 0x4, so both inputs hold no elements and the 4x4 result is expected to be
// entirely zero.
TEST(MathOpTest, MatMul_ZeroK) {
  // Zero-length buffer shared by both (empty) inputs.
  constexpr std::array<float, 0> no_data{};
  // 4 x 4 = 16 output elements, each expected to be zero.
  const std::vector<float> all_zeros(16, 0.0f);

  OpTester test("MatMul", 14);
  test.AddInput<float>("A", {4, 0}, no_data);
  test.AddInput<float>("B", {0, 4}, no_data);
  test.AddOutput<float>("Y", {4, 4}, all_zeros);

  // The execution providers excluded below have no special case implemented.
  test.ConfigExcludeEps({kCoreMLExecutionProvider, kNnapiExecutionProvider,
                         kDmlExecutionProvider, kDnnlExecutionProvider,
                         kQnnExecutionProvider, kOpenVINOExecutionProvider})
      .Config(run_with_tunable_op)
      .RunWithConfig();
}
|
||||
|
||||
#if defined(USE_CUDA) || defined(USE_ROCM)
|
||||
TEST(MathOpTest, MatMul_Float16) {
|
||||
#ifdef USE_CUDA
|
||||
|
|
|
|||
Loading…
Reference in a new issue