MLAS: Apply 'small-M' optimization for column-vectors (#2971)

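Apply the 'small-M' optimization in MlasSgemmOperation when B and C are contiguous column-vectors (N == 1, ldb == 1, ldc == 1) by calling the existing M=1 kernels with A and B swapped, and add a unit-test case that exercises N == 1.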
2026-05-18 21:21:17 +00:00 · 2020-02-04 14:51:20 -08:00 · 2020-02-04 14:51:20 -08:00 · d5efbcb8d8
commit d5efbcb8d8
parent 7ff5c0e5a3
2 changed files with 29 additions and 0 deletions
--- a/onnxruntime/core/mlas/lib/sgemm.cpp
+++ b/onnxruntime/core/mlas/lib/sgemm.cpp
@ -850,6 +850,34 @@ Return Value:

    }

+    //
+    // Handle the case when both B and C are column-vectors that are contiguous in memory
+    // (N == 1 with ldb == 1 and ldc == 1). Transposing such a vector doesn't change its
+    // layout, and Transpose(A*B) = Transpose(B) * Transpose(A), so we can apply the same
+    // 'small-M' optimization as above, with A and B flipped.
+    //
+    if (N == 1 && ldb == 1 && ldc == 1 && alpha == 1.0f && (beta == 0.0f || beta == 1.0f)) {
+
+#if defined(MLAS_TARGET_AMD64)
+
+        PMLAS_SGEMM_KERNEL_M1_ROUTINE SgemmKernelM1Routine;
+
+        if (TransA == CblasNoTrans) {
+            SgemmKernelM1Routine = MlasPlatform.KernelM1TransposeBRoutine;
+        } else {
+            SgemmKernelM1Routine = MlasPlatform.KernelM1Routine;
+        }
+
+        if (SgemmKernelM1Routine != nullptr) {
+            SgemmKernelM1Routine(B, A, C, K, M, lda, beta);
+            return;
+        }
+
+#endif
+
+    }
+
+
    //
    // Compute the strides to step through slices of the input matrices.
    //
--- a/onnxruntime/test/mlas/unittest.cpp
+++ b/onnxruntime/test/mlas/unittest.cpp
@ -396,6 +396,7 @@ public:
                for (size_t a = 0; a < _countof(multipliers); a++) {
                    for (size_t b = 0; b < _countof(multipliers); b++) {
                        Test(1, N, K, multipliers[a], multipliers[b]);
+                        Test(N, 1, K, multipliers[a], multipliers[b]);
                    }
                }
            }