From d5efbcb8d88fbe16d60d2cb39d09699d141fb847 Mon Sep 17 00:00:00 2001
From: Maxim Kalinin <makalini@microsoft.com>
Date: Tue, 4 Feb 2020 14:51:20 -0800
Subject: [PATCH] MLAS: Apply 'small-M' optimization for column-vectors (#2971)

Apply 'small-M' optimization for column-vectors in MlasSgemmOperation
---
 onnxruntime/core/mlas/lib/sgemm.cpp | 28 ++++++++++++++++++++++++++++
 onnxruntime/test/mlas/unittest.cpp  |  1 +
 2 files changed, 29 insertions(+)

diff --git a/onnxruntime/core/mlas/lib/sgemm.cpp b/onnxruntime/core/mlas/lib/sgemm.cpp
index 92ad6d6416..4f6903f0b0 100644
--- a/onnxruntime/core/mlas/lib/sgemm.cpp
+++ b/onnxruntime/core/mlas/lib/sgemm.cpp
@@ -850,6 +850,34 @@ Return Value:
 
     }
 
+    //
+    // Handle the case where B and C are both contiguous column vectors (N == 1 with
+    // ldb == 1 and ldc == 1). Transposing such a vector does not change its layout, and
+    // Transpose(A*B) = Transpose(B) * Transpose(A), so the same 'small-M' optimization
+    // as above can be applied with the roles of A and B swapped.
+    //
+    if (N == 1 && ldb == 1 && ldc == 1 && alpha == 1.0f && (beta == 0.0f || beta == 1.0f)) {
+
+#if defined(MLAS_TARGET_AMD64)
+
+        PMLAS_SGEMM_KERNEL_M1_ROUTINE SgemmKernelM1Routine;
+
+        if (TransA == CblasNoTrans) {
+            SgemmKernelM1Routine = MlasPlatform.KernelM1TransposeBRoutine;
+        } else {
+            SgemmKernelM1Routine = MlasPlatform.KernelM1Routine;
+        }
+
+        if (SgemmKernelM1Routine != nullptr) {
+            SgemmKernelM1Routine(B, A, C, K, M, lda, beta);
+            return;
+        }
+
+#endif
+
+    }
+
+
     //
     // Compute the strides to step through slices of the input matrices.
     //
diff --git a/onnxruntime/test/mlas/unittest.cpp b/onnxruntime/test/mlas/unittest.cpp
index daa1377576..d78167f8a5 100644
--- a/onnxruntime/test/mlas/unittest.cpp
+++ b/onnxruntime/test/mlas/unittest.cpp
@@ -396,6 +396,7 @@ public:
                 for (size_t a = 0; a < _countof(multipliers); a++) {
                     for (size_t b = 0; b < _countof(multipliers); b++) {
                         Test(1, N, K, multipliers[a], multipliers[b]);
+                        Test(N, 1, K, multipliers[a], multipliers[b]);
                     }
                 }
             }