From d5efbcb8d88fbe16d60d2cb39d09699d141fb847 Mon Sep 17 00:00:00 2001 From: Maxim Kalinin Date: Tue, 4 Feb 2020 14:51:20 -0800 Subject: [PATCH] MLAS: Apply 'small-M' optimization for column-vectors (#2971) Apply 'small-M' optimization for column-vectors in MlasSgemmOperation --- onnxruntime/core/mlas/lib/sgemm.cpp | 28 ++++++++++++++++++++++++++++ onnxruntime/test/mlas/unittest.cpp | 1 + 2 files changed, 29 insertions(+) diff --git a/onnxruntime/core/mlas/lib/sgemm.cpp b/onnxruntime/core/mlas/lib/sgemm.cpp index 92ad6d6416..4f6903f0b0 100644 --- a/onnxruntime/core/mlas/lib/sgemm.cpp +++ b/onnxruntime/core/mlas/lib/sgemm.cpp @@ -850,6 +850,34 @@ Return Value: } + // + // Handle the case when both B and C are column-vectors that are contiguous in memory. + // Because transposition of such vectors doesn't change their layout, and + // Transpose(A*B) = Transpose(B) * Transpose(A), we can apply the same 'small-M' + // optimization as above, with A and B flipped. + // + if (N == 1 && ldb == 1 && ldc == 1 && alpha == 1.0f && (beta == 0.0f || beta == 1.0f)) { + +#if defined(MLAS_TARGET_AMD64) + + PMLAS_SGEMM_KERNEL_M1_ROUTINE SgemmKernelM1Routine; + + if (TransA == CblasNoTrans) { + SgemmKernelM1Routine = MlasPlatform.KernelM1TransposeBRoutine; + } else { + SgemmKernelM1Routine = MlasPlatform.KernelM1Routine; + } + + if (SgemmKernelM1Routine != nullptr) { + SgemmKernelM1Routine(B, A, C, K, M, lda, beta); + return; + } + +#endif + + } + + // // Compute the strides to step through slices of the input matrices. // diff --git a/onnxruntime/test/mlas/unittest.cpp b/onnxruntime/test/mlas/unittest.cpp index daa1377576..d78167f8a5 100644 --- a/onnxruntime/test/mlas/unittest.cpp +++ b/onnxruntime/test/mlas/unittest.cpp @@ -396,6 +396,7 @@ public: for (size_t a = 0; a < _countof(multipliers); a++) { for (size_t b = 0; b < _countof(multipliers); b++) { Test(1, N, K, multipliers[a], multipliers[b]); + Test(N, 1, K, multipliers[a], multipliers[b]); } } }