MLAS: Apply 'small-M' optimization for column-vectors (#2971)

Apply 'small-M' optimization for column-vectors in MlasSgemmOperation
This commit is contained in:
Maxim Kalinin 2020-02-04 14:51:20 -08:00 committed by GitHub
parent 7ff5c0e5a3
commit d5efbcb8d8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 29 additions and 0 deletions

View file

@ -850,6 +850,34 @@ Return Value:
}
//
// Handle the case when both B and C are column-vectors that are contiguous in memory.
// Because transposition of such vectors doesn't change their layout, and
// Transpose(A*B) = Transpose(B) * Transpose(A), we can apply the same 'small-M'
// optimization as above, with A and B flipped.
//
// The fast path is only taken when alpha is exactly 1 and beta is exactly 0 or 1;
// the kernel call below is given beta but no alpha argument.
//
if (N == 1 && ldb == 1 && ldc == 1 && alpha == 1.0f && (beta == 0.0f || beta == 1.0f)) {
#if defined(MLAS_TARGET_AMD64)
PMLAS_SGEMM_KERNEL_M1_ROUTINE SgemmKernelM1Routine;
//
// A is handed to the kernel as its second matrix operand, so the routine is
// selected from TransA. The flipped product consumes Transpose(A), which is
// why an untransposed A selects the "transpose B" routine and a transposed A
// selects the plain one.
//
if (TransA == CblasNoTrans) {
SgemmKernelM1Routine = MlasPlatform.KernelM1TransposeBRoutine;
} else {
SgemmKernelM1Routine = MlasPlatform.KernelM1Routine;
}
//
// The platform table may hold no routine (nullptr). In that case nothing is
// done here and execution continues with the code that follows this block.
//
if (SgemmKernelM1Routine != nullptr) {
//
// Operands are swapped relative to the usual order: B is passed first and A
// second, with M and lda supplied alongside A.
// NOTE(review): the kernel's parameter list is not visible here; presumably
// these two slots are the second operand's column count and leading
// dimension -- confirm against the PMLAS_SGEMM_KERNEL_M1_ROUTINE declaration.
//
SgemmKernelM1Routine(B, A, C, K, M, lda, beta);
return;
}
#endif
}
//
// Compute the strides to step through slices of the input matrices.
//

View file

@ -396,6 +396,7 @@ public:
//
// Run Test once for every ordered pair of entries in multipliers, first with
// the leading size argument fixed to 1 and then with the second size argument
// fixed to 1.
// NOTE(review): Test's signature is not visible here; presumably the
// arguments are (M, N, K, alpha, beta), so the two calls would cover a
// single-row and a single-column output respectively -- confirm.
//
for (size_t a = 0; a < _countof(multipliers); a++) {
for (size_t b = 0; b < _countof(multipliers); b++) {
Test(1, N, K, multipliers[a], multipliers[b]);
Test(N, 1, K, multipliers[a], multipliers[b]);
}
}
}