fix bug: getting current cpu core type (#10630)

The previously merged pull request has a bug:

#10521

It aimed to detect the current CPU core's micro-architecture and select the best-suited kernel. Unfortunately, it assumes that a thread can never migrate from one core to another.

This change tries to fix that problem. It introduces about a 2-5% performance degradation on symmetric quantized matmul.

Co-authored-by: Chen Fu <fuchen@microsoft.com>
This commit is contained in:
Chen Fu 2022-02-25 08:56:14 -08:00 committed by GitHub
parent 617474e298
commit 12c44bfc4e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 107 additions and 71 deletions

View file

@ -109,6 +109,12 @@ Abstract:
#include "core/common/cpuid_info.h"
using MLAS_CPUIDINFO = onnxruntime::CPUIDInfo;
#include <unistd.h>
#include <sys/syscall.h>
#if !defined(__NR_getcpu)
#include <asm-generic/unistd.h>
#endif
#endif // MLAS_TARGET_ARM64
#else // BUILD_MLAS_NO_ONNXRUNTIME
@ -126,6 +132,8 @@ class MLASCPUIDInfo
// ARM
bool HasArmNeonDot() const { return has_arm_neon_dot_; }
int32_t GetCurrentUarch() { return -1; }
private:
MLASCPUIDInfo();
@ -774,6 +782,47 @@ struct MLAS_CONV_SYM_POST_PROCESS_PARAMS {
// Environment information class.
//
/**
* @brief IDs for cpu microarchitectures.
*
* Copied from python cpuinfo package. Can't use the definition
* from cpuinfo directly as it causes lots of compilation issues
* in many platforms that we support.
*
* NOTE(review): the identifiers look like those of the pytorch/cpuinfo C
* library rather than a Python package - confirm the source, and keep these
* values in sync with it.
*
* Only a subset of ARM Cortex-A IDs is mirrored here. The values are compared
* against the result of MLAS_CPUIDINFO::GetCPUIDInfo().GetCurrentUarch(), so
* they must match whatever that call reports.
*/
enum MlasUArch {
cpuinfo_uarch_unknown = 0,
/** ARM Cortex-A32. */
cpuinfo_uarch_cortex_a32 = 0x00300332,
/** ARM Cortex-A35. */
cpuinfo_uarch_cortex_a35 = 0x00300335,
/** ARM Cortex-A53. */
cpuinfo_uarch_cortex_a53 = 0x00300353,
/** ARM Cortex-A55 revision 0 (restricted dual-issue capabilities compared to revision 1+). */
cpuinfo_uarch_cortex_a55r0 = 0x00300354,
/** ARM Cortex-A55. */
cpuinfo_uarch_cortex_a55 = 0x00300355,
/** ARM Cortex-A57. */
cpuinfo_uarch_cortex_a57 = 0x00300357,
/** ARM Cortex-A65. */
cpuinfo_uarch_cortex_a65 = 0x00300365,
/** ARM Cortex-A72. */
cpuinfo_uarch_cortex_a72 = 0x00300372,
/** ARM Cortex-A73. */
cpuinfo_uarch_cortex_a73 = 0x00300373,
/** ARM Cortex-A75. */
cpuinfo_uarch_cortex_a75 = 0x00300375,
/** ARM Cortex-A76. */
cpuinfo_uarch_cortex_a76 = 0x00300376,
/** ARM Cortex-A77. */
cpuinfo_uarch_cortex_a77 = 0x00300377,
/** ARM Cortex-A78. */
cpuinfo_uarch_cortex_a78 = 0x00300378,
};
/**
* @brief Coarse classification of a CPU core.
*
* mlas_core_unknown: the core has not been classified yet.
* mlas_core_little:  little core with narrow memory load (e.g. ARMv8 a53).
* mlas_core_big:     big core with wider load (e.g. ARMv8 a72).
*/
enum MlasCoreType { mlas_core_unknown = 0, mlas_core_little = 2, mlas_core_big = 3 };
struct MLAS_PLATFORM {
MLAS_PLATFORM(void);
@ -836,6 +885,51 @@ struct MLAS_PLATFORM {
static constexpr int32_t MaximumThreadCount = MLAS_MAXIMUM_THREAD_COUNT;
#endif
#if defined(MLAS_TARGET_ARM64) && defined(__linux__)
// TODO!! implement uarch detection in Windows
// Per-core cache of detected core types, indexed by the cpu number reported
// by the getcpu syscall. Entries hold mlas_core_unknown until GetCoreType()
// classifies the corresponding core. An empty table means detection is
// unavailable and GetCoreType() falls back to mlas_core_big.
std::vector<MlasCoreType> mlas_coretype_tbl;
#endif
/**
* @brief Classify the core the calling thread is currently running on.
*
* On ARM64 Linux the current cpu number is obtained with the getcpu syscall
* and used as an index into mlas_coretype_tbl, which caches one result per
* core. On a cache miss the micro-architecture is queried through
* MLAS_CPUIDINFO and Cortex-A53 / A55r0 / A55 are reported as little cores;
* everything else is reported as a big core.
*
* This method is invoked concurrently from worker threads, so it never
* resizes mlas_coretype_tbl: growing the vector could reallocate its storage
* while another thread is reading it. A cpu number beyond the table is
* classified on every call without being cached.
*
* NOTE(review): the per-entry read/write below is still unsynchronized, and a
* thread may migrate between the getcpu call and the uarch query - confirm
* whether the entries should become atomics.
*
* @return mlas_core_little (2) current core is little core with narrow memory load (e.g. ARMv8 a53)
*         mlas_core_big (3) current core is big core with wider load (e.g. ARMv8 a72);
*         also the fallback when detection is unavailable or fails.
*/
MlasCoreType GetCoreType()
{
#if defined(MLAS_TARGET_ARM64) && defined(__linux__)
    if (mlas_coretype_tbl.empty()) {
        // functionality missing, return default
        return mlas_core_big;
    }

    unsigned cpu = 0;
    // nullptr (not NULL) so that a pointer-sized argument is always passed
    // through the variadic syscall interface.
    if (syscall(__NR_getcpu, &cpu, nullptr, nullptr) != 0) {
        // failed to detect current core id. give up
        return mlas_core_big;
    }

    // The table is sized once at platform initialization; cores with a
    // number outside of it are handled without touching the table.
    const bool cacheable = cpu < mlas_coretype_tbl.size();
    if (cacheable) {
        const MlasCoreType cached = mlas_coretype_tbl[cpu];
        if (cached != mlas_core_unknown) {
            return cached;
        }
    }

    const auto uarch = MLAS_CPUIDINFO::GetCPUIDInfo().GetCurrentUarch();
    const bool is_little = uarch == cpuinfo_uarch_cortex_a53 ||
                           uarch == cpuinfo_uarch_cortex_a55r0 ||
                           uarch == cpuinfo_uarch_cortex_a55;
    const MlasCoreType core_type = is_little ? mlas_core_little : mlas_core_big;

    if (cacheable) {
        mlas_coretype_tbl[cpu] = core_type;
    }
    return core_type;
#else
    return mlas_core_big;
#endif
}
};
inline
@ -1987,72 +2081,3 @@ MlasReadTimeStampCounter(void)
#endif
#endif
}
/**
* @brief IDs for cpu microarchitectures.
*
* Copied from python cpuinfo package. Can't use the definition
* from cpuinfo directly as it causes lots of compilation issues
* in many platforms that we support.
*
* NOTE(review): the identifiers look like those of the pytorch/cpuinfo C
* library rather than a Python package - confirm the source, and keep these
* values in sync with it.
*
* Only a subset of ARM Cortex-A IDs is mirrored here. The values are compared
* against the result of MLAS_CPUIDINFO::GetCPUIDInfo().GetCurrentUarch(), so
* they must match whatever that call reports.
*/
enum MlasUArch {
cpuinfo_uarch_unknown = 0,
/** ARM Cortex-A32. */
cpuinfo_uarch_cortex_a32 = 0x00300332,
/** ARM Cortex-A35. */
cpuinfo_uarch_cortex_a35 = 0x00300335,
/** ARM Cortex-A53. */
cpuinfo_uarch_cortex_a53 = 0x00300353,
/** ARM Cortex-A55 revision 0 (restricted dual-issue capabilities compared to revision 1+). */
cpuinfo_uarch_cortex_a55r0 = 0x00300354,
/** ARM Cortex-A55. */
cpuinfo_uarch_cortex_a55 = 0x00300355,
/** ARM Cortex-A57. */
cpuinfo_uarch_cortex_a57 = 0x00300357,
/** ARM Cortex-A65. */
cpuinfo_uarch_cortex_a65 = 0x00300365,
/** ARM Cortex-A72. */
cpuinfo_uarch_cortex_a72 = 0x00300372,
/** ARM Cortex-A73. */
cpuinfo_uarch_cortex_a73 = 0x00300373,
/** ARM Cortex-A75. */
cpuinfo_uarch_cortex_a75 = 0x00300375,
/** ARM Cortex-A76. */
cpuinfo_uarch_cortex_a76 = 0x00300376,
/** ARM Cortex-A77. */
cpuinfo_uarch_cortex_a77 = 0x00300377,
/** ARM Cortex-A78. */
cpuinfo_uarch_cortex_a78 = 0x00300378,
};
/**
* @brief Coarse classification of a CPU core.
*
* mlas_core_unknown: the core has not been classified yet.
* mlas_core_little:  little core with narrow memory load (e.g. ARMv8 a53).
* mlas_core_big:     big core with wider load (e.g. ARMv8 a72).
*/
enum MlasCoreType { mlas_core_unknown = 0, mlas_core_little = 2, mlas_core_big = 3 };
/**
* @brief Classify the core the calling thread is currently running on.
*
* The result is detected on every call instead of being cached in a
* thread_local: a thread can migrate between cores, so a per-thread cache
* would keep reporting the type of whichever core the thread first ran on.
*
* On ARM64 Linux, Cortex-A53 / A55r0 / A55 are reported as little cores and
* everything else as a big core. On all other targets the result is always
* mlas_core_big.
*
* @return 2 current core is little core with narrow memory load (e.g. ARMv8 a53)
*         3 current core is big core with wider load (e.g. ARMv8 a72)
*/
MLAS_FORCEINLINE
int32_t
MlasGetCoreUArch()
{
#if defined(MLAS_TARGET_ARM64) && defined(__linux__)
    const auto uarch = MLAS_CPUIDINFO::GetCPUIDInfo().GetCurrentUarch();
    const bool is_little = uarch == cpuinfo_uarch_cortex_a53 ||
                           uarch == cpuinfo_uarch_cortex_a55r0 ||
                           uarch == cpuinfo_uarch_cortex_a55;
    return is_little ? mlas_core_little : mlas_core_big;
#else
    return mlas_core_big;
#endif  // MLAS_TARGET_ARM64
}

View file

@ -17,6 +17,9 @@ Abstract:
#include "mlasi.h"
#include <thread>
#include <mutex>
#if defined(MLAS_TARGET_POWER) && defined(__linux__)
#include <sys/auxv.h>
#endif
@ -394,6 +397,14 @@ Return Value:
#endif
#endif
// Init the table describing the type (big or little) of each core
#if defined(MLAS_TARGET_ARM64) && defined(__linux__)
// TODO!! implement core uarch detection in Windows
auto tbl_size = std::thread::hardware_concurrency();
if (tbl_size > 0) {
mlas_coretype_tbl.resize(tbl_size, mlas_core_unknown);
}
#endif
}
size_t

View file

@ -209,7 +209,7 @@ MlasSymmQgemmBatch(
if (ThreadPool == nullptr) {
// So our caller handles threaded job partition.
// Call single threaded operation directly
auto uarch = MlasGetCoreUArch();
auto uarch = GetMlasPlatform().GetCoreType();
MLAS_SYMM_QGEMM_OPERATION* operation =
uarch == mlas_core_little ? dispatch->LitOperation : dispatch->BigOperation;
@ -260,7 +260,7 @@ MlasSymmQgemmBatch(
ThreadsPerGemm = ThreadCountM * ThreadCountN;
MlasTrySimpleParallel(ThreadPool, ThreadsPerGemm * BatchN, [&](ptrdiff_t tid) {
auto uarch = MlasGetCoreUArch();
auto uarch = GetMlasPlatform().GetCoreType();
MLAS_SYMM_QGEMM_OPERATION* operation =
uarch == mlas_core_little ? dispatch->LitOperation : dispatch->BigOperation;