mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-14 20:48:00 +00:00
### Description The default thread count methodology by onnxruntime did not account for new upcoming Intel microarchitectures leading to a suboptimal thread count. Optimizing the thread count for new Intel microarchitectures reveal gains on the majority of models across datatypes and shows gains up to ~1.5x speedup. ### Motivation and Context Applications should run on Intel with the most performant thread configuration for the majority of models. With new microarchitectures, adjusting the thread count methodology is required to take advantage of their differences. <!-- - Why is this change required? What problem does it solve? - If it fixes an open issue, please link to the issue here. -->
121 lines
3.6 KiB
C++
121 lines
3.6 KiB
C++
|
|
// Copyright (c) Microsoft Corporation. All rights reserved.
|
|
// Licensed under the MIT License.
|
|
|
|
#include "lib/Api/pch/pch.h"
|
|
#include "HardwareCoreEnumerator.h"
|
|
|
|
namespace WINMLP {
|
|
|
|
struct LogicalProcessorInformation {
|
|
std::unique_ptr<char[]> Buffer;
|
|
size_t Length;
|
|
};
|
|
|
|
struct CoreCounter {
|
|
uint32_t PhysicalCores = 0;
|
|
uint32_t LLCCores = 0;
|
|
};
|
|
|
|
static LogicalProcessorInformation GetLogicalProcessorInfos(LOGICAL_PROCESSOR_RELATIONSHIP relationship) {
|
|
DWORD length = 0;
|
|
DWORD rc = GetLogicalProcessorInformationEx(relationship, nullptr, &length);
|
|
|
|
assert(rc == FALSE);
|
|
|
|
auto processorInformationBytes = std::make_unique<char[]>(length);
|
|
|
|
rc = GetLogicalProcessorInformationEx(
|
|
relationship, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(processorInformationBytes.get()), &length
|
|
);
|
|
|
|
assert(rc == TRUE);
|
|
|
|
return {std::move(processorInformationBytes), length};
|
|
}
|
|
|
|
uint32_t CountSetBits(DWORD input) {
|
|
uint32_t c;
|
|
for (c = 0; input; c++) {
|
|
input &= input - 1;
|
|
}
|
|
return c;
|
|
}
|
|
|
|
static CoreCounter GetCoreInfo() {
|
|
auto logicalProcessorInformation = GetLogicalProcessorInfos(RelationAll);
|
|
|
|
CoreCounter cores;
|
|
DWORD dwLevel2GroupMask = 0;
|
|
DWORD dwLevel3GroupMask = 0;
|
|
size_t read = 0;
|
|
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX currentProcessorInfo = NULL;
|
|
|
|
while ((read + FIELD_OFFSET(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, Processor)) < logicalProcessorInformation.Length
|
|
) {
|
|
currentProcessorInfo =
|
|
reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(logicalProcessorInformation.Buffer.get() + read);
|
|
if ((read + currentProcessorInfo->Size) > logicalProcessorInformation.Length) {
|
|
break;
|
|
}
|
|
|
|
switch (currentProcessorInfo->Relationship) {
|
|
case RelationProcessorCore:
|
|
cores.PhysicalCores++;
|
|
break;
|
|
case RelationCache:
|
|
//Cache level masks count Logicial processors
|
|
if (currentProcessorInfo->Cache.Level == 2) {
|
|
dwLevel2GroupMask |= currentProcessorInfo->Cache.GroupMask.Mask;
|
|
} else if (currentProcessorInfo->Cache.Level == 3) {
|
|
dwLevel3GroupMask |= currentProcessorInfo->Cache.GroupMask.Mask;
|
|
}
|
|
break;
|
|
}
|
|
|
|
read += currentProcessorInfo->Size;
|
|
}
|
|
|
|
cores.LLCCores = cores.PhysicalCores - CountSetBits(dwLevel2GroupMask & ~dwLevel3GroupMask);
|
|
|
|
return cores;
|
|
}
|
|
|
|
uint32_t HardwareCoreEnumerator::DefaultIntraOpNumThreads() {
|
|
// # of physical cores = # of P cores + # of E Cores + # of Soc Cores.
|
|
// # of logical cores = # of P cores x 2 (if hyper threading is enabled) + # of E cores + # of Soc Cores.
|
|
auto cores = GetCoreInfo();
|
|
|
|
#if !defined(_M_ARM64EC) && !defined(_M_ARM64) && !defined(__aarch64__)
|
|
const int kVendorID_Intel[3] = {0x756e6547, 0x6c65746e, 0x49656e69}; // "GenuntelineI"
|
|
bool isIntelSpecifiedPlatform = false;
|
|
const int kVendorID_IntelSpecifiedPlatformIDs[3] = {
|
|
// ExtendedModel,ExtendedFamily,Family Code, and Model Number
|
|
0xa06a, // MTL
|
|
0xc065, // ARL-H
|
|
0xb065 // ARL-U
|
|
};
|
|
|
|
int regs_leaf0[4];
|
|
int regs_leaf1[4];
|
|
__cpuid(regs_leaf0, 0);
|
|
__cpuid(regs_leaf1, 0x1);
|
|
|
|
auto isIntel = (kVendorID_Intel[0] == regs_leaf0[1]) && (kVendorID_Intel[1] == regs_leaf0[2]) &&
|
|
(kVendorID_Intel[2] == regs_leaf0[3]);
|
|
|
|
for (int intelSpecifiedPlatform : kVendorID_IntelSpecifiedPlatformIDs) {
|
|
if ((regs_leaf1[0] >> 4) == intelSpecifiedPlatform) {
|
|
isIntelSpecifiedPlatform = true;
|
|
}
|
|
}
|
|
|
|
if (isIntel && isIntelSpecifiedPlatform) {
|
|
// We want to use the number of physical cores, but exclude cores without an LLC
|
|
return cores.LLCCores;
|
|
}
|
|
#endif
|
|
return cores.PhysicalCores;
|
|
}
|
|
|
|
} // namespace WINMLP
|