diff --git a/onnxruntime/core/platform/env.cc b/onnxruntime/core/platform/env.cc index 608f3e7f15..854c9d5214 100644 --- a/onnxruntime/core/platform/env.cc +++ b/onnxruntime/core/platform/env.cc @@ -19,6 +19,20 @@ limitations under the License. namespace onnxruntime { +std::ostream& operator<<(std::ostream& os, const LogicalProcessors& aff) { + os << "{"; + std::copy(aff.cbegin(), aff.cend(), std::ostream_iterator(os, ", ")); + return os << "}"; +} + +std::ostream& operator<<(std::ostream& os, gsl::span affs) { + os << "{"; + for (const auto& aff : affs) { + os << aff; + } + return os << "}"; +} + Env::Env() = default; } // namespace onnxruntime diff --git a/onnxruntime/core/platform/env.h b/onnxruntime/core/platform/env.h index 42a9bf7a72..c2fb51a575 100644 --- a/onnxruntime/core/platform/env.h +++ b/onnxruntime/core/platform/env.h @@ -16,6 +16,7 @@ limitations under the License. #pragma once +#include #include #include #include @@ -58,19 +59,30 @@ class EnvThread { OrtCustomThreadHandle custom_thread_handle = nullptr; }; +/// Type that holds a collection of logical processor IDs used for setting affinities. +using LogicalProcessors = std::vector; + // Parameters that are required to create a set of threads for a thread pool struct ThreadOptions { // Stack size for a new thread. If it is 0, the operating system uses the same value as the stack that's specified for // the main thread, which is usually set in the main executable(not controlled by onnxruntime.dll). unsigned int stack_size = 0; - // Thread affinity means a thread can only run on the logical processors that the thread is allowed to run on. - // If the vector is not empty, set the affinity of each thread to just one CPU. - // Index is thread index, value is CPU ID, starting from zero. For example, the first thread in the pool will be bound - // to the logical processor with id of affinity[0]. If the vector is empty, the thread can run on all the processors - // its process can run on. 
NOTE: When hyperthreading is enabled, for example, on a 4 cores 8 physical threads CPU, - // processor group [0,1,2,3] may only contain half of the physical cores. - std::vector affinity; + // Thread affinity means a thread can only run on the logical processor(s) that the thread is allowed to run on. + // If the vector is not empty, then set the affinity of each thread to the logical CPU ids listed in its LogicalProcessors entry. + // For example, the first thread in the pool will be bound to the logical processors contained in affinity[0]. + // If the vector is empty, the thread can run on all the processors its process can run on. + // NOTE: When hyperthreading is enabled, for example, a 4-core CPU would expose 8 logical processors, and + // processor group [0,1,2,3] may only occupy some of the physical cores. There might be more than 2 logical + // processors per physical core on a given computer. Physical cores assigned to a given VM may contain + // logical processor indices that do not start with 0 and possibly go beyond the number of bits in an integer. + // + // If the size of the thread pool (TP) is not specified, ORT creates thread pools with a number of threads equal + // to the number of visible physical cores. The thread affinities are set to all of the logical processors + // that are contained in a given physical core with the same index as the thread. ORT does not set any affinity + // to the thread that is considered main (the thread that initiates the creation of the TP). + // The process that owns the thread may consider setting its affinity. + std::vector affinity; // Set or unset denormal as zero. 
bool set_denormal_as_zero = false; @@ -80,6 +92,10 @@ struct ThreadOptions { OrtCustomJoinThreadFn custom_join_thread_fn = nullptr; int dynamic_block_base_ = 0; }; + +std::ostream& operator<<(std::ostream& os, const LogicalProcessors&); +std::ostream& operator<<(std::ostream& os, gsl::span); + /// \brief An interface used by the onnxruntime implementation to /// access operating system functionality like the filesystem etc. /// @@ -117,10 +133,14 @@ class Env { /// The result of Default() belongs to this library and must never be deleted. static Env& Default(); - virtual int GetNumCpuCores() const = 0; + /// + /// The API returns the number of different physical cores on the system + /// + /// Number of physical cores + virtual int GetNumPhysicalCpuCores() const = 0; - // This function doesn't support systems with more than 64 logical processors - virtual std::vector GetThreadAffinityMasks() const = 0; + // This function currently doesn't support systems with more than 64 logical processors on Windows + virtual std::vector GetThreadAffinityMasks() const = 0; /// \brief Returns the number of micro-seconds since the Unix epoch. virtual uint64_t NowMicros() const { diff --git a/onnxruntime/core/platform/posix/env.cc b/onnxruntime/core/platform/posix/env.cc index bcdd748f40..1e55ed1dce 100644 --- a/onnxruntime/core/platform/posix/env.cc +++ b/onnxruntime/core/platform/posix/env.cc @@ -16,25 +16,32 @@ limitations under the License. 
#include "core/platform/env.h" -#include + +#include +#include +#include +#include #include #include #include -#include #include -#include -#include -#include -#include -#include #include +#include +#include +#include + +#include +#include #include #include // for std::forward #include -#include #include +#ifdef CPUINFO_SUPPORTED +#include +#endif + #include "core/common/common.h" #include "core/common/logging/logging.h" #include "core/platform/scoped_resource.h" @@ -84,7 +91,7 @@ static void UnmapFile(void* param) noexcept { std::unique_ptr p(reinterpret_cast(param)); int ret = munmap(p->addr, p->len); if (ret != 0) { - auto[err_no, err_msg] = GetSystemError(); + auto [err_no, err_msg] = GetSystemError(); LOGS_DEFAULT(ERROR) << "munmap failed. error code: " << err_no << " error msg: " << err_msg; } } @@ -94,7 +101,7 @@ struct FileDescriptorTraits { static Handle GetInvalidHandleValue() { return -1; } static void CleanUp(Handle h) { if (close(h) == -1) { - auto[err_no, err_msg] = GetSystemError(); + auto [err_no, err_msg] = GetSystemError(); LOGS_DEFAULT(ERROR) << "Failed to close file descriptor " << h << " - error code: " << err_no << " error msg: " << err_msg; } } @@ -121,7 +128,7 @@ int nftw_remove( int /*typeflag*/, struct FTW* /*ftwbuf*/) { const auto result = remove(fpath); if (result != 0) { - auto[err_no, err_msg] = GetSystemError(); + auto [err_no, err_msg] = GetSystemError(); LOGS_DEFAULT(WARNING) << "remove() failed. 
Error code: " << err_no << " error msg: " << err_msg << ", path: " << fpath; } @@ -135,7 +142,6 @@ struct Freer { using MallocdStringPtr = std::unique_ptr >; - class PosixThread : public EnvThread { private: struct Param { @@ -143,16 +149,16 @@ class PosixThread : public EnvThread { int index; unsigned (*start_address)(int id, Eigen::ThreadPoolInterface* param); Eigen::ThreadPoolInterface* param; - std::optional affinity_mask; + std::optional affinity; Param(const ORTCHAR_T* name_prefix1, int index1, unsigned (*start_address1)(int id, Eigen::ThreadPoolInterface* param), - Eigen::ThreadPoolInterface* param1) - : name_prefix(name_prefix1), - index(index1), - start_address(start_address1), - param(param1) {} + Eigen::ThreadPoolInterface* param1) + : name_prefix(name_prefix1), + index(index1), + start_address(start_address1), + param(param1) {} }; public: @@ -166,7 +172,7 @@ class PosixThread : public EnvThread { auto param_ptr = std::make_unique(name_prefix, index, start_address, param); if (gsl::narrow(index) < thread_options.affinity.size()) { - param_ptr->affinity_mask = thread_options.affinity[index]; + param_ptr->affinity = thread_options.affinity[index]; } if (custom_create_thread_fn) { @@ -220,16 +226,19 @@ class PosixThread : public EnvThread { std::unique_ptr p(static_cast(param)); ORT_TRY { #if !defined(__APPLE__) && !defined(__ANDROID__) && !defined(__wasm__) && !defined(_AIX) - if (p->affinity_mask.has_value()) { + if (p->affinity.has_value() && !p->affinity->empty()) { cpu_set_t cpuset; CPU_ZERO(&cpuset); - CPU_SET(*p->affinity_mask, &cpuset); + for(auto id : *p->affinity) { + CPU_SET(id, &cpuset); + } // pthread_setaffinity_np() does not set errno, it returns it. 
auto ret = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); if (ret != 0) { auto [err_no, err_msg] = GetSystemError(ret); - LOGS_DEFAULT(ERROR) << "pthread_setaffinity_np failed for thread: " << pthread_self() - << ", mask: " << *p->affinity_mask + LOGS_DEFAULT(ERROR) << "pthread_setaffinity_np failed for thread: " << syscall(SYS_gettid) + << ", index: " << p->index + << ", mask: " << *p->affinity << ", error code: " << err_no << " error msg: " << err_msg << ". Specify the number of threads explicitly so the affinity is not set."; } @@ -262,16 +271,51 @@ class PosixEnv : public Env { return new PosixThread(name_prefix, index, start_address, param, thread_options); } - int GetNumCpuCores() const override { - // TODO if you need the number of physical cores you'll need to parse - // /proc/cpuinfo and grep for "cpu cores". - // However, that information is not always available(output of 'grep -i core /proc/cpuinfo' is empty) - return std::thread::hardware_concurrency(); + // Fallback guess for the physical core count: half of hardware_concurrency() (two logical processors per core), never less than 1. + static int DefaultNumCores() { + return std::max(1, static_cast(std::thread::hardware_concurrency() / 2)); } - std::vector GetThreadAffinityMasks() const override { - std::vector ret(std::thread::hardware_concurrency() / 2); - std::iota(ret.begin(), ret.end(), 0); + // Returns the physical core count reported by cpuinfo when it is available, otherwise the DefaultNumCores() guess. + int GetNumPhysicalCpuCores() const override { +#ifdef CPUINFO_SUPPORTED + if(cpuinfo_available_) { + return gsl::narrow(cpuinfo_get_cores_count()); + } +#endif + // cpuinfo is not compiled in or is not available: fall back to the guess + return DefaultNumCores(); + } + + std::vector GetThreadAffinityMasks() const override { + + std::vector ret; +#ifdef CPUINFO_SUPPORTED + if (cpuinfo_available_) { +#if !defined(__APPLE__) && !defined(__ANDROID__) && !defined(__wasm__) && !defined(_AIX) + // We currently do not implement affinity on more than 64 cores. 
+ auto num_phys_cores = cpuinfo_get_cores_count(); + ret.reserve(num_phys_cores); + for (uint32_t i = 0; i < num_phys_cores; ++i) { + const auto* core = cpuinfo_get_core(i); + LogicalProcessors th_aff; + // Processor count will never exceed 64 in a given group. + // TBD: Processor groups are currently not taken into account. + th_aff.reserve(core->processor_count); + auto log_proc_idx = core->processor_start; + for (uint32_t count = 0; count < core->processor_count; count++, ++log_proc_idx) { + const auto* log_proc = cpuinfo_get_processor(log_proc_idx); + th_aff.push_back(log_proc->linux_id); + } + ret.push_back(std::move(th_aff)); + } +#endif + } +#endif // CPUINFO_SUPPORTED + // No per-core processor ids were collected: return one empty entry per physical core so that only the thread-pool size is conveyed + if(ret.empty()) { + ret.resize(GetNumPhysicalCpuCores()); + } return ret; } @@ -407,7 +451,7 @@ class PosixEnv : public Env { } static common::Status ReportSystemError(const char* operation_name, const std::string& path) { - auto[err_no, err_msg] = GetSystemError(); + auto [err_no, err_msg] = GetSystemError(); std::ostringstream oss; oss << operation_name << " file \"" << path << "\" failed: " << err_msg; return common::Status(common::SYSTEM, err_no, oss.str()); @@ -537,8 +581,18 @@ class PosixEnv : public Env { } private: - PosixEnv() = default; + PosixEnv() { +#ifdef CPUINFO_SUPPORTED + cpuinfo_available_ = cpuinfo_initialize(); + if(!cpuinfo_available_) { + LOGS_DEFAULT(INFO) << "cpuinfo_initialize failed";  // NOTE(review): presumably requires the default logger to be registered already - confirm this holds when the Env is first constructed + } +#endif + } Telemetry telemetry_provider_; +#ifdef CPUINFO_SUPPORTED + bool cpuinfo_available_{false}; +#endif }; } // namespace diff --git a/onnxruntime/core/platform/windows/env.cc b/onnxruntime/core/platform/windows/env.cc index 23eb51f3b4..cac5157ad1 100644 --- a/onnxruntime/core/platform/windows/env.cc +++ b/onnxruntime/core/platform/windows/env.cc @@ -18,6 +18,7 @@ limitations under the License. #include +#include #include #include #include @@ -28,6 +29,7 @@ limitations under the License. 
#include #include "core/common/logging/logging.h" +#include "core/common/span_utils.h" #include "core/platform/env.h" #include "core/platform/scoped_resource.h" #include "core/platform/windows/telemetry.h" @@ -70,15 +72,15 @@ class WindowsThread : public EnvThread { int index; unsigned (*start_address)(int id, Eigen::ThreadPoolInterface* param); Eigen::ThreadPoolInterface* param; - std::optional affinity_mask; + std::optional affinity; Param(const ORTCHAR_T* name_prefix1, int index1, unsigned (*start_address1)(int id, Eigen::ThreadPoolInterface* param), - Eigen::ThreadPoolInterface* param1) - : name_prefix(name_prefix1), - index(index1), - start_address(start_address1), - param(param1) {} + Eigen::ThreadPoolInterface* param1) + : name_prefix(name_prefix1), + index(index1), + start_address(start_address1), + param(param1) {} }; public: @@ -92,7 +94,7 @@ class WindowsThread : public EnvThread { std::unique_ptr local_param = std::make_unique(name_prefix, index, start_address, param); if (gsl::narrow(index) < thread_options.affinity.size()) { - local_param->affinity_mask = thread_options.affinity[index]; + local_param->affinity = thread_options.affinity[index]; } if (custom_create_thread_fn) { @@ -159,12 +161,17 @@ class WindowsThread : public EnvThread { unsigned ret = 0; ORT_TRY { // TODO: should I try to use SetThreadSelectedCpuSets? 
- if (p->affinity_mask.has_value()) { - auto rc = SetThreadAffinityMask(GetCurrentThread(), *p->affinity_mask); + if (p->affinity.has_value() && !p->affinity->empty()) { + DWORD_PTR mask = 0; + for (auto id : *p->affinity) { + if (static_cast<size_t>(id) < sizeof(DWORD_PTR) * 8) mask |= DWORD_PTR{1} << id;  // skip ids that do not fit in the mask: shifting by a negative amount or by >= the bit width is undefined behavior + } + auto rc = SetThreadAffinityMask(GetCurrentThread(), mask); if (!rc) { const auto error_code = GetLastError(); LOGS_DEFAULT(ERROR) << "SetThreadAffinityMask failed for thread: " << GetCurrentThreadId() - << ", mask: " << *p->affinity_mask + << ", index: " << p->index + << ", mask: " << *p->affinity << ", error code: " << error_code << ", error msg: " << std::system_category().message(error_code) << ". Specify the number of threads explicitly so the affinity is not set."; @@ -212,53 +219,100 @@ class WindowsEnv : public Env { Sleep(static_cast(micros) / 1000); } - int GetNumCpuCores() const override { - SYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer[256]; - DWORD returnLength = sizeof(buffer); + struct LogicalProcessorInformation { + std::unique_ptr buffer_; + gsl::span logical_processors; + }; + + std::optional FetchLogicalProcessorInfo() const { + // We will fail the first time around. The docs say, the size of the structure + // is different on different versions and releases. + DWORD returnLength = 0; + if (GetLogicalProcessorInformation(NULL, &returnLength) == FALSE) { + if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) { + auto last_error = GetLastError(); + LOGS_DEFAULT(ERROR) << "GetLogicalProcessorInformation failed to obtain buffer length. 
error code: " + << last_error + << " error msg: " << std::system_category().message(last_error); + return {}; + } + } + + auto allocation = std::make_unique(returnLength); + SYSTEM_LOGICAL_PROCESSOR_INFORMATION* buffer = reinterpret_cast(allocation.get()); if (GetLogicalProcessorInformation(buffer, &returnLength) == FALSE) { - // try GetSystemInfo - SYSTEM_INFO sysInfo; - GetSystemInfo(&sysInfo); - if (sysInfo.dwNumberOfProcessors <= 0) { - ORT_THROW("Fatal error: 0 count processors from GetSystemInfo"); - } - // This is the number of logical processors in the current group - return sysInfo.dwNumberOfProcessors; + auto last_error = GetLastError(); + LOGS_DEFAULT(ERROR) << "GetLogicalProcessorInformation failed to retrieve SYSTEM_LOGICAL_PROCESSOR_INFORMATION. error code: " + << last_error + << " error msg: " << std::system_category().message(last_error); + return {}; } - int processorCoreCount = 0; - int count = (int)(returnLength / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION)); - for (int i = 0; i != count; ++i) { - if (buffer[i].Relationship == RelationProcessorCore) { - ++processorCoreCount; - } - } - if (!processorCoreCount) - ORT_THROW("Fatal error: 0 count processors from GetLogicalProcessorInformation"); - return processorCoreCount; + + const size_t count = gsl::narrow(returnLength) / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); + std::optional result; + result = {std::move(allocation), gsl::make_span(buffer, count)}; + return result; } - std::vector GetThreadAffinityMasks() const override { - auto generate_vector_of_n = [](int n) { - std::vector ret(n); - std::iota(ret.begin(), ret.end(), 0); - return ret; - }; - // Indeed 64 should be enough. However, it's harmless to have a little more. 
- SYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer[256]; - DWORD returnLength = sizeof(buffer); - if (GetLogicalProcessorInformation(buffer, &returnLength) == FALSE) { - return generate_vector_of_n(std::thread::hardware_concurrency()); + static int DefaultNumCores() { + return std::max(1, static_cast(std::thread::hardware_concurrency() / 2)); + } + + int GetNumPhysicalCpuCores() const override { + auto logical_processor_info = FetchLogicalProcessorInfo(); + if (!logical_processor_info.has_value()) { + return DefaultNumCores(); } - std::vector ret; - int count = (int)(returnLength / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION)); - for (int i = 0; i != count; ++i) { - if (buffer[i].Relationship == RelationProcessorCore) { - ret.push_back(buffer[i].ProcessorMask); + + int phys_cores = 0; + for (const auto& processor_info : logical_processor_info->logical_processors) { + if (processor_info.Relationship == RelationProcessorCore) { + phys_cores++; } } - if (ret.empty()) { - return generate_vector_of_n(std::thread::hardware_concurrency()); + + phys_cores = std::max(1, phys_cores); + + return phys_cores; + } + + std::vector GetThreadAffinityMasks() const override { + std::vector ret; + + auto logical_processor_info = FetchLogicalProcessorInfo(); + if (!logical_processor_info.has_value()) { + ret.resize(DefaultNumCores()); + return ret; } + + // Expand a processor bit mask into the list of its set bit positions + auto mask_to_vector = [](uint64_t mask) { + LogicalProcessors aff; + int bit = 0; + while (mask != 0) { + if ((mask & 0x1) != 0) { + aff.push_back(bit); + } + mask >>= 0x1; + ++bit; + } + return aff; + }; + + for (const auto& processor_info : logical_processor_info->logical_processors) { + if (processor_info.Relationship == RelationProcessorCore) { + // A single core can host multiple logical processors + // so the mask returned can have more than one bit set. + // We allow threads to run on any logical CPU within a given + // physical core. 
+ ret.push_back(mask_to_vector(processor_info.ProcessorMask)); + } + } + + if (ret.empty()) { + ret.resize(DefaultNumCores()); + } + return ret; } @@ -776,6 +830,8 @@ class WindowsEnv : public Env { } private: + WindowsEnv() = default; + typedef VOID(WINAPI* FnGetSystemTimePreciseAsFileTime)(LPFILETIME); WindowsTelemetry telemetry_provider_; }; diff --git a/onnxruntime/core/util/thread_utils.cc b/onnxruntime/core/util/thread_utils.cc index 94218ee7c9..c60d8cecd9 100644 --- a/onnxruntime/core/util/thread_utils.cc +++ b/onnxruntime/core/util/thread_utils.cc @@ -17,19 +17,31 @@ static std::unique_ptr CreateThreadPoolHelper(Env* env, OrtThreadPoolParams options) { if (options.thread_pool_size == 1) return nullptr; - std::vector cpu_list; ThreadOptions to; + if (options.affinity_vec_len != 0) { - to.affinity.assign(options.affinity_vec, options.affinity_vec + options.affinity_vec_len); + // Currently, the affinities are passed in as bit masks and they need to be converted to integers. + // When we create a public API, bit-masks must be done away with because of the following reasons: + // 1) integers have a limited number of bits + // 2) bit-masks of integers can only represent numbers 0-63, but on VMs the actual logical processor numbering + // may not start with zero for a given core and may be way beyond 63. + // 3) Customers would be forced to concoct bit-masks, which is far less convenient than simply passing an array of processor integers. NOTE(review): the conversion below wraps each incoming value as a single processor id and does not expand mask bits - confirm which interpretation is intended. 
+ to.affinity.reserve(options.affinity_vec_len); + std::transform(options.affinity_vec, options.affinity_vec + options.affinity_vec_len, std::back_inserter(to.affinity), + [](size_t affinity) { + return LogicalProcessors{static_cast(affinity)}; + }); } + if (options.thread_pool_size <= 0) { // default - cpu_list = Env::Default().GetThreadAffinityMasks(); + auto cpu_list = Env::Default().GetThreadAffinityMasks(); if (cpu_list.empty() || cpu_list.size() == 1) return nullptr; options.thread_pool_size = static_cast(cpu_list.size()); if (options.auto_set_affinity) to.affinity = cpu_list; } + to.set_denormal_as_zero = options.set_denormal_as_zero; // set custom thread management members diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index 6054838531..cac280cdb8 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -416,7 +416,7 @@ int real_main(int argc, char* argv[], Ort::Env& env) { } auto pos = token.find("|"); if (pos == std::string::npos || pos == 0 || pos == token.length()) { - ORT_THROW(R"(Use a '|' to separate the key and value for + ORT_THROW(R"(Use a '|' to separate the key and value for the run-time option you are trying to use.\n)"); } @@ -426,7 +426,7 @@ the run-time option you are trying to use.\n)"); if (key == "runtime") { std::set supported_runtime = {"CPU", "GPU_FP32", "GPU", "GPU_FLOAT16", "DSP", "AIP_FIXED_TF"}; if (supported_runtime.find(value) == supported_runtime.end()) { - ORT_THROW(R"(Wrong configuration value for the key 'runtime'. + ORT_THROW(R"(Wrong configuration value for the key 'runtime'. select from 'CPU', 'GPU_FP32', 'GPU', 'GPU_FLOAT16', 'DSP', 'AIP_FIXED_TF'. \n)"); } } else if (key == "priority") { @@ -434,7 +434,7 @@ select from 'CPU', 'GPU_FP32', 'GPU', 'GPU_FLOAT16', 'DSP', 'AIP_FIXED_TF'. 
\n)" } else if (key == "buffer_type") { std::set supported_buffer_type = {"TF8", "TF16", "UINT8", "FLOAT", "ITENSOR"}; if (supported_buffer_type.find(value) == supported_buffer_type.end()) { - ORT_THROW(R"(Wrong configuration value for the key 'buffer_type'. + ORT_THROW(R"(Wrong configuration value for the key 'buffer_type'. select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); } } else { diff --git a/onnxruntime/test/onnx/testenv.cc b/onnxruntime/test/onnx/testenv.cc index 6b9782a2a9..849cdd4427 100644 --- a/onnxruntime/test/onnx/testenv.cc +++ b/onnxruntime/test/onnx/testenv.cc @@ -18,7 +18,7 @@ static std::once_flag default_pool_init; PThreadPool TestEnv::GetDefaultThreadPool(onnxruntime::Env& env) { std::call_once(default_pool_init, [&env] { using namespace onnxruntime::concurrency; - int core_num = env.GetNumCpuCores(); + int core_num = env.GetNumPhysicalCpuCores(); onnxruntime::ThreadOptions t_opts; default_pool = std::make_unique(&env, t_opts, ORT_TSTR("onnx_runner_tp"), core_num, false); diff --git a/onnxruntime/test/perftest/performance_runner.cc b/onnxruntime/test/perftest/performance_runner.cc index e2b2ef7ca3..2039c65b53 100644 --- a/onnxruntime/test/perftest/performance_runner.cc +++ b/onnxruntime/test/perftest/performance_runner.cc @@ -42,7 +42,7 @@ static std::unique_ptr default_pool; static std::once_flag default_pool_init; Eigen::ThreadPoolInterface* GetDefaultThreadPool(const onnxruntime::Env& env) { std::call_once(default_pool_init, [&env] { - int core_num = env.GetNumCpuCores(); + int core_num = env.GetNumPhysicalCpuCores(); default_pool = std::make_unique(core_num); }); return default_pool.get(); diff --git a/onnxruntime/test/perftest/posix/utils.cc b/onnxruntime/test/perftest/posix/utils.cc index 931274fcab..9bf029d8df 100644 --- a/onnxruntime/test/perftest/posix/utils.cc +++ b/onnxruntime/test/perftest/posix/utils.cc @@ -37,7 +37,7 @@ class CPUUsage : public ICPUUsage { } else { clock_t proc_total_clock_diff = (time_sample.tms_stime 
- proc_sys_clock_start_) + (time_sample.tms_utime - proc_user_clock_start_); clock_t total_clock_diff = total_clock_now - total_clock_start_; - return static_cast(100.0 * proc_total_clock_diff / total_clock_diff / onnxruntime::Env::Default().GetNumCpuCores()); + return static_cast(100.0 * proc_total_clock_diff / total_clock_diff / onnxruntime::Env::Default().GetNumPhysicalCpuCores()); /* NOTE(review): the removed POSIX GetNumCpuCores() returned std::thread::hardware_concurrency() (logical CPUs); dividing by the physical core count instead may report more than 100% when several logical CPUs share a core - confirm the new divisor is intended */ } }