Improve logging and default affinity mask generation (#13338)

### Description Fix logging for affinity failures on Linux. Make `GetNumCpuCores()` (renamed to `GetNumPhysicalCpuCores()`) consistently return the number of physical cores. Use the `CpuInfo` library to correctly set affinities on Linux where supported. Make Windows generate affinity masks as ordinals and convert them to masks at the setting site. Allow a thread's affinity to contain multiple logical processors. We continue to set all logical processors of a physical core as the thread affinity for that core. ### Motivation and Context Error logging on Linux uses `pthread_self()`, which does not return the thread ID. Fix default affinity mask generation on Windows. The following are the issues with Windows: - `GetThreadAffinityMasks()` returns bitmasks, but on other platforms it returns ordinals generated for the hardware concurrency - Supporting the maximum number of processors requires a 64-bit mask, but the `size_t` type used is not always 64 bits wide - The masks returned per physical core may have multiple bits set, because the mask applies to several logical cores hosted by the physical core. In the past, customers complained that their threads jump from one core to another, which adversely affects performance. The decision was made to keep this behavior. - 64-bit masks do not allow for logical processors with IDs that are outside the 0-63 range.
2026-06-28 03:20:58 +00:00 · 2022-10-26 13:30:27 -07:00 · 2022-10-26 13:30:27 -07:00 · 1c8a22ec68
commit 1c8a22ec68
parent 136e15bfaf
9 changed files with 257 additions and 101 deletions
--- a/onnxruntime/core/platform/env.cc
+++ b/onnxruntime/core/platform/env.cc
@ -19,6 +19,20 @@ limitations under the License.

 namespace onnxruntime {

+/// Streams a set of logical processor ids as a brace-enclosed, comma-separated list, e.g. "{0, 1}".
+/// An empty set is written as "{}". The separator is written only between ids, never after the last one.
+std::ostream& operator<<(std::ostream& os, const LogicalProcessors& aff) {
+  os << "{";
+  // Empty before the first id, ", " before every following one.
+  const char* separator = "";
+  for (const int id : aff) {
+    os << separator << id;
+    separator = ", ";
+  }
+  return os << "}";
+}
+
+/// Streams a sequence of per-thread affinity sets as a brace-enclosed list.
+/// Each set is written with the LogicalProcessors overload of operator<<, and consecutive sets are
+/// separated by ", " so the boundary between two sets stays visible in log output.
+std::ostream& operator<<(std::ostream& os, gsl::span<const LogicalProcessors> affs) {
+  os << "{";
+  // Empty before the first set, ", " before every following one.
+  const char* separator = "";
+  for (const auto& aff : affs) {
+    os << separator << aff;
+    separator = ", ";
+  }
+  return os << "}";
+}
+
 Env::Env() = default;

 }  // namespace onnxruntime
--- a/onnxruntime/core/platform/env.h
+++ b/onnxruntime/core/platform/env.h
@ -16,6 +16,7 @@ limitations under the License.

 #pragma once

+#include <iosfwd>
 #include <functional>
 #include <memory>
 #include <string>
@ -58,19 +59,30 @@ class EnvThread {
  OrtCustomThreadHandle custom_thread_handle = nullptr;
 };

+/// Type that holds a collection of logical processors IDs used for setting affinities.
+using LogicalProcessors = std::vector<int>;
+
 // Parameters that are required to create a set of threads for a thread pool
 struct ThreadOptions {
  // Stack size for a new thread. If it is 0, the operating system uses the same value as the stack that's specified for
  // the main thread, which is usually set in the main executable(not controlled by onnxruntime.dll).
  unsigned int stack_size = 0;

-  // Thread affinity means a thread can only run on the logical processors that the thread is allowed to run on.
-  // If the vector is not empty, set the affinity of each thread to just one CPU.
-  // Index is thread index, value is CPU ID, starting from zero. For example, the first thread in the pool will be bound
-  // to the logical processor with id of affinity[0]. If the vector is empty, the thread can run on all the processors
-  // its process can run on. NOTE: When hyperthreading is enabled, for example, on a 4 cores 8 physical threads CPU,
-  // processor group [0,1,2,3] may only contain half of the physical cores.
-  std::vector<size_t> affinity;
+  // Thread affinity means a thread can only run on the logical processor(s) that the thread is allowed to run on.
+  // If the vector is not empty, then set the affinity of each thread to logical cpus ids within the LogicalProcessors.
+  // For example, the first thread in the pool will be bound to the logical processors contained in affinity[0].
+  // If the vector is empty, the thread can run on all the processors its process can run on. 
+  // NOTE: When hyperthreading is enabled, for example, on 4 physical cores we would have 8 logical processors,
+  // processor group [0,1,2,3] may only occupy some of the physical cores. There might be more than 2 logical
+  // processors per physical core on a given computer. Physical cores assigned to a given VM may contain
+  // logical processor indices that do not start with 0 and possibly go beyond the number of bits in an integer.
+  // 
+  // If the size of the TP is not specified, ORT creates thread pools with a number of threads that are equal
+  // to the number of visible physical cores. The threads affinities are set to all of the logical processors
+  // that are contained in a given physical core with the same index as the thread. ORT does not set any affinity
+  // to the thread that is considered main (the thread that initiates the creation of the TP).
+  // The process that owns the thread may consider setting its affinity.
+  std::vector<LogicalProcessors> affinity;

  // Set or unset denormal as zero.
  bool set_denormal_as_zero = false;
@ -80,6 +92,10 @@ struct ThreadOptions {
  OrtCustomJoinThreadFn custom_join_thread_fn = nullptr;
  int dynamic_block_base_ = 0;
 };
+
+std::ostream& operator<<(std::ostream& os, const LogicalProcessors&);
+std::ostream& operator<<(std::ostream& os, gsl::span<const LogicalProcessors>);
+
 /// \brief An interface used by the onnxruntime implementation to
 /// access operating system functionality like the filesystem etc.
 ///
@ -117,10 +133,14 @@ class Env {
  /// The result of Default() belongs to this library and must never be deleted.
  static Env& Default();

-  virtual int GetNumCpuCores() const = 0;
+  /// <summary>
+  /// The API returns the number of different physical cores on the system
+  /// </summary>
+  /// <returns>Number of physical cores</returns>
+  virtual int GetNumPhysicalCpuCores() const = 0;

-  // This function doesn't support systems with more than 64 logical processors
-  virtual std::vector<size_t> GetThreadAffinityMasks() const = 0;
+  // This function currently doesn't support systems with more than 64 logical processors on Windows
+  virtual std::vector<LogicalProcessors> GetThreadAffinityMasks() const = 0;

  /// \brief Returns the number of micro-seconds since the Unix epoch.
  virtual uint64_t NowMicros() const {
--- a/onnxruntime/core/platform/posix/env.cc
+++ b/onnxruntime/core/platform/posix/env.cc
@ -16,25 +16,32 @@ limitations under the License.

 #include "core/platform/env.h"

-#include <unistd.h>
+
+#include <assert.h>
+#include <dlfcn.h>
+#include <fcntl.h>
+#include <ftw.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <stdio.h>
-#include <fcntl.h>
 #include <stdlib.h>
-#include <sys/mman.h>
-#include <fcntl.h>
-#include <dlfcn.h>
-#include <ftw.h>
-#include <optional>
 #include <string.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include <iostream>
+#include <optional>
 #include <thread>
 #include <utility>  // for std::forward
 #include <vector>
-#include <assert.h>

 #include <gsl/gsl>

+#ifdef CPUINFO_SUPPORTED
+#include <cpuinfo.h>
+#endif
+
 #include "core/common/common.h"
 #include "core/common/logging/logging.h"
 #include "core/platform/scoped_resource.h"
@ -84,7 +91,7 @@ static void UnmapFile(void* param) noexcept {
  std::unique_ptr<UnmapFileParam> p(reinterpret_cast<UnmapFileParam*>(param));
  int ret = munmap(p->addr, p->len);
  if (ret != 0) {
-    auto[err_no, err_msg] = GetSystemError();
+    auto [err_no, err_msg] = GetSystemError();
    LOGS_DEFAULT(ERROR) << "munmap failed. error code: " << err_no << " error msg: " << err_msg;
  }
 }
@ -94,7 +101,7 @@ struct FileDescriptorTraits {
  static Handle GetInvalidHandleValue() { return -1; }
  static void CleanUp(Handle h) {
    if (close(h) == -1) {
-      auto[err_no, err_msg] = GetSystemError();
+      auto [err_no, err_msg] = GetSystemError();
      LOGS_DEFAULT(ERROR) << "Failed to close file descriptor " << h << " - error code: " << err_no << " error msg: " << err_msg;
    }
  }
@ -121,7 +128,7 @@ int nftw_remove(
    int /*typeflag*/, struct FTW* /*ftwbuf*/) {
  const auto result = remove(fpath);
  if (result != 0) {
-    auto[err_no, err_msg] = GetSystemError();
+    auto [err_no, err_msg] = GetSystemError();
    LOGS_DEFAULT(WARNING) << "remove() failed. Error code: " << err_no << " error msg: " << err_msg
                          << ", path: " << fpath;
  }
@ -135,7 +142,6 @@ struct Freer {

 using MallocdStringPtr = std::unique_ptr<char, Freer<char> >;

-
 class PosixThread : public EnvThread {
 private:
  struct Param {
@ -143,16 +149,16 @@ class PosixThread : public EnvThread {
    int index;
    unsigned (*start_address)(int id, Eigen::ThreadPoolInterface* param);
    Eigen::ThreadPoolInterface* param;
-    std::optional<size_t> affinity_mask;
+    std::optional<LogicalProcessors> affinity;

    Param(const ORTCHAR_T* name_prefix1,
          int index1,
          unsigned (*start_address1)(int id, Eigen::ThreadPoolInterface* param),
-          Eigen::ThreadPoolInterface* param1) 
-      : name_prefix(name_prefix1),
-      index(index1),
-      start_address(start_address1), 
-      param(param1) {}
+          Eigen::ThreadPoolInterface* param1)
+        : name_prefix(name_prefix1),
+          index(index1),
+          start_address(start_address1),
+          param(param1) {}
  };

 public:
@ -166,7 +172,7 @@ class PosixThread : public EnvThread {

    auto param_ptr = std::make_unique<Param>(name_prefix, index, start_address, param);
    if (gsl::narrow<size_t>(index) < thread_options.affinity.size()) {
-      param_ptr->affinity_mask = thread_options.affinity[index];
+      param_ptr->affinity = thread_options.affinity[index];
    }

    if (custom_create_thread_fn) {
@ -220,16 +226,19 @@ class PosixThread : public EnvThread {
    std::unique_ptr<Param> p(static_cast<Param*>(param));
    ORT_TRY {
 #if !defined(__APPLE__) && !defined(__ANDROID__) && !defined(__wasm__) && !defined(_AIX)
-      if (p->affinity_mask.has_value()) {
+      if (p->affinity.has_value() && !p->affinity->empty()) {
        cpu_set_t cpuset;
        CPU_ZERO(&cpuset);
-        CPU_SET(*p->affinity_mask, &cpuset);
+        for(auto id : *p->affinity) {
+          CPU_SET(id, &cpuset);
+        }
        // pthread_setaffinity_np() does not set errno, it returns it.
        auto ret = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
        if (ret != 0) {
          auto [err_no, err_msg] = GetSystemError(ret);
-          LOGS_DEFAULT(ERROR) << "pthread_setaffinity_np failed for thread: " << pthread_self()
-                              << ", mask: " << *p->affinity_mask
+          LOGS_DEFAULT(ERROR) << "pthread_setaffinity_np failed for thread: " << syscall(SYS_gettid)
+                              << ", index: " << p->index
+                              << ", mask: " << *p->affinity
                              << ", error code: " << err_no << " error msg: " << err_msg
                              << ". Specify the number of threads explicitly so the affinity is not set.";
        }
@ -262,16 +271,51 @@ class PosixEnv : public Env {
    return new PosixThread(name_prefix, index, start_address, param, thread_options);
  }

-  int GetNumCpuCores() const override {
-    // TODO if you need the number of physical cores you'll need to parse
-    // /proc/cpuinfo and grep for "cpu cores".
-    // However, that information is not always available(output of 'grep -i core /proc/cpuinfo' is empty)
-    return std::thread::hardware_concurrency();
+  // Fallback estimate of the physical core count. It assumes the common hyper-threading layout of
+  // two logical processors per physical core and never reports fewer than one core.
+  static int DefaultNumCores() {
+    const int half_of_logical = static_cast<int>(std::thread::hardware_concurrency() / 2);
+    return half_of_logical < 1 ? 1 : half_of_logical;
   }

-  std::vector<size_t> GetThreadAffinityMasks() const override {
-    std::vector<size_t> ret(std::thread::hardware_concurrency() / 2);
-    std::iota(ret.begin(), ret.end(), 0);
+  // Returns the number of physical cores. The count comes from cpuinfo when it was built in and
+  // initialized successfully; in every other case it is the DefaultNumCores() estimate.
+  int GetNumPhysicalCpuCores() const override {
+#ifdef CPUINFO_SUPPORTED
+    if (!cpuinfo_available_) {
+      return DefaultNumCores();
+    }
+    return gsl::narrow<int>(cpuinfo_get_cores_count());
+#else
+    return DefaultNumCores();
+#endif
+  }
+
+  // Builds the default per-thread affinity list: one LogicalProcessors entry per physical core.
+  // When cpuinfo is available (and the platform is not excluded below), each entry holds the Linux ids
+  // of the logical processors cpuinfo reports for that core. Otherwise the result holds one empty entry
+  // per core reported by GetNumPhysicalCpuCores(), which only conveys the size of the thread pool.
+  // NOTE(review): earlier comments here mentioned a 64-processor limit and processor groups; nothing in
+  // this loop enforces either - confirm whether that limitation applies on this platform.
+  std::vector<LogicalProcessors> GetThreadAffinityMasks() const override {
+    std::vector<LogicalProcessors> ret;
+#ifdef CPUINFO_SUPPORTED
+    if (cpuinfo_available_) {
+#if !defined(__APPLE__) && !defined(__ANDROID__) && !defined(__wasm__) && !defined(_AIX)
+      const auto core_count = cpuinfo_get_cores_count();
+      ret.reserve(core_count);
+      for (uint32_t core_idx = 0; core_idx < core_count; ++core_idx) {
+        const auto* core = cpuinfo_get_core(core_idx);
+        // Walk the index range [processor_start, processor_start + processor_count) of this core.
+        const auto first_proc = core->processor_start;
+        const auto end_proc = first_proc + core->processor_count;
+        LogicalProcessors core_affinity;
+        core_affinity.reserve(core->processor_count);
+        for (auto proc_idx = first_proc; proc_idx < end_proc; ++proc_idx) {
+          core_affinity.push_back(cpuinfo_get_processor(proc_idx)->linux_id);
+        }
+        ret.push_back(std::move(core_affinity));
+      }
+#endif
+    }
+#endif  // CPUINFO_SUPPORTED
+    // Nothing was collected above: fall back to empty entries, one per physical core.
+    if (ret.empty()) {
+      ret.resize(GetNumPhysicalCpuCores());
+    }
     return ret;
   }

@ -407,7 +451,7 @@ class PosixEnv : public Env {
  }

  static common::Status ReportSystemError(const char* operation_name, const std::string& path) {
-    auto[err_no, err_msg] = GetSystemError();
+    auto [err_no, err_msg] = GetSystemError();
    std::ostringstream oss;
    oss << operation_name << " file \"" << path << "\" failed: " << err_msg;
    return common::Status(common::SYSTEM, err_no, oss.str());
@ -537,8 +581,18 @@ class PosixEnv : public Env {
  }

 private:
-  PosixEnv() = default;
+  // Initializes cpuinfo when it is compiled in and records the outcome in cpuinfo_available_.
+  // A failed initialization is only logged; the members that check cpuinfo_available_ then take
+  // their non-cpuinfo paths.
+  PosixEnv() {
+#ifdef CPUINFO_SUPPORTED
+    const bool initialized = cpuinfo_initialize();
+    cpuinfo_available_ = initialized;
+    if (!initialized) {
+      LOGS_DEFAULT(INFO) << "cpuinfo_initialize failed";
+    }
+#endif
+  }
  Telemetry telemetry_provider_;
+#ifdef CPUINFO_SUPPORTED
+  bool cpuinfo_available_{false};
+#endif
 };

 }  // namespace
--- a/onnxruntime/core/platform/windows/env.cc
+++ b/onnxruntime/core/platform/windows/env.cc
@ -18,6 +18,7 @@ limitations under the License.

 #include <Windows.h>

+#include <iostream>
 #include <fstream>
 #include <optional>
 #include <string>
@ -28,6 +29,7 @@ limitations under the License.

 #include <gsl/gsl>
 #include "core/common/logging/logging.h"
+#include "core/common/span_utils.h"
 #include "core/platform/env.h"
 #include "core/platform/scoped_resource.h"
 #include "core/platform/windows/telemetry.h"
@ -70,15 +72,15 @@ class WindowsThread : public EnvThread {
    int index;
    unsigned (*start_address)(int id, Eigen::ThreadPoolInterface* param);
    Eigen::ThreadPoolInterface* param;
-    std::optional<size_t> affinity_mask;
+    std::optional<LogicalProcessors> affinity;
    Param(const ORTCHAR_T* name_prefix1,
          int index1,
          unsigned (*start_address1)(int id, Eigen::ThreadPoolInterface* param),
-          Eigen::ThreadPoolInterface* param1) 
-      : name_prefix(name_prefix1),
-      index(index1),
-      start_address(start_address1),
-      param(param1) {}
+          Eigen::ThreadPoolInterface* param1)
+        : name_prefix(name_prefix1),
+          index(index1),
+          start_address(start_address1),
+          param(param1) {}
  };

 public:
@ -92,7 +94,7 @@ class WindowsThread : public EnvThread {

    std::unique_ptr<Param> local_param = std::make_unique<Param>(name_prefix, index, start_address, param);
    if (gsl::narrow<size_t>(index) < thread_options.affinity.size()) {
-      local_param->affinity_mask = thread_options.affinity[index];
+      local_param->affinity = thread_options.affinity[index];
    }

    if (custom_create_thread_fn) {
@ -159,12 +161,17 @@ class WindowsThread : public EnvThread {
    unsigned ret = 0;
    ORT_TRY {
      // TODO: should I try to use SetThreadSelectedCpuSets?
-      if (p->affinity_mask.has_value()) {
-        auto rc = SetThreadAffinityMask(GetCurrentThread(), *p->affinity_mask);
+      if (p->affinity.has_value() && !p->affinity->empty()) {
+        DWORD_PTR mask = 0;
+        for (auto id : *p->affinity) {
+          mask |= DWORD_PTR{1} << id;
+        }
+        auto rc = SetThreadAffinityMask(GetCurrentThread(), mask);
        if (!rc) {
          const auto error_code = GetLastError();
          LOGS_DEFAULT(ERROR) << "SetThreadAffinityMask failed for thread: " << GetCurrentThreadId()
-                              << ", mask: " << *p->affinity_mask
+                              << ", index: " << p->index
+                              << ", mask: " << *p->affinity
                              << ", error code: " << error_code
                              << ", error msg: " << std::system_category().message(error_code)
                              << ". Specify the number of threads explicitly so the affinity is not set.";
@ -212,53 +219,100 @@ class WindowsEnv : public Env {
    Sleep(static_cast<DWORD>(micros) / 1000);
  }

-  int GetNumCpuCores() const override {
-    SYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer[256];
-    DWORD returnLength = sizeof(buffer);
+  // Result of a successful FetchLogicalProcessorInfo() call: the raw allocation plus a typed view of it.
+  struct LogicalProcessorInformation {
+    // Owns the bytes that GetLogicalProcessorInformation filled in.
+    std::unique_ptr<char[]> buffer_;
+    // Non-owning view over the records stored in buffer_; it must not be used after buffer_ is released.
+    gsl::span<const SYSTEM_LOGICAL_PROCESSOR_INFORMATION> logical_processors;
+  };
+
+  std::optional<LogicalProcessorInformation> FetchLogicalProcessorInfo() const {
+    // We will fail the first time around. The docs say, the size of the structure
+    // is different on different versions and releases.
+    DWORD returnLength = 0;
+    if (GetLogicalProcessorInformation(NULL, &returnLength) == FALSE) {
+      if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+        auto last_error = GetLastError();
+        LOGS_DEFAULT(ERROR) << "GetLogicalProcessorInformation failed to obtain buffer length. error code: "
+                            << last_error
+                            << " error msg: " << std::system_category().message(last_error);
+        return {};
+      }
+    }
+
+    auto allocation = std::make_unique<char[]>(returnLength);
+    SYSTEM_LOGICAL_PROCESSOR_INFORMATION* buffer = reinterpret_cast<SYSTEM_LOGICAL_PROCESSOR_INFORMATION*>(allocation.get());
    if (GetLogicalProcessorInformation(buffer, &returnLength) == FALSE) {
-      // try GetSystemInfo
-      SYSTEM_INFO sysInfo;
-      GetSystemInfo(&sysInfo);
-      if (sysInfo.dwNumberOfProcessors <= 0) {
-        ORT_THROW("Fatal error: 0 count processors from GetSystemInfo");
-      }
-      // This is the number of logical processors in the current group
-      return sysInfo.dwNumberOfProcessors;
+      auto last_error = GetLastError();
+      LOGS_DEFAULT(ERROR) << "GetLogicalProcessorInformation failed to retrieve SYSTEM_LOGICAL_PROCESSOR_INFORMATION. error code: "
+                          << last_error
+                          << " error msg: " << std::system_category().message(last_error);
+      return {};
    }
-    int processorCoreCount = 0;
-    int count = (int)(returnLength / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION));
-    for (int i = 0; i != count; ++i) {
-      if (buffer[i].Relationship == RelationProcessorCore) {
-        ++processorCoreCount;
-      }
-    }
-    if (!processorCoreCount)
-      ORT_THROW("Fatal error: 0 count processors from GetLogicalProcessorInformation");
-    return processorCoreCount;
+
+    const size_t count = gsl::narrow<size_t>(returnLength) / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
+    std::optional<LogicalProcessorInformation> result;
+    result = {std::move(allocation), gsl::make_span(buffer, count)};
+    return result;
  }

-  std::vector<size_t> GetThreadAffinityMasks() const override {
-    auto generate_vector_of_n = [](int n) {
-      std::vector<size_t> ret(n);
-      std::iota(ret.begin(), ret.end(), 0);
-      return ret;
-    };
-    // Indeed 64 should be enough. However, it's harmless to have a little more.
-    SYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer[256];
-    DWORD returnLength = sizeof(buffer);
-    if (GetLogicalProcessorInformation(buffer, &returnLength) == FALSE) {
-      return generate_vector_of_n(std::thread::hardware_concurrency());
+  // Fallback estimate of the physical core count: half of the reported hardware concurrency,
+  // but never fewer than one core.
+  static int DefaultNumCores() {
+    const int half_of_logical = static_cast<int>(std::thread::hardware_concurrency() / 2);
+    return half_of_logical < 1 ? 1 : half_of_logical;
+  }
+
+  int GetNumPhysicalCpuCores() const override {
+    auto logical_processor_info = FetchLogicalProcessorInfo();
+    if (!logical_processor_info.has_value()) {
+      return DefaultNumCores();
    }
-    std::vector<size_t> ret;
-    int count = (int)(returnLength / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION));
-    for (int i = 0; i != count; ++i) {
-      if (buffer[i].Relationship == RelationProcessorCore) {
-        ret.push_back(buffer[i].ProcessorMask);
+
+    int phys_cores = 0;
+    for (const auto& processor_info : logical_processor_info->logical_processors) {
+      if (processor_info.Relationship == RelationProcessorCore) {
+        phys_cores++;
      }
    }
-    if (ret.empty()) {
-      return generate_vector_of_n(std::thread::hardware_concurrency());
+
+    phys_cores = std::max(1, phys_cores);
+
+    return phys_cores;
+  }
+
+  std::vector<LogicalProcessors> GetThreadAffinityMasks() const override {
+    std::vector<LogicalProcessors> ret;
+
+    auto logical_processor_info = FetchLogicalProcessorInfo();
+    if (!logical_processor_info.has_value()) {
+      ret.resize(DefaultNumCores());
+      return ret;
    }
+
+    // Convert mask to a vector of ints
+    auto mask_to_vector = [](uint64_t mask) {
+      LogicalProcessors aff;
+      int bit = 0;
+      while (mask != 0) {
+        if ((mask & 0x1) != 0) {
+          aff.push_back(bit);
+        }
+        mask >>= 0x1;
+        ++bit;
+      }
+      return aff;
+    };
+
+    for (const auto& processor_info : logical_processor_info->logical_processors) {
+      if (processor_info.Relationship == RelationProcessorCore) {
+        // A single core can host multiple logical processors
+        // so the mask returned can have more than one bit set.
+        // We allow threads to be ran on any logical CPU within a given
+        // physical core.
+        ret.push_back(mask_to_vector(processor_info.ProcessorMask));
+      }
+    }
+
+    if (ret.empty()) {
+      ret.resize(DefaultNumCores());
+    }
+
    return ret;
  }

@ -776,6 +830,8 @@ class WindowsEnv : public Env {
  }

 private:
+  // Defaulted constructor declared in the private section, so it is not callable by outside code.
+  WindowsEnv() = default;
+
  typedef VOID(WINAPI* FnGetSystemTimePreciseAsFileTime)(LPFILETIME);
  WindowsTelemetry telemetry_provider_;
 };
--- a/onnxruntime/core/util/thread_utils.cc
+++ b/onnxruntime/core/util/thread_utils.cc
@ -17,19 +17,31 @@ static std::unique_ptr<ThreadPool>
 CreateThreadPoolHelper(Env* env, OrtThreadPoolParams options) {
  if (options.thread_pool_size == 1)
    return nullptr;
-  std::vector<size_t> cpu_list;
  ThreadOptions to;
+
  if (options.affinity_vec_len != 0) {
-    to.affinity.assign(options.affinity_vec, options.affinity_vec + options.affinity_vec_len);
+    // Currently, the affinities are passed in as bit masks and they need to be converted to integers.
+    // When we create a public API, bit masks must be done away with for the following reasons:
+    // 1) integers have a limited number of bits
+    // 2) bit masks of integers can only represent numbers 0-63, but on VMs the actual logical processor numbering
+    //    may not start with zero for a given core and may be way beyond 63.
+    // 3) Customers would be forced to concoct bit masks, which is far less convenient than simply an array of processor integers.
+    // NOTE(review): the transform below casts each incoming value to a single processor id; it does not expand
+    // the set bits of a mask. Confirm which interpretation of affinity_vec is intended.
+    to.affinity.reserve(options.affinity_vec_len);
+    std::transform(options.affinity_vec, options.affinity_vec + options.affinity_vec_len, std::back_inserter(to.affinity),
+                   [](size_t affinity) {
+                     return LogicalProcessors{static_cast<int>(affinity)};
+                   });
  }
+
  if (options.thread_pool_size <= 0) {  // default
-    cpu_list = Env::Default().GetThreadAffinityMasks();
+    auto cpu_list = Env::Default().GetThreadAffinityMasks();
    if (cpu_list.empty() || cpu_list.size() == 1)
      return nullptr;
    options.thread_pool_size = static_cast<int>(cpu_list.size());
    if (options.auto_set_affinity)
      to.affinity = cpu_list;
  }
+
  to.set_denormal_as_zero = options.set_denormal_as_zero;

  // set custom thread management members
--- a/onnxruntime/test/onnx/main.cc
+++ b/onnxruntime/test/onnx/main.cc
@ -416,7 +416,7 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
        }
        auto pos = token.find("|");
        if (pos == std::string::npos || pos == 0 || pos == token.length()) {
-          ORT_THROW(R"(Use a '|' to separate the key and value for 
+          ORT_THROW(R"(Use a '|' to separate the key and value for
 the run-time option you are trying to use.\n)");
        }

@ -426,7 +426,7 @@ the run-time option you are trying to use.\n)");
        if (key == "runtime") {
          std::set<std::string> supported_runtime = {"CPU", "GPU_FP32", "GPU", "GPU_FLOAT16", "DSP", "AIP_FIXED_TF"};
          if (supported_runtime.find(value) == supported_runtime.end()) {
-            ORT_THROW(R"(Wrong configuration value for the key 'runtime'. 
+            ORT_THROW(R"(Wrong configuration value for the key 'runtime'.
 select from 'CPU', 'GPU_FP32', 'GPU', 'GPU_FLOAT16', 'DSP', 'AIP_FIXED_TF'. \n)");
          }
        } else if (key == "priority") {
@ -434,7 +434,7 @@ select from 'CPU', 'GPU_FP32', 'GPU', 'GPU_FLOAT16', 'DSP', 'AIP_FIXED_TF'. \n)"
        } else if (key == "buffer_type") {
          std::set<std::string> supported_buffer_type = {"TF8", "TF16", "UINT8", "FLOAT", "ITENSOR"};
          if (supported_buffer_type.find(value) == supported_buffer_type.end()) {
-            ORT_THROW(R"(Wrong configuration value for the key 'buffer_type'. 
+            ORT_THROW(R"(Wrong configuration value for the key 'buffer_type'.
 select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)");
          }
        } else {
--- a/onnxruntime/test/onnx/testenv.cc
+++ b/onnxruntime/test/onnx/testenv.cc
@ -18,7 +18,7 @@ static std::once_flag default_pool_init;
 PThreadPool TestEnv::GetDefaultThreadPool(onnxruntime::Env& env) {
  std::call_once(default_pool_init, [&env] {
    using namespace onnxruntime::concurrency;
-    int core_num = env.GetNumCpuCores();
+    int core_num = env.GetNumPhysicalCpuCores();

    onnxruntime::ThreadOptions t_opts;
    default_pool = std::make_unique<ThreadPool>(&env, t_opts, ORT_TSTR("onnx_runner_tp"), core_num, false);
--- a/onnxruntime/test/perftest/performance_runner.cc
+++ b/onnxruntime/test/perftest/performance_runner.cc
@ -42,7 +42,7 @@ static std::unique_ptr<DefaultThreadPoolType> default_pool;
 static std::once_flag default_pool_init;
 Eigen::ThreadPoolInterface* GetDefaultThreadPool(const onnxruntime::Env& env) {
  std::call_once(default_pool_init, [&env] {
-    int core_num = env.GetNumCpuCores();
+    int core_num = env.GetNumPhysicalCpuCores();
    default_pool = std::make_unique<DefaultThreadPoolType>(core_num);
  });
  return default_pool.get();
--- a/onnxruntime/test/perftest/posix/utils.cc
+++ b/onnxruntime/test/perftest/posix/utils.cc
@ -37,7 +37,7 @@ class CPUUsage : public ICPUUsage {
    } else {
      clock_t proc_total_clock_diff = (time_sample.tms_stime - proc_sys_clock_start_) + (time_sample.tms_utime - proc_user_clock_start_);
      clock_t total_clock_diff = total_clock_now - total_clock_start_;
-      return static_cast<short>(100.0 * proc_total_clock_diff / total_clock_diff / onnxruntime::Env::Default().GetNumCpuCores());
+      return static_cast<short>(100.0 * proc_total_clock_diff / total_clock_diff / onnxruntime::Env::Default().GetNumPhysicalCpuCores());
    }
  }