diff --git a/onnxruntime/core/platform/env.cc b/onnxruntime/core/platform/env.cc
index 608f3e7f15..854c9d5214 100644
--- a/onnxruntime/core/platform/env.cc
+++ b/onnxruntime/core/platform/env.cc
@@ -19,6 +19,25 @@ limitations under the License.
 
 namespace onnxruntime {
 
+// Streams a single affinity set as "{id, id, }": every id is followed by ", ".
+// Used when logging thread-affinity failures.
+std::ostream& operator<<(std::ostream& os, const LogicalProcessors& aff) {
+  os << "{";
+  for (const int id : aff) {
+    os << id << ", ";
+  }
+  return os << "}";
+}
+
+// Streams a list of affinity sets as "{" + each set (formatted as above) + "}", with no separator between sets.
+std::ostream& operator<<(std::ostream& os, gsl::span<const LogicalProcessors> affs) {
+  os << "{";
+  for (auto it = affs.begin(); it != affs.end(); ++it) {
+    os << *it;
+  }
+  return os << "}";
+}
+
 Env::Env() = default;
 
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/platform/env.h b/onnxruntime/core/platform/env.h
index 42a9bf7a72..c2fb51a575 100644
--- a/onnxruntime/core/platform/env.h
+++ b/onnxruntime/core/platform/env.h
@@ -16,6 +16,7 @@ limitations under the License.
 
 #pragma once
 
+#include <iosfwd>
 #include <functional>
 #include <memory>
 #include <string>
@@ -58,19 +59,30 @@ class EnvThread {
   OrtCustomThreadHandle custom_thread_handle = nullptr;
 };
 
+/// Type that holds a collection of logical processor IDs used for setting thread affinities.
+using LogicalProcessors = std::vector<int>;
+
 // Parameters that are required to create a set of threads for a thread pool
 struct ThreadOptions {
   // Stack size for a new thread. If it is 0, the operating system uses the same value as the stack that's specified for
   // the main thread, which is usually set in the main executable(not controlled by onnxruntime.dll).
   unsigned int stack_size = 0;
 
-  // Thread affinity means a thread can only run on the logical processors that the thread is allowed to run on.
-  // If the vector is not empty, set the affinity of each thread to just one CPU.
-  // Index is thread index, value is CPU ID, starting from zero. For example, the first thread in the pool will be bound
-  // to the logical processor with id of affinity[0]. If the vector is empty, the thread can run on all the processors
-  // its process can run on. NOTE: When hyperthreading is enabled, for example, on a 4 cores 8 physical threads CPU,
-  // processor group [0,1,2,3] may only contain half of the physical cores.
-  std::vector<size_t> affinity;
+  // Thread affinity means a thread can only run on the logical processor(s) that the thread is allowed to run on.
+  // If the vector is not empty, each thread's affinity is set to the logical CPU ids in its LogicalProcessors entry.
+  // For example, the first thread in the pool will be bound to the logical processors contained in affinity[0].
+  // If the vector is empty, the thread can run on all the processors its process can run on.
+  // NOTE: When hyperthreading is enabled, for example, a 4-core CPU would expose 8 logical processors, so
+  // processor group [0,1,2,3] may only occupy some of the physical cores. There might be more than 2 logical
+  // processors per physical core on a given computer. Physical cores assigned to a given VM may contain
+  // logical processor indices that do not start with 0 and possibly go beyond the number of bits in an integer.
+  //
+  // If the size of the thread pool (TP) is not specified, ORT creates thread pools with a number of threads equal
+  // to the number of visible physical cores. Each thread's affinity is set to all of the logical processors
+  // that are contained in the physical core with the same index as the thread. ORT does not set any affinity
+  // for the thread that is considered main (the thread that initiates the creation of the TP).
+  // The process that owns that thread may consider setting its affinity.
+  std::vector<LogicalProcessors> affinity;
 
   // Set or unset denormal as zero.
   bool set_denormal_as_zero = false;
@@ -80,6 +92,10 @@ struct ThreadOptions {
   OrtCustomJoinThreadFn custom_join_thread_fn = nullptr;
   int dynamic_block_base_ = 0;
 };
+
+std::ostream& operator<<(std::ostream& os, const LogicalProcessors&);
+std::ostream& operator<<(std::ostream& os, gsl::span<const LogicalProcessors>);
+
 /// \brief An interface used by the onnxruntime implementation to
 /// access operating system functionality like the filesystem etc.
 ///
@@ -117,10 +133,14 @@ class Env {
   /// The result of Default() belongs to this library and must never be deleted.
   static Env& Default();
 
-  virtual int GetNumCpuCores() const = 0;
+  /// <summary>
+  /// The API returns the number of different physical cores on the system
+  /// </summary>
+  /// <returns>Number of physical cores</returns>
+  virtual int GetNumPhysicalCpuCores() const = 0;
 
-  // This function doesn't support systems with more than 64 logical processors
-  virtual std::vector<size_t> GetThreadAffinityMasks() const = 0;
+  // This function currently doesn't support systems with more than 64 logical processors on Windows
+  virtual std::vector<LogicalProcessors> GetThreadAffinityMasks() const = 0;
 
   /// \brief Returns the number of micro-seconds since the Unix epoch.
   virtual uint64_t NowMicros() const {
diff --git a/onnxruntime/core/platform/posix/env.cc b/onnxruntime/core/platform/posix/env.cc
index bcdd748f40..1e55ed1dce 100644
--- a/onnxruntime/core/platform/posix/env.cc
+++ b/onnxruntime/core/platform/posix/env.cc
@@ -16,25 +16,32 @@ limitations under the License.
 
 #include "core/platform/env.h"
 
-#include <unistd.h>
+
+#include <assert.h>
+#include <dlfcn.h>
+#include <fcntl.h>
+#include <ftw.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <stdio.h>
-#include <fcntl.h>
 #include <stdlib.h>
-#include <sys/mman.h>
-#include <fcntl.h>
-#include <dlfcn.h>
-#include <ftw.h>
-#include <optional>
 #include <string.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include <iostream>
+#include <optional>
 #include <thread>
 #include <utility>  // for std::forward
 #include <vector>
-#include <assert.h>
 
 #include <gsl/gsl>
 
+#ifdef CPUINFO_SUPPORTED
+#include <cpuinfo.h>
+#endif
+
 #include "core/common/common.h"
 #include "core/common/logging/logging.h"
 #include "core/platform/scoped_resource.h"
@@ -84,7 +91,7 @@ static void UnmapFile(void* param) noexcept {
   std::unique_ptr<UnmapFileParam> p(reinterpret_cast<UnmapFileParam*>(param));
   int ret = munmap(p->addr, p->len);
   if (ret != 0) {
-    auto[err_no, err_msg] = GetSystemError();
+    auto [err_no, err_msg] = GetSystemError();
     LOGS_DEFAULT(ERROR) << "munmap failed. error code: " << err_no << " error msg: " << err_msg;
   }
 }
@@ -94,7 +101,7 @@ struct FileDescriptorTraits {
   static Handle GetInvalidHandleValue() { return -1; }
   static void CleanUp(Handle h) {
     if (close(h) == -1) {
-      auto[err_no, err_msg] = GetSystemError();
+      auto [err_no, err_msg] = GetSystemError();
       LOGS_DEFAULT(ERROR) << "Failed to close file descriptor " << h << " - error code: " << err_no << " error msg: " << err_msg;
     }
   }
@@ -121,7 +128,7 @@ int nftw_remove(
     int /*typeflag*/, struct FTW* /*ftwbuf*/) {
   const auto result = remove(fpath);
   if (result != 0) {
-    auto[err_no, err_msg] = GetSystemError();
+    auto [err_no, err_msg] = GetSystemError();
     LOGS_DEFAULT(WARNING) << "remove() failed. Error code: " << err_no << " error msg: " << err_msg
                           << ", path: " << fpath;
   }
@@ -135,7 +142,6 @@ struct Freer {
 
 using MallocdStringPtr = std::unique_ptr<char, Freer<char> >;
 
-
 class PosixThread : public EnvThread {
  private:
   struct Param {
@@ -143,16 +149,16 @@ class PosixThread : public EnvThread {
     int index;
     unsigned (*start_address)(int id, Eigen::ThreadPoolInterface* param);
     Eigen::ThreadPoolInterface* param;
-    std::optional<size_t> affinity_mask;
+    std::optional<LogicalProcessors> affinity;
 
     Param(const ORTCHAR_T* name_prefix1,
           int index1,
           unsigned (*start_address1)(int id, Eigen::ThreadPoolInterface* param),
-          Eigen::ThreadPoolInterface* param1) 
-      : name_prefix(name_prefix1),
-      index(index1),
-      start_address(start_address1), 
-      param(param1) {}
+          Eigen::ThreadPoolInterface* param1)
+        : name_prefix(name_prefix1),
+          index(index1),
+          start_address(start_address1),
+          param(param1) {}
   };
 
  public:
@@ -166,7 +172,7 @@ class PosixThread : public EnvThread {
 
     auto param_ptr = std::make_unique<Param>(name_prefix, index, start_address, param);
     if (gsl::narrow<size_t>(index) < thread_options.affinity.size()) {
-      param_ptr->affinity_mask = thread_options.affinity[index];
+      param_ptr->affinity = thread_options.affinity[index];
     }
 
     if (custom_create_thread_fn) {
@@ -220,16 +226,19 @@ class PosixThread : public EnvThread {
     std::unique_ptr<Param> p(static_cast<Param*>(param));
     ORT_TRY {
 #if !defined(__APPLE__) && !defined(__ANDROID__) && !defined(__wasm__) && !defined(_AIX)
-      if (p->affinity_mask.has_value()) {
+      if (p->affinity.has_value() && !p->affinity->empty()) {
         cpu_set_t cpuset;
         CPU_ZERO(&cpuset);
-        CPU_SET(*p->affinity_mask, &cpuset);
+        for(auto id : *p->affinity) {
+          CPU_SET(id, &cpuset);
+        }
         // pthread_setaffinity_np() does not set errno, it returns it.
         auto ret = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
         if (ret != 0) {
           auto [err_no, err_msg] = GetSystemError(ret);
-          LOGS_DEFAULT(ERROR) << "pthread_setaffinity_np failed for thread: " << pthread_self()
-                              << ", mask: " << *p->affinity_mask
+          LOGS_DEFAULT(ERROR) << "pthread_setaffinity_np failed for thread: " << syscall(SYS_gettid)
+                              << ", index: " << p->index
+                              << ", mask: " << *p->affinity
                               << ", error code: " << err_no << " error msg: " << err_msg
                               << ". Specify the number of threads explicitly so the affinity is not set.";
         }
@@ -262,16 +271,51 @@ class PosixEnv : public Env {
     return new PosixThread(name_prefix, index, start_address, param, thread_options);
   }
 
-  int GetNumCpuCores() const override {
-    // TODO if you need the number of physical cores you'll need to parse
-    // /proc/cpuinfo and grep for "cpu cores".
-    // However, that information is not always available(output of 'grep -i core /proc/cpuinfo' is empty)
-    return std::thread::hardware_concurrency();
+  // We guess the number of physical cores assuming the common hyperthreading case of 2 logical processors per core.
+  static int DefaultNumCores() {
+    return std::max(1, static_cast<int>(std::thread::hardware_concurrency() / 2));
   }
 
-  std::vector<size_t> GetThreadAffinityMasks() const override {
-    std::vector<size_t> ret(std::thread::hardware_concurrency() / 2);
-    std::iota(ret.begin(), ret.end(), 0);
+  // Return the number of physical cores
+  int GetNumPhysicalCpuCores() const override {
+#ifdef CPUINFO_SUPPORTED
+    if(cpuinfo_available_) {
+      return gsl::narrow<int>(cpuinfo_get_cores_count());
+    }
+#endif
+    // We guess the number of cores
+    return DefaultNumCores();
+  }
+
+  std::vector<LogicalProcessors> GetThreadAffinityMasks() const override {
+
+    std::vector<LogicalProcessors> ret;
+#ifdef CPUINFO_SUPPORTED
+    if (cpuinfo_available_) {
+#if !defined(__APPLE__) && !defined(__ANDROID__) && !defined(__wasm__) && !defined(_AIX)
+      // We currently do not implement affinity on more than 64 cores.
+      auto num_phys_cores = cpuinfo_get_cores_count();
+      ret.reserve(num_phys_cores);
+      for (uint32_t i = 0; i < num_phys_cores; ++i) {
+        const auto* core = cpuinfo_get_core(i);
+        LogicalProcessors th_aff;
+        // Processor count will never exceed 64 in a given group.
+        // TBD: Processor groups are currently not taken into account.
+        th_aff.reserve(core->processor_count);
+        auto log_proc_idx = core->processor_start;
+        for (uint32_t count = 0; count < core->processor_count; count++, ++log_proc_idx) {
+          const auto* log_proc = cpuinfo_get_processor(log_proc_idx);
+          th_aff.push_back(log_proc->linux_id);
+        }
+        ret.push_back(std::move(th_aff));
+       }
+#endif
+    }
+#endif // CPUINFO_SUPPORTED
+    // No affinity information: return one empty entry per physical core, so only the thread-pool size is conveyed.
+    if(ret.empty()) {
+      ret.resize(GetNumPhysicalCpuCores());
+    }
     return ret;
   }
 
@@ -407,7 +451,7 @@ class PosixEnv : public Env {
   }
 
   static common::Status ReportSystemError(const char* operation_name, const std::string& path) {
-    auto[err_no, err_msg] = GetSystemError();
+    auto [err_no, err_msg] = GetSystemError();
     std::ostringstream oss;
     oss << operation_name << " file \"" << path << "\" failed: " << err_msg;
     return common::Status(common::SYSTEM, err_no, oss.str());
@@ -537,8 +581,18 @@ class PosixEnv : public Env {
   }
 
  private:
-  PosixEnv() = default;
+  PosixEnv()  {
+#ifdef CPUINFO_SUPPORTED
+    cpuinfo_available_ = cpuinfo_initialize();
+    if(!cpuinfo_available_) {
+      LOGS_DEFAULT(INFO) << "cpuinfo_initialize failed";
+    }
+#endif
+  }
   Telemetry telemetry_provider_;
+#ifdef CPUINFO_SUPPORTED
+  bool cpuinfo_available_{false};
+#endif
 };
 
 }  // namespace
diff --git a/onnxruntime/core/platform/windows/env.cc b/onnxruntime/core/platform/windows/env.cc
index 23eb51f3b4..cac5157ad1 100644
--- a/onnxruntime/core/platform/windows/env.cc
+++ b/onnxruntime/core/platform/windows/env.cc
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <Windows.h>
 
+#include <iostream>
 #include <fstream>
 #include <optional>
 #include <string>
@@ -28,6 +29,7 @@ limitations under the License.
 
 #include <gsl/gsl>
 #include "core/common/logging/logging.h"
+#include "core/common/span_utils.h"
 #include "core/platform/env.h"
 #include "core/platform/scoped_resource.h"
 #include "core/platform/windows/telemetry.h"
@@ -70,15 +72,15 @@ class WindowsThread : public EnvThread {
     int index;
     unsigned (*start_address)(int id, Eigen::ThreadPoolInterface* param);
     Eigen::ThreadPoolInterface* param;
-    std::optional<size_t> affinity_mask;
+    std::optional<LogicalProcessors> affinity;
     Param(const ORTCHAR_T* name_prefix1,
           int index1,
           unsigned (*start_address1)(int id, Eigen::ThreadPoolInterface* param),
-          Eigen::ThreadPoolInterface* param1) 
-      : name_prefix(name_prefix1),
-      index(index1),
-      start_address(start_address1),
-      param(param1) {}
+          Eigen::ThreadPoolInterface* param1)
+        : name_prefix(name_prefix1),
+          index(index1),
+          start_address(start_address1),
+          param(param1) {}
   };
 
  public:
@@ -92,7 +94,7 @@ class WindowsThread : public EnvThread {
 
     std::unique_ptr<Param> local_param = std::make_unique<Param>(name_prefix, index, start_address, param);
     if (gsl::narrow<size_t>(index) < thread_options.affinity.size()) {
-      local_param->affinity_mask = thread_options.affinity[index];
+      local_param->affinity = thread_options.affinity[index];
     }
 
     if (custom_create_thread_fn) {
@@ -159,12 +161,17 @@ class WindowsThread : public EnvThread {
     unsigned ret = 0;
     ORT_TRY {
       // TODO: should I try to use SetThreadSelectedCpuSets?
-      if (p->affinity_mask.has_value()) {
-        auto rc = SetThreadAffinityMask(GetCurrentThread(), *p->affinity_mask);
+      if (p->affinity.has_value() && !p->affinity->empty()) {
+        DWORD_PTR mask = 0;
+        for (auto id : *p->affinity) {
+          mask |= DWORD_PTR{1} << id;
+        }
+        auto rc = SetThreadAffinityMask(GetCurrentThread(), mask);
         if (!rc) {
           const auto error_code = GetLastError();
           LOGS_DEFAULT(ERROR) << "SetThreadAffinityMask failed for thread: " << GetCurrentThreadId()
-                              << ", mask: " << *p->affinity_mask
+                              << ", index: " << p->index
+                              << ", mask: " << *p->affinity
                               << ", error code: " << error_code
                               << ", error msg: " << std::system_category().message(error_code)
                               << ". Specify the number of threads explicitly so the affinity is not set.";
@@ -212,53 +219,100 @@ class WindowsEnv : public Env {
     Sleep(static_cast<DWORD>(micros) / 1000);
   }
 
-  int GetNumCpuCores() const override {
-    SYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer[256];
-    DWORD returnLength = sizeof(buffer);
+  struct LogicalProcessorInformation {
+    std::unique_ptr<char[]> buffer_;
+    gsl::span<const SYSTEM_LOGICAL_PROCESSOR_INFORMATION> logical_processors;
+  };
+
+  std::optional<LogicalProcessorInformation> FetchLogicalProcessorInfo() const {
+    // The first call is expected to fail with ERROR_INSUFFICIENT_BUFFER and report the required buffer length.
+    // The docs say the size of the structure differs between versions and releases.
+    DWORD returnLength = 0;
+    if (GetLogicalProcessorInformation(NULL, &returnLength) == FALSE) {
+      if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+        auto last_error = GetLastError();
+        LOGS_DEFAULT(ERROR) << "GetLogicalProcessorInformation failed to obtain buffer length. error code: "
+                            << last_error
+                            << " error msg: " << std::system_category().message(last_error);
+        return {};
+      }
+    }
+
+    auto allocation = std::make_unique<char[]>(returnLength);
+    SYSTEM_LOGICAL_PROCESSOR_INFORMATION* buffer = reinterpret_cast<SYSTEM_LOGICAL_PROCESSOR_INFORMATION*>(allocation.get());
     if (GetLogicalProcessorInformation(buffer, &returnLength) == FALSE) {
-      // try GetSystemInfo
-      SYSTEM_INFO sysInfo;
-      GetSystemInfo(&sysInfo);
-      if (sysInfo.dwNumberOfProcessors <= 0) {
-        ORT_THROW("Fatal error: 0 count processors from GetSystemInfo");
-      }
-      // This is the number of logical processors in the current group
-      return sysInfo.dwNumberOfProcessors;
+      auto last_error = GetLastError();
+      LOGS_DEFAULT(ERROR) << "GetLogicalProcessorInformation failed to retrieve SYSTEM_LOGICAL_PROCESSOR_INFORMATION. error code: "
+                          << last_error
+                          << " error msg: " << std::system_category().message(last_error);
+      return {};
     }
-    int processorCoreCount = 0;
-    int count = (int)(returnLength / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION));
-    for (int i = 0; i != count; ++i) {
-      if (buffer[i].Relationship == RelationProcessorCore) {
-        ++processorCoreCount;
-      }
-    }
-    if (!processorCoreCount)
-      ORT_THROW("Fatal error: 0 count processors from GetLogicalProcessorInformation");
-    return processorCoreCount;
+
+    const size_t count = gsl::narrow<size_t>(returnLength) / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
+    std::optional<LogicalProcessorInformation> result;
+    result = {std::move(allocation), gsl::make_span(buffer, count)};
+    return result;
   }
 
-  std::vector<size_t> GetThreadAffinityMasks() const override {
-    auto generate_vector_of_n = [](int n) {
-      std::vector<size_t> ret(n);
-      std::iota(ret.begin(), ret.end(), 0);
-      return ret;
-    };
-    // Indeed 64 should be enough. However, it's harmless to have a little more.
-    SYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer[256];
-    DWORD returnLength = sizeof(buffer);
-    if (GetLogicalProcessorInformation(buffer, &returnLength) == FALSE) {
-      return generate_vector_of_n(std::thread::hardware_concurrency());
+  static int DefaultNumCores() {
+    return std::max(1, static_cast<int>(std::thread::hardware_concurrency() / 2));
+  }
+
+  int GetNumPhysicalCpuCores() const override {
+    auto logical_processor_info = FetchLogicalProcessorInfo();
+    if (!logical_processor_info.has_value()) {
+      return DefaultNumCores();
     }
-    std::vector<size_t> ret;
-    int count = (int)(returnLength / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION));
-    for (int i = 0; i != count; ++i) {
-      if (buffer[i].Relationship == RelationProcessorCore) {
-        ret.push_back(buffer[i].ProcessorMask);
+
+    int phys_cores = 0;
+    for (const auto& processor_info : logical_processor_info->logical_processors) {
+      if (processor_info.Relationship == RelationProcessorCore) {
+        phys_cores++;
       }
     }
-    if (ret.empty()) {
-      return generate_vector_of_n(std::thread::hardware_concurrency());
+
+    phys_cores = std::max(1, phys_cores);
+
+    return phys_cores;
+  }
+
+  std::vector<LogicalProcessors> GetThreadAffinityMasks() const override {
+    std::vector<LogicalProcessors> ret;
+
+    auto logical_processor_info = FetchLogicalProcessorInfo();
+    if (!logical_processor_info.has_value()) {
+      ret.resize(DefaultNumCores());
+      return ret;
     }
+
+    // Convert mask to a vector of ints
+    auto mask_to_vector = [](uint64_t mask) {
+      LogicalProcessors aff;
+      int bit = 0;
+      while (mask != 0) {
+        if ((mask & 0x1) != 0) {
+          aff.push_back(bit);
+        }
+        mask >>= 0x1;
+        ++bit;
+      }
+      return aff;
+    };
+
+    for (const auto& processor_info : logical_processor_info->logical_processors) {
+      if (processor_info.Relationship == RelationProcessorCore) {
+        // A single core can host multiple logical processors,
+        // so the mask returned can have more than one bit set.
+        // We allow threads to be run on any logical CPU within a given
+        // physical core.
+        ret.push_back(mask_to_vector(processor_info.ProcessorMask));
+      }
+    }
+
+    if (ret.empty()) {
+      ret.resize(DefaultNumCores());
+    }
+
     return ret;
   }
 
@@ -776,6 +830,8 @@ class WindowsEnv : public Env {
   }
 
  private:
+  WindowsEnv() = default;
+
   typedef VOID(WINAPI* FnGetSystemTimePreciseAsFileTime)(LPFILETIME);
   WindowsTelemetry telemetry_provider_;
 };
diff --git a/onnxruntime/core/util/thread_utils.cc b/onnxruntime/core/util/thread_utils.cc
index 94218ee7c9..c60d8cecd9 100644
--- a/onnxruntime/core/util/thread_utils.cc
+++ b/onnxruntime/core/util/thread_utils.cc
@@ -17,19 +17,31 @@ static std::unique_ptr<ThreadPool>
 CreateThreadPoolHelper(Env* env, OrtThreadPoolParams options) {
   if (options.thread_pool_size == 1)
     return nullptr;
-  std::vector<size_t> cpu_list;
   ThreadOptions to;
+
   if (options.affinity_vec_len != 0) {
-    to.affinity.assign(options.affinity_vec, options.affinity_vec + options.affinity_vec_len);
+    // Currently, the affinities are passed in as bit masks and they need to be converted to integers.
+    // When we create a public API, bit-masks must be done away with for the following reasons:
+    // 1) Integers have a limited number of bits.
+    // 2) Bit-masks of integers can only represent numbers 0-63, but on VMs the actual logical processor numbering
+    //    may not start with zero for a given core and may be way beyond 63.
+    // 3) Customers would be forced to concoct bit-masks, which is far less convenient than an array of processor ids.
+    to.affinity.reserve(options.affinity_vec_len);
+    std::transform(options.affinity_vec, options.affinity_vec + options.affinity_vec_len, std::back_inserter(to.affinity),
+                   [](size_t affinity) {
+                     return LogicalProcessors{static_cast<int>(affinity)};
+                   });
   }
+
   if (options.thread_pool_size <= 0) {  // default
-    cpu_list = Env::Default().GetThreadAffinityMasks();
+    auto cpu_list = Env::Default().GetThreadAffinityMasks();
     if (cpu_list.empty() || cpu_list.size() == 1)
       return nullptr;
     options.thread_pool_size = static_cast<int>(cpu_list.size());
     if (options.auto_set_affinity)
       to.affinity = cpu_list;
   }
+
   to.set_denormal_as_zero = options.set_denormal_as_zero;
 
   // set custom thread management members
diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc
index 6054838531..cac280cdb8 100644
--- a/onnxruntime/test/onnx/main.cc
+++ b/onnxruntime/test/onnx/main.cc
@@ -416,7 +416,7 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
         }
         auto pos = token.find("|");
         if (pos == std::string::npos || pos == 0 || pos == token.length()) {
-          ORT_THROW(R"(Use a '|' to separate the key and value for 
+          ORT_THROW(R"(Use a '|' to separate the key and value for
 the run-time option you are trying to use.\n)");
         }
 
@@ -426,7 +426,7 @@ the run-time option you are trying to use.\n)");
         if (key == "runtime") {
           std::set<std::string> supported_runtime = {"CPU", "GPU_FP32", "GPU", "GPU_FLOAT16", "DSP", "AIP_FIXED_TF"};
           if (supported_runtime.find(value) == supported_runtime.end()) {
-            ORT_THROW(R"(Wrong configuration value for the key 'runtime'. 
+            ORT_THROW(R"(Wrong configuration value for the key 'runtime'.
 select from 'CPU', 'GPU_FP32', 'GPU', 'GPU_FLOAT16', 'DSP', 'AIP_FIXED_TF'. \n)");
           }
         } else if (key == "priority") {
@@ -434,7 +434,7 @@ select from 'CPU', 'GPU_FP32', 'GPU', 'GPU_FLOAT16', 'DSP', 'AIP_FIXED_TF'. \n)"
         } else if (key == "buffer_type") {
           std::set<std::string> supported_buffer_type = {"TF8", "TF16", "UINT8", "FLOAT", "ITENSOR"};
           if (supported_buffer_type.find(value) == supported_buffer_type.end()) {
-            ORT_THROW(R"(Wrong configuration value for the key 'buffer_type'. 
+            ORT_THROW(R"(Wrong configuration value for the key 'buffer_type'.
 select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)");
           }
         } else {
diff --git a/onnxruntime/test/onnx/testenv.cc b/onnxruntime/test/onnx/testenv.cc
index 6b9782a2a9..849cdd4427 100644
--- a/onnxruntime/test/onnx/testenv.cc
+++ b/onnxruntime/test/onnx/testenv.cc
@@ -18,7 +18,7 @@ static std::once_flag default_pool_init;
 PThreadPool TestEnv::GetDefaultThreadPool(onnxruntime::Env& env) {
   std::call_once(default_pool_init, [&env] {
     using namespace onnxruntime::concurrency;
-    int core_num = env.GetNumCpuCores();
+    int core_num = env.GetNumPhysicalCpuCores();
 
     onnxruntime::ThreadOptions t_opts;
     default_pool = std::make_unique<ThreadPool>(&env, t_opts, ORT_TSTR("onnx_runner_tp"), core_num, false);
diff --git a/onnxruntime/test/perftest/performance_runner.cc b/onnxruntime/test/perftest/performance_runner.cc
index e2b2ef7ca3..2039c65b53 100644
--- a/onnxruntime/test/perftest/performance_runner.cc
+++ b/onnxruntime/test/perftest/performance_runner.cc
@@ -42,7 +42,7 @@ static std::unique_ptr<DefaultThreadPoolType> default_pool;
 static std::once_flag default_pool_init;
 Eigen::ThreadPoolInterface* GetDefaultThreadPool(const onnxruntime::Env& env) {
   std::call_once(default_pool_init, [&env] {
-    int core_num = env.GetNumCpuCores();
+    int core_num = env.GetNumPhysicalCpuCores();
     default_pool = std::make_unique<DefaultThreadPoolType>(core_num);
   });
   return default_pool.get();
diff --git a/onnxruntime/test/perftest/posix/utils.cc b/onnxruntime/test/perftest/posix/utils.cc
index 931274fcab..9bf029d8df 100644
--- a/onnxruntime/test/perftest/posix/utils.cc
+++ b/onnxruntime/test/perftest/posix/utils.cc
@@ -37,7 +37,7 @@ class CPUUsage : public ICPUUsage {
     } else {
       clock_t proc_total_clock_diff = (time_sample.tms_stime - proc_sys_clock_start_) + (time_sample.tms_utime - proc_user_clock_start_);
       clock_t total_clock_diff = total_clock_now - total_clock_start_;
-      return static_cast<short>(100.0 * proc_total_clock_diff / total_clock_diff / onnxruntime::Env::Default().GetNumCpuCores());
+      return static_cast<short>(100.0 * proc_total_clock_diff / total_clock_diff / onnxruntime::Env::Default().GetNumPhysicalCpuCores());
     }
   }