From b935524e22fdce64874df94ecfb549cc0c87fd7f Mon Sep 17 00:00:00 2001
From: Pranav Sharma <prs@microsoft.com>
Date: Thu, 15 Sep 2022 17:54:32 -0700
Subject: [PATCH] Revert reverse setup of allocators + create/register
 allocator in CPU EP only when needed. (#12954)

* Revert reverse setup of allocators + create/register allocator in CPU EP only when needed.
---
 .../core/framework/execution_providers.h      |  4 --
 onnxruntime/core/framework/session_state.cc   | 45 +++++++------------
 .../providers/cpu/cpu_execution_provider.cc   | 36 +++++++++++++--
 .../providers/cpu/cpu_execution_provider.h    | 21 ++-------
 .../providers/cpu/cpu_provider_factory.cc     |  2 +-
 onnxruntime/core/session/inference_session.cc | 15 ++-----
 .../internal_testing_tests.cc                 |  2 +-
 .../providers/xnnpack/xnnpack_basic_test.cc   | 16 +++----
 onnxruntime/test/util/default_providers.cc    |  7 ++-
 9 files changed, 72 insertions(+), 76 deletions(-)

diff --git a/onnxruntime/core/framework/execution_providers.h b/onnxruntime/core/framework/execution_providers.h
index 8a26944ef5..944a983b56 100644
--- a/onnxruntime/core/framework/execution_providers.h
+++ b/onnxruntime/core/framework/execution_providers.h
@@ -69,10 +69,6 @@ class ExecutionProviders {
 
   size_t NumProviders() const { return exec_providers_.size(); }
 
-  using iterator = typename std::vector<std::shared_ptr<IExecutionProvider>>::iterator;
-  iterator begin() noexcept { return exec_providers_.begin(); }
-  iterator end() noexcept { return exec_providers_.end(); }
-
   using const_iterator = typename std::vector<std::shared_ptr<IExecutionProvider>>::const_iterator;
   const_iterator begin() const noexcept { return exec_providers_.cbegin(); }
   const_iterator end() const noexcept { return exec_providers_.cend(); }
diff --git a/onnxruntime/core/framework/session_state.cc b/onnxruntime/core/framework/session_state.cc
index 15d15ef9a0..8c5b42ad9b 100644
--- a/onnxruntime/core/framework/session_state.cc
+++ b/onnxruntime/core/framework/session_state.cc
@@ -25,35 +25,22 @@ using namespace ::onnxruntime::common;
 namespace onnxruntime {
 
 void SessionState::SetupAllocators() {
-  // register allocators in reverse order. one per OrtMemType+OrtDevice combination.
-  // this order prefers allocators from the core EPs (CPU EP, CUDA EP, ROCM EP) and ensures the CUDA/ROCM EP's
-  // per-thread allocators will be preferred over the TensorRT/MIGraphX EP non-per-thread allocators.
-  // TODO: Refactor the EP/allocator relationship so that we can be explicit about which allocator is preferred and
-  //       avoid creating unnecessary allocators.
-  std::for_each(std::make_reverse_iterator(execution_providers_.end()),
-                std::make_reverse_iterator(execution_providers_.begin()),
-                [this](const auto& provider_iter) {
-                  IExecutionProvider& provider = *provider_iter;
-                  for (const auto& allocator : provider.GetAllocators()) {
-                    const OrtMemoryInfo& memory_info = allocator->Info();
-                    auto iter = allocators_.find(memory_info);
-                    if (iter != allocators_.end()) {
-                      // EPs could be sharing allocators so no info message unless this is a different instance.
-                      // This is an expected scenario as multiple EPs may have allocators for the same device.
-                      // e.g. xnnpack and CPU EP are both CPU based.
-                      if (iter->second(memory_info.id, memory_info.mem_type) != allocator) {
-                        LOGS(logger_, INFO) << "Allocator already registered for " << allocator->Info()
-                                            << ". Ignoring allocator from " << provider.Type();
-                      }
-                    } else {
-                      // slightly weird indirection to go back to the provider to get the allocator each time
-                      // it's needed in order to support scenarios such as the CUDA EP's per-thread allocator.
-                      allocators_[memory_info] = [&provider](int id, OrtMemType mem_type) {
-                        return provider.GetAllocator(id, mem_type);
-                      };
-                    }
-                  }
-                });
+  for (const auto& provider : execution_providers_) {
+    for (const auto& allocator : provider->GetAllocators()) {
+      const OrtMemoryInfo& memory_info = allocator->Info();
+      if (allocators_.find(memory_info) != allocators_.end()) {
+        // EPs are ordered by priority so ignore the duplicate allocator for this memory location.
+        LOGS(logger_, INFO) << "Allocator already registered for " << allocator->Info()
+                            << ". Ignoring allocator from " << provider->Type();
+      } else {
+        // slightly weird indirection to go back to the provider to get the allocator each time it's needed
+        // in order to support scenarios such as the CUDA EP's per-thread allocator.
+        allocators_[memory_info] = [&provider](int id, OrtMemType mem_type) {
+          return provider->GetAllocator(id, mem_type);
+        };
+      }
+    }
+  }
 }
 
 AllocatorPtr SessionState::GetAllocator(const OrtMemoryInfo& location) const noexcept {
diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
index c627c7bf3c..a395501883 100644
--- a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
+++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
@@ -24,13 +24,41 @@ struct KernelRegistryAndStatus {
 }  // namespace
 
 namespace onnxruntime {
+CPUExecutionProvider::CPUExecutionProvider(const CPUExecutionProviderInfo& info, bool delay_allocator_registration)
+    : IExecutionProvider{onnxruntime::kCpuExecutionProvider}, info_{info} {
+  if (!delay_allocator_registration) {
+    AllocatorManager mgr;  // needed only to call RegisterAllocator
+    RegisterAllocator(mgr);
+  }
+}
+
 void CPUExecutionProvider::RegisterAllocator(AllocatorManager& allocator_manager) {
   OrtDevice cpu_device{OrtDevice::CPU, OrtDevice::MemType::DEFAULT, DEFAULT_CPU_ALLOCATOR_DEVICE_ID};
-  auto cpu_alloc = allocator_manager.GetAllocator(OrtMemTypeDefault, cpu_device);
-
+  // if EP is used in multiple inference sessions we may already have an allocator. if so use that.
+  auto cpu_alloc = GetAllocator(cpu_device.Id(), OrtMemTypeDefault);
   if (!cpu_alloc) {
-    // share our allocator
-    allocator_manager.InsertAllocator(GetAllocator(cpu_device.Id(), OrtMemTypeDefault));
+    // use shared allocator if available
+    cpu_alloc = allocator_manager.GetAllocator(OrtMemTypeDefault, cpu_device);
+
+    if (!cpu_alloc) {
+      // create our allocator
+      bool create_arena = info_.create_arena;
+#if defined(USE_JEMALLOC) || defined(USE_MIMALLOC)
+      // JEMalloc/mimalloc already have memory pool, so just use device allocator.
+      create_arena = false;
+#elif !(defined(__amd64__) || defined(_M_AMD64) || defined(__aarch64__) || defined(_M_ARM64))
+      // Disable Arena allocator for x86_32 build because it may run into infinite loop when integer overflow happens
+      create_arena = false;
+#endif
+      AllocatorCreationInfo device_info{[](int) { return std::make_unique<CPUAllocator>(); },
+                                        DEFAULT_CPU_ALLOCATOR_DEVICE_ID, create_arena};
+
+      cpu_alloc = CreateAllocator(device_info);
+      // enable sharing of our allocator
+      allocator_manager.InsertAllocator(cpu_alloc);
+    }
+
+    InsertAllocator(cpu_alloc);
   }
 }
 
diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.h b/onnxruntime/core/providers/cpu/cpu_execution_provider.h
index 97b012f3aa..c0ef15ac80 100644
--- a/onnxruntime/core/providers/cpu/cpu_execution_provider.h
+++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.h
@@ -25,23 +25,9 @@ using FuseRuleFn = std::function<void(const onnxruntime::GraphViewer&,
 // Logical device representation.
 class CPUExecutionProvider : public IExecutionProvider {
  public:
-  explicit CPUExecutionProvider(const CPUExecutionProviderInfo& info)
-      : IExecutionProvider{onnxruntime::kCpuExecutionProvider} {
-    bool create_arena = info.create_arena;
-
-#if defined(USE_JEMALLOC) || defined(USE_MIMALLOC)
-    // JEMalloc/mimalloc already have memory pool, so just use device allocator.
-    create_arena = false;
-#elif !(defined(__amd64__) || defined(_M_AMD64) || defined(__aarch64__) || defined(_M_ARM64))
-    // Disable Arena allocator for x86_32 build because it may run into infinite loop when integer overflow happens
-    create_arena = false;
-#endif
-
-    AllocatorCreationInfo device_info{[](int) { return std::make_unique<CPUAllocator>(); },
-                                      DEFAULT_CPU_ALLOCATOR_DEVICE_ID, create_arena};
-
-    InsertAllocator(CreateAllocator(device_info));
-  }
+  // delay_allocator_registration = true is used to allow sharing of allocators between different providers that are
+  // associated with the same device
+  explicit CPUExecutionProvider(const CPUExecutionProviderInfo& info, bool delay_allocator_registration = false);
 
   void RegisterAllocator(AllocatorManager& allocator_manager) override;
 
@@ -49,6 +35,7 @@ class CPUExecutionProvider : public IExecutionProvider {
   std::unique_ptr<IDataTransfer> GetDataTransfer() const override;
 
  private:
+  CPUExecutionProviderInfo info_;
   std::vector<FuseRuleFn> fuse_rules_;
 };
 
diff --git a/onnxruntime/core/providers/cpu/cpu_provider_factory.cc b/onnxruntime/core/providers/cpu/cpu_provider_factory.cc
index 4b6ab05be0..a79992fdf1 100644
--- a/onnxruntime/core/providers/cpu/cpu_provider_factory.cc
+++ b/onnxruntime/core/providers/cpu/cpu_provider_factory.cc
@@ -24,7 +24,7 @@ struct CpuProviderFactory : IExecutionProviderFactory {
 std::unique_ptr<IExecutionProvider> CpuProviderFactory::CreateProvider() {
   CPUExecutionProviderInfo info;
   info.create_arena = create_arena_;
-  return std::make_unique<CPUExecutionProvider>(info);
+  return std::make_unique<CPUExecutionProvider>(info, true /* delay allocator registration to allow sharing */);
 }
 
 std::shared_ptr<IExecutionProviderFactory> CPUProviderFactoryCreator::Create(int use_arena) {
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index 722356732b..cbc96b0ad8 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -1231,7 +1231,7 @@ common::Status InferenceSession::Initialize() {
     if (!have_cpu_ep) {
       LOGS(*session_logger_, INFO) << "Adding default CPU execution provider.";
       CPUExecutionProviderInfo epi{session_options_.enable_cpu_mem_arena};
-      auto p_cpu_exec_provider = std::make_unique<CPUExecutionProvider>(epi);
+      auto p_cpu_exec_provider = std::make_unique<CPUExecutionProvider>(epi, true /* delay allocator registration to allow sharing */);
       ORT_RETURN_IF_ERROR_SESSIONID_(RegisterExecutionProvider(std::move(p_cpu_exec_provider)));
       execution_providers_.SetCpuProviderWasImplicitlyAdded(true);
     }
@@ -1248,18 +1248,11 @@ common::Status InferenceSession::Initialize() {
 
     // Ensure all registered EPs have created their allocators and shared them where possible.
     // Allocator creation may be delayed until IExecutionProvider::RegisterAllocator is called.
-    //
-    // We iterate EPs in reverse order as we are currently using this mechanism to share a CPU or CUDA
-    // allocator between CPU and XNNPACK, or CUDA and TensorRT. The memory config options for the CPU and CUDA EPs are
-    // more comprehensive so we prefer those, and need to call RegisterAllocator for those EPs first so their
-    // allocators are the ones that get shared.
     {
       AllocatorManager allocator_manager;
-      std::for_each(std::make_reverse_iterator(execution_providers_.end()),
-                    std::make_reverse_iterator(execution_providers_.begin()),
-                    [&allocator_manager](auto& iter) {
-                      iter->RegisterAllocator(allocator_manager);
-                    });
+      for (const auto& provider : execution_providers_) {
+        provider->RegisterAllocator(allocator_manager);
+      }
     }
 
     // At this time we know all the providers that will be part of this session.
diff --git a/onnxruntime/test/providers/internal_testing/internal_testing_tests.cc b/onnxruntime/test/providers/internal_testing/internal_testing_tests.cc
index 6944ee9e8c..a87932adf3 100644
--- a/onnxruntime/test/providers/internal_testing/internal_testing_tests.cc
+++ b/onnxruntime/test/providers/internal_testing/internal_testing_tests.cc
@@ -268,7 +268,7 @@ TEST(InternalTestingEP, TestRegisterAllocatorHandlesUsageInMultipleSessions) {
   std::vector<std::shared_ptr<IExecutionProvider>> eps{
       std::make_shared<InternalTestingExecutionProvider>(supported_ops, std::unordered_set<std::string>{},
                                                          DataLayout::NHWC),
-      std::make_shared<CPUExecutionProvider>(CPUExecutionProviderInfo{})};
+      std::make_shared<CPUExecutionProvider>(CPUExecutionProviderInfo{}, true /* delay allocator registration to allow sharing */)};
 
   // check RegisterAllocator is implemented properly and supports calls from multiple inference sessions
   init_session(eps, session1);
diff --git a/onnxruntime/test/providers/xnnpack/xnnpack_basic_test.cc b/onnxruntime/test/providers/xnnpack/xnnpack_basic_test.cc
index a7624ef696..81ec643681 100644
--- a/onnxruntime/test/providers/xnnpack/xnnpack_basic_test.cc
+++ b/onnxruntime/test/providers/xnnpack/xnnpack_basic_test.cc
@@ -108,7 +108,7 @@ TEST(XnnpackEP, TestAllocatorSharing) {
   // and use the same EP instances in both
   std::vector<std::shared_ptr<IExecutionProvider>> eps{
       std::make_shared<XnnpackExecutionProvider>(XnnpackExecutionProviderInfo{}),
-      std::make_shared<CPUExecutionProvider>(CPUExecutionProviderInfo{})};
+      std::make_shared<CPUExecutionProvider>(CPUExecutionProviderInfo{}, true /* delay allocator creation to allow sharing */)};
 
   // check RegisterAllocator is implemented properly and supports calls from multiple inference sessions
   init_session(eps, session1);
@@ -223,13 +223,13 @@ TEST(XnnpackEP, TestQDQConvU8U8) {
 
 TEST(XnnpackEP, TestQDQConvS8S8) {
   RunModelTest(BuildQDQConvTestCase<int8_t /* InputType */,
-                                     int8_t /* WeightType */,
-                                     int32_t /* BiasType */,
-                                     int8_t /* OutputType */>(
-                    {1, 1, 5, 5} /* input_shape */,
-                    {1, 1, 3, 3} /* weights_shape */),
-                "xnnpack_qdq_test_graph_conv_s8s8",
-                {ExpectedEPNodeAssignment::Some, 0.2f});
+                                    int8_t /* WeightType */,
+                                    int32_t /* BiasType */,
+                                    int8_t /* OutputType */>(
+                   {1, 1, 5, 5} /* input_shape */,
+                   {1, 1, 3, 3} /* weights_shape */),
+               "xnnpack_qdq_test_graph_conv_s8s8",
+               {ExpectedEPNodeAssignment::Some, 0.2f});
 
   const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "conv_qdq_s8s8.onnx";
   RunModelTestWithPath(ort_model_path, "xnnpack_qdq_test_graph_conv_s8s8", 0.7f);
diff --git a/onnxruntime/test/util/default_providers.cc b/onnxruntime/test/util/default_providers.cc
index 0b371c385b..cec60b7335 100644
--- a/onnxruntime/test/util/default_providers.cc
+++ b/onnxruntime/test/util/default_providers.cc
@@ -14,7 +14,12 @@ namespace onnxruntime {
 namespace test {
 
 std::unique_ptr<IExecutionProvider> DefaultCpuExecutionProvider(bool enable_arena) {
-  return CPUProviderFactoryCreator::Create(enable_arena)->CreateProvider();
+  auto ret = CPUProviderFactoryCreator::Create(enable_arena)->CreateProvider();
+  // The factory created CPU provider doesn't create/reg allocators; something that is expected by
+  // clients of DefaultCpuExecutionProvider; hence the need to call RegisterAllocator explicitly.
+  AllocatorManager mgr;  // needed only to call RegisterAllocator
+  ret->RegisterAllocator(mgr);
+  return ret;
 }
 
 std::unique_ptr<IExecutionProvider> DefaultTensorrtExecutionProvider() {