From b935524e22fdce64874df94ecfb549cc0c87fd7f Mon Sep 17 00:00:00 2001 From: Pranav Sharma Date: Thu, 15 Sep 2022 17:54:32 -0700 Subject: [PATCH] Revert reverse setup of allocators + create/register allocator in CPU EP only when needed. (#12954) * Revert reverse setup of allocators + create/register allocator in CPU EP only when needed. --- .../core/framework/execution_providers.h | 4 -- onnxruntime/core/framework/session_state.cc | 45 +++++++------------ .../providers/cpu/cpu_execution_provider.cc | 36 +++++++++++++-- .../providers/cpu/cpu_execution_provider.h | 21 ++------- .../providers/cpu/cpu_provider_factory.cc | 2 +- onnxruntime/core/session/inference_session.cc | 15 ++----- .../internal_testing_tests.cc | 2 +- .../providers/xnnpack/xnnpack_basic_test.cc | 16 +++---- onnxruntime/test/util/default_providers.cc | 7 ++- 9 files changed, 72 insertions(+), 76 deletions(-) diff --git a/onnxruntime/core/framework/execution_providers.h b/onnxruntime/core/framework/execution_providers.h index 8a26944ef5..944a983b56 100644 --- a/onnxruntime/core/framework/execution_providers.h +++ b/onnxruntime/core/framework/execution_providers.h @@ -69,10 +69,6 @@ class ExecutionProviders { size_t NumProviders() const { return exec_providers_.size(); } - using iterator = typename std::vector>::iterator; - iterator begin() noexcept { return exec_providers_.begin(); } - iterator end() noexcept { return exec_providers_.end(); } - using const_iterator = typename std::vector>::const_iterator; const_iterator begin() const noexcept { return exec_providers_.cbegin(); } const_iterator end() const noexcept { return exec_providers_.cend(); } diff --git a/onnxruntime/core/framework/session_state.cc b/onnxruntime/core/framework/session_state.cc index 15d15ef9a0..8c5b42ad9b 100644 --- a/onnxruntime/core/framework/session_state.cc +++ b/onnxruntime/core/framework/session_state.cc @@ -25,35 +25,22 @@ using namespace ::onnxruntime::common; namespace onnxruntime { void SessionState::SetupAllocators() { - // register allocators in reverse order. one per OrtMemType+OrtDevice combination. - // this order prefers allocators from the core EPs (CPU EP, CUDA EP, ROCM EP) and ensures the CUDA/ROCM EP's - // per-thread allocators will be preferred over the TensorRT/MIGraphX EP non-per-thread allocators. - // TODO: Refactor the EP/allocator relationship so that we can be explicit about which allocator is preferred and - // avoid creating unnecessary allocators. - std::for_each(std::make_reverse_iterator(execution_providers_.end()), - std::make_reverse_iterator(execution_providers_.begin()), - [this](const auto& provider_iter) { - IExecutionProvider& provider = *provider_iter; - for (const auto& allocator : provider.GetAllocators()) { - const OrtMemoryInfo& memory_info = allocator->Info(); - auto iter = allocators_.find(memory_info); - if (iter != allocators_.end()) { - // EPs could be sharing allocators so no info message unless this is a different instance. - // This is an expected scenario as multiple EPs may have allocators for the same device. - // e.g. xnnpack and CPU EP are both CPU based. - if (iter->second(memory_info.id, memory_info.mem_type) != allocator) { - LOGS(logger_, INFO) << "Allocator already registered for " << allocator->Info() - << ". Ignoring allocator from " << provider.Type(); - } - } else { - // slightly weird indirection to go back to the provider to get the allocator each time - // it's needed in order to support scenarios such as the CUDA EP's per-thread allocator. - allocators_[memory_info] = [&provider](int id, OrtMemType mem_type) { - return provider.GetAllocator(id, mem_type); - }; - } - } - }); + for (const auto& provider : execution_providers_) { + for (const auto& allocator : provider->GetAllocators()) { + const OrtMemoryInfo& memory_info = allocator->Info(); + if (allocators_.find(memory_info) != allocators_.end()) { + // EPs are ordered by priority so ignore the duplicate allocator for this memory location. + LOGS(logger_, INFO) << "Allocator already registered for " << allocator->Info() + << ". Ignoring allocator from " << provider->Type(); + } else { + // slightly weird indirection to go back to the provider to get the allocator each time it's needed + // in order to support scenarios such as the CUDA EP's per-thread allocator. + allocators_[memory_info] = [&provider](int id, OrtMemType mem_type) { + return provider->GetAllocator(id, mem_type); + }; + } + } + } } AllocatorPtr SessionState::GetAllocator(const OrtMemoryInfo& location) const noexcept { diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc index c627c7bf3c..a395501883 100644 --- a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc +++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc @@ -24,13 +24,41 @@ struct KernelRegistryAndStatus { } // namespace namespace onnxruntime { +CPUExecutionProvider::CPUExecutionProvider(const CPUExecutionProviderInfo& info, bool delay_allocator_registration) + : IExecutionProvider{onnxruntime::kCpuExecutionProvider}, info_{info} { + if (!delay_allocator_registration) { + AllocatorManager mgr; // needed only to call RegisterAllocator + RegisterAllocator(mgr); + } +} + void CPUExecutionProvider::RegisterAllocator(AllocatorManager& allocator_manager) { OrtDevice cpu_device{OrtDevice::CPU, OrtDevice::MemType::DEFAULT, DEFAULT_CPU_ALLOCATOR_DEVICE_ID}; - auto cpu_alloc = allocator_manager.GetAllocator(OrtMemTypeDefault, cpu_device); - + // if EP is used in multiple inference sessions we may already have an allocator. if so use that. + auto cpu_alloc = GetAllocator(cpu_device.Id(), OrtMemTypeDefault); if (!cpu_alloc) { - // share our allocator - allocator_manager.InsertAllocator(GetAllocator(cpu_device.Id(), OrtMemTypeDefault)); + // use shared allocator if available + cpu_alloc = allocator_manager.GetAllocator(OrtMemTypeDefault, cpu_device); + + if (!cpu_alloc) { + // create our allocator + bool create_arena = info_.create_arena; +#if defined(USE_JEMALLOC) || defined(USE_MIMALLOC) + // JEMalloc/mimalloc already have memory pool, so just use device allocator. + create_arena = false; +#elif !(defined(__amd64__) || defined(_M_AMD64) || defined(__aarch64__) || defined(_M_ARM64)) + // Disable Arena allocator for x86_32 build because it may run into infinite loop when integer overflow happens + create_arena = false; +#endif + AllocatorCreationInfo device_info{[](int) { return std::make_unique(); }, + DEFAULT_CPU_ALLOCATOR_DEVICE_ID, create_arena}; + + cpu_alloc = CreateAllocator(device_info); + // enable sharing of our allocator + allocator_manager.InsertAllocator(cpu_alloc); + } + + InsertAllocator(cpu_alloc); } } diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.h b/onnxruntime/core/providers/cpu/cpu_execution_provider.h index 97b012f3aa..c0ef15ac80 100644 --- a/onnxruntime/core/providers/cpu/cpu_execution_provider.h +++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.h @@ -25,23 +25,9 @@ using FuseRuleFn = std::function(); }, - DEFAULT_CPU_ALLOCATOR_DEVICE_ID, create_arena}; - - InsertAllocator(CreateAllocator(device_info)); - } + // delay_allocator_registration = true is used to allow sharing of allocators between different providers that are + // associated with the same device + explicit CPUExecutionProvider(const CPUExecutionProviderInfo& info, bool delay_allocator_registration = false); void RegisterAllocator(AllocatorManager& allocator_manager) override; @@ -49,6 +35,7 @@ class CPUExecutionProvider : public IExecutionProvider { std::unique_ptr GetDataTransfer() const override; private: + CPUExecutionProviderInfo info_; std::vector fuse_rules_; }; diff --git a/onnxruntime/core/providers/cpu/cpu_provider_factory.cc b/onnxruntime/core/providers/cpu/cpu_provider_factory.cc index 4b6ab05be0..a79992fdf1 100644 --- a/onnxruntime/core/providers/cpu/cpu_provider_factory.cc +++ b/onnxruntime/core/providers/cpu/cpu_provider_factory.cc @@ -24,7 +24,7 @@ struct CpuProviderFactory : IExecutionProviderFactory { std::unique_ptr CpuProviderFactory::CreateProvider() { CPUExecutionProviderInfo info; info.create_arena = create_arena_; - return std::make_unique(info); + return std::make_unique(info, true /* delay allocator registration to allow sharing */); } std::shared_ptr CPUProviderFactoryCreator::Create(int use_arena) { diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index 722356732b..cbc96b0ad8 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -1231,7 +1231,7 @@ common::Status InferenceSession::Initialize() { if (!have_cpu_ep) { LOGS(*session_logger_, INFO) << "Adding default CPU execution provider."; CPUExecutionProviderInfo epi{session_options_.enable_cpu_mem_arena}; - auto p_cpu_exec_provider = std::make_unique(epi); + auto p_cpu_exec_provider = std::make_unique(epi, true /* delay allocator registration to allow sharing */); ORT_RETURN_IF_ERROR_SESSIONID_(RegisterExecutionProvider(std::move(p_cpu_exec_provider))); execution_providers_.SetCpuProviderWasImplicitlyAdded(true); } @@ -1248,18 +1248,11 @@ common::Status InferenceSession::Initialize() { // Ensure all registered EPs have created their allocators and shared them where possible. // Allocator creation may be delayed until IExecutionProvider::RegisterAllocator is called. - // - // We iterate EPs in reverse order as we are currently using this mechanism to share a CPU or CUDA - // allocator between CPU and XNNPACK, or CUDA and TensorRT. The memory config options for the CPU and CUDA EPs are - // more comprehensive so we prefer those, and need to call RegisterAllocator for those EPs first so their - // allocators are the ones that get shared. { AllocatorManager allocator_manager; - std::for_each(std::make_reverse_iterator(execution_providers_.end()), - std::make_reverse_iterator(execution_providers_.begin()), - [&allocator_manager](auto& iter) { - iter->RegisterAllocator(allocator_manager); - }); + for (const auto& provider : execution_providers_) { + provider->RegisterAllocator(allocator_manager); + } } // At this time we know all the providers that will be part of this session. diff --git a/onnxruntime/test/providers/internal_testing/internal_testing_tests.cc b/onnxruntime/test/providers/internal_testing/internal_testing_tests.cc index 6944ee9e8c..a87932adf3 100644 --- a/onnxruntime/test/providers/internal_testing/internal_testing_tests.cc +++ b/onnxruntime/test/providers/internal_testing/internal_testing_tests.cc @@ -268,7 +268,7 @@ TEST(InternalTestingEP, TestRegisterAllocatorHandlesUsageInMultipleSessions) { std::vector> eps{ std::make_shared(supported_ops, std::unordered_set{}, DataLayout::NHWC), - std::make_shared(CPUExecutionProviderInfo{})}; + std::make_shared(CPUExecutionProviderInfo{}, true /* delay allocator registration to allow sharing */)}; // check RegisterAllocator is implemented properly and supports calls from multiple inference sessions init_session(eps, session1); diff --git a/onnxruntime/test/providers/xnnpack/xnnpack_basic_test.cc b/onnxruntime/test/providers/xnnpack/xnnpack_basic_test.cc index a7624ef696..81ec643681 100644 --- a/onnxruntime/test/providers/xnnpack/xnnpack_basic_test.cc +++ b/onnxruntime/test/providers/xnnpack/xnnpack_basic_test.cc @@ -108,7 +108,7 @@ TEST(XnnpackEP, TestAllocatorSharing) { // and use the same EP instances in both std::vector> eps{ std::make_shared(XnnpackExecutionProviderInfo{}), - std::make_shared(CPUExecutionProviderInfo{})}; + std::make_shared(CPUExecutionProviderInfo{}, true /* delay allocator creation to allow sharing */)}; // check RegisterAllocator is implemented properly and supports calls from multiple inference sessions init_session(eps, session1); @@ -223,13 +223,13 @@ TEST(XnnpackEP, TestQDQConvU8U8) { TEST(XnnpackEP, TestQDQConvS8S8) { RunModelTest(BuildQDQConvTestCase( - {1, 1, 5, 5} /* input_shape */, - {1, 1, 3, 3} /* weights_shape */), - "xnnpack_qdq_test_graph_conv_s8s8", - {ExpectedEPNodeAssignment::Some, 0.2f}); + int8_t /* WeightType */, + int32_t /* BiasType */, + int8_t /* OutputType */>( + {1, 1, 5, 5} /* input_shape */, + {1, 1, 3, 3} /* weights_shape */), + "xnnpack_qdq_test_graph_conv_s8s8", + {ExpectedEPNodeAssignment::Some, 0.2f}); const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "conv_qdq_s8s8.onnx"; RunModelTestWithPath(ort_model_path, "xnnpack_qdq_test_graph_conv_s8s8", 0.7f); diff --git a/onnxruntime/test/util/default_providers.cc b/onnxruntime/test/util/default_providers.cc index 0b371c385b..cec60b7335 100644 --- a/onnxruntime/test/util/default_providers.cc +++ b/onnxruntime/test/util/default_providers.cc @@ -14,7 +14,12 @@ namespace onnxruntime { namespace test { std::unique_ptr DefaultCpuExecutionProvider(bool enable_arena) { - return CPUProviderFactoryCreator::Create(enable_arena)->CreateProvider(); + auto ret = CPUProviderFactoryCreator::Create(enable_arena)->CreateProvider(); + // The factory created CPU provider doesn't create/reg allocators; something that is expected by + // clients of DefaultCpuExecutionProvider; hence the need to call RegisterAllocator explicitly. + AllocatorManager mgr; // needed only to call RegisterAllocator + ret->RegisterAllocator(mgr); + return ret; } std::unique_ptr DefaultTensorrtExecutionProvider() {