mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-01 23:30:35 +00:00
Revert reverse setup of allocators + create/register allocator in CPU EP only when needed. (#12954)
* Revert reverse setup of allocators + create/register allocator in CPU EP only when needed.
This commit is contained in:
parent
94d9e9ad6d
commit
b935524e22
9 changed files with 72 additions and 76 deletions
|
|
@ -69,10 +69,6 @@ class ExecutionProviders {
|
|||
|
||||
size_t NumProviders() const { return exec_providers_.size(); }
|
||||
|
||||
using iterator = typename std::vector<std::shared_ptr<IExecutionProvider>>::iterator;
|
||||
iterator begin() noexcept { return exec_providers_.begin(); }
|
||||
iterator end() noexcept { return exec_providers_.end(); }
|
||||
|
||||
using const_iterator = typename std::vector<std::shared_ptr<IExecutionProvider>>::const_iterator;
|
||||
const_iterator begin() const noexcept { return exec_providers_.cbegin(); }
|
||||
const_iterator end() const noexcept { return exec_providers_.cend(); }
|
||||
|
|
|
|||
|
|
@ -25,35 +25,22 @@ using namespace ::onnxruntime::common;
|
|||
namespace onnxruntime {
|
||||
|
||||
void SessionState::SetupAllocators() {
|
||||
// register allocators in reverse order. one per OrtMemType+OrtDevice combination.
|
||||
// this order prefers allocators from the core EPs (CPU EP, CUDA EP, ROCM EP) and ensures the CUDA/ROCM EP's
|
||||
// per-thread allocators will be preferred over the TensorRT/MIGraphX EP non-per-thread allocators.
|
||||
// TODO: Refactor the EP/allocator relationship so that we can be explicit about which allocator is preferred and
|
||||
// avoid creating unnecessary allocators.
|
||||
std::for_each(std::make_reverse_iterator(execution_providers_.end()),
|
||||
std::make_reverse_iterator(execution_providers_.begin()),
|
||||
[this](const auto& provider_iter) {
|
||||
IExecutionProvider& provider = *provider_iter;
|
||||
for (const auto& allocator : provider.GetAllocators()) {
|
||||
const OrtMemoryInfo& memory_info = allocator->Info();
|
||||
auto iter = allocators_.find(memory_info);
|
||||
if (iter != allocators_.end()) {
|
||||
// EPs could be sharing allocators so no info message unless this is a different instance.
|
||||
// This is an expected scenario as multiple EPs may have allocators for the same device.
|
||||
// e.g. xnnpack and CPU EP are both CPU based.
|
||||
if (iter->second(memory_info.id, memory_info.mem_type) != allocator) {
|
||||
LOGS(logger_, INFO) << "Allocator already registered for " << allocator->Info()
|
||||
<< ". Ignoring allocator from " << provider.Type();
|
||||
}
|
||||
} else {
|
||||
// slightly weird indirection to go back to the provider to get the allocator each time
|
||||
// it's needed in order to support scenarios such as the CUDA EP's per-thread allocator.
|
||||
allocators_[memory_info] = [&provider](int id, OrtMemType mem_type) {
|
||||
return provider.GetAllocator(id, mem_type);
|
||||
};
|
||||
}
|
||||
}
|
||||
});
|
||||
for (const auto& provider : execution_providers_) {
|
||||
for (const auto& allocator : provider->GetAllocators()) {
|
||||
const OrtMemoryInfo& memory_info = allocator->Info();
|
||||
if (allocators_.find(memory_info) != allocators_.end()) {
|
||||
// EPs are ordered by priority so ignore the duplicate allocator for this memory location.
|
||||
LOGS(logger_, INFO) << "Allocator already registered for " << allocator->Info()
|
||||
<< ". Ignoring allocator from " << provider->Type();
|
||||
} else {
|
||||
// slightly weird indirection to go back to the provider to get the allocator each time it's needed
|
||||
// in order to support scenarios such as the CUDA EP's per-thread allocator.
|
||||
allocators_[memory_info] = [&provider](int id, OrtMemType mem_type) {
|
||||
return provider->GetAllocator(id, mem_type);
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
AllocatorPtr SessionState::GetAllocator(const OrtMemoryInfo& location) const noexcept {
|
||||
|
|
|
|||
|
|
@ -24,13 +24,41 @@ struct KernelRegistryAndStatus {
|
|||
} // namespace
|
||||
|
||||
namespace onnxruntime {
|
||||
CPUExecutionProvider::CPUExecutionProvider(const CPUExecutionProviderInfo& info, bool delay_allocator_registration)
|
||||
: IExecutionProvider{onnxruntime::kCpuExecutionProvider}, info_{info} {
|
||||
if (!delay_allocator_registration) {
|
||||
AllocatorManager mgr; // needed only to call RegisterAllocator
|
||||
RegisterAllocator(mgr);
|
||||
}
|
||||
}
|
||||
|
||||
void CPUExecutionProvider::RegisterAllocator(AllocatorManager& allocator_manager) {
|
||||
OrtDevice cpu_device{OrtDevice::CPU, OrtDevice::MemType::DEFAULT, DEFAULT_CPU_ALLOCATOR_DEVICE_ID};
|
||||
auto cpu_alloc = allocator_manager.GetAllocator(OrtMemTypeDefault, cpu_device);
|
||||
|
||||
// if EP is used in multiple inference sessions we may already have an allocator. if so use that.
|
||||
auto cpu_alloc = GetAllocator(cpu_device.Id(), OrtMemTypeDefault);
|
||||
if (!cpu_alloc) {
|
||||
// share our allocator
|
||||
allocator_manager.InsertAllocator(GetAllocator(cpu_device.Id(), OrtMemTypeDefault));
|
||||
// use shared allocator if available
|
||||
cpu_alloc = allocator_manager.GetAllocator(OrtMemTypeDefault, cpu_device);
|
||||
|
||||
if (!cpu_alloc) {
|
||||
// create our allocator
|
||||
bool create_arena = info_.create_arena;
|
||||
#if defined(USE_JEMALLOC) || defined(USE_MIMALLOC)
|
||||
// JEMalloc/mimalloc already have memory pool, so just use device allocator.
|
||||
create_arena = false;
|
||||
#elif !(defined(__amd64__) || defined(_M_AMD64) || defined(__aarch64__) || defined(_M_ARM64))
|
||||
// Disable Arena allocator for x86_32 build because it may run into infinite loop when integer overflow happens
|
||||
create_arena = false;
|
||||
#endif
|
||||
AllocatorCreationInfo device_info{[](int) { return std::make_unique<CPUAllocator>(); },
|
||||
DEFAULT_CPU_ALLOCATOR_DEVICE_ID, create_arena};
|
||||
|
||||
cpu_alloc = CreateAllocator(device_info);
|
||||
// enable sharing of our allocator
|
||||
allocator_manager.InsertAllocator(cpu_alloc);
|
||||
}
|
||||
|
||||
InsertAllocator(cpu_alloc);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -25,23 +25,9 @@ using FuseRuleFn = std::function<void(const onnxruntime::GraphViewer&,
|
|||
// Logical device representation.
|
||||
class CPUExecutionProvider : public IExecutionProvider {
|
||||
public:
|
||||
explicit CPUExecutionProvider(const CPUExecutionProviderInfo& info)
|
||||
: IExecutionProvider{onnxruntime::kCpuExecutionProvider} {
|
||||
bool create_arena = info.create_arena;
|
||||
|
||||
#if defined(USE_JEMALLOC) || defined(USE_MIMALLOC)
|
||||
// JEMalloc/mimalloc already have memory pool, so just use device allocator.
|
||||
create_arena = false;
|
||||
#elif !(defined(__amd64__) || defined(_M_AMD64) || defined(__aarch64__) || defined(_M_ARM64))
|
||||
// Disable Arena allocator for x86_32 build because it may run into infinite loop when integer overflow happens
|
||||
create_arena = false;
|
||||
#endif
|
||||
|
||||
AllocatorCreationInfo device_info{[](int) { return std::make_unique<CPUAllocator>(); },
|
||||
DEFAULT_CPU_ALLOCATOR_DEVICE_ID, create_arena};
|
||||
|
||||
InsertAllocator(CreateAllocator(device_info));
|
||||
}
|
||||
// delay_allocator_registration = true is used to allow sharing of allocators between different providers that are
|
||||
// associated with the same device
|
||||
explicit CPUExecutionProvider(const CPUExecutionProviderInfo& info, bool delay_allocator_registration = false);
|
||||
|
||||
void RegisterAllocator(AllocatorManager& allocator_manager) override;
|
||||
|
||||
|
|
@ -49,6 +35,7 @@ class CPUExecutionProvider : public IExecutionProvider {
|
|||
std::unique_ptr<IDataTransfer> GetDataTransfer() const override;
|
||||
|
||||
private:
|
||||
CPUExecutionProviderInfo info_;
|
||||
std::vector<FuseRuleFn> fuse_rules_;
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@ struct CpuProviderFactory : IExecutionProviderFactory {
|
|||
std::unique_ptr<IExecutionProvider> CpuProviderFactory::CreateProvider() {
|
||||
CPUExecutionProviderInfo info;
|
||||
info.create_arena = create_arena_;
|
||||
return std::make_unique<CPUExecutionProvider>(info);
|
||||
return std::make_unique<CPUExecutionProvider>(info, true /* delay allocator registration to allow sharing */);
|
||||
}
|
||||
|
||||
std::shared_ptr<IExecutionProviderFactory> CPUProviderFactoryCreator::Create(int use_arena) {
|
||||
|
|
|
|||
|
|
@ -1231,7 +1231,7 @@ common::Status InferenceSession::Initialize() {
|
|||
if (!have_cpu_ep) {
|
||||
LOGS(*session_logger_, INFO) << "Adding default CPU execution provider.";
|
||||
CPUExecutionProviderInfo epi{session_options_.enable_cpu_mem_arena};
|
||||
auto p_cpu_exec_provider = std::make_unique<CPUExecutionProvider>(epi);
|
||||
auto p_cpu_exec_provider = std::make_unique<CPUExecutionProvider>(epi, true /* delay allocator registration to allow sharing */);
|
||||
ORT_RETURN_IF_ERROR_SESSIONID_(RegisterExecutionProvider(std::move(p_cpu_exec_provider)));
|
||||
execution_providers_.SetCpuProviderWasImplicitlyAdded(true);
|
||||
}
|
||||
|
|
@ -1248,18 +1248,11 @@ common::Status InferenceSession::Initialize() {
|
|||
|
||||
// Ensure all registered EPs have created their allocators and shared them where possible.
|
||||
// Allocator creation may be delayed until IExecutionProvider::RegisterAllocator is called.
|
||||
//
|
||||
// We iterate EPs in reverse order as we are currently using this mechanism to share a CPU or CUDA
|
||||
// allocator between CPU and XNNPACK, or CUDA and TensorRT. The memory config options for the CPU and CUDA EPs are
|
||||
// more comprehensive so we prefer those, and need to call RegisterAllocator for those EPs first so their
|
||||
// allocators are the ones that get shared.
|
||||
{
|
||||
AllocatorManager allocator_manager;
|
||||
std::for_each(std::make_reverse_iterator(execution_providers_.end()),
|
||||
std::make_reverse_iterator(execution_providers_.begin()),
|
||||
[&allocator_manager](auto& iter) {
|
||||
iter->RegisterAllocator(allocator_manager);
|
||||
});
|
||||
for (const auto& provider : execution_providers_) {
|
||||
provider->RegisterAllocator(allocator_manager);
|
||||
}
|
||||
}
|
||||
|
||||
// At this time we know all the providers that will be part of this session.
|
||||
|
|
|
|||
|
|
@ -268,7 +268,7 @@ TEST(InternalTestingEP, TestRegisterAllocatorHandlesUsageInMultipleSessions) {
|
|||
std::vector<std::shared_ptr<IExecutionProvider>> eps{
|
||||
std::make_shared<InternalTestingExecutionProvider>(supported_ops, std::unordered_set<std::string>{},
|
||||
DataLayout::NHWC),
|
||||
std::make_shared<CPUExecutionProvider>(CPUExecutionProviderInfo{})};
|
||||
std::make_shared<CPUExecutionProvider>(CPUExecutionProviderInfo{}, true /* delay allocator registration to allow sharing */)};
|
||||
|
||||
// check RegisterAllocator is implemented properly and supports calls from multiple inference sessions
|
||||
init_session(eps, session1);
|
||||
|
|
|
|||
|
|
@ -108,7 +108,7 @@ TEST(XnnpackEP, TestAllocatorSharing) {
|
|||
// and use the same EP instances in both
|
||||
std::vector<std::shared_ptr<IExecutionProvider>> eps{
|
||||
std::make_shared<XnnpackExecutionProvider>(XnnpackExecutionProviderInfo{}),
|
||||
std::make_shared<CPUExecutionProvider>(CPUExecutionProviderInfo{})};
|
||||
std::make_shared<CPUExecutionProvider>(CPUExecutionProviderInfo{}, true /* delay allocator creation to allow sharing */)};
|
||||
|
||||
// check RegisterAllocator is implemented properly and supports calls from multiple inference sessions
|
||||
init_session(eps, session1);
|
||||
|
|
@ -223,13 +223,13 @@ TEST(XnnpackEP, TestQDQConvU8U8) {
|
|||
|
||||
TEST(XnnpackEP, TestQDQConvS8S8) {
|
||||
RunModelTest(BuildQDQConvTestCase<int8_t /* InputType */,
|
||||
int8_t /* WeightType */,
|
||||
int32_t /* BiasType */,
|
||||
int8_t /* OutputType */>(
|
||||
{1, 1, 5, 5} /* input_shape */,
|
||||
{1, 1, 3, 3} /* weights_shape */),
|
||||
"xnnpack_qdq_test_graph_conv_s8s8",
|
||||
{ExpectedEPNodeAssignment::Some, 0.2f});
|
||||
int8_t /* WeightType */,
|
||||
int32_t /* BiasType */,
|
||||
int8_t /* OutputType */>(
|
||||
{1, 1, 5, 5} /* input_shape */,
|
||||
{1, 1, 3, 3} /* weights_shape */),
|
||||
"xnnpack_qdq_test_graph_conv_s8s8",
|
||||
{ExpectedEPNodeAssignment::Some, 0.2f});
|
||||
|
||||
const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "conv_qdq_s8s8.onnx";
|
||||
RunModelTestWithPath(ort_model_path, "xnnpack_qdq_test_graph_conv_s8s8", 0.7f);
|
||||
|
|
|
|||
|
|
@ -14,7 +14,12 @@ namespace onnxruntime {
|
|||
namespace test {
|
||||
|
||||
std::unique_ptr<IExecutionProvider> DefaultCpuExecutionProvider(bool enable_arena) {
|
||||
return CPUProviderFactoryCreator::Create(enable_arena)->CreateProvider();
|
||||
auto ret = CPUProviderFactoryCreator::Create(enable_arena)->CreateProvider();
|
||||
// The factory created CPU provider doesn't create/reg allocators; something that is expected by
|
||||
// clients of DefaultCpuExecutionProvider; hence the need to call RegisterAllocator explicitly.
|
||||
AllocatorManager mgr; // needed only to call RegisterAllocator
|
||||
ret->RegisterAllocator(mgr);
|
||||
return ret;
|
||||
}
|
||||
|
||||
std::unique_ptr<IExecutionProvider> DefaultTensorrtExecutionProvider() {
|
||||
|
|
|
|||
Loading…
Reference in a new issue