Lazily get allocator when allocating an MLValue (#4276)

According to profiling in #4267, getting the allocator can account for a large fraction of the overhead when accessing a kernel output, due to STL container operations. The allocator isn't needed when both (i) we're not creating a fence and (ii) we have a memory pattern and a pre-allocated buffer, so in that case we can skip the lookup and avoid this overhead.
2026-07-16 18:31:27 +00:00 · 2020-06-19 15:55:43 -07:00 · 2020-06-19 15:55:43 -07:00 · a541d28fb4
commit a541d28fb4
parent a490beedf1
1 changed files with 5 additions and 2 deletions
--- a/onnxruntime/core/framework/execution_frame.cc
+++ b/onnxruntime/core/framework/execution_frame.cc
@ -324,11 +324,13 @@ Status ExecutionFrame::AllocateMLValueTensorSelfOwnBufferHelper(OrtValue& ort_va
    return Status(ONNXRUNTIME, FAIL, "size overflow");
  }

-  auto alloc = GetAllocator(location);
-
+  // Lazily get the allocator only if needed.
+  AllocatorPtr alloc = nullptr;
+  
  // create fence if needed
  if (create_fence) {
    ORT_ENFORCE(ort_value.Fence() == nullptr);
+    alloc = GetAllocator(location);
    FencePtr f = alloc->CreateFence(&session_state_);
    // it is OK to have fence been nullptr if the execution provider has no async execution,
    // and allocator::CreateFence returns nullptr
@ -370,6 +372,7 @@ Status ExecutionFrame::AllocateMLValueTensorSelfOwnBufferHelper(OrtValue& ort_va
  }

  //no memory pattern, or the pattern is not correct.
+  if (!alloc) alloc = GetAllocator(location);
  std::unique_ptr<Tensor> p_tensor = onnxruntime::make_unique<Tensor>(element_type, shape, alloc);

  {