Handle mem pattern allocation failure better. Make BFCArena behavior more consistent (#4062)

* Fixes from investigating an issue running the BERT-Squad model with larger batch sizes. When the batch size gets large enough, the initial run will be successful (no memory pattern in use) but the second will fail to allocate the memory pattern block. The cause of this failure is that we still have the smaller blocks from the first run allocated, as BFCArena has no logic to free those. This essentially results in 2x the memory being required to run the model. There was an inconsistency in BFCArena::Extend, which on one path threw an exception if it couldn't do the allocation, and on another just returned false (resulting in Alloc returning a nullptr). Make the behavior consistent by always throwing if BFCArena fails to find a buffer to return. There are a huge number of places in the code where we assume Alloc returns a valid pointer, so throwing will result in more correct behavior as a whole. It's also consistent with what happens when CUDA or the standard library fails to allocate memory. Next, update ExecutionFrame to check for this failure and not insert a memory block entry if it happens. With the existing code, if BFCArena Alloc returned a nullptr we happily inserted that in the blocks, delaying detection of the failure to when we attempted to use the block in AllocateMLValueTensorSelfOwnBufferHelper. Finally, update AllocateMLValueTensorSelfOwnBufferHelper to expect that a location may not have a block. A log message will be provided when the block allocation fails, so it's not necessary to log again on each individual allocation that would have used the block. That case falls through to the default behavior of doing a normal allocation.
2026-07-06 04:28:32 +00:00 · 2020-06-05 18:54:01 +10:00 · 2020-06-05 18:54:01 +10:00 · 9790e19424
commit 9790e19424
parent 81101c9efd
6 changed files with 133 additions and 57 deletions
--- a/include/onnxruntime/core/framework/allocator.h
+++ b/include/onnxruntime/core/framework/allocator.h
@ -118,7 +118,7 @@ struct OrtMemoryInfo {
  std::string ToString() const {
    std::ostringstream ostr;
    ostr << "OrtMemoryInfo: ["
-         << " name:" << name
+         << "name:" << name
         << " id:" << id
         << " mem_type:" << mem_type
         << " alloc_type:" << alloc_type
@ -306,9 +306,9 @@ class MiMallocAllocator : public IDeviceAllocator {
 #endif

 #if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
-  using TAllocator = MiMallocAllocator;
+using TAllocator = MiMallocAllocator;
 #else
-  using TAllocator = CPUAllocator;
+using TAllocator = CPUAllocator;
 #endif

 using AllocatorPtr = std::shared_ptr<IAllocator>;
--- a/onnxruntime/core/framework/bfc_arena.cc
+++ b/onnxruntime/core/framework/bfc_arena.cc
@ -14,8 +14,8 @@ BFCArena::BFCArena(std::unique_ptr<IDeviceAllocator> resource_allocator,
            device_allocator_->Info().device, device_allocator_->Info().id, device_allocator_->Info().mem_type) {
  LOGS_DEFAULT(INFO) << "Creating BFCArena for " << device_allocator_->Info().name;

-  // TODO - consider to make the initial chunk size and max 'fragmentation' (kMaxDeadBytesInChunk) values configurable. 
-  // But first we need to add a mechanism to allow that sort of low level configuration to be done 
+  // TODO - consider to make the initial chunk size and max 'fragmentation' (kMaxDeadBytesInChunk) values configurable.
+  // But first we need to add a mechanism to allow that sort of low level configuration to be done
  // without adding separate parameters to SessionOptions for every single one of them.
  curr_region_allocation_bytes_ = RoundedBytes(std::min(total_memory, size_t{1048576}));

@ -62,7 +62,7 @@ BFCArena::Chunk* BFCArena::ChunkFromHandle(ChunkHandle h) {
  return &(chunks_[h]);
 }

-bool BFCArena::Extend(size_t rounded_bytes) {
+Status BFCArena::Extend(size_t rounded_bytes) {
  size_t available_bytes = memory_limit_ - static_cast<size_t>(stats_.total_allocated_bytes);
  // Rounds available_bytes down to the nearest multiple of kMinAllocationSize.
  available_bytes = (available_bytes / kMinAllocationSize) * kMinAllocationSize;
@ -70,7 +70,8 @@ bool BFCArena::Extend(size_t rounded_bytes) {
  // Do we have enough space to handle the client's request?
  // If not, fail immediately.
  if (rounded_bytes > available_bytes) {
-    return false;
+    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Available memory of ", available_bytes,
+                           " is smaller than requested bytes of ", rounded_bytes);
  }

  auto safe_alloc = [this](size_t alloc_bytes) {
@ -137,11 +138,11 @@ bool BFCArena::Extend(size_t rounded_bytes) {
  }

  if (mem_addr == nullptr) {
-    ORT_THROW("Failed to allocate memory for requested buffer of size ", rounded_bytes);
+    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
+                           "Failed to allocate memory for requested buffer of size ", rounded_bytes);
  }

-  LOGS_DEFAULT(INFO) << "Extended allocation by " << bytes
-                     << " bytes.";
+  LOGS_DEFAULT(INFO) << "Extended allocation by " << bytes << " bytes.";

  stats_.total_allocated_bytes += bytes;
  LOGS_DEFAULT(INFO) << "Total allocated bytes: "
@ -170,7 +171,7 @@ bool BFCArena::Extend(size_t rounded_bytes) {
  // Insert the chunk into the right bin.
  InsertFreeChunkIntoBin(h);

-  return true;
+  return Status::OK();
 }

 BFCArena::ChunkHandle BFCArena::AllocateChunk() {
@ -260,10 +261,15 @@ void* BFCArena::AllocateRawInternal(size_t num_bytes,
                     << ". bin_num:" << bin_num << " rounded_bytes:" << rounded_bytes;

  // Try to extend
-  if (Extend(rounded_bytes)) {
+  auto status = Extend(rounded_bytes);
+  if (status.IsOK()) {
    ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes);
    if (ptr != nullptr) {
      return ptr;
+    } else {
+      status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
+                               "Failed to find a free memory block despite calling Extend. rounded_bytes=",
+                               rounded_bytes);
    }
  }

@ -271,13 +277,12 @@ void* BFCArena::AllocateRawInternal(size_t num_bytes,
  // couldn't find one.  This means we must have run out of memory,
  // Dump the memory log for analysis.
  if (dump_log_on_failure) {
-    LOGS_DEFAULT(ERROR) << "BFC Arena ran out of memory trying "
-                        << "to allocate " << num_bytes
+    LOGS_DEFAULT(ERROR) << "BFC Arena ran out of memory trying to allocate " << num_bytes
                        << ".  Current allocation summary follows.";
    DumpMemoryLog(rounded_bytes);
  }

-  return nullptr;
+  ORT_THROW(status.ErrorMessage());
 }

 void BFCArena::GetStats(AllocatorStats* stats) {
@ -307,8 +312,7 @@ void* BFCArena::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,
        // kMaxDeadBytesInChunk bytes on padding this alloc.
        const int64_t kMaxDeadBytesInChunk = 128 << 20;  // 128mb
        if (chunk->size >= rounded_bytes * 2 ||
-            static_cast<int64_t>(chunk->size) - rounded_bytes >=
-                kMaxDeadBytesInChunk) {
+            static_cast<int64_t>(chunk->size) - rounded_bytes >= kMaxDeadBytesInChunk) {
          SplitChunk(h, rounded_bytes);
          chunk = ChunkFromHandle(h);  // Update chunk pointer in case it moved
        }
--- a/onnxruntime/core/framework/bfc_arena.h
+++ b/onnxruntime/core/framework/bfc_arena.h
@ -321,9 +321,8 @@ class BFCArena : public IArenaAllocator {
  size_t RoundedBytes(size_t bytes);

  // Try to add a new memory region that can satisfy an allocation of
-  // 'rounded_bytes' bytes.  Returns true on success and false on
-  // failure.
-  bool Extend(size_t rounded_bytes);
+  // 'rounded_bytes' bytes.
+  Status Extend(size_t rounded_bytes);

  // Returns a pointer to an underlying allocated chunk of size
  // 'rounded_bytes'.
--- a/onnxruntime/core/framework/execution_frame.cc
+++ b/onnxruntime/core/framework/execution_frame.cc
@ -243,15 +243,39 @@ ExecutionFrame::ExecutionFrame(const std::vector<int>& feed_mlvalue_idxs, const
        // pre-allocate the big chunk requested in memory pattern.
        // all the internal kernel's input/output tensors will be allocated on these buffer.
        for (size_t i = 0; i < mem_patterns_->locations.size(); i++) {
-          ORT_ENFORCE(buffers_.find(mem_patterns_->locations[i]) == buffers_.end());
-          AllocatorPtr alloc = GetAllocator(mem_patterns_->locations[i]);
-          void* buffer = mem_patterns_->patterns[i].PeakSize() > 0
-                             ? alloc->Alloc(mem_patterns_->patterns[i].PeakSize())
-                             : nullptr;
-          buffers_[mem_patterns_->locations[i]] = BufferUniquePtr(buffer, alloc);
+          const auto& location = mem_patterns_->locations[i];
+          ORT_ENFORCE(buffers_.find(location) == buffers_.end());
+          if (mem_patterns_->patterns[i].PeakSize() > 0) {
+            AllocatorPtr alloc = GetAllocator(location);
+            void* buffer = nullptr;
+            // it's possible we can't allocate the large block. if we have memory patterns we know we have successfully
+            // executed once before, so if there's an arena involved it probably has smaller blocks available.
+            // due to that we can still run and use those blocks (inside the arena logic) instead of one large one.
+            // it's less efficient (the arena will add some overhead to coalesce individual allocations
+            // back into blocks on 'free'), but better than failing completely.
+            try {
+              buffer = alloc->Alloc(mem_patterns_->patterns[i].PeakSize());

-          // log size of activation. Keep it commented out for now to avoid log flooding.
-          // VLOGS(session_state_.Logger(), 1) << "Allocated memory for activations, size: " << mem_patterns_->patterns[i].PeakSize();
+              // handle allocator that doesn't throw
+              if (buffer == nullptr) {
+                // INFO level as this may fire on every run and there may not be much a user can do
+                LOGS(session_state_.Logger(), INFO) << "Allocation of memory pattern buffer for "
+                                                    << location.ToString() << " returned nullptr";
+              }
+
+            } catch (const OnnxRuntimeException& ex) {
+              LOGS(session_state_.Logger(), INFO) << "Allocation of memory pattern buffer for "
+                                                  << location.ToString() << " failed. Error:" << ex.what();
+            }
+
+            if (buffer != nullptr) {
+              buffers_[location] = BufferUniquePtr(buffer, alloc);
+            }
+
+            // log size of activation. Keep it commented out for now to avoid log flooding.
+            // VLOGS(session_state_.Logger(), 1) << "Allocated memory for activations, size: "
+            //                                   << mem_patterns_->patterns[i].PeakSize();
+          }
        }
      }
    }
@ -312,30 +336,30 @@ Status ExecutionFrame::AllocateMLValueTensorSelfOwnBufferHelper(OrtValue& ort_va
      // if block not found, fall back to default behavior
      if (block) {
        auto it = buffers_.find(location);
-        // if the block is not correct, log message then fall back to default behavior
-        if (it != buffers_.end() && block->size_ == size) {
-          void* buffer = it->second.get();
-          auto status = AllocateTensorWithPreAllocateBufferHelper(
-              ort_value, static_cast<void*>(static_cast<char*>(buffer) + block->offset_), element_type, location,
-              shape);
-          return status;
-        }
-        if (block->size_ != size) {
-          // the block size may vary especially if the model has NonZero ops, or different sequence lengths are
-          // fed in, so use VERBOSE as the log level as it's expected.
-          // TODO: Should we re-use the block if the size is large enough? Would probably need to allow it
-          // to be freed if the size difference was too large so our memory usage doesn't stick at a high water mark
-          LOGS(session_state_.Logger(), VERBOSE) << "For ort_value with index: " << ort_value_index
-                                                 << ", block in memory pattern size is: " << block->size_
-                                                 << " but the actually size is: " << size
-                                                 << ", fall back to default allocation behavior";
-        } else if (it == buffers_.end()) {
-          LOGS(session_state_.Logger(), WARNING) << "For ort_value with index: " << ort_value_index
-                                                 << ", block not found in target location. fall back to default allocation behavior";
+        if (it != buffers_.end()) {
+          // if the block is not correct, log message then fall back to default behavior
+          if (block->size_ == size) {
+            void* buffer = it->second.get();
+            auto status = AllocateTensorWithPreAllocateBufferHelper(
+                ort_value, static_cast<void*>(static_cast<char*>(buffer) + block->offset_), element_type, location,
+                shape);
+            return status;
+          } else {
+            // the block size may vary especially if the model has NonZero ops, or different sequence lengths are
+            // fed in, so use VERBOSE as the log level as it's expected.
+            // TODO: Should we re-use the block if the size is large enough? Would probably need to allow it
+            // to be freed if the size difference was too large so our memory usage doesn't stick at a high water mark
+            LOGS(session_state_.Logger(), VERBOSE) << "For ort_value with index: " << ort_value_index
+                                                   << ", block in memory pattern size is: " << block->size_
+                                                   << " but the actually size is: " << size
+                                                   << ", fall back to default allocation behavior";
+          }
        }
+        // else { we couldn't allocate the large block for the buffer so we didn't insert an entry }
      }
    }
  }
+
  //no memory pattern, or the pattern is not correct.
  std::unique_ptr<Tensor> p_tensor = onnxruntime::make_unique<Tensor>(element_type, shape, alloc);

--- a/onnxruntime/core/framework/memcpy.cc
+++ b/onnxruntime/core/framework/memcpy.cc
@ -14,6 +14,15 @@ Status Memcpy::Compute(OpKernelContext* ctx) const {
  const auto* X = ctx->Input<Tensor>(0);
  Tensor* Y = ctx->Output(0, X->Shape());
  Status retval = Info().GetDataTransferManager().CopyTensor(*X, *Y, Info().GetKernelDef().ExecQueueId());
+
+  if (!retval.IsOK()) {
+    LOGS(ctx->Logger(), ERROR) << MakeString(retval.ErrorMessage(),
+                                             " Copying ", Node().InputDefs()[0]->Name(),
+                                             " to ", Node().OutputDefs()[0]->Name(),
+                                             " Input shape:", X->Shape(), " Output shape:", Y->Shape(),
+                                             " X data:", X->DataRaw(), " Y data:", Y->DataRaw());
+  }
+
  return retval;
 }

--- a/onnxruntime/test/framework/bfc_arena_test.cc
+++ b/onnxruntime/test/framework/bfc_arena_test.cc
@ -3,6 +3,7 @@

 #include "core/framework/bfc_arena.h"
 #include "gtest/gtest.h"
+#include "gmock/gmock.h"
 #include <cstdlib>

 namespace onnxruntime {
@ -72,8 +73,8 @@ TEST(BFCArenaTest, AllocationsAndDeallocations) {

  // Ensure out of memory errors work and do not prevent future allocations from
  // working.
-  void* out_of_memory_ptr = a.Alloc((1 << 30) + 1);
-  EXPECT_EQ(out_of_memory_ptr, nullptr);
+
+  EXPECT_THROW(a.Alloc((1 << 30) + 1), OnnxRuntimeException);

  // Allocate a lot of raw pointers
  for (int s = 1; s < 256; s++) {
@ -152,15 +153,54 @@ TEST(BFCArenaTest, AllocatedVsRequested) {
 }

 TEST(BFCArenaTest, TestCustomMemoryLimit) {
-  // Configure a 1MiB byte limit
-  BFCArena a(std::unique_ptr<IDeviceAllocator>(new CPUAllocator()), 1 << 20);
+  {
+    // Configure a 1MiB byte limit
+    BFCArena a(std::unique_ptr<IDeviceAllocator>(new CPUAllocator()), 1 << 20);

-  void* first_ptr = a.Alloc(sizeof(float) * (1 << 6));
-  void* second_ptr = a.Alloc(sizeof(float) * (1 << 20));
+    void* first_ptr = a.Alloc(sizeof(float) * (1 << 6));
+    EXPECT_NE(nullptr, first_ptr);

-  EXPECT_NE(nullptr, first_ptr);
-  EXPECT_EQ(nullptr, second_ptr);
-  a.Free(first_ptr);
+    // test allocation of more than available memory throws
+    try {
+      a.Alloc(sizeof(float) * (1 << 20));
+      FAIL() << "Allocation should have thrown";
+    } catch (const OnnxRuntimeException& ex) {
+#ifdef GTEST_USES_POSIX_RE
+      EXPECT_THAT(ex.what(),
+                  testing::ContainsRegex("Available memory of [0-9]+ is smaller than requested bytes of [0-9]+"));
+#else
+      EXPECT_THAT(ex.what(),
+                  testing::ContainsRegex("Available memory of \\d+ is smaller than requested bytes of \\d+"));
+#endif
+    } catch (...) {
+      FAIL() << "Allocation should have thrown OnnxRuntimeException";
+    }
+
+    a.Free(first_ptr);
+  }
+
+  {
+    // allow for the maximum amount of memory less 5MiB
+    constexpr size_t available = std::numeric_limits<size_t>::max() - (5 * 1024 * 1024);
+    BFCArena b(std::unique_ptr<IDeviceAllocator>(new CPUAllocator()), available,
+               ArenaExtendStrategy::kSameAsRequested);  // need this strategy. kNextPowerOfTwo would overflow size_t
+
+    void* first_ptr = b.Alloc(sizeof(float) * (1 << 6));
+    EXPECT_NE(nullptr, first_ptr);
+
+    // test allocation that is less than available memory, but more than what could reasonably be expected to exist.
+    // first alloc creates a 1MB block so allow for that not being available.
+    try {
+      b.Alloc(available - (3 * 1024 * 1024));
+      FAIL() << "Allocation should have thrown";
+    } catch (const OnnxRuntimeException& ex) {
+      EXPECT_THAT(ex.what(), testing::HasSubstr("Failed to allocate memory for requested buffer of size"));
+    } catch (...) {
+      FAIL() << "Allocation should have thrown OnnxRuntimeException";
+    }
+
+    b.Free(first_ptr);
+  }
 }

 TEST(BFCArenaTest, AllocationsAndDeallocationsWithGrowth) {