diff --git a/include/onnxruntime/core/framework/allocator.h b/include/onnxruntime/core/framework/allocator.h index d79b9bc8d1..2b5be2308a 100644 --- a/include/onnxruntime/core/framework/allocator.h +++ b/include/onnxruntime/core/framework/allocator.h @@ -118,7 +118,7 @@ struct OrtMemoryInfo { std::string ToString() const { std::ostringstream ostr; ostr << "OrtMemoryInfo: [" - << " name:" << name + << "name:" << name << " id:" << id << " mem_type:" << mem_type << " alloc_type:" << alloc_type @@ -306,9 +306,9 @@ class MiMallocAllocator : public IDeviceAllocator { #endif #if defined(USE_MIMALLOC_ARENA_ALLOCATOR) - using TAllocator = MiMallocAllocator; +using TAllocator = MiMallocAllocator; #else - using TAllocator = CPUAllocator; +using TAllocator = CPUAllocator; #endif using AllocatorPtr = std::shared_ptr; diff --git a/onnxruntime/core/framework/bfc_arena.cc b/onnxruntime/core/framework/bfc_arena.cc index 631a207ded..565c2f41cf 100644 --- a/onnxruntime/core/framework/bfc_arena.cc +++ b/onnxruntime/core/framework/bfc_arena.cc @@ -14,8 +14,8 @@ BFCArena::BFCArena(std::unique_ptr resource_allocator, device_allocator_->Info().device, device_allocator_->Info().id, device_allocator_->Info().mem_type) { LOGS_DEFAULT(INFO) << "Creating BFCArena for " << device_allocator_->Info().name; - // TODO - consider to make the initial chunk size and max 'fragmentation' (kMaxDeadBytesInChunk) values configurable. - // But first we need to add a mechanism to allow that sort of low level configuration to be done + // TODO - consider to make the initial chunk size and max 'fragmentation' (kMaxDeadBytesInChunk) values configurable. + // But first we need to add a mechanism to allow that sort of low level configuration to be done // without adding separate parameters to SessionOptions for every single one of them. curr_region_allocation_bytes_ = RoundedBytes(std::min(total_memory, size_t{1048576})); @@ -62,7 +62,7 @@ BFCArena::Chunk* BFCArena::ChunkFromHandle(ChunkHandle h) { return &(chunks_[h]); } -bool BFCArena::Extend(size_t rounded_bytes) { +Status BFCArena::Extend(size_t rounded_bytes) { size_t available_bytes = memory_limit_ - static_cast(stats_.total_allocated_bytes); // Rounds available_bytes down to the nearest multiple of kMinAllocationSize. available_bytes = (available_bytes / kMinAllocationSize) * kMinAllocationSize; @@ -70,7 +70,8 @@ bool BFCArena::Extend(size_t rounded_bytes) { // Do we have enough space to handle the client's request? // If not, fail immediately. if (rounded_bytes > available_bytes) { - return false; + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Available memory of ", available_bytes, + " is smaller than requested bytes of ", rounded_bytes); } auto safe_alloc = [this](size_t alloc_bytes) { @@ -137,11 +138,11 @@ bool BFCArena::Extend(size_t rounded_bytes) { } if (mem_addr == nullptr) { - ORT_THROW("Failed to allocate memory for requested buffer of size ", rounded_bytes); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "Failed to allocate memory for requested buffer of size ", rounded_bytes); } - LOGS_DEFAULT(INFO) << "Extended allocation by " << bytes - << " bytes."; + LOGS_DEFAULT(INFO) << "Extended allocation by " << bytes << " bytes."; stats_.total_allocated_bytes += bytes; LOGS_DEFAULT(INFO) << "Total allocated bytes: " @@ -170,7 +171,7 @@ bool BFCArena::Extend(size_t rounded_bytes) { // Insert the chunk into the right bin. InsertFreeChunkIntoBin(h); - return true; + return Status::OK(); } BFCArena::ChunkHandle BFCArena::AllocateChunk() { @@ -260,10 +261,15 @@ void* BFCArena::AllocateRawInternal(size_t num_bytes, << ". bin_num:" << bin_num << " rounded_bytes:" << rounded_bytes; // Try to extend - if (Extend(rounded_bytes)) { + auto status = Extend(rounded_bytes); + if (status.IsOK()) { ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes); if (ptr != nullptr) { return ptr; + } else { + status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "Failed to find a free memory block despite calling Extend. rounded_bytes=", + rounded_bytes); } } @@ -271,13 +277,12 @@ void* BFCArena::AllocateRawInternal(size_t num_bytes, // couldn't find one. This means we must have run out of memory, // Dump the memory log for analysis. if (dump_log_on_failure) { - LOGS_DEFAULT(ERROR) << "BFC Arena ran out of memory trying " - << "to allocate " << num_bytes + LOGS_DEFAULT(ERROR) << "BFC Arena ran out of memory trying to allocate " << num_bytes << ". Current allocation summary follows."; DumpMemoryLog(rounded_bytes); } - return nullptr; + ORT_THROW(status.ErrorMessage()); } void BFCArena::GetStats(AllocatorStats* stats) { @@ -307,8 +312,7 @@ void* BFCArena::FindChunkPtr(BinNum bin_num, size_t rounded_bytes, // kMaxDeadBytesInChunk bytes on padding this alloc. const int64_t kMaxDeadBytesInChunk = 128 << 20; // 128mb if (chunk->size >= rounded_bytes * 2 || - static_cast(chunk->size) - rounded_bytes >= - kMaxDeadBytesInChunk) { + static_cast(chunk->size) - rounded_bytes >= kMaxDeadBytesInChunk) { SplitChunk(h, rounded_bytes); chunk = ChunkFromHandle(h); // Update chunk pointer in case it moved } diff --git a/onnxruntime/core/framework/bfc_arena.h b/onnxruntime/core/framework/bfc_arena.h index 82c8d962f7..9b503db26e 100644 --- a/onnxruntime/core/framework/bfc_arena.h +++ b/onnxruntime/core/framework/bfc_arena.h @@ -321,9 +321,8 @@ class BFCArena : public IArenaAllocator { size_t RoundedBytes(size_t bytes); // Try to add a new memory region that can satisfy an allocation of - // 'rounded_bytes' bytes. Returns true on success and false on - // failure. - bool Extend(size_t rounded_bytes); + // 'rounded_bytes' bytes. + Status Extend(size_t rounded_bytes); // Returns a pointer to an underlying allocated chunk of size // 'rounded_bytes'. diff --git a/onnxruntime/core/framework/execution_frame.cc b/onnxruntime/core/framework/execution_frame.cc index ebd4d5d124..9f24a51670 100644 --- a/onnxruntime/core/framework/execution_frame.cc +++ b/onnxruntime/core/framework/execution_frame.cc @@ -243,15 +243,39 @@ ExecutionFrame::ExecutionFrame(const std::vector& feed_mlvalue_idxs, const // pre-allocate the big chunk requested in memory pattern. // all the internal kernel's input/output tensors will be allocated on these buffer. for (size_t i = 0; i < mem_patterns_->locations.size(); i++) { - ORT_ENFORCE(buffers_.find(mem_patterns_->locations[i]) == buffers_.end()); - AllocatorPtr alloc = GetAllocator(mem_patterns_->locations[i]); - void* buffer = mem_patterns_->patterns[i].PeakSize() > 0 - ? alloc->Alloc(mem_patterns_->patterns[i].PeakSize()) - : nullptr; - buffers_[mem_patterns_->locations[i]] = BufferUniquePtr(buffer, alloc); + const auto& location = mem_patterns_->locations[i]; + ORT_ENFORCE(buffers_.find(location) == buffers_.end()); + if (mem_patterns_->patterns[i].PeakSize() > 0) { + AllocatorPtr alloc = GetAllocator(location); + void* buffer = nullptr; + // it's possible we can't allocate the large block. if we have memory patterns we know we have successfully + // executed once before, so if there's an arena involved it probably has smaller blocks available. + // due to that we can still run and use those blocks (inside the arena logic) instead of one large one. + // it's less efficient (the arena will add some overhead to coalesce individual allocations + // back into blocks on 'free'), but better than failing completely. + try { + buffer = alloc->Alloc(mem_patterns_->patterns[i].PeakSize()); - // log size of activation. Keep it commented out for now to avoid log flooding. - // VLOGS(session_state_.Logger(), 1) << "Allocated memory for activations, size: " << mem_patterns_->patterns[i].PeakSize(); + // handle allocator that doesn't throw + if (buffer == nullptr) { + // INFO level as this may fire on every run and there may not be much a user can do + LOGS(session_state_.Logger(), INFO) << "Allocation of memory pattern buffer for " + << location.ToString() << " returned nullptr"; + } + + } catch (const OnnxRuntimeException& ex) { + LOGS(session_state_.Logger(), INFO) << "Allocation of memory pattern buffer for " + << location.ToString() << " failed. Error:" << ex.what(); + } + + if (buffer != nullptr) { + buffers_[location] = BufferUniquePtr(buffer, alloc); + } + + // log size of activation. Keep it commented out for now to avoid log flooding. + // VLOGS(session_state_.Logger(), 1) << "Allocated memory for activations, size: " + // << mem_patterns_->patterns[i].PeakSize(); + } } } } @@ -312,30 +336,30 @@ Status ExecutionFrame::AllocateMLValueTensorSelfOwnBufferHelper(OrtValue& ort_va // if block not found, fall back to default behavior if (block) { auto it = buffers_.find(location); - // if the block is not correct, log message then fall back to default behavior - if (it != buffers_.end() && block->size_ == size) { - void* buffer = it->second.get(); - auto status = AllocateTensorWithPreAllocateBufferHelper( - ort_value, static_cast(static_cast(buffer) + block->offset_), element_type, location, - shape); - return status; - } - if (block->size_ != size) { - // the block size may vary especially if the model has NonZero ops, or different sequence lengths are - // fed in, so use VERBOSE as the log level as it's expected. - // TODO: Should we re-use the block if the size is large enough? Would probably need to allow it - // to be freed if the size difference was too large so our memory usage doesn't stick at a high water mark - LOGS(session_state_.Logger(), VERBOSE) << "For ort_value with index: " << ort_value_index - << ", block in memory pattern size is: " << block->size_ - << " but the actually size is: " << size - << ", fall back to default allocation behavior"; - } else if (it == buffers_.end()) { - LOGS(session_state_.Logger(), WARNING) << "For ort_value with index: " << ort_value_index - << ", block not found in target location. fall back to default allocation behavior"; + if (it != buffers_.end()) { + // if the block is not correct, log message then fall back to default behavior + if (block->size_ == size) { + void* buffer = it->second.get(); + auto status = AllocateTensorWithPreAllocateBufferHelper( + ort_value, static_cast(static_cast(buffer) + block->offset_), element_type, location, + shape); + return status; + } else { + // the block size may vary especially if the model has NonZero ops, or different sequence lengths are + // fed in, so use VERBOSE as the log level as it's expected. + // TODO: Should we re-use the block if the size is large enough? Would probably need to allow it + // to be freed if the size difference was too large so our memory usage doesn't stick at a high water mark + LOGS(session_state_.Logger(), VERBOSE) << "For ort_value with index: " << ort_value_index + << ", block in memory pattern size is: " << block->size_ + << " but the actually size is: " << size + << ", fall back to default allocation behavior"; + } } + // else { we couldn't allocate the large block for the buffer so we didn't insert an entry } } } } + //no memory pattern, or the pattern is not correct. std::unique_ptr p_tensor = onnxruntime::make_unique(element_type, shape, alloc); diff --git a/onnxruntime/core/framework/memcpy.cc b/onnxruntime/core/framework/memcpy.cc index f1dfff4bc8..9c45a2044e 100644 --- a/onnxruntime/core/framework/memcpy.cc +++ b/onnxruntime/core/framework/memcpy.cc @@ -14,6 +14,15 @@ Status Memcpy::Compute(OpKernelContext* ctx) const { const auto* X = ctx->Input(0); Tensor* Y = ctx->Output(0, X->Shape()); Status retval = Info().GetDataTransferManager().CopyTensor(*X, *Y, Info().GetKernelDef().ExecQueueId()); + + if (!retval.IsOK()) { + LOGS(ctx->Logger(), ERROR) << MakeString(retval.ErrorMessage(), + " Copying ", Node().InputDefs()[0]->Name(), + " to ", Node().OutputDefs()[0]->Name(), + " Input shape:", X->Shape(), " Output shape:", Y->Shape(), + " X data:", X->DataRaw(), " Y data:", Y->DataRaw()); + } + return retval; } diff --git a/onnxruntime/test/framework/bfc_arena_test.cc b/onnxruntime/test/framework/bfc_arena_test.cc index 7051f17246..88c6d16117 100644 --- a/onnxruntime/test/framework/bfc_arena_test.cc +++ b/onnxruntime/test/framework/bfc_arena_test.cc @@ -3,6 +3,7 @@ #include "core/framework/bfc_arena.h" #include "gtest/gtest.h" +#include "gmock/gmock.h" #include namespace onnxruntime { @@ -72,8 +73,8 @@ TEST(BFCArenaTest, AllocationsAndDeallocations) { // Ensure out of memory errors work and do not prevent future allocations from // working. - void* out_of_memory_ptr = a.Alloc((1 << 30) + 1); - EXPECT_EQ(out_of_memory_ptr, nullptr); + + EXPECT_THROW(a.Alloc((1 << 30) + 1), OnnxRuntimeException); // Allocate a lot of raw pointers for (int s = 1; s < 256; s++) { @@ -152,15 +153,54 @@ TEST(BFCArenaTest, AllocatedVsRequested) { } TEST(BFCArenaTest, TestCustomMemoryLimit) { - // Configure a 1MiB byte limit - BFCArena a(std::unique_ptr(new CPUAllocator()), 1 << 20); + { + // Configure a 1MiB byte limit + BFCArena a(std::unique_ptr(new CPUAllocator()), 1 << 20); - void* first_ptr = a.Alloc(sizeof(float) * (1 << 6)); - void* second_ptr = a.Alloc(sizeof(float) * (1 << 20)); + void* first_ptr = a.Alloc(sizeof(float) * (1 << 6)); + EXPECT_NE(nullptr, first_ptr); - EXPECT_NE(nullptr, first_ptr); - EXPECT_EQ(nullptr, second_ptr); - a.Free(first_ptr); + // test allocation of more than available memory throws + try { + a.Alloc(sizeof(float) * (1 << 20)); + FAIL() << "Allocation should have thrown"; + } catch (const OnnxRuntimeException& ex) { +#ifdef GTEST_USES_POSIX_RE + EXPECT_THAT(ex.what(), + testing::ContainsRegex("Available memory of [0-9]+ is smaller than requested bytes of [0-9]+")); +#else + EXPECT_THAT(ex.what(), + testing::ContainsRegex("Available memory of \\d+ is smaller than requested bytes of \\d+")); +#endif + } catch (...) { + FAIL() << "Allocation should have thrown OnnxRuntimeException"; + } + + a.Free(first_ptr); + } + + { + // allow for the maximum amount of memory less 5MiB + constexpr size_t available = std::numeric_limits::max() - (5 * 1024 * 1024); + BFCArena b(std::unique_ptr(new CPUAllocator()), available, + ArenaExtendStrategy::kSameAsRequested); // need this strategy. kNextPowerOfTwo would overflow size_t + + void* first_ptr = b.Alloc(sizeof(float) * (1 << 6)); + EXPECT_NE(nullptr, first_ptr); + + // test allocation that is less than available memory, but more than what could reasonably be expected to exist. + // first alloc creates a 1MB block so allow for that not being available. + try { + b.Alloc(available - (3 * 1024 * 1024)); + FAIL() << "Allocation should have thrown"; + } catch (const OnnxRuntimeException& ex) { + EXPECT_THAT(ex.what(), testing::HasSubstr("Failed to allocate memory for requested buffer of size")); + } catch (...) { + FAIL() << "Allocation should have thrown OnnxRuntimeException"; + } + + b.Free(first_ptr); + } } TEST(BFCArenaTest, AllocationsAndDeallocationsWithGrowth) {