diff --git a/include/onnxruntime/core/framework/allocator.h b/include/onnxruntime/core/framework/allocator.h
index d79b9bc8d1..2b5be2308a 100644
--- a/include/onnxruntime/core/framework/allocator.h
+++ b/include/onnxruntime/core/framework/allocator.h
@@ -118,7 +118,7 @@ struct OrtMemoryInfo {
   std::string ToString() const {
     std::ostringstream ostr;
     ostr << "OrtMemoryInfo: ["
-         << " name:" << name
+         << "name:" << name
          << " id:" << id
          << " mem_type:" << mem_type
          << " alloc_type:" << alloc_type
@@ -306,9 +306,9 @@ class MiMallocAllocator : public IDeviceAllocator {
 #endif
 
 #if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
-  using TAllocator = MiMallocAllocator;
+using TAllocator = MiMallocAllocator;
 #else
-  using TAllocator = CPUAllocator;
+using TAllocator = CPUAllocator;
 #endif
 
 using AllocatorPtr = std::shared_ptr<IAllocator>;
diff --git a/onnxruntime/core/framework/bfc_arena.cc b/onnxruntime/core/framework/bfc_arena.cc
index 631a207ded..565c2f41cf 100644
--- a/onnxruntime/core/framework/bfc_arena.cc
+++ b/onnxruntime/core/framework/bfc_arena.cc
@@ -14,8 +14,8 @@ BFCArena::BFCArena(std::unique_ptr<IDeviceAllocator> resource_allocator,
             device_allocator_->Info().device, device_allocator_->Info().id, device_allocator_->Info().mem_type) {
   LOGS_DEFAULT(INFO) << "Creating BFCArena for " << device_allocator_->Info().name;
 
-  // TODO - consider to make the initial chunk size and max 'fragmentation' (kMaxDeadBytesInChunk) values configurable. 
-  // But first we need to add a mechanism to allow that sort of low level configuration to be done 
+  // TODO - consider making the initial chunk size and max 'fragmentation' (kMaxDeadBytesInChunk) values configurable.
+  // But first we need to add a mechanism to allow that sort of low-level configuration to be done
   // without adding separate parameters to SessionOptions for every single one of them.
   curr_region_allocation_bytes_ = RoundedBytes(std::min(total_memory, size_t{1048576}));
 
@@ -62,7 +62,7 @@ BFCArena::Chunk* BFCArena::ChunkFromHandle(ChunkHandle h) {
   return &(chunks_[h]);
 }
 
-bool BFCArena::Extend(size_t rounded_bytes) {
+Status BFCArena::Extend(size_t rounded_bytes) {
   size_t available_bytes = memory_limit_ - static_cast<size_t>(stats_.total_allocated_bytes);
   // Rounds available_bytes down to the nearest multiple of kMinAllocationSize.
   available_bytes = (available_bytes / kMinAllocationSize) * kMinAllocationSize;
@@ -70,7 +70,8 @@ bool BFCArena::Extend(size_t rounded_bytes) {
   // Do we have enough space to handle the client's request?
   // If not, fail immediately.
   if (rounded_bytes > available_bytes) {
-    return false;
+    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Available memory of ", available_bytes,
+                           " is smaller than requested bytes of ", rounded_bytes);
   }
 
   auto safe_alloc = [this](size_t alloc_bytes) {
@@ -137,11 +138,11 @@ bool BFCArena::Extend(size_t rounded_bytes) {
   }
 
   if (mem_addr == nullptr) {
-    ORT_THROW("Failed to allocate memory for requested buffer of size ", rounded_bytes);
+    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
+                           "Failed to allocate memory for requested buffer of size ", rounded_bytes);
   }
 
-  LOGS_DEFAULT(INFO) << "Extended allocation by " << bytes
-                     << " bytes.";
+  LOGS_DEFAULT(INFO) << "Extended allocation by " << bytes << " bytes.";
 
   stats_.total_allocated_bytes += bytes;
   LOGS_DEFAULT(INFO) << "Total allocated bytes: "
@@ -170,7 +171,7 @@ bool BFCArena::Extend(size_t rounded_bytes) {
   // Insert the chunk into the right bin.
   InsertFreeChunkIntoBin(h);
 
-  return true;
+  return Status::OK();
 }
 
 BFCArena::ChunkHandle BFCArena::AllocateChunk() {
@@ -260,10 +261,15 @@ void* BFCArena::AllocateRawInternal(size_t num_bytes,
                      << ". bin_num:" << bin_num << " rounded_bytes:" << rounded_bytes;
 
   // Try to extend
-  if (Extend(rounded_bytes)) {
+  auto status = Extend(rounded_bytes);
+  if (status.IsOK()) {
     ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes);
     if (ptr != nullptr) {
       return ptr;
+    } else {
+      status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
+                               "Failed to find a free memory block despite calling Extend. rounded_bytes=",
+                               rounded_bytes);
     }
   }
 
@@ -271,13 +277,12 @@ void* BFCArena::AllocateRawInternal(size_t num_bytes,
   // couldn't find one.  This means we must have run out of memory,
   // Dump the memory log for analysis.
   if (dump_log_on_failure) {
-    LOGS_DEFAULT(ERROR) << "BFC Arena ran out of memory trying "
-                        << "to allocate " << num_bytes
+    LOGS_DEFAULT(ERROR) << "BFC Arena ran out of memory trying to allocate " << num_bytes
                         << ".  Current allocation summary follows.";
     DumpMemoryLog(rounded_bytes);
   }
 
-  return nullptr;
+  ORT_THROW(status.ErrorMessage());
 }
 
 void BFCArena::GetStats(AllocatorStats* stats) {
@@ -307,8 +312,7 @@ void* BFCArena::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,
         // kMaxDeadBytesInChunk bytes on padding this alloc.
         const int64_t kMaxDeadBytesInChunk = 128 << 20;  // 128mb
         if (chunk->size >= rounded_bytes * 2 ||
-            static_cast<int64_t>(chunk->size) - rounded_bytes >=
-                kMaxDeadBytesInChunk) {
+            static_cast<int64_t>(chunk->size) - rounded_bytes >= kMaxDeadBytesInChunk) {
           SplitChunk(h, rounded_bytes);
           chunk = ChunkFromHandle(h);  // Update chunk pointer in case it moved
         }
diff --git a/onnxruntime/core/framework/bfc_arena.h b/onnxruntime/core/framework/bfc_arena.h
index 82c8d962f7..9b503db26e 100644
--- a/onnxruntime/core/framework/bfc_arena.h
+++ b/onnxruntime/core/framework/bfc_arena.h
@@ -321,9 +321,8 @@ class BFCArena : public IArenaAllocator {
   size_t RoundedBytes(size_t bytes);
 
   // Try to add a new memory region that can satisfy an allocation of
-  // 'rounded_bytes' bytes.  Returns true on success and false on
-  // failure.
-  bool Extend(size_t rounded_bytes);
+  // 'rounded_bytes' bytes. Returns Status::OK() on success, or a FAIL status describing why the region was not added.
+  Status Extend(size_t rounded_bytes);
 
   // Returns a pointer to an underlying allocated chunk of size
   // 'rounded_bytes'.
diff --git a/onnxruntime/core/framework/execution_frame.cc b/onnxruntime/core/framework/execution_frame.cc
index ebd4d5d124..9f24a51670 100644
--- a/onnxruntime/core/framework/execution_frame.cc
+++ b/onnxruntime/core/framework/execution_frame.cc
@@ -243,15 +243,39 @@ ExecutionFrame::ExecutionFrame(const std::vector<int>& feed_mlvalue_idxs, const
         // pre-allocate the big chunk requested in memory pattern.
         // all the internal kernel's input/output tensors will be allocated on these buffer.
         for (size_t i = 0; i < mem_patterns_->locations.size(); i++) {
-          ORT_ENFORCE(buffers_.find(mem_patterns_->locations[i]) == buffers_.end());
-          AllocatorPtr alloc = GetAllocator(mem_patterns_->locations[i]);
-          void* buffer = mem_patterns_->patterns[i].PeakSize() > 0
-                             ? alloc->Alloc(mem_patterns_->patterns[i].PeakSize())
-                             : nullptr;
-          buffers_[mem_patterns_->locations[i]] = BufferUniquePtr(buffer, alloc);
+          const auto& location = mem_patterns_->locations[i];
+          ORT_ENFORCE(buffers_.find(location) == buffers_.end());
+          if (mem_patterns_->patterns[i].PeakSize() > 0) {
+            AllocatorPtr alloc = GetAllocator(location);
+            void* buffer = nullptr;
+            // It's possible we can't allocate the large block. If we have memory patterns we know we have successfully
+            // executed once before, so if there's an arena involved it probably has smaller blocks available.
+            // Because of that we can still run using those blocks (inside the arena logic) instead of one large one.
+            // That is less efficient (the arena will add some overhead to coalesce individual allocations
+            // back into blocks on 'free'), but better than failing completely.
+            try {
+              buffer = alloc->Alloc(mem_patterns_->patterns[i].PeakSize());
 
-          // log size of activation. Keep it commented out for now to avoid log flooding.
-          // VLOGS(session_state_.Logger(), 1) << "Allocated memory for activations, size: " << mem_patterns_->patterns[i].PeakSize();
+              // handle allocators that signal failure by returning nullptr instead of throwing
+              if (buffer == nullptr) {
+                // INFO level as this may fire on every run and there may not be much a user can do
+                LOGS(session_state_.Logger(), INFO) << "Allocation of memory pattern buffer for "
+                                                    << location.ToString() << " returned nullptr";
+              }
+
+            } catch (const OnnxRuntimeException& ex) {
+              LOGS(session_state_.Logger(), INFO) << "Allocation of memory pattern buffer for "
+                                                  << location.ToString() << " failed. Error:" << ex.what();
+            }
+
+            if (buffer != nullptr) {
+              buffers_[location] = BufferUniquePtr(buffer, alloc);
+            }
+
+            // log size of activation. Keep it commented out for now to avoid log flooding.
+            // VLOGS(session_state_.Logger(), 1) << "Allocated memory for activations, size: "
+            //                                   << mem_patterns_->patterns[i].PeakSize();
+          }
         }
       }
     }
@@ -312,30 +336,30 @@ Status ExecutionFrame::AllocateMLValueTensorSelfOwnBufferHelper(OrtValue& ort_va
       // if block not found, fall back to default behavior
       if (block) {
         auto it = buffers_.find(location);
-        // if the block is not correct, log message then fall back to default behavior
-        if (it != buffers_.end() && block->size_ == size) {
-          void* buffer = it->second.get();
-          auto status = AllocateTensorWithPreAllocateBufferHelper(
-              ort_value, static_cast<void*>(static_cast<char*>(buffer) + block->offset_), element_type, location,
-              shape);
-          return status;
-        }
-        if (block->size_ != size) {
-          // the block size may vary especially if the model has NonZero ops, or different sequence lengths are
-          // fed in, so use VERBOSE as the log level as it's expected.
-          // TODO: Should we re-use the block if the size is large enough? Would probably need to allow it
-          // to be freed if the size difference was too large so our memory usage doesn't stick at a high water mark
-          LOGS(session_state_.Logger(), VERBOSE) << "For ort_value with index: " << ort_value_index
-                                                 << ", block in memory pattern size is: " << block->size_
-                                                 << " but the actually size is: " << size
-                                                 << ", fall back to default allocation behavior";
-        } else if (it == buffers_.end()) {
-          LOGS(session_state_.Logger(), WARNING) << "For ort_value with index: " << ort_value_index
-                                                 << ", block not found in target location. fall back to default allocation behavior";
+        if (it != buffers_.end()) {
+          // if the block is not correct, log message then fall back to default behavior
+          if (block->size_ == size) {
+            void* buffer = it->second.get();
+            auto status = AllocateTensorWithPreAllocateBufferHelper(
+                ort_value, static_cast<void*>(static_cast<char*>(buffer) + block->offset_), element_type, location,
+                shape);
+            return status;
+          } else {
+            // the block size may vary especially if the model has NonZero ops, or different sequence lengths are
+            // fed in, so use VERBOSE as the log level as it's expected.
+            // TODO: Should we re-use the block if the size is large enough? Would probably need to allow it
+            // to be freed if the size difference was too large so our memory usage doesn't stick at a high water mark
+            LOGS(session_state_.Logger(), VERBOSE) << "For ort_value with index: " << ort_value_index
+                                                   << ", block in memory pattern size is: " << block->size_
+                                                   << " but the actually size is: " << size
+                                                   << ", fall back to default allocation behavior";
+          }
         }
+        // else { no buffer entry for this location: the peak size was zero or the large block allocation failed }
       }
     }
   }
+
   //no memory pattern, or the pattern is not correct.
   std::unique_ptr<Tensor> p_tensor = onnxruntime::make_unique<Tensor>(element_type, shape, alloc);
 
diff --git a/onnxruntime/core/framework/memcpy.cc b/onnxruntime/core/framework/memcpy.cc
index f1dfff4bc8..9c45a2044e 100644
--- a/onnxruntime/core/framework/memcpy.cc
+++ b/onnxruntime/core/framework/memcpy.cc
@@ -14,6 +14,15 @@ Status Memcpy::Compute(OpKernelContext* ctx) const {
   const auto* X = ctx->Input<Tensor>(0);
   Tensor* Y = ctx->Output(0, X->Shape());
   Status retval = Info().GetDataTransferManager().CopyTensor(*X, *Y, Info().GetKernelDef().ExecQueueId());
+
+  if (!retval.IsOK()) {
+    LOGS(ctx->Logger(), ERROR) << MakeString(retval.ErrorMessage(),
+                                             " Copying ", Node().InputDefs()[0]->Name(),
+                                             " to ", Node().OutputDefs()[0]->Name(),
+                                             " Input shape:", X->Shape(), " Output shape:", Y->Shape(),
+                                             " X data:", X->DataRaw(), " Y data:", Y->DataRaw());
+  }
+
   return retval;
 }
 
diff --git a/onnxruntime/test/framework/bfc_arena_test.cc b/onnxruntime/test/framework/bfc_arena_test.cc
index 7051f17246..88c6d16117 100644
--- a/onnxruntime/test/framework/bfc_arena_test.cc
+++ b/onnxruntime/test/framework/bfc_arena_test.cc
@@ -3,6 +3,7 @@
 
 #include "core/framework/bfc_arena.h"
 #include "gtest/gtest.h"
+#include "gmock/gmock.h"
 #include <cstdlib>
 
 namespace onnxruntime {
@@ -72,8 +73,8 @@ TEST(BFCArenaTest, AllocationsAndDeallocations) {
 
   // Ensure out of memory errors work and do not prevent future allocations from
   // working.
-  void* out_of_memory_ptr = a.Alloc((1 << 30) + 1);
-  EXPECT_EQ(out_of_memory_ptr, nullptr);
+
+  EXPECT_THROW(a.Alloc((1 << 30) + 1), OnnxRuntimeException);
 
   // Allocate a lot of raw pointers
   for (int s = 1; s < 256; s++) {
@@ -152,15 +153,54 @@ TEST(BFCArenaTest, AllocatedVsRequested) {
 }
 
 TEST(BFCArenaTest, TestCustomMemoryLimit) {
-  // Configure a 1MiB byte limit
-  BFCArena a(std::unique_ptr<IDeviceAllocator>(new CPUAllocator()), 1 << 20);
+  {
+    // Configure a 1MiB byte limit
+    BFCArena a(std::unique_ptr<IDeviceAllocator>(new CPUAllocator()), 1 << 20);
 
-  void* first_ptr = a.Alloc(sizeof(float) * (1 << 6));
-  void* second_ptr = a.Alloc(sizeof(float) * (1 << 20));
+    void* first_ptr = a.Alloc(sizeof(float) * (1 << 6));
+    EXPECT_NE(nullptr, first_ptr);
 
-  EXPECT_NE(nullptr, first_ptr);
-  EXPECT_EQ(nullptr, second_ptr);
-  a.Free(first_ptr);
+    // test allocation of more than available memory throws
+    try {
+      a.Alloc(sizeof(float) * (1 << 20));
+      FAIL() << "Allocation should have thrown";
+    } catch (const OnnxRuntimeException& ex) {
+#ifdef GTEST_USES_POSIX_RE
+      EXPECT_THAT(ex.what(),
+                  testing::ContainsRegex("Available memory of [0-9]+ is smaller than requested bytes of [0-9]+"));
+#else
+      EXPECT_THAT(ex.what(),
+                  testing::ContainsRegex("Available memory of \\d+ is smaller than requested bytes of \\d+"));
+#endif
+    } catch (...) {
+      FAIL() << "Allocation should have thrown OnnxRuntimeException";
+    }
+
+    a.Free(first_ptr);
+  }
+
+  {
+    // allow for the maximum amount of memory less 5MiB
+    constexpr size_t available = std::numeric_limits<size_t>::max() - (5 * 1024 * 1024);
+    BFCArena b(std::unique_ptr<IDeviceAllocator>(new CPUAllocator()), available,
+               ArenaExtendStrategy::kSameAsRequested);  // need this strategy. kNextPowerOfTwo would overflow size_t
+
+    void* first_ptr = b.Alloc(sizeof(float) * (1 << 6));
+    EXPECT_NE(nullptr, first_ptr);
+
+    // test allocation that is less than available memory, but more than what could reasonably be expected to exist.
+    // first alloc creates a 1MB block so allow for that not being available.
+    try {
+      b.Alloc(available - (3 * 1024 * 1024));
+      FAIL() << "Allocation should have thrown";
+    } catch (const OnnxRuntimeException& ex) {
+      EXPECT_THAT(ex.what(), testing::HasSubstr("Failed to allocate memory for requested buffer of size"));
+    } catch (...) {
+      FAIL() << "Allocation should have thrown OnnxRuntimeException";
+    }
+
+    b.Free(first_ptr);
+  }
 }
 
 TEST(BFCArenaTest, AllocationsAndDeallocationsWithGrowth) {