From d91b49dbaacb3bc39a0d0126fa4ffeafe1f65889 Mon Sep 17 00:00:00 2001
From: zdevito
Date: Thu, 29 Aug 2024 09:19:05 -0700
Subject: [PATCH] expandable_segments <-> other allocator options (#134338)

Previously setting garbage_collection_threshold or max_split_size_mb along
with expandable_segments:True could cause the allocator to hit assert failures
when running nearly out of memory. This PR ensures garbage_collection and
max_split freeing do not accidentally try to release expandable segments.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134338
Approved by: https://github.com/ezyang
---
 c10/cuda/CUDACachingAllocator.cpp | 16 +++++----
 test/test_cuda.py                 | 56 +++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+), 6 deletions(-)

diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp
index bfab4507b62..fa26a9f3e0c 100644
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@@ -2611,7 +2611,7 @@ class DeviceCachingAllocator {
       while (it != large_blocks.blocks.end()) {
         Block* block = *it;
         ++it;
-        if (!block->is_split() &&
+        if (!block->is_split() && !block->expandable_segment_ &&
             static_cast<double>(block->gc_count()) >= age_threshold) {
           block_freed = true;
           gc_reclaimed += block->size;
@@ -2754,7 +2754,8 @@
         ? CUDAAllocatorConfig::max_split_size()
         : key.size;
     auto it = pool.blocks.lower_bound(&key);
-    if (it == pool.blocks.end() || (*it)->stream != p.stream()) {
+    if (it == pool.blocks.end() || (*it)->stream != p.stream() ||
+        (*it)->expandable_segment_) {
       // No single block is large enough; free multiple oversize blocks,
       // starting with the largest
       if (it == pool.blocks.begin())
@@ -2766,12 +2767,15 @@
              ((*it)->size >= CUDAAllocatorConfig::max_split_size()) &&
              ((*it)->stream == p.stream())) {
         auto cur = it;
-        totalReleased += (*it)->size;
-        if (it != pool.blocks.begin()) {
+        bool is_first = cur == pool.blocks.begin();
+        if (!is_first) {
           --it;
+        }
+        if (!(*cur)->expandable_segment_) {
           release_block(*cur, context);
-        } else {
-          release_block(*cur, context);
+          totalReleased += (*cur)->size;
+        }
+        if (is_first) {
           break;
         }
       }
diff --git a/test/test_cuda.py b/test/test_cuda.py
index b2f1ef3b53c..f53ae5f61da 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -4098,6 +4098,62 @@ class TestCudaMallocAsync(TestCase):
         finally:
             torch.cuda.memory._record_memory_history(None)
 
+    def test_max_split_expandable(self):
+        torch.cuda.memory.empty_cache()
+        mb = 1024 * 1024
+        _, all_memory = torch.cuda.memory.mem_get_info()
+        total_allowed = 120 * mb
+        fraction_allowed = total_allowed / all_memory
+        assert int(fraction_allowed * all_memory) == total_allowed
+        torch.cuda.memory.set_per_process_memory_fraction(fraction_allowed)
+
+        def alloc(n):
+            return torch.ones(n * mb, dtype=torch.int8, device="cuda")
+
+        torch.cuda.memory._set_allocator_settings(
+            "expandable_segments:False,max_split_size_mb:40"
+        )
+        a = alloc(40)
+        torch.cuda.memory._set_allocator_settings(
+            "expandable_segments:True,max_split_size_mb:40"
+        )
+        b = alloc(40)
+        torch.cuda.memory._set_allocator_settings(
+            "expandable_segments:False,max_split_size_mb:40"
+        )
+        c = alloc(40)
+        with self.assertRaises(torch.OutOfMemoryError):
+            alloc(40)
+        del a, b, c
+        # force release_cached_blocks to run with some expandable segments in the free list
+        alloc(120)
+
+    def test_garbage_collect_expandable(self):
+        torch.cuda.memory.empty_cache()
+        mb = 1024 * 1024
+        _, all_memory = torch.cuda.memory.mem_get_info()
+        total_allowed = 120 * mb
+        fraction_allowed = total_allowed / all_memory
+        assert int(fraction_allowed * all_memory) == total_allowed
+        torch.cuda.memory.set_per_process_memory_fraction(fraction_allowed)
+
+        def alloc(n):
+            return torch.ones(n * mb, dtype=torch.int8, device="cuda")
+
+        torch.cuda.memory._set_allocator_settings(
+            "expandable_segments:False,garbage_collection_threshold:0.5"
+        )
+        a = alloc(40)
+        torch.cuda.memory._set_allocator_settings(
+            "expandable_segments:True,garbage_collection_threshold:0.5"
+        )
+        b = alloc(40)
+        del a, b
+        # causes GC to run. The expandable segment block will be split
+        # so GC would not attempt to free it anyway, but this at least makes sure
+        # expandable_segment blocks can be in the free list when this is called.
+        alloc(80)
+
     def test_allocator_settings(self):
         def power2_div(size, div_factor):
             pow2 = 1
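
The tests above flip allocator options at run time through the private
torch.cuda.memory._set_allocator_settings hook so that blocks cached under one
configuration are still sitting in the free list when the next configuration
allocates. Outside the test suite the same options are normally combined once,
before the first CUDA allocation, through the PYTORCH_CUDA_ALLOC_CONF
environment variable. The snippet below is an illustrative sketch of that
combination, not part of the patch; the 40 MB split size, 0.5 threshold, and
allocation size are arbitrary example values.

    import os

    # Combine the options this patch makes safe to mix. Set the variable before
    # CUDA is initialized; setting it before importing torch is the conservative
    # choice.
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = (
        "expandable_segments:True,"
        "max_split_size_mb:40,"
        "garbage_collection_threshold:0.5"
    )

    import torch

    # The first CUDA allocation initializes the caching allocator with the
    # settings above.
    x = torch.ones(64 * 1024 * 1024, dtype=torch.int8, device="cuda")
    del x
    torch.cuda.empty_cache()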