From 052e82446705cfee3ff5842fb4ebeea51b5bba79 Mon Sep 17 00:00:00 2001 From: Zheng Yan Date: Sun, 4 Feb 2024 16:45:25 +0000 Subject: [PATCH] improve CUDACachingAllocator lock contention (#118550) Summary: NativeCachingAllocator has a global lock which shows lock contention with one process using multiple GPUs. The lock is required to lookup Block from pointer. We can make the lock more fine grain to reduce the lock contention. Test Plan: existing unittests, verified on prod models using eight GPUs showing double digits improvements Differential Revision: D52493091 Pull Request resolved: https://github.com/pytorch/pytorch/pull/118550 Approved by: https://github.com/albanD --- c10/cuda/CUDACachingAllocator.cpp | 32 +++++++++++++++++++++++-------- c10/util/hash.h | 11 +++++++++++ 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index f44703c5a9a..088486b4ee1 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -2836,14 +2837,28 @@ void local_raw_delete(void* ptr); class NativeCachingAllocator : public CUDAAllocator { private: - std::mutex mutex; + // Shard allocation region to have independent mutexes to reduce contention. 
+  static constexpr size_t kNumMutexShard = 67;
+
+  // TODO: use std::hardware_destructive_interference_size once available
+  struct alignas(64) AlignedMutex {
+    std::mutex m;
+  };
+
+  std::array<AlignedMutex, kNumMutexShard> mutex;
 
   // allocated blocks by device pointer
-  ska::flat_hash_map<void*, Block*> allocated_blocks;
+  std::array<ska::flat_hash_map<void*, Block*>, kNumMutexShard>
+      allocated_blocks;
+
+  static size_t get_mutex_shard_id(void* ptr) {
+    return twang_mix64((size_t)ptr) % kNumMutexShard;
+  }
 
   void add_allocated_block(Block* block) {
-    std::lock_guard<std::mutex> lock(mutex);
-    allocated_blocks[block->ptr] = block;
+    const auto mutex_shard_id = get_mutex_shard_id(block->ptr);
+    std::lock_guard<std::mutex> lock(mutex[mutex_shard_id].m);
+    allocated_blocks[mutex_shard_id][block->ptr] = block;
   }
 
   c10::ApproximateClockToUnixTimeConverter clock_converter;
@@ -2852,14 +2867,15 @@ class NativeCachingAllocator : public CUDAAllocator {
   std::vector<std::unique_ptr<DeviceCachingAllocator>> device_allocator;
 
   Block* get_allocated_block(void* ptr, bool remove = false) {
-    std::lock_guard<std::mutex> lock(mutex);
-    auto it = allocated_blocks.find(ptr);
-    if (it == allocated_blocks.end()) {
+    const auto mutex_shard_id = get_mutex_shard_id(ptr);
+    std::lock_guard<std::mutex> lock(mutex[mutex_shard_id].m);
+    auto it = allocated_blocks[mutex_shard_id].find(ptr);
+    if (it == allocated_blocks[mutex_shard_id].end()) {
       return nullptr;
     }
     Block* block = it->second;
     if (remove) {
-      allocated_blocks.erase(it);
+      allocated_blocks[mutex_shard_id].erase(it);
     }
     return block;
   }
diff --git a/c10/util/hash.h b/c10/util/hash.h
index c8993e6dbaf..a6a1c733403 100644
--- a/c10/util/hash.h
+++ b/c10/util/hash.h
@@ -240,6 +240,17 @@ struct sha1 {
   std::size_t bit_count_high{};
 };
 
+constexpr uint64_t twang_mix64(uint64_t key) noexcept {
+  key = (~key) + (key << 21); // key *= (1 << 21) - 1; key -= 1;
+  key = key ^ (key >> 24);
+  key = key + (key << 3) + (key << 8); // key *= 1 + (1 << 3) + (1 << 8)
+  key = key ^ (key >> 14);
+  key = key + (key << 2) + (key << 4); // key *= 1 + (1 << 2) + (1 << 4)
+  key = key ^ (key >> 28);
+  key = key + (key << 31); // key *= 1 + (1 << 31)
+  return key;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // c10::hash implementation
 ////////////////////////////////////////////////////////////////////////////////