improve CUDACachingAllocator lock contention (#118550)

Summary: NativeCachingAllocator has a global lock that shows contention when one process uses multiple GPUs. The lock is required to look up a Block from its pointer. We can make the locking finer-grained (sharded) to reduce the contention.

Test Plan: existing unit tests; verified on prod models using eight GPUs, showing double-digit improvements

Differential Revision: D52493091

Pull Request resolved: https://github.com/pytorch/pytorch/pull/118550
Approved by: https://github.com/albanD
This commit is contained in:
Zheng Yan 2024-02-04 16:45:25 +00:00 committed by PyTorch MergeBot
parent b41f3e8df1
commit 052e824467
2 changed files with 35 additions and 8 deletions

View file

@ -9,6 +9,7 @@
#include <c10/util/ScopeExit.h>
#include <c10/util/UniqueVoidPtr.h>
#include <c10/util/flat_hash_map.h>
#include <c10/util/hash.h>
#include <c10/util/irange.h>
#include <c10/util/llvmMathExtras.h>
#include <c10/util/static_tracepoint.h>
@ -2836,14 +2837,28 @@ void local_raw_delete(void* ptr);
class NativeCachingAllocator : public CUDAAllocator {
private:
std::mutex mutex;
// Shard allocation region to have independent mutexes to reduce contention.
static constexpr size_t kNumMutexShard = 67;
// TODO: use std::hardware_destructive_interference_size once available
struct alignas(64) AlignedMutex {
std::mutex m;
};
std::array<AlignedMutex, kNumMutexShard> mutex;
// allocated blocks by device pointer
ska::flat_hash_map<void*, Block*> allocated_blocks;
std::array<ska::flat_hash_map<void*, Block*>, kNumMutexShard>
allocated_blocks;
static size_t get_mutex_shard_id(void* ptr) {
return twang_mix64((size_t)ptr) % kNumMutexShard;
}
void add_allocated_block(Block* block) {
std::lock_guard<std::mutex> lock(mutex);
allocated_blocks[block->ptr] = block;
const auto mutex_shard_id = get_mutex_shard_id(block->ptr);
std::lock_guard<std::mutex> lock(mutex[mutex_shard_id].m);
allocated_blocks[mutex_shard_id][block->ptr] = block;
}
c10::ApproximateClockToUnixTimeConverter clock_converter;
@ -2852,14 +2867,15 @@ class NativeCachingAllocator : public CUDAAllocator {
std::vector<std::unique_ptr<DeviceCachingAllocator>> device_allocator;
Block* get_allocated_block(void* ptr, bool remove = false) {
std::lock_guard<std::mutex> lock(mutex);
auto it = allocated_blocks.find(ptr);
if (it == allocated_blocks.end()) {
const auto mutex_shard_id = get_mutex_shard_id(ptr);
std::lock_guard<std::mutex> lock(mutex[mutex_shard_id].m);
auto it = allocated_blocks[mutex_shard_id].find(ptr);
if (it == allocated_blocks[mutex_shard_id].end()) {
return nullptr;
}
Block* block = it->second;
if (remove) {
allocated_blocks.erase(it);
allocated_blocks[mutex_shard_id].erase(it);
}
return block;
}

View file

@ -240,6 +240,17 @@ struct sha1 {
std::size_t bit_count_high{};
};
// 64-bit integer mix attributed to Thomas Wang ("twang"): diffuses the bits
// of `key` so that nearby inputs map to widely separated outputs. Every step
// (add/shift combination, xor-shift) is reversible modulo 2^64, so distinct
// keys always produce distinct hashes. Cheap enough for hot-path sharding.
constexpr uint64_t twang_mix64(uint64_t key) noexcept {
  uint64_t k = key;
  k = (~k) + (k << 21); // equivalent to k = k * ((1ULL << 21) - 1) - 1
  k ^= (k >> 24);
  k = (k + (k << 3)) + (k << 8); // equivalent to k *= 0x109 (1 + 8 + 256)
  k ^= (k >> 14);
  k = (k + (k << 2)) + (k << 4); // equivalent to k *= 0x15 (1 + 4 + 16)
  k ^= (k >> 28);
  k += (k << 31); // equivalent to k *= (1ULL << 31) + 1
  return k;
}
////////////////////////////////////////////////////////////////////////////////
// c10::hash implementation
////////////////////////////////////////////////////////////////////////////////