pytorch/c10/cuda/driver_api.cpp
Yifu Wang 78d69bfe11 [SymmetricMemory] introduce multicast support, multimem_all_reduce_ and multimem_one_shot_all_reduce (#133424)
### Summary
- Added multicast support to SymmetricMemory. If the cuda runtime and cuda driver have multicast support, SymmetricMemory associate all peer buffers with a multicast object and exposes the multicast virtual address.
- Implemented `multimem_all_reduce_` and `multimem_one_shot_all_reduce` based on the multicast support. The two variants shows different performance characteristic for different message size. We plan to use Inductor for collective algo selection (and required symmetric memory buffer allocation).

### Benchmark

8xH100 (non-standard version with HBM2e at 650W). NVSwitch V3 with NVLS support.

![image](https://github.com/user-attachments/assets/4998a16b-c2c0-4797-9dd0-1da2303df947)

![image](https://github.com/user-attachments/assets/278ad361-52cb-4864-82c6-bb67e8d0a3fe)

Differential Revision: [D61682507](https://our.internmc.facebook.com/intern/diff/D61682507)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133424
Approved by: https://github.com/yf225, https://github.com/weifengpy
2024-08-23 20:09:20 +00:00

52 lines
1.5 KiB
C++

#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
#include <c10/cuda/driver_api.h>
#include <c10/util/CallOnce.h>
#include <c10/util/Exception.h>
#include <dlfcn.h>
namespace c10::cuda {
namespace {
DriverAPI create_driver_api() {
void* handle_0 = dlopen("libcuda.so.1", RTLD_LAZY | RTLD_NOLOAD);
TORCH_CHECK(handle_0, "Can't open libcuda.so.1: ", dlerror());
void* handle_1 = DriverAPI::get_nvml_handle();
DriverAPI r{};
#define LOOKUP_LIBCUDA_ENTRY(name) \
r.name##_ = ((decltype(&name))dlsym(handle_0, #name)); \
TORCH_INTERNAL_ASSERT(r.name##_, "Can't find ", #name, ": ", dlerror())
C10_LIBCUDA_DRIVER_API(LOOKUP_LIBCUDA_ENTRY)
#undef LOOKUP_LIBCUDA_ENTRY
#define LOOKUP_LIBCUDA_ENTRY(name) \
r.name##_ = ((decltype(&name))dlsym(handle_0, #name)); \
dlerror();
C10_LIBCUDA_DRIVER_API_12030(LOOKUP_LIBCUDA_ENTRY)
#undef LOOKUP_LIBCUDA_ENTRY
if (handle_1) {
#define LOOKUP_NVML_ENTRY(name) \
r.name##_ = ((decltype(&name))dlsym(handle_1, #name)); \
TORCH_INTERNAL_ASSERT(r.name##_, "Can't find ", #name, ": ", dlerror())
C10_NVML_DRIVER_API(LOOKUP_NVML_ENTRY)
#undef LOOKUP_NVML_ENTRY
}
return r;
}
} // namespace
void* DriverAPI::get_nvml_handle() {
static void* nvml_hanle = dlopen("libnvidia-ml.so.1", RTLD_LAZY);
return nvml_hanle;
}
C10_EXPORT DriverAPI* DriverAPI::get() {
static DriverAPI singleton = create_driver_api();
return &singleton;
}
} // namespace c10::cuda
#endif