mirror of
https://github.com/saymrwulf/pytorch.git
synced 2026-05-14 20:57:59 +00:00
### Summary - Added multicast support to SymmetricMemory. If the CUDA runtime and CUDA driver have multicast support, SymmetricMemory associates all peer buffers with a multicast object and exposes the multicast virtual address. - Implemented `multimem_all_reduce_` and `multimem_one_shot_all_reduce` based on the multicast support. The two variants show different performance characteristics for different message sizes. We plan to use Inductor for collective algo selection (and required symmetric memory buffer allocation). ### Benchmark 8xH100 (non-standard version with HBM2e at 650W). NVSwitch V3 with NVLS support.   Differential Revision: [D61682507](https://our.internmc.facebook.com/intern/diff/D61682507) Pull Request resolved: https://github.com/pytorch/pytorch/pull/133424 Approved by: https://github.com/yf225, https://github.com/weifengpy
52 lines
1.5 KiB
C++
52 lines
1.5 KiB
C++
#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
|
|
#include <c10/cuda/driver_api.h>
|
|
#include <c10/util/CallOnce.h>
|
|
#include <c10/util/Exception.h>
|
|
#include <dlfcn.h>
|
|
|
|
namespace c10::cuda {
|
|
|
|
namespace {
|
|
|
|
// Builds a DriverAPI function table by resolving entry points with dlsym().
//
// Resolution rules, in order:
//  * libcuda.so.1 is opened with RTLD_NOLOAD, i.e. it is only looked up among
//    libraries already resident in the process; a null handle raises through
//    TORCH_CHECK.
//  * Every symbol enumerated by C10_LIBCUDA_DRIVER_API must resolve, otherwise
//    TORCH_INTERNAL_ASSERT fires.
//  * Symbols enumerated by C10_LIBCUDA_DRIVER_API_12030 are optional: a failed
//    lookup simply leaves the corresponding member null.
//    NOTE(review): the 12030 suffix presumably denotes a minimum CUDA
//    version -- confirm against driver_api.h.
//  * Symbols enumerated by C10_NVML_DRIVER_API are resolved only when
//    DriverAPI::get_nvml_handle() returned a non-null handle; in that case
//    each of them must resolve.
//
// Returns the populated table by value.
DriverAPI create_driver_api() {
  // dlerror() returns NULL when no error is pending (for example when a
  // RTLD_NOLOAD lookup merely reports "not resident", or when dlsym()
  // legitimately yields a null symbol value). A null `const char*` must never
  // reach the error-message formatting, so substitute a placeholder.
  [[maybe_unused]] const auto dl_error_or_placeholder = []() -> const char* {
    const char* msg = dlerror();
    return msg != nullptr ? msg : "(no dlerror message available)";
  };

  void* handle_0 = dlopen("libcuda.so.1", RTLD_LAZY | RTLD_NOLOAD);
  TORCH_CHECK(
      handle_0, "Can't open libcuda.so.1: ", dl_error_or_placeholder());
  void* handle_1 = DriverAPI::get_nvml_handle();
  DriverAPI r{};

// Mandatory libcuda entry points: a missing symbol is an internal error.
#define LOOKUP_LIBCUDA_ENTRY(name)                       \
  r.name##_ = ((decltype(&name))dlsym(handle_0, #name)); \
  TORCH_INTERNAL_ASSERT(                                 \
      r.name##_, "Can't find ", #name, ": ", dl_error_or_placeholder())
  C10_LIBCUDA_DRIVER_API(LOOKUP_LIBCUDA_ENTRY)
#undef LOOKUP_LIBCUDA_ENTRY

// Optional libcuda entry points: keep whatever dlsym() returned (possibly
// null) and call dlerror() only to discard any pending error state.
#define LOOKUP_LIBCUDA_ENTRY(name)                       \
  r.name##_ = ((decltype(&name))dlsym(handle_0, #name)); \
  dlerror();
  C10_LIBCUDA_DRIVER_API_12030(LOOKUP_LIBCUDA_ENTRY)
#undef LOOKUP_LIBCUDA_ENTRY

  // NVML is optional as a whole; once the library is present, every listed
  // symbol is required.
  if (handle_1) {
#define LOOKUP_NVML_ENTRY(name)                          \
  r.name##_ = ((decltype(&name))dlsym(handle_1, #name)); \
  TORCH_INTERNAL_ASSERT(                                 \
      r.name##_, "Can't find ", #name, ": ", dl_error_or_placeholder())
    C10_NVML_DRIVER_API(LOOKUP_NVML_ENTRY)
#undef LOOKUP_NVML_ENTRY
  }
  return r;
}
|
|
} // namespace
|
|
|
|
// Returns the dlopen() handle for libnvidia-ml.so.1.
//
// The library is opened (RTLD_LAZY) the first time this function runs and the
// resulting handle is cached in a function-local static, so later calls return
// the same value. The result is whatever dlopen() produced, which is null if
// the library could not be opened; callers must check before using it.
void* DriverAPI::get_nvml_handle() {
  static void* const nvml_handle = dlopen("libnvidia-ml.so.1", RTLD_LAZY);
  return nvml_handle;
}
|
|
|
|
// Accessor for the process-wide DriverAPI table.
//
// The table is built by create_driver_api() on the first call and stored in a
// function-local static; every call returns the address of that same object,
// so the returned pointer is never null. Any failure raised while building
// the table propagates out of the first call.
C10_EXPORT DriverAPI* DriverAPI::get() {
  static DriverAPI driver_api_table = create_driver_api();
  DriverAPI* const table_ptr = &driver_api_table;
  return table_ptr;
}
|
|
|
|
} // namespace c10::cuda
|
|
|
|
#endif
|