diff --git a/c10/cuda/CUDADeviceAssertion.h b/c10/cuda/CUDADeviceAssertion.h index 76c422d83bd..65aca3c6399 100644 --- a/c10/cuda/CUDADeviceAssertion.h +++ b/c10/cuda/CUDADeviceAssertion.h @@ -18,7 +18,7 @@ static __device__ void dstrcpy(char* dst, const char* src) { *dst = '\0'; } -__device__ __noinline__ void dsa_add_new_assertion_failure( +__device__ void dsa_add_new_assertion_failure( DeviceAssertionsData* assertions_data, const char* assertion_msg, const char* filename, diff --git a/c10/cuda/CUDADeviceAssertionHost.cpp b/c10/cuda/CUDADeviceAssertionHost.cpp index 58ece480799..7c898e2b5d0 100644 --- a/c10/cuda/CUDADeviceAssertionHost.cpp +++ b/c10/cuda/CUDADeviceAssertionHost.cpp @@ -21,6 +21,33 @@ namespace cuda { namespace { +#ifdef TORCH_USE_CUDA_DSA +/// Get current device id +/// We need our own implementation of this function to prevent +/// an infinite initialization loop for CUDAKernelLaunchRegistry +int dsa_get_device_id() { + int device = -1; + C10_CUDA_ERROR_HANDLED(cudaGetDevice(&device)); + CHECK_CUDA_API_CALL_WITHOUT_CHECKING_DEVICE_ASSERTS(); + return device; +} + +/// Get a device's compute capability - note that this dangerously assumes +/// that if one CUDA GPU supports device-side assertions they all do. This is +/// probably fine since the latest CUDA GPU that doesn't support UVM is the +/// K80 released 2014-11-17. Mixing that GPU with a newer one is likely to be +/// rare enough that the defensive +/// We need our own implementation of this function to prevent +/// an infinite initialization loop for CUDAKernelLaunchRegistry +int dsa_get_device_compute_capability(const int device_num) { + int compute_capability = -1; + C10_CUDA_ERROR_HANDLED(cudaDeviceGetAttribute( + &compute_capability, cudaDevAttrComputeCapabilityMajor, device_num)); + CHECK_CUDA_API_CALL_WITHOUT_CHECKING_DEVICE_ASSERTS(); + return compute_capability; +} +#endif + /// Get the number of CUDA devices /// We need our own implementation of this function to prevent /// an infinite initialization loop for CUDAKernelLaunchRegistry @@ -60,40 +87,13 @@ void uvm_deleter(DeviceAssertionsData* uvm_assertions_ptr) { } } -#ifdef TORCH_USE_CUDA_DSA -/// Get current device id -/// We need our own implementation of this function to prevent -/// an infinite initialization loop for CUDAKernelLaunchRegistry -int dsa_get_device_id() { - int device = -1; - C10_CUDA_ERROR_HANDLED(cudaGetDevice(&device)); - CHECK_CUDA_API_CALL_WITHOUT_CHECKING_DEVICE_ASSERTS(); - return device; -} - -/// Get a device's compute capability - note that this dangerously assumes -/// that if one CUDA GPU supports device-side assertions they all do. This is -/// probably fine since the latest CUDA GPU that doesn't support UVM is the -/// K80 released 2014-11-17. Mixing that GPU with a newer one is likely to be -/// rare enough that the defensive -/// We need our own implementation of this function to prevent -/// an infinite initialization loop for CUDAKernelLaunchRegistry -int dsa_get_device_compute_capability(const int device_num) { - int compute_capability = -1; - C10_CUDA_ERROR_HANDLED(cudaDeviceGetAttribute( - &compute_capability, cudaDevAttrComputeCapabilityMajor, device_num)); - CHECK_CUDA_API_CALL_WITHOUT_CHECKING_DEVICE_ASSERTS(); - return compute_capability; -} -#endif - } // namespace /// Check that kernels ran correctly by checking the message buffer. BLOCKING. std::string c10_retrieve_device_side_assertion_info() { #ifdef TORCH_USE_CUDA_DSA const auto& launch_registry = CUDAKernelLaunchRegistry::get_singleton_ref(); - if (!launch_registry.enabled) { + if (!launch_registry.enabled_at_runtime) { return "Device-side assertion tracking was not enabled by user."; } else if (!launch_registry.do_all_devices_support_managed_memory) { return "Device-side assertions disabled because not all devices support managed memory."; @@ -115,20 +115,7 @@ std::string c10_retrieve_device_side_assertion_info() { std::stringstream oss; - { - oss << "This process interacted the following GPUs = {"; - bool first_gpu_listed = true; - for (const auto& x : uvm_assertions) { - if (x) { - if (!first_gpu_listed) { - oss << "," - } - first_gpu_listed = true; - oss << x; - } - } - oss << "}" << std::endl; - } + oss << "Looking for device-side assertion failure information...\n"; // Loop over each device that could be managed by the process for (const auto device_num : c10::irange(assertion_data.size())) { @@ -202,7 +189,7 @@ CUDAKernelLaunchRegistry::CUDAKernelLaunchRegistry() : do_all_devices_support_managed_memory( dsa_check_if_all_devices_support_managed_memory()), gather_launch_stacktrace(check_env_for_enable_launch_stacktracing()), - enabled(check_env_for_dsa_enabled()) { + enabled_at_runtime(check_env_for_dsa_enabled()) { for (C10_UNUSED const auto _ : c10::irange(dsa_get_device_count())) { uvm_assertions.emplace_back(nullptr, uvm_deleter); } @@ -226,7 +213,7 @@ uint32_t CUDAKernelLaunchRegistry::insert( const char* kernel_name, const int32_t stream_id) { #ifdef TORCH_USE_CUDA_DSA - if (!is_enabled()) { + if (!enabled_at_runtime) { return 0; } @@ -274,7 +261,7 @@ CUDAKernelLaunchRegistry::snapshot() const { DeviceAssertionsData* CUDAKernelLaunchRegistry:: get_uvm_assertions_ptr_for_current_device() { #ifdef TORCH_USE_CUDA_DSA - if (!is_enabled()) { + if (!enabled_at_runtime) { return nullptr; } @@ -352,16 +339,5 @@ bool CUDAKernelLaunchRegistry::has_failed() const { return false; } -bool CUDAKernelLaunchRegistry::is_enabled() const { -#ifdef TORCH_USE_CUDA_DSA - std::cerr << "" -#else - std::cerr - << "TORCH_USE_CUDA_DSA not enabled in CUDAKernelLaunchRegistry::is_enabled" - << std::endl; - return false; -#endif -} - } // namespace cuda } // namespace c10 diff --git a/c10/cuda/CUDADeviceAssertionHost.h b/c10/cuda/CUDADeviceAssertionHost.h index 7465f3d36b2..9e873cb8272 100644 --- a/c10/cuda/CUDADeviceAssertionHost.h +++ b/c10/cuda/CUDADeviceAssertionHost.h @@ -130,13 +130,15 @@ class C10_CUDA_API CUDAKernelLaunchRegistry { /// Whether or not to gather stack traces when launching kernels bool gather_launch_stacktrace = false; /// Whether or not host-side DSA is enabled or disabled at run-time - /// Device-side code cannot be adjusted at run-time - bool enabled = false; + /// Note: Device-side code cannot be enabled/disabled at run-time + bool enabled_at_runtime = false; /// Whether or not a device has indicated a failure bool has_failed() const; - /// Since multiple mechanisms can enable/disable, we add a function that - /// aggregates them - bool is_enabled() const; +#ifdef TORCH_USE_CUDA_DSA + const bool enabled_at_compile_time = true; +#else + const bool enabled_at_compile_time = false; +#endif }; std::string c10_retrieve_device_side_assertion_info(); @@ -147,9 +149,9 @@ std::string c10_retrieve_device_side_assertion_info(); // Each kernel launched with TORCH_DSA_KERNEL_LAUNCH // requires the same input arguments. We introduce the following macro to // standardize these. -#define TORCH_DSA_KERNEL_ARGS \ - c10::cuda::DeviceAssertionsData *const assertions_data, \ - uint32_t assertion_caller_id +#define TORCH_DSA_KERNEL_ARGS \ + [[maybe_unused]] c10::cuda::DeviceAssertionsData *const assertions_data, \ + [[maybe_unused]] uint32_t assertion_caller_id // This macro can be used to pass the DSA arguments onward to another // function diff --git a/c10/cuda/CUDAMiscFunctions.cpp b/c10/cuda/CUDAMiscFunctions.cpp index c9e6de12dca..37225d2a9a0 100644 --- a/c10/cuda/CUDAMiscFunctions.cpp +++ b/c10/cuda/CUDAMiscFunctions.cpp @@ -12,7 +12,7 @@ const char* get_cuda_check_suffix() noexcept { return ""; } else { return "\nCUDA kernel errors might be asynchronously reported at some" - " other API call,so the stacktrace below might be incorrect." + " other API call, so the stacktrace below might be incorrect." "\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1."; } } diff --git a/c10/cuda/test/impl/CUDAAssertionsTest_1_var_test.cu b/c10/cuda/test/impl/CUDAAssertionsTest_1_var_test.cu index f30774102a4..e3ed2d14329 100644 --- a/c10/cuda/test/impl/CUDAAssertionsTest_1_var_test.cu +++ b/c10/cuda/test/impl/CUDAAssertionsTest_1_var_test.cu @@ -14,19 +14,13 @@ using ::testing::HasSubstr; void did_not_fail_diagnostics() { -#ifdef TORCH_USE_CUDA_DSA - std::cerr << "DSA was enabled" << std::endl; -#else - std::cerr << "DSA was not enabled" << std::endl; -#endif - std::cerr - << "c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled = " - << c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled + << "c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled_at_runtime = " + << c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled_at_runtime << std::endl; std::cerr - << "c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().is_enabled() = " - << c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().is_enabled() + << "c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled_at_compile_time = " + << c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled_at_compile_time << std::endl; std::cerr << "c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().do_all_devices_support_managed_memory = " @@ -92,11 +86,10 @@ void cuda_device_assertions_1_var_test() { TEST(CUDATest, cuda_device_assertions_1_var_test) { #ifdef TORCH_USE_CUDA_DSA - c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled = true; - std::cerr << "BEFORE TEST" << std::endl; + c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled_at_runtime = true; did_not_fail_diagnostics(); cuda_device_assertions_1_var_test(); #else - GTEST_SKIP() << "CUDA device-side assertions (DSA) was not enabled."; + GTEST_SKIP() << "CUDA device-side assertions (DSA) was not enabled at compile time."; #endif } diff --git a/c10/cuda/test/impl/CUDAAssertionsTest_catches_stream.cu b/c10/cuda/test/impl/CUDAAssertionsTest_catches_stream.cu index 71fcf3ee249..85b419ed48a 100644 --- a/c10/cuda/test/impl/CUDAAssertionsTest_catches_stream.cu +++ b/c10/cuda/test/impl/CUDAAssertionsTest_catches_stream.cu @@ -96,6 +96,6 @@ TEST(CUDATest, cuda_device_assertions_catches_stream) { c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled = true; cuda_device_assertions_catches_stream(); #else - GTEST_SKIP() << "CUDA device-side assertions (DSA) was not enabled."; + GTEST_SKIP() << "CUDA device-side assertions (DSA) was not enabled at compile time."; #endif } diff --git a/c10/cuda/test/impl/CUDAAssertionsTest_catches_thread_and_block_and_device.cu b/c10/cuda/test/impl/CUDAAssertionsTest_catches_thread_and_block_and_device.cu index 1a0a0b475a0..6cd44817057 100644 --- a/c10/cuda/test/impl/CUDAAssertionsTest_catches_thread_and_block_and_device.cu +++ b/c10/cuda/test/impl/CUDAAssertionsTest_catches_thread_and_block_and_device.cu @@ -81,6 +81,6 @@ TEST(CUDATest, cuda_device_assertions_catches_thread_and_block_and_device) { c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled = true; cuda_device_assertions_catches_thread_and_block_and_device(); #else - GTEST_SKIP() << "CUDA device-side assertions (DSA) was not enabled."; + GTEST_SKIP() << "CUDA device-side assertions (DSA) was not enabled at compile time."; #endif } diff --git a/c10/cuda/test/impl/CUDAAssertionsTest_from_2_processes.cu b/c10/cuda/test/impl/CUDAAssertionsTest_from_2_processes.cu index 3f829259a6b..6be83404045 100644 --- a/c10/cuda/test/impl/CUDAAssertionsTest_from_2_processes.cu +++ b/c10/cuda/test/impl/CUDAAssertionsTest_from_2_processes.cu @@ -99,7 +99,7 @@ TEST(CUDATest, cuda_device_assertions_from_2_processes) { c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled = true; cuda_device_assertions_from_2_processes(); #else - GTEST_SKIP() << "CUDA device-side assertions (DSA) was not enabled."; + GTEST_SKIP() << "CUDA device-side assertions (DSA) was not enabled at compile time."; #endif } diff --git a/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_blocks_and_threads.cu b/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_blocks_and_threads.cu index f5f8597f20c..8072e310cd4 100644 --- a/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_blocks_and_threads.cu +++ b/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_blocks_and_threads.cu @@ -88,6 +88,6 @@ TEST(CUDATest, cuda_device_assertions_multiple_writes_from_blocks_and_threads) { c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled = true; cuda_device_assertions_multiple_writes_from_blocks_and_threads(); #else - GTEST_SKIP() << "CUDA device-side assertions (DSA) was not enabled."; + GTEST_SKIP() << "CUDA device-side assertions (DSA) was not enabled at compile time."; #endif } diff --git a/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_multiple_blocks.cu b/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_multiple_blocks.cu index a66c792d5a2..81784ab4ffd 100644 --- a/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_multiple_blocks.cu +++ b/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_multiple_blocks.cu @@ -85,6 +85,6 @@ TEST(CUDATest, cuda_device_assertions_multiple_writes_from_multiple_blocks) { c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled = true; cuda_device_assertions_multiple_writes_from_multiple_blocks(); #else - GTEST_SKIP() << "CUDA device-side assertions (DSA) was not enabled."; + GTEST_SKIP() << "CUDA device-side assertions (DSA) was not enabled at compile time."; #endif } diff --git a/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_same_block.cu b/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_same_block.cu index f1e39c8ba19..8858e65467b 100644 --- a/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_same_block.cu +++ b/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_same_block.cu @@ -73,6 +73,6 @@ TEST(CUDATest, cuda_device_assertions_multiple_writes_from_same_block) { c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled = true; cuda_device_assertions_multiple_writes_from_same_block(); #else - GTEST_SKIP() << "CUDA device-side assertions (DSA) was not enabled."; + GTEST_SKIP() << "CUDA device-side assertions (DSA) was not enabled at compile time."; #endif } diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h index 611b2550c7e..ae5244a95a3 100644 --- a/caffe2/core/context_gpu.h +++ b/caffe2/core/context_gpu.h @@ -201,6 +201,10 @@ class CAFFE2_CUDA_API CUDAContext final : public BaseContext { return gpu_id_; } + inline c10::cuda::CUDAStream stream() const { + return at::cuda::getStreamFromExternal(getCudaObjects().GetStream(gpu_id_), gpu_id_); + } + inline cudaStream_t cuda_stream() const { return getCudaObjects().GetStream(gpu_id_); }