From 6ff3383157b9a9d0342822639919683ded827483 Mon Sep 17 00:00:00 2001
From: Taras
Date: Thu, 6 Feb 2025 15:58:20 +0000
Subject: [PATCH] Enable CUPTI on Windows (#141454)

Fixes:
- https://github.com/pytorch/pytorch/issues/93855

The PR enables CUPTI on Windows and enables unit tests to check CUDA profiling events.
Additionally, the changes can be verified using the following script:
```
import torch
from torch.profiler import profile, ProfilerActivity

def check_cupti_enabled():
    # Check if CUDA is available
    if not torch.cuda.is_available():
        print("CUDA is not available on this system.")
        return False

    # Create a simple CUDA tensor
    x = torch.randn(1000, 1000, device="cuda")
    y = torch.randn(1000, 1000, device="cuda")

    try:
        # Use PyTorch profiler to perform a basic check
        with profile(activities=[ProfilerActivity.CUDA]) as prof:
            z = x @ y  # Simple CUDA operation

        # Print profiling results
        print("CUPTI is enabled and profiling works.")
        print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
        return True

    except RuntimeError as e:
        # If profiling fails, CUPTI is likely not set up correctly
        print("Error: CUPTI might not be enabled or accessible.")
        print(f"Details: {e}")
        return False

if __name__ == "__main__":
    if check_cupti_enabled():
        print("CUPTI is properly configured in PyTorch.")
    else:
        print("CUPTI is not configured correctly. Check your CUDA installation.")
```

Sample output:
```
CUPTI is enabled and profiling works.
---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
                       Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
     sgemm_128x128x8_NN_vec         0.00%       0.000us         0.00%       0.000us       0.000us       2.086ms       100.00%       2.086ms       2.086ms             1
                   cudaFree         9.67%       9.816ms         9.67%       9.816ms       9.816ms       0.000us         0.00%       0.000us       0.000us             1
     cudaDeviceGetAttribute         0.01%      10.000us         0.01%      10.000us       0.476us       0.000us         0.00%       0.000us       0.000us            21
    cudaGetDriverEntryPoint         0.00%       1.700us         0.00%       1.700us       0.850us       0.000us         0.00%       0.000us       0.000us             2
       cudaGetSymbolAddress        85.15%      86.438ms        85.15%      86.438ms      86.438ms       0.000us         0.00%       0.000us       0.000us             1
                 cudaMalloc         0.43%     433.300us         0.43%     433.300us     144.433us       0.000us         0.00%       0.000us       0.000us             3
           cudaLaunchKernel         2.61%       2.648ms         2.61%       2.648ms       2.648ms       0.000us         0.00%       0.000us       0.000us             1
      cudaDeviceSynchronize         2.13%       2.163ms         2.13%       2.163ms       2.163ms       0.000us         0.00%       0.000us       0.000us             1
---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
Self CPU time total: 101.511ms
Self CUDA time total: 2.086ms

CUPTI is properly configured in PyTorch.
``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/141454 Approved by: https://github.com/malfet --- caffe2/CMakeLists.txt | 2 +- cmake/Dependencies.cmake | 2 +- test/profiler/test_profiler.py | 7 +------ 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index d8ea2c06de5..44b1d0213ee 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1559,7 +1559,7 @@ if(USE_CUDA) # FIXME: If kineto is linked with CUPTI it pollutes torch_cpu with CUDA dependencies # Even worse, it never declares that it depends on cudart, but calls the API, see # https://github.com/pytorch/kineto/blob/aef2f5c0f15e3be52406ac0b885e8689de6bc9f6/libkineto/src/CudaDeviceProperties.cpp#L24 - if(USE_KINETO AND NOT MSVC AND NOT LIBKINETO_NOCUPTI) + if(USE_KINETO AND NOT LIBKINETO_NOCUPTI) target_link_libraries(torch_cpu PRIVATE torch::cudart) endif() target_link_libraries(torch_cuda INTERFACE torch::cudart) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 9342555d9bc..114ce44adb9 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1568,7 +1568,7 @@ if(USE_KINETO AND INTERN_BUILD_MOBILE AND USE_LITE_INTERPRETER_PROFILER AND (USE endif() if(USE_KINETO) - if((NOT USE_CUDA) OR MSVC) + if(NOT USE_CUDA) set(LIBKINETO_NOCUPTI ON CACHE STRING "" FORCE) else() set(LIBKINETO_NOCUPTI OFF CACHE STRING "") diff --git a/test/profiler/test_profiler.py b/test/profiler/test_profiler.py index 79bf93e63cb..a8a7e559db2 100644 --- a/test/profiler/test_profiler.py +++ b/test/profiler/test_profiler.py @@ -289,8 +289,7 @@ class TestProfiler(TestCase): ) ) - # TODO: https://github.com/pytorch/kineto/issues/617 - if kineto_available() and not IS_WINDOWS: + if kineto_available(): with TemporaryFileName(mode="w+") as fname: p.export_chrome_trace(fname) with open(fname) as f: @@ -1360,11 +1359,7 @@ class TestProfiler(TestCase): finally: torch._C._profiler._set_fwd_bwd_enabled_val(True) - # This test is 
broken on Windows, the likely reason is that kineto/CUPTI - # is not supported that particular environment. Once the CI stabilizes - # we can narrow the condition so Windows is checked as well (TODO) @unittest.skipIf(not kineto_available(), "Kineto is required") - @unittest.skipIf(IS_WINDOWS, "Test does not work on Windows") @unittest.skipIf(not torch.cuda.is_available(), "CUDA is required") def test_profiler_cuda_sync_events(self): device = torch.device("cuda:0")