From 6ff3383157b9a9d0342822639919683ded827483 Mon Sep 17 00:00:00 2001
From: Taras <taras@janeasystems.com>
Date: Thu, 6 Feb 2025 15:58:20 +0000
Subject: [PATCH] Enable CUPTI on Windows (#141454)

Fixes:
- https://github.com/pytorch/pytorch/issues/93855

The PR enables CUPTI on Windows and enables unit tests to check CUDA profiling events.
Additionally, the changes can be verified using the following script:

```
import torch
from torch.profiler import profile, ProfilerActivity

def check_cupti_enabled():
    # Check if CUDA is available
    if not torch.cuda.is_available():
        print("CUDA is not available on this system.")
        return False

    # Create a simple CUDA tensor
    x = torch.randn(1000, 1000, device="cuda")
    y = torch.randn(1000, 1000, device="cuda")

    try:
        # Use PyTorch profiler to perform a basic check
        with profile(activities=[ProfilerActivity.CUDA]) as prof:
            z = x @ y  # Simple CUDA operation

        # Print profiling results
        print("CUPTI is enabled and profiling works.")
        print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
        return True
    except RuntimeError as e:
        # If profiling fails, CUPTI is likely not set up correctly
        print("Error: CUPTI might not be enabled or accessible.")
        print(f"Details: {e}")
        return False

if __name__ == "__main__":
    if check_cupti_enabled():
        print("CUPTI is properly configured in PyTorch.")
    else:
        print("CUPTI is not configured correctly. Check your CUDA installation.")
```

Sample output:
```
CUPTI is enabled and profiling works.
---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
                       Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
     sgemm_128x128x8_NN_vec         0.00%       0.000us         0.00%       0.000us       0.000us       2.086ms       100.00%       2.086ms       2.086ms             1
                   cudaFree         9.67%       9.816ms         9.67%       9.816ms       9.816ms       0.000us         0.00%       0.000us       0.000us             1
     cudaDeviceGetAttribute         0.01%      10.000us         0.01%      10.000us       0.476us       0.000us         0.00%       0.000us       0.000us            21
    cudaGetDriverEntryPoint         0.00%       1.700us         0.00%       1.700us       0.850us       0.000us         0.00%       0.000us       0.000us             2
       cudaGetSymbolAddress        85.15%      86.438ms        85.15%      86.438ms      86.438ms       0.000us         0.00%       0.000us       0.000us             1
                 cudaMalloc         0.43%     433.300us         0.43%     433.300us     144.433us       0.000us         0.00%       0.000us       0.000us             3
           cudaLaunchKernel         2.61%       2.648ms         2.61%       2.648ms       2.648ms       0.000us         0.00%       0.000us       0.000us             1
      cudaDeviceSynchronize         2.13%       2.163ms         2.13%       2.163ms       2.163ms       0.000us         0.00%       0.000us       0.000us             1
---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
Self CPU time total: 101.511ms
Self CUDA time total: 2.086ms

CUPTI is properly configured in PyTorch.
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/141454
Approved by: https://github.com/malfet
---
 caffe2/CMakeLists.txt          | 2 +-
 cmake/Dependencies.cmake       | 2 +-
 test/profiler/test_profiler.py | 7 +------
 3 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index d8ea2c06de5..44b1d0213ee 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -1559,7 +1559,7 @@ if(USE_CUDA)
   # FIXME: If kineto is linked with CUPTI it pollutes torch_cpu with CUDA dependencies
   # Even worse, it never declares that it depends on cudart, but calls the API, see
   # https://github.com/pytorch/kineto/blob/aef2f5c0f15e3be52406ac0b885e8689de6bc9f6/libkineto/src/CudaDeviceProperties.cpp#L24
-  if(USE_KINETO AND NOT MSVC AND NOT LIBKINETO_NOCUPTI)
+  if(USE_KINETO AND NOT LIBKINETO_NOCUPTI)
     target_link_libraries(torch_cpu PRIVATE torch::cudart)
   endif()
   target_link_libraries(torch_cuda INTERFACE torch::cudart)
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 9342555d9bc..114ce44adb9 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -1568,7 +1568,7 @@ if(USE_KINETO AND INTERN_BUILD_MOBILE AND USE_LITE_INTERPRETER_PROFILER AND (USE
 endif()
 
 if(USE_KINETO)
-  if((NOT USE_CUDA) OR MSVC)
+  if(NOT USE_CUDA)
     set(LIBKINETO_NOCUPTI ON CACHE STRING "" FORCE)
   else()
     set(LIBKINETO_NOCUPTI OFF CACHE STRING "")
diff --git a/test/profiler/test_profiler.py b/test/profiler/test_profiler.py
index 79bf93e63cb..a8a7e559db2 100644
--- a/test/profiler/test_profiler.py
+++ b/test/profiler/test_profiler.py
@@ -289,8 +289,7 @@ class TestProfiler(TestCase):
                     )
                 )
 
-        # TODO: https://github.com/pytorch/kineto/issues/617
-        if kineto_available() and not IS_WINDOWS:
+        if kineto_available():
             with TemporaryFileName(mode="w+") as fname:
                 p.export_chrome_trace(fname)
                 with open(fname) as f:
@@ -1360,11 +1359,7 @@ class TestProfiler(TestCase):
         finally:
             torch._C._profiler._set_fwd_bwd_enabled_val(True)
 
-    # This test is broken on Windows, the likely reason is that kineto/CUPTI
-    # is not supported that particular environment. Once the CI stabilizes
-    # we can narrow the condition so Windows is checked as well (TODO)
     @unittest.skipIf(not kineto_available(), "Kineto is required")
-    @unittest.skipIf(IS_WINDOWS, "Test does not work on Windows")
     @unittest.skipIf(not torch.cuda.is_available(), "CUDA is required")
     def test_profiler_cuda_sync_events(self):
         device = torch.device("cuda:0")