[inductor] Use torch.cuda.clock_rate instead of triton.testing.nvsmi (#118662)

`triton.testing.nvsmi` invokes `nvidia-smi` as a subprocess, and Meta
prod usually doesn't make nvidia-smi available.  Might as well just use
something that's native to torch.

Differential Revision: [D53235814](https://our.internmc.facebook.com/intern/diff/D53235814/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/118662
Approved by: https://github.com/jansel
This commit is contained in:
Bert Maher 2024-02-13 14:31:01 -08:00 committed by PyTorch MergeBot
parent 80379ef0aa
commit 563f1b9fef
2 changed files with 10 additions and 2 deletions

View file

@@ -1162,9 +1162,9 @@ def get_device_tflops(dtype):
if inspect.signature(get_max_simd_tflops).parameters.get("clock_rate"):
# Triton API change in https://github.com/openai/triton/pull/2293
from triton.testing import nvsmi
from torch._utils_internal import max_clock_rate
sm_clock = nvsmi(["clocks.max.sm"])[0]
sm_clock = max_clock_rate()
if dtype in (torch.float16, torch.bfloat16):
return get_max_tensorcore_tflops(dtype, sm_clock)

View file

@@ -1,3 +1,4 @@
import functools
import logging
import os
import sys
@@ -94,6 +95,13 @@ def log_export_usage(**kwargs):
pass
@functools.lru_cache(None)
def max_clock_rate():
    """Return the device's maximum SM clock as reported by ``nvidia-smi``.

    Queries ``clocks.max.sm`` through ``triton.testing.nvsmi`` (which shells
    out to ``nvidia-smi``) and returns the first value.  The result is
    memoized via ``functools.lru_cache`` so the subprocess is spawned at most
    once per process.

    NOTE(review): units are whatever ``nvidia-smi`` reports for
    ``clocks.max.sm`` (presumably MHz) — confirm against callers.
    """
    # Imported lazily so importing this module does not require triton.
    from triton.testing import nvsmi

    return nvsmi(["clocks.max.sm"])[0]
TEST_MASTER_ADDR = "127.0.0.1"
TEST_MASTER_PORT = 29500
# USE_GLOBAL_DEPS controls whether __init__.py tries to load