[inductor] Use torch.cuda.clock_rate instead of triton.testing.nvsmi (#118662)

`triton.testing.nvsmi` invokes `nvidia-smi` as a subprocess, and Meta
prod usually doesn't make nvidia-smi available.  Might as well just use
something that's native to torch.

Differential Revision: [D53235814](https://our.internmc.facebook.com/intern/diff/D53235814/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/118662
Approved by: https://github.com/jansel
This commit is contained in:
Bert Maher 2024-02-13 14:31:01 -08:00 committed by PyTorch MergeBot
parent 80379ef0aa
commit 563f1b9fef
2 changed files with 10 additions and 2 deletions

View file

@@ -1162,9 +1162,9 @@ def get_device_tflops(dtype):
if inspect.signature(get_max_simd_tflops).parameters.get("clock_rate"):
# Triton API change in https://github.com/openai/triton/pull/2293
from triton.testing import nvsmi
from torch._utils_internal import max_clock_rate
sm_clock = nvsmi(["clocks.max.sm"])[0]
sm_clock = max_clock_rate()
if dtype in (torch.float16, torch.bfloat16):
return get_max_tensorcore_tflops(dtype, sm_clock)

View file

@@ -1,3 +1,4 @@
import functools
import logging
import os
import sys
@@ -94,6 +95,13 @@ def log_export_usage(**kwargs):
pass
@functools.lru_cache(None)
def max_clock_rate():
    """Return the device's maximum SM clock as reported by ``nvidia-smi``.

    Queries ``clocks.max.sm`` through ``triton.testing.nvsmi`` (which shells
    out to ``nvidia-smi``) and returns the first value.  The result is
    memoized via ``functools.lru_cache`` so the subprocess is spawned at most
    once per process.

    NOTE(review): units are whatever ``nvidia-smi`` reports for
    ``clocks.max.sm`` (presumably MHz) — confirm against callers.
    """
    # Imported lazily so importing this module does not require triton.
    from triton.testing import nvsmi

    return nvsmi(["clocks.max.sm"])[0]
TEST_MASTER_ADDR = "127.0.0.1"
TEST_MASTER_PORT = 29500
# USE_GLOBAL_DEPS controls whether __init__.py tries to load