torch.compile benchmark utility (#97699)
I've had many exchanges that look like this one: https://github.com/rasbt/faster-pytorch-blog/pull/2, so this is an attempt to make this problem easier.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/97699
Approved by: https://github.com/ezyang
parent 455795c799
commit bc8cb62bcb
3 changed files with 231 additions and 0 deletions
@@ -41,6 +41,8 @@ Optimizations can be passed in :func:`~torch.compile` with either a backend mode
The default backend is ``inductor``, which is likely to be the most reliable and performant option for most users and library maintainers;
other backends are there for power users who don't mind more experimental community support.

There is some nuance involved in benchmarking ``torch.compile``, so we've provided a utility to make this simpler with :func:`~torch.utils.benchmark.utils.compile.bench_all`.

You can get the full list of community backends by running :func:`~torch._dynamo.list_backends`.

.. autosummary::
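A minimal usage sketch of the utility this PR adds (assuming a CUDA device and ``tabulate`` installed; the linear model and tensor shapes are illustrative, not from the PR):

    import torch
    from torch.utils.benchmark.utils.compile import bench_all

    # Illustrative toy model and input
    model = torch.nn.Linear(16, 16).cuda()
    sample_input = torch.randn(64, 16).cuda()

    # Compares eager mode against every backend from torch._dynamo.list_backends()
    # and returns a github-flavored table string
    print(bench_all(model, sample_input, num_iters=5))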
test/test_compile_benchmark_util.py (new file, 42 lines)
@@ -0,0 +1,42 @@
# Owner(s): ["module: dynamo"]

import torch
import torch._dynamo as torchdynamo
from torch.testing._internal.common_utils import TestCase, run_tests
import unittest

# We cannot import TEST_CUDA from torch.testing._internal.common_cuda here, because if we do that,
# the TEST_CUDNN line from torch.testing._internal.common_cuda will be executed multiple times
# as well during the execution of this test suite, and it will cause a
# CUDA OOM error on Windows.
TEST_CUDA = torch.cuda.is_available()

try:
    import tabulate  # noqa: F401  # type: ignore[import]
    from torch.utils.benchmark.utils.compile import bench_all
    HAS_TABULATE = True
except ImportError:
    HAS_TABULATE = False

@unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
@unittest.skipIf(not HAS_TABULATE, "tabulate not available")
class TestCompileBenchmarkUtil(TestCase):
    def test_training_and_inference(self):
        class ToyModel(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.weight = torch.nn.Parameter(torch.Tensor(2, 2))

            def forward(self, x):
                return x * self.weight

        torchdynamo.reset()
        model = ToyModel().cuda()

        inference_table = bench_all(model, torch.ones(1024, 2, 2).cuda(), 5)
        self.assertTrue(
            "Inference" in inference_table and "Eager" in inference_table and "-" in inference_table
        )

        training_table = bench_all(
            model, torch.ones(1024, 2, 2).cuda(), 5,
            optimizer=torch.optim.SGD(model.parameters(), lr=0.01),
        )
        self.assertTrue(
            "Train" in training_table and "Eager" in training_table and "-" in training_table
        )

if __name__ == "__main__":
    run_tests()
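For orientation, ``bench_all`` returns a github-flavored ``tabulate`` string, which is why the assertions above are plain substring checks. The layout (headers taken from ``field_names`` in compile.py below; timings elided here) looks like:

| Train/Inference   | Backend   | Mode   | Compilation Time   | Average Running Time   |
|-------------------|-----------|--------|--------------------|------------------------|
| Inference         | Eager     | -      | -                  | ... ms                 |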
torch/utils/benchmark/utils/compile.py (new file, 187 lines)
@@ -0,0 +1,187 @@
import torch
import torch._dynamo
import torch._inductor  # needed for list_mode_options() below
from torch._dynamo.testing import CompileCounterWithBackend
from torch.utils.benchmark import Timer

from typing import Optional, List, Callable, Union, Any, cast

__all__ = ["bench_all", "benchmark_compile"]

_warned_tensor_cores = False
_default_float_32_precision = torch.get_float32_matmul_precision()

try:
    from tabulate import tabulate
    HAS_TABULATE = True
except ImportError:
    HAS_TABULATE = False
    print("tabulate is not installed, please pip install tabulate to use this utility")

if HAS_TABULATE:
    def _enable_tensor_cores():
        global _warned_tensor_cores

        if torch.cuda.is_available():
            if torch.backends.cuda.matmul.allow_tf32 is False and torch.cuda.get_device_capability() >= (8, 0):
                torch.set_float32_matmul_precision("high")
                if not _warned_tensor_cores:
                    print("Your GPU supports tensor cores,")
                    print("we will enable them automatically by setting `torch.set_float32_matmul_precision('high')`")
                    _warned_tensor_cores = True

    def _disable_tensor_cores():
        torch.set_float32_matmul_precision(_default_float_32_precision)

    def bench_loop(
        model: Union[torch.nn.Module, Callable],
        sample_input: Union[torch.Tensor, Any],
        num_iters: int = 5,
        optimizer: Optional[torch.optim.Optimizer] = None,
        loss_fn: Optional[Callable] = None,
    ):
        # Define the statement for the benchmark
        if optimizer and loss_fn:
            # Training mode: forward, loss, backward, step, zero_grad
            stmt = """
            output = model(sample_input)
            loss = loss_fn(output) if loss_fn else output.sum()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            """
        else:
            # Inference mode: forward pass only
            stmt = "model(sample_input)"

        # Create the Timer object
        timer = Timer(
            stmt=stmt,
            globals={"model": model, "sample_input": sample_input, "optimizer": optimizer, "loss_fn": loss_fn},
        )

        result = timer.timeit(number=num_iters)

        # Get the average time per iteration in milliseconds
        avg_time = result.mean * 1000
        return round(avg_time, 2)

    def benchmark_compile(
        model: Union[torch.nn.Module, Callable],
        sample_input: Union[torch.Tensor, Any],
        num_iters: int = 5,
        backend: Optional[str] = None,
        mode: Optional[str] = "default",
        optimizer: Optional[torch.optim.Optimizer] = None,
        loss_fn: Union[torch.nn.Module, Callable, None] = None,
    ):
        """
        Use this utility to benchmark torch.compile with a single backend and mode.
        Returns a (compilation_time, running_time) tuple in milliseconds,
        or (None, None) if compilation fails.
        """
        if backend:
            try:
                torch._dynamo.reset()
                compile_counter_with_backend = CompileCounterWithBackend(backend)
                opt_model = torch.compile(model, backend=compile_counter_with_backend, mode=mode)

                # Compilation only happens after the first inference
                compilation_time = bench_loop(opt_model, sample_input, 1, optimizer, loss_fn)
                running_time = bench_loop(opt_model, sample_input, num_iters, optimizer, loss_fn)

                if compile_counter_with_backend.frame_count == 0:
                    raise RuntimeError("No compilation occurred during benchmarking.")
                if compile_counter_with_backend.frame_count > 1:
                    raise RuntimeError("Recompilation occurred during benchmarking.")
            except Exception as e:
                print(e)
                print(f"Failed to compile {backend} with mode {mode}")
                return None, None
        else:
            opt_model = model
            compilation_time = None
            running_time = bench_loop(opt_model, sample_input, num_iters, optimizer, loss_fn)

        compilation_time = round(compilation_time, 2) if compilation_time else None
        running_time = round(running_time, 2) if running_time else None

        return compilation_time, running_time

    def bench_all(
        model: Union[torch.nn.Module, Callable],
        sample_input: Union[torch.Tensor, Any],
        num_iters: int = 5,
        optimizer: Optional[torch.optim.Optimizer] = None,
        loss_fn: Union[torch.nn.Module, Callable, None] = None,
    ):
        """
        A simple utility to benchmark torch.compile.
        In particular, it ensures that your GPU is set up to use tensor cores if it supports them.
        It also tries out all the main backends and returns a table of results so you can easily compare them.
        Many of the backends have their own optional dependencies, so please pip install them separately.

        You will get one table for inference and another for training.
        If you'd like to leverage this utility for training, make sure to pass in a torch.optim.Optimizer.

        The important warning is:
            Your GPU supports tensor cores,
            we will enable them automatically by setting `torch.set_float32_matmul_precision('high')`

        If a compilation fails for any reason, including a missing optional dependency,
        we will print: Failed to compile {backend} with mode {mode}
        """
        field_names = ["Train/Inference", "Backend", "Mode", "Compilation Time", "Average Running Time"]
        table = []

        # Eager baseline; loss_fn is forwarded so the eager row measures
        # the same loop (training or inference) as the compiled rows
        torch._dynamo.reset()
        _, eager_time = benchmark_compile(model, sample_input, num_iters, None, None, optimizer, loss_fn)
        table.append(
            [("Training" if optimizer else "Inference"), "Eager", "-", "-", f"{eager_time} ms"]
        )

        for backend in torch._dynamo.list_backends():
            if backend == "inductor":
                mode_options = cast(List[Optional[str]], list(torch._inductor.list_mode_options().keys())) + [None]
                for mode in mode_options:
                    if mode == "default":
                        continue
                    torch._dynamo.reset()
                    try:
                        if torch.cuda.is_available():
                            _enable_tensor_cores()
                        compilation_time, running_time = benchmark_compile(
                            model, sample_input, num_iters, backend, mode, optimizer, loss_fn)
                    finally:
                        if torch.cuda.is_available():
                            _disable_tensor_cores()
                    table.append([
                        ("Training" if optimizer else "Inference"),
                        backend if backend else "-",
                        mode if mode is not None else "-",
                        f"{compilation_time} ms" if compilation_time else "-",
                        f"{running_time} ms" if running_time else "-",
                    ])
            else:
                torch._dynamo.reset()
                compilation_time, running_time = benchmark_compile(
                    model, sample_input, num_iters, backend, None, optimizer, loss_fn)

                if running_time is not None:
                    table.append([
                        ("Training" if optimizer else "Inference"),
                        backend, "-",
                        f"{compilation_time} ms" if compilation_time else "-",
                        f"{running_time} ms",
                    ])

        return tabulate(table, headers=field_names, tablefmt="github")
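For completeness, a minimal training-mode sketch of the API above, assuming a CUDA device and ``tabulate`` installed; the toy model, shapes, and loss function are illustrative only (note that ``bench_loop`` calls ``loss_fn(output)`` with the model output alone):

    import torch
    from torch.utils.benchmark.utils.compile import bench_all

    # Illustrative toy model and input
    model = torch.nn.Linear(16, 16).cuda()
    sample_input = torch.randn(64, 16).cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    # bench_loop invokes loss_fn with the model output only, so a simple
    # reduction over the output is enough for a toy benchmark
    def loss_fn(output):
        return output.sum()

    # Passing both an optimizer and a loss_fn selects the training path:
    # forward, loss, backward, optimizer.step(), optimizer.zero_grad()
    print(bench_all(model, sample_input, 5, optimizer=optimizer, loss_fn=loss_fn))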