torch.compile benchmark utility (#97699)

I've had many exchanges that look like this https://github.com/rasbt/faster-pytorch-blog/pull/2, so this is an attempt to make this problem easier.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/97699
Approved by: https://github.com/ezyang
Mark Saroufim 2023-04-12 03:02:02 +00:00 committed by PyTorch MergeBot
parent 455795c799
commit bc8cb62bcb
3 changed files with 231 additions and 0 deletions


@ -41,6 +41,8 @@ Optimizations can be passed in :func:`~torch.compile` with either a backend mode
The default backend is ``inductor``, which is likely to be the most reliable and performant option for most users and library maintainers;
the other backends are there for power users who don't mind more experimental community support.
There is some nuance involved in benchmarking ``torch.compile``, so we've provided a utility to make this simpler: :func:`~torch.utils.benchmark.utils.compile.bench_all`.
You can get the full list of community backends by running :func:`~torch._dynamo.list_backends`.
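For example, a minimal inference benchmark might look like this (the model and input are illustrative; assumes a CUDA machine with ``tabulate`` installed)::

    import torch
    from torch.utils.benchmark.utils.compile import bench_all

    model = torch.nn.Linear(16, 16).cuda()
    sample_input = torch.randn(64, 16).cuda()
    print(bench_all(model, sample_input, num_iters=5))
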
.. autosummary::


@ -0,0 +1,42 @@
# Owner(s): ["module: dynamo"]
import unittest

import torch
import torch._dynamo as torchdynamo
from torch.testing._internal.common_utils import TestCase, run_tests

# We cannot import TEST_CUDA from torch.testing._internal.common_cuda here, because if we do that,
# the TEST_CUDNN line from torch.testing._internal.common_cuda will be executed multiple times
# as well during the execution of this test suite, and it will cause
# CUDA OOM error on Windows.
TEST_CUDA = torch.cuda.is_available()

try:
    import tabulate  # noqa: F401  # type: ignore[import]
    from torch.utils.benchmark.utils.compile import bench_all
    HAS_TABULATE = True
except ImportError:
    HAS_TABULATE = False


@unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
@unittest.skipIf(not HAS_TABULATE, "tabulate not available")
class TestCompileBenchmarkUtil(TestCase):
    def test_training_and_inference(self):
        class ToyModel(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.weight = torch.nn.Parameter(torch.Tensor(2, 2))

            def forward(self, x):
                return x * self.weight

        torchdynamo.reset()
        model = ToyModel().cuda()

        inference_table = bench_all(model, torch.ones(1024, 2, 2).cuda(), 5)
        self.assertTrue(
            "Inference" in inference_table and "Eager" in inference_table and "-" in inference_table
        )

        training_table = bench_all(
            model, torch.ones(1024, 2, 2).cuda(), 5,
            optimizer=torch.optim.SGD(model.parameters(), lr=0.01),
        )
        self.assertTrue(
            "Train" in training_table and "Eager" in training_table and "-" in training_table
        )


if __name__ == '__main__':
    run_tests()


@ -0,0 +1,187 @@
import torch
import torch._dynamo
import torch._inductor  # needed for list_mode_options() in bench_all below
from torch._dynamo.testing import CompileCounterWithBackend
from torch.utils.benchmark import Timer
from typing import Optional, List, Callable, Union, Any, cast

__all__ = ["bench_all", "benchmark_compile"]

_warned_tensor_cores = False
_default_float_32_precision = torch.get_float32_matmul_precision()

try:
    from tabulate import tabulate
    HAS_TABULATE = True
except ImportError:
    HAS_TABULATE = False
    print("tabulate is not installed, please `pip install tabulate` to use this utility")
if HAS_TABULATE:
    def _enable_tensor_cores():
        global _warned_tensor_cores
        if torch.cuda.is_available():
            if not torch.backends.cuda.matmul.allow_tf32 and torch.cuda.get_device_capability() >= (8, 0):
                torch.set_float32_matmul_precision("high")
                if not _warned_tensor_cores:
                    print("Your GPU supports tensor cores")
                    print("we will enable it automatically by setting `torch.set_float32_matmul_precision('high')`")
                    _warned_tensor_cores = True

    def _disable_tensor_cores():
        torch.set_float32_matmul_precision(_default_float_32_precision)
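
    # For reference, a minimal sketch of the precision toggles the helpers
    # above rely on (TF32 tensor cores require compute capability >= (8, 0),
    # i.e. Ampere or newer):
    #
    #   torch.set_float32_matmul_precision("high")     # allow TF32 matmuls
    #   torch.set_float32_matmul_precision("highest")  # restore full FP32 precision (the default)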
    def bench_loop(
        model: Union[torch.nn.Module, Callable],
        sample_input: Union[torch.Tensor, Any],
        num_iters: int = 5,
        optimizer: Optional[torch.optim.Optimizer] = None,
        loss_fn: Optional[Callable] = None,
    ):
        # Define the statement to benchmark: a full training step if an
        # optimizer is provided (loss_fn is optional and defaults to a sum),
        # otherwise a single forward pass.
        if optimizer:
            # Training mode
            stmt = """\
output = model(sample_input)
loss = loss_fn(output) if loss_fn else output.sum()
loss.backward()
optimizer.step()
optimizer.zero_grad()
"""
        else:
            # Inference mode
            stmt = "model(sample_input)"

        timer = Timer(
            stmt=stmt,
            globals={"model": model, "sample_input": sample_input, "optimizer": optimizer, "loss_fn": loss_fn},
        )
        result = timer.timeit(number=num_iters)
        # Average time per iteration, in milliseconds
        avg_time = result.mean * 1000
        return round(avg_time, 2)
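
    # A minimal usage sketch of bench_loop (names are illustrative):
    #
    #   model = torch.nn.Linear(8, 8)
    #   x = torch.randn(32, 8)
    #   avg_ms = bench_loop(model, x, num_iters=10)  # average ms per forward pass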
    def benchmark_compile(
        model: Union[torch.nn.Module, Callable],
        sample_input: Union[torch.Tensor, Any],
        num_iters: int = 5,
        backend: Optional[str] = None,
        mode: Optional[str] = "default",
        optimizer: Optional[torch.optim.Optimizer] = None,
        loss_fn: Union[torch.nn.Module, Callable, None] = None,
    ):
        """
        Benchmark a model with ``torch.compile`` and return a
        ``(compilation_time, running_time)`` tuple, both in milliseconds.
        If ``backend`` is None, the model is benchmarked in eager mode
        and ``compilation_time`` is None.
        """
        if backend:
            try:
                torch._dynamo.reset()
                compile_counter_with_backend = CompileCounterWithBackend(backend)
                opt_model = torch.compile(model, backend=compile_counter_with_backend, mode=mode)

                # Compilation only happens on the first call, so time one
                # iteration to capture it, then time the steady state.
                compilation_time = bench_loop(opt_model, sample_input, 1, optimizer, loss_fn)
                running_time = bench_loop(opt_model, sample_input, num_iters, optimizer, loss_fn)

                if compile_counter_with_backend.frame_count == 0:
                    raise RuntimeError("No compilation occurred during benchmarking.")
                if compile_counter_with_backend.frame_count > 1:
                    raise RuntimeError("Recompilation occurred during benchmarking.")
            except Exception as e:
                print(e)
                print(f"Failed to compile {backend} with mode {mode}")
                return None, None
        else:
            opt_model = model
            compilation_time = None
            running_time = bench_loop(opt_model, sample_input, num_iters, optimizer, loss_fn)
        return compilation_time, running_time
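
    # A minimal usage sketch of benchmark_compile (names are illustrative):
    #
    #   compile_ms, run_ms = benchmark_compile(model, x, num_iters=10, backend="inductor")
    #   _, eager_ms = benchmark_compile(model, x, num_iters=10)  # eager baseline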
    def bench_all(
        model: Union[torch.nn.Module, Callable],
        sample_input: Union[torch.Tensor, Any],
        num_iters: int = 5,
        optimizer: Optional[torch.optim.Optimizer] = None,
        loss_fn: Union[torch.nn.Module, Callable, None] = None,
    ):
        """
        A simple utility for benchmarking ``torch.compile``.

        In particular, it ensures that your GPU is set up to use tensor cores
        if it supports them. It also tries all of the main backends and prints
        a table of results so you can easily compare them. Many of the backends
        have their own optional dependencies, so please ``pip install`` them
        separately.

        You will get one table for inference and another for training.
        To use this utility for training, pass in a ``torch.optim.Optimizer``.

        The important warnings are:

            "Your GPU supports tensor cores
            we will enable it automatically by setting `torch.set_float32_matmul_precision('high')`"

        If a compilation fails for any reason, including a missing dependency,
        we will print ``Failed to compile {backend} with mode {mode}``.
        """
        field_names = ["Train/Inference", "Backend", "Mode", "Compilation Time", "Average Running Time"]
        table = []

        torch._dynamo.reset()
        _, eager_time = benchmark_compile(model, sample_input, num_iters, None, None, optimizer, loss_fn)
        table.append(
            [("Training" if optimizer else "Inference"), "Eager", "-", "-", f"{eager_time} ms"]
        )

        for backend in torch._dynamo.list_backends():
            if backend == "inductor":
                mode_options = cast(List[Optional[str]], list(torch._inductor.list_mode_options().keys())) + [None]
                for mode in mode_options:
                    if mode == "default":
                        continue
                    torch._dynamo.reset()
                    try:
                        if torch.cuda.is_available():
                            _enable_tensor_cores()
                        compilation_time, running_time = benchmark_compile(
                            model, sample_input, num_iters, backend, mode, optimizer, loss_fn)
                    finally:
                        if torch.cuda.is_available():
                            _disable_tensor_cores()
                    table.append([
                        ("Training" if optimizer else "Inference"),
                        backend if backend else "-",
                        mode if mode is not None else "-",
                        f"{compilation_time} ms" if compilation_time is not None else "-",
                        f"{running_time} ms" if running_time is not None else "-",
                    ])
            else:
                torch._dynamo.reset()
                compilation_time, running_time = benchmark_compile(
                    model, sample_input, num_iters, backend, None, optimizer, loss_fn)
                if running_time is not None:
                    table.append([
                        ("Training" if optimizer else "Inference"),
                        backend, "-",
                        f"{compilation_time} ms" if compilation_time is not None else "-",
                        f"{running_time} ms",
                    ])

        return tabulate(table, headers=field_names, tablefmt="github")
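
A minimal end-to-end sketch of the new utility in training mode (the model and optimizer are illustrative; assumes a CUDA machine with tabulate installed):

    import torch
    from torch.utils.benchmark.utils.compile import bench_all

    model = torch.nn.Linear(16, 16).cuda()
    x = torch.randn(64, 16).cuda()

    # Passing an optimizer switches the benchmark from a single forward pass
    # to a full forward/backward/step loop.
    print(bench_all(model, x, num_iters=5, optimizer=torch.optim.SGD(model.parameters(), lr=0.01)))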