Fix broken gpt_fast micro benchmark after #144315 (#145235)

The benchmark is failing with the following error

```
  File "/var/lib/jenkins/workspace/benchmarks/gpt_fast/benchmark.py", line 333, in <module>
    main(output_file=args.output, only_model=args.only)
  File "/var/lib/jenkins/workspace/benchmarks/gpt_fast/benchmark.py", line 308, in main
    lst = func(device)
  File "/var/lib/jenkins/workspace/benchmarks/gpt_fast/benchmark.py", line 66, in run_mlp_layer_norm_gelu
    us_per_iter = benchmarker.benchmark(compiled_mod, (x,)) * 1000
  File "/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch/_inductor/runtime/benchmarking.py", line 39, in wrapper
    return fn(self, *args, **kwargs)
TypeError: benchmark() missing 1 required positional argument: 'fn_kwargs'
```

An example of the failure can be seen at https://github.com/pytorch/pytorch/actions/runs/12862761823/job/35858912555

I also assign `oncall: pt2` as the owner of this job going forward.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/145235
Approved by: https://github.com/nmacchioni
This commit is contained in:
Huy Do 2025-01-21 17:42:24 +00:00 committed by PyTorch MergeBot
parent 2cffbff7da
commit eb553ae3cf
3 changed files with 6 additions and 4 deletions

View file

@@ -26,7 +26,7 @@ jobs:
# Use metal host for benchmark jobs
test-matrix: |
{ include: [
- { config: "inductor-micro-benchmark-cpu-x86", shard: 1, num_shards: 1, runner: "linux.24xl.spr-metal" },
+ { config: "inductor-micro-benchmark-cpu-x86", shard: 1, num_shards: 1, runner: "linux.24xl.spr-metal", owners: ["oncall:pt2"] },
]}
secrets: inherit

View file

@@ -38,7 +38,7 @@ jobs:
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
- { config: "inductor-micro-benchmark", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
+ { config: "inductor-micro-benchmark", shard: 1, num_shards: 1, runner: "linux.aws.a100", owners: ["oncall:pt2"] },
]}
secrets: inherit

View file

@@ -63,7 +63,7 @@ def run_mlp_layer_norm_gelu(device: str = "cuda"):
for _ in range(WARMUP_ITER):
compiled_mod(x)
- us_per_iter = benchmarker.benchmark(compiled_mod, (x,)) * 1000
+ us_per_iter = benchmarker.benchmark(compiled_mod, (x,), {}) * 1000
flops_utilization += us_per_iter * flops / 1e9 / A100_40G_BF16_TFLOPS
flops_utilization = flops_utilization / len(input_shapes)
@@ -102,7 +102,7 @@ def run_layer_norm(device: str = "cuda"):
for _ in range(WARMUP_ITER):
compiled_mod(x)
- us_per_iter = benchmarker.benchmark(compiled_mod, (x,)) * 1000
+ us_per_iter = benchmarker.benchmark(compiled_mod, (x,), {}) * 1000
memory_bandwidth += (1e6 / us_per_iter) * 2 * BS * D * dtype.itemsize / 1e9
memory_bandwidth = memory_bandwidth / len(input_shapes)
@@ -155,6 +155,7 @@ def run_gather_gemv(device: str = "cuda"):
score_idxs,
x,
),
+ {},
)
* 1000
)
@@ -207,6 +208,7 @@ def run_gemv(device: str = "cuda"):
W,
x,
),
+ {},
)
* 1000
)