Fix broken gpt_fast micro benchmark after #144315 (#145235)

The benchmark is failing with the following error

```
  File "/var/lib/jenkins/workspace/benchmarks/gpt_fast/benchmark.py", line 333, in <module>
    main(output_file=args.output, only_model=args.only)
  File "/var/lib/jenkins/workspace/benchmarks/gpt_fast/benchmark.py", line 308, in main
    lst = func(device)
  File "/var/lib/jenkins/workspace/benchmarks/gpt_fast/benchmark.py", line 66, in run_mlp_layer_norm_gelu
    us_per_iter = benchmarker.benchmark(compiled_mod, (x,)) * 1000
  File "/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch/_inductor/runtime/benchmarking.py", line 39, in wrapper
    return fn(self, *args, **kwargs)
TypeError: benchmark() missing 1 required positional argument: 'fn_kwargs'
```

An example of the failure can be seen at https://github.com/pytorch/pytorch/actions/runs/12862761823/job/35858912555

I also assign `oncall: pt2` as the owner of this job going forward.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/145235
Approved by: https://github.com/nmacchioni
This commit is contained in:
Huy Do 2025-01-21 17:42:24 +00:00 committed by PyTorch MergeBot
parent 2cffbff7da
commit eb553ae3cf
3 changed files with 6 additions and 4 deletions

View file

@@ -26,7 +26,7 @@ jobs:
# Use metal host for benchmark jobs
test-matrix: |
{ include: [
- { config: "inductor-micro-benchmark-cpu-x86", shard: 1, num_shards: 1, runner: "linux.24xl.spr-metal" },
+ { config: "inductor-micro-benchmark-cpu-x86", shard: 1, num_shards: 1, runner: "linux.24xl.spr-metal", owners: ["oncall:pt2"] },
]}
secrets: inherit

View file

@@ -38,7 +38,7 @@ jobs:
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
- { config: "inductor-micro-benchmark", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
+ { config: "inductor-micro-benchmark", shard: 1, num_shards: 1, runner: "linux.aws.a100", owners: ["oncall:pt2"] },
]}
secrets: inherit

View file

@@ -63,7 +63,7 @@ def run_mlp_layer_norm_gelu(device: str = "cuda"):
for _ in range(WARMUP_ITER):
compiled_mod(x)
- us_per_iter = benchmarker.benchmark(compiled_mod, (x,)) * 1000
+ us_per_iter = benchmarker.benchmark(compiled_mod, (x,), {}) * 1000
flops_utilization += us_per_iter * flops / 1e9 / A100_40G_BF16_TFLOPS
flops_utilization = flops_utilization / len(input_shapes)
@@ -102,7 +102,7 @@ def run_layer_norm(device: str = "cuda"):
for _ in range(WARMUP_ITER):
compiled_mod(x)
- us_per_iter = benchmarker.benchmark(compiled_mod, (x,)) * 1000
+ us_per_iter = benchmarker.benchmark(compiled_mod, (x,), {}) * 1000
memory_bandwidth += (1e6 / us_per_iter) * 2 * BS * D * dtype.itemsize / 1e9
memory_bandwidth = memory_bandwidth / len(input_shapes)
@@ -155,6 +155,7 @@ def run_gather_gemv(device: str = "cuda"):
score_idxs,
x,
),
+ {},
)
* 1000
)
@@ -207,6 +208,7 @@ def run_gemv(device: str = "cuda"):
W,
x,
),
+ {},
)
* 1000
)