Upload MPS benchmark results (#141087)

This uploads the MPS benchmark results to the benchmark database. The data can then be queried, for example:

```
select
    benchmark,
    model,
    metric
from
    oss_ci_benchmark_v3
where
    head_sha = '99a133116fee15aa1467165f2b209b37da53f189'
    and metric.name in ['eager_peak_mem', 'dynamo_peak_mem', 'speedup']
    and model.name = 'BERT_pytorch'
```

I'm documenting the JSON format at https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
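For quick reference, the new `output_json` helper in the diff below emits one JSON object per metric, one record per line, alongside the existing CSV output. A minimal sketch of the record shape it produces (the values here are illustrative placeholders, not results from a real run):

```
import json

# Illustrative record in the shape produced by output_json below;
# every value is a placeholder, not a measured result.
record = {
    "benchmark": {
        "name": "TorchInductor",
        "mode": "training",  # "training" or "inference", from the CLI flags
        "dtype": "amp",      # float16 / bfloat16 / float32 / amp, or ""
        "extra_info": {
            # In the real output, extra_info also carries the full set of
            # run settings (vars(args)).
            "device": "mps",
            "quantization": None,
            "batch_size": 4,
        },
    },
    "model": {
        "name": "BERT_pytorch",
        "type": "OSS model",
        "backend": "eager",
        "origins": ["torchbench"],
    },
    "metric": {
        "name": "speedup",
        "benchmark_values": [1.0],
    },
}

# One such record is appended per line to "<output>.json".
print(json.dumps(record))
```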

### Testing

Locally,

```
PYTHONPATH=/Users/huydo/Storage/mine/benchmark python benchmarks/dynamo/torchbench.py --performance --only resnet152 --backend eager --training --devices mps --output test/test-reports/torchbench_training.csv
```
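With this change, the same run also appends one JSON record per metric to a file next to the CSV, i.e. `test/test-reports/torchbench_training.json` for the command above (the path mirrors the `--output` argument with a `.json` extension). A minimal sketch for sanity-checking the output of that run:

```
import json

# The benchmark script writes JSON Lines: one self-contained record per line.
with open("test/test-reports/torchbench_training.json") as f:
    for line in f:
        record = json.loads(line)
        print(
            record["model"]["name"],
            record["metric"]["name"],
            record["metric"]["benchmark_values"],
        )
```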

Workflow dispatch run: https://github.com/pytorch/pytorch/actions/runs/11927990520

Pull Request resolved: https://github.com/pytorch/pytorch/pull/141087
Approved by: https://github.com/malfet
Authored by Huy Do on 2024-11-20 18:18:21 +00:00; committed by PyTorch MergeBot
parent 1d8318df98
commit 4acd56eb53
2 changed files with 115 additions and 21 deletions


@@ -223,6 +223,14 @@ jobs:
          use-gha: true
          file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
      - name: Upload the benchmark results
        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
        with:
          benchmark-results-dir: test/test-reports
          dry-run: false
          schema-version: v3
          github-token: ${{ secrets.GITHUB_TOKEN }}
      - name: Clean up disk space
        if: always()
        continue-on-error: true


@@ -111,6 +111,11 @@ os.environ["KINETO_LOG_LEVEL"] = "5"
current_name = ""
current_device = ""
current_backend = ""
current_mode = ""
current_dtype = ""
current_quantization = ""
current_settings = None
current_onnx_compiler = ""
current_batch_size = None
output_filename = None
@@ -356,10 +361,19 @@ def load_model_from_path(path_and_class_str):
    return model, inputs
def output_csv(filename, headers, row):
def write_outputs(filename, headers, row):
    """
    Write both CSV and JSON outputs using the original CSV output interface
    """
    global disable_output
    if disable_output:
        return
    output_csv(filename, headers, row)
    output_json(filename, headers, row)
def output_csv(filename, headers, row):
    if os.path.exists(filename):
        with open(filename) as fd:
            lines = list(csv.reader(fd)) or [[]]
@@ -377,6 +391,56 @@ def output_csv(filename, headers, row):
            writer.writerow(list(line) + ["0"] * (len(headers) - len(line)))
def output_json(filename, headers, row):
    """
    Write the result into JSON format, so that it can be uploaded to the benchmark database
    to be displayed on the OSS dashboard. The JSON format is defined at
    https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
    """
    origin = ""
    if "torchbench" in filename:
        origin = "torchbench"
    elif "huggingface" in filename:
        origin = "huggingface"
    elif "timm_models" in filename:
        origin = "timm_models"
    extra_info = {
        "device": current_device,
        "quantization": current_quantization,
        "batch_size": current_batch_size,
    }
    if current_settings:
        extra_info.update(current_settings)
    mapping_headers = {headers[i]: v for i, v in enumerate(row)}
    with open(f"{os.path.splitext(filename)[0]}.json", "a") as f:
        for header, value in mapping_headers.items():
            # These headers are not metric names
            if header in ("dev", "name", "batch_size"):
                continue
            record = {
                "benchmark": {
                    "name": "TorchInductor",
                    "mode": current_mode,
                    "dtype": current_dtype,
                    "extra_info": extra_info,
                },
                "model": {
                    "name": current_name,
                    "type": "OSS model",
                    "backend": current_backend,
                    "origins": [origin],
                },
                "metric": {
                    "name": header,
                    "benchmark_values": [value],
                },
            }
            print(json.dumps(record), file=f)
def get_suite_from_model_iter_fn(model_iter_fn):
    # TODO: This is a bit of a hack
    suite = None
@@ -729,7 +793,7 @@ def coverage_experiment(args, model_iter_fn, model, example_inputs):
with profiler.prof:
frozen_model_iter_fn(model, example_inputs)
coverage_result = profiler.results()
output_csv(
write_outputs(
output_filename,
(
"dev",
@@ -768,7 +832,7 @@ def recompile_profiler_experiment(args, model_iter_fn, model, example_inputs):
model_iter_fn
)
opt_model_iter_fn(model, example_inputs)
output_csv(
write_outputs(
output_filename, ["model", "profiler report"], [current_name, prof.report()]
)
met = prof.get_metrics()
@@ -923,7 +987,7 @@ def latency_experiment_summary(suite_name, args, model, timings, **kwargs):
for k, v in kwargs["dynamo_stats"].items():
headers.append(k)
row.append(v)
output_csv(
write_outputs(
output_filename,
headers,
row,
@@ -932,7 +996,7 @@ def latency_experiment_summary(suite_name, args, model, timings, **kwargs):
assert (
output_filename.find(".csv") > 0
), f"expected output_filename to be a .csv, but got {output_filename}"
output_csv(
write_outputs(
output_filename[:-4] + "_compilation_metrics.csv",
first_headers + c_headers,
first_fields + c_data,
@@ -1092,7 +1156,7 @@ def speedup_experiment(args, model_iter_fn, model, example_inputs, **kwargs):
for k, v in kwargs["dynamo_stats"].items():
headers.append(k)
row.append(v)
output_csv(
write_outputs(
output_filename,
headers,
row,
@@ -1101,7 +1165,7 @@ def speedup_experiment(args, model_iter_fn, model, example_inputs, **kwargs):
assert (
output_filename.find(".csv") > 0
), f"expected output_filename to be a .csv, but got {output_filename}"
output_csv(
write_outputs(
output_filename[:-4] + "_compilation_metrics.csv",
first_headers + c_headers,
first_fields + c_data,
@@ -1177,7 +1241,7 @@ def speedup_experiment_ds(args, model_iter_fn, model, example_inputs):
]
)
)
output_csv(
write_outputs(
output_filename,
("dev", "name", "batch_size", "speedup mean", "speedup median", "speedup var"),
[
@@ -1339,7 +1403,7 @@ def speedup_experiment_onnx(
row.append(kwargs["compilation_latency"])
row.append(kwargs["compression_ratio"])
output_csv(
write_outputs(
output_filename,
headers,
row,
@@ -1348,7 +1412,7 @@ def speedup_experiment_onnx(
assert (
output_filename.find(".csv") > 0
), f"expected output_filename to be a .csv, but got {output_filename}"
output_csv(
write_outputs(
output_filename[:-4] + "_compilation_metrics.csv",
["dev", "name", "batch_size"] + headers,
[current_device, current_name, current_batch_size] + data,
@@ -1422,7 +1486,7 @@ def baselines(models, model_iter_fn, example_inputs, args):
for s, p, m in zip(speedup, pvalue, [m for n, m in models[1:]])
]
)
output_csv(
write_outputs(
output_filename,
("dev", "name", "batch_size") + tuple(n for n, m in models[1:]),
[current_device, current_name, current_batch_size]
@@ -1449,7 +1513,7 @@ def xla(args, model_iter_fn, model, example_inputs):
pvalue = ttest_ind(timings[:, 0], timings[:, 1]).pvalue
time_baseline, time_xla = np.median(timings, axis=0)
speedup = time_baseline / time_xla
output_csv(
write_outputs(
output_filename,
("dev", "name", "batch_size", "speedup", "time_baseline", "time_xla"),
[
@@ -2245,7 +2309,7 @@ def optimize_onnx_ctx(
# `torch.onnx.dynamo_export` raises error that encloses diagnostics.
diagnostic_context = e.onnx_program.diagnostic_context
for parsed_error in parser.parse_diagnostic_context(diagnostic_context):
output_csv(
write_outputs(
output_error_filename, parsed_error.headers, parsed_error.row
)
if context.onnx_model is not None:
@@ -2261,7 +2325,7 @@ def optimize_onnx_ctx(
cause_of_exception, diagnostics.RuntimeErrorWithDiagnostic
):
parsed_error = parser.parse_exception(cause_of_exception)
output_csv(
write_outputs(
output_error_filename, parsed_error.headers, parsed_error.row
)
raise
@@ -2269,7 +2333,7 @@ def optimize_onnx_ctx(
# `torch.onnx.export` errors.
# ORT errors.
parsed_error = parser.parse_exception(e)
output_csv(output_error_filename, parsed_error.headers, parsed_error.row)
write_outputs(output_error_filename, parsed_error.headers, parsed_error.row)
raise
run_n_iterations_onnx.context = context
@@ -2836,7 +2900,7 @@ class BenchmarkRunner:
headers.append(k)
fields.append(v)
output_csv(output_filename, headers, fields)
write_outputs(output_filename, headers, fields)
output_signpost(
dict(zip(o_headers, o_fields)),
@@ -3125,7 +3189,7 @@ class BenchmarkRunner:
mean.item(),
div.item(),
]
output_csv(output_filename, headers, fields)
write_outputs(output_filename, headers, fields)
return tolerance_status
def run_performance_test_non_alternate(
@@ -3588,7 +3652,7 @@ class BenchmarkRunner:
user_stack = add_double_quotes(
", ".join([str(x) for x in graph_break.user_stack])
)
output_csv(
write_outputs(
filename,
["model", "reason", "user_stack"],
[current_name, reason, user_stack],
@@ -4271,7 +4335,7 @@ def write_csv_when_exception(args, name: str, status: str, device=None):
rows = [[device, name, placeholder_batch_size, 0.0] for device in devices]
for row in rows:
output_csv(output_filename, headers, row)
write_outputs(output_filename, headers, row)
def run(runner, args, original_dir=None):
@@ -4466,6 +4530,11 @@ def run(runner, args, original_dir=None):
current_name, \
current_device, \
current_batch_size, \
current_backend, \
current_mode, \
current_dtype, \
current_quantization, \
current_settings, \
output_filename, \
disable_output, \
optimize_ctx, \
@@ -4677,7 +4746,7 @@ def run(runner, args, original_dir=None):
for device in args.devices:
batch_size = runner.batch_size_finder(device, args.only)
print(args.only, batch_size)
output_csv(output_filename, [], [args.only, batch_size])
write_outputs(output_filename, [], [args.only, batch_size])
return
if args.export_profiler_trace:
@@ -4818,6 +4887,23 @@ def run(runner, args, original_dir=None):
current_name = name
current_device = device
current_batch_size = batch_size
current_backend = args.backend
current_mode = (
"training" if args.training else "inference" if args.inference else ""
)
if args.float16:
current_dtype = "float16"
elif args.bfloat16:
current_dtype = "bfloat16"
elif args.float32:
current_dtype = "float32"
elif args.amp:
current_dtype = "amp"
else:
current_dtype = ""
current_quantization = args.quantization
# Keep the rest of the settings
current_settings = vars(args)
set_model_name(name)
# Look for stuff that looks like batch size, and mark it dynamic.
@@ -4881,7 +4967,7 @@ def run(runner, args, original_dir=None):
)
if args.generate_aot_autograd_stats:
stats_file = output_filename.split(".csv")[0] + "_stats.csv"
output_csv(
write_outputs(
stats_file,
("dev", "name", "batch_size", "total_aot_graphs", "ok_aot_graphs"),
[