[CI] Add a workflow for quick perf comparison (#96166)

Summary: ciflow/inductor-perf-test-nightly now contains a full dashboard
run, which takes a very long time. Ed proposed a simplification of the
perf run there, but it is still worthwhile to have a set of fast perf
tests that covers only one configuration (--training --amp).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/96166
Approved by: https://github.com/huydhn, https://github.com/weiwangmeta
commit b3a079810e, parent 4a1b971748
Bin Bao, 2023-03-08 18:17:23 +00:00, committed by PyTorch MergeBot
6 changed files with 54 additions and 6 deletions

@@ -298,10 +298,16 @@ test_single_dynamo_benchmark() {
   local partition_flags=()
   if [[ -n "$NUM_TEST_SHARDS" && -n "$shard_id" ]]; then
-    partition_flags=( --total-partitions 2 --partition-id "$shard_id" )
+    partition_flags=( --total-partitions "$NUM_TEST_SHARDS" --partition-id "$shard_id" )
   fi
-  if [[ "${TEST_CONFIG}" == *perf* ]]; then
+  if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then
+    python "benchmarks/dynamo/$suite.py" \
+      --ci --performance --disable-cudagraphs \
+      "${DYNAMO_BENCHMARK_FLAGS[@]}" \
+      "$@" "${partition_flags[@]}" \
+      --output "$TEST_REPORTS_DIR/${name}_${suite}.csv"
+  elif [[ "${TEST_CONFIG}" == *perf* ]]; then
     # MKL_THREADING_LAYER=GNU to mitigate https://github.com/pytorch/pytorch/issues/37377
     MKL_THREADING_LAYER=GNU python benchmarks/dynamo/runner.py --suites="$suite" \
       --base-sha="$BASE_SHA" --output-dir="$TEST_REPORTS_DIR" "${partition_flags[@]}" \
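
For a concrete sense of what the new *perf_compare* path runs, here is roughly the command it expands to for one suite. The suite, shard, and backend values are illustrative, not taken from this diff; DYNAMO_BENCHMARK_FLAGS is assumed to carry the backend selection (e.g. --inductor):

  # Hypothetical expansion for suite=huggingface, name=amp, a single shard.
  python benchmarks/dynamo/huggingface.py \
    --ci --performance --disable-cudagraphs \
    --inductor --training --amp \
    --total-partitions 1 --partition-id 0 \
    --output "$TEST_REPORTS_DIR/amp_huggingface.csv"
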
@@ -325,7 +331,9 @@ test_dynamo_benchmark() {
   local shard_id="$1"
   shift
-  if [[ "${TEST_CONFIG}" == *perf* ]]; then
+  if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then
+    test_single_dynamo_benchmark "amp" "$suite" "$shard_id" --training --amp "$@"
+  elif [[ "${TEST_CONFIG}" == *perf* ]]; then
     # Performance test training only, for float32 and amp
     test_single_dynamo_benchmark "amp" "$suite" "$shard_id" --training --dtypes=amp "$@"
     test_single_dynamo_benchmark "float32" "$suite" "$shard_id" --training --dtypes=float32 "$@"
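
Note that the ordering of the two tests matters: a config name such as inductor_huggingface_perf_compare matches both glob patterns, so the *perf_compare* check has to come before the *perf* one. A minimal shell sketch of the matching, using a config name from the workflow's test matrix below:

  TEST_CONFIG=inductor_huggingface_perf_compare
  [[ "$TEST_CONFIG" == *perf_compare* ]] && echo "fast compare path"  # matches, checked first
  [[ "$TEST_CONFIG" == *perf* ]] && echo "full dashboard path"        # would also match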

@@ -6,6 +6,7 @@ ciflow_push_tags:
 - ciflow/binaries_libtorch
 - ciflow/binaries_wheel
 - ciflow/inductor
+- ciflow/inductor-perf-compare
 - ciflow/inductor-perf-test-nightly
 - ciflow/mps
 - ciflow/nightly
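
With the tag registered with the probot, the new workflow can be kicked off on a PR in the usual ciflow way. Normally the bot applies the tag when the corresponding label is added; the manual equivalent is sketched below, with 96166 standing in for the PR number:

  git tag ciflow/inductor-perf-compare/96166
  git push origin ciflow/inductor-perf-compare/96166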

@@ -0,0 +1,37 @@
+name: inductor-A100-perf-compare
+
+on:
+  push:
+    tags:
+      - ciflow/inductor-perf-compare/*
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+  linux-bionic-cuda11_8-py3_10-gcc7-inductor-build:
+    name: cuda11.8-py3.10-gcc7-sm80
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      build-environment: linux-bionic-cuda11.8-py3.10-gcc7-sm80
+      docker-image-name: pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc7
+      cuda-arch-list: '8.0'
+      test-matrix: |
+        { include: [
+          { config: "inductor_huggingface_perf_compare", shard: 1, num_shards: 1, runner: "linux.gcp.a100" },
+          { config: "inductor_timm_perf_compare", shard: 1, num_shards: 2, runner: "linux.gcp.a100" },
+          { config: "inductor_timm_perf_compare", shard: 2, num_shards: 2, runner: "linux.gcp.a100" },
+          { config: "inductor_torchbench_perf_compare", shard: 1, num_shards: 1, runner: "linux.gcp.a100" },
+        ]}
+
+  linux-bionic-cuda11_8-py3_10-gcc7-inductor-test:
+    name: cuda11.8-py3.10-gcc7-sm80
+    uses: ./.github/workflows/_linux-test.yml
+    needs: linux-bionic-cuda11_8-py3_10-gcc7-inductor-build
+    with:
+      build-environment: linux-bionic-cuda11.8-py3.10-gcc7-sm80
+      docker-image: ${{ needs.linux-bionic-cuda11_8-py3_10-gcc7-inductor-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-bionic-cuda11_8-py3_10-gcc7-inductor-build.outputs.test-matrix }}
+      use-gha: anything-non-empty-to-use-gha
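
Because the workflow also declares workflow_dispatch, it can be started by hand as well; a sketch using the GitHub CLI, assuming gh is installed and authenticated and the workflow file exists on the default branch:

  gh workflow run inductor-perf-compare.yml --ref <branch-or-tag>

The github.event_name == 'workflow_dispatch' term in the concurrency group keeps such manual runs from cancelling, or being cancelled by, tag-triggered runs of the same workflow.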

@@ -1,4 +1,4 @@
-name: inductor-A100-perf
+name: inductor-A100-perf-nightly
 on:
   schedule:

@@ -2,7 +2,7 @@ name: Upload torch dynamo performance stats
 on:
   workflow_run:
-    workflows: [inductor-A100-perf]
+    workflows: [inductor-A100-perf-nightly]
     types:
       - completed
     branches:

@@ -1934,7 +1934,9 @@ def run(runner, args, original_dir=None):
     if args.unspecialize_int:
         torch._dynamo.config.specialize_int = False
     if args.ci:
-        args.repeat = 2
+        if args.accuracy:
+            # Run fewer iterations when checking accuracy
+            args.repeat = 2
     if args.dynamic_ci_skips_only:
         # Test only the incremental set of jobs whose skipped was
         # caused solely by turning on dynamic shapes
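
The effect of the common.py change, sketched as two CI invocations (--ci, --accuracy, and --performance are existing benchmark-harness options; the exact default repeat count is whatever the harness ships with):

  # Accuracy run: args.ci and args.accuracy are both set, so repeat drops to 2.
  python benchmarks/dynamo/huggingface.py --ci --accuracy --training --amp --inductor
  # Performance run: args.accuracy is unset, so repeat keeps the harness default,
  # which gives more stable speedup numbers for the perf comparison.
  python benchmarks/dynamo/huggingface.py --ci --performance --training --amp --inductor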