From b3a079810e3d3ecf7ece4c3c10fb54f3ff4074ef Mon Sep 17 00:00:00 2001
From: Bin Bao <binbao@fb.com>
Date: Wed, 8 Mar 2023 18:17:23 +0000
Subject: [PATCH] [CI] Add a workflow for quick perf comparison (#96166)

Summary: ciflow/inductor-perf-test-nightly now contains the full dashboard
run, which takes a very long time. Ed proposed a simplification of the
perf run there, but it is still worthwhile to have a set of fast perf tests
that only includes one configuration (--training --amp).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/96166
Approved by: https://github.com/huydhn, https://github.com/weiwangmeta
---
 .ci/pytorch/test.sh                           | 14 +++++--
 .github/pytorch-probot.yml                    |  1 +
 .github/workflows/inductor-perf-compare.yml   | 37 +++++++++++++++++++
 .../workflows/inductor-perf-test-nightly.yml  |  2 +-
 .../upload-torch-dynamo-perf-stats.yml        |  2 +-
 benchmarks/dynamo/common.py                   |  4 +-
 6 files changed, 54 insertions(+), 6 deletions(-)
 create mode 100644 .github/workflows/inductor-perf-compare.yml

diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh
index 75e514f827f..e36aea1dab6 100755
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@@ -298,10 +298,16 @@ test_single_dynamo_benchmark() {
 
   local partition_flags=()
   if [[ -n "$NUM_TEST_SHARDS" && -n "$shard_id" ]]; then
-    partition_flags=( --total-partitions 2 --partition-id "$shard_id" )
+    partition_flags=( --total-partitions "$NUM_TEST_SHARDS" --partition-id "$shard_id" )
   fi
 
-  if [[ "${TEST_CONFIG}" == *perf* ]]; then
+  if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then
+    python "benchmarks/dynamo/$suite.py" \
+      --ci --performance --disable-cudagraphs \
+      "${DYNAMO_BENCHMARK_FLAGS[@]}" \
+      "$@" "${partition_flags[@]}" \
+      --output "$TEST_REPORTS_DIR/${name}_${suite}.csv"
+  elif [[ "${TEST_CONFIG}" == *perf* ]]; then
     # MKL_THREADING_LAYER=GNU to mitigate https://github.com/pytorch/pytorch/issues/37377
     MKL_THREADING_LAYER=GNU python benchmarks/dynamo/runner.py --suites="$suite" \
       --base-sha="$BASE_SHA" --output-dir="$TEST_REPORTS_DIR" "${partition_flags[@]}" \
@@ -325,7 +331,9 @@ test_dynamo_benchmark() {
   local shard_id="$1"
   shift
 
-  if [[ "${TEST_CONFIG}" == *perf* ]]; then
+  if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then
+    test_single_dynamo_benchmark "amp" "$suite" "$shard_id" --training --amp "$@"
+  elif [[ "${TEST_CONFIG}" == *perf* ]]; then
     # Performance test training only, for float32 and amp
     test_single_dynamo_benchmark "amp" "$suite" "$shard_id" --training --dtypes=amp "$@"
     test_single_dynamo_benchmark "float32" "$suite" "$shard_id" --training --dtypes=float32 "$@"
diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml
index dafa081dabb..feeb1630cd7 100644
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@@ -6,6 +6,7 @@ ciflow_push_tags:
 - ciflow/binaries_libtorch
 - ciflow/binaries_wheel
 - ciflow/inductor
+- ciflow/inductor-perf-compare
 - ciflow/inductor-perf-test-nightly
 - ciflow/mps
 - ciflow/nightly
diff --git a/.github/workflows/inductor-perf-compare.yml b/.github/workflows/inductor-perf-compare.yml
new file mode 100644
index 00000000000..e1ffcbfd02c
--- /dev/null
+++ b/.github/workflows/inductor-perf-compare.yml
@@ -0,0 +1,37 @@
+name: inductor-A100-perf-compare
+
+on:
+  push:
+    tags:
+      - ciflow/inductor-perf-compare/*
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+  linux-bionic-cuda11_8-py3_10-gcc7-inductor-build:
+    name: cuda11.8-py3.10-gcc7-sm80
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      build-environment: linux-bionic-cuda11.8-py3.10-gcc7-sm80
+      docker-image-name: pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc7
+      cuda-arch-list: '8.0'
+      test-matrix: |
+        { include: [
+          { config: "inductor_huggingface_perf_compare", shard: 1, num_shards: 1, runner: "linux.gcp.a100" },
+          { config: "inductor_timm_perf_compare", shard: 1, num_shards: 2, runner: "linux.gcp.a100" },
+          { config: "inductor_timm_perf_compare", shard: 2, num_shards: 2, runner: "linux.gcp.a100" },
+          { config: "inductor_torchbench_perf_compare", shard: 1, num_shards: 1, runner: "linux.gcp.a100" },
+        ]}
+
+  linux-bionic-cuda11_8-py3_10-gcc7-inductor-test:
+    name: cuda11.8-py3.10-gcc7-sm80
+    uses: ./.github/workflows/_linux-test.yml
+    needs: linux-bionic-cuda11_8-py3_10-gcc7-inductor-build
+    with:
+      build-environment: linux-bionic-cuda11.8-py3.10-gcc7-sm80
+      docker-image: ${{ needs.linux-bionic-cuda11_8-py3_10-gcc7-inductor-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-bionic-cuda11_8-py3_10-gcc7-inductor-build.outputs.test-matrix }}
+      use-gha: anything-non-empty-to-use-gha
diff --git a/.github/workflows/inductor-perf-test-nightly.yml b/.github/workflows/inductor-perf-test-nightly.yml
index 6493f0447cf..41d8eef9eb3 100644
--- a/.github/workflows/inductor-perf-test-nightly.yml
+++ b/.github/workflows/inductor-perf-test-nightly.yml
@@ -1,4 +1,4 @@
-name: inductor-A100-perf
+name: inductor-A100-perf-nightly
 
 on:
   schedule:
diff --git a/.github/workflows/upload-torch-dynamo-perf-stats.yml b/.github/workflows/upload-torch-dynamo-perf-stats.yml
index 6a1d3d8af74..023dd62336f 100644
--- a/.github/workflows/upload-torch-dynamo-perf-stats.yml
+++ b/.github/workflows/upload-torch-dynamo-perf-stats.yml
@@ -2,7 +2,7 @@ name: Upload torch dynamo performance stats
 
 on:
   workflow_run:
-    workflows: [inductor-A100-perf]
+    workflows: [inductor-A100-perf-nightly]
     types:
       - completed
     branches:
diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index 58a4311e30d..5bc34f28452 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -1934,7 +1934,9 @@ def run(runner, args, original_dir=None):
     if args.unspecialize_int:
         torch._dynamo.config.specialize_int = False
     if args.ci:
-        args.repeat = 2
+        if args.accuracy:
+            # Run fewer iterations when checking accuracy
+            args.repeat = 2
         if args.dynamic_ci_skips_only:
             # Test only the incremental set of jobs whose skipped was
             # caused solely by turning on dynamic shapes