From b3a079810e3d3ecf7ece4c3c10fb54f3ff4074ef Mon Sep 17 00:00:00 2001 From: Bin Bao Date: Wed, 8 Mar 2023 18:17:23 +0000 Subject: [PATCH] [CI] Add a workflow for quick perf comparison (#96166) Summary: ciflow/inductor-perf-test-nightly now contains the full dashboard run, which takes a very long time. Ed proposed a simplification of the perf run there, but it is still worth having a set of fast perf tests that only include one configuration (--training --amp). Pull Request resolved: https://github.com/pytorch/pytorch/pull/96166 Approved by: https://github.com/huydhn, https://github.com/weiwangmeta --- .ci/pytorch/test.sh | 14 +++++-- .github/pytorch-probot.yml | 1 + .github/workflows/inductor-perf-compare.yml | 37 +++++++++++++++++++ .../workflows/inductor-perf-test-nightly.yml | 2 +- .../upload-torch-dynamo-perf-stats.yml | 2 +- benchmarks/dynamo/common.py | 4 +- 6 files changed, 54 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/inductor-perf-compare.yml diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh index 75e514f827f..e36aea1dab6 100755 --- a/.ci/pytorch/test.sh +++ b/.ci/pytorch/test.sh @@ -298,10 +298,16 @@ test_single_dynamo_benchmark() { local partition_flags=() if [[ -n "$NUM_TEST_SHARDS" && -n "$shard_id" ]]; then - partition_flags=( --total-partitions 2 --partition-id "$shard_id" ) + partition_flags=( --total-partitions "$NUM_TEST_SHARDS" --partition-id "$shard_id" ) fi - if [[ "${TEST_CONFIG}" == *perf* ]]; then + if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then + python "benchmarks/dynamo/$suite.py" \ + --ci --performance --disable-cudagraphs \ + "${DYNAMO_BENCHMARK_FLAGS[@]}" \ + "$@" "${partition_flags[@]}" \ + --output "$TEST_REPORTS_DIR/${name}_${suite}.csv" + elif [[ "${TEST_CONFIG}" == *perf* ]]; then # MKL_THREADING_LAYER=GNU to mitigate https://github.com/pytorch/pytorch/issues/37377 MKL_THREADING_LAYER=GNU python benchmarks/dynamo/runner.py --suites="$suite" \ --base-sha="$BASE_SHA" --output-dir="$TEST_REPORTS_DIR" 
"${partition_flags[@]}" \ @@ -325,7 +331,9 @@ test_dynamo_benchmark() { local shard_id="$1" shift - if [[ "${TEST_CONFIG}" == *perf* ]]; then + if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then + test_single_dynamo_benchmark "amp" "$suite" "$shard_id" --training --amp "$@" + elif [[ "${TEST_CONFIG}" == *perf* ]]; then # Performance test training only, for float32 and amp test_single_dynamo_benchmark "amp" "$suite" "$shard_id" --training --dtypes=amp "$@" test_single_dynamo_benchmark "float32" "$suite" "$shard_id" --training --dtypes=float32 "$@" diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index dafa081dabb..feeb1630cd7 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -6,6 +6,7 @@ ciflow_push_tags: - ciflow/binaries_libtorch - ciflow/binaries_wheel - ciflow/inductor +- ciflow/inductor-perf-compare - ciflow/inductor-perf-test-nightly - ciflow/mps - ciflow/nightly diff --git a/.github/workflows/inductor-perf-compare.yml b/.github/workflows/inductor-perf-compare.yml new file mode 100644 index 00000000000..e1ffcbfd02c --- /dev/null +++ b/.github/workflows/inductor-perf-compare.yml @@ -0,0 +1,37 @@ +name: inductor-A100-perf-compare + +on: + push: + tags: + - ciflow/inductor-perf-compare/* + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + linux-bionic-cuda11_8-py3_10-gcc7-inductor-build: + name: cuda11.8-py3.10-gcc7-sm80 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-bionic-cuda11.8-py3.10-gcc7-sm80 + docker-image-name: pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc7 + cuda-arch-list: '8.0' + test-matrix: | + { include: [ + { config: "inductor_huggingface_perf_compare", shard: 1, num_shards: 1, runner: "linux.gcp.a100" }, + { config: "inductor_timm_perf_compare", shard: 1, num_shards: 2, runner: "linux.gcp.a100" }, + { config: 
"inductor_timm_perf_compare", shard: 2, num_shards: 2, runner: "linux.gcp.a100" }, + { config: "inductor_torchbench_perf_compare", shard: 1, num_shards: 1, runner: "linux.gcp.a100" }, + ]} + + linux-bionic-cuda11_8-py3_10-gcc7-inductor-test: + name: cuda11.8-py3.10-gcc7-sm80 + uses: ./.github/workflows/_linux-test.yml + needs: linux-bionic-cuda11_8-py3_10-gcc7-inductor-build + with: + build-environment: linux-bionic-cuda11.8-py3.10-gcc7-sm80 + docker-image: ${{ needs.linux-bionic-cuda11_8-py3_10-gcc7-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-bionic-cuda11_8-py3_10-gcc7-inductor-build.outputs.test-matrix }} + use-gha: anything-non-empty-to-use-gha diff --git a/.github/workflows/inductor-perf-test-nightly.yml b/.github/workflows/inductor-perf-test-nightly.yml index 6493f0447cf..41d8eef9eb3 100644 --- a/.github/workflows/inductor-perf-test-nightly.yml +++ b/.github/workflows/inductor-perf-test-nightly.yml @@ -1,4 +1,4 @@ -name: inductor-A100-perf +name: inductor-A100-perf-nightly on: schedule: diff --git a/.github/workflows/upload-torch-dynamo-perf-stats.yml b/.github/workflows/upload-torch-dynamo-perf-stats.yml index 6a1d3d8af74..023dd62336f 100644 --- a/.github/workflows/upload-torch-dynamo-perf-stats.yml +++ b/.github/workflows/upload-torch-dynamo-perf-stats.yml @@ -2,7 +2,7 @@ name: Upload torch dynamo performance stats on: workflow_run: - workflows: [inductor-A100-perf] + workflows: [inductor-A100-perf-nightly] types: - completed branches: diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py index 58a4311e30d..5bc34f28452 100644 --- a/benchmarks/dynamo/common.py +++ b/benchmarks/dynamo/common.py @@ -1934,7 +1934,9 @@ def run(runner, args, original_dir=None): if args.unspecialize_int: torch._dynamo.config.specialize_int = False if args.ci: - args.repeat = 2 + if args.accuracy: + # Run fewer iterations when checking accuracy + args.repeat = 2 if args.dynamic_ci_skips_only: # Test only the incremental set of jobs whose 
skipped was # caused solely by turning on dynamic shapes