[CI] Add a workflow for quick perf comparison (#96166)

Summary: ciflow/inductor-perf-test-nightly now contains a full dashboard
run, which takes a very long time. Ed proposed a simplification of the
perf run there, but it is still worthwhile to have a set of fast perf
tests that covers only one configuration (--training --amp).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/96166
Approved by: https://github.com/huydhn, https://github.com/weiwangmeta
commit b3a079810e, parent 4a1b971748
Bin Bao, 2023-03-08 18:17:23 +00:00, committed by PyTorch MergeBot
6 changed files with 54 additions and 6 deletions

@@ -298,10 +298,16 @@ test_single_dynamo_benchmark() {
   local partition_flags=()
   if [[ -n "$NUM_TEST_SHARDS" && -n "$shard_id" ]]; then
-    partition_flags=( --total-partitions 2 --partition-id "$shard_id" )
+    partition_flags=( --total-partitions "$NUM_TEST_SHARDS" --partition-id "$shard_id" )
   fi
-  if [[ "${TEST_CONFIG}" == *perf* ]]; then
+  if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then
+    python "benchmarks/dynamo/$suite.py" \
+      --ci --performance --disable-cudagraphs \
+      "${DYNAMO_BENCHMARK_FLAGS[@]}" \
+      "$@" "${partition_flags[@]}" \
+      --output "$TEST_REPORTS_DIR/${name}_${suite}.csv"
+  elif [[ "${TEST_CONFIG}" == *perf* ]]; then
     # MKL_THREADING_LAYER=GNU to mitigate https://github.com/pytorch/pytorch/issues/37377
     MKL_THREADING_LAYER=GNU python benchmarks/dynamo/runner.py --suites="$suite" \
       --base-sha="$BASE_SHA" --output-dir="$TEST_REPORTS_DIR" "${partition_flags[@]}" \
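
For a concrete sense of what the new *perf_compare* path runs, here is roughly the command it expands to for one suite. The suite, shard, and backend values are illustrative, not taken from this diff; DYNAMO_BENCHMARK_FLAGS is assumed to carry the backend selection (e.g. --inductor):

  # Hypothetical expansion for suite=huggingface, name=amp, a single shard.
  python benchmarks/dynamo/huggingface.py \
    --ci --performance --disable-cudagraphs \
    --inductor --training --amp \
    --total-partitions 1 --partition-id 0 \
    --output "$TEST_REPORTS_DIR/amp_huggingface.csv"
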
@@ -325,7 +331,9 @@ test_dynamo_benchmark() {
   local shard_id="$1"
   shift
-  if [[ "${TEST_CONFIG}" == *perf* ]]; then
+  if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then
+    test_single_dynamo_benchmark "amp" "$suite" "$shard_id" --training --amp "$@"
+  elif [[ "${TEST_CONFIG}" == *perf* ]]; then
     # Performance test training only, for float32 and amp
     test_single_dynamo_benchmark "amp" "$suite" "$shard_id" --training --dtypes=amp "$@"
     test_single_dynamo_benchmark "float32" "$suite" "$shard_id" --training --dtypes=float32 "$@"
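
Note that the ordering of the two tests matters: a config name such as inductor_huggingface_perf_compare matches both glob patterns, so the *perf_compare* check has to come before the *perf* one. A minimal shell sketch of the matching, using a config name from the workflow's test matrix below:

  TEST_CONFIG=inductor_huggingface_perf_compare
  [[ "$TEST_CONFIG" == *perf_compare* ]] && echo "fast compare path"  # matches, checked first
  [[ "$TEST_CONFIG" == *perf* ]] && echo "full dashboard path"        # would also match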

@@ -6,6 +6,7 @@ ciflow_push_tags:
 - ciflow/binaries_libtorch
 - ciflow/binaries_wheel
 - ciflow/inductor
+- ciflow/inductor-perf-compare
 - ciflow/inductor-perf-test-nightly
 - ciflow/mps
 - ciflow/nightly
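
With the tag registered with the probot, the new workflow can be kicked off on a PR in the usual ciflow way. Normally the bot applies the tag when the corresponding label is added; the manual equivalent is sketched below, with 96166 standing in for the PR number:

  git tag ciflow/inductor-perf-compare/96166
  git push origin ciflow/inductor-perf-compare/96166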

@@ -0,0 +1,37 @@
+name: inductor-A100-perf-compare
+
+on:
+  push:
+    tags:
+      - ciflow/inductor-perf-compare/*
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+  linux-bionic-cuda11_8-py3_10-gcc7-inductor-build:
+    name: cuda11.8-py3.10-gcc7-sm80
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      build-environment: linux-bionic-cuda11.8-py3.10-gcc7-sm80
+      docker-image-name: pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc7
+      cuda-arch-list: '8.0'
+      test-matrix: |
+        { include: [
+          { config: "inductor_huggingface_perf_compare", shard: 1, num_shards: 1, runner: "linux.gcp.a100" },
+          { config: "inductor_timm_perf_compare", shard: 1, num_shards: 2, runner: "linux.gcp.a100" },
+          { config: "inductor_timm_perf_compare", shard: 2, num_shards: 2, runner: "linux.gcp.a100" },
+          { config: "inductor_torchbench_perf_compare", shard: 1, num_shards: 1, runner: "linux.gcp.a100" },
+        ]}
+
+  linux-bionic-cuda11_8-py3_10-gcc7-inductor-test:
+    name: cuda11.8-py3.10-gcc7-sm80
+    uses: ./.github/workflows/_linux-test.yml
+    needs: linux-bionic-cuda11_8-py3_10-gcc7-inductor-build
+    with:
+      build-environment: linux-bionic-cuda11.8-py3.10-gcc7-sm80
+      docker-image: ${{ needs.linux-bionic-cuda11_8-py3_10-gcc7-inductor-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-bionic-cuda11_8-py3_10-gcc7-inductor-build.outputs.test-matrix }}
+      use-gha: anything-non-empty-to-use-gha
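
Because the workflow also declares workflow_dispatch, it can be started by hand as well; a sketch using the GitHub CLI, assuming gh is installed and authenticated and the workflow file exists on the default branch:

  gh workflow run inductor-perf-compare.yml --ref <branch-or-tag>

The github.event_name == 'workflow_dispatch' term in the concurrency group keeps such manual runs from cancelling, or being cancelled by, tag-triggered runs of the same workflow.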

@@ -1,4 +1,4 @@
-name: inductor-A100-perf
+name: inductor-A100-perf-nightly
 on:
   schedule:

@@ -2,7 +2,7 @@ name: Upload torch dynamo performance stats
 on:
   workflow_run:
-    workflows: [inductor-A100-perf]
+    workflows: [inductor-A100-perf-nightly]
     types:
       - completed
     branches:

@@ -1934,7 +1934,9 @@ def run(runner, args, original_dir=None):
     if args.unspecialize_int:
         torch._dynamo.config.specialize_int = False
     if args.ci:
-        args.repeat = 2
+        if args.accuracy:
+            # Run fewer iterations when checking accuracy
+            args.repeat = 2
     if args.dynamic_ci_skips_only:
         # Test only the incremental set of jobs whose skipped was
         # caused solely by turning on dynamic shapes
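
The effect of the common.py change, sketched as two CI invocations (--ci, --accuracy, and --performance are existing benchmark-harness options; the exact default repeat count is whatever the harness ships with):

  # Accuracy run: args.ci and args.accuracy are both set, so repeat drops to 2.
  python benchmarks/dynamo/huggingface.py --ci --accuracy --training --amp --inductor
  # Performance run: args.accuracy is unset, so repeat keeps the harness default,
  # which gives more stable speedup numbers for the perf comparison.
  python benchmarks/dynamo/huggingface.py --ci --performance --training --amp --inductor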