mirror of
https://github.com/saymrwulf/pytorch.git
synced 2026-05-14 20:57:59 +00:00
[CI] Add a workflow for quick perf comparison (#96166)
Summary: ciflow/inductor-perf-test-nightly now contains the full dashboard run, which takes a very long time. Ed proposed a simplification of the perf run there, but it is still worth having a set of fast perf tests that only includes one configuration (--training --amp). Pull Request resolved: https://github.com/pytorch/pytorch/pull/96166 Approved by: https://github.com/huydhn, https://github.com/weiwangmeta
This commit is contained in:
parent
4a1b971748
commit
b3a079810e
6 changed files with 54 additions and 6 deletions
|
|
@ -298,10 +298,16 @@ test_single_dynamo_benchmark() {
|
|||
|
||||
local partition_flags=()
|
||||
if [[ -n "$NUM_TEST_SHARDS" && -n "$shard_id" ]]; then
|
||||
partition_flags=( --total-partitions 2 --partition-id "$shard_id" )
|
||||
partition_flags=( --total-partitions "$NUM_TEST_SHARDS" --partition-id "$shard_id" )
|
||||
fi
|
||||
|
||||
if [[ "${TEST_CONFIG}" == *perf* ]]; then
|
||||
if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then
|
||||
python "benchmarks/dynamo/$suite.py" \
|
||||
--ci --performance --disable-cudagraphs \
|
||||
"${DYNAMO_BENCHMARK_FLAGS[@]}" \
|
||||
"$@" "${partition_flags[@]}" \
|
||||
--output "$TEST_REPORTS_DIR/${name}_${suite}.csv"
|
||||
elif [[ "${TEST_CONFIG}" == *perf* ]]; then
|
||||
# MKL_THREADING_LAYER=GNU to mitigate https://github.com/pytorch/pytorch/issues/37377
|
||||
MKL_THREADING_LAYER=GNU python benchmarks/dynamo/runner.py --suites="$suite" \
|
||||
--base-sha="$BASE_SHA" --output-dir="$TEST_REPORTS_DIR" "${partition_flags[@]}" \
|
||||
|
|
@ -325,7 +331,9 @@ test_dynamo_benchmark() {
|
|||
local shard_id="$1"
|
||||
shift
|
||||
|
||||
if [[ "${TEST_CONFIG}" == *perf* ]]; then
|
||||
if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then
|
||||
test_single_dynamo_benchmark "amp" "$suite" "$shard_id" --training --amp "$@"
|
||||
elif [[ "${TEST_CONFIG}" == *perf* ]]; then
|
||||
# Performance test training only, for float32 and amp
|
||||
test_single_dynamo_benchmark "amp" "$suite" "$shard_id" --training --dtypes=amp "$@"
|
||||
test_single_dynamo_benchmark "float32" "$suite" "$shard_id" --training --dtypes=float32 "$@"
|
||||
|
|
|
|||
1
.github/pytorch-probot.yml
vendored
1
.github/pytorch-probot.yml
vendored
|
|
@ -6,6 +6,7 @@ ciflow_push_tags:
|
|||
- ciflow/binaries_libtorch
|
||||
- ciflow/binaries_wheel
|
||||
- ciflow/inductor
|
||||
- ciflow/inductor-perf-compare
|
||||
- ciflow/inductor-perf-test-nightly
|
||||
- ciflow/mps
|
||||
- ciflow/nightly
|
||||
|
|
|
|||
37
.github/workflows/inductor-perf-compare.yml
vendored
Normal file
37
.github/workflows/inductor-perf-compare.yml
vendored
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
name: inductor-A100-perf-compare
|
||||
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- ciflow/inductor-perf-compare/*
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
linux-bionic-cuda11_8-py3_10-gcc7-inductor-build:
|
||||
name: cuda11.8-py3.10-gcc7-sm80
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
with:
|
||||
build-environment: linux-bionic-cuda11.8-py3.10-gcc7-sm80
|
||||
docker-image-name: pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc7
|
||||
cuda-arch-list: '8.0'
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "inductor_huggingface_perf_compare", shard: 1, num_shards: 1, runner: "linux.gcp.a100" },
|
||||
{ config: "inductor_timm_perf_compare", shard: 1, num_shards: 2, runner: "linux.gcp.a100" },
|
||||
{ config: "inductor_timm_perf_compare", shard: 2, num_shards: 2, runner: "linux.gcp.a100" },
|
||||
{ config: "inductor_torchbench_perf_compare", shard: 1, num_shards: 1, runner: "linux.gcp.a100" },
|
||||
]}
|
||||
|
||||
linux-bionic-cuda11_8-py3_10-gcc7-inductor-test:
|
||||
name: cuda11.8-py3.10-gcc7-sm80
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: linux-bionic-cuda11_8-py3_10-gcc7-inductor-build
|
||||
with:
|
||||
build-environment: linux-bionic-cuda11.8-py3.10-gcc7-sm80
|
||||
docker-image: ${{ needs.linux-bionic-cuda11_8-py3_10-gcc7-inductor-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-bionic-cuda11_8-py3_10-gcc7-inductor-build.outputs.test-matrix }}
|
||||
use-gha: anything-non-empty-to-use-gha
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
name: inductor-A100-perf
|
||||
name: inductor-A100-perf-nightly
|
||||
|
||||
on:
|
||||
schedule:
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@ name: Upload torch dynamo performance stats
|
|||
|
||||
on:
|
||||
workflow_run:
|
||||
workflows: [inductor-A100-perf]
|
||||
workflows: [inductor-A100-perf-nightly]
|
||||
types:
|
||||
- completed
|
||||
branches:
|
||||
|
|
|
|||
|
|
@ -1934,7 +1934,9 @@ def run(runner, args, original_dir=None):
|
|||
if args.unspecialize_int:
|
||||
torch._dynamo.config.specialize_int = False
|
||||
if args.ci:
|
||||
args.repeat = 2
|
||||
if args.accuracy:
|
||||
# Run fewer iterations when checking accuracy
|
||||
args.repeat = 2
|
||||
if args.dynamic_ci_skips_only:
|
||||
# Test only the incremental set of jobs whose skipped was
|
||||
# caused solely by turning on dynamic shapes
|
||||
|
|
|
|||
Loading…
Reference in a new issue