From 21dccbca62c88b796ba5730115141e69b8b2e6ab Mon Sep 17 00:00:00 2001 From: Sam Estep Date: Mon, 8 Feb 2021 10:12:37 -0800 Subject: [PATCH] Revert D26232345: [pytorch][PR] Report test time regressions Test Plan: revert-hammer Differential Revision: D26232345 (https://github.com/pytorch/pytorch/commit/7467f90b131ab28340d1b6c1f565fc4291424af1) Original commit changeset: b687b1737519 fbshipit-source-id: 10a031c5500b083f7c82f2ae2743b671c5a07bff --- .circleci/config.yml | 4 +- .../job-specs/pytorch-job-specs.yml | 4 +- mypy.ini | 6 - test/print_test_stats.py | 254 +++++ test/test_testing.py | 452 --------- tools/code_coverage/package/oss/init.py | 1 + torch/testing/_internal/print_test_stats.py | 864 ------------------ 7 files changed, 259 insertions(+), 1326 deletions(-) create mode 100755 test/print_test_stats.py delete mode 100755 torch/testing/_internal/print_test_stats.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 7420b41939d..ca26c432e08 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -642,9 +642,9 @@ jobs: export CIRCLE_JOB="$CIRCLE_JOB" export CIRCLE_WORKFLOW_ID="$CIRCLE_WORKFLOW_ID" cd workspace - python torch/testing/_internal/print_test_stats.py --upload-to-s3 --compare-with-s3 test + python test/print_test_stats.py --upload-to-s3 test EOL - echo "(cat docker_commands.sh | docker exec -u jenkins -e LANG=C.UTF-8 -i "$id" bash) 2>&1" > command.sh + echo "(cat docker_commands.sh | docker exec -u jenkins -i "$id" bash) 2>&1" > command.sh unbuffer bash command.sh | ts echo "Retrieving test reports" diff --git a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml index bf70df71e98..99b327c275a 100644 --- a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml +++ b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml @@ -201,9 +201,9 @@ jobs: export CIRCLE_JOB="$CIRCLE_JOB" export CIRCLE_WORKFLOW_ID="$CIRCLE_WORKFLOW_ID" cd workspace - python torch/testing/_internal/print_test_stats.py --upload-to-s3 --compare-with-s3 test + python test/print_test_stats.py --upload-to-s3 test EOL - echo "(cat docker_commands.sh | docker exec -u jenkins -e LANG=C.UTF-8 -i "$id" bash) 2>&1" > command.sh + echo "(cat docker_commands.sh | docker exec -u jenkins -i "$id" bash) 2>&1" > command.sh unbuffer bash command.sh | ts echo "Retrieving test reports" diff --git a/mypy.ini b/mypy.ini index ae1547b6c6e..65187724bfd 100644 --- a/mypy.ini +++ b/mypy.ini @@ -242,9 +242,3 @@ ignore_missing_imports = True [mypy-mypy.*] ignore_missing_imports = True - -[mypy-xml.*] -ignore_missing_imports = True - -[mypy-boto3.*] -ignore_missing_imports = True diff --git a/test/print_test_stats.py b/test/print_test_stats.py new file mode 100755 index 00000000000..95b267028fa --- /dev/null +++ b/test/print_test_stats.py @@ -0,0 +1,254 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Read and print test results statistics +from xml.dom import minidom +from glob import glob +import bz2 +import json +import os +import statistics +import time + +import boto3 +import datetime +import requests + +class TestCase: + def __init__(self, dom): + self.class_name = str(dom.attributes['classname'].value) + self.name = str(dom.attributes['name'].value) + self.time = float(dom.attributes['time'].value) + self.errored = len(dom.getElementsByTagName('error')) > 0 + self.failed = len(dom.getElementsByTagName('failure')) > 0 + self.skipped = len(dom.getElementsByTagName('skipped')) > 0 + + +class TestSuite: + def __init__(self, 
name): + self.name = name + self.test_cases = [] + self.failed_count = 0 + self.skipped_count = 0 + self.errored_count = 0 + self.total_time = 0.0 + + def __repr__(self): + rc = f'{self.name} run_time: {self.total_time:.2f} tests: {len(self.test_cases)}' + if self.skipped_count > 0: + rc += f' skipped: {self.skipped_count}' + return f'TestSuite({rc})' + + def append(self, test_case): + self.test_cases.append(test_case) + self.total_time += test_case.time + self.failed_count += 1 if test_case.failed else 0 + self.skipped_count += 1 if test_case.skipped else 0 + self.errored_count += 1 if test_case.errored else 0 + + def print_report(self, num_longest=3): + sorted_tests = sorted(self.test_cases, key=lambda x: x.time) + test_count = len(sorted_tests) + print(f"class {self.name}:") + print(f" tests: {test_count} failed: {self.failed_count} skipped: {self.skipped_count} errored: {self.errored_count}") + print(f" run_time: {self.total_time:.2f} seconds") + print(f" avg_time: {self.total_time/test_count:.2f} seconds") + if test_count >= 2: + print(f" median_time: {statistics.median(x.time for x in sorted_tests):.2f} seconds") + sorted_tests = sorted_tests[-num_longest:] + print(f" {len(sorted_tests)} longest tests:") + for test in reversed(sorted_tests): + print(f" {test.name} time: {test.time:.2f} seconds") + print("") + + + +def parse_report(path): + dom = minidom.parse(path) + for test_case in dom.getElementsByTagName('testcase'): + yield TestCase(test_case) + +def parse_reports(folder): + reports = glob(os.path.join(folder, '**', '*.xml'), recursive=True) + tests_by_class = dict() + for report in reports: + for test_case in parse_report(report): + class_name = test_case.class_name + if class_name not in tests_by_class: + tests_by_class[class_name] = TestSuite(class_name) + tests_by_class[class_name].append(test_case) + return tests_by_class + +def build_info(): + return { + "build_pr": os.environ.get("CIRCLE_PR_NUMBER"), + "build_tag": os.environ.get("CIRCLE_TAG"), + "build_sha1": os.environ.get("CIRCLE_SHA1"), + "build_branch": os.environ.get("CIRCLE_BRANCH"), + "build_job": os.environ.get("CIRCLE_JOB"), + "build_workflow_id": os.environ.get("CIRCLE_WORKFLOW_ID"), + } + +def build_message(test_case): + return { + "normal": { + **build_info(), + "test_suite_name": test_case.class_name, + "test_case_name": test_case.name, + }, + "int": { + "time": int(time.time()), + "test_total_count": 1, + "test_total_time": int(test_case.time * 1000), + "test_failed_count": 1 if test_case.failed > 0 else 0, + "test_skipped_count": 1 if test_case.skipped > 0 else 0, + "test_errored_count": 1 if test_case.errored > 0 else 0, + }, + } + +def send_report_to_scribe(reports): + access_token = os.environ.get("SCRIBE_GRAPHQL_ACCESS_TOKEN") + + if not access_token: + print("No scribe access token provided, skip sending report!") + return + print("Scribe access token provided, sending report...") + url = "https://graph.facebook.com/scribe_logs" + r = requests.post( + url, + data={ + "access_token": access_token, + "logs": json.dumps( + [ + { + "category": "perfpipe_pytorch_test_times", + "message": json.dumps(build_message(test_case)), + "line_escape": False, + } + for name in sorted(reports.keys()) + for test_case in reports[name].test_cases + ] + ), + }, + ) + r.raise_for_status() + +def send_report_to_s3(reports, *, total_seconds): + job = os.environ.get('CIRCLE_JOB') + sha1 = os.environ.get('CIRCLE_SHA1') + branch = os.environ.get('CIRCLE_BRANCH', '') + if branch not in ['master', 'nightly'] and not 
branch.startswith("release/"): + print("S3 upload only enabled on master, nightly and release branches.") + print(f"skipping test report on branch: {branch}") + return + now = datetime.datetime.utcnow().isoformat() + key = f'test_time/{sha1}/{job}/{now}Z.json.bz2' # Z meaning UTC + s3 = boto3.resource('s3') + try: + s3.get_bucket_acl(Bucket='ossci-metrics') + except Exception as e: + print(f"AWS ACL failed: {e}") + print("AWS credential found, uploading to S3...") + + obj = s3.Object('ossci-metrics', key) + print("") + # use bz2 because the results are smaller than gzip, and the + # compression time penalty we pay is only about half a second for + # input files of a few megabytes in size like these JSON files, and + # because for some reason zlib doesn't seem to play nice with the + # gunzip command whereas Python's bz2 does work with bzip2 + obj.put(Body=bz2.compress(json.dumps({ + **build_info(), + 'total_seconds': total_seconds, + 'suites': { + name: { + 'total_seconds': suite.total_time, + 'cases': [ + { + 'name': case.name, + 'seconds': case.time, + 'errored': case.errored, + 'failed': case.failed, + 'skipped': case.skipped, + } + for case in suite.test_cases + ], + } + for name, suite in reports.items() + } + }).encode())) + +def positive_integer(value): + parsed = int(value) + if parsed < 1: + raise argparse.ArgumentTypeError(f"{value} is not a natural number") + return parsed + +def positive_float(value): + parsed = float(value) + if parsed <= 0.0: + raise argparse.ArgumentTypeError(f"{value} is not a positive rational number") + return parsed + +if __name__ == '__main__': + import argparse + import sys + parser = argparse.ArgumentParser( + "Print statistics from test XML output.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "--longest-of-class", + type=positive_integer, + default=3, + metavar="N", + help="how many longest tests to show for each class", + ) + parser.add_argument( + "--class-print-threshold", + type=positive_float, + default=1.0, + metavar="N", + help="Minimal total time to warrant class report", + ) + parser.add_argument( + "--longest-of-run", + type=positive_integer, + default=10, + metavar="N", + help="how many longest tests to show from the entire run", + ) + parser.add_argument( + "--upload-to-s3", + action="store_true", + help="upload test time to S3 bucket", + ) + parser.add_argument( + "folder", + help="test report folder", + ) + args = parser.parse_args() + + reports = parse_reports(args.folder) + if len(reports) == 0: + print(f"No test reports found in {args.folder}") + sys.exit(0) + + send_report_to_scribe(reports) + + longest_tests = [] + total_time = 0 + for name in sorted(reports.keys()): + test_suite = reports[name] + if test_suite.total_time >= args.class_print_threshold: + test_suite.print_report(args.longest_of_class) + total_time += test_suite.total_time + longest_tests.extend(test_suite.test_cases) + longest_tests = sorted(longest_tests, key=lambda x: x.time)[-args.longest_of_run:] + + if args.upload_to_s3: + send_report_to_s3(reports, total_seconds=total_time) + + print(f"Total runtime is {datetime.timedelta(seconds=int(total_time))}") + print(f"{len(longest_tests)} longest tests of entire run:") + for test_case in reversed(longest_tests): + print(f" {test_case.class_name}.{test_case.name} time: {test_case.time:.2f} seconds") diff --git a/test/test_testing.py b/test/test_testing.py index 05d1e135c53..280a312914e 100644 --- a/test/test_testing.py +++ b/test/test_testing.py @@ -8,7 +8,6 @@ from 
torch.testing._internal.common_utils import \ from torch.testing._internal.common_device_type import \ (instantiate_device_type_tests, onlyCUDA, onlyOnCPUAndCUDA, dtypes) from torch.testing._internal import mypy_wrapper -from torch.testing._internal import print_test_stats # For testing TestCase methods and torch.testing functions class TestTesting(TestCase): @@ -546,456 +545,5 @@ class TestMypyWrapper(TestCase): )) -def fakehash(char): - return char * 40 - - -def makecase(name, seconds, *, errored=False, failed=False, skipped=False): - return { - 'name': name, - 'seconds': seconds, - 'errored': errored, - 'failed': failed, - 'skipped': skipped, - } - - -def makereport(tests): - suites = { - suite_name: { - 'total_seconds': sum(case['seconds'] for case in cases), - 'cases': cases, - } - for suite_name, cases in tests.items() - } - return { - 'total_seconds': sum(s['total_seconds'] for s in suites.values()), - 'suites': suites, - } - - -class TestPrintTestStats(TestCase): - maxDiff = None - - def test_analysis(self): - head_report = makereport({ - # input ordering of the suites is ignored - 'Grault': [ - # not printed: status same and time similar - makecase('test_grault0', 4.78, failed=True), - # status same, but time increased a lot - makecase('test_grault2', 1.473, errored=True), - ], - # individual tests times changed, not overall suite - 'Qux': [ - # input ordering of the test cases is ignored - makecase('test_qux1', 0.001, skipped=True), - makecase('test_qux6', 0.002, skipped=True), - # time in bounds, but status changed - makecase('test_qux4', 7.158, failed=True), - # not printed because it's the same as before - makecase('test_qux7', 0.003, skipped=True), - makecase('test_qux5', 11.968), - makecase('test_qux3', 23.496), - ], - # new test suite - 'Bar': [ - makecase('test_bar2', 3.742, failed=True), - makecase('test_bar1', 50.447), - ], - # overall suite time changed but no individual tests - 'Norf': [ - makecase('test_norf1', 3), - makecase('test_norf2', 3), - makecase('test_norf3', 3), - makecase('test_norf4', 3), - ], - # suite doesn't show up if it doesn't change enough - 'Foo': [ - makecase('test_foo1', 42), - makecase('test_foo2', 56), - ], - }) - - base_reports = { - # bbbb has no reports, so base is cccc instead - fakehash('b'): [], - fakehash('c'): [ - makereport({ - 'Baz': [ - makecase('test_baz2', 13.605), - # no recent suites have & skip this test - makecase('test_baz1', 0.004, skipped=True), - ], - 'Foo': [ - makecase('test_foo1', 43), - # test added since dddd - makecase('test_foo2', 57), - ], - 'Grault': [ - makecase('test_grault0', 4.88, failed=True), - makecase('test_grault1', 11.967, failed=True), - makecase('test_grault2', 0.395, errored=True), - makecase('test_grault3', 30.460), - ], - 'Norf': [ - makecase('test_norf1', 2), - makecase('test_norf2', 2), - makecase('test_norf3', 2), - makecase('test_norf4', 2), - ], - 'Qux': [ - makecase('test_qux3', 4.978, errored=True), - makecase('test_qux7', 0.002, skipped=True), - makecase('test_qux2', 5.618), - makecase('test_qux4', 7.766, errored=True), - makecase('test_qux6', 23.589, failed=True), - ], - }), - ], - fakehash('d'): [ - makereport({ - 'Foo': [ - makecase('test_foo1', 40), - # removed in cccc - makecase('test_foo3', 17), - ], - 'Baz': [ - # not skipped, so not included in stdev - makecase('test_baz1', 3.14), - ], - 'Qux': [ - makecase('test_qux7', 0.004, skipped=True), - makecase('test_qux2', 6.02), - makecase('test_qux4', 20.932), - ], - 'Norf': [ - makecase('test_norf1', 3), - makecase('test_norf2', 3), - 
makecase('test_norf3', 3), - makecase('test_norf4', 3), - ], - 'Grault': [ - makecase('test_grault0', 5, failed=True), - makecase('test_grault1', 14.325, failed=True), - makecase('test_grault2', 0.31, errored=True), - ], - }), - ], - fakehash('e'): [], - fakehash('f'): [ - makereport({ - 'Foo': [ - makecase('test_foo3', 24), - makecase('test_foo1', 43), - ], - 'Baz': [ - makecase('test_baz2', 16.857), - ], - 'Qux': [ - makecase('test_qux2', 6.422), - makecase('test_qux4', 6.382, errored=True), - ], - 'Norf': [ - makecase('test_norf1', 0.9), - makecase('test_norf3', 0.9), - makecase('test_norf2', 0.9), - makecase('test_norf4', 0.9), - ], - 'Grault': [ - makecase('test_grault0', 4.7, failed=True), - makecase('test_grault1', 13.146, failed=True), - makecase('test_grault2', 0.48, errored=True), - ], - }), - ], - } - - simpler_head = print_test_stats.simplify(head_report) - simpler_base = {} - for commit, reports in base_reports.items(): - simpler_base[commit] = [print_test_stats.simplify(r) for r in reports] - analysis = print_test_stats.analyze( - head_report=simpler_head, - base_reports=simpler_base, - ) - - self.assertEqual( - '''\ - -- class Baz: -- # was 15.23s ± 2.30s -- -- def test_baz1: ... -- # was 0.004s (skipped) -- -- def test_baz2: ... -- # was 15.231s ± 2.300s - - - class Grault: - # was 48.86s ± 1.19s - # now 6.25s - - - def test_grault1: ... - - # was 13.146s ± 1.179s (failed) - - - def test_grault3: ... - - # was 30.460s - - - class Qux: - # was 41.66s ± 1.06s - # now 42.63s - - - def test_qux2: ... - - # was 6.020s ± 0.402s - - ! def test_qux3: ... - ! # was 4.978s (errored) - ! # now 23.496s - - ! def test_qux4: ... - ! # was 7.074s ± 0.979s (errored) - ! # now 7.158s (failed) - - ! def test_qux6: ... - ! # was 23.589s (failed) - ! # now 0.002s (skipped) - - + def test_qux1: ... - + # now 0.001s (skipped) - - + def test_qux5: ... - + # now 11.968s - - -+ class Bar: -+ # now 54.19s -+ -+ def test_bar1: ... -+ # now 50.447s -+ -+ def test_bar2: ... 
-+ # now 3.742s (failed) - -''', - print_test_stats.anomalies(analysis), - ) - - def test_graph(self): - # HEAD is on master - self.assertEqual( - '''\ -Commit graph (base is most recent master ancestor with at least one S3 report): - - : (master) - | - * aaaaaaaaaa (HEAD) total time 502.99s - * bbbbbbbbbb (base) 1 report, total time 47.84s - * cccccccccc 1 report, total time 332.50s - * dddddddddd 0 reports - | - : -''', - print_test_stats.graph( - head_sha=fakehash('a'), - head_seconds=502.99, - base_seconds={ - fakehash('b'): [47.84], - fakehash('c'): [332.50], - fakehash('d'): [], - }, - on_master=True, - ) - ) - - self.assertEqual( - '''\ -Commit graph (base is most recent master ancestor with at least one S3 report): - - : (master) - | - | * aaaaaaaaaa (HEAD) total time 9988.77s - |/ - * bbbbbbbbbb (base) 121 reports, total time 7654.32s ± 55.55s - * cccccccccc 20 reports, total time 5555.55s ± 253.19s - * dddddddddd 1 report, total time 1234.56s - | - : -''', - print_test_stats.graph( - head_sha=fakehash('a'), - head_seconds=9988.77, - base_seconds={ - fakehash('b'): [7598.77] * 60 + [7654.32] + [7709.87] * 60, - fakehash('c'): [5308.77] * 10 + [5802.33] * 10, - fakehash('d'): [1234.56], - }, - on_master=False, - ) - ) - - self.assertEqual( - '''\ -Commit graph (base is most recent master ancestor with at least one S3 report): - - : (master) - | - | * aaaaaaaaaa (HEAD) total time 25.52s - | | - | : (5 commits) - |/ - * bbbbbbbbbb 0 reports - * cccccccccc 0 reports - * dddddddddd (base) 15 reports, total time 58.92s ± 25.82s - | - : -''', - print_test_stats.graph( - head_sha=fakehash('a'), - head_seconds=25.52, - base_seconds={ - fakehash('b'): [], - fakehash('c'): [], - fakehash('d'): [52.25] * 14 + [152.26], - }, - on_master=False, - ancestry_path=5, - ) - ) - - self.assertEqual( - '''\ -Commit graph (base is most recent master ancestor with at least one S3 report): - - : (master) - | - | * aaaaaaaaaa (HEAD) total time 0.08s - |/| - | : (1 commit) - | - * bbbbbbbbbb 0 reports - * cccccccccc (base) 1 report, total time 0.09s - * dddddddddd 3 reports, total time 0.10s ± 0.05s - | - : -''', - print_test_stats.graph( - head_sha=fakehash('a'), - head_seconds=0.08, - base_seconds={ - fakehash('b'): [], - fakehash('c'): [0.09], - fakehash('d'): [0.05, 0.10, 0.15], - }, - on_master=False, - other_ancestors=1, - ) - ) - - self.assertEqual( - '''\ -Commit graph (base is most recent master ancestor with at least one S3 report): - - : (master) - | - | * aaaaaaaaaa (HEAD) total time 5.98s - | | - | : (1 commit) - |/| - | : (7 commits) - | - * bbbbbbbbbb (base) 2 reports, total time 6.02s ± 1.71s - * cccccccccc 0 reports - * dddddddddd 10 reports, total time 5.84s ± 0.92s - | - : -''', - print_test_stats.graph( - head_sha=fakehash('a'), - head_seconds=5.98, - base_seconds={ - fakehash('b'): [4.81, 7.23], - fakehash('c'): [], - fakehash('d'): [4.97] * 5 + [6.71] * 5, - }, - on_master=False, - ancestry_path=1, - other_ancestors=7, - ) - ) - - def test_regression_info(self): - self.assertEqual( - '''\ ------ Historic stats comparison result ------ - - job: foo_job - commit: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa - - - class Foo: - # was 42.50s ± 2.12s - # now 3.02s - - - def test_bar: ... - - # was 1.000s - - ! def test_foo: ... - ! # was 41.500s ± 2.121s - ! # now 0.020s (skipped) - - + def test_baz: ... 
- + # now 3.000s - - -Commit graph (base is most recent master ancestor with at least one S3 report): - - : (master) - | - | * aaaaaaaaaa (HEAD) total time 3.02s - |/ - * bbbbbbbbbb (base) 1 report, total time 41.00s - * cccccccccc 1 report, total time 43.00s - | - : - -Removed (across 1 suite) 1 test, totaling - 1.00s -Modified (across 1 suite) 1 test, totaling - 41.48s ± 2.12s -Added (across 1 suite) 1 test, totaling + 3.00s -''', - print_test_stats.regression_info( - head_sha=fakehash('a'), - head_report=makereport({ - 'Foo': [ - makecase('test_foo', 0.02, skipped=True), - makecase('test_baz', 3), - ]}), - base_reports={ - fakehash('b'): [ - makereport({ - 'Foo': [ - makecase('test_foo', 40), - makecase('test_bar', 1), - ], - }), - ], - fakehash('c'): [ - makereport({ - 'Foo': [ - makecase('test_foo', 43), - ], - }), - ], - }, - job_name='foo_job', - on_master=False, - ancestry_path=0, - other_ancestors=0, - ) - ) - - if __name__ == '__main__': run_tests() diff --git a/tools/code_coverage/package/oss/init.py b/tools/code_coverage/package/oss/init.py index 33eceed125b..53169b1c332 100644 --- a/tools/code_coverage/package/oss/init.py +++ b/tools/code_coverage/package/oss/init.py @@ -31,6 +31,7 @@ from .utils import ( BLOCKED_PYTHON_TESTS = { "run_test.py", + "print_test_stats.py", "test_dataloader.py", "test_multiprocessing.py", "test_multiprocessing_spawn.py", diff --git a/torch/testing/_internal/print_test_stats.py b/torch/testing/_internal/print_test_stats.py deleted file mode 100755 index b33ca5623d5..00000000000 --- a/torch/testing/_internal/print_test_stats.py +++ /dev/null @@ -1,864 +0,0 @@ -#!/usr/bin/env python - -import bz2 -import datetime -import json -import math -import os -import statistics -import subprocess -import time -from collections import defaultdict -from glob import glob -from pathlib import Path -from typing import DefaultDict, Dict, Iterable, List, Optional, Tuple -from xml.dom import minidom - -import requests -from typing_extensions import TypedDict - -try: - import boto3 - HAVE_BOTO3 = True -except ImportError: - HAVE_BOTO3 = False - -Commit = str # 40-digit SHA-1 hex string -Status = Optional[str] # errored, failed, skipped, or None - -# represent suite as dict because indexing is useful -SimplerCase = Tuple[float, Status] -SimplerSuite = Dict[str, SimplerCase] -SimplerReport = Dict[str, SimplerSuite] - - -class Case(TypedDict): - name: str - seconds: float - errored: bool - failed: bool - skipped: bool - - -class Suite(TypedDict): - total_seconds: float - cases: List[Case] - - -class ReportMeta(TypedDict, total=False): - build_pr: str - build_tag: str - build_sha1: Commit - build_branch: str - build_job: str - build_workflow_id: str - - -class Report(ReportMeta): - total_seconds: float - suites: Dict[str, Suite] - - -class Stat(TypedDict): - center: float - spread: Optional[float] - - -class CaseDiff(TypedDict): - margin: str - name: str - was: Optional[Tuple[Stat, Status]] - now: Optional[SimplerCase] - - -class SuiteDiff(TypedDict): - margin: str - name: str - was: Optional[Stat] - now: Optional[float] - cases: List[CaseDiff] - - -def case_status(case: Case) -> Status: - for k in {'errored', 'failed', 'skipped'}: - if case[k]: # type: ignore - return k - return None - - -def simplify(report: Report) -> SimplerReport: - return { - suite_name: { - case['name']: (case['seconds'], case_status(case)) - for case in suite['cases'] - } - for suite_name, suite in report['suites'].items() - } - - -def plural(n: int) -> str: - return '' if n == 1 else 's' - - -def 
display_stat( - x: Stat, - format: Tuple[Tuple[int, int], Tuple[int, int]], -) -> str: - spread_len = format[1][0] + 1 + format[1][1] - spread = x['spread'] - if spread is not None: - spread_str = f' ± {spread:{spread_len}.{format[1][1]}f}s' - else: - spread_str = ' ' * (3 + spread_len + 1) - mean_len = format[0][0] + 1 + format[0][1] - return f'{x["center"]:{mean_len}.{format[0][1]}f}s{spread_str}' - - -def list_stat(l: List[float]) -> Stat: - return { - 'center': statistics.mean(l), - 'spread': statistics.stdev(l) if len(l) > 1 else None - } - - -def zero_stat() -> Stat: - return {'center': 0, 'spread': None} - - -def recenter(was: Stat, now: float) -> Stat: - return {'center': now - was['center'], 'spread': was['spread']} - - -def sum_normals(stats: Iterable[Stat]) -> Stat: - """ - Returns a stat corresponding to the sum of the given stats. - - Assumes that the center and spread for each of the given stats are - mean and stdev, respectively. - """ - l = list(stats) - spread: Optional[float] - if any(stat['spread'] is not None for stat in l): - spread = math.sqrt(sum((stat['spread'] or 0)**2 for stat in l)) - else: - spread = None - return { - 'center': sum(stat['center'] for stat in l), - 'spread': spread, - } - - -def format_seconds(seconds: List[float]) -> str: - if len(seconds) > 0: - x = list_stat(seconds) - return f'total time {display_stat(x, ((5, 2), (4, 2)))}'.strip() - return '' - - -def show_ancestors(num_commits: int) -> str: - return f' | : ({num_commits} commit{plural(num_commits)})' - - -def unlines(lines: List[str]) -> str: - return ''.join(f'{line}\n' for line in lines) - - -def matching_test_times( - base_reports: Dict[Commit, List[SimplerReport]], - suite_name: str, - case_name: str, - status: Status, -) -> List[float]: - times: List[float] = [] - for reports in base_reports.values(): - for report in reports: - suite = report.get(suite_name) - if suite: - case = suite.get(case_name) - if case: - t, s = case - if s == status: - times.append(t) - return times - - -def analyze( - *, - head_report: SimplerReport, - base_reports: Dict[Commit, List[SimplerReport]], -) -> List[SuiteDiff]: - # most recent master ancestor with at least one S3 report - base_sha = next(sha for sha, reports in base_reports.items() if reports) - - # find all relevant suites (those in either base or head or both) - all_reports = [head_report] + base_reports[base_sha] - all_suites = {k for r in all_reports for k in r.keys()} - - removed_suites: List[SuiteDiff] = [] - modified_suites: List[SuiteDiff] = [] - added_suites: List[SuiteDiff] = [] - - for suite_name in sorted(all_suites): - case_diffs: List[CaseDiff] = [] - head_suite = head_report.get(suite_name) - base_cases: Dict[str, Status] = dict(sorted(set.intersection(*[ - {(n, s) for n, (_, s) in report.get(suite_name, {}).items()} - for report in base_reports[base_sha] - ] or [set()]))) - case_stats: Dict[str, Stat] = {} - if head_suite: - now = sum(case[0] for case in head_suite.values()) - if any(suite_name in report for report in base_reports[base_sha]): - removed_cases: List[CaseDiff] = [] - for case_name, case_status in base_cases.items(): - case_stats[case_name] = list_stat(matching_test_times( - base_reports, - suite_name, - case_name, - case_status, - )) - if case_name not in head_suite: - removed_cases.append({ - 'margin': '-', - 'name': case_name, - 'was': (case_stats[case_name], case_status), - 'now': None, - }) - modified_cases: List[CaseDiff] = [] - added_cases: List[CaseDiff] = [] - for head_case_name in sorted(head_suite): - 
head_case = head_suite[head_case_name] - if head_case_name in base_cases: - stat = case_stats[head_case_name] - base_status = base_cases[head_case_name] - if head_case[1] != base_status: - modified_cases.append({ - 'margin': '!', - 'name': head_case_name, - 'was': (stat, base_status), - 'now': head_case, - }) - else: - added_cases.append({ - 'margin': '+', - 'name': head_case_name, - 'was': None, - 'now': head_case, - }) - # there might be a bug calculating this stdev, not sure - was = sum_normals(case_stats.values()) - case_diffs = removed_cases + modified_cases + added_cases - if case_diffs: - modified_suites.append({ - 'margin': ' ', - 'name': suite_name, - 'was': was, - 'now': now, - 'cases': case_diffs, - }) - else: - for head_case_name in sorted(head_suite): - head_case = head_suite[head_case_name] - case_diffs.append({ - 'margin': ' ', - 'name': head_case_name, - 'was': None, - 'now': head_case, - }) - added_suites.append({ - 'margin': '+', - 'name': suite_name, - 'was': None, - 'now': now, - 'cases': case_diffs, - }) - else: - for case_name, case_status in base_cases.items(): - case_stats[case_name] = list_stat(matching_test_times( - base_reports, - suite_name, - case_name, - case_status, - )) - case_diffs.append({ - 'margin': ' ', - 'name': case_name, - 'was': (case_stats[case_name], case_status), - 'now': None, - }) - removed_suites.append({ - 'margin': '-', - 'name': suite_name, - # there might be a bug calculating this stdev, not sure - 'was': sum_normals(case_stats.values()), - 'now': None, - 'cases': case_diffs, - }) - - return removed_suites + modified_suites + added_suites - - -def case_diff_lines(diff: CaseDiff) -> List[str]: - lines = [f'def {diff["name"]}: ...'] - - case_fmt = ((3, 3), (2, 3)) - - was = diff['was'] - if was: - was_line = f' # was {display_stat(was[0], case_fmt)}' - was_status = was[1] - if was_status: - was_line += f' ({was_status})' - lines.append(was_line) - - now = diff['now'] - if now: - now_stat: Stat = {'center': now[0], 'spread': None} - now_line = f' # now {display_stat(now_stat, case_fmt)}' - now_status = now[1] - if now_status: - now_line += f' ({now_status})' - lines.append(now_line) - - return [''] + [f'{diff["margin"]} {l}' for l in lines] - - -def display_suite_diff(diff: SuiteDiff) -> str: - lines = [f'class {diff["name"]}:'] - - suite_fmt = ((4, 2), (3, 2)) - - was = diff['was'] - if was: - lines.append(f' # was {display_stat(was, suite_fmt)}') - - now = diff['now'] - if now is not None: - now_stat: Stat = {'center': now, 'spread': None} - lines.append(f' # now {display_stat(now_stat, suite_fmt)}') - - for case_diff in diff['cases']: - lines.extend([f' {l}' for l in case_diff_lines(case_diff)]) - - return unlines([''] + [f'{diff["margin"]} {l}'.rstrip() for l in lines] + ['']) - - -def anomalies(diffs: List[SuiteDiff]) -> str: - return ''.join(map(display_suite_diff, diffs)) - - -def graph( - *, - head_sha: Commit, - head_seconds: float, - base_seconds: Dict[Commit, List[float]], - on_master: bool, - ancestry_path: int = 0, - other_ancestors: int = 0, -) -> str: - lines = [ - 'Commit graph (base is most recent master ancestor with at least one S3 report):', - '', - ' : (master)', - ' |', - ] - - head_time_str = f' {format_seconds([head_seconds])}' - if on_master: - lines.append(f' * {head_sha[:10]} (HEAD) {head_time_str}') - else: - lines.append(f' | * {head_sha[:10]} (HEAD) {head_time_str}') - - if ancestry_path > 0: - lines += [ - ' | |', - show_ancestors(ancestry_path), - ] - - if other_ancestors > 0: - lines += [ - ' |/|', - 
show_ancestors(other_ancestors), - ' |', - ] - else: - lines.append(' |/') - - is_first = True - for sha, seconds in base_seconds.items(): - num_runs = len(seconds) - prefix = str(num_runs).rjust(3) - base = '(base)' if is_first and num_runs > 0 else ' ' - if num_runs > 0: - is_first = False - t = format_seconds(seconds) - p = plural(num_runs) - if t: - p = f'{p}, '.ljust(3) - lines.append(f' * {sha[:10]} {base} {prefix} report{p}{t}') - - lines.extend([' |', ' :']) - - return unlines(lines) - - -def case_delta(case: CaseDiff) -> Stat: - was = case['was'] - now = case['now'] - return recenter( - was[0] if was else zero_stat(), - now[0] if now else 0, - ) - - -def display_final_stat(stat: Stat) -> str: - center = stat['center'] - spread = stat['spread'] - displayed = display_stat( - {'center': abs(center), 'spread': spread}, - ((4, 2), (3, 2)), - ) - if center < 0: - sign = '-' - elif center > 0: - sign = '+' - else: - sign = ' ' - return f'{sign}{displayed}'.rstrip() - - -def summary_line(message: str, d: DefaultDict[str, List[CaseDiff]]) -> str: - all_cases = [c for cs in d.values() for c in cs] - tests = len(all_cases) - suites = len(d) - sp = f'{plural(suites)})'.ljust(2) - tp = f'{plural(tests)},'.ljust(2) - # there might be a bug calculating this stdev, not sure - stat = sum_normals(case_delta(c) for c in all_cases) - return ''.join([ - f'{message} (across {suites:>4} suite{sp}', - f'{tests:>6} test{tp}', - f' totaling {display_final_stat(stat)}', - ]) - - -def summary(analysis: List[SuiteDiff]) -> str: - removed_tests: DefaultDict[str, List[CaseDiff]] = defaultdict(list) - modified_tests: DefaultDict[str, List[CaseDiff]] = defaultdict(list) - added_tests: DefaultDict[str, List[CaseDiff]] = defaultdict(list) - - for diff in analysis: - # the use of 'margin' here is not the most elegant - name = diff['name'] - margin = diff['margin'] - cases = diff['cases'] - if margin == '-': - removed_tests[name] += cases - elif margin == '+': - added_tests[name] += cases - else: - removed = list(filter(lambda c: c['margin'] == '-', cases)) - added = list(filter(lambda c: c['margin'] == '+', cases)) - modified = list(filter(lambda c: c['margin'] == '!', cases)) - if removed: - removed_tests[name] += removed - if added: - added_tests[name] += added - if modified: - modified_tests[name] += modified - - return unlines([ - summary_line('Removed ', removed_tests), - summary_line('Modified', modified_tests), - summary_line('Added ', added_tests), - ]) - - -def regression_info( - *, - head_sha: Commit, - head_report: Report, - base_reports: Dict[Commit, List[Report]], - job_name: str, - on_master: bool, - ancestry_path: int, - other_ancestors: int, -) -> str: - """ - Return a human-readable report describing any test time regressions. - - The head_sha and head_report args give info about the current commit - and its test times. Since Python dicts maintain insertion order - (guaranteed as part of the language spec since 3.7), the - base_reports argument must list the head's several most recent - master commits, from newest to oldest (so the merge-base is - list(base_reports)[0]). 
- """ - simpler_head = simplify(head_report) - simpler_base: Dict[Commit, List[SimplerReport]] = {} - for commit, reports in base_reports.items(): - simpler_base[commit] = [simplify(r) for r in reports] - analysis = analyze( - head_report=simpler_head, - base_reports=simpler_base, - ) - - return '\n'.join([ - unlines([ - '----- Historic stats comparison result ------', - '', - f' job: {job_name}', - f' commit: {head_sha}', - ]), - anomalies(analysis), - graph( - head_sha=head_sha, - head_seconds=head_report['total_seconds'], - base_seconds={ - c: [r['total_seconds'] for r in rs] - for c, rs in base_reports.items() - }, - on_master=on_master, - ancestry_path=ancestry_path, - other_ancestors=other_ancestors, - ), - summary(analysis), - ]) - - -class TestCase: - def __init__(self, dom) -> None: - self.class_name = str(dom.attributes['classname'].value) - self.name = str(dom.attributes['name'].value) - self.time = float(dom.attributes['time'].value) - self.errored = len(dom.getElementsByTagName('error')) > 0 - self.failed = len(dom.getElementsByTagName('failure')) > 0 - self.skipped = len(dom.getElementsByTagName('skipped')) > 0 - - -class TestSuite: - def __init__(self, name) -> None: - self.name = name - self.test_cases: List[TestCase] = [] - self.failed_count = 0 - self.skipped_count = 0 - self.errored_count = 0 - self.total_time = 0.0 - - def __repr__(self): - rc = f'{self.name} run_time: {self.total_time:.2f} tests: {len(self.test_cases)}' - if self.skipped_count > 0: - rc += f' skipped: {self.skipped_count}' - return f'TestSuite({rc})' - - def append(self, test_case): - self.test_cases.append(test_case) - self.total_time += test_case.time - self.failed_count += 1 if test_case.failed else 0 - self.skipped_count += 1 if test_case.skipped else 0 - self.errored_count += 1 if test_case.errored else 0 - - def print_report(self, num_longest=3): - sorted_tests = sorted(self.test_cases, key=lambda x: x.time) - test_count = len(sorted_tests) - print(f"class {self.name}:") - print(f" tests: {test_count} failed: {self.failed_count} skipped: {self.skipped_count} errored: {self.errored_count}") - print(f" run_time: {self.total_time:.2f} seconds") - print(f" avg_time: {self.total_time/test_count:.2f} seconds") - if test_count >= 2: - print(f" median_time: {statistics.median(x.time for x in sorted_tests):.2f} seconds") - sorted_tests = sorted_tests[-num_longest:] - print(f" {len(sorted_tests)} longest tests:") - for test in reversed(sorted_tests): - print(f" {test.name} time: {test.time:.2f} seconds") - print("") - - -def parse_report(path): - dom = minidom.parse(path) - for test_case in dom.getElementsByTagName('testcase'): - yield TestCase(test_case) - - -def parse_reports(folder): - reports = glob(os.path.join(folder, '**', '*.xml'), recursive=True) - tests_by_class = dict() - for report in reports: - for test_case in parse_report(report): - class_name = test_case.class_name - if class_name not in tests_by_class: - tests_by_class[class_name] = TestSuite(class_name) - tests_by_class[class_name].append(test_case) - return tests_by_class - - -def build_info(): - return { - "build_pr": os.environ.get("CIRCLE_PR_NUMBER"), - "build_tag": os.environ.get("CIRCLE_TAG"), - "build_sha1": os.environ.get("CIRCLE_SHA1"), - "build_branch": os.environ.get("CIRCLE_BRANCH"), - "build_job": os.environ.get("CIRCLE_JOB"), - "build_workflow_id": os.environ.get("CIRCLE_WORKFLOW_ID"), - } - - -def build_message(test_case): - return { - "normal": { - **build_info(), - "test_suite_name": test_case.class_name, - 
"test_case_name": test_case.name, - }, - "int": { - "time": int(time.time()), - "test_total_count": 1, - "test_total_time": int(test_case.time * 1000), - "test_failed_count": 1 if test_case.failed > 0 else 0, - "test_skipped_count": 1 if test_case.skipped > 0 else 0, - "test_errored_count": 1 if test_case.errored > 0 else 0, - }, - } - - -def send_report_to_scribe(reports): - access_token = os.environ.get("SCRIBE_GRAPHQL_ACCESS_TOKEN") - - if not access_token: - print("No scribe access token provided, skip sending report!") - return - print("Scribe access token provided, sending report...") - url = "https://graph.facebook.com/scribe_logs" - r = requests.post( - url, - data={ - "access_token": access_token, - "logs": json.dumps( - [ - { - "category": "perfpipe_pytorch_test_times", - "message": json.dumps(build_message(test_case)), - "line_escape": False, - } - for name in sorted(reports.keys()) - for test_case in reports[name].test_cases - ] - ), - }, - ) - r.raise_for_status() - - -def assemble_s3_object(reports, *, total_seconds): - return { - **build_info(), - 'total_seconds': total_seconds, - 'suites': { - name: { - 'total_seconds': suite.total_time, - 'cases': [ - { - 'name': case.name, - 'seconds': case.time, - 'errored': case.errored, - 'failed': case.failed, - 'skipped': case.skipped, - } - for case in suite.test_cases - ], - } - for name, suite in reports.items() - } - } - - -def send_report_to_s3(obj): - job = os.environ.get('CIRCLE_JOB') - sha1 = os.environ.get('CIRCLE_SHA1') - branch = os.environ.get('CIRCLE_BRANCH', '') - if branch not in ['master', 'nightly'] and not branch.startswith("release/"): - print("S3 upload only enabled on master, nightly and release branches.") - print(f"skipping test report on branch: {branch}") - return - now = datetime.datetime.utcnow().isoformat() - key = f'test_time/{sha1}/{job}/{now}Z.json.bz2' # Z meaning UTC - s3 = boto3.resource('s3') - try: - s3.get_bucket_acl(Bucket='ossci-metrics') - except Exception as e: - print(f"AWS ACL failed: {e}") - print("AWS credential found, uploading to S3...") - - obj = s3.Object('ossci-metrics', key) - print("") - # use bz2 because the results are smaller than gzip, and the - # compression time penalty we pay is only about half a second for - # input files of a few megabytes in size like these JSON files, and - # because for some reason zlib doesn't seem to play nice with the - # gunzip command whereas Python's bz2 does work with bzip2 - obj.put(Body=bz2.compress(json.dumps(obj).encode())) - - -def print_regressions(obj, *, num_prev_commits): - sha1 = os.environ.get("CIRCLE_SHA1", "HEAD") - - base = subprocess.check_output( - ["git", "merge-base", sha1, "origin/master"], - encoding="ascii", - ).strip() - - count_spec = f"{base}..{sha1}" - intermediate_commits = int(subprocess.check_output( - ["git", "rev-list", "--count", count_spec], - encoding="ascii" - )) - ancestry_path = int(subprocess.check_output( - ["git", "rev-list", "--ancestry-path", "--count", count_spec], - encoding="ascii", - )) - - # if current commit is already on master, we need to exclude it from - # this history; otherwise we include the merge-base - commits = subprocess.check_output( - ["git", "rev-list", f"--max-count={num_prev_commits+1}", base], - encoding="ascii", - ).splitlines() - on_master = False - if base == sha1: - on_master = True - commits = commits[1:] - else: - commits = commits[:-1] - - job = os.environ.get("CIRCLE_JOB", "") - s3 = boto3.resource("s3") - bucket = s3.Bucket(name="ossci-metrics") - index = {} - for commit in 
commits: - summaries = bucket.objects.filter(Prefix=f"test_time/{commit}/{job}/") - index[commit] = list(summaries) - - objects: Dict[Commit, List[Report]] = {} - # should we do these in parallel? - for commit, summaries in index.items(): - objects[commit] = [] - for summary in summaries: - binary = summary.get()["Body"].read() - string = bz2.decompress(binary).decode("utf-8") - objects[commit].append(json.loads(string)) - - print() - print(regression_info( - head_sha=sha1, - head_report=obj, - base_reports=objects, - job_name=job, - on_master=on_master, - ancestry_path=ancestry_path - 1, - other_ancestors=intermediate_commits - ancestry_path, - ), end="") - - -def positive_integer(value): - parsed = int(value) - if parsed < 1: - raise argparse.ArgumentTypeError(f"{value} is not a natural number") - return parsed - - -def positive_float(value): - parsed = float(value) - if parsed <= 0.0: - raise argparse.ArgumentTypeError(f"{value} is not a positive rational number") - return parsed - - -if __name__ == '__main__': - import argparse - import sys - parser = argparse.ArgumentParser( - "Print statistics from test XML output.", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - parser.add_argument( - "--longest-of-class", - type=positive_integer, - default=3, - metavar="N", - help="how many longest tests to show for each class", - ) - parser.add_argument( - "--class-print-threshold", - type=positive_float, - default=1.0, - metavar="N", - help="Minimal total time to warrant class report", - ) - parser.add_argument( - "--longest-of-run", - type=positive_integer, - default=10, - metavar="N", - help="how many longest tests to show from the entire run", - ) - if HAVE_BOTO3: - parser.add_argument( - "--upload-to-s3", - action="store_true", - help="upload test time to S3 bucket", - ) - parser.add_argument( - "--compare-with-s3", - action="store_true", - help="download test times for base commits and compare", - ) - parser.add_argument( - "--num-prev-commits", - type=positive_integer, - default=10, - metavar="N", - help="how many previous commits to compare test times with", - ) - parser.add_argument( - "--use-json", - metavar="FILE.json", - help="compare S3 with JSON file, instead of the test report folder", - ) - parser.add_argument( - "folder", - help="test report folder", - ) - args = parser.parse_args() - - reports = parse_reports(args.folder) - if len(reports) == 0: - print(f"No test reports found in {args.folder}") - sys.exit(0) - - send_report_to_scribe(reports) - - longest_tests = [] - total_time = 0 - for name in sorted(reports.keys()): - test_suite = reports[name] - if test_suite.total_time >= args.class_print_threshold: - test_suite.print_report(args.longest_of_class) - total_time += test_suite.total_time - longest_tests.extend(test_suite.test_cases) - longest_tests = sorted(longest_tests, key=lambda x: x.time)[-args.longest_of_run:] - - obj = assemble_s3_object(reports, total_seconds=total_time) - - if args.upload_to_s3: - send_report_to_s3(obj) - - print(f"Total runtime is {datetime.timedelta(seconds=int(total_time))}") - print(f"{len(longest_tests)} longest tests of entire run:") - for test_case in reversed(longest_tests): - print(f" {test_case.class_name}.{test_case.name} time: {test_case.time:.2f} seconds") - - if args.compare_with_s3: - head_json = obj - if args.use_json: - head_json = json.loads(Path(args.use_json).read_text()) - print_regressions(head_json, num_prev_commits=args.num_prev_commits)
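
Note on the report format: send_report_to_s3 above uploads each run as a
bz2-compressed JSON object under test_time/{sha1}/{job}/{timestamp}Z.json.bz2
in the ossci-metrics bucket, and print_regressions reads those objects back.
The following is a minimal standalone sketch of that read path, assuming AWS
credentials with read access to the bucket; the sha1 and job values are
hypothetical placeholders, while the bucket name, key layout, and decoding
steps are taken from the code above.

    import bz2
    import json

    import boto3

    # Hypothetical placeholders -- substitute a real commit SHA and CI job name.
    sha1 = "0" * 40
    job = "pytorch_linux_xenial_py3_6_gcc5_4_test"

    s3 = boto3.resource("s3")
    bucket = s3.Bucket(name="ossci-metrics")

    # One object is uploaded per run; decode each the same way
    # print_regressions does: bz2-decompress, then parse the JSON report.
    for summary in bucket.objects.filter(Prefix=f"test_time/{sha1}/{job}/"):
        binary = summary.get()["Body"].read()
        report = json.loads(bz2.decompress(binary).decode("utf-8"))
        print(report["build_job"], report["total_seconds"])
        for name, suite in report["suites"].items():
            print(f"  {name}: {suite['total_seconds']:.2f}s")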