Add a mode to rerun all disabled tests (without running anything else) (#88646)

Rerun all disabled tests to gather their latest results so that we can close disabled tickets automatically. When running under this mode (RERUN_DISABLED_TESTS=true), only disabled tests are run while the rest are skipped `<skipped message="Test is enabled but --rerun-disabled-tests verification mode is set, so only disabled tests are run" type="skip"/>`

The logic is roughly as follows, the test runs multiple times (n=50)

* If the disabled test passes, and it's flaky, do nothing because it's still flaky.  In the test report, we'll see the test passes with the following skipped message:
```
<testcase classname="TestMultiprocessing" file="test_multiprocessing.py" line="357" name="test_fs" time="0.000" timestamp="0001-01-01T00:00:00">
    <skipped message="{&quot;flaky&quot;: True, &quot;num_red&quot;: 4, &quot;num_green&quot;: 0, &quot;max_num_retries&quot;: 3, &quot;rerun_disabled_test&quot;: true}" type="skip"/>
</testcase>
```

* If the disabled test passes every single time, and it is not flaky anymore, mark it so that it can be closed later.  We will see the test runs and passes, i.e.
```
<testcase classname="TestCommonCUDA" name="test_out_warning_linalg_lu_factor_cuda" time="0.170" file="test_ops.py" />
```

* If the disabled test fails after all retries, this is also expected. So only report this but don't fail the job (because we don't care about red signals here), we'll see the test is skipped (without the `flaky` field), i.e.
```
<testcase classname="TestMultiprocessing" file="test_multiprocessing.py" line="357" name="test_fs" time="0.000" timestamp="0001-01-01T00:00:00">
    <skipped message="{&quot;num_red&quot;: 4, &quot;num_green&quot;: 0, &quot;max_num_retries&quot;: 3, &quot;rerun_disabled_test&quot;: true}" type="skip"/>
</testcase>
```

This runs at the same schedule as `mem_leak_check` (daily).  The change to update test stats, and (potentially) grouping on HUD will come in separate PRs.

### Testing

* pull https://github.com/pytorch/pytorch/actions/runs/3447434434
* trunk https://github.com/pytorch/pytorch/actions/runs/3447434928
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88646
Approved by: https://github.com/clee2000
This commit is contained in:
Huy Do 2022-11-15 05:08:26 +00:00 committed by PyTorch MergeBot
parent 73d71ae3d6
commit 21dd311077
10 changed files with 143 additions and 15 deletions

View file

@ -34,6 +34,13 @@ VALID_TEST_CONFIG_LABELS = {f"{PREFIX}{label}" for label in {
"xla",
}}
# Supported modes when running periodically
# NOTE: each mode name is injected verbatim as both key and value into every
# test config by set_periodic_modes(); the workflow YAML then reads it back
# via `matrix.<mode>` to set the corresponding PYTORCH_TEST_* env variable.
SUPPORTED_PERIODICAL_MODES = {
"mem_leak_check",
"rerun_disabled_tests",
}
def parse_args() -> Any:
from argparse import ArgumentParser
parser = ArgumentParser("Filter all test configurations and keep only requested ones")
@ -109,6 +116,23 @@ def filter(test_matrix: Dict[str, List[Any]], labels: Set[str]) -> Dict[str, Lis
return filtered_test_matrix
def set_periodic_modes(test_matrix: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
    """
    Apply all periodic modes when running under a schedule.

    Every config in the input matrix is replicated once per supported periodic
    mode (e.g. mem_leak_check, rerun_disabled_tests), with the mode name added
    to the copy as both key and value so downstream workflows can detect it
    via ``matrix.<mode>``.

    Args:
        test_matrix: the original test matrix, with an "include" list of
            config dictionaries (missing/empty "include" yields an empty
            result).

    Returns:
        A new test matrix whose "include" list contains one shallow copy of
        each input config per periodic mode. The input matrix is not modified.
    """
    scheduled_test_matrix: Dict[str, List[Any]] = {
        "include": [],
    }

    for config in test_matrix.get("include", []):
        # Iterate in sorted order so the generated matrix is deterministic
        # across runs: plain set iteration order for strings depends on
        # PYTHONHASHSEED hash randomization
        for mode in sorted(SUPPORTED_PERIODICAL_MODES):
            cfg = config.copy()
            cfg[mode] = mode
            scheduled_test_matrix["include"].append(cfg)

    return scheduled_test_matrix
def set_output(name: str, val: Any) -> None:
if os.getenv("GITHUB_OUTPUT"):
with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env:
@ -163,8 +187,7 @@ def main() -> None:
filtered_test_matrix = test_matrix
if args.event_name == "schedule":
for config in filtered_test_matrix.get("include", []):
config["mem_leak_check"] = "mem_leak_check"
filtered_test_matrix = set_periodic_modes(filtered_test_matrix)
# Set the filtered test matrix as the output
set_output("test-matrix", json.dumps(filtered_test_matrix))

View file

@ -4,7 +4,14 @@ import os
import yaml
import json
from unittest import TestCase, main, mock
from filter_test_configs import get_labels, filter, PREFIX, VALID_TEST_CONFIG_LABELS
from filter_test_configs import (
get_labels,
filter,
set_periodic_modes,
PREFIX,
VALID_TEST_CONFIG_LABELS,
SUPPORTED_PERIODICAL_MODES
)
import requests
from requests.models import Response
from typing import Any, Dict
@ -86,5 +93,26 @@ class TestConfigFilter(TestCase):
self.assertEqual(case["expected"], json.dumps(filtered_test_matrix))
def test_set_periodic_modes(self) -> None:
    """Each input config is replicated once per supported periodic mode."""
    testcases = [
        {
            "test_matrix": "{include: []}",
            "description": "Empty test matrix",
        },
        {
            "test_matrix": '{include: [{config: "default", runner: "linux"}, {config: "cfg", runner: "macos"}]}',
            # NOTE: this key was previously misspelled "descripion"
            "description": "Replicate each periodic mode in a different config",
        },
    ]

    for case in testcases:
        test_matrix = yaml.safe_load(case["test_matrix"])
        scheduled_test_matrix = set_periodic_modes(test_matrix)
        # One copy of every input config per periodic mode
        self.assertEqual(
            len(test_matrix["include"]) * len(SUPPORTED_PERIODICAL_MODES),
            len(scheduled_test_matrix["include"])
        )
if __name__ == '__main__':
main()

View file

@ -115,7 +115,8 @@ jobs:
DOCKER_IMAGE: ${{ inputs.docker-image }}
XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0'}}
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
timeout-minutes: 240
run: |
set -x
@ -170,6 +171,7 @@ jobs:
-e XLA_CUDA \
-e XLA_CLANG_CACHE_S3_BUCKET_NAME \
-e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
-e PYTORCH_TEST_RERUN_DISABLED_TESTS \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--ulimit stack=10485760:83886080 \
--security-opt seccomp=unconfined \

View file

@ -129,7 +129,8 @@ jobs:
- name: Test
id: test
env:
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0'}}
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
run: |
COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}")

View file

@ -97,7 +97,8 @@ jobs:
DOCKER_IMAGE: ${{ inputs.docker-image }}
XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
PYTORCH_JIT_ENABLE_NVFUSER: 1
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0'}}
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
timeout-minutes: 270
run: |
set -x
@ -148,6 +149,7 @@ jobs:
-e SCCACHE_BUCKET \
-e XLA_CLANG_CACHE_S3_BUCKET_NAME \
-e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
-e PYTORCH_TEST_RERUN_DISABLED_TESTS \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--ulimit stack=10485760:83886080 \
--security-opt seccomp=unconfined \

View file

@ -124,7 +124,8 @@ jobs:
TEST_CONFIG: ${{ matrix.config }}
PR_BODY: ${{ github.event.pull_request.body }}
TORCH_CUDA_ARCH_LIST: "7.0"
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0'}}
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
run: |
COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}")

View file

@ -439,8 +439,11 @@ def run_test(
if options.pytest:
unittest_args = [arg if arg != "-f" else "-x" for arg in unittest_args]
elif IS_CI:
ci_args = ["--import-slow-tests", "--import-disabled-tests"]
if os.getenv("PYTORCH_TEST_RERUN_DISABLED_TESTS", "0") == "1":
ci_args.append("--rerun-disabled-tests")
# use the downloaded test cases configuration, not supported in pytest
unittest_args.extend(["--import-slow-tests", "--import-disabled-tests"])
unittest_args.extend(ci_args)
# Extra arguments are not supported with pytest
executable = get_executable_command(

View file

@ -2716,6 +2716,9 @@ class ConvDataset(Dataset):
@unittest.skipIf(IS_WINDOWS, "Needs fork")
@unittest.skipIf(
TEST_WITH_ASAN,
"This test hangs when running with ASAN, see https://github.com/pytorch/pytorch/issues/75492")
class TestConvAfterFork(TestCase):
# Tests crash reported in https://github.com/pytorch/pytorch/issues/53565
def test_conv_after_fork(self):

View file

@ -11,7 +11,8 @@ from functools import reduce
import numpy as np
from torch.testing import make_tensor
from torch.testing._internal.common_utils import TestCase, run_tests
from torch.testing._internal.common_utils import (
TestCase, run_tests, TEST_WITH_TORCHDYNAMO)
from torch.testing._internal.common_device_type import (
instantiate_device_type_tests, onlyCUDA, dtypes, dtypesIfCPU, dtypesIfCUDA,
onlyNativeDeviceTypes)
@ -737,6 +738,10 @@ class TestIndexing(TestCase):
self.assertEqual(y, torch.ones(size=(10, 10), device=device))
self.assertEqual(len(w), 2)
@unittest.skipIf(
TEST_WITH_TORCHDYNAMO,
"This test causes SIGKILL when running with dynamo, https://github.com/pytorch/pytorch/issues/88472"
)
def test_index_put_accumulate_large_tensor(self, device):
# This test is for tensors with number of elements >= INT_MAX (2^31 - 1).
N = (1 << 31) + 5

View file

@ -107,7 +107,6 @@ IS_REMOTE_GPU = os.getenv('PYTORCH_TEST_REMOTE_GPU') == '1'
RETRY_TEST_CASES = os.getenv('PYTORCH_RETRY_TEST_CASES') == '1'
OVERRIDE_FLAKY_SIGNAL = os.getenv('PYTORCH_OVERRIDE_FLAKY_SIGNAL') == '1'
DISABLE_RUNNING_SCRIPT_CHK = os.getenv('PYTORCH_DISABLE_RUNNING_SCRIPT_CHK') == '1'
MAX_NUM_RETRIES = 3
DEFAULT_DISABLED_TESTS_FILE = '.pytorch-disabled-tests.json'
DEFAULT_SLOW_TESTS_FILE = '.pytorch-slow-tests.json'
@ -506,6 +505,7 @@ parser.add_argument('--log-suffix', type=str, default="")
parser.add_argument('--run-parallel', type=int, default=1)
parser.add_argument('--import-slow-tests', type=str, nargs='?', const=DEFAULT_SLOW_TESTS_FILE)
parser.add_argument('--import-disabled-tests', type=str, nargs='?', const=DEFAULT_DISABLED_TESTS_FILE)
parser.add_argument('--rerun-disabled-tests', action='store_true')
# Only run when -h or --help flag is active to display both unittest and parser help messages.
def run_unittest_help(argv):
@ -527,6 +527,9 @@ else:
# infer flags based on the default settings
GRAPH_EXECUTOR = cppProfilingFlagsToProfilingMode()
RERUN_DISABLED_TESTS = args.rerun_disabled_tests
# Rerun disabled tests many more times to make sure that they are not flaky anymore
MAX_NUM_RETRIES = 3 if not RERUN_DISABLED_TESTS else 50
SLOW_TESTS_FILE = args.import_slow_tests
DISABLED_TESTS_FILE = args.import_disabled_tests
@ -1653,6 +1656,9 @@ def check_if_enable(test: unittest.TestCase):
raise unittest.SkipTest("test is slow; run with PYTORCH_TEST_WITH_SLOW to enable test")
sanitized_test_method_name = remove_device_and_dtype_suffixes(test._testMethodName)
if not IS_SANDCASTLE:
should_skip = False
skip_msg = ""
for disabled_test, (issue_url, platforms) in disabled_tests_dict.items():
disable_test_parts = disabled_test.split()
if len(disable_test_parts) > 1:
@ -1687,11 +1693,22 @@ def check_if_enable(test: unittest.TestCase):
platforms = list(filter(lambda p: p in platform_to_conditional, platforms))
if platforms == [] or any([platform_to_conditional[platform] for platform in platforms]):
should_skip = True
skip_msg = f"Test is disabled because an issue exists disabling it: {issue_url}" \
f" for {'all' if platforms == [] else ''}platform(s) {', '.join(platforms)}. " \
"If you're seeing this on your local machine and would like to enable this test, " \
"please make sure CI is not set and you are not using the flag --import-disabled-tests."
raise unittest.SkipTest(skip_msg)
break
if should_skip and not RERUN_DISABLED_TESTS:
# Skip the disabled test when not running under --rerun-disabled-tests verification mode
raise unittest.SkipTest(skip_msg)
if not should_skip and RERUN_DISABLED_TESTS:
skip_msg = "Test is enabled but --rerun-disabled-tests verification mode is set, so only" \
" disabled tests are run"
raise unittest.SkipTest(skip_msg)
if TEST_SKIP_FAST:
if not getattr(test, test._testMethodName).__dict__.get('slow_test', False):
raise unittest.SkipTest("test is fast; we disabled it with PYTORCH_TEST_SKIP_FAST")
@ -2039,9 +2056,48 @@ class TestCase(expecttest.TestCase):
def _run_with_retry(self, result=None, num_runs_left=0, report_only=True, num_red=0, num_green=0):
using_unittest = isinstance(result, unittest.TestResult)
if num_runs_left == 0:
# The logic when RERUN_DISABLED_TESTS is set to true is as follows:
# |-if the disabled test passes:
# |-- if it's flaky:
# |--- Do nothing because it's still flaky
# |-- elif it isn't flaky anymore:
# |--- Close the disabled ticket (later)
# |
# |- elif the disabled test fails after n retries:
# |-- This is expected, report this but don't fail the job
skipped_msg = {
"num_red": num_red,
"num_green": num_green,
"max_num_retries": MAX_NUM_RETRIES,
"rerun_disabled_test": RERUN_DISABLED_TESTS,
}
traceback_str = ""
if RERUN_DISABLED_TESTS and using_unittest:
# Hide all failures and errors when RERUN_DISABLED_TESTS is enabled. This is
# a verification check, we don't want more red signals coming from it
if result.failures:
_, traceback_str = result.failures.pop(-1)
if result.errors:
_, traceback_str = result.errors.pop(-1)
if traceback_str:
skipped_msg["traceback_str"] = traceback_str
if num_green == 0:
# The disabled test fails, report as skipped but don't fail the job
result.addSkip(self, json.dumps(skipped_msg))
if num_red == 0:
# The test passes after re-running multiple times. This acts as a signal
# to confirm that it's not flaky anymore
result.addSuccess(self)
if num_green > 0 and num_red > 0 and using_unittest:
result.addSkip(self, f'{{"flaky": {True}, "num_red": {num_red}, "num_green": {num_green},' +
f'"max_num_retries": {MAX_NUM_RETRIES}}}')
skipped_msg["flaky"] = True
# Still flaky, do nothing
result.addSkip(self, json.dumps(skipped_msg))
return
if using_unittest:
@ -2100,9 +2156,13 @@ class TestCase(expecttest.TestCase):
result.addExpectedFailure(self, err)
self._run_with_retry(result=result, num_runs_left=num_retries_left, report_only=report_only,
num_red=num_red + 1, num_green=num_green)
elif report_only and num_retries_left < MAX_NUM_RETRIES:
elif (RERUN_DISABLED_TESTS or report_only) and num_retries_left < MAX_NUM_RETRIES:
# Always re-run up to MAX_NUM_RETRIES when running under report only or rerun disabled tests modes
print(f" {self._testMethodName} succeeded - num_retries_left: {num_retries_left}")
result.addUnexpectedSuccess(self)
if RERUN_DISABLED_TESTS:
result.addSuccess(self)
else:
result.addUnexpectedSuccess(self)
self._run_with_retry(result=result, num_runs_left=num_retries_left, report_only=report_only,
num_red=num_red, num_green=num_green + 1)
elif not report_only and num_retries_left < MAX_NUM_RETRIES: