Add a mode to rerun all disabled tests (without running anything else) (#88646)
Rerun all disabled tests to gather their latest results so that we can close the corresponding disabled-test issues automatically. When running under this mode (RERUN_DISABLED_TESTS=true), only disabled tests are run while everything else is skipped with `<skipped message="Test is enabled but --rerun-disabled-tests verification mode is set, so only disabled tests are run" type="skip"/>`
The logic is roughly as follows: each disabled test is run multiple times (n=50), and the outcome is reported as one of three cases (a minimal Python sketch of the decision rule follows the examples below).
* If the disabled test passes only some of the time, it is still flaky, so do nothing. In the test report, the test shows up as skipped with the following message:
```
<testcase classname="TestMultiprocessing" file="test_multiprocessing.py" line="357" name="test_fs" time="0.000" timestamp="0001-01-01T00:00:00">
<skipped message="{"flaky": True, "num_red": 4, "num_green": 0, "max_num_retries": 3, "rerun_disabled_test": true}" type="skip"/>
</testcase>
```
* If the disabled test passes every single time, it is not flaky anymore, so mark it so that its disabled-test issue can be closed later. In the test report, the test runs and passes, e.g.:
```
<testcase classname="TestCommonCUDA" name="test_out_warning_linalg_lu_factor_cuda" time="0.170" file="test_ops.py" />
```
* If the disabled test fails after all retries, this is also expected, so we only report it and don't fail the job (we don't care about red signals here). In the test report, the test shows up as skipped (without the `flaky` field), e.g.:
```
<testcase classname="TestMultiprocessing" file="test_multiprocessing.py" line="357" name="test_fs" time="0.000" timestamp="0001-01-01T00:00:00">
<skipped message="{"num_red": 4, "num_green": 0, "max_num_retries": 3, "rerun_disabled_test": true}" type="skip"/>
</testcase>
```
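For reference, here is a minimal Python sketch of this decision rule. The function `classify_rerun` and its return shape are illustrative only and not part of this PR; the real logic lives in `_run_with_retry`, shown in the diff below.
```python
import json

def classify_rerun(num_red: int, num_green: int, max_num_retries: int = 50) -> dict:
    """Sketch of how a rerun disabled test is reported after all retries."""
    msg = {
        "num_red": num_red,
        "num_green": num_green,
        "max_num_retries": max_num_retries,
        "rerun_disabled_test": True,
    }
    if num_green > 0 and num_red > 0:
        # Still flaky: report as skipped with the "flaky" field, keep the issue open
        msg["flaky"] = True
        return {"result": "skipped", "message": json.dumps(msg)}
    if num_red == 0:
        # Passes every time: report a normal success so the issue can be closed later
        return {"result": "passed"}
    # Fails every time: expected, report as skipped but don't fail the job
    return {"result": "skipped", "message": json.dumps(msg)}

print(classify_rerun(num_red=4, num_green=0))   # consistently failing -> skipped
print(classify_rerun(num_red=0, num_green=50))  # no longer flaky -> passed
print(classify_rerun(num_red=3, num_green=2))   # still flaky -> skipped with "flaky": true
```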
This runs on the same schedule as `mem_leak_check` (daily). The changes to update test stats and (potentially) group results on HUD will come in separate PRs.
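The daily expansion itself is handled by the new `set_periodic_modes` helper (see the `.github/scripts/filter_test_configs.py` diff below). A minimal sketch of its effect, using a made-up matrix entry:
```python
# Every config in the matrix is replicated once per periodic mode, so the daily
# scheduled run exercises both mem_leak_check and rerun_disabled_tests.
SUPPORTED_PERIODICAL_MODES = {"mem_leak_check", "rerun_disabled_tests"}

test_matrix = {"include": [{"config": "default", "runner": "linux"}]}  # made-up sample entry

scheduled = {"include": []}
for config in test_matrix["include"]:
    for mode in SUPPORTED_PERIODICAL_MODES:
        cfg = config.copy()
        cfg[mode] = mode  # e.g. adds "rerun_disabled_tests": "rerun_disabled_tests"
        scheduled["include"].append(cfg)

print(scheduled)
# The workflow files then turn these keys into env vars, e.g.
# PYTORCH_TEST_RERUN_DISABLED_TESTS=1 when rerun_disabled_tests is present.
```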
### Testing
* pull https://github.com/pytorch/pytorch/actions/runs/3447434434
* trunk https://github.com/pytorch/pytorch/actions/runs/3447434928
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88646
Approved by: https://github.com/clee2000
This commit is contained in: parent 73d71ae3d6, commit 21dd311077
10 changed files with 143 additions and 15 deletions
.github/scripts/filter_test_configs.py (27 lines changed)

@@ -34,6 +34,13 @@ VALID_TEST_CONFIG_LABELS = {f"{PREFIX}{label}" for label in {
     "xla",
 }}

+# Supported modes when running periodically
+SUPPORTED_PERIODICAL_MODES = {
+    "mem_leak_check",
+    "rerun_disabled_tests",
+}
+
+
 def parse_args() -> Any:
     from argparse import ArgumentParser
     parser = ArgumentParser("Filter all test configurations and keep only requested ones")

@@ -109,6 +116,23 @@ def filter(test_matrix: Dict[str, List[Any]], labels: Set[str]) -> Dict[str, List[Any]]:
     return filtered_test_matrix


+def set_periodic_modes(test_matrix: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+    """
+    Apply all periodic modes when running under a schedule
+    """
+    scheduled_test_matrix: Dict[str, List[Any]] = {
+        "include": [],
+    }
+
+    for config in test_matrix.get("include", []):
+        for mode in SUPPORTED_PERIODICAL_MODES:
+            cfg = config.copy()
+            cfg[mode] = mode
+            scheduled_test_matrix["include"].append(cfg)
+
+    return scheduled_test_matrix
+
+
 def set_output(name: str, val: Any) -> None:
     if os.getenv("GITHUB_OUTPUT"):
         with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env:

@@ -163,8 +187,7 @@ def main() -> None:
     filtered_test_matrix = test_matrix

     if args.event_name == "schedule":
-        for config in filtered_test_matrix.get("include", []):
-            config["mem_leak_check"] = "mem_leak_check"
+        filtered_test_matrix = set_periodic_modes(filtered_test_matrix)

     # Set the filtered test matrix as the output
     set_output("test-matrix", json.dumps(filtered_test_matrix))
.github/scripts/test_filter_test_configs.py (30 lines changed)

@@ -4,7 +4,14 @@ import os
 import yaml
 import json
 from unittest import TestCase, main, mock
-from filter_test_configs import get_labels, filter, PREFIX, VALID_TEST_CONFIG_LABELS
+from filter_test_configs import (
+    get_labels,
+    filter,
+    set_periodic_modes,
+    PREFIX,
+    VALID_TEST_CONFIG_LABELS,
+    SUPPORTED_PERIODICAL_MODES
+)
 import requests
 from requests.models import Response
 from typing import Any, Dict

@@ -86,5 +93,26 @@ class TestConfigFilter(TestCase):
             self.assertEqual(case["expected"], json.dumps(filtered_test_matrix))

+    def test_set_periodic_modes(self) -> None:
+        testcases = [
+            {
+                "test_matrix": "{include: []}",
+                "description": "Empty test matrix",
+            },
+            {
+                "test_matrix": '{include: [{config: "default", runner: "linux"}, {config: "cfg", runner: "macos"}]}',
+                "descripion": "Replicate each periodic mode in a different config",
+            },
+        ]
+
+        for case in testcases:
+            test_matrix = yaml.safe_load(case["test_matrix"])
+            scheduled_test_matrix = set_periodic_modes(test_matrix)
+            self.assertEqual(
+                len(test_matrix["include"]) * len(SUPPORTED_PERIODICAL_MODES),
+                len(scheduled_test_matrix["include"])
+            )
+
+
 if __name__ == '__main__':
     main()
.github/workflows/_linux-test.yml (4 lines changed)

@@ -115,7 +115,8 @@ jobs:
           DOCKER_IMAGE: ${{ inputs.docker-image }}
           XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
           XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
-          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0'}}
+          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
+          PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
         timeout-minutes: 240
         run: |
           set -x

@@ -170,6 +171,7 @@ jobs:
             -e XLA_CUDA \
             -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
             -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
+            -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --ulimit stack=10485760:83886080 \
             --security-opt seccomp=unconfined \
.github/workflows/_mac-test.yml (3 lines changed)

@@ -129,7 +129,8 @@ jobs:
       - name: Test
         id: test
         env:
-          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0'}}
+          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
+          PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
         run: |
           COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}")
.github/workflows/_rocm-test.yml (4 lines changed)

@@ -97,7 +97,8 @@ jobs:
           DOCKER_IMAGE: ${{ inputs.docker-image }}
           XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
           PYTORCH_JIT_ENABLE_NVFUSER: 1
-          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0'}}
+          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
+          PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
         timeout-minutes: 270
         run: |
           set -x

@@ -148,6 +149,7 @@ jobs:
             -e SCCACHE_BUCKET \
             -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
             -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
+            -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --ulimit stack=10485760:83886080 \
             --security-opt seccomp=unconfined \
.github/workflows/_win-test.yml (3 lines changed)

@@ -124,7 +124,8 @@ jobs:
           TEST_CONFIG: ${{ matrix.config }}
           PR_BODY: ${{ github.event.pull_request.body }}
           TORCH_CUDA_ARCH_LIST: "7.0"
-          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0'}}
+          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
+          PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
         run: |
           COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}")
@@ -439,8 +439,11 @@ def run_test(
     if options.pytest:
         unittest_args = [arg if arg != "-f" else "-x" for arg in unittest_args]
     elif IS_CI:
+        ci_args = ["--import-slow-tests", "--import-disabled-tests"]
+        if os.getenv("PYTORCH_TEST_RERUN_DISABLED_TESTS", "0") == "1":
+            ci_args.append("--rerun-disabled-tests")
         # use the downloaded test cases configuration, not supported in pytest
-        unittest_args.extend(["--import-slow-tests", "--import-disabled-tests"])
+        unittest_args.extend(ci_args)

     # Extra arguments are not supported with pytest
     executable = get_executable_command(
@@ -2716,6 +2716,9 @@ class ConvDataset(Dataset):


 @unittest.skipIf(IS_WINDOWS, "Needs fork")
+@unittest.skipIf(
+    TEST_WITH_ASAN,
+    "This test hangs when running with ASAN, see https://github.com/pytorch/pytorch/issues/75492")
 class TestConvAfterFork(TestCase):
     # Tests crash reported in https://github.com/pytorch/pytorch/issues/53565
     def test_conv_after_fork(self):
@@ -11,7 +11,8 @@ from functools import reduce
 import numpy as np

 from torch.testing import make_tensor
-from torch.testing._internal.common_utils import TestCase, run_tests
+from torch.testing._internal.common_utils import (
+    TestCase, run_tests, TEST_WITH_TORCHDYNAMO)
 from torch.testing._internal.common_device_type import (
     instantiate_device_type_tests, onlyCUDA, dtypes, dtypesIfCPU, dtypesIfCUDA,
     onlyNativeDeviceTypes)

@@ -737,6 +738,10 @@ class TestIndexing(TestCase):
         self.assertEqual(y, torch.ones(size=(10, 10), device=device))
         self.assertEqual(len(w), 2)

+    @unittest.skipIf(
+        TEST_WITH_TORCHDYNAMO,
+        "This test causes SIGKILL when running with dynamo, https://github.com/pytorch/pytorch/issues/88472"
+    )
     def test_index_put_accumulate_large_tensor(self, device):
         # This test is for tensors with number of elements >= INT_MAX (2^31 - 1).
         N = (1 << 31) + 5
@@ -107,7 +107,6 @@ IS_REMOTE_GPU = os.getenv('PYTORCH_TEST_REMOTE_GPU') == '1'
 RETRY_TEST_CASES = os.getenv('PYTORCH_RETRY_TEST_CASES') == '1'
 OVERRIDE_FLAKY_SIGNAL = os.getenv('PYTORCH_OVERRIDE_FLAKY_SIGNAL') == '1'
 DISABLE_RUNNING_SCRIPT_CHK = os.getenv('PYTORCH_DISABLE_RUNNING_SCRIPT_CHK') == '1'
-MAX_NUM_RETRIES = 3

 DEFAULT_DISABLED_TESTS_FILE = '.pytorch-disabled-tests.json'
 DEFAULT_SLOW_TESTS_FILE = '.pytorch-slow-tests.json'

@@ -506,6 +505,7 @@ parser.add_argument('--log-suffix', type=str, default="")
 parser.add_argument('--run-parallel', type=int, default=1)
 parser.add_argument('--import-slow-tests', type=str, nargs='?', const=DEFAULT_SLOW_TESTS_FILE)
 parser.add_argument('--import-disabled-tests', type=str, nargs='?', const=DEFAULT_DISABLED_TESTS_FILE)
+parser.add_argument('--rerun-disabled-tests', action='store_true')

 # Only run when -h or --help flag is active to display both unittest and parser help messages.
 def run_unittest_help(argv):

@@ -527,6 +527,9 @@ else:
     # infer flags based on the default settings
     GRAPH_EXECUTOR = cppProfilingFlagsToProfilingMode()

+RERUN_DISABLED_TESTS = args.rerun_disabled_tests
+# Rerun disabled tests many more times to make sure that they are not flaky anymore
+MAX_NUM_RETRIES = 3 if not RERUN_DISABLED_TESTS else 50
+
 SLOW_TESTS_FILE = args.import_slow_tests
 DISABLED_TESTS_FILE = args.import_disabled_tests

@@ -1653,6 +1656,9 @@ def check_if_enable(test: unittest.TestCase):
         raise unittest.SkipTest("test is slow; run with PYTORCH_TEST_WITH_SLOW to enable test")
     sanitized_test_method_name = remove_device_and_dtype_suffixes(test._testMethodName)
     if not IS_SANDCASTLE:
+        should_skip = False
+        skip_msg = ""
+
         for disabled_test, (issue_url, platforms) in disabled_tests_dict.items():
             disable_test_parts = disabled_test.split()
             if len(disable_test_parts) > 1:

@@ -1687,11 +1693,22 @@ def check_if_enable(test: unittest.TestCase):
                 platforms = list(filter(lambda p: p in platform_to_conditional, platforms))

                 if platforms == [] or any([platform_to_conditional[platform] for platform in platforms]):
+                    should_skip = True
                     skip_msg = f"Test is disabled because an issue exists disabling it: {issue_url}" \
                         f" for {'all' if platforms == [] else ''}platform(s) {', '.join(platforms)}. " \
                         "If you're seeing this on your local machine and would like to enable this test, " \
                         "please make sure CI is not set and you are not using the flag --import-disabled-tests."
-                    raise unittest.SkipTest(skip_msg)
+                    break
+
+        if should_skip and not RERUN_DISABLED_TESTS:
+            # Skip the disabled test when not running under --rerun-disabled-tests verification mode
+            raise unittest.SkipTest(skip_msg)
+
+        if not should_skip and RERUN_DISABLED_TESTS:
+            skip_msg = "Test is enabled but --rerun-disabled-tests verification mode is set, so only" \
+                " disabled tests are run"
+            raise unittest.SkipTest(skip_msg)

     if TEST_SKIP_FAST:
         if not getattr(test, test._testMethodName).__dict__.get('slow_test', False):
             raise unittest.SkipTest("test is fast; we disabled it with PYTORCH_TEST_SKIP_FAST")

@@ -2039,9 +2056,48 @@ class TestCase(expecttest.TestCase):
     def _run_with_retry(self, result=None, num_runs_left=0, report_only=True, num_red=0, num_green=0):
         using_unittest = isinstance(result, unittest.TestResult)
         if num_runs_left == 0:
+            # The logic when RERUN_DISABLED_TESTS is set to true is as follows:
+            # |-if the disabled test passes:
+            # |-- if it's flaky:
+            # |--- Do nothing because it's still flaky
+            # |-- elif it isn't flaky anymore:
+            # |--- Close the disabled ticket (later)
+            # |
+            # |- elif the disabled test fails after n retries:
+            # |-- This is expected, report this but don't fail the job
+            skipped_msg = {
+                "num_red": num_red,
+                "num_green": num_green,
+                "max_num_retries": MAX_NUM_RETRIES,
+                "rerun_disabled_test": RERUN_DISABLED_TESTS,
+            }
+
+            traceback_str = ""
+            if RERUN_DISABLED_TESTS and using_unittest:
+                # Hide all failures and errors when RERUN_DISABLED_TESTS is enabled. This is
+                # a verification check, we don't want more red signals coming from it
+                if result.failures:
+                    _, traceback_str = result.failures.pop(-1)
+                if result.errors:
+                    _, traceback_str = result.errors.pop(-1)
+
+                if traceback_str:
+                    skipped_msg["traceback_str"] = traceback_str
+
+                if num_green == 0:
+                    # The disabled test fails, report as skipped but don't fail the job
+                    result.addSkip(self, json.dumps(skipped_msg))
+
+                if num_red == 0:
+                    # The test passes after re-running multiple times. This acts as a signal
+                    # to confirm that it's not flaky anymore
+                    result.addSuccess(self)
+
             if num_green > 0 and num_red > 0 and using_unittest:
-                result.addSkip(self, f'{{"flaky": {True}, "num_red": {num_red}, "num_green": {num_green},' +
-                               f'"max_num_retries": {MAX_NUM_RETRIES}}}')
+                skipped_msg["flaky"] = True
+                # Still flaky, do nothing
+                result.addSkip(self, json.dumps(skipped_msg))
+
             return

         if using_unittest:

@@ -2100,9 +2156,13 @@ class TestCase(expecttest.TestCase):
                 result.addExpectedFailure(self, err)
             self._run_with_retry(result=result, num_runs_left=num_retries_left, report_only=report_only,
                                  num_red=num_red + 1, num_green=num_green)
-        elif report_only and num_retries_left < MAX_NUM_RETRIES:
+        elif (RERUN_DISABLED_TESTS or report_only) and num_retries_left < MAX_NUM_RETRIES:
+            # Always re-run up to MAX_NUM_RETRIES when running under report only or rerun disabled tests modes
             print(f" {self._testMethodName} succeeded - num_retries_left: {num_retries_left}")
-            result.addUnexpectedSuccess(self)
+            if RERUN_DISABLED_TESTS:
+                result.addSuccess(self)
+            else:
+                result.addUnexpectedSuccess(self)
             self._run_with_retry(result=result, num_runs_left=num_retries_left, report_only=report_only,
                                  num_red=num_red, num_green=num_green + 1)
         elif not report_only and num_retries_left < MAX_NUM_RETRIES: