From 21dd311077d00ff5c3f930295ddc8cf915a262d7 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Tue, 15 Nov 2022 05:08:26 +0000 Subject: [PATCH] Add a mode to rerun all disabled tests (without running anything else) (#88646) Rerun all disabled tests to gather their latest results so that we can close disabled tickets automatically. When running under this mode (RERUN_DISABLED_TESTS=true), only disabled tests are run while the rest are skipped. The logic is roughly as follows: each test runs multiple times (n=50) * If the disabled test passes, and it's flaky, do nothing because it's still flaky. In the test report, we'll see the test passes with the following skipped message: ``` ``` * If the disabled test passes every single time, and it is not flaky anymore, mark it so that it can be closed later. We will see the test runs and passes, i.e. ``` ``` * If the disabled test fails after all retries, this is also expected. So only report this but don't fail the job (because we don't care about red signals here), we'll see the test is skipped (without the `flaky` field), i.e. ``` ``` This runs at the same schedule as `mem_leak_check` (daily). The change to update test stats, and (potentially) grouping on HUD will come in separate PRs. 
### Testing * pull https://github.com/pytorch/pytorch/actions/runs/3447434434 * trunk https://github.com/pytorch/pytorch/actions/runs/3447434928 Pull Request resolved: https://github.com/pytorch/pytorch/pull/88646 Approved by: https://github.com/clee2000 --- .github/scripts/filter_test_configs.py | 27 +++++++- .github/scripts/test_filter_test_configs.py | 30 ++++++++- .github/workflows/_linux-test.yml | 4 +- .github/workflows/_mac-test.yml | 3 +- .github/workflows/_rocm-test.yml | 4 +- .github/workflows/_win-test.yml | 3 +- test/run_test.py | 5 +- test/test_dataloader.py | 3 + test/test_indexing.py | 7 +- torch/testing/_internal/common_utils.py | 72 +++++++++++++++++++-- 10 files changed, 143 insertions(+), 15 deletions(-) diff --git a/.github/scripts/filter_test_configs.py b/.github/scripts/filter_test_configs.py index 06c8f90441e..bb5314434e0 100755 --- a/.github/scripts/filter_test_configs.py +++ b/.github/scripts/filter_test_configs.py @@ -34,6 +34,13 @@ VALID_TEST_CONFIG_LABELS = {f"{PREFIX}{label}" for label in { "xla", }} +# Supported modes when running periodically +SUPPORTED_PERIODICAL_MODES = { + "mem_leak_check", + "rerun_disabled_tests", +} + + def parse_args() -> Any: from argparse import ArgumentParser parser = ArgumentParser("Filter all test configurations and keep only requested ones") @@ -109,6 +116,23 @@ def filter(test_matrix: Dict[str, List[Any]], labels: Set[str]) -> Dict[str, Lis return filtered_test_matrix +def set_periodic_modes(test_matrix: Dict[str, List[Any]]) -> Dict[str, List[Any]]: + """ + Apply all periodic modes when running under a schedule + """ + scheduled_test_matrix: Dict[str, List[Any]] = { + "include": [], + } + + for config in test_matrix.get("include", []): + for mode in SUPPORTED_PERIODICAL_MODES: + cfg = config.copy() + cfg[mode] = mode + scheduled_test_matrix["include"].append(cfg) + + return scheduled_test_matrix + + def set_output(name: str, val: Any) -> None: if os.getenv("GITHUB_OUTPUT"): with 
open(str(os.getenv("GITHUB_OUTPUT")), "a") as env: @@ -163,8 +187,7 @@ def main() -> None: filtered_test_matrix = test_matrix if args.event_name == "schedule": - for config in filtered_test_matrix.get("include", []): - config["mem_leak_check"] = "mem_leak_check" + filtered_test_matrix = set_periodic_modes(filtered_test_matrix) # Set the filtered test matrix as the output set_output("test-matrix", json.dumps(filtered_test_matrix)) diff --git a/.github/scripts/test_filter_test_configs.py b/.github/scripts/test_filter_test_configs.py index a043a353554..55410e846c9 100755 --- a/.github/scripts/test_filter_test_configs.py +++ b/.github/scripts/test_filter_test_configs.py @@ -4,7 +4,14 @@ import os import yaml import json from unittest import TestCase, main, mock -from filter_test_configs import get_labels, filter, PREFIX, VALID_TEST_CONFIG_LABELS +from filter_test_configs import ( + get_labels, + filter, + set_periodic_modes, + PREFIX, + VALID_TEST_CONFIG_LABELS, + SUPPORTED_PERIODICAL_MODES +) import requests from requests.models import Response from typing import Any, Dict @@ -86,5 +93,26 @@ class TestConfigFilter(TestCase): self.assertEqual(case["expected"], json.dumps(filtered_test_matrix)) + def test_set_periodic_modes(self) -> None: + testcases = [ + { + "test_matrix": "{include: []}", + "description": "Empty test matrix", + }, + { + "test_matrix": '{include: [{config: "default", runner: "linux"}, {config: "cfg", runner: "macos"}]}', + "descripion": "Replicate each periodic mode in a different config", + }, + ] + + for case in testcases: + test_matrix = yaml.safe_load(case["test_matrix"]) + scheduled_test_matrix = set_periodic_modes(test_matrix) + self.assertEqual( + len(test_matrix["include"]) * len(SUPPORTED_PERIODICAL_MODES), + len(scheduled_test_matrix["include"]) + ) + + if __name__ == '__main__': main() diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml index dc1346205e6..6ad30080fd6 100644 --- 
a/.github/workflows/_linux-test.yml +++ b/.github/workflows/_linux-test.yml @@ -115,7 +115,8 @@ jobs: DOCKER_IMAGE: ${{ inputs.docker-image }} XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }} XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0'}} + PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }} + PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }} timeout-minutes: 240 run: | set -x @@ -170,6 +171,7 @@ jobs: -e XLA_CUDA \ -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \ + -e PYTORCH_TEST_RERUN_DISABLED_TESTS \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --ulimit stack=10485760:83886080 \ --security-opt seccomp=unconfined \ diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml index 82dee7b5484..cbc3372e1c4 100644 --- a/.github/workflows/_mac-test.yml +++ b/.github/workflows/_mac-test.yml @@ -129,7 +129,8 @@ jobs: - name: Test id: test env: - PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0'}} + PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }} + PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }} run: | COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}") diff --git a/.github/workflows/_rocm-test.yml b/.github/workflows/_rocm-test.yml index 0d8ff874ba0..dd1a0830275 100644 --- a/.github/workflows/_rocm-test.yml +++ b/.github/workflows/_rocm-test.yml @@ -97,7 +97,8 @@ jobs: DOCKER_IMAGE: ${{ inputs.docker-image }} XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla PYTORCH_JIT_ENABLE_NVFUSER: 1 - PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0'}} + PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }} + PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && 
'1' || '0' }} timeout-minutes: 270 run: | set -x @@ -148,6 +149,7 @@ jobs: -e SCCACHE_BUCKET \ -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \ + -e PYTORCH_TEST_RERUN_DISABLED_TESTS \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --ulimit stack=10485760:83886080 \ --security-opt seccomp=unconfined \ diff --git a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml index a0047abbc0f..0cabb8ec469 100644 --- a/.github/workflows/_win-test.yml +++ b/.github/workflows/_win-test.yml @@ -124,7 +124,8 @@ jobs: TEST_CONFIG: ${{ matrix.config }} PR_BODY: ${{ github.event.pull_request.body }} TORCH_CUDA_ARCH_LIST: "7.0" - PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0'}} + PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }} + PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }} run: | COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}") diff --git a/test/run_test.py b/test/run_test.py index 59454c6aaa3..1273ab45c4f 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -439,8 +439,11 @@ def run_test( if options.pytest: unittest_args = [arg if arg != "-f" else "-x" for arg in unittest_args] elif IS_CI: + ci_args = ["--import-slow-tests", "--import-disabled-tests"] + if os.getenv("PYTORCH_TEST_RERUN_DISABLED_TESTS", "0") == "1": + ci_args.append("--rerun-disabled-tests") # use the downloaded test cases configuration, not supported in pytest - unittest_args.extend(["--import-slow-tests", "--import-disabled-tests"]) + unittest_args.extend(ci_args) # Extra arguments are not supported with pytest executable = get_executable_command( diff --git a/test/test_dataloader.py b/test/test_dataloader.py index 6a7ff90527d..347f9be73e8 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -2716,6 +2716,9 @@ class ConvDataset(Dataset): @unittest.skipIf(IS_WINDOWS, "Needs fork") +@unittest.skipIf( + TEST_WITH_ASAN, + "This test hangs when 
running with ASAN, see https://github.com/pytorch/pytorch/issues/75492") class TestConvAfterFork(TestCase): # Tests crash reported in https://github.com/pytorch/pytorch/issues/53565 def test_conv_after_fork(self): diff --git a/test/test_indexing.py b/test/test_indexing.py index 1d5f2ea68ac..5dc23a3d546 100644 --- a/test/test_indexing.py +++ b/test/test_indexing.py @@ -11,7 +11,8 @@ from functools import reduce import numpy as np from torch.testing import make_tensor -from torch.testing._internal.common_utils import TestCase, run_tests +from torch.testing._internal.common_utils import ( + TestCase, run_tests, TEST_WITH_TORCHDYNAMO) from torch.testing._internal.common_device_type import ( instantiate_device_type_tests, onlyCUDA, dtypes, dtypesIfCPU, dtypesIfCUDA, onlyNativeDeviceTypes) @@ -737,6 +738,10 @@ class TestIndexing(TestCase): self.assertEqual(y, torch.ones(size=(10, 10), device=device)) self.assertEqual(len(w), 2) + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, + "This test causes SIGKILL when running with dynamo, https://github.com/pytorch/pytorch/issues/88472" + ) def test_index_put_accumulate_large_tensor(self, device): # This test is for tensors with number of elements >= INT_MAX (2^31 - 1). 
N = (1 << 31) + 5 diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 8f497d515eb..e0b703046c5 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -107,7 +107,6 @@ IS_REMOTE_GPU = os.getenv('PYTORCH_TEST_REMOTE_GPU') == '1' RETRY_TEST_CASES = os.getenv('PYTORCH_RETRY_TEST_CASES') == '1' OVERRIDE_FLAKY_SIGNAL = os.getenv('PYTORCH_OVERRIDE_FLAKY_SIGNAL') == '1' DISABLE_RUNNING_SCRIPT_CHK = os.getenv('PYTORCH_DISABLE_RUNNING_SCRIPT_CHK') == '1' -MAX_NUM_RETRIES = 3 DEFAULT_DISABLED_TESTS_FILE = '.pytorch-disabled-tests.json' DEFAULT_SLOW_TESTS_FILE = '.pytorch-slow-tests.json' @@ -506,6 +505,7 @@ parser.add_argument('--log-suffix', type=str, default="") parser.add_argument('--run-parallel', type=int, default=1) parser.add_argument('--import-slow-tests', type=str, nargs='?', const=DEFAULT_SLOW_TESTS_FILE) parser.add_argument('--import-disabled-tests', type=str, nargs='?', const=DEFAULT_DISABLED_TESTS_FILE) +parser.add_argument('--rerun-disabled-tests', action='store_true') # Only run when -h or --help flag is active to display both unittest and parser help messages. 
def run_unittest_help(argv): @@ -527,6 +527,9 @@ else: # infer flags based on the default settings GRAPH_EXECUTOR = cppProfilingFlagsToProfilingMode() +RERUN_DISABLED_TESTS = args.rerun_disabled_tests +# Rerun disabled tests many more times to make sure that they are not flaky anymore +MAX_NUM_RETRIES = 3 if not RERUN_DISABLED_TESTS else 50 SLOW_TESTS_FILE = args.import_slow_tests DISABLED_TESTS_FILE = args.import_disabled_tests @@ -1653,6 +1656,9 @@ def check_if_enable(test: unittest.TestCase): raise unittest.SkipTest("test is slow; run with PYTORCH_TEST_WITH_SLOW to enable test") sanitized_test_method_name = remove_device_and_dtype_suffixes(test._testMethodName) if not IS_SANDCASTLE: + should_skip = False + skip_msg = "" + for disabled_test, (issue_url, platforms) in disabled_tests_dict.items(): disable_test_parts = disabled_test.split() if len(disable_test_parts) > 1: @@ -1687,11 +1693,22 @@ def check_if_enable(test: unittest.TestCase): platforms = list(filter(lambda p: p in platform_to_conditional, platforms)) if platforms == [] or any([platform_to_conditional[platform] for platform in platforms]): + should_skip = True skip_msg = f"Test is disabled because an issue exists disabling it: {issue_url}" \ f" for {'all' if platforms == [] else ''}platform(s) {', '.join(platforms)}. " \ "If you're seeing this on your local machine and would like to enable this test, " \ "please make sure CI is not set and you are not using the flag --import-disabled-tests." 
- raise unittest.SkipTest(skip_msg) + break + + if should_skip and not RERUN_DISABLED_TESTS: + # Skip the disabled test when not running under --rerun-disabled-tests verification mode + raise unittest.SkipTest(skip_msg) + + if not should_skip and RERUN_DISABLED_TESTS: + skip_msg = "Test is enabled but --rerun-disabled-tests verification mode is set, so only" \ + " disabled tests are run" + raise unittest.SkipTest(skip_msg) + if TEST_SKIP_FAST: if not getattr(test, test._testMethodName).__dict__.get('slow_test', False): raise unittest.SkipTest("test is fast; we disabled it with PYTORCH_TEST_SKIP_FAST") @@ -2039,9 +2056,48 @@ class TestCase(expecttest.TestCase): def _run_with_retry(self, result=None, num_runs_left=0, report_only=True, num_red=0, num_green=0): using_unittest = isinstance(result, unittest.TestResult) if num_runs_left == 0: + # The logic when RERUN_DISABLED_TESTS is set to true is as follows: + # |-if the disabled test passes: + # |-- if it's flaky: + # |--- Do nothing because it's still flaky + # |-- elif it isn't flaky anymore: + # |--- Close the disabled ticket (later) + # | + # |- elif the disabled test fails after n retries: + # |-- This is expected, report this but don't fail the job + skipped_msg = { + "num_red": num_red, + "num_green": num_green, + "max_num_retries": MAX_NUM_RETRIES, + "rerun_disabled_test": RERUN_DISABLED_TESTS, + } + + traceback_str = "" + if RERUN_DISABLED_TESTS and using_unittest: + # Hide all failures and errors when RERUN_DISABLED_TESTS is enabled. 
This is + # a verification check, we don't want more red signals coming from it + if result.failures: + _, traceback_str = result.failures.pop(-1) + if result.errors: + _, traceback_str = result.errors.pop(-1) + + if traceback_str: + skipped_msg["traceback_str"] = traceback_str + + if num_green == 0: + # The disabled test fails, report as skipped but don't fail the job + result.addSkip(self, json.dumps(skipped_msg)) + + if num_red == 0: + # The test passes after re-running multiple times. This acts as a signal + # to confirm that it's not flaky anymore + result.addSuccess(self) + if num_green > 0 and num_red > 0 and using_unittest: - result.addSkip(self, f'{{"flaky": {True}, "num_red": {num_red}, "num_green": {num_green},' + - f'"max_num_retries": {MAX_NUM_RETRIES}}}') + skipped_msg["flaky"] = True + # Still flaky, do nothing + result.addSkip(self, json.dumps(skipped_msg)) + return if using_unittest: @@ -2100,9 +2156,13 @@ class TestCase(expecttest.TestCase): result.addExpectedFailure(self, err) self._run_with_retry(result=result, num_runs_left=num_retries_left, report_only=report_only, num_red=num_red + 1, num_green=num_green) - elif report_only and num_retries_left < MAX_NUM_RETRIES: + elif (RERUN_DISABLED_TESTS or report_only) and num_retries_left < MAX_NUM_RETRIES: + # Always re-run up to MAX_NUM_RETRIES when running under report only or rerun disabled tests modes print(f" {self._testMethodName} succeeded - num_retries_left: {num_retries_left}") - result.addUnexpectedSuccess(self) + if RERUN_DISABLED_TESTS: + result.addSuccess(self) + else: + result.addUnexpectedSuccess(self) self._run_with_retry(result=result, num_runs_left=num_retries_left, report_only=report_only, num_red=num_red, num_green=num_green + 1) elif not report_only and num_retries_left < MAX_NUM_RETRIES: