diff --git a/.github/scripts/filter_test_configs.py b/.github/scripts/filter_test_configs.py
index 06c8f90441e..bb5314434e0 100755
--- a/.github/scripts/filter_test_configs.py
+++ b/.github/scripts/filter_test_configs.py
@@ -34,6 +34,13 @@ VALID_TEST_CONFIG_LABELS = {f"{PREFIX}{label}" for label in {
     "xla",
 }}
 
+# Supported modes when running periodically
+SUPPORTED_PERIODICAL_MODES = {
+    "mem_leak_check",
+    "rerun_disabled_tests",
+}
+
+
 def parse_args() -> Any:
     from argparse import ArgumentParser
     parser = ArgumentParser("Filter all test configurations and keep only requested ones")
@@ -109,6 +116,23 @@ def filter(test_matrix: Dict[str, List[Any]], labels: Set[str]) -> Dict[str, Lis
     return filtered_test_matrix
 
 
+def set_periodic_modes(test_matrix: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+    """
+    Apply all periodic modes when running under a schedule
+    """
+    scheduled_test_matrix: Dict[str, List[Any]] = {
+        "include": [],
+    }
+
+    for config in test_matrix.get("include", []):
+        for mode in SUPPORTED_PERIODICAL_MODES:
+            cfg = config.copy()
+            cfg[mode] = mode
+            scheduled_test_matrix["include"].append(cfg)
+
+    return scheduled_test_matrix
+
+
 def set_output(name: str, val: Any) -> None:
     if os.getenv("GITHUB_OUTPUT"):
         with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env:
@@ -163,8 +187,7 @@ def main() -> None:
         filtered_test_matrix = test_matrix
 
     if args.event_name == "schedule":
-        for config in filtered_test_matrix.get("include", []):
-            config["mem_leak_check"] = "mem_leak_check"
+        filtered_test_matrix = set_periodic_modes(filtered_test_matrix)
 
     # Set the filtered test matrix as the output
     set_output("test-matrix", json.dumps(filtered_test_matrix))
diff --git a/.github/scripts/test_filter_test_configs.py b/.github/scripts/test_filter_test_configs.py
index a043a353554..55410e846c9 100755
--- a/.github/scripts/test_filter_test_configs.py
+++ b/.github/scripts/test_filter_test_configs.py
@@ -4,7 +4,14 @@ import os
 import yaml
 import json
 from unittest import TestCase, main, mock
-from filter_test_configs import get_labels, filter, PREFIX, VALID_TEST_CONFIG_LABELS
+from filter_test_configs import (
+    get_labels,
+    filter,
+    set_periodic_modes,
+    PREFIX,
+    VALID_TEST_CONFIG_LABELS,
+    SUPPORTED_PERIODICAL_MODES
+)
 import requests
 from requests.models import Response
 from typing import Any, Dict
@@ -86,5 +93,26 @@ class TestConfigFilter(TestCase):
 
             self.assertEqual(case["expected"], json.dumps(filtered_test_matrix))
 
+    def test_set_periodic_modes(self) -> None:
+        testcases = [
+            {
+                "test_matrix": "{include: []}",
+                "description": "Empty test matrix",
+            },
+            {
+                "test_matrix": '{include: [{config: "default", runner: "linux"}, {config: "cfg", runner: "macos"}]}',
+                "description": "Replicate each periodic mode in a different config",
+            },
+        ]
+
+        for case in testcases:
+            test_matrix = yaml.safe_load(case["test_matrix"])
+            scheduled_test_matrix = set_periodic_modes(test_matrix)
+            self.assertEqual(
+                len(test_matrix["include"]) * len(SUPPORTED_PERIODICAL_MODES),
+                len(scheduled_test_matrix["include"])
+            )
+
 
 if __name__ == '__main__':
     main()
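A quick illustration of the fan-out that `set_periodic_modes` performs, and the invariant that `test_set_periodic_modes` asserts. This is a minimal sketch reusing only names defined in the diff above:

```python
# Minimal sketch of the set_periodic_modes fan-out (same logic as the diff).
SUPPORTED_PERIODICAL_MODES = {"mem_leak_check", "rerun_disabled_tests"}

test_matrix = {"include": [{"config": "default", "runner": "linux"}]}

scheduled_test_matrix = {"include": []}
for config in test_matrix["include"]:
    for mode in SUPPORTED_PERIODICAL_MODES:
        cfg = config.copy()
        cfg[mode] = mode  # e.g. cfg["mem_leak_check"] = "mem_leak_check"
        scheduled_test_matrix["include"].append(cfg)

# Each input config is replicated once per periodic mode, so one input
# config and two modes yield two scheduled configs.
assert len(scheduled_test_matrix["include"]) == \
    len(test_matrix["include"]) * len(SUPPORTED_PERIODICAL_MODES)
```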
diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml
index dc1346205e6..6ad30080fd6 100644
--- a/.github/workflows/_linux-test.yml
+++ b/.github/workflows/_linux-test.yml
@@ -115,7 +115,8 @@ jobs:
           DOCKER_IMAGE: ${{ inputs.docker-image }}
           XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
           XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
-          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0'}}
+          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
+          PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
         timeout-minutes: 240
         run: |
           set -x
@@ -170,6 +171,7 @@ jobs:
             -e XLA_CUDA \
             -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
             -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
+            -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --ulimit stack=10485760:83886080 \
             --security-opt seccomp=unconfined \
diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml
index 82dee7b5484..cbc3372e1c4 100644
--- a/.github/workflows/_mac-test.yml
+++ b/.github/workflows/_mac-test.yml
@@ -129,7 +129,8 @@ jobs:
       - name: Test
         id: test
        env:
-          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0'}}
+          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
+          PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
         run: |
           COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}")
diff --git a/.github/workflows/_rocm-test.yml b/.github/workflows/_rocm-test.yml
index 0d8ff874ba0..dd1a0830275 100644
--- a/.github/workflows/_rocm-test.yml
+++ b/.github/workflows/_rocm-test.yml
@@ -97,7 +97,8 @@ jobs:
           DOCKER_IMAGE: ${{ inputs.docker-image }}
           XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
           PYTORCH_JIT_ENABLE_NVFUSER: 1
-          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0'}}
+          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
+          PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
         timeout-minutes: 270
         run: |
           set -x
@@ -148,6 +149,7 @@ jobs:
             -e SCCACHE_BUCKET \
             -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
             -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
+            -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --ulimit stack=10485760:83886080 \
             --security-opt seccomp=unconfined \
diff --git a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml
index a0047abbc0f..0cabb8ec469 100644
--- a/.github/workflows/_win-test.yml
+++ b/.github/workflows/_win-test.yml
@@ -124,7 +124,8 @@ jobs:
           TEST_CONFIG: ${{ matrix.config }}
           PR_BODY: ${{ github.event.pull_request.body }}
           TORCH_CUDA_ARCH_LIST: "7.0"
-          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0'}}
+          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
+          PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
         run: |
           COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}")
diff --git a/test/run_test.py b/test/run_test.py
index 59454c6aaa3..1273ab45c4f 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -439,8 +439,11 @@ def run_test(
     if options.pytest:
         unittest_args = [arg if arg != "-f" else "-x" for arg in unittest_args]
     elif IS_CI:
+        ci_args = ["--import-slow-tests", "--import-disabled-tests"]
+        if os.getenv("PYTORCH_TEST_RERUN_DISABLED_TESTS", "0") == "1":
+            ci_args.append("--rerun-disabled-tests")
         # use the downloaded test cases configuration, not supported in pytest
-        unittest_args.extend(["--import-slow-tests", "--import-disabled-tests"])
+        unittest_args.extend(ci_args)
 
     # Extra arguments are not supported with pytest
     executable = get_executable_command(
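For context: the `${{ matrix.rerun_disabled_tests && '1' || '0' }}` expression above evaluates to `'1'` only when the scheduled matrix entry carries a truthy `rerun_disabled_tests` key, and `run_test.py` then maps the exported env var onto the new `--rerun-disabled-tests` flag. A minimal sketch of that plumbing; `build_ci_args` is a hypothetical stand-in for the inline logic in `run_test.py`:

```python
import os

# Hypothetical helper (not in the diff) mirroring the run_test.py change:
# append --rerun-disabled-tests only when the workflow exported
# PYTORCH_TEST_RERUN_DISABLED_TESTS=1 for this matrix entry.
def build_ci_args():
    ci_args = ["--import-slow-tests", "--import-disabled-tests"]
    if os.getenv("PYTORCH_TEST_RERUN_DISABLED_TESTS", "0") == "1":
        ci_args.append("--rerun-disabled-tests")
    return ci_args

os.environ["PYTORCH_TEST_RERUN_DISABLED_TESTS"] = "1"
assert build_ci_args() == [
    "--import-slow-tests", "--import-disabled-tests", "--rerun-disabled-tests"
]
```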
diff --git a/test/test_dataloader.py b/test/test_dataloader.py
index 6a7ff90527d..347f9be73e8 100644
--- a/test/test_dataloader.py
+++ b/test/test_dataloader.py
@@ -2716,6 +2716,9 @@ class ConvDataset(Dataset):
 
 
 @unittest.skipIf(IS_WINDOWS, "Needs fork")
+@unittest.skipIf(
+    TEST_WITH_ASAN,
+    "This test hangs when running with ASAN, see https://github.com/pytorch/pytorch/issues/75492")
 class TestConvAfterFork(TestCase):
     # Tests crash reported in https://github.com/pytorch/pytorch/issues/53565
     def test_conv_after_fork(self):
diff --git a/test/test_indexing.py b/test/test_indexing.py
index 1d5f2ea68ac..5dc23a3d546 100644
--- a/test/test_indexing.py
+++ b/test/test_indexing.py
@@ -11,7 +11,8 @@ from functools import reduce
 import numpy as np
 
 from torch.testing import make_tensor
-from torch.testing._internal.common_utils import TestCase, run_tests
+from torch.testing._internal.common_utils import (
+    TestCase, run_tests, TEST_WITH_TORCHDYNAMO)
 from torch.testing._internal.common_device_type import (
     instantiate_device_type_tests, onlyCUDA, dtypes, dtypesIfCPU, dtypesIfCUDA,
     onlyNativeDeviceTypes)
@@ -737,6 +738,10 @@ class TestIndexing(TestCase):
             self.assertEqual(y, torch.ones(size=(10, 10), device=device))
         self.assertEqual(len(w), 2)
 
+    @unittest.skipIf(
+        TEST_WITH_TORCHDYNAMO,
+        "This test causes SIGKILL when running with dynamo, https://github.com/pytorch/pytorch/issues/88472"
+    )
     def test_index_put_accumulate_large_tensor(self, device):
         # This test is for tensors with number of elements >= INT_MAX (2^31 - 1).
         N = (1 << 31) + 5
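Both test files above stack a second `unittest.skipIf` onto an already-decorated target; the conditions OR together, so the test body runs only when every condition is false. A self-contained sketch, with placeholder flags rather than PyTorch's real `TEST_WITH_ASAN`/`TEST_WITH_TORCHDYNAMO` values:

```python
import unittest

RUNNING_WITH_ASAN = False    # placeholder flag, not PyTorch's
RUNNING_WITH_DYNAMO = True   # placeholder flag, not PyTorch's

class Example(unittest.TestCase):
    @unittest.skipIf(RUNNING_WITH_ASAN, "hangs under ASAN")
    @unittest.skipIf(RUNNING_WITH_DYNAMO, "crashes under dynamo")
    def test_something(self):
        # Reached only when neither flag above is set
        self.assertTrue(True)

if __name__ == "__main__":
    unittest.main()  # reports the test as skipped while RUNNING_WITH_DYNAMO is True
```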
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 8f497d515eb..e0b703046c5 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -107,7 +107,6 @@ IS_REMOTE_GPU = os.getenv('PYTORCH_TEST_REMOTE_GPU') == '1'
 RETRY_TEST_CASES = os.getenv('PYTORCH_RETRY_TEST_CASES') == '1'
 OVERRIDE_FLAKY_SIGNAL = os.getenv('PYTORCH_OVERRIDE_FLAKY_SIGNAL') == '1'
 DISABLE_RUNNING_SCRIPT_CHK = os.getenv('PYTORCH_DISABLE_RUNNING_SCRIPT_CHK') == '1'
-MAX_NUM_RETRIES = 3
 
 DEFAULT_DISABLED_TESTS_FILE = '.pytorch-disabled-tests.json'
 DEFAULT_SLOW_TESTS_FILE = '.pytorch-slow-tests.json'
@@ -506,6 +505,7 @@ parser.add_argument('--log-suffix', type=str, default="")
 parser.add_argument('--run-parallel', type=int, default=1)
 parser.add_argument('--import-slow-tests', type=str, nargs='?', const=DEFAULT_SLOW_TESTS_FILE)
 parser.add_argument('--import-disabled-tests', type=str, nargs='?', const=DEFAULT_DISABLED_TESTS_FILE)
+parser.add_argument('--rerun-disabled-tests', action='store_true')
 
 # Only run when -h or --help flag is active to display both unittest and parser help messages.
 def run_unittest_help(argv):
@@ -527,6 +527,9 @@ else:
     # infer flags based on the default settings
     GRAPH_EXECUTOR = cppProfilingFlagsToProfilingMode()
 
+RERUN_DISABLED_TESTS = args.rerun_disabled_tests
+# Rerun disabled tests many more times to make sure that they are not flaky anymore
+MAX_NUM_RETRIES = 3 if not RERUN_DISABLED_TESTS else 50
 
 SLOW_TESTS_FILE = args.import_slow_tests
 DISABLED_TESTS_FILE = args.import_disabled_tests
@@ -1653,6 +1656,9 @@ def check_if_enable(test: unittest.TestCase):
         raise unittest.SkipTest("test is slow; run with PYTORCH_TEST_WITH_SLOW to enable test")
     sanitized_test_method_name = remove_device_and_dtype_suffixes(test._testMethodName)
     if not IS_SANDCASTLE:
+        should_skip = False
+        skip_msg = ""
+
         for disabled_test, (issue_url, platforms) in disabled_tests_dict.items():
             disable_test_parts = disabled_test.split()
             if len(disable_test_parts) > 1:
@@ -1687,11 +1693,22 @@ def check_if_enable(test: unittest.TestCase):
                 platforms = list(filter(lambda p: p in platform_to_conditional, platforms))
                 if platforms == [] or any([platform_to_conditional[platform] for platform in platforms]):
+                    should_skip = True
                     skip_msg = f"Test is disabled because an issue exists disabling it: {issue_url}" \
                         f" for {'all' if platforms == [] else ''}platform(s) {', '.join(platforms)}. " \
                         "If you're seeing this on your local machine and would like to enable this test, " \
                         "please make sure CI is not set and you are not using the flag --import-disabled-tests."
-                    raise unittest.SkipTest(skip_msg)
+                    break
+
+        if should_skip and not RERUN_DISABLED_TESTS:
+            # Skip the disabled test when not running under --rerun-disabled-tests verification mode
+            raise unittest.SkipTest(skip_msg)
+
+        if not should_skip and RERUN_DISABLED_TESTS:
+            skip_msg = "Test is enabled but --rerun-disabled-tests verification mode is set, so only" \
+                " disabled tests are run"
+            raise unittest.SkipTest(skip_msg)
+
     if TEST_SKIP_FAST:
         if not getattr(test, test._testMethodName).__dict__.get('slow_test', False):
             raise unittest.SkipTest("test is fast; we disabled it with PYTORCH_TEST_SKIP_FAST")
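The gating that `check_if_enable` implements above reduces to a two-by-two decision on (is the test disabled?, is `--rerun-disabled-tests` set?). A sketch of that table, assuming `is_disabled` has already been resolved from the disabled-tests JSON; `should_run` is illustrative, not a function in the diff:

```python
def should_run(is_disabled: bool, rerun_disabled_tests: bool) -> bool:
    # Normal mode: skip anything with an open disable issue.
    if is_disabled and not rerun_disabled_tests:
        return False
    # Verification mode: skip the healthy tests, run only the disabled ones.
    if not is_disabled and rerun_disabled_tests:
        return False
    return True

assert should_run(is_disabled=True, rerun_disabled_tests=True)        # rerun target
assert not should_run(is_disabled=True, rerun_disabled_tests=False)   # normal skip
assert should_run(is_disabled=False, rerun_disabled_tests=False)      # normal run
assert not should_run(is_disabled=False, rerun_disabled_tests=True)   # out of scope
```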
@@ -2039,9 +2056,48 @@ class TestCase(expecttest.TestCase):
     def _run_with_retry(self, result=None, num_runs_left=0, report_only=True, num_red=0, num_green=0):
         using_unittest = isinstance(result, unittest.TestResult)
         if num_runs_left == 0:
+            # The logic when RERUN_DISABLED_TESTS is set to true is as follows:
+            # |-if the disabled test passes:
+            # |-- if it's flaky:
+            # |--- Do nothing because it's still flaky
+            # |-- elif it isn't flaky anymore:
+            # |--- Close the disabled ticket (later)
+            # |
+            # |- elif the disabled test fails after n retries:
+            # |-- This is expected, report this but don't fail the job
+            skipped_msg = {
+                "num_red": num_red,
+                "num_green": num_green,
+                "max_num_retries": MAX_NUM_RETRIES,
+                "rerun_disabled_test": RERUN_DISABLED_TESTS,
+            }
+
+            traceback_str = ""
+            if RERUN_DISABLED_TESTS and using_unittest:
+                # Hide all failures and errors when RERUN_DISABLED_TESTS is enabled. This is
+                # a verification check, we don't want more red signals coming from it
+                if result.failures:
+                    _, traceback_str = result.failures.pop(-1)
+                if result.errors:
+                    _, traceback_str = result.errors.pop(-1)
+
+                if traceback_str:
+                    skipped_msg["traceback_str"] = traceback_str
+
+                if num_green == 0:
+                    # The disabled test fails, report as skipped but don't fail the job
+                    result.addSkip(self, json.dumps(skipped_msg))
+
+                if num_red == 0:
+                    # The test passes after re-running multiple times. This acts as a signal
+                    # to confirm that it's not flaky anymore
+                    result.addSuccess(self)
+
             if num_green > 0 and num_red > 0 and using_unittest:
-                result.addSkip(self, f'{{"flaky": {True}, "num_red": {num_red}, "num_green": {num_green},' +
-                               f'"max_num_retries": {MAX_NUM_RETRIES}}}')
+                skipped_msg["flaky"] = True
+                # Still flaky, do nothing
+                result.addSkip(self, json.dumps(skipped_msg))
+
             return
 
         if using_unittest:
@@ -2100,9 +2156,13 @@ class TestCase(expecttest.TestCase):
                     result.addExpectedFailure(self, err)
                 self._run_with_retry(result=result, num_runs_left=num_retries_left, report_only=report_only,
                                      num_red=num_red + 1, num_green=num_green)
-            elif report_only and num_retries_left < MAX_NUM_RETRIES:
+            elif (RERUN_DISABLED_TESTS or report_only) and num_retries_left < MAX_NUM_RETRIES:
+                # Always re-run up to MAX_NUM_RETRIES when running under report only or rerun disabled tests modes
                 print(f"    {self._testMethodName} succeeded - num_retries_left: {num_retries_left}")
-                result.addUnexpectedSuccess(self)
+                if RERUN_DISABLED_TESTS:
+                    result.addSuccess(self)
+                else:
+                    result.addUnexpectedSuccess(self)
                 self._run_with_retry(result=result, num_runs_left=num_retries_left, report_only=report_only,
                                      num_red=num_red, num_green=num_green + 1)
             elif not report_only and num_retries_left < MAX_NUM_RETRIES:
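Once the retries are exhausted, the `num_red`/`num_green` tallies decide how a rerun is reported without ever failing the job. The `classify` helper below is an illustrative summary of those branches, not code from the diff:

```python
def classify(num_red: int, num_green: int) -> str:
    # Mirrors the num_runs_left == 0 branches in _run_with_retry above.
    if num_green == 0:
        return "still failing"      # addSkip with the captured traceback attached
    if num_red == 0:
        return "not flaky anymore"  # addSuccess; the disable issue can be closed later
    return "still flaky"            # addSkip with flaky=True

assert classify(num_red=50, num_green=0) == "still failing"
assert classify(num_red=0, num_green=50) == "not flaky anymore"
assert classify(num_red=5, num_green=45) == "still flaky"
```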