CI sanity check test for env vars (#120519)

Make a test that fails on purpose in order to trigger retries. The test asserts the opposite of the real success condition (that the env vars exist), so it fails whenever they are set. It's a bit hacky: I want it to fail in the normal flow in order to trigger reruns, but I don't want to expose the failures to users, since that would be confusing. Pull Request resolved: https://github.com/pytorch/pytorch/pull/120519 Approved by: https://github.com/huydhn
2026-05-14 20:57:59 +00:00 · 2024-03-11 15:35:45 +00:00 · 2024-03-11 15:35:45 +00:00 · fac06a12c8
commit fac06a12c8
parent 6c11d3ce0c
2 changed files with 42 additions and 1 deletions
--- a/test/run_test.py
+++ b/test/run_test.py
@ -2,6 +2,7 @@

 import argparse
 import copy
+import glob
 import json
 import os
 import pathlib
@ -381,6 +382,7 @@ def run_test(
    launcher_cmd=None,
    extra_unittest_args=None,
    env=None,
+    print_log=True,
 ) -> int:
    env = env or os.environ.copy()
    maybe_set_hip_visible_devies()
@ -542,7 +544,7 @@ def run_test(
            # comes up in the future.
            ret_code = 0 if ret_code == 5 or ret_code == 4 else ret_code

-    if options.pipe_logs:
+    if options.pipe_logs and print_log:
        handle_log_file(
            test_module, log_path, failed=(ret_code != 0), was_rerun=was_rerun
        )
@ -1002,6 +1004,23 @@ def get_pytest_args(options, is_cpp_test=False, is_distributed_test=False):
    return pytest_args


+def run_ci_sanity_check(test: ShardedTest, test_directory, options):
+    assert (
+        test.name == "test_ci_sanity_check_fail"
+    ), f"This handler only works for test_ci_sanity_check_fail, got {test.name}"
+    ret_code = run_test(test, test_directory, options, print_log=False)
+    # This test should fail
+    if ret_code != 1:
+        return 1
+    test_reports_dir = str(REPO_ROOT / "test/test-reports")
+    # Delete the log files and xmls generated by the test
+    for file in glob.glob(f"{test_reports_dir}/{test.name}*.log"):
+        os.remove(file)
+    for dirname in glob.glob(f"{test_reports_dir}/**/{test.name}"):
+        shutil.rmtree(dirname)
+    return 0
+
+
 CUSTOM_HANDLERS = {
    "test_cuda_primary_ctx": run_test_with_subprocess,
    "test_cuda_nvml_based_avail": run_test_with_subprocess,
@ -1024,6 +1043,7 @@ CUSTOM_HANDLERS = {
    "distributed/rpc/test_share_memory": run_test_with_subprocess,
    "distributed/rpc/cuda/test_tensorpipe_agent": run_test_with_subprocess,
    "doctests": run_doctests,
+    "test_ci_sanity_check_fail": run_ci_sanity_check,
 }


--- a/test/test_ci_sanity_check_fail.py
+++ b/test/test_ci_sanity_check_fail.py
@ -0,0 +1,21 @@
+# Owner(s): ["module: ci"]
+# Sanity check for CI setup in GHA.  This file is expected to fail so it can trigger reruns
+
+import os
+
+from torch.testing._internal.common_utils import run_tests, slowTest, TestCase
+
+
+class TestCISanityCheck(TestCase):
+    def test_env_vars_exist(self):
+        # This check should fail and trigger reruns.  If it passes, something is wrong
+        self.assertTrue(os.environ.get("CI") is None)
+
+    @slowTest
+    def test_env_vars_exist_slow(self):
+        # Same as the above, but for the slow suite
+        self.assertTrue(os.environ.get("CI") is None)
+
+
+if __name__ == "__main__":
+    run_tests()