CI sanity check test for env vars (#120519)

Add a test that fails on purpose to trigger retries. It checks the opposite of the success condition: it asserts that the env vars are absent, so it fails when they exist.

It's a bit hacky because I want the test to fail in the normal flow in order to trigger reruns, but I don't want to expose the failures to users since that would be confusing.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/120519
Approved by: https://github.com/huydhn
This commit is contained in:
Catherine Lee 2024-03-11 15:35:45 +00:00 committed by PyTorch MergeBot
parent 6c11d3ce0c
commit fac06a12c8
2 changed files with 42 additions and 1 deletions

View file

@ -2,6 +2,7 @@
import argparse
import copy
import glob
import json
import os
import pathlib
@ -381,6 +382,7 @@ def run_test(
launcher_cmd=None,
extra_unittest_args=None,
env=None,
print_log=True,
) -> int:
env = env or os.environ.copy()
maybe_set_hip_visible_devies()
@ -542,7 +544,7 @@ def run_test(
# comes up in the future.
ret_code = 0 if ret_code == 5 or ret_code == 4 else ret_code
if options.pipe_logs:
if options.pipe_logs and print_log:
handle_log_file(
test_module, log_path, failed=(ret_code != 0), was_rerun=was_rerun
)
@ -1002,6 +1004,23 @@ def get_pytest_args(options, is_cpp_test=False, is_distributed_test=False):
return pytest_args
def run_ci_sanity_check(test: ShardedTest, test_directory, options):
    """Run the intentionally failing CI sanity-check test and remove its artifacts.

    The handler only accepts ``test_ci_sanity_check_fail``. It runs the test
    through ``run_test`` with ``print_log=False``, treats a return code of 1 as
    the expected outcome, and then deletes the log files and report
    directories named after the test under ``test/test-reports``.

    Args:
        test: The sharded test to run; its name must be
            ``test_ci_sanity_check_fail``.
        test_directory: Forwarded unchanged to ``run_test``.
        options: Forwarded unchanged to ``run_test``.

    Returns:
        0 if the test failed as expected (return code 1), otherwise 1.
    """
    assert (
        test.name == "test_ci_sanity_check_fail"
    ), f"This handler only works for test_ci_sanity_check_fail, got {test.name}"
    ret_code = run_test(test, test_directory, options, print_log=False)
    # This test should fail; any other return code means the sanity check
    # itself is broken, so report a failure to the caller.
    if ret_code != 1:
        return 1
    test_reports_dir = str(REPO_ROOT / "test/test-reports")
    # Delete the log files and xmls generated by the test
    for file in glob.glob(f"{test_reports_dir}/{test.name}*.log"):
        os.remove(file)
    # "**" only spans nested directories when recursive=True; without it the
    # pattern behaves like a single "*" and matches exactly one level.
    for dirname in glob.glob(f"{test_reports_dir}/**/{test.name}", recursive=True):
        # Skip matches that are not directories or that were already removed
        # together with a matching parent directory.
        if os.path.isdir(dirname):
            shutil.rmtree(dirname)
    return 0
CUSTOM_HANDLERS = {
"test_cuda_primary_ctx": run_test_with_subprocess,
"test_cuda_nvml_based_avail": run_test_with_subprocess,
@ -1024,6 +1043,7 @@ CUSTOM_HANDLERS = {
"distributed/rpc/test_share_memory": run_test_with_subprocess,
"distributed/rpc/cuda/test_tensorpipe_agent": run_test_with_subprocess,
"doctests": run_doctests,
"test_ci_sanity_check_fail": run_ci_sanity_check,
}

View file

@ -0,0 +1,21 @@
# Owner(s): ["module: ci"]
# Sanity check for CI setup in GHA. This file is expected to fail so it can trigger reruns
import os
from torch.testing._internal.common_utils import run_tests, slowTest, TestCase
class TestCISanityCheck(TestCase):
def test_env_vars_exist(self):
# This check should fail and trigger reruns. If it passes, something is wrong
self.assertTrue(os.environ.get("CI") is None)
@slowTest
def test_env_vars_exist_slow(self):
# Same as the above, but for the slow suite
self.assertTrue(os.environ.get("CI") is None)
# When this file is executed directly, invoke run_tests (imported above from
# torch.testing._internal.common_utils).
if __name__ == "__main__":
run_tests()