Various CI settings (#117668)

Test [ci-verbose-test-logs] (this worked: the test logs print while the tests run, are interleaved, and are really long)

Settings for no timeout (step timeout still applies, only gets rid of ~30 min timeout for shard of test file) and no piping logs/extra verbose test logs (good for debugging deadlocks but results in very long and possibly interleaved logs).

Also allows these settings to be enabled via the PR body by putting the label name in brackets, e.g. [label name], as in the test above.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/117668
Approved by: https://github.com/huydhn
This commit is contained in:
Catherine Lee 2024-01-26 00:17:29 +00:00 committed by PyTorch MergeBot
parent 8c167f9fc3
commit de9ddd19a5
11 changed files with 126 additions and 14 deletions

View file

@ -26,11 +26,17 @@ outputs:
description: True if the filtered test configs matrix is empty. False otherwise.
value: ${{ steps.filter.outputs.is-test-matrix-empty }}
keep-going:
description: True if keep-going label was on PR.
description: True if keep-going label was on PR or [keep-going] in PR body.
value: ${{ steps.filter.outputs.keep-going }}
reenabled-issues:
description: Comma separated list of issue numbers that should correspond to disable test issues that the PR fixes
value: ${{ steps.filter.outputs.reenabled-issues }}
ci-verbose-test-logs:
description: True if ci-verbose-test-logs label was on PR or [ci-verbose-test-logs] in PR body.
value: ${{ steps.filter.outputs.ci-verbose-test-logs }}
ci-no-test-timeout:
description: True if ci-no-test-timeout label was on PR or [ci-no-test-timeout] in PR body.
value: ${{ steps.filter.outputs.ci-no-test-timeout }}
runs:
using: composite

View file

@ -474,6 +474,10 @@ def get_reenabled_issues(pr_body: str = "") -> List[str]:
return parse_reenabled_issues(pr_body) + parse_reenabled_issues(commit_messages)
def check_for_setting(labels: Set[str], body: str, setting: str) -> bool:
    """Return True if *setting* is enabled for this PR.

    A setting is considered enabled when it is present as a PR label, or
    when it appears in bracketed form (e.g. ``[keep-going]``) anywhere in
    the PR body.
    """
    if setting in labels:
        return True
    return f"[{setting}]" in body
def perform_misc_tasks(
labels: Set[str], test_matrix: Dict[str, List[Any]], job_name: str, pr_body: str
) -> None:
@ -481,7 +485,14 @@ def perform_misc_tasks(
In addition to apply the filter logic, the script also does the following
misc tasks to set keep-going and is-unstable variables
"""
set_output("keep-going", "keep-going" in labels)
set_output("keep-going", check_for_setting(labels, pr_body, "keep-going"))
set_output(
"ci-verbose-test-logs",
check_for_setting(labels, pr_body, "ci-verbose-test-logs"),
)
set_output(
"ci-no-test-timeout", check_for_setting(labels, pr_body, "ci-no-test-timeout")
)
# Obviously, if the job name includes unstable, then this is an unstable job
is_unstable = job_name and IssueType.UNSTABLE.value in job_name

View file

@ -636,55 +636,98 @@ class TestConfigFilter(TestCase):
@mock.patch("subprocess.check_output")
def test_perform_misc_tasks(self, mocked_subprocess: Any) -> None:
def _gen_expected_string(
keep_going: bool = False,
ci_verbose_test_logs: bool = False,
ci_no_test_timeout: bool = False,
is_unstable: bool = False,
reenabled_issues: str = "",
) -> str:
return (
f"keep-going={keep_going}\n"
f"ci-verbose-test-logs={ci_verbose_test_logs}\n"
f"ci-no-test-timeout={ci_no_test_timeout}\n"
f"is-unstable={is_unstable}\n"
f"reenabled-issues={reenabled_issues}\n"
)
mocked_subprocess.return_value = b""
testcases: List[Dict[str, Any]] = [
{
"labels": {},
"test_matrix": '{include: [{config: "default"}]}',
"job_name": "A job name",
"expected": "keep-going=False\nis-unstable=False\nreenabled-issues=\n",
"expected": _gen_expected_string(),
"description": "No keep-going, no is-unstable",
},
{
"labels": {"keep-going"},
"test_matrix": '{include: [{config: "default"}]}',
"job_name": "A job name",
"expected": "keep-going=True\nis-unstable=False\nreenabled-issues=\n",
"expected": _gen_expected_string(keep_going=True),
"description": "Has keep-going, no is-unstable",
},
{
"labels": {},
"test_matrix": '{include: [{config: "default"}]}',
"job_name": "A job name",
"pr_body": "[keep-going]",
"expected": _gen_expected_string(keep_going=True),
"description": "Keep-going in PR body",
},
{
"labels": {"ci-verbose-test-logs"},
"test_matrix": '{include: [{config: "default"}]}',
"job_name": "A job name",
"pr_body": "[ci-no-test-timeout]",
"expected": _gen_expected_string(
ci_verbose_test_logs=True, ci_no_test_timeout=True
),
"description": "No pipe logs label and no test timeout in PR body",
},
{
"labels": {"ci-no-test-timeout"},
"test_matrix": '{include: [{config: "default"}]}',
"job_name": "A job name",
"pr_body": "[ci-verbose-test-logs]",
"expected": _gen_expected_string(
ci_verbose_test_logs=True, ci_no_test_timeout=True
),
"description": "No pipe logs in PR body and no test timeout in label (same as the above but swapped)",
},
{
"labels": {},
"test_matrix": '{include: [{config: "default"}]}',
"job_name": None,
"expected": "keep-going=False\nis-unstable=False\nreenabled-issues=\n",
"expected": _gen_expected_string(),
"description": "No job name",
},
{
"labels": {},
"test_matrix": '{include: [{config: "default"}]}',
"job_name": "macos-12-py3-arm64 / test (default, 1, 3, macos-m1-stable, unstable)",
"expected": "keep-going=False\nis-unstable=True\nreenabled-issues=\n",
"expected": _gen_expected_string(is_unstable=True),
"description": "Unstable job",
},
{
"labels": {},
"test_matrix": '{include: [{config: "default"}]}',
"job_name": "macos-12-py3-arm64 / test (default, 1, 3, macos-m1-stable, unstable)",
"expected": "keep-going=False\nis-unstable=True\nreenabled-issues=\n",
"expected": _gen_expected_string(is_unstable=True),
"description": "Unstable job",
},
{
"labels": {},
"test_matrix": '{include: [{config: "1", unstable: "unstable"}, {config: "2", unstable: "unstable"}]}',
"job_name": "macos-12-py3-arm64 / build",
"expected": "keep-going=False\nis-unstable=True\nreenabled-issues=\n",
"expected": _gen_expected_string(is_unstable=True),
"description": "All configs are unstable",
},
{
"labels": {},
"test_matrix": '{include: [{config: "1", unstable: "unstable"}, {config: "2"}]}',
"job_name": "macos-12-py3-arm64 / build",
"expected": "keep-going=False\nis-unstable=False\nreenabled-issues=\n",
"expected": _gen_expected_string(is_unstable=False),
"description": "Only mark some configs as unstable",
},
{
@ -692,7 +735,7 @@ class TestConfigFilter(TestCase):
"test_matrix": '{include: [{config: "default"}]}',
"job_name": "A job name",
"pr_body": "resolves #123 fixes #234",
"expected": "keep-going=False\nis-unstable=False\nreenabled-issues=123,234\n",
"expected": _gen_expected_string(reenabled_issues="123,234"),
"description": "Reenable some issues",
},
]

View file

@ -169,6 +169,8 @@ jobs:
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
@ -218,6 +220,8 @@ jobs:
-e NUM_TEST_SHARDS \
-e REENABLED_ISSUES \
-e CONTINUE_THROUGH_ERROR \
-e VERBOSE_TEST_LOGS \
-e NO_TEST_TIMEOUT \
-e PR_LABELS \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e SCCACHE_BUCKET \

View file

@ -34,6 +34,8 @@ jobs:
test-matrix: ${{ steps.filter.outputs.test-matrix }}
is-test-matrix-empty: ${{ steps.filter.outputs.is-test-matrix-empty }}
keep-going: ${{ steps.filter.outputs.keep-going }}
ci-verbose-test-logs: ${{ steps.filter.outputs.ci-verbose-test-logs }}
ci-no-test-timeout: ${{ steps.filter.outputs.ci-no-test-timeout }}
reenabled-issues: ${{ steps.filter.outputs.reenabled-issues }}
steps:
- name: Checkout PyTorch
@ -95,6 +97,8 @@ jobs:
PY_VERS: 3.9
PR_BODY: ${{ github.event.pull_request.body }}
CONTINUE_THROUGH_ERROR: ${{ needs.filter.outputs.keep-going }}
VERBOSE_TEST_LOGS: ${{ needs.filter.outputs.ci-verbose-test-logs }}
NO_TEST_TIMEOUT: ${{ needs.filter.outputs.ci-no-test-timeout }}
PIP_REQUIREMENTS_FILE: .github/requirements/pip-requirements-${{ runner.os }}.txt
REENABLED_ISSUES: ${{ needs.filter.outputs.reenabled-issues }}
run: |

View file

@ -148,6 +148,8 @@ jobs:
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
PIP_REQUIREMENTS_FILE: .github/requirements/pip-requirements-${{ runner.os }}.txt
GITHUB_REPOSITORY: ${{ github.repository }}
GITHUB_WORKFLOW: ${{ github.workflow }}

View file

@ -148,6 +148,8 @@ jobs:
BRANCH: ${{ steps.parse-ref.outputs.branch }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
TEST_CONFIG: ${{ matrix.config }}
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
@ -196,6 +198,8 @@ jobs:
-e NUM_TEST_SHARDS \
-e REENABLED_ISSUES \
-e CONTINUE_THROUGH_ERROR \
-e VERBOSE_TEST_LOGS \
-e NO_TEST_TIMEOUT \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e SCCACHE_BUCKET \
-e XLA_CLANG_CACHE_S3_BUCKET_NAME \

View file

@ -140,6 +140,8 @@ jobs:
INSTALL_WINDOWS_SDK: 1
PYTHON_VERSION: 3.8
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
VC_PRODUCT: "BuildTools"
VC_VERSION: ""
VS_VERSION: "16.8.6"

View file

@ -143,6 +143,8 @@ jobs:
PYTORCH_RETRY_TEST_CASES: 1
PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
TEST_CONFIG: ${{ matrix.config }}
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
@ -185,6 +187,8 @@ jobs:
-e PYTORCH_RETRY_TEST_CASES \
-e PYTORCH_OVERRIDE_FLAKY_SIGNAL \
-e CONTINUE_THROUGH_ERROR \
-e VERBOSE_TEST_LOGS \
-e NO_TEST_TIMEOUT \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e SCCACHE_BUCKET \
-e XLA_CLANG_CACHE_S3_BUCKET_NAME \

View file

@ -129,6 +129,24 @@ class _NodeReporterReruns(_NodeReporter):
tag.text = bin_xml_escape(content)
self.append(tag)
def append_skipped(self, report: TestReport) -> None:
    """Append a ``<skipped>`` element to this node's JUnit XML report.

    Overrides pytest's implementation so that characters not supported by
    XML (e.g. control characters) are escaped in the skip reason and
    details via ``bin_xml_escape`` before being written.
    """
    # Referenced from the below
    # https://github.com/pytest-dev/pytest/blob/2178ee86d7c1ee93748cfb46540a6e40b4761f2d/src/_pytest/junitxml.py#L236C6-L236C6
    # Modified to escape characters not supported by xml in the skip reason. Everything else should be the same.
    if hasattr(report, "wasxfail"):
        # xfail-marked skips need no escaping changes, so defer to the base
        # class rather than copying its code.
        # Super here instead of the actual code so we can reduce possible divergence
        super().append_skipped(report)
    else:
        # For a plain skip, longrepr is a (filename, lineno, reason) tuple.
        assert isinstance(report.longrepr, tuple)
        filename, lineno, skipreason = report.longrepr
        # Drop pytest's "Skipped: " prefix so only the user-supplied reason remains.
        if skipreason.startswith("Skipped: "):
            skipreason = skipreason[9:]
        # NOTE(review): upstream pytest formats this as f"{filename}:{lineno}: ..."
        # but `filename` is unpacked and unused here; the literal "(unknown)" may be
        # an artifact of how this file was extracted — verify against the real source.
        details = f"(unknown):{lineno}: {skipreason}"
        # Escape both the message attribute and the element text so invalid
        # XML characters cannot corrupt the report file.
        skipped = ET.Element("skipped", type="pytest.skip", message=bin_xml_escape(skipreason))
        skipped.text = bin_xml_escape(details)
        self.append(skipped)
        self.write_captured_output(report)
class LogXMLReruns(LogXML):
def __init__(self, *args, **kwargs):

View file

@ -605,7 +605,7 @@ def run_test(
argv = [test_file + ".py"] + unittest_args
os.makedirs(REPO_ROOT / "test" / "test-reports", exist_ok=True)
if IS_CI:
if options.pipe_logs:
log_fd, log_path = tempfile.mkstemp(
dir=REPO_ROOT / "test" / "test-reports",
prefix=f"{sanitize_file_name(str(test_module))}_",
@ -619,7 +619,9 @@ def run_test(
"BUILD_ENVRIONMENT", ""
)
timeout = (
THRESHOLD * 6
None
if not options.enable_timeout
else THRESHOLD * 6
if is_slow
else THRESHOLD * 3
if should_retry
@ -631,7 +633,7 @@ def run_test(
with ExitStack() as stack:
output = None
if IS_CI:
if options.pipe_logs:
output = stack.enter_context(open(log_path, "w"))
if should_retry:
@ -664,7 +666,7 @@ def run_test(
# comes up in the future.
ret_code = 0 if ret_code == 5 or ret_code == 4 else ret_code
if IS_CI:
if options.pipe_logs:
handle_log_file(
test_module, log_path, failed=(ret_code != 0), was_rerun=was_rerun
)
@ -1249,6 +1251,18 @@ def parse_args():
help="Runs the full test suite despite one of the tests failing",
default=strtobool(os.environ.get("CONTINUE_THROUGH_ERROR", "False")),
)
parser.add_argument(
"--pipe-logs",
action="store_true",
help="Print logs to output file while running tests. True if in CI and env var is not set",
default=IS_CI and not strtobool(os.environ.get("VERBOSE_TEST_LOGS", "False")),
)
parser.add_argument(
"--enable-timeout",
action="store_true",
help="Set a timeout based on the test times json file. Only works if there are test times available",
default=IS_CI and not strtobool(os.environ.get("NO_TEST_TIMEOUT", "False")),
)
parser.add_argument(
"additional_unittest_args",
nargs="*",