From de9ddd19a5c4ff9c254a085572d34ea6f4197724 Mon Sep 17 00:00:00 2001
From: Catherine Lee <csl@fb.com>
Date: Fri, 26 Jan 2024 00:17:29 +0000
Subject: [PATCH] Various CI settings (#117668)

Test [ci-verbose-test-logs] (this worked, the test logs printing while running and interleaved and are really long)

Settings for no timeout (step timeout still applies, only gets rid of ~30 min timeout for shard of test file) and no piping logs/extra verbose test logs (good for debugging deadlocks but results in very long and possibly interleaved logs).

Also allows these to be set via pr body if the label name is in brackets ex [label name] or the test above.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/117668
Approved by: https://github.com/huydhn
---
 .../actions/filter-test-configs/action.yml    |  8 ++-
 .github/scripts/filter_test_configs.py        | 13 +++-
 .github/scripts/test_filter_test_configs.py   | 59 ++++++++++++++++---
 .github/workflows/_linux-test.yml             |  4 ++
 .github/workflows/_mac-test-mps.yml           |  4 ++
 .github/workflows/_mac-test.yml               |  2 +
 .github/workflows/_rocm-test.yml              |  4 ++
 .github/workflows/_win-test.yml               |  2 +
 .github/workflows/_xpu-test.yml               |  4 ++
 test/conftest.py                              | 18 ++++++
 test/run_test.py                              | 22 +++++--
 11 files changed, 126 insertions(+), 14 deletions(-)

diff --git a/.github/actions/filter-test-configs/action.yml b/.github/actions/filter-test-configs/action.yml
index 3fb107a17b9..8439a27eb9f 100644
--- a/.github/actions/filter-test-configs/action.yml
+++ b/.github/actions/filter-test-configs/action.yml
@@ -26,11 +26,17 @@ outputs:
     description: True if the filtered test configs matrix is empty. False otherwise.
     value: ${{ steps.filter.outputs.is-test-matrix-empty }}
   keep-going:
-    description: True if keep-going label was on PR.
+    description: True if keep-going label was on PR or [keep-going] in PR body.
     value: ${{ steps.filter.outputs.keep-going }}
   reenabled-issues:
     description: Comma separated list of issue numbers that should correspond to disable test issues that the PR fixes
     value: ${{ steps.filter.outputs.reenabled-issues }}
+  ci-verbose-test-logs:
+    description: True if ci-verbose-test-logs label was on PR or [ci-verbose-test-logs] in PR body.
+    value: ${{ steps.filter.outputs.ci-verbose-test-logs }}
+  ci-no-test-timeout:
+    description: True if ci-no-test-timeout label was on PR or [ci-no-test-timeout] in PR body.
+    value: ${{ steps.filter.outputs.ci-no-test-timeout }}
 
 runs:
   using: composite
diff --git a/.github/scripts/filter_test_configs.py b/.github/scripts/filter_test_configs.py
index c25b0f6fe84..e63e3d88677 100755
--- a/.github/scripts/filter_test_configs.py
+++ b/.github/scripts/filter_test_configs.py
@@ -474,6 +474,10 @@ def get_reenabled_issues(pr_body: str = "") -> List[str]:
     return parse_reenabled_issues(pr_body) + parse_reenabled_issues(commit_messages)
 
 
+def check_for_setting(labels: Set[str], body: str, setting: str) -> bool:
+    return setting in labels or f"[{setting}]" in body
+
+
 def perform_misc_tasks(
     labels: Set[str], test_matrix: Dict[str, List[Any]], job_name: str, pr_body: str
 ) -> None:
@@ -481,7 +485,14 @@ def perform_misc_tasks(
     In addition to apply the filter logic, the script also does the following
     misc tasks to set keep-going and is-unstable variables
     """
-    set_output("keep-going", "keep-going" in labels)
+    set_output("keep-going", check_for_setting(labels, pr_body, "keep-going"))
+    set_output(
+        "ci-verbose-test-logs",
+        check_for_setting(labels, pr_body, "ci-verbose-test-logs"),
+    )
+    set_output(
+        "ci-no-test-timeout", check_for_setting(labels, pr_body, "ci-no-test-timeout")
+    )
 
     # Obviously, if the job name includes unstable, then this is an unstable job
     is_unstable = job_name and IssueType.UNSTABLE.value in job_name
diff --git a/.github/scripts/test_filter_test_configs.py b/.github/scripts/test_filter_test_configs.py
index 40be20a0995..5c5dfa79d50 100755
--- a/.github/scripts/test_filter_test_configs.py
+++ b/.github/scripts/test_filter_test_configs.py
@@ -636,55 +636,98 @@ class TestConfigFilter(TestCase):
 
     @mock.patch("subprocess.check_output")
     def test_perform_misc_tasks(self, mocked_subprocess: Any) -> None:
+        def _gen_expected_string(
+            keep_going: bool = False,
+            ci_verbose_test_logs: bool = False,
+            ci_no_test_timeout: bool = False,
+            is_unstable: bool = False,
+            reenabled_issues: str = "",
+        ) -> str:
+            return (
+                f"keep-going={keep_going}\n"
+                f"ci-verbose-test-logs={ci_verbose_test_logs}\n"
+                f"ci-no-test-timeout={ci_no_test_timeout}\n"
+                f"is-unstable={is_unstable}\n"
+                f"reenabled-issues={reenabled_issues}\n"
+            )
+
         mocked_subprocess.return_value = b""
         testcases: List[Dict[str, Any]] = [
             {
                 "labels": {},
                 "test_matrix": '{include: [{config: "default"}]}',
                 "job_name": "A job name",
-                "expected": "keep-going=False\nis-unstable=False\nreenabled-issues=\n",
+                "expected": _gen_expected_string(),
                 "description": "No keep-going, no is-unstable",
             },
             {
                 "labels": {"keep-going"},
                 "test_matrix": '{include: [{config: "default"}]}',
                 "job_name": "A job name",
-                "expected": "keep-going=True\nis-unstable=False\nreenabled-issues=\n",
+                "expected": _gen_expected_string(keep_going=True),
                 "description": "Has keep-going, no is-unstable",
             },
+            {
+                "labels": {},
+                "test_matrix": '{include: [{config: "default"}]}',
+                "job_name": "A job name",
+                "pr_body": "[keep-going]",
+                "expected": _gen_expected_string(keep_going=True),
+                "description": "Keep-going in PR body",
+            },
+            {
+                "labels": {"ci-verbose-test-logs"},
+                "test_matrix": '{include: [{config: "default"}]}',
+                "job_name": "A job name",
+                "pr_body": "[ci-no-test-timeout]",
+                "expected": _gen_expected_string(
+                    ci_verbose_test_logs=True, ci_no_test_timeout=True
+                ),
+                "description": "No pipe logs label and no test timeout in PR body",
+            },
+            {
+                "labels": {"ci-no-test-timeout"},
+                "test_matrix": '{include: [{config: "default"}]}',
+                "job_name": "A job name",
+                "pr_body": "[ci-verbose-test-logs]",
+                "expected": _gen_expected_string(
+                    ci_verbose_test_logs=True, ci_no_test_timeout=True
+                ),
+                "description": "No pipe logs in PR body and no test timeout in label (same as the above but swapped)",
+            },
             {
                 "labels": {},
                 "test_matrix": '{include: [{config: "default"}]}',
                 "job_name": None,
-                "expected": "keep-going=False\nis-unstable=False\nreenabled-issues=\n",
+                "expected": _gen_expected_string(),
                 "description": "No job name",
             },
             {
                 "labels": {},
                 "test_matrix": '{include: [{config: "default"}]}',
                 "job_name": "macos-12-py3-arm64 / test (default, 1, 3, macos-m1-stable, unstable)",
-                "expected": "keep-going=False\nis-unstable=True\nreenabled-issues=\n",
+                "expected": _gen_expected_string(is_unstable=True),
                 "description": "Unstable job",
             },
             {
                 "labels": {},
                 "test_matrix": '{include: [{config: "default"}]}',
                 "job_name": "macos-12-py3-arm64 / test (default, 1, 3, macos-m1-stable, unstable)",
-                "expected": "keep-going=False\nis-unstable=True\nreenabled-issues=\n",
+                "expected": _gen_expected_string(is_unstable=True),
                 "description": "Unstable job",
             },
             {
                 "labels": {},
                 "test_matrix": '{include: [{config: "1", unstable: "unstable"}, {config: "2", unstable: "unstable"}]}',
                 "job_name": "macos-12-py3-arm64 / build",
-                "expected": "keep-going=False\nis-unstable=True\nreenabled-issues=\n",
+                "expected": _gen_expected_string(is_unstable=True),
                 "description": "All configs are unstable",
             },
             {
                 "labels": {},
                 "test_matrix": '{include: [{config: "1", unstable: "unstable"}, {config: "2"}]}',
                 "job_name": "macos-12-py3-arm64 / build",
-                "expected": "keep-going=False\nis-unstable=False\nreenabled-issues=\n",
+                "expected": _gen_expected_string(is_unstable=False),
                 "description": "Only mark some configs as unstable",
             },
             {
@@ -692,7 +735,7 @@ class TestConfigFilter(TestCase):
                 "test_matrix": '{include: [{config: "default"}]}',
                 "job_name": "A job name",
                 "pr_body": "resolves #123 fixes #234",
-                "expected": "keep-going=False\nis-unstable=False\nreenabled-issues=123,234\n",
+                "expected": _gen_expected_string(reenabled_issues="123,234"),
                 "description": "Reenable some issues",
             },
         ]
diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml
index 1d14950549a..3e0962f29c5 100644
--- a/.github/workflows/_linux-test.yml
+++ b/.github/workflows/_linux-test.yml
@@ -169,6 +169,8 @@ jobs:
           NUM_TEST_SHARDS: ${{ matrix.num_shards }}
           REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
           CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
+          VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
+          NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
           SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
           SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
           SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
@@ -218,6 +220,8 @@ jobs:
             -e NUM_TEST_SHARDS \
             -e REENABLED_ISSUES \
             -e CONTINUE_THROUGH_ERROR \
+            -e VERBOSE_TEST_LOGS \
+            -e NO_TEST_TIMEOUT \
             -e PR_LABELS \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
diff --git a/.github/workflows/_mac-test-mps.yml b/.github/workflows/_mac-test-mps.yml
index b10c1f84bd7..33302a16ecc 100644
--- a/.github/workflows/_mac-test-mps.yml
+++ b/.github/workflows/_mac-test-mps.yml
@@ -34,6 +34,8 @@ jobs:
       test-matrix: ${{ steps.filter.outputs.test-matrix }}
       is-test-matrix-empty: ${{ steps.filter.outputs.is-test-matrix-empty }}
       keep-going: ${{ steps.filter.outputs.keep-going }}
+      ci-verbose-test-logs: ${{ steps.filter.outputs.ci-verbose-test-logs }}
+      ci-no-test-timeout: ${{ steps.filter.outputs.ci-no-test-timeout }}
       reenabled-issues: ${{ steps.filter.outputs.reenabled-issues }}
     steps:
       - name: Checkout PyTorch
@@ -95,6 +97,8 @@ jobs:
           PY_VERS: 3.9
           PR_BODY: ${{ github.event.pull_request.body }}
           CONTINUE_THROUGH_ERROR: ${{ needs.filter.outputs.keep-going }}
+          VERBOSE_TEST_LOGS: ${{ needs.filter.outputs.ci-verbose-test-logs }}
+          NO_TEST_TIMEOUT: ${{ needs.filter.outputs.ci-no-test-timeout }}
           PIP_REQUIREMENTS_FILE: .github/requirements/pip-requirements-${{ runner.os }}.txt
           REENABLED_ISSUES: ${{ needs.filter.outputs.reenabled-issues }}
         run: |
diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml
index 4848a566f15..92728df26d4 100644
--- a/.github/workflows/_mac-test.yml
+++ b/.github/workflows/_mac-test.yml
@@ -148,6 +148,8 @@ jobs:
           PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
           PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
           CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
+          VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
+          NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
           PIP_REQUIREMENTS_FILE: .github/requirements/pip-requirements-${{ runner.os }}.txt
           GITHUB_REPOSITORY: ${{ github.repository }}
           GITHUB_WORKFLOW: ${{ github.workflow }}
diff --git a/.github/workflows/_rocm-test.yml b/.github/workflows/_rocm-test.yml
index 5f4f8aafe6a..f62f2a38782 100644
--- a/.github/workflows/_rocm-test.yml
+++ b/.github/workflows/_rocm-test.yml
@@ -148,6 +148,8 @@ jobs:
           BRANCH: ${{ steps.parse-ref.outputs.branch }}
           SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
           CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
+          VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
+          NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
           TEST_CONFIG: ${{ matrix.config }}
           SHARD_NUMBER: ${{ matrix.shard }}
           NUM_TEST_SHARDS: ${{ matrix.num_shards }}
@@ -196,6 +198,8 @@ jobs:
             -e NUM_TEST_SHARDS \
             -e REENABLED_ISSUES \
             -e CONTINUE_THROUGH_ERROR \
+            -e VERBOSE_TEST_LOGS \
+            -e NO_TEST_TIMEOUT \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
             -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
diff --git a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml
index ebfe4211b34..5fc8da283d2 100644
--- a/.github/workflows/_win-test.yml
+++ b/.github/workflows/_win-test.yml
@@ -140,6 +140,8 @@ jobs:
           INSTALL_WINDOWS_SDK: 1
           PYTHON_VERSION: 3.8
           CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
+          VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
+          NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
           VC_PRODUCT: "BuildTools"
           VC_VERSION: ""
           VS_VERSION: "16.8.6"
diff --git a/.github/workflows/_xpu-test.yml b/.github/workflows/_xpu-test.yml
index a3f05b09522..915e8f6dabb 100644
--- a/.github/workflows/_xpu-test.yml
+++ b/.github/workflows/_xpu-test.yml
@@ -143,6 +143,8 @@ jobs:
           PYTORCH_RETRY_TEST_CASES: 1
           PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
           CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
+          VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
+          NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
           TEST_CONFIG: ${{ matrix.config }}
           SHARD_NUMBER: ${{ matrix.shard }}
           NUM_TEST_SHARDS: ${{ matrix.num_shards }}
@@ -185,6 +187,8 @@ jobs:
             -e PYTORCH_RETRY_TEST_CASES \
             -e PYTORCH_OVERRIDE_FLAKY_SIGNAL \
             -e CONTINUE_THROUGH_ERROR \
+            -e VERBOSE_TEST_LOGS \
+            -e NO_TEST_TIMEOUT \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
             -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
diff --git a/test/conftest.py b/test/conftest.py
index c7a1570f0bf..6253ca7c0ba 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -129,6 +129,24 @@ class _NodeReporterReruns(_NodeReporter):
         tag.text = bin_xml_escape(content)
         self.append(tag)
 
+    def append_skipped(self, report: TestReport) -> None:
+        # Referenced from the below
+        # https://github.com/pytest-dev/pytest/blob/2178ee86d7c1ee93748cfb46540a6e40b4761f2d/src/_pytest/junitxml.py#L236C6-L236C6
+        # Modified to escape characters not supported by xml in the skip reason.  Everything else should be the same.
+        if hasattr(report, "wasxfail"):
+            # Super here instead of the actual code so we can reduce possible divergence
+            super().append_skipped(report)
+        else:
+            assert isinstance(report.longrepr, tuple)
+            filename, lineno, skipreason = report.longrepr
+            if skipreason.startswith("Skipped: "):
+                skipreason = skipreason[9:]
+            details = f"{filename}:{lineno}: {skipreason}"
+
+            skipped = ET.Element("skipped", type="pytest.skip", message=bin_xml_escape(skipreason))
+            skipped.text = bin_xml_escape(details)
+            self.append(skipped)
+            self.write_captured_output(report)
 
 class LogXMLReruns(LogXML):
     def __init__(self, *args, **kwargs):
diff --git a/test/run_test.py b/test/run_test.py
index b091413415c..eba5363e5bf 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -605,7 +605,7 @@ def run_test(
         argv = [test_file + ".py"] + unittest_args
 
     os.makedirs(REPO_ROOT / "test" / "test-reports", exist_ok=True)
-    if IS_CI:
+    if options.pipe_logs:
         log_fd, log_path = tempfile.mkstemp(
             dir=REPO_ROOT / "test" / "test-reports",
             prefix=f"{sanitize_file_name(str(test_module))}_",
@@ -619,7 +619,9 @@ def run_test(
         "BUILD_ENVRIONMENT", ""
     )
     timeout = (
-        THRESHOLD * 6
+        None
+        if not options.enable_timeout
+        else THRESHOLD * 6
         if is_slow
         else THRESHOLD * 3
         if should_retry
@@ -631,7 +633,7 @@ def run_test(
 
     with ExitStack() as stack:
         output = None
-        if IS_CI:
+        if options.pipe_logs:
             output = stack.enter_context(open(log_path, "w"))
 
         if should_retry:
@@ -664,7 +666,7 @@ def run_test(
             # comes up in the future.
             ret_code = 0 if ret_code == 5 or ret_code == 4 else ret_code
 
-    if IS_CI:
+    if options.pipe_logs:
         handle_log_file(
             test_module, log_path, failed=(ret_code != 0), was_rerun=was_rerun
         )
@@ -1249,6 +1251,18 @@ def parse_args():
         help="Runs the full test suite despite one of the tests failing",
         default=strtobool(os.environ.get("CONTINUE_THROUGH_ERROR", "False")),
     )
+    parser.add_argument(
+        "--pipe-logs",
+        action="store_true",
+        help="Print logs to output file while running tests.  True if in CI and env var is not set",
+        default=IS_CI and not strtobool(os.environ.get("VERBOSE_TEST_LOGS", "False")),
+    )
+    parser.add_argument(
+        "--enable-timeout",
+        action="store_true",
+        help="Set a timeout based on the test times json file.  Only works if there are test times available",
+        default=IS_CI and not strtobool(os.environ.get("NO_TEST_TIMEOUT", "False")),
+    )
     parser.add_argument(
         "additional_unittest_args",
         nargs="*",