[CI][ROCm] fix device visibility, again (#91813)

The previous PR #91137 was incomplete: though it successfully queried the number of available GPUs, test files still ended up sharing the same GPU. This PR lifts the maxtasksperchild=1 restriction for ROCm so that Pool workers persist and each keeps its assigned GPU. It also adds a Note in run_test.py for future reference.
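A minimal sketch (not PyTorch's actual code) of the mechanism this fix relies on: with maxtasksperchild=1, the pool replaces each worker after every task, so a device assignment made in the worker's initializer does not persist; with maxtasksperchild=None, workers live for the whole pool and keep their assigned GPU. The helper names below are illustrative, `current_process()._identity` is a private multiprocessing detail, and "fork" is used here (instead of run_test.py's "spawn") only to keep the example import-safe.

```python
import os
from multiprocessing import get_context, current_process

NUM_PROCS = 2  # stand-in for run_test.py's NUM_PROCS

def pin_device():
    # Runs once per worker process. Derive a stable device index from the
    # worker's identity so each persistent worker owns one GPU.
    worker_id = current_process()._identity[0]
    os.environ["HIP_VISIBLE_DEVICES"] = str(worker_id % NUM_PROCS)

def run_test_file(name):
    # Each "test file" reports which GPU its worker was pinned to.
    return name, os.environ.get("HIP_VISIBLE_DEVICES")

def demo():
    ctx = get_context("fork")
    # maxtasksperchild=None (the default) keeps workers alive across tasks,
    # so the device chosen in pin_device() stays with the worker.
    with ctx.Pool(NUM_PROCS, initializer=pin_device, maxtasksperchild=None) as pool:
        return pool.map(run_test_file, ["test_a", "test_b", "test_c", "test_d"])

if __name__ == "__main__":
    for name, gpu in demo():
        print(name, "-> GPU", gpu)
```

With maxtasksperchild=1, each of the four tasks would instead run in a freshly spawned worker, re-running the initializer each time and defeating the per-worker GPU pinning.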

Pull Request resolved: https://github.com/pytorch/pytorch/pull/91813
Approved by: https://github.com/kit1980, https://github.com/huydhn, https://github.com/malfet
This commit is contained in:
Jeff Daily 2023-01-06 22:19:07 +00:00 committed by PyTorch MergeBot
parent 4f1f14e38b
commit f44946289b
2 changed files with 15 additions and 2 deletions


@@ -50,6 +50,17 @@ except ImportError:
)
# Note [ROCm parallel CI testing]
# https://github.com/pytorch/pytorch/pull/85770 added file-granularity parallel testing.
# In .jenkins/pytorch/test.sh, when TEST_CONFIG == "default", both CUDA_VISIBLE_DEVICES and HIP_VISIBLE_DEVICES are set to 0.
# This results in multiple test files sharing the same GPU.
# This should be a supported use case for ROCm, but it exposed issues in the kernel driver resulting in hangs.
# See https://github.com/pytorch/pytorch/issues/90940.
#
# Further, ROCm self-hosted runners have up to 4 GPUs.
# Device visibility was set to 0 to match CUDA test behavior, but this was wasting available GPU resources.
# Assigning each Pool worker its own dedicated GPU avoids the ROCm oversubscription issues.
# This should also result in better overall wall clock time since all GPUs can be utilized.
def maybe_set_hip_visible_devies():
# Special handling of ROCm GHA runners for parallel (file granularity) tests.
if torch.version.hip:
@@ -1319,7 +1330,8 @@ def main():
print_to_stderr("parallel (file granularity) tests:\n {}".format("\n ".join(selected_tests_parallel)))
print_to_stderr("serial (file granularity) tests:\n {}".format("\n ".join(selected_tests_serial)))
pool = get_context("spawn").Pool(NUM_PROCS, maxtasksperchild=1)
# See Note [ROCm parallel CI testing]
pool = get_context("spawn").Pool(NUM_PROCS, maxtasksperchild=None if torch.version.hip else 1)
os.makedirs(REPO_ROOT / "test" / "test-reports", exist_ok=True)
def success_callback(err_message):


@@ -9,7 +9,8 @@ IS_MEM_LEAK_CHECK = os.getenv("PYTORCH_TEST_CUDA_MEM_LEAK_CHECK", "0") == "1"
NUM_PROCS = 1 if IS_MEM_LEAK_CHECK else 2
# Special logic for ROCm GHA runners.
# See Note [ROCm parallel CI testing]
# Special logic for ROCm GHA runners to query the number of available GPUs.
# torch is not importable at this point, so torch.version.hip cannot be used to check whether this is a ROCm self-hosted runner.
# Must detect a ROCm runner another way: we look for the /opt/rocm directory.
if os.path.exists("/opt/rocm") and not IS_MEM_LEAK_CHECK:
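The body of this branch is truncated by the hunk above. As a hedged illustration of the idea only (counting GPUs on a ROCm host without importing torch; the exact method, the `count_rocm_gpus` name, and the sysfs paths are assumptions of this sketch, not taken from the PR), the KFD topology exposed under /sys can be scanned, since GPU nodes report a nonzero simd_count while CPU nodes report 0:

```python
import glob
import re

def count_rocm_gpus():
    # Hypothetical helper: one properties file exists per KFD topology node;
    # GPU nodes have simd_count > 0, CPU nodes have simd_count 0.
    count = 0
    for props in glob.glob("/sys/class/kfd/kfd/topology/nodes/*/properties"):
        with open(props) as f:
            text = f.read()
        m = re.search(r"^simd_count (\d+)$", text, re.MULTILINE)
        if m and int(m.group(1)) > 0:
            count += 1
    return count
```

On a machine without ROCm installed, the glob matches nothing and the helper returns 0, which is consistent with the /opt/rocm existence check guarding this branch.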