Mirror of https://github.com/saymrwulf/pytorch.git (synced 2026-05-14 20:57:59 +00:00)
[CI][ROCm] fix device visibility, again (#91813)
The previous PR #91137 was incomplete. Although it correctly queried the number of available GPUs, test files still ended up sharing the same GPU. This PR lifts the maxtasksperchild=1 restriction so that each Pool worker persists for the whole run and keeps using its own GPU. It also adds a Note in run_test.py for future reference.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/91813
Approved by: https://github.com/kit1980, https://github.com/huydhn, https://github.com/malfet
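For context on the mechanism: each Pool worker pins itself to one GPU through HIP_VISIBLE_DEVICES, and because the workers are no longer recycled after every test file, that pinning holds for the whole run. Below is a minimal, self-contained sketch of the idea; the helper names (pin_worker_to_gpu, run_test_file), the initializer-based wiring, and the round-robin assignment are illustrative assumptions, not code from run_test.py.

import os
from multiprocessing import current_process, get_context

NUM_PROCS = 4  # illustrative: one pool worker per GPU on a 4-GPU runner


def pin_worker_to_gpu():
    # Pool initializer, run once in each spawned worker. Worker identities
    # count up from 1, so each long-lived worker claims its own device.
    # (_identity is a private multiprocessing attribute, used here only to
    # keep the sketch short.)
    worker_id = current_process()._identity[0]
    os.environ["HIP_VISIBLE_DEVICES"] = str(worker_id % NUM_PROCS)


def run_test_file(test_file):
    # Stand-in for launching one test file; anything this worker spawns
    # inherits HIP_VISIBLE_DEVICES and therefore its dedicated GPU.
    return "{} -> GPU {}".format(test_file, os.environ["HIP_VISIBLE_DEVICES"])


if __name__ == "__main__":
    # maxtasksperchild=None keeps each worker process (and its pinned GPU)
    # alive for the whole run. With maxtasksperchild=1, every test file gets
    # a brand-new process with an ever-increasing identity, so concurrently
    # running workers can collide on the same device.
    with get_context("spawn").Pool(
        NUM_PROCS, initializer=pin_worker_to_gpu, maxtasksperchild=None
    ) as pool:
        for line in pool.map(run_test_file, ["test_{}".format(i) for i in range(8)]):
            print(line)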
parent 4f1f14e38b
commit f44946289b
2 changed files with 15 additions and 2 deletions
run_test.py

@@ -50,6 +50,17 @@ except ImportError:
 )
 
 
+# Note [ROCm parallel CI testing]
+# https://github.com/pytorch/pytorch/pull/85770 added file-granularity parallel testing.
+# In .jenkins/pytorch/test.sh, TEST_CONFIG == "default", CUDA and HIP_VISIBLE_DEVICES is set to 0.
+# This results in multiple test files sharing the same GPU.
+# This should be a supported use case for ROCm, but it exposed issues in the kernel driver resulting in hangs.
+# See https://github.com/pytorch/pytorch/issues/90940.
+#
+# Further, ROCm self-hosted runners have up to 4 GPUs.
+# Device visibility was set to 0 to match CUDA test behavior, but this was wasting available GPU resources.
+# Assigning each Pool worker their own dedicated GPU avoids the ROCm oversubscription issues.
+# This should also result in better overall wall clock time since all GPUs can be utilized.
 def maybe_set_hip_visible_devies():
     # Special handling of ROCm GHA runners for parallel (file granularity) tests.
     if torch.version.hip:
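The hunk cuts off inside maybe_set_hip_visible_devies at the if torch.version.hip: check, so the body is not visible here. Going by the Note, the function's job is to export HIP_VISIBLE_DEVICES for the calling Pool worker; a plausible sketch of such a body (an assumption, not the file's actual code) is:

import os
from multiprocessing import current_process

import torch

NUM_PROCS = 2  # stand-in for the value computed in the second changed file


def maybe_set_hip_visible_devies():
    # Special handling of ROCm GHA runners for parallel (file granularity) tests.
    if torch.version.hip:
        p = current_process()
        if p.name != "MainProcess":
            # A parallel Pool worker rather than the main process: pin this
            # worker to a single GPU so test files never share a device.
            # (_identity is a private attribute; shown only for illustration.)
            os.environ["HIP_VISIBLE_DEVICES"] = str(p._identity[0] % NUM_PROCS)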
@@ -1319,7 +1330,8 @@ def main():
     print_to_stderr("parallel (file granularity) tests:\n {}".format("\n ".join(selected_tests_parallel)))
     print_to_stderr("serial (file granularity) tests:\n {}".format("\n ".join(selected_tests_serial)))
 
-    pool = get_context("spawn").Pool(NUM_PROCS, maxtasksperchild=1)
+    # See Note [ROCm parallel CI testing]
+    pool = get_context("spawn").Pool(NUM_PROCS, maxtasksperchild=None if torch.version.hip else 1)
     os.makedirs(REPO_ROOT / "test" / "test-reports", exist_ok=True)
 
     def success_callback(err_message):
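The only functional change in the run_test.py hunk above is the maxtasksperchild argument, and its effect is easy to observe in isolation. A toy demo (nothing here comes from the repository) that reports which process handled each task: maxtasksperchild=1 hands every task to a freshly spawned worker, while None reuses the same two processes throughout.

import os
from multiprocessing import get_context


def whoami(task):
    # Report which worker process ran this task.
    return "task {} ran in pid {}".format(task, os.getpid())


if __name__ == "__main__":
    ctx = get_context("spawn")
    for mtpc in (1, None):
        print("maxtasksperchild={}".format(mtpc))
        with ctx.Pool(2, maxtasksperchild=mtpc) as pool:
            for line in pool.map(whoami, range(4)):
                print("  " + line)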
@@ -9,7 +9,8 @@ IS_MEM_LEAK_CHECK = os.getenv("PYTORCH_TEST_CUDA_MEM_LEAK_CHECK", "0") == "1"
 
 NUM_PROCS = 1 if IS_MEM_LEAK_CHECK else 2
 
-# Special logic for ROCm GHA runners.
+# See Note [ROCm parallel CI testing]
+# Special logic for ROCm GHA runners to query number of GPUs available.
 # torch.version.hip was not available to check if this was a ROCm self-hosted runner.
 # Must check for ROCm runner in another way. We look for /opt/rocm directory.
 if os.path.exists("/opt/rocm") and not IS_MEM_LEAK_CHECK:
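The second changed file (the hunk above) computes NUM_PROCS, but the body of the if block falls outside the hunk, so the actual GPU-count query is not shown. Since torch cannot be imported at that point (per the comment), the count has to come from the system; one way to do that, shown purely as an assumption and not as the repository's implementation, is to parse rocminfo output:

import os
import subprocess

IS_MEM_LEAK_CHECK = os.getenv("PYTORCH_TEST_CUDA_MEM_LEAK_CHECK", "0") == "1"
NUM_PROCS = 1 if IS_MEM_LEAK_CHECK else 2

if os.path.exists("/opt/rocm") and not IS_MEM_LEAK_CHECK:
    try:
        # rocminfo lists every HSA agent with a "Device Type:" field whose
        # value is CPU or GPU; count the GPU agents.
        out = subprocess.run(
            ["/opt/rocm/bin/rocminfo"], capture_output=True, text=True, check=True
        ).stdout
        gpu_count = sum(
            1
            for line in out.splitlines()
            if line.strip().startswith("Device Type") and line.strip().endswith("GPU")
        )
        if gpu_count > 0:
            NUM_PROCS = gpu_count
    except (OSError, subprocess.CalledProcessError):
        # Leave the default if rocminfo is missing or fails.
        pass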