From ffe0c1ae4d8de75bc4d6cf5c43123df1e538d8a4 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Sat, 14 Dec 2019 16:01:40 -0800 Subject: [PATCH] Make test_torch.py pass cuda-memcheck (#29243) Summary: Make the following changes: - When there are more than 10k errors, cuda-memcheck only shows 10k errors; in this case we shouldn't raise an Exception - Add PYTORCH_CUDA_MEMCHECK environment variable to allow disabling `pin_memory` tests when running cuda-memcheck. - Add a `--ci` command option; when turned on, this script writes its output to stdout instead of to a file, and exits with an error if cuda-memcheck fails - Add a `--nohang` command option. When turned on, a hang is treated as a pass instead of an error - Do simple filtering on the tests to run: skip a test if `'cpu'` is in the test name but `'cuda'` is not - Add `--split` and `--rank` to allow splitting the work (NVIDIA CI has a limitation of 3 hours, we have to split the work to satisfy this limitation) - The error summary could be `ERROR SUMMARY: 1 error`, or `ERROR SUMMARY: 2 errors`, the tail could be `error` or `errors`, it is not of the same length. The script is fixed to handle this case. 
- Ignore errors from `cufft` Pull Request resolved: https://github.com/pytorch/pytorch/pull/29243 Differential Revision: D18941701 Pulled By: mruberry fbshipit-source-id: 2048428f32b66ef50c67444c03ce4dd9491179d2 --- test/common_device_type.py | 3 ++ test/scripts/cuda_memcheck_common.py | 15 ++++++--- test/scripts/run_cuda_memcheck.py | 49 +++++++++++++++++++++++++--- test/test_torch.py | 4 ++- 4 files changed, 60 insertions(+), 11 deletions(-) diff --git a/test/common_device_type.py b/test/common_device_type.py index b70d1d9e7e7..687aaf89d05 100644 --- a/test/common_device_type.py +++ b/test/common_device_type.py @@ -2,6 +2,7 @@ import inspect import threading from functools import wraps import unittest +import os import torch from common_utils import TestCase, TEST_WITH_ROCM, TEST_MKL, \ skipCUDANonDefaultStreamIf @@ -247,6 +248,8 @@ device_type_test_bases.append(CPUTestBase) if torch.cuda.is_available(): device_type_test_bases.append(CUDATestBase) +PYTORCH_CUDA_MEMCHECK = os.getenv('PYTORCH_CUDA_MEMCHECK', '0') == '1' + # Adds 'instantiated' device-specific test cases to the given scope. 
# The tests in these test cases are derived from the generic tests in diff --git a/test/scripts/cuda_memcheck_common.py b/test/scripts/cuda_memcheck_common.py index 7f7dc825339..aa52ced783b 100644 --- a/test/scripts/cuda_memcheck_common.py +++ b/test/scripts/cuda_memcheck_common.py @@ -9,15 +9,20 @@ class ParseError(Exception): class Report: """A report is a container of errors, and a summary on how many errors are found""" - HEAD = 'ERROR SUMMARY: ' - TAIL = ' errors' - def __init__(self, text, errors): + # text is something like + # ERROR SUMMARY: 1 error + # or + # ERROR SUMMARY: 2 errors self.text = text - self.num_errors = int(text[len(self.HEAD):len(text) - len(self.TAIL)]) + self.num_errors = int(text.strip().split()[2]) self.errors = errors if len(errors) != self.num_errors: - raise ParseError("Number of errors does not match") + if len(errors) == 10000 and self.num_errors > 10000: + # When there are more than 10k errors, cuda-memcheck only display 10k + self.num_errors = 10000 + else: + raise ParseError("Number of errors does not match") class Error: diff --git a/test/scripts/run_cuda_memcheck.py b/test/scripts/run_cuda_memcheck.py index f80fa84350c..2541e4bff48 100755 --- a/test/scripts/run_cuda_memcheck.py +++ b/test/scripts/run_cuda_memcheck.py @@ -18,7 +18,8 @@ import multiprocessing import argparse import subprocess import tqdm -import re +import os +import sys import cuda_memcheck_common as cmc ALL_TESTS = [] @@ -35,6 +36,13 @@ parser.add_argument('--nproc', type=int, default=multiprocessing.cpu_count(), help='Number of processes running tests, default to number of cores in the system') parser.add_argument('--gpus', default='all', help='GPU assignments for each process, it could be "all", or : separated list like "1,2:3,4:5,6"') +parser.add_argument('--ci', action='store_true', + help='Whether this script is executed in CI. When executed inside a CI, this script fails when ' + 'an error is detected. 
Also, it will not show tqdm progress bar, but directly print the error ' 'to stdout instead.') parser.add_argument('--nohang', action='store_true', help='Treat timeout as success') parser.add_argument('--split', type=int, default=1, help='Split the job into pieces') parser.add_argument('--rank', type=int, default=0, help='Which piece this process should pick') args = parser.parse_args() # Filters that ignores cublas/cudnn errors @@ -48,10 +56,13 @@ def is_ignored_only(output): return False count_ignored_errors = 0 for e in report.errors: - if 'libcublas' in ''.join(e.stack) or 'libcudnn' in ''.join(e.stack): + if 'libcublas' in ''.join(e.stack) or 'libcudnn' in ''.join(e.stack) or 'libcufft' in ''.join(e.stack): count_ignored_errors += 1 return count_ignored_errors == report.num_errors +# Set environment PYTORCH_CUDA_MEMCHECK=1 to allow skipping some tests +os.environ['PYTORCH_CUDA_MEMCHECK'] = '1' + # Discover tests: # To get a list of tests, run: # pytest --setup-only test/test_torch.py @@ -66,6 +77,21 @@ for line in lines: line = line.replace('::', '.') ALL_TESTS.append(line) +# Do a simple filtering: +# if 'cpu' or 'CPU' is in the name and 'cuda' or 'CUDA' is not in the name, then skip it +def is_cpu_only(name): + name = name.lower() + return ('cpu' in name) and not ('cuda' in name) + +ALL_TESTS = [x for x in ALL_TESTS if not is_cpu_only(x)] + +# Split all tests into chunks, and only run the selected chunk +ALL_TESTS.sort() +chunk_size = (len(ALL_TESTS) + args.split - 1) // args.split +start = chunk_size * args.rank +end = chunk_size * (args.rank + 1) +ALL_TESTS = ALL_TESTS[start:end] + # Run tests: # Since running cuda-memcheck on PyTorch unit tests is very slow, these tests must be run in parallel. # This is done by using the coroutine feature in new Python versions. 
A number of coroutines are created; @@ -74,8 +100,17 @@ for line in lines: # These subprocesses are balanced across different GPUs on the system by assigning one devices per process, # or as specified by the user progress = 0 -logfile = open('result.log', 'w') -progressbar = tqdm.tqdm(total=len(ALL_TESTS)) +if not args.ci: + logfile = open('result.log', 'w') + progressbar = tqdm.tqdm(total=len(ALL_TESTS)) +else: + logfile = sys.stdout + + # create a fake progress bar that does not display anything + class ProgressbarStub: + def update(*args): + return + progressbar = ProgressbarStub() async def run1(coroutine_id): global progress @@ -97,6 +132,8 @@ async def run1(coroutine_id): except asyncio.TimeoutError: print('Timeout:', test, file=logfile) proc.kill() + if args.ci and not args.nohang: + sys.exit("Hang detected on cuda-memcheck") else: if proc.returncode == 0: print('Success:', test, file=logfile) @@ -108,13 +145,15 @@ async def run1(coroutine_id): print('Fail:', test, file=logfile) print(stdout, file=logfile) print(stderr, file=logfile) + if args.ci: + sys.exit("Failure detected on cuda-memcheck") else: print('Ignored:', test, file=logfile) del proc progressbar.update(1) async def main(): - tasks = [asyncio.create_task(run1(i)) for i in range(args.nproc)] + tasks = [asyncio.ensure_future(run1(i)) for i in range(args.nproc)] for t in tasks: await t diff --git a/test/test_torch.py b/test/test_torch.py index 7481ecced2e..35ca02d2f5e 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -34,7 +34,7 @@ from common_utils import TestCase, iter_indices, TEST_NUMPY, TEST_SCIPY, TEST_MK IS_SANDCASTLE, load_tests, brute_pdist, brute_cdist, slowTest, \ skipCUDANonDefaultStreamIf, skipCUDAMemoryLeakCheckIf from multiprocessing.reduction import ForkingPickler -from common_device_type import instantiate_device_type_tests, \ +from common_device_type import instantiate_device_type_tests, PYTORCH_CUDA_MEMCHECK, \ skipCPUIfNoLapack, skipCUDAIfNoMagma, skipCUDAIfRocm, 
onlyCUDA, onlyCPU, \ dtypes, dtypesIfCUDA, deviceCountAtLeast, skipCUDAIf, precisionOverride import torch.backends.quantized @@ -4929,6 +4929,7 @@ tensor([[[1., 1., 1., ..., 1., 1., 1.], self.assertEqual(torch.empty_like(a).shape, a.shape) self.assertEqual(torch.empty_like(a).type(), a.type()) + @unittest.skipIf(PYTORCH_CUDA_MEMCHECK, "is_pinned uses failure to detect pointer property") def test_pin_memory(self): x = torch.randn(3, 5) self.assertFalse(x.is_pinned()) @@ -12673,6 +12674,7 @@ class TestTorchDeviceType(TestCase): self.assertEqual(z, x) @onlyCUDA + @unittest.skipIf(PYTORCH_CUDA_MEMCHECK, "is_pinned uses failure to detect pointer property") def test_pin_memory_from_constructor(self, device): def _get_like(t, **kwargs): return [