From ffe0c1ae4d8de75bc4d6cf5c43123df1e538d8a4 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Sat, 14 Dec 2019 16:01:40 -0800 Subject: [PATCH] Make test_torch.py pass cuda-memcheck (#29243) Summary: Make the following changes: - When there are more than 10k errors, cuda-memcheck only shows 10k errors; in this case we shouldn't raise an Exception - Add PYTORCH_CUDA_MEMCHECK environment variable to allow disabling `pin_memory` tests when running cuda-memcheck. - Add a `--ci` command option; when turned on, this script writes its output to stdout instead of to a file, and exits with an error if cuda-memcheck fails - Add a `--nohang` command option. When turned on, a hang is treated as a pass instead of an error - Do simple filtering on the tests to run: skip a test if `'cpu'` is in the test name but `'cuda'` is not - Add `--split` and `--rank` to allow splitting the work (NVIDIA CI has a limitation of 3 hours, we have to split the work to satisfy this limitation) - The error summary could be `ERROR SUMMARY: 1 error`, or `ERROR SUMMARY: 2 errors`, the tail could be `error` or `errors`, it is not of the same length. The script is fixed to handle this case. 
- Ignore errors from `cufft` Pull Request resolved: https://github.com/pytorch/pytorch/pull/29243 Differential Revision: D18941701 Pulled By: mruberry fbshipit-source-id: 2048428f32b66ef50c67444c03ce4dd9491179d2 --- test/common_device_type.py | 3 ++ test/scripts/cuda_memcheck_common.py | 15 ++++++--- test/scripts/run_cuda_memcheck.py | 49 +++++++++++++++++++++++++--- test/test_torch.py | 4 ++- 4 files changed, 60 insertions(+), 11 deletions(-) diff --git a/test/common_device_type.py b/test/common_device_type.py index b70d1d9e7e7..687aaf89d05 100644 --- a/test/common_device_type.py +++ b/test/common_device_type.py @@ -2,6 +2,7 @@ import inspect import threading from functools import wraps import unittest +import os import torch from common_utils import TestCase, TEST_WITH_ROCM, TEST_MKL, \ skipCUDANonDefaultStreamIf @@ -247,6 +248,8 @@ device_type_test_bases.append(CPUTestBase) if torch.cuda.is_available(): device_type_test_bases.append(CUDATestBase) +PYTORCH_CUDA_MEMCHECK = os.getenv('PYTORCH_CUDA_MEMCHECK', '0') == '1' + # Adds 'instantiated' device-specific test cases to the given scope. 
# The tests in these test cases are derived from the generic tests in diff --git a/test/scripts/cuda_memcheck_common.py b/test/scripts/cuda_memcheck_common.py index 7f7dc825339..aa52ced783b 100644 --- a/test/scripts/cuda_memcheck_common.py +++ b/test/scripts/cuda_memcheck_common.py @@ -9,15 +9,20 @@ class ParseError(Exception): class Report: """A report is a container of errors, and a summary on how many errors are found""" - HEAD = 'ERROR SUMMARY: ' - TAIL = ' errors' - def __init__(self, text, errors): + # text is something like + # ERROR SUMMARY: 1 error + # or + # ERROR SUMMARY: 2 errors self.text = text - self.num_errors = int(text[len(self.HEAD):len(text) - len(self.TAIL)]) + self.num_errors = int(text.strip().split()[2]) self.errors = errors if len(errors) != self.num_errors: - raise ParseError("Number of errors does not match") + if len(errors) == 10000 and self.num_errors > 10000: + # When there are more than 10k errors, cuda-memcheck only display 10k + self.num_errors = 10000 + else: + raise ParseError("Number of errors does not match") class Error: diff --git a/test/scripts/run_cuda_memcheck.py b/test/scripts/run_cuda_memcheck.py index f80fa84350c..2541e4bff48 100755 --- a/test/scripts/run_cuda_memcheck.py +++ b/test/scripts/run_cuda_memcheck.py @@ -18,7 +18,8 @@ import multiprocessing import argparse import subprocess import tqdm -import re +import os +import sys import cuda_memcheck_common as cmc ALL_TESTS = [] @@ -35,6 +36,13 @@ parser.add_argument('--nproc', type=int, default=multiprocessing.cpu_count(), help='Number of processes running tests, default to number of cores in the system') parser.add_argument('--gpus', default='all', help='GPU assignments for each process, it could be "all", or : separated list like "1,2:3,4:5,6"') +parser.add_argument('--ci', action='store_true', + help='Whether this script is executed in CI. When executed inside a CI, this script fails when ' + 'an error is detected. 
Also, it will not show tqdm progress bar, but directly print the error ' 'to stdout instead.') parser.add_argument('--nohang', action='store_true', help='Treat timeout as success') parser.add_argument('--split', type=int, default=1, help='Split the job into pieces') parser.add_argument('--rank', type=int, default=0, help='Which piece this process should pick') args = parser.parse_args() # Filters that ignores cublas/cudnn errors @@ -48,10 +56,13 @@ def is_ignored_only(output): return False count_ignored_errors = 0 for e in report.errors: - if 'libcublas' in ''.join(e.stack) or 'libcudnn' in ''.join(e.stack): + if 'libcublas' in ''.join(e.stack) or 'libcudnn' in ''.join(e.stack) or 'libcufft' in ''.join(e.stack): count_ignored_errors += 1 return count_ignored_errors == report.num_errors +# Set environment PYTORCH_CUDA_MEMCHECK=1 to allow skipping some tests +os.environ['PYTORCH_CUDA_MEMCHECK'] = '1' + # Discover tests: # To get a list of tests, run: # pytest --setup-only test/test_torch.py @@ -66,6 +77,21 @@ for line in lines: line = line.replace('::', '.') ALL_TESTS.append(line) +# Do a simple filtering: +# if 'cpu' or 'CPU' is in the name and 'cuda' or 'CUDA' is not in the name, then skip it +def is_cpu_only(name): + name = name.lower() + return ('cpu' in name) and not ('cuda' in name) + +ALL_TESTS = [x for x in ALL_TESTS if not is_cpu_only(x)] + +# Split all tests into chunks, and only run the selected chunk +ALL_TESTS.sort() +chunk_size = (len(ALL_TESTS) + args.split - 1) // args.split +start = chunk_size * args.rank +end = chunk_size * (args.rank + 1) +ALL_TESTS = ALL_TESTS[start:end] + # Run tests: # Since running cuda-memcheck on PyTorch unit tests is very slow, these tests must be run in parallel. # This is done by using the coroutine feature in new Python versions. 
A number of coroutines are created; @@ -74,8 +100,17 @@ for line in lines: # These subprocesses are balanced across different GPUs on the system by assigning one devices per process, # or as specified by the user progress = 0 -logfile = open('result.log', 'w') -progressbar = tqdm.tqdm(total=len(ALL_TESTS)) +if not args.ci: + logfile = open('result.log', 'w') + progressbar = tqdm.tqdm(total=len(ALL_TESTS)) +else: + logfile = sys.stdout + + # create a fake progress bar that does not display anything + class ProgressbarStub: + def update(*args): + return + progressbar = ProgressbarStub() async def run1(coroutine_id): global progress @@ -97,6 +132,8 @@ async def run1(coroutine_id): except asyncio.TimeoutError: print('Timeout:', test, file=logfile) proc.kill() + if args.ci and not args.nohang: + sys.exit("Hang detected on cuda-memcheck") else: if proc.returncode == 0: print('Success:', test, file=logfile) @@ -108,13 +145,15 @@ async def run1(coroutine_id): print('Fail:', test, file=logfile) print(stdout, file=logfile) print(stderr, file=logfile) + if args.ci: + sys.exit("Failure detected on cuda-memcheck") else: print('Ignored:', test, file=logfile) del proc progressbar.update(1) async def main(): - tasks = [asyncio.create_task(run1(i)) for i in range(args.nproc)] + tasks = [asyncio.ensure_future(run1(i)) for i in range(args.nproc)] for t in tasks: await t diff --git a/test/test_torch.py b/test/test_torch.py index 7481ecced2e..35ca02d2f5e 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -34,7 +34,7 @@ from common_utils import TestCase, iter_indices, TEST_NUMPY, TEST_SCIPY, TEST_MK IS_SANDCASTLE, load_tests, brute_pdist, brute_cdist, slowTest, \ skipCUDANonDefaultStreamIf, skipCUDAMemoryLeakCheckIf from multiprocessing.reduction import ForkingPickler -from common_device_type import instantiate_device_type_tests, \ +from common_device_type import instantiate_device_type_tests, PYTORCH_CUDA_MEMCHECK, \ skipCPUIfNoLapack, skipCUDAIfNoMagma, skipCUDAIfRocm, 
onlyCUDA, onlyCPU, \ dtypes, dtypesIfCUDA, deviceCountAtLeast, skipCUDAIf, precisionOverride import torch.backends.quantized @@ -4929,6 +4929,7 @@ tensor([[[1., 1., 1., ..., 1., 1., 1.], self.assertEqual(torch.empty_like(a).shape, a.shape) self.assertEqual(torch.empty_like(a).type(), a.type()) + @unittest.skipIf(PYTORCH_CUDA_MEMCHECK, "is_pinned uses failure to detect pointer property") def test_pin_memory(self): x = torch.randn(3, 5) self.assertFalse(x.is_pinned()) @@ -12673,6 +12674,7 @@ class TestTorchDeviceType(TestCase): self.assertEqual(z, x) @onlyCUDA + @unittest.skipIf(PYTORCH_CUDA_MEMCHECK, "is_pinned uses failure to detect pointer property") def test_pin_memory_from_constructor(self, device): def _get_like(t, **kwargs): return [