initial commit to enable fast_nvcc (#49773)

Summary:
draft enable fast_nvcc.
* cleaned up some non-standard usages
* added fall-back to wrap_nvcc

Pull Request resolved: https://github.com/pytorch/pytorch/pull/49773

Test Plan:
Configuration to enable fast nvcc:
  - install and enable `ccache` but delete `.ccache/` folder before each build.
  - `TORCH_CUDA_ARCH_LIST=6.0;6.1;6.2;7.0;7.5`
  - Toggling `USE_FAST_NVCC=ON/OFF` cmake config and run `cmake --build` to verify the build time.

Initial statistic for a full compilation:
* `cmake --build . -- -j $(nproc)`:
  - fast NVCC
```
        real    48m55.706s
        user    1559m14.218s
        sys     318m41.138s
```
  - normal NVCC:
```
        real    43m38.723s
        user    1470m28.131s
        sys     90m46.879s
```
* `cmake --build . -- -j $(nproc/4)`:
  - fast NVCC:
```
        real    53m44.173s
        user    1130m18.323s
        sys     71m32.385s
```
  - normal  NVCC:
```
        real    81m53.768s
        user    858m45.402s
        sys     61m15.539s
```
* Conclusion: fast NVCC doesn't provide too much gain when compiler is set to use full CPU utilization, in fact it is **even worse** because of the thread switcing.

initial statistic for partial recompile (edit .cu files)

* `cmake --build . -- -j $(nproc)`
  - fast NVCC:
```
[2021-01-13 18:10:24] [ 86%] Building NVCC (Device) object caffe2/CMakeFiles/torch_cuda.dir/__/aten/src/ATen/native/cuda/torch_cuda_generated_BinaryMiscOpsKernels.cu.o
[2021-01-13 18:11:08] [ 86%] Linking CXX shared library ../lib/libtorch_cuda.so
```
  - normal NVCC:
```
[2021-01-13 17:35:40] [ 86%] Building NVCC (Device) object caffe2/CMakeFiles/torch_cuda.dir/__/aten/src/ATen/native/cuda/torch_cuda_generated_BinaryMiscOpsKernels.cu.o
[2021-01-13 17:38:08] [ 86%] Linking CXX shared library ../lib/libtorch_cuda.so
```
* Conclusion: Effective compilation time for single CU file modification reduced from from 2min30sec to only 40sec when compiling multiple architecture. This shows **4X** gain in speed up using fast NVCC -- reaching the theoretical limit of 5X when compiling 5 gencode architecture at the same time.

Follow up PRs:
- should have better fallback mechanism to detect whether a build is supported by fast_nvcc or not instead of dryruning then fail with fallback.
- performance measurement instrumentation to measure what's the total compile time vs the parallel tasks critical path time.
- figure out why `-j $(nproc)` gives significant sys overhead (`sys 318m41.138s` vs `sys 90m46.879s`) over normal nvcc, guess this is context switching, but not exactly sure

Reviewed By: malfet

Differential Revision: D25692758

Pulled By: walterddr

fbshipit-source-id: c244d07b9b71f146e972b6b3682ca792b38c4457
This commit is contained in:
Rong Rong (AI Infra) 2021-01-19 14:48:44 -08:00 committed by Facebook GitHub Bot
parent f7b2b22b64
commit ebd142e94b
5 changed files with 43 additions and 1 deletions

2
.gitignore vendored
View file

@ -52,6 +52,8 @@ test/cpp_extensions/install/
test/test-reports/
third_party/build/
tools/shared/_utils_internal.py
tools/fast_nvcc/wrap_nvcc.sh
tools/fast_nvcc/tmp/
torch.egg-info/
torch/_C/__init__.pyi
torch/_C/_nn.pyi

View file

@ -161,6 +161,7 @@ option(COLORIZE_OUTPUT "Colorize output during compilation" ON)
option(USE_ASAN "Use Address Sanitizer" OFF)
option(USE_TSAN "Use Thread Sanitizer" OFF)
option(USE_CUDA "Use CUDA" ON)
option(USE_FAST_NVCC "Use parallel NVCC build" OFF)
option(USE_ROCM "Use ROCm" ON)
option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF)
cmake_dependent_option(

View file

@ -767,6 +767,18 @@ else()
# Search default search paths, after we search our own set of paths.
cuda_find_host_program(CUDA_NVCC_EXECUTABLE nvcc)
endif()
# FAST_NVCC
if(USE_FAST_NVCC AND CUDA_NVCC_EXECUTABLE AND NOT CUDA_NVCC_EXECUTABLE_ORIGIN)
set(CUDA_NVCC_EXECUTABLE_ORIGIN "${CUDA_NVCC_EXECUTABLE}")
set(FAST_NVCC_EXECUTABLE "${PROJECT_SOURCE_DIR}/tools/fast_nvcc/fast_nvcc.py")
configure_file(${PROJECT_SOURCE_DIR}/tools/fast_nvcc/wrap_nvcc.sh.in "${PROJECT_SOURCE_DIR}/tools/fast_nvcc/tmp/wrap_nvcc.sh")
file(COPY "${PROJECT_SOURCE_DIR}/tools/fast_nvcc/tmp/wrap_nvcc.sh"
DESTINATION "${PROJECT_SOURCE_DIR}/tools/fast_nvcc/"
FILE_PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE
)
set(CUDA_NVCC_EXECUTABLE "${PROJECT_SOURCE_DIR}/tools/fast_nvcc/wrap_nvcc.sh")
endif()
mark_as_advanced(CUDA_NVCC_EXECUTABLE)
if(CUDA_NVCC_EXECUTABLE AND NOT CUDA_VERSION)
@ -789,7 +801,6 @@ else()
string(REGEX REPLACE "([0-9]+)\\.([0-9]+).*" "\\2" CUDA_VERSION_MINOR "${CUDA_VERSION}")
endif()
# Always set this convenience variable
set(CUDA_VERSION_STRING "${CUDA_VERSION}")

View file

@ -157,6 +157,19 @@ def warn_if_tmpdir_set(env):
fast_nvcc_warn(url_vars)
def contains_non_executable(commands):
for command in commands:
# This is to deal with special command dry-run result from NVCC such as:
# ```
# #$ "/lib64/ccache"/c++ -std=c++11 -E -x c++ -D__CUDACC__ -D__NVCC__ -fPIC -fvisibility=hidden -O3 \
# -I ... -m64 "reduce_scatter.cu" > "/tmp/tmpxft_0037fae3_00000000-5_reduce_scatter.cpp4.ii
# #$ -- Filter Dependencies -- > ... pytorch/build/nccl/obj/collectives/device/reduce_scatter.dep.tmp
# ```
if command.startswith("--"):
return True
return False
def module_id_contents(command):
"""
Guess the contents of the .module_id file contained within command.
@ -275,6 +288,8 @@ def is_weakly_connected(graph):
"""
Return true iff graph is weakly connected.
"""
if not graph:
return True
neighbors = [set() for _ in graph]
for node, predecessors in enumerate(graph):
for pred in predecessors:
@ -408,6 +423,10 @@ def exit_code(results):
return 0
def wrap_nvcc(args, config=default_config):
return subprocess.call([config.nvcc] + args)
def fast_nvcc(args, *, config=default_config):
"""
Emulate the result of calling the given nvcc binary with args.
@ -422,6 +441,10 @@ def fast_nvcc(args, *, config=default_config):
commands = dryrun_data['commands']
if not config.faithful:
commands = make_rm_force(unique_module_id_files(commands))
if contains_non_executable(commands):
return wrap_nvcc(args, config)
command_parts = list(map(shlex.split, commands))
if config.verbose:
print_verbose_output(

View file

@ -0,0 +1,5 @@
#!/bin/bash
# This script was created because cmake is not happy about dangling -- when
# defining CUDA_NVCC_EXECUTABLE, thus we wrapped it in a shell script.
@FAST_NVCC_EXECUTABLE@ --nvcc @CUDA_NVCC_EXECUTABLE_ORIGIN@ -- "$@"