initial commit to enable fast_nvcc (#49773)
Summary: draft to enable fast_nvcc.
* cleaned up some non-standard usages
* added fallback to wrap_nvcc

Pull Request resolved: https://github.com/pytorch/pytorch/pull/49773

Test Plan:

Configuration to enable fast nvcc:
- install and enable `ccache`, but delete the `.ccache/` folder before each build.
- `TORCH_CUDA_ARCH_LIST=6.0;6.1;6.2;7.0;7.5`
- toggle the `USE_FAST_NVCC=ON/OFF` cmake config and run `cmake --build` to verify the build time.

Initial statistics for a full compilation:

* `cmake --build . -- -j $(nproc)`:
  - fast NVCC:
    ```
    real 48m55.706s
    user 1559m14.218s
    sys  318m41.138s
    ```
  - normal NVCC:
    ```
    real 43m38.723s
    user 1470m28.131s
    sys  90m46.879s
    ```
* `cmake --build . -- -j $(nproc/4)`:
  - fast NVCC:
    ```
    real 53m44.173s
    user 1130m18.323s
    sys  71m32.385s
    ```
  - normal NVCC:
    ```
    real 81m53.768s
    user 858m45.402s
    sys  61m15.539s
    ```
* Conclusion: fast NVCC doesn't provide much gain when the compiler is set to full CPU utilization; in fact it is **even worse**, because of thread switching.

Initial statistics for a partial recompile (editing a .cu file), with `cmake --build . -- -j $(nproc)`:
- fast NVCC:
  ```
  [2021-01-13 18:10:24] [ 86%] Building NVCC (Device) object caffe2/CMakeFiles/torch_cuda.dir/__/aten/src/ATen/native/cuda/torch_cuda_generated_BinaryMiscOpsKernels.cu.o
  [2021-01-13 18:11:08] [ 86%] Linking CXX shared library ../lib/libtorch_cuda.so
  ```
- normal NVCC:
  ```
  [2021-01-13 17:35:40] [ 86%] Building NVCC (Device) object caffe2/CMakeFiles/torch_cuda.dir/__/aten/src/ATen/native/cuda/torch_cuda_generated_BinaryMiscOpsKernels.cu.o
  [2021-01-13 17:38:08] [ 86%] Linking CXX shared library ../lib/libtorch_cuda.so
  ```
* Conclusion: effective compilation time for a single .cu file modification is reduced from 2min30sec to only 40sec when compiling for multiple architectures. This is a **4X** speedup from fast NVCC, approaching the theoretical limit of 5X when compiling 5 gencode architectures at the same time.

Follow-up PRs:
- a better fallback mechanism that detects up front whether a build is supported by fast_nvcc, instead of dry-running and then failing over to the fallback.
- performance measurement instrumentation to compare the total compile time against the critical-path time of the parallel tasks.
- figure out why `-j $(nproc)` incurs significant sys overhead (`sys 318m41.138s` vs `sys 90m46.879s`) over normal nvcc; the guess is context switching, but this is not confirmed.

Reviewed By: malfet

Differential Revision: D25692758

Pulled By: walterddr

fbshipit-source-id: c244d07b9b71f146e972b6b3682ca792b38c4457
This commit is contained in:
parent f7b2b22b64
commit ebd142e94b

5 changed files with 43 additions and 1 deletion
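Before the per-file diffs, a note on the mechanism: the new tools/fast_nvcc/fast_nvcc.py asks nvcc for its sub-command plan via `--dryrun` and then runs the independent per-architecture steps concurrently. The following is a minimal illustrative sketch of that idea, not the actual script: the helper names (`dryrun_commands`, `run_parallel`) are invented for this example, the dry-run parsing is simplified, and the real tool derives a file-level dependency graph rather than running everything at once.

```python
# Illustrative sketch only -- not tools/fast_nvcc/fast_nvcc.py.
import subprocess
from concurrent.futures import ThreadPoolExecutor


def dryrun_commands(nvcc, args):
    """Ask nvcc for its plan; --dryrun prints each step to stderr as '#$ ...'."""
    out = subprocess.run(
        [nvcc, "--dryrun"] + args,
        capture_output=True, text=True, check=True,
    ).stderr
    steps = [line[3:] for line in out.splitlines() if line.startswith("#$ ")]
    # Drop environment-setup lines such as '#$ PATH=...'; keep real commands.
    return [s for s in steps if s.split() and "=" not in s.split()[0]]


def run_parallel(commands):
    # Naive: run every step at once via the shell (dry-run lines may contain
    # redirections). The real tool builds a dependency graph of the steps and
    # only parallelizes the independent per-architecture ones.
    with ThreadPoolExecutor() as pool:
        codes = list(pool.map(lambda c: subprocess.call(c, shell=True), commands))
    return max(codes, default=0)
```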
.gitignore (vendored): +2

```diff
@@ -52,6 +52,8 @@ test/cpp_extensions/install/
 test/test-reports/
 third_party/build/
 tools/shared/_utils_internal.py
+tools/fast_nvcc/wrap_nvcc.sh
+tools/fast_nvcc/tmp/
 torch.egg-info/
 torch/_C/__init__.pyi
 torch/_C/_nn.pyi
```
CMakeLists.txt: +1

```diff
@@ -161,6 +161,7 @@ option(COLORIZE_OUTPUT "Colorize output during compilation" ON)
 option(USE_ASAN "Use Address Sanitizer" OFF)
 option(USE_TSAN "Use Thread Sanitizer" OFF)
 option(USE_CUDA "Use CUDA" ON)
+option(USE_FAST_NVCC "Use parallel NVCC build" OFF)
 option(USE_ROCM "Use ROCm" ON)
 option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF)
 cmake_dependent_option(
```
FindCUDA CMake module: +12, -1

```diff
@@ -767,6 +767,18 @@ else()
   # Search default search paths, after we search our own set of paths.
   cuda_find_host_program(CUDA_NVCC_EXECUTABLE nvcc)
 endif()
+
+# FAST_NVCC
+if(USE_FAST_NVCC AND CUDA_NVCC_EXECUTABLE AND NOT CUDA_NVCC_EXECUTABLE_ORIGIN)
+  set(CUDA_NVCC_EXECUTABLE_ORIGIN "${CUDA_NVCC_EXECUTABLE}")
+  set(FAST_NVCC_EXECUTABLE "${PROJECT_SOURCE_DIR}/tools/fast_nvcc/fast_nvcc.py")
+  configure_file(${PROJECT_SOURCE_DIR}/tools/fast_nvcc/wrap_nvcc.sh.in "${PROJECT_SOURCE_DIR}/tools/fast_nvcc/tmp/wrap_nvcc.sh")
+  file(COPY "${PROJECT_SOURCE_DIR}/tools/fast_nvcc/tmp/wrap_nvcc.sh"
+    DESTINATION "${PROJECT_SOURCE_DIR}/tools/fast_nvcc/"
+    FILE_PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE
+  )
+  set(CUDA_NVCC_EXECUTABLE "${PROJECT_SOURCE_DIR}/tools/fast_nvcc/wrap_nvcc.sh")
+endif()
 mark_as_advanced(CUDA_NVCC_EXECUTABLE)
 
 if(CUDA_NVCC_EXECUTABLE AND NOT CUDA_VERSION)
@@ -789,7 +801,6 @@ else()
   string(REGEX REPLACE "([0-9]+)\\.([0-9]+).*" "\\2" CUDA_VERSION_MINOR "${CUDA_VERSION}")
 endif()
 
-
 # Always set this convenience variable
 set(CUDA_VERSION_STRING "${CUDA_VERSION}")
 
```
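An aside on the mechanism above: `configure_file()` generates wrap_nvcc.sh from the wrap_nvcc.sh.in template (shown at the end of this diff) by replacing `@VAR@` placeholders with the current CMake variable values. A rough Python rendering of that substitution follows; the two paths are invented examples, not values from any real build.

```python
# Approximately what configure_file() does to the wrap_nvcc.sh.in template.
template = '@FAST_NVCC_EXECUTABLE@ --nvcc @CUDA_NVCC_EXECUTABLE_ORIGIN@ -- "$@"\n'
values = {
    "FAST_NVCC_EXECUTABLE": "/src/pytorch/tools/fast_nvcc/fast_nvcc.py",  # invented path
    "CUDA_NVCC_EXECUTABLE_ORIGIN": "/usr/local/cuda/bin/nvcc",            # invented path
}
script = template
for name, value in values.items():
    script = script.replace(f"@{name}@", value)
print(script)
# /src/pytorch/tools/fast_nvcc/fast_nvcc.py --nvcc /usr/local/cuda/bin/nvcc -- "$@"
```

From then on `CUDA_NVCC_EXECUTABLE` points at the generated wrapper, so every nvcc invocation from the build goes through fast_nvcc.py, with the original compiler path passed via `--nvcc`.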
tools/fast_nvcc/fast_nvcc.py: +23

```diff
@@ -157,6 +157,19 @@ def warn_if_tmpdir_set(env):
         fast_nvcc_warn(url_vars)
 
 
+def contains_non_executable(commands):
+    for command in commands:
+        # This is to deal with special command dry-run result from NVCC such as:
+        # ```
+        # #$ "/lib64/ccache"/c++ -std=c++11 -E -x c++ -D__CUDACC__ -D__NVCC__ -fPIC -fvisibility=hidden -O3 \
+        #    -I ... -m64 "reduce_scatter.cu" > "/tmp/tmpxft_0037fae3_00000000-5_reduce_scatter.cpp4.ii
+        # #$ -- Filter Dependencies -- > ... pytorch/build/nccl/obj/collectives/device/reduce_scatter.dep.tmp
+        # ```
+        if command.startswith("--"):
+            return True
+    return False
+
+
 def module_id_contents(command):
     """
     Guess the contents of the .module_id file contained within command.
```
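A quick usage sketch (not part of the PR) of the new check: any dry-run line that starts with `--` is not an executable command, so its presence forces the fallback path.

```python
# Hypothetical inputs, shortened from the dry-run sample quoted in the comment
# above; contains_non_executable is the function added in this hunk.
commands = [
    '"/lib64/ccache"/c++ -std=c++11 -E -x c++ "reduce_scatter.cu"',
    "-- Filter Dependencies -- > reduce_scatter.dep.tmp",
]
assert contains_non_executable(commands)          # the "--" line triggers fallback
assert not contains_non_executable(commands[:1])  # ordinary commands pass
```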
```diff
@@ -275,6 +288,8 @@ def is_weakly_connected(graph):
     """
     Return true iff graph is weakly connected.
     """
+    if not graph:
+        return True
     neighbors = [set() for _ in graph]
     for node, predecessors in enumerate(graph):
         for pred in predecessors:
```
```diff
@@ -408,6 +423,10 @@ def exit_code(results):
     return 0
 
 
+def wrap_nvcc(args, config=default_config):
+    return subprocess.call([config.nvcc] + args)
+
+
 def fast_nvcc(args, *, config=default_config):
     """
     Emulate the result of calling the given nvcc binary with args.
```
```diff
@@ -422,6 +441,10 @@ def fast_nvcc(args, *, config=default_config):
     commands = dryrun_data['commands']
     if not config.faithful:
         commands = make_rm_force(unique_module_id_files(commands))
+
+    if contains_non_executable(commands):
+        return wrap_nvcc(args, config)
+
     command_parts = list(map(shlex.split, commands))
     if config.verbose:
         print_verbose_output(
```
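Taken together, the wiring added in the two hunks above amounts to the control flow sketched below. This is a paraphrase, reusing the invented `dryrun_commands`/`run_parallel` helpers from the sketch near the top; `contains_non_executable` and `wrap_nvcc` are the real functions from this diff.

```python
import subprocess


def fast_nvcc_flow(args, config):
    # Plan the build without executing anything.
    commands = dryrun_commands(config.nvcc, args)
    if contains_non_executable(commands):
        # The plan contains lines that are not runnable commands: give up on
        # the parallel path and let the real nvcc do everything serially.
        return subprocess.call([config.nvcc] + args)  # i.e. wrap_nvcc(args, config)
    # Otherwise run the per-architecture steps concurrently.
    return run_parallel(commands)
```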
tools/fast_nvcc/wrap_nvcc.sh.in (new file): +5

```diff
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+# This script was created because cmake is not happy about dangling -- when
+# defining CUDA_NVCC_EXECUTABLE, thus we wrapped it in a shell script.
+@FAST_NVCC_EXECUTABLE@ --nvcc @CUDA_NVCC_EXECUTABLE_ORIGIN@ -- "$@"
```