initial commit to enable fast_nvcc (#49773)

Summary: draft enable fast_nvcc. * cleaned up some non-standard usages * added fall-back to wrap_nvcc Pull Request resolved: https://github.com/pytorch/pytorch/pull/49773 Test Plan: Configuration to enable fast nvcc: - install and enable `ccache` but delete `.ccache/` folder before each build. - `TORCH_CUDA_ARCH_LIST=6.0;6.1;6.2;7.0;7.5` - Toggling `USE_FAST_NVCC=ON/OFF` cmake config and run `cmake --build` to verify the build time. Initial statistics for a full compilation: * `cmake --build . -- -j $(nproc)`: - fast NVCC: ``` real 48m55.706s user 1559m14.218s sys 318m41.138s ``` - normal NVCC: ``` real 43m38.723s user 1470m28.131s sys 90m46.879s ``` * `cmake --build . -- -j $(nproc/4)`: - fast NVCC: ``` real 53m44.173s user 1130m18.323s sys 71m32.385s ``` - normal NVCC: ``` real 81m53.768s user 858m45.402s sys 61m15.539s ``` * Conclusion: fast NVCC doesn't provide much gain when the compiler is set to use full CPU utilization; in fact it is **even worse** because of the thread switching. Initial statistics for partial recompile (edit .cu files) * `cmake --build . -- -j $(nproc)` - fast NVCC: ``` [2021-01-13 18:10:24] [ 86%] Building NVCC (Device) object caffe2/CMakeFiles/torch_cuda.dir/__/aten/src/ATen/native/cuda/torch_cuda_generated_BinaryMiscOpsKernels.cu.o [2021-01-13 18:11:08] [ 86%] Linking CXX shared library ../lib/libtorch_cuda.so ``` - normal NVCC: ``` [2021-01-13 17:35:40] [ 86%] Building NVCC (Device) object caffe2/CMakeFiles/torch_cuda.dir/__/aten/src/ATen/native/cuda/torch_cuda_generated_BinaryMiscOpsKernels.cu.o [2021-01-13 17:38:08] [ 86%] Linking CXX shared library ../lib/libtorch_cuda.so ``` * Conclusion: Effective compilation time for a single CU file modification is reduced from 2min30sec to only 40sec when compiling multiple architectures. This shows a **4X** gain in speed using fast NVCC -- approaching the theoretical limit of 5X when compiling 5 gencode architectures at the same time. 
Follow up PRs: - should have a better fallback mechanism that detects whether a build is supported by fast_nvcc up front, instead of dry-running and then falling back on failure. - performance measurement instrumentation to measure the total compile time vs. the critical-path time of the parallel tasks. - figure out why `-j $(nproc)` gives significant sys overhead (`sys 318m41.138s` vs `sys 90m46.879s`) over normal nvcc; the current guess is context switching, but this is not confirmed. Reviewed By: malfet Differential Revision: D25692758 Pulled By: walterddr fbshipit-source-id: c244d07b9b71f146e972b6b3682ca792b38c4457
2026-05-14 20:57:59 +00:00 · 2021-01-19 14:48:44 -08:00 · 2021-01-19 14:48:44 -08:00 · ebd142e94b
commit ebd142e94b
parent f7b2b22b64
5 changed files with 43 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@ -52,6 +52,8 @@ test/cpp_extensions/install/
 test/test-reports/
 third_party/build/
 tools/shared/_utils_internal.py
+tools/fast_nvcc/wrap_nvcc.sh
+tools/fast_nvcc/tmp/
 torch.egg-info/
 torch/_C/__init__.pyi
 torch/_C/_nn.pyi
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -161,6 +161,7 @@ option(COLORIZE_OUTPUT "Colorize output during compilation" ON)
 option(USE_ASAN "Use Address Sanitizer" OFF)
 option(USE_TSAN "Use Thread Sanitizer" OFF)
 option(USE_CUDA "Use CUDA" ON)
+option(USE_FAST_NVCC "Use parallel NVCC build" OFF)
 option(USE_ROCM "Use ROCm" ON)
 option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF)
 cmake_dependent_option(
--- a/cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake
+++ b/cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake
@ -767,6 +767,18 @@ else()
  # Search default search paths, after we search our own set of paths.
  cuda_find_host_program(CUDA_NVCC_EXECUTABLE nvcc)
 endif()
+
+# FAST_NVCC
+if(USE_FAST_NVCC AND CUDA_NVCC_EXECUTABLE AND NOT CUDA_NVCC_EXECUTABLE_ORIGIN)
+  set(CUDA_NVCC_EXECUTABLE_ORIGIN "${CUDA_NVCC_EXECUTABLE}")
+  set(FAST_NVCC_EXECUTABLE "${PROJECT_SOURCE_DIR}/tools/fast_nvcc/fast_nvcc.py")
+  configure_file(${PROJECT_SOURCE_DIR}/tools/fast_nvcc/wrap_nvcc.sh.in "${PROJECT_SOURCE_DIR}/tools/fast_nvcc/tmp/wrap_nvcc.sh")
+  file(COPY "${PROJECT_SOURCE_DIR}/tools/fast_nvcc/tmp/wrap_nvcc.sh"
+    DESTINATION "${PROJECT_SOURCE_DIR}/tools/fast_nvcc/"
+    FILE_PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE
+  )
+  set(CUDA_NVCC_EXECUTABLE "${PROJECT_SOURCE_DIR}/tools/fast_nvcc/wrap_nvcc.sh")
+endif()
 mark_as_advanced(CUDA_NVCC_EXECUTABLE)

 if(CUDA_NVCC_EXECUTABLE AND NOT CUDA_VERSION)
@ -789,7 +801,6 @@ else()
  string(REGEX REPLACE "([0-9]+)\\.([0-9]+).*" "\\2" CUDA_VERSION_MINOR "${CUDA_VERSION}")
 endif()

-
 # Always set this convenience variable
 set(CUDA_VERSION_STRING "${CUDA_VERSION}")

--- a/tools/fast_nvcc/fast_nvcc.py
+++ b/tools/fast_nvcc/fast_nvcc.py
@ -157,6 +157,19 @@ def warn_if_tmpdir_set(env):
        fast_nvcc_warn(url_vars)


+def contains_non_executable(commands):
+    for command in commands:
+        # This is to deal with special command dry-run result from NVCC such as:
+        # ```
+        # #$ "/lib64/ccache"/c++ -std=c++11 -E -x c++ -D__CUDACC__ -D__NVCC__  -fPIC -fvisibility=hidden -O3 \
+        #   -I ... -m64 "reduce_scatter.cu" > "/tmp/tmpxft_0037fae3_00000000-5_reduce_scatter.cpp4.ii
+        # #$ -- Filter Dependencies -- > ... pytorch/build/nccl/obj/collectives/device/reduce_scatter.dep.tmp
+        # ```
+        if command.startswith("--"):
+            return True
+    return False
+
+
 def module_id_contents(command):
    """
    Guess the contents of the .module_id file contained within command.
@ -275,6 +288,8 @@ def is_weakly_connected(graph):
    """
    Return true iff graph is weakly connected.
    """
+    if not graph:
+        return True
    neighbors = [set() for _ in graph]
    for node, predecessors in enumerate(graph):
        for pred in predecessors:
@ -408,6 +423,10 @@ def exit_code(results):
    return 0


+def wrap_nvcc(args, config=default_config):
+    return subprocess.call([config.nvcc] + args)
+
+
 def fast_nvcc(args, *, config=default_config):
    """
    Emulate the result of calling the given nvcc binary with args.
@ -422,6 +441,10 @@ def fast_nvcc(args, *, config=default_config):
    commands = dryrun_data['commands']
    if not config.faithful:
        commands = make_rm_force(unique_module_id_files(commands))
+
+    if contains_non_executable(commands):
+        return wrap_nvcc(args, config)
+
    command_parts = list(map(shlex.split, commands))
    if config.verbose:
        print_verbose_output(
--- a/tools/fast_nvcc/wrap_nvcc.sh.in
+++ b/tools/fast_nvcc/wrap_nvcc.sh.in
@ -0,0 +1,5 @@
+#!/bin/bash
+
+# This script was created because cmake is not happy about dangling -- when
+# defining CUDA_NVCC_EXECUTABLE, thus we wrapped it in a shell script.
+@FAST_NVCC_EXECUTABLE@ --nvcc @CUDA_NVCC_EXECUTABLE_ORIGIN@ -- "$@"