From 62c1e33fc90183fb25199ef5e4f80ac867603fa2 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 14 Mar 2023 03:17:27 +0000 Subject: [PATCH] [BE] Remove fast_nvcc tool (#96665) As of CUDA-11.4+ this functionality can be mimicked by passing [`--threads`](https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#threads-number-t) option to CUDA compiler Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/96665 Approved by: https://github.com/atalman, https://github.com/PaliC --- CMakeLists.txt | 1 - .../Modules_CUDA_fix/upstream/FindCUDA.cmake | 17 - tools/fast_nvcc/fast_nvcc.py | 566 ------------------ tools/fast_nvcc/wrap_nvcc.bat.in | 1 - tools/fast_nvcc/wrap_nvcc.sh.in | 5 - 5 files changed, 590 deletions(-) delete mode 100755 tools/fast_nvcc/fast_nvcc.py delete mode 100644 tools/fast_nvcc/wrap_nvcc.bat.in delete mode 100644 tools/fast_nvcc/wrap_nvcc.sh.in diff --git a/CMakeLists.txt b/CMakeLists.txt index b5c09ca05ba..bf0acfa432b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -197,7 +197,6 @@ cmake_dependent_option( cmake_dependent_option( BUILD_NVFUSER "Build NVFUSER" ON "USE_CUDA OR USE_ROCM" OFF) -option(USE_FAST_NVCC "Use parallel NVCC build" OFF) cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX" OFF) option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF) cmake_dependent_option( diff --git a/cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake b/cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake index 839c43ea048..420ee63cfad 100644 --- a/cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake +++ b/cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake @@ -765,23 +765,6 @@ else() cuda_find_host_program(CUDA_NVCC_EXECUTABLE nvcc) endif() -# FAST_NVCC -if(USE_FAST_NVCC AND CUDA_NVCC_EXECUTABLE AND NOT CUDA_NVCC_EXECUTABLE_ORIGIN) - set(CUDA_NVCC_EXECUTABLE_ORIGIN "${CUDA_NVCC_EXECUTABLE}") - set(EXTENSION "sh") - if (MSVC) - set(EXTENSION "bat") - endif() - set(FAST_NVCC_EXECUTABLE "${PROJECT_SOURCE_DIR}/tools/fast_nvcc/fast_nvcc.py") - configure_file(${PROJECT_SOURCE_DIR}/tools/fast_nvcc/wrap_nvcc.${EXTENSION}.in "${PROJECT_SOURCE_DIR}/tools/fast_nvcc/tmp/wrap_nvcc.${EXTENSION}") - file(COPY "${PROJECT_SOURCE_DIR}/tools/fast_nvcc/tmp/wrap_nvcc.${EXTENSION}" - DESTINATION "${PROJECT_SOURCE_DIR}/tools/fast_nvcc/" - FILE_PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE - ) - set(CUDA_NVCC_EXECUTABLE "${PROJECT_SOURCE_DIR}/tools/fast_nvcc/wrap_nvcc.${EXTENSION}") -endif() -mark_as_advanced(CUDA_NVCC_EXECUTABLE) - if(CUDA_NVCC_EXECUTABLE AND NOT CUDA_VERSION) # Compute the version. execute_process(COMMAND ${CUDA_NVCC_EXECUTABLE} "--version" diff --git a/tools/fast_nvcc/fast_nvcc.py b/tools/fast_nvcc/fast_nvcc.py deleted file mode 100755 index 285a2032dfb..00000000000 --- a/tools/fast_nvcc/fast_nvcc.py +++ /dev/null @@ -1,566 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import asyncio -import collections -import csv -import hashlib -import itertools -import os -import pathlib -import re -import shlex -import shutil -import subprocess -import sys -import time -from typing import Awaitable, cast, DefaultDict, Dict, List, Match, Optional, Set - -from typing_extensions import TypedDict # Python 3.11+ - -help_msg = """fast_nvcc [OPTION]... -- [NVCC_ARG]... - -Run the commands given by nvcc --dryrun, in parallel. - -All flags for this script itself (see the "optional arguments" section -of --help) must be passed before the first "--". Everything after that -first "--" is passed directly to nvcc, with the --dryrun argument added. - -This script only works with the "normal" execution path of nvcc, so for -instance passing --help (after "--") doesn't work since the --help -execution path doesn't compile anything, so adding --dryrun there gives -nothing in stderr. -""" -parser = argparse.ArgumentParser(help_msg) -parser.add_argument( - "--faithful", - action="store_true", - help="don't modify the commands given by nvcc (slower)", -) -parser.add_argument( - "--graph", - metavar="FILE.gv", - help="write Graphviz DOT file with execution graph", -) -parser.add_argument( - "--nvcc", - metavar="PATH", - default="nvcc", - help='path to nvcc (default is just "nvcc")', -) -parser.add_argument( - "--save", - metavar="DIR", - help="copy intermediate files from each command into DIR", -) -parser.add_argument( - "--sequential", - action="store_true", - help="sequence commands instead of using the graph (slower)", -) -parser.add_argument( - "--table", - metavar="FILE.csv", - help="write CSV with times and intermediate file sizes", -) -parser.add_argument( - "--verbose", - metavar="FILE.txt", - help="like nvcc --verbose, but expanded and into a file", -) -default_config = parser.parse_args([]) - - -# docs about temporary directories used by NVCC -url_base = "https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html" -url_vars = f"{url_base}#keeping-intermediate-phase-files" - - -# regex for temporary file names -re_tmp = r"(? None: - """ - Warn the user about something regarding fast_nvcc. - """ - print(f"warning (fast_nvcc): {warning}", file=sys.stderr) - - -def warn_if_windows() -> None: - """ - Warn the user that using fast_nvcc on Windows might not work. - """ - # use os.name instead of platform.system() because there is a - # platform.py file in this directory, making it very difficult to - # import the platform module from the Python standard library - if os.name == "nt": - fast_nvcc_warn("untested on Windows, might not work; see this URL:") - fast_nvcc_warn(url_vars) - - -def warn_if_tmpdir_flag(args: List[str]) -> None: - """ - Warn the user that using fast_nvcc with some flags might not work. - """ - file_path_specs = "file-and-path-specifications" - guiding_driver = "options-for-guiding-compiler-driver" - scary_flags = { - "--objdir-as-tempdir": file_path_specs, - "-objtemp": file_path_specs, - "--keep": guiding_driver, - "-keep": guiding_driver, - "--keep-dir": guiding_driver, - "-keep-dir": guiding_driver, - "--save-temps": guiding_driver, - "-save-temps": guiding_driver, - } - for arg in args: - for flag, frag in scary_flags.items(): - if re.match(rf"^{re.escape(flag)}(?:=.*)?$", arg): - fast_nvcc_warn(f"{flag} not supported since it interacts with") - fast_nvcc_warn("TMPDIR, so fast_nvcc may break; see this URL:") - fast_nvcc_warn(f"{url_base}#{frag}") - - -class DryunData(TypedDict): - env: Dict[str, str] - commands: List[str] - exit_code: int - - -def nvcc_dryrun_data(binary: str, args: List[str]) -> DryunData: - """ - Return parsed environment variables and commands from nvcc --dryrun. - """ - result = subprocess.run( # type: ignore[call-overload] - [binary, "--dryrun"] + args, - capture_output=True, - encoding="ascii", # this is just a guess - ) - print(result.stdout, end="") - env = {} - commands = [] - output = result.stderr - if os.name == "nt": - output = result.stdout - for line in output.splitlines(): - match = re.match(r"^#\$ (.*)$", line) - if match: - (stripped,) = match.groups() - mapping = re.match(r"^(\w+)=(.*)$", stripped) - if mapping: - name, val = mapping.groups() - env[name] = val - else: - commands.append(stripped) - else: - print(line, file=sys.stderr) - return {"env": env, "commands": commands, "exit_code": result.returncode} - - -def warn_if_tmpdir_set(env: Dict[str, str]) -> None: - """ - Warn the user that setting TMPDIR with fast_nvcc might not work. - """ - if os.getenv("TMPDIR") or "TMPDIR" in env: - fast_nvcc_warn("TMPDIR is set, might not work; see this URL:") - fast_nvcc_warn(url_vars) - - -def contains_non_executable(commands: List[str]) -> bool: - for command in commands: - # This is to deal with special command dry-run result from NVCC such as: - # ``` - # #$ "/lib64/ccache"/c++ -std=c++11 -E -x c++ -D__CUDACC__ -D__NVCC__ -fPIC -fvisibility=hidden -O3 \ - # -I ... -m64 "reduce_scatter.cu" > "/tmp/tmpxft_0037fae3_00000000-5_reduce_scatter.cpp4.ii - # #$ -- Filter Dependencies -- > ... pytorch/build/nccl/obj/collectives/device/reduce_scatter.dep.tmp - # ``` - if command.startswith("--"): - return True - return False - - -def module_id_contents(command: List[str]) -> str: - """ - Guess the contents of the .module_id file contained within command. - """ - if command[0] == "cicc": - path = command[-3] - elif command[0] == "cudafe++": - path = command[-1] - middle = pathlib.PurePath(path).name.replace("-", "_").replace(".", "_") - # this suffix is very wrong (the real one is far less likely to be - # unique), but it seems difficult to find a rule that reproduces the - # real suffixes, so here's one that, while inaccurate, is at least - # hopefully as straightforward as possible - suffix = hashlib.md5(str.encode(middle)).hexdigest()[:8] - return f"_{len(middle)}_{middle}_{suffix}" - - -def unique_module_id_files(commands: List[str]) -> List[str]: - """ - Give each command its own .module_id filename instead of sharing. - """ - module_id = None - uniqueified = [] - for i, line in enumerate(commands): - arr = [] - - def uniqueify(s: Match[str]) -> str: - filename = re.sub(r"\-(\d+)", r"-\1-" + str(i), s.group(0)) - arr.append(filename) - return filename - - line = re.sub(re_tmp + r".module_id", uniqueify, line) - line = re.sub(r"\s*\-\-gen\_module\_id\_file\s*", " ", line) - if arr: - (filename,) = arr - if os.name == "nt": - filename = "%TEMP%\\" + filename - if not module_id: - module_id = module_id_contents(shlex.split(line)) - uniqueified.append(f"echo -n '{module_id}' > \"{filename}\"") - uniqueified.append(line) - return uniqueified - - -def make_rm_force(commands: List[str]) -> List[str]: - """ - Add --force to all rm commands. - """ - return [f"{c} --force" if c.startswith("rm ") else c for c in commands] - - -def print_verbose_output( - *, - env: Dict[str, str], - commands: List[List[str]], - filename: str, -) -> None: - """ - Human-readably write nvcc --dryrun data to stderr. - """ - padding = len(str(len(commands) - 1)) - with open(filename, "w") as f: - for name, val in env.items(): - print(f'#{" "*padding}$ {name}={val}', file=f) - for i, command in enumerate(commands): - prefix = f"{str(i).rjust(padding)}$ " - print(f"#{prefix}{command[0]}", file=f) - for part in command[1:]: - print(f'#{" "*len(prefix)}{part}', file=f) - - -Graph = List[Set[int]] - - -def straight_line_dependencies(commands: List[str]) -> Graph: - """ - Return a straight-line dependency graph. - """ - return [({i - 1} if i > 0 else set()) for i in range(len(commands))] - - -def files_mentioned(command: str) -> List[str]: - """ - Return fully-qualified names of all tmp files referenced by command. - """ - if os.name == "nt": - return [f"/%TEMP%/{match.group(1)}" for match in re.finditer(re_tmp, command)] - return [f"/tmp/{match.group(1)}" for match in re.finditer(re_tmp, command)] - - -def nvcc_data_dependencies(commands: List[str]) -> Graph: - """ - Return a list of the set of dependencies for each command. - """ - # fatbin needs to be treated specially because while the cicc steps - # do refer to .fatbin.c files, they do so through the - # --include_file_name option, since they're generating files that - # refer to .fatbin.c file(s) that will later be created by the - # fatbinary step; so for most files, we make a data dependency from - # the later step to the earlier step, but for .fatbin.c files, the - # data dependency is sort of flipped, because the steps that use the - # files generated by cicc need to wait for the fatbinary step to - # finish first - tmp_files: Dict[str, int] = {} - fatbins: DefaultDict[int, Set[str]] = collections.defaultdict(set) - graph = [] - for i, line in enumerate(commands): - deps = set() - for tmp in files_mentioned(line): - if tmp in tmp_files: - dep = tmp_files[tmp] - deps.add(dep) - if dep in fatbins: - for filename in fatbins[dep]: - if filename in tmp_files: - deps.add(tmp_files[filename]) - if tmp.endswith(".fatbin.c") and not line.startswith("fatbinary"): - fatbins[i].add(tmp) - else: - tmp_files[tmp] = i - if (line.startswith("rm ") or line.startswith("erase ")) and not deps: - if os.name == "nt": - commands[i] = line.replace("/", "\\") - deps.add(i - 1) - graph.append(deps) - return graph - - -def is_weakly_connected(graph: Graph) -> bool: - """ - Return true iff graph is weakly connected. - """ - if not graph: - return True - neighbors: List[Set[int]] = [set() for _ in graph] - for node, predecessors in enumerate(graph): - for pred in predecessors: - neighbors[pred].add(node) - neighbors[node].add(pred) - # assume nonempty graph - stack = [0] - found = {0} - while stack: - node = stack.pop() - for neighbor in neighbors[node]: - if neighbor not in found: - found.add(neighbor) - stack.append(neighbor) - return len(found) == len(graph) - - -def warn_if_not_weakly_connected(graph: Graph) -> None: - """ - Warn the user if the execution graph is not weakly connected. - """ - if not is_weakly_connected(graph): - fast_nvcc_warn("execution graph is not (weakly) connected") - - -def print_dot_graph( - *, - commands: List[List[str]], - graph: Graph, - filename: str, -) -> None: - """ - Print a DOT file displaying short versions of the commands in graph. - """ - - def name(k: int) -> str: - return f'"{k} {os.path.basename(commands[k][0])}"' - - with open(filename, "w") as f: - print("digraph {", file=f) - # print all nodes, in case it's disconnected - for i in range(len(graph)): - print(f" {name(i)};", file=f) - for i, deps in enumerate(graph): - for j in deps: - print(f" {name(j)} -> {name(i)};", file=f) - print("}", file=f) - - -class Result(TypedDict, total=False): - exit_code: int - stdout: bytes - stderr: bytes - time: float - files: Dict[str, int] - - -async def run_command( - command: str, - *, - env: Dict[str, str], - deps: Set[Awaitable[Result]], - gather_data: bool, - i: int, - save: Optional[str], -) -> Result: - """ - Run the command with the given env after waiting for deps. - """ - for task in deps: - dep_result = await task - # abort if a previous step failed - if "exit_code" not in dep_result or dep_result["exit_code"] != 0: - return {} - if gather_data: - t1 = time.monotonic() - proc = await asyncio.create_subprocess_shell( - command, - env=env, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - stdout, stderr = await proc.communicate() - code = cast(int, proc.returncode) - results: Result = {"exit_code": code, "stdout": stdout, "stderr": stderr} - if gather_data: - t2 = time.monotonic() - results["time"] = t2 - t1 - sizes = {} - for tmp_file in files_mentioned(command): - if os.path.exists(tmp_file): - sizes[tmp_file] = os.path.getsize(tmp_file) - else: - sizes[tmp_file] = 0 - results["files"] = sizes - if save: - dest = pathlib.Path(save) / str(i) - dest.mkdir() - for src in map(pathlib.Path, files_mentioned(command)): - if src.exists(): - shutil.copy2(src, dest / (src.name)) - return results - - -async def run_graph( - *, - env: Dict[str, str], - commands: List[str], - graph: Graph, - gather_data: bool = False, - save: Optional[str] = None, -) -> List[Result]: - """ - Return outputs/errors (and optionally time/file info) from commands. - """ - if os.name == "nt": - env.update(os.environ.copy()) - tasks: List[Awaitable[Result]] = [] - for i, (command, indices) in enumerate(zip(commands, graph)): - deps = {tasks[j] for j in indices} - tasks.append( - asyncio.create_task( - run_command( # type: ignore[attr-defined] - command, - env=env, - deps=deps, - gather_data=gather_data, - i=i, - save=save, - ) - ) - ) - return [await task for task in tasks] - - -def print_command_outputs(command_results: List[Result]) -> None: - """ - Print captured stdout and stderr from commands. - """ - for result in command_results: - sys.stdout.write(result.get("stdout", b"").decode("ascii")) - sys.stderr.write(result.get("stderr", b"").decode("ascii")) - - -def write_log_csv( - command_parts: List[List[str]], - command_results: List[Result], - *, - filename: str, -) -> None: - """ - Write a CSV file of the times and /tmp file sizes from each command. - """ - tmp_files: List[str] = [] - for result in command_results: - tmp_files.extend(result.get("files", {}).keys()) - with open(filename, "w", newline="") as csvfile: - fieldnames = ["command", "seconds"] + list(dict.fromkeys(tmp_files)) - writer = csv.DictWriter(csvfile, fieldnames=fieldnames) - writer.writeheader() - for i, result in enumerate(command_results): - command = f"{i} {os.path.basename(command_parts[i][0])}" - row = {"command": command, "seconds": result.get("time", 0)} - writer.writerow({**row, **result.get("files", {})}) - - -def exit_code(results: List[Result]) -> int: - """ - Aggregate individual exit codes into a single code. - """ - for result in results: - code = result.get("exit_code", 0) - if code != 0: - return code - return 0 - - -def wrap_nvcc( - args: List[str], - config: argparse.Namespace = default_config, -) -> int: - return subprocess.call([config.nvcc] + args) - - -def fast_nvcc( - args: List[str], - *, - config: argparse.Namespace = default_config, -) -> int: - """ - Emulate the result of calling the given nvcc binary with args. - - Should run faster than plain nvcc. - """ - warn_if_windows() - warn_if_tmpdir_flag(args) - dryrun_data = nvcc_dryrun_data(config.nvcc, args) - env = dryrun_data["env"] - warn_if_tmpdir_set(env) - commands = dryrun_data["commands"] - if not config.faithful: - commands = make_rm_force(unique_module_id_files(commands)) - - if contains_non_executable(commands): - return wrap_nvcc(args, config) - - command_parts = list(map(shlex.split, commands)) - if config.verbose: - print_verbose_output( - env=env, - commands=command_parts, - filename=config.verbose, - ) - graph = nvcc_data_dependencies(commands) - warn_if_not_weakly_connected(graph) - if config.graph: - print_dot_graph( - commands=command_parts, - graph=graph, - filename=config.graph, - ) - if config.sequential: - graph = straight_line_dependencies(commands) - results = asyncio.run( - run_graph( # type: ignore[attr-defined] - env=env, - commands=commands, - graph=graph, - gather_data=bool(config.table), - save=config.save, - ) - ) - print_command_outputs(results) - if config.table: - write_log_csv(command_parts, results, filename=config.table) - return exit_code([dryrun_data] + results) # type: ignore[arg-type, operator] - - -def our_arg(arg: str) -> bool: - return arg != "--" - - -if __name__ == "__main__": - argv = sys.argv[1:] - us = list(itertools.takewhile(our_arg, argv)) - them = list(itertools.dropwhile(our_arg, argv)) - sys.exit(fast_nvcc(them[1:], config=parser.parse_args(us))) diff --git a/tools/fast_nvcc/wrap_nvcc.bat.in b/tools/fast_nvcc/wrap_nvcc.bat.in deleted file mode 100644 index f02a751e3a4..00000000000 --- a/tools/fast_nvcc/wrap_nvcc.bat.in +++ /dev/null @@ -1 +0,0 @@ -python "@FAST_NVCC_EXECUTABLE@" --nvcc "@CUDA_NVCC_EXECUTABLE_ORIGIN@" -- %* diff --git a/tools/fast_nvcc/wrap_nvcc.sh.in b/tools/fast_nvcc/wrap_nvcc.sh.in deleted file mode 100644 index d15ed016068..00000000000 --- a/tools/fast_nvcc/wrap_nvcc.sh.in +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -# This script was created because cmake is not happy about dangling -- when -# defining CUDA_NVCC_EXECUTABLE, thus we wrapped it in a shell script. -@FAST_NVCC_EXECUTABLE@ --nvcc @CUDA_NVCC_EXECUTABLE_ORIGIN@ -- "$@"