From 9e18b6a0f3ff154e71f2a7efbac331425e1ad46d Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Tue, 4 Feb 2025 11:47:02 -0800 Subject: [PATCH] [CUDA] Update nvcc flags (#23572) ### Description (1) Remove `if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11)` since the build requires CUDA >= 11.4. (2) Add sm_86 and sm_89 since we generate SASS code for the specified CUDA architectures only. This change adds support for popular consumer GPUs (like RTX 30X0 and RTX 40X0). (3) Add sm_120 to support Blackwell GPUs (like RTX 50X0). (4) Add `-Xfatbin=-compress-all` to reduce wheel size. When CMAKE_CUDA_ARCHITECTURES is not specified, the size of the Linux wheel built with CUDA 12.8 is reduced by 8% (from 324MB to 299MB). ### Motivation and Context To support popular consumer GPUs (RTX 30x0, 40x0, 50x0) in the default setting, and to reduce binary size. Note that the default sm settings do not impact the officially released binaries. ORT's officially released binaries are built with arguments like CMAKE_CUDA_ARCHITECTURES=75;80;90, which generates both SASS (real) and PTX (virtual) code for each listed architecture by default. See https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html for more info. 
--- cmake/CMakeLists.txt | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 962b42c190..c1a171fba0 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -1517,35 +1517,33 @@ if (onnxruntime_USE_CUDA) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_53,code=sm_53") # TX1, Nano set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_62,code=sm_62") # TX2 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_72,code=sm_72") # AGX Xavier, NX Xavier - if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_87,code=sm_87") # AGX Orin, NX Orin - endif() + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_87,code=sm_87") # AGX Orin, NX Orin else() - # the following compute capabilities are removed in CUDA 11 Toolkit - if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_30,code=sm_30") # K series - endif() if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12) # 37, 50 still work in CUDA 11 but are marked deprecated and will be removed in future CUDA version. set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_37,code=sm_37") # K80 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_50,code=sm_50") # M series endif() + # Note that we generate SASS code for specified cuda architectures. It does not support forward compatibility. + # To add PTX for future GPU architectures >= XX, append -gencode=arch=compute_XX,code=compute_XX. 
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_52,code=sm_52") # M60 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_60,code=sm_60") # P series set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_70,code=sm_70") # V series set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_75,code=sm_75") # T series - if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_80,code=sm_80") # A series - endif() + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_80,code=sm_80") # A series + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_86,code=sm_86") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_89,code=sm_89") if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_90,code=sm_90") # H series + if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_120,code=sm_120") # B series + endif() endif() endif() endif() + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xfatbin=-compress-all") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") - if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --Werror default-stream-launch") - endif() + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --Werror default-stream-launch") if (NOT WIN32) list(APPEND CUDA_NVCC_FLAGS --compiler-options -fPIC) endif()