[CUDA] Update nvcc flags (#23572)

### Description
(1) Remove `if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11)`
since build requires cuda >= 11.4.
(2) Add sm_86 and sm_89 since we generate SASS code for specified cuda
architectures only. This change could support popular consumer GPUs
(like RTX 30X0 and RTX 40X0).
(3) Add sm_120 to support Blackwell GPUs (like RTX 50X0 etc).
(4) Add `-Xfatbin=-compress-all` to reduce wheel size. When
CMAKE_CUDA_ARCHITECTURES is not specified, the linux wheel size built by
CUDA 12.8 is reduced 8% (from 324MB to 299MB).

### Motivation and Context

To support popular consumer GPUs (RTX 30x0, 40x0, 50x0) in the default
setting. Reduce binary size.

Note that the default sm settings does not impact official released
binary. ORT official released binary are built with augmentation like
CMAKE_CUDA_ARCHITECTURES=75;80;90, which has both SASS (real) and PTX
(virtual) by default. See
https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html for
more info.
This commit is contained in:
Tianlei Wu 2025-02-04 11:47:02 -08:00 committed by GitHub
parent b47e1e64d7
commit 9e18b6a0f3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -1517,35 +1517,33 @@ if (onnxruntime_USE_CUDA)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_53,code=sm_53") # TX1, Nano
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_62,code=sm_62") # TX2
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_72,code=sm_72") # AGX Xavier, NX Xavier
if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_87,code=sm_87") # AGX Orin, NX Orin
endif()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_87,code=sm_87") # AGX Orin, NX Orin
else()
# the following compute capabilities are removed in CUDA 11 Toolkit
if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_30,code=sm_30") # K series
endif()
if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12)
# 37, 50 still work in CUDA 11 but are marked deprecated and will be removed in future CUDA version.
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_37,code=sm_37") # K80
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_50,code=sm_50") # M series
endif()
# Note that we generate SASS code for specified cuda architectures. It does not support forward compatibility.
# To add PTX for future GPU architectures >= XX, append -gencode=arch=compute_XX,code=compute_XX.
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_52,code=sm_52") # M60
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_60,code=sm_60") # P series
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_70,code=sm_70") # V series
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_75,code=sm_75") # T series
if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_80,code=sm_80") # A series
endif()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_80,code=sm_80") # A series
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_86,code=sm_86")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_89,code=sm_89")
if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_90,code=sm_90") # H series
if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_120,code=sm_120") # B series
endif()
endif()
endif()
endif()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xfatbin=-compress-all")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --Werror default-stream-launch")
endif()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --Werror default-stream-launch")
if (NOT WIN32)
list(APPEND CUDA_NVCC_FLAGS --compiler-options -fPIC)
endif()