mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-16 21:00:14 +00:00
### Description - [x] Add cuDNN flash attention using cuDNN frontend, and enable it in the MultiHeadAttention operator. - [x] Support attention mask. - [x] Support attention bias. - [x] Update tests and benchmark script. The cuDNN SDPA is disabled by default. To enable it, the following are needed: (1) cuDNN 9.3 or a newer version must be installed. (2) Set an environment variable `ORT_ENABLE_CUDNN_FLASH_ATTENTION=1` or set the `sdpa_kernel=8` cuda provider option to enable it. (3) Only works on devices with compute capability >= 8.0. Note that some combinations of parameters might be rejected due to limited support of head dimension or sequence lengths. Future Works: (1) FP8 and BF16 APIs. Currently, only the API for FP16 is exposed. (2) Add API to support ragged batching (padding removed in inputs). (3) Support other input formats (like QKV_BS3NH). (4) Currently, q is converted to BSNH, and k/v are converted to either BSNH or BNSH format. May do some experiments to see whether converting q to BNSH could be better in some cases. ### Example Benchmark Results on H100 The following tests are on the FP16 MultiHeadAttention operator without attention mask and attention bias. 
#### Test Setting 1 batch_size | sequence_length | past_sequence_length | num_heads | head_size -- | -- | -- | -- | -- 16 | 256 | 0 | 32 | 128 format | average_latency | tflops | kernel -- | -- | -- | -- Q,K,V (BNSH) | 0.000075 | 229.5 | torch:flash Q,K,V (BNSH) | 0.000119 | 144.8 | torch:efficient Q,K,V (BNSH) | 0.000224 | 76.5 | torch:math Q,K,V (BSNH) | 0.000075 | 227.8 | ort:cudnn Q,K,V (BSNH) | 0.000094 | 182.8 | ort:flash Q,K,V (BSNH) | 0.000138 | 124.7 | ort:efficient Q,K,V (BSNH) | 0.000438 | 39.3 | ort:math Q,KV | 0.000129 | 133.0 | ort:cudnn Q,KV | 0.000151 | 114.1 | ort:flash Q,KV | 0.000194 | 88.5 | ort:efficient QKV | 0.000154 | 111.8 | ort:cudnn QKV | 0.000175 | 98.0 | ort:flash QKV | 0.000217 | 79.0 | ort:efficient #### Test Setting 2 batch_size | sequence_length | past_sequence_length | num_heads | head_size -- | -- | -- | -- | -- 16 | 512 | 0 | 16 | 64 format | average_latency | tflops | kernel -- | -- | -- | -- Q,K,V (BNSH) | 0.000069 | 249.2 | torch:flash Q,K,V (BNSH) | 0.000141 | 121.7 | torch:efficient Q,K,V (BNSH) | 0.000294 | 58.5 | torch:math Q,K,V (BSNH) | 0.000077 | 221.7 | ort:cudnn Q,K,V (BSNH) | 0.000087 | 196.6 | ort:flash Q,K,V (BSNH) | 0.000163 | 105.6 | ort:efficient Q,K,V (BSNH) | 0.000651 | 26.4 | ort:math Q,KV | 0.000103 | 167.1 | ort:cudnn Q,KV | 0.000117 | 146.3 | ort:flash Q,KV | 0.000192 | 89.6 | ort:efficient QKV | 0.000113 | 151.5 | ort:cudnn QKV | 0.000128 | 134.7 | ort:flash QKV | 0.000201 | 85.3 | ort:efficient
109 lines
2.9 KiB
CMake
109 lines
2.9 KiB
CMake
# Umbrella interface target aggregating the cuDNN headers and all component
# libraries found below; consumers link CUDNN::cudnn_all to pick up everything.
# (Stray "|" scrape-artifact lines removed — they are CMake syntax errors.)
add_library(CUDNN::cudnn_all INTERFACE IMPORTED)
# Locate the cuDNN headers. Search order: an explicit CUDNN_PATH (environment
# variable or CMake variable), a pip-installed "nvidia-cudnn" package inside
# the Python site-packages, then the CUDA toolkit include directories.
# REQUIRED aborts configuration with a clear error when cudnn.h is missing.
find_path(
    CUDNN_INCLUDE_DIR cudnn.h
    HINTS $ENV{CUDNN_PATH} ${CUDNN_PATH} ${Python_SITEARCH}/nvidia/cudnn ${CUDAToolkit_INCLUDE_DIRS}
    PATH_SUFFIXES include
    REQUIRED
)
# Extract the cuDNN major version from cudnn_version.h; it selects which set
# of per-version component libraries is searched for further below.
file(READ "${CUDNN_INCLUDE_DIR}/cudnn_version.h" cudnn_version_header)
# Fix: use [0-9]+ rather than [1-9]+ so a version containing the digit 0
# (e.g. a future major version 10) is matched in full instead of truncating
# "10" to "1".
string(REGEX MATCH "#define CUDNN_MAJOR [0-9]+" macrodef "${cudnn_version_header}")
string(REGEX MATCH "[0-9]+" CUDNN_MAJOR_VERSION "${macrodef}")
# find_cudnn_library(<name>)
#
# Locates the cuDNN component library <name> (e.g. cudnn, cudnn_graph) and,
# when found, wraps it in an imported target CUDNN::<name> that carries the
# cuDNN include directory as a usage requirement.
#
# The versioned SONAME (lib<name>.so.<major>) is listed as an alternative
# name so installations lacking the unversioned dev symlink still resolve.
function(find_cudnn_library NAME)
    find_library(
        ${NAME}_LIBRARY ${NAME} "lib${NAME}.so.${CUDNN_MAJOR_VERSION}"
        HINTS $ENV{CUDNN_PATH} ${CUDNN_PATH} ${Python_SITEARCH}/nvidia/cudnn ${CUDAToolkit_LIBRARY_DIR}
        PATH_SUFFIXES lib64 lib/x64 lib
        REQUIRED
    )

    # NOTE(review): with REQUIRED above, find_library aborts configuration when
    # the library is missing, so the else() branch below is effectively
    # unreachable; it is kept for parity with a non-REQUIRED variant.
    if(${NAME}_LIBRARY)
        add_library(CUDNN::${NAME} UNKNOWN IMPORTED)
        # Property values are quoted so paths containing spaces survive intact.
        set_target_properties(
            CUDNN::${NAME} PROPERTIES
            INTERFACE_INCLUDE_DIRECTORIES "${CUDNN_INCLUDE_DIR}"
            IMPORTED_LOCATION "${${NAME}_LIBRARY}"
        )
        message(STATUS "${NAME} found at ${${NAME}_LIBRARY}.")
    else()
        message(STATUS "${NAME} not found.")
    endif()
endfunction()
# The core cuDNN library is required for every supported major version;
# version-specific component libraries are located further below.
find_cudnn_library(cudnn)
include(FindPackageHandleStandardArgs)

# Standard found/not-found validation and summary output for the two
# discovered variables. NOTE(review): the package name passed here is the
# literal "LIBRARY" (so FPHSA sets LIBRARY_FOUND, not CUDNN_FOUND);
# CUDNN_FOUND itself is set explicitly below — confirm whether renaming the
# package to CUDNN is intended upstream.
find_package_handle_standard_args(
    LIBRARY REQUIRED_VARS
    CUDNN_INCLUDE_DIR cudnn_LIBRARY
)
# Record the overall result in the cache (INTERNAL: not user-visible) so
# later configure logic can simply test CUDNN_FOUND.
if(CUDNN_INCLUDE_DIR AND cudnn_LIBRARY)
    message(STATUS "cuDNN: ${cudnn_LIBRARY}")
    message(STATUS "cuDNN: ${CUDNN_INCLUDE_DIR}")
    set(CUDNN_FOUND ON CACHE INTERNAL "cuDNN Library Found")
else()
    set(CUDNN_FOUND OFF CACHE INTERNAL "cuDNN Library Not Found")
endif()
# Attach the header directory to the umbrella target: the build tree uses the
# discovered CUDNN_INCLUDE_DIR, while installed consumers get the relocatable
# "include" path via the INSTALL_INTERFACE genex.
target_include_directories(
    CUDNN::cudnn_all
    INTERFACE
        $<INSTALL_INTERFACE:include>
        $<BUILD_INTERFACE:${CUDNN_INCLUDE_DIR}>
)

# The core library is always part of the umbrella target, regardless of the
# installed cuDNN major version.
target_link_libraries(
    CUDNN::cudnn_all
    INTERFACE
        CUDNN::cudnn
)
# cuDNN reorganized its component libraries between major versions 8 and 9
# (e.g. the *_infer/*_train split was merged and graph/engine libraries were
# added). Locate the component set matching the installed version and hang
# every component off the umbrella target.
if(CUDNN_MAJOR_VERSION EQUAL 8)
    find_cudnn_library(cudnn_adv_infer)
    find_cudnn_library(cudnn_adv_train)
    find_cudnn_library(cudnn_cnn_infer)
    find_cudnn_library(cudnn_cnn_train)
    find_cudnn_library(cudnn_ops_infer)
    find_cudnn_library(cudnn_ops_train)

    target_link_libraries(
        CUDNN::cudnn_all
        INTERFACE
            CUDNN::cudnn_adv_train
            CUDNN::cudnn_ops_train
            CUDNN::cudnn_cnn_train
            CUDNN::cudnn_adv_infer
            CUDNN::cudnn_cnn_infer
            CUDNN::cudnn_ops_infer
    )
elseif(CUDNN_MAJOR_VERSION EQUAL 9)
    find_cudnn_library(cudnn_cnn)
    find_cudnn_library(cudnn_adv)
    find_cudnn_library(cudnn_graph)
    find_cudnn_library(cudnn_ops)
    find_cudnn_library(cudnn_engines_runtime_compiled)
    find_cudnn_library(cudnn_engines_precompiled)
    find_cudnn_library(cudnn_heuristic)

    target_link_libraries(
        CUDNN::cudnn_all
        INTERFACE
            CUDNN::cudnn_adv
            CUDNN::cudnn_ops
            CUDNN::cudnn_cnn
            CUDNN::cudnn_graph
            CUDNN::cudnn_engines_runtime_compiled
            CUDNN::cudnn_engines_precompiled
            CUDNN::cudnn_heuristic
    )
endif()
|