mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-04 23:59:56 +00:00
### Description
Add memory-efficient attention from CUTLASS.

TODO (in next pull request):
(1) Run performance tests on different GPUs, then add a sequence length threshold (only activate it for long sequence lengths).
(2) Merge changes from https://github.com/NVIDIA/cutlass/pull/773 once they are in cutlass master.
12 lines
333 B
CMake
12 lines
333 B
CMake
# Download the CUTLASS sources at configure time, but only when
# onnxruntime_USE_FLASH_ATTENTION evaluates to true. Nothing is fetched
# otherwise.
if(onnxruntime_USE_FLASH_ATTENTION)
  include(FetchContent)

  # Record where CUTLASS comes from. GIT_TAG is a full commit hash, so the
  # checkout is pinned to one exact revision rather than a moving branch.
  FetchContent_Declare(
    cutlass
    GIT_REPOSITORY https://github.com/nvidia/cutlass.git
    GIT_TAG        8b42e751c63ba219755c8ed91af5f6ec1ecc1ee6
  )

  # Query the population state first so the download happens at most once.
  # FetchContent_Populate() only fetches the content; it does not call
  # add_subdirectory() on it.
  FetchContent_GetProperties(cutlass)
  if(NOT cutlass_POPULATED)
    FetchContent_Populate(cutlass)
  endif()
endif()
|