mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-23 02:38:28 +00:00
### Description - [x] Add cuDNN flash attention using cudnn frontend, and enable it in MultiHeadAttention operator. - [x] Support attention mask. - [x] Support attention bias. - [x] Update tests and benchmark script. The cuDNN SDPA is disabled by default. To enable it, need the following: (1) Requires cuDNN 9.3 or newer version installed. (2) Set an environment variable `ORT_ENABLE_CUDNN_FLASH_ATTENTION=1` or set `sdpa_kernel=8` cuda provider option to enable it. (3) Only works on devices with compute capability >= 8.0. Note that some combinations of parameters might be rejected due to limited support of head dimension or sequence lengths. Future Works: (1) FP8 and BF16 APIs. Currently, only API for FP16 are exposed. (2) Add API to support ragged batching (padding removed in inputs). (3) Support other input formats (like QKV_BS3NH). (4) Currently, q are converted to BSNH, k/v are converted to either BSNH or BNSH format. May do some experiment to see whether converting q to BNSH could be better in some case. ### Example Benchmark Results on H100 The following tests are on FP16 MultiHeadAttention operator without attention mask and attention bias. #### Test Setting 1 batch_size | sequence_length | past_sequence_length | num_heads | head_size -- | -- | -- | -- | -- 16 | 256 | 0 | 32 | 128 format | average_latency | tflops | kernel -- | -- | -- | -- Q,K,V (BNSH) | 0.000075 | 229.5 | torch:flash Q,K,V (BNSH) | 0.000119 | 144.8 | torch:efficient Q,K,V (BNSH) | 0.000224 | 76.5 | torch:math Q,K,V (BSNH) | 0.000075 | 227.8 | ort:cudnn Q,K,V (BSNH) | 0.000094 | 182.8 | ort:flash Q,K,V (BSNH) | 0.000138 | 124.7 | ort:efficient Q,K,V (BSNH) | 0.000438 | 39.3 | ort:math Q,KV | 0.000129 | 133.0 | ort:cudnn Q,KV | 0.000151 | 114.1 | ort:flash Q,KV | 0.000194 | 88.5 | ort:efficient QKV | 0.000154 | 111.8 | ort:cudnn QKV | 0.000175 | 98.0 | ort:flash QKV | 0.000217 | 79.0 | ort:efficient #### Test Setting 2 batch_size | sequence_length | past_sequence_length | num_heads | head_size -- | -- | -- | -- | -- 16 | 512 | 0 | 16 | 64 format | average_latency | tflops | kernel -- | -- | -- | -- Q,K,V (BNSH) | 0.000069 | 249.2 | torch:flash Q,K,V (BNSH) | 0.000141 | 121.7 | torch:efficient Q,K,V (BNSH) | 0.000294 | 58.5 | torch:math Q,K,V (BSNH) | 0.000077 | 221.7 | ort:cudnn Q,K,V (BSNH) | 0.000087 | 196.6 | ort:flash Q,K,V (BSNH) | 0.000163 | 105.6 | ort:efficient Q,K,V (BSNH) | 0.000651 | 26.4 | ort:math Q,KV | 0.000103 | 167.1 | ort:cudnn Q,KV | 0.000117 | 146.3 | ort:flash Q,KV | 0.000192 | 89.6 | ort:efficient QKV | 0.000113 | 151.5 | ort:cudnn QKV | 0.000128 | 134.7 | ort:flash QKV | 0.000201 | 85.3 | ort:efficient
261 lines
8.4 KiB
CMake
261 lines
8.4 KiB
CMake
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
# Licensed under the MIT License.
|
|
|
|
find_package(Python3 COMPONENTS Interpreter REQUIRED)
|
|
|
|
# GLOB pattern of file to be excluded
|
|
set(contrib_ops_excluded_files
|
|
"bert/cudnn_fmha/*"
|
|
"bert/cutlass_fmha/*"
|
|
"bert/fastertransformer_decoder_attention/*"
|
|
"bert/flash_attention/*"
|
|
"bert/tensorrt_fused_multihead_attention/*"
|
|
"bert/attention.cc"
|
|
"bert/attention.h"
|
|
"bert/attention_impl.cu"
|
|
"bert/attention_softmax.h"
|
|
"bert/attention_softmax.cu"
|
|
"bert/attention_prepare_qkv.cu"
|
|
"bert/attention_kernel_options.h"
|
|
"bert/attention_kernel_options.cc"
|
|
"bert/decoder_attention_impl.h"
|
|
"bert/decoder_attention_impl.cu"
|
|
"bert/decoder_masked_multihead_attention.h"
|
|
"bert/decoder_masked_multihead_attention.cc"
|
|
"bert/decoder_masked_self_attention.h"
|
|
"bert/decoder_masked_self_attention.cc"
|
|
"bert/multihead_attention.cc"
|
|
"bert/multihead_attention.h"
|
|
"bert/relative_attn_bias.cc"
|
|
"bert/relative_attn_bias.h"
|
|
"bert/relative_attn_bias_impl.cu"
|
|
"bert/relative_attn_bias_impl.h"
|
|
"bert/skip_layer_norm.cc"
|
|
"bert/skip_layer_norm.h"
|
|
"bert/skip_layer_norm_impl.cu"
|
|
"bert/skip_layer_norm_impl.h"
|
|
"bert/transformer_common.h"
|
|
"bert/transformer_common.cc"
|
|
"bert/packed_attention.h"
|
|
"bert/packed_attention.cc"
|
|
"bert/packed_attention_impl.h"
|
|
"bert/packed_attention_impl.cu"
|
|
"bert/packed_multihead_attention.h"
|
|
"bert/packed_multihead_attention.cc"
|
|
"bert/packed_multihead_attention_impl.h"
|
|
"bert/packed_multihead_attention_impl.cu"
|
|
"diffusion/group_norm_impl.cu"
|
|
"diffusion/nhwc_conv.cc"
|
|
"math/gemm_float8.cc"
|
|
"math/gemm_float8.cu"
|
|
"math/gemm_float8.h"
|
|
"moe/*"
|
|
"sparse/*"
|
|
"quantization/attention_quantization.cc"
|
|
"quantization/attention_quantization.h"
|
|
"quantization/attention_quantization_impl.cu"
|
|
"quantization/attention_quantization_impl.cuh"
|
|
"quantization/dequantize_blockwise_bnb4.cuh"
|
|
"quantization/dequantize_blockwise_bnb4.cu"
|
|
"quantization/matmul_bnb4.cc"
|
|
"quantization/matmul_bnb4.cuh"
|
|
"quantization/matmul_bnb4.cu"
|
|
"quantization/moe_quantization.h"
|
|
"quantization/moe_quantization.cc"
|
|
"quantization/quantize_dequantize_linear.cc"
|
|
"quantization/qordered_ops/qordered_attention_impl.cu"
|
|
"quantization/qordered_ops/qordered_attention_impl.h"
|
|
"quantization/qordered_ops/qordered_attention_input_enum.h"
|
|
"quantization/qordered_ops/qordered_attention.cc"
|
|
"quantization/qordered_ops/qordered_attention.h"
|
|
"quantization/qordered_ops/qordered_common.cuh"
|
|
"quantization/qordered_ops/qordered_layer_norm.h"
|
|
"quantization/qordered_ops/qordered_layer_norm.cc"
|
|
"quantization/qordered_ops/qordered_layer_norm_impl.h"
|
|
"quantization/qordered_ops/qordered_layer_norm_impl.cu"
|
|
"quantization/qordered_ops/qordered_longformer_attention.cc"
|
|
"quantization/qordered_ops/qordered_longformer_attention.h"
|
|
"quantization/qordered_ops/qordered_matmul.h"
|
|
"quantization/qordered_ops/qordered_matmul.cc"
|
|
"quantization/qordered_ops/qordered_matmul_utils.h"
|
|
"quantization/qordered_ops/qordered_matmul_utils.cc"
|
|
"quantization/qordered_ops/qordered_qdq_impl.cu"
|
|
"quantization/qordered_ops/qordered_qdq_impl.h"
|
|
"quantization/qordered_ops/qordered_qdq.cc"
|
|
"quantization/qordered_ops/qordered_qdq.h"
|
|
"quantization/qordered_ops/qordered_unary_ops.h"
|
|
"quantization/qordered_ops/qordered_unary_ops.cc"
|
|
"quantization/qordered_ops/qordered_unary_ops_impl.h"
|
|
"quantization/qordered_ops/qordered_unary_ops_impl.cu"
|
|
"cuda_contrib_kernels.cc"
|
|
"cuda_contrib_kernels.h"
|
|
"inverse.cc"
|
|
"fused_conv.cc"
|
|
"bert/group_query_attention.h"
|
|
"bert/group_query_attention.cc"
|
|
"bert/group_query_attention_impl.h"
|
|
"bert/group_query_attention_impl.cu"
|
|
"collective/custom_*"
|
|
"collective/distributed_*"
|
|
"collective/ipc_*"
|
|
"collective/shard*"
|
|
)
|
|
|
|
if (NOT onnxruntime_USE_NCCL)
|
|
# Those are string patterns to exclude. Do NOT use stars such as
|
|
# collective/*.cc or *.h.
|
|
list(APPEND contrib_ops_excluded_files "collective/nccl_kernels.cc")
|
|
endif()
|
|
|
|
if (NOT onnxruntime_ENABLE_ATEN)
|
|
list(APPEND contrib_ops_excluded_files "aten_ops/aten_op.cc")
|
|
endif()
|
|
|
|
set(provider_excluded_files
|
|
"atomic/common.cuh"
|
|
"cu_inc/common.cuh"
|
|
"math/einsum_utils/einsum_auxiliary_ops.cc"
|
|
"math/einsum_utils/einsum_auxiliary_ops.h"
|
|
"math/einsum_utils/einsum_auxiliary_ops_diagonal.cu"
|
|
"math/einsum_utils/einsum_auxiliary_ops_diagonal.h"
|
|
"math/einsum.cc"
|
|
"math/einsum.h"
|
|
"math/gemm.cc"
|
|
"math/matmul.cc"
|
|
"math/softmax_impl.cu"
|
|
"math/softmax_warpwise_impl.cuh"
|
|
"math/softmax_common.cc"
|
|
"math/softmax_common.h"
|
|
"math/softmax.cc"
|
|
"math/softmax.h"
|
|
"nn/conv.cc"
|
|
"nn/conv.h"
|
|
"nn/conv_transpose.cc"
|
|
"nn/conv_transpose.h"
|
|
"nn/pool.cc"
|
|
"nn/pool.h"
|
|
"reduction/reduction_ops.cc"
|
|
"rnn/cudnn_rnn_base.cc"
|
|
"rnn/cudnn_rnn_base.h"
|
|
"rnn/gru.cc"
|
|
"rnn/gru.h"
|
|
"rnn/lstm.cc"
|
|
"rnn/lstm.h"
|
|
"rnn/rnn.cc"
|
|
"rnn/rnn.h"
|
|
"rnn/rnn_impl.cu"
|
|
"rnn/rnn_impl.h"
|
|
"shared_inc/cuda_call.h"
|
|
"shared_inc/cudnn_fe_call.h"
|
|
"shared_inc/fpgeneric.h"
|
|
"cuda_allocator.cc"
|
|
"cuda_allocator.h"
|
|
"cuda_call.cc"
|
|
"cuda_common.cc"
|
|
"cuda_common.h"
|
|
"cuda_execution_provider_info.cc"
|
|
"cuda_execution_provider_info.h"
|
|
"cuda_execution_provider.cc"
|
|
"cuda_execution_provider.h"
|
|
"cuda_memory_check.cc"
|
|
"cuda_memory_check.h"
|
|
"cuda_fence.cc"
|
|
"cuda_fence.h"
|
|
"cuda_kernel.h"
|
|
"cuda_pch.cc"
|
|
"cuda_pch.h"
|
|
"cuda_profiler.cc"
|
|
"cuda_profiler.h"
|
|
"cuda_provider_factory.cc"
|
|
"cuda_provider_factory.h"
|
|
"cuda_stream_handle.cc",
|
|
"cuda_stream_handle.h",
|
|
"cuda_utils.cu"
|
|
"cudnn_common.cc"
|
|
"cudnn_common.h"
|
|
"cudnn_fe_call.cc"
|
|
"cupti_manager.cc"
|
|
"cupti_manager.h"
|
|
"fpgeneric.cu"
|
|
"gpu_data_transfer.cc"
|
|
"gpu_data_transfer.h"
|
|
"integer_gemm.cc"
|
|
"tunable/*"
|
|
"cuda_nhwc_kernels.cc"
|
|
"cuda_nhwc_kernels.h"
|
|
)
|
|
|
|
set(training_ops_excluded_files
|
|
"activation/gelu_grad_impl_common.cuh" # uses custom tanh
|
|
"collective/adasum_kernels.cc"
|
|
"collective/adasum_kernels.h"
|
|
"math/div_grad.cc" # miopen API differs from cudnn, no double type support
|
|
"nn/batch_norm_grad.cc" # no double type support
|
|
"nn/batch_norm_grad.h" # miopen API differs from cudnn
|
|
"nn/batch_norm_internal.cc" # miopen API differs from cudnn, no double type support
|
|
"nn/batch_norm_internal.h" # miopen API differs from cudnn, no double type support
|
|
"nn/conv_grad.cc"
|
|
"nn/conv_grad.h"
|
|
"reduction/reduction_all.cc" # deterministic = true, ignore ctx setting
|
|
"reduction/reduction_ops.cc" # no double type support
|
|
"cuda_training_kernels.cc"
|
|
"cuda_training_kernels.h"
|
|
"nn/conv_shared.cc"
|
|
"nn/conv_shared.h"
|
|
"nn/conv_transpose_grad.cc"
|
|
"nn/conv_transpose_grad.h"
|
|
)
|
|
|
|
function(auto_set_source_files_hip_language)
|
|
foreach(f ${ARGN})
|
|
if(f MATCHES ".*\\.cu$")
|
|
set_source_files_properties(${f} PROPERTIES LANGUAGE HIP)
|
|
endif()
|
|
endforeach()
|
|
endfunction()
|
|
|
|
# cuda_dir must be relative to REPO_ROOT
|
|
function(hipify cuda_dir in_excluded_file_patterns out_generated_cc_files out_generated_cu_files)
|
|
set(hipify_tool ${REPO_ROOT}/tools/ci_build/amd_hipify.py)
|
|
|
|
file(GLOB_RECURSE srcs CONFIGURE_DEPENDS
|
|
"${REPO_ROOT}/${cuda_dir}/cuda/*.h"
|
|
"${REPO_ROOT}/${cuda_dir}/cuda/*.cc"
|
|
"${REPO_ROOT}/${cuda_dir}/cuda/*.cuh"
|
|
"${REPO_ROOT}/${cuda_dir}/cuda/*.cu"
|
|
)
|
|
|
|
# do exclusion
|
|
set(excluded_file_patterns ${${in_excluded_file_patterns}})
|
|
list(TRANSFORM excluded_file_patterns PREPEND "${REPO_ROOT}/${cuda_dir}/cuda/")
|
|
file(GLOB_RECURSE excluded_srcs CONFIGURE_DEPENDS ${excluded_file_patterns})
|
|
foreach(f ${excluded_srcs})
|
|
message(STATUS "Excluded from hipify: ${f}")
|
|
endforeach()
|
|
list(REMOVE_ITEM srcs ${excluded_srcs})
|
|
|
|
foreach(f ${srcs})
|
|
file(RELATIVE_PATH cuda_f_rel "${REPO_ROOT}" ${f})
|
|
string(REPLACE "cuda" "rocm" rocm_f_rel ${cuda_f_rel})
|
|
set(f_out "${CMAKE_CURRENT_BINARY_DIR}/amdgpu/${rocm_f_rel}")
|
|
add_custom_command(
|
|
OUTPUT ${f_out}
|
|
COMMAND Python3::Interpreter ${hipify_tool}
|
|
--hipify_perl ${onnxruntime_HIPIFY_PERL}
|
|
${f} -o ${f_out}
|
|
DEPENDS ${hipify_tool} ${f}
|
|
COMMENT "Hipify: ${cuda_f_rel} -> amdgpu/${rocm_f_rel}"
|
|
)
|
|
if(f MATCHES ".*\\.cuh?$")
|
|
list(APPEND generated_cu_files ${f_out})
|
|
else()
|
|
list(APPEND generated_cc_files ${f_out})
|
|
endif()
|
|
endforeach()
|
|
|
|
set_source_files_properties(${generated_cc_files} PROPERTIES GENERATED TRUE)
|
|
set_source_files_properties(${generated_cu_files} PROPERTIES GENERATED TRUE)
|
|
auto_set_source_files_hip_language(${generated_cu_files})
|
|
set(${out_generated_cc_files} ${generated_cc_files} PARENT_SCOPE)
|
|
set(${out_generated_cu_files} ${generated_cu_files} PARENT_SCOPE)
|
|
endfunction()
|