mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-21 21:52:11 +00:00
Fix unused variable for CUDA EP builds with USE_FLASH_ATTENTION off (#14404)
### Description
Fixes unused `use_memory_efficient_attention` variable in
contrib_ops/cuda/bert/attention_impl.cu.
### Motivation and Context
ORT with CUDA version < 11.6 fails to build for release configurations
due to an unused variable.
```shell
c:\...\onnxruntime\onnxruntime\contrib_ops\cuda\bert\attention_impl.cu(420): error : variable "use_memory_efficient_attention" was declared but never referenced [C:\...\onnxruntime\build\Windows\RelWithDebInfo\onnxruntime_providers_cuda.vcxproj]
detected during instantiation of "onnxruntime::common::Status onnxruntime::contrib::cuda::QkvToContext(const cudaDeviceProp &, cublasHandle_t &, cudaStream_t, onnxruntime::contrib::AttentionParameters &, onnxruntime::contrib::cuda::AttentionData<T> &) [with T=float]"
(923): here
```
This happens for CUDA < 11.6. Our CMake script turns off
`onnxruntime_USE_FLASH_ATTENTION` for CUDA < 11.6, which leaves the
aforementioned variable referenced only inside `assert`s. Because asserts
are compiled out in release builds, the variable ends up unused there.
The USE_FLASH_ATTENTION option was added by
https://github.com/microsoft/onnxruntime/pull/14343
This commit is contained in:
parent
3c1ef7dee6
commit
85d7e9c596
1 changed file with 4 additions and 4 deletions
|
|
@ -417,10 +417,10 @@ Status QkvToContext(
|
|||
const bool past_present_share_buffer = parameters.past_present_share_buffer;
|
||||
const float mask_filter_value = parameters.mask_filter_value;
|
||||
void* fused_runner = data.fused_runner;
|
||||
bool use_memory_efficient_attention = data.use_memory_efficient_attention;
|
||||
|
||||
// At most one fused kernel is enabled.
|
||||
assert(int(use_memory_efficient_attention) + int(fused_runner != nullptr) + int(data.fused_cross_attention_kernel != nullptr) <= 1);
|
||||
assert(int(data.use_memory_efficient_attention) + int(fused_runner != nullptr) +
|
||||
int(data.fused_cross_attention_kernel != nullptr) <= 1);
|
||||
|
||||
const int batches = batch_size * num_heads;
|
||||
const int size_per_batch_q = sequence_length * qk_head_size;
|
||||
|
|
@ -469,7 +469,7 @@ Status QkvToContext(
|
|||
assert(data.fused_cross_attention_kernel == nullptr);
|
||||
assert(!use_fused_kernel);
|
||||
assert(data.gemm_buffer != nullptr);
|
||||
assert(!use_memory_efficient_attention);
|
||||
assert(!data.use_memory_efficient_attention);
|
||||
|
||||
if (data.present != data.past) {
|
||||
// For easy testing. Production should better avoid this path.
|
||||
|
|
@ -564,7 +564,7 @@ Status QkvToContext(
|
|||
}
|
||||
|
||||
#if USE_FLASH_ATTENTION
|
||||
if (use_memory_efficient_attention) {
|
||||
if (data.use_memory_efficient_attention) {
|
||||
// We only enable fused cross attention when there is no key padding mask.
|
||||
// Otherwise, key have effective batch size 2 * batch_size, which is different from batch_size of query.
|
||||
assert(data.mask_index == nullptr);
|
||||
|
|
|
|||
Loading…
Reference in a new issue