Fix causal flash attention related kernel run (#14299)

2026-07-08 17:17:15 +00:00 · 2023-01-13 21:40:22 -08:00 · 2023-01-13 21:40:22 -08:00 · bd39c8f35e
commit bd39c8f35e
parent 8824f812e0
1 changed files with 1 additions and 1 deletions
--- a/onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.cu
+++ b/onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.cu
@@ -159,7 +159,7 @@ class FusedMHARunnerFP16v2::mhaImpl {
    params.o_ptr = output;
    params.cu_seqlens = static_cast<int*>(const_cast<void*>(cu_seqlens));

-    if (use_flash_attention && flash_attention_kernel != nullptr) {
+    if (use_flash_attention && flash_attention_kernel != nullptr && !has_causal_mask) {
      flash_attention_kernel->run(params, stream);
    } else {
      xmmaKernel->run(params, stream, use_flash_attention, has_causal_mask);