From 8a9ddfe963cc2eed511eb2f351f1e9b4b5e23aab Mon Sep 17 00:00:00 2001 From: Ye Wang <52801275+wangyems@users.noreply.github.com> Date: Wed, 5 May 2021 09:54:25 -0700 Subject: [PATCH] Longformer Attention non-determinism issue fix (#7574) * Fix run-to-run not deterministic bug. * Remove non-deterministic logic in softmax * Fix value diff when removing non-deterministic issue. Co-authored-by: Lei Zhang --- .../cuda/bert/longformer_attention_softmax.cu | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/onnxruntime/contrib_ops/cuda/bert/longformer_attention_softmax.cu b/onnxruntime/contrib_ops/cuda/bert/longformer_attention_softmax.cu index 0f95df570f..469ef221b2 100644 --- a/onnxruntime/contrib_ops/cuda/bert/longformer_attention_softmax.cu +++ b/onnxruntime/contrib_ops/cuda/bert/longformer_attention_softmax.cu @@ -103,7 +103,7 @@ __launch_bounds__(blockSize) if (is_local_row) { for (int g = tid; g < global_num; g += blockSize) { int i = global_index[g]; - if (i < col_start || i > col_end) { + if (i < col_start || i >= col_end) { float x = input_block[i]; x = x * scaler + (float)mask_block[i]; if (max_input < x) { @@ -130,7 +130,7 @@ __launch_bounds__(blockSize) if (is_local_row) { for (int g = tid; g < global_num; g += blockSize) { int i = global_index[g]; - if (i < col_start || i > col_end) { + if (i < col_start || i >= col_end) { float x = input_block[i]; x = expf((x)*scaler + (float)mask_block[i] - max_shared); sum_input += x; @@ -163,14 +163,21 @@ __launch_bounds__(blockSize) } for (int i = tid + zero_start; i < zero_end; i += blockSize) { - output_block[i] = (T)(0.); + if (i < col_start || i >= col_end) { + output_block[i] = (T)(0.); + } } + } + __syncthreads(); + if (is_local_row) { for (int g = tid; g < global_num; g += blockSize) { int i = global_index[g]; - float x = input_block[i]; - x = expf((x)*scaler + (float)mask_block[i] - max_shared); - output_block[i] = (T)(recip_sum * x); + if (i < col_start || i >= col_end) { + float x = input_block[i]; + x = expf((x)*scaler + (float)mask_block[i] - max_shared); + output_block[i] = (T)(recip_sum * x); + } } }