From 8f7eb75c3ef659da5cc85100a31199c5c4e31808 Mon Sep 17 00:00:00 2001
From: Yufeng Li <liyufeng1987@gmail.com>
Date: Thu, 12 Jan 2023 09:06:05 -0800
Subject: [PATCH] fix greedysearch token out of range bug (#14242)

Bug: the last sentence can generate a token that is out of range of the vocabulary size.
Cause: the total element count should be computed with the padded vocabulary size.
---
 .../contrib_ops/cuda/transformers/generation_cuda_impl.cu       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu
index bf98394d1c..600eb50648 100644
--- a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu
+++ b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu
@@ -181,7 +181,7 @@ void LaunchLogitsProcessKernel(
     float repetition_penalty,
     int no_repeat_ngram_size,
     cudaStream_t stream) {
-  int total_elements = batch_size * num_beams * vocab_size;
+  int total_elements = batch_size * num_beams * padded_vocab_size;
   constexpr int blockSize = 256;
   const int gridSize = (total_elements + blockSize - 1) / blockSize;
   LogitsProcessKernel<T><<<gridSize, blockSize, 0, stream>>>(