diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu index 5b960460c4..6fc80be2e2 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu @@ -416,6 +416,10 @@ __global__ void BeamSearchScorer_Process(BeamScorerState& state_cpu, int32_t next_index = next_indices[batch * top_k + j]; int batch_beam_idx = batch_start + next_index; + + printf("\nbatch=%d batch_beam_idx=%d j=%uz next_token=%d eos_token_id=%d next_score=%f next_index=%d\n", + batch, batch_beam_idx, j, next_token, state.eos_token_id_, next_score, next_index); + // Add to generated hypotheses if end of sentence. if ((state.eos_token_id_ >= 0) && (next_token == state.eos_token_id_)) { bool is_beam_token_worse_than_top_num_beams = (j >= state.num_beams_); @@ -444,6 +448,8 @@ __global__ void BeamSearchScorer_Process(BeamScorerState& state_cpu, } // Check if we are done so that we can save a pad step if all(done) + printf("\n beam_hyp.beams_used_ == state.num_beams_ is %d\n", beam_hyp.beams_used_ == state.num_beams_ ? 1 : 0); + if (beam_hyp.beams_used_ == state.num_beams_) { if (state.early_stopping_ || !beam_hyp.CanImprove(*std::max_element(next_scores + batch_start, next_scores + batch_start + top_k), sequence_length)) { beam_hyp.done_ = true; @@ -466,16 +472,15 @@ __global__ void BeamSearchScorer_Process(BeamScorerState& state_cpu, } __global__ void DumpBeamScorerState(BeamScorerState* state) { - state->Print(); + state->Print(false); } void DumpBeamScorerStates(BeamScorerState& state_cpu, BeamScorerState* state, cudaStream_t stream){ - printf("\n state_cpu: \n"); - state_cpu.Print(); + state_cpu.Print(true); - printf("\n state_gpu: \n"); + cudaDeviceSynchronize(); DumpBeamScorerState<<<1, 1, 0, stream>>>(state); - cudaStreamSynchronize(stream); + cudaDeviceSynchronize(); } __global__ void DumpBeamSearchScorer(const BeamHypotheses& beam_hyp) { @@ -486,8 +491,9 @@ void DumpBeamHypotheses(gsl::span beam_hyps, cudaStream_t stream printf("\n BeamHypotheses of size %zu: \n", beam_hyps.size()); for (size_t i = 0; i < beam_hyps.size(); i++) { printf("\n [%zu]:\n", i); + cudaDeviceSynchronize(); DumpBeamSearchScorer<<<1, 1, 0, stream>>>(beam_hyps[i]); - cudaStreamSynchronize(stream); + cudaDeviceSynchronize(); } } @@ -515,6 +521,7 @@ void LaunchBeamSearchScorer_Process(BeamScorerState& state_cpu, DumpBeamScorerStates(state_cpu, state_gpu, stream); cudaDeviceSynchronize(); + BeamSearchScorer_Process<<<1, state_cpu.batch_size_, 0, stream>>>(state_cpu, state_gpu, sequences.data(), diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h index f0a7b2c1ef..bf0f999168 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h @@ -86,8 +86,8 @@ struct BeamScorerState { int hypothesis_buffer_used_; // Offset of available buffer, or length of used buffer. // Function to dump the struct data to stdout - __host__ __device__ void Print() const { - printf("BeamScorerState Dump:\n"); + __host__ __device__ void Print(bool is_cpu) const { + printf("BeamScorerState (cpu=%d) Dump:\n", is_cpu ? 1 : 0); printf(" batch_size_: %d\n", batch_size_); printf(" num_beams_: %d\n", num_beams_); printf(" max_length_: %d\n", max_length_);