diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu index 891a898521..e0c0dcc28c 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu @@ -267,37 +267,6 @@ __global__ void InitializeBeamHypotheses(BeamHypotheses* beam_hyps, int beam_hyp beam_hyp.done_ = false; } -// Function to dump the data in the struct -__device__ void dump_hypothesis_score(const struct HypothesisScore* hs) { - printf("HypothesisScore Dump:\n"); - printf(" hypothesis_length: %d\n", hs->hypothesis_length); - printf(" score: %f\n", hs->score); - - printf(" hypothesis: "); - if (hs->hypothesis_length > 0 && hs->hypothesis != NULL) { - for (int i = 0; i < hs->hypothesis_length; ++i) { - printf("%d ", hs->hypothesis[i]); - } - } else { - printf("(empty)"); - } - printf("\n"); -} - -__device__ void dump_beam_hypotheses(const struct BeamHypotheses* bh) { - printf("BeamHypotheses Dump:\n"); - printf(" beams_count_: %d\n", bh->beams_count_); - printf(" beams_used_: %d\n", bh->beams_used_); - printf(" length_penalty_: %f\n", bh->length_penalty_); - printf(" done_: %s\n", bh->done_ ? "true" : "false"); - - printf(" beams_: %d\n", bh->beams_used_); - for (int i = 0; i < bh->beams_used_; ++i) { - printf(" Beam %d:\n", i + 1); - dump_hypothesis_score(&bh->beams_[i]); - } -} - // For counts that are typically far less than 256, this will round up the count to the next multiple of 32 // If this winds up being >256 then it uses a block size of 256 and calculates the appropriate grid_size struct GridBlock32 { @@ -329,7 +298,9 @@ void LaunchInitializeBeamHypotheses(gsl::span beam_hyps, __device__ void BeamHypotheses::Add(const int32_t* hypothesis, int hypothesis_length, float sum_logprobs) { float score = sum_logprobs / pow(static_cast(hypothesis_length), length_penalty_); +#ifdef DEBUG_GENERATION printf("\n BeamHypotheses::Add (score=%f hypothesis_length=%d sum_logprobs=%f) \n", score, hypothesis_length, sum_logprobs); +#endif size_t index = beams_used_; // If the array is full, don't add unless it's better than the worst element @@ -345,15 +316,18 @@ __device__ void BeamHypotheses::Add(const int32_t* hypothesis, int hypothesis_le beams_[index] = HypothesisScore{hypothesis, hypothesis_length, score}; +#ifdef DEBUG_GENERATION printf("\n BeamHypotheses::Add (index=%d) \n", static_cast(index)); - // dump_hypothesis_score(&beams_[index]); +#endif } __device__ bool BeamHypotheses::CanImprove(float best_sum_logprobs, int current_length) const { float current_score = best_sum_logprobs / pow(static_cast(current_length), length_penalty_); bool result = beams_[beams_count_ - 1].score < current_score; +#ifdef DEBUG_GENERATION printf("\n BeamHypotheses::CanImprove (current_score=%f beams_[%d].score=%f can_improve=%d) \n", current_score, beams_count_ - 1, beams_[beams_count_ - 1].score, static_cast(result)); +#endif return result; } @@ -398,8 +372,9 @@ __global__ void BeamSearchScorer_Process(BeamScorerState& state_cpu, // It contains word ID of whole sequence generated so far. // It is different from subgraph input_ids, which only need one word when past state is not empty. BeamScorerState& state = *state_gpu; - +#ifdef DEBUG_GENERATION printf("\n >>> BeamSearchScorer_Process \n"); +#endif int batch = threadIdx.x; int batch_start = batch * state.num_beams_; @@ -417,9 +392,10 @@ __global__ void BeamSearchScorer_Process(BeamScorerState& state_cpu, int batch_beam_idx = batch_start + next_index; +#ifdef DEBUG_GENERATION printf("\nbatch=%d batch_beam_idx=%d j=%d next_token=%d eos_token_id=%d next_score=%f next_index=%d\n", batch, batch_beam_idx, static_cast(j), next_token, state.eos_token_id_, next_score, next_index); - +#endif // Add to generated hypotheses if end of sentence. if ((state.eos_token_id_ >= 0) && (next_token == state.eos_token_id_)) { bool is_beam_token_worse_than_top_num_beams = (j >= state.num_beams_); @@ -448,15 +424,23 @@ __global__ void BeamSearchScorer_Process(BeamScorerState& state_cpu, } // Check if we are done so that we can save a pad step if all(done) +#ifdef DEBUG_GENERATION printf("\n beam_hyp.beams_used_ == state.num_beams_ is %d\n", beam_hyp.beams_used_ == state.num_beams_ ? 1 : 0); - +#endif if (beam_hyp.beams_used_ == state.num_beams_) { - if (state.early_stopping_ || !beam_hyp.CanImprove(*std::max_element(next_scores + batch_start, next_scores + batch_start + top_k), sequence_length)) { + bool is_done = state.early_stopping_; + if (!is_done) { + float best_sum_logprobs = *std::max_element(next_scores + batch_start, next_scores + batch_start + top_k); + is_done = !beam_hyp.CanImprove(*std::max_element(next_scores + batch_start, next_scores + batch_start + top_k), sequence_length); + } + + if (is_done) { beam_hyp.done_ = true; if (atomicAdd(&state.not_done_count_, -1) == 1) state_cpu.not_done_count_ = 0; // Update the CPU side - - printf("\n --- BeamSearchScorer_Process updated cpu state for batch %d\n", threadIdx.x); +#ifdef DEBUG_GENERATION + printf("\n --- BeamSearchScorer_Process updated cpu state for batch %d\n", batch); +#endif } } } else { @@ -467,8 +451,9 @@ __global__ void BeamSearchScorer_Process(BeamScorerState& state_cpu, next_beam_indices_[batch_start + beam_idx] = 0; } } - +#ifdef DEBUG_GENERATION printf("\n <<< BeamSearchScorer_Process \n"); +#endif } __global__ void DumpBeamScorerState(BeamScorerState* state) { @@ -484,7 +469,7 @@ void DumpBeamScorerStates(BeamScorerState& state_cpu, BeamScorerState* state, cu } __global__ void DumpBeamSearchScorer(const BeamHypotheses& beam_hyp) { - dump_beam_hypotheses(&beam_hyp); + beam_hyp.Print(); } void DumpBeamHypotheses(gsl::span beam_hyps, cudaStream_t stream) { @@ -510,15 +495,13 @@ void LaunchBeamSearchScorer_Process(BeamScorerState& state_cpu, gsl::span next_tokens, gsl::span next_indices, cudaStream_t stream) { - // size_t printfBufferSize = 10 * 1024 * 1024; - // cudaDeviceSetLimit(cudaLimitPrintfFifoSize, printfBufferSize); - // cudaDeviceSynchronize(); +#ifdef DEBUG_GENERATION printf("\n >>> LaunchBeamSearchScorer_Process \n"); - // cudaDeviceSynchronize(); DumpBeamHypotheses(beam_hyps, stream); cudaDeviceSynchronize(); DumpBeamScorerStates(state_cpu, state_gpu, stream); cudaDeviceSynchronize(); +#endif BeamSearchScorer_Process<<<1, state_cpu.batch_size_, 0, stream>>>(state_cpu, state_gpu, @@ -533,13 +516,14 @@ void LaunchBeamSearchScorer_Process(BeamScorerState& state_cpu, next_tokens.data(), next_indices.data()); +#ifdef DEBUG_GENERATION cudaDeviceSynchronize(); DumpBeamHypotheses(beam_hyps, stream); cudaDeviceSynchronize(); DumpBeamScorerStates(state_cpu, state_gpu, stream); cudaDeviceSynchronize(); printf("\n <<< LaunchBeamSearchScorer_Process \n"); - // cudaDeviceSynchronize(); +#endif } __global__ void BeamSearchScorer_AppendNextTokenToSequences1(BeamScorerState& state, @@ -575,7 +559,9 @@ void LaunchBeamSearchScorer_AppendNextTokenToSequences(BeamScorerState& state_cp gsl::span next_beam_tokens, gsl::span next_beam_indices, cudaStream_t stream) { +#ifdef DEBUG_GENERATION printf("\n >>> LaunchBeamSearchScorer_AppendNextTokenToSequences \n"); +#endif const int max_threads = 512; int batch_beam_size = state_cpu.batch_size_ * state_cpu.num_beams_; @@ -610,7 +596,9 @@ void LaunchBeamSearchScorer_AppendNextTokenToSequences(BeamScorerState& state_cp next_sequences.data(), sequence_length, next_beam_tokens.data()); +#ifdef DEBUG_GENERATION printf("\n <<< LaunchBeamSearchScorer_AppendNextTokenToSequences \n"); +#endif } template @@ -657,7 +645,10 @@ void LaunchBeamSearchScorer_Finalize(int batch_size, gsl::span output, gsl::span sequence_scores, cudaStream_t stream) { +#ifdef DEBUG_GENERATION printf("\n >>> LaunchBeamSearchScorer_Finalize \n"); +#endif + cudaDeviceSynchronize(); BeamSearchScorer_Finalize<<<1, batch_size, 0, stream>>>(state, sequences.data(), @@ -667,7 +658,10 @@ void LaunchBeamSearchScorer_Finalize(int batch_size, output.data(), sequence_scores.data()); cudaDeviceSynchronize(); + +#ifdef DEBUG_GENERATION printf("\n <<< LaunchBeamSearchScorer_Finalize \n"); +#endif } template void LaunchBeamSearchScorer_Finalize( diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h index bf0f999168..958076257c 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h @@ -7,6 +7,7 @@ #include #include #include +#include "contrib_ops/cpu/transformers/generation_shared.h" namespace onnxruntime { namespace contrib { @@ -50,6 +51,21 @@ struct HypothesisScore { const int32_t* hypothesis; int hypothesis_length; float score; + +#ifdef DEBUG_GENERATION + __device__ void Print() const { + printf("HypothesisScore (hypothesis_length=%d, score=%f) \n", hypothesis_length, score); + printf(" hypothesis:"); + if (hypothesis_length > 0 && hypothesis != NULL) { + for (int i = 0; i < hypothesis_length; ++i) { + printf("%d ", hypothesis[i]); + } + } else { + printf("(empty)"); + } + printf("\n"); +} +#endif }; struct BeamHypotheses { @@ -72,6 +88,22 @@ struct BeamHypotheses { int pad_token_id, // pad token int32_t* sequences, // buffer with pad token, shape (num_return_sequences, max_length) T* sequences_scores); // buffer for sequence scores, with shape (num_return_sequences) + +#ifdef DEBUG_GENERATION + __device__ void Print() const { + printf("BeamHypotheses:\n"); + printf(" beams_count: %d\n", beams_count_); + printf(" beams_used: %d\n", beams_used_); + printf(" length_penalty: %f\n", length_penalty_); + printf(" done: %s\n", done_ ? "true" : "false"); + + printf(" beams:\n"); + for (int i = 0; i < beams_used_; ++i) { + printf(" Beam %d:\n", i + 1); + beams_[i].Print(); + } + } +#endif }; struct BeamScorerState { @@ -85,7 +117,7 @@ struct BeamScorerState { int not_done_count_; // When zero, every batch entry is done (starts at batch_size_) int hypothesis_buffer_used_; // Offset of available buffer, or length of used buffer. - // Function to dump the struct data to stdout +#ifdef DEBUG_GENERATION __host__ __device__ void Print(bool is_cpu) const { printf("BeamScorerState (cpu=%d) Dump:\n", is_cpu ? 1 : 0); printf(" batch_size_: %d\n", batch_size_); @@ -98,6 +130,7 @@ struct BeamScorerState { printf(" not_done_count_: %d\n", not_done_count_); printf(" hypothesis_buffer_used_: %d\n", hypothesis_buffer_used_); } +#endif }; void LaunchInitializeBeamHypotheses(gsl::span beam_hyps, float length_penalty, gsl::span beams, int num_beams, cudaStream_t stream); diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc index 7b3376e949..dbf11b683a 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc @@ -524,6 +524,7 @@ Status ProcessLogits(const OrtValue& logits, // beam_state->remaining_scores = beam_state->remaining_scores.subspan(next_token_scores.size()); } + gsl::span out_scores = beam_state->next_scores; if (num_beams <= 32) { constexpr size_t max_parts_of_vocab = 128; size_t candidate_count = SafeInt(batch_beam_size) * 2 * num_beams; @@ -588,16 +589,17 @@ Status ProcessLogits(const OrtValue& logits, // cuda::LaunchNextTokenKernel(next_token_indices, beam_state->next_indices.data(), beam_state->next_tokens.data(), batch_size, top_k, vocab_size, cuda_stream); - // BUG?: Copy topk_scores to next_scores #ifdef DEBUG_GENERATION dumper->Print("next_scores before scorer", topk_scores->Data(), batch_size, top_k); dumper->Print("next_tokens before scorer", beam_state->next_tokens.data(), batch_size, top_k); dumper->Print("next_indices before scorer", beam_state->next_indices.data(), batch_size, top_k); #endif + // use topk_scores as next_scores + out_scores = gsl::span(topk_scores->MutableData(), batch_size * top_k); } // gsl::span doesn't convert from non const to const, so all we're doing here is making each const. - gsl::span next_scores(beam_state->next_scores.data(), beam_state->next_scores.size()); + gsl::span next_scores(out_scores.data(), out_scores.size()); gsl::span next_tokens(beam_state->next_tokens.data(), beam_state->next_tokens.size()); gsl::span next_indices(beam_state->next_indices.data(), beam_state->next_indices.size()); @@ -723,8 +725,10 @@ void CudaBeamSearchScorer::Process(transformers::ISequences& sequences, gsl::span& next_scores, gsl::span& next_tokens, gsl::span& next_indices) { +#ifdef DEBUG_GENERATION printf("\n---Process ---\n"); state_cpu_->Print(true); +#endif cuda::LaunchBeamSearchScorer_Process(*state_cpu_, state_gpu_.get(), @@ -755,8 +759,10 @@ void CudaBeamSearchScorer::Process(transformers::ISequences& sequences, bool CudaBeamSearchScorer::IsDoneLater() const { CUDA_CALL_THROW(cudaEventSynchronize(event_process_complete_.Get())); +#ifdef DEBUG_GENERATION printf("\n---IsDoneLater ---\n"); state_cpu_->Print(true); +#endif return state_cpu_->not_done_count_ == 0; }