dump more data

This commit is contained in:
Tianlei Wu 2025-02-04 18:25:10 -08:00
parent 7fe7ea2f0e
commit 13399efe02
2 changed files with 15 additions and 8 deletions

View file

@ -416,6 +416,10 @@ __global__ void BeamSearchScorer_Process(BeamScorerState& state_cpu,
int32_t next_index = next_indices[batch * top_k + j];
int batch_beam_idx = batch_start + next_index;
// Debug trace of the candidate currently being examined for this batch entry.
// j is printed with %llu after an explicit widening cast: the earlier "%uz" was
// parsed as "%u" followed by a literal 'z', and device-side printf documents only
// the h / l / ll size modifiers, so "%zu" is not a safe substitute here.
// NOTE(review): j's declaration is outside this view; it is used as a non-negative
// array index above, so the cast to unsigned long long is value-preserving.
printf("\nbatch=%d batch_beam_idx=%d j=%llu next_token=%d eos_token_id=%d next_score=%f next_index=%d\n",
batch, batch_beam_idx, static_cast<unsigned long long>(j), next_token, state.eos_token_id_, next_score, next_index);
// Add to generated hypotheses if end of sentence.
if ((state.eos_token_id_ >= 0) && (next_token == state.eos_token_id_)) {
bool is_beam_token_worse_than_top_num_beams = (j >= state.num_beams_);
@ -444,6 +448,8 @@ __global__ void BeamSearchScorer_Process(BeamScorerState& state_cpu,
}
// Check if we are done so that we can save a pad step if all(done)
printf("\n beam_hyp.beams_used_ == state.num_beams_ is %d\n", beam_hyp.beams_used_ == state.num_beams_ ? 1 : 0);
if (beam_hyp.beams_used_ == state.num_beams_) {
if (state.early_stopping_ || !beam_hyp.CanImprove(*std::max_element(next_scores + batch_start, next_scores + batch_start + top_k), sequence_length)) {
beam_hyp.done_ = true;
@ -466,16 +472,15 @@ __global__ void BeamSearchScorer_Process(BeamScorerState& state_cpu,
}
// Debug kernel: prints the BeamScorerState that `state` points to by calling its
// Print method from device code. It does no indexing on threadIdx/blockIdx.
// NOTE(review): this text looks like a rendered commit diff with the +/- markers
// stripped, so the two Print calls below are presumably the pre-change and
// post-change versions of a single line, not two consecutive calls - confirm
// against the repository before relying on either form.
__global__ void DumpBeamScorerState(BeamScorerState* state) {
state->Print();
// The bool argument is Print's `is_cpu` flag; false labels the dump as cpu=0.
state->Print(false);
}
// Host-side debug helper: prints `state_cpu` directly from the host, then launches
// DumpBeamScorerState with one block of one thread on `stream` to print the state
// behind `state` from device code.
//   state_cpu - state object whose Print method is called on the host.
//   state     - pointer handed to the kernel; presumably the device-resident copy
//               (the output is labelled "state_gpu") - TODO confirm.
//   stream    - CUDA stream used for the kernel launch.
// NOTE(review): this text looks like a rendered commit diff with the +/- markers
// stripped, so the paired Print calls and the mix of stream/device synchronization
// calls below are presumably old and new versions of the same lines - confirm
// against the repository.
// None of the CUDA calls here have their return status checked, and there is no
// cudaGetLastError() after the launch, so failures are silent.
void DumpBeamScorerStates(BeamScorerState& state_cpu, BeamScorerState* state, cudaStream_t stream){
printf("\n state_cpu: \n");
state_cpu.Print();
// The bool argument is Print's `is_cpu` flag; true labels the dump as cpu=1.
state_cpu.Print(true);
printf("\n state_gpu: \n");
// Blocks the host until all previously issued device work has finished before the
// dump kernel is launched.
cudaDeviceSynchronize();
DumpBeamScorerState<<<1, 1, 0, stream>>>(state);
// Waits for the dump kernel; device-side printf output is flushed at
// synchronization points such as these, so the dump appears before returning.
cudaStreamSynchronize(stream);
cudaDeviceSynchronize();
}
__global__ void DumpBeamSearchScorer(const BeamHypotheses& beam_hyp) {
@ -486,8 +491,9 @@ void DumpBeamHypotheses(gsl::span<BeamHypotheses> beam_hyps, cudaStream_t stream
printf("\n BeamHypotheses of size %zu: \n", beam_hyps.size());
for (size_t i = 0; i < beam_hyps.size(); i++) {
printf("\n [%zu]:\n", i);
cudaDeviceSynchronize();
DumpBeamSearchScorer<<<1, 1, 0, stream>>>(beam_hyps[i]);
cudaStreamSynchronize(stream);
cudaDeviceSynchronize();
}
}
@ -515,6 +521,7 @@ void LaunchBeamSearchScorer_Process(BeamScorerState& state_cpu,
DumpBeamScorerStates(state_cpu, state_gpu, stream);
cudaDeviceSynchronize();
BeamSearchScorer_Process<<<1, state_cpu.batch_size_, 0, stream>>>(state_cpu,
state_gpu,
sequences.data(),

View file

@ -86,8 +86,8 @@ struct BeamScorerState {
int hypothesis_buffer_used_; // Offset of available buffer, or length of used buffer.
// Function to dump the struct data to stdout
__host__ __device__ void Print() const {
printf("BeamScorerState Dump:\n");
__host__ __device__ void Print(bool is_cpu) const {
printf("BeamScorerState (cpu=%d) Dump:\n", is_cpu ? 1 : 0);
printf(" batch_size_: %d\n", batch_size_);
printf(" num_beams_: %d\n", num_beams_);
printf(" max_length_: %d\n", max_length_);