From 7fe7ea2f0e4db13ba8f6039ded2476932948cacd Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Tue, 4 Feb 2025 16:56:59 -0800 Subject: [PATCH] sync --- .../cuda/transformers/generation_cuda_impl.cu | 34 +++++++++++++------ .../cuda/transformers/generation_cuda_impl.h | 2 +- .../transformers/generation_device_helper.cc | 2 +- 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu index faa167a375..5b960460c4 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu @@ -383,7 +383,7 @@ __device__ void BeamHypotheses::Output( } __global__ void BeamSearchScorer_Process(BeamScorerState& state_cpu, - BeamScorerState& state, + BeamScorerState* state_gpu, const int32_t* sequences_buffer, int sequence_length, BeamHypotheses* beam_hyps_, @@ -397,6 +397,8 @@ __global__ void BeamSearchScorer_Process(BeamScorerState& state_cpu, // Sequences shape is (batch_size * num_beams, total_sequence_length) // It contains word ID of whole sequence generated so far. // It is different from subgraph input_ids, which only need one word when past state is not empty. + BeamScorerState& state = *state_gpu; + printf("\n >>> BeamSearchScorer_Process \n"); int batch = threadIdx.x; @@ -463,11 +465,11 @@ __global__ void BeamSearchScorer_Process(BeamScorerState& state_cpu, printf("\n <<< BeamSearchScorer_Process \n"); } -__global__ void DumpBeamScorerState(BeamScorerState& state) { - state.Print(); +__global__ void DumpBeamScorerState(BeamScorerState* state) { + state->Print(); } -void DumpBeamScorerStates(BeamScorerState& state_cpu, BeamScorerState& state, cudaStream_t stream){ +void DumpBeamScorerStates(BeamScorerState& state_cpu, BeamScorerState* state, cudaStream_t stream){ printf("\n state_cpu: \n"); state_cpu.Print(); @@ -490,7 +492,7 @@ void DumpBeamHypotheses(gsl::span beam_hyps, cudaStream_t stream } void LaunchBeamSearchScorer_Process(BeamScorerState& state_cpu, - BeamScorerState& state, + BeamScorerState* state_gpu, gsl::span sequences, int sequence_length, gsl::span beam_hyps, @@ -502,13 +504,19 @@ void LaunchBeamSearchScorer_Process(BeamScorerState& state_cpu, gsl::span next_tokens, gsl::span next_indices, cudaStream_t stream) { - printf("\n >>> LaunchBeamSearchScorer_Process \n"); + size_t printfBufferSize = 10 * 1024 * 1024; + cudaDeviceSetLimit(cudaLimitPrintfFifoSize, printfBufferSize); + cudaDeviceSynchronize(); + printf("\n >>> LaunchBeamSearchScorer_Process \n"); + cudaDeviceSynchronize(); DumpBeamHypotheses(beam_hyps, stream); - DumpBeamScorerStates(state_cpu, state, stream); + cudaDeviceSynchronize(); + DumpBeamScorerStates(state_cpu, state_gpu, stream); + cudaDeviceSynchronize(); BeamSearchScorer_Process<<<1, state_cpu.batch_size_, 0, stream>>>(state_cpu, - state, + state_gpu, sequences.data(), sequence_length, beam_hyps.data(), @@ -520,10 +528,14 @@ void LaunchBeamSearchScorer_Process(BeamScorerState& state_cpu, next_tokens.data(), next_indices.data()); + cudaDeviceSynchronize(); DumpBeamHypotheses(beam_hyps, stream); - DumpBeamScorerStates(state_cpu, state, stream); - + cudaDeviceSynchronize(); + DumpBeamScorerStates(state_cpu, state_gpu, stream); + cudaDeviceSynchronize(); printf("\n <<< LaunchBeamSearchScorer_Process \n"); + cudaDeviceSynchronize(); + } __global__ void BeamSearchScorer_AppendNextTokenToSequences1(BeamScorerState& state, @@ -642,6 +654,7 @@ void LaunchBeamSearchScorer_Finalize(int batch_size, gsl::span sequence_scores, cudaStream_t stream) { printf("\n >>> LaunchBeamSearchScorer_Finalize \n"); + cudaDeviceSynchronize(); BeamSearchScorer_Finalize<<<1, batch_size, 0, stream>>>(state, sequences.data(), sequence_length, @@ -649,6 +662,7 @@ void LaunchBeamSearchScorer_Finalize(int batch_size, final_beam_scores.data(), output.data(), sequence_scores.data()); + cudaDeviceSynchronize(); printf("\n <<< LaunchBeamSearchScorer_Finalize \n"); } diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h index 0a68a822c7..f0a7b2c1ef 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h @@ -103,7 +103,7 @@ struct BeamScorerState { void LaunchInitializeBeamHypotheses(gsl::span beam_hyps, float length_penalty, gsl::span beams, int num_beams, cudaStream_t stream); void LaunchBeamSearchScorer_Process(BeamScorerState& state_cpu, - BeamScorerState& state, + BeamScorerState* state, gsl::span sequences, int sequence_length, gsl::span beam_hyps_, diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc index b066744887..c568a0338f 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc @@ -726,7 +726,7 @@ void CudaBeamSearchScorer::Process(transformers::ISequences& sequences, state_cpu_->Print(); cuda::LaunchBeamSearchScorer_Process(*state_cpu_, - *state_gpu_, + state_gpu_.get(), sequences.GetCurrentDeviceSequences(), sequences.GetSequenceLength(), beam_hyps_,