From 3c3103e5df31e565a161bead2415b431a7bd5cdb Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Fri, 24 Jan 2025 15:52:15 -0800 Subject: [PATCH] add debug code --- .../cpu/transformers/generation_shared.h | 2 +- onnxruntime/contrib_ops/cpu/utils/debug_macros.h | 2 +- .../cuda/transformers/generation_cuda_impl.cu | 5 ++++- .../cuda/transformers/generation_cuda_impl.h | 16 +++++++++++++++- .../transformers/generation_device_helper.cc | 8 ++++++++ 5 files changed, 29 insertions(+), 4 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/transformers/generation_shared.h b/onnxruntime/contrib_ops/cpu/transformers/generation_shared.h index 8145fbd4a4..635c485296 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/generation_shared.h +++ b/onnxruntime/contrib_ops/cpu/transformers/generation_shared.h @@ -8,7 +8,7 @@ #include #include "core/framework/allocator.h" #include "core/framework/ort_value.h" -#include "contrib_ops/cpu/utils/debug_macros.h" +#include "contrib_ops/cpu/utils/console_dumper.h" namespace onnxruntime { diff --git a/onnxruntime/contrib_ops/cpu/utils/debug_macros.h b/onnxruntime/contrib_ops/cpu/utils/debug_macros.h index d5cbaa0a3e..d6cea821ce 100644 --- a/onnxruntime/contrib_ops/cpu/utils/debug_macros.h +++ b/onnxruntime/contrib_ops/cpu/utils/debug_macros.h @@ -1,7 +1,7 @@ #pragma once #include "core/common/make_string.h" -// #define DEBUG_GENERATION 1 // uncomment it for debugging generation (like beam search etc) +#define DEBUG_GENERATION 1 // enabled for debugging generation (like beam search etc); comment it out to disable #ifdef DEBUG_GENERATION #define DUMP_TENSOR_LEVEL 2 diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu index eb1943b59d..45ed087aac 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu @@ -5,7 +5,7 @@ // cub.cuh includes device/dispatch_radix_sort.cuh which has 
assignment in conditional expressions #if defined(_MSC_VER) #pragma warning(push) -#pragma warning(disable : 4706) +#pragma warning(disable : 4706) #endif #include #if defined(_MSC_VER) @@ -406,6 +406,9 @@ __global__ void BeamSearchScorer_Process(BeamScorerState& state_cpu, beam_hyp.done_ = true; if (atomicAdd(&state.not_done_count_, -1) == 1) state_cpu.not_done_count_ = 0; // Update the CPU side + + printf("\n --- BeamSearchScorer_Process updated cpu state for batch %d\n", threadIdx.x); + state_cpu.Print(); } } } else { diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h index 281cb6c725..0a68a822c7 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h @@ -6,6 +6,7 @@ #include #include #include +#include namespace onnxruntime { namespace contrib { @@ -82,8 +83,21 @@ struct BeamScorerState { int eos_token_id_; bool early_stopping_; int not_done_count_; // When zero, every batch entry is done (starts at batch_size_) - int hypothesis_buffer_used_; // Offset of available buffer, or length of used buffer. + + // Function to dump the struct data to stdout + __host__ __device__ void Print() const { + printf("BeamScorerState Dump:\n"); + printf(" batch_size_: %d\n", batch_size_); + printf(" num_beams_: %d\n", num_beams_); + printf(" max_length_: %d\n", max_length_); + printf(" num_return_sequences_: %d\n", num_return_sequences_); + printf(" pad_token_id_: %d\n", pad_token_id_); + printf(" eos_token_id_: %d\n", eos_token_id_); + printf(" early_stopping_: %s\n", early_stopping_ ? 
"true" : "false"); + printf(" not_done_count_: %d\n", not_done_count_); + printf(" hypothesis_buffer_used_: %d\n", hypothesis_buffer_used_); + } }; void LaunchInitializeBeamHypotheses(gsl::span beam_hyps, float length_penalty, gsl::span beams, int num_beams, cudaStream_t stream); diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc index 4e65336665..b066744887 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc @@ -722,6 +722,9 @@ void CudaBeamSearchScorer::Process(transformers::ISequences& sequences, gsl::span& next_scores, gsl::span& next_tokens, gsl::span& next_indices) { + printf("\n---Process ---\n"); + state_cpu_->Print(); + cuda::LaunchBeamSearchScorer_Process(*state_cpu_, *state_gpu_, sequences.GetCurrentDeviceSequences(), @@ -735,6 +738,7 @@ void CudaBeamSearchScorer::Process(transformers::ISequences& sequences, next_tokens, next_indices, stream_); + CUDA_CALL_THROW(cudaEventRecord(event_process_complete_.Get(), stream_)); cuda::LaunchBeamSearchScorer_AppendNextTokenToSequences(*state_cpu_, @@ -749,6 +753,10 @@ void CudaBeamSearchScorer::Process(transformers::ISequences& sequences, bool CudaBeamSearchScorer::IsDoneLater() const { CUDA_CALL_THROW(cudaEventSynchronize(event_process_complete_.Get())); + + printf("\n---IsDoneLater ---\n"); + state_cpu_->Print(); + return state_cpu_->not_done_count_ == 0; }