BeamScorer to use contiguous arrays for BeamHypotheses (#15923)

### Description

Change BeamHypotheses to no longer use a std::priority_queue. Instead, all BeamHypotheses share a single buffer, and each one gets a small slice of it. Because the beam count is small (typically 4 or 8, with a maximum of 32) and the array size is fixed, each BeamHypotheses simply does a sorted insert into its array. This also allows the BeamHypotheses inside the BeamSearchScorer to be a single fixed allocation instead of an onnxruntime::FastAllocVector.

### Motivation and Context

The goal is to simplify the memory usage and make the code easier to port to CUDA.
2026-07-13 18:08:13 +00:00 · 2023-05-13 14:17:45 -07:00 · 2023-05-13 14:17:45 -07:00 · 310273cbe4
commit 310273cbe4
parent 896a963492
7 changed files with 70 additions and 117 deletions
--- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h
+++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h
@ -202,12 +202,7 @@ Status BeamSearchGpt<T>::Execute(const FeedsFetchesManager* init_run_feeds_fetch
  std::vector<OrtValue> fetches;

  // Initialize resources
-  onnxruntime::OrtStlAllocator<HypothesisScore> hypothesis_score_allocator(this->cpu_allocator_);
-  onnxruntime::OrtStlAllocator<BeamHypotheses> beam_hyps_allocator(this->cpu_allocator_);
-  this->beam_scorer_ = std::make_unique<BeamSearchScorer>(*parameters,
-                                                          hypothesis_score_allocator,
-                                                          beam_hyps_allocator,
-                                                          this->cpu_allocator_);
+  this->beam_scorer_ = std::make_unique<BeamSearchScorer>(*parameters, this->cpu_allocator_);

  BeamSearchCpuState cpu_state{*parameters,
                               this->cpu_allocator_,
@ -233,11 +228,10 @@ Status BeamSearchGpt<T>::Execute(const FeedsFetchesManager* init_run_feeds_fetch
    }
  }

-  constexpr bool use_position = true;
  BeamSearchState<T> beam_state{*parameters,
                                this->temp_space_allocator_,
                                gpt_subgraph_.has_decoder_masked_attention_,
-                                use_position};
+                                true /* use_position */};

  init_beam_state_func_(&beam_state,
                        cpu_state.sequence_lengths,
@ -245,8 +239,7 @@ Status BeamSearchGpt<T>::Execute(const FeedsFetchesManager* init_run_feeds_fetch
                        parameters->num_beams,
                        this->ort_stream_);

-  gsl::span<const int32_t> input_ids = expanded_input_ids_in_cpu.Get<Tensor>().DataAsSpan<int32_t>();
-  cpu_state.SetExpandedSequence(input_ids);
+  cpu_state.SetExpandedSequence(expanded_input_ids_in_cpu.Get<Tensor>().DataAsSpan<int32_t>());

 #ifdef DEBUG_GENERATION
  const IConsoleDumper* dumper = this->GetConsoleDumper();
@ -382,7 +375,7 @@ Status BeamSearchGpt<T>::Execute(const FeedsFetchesManager* init_run_feeds_fetch
    final_beam_scores = cpu_state.final_beam_scores;
  }

-  this->beam_scorer_->Finalize(&(cpu_state.sequences),
+  this->beam_scorer_->Finalize(cpu_state.sequences,
                               final_beam_scores,
                               output_sequences,
                               output_sequences_scores);
--- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_t5.h
+++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_t5.h
@ -183,18 +183,12 @@ Status BeamSearchT5<T>::Execute(const FeedsFetchesManager& encoder_feeds_fetches
  // Copy decoder_input_ids (in CPU) to sequence. It contains decoder_start_token_id for each beam.
  cpu_state.SetUnexpandedSequence(decoder_input_ids.Get<Tensor>().DataAsSpan<int32_t>());

-  onnxruntime::OrtStlAllocator<HypothesisScore> hypothesis_score_allocator(this->cpu_allocator_);
-  onnxruntime::OrtStlAllocator<BeamHypotheses> beam_hyps_allocator(this->cpu_allocator_);
-  this->beam_scorer_ = std::make_unique<BeamSearchScorer>(*parameters,
-                                                          hypothesis_score_allocator,
-                                                          beam_hyps_allocator,
-                                                          this->cpu_allocator_);
+  this->beam_scorer_ = std::make_unique<BeamSearchScorer>(*parameters, this->cpu_allocator_);

-  constexpr bool use_position = false;
  BeamSearchState<T> beam_state{*parameters,
                                this->temp_space_allocator_,
                                decoder_subgraph_.has_decoder_masked_attention_,
-                                use_position};
+                                false /* use_position */};

  init_beam_state_func_(&beam_state,
                        cpu_state.sequence_lengths,
@ -373,7 +367,7 @@ Status BeamSearchT5<T>::Execute(const FeedsFetchesManager& encoder_feeds_fetches
    final_beam_scores = cpu_state.final_beam_scores;
  }

-  this->beam_scorer_->Finalize(&(cpu_state.sequences),
+  this->beam_scorer_->Finalize(cpu_state.sequences,
                               final_beam_scores,
                               output_sequences,
                               output_sequences_scores);
--- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc
+++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc
@ -18,43 +18,44 @@ namespace contrib {
 namespace transformers {
 using ::onnxruntime::rnn::detail::Allocate;

-BeamHypotheses::BeamHypotheses(int num_beams,
-                               float length_penalty,
-                               bool early_stopping,
-                               onnxruntime::OrtStlAllocator<HypothesisScore>& hypothesis_score_allocator)
-    : num_beams_(num_beams),
-      length_penalty_(length_penalty),
-      early_stopping_(early_stopping),
-      worst_score_(1e9),
-      beams_(hypothesis_score_allocator) {
+void BeamHypotheses::Init(const IGenerationParameters& parameters, gsl::span<HypothesisScore> beams) {
+  length_penalty_ = parameters.length_penalty;
+  early_stopping_ = parameters.early_stopping;
+  beams_ = beams;
+  beams_used_ = 0;
 }

 void BeamHypotheses::Add(gsl::span<const int32_t>& hypothesis, float sum_logprobs) {
  auto length = hypothesis.size();
  float score = sum_logprobs / pow(static_cast<float>(length), length_penalty_);

-  if (this->Size() < num_beams_ || score > worst_score_) {
-    HypothesisScore item(hypothesis, score);
-    beams_.push(item);
-    if (this->Size() > num_beams_) {
-      beams_.pop();
-    }
-    worst_score_ = beams_.top().score;
-  }
+  size_t index = beams_used_;
+  // If the array is full, don't add unless it's better than the worst element
+  if (index == beams_.size()) {
+    if (score <= beams_[--index].score)
+      return;
+  } else
+    beams_used_++;
+
+  // Rotate existing elements over while the new element scores higher
+  for (; index > 0 && score > beams_[index - 1].score; index--)
+    beams_[index] = beams_[index - 1];
+
+  beams_[index] = HypothesisScore{hypothesis, score};
 }

-bool BeamHypotheses::IsDone(float best_sum_logprobs, int current_length) {
+bool BeamHypotheses::IsDone(float best_sum_logprobs, int current_length) const {
  // If there are enough hypotheses and that none of the hypotheses being generated can become better
  // than the worst one in the heap, then we are done with this sentence.

-  if (Size() < num_beams_)
+  if (static_cast<size_t>(beams_used_) < beams_.size())
    return false;

  if (early_stopping_)
    return true;

  float current_score = best_sum_logprobs / pow(static_cast<float>(current_length), length_penalty_);
-  return worst_score_ >= current_score;
+  return beams_.back().score >= current_score;
 }

 void BeamHypotheses::Output(
@ -63,63 +64,49 @@ void BeamHypotheses::Output(
    gsl::span<int32_t>& sequences,       // buffer filled with pad token ID, shape (num_return_sequences, max_length)
    gsl::span<float>& sequences_scores)  // buffer of shape (num_return_sequences) or empty
 {
-  ORT_ENFORCE(top_k <= Size());
-  int remove_count = Size() - top_k;
-  for (int i = 0; i < remove_count; i++) {
-    beams_.pop();
-  }
-
-  // Since pop get the worst sequence, so output it in the reverse order.
-  // The first (worst) beam shall be put at the last position among top_k sequences.
-  int index = top_k - 1;
-  while (!beams_.empty()) {
-    auto item = beams_.top();
-    gsl::span<const int32_t>& source = item.hypothesis;
+  // Copy the top_k beams into the sequences
+  ORT_ENFORCE(top_k <= beams_used_);
+  for (int index = 0; index < top_k; index++) {
+    auto& item = beams_[index];
    gsl::span<int32_t> target = sequences.subspan(static_cast<gsl::index>(index) * max_length, max_length);

    // Note that word_ids might be less than max_length.
    // Since the sequences has been filled with pad token ID, so padding is not needed here.
-    gsl::copy(source, target);
+    gsl::copy(item.hypothesis, target);

    if (!sequences_scores.empty())
      sequences_scores[index] = item.score;
-
-    beams_.pop();
-    index--;
  }
 }

 BeamSearchScorer::BeamSearchScorer(const IGenerationParameters& parameters,
-                                   onnxruntime::OrtStlAllocator<HypothesisScore>& hypothesis_score_allocator,
-                                   onnxruntime::OrtStlAllocator<BeamHypotheses>& beam_hyps_allocator,
                                   AllocatorPtr& allocator)
    : batch_size_{static_cast<size_t>(parameters.batch_size)},
      num_beams_{static_cast<size_t>(parameters.num_beams)},
      max_length_{static_cast<size_t>(parameters.max_length)},
      num_beam_hyps_to_keep_{static_cast<size_t>(parameters.num_return_sequences)},
      pad_token_id_{parameters.pad_token_id},
-      eos_token_id_{parameters.eos_token_id},
-      beam_hyps_(beam_hyps_allocator) {
-  for (size_t i = 0; i < batch_size_; i++) {
-    beam_hyps_.push_back(BeamHypotheses(num_beams_, parameters.length_penalty, parameters.early_stopping, hypothesis_score_allocator));
-  }
-
+      eos_token_id_{parameters.eos_token_id} {
  size_t batch_beam_size = batch_size_ * num_beams_;

+  auto beams = Allocate<HypothesisScore>(allocator, batch_beam_size, hypothesis_scores_ptr_);
+  beam_hyps_ = Allocate<BeamHypotheses>(allocator, batch_size_, beam_hyps_ptr_);
+  for (size_t i = 0; i < batch_size_; i++)
+    beam_hyps_[i].Init(parameters, beams.subspan(i * num_beams_, num_beams_));
+
  done_ = Allocate<bool>(allocator, batch_size_, done_ptr_, true /* fill allocated array */, false /* fill with false */);

-  constexpr bool no_fill = false;  // Do not fill values after allocation
-  next_beam_scores_ = Allocate<float>(allocator, batch_beam_size, next_beam_scores_ptr_, no_fill);
-  next_beam_tokens_ = Allocate<int32_t>(allocator, batch_beam_size, next_beam_tokens_ptr_, no_fill);
-  next_beam_indices_ = Allocate<int32_t>(allocator, batch_beam_size, next_beam_indices_ptr_, no_fill);
+  next_beam_scores_ = Allocate<float>(allocator, batch_beam_size, next_beam_scores_ptr_);
+  next_beam_tokens_ = Allocate<int32_t>(allocator, batch_beam_size, next_beam_tokens_ptr_);
+  next_beam_indices_ = Allocate<int32_t>(allocator, batch_beam_size, next_beam_indices_ptr_);

  // Space to store intermediate sequence with length sequence_length, sequence_length + 1, ..., max_sequence_length.
  size_t per_beam = (SafeInt<size_t>(max_length_) * (max_length_ + 1) - (parameters.sequence_length - 1) * parameters.sequence_length) / 2;
  hypothesis_buffer_length_ = batch_beam_size * per_beam;
-  hypothesis_buffer_ = Allocate<int32_t>(allocator, hypothesis_buffer_length_, hypothesis_buffer_ptr_, no_fill);
+  hypothesis_buffer_ = Allocate<int32_t>(allocator, hypothesis_buffer_length_, hypothesis_buffer_ptr_);
 }

-bool BeamSearchScorer::IsDone() {
+bool BeamSearchScorer::IsDone() const {
  for (auto done : done_) {
    if (!done)
      return false;
@ -127,7 +114,7 @@ bool BeamSearchScorer::IsDone() {
  return true;
 }

-void BeamSearchScorer::Process(ISequences* sequences,
+void BeamSearchScorer::Process(ISequences& sequences,
                               gsl::span<const float>& next_scores,
                               gsl::span<const int32_t>& next_tokens,
                               gsl::span<const int32_t>& next_indices) {
@ -135,7 +122,7 @@ void BeamSearchScorer::Process(ISequences* sequences,
  // It contains word ID of whole sequence generated so far.
  // It is different from subgraph input_ids, which only need one word when past state is not empty.

-  const int sequence_length = sequences->GetSequenceLength();
+  const int sequence_length = sequences.GetSequenceLength();

  ORT_ENFORCE(next_scores.size() == next_tokens.size());
  ORT_ENFORCE(next_scores.size() == next_indices.size());
@ -143,7 +130,7 @@ void BeamSearchScorer::Process(ISequences* sequences,
  for (size_t batch = 0; batch < batch_size_; batch++) {
    BeamHypotheses& beam_hyp = beam_hyps_[batch];
    if (done_[batch]) {
-      ORT_ENFORCE(beam_hyp.Size() >= gsl::narrow_cast<int>(num_beams_),
+      ORT_ENFORCE(beam_hyp.Size() == gsl::narrow_cast<int>(num_beams_),
                  "Batch can only be done if all beams have been generated");

      // Pad the batch.
@ -172,7 +159,7 @@ void BeamSearchScorer::Process(ISequences* sequences,
        }

        // Clone the sequence and append to buffer.
-        gsl::span<const int32_t> src = sequences->GetSequence(batch_beam_idx);
+        gsl::span<const int32_t> src = sequences.GetSequence(batch_beam_idx);
        auto clone = hypothesis_buffer_.subspan(hypothesis_buffer_offset_, sequence_length);
        gsl::copy(src, clone);
        hypothesis_buffer_offset_ += static_cast<size_t>(sequence_length);
@ -205,11 +192,10 @@ void BeamSearchScorer::Process(ISequences* sequences,
  }
 }

-void BeamSearchScorer::Finalize(ISequences* sequences,
+void BeamSearchScorer::Finalize(ISequences& sequences,
                                gsl::span<const float>& final_beam_scores,
                                Tensor* output_sequences,
                                Tensor* output_sequence_scores) {
-  ORT_ENFORCE(sequences != nullptr);
  ORT_ENFORCE(output_sequences != nullptr);

  // Finalize all open beam hypotheses and add to generated hypotheses.
@ -222,7 +208,7 @@ void BeamSearchScorer::Finalize(ISequences* sequences,
    for (size_t beam_index = 0; beam_index < num_beams_; beam_index++) {
      size_t batch_beam_index = batch_index * num_beams_ + beam_index;
      float final_score = final_beam_scores[batch_beam_index];
-      auto final_tokens = sequences->GetSequence(batch_beam_index);
+      auto final_tokens = sequences.GetSequence(batch_beam_index);
      beam_hyp.Add(final_tokens, final_score);
    }
  }
--- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.h
+++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.h
@ -20,69 +20,50 @@ namespace contrib {
 namespace transformers {

 struct HypothesisScore {
-  HypothesisScore(gsl::span<const int32_t>& _hypothesis, float _score)
-      : hypothesis(_hypothesis), score(_score) {}
-
  gsl::span<const int32_t> hypothesis;
  float score;
 };

-class HypothesisScoreCompare {
- public:
-  bool operator()(const HypothesisScore& a, const HypothesisScore& b) {
-    return a.score > b.score;
-  }
-};
-
-class BeamHypotheses {
- public:
-  BeamHypotheses(int num_beams,
-                 float length_penalty,
-                 bool early_stopping,
-                 onnxruntime::OrtStlAllocator<HypothesisScore>& hypothesis_score_allocator);
+struct BeamHypotheses {
+  void Init(const IGenerationParameters& parameters, gsl::span<HypothesisScore> beams);

  // Number of hypotheses
-  int Size() { return static_cast<int>(beams_.size()); }
+  int Size() const { return beams_used_; }

  // Add a new hypothesis
  void Add(gsl::span<const int32_t>& hypothesis, float sum_logprobs);

-  bool IsDone(float best_sum_logprobs, int current_length);
+  bool IsDone(float best_sum_logprobs, int current_length) const;

-  // Output results. Note that it will clear all beams.
+  // Output results
  void Output(int top_k,                            // number of sequences to return
              int max_length,                       // max sequence length
              gsl::span<int32_t>& sequences,        // buffer with pad token, shape (num_return_sequences, max_length)
              gsl::span<float>& sequences_scores);  // buffer for sequence scores, with shape (num_return_sequences)

 private:
-  int num_beams_;
  float length_penalty_;
  bool early_stopping_;
-  float worst_score_;
-
-  // Min-heap for top k
-  std::priority_queue<HypothesisScore, onnxruntime::FastAllocVector<HypothesisScore>, HypothesisScoreCompare> beams_;
+  gsl::span<HypothesisScore> beams_;  // Beam width sized array of hypotheses, sorted by highest scoring
+  int beams_used_;                    // Number of elements used in beams_
 };

 class BeamSearchScorer : public IBeamScorer {
 public:
  BeamSearchScorer(const IGenerationParameters& parameters,
-                   onnxruntime::OrtStlAllocator<HypothesisScore>& hypothesis_score_allocator,
-                   onnxruntime::OrtStlAllocator<BeamHypotheses>& beam_hyps_allocator,
                   AllocatorPtr& allocator);

-  void Process(ISequences* sequences,
+  void Process(ISequences& sequences,
               gsl::span<const float>& next_scores,
               gsl::span<const int32_t>& next_tokens,
               gsl::span<const int32_t>& next_indices) override;

-  void Finalize(ISequences* sequences,
+  void Finalize(ISequences& sequences,
                gsl::span<const float>& final_beam_scores,
                Tensor* output_sequences,
                Tensor* output_sequence_scores) override;

-  bool IsDone();
+  bool IsDone() const;

  gsl::span<float>& GetNextScores() { return next_beam_scores_; }
  gsl::span<int32_t>& GetNextTokens() { return next_beam_tokens_; }
@ -113,7 +94,9 @@ class BeamSearchScorer : public IBeamScorer {
  size_t hypothesis_buffer_length_{};                   // Total number of elements
  size_t hypothesis_buffer_offset_{};                   // Offset of available buffer, or length of used buffer.

-  onnxruntime::FastAllocVector<BeamHypotheses> beam_hyps_;
+  IAllocatorUniquePtr<HypothesisScore> hypothesis_scores_ptr_;  // num_beams_ * batch_size_, divided into num_beams_ chunks per BeamHypothesis in beam_hyps_
+  IAllocatorUniquePtr<BeamHypotheses> beam_hyps_ptr_;
+  gsl::span<BeamHypotheses> beam_hyps_;  // batch_size_ count
 };

 }  // namespace transformers
--- a/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.cc
+++ b/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.cc
@ -429,7 +429,7 @@ Status ProcessLogits(const OrtValue& logits,                                 //
 #endif

  beam_scorer->Process(
-      sequences,
+      *sequences,
      next_scores,
      next_tokens,
      next_indices);
--- a/onnxruntime/contrib_ops/cpu/transformers/generation_shared.h
+++ b/onnxruntime/contrib_ops/cpu/transformers/generation_shared.h
@ -93,30 +93,27 @@ struct ISamplingState {
  gsl::span<T> cumulative_probs;
 };

-class ISequences {
- public:
+struct ISequences {
  virtual ~ISequences() {}
  virtual gsl::span<const int32_t> GetSequence(int beam_index) const = 0;
  virtual int GetSequenceLength() const = 0;
 };

-class ILogitsProcessorList {
- public:
+struct ILogitsProcessorList {
  virtual ~ILogitsProcessorList() {}
  virtual void Process(const ISequences* sequences, gsl::span<float>& next_token_scores, int step) = 0;
 };

 // Interface for all scorers for beam search or beam sample.
-class IBeamScorer {
- public:
+struct IBeamScorer {
  virtual ~IBeamScorer() {}

-  virtual void Process(ISequences* sequences,
+  virtual void Process(ISequences& sequences,
                       gsl::span<const float>& next_scores,
                       gsl::span<const int32_t>& next_tokens,
                       gsl::span<const int32_t>& next_indices) = 0;

-  virtual void Finalize(ISequences* sequences,
+  virtual void Finalize(ISequences& sequences,
                        gsl::span<const float>& final_beam_scores,
                        Tensor* output_sequences,
                        Tensor* output_sequence_scores) = 0;
--- a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc
+++ b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc
@ -586,7 +586,7 @@ Status ProcessLogits(const OrtValue& logits,                                 //

  // TODO: Implement BeamScorer on CUDA
  beam_scorer->Process(
-      sequences,
+      *sequences,
      next_scores,
      next_tokens,
      next_indices);