mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-27 22:45:57 +00:00
Remove early stopping from LLaMA end-to-end benchmarking (#20033)
### Description

This PR removes early stopping from the end-to-end LLaMA-2 benchmark script.

### Motivation and Context

This allows models to always generate the requested number of new tokens.
This commit is contained in:
parent
7e84ba0ea3
commit
f9cddd2cf5
1 changed file with 0 additions and 4 deletions
|
|
@@ -400,11 +400,7 @@ def main():
|
|||
sampling_times.append(sampling_end_time - sampling_start_time)
|
||||
|
||||
all_token_ids = torch.cat([all_token_ids, tokens_to_add], dim=-1)
|
||||
|
||||
# Return early if all batch entries have reached EOS token id
|
||||
current_length += 1
|
||||
if torch.all(has_eos) or current_length > max_length:
|
||||
break
|
||||
|
||||
# Update inputs for next inference run
|
||||
inputs["input_ids"] = tokens_to_add
|
||||
|
|
|
|||
Loading…
Reference in a new issue