Deprecate one step beam search (#14046)

### Description
Deprecate one step beam search since it lacks maintenance (some tests
failed) and its performance is not optimal.

Users who still need this feature can use an older version
(<=1.13.1) of onnxruntime to export a one step beam search model; the
exported model can still run in the latest onnxruntime.

It is recommended to use
[convert_generation.py](https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/transformers/convert_generation.py)
to generate a beam search ONNX model for better performance.
This commit is contained in:
Tianlei Wu 2022-12-22 23:14:31 -08:00 committed by GitHub
parent e49f358686
commit 8ac264b896
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 45 additions and 2259 deletions

View file

@ -15,8 +15,7 @@ from datetime import datetime
import psutil
import torch
from gpt2_beamsearch_helper import MODEL_CLASSES, Gpt2HelperFactory
from gpt2_helper import DEFAULT_TOLERANCE, PRETRAINED_GPT2_MODELS, Gpt2Helper
from gpt2_helper import DEFAULT_TOLERANCE, MODEL_CLASSES, PRETRAINED_GPT2_MODELS, Gpt2Helper
from packaging import version
from transformers import AutoConfig
@ -124,12 +123,6 @@ def parse_arguments(argv=None):
parser.set_defaults(torchscript=False)
parser.add_argument("-b", "--batch_sizes", nargs="+", type=int, default=[1], help="batch size")
parser.add_argument(
"--beam_size",
type=int,
default=4,
help="Beam size if greedy/top-p/top-k sampling is needed",
)
parser.add_argument(
"--sequence_lengths",
@ -170,54 +163,6 @@ def parse_arguments(argv=None):
parser.add_argument("--disable_io_binding", required=False, action="store_true")
parser.set_defaults(disable_io_binding=False)
search_option_group = parser.add_argument_group("configurable one step search options")
search_option_group.add_argument(
"--ignore_eos",
type=bool,
default=False,
help="If ignore end of sentence token in model inference.",
)
search_option_group.add_argument(
"--repetition_penalty",
type=float,
default=1,
help="Positive. >1 to penalize and <1 to encorage.",
)
search_option_group.add_argument(
"--temperature",
type=float,
default=1,
help="Softmax temperature for output logits.",
)
search_option_group.add_argument(
"--excluded_token_ids",
required=False,
nargs="+",
type=float,
help="A list of token ids to be excluded in inference.",
)
search_option_group.add_argument(
"--length_penalty",
type=float,
default=1,
help="Positive. >1 to penalize and <1 to encorage short sentence.",
)
sampling_option_group = parser.add_argument_group("one step sampling options")
sampling_option_group.add_argument(
"--do_sample",
action="store_true",
help="If to do sampling instead of beam search or greedy.",
)
sampling_option_group.add_argument(
"--do_sample_top_p",
type=float,
default=0.95,
help="Nuclear/top-p sampling accumulation probability.",
)
sampling_option_group.add_argument("--do_sample_top_k", type=int, default=0, help="Use top-k if non-zero.")
args = parser.parse_args(argv)
return args
@ -249,41 +194,9 @@ def main(args):
prepare_environment(cache_dir, output_dir, args.use_gpu)
model_class = MODEL_CLASSES[args.model_class][0]
if args.model_class == "GPT2LMHeadModel_BeamSearchStep":
model_type = "beam_search_step"
elif args.model_class == "GPT2LMHeadModel_ConfigurableOneStepSearch":
model_type = "configurable_one_step_search"
else:
model_type = "default"
gpt2helper = Gpt2HelperFactory.create_helper(model_type)
gpt2helper = Gpt2Helper
config = AutoConfig.from_pretrained(args.model_name_or_path, torchscript=args.torchscript, cache_dir=cache_dir)
if model_type == "beam_search_step":
model = model_class.from_pretrained(
args.model_name_or_path,
config=config,
batch_size=1,
beam_size=args.beam_size,
cache_dir=cache_dir,
)
elif model_type == "configurable_one_step_search":
model = model_class.from_pretrained(
args.model_name_or_path,
config=config,
batch_size=1,
beam_size=args.beam_size,
ignore_eos=args.ignore_eos,
temperature=args.temperature,
repetition_penalty=args.repetition_penalty,
excluded_token_ids=args.excluded_token_ids,
length_penalty=args.length_penalty,
do_sample=args.do_sample,
do_sample_top_p=args.do_sample_top_p,
do_sample_top_k=args.do_sample_top_k,
cache_dir=cache_dir,
)
else:
model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir)
model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir)
# This script does not support float16 for PyTorch.
# if args.float16:
@ -352,29 +265,14 @@ def main(args):
return
# Allocate output buffers for IO Binding
if model_type == "beam_search_step" or model_type == "configurable_one_step_search":
max_output_shapes = gpt2helper.get_output_shapes(
max(args.batch_sizes),
context_len=max(args.past_sequence_lengths),
past_sequence_length=max(args.past_sequence_lengths),
sequence_length=max(args.sequence_lengths),
beam_size=args.beam_size,
step=0,
config=config,
model_class=args.model_class,
)
output_buffers = gpt2helper.get_output_buffers(max_output_shapes, device, args.precision == Precision.FLOAT16)
else:
max_output_shapes = gpt2helper.get_output_shapes(
max(args.batch_sizes),
max(args.past_sequence_lengths),
max(args.sequence_lengths),
config,
args.model_class,
)
output_buffers = gpt2helper.get_output_buffers(max_output_shapes, device, args.precision == Precision.FLOAT16)
max_output_shapes = gpt2helper.get_output_shapes(
max(args.batch_sizes),
max(args.past_sequence_lengths),
max(args.sequence_lengths),
config,
args.model_class,
)
output_buffers = gpt2helper.get_output_buffers(max_output_shapes, device, args.precision == Precision.FLOAT16)
csv_filename = args.result_csv or "benchmark_result_{}.csv".format(datetime.now().strftime("%Y%m%d-%H%M%S"))
with open(csv_filename, mode="a", newline="") as csv_file:
@ -402,53 +300,32 @@ def main(args):
for past_sequence_length in args.past_sequence_lengths:
assert batch_size > 0 and sequence_length > 0 and past_sequence_length >= 0
logger.debug(
f"Running test for batch_size={batch_size} sequence_length={sequence_length} past_sequence_length={past_sequence_length}..."
"Running test for batch_size=%d sequence_length=%d past_sequence_length=%d ...",
batch_size,
sequence_length,
past_sequence_length,
)
dummy_inputs = gpt2helper.get_dummy_inputs(
batch_size,
past_sequence_length,
sequence_length,
config.num_attention_heads,
config.hidden_size,
config.n_layer,
config.vocab_size,
device,
float16=(args.precision == Precision.FLOAT16),
has_position_ids=use_padding,
has_attention_mask=use_padding,
)
output_shapes = gpt2helper.get_output_shapes(
batch_size,
past_sequence_length,
sequence_length,
config,
args.model_class,
)
if model_type == "beam_search_step" or model_type == "configurable_one_step_search":
dummy_inputs = gpt2helper.get_dummy_inputs(
batch_size,
past_sequence_length,
sequence_length,
config.num_attention_heads,
config.hidden_size,
config.n_layer,
config.vocab_size,
device,
float16=(args.precision == Precision.FLOAT16),
has_position_ids=use_padding,
has_attention_mask=use_padding,
)
output_shapes = gpt2helper.get_output_shapes(
batch_size,
past_sequence_length,
past_sequence_length,
sequence_length,
args.beam_size,
0,
config,
args.model_class,
)
else:
dummy_inputs = gpt2helper.get_dummy_inputs(
batch_size,
past_sequence_length,
sequence_length,
config.num_attention_heads,
config.hidden_size,
config.n_layer,
config.vocab_size,
device,
float16=(args.precision == Precision.FLOAT16),
has_position_ids=use_padding,
has_attention_mask=use_padding,
)
output_shapes = gpt2helper.get_output_shapes(
batch_size,
past_sequence_length,
sequence_length,
config,
args.model_class,
)
try:
if args.validate_onnx or args.output_torch_latency:

View file

@ -23,9 +23,8 @@ from pathlib import Path
import numpy
import torch
from gpt2_beamsearch_helper import MODEL_CLASSES, Gpt2HelperFactory
from gpt2_beamsearch_tester import Gpt2TesterFactory
from gpt2_helper import DEFAULT_TOLERANCE, PRETRAINED_GPT2_MODELS
from gpt2_helper import DEFAULT_TOLERANCE, MODEL_CLASSES, PRETRAINED_GPT2_MODELS, Gpt2Helper
from gpt2_tester import Gpt2Tester
from packaging import version
from transformers import AutoConfig
@ -174,61 +173,6 @@ def parse_arguments(argv=None):
"Note that we will optimize 1 and 2 differently for best performance.",
)
parser.add_argument(
"--beam_size",
type=int,
default=4,
help="Beam size if greedy/top-p/top-k sampling is needed",
)
search_option_group = parser.add_argument_group("configurable one step search options")
search_option_group.add_argument(
"--ignore_eos",
type=bool,
default=False,
help="If ignore end of sentence token in model inference.",
)
search_option_group.add_argument(
"--repetition_penalty",
type=float,
default=1,
help="Positive. >1 to penalize and <1 to encourage.",
)
search_option_group.add_argument(
"--temperature",
type=float,
default=1,
help="Softmax temperature for output logits.",
)
search_option_group.add_argument(
"--excluded_token_ids",
required=False,
nargs="+",
type=float,
help="A list of token ids to be excluded in inference.",
)
search_option_group.add_argument(
"--length_penalty",
type=float,
default=1,
help="Positive. >1 to penalize and <1 to encourage short sentence.",
)
sampling_option_group = parser.add_argument_group("one step sampling options")
sampling_option_group.add_argument(
"--do_sample",
action="store_true",
help="If to do sampling instead of beam search or greedy.",
)
sampling_option_group.add_argument(
"--do_sample_top_p",
type=float,
default=0.95,
help="Nuclear/top-p sampling accumulation probability.",
)
sampling_option_group.add_argument("--do_sample_top_k", type=int, default=0, help="Use top-k if non-zero.")
fp16_option_group = parser.add_argument_group(
'float to float16 conversion parameters that works when "--precision fp16" is specified'
)
@ -334,48 +278,15 @@ def main(argv=None, experiment_name: str = "", run_id: str = "0", csv_filename:
model_class = MODEL_CLASSES[args.model_class][0]
use_padding = MODEL_CLASSES[args.model_class][2]
if args.model_class == "GPT2LMHeadModel_BeamSearchStep":
model_type = "beam_search_step"
elif args.model_class == "GPT2LMHeadModel_ConfigurableOneStepSearch":
model_type = "configurable_one_step_search"
else:
model_type = "default"
gpt2helper = Gpt2HelperFactory.create_helper(model_type)
gpt2tester = Gpt2TesterFactory.create_tester(model_type)
gpt2helper = Gpt2Helper
config = AutoConfig.from_pretrained(args.model_name_or_path, cache_dir=cache_dir)
if model_type == "beam_search_step":
model = model_class.from_pretrained(
args.model_name_or_path,
config=config,
batch_size=1,
beam_size=args.beam_size,
cache_dir=cache_dir,
)
elif model_type == "configurable_one_step_search":
model = model_class.from_pretrained(
args.model_name_or_path,
config=config,
batch_size=1,
beam_size=args.beam_size,
ignore_eos=args.ignore_eos,
temperature=args.temperature,
repetition_penalty=args.repetition_penalty,
excluded_token_ids=args.excluded_token_ids,
length_penalty=args.length_penalty,
do_sample=args.do_sample,
do_sample_top_p=args.do_sample_top_p,
do_sample_top_k=args.do_sample_top_k,
cache_dir=cache_dir,
)
else:
model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir)
model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir)
device = torch.device("cuda:0" if args.use_gpu else "cpu")
model.eval().to(device)
if (not args.use_external_data_format) and (config.n_layer > 24):
logger.info(f"Try --use_external_data_format when model size > 2GB")
logger.info("Try --use_external_data_format when model size > 2GB")
onnx_model_paths = gpt2helper.get_onnx_paths(
output_dir,
@ -628,22 +539,9 @@ def main(argv=None, experiment_name: str = "", run_id: str = "0", csv_filename:
else:
inputs = {"input_ids": input_ids.to(int_data_type)}
if model_type == "beam_search_step" or model_type == "configurable_one_step_search":
beam_select_idx = torch.zeros([1, input_ids.shape[0]]).long()
input_log_probs = torch.zeros([input_ids.shape[0], 1])
input_unfinished_sents = torch.ones([input_ids.shape[0], 1], dtype=torch.bool)
inputs.update(
{
"beam_select_idx": beam_select_idx,
"input_log_probs": input_log_probs,
"input_unfinished_sents": input_unfinished_sents,
}
)
test_inputs.append(inputs)
gpt2tester.test_generation(
Gpt2Tester.test_generation(
session,
model,
device,

View file

@ -1,442 +0,0 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
# This script helps evaluation of GPT-2 model.
import logging
import os
import sys
import timeit
import numpy
import torch
from gpt2_beamsearch_helper import Gpt2BeamSearchHelper, Gpt2BeamSearchInputs
from gpt2_tester import Gpt2Metric, Gpt2Tester
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from benchmark_helper import Precision
logger = logging.getLogger(__name__)
class Gpt2TesterFactory:
@staticmethod
def create_tester(tester_type="default"):
testers = {
"default": Gpt2Tester,
"beam_search_step": Gpt2BeamSearchTester,
"configurable_one_step_search": Gpt2BeamSearchTester,
}
w = testers[tester_type]
return w
class Gpt2BeamSearchTester(Gpt2Tester):
def __init__(
self,
input_ids,
position_ids,
attention_mask,
beam_select_idx,
input_log_probs,
input_unfinished_sents,
prev_step_results,
prev_step_scores,
num_attention_heads,
hidden_size,
num_layer,
beam_size,
device,
is_fp16=False,
top_k=20,
top_k_required_order=False,
):
super().__init__(
input_ids,
position_ids,
attention_mask,
num_attention_heads=num_attention_heads,
hidden_size=hidden_size,
num_layer=num_layer,
device=device,
is_fp16=is_fp16,
top_k=top_k,
top_k_required_order=top_k_required_order,
)
self.input_length = input_ids.shape[-1]
self.n_layer = num_layer
self.beam_size = beam_size
self.beam_select_idx = beam_select_idx.to(device)
float_type = torch.float16 if is_fp16 else torch.float32
self.input_log_probs = input_log_probs.type(float_type).to(device)
self.input_unfinished_sents = input_unfinished_sents.to(device)
self.prev_step_results = prev_step_results.to(device) if prev_step_results is not None else None
self.prev_step_scores = prev_step_scores.type(float_type).to(device)
self.last_state = None
def get_inputs(self) -> Gpt2BeamSearchInputs:
return Gpt2BeamSearchInputs(
self.input_ids,
self.past,
self.position_ids,
self.attention_mask,
self.beam_select_idx,
self.input_log_probs,
self.input_unfinished_sents,
self.prev_step_results,
self.prev_step_scores,
)
def update(self, output, step, device):
"""
Update the inputs for next inference.
"""
self.last_state = (
torch.from_numpy(output[0]).to(device)
if isinstance(output[0], numpy.ndarray)
else output[0].clone().detach().cpu()
)
self.input_ids = self.last_state.view(self.batch_size * self.beam_size, -1).to(device)
if self.position_ids is not None:
input_unfinished_sents_id = -3
self.prev_step_results = (
torch.from_numpy(output[-2]).to(device)
if isinstance(output[-2], numpy.ndarray)
else output[-2].clone().detach().to(device)
)
self.position_ids = (
torch.tensor([self.input_length + step - 1])
.unsqueeze(0)
.repeat(self.batch_size * self.beam_size, 1)
.to(device)
)
if self.attention_mask.size(0) != (self.batch_size * self.beam_size):
self.attention_mask = self.attention_mask.repeat(self.batch_size * self.beam_size, 1)
self.attention_mask = torch.cat(
[
self.attention_mask,
torch.ones([self.batch_size * self.beam_size, 1]).type_as(self.attention_mask),
],
1,
).to(device)
else:
input_unfinished_sents_id = -2
self.beam_select_idx = (
torch.from_numpy(output[input_unfinished_sents_id - 2]).to(device)
if isinstance(output[input_unfinished_sents_id - 2], numpy.ndarray)
else output[input_unfinished_sents_id - 2].clone().detach().to(device)
)
self.input_log_probs = (
torch.from_numpy(output[input_unfinished_sents_id - 1]).to(device)
if isinstance(output[input_unfinished_sents_id - 1], numpy.ndarray)
else output[input_unfinished_sents_id - 1].clone().detach().to(device)
)
self.input_unfinished_sents = (
torch.from_numpy(output[input_unfinished_sents_id]).to(device)
if isinstance(output[input_unfinished_sents_id], numpy.ndarray)
else output[input_unfinished_sents_id].clone().detach().to(device)
)
self.prev_step_scores = (
torch.from_numpy(output[-1]).to(device)
if isinstance(output[-1], numpy.ndarray)
else output[-1].clone().detach().to(device)
)
self.top_1_tokens = self.input_ids[0]
self.top_k_tokens = self.last_state
self.past = []
if isinstance(output[1], tuple): # past in torch output is tuple
self.past = list(output[1])
else:
for i in range(self.n_layer):
past_i = (
torch.from_numpy(output[i + 1])
if isinstance(output[i + 1], numpy.ndarray)
else output[i + 1].clone().detach()
)
self.past.append(past_i.to(device))
@staticmethod
def test_generation(
session,
model,
device,
test_inputs,
precision=Precision.FLOAT32,
model_class="GPT2LMHeadModel_BeamSearchStep",
top_k=20,
top_k_no_order=True,
max_steps=24,
max_inputs=0,
verbose=False,
save_test_data=0,
save_test_data_dir=".",
):
"""
Test Generation using beam search to compare PyTorch and ONNX model.
It will print top 1 and top k errors on the given test inputs.
"""
print(
f"start test generation: (top_k={top_k} top_k_no_order={top_k_no_order} max_steps={max_steps} test_inputs={len(test_inputs)} max_inputs={max_inputs})"
)
n_layer = model.config.n_layer
n_head = model.config.n_head
n_embd = model.config.n_embd
beam_size = model.config.beam_size
eos_token_id = model.config.eos_token_id
test_data_saved = 0
is_float16 = precision == Precision.FLOAT16
# We will still use the fp32 torch model as baseline when the onnx model is fp16
model.eval().to(device)
# Allocate initial buffers for IO Binding of ONNX Runtime. The buffer size will automatically increase later.
init_output_shapes = Gpt2BeamSearchHelper.get_output_shapes(
batch_size=4,
context_len=128,
past_sequence_length=128,
sequence_length=32,
beam_size=1,
step=0,
config=model.config,
model_class=model_class,
)
output_buffers = Gpt2BeamSearchHelper.get_output_buffers(
init_output_shapes,
device,
is_float16=is_float16,
)
baseline_name = "Torch"
treatment_name = "Quantized Onnx" if precision == Precision.INT8 else "Onnx"
torch_metric = Gpt2Metric(baseline_name, baseline_name, top_k)
onnx_metric = Gpt2Metric(treatment_name, baseline_name, top_k)
onnx_io_metric = Gpt2Metric(treatment_name + " with IO Binding", baseline_name, top_k)
for i, inputs in enumerate(test_inputs):
if max_inputs > 0 and i == max_inputs:
break
if i % 10 == 0:
print(f"{i}")
input_ids = inputs["input_ids"]
position_ids = inputs["position_ids"] if "position_ids" in inputs else None
attention_mask = inputs["attention_mask"] if "attention_mask" in inputs else None
beam_select_idx = inputs["beam_select_idx"] if "beam_select_idx" in inputs else None
input_log_probs = inputs["input_log_probs"] if "input_log_probs" in inputs else None
input_unfinished_sents = inputs["input_unfinished_sents"]
if model_class == "GPT2LMHeadModel_BeamSearchStep":
prev_step_results = inputs["input_ids"]
else:
prev_step_results = None
if "prev_step_scores" in inputs:
prev_step_scores = inputs["prev_step_scores"]
else:
prev_step_scores = torch.zeros([input_ids.shape[0], 1])
onnx_runner = Gpt2BeamSearchTester(
input_ids,
position_ids,
attention_mask,
beam_select_idx,
input_log_probs,
input_unfinished_sents,
prev_step_results,
prev_step_scores,
n_head,
n_embd,
n_layer,
beam_size,
device,
is_float16,
top_k,
not top_k_no_order,
)
onnx_io_runner = Gpt2BeamSearchTester(
input_ids,
position_ids,
attention_mask,
beam_select_idx,
input_log_probs,
input_unfinished_sents,
prev_step_results,
prev_step_scores,
n_head,
n_embd,
n_layer,
beam_size,
device,
is_float16,
top_k,
not top_k_no_order,
)
torch_runner = Gpt2BeamSearchTester(
input_ids,
position_ids,
attention_mask,
beam_select_idx,
input_log_probs,
input_unfinished_sents,
prev_step_results,
prev_step_scores,
n_head,
n_embd,
n_layer,
beam_size,
device,
False,
top_k,
not top_k_no_order,
) # Torch model baseline is fp32
batch_size = torch_runner.batch_size
onnx_metric.start_batch(batch_size)
onnx_io_metric.start_batch(batch_size)
context_len = list(onnx_runner.input_ids.size())[-1]
with torch.no_grad():
for step in range(max_steps):
print(f"Processing step: {step}")
if model_class == "GPT2LMHeadModel_BeamSearchStep":
num_seq = beam_size
seq_len = list(onnx_runner.input_ids.size())[1]
past_seq_len = list(onnx_runner.past[0].size())[3]
else:
num_seq = sum(onnx_io_runner.input_unfinished_sents.view(-1).long().cpu())
past_seq_len = list(onnx_runner.past[0].size())[3]
seq_len = list(onnx_runner.input_ids.size())[-1] - past_seq_len
start_time = timeit.default_timer()
pytorch_output = Gpt2BeamSearchHelper.pytorch_inference(model, torch_runner.get_inputs())
torch_metric.add_latency(past_seq_len, timeit.default_timer() - start_time)
torch_runner.update(pytorch_output, step, device)
(
onnx_output,
avg_latency_ms,
) = Gpt2BeamSearchHelper.onnxruntime_inference(session, onnx_runner.get_inputs(), total_runs=1)
onnx_metric.add_latency(past_seq_len, avg_latency_ms / 1000.0)
onnx_runner.update(onnx_output, step, device)
if model_class == "GPT2LMHeadModel_BeamSearchStep":
num_seq = beam_size
else:
num_seq = sum(onnx_io_runner.input_unfinished_sents.view(-1).long().cpu())
output_shapes = Gpt2BeamSearchHelper.get_output_shapes(
batch_size,
context_len,
past_seq_len,
seq_len,
beam_size,
step,
model.config,
model_class=model_class,
num_seq=num_seq,
)
Gpt2BeamSearchHelper.auto_increase_buffer_size(output_buffers, output_shapes)
(onnx_io_output, avg_latency_ms,) = Gpt2BeamSearchHelper.onnxruntime_inference_with_binded_io(
session,
onnx_io_runner.get_inputs(),
output_buffers,
output_shapes,
total_runs=1,
return_numpy=False,
include_copy_output_latency=True,
)
onnx_io_metric.add_latency(past_seq_len, avg_latency_ms / 1000.0)
if test_data_saved < save_test_data:
onnx_io_runner.save_test_data(session, onnx_io_output, save_test_data_dir, test_data_saved)
test_data_saved += 1
onnx_io_runner.update(onnx_io_output, step, device)
if (not onnx_runner.input_unfinished_sents.any()) or (
not torch_runner.input_unfinished_sents.any()
):
print("break at step: ", step)
break
print(f"Totally {step+1} steps run")
onnx_metric.end_batch()
onnx_io_metric.end_batch()
torch_metric.print()
onnx_metric.print()
onnx_io_metric.print()
print("\tONNX")
if model_class == "GPT2LMHeadModel_BeamSearchStep":
results_onnx = onnx_runner.prev_step_results.view(batch_size * beam_size, -1)
results_onnx_io = onnx_io_runner.prev_step_results.view(batch_size * beam_size, -1)
else:
results_onnx = onnx_runner.input_ids.view(batch_size * beam_size, -1)
results_onnx_io = onnx_io_runner.input_ids.view(batch_size * beam_size, -1)
Gpt2BeamSearchTester.pprint_results(
results_onnx,
onnx_runner.prev_step_scores.view(batch_size * beam_size, -1),
pad_token_id=eos_token_id,
eos_token_id=eos_token_id,
)
print("\tONNX with IO binding")
Gpt2BeamSearchTester.pprint_results(
results_onnx_io,
onnx_io_runner.prev_step_scores.view(batch_size * beam_size, -1),
pad_token_id=eos_token_id,
eos_token_id=eos_token_id,
)
@staticmethod
def pprint_results(
output_ids,
output_scores,
pad_token_id=None,
eos_token_id=None,
):
"""
Print test generation results.
"""
if pad_token_id is None:
pad_token_id = 1
if eos_token_id is None:
eos_token_id = 1
if torch.is_tensor(output_ids):
output_ids = output_ids.cpu().numpy()
for i, sample in enumerate(output_ids):
for j, seq in enumerate(sample):
if isinstance(seq, numpy.ndarray) or isinstance(seq, list):
# remove left padding
for k, t in enumerate(seq):
if t != pad_token_id:
seq = seq[k:]
break
# remove EOS
for k, t in enumerate(seq):
if t == eos_token_id:
seq = seq[: k + 1]
break
print("-" * 40)
result = ",".join([str(token_id) for token_id in sample])
print(f">> Output {j + 1}: \t{[result]}")
else:
result = ",".join([str(token_id) for token_id in sample])
print(f">> Output {i}: \t{result}")
print(f">> Scores {i}: \t{output_scores[i]}")
break
print("=" * 80)

View file

@ -1,491 +0,0 @@
{
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
},
"orig_nbformat": 2,
"kernelspec": {
"name": "python370jvsc74a57bd081098997110362167705b61d21e46dda767ff2050d805c22b6ba90fec7e1aa35",
"display_name": "Python 3.7.0 64-bit ('py37athena': conda)"
},
"metadata": {
"interpreter": {
"hash": "81098997110362167705b61d21e46dda767ff2050d805c22b6ba90fec7e1aa35"
}
}
},
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"Licensed under the MIT License."
],
"cell_type": "markdown",
"metadata": {}
},
{
"source": [
"# Optimizing runtime performance on GPT-2 model inference with ONNXRuntime on CPU\n",
"\n",
"In this tutorial, you'll be introduced to how to load a GPT2 model from PyTorch, convert it to ONNX with one step search, and inference it using ONNX Runtime with/without IO Binding. GPT-2 model inference is optimized by compiling one-step beam search into the onnx compute graph, which speeds up the runtime significantly. "
],
"cell_type": "markdown",
"metadata": {}
},
{
"source": [
"## Prerequisites\n",
"If you have Jupyter Notebook, you may directly run this notebook. We will use pip to install or upgrade [PyTorch](https://pytorch.org/), [OnnxRuntime](https://microsoft.github.io/onnxruntime/) and other required packages.\n",
"\n",
"Otherwise, you can setup a new environment. First, we install [Anaconda](https://www.anaconda.com/distribution/). Then open an AnaConda prompt window and run the following commands:\n",
"\n",
"```console\n",
"conda create -n cpu_env python=3.8\n",
"conda activate cpu_env\n",
"conda install jupyter\n",
"jupyter notebook\n",
"```\n",
"\n",
"The last command will launch Jupyter Notebook and we can open this notebook in browser to continue."
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Install PyTorch 1.7.0 and OnnxRuntime 1.7.0 for CPU-only.\n",
"import sys\n",
"if sys.platform == 'darwin': # Mac\n",
" !{sys.executable} -m pip install --upgrade torch torchvision\n",
"else:\n",
" !{sys.executable} -m pip install --upgrade torch==1.7.0+cpu torchvision==0.8.1+cpu -f https://download.pytorch.org/whl/torch_stable.html\n",
"!{sys.executable} -m pip install onnxruntime==1.7.2\n",
"\n",
"# Install other packages used in this notebook.\n",
"!{sys.executable} -m pip install transformers==4.3.1\n",
"!{sys.executable} -m pip install onnx onnxconverter_common psutil pytz pandas py-cpuinfo py3nvml"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# Create a cache directory to store pretrained model.\n",
"cache_dir = os.path.join(\".\", \"cache_models\")\n",
"if not os.path.exists(cache_dir):\n",
" os.makedirs(cache_dir)"
]
},
{
"source": [
"## Convert GPT2 model from PyTorch to ONNX with one step search ##\n",
"\n",
"We have a script [convert_to_onnx.py](https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/transformers/convert_to_onnx.py) that could help you to convert GPT2 with past state to ONNX. \n",
"\n",
"The script accepts a pretrained model name or path of a checkpoint directory as input, and converts the model to ONNX. It also verifies that the ONNX model could generate same input as the pytorch model. The usage is like \n",
"```\n",
"python -m onnxruntime.transformers.convert_to_onnx -m model_name_or_path \\ \n",
"--model_class=GPT2LMHeadModel_BeamSearchStep|GPT2LMHeadModel_ConfigurableOneStepSearch \\ \n",
"--output gpt2_onestepsearch.onnx -o -p fp32|fp16|int8\n",
"```\n",
"The -p option can be used to choose the precision: fp32 (float32), fp16 (mixed precision) or int8 (quantization). The -o option will generate optimized model, which is required for fp16 or int8.\n",
"\n",
"Here we use a pretrained model as example:"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"GPT2Config {\n \"_name_or_path\": \"gpt2\",\n \"activation_function\": \"gelu_new\",\n \"architectures\": [\n \"GPT2LMHeadModel\"\n ],\n \"attn_pdrop\": 0.1,\n \"batch_size\": 1,\n \"beam_size\": 4,\n \"bos_token_id\": 50256,\n \"embd_pdrop\": 0.1,\n \"eos_token_id\": 50256,\n \"gradient_checkpointing\": false,\n \"initializer_range\": 0.02,\n \"layer_norm_epsilon\": 1e-05,\n \"model_type\": \"gpt2\",\n \"n_ctx\": 1024,\n \"n_embd\": 768,\n \"n_head\": 12,\n \"n_inner\": null,\n \"n_layer\": 12,\n \"n_positions\": 1024,\n \"resid_pdrop\": 0.1,\n \"summary_activation\": null,\n \"summary_first_dropout\": 0.1,\n \"summary_proj_to_labels\": true,\n \"summary_type\": \"cls_index\",\n \"summary_use_proj\": true,\n \"task_specific_params\": {\n \"text-generation\": {\n \"do_sample\": true,\n \"max_length\": 50\n }\n },\n \"transformers_version\": \"4.3.1\",\n \"use_cache\": true,\n \"vocab_size\": 50257\n}\n\n"
]
}
],
"source": [
"from packaging import version\n",
"from onnxruntime import __version__ as ort_verison\n",
"if version.parse(ort_verison) >= version.parse('1.12.0'):\n",
" from onnxruntime.transformers.models.gpt2.gpt2_beamsearch_helper import Gpt2BeamSearchHelper, GPT2LMHeadModel_BeamSearchStep\n",
"else:\n",
" from onnxruntime.transformers.gpt2_beamsearch_helper import Gpt2BeamSearchHelper, GPT2LMHeadModel_BeamSearchStep\n",
"\n",
"from transformers import AutoConfig\n",
"import torch\n",
"\n",
"model_name_or_path = \"gpt2\"\n",
"config = AutoConfig.from_pretrained(model_name_or_path, cache_dir=cache_dir)\n",
"model = GPT2LMHeadModel_BeamSearchStep.from_pretrained(model_name_or_path, config=config, batch_size=1, beam_size=4, cache_dir=cache_dir)\n",
"device = torch.device(\"cpu\")\n",
"model.eval().to(device)\n",
"\n",
"print(model.config)\n",
"\n",
"num_attention_heads = model.config.n_head\n",
"hidden_size = model.config.n_embd\n",
"num_layer = model.config.n_layer"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/data/anaconda/envs/py37athena/lib/python3.7/site-packages/transformers/models/gpt2/modeling_gpt2.py:654: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n assert batch_size > 0, \"batch_size has to be defined and > 0\"\n/data/anaconda/envs/py37athena/lib/python3.7/site-packages/transformers/models/gpt2/modeling_gpt2.py:169: TracerWarning: Converting a tensor to a Python float might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n w = w / (float(v.size(-1)) ** 0.5)\n/data/anaconda/envs/py37athena/lib/python3.7/site-packages/transformers/models/gpt2/modeling_gpt2.py:174: TracerWarning: Converting a tensor to a Python index might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n mask = self.bias[:, :, ns - nd : ns, :ns]\n"
]
}
],
"source": [
"onnx_model_path = \"gpt2_one_step_search.onnx\"\n",
"Gpt2BeamSearchHelper.export_onnx(model, device, onnx_model_path) # add parameter use_external_data_format=True when model size > 2 GB"
]
},
{
"source": [
"## ONNX Runtime Inference ##\n",
"\n",
"We can use ONNX Runtime to inference. The inputs are dictionary with name and numpy array as value, and the output is list of numpy array. Note that both input and output are in CPU. When you run the inference in GPU, it will involve data copy between CPU and GPU for input and output.\n",
"\n",
"Let's create an inference session for ONNX Runtime given the exported ONNX model, and see the output."
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import onnxruntime\n",
"import numpy\n",
"from transformers import AutoTokenizer\n",
"\n",
"EXAMPLE_Text = ['best hotel in bay area.']\n",
"\n",
"def get_tokenizer(model_name_or_path, cache_dir):\n",
" tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir)\n",
" tokenizer.padding_side = \"left\"\n",
" tokenizer.pad_token = tokenizer.eos_token\n",
" #okenizer.add_special_tokens({'pad_token': '[PAD]'})\n",
" return tokenizer\n",
"\n",
"def get_example_inputs(prompt_text=EXAMPLE_Text): \n",
" tokenizer = get_tokenizer(model_name_or_path, cache_dir)\n",
" encodings_dict = tokenizer.batch_encode_plus(prompt_text, padding=True)\n",
"\n",
" input_ids = torch.tensor(encodings_dict['input_ids'], dtype=torch.int64)\n",
" attention_mask = torch.tensor(encodings_dict['attention_mask'], dtype=torch.float32)\n",
" position_ids = (attention_mask.long().cumsum(-1) - 1)\n",
" position_ids.masked_fill_(position_ids < 0, 0)\n",
"\n",
" #Empty Past State for generating first word\n",
" empty_past = []\n",
" batch_size = input_ids.size(0)\n",
" sequence_length = input_ids.size(1)\n",
" past_shape = [2, batch_size, num_attention_heads, 0, hidden_size // num_attention_heads]\n",
" for i in range(num_layer):\n",
" empty_past.append(torch.empty(past_shape).type(torch.float32).to(device))\n",
" \n",
" return input_ids, attention_mask, position_ids, empty_past\n",
"\n",
"input_ids, attention_mask, position_ids, empty_past = get_example_inputs()\n",
"beam_select_idx = torch.zeros([1, input_ids.shape[0]]).long()\n",
"input_log_probs = torch.zeros([input_ids.shape[0], 1])\n",
"input_unfinished_sents = torch.ones([input_ids.shape[0], 1], dtype=torch.bool)\n",
"prev_step_scores = torch.zeros([input_ids.shape[0], 1])\n",
"\n",
"onnx_model_path = \"gpt2_one_step_search.onnx\"\n",
"session = onnxruntime.InferenceSession(onnx_model_path)\n",
"ort_inputs = {\n",
" 'input_ids': numpy.ascontiguousarray(input_ids.cpu().numpy()),\n",
" 'attention_mask' : numpy.ascontiguousarray(attention_mask.cpu().numpy()),\n",
" 'position_ids': numpy.ascontiguousarray(position_ids.cpu().numpy()),\n",
" 'beam_select_idx': numpy.ascontiguousarray(beam_select_idx.cpu().numpy()),\n",
" 'input_log_probs': numpy.ascontiguousarray(input_log_probs.cpu().numpy()),\n",
" 'input_unfinished_sents': numpy.ascontiguousarray(input_unfinished_sents.cpu().numpy()),\n",
" 'prev_step_results': numpy.ascontiguousarray(input_ids.cpu().numpy()),\n",
" 'prev_step_scores': numpy.ascontiguousarray(prev_step_scores.cpu().numpy()),\n",
" }\n",
"for i, past_i in enumerate(empty_past):\n",
" ort_inputs[f'past_{i}'] = numpy.ascontiguousarray(past_i.cpu().numpy())\n",
"ort_outputs = session.run(None, ort_inputs)"
]
},
{
"source": [
"## ONNX Runtime Inference with IO Binding ##\n",
"\n",
"To avoid data copy for input and output, ONNX Runtime also supports IO Binding. User could provide some buffer for input and outputs. For GPU inference, the buffer can be in GPU to reduce memory copy between CPU and GPU. This is helpful for high performance inference in GPU. For GPT-2, IO Binding might help the performance when batch size or (past) sequence length is large."
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def inference_with_io_binding(session, config, input_ids, position_ids, attention_mask, past, beam_select_idx, input_log_probs, input_unfinished_sents, prev_step_results, prev_step_scores, step, context_len):\n",
" output_shapes = Gpt2BeamSearchHelper.get_output_shapes(batch_size=1,\n",
" context_len=context_len,\n",
" past_sequence_length=past[0].size(3),\n",
" sequence_length=input_ids.size(1),\n",
" beam_size=4,\n",
" step=step,\n",
" config=config,\n",
" model_class=\"GPT2LMHeadModel_BeamSearchStep\")\n",
" output_buffers = Gpt2BeamSearchHelper.get_output_buffers(output_shapes, device)\n",
"\n",
" io_binding = Gpt2BeamSearchHelper.prepare_io_binding(session, input_ids, position_ids, attention_mask, past, output_buffers, output_shapes, beam_select_idx, input_log_probs, input_unfinished_sents, prev_step_results, prev_step_scores)\n",
" session.run_with_iobinding(io_binding)\n",
"\n",
" outputs = Gpt2BeamSearchHelper.get_outputs_from_io_binding_buffer(session, output_buffers, output_shapes, return_numpy=False)\n",
" return outputs"
]
},
{
"source": [
"We can see that the result is exactly same with/without IO Binding:"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"IO Binding result is good\n"
]
}
],
"source": [
"input_ids, attention_mask, position_ids, empty_past = get_example_inputs()\n",
"beam_select_idx = torch.zeros([1, input_ids.shape[0]]).long()\n",
"input_log_probs = torch.zeros([input_ids.shape[0], 1])\n",
"input_unfinished_sents = torch.ones([input_ids.shape[0], 1], dtype=torch.bool)\n",
"prev_step_scores = torch.zeros([input_ids.shape[0], 1])\n",
"outputs = inference_with_io_binding(session, config, input_ids, position_ids, attention_mask, empty_past, beam_select_idx, input_log_probs, input_unfinished_sents, input_ids, prev_step_scores, 0, input_ids.shape[-1])\n",
"assert torch.eq(outputs[-2], torch.from_numpy(ort_outputs[-2])).all()\n",
"print(\"IO Binding result is good\")"
]
},
{
"source": [
"## Batch Text Generation ##\n",
"\n",
"Here is an example for text generation using ONNX Runtime with/without IO Binding."
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def update(output, step, batch_size, beam_size, context_length, prev_attention_mask, device):\n",
" \"\"\"\n",
" Update the inputs for next inference.\n",
" \"\"\"\n",
" last_state = (torch.from_numpy(output[0]).to(device)\n",
" if isinstance(output[0], numpy.ndarray) else output[0].clone().detach().cpu())\n",
"\n",
" input_ids = last_state.view(batch_size * beam_size, -1).to(device)\n",
"\n",
" input_unfinished_sents_id = -3\n",
" prev_step_results = (torch.from_numpy(output[-2]).to(device) if isinstance(output[-2], numpy.ndarray)\n",
" else output[-2].clone().detach().to(device))\n",
" position_ids = (torch.tensor([context_length + step - 1\n",
" ]).unsqueeze(0).repeat(batch_size * beam_size, 1).to(device))\n",
"\n",
" if prev_attention_mask.shape[0] != (batch_size * beam_size):\n",
" prev_attention_mask = prev_attention_mask.repeat(batch_size * beam_size, 1)\n",
" attention_mask = torch.cat(\n",
" [\n",
" prev_attention_mask,\n",
" torch.ones([batch_size * beam_size, 1]).type_as(prev_attention_mask),\n",
" ],\n",
" 1,\n",
" ).to(device)\n",
"\n",
" beam_select_idx = (torch.from_numpy(output[input_unfinished_sents_id - 2]).to(device) if isinstance(\n",
" output[input_unfinished_sents_id - 2], numpy.ndarray) else output[input_unfinished_sents_id - 2].clone().detach().to(device))\n",
" input_log_probs = (torch.from_numpy(output[input_unfinished_sents_id - 1]).to(device) if isinstance(\n",
" output[input_unfinished_sents_id - 1], numpy.ndarray) else output[input_unfinished_sents_id - 1].clone().detach().to(device))\n",
" input_unfinished_sents = (torch.from_numpy(output[input_unfinished_sents_id]).to(device) if isinstance(\n",
" output[input_unfinished_sents_id], numpy.ndarray) else\n",
" output[input_unfinished_sents_id].clone().detach().to(device))\n",
" prev_step_scores = (torch.from_numpy(output[-1]).to(device)\n",
" if isinstance(output[-1], numpy.ndarray) else output[-1].clone().detach().to(device))\n",
"\n",
" past = []\n",
" if isinstance(output[1], tuple): # past in torch output is tuple\n",
" past = list(output[1])\n",
" else:\n",
" for i in range(model.config.n_layer):\n",
" past_i = (torch.from_numpy(output[i + 1])\n",
" if isinstance(output[i + 1], numpy.ndarray) else output[i + 1].clone().detach())\n",
" past.append(past_i.to(device)) \n",
"\n",
" inputs = {\n",
" 'input_ids': input_ids,\n",
" 'attention_mask' : attention_mask,\n",
" 'position_ids': position_ids,\n",
" 'beam_select_idx': beam_select_idx,\n",
" 'input_log_probs': input_log_probs,\n",
" 'input_unfinished_sents': input_unfinished_sents,\n",
" 'prev_step_results': prev_step_results,\n",
" 'prev_step_scores': prev_step_scores,\n",
" }\n",
" ort_inputs = {\n",
" 'input_ids': numpy.ascontiguousarray(input_ids.cpu().numpy()),\n",
" 'attention_mask' : numpy.ascontiguousarray(attention_mask.cpu().numpy()),\n",
" 'position_ids': numpy.ascontiguousarray(position_ids.cpu().numpy()),\n",
" 'beam_select_idx': numpy.ascontiguousarray(beam_select_idx.cpu().numpy()),\n",
" 'input_log_probs': numpy.ascontiguousarray(input_log_probs.cpu().numpy()),\n",
" 'input_unfinished_sents': numpy.ascontiguousarray(input_unfinished_sents.cpu().numpy()),\n",
" 'prev_step_results': numpy.ascontiguousarray(prev_step_results.cpu().numpy()),\n",
" 'prev_step_scores': numpy.ascontiguousarray(prev_step_scores.cpu().numpy()),\n",
" }\n",
" for i, past_i in enumerate(past):\n",
" ort_inputs[f'past_{i}'] = numpy.ascontiguousarray(past_i.cpu().numpy())\n",
" \n",
" return inputs, ort_inputs, past\n",
"\n",
"def test_generation(tokenizer, input_text, use_onnxruntime_io, ort_session = None, num_tokens_to_produce = 30):\n",
" print(\"Text generation using\", \"OnnxRuntime with IO binding\" if use_onnxruntime_io else \"OnnxRuntime\", \"...\") \n",
" input_ids, attention_mask, position_ids, past = get_example_inputs(input_text)\n",
" beam_select_idx = torch.zeros([1, input_ids.shape[0]]).long()\n",
" input_log_probs = torch.zeros([input_ids.shape[0], 1])\n",
" input_unfinished_sents = torch.ones([input_ids.shape[0], 1], dtype=torch.bool)\n",
" prev_step_scores = torch.zeros([input_ids.shape[0], 1])\n",
" inputs = {\n",
" 'input_ids': input_ids,\n",
" 'attention_mask' : attention_mask,\n",
" 'position_ids': position_ids,\n",
" 'beam_select_idx': beam_select_idx,\n",
" 'input_log_probs': input_log_probs,\n",
" 'input_unfinished_sents': input_unfinished_sents,\n",
" 'prev_step_results': input_ids,\n",
" 'prev_step_scores': prev_step_scores,\n",
" }\n",
" ort_inputs = {\n",
" 'input_ids': numpy.ascontiguousarray(input_ids.cpu().numpy()),\n",
" 'attention_mask' : numpy.ascontiguousarray(attention_mask.cpu().numpy()),\n",
" 'position_ids': numpy.ascontiguousarray(position_ids.cpu().numpy()),\n",
" 'beam_select_idx': numpy.ascontiguousarray(beam_select_idx.cpu().numpy()),\n",
" 'input_log_probs': numpy.ascontiguousarray(input_log_probs.cpu().numpy()),\n",
" 'input_unfinished_sents': numpy.ascontiguousarray(input_unfinished_sents.cpu().numpy()),\n",
" 'prev_step_results': numpy.ascontiguousarray(input_ids.cpu().numpy()),\n",
" 'prev_step_scores': numpy.ascontiguousarray(prev_step_scores.cpu().numpy()),\n",
" }\n",
" for i, past_i in enumerate(past):\n",
" ort_inputs[f'past_{i}'] = numpy.ascontiguousarray(past_i.cpu().numpy())\n",
" batch_size = input_ids.size(0)\n",
" beam_size = 4\n",
" context_length = input_ids.size(-1)\n",
"\n",
" for step in range(num_tokens_to_produce):\n",
" if use_onnxruntime_io:\n",
" outputs = inference_with_io_binding(ort_session, config, inputs['input_ids'], inputs['position_ids'], inputs['attention_mask'], past, inputs['beam_select_idx'], inputs['input_log_probs'], inputs['input_unfinished_sents'], inputs['prev_step_results'], inputs['prev_step_scores'], step, context_length)\n",
" else:\n",
" outputs = ort_session.run(None, ort_inputs) \n",
" inputs, ort_inputs, past = update(outputs, step, batch_size, beam_size, context_length, inputs['attention_mask'], device)\n",
"\n",
" if not inputs['input_unfinished_sents'].any():\n",
" break\n",
"\n",
" print(\"------------\")\n",
" print(tokenizer.decode(inputs['prev_step_results'][0], skip_special_tokens=True))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Text generation using OnnxRuntime ...\n",
"------------\n",
"best hotel in bay area.\n",
"\n",
"\"It's a great place to stay,\" he said. \"It's a great place to live. It's a great place to work\n"
]
}
],
"source": [
"tokenizer = get_tokenizer(model_name_or_path, cache_dir)\n",
"input_text = EXAMPLE_Text\n",
"test_generation(tokenizer, input_text, use_onnxruntime_io=False, ort_session=session)"
]
},
{
"source": [
"Next, we use ONNX Runtime with IO binding to run again and we can see that the result is exactly same."
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Text generation using OnnxRuntime with IO binding ...\n",
"------------\n",
"best hotel in bay area.\n",
"\n",
"\"It's a great place to stay,\" he said. \"It's a great place to live. It's a great place to work\n"
]
}
],
"source": [
"test_generation(tokenizer, input_text, use_onnxruntime_io=True, ort_session=session)"
]
}
]
}

View file

@ -49,43 +49,6 @@ class TestGpt2(unittest.TestCase):
def test_gpt2_int8(self):
    """Invoke the GPT-2 benchmark helper with int8 precision arguments."""
    arguments = "-m gpt2 --precision int8 -o -b 1 --sequence_lengths 2 -s 3"
    self.run_benchmark_gpt2(arguments)
@pytest.mark.slow
def test_gpt2_beam_search_step_fp32(self):
    """Invoke the benchmark helper for GPT2LMHeadModel_BeamSearchStep at fp32 precision."""
    arguments = (
        "-m gpt2 --model_class=GPT2LMHeadModel_BeamSearchStep --precision fp32 -v -b 1 --sequence_lengths 5 -s 3"
    )
    self.run_benchmark_gpt2(arguments)
# @pytest.mark.slow
# def test_gpt2_beam_search_step_fp16(self):
# if self.test_cuda:
# self.run_benchmark_gpt2(
# '-m gpt2 --model_class=GPT2LMHeadModel_BeamSearchStep --precision fp16 -o -b 1 --sequence_lengths 5 -s 3 --use_gpu')
@pytest.mark.slow
def test_gpt2_beam_search_step_int8(self):
    """Invoke the benchmark helper for GPT2LMHeadModel_BeamSearchStep at int8 precision."""
    arguments = (
        "-m gpt2 --model_class=GPT2LMHeadModel_BeamSearchStep --precision int8 -o -b 1 --sequence_lengths 5 -s 3"
    )
    self.run_benchmark_gpt2(arguments)
@pytest.mark.slow
def test_gpt2_configurable_one_step_search_fp32(self):
    """Invoke the benchmark helper for GPT2LMHeadModel_ConfigurableOneStepSearch at fp32 precision."""
    # NOTE(review): this passes --use_gpu without any CUDA availability check
    # in this method — confirm that is intended on CPU-only hosts.
    arguments = (
        "-m gpt2 --model_class=GPT2LMHeadModel_ConfigurableOneStepSearch --precision fp32 -v -b 1 --sequence_lengths 5 --past_sequence_lengths 3 --use_gpu"
    )
    self.run_benchmark_gpt2(arguments)
# @pytest.mark.slow
# def test_gpt2_configurable_one_step_search_fp16(self):
# if self.test_cuda:
# self.run_benchmark_gpt2(
# "-m gpt2 --model_class=GPT2LMHeadModel_ConfigurableOneStepSearch --precision fp16 -o -b 1 --sequence_lengths 5 -s 3 --use_gpu"
# )
@pytest.mark.slow
def test_gpt2_configurable_one_step_search_int8(self):
    """Invoke the benchmark helper for GPT2LMHeadModel_ConfigurableOneStepSearch at int8 precision."""
    arguments = (
        "-m gpt2 --model_class=GPT2LMHeadModel_ConfigurableOneStepSearch --precision int8 -o -b 1 --sequence_lengths 5 -s 3"
    )
    self.run_benchmark_gpt2(arguments)
if __name__ == "__main__":
coloredlogs.install(fmt="%(message)s")

View file

@ -1,7 +1,6 @@
# packages used by transformers tool test
protobuf==3.18.3
numpy==1.21.6
protobuf==3.20.1
numpy==1.23.5
coloredlogs==15.0
transformers==4.6.1
onnxconverter-common==1.8.1
transformers==4.24.0
psutil