Deprecate one step beam search (#14046)

### Description Deprecate one step beam search since it lacks maintenance (some tests failed) and its performance is not optimal. For users who still need this feature, please use an older version (<=1.13.1) of onnxruntime to export the one step beam search model; the exported model can still run in the latest onnxruntime. It is recommended to use [convert_generation.py](https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/transformers/convert_generation.py) to generate a beam search onnx model for better performance.
2026-06-04 23:59:56 +00:00 · 2022-12-22 23:14:31 -08:00 · 2022-12-22 23:14:31 -08:00 · 8ac264b896
commit 8ac264b896
parent e49f358686
7 changed files with 45 additions and 2259 deletions
--- a/onnxruntime/python/tools/transformers/models/gpt2/benchmark_gpt2.py
+++ b/onnxruntime/python/tools/transformers/models/gpt2/benchmark_gpt2.py
@ -15,8 +15,7 @@ from datetime import datetime

 import psutil
 import torch
-from gpt2_beamsearch_helper import MODEL_CLASSES, Gpt2HelperFactory
-from gpt2_helper import DEFAULT_TOLERANCE, PRETRAINED_GPT2_MODELS, Gpt2Helper
+from gpt2_helper import DEFAULT_TOLERANCE, MODEL_CLASSES, PRETRAINED_GPT2_MODELS, Gpt2Helper
 from packaging import version
 from transformers import AutoConfig

@ -124,12 +123,6 @@ def parse_arguments(argv=None):
    parser.set_defaults(torchscript=False)

    parser.add_argument("-b", "--batch_sizes", nargs="+", type=int, default=[1], help="batch size")
-    parser.add_argument(
-        "--beam_size",
-        type=int,
-        default=4,
-        help="Beam size if greedy/top-p/top-k sampling is needed",
-    )

    parser.add_argument(
        "--sequence_lengths",
@ -170,54 +163,6 @@ def parse_arguments(argv=None):
    parser.add_argument("--disable_io_binding", required=False, action="store_true")
    parser.set_defaults(disable_io_binding=False)

-    search_option_group = parser.add_argument_group("configurable one step search options")
-
-    search_option_group.add_argument(
-        "--ignore_eos",
-        type=bool,
-        default=False,
-        help="If ignore end of sentence token in model inference.",
-    )
-    search_option_group.add_argument(
-        "--repetition_penalty",
-        type=float,
-        default=1,
-        help="Positive. >1 to penalize and <1 to encorage.",
-    )
-    search_option_group.add_argument(
-        "--temperature",
-        type=float,
-        default=1,
-        help="Softmax temperature for output logits.",
-    )
-    search_option_group.add_argument(
-        "--excluded_token_ids",
-        required=False,
-        nargs="+",
-        type=float,
-        help="A list of token ids to be excluded in inference.",
-    )
-    search_option_group.add_argument(
-        "--length_penalty",
-        type=float,
-        default=1,
-        help="Positive. >1 to penalize and <1 to encorage short sentence.",
-    )
-
-    sampling_option_group = parser.add_argument_group("one step sampling options")
-    sampling_option_group.add_argument(
-        "--do_sample",
-        action="store_true",
-        help="If to do sampling instead of beam search or greedy.",
-    )
-    sampling_option_group.add_argument(
-        "--do_sample_top_p",
-        type=float,
-        default=0.95,
-        help="Nuclear/top-p sampling accumulation probability.",
-    )
-    sampling_option_group.add_argument("--do_sample_top_k", type=int, default=0, help="Use top-k if non-zero.")
-
    args = parser.parse_args(argv)

    return args
@ -249,41 +194,9 @@ def main(args):
    prepare_environment(cache_dir, output_dir, args.use_gpu)

    model_class = MODEL_CLASSES[args.model_class][0]
-    if args.model_class == "GPT2LMHeadModel_BeamSearchStep":
-        model_type = "beam_search_step"
-    elif args.model_class == "GPT2LMHeadModel_ConfigurableOneStepSearch":
-        model_type = "configurable_one_step_search"
-    else:
-        model_type = "default"
-
-    gpt2helper = Gpt2HelperFactory.create_helper(model_type)
+    gpt2helper = Gpt2Helper
    config = AutoConfig.from_pretrained(args.model_name_or_path, torchscript=args.torchscript, cache_dir=cache_dir)
-    if model_type == "beam_search_step":
-        model = model_class.from_pretrained(
-            args.model_name_or_path,
-            config=config,
-            batch_size=1,
-            beam_size=args.beam_size,
-            cache_dir=cache_dir,
-        )
-    elif model_type == "configurable_one_step_search":
-        model = model_class.from_pretrained(
-            args.model_name_or_path,
-            config=config,
-            batch_size=1,
-            beam_size=args.beam_size,
-            ignore_eos=args.ignore_eos,
-            temperature=args.temperature,
-            repetition_penalty=args.repetition_penalty,
-            excluded_token_ids=args.excluded_token_ids,
-            length_penalty=args.length_penalty,
-            do_sample=args.do_sample,
-            do_sample_top_p=args.do_sample_top_p,
-            do_sample_top_k=args.do_sample_top_k,
-            cache_dir=cache_dir,
-        )
-    else:
-        model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir)
+    model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir)

    # This scirpt does not support float16 for PyTorch.
    # if args.float16:
@ -352,29 +265,14 @@ def main(args):
        return

    # Allocate output buffers for IO Binding
-    if model_type == "beam_search_step" or model_type == "configurable_one_step_search":
-        max_output_shapes = gpt2helper.get_output_shapes(
-            max(args.batch_sizes),
-            context_len=max(args.past_sequence_lengths),
-            past_sequence_length=max(args.past_sequence_lengths),
-            sequence_length=max(args.sequence_lengths),
-            beam_size=args.beam_size,
-            step=0,
-            config=config,
-            model_class=args.model_class,
-        )
-
-        output_buffers = gpt2helper.get_output_buffers(max_output_shapes, device, args.precision == Precision.FLOAT16)
-
-    else:
-        max_output_shapes = gpt2helper.get_output_shapes(
-            max(args.batch_sizes),
-            max(args.past_sequence_lengths),
-            max(args.sequence_lengths),
-            config,
-            args.model_class,
-        )
-        output_buffers = gpt2helper.get_output_buffers(max_output_shapes, device, args.precision == Precision.FLOAT16)
+    max_output_shapes = gpt2helper.get_output_shapes(
+        max(args.batch_sizes),
+        max(args.past_sequence_lengths),
+        max(args.sequence_lengths),
+        config,
+        args.model_class,
+    )
+    output_buffers = gpt2helper.get_output_buffers(max_output_shapes, device, args.precision == Precision.FLOAT16)

    csv_filename = args.result_csv or "benchmark_result_{}.csv".format(datetime.now().strftime("%Y%m%d-%H%M%S"))
    with open(csv_filename, mode="a", newline="") as csv_file:
@ -402,53 +300,32 @@ def main(args):
                for past_sequence_length in args.past_sequence_lengths:
                    assert batch_size > 0 and sequence_length > 0 and past_sequence_length >= 0
                    logger.debug(
-                        f"Running test for batch_size={batch_size} sequence_length={sequence_length} past_sequence_length={past_sequence_length}..."
+                        "Running test for batch_size=%d sequence_length=%d past_sequence_length=%d ...",
+                        batch_size,
+                        sequence_length,
+                        past_sequence_length,
+                    )
+
+                    dummy_inputs = gpt2helper.get_dummy_inputs(
+                        batch_size,
+                        past_sequence_length,
+                        sequence_length,
+                        config.num_attention_heads,
+                        config.hidden_size,
+                        config.n_layer,
+                        config.vocab_size,
+                        device,
+                        float16=(args.precision == Precision.FLOAT16),
+                        has_position_ids=use_padding,
+                        has_attention_mask=use_padding,
+                    )
+                    output_shapes = gpt2helper.get_output_shapes(
+                        batch_size,
+                        past_sequence_length,
+                        sequence_length,
+                        config,
+                        args.model_class,
                    )
-                    if model_type == "beam_search_step" or model_type == "configurable_one_step_search":
-                        dummy_inputs = gpt2helper.get_dummy_inputs(
-                            batch_size,
-                            past_sequence_length,
-                            sequence_length,
-                            config.num_attention_heads,
-                            config.hidden_size,
-                            config.n_layer,
-                            config.vocab_size,
-                            device,
-                            float16=(args.precision == Precision.FLOAT16),
-                            has_position_ids=use_padding,
-                            has_attention_mask=use_padding,
-                        )
-                        output_shapes = gpt2helper.get_output_shapes(
-                            batch_size,
-                            past_sequence_length,
-                            past_sequence_length,
-                            sequence_length,
-                            args.beam_size,
-                            0,
-                            config,
-                            args.model_class,
-                        )
-                    else:
-                        dummy_inputs = gpt2helper.get_dummy_inputs(
-                            batch_size,
-                            past_sequence_length,
-                            sequence_length,
-                            config.num_attention_heads,
-                            config.hidden_size,
-                            config.n_layer,
-                            config.vocab_size,
-                            device,
-                            float16=(args.precision == Precision.FLOAT16),
-                            has_position_ids=use_padding,
-                            has_attention_mask=use_padding,
-                        )
-                        output_shapes = gpt2helper.get_output_shapes(
-                            batch_size,
-                            past_sequence_length,
-                            sequence_length,
-                            config,
-                            args.model_class,
-                        )

                    try:
                        if args.validate_onnx or args.output_torch_latency:
--- a/onnxruntime/python/tools/transformers/models/gpt2/convert_to_onnx.py
+++ b/onnxruntime/python/tools/transformers/models/gpt2/convert_to_onnx.py
@ -23,9 +23,8 @@ from pathlib import Path

 import numpy
 import torch
-from gpt2_beamsearch_helper import MODEL_CLASSES, Gpt2HelperFactory
-from gpt2_beamsearch_tester import Gpt2TesterFactory
-from gpt2_helper import DEFAULT_TOLERANCE, PRETRAINED_GPT2_MODELS
+from gpt2_helper import DEFAULT_TOLERANCE, MODEL_CLASSES, PRETRAINED_GPT2_MODELS, Gpt2Helper
+from gpt2_tester import Gpt2Tester
 from packaging import version
 from transformers import AutoConfig

@ -174,61 +173,6 @@ def parse_arguments(argv=None):
        "Note that we will optimize 1 and 2 differently for best performance.",
    )

-    parser.add_argument(
-        "--beam_size",
-        type=int,
-        default=4,
-        help="Beam size if greedy/top-p/top-k sampling is needed",
-    )
-
-    search_option_group = parser.add_argument_group("configurable one step search options")
-
-    search_option_group.add_argument(
-        "--ignore_eos",
-        type=bool,
-        default=False,
-        help="If ignore end of sentence token in model inference.",
-    )
-    search_option_group.add_argument(
-        "--repetition_penalty",
-        type=float,
-        default=1,
-        help="Positive. >1 to penalize and <1 to encourage.",
-    )
-    search_option_group.add_argument(
-        "--temperature",
-        type=float,
-        default=1,
-        help="Softmax temperature for output logits.",
-    )
-    search_option_group.add_argument(
-        "--excluded_token_ids",
-        required=False,
-        nargs="+",
-        type=float,
-        help="A list of token ids to be excluded in inference.",
-    )
-    search_option_group.add_argument(
-        "--length_penalty",
-        type=float,
-        default=1,
-        help="Positive. >1 to penalize and <1 to encourage short sentence.",
-    )
-
-    sampling_option_group = parser.add_argument_group("one step sampling options")
-    sampling_option_group.add_argument(
-        "--do_sample",
-        action="store_true",
-        help="If to do sampling instead of beam search or greedy.",
-    )
-    sampling_option_group.add_argument(
-        "--do_sample_top_p",
-        type=float,
-        default=0.95,
-        help="Nuclear/top-p sampling accumulation probability.",
-    )
-    sampling_option_group.add_argument("--do_sample_top_k", type=int, default=0, help="Use top-k if non-zero.")
-
    fp16_option_group = parser.add_argument_group(
        'float to float16 conversion parameters that works when "--precision fp16" is specified'
    )
@ -334,48 +278,15 @@ def main(argv=None, experiment_name: str = "", run_id: str = "0", csv_filename:
    model_class = MODEL_CLASSES[args.model_class][0]
    use_padding = MODEL_CLASSES[args.model_class][2]

-    if args.model_class == "GPT2LMHeadModel_BeamSearchStep":
-        model_type = "beam_search_step"
-    elif args.model_class == "GPT2LMHeadModel_ConfigurableOneStepSearch":
-        model_type = "configurable_one_step_search"
-    else:
-        model_type = "default"
-
-    gpt2helper = Gpt2HelperFactory.create_helper(model_type)
-    gpt2tester = Gpt2TesterFactory.create_tester(model_type)
+    gpt2helper = Gpt2Helper
    config = AutoConfig.from_pretrained(args.model_name_or_path, cache_dir=cache_dir)
-    if model_type == "beam_search_step":
-        model = model_class.from_pretrained(
-            args.model_name_or_path,
-            config=config,
-            batch_size=1,
-            beam_size=args.beam_size,
-            cache_dir=cache_dir,
-        )
-    elif model_type == "configurable_one_step_search":
-        model = model_class.from_pretrained(
-            args.model_name_or_path,
-            config=config,
-            batch_size=1,
-            beam_size=args.beam_size,
-            ignore_eos=args.ignore_eos,
-            temperature=args.temperature,
-            repetition_penalty=args.repetition_penalty,
-            excluded_token_ids=args.excluded_token_ids,
-            length_penalty=args.length_penalty,
-            do_sample=args.do_sample,
-            do_sample_top_p=args.do_sample_top_p,
-            do_sample_top_k=args.do_sample_top_k,
-            cache_dir=cache_dir,
-        )
-    else:
-        model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir)
+    model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir)

    device = torch.device("cuda:0" if args.use_gpu else "cpu")
    model.eval().to(device)

    if (not args.use_external_data_format) and (config.n_layer > 24):
-        logger.info(f"Try --use_external_data_format when model size > 2GB")
+        logger.info("Try --use_external_data_format when model size > 2GB")

    onnx_model_paths = gpt2helper.get_onnx_paths(
        output_dir,
@ -628,22 +539,9 @@ def main(argv=None, experiment_name: str = "", run_id: str = "0", csv_filename:
                else:
                    inputs = {"input_ids": input_ids.to(int_data_type)}

-                if model_type == "beam_search_step" or model_type == "configurable_one_step_search":
-                    beam_select_idx = torch.zeros([1, input_ids.shape[0]]).long()
-
-                    input_log_probs = torch.zeros([input_ids.shape[0], 1])
-                    input_unfinished_sents = torch.ones([input_ids.shape[0], 1], dtype=torch.bool)
-                    inputs.update(
-                        {
-                            "beam_select_idx": beam_select_idx,
-                            "input_log_probs": input_log_probs,
-                            "input_unfinished_sents": input_unfinished_sents,
-                        }
-                    )
-
                test_inputs.append(inputs)

-        gpt2tester.test_generation(
+        Gpt2Tester.test_generation(
            session,
            model,
            device,
--- a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_beamsearch_helper.py
+++ b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_beamsearch_helper.py
--- a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_beamsearch_tester.py
+++ b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_beamsearch_tester.py
@ -1,442 +0,0 @@
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation.  All rights reserved.
-# Licensed under the MIT License.  See License.txt in the project root for
-# license information.
-# --------------------------------------------------------------------------
-# This script helps evaluation of GPT-2 model.
-import logging
-import os
-import sys
-import timeit
-
-import numpy
-import torch
-from gpt2_beamsearch_helper import Gpt2BeamSearchHelper, Gpt2BeamSearchInputs
-from gpt2_tester import Gpt2Metric, Gpt2Tester
-
-sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
-
-from benchmark_helper import Precision
-
-logger = logging.getLogger(__name__)
-
-
-class Gpt2TesterFactory:
-    @staticmethod
-    def create_tester(tester_type="default"):
-        testers = {
-            "default": Gpt2Tester,
-            "beam_search_step": Gpt2BeamSearchTester,
-            "configurable_one_step_search": Gpt2BeamSearchTester,
-        }
-        w = testers[tester_type]
-        return w
-
-
-class Gpt2BeamSearchTester(Gpt2Tester):
-    def __init__(
-        self,
-        input_ids,
-        position_ids,
-        attention_mask,
-        beam_select_idx,
-        input_log_probs,
-        input_unfinished_sents,
-        prev_step_results,
-        prev_step_scores,
-        num_attention_heads,
-        hidden_size,
-        num_layer,
-        beam_size,
-        device,
-        is_fp16=False,
-        top_k=20,
-        top_k_required_order=False,
-    ):
-        super().__init__(
-            input_ids,
-            position_ids,
-            attention_mask,
-            num_attention_heads=num_attention_heads,
-            hidden_size=hidden_size,
-            num_layer=num_layer,
-            device=device,
-            is_fp16=is_fp16,
-            top_k=top_k,
-            top_k_required_order=top_k_required_order,
-        )
-        self.input_length = input_ids.shape[-1]
-        self.n_layer = num_layer
-        self.beam_size = beam_size
-
-        self.beam_select_idx = beam_select_idx.to(device)
-
-        float_type = torch.float16 if is_fp16 else torch.float32
-        self.input_log_probs = input_log_probs.type(float_type).to(device)
-        self.input_unfinished_sents = input_unfinished_sents.to(device)
-
-        self.prev_step_results = prev_step_results.to(device) if prev_step_results is not None else None
-        self.prev_step_scores = prev_step_scores.type(float_type).to(device)
-
-        self.last_state = None
-
-    def get_inputs(self) -> Gpt2BeamSearchInputs:
-        return Gpt2BeamSearchInputs(
-            self.input_ids,
-            self.past,
-            self.position_ids,
-            self.attention_mask,
-            self.beam_select_idx,
-            self.input_log_probs,
-            self.input_unfinished_sents,
-            self.prev_step_results,
-            self.prev_step_scores,
-        )
-
-    def update(self, output, step, device):
-        """
-        Update the inputs for next inference.
-        """
-        self.last_state = (
-            torch.from_numpy(output[0]).to(device)
-            if isinstance(output[0], numpy.ndarray)
-            else output[0].clone().detach().cpu()
-        )
-
-        self.input_ids = self.last_state.view(self.batch_size * self.beam_size, -1).to(device)
-
-        if self.position_ids is not None:
-            input_unfinished_sents_id = -3
-            self.prev_step_results = (
-                torch.from_numpy(output[-2]).to(device)
-                if isinstance(output[-2], numpy.ndarray)
-                else output[-2].clone().detach().to(device)
-            )
-            self.position_ids = (
-                torch.tensor([self.input_length + step - 1])
-                .unsqueeze(0)
-                .repeat(self.batch_size * self.beam_size, 1)
-                .to(device)
-            )
-
-            if self.attention_mask.size(0) != (self.batch_size * self.beam_size):
-                self.attention_mask = self.attention_mask.repeat(self.batch_size * self.beam_size, 1)
-            self.attention_mask = torch.cat(
-                [
-                    self.attention_mask,
-                    torch.ones([self.batch_size * self.beam_size, 1]).type_as(self.attention_mask),
-                ],
-                1,
-            ).to(device)
-        else:
-            input_unfinished_sents_id = -2
-
-        self.beam_select_idx = (
-            torch.from_numpy(output[input_unfinished_sents_id - 2]).to(device)
-            if isinstance(output[input_unfinished_sents_id - 2], numpy.ndarray)
-            else output[input_unfinished_sents_id - 2].clone().detach().to(device)
-        )
-        self.input_log_probs = (
-            torch.from_numpy(output[input_unfinished_sents_id - 1]).to(device)
-            if isinstance(output[input_unfinished_sents_id - 1], numpy.ndarray)
-            else output[input_unfinished_sents_id - 1].clone().detach().to(device)
-        )
-        self.input_unfinished_sents = (
-            torch.from_numpy(output[input_unfinished_sents_id]).to(device)
-            if isinstance(output[input_unfinished_sents_id], numpy.ndarray)
-            else output[input_unfinished_sents_id].clone().detach().to(device)
-        )
-        self.prev_step_scores = (
-            torch.from_numpy(output[-1]).to(device)
-            if isinstance(output[-1], numpy.ndarray)
-            else output[-1].clone().detach().to(device)
-        )
-        self.top_1_tokens = self.input_ids[0]
-        self.top_k_tokens = self.last_state
-
-        self.past = []
-
-        if isinstance(output[1], tuple):  # past in torch output is tuple
-            self.past = list(output[1])
-        else:
-            for i in range(self.n_layer):
-                past_i = (
-                    torch.from_numpy(output[i + 1])
-                    if isinstance(output[i + 1], numpy.ndarray)
-                    else output[i + 1].clone().detach()
-                )
-                self.past.append(past_i.to(device))
-
-    @staticmethod
-    def test_generation(
-        session,
-        model,
-        device,
-        test_inputs,
-        precision=Precision.FLOAT32,
-        model_class="GPT2LMHeadModel_BeamSearchStep",
-        top_k=20,
-        top_k_no_order=True,
-        max_steps=24,
-        max_inputs=0,
-        verbose=False,
-        save_test_data=0,
-        save_test_data_dir=".",
-    ):
-        """
-        Test Generation using beam search to compare PyTorch and ONNX model.
-        It will print top 1 and top k errors on the given test inputs.
-        """
-        print(
-            f"start test generation: (top_k={top_k} top_k_no_order={top_k_no_order} max_steps={max_steps} test_inputs={len(test_inputs)} max_inputs={max_inputs})"
-        )
-        n_layer = model.config.n_layer
-        n_head = model.config.n_head
-        n_embd = model.config.n_embd
-        beam_size = model.config.beam_size
-        eos_token_id = model.config.eos_token_id
-        test_data_saved = 0
-
-        is_float16 = precision == Precision.FLOAT16
-
-        # We will still use fp32 torch model as baseline when onnx model if fp16
-        model.eval().to(device)
-
-        # Allocate initial buffers for IO Binding of ONNX Runtimne. The buffer size will automatically increase later.
-        init_output_shapes = Gpt2BeamSearchHelper.get_output_shapes(
-            batch_size=4,
-            context_len=128,
-            past_sequence_length=128,
-            sequence_length=32,
-            beam_size=1,
-            step=0,
-            config=model.config,
-            model_class=model_class,
-        )
-        output_buffers = Gpt2BeamSearchHelper.get_output_buffers(
-            init_output_shapes,
-            device,
-            is_float16=is_float16,
-        )
-
-        baseline_name = "Torch"
-        treatment_name = "Quantized Onnx" if precision == Precision.INT8 else "Onnx"
-        torch_metric = Gpt2Metric(baseline_name, baseline_name, top_k)
-        onnx_metric = Gpt2Metric(treatment_name, baseline_name, top_k)
-        onnx_io_metric = Gpt2Metric(treatment_name + " with IO Binding", baseline_name, top_k)
-
-        for i, inputs in enumerate(test_inputs):
-            if max_inputs > 0 and i == max_inputs:
-                break
-            if i % 10 == 0:
-                print(f"{i}")
-            input_ids = inputs["input_ids"]
-            position_ids = inputs["position_ids"] if "position_ids" in inputs else None
-            attention_mask = inputs["attention_mask"] if "attention_mask" in inputs else None
-            beam_select_idx = inputs["beam_select_idx"] if "beam_select_idx" in inputs else None
-            input_log_probs = inputs["input_log_probs"] if "input_log_probs" in inputs else None
-            input_unfinished_sents = inputs["input_unfinished_sents"]
-            if model_class == "GPT2LMHeadModel_BeamSearchStep":
-                prev_step_results = inputs["input_ids"]
-            else:
-                prev_step_results = None
-
-            if "prev_step_scores" in inputs:
-                prev_step_scores = inputs["prev_step_scores"]
-            else:
-                prev_step_scores = torch.zeros([input_ids.shape[0], 1])
-
-            onnx_runner = Gpt2BeamSearchTester(
-                input_ids,
-                position_ids,
-                attention_mask,
-                beam_select_idx,
-                input_log_probs,
-                input_unfinished_sents,
-                prev_step_results,
-                prev_step_scores,
-                n_head,
-                n_embd,
-                n_layer,
-                beam_size,
-                device,
-                is_float16,
-                top_k,
-                not top_k_no_order,
-            )
-            onnx_io_runner = Gpt2BeamSearchTester(
-                input_ids,
-                position_ids,
-                attention_mask,
-                beam_select_idx,
-                input_log_probs,
-                input_unfinished_sents,
-                prev_step_results,
-                prev_step_scores,
-                n_head,
-                n_embd,
-                n_layer,
-                beam_size,
-                device,
-                is_float16,
-                top_k,
-                not top_k_no_order,
-            )
-            torch_runner = Gpt2BeamSearchTester(
-                input_ids,
-                position_ids,
-                attention_mask,
-                beam_select_idx,
-                input_log_probs,
-                input_unfinished_sents,
-                prev_step_results,
-                prev_step_scores,
-                n_head,
-                n_embd,
-                n_layer,
-                beam_size,
-                device,
-                False,
-                top_k,
-                not top_k_no_order,
-            )  # Torch model baseline is fp32
-
-            batch_size = torch_runner.batch_size
-            onnx_metric.start_batch(batch_size)
-            onnx_io_metric.start_batch(batch_size)
-            context_len = list(onnx_runner.input_ids.size())[-1]
-            with torch.no_grad():
-                for step in range(max_steps):
-                    print(f"Processing step: {step}")
-                    if model_class == "GPT2LMHeadModel_BeamSearchStep":
-                        num_seq = beam_size
-                        seq_len = list(onnx_runner.input_ids.size())[1]
-                        past_seq_len = list(onnx_runner.past[0].size())[3]
-                    else:
-                        num_seq = sum(onnx_io_runner.input_unfinished_sents.view(-1).long().cpu())
-                        past_seq_len = list(onnx_runner.past[0].size())[3]
-                        seq_len = list(onnx_runner.input_ids.size())[-1] - past_seq_len
-
-                    start_time = timeit.default_timer()
-                    pytorch_output = Gpt2BeamSearchHelper.pytorch_inference(model, torch_runner.get_inputs())
-                    torch_metric.add_latency(past_seq_len, timeit.default_timer() - start_time)
-                    torch_runner.update(pytorch_output, step, device)
-
-                    (
-                        onnx_output,
-                        avg_latency_ms,
-                    ) = Gpt2BeamSearchHelper.onnxruntime_inference(session, onnx_runner.get_inputs(), total_runs=1)
-                    onnx_metric.add_latency(past_seq_len, avg_latency_ms / 1000.0)
-                    onnx_runner.update(onnx_output, step, device)
-
-                    if model_class == "GPT2LMHeadModel_BeamSearchStep":
-                        num_seq = beam_size
-                    else:
-                        num_seq = sum(onnx_io_runner.input_unfinished_sents.view(-1).long().cpu())
-
-                    output_shapes = Gpt2BeamSearchHelper.get_output_shapes(
-                        batch_size,
-                        context_len,
-                        past_seq_len,
-                        seq_len,
-                        beam_size,
-                        step,
-                        model.config,
-                        model_class=model_class,
-                        num_seq=num_seq,
-                    )
-
-                    Gpt2BeamSearchHelper.auto_increase_buffer_size(output_buffers, output_shapes)
-
-                    (onnx_io_output, avg_latency_ms,) = Gpt2BeamSearchHelper.onnxruntime_inference_with_binded_io(
-                        session,
-                        onnx_io_runner.get_inputs(),
-                        output_buffers,
-                        output_shapes,
-                        total_runs=1,
-                        return_numpy=False,
-                        include_copy_output_latency=True,
-                    )
-
-                    onnx_io_metric.add_latency(past_seq_len, avg_latency_ms / 1000.0)
-
-                    if test_data_saved < save_test_data:
-                        onnx_io_runner.save_test_data(session, onnx_io_output, save_test_data_dir, test_data_saved)
-                        test_data_saved += 1
-
-                    onnx_io_runner.update(onnx_io_output, step, device)
-
-                    if (not onnx_runner.input_unfinished_sents.any()) or (
-                        not torch_runner.input_unfinished_sents.any()
-                    ):
-                        print("break at step: ", step)
-                        break
-
-            print(f"Totally {step+1} steps run")
-            onnx_metric.end_batch()
-            onnx_io_metric.end_batch()
-
-        torch_metric.print()
-        onnx_metric.print()
-        onnx_io_metric.print()
-
-        print("\tONNX")
-        if model_class == "GPT2LMHeadModel_BeamSearchStep":
-            results_onnx = onnx_runner.prev_step_results.view(batch_size * beam_size, -1)
-            results_onnx_io = onnx_io_runner.prev_step_results.view(batch_size * beam_size, -1)
-        else:
-            results_onnx = onnx_runner.input_ids.view(batch_size * beam_size, -1)
-            results_onnx_io = onnx_io_runner.input_ids.view(batch_size * beam_size, -1)
-        Gpt2BeamSearchTester.pprint_results(
-            results_onnx,
-            onnx_runner.prev_step_scores.view(batch_size * beam_size, -1),
-            pad_token_id=eos_token_id,
-            eos_token_id=eos_token_id,
-        )
-        print("\tONNX with IO binding")
-        Gpt2BeamSearchTester.pprint_results(
-            results_onnx_io,
-            onnx_io_runner.prev_step_scores.view(batch_size * beam_size, -1),
-            pad_token_id=eos_token_id,
-            eos_token_id=eos_token_id,
-        )
-
-    @staticmethod
-    def pprint_results(
-        output_ids,
-        output_scores,
-        pad_token_id=None,
-        eos_token_id=None,
-    ):
-        """
-        Print test generation results.
-        """
-        if pad_token_id is None:
-            pad_token_id = 1
-        if eos_token_id is None:
-            eos_token_id = 1
-        if torch.is_tensor(output_ids):
-            output_ids = output_ids.cpu().numpy()
-
-        for i, sample in enumerate(output_ids):
-            for j, seq in enumerate(sample):
-                if isinstance(seq, numpy.ndarray) or isinstance(seq, list):
-                    # remove left padding
-                    for k, t in enumerate(seq):
-                        if t != pad_token_id:
-                            seq = seq[k:]
-                            break
-                    # remove EOS
-                    for k, t in enumerate(seq):
-                        if t == eos_token_id:
-                            seq = seq[: k + 1]
-                            break
-                    print("-" * 40)
-                    result = ",".join([str(token_id) for token_id in sample])
-                    print(f">> Output {j + 1}: \t{[result]}")
-                else:
-                    result = ",".join([str(token_id) for token_id in sample])
-                    print(f">> Output {i}: \t{result}")
-                    print(f">> Scores {i}: \t{output_scores[i]}")
-                    break
-            print("=" * 80)
--- a/onnxruntime/python/tools/transformers/notebooks/Inference_GPT2-OneStepSearch_OnnxRuntime_CPU.ipynb
+++ b/onnxruntime/python/tools/transformers/notebooks/Inference_GPT2-OneStepSearch_OnnxRuntime_CPU.ipynb
@ -1,491 +0,0 @@
-{
- "metadata": {
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.0"
-  },
-  "orig_nbformat": 2,
-  "kernelspec": {
-   "name": "python370jvsc74a57bd081098997110362167705b61d21e46dda767ff2050d805c22b6ba90fec7e1aa35",
-   "display_name": "Python 3.7.0 64-bit ('py37athena': conda)"
-  },
-  "metadata": {
-   "interpreter": {
-    "hash": "81098997110362167705b61d21e46dda767ff2050d805c22b6ba90fec7e1aa35"
-   }
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2,
- "cells": [
-  {
-   "source": [
-    "Copyright (c) Microsoft Corporation. All rights reserved.\n",
-    "Licensed under the MIT License."
-   ],
-   "cell_type": "markdown",
-   "metadata": {}
-  },
-  {
-   "source": [
-    "# Optimizing runtime performance on GPT-2 model inference with ONNXRuntime on CPU\n",
-    "\n",
-    "In this tutorial, you'll be introduced to how to load a GPT2 model from PyTorch, convert it to ONNX with one step search, and inference it using ONNX Runtime with/without IO Binding. GPT-2 model inference is optimized by compiling one-step beam search into the onnx compute graph, which speeds up the runtime significantly. "
-   ],
-   "cell_type": "markdown",
-   "metadata": {}
-  },
-  {
-   "source": [
-    "## Prerequisites\n",
-    "If you have Jupyter Notebook, you may directly run this notebook. We will use pip to install or upgrade [PyTorch](https://pytorch.org/), [OnnxRuntime](https://microsoft.github.io/onnxruntime/) and other required packages.\n",
-    "\n",
-    "Otherwise, you can setup a new environment. First, we install [Anaconda](https://www.anaconda.com/distribution/). Then open an AnaConda prompt window and run the following commands:\n",
-    "\n",
-    "```console\n",
-    "conda create -n cpu_env python=3.8\n",
-    "conda activate cpu_env\n",
-    "conda install jupyter\n",
-    "jupyter notebook\n",
-    "```\n",
-    "\n",
-    "The last command will launch Jupyter Notebook and we can open this notebook in browser to continue."
-   ],
-   "cell_type": "markdown",
-   "metadata": {}
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Install PyTorch 1.7.0 and OnnxRuntime 1.7.0 for CPU-only.\n",
-    "import sys\n",
-    "if sys.platform == 'darwin': # Mac\n",
-    "    !{sys.executable} -m pip install --upgrade torch torchvision\n",
-    "else:\n",
-    "    !{sys.executable} -m pip install --upgrade torch==1.7.0+cpu torchvision==0.8.1+cpu -f https://download.pytorch.org/whl/torch_stable.html\n",
-    "!{sys.executable} -m pip install onnxruntime==1.7.2\n",
-    "\n",
-    "# Install other packages used in this notebook.\n",
-    "!{sys.executable} -m pip install transformers==4.3.1\n",
-    "!{sys.executable} -m pip install onnx onnxconverter_common psutil pytz pandas py-cpuinfo py3nvml"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "\n",
-    "# Create a cache directory to store pretrained model.\n",
-    "cache_dir = os.path.join(\".\", \"cache_models\")\n",
-    "if not os.path.exists(cache_dir):\n",
-    "    os.makedirs(cache_dir)"
-   ]
-  },
-  {
-   "source": [
-    "## Convert GPT2 model from PyTorch to ONNX with one step search ##\n",
-    "\n",
-    "We have a script [convert_to_onnx.py](https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/transformers/convert_to_onnx.py) that could help you to convert GPT2 with past state to ONNX. \n",
-    "\n",
-    "The script accepts a pretrained model name or path of a checkpoint directory as input, and converts the model to ONNX. It also verifies that the ONNX model could generate same input as the pytorch model. The usage is like \n",
-    "```\n",
-    "python -m onnxruntime.transformers.convert_to_onnx -m model_name_or_path \\ \n",
-    "--model_class=GPT2LMHeadModel_BeamSearchStep|GPT2LMHeadModel_ConfigurableOneStepSearch \\ \n",
-    "--output gpt2_onestepsearch.onnx -o -p fp32|fp16|int8\n",
-    "```\n",
-    "The -p option can be used to choose the precision: fp32 (float32), fp16 (mixed precision) or int8 (quantization). The -o option will generate optimized model, which is required for fp16 or int8.\n",
-    "\n",
-    "Here we use a pretrained model as example:"
-   ],
-   "cell_type": "markdown",
-   "metadata": {}
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "stream",
-     "name": "stdout",
-     "text": [
-      "GPT2Config {\n  \"_name_or_path\": \"gpt2\",\n  \"activation_function\": \"gelu_new\",\n  \"architectures\": [\n    \"GPT2LMHeadModel\"\n  ],\n  \"attn_pdrop\": 0.1,\n  \"batch_size\": 1,\n  \"beam_size\": 4,\n  \"bos_token_id\": 50256,\n  \"embd_pdrop\": 0.1,\n  \"eos_token_id\": 50256,\n  \"gradient_checkpointing\": false,\n  \"initializer_range\": 0.02,\n  \"layer_norm_epsilon\": 1e-05,\n  \"model_type\": \"gpt2\",\n  \"n_ctx\": 1024,\n  \"n_embd\": 768,\n  \"n_head\": 12,\n  \"n_inner\": null,\n  \"n_layer\": 12,\n  \"n_positions\": 1024,\n  \"resid_pdrop\": 0.1,\n  \"summary_activation\": null,\n  \"summary_first_dropout\": 0.1,\n  \"summary_proj_to_labels\": true,\n  \"summary_type\": \"cls_index\",\n  \"summary_use_proj\": true,\n  \"task_specific_params\": {\n    \"text-generation\": {\n      \"do_sample\": true,\n      \"max_length\": 50\n    }\n  },\n  \"transformers_version\": \"4.3.1\",\n  \"use_cache\": true,\n  \"vocab_size\": 50257\n}\n\n"
-     ]
-    }
-   ],
-   "source": [
-    "from packaging import version\n",
-    "from onnxruntime import __version__ as ort_verison\n",
-    "if version.parse(ort_verison) >= version.parse('1.12.0'):\n",
-    "    from onnxruntime.transformers.models.gpt2.gpt2_beamsearch_helper import Gpt2BeamSearchHelper, GPT2LMHeadModel_BeamSearchStep\n",
-    "else:\n",
-    "    from onnxruntime.transformers.gpt2_beamsearch_helper import Gpt2BeamSearchHelper, GPT2LMHeadModel_BeamSearchStep\n",
-    "\n",
-    "from transformers import AutoConfig\n",
-    "import torch\n",
-    "\n",
-    "model_name_or_path = \"gpt2\"\n",
-    "config = AutoConfig.from_pretrained(model_name_or_path, cache_dir=cache_dir)\n",
-    "model = GPT2LMHeadModel_BeamSearchStep.from_pretrained(model_name_or_path, config=config, batch_size=1, beam_size=4, cache_dir=cache_dir)\n",
-    "device = torch.device(\"cpu\")\n",
-    "model.eval().to(device)\n",
-    "\n",
-    "print(model.config)\n",
-    "\n",
-    "num_attention_heads = model.config.n_head\n",
-    "hidden_size = model.config.n_embd\n",
-    "num_layer = model.config.n_layer"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "stream",
-     "name": "stderr",
-     "text": [
-      "/data/anaconda/envs/py37athena/lib/python3.7/site-packages/transformers/models/gpt2/modeling_gpt2.py:654: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n  assert batch_size > 0, \"batch_size has to be defined and > 0\"\n/data/anaconda/envs/py37athena/lib/python3.7/site-packages/transformers/models/gpt2/modeling_gpt2.py:169: TracerWarning: Converting a tensor to a Python float might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n  w = w / (float(v.size(-1)) ** 0.5)\n/data/anaconda/envs/py37athena/lib/python3.7/site-packages/transformers/models/gpt2/modeling_gpt2.py:174: TracerWarning: Converting a tensor to a Python index might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n  mask = self.bias[:, :, ns - nd : ns, :ns]\n"
-     ]
-    }
-   ],
-   "source": [
-    "onnx_model_path = \"gpt2_one_step_search.onnx\"\n",
-    "Gpt2BeamSearchHelper.export_onnx(model, device, onnx_model_path) # add parameter use_external_data_format=True when model size > 2 GB"
-   ]
-  },
-  {
-   "source": [
-    "## ONNX Runtime Inference ##\n",
-    "\n",
-    "We can use ONNX Runtime to inference. The inputs are dictionary with name and numpy array as value, and the output is list of numpy array. Note that both input and output are in CPU. When you run the inference in GPU, it will involve data copy between CPU and GPU for input and output.\n",
-    "\n",
-    "Let's create an inference session for ONNX Runtime given the exported ONNX model, and see the output."
-   ],
-   "cell_type": "markdown",
-   "metadata": {}
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import onnxruntime\n",
-    "import numpy\n",
-    "from transformers import AutoTokenizer\n",
-    "\n",
-    "EXAMPLE_Text = ['best hotel in bay area.']\n",
-    "\n",
-    "def get_tokenizer(model_name_or_path, cache_dir):\n",
-    "    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir)\n",
-    "    tokenizer.padding_side = \"left\"\n",
-    "    tokenizer.pad_token = tokenizer.eos_token\n",
-    "    #okenizer.add_special_tokens({'pad_token': '[PAD]'})\n",
-    "    return tokenizer\n",
-    "\n",
-    "def get_example_inputs(prompt_text=EXAMPLE_Text):    \n",
-    "    tokenizer = get_tokenizer(model_name_or_path, cache_dir)\n",
-    "    encodings_dict = tokenizer.batch_encode_plus(prompt_text, padding=True)\n",
-    "\n",
-    "    input_ids = torch.tensor(encodings_dict['input_ids'], dtype=torch.int64)\n",
-    "    attention_mask = torch.tensor(encodings_dict['attention_mask'], dtype=torch.float32)\n",
-    "    position_ids = (attention_mask.long().cumsum(-1) - 1)\n",
-    "    position_ids.masked_fill_(position_ids < 0, 0)\n",
-    "\n",
-    "    #Empty Past State for generating first word\n",
-    "    empty_past = []\n",
-    "    batch_size = input_ids.size(0)\n",
-    "    sequence_length = input_ids.size(1)\n",
-    "    past_shape = [2, batch_size, num_attention_heads, 0, hidden_size // num_attention_heads]\n",
-    "    for i in range(num_layer):\n",
-    "        empty_past.append(torch.empty(past_shape).type(torch.float32).to(device))\n",
-    "       \n",
-    "    return input_ids, attention_mask, position_ids, empty_past\n",
-    "\n",
-    "input_ids, attention_mask, position_ids, empty_past = get_example_inputs()\n",
-    "beam_select_idx = torch.zeros([1, input_ids.shape[0]]).long()\n",
-    "input_log_probs = torch.zeros([input_ids.shape[0], 1])\n",
-    "input_unfinished_sents = torch.ones([input_ids.shape[0], 1], dtype=torch.bool)\n",
-    "prev_step_scores = torch.zeros([input_ids.shape[0], 1])\n",
-    "\n",
-    "onnx_model_path = \"gpt2_one_step_search.onnx\"\n",
-    "session = onnxruntime.InferenceSession(onnx_model_path)\n",
-    "ort_inputs = {\n",
-    "              'input_ids': numpy.ascontiguousarray(input_ids.cpu().numpy()),\n",
-    "              'attention_mask' : numpy.ascontiguousarray(attention_mask.cpu().numpy()),\n",
-    "              'position_ids': numpy.ascontiguousarray(position_ids.cpu().numpy()),\n",
-    "              'beam_select_idx': numpy.ascontiguousarray(beam_select_idx.cpu().numpy()),\n",
-    "              'input_log_probs': numpy.ascontiguousarray(input_log_probs.cpu().numpy()),\n",
-    "              'input_unfinished_sents': numpy.ascontiguousarray(input_unfinished_sents.cpu().numpy()),\n",
-    "              'prev_step_results': numpy.ascontiguousarray(input_ids.cpu().numpy()),\n",
-    "              'prev_step_scores': numpy.ascontiguousarray(prev_step_scores.cpu().numpy()),\n",
-    "             }\n",
-    "for i, past_i in enumerate(empty_past):\n",
-    "    ort_inputs[f'past_{i}'] = numpy.ascontiguousarray(past_i.cpu().numpy())\n",
-    "ort_outputs = session.run(None, ort_inputs)"
-   ]
-  },
-  {
-   "source": [
-    "## ONNX Runtime Inference with IO Binding ##\n",
-    "\n",
-    "To avoid data copy for input and output, ONNX Runtime also supports IO Binding. User could provide some buffer for input and outputs. For GPU inference, the buffer can be in GPU to reduce memory copy between CPU and GPU. This is helpful for high performance inference in GPU. For GPT-2, IO Binding might help the performance when batch size or (past) sequence length is large."
-   ],
-   "cell_type": "markdown",
-   "metadata": {}
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def inference_with_io_binding(session, config, input_ids, position_ids, attention_mask, past, beam_select_idx, input_log_probs, input_unfinished_sents, prev_step_results, prev_step_scores, step, context_len):\n",
-    "    output_shapes = Gpt2BeamSearchHelper.get_output_shapes(batch_size=1,\n",
-    "                                                           context_len=context_len,\n",
-    "                                                           past_sequence_length=past[0].size(3),\n",
-    "                                                           sequence_length=input_ids.size(1),\n",
-    "                                                           beam_size=4,\n",
-    "                                                           step=step,\n",
-    "                                                           config=config,\n",
-    "                                                           model_class=\"GPT2LMHeadModel_BeamSearchStep\")\n",
-    "    output_buffers = Gpt2BeamSearchHelper.get_output_buffers(output_shapes, device)\n",
-    "\n",
-    "    io_binding = Gpt2BeamSearchHelper.prepare_io_binding(session, input_ids, position_ids, attention_mask, past, output_buffers, output_shapes, beam_select_idx, input_log_probs, input_unfinished_sents, prev_step_results, prev_step_scores)\n",
-    "    session.run_with_iobinding(io_binding)\n",
-    "\n",
-    "    outputs = Gpt2BeamSearchHelper.get_outputs_from_io_binding_buffer(session, output_buffers, output_shapes, return_numpy=False)\n",
-    "    return outputs"
-   ]
-  },
-  {
-   "source": [
-    "We can see that the result is exactly same with/without IO Binding:"
-   ],
-   "cell_type": "markdown",
-   "metadata": {}
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "stream",
-     "name": "stdout",
-     "text": [
-      "IO Binding result is good\n"
-     ]
-    }
-   ],
-   "source": [
-    "input_ids, attention_mask, position_ids, empty_past = get_example_inputs()\n",
-    "beam_select_idx = torch.zeros([1, input_ids.shape[0]]).long()\n",
-    "input_log_probs = torch.zeros([input_ids.shape[0], 1])\n",
-    "input_unfinished_sents = torch.ones([input_ids.shape[0], 1], dtype=torch.bool)\n",
-    "prev_step_scores = torch.zeros([input_ids.shape[0], 1])\n",
-    "outputs = inference_with_io_binding(session, config, input_ids, position_ids, attention_mask, empty_past, beam_select_idx, input_log_probs, input_unfinished_sents, input_ids, prev_step_scores, 0, input_ids.shape[-1])\n",
-    "assert torch.eq(outputs[-2], torch.from_numpy(ort_outputs[-2])).all()\n",
-    "print(\"IO Binding result is good\")"
-   ]
-  },
-  {
-   "source": [
-    "## Batch Text Generation ##\n",
-    "\n",
-    "Here is an example for text generation using ONNX Runtime with/without IO Binding."
-   ],
-   "cell_type": "markdown",
-   "metadata": {}
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def update(output, step, batch_size, beam_size, context_length, prev_attention_mask, device):\n",
-    "    \"\"\"\n",
-    "    Update the inputs for next inference.\n",
-    "    \"\"\"\n",
-    "    last_state = (torch.from_numpy(output[0]).to(device)\n",
-    "                        if isinstance(output[0], numpy.ndarray) else output[0].clone().detach().cpu())\n",
-    "\n",
-    "    input_ids = last_state.view(batch_size * beam_size, -1).to(device)\n",
-    "\n",
-    "    input_unfinished_sents_id = -3\n",
-    "    prev_step_results = (torch.from_numpy(output[-2]).to(device) if isinstance(output[-2], numpy.ndarray)\n",
-    "                                else output[-2].clone().detach().to(device))\n",
-    "    position_ids = (torch.tensor([context_length + step - 1\n",
-    "                                        ]).unsqueeze(0).repeat(batch_size * beam_size, 1).to(device))\n",
-    "\n",
-    "    if prev_attention_mask.shape[0] != (batch_size * beam_size):\n",
-    "        prev_attention_mask = prev_attention_mask.repeat(batch_size * beam_size, 1)\n",
-    "    attention_mask = torch.cat(\n",
-    "        [\n",
-    "            prev_attention_mask,\n",
-    "            torch.ones([batch_size * beam_size, 1]).type_as(prev_attention_mask),\n",
-    "        ],\n",
-    "        1,\n",
-    "    ).to(device)\n",
-    "\n",
-    "    beam_select_idx = (torch.from_numpy(output[input_unfinished_sents_id - 2]).to(device) if isinstance(\n",
-    "        output[input_unfinished_sents_id - 2], numpy.ndarray) else output[input_unfinished_sents_id - 2].clone().detach().to(device))\n",
-    "    input_log_probs = (torch.from_numpy(output[input_unfinished_sents_id - 1]).to(device) if isinstance(\n",
-    "        output[input_unfinished_sents_id - 1], numpy.ndarray) else output[input_unfinished_sents_id - 1].clone().detach().to(device))\n",
-    "    input_unfinished_sents = (torch.from_numpy(output[input_unfinished_sents_id]).to(device) if isinstance(\n",
-    "        output[input_unfinished_sents_id], numpy.ndarray) else\n",
-    "                                    output[input_unfinished_sents_id].clone().detach().to(device))\n",
-    "    prev_step_scores = (torch.from_numpy(output[-1]).to(device)\n",
-    "                                if isinstance(output[-1], numpy.ndarray) else output[-1].clone().detach().to(device))\n",
-    "\n",
-    "    past = []\n",
-    "    if isinstance(output[1], tuple):  # past in torch output is tuple\n",
-    "        past = list(output[1])\n",
-    "    else:\n",
-    "        for i in range(model.config.n_layer):\n",
-    "            past_i = (torch.from_numpy(output[i + 1])\n",
-    "                        if isinstance(output[i + 1], numpy.ndarray) else output[i + 1].clone().detach())\n",
-    "            past.append(past_i.to(device)) \n",
-    "\n",
-    "    inputs = {\n",
-    "        'input_ids': input_ids,\n",
-    "        'attention_mask' : attention_mask,\n",
-    "        'position_ids': position_ids,\n",
-    "        'beam_select_idx': beam_select_idx,\n",
-    "        'input_log_probs': input_log_probs,\n",
-    "        'input_unfinished_sents': input_unfinished_sents,\n",
-    "        'prev_step_results': prev_step_results,\n",
-    "        'prev_step_scores': prev_step_scores,\n",
-    "    }\n",
-    "    ort_inputs = {\n",
-    "        'input_ids': numpy.ascontiguousarray(input_ids.cpu().numpy()),\n",
-    "        'attention_mask' : numpy.ascontiguousarray(attention_mask.cpu().numpy()),\n",
-    "        'position_ids': numpy.ascontiguousarray(position_ids.cpu().numpy()),\n",
-    "        'beam_select_idx': numpy.ascontiguousarray(beam_select_idx.cpu().numpy()),\n",
-    "        'input_log_probs': numpy.ascontiguousarray(input_log_probs.cpu().numpy()),\n",
-    "        'input_unfinished_sents': numpy.ascontiguousarray(input_unfinished_sents.cpu().numpy()),\n",
-    "        'prev_step_results': numpy.ascontiguousarray(prev_step_results.cpu().numpy()),\n",
-    "        'prev_step_scores': numpy.ascontiguousarray(prev_step_scores.cpu().numpy()),\n",
-    "    }\n",
-    "    for i, past_i in enumerate(past):\n",
-    "        ort_inputs[f'past_{i}'] = numpy.ascontiguousarray(past_i.cpu().numpy())\n",
-    "    \n",
-    "    return inputs, ort_inputs, past\n",
-    "\n",
-    "def test_generation(tokenizer, input_text, use_onnxruntime_io, ort_session = None, num_tokens_to_produce = 30):\n",
-    "    print(\"Text generation using\", \"OnnxRuntime with IO binding\" if use_onnxruntime_io else \"OnnxRuntime\", \"...\")    \n",
-    "    input_ids, attention_mask, position_ids, past = get_example_inputs(input_text)\n",
-    "    beam_select_idx = torch.zeros([1, input_ids.shape[0]]).long()\n",
-    "    input_log_probs = torch.zeros([input_ids.shape[0], 1])\n",
-    "    input_unfinished_sents = torch.ones([input_ids.shape[0], 1], dtype=torch.bool)\n",
-    "    prev_step_scores = torch.zeros([input_ids.shape[0], 1])\n",
-    "    inputs = {\n",
-    "        'input_ids': input_ids,\n",
-    "        'attention_mask' : attention_mask,\n",
-    "        'position_ids': position_ids,\n",
-    "        'beam_select_idx': beam_select_idx,\n",
-    "        'input_log_probs': input_log_probs,\n",
-    "        'input_unfinished_sents': input_unfinished_sents,\n",
-    "        'prev_step_results': input_ids,\n",
-    "        'prev_step_scores': prev_step_scores,\n",
-    "    }\n",
-    "    ort_inputs = {\n",
-    "        'input_ids': numpy.ascontiguousarray(input_ids.cpu().numpy()),\n",
-    "        'attention_mask' : numpy.ascontiguousarray(attention_mask.cpu().numpy()),\n",
-    "        'position_ids': numpy.ascontiguousarray(position_ids.cpu().numpy()),\n",
-    "        'beam_select_idx': numpy.ascontiguousarray(beam_select_idx.cpu().numpy()),\n",
-    "        'input_log_probs': numpy.ascontiguousarray(input_log_probs.cpu().numpy()),\n",
-    "        'input_unfinished_sents': numpy.ascontiguousarray(input_unfinished_sents.cpu().numpy()),\n",
-    "        'prev_step_results': numpy.ascontiguousarray(input_ids.cpu().numpy()),\n",
-    "        'prev_step_scores': numpy.ascontiguousarray(prev_step_scores.cpu().numpy()),\n",
-    "    }\n",
-    "    for i, past_i in enumerate(past):\n",
-    "        ort_inputs[f'past_{i}'] = numpy.ascontiguousarray(past_i.cpu().numpy())\n",
-    "    batch_size = input_ids.size(0)\n",
-    "    beam_size = 4\n",
-    "    context_length = input_ids.size(-1)\n",
-    "\n",
-    "    for step in range(num_tokens_to_produce):\n",
-    "        if use_onnxruntime_io:\n",
-    "            outputs = inference_with_io_binding(ort_session, config, inputs['input_ids'], inputs['position_ids'], inputs['attention_mask'], past, inputs['beam_select_idx'], inputs['input_log_probs'], inputs['input_unfinished_sents'], inputs['prev_step_results'], inputs['prev_step_scores'], step, context_length)\n",
-    "        else:\n",
-    "            outputs = ort_session.run(None, ort_inputs) \n",
-    "        inputs, ort_inputs, past = update(outputs, step, batch_size, beam_size, context_length, inputs['attention_mask'], device)\n",
-    "\n",
-    "        if not inputs['input_unfinished_sents'].any():\n",
-    "            break\n",
-    "\n",
-    "    print(\"------------\")\n",
-    "    print(tokenizer.decode(inputs['prev_step_results'][0], skip_special_tokens=True))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "stream",
-     "name": "stdout",
-     "text": [
-      "Text generation using OnnxRuntime ...\n",
-      "------------\n",
-      "best hotel in bay area.\n",
-      "\n",
-      "\"It's a great place to stay,\" he said. \"It's a great place to live. It's a great place to work\n"
-     ]
-    }
-   ],
-   "source": [
-    "tokenizer = get_tokenizer(model_name_or_path, cache_dir)\n",
-    "input_text = EXAMPLE_Text\n",
-    "test_generation(tokenizer, input_text, use_onnxruntime_io=False, ort_session=session)"
-   ]
-  },
-  {
-   "source": [
-    "Next, we use ONNX Runtime with IO binding to run again and we can see that the result is exactly same."
-   ],
-   "cell_type": "markdown",
-   "metadata": {}
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "stream",
-     "name": "stdout",
-     "text": [
-      "Text generation using OnnxRuntime with IO binding ...\n",
-      "------------\n",
-      "best hotel in bay area.\n",
-      "\n",
-      "\"It's a great place to stay,\" he said. \"It's a great place to live. It's a great place to work\n"
-     ]
-    }
-   ],
-   "source": [
-    "test_generation(tokenizer, input_text, use_onnxruntime_io=True, ort_session=session)"
-   ]
-  }
- ]
-}
--- a/onnxruntime/test/python/transformers/test_gpt2_benchmark.py
+++ b/onnxruntime/test/python/transformers/test_gpt2_benchmark.py
@ -49,43 +49,6 @@ class TestGpt2(unittest.TestCase):
    def test_gpt2_int8(self):
        self.run_benchmark_gpt2("-m gpt2 --precision int8 -o  -b 1 --sequence_lengths 2 -s 3")

-    @pytest.mark.slow
-    def test_gpt2_beam_search_step_fp32(self):
-        self.run_benchmark_gpt2(
-            "-m gpt2 --model_class=GPT2LMHeadModel_BeamSearchStep --precision fp32 -v -b 1 --sequence_lengths 5 -s 3"
-        )
-
-    # @pytest.mark.slow
-    # def test_gpt2_beam_search_step_fp16(self):
-    #     if self.test_cuda:
-    #         self.run_benchmark_gpt2(
-    #             '-m gpt2 --model_class=GPT2LMHeadModel_BeamSearchStep --precision fp16 -o -b 1 --sequence_lengths 5 -s 3 --use_gpu')
-
-    @pytest.mark.slow
-    def test_gpt2_beam_search_step_int8(self):
-        self.run_benchmark_gpt2(
-            "-m gpt2 --model_class=GPT2LMHeadModel_BeamSearchStep --precision int8 -o -b 1 --sequence_lengths 5 -s 3"
-        )
-
-    @pytest.mark.slow
-    def test_gpt2_configurable_one_step_search_fp32(self):
-        self.run_benchmark_gpt2(
-            "-m gpt2 --model_class=GPT2LMHeadModel_ConfigurableOneStepSearch --precision fp32 -v -b 1 --sequence_lengths 5 --past_sequence_lengths 3 --use_gpu"
-        )
-
-    # @pytest.mark.slow
-    # def test_gpt2_configurable_one_step_search_fp16(self):
-    #     if self.test_cuda:
-    #         self.run_benchmark_gpt2(
-    #             "-m gpt2 --model_class=GPT2LMHeadModel_ConfigurableOneStepSearch --precision fp16 -o -b 1 --sequence_lengths 5 -s 3 --use_gpu"
-    #         )
-
-    @pytest.mark.slow
-    def test_gpt2_configurable_one_step_search_int8(self):
-        self.run_benchmark_gpt2(
-            "-m gpt2 --model_class=GPT2LMHeadModel_ConfigurableOneStepSearch --precision int8 -o -b 1 --sequence_lengths 5 -s 3"
-        )
-

 if __name__ == "__main__":
    coloredlogs.install(fmt="%(message)s")
--- a/tools/ci_build/requirements.txt
+++ b/tools/ci_build/requirements.txt
@ -1,7 +1,6 @@
 # packages used by transformers tool test
-protobuf==3.18.3
-numpy==1.21.6
+protobuf==3.20.1
+numpy==1.23.5
 coloredlogs==15.0
-transformers==4.6.1
-onnxconverter-common==1.8.1
+transformers==4.24.0
 psutil