diff --git a/onnxruntime/python/tools/transformers/models/gpt2/benchmark_gpt2.py b/onnxruntime/python/tools/transformers/models/gpt2/benchmark_gpt2.py
index 4041b285ea..01a5e5d888 100644
--- a/onnxruntime/python/tools/transformers/models/gpt2/benchmark_gpt2.py
+++ b/onnxruntime/python/tools/transformers/models/gpt2/benchmark_gpt2.py
@@ -15,8 +15,7 @@ from datetime import datetime
 
 import psutil
 import torch
-from gpt2_beamsearch_helper import MODEL_CLASSES, Gpt2HelperFactory
-from gpt2_helper import DEFAULT_TOLERANCE, PRETRAINED_GPT2_MODELS, Gpt2Helper
+from gpt2_helper import DEFAULT_TOLERANCE, MODEL_CLASSES, PRETRAINED_GPT2_MODELS, Gpt2Helper
 from packaging import version
 from transformers import AutoConfig
 
@@ -124,12 +123,6 @@ def parse_arguments(argv=None):
     parser.set_defaults(torchscript=False)
 
     parser.add_argument("-b", "--batch_sizes", nargs="+", type=int, default=[1], help="batch size")
-    parser.add_argument(
-        "--beam_size",
-        type=int,
-        default=4,
-        help="Beam size if greedy/top-p/top-k sampling is needed",
-    )
 
     parser.add_argument(
         "--sequence_lengths",
@@ -170,54 +163,6 @@ def parse_arguments(argv=None):
     parser.add_argument("--disable_io_binding", required=False, action="store_true")
     parser.set_defaults(disable_io_binding=False)
 
-    search_option_group = parser.add_argument_group("configurable one step search options")
-
-    search_option_group.add_argument(
-        "--ignore_eos",
-        type=bool,
-        default=False,
-        help="If ignore end of sentence token in model inference.",
-    )
-    search_option_group.add_argument(
-        "--repetition_penalty",
-        type=float,
-        default=1,
-        help="Positive. >1 to penalize and <1 to encorage.",
-    )
-    search_option_group.add_argument(
-        "--temperature",
-        type=float,
-        default=1,
-        help="Softmax temperature for output logits.",
-    )
-    search_option_group.add_argument(
-        "--excluded_token_ids",
-        required=False,
-        nargs="+",
-        type=float,
-        help="A list of token ids to be excluded in inference.",
-    )
-    search_option_group.add_argument(
-        "--length_penalty",
-        type=float,
-        default=1,
-        help="Positive. >1 to penalize and <1 to encorage short sentence.",
-    )
-
-    sampling_option_group = parser.add_argument_group("one step sampling options")
-    sampling_option_group.add_argument(
-        "--do_sample",
-        action="store_true",
-        help="If to do sampling instead of beam search or greedy.",
-    )
-    sampling_option_group.add_argument(
-        "--do_sample_top_p",
-        type=float,
-        default=0.95,
-        help="Nuclear/top-p sampling accumulation probability.",
-    )
-    sampling_option_group.add_argument("--do_sample_top_k", type=int, default=0, help="Use top-k if non-zero.")
-
     args = parser.parse_args(argv)
 
     return args
@@ -249,41 +194,9 @@ def main(args):
     prepare_environment(cache_dir, output_dir, args.use_gpu)
 
     model_class = MODEL_CLASSES[args.model_class][0]
-    if args.model_class == "GPT2LMHeadModel_BeamSearchStep":
-        model_type = "beam_search_step"
-    elif args.model_class == "GPT2LMHeadModel_ConfigurableOneStepSearch":
-        model_type = "configurable_one_step_search"
-    else:
-        model_type = "default"
-
-    gpt2helper = Gpt2HelperFactory.create_helper(model_type)
+    gpt2helper = Gpt2Helper
     config = AutoConfig.from_pretrained(args.model_name_or_path, torchscript=args.torchscript, cache_dir=cache_dir)
-    if model_type == "beam_search_step":
-        model = model_class.from_pretrained(
-            args.model_name_or_path,
-            config=config,
-            batch_size=1,
-            beam_size=args.beam_size,
-            cache_dir=cache_dir,
-        )
-    elif model_type == "configurable_one_step_search":
-        model = model_class.from_pretrained(
-            args.model_name_or_path,
-            config=config,
-            batch_size=1,
-            beam_size=args.beam_size,
-            ignore_eos=args.ignore_eos,
-            temperature=args.temperature,
-            repetition_penalty=args.repetition_penalty,
-            excluded_token_ids=args.excluded_token_ids,
-            length_penalty=args.length_penalty,
-            do_sample=args.do_sample,
-            do_sample_top_p=args.do_sample_top_p,
-            do_sample_top_k=args.do_sample_top_k,
-            cache_dir=cache_dir,
-        )
-    else:
-        model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir)
+    model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir)
 
     # This scirpt does not support float16 for PyTorch.
     # if args.float16:
@@ -352,29 +265,14 @@ def main(args):
         return
 
     # Allocate output buffers for IO Binding
-    if model_type == "beam_search_step" or model_type == "configurable_one_step_search":
-        max_output_shapes = gpt2helper.get_output_shapes(
-            max(args.batch_sizes),
-            context_len=max(args.past_sequence_lengths),
-            past_sequence_length=max(args.past_sequence_lengths),
-            sequence_length=max(args.sequence_lengths),
-            beam_size=args.beam_size,
-            step=0,
-            config=config,
-            model_class=args.model_class,
-        )
-
-        output_buffers = gpt2helper.get_output_buffers(max_output_shapes, device, args.precision == Precision.FLOAT16)
-
-    else:
-        max_output_shapes = gpt2helper.get_output_shapes(
-            max(args.batch_sizes),
-            max(args.past_sequence_lengths),
-            max(args.sequence_lengths),
-            config,
-            args.model_class,
-        )
-        output_buffers = gpt2helper.get_output_buffers(max_output_shapes, device, args.precision == Precision.FLOAT16)
+    max_output_shapes = gpt2helper.get_output_shapes(
+        max(args.batch_sizes),
+        max(args.past_sequence_lengths),
+        max(args.sequence_lengths),
+        config,
+        args.model_class,
+    )
+    output_buffers = gpt2helper.get_output_buffers(max_output_shapes, device, args.precision == Precision.FLOAT16)
 
     csv_filename = args.result_csv or "benchmark_result_{}.csv".format(datetime.now().strftime("%Y%m%d-%H%M%S"))
     with open(csv_filename, mode="a", newline="") as csv_file:
@@ -402,53 +300,32 @@ def main(args):
                 for past_sequence_length in args.past_sequence_lengths:
                     assert batch_size > 0 and sequence_length > 0 and past_sequence_length >= 0
                     logger.debug(
-                        f"Running test for batch_size={batch_size} sequence_length={sequence_length} past_sequence_length={past_sequence_length}..."
+                        "Running test for batch_size=%d sequence_length=%d past_sequence_length=%d ...",
+                        batch_size,
+                        sequence_length,
+                        past_sequence_length,
+                    )
+
+                    dummy_inputs = gpt2helper.get_dummy_inputs(
+                        batch_size,
+                        past_sequence_length,
+                        sequence_length,
+                        config.num_attention_heads,
+                        config.hidden_size,
+                        config.n_layer,
+                        config.vocab_size,
+                        device,
+                        float16=(args.precision == Precision.FLOAT16),
+                        has_position_ids=use_padding,
+                        has_attention_mask=use_padding,
+                    )
+                    output_shapes = gpt2helper.get_output_shapes(
+                        batch_size,
+                        past_sequence_length,
+                        sequence_length,
+                        config,
+                        args.model_class,
                     )
-                    if model_type == "beam_search_step" or model_type == "configurable_one_step_search":
-                        dummy_inputs = gpt2helper.get_dummy_inputs(
-                            batch_size,
-                            past_sequence_length,
-                            sequence_length,
-                            config.num_attention_heads,
-                            config.hidden_size,
-                            config.n_layer,
-                            config.vocab_size,
-                            device,
-                            float16=(args.precision == Precision.FLOAT16),
-                            has_position_ids=use_padding,
-                            has_attention_mask=use_padding,
-                        )
-                        output_shapes = gpt2helper.get_output_shapes(
-                            batch_size,
-                            past_sequence_length,
-                            past_sequence_length,
-                            sequence_length,
-                            args.beam_size,
-                            0,
-                            config,
-                            args.model_class,
-                        )
-                    else:
-                        dummy_inputs = gpt2helper.get_dummy_inputs(
-                            batch_size,
-                            past_sequence_length,
-                            sequence_length,
-                            config.num_attention_heads,
-                            config.hidden_size,
-                            config.n_layer,
-                            config.vocab_size,
-                            device,
-                            float16=(args.precision == Precision.FLOAT16),
-                            has_position_ids=use_padding,
-                            has_attention_mask=use_padding,
-                        )
-                        output_shapes = gpt2helper.get_output_shapes(
-                            batch_size,
-                            past_sequence_length,
-                            sequence_length,
-                            config,
-                            args.model_class,
-                        )
 
                     try:
                         if args.validate_onnx or args.output_torch_latency:
diff --git a/onnxruntime/python/tools/transformers/models/gpt2/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/gpt2/convert_to_onnx.py
index 6a03a091ed..78e718e6e8 100644
--- a/onnxruntime/python/tools/transformers/models/gpt2/convert_to_onnx.py
+++ b/onnxruntime/python/tools/transformers/models/gpt2/convert_to_onnx.py
@@ -23,9 +23,8 @@ from pathlib import Path
 
 import numpy
 import torch
-from gpt2_beamsearch_helper import MODEL_CLASSES, Gpt2HelperFactory
-from gpt2_beamsearch_tester import Gpt2TesterFactory
-from gpt2_helper import DEFAULT_TOLERANCE, PRETRAINED_GPT2_MODELS
+from gpt2_helper import DEFAULT_TOLERANCE, MODEL_CLASSES, PRETRAINED_GPT2_MODELS, Gpt2Helper
+from gpt2_tester import Gpt2Tester
 from packaging import version
 from transformers import AutoConfig
 
@@ -174,61 +173,6 @@ def parse_arguments(argv=None):
         "Note that we will optimize 1 and 2 differently for best performance.",
     )
 
-    parser.add_argument(
-        "--beam_size",
-        type=int,
-        default=4,
-        help="Beam size if greedy/top-p/top-k sampling is needed",
-    )
-
-    search_option_group = parser.add_argument_group("configurable one step search options")
-
-    search_option_group.add_argument(
-        "--ignore_eos",
-        type=bool,
-        default=False,
-        help="If ignore end of sentence token in model inference.",
-    )
-    search_option_group.add_argument(
-        "--repetition_penalty",
-        type=float,
-        default=1,
-        help="Positive. >1 to penalize and <1 to encourage.",
-    )
-    search_option_group.add_argument(
-        "--temperature",
-        type=float,
-        default=1,
-        help="Softmax temperature for output logits.",
-    )
-    search_option_group.add_argument(
-        "--excluded_token_ids",
-        required=False,
-        nargs="+",
-        type=float,
-        help="A list of token ids to be excluded in inference.",
-    )
-    search_option_group.add_argument(
-        "--length_penalty",
-        type=float,
-        default=1,
-        help="Positive. >1 to penalize and <1 to encourage short sentence.",
-    )
-
-    sampling_option_group = parser.add_argument_group("one step sampling options")
-    sampling_option_group.add_argument(
-        "--do_sample",
-        action="store_true",
-        help="If to do sampling instead of beam search or greedy.",
-    )
-    sampling_option_group.add_argument(
-        "--do_sample_top_p",
-        type=float,
-        default=0.95,
-        help="Nuclear/top-p sampling accumulation probability.",
-    )
-    sampling_option_group.add_argument("--do_sample_top_k", type=int, default=0, help="Use top-k if non-zero.")
-
     fp16_option_group = parser.add_argument_group(
         'float to float16 conversion parameters that works when "--precision fp16" is specified'
     )
@@ -334,48 +278,15 @@ def main(argv=None, experiment_name: str = "", run_id: str = "0", csv_filename:
     model_class = MODEL_CLASSES[args.model_class][0]
     use_padding = MODEL_CLASSES[args.model_class][2]
 
-    if args.model_class == "GPT2LMHeadModel_BeamSearchStep":
-        model_type = "beam_search_step"
-    elif args.model_class == "GPT2LMHeadModel_ConfigurableOneStepSearch":
-        model_type = "configurable_one_step_search"
-    else:
-        model_type = "default"
-
-    gpt2helper = Gpt2HelperFactory.create_helper(model_type)
-    gpt2tester = Gpt2TesterFactory.create_tester(model_type)
+    gpt2helper = Gpt2Helper
     config = AutoConfig.from_pretrained(args.model_name_or_path, cache_dir=cache_dir)
-    if model_type == "beam_search_step":
-        model = model_class.from_pretrained(
-            args.model_name_or_path,
-            config=config,
-            batch_size=1,
-            beam_size=args.beam_size,
-            cache_dir=cache_dir,
-        )
-    elif model_type == "configurable_one_step_search":
-        model = model_class.from_pretrained(
-            args.model_name_or_path,
-            config=config,
-            batch_size=1,
-            beam_size=args.beam_size,
-            ignore_eos=args.ignore_eos,
-            temperature=args.temperature,
-            repetition_penalty=args.repetition_penalty,
-            excluded_token_ids=args.excluded_token_ids,
-            length_penalty=args.length_penalty,
-            do_sample=args.do_sample,
-            do_sample_top_p=args.do_sample_top_p,
-            do_sample_top_k=args.do_sample_top_k,
-            cache_dir=cache_dir,
-        )
-    else:
-        model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir)
+    model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir)
 
     device = torch.device("cuda:0" if args.use_gpu else "cpu")
     model.eval().to(device)
 
     if (not args.use_external_data_format) and (config.n_layer > 24):
-        logger.info(f"Try --use_external_data_format when model size > 2GB")
+        logger.info("Try --use_external_data_format when model size > 2GB")
 
     onnx_model_paths = gpt2helper.get_onnx_paths(
         output_dir,
@@ -628,22 +539,9 @@ def main(argv=None, experiment_name: str = "", run_id: str = "0", csv_filename:
                 else:
                     inputs = {"input_ids": input_ids.to(int_data_type)}
 
-                if model_type == "beam_search_step" or model_type == "configurable_one_step_search":
-                    beam_select_idx = torch.zeros([1, input_ids.shape[0]]).long()
-
-                    input_log_probs = torch.zeros([input_ids.shape[0], 1])
-                    input_unfinished_sents = torch.ones([input_ids.shape[0], 1], dtype=torch.bool)
-                    inputs.update(
-                        {
-                            "beam_select_idx": beam_select_idx,
-                            "input_log_probs": input_log_probs,
-                            "input_unfinished_sents": input_unfinished_sents,
-                        }
-                    )
-
                 test_inputs.append(inputs)
 
-        gpt2tester.test_generation(
+        Gpt2Tester.test_generation(
             session,
             model,
             device,
diff --git a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_beamsearch_helper.py b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_beamsearch_helper.py
deleted file mode 100644
index 65729cf068..0000000000
--- a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_beamsearch_helper.py
+++ /dev/null
@@ -1,1018 +0,0 @@
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation.  All rights reserved.
-# Licensed under the MIT License.  See License.txt in the project root for
-# license information.
-# --------------------------------------------------------------------------
-# This script helps onnx conversion and validation for GPT2 model with past state.
-import logging
-import os
-import random
-import sys
-import time
-from pathlib import Path
-from typing import Dict, List, Union
-
-import numpy
-import torch
-from gpt2_helper import Gpt2Helper, Gpt2Inputs, MyGPT2LMHeadModel, MyGPT2LMHeadModel_NoPadding, MyGPT2Model
-from transformers import GPT2Config, GPT2LMHeadModel
-
-sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
-from io_binding_helper import TypeHelper
-from torch_onnx_export_helper import torch_onnx_export
-
-logger = logging.getLogger(__name__)
-
-BIG_NEG = -1e4
-
-
-class Gpt2HelperFactory:
-    @staticmethod
-    def create_helper(helper_type="default"):
-        helpers = {
-            "default": Gpt2Helper,
-            "beam_search_step": Gpt2BeamSearchHelper,
-            "configurable_one_step_search": Gpt2BeamSearchHelper,
-        }
-        w = helpers[helper_type]
-        return w
-
-
-class GPT2LMHeadModel_BeamSearchStep(GPT2LMHeadModel):
-    """Here we wrap a class for Onnx model conversion for GPT2LMHeadModel with past state and one
-    step beam search."""
-
-    def __init__(self, config, batch_size, beam_size):
-        super().__init__(config)
-        self.config.batch_size = batch_size
-        self.config.beam_size = beam_size
-
-    def forward(
-        self,
-        input_ids,
-        position_ids,
-        attention_mask,
-        beam_select_idx,
-        input_log_probs,
-        input_unfinished_sents,
-        prev_step_results,
-        prev_step_scores,
-        *past,
-    ):
-        input_ids = input_ids.view(self.config.batch_size, -1, input_ids.size(-1))
-        past = [past[i].index_select(1, beam_select_idx[0]) for i in range(len(past))]
-        result = super().forward(
-            input_ids.view(-1, input_ids.size(-1)),
-            position_ids=position_ids,
-            attention_mask=attention_mask,
-            past_key_values=past,
-            return_dict=False,
-        )
-        logits_flat, present_flat = MyGPT2Model.post_process(result, self.config.n_layer)
-        next_token_logits = logits_flat[:, -1].view(self.config.batch_size, -1, logits_flat.size(-1))
-        next_token_log_probs = torch.log_softmax(next_token_logits, dim=-1)
-        next_token_log_probs, next_token_ids = torch.topk(
-            next_token_log_probs,
-            self.config.beam_size,
-            dim=-1,
-            largest=True,
-            sorted=True,
-        )
-
-        # finished sentences is always with EOS, and all but the first one has -inf, so that they will be automatically dropped in the round of beam search.
-        finished_sents = ~input_unfinished_sents
-        next_token_log_probs.masked_fill_(finished_sents.unsqueeze(-1), -numpy.inf)
-        next_token_log_probs[..., 0].masked_fill_(finished_sents, 0)
-        next_token_ids.masked_fill_(finished_sents.unsqueeze(-1), self.config.eos_token_id)
-        output_log_probs = input_log_probs.unsqueeze(-1) + next_token_log_probs
-
-        # select N sequences from beams of each input, sorted by sequence probability
-        output_log_probs = output_log_probs.view(self.config.batch_size, -1)  # shape=(batch, beam_size^2)
-        output_log_probs, selected_index_flat = output_log_probs.topk(
-            self.config.beam_size, dim=-1, largest=True, sorted=True
-        )  # output shape=(batch, beam_size)
-
-        # select the correspondent sentences/next tokens
-        selected_input_seq = torch.div(
-            selected_index_flat, self.config.beam_size, rounding_mode="trunc"
-        )  # selected_index_flat // self.config.beam_size
-
-        next_token_ids = next_token_ids.view(self.config.batch_size, -1).gather(-1, selected_index_flat)
-
-        prev_step_results = prev_step_results.view(self.config.batch_size, -1, prev_step_results.size(-1))
-        prev_step_results = prev_step_results.gather(
-            1, selected_input_seq.unsqueeze(-1).repeat(1, 1, prev_step_results.size(-1))
-        )
-
-        output_unfinished_sents = input_unfinished_sents.gather(1, selected_input_seq)
-        # Add ones_like to walkaround error like Shape mismatch attempting to re-use buffer. {1,1} != {1,4}
-        output_unfinished_sents = output_unfinished_sents & next_token_ids.ne(
-            torch.ones_like(next_token_ids, dtype=torch.int) * self.config.eos_token_id
-        )
-
-        # get the next full input_ids
-        current_step_results = torch.cat([prev_step_results, next_token_ids.unsqueeze(-1)], dim=-1).contiguous()
-
-        prev_step_scores = prev_step_scores.view(self.config.batch_size, -1, prev_step_scores.size(-1))
-        prev_step_scores = prev_step_scores.gather(
-            1, selected_input_seq.unsqueeze(-1).repeat(1, 1, prev_step_scores.size(-1))
-        )
-        current_step_scores = torch.cat([prev_step_scores, output_log_probs.unsqueeze(-1)], dim=-1).contiguous()
-
-        return (
-            next_token_ids,
-            present_flat,
-            selected_input_seq,
-            output_log_probs,
-            output_unfinished_sents,
-            current_step_results.view(self.config.batch_size * self.config.beam_size, -1),
-            current_step_scores.view(self.config.batch_size * self.config.beam_size, -1),
-        )
-
-
-class GPT2LMHeadModel_ConfigurableOneStepSearch(GPT2LMHeadModel):
-    """Here we wrap a class for Onnx model conversion for GPT2LMHeadModel with past state and one
-    step beam search with configuration support."""
-
-    def __init__(
-        self,
-        config,
-        batch_size,
-        beam_size,
-        ignore_eos=False,
-        temperature=1.0,
-        repetition_penalty=1.0,
-        excluded_token_ids=None,
-        length_penalty=1.0,
-        do_sample=False,
-        do_sample_top_p=1,
-        do_sample_top_k=0,
-    ):
-        super().__init__(config)
-        self.config.batch_size = batch_size
-        self.config.beam_size = beam_size
-        self.config.ignore_eos = ignore_eos
-        self.config.temperature = temperature
-        self.config.repetition_penalty = repetition_penalty
-        self.config.excluded_token_ids = excluded_token_ids
-        self.config.length_penalty = length_penalty
-        self.config.do_sample = do_sample
-        self.config.do_sample_top_p = do_sample_top_p
-        self.config.do_sample_top_k = do_sample_top_k
-
-    @staticmethod
-    def collapse_first_two_dims(tensor):
-        return tensor.view(-1, *tensor.size()[2:])
-
-    @staticmethod
-    def top_k_top_p_filtering(log_probs, top_p=1.0, top_k=0):
-        """Set tail event (out of top_p) to a big negative number"""
-        sorted_log_probs, sorted_indices = torch.sort(log_probs, descending=True)
-        cumulative_probs = torch.cumsum(sorted_log_probs.exp(), dim=-1)
-        sorted_indices_to_remove = cumulative_probs >= top_p
-        sorted_indices_to_remove = torch.cat(
-            [
-                torch.zeros_like(sorted_indices_to_remove[..., :1]),
-                sorted_indices_to_remove[..., :-1],
-            ],
-            dim=-1,
-        )
-        if top_k > 0:
-            sorted_indices_to_remove = torch.cat(
-                [
-                    sorted_indices_to_remove[..., :top_k],
-                    torch.ones_like(sorted_indices_to_remove[..., top_k:]),
-                ],
-                dim=-1,
-            )
-        sorted_log_probs.masked_fill_(sorted_indices_to_remove, BIG_NEG)
-        return log_probs.scatter(-1, sorted_indices, sorted_log_probs)
-
-    def forward(
-        self,
-        input_ids,
-        beam_select_idx,
-        input_log_probs,
-        input_unfinished_sents,
-        prev_step_scores,
-        *past,
-    ):
-        input_ids = input_ids.view(self.config.batch_size, -1, input_ids.size(-1))
-        input_num_seq_per_sample = input_ids.size(1)
-
-        input_ids_unfinished_flat = self.collapse_first_two_dims(input_ids).index_select(
-            0, input_unfinished_sents.view(-1).nonzero(as_tuple=False).view(-1)
-        )
-
-        if self.config.ignore_eos:
-            attention_mask = (input_ids_unfinished_flat != self.config.eos_token_id).float()
-        else:
-            attention_mask = torch.ones(input_ids_unfinished_flat.shape).float().to(input_ids_unfinished_flat.device)
-        position_ids = (attention_mask.cumsum(-1) - 1).clamp(min=0).long()
-
-        if past:
-            last_seq_len = past[0].size(-2)
-            # input_ids and position_ids contains past sequence
-            input_ids_unfinished_flat = input_ids_unfinished_flat[:, last_seq_len:]
-            position_ids = position_ids[:, last_seq_len:]
-
-            unfinished_index_relative_to_last_unfinished = beam_select_idx.view(-1)[
-                input_unfinished_sents.view(-1).nonzero(as_tuple=False).view(-1)
-            ]
-
-            past = tuple([p.index_select(1, unfinished_index_relative_to_last_unfinished) for p in past])
-
-        result = super().forward(
-            input_ids_unfinished_flat.view(-1, input_ids_unfinished_flat.size(-1)),
-            position_ids=position_ids,
-            attention_mask=attention_mask,
-            past_key_values=past,
-            return_dict=False,
-        )
-        logits_flat, present_flat = MyGPT2Model.post_process(result, self.config.n_layer)
-
-        # insert finished sequence back to form a square shape of (batch_size, beam_size)
-        next_token_logits = logits_flat.new_zeros(input_ids.size()[:2] + (logits_flat.size(-1),))
-        next_token_logits.index_fill_(
-            2,
-            torch.LongTensor([self.config.eos_token_id]).to(input_ids.device),
-            -BIG_NEG,
-        )
-
-        next_token_logits.masked_scatter_(
-            input_unfinished_sents.unsqueeze(-1).expand_as(next_token_logits),
-            logits_flat[:, -1],
-        )
-
-        # repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858)
-        if self.config.repetition_penalty != 1.0:
-            _pen = next_token_logits.gather(2, input_ids)
-            _pen = torch.where(
-                _pen > 0,
-                _pen / self.config.repetition_penalty,
-                _pen * self.config.repetition_penalty,
-            )
-            next_token_logits.scatter_(2, input_ids, _pen)
-
-        # similar way to encourage short sentence
-        if self.config.length_penalty != 1.0:
-            _pen = next_token_logits[..., self.config.eos_token_id]
-            # if eos > 0, increase it, else, decrease it.
-            _pen = torch.where(
-                _pen > 0,
-                _pen * self.config.length_penalty,
-                _pen / self.config.length_penalty,
-            )
-            next_token_logits[..., self.config.eos_token_id] = _pen
-
-        if self.config.temperature != 1.0:
-            next_token_logits = next_token_logits / self.config.temperature
-
-        # exclude excluded_token_ids
-        if self.config.excluded_token_ids is not None:
-            next_token_logits.index_fill_(
-                2, self.config.excluded_token_ids.to(next_token_logits.device), BIG_NEG
-            )  # batch x beams/sequences x vocab_size
-
-        next_token_log_probs = torch.log_softmax(next_token_logits, dim=-1)
-
-        if self.config.do_sample:
-            vocab_size = next_token_log_probs.size(-1)
-            _next_token_log_probs = self.top_k_top_p_filtering(
-                next_token_log_probs.view(-1, vocab_size),
-                top_k=self.config.do_sample_top_k,
-                top_p=self.config.do_sample_top_p,
-            )
-            next_token_ids = torch.multinomial(
-                _next_token_log_probs.exp(),
-                num_samples=self.config.beam_size,
-                replacement=False,
-            )
-            next_token_ids = next_token_ids.view(self.config.batch_size, input_num_seq_per_sample, -1)
-            next_token_log_probs = next_token_log_probs.gather(-1, next_token_ids)
-        else:
-            next_token_log_probs, next_token_ids = torch.topk(
-                next_token_log_probs,
-                self.config.beam_size,
-                dim=-1,
-                largest=True,
-                sorted=True,
-            )
-
-        output_log_probs = input_log_probs.unsqueeze(-1) + next_token_log_probs
-
-        # select N sequences from beams of each input, sorted by sequence probability
-        output_log_probs = output_log_probs.view(self.config.batch_size, -1)  # shape=(batch, beam_size^2)
-        output_log_probs, selected_index_flat = output_log_probs.topk(
-            self.config.beam_size, dim=-1, largest=True, sorted=True
-        )  # output shape=(batch, beam_size)
-
-        # select the correspondent sentences/next tokens
-        selected_input_seq = torch.div(
-            selected_index_flat, self.config.beam_size, rounding_mode="trunc"
-        )  # selected_index_flat // self.config.beam_size
-
-        next_token_ids = next_token_ids.view(self.config.batch_size, -1).gather(-1, selected_index_flat)
-
-        prev_step_results = input_ids.view(self.config.batch_size, -1, input_ids.size(-1)).contiguous()
-        prev_step_results = prev_step_results.gather(
-            1,
-            selected_input_seq.unsqueeze(-1).expand(selected_input_seq.shape + (prev_step_results.size(-1),)),
-        )
-
-        output_unfinished_sents = input_unfinished_sents.gather(1, selected_input_seq)
-        output_unfinished_sents = output_unfinished_sents & next_token_ids.ne(self.config.eos_token_id)
-
-        current_step_results = torch.cat([prev_step_results, next_token_ids.unsqueeze(-1)], dim=-1).contiguous()
-
-        prev_step_scores = prev_step_scores.view(self.config.batch_size, -1, prev_step_scores.size(-1))
-        prev_step_scores = prev_step_scores.gather(
-            1,
-            selected_input_seq.unsqueeze(-1).expand(selected_input_seq.shape + (prev_step_scores.size(-1),)),
-        )
-        current_step_scores = torch.cat([prev_step_scores, output_log_probs.unsqueeze(-1)], dim=-1).contiguous()
-
-        # For next past state
-        index_relative_to_last_unfinished = (
-            (input_unfinished_sents.view(-1).float().cumsum(-1) - 1)
-            .clamp(min=0)
-            .long()
-            .reshape_as(input_unfinished_sents)
-            .gather(1, selected_input_seq)
-        )
-
-        return (
-            current_step_results.view(self.config.batch_size * self.config.beam_size, -1),
-            present_flat,
-            index_relative_to_last_unfinished,
-            output_log_probs,
-            output_unfinished_sents,
-            current_step_scores.view(self.config.batch_size * self.config.beam_size, -1),
-        )
-
-
-# Maps model class name to a tuple of model class, name of first output and use padding or not
-MODEL_CLASSES = {
-    "GPT2LMHeadModel": (MyGPT2LMHeadModel, "logits", True),
-    "GPT2LMHeadModel_NoPadding": (MyGPT2LMHeadModel_NoPadding, "logits", False),
-    "GPT2Model": (MyGPT2Model, "last_state", True),
-    "GPT2LMHeadModel_BeamSearchStep": (
-        GPT2LMHeadModel_BeamSearchStep,
-        "last_state",
-        True,
-    ),
-    "GPT2LMHeadModel_ConfigurableOneStepSearch": (
-        GPT2LMHeadModel_ConfigurableOneStepSearch,
-        "last_state",
-        False,
-    ),
-}
-
-
-class Gpt2BeamSearchInputs(Gpt2Inputs):
-    def __init__(
-        self,
-        input_ids,
-        past,
-        position_ids,
-        attention_mask,
-        beam_select_idx=None,
-        input_log_probs=None,
-        input_unfinished_sents=None,
-        prev_step_results=None,
-        prev_step_scores=None,
-    ):
-        super().__init__(input_ids, position_ids, attention_mask, past=past)
-        self.prev_step_results: torch.LongTensor = prev_step_results
-        self.prev_step_scores: Union[torch.FloatTensor, torch.HalfTensor, torch.cuda.FloatTensor] = prev_step_scores
-        if beam_select_idx is None:
-            self.beam_select_idx: torch.LongTensor = torch.zeros([1, len(input_ids)]).long()
-        else:
-            self.beam_select_idx: torch.LongTensor = beam_select_idx
-        self.input_log_probs: Union[torch.FloatTensor, torch.HalfTensor, torch.cuda.FloatTensor] = input_log_probs
-        self.input_unfinished_sents: torch.ByteTensor = input_unfinished_sents
-
-    def to_list(self) -> List:
-        input_list = [
-            v
-            for v in [
-                self.input_ids,
-                self.position_ids,
-                self.attention_mask,
-                self.beam_select_idx,
-                self.input_log_probs,
-                self.input_unfinished_sents,
-                self.prev_step_results,
-                self.prev_step_scores,
-            ]
-            if v is not None
-        ]
-        if self.past:
-            input_list.extend(self.past)
-        return input_list
-
-    def to_fp32(self):
-        past = [p.to(dtype=torch.float32) for p in self.past]
-        attention_mask = (
-            self.attention_mask.to(dtype=torch.float32) if self.attention_mask is not None else self.attention_mask
-        )
-        return Gpt2BeamSearchInputs(
-            self.input_ids,
-            past,
-            self.position_ids,
-            attention_mask,
-            self.beam_select_idx,
-            self.input_log_probs.to(dtype=torch.float32),
-            self.input_unfinished_sents,
-            self.prev_step_results,
-            self.prev_step_scores.to(dtype=torch.float32),
-        )
-
-
-class Gpt2BeamSearchHelper(Gpt2Helper):
-    """A helper class for Gpt2 model conversion, inference and verification."""
-
-    @staticmethod
-    def get_dummy_inputs(
-        batch_size: int,
-        past_sequence_length: int,
-        sequence_length: int,
-        num_attention_heads: int,
-        hidden_size: int,
-        num_layer: int,
-        vocab_size: int,
-        device: torch.device,
-        float16: bool = False,
-        has_position_ids: bool = True,
-        has_attention_mask: bool = True,
-        input_ids_dtype: torch.dtype = torch.int64,
-        position_ids_dtype: torch.dtype = torch.int64,
-        attention_mask_dtype: torch.dtype = torch.int64,
-    ) -> Gpt2BeamSearchInputs:
-        """Create random inputs for GPT2 beam search."""
-        gpt2_dummy_inputs = Gpt2Helper.get_dummy_inputs(
-            batch_size,
-            past_sequence_length,
-            sequence_length,
-            num_attention_heads,
-            hidden_size,
-            num_layer,
-            vocab_size,
-            device,
-            float16,
-            has_position_ids,
-            has_attention_mask,
-            input_ids_dtype=input_ids_dtype,
-            position_ids_dtype=position_ids_dtype,
-            attention_mask_dtype=attention_mask_dtype,
-        )
-        float_type = torch.float16 if float16 else torch.float32
-
-        beam_select_idx = torch.zeros([1, batch_size], device=device).long()
-        input_log_probs = torch.zeros([batch_size, 1], dtype=float_type, device=device)
-        input_unfinished_sents = torch.ones([batch_size, 1], dtype=torch.bool, device=device)
-        if has_position_ids:
-            prev_step_results = torch.randint(
-                low=0,
-                high=vocab_size - 1,
-                size=(batch_size, sequence_length),
-                dtype=torch.int64,
-                device=device,
-            )
-        else:
-            prev_step_results = None
-
-        prev_step_scores = torch.zeros([batch_size, 1], dtype=float_type, device=device)
-
-        return Gpt2BeamSearchInputs(
-            gpt2_dummy_inputs.input_ids,
-            gpt2_dummy_inputs.past,
-            gpt2_dummy_inputs.position_ids,
-            gpt2_dummy_inputs.attention_mask,
-            beam_select_idx,
-            input_log_probs,
-            input_unfinished_sents,
-            prev_step_results,
-            prev_step_scores,
-        )
-
-    @staticmethod
-    def get_output_shapes(
-        batch_size: int,
-        context_len: int,
-        past_sequence_length: int,
-        sequence_length: int,
-        beam_size: int,
-        step: int,
-        config: GPT2Config,
-        model_class: str = "GPT2LMHeadModel_BeamSearchStep",
-        num_seq: int = 0,
-    ) -> Dict[str, List[int]]:
-        """Returns a dictionary with output name as key, and shape as value."""
-        num_attention_heads = config.num_attention_heads
-        hidden_size = config.hidden_size
-        num_layer = config.num_hidden_layers
-        vocab_size = config.vocab_size
-
-        output_name = MODEL_CLASSES[model_class][1]
-
-        if model_class == "GPT2LMHeadModel_BeamSearchStep":
-            last_state_shape = [batch_size, beam_size]
-        else:
-            last_state_shape = [
-                batch_size * beam_size,
-                past_sequence_length - context_len + sequence_length + 1,
-            ]
-
-        if model_class == "GPT2LMHeadModel_BeamSearchStep":
-            if step == 0:
-                present_state_shape = [
-                    2,
-                    batch_size,
-                    num_attention_heads,
-                    past_sequence_length + sequence_length,
-                    int(hidden_size / num_attention_heads),
-                ]
-            else:
-                if num_seq == 0:
-                    num_seq = beam_size
-
-                present_state_shape = [
-                    2,
-                    batch_size * num_seq,
-                    num_attention_heads,
-                    past_sequence_length + sequence_length,
-                    int(hidden_size / num_attention_heads),
-                ]
-        else:
-            present_state_shape = [
-                2,
-                batch_size,
-                num_attention_heads,
-                past_sequence_length - context_len + sequence_length,
-                int(hidden_size / num_attention_heads),
-            ]
-
-        output_shapes = {output_name: last_state_shape}
-        for i in range(num_layer):
-            output_shapes["present_" + str(i)] = present_state_shape
-
-        # TODO: reshape output_selected_indices as [batch_size, beam_size]
-        output_shapes["output_selected_indices"] = [1, batch_size * beam_size]
-        output_shapes["output_log_probs"] = [batch_size, beam_size]
-        output_shapes["output_unfinished_sents"] = [batch_size, beam_size]
-        if model_class == "GPT2LMHeadModel_BeamSearchStep":
-            output_shapes["current_step_results"] = [
-                batch_size * beam_size,
-                past_sequence_length - context_len + sequence_length + 1,
-            ]
-        output_shapes["current_step_scores"] = [
-            batch_size * beam_size,
-            past_sequence_length - context_len + 2,
-        ]
-        print("output_shapes", output_shapes)
-        return output_shapes
-
-    @staticmethod
-    def get_output_buffers(output_shapes, device, is_float16=False):
-        """Returns a dictionary of output name as key, and 1D tensor as value. The tensor has enough space for given shape."""
-        data_type = torch.float16 if is_float16 else torch.float32
-
-        output_buffers = {}
-        for name, shape in output_shapes.items():
-            if name == "output_selected_indices" or name == "current_step_results" or name == "last_state":
-                output_buffers[name] = torch.empty(numpy.prod(shape), dtype=torch.long, device=device)
-            elif name == "output_unfinished_sents":
-                output_buffers[name] = torch.empty(numpy.prod(shape), dtype=torch.bool, device=device)
-            else:
-                output_buffers[name] = torch.empty(numpy.prod(shape), dtype=data_type, device=device)
-        return output_buffers
-
-    @staticmethod
-    def compare_outputs(
-        torch_outputs,
-        ort_outputs,
-        model_class="GPT2LMHeadModel_BeamSearchStep",
-        rtol=1e-03,
-        atol=1e-03,
-    ):
-        """Returns True if torch and ORT outputs are close for given thresholds, and False otherwise."""
-        if model_class == "GPT2LMHeadModel_BeamSearchStep":
-            results_id = -4
-            num_layers = len(ort_outputs) - 6
-        else:
-            results_id = 0
-            num_layers = len(ort_outputs) - 5
-
-        is_close = numpy.allclose(
-            ort_outputs[results_id],
-            torch_outputs[results_id].cpu().numpy(),
-            rtol=rtol,
-            atol=atol,
-        )
-        logger.debug(f"PyTorch and OnnxRuntime output 0 (last_state) are close: {is_close}")
-
-        is_all_close = is_close
-        for layer in range(num_layers):
-            is_close = numpy.allclose(
-                ort_outputs[1 + layer],
-                torch_outputs[1][layer].cpu().numpy(),
-                rtol=rtol,
-                atol=atol,
-            )
-            logger.debug(f"PyTorch and OnnxRuntime layer {layer} state (present_{layer}) are close:{is_close}")
-            is_all_close = is_all_close and is_close
-
-        if not is_all_close:
-            max_abs_diff = Gpt2BeamSearchHelper.diff_outputs(torch_outputs, ort_outputs)
-            logger.info(f"PyTorch and OnnxRuntime results are not all close: max_abs_diff={max_abs_diff:.5f}")
-
-        return is_all_close
-
-    @staticmethod
-    def export_onnx(
-        model,
-        device,
-        onnx_model_path: str,
-        verbose: bool = False,
-        use_external_data_format: bool = False,
-        has_position_ids: bool = True,
-        has_attention_mask: bool = True,
-    ):
-        """Export GPT-2 model with past state to ONNX model."""
-        assert isinstance(
-            model,
-            (GPT2LMHeadModel_BeamSearchStep, GPT2LMHeadModel_ConfigurableOneStepSearch),
-        )
-
-        config: GPT2Config = model.config
-        num_layer = config.n_layer
-        dummy_inputs = Gpt2BeamSearchHelper.get_dummy_inputs(
-            batch_size=1,
-            past_sequence_length=1,
-            sequence_length=2,
-            num_attention_heads=config.num_attention_heads,
-            hidden_size=config.hidden_size,
-            num_layer=num_layer,
-            vocab_size=config.vocab_size,
-            device=device,
-            float16=False,
-            has_position_ids=has_position_ids,
-            has_attention_mask=has_attention_mask,
-        )
-        input_list = dummy_inputs.to_list()
-
-        with torch.no_grad():
-            outputs = model(*input_list)
-
-        past_names = [f"past_{i}" for i in range(num_layer)]
-        present_names = [f"present_{i}" for i in range(num_layer)]
-
-        output_names = ["last_state"] + present_names
-
-        if has_position_ids:
-            output_names += [
-                "output_selected_indices",
-                "output_log_probs",
-                "output_unfinished_sents",
-                "current_step_results",
-                "current_step_scores",
-            ]
-        else:
-            output_names += [
-                "output_selected_indices",
-                "output_log_probs",
-                "output_unfinished_sents",
-                "current_step_scores",
-            ]
-
-        dynamic_axes = {
-            "input_ids": {0: "batch_size", 1: "seq_len"},
-            output_names[0]: {0: "batch_size", 1: "seq_len"},
-        }
-        for name in past_names:
-            dynamic_axes[name] = {1: "batch_size", 3: "past_seq_len"}
-        for name in present_names:
-            dynamic_axes[name] = {1: "batch_size", 3: "cur_seq_len"}
-
-        input_names = ["input_ids"]
-        if has_position_ids:
-            dynamic_axes["position_ids"] = {0: "batch_size", 1: "seq_len"}
-            input_names.append("position_ids")
-        if has_attention_mask:
-            dynamic_axes["attention_mask"] = {0: "batch_size", 1: "total_seq_len"}
-            input_names.append("attention_mask")
-        dynamic_axes["beam_select_idx"] = {1: "batch_size"}
-        input_names.append("beam_select_idx")
-        dynamic_axes["input_log_probs"] = {0: "batch_size", 1: "beam_size"}
-        input_names.append("input_log_probs")
-        dynamic_axes["input_unfinished_sents"] = {0: "batch_size", 1: "beam_size"}
-        input_names.append("input_unfinished_sents")
-        if has_position_ids:
-            dynamic_axes["prev_step_results"] = {0: "batch_size", 1: "total_seq_len"}
-            input_names.append("prev_step_results")
-        dynamic_axes["prev_step_scores"] = {0: "batch_size", 1: "total_seq_len"}
-        input_names.append("prev_step_scores")
-        input_names.extend(past_names)
-
-        # add dynamic output axes
-        present_axes = {1: "batch_size", 3: "cur_seq_len"}
-
-        if isinstance(model, GPT2LMHeadModel_BeamSearchStep):
-            dynamic_axes["last_state"] = {0: "batch_size", 1: "beam_size"}
-        else:
-            dynamic_axes["last_state"] = {
-                0: "batch_size * beam_size",
-                1: "total_seq_len",
-            }
-
-        for i in range(num_layer):
-            dynamic_axes["present_" + str(i)] = present_axes
-
-        dynamic_axes["output_selected_indices"] = {1: "batch_size * beam_size"}
-        dynamic_axes["output_log_probs"] = {0: "batch_size", 1: "beam_size"}
-        dynamic_axes["output_unfinished_sents"] = {0: "batch_size", 1: "beam_size"}
-
-        if "current_step_results" in output_names:
-            dynamic_axes["current_step_results"] = {
-                0: "batch_size * beam_size",
-                1: "total_seq_len",
-            }
-
-        dynamic_axes["current_step_scores"] = {0: "batch_size * beam_size"}
-
-        logger.info(
-            f"Shapes: input_ids={dummy_inputs.input_ids.shape} past={dummy_inputs.past[0].shape} output={outputs[0].shape} present={outputs[1][0].shape}"
-        )
-
-        Path(onnx_model_path).parent.mkdir(parents=True, exist_ok=True)
-
-        torch_onnx_export(
-            model,
-            args=tuple(input_list),
-            f=onnx_model_path,
-            input_names=input_names,
-            output_names=output_names,
-            dynamic_axes=dynamic_axes,
-            opset_version=14,
-            do_constant_folding=True,
-            use_external_data_format=use_external_data_format,
-            verbose=verbose,
-        )
-
-    @staticmethod
-    def onnxruntime_inference(ort_session, inputs: Gpt2BeamSearchInputs, total_runs: int = 0):
-        """Run inference of ONNX model, and returns average latency in ms when total_runs > 0 besides outputs."""
-        logger.debug(f"start onnxruntime_inference")
-
-        ort_inputs = {"input_ids": numpy.ascontiguousarray(inputs.input_ids.cpu().numpy())}
-
-        if inputs.position_ids is not None:
-            ort_inputs["position_ids"] = numpy.ascontiguousarray(inputs.position_ids.cpu().numpy())
-        if inputs.attention_mask is not None:
-            ort_inputs["attention_mask"] = numpy.ascontiguousarray(inputs.attention_mask.cpu().numpy())
-        if inputs.beam_select_idx is not None:
-            ort_inputs["beam_select_idx"] = numpy.ascontiguousarray(inputs.beam_select_idx.cpu().numpy())
-        if inputs.input_log_probs is not None:
-            ort_inputs["input_log_probs"] = numpy.ascontiguousarray(inputs.input_log_probs.cpu().numpy())
-        if inputs.input_unfinished_sents is not None:
-            ort_inputs["input_unfinished_sents"] = numpy.ascontiguousarray(inputs.input_unfinished_sents.cpu().numpy())
-        if inputs.prev_step_results is not None:
-            ort_inputs["prev_step_results"] = numpy.ascontiguousarray(inputs.prev_step_results.cpu().numpy())
-        if inputs.prev_step_scores is not None:
-            ort_inputs["prev_step_scores"] = numpy.ascontiguousarray(inputs.prev_step_scores.cpu().numpy())
-        if inputs.past is not None:
-            for i, past_i in enumerate(inputs.past):
-                ort_inputs[f"past_{i}"] = numpy.ascontiguousarray(past_i.cpu().numpy())
-
-        ort_outputs = ort_session.run(None, ort_inputs)
-        if total_runs == 0:
-            return ort_outputs
-
-        latency = []
-        for _ in range(total_runs):
-            start = time.time()
-            ort_outputs = ort_session.run(None, ort_inputs)
-            latency.append(time.time() - start)
-
-        average_latency = sum(latency) * 1000 / len(latency)
-        logger.debug("OnnxRuntime Inference time = {} ms".format(format(average_latency, ".2f")))
-
-        return ort_outputs, average_latency
-
-    @staticmethod
-    def prepare_io_binding(
-        ort_session,
-        input_ids,
-        position_ids,
-        attention_mask,
-        past,
-        output_buffers,
-        output_shapes,
-        beam_select_idx=None,
-        input_log_probs=None,
-        input_unfinished_sents=None,
-        prev_step_results=None,
-        prev_step_scores=None,
-    ):
-        """Returnas IO binding object for a session."""
-
-        # Bind (input_ids, position_ids, attention_mask and past_*) and all outputs
-        io_binding = Gpt2Helper.prepare_io_binding(
-            ort_session,
-            input_ids,
-            position_ids,
-            attention_mask,
-            past=past,
-            output_buffers=output_buffers,
-            output_shapes=output_shapes,
-        )
-
-        # Bind the remaining inputs
-        other_inputs = {
-            "beam_select_idx": beam_select_idx,
-            "input_log_probs": input_log_probs,
-            "input_unfinished_sents": input_unfinished_sents,
-            "prev_step_results": prev_step_results,
-            "prev_step_scores": prev_step_scores,
-        }
-        name_to_np_type = TypeHelper.get_io_numpy_type_map(ort_session)
-        for name, tensor in other_inputs.items():
-            if tensor is not None:
-                assert tensor.is_contiguous()
-                io_binding.bind_input(
-                    name,
-                    tensor.device.type,
-                    0,
-                    name_to_np_type[name],
-                    list(tensor.size()),
-                    tensor.data_ptr(),
-                )
-
-        return io_binding
-
-    @staticmethod
-    def onnxruntime_inference_with_binded_io(
-        ort_session,
-        inputs: Gpt2BeamSearchInputs,
-        output_buffers: Dict[str, torch.Tensor],
-        output_shapes: Dict[str, List[int]],
-        total_runs: int = 0,
-        return_numpy: bool = True,
-        include_copy_output_latency: bool = False,
-    ):
-        """Inference with IO binding. Returns outputs, and optional latency when total_runs > 0."""
-        logger.debug(f"start onnxruntime_inference_with_binded_io")
-
-        # Bind inputs and outputs to onnxruntime session
-        io_binding = Gpt2BeamSearchHelper.prepare_io_binding(
-            ort_session,
-            inputs.input_ids,
-            inputs.position_ids,
-            inputs.attention_mask,
-            inputs.past,
-            output_buffers,
-            output_shapes,
-            inputs.beam_select_idx,
-            inputs.input_log_probs,
-            inputs.input_unfinished_sents,
-            inputs.prev_step_results,
-            inputs.prev_step_scores,
-        )
-
-        # Run onnxruntime with io binding
-        ort_session.run_with_iobinding(io_binding)
-
-        # Copy results to cpu for verification
-        ort_outputs = Gpt2BeamSearchHelper.get_outputs_from_io_binding_buffer(
-            ort_session, output_buffers, output_shapes, return_numpy
-        )
-
-        if total_runs == 0:
-            return ort_outputs
-
-        latency = []
-        for _ in range(total_runs):
-            start = time.time()
-            # Run onnxruntime with io binding
-            ort_session.run_with_iobinding(io_binding)
-            if include_copy_output_latency:
-                _ = Gpt2BeamSearchHelper.get_outputs_from_io_binding_buffer(
-                    ort_session, output_buffers, output_shapes, return_numpy
-                )
-            latency.append(time.time() - start)
-
-        average_latency = sum(latency) * 1000 / len(latency)
-        logger.debug("OnnxRuntime with IO binding inference time = {} ms".format(format(average_latency, ".2f")))
-
-        return ort_outputs, average_latency
-
-    @staticmethod
-    def test_parity(
-        ort_session,
-        model,
-        device,
-        is_float16=False,
-        rtol=5e-4,
-        atol=5e-4,
-        total_test_cases=100,
-        use_io_binding=True,
-        model_class="GPT2LMHeadModel_BeamSearchStep",
-        has_position_ids=True,
-        has_attention_mask=True,
-    ):
-        """Generate random inputs and compare the results of PyTorch and Onnx Runtime."""
-
-        config: GPT2Config = model.config
-
-        logger.info(
-            f"Running parity test (rtol={rtol}, atol={atol}, test_cases={total_test_cases}, use_io_binding={use_io_binding} model_class={model_class} is_float16={is_float16}) ..."
-        )
-
-        max_batch_size = 1
-        max_past_seq_len = 4  # Do not use large number here for higher chance of hitting empty past (past_seq_len=0)
-        max_seq_len = 2
-        beam_size = 4
-
-        output_buffers = None
-        if use_io_binding:
-            max_output_shapes = Gpt2BeamSearchHelper.get_output_shapes(
-                max_batch_size,
-                max_past_seq_len,
-                max_past_seq_len,
-                max_seq_len,
-                beam_size,
-                0,
-                config,
-                model_class,
-            )
-            output_buffers = Gpt2BeamSearchHelper.get_output_buffers(max_output_shapes, device, is_float16)
-
-        passed_test_cases = 0
-        for _ in range(total_test_cases):
-            past_sequence_length = random.randint(0, max_past_seq_len)
-            sequence_length = random.randint(1 + past_sequence_length, max_seq_len + past_sequence_length)
-            batch_size = random.randint(1, max_batch_size)
-
-            logger.debug(
-                f"Running parity test for batch_size={batch_size} past_sequence_length={past_sequence_length}..."
-            )
-            dummy_inputs = Gpt2BeamSearchHelper.get_dummy_inputs(
-                batch_size,
-                past_sequence_length,
-                sequence_length,
-                config.num_attention_heads,
-                config.hidden_size,
-                config.n_layer,
-                config.vocab_size,
-                device,
-                is_float16,
-                has_position_ids,
-                has_attention_mask,
-            )
-
-            outputs = Gpt2BeamSearchHelper.pytorch_inference(model, dummy_inputs)
-            if use_io_binding:
-                ort_outputs = Gpt2BeamSearchHelper.onnxruntime_inference(ort_session, dummy_inputs)
-            else:
-                output_shapes = Gpt2BeamSearchHelper.get_output_shapes(
-                    batch_size,
-                    past_sequence_length,
-                    past_sequence_length,
-                    sequence_length,
-                    beam_size,
-                    0,
-                    config,
-                    model_class,
-                )
-                ort_outputs = Gpt2BeamSearchHelper.onnxruntime_inference_with_binded_io(
-                    ort_session, dummy_inputs, output_buffers, output_shapes
-                )
-
-            is_all_close = Gpt2BeamSearchHelper.compare_outputs(
-                outputs, ort_outputs, model_class=model_class, rtol=rtol, atol=atol
-            )
-            if is_all_close:
-                passed_test_cases += 1
-        logger.info(f"Parity Test Cases={total_test_cases}; Passed={passed_test_cases}")
-        if passed_test_cases > 0.95 * total_test_cases:
-            logger.info(f"Parity is good: passed rate={int(passed_test_cases*100/total_test_cases):.0f}%")
-        return passed_test_cases == total_test_cases
-
-    @staticmethod
-    def torchscript(model, config, device, has_position_ids=True, has_attention_mask=True):
-        """JIT trace for TorchScript."""
-        input_list = Gpt2BeamSearchHelper.get_dummy_inputs(
-            batch_size=1,
-            past_sequence_length=1,
-            sequence_length=1,
-            num_attention_heads=config.num_attention_heads,
-            hidden_size=config.hidden_size,
-            num_layer=config.n_layer,
-            vocab_size=config.vocab_size,
-            device=device,
-            float16=False,
-            has_position_ids=has_position_ids,
-            has_attention_mask=has_attention_mask,
-        ).to_list()
-        return torch.jit.trace(model, input_list)
diff --git a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_beamsearch_tester.py b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_beamsearch_tester.py
deleted file mode 100644
index 3a8c17a3b7..0000000000
--- a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_beamsearch_tester.py
+++ /dev/null
@@ -1,442 +0,0 @@
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation.  All rights reserved.
-# Licensed under the MIT License.  See License.txt in the project root for
-# license information.
-# --------------------------------------------------------------------------
-# This script helps evaluation of GPT-2 model.
-import logging
-import os
-import sys
-import timeit
-
-import numpy
-import torch
-from gpt2_beamsearch_helper import Gpt2BeamSearchHelper, Gpt2BeamSearchInputs
-from gpt2_tester import Gpt2Metric, Gpt2Tester
-
-sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
-
-from benchmark_helper import Precision
-
-logger = logging.getLogger(__name__)
-
-
-class Gpt2TesterFactory:
-    @staticmethod
-    def create_tester(tester_type="default"):
-        testers = {
-            "default": Gpt2Tester,
-            "beam_search_step": Gpt2BeamSearchTester,
-            "configurable_one_step_search": Gpt2BeamSearchTester,
-        }
-        w = testers[tester_type]
-        return w
-
-
-class Gpt2BeamSearchTester(Gpt2Tester):
-    def __init__(
-        self,
-        input_ids,
-        position_ids,
-        attention_mask,
-        beam_select_idx,
-        input_log_probs,
-        input_unfinished_sents,
-        prev_step_results,
-        prev_step_scores,
-        num_attention_heads,
-        hidden_size,
-        num_layer,
-        beam_size,
-        device,
-        is_fp16=False,
-        top_k=20,
-        top_k_required_order=False,
-    ):
-        super().__init__(
-            input_ids,
-            position_ids,
-            attention_mask,
-            num_attention_heads=num_attention_heads,
-            hidden_size=hidden_size,
-            num_layer=num_layer,
-            device=device,
-            is_fp16=is_fp16,
-            top_k=top_k,
-            top_k_required_order=top_k_required_order,
-        )
-        self.input_length = input_ids.shape[-1]
-        self.n_layer = num_layer
-        self.beam_size = beam_size
-
-        self.beam_select_idx = beam_select_idx.to(device)
-
-        float_type = torch.float16 if is_fp16 else torch.float32
-        self.input_log_probs = input_log_probs.type(float_type).to(device)
-        self.input_unfinished_sents = input_unfinished_sents.to(device)
-
-        self.prev_step_results = prev_step_results.to(device) if prev_step_results is not None else None
-        self.prev_step_scores = prev_step_scores.type(float_type).to(device)
-
-        self.last_state = None
-
-    def get_inputs(self) -> Gpt2BeamSearchInputs:
-        return Gpt2BeamSearchInputs(
-            self.input_ids,
-            self.past,
-            self.position_ids,
-            self.attention_mask,
-            self.beam_select_idx,
-            self.input_log_probs,
-            self.input_unfinished_sents,
-            self.prev_step_results,
-            self.prev_step_scores,
-        )
-
-    def update(self, output, step, device):
-        """
-        Update the inputs for next inference.
-        """
-        self.last_state = (
-            torch.from_numpy(output[0]).to(device)
-            if isinstance(output[0], numpy.ndarray)
-            else output[0].clone().detach().cpu()
-        )
-
-        self.input_ids = self.last_state.view(self.batch_size * self.beam_size, -1).to(device)
-
-        if self.position_ids is not None:
-            input_unfinished_sents_id = -3
-            self.prev_step_results = (
-                torch.from_numpy(output[-2]).to(device)
-                if isinstance(output[-2], numpy.ndarray)
-                else output[-2].clone().detach().to(device)
-            )
-            self.position_ids = (
-                torch.tensor([self.input_length + step - 1])
-                .unsqueeze(0)
-                .repeat(self.batch_size * self.beam_size, 1)
-                .to(device)
-            )
-
-            if self.attention_mask.size(0) != (self.batch_size * self.beam_size):
-                self.attention_mask = self.attention_mask.repeat(self.batch_size * self.beam_size, 1)
-            self.attention_mask = torch.cat(
-                [
-                    self.attention_mask,
-                    torch.ones([self.batch_size * self.beam_size, 1]).type_as(self.attention_mask),
-                ],
-                1,
-            ).to(device)
-        else:
-            input_unfinished_sents_id = -2
-
-        self.beam_select_idx = (
-            torch.from_numpy(output[input_unfinished_sents_id - 2]).to(device)
-            if isinstance(output[input_unfinished_sents_id - 2], numpy.ndarray)
-            else output[input_unfinished_sents_id - 2].clone().detach().to(device)
-        )
-        self.input_log_probs = (
-            torch.from_numpy(output[input_unfinished_sents_id - 1]).to(device)
-            if isinstance(output[input_unfinished_sents_id - 1], numpy.ndarray)
-            else output[input_unfinished_sents_id - 1].clone().detach().to(device)
-        )
-        self.input_unfinished_sents = (
-            torch.from_numpy(output[input_unfinished_sents_id]).to(device)
-            if isinstance(output[input_unfinished_sents_id], numpy.ndarray)
-            else output[input_unfinished_sents_id].clone().detach().to(device)
-        )
-        self.prev_step_scores = (
-            torch.from_numpy(output[-1]).to(device)
-            if isinstance(output[-1], numpy.ndarray)
-            else output[-1].clone().detach().to(device)
-        )
-        self.top_1_tokens = self.input_ids[0]
-        self.top_k_tokens = self.last_state
-
-        self.past = []
-
-        if isinstance(output[1], tuple):  # past in torch output is tuple
-            self.past = list(output[1])
-        else:
-            for i in range(self.n_layer):
-                past_i = (
-                    torch.from_numpy(output[i + 1])
-                    if isinstance(output[i + 1], numpy.ndarray)
-                    else output[i + 1].clone().detach()
-                )
-                self.past.append(past_i.to(device))
-
-    @staticmethod
-    def test_generation(
-        session,
-        model,
-        device,
-        test_inputs,
-        precision=Precision.FLOAT32,
-        model_class="GPT2LMHeadModel_BeamSearchStep",
-        top_k=20,
-        top_k_no_order=True,
-        max_steps=24,
-        max_inputs=0,
-        verbose=False,
-        save_test_data=0,
-        save_test_data_dir=".",
-    ):
-        """
-        Test Generation using beam search to compare PyTorch and ONNX model.
-        It will print top 1 and top k errors on the given test inputs.
-        """
-        print(
-            f"start test generation: (top_k={top_k} top_k_no_order={top_k_no_order} max_steps={max_steps} test_inputs={len(test_inputs)} max_inputs={max_inputs})"
-        )
-        n_layer = model.config.n_layer
-        n_head = model.config.n_head
-        n_embd = model.config.n_embd
-        beam_size = model.config.beam_size
-        eos_token_id = model.config.eos_token_id
-        test_data_saved = 0
-
-        is_float16 = precision == Precision.FLOAT16
-
-        # We will still use fp32 torch model as baseline when onnx model if fp16
-        model.eval().to(device)
-
-        # Allocate initial buffers for IO Binding of ONNX Runtimne. The buffer size will automatically increase later.
-        init_output_shapes = Gpt2BeamSearchHelper.get_output_shapes(
-            batch_size=4,
-            context_len=128,
-            past_sequence_length=128,
-            sequence_length=32,
-            beam_size=1,
-            step=0,
-            config=model.config,
-            model_class=model_class,
-        )
-        output_buffers = Gpt2BeamSearchHelper.get_output_buffers(
-            init_output_shapes,
-            device,
-            is_float16=is_float16,
-        )
-
-        baseline_name = "Torch"
-        treatment_name = "Quantized Onnx" if precision == Precision.INT8 else "Onnx"
-        torch_metric = Gpt2Metric(baseline_name, baseline_name, top_k)
-        onnx_metric = Gpt2Metric(treatment_name, baseline_name, top_k)
-        onnx_io_metric = Gpt2Metric(treatment_name + " with IO Binding", baseline_name, top_k)
-
-        for i, inputs in enumerate(test_inputs):
-            if max_inputs > 0 and i == max_inputs:
-                break
-            if i % 10 == 0:
-                print(f"{i}")
-            input_ids = inputs["input_ids"]
-            position_ids = inputs["position_ids"] if "position_ids" in inputs else None
-            attention_mask = inputs["attention_mask"] if "attention_mask" in inputs else None
-            beam_select_idx = inputs["beam_select_idx"] if "beam_select_idx" in inputs else None
-            input_log_probs = inputs["input_log_probs"] if "input_log_probs" in inputs else None
-            input_unfinished_sents = inputs["input_unfinished_sents"]
-            if model_class == "GPT2LMHeadModel_BeamSearchStep":
-                prev_step_results = inputs["input_ids"]
-            else:
-                prev_step_results = None
-
-            if "prev_step_scores" in inputs:
-                prev_step_scores = inputs["prev_step_scores"]
-            else:
-                prev_step_scores = torch.zeros([input_ids.shape[0], 1])
-
-            onnx_runner = Gpt2BeamSearchTester(
-                input_ids,
-                position_ids,
-                attention_mask,
-                beam_select_idx,
-                input_log_probs,
-                input_unfinished_sents,
-                prev_step_results,
-                prev_step_scores,
-                n_head,
-                n_embd,
-                n_layer,
-                beam_size,
-                device,
-                is_float16,
-                top_k,
-                not top_k_no_order,
-            )
-            onnx_io_runner = Gpt2BeamSearchTester(
-                input_ids,
-                position_ids,
-                attention_mask,
-                beam_select_idx,
-                input_log_probs,
-                input_unfinished_sents,
-                prev_step_results,
-                prev_step_scores,
-                n_head,
-                n_embd,
-                n_layer,
-                beam_size,
-                device,
-                is_float16,
-                top_k,
-                not top_k_no_order,
-            )
-            torch_runner = Gpt2BeamSearchTester(
-                input_ids,
-                position_ids,
-                attention_mask,
-                beam_select_idx,
-                input_log_probs,
-                input_unfinished_sents,
-                prev_step_results,
-                prev_step_scores,
-                n_head,
-                n_embd,
-                n_layer,
-                beam_size,
-                device,
-                False,
-                top_k,
-                not top_k_no_order,
-            )  # Torch model baseline is fp32
-
-            batch_size = torch_runner.batch_size
-            onnx_metric.start_batch(batch_size)
-            onnx_io_metric.start_batch(batch_size)
-            context_len = list(onnx_runner.input_ids.size())[-1]
-            with torch.no_grad():
-                for step in range(max_steps):
-                    print(f"Processing step: {step}")
-                    if model_class == "GPT2LMHeadModel_BeamSearchStep":
-                        num_seq = beam_size
-                        seq_len = list(onnx_runner.input_ids.size())[1]
-                        past_seq_len = list(onnx_runner.past[0].size())[3]
-                    else:
-                        num_seq = sum(onnx_io_runner.input_unfinished_sents.view(-1).long().cpu())
-                        past_seq_len = list(onnx_runner.past[0].size())[3]
-                        seq_len = list(onnx_runner.input_ids.size())[-1] - past_seq_len
-
-                    start_time = timeit.default_timer()
-                    pytorch_output = Gpt2BeamSearchHelper.pytorch_inference(model, torch_runner.get_inputs())
-                    torch_metric.add_latency(past_seq_len, timeit.default_timer() - start_time)
-                    torch_runner.update(pytorch_output, step, device)
-
-                    (
-                        onnx_output,
-                        avg_latency_ms,
-                    ) = Gpt2BeamSearchHelper.onnxruntime_inference(session, onnx_runner.get_inputs(), total_runs=1)
-                    onnx_metric.add_latency(past_seq_len, avg_latency_ms / 1000.0)
-                    onnx_runner.update(onnx_output, step, device)
-
-                    if model_class == "GPT2LMHeadModel_BeamSearchStep":
-                        num_seq = beam_size
-                    else:
-                        num_seq = sum(onnx_io_runner.input_unfinished_sents.view(-1).long().cpu())
-
-                    output_shapes = Gpt2BeamSearchHelper.get_output_shapes(
-                        batch_size,
-                        context_len,
-                        past_seq_len,
-                        seq_len,
-                        beam_size,
-                        step,
-                        model.config,
-                        model_class=model_class,
-                        num_seq=num_seq,
-                    )
-
-                    Gpt2BeamSearchHelper.auto_increase_buffer_size(output_buffers, output_shapes)
-
-                    (onnx_io_output, avg_latency_ms,) = Gpt2BeamSearchHelper.onnxruntime_inference_with_binded_io(
-                        session,
-                        onnx_io_runner.get_inputs(),
-                        output_buffers,
-                        output_shapes,
-                        total_runs=1,
-                        return_numpy=False,
-                        include_copy_output_latency=True,
-                    )
-
-                    onnx_io_metric.add_latency(past_seq_len, avg_latency_ms / 1000.0)
-
-                    if test_data_saved < save_test_data:
-                        onnx_io_runner.save_test_data(session, onnx_io_output, save_test_data_dir, test_data_saved)
-                        test_data_saved += 1
-
-                    onnx_io_runner.update(onnx_io_output, step, device)
-
-                    if (not onnx_runner.input_unfinished_sents.any()) or (
-                        not torch_runner.input_unfinished_sents.any()
-                    ):
-                        print("break at step: ", step)
-                        break
-
-            print(f"Totally {step+1} steps run")
-            onnx_metric.end_batch()
-            onnx_io_metric.end_batch()
-
-        torch_metric.print()
-        onnx_metric.print()
-        onnx_io_metric.print()
-
-        print("\tONNX")
-        if model_class == "GPT2LMHeadModel_BeamSearchStep":
-            results_onnx = onnx_runner.prev_step_results.view(batch_size * beam_size, -1)
-            results_onnx_io = onnx_io_runner.prev_step_results.view(batch_size * beam_size, -1)
-        else:
-            results_onnx = onnx_runner.input_ids.view(batch_size * beam_size, -1)
-            results_onnx_io = onnx_io_runner.input_ids.view(batch_size * beam_size, -1)
-        Gpt2BeamSearchTester.pprint_results(
-            results_onnx,
-            onnx_runner.prev_step_scores.view(batch_size * beam_size, -1),
-            pad_token_id=eos_token_id,
-            eos_token_id=eos_token_id,
-        )
-        print("\tONNX with IO binding")
-        Gpt2BeamSearchTester.pprint_results(
-            results_onnx_io,
-            onnx_io_runner.prev_step_scores.view(batch_size * beam_size, -1),
-            pad_token_id=eos_token_id,
-            eos_token_id=eos_token_id,
-        )
-
-    @staticmethod
-    def pprint_results(
-        output_ids,
-        output_scores,
-        pad_token_id=None,
-        eos_token_id=None,
-    ):
-        """
-        Print test generation results.
-        """
-        if pad_token_id is None:
-            pad_token_id = 1
-        if eos_token_id is None:
-            eos_token_id = 1
-        if torch.is_tensor(output_ids):
-            output_ids = output_ids.cpu().numpy()
-
-        for i, sample in enumerate(output_ids):
-            for j, seq in enumerate(sample):
-                if isinstance(seq, numpy.ndarray) or isinstance(seq, list):
-                    # remove left padding
-                    for k, t in enumerate(seq):
-                        if t != pad_token_id:
-                            seq = seq[k:]
-                            break
-                    # remove EOS
-                    for k, t in enumerate(seq):
-                        if t == eos_token_id:
-                            seq = seq[: k + 1]
-                            break
-                    print("-" * 40)
-                    result = ",".join([str(token_id) for token_id in sample])
-                    print(f">> Output {j + 1}: \t{[result]}")
-                else:
-                    result = ",".join([str(token_id) for token_id in sample])
-                    print(f">> Output {i}: \t{result}")
-                    print(f">> Scores {i}: \t{output_scores[i]}")
-                    break
-            print("=" * 80)
diff --git a/onnxruntime/python/tools/transformers/notebooks/Inference_GPT2-OneStepSearch_OnnxRuntime_CPU.ipynb b/onnxruntime/python/tools/transformers/notebooks/Inference_GPT2-OneStepSearch_OnnxRuntime_CPU.ipynb
deleted file mode 100644
index 4f2198bbb2..0000000000
--- a/onnxruntime/python/tools/transformers/notebooks/Inference_GPT2-OneStepSearch_OnnxRuntime_CPU.ipynb
+++ /dev/null
@@ -1,491 +0,0 @@
-{
- "metadata": {
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.0"
-  },
-  "orig_nbformat": 2,
-  "kernelspec": {
-   "name": "python370jvsc74a57bd081098997110362167705b61d21e46dda767ff2050d805c22b6ba90fec7e1aa35",
-   "display_name": "Python 3.7.0 64-bit ('py37athena': conda)"
-  },
-  "metadata": {
-   "interpreter": {
-    "hash": "81098997110362167705b61d21e46dda767ff2050d805c22b6ba90fec7e1aa35"
-   }
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2,
- "cells": [
-  {
-   "source": [
-    "Copyright (c) Microsoft Corporation. All rights reserved.\n",
-    "Licensed under the MIT License."
-   ],
-   "cell_type": "markdown",
-   "metadata": {}
-  },
-  {
-   "source": [
-    "# Optimizing runtime performance on GPT-2 model inference with ONNXRuntime on CPU\n",
-    "\n",
-    "In this tutorial, you'll be introduced to how to load a GPT2 model from PyTorch, convert it to ONNX with one step search, and inference it using ONNX Runtime with/without IO Binding. GPT-2 model inference is optimized by compiling one-step beam search into the onnx compute graph, which speeds up the runtime significantly. "
-   ],
-   "cell_type": "markdown",
-   "metadata": {}
-  },
-  {
-   "source": [
-    "## Prerequisites\n",
-    "If you have Jupyter Notebook, you may directly run this notebook. We will use pip to install or upgrade [PyTorch](https://pytorch.org/), [OnnxRuntime](https://microsoft.github.io/onnxruntime/) and other required packages.\n",
-    "\n",
-    "Otherwise, you can setup a new environment. First, we install [Anaconda](https://www.anaconda.com/distribution/). Then open an AnaConda prompt window and run the following commands:\n",
-    "\n",
-    "```console\n",
-    "conda create -n cpu_env python=3.8\n",
-    "conda activate cpu_env\n",
-    "conda install jupyter\n",
-    "jupyter notebook\n",
-    "```\n",
-    "\n",
-    "The last command will launch Jupyter Notebook and we can open this notebook in browser to continue."
-   ],
-   "cell_type": "markdown",
-   "metadata": {}
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Install PyTorch 1.7.0 and OnnxRuntime 1.7.0 for CPU-only.\n",
-    "import sys\n",
-    "if sys.platform == 'darwin': # Mac\n",
-    "    !{sys.executable} -m pip install --upgrade torch torchvision\n",
-    "else:\n",
-    "    !{sys.executable} -m pip install --upgrade torch==1.7.0+cpu torchvision==0.8.1+cpu -f https://download.pytorch.org/whl/torch_stable.html\n",
-    "!{sys.executable} -m pip install onnxruntime==1.7.2\n",
-    "\n",
-    "# Install other packages used in this notebook.\n",
-    "!{sys.executable} -m pip install transformers==4.3.1\n",
-    "!{sys.executable} -m pip install onnx onnxconverter_common psutil pytz pandas py-cpuinfo py3nvml"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "\n",
-    "# Create a cache directory to store pretrained model.\n",
-    "cache_dir = os.path.join(\".\", \"cache_models\")\n",
-    "if not os.path.exists(cache_dir):\n",
-    "    os.makedirs(cache_dir)"
-   ]
-  },
-  {
-   "source": [
-    "## Convert GPT2 model from PyTorch to ONNX with one step search ##\n",
-    "\n",
-    "We have a script [convert_to_onnx.py](https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/transformers/convert_to_onnx.py) that could help you to convert GPT2 with past state to ONNX. \n",
-    "\n",
-    "The script accepts a pretrained model name or path of a checkpoint directory as input, and converts the model to ONNX. It also verifies that the ONNX model could generate same input as the pytorch model. The usage is like \n",
-    "```\n",
-    "python -m onnxruntime.transformers.convert_to_onnx -m model_name_or_path \\ \n",
-    "--model_class=GPT2LMHeadModel_BeamSearchStep|GPT2LMHeadModel_ConfigurableOneStepSearch \\ \n",
-    "--output gpt2_onestepsearch.onnx -o -p fp32|fp16|int8\n",
-    "```\n",
-    "The -p option can be used to choose the precision: fp32 (float32), fp16 (mixed precision) or int8 (quantization). The -o option will generate optimized model, which is required for fp16 or int8.\n",
-    "\n",
-    "Here we use a pretrained model as example:"
-   ],
-   "cell_type": "markdown",
-   "metadata": {}
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "stream",
-     "name": "stdout",
-     "text": [
-      "GPT2Config {\n  \"_name_or_path\": \"gpt2\",\n  \"activation_function\": \"gelu_new\",\n  \"architectures\": [\n    \"GPT2LMHeadModel\"\n  ],\n  \"attn_pdrop\": 0.1,\n  \"batch_size\": 1,\n  \"beam_size\": 4,\n  \"bos_token_id\": 50256,\n  \"embd_pdrop\": 0.1,\n  \"eos_token_id\": 50256,\n  \"gradient_checkpointing\": false,\n  \"initializer_range\": 0.02,\n  \"layer_norm_epsilon\": 1e-05,\n  \"model_type\": \"gpt2\",\n  \"n_ctx\": 1024,\n  \"n_embd\": 768,\n  \"n_head\": 12,\n  \"n_inner\": null,\n  \"n_layer\": 12,\n  \"n_positions\": 1024,\n  \"resid_pdrop\": 0.1,\n  \"summary_activation\": null,\n  \"summary_first_dropout\": 0.1,\n  \"summary_proj_to_labels\": true,\n  \"summary_type\": \"cls_index\",\n  \"summary_use_proj\": true,\n  \"task_specific_params\": {\n    \"text-generation\": {\n      \"do_sample\": true,\n      \"max_length\": 50\n    }\n  },\n  \"transformers_version\": \"4.3.1\",\n  \"use_cache\": true,\n  \"vocab_size\": 50257\n}\n\n"
-     ]
-    }
-   ],
-   "source": [
-    "from packaging import version\n",
-    "from onnxruntime import __version__ as ort_verison\n",
-    "if version.parse(ort_verison) >= version.parse('1.12.0'):\n",
-    "    from onnxruntime.transformers.models.gpt2.gpt2_beamsearch_helper import Gpt2BeamSearchHelper, GPT2LMHeadModel_BeamSearchStep\n",
-    "else:\n",
-    "    from onnxruntime.transformers.gpt2_beamsearch_helper import Gpt2BeamSearchHelper, GPT2LMHeadModel_BeamSearchStep\n",
-    "\n",
-    "from transformers import AutoConfig\n",
-    "import torch\n",
-    "\n",
-    "model_name_or_path = \"gpt2\"\n",
-    "config = AutoConfig.from_pretrained(model_name_or_path, cache_dir=cache_dir)\n",
-    "model = GPT2LMHeadModel_BeamSearchStep.from_pretrained(model_name_or_path, config=config, batch_size=1, beam_size=4, cache_dir=cache_dir)\n",
-    "device = torch.device(\"cpu\")\n",
-    "model.eval().to(device)\n",
-    "\n",
-    "print(model.config)\n",
-    "\n",
-    "num_attention_heads = model.config.n_head\n",
-    "hidden_size = model.config.n_embd\n",
-    "num_layer = model.config.n_layer"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "stream",
-     "name": "stderr",
-     "text": [
-      "/data/anaconda/envs/py37athena/lib/python3.7/site-packages/transformers/models/gpt2/modeling_gpt2.py:654: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n  assert batch_size > 0, \"batch_size has to be defined and > 0\"\n/data/anaconda/envs/py37athena/lib/python3.7/site-packages/transformers/models/gpt2/modeling_gpt2.py:169: TracerWarning: Converting a tensor to a Python float might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n  w = w / (float(v.size(-1)) ** 0.5)\n/data/anaconda/envs/py37athena/lib/python3.7/site-packages/transformers/models/gpt2/modeling_gpt2.py:174: TracerWarning: Converting a tensor to a Python index might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n  mask = self.bias[:, :, ns - nd : ns, :ns]\n"
-     ]
-    }
-   ],
-   "source": [
-    "onnx_model_path = \"gpt2_one_step_search.onnx\"\n",
-    "Gpt2BeamSearchHelper.export_onnx(model, device, onnx_model_path) # add parameter use_external_data_format=True when model size > 2 GB"
-   ]
-  },
-  {
-   "source": [
-    "## ONNX Runtime Inference ##\n",
-    "\n",
-    "We can use ONNX Runtime to inference. The inputs are dictionary with name and numpy array as value, and the output is list of numpy array. Note that both input and output are in CPU. When you run the inference in GPU, it will involve data copy between CPU and GPU for input and output.\n",
-    "\n",
-    "Let's create an inference session for ONNX Runtime given the exported ONNX model, and see the output."
-   ],
-   "cell_type": "markdown",
-   "metadata": {}
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import onnxruntime\n",
-    "import numpy\n",
-    "from transformers import AutoTokenizer\n",
-    "\n",
-    "EXAMPLE_Text = ['best hotel in bay area.']\n",
-    "\n",
-    "def get_tokenizer(model_name_or_path, cache_dir):\n",
-    "    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir)\n",
-    "    tokenizer.padding_side = \"left\"\n",
-    "    tokenizer.pad_token = tokenizer.eos_token\n",
-    "    #okenizer.add_special_tokens({'pad_token': '[PAD]'})\n",
-    "    return tokenizer\n",
-    "\n",
-    "def get_example_inputs(prompt_text=EXAMPLE_Text):    \n",
-    "    tokenizer = get_tokenizer(model_name_or_path, cache_dir)\n",
-    "    encodings_dict = tokenizer.batch_encode_plus(prompt_text, padding=True)\n",
-    "\n",
-    "    input_ids = torch.tensor(encodings_dict['input_ids'], dtype=torch.int64)\n",
-    "    attention_mask = torch.tensor(encodings_dict['attention_mask'], dtype=torch.float32)\n",
-    "    position_ids = (attention_mask.long().cumsum(-1) - 1)\n",
-    "    position_ids.masked_fill_(position_ids < 0, 0)\n",
-    "\n",
-    "    #Empty Past State for generating first word\n",
-    "    empty_past = []\n",
-    "    batch_size = input_ids.size(0)\n",
-    "    sequence_length = input_ids.size(1)\n",
-    "    past_shape = [2, batch_size, num_attention_heads, 0, hidden_size // num_attention_heads]\n",
-    "    for i in range(num_layer):\n",
-    "        empty_past.append(torch.empty(past_shape).type(torch.float32).to(device))\n",
-    "       \n",
-    "    return input_ids, attention_mask, position_ids, empty_past\n",
-    "\n",
-    "input_ids, attention_mask, position_ids, empty_past = get_example_inputs()\n",
-    "beam_select_idx = torch.zeros([1, input_ids.shape[0]]).long()\n",
-    "input_log_probs = torch.zeros([input_ids.shape[0], 1])\n",
-    "input_unfinished_sents = torch.ones([input_ids.shape[0], 1], dtype=torch.bool)\n",
-    "prev_step_scores = torch.zeros([input_ids.shape[0], 1])\n",
-    "\n",
-    "onnx_model_path = \"gpt2_one_step_search.onnx\"\n",
-    "session = onnxruntime.InferenceSession(onnx_model_path)\n",
-    "ort_inputs = {\n",
-    "              'input_ids': numpy.ascontiguousarray(input_ids.cpu().numpy()),\n",
-    "              'attention_mask' : numpy.ascontiguousarray(attention_mask.cpu().numpy()),\n",
-    "              'position_ids': numpy.ascontiguousarray(position_ids.cpu().numpy()),\n",
-    "              'beam_select_idx': numpy.ascontiguousarray(beam_select_idx.cpu().numpy()),\n",
-    "              'input_log_probs': numpy.ascontiguousarray(input_log_probs.cpu().numpy()),\n",
-    "              'input_unfinished_sents': numpy.ascontiguousarray(input_unfinished_sents.cpu().numpy()),\n",
-    "              'prev_step_results': numpy.ascontiguousarray(input_ids.cpu().numpy()),\n",
-    "              'prev_step_scores': numpy.ascontiguousarray(prev_step_scores.cpu().numpy()),\n",
-    "             }\n",
-    "for i, past_i in enumerate(empty_past):\n",
-    "    ort_inputs[f'past_{i}'] = numpy.ascontiguousarray(past_i.cpu().numpy())\n",
-    "ort_outputs = session.run(None, ort_inputs)"
-   ]
-  },
-  {
-   "source": [
-    "## ONNX Runtime Inference with IO Binding ##\n",
-    "\n",
-    "To avoid data copy for input and output, ONNX Runtime also supports IO Binding. User could provide some buffer for input and outputs. For GPU inference, the buffer can be in GPU to reduce memory copy between CPU and GPU. This is helpful for high performance inference in GPU. For GPT-2, IO Binding might help the performance when batch size or (past) sequence length is large."
-   ],
-   "cell_type": "markdown",
-   "metadata": {}
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def inference_with_io_binding(session, config, input_ids, position_ids, attention_mask, past, beam_select_idx, input_log_probs, input_unfinished_sents, prev_step_results, prev_step_scores, step, context_len):\n",
-    "    output_shapes = Gpt2BeamSearchHelper.get_output_shapes(batch_size=1,\n",
-    "                                                           context_len=context_len,\n",
-    "                                                           past_sequence_length=past[0].size(3),\n",
-    "                                                           sequence_length=input_ids.size(1),\n",
-    "                                                           beam_size=4,\n",
-    "                                                           step=step,\n",
-    "                                                           config=config,\n",
-    "                                                           model_class=\"GPT2LMHeadModel_BeamSearchStep\")\n",
-    "    output_buffers = Gpt2BeamSearchHelper.get_output_buffers(output_shapes, device)\n",
-    "\n",
-    "    io_binding = Gpt2BeamSearchHelper.prepare_io_binding(session, input_ids, position_ids, attention_mask, past, output_buffers, output_shapes, beam_select_idx, input_log_probs, input_unfinished_sents, prev_step_results, prev_step_scores)\n",
-    "    session.run_with_iobinding(io_binding)\n",
-    "\n",
-    "    outputs = Gpt2BeamSearchHelper.get_outputs_from_io_binding_buffer(session, output_buffers, output_shapes, return_numpy=False)\n",
-    "    return outputs"
-   ]
-  },
-  {
-   "source": [
-    "We can see that the result is exactly same with/without IO Binding:"
-   ],
-   "cell_type": "markdown",
-   "metadata": {}
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "stream",
-     "name": "stdout",
-     "text": [
-      "IO Binding result is good\n"
-     ]
-    }
-   ],
-   "source": [
-    "input_ids, attention_mask, position_ids, empty_past = get_example_inputs()\n",
-    "beam_select_idx = torch.zeros([1, input_ids.shape[0]]).long()\n",
-    "input_log_probs = torch.zeros([input_ids.shape[0], 1])\n",
-    "input_unfinished_sents = torch.ones([input_ids.shape[0], 1], dtype=torch.bool)\n",
-    "prev_step_scores = torch.zeros([input_ids.shape[0], 1])\n",
-    "outputs = inference_with_io_binding(session, config, input_ids, position_ids, attention_mask, empty_past, beam_select_idx, input_log_probs, input_unfinished_sents, input_ids, prev_step_scores, 0, input_ids.shape[-1])\n",
-    "assert torch.eq(outputs[-2], torch.from_numpy(ort_outputs[-2])).all()\n",
-    "print(\"IO Binding result is good\")"
-   ]
-  },
-  {
-   "source": [
-    "## Batch Text Generation ##\n",
-    "\n",
-    "Here is an example for text generation using ONNX Runtime with/without IO Binding."
-   ],
-   "cell_type": "markdown",
-   "metadata": {}
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def update(output, step, batch_size, beam_size, context_length, prev_attention_mask, device):\n",
-    "    \"\"\"\n",
-    "    Update the inputs for next inference.\n",
-    "    \"\"\"\n",
-    "    last_state = (torch.from_numpy(output[0]).to(device)\n",
-    "                        if isinstance(output[0], numpy.ndarray) else output[0].clone().detach().cpu())\n",
-    "\n",
-    "    input_ids = last_state.view(batch_size * beam_size, -1).to(device)\n",
-    "\n",
-    "    input_unfinished_sents_id = -3\n",
-    "    prev_step_results = (torch.from_numpy(output[-2]).to(device) if isinstance(output[-2], numpy.ndarray)\n",
-    "                                else output[-2].clone().detach().to(device))\n",
-    "    position_ids = (torch.tensor([context_length + step - 1\n",
-    "                                        ]).unsqueeze(0).repeat(batch_size * beam_size, 1).to(device))\n",
-    "\n",
-    "    if prev_attention_mask.shape[0] != (batch_size * beam_size):\n",
-    "        prev_attention_mask = prev_attention_mask.repeat(batch_size * beam_size, 1)\n",
-    "    attention_mask = torch.cat(\n",
-    "        [\n",
-    "            prev_attention_mask,\n",
-    "            torch.ones([batch_size * beam_size, 1]).type_as(prev_attention_mask),\n",
-    "        ],\n",
-    "        1,\n",
-    "    ).to(device)\n",
-    "\n",
-    "    beam_select_idx = (torch.from_numpy(output[input_unfinished_sents_id - 2]).to(device) if isinstance(\n",
-    "        output[input_unfinished_sents_id - 2], numpy.ndarray) else output[input_unfinished_sents_id - 2].clone().detach().to(device))\n",
-    "    input_log_probs = (torch.from_numpy(output[input_unfinished_sents_id - 1]).to(device) if isinstance(\n",
-    "        output[input_unfinished_sents_id - 1], numpy.ndarray) else output[input_unfinished_sents_id - 1].clone().detach().to(device))\n",
-    "    input_unfinished_sents = (torch.from_numpy(output[input_unfinished_sents_id]).to(device) if isinstance(\n",
-    "        output[input_unfinished_sents_id], numpy.ndarray) else\n",
-    "                                    output[input_unfinished_sents_id].clone().detach().to(device))\n",
-    "    prev_step_scores = (torch.from_numpy(output[-1]).to(device)\n",
-    "                                if isinstance(output[-1], numpy.ndarray) else output[-1].clone().detach().to(device))\n",
-    "\n",
-    "    past = []\n",
-    "    if isinstance(output[1], tuple):  # past in torch output is tuple\n",
-    "        past = list(output[1])\n",
-    "    else:\n",
-    "        for i in range(model.config.n_layer):\n",
-    "            past_i = (torch.from_numpy(output[i + 1])\n",
-    "                        if isinstance(output[i + 1], numpy.ndarray) else output[i + 1].clone().detach())\n",
-    "            past.append(past_i.to(device)) \n",
-    "\n",
-    "    inputs = {\n",
-    "        'input_ids': input_ids,\n",
-    "        'attention_mask' : attention_mask,\n",
-    "        'position_ids': position_ids,\n",
-    "        'beam_select_idx': beam_select_idx,\n",
-    "        'input_log_probs': input_log_probs,\n",
-    "        'input_unfinished_sents': input_unfinished_sents,\n",
-    "        'prev_step_results': prev_step_results,\n",
-    "        'prev_step_scores': prev_step_scores,\n",
-    "    }\n",
-    "    ort_inputs = {\n",
-    "        'input_ids': numpy.ascontiguousarray(input_ids.cpu().numpy()),\n",
-    "        'attention_mask' : numpy.ascontiguousarray(attention_mask.cpu().numpy()),\n",
-    "        'position_ids': numpy.ascontiguousarray(position_ids.cpu().numpy()),\n",
-    "        'beam_select_idx': numpy.ascontiguousarray(beam_select_idx.cpu().numpy()),\n",
-    "        'input_log_probs': numpy.ascontiguousarray(input_log_probs.cpu().numpy()),\n",
-    "        'input_unfinished_sents': numpy.ascontiguousarray(input_unfinished_sents.cpu().numpy()),\n",
-    "        'prev_step_results': numpy.ascontiguousarray(prev_step_results.cpu().numpy()),\n",
-    "        'prev_step_scores': numpy.ascontiguousarray(prev_step_scores.cpu().numpy()),\n",
-    "    }\n",
-    "    for i, past_i in enumerate(past):\n",
-    "        ort_inputs[f'past_{i}'] = numpy.ascontiguousarray(past_i.cpu().numpy())\n",
-    "    \n",
-    "    return inputs, ort_inputs, past\n",
-    "\n",
-    "def test_generation(tokenizer, input_text, use_onnxruntime_io, ort_session = None, num_tokens_to_produce = 30):\n",
-    "    print(\"Text generation using\", \"OnnxRuntime with IO binding\" if use_onnxruntime_io else \"OnnxRuntime\", \"...\")    \n",
-    "    input_ids, attention_mask, position_ids, past = get_example_inputs(input_text)\n",
-    "    beam_select_idx = torch.zeros([1, input_ids.shape[0]]).long()\n",
-    "    input_log_probs = torch.zeros([input_ids.shape[0], 1])\n",
-    "    input_unfinished_sents = torch.ones([input_ids.shape[0], 1], dtype=torch.bool)\n",
-    "    prev_step_scores = torch.zeros([input_ids.shape[0], 1])\n",
-    "    inputs = {\n",
-    "        'input_ids': input_ids,\n",
-    "        'attention_mask' : attention_mask,\n",
-    "        'position_ids': position_ids,\n",
-    "        'beam_select_idx': beam_select_idx,\n",
-    "        'input_log_probs': input_log_probs,\n",
-    "        'input_unfinished_sents': input_unfinished_sents,\n",
-    "        'prev_step_results': input_ids,\n",
-    "        'prev_step_scores': prev_step_scores,\n",
-    "    }\n",
-    "    ort_inputs = {\n",
-    "        'input_ids': numpy.ascontiguousarray(input_ids.cpu().numpy()),\n",
-    "        'attention_mask' : numpy.ascontiguousarray(attention_mask.cpu().numpy()),\n",
-    "        'position_ids': numpy.ascontiguousarray(position_ids.cpu().numpy()),\n",
-    "        'beam_select_idx': numpy.ascontiguousarray(beam_select_idx.cpu().numpy()),\n",
-    "        'input_log_probs': numpy.ascontiguousarray(input_log_probs.cpu().numpy()),\n",
-    "        'input_unfinished_sents': numpy.ascontiguousarray(input_unfinished_sents.cpu().numpy()),\n",
-    "        'prev_step_results': numpy.ascontiguousarray(input_ids.cpu().numpy()),\n",
-    "        'prev_step_scores': numpy.ascontiguousarray(prev_step_scores.cpu().numpy()),\n",
-    "    }\n",
-    "    for i, past_i in enumerate(past):\n",
-    "        ort_inputs[f'past_{i}'] = numpy.ascontiguousarray(past_i.cpu().numpy())\n",
-    "    batch_size = input_ids.size(0)\n",
-    "    beam_size = 4\n",
-    "    context_length = input_ids.size(-1)\n",
-    "\n",
-    "    for step in range(num_tokens_to_produce):\n",
-    "        if use_onnxruntime_io:\n",
-    "            outputs = inference_with_io_binding(ort_session, config, inputs['input_ids'], inputs['position_ids'], inputs['attention_mask'], past, inputs['beam_select_idx'], inputs['input_log_probs'], inputs['input_unfinished_sents'], inputs['prev_step_results'], inputs['prev_step_scores'], step, context_length)\n",
-    "        else:\n",
-    "            outputs = ort_session.run(None, ort_inputs) \n",
-    "        inputs, ort_inputs, past = update(outputs, step, batch_size, beam_size, context_length, inputs['attention_mask'], device)\n",
-    "\n",
-    "        if not inputs['input_unfinished_sents'].any():\n",
-    "            break\n",
-    "\n",
-    "    print(\"------------\")\n",
-    "    print(tokenizer.decode(inputs['prev_step_results'][0], skip_special_tokens=True))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "stream",
-     "name": "stdout",
-     "text": [
-      "Text generation using OnnxRuntime ...\n",
-      "------------\n",
-      "best hotel in bay area.\n",
-      "\n",
-      "\"It's a great place to stay,\" he said. \"It's a great place to live. It's a great place to work\n"
-     ]
-    }
-   ],
-   "source": [
-    "tokenizer = get_tokenizer(model_name_or_path, cache_dir)\n",
-    "input_text = EXAMPLE_Text\n",
-    "test_generation(tokenizer, input_text, use_onnxruntime_io=False, ort_session=session)"
-   ]
-  },
-  {
-   "source": [
-    "Next, we use ONNX Runtime with IO binding to run again and we can see that the result is exactly same."
-   ],
-   "cell_type": "markdown",
-   "metadata": {}
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "stream",
-     "name": "stdout",
-     "text": [
-      "Text generation using OnnxRuntime with IO binding ...\n",
-      "------------\n",
-      "best hotel in bay area.\n",
-      "\n",
-      "\"It's a great place to stay,\" he said. \"It's a great place to live. It's a great place to work\n"
-     ]
-    }
-   ],
-   "source": [
-    "test_generation(tokenizer, input_text, use_onnxruntime_io=True, ort_session=session)"
-   ]
-  }
- ]
-}
diff --git a/onnxruntime/test/python/transformers/test_gpt2_benchmark.py b/onnxruntime/test/python/transformers/test_gpt2_benchmark.py
index 48df008060..c507423a79 100644
--- a/onnxruntime/test/python/transformers/test_gpt2_benchmark.py
+++ b/onnxruntime/test/python/transformers/test_gpt2_benchmark.py
@@ -49,43 +49,6 @@ class TestGpt2(unittest.TestCase):
     def test_gpt2_int8(self):
         self.run_benchmark_gpt2("-m gpt2 --precision int8 -o  -b 1 --sequence_lengths 2 -s 3")
 
-    @pytest.mark.slow
-    def test_gpt2_beam_search_step_fp32(self):
-        self.run_benchmark_gpt2(
-            "-m gpt2 --model_class=GPT2LMHeadModel_BeamSearchStep --precision fp32 -v -b 1 --sequence_lengths 5 -s 3"
-        )
-
-    # @pytest.mark.slow
-    # def test_gpt2_beam_search_step_fp16(self):
-    #     if self.test_cuda:
-    #         self.run_benchmark_gpt2(
-    #             '-m gpt2 --model_class=GPT2LMHeadModel_BeamSearchStep --precision fp16 -o -b 1 --sequence_lengths 5 -s 3 --use_gpu')
-
-    @pytest.mark.slow
-    def test_gpt2_beam_search_step_int8(self):
-        self.run_benchmark_gpt2(
-            "-m gpt2 --model_class=GPT2LMHeadModel_BeamSearchStep --precision int8 -o -b 1 --sequence_lengths 5 -s 3"
-        )
-
-    @pytest.mark.slow
-    def test_gpt2_configurable_one_step_search_fp32(self):
-        self.run_benchmark_gpt2(
-            "-m gpt2 --model_class=GPT2LMHeadModel_ConfigurableOneStepSearch --precision fp32 -v -b 1 --sequence_lengths 5 --past_sequence_lengths 3 --use_gpu"
-        )
-
-    # @pytest.mark.slow
-    # def test_gpt2_configurable_one_step_search_fp16(self):
-    #     if self.test_cuda:
-    #         self.run_benchmark_gpt2(
-    #             "-m gpt2 --model_class=GPT2LMHeadModel_ConfigurableOneStepSearch --precision fp16 -o -b 1 --sequence_lengths 5 -s 3 --use_gpu"
-    #         )
-
-    @pytest.mark.slow
-    def test_gpt2_configurable_one_step_search_int8(self):
-        self.run_benchmark_gpt2(
-            "-m gpt2 --model_class=GPT2LMHeadModel_ConfigurableOneStepSearch --precision int8 -o -b 1 --sequence_lengths 5 -s 3"
-        )
-
 
 if __name__ == "__main__":
     coloredlogs.install(fmt="%(message)s")
diff --git a/tools/ci_build/requirements.txt b/tools/ci_build/requirements.txt
index 5d43ba652a..a45d6c314e 100644
--- a/tools/ci_build/requirements.txt
+++ b/tools/ci_build/requirements.txt
@@ -1,7 +1,6 @@
 # packages used by transformers tool test
-protobuf==3.18.3
-numpy==1.21.6
+protobuf==3.20.1
+numpy==1.23.5
 coloredlogs==15.0
-transformers==4.6.1
-onnxconverter-common==1.8.1
+transformers==4.24.0
 psutil