From 975d3dffcff925d4a8b482dea3ef7fe147822221 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Mon, 28 Oct 2024 13:24:17 -0700 Subject: [PATCH] Update bert benchmark: replace deprecated API (#22611) ### Description (1) tokenizer.max_model_input_sizes was deprecated. Use tokenizer.model_max_length to replace it. (2) onnx opset updated to 16 instead of 11/12 for models. (3) Update a few comments related to torch installation. (4) Test gpu instead of cpu in dev_benchmark.cmd. ### Motivation and Context Update bert benchmark script so that it can run with latest huggingface transformers package. --- .../python/tools/transformers/benchmark.py | 4 +- .../tools/transformers/dev_benchmark.cmd | 18 +- .../tools/transformers/huggingface_models.py | 177 +++++------------- .../tools/transformers/onnx_exporter.py | 4 +- .../tools/transformers/run_benchmark.sh | 8 +- 5 files changed, 55 insertions(+), 156 deletions(-) diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py index 26f8987c76..55ce8d752a 100644 --- a/onnxruntime/python/tools/transformers/benchmark.py +++ b/onnxruntime/python/tools/transformers/benchmark.py @@ -348,7 +348,7 @@ def run_pytorch( else: tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir) - max_input_size = tokenizer.max_model_input_sizes.get(model_name, 1024) + max_input_size = tokenizer.model_max_length logger.debug(f"Model {model}") logger.debug(f"Number of parameters {model.num_parameters()}") @@ -500,7 +500,7 @@ def run_tensorflow( tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir) - max_input_size = tokenizer.max_model_input_sizes.get(model_name, 1024) + max_input_size = tokenizer.model_max_length for batch_size in batch_sizes: if batch_size <= 0: diff --git a/onnxruntime/python/tools/transformers/dev_benchmark.cmd b/onnxruntime/python/tools/transformers/dev_benchmark.cmd index 82137de3c0..4bef58621e 100644 --- 
a/onnxruntime/python/tools/transformers/dev_benchmark.cmd +++ b/onnxruntime/python/tools/transformers/dev_benchmark.cmd @@ -3,9 +3,7 @@ REM Run benchmark in Windows for developing purpose. For official benchmark, please use run_benchmark.sh. REM Settings are different from run_benchmark.sh: no cli, batch and sequence, input counts, average over 100, no fp16, less models etc. -REM Please install PyTorch (see https://pytorch.org/) before running this benchmark. Like the following: -REM GPU: conda install pytorch torchvision cudatoolkit=10.1 -c pytorch -REM CPU: conda install pytorch torchvision cpuonly -c pytorch +REM Please install PyTorch (see https://pytorch.org/) before running this benchmark. REM When use_package=true, you need not copy other files to run benchmarks except this sh file. REM Otherwise, it will use python script (*.py) files in this directory. @@ -21,12 +19,12 @@ set run_torchscript=false REM Devices to test. REM Attention: You cannot run both CPU and GPU at the same time: gpu need onnxruntime-gpu, and CPU need onnxruntime. -set run_gpu_fp32=false -set run_gpu_fp16=false -set run_cpu_fp32=true -set run_cpu_int8=true +set run_gpu_fp32=true +set run_gpu_fp16=true +set run_cpu_fp32=false +set run_cpu_int8=false -set average_over=100 +set average_over=1000 REM Enable optimizer (use script instead of OnnxRuntime for graph optimization) set use_optimizer=true @@ -36,7 +34,7 @@ set sequence_length=8 128 REM Number of inputs (input_ids, token_type_ids, attention_mask) for ONNX model. 
REM Note that different input count might lead to different performance -set input_counts=1 +set input_counts=3 REM Pretrained transformers models can be a subset of: bert-base-cased roberta-base gpt2 distilgpt2 distilbert-base-uncased set models_to_test=bert-base-cased @@ -57,7 +55,6 @@ if %run_cpu_int8% == true if %run_gpu_fp32% == true echo cannot test cpu and gpu if %run_cpu_int8% == true if %run_gpu_fp16% == true echo cannot test cpu and gpu at same time & goto :EOF if %run_install% == true ( - pip uninstall --yes ort_nightly pip uninstall --yes onnxruntime pip uninstall --yes onnxruntime-gpu if %run_cpu_fp32% == true ( @@ -70,7 +67,6 @@ if %run_install% == true ( ) ) - pip install --upgrade onnxconverter_common pip install --upgrade transformers ) diff --git a/onnxruntime/python/tools/transformers/huggingface_models.py b/onnxruntime/python/tools/transformers/huggingface_models.py index dcfe4a28ad..4cd878a465 100644 --- a/onnxruntime/python/tools/transformers/huggingface_models.py +++ b/onnxruntime/python/tools/transformers/huggingface_models.py @@ -13,155 +13,62 @@ MODEL_CLASSES = [ "AutoModelForCausalLM", ] -# List of pretrained models: https://huggingface.co/transformers/pretrained_models.html # Pretrained model name to a tuple of input names, opset_version, use_external_data_format, optimization model type +# Some models like GPT, T5, Bart etc. have their own convert_to_onnx.py in the models sub-directory, and they are excluded here. 
MODELS = { # BERT - "bert-base-uncased": ( - ["input_ids", "attention_mask", "token_type_ids"], - 12, - False, - "bert", - ), - "bert-large-uncased": ( - ["input_ids", "attention_mask", "token_type_ids"], - 12, - False, - "bert", - ), - "bert-base-cased": ( - ["input_ids", "attention_mask", "token_type_ids"], - 12, - False, - "bert", - ), - # "bert-large-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), - # "bert-base-multilingual-uncased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), - # "bert-base-multilingual-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), - # "bert-base-chinese": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), - # "bert-base-german-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), - # "bert-large-uncased-whole-word-masking": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), - # "bert-large-cased-whole-word-masking": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), - # "bert-large-uncased-whole-word-masking-finetuned-squad": (["input_ids", "attention_mask", - # "token_type_ids"], 12, False, "bert"), - # "bert-large-cased-whole-word-masking-finetuned-squad": (["input_ids", "attention_mask", - # "token_type_ids"], 12, False, "bert"), - # "bert-base-cased-finetuned-mrpc": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), - # "bert-base-german-dbmdz-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), - # "bert-base-german-dbmdz-uncased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), - # todo: more models to add - # GPT (no past state) - "openai-gpt": (["input_ids"], 11, False, "gpt2"), - # GPT-2 (no past state, use benchmark_gpt2.py for past_key_values) - "gpt2": (["input_ids"], 11, False, "gpt2"), - "gpt2-medium": (["input_ids"], 11, False, "gpt2"), - "gpt2-large": (["input_ids"], 11, 
True, "gpt2"), - "gpt2-xl": (["input_ids"], 11, True, "gpt2"), - "distilgpt2": (["input_ids"], 11, False, "gpt2"), - # Transformer-XL (Models uses Einsum, which need opset version 12 or later.) - "transfo-xl-wt103": (["input_ids", "mems"], 12, False, "bert"), + "bert-base-cased": (["input_ids", "attention_mask", "token_type_ids"], 16, False, "bert"), + "bert-large-cased": (["input_ids", "attention_mask", "token_type_ids"], 16, False, "bert"), + # Transformer-XL (Models use Einsum, which needs opset version 12 or later.) + "transfo-xl-wt103": (["input_ids", "mems"], 16, False, "bert"), # XLNet - "xlnet-base-cased": (["input_ids"], 12, False, "bert"), - "xlnet-large-cased": (["input_ids"], 12, False, "bert"), + "xlnet-base-cased": (["input_ids"], 16, False, "bert"), + "xlnet-large-cased": (["input_ids"], 16, False, "bert"), # XLM - "xlm-mlm-en-2048": (["input_ids"], 11, True, "bert"), - "xlm-mlm-ende-1024": (["input_ids"], 11, False, "bert"), - "xlm-mlm-enfr-1024": (["input_ids"], 11, False, "bert"), + "xlm-mlm-en-2048": (["input_ids"], 16, True, "bert"), + "xlm-mlm-ende-1024": (["input_ids"], 16, False, "bert"), + "xlm-mlm-enfr-1024": (["input_ids"], 16, False, "bert"), # RoBERTa - "roberta-base": (["input_ids", "attention_mask"], 12, False, "bert"), - "roberta-large": (["input_ids", "attention_mask"], 12, False, "bert"), - "roberta-large-mnli": (["input_ids", "attention_mask"], 12, False, "bert"), - "deepset/roberta-base-squad2": (["input_ids", "attention_mask"], 11, False, "bert"), - "distilroberta-base": (["input_ids", "attention_mask"], 12, False, "bert"), + "roberta-base": (["input_ids", "attention_mask"], 16, False, "bert"), + "roberta-large": (["input_ids", "attention_mask"], 16, False, "bert"), + "roberta-large-mnli": (["input_ids", "attention_mask"], 16, False, "bert"), + "deepset/roberta-base-squad2": (["input_ids", "attention_mask"], 16, False, "bert"), + "distilroberta-base": (["input_ids", "attention_mask"], 16, False, "bert"), # DistilBERT - 
"distilbert-base-uncased": (["input_ids", "attention_mask"], 11, False, "bert"), - "distilbert-base-uncased-distilled-squad": ( - ["input_ids", "attention_mask"], - 11, - False, - "bert", - ), + "distilbert-base-uncased": (["input_ids", "attention_mask"], 16, False, "bert"), + "distilbert-base-uncased-distilled-squad": (["input_ids", "attention_mask"], 16, False, "bert"), # CTRL - "ctrl": (["input_ids"], 11, True, "bert"), + "ctrl": (["input_ids"], 16, True, "bert"), # CamemBERT - "camembert-base": (["input_ids"], 11, False, "bert"), + "camembert-base": (["input_ids"], 16, False, "bert"), # ALBERT - "albert-base-v1": (["input_ids"], 12, False, "bert"), - "albert-large-v1": (["input_ids"], 12, False, "bert"), - "albert-xlarge-v1": (["input_ids"], 12, True, "bert"), - # "albert-xxlarge-v1": (["input_ids"], 12, True, "bert"), - "albert-base-v2": (["input_ids"], 12, False, "bert"), - "albert-large-v2": (["input_ids"], 12, False, "bert"), - "albert-xlarge-v2": (["input_ids"], 12, True, "bert"), - # "albert-xxlarge-v2": (["input_ids"], 12, True, "bert"), - # T5 (use benchmark_t5.py instead) - # "t5-small": (["input_ids", "decoder_input_ids"], 12, False, "bert"), - # "t5-base": (["input_ids", "decoder_input_ids"], 12, False, "bert"), - # "t5-large": (["input_ids", "decoder_input_ids"], 12, True, "bert"), - # "t5-3b": (["input_ids", "decoder_input_ids"], 12, True, "bert"), - # "t5-11b": (["input_ids", "decoder_input_ids"], 12, True, "bert"), - # "valhalla/t5-small-qa-qg-hl": (["input_ids"], 12, True, "bert"), + "albert-base-v1": (["input_ids"], 16, False, "bert"), + "albert-large-v1": (["input_ids"], 16, False, "bert"), + "albert-xlarge-v1": (["input_ids"], 16, True, "bert"), + # "albert-xxlarge-v1": (["input_ids"], 16, True, "bert"), + "albert-base-v2": (["input_ids"], 16, False, "bert"), + "albert-large-v2": (["input_ids"], 16, False, "bert"), + "albert-xlarge-v2": (["input_ids"], 16, True, "bert"), + # "albert-xxlarge-v2": (["input_ids"], 16, True, "bert"), # 
XLM-RoBERTa - "xlm-roberta-base": (["input_ids"], 11, False, "bert"), - "xlm-roberta-large": (["input_ids"], 11, True, "bert"), + "xlm-roberta-base": (["input_ids"], 16, False, "bert"), + "xlm-roberta-large": (["input_ids"], 16, True, "bert"), # FlauBERT - "flaubert/flaubert_small_cased": (["input_ids"], 11, False, "bert"), - # "flaubert/flaubert_base_uncased": (["input_ids"], 11, False, "bert"), - "flaubert/flaubert_base_cased": (["input_ids"], 11, False, "bert"), - # "flaubert/flaubert_large_cased": (["input_ids"], 11, False, "bert"), - # Bart - "facebook/bart-large": (["input_ids", "attention_mask"], 11, False, "bart"), - "facebook/bart-base": (["input_ids", "attention_mask"], 11, False, "bart"), - "facebook/bart-large-mnli": (["input_ids", "attention_mask"], 11, False, "bart"), - "facebook/bart-large-cnn": (["input_ids", "attention_mask"], 11, False, "bart"), - # DialoGPT - "microsoft/DialoGPT-small": (["input_ids"], 11, False, "gpt2"), - "microsoft/DialoGPT-medium": (["input_ids"], 11, False, "gpt2"), - # "microsoft/DialoGPT-large": (["input_ids"], 11, True, "gpt2"), - # Reformer - # "google/reformer-enwik8": (["input_ids"], 11, False, "bert"), - # "google/reformer-crime-and-punishment": (["input_ids"], 11, False, "bert"), - # MarianMT - # "Helsinki-NLP/opus-mt-ROMANCE-en": (["input_ids"], 12, False, "bert"), - # Longformer (use benchmark_longformer.py instead) - # "allenai/longformer-base-4096": (["input_ids"], 12, False, "bert"), - # "allenai/longformer-large-4096": (["input_ids"], 12, False, "bert"), - # MBart - "facebook/mbart-large-cc25": (["input_ids"], 11, True, "bert"), - "facebook/mbart-large-en-ro": (["input_ids"], 11, True, "bert"), - # "Helsinki-NLP/opus-mt-ROMANCE-en": (["input_ids"], 12, False, "bert"), - # # Longformer - # "allenai/longformer-base-4096": (["input_ids"], 12, False, "bert"), - # "allenai/longformer-large-4096": (["input_ids"], 12, True, "bert"), - # "funnel-transformer/small": (["input_ids"], 12, False, "bert"), - # 
"funnel-transformer/small-base": (["input_ids"], 12, False, "bert"), - # "funnel-transformer/medium": (["input_ids"], 12, False, "bert"), - # "funnel-transformer/medium-base": (["input_ids"], 12, False, "bert"), - # "funnel-transformer/intermediate": (["input_ids"], 12, False, "bert"), - # "funnel-transformer/intermediate-base": (["input_ids"], 12, False, "bert"), - # "funnel-transformer/large": (["input_ids"], 12, True, "bert"), - # "funnel-transformer/large-base": (["input_ids"], 12, True, "bert"), - # "funnel-transformer/xlarge": (["input_ids"], 12, True, "bert"), - # "funnel-transformer/xlarge-base": (["input_ids"], 12, True, "bert"), + "flaubert/flaubert_small_cased": (["input_ids"], 16, False, "bert"), + "flaubert/flaubert_base_cased": (["input_ids"], 16, False, "bert"), + # "flaubert/flaubert_large_cased": (["input_ids"], 16, False, "bert"), # Layoutlm - "microsoft/layoutlm-base-uncased": (["input_ids"], 11, False, "bert"), - "microsoft/layoutlm-large-uncased": (["input_ids"], 11, False, "bert"), + "microsoft/layoutlm-base-uncased": (["input_ids"], 16, False, "bert"), + "microsoft/layoutlm-large-uncased": (["input_ids"], 16, False, "bert"), # Squeezebert - "squeezebert/squeezebert-uncased": (["input_ids"], 11, False, "bert"), - "squeezebert/squeezebert-mnli": (["input_ids"], 11, False, "bert"), - "squeezebert/squeezebert-mnli-headless": (["input_ids"], 11, False, "bert"), - "unc-nlp/lxmert-base-uncased": ( - ["input_ids", "visual_feats", "visual_pos"], - 11, - False, - "bert", - ), - # "google/pegasus-xsum": (["input_ids"], 11, False, "bert"), - # "google/pegasus-large": (["input_ids"], 11, False, "bert"), + "squeezebert/squeezebert-uncased": (["input_ids"], 16, False, "bert"), + "squeezebert/squeezebert-mnli": (["input_ids"], 16, False, "bert"), + "squeezebert/squeezebert-mnli-headless": (["input_ids"], 16, False, "bert"), + "unc-nlp/lxmert-base-uncased": (["input_ids", "visual_feats", "visual_pos"], 16, False, "bert"), # ViT - 
"google/vit-base-patch16-224": (["pixel_values"], 12, False, "vit"), + "google/vit-base-patch16-224": (["pixel_values"], 16, False, "vit"), # Swin - "microsoft/swin-base-patch4-window7-224": (["pixel_values"], 12, False, "swin"), - "microsoft/swin-small-patch4-window7-224": (["pixel_values"], 12, False, "swin"), - "microsoft/swin-tiny-patch4-window7-224": (["pixel_values"], 12, False, "swin"), + "microsoft/swin-base-patch4-window7-224": (["pixel_values"], 16, False, "swin"), + "microsoft/swin-small-patch4-window7-224": (["pixel_values"], 16, False, "swin"), + "microsoft/swin-tiny-patch4-window7-224": (["pixel_values"], 16, False, "swin"), } diff --git a/onnxruntime/python/tools/transformers/onnx_exporter.py b/onnxruntime/python/tools/transformers/onnx_exporter.py index 3967a7875f..212a7c4871 100644 --- a/onnxruntime/python/tools/transformers/onnx_exporter.py +++ b/onnxruntime/python/tools/transformers/onnx_exporter.py @@ -492,7 +492,7 @@ def export_onnx_model_from_pt( example_inputs = image_processor(data, return_tensors="pt") else: tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir) - max_input_size = tokenizer.max_model_input_sizes.get(model_name, 1024) + max_input_size = tokenizer.model_max_length example_inputs = tokenizer.encode_plus("This is a sample input", return_tensors="pt") example_inputs = filter_inputs(example_inputs, input_names) @@ -596,7 +596,7 @@ def export_onnx_model_from_tf( # Fix "Using pad_token, but it is not set yet" error. 
if tokenizer.pad_token is None: tokenizer.add_special_tokens({"pad_token": "[PAD]"}) - max_input_size = tokenizer.max_model_input_sizes.get(model_name, 1024) + max_input_size = tokenizer.model_max_length config, model = load_tf_model(model_name, model_class, cache_dir, config_modifier) model.resize_token_embeddings(len(tokenizer)) diff --git a/onnxruntime/python/tools/transformers/run_benchmark.sh b/onnxruntime/python/tools/transformers/run_benchmark.sh index ddc8b781a5..25997f40d3 100755 --- a/onnxruntime/python/tools/transformers/run_benchmark.sh +++ b/onnxruntime/python/tools/transformers/run_benchmark.sh @@ -5,10 +5,7 @@ # license information. # -------------------------------------------------------------------------- # This measures the performance of OnnxRuntime, PyTorch and TorchScript on transformer models. -# Please install PyTorch (see https://pytorch.org/) before running this benchmark. Like the following: -# GPU: conda install pytorch torchvision cudatoolkit=11.0 -c pytorch -# CPU: conda install pytorch torchvision cpuonly -c pytorch -# To use torch2, please install the nightly PyTorch by replacing pytorch with pytorch-nightly. +# Please install PyTorch (see https://pytorch.org/) before running this benchmark. # When use_package=true, you need not copy other files to run benchmarks except this sh file. # Otherwise, it will use python script (*.py) files in this directory. @@ -60,7 +57,6 @@ sequence_lengths="8 16 32 64 128 256 512 1024" # Here we only test one input (input_ids) for fair comparison with PyTorch. input_counts=1 -# Pretrained transformers models can be a subset of: bert-base-cased roberta-base gpt2 distilgpt2 distilbert-base-uncased models_to_test="bert-base-cased roberta-base distilbert-base-uncased" # If you have multiple GPUs, you can choose one GPU for test. 
Here is an example to use the second GPU: @@ -99,7 +95,7 @@ if [ "$run_install" = true ] ; then else pip install onnxruntime-gpu fi - pip install --upgrade onnx coloredlogs packaging psutil py3nvml onnxconverter_common numpy transformers sympy + pip install --upgrade onnx coloredlogs packaging psutil py3nvml numpy transformers sympy fi if [ "$use_package" = true ] ; then