From c5d4ae04018cdbd2c8be041cc67aae68cff76084 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Thu, 10 Sep 2020 15:42:15 -0700 Subject: [PATCH] Add transformers tools to python package (#5090) * Add transformers to onnxruntime python package --- cmake/onnxruntime_python.cmake | 9 ++++ .../python/tools/transformers/__init__.py | 4 ++ .../python/tools/transformers/benchmark.py | 22 +++++---- .../python/tools/transformers/gpt2_helper.py | 2 +- .../tools/transformers/huggingface_models.py | 48 ++++++++++--------- .../tools/transformers/onnx_exporter.py | 11 +++-- setup.py | 1 + 7 files changed, 61 insertions(+), 36 deletions(-) create mode 100644 onnxruntime/python/tools/transformers/__init__.py diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake index d4d1769fd7..76f6273e98 100644 --- a/cmake/onnxruntime_python.cmake +++ b/cmake/onnxruntime_python.cmake @@ -208,6 +208,11 @@ file(GLOB onnxruntime_python_quantization_src CONFIGURE_DEPENDS file(GLOB onnxruntime_python_quantization_operators_src CONFIGURE_DEPENDS "${ONNXRUNTIME_ROOT}/python/tools/quantization/operators/*.py" ) +file(GLOB onnxruntime_python_transformers_src CONFIGURE_DEPENDS + "${ONNXRUNTIME_ROOT}/python/tools/transformers/*.py" +) +list(REMOVE_ITEM onnxruntime_python_transformers_src + "${ONNXRUNTIME_ROOT}/python/tools/transformers/test_optimizer.py") file(GLOB onnxruntime_python_datasets_srcs CONFIGURE_DEPENDS "${ONNXRUNTIME_ROOT}/python/datasets/*.py" ) @@ -226,6 +231,7 @@ add_custom_command( COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/datasets COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/tools COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/tools/featurizer_ops + COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/transformers COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/quantization COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/quantization/operators COMMAND ${CMAKE_COMMAND} -E copy @@ -273,6 +279,9 @@ add_custom_command( COMMAND ${CMAKE_COMMAND} -E copy ${onnxruntime_python_quantization_operators_src} $/onnxruntime/quantization/operators/ + COMMAND ${CMAKE_COMMAND} -E copy + ${onnxruntime_python_transformers_src} + $/onnxruntime/transformers/ COMMAND ${CMAKE_COMMAND} -E copy ${REPO_ROOT}/VERSION_NUMBER $ diff --git a/onnxruntime/python/tools/transformers/__init__.py b/onnxruntime/python/tools/transformers/__init__.py new file mode 100644 index 0000000000..ad5632855c --- /dev/null +++ b/onnxruntime/python/tools/transformers/__init__.py @@ -0,0 +1,4 @@ +import os +import sys + +sys.path.append(os.path.dirname(__file__)) diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py index a1516ccda2..c1e6fde213 100644 --- a/onnxruntime/python/tools/transformers/benchmark.py +++ b/onnxruntime/python/tools/transformers/benchmark.py @@ -66,9 +66,10 @@ if "OMP_NUM_THREADS" not in os.environ: import torch from transformers import (AutoConfig, AutoTokenizer, AutoModel, GPT2Model) -def run_onnxruntime(use_gpu, model_names, model_class, precision, batch_sizes, sequence_lengths, repeat_times, input_counts, - optimize_onnx, validate_onnx, cache_dir, onnx_dir, verbose, overwrite, disable_ort_io_binding, - use_raw_attention_mask, thread_num, model_fusion_statistics): + +def run_onnxruntime(use_gpu, model_names, model_class, precision, batch_sizes, sequence_lengths, repeat_times, + input_counts, optimize_onnx, validate_onnx, cache_dir, onnx_dir, verbose, overwrite, + disable_ort_io_binding, use_raw_attention_mask, thread_num, model_fusion_statistics): import onnxruntime results = [] @@ -154,8 +155,8 @@ def run_onnxruntime(use_gpu, model_names, model_class, precision, batch_sizes, s return results -def run_pytorch(use_gpu, model_names, model_class, precision, batch_sizes, sequence_lengths, repeat_times, torchscript, cache_dir, - verbose): +def run_pytorch(use_gpu, model_names, model_class, precision, batch_sizes, sequence_lengths, repeat_times, torchscript, + cache_dir, verbose): results = [] if use_gpu and not torch.cuda.is_available(): logger.error("Please install PyTorch with Cuda, and use a machine with GPU for testing gpu performance.") @@ -168,7 +169,8 @@ def run_pytorch(use_gpu, model_names, model_class, precision, batch_sizes, seque model = load_pretrained_model(model_name, config=config, cache_dir=cache_dir, custom_model_class=model_class) tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir) - max_input_size = tokenizer.max_model_input_sizes[model_name] if model_name in tokenizer.max_model_input_sizes else 1024 + max_input_size = tokenizer.max_model_input_sizes[ + model_name] if model_name in tokenizer.max_model_input_sizes else 1024 logger.debug(f"Model {model}") logger.debug(f"Number of parameters {model.num_parameters()}") @@ -365,12 +367,12 @@ def main(): logger.warning("--input_counts is not implemented for torch or torchscript engine.") if enable_torchscript: - results += run_pytorch(args.use_gpu, args.models, args.model_class, args.precision, args.batch_sizes, args.sequence_lengths, - args.test_times, True, args.cache_dir, args.verbose) + results += run_pytorch(args.use_gpu, args.models, args.model_class, args.precision, args.batch_sizes, + args.sequence_lengths, args.test_times, True, args.cache_dir, args.verbose) if enable_torch: - results += run_pytorch(args.use_gpu, args.models, args.model_class, args.precision, args.batch_sizes, args.sequence_lengths, - args.test_times, False, args.cache_dir, args.verbose) + results += run_pytorch(args.use_gpu, args.models, args.model_class, args.precision, args.batch_sizes, + args.sequence_lengths, args.test_times, False, args.cache_dir, args.verbose) model_fusion_statistics = {} if enable_onnxruntime: diff --git a/onnxruntime/python/tools/transformers/gpt2_helper.py b/onnxruntime/python/tools/transformers/gpt2_helper.py index 07bb9f274a..da9fb15357 100644 --- a/onnxruntime/python/tools/transformers/gpt2_helper.py +++ b/onnxruntime/python/tools/transformers/gpt2_helper.py @@ -440,7 +440,7 @@ class Gpt2Helper: def onnxruntime_inference_with_binded_io(ort_session, inputs: Gpt2Inputs, output_buffers: Dict[str, torch.Tensor], - output_shapes : Dict[str, List[int]], + output_shapes: Dict[str, List[int]], total_runs: int = 0, return_numpy: bool = True, include_copy_output_latency: bool = False): diff --git a/onnxruntime/python/tools/transformers/huggingface_models.py b/onnxruntime/python/tools/transformers/huggingface_models.py index 126bbd4809..5de382aa08 100644 --- a/onnxruntime/python/tools/transformers/huggingface_models.py +++ b/onnxruntime/python/tools/transformers/huggingface_models.py @@ -20,7 +20,7 @@ MODEL_CLASSES = { # List of pretrained models: https://huggingface.co/transformers/pretrained_models.html # Pretrained model name to a tuple of input names, opset_version, use_external_data_format, optimization model type MODELS = { -# BERT + # BERT "bert-base-uncased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"), "bert-large-uncased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"), "bert-base-cased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"), @@ -31,26 +31,30 @@ MODELS = { "bert-base-german-cased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"), "bert-large-uncased-whole-word-masking": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"), "bert-large-cased-whole-word-masking": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"), - "bert-large-uncased-whole-word-masking-finetuned-squad": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"), - "bert-large-cased-whole-word-masking-finetuned-squad": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"), + "bert-large-uncased-whole-word-masking-finetuned-squad": (["input_ids", "attention_mask", + "token_type_ids"], 11, False, "bert"), + "bert-large-cased-whole-word-masking-finetuned-squad": (["input_ids", "attention_mask", + "token_type_ids"], 11, False, "bert"), "bert-base-cased-finetuned-mrpc": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"), "bert-base-german-dbmdz-cased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"), "bert-base-german-dbmdz-uncased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"), # todo: more models to add -# GPT + # GPT "openai-gpt": (["input_ids"], 11, False, "gpt2"), # no past state inputs -# GPT-2 + # GPT-2 "gpt2": (["input_ids"], 11, False, "gpt2"), # no past state inputs & outputs "gpt2-medium": (["input_ids"], 11, False, "gpt2"), - "gpt2-large": (["input_ids"], 11, True, "gpt2"), # Model>2GB. Need use_external_data_format=True to export it. No past state inputs for GPT models. + "gpt2-large": + (["input_ids"], 11, True, + "gpt2"), # Model>2GB. Need use_external_data_format=True to export it. No past state inputs for GPT models. "gpt2-xl": (["input_ids"], 11, True, "gpt2"), "distilgpt2": (["input_ids"], 11, False, "gpt2"), # no past state inputs & outputs -# Transformer-XL + # Transformer-XL #"transfo-xl-wt103": (["input_ids"], 11, False, "bert"), -# XLNet + # XLNet #"xlnet-base-cased": (["input_ids"], 12, False, "bert"), # Models uses Einsum, which need opset version 12 and PyTorch 1.5.0 or above. #"xlnet-large-cased": (["input_ids"], 12, False, "bert"), # Models uses Einsum, which need opset version 12 and PyTorch 1.5.0 or above. -# XLM + # XLM "xlm-mlm-en-2048": (["input_ids"], 11, True, "bert"), "xlm-mlm-ende-1024": (["input_ids"], 11, False, "bert"), "xlm-mlm-enfr-1024": (["input_ids"], 11, False, "bert"), @@ -61,25 +65,25 @@ MODELS = { "xlm-clm-ende-1024": (["input_ids"], 11, False, "bert"), "xlm-mlm-17-1280": (["input_ids"], 11, True, "bert"), "xlm-mlm-100-1280": (["input_ids"], 11, True, "bert"), -# RoBERTa + # RoBERTa "roberta-base": (["input_ids", "attention_mask"], 11, False, "bert"), "roberta-large": (["input_ids", "attention_mask"], 11, False, "bert"), "roberta-large-mnli": (["input_ids", "attention_mask"], 11, False, "bert"), "distilroberta-base": (["input_ids", "attention_mask"], 11, False, "bert"), "roberta-base-openai-detector": (["input_ids", "attention_mask"], 11, False, "bert"), "roberta-large-openai-detector": (["input_ids", "attention_mask"], 11, False, "bert"), -# DistilBERT + # DistilBERT "distilbert-base-uncased": (["input_ids", "attention_mask"], 11, False, "bert"), "distilbert-base-uncased-distilled-squad": (["input_ids", "attention_mask"], 11, False, "bert"), "distilbert-base-cased": (["input_ids", "attention_mask"], 11, False, "bert"), "distilbert-base-cased-distilled-squad": (["input_ids", "attention_mask"], 11, False, "bert"), "distilbert-base-german-cased": (["input_ids", "attention_mask"], 11, False, "bert"), "distilbert-base-multilingual-cased": (["input_ids", "attention_mask"], 11, False, "bert"), -# CTRL + # CTRL "ctrl": (["input_ids"], 11, True, "bert"), -# CamemBERT + # CamemBERT "camembert-base": (["input_ids"], 11, False, "bert"), -# ALBERT + # ALBERT # Models uses Einsum, which need opset version 12 and PyTorch 1.5.0 or above. "albert-base-v1": (["input_ids"], 12, False, "bert"), "albert-large-v1": (["input_ids"], 12, False, "bert"), @@ -89,36 +93,36 @@ MODELS = { "albert-large-v2": (["input_ids"], 12, False, "bert"), "albert-xlarge-v2": (["input_ids"], 12, True, "bert"), "albert-xxlarge-v2": (["input_ids"], 12, True, "bert"), -# T5 + # T5 #"t5-small": (["input_ids"], 11, False, "bert"), #"t5-base": (["input_ids"], 11, False, "bert"), #"t5-large": (["input_ids"], 11, False, "bert"), #"t5-3b": (["input_ids"], 11, False, "bert"), #"t5-11b": (["input_ids"], 11, False, "bert"), -# XLM-RoBERTa + # XLM-RoBERTa "xlm-roberta-base": (["input_ids"], 11, False, "bert"), "xlm-roberta-large": (["input_ids"], 11, True, "bert"), -# FlauBERT + # FlauBERT "flaubert/flaubert_small_cased": (["input_ids"], 11, False, "bert"), "flaubert/flaubert_base_uncased": (["input_ids"], 11, False, "bert"), "flaubert/flaubert_base_cased": (["input_ids"], 11, False, "bert"), "flaubert/flaubert_large_cased": (["input_ids"], 11, False, "bert"), -# Bart + # Bart #"facebook/bart-large": (["input_ids"], 11, False, "bert"), #"facebook/bart-base": (["input_ids"], 11, False, "bert"), #"facebook/bart-large-mnli": (["input_ids"], 11, False, "bert"), #"facebook/bart-large-cnn": (["input_ids"], 11, False, "bert"), #"facebook/mbart-large-en-ro": (["input_ids"], 11, True, "bert"), -# DialoGPT + # DialoGPT "microsoft/DialoGPT-small": (["input_ids"], 11, False, "gpt2"), "microsoft/DialoGPT-medium": (["input_ids"], 11, False, "gpt2"), "microsoft/DialoGPT-large": (["input_ids"], 11, True, "gpt2"), -# Reformer + # Reformer #"google/reformer-enwik8": (["input_ids"], 11, False, "bert"), #"google/reformer-crime-and-punishment": (["input_ids"], 11, False, "bert"), -# MarianMT + # MarianMT #"Helsinki-NLP/opus-mt-ROMANCE-en": (["input_ids"], 12, False, "bert"), -# Longformer + # Longformer #"allenai/longformer-base-4096": (["input_ids"], 12, False, "bert"), #"allenai/longformer-large-4096": (["input_ids"], 12, False, "bert"), } diff --git a/onnxruntime/python/tools/transformers/onnx_exporter.py b/onnxruntime/python/tools/transformers/onnx_exporter.py index c8b8c8d742..fa870ddf98 100644 --- a/onnxruntime/python/tools/transformers/onnx_exporter.py +++ b/onnxruntime/python/tools/transformers/onnx_exporter.py @@ -18,7 +18,8 @@ logger = logging.getLogger(__name__) # Walkaround by replacing torch.triu using self-defined op # Since torch.triu cannot be exported to ONNX. See https://github.com/pytorch/pytorch/issues/32968 -torch_func = {"triu" : torch.triu} +torch_func = {"triu": torch.triu} + def triu_onnx(x, diagonal=0, out=None): assert out is None @@ -26,15 +27,18 @@ def triu_onnx(x, diagonal=0, out=None): torch_triu = torch_func["triu"] template = torch_triu(torch.ones((1024, 1024), dtype=torch.uint8), diagonal) - mask = template[:x.size(0),:x.size(1)] + mask = template[:x.size(0), :x.size(1)] return torch.where(mask.bool(), x, torch.zeros_like(x)) + def replace_torch_functions(): torch.triu = triu_onnx + def restore_torch_functions(): torch.triu = torch_func["triu"] + def create_onnxruntime_input(vocab_size, batch_size, sequence_length, input_names): input_ids = numpy.random.randint(low=0, high=vocab_size - 1, size=(batch_size, sequence_length), dtype=numpy.int64) @@ -283,6 +287,7 @@ def export_onnx_model(model_name, opset_version, use_external_data_format, model use_external_data_format) optimize_onnx_model_by_ort(onnx_model_path, ort_model_path, use_gpu, overwrite, model_fusion_statistics) - max_input_size = tokenizer.max_model_input_sizes[model_name] if model_name in tokenizer.max_model_input_sizes else 1024 + max_input_size = tokenizer.max_model_input_sizes[ + model_name] if model_name in tokenizer.max_model_input_sizes else 1024 return onnx_model_path, is_valid_onnx_model, config.vocab_size, max_input_size diff --git a/setup.py b/setup.py index 5d141f858e..ec1c95d312 100644 --- a/setup.py +++ b/setup.py @@ -236,6 +236,7 @@ packages = [ 'onnxruntime.tools', 'onnxruntime.quantization', 'onnxruntime.quantization.operators', + 'onnxruntime.transformers', ] if '--enable_training' in sys.argv: