diff --git a/onnxruntime/python/tools/transformers/convert_generation.py b/onnxruntime/python/tools/transformers/convert_generation.py index b59af41c49..17f0dd0bc6 100644 --- a/onnxruntime/python/tools/transformers/convert_generation.py +++ b/onnxruntime/python/tools/transformers/convert_generation.py @@ -1272,7 +1272,9 @@ def find_past_seq_len_usage(subg: GraphProto): return tensor_names_to_rename, nodes_to_remove -def replace_mha_with_gqa(model: OnnxModel, attn_mask: str, kv_num_heads: int = 0, world_size: int = 1): +def replace_mha_with_gqa( + model: OnnxModel, attn_mask: str, kv_num_heads: int = 0, world_size: int = 1, window_size: int = 0 +): # Insert attention_mask subgraph to calculate shared inputs for all GroupQueryAttention nodes # # attention_mask diff --git a/onnxruntime/python/tools/transformers/models/llama/README.md b/onnxruntime/python/tools/transformers/models/llama/README.md index 44dea3cb73..0e34fb0e69 100644 --- a/onnxruntime/python/tools/transformers/models/llama/README.md +++ b/onnxruntime/python/tools/transformers/models/llama/README.md @@ -1,3 +1,13 @@ +# Contents + - [LLaMA-2](#llama-2) + - [Exporting LLaMA-2](#exporting-llama-2) + - [Benchmarking LLaMA-2](#benchmark-llama-2) + - [Mistral](#mistral) + - [Exporting Mistral](#exporting-mistral) + - [Optimizing and Quantizing Mistral](#optimizing-and-quantizing-mistral) + - [Benchmarking Mistral](#benchmark-mistral) + + # LLaMA-2 ## Prerequisites @@ -372,3 +382,58 @@ python3 -m models.llama.benchmark_all \ --num-runs 1000 \ --timeout 60 # number of minutes before moving to the next benchmark ``` + +# Mistral + +## Introduction + +These tools for LLaMA-2 also allow the quantization and optimization of Mistral in ORT. + +## Exporting Mistral + +There is currently one supported way to export Mistral to ONNX format: + +### [Hugging Face Optimum](https://github.com/huggingface/optimum) + + +The following command will export Mistral in full precision: +``` +python -m optimum.exporters.onnx -m mistralai/Mistral-7B-v0.1 --library-name transformers /path/to/model/directory +``` + +## Optimizing and Quantizing Mistral + +To quantize Mistral to FP16 and apply fusion optimizations, you can run the following command: +``` +python -m models.llama.convert_to_onnx -i /path/to/model/directory -o /path/to/optimized_model/directory -p fp16 --optimize_optimum -m mistralai/Mistral-7B-v0.1 +``` + +## Benchmark Mistral +The benchmarking scripts in the LLaMA directory support Mistral benchmarking. To benchmark the ORT version, you can run: + +``` +python -m models.llama.benchmark \ + -bt ort-convert-to-onnx \ + -p fp16 \ + -m mistralai/Mistral-7B-v0.1 \ + --ort-model-path /path/to/model.onnx +``` + +To benchmark the Hugging Face implementation without `torch.compile`: + +``` +python -m models.llama.benchmark \ + -bt hf-pt-eager \ + -p fp16 \ + -m mistralai/Mistral-7B-v0.1 +``` + +And to benchmark the Hugging Face implementation with `torch.compile`: + +``` +python -m models.llama.benchmark \ + -bt hf-pt-compile \ + -p fp16 \ + -m mistralai/Mistral-7B-v0.1 +``` + diff --git a/onnxruntime/python/tools/transformers/models/llama/benchmark.py b/onnxruntime/python/tools/transformers/models/llama/benchmark.py index 021b0dd03a..a53dead77d 100644 --- a/onnxruntime/python/tools/transformers/models/llama/benchmark.py +++ b/onnxruntime/python/tools/transformers/models/llama/benchmark.py @@ -79,7 +79,7 @@ def get_inputs(args: argparse.Namespace, ort_model_inputs_len: int): return_dict=True, ) - elif args.benchmark_type == "hf-ort": + elif args.benchmark_type in {"hf-ort"}: if ort_model_inputs_len == 3: # [input_ids, attention_mask, position_ids] # Using split models in Optimum (e.g. created by Optimum export) init_inputs = get_sample_inputs( @@ -529,7 +529,13 @@ def get_args(rank=0): "--benchmark-type", type=str, required=True, - choices=["hf-pt-eager", "hf-pt-compile", "hf-ort", "ort-msft", "ort-convert-to-onnx"], + choices=[ + "hf-pt-eager", + "hf-pt-compile", + "hf-ort", + "ort-msft", + "ort-convert-to-onnx", + ], ) parser.add_argument( "-m", diff --git a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py index c9c7f4d39d..e694b5050c 100644 --- a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py +++ b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py @@ -391,7 +391,7 @@ def run_torchscript_merged_export( # Optimize the model as FP32 -def optimize_export(config: AutoConfig, input_path: str, output_path: str): +def optimize_export(config: AutoConfig, input_path: str, output_path: str, remove_model: bool = True): from fusion_options import FusionOptions optimization_options = FusionOptions("gpt2") @@ -407,7 +407,8 @@ def optimize_export(config: AutoConfig, input_path: str, output_path: str): ) model_opt.save_model_to_file(output_path, use_external_data_format=True) logger.info(f"The ONNX model at {input_path} has been successfully optimized and saved at {output_path}!") - remove_existing_model(input_path) + if remove_model: + remove_existing_model(input_path) def convert_to_float16( @@ -438,7 +439,7 @@ def convert_to_float16( return new_paths -def use_group_query_attention(config: AutoConfig, fp16_model_opt: OnnxModel, world_size: int = 1): +def use_group_query_attention(config: AutoConfig, fp16_model_opt: OnnxModel, world_size: int = 1, window_size: int = 0): # Replace MultiHeadAttention with GroupQueryAttention fp16_model_opt = replace_mha_with_gqa(fp16_model_opt, "attention_mask", config.num_key_value_heads, world_size) fp16_model_opt.prune_graph() @@ -539,6 +540,23 @@ def remove_existing_files(output_path: str): logger.warning(f"Removed {filepath}") +def optimize_optimum(config: AutoConfig, args: argparse.Namespace): + tmp_file = os.path.join(args.output, args.model_name + ".tmp.onnx") + output_file = os.path.join(args.output, args.model_name + ".onnx") + optimize_export(config, args.input, tmp_file, remove_model=False) + logger.info(f"Model successfully optimized to {tmp_file}") + opt_model = OnnxModel(onnx.load_model(tmp_file, load_external_data=True)) + if args.precision == Precision.FLOAT16: + opt_model.convert_float_to_float16(keep_io_types=False) + window_size = 0 if not hasattr(config, "sliding_window") else config.sliding_window + opt_model = use_group_query_attention(config, opt_model, args.world_size, window_size) + logger.info("Model successfully fused and quantized to FP16!") + opt_model.save_model_to_file(output_file, use_external_data_format=True) + logger.info(f"Output model successfully saved to {output_file}") + logger.info(f"Removing {tmp_file}") + remove_existing_model(tmp_file) + + def get_args(): parser = argparse.ArgumentParser() @@ -554,7 +572,7 @@ def get_args(): "--input", required=False, default=os.path.join("."), - help="Directory path to PyTorch model and associated files if saved on disk", + help="Directory path to PyTorch model and associated files if saved on disk, or ONNX model file location if optimize_optimum is passed.", ) parser.add_argument( @@ -720,6 +738,13 @@ def get_args(): help="model cache dir to override default HF cache dir to avoid overflood the /home dir", ) + parser.add_argument( + "--optimize_optimum", + action="store_true", + help="Avoid exporting model, only apply quantizations and optimizations to existing model exported from optimum.", + ) + parser.set_defaults(optimize_optimum=False) + args = parser.parse_args() return args @@ -740,6 +765,7 @@ def main(): world_size = get_size() rank = get_rank() + args.world_size = world_size # Load model and config use_auth_token = args.input == os.path.join(".") @@ -754,6 +780,11 @@ def main(): location = args.original_model_name if use_auth_token else args.input + if args.optimize_optimum: + config = AutoConfig.from_pretrained(args.original_model_name) + optimize_optimum(config, args) + return + # Use CUDA for LLaMA-2-70B to speed up export and CPU for other models l_config, llama = setup_torch_model( args, location, use_auth_token, device=args.device if args.model_name == "Llama-2-70b-hf" else None