diff --git a/onnxruntime/python/tools/transformers/benchmark_helper.py b/onnxruntime/python/tools/transformers/benchmark_helper.py index 626e7be65b..7fe05df333 100644 --- a/onnxruntime/python/tools/transformers/benchmark_helper.py +++ b/onnxruntime/python/tools/transformers/benchmark_helper.py @@ -297,10 +297,8 @@ def measure_memory(is_gpu, func): return None monitor = MemoryMonitor(False) - if is_gpu: - print(f"GPU memory usage before testing: {monitor.measure_gpu_usage()}") - else: - print(f"Peak CPU memory usage before testing: {monitor.measure_cpu_usage():.2f} MB") + + memory_before_test = monitor.measure_gpu_usage() if is_gpu else monitor.measure_cpu_usage() from concurrent.futures import ThreadPoolExecutor with ThreadPoolExecutor() as executor: @@ -312,8 +310,15 @@ def measure_memory(is_gpu, func): finally: monitor.keep_measuring = False max_usage = mem_thread.result() + if is_gpu: - print(f"Peak GPU memory usage: {max_usage}") + print(f"GPU memory usage: before={memory_before_test} peak={max_usage}") + if len(memory_before_test) >= 1 and len(max_usage) >= 1: + before = memory_before_test[0]["max_used_MB"] + after = max_usage[0]["max_used_MB"] + return after - before + else: + return None else: - print(f"Peak CPU memory usage: {max_usage:.2f} MB") - return max_usage + print(f"CPU memory usage: before={memory_before_test:.1f} MB, peak={max_usage:.1f} MB") + return max_usage - memory_before_test diff --git a/onnxruntime/python/tools/transformers/longformer/benchmark_longformer.py b/onnxruntime/python/tools/transformers/longformer/benchmark_longformer.py index 30b13d4e8b..e877e55872 100644 --- a/onnxruntime/python/tools/transformers/longformer/benchmark_longformer.py +++ b/onnxruntime/python/tools/transformers/longformer/benchmark_longformer.py @@ -13,17 +13,20 @@ # cd ../torch_extensions # python setup.py install # cd ../longformer -# python convert_longformer_to_onnx.py --model longformer-base-4096 --precision fp32 --optimize_onnx +# python 
convert_longformer_to_onnx.py --model longformer-base-4096 --precision fp16 --optimize_onnx +# +# When no arguments are given, all available tests (memory & latency) run on the longformer-base-4096 pretrained model. +# python benchmark_longformer.py # # Benchmark the latency (Exported onnx model is in the current directory): -# python benchmark_longformer.py --models longformer-base-4096 --batch_sizes 1 --sequence_lengths 512 1024 2048 4096 --global_lengths 8 --onnx_dir . --validate_onnx -t 100 +# python benchmark_longformer.py --model longformer-base-4096 --batch_sizes 1 --sequence_lengths 512 1024 2048 4096 --global_lengths 8 --onnx ./longformer-base-4096_fp16.onnx --validate_onnx -t 100 # # Benchmark GPU peak memory: # export ORT_LONGFORMER_COMPACT_MEMORY=0 -# python benchmark_longformer.py --models longformer-base-4096 --batch_sizes 1 --sequence_lengths 4096 --global_lengths 8 --onnx_dir . --memory -t 10 +# python benchmark_longformer.py --model longformer-base-4096 --batch_sizes 1 --sequence_lengths 4096 --global_lengths 8 --onnx ./longformer-base-4096_fp16.onnx --memory -t 10 # export ORT_LONGFORMER_COMPACT_MEMORY=1 -# python benchmark_longformer.py --models longformer-base-4096 --batch_sizes 1 --sequence_lengths 4096 --global_lengths 8 --onnx_dir . --memory -t 10 -# By default, compact memory kernel is not enabled since it is slower. You need set an environment variable ORT_LONGFORMER_COMPACT_MEMORY=1 to enable it, which uses less memory in this test. +# python benchmark_longformer.py --model longformer-base-4096 --batch_sizes 1 --sequence_lengths 4096 --global_lengths 8 --onnx ./longformer-base-4096_fp16.onnx --memory -t 10 +# By default, the compact memory kernel is not enabled. You need to set the environment variable ORT_LONGFORMER_COMPACT_MEMORY=1 to enable it.
import timeit from datetime import datetime @@ -34,7 +37,6 @@ import sys import torch import onnxruntime import numpy as np -import pprint import math from longformer_helper import LongformerHelper, PRETRAINED_LONGFORMER_MODELS @@ -67,12 +69,14 @@ def test_torch_latency(device, model, model_name, batch_sizes, sequence_lengths, "precision": "fp32", "io_binding": "", "model_name": model_name, + "description": model_name + "[torch]", "inputs": 3, "threads": num_threads, "batch_size": batch_size, "sequence_length": sequence_length, "global_length": global_length, "datetime": str(datetime.now()), + "memory": "?", } result.update(benchmark_helper.get_latency_result(runtimes, batch_size)) @@ -96,11 +100,13 @@ def test_parity(device, model, ort_session, batch_size, sequence_length, global_ if verbose and (math.isnan(max_diff) or max_diff > 0.001): print("torch last_state:", torch_outputs[0]) print("ort last_state:", ort_outputs[0]) + return max_diff def test_ort_latency(device, model, model_name, + description, ort_session, batch_sizes, sequence_lengths, @@ -135,6 +141,7 @@ def test_ort_latency(device, result_template = { "model_name": model_name, + "description": description, "inputs": 3, "engine": "OnnxRuntime", "version": onnxruntime.__version__, @@ -147,6 +154,7 @@ def test_ort_latency(device, "global_length": global_length, "test_times": test_times, "datetime": str(datetime.now()), + "memory": "", } if not disable_io_binding: @@ -172,12 +180,12 @@ def test_ort_latency(device, repeat_times=test_times, batch_size=batch_size) - pprint.pprint(result) - results.append(result) - if validate_onnx: - test_parity(device, model, ort_session, batch_size, sequence_length, global_length, verbose) + max_diff = test_parity(device, model, ort_session, batch_size, sequence_length, global_length, + verbose) + result["description"] += f"(max_diff={max_diff})" + results.append(result) return results @@ -198,83 +206,120 @@ def test_ort_memory(device, onnx_model_path, batch_size, 
sequence_length, global for _ in range(test_times): ort_outputs = session.run(None, ort_inputs) - benchmark_helper.measure_memory(is_gpu=True, func=inference) - print("Memory test is done") + memory_used = benchmark_helper.measure_memory(is_gpu=True, func=inference) + + return { + "onnx_model": onnx_model_path, + "batch_size": batch_size, + "sequence_length": sequence_length, + "global_length": global_length, + "test_times": test_times, + "num_threads": num_threads, + "memory": memory_used + } -def test_all(args): - # Currently, the longformer attention operator could only run in GPU (no CPU implementation yet). - device = torch.device('cuda:0') +def load_torch_model(model_name, device): + torch_model_name_or_dir = PRETRAINED_LONGFORMER_MODELS[ + model_name] if model_name in PRETRAINED_LONGFORMER_MODELS else model_name - results = [] - for model_name in args.models: - # Here we run an example input - from transformers import LongformerModel - torch_model_name_or_dir = PRETRAINED_LONGFORMER_MODELS[model_name] - model = LongformerModel.from_pretrained(torch_model_name_or_dir) # pretrained model name or directory - model.to(device) + from transformers import LongformerModel + model = LongformerModel.from_pretrained(torch_model_name_or_dir) + model.to(device) + return model - # Search onnx model in the following order: optimized fp16 model, optimized fp32 model, raw model - # TODO: call convert_longformer_to_onnx to export onnx instead. 
- import os.path - optimized = False - precision = 'fp32' - onnx_model_path = os.path.join(args.onnx_dir, model_name + ".onnx") - optimized_fp32_model = os.path.join(args.onnx_dir, model_name + "_fp32.onnx") - optimized_fp16_model = os.path.join(args.onnx_dir, model_name + "_fp16.onnx") - if os.path.isfile(optimized_fp16_model): - onnx_model_path = optimized_fp16_model - optimized = True - precision = 'fp16' - elif os.path.isfile(optimized_fp32_model): - onnx_model_path = optimized_fp32_model - optimized = True - print("ONNX model path:", onnx_model_path) - for num_threads in args.num_threads: - if "torch" in args.engines: - results += test_torch_latency(device, model, model_name, args.batch_sizes, args.sequence_lengths, - args.global_lengths, args.test_times, num_threads, args.verbose) +def find_onnx_model(model_name, onnx_dir='.'): + # Search onnx model in the following order: optimized fp16 model, optimized fp32 model, raw model + # TODO: call convert_longformer_to_onnx to export onnx instead. 
+ import os.path + onnx_model_path = os.path.join(onnx_dir, model_name + ".onnx") + optimized_fp32_model = os.path.join(onnx_dir, model_name + "_fp32.onnx") + optimized_fp16_model = os.path.join(onnx_dir, model_name + "_fp16.onnx") + if os.path.isfile(optimized_fp16_model): + onnx_model_path = optimized_fp16_model + elif os.path.isfile(optimized_fp32_model): + onnx_model_path = optimized_fp32_model + return onnx_model_path - if "onnxruntime" in args.engines: - if args.memory: - test_ort_memory(device, onnx_model_path, args.batch_sizes[0], args.sequence_lengths[0], - args.global_lengths[0], args.test_times, num_threads) - else: # test latency - session = benchmark_helper.create_onnxruntime_session(onnx_model_path, - use_gpu=True, - enable_all_optimization=True, - num_threads=num_threads) - if session is None: - raise RuntimeError(f"Failed to create ORT sesssion from ONNX file {onnx_model_path}") - results += test_ort_latency(device, model, model_name, session, args.batch_sizes, - args.sequence_lengths, args.global_lengths, args.test_times, - num_threads, optimized, precision, args.validate_onnx, - args.disable_io_binding, args.verbose) - return results +def test_memory(args, device): + if len(args.batch_sizes) > 1: + raise RuntimeError("For memory test, only one batch_size (-b) is allowed.") + if len(args.sequence_lengths) > 1: + raise RuntimeError("For memory test, only one sequence_length (-s) is allowed.") + if len(args.global_lengths) > 1: + raise RuntimeError("For memory test, only one global_length (-g) is allowed.") + + model_name = args.model + onnx_model_path = find_onnx_model(model_name) if not args.onnx else args.onnx + + torch.cuda.empty_cache() + return test_ort_memory(device, onnx_model_path, args.batch_sizes[0], args.sequence_lengths[0], + args.global_lengths[0], args.test_times, args.num_threads) + + +def test_ort(args, device): + model_name = args.model + + onnx_model_path = find_onnx_model(model_name) if not args.onnx else args.onnx + + optimized = 
onnx_model_path.endswith("_fp16.onnx") or onnx_model_path.endswith("_fp32.onnx") + precision = 'fp32' if not onnx_model_path.endswith("_fp16.onnx") else 'fp16' + + model = load_torch_model(model_name, device) + + num_threads = args.num_threads + + session = benchmark_helper.create_onnxruntime_session(onnx_model_path, + use_gpu=True, + enable_all_optimization=True, + num_threads=num_threads) + if session is None: + raise RuntimeError(f"Failed to create ORT sesssion from ONNX file {onnx_model_path}") + + description = onnx_model_path + if (os.environ.get('ORT_LONGFORMER_COMPACT_MEMORY', '0') == "1"): + description += "[compact_memory]" + + return test_ort_latency(device, model, model_name, description, session, args.batch_sizes, args.sequence_lengths, + args.global_lengths, args.test_times, num_threads, optimized, precision, args.validate_onnx, + args.disable_io_binding, args.verbose) + + +def test_torch(args, device): + model = load_torch_model(args.model, device) + return test_torch_latency(device, model, args.model, args.batch_sizes, args.sequence_lengths, args.global_lengths, + args.test_times, args.num_threads, args.verbose) + + +def test_latency(args, device): + if "onnxruntime" == args.engine: + return test_ort(args, device) + elif "torch" == args.engine: + return test_torch(args, device) + + raise RuntimeError("unknown engine " + args.engine) def parse_arguments(argv=None): parser = argparse.ArgumentParser() parser.add_argument("-m", - "--models", + "--model", required=False, - nargs="+", type=str, - default=["longformer-base-4096"], + default="longformer-base-4096", help="Checkpoint directory or pre-trained model names in the list: " + ", ".join(PRETRAINED_LONGFORMER_MODELS.keys())) parser.add_argument("-e", - "--engines", + "--engine", required=False, - nargs="+", type=str, - default=['onnxruntime'], + default='onnxruntime', choices=['onnxruntime', 'torch'], - help="Engines to benchmark. 
For large model, recommend to test only one engine at a time.") + help="Engine to benchmark.") parser.add_argument("-t", "--test_times", @@ -297,11 +342,7 @@ def parse_arguments(argv=None): "Sequence lengths. It could have multiple values in latency test. If --export_padding is not used in exporting onnx model, sequence length shall be multiple of window size." ) - parser.add_argument("--onnx_dir", - required=False, - type=str, - default=os.path.join('.', 'onnx_models'), - help="Directory to search onnx models.") + parser.add_argument("--onnx", required=False, type=str, default=None, help="Onnx model path") parser.add_argument("-g", "--global_lengths", @@ -310,13 +351,7 @@ def parse_arguments(argv=None): default=[0], help="Number of global tokens. It could have multiple values in latency test.") - parser.add_argument("-n", - "--num_threads", - required=False, - nargs="+", - type=int, - default=[0], - help="Threads to use. It could have multiple values in latency test.") + parser.add_argument("-n", "--num_threads", required=False, type=int, default=0, help="Threads to use.") parser.add_argument("-v", "--validate_onnx", @@ -331,78 +366,35 @@ def parse_arguments(argv=None): parser.add_argument("--verbose", required=False, action="store_true", help="Print more information.") args = parser.parse_args(argv) + return args -def output_summary(results, csv_filename, args): - with open(csv_filename, mode="a", newline='') as csv_file: - header_names = [ - "model_name", "inputs", "engine", "version", "device", "precision", "optimizer", "io_binding", "threads" - ] - data_names = [] - for batch_size in args.batch_sizes: - for sequence_length in args.sequence_lengths: - for global_length in args.global_lengths: - data_names.append(f"b{batch_size}_s{sequence_length}_g{global_length}") - - csv_writer = csv.DictWriter(csv_file, fieldnames=header_names + data_names) - csv_writer.writeheader() - for model in args.models: - for input_count in [1, 2, 3]: - for engine_name in 
args.engines: - for io_binding in [True, False, ""]: - for threads in args.num_threads: - row = {} - for result in results: - if result["model_name"] == model and result["inputs"] == input_count and \ - result["engine"] == engine_name and result["io_binding"] == io_binding and \ - result["threads"] == threads: - headers = {k: v for k, v in result.items() if k in header_names} - if not row: - row.update(headers) - row.update({k: "" for k in data_names}) - else: - for k in header_names: - assert row[k] == headers[k] - b = result["batch_size"] - s = result["sequence_length"] - g = result["global_length"] - row[f"b{b}_s{s}_g{g}"] = result["average_latency_ms"] - if row: - csv_writer.writerow(row) - - print(f"Summary results are saved to csv file: {csv_filename}") - - def output_details(results, csv_filename): + latency_results = [result for result in results if 'average_latency_ms' in result] + if len(latency_results) == 0: + print("No latency results for output.") + return + with open(csv_filename, mode="a", newline='') as csv_file: column_names = [ "engine", "version", "device", "precision", "optimizer", "io_binding", "model_name", "inputs", "threads", - "batch_size", "sequence_length", "global_length", "datetime", "test_times", "QPS", "average_latency_ms", - "latency_variance", "latency_90_percentile", "latency_95_percentile", "latency_99_percentile" + "datetime", "test_times", "description", "batch_size", "sequence_length", "global_length", "memory", "QPS", + "average_latency_ms", "latency_variance", "latency_90_percentile", "latency_95_percentile", + "latency_99_percentile" ] csv_writer = csv.DictWriter(csv_file, fieldnames=column_names) csv_writer.writeheader() - for result in results: + for result in latency_results: + print( + f"b={result['batch_size']}, s={result['sequence_length']}, g={result['global_length']}, latency={result['average_latency_ms']}ms, memory={result['memory']}MB {result['description']}" + ) csv_writer.writerow(result) - print(f"Detail 
results are saved to csv file: {csv_filename}") -def main(args): - assert len(args.models) == 1, "run only one model at a time" - - if args.memory: - if len(args.batch_sizes) > 1: - raise RuntimeError("For memory test, only one batch_size (-b) is allowed.") - if len(args.sequence_lengths) > 1: - raise RuntimeError("For memory test, only one sequence_length (-s) is allowed.") - if len(args.global_lengths) > 1: - raise RuntimeError("For memory test, only one global_length (-g) is allowed.") - if len(args.num_threads) > 1: - raise RuntimeError("For memory test, only one value of --num_threads is allowed.") - +def run(args): if not torch.cuda.is_available(): raise RuntimeError("Please install PyTorch with Cuda, and use a machine with GPU for testing gpu performance.") @@ -411,18 +403,63 @@ def main(args): # set random seed manully to get deterministic results #benchmark_helper.set_random_seed(123) - all_results = test_all(args) + # Currently, the longformer attention operator could only run in GPU (no CPU implementation yet). 
+ device = torch.device('cuda:0') - time_stamp = datetime.now().strftime("%Y%m%d-%H%M%S") - csv_filename = f"benchmark_detail_{time_stamp}.csv" - output_details(all_results, csv_filename) + if args.memory: + return test_memory(args, device) + else: + return test_latency(args, device) - csv_filename = f"benchmark_summary_{time_stamp}.csv" - output_summary(all_results, csv_filename, args) + +def test_all(): + results = [] + test_times = 100 + sequence_lengths = [512, 1024, 2048, 4096] + for model_name in ['longformer-base-4096']: + for batch_size in [1]: + for sequence_length in sequence_lengths: + for global_length in [8]: + engine_name = 'torch' + args = parse_arguments( + f"-e {engine_name} -t {test_times} -b {batch_size} -s {sequence_length} -g {global_length} -t {test_times} -m {model_name}" + .split(' ')) + results += run(args) + + engine_name = 'onnxruntime' + onnx_paths = [f"{model_name}_fp32.onnx", f"{model_name}_fp16.onnx"] # optimized models + for onnx_path in onnx_paths: + if os.path.exists(onnx_path): + for compact_memory in ["0", "1"]: + os.environ["ORT_LONGFORMER_COMPACT_MEMORY"] = compact_memory + print("ORT_LONGFORMER_COMPACT_MEMORY=", compact_memory) + + args = parse_arguments( + f"--disable_io_binding -e {engine_name} --onnx {onnx_path} -t {test_times} -b {batch_size} -s {sequence_length} -g {global_length} -t 10 -m {model_name} --memory" + .split(' ')) + memory_results = run(args) + print(memory_results) + + args = parse_arguments( + f"--disable_io_binding -e {engine_name} --onnx {onnx_path} -t {test_times} -b {batch_size} -s {sequence_length} -g {global_length} -t {test_times} -m {model_name} --validate_onnx" + .split(' ')) + latency_results = run(args) + if len(latency_results) == 1: + latency_results[0]["memory"] = memory_results["memory"] + + print(latency_results) + + results += latency_results + return results if __name__ == "__main__": - args = parse_arguments() - #args = parse_arguments("-e onnxruntime -t 1 -b 1 -s 4 -g 2 --onnx_dir . 
-t 1 -m longformer-random-tiny".split(' ')) + if len(sys.argv) > 1: + args = parse_arguments() + results = run(args) + else: + results = test_all() - main(args) + time_stamp = datetime.now().strftime("%Y%m%d-%H%M%S") + csv_filename = f"benchmark_detail_{time_stamp}.csv" + output_details(results, csv_filename) diff --git a/onnxruntime/python/tools/transformers/longformer/convert_longformer_to_onnx.py b/onnxruntime/python/tools/transformers/longformer/convert_longformer_to_onnx.py index b827d05c3a..6eab8e4f21 100644 --- a/onnxruntime/python/tools/transformers/longformer/convert_longformer_to_onnx.py +++ b/onnxruntime/python/tools/transformers/longformer/convert_longformer_to_onnx.py @@ -22,7 +22,7 @@ import transformers from torch.onnx import register_custom_op_symbolic from torch.onnx.symbolic_helper import parse_args from packaging import version - +from pathlib import Path from longformer_helper import LongformerHelper, PRETRAINED_LONGFORMER_MODELS