diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py index 53e80423be..1cfbd73e3b 100644 --- a/onnxruntime/python/tools/transformers/benchmark.py +++ b/onnxruntime/python/tools/transformers/benchmark.py @@ -58,7 +58,8 @@ logger = logging.getLogger('') from huggingface_models import MODELS, MODEL_CLASSES -cpu_count = psutil.cpu_count(logical=True) +cpu_count = psutil.cpu_count(logical=False) + # Set OMP environment variable before importing onnxruntime or torch. if "OMP_NUM_THREADS" not in os.environ: os.environ["OMP_NUM_THREADS"] = str(cpu_count) @@ -67,9 +68,9 @@ import torch from transformers import (AutoConfig, AutoTokenizer, AutoModel, GPT2Model) -def run_onnxruntime(use_gpu, model_names, model_class, precision, batch_sizes, sequence_lengths, repeat_times, - input_counts, optimize_onnx, validate_onnx, cache_dir, onnx_dir, verbose, overwrite, - disable_ort_io_binding, use_raw_attention_mask, thread_num, model_fusion_statistics, model_source): +def run_onnxruntime(use_gpu, model_names, model_class, precision, num_threads, batch_sizes, sequence_lengths, + repeat_times, input_counts, optimize_onnx, validate_onnx, cache_dir, onnx_dir, verbose, overwrite, + disable_ort_io_binding, use_raw_attention_mask, model_fusion_statistics, model_source): import onnxruntime results = [] @@ -93,14 +94,14 @@ def run_onnxruntime(use_gpu, model_names, model_class, precision, batch_sizes, s if 'pt' in model_source: with torch.no_grad(): onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length = export_onnx_model_from_pt( - model_name, MODELS[model_name][1], MODELS[model_name][2], MODELS[model_name][3], model_class, cache_dir, - onnx_dir, input_names, use_gpu, precision, optimize_onnx, validate_onnx, use_raw_attention_mask, overwrite, - model_fusion_statistics) + model_name, MODELS[model_name][1], MODELS[model_name][2], MODELS[model_name][3], model_class, + cache_dir, onnx_dir, input_names, use_gpu, precision, optimize_onnx, validate_onnx, + use_raw_attention_mask, overwrite, model_fusion_statistics) if 'tf' in model_source: onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length = export_onnx_model_from_tf( - model_name, MODELS[model_name][1], MODELS[model_name][2], MODELS[model_name][3], model_class, cache_dir, - onnx_dir, input_names, use_gpu, precision, optimize_onnx, validate_onnx, use_raw_attention_mask, overwrite, - model_fusion_statistics) + model_name, MODELS[model_name][1], MODELS[model_name][2], MODELS[model_name][3], model_class, + cache_dir, onnx_dir, input_names, use_gpu, precision, optimize_onnx, validate_onnx, + use_raw_attention_mask, overwrite, model_fusion_statistics) if not is_valid_onnx_model: continue @@ -108,7 +109,8 @@ def run_onnxruntime(use_gpu, model_names, model_class, precision, batch_sizes, s ort_session = create_onnxruntime_session(onnx_model_file, use_gpu, enable_all_optimization=True, - num_threads=thread_num) + num_threads=num_threads, + verbose=verbose) if ort_session is None: continue @@ -128,7 +130,8 @@ def run_onnxruntime(use_gpu, model_names, model_class, precision, batch_sizes, s continue input_value_type = numpy.int64 if 'pt' in model_source else numpy.int32 - ort_inputs = create_onnxruntime_input(vocab_size, batch_size, sequence_length, input_names, input_value_type) + ort_inputs = create_onnxruntime_input(vocab_size, batch_size, sequence_length, input_names, + input_value_type) result_template = { "engine": "onnxruntime", @@ -136,37 +139,36 @@ def run_onnxruntime(use_gpu, model_names, model_class, precision, batch_sizes, s "device": device, "optimizer": optimize_onnx, "precision": precision, - "io_binding": False, + "io_binding": not disable_ort_io_binding, "model_name": model_name, "inputs": num_inputs, + "threads": num_threads, "batch_size": batch_size, "sequence_length": sequence_length, "datetime": str(datetime.now()), } + logger.info("Run onnxruntime on {} with input shape {}".format(model_name, [batch_size, sequence_length])) - result = inference_ort(ort_session, ort_inputs, result_template, repeat_times, batch_size) - logger.info(result) - results.append(result) - - if not disable_ort_io_binding: - logger.info("Run onnxruntime with io binding on {} with input shape {}".format( - model_name, [batch_size, sequence_length])) + if disable_ort_io_binding: + result = inference_ort(ort_session, ort_inputs, result_template, repeat_times, batch_size) + else: # Get output sizes from a dummy ort run ort_outputs = ort_session.run(ort_output_names, ort_inputs) data_type = numpy.longlong if 'pt' in model_source else numpy.int32 result = inference_ort_with_io_binding(ort_session, ort_inputs, result_template, repeat_times, - ort_output_names, ort_outputs, output_buffers, max_last_state_size, - max_pooler_size, batch_size, device, data_type) - logger.info(result) - results.append(result) + ort_output_names, ort_outputs, output_buffers, + max_last_state_size, max_pooler_size, batch_size, device, + data_type) + logger.info(result) + results.append(result) return results -def run_pytorch(use_gpu, model_names, model_class, precision, batch_sizes, sequence_lengths, repeat_times, torchscript, - cache_dir, verbose): +def run_pytorch(use_gpu, model_names, model_class, precision, num_threads, batch_sizes, sequence_lengths, repeat_times, + torchscript, cache_dir, verbose): results = [] if use_gpu and not torch.cuda.is_available(): logger.error("Please install PyTorch with Cuda, and use a machine with GPU for testing gpu performance.") @@ -223,6 +225,7 @@ def run_pytorch(use_gpu, model_names, model_class, precision, batch_sizes, seque "io_binding": "", "model_name": model_name, "inputs": 1, + "threads": num_threads, "batch_size": batch_size, "sequence_length": sequence_length, "datetime": str(datetime.now()), @@ -237,12 +240,12 @@ def run_pytorch(use_gpu, model_names, model_class, precision, batch_sizes, seque return results -def run_tensorflow(use_gpu, model_names, model_class, precision, batch_sizes, sequence_lengths, repeat_times, thread_n, cache_dir, - verbose): +def run_tensorflow(use_gpu, model_names, model_class, precision, num_threads, batch_sizes, sequence_lengths, + repeat_times, cache_dir, verbose): results = [] import tensorflow as tf - tf.config.threading.set_intra_op_parallelism_threads(thread_n) + tf.config.threading.set_intra_op_parallelism_threads(num_threads) if not use_gpu: tf.config.set_visible_devices([], 'GPU') @@ -251,7 +254,7 @@ def run_tensorflow(use_gpu, model_names, model_class, precision, batch_sizes, se logger.error("Please install Tensorflow-gpu, and use a machine with GPU for testing gpu performance.") return results - if use_gpu: # Restrict TensorFlow to only use the first GPU + if use_gpu: # Restrict TensorFlow to only use the first GPU physical_devices = tf.config.list_physical_devices('GPU') try: tf.config.set_visible_devices(physical_devices[0], 'GPU') @@ -264,11 +267,16 @@ def run_tensorflow(use_gpu, model_names, model_class, precision, batch_sizes, se for model_name in model_names: config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir) - model = load_pretrained_model(model_name, config=config, cache_dir=cache_dir, custom_model_class=model_class, is_tf_model=True) + model = load_pretrained_model(model_name, + config=config, + cache_dir=cache_dir, + custom_model_class=model_class, + is_tf_model=True) tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir) - max_input_size = tokenizer.max_model_input_sizes[model_name] if model_name in tokenizer.max_model_input_sizes else 1024 + max_input_size = tokenizer.max_model_input_sizes[ + model_name] if model_name in tokenizer.max_model_input_sizes else 1024 for batch_size in batch_sizes: if batch_size <= 0: @@ -278,7 +286,8 @@ def run_tensorflow(use_gpu, model_names, model_class, precision, batch_sizes, se if max_input_size is not None and sequence_length > max_input_size: continue - logger.info("Run Tensorflow on {} with input shape {}".format(model_name, [batch_size, sequence_length])) + logger.info("Run Tensorflow on {} with input shape {}".format(model_name, + [batch_size, sequence_length])) import random rng = random.Random() @@ -286,6 +295,7 @@ def run_tensorflow(use_gpu, model_names, model_class, precision, batch_sizes, se input_ids = tf.constant(values, shape=(batch_size, sequence_length), dtype=tf.int32) try: + def encoder_forward(): return model(input_ids, training=False) @@ -307,6 +317,7 @@ def run_tensorflow(use_gpu, model_names, model_class, precision, batch_sizes, se "io_binding": "", "model_name": model_name, "inputs": 1, + "threads": num_threads, "batch_size": batch_size, "sequence_length": sequence_length, "datetime": str(datetime.now()), @@ -430,7 +441,7 @@ def parse_arguments(): help='Disable running ONNX Runtime with binded inputs and outputs. ') parser.set_defaults(disable_ort_io_binding=False) - parser.add_argument("--thread_num", required=False, type=int, default=-1, help="Threads to use") + parser.add_argument("-n", "--num_threads", required=False, nargs="+", type=int, default=[0], help="Threads to use") args = parser.parse_args() return args @@ -449,6 +460,8 @@ def main(): logger.error("int8 is for CPU only") return + args.num_threads = sorted(set(cpu_count if x <= 0 else x for x in args.num_threads)) + logger.info(f"Arguments: {args}") if not os.path.exists(args.cache_dir): @@ -464,38 +477,39 @@ def main(): results = [] - thread_n = cpu_count if args.thread_num <= 0 else args.thread_num - torch.set_num_threads(thread_n) + for num_threads in args.num_threads: + torch.set_num_threads(num_threads) + logger.debug(torch.__config__.parallel_info()) + if enable_torch or enable_torchscript: + if args.input_counts != [1]: + logger.warning("--input_counts is not implemented for torch or torchscript engine.") - logger.debug(torch.__config__.parallel_info()) + if enable_torchscript: + results += run_pytorch(args.use_gpu, args.models, args.model_class, args.precision, num_threads, + args.batch_sizes, args.sequence_lengths, args.test_times, True, args.cache_dir, + args.verbose) - if enable_torch or enable_torchscript: - if args.input_counts != [1]: - logger.warning("--input_counts is not implemented for torch or torchscript engine.") + if enable_torch: + results += run_pytorch(args.use_gpu, args.models, args.model_class, args.precision, num_threads, + args.batch_sizes, args.sequence_lengths, args.test_times, False, args.cache_dir, + args.verbose) - if enable_torchscript: - results += run_pytorch(args.use_gpu, args.models, args.model_class, args.precision, args.batch_sizes, - args.sequence_lengths, args.test_times, True, args.cache_dir, args.verbose) + if enable_tensorflow: + results += run_tensorflow(args.use_gpu, args.models, args.model_class, args.precision, num_threads, + args.batch_sizes, args.sequence_lengths, args.test_times, args.cache_dir, + args.verbose) - if enable_torch: - results += run_pytorch(args.use_gpu, args.models, args.model_class, args.precision, args.batch_sizes, - args.sequence_lengths, args.test_times, False, args.cache_dir, args.verbose) - - if enable_tensorflow: - results += run_tensorflow(args.use_gpu, args.models, args.model_class, args.precision, args.batch_sizes, args.sequence_lengths, - args.test_times, thread_n, args.cache_dir, args.verbose) - - model_fusion_statistics = {} - if enable_onnxruntime: - try: - use_raw_attention_mask = True - results += run_onnxruntime(args.use_gpu, args.models, args.model_class, args.precision, args.batch_sizes, - args.sequence_lengths, args.test_times, args.input_counts, args.optimize_onnx, - args.validate_onnx, args.cache_dir, args.onnx_dir, args.verbose, args.overwrite, - args.disable_ort_io_binding, use_raw_attention_mask, args.thread_num, - model_fusion_statistics, args.model_source) - except: - logger.error(f"Exception", exc_info=True) + model_fusion_statistics = {} + if enable_onnxruntime: + try: + use_raw_attention_mask = True + results += run_onnxruntime(args.use_gpu, args.models, args.model_class, args.precision, num_threads, + args.batch_sizes, args.sequence_lengths, args.test_times, args.input_counts, + args.optimize_onnx, args.validate_onnx, args.cache_dir, args.onnx_dir, + args.verbose, args.overwrite, args.disable_ort_io_binding, + use_raw_attention_mask, model_fusion_statistics, args.model_source) + except: + logger.error(f"Exception", exc_info=True) time_stamp = datetime.now().strftime("%Y%m%d-%H%M%S") if model_fusion_statistics: diff --git a/onnxruntime/python/tools/transformers/benchmark_helper.py b/onnxruntime/python/tools/transformers/benchmark_helper.py index af99daabdc..9ef8d95264 100644 --- a/onnxruntime/python/tools/transformers/benchmark_helper.py +++ b/onnxruntime/python/tools/transformers/benchmark_helper.py @@ -113,8 +113,8 @@ def get_latency_result(runtimes, batch_size): def output_details(results, csv_filename): with open(csv_filename, mode="a", newline='') as csv_file: column_names = [ - "engine", "version", "device", "precision", "optimizer", "io_binding", "model_name", "inputs", "batch_size", - "sequence_length", "datetime", "test_times", "QPS", "average_latency_ms", "latency_variance", + "engine", "version", "device", "precision", "optimizer", "io_binding", "model_name", "inputs", "threads", + "batch_size", "sequence_length", "datetime", "test_times", "QPS", "average_latency_ms", "latency_variance", "latency_90_percentile", "latency_95_percentile", "latency_99_percentile" ] @@ -128,7 +128,9 @@ def output_details(results, csv_filename): def output_summary(results, csv_filename, args): with open(csv_filename, mode="a", newline='') as csv_file: - header_names = ["model_name", "inputs", "engine", "version", "device", "precision", "optimizer", "io_binding"] + header_names = [ + "model_name", "inputs", "engine", "version", "device", "precision", "optimizer", "io_binding", "threads" + ] data_names = [] for batch_size in args.batch_sizes: for sequence_length in args.sequence_lengths: @@ -140,22 +142,24 @@ def output_summary(results, csv_filename, args): for input_count in [1, 2, 3]: for engine_name in args.engines: for io_binding in [True, False, ""]: - row = {} - for result in results: - if result["model_name"] == model_name and result["inputs"] == input_count and result[ - "engine"] == engine_name and result["io_binding"] == io_binding: - headers = {k: v for k, v in result.items() if k in header_names} - if not row: - row.update(headers) - row.update({k: "" for k in data_names}) - else: - for k in header_names: - assert row[k] == headers[k] - b = result["batch_size"] - s = result["sequence_length"] - row[f"b{b}_s{s}"] = result["average_latency_ms"] - if row: - csv_writer.writerow(row) + for threads in args.num_threads: + row = {} + for result in results: + if result["model_name"] == model_name and result["inputs"] == input_count and result[ + "engine"] == engine_name and result["io_binding"] == io_binding and result[ + "threads"] == threads: + headers = {k: v for k, v in result.items() if k in header_names} + if not row: + row.update(headers) + row.update({k: "" for k in data_names}) + else: + for k in header_names: + assert row[k] == headers[k] + b = result["batch_size"] + s = result["sequence_length"] + row[f"b{b}_s{s}"] = result["average_latency_ms"] + if row: + csv_writer.writerow(row) logger.info(f"Summary results are saved to csv file: {csv_filename}") @@ -185,8 +189,18 @@ def inference_ort(ort_session, ort_inputs, result_template, repeat_times, batch_ return result -def inference_ort_with_io_binding(ort_session, ort_inputs, result_template, repeat_times, ort_output_names, ort_outputs, - output_buffers, max_last_state_size, max_pooler_size, batch_size, device, data_type=numpy.longlong): +def inference_ort_with_io_binding(ort_session, + ort_inputs, + result_template, + repeat_times, + ort_output_names, + ort_outputs, + output_buffers, + max_last_state_size, + max_pooler_size, + batch_size, + device, + data_type=numpy.longlong): result = {} # Bind inputs and outputs to onnxruntime session