mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-06 00:03:22 +00:00
Allow benchmark different threads (#5390)
This commit is contained in:
parent
094384781e
commit
8ee2b08325
2 changed files with 111 additions and 83 deletions
|
|
@ -58,7 +58,8 @@ logger = logging.getLogger('')
|
|||
|
||||
from huggingface_models import MODELS, MODEL_CLASSES
|
||||
|
||||
cpu_count = psutil.cpu_count(logical=True)
|
||||
cpu_count = psutil.cpu_count(logical=False)
|
||||
|
||||
# Set OMP environment variable before importing onnxruntime or torch.
|
||||
if "OMP_NUM_THREADS" not in os.environ:
|
||||
os.environ["OMP_NUM_THREADS"] = str(cpu_count)
|
||||
|
|
@ -67,9 +68,9 @@ import torch
|
|||
from transformers import (AutoConfig, AutoTokenizer, AutoModel, GPT2Model)
|
||||
|
||||
|
||||
def run_onnxruntime(use_gpu, model_names, model_class, precision, batch_sizes, sequence_lengths, repeat_times,
|
||||
input_counts, optimize_onnx, validate_onnx, cache_dir, onnx_dir, verbose, overwrite,
|
||||
disable_ort_io_binding, use_raw_attention_mask, thread_num, model_fusion_statistics, model_source):
|
||||
def run_onnxruntime(use_gpu, model_names, model_class, precision, num_threads, batch_sizes, sequence_lengths,
|
||||
repeat_times, input_counts, optimize_onnx, validate_onnx, cache_dir, onnx_dir, verbose, overwrite,
|
||||
disable_ort_io_binding, use_raw_attention_mask, model_fusion_statistics, model_source):
|
||||
import onnxruntime
|
||||
|
||||
results = []
|
||||
|
|
@ -93,14 +94,14 @@ def run_onnxruntime(use_gpu, model_names, model_class, precision, batch_sizes, s
|
|||
if 'pt' in model_source:
|
||||
with torch.no_grad():
|
||||
onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length = export_onnx_model_from_pt(
|
||||
model_name, MODELS[model_name][1], MODELS[model_name][2], MODELS[model_name][3], model_class, cache_dir,
|
||||
onnx_dir, input_names, use_gpu, precision, optimize_onnx, validate_onnx, use_raw_attention_mask, overwrite,
|
||||
model_fusion_statistics)
|
||||
model_name, MODELS[model_name][1], MODELS[model_name][2], MODELS[model_name][3], model_class,
|
||||
cache_dir, onnx_dir, input_names, use_gpu, precision, optimize_onnx, validate_onnx,
|
||||
use_raw_attention_mask, overwrite, model_fusion_statistics)
|
||||
if 'tf' in model_source:
|
||||
onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length = export_onnx_model_from_tf(
|
||||
model_name, MODELS[model_name][1], MODELS[model_name][2], MODELS[model_name][3], model_class, cache_dir,
|
||||
onnx_dir, input_names, use_gpu, precision, optimize_onnx, validate_onnx, use_raw_attention_mask, overwrite,
|
||||
model_fusion_statistics)
|
||||
model_name, MODELS[model_name][1], MODELS[model_name][2], MODELS[model_name][3], model_class,
|
||||
cache_dir, onnx_dir, input_names, use_gpu, precision, optimize_onnx, validate_onnx,
|
||||
use_raw_attention_mask, overwrite, model_fusion_statistics)
|
||||
|
||||
if not is_valid_onnx_model:
|
||||
continue
|
||||
|
|
@ -108,7 +109,8 @@ def run_onnxruntime(use_gpu, model_names, model_class, precision, batch_sizes, s
|
|||
ort_session = create_onnxruntime_session(onnx_model_file,
|
||||
use_gpu,
|
||||
enable_all_optimization=True,
|
||||
num_threads=thread_num)
|
||||
num_threads=num_threads,
|
||||
verbose=verbose)
|
||||
if ort_session is None:
|
||||
continue
|
||||
|
||||
|
|
@ -128,7 +130,8 @@ def run_onnxruntime(use_gpu, model_names, model_class, precision, batch_sizes, s
|
|||
continue
|
||||
|
||||
input_value_type = numpy.int64 if 'pt' in model_source else numpy.int32
|
||||
ort_inputs = create_onnxruntime_input(vocab_size, batch_size, sequence_length, input_names, input_value_type)
|
||||
ort_inputs = create_onnxruntime_input(vocab_size, batch_size, sequence_length, input_names,
|
||||
input_value_type)
|
||||
|
||||
result_template = {
|
||||
"engine": "onnxruntime",
|
||||
|
|
@ -136,37 +139,36 @@ def run_onnxruntime(use_gpu, model_names, model_class, precision, batch_sizes, s
|
|||
"device": device,
|
||||
"optimizer": optimize_onnx,
|
||||
"precision": precision,
|
||||
"io_binding": False,
|
||||
"io_binding": not disable_ort_io_binding,
|
||||
"model_name": model_name,
|
||||
"inputs": num_inputs,
|
||||
"threads": num_threads,
|
||||
"batch_size": batch_size,
|
||||
"sequence_length": sequence_length,
|
||||
"datetime": str(datetime.now()),
|
||||
}
|
||||
|
||||
logger.info("Run onnxruntime on {} with input shape {}".format(model_name,
|
||||
[batch_size, sequence_length]))
|
||||
result = inference_ort(ort_session, ort_inputs, result_template, repeat_times, batch_size)
|
||||
logger.info(result)
|
||||
results.append(result)
|
||||
|
||||
if not disable_ort_io_binding:
|
||||
logger.info("Run onnxruntime with io binding on {} with input shape {}".format(
|
||||
model_name, [batch_size, sequence_length]))
|
||||
if disable_ort_io_binding:
|
||||
result = inference_ort(ort_session, ort_inputs, result_template, repeat_times, batch_size)
|
||||
else:
|
||||
# Get output sizes from a dummy ort run
|
||||
ort_outputs = ort_session.run(ort_output_names, ort_inputs)
|
||||
|
||||
data_type = numpy.longlong if 'pt' in model_source else numpy.int32
|
||||
result = inference_ort_with_io_binding(ort_session, ort_inputs, result_template, repeat_times,
|
||||
ort_output_names, ort_outputs, output_buffers, max_last_state_size,
|
||||
max_pooler_size, batch_size, device, data_type)
|
||||
logger.info(result)
|
||||
results.append(result)
|
||||
ort_output_names, ort_outputs, output_buffers,
|
||||
max_last_state_size, max_pooler_size, batch_size, device,
|
||||
data_type)
|
||||
logger.info(result)
|
||||
results.append(result)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def run_pytorch(use_gpu, model_names, model_class, precision, batch_sizes, sequence_lengths, repeat_times, torchscript,
|
||||
cache_dir, verbose):
|
||||
def run_pytorch(use_gpu, model_names, model_class, precision, num_threads, batch_sizes, sequence_lengths, repeat_times,
|
||||
torchscript, cache_dir, verbose):
|
||||
results = []
|
||||
if use_gpu and not torch.cuda.is_available():
|
||||
logger.error("Please install PyTorch with Cuda, and use a machine with GPU for testing gpu performance.")
|
||||
|
|
@ -223,6 +225,7 @@ def run_pytorch(use_gpu, model_names, model_class, precision, batch_sizes, seque
|
|||
"io_binding": "",
|
||||
"model_name": model_name,
|
||||
"inputs": 1,
|
||||
"threads": num_threads,
|
||||
"batch_size": batch_size,
|
||||
"sequence_length": sequence_length,
|
||||
"datetime": str(datetime.now()),
|
||||
|
|
@ -237,12 +240,12 @@ def run_pytorch(use_gpu, model_names, model_class, precision, batch_sizes, seque
|
|||
return results
|
||||
|
||||
|
||||
def run_tensorflow(use_gpu, model_names, model_class, precision, batch_sizes, sequence_lengths, repeat_times, thread_n, cache_dir,
|
||||
verbose):
|
||||
def run_tensorflow(use_gpu, model_names, model_class, precision, num_threads, batch_sizes, sequence_lengths,
|
||||
repeat_times, cache_dir, verbose):
|
||||
results = []
|
||||
|
||||
import tensorflow as tf
|
||||
tf.config.threading.set_intra_op_parallelism_threads(thread_n)
|
||||
tf.config.threading.set_intra_op_parallelism_threads(num_threads)
|
||||
|
||||
if not use_gpu:
|
||||
tf.config.set_visible_devices([], 'GPU')
|
||||
|
|
@ -251,7 +254,7 @@ def run_tensorflow(use_gpu, model_names, model_class, precision, batch_sizes, se
|
|||
logger.error("Please install Tensorflow-gpu, and use a machine with GPU for testing gpu performance.")
|
||||
return results
|
||||
|
||||
if use_gpu: # Restrict TensorFlow to only use the first GPU
|
||||
if use_gpu: # Restrict TensorFlow to only use the first GPU
|
||||
physical_devices = tf.config.list_physical_devices('GPU')
|
||||
try:
|
||||
tf.config.set_visible_devices(physical_devices[0], 'GPU')
|
||||
|
|
@ -264,11 +267,16 @@ def run_tensorflow(use_gpu, model_names, model_class, precision, batch_sizes, se
|
|||
for model_name in model_names:
|
||||
config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir)
|
||||
|
||||
model = load_pretrained_model(model_name, config=config, cache_dir=cache_dir, custom_model_class=model_class, is_tf_model=True)
|
||||
model = load_pretrained_model(model_name,
|
||||
config=config,
|
||||
cache_dir=cache_dir,
|
||||
custom_model_class=model_class,
|
||||
is_tf_model=True)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
|
||||
|
||||
max_input_size = tokenizer.max_model_input_sizes[model_name] if model_name in tokenizer.max_model_input_sizes else 1024
|
||||
max_input_size = tokenizer.max_model_input_sizes[
|
||||
model_name] if model_name in tokenizer.max_model_input_sizes else 1024
|
||||
|
||||
for batch_size in batch_sizes:
|
||||
if batch_size <= 0:
|
||||
|
|
@ -278,7 +286,8 @@ def run_tensorflow(use_gpu, model_names, model_class, precision, batch_sizes, se
|
|||
if max_input_size is not None and sequence_length > max_input_size:
|
||||
continue
|
||||
|
||||
logger.info("Run Tensorflow on {} with input shape {}".format(model_name, [batch_size, sequence_length]))
|
||||
logger.info("Run Tensorflow on {} with input shape {}".format(model_name,
|
||||
[batch_size, sequence_length]))
|
||||
|
||||
import random
|
||||
rng = random.Random()
|
||||
|
|
@ -286,6 +295,7 @@ def run_tensorflow(use_gpu, model_names, model_class, precision, batch_sizes, se
|
|||
input_ids = tf.constant(values, shape=(batch_size, sequence_length), dtype=tf.int32)
|
||||
|
||||
try:
|
||||
|
||||
def encoder_forward():
|
||||
return model(input_ids, training=False)
|
||||
|
||||
|
|
@ -307,6 +317,7 @@ def run_tensorflow(use_gpu, model_names, model_class, precision, batch_sizes, se
|
|||
"io_binding": "",
|
||||
"model_name": model_name,
|
||||
"inputs": 1,
|
||||
"threads": num_threads,
|
||||
"batch_size": batch_size,
|
||||
"sequence_length": sequence_length,
|
||||
"datetime": str(datetime.now()),
|
||||
|
|
@ -430,7 +441,7 @@ def parse_arguments():
|
|||
help='Disable running ONNX Runtime with binded inputs and outputs. ')
|
||||
parser.set_defaults(disable_ort_io_binding=False)
|
||||
|
||||
parser.add_argument("--thread_num", required=False, type=int, default=-1, help="Threads to use")
|
||||
parser.add_argument("-n", "--num_threads", required=False, nargs="+", type=int, default=[0], help="Threads to use")
|
||||
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
|
@ -449,6 +460,8 @@ def main():
|
|||
logger.error("int8 is for CPU only")
|
||||
return
|
||||
|
||||
args.num_threads = sorted(set(cpu_count if x <= 0 else x for x in args.num_threads))
|
||||
|
||||
logger.info(f"Arguments: {args}")
|
||||
|
||||
if not os.path.exists(args.cache_dir):
|
||||
|
|
@ -464,38 +477,39 @@ def main():
|
|||
|
||||
results = []
|
||||
|
||||
thread_n = cpu_count if args.thread_num <= 0 else args.thread_num
|
||||
torch.set_num_threads(thread_n)
|
||||
for num_threads in args.num_threads:
|
||||
torch.set_num_threads(num_threads)
|
||||
logger.debug(torch.__config__.parallel_info())
|
||||
if enable_torch or enable_torchscript:
|
||||
if args.input_counts != [1]:
|
||||
logger.warning("--input_counts is not implemented for torch or torchscript engine.")
|
||||
|
||||
logger.debug(torch.__config__.parallel_info())
|
||||
if enable_torchscript:
|
||||
results += run_pytorch(args.use_gpu, args.models, args.model_class, args.precision, num_threads,
|
||||
args.batch_sizes, args.sequence_lengths, args.test_times, True, args.cache_dir,
|
||||
args.verbose)
|
||||
|
||||
if enable_torch or enable_torchscript:
|
||||
if args.input_counts != [1]:
|
||||
logger.warning("--input_counts is not implemented for torch or torchscript engine.")
|
||||
if enable_torch:
|
||||
results += run_pytorch(args.use_gpu, args.models, args.model_class, args.precision, num_threads,
|
||||
args.batch_sizes, args.sequence_lengths, args.test_times, False, args.cache_dir,
|
||||
args.verbose)
|
||||
|
||||
if enable_torchscript:
|
||||
results += run_pytorch(args.use_gpu, args.models, args.model_class, args.precision, args.batch_sizes,
|
||||
args.sequence_lengths, args.test_times, True, args.cache_dir, args.verbose)
|
||||
if enable_tensorflow:
|
||||
results += run_tensorflow(args.use_gpu, args.models, args.model_class, args.precision, num_threads,
|
||||
args.batch_sizes, args.sequence_lengths, args.test_times, args.cache_dir,
|
||||
args.verbose)
|
||||
|
||||
if enable_torch:
|
||||
results += run_pytorch(args.use_gpu, args.models, args.model_class, args.precision, args.batch_sizes,
|
||||
args.sequence_lengths, args.test_times, False, args.cache_dir, args.verbose)
|
||||
|
||||
if enable_tensorflow:
|
||||
results += run_tensorflow(args.use_gpu, args.models, args.model_class, args.precision, args.batch_sizes, args.sequence_lengths,
|
||||
args.test_times, thread_n, args.cache_dir, args.verbose)
|
||||
|
||||
model_fusion_statistics = {}
|
||||
if enable_onnxruntime:
|
||||
try:
|
||||
use_raw_attention_mask = True
|
||||
results += run_onnxruntime(args.use_gpu, args.models, args.model_class, args.precision, args.batch_sizes,
|
||||
args.sequence_lengths, args.test_times, args.input_counts, args.optimize_onnx,
|
||||
args.validate_onnx, args.cache_dir, args.onnx_dir, args.verbose, args.overwrite,
|
||||
args.disable_ort_io_binding, use_raw_attention_mask, args.thread_num,
|
||||
model_fusion_statistics, args.model_source)
|
||||
except:
|
||||
logger.error(f"Exception", exc_info=True)
|
||||
model_fusion_statistics = {}
|
||||
if enable_onnxruntime:
|
||||
try:
|
||||
use_raw_attention_mask = True
|
||||
results += run_onnxruntime(args.use_gpu, args.models, args.model_class, args.precision, num_threads,
|
||||
args.batch_sizes, args.sequence_lengths, args.test_times, args.input_counts,
|
||||
args.optimize_onnx, args.validate_onnx, args.cache_dir, args.onnx_dir,
|
||||
args.verbose, args.overwrite, args.disable_ort_io_binding,
|
||||
use_raw_attention_mask, model_fusion_statistics, args.model_source)
|
||||
except:
|
||||
logger.error(f"Exception", exc_info=True)
|
||||
|
||||
time_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
|
||||
if model_fusion_statistics:
|
||||
|
|
|
|||
|
|
@ -113,8 +113,8 @@ def get_latency_result(runtimes, batch_size):
|
|||
def output_details(results, csv_filename):
|
||||
with open(csv_filename, mode="a", newline='') as csv_file:
|
||||
column_names = [
|
||||
"engine", "version", "device", "precision", "optimizer", "io_binding", "model_name", "inputs", "batch_size",
|
||||
"sequence_length", "datetime", "test_times", "QPS", "average_latency_ms", "latency_variance",
|
||||
"engine", "version", "device", "precision", "optimizer", "io_binding", "model_name", "inputs", "threads",
|
||||
"batch_size", "sequence_length", "datetime", "test_times", "QPS", "average_latency_ms", "latency_variance",
|
||||
"latency_90_percentile", "latency_95_percentile", "latency_99_percentile"
|
||||
]
|
||||
|
||||
|
|
@ -128,7 +128,9 @@ def output_details(results, csv_filename):
|
|||
|
||||
def output_summary(results, csv_filename, args):
|
||||
with open(csv_filename, mode="a", newline='') as csv_file:
|
||||
header_names = ["model_name", "inputs", "engine", "version", "device", "precision", "optimizer", "io_binding"]
|
||||
header_names = [
|
||||
"model_name", "inputs", "engine", "version", "device", "precision", "optimizer", "io_binding", "threads"
|
||||
]
|
||||
data_names = []
|
||||
for batch_size in args.batch_sizes:
|
||||
for sequence_length in args.sequence_lengths:
|
||||
|
|
@ -140,22 +142,24 @@ def output_summary(results, csv_filename, args):
|
|||
for input_count in [1, 2, 3]:
|
||||
for engine_name in args.engines:
|
||||
for io_binding in [True, False, ""]:
|
||||
row = {}
|
||||
for result in results:
|
||||
if result["model_name"] == model_name and result["inputs"] == input_count and result[
|
||||
"engine"] == engine_name and result["io_binding"] == io_binding:
|
||||
headers = {k: v for k, v in result.items() if k in header_names}
|
||||
if not row:
|
||||
row.update(headers)
|
||||
row.update({k: "" for k in data_names})
|
||||
else:
|
||||
for k in header_names:
|
||||
assert row[k] == headers[k]
|
||||
b = result["batch_size"]
|
||||
s = result["sequence_length"]
|
||||
row[f"b{b}_s{s}"] = result["average_latency_ms"]
|
||||
if row:
|
||||
csv_writer.writerow(row)
|
||||
for threads in args.num_threads:
|
||||
row = {}
|
||||
for result in results:
|
||||
if result["model_name"] == model_name and result["inputs"] == input_count and result[
|
||||
"engine"] == engine_name and result["io_binding"] == io_binding and result[
|
||||
"threads"] == threads:
|
||||
headers = {k: v for k, v in result.items() if k in header_names}
|
||||
if not row:
|
||||
row.update(headers)
|
||||
row.update({k: "" for k in data_names})
|
||||
else:
|
||||
for k in header_names:
|
||||
assert row[k] == headers[k]
|
||||
b = result["batch_size"]
|
||||
s = result["sequence_length"]
|
||||
row[f"b{b}_s{s}"] = result["average_latency_ms"]
|
||||
if row:
|
||||
csv_writer.writerow(row)
|
||||
|
||||
logger.info(f"Summary results are saved to csv file: {csv_filename}")
|
||||
|
||||
|
|
@ -185,8 +189,18 @@ def inference_ort(ort_session, ort_inputs, result_template, repeat_times, batch_
|
|||
return result
|
||||
|
||||
|
||||
def inference_ort_with_io_binding(ort_session, ort_inputs, result_template, repeat_times, ort_output_names, ort_outputs,
|
||||
output_buffers, max_last_state_size, max_pooler_size, batch_size, device, data_type=numpy.longlong):
|
||||
def inference_ort_with_io_binding(ort_session,
|
||||
ort_inputs,
|
||||
result_template,
|
||||
repeat_times,
|
||||
ort_output_names,
|
||||
ort_outputs,
|
||||
output_buffers,
|
||||
max_last_state_size,
|
||||
max_pooler_size,
|
||||
batch_size,
|
||||
device,
|
||||
data_type=numpy.longlong):
|
||||
result = {}
|
||||
|
||||
# Bind inputs and outputs to onnxruntime session
|
||||
|
|
|
|||
Loading…
Reference in a new issue