Allow benchmarking with different thread counts (#5390)

This commit is contained in:
Tianlei Wu 2020-10-07 11:13:01 -07:00 committed by GitHub
parent 094384781e
commit 8ee2b08325
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 111 additions and 83 deletions

View file

@ -58,7 +58,8 @@ logger = logging.getLogger('')
from huggingface_models import MODELS, MODEL_CLASSES
cpu_count = psutil.cpu_count(logical=True)
cpu_count = psutil.cpu_count(logical=False)
# Set OMP environment variable before importing onnxruntime or torch.
if "OMP_NUM_THREADS" not in os.environ:
os.environ["OMP_NUM_THREADS"] = str(cpu_count)
@ -67,9 +68,9 @@ import torch
from transformers import (AutoConfig, AutoTokenizer, AutoModel, GPT2Model)
def run_onnxruntime(use_gpu, model_names, model_class, precision, batch_sizes, sequence_lengths, repeat_times,
input_counts, optimize_onnx, validate_onnx, cache_dir, onnx_dir, verbose, overwrite,
disable_ort_io_binding, use_raw_attention_mask, thread_num, model_fusion_statistics, model_source):
def run_onnxruntime(use_gpu, model_names, model_class, precision, num_threads, batch_sizes, sequence_lengths,
repeat_times, input_counts, optimize_onnx, validate_onnx, cache_dir, onnx_dir, verbose, overwrite,
disable_ort_io_binding, use_raw_attention_mask, model_fusion_statistics, model_source):
import onnxruntime
results = []
@ -93,14 +94,14 @@ def run_onnxruntime(use_gpu, model_names, model_class, precision, batch_sizes, s
if 'pt' in model_source:
with torch.no_grad():
onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length = export_onnx_model_from_pt(
model_name, MODELS[model_name][1], MODELS[model_name][2], MODELS[model_name][3], model_class, cache_dir,
onnx_dir, input_names, use_gpu, precision, optimize_onnx, validate_onnx, use_raw_attention_mask, overwrite,
model_fusion_statistics)
model_name, MODELS[model_name][1], MODELS[model_name][2], MODELS[model_name][3], model_class,
cache_dir, onnx_dir, input_names, use_gpu, precision, optimize_onnx, validate_onnx,
use_raw_attention_mask, overwrite, model_fusion_statistics)
if 'tf' in model_source:
onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length = export_onnx_model_from_tf(
model_name, MODELS[model_name][1], MODELS[model_name][2], MODELS[model_name][3], model_class, cache_dir,
onnx_dir, input_names, use_gpu, precision, optimize_onnx, validate_onnx, use_raw_attention_mask, overwrite,
model_fusion_statistics)
model_name, MODELS[model_name][1], MODELS[model_name][2], MODELS[model_name][3], model_class,
cache_dir, onnx_dir, input_names, use_gpu, precision, optimize_onnx, validate_onnx,
use_raw_attention_mask, overwrite, model_fusion_statistics)
if not is_valid_onnx_model:
continue
@ -108,7 +109,8 @@ def run_onnxruntime(use_gpu, model_names, model_class, precision, batch_sizes, s
ort_session = create_onnxruntime_session(onnx_model_file,
use_gpu,
enable_all_optimization=True,
num_threads=thread_num)
num_threads=num_threads,
verbose=verbose)
if ort_session is None:
continue
@ -128,7 +130,8 @@ def run_onnxruntime(use_gpu, model_names, model_class, precision, batch_sizes, s
continue
input_value_type = numpy.int64 if 'pt' in model_source else numpy.int32
ort_inputs = create_onnxruntime_input(vocab_size, batch_size, sequence_length, input_names, input_value_type)
ort_inputs = create_onnxruntime_input(vocab_size, batch_size, sequence_length, input_names,
input_value_type)
result_template = {
"engine": "onnxruntime",
@ -136,37 +139,36 @@ def run_onnxruntime(use_gpu, model_names, model_class, precision, batch_sizes, s
"device": device,
"optimizer": optimize_onnx,
"precision": precision,
"io_binding": False,
"io_binding": not disable_ort_io_binding,
"model_name": model_name,
"inputs": num_inputs,
"threads": num_threads,
"batch_size": batch_size,
"sequence_length": sequence_length,
"datetime": str(datetime.now()),
}
logger.info("Run onnxruntime on {} with input shape {}".format(model_name,
[batch_size, sequence_length]))
result = inference_ort(ort_session, ort_inputs, result_template, repeat_times, batch_size)
logger.info(result)
results.append(result)
if not disable_ort_io_binding:
logger.info("Run onnxruntime with io binding on {} with input shape {}".format(
model_name, [batch_size, sequence_length]))
if disable_ort_io_binding:
result = inference_ort(ort_session, ort_inputs, result_template, repeat_times, batch_size)
else:
# Get output sizes from a dummy ort run
ort_outputs = ort_session.run(ort_output_names, ort_inputs)
data_type = numpy.longlong if 'pt' in model_source else numpy.int32
result = inference_ort_with_io_binding(ort_session, ort_inputs, result_template, repeat_times,
ort_output_names, ort_outputs, output_buffers, max_last_state_size,
max_pooler_size, batch_size, device, data_type)
logger.info(result)
results.append(result)
ort_output_names, ort_outputs, output_buffers,
max_last_state_size, max_pooler_size, batch_size, device,
data_type)
logger.info(result)
results.append(result)
return results
def run_pytorch(use_gpu, model_names, model_class, precision, batch_sizes, sequence_lengths, repeat_times, torchscript,
cache_dir, verbose):
def run_pytorch(use_gpu, model_names, model_class, precision, num_threads, batch_sizes, sequence_lengths, repeat_times,
torchscript, cache_dir, verbose):
results = []
if use_gpu and not torch.cuda.is_available():
logger.error("Please install PyTorch with Cuda, and use a machine with GPU for testing gpu performance.")
@ -223,6 +225,7 @@ def run_pytorch(use_gpu, model_names, model_class, precision, batch_sizes, seque
"io_binding": "",
"model_name": model_name,
"inputs": 1,
"threads": num_threads,
"batch_size": batch_size,
"sequence_length": sequence_length,
"datetime": str(datetime.now()),
@ -237,12 +240,12 @@ def run_pytorch(use_gpu, model_names, model_class, precision, batch_sizes, seque
return results
def run_tensorflow(use_gpu, model_names, model_class, precision, batch_sizes, sequence_lengths, repeat_times, thread_n, cache_dir,
verbose):
def run_tensorflow(use_gpu, model_names, model_class, precision, num_threads, batch_sizes, sequence_lengths,
repeat_times, cache_dir, verbose):
results = []
import tensorflow as tf
tf.config.threading.set_intra_op_parallelism_threads(thread_n)
tf.config.threading.set_intra_op_parallelism_threads(num_threads)
if not use_gpu:
tf.config.set_visible_devices([], 'GPU')
@ -251,7 +254,7 @@ def run_tensorflow(use_gpu, model_names, model_class, precision, batch_sizes, se
logger.error("Please install Tensorflow-gpu, and use a machine with GPU for testing gpu performance.")
return results
if use_gpu: # Restrict TensorFlow to only use the first GPU
if use_gpu: # Restrict TensorFlow to only use the first GPU
physical_devices = tf.config.list_physical_devices('GPU')
try:
tf.config.set_visible_devices(physical_devices[0], 'GPU')
@ -264,11 +267,16 @@ def run_tensorflow(use_gpu, model_names, model_class, precision, batch_sizes, se
for model_name in model_names:
config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir)
model = load_pretrained_model(model_name, config=config, cache_dir=cache_dir, custom_model_class=model_class, is_tf_model=True)
model = load_pretrained_model(model_name,
config=config,
cache_dir=cache_dir,
custom_model_class=model_class,
is_tf_model=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
max_input_size = tokenizer.max_model_input_sizes[model_name] if model_name in tokenizer.max_model_input_sizes else 1024
max_input_size = tokenizer.max_model_input_sizes[
model_name] if model_name in tokenizer.max_model_input_sizes else 1024
for batch_size in batch_sizes:
if batch_size <= 0:
@ -278,7 +286,8 @@ def run_tensorflow(use_gpu, model_names, model_class, precision, batch_sizes, se
if max_input_size is not None and sequence_length > max_input_size:
continue
logger.info("Run Tensorflow on {} with input shape {}".format(model_name, [batch_size, sequence_length]))
logger.info("Run Tensorflow on {} with input shape {}".format(model_name,
[batch_size, sequence_length]))
import random
rng = random.Random()
@ -286,6 +295,7 @@ def run_tensorflow(use_gpu, model_names, model_class, precision, batch_sizes, se
input_ids = tf.constant(values, shape=(batch_size, sequence_length), dtype=tf.int32)
try:
def encoder_forward():
return model(input_ids, training=False)
@ -307,6 +317,7 @@ def run_tensorflow(use_gpu, model_names, model_class, precision, batch_sizes, se
"io_binding": "",
"model_name": model_name,
"inputs": 1,
"threads": num_threads,
"batch_size": batch_size,
"sequence_length": sequence_length,
"datetime": str(datetime.now()),
@ -430,7 +441,7 @@ def parse_arguments():
help='Disable running ONNX Runtime with binded inputs and outputs. ')
parser.set_defaults(disable_ort_io_binding=False)
parser.add_argument("--thread_num", required=False, type=int, default=-1, help="Threads to use")
parser.add_argument("-n", "--num_threads", required=False, nargs="+", type=int, default=[0], help="Threads to use")
args = parser.parse_args()
return args
@ -449,6 +460,8 @@ def main():
logger.error("int8 is for CPU only")
return
args.num_threads = sorted(set(cpu_count if x <= 0 else x for x in args.num_threads))
logger.info(f"Arguments: {args}")
if not os.path.exists(args.cache_dir):
@ -464,38 +477,39 @@ def main():
results = []
thread_n = cpu_count if args.thread_num <= 0 else args.thread_num
torch.set_num_threads(thread_n)
for num_threads in args.num_threads:
torch.set_num_threads(num_threads)
logger.debug(torch.__config__.parallel_info())
if enable_torch or enable_torchscript:
if args.input_counts != [1]:
logger.warning("--input_counts is not implemented for torch or torchscript engine.")
logger.debug(torch.__config__.parallel_info())
if enable_torchscript:
results += run_pytorch(args.use_gpu, args.models, args.model_class, args.precision, num_threads,
args.batch_sizes, args.sequence_lengths, args.test_times, True, args.cache_dir,
args.verbose)
if enable_torch or enable_torchscript:
if args.input_counts != [1]:
logger.warning("--input_counts is not implemented for torch or torchscript engine.")
if enable_torch:
results += run_pytorch(args.use_gpu, args.models, args.model_class, args.precision, num_threads,
args.batch_sizes, args.sequence_lengths, args.test_times, False, args.cache_dir,
args.verbose)
if enable_torchscript:
results += run_pytorch(args.use_gpu, args.models, args.model_class, args.precision, args.batch_sizes,
args.sequence_lengths, args.test_times, True, args.cache_dir, args.verbose)
if enable_tensorflow:
results += run_tensorflow(args.use_gpu, args.models, args.model_class, args.precision, num_threads,
args.batch_sizes, args.sequence_lengths, args.test_times, args.cache_dir,
args.verbose)
if enable_torch:
results += run_pytorch(args.use_gpu, args.models, args.model_class, args.precision, args.batch_sizes,
args.sequence_lengths, args.test_times, False, args.cache_dir, args.verbose)
if enable_tensorflow:
results += run_tensorflow(args.use_gpu, args.models, args.model_class, args.precision, args.batch_sizes, args.sequence_lengths,
args.test_times, thread_n, args.cache_dir, args.verbose)
model_fusion_statistics = {}
if enable_onnxruntime:
try:
use_raw_attention_mask = True
results += run_onnxruntime(args.use_gpu, args.models, args.model_class, args.precision, args.batch_sizes,
args.sequence_lengths, args.test_times, args.input_counts, args.optimize_onnx,
args.validate_onnx, args.cache_dir, args.onnx_dir, args.verbose, args.overwrite,
args.disable_ort_io_binding, use_raw_attention_mask, args.thread_num,
model_fusion_statistics, args.model_source)
except:
logger.error(f"Exception", exc_info=True)
model_fusion_statistics = {}
if enable_onnxruntime:
try:
use_raw_attention_mask = True
results += run_onnxruntime(args.use_gpu, args.models, args.model_class, args.precision, num_threads,
args.batch_sizes, args.sequence_lengths, args.test_times, args.input_counts,
args.optimize_onnx, args.validate_onnx, args.cache_dir, args.onnx_dir,
args.verbose, args.overwrite, args.disable_ort_io_binding,
use_raw_attention_mask, model_fusion_statistics, args.model_source)
except:
logger.error(f"Exception", exc_info=True)
time_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
if model_fusion_statistics:

View file

@ -113,8 +113,8 @@ def get_latency_result(runtimes, batch_size):
def output_details(results, csv_filename):
with open(csv_filename, mode="a", newline='') as csv_file:
column_names = [
"engine", "version", "device", "precision", "optimizer", "io_binding", "model_name", "inputs", "batch_size",
"sequence_length", "datetime", "test_times", "QPS", "average_latency_ms", "latency_variance",
"engine", "version", "device", "precision", "optimizer", "io_binding", "model_name", "inputs", "threads",
"batch_size", "sequence_length", "datetime", "test_times", "QPS", "average_latency_ms", "latency_variance",
"latency_90_percentile", "latency_95_percentile", "latency_99_percentile"
]
@ -128,7 +128,9 @@ def output_details(results, csv_filename):
def output_summary(results, csv_filename, args):
with open(csv_filename, mode="a", newline='') as csv_file:
header_names = ["model_name", "inputs", "engine", "version", "device", "precision", "optimizer", "io_binding"]
header_names = [
"model_name", "inputs", "engine", "version", "device", "precision", "optimizer", "io_binding", "threads"
]
data_names = []
for batch_size in args.batch_sizes:
for sequence_length in args.sequence_lengths:
@ -140,22 +142,24 @@ def output_summary(results, csv_filename, args):
for input_count in [1, 2, 3]:
for engine_name in args.engines:
for io_binding in [True, False, ""]:
row = {}
for result in results:
if result["model_name"] == model_name and result["inputs"] == input_count and result[
"engine"] == engine_name and result["io_binding"] == io_binding:
headers = {k: v for k, v in result.items() if k in header_names}
if not row:
row.update(headers)
row.update({k: "" for k in data_names})
else:
for k in header_names:
assert row[k] == headers[k]
b = result["batch_size"]
s = result["sequence_length"]
row[f"b{b}_s{s}"] = result["average_latency_ms"]
if row:
csv_writer.writerow(row)
for threads in args.num_threads:
row = {}
for result in results:
if result["model_name"] == model_name and result["inputs"] == input_count and result[
"engine"] == engine_name and result["io_binding"] == io_binding and result[
"threads"] == threads:
headers = {k: v for k, v in result.items() if k in header_names}
if not row:
row.update(headers)
row.update({k: "" for k in data_names})
else:
for k in header_names:
assert row[k] == headers[k]
b = result["batch_size"]
s = result["sequence_length"]
row[f"b{b}_s{s}"] = result["average_latency_ms"]
if row:
csv_writer.writerow(row)
logger.info(f"Summary results are saved to csv file: {csv_filename}")
@ -185,8 +189,18 @@ def inference_ort(ort_session, ort_inputs, result_template, repeat_times, batch_
return result
def inference_ort_with_io_binding(ort_session, ort_inputs, result_template, repeat_times, ort_output_names, ort_outputs,
output_buffers, max_last_state_size, max_pooler_size, batch_size, device, data_type=numpy.longlong):
def inference_ort_with_io_binding(ort_session,
ort_inputs,
result_template,
repeat_times,
ort_output_names,
ort_outputs,
output_buffers,
max_last_state_size,
max_pooler_size,
batch_size,
device,
data_type=numpy.longlong):
result = {}
# Bind inputs and outputs to onnxruntime session