onnxruntime/onnxruntime/python/tools/tensorrt/perf/benchmark.py
KeDengMS ce3b67e0cd
[Python] Move symbolic_shape_infer from nuphar to tools (#5162)
* [Python] Move symbolic shape inference from nuphar to tools

* Fix PEP8 ERROR
2020-09-18 09:31:06 -07:00

1267 lines
49 KiB
Python

import os
import csv
import timeit
from datetime import datetime
import numpy
import logging
import coloredlogs
import numpy as np
import argparse
import copy
import json
import re
import sys
import onnxruntime
from onnx import numpy_helper
from perf_utils import *
import pprint
import time
from float16 import *
# import torch
debug = False
sys.path.append('.')
logger = logging.getLogger('')
# Maps each benchmark target name to the ordered provider list that is passed
# to onnxruntime.InferenceSession for that target. The "_fp16" targets use the
# same providers as their fp32 counterparts.
ep_to_provider_list = {
    "CPUExecutionProvider": ["CPUExecutionProvider"],
    "CUDAExecutionProvider": ["CUDAExecutionProvider"],
    "CUDAExecutionProvider_fp16": ["CUDAExecutionProvider"],
    "TensorrtExecutionProvider": ["TensorrtExecutionProvider", "CUDAExecutionProvider"],
    "TensorrtExecutionProvider_fp16": ["TensorrtExecutionProvider", "CUDAExecutionProvider"],
}
def _extract_ms_value(report_line, label):
    """Return the text between ``label`` and the following ``ms`` in a trtexec report line."""
    # +1 skips the blank that follows the label (e.g. "mean: 1.23 ms").
    start = report_line.find(label) + len(label) + 1
    # Search for the unit only after the value starts so an earlier "ms"
    # in the line cannot truncate the slice.
    end = report_line.find('ms', start)
    if end == -1:
        raise ValueError("no 'ms' unit found in trtexec line: " + report_line)
    return report_line[start:end].strip()


def run_trt_standalone(trtexec, model_path, ort_inputs, all_inputs_shape, fp16):
    """Benchmark a model with the standalone ``trtexec`` tool.

    Args:
        trtexec: Path of the ``trtexec`` executable.
        model_path: Path of the ONNX model, passed as ``--onnx=<model_path>``.
        ort_inputs: Session input descriptors; only their ``name`` is read.
        all_inputs_shape: Shapes indexed like ``ort_inputs``; entries beyond
            ``len(ort_inputs)`` are ignored.
        fp16: When true, ``--fp16`` is added to the command line.

    Returns:
        A dict with ``average_latency_ms`` and ``latency_90_percentile``
        (both strings), or ``None`` when trtexec cannot be run or its output
        cannot be parsed.
    """
    print(all_inputs_shape)

    input_shape = []
    for i, ort_input in enumerate(ort_inputs):
        dims = "x".join(str(dim) for dim in all_inputs_shape[i])
        input_shape.append(ort_input.name + ':' + dims)
    shapes_arg = '--optShapes=' + ','.join(input_shape)
    print(shapes_arg)

    command = [trtexec, "--onnx=" + model_path]
    if fp16:
        command.append("--fp16")
    command += ["--percentile=90", "--explicitBatch", shapes_arg]

    try:
        p1 = subprocess.Popen(command, stdout=subprocess.PIPE)
        stdout, _ = p1.communicate()
        print(stdout)

        # Collect the "mean:" / "percentile:" report lines in output order.
        target_list = []
        for line in stdout.decode("ascii").strip().split("\n"):
            if 'mean:' in line:
                target_list.append(line)
            if 'percentile:' in line:
                target_list.append(line)

        # The third and fourth collected lines are the ones reported
        # (presumably the second mean/percentile pair printed by trtexec).
        result = {
            "average_latency_ms": _extract_ms_value(target_list[2], 'mean:'),
            "latency_90_percentile": _extract_ms_value(target_list[3], 'percentile:'),
        }
        print(result)
        return result
    except Exception as e:
        logger.info("trtexec fails...")
        # Record the reason as well, otherwise the failure is undiagnosable.
        logger.error(e)
        return None
def get_latency_result(runtimes, batch_size):
    """Summarize a list of run durations into formatted latency statistics.

    Args:
        runtimes: Measured durations; every value is multiplied by 1000
            before being reported under the ``*_ms`` / percentile keys.
        batch_size: Multiplier applied to the runs-per-second figure.

    Returns:
        A dict holding the run count plus two-decimal strings for variance,
        the 90th/95th/99th percentiles, the mean latency and the QPS.
    """
    run_count = len(runtimes)
    mean_latency_ms = sum(runtimes) / float(run_count) * 1000.0
    scaled_variance = numpy.var(runtimes, dtype=numpy.float64) * 1000.0
    qps = batch_size * (1000.0 / mean_latency_ms)

    summary = {
        "test_times": run_count,
        "latency_variance": f"{scaled_variance:.2f}",
    }
    for pct in (90, 95, 99):
        pct_latency_ms = numpy.percentile(runtimes, pct) * 1000.0
        summary[f"latency_{pct}_percentile"] = f"{pct_latency_ms:.2f}"
    summary["average_latency_ms"] = f"{mean_latency_ms:.2f}"
    summary["QPS"] = f"{qps:.2f}"
    return summary
def get_ort_session_inputs_and_outptus(name, session, ort_input):
    """Build the feed dict and requested output names for one inference call.

    A few models are special-cased by ``name``; every other model maps the
    session's inputs, in order, onto the elements of ``ort_input``.

    Returns:
        A ``(sess_inputs, sess_outputs)`` tuple. ``sess_outputs`` is ``None``
        unless the model has a hard-coded output list.
    """
    if name == 'BERT-Squad':
        unique_ids_raw_output, input_ids, input_mask, segment_ids = (
            ort_input[0], ort_input[1], ort_input[2], ort_input[3])
        feeds = {
            "unique_ids_raw_output___9:0": unique_ids_raw_output,
            # only the first entry of each of these inputs is fed
            "input_ids:0": input_ids[0:1],
            "input_mask:0": input_mask[0:1],
            "segment_ids:0": segment_ids[0:1],
        }
        return (feeds, ["unique_ids:0", "unstack:0", "unstack:1"])

    if name == 'BiDAF':
        feeds = {
            "context_word": ort_input[0],
            "context_char": ort_input[2],
            "query_word": ort_input[1],
            "query_char": ort_input[3],
        }
        return (feeds, ["start_pos", "end_pos"])

    if name == 'Yolov4':
        first_tensor = ort_input[0]
        return ({session.get_inputs()[0].name: first_tensor}, ['Identity:0'])

    if name == 'Shufflenet-v2':
        # the whole ort_input object is fed to the single model input
        return ({session.get_inputs()[0].name: ort_input}, None)

    feeds = {}
    for position, input_meta in enumerate(session.get_inputs()):
        feeds[input_meta.name] = ort_input[position]
    return (feeds, None)
def inference_ort(args, name, session, ep, ort_inputs, result_template, repeat_times, batch_size):
    """Time ``session.run`` over every input set and summarize the latencies.

    Args:
        args: Parsed options; only ``args.input_data`` is read.
        name: Model name, used to select the feed/output layout.
        session: Inference session whose ``run`` method is timed.
        ep: Execution provider name (not used here).
        ort_inputs: Iterable of input sets, one timed batch of runs each.
        result_template: Dict copied into the returned result.
        repeat_times: Requested number of timed runs per input set.
        batch_size: Forwarded to ``get_latency_result``.

    Returns:
        The result dict, or ``None`` when a run raises or no timing remains
        after the warm-up measurement has been dropped.
    """
    # Decide the repeat count once. Incrementing it inside the loop would
    # make every further input set run one more time than the previous one.
    if args.input_data == "random":
        repeat_times = 1  # warm-up run is included in ort_inputs
    else:
        repeat_times += 1  # add warm-up run

    runtimes = []
    for ort_input in ort_inputs:
        sess_inputs, sess_outputs = get_ort_session_inputs_and_outptus(name, session, ort_input)
        print("sess_inputs:")
        print(sess_inputs)
        print("sess_outputs:")
        print(sess_outputs)

        try:
            runtimes += timeit.repeat(lambda: session.run(sess_outputs, sess_inputs), number=1, repeat=repeat_times)
        except Exception as e:
            logger.error(e)
            return None

    print(runtimes)
    runtimes = runtimes[1:]  # drop the first (warm-up) measurement
    print(runtimes)

    if not runtimes:
        # Nothing left to average; get_latency_result would divide by zero.
        logger.error("No latency measurement left for {} after dropping the warm-up run".format(name))
        return None

    result = {}
    result.update(result_template)
    result.update({"io_binding": False})
    result.update(get_latency_result(runtimes, batch_size))
    return result
def inference_ort_and_get_prediction(name, session, ort_inputs):
    """Run the session once per input set and collect the predictions.

    Returns:
        A list with one entry per input set, or ``None`` as soon as a run
        raises (the exception is logged).
    """
    predictions = []
    for ort_input in ort_inputs:
        feeds, output_names = get_ort_session_inputs_and_outptus(name, session, ort_input)
        print("sess_inputs:")
        print(feeds)
        print("sess_outputs:")
        print(output_names)

        try:
            raw = session.run(output_names, feeds)

            # the stored layout differs per model
            if name == 'BERT-Squad':
                prediction = [raw]
            elif name == 'Shufflenet-v2':
                prediction = raw[0]
            else:
                prediction = raw
            predictions.append(prediction)
        except Exception as e:
            logger.error(e)
            return None

    return predictions
# not used by this script yet
def inference_ort_with_io_binding(model, ort_inputs, result_template, repeat_times, batch_size, device='cuda'):
    """Time inference with inputs bound to ``device`` through IO binding.

    Args:
        model: Object providing ``get_session()`` and ``get_model_name()``.
        ort_inputs: Iterable of input sets (sequences of numpy arrays).
        result_template: Dict copied into the returned result.
        repeat_times: Number of timed runs per input set.
        batch_size: Forwarded to ``get_latency_result``.
        device: Torch device the inputs are copied to before binding.

    Returns:
        The result dict, or ``None`` when a run raises.
    """
    # torch is only needed by this code path, so import it here; the
    # module-level import is commented out.
    import torch

    runtimes = []

    session = model.get_session()
    # Bind inputs and outputs to onnxruntime session
    io_binding = session.io_binding()

    for ort_input in ort_inputs:
        # bind_input() receives a raw data pointer, so the device tensors must
        # stay referenced until the timed runs for this input set are done
        # (presumably the binding does not take ownership - TODO confirm).
        bound_tensors = []

        # Bind inputs to device
        if model.get_model_name() == 'BERT-Squad':
            arrays = [ort_input[0], ort_input[1][0:1], ort_input[2][0:1], ort_input[3][0:1]]
            for position, array in enumerate(arrays):
                name = session.get_inputs()[position].name
                print(name)
                np_input = torch.from_numpy(array).to(device)
                bound_tensors.append(np_input)
                io_binding.bind_input(name, np_input.device.type, 0, numpy.longlong, np_input.shape, np_input.data_ptr())
        else:
            name = session.get_inputs()[0].name
            print(ort_input[0])
            np_input = torch.from_numpy(ort_input[0]).to(device)
            bound_tensors.append(np_input)
            io_binding.bind_input(name, np_input.device.type, 0, numpy.float32, np_input.shape, np_input.data_ptr())

        name_o = session.get_outputs()[0].name
        io_binding.bind_output(name_o)

        try:
            runtimes = runtimes + timeit.repeat(lambda: session.run_with_iobinding(io_binding), number=1, repeat=repeat_times)
        except Exception as e:
            logger.error(e)
            return None

    print(runtimes)

    result = {}
    result.update(result_template)
    result.update({"io_binding": True})
    result.update(get_latency_result(runtimes, batch_size))
    return result
def _ldd_lines_for_library(library_file, needle):
    """Return the ``ldd`` output lines of ``library_file`` that contain ``needle``.

    ``library_file`` is looked up with ``find`` below ``~/.local/lib/``; when
    several files match, the first one reported is inspected. Returns an empty
    string when no file is found or no ``ldd`` line contains ``needle``.
    """
    from pathlib import Path

    search_root = str(Path.home()) + "/.local/lib/"
    found = subprocess.run(["find", search_root, "-name", library_file], stdout=subprocess.PIPE).stdout
    candidates = found.decode("ascii").strip().splitlines()
    if not candidates:
        return ""

    ldd_output = subprocess.run(["ldd", candidates[0]], stdout=subprocess.PIPE).stdout.decode("ascii")
    matching = [line for line in ldd_output.splitlines() if needle in line]
    return "\n".join(matching).strip()


def get_cuda_version():
    """Return the ``libcudart.so`` dependency line(s) of the onnxruntime Python binding, or ""."""
    return _ldd_lines_for_library("onnxruntime_pybind11_state.so", "libcudart.so")


def get_trt_version():
    """Return the ``libnvinfer.so`` dependency line(s) of onnxruntime, or "".

    The Python binding is inspected first; when it has no ``libnvinfer.so``
    dependency, ``libonnxruntime_providers_tensorrt.so`` is inspected instead.
    """
    stdout = _ldd_lines_for_library("onnxruntime_pybind11_state.so", "libnvinfer.so")
    if stdout == "":
        stdout = _ldd_lines_for_library("libonnxruntime_providers_tensorrt.so", "libnvinfer.so")
    return stdout
# temporarily not used by this script
def tmp_get_trt_version():
    """Return a TensorRT version description, or "" when none is found.

    The ``dpkg -l`` package list is searched first. If it has no matching
    entry, the last space-separated token of the ``version`` lines that
    ``readelf -s`` prints for an installed ``libnvinfer.so`` is returned.
    """
    def _grep(command, pattern):
        """Run ``command | grep pattern`` and return the stripped output."""
        producer = subprocess.Popen(command, stdout=subprocess.PIPE)
        consumer = subprocess.Popen(["grep", pattern], stdin=producer.stdout, stdout=subprocess.PIPE)
        producer.stdout.close()  # only grep should hold the read end of the pipe
        output, _ = consumer.communicate()
        producer.wait()  # reap the producer so no zombie process is left behind
        return output.decode("ascii").strip()

    packages = _grep(["dpkg", "-l"], "TensorRT runtime libraries")
    if packages != "":
        return re.sub(r'\s+', ' ', packages)

    library_candidates = (
        "/usr/lib/x86_64-linux-gnu/libnvinfer.so",
        "/usr/lib/aarch64-linux-gnu/libnvinfer.so",
    )
    for library in library_candidates:
        if os.path.exists(library):
            return _grep(["readelf", "-s", library], "version").split(" ")[-1]

    return ""
#
# The following two lists will be generated.
#
# inputs: [[test_data_0_input_0.pb, test_data_0_input_1.pb ...], [test_data_1_input_0.pb, test_data_1_input_1.pb ...] ...]
# outputs: [[test_data_0_output_0.pb, test_data_0_output_1.pb ...], [test_data_1_output_0.pb, test_data_1_output_1.pb ...] ...]
#
def load_onnx_model_zoo_test_data(path, all_inputs_shape, data_type="fp32"):
    """Load the serialized input/output tensors found below ``path``.

    Every directory named ``test_data*`` below ``path`` is one data set; its
    ``input*`` and ``output*`` files are parsed as ``onnx.TensorProto``.

    Args:
        path: Directory that is searched recursively for data sets.
        all_inputs_shape: List that receives the shape of every loaded input
            tensor. It is only appended to when it is empty on entry.
        data_type: With ``"fp16"``, float32 tensors are cast to float16.

    Returns:
        ``(inputs, outputs)``: lists with one list of arrays per data set.
        Data sets without output files contribute nothing to ``outputs``.
    """
    def _find_sorted(root, *find_args):
        """Run ``find root <find_args> | sort`` and return the non-empty lines."""
        p1 = subprocess.Popen(["find", root] + list(find_args), stdout=subprocess.PIPE)
        p2 = subprocess.Popen(["sort"], stdin=p1.stdout, stdout=subprocess.PIPE)
        p1.stdout.close()
        stdout, _ = p2.communicate()
        p1.wait()
        # An empty result would otherwise turn into [''] and be treated as a path.
        return [line for line in stdout.decode("ascii").strip().split("\n") if line]

    def _load_tensor(pb_file):
        """Parse one TensorProto file into an array, casting to fp16 if requested."""
        tensor = onnx.TensorProto()
        with open(pb_file, 'rb') as f:
            tensor.ParseFromString(f.read())
        tensor_to_array = numpy_helper.to_array(tensor)
        if data_type == "fp16" and tensor_to_array.dtype == np.dtype(np.float32):
            tensor_to_array = tensor_to_array.astype(np.float16)
        return tensor_to_array

    print("Parsing test data in {} ...".format(path))
    test_data_set_dir = _find_sorted(path, "-name", "test_data*", "-type", "d")
    print(test_data_set_dir)

    inputs = []
    outputs = []

    # if not empty means input shape has been parsed before.
    shape_flag = len(all_inputs_shape) > 0

    for test_data_dir in test_data_set_dir:
        pwd = os.getcwd()
        os.chdir(test_data_dir)
        try:
            # load inputs
            input_data = _find_sorted(".", "-name", "input*")
            print(input_data)

            input_data_pb = []
            for data in input_data:
                input_data_pb.append(_load_tensor(data))
                if not shape_flag:
                    all_inputs_shape.append(input_data_pb[-1].shape)
                print(all_inputs_shape[-1])
            inputs.append(input_data_pb)
            print('Loaded {} inputs successfully.'.format(len(inputs)))

            # load outputs
            output_data = _find_sorted(".", "-name", "output*")
            print(output_data)

            if output_data:
                output_data_pb = []
                for data in output_data:
                    output_data_pb.append(_load_tensor(data))
                    print(np.array(output_data_pb[-1]).shape)
                outputs.append(output_data_pb)
                print('Loaded {} outputs successfully.'.format(len(outputs)))
        finally:
            # always restore the working directory, even when parsing fails
            os.chdir(pwd)

    return inputs, outputs
def generate_onnx_model_random_input(test_times, ref_input):
    """Create ``test_times`` random input sets shaped like ``ref_input``.

    Integer tensors are drawn uniformly from ``[0, max(tensor)]``; when the
    reference maximum is negative the range ``[min(tensor), max(tensor)]`` is
    used instead. All other dtypes are drawn from ``[0, 1)`` and cast to the
    reference dtype.

    Args:
        test_times: Number of input sets to generate.
        ref_input: Sequence of numpy arrays providing shape, dtype and range.

    Returns:
        A list of ``test_times`` lists, each holding one new array per
        reference tensor.
    """
    inputs = []
    for _ in range(test_times):
        input_data = []
        for tensor in ref_input:
            shape = tensor.shape
            dtype = tensor.dtype
            if np.issubdtype(dtype, np.integer):
                # Use Python ints for the bounds so that max+1 cannot overflow
                # the tensor dtype (e.g. 255 + 1 for uint8).
                upper = int(np.max(tensor)) + 1 if tensor.size else 1
                # randint needs low < high; fall back to the reference minimum
                # when every reference value is negative.
                lower = 0 if upper > 0 else int(np.min(tensor))
                new_tensor = np.random.randint(lower, upper, shape, dtype)
            else:
                new_tensor = np.random.random_sample(shape).astype(dtype)
            print("original tensor:")
            print(tensor)
            print("new random tensor:")
            print(new_tensor)
            print("\n")
            input_data.append(new_tensor)
        inputs.append(input_data)
    return inputs
def validate(all_ref_outputs, all_outputs, decimal):
    """Compare predicted outputs with reference outputs to ``decimal`` places.

    Returns:
        ``(True, None)`` when every compared element passes
        ``np.testing.assert_almost_equal``; otherwise ``(False, exception)``
        with the exception that stopped the comparison (it is also logged).
    """
    print('Reference {} results.'.format(len(all_ref_outputs)))
    print('Predicted {} results.'.format(len(all_outputs)))
    print('decimal {}'.format(decimal))

    try:
        for set_index, predicted_set in enumerate(all_outputs):
            expected_set = all_ref_outputs[set_index]

            for output_index, predicted in enumerate(predicted_set):
                expected = expected_set[output_index]

                # Compare the results with reference outputs up to x decimal places
                for expected_item, predicted_item in zip(expected, predicted):
                    # abs(desired-actual) < 1.5 * 10**(-decimal)
                    np.testing.assert_almost_equal(expected_item, predicted_item, decimal)
    except Exception as e:
        logger.error(e)
        return False, e

    print('ONNX Runtime outputs are similar to reference outputs!')
    return True, None
# not used by this script
def cleanup_files():
    """Delete downloaded test data, ``*.onnx`` and ``*.gz`` files below the current directory.

    Any path containing ``custom_test_data`` is printed and kept.
    """
    searches = (
        ["-name", "test_data_set*", "-type", "d"],
        ["-name", "*.onnx"],
        ["-name", "*.gz"],
    )

    files = []
    for find_args in searches:
        found = subprocess.run(["find", "."] + find_args, stdout=subprocess.PIPE).stdout
        files += found.decode("ascii").strip().split("\n")

    for f in files:
        if not f:
            # an empty find result leaves an empty name behind; nothing to delete
            continue
        if "custom_test_data" in f:
            print(f)
            continue
        # run() waits for rm, so the files are gone when this function returns
        subprocess.run(["rm", "-rf", f], stdout=subprocess.PIPE)
def remove_profiling_files(path):
    """Delete everything named ``onnxruntime_profile*`` below ``path``.

    Paths containing ``custom_test_data`` are kept. The deletion has completed
    when the function returns.
    """
    found = subprocess.run(["find", path, "-name", "onnxruntime_profile*"], stdout=subprocess.PIPE).stdout
    files = found.decode("ascii").strip().split("\n")

    for f in files:
        if not f:
            # an empty find result leaves an empty name behind; nothing to delete
            continue
        if "custom_test_data" in f:
            continue
        # run() waits for rm instead of leaving an unreaped child process behind
        subprocess.run(["rm", "-rf", f], stdout=subprocess.PIPE)
def update_fail_report(fail_results, args, model, ep, e_type, e):
    """Append one failure record to ``fail_results``.

    The record stores the model, the provider, the error type and the text of
    ``e`` with a single leading newline removed. ``args`` is not used.
    """
    message = re.sub('^\n', '', str(e))
    fail_results.append({
        "model": model,
        "ep": ep,
        "error type": e_type,
        "error message": message,
    })
def update_fail_model(model_ep_fail_map, fail_results, args, model_name, ep, e_type, e):
    """Record that ``ep`` failed for ``model_name``.

    The provider is added to ``model_ep_fail_map[model_name]`` (without
    duplicates) and a failure record is appended to ``fail_results``. A
    failure of ``TensorrtExecutionProvider`` also marks
    ``TensorrtExecutionProvider_fp16`` as failed.
    """
    failed_eps = model_ep_fail_map.setdefault(model_name, [])
    if ep not in failed_eps:
        failed_eps.append(ep)

    update_fail_report(fail_results, args, model_name, ep, e_type, e)

    # If TRT fails, TRT FP16 should fail as well
    if ep == 'TensorrtExecutionProvider':
        ep_ = "TensorrtExecutionProvider_fp16"
        e_ = "Not benchmarking TRT FP16 since TRT failed already."
        update_fail_report(fail_results, args, model_name, ep_, e_type, e_)
        # same duplicate check as for the primary provider above
        if ep_ not in failed_eps:
            failed_eps.append(ep_)
def skip_ep(model_name, ep, model_ep_fail_map):
    """Tell whether ``ep`` should be skipped for ``model_name``.

    Returns ``True`` for the fp16 providers of the two hard-coded models, and
    for any provider listed for the model in ``model_ep_fail_map``.
    """
    # models whose fp16 variants are always skipped
    always_skip_fp16 = ('vision-yolov3', 'speech')
    if model_name in always_skip_fp16 and "fp16" in ep:
        return True

    if model_name not in model_ep_fail_map:
        return False

    return ep in model_ep_fail_map[model_name]
def read_model_ep_fail_map_from_file(map_file):
    """Return the JSON content of ``map_file``, or ``None`` if it cannot be parsed."""
    with open(map_file) as map_stream:
        try:
            return json.load(map_stream)
        except Exception:
            return None
def write_model_ep_fail_map_to_file(model_ep_fail_map):
    """Serialize ``model_ep_fail_map`` as JSON into ``.model_ep_fail_map.json`` in the current directory."""
    serialized = json.dumps(model_ep_fail_map)  # use `json.loads` to do the reverse
    with open('.model_ep_fail_map.json', 'w') as map_stream:
        map_stream.write(serialized)
def get_system_info(info):
    """Fill ``info`` in place with CUDA/TensorRT, OS, CPU, GPU and memory details.

    The values come from ``get_cuda_version``/``get_trt_version`` and from the
    output of ``cat /etc/os-release``, ``lscpu``, ``lspci -v | grep NVIDIA``
    and ``cat /proc/meminfo``.
    """
    def _text_of(process):
        """Wait for ``process`` and return its stripped, decoded stdout."""
        raw, _ = process.communicate()
        return raw.decode("ascii").strip()

    info["cuda"] = get_cuda_version()
    info["trt"] = get_trt_version()

    # first two lines of the OS description, reformatted as "KEY: value"
    os_release = _text_of(subprocess.Popen(["cat", "/etc/os-release"], stdout=subprocess.PIPE))
    info["linux_distro"] = [
        re.sub('"', '', re.sub('=', ': ', row))
        for row in os_release.split("\n")[:2]
    ]

    # lscpu rows containing "mode", "Arch" or "name"
    cpu_report = _text_of(subprocess.Popen(["lscpu"], stdout=subprocess.PIPE))
    info["cpu_info"] = [
        re.sub(': +', ': ', row)
        for row in cpu_report.split("\n")
        if "mode" in row or "Arch" in row or "name" in row
    ]

    # NVIDIA devices, keeping only the text after the last colon
    pci_listing = subprocess.Popen(["lspci", "-v"], stdout=subprocess.PIPE)
    nvidia_rows = subprocess.Popen(["grep", "NVIDIA"], stdin=pci_listing.stdout, stdout=subprocess.PIPE)
    info["gpu_info"] = [
        re.sub('.*:', '', row)
        for row in _text_of(nvidia_rows).split("\n")
    ]

    # /proc/meminfo rows containing "Mem"
    mem_report = _text_of(subprocess.Popen(["cat", "/proc/meminfo"], stdout=subprocess.PIPE))
    info["memory"] = [
        re.sub(': +', ': ', row)
        for row in mem_report.split("\n")
        if "Mem" in row
    ]
def parse_models_info(path):
    """Read the model descriptions from a ``models_info.json`` style file.

    Args:
        path: JSON file containing a list of objects, each with the keys
            ``model_name``, ``working_directory``, ``model_path`` and
            ``test_data_path``.

    Returns:
        A dict mapping every model name to a dict with its
        ``working_directory``, ``model_path`` and ``test_data_path``.

    Raises:
        RuntimeError: When an entry lacks one of the required keys. The
            problem is logged before the exception is raised.
    """
    required_fields = (
        ('working_directory', 'Working directory must be provided in models_info.json'),
        ('model_path', 'Model path must be provided in models_info.json'),
        ('test_data_path', 'Test data path must be provided in models_info.json'),
    )

    with open(path) as f:
        data = json.load(f)

    models = {}
    for row in data:
        if 'model_name' not in row:
            message = 'Model name must be provided in models_info.json'
            logger.error(message)
            # A bare `raise` has no active exception here, so raise explicitly.
            raise RuntimeError(message)

        model = models[row['model_name']] = {}
        for field, message in required_fields:
            if field not in row:
                logger.error(message)
                raise RuntimeError(message)
            model[field] = row[field]

    return models
def convert_model_from_float_to_float16(model_path):
    """Convert the model at ``model_path`` to float16 and save it as ``new_fp16_model.onnx``.

    The file is written to the current working directory.

    Returns:
        The absolute path of the saved float16 model.
    """
    # from onnxmltools.utils.float16_converter import convert_float_to_float16
    from onnxmltools.utils import load_model, save_model
    from float16 import convert_float_to_float16

    fp16_model = convert_float_to_float16(load_model(model_path))
    save_model(fp16_model, 'new_fp16_model.onnx')

    working_dir = os.getcwd()
    return os.path.join(working_dir, "new_fp16_model.onnx")
def create_session(model_path, providers, session_options):
    """Create an inference session, retrying with a shape-inferred model.

    When the session cannot be created from ``model_path``, symbolic shape
    inference is run to produce ``<model>_new.onnx`` (unless that file already
    exists) and the session is created from that file instead.

    Args:
        model_path: Path of the ONNX model.
        providers: Execution provider list for the session.
        session_options: ``onnxruntime.SessionOptions`` for the session.

    Returns:
        The created ``onnxruntime.InferenceSession``.

    Raises:
        Exception: Whatever shape inference or the second session creation
            raises; it is printed and re-raised.
    """
    logger.info(model_path)

    try:
        return onnxruntime.InferenceSession(model_path, providers=providers, sess_options=session_options)
    except Exception:
        # `except Exception` lets KeyboardInterrupt/SystemExit propagate.
        logger.info("Use symbolic_shape_infer.py")

    try:
        new_model_path = model_path[:].replace(".onnx", "_new.onnx")

        if not os.path.exists(new_model_path):
            # An argument list without a shell keeps paths containing spaces
            # or shell metacharacters intact.
            subprocess.run(
                [sys.executable or "python3", "-m", "onnxruntime.tools.symbolic_shape_infer",
                 "--input", model_path, "--output", new_model_path, "--auto_merge"],
                check=True)
        return onnxruntime.InferenceSession(new_model_path, providers=providers, sess_options=session_options)
    except Exception as e:
        print(e)
        raise
def run_onnxruntime(args, models):
    """Benchmark or validate every model on each execution provider.

    Args:
        args: Parsed options. ``running_mode`` ('benchmark' or 'validate'),
            ``fp16``, ``input_data``, ``test_times`` and ``trtexec`` are read.
        models: Dict mapping a model name to a dict with the keys
            ``working_directory``, ``model_path`` and ``test_data_path``.

    Returns:
        A tuple ``(success_results, fail_results, latency_comparison_map,
        model_ep_fail_map, profile_metrics_map)``.
    """
    success_results = []
    fail_results = []
    latency_comparison_map = {}  # model -> CUDA/TRT latency
    profile_metrics_map = {}  # model -> metrics from profiling file
    model_ep_fail_map = {}  # model -> failing ep

    # read failing ep information if file exists
    if args.running_mode == 'benchmark':
        if os.path.exists('.model_ep_fail_map.json'):
            # The reader returns None for an unparsable file; fall back to an
            # empty map so the membership tests in skip_ep() keep working.
            model_ep_fail_map = read_model_ep_fail_map_from_file('.model_ep_fail_map.json') or {}

    if args.fp16:
        ep_list = ["CUDAExecutionProvider", "TensorrtExecutionProvider", "CUDAExecutionProvider_fp16", "TensorrtExecutionProvider_fp16"]
    else:
        ep_list = ["CUDAExecutionProvider", "TensorrtExecutionProvider"]

    validation_exemption = ["TensorrtExecutionProvider_fp16"]

    #######################
    # iterate model
    #######################
    for name, info in models.items():
        latency_result = {}
        path = info["working_directory"]

        pwd = os.getcwd()
        # makedirs also creates missing parent directories
        os.makedirs(path, exist_ok=True)
        os.chdir(path)
        path = os.getcwd()

        # cleanup files before running a new inference
        if args.running_mode == "validate":
            remove_profiling_files(path)

        inputs = []
        ref_outputs = []
        inputs_fp32 = []
        ref_outputs_fp32 = []
        inputs_fp16 = []
        ref_outputs_fp16 = []
        all_inputs_shape = []  # use for standalone trt
        ep_to_ep_op_map = {}  # ep -> { ep -> operator }
        profile_already_parsed = set()

        #######################
        # iterate ep
        #######################
        for ep in ep_list:
            if skip_ep(name, ep, model_ep_fail_map):
                continue

            ep_ = ep_to_provider_list[ep][0]
            if (ep_ not in onnxruntime.get_available_providers()):
                logger.error("No {} support".format(ep_))
                continue

            model_path = info["model_path"]

            if "fp16" in ep:
                fp16 = True
                os.environ["ORT_TENSORRT_FP16_ENABLE"] = "1"
                if ep == "CUDAExecutionProvider_fp16":
                    model_path = convert_model_from_float_to_float16(model_path)

                logger.info("\nInitializing {} with float16 enabled to run on {} ...".format(name, ep))
            else:
                fp16 = False
                os.environ["ORT_TENSORRT_FP16_ENABLE"] = "0"
                logger.info("\nInitializing {} to run on {} ...".format(name, ep))

            test_data_dir = info["test_data_path"]

            # read input/output of test data
            if fp16 and ep == "CUDAExecutionProvider_fp16":
                if not inputs_fp16 or not ref_outputs_fp16:
                    inputs_fp16, ref_outputs_fp16 = load_onnx_model_zoo_test_data(test_data_dir, all_inputs_shape, "fp16")

                inputs = inputs_fp16
                ref_outputs = ref_outputs_fp16
            else:
                if not inputs_fp32 or not ref_outputs_fp32:
                    inputs_fp32, ref_outputs_fp32 = load_onnx_model_zoo_test_data(test_data_dir, all_inputs_shape)

                inputs = inputs_fp32
                ref_outputs = ref_outputs_fp32

            if args.input_data == "random":
                # one extra input set serves as the warm-up run
                inputs = generate_onnx_model_random_input(args.test_times+1, inputs[0])

            #######################################
            # benchmark or validation
            #######################################
            if args.running_mode == 'benchmark':
                logger.info("===========================")
                logger.info("======== benchmark ========")
                logger.info("===========================")

                options = onnxruntime.SessionOptions()
                options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL

                # create onnxruntime inference session
                try:
                    sess = create_session(model_path, ep_to_provider_list[ep], options)
                except Exception as e:
                    logger.error(e)
                    # update_fail_model(model_ep_fail_map, fail_results, args, name, ep, e)
                    continue

                logger.info("[start] Begin to inference {} with {} ...".format(name, ep))
                logger.info(sess.get_providers())

                if sess:
                    logger.info("Model inputs nodes:")
                    for input_meta in sess.get_inputs():
                        logger.info(input_meta)
                    logger.info("Model outputs nodes:")
                    for output_meta in sess.get_outputs():
                        logger.info(output_meta)

                batch_size = 1
                result_template = {
                    "engine": "onnxruntime",
                    "version": onnxruntime.__version__,
                    "device": ep,
                    "fp16": fp16,
                    "io_binding": False,
                    "model_name": name,
                    "inputs": len(sess.get_inputs()),
                    "batch_size": batch_size,
                    "sequence_length": 1,
                    "datetime": str(datetime.now()),
                }

                result = inference_ort(args, name, sess, ep, inputs, result_template, args.test_times, batch_size)

                if result:
                    success_results.append(result)
                    logger.info(result)

                    latency_result[ep] = {}
                    latency_result[ep]["average_latency_ms"] = result["average_latency_ms"]
                    latency_result[ep]["latency_90_percentile"] = result["latency_90_percentile"]

                    # get standalone TensorRT perf
                    if "TensorrtExecutionProvider" in ep and args.trtexec:
                        result = run_trt_standalone(args.trtexec, model_path, sess.get_inputs(), all_inputs_shape, fp16)
                        if result and len(result) > 0:
                            if fp16:
                                latency_result["Standalone_TRT_fp16"] = result
                            else:
                                latency_result["Standalone_TRT"] = result

                    latency_comparison_map[name] = copy.deepcopy(latency_result)

            elif args.running_mode == 'validate':
                logger.info("==========================")
                logger.info("======== validate ========")
                logger.info("==========================")

                # enable profiling to generate profiling file for analysis
                options = onnxruntime.SessionOptions()
                options.enable_profiling = True
                options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
                time.sleep(1)  # avoid to generate same profile file name

                # create onnxruntime inference session
                try:
                    sess = create_session(model_path, ep_to_provider_list[ep], options)
                except Exception as e:
                    logger.error(e)
                    update_fail_model(model_ep_fail_map, fail_results, args, name, ep, 'runtime error', e)
                    continue

                sess.disable_fallback()

                logger.info("Start to inference {} with {} ...".format(name, ep))
                logger.info(sess.get_providers())

                if sess:
                    logger.info("Model inputs nodes:")
                    for input_meta in sess.get_inputs():
                        logger.info(input_meta)
                    logger.info("Model outputs nodes:")
                    for output_meta in sess.get_outputs():
                        logger.info(output_meta)

                # run inference and validate the result
                #
                # currently skip TensorRT float16 validation intentionally
                if ep not in validation_exemption:
                    try:
                        ort_outputs = inference_ort_and_get_prediction(name, sess, inputs)

                        decimal = 0
                        status = validate(ref_outputs, ort_outputs, decimal)
                        if not status[0]:
                            update_fail_model(model_ep_fail_map, fail_results, args, name, ep, 'result accuracy issue', status[1])
                            continue
                    except Exception as e:
                        logger.error(e)
                        update_fail_model(model_ep_fail_map, fail_results, args, name, ep, 'runtime error', e)
                        continue

                    # Run inference again. the reason is that some ep like tensorrt
                    # it takes much longer time to generate graph on first run and
                    # we need to skip the perf result of that expensive run.
                    inference_ort_and_get_prediction(name, sess, inputs)
                else:
                    inference_ort_and_get_prediction(name, sess, inputs)
                    inference_ort_and_get_prediction(name, sess, inputs)

                sess.end_profiling()

                # get metrics from profiling file
                metrics = get_profile_metrics(path, profile_already_parsed)
                if metrics:
                    print(ep)
                    ep_to_ep_op_map[ep] = metrics

        ####################
        # end of iterate ep
        ####################

        # get percentage of execution time and operators in TRT
        if len(ep_to_ep_op_map) > 0:
            cuda_op_map = ep_to_ep_op_map.get("CUDAExecutionProvider")
            cuda_fp16_op_map = ep_to_ep_op_map.get("CUDAExecutionProvider_fp16")
            trt_op_map = ep_to_ep_op_map.get("TensorrtExecutionProvider")
            trt_fp16_op_map = ep_to_ep_op_map.get("TensorrtExecutionProvider_fp16")

            profile_metrics_map[name] = {}

            if cuda_op_map:
                profile_metrics_map[name]['ratio_of_ops_in_cuda_not_fallback_cpu'] = calculate_cuda_op_percentage(cuda_op_map)

            if trt_op_map:
                total_trt_execution_time, total_execution_time, ratio_of_execution_time_in_trt = calculate_trt_latency_percentage(trt_op_map)
                profile_metrics_map[name]['total_trt_execution_time'] = total_trt_execution_time
                profile_metrics_map[name]['total_execution_time'] = total_execution_time
                profile_metrics_map[name]['ratio_of_execution_time_in_trt'] = ratio_of_execution_time_in_trt
                if cuda_op_map:
                    total_ops_in_trt, total_ops, ratio_of_ops_in_trt = calculate_trt_op_percentage(trt_op_map, cuda_op_map)
                    profile_metrics_map[name]['total_ops_in_trt'] = total_ops_in_trt
                    profile_metrics_map[name]['total_ops'] = total_ops
                    profile_metrics_map[name]['ratio_of_ops_in_trt'] = ratio_of_ops_in_trt

            if trt_fp16_op_map:
                total_trt_execution_time, total_execution_time, ratio_of_execution_time_in_trt = calculate_trt_latency_percentage(trt_fp16_op_map)
                name_ = name + " (FP16)"
                profile_metrics_map[name_] = {}
                profile_metrics_map[name_]['total_trt_execution_time'] = total_trt_execution_time
                profile_metrics_map[name_]['total_execution_time'] = total_execution_time
                profile_metrics_map[name_]['ratio_of_execution_time_in_trt'] = ratio_of_execution_time_in_trt
                # NOTE(review): the guard tests cuda_fp16_op_map but the call
                # below passes cuda_op_map - confirm which map is intended.
                if cuda_fp16_op_map:
                    total_ops_in_trt, total_ops, ratio_of_ops_in_trt = calculate_trt_op_percentage(trt_fp16_op_map, cuda_op_map)
                    profile_metrics_map[name_]['total_ops_in_trt'] = total_ops_in_trt
                    profile_metrics_map[name_]['total_ops'] = total_ops
                    profile_metrics_map[name_]['ratio_of_ops_in_trt'] = ratio_of_ops_in_trt

            if debug:
                pp = pprint.PrettyPrinter(indent=4)
                print('CUDA operator map:')
                pp.pprint(cuda_op_map)
                print('TRT operator map:')
                pp.pprint(trt_op_map)
                print('CUDA FP16 operator map:')
                pp.pprint(cuda_fp16_op_map)
                print('TRT FP16 operator map:')
                pp.pprint(trt_fp16_op_map)

        # cleanup_files()
        os.chdir(pwd)

        # end of model

    return success_results, fail_results, latency_comparison_map, model_ep_fail_map, profile_metrics_map
def add_improvement_information(latency_comparison_map):
    """Add the TensorRT-over-CUDA latency gain to every model entry, in place.

    An entry gets ``Tensorrt_gain(%)`` only when it has both the TensorRT and
    the CUDA result. ``Tensorrt_fp16_gain(%)`` is added in addition when both
    fp16 results are present as well.
    """
    def _gain_text(cuda_entry, trt_entry):
        cuda_ms = float(cuda_entry['average_latency_ms'])
        trt_ms = float(trt_entry['average_latency_ms'])
        return "{:.2f} %".format((cuda_ms - trt_ms)*100/cuda_ms)

    for latencies in latency_comparison_map.values():
        has_fp32_pair = 'TensorrtExecutionProvider' in latencies and 'CUDAExecutionProvider' in latencies
        if not has_fp32_pair:
            continue

        # read the TensorRT value first, then CUDA, before storing the gain
        trt_entry = latencies['TensorrtExecutionProvider']
        float(trt_entry['average_latency_ms'])
        latencies["Tensorrt_gain(%)"] = _gain_text(latencies['CUDAExecutionProvider'], trt_entry)

        if "TensorrtExecutionProvider_fp16" in latencies and "CUDAExecutionProvider_fp16" in latencies:
            trt_fp16_entry = latencies['TensorrtExecutionProvider_fp16']
            float(trt_fp16_entry['average_latency_ms'])
            latencies["Tensorrt_fp16_gain(%)"] = _gain_text(latencies['CUDAExecutionProvider_fp16'], trt_fp16_entry)
def output_details(results, csv_filename):
    """Append a header row and one row per result dict to ``csv_filename``."""
    column_names = [
        "engine", "version", "device", "fp16", "io_binding", "model_name", "inputs", "batch_size",
        "sequence_length", "datetime", "test_times", "QPS", "average_latency_ms", "latency_variance",
        "latency_90_percentile", "latency_95_percentile", "latency_99_percentile"
    ]

    # the file is opened in append mode, so earlier content is kept
    with open(csv_filename, mode="a", newline='') as csv_file:
        detail_writer = csv.DictWriter(csv_file, fieldnames=column_names)
        detail_writer.writeheader()
        detail_writer.writerows(results)

    logger.info(f"Detail results are saved to csv file: {csv_filename}")
def output_fail(results, csv_filename):
    """Append a header row and one row per failure record to ``csv_filename``."""
    column_names = ["model", "ep", "error type", "error message"]

    # the file is opened in append mode, so earlier content is kept
    with open(csv_filename, mode="a", newline='') as csv_file:
        failure_writer = csv.DictWriter(csv_file, fieldnames=column_names)
        failure_writer.writeheader()
        failure_writer.writerows(results)

    logger.info(f"Failing results are saved to csv file: {csv_filename}")
def _latency_cell(value, ep, metric):
    """Return value[ep][metric], or "" when the provider or the metric is absent."""
    if ep in value and metric in value[ep]:
        return value[ep][metric]
    return ""


def output_latency(results, csv_filename):
    """Append the CUDA / TensorRT latency comparison table to a CSV file.

    One row is written per model: the model key, then mean and 90th
    percentile latency for each provider (blank when not available), then the
    "Tensorrt_gain(%)" and "Tensorrt_fp16_gain(%)" entries (a single space
    when not available).

    Args:
        results: dict mapping a model key to a dict of per-provider result
            dicts (plus the optional gain entries).
        csv_filename: path of the CSV file; created if it does not exist.
    """
    # NOTE: header strings are kept exactly as they were (including the
    # "90 percentile" spelling of the TRT EP fp16 column) so that existing
    # consumers of the CSV keep seeing the same column names.
    column_names = ["Model",
                    "CUDA \nmean (ms)",
                    "CUDA \n90th percentile (ms)",
                    "TRT EP \nmean (ms)",
                    "TRT EP \n90th percentile (ms)",
                    "Standalone TRT \nmean (ms)",
                    "Standalone TRT \n90th percentile (ms)",
                    "CUDA fp16 \nmean (ms)",
                    "CUDA fp16 \n90th percentile (ms)",
                    "TRT EP fp16 \nmean (ms)",
                    "TRT EP fp16 \n90 percentile (ms)",
                    "Standalone TRT fp16 \nmean (ms)",
                    "Standalone TRT fp16 \n90th percentile (ms)",
                    "TRT EP \ngain (mean) (%)",
                    "TRT EP fp16 \ngain (mean) (%)"]

    # Provider keys in the same order as the (mean, 90th percentile) column
    # pairs in the header above.
    provider_keys = ['CUDAExecutionProvider',
                     'TensorrtExecutionProvider',
                     'Standalone_TRT',
                     'CUDAExecutionProvider_fp16',
                     'TensorrtExecutionProvider_fp16',
                     'Standalone_TRT_fp16']
    metric_keys = ['average_latency_ms', 'latency_90_percentile']

    with open(csv_filename, mode="a", newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        # Append mode: write the header only into an empty file so that a
        # pre-existing file does not end up with duplicated header rows.
        if csv_file.tell() == 0:
            csv_writer.writerow(column_names)

        for model_key, value in results.items():
            row = [model_key]
            for ep in provider_keys:
                for metric in metric_keys:
                    row.append(_latency_cell(value, ep, metric))
            row.append(value.get('Tensorrt_gain(%)', " "))
            row.append(value.get('Tensorrt_fp16_gain(%)', " "))
            csv_writer.writerow(row)

    logger.info(f"CUDA/TRT latency comparison are saved to csv file: {csv_filename}")
def output_ratio(results, csv_filename):
    """Append the per-model TensorRT operator / execution-time metrics to a CSV file.

    One row is written per model: the model key followed by the metrics named
    in `metric_keys` below; a metric missing from a model's dict is written as
    a single space.

    Args:
        results: dict mapping a model key to a dict of metric values.
        csv_filename: path of the CSV file; created if it does not exist.
    """
    column_names = ["Model",
                    "% CUDA operators (not fall back to CPU)",
                    "Total TRT operators",
                    "Total operators",
                    "% TRT operator",
                    "Total TRT execution time",
                    "Total execution time",
                    "% TRT execution time"]

    # Metric keys in the same order as the columns that follow "Model".
    metric_keys = ['ratio_of_ops_in_cuda_not_fallback_cpu',
                   'total_ops_in_trt',
                   'total_ops',
                   'ratio_of_ops_in_trt',
                   'total_trt_execution_time',
                   'total_execution_time',
                   'ratio_of_execution_time_in_trt']

    with open(csv_filename, mode="a", newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        # Append mode: write the header only into an empty file so that a
        # pre-existing file does not end up with duplicated header rows.
        if csv_file.tell() == 0:
            csv_writer.writerow(column_names)

        for model_key, metrics in results.items():
            csv_writer.writerow([model_key] + [metrics.get(name, " ") for name in metric_keys])

    logger.info(f"Tensorrt ratio metrics are saved to csv file: {csv_filename}")
def output_system_info(result, csv_filename):
    """Append a single row of system information to a CSV file.

    Args:
        result: dict keyed by "cpu_info", "cuda", "gpu_info", "linux_distro",
            "memory" and "trt". Missing keys are written as empty cells;
            unknown keys make csv.DictWriter raise ValueError.
        csv_filename: path of the CSV file; created if it does not exist.
    """
    column_names = [
        "cpu_info", "cuda", "gpu_info", "linux_distro", "memory", "trt"
    ]

    with open(csv_filename, mode="a", newline='') as csv_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
        # Append mode: write the header only into an empty file so that a
        # pre-existing file does not end up with duplicated header rows.
        if csv_file.tell() == 0:
            csv_writer.writeheader()
        csv_writer.writerow(result)

    logger.info(f"System information are saved to csv file: {csv_filename}")
def parse_arguments(argv=None):
    """Parse the benchmark command-line options.

    Args:
        argv: optional list of argument strings. When None (the default)
            argparse reads the arguments from sys.argv, as before.

    Returns:
        argparse.Namespace with model_list_file, running_mode, input_data,
        fp16, trtexec and test_times.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("-m", "--model_list_file", required=False, default="model_list.json", help="Model list file.")

    parser.add_argument("-r", "--running_mode", required=False, default="benchmark", choices=["validate", "benchmark"], help="Testing mode.")

    parser.add_argument("-i", "--input_data", required=False, default="zoo", choices=["zoo", "random"], help="source of input data.")

    # fp16 benchmarking is on by default, so "--fp16" alone could never turn
    # it off. "--fp16" is still accepted for existing command lines, and
    # "--no_fp16" writes False into the same destination to disable it.
    parser.add_argument("--fp16", required=False, default=True, action="store_true", help="Include Float16 into benchmarking.")
    parser.add_argument("--no_fp16", required=False, dest="fp16", action="store_false", help="Exclude Float16 from benchmarking.")

    parser.add_argument("--trtexec", required=False, default=None, help="trtexec executable path.")

    parser.add_argument("-t",
                        "--test_times",
                        required=False,
                        default=1,
                        type=int,
                        help="Number of repeat times to get average inference latency.")

    args = parser.parse_args(argv)
    return args
def setup_logger(verbose):
    """Install the coloredlogs handler used by the benchmark.

    In non-verbose mode only the bare message is printed and the
    "transformers" logger is raised to WARNING; in verbose mode DEBUG logging
    is enabled with a file / line / function prefix on every message.
    """
    if not verbose:
        coloredlogs.install(fmt='%(message)s')
        transformers_logger = logging.getLogger("transformers")
        transformers_logger.setLevel(logging.WARNING)
        return

    verbose_format = '[%(filename)s:%(lineno)s - %(funcName)20s()] %(message)s'
    coloredlogs.install(level='DEBUG', fmt=verbose_format)
def main():
    """Run the benchmark over every model in the model list and write the reports.

    Steps:
      1. Parse the command line, set up logging and load the model list.
      2. Run all models through run_onnxruntime and log the total time and
         the fail / success counts.
      3. Write CSV reports into a "result" directory under the current
         working directory; every file name carries the same timestamp.
         The latency, ratio, fail and success reports are only written when
         the corresponding result collection is non-empty; the system
         information report is always written.
    """
    args = parse_arguments()
    setup_logger(False)
    pp = pprint.PrettyPrinter(indent=4)

    models = parse_models_info(args.model_list_file)

    perf_start_time = datetime.now()
    success_results, fail_results, latency_comparison_map, failing_models, profile_metrics_map = run_onnxruntime(args, models)
    perf_end_time = datetime.now()

    logger.info("\nTotal time for running/profiling all models: {}".format(perf_end_time - perf_start_time))
    logger.info(list(models.keys()))

    logger.info("\nTotal models: {}".format(len(models)))
    logger.info("Fail models: {}".format(len(failing_models)))
    logger.info("Models FAIL/SUCCESS: {}/{}".format(len(failing_models), len(models) - len(failing_models)))

    # One makedirs(exist_ok=True) call replaces the former pair of
    # exists()/mkdir() checks, which targeted the same directory twice
    # (once relative, once absolute) and could race with a concurrent run.
    path = os.path.join(os.getcwd(), "result")
    os.makedirs(path, exist_ok=True)

    # Shared by all report file names produced by this run.
    time_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")

    if failing_models:
        logger.info("\n============================================")
        logger.info("========== Failing Models/EPs ==============")
        logger.info("============================================")
        logger.info(failing_models)
        write_model_ep_fail_map_to_file(failing_models)

    if latency_comparison_map:
        logger.info("\n=========================================")
        logger.info("=========== CUDA/TRT latency ===========")
        logger.info("=========================================")
        # Adds the gain entries in place before the map is printed and saved.
        add_improvement_information(latency_comparison_map)
        pp.pprint(latency_comparison_map)
        csv_filename = f"benchmark_latency_{time_stamp}.csv"
        csv_filename = os.path.join(path, csv_filename)
        output_latency(latency_comparison_map, csv_filename)

    if profile_metrics_map:
        logger.info("\n========================================")
        logger.info("========== TRT detail metrics ==========")
        logger.info("========================================")
        pp.pprint(profile_metrics_map)
        csv_filename = f"benchmark_ratio_{time_stamp}.csv"
        csv_filename = os.path.join(path, csv_filename)
        output_ratio(profile_metrics_map, csv_filename)

    logger.info("\n===========================================")
    logger.info("=========== System information ===========")
    logger.info("===========================================")
    info = {}
    get_system_info(info)  # fills `info` in place
    pp.pprint(info)
    csv_filename = os.path.join(path, f"system_info_{time_stamp}.csv")
    output_system_info(info, csv_filename)

    if fail_results:
        csv_filename = f"benchmark_fail_{time_stamp}.csv"
        csv_filename = os.path.join(path, csv_filename)
        output_fail(fail_results, csv_filename)

    if success_results:
        csv_filename = f"benchmark_success_{time_stamp}.csv"
        csv_filename = os.path.join(path, csv_filename)
        output_details(success_results, csv_filename)
# Run the benchmark only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()