mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-10 00:38:54 +00:00
1267 lines
49 KiB
Python
1267 lines
49 KiB
Python
import os
|
|
import csv
|
|
import timeit
|
|
from datetime import datetime
|
|
import numpy
|
|
import logging
|
|
import coloredlogs
|
|
import numpy as np
|
|
import argparse
|
|
import copy
|
|
import json
|
|
import re
|
|
import sys
|
|
import onnxruntime
|
|
from onnx import numpy_helper
|
|
from perf_utils import *
|
|
import pprint
|
|
import time
|
|
from float16 import *
|
|
# import torch
|
|
|
|
debug = False
|
|
sys.path.append('.')
|
|
logger = logging.getLogger('')
|
|
|
|
ep_to_provider_list = {
|
|
"CPUExecutionProvider": ["CPUExecutionProvider"],
|
|
"CUDAExecutionProvider": ["CUDAExecutionProvider"],
|
|
"CUDAExecutionProvider_fp16": ["CUDAExecutionProvider"],
|
|
"TensorrtExecutionProvider": ["TensorrtExecutionProvider", "CUDAExecutionProvider"],
|
|
"TensorrtExecutionProvider_fp16": ["TensorrtExecutionProvider", "CUDAExecutionProvider"],
|
|
}
|
|
|
|
def run_trt_standalone(trtexec, model_path, ort_inputs, all_inputs_shape, fp16):
|
|
model_path = "--onnx=" + model_path
|
|
input_shape = []
|
|
|
|
print(all_inputs_shape)
|
|
|
|
for i in range(len(ort_inputs)):
|
|
name = ort_inputs[i].name
|
|
|
|
shape = []
|
|
for j in all_inputs_shape[i]:
|
|
shape.append(str(j))
|
|
shape = "x".join(shape)
|
|
shape = name + ':' + shape
|
|
input_shape.append(shape)
|
|
|
|
shapes_arg = '--optShapes=' + ','.join(input_shape)
|
|
print(shapes_arg)
|
|
|
|
result = {}
|
|
try:
|
|
|
|
if fp16:
|
|
p1 = subprocess.Popen([trtexec, model_path, "--fp16", "--percentile=90", "--explicitBatch", shapes_arg], stdout=subprocess.PIPE)
|
|
else:
|
|
p1 = subprocess.Popen([trtexec, model_path, "--percentile=90", "--explicitBatch", shapes_arg], stdout=subprocess.PIPE)
|
|
stdout, sterr = p1.communicate()
|
|
print(stdout)
|
|
stdout = stdout.decode("ascii").strip()
|
|
|
|
tmp = stdout.split("\n")
|
|
target_list = []
|
|
for t in tmp:
|
|
if 'mean:' in t:
|
|
target_list.append(t)
|
|
|
|
if 'percentile:' in t:
|
|
target_list.append(t)
|
|
|
|
target = target_list[2]
|
|
start = target.find('mean:') + 6
|
|
end = target.find('ms')
|
|
result["average_latency_ms"] = target[start:end]
|
|
|
|
target = target_list[3]
|
|
start = target.find('percentile:') + 12
|
|
end = target.find('ms')
|
|
result["latency_90_percentile"] = target[start:end]
|
|
|
|
print(result)
|
|
return result
|
|
|
|
except Exception as e:
|
|
logger.info("trtexec fails...")
|
|
return None
|
|
|
|
|
|
|
|
def get_latency_result(runtimes, batch_size):
|
|
latency_ms = sum(runtimes) / float(len(runtimes)) * 1000.0
|
|
latency_variance = numpy.var(runtimes, dtype=numpy.float64) * 1000.0
|
|
throughput = batch_size * (1000.0 / latency_ms)
|
|
|
|
return {
|
|
"test_times": len(runtimes),
|
|
"latency_variance": "{:.2f}".format(latency_variance),
|
|
"latency_90_percentile": "{:.2f}".format(numpy.percentile(runtimes, 90) * 1000.0),
|
|
"latency_95_percentile": "{:.2f}".format(numpy.percentile(runtimes, 95) * 1000.0),
|
|
"latency_99_percentile": "{:.2f}".format(numpy.percentile(runtimes, 99) * 1000.0),
|
|
"average_latency_ms": "{:.2f}".format(latency_ms),
|
|
"QPS": "{:.2f}".format(throughput),
|
|
}
|
|
|
|
|
|
def get_ort_session_inputs_and_outptus(name, session, ort_input):
|
|
|
|
sess_inputs = {}
|
|
sess_outputs = None
|
|
|
|
if name == 'BERT-Squad':
|
|
unique_ids_raw_output = ort_input[0]
|
|
input_ids = ort_input[1]
|
|
input_mask = ort_input[2]
|
|
segment_ids = ort_input[3]
|
|
|
|
sess_inputs = {
|
|
"unique_ids_raw_output___9:0": unique_ids_raw_output,
|
|
"input_ids:0": input_ids[0:1],
|
|
"input_mask:0": input_mask[0:1],
|
|
"segment_ids:0": segment_ids[0:1]}
|
|
sess_outputs = ["unique_ids:0", "unstack:0", "unstack:1"]
|
|
|
|
elif name == 'BiDAF':
|
|
sess_inputs = {
|
|
"context_word": ort_input[0],
|
|
"context_char": ort_input[2],
|
|
"query_word": ort_input[1],
|
|
"query_char": ort_input[3]}
|
|
sess_outputs = ["start_pos","end_pos"]
|
|
|
|
elif name == 'Yolov4':
|
|
sess_inputs[session.get_inputs()[0].name] = ort_input[0]
|
|
sess_outputs = ['Identity:0']
|
|
|
|
elif name == 'Shufflenet-v2':
|
|
sess_inputs[session.get_inputs()[0].name] = ort_input
|
|
|
|
else:
|
|
sess_inputs = {}
|
|
for i in range(len(session.get_inputs())):
|
|
sess_inputs[session.get_inputs()[i].name] = ort_input[i]
|
|
|
|
return (sess_inputs, sess_outputs)
|
|
|
|
def inference_ort(args, name, session, ep, ort_inputs, result_template, repeat_times, batch_size):
|
|
|
|
runtimes = []
|
|
for ort_input in ort_inputs:
|
|
sess_inputs, sess_outputs = get_ort_session_inputs_and_outptus(name, session, ort_input)
|
|
print("sess_inputs:")
|
|
print(sess_inputs)
|
|
print("sess_outputs:")
|
|
print(sess_outputs)
|
|
try:
|
|
if args.input_data == "random":
|
|
repeat_times = 1 # warn-up run is included in ort_inputs
|
|
else:
|
|
repeat_times += 1 # add warn-up run
|
|
|
|
runtime = timeit.repeat(lambda: session.run(sess_outputs, sess_inputs), number=1, repeat=repeat_times)
|
|
runtimes += runtime
|
|
|
|
except Exception as e:
|
|
logger.error(e)
|
|
return None
|
|
|
|
print(runtimes)
|
|
runtimes[:] = runtimes[1:]
|
|
print(runtimes)
|
|
|
|
result = {}
|
|
result.update(result_template)
|
|
result.update({"io_binding": False})
|
|
result.update(get_latency_result(runtimes, batch_size))
|
|
return result
|
|
|
|
def inference_ort_and_get_prediction(name, session, ort_inputs):
|
|
|
|
ort_outputs = []
|
|
for ort_input in ort_inputs:
|
|
sess_inputs, sess_outputs = get_ort_session_inputs_and_outptus(name, session, ort_input)
|
|
print("sess_inputs:")
|
|
print(sess_inputs)
|
|
print("sess_outputs:")
|
|
print(sess_outputs)
|
|
try:
|
|
result = session.run(sess_outputs, sess_inputs)
|
|
|
|
# handle shape of output differently
|
|
if name == 'BERT-Squad':
|
|
ort_outputs.append([result])
|
|
elif name == 'Shufflenet-v2':
|
|
ort_outputs.append(result[0])
|
|
else:
|
|
ort_outputs.append(result)
|
|
except Exception as e:
|
|
logger.error(e)
|
|
return None
|
|
|
|
return ort_outputs
|
|
|
|
# not use for this script yet
|
|
def inference_ort_with_io_binding(model, ort_inputs, result_template, repeat_times, batch_size, device='cuda'):
|
|
runtimes = []
|
|
|
|
session = model.get_session()
|
|
|
|
# Bind inputs and outputs to onnxruntime session
|
|
io_binding = session.io_binding()
|
|
|
|
for ort_input in ort_inputs:
|
|
|
|
# Bind inputs to device
|
|
if model.get_model_name() == 'BERT-Squad':
|
|
name = session.get_inputs()[0].name
|
|
print(name)
|
|
np_input = torch.from_numpy(ort_input[0]).to(device)
|
|
io_binding.bind_input(name, np_input.device.type, 0, numpy.longlong, np_input.shape, np_input.data_ptr())
|
|
name = session.get_inputs()[1].name
|
|
print(name)
|
|
np_input = torch.from_numpy(ort_input[1][0:1]).to(device)
|
|
io_binding.bind_input(name, np_input.device.type, 0, numpy.longlong, np_input.shape, np_input.data_ptr())
|
|
name = session.get_inputs()[2].name
|
|
print(name)
|
|
np_input = torch.from_numpy(ort_input[2][0:1]).to(device)
|
|
io_binding.bind_input(name, np_input.device.type, 0, numpy.longlong, np_input.shape, np_input.data_ptr())
|
|
name = session.get_inputs()[3].name
|
|
print(name)
|
|
np_input = torch.from_numpy(ort_input[3][0:1]).to(device)
|
|
io_binding.bind_input(name, np_input.device.type, 0, numpy.longlong, np_input.shape, np_input.data_ptr())
|
|
else:
|
|
name = session.get_inputs()[0].name
|
|
print(ort_input[0])
|
|
np_input = torch.from_numpy(ort_input[0]).to(device)
|
|
io_binding.bind_input(name, np_input.device.type, 0, numpy.float32, np_input.shape, np_input.data_ptr())
|
|
|
|
name_o = session.get_outputs()[0].name
|
|
io_binding.bind_output(name_o)
|
|
|
|
# name = session.get_inputs()[0].name
|
|
# np_input = torch.from_numpy(numpy.asarray(ort_inputs[0][0])).to(device)
|
|
# io_binding.bind_input(name, np_input.device.type, 0, numpy.float32, np_input.shape, np_input.data_ptr())
|
|
# name_o = session.get_outputs()[0].name
|
|
# io_binding.bind_output(name_o, 'cpu', 0, numpy.float32, session.get_outputs()[0].shape, None)
|
|
|
|
try:
|
|
runtimes = runtimes + timeit.repeat(lambda: session.run_with_iobinding(io_binding), number=1, repeat=repeat_times)
|
|
except Exception as e:
|
|
logger.error(e)
|
|
return None
|
|
|
|
print(runtimes)
|
|
|
|
result = {}
|
|
result.update(result_template)
|
|
result.update({"io_binding": True})
|
|
result.update(get_latency_result(runtimes, batch_size))
|
|
return result
|
|
|
|
|
|
def get_cuda_version():
|
|
from pathlib import Path
|
|
home = str(Path.home())
|
|
|
|
p1 = subprocess.Popen(["find", home+"/.local/lib/", "-name", "onnxruntime_pybind11_state.so"], stdout=subprocess.PIPE)
|
|
stdout, sterr = p1.communicate()
|
|
stdout = stdout.decode("ascii").strip()
|
|
p1 = subprocess.Popen(["ldd", stdout], stdout=subprocess.PIPE)
|
|
p2 = subprocess.Popen(["grep", "libcudart.so"], stdin=p1.stdout, stdout=subprocess.PIPE)
|
|
stdout, sterr = p2.communicate()
|
|
stdout = stdout.decode("ascii").strip()
|
|
|
|
return stdout
|
|
|
|
def get_trt_version():
|
|
from pathlib import Path
|
|
home = str(Path.home())
|
|
|
|
p1 = subprocess.Popen(["find", home+"/.local/lib/", "-name", "onnxruntime_pybind11_state.so"], stdout=subprocess.PIPE)
|
|
stdout, sterr = p1.communicate()
|
|
stdout = stdout.decode("ascii").strip()
|
|
p1 = subprocess.Popen(["ldd", stdout], stdout=subprocess.PIPE)
|
|
p2 = subprocess.Popen(["grep", "libnvinfer.so"], stdin=p1.stdout, stdout=subprocess.PIPE)
|
|
stdout, sterr = p2.communicate()
|
|
stdout = stdout.decode("ascii").strip()
|
|
|
|
if stdout == "":
|
|
p1 = subprocess.Popen(["find", home+"/.local/lib/", "-name", "libonnxruntime_providers_tensorrt.so"], stdout=subprocess.PIPE)
|
|
stdout, sterr = p1.communicate()
|
|
stdout = stdout.decode("ascii").strip()
|
|
p1 = subprocess.Popen(["ldd", stdout], stdout=subprocess.PIPE)
|
|
p2 = subprocess.Popen(["grep", "libnvinfer.so"], stdin=p1.stdout, stdout=subprocess.PIPE)
|
|
stdout, sterr = p2.communicate()
|
|
stdout = stdout.decode("ascii").strip()
|
|
|
|
return stdout
|
|
|
|
# not use for this script temporarily
|
|
def tmp_get_trt_version():
|
|
p1 = subprocess.Popen(["dpkg", "-l"], stdout=subprocess.PIPE)
|
|
p2 = subprocess.Popen(["grep", "TensorRT runtime libraries"], stdin=p1.stdout, stdout=subprocess.PIPE)
|
|
stdout, sterr = p2.communicate()
|
|
stdout = stdout.decode("ascii").strip()
|
|
|
|
if stdout != "":
|
|
stdout = re.sub('\s+', ' ', stdout)
|
|
return stdout
|
|
|
|
if os.path.exists("/usr/lib/x86_64-linux-gnu/libnvinfer.so"):
|
|
p1 = subprocess.Popen(["readelf", "-s", "/usr/lib/x86_64-linux-gnu/libnvinfer.so"], stdout=subprocess.PIPE)
|
|
p2 = subprocess.Popen(["grep", "version"], stdin=p1.stdout, stdout=subprocess.PIPE)
|
|
stdout, sterr = p2.communicate()
|
|
stdout = stdout.decode("ascii").strip()
|
|
stdout = stdout.split(" ")[-1]
|
|
return stdout
|
|
|
|
elif os.path.exists("/usr/lib/aarch64-linux-gnu/libnvinfer.so"):
|
|
p1 = subprocess.Popen(["readelf", "-s", "/usr/lib/aarch64-linux-gnu/libnvinfer.so"], stdout=subprocess.PIPE)
|
|
p2 = subprocess.Popen(["grep", "version"], stdin=p1.stdout, stdout=subprocess.PIPE)
|
|
stdout, sterr = p2.communicate()
|
|
stdout = stdout.decode("ascii").strip()
|
|
stdout = stdout.split(" ")[-1]
|
|
return stdout
|
|
|
|
return ""
|
|
|
|
#
|
|
# The following two lists will be generated.
|
|
#
|
|
# inputs: [[test_data_0_input_0.pb, test_data_0_input_1.pb ...], [test_data_1_input_0.pb, test_data_1_input_1.pb ...] ...]
|
|
# outputs: [[test_data_0_output_0.pb, test_data_0_output_1.pb ...], [test_data_1_output_0.pb, test_data_1_output_1.pb ...] ...]
|
|
#
|
|
def load_onnx_model_zoo_test_data(path, all_inputs_shape, data_type="fp32"):
|
|
print("Parsing test data in {} ...".format(path))
|
|
# p1 = subprocess.Popen(["find", path, "-name", "test_data_set*", "-type", "d"], stdout=subprocess.PIPE)
|
|
p1 = subprocess.Popen(["find", path, "-name", "test_data*", "-type", "d"], stdout=subprocess.PIPE)
|
|
p2 = subprocess.Popen(["sort"], stdin=p1.stdout, stdout=subprocess.PIPE)
|
|
stdout, sterr = p2.communicate()
|
|
stdout = stdout.decode("ascii").strip()
|
|
test_data_set_dir = stdout.split("\n")
|
|
print(test_data_set_dir)
|
|
|
|
inputs = []
|
|
outputs = []
|
|
|
|
shape_flag = False
|
|
# if not empty means input shape has been parsed before.
|
|
if len(all_inputs_shape) > 0:
|
|
shape_flag = True
|
|
|
|
# find test data path
|
|
for test_data_dir in test_data_set_dir:
|
|
pwd = os.getcwd()
|
|
os.chdir(test_data_dir)
|
|
|
|
# load inputs
|
|
p1 = subprocess.Popen(["find", ".", "-name", "input*"], stdout=subprocess.PIPE)
|
|
p2 = subprocess.Popen(["sort"], stdin=p1.stdout, stdout=subprocess.PIPE)
|
|
stdout, sterr = p2.communicate()
|
|
stdout = stdout.decode("ascii").strip()
|
|
input_data = stdout.split("\n")
|
|
print(input_data)
|
|
|
|
input_data_pb = []
|
|
for data in input_data:
|
|
tensor = onnx.TensorProto()
|
|
with open(data, 'rb') as f:
|
|
tensor.ParseFromString(f.read())
|
|
|
|
tensor_to_array = numpy_helper.to_array(tensor)
|
|
|
|
if data_type == "fp16" and tensor_to_array.dtype == np.dtype(np.float32):
|
|
tensor_to_array = tensor_to_array.astype(np.float16)
|
|
input_data_pb.append(tensor_to_array)
|
|
|
|
# print(np.array(input_data_pb[-1]).shape)
|
|
if not shape_flag:
|
|
all_inputs_shape.append(input_data_pb[-1].shape)
|
|
print(all_inputs_shape[-1])
|
|
inputs.append(input_data_pb)
|
|
print('Loaded {} inputs successfully.'.format(len(inputs)))
|
|
|
|
# load outputs
|
|
p1 = subprocess.Popen(["find", ".", "-name", "output*"], stdout=subprocess.PIPE)
|
|
p2 = subprocess.Popen(["sort"], stdin=p1.stdout, stdout=subprocess.PIPE)
|
|
stdout, sterr = p2.communicate()
|
|
stdout = stdout.decode("ascii").strip()
|
|
output_data = stdout.split("\n")
|
|
print(output_data)
|
|
|
|
if len(output_data) > 0 and output_data[0] != '':
|
|
output_data_pb = []
|
|
for data in output_data:
|
|
tensor = onnx.TensorProto()
|
|
with open(data, 'rb') as f:
|
|
tensor.ParseFromString(f.read())
|
|
|
|
tensor_to_array = numpy_helper.to_array(tensor)
|
|
|
|
if data_type == "fp16" and tensor_to_array.dtype == np.dtype(np.float32):
|
|
tensor_to_array = tensor_to_array.astype(np.float16)
|
|
output_data_pb.append(tensor_to_array)
|
|
|
|
print(np.array(output_data_pb[-1]).shape)
|
|
outputs.append(output_data_pb)
|
|
print('Loaded {} outputs successfully.'.format(len(outputs)))
|
|
|
|
os.chdir(pwd)
|
|
|
|
return inputs, outputs
|
|
|
|
def generate_onnx_model_random_input(test_times, ref_input):
|
|
inputs = []
|
|
|
|
for i in range(test_times):
|
|
|
|
input_data = []
|
|
for tensor in ref_input:
|
|
shape = tensor.shape
|
|
dtype = tensor.dtype
|
|
if dtype == np.int8 or \
|
|
dtype == np.uint8 or \
|
|
dtype == np.int16 or \
|
|
dtype == np.uint16 or \
|
|
dtype == np.int32 or \
|
|
dtype == np.uint32 or \
|
|
dtype == np.int64 or \
|
|
dtype == np.uint64:
|
|
new_tensor = np.random.randint(0, np.max(tensor)+1, shape, dtype)
|
|
else:
|
|
new_tensor = np.random.random_sample(shape).astype(dtype)
|
|
print("original tensor:")
|
|
print(tensor)
|
|
print("new random tensor:")
|
|
print(new_tensor)
|
|
print("\n")
|
|
input_data.append(new_tensor)
|
|
inputs.append(input_data)
|
|
|
|
return inputs
|
|
|
|
def validate(all_ref_outputs, all_outputs, decimal):
|
|
print('Reference {} results.'.format(len(all_ref_outputs)))
|
|
print('Predicted {} results.'.format(len(all_outputs)))
|
|
print('decimal {}'.format(decimal))
|
|
# print(np.array(all_ref_outputs).shape)
|
|
# print(np.array(all_outputs).shape)
|
|
|
|
try:
|
|
for i in range(len(all_outputs)):
|
|
ref_outputs = all_ref_outputs[i]
|
|
outputs = all_outputs[i]
|
|
|
|
for j in range(len(outputs)):
|
|
ref_output = ref_outputs[j]
|
|
output = outputs[j]
|
|
# print(ref_output)
|
|
# print(output)
|
|
|
|
# Compare the results with reference outputs up to x decimal places
|
|
for ref_o, o in zip(ref_output, output):
|
|
# abs(desired-actual) < 1.5 * 10**(-decimal)
|
|
np.testing.assert_almost_equal(ref_o, o, decimal)
|
|
except Exception as e:
|
|
logger.error(e)
|
|
return False, e
|
|
|
|
print('ONNX Runtime outputs are similar to reference outputs!')
|
|
return True, None
|
|
|
|
# not use for this script
|
|
def cleanup_files():
|
|
files = []
|
|
p = subprocess.Popen(["find", ".", "-name", "test_data_set*", "-type", "d"], stdout=subprocess.PIPE)
|
|
stdout, sterr = p.communicate()
|
|
stdout = stdout.decode("ascii").strip()
|
|
files = files + stdout.split("\n")
|
|
|
|
p = subprocess.Popen(["find", ".", "-name", "*.onnx"], stdout=subprocess.PIPE)
|
|
stdout, sterr = p.communicate()
|
|
stdout = stdout.decode("ascii").strip()
|
|
files = files + stdout.split("\n")
|
|
|
|
p = subprocess.Popen(["find", ".", "-name", "*.gz"], stdout=subprocess.PIPE)
|
|
stdout, sterr = p.communicate()
|
|
stdout = stdout.decode("ascii").strip()
|
|
files = files + stdout.split("\n")
|
|
|
|
for f in files:
|
|
if "custom_test_data" in f:
|
|
print(f)
|
|
continue
|
|
subprocess.Popen(["rm","-rf", f], stdout=subprocess.PIPE)
|
|
|
|
def remove_profiling_files(path):
|
|
files = []
|
|
p = subprocess.Popen(["find", path, "-name", "onnxruntime_profile*"], stdout=subprocess.PIPE)
|
|
stdout, sterr = p.communicate()
|
|
stdout = stdout.decode("ascii").strip()
|
|
files = files + stdout.split("\n")
|
|
|
|
for f in files:
|
|
if "custom_test_data" in f:
|
|
continue
|
|
subprocess.Popen(["rm","-rf", f], stdout=subprocess.PIPE)
|
|
|
|
|
|
def update_fail_report(fail_results, args, model, ep, e_type, e):
|
|
result = {}
|
|
|
|
result["model"] = model
|
|
result["ep"] = ep
|
|
result["error type"] = e_type
|
|
result["error message"] = re.sub('^\n', '', str(e))
|
|
|
|
fail_results.append(result)
|
|
|
|
|
|
def update_fail_model(model_ep_fail_map, fail_results, args, model_name, ep, e_type, e):
|
|
|
|
if not model_name in model_ep_fail_map:
|
|
model_ep_fail_map[model_name] = [ep]
|
|
else:
|
|
if ep not in model_ep_fail_map[model_name]:
|
|
model_ep_fail_map[model_name].append(ep)
|
|
|
|
update_fail_report(fail_results, args, model_name, ep, e_type, e)
|
|
|
|
# If TRT fails, TRT FP16 should fail as well
|
|
if ep == 'TensorrtExecutionProvider':
|
|
ep_ = "TensorrtExecutionProvider_fp16"
|
|
e_ = "Not benchmarking TRT FP16 since TRT failed already."
|
|
update_fail_report(fail_results, args, model_name, ep_, e_type, e_)
|
|
model_ep_fail_map[model_name].append(ep_)
|
|
|
|
|
|
def skip_ep(model_name, ep, model_ep_fail_map):
|
|
if model_name == 'vision-yolov3' and "fp16" in ep:
|
|
return True
|
|
|
|
if model_name == 'speech' and "fp16" in ep:
|
|
return True
|
|
|
|
if model_name not in model_ep_fail_map:
|
|
return False
|
|
|
|
ep_fail_list = model_ep_fail_map[model_name]
|
|
|
|
if ep in ep_fail_list:
|
|
return True
|
|
|
|
return False
|
|
|
|
def read_model_ep_fail_map_from_file(map_file):
|
|
with open(map_file) as f:
|
|
try:
|
|
data = json.load(f)
|
|
except Exception as e:
|
|
return None
|
|
|
|
return data
|
|
|
|
def write_model_ep_fail_map_to_file(model_ep_fail_map):
|
|
with open('.model_ep_fail_map.json', 'w') as file:
|
|
file.write(json.dumps(model_ep_fail_map)) # use `json.loads` to do the reverse
|
|
|
|
def get_system_info(info):
|
|
info["cuda"] = get_cuda_version()
|
|
info["trt"] = get_trt_version()
|
|
|
|
p = subprocess.Popen(["cat", "/etc/os-release"], stdout=subprocess.PIPE)
|
|
stdout, sterr = p.communicate()
|
|
stdout = stdout.decode("ascii").strip()
|
|
stdout = stdout.split("\n")[:2]
|
|
infos = []
|
|
for row in stdout:
|
|
row = re.sub('=', ': ', row)
|
|
row = re.sub('"', '', row)
|
|
infos.append(row)
|
|
info["linux_distro"] = infos
|
|
|
|
p = subprocess.Popen(["lscpu"], stdout=subprocess.PIPE)
|
|
stdout, sterr = p.communicate()
|
|
stdout = stdout.decode("ascii").strip()
|
|
stdout = stdout.split("\n")
|
|
infos = []
|
|
for row in stdout:
|
|
if "mode" in row or "Arch" in row or "name" in row:
|
|
# row = row.replace(":\s+", ": ")
|
|
row = re.sub(': +', ': ', row)
|
|
infos.append(row)
|
|
info["cpu_info"] = infos
|
|
|
|
p1 = subprocess.Popen(["lspci", "-v"], stdout=subprocess.PIPE)
|
|
p2 = subprocess.Popen(["grep", "NVIDIA"], stdin=p1.stdout, stdout=subprocess.PIPE)
|
|
stdout, sterr = p2.communicate()
|
|
stdout = stdout.decode("ascii").strip()
|
|
stdout = stdout.split("\n")
|
|
infos = []
|
|
for row in stdout:
|
|
row = re.sub('.*:', '', row)
|
|
infos.append(row)
|
|
info["gpu_info"] = infos
|
|
|
|
p = subprocess.Popen(["cat", "/proc/meminfo"], stdout=subprocess.PIPE)
|
|
stdout, sterr = p.communicate()
|
|
stdout = stdout.decode("ascii").strip()
|
|
stdout = stdout.split("\n")
|
|
infos = []
|
|
for row in stdout:
|
|
if "Mem" in row:
|
|
row = re.sub(': +', ': ', row)
|
|
infos.append(row)
|
|
info["memory"] = infos
|
|
|
|
def parse_models_info(path):
|
|
models = {}
|
|
|
|
with open(path) as f:
|
|
data = json.load(f)
|
|
|
|
for row in data:
|
|
|
|
if 'model_name' in row:
|
|
models[row['model_name']] = {}
|
|
else:
|
|
logger.error('Model name must be provided in models_info.json')
|
|
raise
|
|
|
|
model = models[row['model_name']]
|
|
|
|
if 'working_directory' in row:
|
|
model['working_directory'] = row['working_directory']
|
|
else:
|
|
logger.error('Model path must be provided in models_info.json')
|
|
raise
|
|
|
|
if 'model_path' in row:
|
|
model['model_path'] = row['model_path']
|
|
else:
|
|
logger.error('Model path must be provided in models_info.json')
|
|
raise
|
|
|
|
if 'test_data_path' in row:
|
|
model['test_data_path'] = row['test_data_path']
|
|
else:
|
|
logger.error('Test data path must be provided in models_info.json')
|
|
raise
|
|
|
|
return models
|
|
|
|
def convert_model_from_float_to_float16(model_path):
|
|
# from onnxmltools.utils.float16_converter import convert_float_to_float16
|
|
from onnxmltools.utils import load_model, save_model
|
|
from float16 import convert_float_to_float16
|
|
|
|
onnx_model = load_model(model_path)
|
|
new_onnx_model = convert_float_to_float16(onnx_model)
|
|
save_model(new_onnx_model, 'new_fp16_model.onnx')
|
|
|
|
return os.path.join(os.getcwd(), "new_fp16_model.onnx")
|
|
|
|
def create_session(model_path, providers, session_options):
|
|
logger.info(model_path)
|
|
try:
|
|
session = onnxruntime.InferenceSession(model_path, providers=providers, sess_options=session_options)
|
|
return session
|
|
except:
|
|
logger.info("Use symbolic_shape_infer.py")
|
|
|
|
try:
|
|
new_model_path = model_path[:].replace(".onnx", "_new.onnx")
|
|
|
|
if not os.path.exists(new_model_path):
|
|
subprocess.run("python3 -m onnxruntime.tools.symbolic_shape_infer --input " + model_path + " --output " + new_model_path + " --auto_merge", shell=True, check=True)
|
|
session = onnxruntime.InferenceSession(new_model_path, providers=providers, sess_options=session_options)
|
|
return session
|
|
except Exception as e:
|
|
print(e)
|
|
raise
|
|
|
|
def run_onnxruntime(args, models):
|
|
|
|
success_results = []
|
|
fail_results = []
|
|
latency_comparison_map = {} # model -> CUDA/TRT latency
|
|
profile_metrics_map = {} # model -> metrics from profiling file
|
|
model_ep_fail_map = {} # model -> failing ep
|
|
|
|
|
|
# read failing ep information if file exists
|
|
if args.running_mode == 'benchmark':
|
|
if os.path.exists('.model_ep_fail_map.json'):
|
|
model_ep_fail_map = read_model_ep_fail_map_from_file('.model_ep_fail_map.json')
|
|
|
|
if args.fp16:
|
|
ep_list = ["CUDAExecutionProvider", "TensorrtExecutionProvider", "CUDAExecutionProvider_fp16", "TensorrtExecutionProvider_fp16"]
|
|
else:
|
|
ep_list = ["CUDAExecutionProvider", "TensorrtExecutionProvider"]
|
|
|
|
validation_exemption = ["TensorrtExecutionProvider_fp16"]
|
|
|
|
#######################
|
|
# iterate model
|
|
#######################
|
|
for name, info in models.items():
|
|
latency_result = {}
|
|
path = info["working_directory"]
|
|
|
|
pwd = os.getcwd()
|
|
if not os.path.exists(path):
|
|
os.mkdir(path)
|
|
os.chdir(path)
|
|
path = os.getcwd()
|
|
|
|
# cleanup files before running a new inference
|
|
if args.running_mode == "validate":
|
|
remove_profiling_files(path)
|
|
|
|
inputs = []
|
|
ref_outputs = []
|
|
inputs_fp32 = []
|
|
ref_outputs_fp32 = []
|
|
inputs_fp16 = []
|
|
ref_outputs_fp16 = []
|
|
all_inputs_shape = [] # use for standalone trt
|
|
ep_to_ep_op_map = {} # ep -> { ep -> operator }
|
|
profile_already_parsed = set()
|
|
|
|
|
|
#######################
|
|
# iterate ep
|
|
#######################
|
|
for ep in ep_list:
|
|
|
|
if skip_ep(name, ep, model_ep_fail_map):
|
|
continue
|
|
|
|
ep_ = ep_to_provider_list[ep][0]
|
|
if (ep_ not in onnxruntime.get_available_providers()):
|
|
logger.error("No {} support".format(ep_))
|
|
continue
|
|
|
|
model_path = info["model_path"]
|
|
|
|
if "fp16" in ep:
|
|
fp16 = True
|
|
os.environ["ORT_TENSORRT_FP16_ENABLE"] = "1"
|
|
|
|
if ep == "CUDAExecutionProvider_fp16":
|
|
model_path = convert_model_from_float_to_float16(model_path)
|
|
|
|
logger.info("\nInitializing {} with float16 enabled to run on {} ...".format(name, ep))
|
|
else:
|
|
fp16 = False
|
|
os.environ["ORT_TENSORRT_FP16_ENABLE"] = "0"
|
|
logger.info("\nInitializing {} to run on {} ...".format(name, ep))
|
|
|
|
|
|
test_data_dir = info["test_data_path"]
|
|
|
|
# read input/output of test data
|
|
if fp16 and ep == "CUDAExecutionProvider_fp16":
|
|
if not inputs_fp16 or not ref_outputs_fp16:
|
|
inputs_fp16, ref_outputs_fp16 = load_onnx_model_zoo_test_data(test_data_dir, all_inputs_shape, "fp16")
|
|
|
|
inputs = inputs_fp16
|
|
ref_outputs = ref_outputs_fp16
|
|
else:
|
|
if not inputs_fp32 or not ref_outputs_fp32:
|
|
inputs_fp32, ref_outputs_fp32 = load_onnx_model_zoo_test_data(test_data_dir, all_inputs_shape)
|
|
|
|
inputs = inputs_fp32
|
|
ref_outputs = ref_outputs_fp32
|
|
|
|
|
|
if args.input_data == "random":
|
|
inputs = generate_onnx_model_random_input(args.test_times+1, inputs[0])
|
|
|
|
#######################################
|
|
# benchmark or validation
|
|
#######################################
|
|
if args.running_mode == 'benchmark':
|
|
logger.info("===========================")
|
|
logger.info("======== benchmark ========")
|
|
logger.info("===========================")
|
|
|
|
options = onnxruntime.SessionOptions()
|
|
options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
|
|
|
|
# create onnxruntime inference session
|
|
try:
|
|
sess = create_session(model_path, ep_to_provider_list[ep], options)
|
|
|
|
except Exception as e:
|
|
logger.error(e)
|
|
# update_fail_model(model_ep_fail_map, fail_results, args, name, ep, e)
|
|
continue
|
|
|
|
logger.info("[start] Begin to inference {} with {} ...".format(name, ep))
|
|
logger.info(sess.get_providers())
|
|
|
|
if sess:
|
|
logger.info("Model inputs nodes:")
|
|
for input_meta in sess.get_inputs():
|
|
logger.info(input_meta)
|
|
logger.info("Model outputs nodes:")
|
|
for output_meta in sess.get_outputs():
|
|
logger.info(output_meta)
|
|
|
|
|
|
batch_size = 1
|
|
result_template = {
|
|
"engine": "onnxruntime",
|
|
"version": onnxruntime.__version__,
|
|
"device": ep,
|
|
"fp16": fp16,
|
|
"io_binding": False,
|
|
"model_name": name,
|
|
"inputs": len(sess.get_inputs()),
|
|
"batch_size": batch_size,
|
|
"sequence_length": 1,
|
|
"datetime": str(datetime.now()),}
|
|
|
|
result = inference_ort(args, name, sess, ep, inputs, result_template, args.test_times, batch_size)
|
|
if result:
|
|
success_results.append(result)
|
|
logger.info(result)
|
|
|
|
latency_result[ep] = {}
|
|
latency_result[ep]["average_latency_ms"] = result["average_latency_ms"]
|
|
latency_result[ep]["latency_90_percentile"] = result["latency_90_percentile"]
|
|
|
|
# get standalone TensorRT perf
|
|
if "TensorrtExecutionProvider" in ep and args.trtexec:
|
|
result = run_trt_standalone(args.trtexec, model_path, sess.get_inputs(), all_inputs_shape, fp16)
|
|
if result and len(result) > 0:
|
|
if fp16:
|
|
latency_result["Standalone_TRT_fp16"] = result
|
|
else:
|
|
latency_result["Standalone_TRT"] = result
|
|
|
|
latency_comparison_map[name] = copy.deepcopy(latency_result)
|
|
|
|
|
|
|
|
elif args.running_mode == 'validate':
|
|
logger.info("==========================")
|
|
logger.info("======== validate ========")
|
|
logger.info("==========================")
|
|
|
|
# enable profiling to generate profiling file for analysis
|
|
options = onnxruntime.SessionOptions()
|
|
options.enable_profiling = True
|
|
options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
|
|
time.sleep(1) # avoid to generate same profile file name
|
|
|
|
# create onnxruntime inference session
|
|
try:
|
|
sess = create_session(model_path, ep_to_provider_list[ep], options)
|
|
|
|
except Exception as e:
|
|
logger.error(e)
|
|
update_fail_model(model_ep_fail_map, fail_results, args, name, ep, 'runtime error', e)
|
|
continue
|
|
|
|
sess.disable_fallback()
|
|
|
|
logger.info("Start to inference {} with {} ...".format(name, ep))
|
|
logger.info(sess.get_providers())
|
|
|
|
if sess:
|
|
logger.info("Model inputs nodes:")
|
|
for input_meta in sess.get_inputs():
|
|
logger.info(input_meta)
|
|
logger.info("Model outputs nodes:")
|
|
for output_meta in sess.get_outputs():
|
|
logger.info(output_meta)
|
|
|
|
# run inference and validate the result
|
|
#
|
|
# currently skip TensorRT float16 validation intentionally
|
|
if ep not in validation_exemption:
|
|
try:
|
|
ort_outputs = inference_ort_and_get_prediction(name, sess, inputs)
|
|
|
|
decimal = 0
|
|
|
|
status = validate(ref_outputs, ort_outputs, decimal)
|
|
if not status[0]:
|
|
update_fail_model(model_ep_fail_map, fail_results, args, name, ep, 'result accuracy issue', status[1])
|
|
continue
|
|
except Exception as e:
|
|
logger.error(e)
|
|
update_fail_model(model_ep_fail_map, fail_results, args, name, ep, 'runtime error', e)
|
|
continue
|
|
|
|
# Run inference again. the reason is that some ep like tensorrt
|
|
# it takes much longer time to generate graph on first run and
|
|
# we need to skip the perf result of that expensive run.
|
|
inference_ort_and_get_prediction(name, sess, inputs)
|
|
else:
|
|
inference_ort_and_get_prediction(name, sess, inputs)
|
|
inference_ort_and_get_prediction(name, sess, inputs)
|
|
|
|
sess.end_profiling()
|
|
|
|
# get metrics from profiling file
|
|
metrics = get_profile_metrics(path, profile_already_parsed)
|
|
if metrics:
|
|
print(ep)
|
|
ep_to_ep_op_map[ep] = metrics
|
|
|
|
####################
|
|
# end of iterate ep
|
|
####################
|
|
|
|
|
|
# get percentage of execution time and operators in TRT
|
|
if len(ep_to_ep_op_map) > 0:
|
|
trt_op_map = None
|
|
trt_fp16_op_map = None
|
|
cuda_op_map = None
|
|
cuda_fp16_op_map = None
|
|
|
|
for ep, op_map in ep_to_ep_op_map.items():
|
|
if ep == "CUDAExecutionProvider":
|
|
cuda_op_map = op_map
|
|
elif ep == "CUDAExecutionProvider_fp16":
|
|
cuda_fp16_op_map = op_map
|
|
elif ep == "TensorrtExecutionProvider":
|
|
trt_op_map = op_map
|
|
elif ep == "TensorrtExecutionProvider_fp16":
|
|
trt_fp16_op_map = op_map
|
|
|
|
profile_metrics_map[name] = {}
|
|
|
|
if cuda_op_map:
|
|
profile_metrics_map[name]['ratio_of_ops_in_cuda_not_fallback_cpu'] = calculate_cuda_op_percentage(cuda_op_map)
|
|
|
|
if trt_op_map:
|
|
total_trt_execution_time, total_execution_time, ratio_of_execution_time_in_trt = calculate_trt_latency_percentage(trt_op_map)
|
|
profile_metrics_map[name]['total_trt_execution_time'] = total_trt_execution_time
|
|
profile_metrics_map[name]['total_execution_time'] = total_execution_time
|
|
profile_metrics_map[name]['ratio_of_execution_time_in_trt'] = ratio_of_execution_time_in_trt
|
|
if cuda_op_map:
|
|
total_ops_in_trt, total_ops, ratio_of_ops_in_trt = calculate_trt_op_percentage(trt_op_map, cuda_op_map)
|
|
profile_metrics_map[name]['total_ops_in_trt'] = total_ops_in_trt
|
|
profile_metrics_map[name]['total_ops'] = total_ops
|
|
profile_metrics_map[name]['ratio_of_ops_in_trt'] = ratio_of_ops_in_trt
|
|
|
|
if trt_fp16_op_map:
|
|
total_trt_execution_time, total_execution_time, ratio_of_execution_time_in_trt = calculate_trt_latency_percentage(trt_fp16_op_map)
|
|
name_ = name + " (FP16)"
|
|
profile_metrics_map[name_] = {}
|
|
profile_metrics_map[name_]['total_trt_execution_time'] = total_trt_execution_time
|
|
profile_metrics_map[name_]['total_execution_time'] = total_execution_time
|
|
profile_metrics_map[name_]['ratio_of_execution_time_in_trt'] = ratio_of_execution_time_in_trt
|
|
if cuda_fp16_op_map:
|
|
total_ops_in_trt, total_ops, ratio_of_ops_in_trt = calculate_trt_op_percentage(trt_fp16_op_map, cuda_op_map)
|
|
profile_metrics_map[name_]['total_ops_in_trt'] = total_ops_in_trt
|
|
profile_metrics_map[name_]['total_ops'] = total_ops
|
|
profile_metrics_map[name_]['ratio_of_ops_in_trt'] = ratio_of_ops_in_trt
|
|
|
|
if debug:
|
|
pp = pprint.PrettyPrinter(indent=4)
|
|
print('CUDA operator map:')
|
|
pp.pprint(cuda_op_map)
|
|
print('TRT operator map:')
|
|
pp.pprint(trt_op_map)
|
|
print('CUDA FP16 operator map:')
|
|
pp.pprint(cuda_fp16_op_map)
|
|
print('TRT FP16 operator map:')
|
|
pp.pprint(trt_fp16_op_map)
|
|
|
|
# cleanup_files()
|
|
os.chdir(pwd)
|
|
|
|
# end of model
|
|
|
|
return success_results, fail_results, latency_comparison_map, model_ep_fail_map, profile_metrics_map
|
|
|
|
def add_improvement_information(latency_comparison_map):
|
|
for key, value in latency_comparison_map.items():
|
|
if not ('TensorrtExecutionProvider' in value and 'CUDAExecutionProvider' in value):
|
|
continue
|
|
|
|
trt_latency = float(value['TensorrtExecutionProvider']['average_latency_ms'])
|
|
cuda_latency = float(value['CUDAExecutionProvider']['average_latency_ms'])
|
|
gain = (cuda_latency - trt_latency)*100/cuda_latency
|
|
value["Tensorrt_gain(%)"] = "{:.2f} %".format(gain)
|
|
|
|
if "TensorrtExecutionProvider_fp16" in value and "CUDAExecutionProvider_fp16" in value:
|
|
trt_fp16_latency = float(value['TensorrtExecutionProvider_fp16']['average_latency_ms'])
|
|
cuda_fp16_latency = float(value['CUDAExecutionProvider_fp16']['average_latency_ms'])
|
|
gain = (cuda_fp16_latency - trt_fp16_latency)*100/cuda_fp16_latency
|
|
value["Tensorrt_fp16_gain(%)"] = "{:.2f} %".format(gain)
|
|
|
|
def output_details(results, csv_filename):
|
|
with open(csv_filename, mode="a", newline='') as csv_file:
|
|
column_names = [
|
|
"engine", "version", "device", "fp16", "io_binding", "model_name", "inputs", "batch_size",
|
|
"sequence_length", "datetime", "test_times", "QPS", "average_latency_ms", "latency_variance",
|
|
"latency_90_percentile", "latency_95_percentile", "latency_99_percentile"
|
|
]
|
|
|
|
csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
|
|
csv_writer.writeheader()
|
|
for result in results:
|
|
csv_writer.writerow(result)
|
|
|
|
logger.info(f"Detail results are saved to csv file: {csv_filename}")
|
|
|
|
def output_fail(results, csv_filename):
|
|
with open(csv_filename, mode="a", newline='') as csv_file:
|
|
column_names = [
|
|
"model", "ep", "error type", "error message"
|
|
]
|
|
|
|
csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
|
|
csv_writer.writeheader()
|
|
for result in results:
|
|
csv_writer.writerow(result)
|
|
|
|
logger.info(f"Failing results are saved to csv file: {csv_filename}")
|
|
|
|
def output_latency(results, csv_filename):
|
|
with open(csv_filename, mode="a", newline='') as csv_file:
|
|
column_names = ["Model",
|
|
"CUDA \nmean (ms)",
|
|
"CUDA \n90th percentile (ms)",
|
|
"TRT EP \nmean (ms)",
|
|
"TRT EP \n90th percentile (ms)",
|
|
"Standalone TRT \nmean (ms)",
|
|
"Standalone TRT \n90th percentile (ms)",
|
|
"CUDA fp16 \nmean (ms)",
|
|
"CUDA fp16 \n90th percentile (ms)",
|
|
"TRT EP fp16 \nmean (ms)",
|
|
"TRT EP fp16 \n90 percentile (ms)",
|
|
"Standalone TRT fp16 \nmean (ms)",
|
|
"Standalone TRT fp16 \n90th percentile (ms)",
|
|
"TRT EP \ngain (mean) (%)",
|
|
"TRT EP fp16 \ngain (mean) (%)"]
|
|
csv_writer = csv.writer(csv_file)
|
|
csv_writer.writerow(column_names)
|
|
|
|
for key, value in results.items():
|
|
cuda_average = ""
|
|
if 'CUDAExecutionProvider' in value and 'average_latency_ms' in value['CUDAExecutionProvider']:
|
|
cuda_average = value['CUDAExecutionProvider']['average_latency_ms']
|
|
|
|
cuda_99_percentile = ""
|
|
if 'CUDAExecutionProvider' in value and 'latency_90_percentile' in value['CUDAExecutionProvider']:
|
|
cuda_99_percentile = value['CUDAExecutionProvider']['latency_90_percentile']
|
|
|
|
trt_average = ""
|
|
if 'TensorrtExecutionProvider' in value and 'average_latency_ms' in value['TensorrtExecutionProvider']:
|
|
trt_average = value['TensorrtExecutionProvider']['average_latency_ms']
|
|
|
|
trt_99_percentile = ""
|
|
if 'TensorrtExecutionProvider' in value and 'latency_90_percentile' in value['TensorrtExecutionProvider']:
|
|
trt_99_percentile = value['TensorrtExecutionProvider']['latency_90_percentile']
|
|
|
|
standalone_trt_average = ""
|
|
if 'Standalone_TRT' in value and 'average_latency_ms' in value['Standalone_TRT']:
|
|
standalone_trt_average = value['Standalone_TRT']['average_latency_ms']
|
|
|
|
standalone_trt_99_percentile = ""
|
|
if 'Standalone_TRT' in value and 'latency_90_percentile' in value['Standalone_TRT']:
|
|
standalone_trt_99_percentile = value['Standalone_TRT']['latency_90_percentile']
|
|
|
|
|
|
cuda_fp16_average = ""
|
|
if 'CUDAExecutionProvider_fp16' in value and 'average_latency_ms' in value['CUDAExecutionProvider_fp16']:
|
|
cuda_fp16_average = value['CUDAExecutionProvider_fp16']['average_latency_ms']
|
|
|
|
cuda_fp16_99_percentile = ""
|
|
if 'CUDAExecutionProvider_fp16' in value and 'latency_90_percentile' in value['CUDAExecutionProvider_fp16']:
|
|
cuda_fp16_99_percentile = value['CUDAExecutionProvider_fp16']['latency_90_percentile']
|
|
|
|
trt_fp16_average = ""
|
|
if 'TensorrtExecutionProvider_fp16' in value and 'average_latency_ms' in value['TensorrtExecutionProvider_fp16']:
|
|
trt_fp16_average = value['TensorrtExecutionProvider_fp16']['average_latency_ms']
|
|
|
|
trt_fp16_99_percentile = ""
|
|
if 'TensorrtExecutionProvider_fp16' in value and 'latency_90_percentile' in value['TensorrtExecutionProvider_fp16']:
|
|
trt_fp16_99_percentile = value['TensorrtExecutionProvider_fp16']['latency_90_percentile']
|
|
|
|
standalone_trt_fp16_average = ""
|
|
if 'Standalone_TRT_fp16' in value and 'average_latency_ms' in value['Standalone_TRT_fp16']:
|
|
standalone_trt_fp16_average = value['Standalone_TRT_fp16']['average_latency_ms']
|
|
|
|
standalone_trt_fp16_99_percentile = ""
|
|
if 'Standalone_TRT_fp16' in value and 'latency_90_percentile' in value['Standalone_TRT_fp16']:
|
|
standalone_trt_fp16_99_percentile = value['Standalone_TRT_fp16']['latency_90_percentile']
|
|
|
|
|
|
row = [key,
|
|
cuda_average,
|
|
cuda_99_percentile,
|
|
trt_average,
|
|
trt_99_percentile,
|
|
standalone_trt_average,
|
|
standalone_trt_99_percentile,
|
|
cuda_fp16_average,
|
|
cuda_fp16_99_percentile,
|
|
trt_fp16_average,
|
|
trt_fp16_99_percentile,
|
|
standalone_trt_fp16_average,
|
|
standalone_trt_fp16_99_percentile,
|
|
value['Tensorrt_gain(%)'] if 'Tensorrt_gain(%)' in value else " ",
|
|
value['Tensorrt_fp16_gain(%)'] if 'Tensorrt_fp16_gain(%)' in value else " "
|
|
]
|
|
csv_writer.writerow(row)
|
|
|
|
logger.info(f"CUDA/TRT latency comparison are saved to csv file: {csv_filename}")
|
|
|
|
def output_ratio(results, csv_filename):
|
|
with open(csv_filename, mode="a", newline='') as csv_file:
|
|
column_names = ["Model",
|
|
"% CUDA operators (not fall back to CPU)",
|
|
"Total TRT operators",
|
|
"Total operators",
|
|
"% TRT operator",
|
|
"Total TRT execution time",
|
|
"Total execution time",
|
|
"% TRT execution time"]
|
|
csv_writer = csv.writer(csv_file)
|
|
csv_writer.writerow(column_names)
|
|
|
|
for key, value in results.items():
|
|
row = [key,
|
|
value['ratio_of_ops_in_cuda_not_fallback_cpu'] if 'ratio_of_ops_in_cuda_not_fallback_cpu' in value else " ",
|
|
value['total_ops_in_trt'] if 'total_ops_in_trt' in value else " ",
|
|
value['total_ops'] if 'total_ops' in value else " ",
|
|
value['ratio_of_ops_in_trt'] if 'ratio_of_ops_in_trt' in value else " ",
|
|
value['total_trt_execution_time'] if 'total_trt_execution_time' in value else " ",
|
|
value['total_execution_time'] if 'total_execution_time' in value else " ",
|
|
value['ratio_of_execution_time_in_trt'] if 'ratio_of_execution_time_in_trt' in value else " ",
|
|
]
|
|
csv_writer.writerow(row)
|
|
|
|
logger.info(f"Tensorrt ratio metrics are saved to csv file: {csv_filename}")
|
|
|
|
def output_system_info(result, csv_filename):
|
|
with open(csv_filename, mode="a", newline='') as csv_file:
|
|
column_names = [
|
|
"cpu_info", "cuda", "gpu_info", "linux_distro", "memory", "trt"
|
|
]
|
|
|
|
csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
|
|
csv_writer.writeheader()
|
|
csv_writer.writerow(result)
|
|
|
|
logger.info(f"System information are saved to csv file: {csv_filename}")
|
|
|
|
|
|
def parse_arguments():
|
|
parser = argparse.ArgumentParser()
|
|
|
|
parser.add_argument("-m", "--model_list_file", required=False, default="model_list.json", help="Model list file.")
|
|
|
|
parser.add_argument("-r", "--running_mode", required=False, default="benchmark", choices=["validate", "benchmark"], help="Testing mode.")
|
|
|
|
parser.add_argument("-i", "--input_data", required=False, default="zoo", choices=["zoo", "random"], help="source of input data.")
|
|
|
|
parser.add_argument("--fp16", required=False, default=True, action="store_true", help="Inlcude Float16 into benchmarking.")
|
|
|
|
parser.add_argument("--trtexec", required=False, default=None, help="trtexec executable path.")
|
|
|
|
parser.add_argument("-t",
|
|
"--test_times",
|
|
required=False,
|
|
default=1,
|
|
type=int,
|
|
help="Number of repeat times to get average inference latency.")
|
|
|
|
args = parser.parse_args()
|
|
return args
|
|
|
|
def setup_logger(verbose):
|
|
if verbose:
|
|
coloredlogs.install(level='DEBUG', fmt='[%(filename)s:%(lineno)s - %(funcName)20s()] %(message)s')
|
|
else:
|
|
coloredlogs.install(fmt='%(message)s')
|
|
logging.getLogger("transformers").setLevel(logging.WARNING)
|
|
|
|
def main():
|
|
args = parse_arguments()
|
|
setup_logger(False)
|
|
pp = pprint.PrettyPrinter(indent=4)
|
|
|
|
models = parse_models_info(args.model_list_file)
|
|
|
|
perf_start_time = datetime.now()
|
|
success_results, fail_results, latency_comparison_map, failing_models, profile_metrics_map = run_onnxruntime(args, models)
|
|
perf_end_time = datetime.now()
|
|
|
|
logger.info("\nTotal time for running/profiling all models: {}".format(perf_end_time - perf_start_time))
|
|
logger.info(list(models.keys()))
|
|
|
|
logger.info("\nTotal models: {}".format(len(models)))
|
|
logger.info("Fail models: {}".format(len(failing_models)))
|
|
logger.info("Models FAIL/SUCCESS: {}/{}".format(len(failing_models), len(models) - len(failing_models)))
|
|
|
|
path = "result"
|
|
if not os.path.exists(path):
|
|
os.mkdir(path)
|
|
|
|
path = os.path.join(os.getcwd(), path)
|
|
if not os.path.exists(path):
|
|
os.mkdir(path)
|
|
|
|
time_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
|
|
|
|
if len(failing_models) > 0:
|
|
logger.info("\n============================================")
|
|
logger.info("========== Failing Models/EPs ==============")
|
|
logger.info("============================================")
|
|
logger.info(failing_models)
|
|
write_model_ep_fail_map_to_file(failing_models)
|
|
|
|
if latency_comparison_map:
|
|
logger.info("\n=========================================")
|
|
logger.info("=========== CUDA/TRT latency ===========")
|
|
logger.info("=========================================")
|
|
add_improvement_information(latency_comparison_map)
|
|
pp.pprint(latency_comparison_map)
|
|
csv_filename = f"benchmark_latency_{time_stamp}.csv"
|
|
csv_filename = os.path.join(path, csv_filename)
|
|
output_latency(latency_comparison_map, csv_filename)
|
|
|
|
if len(profile_metrics_map) > 0:
|
|
logger.info("\n========================================")
|
|
logger.info("========== TRT detail metrics ==========")
|
|
logger.info("========================================")
|
|
pp.pprint(profile_metrics_map)
|
|
csv_filename = f"benchmark_ratio_{time_stamp}.csv"
|
|
csv_filename = os.path.join(path, csv_filename)
|
|
output_ratio(profile_metrics_map, csv_filename)
|
|
|
|
|
|
logger.info("\n===========================================")
|
|
logger.info("=========== System information ===========")
|
|
logger.info("===========================================")
|
|
info = {}
|
|
get_system_info(info)
|
|
pp.pprint(info)
|
|
csv_filename = os.path.join(path, f"system_info_{time_stamp}.csv")
|
|
output_system_info(info, csv_filename)
|
|
|
|
if fail_results:
|
|
csv_filename = f"benchmark_fail_{time_stamp}.csv"
|
|
csv_filename = os.path.join(path, csv_filename)
|
|
output_fail(fail_results, csv_filename)
|
|
|
|
if success_results:
|
|
csv_filename = f"benchmark_success_{time_stamp}.csv"
|
|
csv_filename = os.path.join(path, csv_filename)
|
|
output_details(success_results, csv_filename)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|