mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-24 22:17:32 +00:00
Refactor EP Perf Tool (#6202)
* merge master, keep postprocess status commit * download float16.py everytime * using variables to reference eps * adding ACL EP to ep perf tool * accuracy with absolute tolerance configurable * add acl to dict + remove commented line
This commit is contained in:
parent
46e0e4e69f
commit
c8de3f355a
4 changed files with 170 additions and 137 deletions
|
|
@ -17,20 +17,29 @@ from perf_utils import *
|
|||
import pprint
|
||||
import time
|
||||
from float16 import *
|
||||
# import torch
|
||||
|
||||
debug = False
|
||||
sys.path.append('.')
|
||||
logger = logging.getLogger('')
|
||||
|
||||
ep_to_provider_list = {
|
||||
"CPUExecutionProvider": ["CPUExecutionProvider"],
|
||||
"CUDAExecutionProvider": ["CUDAExecutionProvider"],
|
||||
"CUDAExecutionProvider_fp16": ["CUDAExecutionProvider"],
|
||||
"TensorrtExecutionProvider": ["TensorrtExecutionProvider", "CUDAExecutionProvider"],
|
||||
"TensorrtExecutionProvider_fp16": ["TensorrtExecutionProvider", "CUDAExecutionProvider"],
|
||||
}
|
||||
# global ep variables
|
||||
cpu = "CPUExecutionProvider"
|
||||
acl = "ACLExecutionProvider"
|
||||
cuda = "CUDAExecutionProvider"
|
||||
cuda_fp16 = "CUDAExecutionProvider_fp16"
|
||||
trt = "TensorrtExecutionProvider"
|
||||
trt_fp16 = "TensorrtExecutionProvider_fp16"
|
||||
standalone_trt = "Standalone_TRT"
|
||||
standalone_trt_fp16 = "Standalone_TRT_fp16"
|
||||
|
||||
ep_to_provider_list = {
|
||||
cpu: [cpu],
|
||||
acl: [acl],
|
||||
cuda: [cuda],
|
||||
cuda_fp16: [cuda],
|
||||
trt: [trt, cuda],
|
||||
trt_fp16: [trt, cuda],
|
||||
}
|
||||
|
||||
# metadata
|
||||
FAIL_MODEL_FILE = ".fail_model_map"
|
||||
|
|
@ -212,6 +221,20 @@ def inference_ort_and_get_prediction(name, session, ort_inputs):
|
|||
|
||||
return ort_outputs
|
||||
|
||||
def get_acl_version():
|
||||
from pathlib import Path
|
||||
home = str(Path.home())
|
||||
p = subprocess.run(["find", home, "-name", "libarm_compute.so"], check=True, stdout=subprocess.PIPE)
|
||||
libarm_compute_path = p.stdout.decode("ascii").strip()
|
||||
if libarm_compute_path == '':
|
||||
return "No Compute Library Found"
|
||||
else:
|
||||
p = subprocess.run(["strings", libarm_compute_path], check=True, stdout=subprocess.PIPE)
|
||||
libarm_so_strings = p.stdout.decode("ascii").strip()
|
||||
version_match = re.search(r'arm_compute_version.*\n', libarm_so_strings)
|
||||
version = version_match.group(0).split(' ')[0]
|
||||
return version
|
||||
|
||||
def get_cuda_version():
|
||||
from pathlib import Path
|
||||
home = str(Path.home())
|
||||
|
|
@ -396,14 +419,14 @@ def generate_onnx_model_random_input(test_times, ref_input):
|
|||
|
||||
return inputs
|
||||
|
||||
def validate(all_ref_outputs, all_outputs, decimal):
|
||||
def validate(all_ref_outputs, all_outputs, rtol=0, atol=1.5):
|
||||
if len(all_ref_outputs) == 0:
|
||||
logger.info("No reference output provided.")
|
||||
return True, None
|
||||
|
||||
logger.info('Reference {} results.'.format(len(all_ref_outputs)))
|
||||
logger.info('Predicted {} results.'.format(len(all_outputs)))
|
||||
logger.info('decimal {}'.format(decimal))
|
||||
logger.info('rtol: {}, atol: {}'.format(rtol, atol))
|
||||
|
||||
try:
|
||||
for i in range(len(all_outputs)):
|
||||
|
|
@ -414,10 +437,10 @@ def validate(all_ref_outputs, all_outputs, decimal):
|
|||
ref_output = ref_outputs[j]
|
||||
output = outputs[j]
|
||||
|
||||
# Compare the results with reference outputs up to x decimal places
|
||||
# Compare the results with reference outputs
|
||||
for ref_o, o in zip(ref_output, output):
|
||||
# abs(desired-actual) < 1.5 * 10**(-decimal)
|
||||
np.testing.assert_almost_equal(ref_o, o, decimal)
|
||||
# abs(desired-actual) < rtol * abs(desired) + atol
|
||||
np.testing.assert_allclose(ref_o, o, rtol, atol)
|
||||
except Exception as e:
|
||||
logger.error(e)
|
||||
return False, e
|
||||
|
|
@ -483,7 +506,7 @@ def update_metrics_map(model_to_metrics, model_name, ep_to_operator):
|
|||
if ep not in model_to_metrics[model_name]:
|
||||
model_to_metrics[model_name][ep] = {}
|
||||
|
||||
if ep == "CUDAExecutionProvider" or ep == "CUDAExecutionProvider_fp16":
|
||||
if ep == cuda or ep == cuda_fp16:
|
||||
model_to_metrics[model_name][ep]['ratio_of_ops_in_cuda_not_fallback_cpu'] = calculate_cuda_op_percentage(op_map)
|
||||
model_to_metrics[model_name][ep]['total_ops'] = get_total_ops(op_map)
|
||||
else:
|
||||
|
|
@ -504,13 +527,13 @@ def update_metrics_map_ori(model_to_metrics, name, ep_to_operator):
|
|||
cuda_fp16_op_map = None
|
||||
|
||||
for ep, op_map in ep_to_operator.items():
|
||||
if ep == "CUDAExecutionProvider":
|
||||
if ep == cuda:
|
||||
cuda_op_map = op_map
|
||||
elif ep == "CUDAExecutionProvider_fp16":
|
||||
elif ep == cuda_fp16:
|
||||
cuda_fp16_op_map = op_map
|
||||
elif ep == "TensorrtExecutionProvider":
|
||||
elif ep == trt:
|
||||
trt_op_map = op_map
|
||||
elif ep == "TensorrtExecutionProvider_fp16":
|
||||
elif ep == trt_fp16:
|
||||
trt_fp16_op_map = op_map
|
||||
|
||||
|
||||
|
|
@ -575,8 +598,8 @@ def update_fail_model_map(model_to_fail_ep, model_name, ep, e_type, e):
|
|||
model_to_fail_ep[model_name][ep] = new_map
|
||||
|
||||
# If TRT fails, TRT FP16 should fail as well
|
||||
if ep == 'TensorrtExecutionProvider':
|
||||
ep_ = "TensorrtExecutionProvider_fp16"
|
||||
if ep == trt:
|
||||
ep_ = trt_fp16
|
||||
e_ = "skip benchmarking since TRT failed already."
|
||||
new_map_1 = {}
|
||||
new_map_1["error_type"] = e_type
|
||||
|
|
@ -595,8 +618,8 @@ def update_fail_model_map_ori(model_to_fail_ep, fail_results, model_name, ep, e_
|
|||
update_fail_report(fail_results, model_name, ep, e_type, e)
|
||||
|
||||
# If TRT fails, TRT FP16 should fail as well
|
||||
if ep == 'TensorrtExecutionProvider':
|
||||
ep_ = "TensorrtExecutionProvider_fp16"
|
||||
if ep == trt:
|
||||
ep_ = trt_fp16
|
||||
error_message_ = "skip benchmarking since TRT failed already."
|
||||
update_fail_report(fail_results, model_name, ep_, e_type, error_message_)
|
||||
model_to_fail_ep[model_name][ep_] = e_type
|
||||
|
|
@ -771,10 +794,10 @@ def parse_models_info_from_directory(path, models):
|
|||
parse_models_info_from_directory(os.path.join(path, dir), models)
|
||||
|
||||
|
||||
def parse_models_info_from_file(default_dir, path, models):
|
||||
def parse_models_info_from_file(root_dir, path, models):
|
||||
|
||||
# default working directory
|
||||
root_working_directory = default_dir
|
||||
root_working_directory = root_dir
|
||||
|
||||
with open(path) as f:
|
||||
data = json.load(f)
|
||||
|
|
@ -877,11 +900,11 @@ def run_onnxruntime(args, models):
|
|||
ep_list.append(args.ep)
|
||||
else:
|
||||
if args.fp16:
|
||||
ep_list = ["CPUExecutionProvider", "CUDAExecutionProvider", "TensorrtExecutionProvider", "CUDAExecutionProvider_fp16", "TensorrtExecutionProvider_fp16"]
|
||||
ep_list = [cpu, cuda, trt, cuda_fp16, trt_fp16]
|
||||
else:
|
||||
ep_list = ["CPUExecutionProvider", "CUDAExecutionProvider", "TensorrtExecutionProvider"]
|
||||
ep_list = [cpu, cuda, trt]
|
||||
|
||||
validation_exemption = ["TensorrtExecutionProvider_fp16"]
|
||||
validation_exemption = [trt_fp16]
|
||||
|
||||
|
||||
if os.path.exists(FAIL_MODEL_FILE):
|
||||
|
|
@ -927,7 +950,7 @@ def run_onnxruntime(args, models):
|
|||
model_path = model_info["model_path"]
|
||||
test_data_dir = model_info["test_data_path"]
|
||||
|
||||
if ep == "CUDAExecutionProvider_fp16":
|
||||
if ep == cuda_fp16:
|
||||
logger.info("[Initialize] model = {}, ep = {} ,FP16 = True ...".format(name, ep))
|
||||
fp16 = True
|
||||
os.environ["ORT_TENSORRT_FP16_ENABLE"] = "1"
|
||||
|
|
@ -952,7 +975,7 @@ def run_onnxruntime(args, models):
|
|||
else:
|
||||
inputs, ref_outputs = get_test_data(True, test_data_dir, all_inputs_shape)
|
||||
|
||||
elif ep == "TensorrtExecutionProvider_fp16":
|
||||
elif ep == trt_fp16:
|
||||
logger.info("[Initialize] model = {}, ep = {} ,FP16 = True ...".format(name, ep))
|
||||
fp16 = True
|
||||
os.environ["ORT_TENSORRT_FP16_ENABLE"] = "1"
|
||||
|
|
@ -1020,13 +1043,13 @@ def run_onnxruntime(args, models):
|
|||
latency_result[ep]["latency_90_percentile"] = result["latency_90_percentile"]
|
||||
|
||||
# get standalone TensorRT perf
|
||||
if "TensorrtExecutionProvider" in ep and args.trtexec:
|
||||
if trt in ep and args.trtexec:
|
||||
result = run_trt_standalone(args.trtexec, model_path, sess.get_inputs(), all_inputs_shape, fp16)
|
||||
if result and len(result) > 0:
|
||||
if fp16:
|
||||
latency_result["Standalone_TRT_fp16"] = result
|
||||
latency_result[standalone_trt_fp16] = result
|
||||
else:
|
||||
latency_result["Standalone_TRT"] = result
|
||||
latency_result[standalone_trt] = result
|
||||
|
||||
model_to_latency[name] = copy.deepcopy(latency_result)
|
||||
|
||||
|
|
@ -1072,8 +1095,7 @@ def run_onnxruntime(args, models):
|
|||
try:
|
||||
ort_outputs = inference_ort_and_get_prediction(name, sess, inputs)
|
||||
|
||||
decimal = 0
|
||||
status = validate(ref_outputs, ort_outputs, decimal)
|
||||
status = validate(ref_outputs, ort_outputs)
|
||||
if not status[0]:
|
||||
update_fail_model_map(model_to_fail_ep, name, ep, 'result accuracy issue', status[1])
|
||||
continue
|
||||
|
|
@ -1117,17 +1139,17 @@ def run_onnxruntime(args, models):
|
|||
|
||||
def add_improvement_information(model_to_latency):
|
||||
for key, value in model_to_latency.items():
|
||||
if not ('TensorrtExecutionProvider' in value and 'CUDAExecutionProvider' in value):
|
||||
if not (trt in value and cuda in value):
|
||||
continue
|
||||
|
||||
trt_latency = float(value['TensorrtExecutionProvider']['average_latency_ms'])
|
||||
cuda_latency = float(value['CUDAExecutionProvider']['average_latency_ms'])
|
||||
trt_latency = float(value[trt]['average_latency_ms'])
|
||||
cuda_latency = float(value[cuda]['average_latency_ms'])
|
||||
gain = (cuda_latency - trt_latency)*100/cuda_latency
|
||||
value["Tensorrt_gain(%)"] = "{:.2f} %".format(gain)
|
||||
|
||||
if "TensorrtExecutionProvider_fp16" in value and "CUDAExecutionProvider_fp16" in value:
|
||||
trt_fp16_latency = float(value['TensorrtExecutionProvider_fp16']['average_latency_ms'])
|
||||
cuda_fp16_latency = float(value['CUDAExecutionProvider_fp16']['average_latency_ms'])
|
||||
if trt_fp16 in value and cuda_fp16 in value:
|
||||
trt_fp16_latency = float(value[trt_fp16]['average_latency_ms'])
|
||||
cuda_fp16_latency = float(value[cuda_fp16]['average_latency_ms'])
|
||||
gain = (cuda_fp16_latency - trt_fp16_latency)*100/cuda_fp16_latency
|
||||
value["Tensorrt_fp16_gain(%)"] = "{:.2f} %".format(gain)
|
||||
|
||||
|
|
@ -1211,55 +1233,55 @@ def output_status(results, csv_filename):
|
|||
|
||||
with open(csv_filename, mode="a", newline='') as csv_file:
|
||||
column_names = ["Model",
|
||||
"CPU",
|
||||
"CUDA fp32",
|
||||
"TRT fp32",
|
||||
"Standalone TRT fp32",
|
||||
"CUDA fp16",
|
||||
"TRT fp16",
|
||||
"Standalone TRT fp16"
|
||||
cpu,
|
||||
cuda + " fp32",
|
||||
trt + " fp32",
|
||||
standalone_trt + " fp32",
|
||||
cuda + " fp16",
|
||||
trt + " fp16",
|
||||
standalone_trt + "fp16"
|
||||
]
|
||||
|
||||
csv_writer = csv.writer(csv_file)
|
||||
|
||||
if need_write_header:
|
||||
csv_writer.writerow(column_names)
|
||||
|
||||
cpu = ""
|
||||
cuda_fp32 = ""
|
||||
trt_fp32 = ""
|
||||
standalone_fp32 = ""
|
||||
cuda_fp16 = ""
|
||||
trt_fp16 = ""
|
||||
standalone_fp16 = ""
|
||||
cpu_status = ""
|
||||
cuda_fp32_status = ""
|
||||
trt_fp32_status = ""
|
||||
standalone_fp32_status = ""
|
||||
cuda_fp16_status = ""
|
||||
trt_fp16_status = ""
|
||||
standalone_fp16_status = ""
|
||||
|
||||
for model_name, ep_dict in results.items():
|
||||
for ep, status in ep_dict.items():
|
||||
if ep == "CPUExecutionProvider":
|
||||
cpu = status
|
||||
elif ep == "CUDAExecutionProvider":
|
||||
cuda_fp32 = status
|
||||
elif ep == "TensorrtExecutionProvider":
|
||||
trt_fp32 = status
|
||||
elif ep == "Standalone_TRT":
|
||||
standalone_fp32 = status
|
||||
elif ep == "CUDAExecutionProvider_fp16":
|
||||
cuda_fp16 = status
|
||||
elif ep == "TensorrtExecutionProvider_fp16":
|
||||
trt_fp16 = status
|
||||
elif ep == "Standalone_TRT_fp16":
|
||||
standalone_fp16 = status
|
||||
if ep == cpu:
|
||||
cpu_status = status
|
||||
elif ep == cuda:
|
||||
cuda_fp32_status = status
|
||||
elif ep == trt:
|
||||
trt_fp32_status = status
|
||||
elif ep == standalone_trt:
|
||||
standalone_fp32_status = status
|
||||
elif ep == cuda_fp16:
|
||||
cuda_fp16_status = status
|
||||
elif ep == trt_fp16:
|
||||
trt_fp16_status = status
|
||||
elif ep == standalone_trt_fp16:
|
||||
standalone_fp16_status = status
|
||||
else:
|
||||
continue
|
||||
|
||||
row = [model_name,
|
||||
cpu,
|
||||
cuda_fp32,
|
||||
trt_fp32,
|
||||
standalone_fp32,
|
||||
cuda_fp16,
|
||||
trt_fp16,
|
||||
standalone_fp16
|
||||
]
|
||||
cuda_fp32_status,
|
||||
trt_fp32_status,
|
||||
standalone_fp32_status,
|
||||
cuda_fp16_status,
|
||||
trt_fp16_status,
|
||||
standalone_fp16_status]
|
||||
csv_writer.writerow(row)
|
||||
|
||||
def output_latency(results, csv_filename):
|
||||
|
|
@ -1292,61 +1314,61 @@ def output_latency(results, csv_filename):
|
|||
|
||||
for key, value in results.items():
|
||||
cpu_average = ""
|
||||
if "CPUExecutionProvider" in value and "average_latency_ms" in value["CPUExecutionProvider"]:
|
||||
cpu_average = value["CPUExecutionProvider"]["average_latency_ms"]
|
||||
if cpu in value and "average_latency_ms" in value[cpu]:
|
||||
cpu_average = value[cpu]["average_latency_ms"]
|
||||
|
||||
cpu_90_percentile = ""
|
||||
if "CPUExecutionProvider" in value and "latency_90_percentile" in value["CPUExecutionProvider"]:
|
||||
cpu_90_percentile = value["CPUExecutionProvider"]["latency_90_percentile"]
|
||||
if cpu in value and "latency_90_percentile" in value[cpu]:
|
||||
cpu_90_percentile = value[cpu]["latency_90_percentile"]
|
||||
|
||||
cuda_average = ""
|
||||
if 'CUDAExecutionProvider' in value and 'average_latency_ms' in value['CUDAExecutionProvider']:
|
||||
cuda_average = value['CUDAExecutionProvider']['average_latency_ms']
|
||||
if cuda in value and 'average_latency_ms' in value[cuda]:
|
||||
cuda_average = value[cuda]['average_latency_ms']
|
||||
|
||||
cuda_90_percentile = ""
|
||||
if 'CUDAExecutionProvider' in value and 'latency_90_percentile' in value['CUDAExecutionProvider']:
|
||||
cuda_90_percentile = value['CUDAExecutionProvider']['latency_90_percentile']
|
||||
if cuda in value and 'latency_90_percentile' in value[cuda]:
|
||||
cuda_90_percentile = value[cuda]['latency_90_percentile']
|
||||
|
||||
trt_average = ""
|
||||
if 'TensorrtExecutionProvider' in value and 'average_latency_ms' in value['TensorrtExecutionProvider']:
|
||||
trt_average = value['TensorrtExecutionProvider']['average_latency_ms']
|
||||
if trt in value and 'average_latency_ms' in value[trt]:
|
||||
trt_average = value[trt]['average_latency_ms']
|
||||
|
||||
trt_90_percentile = ""
|
||||
if 'TensorrtExecutionProvider' in value and 'latency_90_percentile' in value['TensorrtExecutionProvider']:
|
||||
trt_90_percentile = value['TensorrtExecutionProvider']['latency_90_percentile']
|
||||
if trt in value and 'latency_90_percentile' in value[trt]:
|
||||
trt_90_percentile = value[trt]['latency_90_percentile']
|
||||
|
||||
standalone_trt_average = ""
|
||||
if 'Standalone_TRT' in value and 'average_latency_ms' in value['Standalone_TRT']:
|
||||
standalone_trt_average = value['Standalone_TRT']['average_latency_ms']
|
||||
if standalone_trt in value and 'average_latency_ms' in value[standalone_trt]:
|
||||
standalone_trt_average = value[standalone_trt]['average_latency_ms']
|
||||
|
||||
standalone_trt_90_percentile = ""
|
||||
if 'Standalone_TRT' in value and 'latency_90_percentile' in value['Standalone_TRT']:
|
||||
standalone_trt_90_percentile = value['Standalone_TRT']['latency_90_percentile']
|
||||
if standalone_trt in value and 'latency_90_percentile' in value[standalone_trt]:
|
||||
standalone_trt_90_percentile = value[standalone_trt]['latency_90_percentile']
|
||||
|
||||
|
||||
cuda_fp16_average = ""
|
||||
if 'CUDAExecutionProvider_fp16' in value and 'average_latency_ms' in value['CUDAExecutionProvider_fp16']:
|
||||
cuda_fp16_average = value['CUDAExecutionProvider_fp16']['average_latency_ms']
|
||||
if cuda_fp16 in value and 'average_latency_ms' in value[cuda_fp16]:
|
||||
cuda_fp16_average = value[cuda_fp16]['average_latency_ms']
|
||||
|
||||
cuda_fp16_90_percentile = ""
|
||||
if 'CUDAExecutionProvider_fp16' in value and 'latency_90_percentile' in value['CUDAExecutionProvider_fp16']:
|
||||
cuda_fp16_90_percentile = value['CUDAExecutionProvider_fp16']['latency_90_percentile']
|
||||
if cuda_fp16 in value and 'latency_90_percentile' in value[cuda_fp16]:
|
||||
cuda_fp16_90_percentile = value[cuda_fp16]['latency_90_percentile']
|
||||
|
||||
trt_fp16_average = ""
|
||||
if 'TensorrtExecutionProvider_fp16' in value and 'average_latency_ms' in value['TensorrtExecutionProvider_fp16']:
|
||||
trt_fp16_average = value['TensorrtExecutionProvider_fp16']['average_latency_ms']
|
||||
if trt_fp16 in value and 'average_latency_ms' in value[trt_fp16]:
|
||||
trt_fp16_average = value[trt_fp16]['average_latency_ms']
|
||||
|
||||
trt_fp16_90_percentile = ""
|
||||
if 'TensorrtExecutionProvider_fp16' in value and 'latency_90_percentile' in value['TensorrtExecutionProvider_fp16']:
|
||||
trt_fp16_90_percentile = value['TensorrtExecutionProvider_fp16']['latency_90_percentile']
|
||||
if trt_fp16 in value and 'latency_90_percentile' in value[trt_fp16]:
|
||||
trt_fp16_90_percentile = value[trt_fp16]['latency_90_percentile']
|
||||
|
||||
standalone_trt_fp16_average = ""
|
||||
if 'Standalone_TRT_fp16' in value and 'average_latency_ms' in value['Standalone_TRT_fp16']:
|
||||
standalone_trt_fp16_average = value['Standalone_TRT_fp16']['average_latency_ms']
|
||||
if standalone_trt in value and 'average_latency_ms' in value[standalone_trt_fp16]:
|
||||
standalone_trt_fp16_average = value[standalone_trt]['average_latency_ms']
|
||||
|
||||
standalone_trt_fp16_90_percentile = ""
|
||||
if 'Standalone_TRT_fp16' in value and 'latency_90_percentile' in value['Standalone_TRT_fp16']:
|
||||
standalone_trt_fp16_90_percentile = value['Standalone_TRT_fp16']['latency_90_percentile']
|
||||
if standalone_trt in value and 'latency_90_percentile' in value[standalone_trt_fp16]:
|
||||
standalone_trt_fp16_90_percentile = value[standalone_trt]['latency_90_percentile']
|
||||
|
||||
|
||||
row = [key,
|
||||
|
|
@ -1390,41 +1412,41 @@ def output_metrics(model_to_metrics, csv_filename):
|
|||
result["model_name"] = model
|
||||
result_fp16["model_name"] = model + " (FP16)"
|
||||
|
||||
if "CUDAExecutionProvider" in ep_info:
|
||||
result['ratio_of_ops_in_cuda_not_fallback_cpu'] = ep_info["CUDAExecutionProvider"]['ratio_of_ops_in_cuda_not_fallback_cpu']
|
||||
if cuda in ep_info:
|
||||
result['ratio_of_ops_in_cuda_not_fallback_cpu'] = ep_info[cuda]['ratio_of_ops_in_cuda_not_fallback_cpu']
|
||||
|
||||
if "TensorrtExecutionProvider" in ep_info:
|
||||
result['total_trt_execution_time'] = ep_info["TensorrtExecutionProvider"]['total_trt_execution_time']
|
||||
result['total_execution_time'] = ep_info["TensorrtExecutionProvider"]['total_execution_time']
|
||||
result['ratio_of_execution_time_in_trt'] = ep_info["TensorrtExecutionProvider"]['ratio_of_execution_time_in_trt']
|
||||
if trt in ep_info:
|
||||
result['total_trt_execution_time'] = ep_info[trt]['total_trt_execution_time']
|
||||
result['total_execution_time'] = ep_info[trt]['total_execution_time']
|
||||
result['ratio_of_execution_time_in_trt'] = ep_info[trt]['ratio_of_execution_time_in_trt']
|
||||
|
||||
if "CUDAExecutionProvider" in ep_info and "TensorrtExecutionProvider" in ep_info:
|
||||
if cuda in ep_info and trt in ep_info:
|
||||
########################################################################################
|
||||
# equation of % TRT ops:
|
||||
# (total ops in cuda json - cuda and cpu ops in trt json)/ total ops in cuda json
|
||||
########################################################################################
|
||||
total_ops_in_cuda = ep_info["CUDAExecutionProvider"]["total_ops"]
|
||||
cuda_cpu_ops_in_trt = ep_info["TensorrtExecutionProvider"]["total_ops"]
|
||||
total_ops_in_cuda = ep_info[cuda]["total_ops"]
|
||||
cuda_cpu_ops_in_trt = ep_info[trt]["total_ops"]
|
||||
|
||||
result['total_ops_in_trt'] = total_ops_in_cuda - cuda_cpu_ops_in_trt
|
||||
result['total_ops'] = total_ops_in_cuda
|
||||
result['ratio_of_ops_in_trt'] = (total_ops_in_cuda - cuda_cpu_ops_in_trt) / total_ops_in_cuda
|
||||
|
||||
if "CUDAExecutionProvider_fp16" in ep_info:
|
||||
result_fp16['ratio_of_ops_in_cuda_not_fallback_cpu'] = ep_info["CUDAExecutionProvider_fp16"]['ratio_of_ops_in_cuda_not_fallback_cpu']
|
||||
if cuda_fp16 in ep_info:
|
||||
result_fp16['ratio_of_ops_in_cuda_not_fallback_cpu'] = ep_info[cuda_fp16]['ratio_of_ops_in_cuda_not_fallback_cpu']
|
||||
|
||||
if "TensorrtExecutionProvider_fp16" in ep_info:
|
||||
result_fp16['total_trt_execution_time'] = ep_info["TensorrtExecutionProvider_fp16"]['total_trt_execution_time']
|
||||
result_fp16['total_execution_time'] = ep_info["TensorrtExecutionProvider_fp16"]['total_execution_time']
|
||||
result_fp16['ratio_of_execution_time_in_trt'] = ep_info["TensorrtExecutionProvider_fp16"]['ratio_of_execution_time_in_trt']
|
||||
if trt_fp16 in ep_info:
|
||||
result_fp16['total_trt_execution_time'] = ep_info[trt_fp16]['total_trt_execution_time']
|
||||
result_fp16['total_execution_time'] = ep_info[trt_fp16]['total_execution_time']
|
||||
result_fp16['ratio_of_execution_time_in_trt'] = ep_info[trt_fp16]['ratio_of_execution_time_in_trt']
|
||||
|
||||
if "CUDAExecutionProvider_fp16" in ep_info and "TensorrtExecutionProvider_fp16" in ep_info:
|
||||
if cuda_fp16 in ep_info and trt_fp16 in ep_info:
|
||||
########################################################################################
|
||||
# equation of % TRT ops:
|
||||
# (total ops in cuda json - cuda and cpu ops in trt json)/ total ops in cuda json
|
||||
########################################################################################
|
||||
total_ops_in_cuda = ep_info["CUDAExecutionProvider_fp16"]["total_ops"]
|
||||
cuda_cpu_ops_in_trt = ep_info["TensorrtExecutionProvider_fp16"]["total_ops"]
|
||||
total_ops_in_cuda = ep_info[cuda_fp16]["total_ops"]
|
||||
cuda_cpu_ops_in_trt = ep_info[trt_fp16]["total_ops"]
|
||||
|
||||
result_fp16['total_ops_in_trt'] = total_ops_in_cuda - cuda_cpu_ops_in_trt
|
||||
result_fp16['total_ops'] = total_ops_in_cuda
|
||||
|
|
@ -1474,8 +1496,10 @@ def str2bool(v):
|
|||
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument("-c", "--comparison", required=False, default="cuda_trt", choices=["cuda_trt", "acl"], help="EPs to compare: CPU vs. CUDA vs. TRT or CPU vs. ACL")
|
||||
|
||||
parser.add_argument("-d", "--default_dir", required=False, default="~/", help="Perf folder path")
|
||||
parser.add_argument("-d", "--working_dir", required=False, default="./", help="Perf folder path")
|
||||
|
||||
parser.add_argument("-m", "--model_source", required=False, default="model_list.json", help="Model source: (1) model list file (2) model directory.")
|
||||
|
||||
|
|
@ -1518,7 +1542,7 @@ def setup_logger(verbose):
|
|||
def parse_models_helper(args, models):
|
||||
if ".json" in args.model_source:
|
||||
logger.info("Parsing model information from file ...")
|
||||
parse_models_info_from_file(args.default_dir, args.model_source, models)
|
||||
parse_models_info_from_file(args.working_dir, args.model_source, models)
|
||||
else:
|
||||
logger.info("Parsing model information from directory ...")
|
||||
parse_models_info_from_directory(args.model_source, models)
|
||||
|
|
|
|||
|
|
@ -15,6 +15,14 @@ def write_model_info_to_file(model, path):
|
|||
with open(path, 'w') as file:
|
||||
file.write(json.dumps(model)) # use `json.loads` to do the reverse
|
||||
|
||||
def get_ep_list(comparison):
|
||||
if comparison == 'acl':
|
||||
ep_list = [cpu, acl]
|
||||
else:
|
||||
# test with cuda and trt
|
||||
ep_list = [cpu, cuda, trt, cuda_fp16, trt_fp16]
|
||||
return ep_list
|
||||
|
||||
def main():
|
||||
args = parse_arguments()
|
||||
setup_logger(False)
|
||||
|
|
@ -41,13 +49,12 @@ def main():
|
|||
|
||||
model_list_file = os.path.join(os.getcwd(), model +'.json')
|
||||
write_model_info_to_file([model_info], model_list_file)
|
||||
ep_list = ["CPUExecutionProvider", "CUDAExecutionProvider", "TensorrtExecutionProvider", "CUDAExecutionProvider_fp16", "TensorrtExecutionProvider_fp16"]
|
||||
|
||||
ep_list = get_ep_list(args.comparison)
|
||||
for ep in ep_list:
|
||||
if args.running_mode == "validate":
|
||||
p = subprocess.run(["python3",
|
||||
"benchmark.py",
|
||||
"-d", args.default_dir,
|
||||
"-r", args.running_mode,
|
||||
"-m", model_list_file,
|
||||
"--ep", ep,
|
||||
|
|
@ -58,7 +65,6 @@ def main():
|
|||
elif args.running_mode == "benchmark":
|
||||
p = subprocess.run(["python3",
|
||||
"benchmark.py",
|
||||
"-d", args.default_dir,
|
||||
"-r", args.running_mode,
|
||||
"-m", model_list_file,
|
||||
"--ep", ep,
|
||||
|
|
|
|||
|
|
@ -11,9 +11,6 @@ SYMBOLIC_SHAPE_INFER_LINK="https://raw.githubusercontent.com/microsoft/onnxrunti
|
|||
FLOAT_16="float16.py"
|
||||
FLOAT_16_LINK="https://raw.githubusercontent.com/microsoft/onnxconverter-common/master/onnxconverter_common/float16.py"
|
||||
|
||||
# root working directory
|
||||
DEFAULT_DIR="./"
|
||||
|
||||
cleanup_files() {
|
||||
rm -f $FAIL_MODEL_FILE
|
||||
rm -f $LATENCY_FILE
|
||||
|
|
@ -36,8 +33,8 @@ update_files() {
|
|||
if [ "$1" == "many-models" ]
|
||||
then
|
||||
update_files
|
||||
python3 benchmark_wrapper.py -d $DEFAULT_DIR -r validate -m /home/hcsuser/mount/many-models -o result/"$1"
|
||||
python3 benchmark_wrapper.py -d $DEFAULT_DIR -r benchmark -i random -t 10 -m /home/hcsuser/mount/many-models -o result/"$1"
|
||||
python3 benchmark_wrapper.py -r validate -m /home/hcsuser/mount/many-models -o result/"$1"
|
||||
python3 benchmark_wrapper.py -r benchmark -i random -t 10 -m /home/hcsuser/mount/many-models -o result/"$1"
|
||||
fi
|
||||
|
||||
# ONNX model zoo
|
||||
|
|
@ -45,8 +42,8 @@ if [ "$1" == "onnx-zoo-models" ]
|
|||
then
|
||||
MODEL_LIST="model_list.json"
|
||||
update_files
|
||||
python3 benchmark_wrapper.py -d $DEFAULT_DIR -r validate -m $MODEL_LIST -o result/"$1"
|
||||
python3 benchmark_wrapper.py -d $DEFAULT_DIR -r benchmark -i random -t 10 -m $MODEL_LIST -o result/"$1"
|
||||
python3 benchmark_wrapper.py -r validate -m $MODEL_LIST -o result/"$1"
|
||||
python3 benchmark_wrapper.py -r benchmark -i random -t 10 -m $MODEL_LIST -o result/"$1"
|
||||
fi
|
||||
|
||||
# 1P models
|
||||
|
|
@ -54,8 +51,8 @@ if [ "$1" == "partner-models" ]
|
|||
then
|
||||
MODEL_LIST="partner_model_list.json"
|
||||
update_files
|
||||
python3 benchmark_wrapper.py -d $DEFAULT_DIR -r validate -m $MODEL_LIST -o result/"$1"
|
||||
python3 benchmark_wrapper.py -d $DEFAULT_DIR -r benchmark -i random -t 10 -m $MODEL_LIST -o result/"$1"
|
||||
python3 benchmark_wrapper.py -r validate -m $MODEL_LIST -o result/"$1"
|
||||
python3 benchmark_wrapper.py -r benchmark -i random -t 10 -m $MODEL_LIST -o result/"$1"
|
||||
fi
|
||||
|
||||
# Test models
|
||||
|
|
@ -63,6 +60,6 @@ if [ "$1" == "selected-models" ]
|
|||
then
|
||||
MODEL_LIST="selected_models.json"
|
||||
update_files
|
||||
python3 benchmark_wrapper.py -d $DEFAULT_DIR -r validate -m $MODEL_LIST -o result/"$1"
|
||||
python3 benchmark_wrapper.py -d $DEFAULT_DIR -r benchmark -i random -t 1 -m $MODEL_LIST -o result/"$1"
|
||||
python3 benchmark_wrapper.py -r validate -m $MODEL_LIST -o result/"$1"
|
||||
python3 benchmark_wrapper.py -r benchmark -i random -t 1 -m $MODEL_LIST -o result/"$1"
|
||||
fi
|
||||
|
|
|
|||
|
|
@ -8,6 +8,12 @@ import re
|
|||
debug = False
|
||||
debug_verbose = False
|
||||
|
||||
def find(regex_string):
|
||||
import glob
|
||||
results = glob.glob(regex_string)
|
||||
results.sort()
|
||||
return results
|
||||
|
||||
def get_latest_commit_hash():
|
||||
p1 = subprocess.Popen(["git", "rev-parse", "--short", "HEAD"], stdout = subprocess.PIPE)
|
||||
stdout, sterr = p1.communicate()
|
||||
|
|
|
|||
Loading…
Reference in a new issue