mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-22 22:01:08 +00:00
* Initialize tensorrt perf script
* Add bert-squad dependencies
* Modified code to make ort inference with CUDA/Tensorrt
* Add get CUDA/TRT version
* uncomment bert-squad
* Add BERT-SQUAD inputs.json
* Add FastRCNN
* Make preprocess/validation into common functions
* Add MaskRCNN and SSD and consolidate the code
* Add dependencies for MaskRCNN
* following modifications are made:
- create common fetch function to get inputs/outputs of model from ONNX model zoo.
- create common validation function to compare inference outputs with reference outputs from ONNX model zoo.
- move run/repeat time to argument list. (still working on other arguments, like fp16 or fp32, latency percentile).
- generate table in csv file to show the latency comparison (TRT vs CUDA) side by side.
* Add approach to analyze profiling file and also update model-related
settings
* Add models
* Add most of models from ONNX model zoo
* Add model input name and print all the model names at the end of run
* Add system info
* Add TRT fp16 support
* Refine the code
* Handle TRT fall back and modify the way to get input data
* Refine code
* Modify code
* Add more precise approach to measure inference
* Add io-binding
* Add YoLoV4
* Refine the code
* Refine the code
* Add models
* Add yolov4 notebook for jetson device
* Update notebook
* Update notebook
* Add CVS models
* Add missing model
* Add support of float16
* Add new way to get trt version
* Add "validate" and "benchmark" mode
* Add randomly generated input
* Refine perf script
* Refine the code.
* Add README
* Refine the code
* Update README.md
* Refine code
* Update README.md
* Remove all the model-related Python code and instead use model_list.json as
the models configuration.
Refine the benchmark.py
* Refine the code
Co-authored-by: Chi Lo <lochi@microsoft.com>
209 lines
6.6 KiB
Python
209 lines
6.6 KiB
Python
import subprocess
|
|
import json
|
|
import pprint
|
|
import logging
|
|
import coloredlogs
|
|
import re
|
|
|
|
# When True, calculate_trt_op_percentage and calculate_trt_latency_percentage
# print the totals and ratio they computed.
debug = False
|
|
# When True, parse_single_file prints the first-run and second-run op maps
# it collected for each execution provider.
debug_verbose = False
|
|
|
|
def parse_single_file(f):
    """Extract per-provider kernel durations from one profiling JSON file.

    The file is expected to contain a JSON list of event dicts.  Events are
    scanned in order.  A "Node" event is recorded when its "name" contains
    "kernel_time" and its "args" carry both "op_name" and "provider"; it is
    stored as ``provider -> {event name: dur}``.  Events seen before the
    first "Session" event named "model_run" go to a first-run map that is
    only used for debug output; events seen after it go to the map that is
    returned.  Scanning stops at the second "model_run" event.

    Args:
        f: readable file object holding the profiling JSON.

    Returns:
        dict mapping provider name to ``{event name: dur}``, or None when
        the content cannot be loaded as a JSON list or no "model_run" event
        was seen.
    """
    try:
        data = json.load(f)
    except Exception:
        # Best effort: an unreadable or malformed profile counts as "no data".
        return None

    # Anything other than a list of events cannot be a usable profile.
    if not isinstance(data, list):
        return None

    model_run_flag = False
    first_run_flag = True
    provider_op_map = {}  # ep -> map of operator to duration
    provider_op_map_first_run = {}  # ep -> map of operator to duration

    for row in data:
        if not isinstance(row, dict) or "cat" not in row:
            continue

        category = row["cat"]

        if category == "Session":
            if row.get("name") == "model_run":
                if not first_run_flag:
                    # Second model_run marker: the run of interest is complete.
                    break
                model_run_flag = True
                first_run_flag = False
            continue

        if category != "Node" or "name" not in row or "args" not in row:
            continue

        name = row["name"]
        if not isinstance(name, str) or "kernel_time" not in name:
            continue

        args = row["args"]
        if "op_name" not in args or "provider" not in args:
            continue

        provider = args["provider"]

        if first_run_flag:
            first_run_ops = provider_op_map_first_run.setdefault(provider, {})
            if name in first_run_ops:
                # A name repeated before the first model_run marker restarts
                # this provider's entry in the returned map with just this
                # event.
                # NOTE(review): kept as-is from the original logic; the
                # intent is not evident from this file -- confirm.
                provider_op_map[provider] = {name: row["dur"]}
            else:
                first_run_ops[name] = row["dur"]
        else:
            ops = provider_op_map.setdefault(provider, {})
            # avoid duplicated metrics
            if name not in ops:
                ops[name] = row["dur"]

    if debug_verbose:
        _print_op_maps("First run", provider_op_map_first_run)
        _print_op_maps("Second run", provider_op_map)

    if model_run_flag:
        return provider_op_map

    return None


def _print_op_maps(label, provider_maps):
    """Print each provider's op map, largest duration first, between banners."""
    try:
        # Keep the duration ordering instead of letting pprint re-sort by key.
        printer = pprint.PrettyPrinter(indent=4, sort_dicts=False)
    except TypeError:
        # Older Python without sort_dicts: fall back to the legacy override
        # of the sort helpers that pprint looks up at module level.
        pprint._sorted = lambda x: x
        pprint.sorted = lambda x, key=None: x
        printer = pprint.PrettyPrinter(indent=4)

    print("------{} ops map (START)------".format(label))
    for provider, ops in provider_maps.items():
        print(provider)
        printer.pprint(dict(sorted(ops.items(), key=lambda item: item[1], reverse=True)))
    print("------{} ops map (END) ------".format(label))
|
|
|
|
def calculate_cuda_op_percentage(cuda_op_map):
    """Return the share of CUDA + CPU ops that were assigned to CUDA.

    Args:
        cuda_op_map: mapping of execution provider name to that provider's
            op map; only the number of entries under
            'CUDAExecutionProvider' and 'CPUExecutionProvider' is used.

    Returns:
        ``cuda_ops / (cuda_ops + cpu_ops)``, or 0 when the map is empty/None
        or holds no CUDA or CPU ops at all.
    """
    if not cuda_op_map:
        return 0

    cuda_ops = len(cuda_op_map.get('CUDAExecutionProvider', ()))
    cpu_ops = len(cuda_op_map.get('CPUExecutionProvider', ()))

    counted_ops = cuda_ops + cpu_ops
    if counted_ops == 0:
        # Only other providers (or empty op maps) present: nothing to divide by.
        return 0

    return cuda_ops / counted_ops
|
|
|
|
##########################################
|
|
# Return: total ops executed in TRT,
|
|
# total ops,
|
|
# ratio of ops executed in TRT,
|
|
##########################################
|
|
def calculate_trt_op_percentage(trt_op_map, cuda_op_map):
    """Compute how many ops ran in TensorRT, by comparing two profiles.

    Equation of % TRT ops:
        (total ops in cuda json - cuda and cpu ops in trt json) / total ops in cuda json

    Only the entry counts under "CUDAExecutionProvider" and
    "CPUExecutionProvider" of each map are used.

    Args:
        trt_op_map: provider -> op map taken from the TensorRT profile.  An
            empty map is treated as "nothing ran in TRT".
        cuda_op_map: provider -> op map taken from the CUDA profile; its
            CUDA + CPU op count is the total op count.

    Returns:
        Tuple ``(ops executed in TRT, total ops, ratio of ops executed in TRT)``.

    Raises:
        RuntimeError: when cuda_op_map holds no CUDA or CPU ops, so no ratio
            can be computed.
    """
    fallback_providers = ("CUDAExecutionProvider", "CPUExecutionProvider")

    total_ops = sum(
        len(cuda_op_map[ep]) for ep in fallback_providers if ep in cuda_op_map
    )
    total_cuda_and_cpu_ops = sum(
        len(trt_op_map[ep]) for ep in fallback_providers if ep in trt_op_map
    )

    if total_ops == 0:
        # An explicit RuntimeError replaces a bare `raise`, which surfaced as
        # the opaque "No active exception to reraise".
        raise RuntimeError(
            "No CUDA/CPU ops found in the CUDA profile; cannot compute the ratio of TRT ops"
        )

    if len(trt_op_map) == 0:
        # No TRT profile data: count every op as CUDA/CPU so the ratio is 0.
        total_cuda_and_cpu_ops = total_ops

    ops_in_trt = total_ops - total_cuda_and_cpu_ops
    ratio_of_ops_in_trt = ops_in_trt / total_ops

    if debug:
        print("total_cuda_and_cpu_ops: {}".format(total_cuda_and_cpu_ops))
        print("total_ops: {}".format(total_ops))
        print("ratio_of_ops_in_trt: {}".format(ratio_of_ops_in_trt))

    return (ops_in_trt, total_ops, ratio_of_ops_in_trt)
|
|
|
|
|
|
##########################################
|
|
# Return: total TRT execution time,
|
|
# total execution time,
|
|
# ratio of execution time in TRT
|
|
##########################################
|
|
def calculate_trt_latency_percentage(trt_op_map):
    """Compute the share of execution time spent in TensorRT.

    Durations are summed (after ``int()`` conversion) per provider for
    "TensorrtExecutionProvider", "CUDAExecutionProvider" and
    "CPUExecutionProvider"; any other provider in the map is ignored.

    Args:
        trt_op_map: provider -> {op name: duration} map.

    Returns:
        Tuple ``(total TRT execution time, total execution time, ratio of
        execution time in TRT)``; the ratio is 0 when the total time is 0.
    """
    trt_time = 0
    overall_time = 0

    for provider in ("TensorrtExecutionProvider", "CUDAExecutionProvider", "CPUExecutionProvider"):
        if provider not in trt_op_map:
            continue

        provider_time = sum(int(duration) for duration in trt_op_map[provider].values())

        if provider == "TensorrtExecutionProvider":
            trt_time = provider_time

        overall_time += provider_time

    # Guard against dividing by zero when nothing was recorded.
    trt_share = 0 if overall_time == 0 else trt_time / overall_time

    if debug:
        print("total_trt_execution_time: {}".format(trt_time))
        print("total_execution_time: {}".format(overall_time))
        print("ratio_of_trt_execution_time: {}".format(trt_share))

    return (trt_time, overall_time, trt_share)
|
|
|
|
|
|
|
|
def get_profile_metrics(path, profile_already_parsed):
    """Parse the not-yet-seen profiling files under *path*.

    Files are located with ``find <path> -name "onnxruntime_profile*"`` and
    ordered by piping the "<modification time>\\t<file path>" listing through
    ``sort``.  Every file that is not already in *profile_already_parsed* is
    added to that set and handed to parse_single_file.

    Args:
        path: directory to search for profiling files.
        profile_already_parsed: set of file paths parsed earlier; it is
            updated in place with the files visited by this call.

    Returns:
        The op map of the last file in the sorted listing that produced one,
        or None when no file yielded any metrics (including when no
        profiling file was found).
    """
    print("Parsing/Analyzing profiling files in {} ...".format(path))
    p1 = subprocess.Popen(
        ["find", path, "-name", "onnxruntime_profile*", "-printf", "%T+\t%p\n"],
        stdout=subprocess.PIPE,
    )
    p2 = subprocess.Popen(["sort"], stdin=p1.stdout, stdout=subprocess.PIPE)
    # Drop our copy of the pipe so `find` is not kept alive by the parent if
    # `sort` exits first, and so the descriptor is not leaked.
    p1.stdout.close()
    stdout, _ = p2.communicate()
    p1.wait()

    listing = stdout.decode("utf-8").strip()
    # An empty listing must give an empty list, not [""].
    profiling_files = listing.split("\n") if listing else []
    print(profiling_files)

    data = []
    for entry in profiling_files:
        # Split once only: the file path itself may contain tab characters.
        fields = entry.split('\t', 1)
        if len(fields) != 2:
            continue

        profile = fields[1]
        if profile in profile_already_parsed:
            continue
        profile_already_parsed.add(profile)

        print("start to parse {} ...".format(profile))
        with open(profile) as f:
            op_map = parse_single_file(f)
        if op_map:
            data.append(op_map)

    if not data:
        print("No profile metrics got.")
        return None

    return data[-1]
|
|
|