onnxruntime/onnxruntime/python/tools/tensorrt/perf/perf_utils.py
Chi Lo 9f526f45ac
TensorRT Perf Tool (#4900)
* Initialize tensorrt perf script

* Add bert-squad dependencies

* Modified code to make ort inference with CUDA/Tensorrt

* Add get CUDA/TRT version

* uncomment bert-squad

* Add BERT-SQUAD inputs.json

* Add FastRCNN

* Make preprocess/validation into common functions

* Add MaskRCNN and SSD and consolidate the code

* Add dependencies for MaskRCNN

* following modifications are made:
    - create common fetch function to get inputs/outputs of model from ONNX model zoo.
    - create common validation function to compare inference outputs with reference outputs from ONNX model zoo.
    - move run/repeat time to argument list. (still working on other arguments, like fp16 or fp32, latency percentile).
    - generate table in csv file to show the latency comparison (TRT vs CUDA) side by side.

* Add approach to analyze profiling file and also update model-related
settings

* Add models

* Add most of models from ONNX model zoo

* Add model input name and print all the model names at the end of run

* Add system info

* Add TRT fp16 support

* Refine the code

* Handle TRT fall back and modify the way to get input data

* Refine code

* Modify code

* Add more precise approach to measure inference

* Add io-binding

* Add YoLoV4

* Refine the code

* Refine the code

* Add models

* Add yolov4 notebook for jetson device

* Update notebook

* Update notebook

* Add CVS models

* Add missing model

* Add support of float16

* Add new way to get trt version

* Add "validate" and "benchmark" mode

* Add randomly generated input

* Refine perf script

* Refine the code.

* Add README

* Refine the code

* Update README.md

* Refine code

* Update README.md

* Remove all the model-related Python code and instead use model_list.json as
the models configuration.

Refine the benchmark.py

* Refine the code

Co-authored-by: Chi Lo <lochi@microsoft.com>
2020-09-15 10:06:01 -07:00

209 lines
6.6 KiB
Python

import subprocess
import json
import pprint
import logging
import coloredlogs
import re
# When True, calculate_trt_op_percentage and calculate_trt_latency_percentage
# print the totals and ratio they computed.
debug = False
# When True, parse_single_file prints the per-provider op maps it collected
# for the first and second model run.
debug_verbose = False
def parse_single_file(f):
    """Extract per-provider kernel durations from one profile JSON file.

    The content is loaded as JSON and iterated row by row, in file order:

    * A ``Session`` row named ``model_run`` marks a run boundary. The first
      one switches recording from the "first run" map to the map that is
      returned; the second one stops parsing.
    * A ``Node`` row is recorded when its name contains ``kernel_time`` and
      its ``args`` carry both ``op_name`` and ``provider``. It is stored as
      ``provider -> {row name -> row["dur"]}``.

    Args:
        f: Readable file-like object holding the profile JSON.

    Returns:
        A dict mapping execution-provider name to ``{row name: dur}`` for
        the rows recorded after the first ``model_run`` row (it may be
        empty), or ``None`` when the content cannot be loaded as JSON or no
        ``model_run`` row was seen.
    """
    try:
        data = json.load(f)
    except Exception:
        # Best effort: an unreadable or malformed profile is reported as
        # "no data" instead of raising.
        return None

    model_run_flag = False  # becomes True once any "model_run" row is seen
    first_run_flag = True  # stays True until the first "model_run" row
    provider_op_map = {}  # ep -> map of operator to duration (returned)
    provider_op_map_first_run = {}  # ep -> map of operator to duration

    for row in data:
        if "cat" not in row:
            continue

        category = row["cat"]

        if category == "Session":
            if row.get("name") == "model_run":
                if not first_run_flag:
                    # Second run boundary: everything needed was collected.
                    break
                model_run_flag = True
                first_run_flag = False
            continue

        if category != "Node":
            continue
        if "name" not in row or "args" not in row:
            continue

        name = row["name"]
        if "kernel_time" not in name:
            continue

        args = row["args"]
        if "op_name" not in args or "provider" not in args:
            continue
        provider = args["provider"]

        if first_run_flag:
            first_run_ops = provider_op_map_first_run.setdefault(provider, {})
            if name in first_run_ops:
                # The name was already recorded before any "model_run" row:
                # restart this provider's returned map with just this row.
                # NOTE(review): every such duplicate resets the map again;
                # confirm this is the intended handling.
                provider_op_map[provider] = {name: row["dur"]}
            else:
                first_run_ops[name] = row["dur"]
        else:
            ops = provider_op_map.setdefault(provider, {})
            # avoid duplicated metrics
            if name not in ops:
                ops[name] = row["dur"]

    if debug_verbose:
        # Pretty-print a list pre-sorted by duration (longest first). A list
        # keeps its order in pprint without patching the pprint module.
        pp = pprint.PrettyPrinter(indent=4)
        sections = (
            ("First", provider_op_map_first_run),
            ("Second", provider_op_map),
        )
        for title, op_maps in sections:
            print("------{} run ops map (START)------".format(title))
            for provider, ops in op_maps.items():
                print(provider)
                pp.pprint(sorted(ops.items(), key=lambda item: item[1], reverse=True))
            print("------{} run ops map (END) ------".format(title))

    return provider_op_map if model_run_flag else None
def calculate_cuda_op_percentage(cuda_op_map):
    """Return the share of ops that ran on the CUDA execution provider.

    Only the ``CUDAExecutionProvider`` and ``CPUExecutionProvider`` entries
    of the map are counted; any other provider is ignored.

    Args:
        cuda_op_map: Mapping of execution-provider name to a per-op map, or
            ``None``.

    Returns:
        ``cuda_ops / (cuda_ops + cpu_ops)``, or ``0`` when the map is
        missing/empty or holds no CUDA or CPU ops at all.
    """
    if not cuda_op_map:
        return 0

    cuda_ops = len(cuda_op_map.get("CUDAExecutionProvider", ()))
    cpu_ops = len(cuda_op_map.get("CPUExecutionProvider", ()))

    counted_ops = cuda_ops + cpu_ops
    if counted_ops == 0:
        # Nothing ran on CUDA or CPU; avoid dividing by zero.
        return 0

    return cuda_ops / counted_ops
##########################################
# Return: total ops executed in TRT,
# total ops,
# ratio of ops executed in TRT,
##########################################
def calculate_trt_op_percentage(trt_op_map, cuda_op_map):
    """Compute how many ops were executed by TensorRT.

    The number of TRT ops is derived indirectly:
    (total ops in cuda json - cuda and cpu ops in trt json) / total ops in
    cuda json. Only the ``CUDAExecutionProvider`` and
    ``CPUExecutionProvider`` entries of either map are counted.

    Args:
        trt_op_map: Mapping of provider name to per-op map, taken from the
            TensorRT run. ``None`` or an empty mapping means no op is
            attributed to TRT.
        cuda_op_map: Mapping of provider name to per-op map, taken from the
            CUDA run. ``None`` is treated as empty.

    Returns:
        Tuple ``(total ops executed in TRT, total ops, ratio of ops executed
        in TRT)``.

    Raises:
        RuntimeError: If ``cuda_op_map`` holds no CUDA/CPU ops, so there is
            no total to compare against.
    """
    # Treat a missing map like an empty one instead of failing on ``in``.
    trt_op_map = trt_op_map or {}
    cuda_op_map = cuda_op_map or {}

    counted_eps = ("CUDAExecutionProvider", "CPUExecutionProvider")
    total_ops = sum(len(cuda_op_map[ep]) for ep in counted_eps if ep in cuda_op_map)
    total_cuda_and_cpu_ops = sum(len(trt_op_map[ep]) for ep in counted_eps if ep in trt_op_map)

    if total_ops == 0:
        raise RuntimeError(
            "No CUDA/CPU ops found in the CUDA op map; cannot compute the TRT op percentage."
        )

    if not trt_op_map:
        # No TRT data at all: count every op as CUDA/CPU so the ratio is 0.
        total_cuda_and_cpu_ops = total_ops

    total_trt_ops = total_ops - total_cuda_and_cpu_ops
    ratio_of_ops_in_trt = total_trt_ops / total_ops

    if debug:
        print("total_cuda_and_cpu_ops: {}".format(total_cuda_and_cpu_ops))
        print("total_ops: {}".format(total_ops))
        print("ratio_of_ops_in_trt: {}".format(ratio_of_ops_in_trt))

    return (total_trt_ops, total_ops, ratio_of_ops_in_trt)
##########################################
# Return: total TRT execution time,
# total execution time,
# ratio of execution time in TRT
##########################################
def calculate_trt_latency_percentage(trt_op_map):
    """Compute how much of the recorded execution time was spent in TensorRT.

    Durations are summed per provider for ``TensorrtExecutionProvider``,
    ``CUDAExecutionProvider`` and ``CPUExecutionProvider``; other providers
    are ignored. Each duration is converted with ``int()`` before summing.

    Args:
        trt_op_map: Mapping of provider name to ``{op: duration}``, or
            ``None``.

    Returns:
        Tuple ``(total TRT execution time, total execution time, ratio of
        execution time in TRT)``. The ratio is ``0`` when the total is zero,
        including when the map is ``None`` or empty.
    """
    # Treat a missing map like an empty one, as calculate_cuda_op_percentage
    # does, instead of failing on the membership test.
    trt_op_map = trt_op_map or {}

    total_execution_time = 0
    total_trt_execution_time = 0

    for ep in ("TensorrtExecutionProvider", "CUDAExecutionProvider", "CPUExecutionProvider"):
        if ep not in trt_op_map:
            continue

        ep_time = sum(int(duration) for duration in trt_op_map[ep].values())
        if ep == "TensorrtExecutionProvider":
            total_trt_execution_time = ep_time
        total_execution_time += ep_time

    if total_execution_time == 0:
        ratio_of_trt_execution_time = 0
    else:
        ratio_of_trt_execution_time = total_trt_execution_time / total_execution_time

    if debug:
        print("total_trt_execution_time: {}".format(total_trt_execution_time))
        print("total_execution_time: {}".format(total_execution_time))
        print("ratio_of_trt_execution_time: {}".format(ratio_of_trt_execution_time))

    return (total_trt_execution_time, total_execution_time, ratio_of_trt_execution_time)
def get_profile_metrics(path, profile_already_parsed):
    """Parse the not-yet-seen profile files under ``path``.

    Files are listed with ``find <path> -name "onnxruntime_profile*"``, each
    line prefixed with the ``%T+`` timestamp, and the listing is piped
    through ``sort`` so entries are handled in that order.
    NOTE(review): ``-printf`` is a GNU find extension, so this presumably
    only works where GNU findutils is installed.

    Args:
        path: Directory to search.
        profile_already_parsed: Set-like collection of profile paths handled
            by earlier calls. Every path parsed here is added to it, so the
            caller's object is mutated.

    Returns:
        The op map returned by ``parse_single_file`` for the last newly
        parsed profile that produced a non-empty map, or ``None`` when no
        such profile exists.
    """
    import os  # local import: only this function needs os.fsdecode

    print("Parsing/Analyzing profiling files in {} ...".format(path))

    find_cmd = ["find", path, "-name", "onnxruntime_profile*", "-printf", "%T+\t%p\n"]
    with subprocess.Popen(find_cmd, stdout=subprocess.PIPE) as p1:
        with subprocess.Popen(["sort"], stdin=p1.stdout, stdout=subprocess.PIPE) as p2:
            # Drop our copy of the pipe so find gets SIGPIPE if sort exits.
            p1.stdout.close()
            stdout, _ = p2.communicate()

    # Decode with the filesystem encoding so non-ASCII paths do not raise.
    # Blank lines are dropped so an empty listing yields an empty list.
    listing = os.fsdecode(stdout).strip()
    profiling_files = [line for line in listing.split("\n") if line]
    print(profiling_files)

    data = []
    for entry in profiling_files:
        # Each line is "<timestamp>\t<path>"; split on the first tab only so
        # a path that itself contains a tab stays intact.
        _, tab, profile = entry.partition("\t")
        if not tab:
            continue
        if profile in profile_already_parsed:
            continue
        profile_already_parsed.add(profile)

        print("start to parse {} ...".format(profile))
        with open(profile) as f:
            op_map = parse_single_file(f)
        if op_map:
            data.append(op_map)

    if not data:
        print("No profile metrics got.")
        return None

    return data[-1]