[EP-Perf-Dashboard] Reduce script excessive output (#13562)

### Description
Properly cleans up all temporary resources created while running
benchmarks.

Details:
- Dump all temporary artifacts (TRT engines, TRT profiles, inference
profiles, fp16 models) into a temp directory in `/tmp/`. Each model/EP
combination has its own temp directory that is deleted after validation
and benchmarking.
- Allow running both validation and benchmarking in one invocation of
the benchmark.py script. This is necessary to allow the benchmarking
step to reuse artifacts (e.g., TRT engines) created during validation.
Before this PR, we ran validation on all model/EP combinations before
running benchmarks on all combinations again. This required us to keep
all temporary artifacts for all model/EP combinations throughout the
entire run (expensive).
- Create individual functions for validation and benchmarking (split-up
large function that did it all)

### Motivation and Context
The EP Perf pipeline failed to run because the script generated too much
output and the VM ran out of disk space.
This commit is contained in:
Adrian Lizarraga 2022-11-08 16:17:29 -08:00 committed by GitHub
parent 123e1eac01
commit 281f199754
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 546 additions and 407 deletions

View file

@ -8,7 +8,7 @@ import pprint
import re
import subprocess
import sys
import time
import tempfile
import timeit
from datetime import datetime
@ -33,7 +33,9 @@ from perf_utils import (
get_output,
get_profile_metrics,
get_total_ops,
is_benchmark_mode,
is_standalone,
is_validate_mode,
memory_ending,
model_title,
ort_provider_list,
@ -82,6 +84,8 @@ METRICS_FILE = ".metrics_map"
SESSION_FILE = ".session_map"
MEMORY_FILE = "./temp_memory.csv"
TRT_ENGINE_CACHE_DIR_NAME = "engine_cache"
def split_and_sort_output(string_list):
string_list = string_list.split("\n")
@ -104,7 +108,20 @@ def get_model_inputs(model):
return inputs
def run_trt_standalone(trtexec, model_name, model_path, all_inputs_shape, fp16, track_memory):
def get_graph_opt_level(enablement):
opt_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
if enablement == enable_all:
opt_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
elif enablement == extended:
opt_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
elif enablement == basic:
opt_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_BASIC
return opt_level
def run_trt_standalone(trtexec, model_name, model_path, test_data_dir, all_inputs_shape, fp16, track_memory):
logger.info("running standalone trt")
onnx_model_path = "--onnx=" + model_path
@ -115,12 +132,12 @@ def run_trt_standalone(trtexec, model_name, model_path, all_inputs_shape, fp16,
model = onnx.load(model_path)
ort_inputs = get_model_inputs(model)
output = get_output(["find", "-L", os.getcwd(), "-name", "test_data*", "-type", "d"])
test_data_dir = split_and_sort_output(output)[0]
output = get_output(["find", "-L", test_data_dir, "-name", "test_data*", "-type", "d"])
test_data_dir_0 = split_and_sort_output(output)[0]
for i in range(len(ort_inputs)):
name = ort_inputs[i]
loaded_input = name + ":" + test_data_dir + "/" + str(i) + ".bin"
loaded_input = name + ":" + test_data_dir_0 + "/" + str(i) + ".bin"
logger.info(loaded_input)
shape = []
for j in all_inputs_shape[i]:
@ -149,7 +166,8 @@ def run_trt_standalone(trtexec, model_name, model_path, all_inputs_shape, fp16,
command.extend(["--fp16"])
# save engine
engine_name = model_name + ".engine"
engine_suffix = "_trtexec_fp16.engine" if fp16 else "_trtexec.engine"
engine_name = model_name + engine_suffix
save_command = command + ["--saveEngine=" + engine_name]
logger.info(save_command)
out = get_output(save_command)
@ -167,9 +185,9 @@ def run_trt_standalone(trtexec, model_name, model_path, all_inputs_shape, fp16,
out = get_output(load_command)
success = True
mem_usage = end_memory_tracking(p, success)
except Exception as e:
except Exception as excpt:
end_memory_tracking(p, success)
raise (e)
raise excpt
else:
out = get_output(load_command)
@ -603,41 +621,6 @@ def validate(all_ref_outputs, all_outputs, rtol, atol, percent_mismatch):
return True, None
# not use for this script
def cleanup_files():
files = []
p = subprocess.Popen(["find", ".", "-name", "test_data_set*", "-type", "d"], stdout=subprocess.PIPE)
stdout, sterr = p.communicate()
stdout = stdout.decode("ascii").strip()
files = files + stdout.split("\n")
p = subprocess.Popen(["find", ".", "-name", "*.onnx"], stdout=subprocess.PIPE)
stdout, sterr = p.communicate()
stdout = stdout.decode("ascii").strip()
files = files + stdout.split("\n")
p = subprocess.Popen(["find", ".", "-name", "*.gz"], stdout=subprocess.PIPE)
stdout, sterr = p.communicate()
stdout = stdout.decode("ascii").strip()
files = files + stdout.split("\n")
for f in files:
if "custom_test_data" in f:
logger.info(f)
continue
subprocess.Popen(["rm", "-rf", f], stdout=subprocess.PIPE)
def remove_files(running_mode, path):
files = []
out = ""
if running_mode == "validate":
out = get_output(["find", path, "-name", "onnxruntime_profile*"])
if running_mode == "benchmark":
logger.info(running_mode)
out = get_output(["find", path, "-name", "*.engine"])
def update_fail_report(fail_results, model, ep, e_type, e):
result = {}
@ -817,11 +800,18 @@ def skip_ep(model_name, ep, model_to_fail_ep):
def read_map_from_file(map_file):
"""
Load a dictionary stored as a JSON file.
:param map_file: The name of the JSON file to load.
:return: A dictionary with the contents of the JSON file.
"""
data = {}
with open(map_file) as f:
try:
data = json.load(f)
except Exception as e:
return None
data = json.load(f)
return data
@ -1055,15 +1045,16 @@ def parse_models_info_from_file(root_dir, path, models):
model["test_data_path_fp16"] = row["test_data_path_fp16"]
def convert_model_from_float_to_float16(model_path):
def convert_model_from_float_to_float16(model_path, new_model_dir):
from float16 import convert_float_to_float16
from onnxmltools.utils import load_model, save_model
new_model_path = os.path.join(os.getcwd(), "new_fp16_model_by_trt_perf.onnx")
new_model_path = os.path.join(new_model_dir, "new_fp16_model_by_trt_perf.onnx")
if not os.path.exists(new_model_path):
onnx_model = load_model(model_path)
new_onnx_model = convert_float_to_float16(onnx_model)
save_model(new_onnx_model, "new_fp16_model_by_trt_perf.onnx")
save_model(new_onnx_model, new_model_path)
return new_model_path
@ -1135,333 +1126,6 @@ def create_session(model_path, providers, provider_options, session_options):
raise Exception(e)
def run_onnxruntime(args, models):
success_results = []
model_to_latency = {} # model -> cuda and tensorrt latency
model_to_metrics = {} # model -> metrics from profiling file
model_to_fail_ep = {} # model -> failing ep
model_to_session = {} # models -> session creation time
if args.running_mode == "benchmark" and os.path.exists(SESSION_FILE):
model_to_session = read_map_from_file(SESSION_FILE)
ep_list = []
if args.ep:
ep_list.append(args.ep)
else:
if args.fp16:
ep_list = [cpu, cuda, trt, cuda_fp16, trt_fp16]
else:
ep_list = [cpu, cuda, trt]
validation_exemption = [trt_fp16]
if os.path.exists(FAIL_MODEL_FILE):
model_to_fail_ep = read_map_from_file(FAIL_MODEL_FILE)
#######################
# iterate model
#######################
for name, model_info in models.items():
latency_result = {}
path = model_info["working_directory"]
pwd = os.getcwd()
if not os.path.exists(path):
os.mkdir(path)
os.chdir(path)
path = os.getcwd()
inputs = []
ref_outputs = []
all_inputs_shape = [] # use for standalone trt
ep_to_operator = {} # ep -> { operator -> count }
profile_already_parsed = set()
#######################
# iterate ep
#######################
for ep in ep_list:
if skip_ep(name, ep, model_to_fail_ep):
continue
if not is_standalone(ep):
ep_ = ep_to_provider_list[ep][0]
if ep_ not in onnxruntime.get_available_providers():
logger.error("No {} support".format(ep_))
continue
model_path = model_info["model_path"]
test_data_dir = model_info["test_data_path"]
logger.info("[Initialize] model = {}, ep = {} ...".format(name, ep))
# Set environment variables for ort-trt benchmarking
trt_ep_options = copy.deepcopy(args.trt_ep_options)
if "ORT-TRT" in ep:
trt_ep_options["trt_fp16_enable"] = "True" if "Fp16" in ep else "False"
convert_input_fp16 = False
# use float16.py for cuda fp16 only
if cuda_fp16 == ep:
# handle model
if "model_path_fp16" in model_info:
model_path = model_info["model_path_fp16"]
else:
try:
model_path = convert_model_from_float_to_float16(model_path)
convert_input_fp16 = True
except Exception as e:
logger.error(e)
update_fail_model_map(model_to_fail_ep, name, ep, "script error", e)
continue
# handle test data
if "test_data_path_fp16" in model_info:
test_data_dir = model_info["test_data_path_fp16"]
convert_input_fp16 = False
inputs, ref_outputs = get_test_data(convert_input_fp16, test_data_dir, all_inputs_shape)
# generate random input data
if args.input_data == "random":
inputs = generate_onnx_model_random_input(args.test_times, inputs[0])
#######################################
# benchmark or validation
#######################################
if args.running_mode == "benchmark":
logger.info("\n----------------------------- benchmark -------------------------------------")
# memory tracking variables
p = None
mem_usage = None
result = None
# get standalone TensorRT perf
if is_standalone(ep) and args.trtexec:
try:
result = run_trt_standalone(
args.trtexec,
name,
model_path,
all_inputs_shape,
ep == standalone_trt_fp16,
args.track_memory,
)
except Exception as e:
logger.error(e)
update_fail_model_map(model_to_fail_ep, name, ep, "runtime error", e)
continue
# inference with onnxruntime ep
else:
# resolve providers to create session
providers = ep_to_provider_list[ep]
provider_options = get_provider_options(providers, trt_ep_options, args.cuda_ep_options)
options = onnxruntime.SessionOptions()
enablement = args.graph_enablement
if enablement == enable_all:
options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
elif enablement == extended:
options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
elif enablement == basic:
options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_BASIC
else: # disable
options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
# create onnxruntime inference session
try:
sess, second_creation_time = create_session(model_path, providers, provider_options, options)
except Exception as e:
logger.error(e)
update_fail_model_map(model_to_fail_ep, name, ep, "runtime error", e)
continue
if second_creation_time:
model_to_session[name] = copy.deepcopy({ep + second: second_creation_time})
logger.info("start to inference {} with {} ...".format(name, ep))
logger.info(sess.get_providers())
logger.info(sess.get_provider_options())
if sess:
logger.info("Model inputs nodes:")
for input_meta in sess.get_inputs():
logger.info(input_meta)
logger.info("Model outputs nodes:")
for output_meta in sess.get_outputs():
logger.info(output_meta)
batch_size = 1
result_template = {
"engine": "onnxruntime",
"version": onnxruntime.__version__,
"device": ep,
"fp16": convert_input_fp16,
"io_binding": args.io_binding,
"graph_optimizations": args.graph_enablement,
"enable_cache": args.trt_ep_options.get("trt_engine_cache_enable", "False"),
"model_name": name,
"inputs": len(sess.get_inputs()),
"batch_size": batch_size,
"sequence_length": 1,
"datetime": str(datetime.now()),
}
# run cpu fewer times
repeat_times = 100 if ep == cpu else args.test_times
track_memory = False if ep == cpu else args.track_memory
# inference with ort
try:
result, mem_usage = inference_ort(
args,
name,
sess,
ep,
inputs,
result_template,
repeat_times,
batch_size,
track_memory,
)
except Exception as e:
logger.error(e)
update_fail_model_map(model_to_fail_ep, name, ep, "runtime error", e)
continue
if result:
latency_result[ep] = {}
latency_result[ep]["average_latency_ms"] = result["average_latency_ms"]
latency_result[ep]["latency_90_percentile"] = result["latency_90_percentile"]
if "memory" in result:
mem_usage = result["memory"]
if mem_usage:
latency_result[ep]["memory"] = mem_usage
if not args.trtexec: # skip standalone
success_results.append(result)
model_to_latency[name] = copy.deepcopy(latency_result)
if ep == trt_fp16: # delete engine
remove_files(args.running_mode, model_info["working_directory"])
logger.info("---------------------------- benchmark [end] ----------------------------------\n")
elif args.running_mode == "validate":
logger.info("\n----------------------------- validate -------------------------------------")
# enable profiling to generate profiling file for analysis
options = onnxruntime.SessionOptions()
options.enable_profiling = True
options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
time.sleep(1) # avoid to generate same profile file name
providers = ep_to_provider_list[ep]
provider_options = get_provider_options(providers, trt_ep_options, args.cuda_ep_options)
# create onnxruntime inference session
try:
sess, creation_time = create_session(model_path, providers, provider_options, options)
except Exception as e:
logger.error(e)
update_fail_model_map(model_to_fail_ep, name, ep, "runtime error", e)
continue
if creation_time:
model_to_session[name] = copy.deepcopy({ep: creation_time})
sess.disable_fallback()
logger.info("start to inference {} with {} ...".format(name, ep))
logger.info(sess.get_providers())
logger.info(sess.get_provider_options())
if sess:
logger.info("Model inputs nodes:")
for input_meta in sess.get_inputs():
logger.info(input_meta)
logger.info("Model outputs nodes:")
for output_meta in sess.get_outputs():
logger.info(output_meta)
# run inference and validate the result
#
# currently skip TensorRT float16 validation intentionally
if ep not in validation_exemption:
try:
ort_outputs = inference_ort_and_get_prediction(name, sess, inputs)
status = validate(
ref_outputs,
ort_outputs,
args.rtol,
args.atol,
args.percent_mismatch,
)
if not status[0]:
remove_files(args.running_mode, model_info["working_directory"])
update_fail_model_map(
model_to_fail_ep,
name,
ep,
"result accuracy issue",
status[1],
)
continue
except Exception as e:
logger.error(e)
update_fail_model_map(model_to_fail_ep, name, ep, "runtime error", e)
continue
# Run inference again. the reason is that some ep like tensorrt
# it takes much longer time to generate graph on first run and
# we need to skip the perf result of that expensive run.
inference_ort_and_get_prediction(name, sess, inputs)
else:
inference_ort_and_get_prediction(name, sess, inputs)
inference_ort_and_get_prediction(name, sess, inputs)
sess.end_profiling()
# get metrics from profiling file
metrics = get_profile_metrics(path, profile_already_parsed, logger)
if metrics:
logger.info(ep)
ep_to_operator[ep] = metrics
remove_files(args.running_mode, model_info["working_directory"])
logger.info("---------------------------- validate [end] ----------------------------------\n")
####################
# end of iterate ep
####################
# get percentage of execution time and operators in TRT
update_metrics_map(model_to_metrics, name, ep_to_operator)
# cleanup_files()
os.chdir(pwd)
# end of model
return (
success_results,
model_to_latency,
model_to_fail_ep,
model_to_metrics,
model_to_session,
)
def calculate_gain(value, ep1, ep2):
ep1_latency = float(value[ep1]["average_latency_ms"])
ep2_latency = float(value[ep2]["average_latency_ms"])
@ -1960,6 +1624,447 @@ def str2bool(v):
raise argparse.ArgumentTypeError("Boolean value expected.")
def test_models_eps(args, models):
"""
Benchmarks or validates the given models over the provided set of EPs.
:param args: The command-line arguments to this script. Contains the list of EPs to use.
:param models: Dictionary of models to run. The keys are model names and the values are dictionaries containing
paths to the model files and input data.
:return: A tuple containing aggregated metrics/results.
"""
success_results = []
model_to_latency = {} # model -> cuda and tensorrt latency
model_to_metrics = {} # model -> metrics from profiling file
model_to_fail_ep = {} # model -> failing ep
model_to_session = {} # models -> session creation time
if os.path.exists(SESSION_FILE):
model_to_session = read_map_from_file(SESSION_FILE)
if os.path.exists(FAIL_MODEL_FILE):
model_to_fail_ep = read_map_from_file(FAIL_MODEL_FILE)
ep_list = []
if args.ep:
ep_list.append(args.ep)
else:
if args.fp16:
ep_list = [cpu, cuda, trt, cuda_fp16, trt_fp16]
else:
ep_list = [cpu, cuda, trt]
init_dir = os.getcwd()
# Run benchmarking and/or validation for every model and EP combination.
for name, model_info in models.items():
ep_results = {"latency": {}, "metrics": {}, "session": {}}
for exec_provider in ep_list:
# Skip model + EP combinations that have already failed in a previous run.
if skip_ep(name, exec_provider, model_to_fail_ep):
continue
# Check if EP is supported.
if not is_standalone(exec_provider):
ep_ = ep_to_provider_list[exec_provider][0]
if ep_ not in onnxruntime.get_available_providers():
logger.error("No %s support", ep_)
continue
# Create a temporary directory for this run, which may create profiles, subgraph dumps, and TRT engines.
# The temporary directory is created in '/tmp/' and is automatically deleted after scope exit.
with tempfile.TemporaryDirectory() as temp_dir:
run_model_on_ep(
args,
name,
model_info,
exec_provider,
success_results,
model_to_fail_ep,
ep_results,
temp_dir,
)
model_to_latency[name] = ep_results["latency"]
model_to_session[name] = ep_results["session"]
update_metrics_map(model_to_metrics, name, ep_results["metrics"])
os.chdir(init_dir)
return (
success_results,
model_to_latency,
model_to_fail_ep,
model_to_metrics,
model_to_session,
)
def run_model_on_ep(
args,
model_name,
model_info,
exec_provider,
success_results,
model_to_fail_ep,
ep_results,
tmp_work_dir,
):
"""
Benchmarks and/or validates the given model on the given EP.
:param args: The command-line arguments to this script.
:param model_name: The name of the model to run.
:param model_info: A dictionary that contains paths to the model file and input data.
:param exec_provider: The name of the EP (e.g., ORT-CUDAFp32) on which to run the model.
:param success_results: List of successful results that is updated by this function.
:param model_to_fail_ep: Dictionary that tracks failing model and EP combinations. Updated by this function.
:param ep_results: Dictionary that maps an EP to latency and operator partition results. Updated by this function.
:param tmp_work_dir: Temporary directory in which to run the model + EP.
"""
all_inputs_shape = [] # used for standalone trt
model_work_dir = os.path.abspath(model_info["working_directory"])
model_path = os.path.normpath(os.path.join(model_work_dir, model_info["model_path"]))
test_data_dir = os.path.normpath(os.path.join(model_work_dir, model_info["test_data_path"]))
os.chdir(tmp_work_dir)
logger.info("Starting mode '%s' for %s on %s ...", args.running_mode, model_name, exec_provider)
# Set environment variables for ort-trt benchmarking
trt_ep_options = copy.deepcopy(args.trt_ep_options)
if "ORT-TRT" in exec_provider:
trt_ep_options["trt_fp16_enable"] = "True" if "Fp16" in exec_provider else "False"
# Create/set a directory to store TRT engine caches.
engine_cache_path = os.path.normpath(os.path.join(tmp_work_dir, TRT_ENGINE_CACHE_DIR_NAME))
if not os.path.exists(engine_cache_path):
os.makedirs(engine_cache_path)
trt_ep_options["trt_engine_cache_path"] = engine_cache_path
convert_input_fp16 = False
# use float16.py for cuda fp16 only
if cuda_fp16 == exec_provider:
# handle model
if "model_path_fp16" in model_info:
model_path = os.path.normpath(os.path.join(model_work_dir, model_info["model_path_fp16"]))
else:
try:
model_path = convert_model_from_float_to_float16(model_path, tmp_work_dir)
convert_input_fp16 = True
except Exception as excpt:
logger.error(excpt)
update_fail_model_map(model_to_fail_ep, model_name, exec_provider, "script error", excpt)
return
# handle test data
if "test_data_path_fp16" in model_info:
test_data_dir = os.path.normpath(os.path.join(model_work_dir, model_info["test_data_path_fp16"]))
convert_input_fp16 = False
inputs, ref_outputs = get_test_data(convert_input_fp16, test_data_dir, all_inputs_shape)
# generate random input data
if args.input_data == "random":
inputs = generate_onnx_model_random_input(args.test_times, inputs[0])
do_validate = is_validate_mode(args.running_mode)
do_benchmark = is_benchmark_mode(args.running_mode)
validation_passed = False
#######################################
# Validation
#######################################
if do_validate:
validation_passed = validate_model_on_ep(
args,
model_name,
exec_provider,
trt_ep_options,
model_path,
inputs,
ref_outputs,
model_to_fail_ep,
ep_results,
tmp_work_dir,
)
#######################################
# Benchmark
#######################################
if do_benchmark and (validation_passed or not do_validate):
benchmark_model_on_ep(
args,
model_name,
exec_provider,
trt_ep_options,
model_path,
inputs,
all_inputs_shape,
model_to_fail_ep,
ep_results,
success_results,
test_data_dir,
convert_input_fp16,
)
def benchmark_model_on_ep(
args,
model_name,
exec_provider,
trt_ep_options,
model_path,
inputs,
all_inputs_shape,
model_to_fail_ep,
ep_results,
success_results,
test_data_dir,
convert_input_fp16,
):
"""
Benchmarks the given model on the given EP.
:param args: The command-line arguments to this script.
:param model_name: The name of the model to run.
:param exec_provider: The name of the EP (e.g., ORT-CUDAFp32) on which to run the model.
:param trt_ep_options: Additional TensorRT EP session options to apply.
:param model_path: The path to the model file.
:param inputs: Inputs to the model.
:param all_inputs_shape: Input shapes. Needed by trtexec.
:param model_to_fail_ep: Dictionary that tracks failing model and EP combinations. Updated by this function.
:param ep_results: Dictionary that maps an EP to latency and operator partition results. Updated by this function.
:param success_results: List of successful results that is updated by this function.
:param test_data_dir: Directory containing input .pb files. Needed by trtexec.
:param convert_input_fp16: True if the inputs were converted to FP16.
"""
# memory tracking variables
mem_usage = None
result = None
# get standalone TensorRT perf
if is_standalone(exec_provider) and args.trtexec:
try:
result = run_trt_standalone(
args.trtexec,
model_name,
model_path,
test_data_dir,
all_inputs_shape,
exec_provider == standalone_trt_fp16,
args.track_memory,
)
except Exception as excpt:
logger.error(excpt)
update_fail_model_map(model_to_fail_ep, model_name, exec_provider, "runtime error", excpt)
return
# inference with onnxruntime ep
else:
# resolve providers to create session
providers = ep_to_provider_list[exec_provider]
provider_options = get_provider_options(providers, trt_ep_options, args.cuda_ep_options)
options = onnxruntime.SessionOptions()
options.graph_optimization_level = get_graph_opt_level(args.graph_enablement)
# create onnxruntime inference session
try:
sess, second_creation_time = create_session(model_path, providers, provider_options, options)
except Exception as excpt:
logger.error(excpt)
update_fail_model_map(model_to_fail_ep, model_name, exec_provider, "runtime error", excpt)
return
if second_creation_time:
ep_results["session"][exec_provider + second] = second_creation_time
logger.info("Start to inference %s with %s ...", model_name, exec_provider)
logger.info(sess.get_providers())
logger.info(sess.get_provider_options())
if sess:
logger.info("Model inputs nodes:")
for input_meta in sess.get_inputs():
logger.info(input_meta)
logger.info("Model outputs nodes:")
for output_meta in sess.get_outputs():
logger.info(output_meta)
batch_size = 1
result_template = {
"engine": "onnxruntime",
"version": onnxruntime.__version__,
"device": exec_provider,
"fp16": convert_input_fp16,
"io_binding": args.io_binding,
"graph_optimizations": args.graph_enablement,
"enable_cache": args.trt_ep_options.get("trt_engine_cache_enable", "False"),
"model_name": model_name,
"inputs": len(sess.get_inputs()),
"batch_size": batch_size,
"sequence_length": 1,
"datetime": str(datetime.now()),
}
# run cpu fewer times
repeat_times = 100 if exec_provider == cpu else args.test_times
track_memory = False if exec_provider == cpu else args.track_memory
# inference with ort
try:
result, mem_usage = inference_ort(
args,
model_name,
sess,
exec_provider,
inputs,
result_template,
repeat_times,
batch_size,
track_memory,
)
except Exception as excpt:
logger.error(excpt)
update_fail_model_map(model_to_fail_ep, model_name, exec_provider, "runtime error", excpt)
return
if result:
ep_results["latency"][exec_provider] = {}
ep_results["latency"][exec_provider]["average_latency_ms"] = result["average_latency_ms"]
ep_results["latency"][exec_provider]["latency_90_percentile"] = result["latency_90_percentile"]
if "memory" in result:
mem_usage = result["memory"]
if mem_usage:
ep_results["latency"][exec_provider]["memory"] = mem_usage
if not args.trtexec: # skip standalone
success_results.append(result)
def validate_model_on_ep(
args,
model_name,
exec_provider,
trt_ep_options,
model_path,
inputs,
ref_outputs,
model_to_fail_ep,
ep_results,
tmp_work_dir,
):
"""
Validates the given model on the given EP.
:param args: The command-line arguments to this script.
:param model_name: The name of the model to run.
:param exec_provider: The name of the EP (e.g., ORT-CUDAFp32) on which to run the model.
:param trt_ep_options: Additional TensorRT EP session options to apply.
:param model_path: The path to the model file.
:param inputs: Inputs to the model.
:param ref_outputs: Reference outputs used to validate inference results.
:param model_to_fail_ep: Dictionary that tracks failing model and EP combinations. Updated by this function.
:param ep_results: Dictionary that maps an EP to latency and operator partition results. Updated by this function.
:param tmp_work_dir: Temporary directory where inference profile files were dumped.
"""
if is_standalone(exec_provider):
return True
# enable profiling to generate profiling file for analysis
options = onnxruntime.SessionOptions()
options.enable_profiling = True
options.profile_file_prefix = f"ort_profile_{model_name}_{exec_provider}"
options.graph_optimization_level = get_graph_opt_level(args.graph_enablement)
providers = ep_to_provider_list[exec_provider]
provider_options = get_provider_options(providers, trt_ep_options, args.cuda_ep_options)
# create onnxruntime inference session
try:
sess, creation_time = create_session(model_path, providers, provider_options, options)
except Exception as excpt:
logger.error(excpt)
update_fail_model_map(model_to_fail_ep, model_name, exec_provider, "runtime error", excpt)
return False
if creation_time:
ep_results["session"][exec_provider] = creation_time
sess.disable_fallback()
logger.info("Start to inference %s with %s ...", model_name, exec_provider)
logger.info(sess.get_providers())
logger.info(sess.get_provider_options())
if sess:
logger.info("Model inputs nodes:")
for input_meta in sess.get_inputs():
logger.info(input_meta)
logger.info("Model outputs nodes:")
for output_meta in sess.get_outputs():
logger.info(output_meta)
# run inference and validate the result
#
# currently skip TensorRT float16 validation intentionally
if exec_provider != trt_fp16:
try:
ort_outputs = inference_ort_and_get_prediction(model_name, sess, inputs)
status = validate(
ref_outputs,
ort_outputs,
args.rtol,
args.atol,
args.percent_mismatch,
)
if not status[0]:
update_fail_model_map(
model_to_fail_ep,
model_name,
exec_provider,
"result accuracy issue",
status[1],
)
return False
except Exception as excpt:
logger.error(excpt)
update_fail_model_map(model_to_fail_ep, model_name, exec_provider, "runtime error", excpt)
return False
# Run inference again. the reason is that some ep like tensorrt
# it takes much longer time to generate graph on first run and
# we need to skip the perf result of that expensive run.
inference_ort_and_get_prediction(model_name, sess, inputs)
else:
inference_ort_and_get_prediction(model_name, sess, inputs)
inference_ort_and_get_prediction(model_name, sess, inputs)
sess.end_profiling()
# get metrics from profiling file
metrics = get_profile_metrics(tmp_work_dir, options.profile_file_prefix, logger)
if metrics:
ep_results["metrics"][exec_provider] = metrics
return True
class ParseDictArgAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string):
dict_arg = {}
@ -2008,7 +2113,7 @@ def parse_arguments():
"--running_mode",
required=False,
default="benchmark",
choices=["validate", "benchmark"],
choices=["validate", "benchmark", "both"],
help="Testing mode.",
)
@ -2178,7 +2283,7 @@ def main():
model_to_fail_ep,
model_to_metrics,
model_to_session,
) = run_onnxruntime(args, models)
) = test_models_eps(args, models)
perf_end_time = datetime.now()
logger.info("Done running the perf.")

View file

@ -56,8 +56,7 @@ def main():
else:
ep_list = get_ep_list(args.comparison)
if standalone_trt in ep_list or standalone_trt_fp16 in ep_list:
trtexec = resolve_trtexec_path(args.workspace)
trtexec = resolve_trtexec_path(args.workspace)
models = {}
parse_models_helper(args, models)
@ -72,6 +71,9 @@ def main():
benchmark_session_csv = session_name + csv_ending
specs_csv = specs_name + csv_ending
validate = is_validate_mode(args.running_mode)
benchmark = is_benchmark_mode(args.running_mode)
for model, model_info in models.items():
logger.info("\n" + "=" * 40 + "=" * len(model))
logger.info("=" * 20 + model + "=" * 20)
@ -83,7 +85,6 @@ def main():
write_model_info_to_file([model_info], model_list_file)
for ep in ep_list:
command = [
"python3",
"benchmark.py",
@ -103,10 +104,7 @@ def main():
command.append("-z")
if ep == standalone_trt or ep == standalone_trt_fp16:
if args.running_mode == "validate":
continue
else:
command.extend(["--trtexec", trtexec])
command.extend(["--trtexec", trtexec])
if len(args.cuda_ep_options):
command.extend(["--cuda_ep_options", dict_to_args(args.cuda_ep_options)])
@ -114,10 +112,10 @@ def main():
if len(args.trt_ep_options):
command.extend(["--trt_ep_options", dict_to_args(args.trt_ep_options)])
if args.running_mode == "validate":
if validate:
command.extend(["--benchmark_metrics_csv", benchmark_metrics_csv])
elif args.running_mode == "benchmark":
if benchmark:
command.extend(
[
"-t",
@ -135,12 +133,17 @@ def main():
]
)
p = subprocess.run(command)
logger.info(p)
p = subprocess.run(command, stderr=subprocess.PIPE)
logger.info("Completed subprocess %s ", " ".join(p.args))
logger.info("Return code: %d", p.returncode)
if p.returncode != 0:
error_type = "runtime error"
error_message = "Benchmark script exited with returncode = " + str(p.returncode)
if p.stderr:
error_message += "\nSTDERR:\n" + p.stderr.decode("utf-8")
logger.error(error_message)
update_fail_model_map(model_to_fail_ep, model, ep, error_type, error_message)
write_map_to_file(model_to_fail_ep, FAIL_MODEL_FILE)
@ -154,7 +157,7 @@ def main():
Path(path).mkdir(parents=True, exist_ok=True)
if args.running_mode == "validate":
if validate:
logger.info("\n=========================================")
logger.info("=========== Models/EPs metrics ==========")
logger.info("=========================================")
@ -164,7 +167,7 @@ def main():
output_metrics(model_to_metrics, os.path.join(path, benchmark_metrics_csv))
logger.info("\nSaved model metrics results to {}".format(benchmark_metrics_csv))
elif args.running_mode == "benchmark":
if benchmark:
logger.info("\n=========================================")
logger.info("======= Models/EPs session creation =======")
logger.info("=========================================")

View file

@ -65,5 +65,4 @@ setup() {
}
setup
python benchmark_wrapper.py -r validate -m $MODEL_PATH -o result/$OPTION $OPTIONAL_ARGS
python benchmark_wrapper.py -r benchmark -t 1200 -m $MODEL_PATH -o result/$OPTION $OPTIONAL_ARGS
python benchmark_wrapper.py -r both -t 1200 -m $MODEL_PATH -o result/$OPTION $OPTIONAL_ARGS

View file

@ -71,6 +71,30 @@ extended = "extended"
enable_all = "all"
def is_benchmark_mode(running_mode):
"""
Returns True if the script's running mode requires running benchmarks.
:param running_mode: A string denoting the script's running mode (i.e., 'benchmark', 'validate', or 'both')
:return: True if benchmarking is required.
"""
return running_mode == "benchmark" or running_mode == "both"
def is_validate_mode(running_mode):
"""
Returns True if the script's running mode requires running inference validation.
:param running_mode: A string denoting the script's running mode (i.e., 'benchmark', 'validate', or 'both')
:return: True if validation is required.
"""
return running_mode == "validate" or running_mode == "both"
def is_standalone(ep):
return ep == standalone_trt or ep == standalone_trt_fp16
@ -277,14 +301,25 @@ def calculate_trt_latency_percentage(trt_op_map):
return (total_trt_execution_time, total_execution_time, ratio_of_trt_execution_time)
def get_profile_metrics(path, profile_already_parsed, logger=None):
logger.info("Parsing/Analyzing profiling files in {} ...".format(path))
p1 = subprocess.Popen(
["find", path, "-name", "onnxruntime_profile*", "-printf", "%T+\t%p\n"],
def get_profile_metrics(path, profile_file_prefix, logger):
"""
Parses a session profile file to obtain information on operator usage per EP.
:param path: The path containing the session profile file.
:param profile_file_prefix: Custom prefix for session profile names. Refer to ORT SessionOptions.
:param logger: The logger object to use for debug/info logging.
:return: A tuple containing the parsed operator usage information for CPU nodes and GPU kernels.
"""
logger.debug("Parsing/Analyzing profiling files in %s ...", path)
find_proc = subprocess.Popen(
["find", path, "-name", f"{profile_file_prefix}*", "-printf", "%T+\t%p\n"],
stdout=subprocess.PIPE,
)
p2 = subprocess.Popen(["sort"], stdin=p1.stdout, stdout=subprocess.PIPE)
stdout, sterr = p2.communicate()
sort_proc = subprocess.Popen(["sort"], stdin=find_proc.stdout, stdout=subprocess.PIPE)
stdout, sterr = sort_proc.communicate()
stdout = stdout.decode("ascii").strip()
profiling_files = stdout.split("\n")
logger.info(profiling_files)
@ -292,18 +327,15 @@ def get_profile_metrics(path, profile_already_parsed, logger=None):
data = []
for profile in profiling_files:
profile = profile.split("\t")[1]
if profile in profile_already_parsed:
continue
profile_already_parsed.add(profile)
logger.info("start to parse {} ...".format(profile))
with open(profile) as f:
op_map = parse_single_file(f)
logger.debug("Parsing profile %s ...", profile)
with open(profile, encoding="utf-8") as fd:
op_map = parse_single_file(fd)
if op_map:
data.append(op_map)
if len(data) == 0:
logger.info("No profile metrics got.")
logger.debug("No profile metrics found.")
return None
return data[-1]