diff --git a/onnxruntime/python/tools/tensorrt/perf/benchmark.py b/onnxruntime/python/tools/tensorrt/perf/benchmark.py index eb06c86c30..7bb23084e1 100644 --- a/onnxruntime/python/tools/tensorrt/perf/benchmark.py +++ b/onnxruntime/python/tools/tensorrt/perf/benchmark.py @@ -8,7 +8,7 @@ import pprint import re import subprocess import sys -import time +import tempfile import timeit from datetime import datetime @@ -33,7 +33,9 @@ from perf_utils import ( get_output, get_profile_metrics, get_total_ops, + is_benchmark_mode, is_standalone, + is_validate_mode, memory_ending, model_title, ort_provider_list, @@ -82,6 +84,8 @@ METRICS_FILE = ".metrics_map" SESSION_FILE = ".session_map" MEMORY_FILE = "./temp_memory.csv" +TRT_ENGINE_CACHE_DIR_NAME = "engine_cache" + def split_and_sort_output(string_list): string_list = string_list.split("\n") @@ -104,7 +108,20 @@ def get_model_inputs(model): return inputs -def run_trt_standalone(trtexec, model_name, model_path, all_inputs_shape, fp16, track_memory): +def get_graph_opt_level(enablement): + opt_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL + + if enablement == enable_all: + opt_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL + elif enablement == extended: + opt_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_EXTENDED + elif enablement == basic: + opt_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_BASIC + + return opt_level + + +def run_trt_standalone(trtexec, model_name, model_path, test_data_dir, all_inputs_shape, fp16, track_memory): logger.info("running standalone trt") onnx_model_path = "--onnx=" + model_path @@ -115,12 +132,12 @@ def run_trt_standalone(trtexec, model_name, model_path, all_inputs_shape, fp16, model = onnx.load(model_path) ort_inputs = get_model_inputs(model) - output = get_output(["find", "-L", os.getcwd(), "-name", "test_data*", "-type", "d"]) - test_data_dir = split_and_sort_output(output)[0] + output = get_output(["find", "-L", test_data_dir, "-name", "test_data*", 
"-type", "d"]) + test_data_dir_0 = split_and_sort_output(output)[0] for i in range(len(ort_inputs)): name = ort_inputs[i] - loaded_input = name + ":" + test_data_dir + "/" + str(i) + ".bin" + loaded_input = name + ":" + test_data_dir_0 + "/" + str(i) + ".bin" logger.info(loaded_input) shape = [] for j in all_inputs_shape[i]: @@ -149,7 +166,8 @@ def run_trt_standalone(trtexec, model_name, model_path, all_inputs_shape, fp16, command.extend(["--fp16"]) # save engine - engine_name = model_name + ".engine" + engine_suffix = "_trtexec_fp16.engine" if fp16 else "_trtexec.engine" + engine_name = model_name + engine_suffix save_command = command + ["--saveEngine=" + engine_name] logger.info(save_command) out = get_output(save_command) @@ -167,9 +185,9 @@ def run_trt_standalone(trtexec, model_name, model_path, all_inputs_shape, fp16, out = get_output(load_command) success = True mem_usage = end_memory_tracking(p, success) - except Exception as e: + except Exception as excpt: end_memory_tracking(p, success) - raise (e) + raise excpt else: out = get_output(load_command) @@ -603,41 +621,6 @@ def validate(all_ref_outputs, all_outputs, rtol, atol, percent_mismatch): return True, None -# not use for this script -def cleanup_files(): - files = [] - p = subprocess.Popen(["find", ".", "-name", "test_data_set*", "-type", "d"], stdout=subprocess.PIPE) - stdout, sterr = p.communicate() - stdout = stdout.decode("ascii").strip() - files = files + stdout.split("\n") - - p = subprocess.Popen(["find", ".", "-name", "*.onnx"], stdout=subprocess.PIPE) - stdout, sterr = p.communicate() - stdout = stdout.decode("ascii").strip() - files = files + stdout.split("\n") - - p = subprocess.Popen(["find", ".", "-name", "*.gz"], stdout=subprocess.PIPE) - stdout, sterr = p.communicate() - stdout = stdout.decode("ascii").strip() - files = files + stdout.split("\n") - - for f in files: - if "custom_test_data" in f: - logger.info(f) - continue - subprocess.Popen(["rm", "-rf", f], stdout=subprocess.PIPE) - - 
-def remove_files(running_mode, path): - files = [] - out = "" - if running_mode == "validate": - out = get_output(["find", path, "-name", "onnxruntime_profile*"]) - if running_mode == "benchmark": - logger.info(running_mode) - out = get_output(["find", path, "-name", "*.engine"]) - - def update_fail_report(fail_results, model, ep, e_type, e): result = {} @@ -817,11 +800,18 @@ def skip_ep(model_name, ep, model_to_fail_ep): def read_map_from_file(map_file): + """ + Load a dictionary stored as a JSON file. + + :param map_file: The name of the JSON file to load. + + :return: A dictionary with the contents of the JSON file. + """ + + data = {} + with open(map_file) as f: - try: - data = json.load(f) - except Exception as e: - return None + data = json.load(f) return data @@ -1055,15 +1045,16 @@ def parse_models_info_from_file(root_dir, path, models): model["test_data_path_fp16"] = row["test_data_path_fp16"] -def convert_model_from_float_to_float16(model_path): +def convert_model_from_float_to_float16(model_path, new_model_dir): from float16 import convert_float_to_float16 from onnxmltools.utils import load_model, save_model - new_model_path = os.path.join(os.getcwd(), "new_fp16_model_by_trt_perf.onnx") + new_model_path = os.path.join(new_model_dir, "new_fp16_model_by_trt_perf.onnx") + if not os.path.exists(new_model_path): onnx_model = load_model(model_path) new_onnx_model = convert_float_to_float16(onnx_model) - save_model(new_onnx_model, "new_fp16_model_by_trt_perf.onnx") + save_model(new_onnx_model, new_model_path) return new_model_path @@ -1135,333 +1126,6 @@ def create_session(model_path, providers, provider_options, session_options): raise Exception(e) -def run_onnxruntime(args, models): - - success_results = [] - model_to_latency = {} # model -> cuda and tensorrt latency - model_to_metrics = {} # model -> metrics from profiling file - model_to_fail_ep = {} # model -> failing ep - model_to_session = {} # models -> session creation time - - if args.running_mode == 
"benchmark" and os.path.exists(SESSION_FILE): - model_to_session = read_map_from_file(SESSION_FILE) - - ep_list = [] - if args.ep: - ep_list.append(args.ep) - else: - if args.fp16: - ep_list = [cpu, cuda, trt, cuda_fp16, trt_fp16] - else: - ep_list = [cpu, cuda, trt] - - validation_exemption = [trt_fp16] - - if os.path.exists(FAIL_MODEL_FILE): - model_to_fail_ep = read_map_from_file(FAIL_MODEL_FILE) - - ####################### - # iterate model - ####################### - for name, model_info in models.items(): - latency_result = {} - path = model_info["working_directory"] - - pwd = os.getcwd() - if not os.path.exists(path): - os.mkdir(path) - os.chdir(path) - path = os.getcwd() - - inputs = [] - ref_outputs = [] - all_inputs_shape = [] # use for standalone trt - ep_to_operator = {} # ep -> { operator -> count } - profile_already_parsed = set() - - ####################### - # iterate ep - ####################### - for ep in ep_list: - if skip_ep(name, ep, model_to_fail_ep): - continue - - if not is_standalone(ep): - ep_ = ep_to_provider_list[ep][0] - if ep_ not in onnxruntime.get_available_providers(): - logger.error("No {} support".format(ep_)) - continue - - model_path = model_info["model_path"] - test_data_dir = model_info["test_data_path"] - - logger.info("[Initialize] model = {}, ep = {} ...".format(name, ep)) - - # Set environment variables for ort-trt benchmarking - trt_ep_options = copy.deepcopy(args.trt_ep_options) - if "ORT-TRT" in ep: - trt_ep_options["trt_fp16_enable"] = "True" if "Fp16" in ep else "False" - - convert_input_fp16 = False - - # use float16.py for cuda fp16 only - if cuda_fp16 == ep: - - # handle model - if "model_path_fp16" in model_info: - model_path = model_info["model_path_fp16"] - - else: - try: - model_path = convert_model_from_float_to_float16(model_path) - convert_input_fp16 = True - except Exception as e: - logger.error(e) - update_fail_model_map(model_to_fail_ep, name, ep, "script error", e) - continue - - # handle test data - if 
"test_data_path_fp16" in model_info: - test_data_dir = model_info["test_data_path_fp16"] - convert_input_fp16 = False - - inputs, ref_outputs = get_test_data(convert_input_fp16, test_data_dir, all_inputs_shape) - # generate random input data - if args.input_data == "random": - inputs = generate_onnx_model_random_input(args.test_times, inputs[0]) - - ####################################### - # benchmark or validation - ####################################### - if args.running_mode == "benchmark": - logger.info("\n----------------------------- benchmark -------------------------------------") - - # memory tracking variables - p = None - mem_usage = None - result = None - - # get standalone TensorRT perf - if is_standalone(ep) and args.trtexec: - try: - result = run_trt_standalone( - args.trtexec, - name, - model_path, - all_inputs_shape, - ep == standalone_trt_fp16, - args.track_memory, - ) - except Exception as e: - logger.error(e) - update_fail_model_map(model_to_fail_ep, name, ep, "runtime error", e) - continue - - # inference with onnxruntime ep - else: - # resolve providers to create session - providers = ep_to_provider_list[ep] - provider_options = get_provider_options(providers, trt_ep_options, args.cuda_ep_options) - options = onnxruntime.SessionOptions() - - enablement = args.graph_enablement - if enablement == enable_all: - options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL - elif enablement == extended: - options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_EXTENDED - elif enablement == basic: - options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_BASIC - else: # disable - options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL - - # create onnxruntime inference session - try: - sess, second_creation_time = create_session(model_path, providers, provider_options, options) - - except Exception as e: - logger.error(e) - 
update_fail_model_map(model_to_fail_ep, name, ep, "runtime error", e) - continue - - if second_creation_time: - model_to_session[name] = copy.deepcopy({ep + second: second_creation_time}) - - logger.info("start to inference {} with {} ...".format(name, ep)) - logger.info(sess.get_providers()) - logger.info(sess.get_provider_options()) - - if sess: - logger.info("Model inputs nodes:") - for input_meta in sess.get_inputs(): - logger.info(input_meta) - logger.info("Model outputs nodes:") - for output_meta in sess.get_outputs(): - logger.info(output_meta) - - batch_size = 1 - result_template = { - "engine": "onnxruntime", - "version": onnxruntime.__version__, - "device": ep, - "fp16": convert_input_fp16, - "io_binding": args.io_binding, - "graph_optimizations": args.graph_enablement, - "enable_cache": args.trt_ep_options.get("trt_engine_cache_enable", "False"), - "model_name": name, - "inputs": len(sess.get_inputs()), - "batch_size": batch_size, - "sequence_length": 1, - "datetime": str(datetime.now()), - } - - # run cpu fewer times - repeat_times = 100 if ep == cpu else args.test_times - track_memory = False if ep == cpu else args.track_memory - - # inference with ort - try: - result, mem_usage = inference_ort( - args, - name, - sess, - ep, - inputs, - result_template, - repeat_times, - batch_size, - track_memory, - ) - except Exception as e: - logger.error(e) - update_fail_model_map(model_to_fail_ep, name, ep, "runtime error", e) - continue - - if result: - - latency_result[ep] = {} - latency_result[ep]["average_latency_ms"] = result["average_latency_ms"] - latency_result[ep]["latency_90_percentile"] = result["latency_90_percentile"] - if "memory" in result: - mem_usage = result["memory"] - if mem_usage: - latency_result[ep]["memory"] = mem_usage - if not args.trtexec: # skip standalone - success_results.append(result) - - model_to_latency[name] = copy.deepcopy(latency_result) - - if ep == trt_fp16: # delete engine - remove_files(args.running_mode, 
model_info["working_directory"]) - - logger.info("---------------------------- benchmark [end] ----------------------------------\n") - - elif args.running_mode == "validate": - logger.info("\n----------------------------- validate -------------------------------------") - - # enable profiling to generate profiling file for analysis - options = onnxruntime.SessionOptions() - options.enable_profiling = True - options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL - time.sleep(1) # avoid to generate same profile file name - - providers = ep_to_provider_list[ep] - provider_options = get_provider_options(providers, trt_ep_options, args.cuda_ep_options) - - # create onnxruntime inference session - try: - sess, creation_time = create_session(model_path, providers, provider_options, options) - - except Exception as e: - logger.error(e) - update_fail_model_map(model_to_fail_ep, name, ep, "runtime error", e) - continue - - if creation_time: - model_to_session[name] = copy.deepcopy({ep: creation_time}) - - sess.disable_fallback() - - logger.info("start to inference {} with {} ...".format(name, ep)) - logger.info(sess.get_providers()) - logger.info(sess.get_provider_options()) - - if sess: - logger.info("Model inputs nodes:") - for input_meta in sess.get_inputs(): - logger.info(input_meta) - logger.info("Model outputs nodes:") - for output_meta in sess.get_outputs(): - logger.info(output_meta) - - # run inference and validate the result - # - # currently skip TensorRT float16 validation intentionally - if ep not in validation_exemption: - try: - ort_outputs = inference_ort_and_get_prediction(name, sess, inputs) - - status = validate( - ref_outputs, - ort_outputs, - args.rtol, - args.atol, - args.percent_mismatch, - ) - if not status[0]: - remove_files(args.running_mode, model_info["working_directory"]) - update_fail_model_map( - model_to_fail_ep, - name, - ep, - "result accuracy issue", - status[1], - ) - continue - except Exception as e: - 
logger.error(e) - update_fail_model_map(model_to_fail_ep, name, ep, "runtime error", e) - continue - - # Run inference again. the reason is that some ep like tensorrt - # it takes much longer time to generate graph on first run and - # we need to skip the perf result of that expensive run. - inference_ort_and_get_prediction(name, sess, inputs) - else: - inference_ort_and_get_prediction(name, sess, inputs) - inference_ort_and_get_prediction(name, sess, inputs) - - sess.end_profiling() - - # get metrics from profiling file - metrics = get_profile_metrics(path, profile_already_parsed, logger) - if metrics: - logger.info(ep) - ep_to_operator[ep] = metrics - - remove_files(args.running_mode, model_info["working_directory"]) - logger.info("---------------------------- validate [end] ----------------------------------\n") - - #################### - # end of iterate ep - #################### - - # get percentage of execution time and operators in TRT - update_metrics_map(model_to_metrics, name, ep_to_operator) - - # cleanup_files() - os.chdir(pwd) - - # end of model - - return ( - success_results, - model_to_latency, - model_to_fail_ep, - model_to_metrics, - model_to_session, - ) - - def calculate_gain(value, ep1, ep2): ep1_latency = float(value[ep1]["average_latency_ms"]) ep2_latency = float(value[ep2]["average_latency_ms"]) @@ -1960,6 +1624,447 @@ def str2bool(v): raise argparse.ArgumentTypeError("Boolean value expected.") +def test_models_eps(args, models): + """ + Benchmarks or validates the given models over the provided set of EPs. + + :param args: The command-line arguments to this script. Contains the list of EPs to use. + :param models: Dictionary of models to run. The keys are model names and the values are dictionaries containing + paths to the model files and input data. + + :return: A tuple containing aggregated metrics/results. 
+ """ + + success_results = [] + model_to_latency = {} # model -> cuda and tensorrt latency + model_to_metrics = {} # model -> metrics from profiling file + model_to_fail_ep = {} # model -> failing ep + model_to_session = {} # models -> session creation time + + if os.path.exists(SESSION_FILE): + model_to_session = read_map_from_file(SESSION_FILE) + + if os.path.exists(FAIL_MODEL_FILE): + model_to_fail_ep = read_map_from_file(FAIL_MODEL_FILE) + + ep_list = [] + if args.ep: + ep_list.append(args.ep) + else: + if args.fp16: + ep_list = [cpu, cuda, trt, cuda_fp16, trt_fp16] + else: + ep_list = [cpu, cuda, trt] + + init_dir = os.getcwd() + + # Run benchmarking and/or validation for every model and EP combination. + for name, model_info in models.items(): + ep_results = {"latency": {}, "metrics": {}, "session": {}} + + for exec_provider in ep_list: + + # Skip model + EP combinations that have already failed in a previous run. + if skip_ep(name, exec_provider, model_to_fail_ep): + continue + + # Check if EP is supported. + if not is_standalone(exec_provider): + ep_ = ep_to_provider_list[exec_provider][0] + if ep_ not in onnxruntime.get_available_providers(): + logger.error("No %s support", ep_) + continue + + # Create a temporary directory for this run, which may create profiles, subgraph dumps, and TRT engines. + # The temporary directory is created in '/tmp/' and is automatically deleted after scope exit. 
+ with tempfile.TemporaryDirectory() as temp_dir: + run_model_on_ep( + args, + name, + model_info, + exec_provider, + success_results, + model_to_fail_ep, + ep_results, + temp_dir, + ) + + model_to_latency[name] = ep_results["latency"] + model_to_session[name] = ep_results["session"] + update_metrics_map(model_to_metrics, name, ep_results["metrics"]) + + os.chdir(init_dir) + + return ( + success_results, + model_to_latency, + model_to_fail_ep, + model_to_metrics, + model_to_session, + ) + + +def run_model_on_ep( + args, + model_name, + model_info, + exec_provider, + success_results, + model_to_fail_ep, + ep_results, + tmp_work_dir, +): + """ + Benchmarks and/or validates the given model on the given EP. + + :param args: The command-line arguments to this script. + :param model_name: The name of the model to run. + :param model_info: A dictionary that contains paths to the model file and input data. + :param exec_provider: The name of the EP (e.g., ORT-CUDAFp32) on which to run the model. + :param success_results: List of successful results that is updated by this function. + :param model_to_fail_ep: Dictionary that tracks failing model and EP combinations. Updated by this function. + :param ep_results: Dictionary that maps an EP to latency and operator partition results. Updated by this function. + :param tmp_work_dir: Temporary directory in which to run the model + EP. 
+ """ + + all_inputs_shape = [] # used for standalone trt + model_work_dir = os.path.abspath(model_info["working_directory"]) + model_path = os.path.normpath(os.path.join(model_work_dir, model_info["model_path"])) + test_data_dir = os.path.normpath(os.path.join(model_work_dir, model_info["test_data_path"])) + + os.chdir(tmp_work_dir) + + logger.info("Starting mode '%s' for %s on %s ...", args.running_mode, model_name, exec_provider) + + # Set environment variables for ort-trt benchmarking + trt_ep_options = copy.deepcopy(args.trt_ep_options) + if "ORT-TRT" in exec_provider: + trt_ep_options["trt_fp16_enable"] = "True" if "Fp16" in exec_provider else "False" + + # Create/set a directory to store TRT engine caches. + engine_cache_path = os.path.normpath(os.path.join(tmp_work_dir, TRT_ENGINE_CACHE_DIR_NAME)) + if not os.path.exists(engine_cache_path): + os.makedirs(engine_cache_path) + + trt_ep_options["trt_engine_cache_path"] = engine_cache_path + + convert_input_fp16 = False + + # use float16.py for cuda fp16 only + if cuda_fp16 == exec_provider: + + # handle model + if "model_path_fp16" in model_info: + model_path = os.path.normpath(os.path.join(model_work_dir, model_info["model_path_fp16"])) + + else: + try: + model_path = convert_model_from_float_to_float16(model_path, tmp_work_dir) + convert_input_fp16 = True + except Exception as excpt: + logger.error(excpt) + update_fail_model_map(model_to_fail_ep, model_name, exec_provider, "script error", excpt) + return + + # handle test data + if "test_data_path_fp16" in model_info: + test_data_dir = os.path.normpath(os.path.join(model_work_dir, model_info["test_data_path_fp16"])) + convert_input_fp16 = False + + inputs, ref_outputs = get_test_data(convert_input_fp16, test_data_dir, all_inputs_shape) + # generate random input data + if args.input_data == "random": + inputs = generate_onnx_model_random_input(args.test_times, inputs[0]) + + do_validate = is_validate_mode(args.running_mode) + do_benchmark = 
is_benchmark_mode(args.running_mode) + + validation_passed = False + + ####################################### + # Validation + ####################################### + if do_validate: + validation_passed = validate_model_on_ep( + args, + model_name, + exec_provider, + trt_ep_options, + model_path, + inputs, + ref_outputs, + model_to_fail_ep, + ep_results, + tmp_work_dir, + ) + + ####################################### + # Benchmark + ####################################### + if do_benchmark and (validation_passed or not do_validate): + benchmark_model_on_ep( + args, + model_name, + exec_provider, + trt_ep_options, + model_path, + inputs, + all_inputs_shape, + model_to_fail_ep, + ep_results, + success_results, + test_data_dir, + convert_input_fp16, + ) + + +def benchmark_model_on_ep( + args, + model_name, + exec_provider, + trt_ep_options, + model_path, + inputs, + all_inputs_shape, + model_to_fail_ep, + ep_results, + success_results, + test_data_dir, + convert_input_fp16, +): + """ + Benchmarks the given model on the given EP. + + :param args: The command-line arguments to this script. + :param model_name: The name of the model to run. + :param exec_provider: The name of the EP (e.g., ORT-CUDAFp32) on which to run the model. + :param trt_ep_options: Additional TensorRT EP session options to apply. + :param model_path: The path to the model file. + :param inputs: Inputs to the model. + :param all_inputs_shape: Input shapes. Needed by trtexec. + :param model_to_fail_ep: Dictionary that tracks failing model and EP combinations. Updated by this function. + :param ep_results: Dictionary that maps an EP to latency and operator partition results. Updated by this function. + :param success_results: List of successful results that is updated by this function. + :param test_data_dir: Directory containing input .pb files. Needed by trtexec. + :param convert_input_fp16: True if the inputs were converted to FP16. 
+ """ + + # memory tracking variables + mem_usage = None + result = None + + # get standalone TensorRT perf + if is_standalone(exec_provider) and args.trtexec: + try: + result = run_trt_standalone( + args.trtexec, + model_name, + model_path, + test_data_dir, + all_inputs_shape, + exec_provider == standalone_trt_fp16, + args.track_memory, + ) + except Exception as excpt: + logger.error(excpt) + update_fail_model_map(model_to_fail_ep, model_name, exec_provider, "runtime error", excpt) + return + + # inference with onnxruntime ep + else: + # resolve providers to create session + providers = ep_to_provider_list[exec_provider] + provider_options = get_provider_options(providers, trt_ep_options, args.cuda_ep_options) + + options = onnxruntime.SessionOptions() + options.graph_optimization_level = get_graph_opt_level(args.graph_enablement) + + # create onnxruntime inference session + try: + sess, second_creation_time = create_session(model_path, providers, provider_options, options) + + except Exception as excpt: + logger.error(excpt) + update_fail_model_map(model_to_fail_ep, model_name, exec_provider, "runtime error", excpt) + return + + if second_creation_time: + ep_results["session"][exec_provider + second] = second_creation_time + + logger.info("Start to inference %s with %s ...", model_name, exec_provider) + logger.info(sess.get_providers()) + logger.info(sess.get_provider_options()) + + if sess: + logger.info("Model inputs nodes:") + for input_meta in sess.get_inputs(): + logger.info(input_meta) + logger.info("Model outputs nodes:") + for output_meta in sess.get_outputs(): + logger.info(output_meta) + + batch_size = 1 + result_template = { + "engine": "onnxruntime", + "version": onnxruntime.__version__, + "device": exec_provider, + "fp16": convert_input_fp16, + "io_binding": args.io_binding, + "graph_optimizations": args.graph_enablement, + "enable_cache": args.trt_ep_options.get("trt_engine_cache_enable", "False"), + "model_name": model_name, + "inputs": 
len(sess.get_inputs()), + "batch_size": batch_size, + "sequence_length": 1, + "datetime": str(datetime.now()), + } + + # run cpu fewer times + repeat_times = 100 if exec_provider == cpu else args.test_times + track_memory = False if exec_provider == cpu else args.track_memory + + # inference with ort + try: + result, mem_usage = inference_ort( + args, + model_name, + sess, + exec_provider, + inputs, + result_template, + repeat_times, + batch_size, + track_memory, + ) + except Exception as excpt: + logger.error(excpt) + update_fail_model_map(model_to_fail_ep, model_name, exec_provider, "runtime error", excpt) + return + + if result: + + ep_results["latency"][exec_provider] = {} + ep_results["latency"][exec_provider]["average_latency_ms"] = result["average_latency_ms"] + ep_results["latency"][exec_provider]["latency_90_percentile"] = result["latency_90_percentile"] + if "memory" in result: + mem_usage = result["memory"] + if mem_usage: + ep_results["latency"][exec_provider]["memory"] = mem_usage + if not args.trtexec: # skip standalone + success_results.append(result) + + +def validate_model_on_ep( + args, + model_name, + exec_provider, + trt_ep_options, + model_path, + inputs, + ref_outputs, + model_to_fail_ep, + ep_results, + tmp_work_dir, +): + """ + Validates the given model on the given EP. + + :param args: The command-line arguments to this script. + :param model_name: The name of the model to run. + :param exec_provider: The name of the EP (e.g., ORT-CUDAFp32) on which to run the model. + :param trt_ep_options: Additional TensorRT EP session options to apply. + :param model_path: The path to the model file. + :param inputs: Inputs to the model. + :param ref_outputs: Reference outputs used to validate inference results. + :param model_to_fail_ep: Dictionary that tracks failing model and EP combinations. Updated by this function. + :param ep_results: Dictionary that maps an EP to latency and operator partition results. Updated by this function. 
+ :param tmp_work_dir: Temporary directory where inference profile files were dumped. + """ + + if is_standalone(exec_provider): + return True + + # enable profiling to generate profiling file for analysis + options = onnxruntime.SessionOptions() + options.enable_profiling = True + options.profile_file_prefix = f"ort_profile_{model_name}_{exec_provider}" + options.graph_optimization_level = get_graph_opt_level(args.graph_enablement) + + providers = ep_to_provider_list[exec_provider] + provider_options = get_provider_options(providers, trt_ep_options, args.cuda_ep_options) + + # create onnxruntime inference session + try: + sess, creation_time = create_session(model_path, providers, provider_options, options) + + except Exception as excpt: + logger.error(excpt) + update_fail_model_map(model_to_fail_ep, model_name, exec_provider, "runtime error", excpt) + return False + + if creation_time: + ep_results["session"][exec_provider] = creation_time + + sess.disable_fallback() + + logger.info("Start to inference %s with %s ...", model_name, exec_provider) + logger.info(sess.get_providers()) + logger.info(sess.get_provider_options()) + + if sess: + logger.info("Model inputs nodes:") + for input_meta in sess.get_inputs(): + logger.info(input_meta) + logger.info("Model outputs nodes:") + for output_meta in sess.get_outputs(): + logger.info(output_meta) + + # run inference and validate the result + # + # currently skip TensorRT float16 validation intentionally + if exec_provider != trt_fp16: + try: + ort_outputs = inference_ort_and_get_prediction(model_name, sess, inputs) + + status = validate( + ref_outputs, + ort_outputs, + args.rtol, + args.atol, + args.percent_mismatch, + ) + if not status[0]: + update_fail_model_map( + model_to_fail_ep, + model_name, + exec_provider, + "result accuracy issue", + status[1], + ) + return False + except Exception as excpt: + logger.error(excpt) + update_fail_model_map(model_to_fail_ep, model_name, exec_provider, "runtime error", excpt) + 
return False + + # Run inference again. the reason is that some ep like tensorrt + # it takes much longer time to generate graph on first run and + # we need to skip the perf result of that expensive run. + inference_ort_and_get_prediction(model_name, sess, inputs) + else: + inference_ort_and_get_prediction(model_name, sess, inputs) + inference_ort_and_get_prediction(model_name, sess, inputs) + + sess.end_profiling() + + # get metrics from profiling file + metrics = get_profile_metrics(tmp_work_dir, options.profile_file_prefix, logger) + if metrics: + ep_results["metrics"][exec_provider] = metrics + + return True + + class ParseDictArgAction(argparse.Action): def __call__(self, parser, namespace, values, option_string): dict_arg = {} @@ -2008,7 +2113,7 @@ def parse_arguments(): "--running_mode", required=False, default="benchmark", - choices=["validate", "benchmark"], + choices=["validate", "benchmark", "both"], help="Testing mode.", ) @@ -2178,7 +2283,7 @@ def main(): model_to_fail_ep, model_to_metrics, model_to_session, - ) = run_onnxruntime(args, models) + ) = test_models_eps(args, models) perf_end_time = datetime.now() logger.info("Done running the perf.") diff --git a/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py b/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py index 38b0efca39..918add64ce 100644 --- a/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py +++ b/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py @@ -56,8 +56,7 @@ def main(): else: ep_list = get_ep_list(args.comparison) - if standalone_trt in ep_list or standalone_trt_fp16 in ep_list: - trtexec = resolve_trtexec_path(args.workspace) + trtexec = resolve_trtexec_path(args.workspace) models = {} parse_models_helper(args, models) @@ -72,6 +71,9 @@ def main(): benchmark_session_csv = session_name + csv_ending specs_csv = specs_name + csv_ending + validate = is_validate_mode(args.running_mode) + benchmark = is_benchmark_mode(args.running_mode) + for model, 
model_info in models.items(): logger.info("\n" + "=" * 40 + "=" * len(model)) logger.info("=" * 20 + model + "=" * 20) @@ -83,7 +85,6 @@ def main(): write_model_info_to_file([model_info], model_list_file) for ep in ep_list: - command = [ "python3", "benchmark.py", @@ -103,10 +104,7 @@ def main(): command.append("-z") if ep == standalone_trt or ep == standalone_trt_fp16: - if args.running_mode == "validate": - continue - else: - command.extend(["--trtexec", trtexec]) + command.extend(["--trtexec", trtexec]) if len(args.cuda_ep_options): command.extend(["--cuda_ep_options", dict_to_args(args.cuda_ep_options)]) @@ -114,10 +112,10 @@ def main(): if len(args.trt_ep_options): command.extend(["--trt_ep_options", dict_to_args(args.trt_ep_options)]) - if args.running_mode == "validate": + if validate: command.extend(["--benchmark_metrics_csv", benchmark_metrics_csv]) - elif args.running_mode == "benchmark": + if benchmark: command.extend( [ "-t", @@ -135,12 +133,17 @@ def main(): ] ) - p = subprocess.run(command) - logger.info(p) + p = subprocess.run(command, stderr=subprocess.PIPE) + logger.info("Completed subprocess %s ", " ".join(p.args)) + logger.info("Return code: %d", p.returncode) if p.returncode != 0: error_type = "runtime error" error_message = "Benchmark script exited with returncode = " + str(p.returncode) + + if p.stderr: + error_message += "\nSTDERR:\n" + p.stderr.decode("utf-8") + logger.error(error_message) update_fail_model_map(model_to_fail_ep, model, ep, error_type, error_message) write_map_to_file(model_to_fail_ep, FAIL_MODEL_FILE) @@ -154,7 +157,7 @@ def main(): Path(path).mkdir(parents=True, exist_ok=True) - if args.running_mode == "validate": + if validate: logger.info("\n=========================================") logger.info("=========== Models/EPs metrics ==========") logger.info("=========================================") @@ -164,7 +167,7 @@ def main(): output_metrics(model_to_metrics, os.path.join(path, benchmark_metrics_csv)) 
def is_benchmark_mode(running_mode):
    """
    Returns True if the script's running mode requires running benchmarks.

    :param running_mode: A string denoting the script's running mode (i.e., 'benchmark', 'validate', or 'both')

    :return: True if benchmarking is required.
    """

    return running_mode in ("benchmark", "both")


def is_validate_mode(running_mode):
    """
    Returns True if the script's running mode requires running inference validation.

    :param running_mode: A string denoting the script's running mode (i.e., 'benchmark', 'validate', or 'both')

    :return: True if validation is required.
    """

    return running_mode in ("validate", "both")


def is_standalone(ep):
    """
    Returns True if the given execution provider name is one of the standalone TensorRT variants.

    :param ep: The execution provider name to test.

    :return: True if `ep` equals `standalone_trt` or `standalone_trt_fp16`.
    """

    return ep == standalone_trt or ep == standalone_trt_fp16


def get_profile_metrics(path, profile_file_prefix, logger):
    """
    Parses a session profile file to obtain information on operator usage per EP.

    Files under `path` whose names start with `profile_file_prefix` are located with `find`, ordered by
    modification timestamp (oldest first) with `sort`, and each one is handed to parse_single_file().

    :param path: The path containing the session profile file.
    :param profile_file_prefix: Custom prefix for session profile names. Refer to ORT SessionOptions.
    :param logger: The logger object to use for debug/info logging.

    :return: The non-empty parse_single_file() result of the most recently modified profile (documented
             by the original author as a tuple with operator usage information for CPU nodes and GPU
             kernels), or None if no profile was found or none produced a result.
    """

    logger.debug("Parsing/Analyzing profiling files in %s ...", path)

    # Each output line is "<modification time>\t<file path>", so a plain `sort` orders by time.
    find_proc = subprocess.Popen(
        ["find", path, "-name", f"{profile_file_prefix}*", "-printf", "%T+\t%p\n"],
        stdout=subprocess.PIPE,
    )
    sort_proc = subprocess.Popen(["sort"], stdin=find_proc.stdout, stdout=subprocess.PIPE)

    # Drop the parent's handle on the pipe so `find` gets SIGPIPE if `sort` exits first.
    find_proc.stdout.close()
    stdout, _ = sort_proc.communicate()
    find_proc.wait()  # reap the child so it does not linger as a zombie

    # File names are not guaranteed to be ASCII; surrogateescape round-trips any byte sequence
    # back to the same name when it is later passed to open().
    listing = stdout.decode("utf-8", errors="surrogateescape").strip()
    profiling_files = listing.split("\n") if listing else []
    logger.info(profiling_files)

    latest_op_map = None
    for entry in profiling_files:
        # Split on the first tab only: the remainder is the path, which may itself contain tabs.
        _, separator, profile = entry.partition("\t")
        if not separator or not profile:
            # Not a "<timestamp>\t<path>" record (e.g. a blank line); nothing to parse.
            continue

        logger.debug("Parsing profile %s ...", profile)
        with open(profile, encoding="utf-8") as fd:
            op_map = parse_single_file(fd)

        if op_map:
            # Entries are time-ordered, so the last non-empty result belongs to the newest profile.
            latest_op_map = op_map

    if latest_op_map is None:
        logger.debug("No profile metrics found.")
        return None

    return latest_op_map