From 281f199754d5ca871faa487e59a49afd247cb5cc Mon Sep 17 00:00:00 2001
From: Adrian Lizarraga <adlizarraga@microsoft.com>
Date: Tue, 8 Nov 2022 16:17:29 -0800
Subject: [PATCH] [EP-Perf-Dashboard] Reduce script excessive output (#13562)

### Description
Properly cleans up all temporary resources created while running
benchmarks.

Details:
- Dump all temporary artifacts (TRT engines, TRT profiles, inference
profiles, fp16 models) into a temp directory in `/tmp/`. Each model/EP
combination has its own temp directory that is deleted after validation
and benchmarking.
- Allow running both validation and benchmarking in one invocation of
the benchmark.py script. This is necessary to allow the benchmarking
step to reuse artifacts (e.g., TRT engines) created during validation.
Before this PR, we ran validation on all model/EP combinations before
running benchmarks on all combinations again. This required us to keep
all temporary artifacts for all model/EP combinations throughout the
entire run (expensive).
- Create individual functions for validation and benchmarking (split up
the large `run_onnxruntime` function that previously did it all).

### Motivation and Context
The EP Perf pipeline failed to run because the script generated too much
output and the VM ran out of disk space.
---
 .../python/tools/tensorrt/perf/benchmark.py   | 863 ++++++++++--------
 .../tools/tensorrt/perf/benchmark_wrapper.py  |  29 +-
 .../python/tools/tensorrt/perf/perf.sh        |   3 +-
 .../python/tools/tensorrt/perf/perf_utils.py  |  58 +-
 4 files changed, 546 insertions(+), 407 deletions(-)

diff --git a/onnxruntime/python/tools/tensorrt/perf/benchmark.py b/onnxruntime/python/tools/tensorrt/perf/benchmark.py
index eb06c86c30..7bb23084e1 100644
--- a/onnxruntime/python/tools/tensorrt/perf/benchmark.py
+++ b/onnxruntime/python/tools/tensorrt/perf/benchmark.py
@@ -8,7 +8,7 @@ import pprint
 import re
 import subprocess
 import sys
-import time
+import tempfile
 import timeit
 from datetime import datetime
 
@@ -33,7 +33,9 @@ from perf_utils import (
     get_output,
     get_profile_metrics,
     get_total_ops,
+    is_benchmark_mode,
     is_standalone,
+    is_validate_mode,
     memory_ending,
     model_title,
     ort_provider_list,
@@ -82,6 +84,8 @@ METRICS_FILE = ".metrics_map"
 SESSION_FILE = ".session_map"
 MEMORY_FILE = "./temp_memory.csv"
 
+TRT_ENGINE_CACHE_DIR_NAME = "engine_cache"
+
 
 def split_and_sort_output(string_list):
     string_list = string_list.split("\n")
@@ -104,7 +108,20 @@ def get_model_inputs(model):
     return inputs
 
 
-def run_trt_standalone(trtexec, model_name, model_path, all_inputs_shape, fp16, track_memory):
+def get_graph_opt_level(enablement):
+    opt_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
+
+    if enablement == enable_all:
+        opt_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+    elif enablement == extended:
+        opt_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
+    elif enablement == basic:
+        opt_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_BASIC
+
+    return opt_level
+
+
+def run_trt_standalone(trtexec, model_name, model_path, test_data_dir, all_inputs_shape, fp16, track_memory):
     logger.info("running standalone trt")
     onnx_model_path = "--onnx=" + model_path
 
@@ -115,12 +132,12 @@ def run_trt_standalone(trtexec, model_name, model_path, all_inputs_shape, fp16,
     model = onnx.load(model_path)
     ort_inputs = get_model_inputs(model)
 
-    output = get_output(["find", "-L", os.getcwd(), "-name", "test_data*", "-type", "d"])
-    test_data_dir = split_and_sort_output(output)[0]
+    output = get_output(["find", "-L", test_data_dir, "-name", "test_data*", "-type", "d"])
+    test_data_dir_0 = split_and_sort_output(output)[0]
 
     for i in range(len(ort_inputs)):
         name = ort_inputs[i]
-        loaded_input = name + ":" + test_data_dir + "/" + str(i) + ".bin"
+        loaded_input = name + ":" + test_data_dir_0 + "/" + str(i) + ".bin"
         logger.info(loaded_input)
         shape = []
         for j in all_inputs_shape[i]:
@@ -149,7 +166,8 @@ def run_trt_standalone(trtexec, model_name, model_path, all_inputs_shape, fp16,
         command.extend(["--fp16"])
 
     # save engine
-    engine_name = model_name + ".engine"
+    engine_suffix = "_trtexec_fp16.engine" if fp16 else "_trtexec.engine"
+    engine_name = model_name + engine_suffix
     save_command = command + ["--saveEngine=" + engine_name]
     logger.info(save_command)
     out = get_output(save_command)
@@ -167,9 +185,9 @@ def run_trt_standalone(trtexec, model_name, model_path, all_inputs_shape, fp16,
             out = get_output(load_command)
             success = True
             mem_usage = end_memory_tracking(p, success)
-        except Exception as e:
+        except Exception as excpt:
             end_memory_tracking(p, success)
-            raise (e)
+            raise excpt
     else:
         out = get_output(load_command)
 
@@ -603,41 +621,6 @@ def validate(all_ref_outputs, all_outputs, rtol, atol, percent_mismatch):
     return True, None
 
 
-# not use for this script
-def cleanup_files():
-    files = []
-    p = subprocess.Popen(["find", ".", "-name", "test_data_set*", "-type", "d"], stdout=subprocess.PIPE)
-    stdout, sterr = p.communicate()
-    stdout = stdout.decode("ascii").strip()
-    files = files + stdout.split("\n")
-
-    p = subprocess.Popen(["find", ".", "-name", "*.onnx"], stdout=subprocess.PIPE)
-    stdout, sterr = p.communicate()
-    stdout = stdout.decode("ascii").strip()
-    files = files + stdout.split("\n")
-
-    p = subprocess.Popen(["find", ".", "-name", "*.gz"], stdout=subprocess.PIPE)
-    stdout, sterr = p.communicate()
-    stdout = stdout.decode("ascii").strip()
-    files = files + stdout.split("\n")
-
-    for f in files:
-        if "custom_test_data" in f:
-            logger.info(f)
-            continue
-        subprocess.Popen(["rm", "-rf", f], stdout=subprocess.PIPE)
-
-
-def remove_files(running_mode, path):
-    files = []
-    out = ""
-    if running_mode == "validate":
-        out = get_output(["find", path, "-name", "onnxruntime_profile*"])
-    if running_mode == "benchmark":
-        logger.info(running_mode)
-        out = get_output(["find", path, "-name", "*.engine"])
-
-
 def update_fail_report(fail_results, model, ep, e_type, e):
     result = {}
 
@@ -817,11 +800,18 @@ def skip_ep(model_name, ep, model_to_fail_ep):
 
 
 def read_map_from_file(map_file):
+    """
+    Load a dictionary stored as a JSON file.
+
+    :param map_file: The name of the JSON file to load.
+
+    :return: A dictionary with the contents of the JSON file.
+    """
+
+    data = {}
+
     with open(map_file) as f:
-        try:
-            data = json.load(f)
-        except Exception as e:
-            return None
+        data = json.load(f)
 
     return data
 
@@ -1055,15 +1045,16 @@ def parse_models_info_from_file(root_dir, path, models):
                 model["test_data_path_fp16"] = row["test_data_path_fp16"]
 
 
-def convert_model_from_float_to_float16(model_path):
+def convert_model_from_float_to_float16(model_path, new_model_dir):
     from float16 import convert_float_to_float16
     from onnxmltools.utils import load_model, save_model
 
-    new_model_path = os.path.join(os.getcwd(), "new_fp16_model_by_trt_perf.onnx")
+    new_model_path = os.path.join(new_model_dir, "new_fp16_model_by_trt_perf.onnx")
+
     if not os.path.exists(new_model_path):
         onnx_model = load_model(model_path)
         new_onnx_model = convert_float_to_float16(onnx_model)
-        save_model(new_onnx_model, "new_fp16_model_by_trt_perf.onnx")
+        save_model(new_onnx_model, new_model_path)
 
     return new_model_path
 
@@ -1135,333 +1126,6 @@ def create_session(model_path, providers, provider_options, session_options):
             raise Exception(e)
 
 
-def run_onnxruntime(args, models):
-
-    success_results = []
-    model_to_latency = {}  # model -> cuda and tensorrt latency
-    model_to_metrics = {}  # model -> metrics from profiling file
-    model_to_fail_ep = {}  # model -> failing ep
-    model_to_session = {}  # models -> session creation time
-
-    if args.running_mode == "benchmark" and os.path.exists(SESSION_FILE):
-        model_to_session = read_map_from_file(SESSION_FILE)
-
-    ep_list = []
-    if args.ep:
-        ep_list.append(args.ep)
-    else:
-        if args.fp16:
-            ep_list = [cpu, cuda, trt, cuda_fp16, trt_fp16]
-        else:
-            ep_list = [cpu, cuda, trt]
-
-    validation_exemption = [trt_fp16]
-
-    if os.path.exists(FAIL_MODEL_FILE):
-        model_to_fail_ep = read_map_from_file(FAIL_MODEL_FILE)
-
-    #######################
-    # iterate model
-    #######################
-    for name, model_info in models.items():
-        latency_result = {}
-        path = model_info["working_directory"]
-
-        pwd = os.getcwd()
-        if not os.path.exists(path):
-            os.mkdir(path)
-        os.chdir(path)
-        path = os.getcwd()
-
-        inputs = []
-        ref_outputs = []
-        all_inputs_shape = []  # use for standalone trt
-        ep_to_operator = {}  # ep -> { operator -> count }
-        profile_already_parsed = set()
-
-        #######################
-        # iterate ep
-        #######################
-        for ep in ep_list:
-            if skip_ep(name, ep, model_to_fail_ep):
-                continue
-
-            if not is_standalone(ep):
-                ep_ = ep_to_provider_list[ep][0]
-                if ep_ not in onnxruntime.get_available_providers():
-                    logger.error("No {} support".format(ep_))
-                    continue
-
-            model_path = model_info["model_path"]
-            test_data_dir = model_info["test_data_path"]
-
-            logger.info("[Initialize]  model = {}, ep = {} ...".format(name, ep))
-
-            # Set environment variables for ort-trt benchmarking
-            trt_ep_options = copy.deepcopy(args.trt_ep_options)
-            if "ORT-TRT" in ep:
-                trt_ep_options["trt_fp16_enable"] = "True" if "Fp16" in ep else "False"
-
-            convert_input_fp16 = False
-
-            # use float16.py for cuda fp16 only
-            if cuda_fp16 == ep:
-
-                # handle model
-                if "model_path_fp16" in model_info:
-                    model_path = model_info["model_path_fp16"]
-
-                else:
-                    try:
-                        model_path = convert_model_from_float_to_float16(model_path)
-                        convert_input_fp16 = True
-                    except Exception as e:
-                        logger.error(e)
-                        update_fail_model_map(model_to_fail_ep, name, ep, "script error", e)
-                        continue
-
-                # handle test data
-                if "test_data_path_fp16" in model_info:
-                    test_data_dir = model_info["test_data_path_fp16"]
-                    convert_input_fp16 = False
-
-            inputs, ref_outputs = get_test_data(convert_input_fp16, test_data_dir, all_inputs_shape)
-            # generate random input data
-            if args.input_data == "random":
-                inputs = generate_onnx_model_random_input(args.test_times, inputs[0])
-
-            #######################################
-            # benchmark or validation
-            #######################################
-            if args.running_mode == "benchmark":
-                logger.info("\n----------------------------- benchmark -------------------------------------")
-
-                # memory tracking variables
-                p = None
-                mem_usage = None
-                result = None
-
-                # get standalone TensorRT perf
-                if is_standalone(ep) and args.trtexec:
-                    try:
-                        result = run_trt_standalone(
-                            args.trtexec,
-                            name,
-                            model_path,
-                            all_inputs_shape,
-                            ep == standalone_trt_fp16,
-                            args.track_memory,
-                        )
-                    except Exception as e:
-                        logger.error(e)
-                        update_fail_model_map(model_to_fail_ep, name, ep, "runtime error", e)
-                        continue
-
-                # inference with onnxruntime ep
-                else:
-                    # resolve providers to create session
-                    providers = ep_to_provider_list[ep]
-                    provider_options = get_provider_options(providers, trt_ep_options, args.cuda_ep_options)
-                    options = onnxruntime.SessionOptions()
-
-                    enablement = args.graph_enablement
-                    if enablement == enable_all:
-                        options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
-                    elif enablement == extended:
-                        options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
-                    elif enablement == basic:
-                        options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_BASIC
-                    else:  # disable
-                        options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
-
-                    # create onnxruntime inference session
-                    try:
-                        sess, second_creation_time = create_session(model_path, providers, provider_options, options)
-
-                    except Exception as e:
-                        logger.error(e)
-                        update_fail_model_map(model_to_fail_ep, name, ep, "runtime error", e)
-                        continue
-
-                    if second_creation_time:
-                        model_to_session[name] = copy.deepcopy({ep + second: second_creation_time})
-
-                    logger.info("start to inference {} with {} ...".format(name, ep))
-                    logger.info(sess.get_providers())
-                    logger.info(sess.get_provider_options())
-
-                    if sess:
-                        logger.info("Model inputs nodes:")
-                        for input_meta in sess.get_inputs():
-                            logger.info(input_meta)
-                        logger.info("Model outputs nodes:")
-                        for output_meta in sess.get_outputs():
-                            logger.info(output_meta)
-
-                    batch_size = 1
-                    result_template = {
-                        "engine": "onnxruntime",
-                        "version": onnxruntime.__version__,
-                        "device": ep,
-                        "fp16": convert_input_fp16,
-                        "io_binding": args.io_binding,
-                        "graph_optimizations": args.graph_enablement,
-                        "enable_cache": args.trt_ep_options.get("trt_engine_cache_enable", "False"),
-                        "model_name": name,
-                        "inputs": len(sess.get_inputs()),
-                        "batch_size": batch_size,
-                        "sequence_length": 1,
-                        "datetime": str(datetime.now()),
-                    }
-
-                    # run cpu fewer times
-                    repeat_times = 100 if ep == cpu else args.test_times
-                    track_memory = False if ep == cpu else args.track_memory
-
-                    # inference with ort
-                    try:
-                        result, mem_usage = inference_ort(
-                            args,
-                            name,
-                            sess,
-                            ep,
-                            inputs,
-                            result_template,
-                            repeat_times,
-                            batch_size,
-                            track_memory,
-                        )
-                    except Exception as e:
-                        logger.error(e)
-                        update_fail_model_map(model_to_fail_ep, name, ep, "runtime error", e)
-                        continue
-
-                if result:
-
-                    latency_result[ep] = {}
-                    latency_result[ep]["average_latency_ms"] = result["average_latency_ms"]
-                    latency_result[ep]["latency_90_percentile"] = result["latency_90_percentile"]
-                    if "memory" in result:
-                        mem_usage = result["memory"]
-                    if mem_usage:
-                        latency_result[ep]["memory"] = mem_usage
-                    if not args.trtexec:  # skip standalone
-                        success_results.append(result)
-
-                    model_to_latency[name] = copy.deepcopy(latency_result)
-
-                    if ep == trt_fp16:  # delete engine
-                        remove_files(args.running_mode, model_info["working_directory"])
-
-                logger.info("---------------------------- benchmark [end] ----------------------------------\n")
-
-            elif args.running_mode == "validate":
-                logger.info("\n----------------------------- validate -------------------------------------")
-
-                # enable profiling to generate profiling file for analysis
-                options = onnxruntime.SessionOptions()
-                options.enable_profiling = True
-                options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
-                time.sleep(1)  # avoid to generate same profile file name
-
-                providers = ep_to_provider_list[ep]
-                provider_options = get_provider_options(providers, trt_ep_options, args.cuda_ep_options)
-
-                # create onnxruntime inference session
-                try:
-                    sess, creation_time = create_session(model_path, providers, provider_options, options)
-
-                except Exception as e:
-                    logger.error(e)
-                    update_fail_model_map(model_to_fail_ep, name, ep, "runtime error", e)
-                    continue
-
-                if creation_time:
-                    model_to_session[name] = copy.deepcopy({ep: creation_time})
-
-                sess.disable_fallback()
-
-                logger.info("start to inference {} with {} ...".format(name, ep))
-                logger.info(sess.get_providers())
-                logger.info(sess.get_provider_options())
-
-                if sess:
-                    logger.info("Model inputs nodes:")
-                    for input_meta in sess.get_inputs():
-                        logger.info(input_meta)
-                    logger.info("Model outputs nodes:")
-                    for output_meta in sess.get_outputs():
-                        logger.info(output_meta)
-
-                # run inference and validate the result
-                #
-                # currently skip TensorRT float16 validation intentionally
-                if ep not in validation_exemption:
-                    try:
-                        ort_outputs = inference_ort_and_get_prediction(name, sess, inputs)
-
-                        status = validate(
-                            ref_outputs,
-                            ort_outputs,
-                            args.rtol,
-                            args.atol,
-                            args.percent_mismatch,
-                        )
-                        if not status[0]:
-                            remove_files(args.running_mode, model_info["working_directory"])
-                            update_fail_model_map(
-                                model_to_fail_ep,
-                                name,
-                                ep,
-                                "result accuracy issue",
-                                status[1],
-                            )
-                            continue
-                    except Exception as e:
-                        logger.error(e)
-                        update_fail_model_map(model_to_fail_ep, name, ep, "runtime error", e)
-                        continue
-
-                    # Run inference again. the reason is that some ep like tensorrt
-                    # it takes much longer time to generate graph on first run and
-                    # we need to skip the perf result of that expensive run.
-                    inference_ort_and_get_prediction(name, sess, inputs)
-                else:
-                    inference_ort_and_get_prediction(name, sess, inputs)
-                    inference_ort_and_get_prediction(name, sess, inputs)
-
-                sess.end_profiling()
-
-                # get metrics from profiling file
-                metrics = get_profile_metrics(path, profile_already_parsed, logger)
-                if metrics:
-                    logger.info(ep)
-                    ep_to_operator[ep] = metrics
-
-                remove_files(args.running_mode, model_info["working_directory"])
-                logger.info("---------------------------- validate [end] ----------------------------------\n")
-
-        ####################
-        # end of iterate ep
-        ####################
-
-        # get percentage of execution time and operators in TRT
-        update_metrics_map(model_to_metrics, name, ep_to_operator)
-
-        # cleanup_files()
-        os.chdir(pwd)
-
-        # end of model
-
-    return (
-        success_results,
-        model_to_latency,
-        model_to_fail_ep,
-        model_to_metrics,
-        model_to_session,
-    )
-
-
 def calculate_gain(value, ep1, ep2):
     ep1_latency = float(value[ep1]["average_latency_ms"])
     ep2_latency = float(value[ep2]["average_latency_ms"])
@@ -1960,6 +1624,447 @@ def str2bool(v):
         raise argparse.ArgumentTypeError("Boolean value expected.")
 
 
+def test_models_eps(args, models):
+    """
+    Benchmarks or validates the given models over the provided set of EPs.
+
+    :param args: The command-line arguments to this script. Contains the list of EPs to use.
+    :param models: Dictionary of models to run. The keys are model names and the values are dictionaries containing
+                   paths to the model files and input data.
+
+    :return: A tuple containing aggregated metrics/results.
+    """
+
+    success_results = []
+    model_to_latency = {}  # model -> cuda and tensorrt latency
+    model_to_metrics = {}  # model -> metrics from profiling file
+    model_to_fail_ep = {}  # model -> failing ep
+    model_to_session = {}  # models -> session creation time
+
+    if os.path.exists(SESSION_FILE):
+        model_to_session = read_map_from_file(SESSION_FILE)
+
+    if os.path.exists(FAIL_MODEL_FILE):
+        model_to_fail_ep = read_map_from_file(FAIL_MODEL_FILE)
+
+    ep_list = []
+    if args.ep:
+        ep_list.append(args.ep)
+    else:
+        if args.fp16:
+            ep_list = [cpu, cuda, trt, cuda_fp16, trt_fp16]
+        else:
+            ep_list = [cpu, cuda, trt]
+
+    init_dir = os.getcwd()
+
+    # Run benchmarking and/or validation for every model and EP combination.
+    for name, model_info in models.items():
+        ep_results = {"latency": {}, "metrics": {}, "session": {}}
+
+        for exec_provider in ep_list:
+
+            # Skip model + EP combinations that have already failed in a previous run.
+            if skip_ep(name, exec_provider, model_to_fail_ep):
+                continue
+
+            # Check if EP is supported.
+            if not is_standalone(exec_provider):
+                ep_ = ep_to_provider_list[exec_provider][0]
+                if ep_ not in onnxruntime.get_available_providers():
+                    logger.error("No %s support", ep_)
+                    continue
+
+            # Create a temporary directory for this run, which may create profiles, subgraph dumps, and TRT engines.
+            # The temporary directory is created in '/tmp/' and is automatically deleted after scope exit.
+            with tempfile.TemporaryDirectory() as temp_dir:
+                run_model_on_ep(
+                    args,
+                    name,
+                    model_info,
+                    exec_provider,
+                    success_results,
+                    model_to_fail_ep,
+                    ep_results,
+                    temp_dir,
+                )
+
+        model_to_latency[name] = ep_results["latency"]
+        model_to_session[name] = ep_results["session"]
+        update_metrics_map(model_to_metrics, name, ep_results["metrics"])
+
+    os.chdir(init_dir)
+
+    return (
+        success_results,
+        model_to_latency,
+        model_to_fail_ep,
+        model_to_metrics,
+        model_to_session,
+    )
+
+
+def run_model_on_ep(
+    args,
+    model_name,
+    model_info,
+    exec_provider,
+    success_results,
+    model_to_fail_ep,
+    ep_results,
+    tmp_work_dir,
+):
+    """
+    Benchmarks and/or validates the given model on the given EP.
+
+    :param args: The command-line arguments to this script.
+    :param model_name: The name of the model to run.
+    :param model_info: A dictionary that contains paths to the model file and input data.
+    :param exec_provider: The name of the EP (e.g., ORT-CUDAFp32) on which to run the model.
+    :param success_results: List of successful results that is updated by this function.
+    :param model_to_fail_ep: Dictionary that tracks failing model and EP combinations. Updated by this function.
+    :param ep_results: Dictionary that maps an EP to latency and operator partition results. Updated by this function.
+    :param tmp_work_dir: Temporary directory in which to run the model + EP.
+    """
+
+    all_inputs_shape = []  # used for standalone trt
+    model_work_dir = os.path.abspath(model_info["working_directory"])
+    model_path = os.path.normpath(os.path.join(model_work_dir, model_info["model_path"]))
+    test_data_dir = os.path.normpath(os.path.join(model_work_dir, model_info["test_data_path"]))
+
+    os.chdir(tmp_work_dir)
+
+    logger.info("Starting mode '%s' for %s on %s ...", args.running_mode, model_name, exec_provider)
+
+    # Set environment variables for ort-trt benchmarking
+    trt_ep_options = copy.deepcopy(args.trt_ep_options)
+    if "ORT-TRT" in exec_provider:
+        trt_ep_options["trt_fp16_enable"] = "True" if "Fp16" in exec_provider else "False"
+
+        # Create/set a directory to store TRT engine caches.
+        engine_cache_path = os.path.normpath(os.path.join(tmp_work_dir, TRT_ENGINE_CACHE_DIR_NAME))
+        if not os.path.exists(engine_cache_path):
+            os.makedirs(engine_cache_path)
+
+        trt_ep_options["trt_engine_cache_path"] = engine_cache_path
+
+    convert_input_fp16 = False
+
+    # use float16.py for cuda fp16 only
+    if cuda_fp16 == exec_provider:
+
+        # handle model
+        if "model_path_fp16" in model_info:
+            model_path = os.path.normpath(os.path.join(model_work_dir, model_info["model_path_fp16"]))
+
+        else:
+            try:
+                model_path = convert_model_from_float_to_float16(model_path, tmp_work_dir)
+                convert_input_fp16 = True
+            except Exception as excpt:
+                logger.error(excpt)
+                update_fail_model_map(model_to_fail_ep, model_name, exec_provider, "script error", excpt)
+                return
+
+        # handle test data
+        if "test_data_path_fp16" in model_info:
+            test_data_dir = os.path.normpath(os.path.join(model_work_dir, model_info["test_data_path_fp16"]))
+            convert_input_fp16 = False
+
+    inputs, ref_outputs = get_test_data(convert_input_fp16, test_data_dir, all_inputs_shape)
+    # generate random input data
+    if args.input_data == "random":
+        inputs = generate_onnx_model_random_input(args.test_times, inputs[0])
+
+    do_validate = is_validate_mode(args.running_mode)
+    do_benchmark = is_benchmark_mode(args.running_mode)
+
+    validation_passed = False
+
+    #######################################
+    # Validation
+    #######################################
+    if do_validate:
+        validation_passed = validate_model_on_ep(
+            args,
+            model_name,
+            exec_provider,
+            trt_ep_options,
+            model_path,
+            inputs,
+            ref_outputs,
+            model_to_fail_ep,
+            ep_results,
+            tmp_work_dir,
+        )
+
+    #######################################
+    # Benchmark
+    #######################################
+    if do_benchmark and (validation_passed or not do_validate):
+        benchmark_model_on_ep(
+            args,
+            model_name,
+            exec_provider,
+            trt_ep_options,
+            model_path,
+            inputs,
+            all_inputs_shape,
+            model_to_fail_ep,
+            ep_results,
+            success_results,
+            test_data_dir,
+            convert_input_fp16,
+        )
+
+
+def benchmark_model_on_ep(
+    args,
+    model_name,
+    exec_provider,
+    trt_ep_options,
+    model_path,
+    inputs,
+    all_inputs_shape,
+    model_to_fail_ep,
+    ep_results,
+    success_results,
+    test_data_dir,
+    convert_input_fp16,
+):
+    """
+    Benchmarks the given model on the given EP.
+
+    :param args: The command-line arguments to this script.
+    :param model_name: The name of the model to run.
+    :param exec_provider: The name of the EP (e.g., ORT-CUDAFp32) on which to run the model.
+    :param trt_ep_options: Additional TensorRT EP session options to apply.
+    :param model_path: The path to the model file.
+    :param inputs: Inputs to the model.
+    :param all_inputs_shape: Input shapes. Needed by trtexec.
+    :param model_to_fail_ep: Dictionary that tracks failing model and EP combinations. Updated by this function.
+    :param ep_results: Dictionary that maps an EP to latency and operator partition results. Updated by this function.
+    :param success_results: List of successful results that is updated by this function.
+    :param test_data_dir: Directory containing input .pb files. Needed by trtexec.
+    :param convert_input_fp16: True if the inputs were converted to FP16.
+    """
+
+    # memory tracking variables
+    mem_usage = None
+    result = None
+
+    # get standalone TensorRT perf
+    if is_standalone(exec_provider) and args.trtexec:
+        try:
+            result = run_trt_standalone(
+                args.trtexec,
+                model_name,
+                model_path,
+                test_data_dir,
+                all_inputs_shape,
+                exec_provider == standalone_trt_fp16,
+                args.track_memory,
+            )
+        except Exception as excpt:
+            logger.error(excpt)
+            update_fail_model_map(model_to_fail_ep, model_name, exec_provider, "runtime error", excpt)
+            return
+
+    # inference with onnxruntime ep
+    else:
+        # resolve providers to create session
+        providers = ep_to_provider_list[exec_provider]
+        provider_options = get_provider_options(providers, trt_ep_options, args.cuda_ep_options)
+
+        options = onnxruntime.SessionOptions()
+        options.graph_optimization_level = get_graph_opt_level(args.graph_enablement)
+
+        # create onnxruntime inference session
+        try:
+            sess, second_creation_time = create_session(model_path, providers, provider_options, options)
+
+        except Exception as excpt:
+            logger.error(excpt)
+            update_fail_model_map(model_to_fail_ep, model_name, exec_provider, "runtime error", excpt)
+            return
+
+        if second_creation_time:
+            ep_results["session"][exec_provider + second] = second_creation_time
+
+        logger.info("Start to inference %s with %s ...", model_name, exec_provider)
+        logger.info(sess.get_providers())
+        logger.info(sess.get_provider_options())
+
+        if sess:
+            logger.info("Model inputs nodes:")
+            for input_meta in sess.get_inputs():
+                logger.info(input_meta)
+            logger.info("Model outputs nodes:")
+            for output_meta in sess.get_outputs():
+                logger.info(output_meta)
+
+        batch_size = 1
+        result_template = {
+            "engine": "onnxruntime",
+            "version": onnxruntime.__version__,
+            "device": exec_provider,
+            "fp16": convert_input_fp16,
+            "io_binding": args.io_binding,
+            "graph_optimizations": args.graph_enablement,
+            "enable_cache": args.trt_ep_options.get("trt_engine_cache_enable", "False"),
+            "model_name": model_name,
+            "inputs": len(sess.get_inputs()),
+            "batch_size": batch_size,
+            "sequence_length": 1,
+            "datetime": str(datetime.now()),
+        }
+
+        # run cpu fewer times
+        repeat_times = 100 if exec_provider == cpu else args.test_times
+        track_memory = False if exec_provider == cpu else args.track_memory
+
+        # inference with ort
+        try:
+            result, mem_usage = inference_ort(
+                args,
+                model_name,
+                sess,
+                exec_provider,
+                inputs,
+                result_template,
+                repeat_times,
+                batch_size,
+                track_memory,
+            )
+        except Exception as excpt:
+            logger.error(excpt)
+            update_fail_model_map(model_to_fail_ep, model_name, exec_provider, "runtime error", excpt)
+            return
+
+    if result:
+
+        ep_results["latency"][exec_provider] = {}
+        ep_results["latency"][exec_provider]["average_latency_ms"] = result["average_latency_ms"]
+        ep_results["latency"][exec_provider]["latency_90_percentile"] = result["latency_90_percentile"]
+        if "memory" in result:
+            mem_usage = result["memory"]
+        if mem_usage:
+            ep_results["latency"][exec_provider]["memory"] = mem_usage
+        if not args.trtexec:  # skip standalone
+            success_results.append(result)
+
+
+def validate_model_on_ep(
+    args,
+    model_name,
+    exec_provider,
+    trt_ep_options,
+    model_path,
+    inputs,
+    ref_outputs,
+    model_to_fail_ep,
+    ep_results,
+    tmp_work_dir,
+):
+    """
+    Validates the given model on the given EP. Returns True on success (always for standalone EPs), False on failure.
+
+    :param args: The command-line arguments to this script.
+    :param model_name: The name of the model to run.
+    :param exec_provider: The name of the EP (e.g., ORT-CUDAFp32) on which to run the model.
+    :param trt_ep_options: Additional TensorRT EP session options to apply.
+    :param model_path: The path to the model file.
+    :param inputs: Inputs to the model.
+    :param ref_outputs: Reference outputs used to validate inference results.
+    :param model_to_fail_ep: Dictionary that tracks failing model and EP combinations. Updated by this function.
+    :param ep_results: Dictionary of per-EP results. This function updates its "session" and "metrics" entries.
+    :param tmp_work_dir: Temporary directory that is searched for this session's inference profile files.
+    """
+
+    if is_standalone(exec_provider):
+        return True
+
+    # Enable profiling; the file prefix set here is reused below to locate the generated profile.
+    options = onnxruntime.SessionOptions()
+    options.enable_profiling = True
+    options.profile_file_prefix = f"ort_profile_{model_name}_{exec_provider}"
+    options.graph_optimization_level = get_graph_opt_level(args.graph_enablement)
+
+    providers = ep_to_provider_list[exec_provider]
+    provider_options = get_provider_options(providers, trt_ep_options, args.cuda_ep_options)
+
+    # Create the onnxruntime inference session; a failure is recorded as a "runtime error".
+    try:
+        sess, creation_time = create_session(model_path, providers, provider_options, options)
+
+    except Exception as excpt:
+        logger.error(excpt)
+        update_fail_model_map(model_to_fail_ep, model_name, exec_provider, "runtime error", excpt)
+        return False
+
+    if creation_time:
+        ep_results["session"][exec_provider] = creation_time
+
+    sess.disable_fallback()
+
+    logger.info("Start to inference %s with %s ...", model_name, exec_provider)
+    logger.info(sess.get_providers())
+    logger.info(sess.get_provider_options())
+
+    if sess:  # NOTE(review): sess is already dereferenced above, so this guard looks redundant - confirm
+        logger.info("Model inputs nodes:")
+        for input_meta in sess.get_inputs():
+            logger.info(input_meta)
+        logger.info("Model outputs nodes:")
+        for output_meta in sess.get_outputs():
+            logger.info(output_meta)
+
+    # Run inference and validate the outputs against ref_outputs.
+    #
+    # Validation is currently skipped intentionally for TensorRT float16 (trt_fp16).
+    if exec_provider != trt_fp16:
+        try:
+            ort_outputs = inference_ort_and_get_prediction(model_name, sess, inputs)
+
+            status = validate(
+                ref_outputs,
+                ort_outputs,
+                args.rtol,
+                args.atol,
+                args.percent_mismatch,
+            )
+            if not status[0]:
+                update_fail_model_map(
+                    model_to_fail_ep,
+                    model_name,
+                    exec_provider,
+                    "result accuracy issue",
+                    status[1],
+                )
+                return False
+        except Exception as excpt:
+            logger.error(excpt)
+            update_fail_model_map(model_to_fail_ep, model_name, exec_provider, "runtime error", excpt)
+            return False
+
+        # Run inference again: some EPs (e.g., TensorRT) take much longer
+        # on the first run because they generate the graph then, and we
+        # need to skip the perf result of that expensive run.
+        inference_ort_and_get_prediction(model_name, sess, inputs)
+    else:
+        inference_ort_and_get_prediction(model_name, sess, inputs)
+        inference_ort_and_get_prediction(model_name, sess, inputs)
+
+    sess.end_profiling()
+
+    # Parse the profile file matching this session's prefix and record its metrics, if any.
+    metrics = get_profile_metrics(tmp_work_dir, options.profile_file_prefix, logger)
+    if metrics:
+        ep_results["metrics"][exec_provider] = metrics
+
+    return True
+
+
 class ParseDictArgAction(argparse.Action):
     def __call__(self, parser, namespace, values, option_string):
         dict_arg = {}
@@ -2008,7 +2113,7 @@ def parse_arguments():
         "--running_mode",
         required=False,
         default="benchmark",
-        choices=["validate", "benchmark"],
+        choices=["validate", "benchmark", "both"],
         help="Testing mode.",
     )
 
@@ -2178,7 +2283,7 @@ def main():
         model_to_fail_ep,
         model_to_metrics,
         model_to_session,
-    ) = run_onnxruntime(args, models)
+    ) = test_models_eps(args, models)
     perf_end_time = datetime.now()
 
     logger.info("Done running the perf.")
diff --git a/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py b/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py
index 38b0efca39..918add64ce 100644
--- a/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py
+++ b/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py
@@ -56,8 +56,7 @@ def main():
     else:
         ep_list = get_ep_list(args.comparison)
 
-    if standalone_trt in ep_list or standalone_trt_fp16 in ep_list:
-        trtexec = resolve_trtexec_path(args.workspace)
+    trtexec = resolve_trtexec_path(args.workspace)
 
     models = {}
     parse_models_helper(args, models)
@@ -72,6 +71,9 @@ def main():
     benchmark_session_csv = session_name + csv_ending
     specs_csv = specs_name + csv_ending
 
+    validate = is_validate_mode(args.running_mode)
+    benchmark = is_benchmark_mode(args.running_mode)
+
     for model, model_info in models.items():
         logger.info("\n" + "=" * 40 + "=" * len(model))
         logger.info("=" * 20 + model + "=" * 20)
@@ -83,7 +85,6 @@ def main():
         write_model_info_to_file([model_info], model_list_file)
 
         for ep in ep_list:
-
             command = [
                 "python3",
                 "benchmark.py",
@@ -103,10 +104,7 @@ def main():
                 command.append("-z")
 
             if ep == standalone_trt or ep == standalone_trt_fp16:
-                if args.running_mode == "validate":
-                    continue
-                else:
-                    command.extend(["--trtexec", trtexec])
+                command.extend(["--trtexec", trtexec])
 
             if len(args.cuda_ep_options):
                 command.extend(["--cuda_ep_options", dict_to_args(args.cuda_ep_options)])
@@ -114,10 +112,10 @@ def main():
             if len(args.trt_ep_options):
                 command.extend(["--trt_ep_options", dict_to_args(args.trt_ep_options)])
 
-            if args.running_mode == "validate":
+            if validate:
                 command.extend(["--benchmark_metrics_csv", benchmark_metrics_csv])
 
-            elif args.running_mode == "benchmark":
+            if benchmark:
                 command.extend(
                     [
                         "-t",
@@ -135,12 +133,17 @@ def main():
                     ]
                 )
 
-            p = subprocess.run(command)
-            logger.info(p)
+            p = subprocess.run(command, stderr=subprocess.PIPE)
+            logger.info("Completed subprocess %s ", " ".join(p.args))
+            logger.info("Return code: %d", p.returncode)
 
             if p.returncode != 0:
                 error_type = "runtime error"
                 error_message = "Benchmark script exited with returncode = " + str(p.returncode)
+
+                if p.stderr:
+                    error_message += "\nSTDERR:\n" + p.stderr.decode("utf-8")
+
                 logger.error(error_message)
                 update_fail_model_map(model_to_fail_ep, model, ep, error_type, error_message)
                 write_map_to_file(model_to_fail_ep, FAIL_MODEL_FILE)
@@ -154,7 +157,7 @@ def main():
 
         Path(path).mkdir(parents=True, exist_ok=True)
 
-    if args.running_mode == "validate":
+    if validate:
         logger.info("\n=========================================")
         logger.info("=========== Models/EPs metrics ==========")
         logger.info("=========================================")
@@ -164,7 +167,7 @@ def main():
             output_metrics(model_to_metrics, os.path.join(path, benchmark_metrics_csv))
             logger.info("\nSaved model metrics results to {}".format(benchmark_metrics_csv))
 
-    elif args.running_mode == "benchmark":
+    if benchmark:
         logger.info("\n=========================================")
         logger.info("======= Models/EPs session creation =======")
         logger.info("=========================================")
diff --git a/onnxruntime/python/tools/tensorrt/perf/perf.sh b/onnxruntime/python/tools/tensorrt/perf/perf.sh
index 5f67156043..905ccc476d 100755
--- a/onnxruntime/python/tools/tensorrt/perf/perf.sh
+++ b/onnxruntime/python/tools/tensorrt/perf/perf.sh
@@ -65,5 +65,4 @@ setup() {
 }
 
 setup
-python benchmark_wrapper.py -r validate -m $MODEL_PATH -o result/$OPTION $OPTIONAL_ARGS 
-python benchmark_wrapper.py -r benchmark -t 1200 -m $MODEL_PATH -o result/$OPTION $OPTIONAL_ARGS
+python benchmark_wrapper.py -r both -t 1200 -m $MODEL_PATH -o result/$OPTION $OPTIONAL_ARGS
diff --git a/onnxruntime/python/tools/tensorrt/perf/perf_utils.py b/onnxruntime/python/tools/tensorrt/perf/perf_utils.py
index 3beefaf9c2..61cac72b27 100644
--- a/onnxruntime/python/tools/tensorrt/perf/perf_utils.py
+++ b/onnxruntime/python/tools/tensorrt/perf/perf_utils.py
@@ -71,6 +71,30 @@ extended = "extended"
 enable_all = "all"
 
 
+def is_benchmark_mode(running_mode):
+    """
+    Returns True if the script's running mode requires running benchmarks.
+
+    :param running_mode: A string denoting the script's running mode: 'benchmark', 'validate', or 'both'.
+
+    :return: True if running_mode is 'benchmark' or 'both'; False for any other value.
+    """
+
+    return running_mode == "benchmark" or running_mode == "both"
+
+
+def is_validate_mode(running_mode):
+    """
+    Returns True if the script's running mode requires running inference validation.
+
+    :param running_mode: A string denoting the script's running mode: 'benchmark', 'validate', or 'both'.
+
+    :return: True if running_mode is 'validate' or 'both'; False for any other value.
+    """
+
+    return running_mode == "validate" or running_mode == "both"
+
+
 def is_standalone(ep):
     return ep == standalone_trt or ep == standalone_trt_fp16
 
@@ -277,14 +301,25 @@ def calculate_trt_latency_percentage(trt_op_map):
     return (total_trt_execution_time, total_execution_time, ratio_of_trt_execution_time)
 
 
-def get_profile_metrics(path, profile_already_parsed, logger=None):
-    logger.info("Parsing/Analyzing profiling files in {} ...".format(path))
-    p1 = subprocess.Popen(
-        ["find", path, "-name", "onnxruntime_profile*", "-printf", "%T+\t%p\n"],
+def get_profile_metrics(path, profile_file_prefix, logger):
+    """
+    Parses a session profile file to obtain information on operator usage per EP.
+
+    :param path: The path containing the session profile file.
+    :param profile_file_prefix: Custom prefix for session profile names. Refer to ORT SessionOptions.
+    :param logger: The logger object to use for debug/info logging.
+
+    :return: A tuple containing the parsed operator usage information for CPU nodes and GPU kernels.
+    """
+
+    logger.debug("Parsing/Analyzing profiling files in %s ...", path)
+
+    find_proc = subprocess.Popen(
+        ["find", path, "-name", f"{profile_file_prefix}*", "-printf", "%T+\t%p\n"],
         stdout=subprocess.PIPE,
     )
-    p2 = subprocess.Popen(["sort"], stdin=p1.stdout, stdout=subprocess.PIPE)
-    stdout, sterr = p2.communicate()
+    sort_proc = subprocess.Popen(["sort"], stdin=find_proc.stdout, stdout=subprocess.PIPE)
+    stdout, sterr = sort_proc.communicate()
     stdout = stdout.decode("ascii").strip()
     profiling_files = stdout.split("\n")
     logger.info(profiling_files)
@@ -292,18 +327,15 @@ def get_profile_metrics(path, profile_already_parsed, logger=None):
     data = []
     for profile in profiling_files:
         profile = profile.split("\t")[1]
-        if profile in profile_already_parsed:
-            continue
-        profile_already_parsed.add(profile)
 
-        logger.info("start to parse {} ...".format(profile))
-        with open(profile) as f:
-            op_map = parse_single_file(f)
+        logger.debug("Parsing profile %s ...", profile)
+        with open(profile, encoding="utf-8") as fd:
+            op_map = parse_single_file(fd)
             if op_map:
                 data.append(op_map)
 
     if len(data) == 0:
-        logger.info("No profile metrics got.")
+        logger.debug("No profile metrics found.")
         return None
 
     return data[-1]