diff --git a/onnxruntime/python/tools/tensorrt/perf/benchmark.py b/onnxruntime/python/tools/tensorrt/perf/benchmark.py
index eb06c86c30..7bb23084e1 100644
--- a/onnxruntime/python/tools/tensorrt/perf/benchmark.py
+++ b/onnxruntime/python/tools/tensorrt/perf/benchmark.py
@@ -8,7 +8,7 @@ import pprint
 import re
 import subprocess
 import sys
-import time
+import tempfile
 import timeit
 from datetime import datetime
 
@@ -33,7 +33,9 @@ from perf_utils import (
     get_output,
     get_profile_metrics,
     get_total_ops,
+    is_benchmark_mode,
     is_standalone,
+    is_validate_mode,
     memory_ending,
     model_title,
     ort_provider_list,
@@ -82,6 +84,8 @@ METRICS_FILE = ".metrics_map"
 SESSION_FILE = ".session_map"
 MEMORY_FILE = "./temp_memory.csv"
 
+TRT_ENGINE_CACHE_DIR_NAME = "engine_cache"
+
 
 def split_and_sort_output(string_list):
     string_list = string_list.split("\n")
@@ -104,7 +108,20 @@ def get_model_inputs(model):
     return inputs
 
 
-def run_trt_standalone(trtexec, model_name, model_path, all_inputs_shape, fp16, track_memory):
+def get_graph_opt_level(enablement):
+    opt_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
+
+    if enablement == enable_all:
+        opt_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+    elif enablement == extended:
+        opt_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
+    elif enablement == basic:
+        opt_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_BASIC
+
+    return opt_level
+
+
+def run_trt_standalone(trtexec, model_name, model_path, test_data_dir, all_inputs_shape, fp16, track_memory):
     logger.info("running standalone trt")
     onnx_model_path = "--onnx=" + model_path
 
@@ -115,12 +132,12 @@ def run_trt_standalone(trtexec, model_name, model_path, all_inputs_shape, fp16,
     model = onnx.load(model_path)
     ort_inputs = get_model_inputs(model)
 
-    output = get_output(["find", "-L", os.getcwd(), "-name", "test_data*", "-type", "d"])
-    test_data_dir = split_and_sort_output(output)[0]
+    output = get_output(["find", "-L", test_data_dir, "-name", "test_data*", "-type", "d"])
+    test_data_dir_0 = split_and_sort_output(output)[0]
 
     for i in range(len(ort_inputs)):
         name = ort_inputs[i]
-        loaded_input = name + ":" + test_data_dir + "/" + str(i) + ".bin"
+        loaded_input = name + ":" + test_data_dir_0 + "/" + str(i) + ".bin"
         logger.info(loaded_input)
         shape = []
         for j in all_inputs_shape[i]:
@@ -149,7 +166,8 @@ def run_trt_standalone(trtexec, model_name, model_path, all_inputs_shape, fp16,
         command.extend(["--fp16"])
 
     # save engine
-    engine_name = model_name + ".engine"
+    engine_suffix = "_trtexec_fp16.engine" if fp16 else "_trtexec.engine"
+    engine_name = model_name + engine_suffix
     save_command = command + ["--saveEngine=" + engine_name]
     logger.info(save_command)
     out = get_output(save_command)
@@ -167,9 +185,9 @@ def run_trt_standalone(trtexec, model_name, model_path, all_inputs_shape, fp16,
             out = get_output(load_command)
             success = True
             mem_usage = end_memory_tracking(p, success)
-        except Exception as e:
+        except Exception as excpt:
             end_memory_tracking(p, success)
-            raise (e)
+            raise excpt
     else:
         out = get_output(load_command)
 
@@ -603,41 +621,6 @@ def validate(all_ref_outputs, all_outputs, rtol, atol, percent_mismatch):
     return True, None
 
 
-# not use for this script
-def cleanup_files():
-    files = []
-    p = subprocess.Popen(["find", ".", "-name", "test_data_set*", "-type", "d"], stdout=subprocess.PIPE)
-    stdout, sterr = p.communicate()
-    stdout = stdout.decode("ascii").strip()
-    files = files + stdout.split("\n")
-
-    p = subprocess.Popen(["find", ".", "-name", "*.onnx"], stdout=subprocess.PIPE)
-    stdout, sterr = p.communicate()
-    stdout = stdout.decode("ascii").strip()
-    files = files + stdout.split("\n")
-
-    p = subprocess.Popen(["find", ".", "-name", "*.gz"], stdout=subprocess.PIPE)
-    stdout, sterr = p.communicate()
-    stdout = stdout.decode("ascii").strip()
-    files = files + stdout.split("\n")
-
-    for f in files:
-        if "custom_test_data" in f:
-            logger.info(f)
-            continue
-        subprocess.Popen(["rm", "-rf", f], stdout=subprocess.PIPE)
-
-
-def remove_files(running_mode, path):
-    files = []
-    out = ""
-    if running_mode == "validate":
-        out = get_output(["find", path, "-name", "onnxruntime_profile*"])
-    if running_mode == "benchmark":
-        logger.info(running_mode)
-        out = get_output(["find", path, "-name", "*.engine"])
-
-
 def update_fail_report(fail_results, model, ep, e_type, e):
     result = {}
 
@@ -817,11 +800,18 @@ def skip_ep(model_name, ep, model_to_fail_ep):
 
 
 def read_map_from_file(map_file):
+    """
+    Load a dictionary stored as a JSON file.
+
+    :param map_file: The name of the JSON file to load.
+
+    :return: A dictionary with the contents of the JSON file.
+    """
+
+    data = {}
+
     with open(map_file) as f:
-        try:
-            data = json.load(f)
-        except Exception as e:
-            return None
+        data = json.load(f)
 
     return data
 
@@ -1055,15 +1045,16 @@ def parse_models_info_from_file(root_dir, path, models):
                 model["test_data_path_fp16"] = row["test_data_path_fp16"]
 
 
-def convert_model_from_float_to_float16(model_path):
+def convert_model_from_float_to_float16(model_path, new_model_dir):
     from float16 import convert_float_to_float16
     from onnxmltools.utils import load_model, save_model
 
-    new_model_path = os.path.join(os.getcwd(), "new_fp16_model_by_trt_perf.onnx")
+    new_model_path = os.path.join(new_model_dir, "new_fp16_model_by_trt_perf.onnx")
+
     if not os.path.exists(new_model_path):
         onnx_model = load_model(model_path)
         new_onnx_model = convert_float_to_float16(onnx_model)
-        save_model(new_onnx_model, "new_fp16_model_by_trt_perf.onnx")
+        save_model(new_onnx_model, new_model_path)
 
     return new_model_path
 
@@ -1135,333 +1126,6 @@ def create_session(model_path, providers, provider_options, session_options):
             raise Exception(e)
 
 
-def run_onnxruntime(args, models):
-
-    success_results = []
-    model_to_latency = {}  # model -> cuda and tensorrt latency
-    model_to_metrics = {}  # model -> metrics from profiling file
-    model_to_fail_ep = {}  # model -> failing ep
-    model_to_session = {}  # models -> session creation time
-
-    if args.running_mode == "benchmark" and os.path.exists(SESSION_FILE):
-        model_to_session = read_map_from_file(SESSION_FILE)
-
-    ep_list = []
-    if args.ep:
-        ep_list.append(args.ep)
-    else:
-        if args.fp16:
-            ep_list = [cpu, cuda, trt, cuda_fp16, trt_fp16]
-        else:
-            ep_list = [cpu, cuda, trt]
-
-    validation_exemption = [trt_fp16]
-
-    if os.path.exists(FAIL_MODEL_FILE):
-        model_to_fail_ep = read_map_from_file(FAIL_MODEL_FILE)
-
-    #######################
-    # iterate model
-    #######################
-    for name, model_info in models.items():
-        latency_result = {}
-        path = model_info["working_directory"]
-
-        pwd = os.getcwd()
-        if not os.path.exists(path):
-            os.mkdir(path)
-        os.chdir(path)
-        path = os.getcwd()
-
-        inputs = []
-        ref_outputs = []
-        all_inputs_shape = []  # use for standalone trt
-        ep_to_operator = {}  # ep -> { operator -> count }
-        profile_already_parsed = set()
-
-        #######################
-        # iterate ep
-        #######################
-        for ep in ep_list:
-            if skip_ep(name, ep, model_to_fail_ep):
-                continue
-
-            if not is_standalone(ep):
-                ep_ = ep_to_provider_list[ep][0]
-                if ep_ not in onnxruntime.get_available_providers():
-                    logger.error("No {} support".format(ep_))
-                    continue
-
-            model_path = model_info["model_path"]
-            test_data_dir = model_info["test_data_path"]
-
-            logger.info("[Initialize]  model = {}, ep = {} ...".format(name, ep))
-
-            # Set environment variables for ort-trt benchmarking
-            trt_ep_options = copy.deepcopy(args.trt_ep_options)
-            if "ORT-TRT" in ep:
-                trt_ep_options["trt_fp16_enable"] = "True" if "Fp16" in ep else "False"
-
-            convert_input_fp16 = False
-
-            # use float16.py for cuda fp16 only
-            if cuda_fp16 == ep:
-
-                # handle model
-                if "model_path_fp16" in model_info:
-                    model_path = model_info["model_path_fp16"]
-
-                else:
-                    try:
-                        model_path = convert_model_from_float_to_float16(model_path)
-                        convert_input_fp16 = True
-                    except Exception as e:
-                        logger.error(e)
-                        update_fail_model_map(model_to_fail_ep, name, ep, "script error", e)
-                        continue
-
-                # handle test data
-                if "test_data_path_fp16" in model_info:
-                    test_data_dir = model_info["test_data_path_fp16"]
-                    convert_input_fp16 = False
-
-            inputs, ref_outputs = get_test_data(convert_input_fp16, test_data_dir, all_inputs_shape)
-            # generate random input data
-            if args.input_data == "random":
-                inputs = generate_onnx_model_random_input(args.test_times, inputs[0])
-
-            #######################################
-            # benchmark or validation
-            #######################################
-            if args.running_mode == "benchmark":
-                logger.info("\n----------------------------- benchmark -------------------------------------")
-
-                # memory tracking variables
-                p = None
-                mem_usage = None
-                result = None
-
-                # get standalone TensorRT perf
-                if is_standalone(ep) and args.trtexec:
-                    try:
-                        result = run_trt_standalone(
-                            args.trtexec,
-                            name,
-                            model_path,
-                            all_inputs_shape,
-                            ep == standalone_trt_fp16,
-                            args.track_memory,
-                        )
-                    except Exception as e:
-                        logger.error(e)
-                        update_fail_model_map(model_to_fail_ep, name, ep, "runtime error", e)
-                        continue
-
-                # inference with onnxruntime ep
-                else:
-                    # resolve providers to create session
-                    providers = ep_to_provider_list[ep]
-                    provider_options = get_provider_options(providers, trt_ep_options, args.cuda_ep_options)
-                    options = onnxruntime.SessionOptions()
-
-                    enablement = args.graph_enablement
-                    if enablement == enable_all:
-                        options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
-                    elif enablement == extended:
-                        options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
-                    elif enablement == basic:
-                        options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_BASIC
-                    else:  # disable
-                        options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
-
-                    # create onnxruntime inference session
-                    try:
-                        sess, second_creation_time = create_session(model_path, providers, provider_options, options)
-
-                    except Exception as e:
-                        logger.error(e)
-                        update_fail_model_map(model_to_fail_ep, name, ep, "runtime error", e)
-                        continue
-
-                    if second_creation_time:
-                        model_to_session[name] = copy.deepcopy({ep + second: second_creation_time})
-
-                    logger.info("start to inference {} with {} ...".format(name, ep))
-                    logger.info(sess.get_providers())
-                    logger.info(sess.get_provider_options())
-
-                    if sess:
-                        logger.info("Model inputs nodes:")
-                        for input_meta in sess.get_inputs():
-                            logger.info(input_meta)
-                        logger.info("Model outputs nodes:")
-                        for output_meta in sess.get_outputs():
-                            logger.info(output_meta)
-
-                    batch_size = 1
-                    result_template = {
-                        "engine": "onnxruntime",
-                        "version": onnxruntime.__version__,
-                        "device": ep,
-                        "fp16": convert_input_fp16,
-                        "io_binding": args.io_binding,
-                        "graph_optimizations": args.graph_enablement,
-                        "enable_cache": args.trt_ep_options.get("trt_engine_cache_enable", "False"),
-                        "model_name": name,
-                        "inputs": len(sess.get_inputs()),
-                        "batch_size": batch_size,
-                        "sequence_length": 1,
-                        "datetime": str(datetime.now()),
-                    }
-
-                    # run cpu fewer times
-                    repeat_times = 100 if ep == cpu else args.test_times
-                    track_memory = False if ep == cpu else args.track_memory
-
-                    # inference with ort
-                    try:
-                        result, mem_usage = inference_ort(
-                            args,
-                            name,
-                            sess,
-                            ep,
-                            inputs,
-                            result_template,
-                            repeat_times,
-                            batch_size,
-                            track_memory,
-                        )
-                    except Exception as e:
-                        logger.error(e)
-                        update_fail_model_map(model_to_fail_ep, name, ep, "runtime error", e)
-                        continue
-
-                if result:
-
-                    latency_result[ep] = {}
-                    latency_result[ep]["average_latency_ms"] = result["average_latency_ms"]
-                    latency_result[ep]["latency_90_percentile"] = result["latency_90_percentile"]
-                    if "memory" in result:
-                        mem_usage = result["memory"]
-                    if mem_usage:
-                        latency_result[ep]["memory"] = mem_usage
-                    if not args.trtexec:  # skip standalone
-                        success_results.append(result)
-
-                    model_to_latency[name] = copy.deepcopy(latency_result)
-
-                    if ep == trt_fp16:  # delete engine
-                        remove_files(args.running_mode, model_info["working_directory"])
-
-                logger.info("---------------------------- benchmark [end] ----------------------------------\n")
-
-            elif args.running_mode == "validate":
-                logger.info("\n----------------------------- validate -------------------------------------")
-
-                # enable profiling to generate profiling file for analysis
-                options = onnxruntime.SessionOptions()
-                options.enable_profiling = True
-                options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
-                time.sleep(1)  # avoid to generate same profile file name
-
-                providers = ep_to_provider_list[ep]
-                provider_options = get_provider_options(providers, trt_ep_options, args.cuda_ep_options)
-
-                # create onnxruntime inference session
-                try:
-                    sess, creation_time = create_session(model_path, providers, provider_options, options)
-
-                except Exception as e:
-                    logger.error(e)
-                    update_fail_model_map(model_to_fail_ep, name, ep, "runtime error", e)
-                    continue
-
-                if creation_time:
-                    model_to_session[name] = copy.deepcopy({ep: creation_time})
-
-                sess.disable_fallback()
-
-                logger.info("start to inference {} with {} ...".format(name, ep))
-                logger.info(sess.get_providers())
-                logger.info(sess.get_provider_options())
-
-                if sess:
-                    logger.info("Model inputs nodes:")
-                    for input_meta in sess.get_inputs():
-                        logger.info(input_meta)
-                    logger.info("Model outputs nodes:")
-                    for output_meta in sess.get_outputs():
-                        logger.info(output_meta)
-
-                # run inference and validate the result
-                #
-                # currently skip TensorRT float16 validation intentionally
-                if ep not in validation_exemption:
-                    try:
-                        ort_outputs = inference_ort_and_get_prediction(name, sess, inputs)
-
-                        status = validate(
-                            ref_outputs,
-                            ort_outputs,
-                            args.rtol,
-                            args.atol,
-                            args.percent_mismatch,
-                        )
-                        if not status[0]:
-                            remove_files(args.running_mode, model_info["working_directory"])
-                            update_fail_model_map(
-                                model_to_fail_ep,
-                                name,
-                                ep,
-                                "result accuracy issue",
-                                status[1],
-                            )
-                            continue
-                    except Exception as e:
-                        logger.error(e)
-                        update_fail_model_map(model_to_fail_ep, name, ep, "runtime error", e)
-                        continue
-
-                    # Run inference again. the reason is that some ep like tensorrt
-                    # it takes much longer time to generate graph on first run and
-                    # we need to skip the perf result of that expensive run.
-                    inference_ort_and_get_prediction(name, sess, inputs)
-                else:
-                    inference_ort_and_get_prediction(name, sess, inputs)
-                    inference_ort_and_get_prediction(name, sess, inputs)
-
-                sess.end_profiling()
-
-                # get metrics from profiling file
-                metrics = get_profile_metrics(path, profile_already_parsed, logger)
-                if metrics:
-                    logger.info(ep)
-                    ep_to_operator[ep] = metrics
-
-                remove_files(args.running_mode, model_info["working_directory"])
-                logger.info("---------------------------- validate [end] ----------------------------------\n")
-
-        ####################
-        # end of iterate ep
-        ####################
-
-        # get percentage of execution time and operators in TRT
-        update_metrics_map(model_to_metrics, name, ep_to_operator)
-
-        # cleanup_files()
-        os.chdir(pwd)
-
-        # end of model
-
-    return (
-        success_results,
-        model_to_latency,
-        model_to_fail_ep,
-        model_to_metrics,
-        model_to_session,
-    )
-
-
 def calculate_gain(value, ep1, ep2):
     ep1_latency = float(value[ep1]["average_latency_ms"])
     ep2_latency = float(value[ep2]["average_latency_ms"])
@@ -1960,6 +1624,447 @@ def str2bool(v):
         raise argparse.ArgumentTypeError("Boolean value expected.")
 
 
+def test_models_eps(args, models):
+    """
+    Benchmarks or validates the given models over the provided set of EPs.
+
+    :param args: The command-line arguments to this script. Contains the list of EPs to use.
+    :param models: Dictionary of models to run. The keys are model names and the values are dictionaries containing
+                   paths to the model files and input data.
+
+    :return: A tuple containing aggregated metrics/results.
+    """
+
+    success_results = []
+    model_to_latency = {}  # model -> cuda and tensorrt latency
+    model_to_metrics = {}  # model -> metrics from profiling file
+    model_to_fail_ep = {}  # model -> failing ep
+    model_to_session = {}  # models -> session creation time
+
+    if os.path.exists(SESSION_FILE):
+        model_to_session = read_map_from_file(SESSION_FILE)
+
+    if os.path.exists(FAIL_MODEL_FILE):
+        model_to_fail_ep = read_map_from_file(FAIL_MODEL_FILE)
+
+    ep_list = []
+    if args.ep:
+        ep_list.append(args.ep)
+    else:
+        if args.fp16:
+            ep_list = [cpu, cuda, trt, cuda_fp16, trt_fp16]
+        else:
+            ep_list = [cpu, cuda, trt]
+
+    init_dir = os.getcwd()
+
+    # Run benchmarking and/or validation for every model and EP combination.
+    for name, model_info in models.items():
+        ep_results = {"latency": {}, "metrics": {}, "session": {}}
+
+        for exec_provider in ep_list:
+
+            # Skip model + EP combinations that have already failed in a previous run.
+            if skip_ep(name, exec_provider, model_to_fail_ep):
+                continue
+
+            # Check if EP is supported.
+            if not is_standalone(exec_provider):
+                ep_ = ep_to_provider_list[exec_provider][0]
+                if ep_ not in onnxruntime.get_available_providers():
+                    logger.error("No %s support", ep_)
+                    continue
+
+            # Create a temporary directory for this run, which may create profiles, subgraph dumps, and TRT engines.
+            # The temporary directory is created in '/tmp/' and is automatically deleted after scope exit.
+            with tempfile.TemporaryDirectory() as temp_dir:
+                run_model_on_ep(
+                    args,
+                    name,
+                    model_info,
+                    exec_provider,
+                    success_results,
+                    model_to_fail_ep,
+                    ep_results,
+                    temp_dir,
+                )
+
+        model_to_latency[name] = ep_results["latency"]
+        model_to_session[name] = ep_results["session"]
+        update_metrics_map(model_to_metrics, name, ep_results["metrics"])
+
+    os.chdir(init_dir)
+
+    return (
+        success_results,
+        model_to_latency,
+        model_to_fail_ep,
+        model_to_metrics,
+        model_to_session,
+    )
+
+
+def run_model_on_ep(
+    args,
+    model_name,
+    model_info,
+    exec_provider,
+    success_results,
+    model_to_fail_ep,
+    ep_results,
+    tmp_work_dir,
+):
+    """
+    Benchmarks and/or validates the given model on the given EP.
+
+    :param args: The command-line arguments to this script.
+    :param model_name: The name of the model to run.
+    :param model_info: A dictionary that contains paths to the model file and input data.
+    :param exec_provider: The name of the EP (e.g., ORT-CUDAFp32) on which to run the model.
+    :param success_results: List of successful results that is updated by this function.
+    :param model_to_fail_ep: Dictionary that tracks failing model and EP combinations. Updated by this function.
+    :param ep_results: Dictionary that maps an EP to latency and operator partition results. Updated by this function.
+    :param tmp_work_dir: Temporary directory in which to run the model + EP.
+    """
+
+    all_inputs_shape = []  # used for standalone trt
+    model_work_dir = os.path.abspath(model_info["working_directory"])
+    model_path = os.path.normpath(os.path.join(model_work_dir, model_info["model_path"]))
+    test_data_dir = os.path.normpath(os.path.join(model_work_dir, model_info["test_data_path"]))
+
+    os.chdir(tmp_work_dir)
+
+    logger.info("Starting mode '%s' for %s on %s ...", args.running_mode, model_name, exec_provider)
+
+    # Set environment variables for ort-trt benchmarking
+    trt_ep_options = copy.deepcopy(args.trt_ep_options)
+    if "ORT-TRT" in exec_provider:
+        trt_ep_options["trt_fp16_enable"] = "True" if "Fp16" in exec_provider else "False"
+
+        # Create/set a directory to store TRT engine caches.
+        engine_cache_path = os.path.normpath(os.path.join(tmp_work_dir, TRT_ENGINE_CACHE_DIR_NAME))
+        if not os.path.exists(engine_cache_path):
+            os.makedirs(engine_cache_path)
+
+        trt_ep_options["trt_engine_cache_path"] = engine_cache_path
+
+    convert_input_fp16 = False
+
+    # use float16.py for cuda fp16 only
+    if cuda_fp16 == exec_provider:
+
+        # handle model
+        if "model_path_fp16" in model_info:
+            model_path = os.path.normpath(os.path.join(model_work_dir, model_info["model_path_fp16"]))
+
+        else:
+            try:
+                model_path = convert_model_from_float_to_float16(model_path, tmp_work_dir)
+                convert_input_fp16 = True
+            except Exception as excpt:
+                logger.error(excpt)
+                update_fail_model_map(model_to_fail_ep, model_name, exec_provider, "script error", excpt)
+                return
+
+        # handle test data
+        if "test_data_path_fp16" in model_info:
+            test_data_dir = os.path.normpath(os.path.join(model_work_dir, model_info["test_data_path_fp16"]))
+            convert_input_fp16 = False
+
+    inputs, ref_outputs = get_test_data(convert_input_fp16, test_data_dir, all_inputs_shape)
+    # generate random input data
+    if args.input_data == "random":
+        inputs = generate_onnx_model_random_input(args.test_times, inputs[0])
+
+    do_validate = is_validate_mode(args.running_mode)
+    do_benchmark = is_benchmark_mode(args.running_mode)
+
+    validation_passed = False
+
+    #######################################
+    # Validation
+    #######################################
+    if do_validate:
+        validation_passed = validate_model_on_ep(
+            args,
+            model_name,
+            exec_provider,
+            trt_ep_options,
+            model_path,
+            inputs,
+            ref_outputs,
+            model_to_fail_ep,
+            ep_results,
+            tmp_work_dir,
+        )
+
+    #######################################
+    # Benchmark
+    #######################################
+    if do_benchmark and (validation_passed or not do_validate):
+        benchmark_model_on_ep(
+            args,
+            model_name,
+            exec_provider,
+            trt_ep_options,
+            model_path,
+            inputs,
+            all_inputs_shape,
+            model_to_fail_ep,
+            ep_results,
+            success_results,
+            test_data_dir,
+            convert_input_fp16,
+        )
+
+
+def benchmark_model_on_ep(
+    args,
+    model_name,
+    exec_provider,
+    trt_ep_options,
+    model_path,
+    inputs,
+    all_inputs_shape,
+    model_to_fail_ep,
+    ep_results,
+    success_results,
+    test_data_dir,
+    convert_input_fp16,
+):
+    """
+    Benchmarks the given model on the given EP.
+
+    :param args: The command-line arguments to this script.
+    :param model_name: The name of the model to run.
+    :param exec_provider: The name of the EP (e.g., ORT-CUDAFp32) on which to run the model.
+    :param trt_ep_options: Additional TensorRT EP session options to apply.
+    :param model_path: The path to the model file.
+    :param inputs: Inputs to the model.
+    :param all_inputs_shape: Input shapes. Needed by trtexec.
+    :param model_to_fail_ep: Dictionary that tracks failing model and EP combinations. Updated by this function.
+    :param ep_results: Dictionary that maps an EP to latency and operator partition results. Updated by this function.
+    :param success_results: List of successful results that is updated by this function.
+    :param test_data_dir: Directory containing input .pb files. Needed by trtexec.
+    :param convert_input_fp16: True if the inputs were converted to FP16.
+    """
+
+    # memory tracking variables
+    mem_usage = None
+    result = None
+
+    # get standalone TensorRT perf
+    if is_standalone(exec_provider) and args.trtexec:
+        try:
+            result = run_trt_standalone(
+                args.trtexec,
+                model_name,
+                model_path,
+                test_data_dir,
+                all_inputs_shape,
+                exec_provider == standalone_trt_fp16,
+                args.track_memory,
+            )
+        except Exception as excpt:
+            logger.error(excpt)
+            update_fail_model_map(model_to_fail_ep, model_name, exec_provider, "runtime error", excpt)
+            return
+
+    # inference with onnxruntime ep
+    else:
+        # resolve providers to create session
+        providers = ep_to_provider_list[exec_provider]
+        provider_options = get_provider_options(providers, trt_ep_options, args.cuda_ep_options)
+
+        options = onnxruntime.SessionOptions()
+        options.graph_optimization_level = get_graph_opt_level(args.graph_enablement)
+
+        # create onnxruntime inference session
+        try:
+            sess, second_creation_time = create_session(model_path, providers, provider_options, options)
+
+        except Exception as excpt:
+            logger.error(excpt)
+            update_fail_model_map(model_to_fail_ep, model_name, exec_provider, "runtime error", excpt)
+            return
+
+        if second_creation_time:
+            ep_results["session"][exec_provider + second] = second_creation_time
+
+        logger.info("Start to inference %s with %s ...", model_name, exec_provider)
+        logger.info(sess.get_providers())
+        logger.info(sess.get_provider_options())
+
+        if sess:
+            logger.info("Model inputs nodes:")
+            for input_meta in sess.get_inputs():
+                logger.info(input_meta)
+            logger.info("Model outputs nodes:")
+            for output_meta in sess.get_outputs():
+                logger.info(output_meta)
+
+        batch_size = 1
+        result_template = {
+            "engine": "onnxruntime",
+            "version": onnxruntime.__version__,
+            "device": exec_provider,
+            "fp16": convert_input_fp16,
+            "io_binding": args.io_binding,
+            "graph_optimizations": args.graph_enablement,
+            "enable_cache": args.trt_ep_options.get("trt_engine_cache_enable", "False"),
+            "model_name": model_name,
+            "inputs": len(sess.get_inputs()),
+            "batch_size": batch_size,
+            "sequence_length": 1,
+            "datetime": str(datetime.now()),
+        }
+
+        # run cpu fewer times
+        repeat_times = 100 if exec_provider == cpu else args.test_times
+        track_memory = False if exec_provider == cpu else args.track_memory
+
+        # inference with ort
+        try:
+            result, mem_usage = inference_ort(
+                args,
+                model_name,
+                sess,
+                exec_provider,
+                inputs,
+                result_template,
+                repeat_times,
+                batch_size,
+                track_memory,
+            )
+        except Exception as excpt:
+            logger.error(excpt)
+            update_fail_model_map(model_to_fail_ep, model_name, exec_provider, "runtime error", excpt)
+            return
+
+    if result:
+
+        ep_results["latency"][exec_provider] = {}
+        ep_results["latency"][exec_provider]["average_latency_ms"] = result["average_latency_ms"]
+        ep_results["latency"][exec_provider]["latency_90_percentile"] = result["latency_90_percentile"]
+        if "memory" in result:
+            mem_usage = result["memory"]
+        if mem_usage:
+            ep_results["latency"][exec_provider]["memory"] = mem_usage
+        if not args.trtexec:  # skip standalone
+            success_results.append(result)
+
+
+def validate_model_on_ep(
+    args,
+    model_name,
+    exec_provider,
+    trt_ep_options,
+    model_path,
+    inputs,
+    ref_outputs,
+    model_to_fail_ep,
+    ep_results,
+    tmp_work_dir,
+):
+    """
+    Validates the given model on the given EP.
+
+    :param args: The command-line arguments to this script.
+    :param model_name: The name of the model to run.
+    :param exec_provider: The name of the EP (e.g., ORT-CUDAFp32) on which to run the model.
+    :param trt_ep_options: Additional TensorRT EP session options to apply.
+    :param model_path: The path to the model file.
+    :param inputs: Inputs to the model.
+    :param ref_outputs: Reference outputs used to validate inference results.
+    :param model_to_fail_ep: Dictionary that tracks failing model and EP combinations. Updated by this function.
+    :param ep_results: Dictionary that maps an EP to latency and operator partition results. Updated by this function.
+    :param tmp_work_dir: Temporary directory where inference profile files were dumped.
+    """
+
+    if is_standalone(exec_provider):
+        return True
+
+    # enable profiling to generate profiling file for analysis
+    options = onnxruntime.SessionOptions()
+    options.enable_profiling = True
+    options.profile_file_prefix = f"ort_profile_{model_name}_{exec_provider}"
+    options.graph_optimization_level = get_graph_opt_level(args.graph_enablement)
+
+    providers = ep_to_provider_list[exec_provider]
+    provider_options = get_provider_options(providers, trt_ep_options, args.cuda_ep_options)
+
+    # create onnxruntime inference session
+    try:
+        sess, creation_time = create_session(model_path, providers, provider_options, options)
+
+    except Exception as excpt:
+        logger.error(excpt)
+        update_fail_model_map(model_to_fail_ep, model_name, exec_provider, "runtime error", excpt)
+        return False
+
+    if creation_time:
+        ep_results["session"][exec_provider] = creation_time
+
+    sess.disable_fallback()
+
+    logger.info("Start to inference %s with %s ...", model_name, exec_provider)
+    logger.info(sess.get_providers())
+    logger.info(sess.get_provider_options())
+
+    if sess:
+        logger.info("Model inputs nodes:")
+        for input_meta in sess.get_inputs():
+            logger.info(input_meta)
+        logger.info("Model outputs nodes:")
+        for output_meta in sess.get_outputs():
+            logger.info(output_meta)
+
+    # run inference and validate the result
+    #
+    # currently skip TensorRT float16 validation intentionally
+    if exec_provider != trt_fp16:
+        try:
+            ort_outputs = inference_ort_and_get_prediction(model_name, sess, inputs)
+
+            status = validate(
+                ref_outputs,
+                ort_outputs,
+                args.rtol,
+                args.atol,
+                args.percent_mismatch,
+            )
+            if not status[0]:
+                update_fail_model_map(
+                    model_to_fail_ep,
+                    model_name,
+                    exec_provider,
+                    "result accuracy issue",
+                    status[1],
+                )
+                return False
+        except Exception as excpt:
+            logger.error(excpt)
+            update_fail_model_map(model_to_fail_ep, model_name, exec_provider, "runtime error", excpt)
+            return False
+
+        # Run inference again. the reason is that some ep like tensorrt
+        # it takes much longer time to generate graph on first run and
+        # we need to skip the perf result of that expensive run.
+        inference_ort_and_get_prediction(model_name, sess, inputs)
+    else:
+        inference_ort_and_get_prediction(model_name, sess, inputs)
+        inference_ort_and_get_prediction(model_name, sess, inputs)
+
+    sess.end_profiling()
+
+    # get metrics from profiling file
+    metrics = get_profile_metrics(tmp_work_dir, options.profile_file_prefix, logger)
+    if metrics:
+        ep_results["metrics"][exec_provider] = metrics
+
+    return True
+
+
 class ParseDictArgAction(argparse.Action):
     def __call__(self, parser, namespace, values, option_string):
         dict_arg = {}
@@ -2008,7 +2113,7 @@ def parse_arguments():
         "--running_mode",
         required=False,
         default="benchmark",
-        choices=["validate", "benchmark"],
+        choices=["validate", "benchmark", "both"],
         help="Testing mode.",
     )
 
@@ -2178,7 +2283,7 @@ def main():
         model_to_fail_ep,
         model_to_metrics,
         model_to_session,
-    ) = run_onnxruntime(args, models)
+    ) = test_models_eps(args, models)
     perf_end_time = datetime.now()
 
     logger.info("Done running the perf.")
diff --git a/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py b/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py
index 38b0efca39..918add64ce 100644
--- a/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py
+++ b/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py
@@ -56,8 +56,7 @@ def main():
     else:
         ep_list = get_ep_list(args.comparison)
 
-    if standalone_trt in ep_list or standalone_trt_fp16 in ep_list:
-        trtexec = resolve_trtexec_path(args.workspace)
+    trtexec = resolve_trtexec_path(args.workspace)
 
     models = {}
     parse_models_helper(args, models)
@@ -72,6 +71,9 @@ def main():
     benchmark_session_csv = session_name + csv_ending
     specs_csv = specs_name + csv_ending
 
+    validate = is_validate_mode(args.running_mode)
+    benchmark = is_benchmark_mode(args.running_mode)
+
     for model, model_info in models.items():
         logger.info("\n" + "=" * 40 + "=" * len(model))
         logger.info("=" * 20 + model + "=" * 20)
@@ -83,7 +85,6 @@ def main():
         write_model_info_to_file([model_info], model_list_file)
 
         for ep in ep_list:
-
             command = [
                 "python3",
                 "benchmark.py",
@@ -103,10 +104,7 @@ def main():
                 command.append("-z")
 
             if ep == standalone_trt or ep == standalone_trt_fp16:
-                if args.running_mode == "validate":
-                    continue
-                else:
-                    command.extend(["--trtexec", trtexec])
+                command.extend(["--trtexec", trtexec])
 
             if len(args.cuda_ep_options):
                 command.extend(["--cuda_ep_options", dict_to_args(args.cuda_ep_options)])
@@ -114,10 +112,10 @@ def main():
             if len(args.trt_ep_options):
                 command.extend(["--trt_ep_options", dict_to_args(args.trt_ep_options)])
 
-            if args.running_mode == "validate":
+            if validate:
                 command.extend(["--benchmark_metrics_csv", benchmark_metrics_csv])
 
-            elif args.running_mode == "benchmark":
+            if benchmark:
                 command.extend(
                     [
                         "-t",
@@ -135,12 +133,17 @@ def main():
                     ]
                 )
 
-            p = subprocess.run(command)
-            logger.info(p)
+            p = subprocess.run(command, stderr=subprocess.PIPE)
+            logger.info("Completed subprocess %s ", " ".join(p.args))
+            logger.info("Return code: %d", p.returncode)
 
             if p.returncode != 0:
                 error_type = "runtime error"
                 error_message = "Benchmark script exited with returncode = " + str(p.returncode)
+
+                if p.stderr:
+                    error_message += "\nSTDERR:\n" + p.stderr.decode("utf-8")
+
                 logger.error(error_message)
                 update_fail_model_map(model_to_fail_ep, model, ep, error_type, error_message)
                 write_map_to_file(model_to_fail_ep, FAIL_MODEL_FILE)
@@ -154,7 +157,7 @@ def main():
 
         Path(path).mkdir(parents=True, exist_ok=True)
 
-    if args.running_mode == "validate":
+    if validate:
         logger.info("\n=========================================")
         logger.info("=========== Models/EPs metrics ==========")
         logger.info("=========================================")
@@ -164,7 +167,7 @@ def main():
             output_metrics(model_to_metrics, os.path.join(path, benchmark_metrics_csv))
             logger.info("\nSaved model metrics results to {}".format(benchmark_metrics_csv))
 
-    elif args.running_mode == "benchmark":
+    if benchmark:
         logger.info("\n=========================================")
         logger.info("======= Models/EPs session creation =======")
         logger.info("=========================================")
diff --git a/onnxruntime/python/tools/tensorrt/perf/perf.sh b/onnxruntime/python/tools/tensorrt/perf/perf.sh
index 5f67156043..905ccc476d 100755
--- a/onnxruntime/python/tools/tensorrt/perf/perf.sh
+++ b/onnxruntime/python/tools/tensorrt/perf/perf.sh
@@ -65,5 +65,4 @@ setup() {
 }
 
 setup
-python benchmark_wrapper.py -r validate -m $MODEL_PATH -o result/$OPTION $OPTIONAL_ARGS 
-python benchmark_wrapper.py -r benchmark -t 1200 -m $MODEL_PATH -o result/$OPTION $OPTIONAL_ARGS
+python benchmark_wrapper.py -r both -t 1200 -m $MODEL_PATH -o result/$OPTION $OPTIONAL_ARGS
diff --git a/onnxruntime/python/tools/tensorrt/perf/perf_utils.py b/onnxruntime/python/tools/tensorrt/perf/perf_utils.py
index 3beefaf9c2..61cac72b27 100644
--- a/onnxruntime/python/tools/tensorrt/perf/perf_utils.py
+++ b/onnxruntime/python/tools/tensorrt/perf/perf_utils.py
@@ -71,6 +71,30 @@ extended = "extended"
 enable_all = "all"
 
 
+def is_benchmark_mode(running_mode):
+    """
+    Returns True if the script's running mode requires running benchmarks.
+
+    :param running_mode: A string denoting the script's running mode (i.e., 'benchmark', 'validate', or 'both')
+
+    :return: True if benchmarking is required.
+    """
+
+    return running_mode == "benchmark" or running_mode == "both"
+
+
+def is_validate_mode(running_mode):
+    """
+    Returns True if the script's running mode requires running inference validation.
+
+    :param running_mode: A string denoting the script's running mode (i.e., 'benchmark', 'validate', or 'both')
+
+    :return: True if validation is required.
+    """
+
+    return running_mode == "validate" or running_mode == "both"
+
+
 def is_standalone(ep):
     return ep == standalone_trt or ep == standalone_trt_fp16
 
@@ -277,14 +301,25 @@ def calculate_trt_latency_percentage(trt_op_map):
     return (total_trt_execution_time, total_execution_time, ratio_of_trt_execution_time)
 
 
-def get_profile_metrics(path, profile_already_parsed, logger=None):
-    logger.info("Parsing/Analyzing profiling files in {} ...".format(path))
-    p1 = subprocess.Popen(
-        ["find", path, "-name", "onnxruntime_profile*", "-printf", "%T+\t%p\n"],
+def get_profile_metrics(path, profile_file_prefix, logger):
+    """
+    Parses a session profile file to obtain information on operator usage per EP.
+
+    :param path: The path containing the session profile file.
+    :param profile_file_prefix: Custom prefix for session profile names. Refer to ORT SessionOptions.
+    :param logger: The logger object to use for debug/info logging.
+
+    :return: A tuple containing the parsed operator usage information for CPU nodes and GPU kernels.
+    """
+
+    logger.debug("Parsing/Analyzing profiling files in %s ...", path)
+
+    find_proc = subprocess.Popen(
+        ["find", path, "-name", f"{profile_file_prefix}*", "-printf", "%T+\t%p\n"],
         stdout=subprocess.PIPE,
     )
-    p2 = subprocess.Popen(["sort"], stdin=p1.stdout, stdout=subprocess.PIPE)
-    stdout, sterr = p2.communicate()
+    sort_proc = subprocess.Popen(["sort"], stdin=find_proc.stdout, stdout=subprocess.PIPE)
+    stdout, sterr = sort_proc.communicate()
     stdout = stdout.decode("ascii").strip()
     profiling_files = stdout.split("\n")
     logger.info(profiling_files)
@@ -292,18 +327,15 @@ def get_profile_metrics(path, profile_already_parsed, logger=None):
     data = []
     for profile in profiling_files:
         profile = profile.split("\t")[1]
-        if profile in profile_already_parsed:
-            continue
-        profile_already_parsed.add(profile)
 
-        logger.info("start to parse {} ...".format(profile))
-        with open(profile) as f:
-            op_map = parse_single_file(f)
+        logger.debug("Parsing profile %s ...", profile)
+        with open(profile, encoding="utf-8") as fd:
+            op_map = parse_single_file(fd)
             if op_map:
                 data.append(op_map)
 
     if len(data) == 0:
-        logger.info("No profile metrics got.")
+        logger.debug("No profile metrics found.")
         return None
 
     return data[-1]