diff --git a/onnxruntime/python/tools/tensorrt/perf/benchmark.py b/onnxruntime/python/tools/tensorrt/perf/benchmark.py
index 57962f6848..5a97604fd0 100644
--- a/onnxruntime/python/tools/tensorrt/perf/benchmark.py
+++ b/onnxruntime/python/tools/tensorrt/perf/benchmark.py
@@ -6,20 +6,55 @@ import logging
 import os
 import pprint
 import re
+import subprocess
 import sys
 import time
 import timeit
 from datetime import datetime
 
 import coloredlogs
-import numpy
 import numpy as np
-import pandas as pd
-from float16 import *
-from perf_utils import *
+from perf_utils import (
+    acl,
+    acl_ep,
+    avg_ending,
+    basic,
+    calculate_cuda_op_percentage,
+    calculate_trt_latency_percentage,
+    calculate_trt_op_percentage,
+    cpu,
+    cpu_ep,
+    cuda,
+    cuda_ep,
+    cuda_fp16,
+    disable,
+    enable_all,
+    extended,
+    get_output,
+    get_profile_metrics,
+    get_total_ops,
+    is_standalone,
+    memory_ending,
+    model_title,
+    ort_provider_list,
+    percentile_ending,
+    pretty_print,
+    provider_list,
+    second,
+    second_session_ending,
+    session_ending,
+    standalone_trt,
+    standalone_trt_fp16,
+    table_headers,
+    trt,
+    trt_ep,
+    trt_fp16,
+)
 
 import onnxruntime  # isort:skip
+import onnx  # isort:skip
 from onnx import numpy_helper  # isort:skip
+import pandas as pd  # isort:skip
 
 debug = False
 sys.path.append(".")
@@ -164,15 +199,15 @@ def run_trt_standalone(trtexec, model_name, model_path, all_inputs_shape, fp16,
 
 def get_latency_result(runtimes, batch_size):
     latency_ms = sum(runtimes) / float(len(runtimes)) * 1000.0
-    latency_variance = numpy.var(runtimes, dtype=numpy.float64) * 1000.0
+    latency_variance = np.var(runtimes, dtype=np.float64) * 1000.0
     throughput = batch_size * (1000.0 / latency_ms)
 
     result = {
         "test_times": len(runtimes),
         "latency_variance": "{:.2f}".format(latency_variance),
-        "latency_90_percentile": "{:.2f}".format(numpy.percentile(runtimes, 90) * 1000.0),
-        "latency_95_percentile": "{:.2f}".format(numpy.percentile(runtimes, 95) * 1000.0),
-        "latency_99_percentile": "{:.2f}".format(numpy.percentile(runtimes, 99) * 1000.0),
+        "latency_90_percentile": "{:.2f}".format(np.percentile(runtimes, 90) * 1000.0),
+        "latency_95_percentile": "{:.2f}".format(np.percentile(runtimes, 95) * 1000.0),
+        "latency_99_percentile": "{:.2f}".format(np.percentile(runtimes, 99) * 1000.0),
         "average_latency_ms": "{:.2f}".format(latency_ms),
         "QPS": "{:.2f}".format(throughput),
     }
@@ -1432,18 +1467,18 @@ def calculate_gain(value, ep1, ep2):
 
 def add_improvement_information(model_to_latency):
     for key, value in model_to_latency.items():
-        if "ORT-TRT" in value and "ORT-CUDA" in value:
+        if trt in value and cuda in value:
             gain = calculate_gain(value, trt, cuda)
             value[trt_cuda_gain] = "{:.2f} %".format(gain)
-            if trt_fp16 in value and cuda_fp16 in value:
-                gain = calculate_gain(value, trt_fp16, cuda_fp16)
-                value[trt_cuda_fp16_gain] = "{:.2f} %".format(gain)
-        if "ORT-TRT" in value and is_standalone(value):
+        if trt_fp16 in value and cuda_fp16 in value:
+            gain = calculate_gain(value, trt_fp16, cuda_fp16)
+            value[trt_cuda_fp16_gain] = "{:.2f} %".format(gain)
+        if trt in value and standalone_trt in value:
             gain = calculate_gain(value, trt, standalone_trt)
             value[trt_native_gain] = "{:.2f} %".format(gain)
-            if trt_fp16 in value and standalone_trt_fp16 in value:
-                gain = calculate_gain(value, trt_fp16, standalone_trt_fp16)
-                value[trt_native_fp16_gain] = "{:.2f} %".format(gain)
+        if trt_fp16 in value and standalone_trt_fp16 in value:
+            gain = calculate_gain(value, trt_fp16, standalone_trt_fp16)
+            value[trt_native_fp16_gain] = "{:.2f} %".format(gain)
 
 
 def output_details(results, csv_filename):