From fb40602ea2b85ca276417dc88b935f6a05124371 Mon Sep 17 00:00:00 2001 From: Olivia Jain Date: Mon, 5 Apr 2021 22:16:12 -0700 Subject: [PATCH] Mem trt (#6868) * adding trt comparison and memory consumption * creating separate docker file --- dockerfiles/Dockerfile.tensorrt | 2 +- .../python/tools/tensorrt/perf/README.md | 51 +- .../python/tools/tensorrt/perf/benchmark.py | 277 +++++++---- .../tools/tensorrt/perf/benchmark_wrapper.py | 61 ++- .../perf/build/Dockerfile.tensorrt-perf | 33 +- .../tools/tensorrt/perf/build/build_image.sh | 12 + .../tools/tensorrt/perf/build/build_images.sh | 14 - .../perf/build/install_common_deps.sh | 21 + .../tools/tensorrt/perf/model_list.json | 456 ++++++++++-------- .../python/tools/tensorrt/perf/perf.sh | 2 - .../tools/tensorrt/perf/run_perf_docker.sh | 4 +- .../tools/tensorrt/perf/run_perf_machine.sh | 6 +- .../linux-gpu-tensorrt-ci-perf-pipeline.yml | 12 +- 13 files changed, 599 insertions(+), 352 deletions(-) create mode 100755 onnxruntime/python/tools/tensorrt/perf/build/build_image.sh delete mode 100755 onnxruntime/python/tools/tensorrt/perf/build/build_images.sh create mode 100644 onnxruntime/python/tools/tensorrt/perf/build/install_common_deps.sh diff --git a/dockerfiles/Dockerfile.tensorrt b/dockerfiles/Dockerfile.tensorrt index 9df40560c9..88ddd6369a 100644 --- a/dockerfiles/Dockerfile.tensorrt +++ b/dockerfiles/Dockerfile.tensorrt @@ -5,7 +5,7 @@ # Dockerfile to run ONNXRuntime with TensorRT integration # nVidia TensorRT Base Image -FROM nvcr.io/nvidia/tensorrt:20.12-py3 +FROM nvcr.io/nvidia/tensorrt:20.09-py3 MAINTAINER Vinitra Swamy "viswamy@microsoft.com" ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime diff --git a/onnxruntime/python/tools/tensorrt/perf/README.md b/onnxruntime/python/tools/tensorrt/perf/README.md index 02f53e9ceb..5995faadad 100644 --- a/onnxruntime/python/tools/tensorrt/perf/README.md +++ b/onnxruntime/python/tools/tensorrt/perf/README.md @@ -39,6 +39,12 @@ However, benchmark.py creates only one process to run all the model inferences o - **-o, --perf_result_path**: (*default: result*) Directory for perf result.. - **--fp16**: (*default: True*) Enable TensorRT/CUDA FP16 and include the performance of this floating point optimization. - **--trtexec**: Path of standalone TensorRT executable, for example: trtexec. +- **--track_memory**: Track memory usage of CUDA and TensorRT execution providers + +### Validation Configuration +- **--percent_mismatch**: The allowed percentage of values to be incorrect when comparing given outputs to ORT outputs. +- **--rtol**: The relative tolerance for validating ORT outputs. +- **--atol**: The absolute tolerance for validating ORT outputs. ### Results After running validation and benchmark. The metrics are written into five different csv files in 'result' directory or the directory you specified with -o argument. @@ -46,6 +52,7 @@ After running validation and benchmark. The metrics are written into five differ - **benchmark_success_xxxx.csv**: Lists all the models that can be successfully inferenced by TensorRT/CUDA, as well as other related metrics. - **benchmark_latency_xxxx.csv**: Lists all the models with inference latecy of TensorRT/CUDA and TensorRT Float32/Float16 performance gain compared with CUDA. - **benchmark_metrics_xxxx.csv**: List how much and percentage of model operators that are run by TensorRT and what percentage of execution time is running on TensorRT. 
+- **benchmark_status_xxxx.csv**: List of all the models and the status as pass or fail for each execution provider. - **benchmark_system_info_xxxx.csv**: includes CUDA version, TensorRT version and CPU information. Thoese metrics will be shown on the standard output as well. @@ -176,11 +183,53 @@ The output of running benchmark: 'Tensorrt_gain(%)': '54.94 %'}} ``` + +``` +========================================= +=========== CUDA/TRT Status ============= +========================================= +{ 'BERT-Squad': { 'CUDAExecutionProvider': 'Pass', + 'CUDAExecutionProvider_fp16': 'Pass', + 'TensorrtExecutionProvider': 'Pass', + 'TensorrtExecutionProvider_fp16': 'Fail'} +} +``` + +#### Comparing Runs +``` +python comparison_script.py -p "prev" -c "current" -o "output.csv" +``` +- **compare_latency.py**: creates a csv file with any regressions in average latencies +- **new_failures.py**: creates a csv file with any new failures + ## Others -ort_build_latest.py: This script should be run before running the benchmark.py to make sure the latest ORT wheel file is being used. + +### Setting Up Perf Models +- setup_onnx_zoo.py: Create a text file 'links.txt' with download links from onnx zoo models, or setup in the same folder structure. Extracts the models and creates the json file perf script will be run with. +- setup_many_models.sh: ./setup_many_models "wget_link_to_models" to extract all the models. + +### Building ORT Env +build_images.sh: This script should be run before running run_perf_docker.sh to make sure the docker images are up to date. +- **-o, --ort_dockerfile_path**: Path to ORT Docker File. +- **-p, --perf_dockerfile_path**: Path to EP Perf Docker File. +- **-b, --branch**: ORT branch name you are perf testing on. +- **-i, --image**: What the perf docker image will be named. + +ort_build_latest.py: This script should be run before running run_perf_machine.sh or benchmark.py to make sure the latest ORT wheel file is being used. - **-o, --ort_master_path**: ORT master repo. - **-t, --tensorrt_home**: TensorRT home directory. - **-c, --cuda_home**: CUDA home directory. + +### Running Perf Script +run_perf_docker.sh: Runs the perf script in docker environment. +- **-d, --docker_image**: Name of perf docker image. +- **-o, --option**: Name of which models you want to run {onnx-zoo-models, many-models, partner-models, selected-models} +- **-m, --model_path**: Path to models either json or folder. + +run_perf_machine.sh: Runs the perf script in docker environment. +- **-o, --option**: Name of which models you want to run {onnx-zoo-models, many-models, partner-models, selected-models} +- **-m, --model_path**: Path to models either json or folder. + ## Dependencies - When inferencing model using CUDA float16, this script following script to convert nodes in model graph from float32 to float16. It also modifies the converting script a little bit to better cover more model graph conversion. 
https://github.com/microsoft/onnxconverter-common/blob/master/onnxconverter_common/float16.py diff --git a/onnxruntime/python/tools/tensorrt/perf/benchmark.py b/onnxruntime/python/tools/tensorrt/perf/benchmark.py index 22a8b847f0..42d8d77e0b 100644 --- a/onnxruntime/python/tools/tensorrt/perf/benchmark.py +++ b/onnxruntime/python/tools/tensorrt/perf/benchmark.py @@ -16,6 +16,7 @@ from onnx import numpy_helper from perf_utils import * import pprint import time +import pandas as pd from float16 import * debug = False @@ -38,15 +39,23 @@ ep_to_provider_list = { cuda: [cuda], cuda_fp16: [cuda], trt: [trt, cuda], - trt_fp16: [trt, cuda], + trt_fp16: [trt, cuda] } +# latency gain headers +trt_cuda_gain = 'TRT_CUDA_gain(%)' +trt_cuda_fp16_gain = 'TRT_CUDA_fp16_gain(%)' +trt_native_gain = 'EP_Native_TRT_gain(%)' +trt_native_fp16_gain = 'EP_Native_TRT_fp16_gain(%)' + # metadata FAIL_MODEL_FILE = ".fail_model_map" LATENCY_FILE = ".latency_map" METRICS_FILE = ".metrics_map" +MEMORY_FILE = './temp_memory.csv' def run_trt_standalone(trtexec, model_path, ort_inputs, all_inputs_shape, fp16): + logger.info("running native trt") model_path = "--onnx=" + model_path input_shape = [] @@ -66,50 +75,45 @@ def run_trt_standalone(trtexec, model_path, ort_inputs, all_inputs_shape, fp16): logger.info(shapes_arg) result = {} - try: - if fp16: - p1 = subprocess.Popen([trtexec, model_path, "--fp16", "--percentile=90", "--explicitBatch", shapes_arg], stdout=subprocess.PIPE) - else: - p1 = subprocess.Popen([trtexec, model_path, "--percentile=90", "--explicitBatch", shapes_arg], stdout=subprocess.PIPE) - stdout, sterr = p1.communicate() - logger.info(stdout) - stdout = stdout.decode("ascii").strip() + if fp16: + out = get_output([trtexec, model_path, "--fp16", "--percentile=90", "--explicitBatch", shapes_arg]) + else: + out = get_output([trtexec, model_path, "--percentile=90", "--explicitBatch", shapes_arg]) - tmp = stdout.split("\n") - target_list = [] - for t in tmp: - if 'mean:' in t: - target_list.append(t) + tmp = out.split("\n") + target_list = [] + for t in tmp: + if 'mean:' in t: + target_list.append(t) - if 'percentile:' in t: - target_list.append(t) + if 'percentile:' in t: + target_list.append(t) - target = target_list[2] - start = target.find('mean:') + 6 - end = target.find('ms') - result["average_latency_ms"] = target[start:end] + target = target_list[2] + start = target.find('mean:') + 6 + end = target.find('ms') + result["average_latency_ms"] = target[start:end] - target = target_list[3] - start = target.find('percentile:') + 12 - end = target.find('ms') - result["latency_90_percentile"] = target[start:end] + target = target_list[3] + start = target.find('percentile:') + 12 + end = target.find('ms') + result["latency_90_percentile"] = target[start:end] - logger.info(result) - return result + logger.info(result) + return result - except Exception as e: - logger.info("trtexec fails...") - return None +def get_trtexec_path(): + trtexec_options = get_output(["find", "/", "-name", "trtexec"]) + trtexec_path = re.search(r'.*/workspace/.*/bin/trtexec', trtexec_options).group() + return trtexec_path - - -def get_latency_result(runtimes, batch_size): +def get_latency_result(runtimes, batch_size, mem_mb=None): latency_ms = sum(runtimes) / float(len(runtimes)) * 1000.0 latency_variance = numpy.var(runtimes, dtype=numpy.float64) * 1000.0 throughput = batch_size * (1000.0 / latency_ms) - return { + result = { "test_times": len(runtimes), "latency_variance": "{:.2f}".format(latency_variance), "latency_90_percentile": 
"{:.2f}".format(numpy.percentile(runtimes, 90) * 1000.0), @@ -118,6 +122,9 @@ def get_latency_result(runtimes, batch_size): "average_latency_ms": "{:.2f}".format(latency_ms), "QPS": "{:.2f}".format(throughput), } + if mem_mb: + result.update({"memory":mem_mb}) + return result def get_ort_session_inputs_and_outputs(name, session, ort_input): @@ -160,14 +167,44 @@ def get_ort_session_inputs_and_outputs(name, session, ort_input): return (sess_inputs, sess_outputs) -def inference_ort(args, name, session, ep, ort_inputs, result_template, repeat_times, batch_size): +def track_ep_memory(ep): + return trt in ep or cuda in ep or standalone_trt in ep +def get_trtexec_pid(df, python_pid): + for pid in df['pid'].tolist(): + if pid != python_pid: + return pid + +def get_max_memory(trtexec): + df = pd.read_csv(MEMORY_FILE) + pid = df['pid'].iloc[0] + + if trtexec: + pid = get_trtexec_pid(df, pid) + + mem_series = df.loc[df['pid'] == pid, ' used_gpu_memory [MiB]'] + max_mem = max(mem_series.str.replace(' MiB','').astype(int)) + return max_mem + +def start_memory_tracking(): + p = subprocess.Popen(["nvidia-smi", "--query-compute-apps=pid,used_memory", "--format=csv", "-l", "1", "-f", MEMORY_FILE]) + return p + +def end_memory_tracking(p, trtexec): + p.terminate() + p.wait() + mem_usage = get_max_memory(trtexec) + os.remove(MEMORY_FILE) + return mem_usage + +def inference_ort(args, name, session, ep, ort_inputs, result_template, repeat_times, batch_size): runtimes = [] if args.input_data == "random": repeat_times = 1 # warn-up run is included in ort_inputs else: repeat_times += 1 # add warn-up run - + + mem_usage = None for ort_input in ort_inputs: sess_inputs, sess_outputs = get_ort_session_inputs_and_outputs(name, session, ort_input) if debug: @@ -177,7 +214,14 @@ def inference_ort(args, name, session, ep, ort_inputs, result_template, repeat_t logger.info(sess_outputs) try: - runtime = timeit.repeat(lambda: session.run(sess_outputs, sess_inputs), number=1, repeat=repeat_times) + if args.track_memory and track_ep_memory(ep): + + p = start_memory_tracking() + runtime = timeit.repeat(lambda: session.run(sess_outputs, sess_inputs), number=1, repeat=repeat_times) + mem_usage = end_memory_tracking(p, False) + else: + runtime = timeit.repeat(lambda: session.run(sess_outputs, sess_inputs), number=1, repeat=repeat_times) + runtimes += runtime except Exception as e: @@ -191,7 +235,9 @@ def inference_ort(args, name, session, ep, ort_inputs, result_template, repeat_t result = {} result.update(result_template) result.update({"io_binding": False}) - result.update(get_latency_result(runtimes, batch_size)) + latency_result = get_latency_result(runtimes, batch_size, mem_usage) + result.update(latency_result) + logger.info(result) return result def inference_ort_and_get_prediction(name, session, ort_inputs): @@ -206,7 +252,7 @@ def inference_ort_and_get_prediction(name, session, ort_inputs): logger.info(sess_outputs) result = session.run(sess_outputs, sess_inputs) - + if debug: logger.info("ORT session output results:") logger.info(result) @@ -381,7 +427,6 @@ def validate(all_ref_outputs, all_outputs, rtol, atol, percent_mismatch): for ref_o, o in zip(ref_output, output): # abs(desired-actual) < rtol * abs(desired) + atol try: - logger.info("Output shape{} input shape{}".format(ref_output.shape, output.shape)) np.testing.assert_allclose(ref_o, o, rtol, atol) except Exception as e: if percentage_in_allowed_threshold(e, percent_mismatch): @@ -418,10 +463,8 @@ def cleanup_files(): def remove_profiling_files(path): files = [] 
- p = subprocess.Popen(["find", path, "-name", "onnxruntime_profile*"], stdout=subprocess.PIPE) - stdout, sterr = p.communicate() - stdout = stdout.decode("ascii").strip() - files = files + stdout.split("\n") + out = get_output(["find", path, "-name", "onnxruntime_profile*"]) + files = files + out.split("\n") for f in files: if "custom_test_data" in f: @@ -866,6 +909,9 @@ def run_onnxruntime(args, models): os.chdir(path) path = os.getcwd() + if args.running_mode == "validate": + remove_profiling_files(path) + inputs = [] ref_outputs = [] all_inputs_shape = [] # use for standalone trt @@ -972,23 +1018,40 @@ def run_onnxruntime(args, models): "batch_size": batch_size, "sequence_length": 1, "datetime": str(datetime.now()),} + + # get standalone TensorRT perf + if trt in ep and args.trtexec: + + try: + if args.track_memory: + ep = standalone_trt_fp16 if fp16 else standalone_trt + p = start_memory_tracking() + result = run_trt_standalone(args.trtexec, model_path, sess.get_inputs(), all_inputs_shape, fp16) + mem_usage = end_memory_tracking(p, True) + if result and mem_usage: + result["memory"] = mem_usage - result = inference_ort(args, name, sess, ep, inputs, result_template, args.test_times, batch_size) + else: + result = run_trt_standalone(args.trtexec, model_path, sess.get_inputs(), all_inputs_shape, fp16) + except Exception as e: + logger.error(e) + update_fail_model_map(model_to_fail_ep, name, ep, 'runtime error', e) + continue + + else: + result = inference_ort(args, name, sess, ep, inputs, result_template, args.test_times, batch_size) + if result: - success_results.append(result) latency_result[ep] = {} latency_result[ep]["average_latency_ms"] = result["average_latency_ms"] latency_result[ep]["latency_90_percentile"] = result["latency_90_percentile"] + if "memory" in result: + mem_usage = result.pop("memory") + latency_result[ep]["memory"] = mem_usage - # get standalone TensorRT perf - if trt in ep and args.trtexec: - result = run_trt_standalone(args.trtexec, model_path, sess.get_inputs(), all_inputs_shape, fp16) - if result and len(result) > 0: - if fp16: - latency_result[standalone_trt_fp16] = result - else: - latency_result[standalone_trt] = result + if not args.trtexec: # skip standalone + success_results.append(result) model_to_latency[name] = copy.deepcopy(latency_result) @@ -1076,21 +1139,26 @@ def run_onnxruntime(args, models): return success_results, model_to_latency, model_to_fail_ep, model_to_metrics +def calculate_gain(value, ep1, ep2): + ep1_latency = float(value[ep1]['average_latency_ms']) + ep2_latency = float(value[ep2]['average_latency_ms']) + gain = (ep2_latency - ep1_latency)*100/ep2_latency + return gain + def add_improvement_information(model_to_latency): for key, value in model_to_latency.items(): - if not (trt in value and cuda in value): - continue - - trt_latency = float(value[trt]['average_latency_ms']) - cuda_latency = float(value[cuda]['average_latency_ms']) - gain = (cuda_latency - trt_latency)*100/cuda_latency - value["Tensorrt_gain(%)"] = "{:.2f} %".format(gain) - - if trt_fp16 in value and cuda_fp16 in value: - trt_fp16_latency = float(value[trt_fp16]['average_latency_ms']) - cuda_fp16_latency = float(value[cuda_fp16]['average_latency_ms']) - gain = (cuda_fp16_latency - trt_fp16_latency)*100/cuda_fp16_latency - value["Tensorrt_fp16_gain(%)"] = "{:.2f} %".format(gain) + if trt in value and cuda in value: + gain = calculate_gain(value, trt, cuda) + value[trt_cuda_gain] = "{:.2f} %".format(gain) + if trt_fp16 in value and cuda_fp16 in value: + gain = 
calculate_gain(value, trt_fp16, cuda_fp16) + value[trt_cuda_fp16_gain] = "{:.2f} %".format(gain) + if trt in value and standalone_trt in value: + gain = calculate_gain(value, trt, standalone_trt) + value[trt_native_gain] = "{:.2f} %".format(gain) + if trt_fp16 in value and standalone_trt_fp16 in value: + gain = calculate_gain(value, trt_fp16, standalone_trt_fp16) + value[trt_native_fp16_gain] = "{:.2f} %".format(gain) def output_details(results, csv_filename): need_write_header = True @@ -1156,11 +1224,12 @@ def build_status(status_dict, results, is_fail): status = 'Fail' add_status_dict(status_dict, model_name, ep, status) else: - for result in results: - model_name = result['model_name'] - ep = result['device'] - status = 'Pass' - add_status_dict(status_dict, model_name, ep, status) + for model, value in results.items(): + for ep, ep_info in value.items(): + model_name = model + ep = ep + status = 'Pass' + add_status_dict(status_dict, model_name, ep, status) return status_dict @@ -1194,6 +1263,7 @@ def output_status(results, csv_filename): trt_fp16_status = "" standalone_fp16_status = "" + for model_name, ep_dict in results.items(): for ep, status in ep_dict.items(): if ep == cpu: @@ -1214,7 +1284,7 @@ def output_status(results, csv_filename): continue row = [model_name, - cpu, + cpu_status, cuda_fp32_status, trt_fp32_status, standalone_fp32_status, @@ -1234,18 +1304,26 @@ def output_latency(results, csv_filename): "CPU \n 90th percentile (ms)", "CUDA fp32 \nmean (ms)", "CUDA fp32 \n90th percentile (ms)", + "CUDA EP fp32 \nmemory usage (MiB)", "TRT EP fp32 \nmean (ms)", "TRT EP fp32 \n90th percentile (ms)", + "TRT EP fp32 \nmemory usage (MiB)", "Standalone TRT fp32 \nmean (ms)", "Standalone TRT fp32 \n90th percentile (ms)", + "Standalone TRT fp32 \nmemory usage (MiB)", + "TRT v CUDA EP fp32 \ngain (mean) (%)", + "EP v Native TRT fp32 \ngain (mean) (%)", "CUDA fp16 \nmean (ms)", "CUDA fp16 \n90th percentile (ms)", + "CUDA EP fp16 \nmemory usage (MiB)", "TRT EP fp16 \nmean (ms)", "TRT EP fp16 \n90 percentile (ms)", + "TRT EP fp16 \nmemory usage (MiB)", "Standalone TRT fp16 \nmean (ms)", "Standalone TRT fp16 \n90th percentile (ms)", - "TRT EP \ngain (mean) (%)", - "TRT EP fp16 \ngain (mean) (%)"] + "Standalone TRT fp16 \nmemory usage (MiB)", + "TRT v CUDA EP fp16 \ngain (mean) (%)", + "EP v Native TRT fp16 \ngain (mean) (%)"] csv_writer = csv.writer(csv_file) if need_write_header: @@ -1268,6 +1346,10 @@ def output_latency(results, csv_filename): if cuda in value and 'latency_90_percentile' in value[cuda]: cuda_90_percentile = value[cuda]['latency_90_percentile'] + cuda_memory = "" + if cuda in value and 'memory' in value[cuda]: + cuda_memory = value[cuda]['memory'] + trt_average = "" if trt in value and 'average_latency_ms' in value[trt]: trt_average = value[trt]['average_latency_ms'] @@ -1275,6 +1357,10 @@ def output_latency(results, csv_filename): trt_90_percentile = "" if trt in value and 'latency_90_percentile' in value[trt]: trt_90_percentile = value[trt]['latency_90_percentile'] + + trt_memory = "" + if trt in value and 'memory' in value[trt]: + trt_memory = value[trt]['memory'] standalone_trt_average = "" if standalone_trt in value and 'average_latency_ms' in value[standalone_trt]: @@ -1283,12 +1369,19 @@ def output_latency(results, csv_filename): standalone_trt_90_percentile = "" if standalone_trt in value and 'latency_90_percentile' in value[standalone_trt]: standalone_trt_90_percentile = value[standalone_trt]['latency_90_percentile'] - + + standalone_trt_memory = "" + if 
standalone_trt in value and 'memory' in value[standalone_trt]: + standalone_trt_memory = value[standalone_trt]['memory'] cuda_fp16_average = "" if cuda_fp16 in value and 'average_latency_ms' in value[cuda_fp16]: cuda_fp16_average = value[cuda_fp16]['average_latency_ms'] + cuda_fp16_memory = "" + if cuda_fp16 in value and 'memory' in value[cuda_fp16]: + cuda_fp16_memory = value[cuda_fp16]['memory'] + cuda_fp16_90_percentile = "" if cuda_fp16 in value and 'latency_90_percentile' in value[cuda_fp16]: cuda_fp16_90_percentile = value[cuda_fp16]['latency_90_percentile'] @@ -1301,32 +1394,47 @@ def output_latency(results, csv_filename): if trt_fp16 in value and 'latency_90_percentile' in value[trt_fp16]: trt_fp16_90_percentile = value[trt_fp16]['latency_90_percentile'] + trt_fp16_memory = "" + if trt_fp16 in value and 'memory' in value[trt_fp16]: + trt_fp16_memory = value[trt_fp16]['memory'] + standalone_trt_fp16_average = "" - if standalone_trt in value and 'average_latency_ms' in value[standalone_trt_fp16]: - standalone_trt_fp16_average = value[standalone_trt]['average_latency_ms'] + if standalone_trt_fp16 in value and 'average_latency_ms' in value[standalone_trt_fp16]: + standalone_trt_fp16_average = value[standalone_trt_fp16]['average_latency_ms'] standalone_trt_fp16_90_percentile = "" - if standalone_trt in value and 'latency_90_percentile' in value[standalone_trt_fp16]: - standalone_trt_fp16_90_percentile = value[standalone_trt]['latency_90_percentile'] - + if standalone_trt_fp16 in value and 'latency_90_percentile' in value[standalone_trt_fp16]: + standalone_trt_fp16_90_percentile = value[standalone_trt_fp16]['latency_90_percentile'] + + standalone_trt_fp16_memory = "" + if standalone_trt_fp16 in value and 'memory' in value[standalone_trt_fp16]: + standalone_trt_fp16_memory = value[standalone_trt_fp16]['memory'] row = [key, cpu_average, cpu_90_percentile, cuda_average, cuda_90_percentile, + cuda_memory, trt_average, trt_90_percentile, + trt_memory, standalone_trt_average, standalone_trt_90_percentile, + standalone_trt_memory, + value[trt_cuda_gain] if trt_cuda_gain in value else " ", + value[trt_native_gain] if trt_native_gain in value else " ", cuda_fp16_average, cuda_fp16_90_percentile, + cuda_fp16_memory, trt_fp16_average, trt_fp16_90_percentile, + trt_fp16_memory, standalone_trt_fp16_average, standalone_trt_fp16_90_percentile, - value['Tensorrt_gain(%)'] if 'Tensorrt_gain(%)' in value else " ", - value['Tensorrt_fp16_gain(%)'] if 'Tensorrt_fp16_gain(%)' in value else " " + standalone_trt_fp16_memory, + value[trt_cuda_fp16_gain] if trt_cuda_fp16_gain in value else " ", + value[trt_native_fp16_gain] if trt_native_fp16_gain in value else " " ] csv_writer.writerow(row) @@ -1449,6 +1557,8 @@ def parse_arguments(): parser.add_argument("-i", "--input_data", required=False, default="fix", choices=["fix", "random"], help="Type of input data.") parser.add_argument("-o", "--perf_result_path", required=False, default="result", help="Directory for perf result.") + + parser.add_argument("--track_memory", required=False, default=True, help="Track CUDA and TRT Memory Usage") parser.add_argument("--ep", required=False, default=None, help="Specify ORT Execution Provider.") @@ -1512,7 +1622,6 @@ def main(): success_results, model_to_latency, model_to_fail_ep, model_to_metrics = run_onnxruntime(args, models) perf_end_time = datetime.now() - logger.info("Done running the perf.") logger.info("\nTotal time for benchmarking all models: {}".format(perf_end_time - perf_start_time)) 
logger.info(list(models.keys())) @@ -1548,7 +1657,7 @@ def main(): logger.info("\n==========================================") logger.info("=========== Models/EPs latency ===========") logger.info("==========================================") - # add_improvement_information(model_to_latency) + add_improvement_information(model_to_latency) pp.pprint(model_to_latency) write_map_to_file(model_to_latency, LATENCY_FILE) if args.write_test_result: diff --git a/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py b/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py index af123bef30..f788c33c67 100644 --- a/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py +++ b/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py @@ -8,7 +8,6 @@ import json import re import pprint from benchmark import * -from perf_utils import get_latest_commit_hash def write_model_info_to_file(model, path): with open(path, 'w') as file: @@ -19,8 +18,8 @@ def get_ep_list(comparison): ep_list = [cpu, acl] else: # test with cuda and trt - ep_list = [cpu, cuda, trt, cuda_fp16, trt_fp16] - return ep_list + ep_list = [cpu, cuda, trt, standalone_trt, cuda_fp16, trt_fp16, standalone_trt_fp16] + return ep_list def main(): args = parse_arguments() @@ -47,32 +46,43 @@ def main(): model_list_file = os.path.join(os.getcwd(), model +'.json') write_model_info_to_file([model_info], model_list_file) + if args.ep: ep_list = [args.ep] else: ep_list = get_ep_list(args.comparison) + for ep in ep_list: + + command = ["python3", + "benchmark.py", + "-r", args.running_mode, + "-m", model_list_file, + "-o", args.perf_result_path, + "--write_test_result", "false"] + + if "Standalone" in ep: + if args.running_mode == "validate": + continue + else: + trtexec_path = get_trtexec_path() + command.extend(["--trtexec", trtexec_path]) + ep = trt_fp16 if "fp16" in ep else trt + + command.extend(["--ep", ep]) + if args.running_mode == "validate": - p = subprocess.run(["python3", - "benchmark.py", - "-r", args.running_mode, - "-m", model_list_file, - "--ep", ep, - "-o", args.perf_result_path, - "--write_test_result", "false", - "--benchmark_fail_csv", benchmark_fail_csv, - "--benchmark_metrics_csv", benchmark_metrics_csv]) + command.extend(["--benchmark_fail_csv", benchmark_fail_csv, + "--benchmark_metrics_csv", benchmark_metrics_csv]) + elif args.running_mode == "benchmark": - p = subprocess.run(["python3", - "benchmark.py", - "-r", args.running_mode, - "-m", model_list_file, - "--ep", ep, - "-t", str(args.test_times), - "-o", args.perf_result_path, - "--write_test_result", "false", - "--benchmark_latency_csv", benchmark_latency_csv, - "--benchmark_success_csv", benchmark_success_csv]) + command.extend(["-t", str(args.test_times), + "-o", args.perf_result_path, + "--write_test_result", "false", + "--benchmark_latency_csv", benchmark_latency_csv, + "--benchmark_success_csv", benchmark_success_csv]) + + p = subprocess.run(command) logger.info(p) if p.returncode != 0: @@ -116,11 +126,10 @@ def main(): logger.info("=======================================================") model_status = {} - success_path = os.path.join(path, benchmark_success_csv) - if os.path.exists(success_path): - model_success = read_success_from_file(success_path) + if os.path.exists(LATENCY_FILE): + model_latency = read_map_from_file(LATENCY_FILE) is_fail = False - model_status = build_status(model_status, model_success, is_fail) + model_status = build_status(model_status, model_latency, is_fail) if os.path.exists(FAIL_MODEL_FILE): model_fail = 
read_map_from_file(FAIL_MODEL_FILE) is_fail = True diff --git a/onnxruntime/python/tools/tensorrt/perf/build/Dockerfile.tensorrt-perf b/onnxruntime/python/tools/tensorrt/perf/build/Dockerfile.tensorrt-perf index d6a64ef197..affc2682bf 100644 --- a/onnxruntime/python/tools/tensorrt/perf/build/Dockerfile.tensorrt-perf +++ b/onnxruntime/python/tools/tensorrt/perf/build/Dockerfile.tensorrt-perf @@ -1,4 +1,31 @@ -FROM onnxruntime-trt +# -------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------- +# Dockerfile to run ONNXRuntime with TensorRT integration + +# nvidia TensorRT Base Image +FROM nvcr.io/nvidia/tensorrt:20.12-py3 + +ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime +ARG ONNXRUNTIME_BRANCH=master + RUN apt-get update &&\ - apt-get -y install libprotobuf-dev protobuf-compiler pciutils &&\ - pip install coloredlogs numpy flake8 onnx Cython onnxmltools sympy packaging + apt-get install -y sudo git bash unattended-upgrades +RUN unattended-upgrade + +WORKDIR /code +ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/code/cmake-3.14.3-Linux-x86_64/bin:/opt/miniconda/bin:${PATH} +ENV LD_LIBRARY_PATH /opt/miniconda/lib:$LD_LIBRARY_PATH + +# Prepare onnxruntime repository & build onnxruntime with TensorRT +RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime &&\ + /bin/sh onnxruntime/onnxruntime/python/tools/tensorrt/perf/build/install_common_deps.sh &&\ + cp onnxruntime/docs/Privacy.md /code/Privacy.md &&\ + cp onnxruntime/dockerfiles/LICENSE-IMAGE.txt /code/LICENSE-IMAGE.txt &&\ + cp onnxruntime/ThirdPartyNotices.txt /code/ThirdPartyNotices.txt &&\ + cd onnxruntime &&\ + /bin/sh ./build.sh --parallel --cuda_home /usr/local/cuda --cudnn_home /usr/lib/x86_64-linux-gnu/ --use_tensorrt --tensorrt_home /workspace/tensorrt --config Release --build_wheel --update --build --cmake_extra_defines ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) &&\ + pip install /code/onnxruntime/build/Linux/Release/dist/*.whl &&\ + cd .. &&\ + rm -rf onnxruntime cmake-3.14.3-Linux-x86_64 diff --git a/onnxruntime/python/tools/tensorrt/perf/build/build_image.sh b/onnxruntime/python/tools/tensorrt/perf/build/build_image.sh new file mode 100755 index 0000000000..e698b86790 --- /dev/null +++ b/onnxruntime/python/tools/tensorrt/perf/build/build_image.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +while getopts p:b:i: parameter +do case "${parameter}" +in +p) PERF_DOCKERFILE_PATH=${OPTARG};; +b) ORT_BRANCH=${OPTARG};; +i) IMAGE_NAME=${OPTARG};; +esac +done + +sudo docker build --no-cache -t $IMAGE_NAME --build-arg ONNXRUNTIME_BRANCH=$ORT_BRANCH -f $ORT_DOCKERFILE_PATH .. diff --git a/onnxruntime/python/tools/tensorrt/perf/build/build_images.sh b/onnxruntime/python/tools/tensorrt/perf/build/build_images.sh deleted file mode 100755 index f6d933ccf4..0000000000 --- a/onnxruntime/python/tools/tensorrt/perf/build/build_images.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -while getopts o:p:b:i: parameter -do case "${parameter}" -in -o) ORT_DOCKERFILE_PATH=${OPTARG};; -p) PERF_DOCKERFILE_PATH=${OPTARG};; -b) ORT_BRANCH=${OPTARG};; -i) IMAGE_NAME=${OPTARG};; -esac -done - -sudo docker build --no-cache -t onnxruntime-trt --build-arg ONNXRUNTIME_BRANCH=$ORT_BRANCH -f $ORT_DOCKERFILE_PATH .. -sudo docker build --no-cache -t $IMAGE_NAME -f $PERF_DOCKERFILE_PATH .. 
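The --track_memory path added to benchmark.py above starts an nvidia-smi poller that logs per-process GPU usage to temp_memory.csv while inference runs, then reads the peak back with pandas (start_memory_tracking / end_memory_tracking / get_max_memory). Below is a minimal standalone sketch of that mechanism, assuming nvidia-smi is on PATH, the workload runs long enough for the 1-second poll to record a sample, and the CSV header matches the ' used_gpu_memory [MiB]' column name the patch relies on; measure_gpu_memory is a hypothetical helper, not part of this change.

```
import os
import subprocess

import pandas as pd

MEMORY_FILE = "./temp_memory.csv"   # same temp file name the patch uses

def measure_gpu_memory(run_workload):
    """Run `run_workload()` while nvidia-smi logs GPU memory; return peak MiB."""
    poller = subprocess.Popen([
        "nvidia-smi",
        "--query-compute-apps=pid,used_memory",
        "--format=csv",
        "-l", "1",           # sample once per second
        "-f", MEMORY_FILE,   # write samples to a CSV file
    ])
    try:
        run_workload()
    finally:
        poller.terminate()
        poller.wait()

    df = pd.read_csv(MEMORY_FILE)
    os.remove(MEMORY_FILE)
    # Values look like "123 MiB"; strip the unit before casting.
    # The real script also filters rows by pid to isolate the python or
    # trtexec process; that step is omitted here for brevity.
    mem = df[" used_gpu_memory [MiB]"].str.replace(" MiB", "").astype(int)
    return int(mem.max())
```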
diff --git a/onnxruntime/python/tools/tensorrt/perf/build/install_common_deps.sh b/onnxruntime/python/tools/tensorrt/perf/build/install_common_deps.sh new file mode 100644 index 0000000000..42326928ae --- /dev/null +++ b/onnxruntime/python/tools/tensorrt/perf/build/install_common_deps.sh @@ -0,0 +1,21 @@ +#!/bin/bash +DEBIAN_FRONTEND=noninteractive +apt-get update && apt-get install -y --no-install-recommends \ + wget \ + zip \ + ca-certificates \ + build-essential \ + curl \ + libcurl4-openssl-dev \ + libssl-dev \ + python3-dev \ + libprotobuf-dev \ + protobuf-compiler \ + pciutils + +pip install pandas coloredlogs numpy flake8 onnx Cython onnxmltools sympy packaging psutil + +# Dependencies: cmake +wget --quiet https://github.com/Kitware/CMake/releases/download/v3.14.3/cmake-3.14.3-Linux-x86_64.tar.gz +tar zxf cmake-3.14.3-Linux-x86_64.tar.gz +rm -rf cmake-3.14.3-Linux-x86_64.tar.gz diff --git a/onnxruntime/python/tools/tensorrt/perf/model_list.json b/onnxruntime/python/tools/tensorrt/perf/model_list.json index f77b3e5fa8..10f552246a 100644 --- a/onnxruntime/python/tools/tensorrt/perf/model_list.json +++ b/onnxruntime/python/tools/tensorrt/perf/model_list.json @@ -1,241 +1,277 @@ [ { - "model_name": "BERT-Squad", - "working_directory": "./models/bert-squad/", - "model_path": "./download_sample_10/bertsquad10.onnx", - "test_data_path": "./download_sample_10/" + "model_name": "mobilenetv2-7", + "working_directory": "./models/mobilenetv2-7/", + "model_path": "./mobilenetv2-7/mobilenetv2-7.onnx", + "test_data_path": "./mobilenetv2-7/" + }, + { + "model_name": "resnet18-v1-7", + "working_directory": "./models/resnet18-v1-7/", + "model_path": "./resnet18-v1-7/resnet18-v1-7.onnx", + "test_data_path": "./resnet18-v1-7/" + }, + { + "model_name": "resnet34-v1-7", + "working_directory": "./models/resnet34-v1-7/", + "model_path": "./resnet34-v1-7/resnet34-v1-7.onnx", + "test_data_path": "./resnet34-v1-7/" + }, + { + "model_name": "resnet50-v1-7", + "working_directory": "./models/resnet50-v1-7/", + "model_path": "./resnet50v1/resnet50-v1-7.onnx", + "test_data_path": "./resnet50v1/" + }, + { + "model_name": "resnet101-v1-7", + "working_directory": "./models/resnet101-v1-7/", + "model_path": "./resnet101v1/resnet101-v1-7.onnx", + "test_data_path": "./resnet101v1/" + }, + { + "model_name": "resnet152-v1-7", + "working_directory": "./models/resnet152-v1-7/", + "model_path": "./resnet152v1/resnet152-v1-7.onnx", + "test_data_path": "./resnet152v1/" + }, + { + "model_name": "resnet18-v2-7", + "working_directory": "./models/resnet18-v2-7/", + "model_path": "./resnet18v2/resnet18-v2-7.onnx", + "test_data_path": "./resnet18v2/" + }, + { + "model_name": "resnet34-v2-7", + "working_directory": "./models/resnet34-v2-7/", + "model_path": "./resnet34v2/resnet34-v2-7.onnx", + "test_data_path": "./resnet34v2/" + }, + { + "model_name": "resnet50-v2-7", + "working_directory": "./models/resnet50-v2-7/", + "model_path": "./resnet50v2/resnet50-v2-7.onnx", + "test_data_path": "./resnet50v2/" + }, + { + "model_name": "resnet101-v2-7", + "working_directory": "./models/resnet101-v2-7/", + "model_path": "./resnet101v2/resnet101-v2-7.onnx", + "test_data_path": "./resnet101v2/" + }, + { + "model_name": "resnet152-v2-7", + "working_directory": "./models/resnet152-v2-7/", + "model_path": "./resnet152v2/resnet152-v2-7.onnx", + "test_data_path": "./resnet152v2/" + }, + { + "model_name": "squeezenet1.1-7", + "working_directory": "./models/squeezenet1.1-7/", + "model_path": "./squeezenet1.1/._squeezenet1.1.onnx", + "test_data_path": 
"./squeezenet1.1/" + }, + { + "model_name": "squeezenet1.0-3", + "working_directory": "./models/squeezenet1.0-3/", + "model_path": "./squeezenet/model.onnx", + "test_data_path": "./squeezenet/" + }, + { + "model_name": "squeezenet1.0-6", + "working_directory": "./models/squeezenet1.0-6/", + "model_path": "./squeezenet/model.onnx", + "test_data_path": "./squeezenet/" + }, + { + "model_name": "squeezenet1.0-7", + "working_directory": "./models/squeezenet1.0-7/", + "model_path": "./squeezenet/model.onnx", + "test_data_path": "./squeezenet/" + }, + { + "model_name": "squeezenet1.0-8", + "working_directory": "./models/squeezenet1.0-8/", + "model_path": "./squeezenet/model.onnx", + "test_data_path": "./squeezenet/" + }, + { + "model_name": "squeezenet1.0-9", + "working_directory": "./models/squeezenet1.0-9/", + "model_path": "./squeezenet/model.onnx", + "test_data_path": "./squeezenet/" + }, + { + "model_name": "vgg16-7", + "working_directory": "./models/vgg16-7/", + "model_path": "./vgg16/._vgg16.onnx", + "test_data_path": "./vgg16/" + }, + { + "model_name": "vgg19-bn-7", + "working_directory": "./models/vgg19-bn-7/", + "model_path": "./vgg19-bn/._vgg19-bn.onnx", + "test_data_path": "./vgg19-bn/" + }, + { + "model_name": "bvlcalexnet-9", + "working_directory": "./models/bvlcalexnet-9/", + "model_path": "./bvlc_alexnet/model.onnx", + "test_data_path": "./bvlc_alexnet/" + }, + { + "model_name": "googlenet-9", + "working_directory": "./models/googlenet-9/", + "model_path": "./bvlc_googlenet/model.onnx", + "test_data_path": "./bvlc_googlenet/" + }, + { + "model_name": "caffenet-9", + "working_directory": "./models/caffenet-9/", + "model_path": "./bvlc_reference_caffenet/model.onnx", + "test_data_path": "./bvlc_reference_caffenet/" + }, + { + "model_name": "rcnn-ilsvrc13-9", + "working_directory": "./models/rcnn-ilsvrc13-9/", + "model_path": "./bvlc_reference_rcnn_ilsvrc13/model.onnx", + "test_data_path": "./bvlc_reference_rcnn_ilsvrc13/" + }, + { + "model_name": "densenet-9", + "working_directory": "./models/densenet-9/", + "model_path": "./densenet121/model.onnx", + "test_data_path": "./densenet121/" + }, + { + "model_name": "inception-v1-9", + "working_directory": "./models/inception-v1-9/", + "model_path": "./inception_v1/model.onnx", + "test_data_path": "./inception_v1/" + }, + { + "model_name": "inception-v2-9", + "working_directory": "./models/inception-v2-9/", + "model_path": "./inception_v2/model.onnx", + "test_data_path": "./inception_v2/" + }, + { + "model_name": "shufflenet-9", + "working_directory": "./models/shufflenet-9/", + "model_path": "./shufflenet/model.onnx", + "test_data_path": "./shufflenet/" + }, + { + "model_name": "shufflenet-v2-10", + "working_directory": "./models/shufflenet-v2-10/", + "model_path": "./model/test_shufflenetv2/model.onnx", + "test_data_path": "./model/test_shufflenetv2/" + }, + { + "model_name": "mnist-8", + "working_directory": "./models/mnist-8/", + "model_path": "./mnist/model.onnx", + "test_data_path": "./mnist/" + }, + { + "model_name": "tinyyolov2-8", + "working_directory": "./models/tinyyolov2-8/", + "model_path": "./tiny_yolov2/Model.onnx", + "test_data_path": "./tiny_yolov2/" + }, + { + "model_name": "ssd-10", + "working_directory": "./models/ssd-10/", + "model_path": "./model.onnx", + "test_data_path": "./" + }, + { + "model_name": "ssd_mobilenet_v1_10", + "working_directory": "./models/ssd_mobilenet_v1_10/", + "model_path": "./ssd_mobilenet_v1/ssd_mobilenet_v1.onnx", + "test_data_path": "./ssd_mobilenet_v1/" }, { "model_name": "FasterRCNN-10", - 
"working_directory": "./models/faster-rcnn/", + "working_directory": "./models/FasterRCNN-10/", "model_path": "./faster_rcnn_R_50_FPN_1x.onnx", "test_data_path": "./" }, { "model_name": "MaskRCNN-10", - "working_directory": "./models/mask-rcnn/", + "working_directory": "./models/MaskRCNN-10/", "model_path": "./mask_rcnn_R_50_FPN_1x.onnx", "test_data_path": "./" }, { - "model_name": "SSD", - "working_directory": "./models/ssd/", - "model_path": "./model.onnx", - "test_data_path": "./" - }, - { - "model_name": "TinyYolov2", - "working_directory": "./models/tiny-yolov2/", - "model_path": "./tiny_yolov2/model.onnx", - "test_data_path": "./tiny_yolov2/" - }, - { - "model_name": "TinyYolov3", - "working_directory": "./models/tiny-yolov3/", - "model_path": "./yolov3-tiny.onnx", - "test_data_path": "./" - }, - { - "model_name": "Yolov3", - "working_directory": "./models/yolov3/", - "model_path": "./yolov3/yolov3.onnx", - "test_data_path": "./yolov3/" - }, - { - "model_name": "Yolov4", - "working_directory": "./models/yolov4/", - "model_path": "./yolov4/yolov4.onnx", - "test_data_path": "./custom_test_data/" - }, - { - "model_name": "Resnet-152-v1", - "working_directory": "./models/resnet152v1/", - "model_path": "./resnet152v1/resnet152-v1-7.onnx", - "test_data_path": "./resnet152v1/" - }, - { - "model_name": "Resnet-152-v2", - "working_directory": "./models/resnet152v2/", - "model_path": "./resnet152v2/resnet152-v2-7.onnx", - "test_data_path": "./resnet152v2/" - }, - { - "model_name": "Inception-v1", - "working_directory": "./models/inception-v1/", - "model_path": "./inception_v1/model.onnx", - "test_data_path": "./inception_v1/" - }, - { - "model_name": "Inception-v2", - "working_directory": "./models/inception-v2/", - "model_path": "./inception_v2/model.onnx", - "test_data_path": "./inception_v2/" - }, - { - "model_name": "Mobilenet-v2-1.0", - "working_directory": "./models/mobilenet-v2/", - "model_path": "./mobilenetv2-1.0/mobilenetv2-1.0.onnx", - "test_data_path": "./mobilenetv2-1.0/" - }, - { - "model_name": "Zfnet512", - "working_directory": "./models/zfnet512/", - "model_path": "./zfnet512/model.onnx", - "test_data_path": "./zfnet512/" - }, - { - "model_name": "Vgg16", - "working_directory": "./models/vgg16/", - "model_path": "./vgg16/vgg16.onnx", - "test_data_path": "./vgg16/" - }, - { - "model_name": "Vgg19-bn", - "working_directory": "./models/vgg19-bn/", - "model_path": "./vgg19-bn/vgg19-bn.onnx", - "test_data_path": "./vgg19-bn/" - }, - { - "model_name": "GPT2", - "working_directory": "./models/GPT2/", - "model_path": "./GPT2/model.onnx", - "test_data_path": "./GPT2/" - }, - { - "model_name": "GPT2_LM_HEAD", - "working_directory": "./models/GPT2-LM-HEAD/", - "model_path": "./GPT-2-LM-HEAD/model.onnx", - "test_data_path": "./GPT-2-LM-HEAD/" - }, - { - "model_name": "mnist", - "working_directory": "./models/mnist/", - "model_path": "./mnist/model.onnx", - "test_data_path": "./mnist/" - }, - { - "model_name": "Resnet18-v1", - "working_directory": "./models/resnet18v1/", - "model_path": "./resnet18-v1-7/resnet18-v1-7.onnx", - "test_data_path": "./resnet18-v1-7/" - }, - { - "model_name": "Resnet18-v2", - "working_directory": "./models/resnet18v2/", - "model_path": "./resnet18v2/resnet18-v2-7.onnx", - "test_data_path": "./resnet18v2/" - }, - { - "model_name": "Resnet34-v1", - "working_directory": "./models/resnet34v1/", - "model_path": "./resnet34-v1-7/resnet34-v1-7.onnx", - "test_data_path": "./resnet34-v1-7/" - }, - { - "model_name": "Resnet34-v2", - "working_directory": 
"./models/resnet34v2/", - "model_path": "./resnet34v2/resnet34-v2-7.onnx", - "test_data_path": "./resnet34v2/" - }, - { - "model_name": "Resnet50-v1", - "working_directory": "./models/resnet50v1/", - "model_path": "./resnet50v1/resnet50v1.onnx", - "test_data_path": "./resnet50v1/" - }, - { - "model_name": "Resnet50-v2", - "working_directory": "./models/resnet50v2/", - "model_path": "./resnet50v2/resnet50v2.onnx", - "test_data_path": "./resnet50v2/" - }, - { - "model_name": "Resnet101", - "working_directory": "./models/resnet101/", - "model_path": "./resnet101v2/resnet101-v2-7.onnx", - "test_data_path": "./resnet101v2/" - }, - { - "model_name": "Shufflenet-v1", - "working_directory": "./models/shufflenet-v1/", - "model_path": "./shufflenet/model.onnx", - "test_data_path": "./shufflenet/" - }, - { - "model_name": "Shufflenet-v1", - "working_directory": "./models/shufflenet-v1/", - "model_path": "./shufflenet/model.onnx", - "test_data_path": "./shufflenet/" - }, - { - "model_name": "Shufflenet-v2", - "working_directory": "./models/shufflenet-v2/", - "model_path": "./model/test_shufflenetv2/model.onnx", - "test_data_path": "./model/test_shufflenetv2" - }, - { - "model_name": "Squeezenet1.1", - "working_directory": "./models/squeezenet1.1/", - "model_path": "./squeezenet1.1/squeezenet1.1.onnx", - "test_data_path": "./squeezenet1.1/" - }, - { - "model_name": "Emotion-ferplus", - "working_directory": "./models/emotion-ferplus/", - "model_path": "./emotion_ferplus/model.onnx", - "test_data_path": "./emotion_ferplus/" - }, - { - "model_name": "bvlc-googlenet", - "working_directory": "./models/bvlc-googlenet", - "model_path": "./bvlc_googlenet/model.onnx", - "test_data_path": "./bvlc_googlenet/" - }, - { - "model_name": "bvlc-alexnet", - "working_directory": "./models/bvlc-alexnet", - "model_path": "./bvlc_alexnet/model.onnx", - "test_data_path": "./bvlc_alexnet/" - }, - { - "model_name": "bvlc-caffenet", - "working_directory": "./models/bvlc-caffenet", - "model_path": "./bvlc_reference_caffenet/model.onnx", - "test_data_path": "./bvlc_reference_caffenet/" - }, - { - "model_name": "bvlc-rcnn-ilsvrc13", - "working_directory": "./models/bvlc-rcnn-ilvscr13", - "model_path": "./bvlc_reference_rcnn_ilsvrc13/model.onnx", - "test_data_path": "./bvlc_reference_rcnn_ilsvrc13/" - }, - { - "model_name": "Retinanet", - "working_directory": "./models/retinanet", + "model_name": "retinanet-9", + "working_directory": "./models/retinanet-9/", "model_path": "./test_retinanet_resnet101/retinanet-9.onnx", "test_data_path": "./test_retinanet_resnet101/" }, { - "model_name": "Densenet", - "working_directory": "./models/densenet", - "model_path": "./densenet121/model.onnx", - "test_data_path": "./densenet121/" + "model_name": "yolov3-10", + "working_directory": "./models/yolov3-10/", + "model_path": "./yolov3/yolov3.onnx", + "test_data_path": "./yolov3/" }, { - "model_name": "ResNet101-DUC-HDC", - "working_directory": "./models/Resnet101-DUC", + "model_name": "tiny-yolov3-11", + "working_directory": "./models/tiny-yolov3-11/", + "model_path": "./yolov3-tiny.onnx", + "test_data_path": "./" + }, + { + "model_name": "yolov4", + "working_directory": "./models/yolov4/", + "model_path": "./yolov4/yolov4.onnx", + "test_data_path": "./yolov4/" + }, + { + "model_name": "ResNet101-DUC-7", + "working_directory": "./models/ResNet101-DUC-7/", "model_path": "./ResNet101_DUC_HDC/ResNet101_DUC_HDC.onnx", "test_data_path": "./ResNet101_DUC_HDC/" }, { - "model_name": "Arc-Face", - "working_directory": "./models/arc-face", + "model_name": 
"emotion-ferplus-8", + "working_directory": "./models/emotion-ferplus-8/", + "model_path": "./emotion_ferplus/model.onnx", + "test_data_path": "./emotion_ferplus/" + }, + { + "model_name": "bertsquad-10", + "working_directory": "./models/bertsquad-10/", + "model_path": "./bertsquad-10/bertsquad10.onnx", + "test_data_path": "./bertsquad-10/" + }, + { + "model_name": "gpt2-lm-head-10", + "working_directory": "./models/gpt2-lm-head-10/", + "model_path": "./GPT-2-LM-HEAD/model.onnx", + "test_data_path": "./GPT-2-LM-HEAD/" + }, + { + "model_name": "gpt2-10", + "working_directory": "./models/gpt2-10/", + "model_path": "./GPT2/model.onnx", + "test_data_path": "./GPT2/" + }, + { + "model_name": "zfnet512-9", + "working_directory": "./models/zfnet512-9/", + "model_path": "./zfnet512/model.onnx", + "test_data_path": "./zfnet512/" + }, + { + "model_name": "arcfaceresnet100-8", + "working_directory": "./models/arcfaceresnet100-8/", "model_path": "./resnet100/resnet100.onnx", "test_data_path": "./resnet100/" }, { - "model_name": "Fast-Neural", - "working_directory": "./models/Fast-Neural", + "model_name": "mosaic-9", + "working_directory": "./models/mosaic-9/", "model_path": "./mosaic/mosaic.onnx", "test_data_path": "./mosaic/" } diff --git a/onnxruntime/python/tools/tensorrt/perf/perf.sh b/onnxruntime/python/tools/tensorrt/perf/perf.sh index 2dcd854168..f2633096cf 100755 --- a/onnxruntime/python/tools/tensorrt/perf/perf.sh +++ b/onnxruntime/python/tools/tensorrt/perf/perf.sh @@ -1,7 +1,5 @@ #!/bin/bash - - while getopts d:o:m: parameter do case "${parameter}" in diff --git a/onnxruntime/python/tools/tensorrt/perf/run_perf_docker.sh b/onnxruntime/python/tools/tensorrt/perf/run_perf_docker.sh index e49731ca08..4c12ad0266 100755 --- a/onnxruntime/python/tools/tensorrt/perf/run_perf_docker.sh +++ b/onnxruntime/python/tools/tensorrt/perf/run_perf_docker.sh @@ -19,13 +19,13 @@ VOLUME=$MACHINE_PERF_DIR:$DOCKER_PERF_DIR # Add Remaining Variables if [ $OPTION == "onnx-zoo-models" ] then - MODEL_PATH=model.json + MODEL_PATH=model_list.json fi if [ $OPTION == "many-models" ] then MODEL_PATH=/usr/share/mount/many-models - VOLUME=$VOLUME' -v /home/hcsuser/mount/test:/usr/share/mount/many-models' + VOLUME=$VOLUME' -v /home/hcsuser/mount/many-models:/usr/share/mount/many-models' fi if [ $OPTION == "partner-models" ] diff --git a/onnxruntime/python/tools/tensorrt/perf/run_perf_machine.sh b/onnxruntime/python/tools/tensorrt/perf/run_perf_machine.sh index 1a9f16945f..849546da0e 100755 --- a/onnxruntime/python/tools/tensorrt/perf/run_perf_machine.sh +++ b/onnxruntime/python/tools/tensorrt/perf/run_perf_machine.sh @@ -1,7 +1,7 @@ #!/bin/bash # Parse Arguments -while getopts d:o:m: parameter +while getopts o:m: parameter do case "${parameter}" in o) OPTION=${OPTARG};; @@ -15,12 +15,12 @@ PERF_DIR=/home/hcsuser/perf/ # Select models to be tested or run selected-models if [ $OPTION == "onnx-zoo-models" ] then - MODEL_PATH='model.json' + MODEL_PATH='model_list.json' fi if [ $OPTION == "many-models" ] then - MODEL_PATH=/usr/share/mount/many-models + MODEL_PATH=/home/hcsuser/mount/many-models fi if [ $OPTION == "partner-models" ] diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-perf-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-perf-pipeline.yml index c3dec967c6..08ed9af4f0 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-perf-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-perf-pipeline.yml @@ -3,23 +3,23 @@ jobs: pool: 
Linux-GPU-TensorRT-Perf variables: ALLOW_RELEASED_ONNX_OPSET_ONLY: '1' - image: 'onnxruntime-master-ep-perf' - timeoutInMinutes: 2000 + branch: 'master' + timeoutInMinutes: 4000 steps: - - script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/build/build_images.sh -o $(Build.SourcesDirectory)/dockerfiles/Dockerfile.tensorrt -p $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/build/Dockerfile.tensorrt-perf -b master -i $(image)' + - script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/build/build_image.sh -p $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/build/Dockerfile.tensorrt-perf -b master -i ort-$(branch)' displayName: 'Build latest ORT Images' workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/build' - - script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/run_perf_docker.sh -d $(image) -o "onnx-zoo-models"' + - script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/run_perf_docker.sh -d ort-$(branch) -o "onnx-zoo-models"' displayName: 'Onnx Zoo Models Perf' workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/' - - script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/run_perf_docker.sh -d $(image) -o "many-models"' + - script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/run_perf_docker.sh -d ort-$(branch) -o "many-models"' displayName: 'Many Models Perf' workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/' - - script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/run_perf_docker.sh -d $(image) -o "partner-models"' + - script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/run_perf_docker.sh -d ort-$(branch) -o "partner-models"' displayName: 'Partner Models Perf' workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/'
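For reference, the four gain columns that output_latency() now writes ("TRT v CUDA EP ... gain" and "EP v Native TRT ... gain") all come from calculate_gain() above: the percentage by which the first EP is faster than the second. A small worked example with made-up latencies; the 'Standalone_TRT' key is a placeholder for whatever string standalone_trt resolves to in perf_utils.

```
TRT_CUDA_GAIN = 'TRT_CUDA_gain(%)'
TRT_NATIVE_GAIN = 'EP_Native_TRT_gain(%)'

def calculate_gain(value, ep1, ep2):
    ep1_latency = float(value[ep1]['average_latency_ms'])
    ep2_latency = float(value[ep2]['average_latency_ms'])
    # Positive when ep1 (e.g. the TRT EP) is faster than ep2 (CUDA EP or native TRT).
    return (ep2_latency - ep1_latency) * 100 / ep2_latency

latency = {
    'TensorrtExecutionProvider': {'average_latency_ms': '7.50'},
    'CUDAExecutionProvider':     {'average_latency_ms': '10.00'},
    'Standalone_TRT':            {'average_latency_ms': '8.00'},   # placeholder key
}
latency[TRT_CUDA_GAIN] = "{:.2f} %".format(
    calculate_gain(latency, 'TensorrtExecutionProvider', 'CUDAExecutionProvider'))
latency[TRT_NATIVE_GAIN] = "{:.2f} %".format(
    calculate_gain(latency, 'TensorrtExecutionProvider', 'Standalone_TRT'))
print(latency[TRT_CUDA_GAIN], latency[TRT_NATIVE_GAIN])   # 25.00 % 6.25 %
```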