[EP Perf] MemTest: Add Valgrind and fix addressSanitizer (#16930)

### Description
1. Add Valgrind to the existing ep_perf CI MemTest and parse ORT-TRT memory-leak details
   1. The general Valgrind log and the log specific to ORT-TRT are made available in the [CI artifacts](https://aiinfra.visualstudio.com/Lotus/_build/results?buildId=334122&view=artifacts&pathAsName=false&type=publishedArtifacts)
   1. Logic:
      1. Run Valgrind on `onnxruntime_perf_test -e tensorrt` and write the log to `valgrind.log`
      2. Check whether any `definitely lost` memory leak was reported
         1. For each log paragraph that reports `definitely lost`, check whether it contains the keyword `TensorrtExecutionProvider`.
         2. If so, extract those details to `ort_trt_memleak_detail.log` and return a `build failure` to the EP Perf CI
3. Fix the existing AddressSanitizer test and sync the squeezenet test case with the latest update from [ort-inference-example](https://github.com/microsoft/onnxruntime-inference-examples/blob/main/c_cxx/squeezenet/main.cpp)
   1. Updates in short: upgrade main.cpp to use OrtTensorRTProviderOptionsV2
4. Reorder the 7-min MemTest to run ahead of the 9-hr model tests, and enable MemTest by default
2026-07-04 04:07:22 +00:00 · 2023-08-04 16:58:57 -07:00 · 2023-08-04 16:58:57 -07:00 · d6ce43db5e
commit d6ce43db5e
parent 5af8774a0b
7 changed files with 160 additions and 252 deletions
--- a/onnxruntime/python/tools/tensorrt/perf/mem_test/CMakeLists.txt
+++ b/onnxruntime/python/tools/tensorrt/perf/mem_test/CMakeLists.txt
@ -2,7 +2,7 @@ project(alprdaemon)

 set(CMAKE_BUILD_TYPE Debug)

-cmake_minimum_required (VERSION 2.6)
+cmake_minimum_required(VERSION 3.13)

 set(CMAKE_CXX_STANDARD 14)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
--- a/onnxruntime/python/tools/tensorrt/perf/mem_test/main.cpp
+++ b/onnxruntime/python/tools/tensorrt/perf/mem_test/main.cpp
@ -3,61 +3,31 @@
 //

 #include <assert.h>
+#include <iostream>
 #include <vector>
 #include <onnxruntime_cxx_api.h>
 #include <tensorrt_provider_factory.h>
-
-std::unique_ptr<OrtTensorRTProviderOptions> get_default_trt_provider_options() {
-  auto tensorrt_options = std::make_unique<OrtTensorRTProviderOptions>();
-  tensorrt_options->device_id = 0;
-  tensorrt_options->has_user_compute_stream = 0;
-  tensorrt_options->user_compute_stream = nullptr;
-  tensorrt_options->trt_max_partition_iterations = 1000;
-  tensorrt_options->trt_min_subgraph_size = 1;
-  tensorrt_options->trt_max_workspace_size = 1 << 30;
-  tensorrt_options->trt_fp16_enable = false;
-  tensorrt_options->trt_int8_enable = false;
-  tensorrt_options->trt_int8_calibration_table_name = "";
-  tensorrt_options->trt_int8_use_native_calibration_table = false;
-  tensorrt_options->trt_dla_enable = false;
-  tensorrt_options->trt_dla_core = 0;
-  tensorrt_options->trt_dump_subgraphs = false;
-  tensorrt_options->trt_engine_cache_enable = false;
-  tensorrt_options->trt_engine_cache_path = "";
-  tensorrt_options->trt_engine_decryption_enable = false;
-  tensorrt_options->trt_engine_decryption_lib_path = "";
-  tensorrt_options->trt_force_sequential_engine_build = false;
-  tensorrt_options->trt_context_memory_sharing_enable = false;
-  tensorrt_options->trt_layer_norm_fp32_fallback = false;
-  return tensorrt_options;
-}
+#include <tensorrt_provider_options.h>

 void run_ort_trt2() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "test");
+  const auto& api = Ort::GetApi();
+  OrtTensorRTProviderOptionsV2* tensorrt_options;

  Ort::SessionOptions session_options;
  session_options.SetIntraOpNumThreads(1);

  session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED);

-#ifdef _WIN32
-  const wchar_t* model_path = L"squeezenet.onnx";
-  const wchar_t* calib_table = L"squeezenet_calibration.flatbuffers";
-#else
  const char* model_path = "squeezenet.onnx";
-  const char* calib_table = "squeezenet_calibration.flatbuffers";
-#endif

-  auto tensorrt_options = get_default_trt_provider_options();
+  Ort::ThrowOnError(api.CreateTensorRTProviderOptions(&tensorrt_options));
+  std::unique_ptr<OrtTensorRTProviderOptionsV2, decltype(api.ReleaseTensorRTProviderOptions)> rel_trt_options(
+      tensorrt_options, api.ReleaseTensorRTProviderOptions);
+  Ort::ThrowOnError(api.SessionOptionsAppendExecutionProvider_TensorRT_V2(static_cast<OrtSessionOptions*>(session_options),
+                                                                          rel_trt_options.get()));

-  tensorrt_options->trt_engine_cache_enable = true;
-  tensorrt_options->trt_int8_enable = true;
-  tensorrt_options->trt_fp16_enable = true;
-  tensorrt_options->trt_int8_calibration_table_name = calib_table;
-
-  session_options.AppendExecutionProvider_TensorRT(*tensorrt_options.get());
-  printf("Runing ORT TRT EP with:\n\tengine cache enabled\n\tfp16 enabled if supports\n\tint8 enabled if supports\n\tint8 calibration table provided\n");
-  printf("First run ...\n");
+  std::cout << "Running ORT TRT EP with default provider options" << std::endl;

  Ort::Session session(env, model_path, session_options);

@ -66,53 +36,58 @@ void run_ort_trt2() {
  Ort::AllocatorWithDefaultOptions allocator;

  // print number of model input nodes
-  size_t num_input_nodes = session.GetInputCount();
-  std::vector<Ort::AllocatedStringPtr> input_node_names_ptr;
-  std::vector<const char*> input_node_names(num_input_nodes);
+  const size_t num_input_nodes = session.GetInputCount();
+  std::vector<Ort::AllocatedStringPtr> input_names_ptr;
+  std::vector<const char*> input_node_names;
+  input_names_ptr.reserve(num_input_nodes);
+  input_node_names.reserve(num_input_nodes);
  std::vector<int64_t> input_node_dims;  // simplify... this model has only 1 input node {1, 3, 224, 224}.
                                         // Otherwise need vector<vector<>>

-  printf("Number of inputs = %zu\n", num_input_nodes);
+  std::cout << "Number of inputs = " << num_input_nodes << std::endl;

  // iterate over all input nodes
-  for (int i = 0; i < num_input_nodes; i++) {
+  for (size_t i = 0; i < num_input_nodes; i++) {
    // print input node names
    auto input_name = session.GetInputNameAllocated(i, allocator);
-    printf("Input %d : name=%s\n", i, input_name.get());
-    input_node_names[i] = input_name.get();
-    input_node_names_ptr.push_back(std::move(input_name));
+    std::cout << "Input " << i << " : name =" << input_name.get() << std::endl;
+    input_node_names.push_back(input_name.get());
+    input_names_ptr.push_back(std::move(input_name));

    // print input node types
-    Ort::TypeInfo type_info = session.GetInputTypeInfo(i);
+    auto type_info = session.GetInputTypeInfo(i);
    auto tensor_info = type_info.GetTensorTypeAndShapeInfo();

    ONNXTensorElementDataType type = tensor_info.GetElementType();
-    printf("Input %d : type=%d\n", i, type);
+    std::cout << "Input " << i << " : type = " << type << std::endl;

    // print input shapes/dims
    input_node_dims = tensor_info.GetShape();
-    printf("Input %d : num_dims=%zu\n", i, input_node_dims.size());
-    for (int j = 0; j < input_node_dims.size(); j++)
-      printf("Input %d : dim %d=%jd\n", i, j, input_node_dims[j]);
+    std::cout << "Input " << i << " : num_dims = " << input_node_dims.size() << '\n';
+    for (size_t j = 0; j < input_node_dims.size(); j++) {
+      std::cout << "Input " << i << " : dim[" << j << "] =" << input_node_dims[j] << '\n';
+    }
+    std::cout << std::flush;
  }

-  size_t input_tensor_size = 224 * 224 * 3;  // simplify ... using known dim values to calculate size
-                                             // use OrtGetTensorShapeElementCount() to get official size!
+  constexpr size_t input_tensor_size = 224 * 224 * 3;  // simplify ... using known dim values to calculate size
+                                                       // use OrtGetTensorShapeElementCount() to get official size!

  std::vector<float> input_tensor_values(input_tensor_size);
  std::vector<const char*> output_node_names = {"softmaxout_1"};

  // initialize input data with values in [0.0, 1.0]
-  for (unsigned int i = 0; i < input_tensor_size; i++)
-    input_tensor_values[i] = (float)i / (input_tensor_size + 1);
+  for (unsigned int i = 0; i < input_tensor_size; i++) input_tensor_values[i] = (float)i / (input_tensor_size + 1);

  // create input tensor object from data values
  auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
-  Ort::Value input_tensor = Ort::Value::CreateTensor<float>(memory_info, input_tensor_values.data(), input_tensor_size, input_node_dims.data(), 4);
+  auto input_tensor = Ort::Value::CreateTensor<float>(memory_info, input_tensor_values.data(), input_tensor_size,
+                                                      input_node_dims.data(), 4);
  assert(input_tensor.IsTensor());

  // score model & input tensor, get back output tensor
-  auto output_tensors = session.Run(Ort::RunOptions{nullptr}, input_node_names.data(), &input_tensor, 1, output_node_names.data(), 1);
+  auto output_tensors =
+      session.Run(Ort::RunOptions{nullptr}, input_node_names.data(), &input_tensor, 1, output_node_names.data(), 1);
  assert(output_tensors.size() == 1 && output_tensors.front().IsTensor());

  // Get pointer to output tensor float values
@ -120,8 +95,9 @@ void run_ort_trt2() {
  assert(abs(floatarr[0] - 0.000045) < 1e-6);

  // score the model, and print scores for first 5 classes
-  for (int i = 0; i < 5; i++)
-    printf("Score for class [%d] =  %f\n", i, floatarr[i]);
+  for (int i = 0; i < 5; i++) {
+    std::cout << "Score for class [" << i << "] =  " << floatarr[i] << '\n';
+  }

  // Results should be as below...
  // Score for class[0] = 0.000045
@ -131,7 +107,7 @@ void run_ort_trt2() {
  // Score for class[4] = 0.001317

  // we need another run in order to make TRT EP use engine cache
-  printf("Second run ...\n");
+  std::cout << "Second run ...\n";

  // score model & input tensor, get back output tensor
  output_tensors = session.Run(Ort::RunOptions{nullptr}, input_node_names.data(), &input_tensor, 1, output_node_names.data(), 1);
@ -142,165 +118,32 @@ void run_ort_trt2() {
  assert(abs(floatarr[0] - 0.000045) < 1e-6);

  // score the model, and print scores for first 5 classes
-  for (int i = 0; i < 5; i++)
-    printf("Score for class [%d] =  %f\n", i, floatarr[i]);
-
-  // release buffers allocated by ORT alloctor
-  for (const char* node_name : input_node_names)
-    allocator.Free(const_cast<void*>(reinterpret_cast<const void*>(node_name)));
-
-  printf("Done!\n");
-}
-
-void ort_trt_run_with_default_options() {
-  //*************************************************************************
-  // initialize  environment...one environment per process
-  // environment maintains thread pools and other state info
-  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "test");
-
-  // initialize session options if needed
-  Ort::SessionOptions session_options;
-  session_options.SetIntraOpNumThreads(1);
-
-  // If onnxruntime.dll is built with CUDA enabled, we can uncomment out this line to use CUDA for this session
-  // OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 1);
-  OrtSessionOptionsAppendExecutionProvider_Tensorrt(session_options, 0);
-
-  // Sets graph optimization level
-  // Available levels are
-  // ORT_DISABLE_ALL -> To disable all optimizations
-  // ORT_ENABLE_BASIC -> To enable basic optimizations (Such as redundant node removals)
-  // ORT_ENABLE_EXTENDED -> To enable extended optimizations (Includes level 1 + more complex optimizations like node fusions)
-  // ORT_ENABLE_ALL -> To Enable All possible opitmizations
-  session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
-
-  //*************************************************************************
-  // create session and load model into memory
-  // using squeezenet version 1.3
-  // URL = https://github.com/onnx/models/tree/master/squeezenet
-#ifdef _WIN32
-  const wchar_t* model_path = L"squeezenet.onnx";
-#else
-  const char* model_path = "squeezenet.onnx";
-#endif
-
-  printf("Using Onnxruntime C++ API\n");
-  Ort::Session session(env, model_path, session_options);
-
-  //*************************************************************************
-  // print model input layer (node names, types, shape etc.)
-  Ort::AllocatorWithDefaultOptions allocator;
-
-  // print number of model input nodes
-  size_t num_input_nodes = session.GetInputCount();
-  std::vector<Ort::AllocatedStringPtr> input_node_names_ptr;
-  std::vector<const char*> input_node_names(num_input_nodes);
-  std::vector<int64_t> input_node_dims;  // simplify... this model has only 1 input node {1, 3, 224, 224}.
-                                         // Otherwise need vector<vector<>>
-
-  printf("Number of inputs = %zu\n", num_input_nodes);
-
-  // iterate over all input nodes
-  for (int i = 0; i < num_input_nodes; i++) {
-    // print input node names
-    auto input_name = session.GetInputNameAllocated(i, allocator);
-    printf("Input %d : name=%s\n", i, input_name.get());
-    input_node_names[i] = input_name.get();
-    input_node_names_ptr.push_back(std::move(input_name));
-
-    // print input node types
-    Ort::TypeInfo type_info = session.GetInputTypeInfo(i);
-    auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
-
-    ONNXTensorElementDataType type = tensor_info.GetElementType();
-    printf("Input %d : type=%d\n", i, type);
-
-    // print input shapes/dims
-    input_node_dims = tensor_info.GetShape();
-    printf("Input %d : num_dims=%zu\n", i, input_node_dims.size());
-    for (int j = 0; j < input_node_dims.size(); j++)
-      printf("Input %d : dim %d=%jd\n", i, j, input_node_dims[j]);
+  for (int i = 0; i < 5; i++) {
+    std::cout << "Score for class [" << i << "] =  " << floatarr[i] << '\n';
  }

-  // Results should be...
-  // Number of inputs = 1
-  // Input 0 : name = data_0
-  // Input 0 : type = 1
-  // Input 0 : num_dims = 4
-  // Input 0 : dim 0 = 1
-  // Input 0 : dim 1 = 3
-  // Input 0 : dim 2 = 224
-  // Input 0 : dim 3 = 224
-
-  //*************************************************************************
-  // Similar operations to get output node information.
-  // Use OrtSessionGetOutputCount(), OrtSessionGetOutputName()
-  // OrtSessionGetOutputTypeInfo() as shown above.
-
-  //*************************************************************************
-  // Score the model using sample data, and inspect values
-
-  size_t input_tensor_size = 224 * 224 * 3;  // simplify ... using known dim values to calculate size
-                                             // use OrtGetTensorShapeElementCount() to get official size!
-
-  std::vector<float> input_tensor_values(input_tensor_size);
-  std::vector<const char*> output_node_names = {"softmaxout_1"};
-
-  // initialize input data with values in [0.0, 1.0]
-  for (unsigned int i = 0; i < input_tensor_size; i++)
-    input_tensor_values[i] = (float)i / (input_tensor_size + 1);
-
-  // create input tensor object from data values
-  auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
-  Ort::Value input_tensor = Ort::Value::CreateTensor<float>(memory_info, input_tensor_values.data(), input_tensor_size, input_node_dims.data(), 4);
-  assert(input_tensor.IsTensor());
-
-  // score model & input tensor, get back output tensor
-  auto output_tensors = session.Run(Ort::RunOptions{nullptr}, input_node_names.data(), &input_tensor, 1, output_node_names.data(), 1);
-  assert(output_tensors.size() == 1 && output_tensors.front().IsTensor());
-
-  // Get pointer to output tensor float values
-  float* floatarr = output_tensors.front().GetTensorMutableData<float>();
-  assert(abs(floatarr[0] - 0.000045) < 1e-6);
-
-  // score the model, and print scores for first 5 classes
-  for (int i = 0; i < 5; i++)
-    printf("Score for class [%d] =  %f\n", i, floatarr[i]);
-
-  // Results should be as below...
-  // Score for class[0] = 0.000045
-  // Score for class[1] = 0.003846
-  // Score for class[2] = 0.000125
-  // Score for class[3] = 0.001180
-  // Score for class[4] = 0.001317
-
-  // release buffers allocated by ORT alloctor
-  for (const char* node_name : input_node_names)
-    allocator.Free(const_cast<void*>(reinterpret_cast<const void*>(node_name)));
-
-  printf("Done!\n");
+  std::cout << "Done!" << std::endl;
 }

 void run_ort_trt() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "test");
+  const auto& api = Ort::GetApi();
+  OrtTensorRTProviderOptionsV2* tensorrt_options;

  Ort::SessionOptions session_options;
  session_options.SetIntraOpNumThreads(1);

  session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED);

-#ifdef _WIN32
-  const wchar_t* model_path = L"squeezenet.onnx";
-  const wchar_t* calib_table = L"squeezenet_calibration.flatbuffers";
-#else
  const char* model_path = "squeezenet.onnx";
-  const char* calib_table = "squeezenet_calibration.flatbuffers";
-#endif

-  auto tensorrt_options = get_default_trt_provider_options();
+  Ort::ThrowOnError(api.CreateTensorRTProviderOptions(&tensorrt_options));
+  std::unique_ptr<OrtTensorRTProviderOptionsV2, decltype(api.ReleaseTensorRTProviderOptions)> rel_trt_options(
+      tensorrt_options, api.ReleaseTensorRTProviderOptions);
+  Ort::ThrowOnError(api.SessionOptionsAppendExecutionProvider_TensorRT_V2(static_cast<OrtSessionOptions*>(session_options),
+                                                                          rel_trt_options.get()));

-  session_options.AppendExecutionProvider_TensorRT(*tensorrt_options.get());
-  printf("Runing ORT TRT EP with default provider options\n");
+  std::cout << "Running ORT TRT EP with default provider options" << std::endl;

  Ort::Session session(env, model_path, session_options);

@ -309,53 +152,58 @@ void run_ort_trt() {
  Ort::AllocatorWithDefaultOptions allocator;

  // print number of model input nodes
-  size_t num_input_nodes = session.GetInputCount();
-  std::vector<Ort::AllocatedStringPtr> input_node_names_ptr;
-  std::vector<const char*> input_node_names(num_input_nodes);
+  const size_t num_input_nodes = session.GetInputCount();
+  std::vector<Ort::AllocatedStringPtr> input_names_ptr;
+  std::vector<const char*> input_node_names;
+  input_names_ptr.reserve(num_input_nodes);
+  input_node_names.reserve(num_input_nodes);
  std::vector<int64_t> input_node_dims;  // simplify... this model has only 1 input node {1, 3, 224, 224}.
                                         // Otherwise need vector<vector<>>

-  printf("Number of inputs = %zu\n", num_input_nodes);
+  std::cout << "Number of inputs = " << num_input_nodes << std::endl;

  // iterate over all input nodes
-  for (int i = 0; i < num_input_nodes; i++) {
+  for (size_t i = 0; i < num_input_nodes; i++) {
    // print input node names
    auto input_name = session.GetInputNameAllocated(i, allocator);
-    printf("Input %d : name=%s\n", i, input_name.get());
-    input_node_names[i] = input_name.get();
-    input_node_names_ptr.push_back(std::move(input_name));
+    std::cout << "Input " << i << " : name =" << input_name.get() << std::endl;
+    input_node_names.push_back(input_name.get());
+    input_names_ptr.push_back(std::move(input_name));

    // print input node types
-    Ort::TypeInfo type_info = session.GetInputTypeInfo(i);
+    auto type_info = session.GetInputTypeInfo(i);
    auto tensor_info = type_info.GetTensorTypeAndShapeInfo();

    ONNXTensorElementDataType type = tensor_info.GetElementType();
-    printf("Input %d : type=%d\n", i, type);
+    std::cout << "Input " << i << " : type = " << type << std::endl;

    // print input shapes/dims
    input_node_dims = tensor_info.GetShape();
-    printf("Input %d : num_dims=%zu\n", i, input_node_dims.size());
-    for (int j = 0; j < input_node_dims.size(); j++)
-      printf("Input %d : dim %d=%jd\n", i, j, input_node_dims[j]);
+    std::cout << "Input " << i << " : num_dims = " << input_node_dims.size() << '\n';
+    for (size_t j = 0; j < input_node_dims.size(); j++) {
+      std::cout << "Input " << i << " : dim[" << j << "] =" << input_node_dims[j] << '\n';
+    }
+    std::cout << std::flush;
  }

-  size_t input_tensor_size = 224 * 224 * 3;  // simplify ... using known dim values to calculate size
-                                             // use OrtGetTensorShapeElementCount() to get official size!
+  constexpr size_t input_tensor_size = 224 * 224 * 3;  // simplify ... using known dim values to calculate size
+                                                       // use OrtGetTensorShapeElementCount() to get official size!

  std::vector<float> input_tensor_values(input_tensor_size);
  std::vector<const char*> output_node_names = {"softmaxout_1"};

  // initialize input data with values in [0.0, 1.0]
-  for (unsigned int i = 0; i < input_tensor_size; i++)
-    input_tensor_values[i] = (float)i / (input_tensor_size + 1);
+  for (unsigned int i = 0; i < input_tensor_size; i++) input_tensor_values[i] = (float)i / (input_tensor_size + 1);

  // create input tensor object from data values
  auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
-  Ort::Value input_tensor = Ort::Value::CreateTensor<float>(memory_info, input_tensor_values.data(), input_tensor_size, input_node_dims.data(), 4);
+  auto input_tensor = Ort::Value::CreateTensor<float>(memory_info, input_tensor_values.data(), input_tensor_size,
+                                                      input_node_dims.data(), 4);
  assert(input_tensor.IsTensor());

  // score model & input tensor, get back output tensor
-  auto output_tensors = session.Run(Ort::RunOptions{nullptr}, input_node_names.data(), &input_tensor, 1, output_node_names.data(), 1);
+  auto output_tensors =
+      session.Run(Ort::RunOptions{nullptr}, input_node_names.data(), &input_tensor, 1, output_node_names.data(), 1);
  assert(output_tensors.size() == 1 && output_tensors.front().IsTensor());

  // Get pointer to output tensor float values
@ -363,8 +211,10 @@ void run_ort_trt() {
  assert(abs(floatarr[0] - 0.000045) < 1e-6);

  // score the model, and print scores for first 5 classes
-  for (int i = 0; i < 5; i++)
-    printf("Score for class [%d] =  %f\n", i, floatarr[i]);
+  for (int i = 0; i < 5; i++) {
+    std::cout << "Score for class [" << i << "] =  " << floatarr[i] << '\n';
+  }
+  std::cout << std::flush;

  // Results should be as below...
  // Score for class[0] = 0.000045
@ -373,15 +223,10 @@ void run_ort_trt() {
  // Score for class[3] = 0.001180
  // Score for class[4] = 0.001317

-  // release buffers allocated by ORT alloctor
-  for (const char* node_name : input_node_names)
-    allocator.Free(const_cast<void*>(reinterpret_cast<const void*>(node_name)));
-
-  printf("Done!\n");
+  std::cout << "Done!" << std::endl;
 }

 int main(int argc, char* argv[]) {
  run_ort_trt();
-  run_ort_trt2();
  return 0;
 }
--- a/onnxruntime/python/tools/tensorrt/perf/mem_test/run.sh
+++ b/onnxruntime/python/tools/tensorrt/perf/mem_test/run.sh
@ -14,9 +14,10 @@ s) ORT_SOURCE=${OPTARG};;
 esac
 done

-ONNX_MODEL_URL="https://github.com/onnx/models/raw/master/vision/classification/squeezenet/model/squeezenet1.0-7.onnx"
+ONNX_MODEL_TAR_URL="https://github.com/onnx/models/raw/main/vision/classification/squeezenet/model/squeezenet1.0-7.tar.gz"
+MODEL_TAR_NAME="squeezenet1.0-7.tar.gz"
 ONNX_MODEL="squeezenet.onnx"
-ASAN_OPTIONS="protect_shadow_gap=0:log_path=asan.log"
+ASAN_OPTIONS="protect_shadow_gap=0:new_delete_type_mismatch=0:log_path=asan.log"

 export LD_LIBRARY_PATH=${ORT_BINARY_PATH}
 export LIBRARY_PATH=${ORT_BINARY_PATH}
@ -46,21 +47,76 @@ cd build
 cp ../squeezenet_calibration.flatbuffers . 

 cmake ..
-make -j8
-wget ${ONNX_MODEL_URL} -O ${ONNX_MODEL}
-ASAN_OPTIONS=${ASAN_OPTIONS} ./onnx_memtest
+make -j
+wget ${ONNX_MODEL_TAR_URL} -O squeezenet1.0-7.tar.gz
+tar -xzf ${MODEL_TAR_NAME} --strip-components=1
+mv model.onnx ${ONNX_MODEL}
+rm ${MODEL_TAR_NAME}
+mkdir result

-if [ $? -ne 0 ]
-then
-    echo "Memory test application failed."
-    exit 1
+# Run valgrind
+echo $(date +"%Y-%m-%d %H:%M:%S") '[valgrind] Starting memcheck with' ${ONNX_MODEL}
+valgrind --leak-check=full --show-leak-kinds=all --log-file=valgrind.log ${ORT_SOURCE}/build/Linux/Release/onnxruntime_perf_test -e tensorrt -r 1 ${ONNX_MODEL}
+echo $(date +"%Y-%m-%d %H:%M:%S") '[valgrind] Analyzing valgrind log'
+
+found_leak_summary=false
+is_mem_leaked=false
+
+while IFS= read -r line
+do
+  if echo $line | grep -q 'LEAK SUMMARY:'; then
+    found_leak_summary=true
+  elif $found_leak_summary && echo $line | grep -q 'definitely lost:'; then
+    bytes_lost=$(echo $line | grep -o -E '[0-9,]+ bytes')
+    blocks_lost=$(echo $line | grep -o -E '[0-9]+ blocks')
+    echo "Bytes lost: $bytes_lost"
+    echo "Blocks lost: $blocks_lost"
+    if [ "$blocks_lost" != "0 blocks" ]; then
+      echo $(date +"%Y-%m-%d %H:%M:%S") '[valgrind] Memory leak happened when testing squeezenet model! Checking if it is ORT-TRT related'
+      is_mem_leaked=true
+    fi
+    found_leak_summary=false
+  fi
+done < "valgrind.log"
+
+# Export ORT-TRT memleak detail log if available
+if [ "$is_mem_leaked" = "true" ]; then
+    awk '
+    BEGIN {buffer=""; isDefinitelyLost=0; isOrtTrtRelated=0}
+
+    # substitute "==xxxxx==" with ""
+    {sub(/==[0-9]+== /, "")}
+    
+    # Start caching lines when isDefinitelyLost
+    /blocks are definitely lost in loss/ {isDefinitelyLost = 1; buffer=""; isOrtTrtRelated=0}
+    
+    # Cache this line when isDefinitelyLost and line!=""
+    # isOrtTrtRelated=1 when "TensorrtExecutionProvider" is found
+    isDefinitelyLost && $0 != "" {buffer = buffer "\n" $0; if($0 ~ /TensorrtExecutionProvider/) {isOrtTrtRelated=1}}
+    
+    # Stop caching and export buffer when isDefinitelyLost, line=="" and isOrtTrtRelated
+    isDefinitelyLost && $0 == "" {isDefinitelyLost = 0; if(isOrtTrtRelated==1) {print buffer}}
+    ' valgrind.log > ort_trt_memleak_detail.log
+
+    # Check if any ORT-TRT related memleak info has been parsed
+    if [ -s ort_trt_memleak_detail.log ]; then
+        mv ort_trt_memleak_detail.log result
+	    echo $(date +"%Y-%m-%d %H:%M:%S") '[valgrind] ORT-TRT memleak detail log parsed in CI artifact: ort_trt_memleak_detail.log'
+        exit 1
+    else
+	    rm ort_trt_memleak_detail.log
+    fi
 fi

-mkdir result
+mv valgrind.log result
+
+# Run AddressSanitizer 
+ASAN_OPTIONS=${ASAN_OPTIONS} ./onnx_memtest
+
 if [ -e asan.log* ]
 then
    cat asan.log*
    mv asan.log* result
 else
-    echo "No memory Leak(s) or other memory error(s) detected." > result/asan.log
-fi
+    echo $(date +"%Y-%m-%d %H:%M:%S") "[AddressSanitizer] No memory Leak(s) or other memory error(s) detected." > result/asan.log
+fi
--- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml
@ -24,7 +24,7 @@ parameters:
 - name: MemTest
  displayName: Run Memory Test
  type: boolean
-  default: false
+  default: true

 - name: TrtEPOptions
  displayName: TensorRT EP options
@ -87,17 +87,17 @@ jobs:
      - script: 'python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/build/build_image.py -r $(Build.SourcesDirectory) -i $(image) -b $(branchName) -t $(trtVersion) -a 75'
        displayName: 'Build latest ORT Image'
        workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/build'
-
-    - ${{ each option in parameters.ModelGroups }}:
-      - script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/run_perf_docker.sh -d $(image) -p $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf -v $(modelVolume) -b true -o ${{option}} -m $(${{option}}) -e "$(epList)" $(optional_arguments)'
-        displayName: '${{option}} perf'
-        workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/'
        
    - ${{ if eq(parameters.MemTest, true) }}:
      - script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/run_mem_test_docker.sh -d $(image) -p $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/ -w /code/ -l false'
        displayName: 'Run Memory Test'
        workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/'

+    - ${{ each option in parameters.ModelGroups }}:
+      - script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/run_perf_docker.sh -d $(image) -p $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf -v $(modelVolume) -b true -o ${{option}} -m $(${{option}}) -e "$(epList)" $(optional_arguments)'
+        displayName: '${{option}} perf'
+        workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/'
+
    # Prepare and Publish Artifacts 
    
    - script: 'mkdir $(Build.SourcesDirectory)/Artifact'
--- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_6_tensorrt8_4
+++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_6_tensorrt8_4
@ -44,6 +44,9 @@ RUN v="8.4.1-1+cuda11.6" &&\
 # Compile trtexec
 RUN cd /usr/src/tensorrt/samples/trtexec && make

+# Install Valgrind
+RUN apt-get install -y valgrind
+
 ARG BUILD_USER=onnxruntimedev
 ARG BUILD_UID=1000
 RUN adduser --gecos 'onnxruntime Build User' --disabled-password $BUILD_USER --uid $BUILD_UID
--- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5
+++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5
@ -41,6 +41,8 @@ RUN v="8.5.1-1+cuda11.8" &&\
 # Compile trtexec
 RUN cd /usr/src/tensorrt/samples/trtexec && make

+# Install Valgrind
+RUN apt-get install -y valgrind

 # Build final image from base. Builds ORT.
 FROM base as final
--- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6
+++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6
@ -41,6 +41,8 @@ RUN v="8.6.1.6-1+cuda11.8" &&\
 # Compile trtexec
 RUN cd /usr/src/tensorrt/samples/trtexec && make

+# Install Valgrind
+RUN apt-get install -y valgrind

 # Build final image from base. Builds ORT.
 FROM base as final