[EP Perf] MemTest: Add Valgrind and fix addressSanitizer (#16930)

### Description
1. Add Valgrind to the existing ep_perf CI MemTest and parse ORT-TRT memory-leak details
   1. The general Valgrind log and the ORT-TRT-related leak details are published in the [CI artifacts](https://aiinfra.visualstudio.com/Lotus/_build/results?buildId=334122&view=artifacts&pathAsName=false&type=publishedArtifacts)
      1. Logic:
         1. Run Valgrind on `onnxruntime_perf_test -e tensorrt` and write the log to `valgrind.log`
         2. Check whether any `definitely lost` memory leak was reported
            1. For each log paragraph that reports `definitely lost`, check whether it contains the keyword `TensorrtExecutionProvider`.
            2. If so, extract those paragraphs to `ort_trt_memleak_detail.log` and report a build failure to the EP Perf CI
3. Fix the existing AddressSanitizer test and sync the squeezenet test case with the latest update from
[ort-inference-example](https://github.com/microsoft/onnxruntime-inference-examples/blob/main/c_cxx/squeezenet/main.cpp)
   1. Updates in short: upgrade main.cpp to use `OrtTensorRTProviderOptionsV2`
4. Reorder the 7-minute MemTest to run ahead of the 9-hour model tests, and enable
MemTest by default
This commit is contained in:
Yifan Li 2023-08-04 16:58:57 -07:00 committed by GitHub
parent 5af8774a0b
commit d6ce43db5e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 160 additions and 252 deletions

View file

@ -2,7 +2,7 @@ project(alprdaemon)
set(CMAKE_BUILD_TYPE Debug)
cmake_minimum_required (VERSION 2.6)
cmake_minimum_required(VERSION 3.13)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

View file

@ -3,61 +3,31 @@
//
#include <assert.h>
#include <iostream>
#include <vector>
#include <onnxruntime_cxx_api.h>
#include <tensorrt_provider_factory.h>
// Builds a heap-allocated OrtTensorRTProviderOptions and assigns every field
// explicitly so no member is left uninitialized. The values set here select
// device 0, no user-supplied compute stream, and leave every optional feature
// (fp16, int8, DLA, subgraph dump, engine cache, engine decryption, forced
// sequential engine build, context memory sharing, layer-norm fp32 fallback)
// turned off.
//
// Returns: a unique_ptr owning the populated options struct.
std::unique_ptr<OrtTensorRTProviderOptions> get_default_trt_provider_options() {
auto tensorrt_options = std::make_unique<OrtTensorRTProviderOptions>();
// Device / stream selection.
tensorrt_options->device_id = 0;
tensorrt_options->has_user_compute_stream = 0;
tensorrt_options->user_compute_stream = nullptr;
// Partitioning and workspace limits (workspace is 1 << 30).
tensorrt_options->trt_max_partition_iterations = 1000;
tensorrt_options->trt_min_subgraph_size = 1;
tensorrt_options->trt_max_workspace_size = 1 << 30;
// Reduced-precision modes and int8 calibration: all off, no table name.
tensorrt_options->trt_fp16_enable = false;
tensorrt_options->trt_int8_enable = false;
tensorrt_options->trt_int8_calibration_table_name = "";
tensorrt_options->trt_int8_use_native_calibration_table = false;
// DLA is disabled; core index is still set to 0.
tensorrt_options->trt_dla_enable = false;
tensorrt_options->trt_dla_core = 0;
tensorrt_options->trt_dump_subgraphs = false;
// Engine caching / decryption: disabled with empty paths.
tensorrt_options->trt_engine_cache_enable = false;
tensorrt_options->trt_engine_cache_path = "";
tensorrt_options->trt_engine_decryption_enable = false;
tensorrt_options->trt_engine_decryption_lib_path = "";
tensorrt_options->trt_force_sequential_engine_build = false;
tensorrt_options->trt_context_memory_sharing_enable = false;
tensorrt_options->trt_layer_norm_fp32_fallback = false;
return tensorrt_options;
}
#include <tensorrt_provider_options.h>
void run_ort_trt2() {
Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "test");
const auto& api = Ort::GetApi();
OrtTensorRTProviderOptionsV2* tensorrt_options;
Ort::SessionOptions session_options;
session_options.SetIntraOpNumThreads(1);
session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
#ifdef _WIN32
const wchar_t* model_path = L"squeezenet.onnx";
const wchar_t* calib_table = L"squeezenet_calibration.flatbuffers";
#else
const char* model_path = "squeezenet.onnx";
const char* calib_table = "squeezenet_calibration.flatbuffers";
#endif
auto tensorrt_options = get_default_trt_provider_options();
Ort::ThrowOnError(api.CreateTensorRTProviderOptions(&tensorrt_options));
std::unique_ptr<OrtTensorRTProviderOptionsV2, decltype(api.ReleaseTensorRTProviderOptions)> rel_trt_options(
tensorrt_options, api.ReleaseTensorRTProviderOptions);
Ort::ThrowOnError(api.SessionOptionsAppendExecutionProvider_TensorRT_V2(static_cast<OrtSessionOptions*>(session_options),
rel_trt_options.get()));
tensorrt_options->trt_engine_cache_enable = true;
tensorrt_options->trt_int8_enable = true;
tensorrt_options->trt_fp16_enable = true;
tensorrt_options->trt_int8_calibration_table_name = calib_table;
session_options.AppendExecutionProvider_TensorRT(*tensorrt_options.get());
printf("Runing ORT TRT EP with:\n\tengine cache enabled\n\tfp16 enabled if supports\n\tint8 enabled if supports\n\tint8 calibration table provided\n");
printf("First run ...\n");
std::cout << "Running ORT TRT EP with default provider options" << std::endl;
Ort::Session session(env, model_path, session_options);
@ -66,53 +36,58 @@ void run_ort_trt2() {
Ort::AllocatorWithDefaultOptions allocator;
// print number of model input nodes
size_t num_input_nodes = session.GetInputCount();
std::vector<Ort::AllocatedStringPtr> input_node_names_ptr;
std::vector<const char*> input_node_names(num_input_nodes);
const size_t num_input_nodes = session.GetInputCount();
std::vector<Ort::AllocatedStringPtr> input_names_ptr;
std::vector<const char*> input_node_names;
input_names_ptr.reserve(num_input_nodes);
input_node_names.reserve(num_input_nodes);
std::vector<int64_t> input_node_dims; // simplify... this model has only 1 input node {1, 3, 224, 224}.
// Otherwise need vector<vector<>>
printf("Number of inputs = %zu\n", num_input_nodes);
std::cout << "Number of inputs = " << num_input_nodes << std::endl;
// iterate over all input nodes
for (int i = 0; i < num_input_nodes; i++) {
for (size_t i = 0; i < num_input_nodes; i++) {
// print input node names
auto input_name = session.GetInputNameAllocated(i, allocator);
printf("Input %d : name=%s\n", i, input_name.get());
input_node_names[i] = input_name.get();
input_node_names_ptr.push_back(std::move(input_name));
std::cout << "Input " << i << " : name =" << input_name.get() << std::endl;
input_node_names.push_back(input_name.get());
input_names_ptr.push_back(std::move(input_name));
// print input node types
Ort::TypeInfo type_info = session.GetInputTypeInfo(i);
auto type_info = session.GetInputTypeInfo(i);
auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
ONNXTensorElementDataType type = tensor_info.GetElementType();
printf("Input %d : type=%d\n", i, type);
std::cout << "Input " << i << " : type = " << type << std::endl;
// print input shapes/dims
input_node_dims = tensor_info.GetShape();
printf("Input %d : num_dims=%zu\n", i, input_node_dims.size());
for (int j = 0; j < input_node_dims.size(); j++)
printf("Input %d : dim %d=%jd\n", i, j, input_node_dims[j]);
std::cout << "Input " << i << " : num_dims = " << input_node_dims.size() << '\n';
for (size_t j = 0; j < input_node_dims.size(); j++) {
std::cout << "Input " << i << " : dim[" << j << "] =" << input_node_dims[j] << '\n';
}
std::cout << std::flush;
}
size_t input_tensor_size = 224 * 224 * 3; // simplify ... using known dim values to calculate size
// use OrtGetTensorShapeElementCount() to get official size!
constexpr size_t input_tensor_size = 224 * 224 * 3; // simplify ... using known dim values to calculate size
// use OrtGetTensorShapeElementCount() to get official size!
std::vector<float> input_tensor_values(input_tensor_size);
std::vector<const char*> output_node_names = {"softmaxout_1"};
// initialize input data with values in [0.0, 1.0]
for (unsigned int i = 0; i < input_tensor_size; i++)
input_tensor_values[i] = (float)i / (input_tensor_size + 1);
for (unsigned int i = 0; i < input_tensor_size; i++) input_tensor_values[i] = (float)i / (input_tensor_size + 1);
// create input tensor object from data values
auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
Ort::Value input_tensor = Ort::Value::CreateTensor<float>(memory_info, input_tensor_values.data(), input_tensor_size, input_node_dims.data(), 4);
auto input_tensor = Ort::Value::CreateTensor<float>(memory_info, input_tensor_values.data(), input_tensor_size,
input_node_dims.data(), 4);
assert(input_tensor.IsTensor());
// score model & input tensor, get back output tensor
auto output_tensors = session.Run(Ort::RunOptions{nullptr}, input_node_names.data(), &input_tensor, 1, output_node_names.data(), 1);
auto output_tensors =
session.Run(Ort::RunOptions{nullptr}, input_node_names.data(), &input_tensor, 1, output_node_names.data(), 1);
assert(output_tensors.size() == 1 && output_tensors.front().IsTensor());
// Get pointer to output tensor float values
@ -120,8 +95,9 @@ void run_ort_trt2() {
assert(abs(floatarr[0] - 0.000045) < 1e-6);
// score the model, and print scores for first 5 classes
for (int i = 0; i < 5; i++)
printf("Score for class [%d] = %f\n", i, floatarr[i]);
for (int i = 0; i < 5; i++) {
std::cout << "Score for class [" << i << "] = " << floatarr[i] << '\n';
}
// Results should be as below...
// Score for class[0] = 0.000045
@ -131,7 +107,7 @@ void run_ort_trt2() {
// Score for class[4] = 0.001317
// we need another run in order to make TRT EP use engine cache
printf("Second run ...\n");
std::cout << "Second run ...\n";
// score model & input tensor, get back output tensor
output_tensors = session.Run(Ort::RunOptions{nullptr}, input_node_names.data(), &input_tensor, 1, output_node_names.data(), 1);
@ -142,165 +118,32 @@ void run_ort_trt2() {
assert(abs(floatarr[0] - 0.000045) < 1e-6);
// score the model, and print scores for first 5 classes
for (int i = 0; i < 5; i++)
printf("Score for class [%d] = %f\n", i, floatarr[i]);
// release buffers allocated by ORT allocator
for (const char* node_name : input_node_names)
allocator.Free(const_cast<void*>(reinterpret_cast<const void*>(node_name)));
printf("Done!\n");
}
void ort_trt_run_with_default_options() {
//*************************************************************************
// initialize environment...one environment per process
// environment maintains thread pools and other state info
Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "test");
// initialize session options if needed
Ort::SessionOptions session_options;
session_options.SetIntraOpNumThreads(1);
// If onnxruntime.dll is built with CUDA enabled, we can uncomment out this line to use CUDA for this session
// OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 1);
OrtSessionOptionsAppendExecutionProvider_Tensorrt(session_options, 0);
// Sets graph optimization level
// Available levels are
// ORT_DISABLE_ALL -> To disable all optimizations
// ORT_ENABLE_BASIC -> To enable basic optimizations (Such as redundant node removals)
// ORT_ENABLE_EXTENDED -> To enable extended optimizations (Includes level 1 + more complex optimizations like node fusions)
// ORT_ENABLE_ALL -> To Enable All possible optimizations
session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
//*************************************************************************
// create session and load model into memory
// using squeezenet version 1.3
// URL = https://github.com/onnx/models/tree/master/squeezenet
#ifdef _WIN32
const wchar_t* model_path = L"squeezenet.onnx";
#else
const char* model_path = "squeezenet.onnx";
#endif
printf("Using Onnxruntime C++ API\n");
Ort::Session session(env, model_path, session_options);
//*************************************************************************
// print model input layer (node names, types, shape etc.)
Ort::AllocatorWithDefaultOptions allocator;
// print number of model input nodes
size_t num_input_nodes = session.GetInputCount();
std::vector<Ort::AllocatedStringPtr> input_node_names_ptr;
std::vector<const char*> input_node_names(num_input_nodes);
std::vector<int64_t> input_node_dims; // simplify... this model has only 1 input node {1, 3, 224, 224}.
// Otherwise need vector<vector<>>
printf("Number of inputs = %zu\n", num_input_nodes);
// iterate over all input nodes
for (int i = 0; i < num_input_nodes; i++) {
// print input node names
auto input_name = session.GetInputNameAllocated(i, allocator);
printf("Input %d : name=%s\n", i, input_name.get());
input_node_names[i] = input_name.get();
input_node_names_ptr.push_back(std::move(input_name));
// print input node types
Ort::TypeInfo type_info = session.GetInputTypeInfo(i);
auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
ONNXTensorElementDataType type = tensor_info.GetElementType();
printf("Input %d : type=%d\n", i, type);
// print input shapes/dims
input_node_dims = tensor_info.GetShape();
printf("Input %d : num_dims=%zu\n", i, input_node_dims.size());
for (int j = 0; j < input_node_dims.size(); j++)
printf("Input %d : dim %d=%jd\n", i, j, input_node_dims[j]);
for (int i = 0; i < 5; i++) {
std::cout << "Score for class [" << i << "] = " << floatarr[i] << '\n';
}
// Results should be...
// Number of inputs = 1
// Input 0 : name = data_0
// Input 0 : type = 1
// Input 0 : num_dims = 4
// Input 0 : dim 0 = 1
// Input 0 : dim 1 = 3
// Input 0 : dim 2 = 224
// Input 0 : dim 3 = 224
//*************************************************************************
// Similar operations to get output node information.
// Use OrtSessionGetOutputCount(), OrtSessionGetOutputName()
// OrtSessionGetOutputTypeInfo() as shown above.
//*************************************************************************
// Score the model using sample data, and inspect values
size_t input_tensor_size = 224 * 224 * 3; // simplify ... using known dim values to calculate size
// use OrtGetTensorShapeElementCount() to get official size!
std::vector<float> input_tensor_values(input_tensor_size);
std::vector<const char*> output_node_names = {"softmaxout_1"};
// initialize input data with values in [0.0, 1.0]
for (unsigned int i = 0; i < input_tensor_size; i++)
input_tensor_values[i] = (float)i / (input_tensor_size + 1);
// create input tensor object from data values
auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
Ort::Value input_tensor = Ort::Value::CreateTensor<float>(memory_info, input_tensor_values.data(), input_tensor_size, input_node_dims.data(), 4);
assert(input_tensor.IsTensor());
// score model & input tensor, get back output tensor
auto output_tensors = session.Run(Ort::RunOptions{nullptr}, input_node_names.data(), &input_tensor, 1, output_node_names.data(), 1);
assert(output_tensors.size() == 1 && output_tensors.front().IsTensor());
// Get pointer to output tensor float values
float* floatarr = output_tensors.front().GetTensorMutableData<float>();
assert(abs(floatarr[0] - 0.000045) < 1e-6);
// score the model, and print scores for first 5 classes
for (int i = 0; i < 5; i++)
printf("Score for class [%d] = %f\n", i, floatarr[i]);
// Results should be as below...
// Score for class[0] = 0.000045
// Score for class[1] = 0.003846
// Score for class[2] = 0.000125
// Score for class[3] = 0.001180
// Score for class[4] = 0.001317
// release buffers allocated by ORT allocator
for (const char* node_name : input_node_names)
allocator.Free(const_cast<void*>(reinterpret_cast<const void*>(node_name)));
printf("Done!\n");
std::cout << "Done!" << std::endl;
}
void run_ort_trt() {
Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "test");
const auto& api = Ort::GetApi();
OrtTensorRTProviderOptionsV2* tensorrt_options;
Ort::SessionOptions session_options;
session_options.SetIntraOpNumThreads(1);
session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
#ifdef _WIN32
const wchar_t* model_path = L"squeezenet.onnx";
const wchar_t* calib_table = L"squeezenet_calibration.flatbuffers";
#else
const char* model_path = "squeezenet.onnx";
const char* calib_table = "squeezenet_calibration.flatbuffers";
#endif
auto tensorrt_options = get_default_trt_provider_options();
Ort::ThrowOnError(api.CreateTensorRTProviderOptions(&tensorrt_options));
std::unique_ptr<OrtTensorRTProviderOptionsV2, decltype(api.ReleaseTensorRTProviderOptions)> rel_trt_options(
tensorrt_options, api.ReleaseTensorRTProviderOptions);
Ort::ThrowOnError(api.SessionOptionsAppendExecutionProvider_TensorRT_V2(static_cast<OrtSessionOptions*>(session_options),
rel_trt_options.get()));
session_options.AppendExecutionProvider_TensorRT(*tensorrt_options.get());
printf("Runing ORT TRT EP with default provider options\n");
std::cout << "Running ORT TRT EP with default provider options" << std::endl;
Ort::Session session(env, model_path, session_options);
@ -309,53 +152,58 @@ void run_ort_trt() {
Ort::AllocatorWithDefaultOptions allocator;
// print number of model input nodes
size_t num_input_nodes = session.GetInputCount();
std::vector<Ort::AllocatedStringPtr> input_node_names_ptr;
std::vector<const char*> input_node_names(num_input_nodes);
const size_t num_input_nodes = session.GetInputCount();
std::vector<Ort::AllocatedStringPtr> input_names_ptr;
std::vector<const char*> input_node_names;
input_names_ptr.reserve(num_input_nodes);
input_node_names.reserve(num_input_nodes);
std::vector<int64_t> input_node_dims; // simplify... this model has only 1 input node {1, 3, 224, 224}.
// Otherwise need vector<vector<>>
printf("Number of inputs = %zu\n", num_input_nodes);
std::cout << "Number of inputs = " << num_input_nodes << std::endl;
// iterate over all input nodes
for (int i = 0; i < num_input_nodes; i++) {
for (size_t i = 0; i < num_input_nodes; i++) {
// print input node names
auto input_name = session.GetInputNameAllocated(i, allocator);
printf("Input %d : name=%s\n", i, input_name.get());
input_node_names[i] = input_name.get();
input_node_names_ptr.push_back(std::move(input_name));
std::cout << "Input " << i << " : name =" << input_name.get() << std::endl;
input_node_names.push_back(input_name.get());
input_names_ptr.push_back(std::move(input_name));
// print input node types
Ort::TypeInfo type_info = session.GetInputTypeInfo(i);
auto type_info = session.GetInputTypeInfo(i);
auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
ONNXTensorElementDataType type = tensor_info.GetElementType();
printf("Input %d : type=%d\n", i, type);
std::cout << "Input " << i << " : type = " << type << std::endl;
// print input shapes/dims
input_node_dims = tensor_info.GetShape();
printf("Input %d : num_dims=%zu\n", i, input_node_dims.size());
for (int j = 0; j < input_node_dims.size(); j++)
printf("Input %d : dim %d=%jd\n", i, j, input_node_dims[j]);
std::cout << "Input " << i << " : num_dims = " << input_node_dims.size() << '\n';
for (size_t j = 0; j < input_node_dims.size(); j++) {
std::cout << "Input " << i << " : dim[" << j << "] =" << input_node_dims[j] << '\n';
}
std::cout << std::flush;
}
size_t input_tensor_size = 224 * 224 * 3; // simplify ... using known dim values to calculate size
// use OrtGetTensorShapeElementCount() to get official size!
constexpr size_t input_tensor_size = 224 * 224 * 3; // simplify ... using known dim values to calculate size
// use OrtGetTensorShapeElementCount() to get official size!
std::vector<float> input_tensor_values(input_tensor_size);
std::vector<const char*> output_node_names = {"softmaxout_1"};
// initialize input data with values in [0.0, 1.0]
for (unsigned int i = 0; i < input_tensor_size; i++)
input_tensor_values[i] = (float)i / (input_tensor_size + 1);
for (unsigned int i = 0; i < input_tensor_size; i++) input_tensor_values[i] = (float)i / (input_tensor_size + 1);
// create input tensor object from data values
auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
Ort::Value input_tensor = Ort::Value::CreateTensor<float>(memory_info, input_tensor_values.data(), input_tensor_size, input_node_dims.data(), 4);
auto input_tensor = Ort::Value::CreateTensor<float>(memory_info, input_tensor_values.data(), input_tensor_size,
input_node_dims.data(), 4);
assert(input_tensor.IsTensor());
// score model & input tensor, get back output tensor
auto output_tensors = session.Run(Ort::RunOptions{nullptr}, input_node_names.data(), &input_tensor, 1, output_node_names.data(), 1);
auto output_tensors =
session.Run(Ort::RunOptions{nullptr}, input_node_names.data(), &input_tensor, 1, output_node_names.data(), 1);
assert(output_tensors.size() == 1 && output_tensors.front().IsTensor());
// Get pointer to output tensor float values
@ -363,8 +211,10 @@ void run_ort_trt() {
assert(abs(floatarr[0] - 0.000045) < 1e-6);
// score the model, and print scores for first 5 classes
for (int i = 0; i < 5; i++)
printf("Score for class [%d] = %f\n", i, floatarr[i]);
for (int i = 0; i < 5; i++) {
std::cout << "Score for class [" << i << "] = " << floatarr[i] << '\n';
}
std::cout << std::flush;
// Results should be as below...
// Score for class[0] = 0.000045
@ -373,15 +223,10 @@ void run_ort_trt() {
// Score for class[3] = 0.001180
// Score for class[4] = 0.001317
// release buffers allocated by ORT allocator
for (const char* node_name : input_node_names)
allocator.Free(const_cast<void*>(reinterpret_cast<const void*>(node_name)));
printf("Done!\n");
std::cout << "Done!" << std::endl;
}
int main(int argc, char* argv[]) {
run_ort_trt();
run_ort_trt2();
return 0;
}

View file

@ -14,9 +14,10 @@ s) ORT_SOURCE=${OPTARG};;
esac
done
ONNX_MODEL_URL="https://github.com/onnx/models/raw/master/vision/classification/squeezenet/model/squeezenet1.0-7.onnx"
ONNX_MODEL_TAR_URL="https://github.com/onnx/models/raw/main/vision/classification/squeezenet/model/squeezenet1.0-7.tar.gz"
MODEL_TAR_NAME="squeezenet1.0-7.tar.gz"
ONNX_MODEL="squeezenet.onnx"
ASAN_OPTIONS="protect_shadow_gap=0:log_path=asan.log"
ASAN_OPTIONS="protect_shadow_gap=0:new_delete_type_mismatch=0:log_path=asan.log"
export LD_LIBRARY_PATH=${ORT_BINARY_PATH}
export LIBRARY_PATH=${ORT_BINARY_PATH}
@ -46,21 +47,76 @@ cd build
cp ../squeezenet_calibration.flatbuffers .
cmake ..
make -j8
wget ${ONNX_MODEL_URL} -O ${ONNX_MODEL}
ASAN_OPTIONS=${ASAN_OPTIONS} ./onnx_memtest
make -j
wget ${ONNX_MODEL_TAR_URL} -O squeezenet1.0-7.tar.gz
tar -xzf ${MODEL_TAR_NAME} --strip-components=1
mv model.onnx ${ONNX_MODEL}
rm ${MODEL_TAR_NAME}
mkdir result
if [ $? -ne 0 ]
then
echo "Memory test application failed."
exit 1
# Run valgrind
echo $(date +"%Y-%m-%d %H:%M:%S") '[valgrind] Starting memcheck with' ${ONNX_MODEL}
valgrind --leak-check=full --show-leak-kinds=all --log-file=valgrind.log ${ORT_SOURCE}/build/Linux/Release/onnxruntime_perf_test -e tensorrt -r 1 ${ONNX_MODEL}
echo $(date +"%Y-%m-%d %H:%M:%S") '[valgrind] Analyzing valgrind log'
# Scan valgrind.log line by line for the "LEAK SUMMARY:" marker, then inspect
# the "definitely lost:" line that follows it.
#   found_leak_summary - true between seeing "LEAK SUMMARY:" and the next
#                        "definitely lost:" line; reset afterwards.
#   is_mem_leaked      - set to true when the reported block count is not
#                        exactly "0 blocks"; read by the ORT-TRT detail step.
found_leak_summary=false
is_mem_leaked=false
while IFS= read -r line
do
if echo $line | grep -q 'LEAK SUMMARY:'; then
found_leak_summary=true
# Only a "definitely lost:" line that comes after a LEAK SUMMARY header counts.
elif $found_leak_summary && echo $line | grep -q 'definitely lost:'; then
# Pull the "<N> bytes" and "<N> blocks" fragments out of the summary line.
bytes_lost=$(echo $line | grep -o -E '[0-9,]+ bytes')
blocks_lost=$(echo $line | grep -o -E '[0-9]+ blocks')
echo "Bytes lost: $bytes_lost"
echo "Blocks lost: $blocks_lost"
if [ "$blocks_lost" != "0 blocks" ]; then
echo $(date +"%Y-%m-%d %H:%M:%S") '[valgrind] Memory leak happened when testing squeezenet model! Checking if it is ORT-TRT related'
is_mem_leaked=true
fi
# Summary handled; stop matching until another LEAK SUMMARY header appears.
found_leak_summary=false
fi
done < "valgrind.log"
# Export ORT-TRT memleak detail log if available
# Runs only when the summary scan set is_mem_leaked=true. The awk program
# strips the "==<pid>== " prefix from each valgrind.log line, buffers every
# paragraph that starts with "blocks are definitely lost in loss", and prints
# the paragraph only if one of its lines mentions TensorrtExecutionProvider.
# Output goes to ort_trt_memleak_detail.log.
if [ "$is_mem_leaked" = "true" ]; then
awk '
BEGIN {buffer=""; isDefinitelyLost=0; isOrtTrtRelated=0}
# substitute "==xxxxx==" with ""
{sub(/==[0-9]+== /, "")}
# Start caching lines when isDefinitelyLost
/blocks are definitely lost in loss/ {isDefinitelyLost = 1; buffer=""; isOrtTrtRelated=0}
# Cache this line when isDefinitelyLost and line!=""
# isOrtTrtRelated=1 when "TensorrtExecutionProvider" is found
isDefinitelyLost && $0 != "" {buffer = buffer "\n" $0; if($0 ~ /TensorrtExecutionProvider/) {isOrtTrtRelated=1}}
# Stop caching and export buffer when isDefinitelyLost, line=="" and isOrtTrtRelated
isDefinitelyLost && $0 == "" {isDefinitelyLost = 0; if(isOrtTrtRelated==1) {print buffer}}
' valgrind.log > ort_trt_memleak_detail.log
# Check if any ORT-TRT related memleak info has been parsed
# A non-empty detail log (-s) means at least one leak paragraph matched:
# keep it under result/ and fail the script with exit status 1.
# NOTE(review): this exit happens before the later `mv valgrind.log result`
# step, so the full Valgrind log does not appear to be copied on this path -
# confirm that is intended.
if [ -s ort_trt_memleak_detail.log ]; then
mv ort_trt_memleak_detail.log result
echo $(date +"%Y-%m-%d %H:%M:%S") '[valgrind] ORT-TRT memleak detail log parsed in CI artifact: ort_trt_memleak_detail.log'
exit 1
else
# Leaks were reported but none matched the keyword: drop the empty file.
rm ort_trt_memleak_detail.log
fi
fi
mkdir result
mv valgrind.log result
# Run AddressSanitizer
ASAN_OPTIONS=${ASAN_OPTIONS} ./onnx_memtest
if [ -e asan.log* ]
then
cat asan.log*
mv asan.log* result
else
echo "No memory Leak(s) or other memory error(s) detected." > result/asan.log
fi
echo $(date +"%Y-%m-%d %H:%M:%S") "[AddressSanitizer] No memory Leak(s) or other memory error(s) detected." > result/asan.log
fi

View file

@ -24,7 +24,7 @@ parameters:
- name: MemTest
displayName: Run Memory Test
type: boolean
default: false
default: true
- name: TrtEPOptions
displayName: TensorRT EP options
@ -87,17 +87,17 @@ jobs:
- script: 'python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/build/build_image.py -r $(Build.SourcesDirectory) -i $(image) -b $(branchName) -t $(trtVersion) -a 75'
displayName: 'Build latest ORT Image'
workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/build'
- ${{ each option in parameters.ModelGroups }}:
- script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/run_perf_docker.sh -d $(image) -p $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf -v $(modelVolume) -b true -o ${{option}} -m $(${{option}}) -e "$(epList)" $(optional_arguments)'
displayName: '${{option}} perf'
workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/'
- ${{ if eq(parameters.MemTest, true) }}:
- script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/run_mem_test_docker.sh -d $(image) -p $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/ -w /code/ -l false'
displayName: 'Run Memory Test'
workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/'
- ${{ each option in parameters.ModelGroups }}:
- script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/run_perf_docker.sh -d $(image) -p $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf -v $(modelVolume) -b true -o ${{option}} -m $(${{option}}) -e "$(epList)" $(optional_arguments)'
displayName: '${{option}} perf'
workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/'
# Prepare and Publish Artifacts
- script: 'mkdir $(Build.SourcesDirectory)/Artifact'

View file

@ -44,6 +44,9 @@ RUN v="8.4.1-1+cuda11.6" &&\
# Compile trtexec
RUN cd /usr/src/tensorrt/samples/trtexec && make
# Install Valgrind
RUN apt-get install -y valgrind
ARG BUILD_USER=onnxruntimedev
ARG BUILD_UID=1000
RUN adduser --gecos 'onnxruntime Build User' --disabled-password $BUILD_USER --uid $BUILD_UID

View file

@ -41,6 +41,8 @@ RUN v="8.5.1-1+cuda11.8" &&\
# Compile trtexec
RUN cd /usr/src/tensorrt/samples/trtexec && make
# Install Valgrind
RUN apt-get install -y valgrind
# Build final image from base. Builds ORT.
FROM base as final

View file

@ -41,6 +41,8 @@ RUN v="8.6.1.6-1+cuda11.8" &&\
# Compile trtexec
RUN cd /usr/src/tensorrt/samples/trtexec && make
# Install Valgrind
RUN apt-get install -y valgrind
# Build final image from base. Builds ORT.
FROM base as final