diff --git a/onnxruntime/python/tools/tensorrt/perf/mem_test/CMakeLists.txt b/onnxruntime/python/tools/tensorrt/perf/mem_test/CMakeLists.txt index 2d969cf80b..d77a763396 100644 --- a/onnxruntime/python/tools/tensorrt/perf/mem_test/CMakeLists.txt +++ b/onnxruntime/python/tools/tensorrt/perf/mem_test/CMakeLists.txt @@ -2,7 +2,7 @@ project(alprdaemon) set(CMAKE_BUILD_TYPE Debug) -cmake_minimum_required (VERSION 2.6) +cmake_minimum_required(VERSION 3.13) set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD_REQUIRED ON) diff --git a/onnxruntime/python/tools/tensorrt/perf/mem_test/main.cpp b/onnxruntime/python/tools/tensorrt/perf/mem_test/main.cpp index b0b8d09cf9..61d5440690 100644 --- a/onnxruntime/python/tools/tensorrt/perf/mem_test/main.cpp +++ b/onnxruntime/python/tools/tensorrt/perf/mem_test/main.cpp @@ -3,61 +3,31 @@ // #include +#include #include #include #include - -std::unique_ptr get_default_trt_provider_options() { - auto tensorrt_options = std::make_unique(); - tensorrt_options->device_id = 0; - tensorrt_options->has_user_compute_stream = 0; - tensorrt_options->user_compute_stream = nullptr; - tensorrt_options->trt_max_partition_iterations = 1000; - tensorrt_options->trt_min_subgraph_size = 1; - tensorrt_options->trt_max_workspace_size = 1 << 30; - tensorrt_options->trt_fp16_enable = false; - tensorrt_options->trt_int8_enable = false; - tensorrt_options->trt_int8_calibration_table_name = ""; - tensorrt_options->trt_int8_use_native_calibration_table = false; - tensorrt_options->trt_dla_enable = false; - tensorrt_options->trt_dla_core = 0; - tensorrt_options->trt_dump_subgraphs = false; - tensorrt_options->trt_engine_cache_enable = false; - tensorrt_options->trt_engine_cache_path = ""; - tensorrt_options->trt_engine_decryption_enable = false; - tensorrt_options->trt_engine_decryption_lib_path = ""; - tensorrt_options->trt_force_sequential_engine_build = false; - tensorrt_options->trt_context_memory_sharing_enable = false; - tensorrt_options->trt_layer_norm_fp32_fallback = false; - return tensorrt_options; -} +#include void run_ort_trt2() { Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "test"); + const auto& api = Ort::GetApi(); + OrtTensorRTProviderOptionsV2* tensorrt_options; Ort::SessionOptions session_options; session_options.SetIntraOpNumThreads(1); session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED); -#ifdef _WIN32 - const wchar_t* model_path = L"squeezenet.onnx"; - const wchar_t* calib_table = L"squeezenet_calibration.flatbuffers"; -#else const char* model_path = "squeezenet.onnx"; - const char* calib_table = "squeezenet_calibration.flatbuffers"; -#endif - auto tensorrt_options = get_default_trt_provider_options(); + Ort::ThrowOnError(api.CreateTensorRTProviderOptions(&tensorrt_options)); + std::unique_ptr rel_trt_options( + tensorrt_options, api.ReleaseTensorRTProviderOptions); + Ort::ThrowOnError(api.SessionOptionsAppendExecutionProvider_TensorRT_V2(static_cast(session_options), + rel_trt_options.get())); - tensorrt_options->trt_engine_cache_enable = true; - tensorrt_options->trt_int8_enable = true; - tensorrt_options->trt_fp16_enable = true; - tensorrt_options->trt_int8_calibration_table_name = calib_table; - - session_options.AppendExecutionProvider_TensorRT(*tensorrt_options.get()); - printf("Runing ORT TRT EP with:\n\tengine cache enabled\n\tfp16 enabled if supports\n\tint8 enabled if supports\n\tint8 calibration table provided\n"); - printf("First run ...\n"); + std::cout << "Running ORT TRT EP with default provider options" << std::endl; Ort::Session session(env, model_path, session_options); @@ -66,53 +36,58 @@ void run_ort_trt2() { Ort::AllocatorWithDefaultOptions allocator; // print number of model input nodes - size_t num_input_nodes = session.GetInputCount(); - std::vector input_node_names_ptr; - std::vector input_node_names(num_input_nodes); + const size_t num_input_nodes = session.GetInputCount(); + std::vector input_names_ptr; + std::vector input_node_names; + input_names_ptr.reserve(num_input_nodes); + input_node_names.reserve(num_input_nodes); std::vector input_node_dims; // simplify... this model has only 1 input node {1, 3, 224, 224}. // Otherwise need vector> - printf("Number of inputs = %zu\n", num_input_nodes); + std::cout << "Number of inputs = " << num_input_nodes << std::endl; // iterate over all input nodes - for (int i = 0; i < num_input_nodes; i++) { + for (size_t i = 0; i < num_input_nodes; i++) { // print input node names auto input_name = session.GetInputNameAllocated(i, allocator); - printf("Input %d : name=%s\n", i, input_name.get()); - input_node_names[i] = input_name.get(); - input_node_names_ptr.push_back(std::move(input_name)); + std::cout << "Input " << i << " : name =" << input_name.get() << std::endl; + input_node_names.push_back(input_name.get()); + input_names_ptr.push_back(std::move(input_name)); // print input node types - Ort::TypeInfo type_info = session.GetInputTypeInfo(i); + auto type_info = session.GetInputTypeInfo(i); auto tensor_info = type_info.GetTensorTypeAndShapeInfo(); ONNXTensorElementDataType type = tensor_info.GetElementType(); - printf("Input %d : type=%d\n", i, type); + std::cout << "Input " << i << " : type = " << type << std::endl; // print input shapes/dims input_node_dims = tensor_info.GetShape(); - printf("Input %d : num_dims=%zu\n", i, input_node_dims.size()); - for (int j = 0; j < input_node_dims.size(); j++) - printf("Input %d : dim %d=%jd\n", i, j, input_node_dims[j]); + std::cout << "Input " << i << " : num_dims = " << input_node_dims.size() << '\n'; + for (size_t j = 0; j < input_node_dims.size(); j++) { + std::cout << "Input " << i << " : dim[" << j << "] =" << input_node_dims[j] << '\n'; + } + std::cout << std::flush; } - size_t input_tensor_size = 224 * 224 * 3; // simplify ... using known dim values to calculate size - // use OrtGetTensorShapeElementCount() to get official size! + constexpr size_t input_tensor_size = 224 * 224 * 3; // simplify ... using known dim values to calculate size + // use OrtGetTensorShapeElementCount() to get official size! std::vector input_tensor_values(input_tensor_size); std::vector output_node_names = {"softmaxout_1"}; // initialize input data with values in [0.0, 1.0] - for (unsigned int i = 0; i < input_tensor_size; i++) - input_tensor_values[i] = (float)i / (input_tensor_size + 1); + for (unsigned int i = 0; i < input_tensor_size; i++) input_tensor_values[i] = (float)i / (input_tensor_size + 1); // create input tensor object from data values auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); - Ort::Value input_tensor = Ort::Value::CreateTensor(memory_info, input_tensor_values.data(), input_tensor_size, input_node_dims.data(), 4); + auto input_tensor = Ort::Value::CreateTensor(memory_info, input_tensor_values.data(), input_tensor_size, + input_node_dims.data(), 4); assert(input_tensor.IsTensor()); // score model & input tensor, get back output tensor - auto output_tensors = session.Run(Ort::RunOptions{nullptr}, input_node_names.data(), &input_tensor, 1, output_node_names.data(), 1); + auto output_tensors = + session.Run(Ort::RunOptions{nullptr}, input_node_names.data(), &input_tensor, 1, output_node_names.data(), 1); assert(output_tensors.size() == 1 && output_tensors.front().IsTensor()); // Get pointer to output tensor float values @@ -120,8 +95,9 @@ void run_ort_trt2() { assert(abs(floatarr[0] - 0.000045) < 1e-6); // score the model, and print scores for first 5 classes - for (int i = 0; i < 5; i++) - printf("Score for class [%d] = %f\n", i, floatarr[i]); + for (int i = 0; i < 5; i++) { + std::cout << "Score for class [" << i << "] = " << floatarr[i] << '\n'; + } // Results should be as below... // Score for class[0] = 0.000045 @@ -131,7 +107,7 @@ void run_ort_trt2() { // Score for class[4] = 0.001317 // we need another run in order to make TRT EP use engine cache - printf("Second run ...\n"); + std::cout << "Second run ...\n"; // score model & input tensor, get back output tensor output_tensors = session.Run(Ort::RunOptions{nullptr}, input_node_names.data(), &input_tensor, 1, output_node_names.data(), 1); @@ -142,165 +118,32 @@ void run_ort_trt2() { assert(abs(floatarr[0] - 0.000045) < 1e-6); // score the model, and print scores for first 5 classes - for (int i = 0; i < 5; i++) - printf("Score for class [%d] = %f\n", i, floatarr[i]); - - // release buffers allocated by ORT alloctor - for (const char* node_name : input_node_names) - allocator.Free(const_cast(reinterpret_cast(node_name))); - - printf("Done!\n"); -} - -void ort_trt_run_with_default_options() { - //************************************************************************* - // initialize environment...one environment per process - // environment maintains thread pools and other state info - Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "test"); - - // initialize session options if needed - Ort::SessionOptions session_options; - session_options.SetIntraOpNumThreads(1); - - // If onnxruntime.dll is built with CUDA enabled, we can uncomment out this line to use CUDA for this session - // OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 1); - OrtSessionOptionsAppendExecutionProvider_Tensorrt(session_options, 0); - - // Sets graph optimization level - // Available levels are - // ORT_DISABLE_ALL -> To disable all optimizations - // ORT_ENABLE_BASIC -> To enable basic optimizations (Such as redundant node removals) - // ORT_ENABLE_EXTENDED -> To enable extended optimizations (Includes level 1 + more complex optimizations like node fusions) - // ORT_ENABLE_ALL -> To Enable All possible opitmizations - session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED); - - //************************************************************************* - // create session and load model into memory - // using squeezenet version 1.3 - // URL = https://github.com/onnx/models/tree/master/squeezenet -#ifdef _WIN32 - const wchar_t* model_path = L"squeezenet.onnx"; -#else - const char* model_path = "squeezenet.onnx"; -#endif - - printf("Using Onnxruntime C++ API\n"); - Ort::Session session(env, model_path, session_options); - - //************************************************************************* - // print model input layer (node names, types, shape etc.) - Ort::AllocatorWithDefaultOptions allocator; - - // print number of model input nodes - size_t num_input_nodes = session.GetInputCount(); - std::vector input_node_names_ptr; - std::vector input_node_names(num_input_nodes); - std::vector input_node_dims; // simplify... this model has only 1 input node {1, 3, 224, 224}. - // Otherwise need vector> - - printf("Number of inputs = %zu\n", num_input_nodes); - - // iterate over all input nodes - for (int i = 0; i < num_input_nodes; i++) { - // print input node names - auto input_name = session.GetInputNameAllocated(i, allocator); - printf("Input %d : name=%s\n", i, input_name.get()); - input_node_names[i] = input_name.get(); - input_node_names_ptr.push_back(std::move(input_name)); - - // print input node types - Ort::TypeInfo type_info = session.GetInputTypeInfo(i); - auto tensor_info = type_info.GetTensorTypeAndShapeInfo(); - - ONNXTensorElementDataType type = tensor_info.GetElementType(); - printf("Input %d : type=%d\n", i, type); - - // print input shapes/dims - input_node_dims = tensor_info.GetShape(); - printf("Input %d : num_dims=%zu\n", i, input_node_dims.size()); - for (int j = 0; j < input_node_dims.size(); j++) - printf("Input %d : dim %d=%jd\n", i, j, input_node_dims[j]); + for (int i = 0; i < 5; i++) { + std::cout << "Score for class [" << i << "] = " << floatarr[i] << '\n'; } - // Results should be... - // Number of inputs = 1 - // Input 0 : name = data_0 - // Input 0 : type = 1 - // Input 0 : num_dims = 4 - // Input 0 : dim 0 = 1 - // Input 0 : dim 1 = 3 - // Input 0 : dim 2 = 224 - // Input 0 : dim 3 = 224 - - //************************************************************************* - // Similar operations to get output node information. - // Use OrtSessionGetOutputCount(), OrtSessionGetOutputName() - // OrtSessionGetOutputTypeInfo() as shown above. - - //************************************************************************* - // Score the model using sample data, and inspect values - - size_t input_tensor_size = 224 * 224 * 3; // simplify ... using known dim values to calculate size - // use OrtGetTensorShapeElementCount() to get official size! - - std::vector input_tensor_values(input_tensor_size); - std::vector output_node_names = {"softmaxout_1"}; - - // initialize input data with values in [0.0, 1.0] - for (unsigned int i = 0; i < input_tensor_size; i++) - input_tensor_values[i] = (float)i / (input_tensor_size + 1); - - // create input tensor object from data values - auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); - Ort::Value input_tensor = Ort::Value::CreateTensor(memory_info, input_tensor_values.data(), input_tensor_size, input_node_dims.data(), 4); - assert(input_tensor.IsTensor()); - - // score model & input tensor, get back output tensor - auto output_tensors = session.Run(Ort::RunOptions{nullptr}, input_node_names.data(), &input_tensor, 1, output_node_names.data(), 1); - assert(output_tensors.size() == 1 && output_tensors.front().IsTensor()); - - // Get pointer to output tensor float values - float* floatarr = output_tensors.front().GetTensorMutableData(); - assert(abs(floatarr[0] - 0.000045) < 1e-6); - - // score the model, and print scores for first 5 classes - for (int i = 0; i < 5; i++) - printf("Score for class [%d] = %f\n", i, floatarr[i]); - - // Results should be as below... - // Score for class[0] = 0.000045 - // Score for class[1] = 0.003846 - // Score for class[2] = 0.000125 - // Score for class[3] = 0.001180 - // Score for class[4] = 0.001317 - - // release buffers allocated by ORT alloctor - for (const char* node_name : input_node_names) - allocator.Free(const_cast(reinterpret_cast(node_name))); - - printf("Done!\n"); + std::cout << "Done!" << std::endl; } void run_ort_trt() { Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "test"); + const auto& api = Ort::GetApi(); + OrtTensorRTProviderOptionsV2* tensorrt_options; Ort::SessionOptions session_options; session_options.SetIntraOpNumThreads(1); session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED); -#ifdef _WIN32 - const wchar_t* model_path = L"squeezenet.onnx"; - const wchar_t* calib_table = L"squeezenet_calibration.flatbuffers"; -#else const char* model_path = "squeezenet.onnx"; - const char* calib_table = "squeezenet_calibration.flatbuffers"; -#endif - auto tensorrt_options = get_default_trt_provider_options(); + Ort::ThrowOnError(api.CreateTensorRTProviderOptions(&tensorrt_options)); + std::unique_ptr rel_trt_options( + tensorrt_options, api.ReleaseTensorRTProviderOptions); + Ort::ThrowOnError(api.SessionOptionsAppendExecutionProvider_TensorRT_V2(static_cast(session_options), + rel_trt_options.get())); - session_options.AppendExecutionProvider_TensorRT(*tensorrt_options.get()); - printf("Runing ORT TRT EP with default provider options\n"); + std::cout << "Running ORT TRT EP with default provider options" << std::endl; Ort::Session session(env, model_path, session_options); @@ -309,53 +152,58 @@ void run_ort_trt() { Ort::AllocatorWithDefaultOptions allocator; // print number of model input nodes - size_t num_input_nodes = session.GetInputCount(); - std::vector input_node_names_ptr; - std::vector input_node_names(num_input_nodes); + const size_t num_input_nodes = session.GetInputCount(); + std::vector input_names_ptr; + std::vector input_node_names; + input_names_ptr.reserve(num_input_nodes); + input_node_names.reserve(num_input_nodes); std::vector input_node_dims; // simplify... this model has only 1 input node {1, 3, 224, 224}. // Otherwise need vector> - printf("Number of inputs = %zu\n", num_input_nodes); + std::cout << "Number of inputs = " << num_input_nodes << std::endl; // iterate over all input nodes - for (int i = 0; i < num_input_nodes; i++) { + for (size_t i = 0; i < num_input_nodes; i++) { // print input node names auto input_name = session.GetInputNameAllocated(i, allocator); - printf("Input %d : name=%s\n", i, input_name.get()); - input_node_names[i] = input_name.get(); - input_node_names_ptr.push_back(std::move(input_name)); + std::cout << "Input " << i << " : name =" << input_name.get() << std::endl; + input_node_names.push_back(input_name.get()); + input_names_ptr.push_back(std::move(input_name)); // print input node types - Ort::TypeInfo type_info = session.GetInputTypeInfo(i); + auto type_info = session.GetInputTypeInfo(i); auto tensor_info = type_info.GetTensorTypeAndShapeInfo(); ONNXTensorElementDataType type = tensor_info.GetElementType(); - printf("Input %d : type=%d\n", i, type); + std::cout << "Input " << i << " : type = " << type << std::endl; // print input shapes/dims input_node_dims = tensor_info.GetShape(); - printf("Input %d : num_dims=%zu\n", i, input_node_dims.size()); - for (int j = 0; j < input_node_dims.size(); j++) - printf("Input %d : dim %d=%jd\n", i, j, input_node_dims[j]); + std::cout << "Input " << i << " : num_dims = " << input_node_dims.size() << '\n'; + for (size_t j = 0; j < input_node_dims.size(); j++) { + std::cout << "Input " << i << " : dim[" << j << "] =" << input_node_dims[j] << '\n'; + } + std::cout << std::flush; } - size_t input_tensor_size = 224 * 224 * 3; // simplify ... using known dim values to calculate size - // use OrtGetTensorShapeElementCount() to get official size! + constexpr size_t input_tensor_size = 224 * 224 * 3; // simplify ... using known dim values to calculate size + // use OrtGetTensorShapeElementCount() to get official size! std::vector input_tensor_values(input_tensor_size); std::vector output_node_names = {"softmaxout_1"}; // initialize input data with values in [0.0, 1.0] - for (unsigned int i = 0; i < input_tensor_size; i++) - input_tensor_values[i] = (float)i / (input_tensor_size + 1); + for (unsigned int i = 0; i < input_tensor_size; i++) input_tensor_values[i] = (float)i / (input_tensor_size + 1); // create input tensor object from data values auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); - Ort::Value input_tensor = Ort::Value::CreateTensor(memory_info, input_tensor_values.data(), input_tensor_size, input_node_dims.data(), 4); + auto input_tensor = Ort::Value::CreateTensor(memory_info, input_tensor_values.data(), input_tensor_size, + input_node_dims.data(), 4); assert(input_tensor.IsTensor()); // score model & input tensor, get back output tensor - auto output_tensors = session.Run(Ort::RunOptions{nullptr}, input_node_names.data(), &input_tensor, 1, output_node_names.data(), 1); + auto output_tensors = + session.Run(Ort::RunOptions{nullptr}, input_node_names.data(), &input_tensor, 1, output_node_names.data(), 1); assert(output_tensors.size() == 1 && output_tensors.front().IsTensor()); // Get pointer to output tensor float values @@ -363,8 +211,10 @@ void run_ort_trt() { assert(abs(floatarr[0] - 0.000045) < 1e-6); // score the model, and print scores for first 5 classes - for (int i = 0; i < 5; i++) - printf("Score for class [%d] = %f\n", i, floatarr[i]); + for (int i = 0; i < 5; i++) { + std::cout << "Score for class [" << i << "] = " << floatarr[i] << '\n'; + } + std::cout << std::flush; // Results should be as below... // Score for class[0] = 0.000045 @@ -373,15 +223,10 @@ void run_ort_trt() { // Score for class[3] = 0.001180 // Score for class[4] = 0.001317 - // release buffers allocated by ORT alloctor - for (const char* node_name : input_node_names) - allocator.Free(const_cast(reinterpret_cast(node_name))); - - printf("Done!\n"); + std::cout << "Done!" << std::endl; } int main(int argc, char* argv[]) { run_ort_trt(); - run_ort_trt2(); return 0; } diff --git a/onnxruntime/python/tools/tensorrt/perf/mem_test/run.sh b/onnxruntime/python/tools/tensorrt/perf/mem_test/run.sh index 87857674a0..9957b34c66 100755 --- a/onnxruntime/python/tools/tensorrt/perf/mem_test/run.sh +++ b/onnxruntime/python/tools/tensorrt/perf/mem_test/run.sh @@ -14,9 +14,10 @@ s) ORT_SOURCE=${OPTARG};; esac done -ONNX_MODEL_URL="https://github.com/onnx/models/raw/master/vision/classification/squeezenet/model/squeezenet1.0-7.onnx" +ONNX_MODEL_TAR_URL="https://github.com/onnx/models/raw/main/vision/classification/squeezenet/model/squeezenet1.0-7.tar.gz" +MODEL_TAR_NAME="squeezenet1.0-7.tar.gz" ONNX_MODEL="squeezenet.onnx" -ASAN_OPTIONS="protect_shadow_gap=0:log_path=asan.log" +ASAN_OPTIONS="protect_shadow_gap=0:new_delete_type_mismatch=0:log_path=asan.log" export LD_LIBRARY_PATH=${ORT_BINARY_PATH} export LIBRARY_PATH=${ORT_BINARY_PATH} @@ -46,21 +47,76 @@ cd build cp ../squeezenet_calibration.flatbuffers . cmake .. -make -j8 -wget ${ONNX_MODEL_URL} -O ${ONNX_MODEL} -ASAN_OPTIONS=${ASAN_OPTIONS} ./onnx_memtest +make -j +wget ${ONNX_MODEL_TAR_URL} -O squeezenet1.0-7.tar.gz +tar -xzf ${MODEL_TAR_NAME} --strip-components=1 +mv model.onnx ${ONNX_MODEL} +rm ${MODEL_TAR_NAME} +mkdir result -if [ $? -ne 0 ] -then - echo "Memory test application failed." - exit 1 +# Run valgrind +echo $(date +"%Y-%m-%d %H:%M:%S") '[valgrind] Starting memcheck with' ${ONNX_MODEL} +valgrind --leak-check=full --show-leak-kinds=all --log-file=valgrind.log ${ORT_SOURCE}/build/Linux/Release/onnxruntime_perf_test -e tensorrt -r 1 ${ONNX_MODEL} +echo $(date +"%Y-%m-%d %H:%M:%S") '[valgrind] Analyzing valgrind log' + +found_leak_summary=false +is_mem_leaked=false + +while IFS= read -r line +do + if echo $line | grep -q 'LEAK SUMMARY:'; then + found_leak_summary=true + elif $found_leak_summary && echo $line | grep -q 'definitely lost:'; then + bytes_lost=$(echo $line | grep -o -E '[0-9,]+ bytes') + blocks_lost=$(echo $line | grep -o -E '[0-9]+ blocks') + echo "Bytes lost: $bytes_lost" + echo "Blocks lost: $blocks_lost" + if [ "$blocks_lost" != "0 blocks" ]; then + echo $(date +"%Y-%m-%d %H:%M:%S") '[valgrind] Memory leak happened when testing squeezenet model! Checking if it is ORT-TRT related' + is_mem_leaked=true + fi + found_leak_summary=false + fi +done < "valgrind.log" + +# Export ORT-TRT memleak detail log if available +if [ "$is_mem_leaked" = "true" ]; then + awk ' + BEGIN {buffer=""; isDefinitelyLost=0; isOrtTrtRelated=0} + + # substitute "==xxxxx==" with "" + {sub(/==[0-9]+== /, "")} + + # Start caching lines when isDefinitelyLost + /blocks are definitely lost in loss/ {isDefinitelyLost = 1; buffer=""; isOrtTrtRelated=0} + + # Cache this line when isDefinitelyLost and line!="" + # isOrtTrtRelated=1 when "TensorrtExecutionProvider" is found + isDefinitelyLost && $0 != "" {buffer = buffer "\n" $0; if($0 ~ /TensorrtExecutionProvider/) {isOrtTrtRelated=1}} + + # Stop caching and export buffer when isDefinitelyLost, line=="" and isOrtTrtRelated + isDefinitelyLost && $0 == "" {isDefinitelyLost = 0; if(isOrtTrtRelated==1) {print buffer}} + ' valgrind.log > ort_trt_memleak_detail.log + + # Check if any ORT-TRT related memleak info has been parsed + if [ -s ort_trt_memleak_detail.log ]; then + mv ort_trt_memleak_detail.log result + echo $(date +"%Y-%m-%d %H:%M:%S") '[valgrind] ORT-TRT memleak detail log parsed in CI artifact: ort_trt_memleak_detail.log' + exit 1 + else + rm ort_trt_memleak_detail.log + fi fi -mkdir result +mv valgrind.log result + +# Run AddressSanitizer +ASAN_OPTIONS=${ASAN_OPTIONS} ./onnx_memtest + if [ -e asan.log* ] then cat asan.log* mv asan.log* result else - echo "No memory Leak(s) or other memory error(s) detected." > result/asan.log -fi + echo $(date +"%Y-%m-%d %H:%M:%S") "[AddressSanitizer] No memory Leak(s) or other memory error(s) detected." > result/asan.log +fi \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml index 607a422757..47061965ef 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml @@ -24,7 +24,7 @@ parameters: - name: MemTest displayName: Run Memory Test type: boolean - default: false + default: true - name: TrtEPOptions displayName: TensorRT EP options @@ -87,17 +87,17 @@ jobs: - script: 'python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/build/build_image.py -r $(Build.SourcesDirectory) -i $(image) -b $(branchName) -t $(trtVersion) -a 75' displayName: 'Build latest ORT Image' workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/build' - - - ${{ each option in parameters.ModelGroups }}: - - script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/run_perf_docker.sh -d $(image) -p $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf -v $(modelVolume) -b true -o ${{option}} -m $(${{option}}) -e "$(epList)" $(optional_arguments)' - displayName: '${{option}} perf' - workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/' - ${{ if eq(parameters.MemTest, true) }}: - script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/run_mem_test_docker.sh -d $(image) -p $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/ -w /code/ -l false' displayName: 'Run Memory Test' workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/' + - ${{ each option in parameters.ModelGroups }}: + - script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/run_perf_docker.sh -d $(image) -p $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf -v $(modelVolume) -b true -o ${{option}} -m $(${{option}}) -e "$(epList)" $(optional_arguments)' + displayName: '${{option}} perf' + workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/' + # Prepare and Publish Artifacts - script: 'mkdir $(Build.SourcesDirectory)/Artifact' diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_6_tensorrt8_4 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_6_tensorrt8_4 index 5306485004..dc616c9711 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_6_tensorrt8_4 +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_6_tensorrt8_4 @@ -44,6 +44,9 @@ RUN v="8.4.1-1+cuda11.6" &&\ # Compile trtexec RUN cd /usr/src/tensorrt/samples/trtexec && make +# Install Valgrind +RUN apt-get install -y valgrind + ARG BUILD_USER=onnxruntimedev ARG BUILD_UID=1000 RUN adduser --gecos 'onnxruntime Build User' --disabled-password $BUILD_USER --uid $BUILD_UID diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5 index 9f38c22402..0c57ed1463 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5 +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5 @@ -41,6 +41,8 @@ RUN v="8.5.1-1+cuda11.8" &&\ # Compile trtexec RUN cd /usr/src/tensorrt/samples/trtexec && make +# Install Valgrind +RUN apt-get install -y valgrind # Build final image from base. Builds ORT. FROM base as final diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6 index 30e3b16c02..c79e1720f8 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6 +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6 @@ -41,6 +41,8 @@ RUN v="8.6.1.6-1+cuda11.8" &&\ # Compile trtexec RUN cd /usr/src/tensorrt/samples/trtexec && make +# Install Valgrind +RUN apt-get install -y valgrind # Build final image from base. Builds ORT. FROM base as final