diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index 72601345c9..a827bc5279 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -1078,7 +1078,6 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector>> input_shape_ranges; std::unordered_map output_indexes(num_outputs); std::unordered_map output_types(num_outputs); - bool update_engine_cache = false; // Initialize shape range for dynamic shape tensors bool has_dynamic_shape = false; @@ -1163,15 +1162,15 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector trt_engine; tensorrt_ptr::unique_pointer trt_context; - if (engine_cache_enable_) { + if (!has_dynamic_shape) { const std::string cache_path = GetCachePath(cache_path_, trt_node_name_with_precision); const std::string engine_cache_path = cache_path + ".engine"; std::ifstream engine_file(engine_cache_path, std::ios::binary | std::ios::in); - if (engine_file) { + if (engine_cache_enable_ && engine_file) { engine_file.seekg(0, std::ios::end); size_t engine_size = engine_file.tellg(); engine_file.seekg(0, std::ios::beg); @@ -1183,7 +1182,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vectorgetDeviceMemorySize(); - if (mem_size > max_ctx_mem_size_) { - max_ctx_mem_size_ = mem_size; - context_memory_ = IAllocator::MakeUniquePtr(allocator_, max_ctx_mem_size_); - } - trt_context = tensorrt_ptr::unique_pointer(trt_engine->createExecutionContextWithoutDeviceMemory()); - } else { - trt_context = tensorrt_ptr::unique_pointer(trt_engine->createExecutionContext()); - } - if (trt_context == nullptr) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "TensorRT EP could not build execution context for fused node: " + fused_node.Name()); - } - } - - // If graph has dynamic shape input, - // load and deserialize TRT engine profile cache - if (has_dynamic_shape) { - const std::string profile_cache_path = cache_path + ".profile"; - std::ifstream profile_file(profile_cache_path, std::ios::binary | std::ios::in); - if (profile_file) { - input_shape_ranges = DeserializeProfile(profile_file); - } - } - } - - // If (1) engine cache enable is not set or (2) first time enable engine cache and no engine cache is present, - // build TRT engine here if the graph doesn't have dynamic shape input. Otherwise engine will - // be built at runtime - if (!has_dynamic_shape) { - if (trt_engine == nullptr) { + } else { // Set INT8 per tensor dynamic range if (int8_enable_ && trt_builder->platformHasFastInt8() && int8_calibration_cache_available_) { trt_config->setInt8Calibrator(nullptr); @@ -1256,25 +1220,38 @@ common::Status TensorrtExecutionProvider::Compile(const std::vectorgetDeviceMemorySize(); - if (mem_size > max_ctx_mem_size_) { - max_ctx_mem_size_ = mem_size; - context_memory_ = IAllocator::MakeUniquePtr(allocator_, max_ctx_mem_size_); + if (engine_cache_enable_) { + nvinfer1::IHostMemory* serializedModel = trt_engine->serialize(); + size_t engine_size = serializedModel->size(); + if (engine_decryption_enable_) { + // Encrypt engine + if (!engine_encryption_(engine_cache_path.c_str(), reinterpret_cast(serializedModel->data()), engine_size)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not call engine encryption function encrypt"); + } + } else { + std::ofstream file(engine_cache_path, std::ios::binary | std::ios::out); + file.write(reinterpret_cast(serializedModel->data()), engine_size); } - trt_context = tensorrt_ptr::unique_pointer(trt_engine->createExecutionContextWithoutDeviceMemory()); - } else { - trt_context = tensorrt_ptr::unique_pointer(trt_engine->createExecutionContext()); + serializedModel->destroy(); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + engine_cache_path; } - if (trt_context == nullptr) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "TensorRT EP could not build execution context for fused node: " + fused_node.Name()); + } + + // Build context + if (context_memory_sharing_enable_) { + size_t mem_size = trt_engine->getDeviceMemorySize(); + if (mem_size > max_ctx_mem_size_) { + max_ctx_mem_size_ = mem_size; + context_memory_ = IAllocator::MakeUniquePtr(allocator_, max_ctx_mem_size_); } + trt_context = tensorrt_ptr::unique_pointer(trt_engine->createExecutionContextWithoutDeviceMemory()); + } else { + trt_context = tensorrt_ptr::unique_pointer(trt_engine->createExecutionContext()); + } + if (trt_context == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not build execution context for fused node: " + fused_node.Name()); } } @@ -1322,49 +1299,14 @@ common::Status TensorrtExecutionProvider::Compile(const std::vectornode_name], &tensorrt_mu_, fp16_enable_, int8_enable_, int8_calibration_cache_available_, dla_enable_, dla_core_, &max_workspace_size_, trt_node_name_with_precision, engine_cache_enable_, cache_path_, runtime_.get(), nullptr, allocator_, context_memory_sharing_enable_, &max_ctx_mem_size_, &context_memory_, - dynamic_range_map, engine_decryption_enable_, engine_decryption_, engine_encryption_, update_engine_cache}; + dynamic_range_map, engine_decryption_enable_, engine_decryption_, engine_encryption_}; *state = p.release(); return 0; }; // Release function state compute_info.release_state_func = [](FunctionState state) { - if (state) { - // Serialize and save engine to cache - // - // Note: only save engine to file if engine cache enable is set and engine is being updated due to input shape changed - // or engine file is not previously existed - TensorrtFuncState* trt_state = reinterpret_cast(state); - if (trt_state->update_engine_cache) { - // Serialize engine - const std::string cache_path = GetCachePath(trt_state->engine_cache_path, trt_state->trt_node_name_with_precision); - const std::string engine_cache_path = cache_path + ".engine"; - nvinfer1::IHostMemory* serializedModel = trt_state->engine->get()->serialize(); - size_t engine_size = serializedModel->size(); - if (trt_state->engine_decryption_enable) { - // Encrypt engine - if (!trt_state->engine_encryption(engine_cache_path.c_str(), reinterpret_cast(serializedModel->data()), engine_size)) { - delete static_cast(state); - ORT_THROW_IF_ERROR(ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "TensorRT EP could not call engine encryption function encrypt")); - } - } else { - std::ofstream file(engine_cache_path, std::ios::binary | std::ios::out); - file.write(reinterpret_cast(serializedModel->data()), engine_size); - } - serializedModel->destroy(); - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + engine_cache_path; - - // Serialize engine profile if needed - if (!trt_state->input_shape_ranges.empty()) { - const std::string profile_cache_path = cache_path + ".profile"; - SerializeProfile(profile_cache_path, trt_state->input_shape_ranges); - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + profile_cache_path; - } - } - delete static_cast(state); - } }; // Create compute function @@ -1391,6 +1333,81 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector(this->GetComputeStream()); + // Load serialized engine + const std::string cache_path = GetCachePath(trt_state->engine_cache_path, trt_state->trt_node_name_with_precision); + const std::string engine_cache_path = cache_path + ".engine"; + const std::string profile_cache_path = cache_path + ".profile"; + if (trt_state->engine_cache_enable && trt_engine == nullptr) { + std::ifstream engine_file(engine_cache_path, std::ios::binary | std::ios::in); + std::ifstream profile_file(profile_cache_path, std::ios::binary | std::ios::in); + if (engine_file && profile_file) { + // Deserialize profile + shape_ranges = DeserializeProfile(profile_file); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + profile_cache_path; + // Deserialize engine + trt_state->context->reset(); + trt_state->engine->reset(); + engine_file.seekg(0, std::ios::end); + size_t engine_size = engine_file.tellg(); + engine_file.seekg(0, std::ios::beg); + std::unique_ptr engine_buf{new char[engine_size]}; + engine_file.read((char*)engine_buf.get(), engine_size); + *(trt_state->engine) = tensorrt_ptr::unique_pointer( + trt_state->runtime->deserializeCudaEngine(engine_buf.get(), engine_size, nullptr)); + if (trt_state->engine == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP Failed to Build Engine."); + } + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + engine_cache_path; + trt_engine = trt_state->engine->get(); + if (trt_state->context_memory_sharing_enable) { + *(trt_state->context) = tensorrt_ptr::unique_pointer( + trt_state->engine->get()->createExecutionContextWithoutDeviceMemory()); + } else { + *(trt_state->context) = tensorrt_ptr::unique_pointer( + trt_state->engine->get()->createExecutionContext()); + } + if (trt_state->context == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP failed to create context."); + } + trt_context = trt_state->context->get(); + } else if (trt_state->engine_decryption_enable && !engine_file && profile_file) { + shape_ranges = DeserializeProfile(profile_file); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + profile_cache_path; + // Decrypt engine + size_t engine_size = 0; + if (!trt_state->engine_decryption(engine_cache_path.c_str(), nullptr, &engine_size)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not get engine buffer size"); + } + std::unique_ptr engine_buf{new char[engine_size]}; + if (!trt_state->engine_decryption(engine_cache_path.c_str(), &engine_buf[0], &engine_size)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not call engine decryption function decrypt"); + } + // Deserialize engine + trt_state->context->reset(); + trt_state->engine->reset(); + *(trt_state->engine) = tensorrt_ptr::unique_pointer(trt_state->runtime->deserializeCudaEngine(engine_buf.get(), engine_size, nullptr)); + if (trt_state->engine == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not deserialize engine from encrypted cache: " + engine_cache_path); + } + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + engine_cache_path; + trt_engine = trt_state->engine->get(); + if (trt_state->context_memory_sharing_enable) { + *(trt_state->context) = tensorrt_ptr::unique_pointer( + trt_state->engine->get()->createExecutionContextWithoutDeviceMemory()); + } else { + *(trt_state->context) = tensorrt_ptr::unique_pointer( + trt_state->engine->get()->createExecutionContext()); + } + if (trt_state->context == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP failed to create context."); + } + trt_context = trt_state->context->get(); + } + } + for (int i = 0, end = num_inputs; i < end; ++i) { auto input = trt_state->network->get()->getInput(i); const std::string& input_name = input->getName(); @@ -1567,6 +1584,26 @@ common::Status TensorrtExecutionProvider::Compile(const std::vectorengine->get(); + if (trt_state->engine_cache_enable) { + // Serialize engine profile + SerializeProfile(profile_cache_path, shape_ranges); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + profile_cache_path; + + // Serialize engine + nvinfer1::IHostMemory* serializedModel = trt_engine->serialize(); + size_t engine_size = serializedModel->size(); + if (trt_state->engine_decryption_enable) { + // Encrypt engine + if (!trt_state->engine_encryption(engine_cache_path.c_str(), reinterpret_cast(serializedModel->data()), engine_size)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not call engine encryption function encrypt"); + } + } else { + std::ofstream file(engine_cache_path, std::ios::binary | std::ios::out); + file.write(reinterpret_cast(serializedModel->data()), engine_size); + } + serializedModel->destroy(); + } // Build context if (trt_state->context_memory_sharing_enable) { @@ -1580,10 +1617,6 @@ common::Status TensorrtExecutionProvider::Compile(const std::vectorcontext->get(); - - if (trt_state->engine_cache_enable) - trt_state->update_engine_cache = true; - trt_state->input_shape_ranges = shape_ranges; } // Get input and output binding names diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h index 27a53b1b8f..c74543b5c2 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h @@ -111,9 +111,6 @@ struct TensorrtFuncState { bool engine_decryption_enable; int (*engine_decryption)(const char*, char*, size_t*); int (*engine_encryption)(const char*, char*, size_t); - // If sub-graph has dynamic input shape and the shape range changes, or the first time writing out engine cache, this flag is set to true and engine cache will be saved. Otherwise the flag is false. - // Note: For dynamic input shape, if update_engine_cache flag is true, profile cache will be saved as well. - bool update_engine_cache; }; // Logical device representation. diff --git a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc index e17f213ca5..f0e0ddfef7 100644 --- a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc +++ b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc @@ -247,8 +247,6 @@ void RunWithOneSessionMultiThreadsInference(std::string model_name, std::string th.join(); } -void CreateAndRunInferenceSession() {} - TEST(TensorrtExecutionProviderTest, MultiThreadsTestWithOneSessionSingleThreadInference) { std::vector threads; std::string model_name = "trt_execution_provider_multithreading_test.onnx"; @@ -296,51 +294,38 @@ TEST_P(TensorrtExecutionProviderCacheTest, Run) { CreateBaseModel(model_name, cache_type + "cachingtest", dims); - /* If cache_type is "engine", following code will test the functionality of engine and optimization profile of ORT TRT, including: - * - engine cache serialization/de-serialization - * - profile cache serialization/de-serialization - * - engine/profile cache should be updated when the input shape changes - * - min/max shape ranges of dynamic shape dimensions saved in profile cache - * - if engine cache is present, trt ep should load the engine cache and run inference - * - read corrupted profile cache #TODO - */ + SessionOptions so; + so.session_logid = "TensorrtExecutionProvider" + cache_type + "cacheTest"; + RunOptions run_options; + run_options.run_tag = so.session_logid; + InferenceSession session_object{so, GetEnvironment()}; + onnxruntime::AllocatorManager allocator_manager; + auto cuda_provider = DefaultCudaExecutionProvider(); + cuda_provider->RegisterAllocator(allocator_manager); + auto cpu_allocator = cuda_provider->GetAllocator(0, OrtMemTypeCPU); + std::vector dims_mul_x = {1, 3, 2}; + std::vector values_mul_x = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + OrtValue ml_value_x; + CreateMLValue(cpu_allocator, dims_mul_x, values_mul_x, &ml_value_x); + OrtValue ml_value_y; + CreateMLValue(cpu_allocator, dims_mul_x, values_mul_x, &ml_value_y); + OrtValue ml_value_z; + CreateMLValue(cpu_allocator, dims_mul_x, values_mul_x, &ml_value_z); + NameMLValMap feeds; + feeds.insert(std::make_pair("X", ml_value_x)); + feeds.insert(std::make_pair("Y", ml_value_y)); + feeds.insert(std::make_pair("Z", ml_value_z)); - /* - * First inference run - */ - { - SessionOptions so; - so.session_logid = "TensorrtExecutionProvider" + cache_type + "cacheTest"; - RunOptions run_options; - run_options.run_tag = so.session_logid; - InferenceSession session_object{so, GetEnvironment()}; - onnxruntime::AllocatorManager allocator_manager; - auto cuda_provider = DefaultCudaExecutionProvider(); - cuda_provider->RegisterAllocator(allocator_manager); - auto cpu_allocator = cuda_provider->GetAllocator(0, OrtMemTypeCPU); - std::vector dims_mul_x = {1, 3, 2}; - std::vector values_mul_x = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; - OrtValue ml_value_x; - CreateMLValue(cpu_allocator, dims_mul_x, values_mul_x, &ml_value_x); - OrtValue ml_value_y; - CreateMLValue(cpu_allocator, dims_mul_x, values_mul_x, &ml_value_y); - OrtValue ml_value_z; - CreateMLValue(cpu_allocator, dims_mul_x, values_mul_x, &ml_value_z); - NameMLValMap feeds; - feeds.insert(std::make_pair("X", ml_value_x)); - feeds.insert(std::make_pair("Y", ml_value_y)); - feeds.insert(std::make_pair("Z", ml_value_z)); + // prepare outputs + std::vector output_names; + output_names.push_back("M"); + std::vector fetches; - // prepare outputs - std::vector output_names; - output_names.push_back("M"); - std::vector fetches; + // prepare expected inputs and outputs + std::vector expected_dims_mul_m = {1, 3, 2}; + std::vector expected_values_mul_m = {3.0f, 6.0f, 9.0f, 12.0f, 15.0f, 18.0f}; - // prepare expected inputs and outputs - std::vector expected_dims_mul_m = {1, 3, 2}; - std::vector expected_values_mul_m = {3.0f, 6.0f, 9.0f, 12.0f, 15.0f, 18.0f}; - - OrtTensorRTProviderOptionsV2 params{ + OrtTensorRTProviderOptionsV2 params{ 0, 0, nullptr, @@ -361,40 +346,35 @@ TEST_P(TensorrtExecutionProviderCacheTest, Run) { 0, 0}; - if (cache_type.compare("engine") == 0) { - params.trt_engine_cache_enable = 1; - std::unique_ptr execution_provider = TensorrtExecutionProviderWithOptions(¶ms); - EXPECT_TRUE(session_object.RegisterExecutionProvider(std::move(execution_provider)).IsOK()); - auto status = session_object.Load(model_name); - ASSERT_TRUE(status.IsOK()); - status = session_object.Initialize(); - ASSERT_TRUE(status.IsOK()); - - // run inference - // TRT engine will be created and cached - // TRT profile will be created and cached only for dynamic input shape - // Data in profile, - // X: 1, 3, 3, 2, 2, 2 - // Y: 1, 3, 3, 2, 2, 2 - // Z: 1, 3, 3, 2, 2, 2 - status = session_object.Run(run_options, feeds, output_names, &fetches); - ASSERT_TRUE(status.IsOK()); - VerifyOutputs(fetches, expected_dims_mul_m, expected_values_mul_m); - - } else if (cache_type.compare("timing") == 0) { - // add test code here for timing cache - } - } // end of first inference run scope - - - /* Validate engine cache counts and engine profile content after first inference run. - * - * Note: Cache won't be saved to file until destructor of inference session is called, - * to be more specific, cache is saved at FunctionKernel's destructor (the release_state_func will be called). - * At this point, all the cache are saved because inference run scope ends. - * - */ if (cache_type.compare("engine") == 0) { + + /* Following code block tests the functionality of engine and optimization profile of ORT TRT, including: + * - engine cache serialization/de-serialization + * - profile cache serialization/de-serialization + * - engine/profile cache should be updated when the input shape changes + * - min/max shape ranges of dynamic shape dimensions saved in profile cache + * - read corrupted profile cache #TODO + * + */ + + params.trt_engine_cache_enable = 1; + std::unique_ptr execution_provider = TensorrtExecutionProviderWithOptions(¶ms); + EXPECT_TRUE(session_object.RegisterExecutionProvider(std::move(execution_provider)).IsOK()); + auto status = session_object.Load(model_name); + ASSERT_TRUE(status.IsOK()); + status = session_object.Initialize(); + ASSERT_TRUE(status.IsOK()); + + // run inference + // TRT engine will be created and cached + // TRT profile will be created and cached only for dynamic input shape + // Data in profile, + // X: 1, 3, 3, 2, 2, 2 + // Y: 1, 3, 3, 2, 2, 2 + // Z: 1, 3, 3, 2, 2, 2 + status = session_object.Run(run_options, feeds, output_names, &fetches); + ASSERT_TRUE(status.IsOK()); + VerifyOutputs(fetches, expected_dims_mul_m, expected_values_mul_m); ASSERT_TRUE(IsCacheExistedByType("./", ".engine")); std::vector profile_files; @@ -410,13 +390,8 @@ TEST_P(TensorrtExecutionProviderCacheTest, Run) { std::ifstream profile_file(profile_files[0], std::ios::binary | std::ios::in); auto shape_ranges = DeserializeProfile(profile_file); - // Data in profile, - // X: 1, 3, 3, 2, 2, 2 - // Y: 1, 3, 3, 2, 2, 2 - // Z: 1, 3, 3, 2, 2, 2 - // check min/max shape ranges of dynamic shape dimensions - for (auto it = shape_ranges.cbegin(); it != shape_ranges.cend(); ++it) { + for(auto it = shape_ranges.cbegin(); it != shape_ranges.cend(); ++it) { auto ranges = it->second; for (auto it2 = ranges.cbegin(); it2 != ranges.cend(); ++it2) { if (it2->first == 1) { @@ -429,133 +404,60 @@ TEST_P(TensorrtExecutionProviderCacheTest, Run) { } } } - } + // another inference run with input shape {1, 1, 6} + // TRT engine and profile will be updated + // Data in profile, + // X: 1, 1, 3, 2, 2, 6 + // Y: 1, 1, 3, 2, 2, 6 + // Z: 1, 1, 3, 2, 2, 6 + dims_mul_x = {1, 1, 6}; + CreateMLValue(cpu_allocator, dims_mul_x, values_mul_x, &ml_value_x); + CreateMLValue(cpu_allocator, dims_mul_x, values_mul_x, &ml_value_y); + CreateMLValue(cpu_allocator, dims_mul_x, values_mul_x, &ml_value_z); + feeds.clear(); + feeds.insert(std::make_pair("X", ml_value_x)); + feeds.insert(std::make_pair("Y", ml_value_y)); + feeds.insert(std::make_pair("Z", ml_value_z)); - for (int i = 0; i < 2; ++i) { - /* - * Second/Third inference run - */ - { - SessionOptions so; - so.session_logid = "TensorrtExecutionProvider" + cache_type + "cacheTest"; - RunOptions run_options; - run_options.run_tag = so.session_logid; - InferenceSession session_object{so, GetEnvironment()}; - onnxruntime::AllocatorManager allocator_manager; - auto cuda_provider = DefaultCudaExecutionProvider(); - cuda_provider->RegisterAllocator(allocator_manager); - auto cpu_allocator = cuda_provider->GetAllocator(0, OrtMemTypeCPU); - std::vector dims_mul_x = {1, 1, 6}; - std::vector values_mul_x = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; - OrtValue ml_value_x; - CreateMLValue(cpu_allocator, dims_mul_x, values_mul_x, &ml_value_x); - OrtValue ml_value_y; - CreateMLValue(cpu_allocator, dims_mul_x, values_mul_x, &ml_value_y); - OrtValue ml_value_z; - CreateMLValue(cpu_allocator, dims_mul_x, values_mul_x, &ml_value_z); - NameMLValMap feeds; - feeds.insert(std::make_pair("X", ml_value_x)); - feeds.insert(std::make_pair("Y", ml_value_y)); - feeds.insert(std::make_pair("Z", ml_value_z)); + // prepare outputs + fetches.clear(); - // prepare outputs - std::vector output_names; - output_names.push_back("M"); - std::vector fetches; + // prepare expected inputs and outputs + expected_dims_mul_m = {1, 1, 6}; - // prepare expected inputs and outputs - std::vector expected_dims_mul_m = {1, 1, 6}; - std::vector expected_values_mul_m = {3.0f, 6.0f, 9.0f, 12.0f, 15.0f, 18.0f}; + status = session_object.Run(run_options, feeds, output_names, &fetches); - OrtTensorRTProviderOptionsV2 params{ - 0, - 0, - nullptr, - 1000, - 1, - 1 << 30, - 0, - 0, - nullptr, - 0, - 0, - 0, - 0, - 0, - nullptr, - 0, - nullptr, - 0, - 0}; + if (input_type.compare("static") == 0) { + // Can't run inference since input shape changes but the engine is built with static input + ASSERT_FALSE(status.IsOK()); + } else { + ASSERT_TRUE(status.IsOK()); + VerifyOutputs(fetches, expected_dims_mul_m, expected_values_mul_m); - if (cache_type.compare("engine") == 0) { - params.trt_engine_cache_enable = 1; - std::unique_ptr execution_provider = TensorrtExecutionProviderWithOptions(¶ms); - EXPECT_TRUE(session_object.RegisterExecutionProvider(std::move(execution_provider)).IsOK()); - auto status = session_object.Load(model_name); - ASSERT_TRUE(status.IsOK()); - status = session_object.Initialize(); - ASSERT_TRUE(status.IsOK()); + profile_files = GetCachesByType("./", ".profile"); + ASSERT_EQ(profile_files.size(), 1); + std::ifstream profile_file2(profile_files[0], std::ios::binary | std::ios::in); + auto shape_ranges2 = DeserializeProfile(profile_file2); - // another inference run with input shape {1, 1, 6} - // TRT engine and profile will be updated - // Data in profile, - // X: 1, 1, 3, 2, 2, 6 - // Y: 1, 1, 3, 2, 2, 6 - // Z: 1, 1, 3, 2, 2, 6 - status = session_object.Run(run_options, feeds, output_names, &fetches); - if (input_type.compare("static") == 0) { - // Can't run inference since input shape changes but the engine is built with static input - ASSERT_FALSE(status.IsOK()); - } else { - ASSERT_TRUE(status.IsOK()); - VerifyOutputs(fetches, expected_dims_mul_m, expected_values_mul_m); - } - } - } // end of second/third inference run scope - - /* Validate engine cache counts and engine profile content after second/third inference run. - * - * Note: Cache won't be saved to file until destructor of inference session is called, - * to be more specific, cache is saved at FunctionKernel's destructor (the release_state_func will be called). - * At this point, all the cache are saved because inference run scope ends. - * - */ - if (cache_type.compare("engine") == 0) { - ASSERT_TRUE(IsCacheExistedByType("./", ".engine")); - - std::vector profile_files; - - // profile cache only being generated for dynamic input shape - if (input_type.compare("static") == 0) { - ASSERT_TRUE(!IsCacheExistedByType("./", ".profile")); - } else { - ASSERT_TRUE(IsCacheExistedByType("./", ".profile")); - - profile_files = GetCachesByType("./", ".profile"); - ASSERT_EQ(profile_files.size(), 1); - std::ifstream profile_file2(profile_files[0], std::ios::binary | std::ios::in); - auto shape_ranges2 = DeserializeProfile(profile_file2); - - // check min/max shape ranges of dynamic shape dimensions - for (auto it = shape_ranges2.cbegin(); it != shape_ranges2.cend(); ++it) { - auto ranges = it->second; - for (auto it2 = ranges.cbegin(); it2 != ranges.cend(); ++it2) { - if (it2->first == 1) { - ASSERT_EQ(it2->second.first, 1); - ASSERT_EQ(it2->second.second, 3); - } else if (it2->first == 2) { - ASSERT_EQ(it2->second.first, 2); - ASSERT_EQ(it2->second.second, 6); - } + // check min/max shape ranges of dynamic shape dimensions + for(auto it = shape_ranges2.cbegin(); it != shape_ranges2.cend(); ++it) { + auto ranges = it->second; + for (auto it2 = ranges.cbegin(); it2 != ranges.cend(); ++it2) { + if (it2->first == 1) { + ASSERT_EQ(it2->second.first, 1); + ASSERT_EQ(it2->second.second, 3); + } else if (it2->first == 2) { + ASSERT_EQ(it2->second.first, 2); + ASSERT_EQ(it2->second.second, 6); } } } } + } else if (cache_type.compare("timing") == 0) { + // add test code here } - // clean up caches RemoveCachesByType("./", ".engine"); RemoveCachesByType("./", ".profile");