diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h index 64a4dd19c1..89a87a0227 100644 --- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h +++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h @@ -283,6 +283,11 @@ static const char* const kOrtSessionOptionEpContextNodeNamePrefix = "ep.context_ // Share EP related resources across EPs static const char* const kOrtSessionOptionShareEpContexts = "ep.share_ep_contexts"; +// Use this config when dumping EP context model with an external initializers file +// All initializers will be inside the external data file if specified, otherwise all in Onnx file +static const char* const kOrtSessionOptionsEpContextModelExternalInitializersFileName = + "ep.context_model_external_initializers_file_name"; + // Gemm fastmath mode provides fp32 gemm acceleration with bfloat16 based matmul. // Option values: // - "0": Gemm FastMath mode is not enabled. 
[DEFAULT] diff --git a/onnxruntime/core/framework/graph_partitioner.cc b/onnxruntime/core/framework/graph_partitioner.cc index b97cf03e3b..b4f1e9b11c 100644 --- a/onnxruntime/core/framework/graph_partitioner.cc +++ b/onnxruntime/core/framework/graph_partitioner.cc @@ -16,6 +16,7 @@ #include "core/graph/function_utils.h" #include "core/graph/graph_viewer.h" #include "core/graph/model.h" +#include "core/graph/model_saving_options.h" #include "core/session/onnxruntime_session_options_config_keys.h" // uncomment this line to count non-CUDA ops in ONNX domain @@ -645,6 +646,7 @@ static Status InlineFunctionsAOTImpl(const ExecutionProviders& execution_provide static Status CreateEpContextModel(const ExecutionProviders& execution_providers, const Graph& graph, const std::filesystem::path& ep_context_path, + const std::filesystem::path& ep_context_ext_ini_path, const logging::Logger& logger) { InlinedVector all_ep_context_nodes; for (const auto& ep : execution_providers) { @@ -727,7 +729,20 @@ static Status CreateEpContextModel(const ExecutionProviders& execution_providers } } - ORT_RETURN_IF_ERROR(Model::Save(ep_context_model, context_cache_path)); + size_t ini_size_threshold = 0; + std::filesystem::path external_ini_path; + if (ep_context_ext_ini_path.empty()) { + // Set the threshold to the max so all initializers are forced into the Onnx file + ini_size_threshold = SIZE_MAX; + external_ini_path = "./model_ext_ini.bin"; + } else { + // Set the threshold to 0 so all initializers are forced into the external file + ini_size_threshold = 0; + external_ini_path = ep_context_ext_ini_path; + } + ModelSavingOptions model_saving_options{ini_size_threshold}; + ORT_RETURN_IF_ERROR(Model::SaveWithExternalInitializers(ep_context_model, context_cache_path, + external_ini_path, model_saving_options)); return Status::OK(); } @@ -993,9 +1008,10 @@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr, ORT_RETURN_IF_ERROR(PartitionOnnxFormatModel(partition_params, mode, 
providers_, kernel_registry_mgr_, logger)); bool ep_context_enabled = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1"; - std::string ep_context_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); if (ep_context_enabled) { - ORT_RETURN_IF_ERROR(CreateEpContextModel(providers_, graph, ep_context_path, logger)); + std::string ep_context_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); + std::string external_ini_file_name = config_options.GetConfigOrDefault(kOrtSessionOptionsEpContextModelExternalInitializersFileName, ""); + ORT_RETURN_IF_ERROR(CreateEpContextModel(providers_, graph, ep_context_path, external_ini_file_name, logger)); } #else ORT_UNUSED_PARAMETER(config_options); diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index 7ee794ccbd..e4915616b7 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -4175,6 +4175,14 @@ Status Graph::AddExternalInitializersToGraphProtoImpl( size_t tensor_bytes_size = raw_data.size(); if (tensor_bytes_size < model_saving_options.initializer_size_threshold) { *output_proto = initializer; + // Data with size above the threshold is written into the new external initializer file + // Data with size below the threshold should be kept inside the new model file + // instead of leaving it in the old external initializer file for the old Onnx file + if (initializer.data_location() == TensorProto_DataLocation_EXTERNAL) { + TensorShape shape(initializer.dims()); + output_proto->set_raw_data(raw_data.data(), raw_data.size()); + output_proto->clear_data_location(); + } if (process_prepacks) { // These pre-packs will reside in memory processed_weights.insert(initializer.name()); @@ -4263,6 +4271,7 @@ ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitializers( // Create the external file. 
std::ofstream external_stream(modified_external_file_path, std::ofstream::out | std::ofstream::binary); + auto const external_empty_pos = external_stream.tellp(); ORT_ENFORCE(external_stream.is_open(), "Failed to open for writing:", modified_external_file_path); int64_t external_offset = 0; @@ -4275,6 +4284,12 @@ ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitializers( ORT_THROW("Failed to flush file with external initializers: ", modified_external_file_path); } + // Delete if the external data file is empty + if (external_empty_pos == external_stream.tellp()) { + external_stream.close(); + std::remove(modified_external_file_path.string().c_str()); + } + return result; } diff --git a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc index 38fde332ca..416d812326 100644 --- a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc +++ b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc @@ -7,6 +7,7 @@ #include "core/session/onnxruntime_cxx_api.h" #include "core/session/onnxruntime_session_options_config_keys.h" #include "core/session/inference_session.h" +#include "core/graph/model_saving_options.h" #include "test/providers/qnn/qnn_test_utils.h" @@ -49,19 +50,19 @@ static const std::string& GetNodeAttr(const Node& node, const std::string& attr_ static GetTestModelFn BuildGraphWithQAndNonQ(bool single_ep_node = true) { return [single_ep_node](ModelTestBuilder& builder) { // Creat non-quantized FusedMatMul node1 - NodeArg* input1 = MakeTestInput(builder, TestInputDef({2, 2}, false, {0, 1, 0, 1})); - NodeArg* add1_ini_input2 = MakeTestInput(builder, TestInputDef({2, 2}, true, {0, 0, 0, 0})); + std::vector data(200 * 200, 1.0f); + NodeArg* input1 = MakeTestInput(builder, TestInputDef({200, 200}, false, data)); + NodeArg* add1_ini_input2 = MakeTestInput(builder, TestInputDef({200, 200}, true, data)); auto* add1_output = builder.MakeIntermediate(); builder.AddNode("FusedMatMul", {input1, 
add1_ini_input2}, {add1_output}, kMSDomain); // Create quantized Add node2 - std::vector data = {0.0f, 0.0f, 1.0f, 0.0f}; gsl::span data_range = gsl::make_span(data); QuantParams q_parameter = GetDataQuantParams(data_range); auto* add2_input1_qdq = AddQDQNodePair(builder, add1_output, q_parameter.scale, q_parameter.zero_point); - NodeArg* add2_input2 = MakeTestInput(builder, TestInputDef({2, 2}, true, data)); + NodeArg* add2_input2 = MakeTestInput(builder, TestInputDef({200, 200}, true, data)); auto* add2_input2_qdq = AddQDQNodePair(builder, add2_input2, q_parameter.scale, q_parameter.zero_point); auto* add2_output = builder.MakeIntermediate(); @@ -73,7 +74,7 @@ static GetTestModelFn BuildGraphWithQAndNonQ(bool single_ep_node = true) { AddQDQNodePairWithOutputAsGraphOutput(builder, add2_output, q_parameter.scale, q_parameter.zero_point); } else { auto* add3_input1_qdq = AddQDQNodePair(builder, add2_output, q_parameter.scale, q_parameter.zero_point); - NodeArg* add3_ini_input2 = MakeTestInput(builder, TestInputDef({2, 2}, true, {0, 0, 0, 0})); + NodeArg* add3_ini_input2 = MakeTestInput(builder, TestInputDef({200, 200}, true, data)); auto* add3_output = builder.MakeIntermediate(); builder.AddNode("FusedMatMul", {add3_input1_qdq, add3_ini_input2}, {add3_output}, kMSDomain); @@ -81,7 +82,7 @@ static GetTestModelFn BuildGraphWithQAndNonQ(bool single_ep_node = true) { // Create quantized Add node4 auto* add4_input1_qdq = AddQDQNodePair(builder, add3_output, q_parameter.scale, q_parameter.zero_point); - NodeArg* add4_input2 = MakeTestInput(builder, TestInputDef({2, 2}, true, data)); + NodeArg* add4_input2 = MakeTestInput(builder, TestInputDef({200, 200}, true, data)); auto* add4_input2_qdq = AddQDQNodePair(builder, add4_input2, q_parameter.scale, q_parameter.zero_point); auto* add4_output = builder.MakeIntermediate(); @@ -179,6 +180,75 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryMultiPartitionSupport2) { QnnContextBinaryMultiPartitionTestBody(single_ep_node); } +void 
EpCtxCpuNodeWithExternalIniFileTestBody(bool expect_external_ini_file) { + ProviderOptions provider_options; +#if defined(_WIN32) + provider_options["backend_path"] = "QnnHtp.dll"; +#else + provider_options["backend_path"] = "libQnnHtp.so"; +#endif + + const std::unordered_map domain_to_version = {{"", 13}, {kMSDomain, 1}}; + + auto& logging_manager = DefaultLoggingManager(); + logging_manager.SetDefaultLoggerSeverity(logging::Severity::kERROR); + + onnxruntime::Model model("QNN_EP_TestModel", false, ModelMetaData(), PathString(), + IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {}, + logging_manager.DefaultLogger()); + Graph& graph = model.MainGraph(); + ModelTestBuilder helper(graph); + BuildGraphWithQAndNonQ(true)(helper); + helper.SetGraphOutputs(); + ASSERT_STATUS_OK(model.MainGraph().Resolve()); + ModelSavingOptions model_saving_options{10}; + const std::string model_with_ext = "model_external.onnx"; + const std::string model_ext_file = "model_external.bin"; + ASSERT_STATUS_OK(Model::SaveWithExternalInitializers(model, model_with_ext, + model_ext_file, model_saving_options)); + + EXPECT_TRUE(std::filesystem::exists(model_with_ext.c_str())); + EXPECT_TRUE(std::filesystem::exists(model_ext_file.c_str())); + + Ort::SessionOptions so; + so.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1"); + so.AppendExecutionProvider("QNN", provider_options); + const std::string ep_context_model_file = "./qnn_ctx_part_external_ini_ctx.onnx"; + so.AddConfigEntry(kOrtSessionOptionEpContextFilePath, ep_context_model_file.c_str()); + const std::string external_ini_file = "./qnn_ctx_part_external_ini.bin"; + if (expect_external_ini_file) { + // Set the external ini file name will force all initializers to the external file + so.AddConfigEntry(kOrtSessionOptionsEpContextModelExternalInitializersFileName, external_ini_file.c_str()); + } // otherwise all initializers are in Onnx file, no external data file generated + + Ort::Session session(*ort_env, 
ToPathString(model_with_ext).c_str(), so); + + EXPECT_TRUE(std::filesystem::exists(ep_context_model_file.c_str())); + if (expect_external_ini_file) { + EXPECT_TRUE(std::filesystem::exists(external_ini_file.c_str())); + ASSERT_EQ(std::remove(external_ini_file.c_str()), 0); + } else { + EXPECT_FALSE(std::filesystem::exists(external_ini_file.c_str())); + } + + // clean up + ASSERT_EQ(std::remove(model_with_ext.c_str()), 0); + ASSERT_EQ(std::remove(model_ext_file.c_str()), 0); + ASSERT_EQ(std::remove(ep_context_model_file.c_str()), 0); +} + +// Set the EP context external initializers file name so the initializers of FusedMatMul (which falls back to CPU) +// are all dumped into that external file +TEST_F(QnnHTPBackendTests, QnnContextBinaryCpuNodeWithExternalWeights) { + EpCtxCpuNodeWithExternalIniFileTestBody(true); +} + +// Do not set the EP context external initializers file name so the initializers of FusedMatMul (which falls back to CPU) +// are all kept inside the Onnx file and no external data file is generated +TEST_F(QnnHTPBackendTests, QnnContextBinaryCpuNodeWithoutExternalWeights) { + EpCtxCpuNodeWithExternalIniFileTestBody(false); +} + // Create a model with Case + Add (quantized) // cast_input -> Cast -> Q -> DQ \ // Add -> Q -> DQ -> output diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py index 8aaa0aa02d..91310cfc2a 100644 --- a/onnxruntime/test/python/onnxruntime_test_python.py +++ b/onnxruntime/test/python/onnxruntime_test_python.py @@ -183,7 +183,7 @@ class TestInferenceSession(unittest.TestCase): so.add_session_config_entry( "session.optimized_model_external_initializers_file_name", external_initializers_file ) - so.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "100") + so.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "20") onnxrt.InferenceSession(get_name("model_with_orig_ext_data.onnx"), sess_options=so) 
self.assertTrue(os.path.isfile(so.optimized_model_filepath)) self.assertTrue(os.path.isfile(os.path.join(directory, external_initializers_file))) @@ -213,14 +213,10 @@ class TestInferenceSession(unittest.TestCase): "session.optimized_model_external_initializers_file_name", external_initializers_file ) - # TODO(anyone): Set this to 100 will cause test error since some tensor below the threshold - # still refers to the original external data file. We shall fix this issue so that the - # optimized model only refers to one external data file. - so.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "10") + so.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "100") session1 = onnxrt.InferenceSession(get_name("model_with_orig_ext_data.onnx"), sess_options=so) del session1 self.assertTrue(os.path.isfile(optimized_model_filepath)) - self.assertTrue(os.path.isfile(external_initializers_file)) so2 = onnxrt.SessionOptions() so2.log_severity_level = 1 @@ -240,7 +236,6 @@ class TestInferenceSession(unittest.TestCase): # Remove model 1 to make sure optimized model 2 can be loaded independently from model 1 os.remove(optimized_model_filepath) - os.remove(external_initializers_file) session3 = onnxrt.InferenceSession(optimized_model_filepath_2, sess_options=onnxrt.SessionOptions()) del session3