Enable Ep context with external data for CPU nodes (#23498)

### Description
When a user dumps the EP context model, nodes that are not partitioned to the EP and that have external initializers still point to the old external data file in the dumped model. It does not make sense for a newly generated model to still point to the old external data file.
Example: a model has nodes A, B, C, and D, all of which have external initializers in ext.bin, so ext.bin contains the data for A, B, C, and D.
After dumping the EP context model, node A is on the CPU, while nodes B, C, and D are on the EP and are dumped as EPContext nodes. If A's data is still in ext.bin, then the newly generated model has to depend on the old ext.bin, which contains all the external data for the old model. That is a big overhead.

Fix:
For the newly generated model, the user should have the option to specify a new external data file, so that the newly generated model either packs all initializers into the ONNX model or has all initializers in the external data file.
Add the option ep.context_model_external_initializers_file_name to specify the new external data file, which also determines the initializer size threshold. All initializers will be placed inside the external data file if the option is specified; otherwise, all initializers will be placed inside the EP context ONNX model.

### Motivation and Context
Fix the issue https://github.com/microsoft/onnxruntime/issues/23358
This commit is contained in:
Hector Li 2025-01-28 20:22:22 -08:00 committed by GitHub
parent bf023ab3d5
commit 80bc1d25f0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 117 additions and 16 deletions

View file

@ -283,6 +283,11 @@ static const char* const kOrtSessionOptionEpContextNodeNamePrefix = "ep.context_
// Share EP related resources across EPs
static const char* const kOrtSessionOptionShareEpContexts = "ep.share_ep_contexts";
// Use this config when dumping EP context model with an external initializers file
// All initializers will be inside the external data file if specified, otherwise all in Onnx file
static const char* const kOrtSessionOptionsEpContextModelExternalInitializersFileName =
"ep.context_model_external_initializers_file_name";
// Gemm fastmath mode provides fp32 gemm acceleration with bfloat16 based matmul.
// Option values:
// - "0": Gemm FastMath mode is not enabled. [DEFAULT]

View file

@ -16,6 +16,7 @@
#include "core/graph/function_utils.h"
#include "core/graph/graph_viewer.h"
#include "core/graph/model.h"
#include "core/graph/model_saving_options.h"
#include "core/session/onnxruntime_session_options_config_keys.h"
// uncomment this line to count non-CUDA ops in ONNX domain
@ -645,6 +646,7 @@ static Status InlineFunctionsAOTImpl(const ExecutionProviders& execution_provide
static Status CreateEpContextModel(const ExecutionProviders& execution_providers,
const Graph& graph,
const std::filesystem::path& ep_context_path,
const std::filesystem::path& ep_context_ext_ini_path,
const logging::Logger& logger) {
InlinedVector<const Node*> all_ep_context_nodes;
for (const auto& ep : execution_providers) {
@ -727,7 +729,20 @@ static Status CreateEpContextModel(const ExecutionProviders& execution_providers
}
}
ORT_RETURN_IF_ERROR(Model::Save(ep_context_model, context_cache_path));
size_t ini_size_threshold = 0;
std::filesystem::path external_ini_path;
if (ep_context_ext_ini_path.empty()) {
// Set the threshold to the max so all initializers are forced into the Onnx file
ini_size_threshold = SIZE_MAX;
external_ini_path = "./model_ext_ini.bin";
} else {
// Set the threshold to 0 so all initializers are forced into the external file
ini_size_threshold = 0;
external_ini_path = ep_context_ext_ini_path;
}
ModelSavingOptions model_saving_options{ini_size_threshold};
ORT_RETURN_IF_ERROR(Model::SaveWithExternalInitializers(ep_context_model, context_cache_path,
external_ini_path, model_saving_options));
return Status::OK();
}
@ -993,9 +1008,10 @@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr,
ORT_RETURN_IF_ERROR(PartitionOnnxFormatModel(partition_params, mode, providers_, kernel_registry_mgr_, logger));
bool ep_context_enabled = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1";
std::string ep_context_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "");
if (ep_context_enabled) {
ORT_RETURN_IF_ERROR(CreateEpContextModel(providers_, graph, ep_context_path, logger));
std::string ep_context_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "");
std::string external_ini_file_name = config_options.GetConfigOrDefault(kOrtSessionOptionsEpContextModelExternalInitializersFileName, "");
ORT_RETURN_IF_ERROR(CreateEpContextModel(providers_, graph, ep_context_path, external_ini_file_name, logger));
}
#else
ORT_UNUSED_PARAMETER(config_options);

View file

@ -4175,6 +4175,14 @@ Status Graph::AddExternalInitializersToGraphProtoImpl(
size_t tensor_bytes_size = raw_data.size();
if (tensor_bytes_size < model_saving_options.initializer_size_threshold) {
*output_proto = initializer;
// Data with size above the threshold is written into the new external initializer file
// Data with size below the threshold should be kept inside the new model file
// instead of leaving it in the old external initializer file for the old Onnx file
if (initializer.data_location() == TensorProto_DataLocation_EXTERNAL) {
TensorShape shape(initializer.dims());
output_proto->set_raw_data(raw_data.data(), raw_data.size());
output_proto->clear_data_location();
}
if (process_prepacks) {
// These pre-packs will reside in memory
processed_weights.insert(initializer.name());
@ -4263,6 +4271,7 @@ ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitializers(
// Create the external file.
std::ofstream external_stream(modified_external_file_path, std::ofstream::out | std::ofstream::binary);
auto const external_empty_pos = external_stream.tellp();
ORT_ENFORCE(external_stream.is_open(), "Failed to open for writing:", modified_external_file_path);
int64_t external_offset = 0;
@ -4275,6 +4284,12 @@ ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitializers(
ORT_THROW("Failed to flush file with external initializers: ", modified_external_file_path);
}
// Delete if the external data file is empty
if (external_empty_pos == external_stream.tellp()) {
external_stream.close();
std::remove(modified_external_file_path.string().c_str());
}
return result;
}

View file

@ -7,6 +7,7 @@
#include "core/session/onnxruntime_cxx_api.h"
#include "core/session/onnxruntime_session_options_config_keys.h"
#include "core/session/inference_session.h"
#include "core/graph/model_saving_options.h"
#include "test/providers/qnn/qnn_test_utils.h"
@ -49,19 +50,19 @@ static const std::string& GetNodeAttr(const Node& node, const std::string& attr_
static GetTestModelFn BuildGraphWithQAndNonQ(bool single_ep_node = true) {
return [single_ep_node](ModelTestBuilder& builder) {
// Create non-quantized FusedMatMul node1
NodeArg* input1 = MakeTestInput(builder, TestInputDef<float>({2, 2}, false, {0, 1, 0, 1}));
NodeArg* add1_ini_input2 = MakeTestInput(builder, TestInputDef<float>({2, 2}, true, {0, 0, 0, 0}));
std::vector<float> data(200 * 200, 1.0f);
NodeArg* input1 = MakeTestInput(builder, TestInputDef<float>({200, 200}, false, data));
NodeArg* add1_ini_input2 = MakeTestInput(builder, TestInputDef<float>({200, 200}, true, data));
auto* add1_output = builder.MakeIntermediate();
builder.AddNode("FusedMatMul", {input1, add1_ini_input2}, {add1_output}, kMSDomain);
// Create quantized Add node2
std::vector<float> data = {0.0f, 0.0f, 1.0f, 0.0f};
gsl::span<float> data_range = gsl::make_span(data);
QuantParams<uint8_t> q_parameter = GetDataQuantParams<uint8_t>(data_range);
auto* add2_input1_qdq = AddQDQNodePair<uint8_t>(builder, add1_output, q_parameter.scale, q_parameter.zero_point);
NodeArg* add2_input2 = MakeTestInput(builder, TestInputDef<float>({2, 2}, true, data));
NodeArg* add2_input2 = MakeTestInput(builder, TestInputDef<float>({200, 200}, true, data));
auto* add2_input2_qdq = AddQDQNodePair<uint8_t>(builder, add2_input2, q_parameter.scale, q_parameter.zero_point);
auto* add2_output = builder.MakeIntermediate();
@ -73,7 +74,7 @@ static GetTestModelFn BuildGraphWithQAndNonQ(bool single_ep_node = true) {
AddQDQNodePairWithOutputAsGraphOutput<uint8_t>(builder, add2_output, q_parameter.scale, q_parameter.zero_point);
} else {
auto* add3_input1_qdq = AddQDQNodePair<uint8_t>(builder, add2_output, q_parameter.scale, q_parameter.zero_point);
NodeArg* add3_ini_input2 = MakeTestInput(builder, TestInputDef<float>({2, 2}, true, {0, 0, 0, 0}));
NodeArg* add3_ini_input2 = MakeTestInput(builder, TestInputDef<float>({200, 200}, true, data));
auto* add3_output = builder.MakeIntermediate();
builder.AddNode("FusedMatMul", {add3_input1_qdq, add3_ini_input2}, {add3_output}, kMSDomain);
@ -81,7 +82,7 @@ static GetTestModelFn BuildGraphWithQAndNonQ(bool single_ep_node = true) {
// Create quantized Add node4
auto* add4_input1_qdq = AddQDQNodePair<uint8_t>(builder, add3_output, q_parameter.scale, q_parameter.zero_point);
NodeArg* add4_input2 = MakeTestInput(builder, TestInputDef<float>({2, 2}, true, data));
NodeArg* add4_input2 = MakeTestInput(builder, TestInputDef<float>({200, 200}, true, data));
auto* add4_input2_qdq = AddQDQNodePair<uint8_t>(builder, add4_input2, q_parameter.scale, q_parameter.zero_point);
auto* add4_output = builder.MakeIntermediate();
@ -179,6 +180,75 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryMultiPartitionSupport2) {
QnnContextBinaryMultiPartitionTestBody(single_ep_node);
}
void EpCtxCpuNodeWithExternalIniFileTestBody(bool expect_external_ini_file) {
ProviderOptions provider_options;
#if defined(_WIN32)
provider_options["backend_path"] = "QnnHtp.dll";
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
const std::unordered_map<std::string, int> domain_to_version = {{"", 13}, {kMSDomain, 1}};
auto& logging_manager = DefaultLoggingManager();
logging_manager.SetDefaultLoggerSeverity(logging::Severity::kERROR);
onnxruntime::Model model("QNN_EP_TestModel", false, ModelMetaData(), PathString(),
IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
logging_manager.DefaultLogger());
Graph& graph = model.MainGraph();
ModelTestBuilder helper(graph);
BuildGraphWithQAndNonQ(true)(helper);
helper.SetGraphOutputs();
ASSERT_STATUS_OK(model.MainGraph().Resolve());
ModelSavingOptions model_saving_options{10};
const std::string model_with_ext = "model_external.onnx";
const std::string model_ext_file = "model_external.bin";
ASSERT_STATUS_OK(Model::SaveWithExternalInitializers(model, model_with_ext,
model_ext_file, model_saving_options));
EXPECT_TRUE(std::filesystem::exists(model_with_ext.c_str()));
EXPECT_TRUE(std::filesystem::exists(model_ext_file.c_str()));
Ort::SessionOptions so;
so.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1");
so.AppendExecutionProvider("QNN", provider_options);
const std::string ep_context_model_file = "./qnn_ctx_part_external_ini_ctx.onnx";
so.AddConfigEntry(kOrtSessionOptionEpContextFilePath, ep_context_model_file.c_str());
const std::string external_ini_file = "./qnn_ctx_part_external_ini.bin";
if (expect_external_ini_file) {
// Set the external ini file name will force all initializers to the external file
so.AddConfigEntry(kOrtSessionOptionsEpContextModelExternalInitializersFileName, external_ini_file.c_str());
} // otherwise all initializers are in Onnx file, no external data file generated
Ort::Session session(*ort_env, ToPathString(model_with_ext).c_str(), so);
EXPECT_TRUE(std::filesystem::exists(ep_context_model_file.c_str()));
if (expect_external_ini_file) {
EXPECT_TRUE(std::filesystem::exists(external_ini_file.c_str()));
ASSERT_EQ(std::remove(external_ini_file.c_str()), 0);
} else {
EXPECT_FALSE(std::filesystem::exists(external_ini_file.c_str()));
}
// clean up
ASSERT_EQ(std::remove(model_with_ext.c_str()), 0);
ASSERT_EQ(std::remove(model_ext_file.c_str()), 0);
ASSERT_EQ(std::remove(ep_context_model_file.c_str()), 0);
}
// Set ep.context_model_external_initializers_file_name so the initializer data, including that of
// FusedMatMul (which fallback on CPU), is dumped to the new external file.
// The test body expects that external file to exist after the session is created.
TEST_F(QnnHTPBackendTests, QnnContextBinaryCpuNodeWithExternalWeights) {
EpCtxCpuNodeWithExternalIniFileTestBody(true);
}
// Leave ep.context_model_external_initializers_file_name unset so the initializer data, including
// that of FusedMatMul (which fallback on CPU), is NOT dumped to an external file.
// The test body expects that no external initializer file is generated.
TEST_F(QnnHTPBackendTests, QnnContextBinaryCpuNodeWithoutExternalWeights) {
EpCtxCpuNodeWithExternalIniFileTestBody(false);
}
// Create a model with Cast + Add (quantized)
// cast_input -> Cast -> Q -> DQ \
// Add -> Q -> DQ -> output

View file

@ -183,7 +183,7 @@ class TestInferenceSession(unittest.TestCase):
so.add_session_config_entry(
"session.optimized_model_external_initializers_file_name", external_initializers_file
)
so.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "100")
so.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "20")
onnxrt.InferenceSession(get_name("model_with_orig_ext_data.onnx"), sess_options=so)
self.assertTrue(os.path.isfile(so.optimized_model_filepath))
self.assertTrue(os.path.isfile(os.path.join(directory, external_initializers_file)))
@ -213,14 +213,10 @@ class TestInferenceSession(unittest.TestCase):
"session.optimized_model_external_initializers_file_name", external_initializers_file
)
# TODO(anyone): Set this to 100 will cause test error since some tensor below the threshold
# still refers to the original external data file. We shall fix this issue so that the
# optimized model only refers to one external data file.
so.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "10")
so.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "100")
session1 = onnxrt.InferenceSession(get_name("model_with_orig_ext_data.onnx"), sess_options=so)
del session1
self.assertTrue(os.path.isfile(optimized_model_filepath))
self.assertTrue(os.path.isfile(external_initializers_file))
so2 = onnxrt.SessionOptions()
so2.log_severity_level = 1
@ -240,7 +236,6 @@ class TestInferenceSession(unittest.TestCase):
# Remove model 1 to make sure optimized model 2 can be loaded independently from model 1
os.remove(optimized_model_filepath)
os.remove(external_initializers_file)
session3 = onnxrt.InferenceSession(optimized_model_filepath_2, sess_options=onnxrt.SessionOptions())
del session3