onnxruntime/onnxruntime/test/framework/session_state_test.cc
Dmitri Smirnov 00b262dbb4
Implement pre-packed blobs serialization on disk and their memory mapping on load (#23069)
### Description
<!-- Describe your changes. -->
Pre-packing is a feature, that allows kernels to re-arrange weights data
to gain performance at interference time

Currently, pre-packed blobs are shared when a cross-session weight
sharing is enabled and only for those weights that are marked as shared
by the user. Otherwise, data resides on the heap, the kernels own the
data which may be duplicated.

This change enables pre-packed data to be stored on disk alongside with
the external initializers.
The pre-packed blobs are memory mapped and are loaded into either the
X-session shared container
or a new container that shares pre-packed blobs within the session.

With the new approach, pre-packed blobs are always owned by the shared
container using the existing pre-pack mechanism for sharing. When
X-session sharing is enabled, then the external container owns the data.
A separate container owned by a root `SessionState` owns and shares the
data when X-session sharing is not enabled.

To facilitate this new approach, we introduce a new container that works
in two modes. When an optimized model is being saved, and pre-packed
weights saving is enabled, the new container will record pre-packed
blobs and serialize them to disk using existing
`ToGraphProtoWithExternalInitializers` function.

To externalize the pre-packed weights, we introduce a new session option
`kOrtSessionOptionsSavePrePackedConstantInitializers.` Note, that
pre-packing should be enabled (default) for this to work.

`ToGraphProtoWithExternalInitializers`function is modified to recurse
into subgraphs to make sure we properly account for local initializer
names.

In the second mode, the container would simply hold the pre-packed
weights memory-mapped from disk and share them with the kernels.

### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->
Reduce memory usage by pre-packed initializers and externalize them.
2024-12-20 10:49:08 -08:00

1293 lines
56 KiB
C++

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include <iostream>
#include <absl/base/config.h>
#include "asserts.h"
#include "core/framework/allocator_utils.h"
#include "core/framework/execution_providers.h"
#include "core/framework/graph_partitioner.h"
#include "core/framework/kernel_registry.h"
#include "core/framework/op_kernel.h"
#include "core/framework/bfc_arena.h"
#include "core/framework/session_state.h"
#include "core/graph/graph_utils.h"
#include "core/graph/graph_viewer.h"
#include "core/graph/model.h"
#include "core/graph/model_saving_options.h"
#include "core/graph/op.h"
#include "core/providers/cpu/cpu_execution_provider.h"
#include "core/session/onnxruntime_session_options_config_keys.h"
#include "core/util/thread_utils.h"
#include "gtest/gtest.h"
#include "test/test_environment.h"
#include "test/util/include/default_providers.h"
#include "test/util/include/file_util.h"
#include "core/optimizer/layout_transformation/layout_transformation.h"
using namespace ONNX_NAMESPACE;
namespace onnxruntime {
namespace test {
#ifndef ENABLE_TRAINING_CORE
#ifndef __wasm__
static void TestSavedPrepacks(const Model& model) {
auto inspect = [](const Graph& graph) {
const auto& prepacked_for_graph = graph.GetPrepacked();
const auto& key_to_blob = prepacked_for_graph.GetKeyToBlob();
ASSERT_EQ(1U, key_to_blob.size());
const size_t expected_prepacks_for_writing = (graph.ParentGraph() == nullptr) ? 1U : 0U;
ASSERT_EQ(expected_prepacks_for_writing, prepacked_for_graph.GetNumberOfWeightsForWriting());
const size_t expected_blobs_for_writing = (graph.ParentGraph() == nullptr) ? 1U : 0U;
ASSERT_EQ(expected_blobs_for_writing, prepacked_for_graph.GetNumberOfKeyedBlobsForWriting());
if (graph.ParentGraph() == nullptr) {
const auto* blob_keys = prepacked_for_graph.GetKeysForWeightForSaving("if_shared");
ASSERT_TRUE(blob_keys != nullptr);
ASSERT_EQ(blob_keys->size(), 1U);
const auto* prepacked_weights = prepacked_for_graph.GetPrepackedWeights(*blob_keys->cbegin());
ASSERT_TRUE(prepacked_weights != nullptr);
ASSERT_EQ(prepacked_weights->buffer_sizes_.size(), 1U);
ASSERT_EQ(prepacked_weights->buffer_sizes_[0], sizeof(float) * 2);
}
};
const auto& main_graph = model.MainGraph();
inspect(main_graph);
const auto& nodes = main_graph.Nodes();
auto if_node_hit = std::find_if(nodes.begin(), nodes.end(),
[](const Node& node) { return node.Name() == "if"; });
ASSERT_FALSE(if_node_hit == nodes.end());
const Node& if_node = *if_node_hit;
for (const auto& [_, subgraph] : if_node.GetAttributeNameToSubgraphMap()) {
inspect(*subgraph);
}
}
static void TestLoadedSharedUserSupplied(const Model& model) {
auto inspect = [](const Graph& graph) {
const auto& prepacked_for_graph = graph.GetPrepacked();
constexpr size_t expected_prepacks_for_writing = 0U;
ASSERT_EQ(expected_prepacks_for_writing, prepacked_for_graph.GetNumberOfWeightsForWriting());
// We have not loaded anything since this initializer is user supplied
const auto& key_to_blob = prepacked_for_graph.GetKeyToBlob();
ASSERT_EQ(0U, key_to_blob.size());
};
const auto& main_graph = model.MainGraph();
inspect(main_graph);
const auto& nodes = main_graph.Nodes();
auto if_node_hit = std::find_if(nodes.begin(), nodes.end(),
[](const Node& node) { return node.Name() == "if"; });
ASSERT_FALSE(if_node_hit == nodes.end());
const Node& if_node = *if_node_hit;
for (const auto& [_, subgraph] : if_node.GetAttributeNameToSubgraphMap()) {
inspect(*subgraph);
}
}
static void TestLoadedSharedNoUserSupplied(const Model& model) {
auto inspect = [](const Graph& graph) {
const auto& prepacked_for_graph = graph.GetPrepacked();
constexpr size_t expected_prepacks_for_writing = 0U;
ASSERT_EQ(expected_prepacks_for_writing, prepacked_for_graph.GetNumberOfWeightsForWriting());
// We have not loaded anything since this initializer is user supplied
const auto& key_to_blob = prepacked_for_graph.GetKeyToBlob();
ASSERT_EQ(1U, key_to_blob.size());
};
const auto& main_graph = model.MainGraph();
inspect(main_graph);
const auto& nodes = main_graph.Nodes();
auto if_node_hit = std::find_if(nodes.begin(), nodes.end(),
[](const Node& node) { return node.Name() == "if"; });
ASSERT_FALSE(if_node_hit == nodes.end());
const Node& if_node = *if_node_hit;
for (const auto& [_, subgraph] : if_node.GetAttributeNameToSubgraphMap()) {
inspect(*subgraph);
}
}
#endif // __wasm__
#endif // ENABLE_TRAINING_CORE
class TestOpKernel : public OpKernel {
public:
TestOpKernel(const OpKernelInfo& p) : OpKernel(p) {
}
Status Compute(OpKernelContext* context) const override {
ORT_UNUSED_PARAMETER(context);
return Status::OK();
}
Status ComputeAsync(OpKernelContext* context, DoneCallback done) const override {
ORT_UNUSED_PARAMETER(context);
ORT_UNUSED_PARAMETER(done);
return Status::OK();
}
};
class SessionStateAddGetKernelTest : public testing::TestWithParam<int> {};
TEST_P(SessionStateAddGetKernelTest, AddGetKernelTest) {
OrtThreadPoolParams to;
to.thread_pool_size = GetParam();
auto tp = concurrency::CreateThreadPool(&onnxruntime::Env::Default(), to, concurrency::ThreadPoolType::INTRA_OP);
ONNX_OPERATOR_SCHEMA(Variable)
.SetDoc("Input variable.")
.Output(0, "output_1", "docstr for output_1.", "tensor(int32)");
onnxruntime::Model model("graph_1", false, DefaultLoggingManager().DefaultLogger());
auto& graph = model.MainGraph();
ExecutionProviders execution_providers;
auto tmp_cpu_execution_provider = std::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo(false));
auto* cpu_execution_provider = tmp_cpu_execution_provider.get();
ASSERT_STATUS_OK(execution_providers.Add(kCpuExecutionProvider, std::move(tmp_cpu_execution_provider)));
DataTransferManager dtm;
ExternalDataLoaderManager edlm;
profiling::Profiler profiler;
SessionOptions sess_options;
sess_options.enable_mem_pattern = true;
sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL;
sess_options.use_deterministic_compute = false;
sess_options.enable_mem_reuse = true;
SessionState s(graph, execution_providers, tp.get(), nullptr, dtm, edlm,
DefaultLoggingManager().DefaultLogger(), profiler, sess_options);
std::vector<onnxruntime::NodeArg*> inputs;
std::vector<onnxruntime::NodeArg*> outputs;
TypeProto output_type;
output_type.mutable_tensor_type()->set_elem_type(TensorProto_DataType_INT32);
output_type.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1);
onnxruntime::NodeArg output_arg("node_1_out_1", &output_type);
outputs.push_back(&output_arg);
onnxruntime::Node& node = graph.AddNode("node_1", "Variable", "node 1.", inputs, outputs);
auto status = graph.Resolve();
ASSERT_TRUE(status.IsOK());
auto kernel_def = KernelDefBuilder().SetName("Variable").Provider(kCpuExecutionProvider).SinceVersion(1, 10).Build();
OpKernelInfo p_info(node, *kernel_def, *cpu_execution_provider, s.GetConstantInitializedTensors(),
s.GetOrtValueNameIdxMap(), s.GetDataTransferMgr(), s.GetAllocators(),
s.GetSessionOptions().config_options);
std::unique_ptr<TestOpKernel> p_kernel = std::make_unique<TestOpKernel>(p_info);
size_t orig_num_outputs = p_kernel->Node().OutputDefs().size();
std::cout << "node_idx: " << node.Index() << std::endl;
KernelRegistryManager kernel_registry_manager;
status = kernel_registry_manager.RegisterKernels(execution_providers);
ASSERT_TRUE(status.IsOK()) << status.ErrorMessage();
node.SetExecutionProviderType(kCpuExecutionProvider);
std::shared_ptr<KernelRegistry> kernel_registry = std::make_shared<KernelRegistry>();
ASSERT_STATUS_OK(kernel_registry->Register(
KernelCreateInfo(std::move(kernel_def),
[](FuncManager&, const OpKernelInfo& info, std::unique_ptr<OpKernel>& out) -> Status {
out = std::make_unique<TestOpKernel>(info);
return Status::OK();
})));
kernel_registry_manager.RegisterKernelRegistry(kernel_registry);
ASSERT_STATUS_OK(s.FinalizeSessionState(ORT_TSTR(""), kernel_registry_manager));
auto test_kernel = s.GetKernel(node.Index());
std::cout << "orig: " << orig_num_outputs << " new: " << test_kernel->Node().OutputDefs().size() << std::endl;
EXPECT_EQ(orig_num_outputs, test_kernel->Node().OutputDefs().size());
}
INSTANTIATE_TEST_SUITE_P(SessionStateTests, SessionStateAddGetKernelTest, testing::Values(0, 1));
class TestParam {
public:
int ir_version;
bool enable_mem_pattern;
int thread_count;
};
TestParam param_list[] = {
{3, true, 0},
{4, true, 0},
{3, false, 0},
{4, false, 0},
{3, true, 1},
{4, true, 1},
{3, false, 1},
{4, false, 1}};
class SessionStateTestP : public testing::TestWithParam<TestParam> {};
// Test that we separate out constant and non-constant initializers correctly
TEST_P(SessionStateTestP, TestInitializerProcessing) {
const TestParam& param = GetParam();
OrtThreadPoolParams to;
to.thread_pool_size = to.thread_pool_size;
auto tp = concurrency::CreateThreadPool(&onnxruntime::Env::Default(), to, concurrency::ThreadPoolType::INTRA_OP);
std::basic_ostringstream<ORTCHAR_T> oss;
oss << ORT_TSTR("testdata/optional_inputs_ir") << param.ir_version << ORT_TSTR(".onnx");
Status status;
std::shared_ptr<Model> model;
ASSERT_TRUE((status = Model::Load(oss.str(), model, nullptr, DefaultLoggingManager().DefaultLogger())).IsOK())
<< status;
Graph& graph = model->MainGraph();
// take a copy as this gets cleared during session state initialization
InitializedTensorSet initializers = graph.GetAllInitializedTensors();
ExecutionProviders execution_providers;
CPUExecutionProviderInfo epi{false};
status =
execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique<CPUExecutionProvider>(epi));
ASSERT_TRUE(status.IsOK()) << status;
KernelRegistryManager krm;
status = krm.RegisterKernels(execution_providers);
ASSERT_TRUE(status.IsOK()) << status;
DataTransferManager dtm;
ExternalDataLoaderManager edlm;
profiling::Profiler profiler;
SessionOptions sess_options;
sess_options.enable_mem_pattern = param.enable_mem_pattern;
sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL;
sess_options.use_deterministic_compute = false;
sess_options.enable_mem_reuse = true;
SessionState session_state(graph, execution_providers, tp.get(), nullptr, dtm, edlm,
DefaultLoggingManager().DefaultLogger(), profiler, sess_options);
GraphPartitioner partitioner(krm, execution_providers);
ASSERT_STATUS_OK(
partitioner.Partition(
graph, session_state.GetMutableFuncMgr(),
[](Graph& graph, bool& modified, const IExecutionProvider& execution_provider,
const layout_transformation::DebugGraphFn& debug_graph_fn) -> Status {
AllocatorPtr cpu_allocator = std::make_shared<CPUAllocator>();
return layout_transformation::TransformLayoutForEP(
graph, modified, execution_provider, std::move(cpu_allocator), debug_graph_fn);
},
sess_options.config_options,
DefaultLoggingManager().DefaultLogger()));
ASSERT_STATUS_OK(session_state.FinalizeSessionState(oss.str(), krm));
const auto& initialized_tensors = session_state.GetInitializedTensors();
const auto& const_initialized_tensors = session_state.GetConstantInitializedTensors();
ASSERT_EQ(initializers.size(), initialized_tensors.size())
<< "SessionState should have an entry for all initializers in Graph.";
if (param.ir_version < 4) {
ASSERT_EQ(initialized_tensors.size(), const_initialized_tensors.size())
<< "All initializers should be considered constant if IR version < 4.";
} else {
const auto& name_to_idx = session_state.GetOrtValueNameIdxMap();
for (const auto& entry : initializers) {
int idx;
ASSERT_STATUS_OK(name_to_idx.GetIdx(entry.first, idx));
bool found = initialized_tensors.find(idx) != initialized_tensors.cend();
ASSERT_TRUE(found) << "Missing entry for " << entry.first << " in session state initialized tensors";
if (graph_utils::IsConstantInitializer(graph, entry.first, false)) {
found = const_initialized_tensors.find(idx) != const_initialized_tensors.cend();
ASSERT_TRUE(found) << "Missing entry for " << entry.first << " in session state const initialized tensors";
}
}
}
}
// Test that we allocate memory for an initializer from non-arena memory even if we provide an arena-based allocator
// if the relevant session option config flag is set
TEST(SessionStateTest, TestInitializerMemoryAllocatedUsingNonArenaMemory) {
// For this test we need to enable the arena-based allocator.
if (!DoesCpuAllocatorSupportArenaUsage()) {
GTEST_SKIP() << "CPU allocator does not support arena usage.";
}
AllocatorPtr cpu_allocator = std::make_shared<CPUAllocator>();
// Part 1: Feature turned ON (i.e.) allocate from non-arena memory
{
std::basic_ostringstream<ORTCHAR_T> oss;
oss << ORT_TSTR("testdata/mul_1.onnx");
Status status;
std::shared_ptr<Model> model;
ASSERT_TRUE((status = Model::Load(oss.str(), model, nullptr, DefaultLoggingManager().DefaultLogger())).IsOK())
<< status;
Graph& graph = model->MainGraph();
ExecutionProviders execution_providers;
CPUExecutionProviderInfo epi{true}; // use an arena-based allocator for this EP
status = execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique<CPUExecutionProvider>(epi));
ASSERT_TRUE(status.IsOK()) << status;
KernelRegistryManager krm;
status = krm.RegisterKernels(execution_providers);
ASSERT_TRUE(status.IsOK()) << status;
DataTransferManager dtm;
ExternalDataLoaderManager edlm;
profiling::Profiler profiler;
SessionOptions sess_options;
sess_options.enable_mem_pattern = false;
sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL;
sess_options.use_deterministic_compute = false;
sess_options.enable_mem_reuse = true;
// disable allocating initialized tensor memory from the arena(by default it will be allocated by the arena)
ASSERT_STATUS_OK(sess_options.config_options.AddConfigEntry(kOrtSessionOptionsUseDeviceAllocatorForInitializers,
"1"));
SessionState session_state(graph, execution_providers, nullptr, nullptr, dtm, edlm,
DefaultLoggingManager().DefaultLogger(), profiler, sess_options);
// Partition the graph
GraphPartitioner partitioner(krm, execution_providers);
ASSERT_STATUS_OK(partitioner.Partition(
graph, session_state.GetMutableFuncMgr(),
[&cpu_allocator](Graph& graph, bool& modified, const IExecutionProvider& execution_provider,
const layout_transformation::DebugGraphFn& debug_graph_fn) -> Status {
return layout_transformation::TransformLayoutForEP(graph, modified, execution_provider,
cpu_allocator, debug_graph_fn);
},
sess_options.config_options,
DefaultLoggingManager().DefaultLogger()));
ASSERT_STATUS_OK(session_state.FinalizeSessionState(oss.str(), krm));
// Fetch the CPU arena-allocator from the session state
OrtMemoryInfo mem_info(CPU, OrtArenaAllocator);
AllocatorPtr alloc = session_state.GetAllocator(mem_info);
ASSERT_TRUE(alloc != nullptr);
// Get stats for the CPU arena-based allocator
AllocatorStats alloc_stats;
static_cast<BFCArena*>(alloc.get())->GetStats(&alloc_stats);
// Assert that we have made 1 Reserve() call (for allocating memory for the sole initializer in the model)
ASSERT_EQ(alloc_stats.num_reserves, 1);
}
// Part 2: Feature turned OFF (i.e.) allocate from arena memory (default behavior)
{
std::basic_ostringstream<ORTCHAR_T> oss;
oss << ORT_TSTR("testdata/mul_1.onnx");
Status status;
std::shared_ptr<Model> model;
ASSERT_TRUE((status = Model::Load(oss.str(), model, nullptr, DefaultLoggingManager().DefaultLogger())).IsOK())
<< status;
Graph& graph = model->MainGraph();
ExecutionProviders execution_providers;
CPUExecutionProviderInfo epi{true}; // use an arena-based allocator for this EP
status = execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique<CPUExecutionProvider>(epi));
ASSERT_TRUE(status.IsOK()) << status;
KernelRegistryManager krm;
status = krm.RegisterKernels(execution_providers);
ASSERT_TRUE(status.IsOK()) << status;
DataTransferManager dtm;
ExternalDataLoaderManager edlm;
profiling::Profiler profiler;
SessionOptions sess_options;
sess_options.enable_mem_pattern = false;
sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL;
sess_options.use_deterministic_compute = false;
sess_options.enable_mem_reuse = true;
SessionState session_state(graph, execution_providers, nullptr, nullptr, dtm, edlm,
DefaultLoggingManager().DefaultLogger(), profiler, sess_options);
// Partition the graph
GraphPartitioner partitioner(krm, execution_providers);
ASSERT_STATUS_OK(partitioner.Partition(
graph, session_state.GetMutableFuncMgr(),
[&cpu_allocator](Graph& graph, bool& modified,
const IExecutionProvider& execution_provider,
const layout_transformation::DebugGraphFn& debug_graph_fn) -> Status {
return layout_transformation::TransformLayoutForEP(
graph, modified, execution_provider, cpu_allocator, debug_graph_fn);
},
sess_options.config_options,
DefaultLoggingManager().DefaultLogger()));
// Finalize the session state
ASSERT_STATUS_OK(session_state.FinalizeSessionState(oss.str(), krm));
// Fetch the CPU arena-allocator from the session state
OrtMemoryInfo mem_info(CPU, OrtArenaAllocator);
AllocatorPtr alloc = session_state.GetAllocator(mem_info);
ASSERT_TRUE(alloc != nullptr);
// Get stats for the CPU arena-based allocator
AllocatorStats alloc_stats;
static_cast<BFCArena*>(alloc.get())->GetStats(&alloc_stats);
// Assert that we have made no Reserve() calls
ASSERT_EQ(alloc_stats.num_reserves, 0);
// Assert to ensure an allocation was made for the initializer through the arena allocator (Alloc() was invoked)
ASSERT_EQ(alloc_stats.num_allocs, 1);
}
}
INSTANTIATE_TEST_SUITE_P(SessionStateTests, SessionStateTestP, testing::ValuesIn(param_list));
#ifndef ENABLE_TRAINING_CORE
class PrePackingTestOpKernel : public OpKernel {
public:
PrePackingTestOpKernel(const OpKernelInfo& info) : OpKernel(info) {}
Status Compute(OpKernelContext* context) const override {
ORT_UNUSED_PARAMETER(context);
return Status::OK();
}
Status UseSharedPrePackedBuffers(std::vector<BufferUniquePtr>& prepacked_buffers,
int input_idx,
/*out*/ bool& used_shared_buffers) override {
ORT_UNUSED_PARAMETER(input_idx);
weight_packed_ = std::move(prepacked_buffers[0]);
used_shared_buffers = true;
++store_pre_packed_weight_calls_count;
return Status::OK();
}
Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
/*out*/ bool& is_packed, /*out*/ PrePackedWeights* prepacked_weights) override {
ORT_UNUSED_PARAMETER(tensor);
ORT_UNUSED_PARAMETER(input_idx);
constexpr const size_t weight_packed_len = sizeof(float) * 2;
weight_packed_ = IAllocator::MakeUniquePtr<void>(alloc, weight_packed_len, true);
float* data_weights_packed = reinterpret_cast<float*>(weight_packed_.get());
data_weights_packed[0] = 1.2345f;
data_weights_packed[1] = data_weights_packed[0] * 2.f;
if (prepacked_weights != nullptr) {
prepacked_weights->buffers_.push_back(std::move(weight_packed_));
prepacked_weights->buffer_sizes_.push_back(weight_packed_len);
}
is_packed = true;
++prepack_calls_count;
return Status::OK();
}
int prepack_calls_count = 0;
int store_pre_packed_weight_calls_count = 0;
IAllocatorUniquePtr<void> weight_packed_;
};
static void CreateSimpleGraph(Graph& graph) {
// node creation and placement
TypeProto type;
type.mutable_tensor_type()->set_elem_type(TensorProto_DataType_FLOAT);
type.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1);
std::vector<onnxruntime::NodeArg*> inputs;
onnxruntime::NodeArg input_0_arg("node_0_input_0", &type);
onnxruntime::NodeArg input_1_arg("node_0_input_1", &type);
inputs.push_back(&input_0_arg);
inputs.push_back(&input_1_arg);
std::vector<onnxruntime::NodeArg*> outputs;
onnxruntime::NodeArg output_arg("node_0_output_0", &type);
outputs.push_back(&output_arg);
graph.AddNode("node_0", "PrePackingTest", "node 0", inputs, outputs);
// add an initializer
ONNX_NAMESPACE::TensorProto tensor;
tensor.add_dims(1);
tensor.add_float_data(1.0f);
tensor.set_data_type(TensorProto_DataType_FLOAT);
tensor.set_name("node_0_input_1");
graph.AddInitializedTensor(tensor);
auto status = graph.Resolve();
ASSERT_TRUE(status.IsOK());
}
static const ONNX_NAMESPACE::GraphProto CreateSubgraph(bool then_branch) {
Model model(then_branch ? "If_then" : "If_else", false, DefaultLoggingManager().DefaultLogger());
auto& graph = model.MainGraph();
std::vector<NodeArg*> inputs;
std::vector<NodeArg*> outputs;
const std::string suffix = then_branch ? "0" : "1";
// graph input has to have type and rank even though it's an outer scope value.
TypeProto type_float;
type_float.mutable_tensor_type()->set_elem_type(TensorProto_DataType_FLOAT);
type_float.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1);
// outer scope values
auto& if_shared = graph.GetOrCreateNodeArg("if_shared", &type_float);
auto& if_input = graph.GetOrCreateNodeArg("if_input_" + suffix, &type_float);
// add so that we don't end up with it being considered a graph input
graph.AddOuterScopeNodeArg("if_shared");
graph.AddOuterScopeNodeArg("if_input_" + suffix);
auto& if_out = graph.GetOrCreateNodeArg("if_output_" + suffix, &type_float);
inputs = {&if_shared, &if_input};
outputs = {&if_out};
graph.AddNode("if_node_" + suffix, "PrePackingTest", "if node " + suffix, inputs, outputs);
auto status = graph.Resolve();
EXPECT_EQ(status, Status::OK());
auto& proto = graph.ToGraphProto();
return proto;
}
static void CreateGraphWithSubgraph(Graph& graph) {
TypeProto type_float;
type_float.mutable_tensor_type()->set_elem_type(TensorProto_DataType_FLOAT);
type_float.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1);
{
std::vector<onnxruntime::NodeArg*> inputs;
onnxruntime::NodeArg input_0_arg("if_input_0", &type_float);
onnxruntime::NodeArg input_1_arg("if_input_1", &type_float);
inputs.push_back(&input_0_arg);
inputs.push_back(&input_1_arg);
std::vector<onnxruntime::NodeArg*> outputs;
onnxruntime::NodeArg output_arg("node_0_output_0", &type_float);
outputs.push_back(&output_arg);
graph.AddNode("node_0", "PrePackingTest", "node 0", inputs, outputs);
}
{
TypeProto type_bool;
type_bool.mutable_tensor_type()->set_elem_type(TensorProto_DataType_BOOL);
type_bool.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1);
onnxruntime::NodeArg bool_arg("bool_arg", &type_bool);
std::vector<onnxruntime::NodeArg*> outputs;
onnxruntime::NodeArg output_arg("output_arg", &type_float);
outputs.push_back(&output_arg);
auto& if_node = graph.AddNode("if", "If", "If node", {&bool_arg}, outputs);
auto then_proto = CreateSubgraph(true);
auto else_proto = CreateSubgraph(false);
if_node.AddAttribute("then_branch", then_proto);
if_node.AddAttribute("else_branch", else_proto);
}
// add an initializer
ONNX_NAMESPACE::TensorProto tensor;
tensor.add_dims(1);
tensor.add_float_data(1.0f);
tensor.set_data_type(TensorProto_DataType_FLOAT);
tensor.set_name("if_shared");
graph.AddInitializedTensor(tensor);
auto status = graph.Resolve();
ASSERT_TRUE(status.IsOK());
}
static void PlaceAllNodesToCPUEP(Graph& graph) {
for (auto& node : graph.Nodes()) {
node.SetExecutionProviderType(kCpuExecutionProvider);
if (node.ContainsSubgraph()) {
for (auto& entry : node.GetAttributeNameToMutableSubgraphMap()) {
Graph* subgraph = entry.second;
PlaceAllNodesToCPUEP(*subgraph);
}
}
}
}
struct PrepackingTestParam {
bool test_subgraph;
bool test_prepacking;
};
class SessionStatePrepackingTest : public testing::TestWithParam<PrepackingTestParam> {};
TEST_P(SessionStatePrepackingTest, PrePackingTest) {
PrepackingTestParam test_param = GetParam();
OrtThreadPoolParams to;
auto tp = concurrency::CreateThreadPool(&onnxruntime::Env::Default(), to, concurrency::ThreadPoolType::INTRA_OP);
ONNX_OPERATOR_SCHEMA(PrePackingTest)
.SetDoc("Faking Node for PrePacking")
.Input(0, "Input_0", "input 0", "tensor(float)")
.Input(1, "Input_1", "input 1", "tensor(float)")
.Output(0, "output_0", "docstr for output_0.", "tensor(float)");
ExecutionProviders execution_providers;
auto cpu_execution_provider = std::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo(false));
ASSERT_STATUS_OK(execution_providers.Add(kCpuExecutionProvider, std::move(cpu_execution_provider)));
DataTransferManager dtm;
ExternalDataLoaderManager edlm;
profiling::Profiler profiler;
std::unordered_map<std::string, int> domain_to_version;
domain_to_version[kOnnxDomain] = 11;
Model model("graph_main", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(),
domain_to_version, std::vector<ONNX_NAMESPACE::FunctionProto>(),
DefaultLoggingManager().DefaultLogger());
// onnxruntime::Model model("graph_main", false, DefaultLoggingManager().DefaultLogger());
if (test_param.test_subgraph) {
CreateGraphWithSubgraph(model.MainGraph());
} else {
CreateSimpleGraph(model.MainGraph());
}
SessionOptions sess_options;
sess_options.enable_mem_pattern = true;
sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL;
sess_options.use_deterministic_compute = false;
sess_options.enable_mem_reuse = true;
sess_options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] =
test_param.test_prepacking ? "0" : "1";
SessionState session_state(model.MainGraph(),
execution_providers,
tp.get(),
nullptr, /*inter_op_thread_pool*/
dtm,
edlm,
DefaultLoggingManager().DefaultLogger(),
profiler,
sess_options);
KernelRegistryManager kernel_registry_manager;
Status status = kernel_registry_manager.RegisterKernels(execution_providers);
ASSERT_TRUE(status.IsOK()) << status.ErrorMessage();
std::shared_ptr<KernelRegistry> kernel_registry = std::make_shared<KernelRegistry>();
auto kernel_def = KernelDefBuilder().SetName("PrePackingTest").Provider(kCpuExecutionProvider).SinceVersion(1).Build();
ASSERT_STATUS_OK(kernel_registry->Register(
KernelCreateInfo(std::move(kernel_def),
[](FuncManager&, const OpKernelInfo& info, std::unique_ptr<OpKernel>& out) -> Status {
out = std::make_unique<PrePackingTestOpKernel>(info);
return Status::OK();
})));
kernel_registry_manager.RegisterKernelRegistry(kernel_registry);
PlaceAllNodesToCPUEP(model.MainGraph());
ASSERT_STATUS_OK(session_state.FinalizeSessionState(std::basic_string<PATH_CHAR_TYPE>(),
kernel_registry_manager));
const auto& const_initialized_tensors = session_state.GetConstantInitializedTensors();
// check prepacking
ASSERT_EQ(const_initialized_tensors.size(), size_t(test_param.test_prepacking ? 0 : 1));
}
class SessionStateTestSharedInitalizersWithPrePacking : public ::testing::Test {
protected:
ExecutionProviders execution_providers;
std::unordered_map<std::string, int> domain_to_version;
DataTransferManager dtm;
ExternalDataLoaderManager edlm;
profiling::Profiler profiler;
KernelRegistryManager kernel_registry_manager;
std::unique_ptr<concurrency::ThreadPool> tp;
void SetUp() override {
OrtThreadPoolParams to;
tp = concurrency::CreateThreadPool(&onnxruntime::Env::Default(), to, concurrency::ThreadPoolType::INTRA_OP);
ONNX_OPERATOR_SCHEMA(PrePackingTest)
.SetDoc("Faking Node for PrePacking")
.Input(0, "Input_0", "input 0", "tensor(float)")
.Input(1, "Input_1", "input 1", "tensor(float)")
.Output(0, "output_0", "docstr for output_0.", "tensor(float)");
auto cpu_execution_provider = std::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo(false));
ASSERT_STATUS_OK(execution_providers.Add(kCpuExecutionProvider, std::move(cpu_execution_provider)));
domain_to_version[kOnnxDomain] = 11;
ASSERT_STATUS_OK(kernel_registry_manager.RegisterKernels(execution_providers));
std::shared_ptr<KernelRegistry> kernel_registry = std::make_shared<KernelRegistry>();
auto kernel_def = KernelDefBuilder()
.SetName("PrePackingTest")
.Provider(kCpuExecutionProvider)
.SinceVersion(1)
.Build();
ASSERT_STATUS_OK(kernel_registry->Register(
KernelCreateInfo(std::move(kernel_def),
[](FuncManager&, const OpKernelInfo& info, std::unique_ptr<OpKernel>& out) -> Status { out = std::make_unique<PrePackingTestOpKernel>(info); return Status::OK(); })));
kernel_registry_manager.RegisterKernelRegistry(kernel_registry);
}
};
// Pre-packing enabled + no shared initializers, however, we put all the pre-packs
// in a session_state container for ownership.
TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test1) {
SessionOptions sess_options;
sess_options.enable_mem_pattern = true;
sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL;
sess_options.use_deterministic_compute = false;
sess_options.enable_mem_reuse = true;
// Enable pre-packing
sess_options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] = "0";
// First session/model
Model model_1("graph_main", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(),
domain_to_version, std::vector<ONNX_NAMESPACE::FunctionProto>(),
DefaultLoggingManager().DefaultLogger());
CreateSimpleGraph(model_1.MainGraph());
PlaceAllNodesToCPUEP(model_1.MainGraph());
SessionState session_state_1(model_1.MainGraph(),
execution_providers,
tp.get(),
nullptr, /*inter_op_thread_pool*/
dtm,
edlm,
DefaultLoggingManager().DefaultLogger(),
profiler,
sess_options);
ASSERT_STATUS_OK(session_state_1.FinalizeSessionState(std::basic_string<PATH_CHAR_TYPE>(),
kernel_registry_manager));
const auto* kernel = reinterpret_cast<const PrePackingTestOpKernel*>(session_state_1.GetKernel(0));
// Assert that a pre-pack call was made. However, they sharing call is still made from a serialized container.
ASSERT_EQ(session_state_1.GetNumberOfPrepacksCounter(), static_cast<size_t>(1));
ASSERT_EQ(kernel->prepack_calls_count, 1);
// In this case the sharing comes from the serialized container
ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 1);
// Second session/model
Model model_2("graph_main", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(),
domain_to_version, std::vector<ONNX_NAMESPACE::FunctionProto>(),
DefaultLoggingManager().DefaultLogger());
CreateSimpleGraph(model_2.MainGraph());
PlaceAllNodesToCPUEP(model_2.MainGraph());
SessionState session_state_2(model_2.MainGraph(),
execution_providers,
tp.get(),
nullptr, /*inter_op_thread_pool*/
dtm,
edlm,
DefaultLoggingManager().DefaultLogger(),
profiler,
sess_options);
ASSERT_STATUS_OK(session_state_2.FinalizeSessionState(std::basic_string<PATH_CHAR_TYPE>(),
kernel_registry_manager));
kernel = reinterpret_cast<const PrePackingTestOpKernel*>(session_state_2.GetKernel(0));
// Assert that a pre-pack call was made. The weights are still shared from the serialized container
// either because they are loaded from disk or because the container takes ownership of them.
ASSERT_EQ(session_state_2.GetNumberOfPrepacksCounter(), static_cast<size_t>(1));
ASSERT_EQ(kernel->prepack_calls_count, 1);
ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 1);
}
// Pre-packing enabled + shared initializers + no pre-packed weights container = no pre-packed weights caching
TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test2) {
SessionOptions sess_options;
sess_options.enable_mem_pattern = true;
sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL;
sess_options.use_deterministic_compute = false;
sess_options.enable_mem_reuse = true;
// Enable pre-packing
sess_options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] = "0";
// Enable shared initializer
OrtMemoryInfo mem_info(CPU, OrtDeviceAllocator);
std::vector<float> float_data(1, 1);
auto value = std::make_unique<OrtValue>();
Tensor::InitOrtValue(DataTypeImpl::GetType<float>(),
TensorShape(std::vector<int64_t>{1}), reinterpret_cast<void*>(float_data.data()),
mem_info, *value);
ASSERT_STATUS_OK(sess_options.AddInitializer("node_0_input_1", value.get()));
// First session/model
Model model_1("graph_main", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(),
domain_to_version, std::vector<ONNX_NAMESPACE::FunctionProto>(),
DefaultLoggingManager().DefaultLogger());
CreateSimpleGraph(model_1.MainGraph());
PlaceAllNodesToCPUEP(model_1.MainGraph());
SessionState session_state_1(model_1.MainGraph(),
execution_providers,
tp.get(),
nullptr, /*inter_op_thread_pool*/
dtm,
edlm,
DefaultLoggingManager().DefaultLogger(),
profiler,
sess_options);
ASSERT_STATUS_OK(session_state_1.FinalizeSessionState(std::basic_string<PATH_CHAR_TYPE>(),
kernel_registry_manager));
const auto* kernel = reinterpret_cast<const PrePackingTestOpKernel*>(session_state_1.GetKernel(0));
// Assert that a pre-pack call was made, but sharing still takes place from the serialized container
ASSERT_EQ(session_state_1.GetNumberOfPrepacksCounter(), static_cast<size_t>(1));
ASSERT_EQ(kernel->prepack_calls_count, 1);
ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 1);
// Second session/model
Model model_2("graph_main", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(),
domain_to_version, std::vector<ONNX_NAMESPACE::FunctionProto>(),
DefaultLoggingManager().DefaultLogger());
CreateSimpleGraph(model_2.MainGraph());
PlaceAllNodesToCPUEP(model_2.MainGraph());
SessionState session_state_2(model_2.MainGraph(),
execution_providers,
tp.get(),
nullptr, /*inter_op_thread_pool*/
dtm,
edlm,
DefaultLoggingManager().DefaultLogger(),
profiler,
sess_options);
ASSERT_STATUS_OK(session_state_2.FinalizeSessionState(std::basic_string<PATH_CHAR_TYPE>(),
kernel_registry_manager));
kernel = reinterpret_cast<const PrePackingTestOpKernel*>(session_state_2.GetKernel(0));
// Assert that a pre-pack call was made, but sharing still takes place from the serialized container
ASSERT_EQ(session_state_2.GetNumberOfPrepacksCounter(), static_cast<size_t>(1));
ASSERT_EQ(kernel->prepack_calls_count, 1);
ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 1);
}
// Pre-packing enabled + shared initializers + pre-packed weights container = pre-packed weights caching enabled
TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test3) {
SessionOptions sess_options;
sess_options.enable_mem_pattern = true;
sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL;
sess_options.use_deterministic_compute = false;
sess_options.enable_mem_reuse = true;
// Enable pre-packing
sess_options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] = "0";
// Enable shared initializer
OrtMemoryInfo mem_info(CPU, OrtDeviceAllocator);
std::vector<float> float_data(1, 1);
auto value = std::make_unique<OrtValue>();
Tensor::InitOrtValue(DataTypeImpl::GetType<float>(), TensorShape(std::vector<int64_t>{1}),
reinterpret_cast<void*>(float_data.data()), mem_info, *value);
ASSERT_STATUS_OK(sess_options.AddInitializer("node_0_input_1", value.get()));
// Enable pre-packed weights container
PrepackedWeightsContainer prepacked_weights_container;
// First session/model
Model model_1("graph_main", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(),
domain_to_version, std::vector<ONNX_NAMESPACE::FunctionProto>(),
DefaultLoggingManager().DefaultLogger());
CreateSimpleGraph(model_1.MainGraph());
PlaceAllNodesToCPUEP(model_1.MainGraph());
SessionState session_state_1(model_1.MainGraph(),
execution_providers,
tp.get(),
nullptr, /*inter_op_thread_pool*/
dtm,
edlm,
DefaultLoggingManager().DefaultLogger(),
profiler,
sess_options,
&prepacked_weights_container);
ASSERT_STATUS_OK(session_state_1.FinalizeSessionState(std::basic_string<PATH_CHAR_TYPE>(),
kernel_registry_manager));
const auto* kernel = reinterpret_cast<const PrePackingTestOpKernel*>(session_state_1.GetKernel(0));
// Assert that a pre-pack call was made
ASSERT_EQ(session_state_1.GetNumberOfPrepacksCounter(), static_cast<size_t>(1));
ASSERT_EQ(kernel->prepack_calls_count, 1);
// Assert that we made a call to store pre-packed weight from a shared container
ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 1);
// The weight to be "stored" is the same weight that we got by invoking PrePack() in the step above.
// Hence, assert that it wasn't a "cached" pre-packed weight (i.e.) pre-packed weight
// from another instance of the same op_type consuming the same constant initializer.
ASSERT_EQ(session_state_1.GetUsedSharedPrePackedWeightCounter(), static_cast<size_t>(0));
// Second session/model
Model model_2("graph_main", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(),
domain_to_version, std::vector<ONNX_NAMESPACE::FunctionProto>(),
DefaultLoggingManager().DefaultLogger());
CreateSimpleGraph(model_2.MainGraph());
PlaceAllNodesToCPUEP(model_2.MainGraph());
SessionState session_state_2(model_2.MainGraph(),
execution_providers,
tp.get(),
nullptr, /*inter_op_thread_pool*/
dtm,
edlm,
DefaultLoggingManager().DefaultLogger(),
profiler,
sess_options,
&prepacked_weights_container);
ASSERT_STATUS_OK(session_state_2.FinalizeSessionState(std::basic_string<PATH_CHAR_TYPE>(),
kernel_registry_manager));
// Assert that a pre-pack call was made
ASSERT_EQ(session_state_2.GetNumberOfPrepacksCounter(), static_cast<size_t>(1));
ASSERT_EQ(kernel->prepack_calls_count, 1);
// Assert that we made a call to store pre-packed weight from a shared container
ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 1);
// The weight to be "stored" is a "cached" weight (i.e.) a pre-packed weight
// from another instance of the same op_type consuming the same constant initializer.
// Assert this.
ASSERT_EQ(session_state_2.GetUsedSharedPrePackedWeightCounter(), static_cast<size_t>(1));
}
// Pre-packing enabled + shared initializers +
// pre-packed weights container + subgraphs =
// caching enabled in pre-packed weights used in subgraphs
TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test4) {
SessionOptions sess_options;
sess_options.enable_mem_pattern = true;
sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL;
sess_options.use_deterministic_compute = false;
sess_options.enable_mem_reuse = true;
// Enable pre-packing
sess_options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] = "0";
// Enable shared initializer
OrtMemoryInfo mem_info(CPU, OrtDeviceAllocator);
std::vector<float> float_data(1, 1);
auto value = std::make_unique<OrtValue>();
Tensor::InitOrtValue(DataTypeImpl::GetType<float>(), TensorShape(std::vector<int64_t>{1}),
reinterpret_cast<void*>(float_data.data()), mem_info, *value);
ASSERT_STATUS_OK(sess_options.AddInitializer("if_shared", value.get()));
// Enable pre-packed weights container
PrepackedWeightsContainer prepacked_weights_container;
// First session/model
Model model_1("graph_main", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(),
domain_to_version, std::vector<ONNX_NAMESPACE::FunctionProto>(),
DefaultLoggingManager().DefaultLogger());
CreateGraphWithSubgraph(model_1.MainGraph());
PlaceAllNodesToCPUEP(model_1.MainGraph());
SessionState session_state_1(model_1.MainGraph(),
execution_providers,
tp.get(),
nullptr, /*inter_op_thread_pool*/
dtm,
edlm,
DefaultLoggingManager().DefaultLogger(),
profiler,
sess_options,
&prepacked_weights_container);
ASSERT_STATUS_OK(session_state_1.FinalizeSessionState(std::basic_string<PATH_CHAR_TYPE>(),
kernel_registry_manager));
// At the main graph level, there should be no pre-packing calls as there are
// no initializers (shared or otherwise) consumed by any nodes in the main graph
ASSERT_EQ(session_state_1.GetNumberOfPrepacksCounter(), static_cast<size_t>(0));
auto if_index_1 = 1;
if (session_state_1.GetKernel(0)->Node().OpType() == "If") {
if_index_1 = 0;
}
const auto& subgraph_session_states = session_state_1.GetSubgraphSessionStateMap();
const auto& if_node_session_states = subgraph_session_states.at(if_index_1);
const auto& session_state_1_then_branch_session_state = *if_node_session_states.at("then_branch");
const auto& session_state_1_else_branch_session_state = *if_node_session_states.at("else_branch");
auto if_node_branches_prepack_counter_1 =
session_state_1_then_branch_session_state.GetNumberOfPrepacksCounter() +
session_state_1_else_branch_session_state.GetNumberOfPrepacksCounter();
// We should be seeing 2 pre-pack calls in the "If" node (one in each subgraph)
ASSERT_EQ(if_node_branches_prepack_counter_1, static_cast<size_t>(2));
auto if_node_branches_shared_prepack_counter_1 =
session_state_1_then_branch_session_state.GetUsedSharedPrePackedWeightCounter() +
session_state_1_else_branch_session_state.GetUsedSharedPrePackedWeightCounter();
// We should only be seeing 1 shared pre-pack weights usage in the "If" node
// Either the "then branch" or "else branch" will be using the shared version
// depending on which branch writes to the shared container
ASSERT_EQ(if_node_branches_shared_prepack_counter_1, static_cast<size_t>(1));
// Second session/model
Model model_2("graph_main", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(),
domain_to_version, std::vector<ONNX_NAMESPACE::FunctionProto>(),
DefaultLoggingManager().DefaultLogger());
CreateGraphWithSubgraph(model_2.MainGraph());
PlaceAllNodesToCPUEP(model_2.MainGraph());
SessionState session_state_2(model_2.MainGraph(),
execution_providers,
tp.get(),
nullptr, /*inter_op_thread_pool*/
dtm,
edlm,
DefaultLoggingManager().DefaultLogger(),
profiler,
sess_options,
&prepacked_weights_container);
ASSERT_STATUS_OK(session_state_2.FinalizeSessionState(std::basic_string<PATH_CHAR_TYPE>(),
kernel_registry_manager));
// At the main graph level, there should be no pre-packing calls as there are
// no initializers (shared or otherwise) consumed by any nodes in the main graph
ASSERT_EQ(session_state_2.GetNumberOfPrepacksCounter(), static_cast<size_t>(0));
auto if_index_2 = 1;
if (session_state_2.GetKernel(0)->Node().OpType() == "If") {
if_index_2 = 0;
}
const auto& subgraph_session_states_2 = session_state_2.GetSubgraphSessionStateMap();
const auto& if_node_session_states_2 = subgraph_session_states_2.at(if_index_2);
const auto& session_state_2_then_branch_session_state = *if_node_session_states_2.at("then_branch");
const auto& session_state_2_else_branch_session_state = *if_node_session_states_2.at("else_branch");
auto if_node_branches_prepack_counter_2 =
session_state_2_then_branch_session_state.GetNumberOfPrepacksCounter() +
session_state_2_else_branch_session_state.GetNumberOfPrepacksCounter();
// We should be seeing 2 pre-pack calls in the "If" node (one in each subgraph)
ASSERT_EQ(if_node_branches_prepack_counter_2, static_cast<size_t>(2));
auto if_node_branches_shared_prepack_counter_2 =
session_state_2_then_branch_session_state.GetUsedSharedPrePackedWeightCounter() +
session_state_2_else_branch_session_state.GetUsedSharedPrePackedWeightCounter();
// We should be seeing 2 shared pre-pack weights calls in the "If" node
// Both branches will be using the shared version coming from the first model.
ASSERT_EQ(if_node_branches_shared_prepack_counter_2, static_cast<size_t>(2));
}
#ifndef __wasm__
// sharing is on
TEST_F(SessionStateTestSharedInitalizersWithPrePacking, TestPrepackedSerialization) {
const std::filesystem::path model_with_external_initializers =
"testdata/test_prepacked_serialization_optimized_model.onnx";
const std::filesystem::path external_initializers_file =
"test_prepacked_serialization_optimized_model.bin";
{
SessionOptions sess_options;
sess_options.enable_mem_pattern = true;
sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL;
sess_options.use_deterministic_compute = false;
sess_options.enable_mem_reuse = true;
sess_options.optimized_model_filepath = model_with_external_initializers;
// Enable pre-packing
sess_options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] = "0";
// Enable saving model with pre-packed weights
sess_options.config_options.configurations[kOrtSessionOptionsSavePrePackedConstantInitializers] = "1";
// Enable shared initializer
OrtMemoryInfo mem_info(CPU, OrtDeviceAllocator);
std::vector<float> float_data(1, 1);
auto value = std::make_unique<OrtValue>();
Tensor::InitOrtValue(DataTypeImpl::GetType<float>(), TensorShape(std::vector<int64_t>{1}),
reinterpret_cast<void*>(float_data.data()), mem_info, *value);
ASSERT_STATUS_OK(sess_options.AddInitializer("if_shared", value.get()));
// Enable pre-packed weights container for shared initializers
PrepackedWeightsContainer prepacked_weights_container;
Model model_1("graph_main", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(),
domain_to_version, std::vector<ONNX_NAMESPACE::FunctionProto>(),
DefaultLoggingManager().DefaultLogger());
CreateGraphWithSubgraph(model_1.MainGraph());
PlaceAllNodesToCPUEP(model_1.MainGraph());
SessionState session_state_1(model_1.MainGraph(),
execution_providers,
tp.get(),
nullptr, /*inter_op_thread_pool*/
dtm,
edlm,
DefaultLoggingManager().DefaultLogger(),
profiler,
sess_options,
&prepacked_weights_container);
constexpr const bool saving_model_true = true;
ASSERT_STATUS_OK(session_state_1.FinalizeSessionState(std::basic_string<PATH_CHAR_TYPE>(),
kernel_registry_manager,
!saving_model_true));
TestSavedPrepacks(model_1);
ModelSavingOptions model_saving_options{4};
model_saving_options.align_offset = true;
ASSERT_STATUS_OK(Model::SaveWithExternalInitializers(model_1, model_with_external_initializers,
external_initializers_file,
model_saving_options));
}
ScopedFileDeleter test_model_deleter(model_with_external_initializers);
ScopedFileDeleter binary_file_deleter(external_initializers_file);
// Now let's load the model along with the initializers
{
SessionOptions sess_options;
sess_options.enable_mem_pattern = true;
sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL;
sess_options.use_deterministic_compute = false;
sess_options.enable_mem_reuse = true;
// Enable pre-packing
sess_options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] = "0";
// We are expecting this weight to be loaded from disk along
// with its pre-packed version
// Enable shared initializer
OrtMemoryInfo mem_info(CPU, OrtDeviceAllocator);
std::vector<float> float_data(1, 1);
auto value = std::make_unique<OrtValue>();
Tensor::InitOrtValue(DataTypeImpl::GetType<float>(), TensorShape(std::vector<int64_t>{1}),
reinterpret_cast<void*>(float_data.data()), mem_info, *value);
ASSERT_STATUS_OK(sess_options.AddInitializer("if_shared", value.get()));
// Enable pre-packed weights container for shared initializers
PrepackedWeightsContainer prepacked_weights_container;
std::shared_ptr<Model> model;
ASSERT_STATUS_OK(Model::Load(model_with_external_initializers, model, nullptr,
DefaultLoggingManager().DefaultLogger()));
PlaceAllNodesToCPUEP(model->MainGraph());
SessionState session_state(model->MainGraph(),
execution_providers,
tp.get(),
nullptr, /*inter_op_thread_pool*/
dtm,
edlm,
DefaultLoggingManager().DefaultLogger(),
profiler,
sess_options,
&prepacked_weights_container);
ASSERT_STATUS_OK(session_state.FinalizeSessionState(std::basic_string<PATH_CHAR_TYPE>(),
kernel_registry_manager,
false));
TestLoadedSharedUserSupplied(*model);
}
// Load again, this time sharing is enabled, but no shared initializer in the map
{
SessionOptions sess_options;
sess_options.enable_mem_pattern = true;
sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL;
sess_options.use_deterministic_compute = false;
sess_options.enable_mem_reuse = true;
// Enable pre-packing
sess_options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] = "0";
// Enable pre-packed weights container for shared initializers
PrepackedWeightsContainer prepacked_weights_container;
std::shared_ptr<Model> model;
ASSERT_STATUS_OK(Model::Load(model_with_external_initializers, model, nullptr,
DefaultLoggingManager().DefaultLogger()));
PlaceAllNodesToCPUEP(model->MainGraph());
SessionState session_state(model->MainGraph(),
execution_providers,
tp.get(),
nullptr, /*inter_op_thread_pool*/
dtm,
edlm,
DefaultLoggingManager().DefaultLogger(),
profiler,
sess_options,
&prepacked_weights_container);
ASSERT_STATUS_OK(session_state.FinalizeSessionState(model_with_external_initializers,
kernel_registry_manager,
false));
TestLoadedSharedNoUserSupplied(*model);
}
// Load again, sharing is disabled
{
SessionOptions sess_options;
sess_options.enable_mem_pattern = true;
sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL;
sess_options.use_deterministic_compute = false;
sess_options.enable_mem_reuse = true;
// Enable pre-packing
sess_options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] = "0";
std::shared_ptr<Model> model;
ASSERT_STATUS_OK(Model::Load(model_with_external_initializers, model, nullptr,
DefaultLoggingManager().DefaultLogger()));
PlaceAllNodesToCPUEP(model->MainGraph());
SessionState session_state(model->MainGraph(),
execution_providers,
tp.get(),
nullptr, /*inter_op_thread_pool*/
dtm,
edlm,
DefaultLoggingManager().DefaultLogger(),
profiler,
sess_options,
nullptr);
ASSERT_STATUS_OK(session_state.FinalizeSessionState(model_with_external_initializers,
kernel_registry_manager,
false));
const auto& prepacked_for_main_graph = model->MainGraph().GetPrepacked();
ASSERT_FALSE(prepacked_for_main_graph.IsSaveModeOn());
ASSERT_EQ(1U, prepacked_for_main_graph.GetKeyToBlob().size());
}
}
#endif // __wasm__
INSTANTIATE_TEST_SUITE_P(SessionStateTests,
SessionStatePrepackingTest,
testing::Values(PrepackingTestParam{false, false},
PrepackingTestParam{false, true},
PrepackingTestParam{true, false},
PrepackingTestParam{true, true}));
#endif
} // namespace test
} // namespace onnxruntime