mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-18 21:21:17 +00:00
Add option to enable/disable memory pattern back (#872)
By design, the memory pattern optimization does not work with the parallel executor. Enabling memory pattern for the parallel executor logs a warning and degrades performance, so this change adds back the option to enable/disable memory pattern.
This commit is contained in:
parent
e8d722003a
commit
0bf12e9dbf
7 changed files with 52 additions and 13 deletions
|
|
@ -208,9 +208,12 @@ ORT_API(void, OrtDisableSequentialExecution, _In_ OrtSessionOptions* options);
|
|||
ORT_API(void, OrtEnableProfiling, _In_ OrtSessionOptions* options, _In_ const ORTCHAR_T* profile_file_prefix);
|
||||
ORT_API(void, OrtDisableProfiling, _In_ OrtSessionOptions* options);
|
||||
|
||||
// deprecated
|
||||
// Enable the memory pattern optimization.
|
||||
// The idea is if the input shapes are the same, we could trace the internal memory allocation
|
||||
// and generate a memory pattern for future request. So next time we could just do one allocation
|
||||
// with a big chunk for all the internal memory allocation.
|
||||
// Note: memory pattern optimization is only available when SequentialExecution is enabled.
|
||||
ORT_API(void, OrtEnableMemPattern, _In_ OrtSessionOptions* options);
|
||||
// deprecated
|
||||
ORT_API(void, OrtDisableMemPattern, _In_ OrtSessionOptions* options);
|
||||
|
||||
// Enable the memory arena on CPU
|
||||
|
|
|
|||
|
|
@ -186,7 +186,7 @@ ExecutionFrame::ExecutionFrame(const std::vector<int>& feed_mlvalue_idxs,
|
|||
// If the session enables memory pattern optimization
|
||||
// and an execution plan has been generated, try to set up
|
||||
// memory pattern optimization.
|
||||
if (session_state.GetExecutionPlan()) {
|
||||
if (session_state.GetEnableMemoryPattern() && session_state.GetExecutionPlan()) {
|
||||
std::vector<TensorShape> input_shapes;
|
||||
bool all_tensors = true;
|
||||
for (const auto& feed : feeds) {
|
||||
|
|
@ -198,7 +198,7 @@ ExecutionFrame::ExecutionFrame(const std::vector<int>& feed_mlvalue_idxs,
|
|||
input_shapes.push_back(tensor.Shape());
|
||||
}
|
||||
|
||||
// if there are some traditional ml value type in inputs disable the memory pattern optimization.
|
||||
//if there are some traditional ml value type in inputs disable the memory pattern optimization.
|
||||
if (all_tensors) {
|
||||
mem_patterns_ = session_state.GetMemoryPatternGroup(input_shapes);
|
||||
// if no existing patterns, generate one in this executionframe
|
||||
|
|
@ -435,9 +435,10 @@ const AllocPlanPerValue& ExecutionFrame::GetAllocationPlan(int mlvalue_idx) {
|
|||
}
|
||||
|
||||
void ExecutionFrame::TraceAllocate(int mlvalue_idx, size_t size) {
|
||||
// don't trace the output tensors.
|
||||
auto& allocation_plan = GetAllocationPlan(mlvalue_idx);
|
||||
if (planner_ && allocation_plan.alloc_kind != AllocKind::kAllocateOutput) {
|
||||
if (planner_) {
|
||||
// don't trace the output tensors.
|
||||
auto& allocation_plan = GetAllocationPlan(mlvalue_idx);
|
||||
if (allocation_plan.alloc_kind == AllocKind::kAllocateOutput) return;
|
||||
auto status = planner_->TraceAllocation(mlvalue_idx, size);
|
||||
if (!status.IsOK())
|
||||
LOGS(session_state_.Logger(), WARNING) << "TraceAllocation for mlvalue_idx=" << mlvalue_idx << " size=" << size
|
||||
|
|
|
|||
|
|
@ -96,6 +96,13 @@ Status SessionState::UpdateMemoryPatternGroupCache(const std::vector<TensorShape
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
void SessionState::SetEnableMemoryPattern(bool flag) {
|
||||
enable_mem_pattern_ = flag;
|
||||
}
|
||||
|
||||
bool SessionState::GetEnableMemoryPattern() const {
|
||||
return enable_mem_pattern_;
|
||||
}
|
||||
|
||||
common::Status SessionState::AddInputNameToNodeInfoMapping(const std::string& input_name, const NodeInfo& node_info) {
|
||||
// in the future we could support multiple nodes on different devices using an input, however right now
|
||||
|
|
@ -160,15 +167,16 @@ const SessionState::NameNodeInfoMapType& SessionState::GetOutputNodeInfoMap() co
|
|||
return output_names_to_nodeinfo_mapping_;
|
||||
}
|
||||
|
||||
void SessionState::AddSubgraphSessionState(onnxruntime::NodeIndex index, const std::string& attribute_name,
|
||||
void SessionState::AddSubgraphSessionState(onnxruntime::NodeIndex index,
|
||||
const std::string& attribute_name,
|
||||
std::unique_ptr<SessionState> session_state) {
|
||||
auto entry = subgraph_session_states_.find(index);
|
||||
|
||||
// make sure this is new. internal logic error if it is not so using ORT_ENFORCE.
|
||||
if (entry != subgraph_session_states_.cend()) {
|
||||
const auto& existing_entries = entry->second;
|
||||
ORT_ENFORCE(existing_entries.find(attribute_name) == existing_entries.cend(), "Entry exists in node ", index,
|
||||
" for attribute ", attribute_name);
|
||||
ORT_ENFORCE(existing_entries.find(attribute_name) == existing_entries.cend(),
|
||||
"Entry exists in node ", index, " for attribute ", attribute_name);
|
||||
}
|
||||
|
||||
subgraph_session_states_[index].insert(std::make_pair(attribute_name, std::move(session_state)));
|
||||
|
|
|
|||
|
|
@ -121,6 +121,16 @@ class SessionState {
|
|||
Status UpdateMemoryPatternGroupCache(const std::vector<TensorShape>& input_shape,
|
||||
std::unique_ptr<MemoryPatternGroup> mem_patterns) const;
|
||||
|
||||
/**
|
||||
Set enable memory pattern flag
|
||||
*/
|
||||
void SetEnableMemoryPattern(bool flag);
|
||||
|
||||
/**
|
||||
Get enable memory pattern flag
|
||||
*/
|
||||
bool GetEnableMemoryPattern() const;
|
||||
|
||||
struct NodeInfo {
|
||||
/**
|
||||
*
|
||||
|
|
@ -197,6 +207,8 @@ class SessionState {
|
|||
const logging::Logger* logger_ = nullptr;
|
||||
profiling::Profiler* profiler_;
|
||||
|
||||
// switch for enable memory pattern optimization or not.
|
||||
bool enable_mem_pattern_ = true;
|
||||
// lock for the mem_patterns_
|
||||
mutable OrtMutex mem_patterns_lock_;
|
||||
// cache for the generated mem_patterns. key is calculated based on input shapes.
|
||||
|
|
|
|||
|
|
@ -51,8 +51,16 @@ ORT_API(void, OrtDisableProfiling, _In_ OrtSessionOptions* options) {
|
|||
options->value.profile_file_prefix.clear();
|
||||
}
|
||||
|
||||
ORT_API(void, OrtEnableMemPattern, _In_ OrtSessionOptions*) {}
|
||||
ORT_API(void, OrtDisableMemPattern, _In_ OrtSessionOptions*) {}
|
||||
// enable the memory pattern optimization.
|
||||
// The idea is if the input shapes are the same, we could trace the internal memory allocation
|
||||
// and generate a memory pattern for future request. So next time we could just do one allocation
|
||||
// with a big chunk for all the internal memory allocation.
|
||||
ORT_API(void, OrtEnableMemPattern, _In_ OrtSessionOptions* options) {
|
||||
options->value.enable_mem_pattern = true;
|
||||
}
|
||||
ORT_API(void, OrtDisableMemPattern, _In_ OrtSessionOptions* options) {
|
||||
options->value.enable_mem_pattern = false;
|
||||
}
|
||||
|
||||
// enable the memory arena on CPU
|
||||
// Arena may pre-allocate memory for future usage.
|
||||
|
|
@ -79,7 +87,7 @@ ORT_API(void, OrtSetSessionLogVerbosityLevel, _In_ OrtSessionOptions* options, u
|
|||
// Returns 0 on success and -1 otherwise
|
||||
// Available options are : 0, 1, 2.
|
||||
ORT_API(int, OrtSetSessionGraphOptimizationLevel, _In_ OrtSessionOptions* options, uint32_t graph_optimization_level) {
|
||||
if (graph_optimization_level >= static_cast<uint32_t>(onnxruntime::TransformerLevel::MaxTransformerLevel)){
|
||||
if (graph_optimization_level >= static_cast<uint32_t>(onnxruntime::TransformerLevel::MaxTransformerLevel)) {
|
||||
return -1;
|
||||
}
|
||||
options->value.graph_optimization_level = static_cast<onnxruntime::TransformerLevel>(graph_optimization_level);
|
||||
|
|
|
|||
|
|
@ -110,6 +110,7 @@ InferenceSession::InferenceSession(const SessionOptions& session_options, loggin
|
|||
}
|
||||
|
||||
session_state_.SetThreadPool(thread_pool_.get());
|
||||
session_state_.SetEnableMemoryPattern(session_options.enable_mem_pattern && session_options.enable_sequential_execution);
|
||||
session_profiler_.Initialize(session_logger_);
|
||||
session_state_.SetProfiler(session_profiler_);
|
||||
if (session_options.enable_profiling) {
|
||||
|
|
|
|||
|
|
@ -53,6 +53,12 @@ struct SessionOptions {
|
|||
// enable profiling for this session.
|
||||
bool enable_profiling = false;
|
||||
|
||||
// enable the memory pattern optimization.
|
||||
// The idea is if the input shapes are the same, we could trace the internal memory allocation
|
||||
// and generate a memory pattern for future request. So next time we could just do one allocation
|
||||
// with a big chunk for all the internal memory allocation.
|
||||
bool enable_mem_pattern = true;
|
||||
|
||||
// enable the memory arena on CPU
|
||||
// Arena may pre-allocate memory for future usage.
|
||||
// set this option to false if you don't want it.
|
||||
|
|
|
|||
Loading…
Reference in a new issue