Add back the option to enable/disable memory pattern (#872)

The memory pattern optimization does not work with the parallel executor by design. Enabling it for the parallel executor logs a warning and degrades performance. Add back the option to enable/disable the memory pattern.
2026-07-04 04:07:22 +00:00 · 2019-04-22 13:49:41 -07:00 · 2019-04-22 13:49:41 -07:00 · 0bf12e9dbf
commit 0bf12e9dbf
parent e8d722003a
7 changed files with 52 additions and 13 deletions
--- a/include/onnxruntime/core/session/onnxruntime_c_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_c_api.h
@ -208,9 +208,12 @@ ORT_API(void, OrtDisableSequentialExecution, _In_ OrtSessionOptions* options);
 ORT_API(void, OrtEnableProfiling, _In_ OrtSessionOptions* options, _In_ const ORTCHAR_T* profile_file_prefix);
 ORT_API(void, OrtDisableProfiling, _In_ OrtSessionOptions* options);

-// deprecated
+// Enable the memory pattern optimization.
+// The idea is that if the input shapes are the same, we can trace the internal memory allocations
+// and generate a memory pattern for future requests, so that next time we can do just one allocation
+// of a big chunk for all the internal memory allocations.
+// Note: the memory pattern optimization is only available when sequential execution is enabled.
 ORT_API(void, OrtEnableMemPattern, _In_ OrtSessionOptions* options);
-// deprecated
 ORT_API(void, OrtDisableMemPattern, _In_ OrtSessionOptions* options);

 // Enable the memory arena on CPU
--- a/onnxruntime/core/framework/execution_frame.cc
+++ b/onnxruntime/core/framework/execution_frame.cc
@ -186,7 +186,7 @@ ExecutionFrame::ExecutionFrame(const std::vector<int>& feed_mlvalue_idxs,
  // If the session enable memory pattern optimization
  // and we have execution plan generated, try to setup
  // memory pattern optimization.
-  if (session_state.GetExecutionPlan()) {
+  if (session_state.GetEnableMemoryPattern() && session_state.GetExecutionPlan()) {
    std::vector<TensorShape> input_shapes;
    bool all_tensors = true;
    for (const auto& feed : feeds) {
@ -198,7 +198,7 @@ ExecutionFrame::ExecutionFrame(const std::vector<int>& feed_mlvalue_idxs,
      input_shapes.push_back(tensor.Shape());
    }

-    // if there are some traditional ml value type in inputs disable the memory pattern optimization.
+    // if some of the inputs are traditional ml value types, disable the memory pattern optimization.
    if (all_tensors) {
      mem_patterns_ = session_state.GetMemoryPatternGroup(input_shapes);
      // if no existing patterns, generate one in this executionframe
@ -435,9 +435,10 @@ const AllocPlanPerValue& ExecutionFrame::GetAllocationPlan(int mlvalue_idx) {
 }

 void ExecutionFrame::TraceAllocate(int mlvalue_idx, size_t size) {
-  // don't trace the output tensors.
-  auto& allocation_plan = GetAllocationPlan(mlvalue_idx);
-  if (planner_ && allocation_plan.alloc_kind != AllocKind::kAllocateOutput) {
+  if (planner_) {
+    // don't trace the output tensors.
+    auto& allocation_plan = GetAllocationPlan(mlvalue_idx);
+    if (allocation_plan.alloc_kind == AllocKind::kAllocateOutput) return;
    auto status = planner_->TraceAllocation(mlvalue_idx, size);
    if (!status.IsOK())
      LOGS(session_state_.Logger(), WARNING) << "TraceAllocation for mlvalue_idx=" << mlvalue_idx << " size=" << size
--- a/onnxruntime/core/framework/session_state.cc
+++ b/onnxruntime/core/framework/session_state.cc
@ -96,6 +96,13 @@ Status SessionState::UpdateMemoryPatternGroupCache(const std::vector<TensorShape
  return Status::OK();
 }

+void SessionState::SetEnableMemoryPattern(bool flag) {
+  enable_mem_pattern_ = flag;
+}
+
+bool SessionState::GetEnableMemoryPattern() const {
+  return enable_mem_pattern_;
+}

 common::Status SessionState::AddInputNameToNodeInfoMapping(const std::string& input_name, const NodeInfo& node_info) {
  // in the future we could support multiple nodes on difference devices using an input, however right now
@ -160,15 +167,16 @@ const SessionState::NameNodeInfoMapType& SessionState::GetOutputNodeInfoMap() co
  return output_names_to_nodeinfo_mapping_;
 }

-void SessionState::AddSubgraphSessionState(onnxruntime::NodeIndex index, const std::string& attribute_name,
+void SessionState::AddSubgraphSessionState(onnxruntime::NodeIndex index,
+                                           const std::string& attribute_name,
                                           std::unique_ptr<SessionState> session_state) {
  auto entry = subgraph_session_states_.find(index);

  // make sure this is new. internal logic error if it is not so using ORT_ENFORCE.
  if (entry != subgraph_session_states_.cend()) {
    const auto& existing_entries = entry->second;
-    ORT_ENFORCE(existing_entries.find(attribute_name) == existing_entries.cend(), "Entry exists in node ", index,
-                " for attribute ", attribute_name);
+    ORT_ENFORCE(existing_entries.find(attribute_name) == existing_entries.cend(),
+                "Entry exists in node ", index, " for attribute ", attribute_name);
  }

  subgraph_session_states_[index].insert(std::make_pair(attribute_name, std::move(session_state)));
--- a/onnxruntime/core/framework/session_state.h
+++ b/onnxruntime/core/framework/session_state.h
@ -121,6 +121,16 @@ class SessionState {
  Status UpdateMemoryPatternGroupCache(const std::vector<TensorShape>& input_shape,
                                       std::unique_ptr<MemoryPatternGroup> mem_patterns) const;

+  /**
+  Set the flag that enables or disables the memory pattern optimization
+  */
+  void SetEnableMemoryPattern(bool flag);
+
+  /**
+  Get the flag that tells whether the memory pattern optimization is enabled
+  */
+  bool GetEnableMemoryPattern() const;
+
  struct NodeInfo {
    /**
     *
@ -197,6 +207,8 @@ class SessionState {
  const logging::Logger* logger_ = nullptr;
  profiling::Profiler* profiler_;

+  // switch that enables or disables the memory pattern optimization.
+  bool enable_mem_pattern_ = true;
  // lock for the mem_patterns_
  mutable OrtMutex mem_patterns_lock_;
  // cache for the generated mem_patterns. key is calculated based on input shapes.
--- a/onnxruntime/core/session/abi_session_options.cc
+++ b/onnxruntime/core/session/abi_session_options.cc
@ -51,8 +51,16 @@ ORT_API(void, OrtDisableProfiling, _In_ OrtSessionOptions* options) {
  options->value.profile_file_prefix.clear();
 }

-ORT_API(void, OrtEnableMemPattern, _In_ OrtSessionOptions*) {}
-ORT_API(void, OrtDisableMemPattern, _In_ OrtSessionOptions*) {}
+// Enable the memory pattern optimization.
+// The idea is that if the input shapes are the same, we can trace the internal memory allocations
+// and generate a memory pattern for future requests, so that next time we can do just one allocation
+// of a big chunk for all the internal memory allocations.
+ORT_API(void, OrtEnableMemPattern, _In_ OrtSessionOptions* options) {
+  options->value.enable_mem_pattern = true;
+}
+ORT_API(void, OrtDisableMemPattern, _In_ OrtSessionOptions* options) {
+  options->value.enable_mem_pattern = false;
+}

 // enable the memory arena on CPU
 // Arena may pre-allocate memory for future usage.
@ -79,7 +87,7 @@ ORT_API(void, OrtSetSessionLogVerbosityLevel, _In_ OrtSessionOptions* options, u
 // Returns 0 on success and -1 otherwise
 // Available options are : 0, 1, 2.
 ORT_API(int, OrtSetSessionGraphOptimizationLevel, _In_ OrtSessionOptions* options, uint32_t graph_optimization_level) {
-  if (graph_optimization_level >= static_cast<uint32_t>(onnxruntime::TransformerLevel::MaxTransformerLevel)){
+  if (graph_optimization_level >= static_cast<uint32_t>(onnxruntime::TransformerLevel::MaxTransformerLevel)) {
    return -1;
  }
  options->value.graph_optimization_level = static_cast<onnxruntime::TransformerLevel>(graph_optimization_level);
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@ -110,6 +110,7 @@ InferenceSession::InferenceSession(const SessionOptions& session_options, loggin
  }

  session_state_.SetThreadPool(thread_pool_.get());
+  session_state_.SetEnableMemoryPattern(session_options.enable_mem_pattern && session_options.enable_sequential_execution);
  session_profiler_.Initialize(session_logger_);
  session_state_.SetProfiler(session_profiler_);
  if (session_options.enable_profiling) {
--- a/onnxruntime/core/session/inference_session.h
+++ b/onnxruntime/core/session/inference_session.h
@ -53,6 +53,12 @@ struct SessionOptions {
  // enable profiling for this session.
  bool enable_profiling = false;

+  // Enable the memory pattern optimization.
+  // The idea is that if the input shapes are the same, we can trace the internal memory allocations
+  // and generate a memory pattern for future requests, so that next time we can do just one allocation
+  // of a big chunk for all the internal memory allocations.
+  bool enable_mem_pattern = true;
+
  // enable the memory arena on CPU
  // Arena may pre-allocate memory for future usage.
  // set this option to false if you don't want it.