From 0bf12e9dbf7d5ba434dcaa8254a9495b9d3d7bd7 Mon Sep 17 00:00:00 2001
From: Yufeng Li <liyufeng1987@gmail.com>
Date: Mon, 22 Apr 2019 13:49:41 -0700
Subject: [PATCH] Add option to enable/disable memory pattern back (#872)

Memory pattern does not work with the parallel executor by design. Enabling memory pattern with the parallel executor logs a warning and degrades performance.
Add back the option to enable/disable memory pattern.
---
 .../onnxruntime/core/session/onnxruntime_c_api.h   |  7 +++++--
 onnxruntime/core/framework/execution_frame.cc      | 11 ++++++-----
 onnxruntime/core/framework/session_state.cc        | 14 +++++++++++---
 onnxruntime/core/framework/session_state.h         | 12 ++++++++++++
 onnxruntime/core/session/abi_session_options.cc    | 14 +++++++++++---
 onnxruntime/core/session/inference_session.cc      |  1 +
 onnxruntime/core/session/inference_session.h       |  6 ++++++
 7 files changed, 52 insertions(+), 13 deletions(-)
diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h
index 2dcd728705..599e17cbc7 100644
--- a/include/onnxruntime/core/session/onnxruntime_c_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -208,9 +208,12 @@ ORT_API(void, OrtDisableSequentialExecution, _In_ OrtSessionOptions* options);
 ORT_API(void, OrtEnableProfiling, _In_ OrtSessionOptions* options, _In_ const ORTCHAR_T* profile_file_prefix);
 ORT_API(void, OrtDisableProfiling, _In_ OrtSessionOptions* options);
 
-// deprecated
+// Enable the memory pattern optimization.
+// The idea: if the input shapes are the same across requests, we can trace the internal memory
+// allocations and generate a memory pattern for future requests, so that subsequent runs make a
+// single large allocation that covers all internal memory allocations.
+// Note: the memory pattern optimization is only available when sequential execution is enabled.
 ORT_API(void, OrtEnableMemPattern, _In_ OrtSessionOptions* options);
-// deprecated
 ORT_API(void, OrtDisableMemPattern, _In_ OrtSessionOptions* options);
 
 // Enable the memory arena on CPU
diff --git a/onnxruntime/core/framework/execution_frame.cc b/onnxruntime/core/framework/execution_frame.cc
index 30b828b9e1..6a0e0cf21b 100644
--- a/onnxruntime/core/framework/execution_frame.cc
+++ b/onnxruntime/core/framework/execution_frame.cc
@@ -186,7 +186,7 @@ ExecutionFrame::ExecutionFrame(const std::vector<int>& feed_mlvalue_idxs,
   // If the session enable memory pattern optimization
   // and we have execution plan generated, try to setup
   // memory pattern optimization.
-  if (session_state.GetExecutionPlan()) {
+  if (session_state.GetEnableMemoryPattern() && session_state.GetExecutionPlan()) {
     std::vector<TensorShape> input_shapes;
     bool all_tensors = true;
     for (const auto& feed : feeds) {
@@ -198,7 +198,7 @@ ExecutionFrame::ExecutionFrame(const std::vector<int>& feed_mlvalue_idxs,
       input_shapes.push_back(tensor.Shape());
     }
 
-    // if there are some traditional ml value type in inputs disable the memory pattern optimization.
+    // If any input is a traditional ML value type (non-tensor), skip the memory pattern optimization.
     if (all_tensors) {
       mem_patterns_ = session_state.GetMemoryPatternGroup(input_shapes);
       // if no existing patterns, generate one in this executionframe
@@ -435,9 +435,10 @@ const AllocPlanPerValue& ExecutionFrame::GetAllocationPlan(int mlvalue_idx) {
 }
 
 void ExecutionFrame::TraceAllocate(int mlvalue_idx, size_t size) {
-  // don't trace the output tensors.
-  auto& allocation_plan = GetAllocationPlan(mlvalue_idx);
-  if (planner_ && allocation_plan.alloc_kind != AllocKind::kAllocateOutput) {
+  if (planner_) {
+    // don't trace the output tensors.
+    auto& allocation_plan = GetAllocationPlan(mlvalue_idx);
+    if (allocation_plan.alloc_kind == AllocKind::kAllocateOutput) return;
     auto status = planner_->TraceAllocation(mlvalue_idx, size);
     if (!status.IsOK())
       LOGS(session_state_.Logger(), WARNING) << "TraceAllocation for mlvalue_idx=" << mlvalue_idx << " size=" << size
diff --git a/onnxruntime/core/framework/session_state.cc b/onnxruntime/core/framework/session_state.cc
index 5be86cd841..62d7932f8e 100644
--- a/onnxruntime/core/framework/session_state.cc
+++ b/onnxruntime/core/framework/session_state.cc
@@ -96,6 +96,13 @@ Status SessionState::UpdateMemoryPatternGroupCache(const std::vector<TensorShape
   return Status::OK();
 }
 
+void SessionState::SetEnableMemoryPattern(bool flag) {
+  enable_mem_pattern_ = flag;
+}
+
+bool SessionState::GetEnableMemoryPattern() const {
+  return enable_mem_pattern_;
+}
 
 common::Status SessionState::AddInputNameToNodeInfoMapping(const std::string& input_name, const NodeInfo& node_info) {
   // in the future we could support multiple nodes on difference devices using an input, however right now
@@ -160,15 +167,16 @@ const SessionState::NameNodeInfoMapType& SessionState::GetOutputNodeInfoMap() co
   return output_names_to_nodeinfo_mapping_;
 }
 
-void SessionState::AddSubgraphSessionState(onnxruntime::NodeIndex index, const std::string& attribute_name,
+void SessionState::AddSubgraphSessionState(onnxruntime::NodeIndex index,
+                                           const std::string& attribute_name,
                                            std::unique_ptr<SessionState> session_state) {
   auto entry = subgraph_session_states_.find(index);
 
   // make sure this is new. internal logic error if it is not so using ORT_ENFORCE.
   if (entry != subgraph_session_states_.cend()) {
     const auto& existing_entries = entry->second;
-    ORT_ENFORCE(existing_entries.find(attribute_name) == existing_entries.cend(), "Entry exists in node ", index,
-                " for attribute ", attribute_name);
+    ORT_ENFORCE(existing_entries.find(attribute_name) == existing_entries.cend(),
+                "Entry exists in node ", index, " for attribute ", attribute_name);
   }
 
   subgraph_session_states_[index].insert(std::make_pair(attribute_name, std::move(session_state)));
diff --git a/onnxruntime/core/framework/session_state.h b/onnxruntime/core/framework/session_state.h
index 619f2683df..08a53afb9b 100644
--- a/onnxruntime/core/framework/session_state.h
+++ b/onnxruntime/core/framework/session_state.h
@@ -121,6 +121,16 @@ class SessionState {
   Status UpdateMemoryPatternGroupCache(const std::vector<TensorShape>& input_shape,
                                        std::unique_ptr<MemoryPatternGroup> mem_patterns) const;
 
+  /**
+  Set whether the memory pattern optimization is enabled.
+  */
+  void SetEnableMemoryPattern(bool flag);
+
+  /**
+  Return whether the memory pattern optimization is enabled.
+  */
+  bool GetEnableMemoryPattern() const;
+
   struct NodeInfo {
     /**
      *
@@ -197,6 +207,8 @@ class SessionState {
   const logging::Logger* logger_ = nullptr;
   profiling::Profiler* profiler_;
 
+  // Whether the memory pattern optimization is enabled.
+  bool enable_mem_pattern_ = true;
   // lock for the mem_patterns_
   mutable OrtMutex mem_patterns_lock_;
   // cache for the generated mem_patterns. key is calculated based on input shapes.
diff --git a/onnxruntime/core/session/abi_session_options.cc b/onnxruntime/core/session/abi_session_options.cc
index b9680fac48..a0f82d15ec 100644
--- a/onnxruntime/core/session/abi_session_options.cc
+++ b/onnxruntime/core/session/abi_session_options.cc
@@ -51,8 +51,16 @@ ORT_API(void, OrtDisableProfiling, _In_ OrtSessionOptions* options) {
   options->value.profile_file_prefix.clear();
 }
 
-ORT_API(void, OrtEnableMemPattern, _In_ OrtSessionOptions*) {}
-ORT_API(void, OrtDisableMemPattern, _In_ OrtSessionOptions*) {}
+// Enable the memory pattern optimization.
+// The idea: if the input shapes are the same across requests, we can trace the internal memory
+// allocations and generate a memory pattern for future requests, so that subsequent runs make a
+// single large allocation that covers all internal memory allocations.
+ORT_API(void, OrtEnableMemPattern, _In_ OrtSessionOptions* options) {
+  options->value.enable_mem_pattern = true;
+}
+ORT_API(void, OrtDisableMemPattern, _In_ OrtSessionOptions* options) {
+  options->value.enable_mem_pattern = false;
+}
 
 // enable the memory arena on CPU
 // Arena may pre-allocate memory for future usage.
@@ -79,7 +87,7 @@ ORT_API(void, OrtSetSessionLogVerbosityLevel, _In_ OrtSessionOptions* options, u
 // Returns 0 on success and -1 otherwise
 // Available options are : 0, 1, 2.
 ORT_API(int, OrtSetSessionGraphOptimizationLevel, _In_ OrtSessionOptions* options, uint32_t graph_optimization_level) {
-  if (graph_optimization_level >= static_cast<uint32_t>(onnxruntime::TransformerLevel::MaxTransformerLevel)){
+  if (graph_optimization_level >= static_cast<uint32_t>(onnxruntime::TransformerLevel::MaxTransformerLevel)) {
     return -1;
   }
   options->value.graph_optimization_level = static_cast<onnxruntime::TransformerLevel>(graph_optimization_level);
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index 0c4be5e30a..a5ee850204 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -110,6 +110,7 @@ InferenceSession::InferenceSession(const SessionOptions& session_options, loggin
   }
 
   session_state_.SetThreadPool(thread_pool_.get());
+  session_state_.SetEnableMemoryPattern(session_options.enable_mem_pattern && session_options.enable_sequential_execution);
   session_profiler_.Initialize(session_logger_);
   session_state_.SetProfiler(session_profiler_);
   if (session_options.enable_profiling) {
diff --git a/onnxruntime/core/session/inference_session.h b/onnxruntime/core/session/inference_session.h
index 3f82872c06..88e627a8ea 100644
--- a/onnxruntime/core/session/inference_session.h
+++ b/onnxruntime/core/session/inference_session.h
@@ -53,6 +53,12 @@ struct SessionOptions {
   // enable profiling for this session.
   bool enable_profiling = false;
 
+  // Enable the memory pattern optimization.
+  // The idea: if the input shapes are the same across requests, we can trace the internal memory
+  // allocations and generate a memory pattern for future requests, so that subsequent runs make a
+  // single large allocation that covers all internal memory allocations.
+  bool enable_mem_pattern = true;
+
   // enable the memory arena on CPU
   // Arena may pre-allocate memory for future usage.
   // set this option to false if you don't want it.