Add option to enable/disable memory pattern back (#872)

Memory pattern does not work with the parallel executor by design. Enabling memory pattern with the parallel executor logs a warning and degrades performance.
Add back the option to enable/disable memory pattern.
This commit is contained in:
Yufeng Li 2019-04-22 13:49:41 -07:00 committed by GitHub
parent e8d722003a
commit 0bf12e9dbf
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 52 additions and 13 deletions

View file

@ -208,9 +208,12 @@ ORT_API(void, OrtDisableSequentialExecution, _In_ OrtSessionOptions* options);
ORT_API(void, OrtEnableProfiling, _In_ OrtSessionOptions* options, _In_ const ORTCHAR_T* profile_file_prefix);
ORT_API(void, OrtDisableProfiling, _In_ OrtSessionOptions* options);
// deprecated
// Enable the memory pattern optimization.
// The idea is that if the input shapes are the same, we can trace the internal memory allocations
// and generate a memory pattern for future requests, so that next time we can do just one allocation
// of a big chunk to serve all the internal memory allocations.
// Note: memory pattern optimization is only available when sequential execution is enabled.
ORT_API(void, OrtEnableMemPattern, _In_ OrtSessionOptions* options);
// deprecated
ORT_API(void, OrtDisableMemPattern, _In_ OrtSessionOptions* options);
// Enable the memory arena on CPU

View file

@ -186,7 +186,7 @@ ExecutionFrame::ExecutionFrame(const std::vector<int>& feed_mlvalue_idxs,
// If the session enable memory pattern optimization
// and we have execution plan generated, try to setup
// memory pattern optimization.
if (session_state.GetExecutionPlan()) {
if (session_state.GetEnableMemoryPattern() && session_state.GetExecutionPlan()) {
std::vector<TensorShape> input_shapes;
bool all_tensors = true;
for (const auto& feed : feeds) {
@ -198,7 +198,7 @@ ExecutionFrame::ExecutionFrame(const std::vector<int>& feed_mlvalue_idxs,
input_shapes.push_back(tensor.Shape());
}
// if there are some traditional ml value type in inputs disable the memory pattern optimization.
//if there are some traditional ml value type in inputs disable the memory pattern optimization.
if (all_tensors) {
mem_patterns_ = session_state.GetMemoryPatternGroup(input_shapes);
// if no existing patterns, generate one in this executionframe
@ -435,9 +435,10 @@ const AllocPlanPerValue& ExecutionFrame::GetAllocationPlan(int mlvalue_idx) {
}
void ExecutionFrame::TraceAllocate(int mlvalue_idx, size_t size) {
// don't trace the output tensors.
auto& allocation_plan = GetAllocationPlan(mlvalue_idx);
if (planner_ && allocation_plan.alloc_kind != AllocKind::kAllocateOutput) {
if (planner_) {
// don't trace the output tensors.
auto& allocation_plan = GetAllocationPlan(mlvalue_idx);
if (allocation_plan.alloc_kind == AllocKind::kAllocateOutput) return;
auto status = planner_->TraceAllocation(mlvalue_idx, size);
if (!status.IsOK())
LOGS(session_state_.Logger(), WARNING) << "TraceAllocation for mlvalue_idx=" << mlvalue_idx << " size=" << size

View file

@ -96,6 +96,13 @@ Status SessionState::UpdateMemoryPatternGroupCache(const std::vector<TensorShape
return Status::OK();
}
// Stores the switch that turns the memory pattern optimization on (true) or off (false)
// for this session state. The value is returned unchanged by GetEnableMemoryPattern().
void SessionState::SetEnableMemoryPattern(bool flag) {
enable_mem_pattern_ = flag;
}
// Returns the memory pattern switch last stored via SetEnableMemoryPattern().
bool SessionState::GetEnableMemoryPattern() const {
return enable_mem_pattern_;
}
common::Status SessionState::AddInputNameToNodeInfoMapping(const std::string& input_name, const NodeInfo& node_info) {
// in the future we could support multiple nodes on different devices using an input, however right now
@ -160,15 +167,16 @@ const SessionState::NameNodeInfoMapType& SessionState::GetOutputNodeInfoMap() co
return output_names_to_nodeinfo_mapping_;
}
void SessionState::AddSubgraphSessionState(onnxruntime::NodeIndex index, const std::string& attribute_name,
void SessionState::AddSubgraphSessionState(onnxruntime::NodeIndex index,
const std::string& attribute_name,
std::unique_ptr<SessionState> session_state) {
auto entry = subgraph_session_states_.find(index);
// make sure this is new. internal logic error if it is not so using ORT_ENFORCE.
if (entry != subgraph_session_states_.cend()) {
const auto& existing_entries = entry->second;
ORT_ENFORCE(existing_entries.find(attribute_name) == existing_entries.cend(), "Entry exists in node ", index,
" for attribute ", attribute_name);
ORT_ENFORCE(existing_entries.find(attribute_name) == existing_entries.cend(),
"Entry exists in node ", index, " for attribute ", attribute_name);
}
subgraph_session_states_[index].insert(std::make_pair(attribute_name, std::move(session_state)));

View file

@ -121,6 +121,16 @@ class SessionState {
Status UpdateMemoryPatternGroupCache(const std::vector<TensorShape>& input_shape,
std::unique_ptr<MemoryPatternGroup> mem_patterns) const;
/**
Set enable memory pattern flag
*/
void SetEnableMemoryPattern(bool flag);
/**
Get enable memory pattern flag
*/
bool GetEnableMemoryPattern() const;
struct NodeInfo {
/**
*
@ -197,6 +207,8 @@ class SessionState {
const logging::Logger* logger_ = nullptr;
profiling::Profiler* profiler_;
// switch for enable memory pattern optimization or not.
bool enable_mem_pattern_ = true;
// lock for the mem_patterns_
mutable OrtMutex mem_patterns_lock_;
// cache for the generated mem_patterns. key is calculated based on input shapes.

View file

@ -51,8 +51,16 @@ ORT_API(void, OrtDisableProfiling, _In_ OrtSessionOptions* options) {
options->value.profile_file_prefix.clear();
}
ORT_API(void, OrtEnableMemPattern, _In_ OrtSessionOptions*) {}
ORT_API(void, OrtDisableMemPattern, _In_ OrtSessionOptions*) {}
// Enable the memory pattern optimization.
// The idea is that if the input shapes are the same, we can trace the internal memory allocations
// and generate a memory pattern for future requests, so that next time we can do just one allocation
// of a big chunk to serve all the internal memory allocations.
// Sets the enable_mem_pattern flag on the given session options to true.
// `options` is dereferenced without a null check, so it must be a valid pointer.
ORT_API(void, OrtEnableMemPattern, _In_ OrtSessionOptions* options) {
options->value.enable_mem_pattern = true;
}
// Disable the memory pattern optimization.
// Sets the enable_mem_pattern flag on the given session options to false.
// `options` is dereferenced without a null check, so it must be a valid pointer.
ORT_API(void, OrtDisableMemPattern, _In_ OrtSessionOptions* options) {
options->value.enable_mem_pattern = false;
}
// enable the memory arena on CPU
// Arena may pre-allocate memory for future usage.
@ -79,7 +87,7 @@ ORT_API(void, OrtSetSessionLogVerbosityLevel, _In_ OrtSessionOptions* options, u
// Returns 0 on success and -1 otherwise
// Available options are : 0, 1, 2.
ORT_API(int, OrtSetSessionGraphOptimizationLevel, _In_ OrtSessionOptions* options, uint32_t graph_optimization_level) {
if (graph_optimization_level >= static_cast<uint32_t>(onnxruntime::TransformerLevel::MaxTransformerLevel)){
if (graph_optimization_level >= static_cast<uint32_t>(onnxruntime::TransformerLevel::MaxTransformerLevel)) {
return -1;
}
options->value.graph_optimization_level = static_cast<onnxruntime::TransformerLevel>(graph_optimization_level);

View file

@ -110,6 +110,7 @@ InferenceSession::InferenceSession(const SessionOptions& session_options, loggin
}
session_state_.SetThreadPool(thread_pool_.get());
session_state_.SetEnableMemoryPattern(session_options.enable_mem_pattern && session_options.enable_sequential_execution);
session_profiler_.Initialize(session_logger_);
session_state_.SetProfiler(session_profiler_);
if (session_options.enable_profiling) {

View file

@ -53,6 +53,12 @@ struct SessionOptions {
// enable profiling for this session.
bool enable_profiling = false;
// enable the memory pattern optimization.
// The idea is that if the input shapes are the same, we can trace the internal memory allocations
// and generate a memory pattern for future requests, so that next time we can do just one allocation
// of a big chunk to serve all the internal memory allocations.
bool enable_mem_pattern = true;
// enable the memory arena on CPU
// Arena may pre-allocate memory for future usage.
// set this option to false if you don't want it.