Partial graph execution perf improvements. (#7438)

* Partial graph execution perf improvements. * PR feedback. * Decrement reference count of tensors in ORTModule. * PR feedback. * PR feedback. * PR feedback.
2026-07-11 17:48:34 +00:00 · 2021-04-26 17:13:55 -07:00 · 2021-04-26 17:13:55 -07:00 · 82108b18e3
commit 82108b18e3
parent 0702a14ee7
5 changed files with 19 additions and 16 deletions
--- a/onnxruntime/core/framework/orttraining_partial_executor.cc
+++ b/onnxruntime/core/framework/orttraining_partial_executor.cc
@ -147,17 +147,9 @@ Status PartialExecutor::Execute(const SessionState& session_state, const std::ve
    tp = session_state.Profiler().Now();
  }

-  if (state_.GetExecutionFrame() == nullptr) {
-    auto frame = onnxruntime::make_unique<ExecutionFrame>(feed_mlvalue_idxs, feeds, fetch_mlvalue_idxs,
-                                                          fetches, fetch_allocators, session_state);
+  ExecutionFrame& frame = state_.GetExecutionFrame(feed_mlvalue_idxs, feeds, fetch_mlvalue_idxs, fetches,
+                                                   fetch_allocators, session_state);

-    state_.SetExecutionFrame(std::move(frame));
-  } else {
-    state_.GetExecutionFrame()->UpdateFeeds(feed_mlvalue_idxs, feeds);
-    state_.GetExecutionFrame()->UpdateFetches(fetch_mlvalue_idxs, fetches, session_state.GetInitializedTensors());
-  }
-
-  ExecutionFrame& frame = *(state_.GetExecutionFrame());
  LOGS(logger, INFO) << "Begin execution";
  const SequentialExecutionPlan& seq_exec_plan = *session_state.GetExecutionPlan();
  const auto& exec_plan_vec = seq_exec_plan.execution_plan;
--- a/onnxruntime/core/framework/partial_graph_execution_state.h
+++ b/onnxruntime/core/framework/partial_graph_execution_state.h
@ -22,11 +22,20 @@ struct PartialGraphExecutionState {
  size_t GetProgramCounterStart() { return program_counter_start_; }
  size_t GetProgramCounterEnd() { return program_counter_end_; }

-  void SetExecutionFrame(std::unique_ptr<ExecutionFrame> frame) {
-    execution_frame_ = std::move(frame);
-  }
+  ExecutionFrame& GetExecutionFrame(const std::vector<int>& feed_mlvalue_idxs, const std::vector<OrtValue>& feeds,
+                                    const std::vector<int>& fetch_mlvalue_idxs, const std::vector<OrtValue>& fetches,
+                                    const std::unordered_map<size_t, IExecutor::CustomAllocator>& fetch_allocators,
+                                    const SessionState& session_state) {
+    if (execution_frame_ == nullptr) {
+      execution_frame_ = onnxruntime::make_unique<ExecutionFrame>(feed_mlvalue_idxs, feeds, fetch_mlvalue_idxs, fetches,
+                                                                  fetch_allocators, session_state);
+    } else {
+      execution_frame_->UpdateFeeds(feed_mlvalue_idxs, feeds);
+      execution_frame_->UpdateFetches(fetch_mlvalue_idxs, fetches, session_state.GetInitializedTensors());
+    }

-  const std::unique_ptr<ExecutionFrame>& GetExecutionFrame() const { return execution_frame_; }
+    return *execution_frame_;
+  }

 private:
  std::unique_ptr<ExecutionFrame> execution_frame_;
--- a/orttraining/orttraining/core/agent/training_agent.cc
+++ b/orttraining/orttraining/core/agent/training_agent.cc
@ -38,7 +38,7 @@ TrainingAgent::TrainingAgent(InferenceSession& session,
  bw_program_counter_end_ = exec_plan_vec.size();
 }

-TrainingAgent::~TrainingAgent(){};
+TrainingAgent::~TrainingAgent() = default;

 common::Status TrainingAgent::RunForward(const std::vector<OrtValue>& feeds, std::vector<OrtValue>& fetches,
                                         PartialGraphExecutionState& state) {
--- a/orttraining/orttraining/core/agent/training_agent.h
+++ b/orttraining/orttraining/core/agent/training_agent.h
@ -14,7 +14,6 @@

 namespace onnxruntime {
 namespace training {
-class IOBinding;

 class TrainingAgent {
 public:
--- a/orttraining/orttraining/python/training/ortmodule/_training_manager.py
+++ b/orttraining/orttraining/python/training/ortmodule/_training_manager.py
@ -151,6 +151,9 @@ class TrainingManager(GraphExecutionManager):

                backward_outputs = C.OrtValueVector()
                self._execution_agent.run_backward(backward_inputs, backward_outputs, ctx.run_info.state)
+                # Destroy the state immediately (rather than leaving it to the garbage collector) so that it
+                # does not affect peak memory usage in a subsequent graph run.
+                del ctx.run_info.state
                # Return input and initializer gradients
                num_user_input_grads = len(self._input_info.require_grad_names)
                results = []