From fff68c3151b774d8a2e9290e96b9f707cd950216 Mon Sep 17 00:00:00 2001
From: Baiju Meswani <bmeswani@microsoft.com>
Date: Thu, 13 Jun 2024 16:08:16 -0700
Subject: [PATCH] Avoid reusing buffer for node outputs with no consumers
 (#21019)

---
 .../core/framework/allocation_planner.cc      |  23 +++++-
 onnxruntime/core/framework/session_state.cc   |   2 +-
 .../test/framework/allocation_planner_test.cc |  37 +++++++++
 .../test/framework/execution_frame_test.cc    |  35 ++++++---
 ...fer_for_node_output_with_no_consumers.onnx | Bin 0 -> 3414 bytes
 .../invalid_dim_param_value_repetition.onnx   | Bin 2333 -> 429 bytes
 .../invalid_dim_param_value_repetition.py     |  70 ++++++++++++++++++
 7 files changed, 152 insertions(+), 15 deletions(-)
 create mode 100644 onnxruntime/test/testdata/avoid_reuse_of_buffer_for_node_output_with_no_consumers.onnx
 create mode 100644 onnxruntime/test/testdata/invalid_dim_param_value_repetition.py
diff --git a/onnxruntime/core/framework/allocation_planner.cc b/onnxruntime/core/framework/allocation_planner.cc
index 95e5380675..fec4e6e87e 100644
--- a/onnxruntime/core/framework/allocation_planner.cc
+++ b/onnxruntime/core/framework/allocation_planner.cc
@@ -469,6 +469,15 @@ class PlannerImpl {
     */
   }
 
+  static bool OutputHasConsumerNode(const Node& node, int output_idx) {
+    // there will be an edge to all consumer nodes.
+    // if consumed in a subgraph the edge will be to an implicit input of the node containing the subgraph.
+    return std::any_of(node.OutputEdgesBegin(), node.OutputEdgesEnd(),
+                       [&output_idx](const Node::EdgeEnd& edge) {
+                         return edge.GetSrcArgIndex() == output_idx;
+                       });
+  }
+
   bool SameSize(const onnxruntime::NodeArg& arg1, const onnxruntime::NodeArg& arg2) {
     if ((!arg1.Exists()) || (!arg2.Exists())) return false;
     auto p_shape1 = context_->GetShape(arg1);
@@ -1172,8 +1181,8 @@ class PlannerImpl {
                                                           value_consumer_map[output_idx_global].end());
                 reused.insert(reusable_input);
                 continue;
-              }  // if
-            }    // if
+              }
+            }
           }
         }
 
@@ -1456,7 +1465,13 @@ class PlannerImpl {
         } else if (IsNonTensor(*node_output)) {
           AllocPlan(current).alloc_kind = AllocKind::kAllocate;
         } else if (!context_->IsParallelExecutionEnabled() &&
+                   OutputHasConsumerNode(*pnode, static_cast<int>(output_arg_def_index)) &&
                    FindReusableTensor(*node_output, &reused)) {
+          // The OutputHasConsumerNode check handles an edge case where a node produces a value that is
+          // not consumed by any other nodes. If we set it to kReuse the buffer will be freed prematurely as the
+          // logic in GenerateDeallocationPlan is based on processing consumer nodes. Changing the implementation of
+          // GenerateDeallocationPlan is an alternative but that would be a much bigger change.
+
           // Reuse an available (dead) buffer for this output, this is only for sequential execution.
           Reuse(reused, current, AllocKind::kReuse);
         } else {
@@ -1906,8 +1921,8 @@ class PlannerImpl {
                     node_to_wait[it->Index()].insert({node_index, wait_handle});
                   }
                 }
-              }  // output->Exists
-            }    // for each output
+              }
+            }
             if (output_consumed_in_subgraph) {
               const auto downstream = plan_.node_stream_map_[it->Index()];
               if (downstream != i) {
diff --git a/onnxruntime/core/framework/session_state.cc b/onnxruntime/core/framework/session_state.cc
index 6244d42645..42fb7b3922 100644
--- a/onnxruntime/core/framework/session_state.cc
+++ b/onnxruntime/core/framework/session_state.cc
@@ -1410,7 +1410,7 @@ Status SessionState::FinalizeSessionStateImpl(const std::basic_string<PATH_CHAR_
   // Record the allocation plan
 
   // Uncomment the below to dump the allocation plan to std::cout
-  // LOGS(logger_, VERBOSE) << std::make_pair(p_seq_exec_plan_.get(), this);
+  // std::cout << std::make_pair(&*p_seq_exec_plan_, this);
 
 #if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE)
   GetMemoryProfiler()->Init(GetExecutionPlan(), GetOrtValueNameIdxMap());
diff --git a/onnxruntime/test/framework/allocation_planner_test.cc b/onnxruntime/test/framework/allocation_planner_test.cc
index 3a01f2c8d9..9cbf80f16e 100644
--- a/onnxruntime/test/framework/allocation_planner_test.cc
+++ b/onnxruntime/test/framework/allocation_planner_test.cc
@@ -2040,5 +2040,42 @@ TEST(AllocationPlannerTest, ReusedInputCrossDifferentStreams) {
   ASSERT_EQ(gather_count, 4) << "4 gather ops are all placed in CPU stream";
 }
 #endif
+
+#ifdef ENABLE_TRAINING_OPS
+// use a carefully constructed model to reproduce a customer-reported issue where a model produced invalid output.
+// this issue required:
+// - buffer A that is re-used later in the model
+//   - output of the first Shape node
+//   - first usage completes after the following Cast node
+// - buffer B which has the same size requirement and is used after the first usage of A is complete
+//   - buffer B is used for the output from `squeeze2` and a number of other nodes in that part of the model.
+// - re-use of buffer A for an output of a node that has no consumers whilst buffer B is still in use
+//   - this is the `per_input_length` output of the ConcatTraining node
+//
+// Because the logic to determine when a buffer can be freed is based on consumers, buffer A gets freed after the
+// Cast node. It is then re-used as buffer B because the memory pattern planner believes that block to be available.
+// When we re-use buffer A for the ConcatTraining output we are using the same address for two different node output
+// buffers, leading to corruption of the output.
+// This tests that the change in allocation planner to not re-use a buffer for outputs with no consumers prevents this.
+TEST(AllocationPlannerTest, AvoidReuseOfBufferForNodeOutputWithNoConsumers) {
+  SessionOptions sess_opt;
+  sess_opt.graph_optimization_level = TransformerLevel::Default;
+
+  InferenceSession sess(sess_opt, GetEnvironment(), ORT_TSTR("./testdata/avoid_reuse_of_buffer_for_node_output_with_no_consumers.onnx"));
+  // Assert on Load() and Initialize() separately so that a Load() failure is reported directly.
+  ASSERT_STATUS_OK(sess.Load());
+  ASSERT_STATUS_OK(sess.Initialize());
+
+  const auto& session_state = sess.GetSessionState();
+  const auto& ort_value_index_map = session_state.GetOrtValueNameIdxMap();
+  const SequentialExecutionPlan* plan = session_state.GetExecutionPlan();
+
+  OrtValueIndex concat_training_unused_out_index;
+  // Here per_input_length output of the ConcatTraining node has no consumers, so it should not reuse the buffer.
+  ASSERT_STATUS_OK(ort_value_index_map.GetIdx("per_input_length", concat_training_unused_out_index));
+  EXPECT_EQ(plan->allocation_plan[concat_training_unused_out_index].alloc_kind, AllocKind::kAllocate);
+}
+#endif
+
 }  // namespace test
 }  // namespace onnxruntime
diff --git a/onnxruntime/test/framework/execution_frame_test.cc b/onnxruntime/test/framework/execution_frame_test.cc
index 60752d7456..b95fd0b726 100644
--- a/onnxruntime/test/framework/execution_frame_test.cc
+++ b/onnxruntime/test/framework/execution_frame_test.cc
@@ -454,9 +454,14 @@ TEST_F(ExecutionFrameTest, MemPatternWithExternalOutputsTest) {
 #endif
 
 TEST(ExecutionFrameTestWithoutSessionState, BadModelInvalidDimParamUsage) {
-  // load model with 2 Scan ops that both incorrectly use shapes of { 'None', 'None' } for their outputs.
-  // as 'None' is not a special value it's treated as a variable name, leading to a runtime error when we
-  // attempt to re-use the output from the first Scan node for the second. validate we detect this and error out.
+  // Model that has 2 inputs with shape {'Symbolic', 'Symbolic'} that is carefully constructed to re-use a
+  // buffer the size of one input for an output the size of the other input.
+  // The model is fine if all values of 'Symbolic' are the same, but invalid if they are not.
+  // As both inputs claim to have the same size, the allocation plan is based on that.
+  // Code in ExecutionFrame catches what would result in buffer overflow if input 2 is actually larger than input 1
+  // and we're attempting to re-use a buffer the size of input 1.
+  // The 'real' problem being tested is inconsistent values for a dim_param in a model, which could occur anywhere
+  // in the model.
   SessionOptions so;
   so.session_logid = "BadModelInvalidDimParamUsage";
 
@@ -464,17 +469,27 @@ TEST(ExecutionFrameTestWithoutSessionState, BadModelInvalidDimParamUsage) {
   ASSERT_STATUS_OK(session_object.Load("testdata/invalid_dim_param_value_repetition.onnx"));
   ASSERT_STATUS_OK(session_object.Initialize());
 
-  std::vector<int64_t> dims_X = {10, 6};
-  std::vector<float> values_X;
-  values_X.reserve(60);
+  std::vector<int64_t> dims_X1 = {10, 6};
+  std::vector<float> values_X1;
+  values_X1.reserve(60);
   for (int i = 0; i < 60; ++i) {
-    values_X.push_back(float(i));
+    values_X1.push_back(float(i));
   }
 
-  OrtValue ml_value;
-  CreateMLValue<float>(TestCPUExecutionProvider()->CreatePreferredAllocators()[0], dims_X, values_X, &ml_value);
+  std::vector<int64_t> dims_X2 = {10, 12};
+  std::vector<float> values_X2;
+  values_X2.reserve(120);
+  for (int i = 0; i < 120; ++i) {
+    values_X2.push_back(float(i));
+  }
+
+  OrtValue ml_value1;
+  CreateMLValue<float>(TestCPUExecutionProvider()->CreatePreferredAllocators()[0], dims_X1, values_X1, &ml_value1);
+  OrtValue ml_value2;
+  CreateMLValue<float>(TestCPUExecutionProvider()->CreatePreferredAllocators()[0], dims_X2, values_X2, &ml_value2);
   NameMLValMap feeds;
-  feeds.insert(std::make_pair("X", ml_value));
+  feeds.insert({"X1", ml_value1});
+  feeds.insert({"X2", ml_value2});
 
   // prepare outputs
   std::vector<std::string> output_names;
diff --git a/onnxruntime/test/testdata/avoid_reuse_of_buffer_for_node_output_with_no_consumers.onnx b/onnxruntime/test/testdata/avoid_reuse_of_buffer_for_node_output_with_no_consumers.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..ed354f65087b835aae965f35b690435fe8dc6bde
GIT binary patch
literal 3414
zcmb_eOK;mo5EdnwqQ(y~Tg6rwr*_q#2No4-*@oKyZBeB_0JQ-EyX~PDf{`~86_Hd)
zDpqppPwBD$tC#+Zc9zRoJ}moG8HTVk-#m79z8M(?YlKl^>62BQg#NR?dvFR{Ua(vx
zY&V{{OMc>ns|1ghE|2}KD<QUbU@K1CDCyPy)aq~!Wi;EFxN)Lvx6)I;`5c9I8|q2e
z(}mghP{Kj;1CK&zg#mZGAdyGRW8qDTd89|oLwv*67`dp=j5_h}E6zXUW(EoRtt+X*
z=v7o^{9qLjnr;udsNepD)DXjMZ03YT{CYLvj_<Bj9kvUye)}>7ymZ%g2aLCzFRwj6
zPU~quhHemsOXtQ*W)An4$$Q6L^SGyps{I7pksD08lRf}!2t<HuzDQiQo1#?`x;*x`
zE`<~LA$xBq!`R-`O=dh&X07bp*I%RxdmEbW+LH;xuTa6mUc|EoO21e~owJ5ANYHOx
zN~mo@+g&We8_@=`vA%(-UDlUFdKuZ5!~6S^(6TS-WTkyMyt6L}Ctq}PUlIoSA=*&n
zQR++5SNf8~7WL&Y@5>9-mxPjOpVF1nv`<TEr;TwT=|nhaAdy&S8gVx%JfuN_e(Qx)
z0Y)TY7DGlnD@8mjMa&3H6rxmbQHT^)PNCsl6e8?x3X#StPzc@p{}iG?n<zv&OHv5m
zT2km$#vRf_+xLQ1?4;$-x}0<twLL!g-EE(}B-LM$fV{A6f3;B5Y>Gp_@p84m*-+&R
z6rsdw@?&MJ9&Tp!RdKkGjpBi#G&P*M+v$CG`%6<fk|A^_VK8wMCy87y@Pa9he=lMC
zRjHgD%v|ya5xA9{1rMgl%<5E-{k`X@!=E#MrC~MfN#(JauE%_hL64>9XS<WoANbxR
z3ghri0^h=2DWrSryhM-l5<OaCQvXO(QPr+w$Mni5JiDiEjET{RMe@Sni{-)%0v_3q
z49#qf@^%{=|7IWip{b9Gdt%(I;m7W#7U`lL8$u#J_EV#d@Qtr9HZ(*C<D$!Svy7WU
zWgnz2X=_V7%4_znsf`*)0ulO;@9crmz+@|XVcesPg)i|*^Th8<k<L3FCH(sMO%(da
zeuz<jJEGqw`_MIXW&$u{6})wvldmqIZZuyXLRMDBz(m9jO6&Nq3GMum8m>eirb829
zHKD3bL7{4+>T&k_hTf3@7)FaV(Pd-)Np*Fau!jy2!t^w=FXx|8BdDNVOj~Dr3UiWX
zC*l-;&7^VkG>$fZm@_XAug&+>eRoi-<8^A{Z~o*K%}U-VROw0-YQMZm(h_M{ho^TE
zLS!VZ8*@uq6Fo}B`c!450$0m*HfPxaC|>h(WvFV{gMEz2R1Jsfl@B)!F7?nQt@1`~
zaJ1n!U8!=ra<K}$s0sCka+GIXpC6US@9nE*wJI6QI@Sxls%o_<Nu|u+K%3jtaY|dQ
z)|skS_S&><RGqTil!x4Z-t<aYz1r#gptuL~qG+Qg)J!(o18}`T7zFEqzcA~gL+H_Y
z8S!@>zZuXvz&Y{&%Ci|oAA?mI*8{%B#o+loNZbYL58$J+KA6mICaCCw8Aj8=@_iCU
qli2`&9R(<T0=+UEHv<o19JjIgPfHk4dWE#Q&><;Kt0eMXBlbV)?eZ-E

literal 0
HcmV?d00001

diff --git a/onnxruntime/test/testdata/invalid_dim_param_value_repetition.onnx b/onnxruntime/test/testdata/invalid_dim_param_value_repetition.onnx
index 64db93e30ea9387a53e7c3006a2930cc93c3917e..ecd2d20c8a014687f59884c7901b7ecb47cb4fe8 100644
GIT binary patch
literal 429
zcmd;J72+t)&N0f*%d3#$Fw`^EvjhSq_QXs*kdOw0RWmb}92ZlBAs3rtN=kBZi4e2D
zfuR%&kYlLC3?!`>xTLv2Vju}4DYoE@#DY{qB~~yEl+*`F8i6E@r39Sw^NLFn^Gf{F
zz|smz{AeOT4ay)5#vlzQQd~i)DW%D&!KJx|N}O;u%t{lWyTB%!g56_;a1T%%WQqyM
z6f>}-F-+1FBmpMOgcu{G*nkd5PApM^(pn;1EQuAF#S;HvfN=pMlNEyyk3TRtJX2Eh
zN-|3-wS+hr1sI*6A>zey_RJaEC>2mB3CVHEa4-r<aB&H71Xt!J<>zE3BeTJ(j0mYp
Y;^$(F6yoCI02;-@#lXej1SAEx0UkDJT>t<8

literal 2333
zcmeHI!EVz)5RDVJafhOmMXJhD3K$MxAY|80(jXy9fddCrL^wb%k<&OLB5^`uqm~n&
zQX%mXoH%iVui*!nosFH;DS{Ixa+s`VXLe`i&6~BBm^FFZm%}hT<d!H`%MJY5MZZ^8
zsvEg`KhMDi7_V4iuTtep%{Triu=U@)9F#$+)t<*okA~qS^22DatF5vMw7;wDJ!S9m
zl@~!LJ_wrez@9CgGhq6M!DPocHj3yVl!L1rYJBD8*bgV8;UutUOXn<@qhLJPF?Mp=
z<tn)-QB!`BFF*66XK~-2{^O}zYFEPY!%pX55-BUe3;1Cxf%{w~Y@)Wxmnj!9OIZMI
z(6-0dGD!>jl&mGWc;w=8Nb?q_B|gpK68X5~gDFrHau&sN+Lp8}7NU>~Q8<@i>Al~N
z1LvG&tQoeM%UyZ6t5wH_i8<e)69t`E;l<{=I0!xjcCtu;nZ~C|L>?7Yz6LKm?!C8_
zz@t7DRDByH*!jtUA68mYHn5y^QT1xonpTRFN^UmM(kAJA%Y)e-c8=tiJluq07#v2e
zUdSw34C9DfPdh;v^`av?UE$4FQ&TXNDKk60ZkHFDaobh`P3{Bfm>CYo171qOz=ZvA
zU}qI+-f2UUBeDjAZK+6-3N4mOyRg+gWKt6Dekl!+nUJ@P{naE?_0@JD&5>=vre&}V
z0A@9h2JK<LcQC(hZ$TObW3Yy{tMmKE#*(uRr7#|}DA-;&ilfN}MwQ1pY$|wke{E0u
z%gy%JHg!XNp}*X}>95?^6a7`DZmH9WHEvD&OZ={Ca2u#|GEG8<5U;m+L6KSs%w}C>
zv#naO&I?ZtN465Ezo_&yMCvTRSK%g_i^R!IOCTkObh;od)AizbxlT?h4oUQrf*19G
z5pxs9CIbOQJz|Jf60gTkyk1Y4If}~!0xtuBBnadp1m533@bSSS1d<?-GYI4&1XGT0
zPB|`X3N=~fCh{A@*DU{Bpd71OW_aEZUd5|qc&?TBh#+;uf1>{*qL(v7(kLj@gvQk)
zR>*`E7YI;#t)$CM<Pk!y*>st<DkSyjXppa)_}&5}86DkVtlJCye#`Gh!MHUXwXo@_
z|M4Q-|99XN6K1?VeYT$*qYl&0*pKhu@9$3Eap(M;E+O9jHT9PNX_A(}c+CJT9HA!U
KAh(Nh?aVKBWJu`%

diff --git a/onnxruntime/test/testdata/invalid_dim_param_value_repetition.py b/onnxruntime/test/testdata/invalid_dim_param_value_repetition.py
new file mode 100644
index 0000000000..8a316116d1
--- /dev/null
+++ b/onnxruntime/test/testdata/invalid_dim_param_value_repetition.py
@@ -0,0 +1,70 @@
+"""
+Run this script to recreate the original onnx model.
+Example usage:
+python invalid_dim_param_value_repetition.py
+"""
+
+import numpy as np
+import onnx
+
+
+def order_repeated_field(repeated_proto, key_name, order):
+    order = list(order)
+    repeated_proto.sort(key=lambda x: order.index(getattr(x, key_name)))
+
+
+def make_node(op_type, inputs, outputs, name=None, doc_string=None, domain=None, **kwargs):
+    node = onnx.helper.make_node(op_type, inputs, outputs, name, doc_string, domain, **kwargs)
+    if doc_string == "":
+        node.doc_string = ""
+    order_repeated_field(node.attribute, "name", kwargs.keys())
+    return node
+
+
+def make_graph(*args, doc_string=None, **kwargs):
+    graph = onnx.helper.make_graph(*args, doc_string=doc_string, **kwargs)
+    if doc_string == "":
+        graph.doc_string = ""
+    return graph
+
+
+model = onnx.helper.make_model(
+    opset_imports=[onnx.helper.make_operatorsetid("", 11)],
+    ir_version=5,
+    producer_name="skl2onnx",
+    producer_version="1.5.9999",
+    domain="ai.onnx",
+    model_version=0,
+    graph=make_graph(
+        name="OnnxIdentity",
+        inputs=[
+            onnx.helper.make_tensor_value_info("X1", onnx.TensorProto.FLOAT, shape=["Symbolic", "Symbolic"]),
+            onnx.helper.make_tensor_value_info("X2", onnx.TensorProto.FLOAT, shape=["Symbolic", "Symbolic"]),
+        ],
+        outputs=[
+            onnx.helper.make_tensor_value_info("Y", onnx.TensorProto.FLOAT, shape=[None, None]),
+        ],
+        initializer=[
+            onnx.numpy_helper.from_array(np.array([0.10000000149011612], dtype="float32"), name="Addcst"),
+        ],
+        nodes=[
+            # take an input. Add to create a local output buffer for O01.
+            make_node("Add", inputs=["X1", "Addcst"], outputs=["O01"], name="Add1", domain=""),
+            # Use Shape -> ConstantOfShape to make O01 available for reuse
+            make_node("Shape", inputs=["O01"], outputs=["O02"], name="Shape1", domain=""),
+            # ConstantOfShape to get back to the right rank, and ReduceSum so the value is broadcastable in the
+            # downstream Add
+            make_node("ConstantOfShape", inputs=["O02"], outputs=["O03"], name="ConstantOfShape ", domain=""),
+            make_node("ReduceSum", inputs=["O03"], outputs=["O04"], name="ReduceSum1", domain=""),
+            # Two Add nodes with the ReduceSum output. One could be in-place, but the other needs a buffer.
+            # This should trigger attempted re-use of O01, so provided X2 is larger than X1 that should break
+            make_node("Add", inputs=["O04", "X2"], outputs=["O05"], name="Add2", domain=""),
+            make_node("Add", inputs=["X2", "O04"], outputs=["O06"], name="Add3", domain=""),
+            # concat to separate the Add outputs from graph output (which is always allocated)
+            make_node("Concat", inputs=["O05", "O06"], outputs=["Y"], axis=-1, name="Concat", domain=""),
+        ],
+    ),
+)
+
+if __name__ == "__main__":
+    onnx.save(model, "invalid_dim_param_value_repetition.onnx")