From fff68c3151b774d8a2e9290e96b9f707cd950216 Mon Sep 17 00:00:00 2001 From: Baiju Meswani Date: Thu, 13 Jun 2024 16:08:16 -0700 Subject: [PATCH] Avoid reusing buffer for node outputs with no consumers (#21019) --- .../core/framework/allocation_planner.cc | 23 +++++- onnxruntime/core/framework/session_state.cc | 2 +- .../test/framework/allocation_planner_test.cc | 37 +++++++++ .../test/framework/execution_frame_test.cc | 35 ++++++--- ...fer_for_node_output_with_no_consumers.onnx | Bin 0 -> 3414 bytes .../invalid_dim_param_value_repetition.onnx | Bin 2333 -> 429 bytes .../invalid_dim_param_value_repetition.py | 70 ++++++++++++++++++ 7 files changed, 152 insertions(+), 15 deletions(-) create mode 100644 onnxruntime/test/testdata/avoid_reuse_of_buffer_for_node_output_with_no_consumers.onnx create mode 100644 onnxruntime/test/testdata/invalid_dim_param_value_repetition.py diff --git a/onnxruntime/core/framework/allocation_planner.cc b/onnxruntime/core/framework/allocation_planner.cc index 95e5380675..fec4e6e87e 100644 --- a/onnxruntime/core/framework/allocation_planner.cc +++ b/onnxruntime/core/framework/allocation_planner.cc @@ -469,6 +469,15 @@ class PlannerImpl { */ } + static bool OutputHasConsumerNode(const Node& node, int output_idx) { + // there will be an edge to all consumer nodes. + // if consumed in a subgraph the edge will be to an implicit input of the node containing the subgraph. + return std::any_of(node.OutputEdgesBegin(), node.OutputEdgesEnd(), + [&output_idx](const Node::EdgeEnd& edge) { + return edge.GetSrcArgIndex() == output_idx; + }); + } + bool SameSize(const onnxruntime::NodeArg& arg1, const onnxruntime::NodeArg& arg2) { if ((!arg1.Exists()) || (!arg2.Exists())) return false; auto p_shape1 = context_->GetShape(arg1); @@ -1172,8 +1181,8 @@ class PlannerImpl { value_consumer_map[output_idx_global].end()); reused.insert(reusable_input); continue; - } // if - } // if + } + } } } @@ -1456,7 +1465,13 @@ class PlannerImpl { } else if (IsNonTensor(*node_output)) { AllocPlan(current).alloc_kind = AllocKind::kAllocate; } else if (!context_->IsParallelExecutionEnabled() && + OutputHasConsumerNode(*pnode, static_cast(output_arg_def_index)) && FindReusableTensor(*node_output, &reused)) { + // The check that OutputHasConsumerNode is to handle an edge case where a node produces a value that is + // not consumed by any other nodes. If we set it to kReuse the buffer will be freed prematurely as the + // logic in GenerateDeallocationPlan is based on processing consumer nodes. Changing the implementation of + // GenerateDeallocationPlan is an alternative but that would be a much bigger change. + // Reuse an available (dead) buffer for this output, this is only for sequential execution. Reuse(reused, current, AllocKind::kReuse); } else { @@ -1906,8 +1921,8 @@ class PlannerImpl { node_to_wait[it->Index()].insert({node_index, wait_handle}); } } - } // output->Exists - } // for each output + } + } if (output_consumed_in_subgraph) { const auto downstream = plan_.node_stream_map_[it->Index()]; if (downstream != i) { diff --git a/onnxruntime/core/framework/session_state.cc b/onnxruntime/core/framework/session_state.cc index 6244d42645..42fb7b3922 100644 --- a/onnxruntime/core/framework/session_state.cc +++ b/onnxruntime/core/framework/session_state.cc @@ -1410,7 +1410,7 @@ Status SessionState::FinalizeSessionStateImpl(const std::basic_stringInit(GetExecutionPlan(), GetOrtValueNameIdxMap()); diff --git a/onnxruntime/test/framework/allocation_planner_test.cc b/onnxruntime/test/framework/allocation_planner_test.cc index 3a01f2c8d9..9cbf80f16e 100644 --- a/onnxruntime/test/framework/allocation_planner_test.cc +++ b/onnxruntime/test/framework/allocation_planner_test.cc @@ -2040,5 +2040,42 @@ TEST(AllocationPlannerTest, ReusedInputCrossDifferentStreams) { ASSERT_EQ(gather_count, 4) << "4 gather ops are all placed in CPU stream"; } #endif + +#ifdef ENABLE_TRAINING_OPS +// use a carefully constructed model to re-produce a customer reported issue where a model produced invalid output. +// this issue required: +// - buffer A that is re-used later in the model +// - output of the first Shape node +// - first usage completes after the following Cast node +// - buffer B which has the same size requirement and is used after the first usage of A is complete +// - buffer B is used for the output from `squeeze2` and a number of other nodes in that part of the model. +// - re-use of buffer A for an output of a node that has no consumers whilst buffer B is still in use +// - this is the `per_input_length` output of the ConcatTraining node +// +// Because the logic to determine when a buffer can be freed is based on consumers, buffer A gets freed after the +// Cast node. It is then re-used as buffer B because the memory pattern planner believes that block to be available. +// When we re-use buffer A for the ConcatTraining output we are using the same address for two different node output +// buffers, leading to corruption of the output. +// This tests that the change in allocation planner to not re-use a buffer for outputs with no consumers prevents this. +TEST(AllocationPlannerTest, AvoidReuseOfBufferForNodeOutputWithNoConsumers) { + SessionOptions sess_opt; + sess_opt.graph_optimization_level = TransformerLevel::Default; + + InferenceSession sess(sess_opt, GetEnvironment(), ORT_TSTR("./testdata/avoid_reuse_of_buffer_for_node_output_with_no_consumers.onnx")); + auto status = sess.Load(); + status = sess.Initialize(); + ASSERT_TRUE(status.IsOK()); + + const auto& session_state = sess.GetSessionState(); + const auto& ort_value_index_map = session_state.GetOrtValueNameIdxMap(); + const SequentialExecutionPlan* plan = session_state.GetExecutionPlan(); + + OrtValueIndex concat_training_unused_out_index; + // Here per_input_length output of the ConcatTraining node has no consumers, so it should not reuse the buffer. + ASSERT_STATUS_OK(ort_value_index_map.GetIdx("per_input_length", concat_training_unused_out_index)); + EXPECT_EQ(plan->allocation_plan[concat_training_unused_out_index].alloc_kind, AllocKind::kAllocate); +} +#endif + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/framework/execution_frame_test.cc b/onnxruntime/test/framework/execution_frame_test.cc index 60752d7456..b95fd0b726 100644 --- a/onnxruntime/test/framework/execution_frame_test.cc +++ b/onnxruntime/test/framework/execution_frame_test.cc @@ -454,9 +454,14 @@ TEST_F(ExecutionFrameTest, MemPatternWithExternalOutputsTest) { #endif TEST(ExecutionFrameTestWithoutSessionState, BadModelInvalidDimParamUsage) { - // load model with 2 Scan ops that both incorrectly use shapes of { 'None', 'None' } for their outputs. - // as 'None' is not a special value it's treated as a variable name, leading to a runtime error when we - // attempt to re-use the output from the first Scan node for the second. validate we detect this and error out. + // Model that has 2 inputs with shape {'Symbolic', 'Symbolic'} that is carefully constructed to re-use a + // buffer the size of one input for output the size of the other input. + // The model is fine if all values of 'Symbolic' are the same, but invalid if they are not. + // As both inputs claim to have the same size, the allocation plan is based on that. + // Code in ExecutionFrame catches what would result in buffer overflow if input 2 is actually larger than input 1 + // and we're attempting to re-use a buffer the size of input 1. + // The 'real' problem being tested is inconsistent values for a dim_param in a model, which could occur anywhere + // in the model. SessionOptions so; so.session_logid = "BadModelInvalidDimParamUsage"; @@ -464,17 +469,27 @@ TEST(ExecutionFrameTestWithoutSessionState, BadModelInvalidDimParamUsage) { ASSERT_STATUS_OK(session_object.Load("testdata/invalid_dim_param_value_repetition.onnx")); ASSERT_STATUS_OK(session_object.Initialize()); - std::vector dims_X = {10, 6}; - std::vector values_X; - values_X.reserve(60); + std::vector dims_X1 = {10, 6}; + std::vector values_X1; + values_X1.reserve(60); for (int i = 0; i < 60; ++i) { - values_X.push_back(float(i)); + values_X1.push_back(float(i)); } - OrtValue ml_value; - CreateMLValue(TestCPUExecutionProvider()->CreatePreferredAllocators()[0], dims_X, values_X, &ml_value); + std::vector dims_X2 = {10, 12}; + std::vector values_X2; + values_X2.reserve(120); + for (int i = 0; i < 120; ++i) { + values_X2.push_back(float(i)); + } + + OrtValue ml_value1; + CreateMLValue(TestCPUExecutionProvider()->CreatePreferredAllocators()[0], dims_X1, values_X1, &ml_value1); + OrtValue ml_value2; + CreateMLValue(TestCPUExecutionProvider()->CreatePreferredAllocators()[0], dims_X2, values_X2, &ml_value2); NameMLValMap feeds; - feeds.insert(std::make_pair("X", ml_value)); + feeds.insert({"X1", ml_value1}); + feeds.insert({"X2", ml_value2}); // prepare outputs std::vector output_names; diff --git a/onnxruntime/test/testdata/avoid_reuse_of_buffer_for_node_output_with_no_consumers.onnx b/onnxruntime/test/testdata/avoid_reuse_of_buffer_for_node_output_with_no_consumers.onnx new file mode 100644 index 0000000000000000000000000000000000000000..ed354f65087b835aae965f35b690435fe8dc6bde GIT binary patch literal 3414 zcmb_eOK;mo5EdnwqQ(y~Tg6rwr*_q#2No4-*@oKyZBeB_0JQ-EyX~PDf{`~86_Hd) zDpqppPwBD$tC#+Zc9zRoJ}moG8HTVk-#m79z8M(?YlKl^>62BQg#NR?dvFR{Ua(vx zY&V{{OMc>ns|1ghE|2}KD7ymZ%g2aLCzFRwj6 zPU~quhHemsOXtQ*W)An4$$Q6L^SGyps{I7pksD08lRf}!2t9-mxPjOpVF1nv`Gp_@p84m*-+&R z6rsdw@?&MJ9&Tp!RdKkGjpBi#G&P*M+v$CG`%69XSeirb829 zHKD3bL7{4+>T&k_hTf3@7)FaV(Pd-)Np*Fau!jy2!t^w=FXx|8BdDNVOj~Dr3UiWX zC*l-;&7^VkG>$fZm@_XAug&+>eRoi-<8^A{Z~o*K%}U-VROw0-YQMZm(h_M{ho^TE zLS!VZ8*@uq6Fo}B`c!450$0m*HfPxaC|>h(WvFV{gMEz2R1Jsfl@B)!F7?nQt@1`~ zaJ1n!U8!=razZuXvz&Y{&%Ci|oAA?mI*8{%B#o+loNZbYL58$J+KA6mICaCCw8Aj8=@_iCU qli2`&9R(<;Kt0eMXBlbV)?eZ-E literal 0 HcmV?d00001 diff --git a/onnxruntime/test/testdata/invalid_dim_param_value_repetition.onnx b/onnxruntime/test/testdata/invalid_dim_param_value_repetition.onnx index 64db93e30ea9387a53e7c3006a2930cc93c3917e..ecd2d20c8a014687f59884c7901b7ecb47cb4fe8 100644 GIT binary patch literal 429 zcmd;J72+t)&N0f*%d3#$Fw`^EvjhSq_QXs*kdOw0RWmb}92ZlBAs3rtN=kBZi4e2D zfuR%&kYlLC3?!`>xTLv2Vju}4DYoE@#DY{qB~~yEl+*`F8i6E@r39Sw^NLFn^Gf{F zz|smz{AeOT4ay)5#vlzQQd~i)DW%D&!KJx|N}O;u%t{lWyTB%!g56_;a1T%%WQqyM z6f>}-F-+1FBmpMOgcu{G*nkd5PApM^(pn;1EQuAF#S;HvfN=pMlNEyyk3TRtJX2Eh zN-|3-wS+hr1sI*6A>zey_RJaEC>2mB3CVHEa4-rzE3BeTJ(j0mYp Y;^$(F6yoCI02;-@#lXej1SAEx0UkDJT>t<8 literal 2333 zcmeHI!EVz)5RDVJafhOmMXJhD3K$MxAY|80(jXy9fddCrL^wb%k<&OLB5^`uqm~n& zQX%mXoH%iVui*!nosFH;DS{Ixa+s`VXLe`i&6~BBm^FFZm%}hTjl&mGWc;w=8Nb?q_B|gpK68X5~gDFrHau&sN+Lp8}7NU>~Q8<@i>Al~N z1LvG&tQoeM%UyZ6t5wH_i8?7Yz6LKm?!C8_ zz@t7DRDByH*!jtUA68mYHn5y^QT1xonpTRFN^UmM(kAJA%Y)e-c8=tiJluq07#v2e zUdSw34C9DfPdh;v^`av?UE$4FQ&TXNDKk60ZkHFDaobh`P3{Bfm>CYo171qOz=ZvA zU}qI+-f2UUBeDjAZK+6-3N4mOyRg+gWKt6Dekl!+nUJ@P{naE?_0@JD&5>=vre&}V z0A@9h2JK95?^6a7`DZmH9WHEvD&OZ={Ca2u#|GEG8<5U;m+L6KSs%w}C> zv#naO&I?ZtN465Ezo_&yMCvTRSK%g_i^R!IOCTkObh;od)AizbxlT?h4oUQrf*19G z5pxs9CIbOQJz|Jf60gTkyk1Y4If}~!0xtuBBnadp1m533@bSSS1d{*qL(v7(kLj@gvQk) zR>*`E7YI;#t)$CMst ConstantOfShape to make O01 available for reuse + make_node("Shape", inputs=["O01"], outputs=["O02"], name="Shape1", domain=""), + # ConstantOfShape to get back to the right rank, and ReduceSum so the value is broadcastable in the + # the downstream Add + make_node("ConstantOfShape", inputs=["O02"], outputs=["O03"], name="ConstantOfShape ", domain=""), + make_node("ReduceSum", inputs=["O03"], outputs=["O04"], name="ReduceSum1", domain=""), + # Two Add nodes with the ReduceSum output. One could be in-place, but the other needs a buffer. + # This should trigger attempted re-use of O01, so provided X2 is larger than X1 that should break + make_node("Add", inputs=["O04", "X2"], outputs=["O05"], name="Add2", domain=""), + make_node("Add", inputs=["X2", "O04"], outputs=["O06"], name="Add3", domain=""), + # concat to separate the Add outputs from graph output (which is always allocated) + make_node("Concat", inputs=["O05", "O06"], outputs=["Y"], axis=-1, name="Concat", domain=""), + ], + ), +) + +if __name__ == "__main__": + onnx.save(model, "invalid_dim_param_value_repetition.onnx")