From 4e2d88b75fd04f12e8e4dbf972f506c6fd0cf14d Mon Sep 17 00:00:00 2001 From: Atanas Dimitrov <70822030+neNasko1@users.noreply.github.com> Date: Thu, 4 Jan 2024 09:38:28 +0200 Subject: [PATCH] Remove useless `NodeProto` serializations (#18791) ## Description This pull request aims to enhance the efficiency of the inference session creation by eliminating unnecessary `Node::ToProto` invocations. The current codebase presents opportunities for optimization, particularly in the removal of superfluous `Node::ToProto` calls, along with their subsequent `~NodeProto` invocations. ## Motivation and Context The optimization focus of this pull request is on addressing low-hanging fruit in the inference session creation process. By strategically removing undesired `Node::ToProto` calls, we aim to streamline the codebase and enhance the overall performance. The flame graphs illustrate the notable improvements achieved by reducing the percentage of `Node::ToProto` calls, thereby optimizing the execution flow. 
### Code Snippet ```cpp TEST(InferenceSessionTests, Bench) { // Initialize logging manager auto logging_manager = std::make_unique<logging::LoggingManager>( std::unique_ptr<ISink>(new CLogSink()), logging::Severity::kVERBOSE, false, LoggingManager::InstanceType::Temporal); // Create environment std::unique_ptr<Environment> env; auto st = Environment::Create(std::move(logging_manager), env); ASSERT_TRUE(st.IsOK()); // Configure session options SessionOptions so; so.execution_mode = ExecutionMode::ORT_SEQUENTIAL; so.graph_optimization_level = TransformerLevel::Level2; so.intra_op_param.thread_pool_size = 1; // Initialize and load the InferenceSession InferenceSessionTestGlobalThreadPools session1{so, *env}; ASSERT_STATUS_OK(session1.Load("big.onnx")); ASSERT_STATUS_OK(session1.Initialize()); } ``` ### `big.onnx` model creation ```python import onnx import numpy as np from spox import argument, build, Tensor, Var from spox.opset.ai.onnx import v17 as op from spox.opset.ai.onnx.ml.v3 import label_encoder a = argument(Tensor(np.int64, ('N',))) c = a for x in range(1000): c = op.mul(c, op.const(np.ones(10000, dtype=np.int64))) for x in range(3000): all_strings = list("random_string" + str(i) for i in range(100)) all_ints = list(range(len(all_strings))) c = label_encoder( c, keys_int64s=all_ints, values_strings=all_strings ) c = label_encoder(c, keys_strings=all_strings, values_int64s=all_ints) model: onnx.ModelProto = build(inputs={'a': a}, outputs={'c': c}) onnx.save(model, "big.onnx") ``` Testing in `Release` with `perf` yields: Before: 3.3% spent in `Node::ToProto` After: 1.6% spent in `Node::ToProto` --------- Co-authored-by: Atanas Dimitrov --- onnxruntime/core/graph/graph.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index baebe24200..904b4263f4 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -2550,14 +2550,14 @@ Status Graph::VerifyNodeAndOpMatch(const ResolveOptions& 
options) { // Node verification. auto& node = *GetNode(node_index); - NodeProto node_proto; - node.ToProto(node_proto); const auto& node_name = node.Name(); if (!node.Op()) { { auto status = Status::OK(); ORT_TRY { + NodeProto node_proto; + node.ToProto(node_proto); checker::check_node(node_proto, ctx, lsc); } ORT_CATCH(const std::exception& ex) { @@ -2630,8 +2630,8 @@ Status Graph::VerifyNodeAndOpMatch(const ResolveOptions& options) { NO_CHANGE_ON_SYNC_FLAG(ORT_RETURN_IF_ERROR(InferAndVerifyTypeMatch(node, *p_op, options))); // Accumulate output names of the iterated Node - for (auto& output_name : node_proto.output()) { - lsc.output_names.insert(output_name); + for (const auto& output : node.OutputDefs()) { + lsc.output_names.insert(output->Name()); } }