From 4e2d88b75fd04f12e8e4dbf972f506c6fd0cf14d Mon Sep 17 00:00:00 2001
From: Atanas Dimitrov <70822030+neNasko1@users.noreply.github.com>
Date: Thu, 4 Jan 2024 09:38:28 +0200
Subject: [PATCH] Remove useless `NodeProto` serializations (#18791)

## Description
This pull request aims to enhance the efficiency of the inference
session creation by eliminating unnecessary `Node::ToProto` invocations.
The current codebase presents opportunities for optimization,
particularly in the removal of superfluous `Node::ToProto` calls, along
with their subsequent `~NodeProto` invocations.

## Motivation and Context
The optimization focus of this pull request is on addressing low-hanging
fruit in the inference session creation process. By strategically
removing undesired `Node::ToProto` calls, we aim to streamline the
codebase and enhance the overall performance. The flame graphs
illustrate the notable improvements achieved by reducing the percentage
of `Node::ToProto` calls, thereby optimizing the execution flow.

### Code Snippet
```cpp
TEST(InferenceSessionTests, Bench) {
  // Initialize logging manager
  auto logging_manager = std::make_unique<logging::LoggingManager>(
      std::unique_ptr<ISink>(new CLogSink()), logging::Severity::kVERBOSE, false,
      LoggingManager::InstanceType::Temporal);

  // Create environment
  std::unique_ptr<Environment> env;
  auto st = Environment::Create(std::move(logging_manager), env);
  ASSERT_TRUE(st.IsOK());

  // Configure session options
  SessionOptions so;
  so.execution_mode = ExecutionMode::ORT_SEQUENTIAL;
  so.graph_optimization_level = TransformerLevel::Level2;
  so.intra_op_param.thread_pool_size = 1;

  // Initialize and load the InferenceSession
  InferenceSessionTestGlobalThreadPools session1{so, *env};
  ASSERT_STATUS_OK(session1.Load("big.onnx"));
  ASSERT_STATUS_OK(session1.Initialize());
}
```

### `big.onnx` model creation
```python
import onnx
import numpy as np
from spox import argument, build, Tensor, Var
from spox.opset.ai.onnx import v17 as op
from spox.opset.ai.onnx.ml.v3 import label_encoder

a = argument(Tensor(np.int64, ('N',)))
c = a

for x in range(1000):
    c = op.mul(c, op.const(np.ones(10000, dtype=np.int64)))

for x in range(3000):
    all_strings = list("random_string" + str(i) for i in range(100))
    all_ints = list(range(len(all_strings)))
    c = label_encoder(
        c,
        keys_int64s=all_ints,
        values_strings=all_strings
    )
    c = label_encoder(c, keys_strings=all_strings, values_int64s=all_ints)

model: onnx.ModelProto = build(inputs={'a': a}, outputs={'c': c})
onnx.save(model, "big.onnx")
```

Testing in `Release` with `perf` yields:
Before: 3.3% spent in `Node::ToProto`
After: 1.6% spent in `Node::ToProto`

---------

Co-authored-by: Atanas Dimitrov <atanasdimitrov@Atanass-MacBook-Pro.local>
---
 onnxruntime/core/graph/graph.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc
index baebe24200..904b4263f4 100644
--- a/onnxruntime/core/graph/graph.cc
+++ b/onnxruntime/core/graph/graph.cc
@@ -2550,14 +2550,14 @@ Status Graph::VerifyNodeAndOpMatch(const ResolveOptions& options) {
     // Node verification.
     auto& node = *GetNode(node_index);
 
-    NodeProto node_proto;
-    node.ToProto(node_proto);
     const auto& node_name = node.Name();
 
     if (!node.Op()) {
       {
         auto status = Status::OK();
         ORT_TRY {
+          NodeProto node_proto;
+          node.ToProto(node_proto);
           checker::check_node(node_proto, ctx, lsc);
         }
         ORT_CATCH(const std::exception& ex) {
@@ -2630,8 +2630,8 @@ Status Graph::VerifyNodeAndOpMatch(const ResolveOptions& options) {
     NO_CHANGE_ON_SYNC_FLAG(ORT_RETURN_IF_ERROR(InferAndVerifyTypeMatch(node, *p_op, options)));
 
     // Accumulate output names of the iterated Node
-    for (auto& output_name : node_proto.output()) {
-      lsc.output_names.insert(output_name);
+    for (const auto& output : node.OutputDefs()) {
+      lsc.output_names.insert(output->Name());
     }
   }