From e9ab56fa64c0644a2dc5287d1cd3b945bf7d7981 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20M=C3=BCller?=
 <44298237+gedoensmax@users.noreply.github.com>
Date: Wed, 8 Feb 2023 04:59:28 +0100
Subject: [PATCH] Adding RunOptions synchronization behaviour to C/C++ API
 (#14088)

### Description
This exposes the already existing interface for asynchronous work of
all CUDA-based EPs (CUDA + TensorRT).


### Motivation and Context
This is something requested in #12216. It will enable users to build an
efficient data pipeline with ONNXRuntime and CUDA pre-/post-processing.
PCI traffic to the CUDA device can run during inference as soon as
the postprocessing has consumed the input buffer and it can be overwritten.
To do this, work has to be submitted asynchronously to the device. Please see
the screenshots below, which illustrate this using Nsight Systems.

Async:
<img width="1401" alt="image"
src="https://user-images.githubusercontent.com/44298237/209894303-706460ed-cbdb-4be2-a2e4-0c111ec875dd.png">

Synchronous:
<img width="1302" alt="image"
src="https://user-images.githubusercontent.com/44298237/209894630-1ce40925-bbd5-470d-b888-46553ab75fb9.png">

Note the gap between the two inference runs, caused by issuing PCI traffic
in between and by the CPU overhead of the active synchronization.

---------

Co-authored-by: Chi Lo <chi.lo@microsoft.com>
---
 include/onnxruntime/core/framework/run_options.h             | 4 ----
 .../core/session/onnxruntime_run_options_config_keys.h       | 5 +++++
 onnxruntime/core/framework/utils.cc                          | 5 ++++-
 onnxruntime/core/session/inference_session.cc                | 3 ++-
 onnxruntime/python/onnxruntime_pybind_state.cc               | 2 --
 .../orttraining/python/training/torchdynamo/ort_backend.py   | 2 +-
 6 files changed, 12 insertions(+), 9 deletions(-)
diff --git a/include/onnxruntime/core/framework/run_options.h b/include/onnxruntime/core/framework/run_options.h
index e5a84e7aa7..5444c825d7 100644
--- a/include/onnxruntime/core/framework/run_options.h
+++ b/include/onnxruntime/core/framework/run_options.h
@@ -27,10 +27,6 @@ struct OrtRunOptions {
   // So it is possible that only some of the nodes are executed.
   bool only_execute_path_to_fetches = false;
 
-  // Set to 'true' to synchronize execution providers with CPU at the end of session run.
-  // Taking CUDA EP as an example, it will trigger cudaStreamSynchronize on the compute stream.
-  bool synchronize_execution_providers = true;
-
 #ifdef ENABLE_TRAINING
   // Used by onnxruntime::training::TrainingSession. This class is now deprecated.
   // Delete training_mode when TrainingSession is deleted.
diff --git a/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h
index 49b46ca077..1f5fcd50e1 100644
--- a/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h
+++ b/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h
@@ -25,3 +25,8 @@
 // Example usage: "cpu:0;gpu:0" (or) "gpu:0"
 // By default, the value for this key is empty (i.e.) no memory arenas are shrunk
 static const char* const kOrtRunOptionsConfigEnableMemoryArenaShrinkage = "memory.enable_memory_arena_shrinkage";
+
+// Set to '1' to not synchronize execution providers with CPU at the end of session run.
+// By default it is set to '0'.
+// Taking CUDA EP as an example, it omits triggering cudaStreamSynchronize on the compute stream.
+static const char* const kOrtRunOptionsConfigDisableSynchronizeExecutionProviders = "disable_synchronize_execution_providers";
diff --git a/onnxruntime/core/framework/utils.cc b/onnxruntime/core/framework/utils.cc
index 79691d7b51..f88d098454 100644
--- a/onnxruntime/core/framework/utils.cc
+++ b/onnxruntime/core/framework/utils.cc
@@ -20,6 +20,8 @@
 #include "core/framework/tensorprotoutils.h"
 #include "core/mlas/inc/mlas.h"
 #include "core/framework/TensorSeq.h"
+#include "core/framework/run_options.h"
+#include "core/session/onnxruntime_run_options_config_keys.h"
 #ifdef USE_AZURE
 #include "core/framework/cloud_executor.h"
 #endif
@@ -793,13 +795,14 @@ common::Status ExecuteGraph(const SessionState& session_state,
                                   logger);
   }
 #endif
+  bool synchronize_execution_providers = run_options.config_options.GetConfigOrDefault(kOrtRunOptionsConfigDisableSynchronizeExecutionProviders, "0") == "0";
   return ExecuteGraph(session_state,
                       feeds_fetches_manager,
                       feeds, fetches,
                       execution_mode,
                       run_options.terminate,
                       logger,
-                      run_options.synchronize_execution_providers,
+                      synchronize_execution_providers,
                       run_options.only_execute_path_to_fetches);
 }
 
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index 43ccfa2962..79068a0271 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -1997,7 +1997,8 @@ Status InferenceSession::Run(const RunOptions& run_options,
 
     // info all execution providers InferenceSession:Run ended
     for (auto* xp : exec_providers_to_stop) {
-      auto status = xp->OnRunEnd(run_options.synchronize_execution_providers);
+      bool synchronize_execution_providers = run_options.config_options.GetConfigOrDefault(kOrtRunOptionsConfigDisableSynchronizeExecutionProviders, "0") == "0";
+      auto status = xp->OnRunEnd(synchronize_execution_providers);
       ORT_CHECK_AND_SET_RETVAL(status);
     }
 
diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc
index 7d4fb6d32c..490eb92afc 100644
--- a/onnxruntime/python/onnxruntime_pybind_state.cc
+++ b/onnxruntime/python/onnxruntime_pybind_state.cc
@@ -1368,8 +1368,6 @@ RunOptions instance. The individual calls will exit gracefully and return an err
 #endif
       .def_readwrite("only_execute_path_to_fetches", &RunOptions::only_execute_path_to_fetches,
                      R"pbdoc(Only execute the nodes needed by fetch list)pbdoc")
-      .def_readwrite("synchronize_execution_providers", &RunOptions::synchronize_execution_providers,
-                     R"pbdoc(Synchronize execution providers after executing session.)pbdoc")
       .def(
           "add_run_config_entry",
           [](RunOptions* options, const char* config_key, const char* config_value) -> void {
diff --git a/orttraining/orttraining/python/training/torchdynamo/ort_backend.py b/orttraining/orttraining/python/training/torchdynamo/ort_backend.py
index bd397edf67..0319fecf69 100644
--- a/orttraining/orttraining/python/training/torchdynamo/ort_backend.py
+++ b/orttraining/orttraining/python/training/torchdynamo/ort_backend.py
@@ -396,7 +396,7 @@ def _run_onnx_session_with_ortvaluevector(
 
     _nvtx_range_push("run_with_ortvaluevector")
     run_options = onnxruntime.RunOptions()
-    run_options.synchronize_execution_providers = True
+    run_options.add_run_config_entry("disable_synchronize_execution_providers", "1")
     sess.run_with_ortvaluevector(run_options, input_names, ort_inputs, output_names, ort_outputs, output_devices)
     _nvtx_range_pop()