Adding RunOptions synchronization behaviour to C/C++ API (#14088)

### Description
This exposes the already existing interface for asynchronous work of
all CUDA-based EPs (CUDA + TensorRT).


### Motivation and Context
This was requested in #12216. It enables users to build an
efficient data pipeline with ONNX Runtime and CUDA pre-/post-processing.
PCI traffic to the CUDA device can run during inference as soon as
the postprocessing has consumed the input buffer and it can be overwritten.
To do this, work has to be submitted asynchronously to the device. Please see
the screenshots below, which illustrate this using Nsight Systems.

Async: 
<img width="1401" alt="image"
src="https://user-images.githubusercontent.com/44298237/209894303-706460ed-cbdb-4be2-a2e4-0c111ec875dd.png">

Synchronous:
<img width="1302" alt="image"
src="https://user-images.githubusercontent.com/44298237/209894630-1ce40925-bbd5-470d-b888-46553ab75fb9.png">

Note the gap between the two inference runs, caused by the PCI traffic
issued in between and by the CPU overhead of the active synchronization.

---------

Co-authored-by: Chi Lo <chi.lo@microsoft.com>
This commit is contained in:
Maximilian Müller 2023-02-08 04:59:28 +01:00 committed by GitHub
parent cd7098fdf4
commit e9ab56fa64
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 12 additions and 9 deletions

View file

@ -27,10 +27,6 @@ struct OrtRunOptions {
// So it is possible that only some of the nodes are executed.
bool only_execute_path_to_fetches = false;
// Set to 'true' to synchronize execution providers with CPU at the end of session run.
// Taking CUDA EP as an example, it will trigger cudaStreamSynchronize on the compute stream.
bool synchronize_execution_providers = true;
#ifdef ENABLE_TRAINING
// Used by onnxruntime::training::TrainingSession. This class is now deprecated.
// Delete training_mode when TrainingSession is deleted.

View file

@ -25,3 +25,8 @@
// Example usage: "cpu:0;gpu:0" (or) "gpu:0"
// By default, the value for this key is empty (i.e.) no memory arenas are shrunk
static const char* const kOrtRunOptionsConfigEnableMemoryArenaShrinkage = "memory.enable_memory_arena_shrinkage";
// Set to '1' to skip synchronizing execution providers with the CPU at the end of session run.
// By default the value is '0', i.e. execution providers are synchronized.
// Taking CUDA EP as an example, setting it to '1' omits triggering cudaStreamSynchronize on the compute stream.
static const char* const kOrtRunOptionsConfigDisableSynchronizeExecutionProviders = "disable_synchronize_execution_providers";

View file

@ -20,6 +20,8 @@
#include "core/framework/tensorprotoutils.h"
#include "core/mlas/inc/mlas.h"
#include "core/framework/TensorSeq.h"
#include "core/framework/run_options.h"
#include "core/session/onnxruntime_run_options_config_keys.h"
#ifdef USE_AZURE
#include "core/framework/cloud_executor.h"
#endif
@ -793,13 +795,14 @@ common::Status ExecuteGraph(const SessionState& session_state,
logger);
}
#endif
bool synchronize_execution_providers = run_options.config_options.GetConfigOrDefault(kOrtRunOptionsConfigDisableSynchronizeExecutionProviders, "0") == "0";
return ExecuteGraph(session_state,
feeds_fetches_manager,
feeds, fetches,
execution_mode,
run_options.terminate,
logger,
run_options.synchronize_execution_providers,
synchronize_execution_providers,
run_options.only_execute_path_to_fetches);
}

View file

@ -1997,7 +1997,8 @@ Status InferenceSession::Run(const RunOptions& run_options,
// info all execution providers InferenceSession:Run ended
for (auto* xp : exec_providers_to_stop) {
auto status = xp->OnRunEnd(run_options.synchronize_execution_providers);
bool synchronize_execution_providers = run_options.config_options.GetConfigOrDefault(kOrtRunOptionsConfigDisableSynchronizeExecutionProviders, "0") == "0";
auto status = xp->OnRunEnd(synchronize_execution_providers);
ORT_CHECK_AND_SET_RETVAL(status);
}

View file

@ -1368,8 +1368,6 @@ RunOptions instance. The individual calls will exit gracefully and return an err
#endif
.def_readwrite("only_execute_path_to_fetches", &RunOptions::only_execute_path_to_fetches,
R"pbdoc(Only execute the nodes needed by fetch list)pbdoc")
.def_readwrite("synchronize_execution_providers", &RunOptions::synchronize_execution_providers,
R"pbdoc(Synchronize execution providers after executing session.)pbdoc")
.def(
"add_run_config_entry",
[](RunOptions* options, const char* config_key, const char* config_value) -> void {

View file

@ -396,7 +396,7 @@ def _run_onnx_session_with_ortvaluevector(
_nvtx_range_push("run_with_ortvaluevector")
run_options = onnxruntime.RunOptions()
run_options.synchronize_execution_providers = True
run_options.add_run_config_entry("disable_synchronize_execution_providers", "1")
sess.run_with_ortvaluevector(run_options, input_names, ort_inputs, output_names, ort_outputs, output_devices)
_nvtx_range_pop()