[TensorRT EP] support user_compute_stream in python API (#20168)

### Description  * Implement `user_compute_stream` python api for TensorRT EP * Using this option will implicitly set `has_user_compute_stream` as `true` * Extend existing TRTEP unit test to verify `user_compute_stream` option * This has been verified in local pytorch env, with `torch.cuda.Stream()` passing into `user_compute_stream`: ```python ... # Before inference if torch.cuda.is_available(): s = torch.cuda.Stream() option = {"user_compute_stream": str(s.cuda_stream)} sess.set_providers(["TensorrtExecutionProvider"], [option]) options = sess.get_provider_options() assert "TensorrtExecutionProvider" in options assert options["TensorrtExecutionProvider"].get("user_compute_stream", "") == str(s.cuda_stream) assert options["TensorrtExecutionProvider"].get("has_user_compute_stream", "") == "1" ... ``` ### Motivation and Context  Align with existing `user_compute_stream` python implementations for [CUDA EP](https://github.com/microsoft/onnxruntime/pull/19229)/[ROCm EP](https://github.com/microsoft/onnxruntime/pull/19619)
2026-07-09 17:28:58 +00:00 · 2024-04-16 12:49:29 -07:00 · 2024-04-16 12:49:29 -07:00 · 54f91ea65a
commit 54f91ea65a
parent e02aef1ded
3 changed files with 44 additions and 1 deletions
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc
@ -14,6 +14,7 @@ namespace tensorrt {
 namespace provider_option_names {
 constexpr const char* kDeviceId = "device_id";
 constexpr const char* kHasUserComputeStream = "has_user_compute_stream";
+constexpr const char* kUserComputeStream = "user_compute_stream";
 constexpr const char* kMaxPartitionIterations = "trt_max_partition_iterations";
 constexpr const char* kMinSubgraphSize = "trt_min_subgraph_size";
 constexpr const char* kMaxWorkspaceSize = "trt_max_workspace_size";
@ -55,6 +56,7 @@ constexpr const char* kDumpEpContextModel = "trt_dump_ep_context_model";

 TensorrtExecutionProviderInfo TensorrtExecutionProviderInfo::FromProviderOptions(const ProviderOptions& options) {
  TensorrtExecutionProviderInfo info{};
+  void* user_compute_stream = nullptr;
  ORT_THROW_IF_ERROR(
      ProviderOptionsParser{}
          .AddValueParser(
@ -71,6 +73,14 @@ TensorrtExecutionProviderInfo TensorrtExecutionProviderInfo::FromProviderOptions
              })
          .AddAssignmentToReference(tensorrt::provider_option_names::kMaxPartitionIterations, info.max_partition_iterations)
          .AddAssignmentToReference(tensorrt::provider_option_names::kHasUserComputeStream, info.has_user_compute_stream)
+          .AddValueParser(
+              tensorrt::provider_option_names::kUserComputeStream,
+              [&user_compute_stream](const std::string& value_str) -> Status {
+                size_t address;
+                ORT_RETURN_IF_ERROR(ParseStringWithClassicLocale(value_str, address));
+                user_compute_stream = reinterpret_cast<void*>(address);
+                return Status::OK();
+              })
          .AddAssignmentToReference(tensorrt::provider_option_names::kMinSubgraphSize, info.min_subgraph_size)
          .AddAssignmentToReference(tensorrt::provider_option_names::kMaxWorkspaceSize, info.max_workspace_size)
          .AddAssignmentToReference(tensorrt::provider_option_names::kFp16Enable, info.fp16_enable)
@ -107,6 +117,8 @@ TensorrtExecutionProviderInfo TensorrtExecutionProviderInfo::FromProviderOptions
          .AddAssignmentToReference(tensorrt::provider_option_names::kEpContextEmbedMode, info.ep_context_embed_mode)
          .Parse(options));  // add new provider option here.

+  info.user_compute_stream = user_compute_stream;
+  info.has_user_compute_stream = (user_compute_stream != nullptr);
  return info;
 }

@ -115,6 +127,7 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const TensorrtE
      {tensorrt::provider_option_names::kDeviceId, MakeStringWithClassicLocale(info.device_id)},
      {tensorrt::provider_option_names::kMaxPartitionIterations, MakeStringWithClassicLocale(info.max_partition_iterations)},
      {tensorrt::provider_option_names::kHasUserComputeStream, MakeStringWithClassicLocale(info.has_user_compute_stream)},
+      {tensorrt::provider_option_names::kUserComputeStream, MakeStringWithClassicLocale(reinterpret_cast<size_t>(info.user_compute_stream))},
      {tensorrt::provider_option_names::kMinSubgraphSize, MakeStringWithClassicLocale(info.min_subgraph_size)},
      {tensorrt::provider_option_names::kMaxWorkspaceSize, MakeStringWithClassicLocale(info.max_workspace_size)},
      {tensorrt::provider_option_names::kFp16Enable, MakeStringWithClassicLocale(info.fp16_enable)},
@ -171,6 +184,7 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const OrtTensor
  const ProviderOptions options{
      {tensorrt::provider_option_names::kDeviceId, MakeStringWithClassicLocale(info.device_id)},
      {tensorrt::provider_option_names::kHasUserComputeStream, MakeStringWithClassicLocale(info.has_user_compute_stream)},
+      {tensorrt::provider_option_names::kUserComputeStream, MakeStringWithClassicLocale(reinterpret_cast<size_t>(info.user_compute_stream))},
      {tensorrt::provider_option_names::kMaxPartitionIterations, MakeStringWithClassicLocale(info.trt_max_partition_iterations)},
      {tensorrt::provider_option_names::kMinSubgraphSize, MakeStringWithClassicLocale(info.trt_min_subgraph_size)},
      {tensorrt::provider_option_names::kMaxWorkspaceSize, MakeStringWithClassicLocale(info.trt_max_workspace_size)},
@ -253,10 +267,14 @@ void TensorrtExecutionProviderInfo::UpdateProviderOptions(void* provider_options
  trt_provider_options_v2.device_id = internal_options.device_id;

  // The 'has_user_compute_stream' of the OrtTensorRTProviderOptionsV2 instance can be set by C API UpdateTensorRTProviderOptionsWithValue() as well
-  // We only set the 'has_user_compute_stream' of the OrtTensorRTProviderOptionsV2 instance if it is provided in options
+  // We only set the 'has_user_compute_stream' of the OrtTensorRTProviderOptionsV2 instance if it is provided in options or user_compute_stream is provided
  if (options.find("has_user_compute_stream") != options.end()) {
    trt_provider_options_v2.has_user_compute_stream = internal_options.has_user_compute_stream;
  }
+  if (options.find("user_compute_stream") != options.end() && internal_options.user_compute_stream != nullptr) {
+    trt_provider_options_v2.user_compute_stream = internal_options.user_compute_stream;
+    trt_provider_options_v2.has_user_compute_stream = true;
+  }

  trt_provider_options_v2.trt_max_partition_iterations = internal_options.max_partition_iterations;
  trt_provider_options_v2.trt_min_subgraph_size = internal_options.min_subgraph_size;
--- a/onnxruntime/python/onnxruntime_pybind_state.cc
+++ b/onnxruntime/python/onnxruntime_pybind_state.cc
@ -486,6 +486,15 @@ std::unique_ptr<IExecutionProvider> CreateExecutionProviderInstance(
            } else {
              ORT_THROW("[ERROR] [TensorRT] The value for the key 'device_id' should be a number i.e. '0'.\n");
            }
+          } else if (option.first == "user_compute_stream") {
+            if (!option.second.empty()) {
+              auto stream = std::stoull(option.second, nullptr, 0);
+              params.user_compute_stream = reinterpret_cast<void*>(stream);
+              params.has_user_compute_stream = true;
+            } else {
+              params.has_user_compute_stream = false;
+              ORT_THROW("[ERROR] [TensorRT] The value for the key 'user_compute_stream' should be a string to define the compute stream for the inference to run on.\n");
+            }
          } else if (option.first == "trt_max_partition_iterations") {
            if (!option.second.empty()) {
              params.trt_max_partition_iterations = std::stoi(option.second);
--- a/onnxruntime/test/python/onnxruntime_test_python.py
+++ b/onnxruntime/test/python/onnxruntime_test_python.py
@ -312,6 +312,7 @@ class TestInferenceSession(unittest.TestCase):
            option["trt_engine_cache_path"] = engine_cache_path
            force_sequential_engine_build = "true"
            option["trt_force_sequential_engine_build"] = force_sequential_engine_build
+            option["user_compute_stream"] = "1"
            sess.set_providers(["TensorrtExecutionProvider"], [option])

            options = sess.get_provider_options()
@ -326,6 +327,8 @@ class TestInferenceSession(unittest.TestCase):
            self.assertEqual(option["trt_engine_cache_enable"], "1")
            self.assertEqual(option["trt_engine_cache_path"], str(engine_cache_path))
            self.assertEqual(option["trt_force_sequential_engine_build"], "1")
+            self.assertEqual(option["user_compute_stream"], "1")
+            self.assertEqual(option["has_user_compute_stream"], "1")

            from onnxruntime.capi import _pybind_state as C

@ -354,6 +357,19 @@ class TestInferenceSession(unittest.TestCase):
                sess.set_providers(['TensorrtExecutionProvider'], [option])
            """

+            try:
+                import torch
+
+                if torch.cuda.is_available():
+                    s = torch.cuda.Stream()
+                    option["user_compute_stream"] = str(s.cuda_stream)
+                    sess.set_providers(["TensorrtExecutionProvider"], [option])
+                    options = sess.get_provider_options()
+                    self.assertEqual(options["TensorrtExecutionProvider"]["user_compute_stream"], str(s.cuda_stream))
+                    self.assertEqual(options["TensorrtExecutionProvider"]["has_user_compute_stream"], "1")
+            except ImportError:
+                print("torch is not installed, skip testing setting user_compute_stream from torch cuda stream")
+
        if "CUDAExecutionProvider" in onnxrt.get_available_providers():
            cuda_success = 0