use legacy stream mode (#2076)

In ORT, there are only 3 CUDA streams: default, HtoD, and DtoH. Both HtoD and DtoH are non-blocking streams, so per-thread stream mode doesn't provide any benefit. I also tried a multi-threaded environment, and legacy mode is still better than per-thread mode there. Below is the perf of a 3-layer BERT on V100. Unit is ms; c is the concurrency level.

batch size 1:
concurrency | c=1 | c=2 | c=4
legacy | 0.54 | 1.17 | 2.68
per-thread | 0.66 | 1.37 | 2.86

batch size 4:
concurrency | c=1 | c=2 | c=4
legacy | 1.1 | 2.22 | 4.6
per-thread | 1.21 | 2.44 | 4.98

batch size 64:
concurrency | c=1 | c=2 | c=4
legacy | 8.09 | 16.13 | 32.37
per-thread | 8.18 | 16.26 | 32.45
2026-07-23 19:32:23 +00:00 · 2019-10-14 16:03:04 -07:00 · 2019-10-14 16:03:04 -07:00 · 8c5db7f973
commit 8c5db7f973
parent 80d09f0c59
2 changed files with 1 additions and 4 deletions
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@ -612,7 +612,7 @@ if (onnxruntime_USE_CUDA)
  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_50,code=sm_50") # M series
  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_60,code=sm_60") # P series
  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_70,code=sm_70") # V series
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --default-stream per-thread")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --default-stream legacy")
  if (NOT WIN32)
    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --compiler-options -fPIC")
  endif()
--- a/onnxruntime/core/providers/cuda/cuda_pch.h
+++ b/onnxruntime/core/providers/cuda/cuda_pch.h
@ -2,9 +2,6 @@
 // Licensed under the MIT License.

 #pragma once
-#ifndef CUDA_API_PER_THREAD_DEFAULT_STREAM
-#define CUDA_API_PER_THREAD_DEFAULT_STREAM
-#endif
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cublas_v2.h>