mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-07-01 03:45:06 +00:00
use legacy stream mode (#2076)
In ORT, there are only 3 CUDA streams: default, HtoD, and DtoH, and both HtoD and DtoH are non-blocking streams. Thus, per-thread stream mode doesn't provide any benefit. I also tried a multi-threaded environment, and legacy mode is also better than per-thread mode there. Below is the perf of a 3-layer BERT on a V100. Unit is ms:

batch size 1:

| concurrency | c=1 | c=2 | c=4 |
|---|---|---|---|
| legacy | 0.54 | 1.17 | 2.68 |
| per-thread | 0.66 | 1.37 | 2.86 |

batch size 4:

| concurrency | c=1 | c=2 | c=4 |
|---|---|---|---|
| legacy | 1.1 | 2.22 | 4.6 |
| per-thread | 1.21 | 2.44 | 4.98 |

batch size 64:

| concurrency | c=1 | c=2 | c=4 |
|---|---|---|---|
| legacy | 8.09 | 16.13 | 32.37 |
| per-thread | 8.18 | 16.26 | 32.45 |
This commit is contained in:
parent
80d09f0c59
commit
8c5db7f973
2 changed files with 1 addition and 4 deletions
|
|
@@ -612,7 +612,7 @@ if (onnxruntime_USE_CUDA)
|
|||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_50,code=sm_50") # M series
|
||||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_60,code=sm_60") # P series
|
||||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_70,code=sm_70") # V series
|
||||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --default-stream per-thread")
|
||||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --default-stream legacy")
|
||||
if (NOT WIN32)
|
||||
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --compiler-options -fPIC")
|
||||
endif()
|
||||
|
|
|
|||
|
|
@@ -2,9 +2,6 @@
|
|||
// Licensed under the MIT License.
|
||||
|
||||
#pragma once
|
||||
#ifndef CUDA_API_PER_THREAD_DEFAULT_STREAM
|
||||
#define CUDA_API_PER_THREAD_DEFAULT_STREAM
|
||||
#endif
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <cublas_v2.h>
|
||||
|
|
|
|||
Loading…
Reference in a new issue