use legacy stream mode (#2076)

In ORT, there is only 3 cuda stream: default, HtoD, DtoH. And both HtoD and DtoH are non-blocking stream. Thus, per-thread stream mode doesn't have any benefit.
I also tried in multiple thread env and the legacy mode is also better than per-thread model.
Below is the perf of a 3 layer bert on v100. Unit is ms:
batch size 1:
 concurrency | c=1 | c=2 | c=4
legacy | 0.54 | 1.17 | 2.68
per-thread | 0.66 | 1.37 | 2.86
 
batch size 4:  
 concurrency | c=1 | c=2 | c=4
legacy | 1.1 | 2.22 | 4.6
per-thread | 1.21 | 2.44 | 4.98

batch size 64:
concurrency  | c=1 | c=2 | c=4
legacy | 8.09 | 16.13 | 32.37
per-thread | 8.18 | 16.26 | 32.45
This commit is contained in:
Yufeng Li 2019-10-14 16:03:04 -07:00 committed by GitHub
parent 80d09f0c59
commit 8c5db7f973
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 1 additions and 4 deletions

View file

@ -612,7 +612,7 @@ if (onnxruntime_USE_CUDA)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_50,code=sm_50") # M series
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_60,code=sm_60") # P series
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_70,code=sm_70") # V series
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --default-stream per-thread")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --default-stream legacy")
if (NOT WIN32)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --compiler-options -fPIC")
endif()

View file

@ -2,9 +2,6 @@
// Licensed under the MIT License.
#pragma once
#ifndef CUDA_API_PER_THREAD_DEFAULT_STREAM
#define CUDA_API_PER_THREAD_DEFAULT_STREAM
#endif
#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>