From 8c5db7f9734fa0ce49f0ddf6ebead3653bb6b28f Mon Sep 17 00:00:00 2001 From: Yufeng Li Date: Mon, 14 Oct 2019 16:03:04 -0700 Subject: [PATCH] use legacy stream mode (#2076) In ORT, there are only 3 CUDA streams: default, HtoD, and DtoH, and both HtoD and DtoH are non-blocking streams. Thus, per-thread stream mode doesn't provide any benefit. I also tried it in a multi-threaded environment, and the legacy mode is also better than the per-thread mode there. Below is the perf of a 3-layer BERT on V100. Units are ms: batch size 1: concurrency | c=1 | c=2 | c=4 legacy | 0.54 | 1.17 | 2.68 per-thread | 0.66 | 1.37 | 2.86 batch size 4: concurrency | c=1 | c=2 | c=4 legacy | 1.1 | 2.22 | 4.6 per-thread | 1.21 | 2.44 | 4.98 batch size 64: concurrency | c=1 | c=2 | c=4 legacy | 8.09 | 16.13 | 32.37 per-thread | 8.18 | 16.26 | 32.45 --- cmake/CMakeLists.txt | 2 +- onnxruntime/core/providers/cuda/cuda_pch.h | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 24269ddb00..b1fab715cc 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -612,7 +612,7 @@ if (onnxruntime_USE_CUDA) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_50,code=sm_50") # M series set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_60,code=sm_60") # P series set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_70,code=sm_70") # V series - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --default-stream per-thread") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --default-stream legacy") if (NOT WIN32) set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --compiler-options -fPIC") endif() diff --git a/onnxruntime/core/providers/cuda/cuda_pch.h b/onnxruntime/core/providers/cuda/cuda_pch.h index 6fd31e0c08..b32e0e59b7 100644 --- a/onnxruntime/core/providers/cuda/cuda_pch.h +++ b/onnxruntime/core/providers/cuda/cuda_pch.h @@ -2,9 +2,6 @@ // Licensed under the MIT License. 
#pragma once -#ifndef CUDA_API_PER_THREAD_DEFAULT_STREAM -#define CUDA_API_PER_THREAD_DEFAULT_STREAM -#endif #include #include #include