diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index bfb20b428d..cc18e960f7 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -43,6 +43,7 @@ void usage() { "'openvino' or 'nuphar'. " "Default: 'cpu'.\n" "\t-x: Use parallel executor, default (without -x): sequential executor.\n" + "\t-d [device_id]: Specifies the device id for multi-device (e.g. GPU). The value should > 0\n" "\t-o [optimization level]: Default is 1. Valid values are 0 (disable), 1 (basic), 2 (extended), 99 (all).\n" "\t\tPlease see onnxruntime_c_api.h (enum GraphOptimizationLevel) for the full list of all optimization levels. " "\n" @@ -101,13 +102,14 @@ int real_main(int argc, char* argv[], Ort::Env& env) { bool enable_mem_pattern = true; bool enable_openvino = false; bool enable_nnapi = false; + int device_id = 0; GraphOptimizationLevel graph_optimization_level = ORT_DISABLE_ALL; bool user_graph_optimization_level_set = false; OrtLoggingLevel logging_level = ORT_LOGGING_LEVEL_WARNING; { int ch; - while ((ch = getopt(argc, argv, ORT_TSTR("Ac:hj:Mn:r:e:xvo:"))) != -1) { + while ((ch = getopt(argc, argv, ORT_TSTR("Ac:hj:Mn:r:e:xvo:d:"))) != -1) { switch (ch) { case 'A': enable_cpu_mem_arena = false; @@ -197,6 +199,13 @@ int real_main(int argc, char* argv[], Ort::Env& env) { user_graph_optimization_level_set = true; break; } + case 'd': + device_id = static_cast<int>(OrtStrtol<PATH_CHAR_TYPE>(optarg, nullptr)); + if (device_id < 0) { + usage(); + return -1; + } + break; case '?': case 'h': default: @@ -251,8 +260,8 @@ int real_main(int argc, char* argv[], Ort::Env& env) { if (enable_tensorrt) { #ifdef USE_TENSORRT - ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_Tensorrt(sf, 0)); - ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_CUDA(sf, 0)); + ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_Tensorrt(sf, device_id)); + ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_CUDA(sf, device_id)); #else fprintf(stderr, 
"TensorRT is not supported in this build"); return -1; @@ -269,7 +278,7 @@ int real_main(int argc, char* argv[], Ort::Env& env) { } if (enable_cuda) { #ifdef USE_CUDA - ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_CUDA(sf, 0)); + ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_CUDA(sf, device_id)); #else fprintf(stderr, "CUDA is not supported in this build"); return -1; diff --git a/onnxruntime/test/providers/cpu/math/gemm_test.cc b/onnxruntime/test/providers/cpu/math/gemm_test.cc index 3e3879ee3b..688fd7af46 100644 --- a/onnxruntime/test/providers/cpu/math/gemm_test.cc +++ b/onnxruntime/test/providers/cpu/math/gemm_test.cc @@ -3,6 +3,7 @@ #include "gtest/gtest.h" #include "test/providers/provider_test_utils.h" +#include "test/common/cuda_op_test_utils.h" namespace onnxruntime { namespace test { @@ -29,6 +30,11 @@ TEST(GemmOpTest, GemmNoTrans) { // Only CUDA kernel has float 16 support #ifdef USE_CUDA TEST(GemmOpTest, GemmNoTrans_f16) { + int min_cuda_architecture = 530; + if (!HasCudaEnvironment(min_cuda_architecture)) { + LOGS_DEFAULT(WARNING) << "Hardware NOT support FP16"; + return; + } OpTester test("Gemm"); test.AddAttribute("transA", (int64_t)0); diff --git a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc index 5e695ac7fe..400feebe16 100644 --- a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc @@ -4,6 +4,7 @@ #include "core/providers/cpu/nn/pool.h" #include "gtest/gtest.h" #include "test/providers/provider_test_utils.h" +#include "test/common/cuda_op_test_utils.h" using namespace std; namespace onnxruntime { namespace test { @@ -58,6 +59,11 @@ TEST(PoolTest, MaxPool) { // Disable for now, still investigating the issue with cudnn lib #ifdef USE_CUDA TEST(PoolTest, MaxPool_F16) { + int min_cuda_architecture = 530; + if (!HasCudaEnvironment(min_cuda_architecture)) { + LOGS_DEFAULT(WARNING) << "Hardware NOT support FP16"; + 
return; + } OpTester test("MaxPool"); test.AddAttribute("auto_pad", ""); diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index b81ef1efd5..5050315873 100755 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -158,6 +158,7 @@ Use the individual flags to only run the specified stages. parser.add_argument("--enable_language_interop_ops", action='store_true', help="Enable operator implemented in language other than cpp") parser.add_argument("--cmake_generator", choices=['Visual Studio 15 2017', 'Visual Studio 16 2019'], default='Visual Studio 15 2017', help="Specify the generator that CMake invokes. This is only supported on Windows") + parser.add_argument("--enable_multi_device_test", action='store_true', help="Test with multi-device. Mostly used for multi-device GPU") return parser.parse_args() def resolve_executable_path(command_or_path): @@ -608,7 +609,7 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs, enab if onnxml_test: run_subprocess([sys.executable, 'onnxruntime_test_python_keras.py'], cwd=cwd, dll_path=dll_path) -def run_onnx_tests(build_dir, configs, onnx_test_data_dir, provider, enable_parallel_executor_test, num_parallel_models): +def run_onnx_tests(build_dir, configs, onnx_test_data_dir, provider, enable_multi_device_test, enable_parallel_executor_test, num_parallel_models): for config in configs: cwd = get_config_build_dir(build_dir, config) if is_windows(): @@ -630,6 +631,9 @@ def run_onnx_tests(build_dir, configs, onnx_test_data_dir, provider, enable_para if num_parallel_models > 0: cmd += ["-j", str(num_parallel_models)] + if enable_multi_device_test: + cmd += ['-d', '1'] + if config != 'Debug' and os.path.exists(model_dir): # some models in opset9 and above are not supported by TensorRT yet if provider == 'tensorrt': @@ -975,20 +979,20 @@ def main(): # Disable some onnx unit tests that TensorRT doesn't supported yet if not is_windows(): onnx_test_data_dir = os.path.join(source_dir, "cmake", 
"external", "onnx", "onnx", "backend", "test", "data", "simple") - run_onnx_tests(build_dir, configs, onnx_test_data_dir, 'tensorrt', False, 1) + run_onnx_tests(build_dir, configs, onnx_test_data_dir, 'tensorrt', args.enable_multi_device_test, False, 1) elif args.use_cuda: - run_onnx_tests(build_dir, configs, onnx_test_data_dir, 'cuda', False, 2) + run_onnx_tests(build_dir, configs, onnx_test_data_dir, 'cuda', args.enable_multi_device_test, False, 2) elif args.x86 or platform.system() == 'Darwin': - run_onnx_tests(build_dir, configs, onnx_test_data_dir, None, False, 1) + run_onnx_tests(build_dir, configs, onnx_test_data_dir, None, args.enable_multi_device_test, False, 1) elif args.use_ngraph: - run_onnx_tests(build_dir, configs, onnx_test_data_dir, 'ngraph', True, 1) + run_onnx_tests(build_dir, configs, onnx_test_data_dir, 'ngraph', args.enable_multi_device_test, True, 1) elif args.use_openvino: - run_onnx_tests(build_dir, configs, onnx_test_data_dir, 'openvino', False, 1) + run_onnx_tests(build_dir, configs, onnx_test_data_dir, 'openvino', args.enable_multi_device_test, False, 1) # TODO: parallel executor test fails on MacOS elif args.use_nuphar: - run_onnx_tests(build_dir, configs, onnx_test_data_dir, 'nuphar', False, 1) + run_onnx_tests(build_dir, configs, onnx_test_data_dir, 'nuphar', args.enable_multi_device_test, False, 1) else: - run_onnx_tests(build_dir, configs, onnx_test_data_dir, None, True, 0) + run_onnx_tests(build_dir, configs, onnx_test_data_dir, None, args.enable_multi_device_test, True, 0) if args.use_mkldnn: mkldnn_run_onnx_tests(build_dir, configs, onnx_test_data_dir) diff --git a/tools/ci_build/github/azure-pipelines/linux-multi-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-multi-gpu-ci-pipeline.yml new file mode 100644 index 0000000000..96d6ca0159 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/linux-multi-gpu-ci-pipeline.yml @@ -0,0 +1,8 @@ +jobs: +- template: templates/linux-ci.yml + parameters: + AgentPool : 
'Linux-Multi-GPU' + JobName: 'Linux_CI_Multi_GPU_Dev' + BuildCommand: 'tools/ci_build/github/linux/run_dockerbuild.sh -o ubuntu16.04 -d gpu -r $(Build.BinariesDirectory) -x "--enable_multi_device_test"' + DoNugetPack: 'false' + ArtifactName: 'drop-linux' diff --git a/tools/ci_build/github/azure-pipelines/linux-multi-gpu-tensorrt-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-multi-gpu-tensorrt-ci-pipeline.yml new file mode 100644 index 0000000000..d809ece70e --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/linux-multi-gpu-tensorrt-ci-pipeline.yml @@ -0,0 +1,9 @@ +jobs: +- template: templates/linux-ci.yml + parameters: + AgentPool : 'Linux-Multi-GPU' + JobName: 'Linux_CI_Multi_GPU_TensorRT_Dev' + # The latest TensorRT container (R19.09) only supports ubuntu18.04 + BuildCommand: 'tools/ci_build/github/linux/run_dockerbuild.sh -o ubuntu18.04 -d tensorrt -r $(Build.BinariesDirectory) -p 3.6 -x "--enable_multi_device_test"' + DoNugetPack: 'false' + ArtifactName: 'drop-linux' \ No newline at end of file