Enable GPU multi-device test for CUDA EP and TensorRT EP

Enable multi-device test for GPU * Add build pipeline for TensorRT multi-GPU test * Add code to disable the fp16 tests if the hardware architecture does not support FP16 * Add an option to set the device id in onnx_test_runner for model tests
2026-07-16 18:31:27 +00:00 · 2019-10-14 11:16:34 -07:00 · 2019-10-14 11:16:34 -07:00 · 640f71c91b
commit 640f71c91b
parent f93be8af90
6 changed files with 54 additions and 12 deletions
--- a/onnxruntime/test/onnx/main.cc
+++ b/onnxruntime/test/onnx/main.cc
@ -43,6 +43,7 @@ void usage() {
      "'openvino' or 'nuphar'. "
      "Default: 'cpu'.\n"
      "\t-x: Use parallel executor, default (without -x): sequential executor.\n"
+      "\t-d [device_id]: Specifies the device id for multi-device (e.g. GPU). The value should be >= 0. Default: 0.\n"
      "\t-o [optimization level]: Default is 1. Valid values are 0 (disable), 1 (basic), 2 (extended), 99 (all).\n"
      "\t\tPlease see onnxruntime_c_api.h (enum GraphOptimizationLevel) for the full list of all optimization levels. "
      "\n"
@ -101,13 +102,14 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
  bool enable_mem_pattern = true;
  bool enable_openvino = false;
  bool enable_nnapi = false;
+  int device_id = 0;
  GraphOptimizationLevel graph_optimization_level = ORT_DISABLE_ALL;
  bool user_graph_optimization_level_set = false;

  OrtLoggingLevel logging_level = ORT_LOGGING_LEVEL_WARNING;
  {
    int ch;
-    while ((ch = getopt(argc, argv, ORT_TSTR("Ac:hj:Mn:r:e:xvo:"))) != -1) {
+    while ((ch = getopt(argc, argv, ORT_TSTR("Ac:hj:Mn:r:e:xvo:d:"))) != -1) {
      switch (ch) {
        case 'A':
          enable_cpu_mem_arena = false;
@ -197,6 +199,13 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
          user_graph_optimization_level_set = true;
          break;
        }
+        case 'd':
+          device_id = static_cast<int>(OrtStrtol<PATH_CHAR_TYPE>(optarg, nullptr));
+          if (device_id < 0) {
+            usage();
+            return -1;
+          }
+          break;
        case '?':
        case 'h':
        default:
@ -251,8 +260,8 @@ int real_main(int argc, char* argv[], Ort::Env& env) {

    if (enable_tensorrt) {
 #ifdef USE_TENSORRT
-      ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_Tensorrt(sf, 0));
-      ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_CUDA(sf, 0));
+      ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_Tensorrt(sf, device_id));
+      ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_CUDA(sf, device_id));
 #else
      fprintf(stderr, "TensorRT is not supported in this build");
      return -1;
@ -269,7 +278,7 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
    }
    if (enable_cuda) {
 #ifdef USE_CUDA
-      ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_CUDA(sf, 0));
+      ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_CUDA(sf, device_id));
 #else
      fprintf(stderr, "CUDA is not supported in this build");
      return -1;
--- a/onnxruntime/test/providers/cpu/math/gemm_test.cc
+++ b/onnxruntime/test/providers/cpu/math/gemm_test.cc
@ -3,6 +3,7 @@

 #include "gtest/gtest.h"
 #include "test/providers/provider_test_utils.h"
+#include "test/common/cuda_op_test_utils.h"

 namespace onnxruntime {
 namespace test {
@ -29,6 +30,11 @@ TEST(GemmOpTest, GemmNoTrans) {
 // Only CUDA kernel has float 16 support
 #ifdef USE_CUDA
 TEST(GemmOpTest, GemmNoTrans_f16) {
+  int min_cuda_architecture = 530;
+  if (!HasCudaEnvironment(min_cuda_architecture)) {
+    LOGS_DEFAULT(WARNING) << "Hardware NOT support FP16";
+    return;
+  }
  OpTester test("Gemm");

  test.AddAttribute("transA", (int64_t)0);
--- a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc
@ -4,6 +4,7 @@
 #include "core/providers/cpu/nn/pool.h"
 #include "gtest/gtest.h"
 #include "test/providers/provider_test_utils.h"
+#include "test/common/cuda_op_test_utils.h"
 using namespace std;
 namespace onnxruntime {
 namespace test {
@ -58,6 +59,11 @@ TEST(PoolTest, MaxPool) {
 // Disable for now, still investigating the issue with cudnn lib
 #ifdef USE_CUDA
 TEST(PoolTest, MaxPool_F16) {
+  int min_cuda_architecture = 530;
+  if (!HasCudaEnvironment(min_cuda_architecture)) {
+    LOGS_DEFAULT(WARNING) << "Hardware NOT support FP16";
+    return;
+  }
  OpTester test("MaxPool");

  test.AddAttribute("auto_pad", "");
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@ -158,6 +158,7 @@ Use the individual flags to only run the specified stages.
    parser.add_argument("--enable_language_interop_ops", action='store_true', help="Enable operator implemented in language other than cpp")
    parser.add_argument("--cmake_generator", choices=['Visual Studio 15 2017', 'Visual Studio 16 2019'],
                        default='Visual Studio 15 2017', help="Specify the generator that CMake invokes. This is only supported on Windows")
+    parser.add_argument("--enable_multi_device_test", action='store_true', help="Test with multi-device. Mostly used for multi-device GPU")
    return parser.parse_args()

 def resolve_executable_path(command_or_path):
@ -608,7 +609,7 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs, enab
                if onnxml_test:
                    run_subprocess([sys.executable, 'onnxruntime_test_python_keras.py'], cwd=cwd, dll_path=dll_path)

-def run_onnx_tests(build_dir, configs, onnx_test_data_dir, provider, enable_parallel_executor_test, num_parallel_models):
+def run_onnx_tests(build_dir, configs, onnx_test_data_dir, provider, enable_multi_device_test, enable_parallel_executor_test, num_parallel_models):
    for config in configs:
        cwd = get_config_build_dir(build_dir, config)
        if is_windows():
@ -630,6 +631,9 @@ def run_onnx_tests(build_dir, configs, onnx_test_data_dir, provider, enable_para
        if num_parallel_models > 0:
          cmd += ["-j", str(num_parallel_models)]

+        if enable_multi_device_test:
+          cmd += ['-d', '1']
+
        if config != 'Debug' and os.path.exists(model_dir):
          # some models in opset9 and above are not supported by TensorRT yet
          if provider == 'tensorrt':
@ -975,20 +979,20 @@ def main():
              # Disable some onnx unit tests that TensorRT doesn't supported yet
              if not is_windows():
                onnx_test_data_dir = os.path.join(source_dir, "cmake", "external", "onnx", "onnx", "backend", "test", "data", "simple")
-                run_onnx_tests(build_dir, configs, onnx_test_data_dir, 'tensorrt', False, 1)
+                run_onnx_tests(build_dir, configs, onnx_test_data_dir, 'tensorrt', args.enable_multi_device_test, False, 1)
            elif args.use_cuda:
-              run_onnx_tests(build_dir, configs, onnx_test_data_dir, 'cuda', False, 2)
+              run_onnx_tests(build_dir, configs, onnx_test_data_dir, 'cuda', args.enable_multi_device_test, False, 2)
            elif args.x86 or platform.system() == 'Darwin':
-              run_onnx_tests(build_dir, configs, onnx_test_data_dir, None, False, 1)
+              run_onnx_tests(build_dir, configs, onnx_test_data_dir, None, args.enable_multi_device_test, False, 1)
            elif args.use_ngraph:
-              run_onnx_tests(build_dir, configs, onnx_test_data_dir, 'ngraph', True, 1)
+              run_onnx_tests(build_dir, configs, onnx_test_data_dir, 'ngraph', args.enable_multi_device_test, True, 1)
            elif args.use_openvino:
-              run_onnx_tests(build_dir, configs, onnx_test_data_dir, 'openvino', False, 1)
+              run_onnx_tests(build_dir, configs, onnx_test_data_dir, 'openvino', args.enable_multi_device_test, False, 1)
              # TODO: parallel executor test fails on MacOS
            elif args.use_nuphar:
-              run_onnx_tests(build_dir, configs, onnx_test_data_dir, 'nuphar', False, 1)
+              run_onnx_tests(build_dir, configs, onnx_test_data_dir, 'nuphar', args.enable_multi_device_test, False, 1)
            else:
-              run_onnx_tests(build_dir, configs, onnx_test_data_dir, None, True, 0)
+              run_onnx_tests(build_dir, configs, onnx_test_data_dir, None, args.enable_multi_device_test, True, 0)

              if args.use_mkldnn:
                mkldnn_run_onnx_tests(build_dir, configs, onnx_test_data_dir)
--- a/tools/ci_build/github/azure-pipelines/linux-multi-gpu-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-multi-gpu-ci-pipeline.yml
@ -0,0 +1,8 @@
+jobs:
+- template: templates/linux-ci.yml
+  parameters:
+    AgentPool : 'Linux-Multi-GPU'
+    JobName: 'Linux_CI_Multi_GPU_Dev'
+    BuildCommand: 'tools/ci_build/github/linux/run_dockerbuild.sh -o ubuntu16.04 -d gpu -r $(Build.BinariesDirectory) -x "--enable_multi_device_test"'
+    DoNugetPack:  'false'
+    ArtifactName: 'drop-linux'
--- a/tools/ci_build/github/azure-pipelines/linux-multi-gpu-tensorrt-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-multi-gpu-tensorrt-ci-pipeline.yml
@ -0,0 +1,9 @@
+jobs:
+- template: templates/linux-ci.yml
+  parameters:
+    AgentPool : 'Linux-Multi-GPU'
+    JobName: 'Linux_CI_Multi_GPU_TensorRT_Dev'
+    # The latest TensorRT container (R19.09) only supports ubuntu18.04
+    BuildCommand: 'tools/ci_build/github/linux/run_dockerbuild.sh -o ubuntu18.04 -d tensorrt -r $(Build.BinariesDirectory) -p 3.6 -x "--enable_multi_device_test"'
+    DoNugetPack:  'false'
+    ArtifactName: 'drop-linux'