link mpi when either use_mpi or use_nccl enabled (#14467)

### Only link MPI when either use_mpi or use_nccl is enabled

This fixes the issue https://github.com/microsoft/onnxruntime/issues/14278.

After talking with @askhade, we agreed that if users enable NCCL/MPI but
MPI is not found, the build should fail instead of only emitting a warning.
So this PR makes that change. As a result, to make the CIs pass, we would
need to disable NCCL/MPI explicitly in the build command. This PR takes an
alternative approach instead: since NCCL and MPI are not used by
customers, NCCL is disabled by default if "--disable_nccl" is not specified,
and MPI is disabled by default if "--use_mpi" is not specified.

### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->
This commit is contained in:
pengwa 2023-02-03 20:11:50 +08:00 committed by GitHub
parent c6c11039d7
commit 7eca42484c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 22 additions and 15 deletions

View file

@ -1347,19 +1347,22 @@ if (onnxruntime_ENABLE_TRAINING)
find_package(MPI)
if (MPI_CXX_FOUND)
message( STATUS "MPI Version: ${MPI_CXX_VERSION}")
message( STATUS "MPI (include: ${MPI_CXX_INCLUDE_DIRS}, library: ${MPI_CXX_LIBRARIES})" )
mark_as_advanced(MPI_CXX_INCLUDE_DIRS MPI_CXX_LIBRARIES)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${MPI_CXX_LIBRARIES} ${MPI_CXX_LINK_FLAGS})
else ()
set(onnxruntime_USE_NCCL OFF)
set(onnxruntime_USE_MPI OFF)
message( WARNING "MPI is not found. Please define onnxruntime_MPI_HOME to specify the path of MPI. Otherwise, NCCL will be disabled." )
if (onnxruntime_USE_MPI OR onnxruntime_USE_NCCL)
if (MPI_CXX_FOUND)
message( STATUS "MPI Version: ${MPI_CXX_VERSION}")
message( STATUS "MPI (include: ${MPI_CXX_INCLUDE_DIRS}, library: ${MPI_CXX_LIBRARIES})" )
mark_as_advanced(MPI_CXX_INCLUDE_DIRS MPI_CXX_LIBRARIES)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${MPI_CXX_LIBRARIES} ${MPI_CXX_LINK_FLAGS})
else ()
message(
FATAL_ERROR
"MPI is not found. Please define onnxruntime_MPI_HOME to specify the path of MPI. Otherwise, NCCL will be disabled."
)
endif()
endif()
# Find NCCL and MPI
if (onnxruntime_USE_NCCL AND MPI_CXX_FOUND)
if (onnxruntime_USE_NCCL)
if (onnxruntime_USE_CUDA)
set(NCCL_LIBNAME "nccl")
elseif (onnxruntime_USE_ROCM)
@ -1417,13 +1420,15 @@ if (onnxruntime_ENABLE_TRAINING)
add_definitions(-DORT_USE_NCCL=1)
message( STATUS "NCCL is enabled in Linux GPU Build." )
else ()
set(onnxruntime_USE_NCCL OFF)
message( WARNING "NCCL is not found. Please use --nccl_home to specify the path of NCCL. Otherwise, NCCL is disabled." )
message(
FATAL_ERROR
"NCCL is not found. Please use --nccl_home to specify the path of NCCL. Otherwise, NCCL is disabled."
)
endif()
endif()
endif()
if (onnxruntime_USE_MPI AND MPI_CXX_FOUND)
if (onnxruntime_USE_MPI)
add_definitions(-DUSE_MPI=1)
endif()

View file

@ -192,10 +192,12 @@ def parse_arguments():
parser.add_argument("--enable_training_apis", action="store_true", help="Enable ort training apis.")
parser.add_argument("--enable_training_ops", action="store_true", help="Enable training ops in inference graph.")
parser.add_argument("--disable_nccl", action="store_true", help="Disable Nccl.")
parser.add_argument("--disable_nccl", action="store_false", help="Disable NCCL, by default NCCL is disabled.")
parser.add_argument("--mpi_home", help="Path to MPI installation dir")
parser.add_argument("--nccl_home", help="Path to NCCL installation dir")
parser.add_argument("--use_mpi", nargs="?", default=True, const=True, type=_str_to_bool)
parser.add_argument(
"--use_mpi", nargs="?", default=False, const=True, type=_str_to_bool, help="Disabled by default."
)
# enable ONNX tests
parser.add_argument(