From b4b1c6440a5f609b04a9c424edb89cb45dd533ca Mon Sep 17 00:00:00 2001 From: Weixing Zhang Date: Mon, 15 Jun 2020 08:47:03 -0700 Subject: [PATCH] Enable ORT with CUDA 11 toolkit (#4168) * ORT on CUDA 11 1. Separate HOROVOD and MPI 2. Separate NCCL from HOROVOD in CMakeLists.txt 3. Remove dependency on external cub 4. cudnnSetRNNDescriptor is changed in cuDNN 8.0 * polish the code about MPI/NCCL in CMakeLists.txt and build.py * check CUDA version * ${MPI_INCLUDE_DIRS} should be PUBLIC * sm30, sm50 are deprecated in CUDA 11 Toolkit * update change based on code review feedback. * add sm_52 * improve MPI/NCCL build path Co-authored-by: Weixing Zhang --- cmake/CMakeLists.txt | 123 +++++++++++++++--- cmake/onnxruntime_graph.cmake | 1 + cmake/onnxruntime_providers.cmake | 15 ++- cmake/onnxruntime_training.cmake | 9 +- .../core/providers/cuda/rnn/cudnn_rnn_base.h | 2 +- .../orttraining/core/framework/mpi_setup.cc | 16 ++- .../orttraining/core/framework/mpi_setup.h | 12 +- orttraining/orttraining/models/bert/main.cc | 9 +- orttraining/orttraining/models/gpt2/main.cc | 8 +- .../python/orttraining_pybind_state.cc | 8 +- .../graph/optimizer_graph_builder_test.cc | 2 +- tools/ci_build/build.py | 26 ++-- 12 files changed, 178 insertions(+), 53 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index f56709c525..41acc53fa0 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -108,6 +108,7 @@ option(onnxruntime_ENABLE_NVTX_PROFILE "Enable NVTX profile." OFF) option(onnxruntime_ENABLE_TRAINING "Enable training functionality." OFF) option(onnxruntime_ENABLE_TRAINING_E2E_TESTS "Enable training end-to-end tests." 
OFF) option(onnxruntime_USE_HOROVOD "Build with HOROVOD support" OFF) +option(onnxruntime_USE_NCCL "Build with NCCL support" ON) if (onnxruntime_ENABLE_NVTX_PROFILE) add_definitions(-DENABLE_NVTX_PROFILE=1) @@ -824,7 +825,13 @@ if (onnxruntime_USE_CUDA) string(APPEND CMAKE_CUDA_FLAGS "-cudart shared") endif() enable_language(CUDA) - set(CMAKE_CUDA_STANDARD 11) + string(REGEX REPLACE "([0-9]+)\\.([0-9]+).*" "\\1" CUDA_VERSION_MAJOR "${CMAKE_CUDA_COMPILER_VERSION}") + message( STATUS "CUDA_VERSION_MAJOR: ${CUDA_VERSION_MAJOR}") + if (CUDA_VERSION_MAJOR EQUAL 11) + set(CMAKE_CUDA_STANDARD 14) + else() + set(CMAKE_CUDA_STANDARD 11) + endif() file(TO_CMAKE_PATH ${onnxruntime_CUDNN_HOME} onnxruntime_CUDNN_HOME) set(ONNXRUNTIME_CUDA_LIBRARIES ${CUDA_LIBRARIES}) @@ -850,8 +857,12 @@ if (onnxruntime_USE_CUDA) endif() list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${ONNXRUNTIME_CUDA_LIBRARIES}) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_30,code=sm_30") # K series - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_50,code=sm_50") # M series + # the following compute capabilities are deprecated in CUDA 11 Toolkit + if (CUDA_VERSION_MAJOR LESS 11) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_30,code=sm_30") # K series + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_50,code=sm_50") # M series + endif() + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_52,code=sm_52") # M60 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_60,code=sm_60") # P series set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_70,code=sm_70") # V series set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --default-stream legacy") @@ -918,28 +929,108 @@ endif() if (onnxruntime_ENABLE_TRAINING) add_compile_definitions(ENABLE_TRAINING) - if (onnxruntime_USE_HOROVOD) + + if (UNIX) + # Find MPI + find_path(MPI_INCLUDE_DIR + NAMES mpi.h + HINTS + ${onnxruntime_MPI_HOME}/include + 
/bert_ort/openmpi/include) + + set(MPI_LIBNAME "mpi") + + find_library(MPI_LIBRARY + NAMES ${MPI_LIBNAME} + HINTS + ${onnxruntime_MPI_HOME}/lib + /bert_ort/openmpi/lib) + + include(FindPackageHandleStandardArgs) + find_package_handle_standard_args(MPI DEFAULT_MSG MPI_INCLUDE_DIR MPI_LIBRARY) + + if (MPI_FOUND) + execute_process(COMMAND mpirun --version OUTPUT_VARIABLE MPIRUN_OUTPUT) + string( REGEX MATCH "[0-9]+.[0-9]+.[0-9]" MPI_VERSION ${MPIRUN_OUTPUT}) + message( STATUS "MPI Version: ${MPI_VERSION}") + + set(MPI_INCLUDE_DIRS ${MPI_INCLUDE_DIR}) + set(MPI_LIBRARIES ${MPI_LIBRARY}) + message( STATUS "MPI (include: ${MPI_INCLUDE_DIRS}, library: ${MPI_LIBRARIES})" ) + mark_as_advanced(MPI_INCLUDE_DIRS MPI_LIBRARIES) + + list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${MPI_LIBRARIES} ${MPI_CXX_LINK_FLAGS}) + else () + set(onnxruntime_USE_NCCL OFF) + set(onnxruntime_USE_HOROVOD OFF) + message( WARNING "MPI is not found. Please use --mpi_home to specify the path of MPI. Otherwise, NCCL or HOROVOD will be disabled." 
) + endif() + + # Find NCCL and MPI + if (onnxruntime_USE_NCCL AND MPI_FOUND) + find_path(NCCL_INCLUDE_DIR + NAMES nccl.h + HINTS + ${onnxruntime_NCCL_HOME}/include + $ENV{CUDA_ROOT}/include) + + set(NCCL_LIBNAME "nccl") + + find_library(NCCL_LIBRARY + NAMES ${NCCL_LIBNAME} + HINTS + ${onnxruntime_NCCL_HOME}/lib/x86_64-linux-gnu + $ENV{CUDA_ROOT}/lib64) + + include(FindPackageHandleStandardArgs) + find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIR NCCL_LIBRARY) + + if (NCCL_FOUND) + set(NCCL_HEADER_FILE "${NCCL_INCLUDE_DIR}/nccl.h") + message( STATUS "Determining NCCL version from the header file: ${NCCL_HEADER_FILE}" ) + file (STRINGS ${NCCL_HEADER_FILE} NCCL_MAJOR_VERSION_DEFINED + REGEX "^[ \t]*#define[ \t]+NCCL_MAJOR[ \t]+[0-9]+.*$" LIMIT_COUNT 1) + if (NCCL_MAJOR_VERSION_DEFINED) + string (REGEX REPLACE "^[ \t]*#define[ \t]+NCCL_MAJOR[ \t]+" "" + NCCL_MAJOR_VERSION ${NCCL_MAJOR_VERSION_DEFINED}) + message( STATUS "NCCL_MAJOR_VERSION: ${NCCL_MAJOR_VERSION}" ) + endif() + file (STRINGS ${NCCL_HEADER_FILE} NCCL_MINOR_VERSION_DEFINED + REGEX "^[ \t]*#define[ \t]+NCCL_MINOR[ \t]+[0-9]+.*$" LIMIT_COUNT 1) + if (NCCL_MINOR_VERSION_DEFINED) + string (REGEX REPLACE "^[ \t]*#define[ \t]+NCCL_MINOR[ \t]+" "" + NCCL_MINOR_VERSION ${NCCL_MINOR_VERSION_DEFINED}) + message(STATUS "NCCL_MINOR_VERSION: ${NCCL_MINOR_VERSION}") + endif() + + set(NCCL_INCLUDE_DIRS ${NCCL_INCLUDE_DIR}) + set(NCCL_LIBRARIES ${NCCL_LIBRARY}) + message( STATUS "NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})" ) + mark_as_advanced(NCCL_INCLUDE_DIRS NCCL_LIBRARIES) + + list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${NCCL_LIBRARIES}) + + add_definitions(-DUSE_NCCL=1) + message( STATUS "NCCL is enabled in Linux GPU Build." ) + else () + set(onnxruntime_USE_NCCL OFF) + message( WARNING "NCCL is not found. Please use --nccl_home to specify the path of NCCL. Otherwise, NCCL is disabled." 
) + endif() + endif() + endif() + + if (onnxruntime_USE_HOROVOD AND MPI_FOUND) if (WIN32) message( FATAL_ERROR "Horovod is not supported on Windows." ) elseif (UNIX) - find_package(MPI REQUIRED) add_definitions(-DUSE_HOROVOD=1) set(HOROVOD_ROOT ${PROJECT_SOURCE_DIR}/external/horovod) - set(HOROVOD_INCLUDE_DIRS "${HOROVOD_ROOT}/horovod/common" ${MPI_CXX_INCLUDE_PATH}) + set(HOROVOD_INCLUDE_DIRS "${HOROVOD_ROOT}/horovod/common") add_subdirectory(horovod EXCLUDE_FROM_ALL) # use external/horovod/third_party/gloo/cmake/Modules/Findnccl.cmake to locate nccl lib path list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/external/horovod/third_party/gloo/cmake/Modules/) find_package(nccl REQUIRED) - list(APPEND onnxruntime_EXTERNAL_LIBRARIES horovod ${NCCL_LIBRARIES} ${MPI_CXX_LIBRARIES} ${MPI_CXX_LINK_FLAGS}) - endif() - endif() - - if (onnxruntime_USE_CUDA) - if (WIN32) - message(WARNING "NCCL is not supported on Windows GPU Build." ) - elseif (UNIX) - message( "NCCL is enabled in Linux GPU Build." 
) - add_definitions(-DUSE_NCCL=1) + list(APPEND onnxruntime_EXTERNAL_LIBRARIES horovod) endif() endif() diff --git a/cmake/onnxruntime_graph.cmake b/cmake/onnxruntime_graph.cmake index d870c1c70e..fc4b3e6d5b 100644 --- a/cmake/onnxruntime_graph.cmake +++ b/cmake/onnxruntime_graph.cmake @@ -61,6 +61,7 @@ target_include_directories(onnxruntime_graph PRIVATE ${ONNXRUNTIME_ROOT}) if (onnxruntime_ENABLE_TRAINING) target_include_directories(onnxruntime_graph PRIVATE ${ORTTRAINING_ROOT}) + if (onnxruntime_USE_HOROVOD) target_include_directories(onnxruntime_graph PRIVATE ${HOROVOD_INCLUDE_DIRS}) endif() diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake index 780023362d..574f393e69 100644 --- a/cmake/onnxruntime_providers.cmake +++ b/cmake/onnxruntime_providers.cmake @@ -205,6 +205,12 @@ if (onnxruntime_USE_CUDA) "${ORTTRAINING_SOURCE_DIR}/training_ops/cuda/collective/nccl_kernels.cc" "${ORTTRAINING_SOURCE_DIR}/training_ops/cuda/collective/megatron.cc" ) + elseif (NOT onnxruntime_USE_NCCL) + list(REMOVE_ITEM onnxruntime_cuda_training_ops_cc_srcs + "${ORTTRAINING_SOURCE_DIR}/training_ops/cuda/collective/nccl_common.cc" + "${ORTTRAINING_SOURCE_DIR}/training_ops/cuda/collective/nccl_kernels.cc" + "${ORTTRAINING_SOURCE_DIR}/training_ops/cuda/collective/megatron.cc" + ) endif() source_group(TREE ${ORTTRAINING_ROOT} FILES ${onnxruntime_cuda_training_ops_cc_srcs} ${onnxruntime_cuda_training_ops_cu_srcs}) @@ -227,13 +233,18 @@ if (onnxruntime_USE_CUDA) target_link_libraries(onnxruntime_providers_cuda PRIVATE onnxruntime_training) endif() add_dependencies(onnxruntime_providers_cuda ${onnxruntime_EXTERNAL_DEPENDENCIES} ${onnxruntime_tvm_dependencies}) - target_include_directories(onnxruntime_providers_cuda PRIVATE ${ONNXRUNTIME_ROOT} ${PROJECT_SOURCE_DIR}/external/cub ${onnxruntime_CUDNN_HOME}/include ${eigen_INCLUDE_DIRS} ${TVM_INCLUDES} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + target_include_directories(onnxruntime_providers_cuda PRIVATE 
${ONNXRUNTIME_ROOT} ${onnxruntime_CUDNN_HOME}/include ${eigen_INCLUDE_DIRS} ${TVM_INCLUDES} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/cuda DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers) set_target_properties(onnxruntime_providers_cuda PROPERTIES LINKER_LANGUAGE CUDA) set_target_properties(onnxruntime_providers_cuda PROPERTIES FOLDER "ONNXRuntime") + if (CUDA_VERSION_MAJOR LESS 11) + target_include_directories(onnxruntime_providers_cuda PRIVATE ${PROJECT_SOURCE_DIR}/external/cub) + endif() + if (onnxruntime_ENABLE_TRAINING) - target_include_directories(onnxruntime_providers_cuda PRIVATE ${ORTTRAINING_ROOT}) + target_include_directories(onnxruntime_providers_cuda PRIVATE ${ORTTRAINING_ROOT} ${MPI_INCLUDE_DIRS}) + if (onnxruntime_USE_HOROVOD) target_include_directories(onnxruntime_providers_cuda PRIVATE ${HOROVOD_INCLUDE_DIRS}) endif() diff --git a/cmake/onnxruntime_training.cmake b/cmake/onnxruntime_training.cmake index 9047ad4b04..7e6a6c03bf 100644 --- a/cmake/onnxruntime_training.cmake +++ b/cmake/onnxruntime_training.cmake @@ -22,7 +22,7 @@ if(WIN32) target_compile_options(onnxruntime_training PRIVATE /wd4100) endif() -target_include_directories(onnxruntime_training PRIVATE ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${eigen_INCLUDE_DIRS} ${RE2_INCLUDE_DIR} PUBLIC ${onnxruntime_graph_header}) +target_include_directories(onnxruntime_training PRIVATE ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${eigen_INCLUDE_DIRS} ${RE2_INCLUDE_DIR} PUBLIC ${onnxruntime_graph_header} ${MPI_INCLUDE_DIRS}) if (onnxruntime_USE_CUDA) target_include_directories(onnxruntime_training PRIVATE ${onnxruntime_CUDNN_HOME}/include ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) @@ -135,7 +135,7 @@ if(UNIX AND NOT APPLE) endif() onnxruntime_add_include_to_target(onnxruntime_training_bert onnxruntime_common onnx onnx_proto protobuf::libprotobuf onnxruntime_training) 
-target_include_directories(onnxruntime_training_bert PUBLIC ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${eigen_INCLUDE_DIRS} ${CXXOPTS} ${extra_includes} ${onnxruntime_graph_header} ${onnxruntime_exec_src_dir} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/onnx onnxruntime_training_runner) +target_include_directories(onnxruntime_training_bert PUBLIC ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${MPI_INCLUDE_DIRS} ${eigen_INCLUDE_DIRS} ${CXXOPTS} ${extra_includes} ${onnxruntime_graph_header} ${onnxruntime_exec_src_dir} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/onnx onnxruntime_training_runner) if (onnxruntime_USE_HOROVOD) target_include_directories(onnxruntime_training_bert PUBLIC ${HOROVOD_INCLUDE_DIRS}) @@ -156,7 +156,7 @@ if(UNIX AND NOT APPLE) endif() onnxruntime_add_include_to_target(onnxruntime_training_pipeline_poc onnxruntime_common onnx onnx_proto protobuf::libprotobuf onnxruntime_training) -target_include_directories(onnxruntime_training_pipeline_poc PUBLIC ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${eigen_INCLUDE_DIRS} ${CXXOPTS} ${extra_includes} ${onnxruntime_graph_header} ${onnxruntime_exec_src_dir} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/onnx onnxruntime_training_runner) +target_include_directories(onnxruntime_training_pipeline_poc PUBLIC ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${MPI_INCLUDE_DIRS} ${eigen_INCLUDE_DIRS} ${CXXOPTS} ${extra_includes} ${onnxruntime_graph_header} ${onnxruntime_exec_src_dir} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/onnx onnxruntime_training_runner) if (onnxruntime_USE_HOROVOD) target_include_directories(onnxruntime_training_pipeline_poc PUBLIC ${HOROVOD_INCLUDE_DIRS}) @@ -175,7 +175,8 @@ if(UNIX AND NOT APPLE) target_compile_options(onnxruntime_training_gpt2 PUBLIC "-Wno-maybe-uninitialized") endif() onnxruntime_add_include_to_target(onnxruntime_training_gpt2 onnxruntime_common onnx onnx_proto protobuf::libprotobuf onnxruntime_training) 
-target_include_directories(onnxruntime_training_gpt2 PUBLIC ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${eigen_INCLUDE_DIRS} ${CXXOPTS} ${extra_includes} ${onnxruntime_graph_header} ${onnxruntime_exec_src_dir} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/onnx onnxruntime_training_runner) +target_include_directories(onnxruntime_training_gpt2 PUBLIC ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${MPI_INCLUDE_DIRS} ${eigen_INCLUDE_DIRS} ${CXXOPTS} ${extra_includes} ${onnxruntime_graph_header} ${onnxruntime_exec_src_dir} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/onnx onnxruntime_training_runner) + if (onnxruntime_USE_HOROVOD) target_include_directories(onnxruntime_training_gpt2 PUBLIC ${HOROVOD_INCLUDE_DIRS}) endif() diff --git a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.h b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.h index 5281904a2b..87fa1fc5b3 100644 --- a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.h +++ b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.h @@ -42,7 +42,7 @@ class CudnnRNN { if (!cudnn_rnn_desc_) CUDNN_RETURN_IF_ERROR(cudnnCreateRNNDescriptor(&cudnn_rnn_desc_)); - CUDNN_RETURN_IF_ERROR(cudnnSetRNNDescriptor(cudnnHandle, + CUDNN_RETURN_IF_ERROR(cudnnSetRNNDescriptor_v6(cudnnHandle, cudnn_rnn_desc_, gsl::narrow_cast(hidden_size), num_layers, diff --git a/orttraining/orttraining/core/framework/mpi_setup.cc b/orttraining/orttraining/core/framework/mpi_setup.cc index 8159db0478..bf69bb12d9 100644 --- a/orttraining/orttraining/core/framework/mpi_setup.cc +++ b/orttraining/orttraining/core/framework/mpi_setup.cc @@ -1,11 +1,14 @@ +#include +#include + #include "mpi_setup.h" namespace onnxruntime { namespace training { MPIContext::MPIContext(int w_rank, int l_rank, int w_size, int l_size) : world_rank(w_rank), local_rank(l_rank), world_size(w_size), local_size(l_size) {} -#ifdef USE_HOROVOD -MPIContext setup_horovod() { - using namespace horovod::common; + +#if defined(USE_NCCL) || defined(USE_HOROVOD) +MPIContext 
setup_mpi() { // setup MPI amd horovod int is_mpi_initialized = 0; MPI_Initialized(&is_mpi_initialized); @@ -24,7 +27,10 @@ MPIContext setup_horovod() { MPI_Allgather(&world_rank, 1, MPI_INT, ranks, 1, MPI_INT, MPI_COMM_WORLD); +#ifdef USE_HOROVOD + using namespace horovod::common; horovod_init(ranks, world_size); +#endif //Get local rank and size int local_rank; @@ -46,8 +52,10 @@ MPIContext setup_horovod() { return MPIContext(world_rank, local_rank, world_size, local_size); } -void shutdown_horovod() { +void shutdown_mpi() { +#ifdef USE_HOROVOD horovod::common::horovod_shutdown(); +#endif int is_mpi_initialized = 0; MPI_Initialized(&is_mpi_initialized); diff --git a/orttraining/orttraining/core/framework/mpi_setup.h b/orttraining/orttraining/core/framework/mpi_setup.h index f92b0e2fb8..927f751fc1 100644 --- a/orttraining/orttraining/core/framework/mpi_setup.h +++ b/orttraining/orttraining/core/framework/mpi_setup.h @@ -1,8 +1,11 @@ #pragma once +#if defined(USE_NCCL) || defined(USE_HOROVOD) +#include +#endif + #ifdef USE_HOROVOD #include "orttraining/core/graph/horovod_adapters.h" -#include #endif namespace onnxruntime { @@ -16,9 +19,10 @@ struct MPIContext { int local_size; }; -#ifdef USE_HOROVOD -MPIContext setup_horovod(); -void shutdown_horovod(); +#if defined(USE_NCCL) || defined(USE_HOROVOD) +MPIContext setup_mpi(); +void shutdown_mpi(); #endif + } // namespace training } // namespace onnxruntime diff --git a/orttraining/orttraining/models/bert/main.cc b/orttraining/orttraining/models/bert/main.cc index b4894156ee..9dbdcff698 100644 --- a/orttraining/orttraining/models/bert/main.cc +++ b/orttraining/orttraining/models/bert/main.cc @@ -510,8 +510,8 @@ void setup_training_params(BertParameters& params) { params.model_with_training_graph_path = model_name_base + ORT_TSTR("_bw.onnx"); params.model_actual_running_graph_path = model_name_base + ORT_TSTR("_bw_running.onnx"); -#ifdef USE_HOROVOD - params.mpi_context = setup_horovod(); +#if defined(USE_NCCL) || 
defined(USE_HOROVOD) + params.mpi_context = setup_mpi(); if (params.pipeline_parallel_size > 1) { auto pipeline_model_name_base = model_name_base + ToPathString(std::to_string(params.mpi_context.world_rank)); @@ -519,7 +519,6 @@ void setup_training_params(BertParameters& params) { params.model_with_training_graph_path = pipeline_model_name_base + ORT_TSTR("_bw.onnx"); params.model_actual_running_graph_path = pipeline_model_name_base + ORT_TSTR("_bw_running.onnx"); } - ORT_ENFORCE(params.horizontal_parallel_size <= params.mpi_context.world_size); ORT_ENFORCE(params.data_parallel_size <= params.mpi_context.world_size); if (params.mpi_context.world_size % params.horizontal_parallel_size != 0) { @@ -795,8 +794,8 @@ int main(int argc, char* argv[]) { RETURN_IF_FAIL(RunTraining(params, *env)); } -#ifdef USE_HOROVOD - shutdown_horovod(); +#if defined(USE_NCCL) || defined(USE_HOROVOD) + shutdown_mpi(); #endif return 0; diff --git a/orttraining/orttraining/models/gpt2/main.cc b/orttraining/orttraining/models/gpt2/main.cc index 1e74f7cead..140154596e 100644 --- a/orttraining/orttraining/models/gpt2/main.cc +++ b/orttraining/orttraining/models/gpt2/main.cc @@ -291,8 +291,8 @@ void setup_training_params(GPT2Parameters& params) { {/*prediction_name*/ "output", /*label_name*/ "labels"}); -#ifdef USE_HOROVOD - params.mpi_context = setup_horovod(); +#if defined(USE_NCCL) || defined(USE_HOROVOD) + params.mpi_context = setup_mpi(); ORT_ENFORCE(params.horizontal_parallel_size <= params.mpi_context.world_size); ORT_ENFORCE(params.data_parallel_size <= params.mpi_context.world_size); if (params.mpi_context.world_size % params.horizontal_parallel_size != 0) { @@ -476,8 +476,8 @@ int main(int argc, char* argv[]) { RETURN_IF_FAIL(RunTraining(params, *env)); } -#ifdef USE_HOROVOD - shutdown_horovod(); +#if defined(USE_NCCL) || defined(USE_HOROVOD) + shutdown_mpi(); #endif return 0; diff --git a/orttraining/orttraining/python/orttraining_pybind_state.cc 
b/orttraining/orttraining/python/orttraining_pybind_state.cc index bb726b553a..5d8d702b5f 100644 --- a/orttraining/orttraining/python/orttraining_pybind_state.cc +++ b/orttraining/orttraining/python/orttraining_pybind_state.cc @@ -74,13 +74,13 @@ TrainingConfigurationResult ConfigureSessionForTraining( << data_group_size << std::endl; parameters.data_parallel_size = data_group_size; } -#ifdef USE_HOROVOD +#if defined(USE_NCCL) || defined(USE_HOROVOD) // this condition block is temporary. // For now, nccl allreduce kernel only implements for allreduce_post_accumulation // hovorod allreduce kernel only implements for not allreduce_post_accumulation. bool use_nccl = parameters.allreduce_post_accumulation; if (!use_nccl && parameters.world_size > 1) { - auto mpi_context = training::setup_horovod(); + auto mpi_context = training::setup_mpi(); std::cout << "mpi_context.world_rank: " << mpi_context.world_rank << std::endl; std::cout << "mpi_context.local_rank: " << mpi_context.local_rank << std::endl; std::cout << "mpi_context.world_size: " << mpi_context.world_size << std::endl; @@ -203,8 +203,8 @@ void addObjectMethodsForTraining(py::module& m) { return onnxruntime::make_unique(GetDefaultCPUSessionOptions(), env); })) .def("finalize", [](py::object) { -#ifdef USE_HOROVOD - training::shutdown_horovod(); +#if defined(USE_NCCL) || defined(USE_HOROVOD) + training::shutdown_mpi(); #endif }) .def("load_model", [](onnxruntime::training::TrainingSession* sess, const std::string& path, TrainingParameters& parameters) { diff --git a/orttraining/orttraining/test/graph/optimizer_graph_builder_test.cc b/orttraining/orttraining/test/graph/optimizer_graph_builder_test.cc index e975ad3782..00edeaa5d2 100644 --- a/orttraining/orttraining/test/graph/optimizer_graph_builder_test.cc +++ b/orttraining/orttraining/test/graph/optimizer_graph_builder_test.cc @@ -182,7 +182,7 @@ TEST_F(OptimizerGraphBuilderTest, Default_WithGradientAccumulation_WithMixedPrec 
TestDefaultOptimizerGraphBuilder(config, graph_); } -#if defined(USE_HOROVOD) || defined(USE_NCCL) +#if defined(USE_NCCL) || defined(USE_HOROVOD) static void TestAllreduceOptimizerGraphBuilder(OptimizerGraphConfig config, Graph& graph) { AllreduceOptimizerGraphBuilder optimizer_graph_builder( GetOptimizerBuilderRegistry(), config, GetOptInfoMap()); diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 3f0185413d..140f9156b8 100755 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -105,6 +105,10 @@ def parse_arguments(): help="Enable the pytorch frontend training tests.") parser.add_argument( "--use_horovod", action='store_true', help="Enable Horovod.") + parser.add_argument( + "--mpi_home", help="Path to MPI installation dir") + parser.add_argument( + "--nccl_home", help="Path to NCCL installation dir") # enable ONNX tests parser.add_argument( @@ -516,9 +520,9 @@ def setup_test_data(build_dir, configs): src_model_dir], shell=True) -def generate_build_tree(cmake_path, source_dir, build_dir, cuda_home, - cudnn_home, tensorrt_home, migraphx_home, path_to_protoc_exe, configs, - cmake_extra_defines, args, cmake_extra_args): +def generate_build_tree(cmake_path, source_dir, build_dir, cuda_home, cudnn_home, + mpi_home, nccl_home, tensorrt_home, migraphx_home, + path_to_protoc_exe, configs, cmake_extra_defines, args, cmake_extra_args): log.info("Generating CMake build tree") cmake_dir = os.path.join(source_dir, "cmake") # TODO: fix jemalloc build so it does not conflict with onnxruntime @@ -631,9 +635,15 @@ def generate_build_tree(cmake_path, source_dir, build_dir, cuda_home, "-Donnxruntime_ENABLE_TRAINING=" + ( "ON" if args.enable_training else "OFF"), "-Donnxruntime_USE_HOROVOD=" + ( - "ON" if args.use_horovod else "OFF"), + "ON" if args.use_horovod else "OFF") ] + if mpi_home and os.path.exists(mpi_home): + cmake_args += ["-Donnxruntime_MPI_HOME=" + mpi_home] + + if nccl_home and os.path.exists(nccl_home): + cmake_args += 
["-Donnxruntime_NCCL_HOME=" + nccl_home] + if args.winml_root_namespace_override: cmake_args += ["-Donnxruntime_WINML_NAMESPACE_OVERRIDE=" + args.winml_root_namespace_override] @@ -641,10 +651,7 @@ def generate_build_tree(cmake_path, source_dir, build_dir, cuda_home, # temp turn on only for linux gpu build if not is_windows(): if args.use_cuda: - if "-Donnxruntime_USE_HOROVOD=OFF" in cmake_args: - cmake_args.remove("-Donnxruntime_USE_HOROVOD=OFF") cmake_args += [ - "-Donnxruntime_USE_HOROVOD=ON", "-Donnxruntime_USE_FULL_PROTOBUF=ON"] # nGraph, TensorRT and OpenVINO providers currently only supports @@ -1613,6 +1620,9 @@ def main(): # if using cuda, setup cuda paths and env vars cuda_home, cudnn_home = setup_cuda_vars(args) + mpi_home = args.mpi_home + nccl_home = args.nccl_home + # if using tensorrt, setup tensorrt paths tensorrt_home = setup_tensorrt_vars(args) @@ -1702,7 +1712,7 @@ def main(): if args.enable_onnx_tests: setup_test_data(build_dir, configs) generate_build_tree( - cmake_path, source_dir, build_dir, cuda_home, cudnn_home, + cmake_path, source_dir, build_dir, cuda_home, cudnn_home, mpi_home, nccl_home, tensorrt_home, migraphx_home, path_to_protoc_exe, configs, cmake_extra_defines, args, cmake_extra_args)