Enable ORT with CUDA 11 toolkit (#4168)

* ORT on CUDA 11

1. Separate HOROVOD and MPI
2. Separate NCCL from HOROVOD in CMakeLists.txt
3. Remove dependency on external cub
4. cudnnSetRNNDescriptor is changed in cuDNN 8.0

* polish the code about MPI/NCCL in CMakeLists.txt and build.py

* check CUDA version

* ${MPI_INCLUDE_DIRS} should be PUBLIC

* sm30, sm50 are deprecated in CUDA 11 Toolkit

* update change based on code review feedback.

* add sm_52

* improve MPI/NCCL build path

Co-authored-by: Weixing Zhang <wezhan@microsoft.com>
This commit is contained in:
Weixing Zhang 2020-06-15 08:47:03 -07:00 committed by GitHub
parent 88a9cceb41
commit b4b1c6440a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 178 additions and 53 deletions

View file

@ -108,6 +108,7 @@ option(onnxruntime_ENABLE_NVTX_PROFILE "Enable NVTX profile." OFF)
option(onnxruntime_ENABLE_TRAINING "Enable training functionality." OFF)
option(onnxruntime_ENABLE_TRAINING_E2E_TESTS "Enable training end-to-end tests." OFF)
option(onnxruntime_USE_HOROVOD "Build with HOROVOD support" OFF)
option(onnxruntime_USE_NCCL "Build with NCCL support" ON)
if (onnxruntime_ENABLE_NVTX_PROFILE)
add_definitions(-DENABLE_NVTX_PROFILE=1)
@ -824,7 +825,13 @@ if (onnxruntime_USE_CUDA)
string(APPEND CMAKE_CUDA_FLAGS "-cudart shared")
endif()
enable_language(CUDA)
set(CMAKE_CUDA_STANDARD 11)
string(REGEX REPLACE "([0-9]+)\\.([0-9]+).*" "\\1" CUDA_VERSION_MAJOR "${CMAKE_CUDA_COMPILER_VERSION}")
message( STATUS "CUDA_VERSION_MAJOR: ${CUDA_VERSION_MAJOR}")
if (CUDA_VERSION_MAJOR EQUAL 11)
set(CMAKE_CUDA_STANDARD 14)
else()
set(CMAKE_CUDA_STANDARD 11)
endif()
file(TO_CMAKE_PATH ${onnxruntime_CUDNN_HOME} onnxruntime_CUDNN_HOME)
set(ONNXRUNTIME_CUDA_LIBRARIES ${CUDA_LIBRARIES})
@ -850,8 +857,12 @@ if (onnxruntime_USE_CUDA)
endif()
list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${ONNXRUNTIME_CUDA_LIBRARIES})
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_30,code=sm_30") # K series
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_50,code=sm_50") # M series
# the following compute capabilities are deprecated in CUDA 11 Toolkit
if (CUDA_VERSION_MAJOR LESS 11)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_30,code=sm_30") # K series
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_50,code=sm_50") # M series
endif()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_52,code=sm_52") # M60
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_60,code=sm_60") # P series
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_70,code=sm_70") # V series
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --default-stream legacy")
@ -918,28 +929,108 @@ endif()
if (onnxruntime_ENABLE_TRAINING)
add_compile_definitions(ENABLE_TRAINING)
if (onnxruntime_USE_HOROVOD)
if (UNIX)
# Find MPI
find_path(MPI_INCLUDE_DIR
NAMES mpi.h
HINTS
${onnxruntime_MPI_HOME}/include
/bert_ort/openmpi/include)
set(MPI_LIBNAME "mpi")
find_library(MPI_LIBRARY
NAMES ${MPI_LIBNAME}
HINTS
${onnxruntime_MPI_HOME}/lib
/bert_ort/openmpi/lib)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(MPI DEFAULT_MSG MPI_INCLUDE_DIR MPI_LIBRARY)
if (MPI_FOUND)
execute_process(COMMAND mpirun --version OUTPUT_VARIABLE MPIRUN_OUTPUT)
string( REGEX MATCH "[0-9]+.[0-9]+.[0-9]" MPI_VERSION ${MPIRUN_OUTPUT})
message( STATUS "MPI Version: ${MPI_VERSION}")
set(MPI_INCLUDE_DIRS ${MPI_INCLUDE_DIR})
set(MPI_LIBRARIES ${MPI_LIBRARY})
message( STATUS "MPI (include: ${MPI_INCLUDE_DIRS}, library: ${MPI_LIBRARIES})" )
mark_as_advanced(MPI_INCLUDE_DIRS MPI_LIBRARIES)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${MPI_LIBRARIES} ${MPI_CXX_LINK_FLAGS})
else ()
set(onnxruntime_USE_NCCL OFF)
set(onnxruntime_USE_HOROVOD OFF)
message( WARNING "MPI is not found. Please use --mpi_home to specify the path of MPI. Otherwise, NCCL or HOROVOD will be disabled." )
endif()
# Find NCCL and MPI
if (onnxruntime_USE_NCCL AND MPI_FOUND)
find_path(NCCL_INCLUDE_DIR
NAMES nccl.h
HINTS
${onnxruntime_NCCL_HOME}/include
$ENV{CUDA_ROOT}/include)
set(NCCL_LIBNAME "nccl")
find_library(NCCL_LIBRARY
NAMES ${NCCL_LIBNAME}
HINTS
${onnxruntime_NCCL_HOME}/lib/x86_64-linux-gnu
$ENV{CUDA_ROOT}/lib64)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIR NCCL_LIBRARY)
if (NCCL_FOUND)
set(NCCL_HEADER_FILE "${NCCL_INCLUDE_DIR}/nccl.h")
message( STATUS "Determining NCCL version from the header file: ${NCCL_HEADER_FILE}" )
file (STRINGS ${NCCL_HEADER_FILE} NCCL_MAJOR_VERSION_DEFINED
REGEX "^[ \t]*#define[ \t]+NCCL_MAJOR[ \t]+[0-9]+.*$" LIMIT_COUNT 1)
if (NCCL_MAJOR_VERSION_DEFINED)
string (REGEX REPLACE "^[ \t]*#define[ \t]+NCCL_MAJOR[ \t]+" ""
NCCL_MAJOR_VERSION ${NCCL_MAJOR_VERSION_DEFINED})
message( STATUS "NCCL_MAJOR_VERSION: ${NCCL_MAJOR_VERSION}" )
endif()
file (STRINGS ${NCCL_HEADER_FILE} NCCL_MINOR_VERSION_DEFINED
REGEX "^[ \t]*#define[ \t]+NCCL_MINOR[ \t]+[0-9]+.*$" LIMIT_COUNT 1)
if (NCCL_MINOR_VERSION_DEFINED)
string (REGEX REPLACE "^[ \t]*#define[ \t]+NCCL_MINOR[ \t]+" ""
NCCL_MINOR_VERSION ${NCCL_MINOR_VERSION_DEFINED})
message(STATUS "NCCL_MINOR_VERSION: ${NCCL_MINOR_VERSION}")
endif()
set(NCCL_INCLUDE_DIRS ${NCCL_INCLUDE_DIR})
set(NCCL_LIBRARIES ${NCCL_LIBRARY})
message( STATUS "NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})" )
mark_as_advanced(NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${NCCL_LIBRARIES})
add_definitions(-DUSE_NCCL=1)
message( STATUS "NCCL is enabled in Linux GPU Build." )
else ()
set(onnxruntime_USE_NCCL OFF)
message( WARNING "NCCL is not found. Please use --nccl_home to specify the path of NCCL. Otherwise, NCCL is disabled." )
endif()
endif()
endif()
if (onnxruntime_USE_HOROVOD AND MPI_FOUND)
if (WIN32)
message( FATAL_ERROR "Horovod is not supported on Windows." )
elseif (UNIX)
find_package(MPI REQUIRED)
add_definitions(-DUSE_HOROVOD=1)
set(HOROVOD_ROOT ${PROJECT_SOURCE_DIR}/external/horovod)
set(HOROVOD_INCLUDE_DIRS "${HOROVOD_ROOT}/horovod/common" ${MPI_CXX_INCLUDE_PATH})
set(HOROVOD_INCLUDE_DIRS "${HOROVOD_ROOT}/horovod/common")
add_subdirectory(horovod EXCLUDE_FROM_ALL)
# use external/horovod/third_party/gloo/cmake/Modules/Findnccl.cmake to locate nccl lib path
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/external/horovod/third_party/gloo/cmake/Modules/)
find_package(nccl REQUIRED)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES horovod ${NCCL_LIBRARIES} ${MPI_CXX_LIBRARIES} ${MPI_CXX_LINK_FLAGS})
endif()
endif()
if (onnxruntime_USE_CUDA)
if (WIN32)
message(WARNING "NCCL is not supported on Windows GPU Build." )
elseif (UNIX)
message( "NCCL is enabled in Linux GPU Build." )
add_definitions(-DUSE_NCCL=1)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES horovod)
endif()
endif()

View file

@ -61,6 +61,7 @@ target_include_directories(onnxruntime_graph PRIVATE ${ONNXRUNTIME_ROOT})
if (onnxruntime_ENABLE_TRAINING)
target_include_directories(onnxruntime_graph PRIVATE ${ORTTRAINING_ROOT})
if (onnxruntime_USE_HOROVOD)
target_include_directories(onnxruntime_graph PRIVATE ${HOROVOD_INCLUDE_DIRS})
endif()

View file

@ -205,6 +205,12 @@ if (onnxruntime_USE_CUDA)
"${ORTTRAINING_SOURCE_DIR}/training_ops/cuda/collective/nccl_kernels.cc"
"${ORTTRAINING_SOURCE_DIR}/training_ops/cuda/collective/megatron.cc"
)
elseif (NOT onnxruntime_USE_NCCL)
list(REMOVE_ITEM onnxruntime_cuda_training_ops_cc_srcs
"${ORTTRAINING_SOURCE_DIR}/training_ops/cuda/collective/nccl_common.cc"
"${ORTTRAINING_SOURCE_DIR}/training_ops/cuda/collective/nccl_kernels.cc"
"${ORTTRAINING_SOURCE_DIR}/training_ops/cuda/collective/megatron.cc"
)
endif()
source_group(TREE ${ORTTRAINING_ROOT} FILES ${onnxruntime_cuda_training_ops_cc_srcs} ${onnxruntime_cuda_training_ops_cu_srcs})
@ -227,13 +233,18 @@ if (onnxruntime_USE_CUDA)
target_link_libraries(onnxruntime_providers_cuda PRIVATE onnxruntime_training)
endif()
add_dependencies(onnxruntime_providers_cuda ${onnxruntime_EXTERNAL_DEPENDENCIES} ${onnxruntime_tvm_dependencies})
target_include_directories(onnxruntime_providers_cuda PRIVATE ${ONNXRUNTIME_ROOT} ${PROJECT_SOURCE_DIR}/external/cub ${onnxruntime_CUDNN_HOME}/include ${eigen_INCLUDE_DIRS} ${TVM_INCLUDES} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
target_include_directories(onnxruntime_providers_cuda PRIVATE ${ONNXRUNTIME_ROOT} ${onnxruntime_CUDNN_HOME}/include ${eigen_INCLUDE_DIRS} ${TVM_INCLUDES} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/cuda DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers)
set_target_properties(onnxruntime_providers_cuda PROPERTIES LINKER_LANGUAGE CUDA)
set_target_properties(onnxruntime_providers_cuda PROPERTIES FOLDER "ONNXRuntime")
if (CUDA_VERSION_MAJOR LESS 11)
target_include_directories(onnxruntime_providers_cuda PRIVATE ${PROJECT_SOURCE_DIR}/external/cub)
endif()
if (onnxruntime_ENABLE_TRAINING)
target_include_directories(onnxruntime_providers_cuda PRIVATE ${ORTTRAINING_ROOT})
target_include_directories(onnxruntime_providers_cuda PRIVATE ${ORTTRAINING_ROOT} ${MPI_INCLUDE_DIRS})
if (onnxruntime_USE_HOROVOD)
target_include_directories(onnxruntime_providers_cuda PRIVATE ${HOROVOD_INCLUDE_DIRS})
endif()

View file

@ -22,7 +22,7 @@ if(WIN32)
target_compile_options(onnxruntime_training PRIVATE /wd4100)
endif()
target_include_directories(onnxruntime_training PRIVATE ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${eigen_INCLUDE_DIRS} ${RE2_INCLUDE_DIR} PUBLIC ${onnxruntime_graph_header})
target_include_directories(onnxruntime_training PRIVATE ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${eigen_INCLUDE_DIRS} ${RE2_INCLUDE_DIR} PUBLIC ${onnxruntime_graph_header} ${MPI_INCLUDE_DIRS})
if (onnxruntime_USE_CUDA)
target_include_directories(onnxruntime_training PRIVATE ${onnxruntime_CUDNN_HOME}/include ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
@ -135,7 +135,7 @@ if(UNIX AND NOT APPLE)
endif()
onnxruntime_add_include_to_target(onnxruntime_training_bert onnxruntime_common onnx onnx_proto protobuf::libprotobuf onnxruntime_training)
target_include_directories(onnxruntime_training_bert PUBLIC ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${eigen_INCLUDE_DIRS} ${CXXOPTS} ${extra_includes} ${onnxruntime_graph_header} ${onnxruntime_exec_src_dir} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/onnx onnxruntime_training_runner)
target_include_directories(onnxruntime_training_bert PUBLIC ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${MPI_INCLUDE_DIRS} ${eigen_INCLUDE_DIRS} ${CXXOPTS} ${extra_includes} ${onnxruntime_graph_header} ${onnxruntime_exec_src_dir} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/onnx onnxruntime_training_runner)
if (onnxruntime_USE_HOROVOD)
target_include_directories(onnxruntime_training_bert PUBLIC ${HOROVOD_INCLUDE_DIRS})
@ -156,7 +156,7 @@ if(UNIX AND NOT APPLE)
endif()
onnxruntime_add_include_to_target(onnxruntime_training_pipeline_poc onnxruntime_common onnx onnx_proto protobuf::libprotobuf onnxruntime_training)
target_include_directories(onnxruntime_training_pipeline_poc PUBLIC ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${eigen_INCLUDE_DIRS} ${CXXOPTS} ${extra_includes} ${onnxruntime_graph_header} ${onnxruntime_exec_src_dir} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/onnx onnxruntime_training_runner)
target_include_directories(onnxruntime_training_pipeline_poc PUBLIC ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${MPI_INCLUDE_DIRS} ${eigen_INCLUDE_DIRS} ${CXXOPTS} ${extra_includes} ${onnxruntime_graph_header} ${onnxruntime_exec_src_dir} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/onnx onnxruntime_training_runner)
if (onnxruntime_USE_HOROVOD)
target_include_directories(onnxruntime_training_pipeline_poc PUBLIC ${HOROVOD_INCLUDE_DIRS})
@ -175,7 +175,8 @@ if(UNIX AND NOT APPLE)
target_compile_options(onnxruntime_training_gpt2 PUBLIC "-Wno-maybe-uninitialized")
endif()
onnxruntime_add_include_to_target(onnxruntime_training_gpt2 onnxruntime_common onnx onnx_proto protobuf::libprotobuf onnxruntime_training)
target_include_directories(onnxruntime_training_gpt2 PUBLIC ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${eigen_INCLUDE_DIRS} ${CXXOPTS} ${extra_includes} ${onnxruntime_graph_header} ${onnxruntime_exec_src_dir} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/onnx onnxruntime_training_runner)
target_include_directories(onnxruntime_training_gpt2 PUBLIC ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${MPI_INCLUDE_DIRS} ${eigen_INCLUDE_DIRS} ${CXXOPTS} ${extra_includes} ${onnxruntime_graph_header} ${onnxruntime_exec_src_dir} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/onnx onnxruntime_training_runner)
if (onnxruntime_USE_HOROVOD)
target_include_directories(onnxruntime_training_gpt2 PUBLIC ${HOROVOD_INCLUDE_DIRS})
endif()

View file

@ -42,7 +42,7 @@ class CudnnRNN {
if (!cudnn_rnn_desc_)
CUDNN_RETURN_IF_ERROR(cudnnCreateRNNDescriptor(&cudnn_rnn_desc_));
CUDNN_RETURN_IF_ERROR(cudnnSetRNNDescriptor(cudnnHandle,
CUDNN_RETURN_IF_ERROR(cudnnSetRNNDescriptor_v6(cudnnHandle,
cudnn_rnn_desc_,
gsl::narrow_cast<int>(hidden_size),
num_layers,

View file

@ -1,11 +1,14 @@
#include <stdio.h>
#include <stdlib.h>
#include "mpi_setup.h"
namespace onnxruntime {
namespace training {
MPIContext::MPIContext(int w_rank, int l_rank, int w_size, int l_size) : world_rank(w_rank), local_rank(l_rank), world_size(w_size), local_size(l_size) {}
#ifdef USE_HOROVOD
MPIContext setup_horovod() {
using namespace horovod::common;
#if defined(USE_NCCL) || defined(USE_HOROVOD)
MPIContext setup_mpi() {
// setup MPI and horovod
int is_mpi_initialized = 0;
MPI_Initialized(&is_mpi_initialized);
@ -24,7 +27,10 @@ MPIContext setup_horovod() {
MPI_Allgather(&world_rank, 1, MPI_INT, ranks, 1, MPI_INT, MPI_COMM_WORLD);
#ifdef USE_HOROVOD
using namespace horovod::common;
horovod_init(ranks, world_size);
#endif
//Get local rank and size
int local_rank;
@ -46,8 +52,10 @@ MPIContext setup_horovod() {
return MPIContext(world_rank, local_rank, world_size, local_size);
}
void shutdown_horovod() {
void shutdown_mpi() {
#ifdef USE_HOROVOD
horovod::common::horovod_shutdown();
#endif
int is_mpi_initialized = 0;
MPI_Initialized(&is_mpi_initialized);

View file

@ -1,8 +1,11 @@
#pragma once
#if defined(USE_NCCL) || defined(USE_HOROVOD)
#include <mpi.h>
#endif
#ifdef USE_HOROVOD
#include "orttraining/core/graph/horovod_adapters.h"
#include <mpi.h>
#endif
namespace onnxruntime {
@ -16,9 +19,10 @@ struct MPIContext {
int local_size;
};
#ifdef USE_HOROVOD
MPIContext setup_horovod();
void shutdown_horovod();
#if defined(USE_NCCL) || defined(USE_HOROVOD)
MPIContext setup_mpi();
void shutdown_mpi();
#endif
} // namespace training
} // namespace onnxruntime

View file

@ -510,8 +510,8 @@ void setup_training_params(BertParameters& params) {
params.model_with_training_graph_path = model_name_base + ORT_TSTR("_bw.onnx");
params.model_actual_running_graph_path = model_name_base + ORT_TSTR("_bw_running.onnx");
#ifdef USE_HOROVOD
params.mpi_context = setup_horovod();
#if defined(USE_NCCL) || defined(USE_HOROVOD)
params.mpi_context = setup_mpi();
if (params.pipeline_parallel_size > 1) {
auto pipeline_model_name_base = model_name_base + ToPathString(std::to_string(params.mpi_context.world_rank));
@ -519,7 +519,6 @@ void setup_training_params(BertParameters& params) {
params.model_with_training_graph_path = pipeline_model_name_base + ORT_TSTR("_bw.onnx");
params.model_actual_running_graph_path = pipeline_model_name_base + ORT_TSTR("_bw_running.onnx");
}
ORT_ENFORCE(params.horizontal_parallel_size <= params.mpi_context.world_size);
ORT_ENFORCE(params.data_parallel_size <= params.mpi_context.world_size);
if (params.mpi_context.world_size % params.horizontal_parallel_size != 0) {
@ -795,8 +794,8 @@ int main(int argc, char* argv[]) {
RETURN_IF_FAIL(RunTraining(params, *env));
}
#ifdef USE_HOROVOD
shutdown_horovod();
#if defined(USE_NCCL) || defined(USE_HOROVOD)
shutdown_mpi();
#endif
return 0;

View file

@ -291,8 +291,8 @@ void setup_training_params(GPT2Parameters& params) {
{/*prediction_name*/ "output",
/*label_name*/ "labels"});
#ifdef USE_HOROVOD
params.mpi_context = setup_horovod();
#if defined(USE_NCCL) || defined(USE_HOROVOD)
params.mpi_context = setup_mpi();
ORT_ENFORCE(params.horizontal_parallel_size <= params.mpi_context.world_size);
ORT_ENFORCE(params.data_parallel_size <= params.mpi_context.world_size);
if (params.mpi_context.world_size % params.horizontal_parallel_size != 0) {
@ -476,8 +476,8 @@ int main(int argc, char* argv[]) {
RETURN_IF_FAIL(RunTraining(params, *env));
}
#ifdef USE_HOROVOD
shutdown_horovod();
#if defined(USE_NCCL) || defined(USE_HOROVOD)
shutdown_mpi();
#endif
return 0;

View file

@ -74,13 +74,13 @@ TrainingConfigurationResult ConfigureSessionForTraining(
<< data_group_size << std::endl;
parameters.data_parallel_size = data_group_size;
}
#ifdef USE_HOROVOD
#if defined(USE_NCCL) || defined(USE_HOROVOD)
// this condition block is temporary.
// For now, nccl allreduce kernel only implements for allreduce_post_accumulation
// horovod allreduce kernel only implements for not allreduce_post_accumulation.
bool use_nccl = parameters.allreduce_post_accumulation;
if (!use_nccl && parameters.world_size > 1) {
auto mpi_context = training::setup_horovod();
auto mpi_context = training::setup_mpi();
std::cout << "mpi_context.world_rank: " << mpi_context.world_rank << std::endl;
std::cout << "mpi_context.local_rank: " << mpi_context.local_rank << std::endl;
std::cout << "mpi_context.world_size: " << mpi_context.world_size << std::endl;
@ -203,8 +203,8 @@ void addObjectMethodsForTraining(py::module& m) {
return onnxruntime::make_unique<onnxruntime::training::TrainingSession>(GetDefaultCPUSessionOptions(), env);
}))
.def("finalize", [](py::object) {
#ifdef USE_HOROVOD
training::shutdown_horovod();
#if defined(USE_NCCL) || defined(USE_HOROVOD)
training::shutdown_mpi();
#endif
})
.def("load_model", [](onnxruntime::training::TrainingSession* sess, const std::string& path, TrainingParameters& parameters) {

View file

@ -182,7 +182,7 @@ TEST_F(OptimizerGraphBuilderTest, Default_WithGradientAccumulation_WithMixedPrec
TestDefaultOptimizerGraphBuilder(config, graph_);
}
#if defined(USE_HOROVOD) || defined(USE_NCCL)
#if defined(USE_NCCL) || defined(USE_HOROVOD)
static void TestAllreduceOptimizerGraphBuilder(OptimizerGraphConfig config, Graph& graph) {
AllreduceOptimizerGraphBuilder optimizer_graph_builder(
GetOptimizerBuilderRegistry(), config, GetOptInfoMap());

View file

@ -105,6 +105,10 @@ def parse_arguments():
help="Enable the pytorch frontend training tests.")
parser.add_argument(
"--use_horovod", action='store_true', help="Enable Horovod.")
parser.add_argument(
"--mpi_home", help="Path to MPI installation dir")
parser.add_argument(
"--nccl_home", help="Path to NCCL installation dir")
# enable ONNX tests
parser.add_argument(
@ -516,9 +520,9 @@ def setup_test_data(build_dir, configs):
src_model_dir], shell=True)
def generate_build_tree(cmake_path, source_dir, build_dir, cuda_home,
cudnn_home, tensorrt_home, migraphx_home, path_to_protoc_exe, configs,
cmake_extra_defines, args, cmake_extra_args):
def generate_build_tree(cmake_path, source_dir, build_dir, cuda_home, cudnn_home,
mpi_home, nccl_home, tensorrt_home, migraphx_home,
path_to_protoc_exe, configs, cmake_extra_defines, args, cmake_extra_args):
log.info("Generating CMake build tree")
cmake_dir = os.path.join(source_dir, "cmake")
# TODO: fix jemalloc build so it does not conflict with onnxruntime
@ -631,9 +635,15 @@ def generate_build_tree(cmake_path, source_dir, build_dir, cuda_home,
"-Donnxruntime_ENABLE_TRAINING=" + (
"ON" if args.enable_training else "OFF"),
"-Donnxruntime_USE_HOROVOD=" + (
"ON" if args.use_horovod else "OFF"),
"ON" if args.use_horovod else "OFF")
]
if mpi_home and os.path.exists(mpi_home):
cmake_args += ["-Donnxruntime_MPI_HOME=" + mpi_home]
if nccl_home and os.path.exists(nccl_home):
cmake_args += ["-Donnxruntime_NCCL_HOME=" + nccl_home]
if args.winml_root_namespace_override:
cmake_args += ["-Donnxruntime_WINML_NAMESPACE_OVERRIDE="
+ args.winml_root_namespace_override]
@ -641,10 +651,7 @@ def generate_build_tree(cmake_path, source_dir, build_dir, cuda_home,
# temp turn on only for linux gpu build
if not is_windows():
if args.use_cuda:
if "-Donnxruntime_USE_HOROVOD=OFF" in cmake_args:
cmake_args.remove("-Donnxruntime_USE_HOROVOD=OFF")
cmake_args += [
"-Donnxruntime_USE_HOROVOD=ON",
"-Donnxruntime_USE_FULL_PROTOBUF=ON"]
# nGraph, TensorRT and OpenVINO providers currently only supports
@ -1613,6 +1620,9 @@ def main():
# if using cuda, setup cuda paths and env vars
cuda_home, cudnn_home = setup_cuda_vars(args)
mpi_home = args.mpi_home
nccl_home = args.nccl_home
# if using tensorrt, setup tensorrt paths
tensorrt_home = setup_tensorrt_vars(args)
@ -1702,7 +1712,7 @@ def main():
if args.enable_onnx_tests:
setup_test_data(build_dir, configs)
generate_build_tree(
cmake_path, source_dir, build_dir, cuda_home, cudnn_home,
cmake_path, source_dir, build_dir, cuda_home, cudnn_home, mpi_home, nccl_home,
tensorrt_home, migraphx_home, path_to_protoc_exe, configs, cmake_extra_defines,
args, cmake_extra_args)