Enable ORT with CUDA 11 toolkit (#4168)

* ORT on CUDA 11

1. Separate HOROVOD and MPI
2. Separate NCCL from HOROVOD in CMakeLists.txt
3. Remove dependency on external cub
4. cudnnSetRNNDescriptor is changed in cuDNN 8.0

* polish the code about MPI/NCCL in CMakeLists.txt and build.py

* check CUDA version

* ${MPI_INCLUDE_DIRS} should be PUBLIC

* sm30, sm50 are deprecated in CUDA 11 Toolkit

* update change based on code review feedback.

* add sm_52

* improve MPI/NCCL build path

Co-authored-by: Weixing Zhang <wezhan@microsoft.com>
This commit is contained in:
Weixing Zhang 2020-06-15 08:47:03 -07:00 committed by GitHub
parent 88a9cceb41
commit b4b1c6440a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 178 additions and 53 deletions

View file

@ -108,6 +108,7 @@ option(onnxruntime_ENABLE_NVTX_PROFILE "Enable NVTX profile." OFF)
option(onnxruntime_ENABLE_TRAINING "Enable training functionality." OFF)
option(onnxruntime_ENABLE_TRAINING_E2E_TESTS "Enable training end-to-end tests." OFF)
option(onnxruntime_USE_HOROVOD "Build with HOROVOD support" OFF)
option(onnxruntime_USE_NCCL "Build with NCCL support" ON)
if (onnxruntime_ENABLE_NVTX_PROFILE)
add_definitions(-DENABLE_NVTX_PROFILE=1)
@ -824,7 +825,13 @@ if (onnxruntime_USE_CUDA)
string(APPEND CMAKE_CUDA_FLAGS "-cudart shared")
endif()
enable_language(CUDA)
set(CMAKE_CUDA_STANDARD 11)
string(REGEX REPLACE "([0-9]+)\\.([0-9]+).*" "\\1" CUDA_VERSION_MAJOR "${CMAKE_CUDA_COMPILER_VERSION}")
message( STATUS "CUDA_VERSION_MAJOR: ${CUDA_VERSION_MAJOR}")
if (CUDA_VERSION_MAJOR EQUAL 11)
set(CMAKE_CUDA_STANDARD 14)
else()
set(CMAKE_CUDA_STANDARD 11)
endif()
file(TO_CMAKE_PATH ${onnxruntime_CUDNN_HOME} onnxruntime_CUDNN_HOME)
set(ONNXRUNTIME_CUDA_LIBRARIES ${CUDA_LIBRARIES})
@ -850,8 +857,12 @@ if (onnxruntime_USE_CUDA)
endif()
list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${ONNXRUNTIME_CUDA_LIBRARIES})
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_30,code=sm_30") # K series
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_50,code=sm_50") # M series
# the following compute capabilities are deprecated in CUDA 11 Toolkit
if (CUDA_VERSION_MAJOR LESS 11)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_30,code=sm_30") # K series
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_50,code=sm_50") # M series
endif()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_52,code=sm_52") # M60
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_60,code=sm_60") # P series
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_70,code=sm_70") # V series
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --default-stream legacy")
@ -918,28 +929,108 @@ endif()
if (onnxruntime_ENABLE_TRAINING)
add_compile_definitions(ENABLE_TRAINING)
if (onnxruntime_USE_HOROVOD)
if (UNIX)
# Find MPI
find_path(MPI_INCLUDE_DIR
NAMES mpi.h
HINTS
${onnxruntime_MPI_HOME}/include
/bert_ort/openmpi/include)
set(MPI_LIBNAME "mpi")
find_library(MPI_LIBRARY
NAMES ${MPI_LIBNAME}
HINTS
${onnxruntime_MPI_HOME}/lib
/bert_ort/openmpi/lib)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(MPI DEFAULT_MSG MPI_INCLUDE_DIR MPI_LIBRARY)
if (MPI_FOUND)
execute_process(COMMAND mpirun --version OUTPUT_VARIABLE MPIRUN_OUTPUT)
string( REGEX MATCH "[0-9]+.[0-9]+.[0-9]" MPI_VERSION ${MPIRUN_OUTPUT})
message( STATUS "MPI Version: ${MPI_VERSION}")
set(MPI_INCLUDE_DIRS ${MPI_INCLUDE_DIR})
set(MPI_LIBRARIES ${MPI_LIBRARY})
message( STATUS "MPI (include: ${MPI_INCLUDE_DIRS}, library: ${MPI_LIBRARIES})" )
mark_as_advanced(MPI_INCLUDE_DIRS MPI_LIBRARIES)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${MPI_LIBRARIES} ${MPI_CXX_LINK_FLAGS})
else ()
set(onnxruntime_USE_NCCL OFF)
set(onnxruntime_USE_HOROVOD OFF)
message( WARNING "MPI is not found. Please use --mpi_home to specify the path of MPI. Otherwise, NCCL or HOROVOD will be disabled." )
endif()
# Find NCCL and MPI
if (onnxruntime_USE_NCCL AND MPI_FOUND)
find_path(NCCL_INCLUDE_DIR
NAMES nccl.h
HINTS
${onnxruntime_NCCL_HOME}/include
$ENV{CUDA_ROOT}/include)
set(NCCL_LIBNAME "nccl")
find_library(NCCL_LIBRARY
NAMES ${NCCL_LIBNAME}
HINTS
${onnxruntime_NCCL_HOME}/lib/x86_64-linux-gnu
$ENV{CUDA_ROOT}/lib64)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIR NCCL_LIBRARY)
if (NCCL_FOUND)
set(NCCL_HEADER_FILE "${NCCL_INCLUDE_DIR}/nccl.h")
message( STATUS "Determining NCCL version from the header file: ${NCCL_HEADER_FILE}" )
file (STRINGS ${NCCL_HEADER_FILE} NCCL_MAJOR_VERSION_DEFINED
REGEX "^[ \t]*#define[ \t]+NCCL_MAJOR[ \t]+[0-9]+.*$" LIMIT_COUNT 1)
if (NCCL_MAJOR_VERSION_DEFINED)
string (REGEX REPLACE "^[ \t]*#define[ \t]+NCCL_MAJOR[ \t]+" ""
NCCL_MAJOR_VERSION ${NCCL_MAJOR_VERSION_DEFINED})
message( STATUS "NCCL_MAJOR_VERSION: ${NCCL_MAJOR_VERSION}" )
endif()
file (STRINGS ${NCCL_HEADER_FILE} NCCL_MINOR_VERSION_DEFINED
REGEX "^[ \t]*#define[ \t]+NCCL_MINOR[ \t]+[0-9]+.*$" LIMIT_COUNT 1)
if (NCCL_MINOR_VERSION_DEFINED)
string (REGEX REPLACE "^[ \t]*#define[ \t]+NCCL_MINOR[ \t]+" ""
NCCL_MINOR_VERSION ${NCCL_MINOR_VERSION_DEFINED})
message(STATUS "NCCL_MINOR_VERSION: ${NCCL_MINOR_VERSION}")
endif()
set(NCCL_INCLUDE_DIRS ${NCCL_INCLUDE_DIR})
set(NCCL_LIBRARIES ${NCCL_LIBRARY})
message( STATUS "NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})" )
mark_as_advanced(NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${NCCL_LIBRARIES})
add_definitions(-DUSE_NCCL=1)
message( STATUS "NCCL is enabled in Linux GPU Build." )
else ()
set(onnxruntime_USE_NCCL OFF)
message( WARNING "NCCL is not found. Please use --nccl_home to specify the path of NCCL. Otherwise, NCCL is disabled." )
endif()
endif()
endif()
if (onnxruntime_USE_HOROVOD AND MPI_FOUND)
if (WIN32)
message( FATAL_ERROR "Horovod is not supported on Windows." )
elseif (UNIX)
find_package(MPI REQUIRED)
add_definitions(-DUSE_HOROVOD=1)
set(HOROVOD_ROOT ${PROJECT_SOURCE_DIR}/external/horovod)
set(HOROVOD_INCLUDE_DIRS "${HOROVOD_ROOT}/horovod/common" ${MPI_CXX_INCLUDE_PATH})
set(HOROVOD_INCLUDE_DIRS "${HOROVOD_ROOT}/horovod/common")
add_subdirectory(horovod EXCLUDE_FROM_ALL)
# use external/horovod/third_party/gloo/cmake/Modules/Findnccl.cmake to locate nccl lib path
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/external/horovod/third_party/gloo/cmake/Modules/)
find_package(nccl REQUIRED)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES horovod ${NCCL_LIBRARIES} ${MPI_CXX_LIBRARIES} ${MPI_CXX_LINK_FLAGS})
endif()
endif()
if (onnxruntime_USE_CUDA)
if (WIN32)
message(WARNING "NCCL is not supported on Windows GPU Build." )
elseif (UNIX)
message( "NCCL is enabled in Linux GPU Build." )
add_definitions(-DUSE_NCCL=1)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES horovod)
endif()
endif()

View file

@ -61,6 +61,7 @@ target_include_directories(onnxruntime_graph PRIVATE ${ONNXRUNTIME_ROOT})
if (onnxruntime_ENABLE_TRAINING)
target_include_directories(onnxruntime_graph PRIVATE ${ORTTRAINING_ROOT})
if (onnxruntime_USE_HOROVOD)
target_include_directories(onnxruntime_graph PRIVATE ${HOROVOD_INCLUDE_DIRS})
endif()

View file

@ -205,6 +205,12 @@ if (onnxruntime_USE_CUDA)
"${ORTTRAINING_SOURCE_DIR}/training_ops/cuda/collective/nccl_kernels.cc"
"${ORTTRAINING_SOURCE_DIR}/training_ops/cuda/collective/megatron.cc"
)
elseif (NOT onnxruntime_USE_NCCL)
list(REMOVE_ITEM onnxruntime_cuda_training_ops_cc_srcs
"${ORTTRAINING_SOURCE_DIR}/training_ops/cuda/collective/nccl_common.cc"
"${ORTTRAINING_SOURCE_DIR}/training_ops/cuda/collective/nccl_kernels.cc"
"${ORTTRAINING_SOURCE_DIR}/training_ops/cuda/collective/megatron.cc"
)
endif()
source_group(TREE ${ORTTRAINING_ROOT} FILES ${onnxruntime_cuda_training_ops_cc_srcs} ${onnxruntime_cuda_training_ops_cu_srcs})
@ -227,13 +233,18 @@ if (onnxruntime_USE_CUDA)
target_link_libraries(onnxruntime_providers_cuda PRIVATE onnxruntime_training)
endif()
add_dependencies(onnxruntime_providers_cuda ${onnxruntime_EXTERNAL_DEPENDENCIES} ${onnxruntime_tvm_dependencies})
target_include_directories(onnxruntime_providers_cuda PRIVATE ${ONNXRUNTIME_ROOT} ${PROJECT_SOURCE_DIR}/external/cub ${onnxruntime_CUDNN_HOME}/include ${eigen_INCLUDE_DIRS} ${TVM_INCLUDES} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
target_include_directories(onnxruntime_providers_cuda PRIVATE ${ONNXRUNTIME_ROOT} ${onnxruntime_CUDNN_HOME}/include ${eigen_INCLUDE_DIRS} ${TVM_INCLUDES} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/cuda DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers)
set_target_properties(onnxruntime_providers_cuda PROPERTIES LINKER_LANGUAGE CUDA)
set_target_properties(onnxruntime_providers_cuda PROPERTIES FOLDER "ONNXRuntime")
if (CUDA_VERSION_MAJOR LESS 11)
target_include_directories(onnxruntime_providers_cuda PRIVATE ${PROJECT_SOURCE_DIR}/external/cub)
endif()
if (onnxruntime_ENABLE_TRAINING)
target_include_directories(onnxruntime_providers_cuda PRIVATE ${ORTTRAINING_ROOT})
target_include_directories(onnxruntime_providers_cuda PRIVATE ${ORTTRAINING_ROOT} ${MPI_INCLUDE_DIRS})
if (onnxruntime_USE_HOROVOD)
target_include_directories(onnxruntime_providers_cuda PRIVATE ${HOROVOD_INCLUDE_DIRS})
endif()

View file

@ -22,7 +22,7 @@ if(WIN32)
target_compile_options(onnxruntime_training PRIVATE /wd4100)
endif()
target_include_directories(onnxruntime_training PRIVATE ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${eigen_INCLUDE_DIRS} ${RE2_INCLUDE_DIR} PUBLIC ${onnxruntime_graph_header})
target_include_directories(onnxruntime_training PRIVATE ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${eigen_INCLUDE_DIRS} ${RE2_INCLUDE_DIR} PUBLIC ${onnxruntime_graph_header} ${MPI_INCLUDE_DIRS})
if (onnxruntime_USE_CUDA)
target_include_directories(onnxruntime_training PRIVATE ${onnxruntime_CUDNN_HOME}/include ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
@ -135,7 +135,7 @@ if(UNIX AND NOT APPLE)
endif()
onnxruntime_add_include_to_target(onnxruntime_training_bert onnxruntime_common onnx onnx_proto protobuf::libprotobuf onnxruntime_training)
target_include_directories(onnxruntime_training_bert PUBLIC ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${eigen_INCLUDE_DIRS} ${CXXOPTS} ${extra_includes} ${onnxruntime_graph_header} ${onnxruntime_exec_src_dir} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/onnx onnxruntime_training_runner)
target_include_directories(onnxruntime_training_bert PUBLIC ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${MPI_INCLUDE_DIRS} ${eigen_INCLUDE_DIRS} ${CXXOPTS} ${extra_includes} ${onnxruntime_graph_header} ${onnxruntime_exec_src_dir} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/onnx onnxruntime_training_runner)
if (onnxruntime_USE_HOROVOD)
target_include_directories(onnxruntime_training_bert PUBLIC ${HOROVOD_INCLUDE_DIRS})
@ -156,7 +156,7 @@ if(UNIX AND NOT APPLE)
endif()
onnxruntime_add_include_to_target(onnxruntime_training_pipeline_poc onnxruntime_common onnx onnx_proto protobuf::libprotobuf onnxruntime_training)
target_include_directories(onnxruntime_training_pipeline_poc PUBLIC ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${eigen_INCLUDE_DIRS} ${CXXOPTS} ${extra_includes} ${onnxruntime_graph_header} ${onnxruntime_exec_src_dir} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/onnx onnxruntime_training_runner)
target_include_directories(onnxruntime_training_pipeline_poc PUBLIC ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${MPI_INCLUDE_DIRS} ${eigen_INCLUDE_DIRS} ${CXXOPTS} ${extra_includes} ${onnxruntime_graph_header} ${onnxruntime_exec_src_dir} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/onnx onnxruntime_training_runner)
if (onnxruntime_USE_HOROVOD)
target_include_directories(onnxruntime_training_pipeline_poc PUBLIC ${HOROVOD_INCLUDE_DIRS})
@ -175,7 +175,8 @@ if(UNIX AND NOT APPLE)
target_compile_options(onnxruntime_training_gpt2 PUBLIC "-Wno-maybe-uninitialized")
endif()
onnxruntime_add_include_to_target(onnxruntime_training_gpt2 onnxruntime_common onnx onnx_proto protobuf::libprotobuf onnxruntime_training)
target_include_directories(onnxruntime_training_gpt2 PUBLIC ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${eigen_INCLUDE_DIRS} ${CXXOPTS} ${extra_includes} ${onnxruntime_graph_header} ${onnxruntime_exec_src_dir} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/onnx onnxruntime_training_runner)
target_include_directories(onnxruntime_training_gpt2 PUBLIC ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${MPI_INCLUDE_DIRS} ${eigen_INCLUDE_DIRS} ${CXXOPTS} ${extra_includes} ${onnxruntime_graph_header} ${onnxruntime_exec_src_dir} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/onnx onnxruntime_training_runner)
if (onnxruntime_USE_HOROVOD)
target_include_directories(onnxruntime_training_gpt2 PUBLIC ${HOROVOD_INCLUDE_DIRS})
endif()

View file

@ -42,7 +42,7 @@ class CudnnRNN {
if (!cudnn_rnn_desc_)
CUDNN_RETURN_IF_ERROR(cudnnCreateRNNDescriptor(&cudnn_rnn_desc_));
CUDNN_RETURN_IF_ERROR(cudnnSetRNNDescriptor(cudnnHandle,
CUDNN_RETURN_IF_ERROR(cudnnSetRNNDescriptor_v6(cudnnHandle,
cudnn_rnn_desc_,
gsl::narrow_cast<int>(hidden_size),
num_layers,

View file

@ -1,11 +1,14 @@
#include <stdio.h>
#include <stdlib.h>
#include "mpi_setup.h"
namespace onnxruntime {
namespace training {
MPIContext::MPIContext(int w_rank, int l_rank, int w_size, int l_size) : world_rank(w_rank), local_rank(l_rank), world_size(w_size), local_size(l_size) {}
#ifdef USE_HOROVOD
MPIContext setup_horovod() {
using namespace horovod::common;
#if defined(USE_NCCL) || defined(USE_HOROVOD)
MPIContext setup_mpi() {
// setup MPI and horovod
int is_mpi_initialized = 0;
MPI_Initialized(&is_mpi_initialized);
@ -24,7 +27,10 @@ MPIContext setup_horovod() {
MPI_Allgather(&world_rank, 1, MPI_INT, ranks, 1, MPI_INT, MPI_COMM_WORLD);
#ifdef USE_HOROVOD
using namespace horovod::common;
horovod_init(ranks, world_size);
#endif
//Get local rank and size
int local_rank;
@ -46,8 +52,10 @@ MPIContext setup_horovod() {
return MPIContext(world_rank, local_rank, world_size, local_size);
}
void shutdown_horovod() {
void shutdown_mpi() {
#ifdef USE_HOROVOD
horovod::common::horovod_shutdown();
#endif
int is_mpi_initialized = 0;
MPI_Initialized(&is_mpi_initialized);

View file

@ -1,8 +1,11 @@
#pragma once
#if defined(USE_NCCL) || defined(USE_HOROVOD)
#include <mpi.h>
#endif
#ifdef USE_HOROVOD
#include "orttraining/core/graph/horovod_adapters.h"
#include <mpi.h>
#endif
namespace onnxruntime {
@ -16,9 +19,10 @@ struct MPIContext {
int local_size;
};
#ifdef USE_HOROVOD
MPIContext setup_horovod();
void shutdown_horovod();
#if defined(USE_NCCL) || defined(USE_HOROVOD)
MPIContext setup_mpi();
void shutdown_mpi();
#endif
} // namespace training
} // namespace onnxruntime

View file

@ -510,8 +510,8 @@ void setup_training_params(BertParameters& params) {
params.model_with_training_graph_path = model_name_base + ORT_TSTR("_bw.onnx");
params.model_actual_running_graph_path = model_name_base + ORT_TSTR("_bw_running.onnx");
#ifdef USE_HOROVOD
params.mpi_context = setup_horovod();
#if defined(USE_NCCL) || defined(USE_HOROVOD)
params.mpi_context = setup_mpi();
if (params.pipeline_parallel_size > 1) {
auto pipeline_model_name_base = model_name_base + ToPathString(std::to_string(params.mpi_context.world_rank));
@ -519,7 +519,6 @@ void setup_training_params(BertParameters& params) {
params.model_with_training_graph_path = pipeline_model_name_base + ORT_TSTR("_bw.onnx");
params.model_actual_running_graph_path = pipeline_model_name_base + ORT_TSTR("_bw_running.onnx");
}
ORT_ENFORCE(params.horizontal_parallel_size <= params.mpi_context.world_size);
ORT_ENFORCE(params.data_parallel_size <= params.mpi_context.world_size);
if (params.mpi_context.world_size % params.horizontal_parallel_size != 0) {
@ -795,8 +794,8 @@ int main(int argc, char* argv[]) {
RETURN_IF_FAIL(RunTraining(params, *env));
}
#ifdef USE_HOROVOD
shutdown_horovod();
#if defined(USE_NCCL) || defined(USE_HOROVOD)
shutdown_mpi();
#endif
return 0;

View file

@ -291,8 +291,8 @@ void setup_training_params(GPT2Parameters& params) {
{/*prediction_name*/ "output",
/*label_name*/ "labels"});
#ifdef USE_HOROVOD
params.mpi_context = setup_horovod();
#if defined(USE_NCCL) || defined(USE_HOROVOD)
params.mpi_context = setup_mpi();
ORT_ENFORCE(params.horizontal_parallel_size <= params.mpi_context.world_size);
ORT_ENFORCE(params.data_parallel_size <= params.mpi_context.world_size);
if (params.mpi_context.world_size % params.horizontal_parallel_size != 0) {
@ -476,8 +476,8 @@ int main(int argc, char* argv[]) {
RETURN_IF_FAIL(RunTraining(params, *env));
}
#ifdef USE_HOROVOD
shutdown_horovod();
#if defined(USE_NCCL) || defined(USE_HOROVOD)
shutdown_mpi();
#endif
return 0;

View file

@ -74,13 +74,13 @@ TrainingConfigurationResult ConfigureSessionForTraining(
<< data_group_size << std::endl;
parameters.data_parallel_size = data_group_size;
}
#ifdef USE_HOROVOD
#if defined(USE_NCCL) || defined(USE_HOROVOD)
// this condition block is temporary.
// For now, nccl allreduce kernel only implements for allreduce_post_accumulation
// horovod allreduce kernel only implements for not allreduce_post_accumulation.
bool use_nccl = parameters.allreduce_post_accumulation;
if (!use_nccl && parameters.world_size > 1) {
auto mpi_context = training::setup_horovod();
auto mpi_context = training::setup_mpi();
std::cout << "mpi_context.world_rank: " << mpi_context.world_rank << std::endl;
std::cout << "mpi_context.local_rank: " << mpi_context.local_rank << std::endl;
std::cout << "mpi_context.world_size: " << mpi_context.world_size << std::endl;
@ -203,8 +203,8 @@ void addObjectMethodsForTraining(py::module& m) {
return onnxruntime::make_unique<onnxruntime::training::TrainingSession>(GetDefaultCPUSessionOptions(), env);
}))
.def("finalize", [](py::object) {
#ifdef USE_HOROVOD
training::shutdown_horovod();
#if defined(USE_NCCL) || defined(USE_HOROVOD)
training::shutdown_mpi();
#endif
})
.def("load_model", [](onnxruntime::training::TrainingSession* sess, const std::string& path, TrainingParameters& parameters) {

View file

@ -182,7 +182,7 @@ TEST_F(OptimizerGraphBuilderTest, Default_WithGradientAccumulation_WithMixedPrec
TestDefaultOptimizerGraphBuilder(config, graph_);
}
#if defined(USE_HOROVOD) || defined(USE_NCCL)
#if defined(USE_NCCL) || defined(USE_HOROVOD)
static void TestAllreduceOptimizerGraphBuilder(OptimizerGraphConfig config, Graph& graph) {
AllreduceOptimizerGraphBuilder optimizer_graph_builder(
GetOptimizerBuilderRegistry(), config, GetOptInfoMap());

View file

@ -105,6 +105,10 @@ def parse_arguments():
help="Enable the pytorch frontend training tests.")
parser.add_argument(
"--use_horovod", action='store_true', help="Enable Horovod.")
parser.add_argument(
"--mpi_home", help="Path to MPI installation dir")
parser.add_argument(
"--nccl_home", help="Path to NCCL installation dir")
# enable ONNX tests
parser.add_argument(
@ -516,9 +520,9 @@ def setup_test_data(build_dir, configs):
src_model_dir], shell=True)
def generate_build_tree(cmake_path, source_dir, build_dir, cuda_home,
cudnn_home, tensorrt_home, migraphx_home, path_to_protoc_exe, configs,
cmake_extra_defines, args, cmake_extra_args):
def generate_build_tree(cmake_path, source_dir, build_dir, cuda_home, cudnn_home,
mpi_home, nccl_home, tensorrt_home, migraphx_home,
path_to_protoc_exe, configs, cmake_extra_defines, args, cmake_extra_args):
log.info("Generating CMake build tree")
cmake_dir = os.path.join(source_dir, "cmake")
# TODO: fix jemalloc build so it does not conflict with onnxruntime
@ -631,9 +635,15 @@ def generate_build_tree(cmake_path, source_dir, build_dir, cuda_home,
"-Donnxruntime_ENABLE_TRAINING=" + (
"ON" if args.enable_training else "OFF"),
"-Donnxruntime_USE_HOROVOD=" + (
"ON" if args.use_horovod else "OFF"),
"ON" if args.use_horovod else "OFF")
]
if mpi_home and os.path.exists(mpi_home):
cmake_args += ["-Donnxruntime_MPI_HOME=" + mpi_home]
if nccl_home and os.path.exists(nccl_home):
cmake_args += ["-Donnxruntime_NCCL_HOME=" + nccl_home]
if args.winml_root_namespace_override:
cmake_args += ["-Donnxruntime_WINML_NAMESPACE_OVERRIDE="
+ args.winml_root_namespace_override]
@ -641,10 +651,7 @@ def generate_build_tree(cmake_path, source_dir, build_dir, cuda_home,
# temp turn on only for linux gpu build
if not is_windows():
if args.use_cuda:
if "-Donnxruntime_USE_HOROVOD=OFF" in cmake_args:
cmake_args.remove("-Donnxruntime_USE_HOROVOD=OFF")
cmake_args += [
"-Donnxruntime_USE_HOROVOD=ON",
"-Donnxruntime_USE_FULL_PROTOBUF=ON"]
# nGraph, TensorRT and OpenVINO providers currently only supports
@ -1613,6 +1620,9 @@ def main():
# if using cuda, setup cuda paths and env vars
cuda_home, cudnn_home = setup_cuda_vars(args)
mpi_home = args.mpi_home
nccl_home = args.nccl_home
# if using tensorrt, setup tensorrt paths
tensorrt_home = setup_tensorrt_vars(args)
@ -1702,7 +1712,7 @@ def main():
if args.enable_onnx_tests:
setup_test_data(build_dir, configs)
generate_build_tree(
cmake_path, source_dir, build_dir, cuda_home, cudnn_home,
cmake_path, source_dir, build_dir, cuda_home, cudnn_home, mpi_home, nccl_home,
tensorrt_home, migraphx_home, path_to_protoc_exe, configs, cmake_extra_defines,
args, cmake_extra_args)