mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-14 20:48:00 +00:00
Enable ORT with CUDA 11 toolkit (#4168)
* ORT on CUDA 11
1. Separate HOROVOD and MPI
2. Separate NCCL from HOROVOD in CMakeLists.txt
3. Remove dependency on external cub
4. cudnnSetRNNDescriptor is changed in cuDNN 8.0
* polish the code about MPI/NCCL in CMakeLists.txt and build.py
* check CUDA version
* ${MPI_INCLUDE_DIRS} should be PUBLIC
* sm30, sm50 are deprecated in CUDA 11 Toolkit
* update change based on code review feedback.
* add sm_52
* improve MPI/NCCL build path
Co-authored-by: Weixing Zhang <wezhan@microsoft.com>
This commit is contained in:
parent
88a9cceb41
commit
b4b1c6440a
12 changed files with 178 additions and 53 deletions
|
|
@ -108,6 +108,7 @@ option(onnxruntime_ENABLE_NVTX_PROFILE "Enable NVTX profile." OFF)
|
|||
option(onnxruntime_ENABLE_TRAINING "Enable training functionality." OFF)
|
||||
option(onnxruntime_ENABLE_TRAINING_E2E_TESTS "Enable training end-to-end tests." OFF)
|
||||
option(onnxruntime_USE_HOROVOD "Build with HOROVOD support" OFF)
|
||||
option(onnxruntime_USE_NCCL "Build with NCCL support" ON)
|
||||
|
||||
if (onnxruntime_ENABLE_NVTX_PROFILE)
|
||||
add_definitions(-DENABLE_NVTX_PROFILE=1)
|
||||
|
|
@ -824,7 +825,13 @@ if (onnxruntime_USE_CUDA)
|
|||
string(APPEND CMAKE_CUDA_FLAGS "-cudart shared")
|
||||
endif()
|
||||
enable_language(CUDA)
|
||||
set(CMAKE_CUDA_STANDARD 11)
|
||||
string(REGEX REPLACE "([0-9]+)\\.([0-9]+).*" "\\1" CUDA_VERSION_MAJOR "${CMAKE_CUDA_COMPILER_VERSION}")
|
||||
message( STATUS "CUDA_VERSION_MAJOR: ${CUDA_VERSION_MAJOR}")
|
||||
if (CUDA_VERSION_MAJOR EQUAL 11)
|
||||
set(CMAKE_CUDA_STANDARD 14)
|
||||
else()
|
||||
set(CMAKE_CUDA_STANDARD 11)
|
||||
endif()
|
||||
file(TO_CMAKE_PATH ${onnxruntime_CUDNN_HOME} onnxruntime_CUDNN_HOME)
|
||||
set(ONNXRUNTIME_CUDA_LIBRARIES ${CUDA_LIBRARIES})
|
||||
|
||||
|
|
@ -850,8 +857,12 @@ if (onnxruntime_USE_CUDA)
|
|||
endif()
|
||||
list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${ONNXRUNTIME_CUDA_LIBRARIES})
|
||||
|
||||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_30,code=sm_30") # K series
|
||||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_50,code=sm_50") # M series
|
||||
# the following compute capabilities are deprecated in CUDA 11 Toolkit
|
||||
if (CUDA_VERSION_MAJOR LESS 11)
|
||||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_30,code=sm_30") # K series
|
||||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_50,code=sm_50") # M series
|
||||
endif()
|
||||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_52,code=sm_52") # M60
|
||||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_60,code=sm_60") # P series
|
||||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_70,code=sm_70") # V series
|
||||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --default-stream legacy")
|
||||
|
|
@ -918,28 +929,108 @@ endif()
|
|||
|
||||
if (onnxruntime_ENABLE_TRAINING)
|
||||
add_compile_definitions(ENABLE_TRAINING)
|
||||
if (onnxruntime_USE_HOROVOD)
|
||||
|
||||
if (UNIX)
|
||||
# Find MPI
|
||||
find_path(MPI_INCLUDE_DIR
|
||||
NAMES mpi.h
|
||||
HINTS
|
||||
${onnxruntime_MPI_HOME}/include
|
||||
/bert_ort/openmpi/include)
|
||||
|
||||
set(MPI_LIBNAME "mpi")
|
||||
|
||||
find_library(MPI_LIBRARY
|
||||
NAMES ${MPI_LIBNAME}
|
||||
HINTS
|
||||
${onnxruntime_MPI_HOME}/lib
|
||||
/bert_ort/openmpi/lib)
|
||||
|
||||
include(FindPackageHandleStandardArgs)
|
||||
find_package_handle_standard_args(MPI DEFAULT_MSG MPI_INCLUDE_DIR MPI_LIBRARY)
|
||||
|
||||
if (MPI_FOUND)
|
||||
execute_process(COMMAND mpirun --version OUTPUT_VARIABLE MPIRUN_OUTPUT)
|
||||
string( REGEX MATCH "[0-9]+.[0-9]+.[0-9]" MPI_VERSION ${MPIRUN_OUTPUT})
|
||||
message( STATUS "MPI Version: ${MPI_VERSION}")
|
||||
|
||||
set(MPI_INCLUDE_DIRS ${MPI_INCLUDE_DIR})
|
||||
set(MPI_LIBRARIES ${MPI_LIBRARY})
|
||||
message( STATUS "MPI (include: ${MPI_INCLUDE_DIRS}, library: ${MPI_LIBRARIES})" )
|
||||
mark_as_advanced(MPI_INCLUDE_DIRS MPI_LIBRARIES)
|
||||
|
||||
list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${MPI_LIBRARIES} ${MPI_CXX_LINK_FLAGS})
|
||||
else ()
|
||||
set(onnxruntime_USE_NCCL OFF)
|
||||
set(onnxruntime_USE_HOROVOD OFF)
|
||||
message( WARNING "MPI is not found. Please use --mpi_home to specify the path of MPI. Otherwise, NCCL or HOROVOD will be disabled." )
|
||||
endif()
|
||||
|
||||
# Find NCCL and MPI
|
||||
if (onnxruntime_USE_NCCL AND MPI_FOUND)
|
||||
find_path(NCCL_INCLUDE_DIR
|
||||
NAMES nccl.h
|
||||
HINTS
|
||||
${onnxruntime_NCCL_HOME}/include
|
||||
$ENV{CUDA_ROOT}/include)
|
||||
|
||||
set(NCCL_LIBNAME "nccl")
|
||||
|
||||
find_library(NCCL_LIBRARY
|
||||
NAMES ${NCCL_LIBNAME}
|
||||
HINTS
|
||||
${onnxruntime_NCCL_HOME}/lib/x86_64-linux-gnu
|
||||
$ENV{CUDA_ROOT}/lib64)
|
||||
|
||||
include(FindPackageHandleStandardArgs)
|
||||
find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIR NCCL_LIBRARY)
|
||||
|
||||
if (NCCL_FOUND)
|
||||
set(NCCL_HEADER_FILE "${NCCL_INCLUDE_DIR}/nccl.h")
|
||||
message( STATUS "Determining NCCL version from the header file: ${NCCL_HEADER_FILE}" )
|
||||
file (STRINGS ${NCCL_HEADER_FILE} NCCL_MAJOR_VERSION_DEFINED
|
||||
REGEX "^[ \t]*#define[ \t]+NCCL_MAJOR[ \t]+[0-9]+.*$" LIMIT_COUNT 1)
|
||||
if (NCCL_MAJOR_VERSION_DEFINED)
|
||||
string (REGEX REPLACE "^[ \t]*#define[ \t]+NCCL_MAJOR[ \t]+" ""
|
||||
NCCL_MAJOR_VERSION ${NCCL_MAJOR_VERSION_DEFINED})
|
||||
message( STATUS "NCCL_MAJOR_VERSION: ${NCCL_MAJOR_VERSION}" )
|
||||
endif()
|
||||
file (STRINGS ${NCCL_HEADER_FILE} NCCL_MINOR_VERSION_DEFINED
|
||||
REGEX "^[ \t]*#define[ \t]+NCCL_MINOR[ \t]+[0-9]+.*$" LIMIT_COUNT 1)
|
||||
if (NCCL_MINOR_VERSION_DEFINED)
|
||||
string (REGEX REPLACE "^[ \t]*#define[ \t]+NCCL_MINOR[ \t]+" ""
|
||||
NCCL_MINOR_VERSION ${NCCL_MINOR_VERSION_DEFINED})
|
||||
message(STATUS "NCCL_MINOR_VERSION: ${NCCL_MINOR_VERSION}")
|
||||
endif()
|
||||
|
||||
set(NCCL_INCLUDE_DIRS ${NCCL_INCLUDE_DIR})
|
||||
set(NCCL_LIBRARIES ${NCCL_LIBRARY})
|
||||
message( STATUS "NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})" )
|
||||
mark_as_advanced(NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
|
||||
|
||||
list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${NCCL_LIBRARIES})
|
||||
|
||||
add_definitions(-DUSE_NCCL=1)
|
||||
message( STATUS "NCCL is enabled in Linux GPU Build." )
|
||||
else ()
|
||||
set(onnxruntime_USE_NCCL OFF)
|
||||
message( WARNING "NCCL is not found. Please use --nccl_home to specify the path of NCCL. Otherwise, NCCL is disabled." )
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (onnxruntime_USE_HOROVOD AND MPI_FOUND)
|
||||
if (WIN32)
|
||||
message( FATAL_ERROR "Horovod is not supported on Windows." )
|
||||
elseif (UNIX)
|
||||
find_package(MPI REQUIRED)
|
||||
add_definitions(-DUSE_HOROVOD=1)
|
||||
set(HOROVOD_ROOT ${PROJECT_SOURCE_DIR}/external/horovod)
|
||||
set(HOROVOD_INCLUDE_DIRS "${HOROVOD_ROOT}/horovod/common" ${MPI_CXX_INCLUDE_PATH})
|
||||
set(HOROVOD_INCLUDE_DIRS "${HOROVOD_ROOT}/horovod/common")
|
||||
add_subdirectory(horovod EXCLUDE_FROM_ALL)
|
||||
# use external/horovod/third_party/gloo/cmake/Modules/Findnccl.cmake to locate nccl lib path
|
||||
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/external/horovod/third_party/gloo/cmake/Modules/)
|
||||
find_package(nccl REQUIRED)
|
||||
list(APPEND onnxruntime_EXTERNAL_LIBRARIES horovod ${NCCL_LIBRARIES} ${MPI_CXX_LIBRARIES} ${MPI_CXX_LINK_FLAGS})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (onnxruntime_USE_CUDA)
|
||||
if (WIN32)
|
||||
message(WARNING "NCCL is not supported on Windows GPU Build." )
|
||||
elseif (UNIX)
|
||||
message( "NCCL is enabled in Linux GPU Build." )
|
||||
add_definitions(-DUSE_NCCL=1)
|
||||
list(APPEND onnxruntime_EXTERNAL_LIBRARIES horovod)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
|
|
|||
|
|
@ -61,6 +61,7 @@ target_include_directories(onnxruntime_graph PRIVATE ${ONNXRUNTIME_ROOT})
|
|||
|
||||
if (onnxruntime_ENABLE_TRAINING)
|
||||
target_include_directories(onnxruntime_graph PRIVATE ${ORTTRAINING_ROOT})
|
||||
|
||||
if (onnxruntime_USE_HOROVOD)
|
||||
target_include_directories(onnxruntime_graph PRIVATE ${HOROVOD_INCLUDE_DIRS})
|
||||
endif()
|
||||
|
|
|
|||
|
|
@ -205,6 +205,12 @@ if (onnxruntime_USE_CUDA)
|
|||
"${ORTTRAINING_SOURCE_DIR}/training_ops/cuda/collective/nccl_kernels.cc"
|
||||
"${ORTTRAINING_SOURCE_DIR}/training_ops/cuda/collective/megatron.cc"
|
||||
)
|
||||
elseif (NOT onnxruntime_USE_NCCL)
|
||||
list(REMOVE_ITEM onnxruntime_cuda_training_ops_cc_srcs
|
||||
"${ORTTRAINING_SOURCE_DIR}/training_ops/cuda/collective/nccl_common.cc"
|
||||
"${ORTTRAINING_SOURCE_DIR}/training_ops/cuda/collective/nccl_kernels.cc"
|
||||
"${ORTTRAINING_SOURCE_DIR}/training_ops/cuda/collective/megatron.cc"
|
||||
)
|
||||
endif()
|
||||
|
||||
source_group(TREE ${ORTTRAINING_ROOT} FILES ${onnxruntime_cuda_training_ops_cc_srcs} ${onnxruntime_cuda_training_ops_cu_srcs})
|
||||
|
|
@ -227,13 +233,18 @@ if (onnxruntime_USE_CUDA)
|
|||
target_link_libraries(onnxruntime_providers_cuda PRIVATE onnxruntime_training)
|
||||
endif()
|
||||
add_dependencies(onnxruntime_providers_cuda ${onnxruntime_EXTERNAL_DEPENDENCIES} ${onnxruntime_tvm_dependencies})
|
||||
target_include_directories(onnxruntime_providers_cuda PRIVATE ${ONNXRUNTIME_ROOT} ${PROJECT_SOURCE_DIR}/external/cub ${onnxruntime_CUDNN_HOME}/include ${eigen_INCLUDE_DIRS} ${TVM_INCLUDES} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
|
||||
target_include_directories(onnxruntime_providers_cuda PRIVATE ${ONNXRUNTIME_ROOT} ${onnxruntime_CUDNN_HOME}/include ${eigen_INCLUDE_DIRS} ${TVM_INCLUDES} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
|
||||
install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/cuda DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers)
|
||||
set_target_properties(onnxruntime_providers_cuda PROPERTIES LINKER_LANGUAGE CUDA)
|
||||
set_target_properties(onnxruntime_providers_cuda PROPERTIES FOLDER "ONNXRuntime")
|
||||
|
||||
if (CUDA_VERSION_MAJOR LESS 11)
|
||||
target_include_directories(onnxruntime_providers_cuda PRIVATE ${PROJECT_SOURCE_DIR}/external/cub)
|
||||
endif()
|
||||
|
||||
if (onnxruntime_ENABLE_TRAINING)
|
||||
target_include_directories(onnxruntime_providers_cuda PRIVATE ${ORTTRAINING_ROOT})
|
||||
target_include_directories(onnxruntime_providers_cuda PRIVATE ${ORTTRAINING_ROOT} ${MPI_INCLUDE_DIRS})
|
||||
|
||||
if (onnxruntime_USE_HOROVOD)
|
||||
target_include_directories(onnxruntime_providers_cuda PRIVATE ${HOROVOD_INCLUDE_DIRS})
|
||||
endif()
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ if(WIN32)
|
|||
target_compile_options(onnxruntime_training PRIVATE /wd4100)
|
||||
endif()
|
||||
|
||||
target_include_directories(onnxruntime_training PRIVATE ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${eigen_INCLUDE_DIRS} ${RE2_INCLUDE_DIR} PUBLIC ${onnxruntime_graph_header})
|
||||
target_include_directories(onnxruntime_training PRIVATE ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${eigen_INCLUDE_DIRS} ${RE2_INCLUDE_DIR} PUBLIC ${onnxruntime_graph_header} ${MPI_INCLUDE_DIRS})
|
||||
|
||||
if (onnxruntime_USE_CUDA)
|
||||
target_include_directories(onnxruntime_training PRIVATE ${onnxruntime_CUDNN_HOME}/include ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
|
||||
|
|
@ -135,7 +135,7 @@ if(UNIX AND NOT APPLE)
|
|||
endif()
|
||||
|
||||
onnxruntime_add_include_to_target(onnxruntime_training_bert onnxruntime_common onnx onnx_proto protobuf::libprotobuf onnxruntime_training)
|
||||
target_include_directories(onnxruntime_training_bert PUBLIC ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${eigen_INCLUDE_DIRS} ${CXXOPTS} ${extra_includes} ${onnxruntime_graph_header} ${onnxruntime_exec_src_dir} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/onnx onnxruntime_training_runner)
|
||||
target_include_directories(onnxruntime_training_bert PUBLIC ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${MPI_INCLUDE_DIRS} ${eigen_INCLUDE_DIRS} ${CXXOPTS} ${extra_includes} ${onnxruntime_graph_header} ${onnxruntime_exec_src_dir} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/onnx onnxruntime_training_runner)
|
||||
|
||||
if (onnxruntime_USE_HOROVOD)
|
||||
target_include_directories(onnxruntime_training_bert PUBLIC ${HOROVOD_INCLUDE_DIRS})
|
||||
|
|
@ -156,7 +156,7 @@ if(UNIX AND NOT APPLE)
|
|||
endif()
|
||||
|
||||
onnxruntime_add_include_to_target(onnxruntime_training_pipeline_poc onnxruntime_common onnx onnx_proto protobuf::libprotobuf onnxruntime_training)
|
||||
target_include_directories(onnxruntime_training_pipeline_poc PUBLIC ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${eigen_INCLUDE_DIRS} ${CXXOPTS} ${extra_includes} ${onnxruntime_graph_header} ${onnxruntime_exec_src_dir} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/onnx onnxruntime_training_runner)
|
||||
target_include_directories(onnxruntime_training_pipeline_poc PUBLIC ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${MPI_INCLUDE_DIRS} ${eigen_INCLUDE_DIRS} ${CXXOPTS} ${extra_includes} ${onnxruntime_graph_header} ${onnxruntime_exec_src_dir} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/onnx onnxruntime_training_runner)
|
||||
|
||||
if (onnxruntime_USE_HOROVOD)
|
||||
target_include_directories(onnxruntime_training_pipeline_poc PUBLIC ${HOROVOD_INCLUDE_DIRS})
|
||||
|
|
@ -175,7 +175,8 @@ if(UNIX AND NOT APPLE)
|
|||
target_compile_options(onnxruntime_training_gpt2 PUBLIC "-Wno-maybe-uninitialized")
|
||||
endif()
|
||||
onnxruntime_add_include_to_target(onnxruntime_training_gpt2 onnxruntime_common onnx onnx_proto protobuf::libprotobuf onnxruntime_training)
|
||||
target_include_directories(onnxruntime_training_gpt2 PUBLIC ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${eigen_INCLUDE_DIRS} ${CXXOPTS} ${extra_includes} ${onnxruntime_graph_header} ${onnxruntime_exec_src_dir} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/onnx onnxruntime_training_runner)
|
||||
target_include_directories(onnxruntime_training_gpt2 PUBLIC ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${MPI_INCLUDE_DIRS} ${eigen_INCLUDE_DIRS} ${CXXOPTS} ${extra_includes} ${onnxruntime_graph_header} ${onnxruntime_exec_src_dir} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/onnx onnxruntime_training_runner)
|
||||
|
||||
if (onnxruntime_USE_HOROVOD)
|
||||
target_include_directories(onnxruntime_training_gpt2 PUBLIC ${HOROVOD_INCLUDE_DIRS})
|
||||
endif()
|
||||
|
|
|
|||
|
|
@ -42,7 +42,7 @@ class CudnnRNN {
|
|||
if (!cudnn_rnn_desc_)
|
||||
CUDNN_RETURN_IF_ERROR(cudnnCreateRNNDescriptor(&cudnn_rnn_desc_));
|
||||
|
||||
CUDNN_RETURN_IF_ERROR(cudnnSetRNNDescriptor(cudnnHandle,
|
||||
CUDNN_RETURN_IF_ERROR(cudnnSetRNNDescriptor_v6(cudnnHandle,
|
||||
cudnn_rnn_desc_,
|
||||
gsl::narrow_cast<int>(hidden_size),
|
||||
num_layers,
|
||||
|
|
|
|||
|
|
@ -1,11 +1,14 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "mpi_setup.h"
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace training {
|
||||
MPIContext::MPIContext(int w_rank, int l_rank, int w_size, int l_size) : world_rank(w_rank), local_rank(l_rank), world_size(w_size), local_size(l_size) {}
|
||||
#ifdef USE_HOROVOD
|
||||
MPIContext setup_horovod() {
|
||||
using namespace horovod::common;
|
||||
|
||||
#if defined(USE_NCCL) || defined(USE_HOROVOD)
|
||||
MPIContext setup_mpi() {
|
||||
// setup MPI and horovod
|
||||
int is_mpi_initialized = 0;
|
||||
MPI_Initialized(&is_mpi_initialized);
|
||||
|
|
@ -24,7 +27,10 @@ MPIContext setup_horovod() {
|
|||
|
||||
MPI_Allgather(&world_rank, 1, MPI_INT, ranks, 1, MPI_INT, MPI_COMM_WORLD);
|
||||
|
||||
#ifdef USE_HOROVOD
|
||||
using namespace horovod::common;
|
||||
horovod_init(ranks, world_size);
|
||||
#endif
|
||||
|
||||
//Get local rank and size
|
||||
int local_rank;
|
||||
|
|
@ -46,8 +52,10 @@ MPIContext setup_horovod() {
|
|||
return MPIContext(world_rank, local_rank, world_size, local_size);
|
||||
}
|
||||
|
||||
void shutdown_horovod() {
|
||||
void shutdown_mpi() {
|
||||
#ifdef USE_HOROVOD
|
||||
horovod::common::horovod_shutdown();
|
||||
#endif
|
||||
|
||||
int is_mpi_initialized = 0;
|
||||
MPI_Initialized(&is_mpi_initialized);
|
||||
|
|
|
|||
|
|
@ -1,8 +1,11 @@
|
|||
#pragma once
|
||||
|
||||
#if defined(USE_NCCL) || defined(USE_HOROVOD)
|
||||
#include <mpi.h>
|
||||
#endif
|
||||
|
||||
#ifdef USE_HOROVOD
|
||||
#include "orttraining/core/graph/horovod_adapters.h"
|
||||
#include <mpi.h>
|
||||
#endif
|
||||
|
||||
namespace onnxruntime {
|
||||
|
|
@ -16,9 +19,10 @@ struct MPIContext {
|
|||
int local_size;
|
||||
};
|
||||
|
||||
#ifdef USE_HOROVOD
|
||||
MPIContext setup_horovod();
|
||||
void shutdown_horovod();
|
||||
#if defined(USE_NCCL) || defined(USE_HOROVOD)
|
||||
MPIContext setup_mpi();
|
||||
void shutdown_mpi();
|
||||
#endif
|
||||
|
||||
} // namespace training
|
||||
} // namespace onnxruntime
|
||||
|
|
|
|||
|
|
@ -510,8 +510,8 @@ void setup_training_params(BertParameters& params) {
|
|||
params.model_with_training_graph_path = model_name_base + ORT_TSTR("_bw.onnx");
|
||||
params.model_actual_running_graph_path = model_name_base + ORT_TSTR("_bw_running.onnx");
|
||||
|
||||
#ifdef USE_HOROVOD
|
||||
params.mpi_context = setup_horovod();
|
||||
#if defined(USE_NCCL) || defined(USE_HOROVOD)
|
||||
params.mpi_context = setup_mpi();
|
||||
|
||||
if (params.pipeline_parallel_size > 1) {
|
||||
auto pipeline_model_name_base = model_name_base + ToPathString(std::to_string(params.mpi_context.world_rank));
|
||||
|
|
@ -519,7 +519,6 @@ void setup_training_params(BertParameters& params) {
|
|||
params.model_with_training_graph_path = pipeline_model_name_base + ORT_TSTR("_bw.onnx");
|
||||
params.model_actual_running_graph_path = pipeline_model_name_base + ORT_TSTR("_bw_running.onnx");
|
||||
}
|
||||
|
||||
ORT_ENFORCE(params.horizontal_parallel_size <= params.mpi_context.world_size);
|
||||
ORT_ENFORCE(params.data_parallel_size <= params.mpi_context.world_size);
|
||||
if (params.mpi_context.world_size % params.horizontal_parallel_size != 0) {
|
||||
|
|
@ -795,8 +794,8 @@ int main(int argc, char* argv[]) {
|
|||
RETURN_IF_FAIL(RunTraining(params, *env));
|
||||
}
|
||||
|
||||
#ifdef USE_HOROVOD
|
||||
shutdown_horovod();
|
||||
#if defined(USE_NCCL) || defined(USE_HOROVOD)
|
||||
shutdown_mpi();
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
|
|
|
|||
|
|
@ -291,8 +291,8 @@ void setup_training_params(GPT2Parameters& params) {
|
|||
{/*prediction_name*/ "output",
|
||||
/*label_name*/ "labels"});
|
||||
|
||||
#ifdef USE_HOROVOD
|
||||
params.mpi_context = setup_horovod();
|
||||
#if defined(USE_NCCL) || defined(USE_HOROVOD)
|
||||
params.mpi_context = setup_mpi();
|
||||
ORT_ENFORCE(params.horizontal_parallel_size <= params.mpi_context.world_size);
|
||||
ORT_ENFORCE(params.data_parallel_size <= params.mpi_context.world_size);
|
||||
if (params.mpi_context.world_size % params.horizontal_parallel_size != 0) {
|
||||
|
|
@ -476,8 +476,8 @@ int main(int argc, char* argv[]) {
|
|||
RETURN_IF_FAIL(RunTraining(params, *env));
|
||||
}
|
||||
|
||||
#ifdef USE_HOROVOD
|
||||
shutdown_horovod();
|
||||
#if defined(USE_NCCL) || defined(USE_HOROVOD)
|
||||
shutdown_mpi();
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
|
|
|
|||
|
|
@ -74,13 +74,13 @@ TrainingConfigurationResult ConfigureSessionForTraining(
|
|||
<< data_group_size << std::endl;
|
||||
parameters.data_parallel_size = data_group_size;
|
||||
}
|
||||
#ifdef USE_HOROVOD
|
||||
#if defined(USE_NCCL) || defined(USE_HOROVOD)
|
||||
// this condition block is temporary.
|
||||
// For now, nccl allreduce kernel only implements for allreduce_post_accumulation
|
||||
// horovod allreduce kernel only implements for not allreduce_post_accumulation.
|
||||
bool use_nccl = parameters.allreduce_post_accumulation;
|
||||
if (!use_nccl && parameters.world_size > 1) {
|
||||
auto mpi_context = training::setup_horovod();
|
||||
auto mpi_context = training::setup_mpi();
|
||||
std::cout << "mpi_context.world_rank: " << mpi_context.world_rank << std::endl;
|
||||
std::cout << "mpi_context.local_rank: " << mpi_context.local_rank << std::endl;
|
||||
std::cout << "mpi_context.world_size: " << mpi_context.world_size << std::endl;
|
||||
|
|
@ -203,8 +203,8 @@ void addObjectMethodsForTraining(py::module& m) {
|
|||
return onnxruntime::make_unique<onnxruntime::training::TrainingSession>(GetDefaultCPUSessionOptions(), env);
|
||||
}))
|
||||
.def("finalize", [](py::object) {
|
||||
#ifdef USE_HOROVOD
|
||||
training::shutdown_horovod();
|
||||
#if defined(USE_NCCL) || defined(USE_HOROVOD)
|
||||
training::shutdown_mpi();
|
||||
#endif
|
||||
})
|
||||
.def("load_model", [](onnxruntime::training::TrainingSession* sess, const std::string& path, TrainingParameters& parameters) {
|
||||
|
|
|
|||
|
|
@ -182,7 +182,7 @@ TEST_F(OptimizerGraphBuilderTest, Default_WithGradientAccumulation_WithMixedPrec
|
|||
TestDefaultOptimizerGraphBuilder(config, graph_);
|
||||
}
|
||||
|
||||
#if defined(USE_HOROVOD) || defined(USE_NCCL)
|
||||
#if defined(USE_NCCL) || defined(USE_HOROVOD)
|
||||
static void TestAllreduceOptimizerGraphBuilder(OptimizerGraphConfig config, Graph& graph) {
|
||||
AllreduceOptimizerGraphBuilder optimizer_graph_builder(
|
||||
GetOptimizerBuilderRegistry(), config, GetOptInfoMap());
|
||||
|
|
|
|||
|
|
@ -105,6 +105,10 @@ def parse_arguments():
|
|||
help="Enable the pytorch frontend training tests.")
|
||||
parser.add_argument(
|
||||
"--use_horovod", action='store_true', help="Enable Horovod.")
|
||||
parser.add_argument(
|
||||
"--mpi_home", help="Path to MPI installation dir")
|
||||
parser.add_argument(
|
||||
"--nccl_home", help="Path to NCCL installation dir")
|
||||
|
||||
# enable ONNX tests
|
||||
parser.add_argument(
|
||||
|
|
@ -516,9 +520,9 @@ def setup_test_data(build_dir, configs):
|
|||
src_model_dir], shell=True)
|
||||
|
||||
|
||||
def generate_build_tree(cmake_path, source_dir, build_dir, cuda_home,
|
||||
cudnn_home, tensorrt_home, migraphx_home, path_to_protoc_exe, configs,
|
||||
cmake_extra_defines, args, cmake_extra_args):
|
||||
def generate_build_tree(cmake_path, source_dir, build_dir, cuda_home, cudnn_home,
|
||||
mpi_home, nccl_home, tensorrt_home, migraphx_home,
|
||||
path_to_protoc_exe, configs, cmake_extra_defines, args, cmake_extra_args):
|
||||
log.info("Generating CMake build tree")
|
||||
cmake_dir = os.path.join(source_dir, "cmake")
|
||||
# TODO: fix jemalloc build so it does not conflict with onnxruntime
|
||||
|
|
@ -631,9 +635,15 @@ def generate_build_tree(cmake_path, source_dir, build_dir, cuda_home,
|
|||
"-Donnxruntime_ENABLE_TRAINING=" + (
|
||||
"ON" if args.enable_training else "OFF"),
|
||||
"-Donnxruntime_USE_HOROVOD=" + (
|
||||
"ON" if args.use_horovod else "OFF"),
|
||||
"ON" if args.use_horovod else "OFF")
|
||||
]
|
||||
|
||||
if mpi_home and os.path.exists(mpi_home):
|
||||
cmake_args += ["-Donnxruntime_MPI_HOME=" + mpi_home]
|
||||
|
||||
if nccl_home and os.path.exists(nccl_home):
|
||||
cmake_args += ["-Donnxruntime_NCCL_HOME=" + nccl_home]
|
||||
|
||||
if args.winml_root_namespace_override:
|
||||
cmake_args += ["-Donnxruntime_WINML_NAMESPACE_OVERRIDE="
|
||||
+ args.winml_root_namespace_override]
|
||||
|
|
@ -641,10 +651,7 @@ def generate_build_tree(cmake_path, source_dir, build_dir, cuda_home,
|
|||
# temp turn on only for linux gpu build
|
||||
if not is_windows():
|
||||
if args.use_cuda:
|
||||
if "-Donnxruntime_USE_HOROVOD=OFF" in cmake_args:
|
||||
cmake_args.remove("-Donnxruntime_USE_HOROVOD=OFF")
|
||||
cmake_args += [
|
||||
"-Donnxruntime_USE_HOROVOD=ON",
|
||||
"-Donnxruntime_USE_FULL_PROTOBUF=ON"]
|
||||
|
||||
# nGraph, TensorRT and OpenVINO providers currently only supports
|
||||
|
|
@ -1613,6 +1620,9 @@ def main():
|
|||
# if using cuda, setup cuda paths and env vars
|
||||
cuda_home, cudnn_home = setup_cuda_vars(args)
|
||||
|
||||
mpi_home = args.mpi_home
|
||||
nccl_home = args.nccl_home
|
||||
|
||||
# if using tensorrt, setup tensorrt paths
|
||||
tensorrt_home = setup_tensorrt_vars(args)
|
||||
|
||||
|
|
@ -1702,7 +1712,7 @@ def main():
|
|||
if args.enable_onnx_tests:
|
||||
setup_test_data(build_dir, configs)
|
||||
generate_build_tree(
|
||||
cmake_path, source_dir, build_dir, cuda_home, cudnn_home,
|
||||
cmake_path, source_dir, build_dir, cuda_home, cudnn_home, mpi_home, nccl_home,
|
||||
tensorrt_home, migraphx_home, path_to_protoc_exe, configs, cmake_extra_defines,
|
||||
args, cmake_extra_args)
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue