mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-30 03:37:44 +00:00
[ROCm] enable hipGraph (#18382)
This ports the cudaGraph support from the CUDA EP to the ROCM EP's hipGraph.
This commit is contained in:
parent
37d14d7896
commit
b2aec41a83
10 changed files with 241 additions and 42 deletions
|
|
@ -1277,6 +1277,9 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
|
|||
if (onnxruntime_USE_CUDA)
|
||||
list(APPEND onnxruntime_shared_lib_test_LIBS cudart)
|
||||
endif()
|
||||
if (onnxruntime_USE_ROCM)
|
||||
list(APPEND onnxruntime_shared_lib_test_LIBS hip::host)
|
||||
endif()
|
||||
if (onnxruntime_USE_TENSORRT)
|
||||
list(APPEND onnxruntime_shared_lib_test_LIBS ${TENSORRT_LIBRARY_INFER})
|
||||
endif()
|
||||
|
|
@ -1294,6 +1297,10 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
|
|||
target_include_directories(onnxruntime_shared_lib_test PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
|
||||
target_sources(onnxruntime_shared_lib_test PRIVATE ${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/cuda_ops.cu)
|
||||
endif()
|
||||
if (onnxruntime_USE_ROCM)
|
||||
target_include_directories(onnxruntime_shared_lib_test PRIVATE ${onnxruntime_ROCM_HOME}/include)
|
||||
target_compile_definitions(onnxruntime_shared_lib_test PRIVATE __HIP_PLATFORM_AMD__)
|
||||
endif()
|
||||
if (CMAKE_SYSTEM_NAME STREQUAL "Android")
|
||||
target_sources(onnxruntime_shared_lib_test PRIVATE
|
||||
"${ONNXRUNTIME_ROOT}/core/platform/android/cxa_demangle.cc"
|
||||
|
|
|
|||
|
|
@ -496,6 +496,7 @@ typedef struct OrtROCMProviderOptions {
|
|||
has_user_compute_stream{},
|
||||
user_compute_stream{},
|
||||
default_memory_arena_cfg{},
|
||||
enable_hip_graph{false},
|
||||
tunable_op_enable{false},
|
||||
tunable_op_tuning_enable{false},
|
||||
tunable_op_max_tuning_duration_ms{} {}
|
||||
|
|
@ -548,6 +549,8 @@ typedef struct OrtROCMProviderOptions {
|
|||
*/
|
||||
OrtArenaCfg* default_memory_arena_cfg;
|
||||
|
||||
int enable_hip_graph;
|
||||
|
||||
/** \brief Enable TunableOp for using.
|
||||
* Set it to 1/0 to enable/disable TunableOp. Otherwise, it is disabled by default.
|
||||
* This option can be overriden by environment variable ORT_ROCM_TUNABLE_OP_ENABLE.
|
||||
|
|
|
|||
|
|
@ -170,6 +170,8 @@ ROCMExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId de
|
|||
|
||||
MIOPEN_CALL_THROW(miopenCreate(&miopen_handle_));
|
||||
MIOPEN_CALL_THROW(miopenSetStream(miopen_handle_, stream));
|
||||
|
||||
hip_graph_.SetStream(stream);
|
||||
}
|
||||
|
||||
ROCMExecutionProvider::PerThreadContext::~PerThreadContext() {
|
||||
|
|
@ -177,6 +179,33 @@ ROCMExecutionProvider::PerThreadContext::~PerThreadContext() {
|
|||
ORT_IGNORE_RETURN_VALUE(MIOPEN_CALL(miopenDestroy(miopen_handle_)));
|
||||
}
|
||||
|
||||
bool ROCMExecutionProvider::PerThreadContext::IsGraphCaptureAllowed() const {
|
||||
return regular_run_count_before_graph_capture_ >= min_num_runs_before_hip_graph_capture_;
|
||||
}
|
||||
|
||||
void ROCMExecutionProvider::PerThreadContext::CaptureBegin() {
|
||||
hip_graph_.Reset();
|
||||
hip_graph_.CaptureBegin();
|
||||
}
|
||||
|
||||
void ROCMExecutionProvider::PerThreadContext::CaptureEnd() {
|
||||
hip_graph_.CaptureEnd();
|
||||
is_graph_captured_ = true;
|
||||
}
|
||||
|
||||
bool ROCMExecutionProvider::PerThreadContext::IsGraphCaptured() const {
|
||||
return is_graph_captured_;
|
||||
}
|
||||
|
||||
Status ROCMExecutionProvider::PerThreadContext::ReplayGraph() {
|
||||
ORT_ENFORCE(IsGraphCaptured());
|
||||
return hip_graph_.Replay();
|
||||
}
|
||||
|
||||
void ROCMExecutionProvider::PerThreadContext::IncrementRegularRunCountBeforeGraphCapture() {
|
||||
++regular_run_count_before_graph_capture_;
|
||||
}
|
||||
|
||||
void OverrideTunableOpInfoByEnv(ROCMExecutionProviderInfo& info) {
|
||||
if (auto env_tunable_op_enable = onnxruntime::ParseTestOnlyEnvironmentVariable<bool>(
|
||||
"ORT_ROCM_TUNABLE_OP_ENABLE", {"0", "1"}, "Use provider_options \"tunable_op_enable\" instead.");
|
||||
|
|
@ -219,6 +248,11 @@ ROCMExecutionProvider::ROCMExecutionProvider(const ROCMExecutionProviderInfo& in
|
|||
if (info.external_allocator_info.UseExternalAllocator()) {
|
||||
use_ep_level_unified_stream_ = true;
|
||||
stream_ = nullptr;
|
||||
} else if (info.enable_hip_graph) {
|
||||
// current hip graph implementation only works with single stream
|
||||
// use EP level unified stream for all the reqeust
|
||||
HIP_CALL_THROW(hipStreamCreateWithFlags(&stream_, hipStreamNonBlocking));
|
||||
use_ep_level_unified_stream_ = true;
|
||||
} else {
|
||||
stream_ = nullptr;
|
||||
}
|
||||
|
|
@ -322,25 +356,58 @@ Status ROCMExecutionProvider::Sync() const {
|
|||
Status ROCMExecutionProvider::OnRunStart() {
|
||||
// always set ROCM device when session::Run() in case it runs in a worker thread
|
||||
HIP_RETURN_IF_ERROR(hipSetDevice(GetDeviceId()));
|
||||
if (IsGraphCaptureEnabled() && GetPerThreadContext().IsGraphCaptureAllowed() && !GetPerThreadContext().IsGraphCaptured()) {
|
||||
LOGS_DEFAULT(INFO) << "Capturing the hip graph for this model";
|
||||
GetPerThreadContext().CaptureBegin();
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status ROCMExecutionProvider::OnRunEnd(bool sync_stream) {
|
||||
if (IsGraphCaptureEnabled() && !GetPerThreadContext().IsGraphCaptured()) {
|
||||
if (GetPerThreadContext().IsGraphCaptureAllowed()) {
|
||||
GetPerThreadContext().CaptureEnd();
|
||||
// HIP work issued to a capturing stream doesn’t actually run on the GPU,
|
||||
// so run the captured graph here to actually execute the work.
|
||||
ORT_RETURN_IF_ERROR(GetPerThreadContext().ReplayGraph());
|
||||
} else {
|
||||
GetPerThreadContext().IncrementRegularRunCountBeforeGraphCapture();
|
||||
}
|
||||
}
|
||||
|
||||
if (sync_stream) {
|
||||
HIP_RETURN_IF_ERROR(hipStreamSynchronize(static_cast<hipStream_t>(stream_)));
|
||||
}
|
||||
|
||||
// In extreme cases (e.g., 1-op graph and that op fallbacks to CPU),
|
||||
// PerThreadContext won't be created and there is nothing to
|
||||
// release. This didn't happen before because we always call
|
||||
// GetPerThreadContext in OnRunStart.
|
||||
if (PerThreadContextCache()->find(this) != PerThreadContextCache()->end()) {
|
||||
// The reason of !IsGraphCaptureEnabled():
|
||||
// If hip graph is enabled, the per thread context will not be released
|
||||
// because the per thread hip graph needs to be maintained and replayed for
|
||||
// the next run.
|
||||
// The reason of PerThreadContextCache()->find(this) != PerThreadContextCache()->end():
|
||||
// In extreme cases (e.g., 1-op graph and that op fallbacks to CPU),
|
||||
// PerThreadContext won't be created and there is nothing to
|
||||
// release. This didn't happen before because we always call
|
||||
// GetPerThreadContext in OnRunStart.
|
||||
if (!IsGraphCaptureEnabled() &&
|
||||
PerThreadContextCache()->find(this) != PerThreadContextCache()->end()) {
|
||||
ReleasePerThreadContext();
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
bool ROCMExecutionProvider::IsGraphCaptureEnabled() const {
|
||||
return info_.enable_hip_graph;
|
||||
}
|
||||
|
||||
bool ROCMExecutionProvider::IsGraphCaptured() const {
|
||||
return GetPerThreadContext().IsGraphCaptured();
|
||||
}
|
||||
|
||||
Status ROCMExecutionProvider::ReplayGraph() {
|
||||
return GetPerThreadContext().ReplayGraph();
|
||||
}
|
||||
|
||||
namespace rocm {
|
||||
// opset 1 to 9
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MemcpyFromHost);
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@
|
|||
#include "core/framework/execution_provider.h"
|
||||
#include "core/platform/ort_mutex.h"
|
||||
#include "core/providers/rocm/rocm_execution_provider_info.h"
|
||||
#include "core/providers/rocm/rocm_graph.h"
|
||||
#include "core/providers/rocm/rocm_pch.h"
|
||||
#include "core/providers/rocm/shared_inc/rocm_utils.h"
|
||||
#include "core/providers/rocm/shared_inc/rocm_call.h"
|
||||
|
|
@ -73,6 +74,9 @@ class ROCMExecutionProvider : public IExecutionProvider {
|
|||
|
||||
std::unique_ptr<profiling::EpProfiler> GetProfiler() override;
|
||||
|
||||
bool IsGraphCaptureEnabled() const override;
|
||||
bool IsGraphCaptured() const override;
|
||||
Status ReplayGraph() override;
|
||||
void RegisterStreamHandlers(IStreamCommandHandleRegistry& stream_handle_registry, AllocatorMap& allocators) const override;
|
||||
OrtDevice GetOrtDeviceByMemType(OrtMemType mem_type) const override;
|
||||
std::vector<AllocatorPtr> CreatePreferredAllocators() override;
|
||||
|
|
@ -81,6 +85,7 @@ class ROCMExecutionProvider : public IExecutionProvider {
|
|||
ROCMExecutionProviderInfo info_;
|
||||
hipDeviceProp_t device_prop_;
|
||||
bool external_stream_ = false;
|
||||
// only used when set user external stream or hip graph
|
||||
hipStream_t stream_ = nullptr;
|
||||
|
||||
bool use_ep_level_unified_stream_ = false;
|
||||
|
|
@ -133,6 +138,13 @@ class ROCMExecutionProvider : public IExecutionProvider {
|
|||
}
|
||||
}
|
||||
|
||||
bool IsGraphCaptureAllowed() const;
|
||||
void CaptureBegin();
|
||||
void CaptureEnd();
|
||||
bool IsGraphCaptured() const;
|
||||
Status ReplayGraph();
|
||||
void IncrementRegularRunCountBeforeGraphCapture();
|
||||
|
||||
private:
|
||||
rocblas_handle rocblas_handle_ = nullptr;
|
||||
miopenHandle_t miopen_handle_ = nullptr;
|
||||
|
|
@ -141,6 +153,18 @@ class ROCMExecutionProvider : public IExecutionProvider {
|
|||
std::unique_ptr<rocm::IConstantBuffer<double>> constant_ones_double_;
|
||||
std::unique_ptr<rocm::IConstantBuffer<half>> constant_ones_half_;
|
||||
std::unique_ptr<rocm::IConstantBuffer<BFloat16>> constant_ones_bfloat16_;
|
||||
|
||||
// Hip graph with multi threads will be supported in the future, so hip_graph_
|
||||
// is put under PerThreadContext.
|
||||
ROCMGraph hip_graph_;
|
||||
bool is_graph_captured_ = false;
|
||||
int regular_run_count_before_graph_capture_ = 0;
|
||||
|
||||
// There is chance that the second regular run allocates GPU memory for causes like:
|
||||
// (1) memory pattern is enabled. (2) arena allocation for stream.
|
||||
// Since no GPU memory allocation is allowed during graph capturing, we need at least two regular runs
|
||||
// to allocate enough memory in Arena before graph capturing.
|
||||
const int min_num_runs_before_hip_graph_capture_ = 2; // required min regular runs before graph capture for the necessary memory allocations.
|
||||
};
|
||||
|
||||
using PerThreadContextMap = std::unordered_map<const ROCMExecutionProvider*, std::weak_ptr<PerThreadContext>>;
|
||||
|
|
|
|||
|
|
@ -21,6 +21,7 @@ constexpr const char* kGpuExternalAlloc = "gpu_external_alloc";
|
|||
constexpr const char* kGpuExternalFree = "gpu_external_free";
|
||||
constexpr const char* kGpuExternalEmptyCache = "gpu_external_empty_cache";
|
||||
constexpr const char* kMiopenConvUseMaxWorkspace = "miopen_conv_use_max_workspace";
|
||||
constexpr const char* kEnableHipGraph = "enable_hip_graph";
|
||||
constexpr const char* kTunableOpEnable = "tunable_op_enable";
|
||||
constexpr const char* kTunableOpTuningEnable = "tunable_op_tuning_enable";
|
||||
constexpr const char* kTunableOpMaxTuningDurationMs = "tunable_op_max_tuning_duration_ms";
|
||||
|
|
@ -84,6 +85,7 @@ ROCMExecutionProviderInfo ROCMExecutionProviderInfo::FromProviderOptions(const P
|
|||
info.miopen_conv_exhaustive_search)
|
||||
.AddAssignmentToReference(rocm::provider_option_names::kDoCopyInDefaultStream, info.do_copy_in_default_stream)
|
||||
.AddAssignmentToReference(rocm::provider_option_names::kMiopenConvUseMaxWorkspace, info.miopen_conv_use_max_workspace)
|
||||
.AddAssignmentToReference(rocm::provider_option_names::kEnableHipGraph, info.enable_hip_graph)
|
||||
.AddValueParser(
|
||||
rocm::provider_option_names::kTunableOpEnable,
|
||||
[&info](const std::string& value_str) -> Status {
|
||||
|
|
@ -121,6 +123,7 @@ ProviderOptions ROCMExecutionProviderInfo::ToProviderOptions(const ROCMExecution
|
|||
{rocm::provider_option_names::kMiopenConvExhaustiveSearch, MakeStringWithClassicLocale(info.miopen_conv_exhaustive_search)},
|
||||
{rocm::provider_option_names::kDoCopyInDefaultStream, MakeStringWithClassicLocale(info.do_copy_in_default_stream)},
|
||||
{rocm::provider_option_names::kMiopenConvUseMaxWorkspace, MakeStringWithClassicLocale(info.miopen_conv_use_max_workspace)},
|
||||
{rocm::provider_option_names::kEnableHipGraph, MakeStringWithClassicLocale(info.enable_hip_graph)},
|
||||
{rocm::provider_option_names::kTunableOpEnable, MakeStringWithClassicLocale(info.tunable_op.enable)},
|
||||
{rocm::provider_option_names::kTunableOpTuningEnable, MakeStringWithClassicLocale(info.tunable_op.tuning_enable)},
|
||||
{rocm::provider_option_names::kTunableOpMaxTuningDurationMs, MakeStringWithClassicLocale(info.tunable_op.max_tuning_duration_ms)},
|
||||
|
|
|
|||
|
|
@ -63,6 +63,8 @@ struct ROCMExecutionProviderInfo {
|
|||
// If set to false, use fix workspace size (32M) for Conv algo search, the final algo might not be the best.
|
||||
bool miopen_conv_use_max_workspace{true};
|
||||
|
||||
bool enable_hip_graph{false};
|
||||
|
||||
rocm::TunableOpInfo tunable_op{};
|
||||
|
||||
static ROCMExecutionProviderInfo FromProviderOptions(const ProviderOptions& options);
|
||||
|
|
|
|||
|
|
@ -185,6 +185,7 @@ struct ROCM_Provider : Provider {
|
|||
info.has_user_compute_stream = params->has_user_compute_stream != 0;
|
||||
info.user_compute_stream = params->user_compute_stream;
|
||||
info.default_memory_arena_cfg = params->default_memory_arena_cfg;
|
||||
info.enable_hip_graph = params->enable_hip_graph;
|
||||
info.tunable_op.enable = params->tunable_op_enable;
|
||||
info.tunable_op.tuning_enable = params->tunable_op_tuning_enable;
|
||||
info.tunable_op.max_tuning_duration_ms = params->tunable_op_max_tuning_duration_ms;
|
||||
|
|
@ -215,6 +216,7 @@ struct ROCM_Provider : Provider {
|
|||
rocm_options.user_compute_stream = internal_options.user_compute_stream;
|
||||
}
|
||||
rocm_options.default_memory_arena_cfg = internal_options.default_memory_arena_cfg;
|
||||
rocm_options.enable_hip_graph = internal_options.enable_hip_graph;
|
||||
rocm_options.tunable_op_enable = internal_options.tunable_op.enable;
|
||||
rocm_options.tunable_op_tuning_enable = internal_options.tunable_op.tuning_enable;
|
||||
rocm_options.tunable_op_max_tuning_duration_ms = internal_options.tunable_op.max_tuning_duration_ms;
|
||||
|
|
|
|||
|
|
@ -153,7 +153,7 @@ static bool AreAllComputeNodesAssignedToCudaEp(const Graph& graph) {
|
|||
|
||||
// Empty node provider means CPU EP
|
||||
if (!node_provider.empty() &&
|
||||
node_provider != kCudaExecutionProvider &&
|
||||
!(node_provider == kCudaExecutionProvider || node_provider == kRocmExecutionProvider) &&
|
||||
node_provider != kCpuExecutionProvider) {
|
||||
nodes_on_cpu_and_cuda_eps_only = false;
|
||||
break;
|
||||
|
|
@ -1715,7 +1715,8 @@ common::Status InferenceSession::Initialize() {
|
|||
// now that all the transforms are done, call Resolve on the main graph. this will recurse into the subgraphs.
|
||||
ORT_RETURN_IF_ERROR_SESSIONID_(graph.Resolve());
|
||||
|
||||
// Currently CUDA graph is only considered by CUDA EP and TRT EP.
|
||||
// Currently CUDA graph is only considered by CUDA EP and TRT EP, and
|
||||
// HIP graph is only considered by ROCM EP.
|
||||
//
|
||||
// Check for CUDA EP:
|
||||
// If the CUDA EP is part of the providers list for this session AND
|
||||
|
|
@ -1728,47 +1729,58 @@ common::Status InferenceSession::Initialize() {
|
|||
// The TRT EP is configured to do a graph capture AND
|
||||
// All the graph nodes have been assigned to the TRT EP,
|
||||
// Then the TRT EP is cached for triggering a ReplayGraph() in Run().
|
||||
std::vector<const char*> cuda_graph_support_ep_list = {onnxruntime::kTensorrtExecutionProvider, onnxruntime::kCudaExecutionProvider};
|
||||
//
|
||||
// Check for ROCM EP:
|
||||
// If the ROCM EP is part of the providers list for this session AND
|
||||
// The ROCM EP is configured to do a graph capture AND
|
||||
// All the "compute" graph nodes have been assigned to the ROCM EP,
|
||||
// Then the ROCM EP is cached for triggering a ReplayGraph() in Run().
|
||||
//
|
||||
std::vector<const char*> graph_support_ep_list = {
|
||||
onnxruntime::kTensorrtExecutionProvider,
|
||||
onnxruntime::kCudaExecutionProvider,
|
||||
onnxruntime::kRocmExecutionProvider};
|
||||
|
||||
for (auto& it : cuda_graph_support_ep_list) {
|
||||
for (auto& it : graph_support_ep_list) {
|
||||
auto* target_ep = execution_providers_.Get(it);
|
||||
|
||||
if (target_ep && target_ep->IsGraphCaptureEnabled()) {
|
||||
// CUDA Graphs can't work with control flow nodes
|
||||
// CUDA/HIP Graphs can't work with control flow nodes
|
||||
if (HasControlflowNodes(graph)) {
|
||||
LOGS(*session_logger_, ERROR) << "This session cannot use the CUDA Graph feature as requested by the user "
|
||||
<< "as the model has control flow nodes which can't be supported by CUDA Graphs.";
|
||||
LOGS(*session_logger_, ERROR) << "This session cannot use the CUDA/HIP Graph feature as requested by the user "
|
||||
<< "as the model has control flow nodes which can't be supported by CUDA/HIP Graphs.";
|
||||
|
||||
ORT_RETURN_IF_ERROR_SESSIONID_(
|
||||
ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
|
||||
"This session cannot use the CUDA Graph feature as requested by the user "
|
||||
"as the model has control flow nodes which can't be supported by CUDA Graphs."));
|
||||
"This session cannot use the CUDA/HIP Graph feature as requested by the user "
|
||||
"as the model has control flow nodes which can't be supported by CUDA/HIP Graphs."));
|
||||
}
|
||||
|
||||
if (strcmp(target_ep->Type().c_str(), onnxruntime::kCudaExecutionProvider) == 0) {
|
||||
if (strcmp(target_ep->Type().c_str(), onnxruntime::kCudaExecutionProvider) == 0 ||
|
||||
strcmp(target_ep->Type().c_str(), onnxruntime::kRocmExecutionProvider) == 0) {
|
||||
// Ensure that all nodes have been partitioned to CUDA or CPU EP && there are no memcpy nodes
|
||||
// The reasoning behind this logic is that certain shape nodes will be forced onto CPU
|
||||
// and as long as there are no memcpy nodes this is confirmation that no compute nodes have been placed on the CPU EP
|
||||
// which is all we care about.
|
||||
if (!AreAllComputeNodesAssignedToCudaEp(graph)) {
|
||||
LOGS(*session_logger_, ERROR) << "This session cannot use the CUDA Graph feature as requested by the user "
|
||||
<< " as all compute graph nodes have not been partitioned to the CUDA EP.";
|
||||
LOGS(*session_logger_, ERROR) << "This session cannot use the CUDA/HIP Graph feature as requested by the user "
|
||||
<< " as all compute graph nodes have not been partitioned to the CUDA/HIP EP.";
|
||||
|
||||
ORT_RETURN_IF_ERROR_SESSIONID_(
|
||||
ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
|
||||
"This session cannot use the CUDA Graph feature as requested by the user "
|
||||
" as all compute graph nodes have not been partitioned to the CUDA EP."));
|
||||
"This session cannot use the CUDA/HIP Graph feature as requested by the user "
|
||||
" as all compute graph nodes have not been partitioned to the CUDA/HIP EP."));
|
||||
}
|
||||
|
||||
// Log a warning for the user to know that there are shape subgraphs that will execute on CPU
|
||||
if (HasShapeSubgraphNodes(graph)) {
|
||||
LOGS(*session_logger_, WARNING) << "This model has shape massaging nodes that will execute on CPU. "
|
||||
<< "Use the CUDA Graph feature with caution. "
|
||||
<< "Use the CUDA/HIP Graph feature with caution. "
|
||||
<< "As long as the intermediate shapes produced in the model "
|
||||
<< "using the representative input used to capture the CUDA graph, "
|
||||
<< "using the representative input used to capture the CUDA/HIP graph, "
|
||||
<< "will match the shapes produced in the model for other inputs "
|
||||
<< "of the same shape as the representative input (common case), "
|
||||
<< "it is safe to use the CUDA Graph feature.";
|
||||
<< "it is safe to use the CUDA/HIP Graph feature.";
|
||||
}
|
||||
} else {
|
||||
// Following code path is for TRT EP currently.
|
||||
|
|
@ -1787,7 +1799,7 @@ common::Status InferenceSession::Initialize() {
|
|||
}
|
||||
}
|
||||
|
||||
LOGS(*session_logger_, INFO) << "This session will use the CUDA Graph feature as requested by the user.";
|
||||
LOGS(*session_logger_, INFO) << "This session will use the CUDA/HIP Graph feature as requested by the user.";
|
||||
cached_execution_provider_for_graph_replay_.SetExecutionProvider(target_ep);
|
||||
break; // Make sure only one ep can run CUDA graph.
|
||||
}
|
||||
|
|
@ -2477,7 +2489,9 @@ Status InferenceSession::Run(const RunOptions& run_options,
|
|||
// As N+1 inference runs (N for memory allocation and 1 for graph capturing)
|
||||
// are needed before replaying the captured graph, here run N inference runs recursively until graph captured,
|
||||
// so that users just need one session run to capture the graph.
|
||||
// N is defined in min_num_runs_before_cuda_graph_capture_ for CUDA EP, and the value could be different for other EP.
|
||||
// N is defined in min_num_runs_before_cuda_graph_capture_ for CUDA EP,
|
||||
// N is defined in min_num_runs_before_hip_graph_capture_ for ROCM EP,
|
||||
// and the value could be different for other EP.
|
||||
if (retval.IsOK() && cached_execution_provider_for_graph_replay_.IsGraphCaptureEnabled() &&
|
||||
!cached_execution_provider_for_graph_replay_.IsGraphCaptured()) {
|
||||
LOGS(*session_logger_, INFO) << "Start another run for necessary memory allocation or graph capture.";
|
||||
|
|
|
|||
|
|
@ -2380,6 +2380,7 @@ ORT_API_STATUS_IMPL(OrtApis::CreateROCMProviderOptions, _Outptr_ OrtROCMProvider
|
|||
options->has_user_compute_stream = 0;
|
||||
options->user_compute_stream = nullptr;
|
||||
options->default_memory_arena_cfg = nullptr;
|
||||
options->enable_hip_graph = false;
|
||||
options->tunable_op_enable = 0;
|
||||
options->tunable_op_tuning_enable = 0;
|
||||
options->tunable_op_max_tuning_duration_ms = 0;
|
||||
|
|
|
|||
|
|
@ -43,6 +43,10 @@
|
|||
#include <cuda_runtime.h>
|
||||
#endif
|
||||
|
||||
#ifdef USE_ROCM
|
||||
#include <hip/hip_runtime.h>
|
||||
#endif
|
||||
|
||||
// Once we use C++17 this could be replaced with std::size
|
||||
template <typename T, size_t N>
|
||||
constexpr size_t countof(T (&)[N]) { return N; }
|
||||
|
|
@ -1762,6 +1766,27 @@ TEST(CApiTest, get_allocator_cuda) {
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef USE_ROCM
|
||||
TEST(CApiTest, get_allocator_rocm) {
|
||||
Ort::SessionOptions session_options;
|
||||
Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(session_options, 0));
|
||||
Ort::Session session(*ort_env, NAMED_AND_ANON_DIM_PARAM_URI, session_options);
|
||||
|
||||
Ort::MemoryInfo info_rocm("Hip", OrtAllocatorType::OrtArenaAllocator, 0, OrtMemTypeDefault);
|
||||
Ort::Allocator rocm_allocator(session, info_rocm);
|
||||
|
||||
auto allocator_info = rocm_allocator.GetInfo();
|
||||
ASSERT_TRUE(info_rocm == allocator_info);
|
||||
void* p = rocm_allocator.Alloc(1024);
|
||||
ASSERT_NE(p, nullptr);
|
||||
rocm_allocator.Free(p);
|
||||
|
||||
auto mem_allocation = rocm_allocator.GetAllocation(1024);
|
||||
ASSERT_NE(nullptr, mem_allocation.get());
|
||||
ASSERT_EQ(1024U, mem_allocation.size());
|
||||
}
|
||||
#endif
|
||||
|
||||
TEST(CApiTest, io_binding) {
|
||||
Ort::SessionOptions session_options;
|
||||
Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CPU(session_options, 1));
|
||||
|
|
@ -1937,7 +1962,7 @@ TEST(CApiTest, io_binding_cuda) {
|
|||
}
|
||||
#endif
|
||||
|
||||
#if defined(USE_CUDA) || defined(USE_TENSORRT)
|
||||
#if defined(USE_CUDA) || defined(USE_TENSORRT) || defined(USE_ROCM)
|
||||
TEST(CApiTest, basic_cuda_graph) {
|
||||
const auto& api = Ort::GetApi();
|
||||
Ort::SessionOptions session_options;
|
||||
|
|
@ -1955,7 +1980,7 @@ TEST(CApiTest, basic_cuda_graph) {
|
|||
ASSERT_TRUE(api.SessionOptionsAppendExecutionProvider_TensorRT_V2(
|
||||
static_cast<OrtSessionOptions*>(session_options),
|
||||
rel_trt_options.get()) == nullptr);
|
||||
#else
|
||||
#elif defined(USE_CUDA)
|
||||
// Enable cuda graph in cuda provider option.
|
||||
OrtCUDAProviderOptionsV2* cuda_options = nullptr;
|
||||
ASSERT_TRUE(api.CreateCUDAProviderOptions(&cuda_options) == nullptr);
|
||||
|
|
@ -1968,34 +1993,55 @@ TEST(CApiTest, basic_cuda_graph) {
|
|||
ASSERT_TRUE(api.SessionOptionsAppendExecutionProvider_CUDA_V2(
|
||||
static_cast<OrtSessionOptions*>(session_options),
|
||||
rel_cuda_options.get()) == nullptr);
|
||||
#elif defined(USE_ROCM)
|
||||
// Enable hip graph in rocm provider option.
|
||||
OrtROCMProviderOptions* rocm_options = nullptr;
|
||||
ASSERT_TRUE(api.CreateROCMProviderOptions(&rocm_options) == nullptr);
|
||||
std::unique_ptr<OrtROCMProviderOptions, decltype(api.ReleaseROCMProviderOptions)>
|
||||
rel_rocm_options(rocm_options, api.ReleaseROCMProviderOptions);
|
||||
std::vector<const char*> keys{"enable_hip_graph"};
|
||||
std::vector<const char*> values{"1"};
|
||||
ASSERT_TRUE(api.UpdateROCMProviderOptions(rel_rocm_options.get(), keys.data(), values.data(), 1) == nullptr);
|
||||
|
||||
ASSERT_TRUE(api.SessionOptionsAppendExecutionProvider_ROCM(
|
||||
static_cast<OrtSessionOptions*>(session_options),
|
||||
rel_rocm_options.get()) == nullptr);
|
||||
#endif
|
||||
|
||||
Ort::Session session(*ort_env, MODEL_URI, session_options);
|
||||
Ort::MemoryInfo info_cuda("Cuda", OrtAllocatorType::OrtArenaAllocator, 0, OrtMemTypeDefault);
|
||||
#if defined(USE_ROCM)
|
||||
// local hipify
|
||||
#define cudaMemcpy hipMemcpy
|
||||
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
|
||||
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
|
||||
Ort::MemoryInfo info_mem("Hip", OrtAllocatorType::OrtArenaAllocator, 0, OrtMemTypeDefault);
|
||||
#else
|
||||
Ort::MemoryInfo info_mem("Cuda", OrtAllocatorType::OrtArenaAllocator, 0, OrtMemTypeDefault);
|
||||
#endif
|
||||
|
||||
Ort::Allocator cuda_allocator(session, info_cuda);
|
||||
auto allocator_info = cuda_allocator.GetInfo();
|
||||
ASSERT_TRUE(info_cuda == allocator_info);
|
||||
Ort::Allocator allocator(session, info_mem);
|
||||
auto allocator_info = allocator.GetInfo();
|
||||
ASSERT_TRUE(info_mem == allocator_info);
|
||||
|
||||
const std::array<int64_t, 2> x_shape = {3, 2};
|
||||
std::array<float, 3 * 2> x_values = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
|
||||
auto input_data = cuda_allocator.GetAllocation(x_values.size() * sizeof(float));
|
||||
auto input_data = allocator.GetAllocation(x_values.size() * sizeof(float));
|
||||
|
||||
ASSERT_NE(input_data.get(), nullptr);
|
||||
cudaMemcpy(input_data.get(), x_values.data(), sizeof(float) * x_values.size(), cudaMemcpyHostToDevice);
|
||||
(void)cudaMemcpy(input_data.get(), x_values.data(), sizeof(float) * x_values.size(), cudaMemcpyHostToDevice);
|
||||
|
||||
// Create an OrtValue tensor backed by data on CUDA memory
|
||||
Ort::Value bound_x = Ort::Value::CreateTensor(info_cuda, reinterpret_cast<float*>(input_data.get()), x_values.size(),
|
||||
Ort::Value bound_x = Ort::Value::CreateTensor(info_mem, reinterpret_cast<float*>(input_data.get()), x_values.size(),
|
||||
x_shape.data(), x_shape.size());
|
||||
|
||||
const std::array<int64_t, 2> expected_y_shape = {3, 2};
|
||||
std::array<float, 3 * 2> expected_y = {1.0f, 4.0f, 9.0f, 16.0f, 25.0f, 36.0f};
|
||||
auto output_data = cuda_allocator.GetAllocation(expected_y.size() * sizeof(float));
|
||||
auto output_data = allocator.GetAllocation(expected_y.size() * sizeof(float));
|
||||
|
||||
ASSERT_NE(output_data.get(), nullptr);
|
||||
|
||||
// Create an OrtValue tensor backed by data on CUDA memory
|
||||
Ort::Value bound_y = Ort::Value::CreateTensor(info_cuda, reinterpret_cast<float*>(output_data.get()),
|
||||
Ort::Value bound_y = Ort::Value::CreateTensor(info_mem, reinterpret_cast<float*>(output_data.get()),
|
||||
expected_y.size(), expected_y_shape.data(), expected_y_shape.size());
|
||||
|
||||
// Create IoBinding for inputs and outputs.
|
||||
|
|
@ -2008,31 +2054,37 @@ TEST(CApiTest, basic_cuda_graph) {
|
|||
|
||||
// Check the values against the bound raw memory (needs copying from device to host first)
|
||||
std::array<float, 3 * 2> y_values;
|
||||
cudaMemcpy(y_values.data(), output_data.get(), sizeof(float) * y_values.size(), cudaMemcpyDeviceToHost);
|
||||
(void)cudaMemcpy(y_values.data(), output_data.get(), sizeof(float) * y_values.size(), cudaMemcpyDeviceToHost);
|
||||
ASSERT_THAT(y_values, ::testing::ContainerEq(expected_y));
|
||||
|
||||
// Replay the captured CUDA graph
|
||||
session.Run(Ort::RunOptions(), binding);
|
||||
cudaMemcpy(y_values.data(), output_data.get(), sizeof(float) * y_values.size(), cudaMemcpyDeviceToHost);
|
||||
(void)cudaMemcpy(y_values.data(), output_data.get(), sizeof(float) * y_values.size(), cudaMemcpyDeviceToHost);
|
||||
ASSERT_THAT(y_values, ::testing::ContainerEq(expected_y));
|
||||
|
||||
// Change the input and replay the CUDA graph again.
|
||||
x_values = {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f};
|
||||
cudaMemcpy(input_data.get(), x_values.data(), sizeof(float) * x_values.size(), cudaMemcpyHostToDevice);
|
||||
(void)cudaMemcpy(input_data.get(), x_values.data(), sizeof(float) * x_values.size(), cudaMemcpyHostToDevice);
|
||||
binding.SynchronizeInputs();
|
||||
|
||||
session.Run(Ort::RunOptions(), binding);
|
||||
cudaMemcpy(y_values.data(), output_data.get(), sizeof(float) * y_values.size(), cudaMemcpyDeviceToHost);
|
||||
(void)cudaMemcpy(y_values.data(), output_data.get(), sizeof(float) * y_values.size(), cudaMemcpyDeviceToHost);
|
||||
expected_y = {10.0f, 40.0f, 90.0f, 160.0f, 250.0f, 360.0f};
|
||||
ASSERT_THAT(y_values, ::testing::ContainerEq(expected_y));
|
||||
|
||||
// Clean up
|
||||
binding.ClearBoundInputs();
|
||||
binding.ClearBoundOutputs();
|
||||
#if defined(USE_ROCM)
|
||||
#undef cudaMemcpy
|
||||
#undef cudaMemcpyHostToDevice
|
||||
#undef cudaMemcpyDeviceToHost
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifndef REDUCED_OPS_BUILD
|
||||
// The following test uses some ops not supported in the reduced ops build
|
||||
#ifndef REDUCED_OPS_BUILD
|
||||
#if defined(USE_CUDA) || defined(USE_TENSORRT)
|
||||
TEST(CApiTest, cuda_graph_with_shape_nodes) {
|
||||
const auto& api = Ort::GetApi();
|
||||
|
||||
|
|
@ -2053,10 +2105,34 @@ TEST(CApiTest, cuda_graph_with_shape_nodes) {
|
|||
// Successful loading of the ONNX model with shape nodes with cuda graph feature enabled
|
||||
Ort::Session session(*ort_env, TSTR("testdata/cuda_graph_with_shape_nodes.onnx"), session_options);
|
||||
}
|
||||
#endif // defined(USE_CUDA) || defined(USE_TENSORRT)
|
||||
|
||||
#endif
|
||||
#if defined(USE_ROCM)
|
||||
TEST(CApiTest, hip_graph_with_shape_nodes) {
|
||||
const auto& api = Ort::GetApi();
|
||||
|
||||
#endif
|
||||
// Enable hip graph in rocm provider option.
|
||||
OrtROCMProviderOptions* rocm_options = nullptr;
|
||||
ASSERT_TRUE(api.CreateROCMProviderOptions(&rocm_options) == nullptr);
|
||||
std::unique_ptr<OrtROCMProviderOptions, decltype(api.ReleaseROCMProviderOptions)>
|
||||
rel_rocm_options(rocm_options, api.ReleaseROCMProviderOptions);
|
||||
std::vector<const char*> keys{"enable_hip_graph"};
|
||||
std::vector<const char*> values{"1"};
|
||||
ASSERT_TRUE(api.UpdateROCMProviderOptions(rel_rocm_options.get(), keys.data(), values.data(), 1) == nullptr);
|
||||
|
||||
Ort::SessionOptions session_options;
|
||||
ASSERT_TRUE(api.SessionOptionsAppendExecutionProvider_ROCM(
|
||||
static_cast<OrtSessionOptions*>(session_options),
|
||||
rel_rocm_options.get()) == nullptr);
|
||||
|
||||
// Successful loading of the ONNX model with shape nodes with hip graph feature enabled
|
||||
Ort::Session session(*ort_env, TSTR("testdata/cuda_graph_with_shape_nodes.onnx"), session_options);
|
||||
}
|
||||
#endif // defined(USE_ROCM)
|
||||
|
||||
#endif // REDUCED_OPS_BUILD
|
||||
|
||||
#endif // defined(USE_CUDA) || defined(USE_TENSORRT) || defined(USE_ROCM)
|
||||
|
||||
TEST(CApiTest, create_tensor) {
|
||||
const char* s[] = {"abc", "kmp"};
|
||||
|
|
|
|||
Loading…
Reference in a new issue