diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index fa395802d9..0987d6d164 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -1277,6 +1277,9 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) if (onnxruntime_USE_CUDA) list(APPEND onnxruntime_shared_lib_test_LIBS cudart) endif() + if (onnxruntime_USE_ROCM) + list(APPEND onnxruntime_shared_lib_test_LIBS hip::host) + endif() if (onnxruntime_USE_TENSORRT) list(APPEND onnxruntime_shared_lib_test_LIBS ${TENSORRT_LIBRARY_INFER}) endif() @@ -1294,6 +1297,10 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) target_include_directories(onnxruntime_shared_lib_test PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) target_sources(onnxruntime_shared_lib_test PRIVATE ${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/cuda_ops.cu) endif() + if (onnxruntime_USE_ROCM) + target_include_directories(onnxruntime_shared_lib_test PRIVATE ${onnxruntime_ROCM_HOME}/include) + target_compile_definitions(onnxruntime_shared_lib_test PRIVATE __HIP_PLATFORM_AMD__) + endif() if (CMAKE_SYSTEM_NAME STREQUAL "Android") target_sources(onnxruntime_shared_lib_test PRIVATE "${ONNXRUNTIME_ROOT}/core/platform/android/cxa_demangle.cc" diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index 101a578ec3..2ce9d361e8 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -496,6 +496,7 @@ typedef struct OrtROCMProviderOptions { has_user_compute_stream{}, user_compute_stream{}, default_memory_arena_cfg{}, + enable_hip_graph{false}, tunable_op_enable{false}, tunable_op_tuning_enable{false}, tunable_op_max_tuning_duration_ms{} {} @@ -548,6 +549,8 @@ typedef struct OrtROCMProviderOptions { */ OrtArenaCfg* default_memory_arena_cfg; + int enable_hip_graph; + /** \brief Enable TunableOp for using. * Set it to 1/0 to enable/disable TunableOp. Otherwise, it is disabled by default. * This option can be overriden by environment variable ORT_ROCM_TUNABLE_OP_ENABLE. diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc index d7c5098d9d..d7bec337a6 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc @@ -170,6 +170,8 @@ ROCMExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId de MIOPEN_CALL_THROW(miopenCreate(&miopen_handle_)); MIOPEN_CALL_THROW(miopenSetStream(miopen_handle_, stream)); + + hip_graph_.SetStream(stream); } ROCMExecutionProvider::PerThreadContext::~PerThreadContext() { @@ -177,6 +179,33 @@ ROCMExecutionProvider::PerThreadContext::~PerThreadContext() { ORT_IGNORE_RETURN_VALUE(MIOPEN_CALL(miopenDestroy(miopen_handle_))); } +bool ROCMExecutionProvider::PerThreadContext::IsGraphCaptureAllowed() const { + return regular_run_count_before_graph_capture_ >= min_num_runs_before_hip_graph_capture_; +} + +void ROCMExecutionProvider::PerThreadContext::CaptureBegin() { + hip_graph_.Reset(); + hip_graph_.CaptureBegin(); +} + +void ROCMExecutionProvider::PerThreadContext::CaptureEnd() { + hip_graph_.CaptureEnd(); + is_graph_captured_ = true; +} + +bool ROCMExecutionProvider::PerThreadContext::IsGraphCaptured() const { + return is_graph_captured_; +} + +Status ROCMExecutionProvider::PerThreadContext::ReplayGraph() { + ORT_ENFORCE(IsGraphCaptured()); + return hip_graph_.Replay(); +} + +void ROCMExecutionProvider::PerThreadContext::IncrementRegularRunCountBeforeGraphCapture() { + ++regular_run_count_before_graph_capture_; +} + void OverrideTunableOpInfoByEnv(ROCMExecutionProviderInfo& info) { if (auto env_tunable_op_enable = onnxruntime::ParseTestOnlyEnvironmentVariable( "ORT_ROCM_TUNABLE_OP_ENABLE", {"0", "1"}, "Use provider_options \"tunable_op_enable\" instead."); @@ -219,6 +248,11 @@ ROCMExecutionProvider::ROCMExecutionProvider(const ROCMExecutionProviderInfo& in if (info.external_allocator_info.UseExternalAllocator()) { use_ep_level_unified_stream_ = true; stream_ = nullptr; + } else if (info.enable_hip_graph) { + // current hip graph implementation only works with single stream + // use EP level unified stream for all the reqeust + HIP_CALL_THROW(hipStreamCreateWithFlags(&stream_, hipStreamNonBlocking)); + use_ep_level_unified_stream_ = true; } else { stream_ = nullptr; } @@ -322,25 +356,58 @@ Status ROCMExecutionProvider::Sync() const { Status ROCMExecutionProvider::OnRunStart() { // always set ROCM device when session::Run() in case it runs in a worker thread HIP_RETURN_IF_ERROR(hipSetDevice(GetDeviceId())); + if (IsGraphCaptureEnabled() && GetPerThreadContext().IsGraphCaptureAllowed() && !GetPerThreadContext().IsGraphCaptured()) { + LOGS_DEFAULT(INFO) << "Capturing the hip graph for this model"; + GetPerThreadContext().CaptureBegin(); + } return Status::OK(); } Status ROCMExecutionProvider::OnRunEnd(bool sync_stream) { + if (IsGraphCaptureEnabled() && !GetPerThreadContext().IsGraphCaptured()) { + if (GetPerThreadContext().IsGraphCaptureAllowed()) { + GetPerThreadContext().CaptureEnd(); + // HIP work issued to a capturing stream doesn’t actually run on the GPU, + // so run the captured graph here to actually execute the work. + ORT_RETURN_IF_ERROR(GetPerThreadContext().ReplayGraph()); + } else { + GetPerThreadContext().IncrementRegularRunCountBeforeGraphCapture(); + } + } + if (sync_stream) { HIP_RETURN_IF_ERROR(hipStreamSynchronize(static_cast(stream_))); } - // In extreme cases (e.g., 1-op graph and that op fallbacks to CPU), - // PerThreadContext won't be created and there is nothing to - // release. This didn't happen before because we always call - // GetPerThreadContext in OnRunStart. - if (PerThreadContextCache()->find(this) != PerThreadContextCache()->end()) { + // The reason of !IsGraphCaptureEnabled(): + // If hip graph is enabled, the per thread context will not be released + // because the per thread hip graph needs to be maintained and replayed for + // the next run. + // The reason of PerThreadContextCache()->find(this) != PerThreadContextCache()->end(): + // In extreme cases (e.g., 1-op graph and that op fallbacks to CPU), + // PerThreadContext won't be created and there is nothing to + // release. This didn't happen before because we always call + // GetPerThreadContext in OnRunStart. + if (!IsGraphCaptureEnabled() && + PerThreadContextCache()->find(this) != PerThreadContextCache()->end()) { ReleasePerThreadContext(); } return Status::OK(); } +bool ROCMExecutionProvider::IsGraphCaptureEnabled() const { + return info_.enable_hip_graph; +} + +bool ROCMExecutionProvider::IsGraphCaptured() const { + return GetPerThreadContext().IsGraphCaptured(); +} + +Status ROCMExecutionProvider::ReplayGraph() { + return GetPerThreadContext().ReplayGraph(); +} + namespace rocm { // opset 1 to 9 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MemcpyFromHost); diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.h b/onnxruntime/core/providers/rocm/rocm_execution_provider.h index c4945b9ac2..37d5f7b422 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider.h +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.h @@ -10,6 +10,7 @@ #include "core/framework/execution_provider.h" #include "core/platform/ort_mutex.h" #include "core/providers/rocm/rocm_execution_provider_info.h" +#include "core/providers/rocm/rocm_graph.h" #include "core/providers/rocm/rocm_pch.h" #include "core/providers/rocm/shared_inc/rocm_utils.h" #include "core/providers/rocm/shared_inc/rocm_call.h" @@ -73,6 +74,9 @@ class ROCMExecutionProvider : public IExecutionProvider { std::unique_ptr GetProfiler() override; + bool IsGraphCaptureEnabled() const override; + bool IsGraphCaptured() const override; + Status ReplayGraph() override; void RegisterStreamHandlers(IStreamCommandHandleRegistry& stream_handle_registry, AllocatorMap& allocators) const override; OrtDevice GetOrtDeviceByMemType(OrtMemType mem_type) const override; std::vector CreatePreferredAllocators() override; @@ -81,6 +85,7 @@ class ROCMExecutionProvider : public IExecutionProvider { ROCMExecutionProviderInfo info_; hipDeviceProp_t device_prop_; bool external_stream_ = false; + // only used when set user external stream or hip graph hipStream_t stream_ = nullptr; bool use_ep_level_unified_stream_ = false; @@ -133,6 +138,13 @@ class ROCMExecutionProvider : public IExecutionProvider { } } + bool IsGraphCaptureAllowed() const; + void CaptureBegin(); + void CaptureEnd(); + bool IsGraphCaptured() const; + Status ReplayGraph(); + void IncrementRegularRunCountBeforeGraphCapture(); + private: rocblas_handle rocblas_handle_ = nullptr; miopenHandle_t miopen_handle_ = nullptr; @@ -141,6 +153,18 @@ class ROCMExecutionProvider : public IExecutionProvider { std::unique_ptr> constant_ones_double_; std::unique_ptr> constant_ones_half_; std::unique_ptr> constant_ones_bfloat16_; + + // Hip graph with multi threads will be supported in the future, so hip_graph_ + // is put under PerThreadContext. + ROCMGraph hip_graph_; + bool is_graph_captured_ = false; + int regular_run_count_before_graph_capture_ = 0; + + // There is chance that the second regular run allocates GPU memory for causes like: + // (1) memory pattern is enabled. (2) arena allocation for stream. + // Since no GPU memory allocation is allowed during graph capturing, we need at least two regular runs + // to allocate enough memory in Arena before graph capturing. + const int min_num_runs_before_hip_graph_capture_ = 2; // required min regular runs before graph capture for the necessary memory allocations. }; using PerThreadContextMap = std::unordered_map>; diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider_info.cc b/onnxruntime/core/providers/rocm/rocm_execution_provider_info.cc index 650635c153..b557f92287 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider_info.cc +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider_info.cc @@ -21,6 +21,7 @@ constexpr const char* kGpuExternalAlloc = "gpu_external_alloc"; constexpr const char* kGpuExternalFree = "gpu_external_free"; constexpr const char* kGpuExternalEmptyCache = "gpu_external_empty_cache"; constexpr const char* kMiopenConvUseMaxWorkspace = "miopen_conv_use_max_workspace"; +constexpr const char* kEnableHipGraph = "enable_hip_graph"; constexpr const char* kTunableOpEnable = "tunable_op_enable"; constexpr const char* kTunableOpTuningEnable = "tunable_op_tuning_enable"; constexpr const char* kTunableOpMaxTuningDurationMs = "tunable_op_max_tuning_duration_ms"; @@ -84,6 +85,7 @@ ROCMExecutionProviderInfo ROCMExecutionProviderInfo::FromProviderOptions(const P info.miopen_conv_exhaustive_search) .AddAssignmentToReference(rocm::provider_option_names::kDoCopyInDefaultStream, info.do_copy_in_default_stream) .AddAssignmentToReference(rocm::provider_option_names::kMiopenConvUseMaxWorkspace, info.miopen_conv_use_max_workspace) + .AddAssignmentToReference(rocm::provider_option_names::kEnableHipGraph, info.enable_hip_graph) .AddValueParser( rocm::provider_option_names::kTunableOpEnable, [&info](const std::string& value_str) -> Status { @@ -121,6 +123,7 @@ ProviderOptions ROCMExecutionProviderInfo::ToProviderOptions(const ROCMExecution {rocm::provider_option_names::kMiopenConvExhaustiveSearch, MakeStringWithClassicLocale(info.miopen_conv_exhaustive_search)}, {rocm::provider_option_names::kDoCopyInDefaultStream, MakeStringWithClassicLocale(info.do_copy_in_default_stream)}, {rocm::provider_option_names::kMiopenConvUseMaxWorkspace, MakeStringWithClassicLocale(info.miopen_conv_use_max_workspace)}, + {rocm::provider_option_names::kEnableHipGraph, MakeStringWithClassicLocale(info.enable_hip_graph)}, {rocm::provider_option_names::kTunableOpEnable, MakeStringWithClassicLocale(info.tunable_op.enable)}, {rocm::provider_option_names::kTunableOpTuningEnable, MakeStringWithClassicLocale(info.tunable_op.tuning_enable)}, {rocm::provider_option_names::kTunableOpMaxTuningDurationMs, MakeStringWithClassicLocale(info.tunable_op.max_tuning_duration_ms)}, diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider_info.h b/onnxruntime/core/providers/rocm/rocm_execution_provider_info.h index e35c0cc0af..2f549cc1ac 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider_info.h +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider_info.h @@ -63,6 +63,8 @@ struct ROCMExecutionProviderInfo { // If set to false, use fix workspace size (32M) for Conv algo search, the final algo might not be the best. bool miopen_conv_use_max_workspace{true}; + bool enable_hip_graph{false}; + rocm::TunableOpInfo tunable_op{}; static ROCMExecutionProviderInfo FromProviderOptions(const ProviderOptions& options); diff --git a/onnxruntime/core/providers/rocm/rocm_provider_factory.cc b/onnxruntime/core/providers/rocm/rocm_provider_factory.cc index 4d88c25469..88ef666678 100644 --- a/onnxruntime/core/providers/rocm/rocm_provider_factory.cc +++ b/onnxruntime/core/providers/rocm/rocm_provider_factory.cc @@ -185,6 +185,7 @@ struct ROCM_Provider : Provider { info.has_user_compute_stream = params->has_user_compute_stream != 0; info.user_compute_stream = params->user_compute_stream; info.default_memory_arena_cfg = params->default_memory_arena_cfg; + info.enable_hip_graph = params->enable_hip_graph; info.tunable_op.enable = params->tunable_op_enable; info.tunable_op.tuning_enable = params->tunable_op_tuning_enable; info.tunable_op.max_tuning_duration_ms = params->tunable_op_max_tuning_duration_ms; @@ -215,6 +216,7 @@ struct ROCM_Provider : Provider { rocm_options.user_compute_stream = internal_options.user_compute_stream; } rocm_options.default_memory_arena_cfg = internal_options.default_memory_arena_cfg; + rocm_options.enable_hip_graph = internal_options.enable_hip_graph; rocm_options.tunable_op_enable = internal_options.tunable_op.enable; rocm_options.tunable_op_tuning_enable = internal_options.tunable_op.tuning_enable; rocm_options.tunable_op_max_tuning_duration_ms = internal_options.tunable_op.max_tuning_duration_ms; diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index e8853c8824..39f47c09f2 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -153,7 +153,7 @@ static bool AreAllComputeNodesAssignedToCudaEp(const Graph& graph) { // Empty node provider means CPU EP if (!node_provider.empty() && - node_provider != kCudaExecutionProvider && + !(node_provider == kCudaExecutionProvider || node_provider == kRocmExecutionProvider) && node_provider != kCpuExecutionProvider) { nodes_on_cpu_and_cuda_eps_only = false; break; @@ -1715,7 +1715,8 @@ common::Status InferenceSession::Initialize() { // now that all the transforms are done, call Resolve on the main graph. this will recurse into the subgraphs. ORT_RETURN_IF_ERROR_SESSIONID_(graph.Resolve()); - // Currently CUDA graph is only considered by CUDA EP and TRT EP. + // Currently CUDA graph is only considered by CUDA EP and TRT EP, and + // HIP graph is only considered by ROCM EP. // // Check for CUDA EP: // If the CUDA EP is part of the providers list for this session AND @@ -1728,47 +1729,58 @@ common::Status InferenceSession::Initialize() { // The TRT EP is configured to do a graph capture AND // All the graph nodes have been assigned to the TRT EP, // Then the TRT EP is cached for triggering a ReplayGraph() in Run(). - std::vector cuda_graph_support_ep_list = {onnxruntime::kTensorrtExecutionProvider, onnxruntime::kCudaExecutionProvider}; + // + // Check for ROCM EP: + // If the ROCM EP is part of the providers list for this session AND + // The ROCM EP is configured to do a graph capture AND + // All the "compute" graph nodes have been assigned to the ROCM EP, + // Then the ROCM EP is cached for triggering a ReplayGraph() in Run(). + // + std::vector graph_support_ep_list = { + onnxruntime::kTensorrtExecutionProvider, + onnxruntime::kCudaExecutionProvider, + onnxruntime::kRocmExecutionProvider}; - for (auto& it : cuda_graph_support_ep_list) { + for (auto& it : graph_support_ep_list) { auto* target_ep = execution_providers_.Get(it); if (target_ep && target_ep->IsGraphCaptureEnabled()) { - // CUDA Graphs can't work with control flow nodes + // CUDA/HIP Graphs can't work with control flow nodes if (HasControlflowNodes(graph)) { - LOGS(*session_logger_, ERROR) << "This session cannot use the CUDA Graph feature as requested by the user " - << "as the model has control flow nodes which can't be supported by CUDA Graphs."; + LOGS(*session_logger_, ERROR) << "This session cannot use the CUDA/HIP Graph feature as requested by the user " + << "as the model has control flow nodes which can't be supported by CUDA/HIP Graphs."; ORT_RETURN_IF_ERROR_SESSIONID_( ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, - "This session cannot use the CUDA Graph feature as requested by the user " - "as the model has control flow nodes which can't be supported by CUDA Graphs.")); + "This session cannot use the CUDA/HIP Graph feature as requested by the user " + "as the model has control flow nodes which can't be supported by CUDA/HIP Graphs.")); } - if (strcmp(target_ep->Type().c_str(), onnxruntime::kCudaExecutionProvider) == 0) { + if (strcmp(target_ep->Type().c_str(), onnxruntime::kCudaExecutionProvider) == 0 || + strcmp(target_ep->Type().c_str(), onnxruntime::kRocmExecutionProvider) == 0) { // Ensure that all nodes have been partitioned to CUDA or CPU EP && there are no memcpy nodes // The reasoning behind this logic is that certain shape nodes will be forced onto CPU // and as long as there are no memcpy nodes this is confirmation that no compute nodes have been placed on the CPU EP // which is all we care about. if (!AreAllComputeNodesAssignedToCudaEp(graph)) { - LOGS(*session_logger_, ERROR) << "This session cannot use the CUDA Graph feature as requested by the user " - << " as all compute graph nodes have not been partitioned to the CUDA EP."; + LOGS(*session_logger_, ERROR) << "This session cannot use the CUDA/HIP Graph feature as requested by the user " + << " as all compute graph nodes have not been partitioned to the CUDA/HIP EP."; ORT_RETURN_IF_ERROR_SESSIONID_( ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, - "This session cannot use the CUDA Graph feature as requested by the user " - " as all compute graph nodes have not been partitioned to the CUDA EP.")); + "This session cannot use the CUDA/HIP Graph feature as requested by the user " + " as all compute graph nodes have not been partitioned to the CUDA/HIP EP.")); } // Log a warning for the user to know that there are shape subgraphs that will execute on CPU if (HasShapeSubgraphNodes(graph)) { LOGS(*session_logger_, WARNING) << "This model has shape massaging nodes that will execute on CPU. " - << "Use the CUDA Graph feature with caution. " + << "Use the CUDA/HIP Graph feature with caution. " << "As long as the intermediate shapes produced in the model " - << "using the representative input used to capture the CUDA graph, " + << "using the representative input used to capture the CUDA/HIP graph, " << "will match the shapes produced in the model for other inputs " << "of the same shape as the representative input (common case), " - << "it is safe to use the CUDA Graph feature."; + << "it is safe to use the CUDA/HIP Graph feature."; } } else { // Following code path is for TRT EP currently. @@ -1787,7 +1799,7 @@ common::Status InferenceSession::Initialize() { } } - LOGS(*session_logger_, INFO) << "This session will use the CUDA Graph feature as requested by the user."; + LOGS(*session_logger_, INFO) << "This session will use the CUDA/HIP Graph feature as requested by the user."; cached_execution_provider_for_graph_replay_.SetExecutionProvider(target_ep); break; // Make sure only one ep can run CUDA graph. } @@ -2477,7 +2489,9 @@ Status InferenceSession::Run(const RunOptions& run_options, // As N+1 inference runs (N for memory allocation and 1 for graph capturing) // are needed before replaying the captured graph, here run N inference runs recursively until graph captured, // so that users just need one session run to capture the graph. - // N is defined in min_num_runs_before_cuda_graph_capture_ for CUDA EP, and the value could be different for other EP. + // N is defined in min_num_runs_before_cuda_graph_capture_ for CUDA EP, + // N is defined in min_num_runs_before_hip_graph_capture_ for ROCM EP, + // and the value could be different for other EP. if (retval.IsOK() && cached_execution_provider_for_graph_replay_.IsGraphCaptureEnabled() && !cached_execution_provider_for_graph_replay_.IsGraphCaptured()) { LOGS(*session_logger_, INFO) << "Start another run for necessary memory allocation or graph capture."; diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 3269c9f0f4..3178c13d30 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -2380,6 +2380,7 @@ ORT_API_STATUS_IMPL(OrtApis::CreateROCMProviderOptions, _Outptr_ OrtROCMProvider options->has_user_compute_stream = 0; options->user_compute_stream = nullptr; options->default_memory_arena_cfg = nullptr; + options->enable_hip_graph = false; options->tunable_op_enable = 0; options->tunable_op_tuning_enable = 0; options->tunable_op_max_tuning_duration_ms = 0; diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc index 6ffe72f81b..8dad2c8e2d 100644 --- a/onnxruntime/test/shared_lib/test_inference.cc +++ b/onnxruntime/test/shared_lib/test_inference.cc @@ -43,6 +43,10 @@ #include #endif +#ifdef USE_ROCM +#include +#endif + // Once we use C++17 this could be replaced with std::size template constexpr size_t countof(T (&)[N]) { return N; } @@ -1762,6 +1766,27 @@ TEST(CApiTest, get_allocator_cuda) { } #endif +#ifdef USE_ROCM +TEST(CApiTest, get_allocator_rocm) { + Ort::SessionOptions session_options; + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(session_options, 0)); + Ort::Session session(*ort_env, NAMED_AND_ANON_DIM_PARAM_URI, session_options); + + Ort::MemoryInfo info_rocm("Hip", OrtAllocatorType::OrtArenaAllocator, 0, OrtMemTypeDefault); + Ort::Allocator rocm_allocator(session, info_rocm); + + auto allocator_info = rocm_allocator.GetInfo(); + ASSERT_TRUE(info_rocm == allocator_info); + void* p = rocm_allocator.Alloc(1024); + ASSERT_NE(p, nullptr); + rocm_allocator.Free(p); + + auto mem_allocation = rocm_allocator.GetAllocation(1024); + ASSERT_NE(nullptr, mem_allocation.get()); + ASSERT_EQ(1024U, mem_allocation.size()); +} +#endif + TEST(CApiTest, io_binding) { Ort::SessionOptions session_options; Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CPU(session_options, 1)); @@ -1937,7 +1962,7 @@ TEST(CApiTest, io_binding_cuda) { } #endif -#if defined(USE_CUDA) || defined(USE_TENSORRT) +#if defined(USE_CUDA) || defined(USE_TENSORRT) || defined(USE_ROCM) TEST(CApiTest, basic_cuda_graph) { const auto& api = Ort::GetApi(); Ort::SessionOptions session_options; @@ -1955,7 +1980,7 @@ TEST(CApiTest, basic_cuda_graph) { ASSERT_TRUE(api.SessionOptionsAppendExecutionProvider_TensorRT_V2( static_cast(session_options), rel_trt_options.get()) == nullptr); -#else +#elif defined(USE_CUDA) // Enable cuda graph in cuda provider option. OrtCUDAProviderOptionsV2* cuda_options = nullptr; ASSERT_TRUE(api.CreateCUDAProviderOptions(&cuda_options) == nullptr); @@ -1968,34 +1993,55 @@ TEST(CApiTest, basic_cuda_graph) { ASSERT_TRUE(api.SessionOptionsAppendExecutionProvider_CUDA_V2( static_cast(session_options), rel_cuda_options.get()) == nullptr); +#elif defined(USE_ROCM) + // Enable hip graph in rocm provider option. + OrtROCMProviderOptions* rocm_options = nullptr; + ASSERT_TRUE(api.CreateROCMProviderOptions(&rocm_options) == nullptr); + std::unique_ptr + rel_rocm_options(rocm_options, api.ReleaseROCMProviderOptions); + std::vector keys{"enable_hip_graph"}; + std::vector values{"1"}; + ASSERT_TRUE(api.UpdateROCMProviderOptions(rel_rocm_options.get(), keys.data(), values.data(), 1) == nullptr); + + ASSERT_TRUE(api.SessionOptionsAppendExecutionProvider_ROCM( + static_cast(session_options), + rel_rocm_options.get()) == nullptr); #endif Ort::Session session(*ort_env, MODEL_URI, session_options); - Ort::MemoryInfo info_cuda("Cuda", OrtAllocatorType::OrtArenaAllocator, 0, OrtMemTypeDefault); +#if defined(USE_ROCM) +// local hipify +#define cudaMemcpy hipMemcpy +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost + Ort::MemoryInfo info_mem("Hip", OrtAllocatorType::OrtArenaAllocator, 0, OrtMemTypeDefault); +#else + Ort::MemoryInfo info_mem("Cuda", OrtAllocatorType::OrtArenaAllocator, 0, OrtMemTypeDefault); +#endif - Ort::Allocator cuda_allocator(session, info_cuda); - auto allocator_info = cuda_allocator.GetInfo(); - ASSERT_TRUE(info_cuda == allocator_info); + Ort::Allocator allocator(session, info_mem); + auto allocator_info = allocator.GetInfo(); + ASSERT_TRUE(info_mem == allocator_info); const std::array x_shape = {3, 2}; std::array x_values = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; - auto input_data = cuda_allocator.GetAllocation(x_values.size() * sizeof(float)); + auto input_data = allocator.GetAllocation(x_values.size() * sizeof(float)); ASSERT_NE(input_data.get(), nullptr); - cudaMemcpy(input_data.get(), x_values.data(), sizeof(float) * x_values.size(), cudaMemcpyHostToDevice); + (void)cudaMemcpy(input_data.get(), x_values.data(), sizeof(float) * x_values.size(), cudaMemcpyHostToDevice); // Create an OrtValue tensor backed by data on CUDA memory - Ort::Value bound_x = Ort::Value::CreateTensor(info_cuda, reinterpret_cast(input_data.get()), x_values.size(), + Ort::Value bound_x = Ort::Value::CreateTensor(info_mem, reinterpret_cast(input_data.get()), x_values.size(), x_shape.data(), x_shape.size()); const std::array expected_y_shape = {3, 2}; std::array expected_y = {1.0f, 4.0f, 9.0f, 16.0f, 25.0f, 36.0f}; - auto output_data = cuda_allocator.GetAllocation(expected_y.size() * sizeof(float)); + auto output_data = allocator.GetAllocation(expected_y.size() * sizeof(float)); ASSERT_NE(output_data.get(), nullptr); // Create an OrtValue tensor backed by data on CUDA memory - Ort::Value bound_y = Ort::Value::CreateTensor(info_cuda, reinterpret_cast(output_data.get()), + Ort::Value bound_y = Ort::Value::CreateTensor(info_mem, reinterpret_cast(output_data.get()), expected_y.size(), expected_y_shape.data(), expected_y_shape.size()); // Create IoBinding for inputs and outputs. @@ -2008,31 +2054,37 @@ TEST(CApiTest, basic_cuda_graph) { // Check the values against the bound raw memory (needs copying from device to host first) std::array y_values; - cudaMemcpy(y_values.data(), output_data.get(), sizeof(float) * y_values.size(), cudaMemcpyDeviceToHost); + (void)cudaMemcpy(y_values.data(), output_data.get(), sizeof(float) * y_values.size(), cudaMemcpyDeviceToHost); ASSERT_THAT(y_values, ::testing::ContainerEq(expected_y)); // Replay the captured CUDA graph session.Run(Ort::RunOptions(), binding); - cudaMemcpy(y_values.data(), output_data.get(), sizeof(float) * y_values.size(), cudaMemcpyDeviceToHost); + (void)cudaMemcpy(y_values.data(), output_data.get(), sizeof(float) * y_values.size(), cudaMemcpyDeviceToHost); ASSERT_THAT(y_values, ::testing::ContainerEq(expected_y)); // Change the input and replay the CUDA graph again. x_values = {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f}; - cudaMemcpy(input_data.get(), x_values.data(), sizeof(float) * x_values.size(), cudaMemcpyHostToDevice); + (void)cudaMemcpy(input_data.get(), x_values.data(), sizeof(float) * x_values.size(), cudaMemcpyHostToDevice); binding.SynchronizeInputs(); session.Run(Ort::RunOptions(), binding); - cudaMemcpy(y_values.data(), output_data.get(), sizeof(float) * y_values.size(), cudaMemcpyDeviceToHost); + (void)cudaMemcpy(y_values.data(), output_data.get(), sizeof(float) * y_values.size(), cudaMemcpyDeviceToHost); expected_y = {10.0f, 40.0f, 90.0f, 160.0f, 250.0f, 360.0f}; ASSERT_THAT(y_values, ::testing::ContainerEq(expected_y)); // Clean up binding.ClearBoundInputs(); binding.ClearBoundOutputs(); +#if defined(USE_ROCM) +#undef cudaMemcpy +#undef cudaMemcpyHostToDevice +#undef cudaMemcpyDeviceToHost +#endif } -#ifndef REDUCED_OPS_BUILD // The following test uses some ops not supported in the reduced ops build +#ifndef REDUCED_OPS_BUILD +#if defined(USE_CUDA) || defined(USE_TENSORRT) TEST(CApiTest, cuda_graph_with_shape_nodes) { const auto& api = Ort::GetApi(); @@ -2053,10 +2105,34 @@ TEST(CApiTest, cuda_graph_with_shape_nodes) { // Successful loading of the ONNX model with shape nodes with cuda graph feature enabled Ort::Session session(*ort_env, TSTR("testdata/cuda_graph_with_shape_nodes.onnx"), session_options); } +#endif // defined(USE_CUDA) || defined(USE_TENSORRT) -#endif +#if defined(USE_ROCM) +TEST(CApiTest, hip_graph_with_shape_nodes) { + const auto& api = Ort::GetApi(); -#endif + // Enable hip graph in rocm provider option. + OrtROCMProviderOptions* rocm_options = nullptr; + ASSERT_TRUE(api.CreateROCMProviderOptions(&rocm_options) == nullptr); + std::unique_ptr + rel_rocm_options(rocm_options, api.ReleaseROCMProviderOptions); + std::vector keys{"enable_hip_graph"}; + std::vector values{"1"}; + ASSERT_TRUE(api.UpdateROCMProviderOptions(rel_rocm_options.get(), keys.data(), values.data(), 1) == nullptr); + + Ort::SessionOptions session_options; + ASSERT_TRUE(api.SessionOptionsAppendExecutionProvider_ROCM( + static_cast(session_options), + rel_rocm_options.get()) == nullptr); + + // Successful loading of the ONNX model with shape nodes with hip graph feature enabled + Ort::Session session(*ort_env, TSTR("testdata/cuda_graph_with_shape_nodes.onnx"), session_options); +} +#endif // defined(USE_ROCM) + +#endif // REDUCED_OPS_BUILD + +#endif // defined(USE_CUDA) || defined(USE_TENSORRT) || defined(USE_ROCM) TEST(CApiTest, create_tensor) { const char* s[] = {"abc", "kmp"};