Reviewed patch: ROCm execution provider, hipEventQuery status handling.

What the hunks do:
  * rocm_execution_provider.cc, OnRunStart(): for each recorded deferred-release
    entry, call hipEventQuery once. On hipSuccess free the CPU pointers, erase
    the entry and destroy the event. On hipErrorNotReady call hipGetLastError()
    to clear the error and advance. Any other status is returned through
    HIP_RETURN_IF_ERROR.
  * rocm_fence.cc, constructor: create both events with hipEventCreateWithFlags
    and hipEventDisableTiming (hipEventBlockingSync is left commented out).
  * rocm_fence.cc, CanRelease(): return false on hipErrorNotReady (after
    hipGetLastError()), pass any other failure to RocmCall, otherwise true.

Changes made during review:
  1. OnRunStart(): copy the event handle and erase the map entry BEFORE calling
     hipEventDestroy, as the removed code did. The submitted order returned
     early on a failed destroy while the entry (whose cpu_ptrs were already
     freed) was still in deferred_release_cpu_ptr_, so a later OnRunStart()
     could free the same pointers again. First hunk new-side count: 22 to 25.
  2. OnRunStart(): it++ changed to ++it to match the existing ++it in the loop.
  3. Constructor comment: "may leads to" changed to "may lead to".

Review notes (not changed here):
  * CanRelease(): if RocmCall(...) does not throw for a failed status, control
    continues past it and the function can still return true. Confirm that
    RocmCall throws in this form, or add an explicit return.
  * The source text of this patch was whitespace-collapsed. Indentation and
    blank context lines below were reconstructed from the hunk line counts and
    2-space style; verify them against the target tree before applying.
  * The post-image blob id on the first "index" line predates change 1.

diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
index 8aef431266..ec5bf56356 100644
--- a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
+++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
@@ -279,14 +279,25 @@ Status ROCMExecutionProvider::OnRunStart() {
   while (it != deferred_release_cpu_ptr_.end()) {
     auto& e = it->first;
     auto& v = it->second;
-    // note that hipEventQuery returns hipSucess before first hipEventRecord
-    if (v.recorded && hipSuccess == hipEventQuery(e)) {
-      for (auto p : v.cpu_ptrs) {
-        cpu_alloc->Free(p);
+    // note that hipEventQuery returns hipSuccess before first hipEventRecord
+    if (v.recorded) {
+      auto event_query_status = hipEventQuery(e);
+      if (event_query_status == hipSuccess) {
+        for (auto p : v.cpu_ptrs) {
+          cpu_alloc->Free(p);
+        }
+        // erase the entry before destroying the event: `e` refers into the map, and a
+        // failed destroy must not leave behind an entry whose cpu_ptrs were already freed
+        hipEvent_t expired_event = e;
+        it = deferred_release_cpu_ptr_.erase(it);
+        HIP_RETURN_IF_ERROR(hipEventDestroy(expired_event));
+      } else if (event_query_status == hipErrorNotReady) {
+        // ignore and clear the error if not ready
+        hipGetLastError();
+        ++it;
+      } else {
+        HIP_RETURN_IF_ERROR(event_query_status);
       }
-      hipEvent_t expired_event = it->first;
-      it = deferred_release_cpu_ptr_.erase(it);
-      HIP_RETURN_IF_ERROR(hipEventDestroy(expired_event));
     } else {
       ++it;
     }
diff --git a/onnxruntime/core/providers/rocm/rocm_fence.cc b/onnxruntime/core/providers/rocm/rocm_fence.cc
index 58d3e9fd6f..591fd38b04 100644
--- a/onnxruntime/core/providers/rocm/rocm_fence.cc
+++ b/onnxruntime/core/providers/rocm/rocm_fence.cc
@@ -4,14 +4,17 @@
 #include "core/providers/rocm/rocm_fence.h"
 
 #include "core/graph/constants.h"
-#include "core/providers/rocm/gpu_data_transfer.h"
 #include "core/providers/rocm/rocm_common.h"
+#include "core/providers/rocm/gpu_data_transfer.h"
 
 namespace onnxruntime {
 
 ROCMFence::ROCMFence(const GPUDataTransfer* data_transfer) : data_transfer_(data_transfer) {
-  HIP_CALL_THROW(hipEventCreate(&read_event_));
-  HIP_CALL_THROW(hipEventCreate(&write_event_));
+  // NOTE: hipEventBlockingSync may lead to longer wait time because of thread yield/switching in kernel
+  // if lower CPU usage is more important than latency, we should use this flag to avoid spin-loop in WaitOnCPU
+  int event_flags = /*hipEventBlockingSync |*/ hipEventDisableTiming;
+  HIP_CALL_THROW(hipEventCreateWithFlags(&read_event_, event_flags));
+  HIP_CALL_THROW(hipEventCreateWithFlags(&write_event_, event_flags));
 }
 
 ROCMFence::~ROCMFence() {
@@ -43,8 +46,24 @@ void ROCMFence::BeforeUsingAsOutput(onnxruntime::ProviderType provider_type, int
 }
 
 bool ROCMFence::CanRelease() {
-  return hipEventQuery(read_event_) == hipSuccess &&
-         hipEventQuery(write_event_) == hipSuccess;
+  hipError_t status;
+  status = hipEventQuery(read_event_);
+  if (status == hipErrorNotReady) {
+    // ignore and clear the error if not ready
+    hipGetLastError();
+    return false;
+  } else if (status != hipSuccess) {
+    RocmCall(status, "hipEventQuery(read_event_)", "HIP", hipSuccess);
+  }
+  status = hipEventQuery(write_event_);
+  if (status == hipErrorNotReady) {
+    // ignore and clear the error if not ready
+    hipGetLastError();
+    return false;
+  } else if (status != hipSuccess) {
+    RocmCall(status, "hipEventQuery(write_event_)", "HIP", hipSuccess);
+  }
+  return true;
 }
 
 void ROCMFence::AfterUsedAsInput(int queue_id) {