mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-22 22:01:08 +00:00
[ROCm] clear last status if hipErrorNotReady (#8358)
* [ROCm] clear last status if hipErrorNotReady
* use hipEventDisableTiming in rocm_fence.cc
* fix syntax errors
* destroy event before handle becomes invalid
This commit is contained in:
parent
178c139718
commit
8d8db7c9f0
2 changed files with 39 additions and 12 deletions
|
|
@ -279,14 +279,22 @@ Status ROCMExecutionProvider::OnRunStart() {
|
|||
while (it != deferred_release_cpu_ptr_.end()) {
|
||||
auto& e = it->first;
|
||||
auto& v = it->second;
|
||||
// note that hipEventQuery returns hipSucess before first hipEventRecord
|
||||
if (v.recorded && hipSuccess == hipEventQuery(e)) {
|
||||
for (auto p : v.cpu_ptrs) {
|
||||
cpu_alloc->Free(p);
|
||||
// note that hipEventQuery returns hipSuccess before first hipEventRecord
|
||||
if (v.recorded) {
|
||||
auto event_query_status = hipEventQuery(e);
|
||||
if (event_query_status == hipSuccess) {
|
||||
for (auto p : v.cpu_ptrs) {
|
||||
cpu_alloc->Free(p);
|
||||
}
|
||||
HIP_RETURN_IF_ERROR(hipEventDestroy(e));
|
||||
it = deferred_release_cpu_ptr_.erase(it);
|
||||
} else if (event_query_status == hipErrorNotReady) {
|
||||
// ignore and clear the error if not ready
|
||||
hipGetLastError();
|
||||
it++;
|
||||
} else {
|
||||
HIP_RETURN_IF_ERROR(event_query_status);
|
||||
}
|
||||
hipEvent_t expired_event = it->first;
|
||||
it = deferred_release_cpu_ptr_.erase(it);
|
||||
HIP_RETURN_IF_ERROR(hipEventDestroy(expired_event));
|
||||
} else {
|
||||
++it;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,14 +4,17 @@
|
|||
#include "core/providers/rocm/rocm_fence.h"
|
||||
|
||||
#include "core/graph/constants.h"
|
||||
#include "core/providers/rocm/gpu_data_transfer.h"
|
||||
#include "core/providers/rocm/rocm_common.h"
|
||||
#include "core/providers/rocm/gpu_data_transfer.h"
|
||||
|
||||
namespace onnxruntime {
|
||||
|
||||
ROCMFence::ROCMFence(const GPUDataTransfer* data_transfer) : data_transfer_(data_transfer) {
|
||||
HIP_CALL_THROW(hipEventCreate(&read_event_));
|
||||
HIP_CALL_THROW(hipEventCreate(&write_event_));
|
||||
// NOTE: hipEventBlockingSync may lead to longer wait times because of thread yielding/switching in the kernel
|
||||
// if lower CPU usage is more important than latency, we should use this flag to avoid spin-loop in WaitOnCPU
|
||||
int event_flags = /*hipEventBlockingSync |*/ hipEventDisableTiming;
|
||||
HIP_CALL_THROW(hipEventCreateWithFlags(&read_event_, event_flags));
|
||||
HIP_CALL_THROW(hipEventCreateWithFlags(&write_event_, event_flags));
|
||||
}
|
||||
|
||||
ROCMFence::~ROCMFence() {
|
||||
|
|
@ -43,8 +46,24 @@ void ROCMFence::BeforeUsingAsOutput(onnxruntime::ProviderType provider_type, int
|
|||
}
|
||||
|
||||
bool ROCMFence::CanRelease() {
|
||||
return hipEventQuery(read_event_) == hipSuccess &&
|
||||
hipEventQuery(write_event_) == hipSuccess;
|
||||
hipError_t status;
|
||||
status = hipEventQuery(read_event_);
|
||||
if (status == hipErrorNotReady) {
|
||||
// ignore and clear the error if not ready
|
||||
hipGetLastError();
|
||||
return false;
|
||||
} else if (status != hipSuccess) {
|
||||
RocmCall<hipError_t, true>(status, "hipEventQuery(read_event_)", "HIP", hipSuccess);
|
||||
}
|
||||
status = hipEventQuery(write_event_);
|
||||
if (status == hipErrorNotReady) {
|
||||
// ignore and clear the error if not ready
|
||||
hipGetLastError();
|
||||
return false;
|
||||
} else if (status != hipSuccess) {
|
||||
RocmCall<hipError_t, true>(status, "hipEventQuery(write_event_)", "HIP", hipSuccess);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void ROCMFence::AfterUsedAsInput(int queue_id) {
|
||||
|
|
|
|||
Loading…
Reference in a new issue