mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-14 20:48:00 +00:00
Fix an issue that caused wrong output. (#342)
Root cause: cudaStreamWaitEvent was being used after copying data from GPU memory to CPU memory, but the following node has CPU code that depends on that data, so cudaEventSynchronize should be used instead. Fix: Add code in the executor to check the input memory type first; if the kernel requires CPU memory, pass the CPUExecutionProvider type to BeforeUsingAsInput, which makes it use cudaEventSynchronize to wait on the write event.
This commit is contained in:
parent
5d0e024284
commit
790cda6ea7
3 changed files with 20 additions and 6 deletions
|
|
@ -138,14 +138,22 @@ void ParallelExecutor::RunNodeAsyncInternal(size_t p_node_index,
|
|||
for (int input_index = 0; input_index < op_kernel_context.InputCount(); ++input_index) {
|
||||
Fence_t fence = op_kernel_context.InputFence(input_index);
|
||||
if (fence) {
|
||||
fence->BeforeUsingAsInput(p_op_kernel->Node().GetExecutionProviderType(), queue_id);
|
||||
auto execution_provider_type = p_op_kernel->Node().GetExecutionProviderType();
|
||||
if (OrtMemTypeCPUInput == p_op_kernel->KernelDef().InputMemoryType(input_index)) {
|
||||
execution_provider_type = kCpuExecutionProvider;
|
||||
}
|
||||
fence->BeforeUsingAsInput(execution_provider_type, queue_id);
|
||||
}
|
||||
}
|
||||
|
||||
for (int input_index = 0; input_index < op_kernel_context.ImplicitInputCount(); ++input_index) {
|
||||
Fence_t fence = op_kernel_context.ImplicitInputFence(input_index);
|
||||
if (fence) {
|
||||
fence->BeforeUsingAsInput(p_op_kernel->Node().GetExecutionProviderType(), queue_id);
|
||||
auto execution_provider_type = p_op_kernel->Node().GetExecutionProviderType();
|
||||
if (OrtMemTypeCPUInput == p_op_kernel->KernelDef().InputMemoryType(input_index)) {
|
||||
execution_provider_type = kCpuExecutionProvider;
|
||||
}
|
||||
fence->BeforeUsingAsInput(execution_provider_type, queue_id);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -78,14 +78,22 @@ Status SequentialExecutor::Execute(const SessionState& session_state,
|
|||
for (int input_index = 0; input_index < op_kernel_context.InputCount(); ++input_index) {
|
||||
Fence_t fence = op_kernel_context.InputFence(input_index);
|
||||
if (fence) {
|
||||
fence->BeforeUsingAsInput(p_op_kernel->Node().GetExecutionProviderType(), queue_id);
|
||||
auto execution_provider_type = p_op_kernel->Node().GetExecutionProviderType();
|
||||
if (OrtMemTypeCPUInput == p_op_kernel->KernelDef().InputMemoryType(input_index)) {
|
||||
execution_provider_type = kCpuExecutionProvider;
|
||||
}
|
||||
fence->BeforeUsingAsInput(execution_provider_type, queue_id);
|
||||
}
|
||||
}
|
||||
|
||||
for (int input_index = 0; input_index < op_kernel_context.ImplicitInputCount(); ++input_index) {
|
||||
Fence_t fence = op_kernel_context.ImplicitInputFence(input_index);
|
||||
if (fence) {
|
||||
fence->BeforeUsingAsInput(p_op_kernel->Node().GetExecutionProviderType(), queue_id);
|
||||
auto execution_provider_type = p_op_kernel->Node().GetExecutionProviderType();
|
||||
if (OrtMemTypeCPUInput == p_op_kernel->KernelDef().InputMemoryType(input_index)) {
|
||||
execution_provider_type = kCpuExecutionProvider;
|
||||
}
|
||||
fence->BeforeUsingAsInput(execution_provider_type, queue_id);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -327,8 +327,6 @@ int real_main(int argc, char* argv[]) {
|
|||
broken_tests["maxpool_3d_default"] = "cudnn pooling only support input dimension >= 3";
|
||||
broken_tests["maxpool_1d_default"] = "cudnn pooling only support input dimension >= 3";
|
||||
|
||||
broken_tests["tf_inception_resnet_v2"] = "unknown failure on CUDA";
|
||||
broken_tests["tf_inception_v4"] = "unknown failure on CUDA";
|
||||
broken_tests["fp16_tiny_yolov2"] = "unknown failure on CUDA";
|
||||
broken_tests["fp16_shufflenet"] = "unknown failure on CUDA";
|
||||
broken_tests["fp16_inception_v1"] = "unknown failure on CUDA";
|
||||
|
|
|
|||
Loading…
Reference in a new issue