Fix the issue which causes wrong output. (#342)

Root cause:
cudaStreamWaitEvent is used after copying data from GPU memory to CPU memory, but the following node has CPU code that depends on that data. cudaEventSynchronize should be used instead, so the host actually blocks until the copy completes.
Fix:
Add code in the executors to check the input memory type first. If the kernel expects the input in CPU memory, pass the CPUExecutionProvider type to BeforeUsingAsInput; the fence will then use cudaEventSynchronize to wait on the write event.
This commit is contained in:
Hector Li 2019-01-16 14:47:18 -08:00 committed by GitHub
parent 5d0e024284
commit 790cda6ea7
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 20 additions and 6 deletions

View file

@ -138,14 +138,22 @@ void ParallelExecutor::RunNodeAsyncInternal(size_t p_node_index,
for (int input_index = 0; input_index < op_kernel_context.InputCount(); ++input_index) {
Fence_t fence = op_kernel_context.InputFence(input_index);
if (fence) {
fence->BeforeUsingAsInput(p_op_kernel->Node().GetExecutionProviderType(), queue_id);
auto execution_provider_type = p_op_kernel->Node().GetExecutionProviderType();
if (OrtMemTypeCPUInput == p_op_kernel->KernelDef().InputMemoryType(input_index)) {
execution_provider_type = kCpuExecutionProvider;
}
fence->BeforeUsingAsInput(execution_provider_type, queue_id);
}
}
for (int input_index = 0; input_index < op_kernel_context.ImplicitInputCount(); ++input_index) {
Fence_t fence = op_kernel_context.ImplicitInputFence(input_index);
if (fence) {
fence->BeforeUsingAsInput(p_op_kernel->Node().GetExecutionProviderType(), queue_id);
auto execution_provider_type = p_op_kernel->Node().GetExecutionProviderType();
if (OrtMemTypeCPUInput == p_op_kernel->KernelDef().InputMemoryType(input_index)) {
execution_provider_type = kCpuExecutionProvider;
}
fence->BeforeUsingAsInput(execution_provider_type, queue_id);
}
}

View file

@ -78,14 +78,22 @@ Status SequentialExecutor::Execute(const SessionState& session_state,
for (int input_index = 0; input_index < op_kernel_context.InputCount(); ++input_index) {
Fence_t fence = op_kernel_context.InputFence(input_index);
if (fence) {
fence->BeforeUsingAsInput(p_op_kernel->Node().GetExecutionProviderType(), queue_id);
auto execution_provider_type = p_op_kernel->Node().GetExecutionProviderType();
if (OrtMemTypeCPUInput == p_op_kernel->KernelDef().InputMemoryType(input_index)) {
execution_provider_type = kCpuExecutionProvider;
}
fence->BeforeUsingAsInput(execution_provider_type, queue_id);
}
}
for (int input_index = 0; input_index < op_kernel_context.ImplicitInputCount(); ++input_index) {
Fence_t fence = op_kernel_context.ImplicitInputFence(input_index);
if (fence) {
fence->BeforeUsingAsInput(p_op_kernel->Node().GetExecutionProviderType(), queue_id);
auto execution_provider_type = p_op_kernel->Node().GetExecutionProviderType();
if (OrtMemTypeCPUInput == p_op_kernel->KernelDef().InputMemoryType(input_index)) {
execution_provider_type = kCpuExecutionProvider;
}
fence->BeforeUsingAsInput(execution_provider_type, queue_id);
}
}

View file

@ -327,8 +327,6 @@ int real_main(int argc, char* argv[]) {
broken_tests["maxpool_3d_default"] = "cudnn pooling only support input dimension >= 3";
broken_tests["maxpool_1d_default"] = "cudnn pooling only support input dimension >= 3";
broken_tests["tf_inception_resnet_v2"] = "unknown failure on CUDA";
broken_tests["tf_inception_v4"] = "unknown failure on CUDA";
broken_tests["fp16_tiny_yolov2"] = "unknown failure on CUDA";
broken_tests["fp16_shufflenet"] = "unknown failure on CUDA";
broken_tests["fp16_inception_v1"] = "unknown failure on CUDA";