Fix the issue which causes wrong output. (#342)

Root cause:
cudaStreamWaitEvent is used after copying data from GPU memory to CPU memory, but the following node has CPU code that depends on that data. cudaEventSynchronize should be used instead, so the host actually blocks until the copy completes.
Fix:
Add code in the executors to check the input memory type first. If the kernel expects the input in CPU memory, pass the CPUExecutionProvider type to BeforeUsingAsInput; the fence will then use cudaEventSynchronize to wait on the write event.
This commit is contained in:
Hector Li 2019-01-16 14:47:18 -08:00 committed by GitHub
parent 5d0e024284
commit 790cda6ea7
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 20 additions and 6 deletions

View file

@ -138,14 +138,22 @@ void ParallelExecutor::RunNodeAsyncInternal(size_t p_node_index,
for (int input_index = 0; input_index < op_kernel_context.InputCount(); ++input_index) {
Fence_t fence = op_kernel_context.InputFence(input_index);
if (fence) {
fence->BeforeUsingAsInput(p_op_kernel->Node().GetExecutionProviderType(), queue_id);
auto execution_provider_type = p_op_kernel->Node().GetExecutionProviderType();
if (OrtMemTypeCPUInput == p_op_kernel->KernelDef().InputMemoryType(input_index)) {
execution_provider_type = kCpuExecutionProvider;
}
fence->BeforeUsingAsInput(execution_provider_type, queue_id);
}
}
for (int input_index = 0; input_index < op_kernel_context.ImplicitInputCount(); ++input_index) {
Fence_t fence = op_kernel_context.ImplicitInputFence(input_index);
if (fence) {
fence->BeforeUsingAsInput(p_op_kernel->Node().GetExecutionProviderType(), queue_id);
auto execution_provider_type = p_op_kernel->Node().GetExecutionProviderType();
if (OrtMemTypeCPUInput == p_op_kernel->KernelDef().InputMemoryType(input_index)) {
execution_provider_type = kCpuExecutionProvider;
}
fence->BeforeUsingAsInput(execution_provider_type, queue_id);
}
}

View file

@ -78,14 +78,22 @@ Status SequentialExecutor::Execute(const SessionState& session_state,
for (int input_index = 0; input_index < op_kernel_context.InputCount(); ++input_index) {
Fence_t fence = op_kernel_context.InputFence(input_index);
if (fence) {
fence->BeforeUsingAsInput(p_op_kernel->Node().GetExecutionProviderType(), queue_id);
auto execution_provider_type = p_op_kernel->Node().GetExecutionProviderType();
if (OrtMemTypeCPUInput == p_op_kernel->KernelDef().InputMemoryType(input_index)) {
execution_provider_type = kCpuExecutionProvider;
}
fence->BeforeUsingAsInput(execution_provider_type, queue_id);
}
}
for (int input_index = 0; input_index < op_kernel_context.ImplicitInputCount(); ++input_index) {
Fence_t fence = op_kernel_context.ImplicitInputFence(input_index);
if (fence) {
fence->BeforeUsingAsInput(p_op_kernel->Node().GetExecutionProviderType(), queue_id);
auto execution_provider_type = p_op_kernel->Node().GetExecutionProviderType();
if (OrtMemTypeCPUInput == p_op_kernel->KernelDef().InputMemoryType(input_index)) {
execution_provider_type = kCpuExecutionProvider;
}
fence->BeforeUsingAsInput(execution_provider_type, queue_id);
}
}

View file

@ -327,8 +327,6 @@ int real_main(int argc, char* argv[]) {
broken_tests["maxpool_3d_default"] = "cudnn pooling only support input dimension >= 3";
broken_tests["maxpool_1d_default"] = "cudnn pooling only support input dimension >= 3";
broken_tests["tf_inception_resnet_v2"] = "unknown failure on CUDA";
broken_tests["tf_inception_v4"] = "unknown failure on CUDA";
broken_tests["fp16_tiny_yolov2"] = "unknown failure on CUDA";
broken_tests["fp16_shufflenet"] = "unknown failure on CUDA";
broken_tests["fp16_inception_v1"] = "unknown failure on CUDA";