Update CUDA IsAllFinite kernel

This commit is contained in:
Jesse Benson 2020-11-18 11:27:50 -08:00 committed by Jesse Benson
parent bd96f60888
commit 86e30a2db6
3 changed files with 13 additions and 20 deletions

View file

@ -348,7 +348,7 @@ Status OptimizerGraphBuilder::AddFiniteGradientCheck(
ArgDef& grad_norm_finite_argdef,
const std::string& node_name) {
const TypeProto* const grad_norm_finite_type =
graph_defs.CreateTypeProto({1}, ONNX_NAMESPACE::TensorProto_DataType_BOOL);
graph_defs.CreateTypeProto({}, ONNX_NAMESPACE::TensorProto_DataType_BOOL);
grad_norm_finite_argdef =
ArgDef{nodearg_name_generator(node_name), grad_norm_finite_type};

View file

@ -1843,7 +1843,11 @@ Example 4:
"The output scalar. Its value is true if all input "
"tensors are finite. Otherwise, the output value would "
"be false.",
"T");
"T")
.TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
updateOutputShape(ctx, 0, {});
updateOutputElemType(ctx, 0, ONNX_NAMESPACE::TensorProto::BOOL);
});
static const char* All_doc = R"DOC(
Return true if all elements are true and false otherwise.

View file

@ -44,7 +44,6 @@ REGISTER_ISFINITE_KERNEL_TYPED(double)
T, \
kCudaExecutionProvider, \
KernelDefBuilder() \
.OutputMemoryType<OrtMemTypeCPUOutput>(0) \
.TypeConstraint("V", DataTypeImpl::GetTensorType<T>()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<bool>()), \
IsAllFiniteOp<T>);
@ -56,11 +55,11 @@ Status IsAllFiniteOp<TSrc>::ComputeInternal(OpKernelContext* context) const {
// Get Input tensor count.
const auto total_tensor_count = context->InputCount();
// Allocate GPU memory to capture the result computed by GPU kernel.
// The GPU result will be copied later to the output, which is located
// in CPU memory.
IAllocatorUniquePtr<bool> deviceOutput = GetScratchBuffer<bool>(1);
CUDA_RETURN_IF_ERROR(cudaMemsetAsync(deviceOutput.get(), int(true), sizeof(bool)));
// Initialize the output to true. GPU kernel will set it to false
// if any value in any tensor is non-finite.
Tensor& output = *context->Output(0, {});
auto output_data = reinterpret_cast<ToCudaType<bool>::MappedType*>(output.template MutableData<bool>());
CUDA_RETURN_IF_ERROR(cudaMemsetAsync(output_data, int(true), sizeof(bool)));
std::vector<std::vector<void*>> grouped_tensor_pointers(total_tensor_count);
std::vector<int> tensor_sizes(total_tensor_count);
@ -74,20 +73,10 @@ Status IsAllFiniteOp<TSrc>::ComputeInternal(OpKernelContext* context) const {
typedef IsAllFiniteFunctor<TSrcCuda> TFunctor;
TFunctor functor;
// Check if all values are finite and write true to deviceOutput.
// Check if all values are finite and write true to output.
// Otherwise, false will be written.
launch_multi_tensor_functor<1, TFunctor, bool*>(
2048 * 32, tensor_sizes, grouped_tensor_pointers, functor, deviceOutput.get());
// Copy GPU result in deviceOutput to CPU memory.
// Per this operator's schema, its output is in CPU memory.
Tensor& output = *context->Output(0, {});
CUDA_RETURN_IF_ERROR(
cudaMemcpy(
output.MutableData<bool>(),
deviceOutput.get(),
sizeof(bool),
cudaMemcpyDeviceToHost));
2048 * 32, tensor_sizes, grouped_tensor_pointers, functor, output_data);
return Status::OK();
}