mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-23 22:13:38 +00:00
Update CUDA IsAllFinite kernel
This commit is contained in:
parent
bd96f60888
commit
86e30a2db6
3 changed files with 13 additions and 20 deletions
|
|
@ -348,7 +348,7 @@ Status OptimizerGraphBuilder::AddFiniteGradientCheck(
|
|||
ArgDef& grad_norm_finite_argdef,
|
||||
const std::string& node_name) {
|
||||
const TypeProto* const grad_norm_finite_type =
|
||||
graph_defs.CreateTypeProto({1}, ONNX_NAMESPACE::TensorProto_DataType_BOOL);
|
||||
graph_defs.CreateTypeProto({}, ONNX_NAMESPACE::TensorProto_DataType_BOOL);
|
||||
grad_norm_finite_argdef =
|
||||
ArgDef{nodearg_name_generator(node_name), grad_norm_finite_type};
|
||||
|
||||
|
|
|
|||
|
|
@ -1843,7 +1843,11 @@ Example 4:
|
|||
"The output scalar. Its value is true if all input "
|
||||
"tensors are finite. Otherwise, the output value would "
|
||||
"be false.",
|
||||
"T");
|
||||
"T")
|
||||
.TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
|
||||
updateOutputShape(ctx, 0, {});
|
||||
updateOutputElemType(ctx, 0, ONNX_NAMESPACE::TensorProto::BOOL);
|
||||
});
|
||||
|
||||
static const char* All_doc = R"DOC(
|
||||
Return true if all elements are true and false otherwise.
|
||||
|
|
|
|||
|
|
@ -44,7 +44,6 @@ REGISTER_ISFINITE_KERNEL_TYPED(double)
|
|||
T, \
|
||||
kCudaExecutionProvider, \
|
||||
KernelDefBuilder() \
|
||||
.OutputMemoryType<OrtMemTypeCPUOutput>(0) \
|
||||
.TypeConstraint("V", DataTypeImpl::GetTensorType<T>()) \
|
||||
.TypeConstraint("T", DataTypeImpl::GetTensorType<bool>()), \
|
||||
IsAllFiniteOp<T>);
|
||||
|
|
@ -56,11 +55,11 @@ Status IsAllFiniteOp<TSrc>::ComputeInternal(OpKernelContext* context) const {
|
|||
// Get Input tensor count.
|
||||
const auto total_tensor_count = context->InputCount();
|
||||
|
||||
// Allocate GPU memory to capture the result computed by GPU kernel.
|
||||
// The GPU result will be copied later to the output which locates
|
||||
// on CPU memory.
|
||||
IAllocatorUniquePtr<bool> deviceOutput = GetScratchBuffer<bool>(1);
|
||||
CUDA_RETURN_IF_ERROR(cudaMemsetAsync(deviceOutput.get(), int(true), sizeof(bool)));
|
||||
// Initialize the output to true. GPU kernel will set it to false
|
||||
// if any value in any tensor is non-finite.
|
||||
Tensor& output = *context->Output(0, {});
|
||||
auto output_data = reinterpret_cast<ToCudaType<bool>::MappedType*>(output.template MutableData<bool>());
|
||||
CUDA_RETURN_IF_ERROR(cudaMemsetAsync(output_data, int(true), sizeof(bool)));
|
||||
|
||||
std::vector<std::vector<void*>> grouped_tensor_pointers(total_tensor_count);
|
||||
std::vector<int> tensor_sizes(total_tensor_count);
|
||||
|
|
@ -74,20 +73,10 @@ Status IsAllFiniteOp<TSrc>::ComputeInternal(OpKernelContext* context) const {
|
|||
typedef IsAllFiniteFunctor<TSrcCuda> TFunctor;
|
||||
TFunctor functor;
|
||||
|
||||
// Check if all values are finite and write true to deviceOutput.
|
||||
// Check if all values are finite and write true to output.
|
||||
// Otherwise, false will be written.
|
||||
launch_multi_tensor_functor<1, TFunctor, bool*>(
|
||||
2048 * 32, tensor_sizes, grouped_tensor_pointers, functor, deviceOutput.get());
|
||||
|
||||
// Copy GPU result in deviceOutput to CPU memory.
|
||||
// Per this operator's schema, its output is in CPU memory.
|
||||
Tensor& output = *context->Output(0, {});
|
||||
CUDA_RETURN_IF_ERROR(
|
||||
cudaMemcpy(
|
||||
output.MutableData<bool>(),
|
||||
deviceOutput.get(),
|
||||
sizeof(bool),
|
||||
cudaMemcpyDeviceToHost));
|
||||
2048 * 32, tensor_sizes, grouped_tensor_pointers, functor, output_data);
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue