Update CUDA IsAllFinite kernel

2026-07-08 17:17:15 +00:00 · 2020-11-18 11:27:50 -08:00 · 2020-11-18 11:27:50 -08:00 · 86e30a2db6
commit 86e30a2db6
parent bd96f60888
3 changed files with 13 additions and 20 deletions
--- a/orttraining/orttraining/core/graph/optimizer_graph_builder.cc
+++ b/orttraining/orttraining/core/graph/optimizer_graph_builder.cc
@ -348,7 +348,7 @@ Status OptimizerGraphBuilder::AddFiniteGradientCheck(
    ArgDef& grad_norm_finite_argdef,
    const std::string& node_name) {
  const TypeProto* const grad_norm_finite_type =
-      graph_defs.CreateTypeProto({1}, ONNX_NAMESPACE::TensorProto_DataType_BOOL);
+      graph_defs.CreateTypeProto({}, ONNX_NAMESPACE::TensorProto_DataType_BOOL);
  grad_norm_finite_argdef =
      ArgDef{nodearg_name_generator(node_name), grad_norm_finite_type};

--- a/orttraining/orttraining/core/graph/training_op_defs.cc
+++ b/orttraining/orttraining/core/graph/training_op_defs.cc
@ -1843,7 +1843,11 @@ Example 4:
          "The output scalar. Its value is true if all input "
          "tensors are finite. Otherwise, the output value would "
          "be false.",
-          "T");
+          "T")
+      .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
+        updateOutputShape(ctx, 0, {});
+        updateOutputElemType(ctx, 0, ONNX_NAMESPACE::TensorProto::BOOL);
+      });

  static const char* All_doc = R"DOC(
 Return true if all elements are true and false otherwise.
--- a/orttraining/orttraining/training_ops/cuda/math/isfinite.cc
+++ b/orttraining/orttraining/training_ops/cuda/math/isfinite.cc
@ -44,7 +44,6 @@ REGISTER_ISFINITE_KERNEL_TYPED(double)
      T,                                                             \
      kCudaExecutionProvider,                                        \
      KernelDefBuilder()                                             \
-          .OutputMemoryType<OrtMemTypeCPUOutput>(0)                  \
          .TypeConstraint("V", DataTypeImpl::GetTensorType<T>())     \
          .TypeConstraint("T", DataTypeImpl::GetTensorType<bool>()), \
      IsAllFiniteOp<T>);
@ -56,11 +55,11 @@ Status IsAllFiniteOp<TSrc>::ComputeInternal(OpKernelContext* context) const {
  // Get Input tensor count.
  const auto total_tensor_count = context->InputCount();

-  // Allocate GPU memory to capture the result computed by GPU kernel.
-  // The GPU result will be copied later to the output which locates
-  // on CPU memory.
-  IAllocatorUniquePtr<bool> deviceOutput = GetScratchBuffer<bool>(1);
-  CUDA_RETURN_IF_ERROR(cudaMemsetAsync(deviceOutput.get(), int(true), sizeof(bool)));
+  // Initialize the output to true.  GPU kernel will set it to false
+  // if any value in any tensor is non-finite.
+  Tensor& output = *context->Output(0, {});
+  auto output_data = reinterpret_cast<ToCudaType<bool>::MappedType*>(output.template MutableData<bool>());
+  CUDA_RETURN_IF_ERROR(cudaMemsetAsync(output_data, int(true), sizeof(bool)));

  std::vector<std::vector<void*>> grouped_tensor_pointers(total_tensor_count);
  std::vector<int> tensor_sizes(total_tensor_count);
@ -74,20 +73,10 @@ Status IsAllFiniteOp<TSrc>::ComputeInternal(OpKernelContext* context) const {
  typedef IsAllFiniteFunctor<TSrcCuda> TFunctor;
  TFunctor functor;

-  // Check if all values are finite and write true to deviceOutput.
+  // Check if all values are finite and write true to output.
  // Otherwise, false will be written.
  launch_multi_tensor_functor<1, TFunctor, bool*>(
-      2048 * 32, tensor_sizes, grouped_tensor_pointers, functor, deviceOutput.get());
-
-  // Copy GPU result in deviceOutput to CPU memory.
-  // Per this operator's schema, it's output is in CPU memory.
-  Tensor& output = *context->Output(0, {});
-  CUDA_RETURN_IF_ERROR(
-      cudaMemcpy(
-          output.MutableData<bool>(),
-          deviceOutput.get(),
-          sizeof(bool),
-          cudaMemcpyDeviceToHost));
+      2048 * 32, tensor_sizes, grouped_tensor_pointers, functor, output_data);

  return Status::OK();
 }