Warn the user when nondet kernels are invoked in det mode (#16571)

### Give user warnings if nondeterministic kernels got called when Deterministic flag is set When we do accuracy investigation (for example training convergence issue debug), usually we will set `use_deterministic_compute ` to be true. ``` SessionOptions sess_options; sess_options.use_deterministic_compute = true; ``` While in recent investigation, it is found GatherElementsGrad kernel (who used atomic add) generate non-deterministic results, making a deberta model ouput pretty different loss curve every time we run it even we fix the seed, remove the dropout ratio, and set use_deterministic_compute to be true. It turned out to be an expected problem if we do the add in different order by cuda threads. The order cannot be guaranteed. So this PR will give warnings when users set `use_deterministic_compute `, but some kernels don't have determinstic kernel impl, has to run with non-determinstic impls. This would at least let users know the results is not determinstic though that flag is set to be True. ![image](https://github.com/microsoft/onnxruntime/assets/10530022/99ff60f5-21a4-44cf-bf5b-323d698b7147) Only print the message once in case it floods training logs.
2026-07-29 20:14:01 +00:00 · 2023-07-11 11:45:47 +08:00 · 2023-07-11 11:45:47 +08:00 · 15cb2f5a8a
commit 15cb2f5a8a
parent b4c4e2b594
11 changed files with 98 additions and 27 deletions
--- a/onnxruntime/contrib_ops/cuda/diffusion/group_norm.cc
+++ b/onnxruntime/contrib_ops/cuda/diffusion/group_norm.cc
@ -111,6 +111,13 @@ Status GroupNorm::ComputeInternal(OpKernelContext* context) const {
                           "number of channels should be divisiable by num_groups");
  }

+  if (context->GetUseDeterministicCompute()) {
+    static std::once_flag log_warning;
+    std::call_once(log_warning, []() {
+      LOGS_DEFAULT(WARNING) << "GroupNorm has no deterministic CUDA kernel, its outputs may still be nondeterministic.";
+    });
+  }
+
  auto workspace = GetScratchBuffer<void>(GetGroupNormWorkspaceSizeInBytes(), context->GetComputeStream());

  utils::MLTypeCallDispatcher<GROUP_NORM_TYPES> dispatcher(input->GetElementType());
--- a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc
+++ b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc
@ -147,6 +147,7 @@ Status TopK(const Tensor* input, const int axis, const unsigned k, bool largest,
  Status result;
  if (input->IsDataType<float>()) {
    result = TopKImpl<float>(nullptr,  // We limit number of beams in BeamSearchParameters, so K <= 256 and use NULL here
+                             false /*use_deterministic_compute*/,
                             stream,
                             input->Data<float>(),
                             static_cast<float*>(output_values.MutableDataRaw()),
@ -161,6 +162,7 @@ Status TopK(const Tensor* input, const int axis, const unsigned k, bool largest,
                             dimension);
  } else if (input->IsDataType<MLFloat16>()) {
    result = TopKImpl<MLFloat16>(nullptr,
+                                 false /*use_deterministic_compute*/,
                                 stream,
                                 input->Data<MLFloat16>(),
                                 static_cast<MLFloat16*>(output_values.MutableDataRaw()),
--- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm.cc
+++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm.cc
@ -110,7 +110,14 @@ Status GroupNorm::ComputeInternal(OpKernelContext* context) const {

  if (num_channels % num_groups_ != 0) {
    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                           "number of channels should be divisiable by num_groups");
+                           "number of channels should be divisible by num_groups");
+  }
+
+  if (context->GetUseDeterministicCompute()) {
+    static std::once_flag log_warning;
+    std::call_once(log_warning, []() {
+      LOGS_DEFAULT(WARNING) << "GroupNorm has no deterministic GPU kernel, its outputs may still be nondeterministic.";
+    });
  }

  auto workspace = GetScratchBuffer<void>(GetGroupNormWorkspaceSizeInBytes(), context->GetComputeStream());
--- a/onnxruntime/core/providers/cuda/math/topk.cc
+++ b/onnxruntime/core/providers/cuda/math/topk.cc
@ -61,11 +61,12 @@ TopK<inputk>::TopK(const OpKernelInfo& info) : CudaKernel(info) {
 }

 #define IS_PRIM_TYPE(T) utils::IsPrimitiveDataType<T>(prim_type)
-#define TOPKIMPL(T) TopKImpl<T>(this, ctx->GetComputeStream(), tensor_X->Data<T>(), \
-                                static_cast<T*>(tensor_V->MutableDataRaw()),        \
-                                static_cast<int64_t*>(tensor_I->MutableDataRaw()),  \
-                                elem_nums_cuda,                                     \
-                                elem_nums.size(),                                   \
+#define TOPKIMPL(T) TopKImpl<T>(this, use_deterministic_compute,                   \
+                                ctx->GetComputeStream(), tensor_X->Data<T>(),      \
+                                static_cast<T*>(tensor_V->MutableDataRaw()),       \
+                                static_cast<int64_t*>(tensor_I->MutableDataRaw()), \
+                                elem_nums_cuda,                                    \
+                                elem_nums.size(),                                  \
                                axis, K_, largest_, sorted_, N, dimension)

 template <bool inputk>
@ -106,11 +107,14 @@ Status TopK<inputk>::ComputeInternal(OpKernelContext* ctx) const {
    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for TopK operator");
  }

+  bool use_deterministic_compute = ctx->GetUseDeterministicCompute();
+
  if (IS_PRIM_TYPE(int32_t)) return TOPKIMPL(int32_t);
  if (IS_PRIM_TYPE(int64_t)) return TOPKIMPL(int64_t);
  if (IS_PRIM_TYPE(MLFloat16)) return TOPKIMPL(MLFloat16);
  if (IS_PRIM_TYPE(float)) return TOPKIMPL(float);
  if (IS_PRIM_TYPE(double)) return TOPKIMPL(double);
+
  return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for TopK operator");
 }

--- a/onnxruntime/core/providers/cuda/math/topk_impl.cuh
+++ b/onnxruntime/core/providers/cuda/math/topk_impl.cuh
@ -398,7 +398,10 @@ __global__ void ExcludeOutput(T* output_i, T K, T dimension) {
 }

 template <typename T>
-Status TopKImpl(const CudaKernel* kernel, Stream* ort_stream, const T* input_x, T* output_v, int64_t* output_i, const TArray<int64_t>& elem_nums, size_t size, int32_t axis, int64_t K, int64_t largest, int64_t sorted, int64_t N, int64_t dimension) {
+Status TopKImpl(const CudaKernel* kernel, bool use_deterministic_compute,
+                Stream* ort_stream, const T* input_x, T* output_v, int64_t* output_i,
+                const TArray<int64_t>& elem_nums, size_t size, int32_t axis, int64_t K, int64_t largest,
+                int64_t sorted, int64_t N, int64_t dimension) {
  typedef typename ToCudaType<T>::MappedType CudaT;
  const CudaT* input_x_ptr = reinterpret_cast<const CudaT*>(input_x);
  CudaT* output_v_ptr = reinterpret_cast<CudaT*>(output_v);
@ -407,17 +410,34 @@ Status TopKImpl(const CudaKernel* kernel, Stream* ort_stream, const T* input_x,
  auto aligned_K = ALIGN(K);
  auto aligned_dimension = ALIGN(dimension);
  if (aligned_dimension <= GridDim::maxThreadsPerBlock) {
-    BitonicTopK<CudaT><<<N, GridDim::maxThreadsPerBlock, aligned_dimension * sizeof(KV<CudaT>), stream>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, aligned_K, largest, sorted, dimension, aligned_dimension, NumericLimits<T>::Min(), NumericLimits<T>::Max());
+    BitonicTopK<CudaT><<<N, GridDim::maxThreadsPerBlock, aligned_dimension * sizeof(KV<CudaT>), stream>>>(
+        input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, aligned_K, largest, sorted, dimension,
+        aligned_dimension, NumericLimits<T>::Min(), NumericLimits<T>::Max());
  } else if (K <= BT * 16 || 0 == sorted) {
+    if (use_deterministic_compute) {
+      static std::once_flag log_warning;
+      std::call_once(log_warning, []() {
+        LOGS_DEFAULT(WARNING) << "Non-deterministic TopKImpl kernel is called, its outputs may still be nondeterministic.";
+      });
+    }
+
    auto XPT = static_cast<int64_t>(ceil(static_cast<double>(dimension) / GridDim::maxThreadsPerBlock));
    if (BT * 2 >= K || 0 == sorted) {
-      RadixTopK<CudaT, BT, 2><<<N, BT, 256 * sizeof(uint32_t), stream>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits<T>::Min(), NumericLimits<T>::Max());
+      RadixTopK<CudaT, BT, 2><<<N, BT, 256 * sizeof(uint32_t), stream>>>(
+          input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT,
+          NumericLimits<T>::Min(), NumericLimits<T>::Max());
    } else if (BT * 4 >= K) {
-      RadixTopK<CudaT, BT, 4><<<N, BT, 256 * sizeof(uint32_t), stream>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits<T>::Min(), NumericLimits<T>::Max());
+      RadixTopK<CudaT, BT, 4><<<N, BT, 256 * sizeof(uint32_t), stream>>>(
+          input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT,
+          NumericLimits<T>::Min(), NumericLimits<T>::Max());
    } else if (BT * 8 >= K) {
-      RadixTopK<CudaT, BT, 8><<<N, BT, 256 * sizeof(uint32_t), stream>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits<T>::Min(), NumericLimits<T>::Max());
+      RadixTopK<CudaT, BT, 8><<<N, BT, 256 * sizeof(uint32_t), stream>>>(
+          input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT,
+          NumericLimits<T>::Min(), NumericLimits<T>::Max());
    } else {
-      RadixTopK<CudaT, BT, 16><<<N, BT, 256 * sizeof(uint32_t), stream>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits<T>::Min(), NumericLimits<T>::Max());
+      RadixTopK<CudaT, BT, 16><<<N, BT, 256 * sizeof(uint32_t), stream>>>(
+          input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT,
+          NumericLimits<T>::Min(), NumericLimits<T>::Max());
    }
  } else {
    auto input_key_buffer = kernel->GetScratchBuffer<CudaT>(dimension, ort_stream);
@ -451,6 +471,7 @@ Status TopKImpl(const CudaKernel* kernel, Stream* ort_stream, const T* input_x,
 }

 #define TOPKIMPLE(T) template Status TopKImpl<T>(const CudaKernel* kernel,         \
+                                                 bool use_deterministic_compute,   \
                                                 Stream* ort_stream,               \
                                                 const T* input_x,                 \
                                                 T* output_v,                      \
@ -464,7 +485,7 @@ Status TopKImpl(const CudaKernel* kernel, Stream* ort_stream, const T* input_x,
                                                 int64_t N,                        \
                                                 int64_t dimension)

-// This file is causing excessive long compilation time in ROCm EP. Split all those compilation into multiple
+// This file is causing excessive long compilation time in ROCm EP. Split all those compilations into multiple
 // translation units to speed it up.
 TOPKIMPLE(TOPK_IMPL_TYPE);

--- a/onnxruntime/core/providers/cuda/math/topk_impl.h
+++ b/onnxruntime/core/providers/cuda/math/topk_impl.h
@ -11,7 +11,9 @@ namespace onnxruntime {
 namespace cuda {

 template <typename T>
-Status TopKImpl(const CudaKernel* kernel, Stream* ort_stream, const T* input_x, T* output_v, int64_t* output_i, const TArray<int64_t>& elem_nums, size_t size, int32_t axis, int64_t K, int64_t largest, int64_t sorted, int64_t N, int64_t dimension);
+Status TopKImpl(const CudaKernel* kernel, bool use_deterministic_compute, Stream* ort_stream,
+                const T* input_x, T* output_v, int64_t* output_i, const TArray<int64_t>& elem_nums,
+                size_t size, int32_t axis, int64_t K, int64_t largest, int64_t sorted, int64_t N, int64_t dimension);

 }  // namespace cuda
 }  // namespace onnxruntime
--- a/onnxruntime/core/providers/cuda/tensor/gather_elements_impl.cu
+++ b/onnxruntime/core/providers/cuda/tensor/gather_elements_impl.cu
@ -276,8 +276,12 @@ struct FuncAtomicAdd {
 };

 template <typename T, typename TIndex>
-Status GatherElementsGradImpl(cudaStream_t stream, const TIndex* indices_data, const T* updates_data, T* output_data,
-                              const GatherScatterElementsArgs& args) {
+Status GatherElementsGradNonDeterministicImpl(cudaStream_t stream, const TIndex* indices_data, const T* updates_data,
+                                              T* output_data,
+                                              const GatherScatterElementsArgs& args) {
+  // Be noted: usage of AtomicAdd is not deterministic if there are duplicated indices to update.
+  // That's the reason we name this function as non-deterministic.
+
  // Give output_data as the input_data parameter by intention,
  // to skip input_data copy, which is not applicable for GatherElementsGrad.
  // output_data's numel is same as input_data's numel.
@ -285,10 +289,10 @@ Status GatherElementsGradImpl(cudaStream_t stream, const TIndex* indices_data, c
                                     FuncAtomicAdd<T>(static_cast<size_t>(args.input_size)));
 }

-#define GATHER_ELEMENTS_GRAD_SPECIALIZED_TINDEX_IMPL(T, TIndex)                                      \
-  template Status GatherElementsGradImpl<T, TIndex>(cudaStream_t stream, const TIndex* indices_data, \
-                                                    const T* updates_data, T* output_data,           \
-                                                    const GatherScatterElementsArgs& args);
+#define GATHER_ELEMENTS_GRAD_SPECIALIZED_TINDEX_IMPL(T, TIndex)                                                      \
+  template Status GatherElementsGradNonDeterministicImpl<T, TIndex>(cudaStream_t stream, const TIndex* indices_data, \
+                                                                    const T* updates_data, T* output_data,           \
+                                                                    const GatherScatterElementsArgs& args);

 #define GATHER_ELEMENTS_GRAD_SPECIALIZED_SCATTER_ADD_IMPL(T) \
  GATHER_ELEMENTS_GRAD_SPECIALIZED_TINDEX_IMPL(T, int32_t)   \
--- a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc
+++ b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc
@ -348,6 +348,13 @@ Status launch_lamb_reduction(

  // Only launch multi-tensor function if we have at least one tensor in the buckets.
  if (tensor_sizes_in_buckets.size() > 0 && buckets.size() > 0) {
+    if (ctx->GetUseDeterministicCompute()) {
+      static std::once_flag log_warning;
+      std::call_once(log_warning, []() {
+        LOGS_DEFAULT(WARNING) << "Non-deterministic Lamb GPU kernel is called, its outputs may still be nondeterministic.";
+      });
+    }
+
    typedef LambMultiTensorReductionFunctor<CudaTIn1, CudaTIn2, CudaTNorm, CudaTNorm, CudaTNorm> TReducer;
    TReducer reducer;
    launch_multi_tensor_functor<tensor_count_per_group, TReducer>(
--- a/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad.cc
+++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad.cc
@ -29,10 +29,11 @@ ONNX_OPERATOR_KERNEL_EX(GatherElementsGrad, kMSDomain, 1, kCudaExecutionProvider

 #undef CREATE_GATHER_ELEMENTS_GRAD_KERNEL_DEF

-#define CASE_GATHER_ELEMENTS_GRAD_IMPL(type)                                                            \
-  case sizeof(type): {                                                                                  \
-    const type* indices_data = reinterpret_cast<const type*>(indices_data_raw);                         \
-    ORT_RETURN_IF_ERROR(GatherElementsGradImpl(stream, indices_data, updates_data, output_data, args)); \
+#define CASE_GATHER_ELEMENTS_GRAD_IMPL(type)                                                       \
+  case sizeof(type): {                                                                             \
+    const type* indices_data = reinterpret_cast<const type*>(indices_data_raw);                    \
+    ORT_RETURN_IF_ERROR(GatherElementsGradNonDeterministicImpl(stream, indices_data, updates_data, \
+                                                               output_data, args));                \
  } break

 template <typename T>
@ -100,8 +101,16 @@ Status GatherElementsGrad::ComputeInternal(OpKernelContext* context) const {
    ORT_THROW("Unsupported element size by the GatherElementsGrad CUDA kernel");
  }

+  if (context->GetUseDeterministicCompute()) {
+    static std::once_flag log_warning;
+    std::call_once(log_warning, []() {
+      LOGS_DEFAULT(WARNING) << "GatherElementsGrad has no deterministic GPU kernel, its outputs may still be nondeterministic.";
+    });
+  }
+
  utils::MLTypeCallDispatcher<MLFloat16, float, double> t_disp(dtype);
-  return t_disp.InvokeRet<Status, ComputeImpl>(Stream(context), dY->DataRaw(), indices_tensor->DataRaw(), dX->MutableDataRaw(),
+  return t_disp.InvokeRet<Status, ComputeImpl>(Stream(context), dY->DataRaw(), indices_tensor->DataRaw(),
+                                               dX->MutableDataRaw(),
                                               indices_tensor->DataType()->Size(), args);
 }

--- a/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad_impl.h
+++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad_impl.h
@ -12,8 +12,9 @@ namespace cuda {
 struct GatherScatterElementsArgs;

 template <typename T, typename TIndex>
-Status GatherElementsGradImpl(cudaStream_t stream, const TIndex* indices_data, const T* updates_data, T* output_data,
-                              const GatherScatterElementsArgs& args);
+Status GatherElementsGradNonDeterministicImpl(cudaStream_t stream, const TIndex* indices_data,
+                                              const T* updates_data, T* output_data,
+                                              const GatherScatterElementsArgs& args);

 }  // namespace cuda
 }  // namespace onnxruntime
--- a/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad.cc
+++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad.cc
@ -80,6 +80,13 @@ Status GatherNDGrad<TIndex>::ComputeInternal(OpKernelContext* context) const {
                                             batch_dims_, input_shape, indices_shape, indices_tensor,
                                             num_slices, slice_size, input_slice_offsets_buffer));

+  if (context->GetUseDeterministicCompute()) {
+    static std::once_flag log_warning;
+    std::call_once(log_warning, []() {
+      LOGS_DEFAULT(WARNING) << "GatherNDGrad has no deterministic GPU kernel, its outputs may still be nondeterministic.";
+    });
+  }
+
  const void* const kernel_input_data = update_tensor->DataRaw();
  void* const kernel_output_data = output_tensor->MutableDataRaw();
  utils::MLTypeCallDispatcher<float, MLFloat16, double, BFloat16> t_disp(update_tensor->GetElementType());