From 15cb2f5a8aa1bdbbb71de9560dc8835a943264d5 Mon Sep 17 00:00:00 2001
From: pengwa <pengwa@microsoft.com>
Date: Tue, 11 Jul 2023 11:45:47 +0800
Subject: [PATCH] Warn the user when nondet kernels are invoked in det mode
 (#16571)

### Give user warnings if nondeterministic kernels got called when
Deterministic flag is set

When we do accuracy investigation (for example training convergence
issue debug), usually we will set `use_deterministic_compute ` to be
true.

```
 SessionOptions sess_options;
 sess_options.use_deterministic_compute = true;
```

While in recent investigation, it is found GatherElementsGrad kernel
(who used atomic add) generate non-deterministic results, making a
deberta model ouput pretty different loss curve every time we run it
even we fix the seed, remove the dropout ratio, and set
use_deterministic_compute to be true. It turned out to be an expected
problem if we do the add in different order by cuda threads. The order
cannot be guaranteed.

So this PR will give warnings when users set `use_deterministic_compute
`, but some kernels don't have determinstic kernel impl, has to run with
non-determinstic impls. This would at least let users know the results
is not determinstic though that flag is set to be True.


![image](https://github.com/microsoft/onnxruntime/assets/10530022/99ff60f5-21a4-44cf-bf5b-323d698b7147)

Only print the message once in case it floods training logs.
---
 .../contrib_ops/cuda/diffusion/group_norm.cc  |  7 ++++
 .../transformers/generation_device_helper.cc  |  2 ++
 .../contrib_ops/rocm/diffusion/group_norm.cc  |  9 ++++-
 onnxruntime/core/providers/cuda/math/topk.cc  | 14 +++++---
 .../core/providers/cuda/math/topk_impl.cuh    | 35 +++++++++++++++----
 .../core/providers/cuda/math/topk_impl.h      |  4 ++-
 .../cuda/tensor/gather_elements_impl.cu       | 16 +++++----
 .../training_ops/cuda/optimizer/lamb.cc       |  7 ++++
 .../cuda/tensor/gather_elements_grad.cc       | 19 +++++++---
 .../cuda/tensor/gather_elements_grad_impl.h   |  5 +--
 .../cuda/tensor/gather_nd_grad.cc             |  7 ++++
 11 files changed, 98 insertions(+), 27 deletions(-)
diff --git a/onnxruntime/contrib_ops/cuda/diffusion/group_norm.cc b/onnxruntime/contrib_ops/cuda/diffusion/group_norm.cc
index 36a2bd1125..0fa7627414 100644
--- a/onnxruntime/contrib_ops/cuda/diffusion/group_norm.cc
+++ b/onnxruntime/contrib_ops/cuda/diffusion/group_norm.cc
@@ -111,6 +111,13 @@ Status GroupNorm::ComputeInternal(OpKernelContext* context) const {
                            "number of channels should be divisiable by num_groups");
   }
 
+  if (context->GetUseDeterministicCompute()) {
+    static std::once_flag log_warning;
+    std::call_once(log_warning, []() {
+      LOGS_DEFAULT(WARNING) << "GroupNorm has no deterministic CUDA kernel, its outputs may still be nondeterministic.";
+    });
+  }
+
   auto workspace = GetScratchBuffer<void>(GetGroupNormWorkspaceSizeInBytes(), context->GetComputeStream());
 
   utils::MLTypeCallDispatcher<GROUP_NORM_TYPES> dispatcher(input->GetElementType());
diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc
index c23317d3ce..2ab99beaf0 100644
--- a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc
+++ b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc
@@ -147,6 +147,7 @@ Status TopK(const Tensor* input, const int axis, const unsigned k, bool largest,
   Status result;
   if (input->IsDataType<float>()) {
     result = TopKImpl<float>(nullptr,  // We limit number of beams in BeamSearchParameters, so K <= 256 and use NULL here
+                             false /*use_deterministic_compute*/,
                              stream,
                              input->Data<float>(),
                              static_cast<float*>(output_values.MutableDataRaw()),
@@ -161,6 +162,7 @@ Status TopK(const Tensor* input, const int axis, const unsigned k, bool largest,
                              dimension);
   } else if (input->IsDataType<MLFloat16>()) {
     result = TopKImpl<MLFloat16>(nullptr,
+                                 false /*use_deterministic_compute*/,
                                  stream,
                                  input->Data<MLFloat16>(),
                                  static_cast<MLFloat16*>(output_values.MutableDataRaw()),
diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm.cc b/onnxruntime/contrib_ops/rocm/diffusion/group_norm.cc
index 9b0e6251b3..112ac10c38 100644
--- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm.cc
+++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm.cc
@@ -110,7 +110,14 @@ Status GroupNorm::ComputeInternal(OpKernelContext* context) const {
 
   if (num_channels % num_groups_ != 0) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                           "number of channels should be divisiable by num_groups");
+                           "number of channels should be divisible by num_groups");
+  }
+
+  if (context->GetUseDeterministicCompute()) {
+    static std::once_flag log_warning;
+    std::call_once(log_warning, []() {
+      LOGS_DEFAULT(WARNING) << "GroupNorm has no deterministic GPU kernel, its outputs may still be nondeterministic.";
+    });
   }
 
   auto workspace = GetScratchBuffer<void>(GetGroupNormWorkspaceSizeInBytes(), context->GetComputeStream());
diff --git a/onnxruntime/core/providers/cuda/math/topk.cc b/onnxruntime/core/providers/cuda/math/topk.cc
index 3b0edaa559..d516537e25 100644
--- a/onnxruntime/core/providers/cuda/math/topk.cc
+++ b/onnxruntime/core/providers/cuda/math/topk.cc
@@ -61,11 +61,12 @@ TopK<inputk>::TopK(const OpKernelInfo& info) : CudaKernel(info) {
 }
 
 #define IS_PRIM_TYPE(T) utils::IsPrimitiveDataType<T>(prim_type)
-#define TOPKIMPL(T) TopKImpl<T>(this, ctx->GetComputeStream(), tensor_X->Data<T>(), \
-                                static_cast<T*>(tensor_V->MutableDataRaw()),        \
-                                static_cast<int64_t*>(tensor_I->MutableDataRaw()),  \
-                                elem_nums_cuda,                                     \
-                                elem_nums.size(),                                   \
+#define TOPKIMPL(T) TopKImpl<T>(this, use_deterministic_compute,                   \
+                                ctx->GetComputeStream(), tensor_X->Data<T>(),      \
+                                static_cast<T*>(tensor_V->MutableDataRaw()),       \
+                                static_cast<int64_t*>(tensor_I->MutableDataRaw()), \
+                                elem_nums_cuda,                                    \
+                                elem_nums.size(),                                  \
                                 axis, K_, largest_, sorted_, N, dimension)
 
 template <bool inputk>
@@ -106,11 +107,14 @@ Status TopK<inputk>::ComputeInternal(OpKernelContext* ctx) const {
     return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for TopK operator");
   }
 
+  bool use_deterministic_compute = ctx->GetUseDeterministicCompute();
+
   if (IS_PRIM_TYPE(int32_t)) return TOPKIMPL(int32_t);
   if (IS_PRIM_TYPE(int64_t)) return TOPKIMPL(int64_t);
   if (IS_PRIM_TYPE(MLFloat16)) return TOPKIMPL(MLFloat16);
   if (IS_PRIM_TYPE(float)) return TOPKIMPL(float);
   if (IS_PRIM_TYPE(double)) return TOPKIMPL(double);
+
   return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for TopK operator");
 }
 
diff --git a/onnxruntime/core/providers/cuda/math/topk_impl.cuh b/onnxruntime/core/providers/cuda/math/topk_impl.cuh
index 26c0eac5f3..cbde6da457 100644
--- a/onnxruntime/core/providers/cuda/math/topk_impl.cuh
+++ b/onnxruntime/core/providers/cuda/math/topk_impl.cuh
@@ -398,7 +398,10 @@ __global__ void ExcludeOutput(T* output_i, T K, T dimension) {
 }
 
 template <typename T>
-Status TopKImpl(const CudaKernel* kernel, Stream* ort_stream, const T* input_x, T* output_v, int64_t* output_i, const TArray<int64_t>& elem_nums, size_t size, int32_t axis, int64_t K, int64_t largest, int64_t sorted, int64_t N, int64_t dimension) {
+Status TopKImpl(const CudaKernel* kernel, bool use_deterministic_compute,
+                Stream* ort_stream, const T* input_x, T* output_v, int64_t* output_i,
+                const TArray<int64_t>& elem_nums, size_t size, int32_t axis, int64_t K, int64_t largest,
+                int64_t sorted, int64_t N, int64_t dimension) {
   typedef typename ToCudaType<T>::MappedType CudaT;
   const CudaT* input_x_ptr = reinterpret_cast<const CudaT*>(input_x);
   CudaT* output_v_ptr = reinterpret_cast<CudaT*>(output_v);
@@ -407,17 +410,34 @@ Status TopKImpl(const CudaKernel* kernel, Stream* ort_stream, const T* input_x,
   auto aligned_K = ALIGN(K);
   auto aligned_dimension = ALIGN(dimension);
   if (aligned_dimension <= GridDim::maxThreadsPerBlock) {
-    BitonicTopK<CudaT><<<N, GridDim::maxThreadsPerBlock, aligned_dimension * sizeof(KV<CudaT>), stream>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, aligned_K, largest, sorted, dimension, aligned_dimension, NumericLimits<T>::Min(), NumericLimits<T>::Max());
+    BitonicTopK<CudaT><<<N, GridDim::maxThreadsPerBlock, aligned_dimension * sizeof(KV<CudaT>), stream>>>(
+        input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, aligned_K, largest, sorted, dimension,
+        aligned_dimension, NumericLimits<T>::Min(), NumericLimits<T>::Max());
   } else if (K <= BT * 16 || 0 == sorted) {
+    if (use_deterministic_compute) {
+      static std::once_flag log_warning;
+      std::call_once(log_warning, []() {
+        LOGS_DEFAULT(WARNING) << "Non-deterministic TopKImpl kernel is called, its outputs may still be nondeterministic.";
+      });
+    }
+
     auto XPT = static_cast<int64_t>(ceil(static_cast<double>(dimension) / GridDim::maxThreadsPerBlock));
     if (BT * 2 >= K || 0 == sorted) {
-      RadixTopK<CudaT, BT, 2><<<N, BT, 256 * sizeof(uint32_t), stream>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits<T>::Min(), NumericLimits<T>::Max());
+      RadixTopK<CudaT, BT, 2><<<N, BT, 256 * sizeof(uint32_t), stream>>>(
+          input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT,
+          NumericLimits<T>::Min(), NumericLimits<T>::Max());
     } else if (BT * 4 >= K) {
-      RadixTopK<CudaT, BT, 4><<<N, BT, 256 * sizeof(uint32_t), stream>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits<T>::Min(), NumericLimits<T>::Max());
+      RadixTopK<CudaT, BT, 4><<<N, BT, 256 * sizeof(uint32_t), stream>>>(
+          input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT,
+          NumericLimits<T>::Min(), NumericLimits<T>::Max());
     } else if (BT * 8 >= K) {
-      RadixTopK<CudaT, BT, 8><<<N, BT, 256 * sizeof(uint32_t), stream>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits<T>::Min(), NumericLimits<T>::Max());
+      RadixTopK<CudaT, BT, 8><<<N, BT, 256 * sizeof(uint32_t), stream>>>(
+          input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT,
+          NumericLimits<T>::Min(), NumericLimits<T>::Max());
     } else {
-      RadixTopK<CudaT, BT, 16><<<N, BT, 256 * sizeof(uint32_t), stream>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits<T>::Min(), NumericLimits<T>::Max());
+      RadixTopK<CudaT, BT, 16><<<N, BT, 256 * sizeof(uint32_t), stream>>>(
+          input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT,
+          NumericLimits<T>::Min(), NumericLimits<T>::Max());
     }
   } else {
     auto input_key_buffer = kernel->GetScratchBuffer<CudaT>(dimension, ort_stream);
@@ -451,6 +471,7 @@ Status TopKImpl(const CudaKernel* kernel, Stream* ort_stream, const T* input_x,
 }
 
 #define TOPKIMPLE(T) template Status TopKImpl<T>(const CudaKernel* kernel,         \
+                                                 bool use_deterministic_compute,   \
                                                  Stream* ort_stream,               \
                                                  const T* input_x,                 \
                                                  T* output_v,                      \
@@ -464,7 +485,7 @@ Status TopKImpl(const CudaKernel* kernel, Stream* ort_stream, const T* input_x,
                                                  int64_t N,                        \
                                                  int64_t dimension)
 
-// This file is causing excessive long compilation time in ROCm EP. Split all those compilation into multiple
+// This file is causing excessive long compilation time in ROCm EP. Split all those compilations into multiple
 // translation units to speed it up.
 TOPKIMPLE(TOPK_IMPL_TYPE);
 
diff --git a/onnxruntime/core/providers/cuda/math/topk_impl.h b/onnxruntime/core/providers/cuda/math/topk_impl.h
index b9ba7f8e26..c5f63aadc4 100644
--- a/onnxruntime/core/providers/cuda/math/topk_impl.h
+++ b/onnxruntime/core/providers/cuda/math/topk_impl.h
@@ -11,7 +11,9 @@ namespace onnxruntime {
 namespace cuda {
 
 template <typename T>
-Status TopKImpl(const CudaKernel* kernel, Stream* ort_stream, const T* input_x, T* output_v, int64_t* output_i, const TArray<int64_t>& elem_nums, size_t size, int32_t axis, int64_t K, int64_t largest, int64_t sorted, int64_t N, int64_t dimension);
+Status TopKImpl(const CudaKernel* kernel, bool use_deterministic_compute, Stream* ort_stream,
+                const T* input_x, T* output_v, int64_t* output_i, const TArray<int64_t>& elem_nums,
+                size_t size, int32_t axis, int64_t K, int64_t largest, int64_t sorted, int64_t N, int64_t dimension);
 
 }  // namespace cuda
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cuda/tensor/gather_elements_impl.cu b/onnxruntime/core/providers/cuda/tensor/gather_elements_impl.cu
index 914da00f0d..10c8625b39 100644
--- a/onnxruntime/core/providers/cuda/tensor/gather_elements_impl.cu
+++ b/onnxruntime/core/providers/cuda/tensor/gather_elements_impl.cu
@@ -276,8 +276,12 @@ struct FuncAtomicAdd {
 };
 
 template <typename T, typename TIndex>
-Status GatherElementsGradImpl(cudaStream_t stream, const TIndex* indices_data, const T* updates_data, T* output_data,
-                              const GatherScatterElementsArgs& args) {
+Status GatherElementsGradNonDeterministicImpl(cudaStream_t stream, const TIndex* indices_data, const T* updates_data,
+                                              T* output_data,
+                                              const GatherScatterElementsArgs& args) {
+  // Be noted: usage of AtomicAdd is not deterministic if there are duplicated indices to update.
+  // That's the reason we name this function as non-deterministic.
+
   // Give output_data as the input_data parameter by intention,
   // to skip input_data copy, which is not applicable for GatherElementsGrad.
   // output_data's numel is same as input_data's numel.
@@ -285,10 +289,10 @@ Status GatherElementsGradImpl(cudaStream_t stream, const TIndex* indices_data, c
                                      FuncAtomicAdd<T>(static_cast<size_t>(args.input_size)));
 }
 
-#define GATHER_ELEMENTS_GRAD_SPECIALIZED_TINDEX_IMPL(T, TIndex)                                      \
-  template Status GatherElementsGradImpl<T, TIndex>(cudaStream_t stream, const TIndex* indices_data, \
-                                                    const T* updates_data, T* output_data,           \
-                                                    const GatherScatterElementsArgs& args);
+#define GATHER_ELEMENTS_GRAD_SPECIALIZED_TINDEX_IMPL(T, TIndex)                                                      \
+  template Status GatherElementsGradNonDeterministicImpl<T, TIndex>(cudaStream_t stream, const TIndex* indices_data, \
+                                                                    const T* updates_data, T* output_data,           \
+                                                                    const GatherScatterElementsArgs& args);
 
 #define GATHER_ELEMENTS_GRAD_SPECIALIZED_SCATTER_ADD_IMPL(T) \
   GATHER_ELEMENTS_GRAD_SPECIALIZED_TINDEX_IMPL(T, int32_t)   \
diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc
index a4f107ff0c..501c48e687 100644
--- a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc
+++ b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc
@@ -348,6 +348,13 @@ Status launch_lamb_reduction(
 
   // Only launch multi-tensor function if we have at least one tensor in the buckets.
   if (tensor_sizes_in_buckets.size() > 0 && buckets.size() > 0) {
+    if (ctx->GetUseDeterministicCompute()) {
+      static std::once_flag log_warning;
+      std::call_once(log_warning, []() {
+        LOGS_DEFAULT(WARNING) << "Non-deterministic Lamb GPU kernel is called, its outputs may still be nondeterministic.";
+      });
+    }
+
     typedef LambMultiTensorReductionFunctor<CudaTIn1, CudaTIn2, CudaTNorm, CudaTNorm, CudaTNorm> TReducer;
     TReducer reducer;
     launch_multi_tensor_functor<tensor_count_per_group, TReducer>(
diff --git a/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad.cc b/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad.cc
index f8a80ddd40..20dcfc4fbd 100644
--- a/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad.cc
+++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad.cc
@@ -29,10 +29,11 @@ ONNX_OPERATOR_KERNEL_EX(GatherElementsGrad, kMSDomain, 1, kCudaExecutionProvider
 
 #undef CREATE_GATHER_ELEMENTS_GRAD_KERNEL_DEF
 
-#define CASE_GATHER_ELEMENTS_GRAD_IMPL(type)                                                            \
-  case sizeof(type): {                                                                                  \
-    const type* indices_data = reinterpret_cast<const type*>(indices_data_raw);                         \
-    ORT_RETURN_IF_ERROR(GatherElementsGradImpl(stream, indices_data, updates_data, output_data, args)); \
+#define CASE_GATHER_ELEMENTS_GRAD_IMPL(type)                                                       \
+  case sizeof(type): {                                                                             \
+    const type* indices_data = reinterpret_cast<const type*>(indices_data_raw);                    \
+    ORT_RETURN_IF_ERROR(GatherElementsGradNonDeterministicImpl(stream, indices_data, updates_data, \
+                                                               output_data, args));                \
   } break
 
 template <typename T>
@@ -100,8 +101,16 @@ Status GatherElementsGrad::ComputeInternal(OpKernelContext* context) const {
     ORT_THROW("Unsupported element size by the GatherElementsGrad CUDA kernel");
   }
 
+  if (context->GetUseDeterministicCompute()) {
+    static std::once_flag log_warning;
+    std::call_once(log_warning, []() {
+      LOGS_DEFAULT(WARNING) << "GatherElementsGrad has no deterministic GPU kernel, its outputs may still be nondeterministic.";
+    });
+  }
+
   utils::MLTypeCallDispatcher<MLFloat16, float, double> t_disp(dtype);
-  return t_disp.InvokeRet<Status, ComputeImpl>(Stream(context), dY->DataRaw(), indices_tensor->DataRaw(), dX->MutableDataRaw(),
+  return t_disp.InvokeRet<Status, ComputeImpl>(Stream(context), dY->DataRaw(), indices_tensor->DataRaw(),
+                                               dX->MutableDataRaw(),
                                                indices_tensor->DataType()->Size(), args);
 }
 
diff --git a/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad_impl.h b/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad_impl.h
index ec1625d58b..909cc7de22 100755
--- a/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad_impl.h
+++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad_impl.h
@@ -12,8 +12,9 @@ namespace cuda {
 struct GatherScatterElementsArgs;
 
 template <typename T, typename TIndex>
-Status GatherElementsGradImpl(cudaStream_t stream, const TIndex* indices_data, const T* updates_data, T* output_data,
-                              const GatherScatterElementsArgs& args);
+Status GatherElementsGradNonDeterministicImpl(cudaStream_t stream, const TIndex* indices_data,
+                                              const T* updates_data, T* output_data,
+                                              const GatherScatterElementsArgs& args);
 
 }  // namespace cuda
 }  // namespace onnxruntime
diff --git a/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad.cc b/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad.cc
index 50cdbca80b..a64abd2a86 100644
--- a/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad.cc
+++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad.cc
@@ -80,6 +80,13 @@ Status GatherNDGrad<TIndex>::ComputeInternal(OpKernelContext* context) const {
                                              batch_dims_, input_shape, indices_shape, indices_tensor,
                                              num_slices, slice_size, input_slice_offsets_buffer));
 
+  if (context->GetUseDeterministicCompute()) {
+    static std::once_flag log_warning;
+    std::call_once(log_warning, []() {
+      LOGS_DEFAULT(WARNING) << "GatherNDGrad has no deterministic GPU kernel, its outputs may still be nondeterministic.";
+    });
+  }
+
   const void* const kernel_input_data = update_tensor->DataRaw();
   void* const kernel_output_data = output_tensor->MutableDataRaw();
   utils::MLTypeCallDispatcher<float, MLFloat16, double, BFloat16> t_disp(update_tensor->GetElementType());