diff --git a/onnxruntime/contrib_ops/cuda/diffusion/group_norm.cc b/onnxruntime/contrib_ops/cuda/diffusion/group_norm.cc
index 36a2bd1125..0fa7627414 100644
--- a/onnxruntime/contrib_ops/cuda/diffusion/group_norm.cc
+++ b/onnxruntime/contrib_ops/cuda/diffusion/group_norm.cc
@@ -111,6 +111,13 @@ Status GroupNorm::ComputeInternal(OpKernelContext* context) const {
                            "number of channels should be divisiable by num_groups");
   }
 
+  if (context->GetUseDeterministicCompute()) {
+    static std::once_flag log_warning;
+    std::call_once(log_warning, []() {
+      LOGS_DEFAULT(WARNING) << "GroupNorm has no deterministic CUDA kernel, its outputs may still be nondeterministic.";
+    });
+  }
+
   auto workspace = GetScratchBuffer<void>(GetGroupNormWorkspaceSizeInBytes(), context->GetComputeStream());
 
   utils::MLTypeCallDispatcher<GROUP_NORM_TYPES> dispatcher(input->GetElementType());
diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc
index c23317d3ce..2ab99beaf0 100644
--- a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc
+++ b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc
@@ -147,6 +147,7 @@ Status TopK(const Tensor* input, const int axis, const unsigned k, bool largest,
   Status result;
   if (input->IsDataType<float>()) {
     result = TopKImpl<float>(nullptr,  // We limit number of beams in BeamSearchParameters, so K <= 256 and use NULL here
+                             false /*use_deterministic_compute*/,
                              stream,
                              input->Data<float>(),
                              static_cast<float*>(output_values.MutableDataRaw()),
@@ -161,6 +162,7 @@ Status TopK(const Tensor* input, const int axis, const unsigned k, bool largest,
                              dimension);
   } else if (input->IsDataType<MLFloat16>()) {
     result = TopKImpl<MLFloat16>(nullptr,
+                                 false /*use_deterministic_compute*/,
                                  stream,
                                  input->Data<MLFloat16>(),
                                  static_cast<MLFloat16*>(output_values.MutableDataRaw()),
diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm.cc b/onnxruntime/contrib_ops/rocm/diffusion/group_norm.cc
index 9b0e6251b3..112ac10c38 100644
--- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm.cc
+++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm.cc
@@ -110,7 +110,14 @@ Status GroupNorm::ComputeInternal(OpKernelContext* context) const {
 
   if (num_channels % num_groups_ != 0) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                           "number of channels should be divisiable by num_groups");
+                           "number of channels should be divisible by num_groups");
+  }
+
+  if (context->GetUseDeterministicCompute()) {
+    static std::once_flag log_warning;
+    std::call_once(log_warning, []() {
+      LOGS_DEFAULT(WARNING) << "GroupNorm has no deterministic GPU kernel, its outputs may still be nondeterministic.";
+    });
   }
 
   auto workspace = GetScratchBuffer<void>(GetGroupNormWorkspaceSizeInBytes(), context->GetComputeStream());
diff --git a/onnxruntime/core/providers/cuda/math/topk.cc b/onnxruntime/core/providers/cuda/math/topk.cc
index 3b0edaa559..d516537e25 100644
--- a/onnxruntime/core/providers/cuda/math/topk.cc
+++ b/onnxruntime/core/providers/cuda/math/topk.cc
@@ -61,11 +61,12 @@ TopK<inputk>::TopK(const OpKernelInfo& info) : CudaKernel(info) {
 }
 
 #define IS_PRIM_TYPE(T) utils::IsPrimitiveDataType<T>(prim_type)
-#define TOPKIMPL(T) TopKImpl<T>(this, ctx->GetComputeStream(), tensor_X->Data<T>(), \
-                                static_cast<T*>(tensor_V->MutableDataRaw()),        \
-                                static_cast<int64_t*>(tensor_I->MutableDataRaw()),  \
-                                elem_nums_cuda,                                     \
-                                elem_nums.size(),                                   \
+#define TOPKIMPL(T) TopKImpl<T>(this, use_deterministic_compute,                   \
+                                ctx->GetComputeStream(), tensor_X->Data<T>(),      \
+                                static_cast<T*>(tensor_V->MutableDataRaw()),       \
+                                static_cast<int64_t*>(tensor_I->MutableDataRaw()), \
+                                elem_nums_cuda,                                    \
+                                elem_nums.size(),                                  \
                                 axis, K_, largest_, sorted_, N, dimension)
 
 template <bool inputk>
@@ -106,11 +107,14 @@ Status TopK<inputk>::ComputeInternal(OpKernelContext* ctx) const {
     return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for TopK operator");
   }
 
+  bool use_deterministic_compute = ctx->GetUseDeterministicCompute();
+
   if (IS_PRIM_TYPE(int32_t)) return TOPKIMPL(int32_t);
   if (IS_PRIM_TYPE(int64_t)) return TOPKIMPL(int64_t);
   if (IS_PRIM_TYPE(MLFloat16)) return TOPKIMPL(MLFloat16);
   if (IS_PRIM_TYPE(float)) return TOPKIMPL(float);
   if (IS_PRIM_TYPE(double)) return TOPKIMPL(double);
+
   return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for TopK operator");
 }
 
diff --git a/onnxruntime/core/providers/cuda/math/topk_impl.cuh b/onnxruntime/core/providers/cuda/math/topk_impl.cuh
index 26c0eac5f3..cbde6da457 100644
--- a/onnxruntime/core/providers/cuda/math/topk_impl.cuh
+++ b/onnxruntime/core/providers/cuda/math/topk_impl.cuh
@@ -398,7 +398,10 @@ __global__ void ExcludeOutput(T* output_i, T K, T dimension) {
 }
 
 template <typename T>
-Status TopKImpl(const CudaKernel* kernel, Stream* ort_stream, const T* input_x, T* output_v, int64_t* output_i, const TArray<int64_t>& elem_nums, size_t size, int32_t axis, int64_t K, int64_t largest, int64_t sorted, int64_t N, int64_t dimension) {
+Status TopKImpl(const CudaKernel* kernel, bool use_deterministic_compute,
+                Stream* ort_stream, const T* input_x, T* output_v, int64_t* output_i,
+                const TArray<int64_t>& elem_nums, size_t size, int32_t axis, int64_t K, int64_t largest,
+                int64_t sorted, int64_t N, int64_t dimension) {
   typedef typename ToCudaType<T>::MappedType CudaT;
   const CudaT* input_x_ptr = reinterpret_cast<const CudaT*>(input_x);
   CudaT* output_v_ptr = reinterpret_cast<CudaT*>(output_v);
@@ -407,17 +410,34 @@ Status TopKImpl(const CudaKernel* kernel, Stream* ort_stream, const T* input_x,
   auto aligned_K = ALIGN(K);
   auto aligned_dimension = ALIGN(dimension);
   if (aligned_dimension <= GridDim::maxThreadsPerBlock) {
-    BitonicTopK<CudaT><<<N, GridDim::maxThreadsPerBlock, aligned_dimension * sizeof(KV<CudaT>), stream>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, aligned_K, largest, sorted, dimension, aligned_dimension, NumericLimits<T>::Min(), NumericLimits<T>::Max());
+    BitonicTopK<CudaT><<<N, GridDim::maxThreadsPerBlock, aligned_dimension * sizeof(KV<CudaT>), stream>>>(
+        input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, aligned_K, largest, sorted, dimension,
+        aligned_dimension, NumericLimits<T>::Min(), NumericLimits<T>::Max());
   } else if (K <= BT * 16 || 0 == sorted) {
+    if (use_deterministic_compute) {
+      static std::once_flag log_warning;
+      std::call_once(log_warning, []() {
+        LOGS_DEFAULT(WARNING) << "Non-deterministic TopKImpl kernel is called, its outputs may still be nondeterministic.";
+      });
+    }
+
     auto XPT = static_cast<int64_t>(ceil(static_cast<double>(dimension) / GridDim::maxThreadsPerBlock));
     if (BT * 2 >= K || 0 == sorted) {
-      RadixTopK<CudaT, BT, 2><<<N, BT, 256 * sizeof(uint32_t), stream>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits<T>::Min(), NumericLimits<T>::Max());
+      RadixTopK<CudaT, BT, 2><<<N, BT, 256 * sizeof(uint32_t), stream>>>(
+          input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT,
+          NumericLimits<T>::Min(), NumericLimits<T>::Max());
     } else if (BT * 4 >= K) {
-      RadixTopK<CudaT, BT, 4><<<N, BT, 256 * sizeof(uint32_t), stream>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits<T>::Min(), NumericLimits<T>::Max());
+      RadixTopK<CudaT, BT, 4><<<N, BT, 256 * sizeof(uint32_t), stream>>>(
+          input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT,
+          NumericLimits<T>::Min(), NumericLimits<T>::Max());
     } else if (BT * 8 >= K) {
-      RadixTopK<CudaT, BT, 8><<<N, BT, 256 * sizeof(uint32_t), stream>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits<T>::Min(), NumericLimits<T>::Max());
+      RadixTopK<CudaT, BT, 8><<<N, BT, 256 * sizeof(uint32_t), stream>>>(
+          input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT,
+          NumericLimits<T>::Min(), NumericLimits<T>::Max());
     } else {
-      RadixTopK<CudaT, BT, 16><<<N, BT, 256 * sizeof(uint32_t), stream>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits<T>::Min(), NumericLimits<T>::Max());
+      RadixTopK<CudaT, BT, 16><<<N, BT, 256 * sizeof(uint32_t), stream>>>(
+          input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT,
+          NumericLimits<T>::Min(), NumericLimits<T>::Max());
     }
   } else {
     auto input_key_buffer = kernel->GetScratchBuffer<CudaT>(dimension, ort_stream);
@@ -451,6 +471,7 @@ Status TopKImpl(const CudaKernel* kernel, Stream* ort_stream, const T* input_x,
 }
 
 #define TOPKIMPLE(T) template Status TopKImpl<T>(const CudaKernel* kernel,         \
+                                                 bool use_deterministic_compute,   \
                                                  Stream* ort_stream,               \
                                                  const T* input_x,                 \
                                                  T* output_v,                      \
@@ -464,7 +485,7 @@ Status TopKImpl(const CudaKernel* kernel, Stream* ort_stream, const T* input_x,
                                                  int64_t N,                        \
                                                  int64_t dimension)
 
-// This file is causing excessive long compilation time in ROCm EP. Split all those compilation into multiple
+// This file is causing excessive long compilation time in ROCm EP. Split all those compilations into multiple
 // translation units to speed it up.
 TOPKIMPLE(TOPK_IMPL_TYPE);
 
diff --git a/onnxruntime/core/providers/cuda/math/topk_impl.h b/onnxruntime/core/providers/cuda/math/topk_impl.h
index b9ba7f8e26..c5f63aadc4 100644
--- a/onnxruntime/core/providers/cuda/math/topk_impl.h
+++ b/onnxruntime/core/providers/cuda/math/topk_impl.h
@@ -11,7 +11,9 @@ namespace onnxruntime {
 namespace cuda {
 
 template <typename T>
-Status TopKImpl(const CudaKernel* kernel, Stream* ort_stream, const T* input_x, T* output_v, int64_t* output_i, const TArray<int64_t>& elem_nums, size_t size, int32_t axis, int64_t K, int64_t largest, int64_t sorted, int64_t N, int64_t dimension);
+Status TopKImpl(const CudaKernel* kernel, bool use_deterministic_compute, Stream* ort_stream,
+                const T* input_x, T* output_v, int64_t* output_i, const TArray<int64_t>& elem_nums,
+                size_t size, int32_t axis, int64_t K, int64_t largest, int64_t sorted, int64_t N, int64_t dimension);
 
 }  // namespace cuda
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cuda/tensor/gather_elements_impl.cu b/onnxruntime/core/providers/cuda/tensor/gather_elements_impl.cu
index 914da00f0d..10c8625b39 100644
--- a/onnxruntime/core/providers/cuda/tensor/gather_elements_impl.cu
+++ b/onnxruntime/core/providers/cuda/tensor/gather_elements_impl.cu
@@ -276,8 +276,12 @@ struct FuncAtomicAdd {
 };
 
 template <typename T, typename TIndex>
-Status GatherElementsGradImpl(cudaStream_t stream, const TIndex* indices_data, const T* updates_data, T* output_data,
-                              const GatherScatterElementsArgs& args) {
+Status GatherElementsGradNonDeterministicImpl(cudaStream_t stream, const TIndex* indices_data, const T* updates_data,
+                                              T* output_data,
+                                              const GatherScatterElementsArgs& args) {
+  // Be noted: usage of AtomicAdd is not deterministic if there are duplicated indices to update.
+  // That's the reason we name this function as non-deterministic.
+
   // Give output_data as the input_data parameter by intention,
   // to skip input_data copy, which is not applicable for GatherElementsGrad.
   // output_data's numel is same as input_data's numel.
@@ -285,10 +289,10 @@ Status GatherElementsGradImpl(cudaStream_t stream, const TIndex* indices_data, c
                                      FuncAtomicAdd<T>(static_cast<size_t>(args.input_size)));
 }
 
-#define GATHER_ELEMENTS_GRAD_SPECIALIZED_TINDEX_IMPL(T, TIndex)                                      \
-  template Status GatherElementsGradImpl<T, TIndex>(cudaStream_t stream, const TIndex* indices_data, \
-                                                    const T* updates_data, T* output_data,           \
-                                                    const GatherScatterElementsArgs& args);
+#define GATHER_ELEMENTS_GRAD_SPECIALIZED_TINDEX_IMPL(T, TIndex)                                                      \
+  template Status GatherElementsGradNonDeterministicImpl<T, TIndex>(cudaStream_t stream, const TIndex* indices_data, \
+                                                                    const T* updates_data, T* output_data,           \
+                                                                    const GatherScatterElementsArgs& args);
 
 #define GATHER_ELEMENTS_GRAD_SPECIALIZED_SCATTER_ADD_IMPL(T) \
   GATHER_ELEMENTS_GRAD_SPECIALIZED_TINDEX_IMPL(T, int32_t)   \
diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc
index a4f107ff0c..501c48e687 100644
--- a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc
+++ b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc
@@ -348,6 +348,13 @@ Status launch_lamb_reduction(
 
   // Only launch multi-tensor function if we have at least one tensor in the buckets.
   if (tensor_sizes_in_buckets.size() > 0 && buckets.size() > 0) {
+    if (ctx->GetUseDeterministicCompute()) {
+      static std::once_flag log_warning;
+      std::call_once(log_warning, []() {
+        LOGS_DEFAULT(WARNING) << "Non-deterministic Lamb GPU kernel is called, its outputs may still be nondeterministic.";
+      });
+    }
+
     typedef LambMultiTensorReductionFunctor<CudaTIn1, CudaTIn2, CudaTNorm, CudaTNorm, CudaTNorm> TReducer;
     TReducer reducer;
     launch_multi_tensor_functor<tensor_count_per_group, TReducer>(
diff --git a/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad.cc b/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad.cc
index f8a80ddd40..20dcfc4fbd 100644
--- a/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad.cc
+++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad.cc
@@ -29,10 +29,11 @@ ONNX_OPERATOR_KERNEL_EX(GatherElementsGrad, kMSDomain, 1, kCudaExecutionProvider
 
 #undef CREATE_GATHER_ELEMENTS_GRAD_KERNEL_DEF
 
-#define CASE_GATHER_ELEMENTS_GRAD_IMPL(type)                                                            \
-  case sizeof(type): {                                                                                  \
-    const type* indices_data = reinterpret_cast<const type*>(indices_data_raw);                         \
-    ORT_RETURN_IF_ERROR(GatherElementsGradImpl(stream, indices_data, updates_data, output_data, args)); \
+#define CASE_GATHER_ELEMENTS_GRAD_IMPL(type)                                                       \
+  case sizeof(type): {                                                                             \
+    const type* indices_data = reinterpret_cast<const type*>(indices_data_raw);                    \
+    ORT_RETURN_IF_ERROR(GatherElementsGradNonDeterministicImpl(stream, indices_data, updates_data, \
+                                                               output_data, args));                \
   } break
 
 template <typename T>
@@ -100,8 +101,16 @@ Status GatherElementsGrad::ComputeInternal(OpKernelContext* context) const {
     ORT_THROW("Unsupported element size by the GatherElementsGrad CUDA kernel");
   }
 
+  if (context->GetUseDeterministicCompute()) {
+    static std::once_flag log_warning;
+    std::call_once(log_warning, []() {
+      LOGS_DEFAULT(WARNING) << "GatherElementsGrad has no deterministic GPU kernel, its outputs may still be nondeterministic.";
+    });
+  }
+
   utils::MLTypeCallDispatcher<MLFloat16, float, double> t_disp(dtype);
-  return t_disp.InvokeRet<Status, ComputeImpl>(Stream(context), dY->DataRaw(), indices_tensor->DataRaw(), dX->MutableDataRaw(),
+  return t_disp.InvokeRet<Status, ComputeImpl>(Stream(context), dY->DataRaw(), indices_tensor->DataRaw(),
+                                               dX->MutableDataRaw(),
                                                indices_tensor->DataType()->Size(), args);
 }
 
diff --git a/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad_impl.h b/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad_impl.h
index ec1625d58b..909cc7de22 100755
--- a/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad_impl.h
+++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad_impl.h
@@ -12,8 +12,9 @@ namespace cuda {
 struct GatherScatterElementsArgs;
 
 template <typename T, typename TIndex>
-Status GatherElementsGradImpl(cudaStream_t stream, const TIndex* indices_data, const T* updates_data, T* output_data,
-                              const GatherScatterElementsArgs& args);
+Status GatherElementsGradNonDeterministicImpl(cudaStream_t stream, const TIndex* indices_data,
+                                              const T* updates_data, T* output_data,
+                                              const GatherScatterElementsArgs& args);
 
 }  // namespace cuda
 }  // namespace onnxruntime
diff --git a/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad.cc b/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad.cc
index 50cdbca80b..a64abd2a86 100644
--- a/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad.cc
+++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad.cc
@@ -80,6 +80,13 @@ Status GatherNDGrad<TIndex>::ComputeInternal(OpKernelContext* context) const {
                                              batch_dims_, input_shape, indices_shape, indices_tensor,
                                              num_slices, slice_size, input_slice_offsets_buffer));
 
+  if (context->GetUseDeterministicCompute()) {
+    static std::once_flag log_warning;
+    std::call_once(log_warning, []() {
+      LOGS_DEFAULT(WARNING) << "GatherNDGrad has no deterministic GPU kernel, its outputs may still be nondeterministic.";
+    });
+  }
+
   const void* const kernel_input_data = update_tensor->DataRaw();
   void* const kernel_output_data = output_tensor->MutableDataRaw();
   utils::MLTypeCallDispatcher<float, MLFloat16, double, BFloat16> t_disp(update_tensor->GetElementType());