diff --git a/onnxruntime/contrib_ops/cuda/diffusion/group_norm.cc b/onnxruntime/contrib_ops/cuda/diffusion/group_norm.cc index 36a2bd1125..0fa7627414 100644 --- a/onnxruntime/contrib_ops/cuda/diffusion/group_norm.cc +++ b/onnxruntime/contrib_ops/cuda/diffusion/group_norm.cc @@ -111,6 +111,13 @@ Status GroupNorm::ComputeInternal(OpKernelContext* context) const { "number of channels should be divisiable by num_groups"); } + if (context->GetUseDeterministicCompute()) { + static std::once_flag log_warning; + std::call_once(log_warning, []() { + LOGS_DEFAULT(WARNING) << "GroupNorm has no deterministic CUDA kernel, its outputs may still be nondeterministic."; + }); + } + auto workspace = GetScratchBuffer(GetGroupNormWorkspaceSizeInBytes(), context->GetComputeStream()); utils::MLTypeCallDispatcher dispatcher(input->GetElementType()); diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc index c23317d3ce..2ab99beaf0 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc @@ -147,6 +147,7 @@ Status TopK(const Tensor* input, const int axis, const unsigned k, bool largest, Status result; if (input->IsDataType()) { result = TopKImpl(nullptr, // We limit number of beams in BeamSearchParameters, so K <= 256 and use NULL here + false /*use_deterministic_compute*/, stream, input->Data(), static_cast(output_values.MutableDataRaw()), @@ -161,6 +162,7 @@ Status TopK(const Tensor* input, const int axis, const unsigned k, bool largest, dimension); } else if (input->IsDataType()) { result = TopKImpl(nullptr, + false /*use_deterministic_compute*/, stream, input->Data(), static_cast(output_values.MutableDataRaw()), diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm.cc b/onnxruntime/contrib_ops/rocm/diffusion/group_norm.cc index 9b0e6251b3..112ac10c38 100644 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm.cc +++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm.cc @@ -110,7 +110,14 @@ Status GroupNorm::ComputeInternal(OpKernelContext* context) const { if (num_channels % num_groups_ != 0) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "number of channels should be divisiable by num_groups"); + "number of channels should be divisible by num_groups"); + } + + if (context->GetUseDeterministicCompute()) { + static std::once_flag log_warning; + std::call_once(log_warning, []() { + LOGS_DEFAULT(WARNING) << "GroupNorm has no deterministic GPU kernel, its outputs may still be nondeterministic."; + }); } auto workspace = GetScratchBuffer(GetGroupNormWorkspaceSizeInBytes(), context->GetComputeStream()); diff --git a/onnxruntime/core/providers/cuda/math/topk.cc b/onnxruntime/core/providers/cuda/math/topk.cc index 3b0edaa559..d516537e25 100644 --- a/onnxruntime/core/providers/cuda/math/topk.cc +++ b/onnxruntime/core/providers/cuda/math/topk.cc @@ -61,11 +61,12 @@ TopK::TopK(const OpKernelInfo& info) : CudaKernel(info) { } #define IS_PRIM_TYPE(T) utils::IsPrimitiveDataType(prim_type) -#define TOPKIMPL(T) TopKImpl(this, ctx->GetComputeStream(), tensor_X->Data(), \ - static_cast(tensor_V->MutableDataRaw()), \ - static_cast(tensor_I->MutableDataRaw()), \ - elem_nums_cuda, \ - elem_nums.size(), \ +#define TOPKIMPL(T) TopKImpl(this, use_deterministic_compute, \ + ctx->GetComputeStream(), tensor_X->Data(), \ + static_cast(tensor_V->MutableDataRaw()), \ + static_cast(tensor_I->MutableDataRaw()), \ + elem_nums_cuda, \ + elem_nums.size(), \ axis, K_, largest_, sorted_, N, dimension) template @@ -106,11 +107,14 @@ Status TopK::ComputeInternal(OpKernelContext* ctx) const { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for TopK operator"); } + bool use_deterministic_compute = ctx->GetUseDeterministicCompute(); + if (IS_PRIM_TYPE(int32_t)) return TOPKIMPL(int32_t); if (IS_PRIM_TYPE(int64_t)) return TOPKIMPL(int64_t); if (IS_PRIM_TYPE(MLFloat16)) return TOPKIMPL(MLFloat16); if (IS_PRIM_TYPE(float)) return TOPKIMPL(float); if (IS_PRIM_TYPE(double)) return TOPKIMPL(double); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for TopK operator"); } diff --git a/onnxruntime/core/providers/cuda/math/topk_impl.cuh b/onnxruntime/core/providers/cuda/math/topk_impl.cuh index 26c0eac5f3..cbde6da457 100644 --- a/onnxruntime/core/providers/cuda/math/topk_impl.cuh +++ b/onnxruntime/core/providers/cuda/math/topk_impl.cuh @@ -398,7 +398,10 @@ __global__ void ExcludeOutput(T* output_i, T K, T dimension) { } template -Status TopKImpl(const CudaKernel* kernel, Stream* ort_stream, const T* input_x, T* output_v, int64_t* output_i, const TArray& elem_nums, size_t size, int32_t axis, int64_t K, int64_t largest, int64_t sorted, int64_t N, int64_t dimension) { +Status TopKImpl(const CudaKernel* kernel, bool use_deterministic_compute, + Stream* ort_stream, const T* input_x, T* output_v, int64_t* output_i, + const TArray& elem_nums, size_t size, int32_t axis, int64_t K, int64_t largest, + int64_t sorted, int64_t N, int64_t dimension) { typedef typename ToCudaType::MappedType CudaT; const CudaT* input_x_ptr = reinterpret_cast(input_x); CudaT* output_v_ptr = reinterpret_cast(output_v); @@ -407,17 +410,34 @@ Status TopKImpl(const CudaKernel* kernel, Stream* ort_stream, const T* input_x, auto aligned_K = ALIGN(K); auto aligned_dimension = ALIGN(dimension); if (aligned_dimension <= GridDim::maxThreadsPerBlock) { - BitonicTopK<<), stream>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, aligned_K, largest, sorted, dimension, aligned_dimension, NumericLimits::Min(), NumericLimits::Max()); + BitonicTopK<<), stream>>>( + input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, aligned_K, largest, sorted, dimension, + aligned_dimension, NumericLimits::Min(), NumericLimits::Max()); } else if (K <= BT * 16 || 0 == sorted) { + if (use_deterministic_compute) { + static std::once_flag log_warning; + std::call_once(log_warning, []() { + LOGS_DEFAULT(WARNING) << "Non-deterministic TopKImpl kernel is called, its outputs may still be nondeterministic."; + }); + } + auto XPT = static_cast(ceil(static_cast(dimension) / GridDim::maxThreadsPerBlock)); if (BT * 2 >= K || 0 == sorted) { - RadixTopK<<>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits::Min(), NumericLimits::Max()); + RadixTopK<<>>( + input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, + NumericLimits::Min(), NumericLimits::Max()); } else if (BT * 4 >= K) { - RadixTopK<<>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits::Min(), NumericLimits::Max()); + RadixTopK<<>>( + input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, + NumericLimits::Min(), NumericLimits::Max()); } else if (BT * 8 >= K) { - RadixTopK<<>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits::Min(), NumericLimits::Max()); + RadixTopK<<>>( + input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, + NumericLimits::Min(), NumericLimits::Max()); } else { - RadixTopK<<>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits::Min(), NumericLimits::Max()); + RadixTopK<<>>( + input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, + NumericLimits::Min(), NumericLimits::Max()); } } else { auto input_key_buffer = kernel->GetScratchBuffer(dimension, ort_stream); @@ -451,6 +471,7 @@ Status TopKImpl(const CudaKernel* kernel, Stream* ort_stream, const T* input_x, } #define TOPKIMPLE(T) template Status TopKImpl(const CudaKernel* kernel, \ + bool use_deterministic_compute, \ Stream* ort_stream, \ const T* input_x, \ T* output_v, \ @@ -464,7 +485,7 @@ Status TopKImpl(const CudaKernel* kernel, Stream* ort_stream, const T* input_x, int64_t N, \ int64_t dimension) -// This file is causing excessive long compilation time in ROCm EP. Split all those compilation into multiple +// This file is causing excessive long compilation time in ROCm EP. Split all those compilations into multiple // translation units to speed it up. TOPKIMPLE(TOPK_IMPL_TYPE); diff --git a/onnxruntime/core/providers/cuda/math/topk_impl.h b/onnxruntime/core/providers/cuda/math/topk_impl.h index b9ba7f8e26..c5f63aadc4 100644 --- a/onnxruntime/core/providers/cuda/math/topk_impl.h +++ b/onnxruntime/core/providers/cuda/math/topk_impl.h @@ -11,7 +11,9 @@ namespace onnxruntime { namespace cuda { template -Status TopKImpl(const CudaKernel* kernel, Stream* ort_stream, const T* input_x, T* output_v, int64_t* output_i, const TArray& elem_nums, size_t size, int32_t axis, int64_t K, int64_t largest, int64_t sorted, int64_t N, int64_t dimension); +Status TopKImpl(const CudaKernel* kernel, bool use_deterministic_compute, Stream* ort_stream, + const T* input_x, T* output_v, int64_t* output_i, const TArray& elem_nums, + size_t size, int32_t axis, int64_t K, int64_t largest, int64_t sorted, int64_t N, int64_t dimension); } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/gather_elements_impl.cu b/onnxruntime/core/providers/cuda/tensor/gather_elements_impl.cu index 914da00f0d..10c8625b39 100644 --- a/onnxruntime/core/providers/cuda/tensor/gather_elements_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/gather_elements_impl.cu @@ -276,8 +276,12 @@ struct FuncAtomicAdd { }; template -Status GatherElementsGradImpl(cudaStream_t stream, const TIndex* indices_data, const T* updates_data, T* output_data, - const GatherScatterElementsArgs& args) { +Status GatherElementsGradNonDeterministicImpl(cudaStream_t stream, const TIndex* indices_data, const T* updates_data, + T* output_data, + const GatherScatterElementsArgs& args) { + // Be noted: usage of AtomicAdd is not deterministic if there are duplicated indices to update. + // That's the reason we name this function as non-deterministic. + // Give output_data as the input_data parameter by intention, // to skip input_data copy, which is not applicable for GatherElementsGrad. // output_data's numel is same as input_data's numel. @@ -285,10 +289,10 @@ Status GatherElementsGradImpl(cudaStream_t stream, const TIndex* indices_data, c FuncAtomicAdd(static_cast(args.input_size))); } -#define GATHER_ELEMENTS_GRAD_SPECIALIZED_TINDEX_IMPL(T, TIndex) \ - template Status GatherElementsGradImpl(cudaStream_t stream, const TIndex* indices_data, \ - const T* updates_data, T* output_data, \ - const GatherScatterElementsArgs& args); +#define GATHER_ELEMENTS_GRAD_SPECIALIZED_TINDEX_IMPL(T, TIndex) \ + template Status GatherElementsGradNonDeterministicImpl(cudaStream_t stream, const TIndex* indices_data, \ + const T* updates_data, T* output_data, \ + const GatherScatterElementsArgs& args); #define GATHER_ELEMENTS_GRAD_SPECIALIZED_SCATTER_ADD_IMPL(T) \ GATHER_ELEMENTS_GRAD_SPECIALIZED_TINDEX_IMPL(T, int32_t) \ diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc index a4f107ff0c..501c48e687 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc +++ b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc @@ -348,6 +348,13 @@ Status launch_lamb_reduction( // Only launch multi-tensor function if we have at least one tensor in the buckets. if (tensor_sizes_in_buckets.size() > 0 && buckets.size() > 0) { + if (ctx->GetUseDeterministicCompute()) { + static std::once_flag log_warning; + std::call_once(log_warning, []() { + LOGS_DEFAULT(WARNING) << "Non-deterministic Lamb GPU kernel is called, its outputs may still be nondeterministic."; + }); + } + typedef LambMultiTensorReductionFunctor TReducer; TReducer reducer; launch_multi_tensor_functor( diff --git a/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad.cc b/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad.cc index f8a80ddd40..20dcfc4fbd 100644 --- a/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad.cc +++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad.cc @@ -29,10 +29,11 @@ ONNX_OPERATOR_KERNEL_EX(GatherElementsGrad, kMSDomain, 1, kCudaExecutionProvider #undef CREATE_GATHER_ELEMENTS_GRAD_KERNEL_DEF -#define CASE_GATHER_ELEMENTS_GRAD_IMPL(type) \ - case sizeof(type): { \ - const type* indices_data = reinterpret_cast(indices_data_raw); \ - ORT_RETURN_IF_ERROR(GatherElementsGradImpl(stream, indices_data, updates_data, output_data, args)); \ +#define CASE_GATHER_ELEMENTS_GRAD_IMPL(type) \ + case sizeof(type): { \ + const type* indices_data = reinterpret_cast(indices_data_raw); \ + ORT_RETURN_IF_ERROR(GatherElementsGradNonDeterministicImpl(stream, indices_data, updates_data, \ + output_data, args)); \ } break template @@ -100,8 +101,16 @@ Status GatherElementsGrad::ComputeInternal(OpKernelContext* context) const { ORT_THROW("Unsupported element size by the GatherElementsGrad CUDA kernel"); } + if (context->GetUseDeterministicCompute()) { + static std::once_flag log_warning; + std::call_once(log_warning, []() { + LOGS_DEFAULT(WARNING) << "GatherElementsGrad has no deterministic GPU kernel, its outputs may still be nondeterministic."; + }); + } + utils::MLTypeCallDispatcher t_disp(dtype); - return t_disp.InvokeRet(Stream(context), dY->DataRaw(), indices_tensor->DataRaw(), dX->MutableDataRaw(), + return t_disp.InvokeRet(Stream(context), dY->DataRaw(), indices_tensor->DataRaw(), + dX->MutableDataRaw(), indices_tensor->DataType()->Size(), args); } diff --git a/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad_impl.h b/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad_impl.h index ec1625d58b..909cc7de22 100755 --- a/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad_impl.h +++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad_impl.h @@ -12,8 +12,9 @@ namespace cuda { struct GatherScatterElementsArgs; template -Status GatherElementsGradImpl(cudaStream_t stream, const TIndex* indices_data, const T* updates_data, T* output_data, - const GatherScatterElementsArgs& args); +Status GatherElementsGradNonDeterministicImpl(cudaStream_t stream, const TIndex* indices_data, + const T* updates_data, T* output_data, + const GatherScatterElementsArgs& args); } // namespace cuda } // namespace onnxruntime diff --git a/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad.cc b/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad.cc index 50cdbca80b..a64abd2a86 100644 --- a/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad.cc +++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad.cc @@ -80,6 +80,13 @@ Status GatherNDGrad::ComputeInternal(OpKernelContext* context) const { batch_dims_, input_shape, indices_shape, indices_tensor, num_slices, slice_size, input_slice_offsets_buffer)); + if (context->GetUseDeterministicCompute()) { + static std::once_flag log_warning; + std::call_once(log_warning, []() { + LOGS_DEFAULT(WARNING) << "GatherNDGrad has no deterministic GPU kernel, its outputs may still be nondeterministic."; + }); + } + const void* const kernel_input_data = update_tensor->DataRaw(); void* const kernel_output_data = output_tensor->MutableDataRaw(); utils::MLTypeCallDispatcher t_disp(update_tensor->GetElementType());