diff --git a/onnxruntime/core/providers/cuda/atomic/common.cuh b/onnxruntime/core/providers/cuda/atomic/common.cuh index e58d25c4df..2f7edef4a5 100644 --- a/onnxruntime/core/providers/cuda/atomic/common.cuh +++ b/onnxruntime/core/providers/cuda/atomic/common.cuh @@ -31,20 +31,23 @@ __device__ __forceinline__ void atomic_add(double *address, double value) { #endif } +// +// ref: https://github.com/pytorch/pytorch/blob/master/aten/src/THC/THCAtomics.cuh +// __device__ __forceinline__ void atomic_add(half *address, half value) { #if __CUDA_ARCH__ < 700 - half packed_old[2]; - half packed_new[2]; - int* const p_packed_old = reinterpret_cast(packed_old); - int* const p_packed_new = reinterpret_cast(packed_new); - int seen_old_value = 0; + unsigned int* base_address = (unsigned int*)((char*)address - ((size_t)address & 2)); + unsigned int old = *base_address; + unsigned int assumed; + unsigned short x; + do { - packed_old[0] = *address; - packed_old[1] = *(address + 1); - packed_new[0] = half(float(packed_old[0]) + float(value)); - packed_new[1] = packed_old[1]; - seen_old_value = atomicCAS(reinterpret_cast(address), *p_packed_old, *p_packed_new); - } while (seen_old_value != *p_packed_old); + assumed = old; + x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); + x = __half_as_short(__float2half(__half2float(*reinterpret_cast(&x)) + __half2float(value))); + old = (size_t)address & 2 ? (old & 0xffff) | (x << 16) : (old & 0xffff0000) | x; + old = atomicCAS(base_address, assumed, old); + } while (assumed != old); #else atomicAdd(address, value); #endif diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc index be1813f67e..52804865b6 100644 --- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc +++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc @@ -888,6 +888,72 @@ TEST(ReductionOpTest, ReduceSum_int32) { test.Run(); } +#ifdef USE_CUDA +TEST(ReductionOpTest, ReduceSumHalfHalf) { + OpTester test("ReduceSum"); + test.AddAttribute("keepdims", (int64_t)0); + test.AddAttribute("axes", std::vector{0, 1}); + + std::vector data = {1.0f, 2.0f, + 3.0f, 4.0f, + + 5.0f, 6.0f, + 7.0f, 8.0f, + + 9.0f, 10.0f, + 11.0f, 12.0f}; + std::vector data_half(12); + ConvertFloatToMLFloat16(data.data(), data_half.data(), 12); + + std::vector result = {36.0f, 42.0f}; + std::vector result_half(2); + ConvertFloatToMLFloat16(result.data(), result_half.data(), 2); + + test.AddInput("data", {3, 2, 2}, data_half); + test.AddOutput("reduced", {2}, result_half); + test.Run(); +} + +void test_half_reduce_sum( + int64_t m, int64_t n) { + OpTester test("ReduceSum"); + // Input tensor. + std::vector X(m * n, 0.0f); + // Reduced tensor. + std::vector Y(n, 0.0f); + // Random number generator. + std::default_random_engine generator(0); + std::uniform_real_distribution distribution(0.0, 1.0); + for (int64_t i = 0; i < m; ++i) { + for (int64_t j = 0; j < n; ++j) { + const float value = distribution(generator) / float(m); + X[i * n + j] = value; + Y[j] += value; + } + } + + std::vector X_half(m * n); + ConvertFloatToMLFloat16(X.data(), X_half.data(), int(m * n)); + + std::vector Y_half(n); + ConvertFloatToMLFloat16(Y.data(), Y_half.data(), int(n)); + + test.AddAttribute("keepdims", (int64_t)0); + test.AddAttribute("axes", std::vector{0}); + test.AddInput("data", {m, n}, X_half); + test.AddOutput("reduced", {n}, Y_half); + test.Run(); +} + +TEST(ReductionOpTest, ReduceSum_half_bert) { + test_half_reduce_sum(6 * 128, 128); + test_half_reduce_sum(8 * 128, 128); + test_half_reduce_sum(6 * 384, 128); + test_half_reduce_sum(8 * 384, 128); +} + +// Add more UTs for half as needed +#endif TEST(ReductionOpTest, ReduceSum_apex_reduction) { OpTester test("ReduceSum"); test.AddAttribute("keepdims", (int64_t)0); diff --git a/onnxruntime/test/providers/provider_test_utils.cc b/onnxruntime/test/providers/provider_test_utils.cc index d077fb41d5..79de5b06b1 100644 --- a/onnxruntime/test/providers/provider_test_utils.cc +++ b/onnxruntime/test/providers/provider_test_utils.cc @@ -197,7 +197,7 @@ void Check(const OpTester::Data& expected_data, } float threshold = 0.001f; -#if defined(USE_TENSORRT) || defined(ENABLE_TRAINING) +#if defined(USE_TENSORRT) || defined(ENABLE_TRAINING) || defined(USE_CUDA) threshold = 0.005f; #endif for (int i = 0; i < size; ++i) { diff --git a/orttraining/orttraining/test/training_ops/cpu/tensor/gather_nd_op_test.cc b/orttraining/orttraining/test/training_ops/cpu/tensor/gather_nd_op_test.cc index 70c1e87532..0d01d27299 100644 --- a/orttraining/orttraining/test/training_ops/cpu/tensor/gather_nd_op_test.cc +++ b/orttraining/orttraining/test/training_ops/cpu/tensor/gather_nd_op_test.cc @@ -3,6 +3,7 @@ #include "gtest/gtest.h" #include "test/providers/provider_test_utils.h" +#include "test/common/cuda_op_test_utils.h" namespace onnxruntime { namespace test { @@ -231,8 +232,9 @@ TEST(GatherNDOpTest, GatherNDGrad_slice_float_int64_t_axis_1) { #endif #ifdef USE_CUDA -#if __CUDA_ARCH__ >= 600 TEST(GatherNDOpTest, GatherNDGrad_slice_double_int32_t_axis_3) { + if (!HasCudaEnvironment(600 /*min_cuda_architecture*/)) return; + OpTester test("GatherNDGrad", 1, onnxruntime::kOnnxDomain); test.AddAttribute("axis", 1); test.AddInput("shape", {3}, {2LL, 2LL, 3LL}); @@ -243,6 +245,8 @@ TEST(GatherNDOpTest, GatherNDGrad_slice_double_int32_t_axis_3) { } TEST(GatherNDOpTest, GatherND_slice_double_int64_t_axis_3) { + if (!HasCudaEnvironment(600 /*min_cuda_architecture*/)) return; + OpTester test("GatherND", 1, onnxruntime::kOnnxDomain); test.AddAttribute("axis", 1); test.AddInput("data", {2, 2, 2}, ValueRange(8, 0.0, 0.1)); @@ -250,10 +254,10 @@ TEST(GatherNDOpTest, GatherND_slice_double_int64_t_axis_3) { test.AddOutput("output", {2, 1, 2}, {0.2f, 0.3f, 0.4f, 0.5f}); test.Run(); } -#endif -#if __CUDA_ARCH__ >= 700 TEST(GatherNDOpTest, GatherNDGrad_slice_half_int32_t_axis_3) { + if (!HasCudaEnvironment(600 /*min_cuda_architecture*/)) return; + OpTester test("GatherNDGrad", 1, onnxruntime::kOnnxDomain); test.AddAttribute("axis", 1); test.AddInput("shape", {3}, {2LL, 2LL, 3LL}); @@ -270,6 +274,8 @@ TEST(GatherNDOpTest, GatherNDGrad_slice_half_int32_t_axis_3) { } TEST(GatherNDOpTest, GatherND_slice_half_int32_t) { + if (!HasCudaEnvironment(600 /*min_cuda_architecture*/)) return; + OpTester test("GatherND", 1, onnxruntime::kOnnxDomain); std::vector data_f({0.0f, 0.1f, 0.2f, 0.3f}); std::vector outputs_f({0.2f, 0.3f, 0.0f, 0.1f}); @@ -283,7 +289,6 @@ TEST(GatherNDOpTest, GatherND_slice_half_int32_t) { test.Run(); } #endif -#endif #ifdef USE_CUDA TEST(GatherNDOpTest, GatherND_axis_of_2) { diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.h b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.h index 6a4204e7ab..f02403b4fc 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.h +++ b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.h @@ -18,8 +18,8 @@ class LambOptimizer final : public CudaKernel { beta_ = info.GetAttrsOrDefault("beta", std::vector(1024, 0.999f)); lambda_ = info.GetAttrsOrDefault("lambda", std::vector(1024, 0.0f)); epsilon_ = info.GetAttrsOrDefault("epsilon", std::vector(1024, 1e-6f)); - info.GetAttr("ratio_min", &ratio_min_); - info.GetAttr("ratio_max", &ratio_max_); + ORT_ENFORCE(info.GetAttr("ratio_min", &ratio_min_).IsOK(), "Missing/Invalid 'ratio_min' attribute value"); + ORT_ENFORCE(info.GetAttr("ratio_max", &ratio_max_).IsOK(), "Missing/Invalid 'ratio_max' attribute value"); } Status ComputeInternal(OpKernelContext* context) const override; diff --git a/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_impl.cu b/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_impl.cu index 03745e7605..d754dcfe9f 100644 --- a/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_impl.cu @@ -4,6 +4,7 @@ #include "orttraining/training_ops/cuda/tensor/gather_nd_impl.h" #include "core/providers/cuda/cu_inc/common.cuh" +#include "core/providers/cuda/atomic/common.cuh" namespace onnxruntime { namespace cuda { @@ -53,7 +54,7 @@ __global__ void _GatherNDGradKernel( CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(i, num_slices * slice_size); uint64_t slice_offset = slice_offsets[i / slice_size]; size_t j = i % slice_size; - atomicAdd(output_data + slice_offset + j, update_data[i]); + atomic_add(output_data + slice_offset + j, update_data[i]); }; template @@ -121,11 +122,10 @@ SPECIALIZED_COMPUTE_SLICE_OFFSETS_IMPL(int64_t); SPECIALIZED_IMPL(float); SPECIALIZED_GRAD_IMPL(float); -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 SPECIALIZED_IMPL(half); SPECIALIZED_GRAD_IMPL(half); -#endif -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 + SPECIALIZED_IMPL(double); SPECIALIZED_GRAD_IMPL(double); #endif