Merged PR 5686: fix P100/fp16 issues

1. misaligned address in atomic_add() 2. GatherNDGradKernel to use atomic_add 3. enable/add UTs for GatherNDGrad and reduction_ops using half - __CUDA_ARCH__ won't take effect on .cc code, leverage HasCudaEnvironment() instead 4. verified convergence graph and perf test - p100 is much slower than v100 on fp16 - fp16/128 need to reduce batch size from 66 to 64 to avoid OOM issue 5. verify convergence test on Dev3/v100 TBD - broken UTs related to MatmulIntegerOpTest (works on v100/windows, though)
2026-07-30 20:18:08 +00:00 · 2020-03-12 04:50:09 +00:00 · 2020-03-12 04:50:09 +00:00 · 2f1e997e5b
commit 2f1e997e5b
parent 75025461e2
6 changed files with 96 additions and 22 deletions
--- a/onnxruntime/core/providers/cuda/atomic/common.cuh
+++ b/onnxruntime/core/providers/cuda/atomic/common.cuh
@ -31,20 +31,23 @@ __device__ __forceinline__ void atomic_add(double *address, double value) {
 #endif
 }

+//
+// ref: https://github.com/pytorch/pytorch/blob/master/aten/src/THC/THCAtomics.cuh
+//
 __device__ __forceinline__ void atomic_add(half *address, half value) {
 #if __CUDA_ARCH__ < 700
-  half packed_old[2];
-  half packed_new[2];
-  int* const p_packed_old = reinterpret_cast<int*>(packed_old);
-  int* const p_packed_new = reinterpret_cast<int*>(packed_new);
-  int seen_old_value = 0;
+  unsigned int* base_address = (unsigned int*)((char*)address - ((size_t)address & 2));
+  unsigned int old = *base_address;
+  unsigned int assumed;
+  unsigned short x;
+
  do {
-    packed_old[0] = *address;
-    packed_old[1] = *(address + 1);
-    packed_new[0] = half(float(packed_old[0]) + float(value));
-    packed_new[1] = packed_old[1];
-    seen_old_value = atomicCAS(reinterpret_cast<int*>(address), *p_packed_old, *p_packed_new);
-  } while (seen_old_value != *p_packed_old);
+    assumed = old;
+    x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
+    x = __half_as_short(__float2half(__half2float(*reinterpret_cast<const __half*>(&x)) + __half2float(value)));
+    old = (size_t)address & 2 ? (old & 0xffff) | (x << 16) : (old & 0xffff0000) | x;
+    old = atomicCAS(base_address, assumed, old);
+  } while (assumed != old);
 #else
  atomicAdd(address, value);
 #endif
--- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
+++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
@ -888,6 +888,72 @@ TEST(ReductionOpTest, ReduceSum_int32) {
  test.Run();
 }

+#ifdef USE_CUDA
+TEST(ReductionOpTest, ReduceSumHalfHalf) {
+  OpTester test("ReduceSum");
+  test.AddAttribute("keepdims", (int64_t)0);
+  test.AddAttribute("axes", std::vector<int64_t>{0, 1});
+
+  std::vector<float> data = {1.0f, 2.0f,
+                             3.0f, 4.0f,
+
+                             5.0f, 6.0f,
+                             7.0f, 8.0f,
+
+                             9.0f, 10.0f,
+                             11.0f, 12.0f};
+  std::vector<MLFloat16> data_half(12);
+  ConvertFloatToMLFloat16(data.data(), data_half.data(), 12);
+
+  std::vector<float> result = {36.0f, 42.0f};
+  std::vector<MLFloat16> result_half(2);
+  ConvertFloatToMLFloat16(result.data(), result_half.data(), 2);
+
+  test.AddInput<MLFloat16>("data", {3, 2, 2}, data_half);
+  test.AddOutput<MLFloat16>("reduced", {2}, result_half);
+  test.Run();
+} 
+
+void test_half_reduce_sum(
+    int64_t m, int64_t n) {
+  OpTester test("ReduceSum");
+  // Input tensor.
+  std::vector<float> X(m * n, 0.0f);
+  // Reduced tensor.
+  std::vector<float> Y(n, 0.0f);
+  // Random number generator.
+  std::default_random_engine generator(0);
+  std::uniform_real_distribution<float> distribution(0.0, 1.0);
+  for (int64_t i = 0; i < m; ++i) {
+    for (int64_t j = 0; j < n; ++j) {
+      const float value = distribution(generator) / float(m);
+      X[i * n + j] = value;
+      Y[j] += value;
+    }
+  }
+
+  std::vector<MLFloat16> X_half(m * n);
+  ConvertFloatToMLFloat16(X.data(), X_half.data(), int(m * n));
+
+  std::vector<MLFloat16> Y_half(n);
+  ConvertFloatToMLFloat16(Y.data(), Y_half.data(), int(n));
+
+  test.AddAttribute("keepdims", (int64_t)0);
+  test.AddAttribute("axes", std::vector<int64_t>{0});
+  test.AddInput<MLFloat16>("data", {m, n}, X_half);
+  test.AddOutput<MLFloat16>("reduced", {n}, Y_half);
+  test.Run();
+}
+
+TEST(ReductionOpTest, ReduceSum_half_bert) {
+  test_half_reduce_sum(6 * 128, 128);
+  test_half_reduce_sum(8 * 128, 128);
+  test_half_reduce_sum(6 * 384, 128);
+  test_half_reduce_sum(8 * 384, 128);
+}
+
+// Add more UTs for half as needed
+#endif
 TEST(ReductionOpTest, ReduceSum_apex_reduction) {
  OpTester test("ReduceSum");
  test.AddAttribute("keepdims", (int64_t)0);
--- a/onnxruntime/test/providers/provider_test_utils.cc
+++ b/onnxruntime/test/providers/provider_test_utils.cc
@ -197,7 +197,7 @@ void Check<MLFloat16>(const OpTester::Data& expected_data,
  }

  float threshold = 0.001f;
-#if defined(USE_TENSORRT) || defined(ENABLE_TRAINING)
+#if defined(USE_TENSORRT) || defined(ENABLE_TRAINING) || defined(USE_CUDA)
  threshold = 0.005f;
 #endif
  for (int i = 0; i < size; ++i) {
--- a/orttraining/orttraining/test/training_ops/cpu/tensor/gather_nd_op_test.cc
+++ b/orttraining/orttraining/test/training_ops/cpu/tensor/gather_nd_op_test.cc
@ -3,6 +3,7 @@

 #include "gtest/gtest.h"
 #include "test/providers/provider_test_utils.h"
+#include "test/common/cuda_op_test_utils.h"

 namespace onnxruntime {
 namespace test {
@ -231,8 +232,9 @@ TEST(GatherNDOpTest, GatherNDGrad_slice_float_int64_t_axis_1) {
 #endif

 #ifdef USE_CUDA
-#if __CUDA_ARCH__ >= 600
 TEST(GatherNDOpTest, GatherNDGrad_slice_double_int32_t_axis_3) {
+  if (!HasCudaEnvironment(600 /*min_cuda_architecture*/)) return;
+
  OpTester test("GatherNDGrad", 1, onnxruntime::kOnnxDomain);
  test.AddAttribute<int64_t>("axis", 1);
  test.AddInput<int64_t>("shape", {3}, {2LL, 2LL, 3LL});
@ -243,6 +245,8 @@ TEST(GatherNDOpTest, GatherNDGrad_slice_double_int32_t_axis_3) {
 }

 TEST(GatherNDOpTest, GatherND_slice_double_int64_t_axis_3) {
+  if (!HasCudaEnvironment(600 /*min_cuda_architecture*/)) return;
+
  OpTester test("GatherND", 1, onnxruntime::kOnnxDomain);
  test.AddAttribute<int64_t>("axis", 1);
  test.AddInput<double>("data", {2, 2, 2}, ValueRange(8, 0.0, 0.1));
@ -250,10 +254,10 @@ TEST(GatherNDOpTest, GatherND_slice_double_int64_t_axis_3) {
  test.AddOutput<double>("output", {2, 1, 2}, {0.2f, 0.3f, 0.4f, 0.5f});
  test.Run();
 }
-#endif

-#if __CUDA_ARCH__ >= 700
 TEST(GatherNDOpTest, GatherNDGrad_slice_half_int32_t_axis_3) {
+  if (!HasCudaEnvironment(600 /*min_cuda_architecture*/)) return;
+
  OpTester test("GatherNDGrad", 1, onnxruntime::kOnnxDomain);
  test.AddAttribute<int64_t>("axis", 1);
  test.AddInput<int64_t>("shape", {3}, {2LL, 2LL, 3LL});
@ -270,6 +274,8 @@ TEST(GatherNDOpTest, GatherNDGrad_slice_half_int32_t_axis_3) {
 }

 TEST(GatherNDOpTest, GatherND_slice_half_int32_t) {
+  if (!HasCudaEnvironment(600 /*min_cuda_architecture*/)) return;
+
  OpTester test("GatherND", 1, onnxruntime::kOnnxDomain);
  std::vector<float> data_f({0.0f, 0.1f, 0.2f, 0.3f});
  std::vector<float> outputs_f({0.2f, 0.3f, 0.0f, 0.1f});
@ -283,7 +289,6 @@ TEST(GatherNDOpTest, GatherND_slice_half_int32_t) {
  test.Run();
 }
 #endif
-#endif

 #ifdef USE_CUDA
 TEST(GatherNDOpTest, GatherND_axis_of_2) {
--- a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.h
+++ b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.h
@ -18,8 +18,8 @@ class LambOptimizer final : public CudaKernel {
    beta_ = info.GetAttrsOrDefault("beta", std::vector<float>(1024, 0.999f));
    lambda_ = info.GetAttrsOrDefault("lambda", std::vector<float>(1024, 0.0f));
    epsilon_ = info.GetAttrsOrDefault("epsilon", std::vector<float>(1024, 1e-6f));
-    info.GetAttr("ratio_min", &ratio_min_);
-    info.GetAttr("ratio_max", &ratio_max_);
+    ORT_ENFORCE(info.GetAttr<float>("ratio_min", &ratio_min_).IsOK(), "Missing/Invalid 'ratio_min' attribute value");
+    ORT_ENFORCE(info.GetAttr<float>("ratio_max", &ratio_max_).IsOK(), "Missing/Invalid 'ratio_max' attribute value");
  }

  Status ComputeInternal(OpKernelContext* context) const override;
--- a/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_impl.cu
+++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_impl.cu
@ -4,6 +4,7 @@
 #include "orttraining/training_ops/cuda/tensor/gather_nd_impl.h"

 #include "core/providers/cuda/cu_inc/common.cuh"
+#include "core/providers/cuda/atomic/common.cuh"

 namespace onnxruntime {
 namespace cuda {
@ -53,7 +54,7 @@ __global__ void _GatherNDGradKernel(
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(i, num_slices * slice_size);
  uint64_t slice_offset = slice_offsets[i / slice_size];
  size_t j = i % slice_size;
-  atomicAdd(output_data + slice_offset + j, update_data[i]);
+  atomic_add(output_data + slice_offset + j, update_data[i]);
 };

 template <typename TIndex>
@ -121,11 +122,10 @@ SPECIALIZED_COMPUTE_SLICE_OFFSETS_IMPL(int64_t);

 SPECIALIZED_IMPL(float);
 SPECIALIZED_GRAD_IMPL(float);
-#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
 SPECIALIZED_IMPL(half);
 SPECIALIZED_GRAD_IMPL(half);
-#endif
-#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
+
 SPECIALIZED_IMPL(double);
 SPECIALIZED_GRAD_IMPL(double);
 #endif