diff --git a/onnxruntime/core/providers/cuda/atomic/common.cuh b/onnxruntime/core/providers/cuda/atomic/common.cuh
index e58d25c4df..2f7edef4a5 100644
--- a/onnxruntime/core/providers/cuda/atomic/common.cuh
+++ b/onnxruntime/core/providers/cuda/atomic/common.cuh
@@ -31,20 +31,23 @@ __device__ __forceinline__ void atomic_add(double *address, double value) {
 #endif
 }
 
+//
+// ref: https://github.com/pytorch/pytorch/blob/master/aten/src/THC/THCAtomics.cuh
+//
 __device__ __forceinline__ void atomic_add(half *address, half value) {
 #if __CUDA_ARCH__ < 700
-  half packed_old[2];
-  half packed_new[2];
-  int* const p_packed_old = reinterpret_cast<int*>(packed_old);
-  int* const p_packed_new = reinterpret_cast<int*>(packed_new);
-  int seen_old_value = 0;
+  unsigned int* base_address = (unsigned int*)((char*)address - ((size_t)address & 2));
+  unsigned int old = *base_address;
+  unsigned int assumed;
+  unsigned short x;
+
   do {
-    packed_old[0] = *address;
-    packed_old[1] = *(address + 1);
-    packed_new[0] = half(float(packed_old[0]) + float(value));
-    packed_new[1] = packed_old[1];
-    seen_old_value = atomicCAS(reinterpret_cast<int*>(address), *p_packed_old, *p_packed_new);
-  } while (seen_old_value != *p_packed_old);
+    assumed = old;
+    x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
+    x = __half_as_short(__float2half(__half2float(*reinterpret_cast<const __half*>(&x)) + __half2float(value)));
+    old = (size_t)address & 2 ? (old & 0xffff) | (x << 16) : (old & 0xffff0000) | x;
+    old = atomicCAS(base_address, assumed, old);
+  } while (assumed != old);
 #else
   atomicAdd(address, value);
 #endif
diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
index be1813f67e..52804865b6 100644
--- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
+++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
@@ -888,6 +888,72 @@ TEST(ReductionOpTest, ReduceSum_int32) {
   test.Run();
 }
 
+#ifdef USE_CUDA
+TEST(ReductionOpTest, ReduceSumHalfHalf) {
+  OpTester test("ReduceSum");
+  test.AddAttribute("keepdims", (int64_t)0);
+  test.AddAttribute("axes", std::vector<int64_t>{0, 1});
+
+  std::vector<float> data = {1.0f, 2.0f,
+                             3.0f, 4.0f,
+
+                             5.0f, 6.0f,
+                             7.0f, 8.0f,
+
+                             9.0f, 10.0f,
+                             11.0f, 12.0f};
+  std::vector<MLFloat16> data_half(12);
+  ConvertFloatToMLFloat16(data.data(), data_half.data(), 12);
+
+  std::vector<float> result = {36.0f, 42.0f};
+  std::vector<MLFloat16> result_half(2);
+  ConvertFloatToMLFloat16(result.data(), result_half.data(), 2);
+
+  test.AddInput<MLFloat16>("data", {3, 2, 2}, data_half);
+  test.AddOutput<MLFloat16>("reduced", {2}, result_half);
+  test.Run();
+} 
+
+void test_half_reduce_sum(
+    int64_t m, int64_t n) {
+  OpTester test("ReduceSum");
+  // Input tensor.
+  std::vector<float> X(m * n, 0.0f);
+  // Reduced tensor.
+  std::vector<float> Y(n, 0.0f);
+  // Random number generator.
+  std::default_random_engine generator(0);
+  std::uniform_real_distribution<float> distribution(0.0, 1.0);
+  for (int64_t i = 0; i < m; ++i) {
+    for (int64_t j = 0; j < n; ++j) {
+      const float value = distribution(generator) / float(m);
+      X[i * n + j] = value;
+      Y[j] += value;
+    }
+  }
+
+  std::vector<MLFloat16> X_half(m * n);
+  ConvertFloatToMLFloat16(X.data(), X_half.data(), int(m * n));
+
+  std::vector<MLFloat16> Y_half(n);
+  ConvertFloatToMLFloat16(Y.data(), Y_half.data(), int(n));
+
+  test.AddAttribute("keepdims", (int64_t)0);
+  test.AddAttribute("axes", std::vector<int64_t>{0});
+  test.AddInput<MLFloat16>("data", {m, n}, X_half);
+  test.AddOutput<MLFloat16>("reduced", {n}, Y_half);
+  test.Run();
+}
+
+TEST(ReductionOpTest, ReduceSum_half_bert) {
+  test_half_reduce_sum(6 * 128, 128);
+  test_half_reduce_sum(8 * 128, 128);
+  test_half_reduce_sum(6 * 384, 128);
+  test_half_reduce_sum(8 * 384, 128);
+}
+
+// Add more UTs for half as needed
+#endif
 TEST(ReductionOpTest, ReduceSum_apex_reduction) {
   OpTester test("ReduceSum");
   test.AddAttribute("keepdims", (int64_t)0);
diff --git a/onnxruntime/test/providers/provider_test_utils.cc b/onnxruntime/test/providers/provider_test_utils.cc
index d077fb41d5..79de5b06b1 100644
--- a/onnxruntime/test/providers/provider_test_utils.cc
+++ b/onnxruntime/test/providers/provider_test_utils.cc
@@ -197,7 +197,7 @@ void Check<MLFloat16>(const OpTester::Data& expected_data,
   }
 
   float threshold = 0.001f;
-#if defined(USE_TENSORRT) || defined(ENABLE_TRAINING)
+#if defined(USE_TENSORRT) || defined(ENABLE_TRAINING) || defined(USE_CUDA)
   threshold = 0.005f;
 #endif
   for (int i = 0; i < size; ++i) {
diff --git a/orttraining/orttraining/test/training_ops/cpu/tensor/gather_nd_op_test.cc b/orttraining/orttraining/test/training_ops/cpu/tensor/gather_nd_op_test.cc
index 70c1e87532..0d01d27299 100644
--- a/orttraining/orttraining/test/training_ops/cpu/tensor/gather_nd_op_test.cc
+++ b/orttraining/orttraining/test/training_ops/cpu/tensor/gather_nd_op_test.cc
@@ -3,6 +3,7 @@
 
 #include "gtest/gtest.h"
 #include "test/providers/provider_test_utils.h"
+#include "test/common/cuda_op_test_utils.h"
 
 namespace onnxruntime {
 namespace test {
@@ -231,8 +232,9 @@ TEST(GatherNDOpTest, GatherNDGrad_slice_float_int64_t_axis_1) {
 #endif
 
 #ifdef USE_CUDA
-#if __CUDA_ARCH__ >= 600
 TEST(GatherNDOpTest, GatherNDGrad_slice_double_int32_t_axis_3) {
+  if (!HasCudaEnvironment(600 /*min_cuda_architecture*/)) return;
+
   OpTester test("GatherNDGrad", 1, onnxruntime::kOnnxDomain);
   test.AddAttribute<int64_t>("axis", 1);
   test.AddInput<int64_t>("shape", {3}, {2LL, 2LL, 3LL});
@@ -243,6 +245,8 @@ TEST(GatherNDOpTest, GatherNDGrad_slice_double_int32_t_axis_3) {
 }
 
 TEST(GatherNDOpTest, GatherND_slice_double_int64_t_axis_3) {
+  if (!HasCudaEnvironment(600 /*min_cuda_architecture*/)) return;
+
   OpTester test("GatherND", 1, onnxruntime::kOnnxDomain);
   test.AddAttribute<int64_t>("axis", 1);
   test.AddInput<double>("data", {2, 2, 2}, ValueRange(8, 0.0, 0.1));
@@ -250,10 +254,10 @@ TEST(GatherNDOpTest, GatherND_slice_double_int64_t_axis_3) {
   test.AddOutput<double>("output", {2, 1, 2}, {0.2f, 0.3f, 0.4f, 0.5f});
   test.Run();
 }
-#endif
 
-#if __CUDA_ARCH__ >= 700
 TEST(GatherNDOpTest, GatherNDGrad_slice_half_int32_t_axis_3) {
+  if (!HasCudaEnvironment(600 /*min_cuda_architecture*/)) return;
+
   OpTester test("GatherNDGrad", 1, onnxruntime::kOnnxDomain);
   test.AddAttribute<int64_t>("axis", 1);
   test.AddInput<int64_t>("shape", {3}, {2LL, 2LL, 3LL});
@@ -270,6 +274,8 @@ TEST(GatherNDOpTest, GatherNDGrad_slice_half_int32_t_axis_3) {
 }
 
 TEST(GatherNDOpTest, GatherND_slice_half_int32_t) {
+  if (!HasCudaEnvironment(600 /*min_cuda_architecture*/)) return;
+
   OpTester test("GatherND", 1, onnxruntime::kOnnxDomain);
   std::vector<float> data_f({0.0f, 0.1f, 0.2f, 0.3f});
   std::vector<float> outputs_f({0.2f, 0.3f, 0.0f, 0.1f});
@@ -283,7 +289,6 @@ TEST(GatherNDOpTest, GatherND_slice_half_int32_t) {
   test.Run();
 }
 #endif
-#endif
 
 #ifdef USE_CUDA
 TEST(GatherNDOpTest, GatherND_axis_of_2) {
diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.h b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.h
index 6a4204e7ab..f02403b4fc 100644
--- a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.h
+++ b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.h
@@ -18,8 +18,8 @@ class LambOptimizer final : public CudaKernel {
     beta_ = info.GetAttrsOrDefault("beta", std::vector<float>(1024, 0.999f));
     lambda_ = info.GetAttrsOrDefault("lambda", std::vector<float>(1024, 0.0f));
     epsilon_ = info.GetAttrsOrDefault("epsilon", std::vector<float>(1024, 1e-6f));
-    info.GetAttr("ratio_min", &ratio_min_);
-    info.GetAttr("ratio_max", &ratio_max_);
+    ORT_ENFORCE(info.GetAttr<float>("ratio_min", &ratio_min_).IsOK(), "Missing/Invalid 'ratio_min' attribute value");
+    ORT_ENFORCE(info.GetAttr<float>("ratio_max", &ratio_max_).IsOK(), "Missing/Invalid 'ratio_max' attribute value");
   }
 
   Status ComputeInternal(OpKernelContext* context) const override;
diff --git a/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_impl.cu b/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_impl.cu
index 03745e7605..d754dcfe9f 100644
--- a/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_impl.cu
+++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_impl.cu
@@ -4,6 +4,7 @@
 #include "orttraining/training_ops/cuda/tensor/gather_nd_impl.h"
 
 #include "core/providers/cuda/cu_inc/common.cuh"
+#include "core/providers/cuda/atomic/common.cuh"
 
 namespace onnxruntime {
 namespace cuda {
@@ -53,7 +54,7 @@ __global__ void _GatherNDGradKernel(
   CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(i, num_slices * slice_size);
   uint64_t slice_offset = slice_offsets[i / slice_size];
   size_t j = i % slice_size;
-  atomicAdd(output_data + slice_offset + j, update_data[i]);
+  atomic_add(output_data + slice_offset + j, update_data[i]);
 };
 
 template <typename TIndex>
@@ -121,11 +122,10 @@ SPECIALIZED_COMPUTE_SLICE_OFFSETS_IMPL(int64_t);
 
 SPECIALIZED_IMPL(float);
 SPECIALIZED_GRAD_IMPL(float);
-#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
 SPECIALIZED_IMPL(half);
 SPECIALIZED_GRAD_IMPL(half);
-#endif
-#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
+
 SPECIALIZED_IMPL(double);
 SPECIALIZED_GRAD_IMPL(double);
 #endif