From ebfd81e67e200fc22612e81fd0b9793a8e1fac5e Mon Sep 17 00:00:00 2001
From: pengwa <pengwa@microsoft.com>
Date: Wed, 20 Jul 2022 17:59:29 +0800
Subject: [PATCH] Fix BiasGeluGrad bug (#12200)

* use 3D grid to avoid the upper limit of grid dimension

* enrich tests

* Revert "use 3D grid to avoid the upper limit of grid dimension"

This reverts commit 2d5badf2fe8cd985f3f29ee2cb18fff13d07c2ab.

* change to a fix: switch the 1st and 2nd dim
---
 .../training_ops/cuda/activations_test.cc     | 55 ++++++++++++++++++-
 .../cuda/activation/bias_gelu_grad_impl.cu    | 26 +++++----
 2 files changed, 67 insertions(+), 14 deletions(-)
diff --git a/orttraining/orttraining/test/training_ops/cuda/activations_test.cc b/orttraining/orttraining/test/training_ops/cuda/activations_test.cc
index 9035ca5019..656db10e90 100644
--- a/orttraining/orttraining/test/training_ops/cuda/activations_test.cc
+++ b/orttraining/orttraining/test/training_ops/cuda/activations_test.cc
@@ -133,14 +133,65 @@ TEST(CudaKernelTest, FastGelu_bias) {
 }
 
 TEST(CudaKernelTest, BiasGeluGradDx_basic) {
-  std::vector<std::vector<int64_t>> test_dims{{4}, {16, 2}, {8, 2, 128, 128}};
+  std::vector<std::vector<int64_t>> test_dims{
+      {1},
+      {8},
+      {64},
+      {128},
+      {512},
+      {16, 1},
+      {16, 8},
+      {16, 64},
+      {16, 128},
+      {16, 192},
+      {16, 256},
+      {16, 258},
+      {8, 2, 128, 1},
+      {8, 2, 128, 8},
+      {8, 2, 128, 64},
+      {9, 2, 128, 128},
+      {16, 128, 6144},
+      {16, 127, 6144},
+      {16, 128, 6143},
+      {16, 3, 224, 224},
+      {15, 3, 223, 223},
+      // multiplier of the initial 3 dims > 65535
+      {128, 3, 224, 2},
+      {128, 3, 224, 3},
+  };
+
   for (const auto& test_dim : test_dims) {
     TestActivationsWithBroadcastBias(test_dim, "BiasGeluGrad_dX", true);
   }
 }
 
 TEST(CudaKernelTest, BiasFastGeluGradDx_basic) {
-  std::vector<std::vector<int64_t>> test_dims{{4}, {16, 2}, {8, 2, 128, 128}};
+  std::vector<std::vector<int64_t>> test_dims{
+      {1},
+      {8},
+      {64},
+      {128},
+      {512},
+      {16, 1},
+      {16, 8},
+      {16, 64},
+      {16, 128},
+      {16, 192},
+      {16, 256},
+      {16, 258},
+      {8, 2, 128, 1},
+      {8, 2, 128, 8},
+      {8, 2, 128, 64},
+      {9, 2, 128, 128},
+      {16, 128, 6144},
+      {16, 127, 6144},
+      {16, 128, 6143},
+      {16, 3, 224, 224},
+      {15, 3, 223, 223},
+      // multiplier of the initial 3 dims > 65535
+      {128, 3, 224, 2},
+      {128, 3, 224, 3},
+  };
   for (const auto& test_dim : test_dims) {
     TestActivationsWithBroadcastBias(test_dim, "BiasFastGeluGrad_dX", true);
   }
diff --git a/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad_impl.cu b/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad_impl.cu
index 03824c344b..c697d4cfd5 100644
--- a/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad_impl.cu
+++ b/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad_impl.cu
@@ -14,8 +14,8 @@ namespace cuda {
 template <typename T, typename GeluComputationMode, int num_elements_per_thread>
 __global__ void BiasGeluGradDxKernel(int64_t bias_size, const T* dY, const T* X, const T* B, T* dX) {
   const auto num_elements_per_block = num_elements_per_thread * blockDim.x;
-  const auto input_base_idx = bias_size * blockIdx.y + num_elements_per_block * blockIdx.x + threadIdx.x;
-  const auto bias_base_idx = num_elements_per_block * blockIdx.x + threadIdx.x;
+  const auto input_base_idx = bias_size * blockIdx.x + num_elements_per_block * blockIdx.y + threadIdx.x;
+  const auto bias_base_idx = num_elements_per_block * blockIdx.y + threadIdx.x;
   const auto element_stride = blockDim.x;
 
   T reg_dY[num_elements_per_thread];
@@ -60,22 +60,24 @@ void LaunchBiasGeluGradDxKernel(
     int64_t input_size, int64_t bias_size,
     const T* dY, const T* X, const T* B, T* dX) {
   // given a 2D grid of blocks:
-  // each grid row handles bias_size elements
-  // there are input_size / bias_size rows
-  
+  // each grid column handles bias_size elements
+  // there are input_size / bias_size columns.
+
   const int num_elements_per_thread = GridDim::maxElementsPerThread;
-  int max_threads_per_block = GridDim::maxThreadsPerBlock;
-  #ifdef USE_ROCM
+
+#ifdef USE_ROCM
   // Optimization for ROCm MI100
-  max_threads_per_block = 512;
-  #endif
-  
+  const int max_threads_per_block = 512;
+#else
+  const int max_threads_per_block = GridDim::maxThreadsPerBlock;
+#endif
+
   int num_threads_per_block =
-      std::min<int>(static_cast<int>(CeilDiv(bias_size, num_elements_per_thread)), static_cast<int>(max_threads_per_block));
+      std::min<int>(static_cast<int>(CeilDiv(bias_size, num_elements_per_thread)), max_threads_per_block);
   const auto grid_width = CeilDiv(bias_size, num_elements_per_thread * num_threads_per_block);
   const auto grid_height = input_size / bias_size;
 
-  const dim3 grid_dim{static_cast<uint32_t>(grid_width), static_cast<uint32_t>(grid_height)};
+  const dim3 grid_dim{static_cast<uint32_t>(grid_height), static_cast<uint32_t>(grid_width)};
 
   BiasGeluGradDxKernel<T, GeluComputationMode, num_elements_per_thread>
       <<<grid_dim, num_threads_per_block, 0, stream>>>(bias_size, dY, X, B, dX);