Fix BiasGeluGrad bug (#12200)

* use 3D grid to avoid the upper limit of grid dimension * enrich tests * Revert "use 3D grid to avoid the upper limit of grid dimension" This reverts commit 2d5badf2fe8cd985f3f29ee2cb18fff13d07c2ab. * change to a fix: switch the 1st and 2nd dim
2026-07-11 17:48:34 +00:00 · 2022-07-20 17:59:29 +08:00 · 2022-07-20 17:59:29 +08:00 · ebfd81e67e
commit ebfd81e67e
parent 3cdc6d7775
2 changed files with 67 additions and 14 deletions
--- a/orttraining/orttraining/test/training_ops/cuda/activations_test.cc
+++ b/orttraining/orttraining/test/training_ops/cuda/activations_test.cc
@ -133,14 +133,65 @@ TEST(CudaKernelTest, FastGelu_bias) {
 }

 TEST(CudaKernelTest, BiasGeluGradDx_basic) {
-  std::vector<std::vector<int64_t>> test_dims{{4}, {16, 2}, {8, 2, 128, 128}};
+  std::vector<std::vector<int64_t>> test_dims{
+      {1},
+      {8},
+      {64},
+      {128},
+      {512},
+      {16, 1},
+      {16, 8},
+      {16, 64},
+      {16, 128},
+      {16, 192},
+      {16, 256},
+      {16, 258},
+      {8, 2, 128, 1},
+      {8, 2, 128, 8},
+      {8, 2, 128, 64},
+      {9, 2, 128, 128},
+      {16, 128, 6144},
+      {16, 127, 6144},
+      {16, 128, 6143},
+      {16, 3, 224, 224},
+      {15, 3, 223, 223},
+      // multiplier of the initial 3 dims > 65535
+      {128, 3, 224, 2},
+      {128, 3, 224, 3},
+  };
+
  for (const auto& test_dim : test_dims) {
    TestActivationsWithBroadcastBias(test_dim, "BiasGeluGrad_dX", true);
  }
 }

 TEST(CudaKernelTest, BiasFastGeluGradDx_basic) {
-  std::vector<std::vector<int64_t>> test_dims{{4}, {16, 2}, {8, 2, 128, 128}};
+  std::vector<std::vector<int64_t>> test_dims{
+      {1},
+      {8},
+      {64},
+      {128},
+      {512},
+      {16, 1},
+      {16, 8},
+      {16, 64},
+      {16, 128},
+      {16, 192},
+      {16, 256},
+      {16, 258},
+      {8, 2, 128, 1},
+      {8, 2, 128, 8},
+      {8, 2, 128, 64},
+      {9, 2, 128, 128},
+      {16, 128, 6144},
+      {16, 127, 6144},
+      {16, 128, 6143},
+      {16, 3, 224, 224},
+      {15, 3, 223, 223},
+      // multiplier of the initial 3 dims > 65535
+      {128, 3, 224, 2},
+      {128, 3, 224, 3},
+  };
  for (const auto& test_dim : test_dims) {
    TestActivationsWithBroadcastBias(test_dim, "BiasFastGeluGrad_dX", true);
  }
--- a/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad_impl.cu
+++ b/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad_impl.cu
@ -14,8 +14,8 @@ namespace cuda {
 template <typename T, typename GeluComputationMode, int num_elements_per_thread>
 __global__ void BiasGeluGradDxKernel(int64_t bias_size, const T* dY, const T* X, const T* B, T* dX) {
  const auto num_elements_per_block = num_elements_per_thread * blockDim.x;
-  const auto input_base_idx = bias_size * blockIdx.y + num_elements_per_block * blockIdx.x + threadIdx.x;
-  const auto bias_base_idx = num_elements_per_block * blockIdx.x + threadIdx.x;
+  const auto input_base_idx = bias_size * blockIdx.x + num_elements_per_block * blockIdx.y + threadIdx.x;
+  const auto bias_base_idx = num_elements_per_block * blockIdx.y + threadIdx.x;
  const auto element_stride = blockDim.x;

  T reg_dY[num_elements_per_thread];
@ -60,22 +60,24 @@ void LaunchBiasGeluGradDxKernel(
    int64_t input_size, int64_t bias_size,
    const T* dY, const T* X, const T* B, T* dX) {
  // given a 2D grid of blocks:
-  // each grid row handles bias_size elements
-  // there are input_size / bias_size rows
-  
+  // each grid column handles bias_size elements
+  // there are input_size / bias_size columns.
+
  const int num_elements_per_thread = GridDim::maxElementsPerThread;
-  int max_threads_per_block = GridDim::maxThreadsPerBlock;
-  #ifdef USE_ROCM
+
+#ifdef USE_ROCM
  // Optimization for ROCm MI100
-  max_threads_per_block = 512;
-  #endif
-  
+  const int max_threads_per_block = 512;
+#else
+  const int max_threads_per_block = GridDim::maxThreadsPerBlock;
+#endif
+
  int num_threads_per_block =
-      std::min<int>(static_cast<int>(CeilDiv(bias_size, num_elements_per_thread)), static_cast<int>(max_threads_per_block));
+      std::min<int>(static_cast<int>(CeilDiv(bias_size, num_elements_per_thread)), max_threads_per_block);
  const auto grid_width = CeilDiv(bias_size, num_elements_per_thread * num_threads_per_block);
  const auto grid_height = input_size / bias_size;

-  const dim3 grid_dim{static_cast<uint32_t>(grid_width), static_cast<uint32_t>(grid_height)};
+  const dim3 grid_dim{static_cast<uint32_t>(grid_height), static_cast<uint32_t>(grid_width)};

  BiasGeluGradDxKernel<T, GeluComputationMode, num_elements_per_thread>
      <<<grid_dim, num_threads_per_block, 0, stream>>>(bias_size, dY, X, B, dX);