From ebfd81e67e200fc22612e81fd0b9793a8e1fac5e Mon Sep 17 00:00:00 2001 From: pengwa Date: Wed, 20 Jul 2022 17:59:29 +0800 Subject: [PATCH] Fix BiasGeluGrad bug (#12200) * use 3D grid to avoid the upper limit of grid dimension * enrich tests * Revert "use 3D grid to avoid the upper limit of grid dimension" This reverts commit 2d5badf2fe8cd985f3f29ee2cb18fff13d07c2ab. * change to a fix: switch the 1st and 2nd dim --- .../training_ops/cuda/activations_test.cc | 55 ++++++++++++++++++- .../cuda/activation/bias_gelu_grad_impl.cu | 26 +++++---- 2 files changed, 67 insertions(+), 14 deletions(-) diff --git a/orttraining/orttraining/test/training_ops/cuda/activations_test.cc b/orttraining/orttraining/test/training_ops/cuda/activations_test.cc index 9035ca5019..656db10e90 100644 --- a/orttraining/orttraining/test/training_ops/cuda/activations_test.cc +++ b/orttraining/orttraining/test/training_ops/cuda/activations_test.cc @@ -133,14 +133,65 @@ TEST(CudaKernelTest, FastGelu_bias) { } TEST(CudaKernelTest, BiasGeluGradDx_basic) { - std::vector> test_dims{{4}, {16, 2}, {8, 2, 128, 128}}; + std::vector> test_dims{ + {1}, + {8}, + {64}, + {128}, + {512}, + {16, 1}, + {16, 8}, + {16, 64}, + {16, 128}, + {16, 192}, + {16, 256}, + {16, 258}, + {8, 2, 128, 1}, + {8, 2, 128, 8}, + {8, 2, 128, 64}, + {9, 2, 128, 128}, + {16, 128, 6144}, + {16, 127, 6144}, + {16, 128, 6143}, + {16, 3, 224, 224}, + {15, 3, 223, 223}, + // multiplier of the initial 3 dims > 65535 + {128, 3, 224, 2}, + {128, 3, 224, 3}, + }; + for (const auto& test_dim : test_dims) { TestActivationsWithBroadcastBias(test_dim, "BiasGeluGrad_dX", true); } } TEST(CudaKernelTest, BiasFastGeluGradDx_basic) { - std::vector> test_dims{{4}, {16, 2}, {8, 2, 128, 128}}; + std::vector> test_dims{ + {1}, + {8}, + {64}, + {128}, + {512}, + {16, 1}, + {16, 8}, + {16, 64}, + {16, 128}, + {16, 192}, + {16, 256}, + {16, 258}, + {8, 2, 128, 1}, + {8, 2, 128, 8}, + {8, 2, 128, 64}, + {9, 2, 128, 128}, + {16, 128, 6144}, + {16, 127, 6144}, + {16, 128, 6143}, + {16, 3, 224, 224}, + {15, 3, 223, 223}, + // multiplier of the initial 3 dims > 65535 + {128, 3, 224, 2}, + {128, 3, 224, 3}, + }; for (const auto& test_dim : test_dims) { TestActivationsWithBroadcastBias(test_dim, "BiasFastGeluGrad_dX", true); } diff --git a/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad_impl.cu b/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad_impl.cu index 03824c344b..c697d4cfd5 100644 --- a/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad_impl.cu @@ -14,8 +14,8 @@ namespace cuda { template __global__ void BiasGeluGradDxKernel(int64_t bias_size, const T* dY, const T* X, const T* B, T* dX) { const auto num_elements_per_block = num_elements_per_thread * blockDim.x; - const auto input_base_idx = bias_size * blockIdx.y + num_elements_per_block * blockIdx.x + threadIdx.x; - const auto bias_base_idx = num_elements_per_block * blockIdx.x + threadIdx.x; + const auto input_base_idx = bias_size * blockIdx.x + num_elements_per_block * blockIdx.y + threadIdx.x; + const auto bias_base_idx = num_elements_per_block * blockIdx.y + threadIdx.x; const auto element_stride = blockDim.x; T reg_dY[num_elements_per_thread]; @@ -60,22 +60,24 @@ void LaunchBiasGeluGradDxKernel( int64_t input_size, int64_t bias_size, const T* dY, const T* X, const T* B, T* dX) { // given a 2D grid of blocks: - // each grid row handles bias_size elements - // there are input_size / bias_size rows - + // each grid column handles bias_size elements + // there are input_size / bias_size columns. + const int num_elements_per_thread = GridDim::maxElementsPerThread; - int max_threads_per_block = GridDim::maxThreadsPerBlock; - #ifdef USE_ROCM + +#ifdef USE_ROCM // Optimization for ROCm MI100 - max_threads_per_block = 512; - #endif - + const int max_threads_per_block = 512; +#else + const int max_threads_per_block = GridDim::maxThreadsPerBlock; +#endif + int num_threads_per_block = - std::min(static_cast(CeilDiv(bias_size, num_elements_per_thread)), static_cast(max_threads_per_block)); + std::min(static_cast(CeilDiv(bias_size, num_elements_per_thread)), max_threads_per_block); const auto grid_width = CeilDiv(bias_size, num_elements_per_thread * num_threads_per_block); const auto grid_height = input_size / bias_size; - const dim3 grid_dim{static_cast(grid_width), static_cast(grid_height)}; + const dim3 grid_dim{static_cast(grid_height), static_cast(grid_width)}; BiasGeluGradDxKernel <<>>(bias_size, dY, X, B, dX);