mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-25 22:26:24 +00:00
Fix BiasGeluGrad bug (#12200)
* use 3D grid to avoid the upper limit of grid dimension * enrich tests * Revert "use 3D grid to avoid the upper limit of grid dimension" This reverts commit 2d5badf2fe8cd985f3f29ee2cb18fff13d07c2ab. * change to a fix: switch the 1st and 2nd dim
This commit is contained in:
parent
3cdc6d7775
commit
ebfd81e67e
2 changed files with 67 additions and 14 deletions
|
|
@ -133,14 +133,65 @@ TEST(CudaKernelTest, FastGelu_bias) {
|
|||
}
|
||||
|
||||
TEST(CudaKernelTest, BiasGeluGradDx_basic) {
|
||||
std::vector<std::vector<int64_t>> test_dims{{4}, {16, 2}, {8, 2, 128, 128}};
|
||||
std::vector<std::vector<int64_t>> test_dims{
|
||||
{1},
|
||||
{8},
|
||||
{64},
|
||||
{128},
|
||||
{512},
|
||||
{16, 1},
|
||||
{16, 8},
|
||||
{16, 64},
|
||||
{16, 128},
|
||||
{16, 192},
|
||||
{16, 256},
|
||||
{16, 258},
|
||||
{8, 2, 128, 1},
|
||||
{8, 2, 128, 8},
|
||||
{8, 2, 128, 64},
|
||||
{9, 2, 128, 128},
|
||||
{16, 128, 6144},
|
||||
{16, 127, 6144},
|
||||
{16, 128, 6143},
|
||||
{16, 3, 224, 224},
|
||||
{15, 3, 223, 223},
|
||||
// multiplier of the initial 3 dims > 65535
|
||||
{128, 3, 224, 2},
|
||||
{128, 3, 224, 3},
|
||||
};
|
||||
|
||||
for (const auto& test_dim : test_dims) {
|
||||
TestActivationsWithBroadcastBias(test_dim, "BiasGeluGrad_dX", true);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(CudaKernelTest, BiasFastGeluGradDx_basic) {
|
||||
std::vector<std::vector<int64_t>> test_dims{{4}, {16, 2}, {8, 2, 128, 128}};
|
||||
std::vector<std::vector<int64_t>> test_dims{
|
||||
{1},
|
||||
{8},
|
||||
{64},
|
||||
{128},
|
||||
{512},
|
||||
{16, 1},
|
||||
{16, 8},
|
||||
{16, 64},
|
||||
{16, 128},
|
||||
{16, 192},
|
||||
{16, 256},
|
||||
{16, 258},
|
||||
{8, 2, 128, 1},
|
||||
{8, 2, 128, 8},
|
||||
{8, 2, 128, 64},
|
||||
{9, 2, 128, 128},
|
||||
{16, 128, 6144},
|
||||
{16, 127, 6144},
|
||||
{16, 128, 6143},
|
||||
{16, 3, 224, 224},
|
||||
{15, 3, 223, 223},
|
||||
// multiplier of the initial 3 dims > 65535
|
||||
{128, 3, 224, 2},
|
||||
{128, 3, 224, 3},
|
||||
};
|
||||
for (const auto& test_dim : test_dims) {
|
||||
TestActivationsWithBroadcastBias(test_dim, "BiasFastGeluGrad_dX", true);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -14,8 +14,8 @@ namespace cuda {
|
|||
template <typename T, typename GeluComputationMode, int num_elements_per_thread>
|
||||
__global__ void BiasGeluGradDxKernel(int64_t bias_size, const T* dY, const T* X, const T* B, T* dX) {
|
||||
const auto num_elements_per_block = num_elements_per_thread * blockDim.x;
|
||||
const auto input_base_idx = bias_size * blockIdx.y + num_elements_per_block * blockIdx.x + threadIdx.x;
|
||||
const auto bias_base_idx = num_elements_per_block * blockIdx.x + threadIdx.x;
|
||||
const auto input_base_idx = bias_size * blockIdx.x + num_elements_per_block * blockIdx.y + threadIdx.x;
|
||||
const auto bias_base_idx = num_elements_per_block * blockIdx.y + threadIdx.x;
|
||||
const auto element_stride = blockDim.x;
|
||||
|
||||
T reg_dY[num_elements_per_thread];
|
||||
|
|
@ -60,22 +60,24 @@ void LaunchBiasGeluGradDxKernel(
|
|||
int64_t input_size, int64_t bias_size,
|
||||
const T* dY, const T* X, const T* B, T* dX) {
|
||||
// given a 2D grid of blocks:
|
||||
// each grid row handles bias_size elements
|
||||
// there are input_size / bias_size rows
|
||||
|
||||
// each grid column handles bias_size elements
|
||||
// there are input_size / bias_size columns.
|
||||
|
||||
const int num_elements_per_thread = GridDim::maxElementsPerThread;
|
||||
int max_threads_per_block = GridDim::maxThreadsPerBlock;
|
||||
#ifdef USE_ROCM
|
||||
|
||||
#ifdef USE_ROCM
|
||||
// Optimization for ROCm MI100
|
||||
max_threads_per_block = 512;
|
||||
#endif
|
||||
|
||||
const int max_threads_per_block = 512;
|
||||
#else
|
||||
const int max_threads_per_block = GridDim::maxThreadsPerBlock;
|
||||
#endif
|
||||
|
||||
int num_threads_per_block =
|
||||
std::min<int>(static_cast<int>(CeilDiv(bias_size, num_elements_per_thread)), static_cast<int>(max_threads_per_block));
|
||||
std::min<int>(static_cast<int>(CeilDiv(bias_size, num_elements_per_thread)), max_threads_per_block);
|
||||
const auto grid_width = CeilDiv(bias_size, num_elements_per_thread * num_threads_per_block);
|
||||
const auto grid_height = input_size / bias_size;
|
||||
|
||||
const dim3 grid_dim{static_cast<uint32_t>(grid_width), static_cast<uint32_t>(grid_height)};
|
||||
const dim3 grid_dim{static_cast<uint32_t>(grid_height), static_cast<uint32_t>(grid_width)};
|
||||
|
||||
BiasGeluGradDxKernel<T, GeluComputationMode, num_elements_per_thread>
|
||||
<<<grid_dim, num_threads_per_block, 0, stream>>>(bias_size, dY, X, B, dX);
|
||||
|
|
|
|||
Loading…
Reference in a new issue