diff --git a/onnxruntime/contrib_ops/cuda/math/bias_dropout_impl.cu b/onnxruntime/contrib_ops/cuda/math/bias_dropout_impl.cu index fb6cc7c7b1..27fb09c969 100644 --- a/onnxruntime/contrib_ops/cuda/math/bias_dropout_impl.cu +++ b/onnxruntime/contrib_ops/cuda/math/bias_dropout_impl.cu @@ -44,23 +44,25 @@ __global__ void BiasDropoutKernel( CUDA_LONG idx = blockDim.x * blockIdx.x + threadIdx.x; CUDA_LONG step_size = gridDim.x * blockDim.x * UNROLL; - CUDA_LONG rounded_size = ((N - 1) / step_size + 1) * step_size; curandStatePhilox4_32_10_t state; curand_init(seeds.first, idx, seeds.second, &state); + float4 rand; + // We ensure every thread generates the same number of random numbers (by rounding // up the size) and at the same timestep (by syncing threads). // From CUDA curand documentation: // The Philox_4x32_10 algorithm is closely tied to the thread and block count. // Each thread computes 4 random numbers in the same time thus the most efficient // use of Philox_4x32_10 is to generate a multiple of 4 times number of threads. - for (CUDA_LONG id = idx; id < rounded_size; id += step_size) { - float4 rand = curand_uniform4(&state); - - #pragma unroll - for (CUDA_LONG i = 0; i < UNROLL; i++) { - CUDA_LONG li = id + gridDim.x * blockDim.x * i; + for (CUDA_LONG id = idx * UNROLL; id < N; id += step_size) { + rand = curand_uniform4(&state); + + // actual computation + #pragma unroll + for (int i = 0; i < UNROLL; i++) { + CUDA_LONG li = id + i; if (li < N) { int offset = fdm_dim.mod(li); float bias = float(bias_data[offset]); @@ -77,6 +79,75 @@ __global__ void BiasDropoutKernel( __syncthreads(); } + +} + + +template +__global__ void BiasDropoutVectorizedKernel( + const int64_t N, + const fast_divmod fdm_dim, + const float ratio, + const std::pair seeds, + const T* X_data, + const T* bias_data, + const T* residual_data, + T* Y_data, + bool* mask_data) { + const float p = 1.0f - ratio; + const float scale = 1.0f / p; + + CUDA_LONG idx = blockDim.x * blockIdx.x + threadIdx.x; + CUDA_LONG step_size = gridDim.x * blockDim.x * UNROLL; + + curandStatePhilox4_32_10_t state; + curand_init(seeds.first, idx, seeds.second, &state); + + float4 rand; + + // using vectorized data load/store approach when N % 4 == 0 + // since this is typical case for input shape size + using LoadT = aligned_vector; + using MaskLoadT = aligned_vector; + using ResidualLoadT = aligned_vector; + + for (CUDA_LONG id = idx * UNROLL; id < N; id += step_size) { + rand = curand_uniform4(&state); + + // vectorized load into storage + T src[UNROLL]; + LoadT *value1 = reinterpret_cast(&src); + *value1 = *reinterpret_cast(&X_data[id]); + + T residual[UNROLL]; + if (has_residual) { + ResidualLoadT *value2 = reinterpret_cast(&residual); + *value2 = *reinterpret_cast(&residual_data[id]); + } + + T r[UNROLL]; + bool mask[UNROLL]; + + // actual computation + #pragma unroll + for (int ii = 0; ii < UNROLL; ii++) { + int offset = fdm_dim.mod(id + ii); + float bias = float(bias_data[offset]); + + mask[ii] = (&rand.x)[ii] < p; + float output_data = (float(src[ii]) + bias) * mask[ii] * scale; + if (has_residual) { + output_data += float(residual[ii]); + } + r[ii] = T(output_data); + } + // Vectorized writes for mask_data & Y_data + *(reinterpret_cast(&Y_data[id])) = *reinterpret_cast(&r[0]); + *(reinterpret_cast(&mask_data[id])) = *reinterpret_cast(&mask[0]); + + __syncthreads(); + } + } template @@ -100,10 +171,18 @@ void BiasDropoutKernelImpl( const uint64_t counter_offset = static_cast(((N - 1) / (block_size * grid_size * UNROLL) + 1) * UNROLL); auto seeds = generator.NextPhiloxSeeds(counter_offset); - if (residual_data == nullptr) { - BiasDropoutKernel<<>>(N, fdm_dim, ratio, seeds, X_data, bias_data, residual_data, Y_data, mask_data); + if (N % UNROLL != 0) { + if (residual_data == nullptr) { + BiasDropoutKernel<<>>(N, fdm_dim, ratio, seeds, X_data, bias_data, residual_data, Y_data, mask_data); + } else { + BiasDropoutKernel<<>>(N, fdm_dim, ratio, seeds, X_data, bias_data, residual_data, Y_data, mask_data); + } } else { - BiasDropoutKernel<<>>(N, fdm_dim, ratio, seeds, X_data, bias_data, residual_data, Y_data, mask_data); + if (residual_data == nullptr) { + BiasDropoutVectorizedKernel<<>>(N, fdm_dim, ratio, seeds, X_data, bias_data, residual_data, Y_data, mask_data); + } else { + BiasDropoutVectorizedKernel<<>>(N, fdm_dim, ratio, seeds, X_data, bias_data, residual_data, Y_data, mask_data); + } } } diff --git a/onnxruntime/core/providers/cuda/cu_inc/common.cuh b/onnxruntime/core/providers/cuda/cu_inc/common.cuh index bd6328bde4..7b283731e9 100644 --- a/onnxruntime/core/providers/cuda/cu_inc/common.cuh +++ b/onnxruntime/core/providers/cuda/cu_inc/common.cuh @@ -312,6 +312,12 @@ struct GridDim { }; }; +// aligned vector generates vectorized load/store on CUDA +template +struct alignas(sizeof(T) * vec_size) aligned_vector { + T val[vec_size]; +}; + #define CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N) \ CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x; \ if (id >= N) \ diff --git a/onnxruntime/core/providers/cuda/nn/dropout_impl.cu b/onnxruntime/core/providers/cuda/nn/dropout_impl.cu index 47d73aa450..27e1145431 100644 --- a/onnxruntime/core/providers/cuda/nn/dropout_impl.cu +++ b/onnxruntime/core/providers/cuda/nn/dropout_impl.cu @@ -39,23 +39,25 @@ __global__ void DropoutKernel( CUDA_LONG idx = blockDim.x * blockIdx.x + threadIdx.x; CUDA_LONG step_size = gridDim.x * blockDim.x * UNROLL; - CUDA_LONG rounded_size = ((N - 1) / step_size + 1) * step_size; curandStatePhilox4_32_10_t state; curand_init(seeds.first, idx, seeds.second, &state); + float4 rand; + // We ensure every thread generates the same number of random numbers (by rounding // up the size) and at the same timestep (by syncing threads). // From CUDA curand documentation: // The Philox_4x32_10 algorithm is closely tied to the thread and block count. // Each thread computes 4 random numbers in the same time thus the most efficient // use of Philox_4x32_10 is to generate a multiple of 4 times number of threads. - for (CUDA_LONG id = idx; id < rounded_size; id += step_size) { - float4 rand = curand_uniform4(&state); - - #pragma unroll - for (CUDA_LONG i = 0; i < UNROLL; i++) { - CUDA_LONG li = id + gridDim.x * blockDim.x * i; + for (CUDA_LONG id = idx * UNROLL; id < N; id += step_size) { + rand = curand_uniform4(&state); + + // actual computation + #pragma unroll + for (int i = 0; i < UNROLL; i++) { + CUDA_LONG li = id + i; if (li < N) { mask_data[li] = (&rand.x)[i] < p; Y_data[li] = T(float(X_data[li]) * mask_data[li] * scale); @@ -64,6 +66,57 @@ __global__ void DropoutKernel( __syncthreads(); } + +} + +template +__global__ void DropoutVectorizedKernel( + const int64_t N, + const float ratio, + const std::pair seeds, + const T* X_data, + T* Y_data, + bool* mask_data) { + const float p = 1.0f - ratio; + const float scale = 1.0f / p; + + CUDA_LONG idx = blockDim.x * blockIdx.x + threadIdx.x; + CUDA_LONG step_size = gridDim.x * blockDim.x * UNROLL; + + curandStatePhilox4_32_10_t state; + curand_init(seeds.first, idx, seeds.second, &state); + + float4 rand; + + // using vectorized data load/store approach when N % 4 == 0 since this is + // typical case for input shape size + using LoadT = aligned_vector; + using MaskLoadT = aligned_vector; + + for (CUDA_LONG id = idx * UNROLL; id < N; id += step_size) { + rand = curand_uniform4(&state); + + // vectorized load into storage + T src[UNROLL]; + LoadT *value = reinterpret_cast(&src); + *value = *reinterpret_cast(&X_data[id]); + + T r[UNROLL]; + bool mask[UNROLL]; + + // actual computation + #pragma unroll + for (int ii = 0; ii < UNROLL; ii++) { + mask[ii] = (&rand.x)[ii] < p; + r[ii] = T(float(src[ii]) * mask[ii] * scale); + } + // Vectorized writes for mask_data & Y_data + *(reinterpret_cast(&Y_data[id])) = *reinterpret_cast(&r[0]); + *(reinterpret_cast(&mask_data[id])) = *reinterpret_cast(&mask[0]); + + __syncthreads(); + } + } template @@ -84,7 +137,11 @@ void DropoutKernelImpl( const uint64_t counter_offset = static_cast(((N - 1) / (block_size * grid_size * UNROLL) + 1) * UNROLL); auto seeds = generator.NextPhiloxSeeds(counter_offset); - DropoutKernel<<>>(N, ratio, seeds, X_data, Y_data, mask_data); + if ( N % UNROLL != 0) { + DropoutKernel<<>>(N, ratio, seeds, X_data, Y_data, mask_data); + } else { + DropoutVectorizedKernel<<>>(N, ratio, seeds, X_data, Y_data, mask_data); + } } #define SPECIALIZED_DROPOUT_IMPL(T) \ diff --git a/onnxruntime/core/providers/rocm/cu_inc/common.cuh b/onnxruntime/core/providers/rocm/cu_inc/common.cuh index 95224b6e3b..abe6f17cb3 100644 --- a/onnxruntime/core/providers/rocm/cu_inc/common.cuh +++ b/onnxruntime/core/providers/rocm/cu_inc/common.cuh @@ -205,6 +205,11 @@ struct GridDim { }; }; +// aligned vector generates vectorized load/store on CUDA +template +struct alignas(sizeof(T) * vec_size) aligned_vector { + T val[vec_size]; +}; #define CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N) \ HIP_LONG id = blockDim.x * blockIdx.x + threadIdx.x; \ diff --git a/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc b/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc index 31b0608029..b16b3f190c 100644 --- a/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc +++ b/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc @@ -134,6 +134,19 @@ void RunBiasDropoutTest(const bool use_mask, const std::vector& input_s } } // namespace + +// N % 4 != 0 +TEST(BiasDropoutTest, BasicAndNotVectorized) { + RunBiasDropoutTest(false, {10, 5, 5}, 0.75f); +} +TEST(BiasDropoutTest, BasicWithoutResidualAndNotVectorized) { + RunBiasDropoutTest(false, {10, 5, 5}, 0.75f, TrainingTrue, false, false); +} +TEST(BiasDropoutTest, MaskAndNotVectorized) { + RunBiasDropoutTest(true, {3, 5, 100}, 0.25f); +} + +// N % 4 == 0 TEST(BiasDropoutTest, Basic) { RunBiasDropoutTest(false, {10, 10, 10}, 0.75f); } diff --git a/orttraining/orttraining/test/python/orttraining_test_orttrainer_bert_toy_onnx.py b/orttraining/orttraining/test/python/orttraining_test_orttrainer_bert_toy_onnx.py index 6a4024d586..b62a63e9be 100644 --- a/orttraining/orttraining/test/python/orttraining_test_orttrainer_bert_toy_onnx.py +++ b/orttraining/orttraining/test/python/orttraining_test_orttrainer_bert_toy_onnx.py @@ -179,8 +179,8 @@ def testToyBERTModelBasicTraining(dynamic_shape): @pytest.mark.parametrize("expected_losses", [ - ([10.991958, 10.975625, 11.032847, 11.034771, 10.987653, - 11.039469, 10.971498, 11.101391, 11.047601, 11.077588]) + ([11.041123, 10.986166, 11.101636, 11.013366, 11.03775 , + 11.041175, 10.957118, 11.069563, 11.040824, 11.16437]) ]) def testToyBERTDeterministicCheck(expected_losses): # Common setup @@ -294,12 +294,12 @@ def testToyBERTModelLRScheduler(initial_lr, lr_scheduler, expected_learning_rate @pytest.mark.parametrize("loss_scaler, expected_losses", [ - (None, [10.992018, 10.975699, 11.032809, 11.034765, 10.987625, - 11.039452, 10.971539, 11.10148, 11.047551, 11.077468]), - (amp.DynamicLossScaler(), [10.992018, 10.975699, 11.032809, 11.034765, - 10.987625, 11.039452, 10.971539, 11.10148, 11.047551, 11.077468]), - (CustomLossScaler(), [10.992018, 10.975699, 11.032791, 11.034729, - 10.987614, 11.039479, 10.971532, 11.101475, 11.04761, 11.077413]) + (None, [11.041126, 10.986309, 11.101673, 11.013394, 11.037781, + 11.041253, 10.957072, 11.069506, 11.040807, 11.164349]), + (amp.DynamicLossScaler(), [11.041126, 10.986309, 11.101673, 11.013394, + 11.037781, 11.041253, 10.957072, 11.069506, 11.040807, 11.164349]), + (CustomLossScaler(), [11.041126, 10.986309, 11.101645, 11.013412, + 11.037757, 11.041273, 10.957077, 11.069525, 11.040765, 11.164298]) ]) def testToyBERTModelMixedPrecisionLossScaler(loss_scaler, expected_losses): # Common setup @@ -339,12 +339,12 @@ def testToyBERTModelMixedPrecisionLossScaler(loss_scaler, expected_losses): @pytest.mark.parametrize("gradient_accumulation_steps, expected_losses", [ - (1, [10.991958, 10.975625, 11.032847, 11.034771, 10.987653, - 11.039469, 10.971498, 11.101391, 11.047601, 11.077588]), - (4, [10.991958, 10.97373, 11.033534, 11.028931, 10.988836, - 11.04126, 10.969865, 11.085526, 11.036701, 11.0628]), - (7, [10.991958, 10.97373, 11.033534, 11.028931, 10.994967, - 11.043544, 10.974638, 11.085087, 11.034944, 11.059022]) + (1, [11.041123, 10.986166, 11.101636, 11.013366, 11.03775, + 11.041175, 10.957118, 11.069563, 11.040824, 11.16437]), + (4, [11.041123, 10.982856, 11.105512, 11.006721, 11.03358, + 11.05058, 10.955864, 11.059035, 11.037753, 11.162649]), + (7, [11.041123, 10.982856, 11.105512, 11.006721, 11.036314, + 11.055109, 10.960751, 11.05809 , 11.038856, 11.159635]) ]) def testToyBERTModelGradientAccumulation(gradient_accumulation_steps, expected_losses): # Common setup diff --git a/orttraining/orttraining/test/python/orttraining_test_orttrainer_frontend.py b/orttraining/orttraining/test/python/orttraining_test_orttrainer_frontend.py index 7d1bf2de78..4577de2644 100644 --- a/orttraining/orttraining/test/python/orttraining_test_orttrainer_frontend.py +++ b/orttraining/orttraining/test/python/orttraining_test_orttrainer_frontend.py @@ -741,7 +741,7 @@ def testORTTrainerMixedPrecisionLossScaler(seed, device, expected_loss, fetches) def _recompute_data(): device_capability_major = torch.cuda.get_device_capability()[0] if device_capability_major == 7: # V100 for Dev machine - expected_loss = [10.5732, 10.4407, 10.3701, 10.2778, 10.1824] + expected_loss = [10.5598 , 10.4591, 10.3477, 10.2726, 10.1945] return [ (False, False, False, 0, expected_loss), # no recompute (True, False, False, 0, expected_loss), # attn_dropout recompute @@ -1499,14 +1499,14 @@ def _adam_max_norm_clip_data(): device_capability_major = torch.cuda.get_device_capability()[0] if device_capability_major == 7: # V100 for Dev machine return [ - (0, 'cuda', 1.0, 1, 12, [10.596329, 10.087329, 9.625324, 9.254117, 8.914067,\ - 8.557245, 8.296672, 8.040311, 7.780754, 7.499548, 7.229341, 7.036769]), - (0, 'cuda', 0.1, 1, 12, [10.596329, 10.088068, 9.626670, 9.256137, 8.916809,\ - 8.560838, 8.301097, 8.045413, 7.786527, 7.505644, 7.236132, 7.043610]), - (42, 'cuda', 1.0, 1, 12, [10.659752, 10.149531, 9.646378, 9.273719, 8.938648,\ - 8.595006, 8.344718, 8.100259, 7.828771, 7.541266, 7.269467, 7.083140]), - (42, 'cuda', 0.1, 1, 12, [10.659752, 10.150211, 9.647715, 9.275835, 8.941610,\ - 8.598876, 8.349401, 8.105709, 7.834774, 7.547812, 7.276530, 7.090215]), + (0, 'cuda', 1.0, 1, 12, [10.592951, 10.067989, 9.619152, 9.245731, 8.881137,\ + 8.578644, 8.280573, 8.063023, 7.797933, 7.486215, 7.233806, 7.011791]), + (0, 'cuda', 0.1, 1, 12, [10.592951, 10.068722, 9.620503, 9.247791, 8.883972,\ + 8.582286, 8.285027, 8.068308, 7.803638, 7.492318, 7.240352, 7.018665]), + (42, 'cuda', 1.0, 1, 12, [10.647908, 10.144501, 9.672352, 9.306980, 8.956026,\ + 8.602655, 8.351079, 8.088144, 7.867220, 7.564082, 7.289846, 7.073726]), + (42, 'cuda', 0.1, 1, 12, [10.647908, 10.145191, 9.673690, 9.309031, 8.959020,\ + 8.606632, 8.355836, 8.093478, 7.873327, 7.570731, 7.296772, 7.0809422]), ] elif device_capability_major == 5: # M60 for CI machines (Python Packaging Pipeline) return [ @@ -1548,14 +1548,14 @@ def _lamb_max_norm_clip_data(): device_capability_major = torch.cuda.get_device_capability()[0] if device_capability_major == 7: # V100 for Dev machine return [ - (0, 'cuda', 1.0, 1, 12, [10.596329, 10.509530, 10.422451, 10.359101, 10.285673, 10.200603,\ - 10.152860, 10.106999, 10.033828, 9.965749, 9.895924, 9.854723]), - (0, 'cuda', 0.1, 1, 12, [10.596329, 10.474221, 10.350412, 10.253196, 10.148172, 10.032470,\ - 9.958271, 9.885362, 9.788476, 9.696474, 9.601951, 9.542482]), - (42, 'cuda', 1.0, 1, 12, [10.659752, 10.565927, 10.437677, 10.387601, 10.302234, 10.217105,\ - 10.170007, 10.143104, 10.093051, 10.002419, 9.960327, 9.895797]), - (42, 'cuda', 0.1, 1, 12, [10.659752, 10.531717, 10.367162, 10.284177, 10.168813, 10.053536,\ - 9.980052, 9.926860, 9.852230, 9.738342, 9.673130, 9.590945]), + (0, 'cuda', 1.0, 1, 12, [10.592951, 10.487728, 10.422251, 10.350913, 10.244248, 10.213003,\ + 10.129222, 10.095112, 10.035983, 9.974586, 9.909771, 9.874278]), + (0, 'cuda', 0.1, 1, 12, [10.592951, 10.452503, 10.349832, 10.245314, 10.106587, 10.046009,\ + 9.934781, 9.875164, 9.792067, 9.704592, 9.617104, 9.563070]), + (42, 'cuda', 1.0, 1, 12, [10.647908, 10.566276, 10.476154, 10.406275, 10.311079, 10.240053,\ + 10.196469, 10.113955, 10.117376, 10.013077, 9.930301, 9.893368]), + (42, 'cuda', 0.1, 1, 12, [10.647908, 10.531957, 10.405246, 10.302971, 10.176583, 10.075583,\ + 10.005772, 9.897825, 9.875748, 9.748932, 9.642885, 9.586762]), ] elif device_capability_major == 5: # M60 for CI machines (Python Packaging Pipeline) return [ diff --git a/orttraining/orttraining/test/training_ops/cpu/nn/dropout_op_test.cc b/orttraining/orttraining/test/training_ops/cpu/nn/dropout_op_test.cc index 90c9ef81ae..6700785764 100644 --- a/orttraining/orttraining/test/training_ops/cpu/nn/dropout_op_test.cc +++ b/orttraining/orttraining/test/training_ops/cpu/nn/dropout_op_test.cc @@ -125,6 +125,15 @@ void RunDropoutTest(const bool use_mask, const std::vector& input_shape // Dropout +// N % 4 != 0 +TEST(DropoutTest, BasicAndNotVectorized) { + RunDropoutTest(false, {10, 5, 5}, 0.75f); +} +TEST(DropoutTest, MaskAndNotVectorized) { + RunDropoutTest(true, {250}, 0.25f); +} + +// N % 4 == 0 TEST(DropoutTest, Basic) { RunDropoutTest(false, {10, 10, 10}, 0.75f); } @@ -197,6 +206,11 @@ void RunDropoutGradTest(float ratio, const std::vector& input_dims, boo // DropoutGrad TEST(DropoutGradTest, Basic) { + // N % 4 != 0 + //Ratio 0.3, 2D + RunDropoutGradTest(0.3f, {5, 6}, false); + + // N %4 == 0 //Ratio 0.2, 1D RunDropoutGradTest(0.2f, {16}, false); diff --git a/orttraining/orttraining/training_ops/cuda/nn/dropout_grad_impl.cu b/orttraining/orttraining/training_ops/cuda/nn/dropout_grad_impl.cu index c8780b7b26..bcd89738e5 100644 --- a/orttraining/orttraining/training_ops/cuda/nn/dropout_grad_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/nn/dropout_grad_impl.cu @@ -24,6 +24,8 @@ namespace onnxruntime { namespace cuda { +constexpr int UNROLL = 4; + template __global__ void DropoutGradientKernel( const int64_t N, @@ -31,16 +33,58 @@ __global__ void DropoutGradientKernel( const bool* mask_data, const float scale, T* dX_data) { - CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; -#pragma unroll - for (int i = 0; i < NumElementsPerThread; i++) { - if (id < N) { - dX_data[id] = T(float(dY_data[id]) * mask_data[id] * scale); - id += NumThreadsPerBlock; + + CUDA_LONG idx = blockDim.x * blockIdx.x + threadIdx.x; + CUDA_LONG id = idx * UNROLL; + + #pragma unroll + for (int i = 0; i < UNROLL; i++) { + CUDA_LONG li = id + i; + if (li < N) { + dX_data[li] = T(float(dY_data[li]) * mask_data[li] * scale); } } } +template +__global__ void DropoutGradientVectorizedKernel( + const int64_t N, + const T* dY_data, + const bool* mask_data, + const float scale, + T* dX_data) { + + // using vectorized data load/store approach when N % 4 == 0 + // since this is typical case for input shape size + using LoadT = aligned_vector; + using MaskLoadT = aligned_vector; + + CUDA_LONG idx = blockDim.x * blockIdx.x + threadIdx.x; + CUDA_LONG id = idx * UNROLL; + + if (id < N) { + // vectorized load into storage + T src[UNROLL]; + LoadT *value1 = reinterpret_cast(&src); + *value1 = *reinterpret_cast(&dY_data[id]); + + bool mask[UNROLL]; + MaskLoadT *value2 = reinterpret_cast(&mask); + *value2 = *reinterpret_cast(&mask_data[id]); + + T r[UNROLL]; + + // actual computation + #pragma unroll + for (int ii = 0; ii < UNROLL; ii++) { + r[ii] = T(float(src[ii]) * mask[ii] * scale); + } + // Vectorized writes for dX_data + *(reinterpret_cast(&dX_data[id])) = *reinterpret_cast(&r[0]); + } + +} + template void DropoutGradientKernelImpl( cudaStream_t stream, @@ -56,8 +100,14 @@ void DropoutGradientKernelImpl( } else { const float scale = 1.f / (1.f - ratio); const int blocksPerGrid = static_cast(CeilDiv(N, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); - DropoutGradientKernel - <<>>(N, dY_data, mask_data, scale, dX_data); + if (N % UNROLL != 0) { + DropoutGradientKernel + <<>>(N, dY_data, mask_data, scale, dX_data); + } else { + DropoutGradientVectorizedKernel + <<>>(N, dY_data, mask_data, scale, dX_data); + } + } } diff --git a/orttraining/tools/ci_test/results/bert_base.convergence.baseline.mi100.csv b/orttraining/tools/ci_test/results/bert_base.convergence.baseline.mi100.csv index 8d027b983d..10b5b22c07 100644 --- a/orttraining/tools/ci_test/results/bert_base.convergence.baseline.mi100.csv +++ b/orttraining/tools/ci_test/results/bert_base.convergence.baseline.mi100.csv @@ -1,11 +1,11 @@ step,total_loss,mlm_loss,nsp_loss -0,11.2171,10.5178,0.699279 -5,9.6935,7.51946,2.17404 -10,8.72874,7.60452,1.12422 -15,8.25456,7.54113,0.713431 -20,8.17125,7.47469,0.696562 -25,8.21603,7.52277,0.693259 -30,8.08864,7.39777,0.69087 -35,7.9672,7.25153,0.715668 -40,7.94141,7.25788,0.683527 -45,7.94186,7.27316,0.668707 +0,11.2032,10.501,0.702181 +5,9.53939,7.52411,2.01528 +10,8.2614,7.564,0.697406 +15,8.28412,7.55601,0.728112 +20,8.17273,7.45947,0.71326 +25,8.228,7.53251,0.695496 +30,8.07991,7.38456,0.695344 +35,7.96173,7.25046,0.711262 +40,7.9463,7.25667,0.689625 +45,7.92987,7.26442,0.665449