Dropout Vectorized Kernel (#9157)

* vectorized kernel * fix build * re-calibrate expected loss * fix build * re-calibrate convergence results * more re-calibrate on loss * divide kernels * adress comments * more calibration * calibration * per comments * enable sync Co-authored-by: Ethan Tao <ettao@OrtTrainingDev4.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
2026-07-06 04:28:32 +00:00 · 2021-09-27 17:19:12 -07:00 · 2021-09-27 17:19:12 -07:00 · d3f859fe30
commit d3f859fe30
parent 1b0816859f
10 changed files with 291 additions and 67 deletions
--- a/onnxruntime/contrib_ops/cuda/math/bias_dropout_impl.cu
+++ b/onnxruntime/contrib_ops/cuda/math/bias_dropout_impl.cu
@ -44,23 +44,25 @@ __global__ void BiasDropoutKernel(

  CUDA_LONG idx = blockDim.x * blockIdx.x + threadIdx.x;
  CUDA_LONG step_size = gridDim.x * blockDim.x * UNROLL;
-  CUDA_LONG rounded_size = ((N - 1) / step_size + 1) * step_size;

  curandStatePhilox4_32_10_t state;
  curand_init(seeds.first, idx, seeds.second, &state);

+  float4 rand;
+
  // We ensure every thread generates the same number of random numbers (by rounding
  // up the size) and at the same timestep (by syncing threads).
  // From CUDA curand documentation:
  //   The Philox_4x32_10 algorithm is closely tied to the thread and block count.
  //   Each thread computes 4 random numbers in the same time thus the most efficient
  //   use of Philox_4x32_10 is to generate a multiple of 4 times number of threads.
-  for (CUDA_LONG id = idx; id < rounded_size; id += step_size) {
-    float4 rand = curand_uniform4(&state);
-
-  #pragma unroll
-    for (CUDA_LONG i = 0; i < UNROLL; i++) {
-      CUDA_LONG li = id + gridDim.x * blockDim.x * i;
+  for (CUDA_LONG id = idx * UNROLL; id < N; id += step_size) {
+    rand = curand_uniform4(&state);
+  
+    // actual computation
+    #pragma unroll
+    for (int i = 0; i < UNROLL; i++) {
+      CUDA_LONG li = id + i;
      if (li < N) {
        int offset = fdm_dim.mod(li);
        float bias = float(bias_data[offset]);
@ -77,6 +79,75 @@ __global__ void BiasDropoutKernel(

    __syncthreads();
  }
+
+}
+
+
+template <typename T, bool has_residual>
+__global__ void BiasDropoutVectorizedKernel(
+    const int64_t N,
+    const fast_divmod fdm_dim,
+    const float ratio,
+    const std::pair<uint64_t, uint64_t> seeds,
+    const T* X_data,
+    const T* bias_data,
+    const T* residual_data,
+    T* Y_data,
+    bool* mask_data) {
+  const float p = 1.0f - ratio;
+  const float scale = 1.0f / p;
+
+  CUDA_LONG idx = blockDim.x * blockIdx.x + threadIdx.x;
+  CUDA_LONG step_size = gridDim.x * blockDim.x * UNROLL;
+
+  curandStatePhilox4_32_10_t state;
+  curand_init(seeds.first, idx, seeds.second, &state);
+
+  float4 rand;
+
+  // using vectorized data load/store approach when N % 4 == 0 
+  // since this is typical case for input shape size
+  using LoadT = aligned_vector<T, UNROLL>;
+  using MaskLoadT = aligned_vector<bool, UNROLL>;
+  using ResidualLoadT = aligned_vector<T, UNROLL>;
+
+  for (CUDA_LONG id = idx * UNROLL; id < N; id += step_size) {
+    rand = curand_uniform4(&state);
+  
+    // vectorized load into storage
+    T src[UNROLL];
+    LoadT *value1 = reinterpret_cast<LoadT*>(&src);
+    *value1 = *reinterpret_cast<const LoadT*>(&X_data[id]);
+
+    T residual[UNROLL];
+    if (has_residual) {
+      ResidualLoadT *value2 = reinterpret_cast<ResidualLoadT*>(&residual);
+      *value2 = *reinterpret_cast<const ResidualLoadT*>(&residual_data[id]);
+    }
+
+    T r[UNROLL];
+    bool mask[UNROLL];
+
+    // actual computation
+    #pragma unroll
+    for (int ii = 0; ii < UNROLL; ii++) {
+      int offset = fdm_dim.mod(id + ii);
+      float bias = float(bias_data[offset]);
+
+      mask[ii] = (&rand.x)[ii] < p;
+      float output_data = (float(src[ii]) + bias) * mask[ii] * scale;
+      if (has_residual) {
+        output_data += float(residual[ii]);
+      }
+      r[ii] = T(output_data);
+    }
+    // Vectorized writes for mask_data & Y_data
+    *(reinterpret_cast<LoadT*>(&Y_data[id])) = *reinterpret_cast<LoadT*>(&r[0]);
+    *(reinterpret_cast<MaskLoadT*>(&mask_data[id])) = *reinterpret_cast<MaskLoadT*>(&mask[0]);
+
+    __syncthreads();
+  }
+
 }

 template <typename T>
@ -100,10 +171,18 @@ void BiasDropoutKernelImpl(
  const uint64_t counter_offset = static_cast<uint64_t>(((N - 1) / (block_size * grid_size * UNROLL) + 1) * UNROLL);
  auto seeds = generator.NextPhiloxSeeds(counter_offset);

-  if (residual_data == nullptr) {
-    BiasDropoutKernel<T, false><<<grid_size, block_size, 0, stream>>>(N, fdm_dim, ratio, seeds, X_data, bias_data, residual_data, Y_data, mask_data);
+  if (N % UNROLL != 0) {
+    if (residual_data == nullptr) {
+      BiasDropoutKernel<T, false><<<grid_size, block_size, 0, stream>>>(N, fdm_dim, ratio, seeds, X_data, bias_data, residual_data, Y_data, mask_data);
+    } else {
+      BiasDropoutKernel<T, true><<<grid_size, block_size, 0, stream>>>(N, fdm_dim, ratio, seeds, X_data, bias_data, residual_data, Y_data, mask_data);
+    }
  } else {
-    BiasDropoutKernel<T, true><<<grid_size, block_size, 0, stream>>>(N, fdm_dim, ratio, seeds, X_data, bias_data, residual_data, Y_data, mask_data);
+    if (residual_data == nullptr) {
+      BiasDropoutVectorizedKernel<T, false><<<grid_size, block_size, 0, stream>>>(N, fdm_dim, ratio, seeds, X_data, bias_data, residual_data, Y_data, mask_data);
+    } else {
+      BiasDropoutVectorizedKernel<T, true><<<grid_size, block_size, 0, stream>>>(N, fdm_dim, ratio, seeds, X_data, bias_data, residual_data, Y_data, mask_data);
+    }
  }
 }

--- a/onnxruntime/core/providers/cuda/cu_inc/common.cuh
+++ b/onnxruntime/core/providers/cuda/cu_inc/common.cuh
@ -312,6 +312,12 @@ struct GridDim {
  };
 };

+// aligned vector generates vectorized load/store on CUDA
+template<typename T, int vec_size>
+struct alignas(sizeof(T) * vec_size) aligned_vector {
+  T val[vec_size];
+};
+
 #define CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N)      \
  CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x; \
  if (id >= N)                                          \
--- a/onnxruntime/core/providers/cuda/nn/dropout_impl.cu
+++ b/onnxruntime/core/providers/cuda/nn/dropout_impl.cu
@ -39,23 +39,25 @@ __global__ void DropoutKernel(

  CUDA_LONG idx = blockDim.x * blockIdx.x + threadIdx.x;
  CUDA_LONG step_size = gridDim.x * blockDim.x * UNROLL;
-  CUDA_LONG rounded_size = ((N - 1) / step_size + 1) * step_size;

  curandStatePhilox4_32_10_t state;
  curand_init(seeds.first, idx, seeds.second, &state);

+  float4 rand;
+
  // We ensure every thread generates the same number of random numbers (by rounding
  // up the size) and at the same timestep (by syncing threads).
  // From CUDA curand documentation:
  //   The Philox_4x32_10 algorithm is closely tied to the thread and block count.
  //   Each thread computes 4 random numbers in the same time thus the most efficient
  //   use of Philox_4x32_10 is to generate a multiple of 4 times number of threads.
-  for (CUDA_LONG id = idx; id < rounded_size; id += step_size) {
-    float4 rand = curand_uniform4(&state);
-  
-  #pragma unroll
-    for (CUDA_LONG i = 0; i < UNROLL; i++) {
-      CUDA_LONG li = id + gridDim.x * blockDim.x * i;
+  for (CUDA_LONG id = idx * UNROLL; id < N; id += step_size) {
+    rand = curand_uniform4(&state);
+
+    // actual computation
+    #pragma unroll
+    for (int i = 0; i < UNROLL; i++) {
+      CUDA_LONG li = id + i;
      if (li < N) {
        mask_data[li] = (&rand.x)[i] < p;
        Y_data[li] = T(float(X_data[li]) * mask_data[li] * scale);
@ -64,6 +66,57 @@ __global__ void DropoutKernel(

    __syncthreads();
  }
+
+}
+
+template <typename T>
+__global__ void DropoutVectorizedKernel(
+    const int64_t N,
+    const float ratio,
+    const std::pair<uint64_t, uint64_t> seeds,
+    const T* X_data,
+    T* Y_data,
+    bool* mask_data) {
+  const float p = 1.0f - ratio;
+  const float scale = 1.0f / p;
+
+  CUDA_LONG idx = blockDim.x * blockIdx.x + threadIdx.x;
+  CUDA_LONG step_size = gridDim.x * blockDim.x * UNROLL;
+
+  curandStatePhilox4_32_10_t state;
+  curand_init(seeds.first, idx, seeds.second, &state);
+
+  float4 rand;
+
+  // using vectorized data load/store approach when N % 4 == 0 since this is 
+  // typical case for input shape size
+  using LoadT = aligned_vector<T, UNROLL>;
+  using MaskLoadT = aligned_vector<bool, UNROLL>;
+
+  for (CUDA_LONG id = idx * UNROLL; id < N; id += step_size) {
+    rand = curand_uniform4(&state);
+  
+    // vectorized load into storage
+    T src[UNROLL];
+    LoadT *value = reinterpret_cast<LoadT*>(&src);
+    *value = *reinterpret_cast<const LoadT*>(&X_data[id]);
+
+    T r[UNROLL];
+    bool mask[UNROLL];
+
+    // actual computation
+    #pragma unroll
+    for (int ii = 0; ii < UNROLL; ii++) {
+      mask[ii] = (&rand.x)[ii] < p;
+      r[ii] = T(float(src[ii]) * mask[ii] * scale);
+    }
+    // Vectorized writes for mask_data & Y_data
+    *(reinterpret_cast<LoadT*>(&Y_data[id])) = *reinterpret_cast<LoadT*>(&r[0]);
+    *(reinterpret_cast<MaskLoadT*>(&mask_data[id])) = *reinterpret_cast<MaskLoadT*>(&mask[0]);
+
+    __syncthreads();
+  }
+
 }

 template <typename T>
@ -84,7 +137,11 @@ void DropoutKernelImpl(
  const uint64_t counter_offset = static_cast<uint64_t>(((N - 1) / (block_size * grid_size * UNROLL) + 1) * UNROLL);
  auto seeds = generator.NextPhiloxSeeds(counter_offset);

-  DropoutKernel<T><<<grid_size, block_size, 0, stream>>>(N, ratio, seeds, X_data, Y_data, mask_data);
+  if ( N % UNROLL != 0) {
+    DropoutKernel<T><<<grid_size, block_size, 0, stream>>>(N, ratio, seeds, X_data, Y_data, mask_data);
+  } else {
+    DropoutVectorizedKernel<T><<<grid_size, block_size, 0, stream>>>(N, ratio, seeds, X_data, Y_data, mask_data);
+  }
 }

 #define SPECIALIZED_DROPOUT_IMPL(T) \
--- a/onnxruntime/core/providers/rocm/cu_inc/common.cuh
+++ b/onnxruntime/core/providers/rocm/cu_inc/common.cuh
@ -205,6 +205,11 @@ struct GridDim {
  };
 };

+// aligned vector generates vectorized load/store on CUDA
+template<typename T, int vec_size>
+struct alignas(sizeof(T) * vec_size) aligned_vector {
+  T val[vec_size];
+};

 #define CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N)          \
  HIP_LONG id = blockDim.x * blockIdx.x + threadIdx.x;     \
--- a/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc
+++ b/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc
@ -134,6 +134,19 @@ void RunBiasDropoutTest(const bool use_mask, const std::vector<int64_t>& input_s
 }
 }  // namespace

+
+// N % 4 != 0
+TEST(BiasDropoutTest, BasicAndNotVectorized) {
+  RunBiasDropoutTest(false, {10, 5, 5}, 0.75f);
+}
+TEST(BiasDropoutTest, BasicWithoutResidualAndNotVectorized) {
+  RunBiasDropoutTest(false, {10, 5, 5}, 0.75f, TrainingTrue, false, false);
+}
+TEST(BiasDropoutTest, MaskAndNotVectorized) {
+  RunBiasDropoutTest(true, {3, 5, 100}, 0.25f);
+}
+
+// N % 4 == 0
 TEST(BiasDropoutTest, Basic) {
  RunBiasDropoutTest(false, {10, 10, 10}, 0.75f);
 }
--- a/orttraining/orttraining/test/python/orttraining_test_orttrainer_bert_toy_onnx.py
+++ b/orttraining/orttraining/test/python/orttraining_test_orttrainer_bert_toy_onnx.py
@ -179,8 +179,8 @@ def testToyBERTModelBasicTraining(dynamic_shape):


@pytest.mark.parametrize("expected_losses", [
-    ([10.991958, 10.975625, 11.032847, 11.034771, 10.987653,
-      11.039469, 10.971498, 11.101391, 11.047601, 11.077588])
+    ([11.041123, 10.986166, 11.101636, 11.013366, 11.03775 , 
+      11.041175, 10.957118, 11.069563, 11.040824, 11.16437])
 ])
 def testToyBERTDeterministicCheck(expected_losses):
    # Common setup
@ -294,12 +294,12 @@ def testToyBERTModelLRScheduler(initial_lr, lr_scheduler, expected_learning_rate


@pytest.mark.parametrize("loss_scaler, expected_losses", [
-    (None, [10.992018, 10.975699, 11.032809, 11.034765, 10.987625,
-            11.039452, 10.971539, 11.10148, 11.047551, 11.077468]),
-    (amp.DynamicLossScaler(), [10.992018, 10.975699, 11.032809, 11.034765,
-                               10.987625, 11.039452, 10.971539, 11.10148, 11.047551, 11.077468]),
-    (CustomLossScaler(), [10.992018, 10.975699, 11.032791, 11.034729,
-                          10.987614, 11.039479, 10.971532, 11.101475, 11.04761, 11.077413])
+    (None, [11.041126, 10.986309, 11.101673, 11.013394, 11.037781,
+            11.041253, 10.957072, 11.069506, 11.040807, 11.164349]),
+    (amp.DynamicLossScaler(), [11.041126, 10.986309, 11.101673, 11.013394,
+                               11.037781, 11.041253, 10.957072, 11.069506, 11.040807, 11.164349]),
+    (CustomLossScaler(), [11.041126, 10.986309, 11.101645, 11.013412,
+                          11.037757, 11.041273, 10.957077, 11.069525, 11.040765, 11.164298])
 ])
 def testToyBERTModelMixedPrecisionLossScaler(loss_scaler, expected_losses):
    # Common setup
@ -339,12 +339,12 @@ def testToyBERTModelMixedPrecisionLossScaler(loss_scaler, expected_losses):


@pytest.mark.parametrize("gradient_accumulation_steps, expected_losses", [
-    (1, [10.991958, 10.975625, 11.032847, 11.034771, 10.987653,
-         11.039469, 10.971498, 11.101391, 11.047601, 11.077588]),
-    (4, [10.991958, 10.97373, 11.033534, 11.028931, 10.988836,
-         11.04126, 10.969865, 11.085526, 11.036701, 11.0628]),
-    (7, [10.991958, 10.97373, 11.033534, 11.028931, 10.994967,
-         11.043544, 10.974638, 11.085087, 11.034944, 11.059022])
+    (1, [11.041123, 10.986166, 11.101636, 11.013366, 11.03775,
+         11.041175, 10.957118, 11.069563, 11.040824, 11.16437]),
+    (4, [11.041123, 10.982856, 11.105512, 11.006721, 11.03358,
+         11.05058, 10.955864, 11.059035, 11.037753, 11.162649]),
+    (7, [11.041123, 10.982856, 11.105512, 11.006721, 11.036314,
+         11.055109, 10.960751, 11.05809 , 11.038856, 11.159635])
 ])
 def testToyBERTModelGradientAccumulation(gradient_accumulation_steps, expected_losses):
    # Common setup
--- a/orttraining/orttraining/test/python/orttraining_test_orttrainer_frontend.py
+++ b/orttraining/orttraining/test/python/orttraining_test_orttrainer_frontend.py
@ -741,7 +741,7 @@ def testORTTrainerMixedPrecisionLossScaler(seed, device, expected_loss, fetches)
 def _recompute_data():
    device_capability_major = torch.cuda.get_device_capability()[0]
    if device_capability_major == 7:    # V100 for Dev machine
-        expected_loss = [10.5732, 10.4407, 10.3701, 10.2778, 10.1824]
+        expected_loss = [10.5598 , 10.4591, 10.3477, 10.2726, 10.1945]
        return [
            (False, False, False, 0, expected_loss),    # no recompute
            (True, False, False, 0, expected_loss),     # attn_dropout recompute
@ -1499,14 +1499,14 @@ def _adam_max_norm_clip_data():
    device_capability_major = torch.cuda.get_device_capability()[0]
    if device_capability_major == 7:    # V100 for Dev machine
        return [
-            (0, 'cuda', 1.0, 1, 12, [10.596329, 10.087329, 9.625324, 9.254117, 8.914067,\
-                8.557245, 8.296672, 8.040311, 7.780754, 7.499548, 7.229341, 7.036769]),
-            (0, 'cuda', 0.1, 1, 12, [10.596329, 10.088068, 9.626670, 9.256137, 8.916809,\
-                8.560838, 8.301097, 8.045413, 7.786527, 7.505644, 7.236132, 7.043610]),
-            (42, 'cuda', 1.0, 1, 12, [10.659752, 10.149531, 9.646378, 9.273719, 8.938648,\
-                8.595006, 8.344718, 8.100259, 7.828771, 7.541266, 7.269467, 7.083140]),
-            (42, 'cuda', 0.1, 1, 12, [10.659752, 10.150211, 9.647715, 9.275835, 8.941610,\
-                8.598876, 8.349401, 8.105709, 7.834774, 7.547812, 7.276530, 7.090215]),
+            (0, 'cuda', 1.0, 1, 12, [10.592951, 10.067989, 9.619152, 9.245731, 8.881137,\
+                8.578644, 8.280573, 8.063023, 7.797933, 7.486215, 7.233806, 7.011791]),
+            (0, 'cuda', 0.1, 1, 12, [10.592951, 10.068722, 9.620503, 9.247791, 8.883972,\
+                8.582286, 8.285027, 8.068308, 7.803638, 7.492318, 7.240352, 7.018665]),
+            (42, 'cuda', 1.0, 1, 12, [10.647908, 10.144501, 9.672352, 9.306980, 8.956026,\
+                8.602655, 8.351079, 8.088144, 7.867220, 7.564082, 7.289846, 7.073726]),
+            (42, 'cuda', 0.1, 1, 12, [10.647908, 10.145191, 9.673690, 9.309031, 8.959020,\
+                8.606632, 8.355836, 8.093478, 7.873327, 7.570731, 7.296772, 7.0809422]),
        ]
    elif device_capability_major == 5:  # M60 for CI machines (Python Packaging Pipeline)
        return [
@ -1548,14 +1548,14 @@ def _lamb_max_norm_clip_data():
    device_capability_major = torch.cuda.get_device_capability()[0]
    if device_capability_major == 7:    # V100 for Dev machine
        return [
-            (0, 'cuda', 1.0, 1, 12, [10.596329, 10.509530, 10.422451, 10.359101, 10.285673, 10.200603,\
-                10.152860, 10.106999, 10.033828, 9.965749, 9.895924, 9.854723]),
-            (0, 'cuda', 0.1, 1, 12, [10.596329, 10.474221, 10.350412, 10.253196, 10.148172, 10.032470,\
-                9.958271, 9.885362, 9.788476, 9.696474, 9.601951, 9.542482]),
-            (42, 'cuda', 1.0, 1, 12, [10.659752, 10.565927, 10.437677, 10.387601, 10.302234, 10.217105,\
-                10.170007, 10.143104, 10.093051, 10.002419, 9.960327, 9.895797]),
-            (42, 'cuda', 0.1, 1, 12, [10.659752, 10.531717, 10.367162, 10.284177, 10.168813, 10.053536,\
-                9.980052, 9.926860, 9.852230, 9.738342, 9.673130, 9.590945]),
+            (0, 'cuda', 1.0, 1, 12, [10.592951, 10.487728, 10.422251, 10.350913, 10.244248, 10.213003,\
+                10.129222, 10.095112, 10.035983, 9.974586, 9.909771, 9.874278]),
+            (0, 'cuda', 0.1, 1, 12, [10.592951, 10.452503, 10.349832, 10.245314, 10.106587, 10.046009,\
+                9.934781, 9.875164, 9.792067, 9.704592, 9.617104, 9.563070]),
+            (42, 'cuda', 1.0, 1, 12, [10.647908, 10.566276, 10.476154, 10.406275, 10.311079, 10.240053,\
+                10.196469, 10.113955, 10.117376, 10.013077, 9.930301, 9.893368]),
+            (42, 'cuda', 0.1, 1, 12, [10.647908, 10.531957, 10.405246, 10.302971, 10.176583, 10.075583,\
+                10.005772, 9.897825, 9.875748, 9.748932, 9.642885, 9.586762]),
        ]
    elif device_capability_major == 5:  # M60 for CI machines (Python Packaging Pipeline)
        return [
--- a/orttraining/orttraining/test/training_ops/cpu/nn/dropout_op_test.cc
+++ b/orttraining/orttraining/test/training_ops/cpu/nn/dropout_op_test.cc
@ -125,6 +125,15 @@ void RunDropoutTest(const bool use_mask, const std::vector<int64_t>& input_shape

 // Dropout

+// N % 4 != 0
+TEST(DropoutTest, BasicAndNotVectorized) {
+  RunDropoutTest(false, {10, 5, 5}, 0.75f);
+}
+TEST(DropoutTest, MaskAndNotVectorized) {
+  RunDropoutTest(true, {250}, 0.25f);
+}
+
+// N % 4 == 0
 TEST(DropoutTest, Basic) {
  RunDropoutTest(false, {10, 10, 10}, 0.75f);
 }
@ -197,6 +206,11 @@ void RunDropoutGradTest(float ratio, const std::vector<int64_t>& input_dims, boo
 // DropoutGrad

 TEST(DropoutGradTest, Basic) {
+  // N % 4 != 0
+  //Ratio 0.3, 2D
+  RunDropoutGradTest(0.3f, {5, 6}, false);
+
+  // N %4 == 0
  //Ratio 0.2, 1D
  RunDropoutGradTest(0.2f, {16}, false);

--- a/orttraining/orttraining/training_ops/cuda/nn/dropout_grad_impl.cu
+++ b/orttraining/orttraining/training_ops/cuda/nn/dropout_grad_impl.cu
@ -24,6 +24,8 @@
 namespace onnxruntime {
 namespace cuda {

+constexpr int UNROLL = 4;
+
 template <typename T, int NumThreadsPerBlock, int NumElementsPerThread>
 __global__ void DropoutGradientKernel(
    const int64_t N,
@ -31,16 +33,58 @@ __global__ void DropoutGradientKernel(
    const bool* mask_data,
    const float scale,
    T* dX_data) {
-  CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
-#pragma unroll
-  for (int i = 0; i < NumElementsPerThread; i++) {
-    if (id < N) {
-      dX_data[id] = T(float(dY_data[id]) * mask_data[id] * scale);
-      id += NumThreadsPerBlock;
+
+  CUDA_LONG idx = blockDim.x * blockIdx.x + threadIdx.x;
+  CUDA_LONG id = idx * UNROLL;
+
+  #pragma unroll
+  for (int i = 0; i < UNROLL; i++) {
+    CUDA_LONG li = id + i;
+    if (li < N) {
+      dX_data[li] = T(float(dY_data[li]) * mask_data[li] * scale);
    }
  }
 }

+template <typename T, int NumThreadsPerBlock, int NumElementsPerThread>
+__global__ void DropoutGradientVectorizedKernel(
+    const int64_t N,
+    const T* dY_data,
+    const bool* mask_data,
+    const float scale,
+    T* dX_data) {
+
+  // using vectorized data load/store approach when N % 4 == 0 
+  // since this is typical case for input shape size
+  using LoadT = aligned_vector<T, UNROLL>;
+  using MaskLoadT = aligned_vector<bool, UNROLL>;
+
+  CUDA_LONG idx = blockDim.x * blockIdx.x + threadIdx.x;
+  CUDA_LONG id = idx * UNROLL;
+
+  if (id < N) {
+    // vectorized load into storage
+    T src[UNROLL];
+    LoadT *value1 = reinterpret_cast<LoadT*>(&src);
+    *value1 = *reinterpret_cast<const LoadT*>(&dY_data[id]);
+
+    bool mask[UNROLL];
+    MaskLoadT *value2 = reinterpret_cast<MaskLoadT*>(&mask);
+    *value2 = *reinterpret_cast<const MaskLoadT*>(&mask_data[id]);
+
+    T r[UNROLL];
+
+    // actual computation
+    #pragma unroll
+    for (int ii = 0; ii < UNROLL; ii++) {
+      r[ii] = T(float(src[ii]) * mask[ii] * scale);
+    }
+    // Vectorized writes for dX_data
+    *(reinterpret_cast<LoadT*>(&dX_data[id])) = *reinterpret_cast<LoadT*>(&r[0]);
+  }
+
+}
+
 template <typename T>
 void DropoutGradientKernelImpl(
    cudaStream_t stream,
@ -56,8 +100,14 @@ void DropoutGradientKernelImpl(
  } else {
    const float scale = 1.f / (1.f - ratio);
    const int blocksPerGrid = static_cast<int>(CeilDiv(N, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
-    DropoutGradientKernel<T, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
-                         <<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(N, dY_data, mask_data, scale, dX_data);
+    if (N % UNROLL != 0) {
+      DropoutGradientKernel<T, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
+                           <<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(N, dY_data, mask_data, scale, dX_data);
+    } else {
+      DropoutGradientVectorizedKernel<T, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
+                                     <<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(N, dY_data, mask_data, scale, dX_data);
+    }
+
  }
 }

--- a/orttraining/tools/ci_test/results/bert_base.convergence.baseline.mi100.csv
+++ b/orttraining/tools/ci_test/results/bert_base.convergence.baseline.mi100.csv
@ -1,11 +1,11 @@
 step,total_loss,mlm_loss,nsp_loss
-0,11.2171,10.5178,0.699279
-5,9.6935,7.51946,2.17404
-10,8.72874,7.60452,1.12422
-15,8.25456,7.54113,0.713431
-20,8.17125,7.47469,0.696562
-25,8.21603,7.52277,0.693259
-30,8.08864,7.39777,0.69087
-35,7.9672,7.25153,0.715668
-40,7.94141,7.25788,0.683527
-45,7.94186,7.27316,0.668707
+0,11.2032,10.501,0.702181
+5,9.53939,7.52411,2.01528
+10,8.2614,7.564,0.697406
+15,8.28412,7.55601,0.728112
+20,8.17273,7.45947,0.71326
+25,8.228,7.53251,0.695496
+30,8.07991,7.38456,0.695344
+35,7.96173,7.25046,0.711262
+40,7.9463,7.25667,0.689625
+45,7.92987,7.26442,0.665449