mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-07-03 03:58:54 +00:00
Dropout Vectorized Kernel (#9157)
* vectorized kernel * fix build * re-calibrate expected loss * fix build * re-calibrate convergence results * more re-calibrate on loss * divide kernels * adress comments * more calibration * calibration * per comments * enable sync Co-authored-by: Ethan Tao <ettao@OrtTrainingDev4.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
This commit is contained in:
parent
1b0816859f
commit
d3f859fe30
10 changed files with 291 additions and 67 deletions
|
|
@ -44,23 +44,25 @@ __global__ void BiasDropoutKernel(
|
|||
|
||||
CUDA_LONG idx = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
CUDA_LONG step_size = gridDim.x * blockDim.x * UNROLL;
|
||||
CUDA_LONG rounded_size = ((N - 1) / step_size + 1) * step_size;
|
||||
|
||||
curandStatePhilox4_32_10_t state;
|
||||
curand_init(seeds.first, idx, seeds.second, &state);
|
||||
|
||||
float4 rand;
|
||||
|
||||
// We ensure every thread generates the same number of random numbers (by rounding
|
||||
// up the size) and at the same timestep (by syncing threads).
|
||||
// From CUDA curand documentation:
|
||||
// The Philox_4x32_10 algorithm is closely tied to the thread and block count.
|
||||
// Each thread computes 4 random numbers in the same time thus the most efficient
|
||||
// use of Philox_4x32_10 is to generate a multiple of 4 times number of threads.
|
||||
for (CUDA_LONG id = idx; id < rounded_size; id += step_size) {
|
||||
float4 rand = curand_uniform4(&state);
|
||||
|
||||
#pragma unroll
|
||||
for (CUDA_LONG i = 0; i < UNROLL; i++) {
|
||||
CUDA_LONG li = id + gridDim.x * blockDim.x * i;
|
||||
for (CUDA_LONG id = idx * UNROLL; id < N; id += step_size) {
|
||||
rand = curand_uniform4(&state);
|
||||
|
||||
// actual computation
|
||||
#pragma unroll
|
||||
for (int i = 0; i < UNROLL; i++) {
|
||||
CUDA_LONG li = id + i;
|
||||
if (li < N) {
|
||||
int offset = fdm_dim.mod(li);
|
||||
float bias = float(bias_data[offset]);
|
||||
|
|
@ -77,6 +79,75 @@ __global__ void BiasDropoutKernel(
|
|||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
template <typename T, bool has_residual>
|
||||
__global__ void BiasDropoutVectorizedKernel(
|
||||
const int64_t N,
|
||||
const fast_divmod fdm_dim,
|
||||
const float ratio,
|
||||
const std::pair<uint64_t, uint64_t> seeds,
|
||||
const T* X_data,
|
||||
const T* bias_data,
|
||||
const T* residual_data,
|
||||
T* Y_data,
|
||||
bool* mask_data) {
|
||||
const float p = 1.0f - ratio;
|
||||
const float scale = 1.0f / p;
|
||||
|
||||
CUDA_LONG idx = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
CUDA_LONG step_size = gridDim.x * blockDim.x * UNROLL;
|
||||
|
||||
curandStatePhilox4_32_10_t state;
|
||||
curand_init(seeds.first, idx, seeds.second, &state);
|
||||
|
||||
float4 rand;
|
||||
|
||||
// using vectorized data load/store approach when N % 4 == 0
|
||||
// since this is typical case for input shape size
|
||||
using LoadT = aligned_vector<T, UNROLL>;
|
||||
using MaskLoadT = aligned_vector<bool, UNROLL>;
|
||||
using ResidualLoadT = aligned_vector<T, UNROLL>;
|
||||
|
||||
for (CUDA_LONG id = idx * UNROLL; id < N; id += step_size) {
|
||||
rand = curand_uniform4(&state);
|
||||
|
||||
// vectorized load into storage
|
||||
T src[UNROLL];
|
||||
LoadT *value1 = reinterpret_cast<LoadT*>(&src);
|
||||
*value1 = *reinterpret_cast<const LoadT*>(&X_data[id]);
|
||||
|
||||
T residual[UNROLL];
|
||||
if (has_residual) {
|
||||
ResidualLoadT *value2 = reinterpret_cast<ResidualLoadT*>(&residual);
|
||||
*value2 = *reinterpret_cast<const ResidualLoadT*>(&residual_data[id]);
|
||||
}
|
||||
|
||||
T r[UNROLL];
|
||||
bool mask[UNROLL];
|
||||
|
||||
// actual computation
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < UNROLL; ii++) {
|
||||
int offset = fdm_dim.mod(id + ii);
|
||||
float bias = float(bias_data[offset]);
|
||||
|
||||
mask[ii] = (&rand.x)[ii] < p;
|
||||
float output_data = (float(src[ii]) + bias) * mask[ii] * scale;
|
||||
if (has_residual) {
|
||||
output_data += float(residual[ii]);
|
||||
}
|
||||
r[ii] = T(output_data);
|
||||
}
|
||||
// Vectorized writes for mask_data & Y_data
|
||||
*(reinterpret_cast<LoadT*>(&Y_data[id])) = *reinterpret_cast<LoadT*>(&r[0]);
|
||||
*(reinterpret_cast<MaskLoadT*>(&mask_data[id])) = *reinterpret_cast<MaskLoadT*>(&mask[0]);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
|
@ -100,10 +171,18 @@ void BiasDropoutKernelImpl(
|
|||
const uint64_t counter_offset = static_cast<uint64_t>(((N - 1) / (block_size * grid_size * UNROLL) + 1) * UNROLL);
|
||||
auto seeds = generator.NextPhiloxSeeds(counter_offset);
|
||||
|
||||
if (residual_data == nullptr) {
|
||||
BiasDropoutKernel<T, false><<<grid_size, block_size, 0, stream>>>(N, fdm_dim, ratio, seeds, X_data, bias_data, residual_data, Y_data, mask_data);
|
||||
if (N % UNROLL != 0) {
|
||||
if (residual_data == nullptr) {
|
||||
BiasDropoutKernel<T, false><<<grid_size, block_size, 0, stream>>>(N, fdm_dim, ratio, seeds, X_data, bias_data, residual_data, Y_data, mask_data);
|
||||
} else {
|
||||
BiasDropoutKernel<T, true><<<grid_size, block_size, 0, stream>>>(N, fdm_dim, ratio, seeds, X_data, bias_data, residual_data, Y_data, mask_data);
|
||||
}
|
||||
} else {
|
||||
BiasDropoutKernel<T, true><<<grid_size, block_size, 0, stream>>>(N, fdm_dim, ratio, seeds, X_data, bias_data, residual_data, Y_data, mask_data);
|
||||
if (residual_data == nullptr) {
|
||||
BiasDropoutVectorizedKernel<T, false><<<grid_size, block_size, 0, stream>>>(N, fdm_dim, ratio, seeds, X_data, bias_data, residual_data, Y_data, mask_data);
|
||||
} else {
|
||||
BiasDropoutVectorizedKernel<T, true><<<grid_size, block_size, 0, stream>>>(N, fdm_dim, ratio, seeds, X_data, bias_data, residual_data, Y_data, mask_data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -312,6 +312,12 @@ struct GridDim {
|
|||
};
|
||||
};
|
||||
|
||||
// aligned vector generates vectorized load/store on CUDA
|
||||
template<typename T, int vec_size>
|
||||
struct alignas(sizeof(T) * vec_size) aligned_vector {
|
||||
T val[vec_size];
|
||||
};
|
||||
|
||||
#define CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N) \
|
||||
CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x; \
|
||||
if (id >= N) \
|
||||
|
|
|
|||
|
|
@ -39,23 +39,25 @@ __global__ void DropoutKernel(
|
|||
|
||||
CUDA_LONG idx = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
CUDA_LONG step_size = gridDim.x * blockDim.x * UNROLL;
|
||||
CUDA_LONG rounded_size = ((N - 1) / step_size + 1) * step_size;
|
||||
|
||||
curandStatePhilox4_32_10_t state;
|
||||
curand_init(seeds.first, idx, seeds.second, &state);
|
||||
|
||||
float4 rand;
|
||||
|
||||
// We ensure every thread generates the same number of random numbers (by rounding
|
||||
// up the size) and at the same timestep (by syncing threads).
|
||||
// From CUDA curand documentation:
|
||||
// The Philox_4x32_10 algorithm is closely tied to the thread and block count.
|
||||
// Each thread computes 4 random numbers in the same time thus the most efficient
|
||||
// use of Philox_4x32_10 is to generate a multiple of 4 times number of threads.
|
||||
for (CUDA_LONG id = idx; id < rounded_size; id += step_size) {
|
||||
float4 rand = curand_uniform4(&state);
|
||||
|
||||
#pragma unroll
|
||||
for (CUDA_LONG i = 0; i < UNROLL; i++) {
|
||||
CUDA_LONG li = id + gridDim.x * blockDim.x * i;
|
||||
for (CUDA_LONG id = idx * UNROLL; id < N; id += step_size) {
|
||||
rand = curand_uniform4(&state);
|
||||
|
||||
// actual computation
|
||||
#pragma unroll
|
||||
for (int i = 0; i < UNROLL; i++) {
|
||||
CUDA_LONG li = id + i;
|
||||
if (li < N) {
|
||||
mask_data[li] = (&rand.x)[i] < p;
|
||||
Y_data[li] = T(float(X_data[li]) * mask_data[li] * scale);
|
||||
|
|
@ -64,6 +66,57 @@ __global__ void DropoutKernel(
|
|||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void DropoutVectorizedKernel(
|
||||
const int64_t N,
|
||||
const float ratio,
|
||||
const std::pair<uint64_t, uint64_t> seeds,
|
||||
const T* X_data,
|
||||
T* Y_data,
|
||||
bool* mask_data) {
|
||||
const float p = 1.0f - ratio;
|
||||
const float scale = 1.0f / p;
|
||||
|
||||
CUDA_LONG idx = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
CUDA_LONG step_size = gridDim.x * blockDim.x * UNROLL;
|
||||
|
||||
curandStatePhilox4_32_10_t state;
|
||||
curand_init(seeds.first, idx, seeds.second, &state);
|
||||
|
||||
float4 rand;
|
||||
|
||||
// using vectorized data load/store approach when N % 4 == 0 since this is
|
||||
// typical case for input shape size
|
||||
using LoadT = aligned_vector<T, UNROLL>;
|
||||
using MaskLoadT = aligned_vector<bool, UNROLL>;
|
||||
|
||||
for (CUDA_LONG id = idx * UNROLL; id < N; id += step_size) {
|
||||
rand = curand_uniform4(&state);
|
||||
|
||||
// vectorized load into storage
|
||||
T src[UNROLL];
|
||||
LoadT *value = reinterpret_cast<LoadT*>(&src);
|
||||
*value = *reinterpret_cast<const LoadT*>(&X_data[id]);
|
||||
|
||||
T r[UNROLL];
|
||||
bool mask[UNROLL];
|
||||
|
||||
// actual computation
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < UNROLL; ii++) {
|
||||
mask[ii] = (&rand.x)[ii] < p;
|
||||
r[ii] = T(float(src[ii]) * mask[ii] * scale);
|
||||
}
|
||||
// Vectorized writes for mask_data & Y_data
|
||||
*(reinterpret_cast<LoadT*>(&Y_data[id])) = *reinterpret_cast<LoadT*>(&r[0]);
|
||||
*(reinterpret_cast<MaskLoadT*>(&mask_data[id])) = *reinterpret_cast<MaskLoadT*>(&mask[0]);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
|
@ -84,7 +137,11 @@ void DropoutKernelImpl(
|
|||
const uint64_t counter_offset = static_cast<uint64_t>(((N - 1) / (block_size * grid_size * UNROLL) + 1) * UNROLL);
|
||||
auto seeds = generator.NextPhiloxSeeds(counter_offset);
|
||||
|
||||
DropoutKernel<T><<<grid_size, block_size, 0, stream>>>(N, ratio, seeds, X_data, Y_data, mask_data);
|
||||
if ( N % UNROLL != 0) {
|
||||
DropoutKernel<T><<<grid_size, block_size, 0, stream>>>(N, ratio, seeds, X_data, Y_data, mask_data);
|
||||
} else {
|
||||
DropoutVectorizedKernel<T><<<grid_size, block_size, 0, stream>>>(N, ratio, seeds, X_data, Y_data, mask_data);
|
||||
}
|
||||
}
|
||||
|
||||
#define SPECIALIZED_DROPOUT_IMPL(T) \
|
||||
|
|
|
|||
|
|
@ -205,6 +205,11 @@ struct GridDim {
|
|||
};
|
||||
};
|
||||
|
||||
// aligned vector generates vectorized load/store on CUDA
|
||||
template<typename T, int vec_size>
|
||||
struct alignas(sizeof(T) * vec_size) aligned_vector {
|
||||
T val[vec_size];
|
||||
};
|
||||
|
||||
#define CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N) \
|
||||
HIP_LONG id = blockDim.x * blockIdx.x + threadIdx.x; \
|
||||
|
|
|
|||
|
|
@ -134,6 +134,19 @@ void RunBiasDropoutTest(const bool use_mask, const std::vector<int64_t>& input_s
|
|||
}
|
||||
} // namespace
|
||||
|
||||
|
||||
// N % 4 != 0
|
||||
TEST(BiasDropoutTest, BasicAndNotVectorized) {
|
||||
RunBiasDropoutTest(false, {10, 5, 5}, 0.75f);
|
||||
}
|
||||
TEST(BiasDropoutTest, BasicWithoutResidualAndNotVectorized) {
|
||||
RunBiasDropoutTest(false, {10, 5, 5}, 0.75f, TrainingTrue, false, false);
|
||||
}
|
||||
TEST(BiasDropoutTest, MaskAndNotVectorized) {
|
||||
RunBiasDropoutTest(true, {3, 5, 100}, 0.25f);
|
||||
}
|
||||
|
||||
// N % 4 == 0
|
||||
TEST(BiasDropoutTest, Basic) {
|
||||
RunBiasDropoutTest(false, {10, 10, 10}, 0.75f);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -179,8 +179,8 @@ def testToyBERTModelBasicTraining(dynamic_shape):
|
|||
|
||||
|
||||
@pytest.mark.parametrize("expected_losses", [
|
||||
([10.991958, 10.975625, 11.032847, 11.034771, 10.987653,
|
||||
11.039469, 10.971498, 11.101391, 11.047601, 11.077588])
|
||||
([11.041123, 10.986166, 11.101636, 11.013366, 11.03775 ,
|
||||
11.041175, 10.957118, 11.069563, 11.040824, 11.16437])
|
||||
])
|
||||
def testToyBERTDeterministicCheck(expected_losses):
|
||||
# Common setup
|
||||
|
|
@ -294,12 +294,12 @@ def testToyBERTModelLRScheduler(initial_lr, lr_scheduler, expected_learning_rate
|
|||
|
||||
|
||||
@pytest.mark.parametrize("loss_scaler, expected_losses", [
|
||||
(None, [10.992018, 10.975699, 11.032809, 11.034765, 10.987625,
|
||||
11.039452, 10.971539, 11.10148, 11.047551, 11.077468]),
|
||||
(amp.DynamicLossScaler(), [10.992018, 10.975699, 11.032809, 11.034765,
|
||||
10.987625, 11.039452, 10.971539, 11.10148, 11.047551, 11.077468]),
|
||||
(CustomLossScaler(), [10.992018, 10.975699, 11.032791, 11.034729,
|
||||
10.987614, 11.039479, 10.971532, 11.101475, 11.04761, 11.077413])
|
||||
(None, [11.041126, 10.986309, 11.101673, 11.013394, 11.037781,
|
||||
11.041253, 10.957072, 11.069506, 11.040807, 11.164349]),
|
||||
(amp.DynamicLossScaler(), [11.041126, 10.986309, 11.101673, 11.013394,
|
||||
11.037781, 11.041253, 10.957072, 11.069506, 11.040807, 11.164349]),
|
||||
(CustomLossScaler(), [11.041126, 10.986309, 11.101645, 11.013412,
|
||||
11.037757, 11.041273, 10.957077, 11.069525, 11.040765, 11.164298])
|
||||
])
|
||||
def testToyBERTModelMixedPrecisionLossScaler(loss_scaler, expected_losses):
|
||||
# Common setup
|
||||
|
|
@ -339,12 +339,12 @@ def testToyBERTModelMixedPrecisionLossScaler(loss_scaler, expected_losses):
|
|||
|
||||
|
||||
@pytest.mark.parametrize("gradient_accumulation_steps, expected_losses", [
|
||||
(1, [10.991958, 10.975625, 11.032847, 11.034771, 10.987653,
|
||||
11.039469, 10.971498, 11.101391, 11.047601, 11.077588]),
|
||||
(4, [10.991958, 10.97373, 11.033534, 11.028931, 10.988836,
|
||||
11.04126, 10.969865, 11.085526, 11.036701, 11.0628]),
|
||||
(7, [10.991958, 10.97373, 11.033534, 11.028931, 10.994967,
|
||||
11.043544, 10.974638, 11.085087, 11.034944, 11.059022])
|
||||
(1, [11.041123, 10.986166, 11.101636, 11.013366, 11.03775,
|
||||
11.041175, 10.957118, 11.069563, 11.040824, 11.16437]),
|
||||
(4, [11.041123, 10.982856, 11.105512, 11.006721, 11.03358,
|
||||
11.05058, 10.955864, 11.059035, 11.037753, 11.162649]),
|
||||
(7, [11.041123, 10.982856, 11.105512, 11.006721, 11.036314,
|
||||
11.055109, 10.960751, 11.05809 , 11.038856, 11.159635])
|
||||
])
|
||||
def testToyBERTModelGradientAccumulation(gradient_accumulation_steps, expected_losses):
|
||||
# Common setup
|
||||
|
|
|
|||
|
|
@ -741,7 +741,7 @@ def testORTTrainerMixedPrecisionLossScaler(seed, device, expected_loss, fetches)
|
|||
def _recompute_data():
|
||||
device_capability_major = torch.cuda.get_device_capability()[0]
|
||||
if device_capability_major == 7: # V100 for Dev machine
|
||||
expected_loss = [10.5732, 10.4407, 10.3701, 10.2778, 10.1824]
|
||||
expected_loss = [10.5598 , 10.4591, 10.3477, 10.2726, 10.1945]
|
||||
return [
|
||||
(False, False, False, 0, expected_loss), # no recompute
|
||||
(True, False, False, 0, expected_loss), # attn_dropout recompute
|
||||
|
|
@ -1499,14 +1499,14 @@ def _adam_max_norm_clip_data():
|
|||
device_capability_major = torch.cuda.get_device_capability()[0]
|
||||
if device_capability_major == 7: # V100 for Dev machine
|
||||
return [
|
||||
(0, 'cuda', 1.0, 1, 12, [10.596329, 10.087329, 9.625324, 9.254117, 8.914067,\
|
||||
8.557245, 8.296672, 8.040311, 7.780754, 7.499548, 7.229341, 7.036769]),
|
||||
(0, 'cuda', 0.1, 1, 12, [10.596329, 10.088068, 9.626670, 9.256137, 8.916809,\
|
||||
8.560838, 8.301097, 8.045413, 7.786527, 7.505644, 7.236132, 7.043610]),
|
||||
(42, 'cuda', 1.0, 1, 12, [10.659752, 10.149531, 9.646378, 9.273719, 8.938648,\
|
||||
8.595006, 8.344718, 8.100259, 7.828771, 7.541266, 7.269467, 7.083140]),
|
||||
(42, 'cuda', 0.1, 1, 12, [10.659752, 10.150211, 9.647715, 9.275835, 8.941610,\
|
||||
8.598876, 8.349401, 8.105709, 7.834774, 7.547812, 7.276530, 7.090215]),
|
||||
(0, 'cuda', 1.0, 1, 12, [10.592951, 10.067989, 9.619152, 9.245731, 8.881137,\
|
||||
8.578644, 8.280573, 8.063023, 7.797933, 7.486215, 7.233806, 7.011791]),
|
||||
(0, 'cuda', 0.1, 1, 12, [10.592951, 10.068722, 9.620503, 9.247791, 8.883972,\
|
||||
8.582286, 8.285027, 8.068308, 7.803638, 7.492318, 7.240352, 7.018665]),
|
||||
(42, 'cuda', 1.0, 1, 12, [10.647908, 10.144501, 9.672352, 9.306980, 8.956026,\
|
||||
8.602655, 8.351079, 8.088144, 7.867220, 7.564082, 7.289846, 7.073726]),
|
||||
(42, 'cuda', 0.1, 1, 12, [10.647908, 10.145191, 9.673690, 9.309031, 8.959020,\
|
||||
8.606632, 8.355836, 8.093478, 7.873327, 7.570731, 7.296772, 7.0809422]),
|
||||
]
|
||||
elif device_capability_major == 5: # M60 for CI machines (Python Packaging Pipeline)
|
||||
return [
|
||||
|
|
@ -1548,14 +1548,14 @@ def _lamb_max_norm_clip_data():
|
|||
device_capability_major = torch.cuda.get_device_capability()[0]
|
||||
if device_capability_major == 7: # V100 for Dev machine
|
||||
return [
|
||||
(0, 'cuda', 1.0, 1, 12, [10.596329, 10.509530, 10.422451, 10.359101, 10.285673, 10.200603,\
|
||||
10.152860, 10.106999, 10.033828, 9.965749, 9.895924, 9.854723]),
|
||||
(0, 'cuda', 0.1, 1, 12, [10.596329, 10.474221, 10.350412, 10.253196, 10.148172, 10.032470,\
|
||||
9.958271, 9.885362, 9.788476, 9.696474, 9.601951, 9.542482]),
|
||||
(42, 'cuda', 1.0, 1, 12, [10.659752, 10.565927, 10.437677, 10.387601, 10.302234, 10.217105,\
|
||||
10.170007, 10.143104, 10.093051, 10.002419, 9.960327, 9.895797]),
|
||||
(42, 'cuda', 0.1, 1, 12, [10.659752, 10.531717, 10.367162, 10.284177, 10.168813, 10.053536,\
|
||||
9.980052, 9.926860, 9.852230, 9.738342, 9.673130, 9.590945]),
|
||||
(0, 'cuda', 1.0, 1, 12, [10.592951, 10.487728, 10.422251, 10.350913, 10.244248, 10.213003,\
|
||||
10.129222, 10.095112, 10.035983, 9.974586, 9.909771, 9.874278]),
|
||||
(0, 'cuda', 0.1, 1, 12, [10.592951, 10.452503, 10.349832, 10.245314, 10.106587, 10.046009,\
|
||||
9.934781, 9.875164, 9.792067, 9.704592, 9.617104, 9.563070]),
|
||||
(42, 'cuda', 1.0, 1, 12, [10.647908, 10.566276, 10.476154, 10.406275, 10.311079, 10.240053,\
|
||||
10.196469, 10.113955, 10.117376, 10.013077, 9.930301, 9.893368]),
|
||||
(42, 'cuda', 0.1, 1, 12, [10.647908, 10.531957, 10.405246, 10.302971, 10.176583, 10.075583,\
|
||||
10.005772, 9.897825, 9.875748, 9.748932, 9.642885, 9.586762]),
|
||||
]
|
||||
elif device_capability_major == 5: # M60 for CI machines (Python Packaging Pipeline)
|
||||
return [
|
||||
|
|
|
|||
|
|
@ -125,6 +125,15 @@ void RunDropoutTest(const bool use_mask, const std::vector<int64_t>& input_shape
|
|||
|
||||
// Dropout
|
||||
|
||||
// N % 4 != 0
|
||||
TEST(DropoutTest, BasicAndNotVectorized) {
|
||||
RunDropoutTest(false, {10, 5, 5}, 0.75f);
|
||||
}
|
||||
TEST(DropoutTest, MaskAndNotVectorized) {
|
||||
RunDropoutTest(true, {250}, 0.25f);
|
||||
}
|
||||
|
||||
// N % 4 == 0
|
||||
TEST(DropoutTest, Basic) {
|
||||
RunDropoutTest(false, {10, 10, 10}, 0.75f);
|
||||
}
|
||||
|
|
@ -197,6 +206,11 @@ void RunDropoutGradTest(float ratio, const std::vector<int64_t>& input_dims, boo
|
|||
// DropoutGrad
|
||||
|
||||
TEST(DropoutGradTest, Basic) {
|
||||
// N % 4 != 0
|
||||
//Ratio 0.3, 2D
|
||||
RunDropoutGradTest(0.3f, {5, 6}, false);
|
||||
|
||||
// N %4 == 0
|
||||
//Ratio 0.2, 1D
|
||||
RunDropoutGradTest(0.2f, {16}, false);
|
||||
|
||||
|
|
|
|||
|
|
@ -24,6 +24,8 @@
|
|||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
||||
constexpr int UNROLL = 4;
|
||||
|
||||
template <typename T, int NumThreadsPerBlock, int NumElementsPerThread>
|
||||
__global__ void DropoutGradientKernel(
|
||||
const int64_t N,
|
||||
|
|
@ -31,16 +33,58 @@ __global__ void DropoutGradientKernel(
|
|||
const bool* mask_data,
|
||||
const float scale,
|
||||
T* dX_data) {
|
||||
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < NumElementsPerThread; i++) {
|
||||
if (id < N) {
|
||||
dX_data[id] = T(float(dY_data[id]) * mask_data[id] * scale);
|
||||
id += NumThreadsPerBlock;
|
||||
|
||||
CUDA_LONG idx = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
CUDA_LONG id = idx * UNROLL;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < UNROLL; i++) {
|
||||
CUDA_LONG li = id + i;
|
||||
if (li < N) {
|
||||
dX_data[li] = T(float(dY_data[li]) * mask_data[li] * scale);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, int NumThreadsPerBlock, int NumElementsPerThread>
|
||||
__global__ void DropoutGradientVectorizedKernel(
|
||||
const int64_t N,
|
||||
const T* dY_data,
|
||||
const bool* mask_data,
|
||||
const float scale,
|
||||
T* dX_data) {
|
||||
|
||||
// using vectorized data load/store approach when N % 4 == 0
|
||||
// since this is typical case for input shape size
|
||||
using LoadT = aligned_vector<T, UNROLL>;
|
||||
using MaskLoadT = aligned_vector<bool, UNROLL>;
|
||||
|
||||
CUDA_LONG idx = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
CUDA_LONG id = idx * UNROLL;
|
||||
|
||||
if (id < N) {
|
||||
// vectorized load into storage
|
||||
T src[UNROLL];
|
||||
LoadT *value1 = reinterpret_cast<LoadT*>(&src);
|
||||
*value1 = *reinterpret_cast<const LoadT*>(&dY_data[id]);
|
||||
|
||||
bool mask[UNROLL];
|
||||
MaskLoadT *value2 = reinterpret_cast<MaskLoadT*>(&mask);
|
||||
*value2 = *reinterpret_cast<const MaskLoadT*>(&mask_data[id]);
|
||||
|
||||
T r[UNROLL];
|
||||
|
||||
// actual computation
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < UNROLL; ii++) {
|
||||
r[ii] = T(float(src[ii]) * mask[ii] * scale);
|
||||
}
|
||||
// Vectorized writes for dX_data
|
||||
*(reinterpret_cast<LoadT*>(&dX_data[id])) = *reinterpret_cast<LoadT*>(&r[0]);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void DropoutGradientKernelImpl(
|
||||
cudaStream_t stream,
|
||||
|
|
@ -56,8 +100,14 @@ void DropoutGradientKernelImpl(
|
|||
} else {
|
||||
const float scale = 1.f / (1.f - ratio);
|
||||
const int blocksPerGrid = static_cast<int>(CeilDiv(N, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
|
||||
DropoutGradientKernel<T, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
|
||||
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(N, dY_data, mask_data, scale, dX_data);
|
||||
if (N % UNROLL != 0) {
|
||||
DropoutGradientKernel<T, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
|
||||
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(N, dY_data, mask_data, scale, dX_data);
|
||||
} else {
|
||||
DropoutGradientVectorizedKernel<T, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
|
||||
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(N, dY_data, mask_data, scale, dX_data);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,11 +1,11 @@
|
|||
step,total_loss,mlm_loss,nsp_loss
|
||||
0,11.2171,10.5178,0.699279
|
||||
5,9.6935,7.51946,2.17404
|
||||
10,8.72874,7.60452,1.12422
|
||||
15,8.25456,7.54113,0.713431
|
||||
20,8.17125,7.47469,0.696562
|
||||
25,8.21603,7.52277,0.693259
|
||||
30,8.08864,7.39777,0.69087
|
||||
35,7.9672,7.25153,0.715668
|
||||
40,7.94141,7.25788,0.683527
|
||||
45,7.94186,7.27316,0.668707
|
||||
0,11.2032,10.501,0.702181
|
||||
5,9.53939,7.52411,2.01528
|
||||
10,8.2614,7.564,0.697406
|
||||
15,8.28412,7.55601,0.728112
|
||||
20,8.17273,7.45947,0.71326
|
||||
25,8.228,7.53251,0.695496
|
||||
30,8.07991,7.38456,0.695344
|
||||
35,7.96173,7.25046,0.711262
|
||||
40,7.9463,7.25667,0.689625
|
||||
45,7.92987,7.26442,0.665449
|
||||
|
|
|
|||
|
Loading…
Reference in a new issue