Dropout Vectorized Kernel (#9157)

* vectorized kernel

* fix build

* re-calibrate expected loss

* fix build

* re-calibrate convergence results

* more re-calibrate on loss

* divide kernels

* adress comments

* more calibration

* calibration

* per comments

* enable sync

Co-authored-by: Ethan Tao <ettao@OrtTrainingDev4.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
This commit is contained in:
ytaous 2021-09-27 17:19:12 -07:00 committed by GitHub
parent 1b0816859f
commit d3f859fe30
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 291 additions and 67 deletions

View file

@ -44,23 +44,25 @@ __global__ void BiasDropoutKernel(
CUDA_LONG idx = blockDim.x * blockIdx.x + threadIdx.x;
CUDA_LONG step_size = gridDim.x * blockDim.x * UNROLL;
CUDA_LONG rounded_size = ((N - 1) / step_size + 1) * step_size;
curandStatePhilox4_32_10_t state;
curand_init(seeds.first, idx, seeds.second, &state);
float4 rand;
// We ensure every thread generates the same number of random numbers (by rounding
// up the size) and at the same timestep (by syncing threads).
// From CUDA curand documentation:
// The Philox_4x32_10 algorithm is closely tied to the thread and block count.
// Each thread computes 4 random numbers in the same time thus the most efficient
// use of Philox_4x32_10 is to generate a multiple of 4 times number of threads.
for (CUDA_LONG id = idx; id < rounded_size; id += step_size) {
float4 rand = curand_uniform4(&state);
#pragma unroll
for (CUDA_LONG i = 0; i < UNROLL; i++) {
CUDA_LONG li = id + gridDim.x * blockDim.x * i;
for (CUDA_LONG id = idx * UNROLL; id < N; id += step_size) {
rand = curand_uniform4(&state);
// actual computation
#pragma unroll
for (int i = 0; i < UNROLL; i++) {
CUDA_LONG li = id + i;
if (li < N) {
int offset = fdm_dim.mod(li);
float bias = float(bias_data[offset]);
@ -77,6 +79,75 @@ __global__ void BiasDropoutKernel(
__syncthreads();
}
}
template <typename T, bool has_residual>
__global__ void BiasDropoutVectorizedKernel(
const int64_t N,
const fast_divmod fdm_dim,
const float ratio,
const std::pair<uint64_t, uint64_t> seeds,
const T* X_data,
const T* bias_data,
const T* residual_data,
T* Y_data,
bool* mask_data) {
const float p = 1.0f - ratio;
const float scale = 1.0f / p;
CUDA_LONG idx = blockDim.x * blockIdx.x + threadIdx.x;
CUDA_LONG step_size = gridDim.x * blockDim.x * UNROLL;
curandStatePhilox4_32_10_t state;
curand_init(seeds.first, idx, seeds.second, &state);
float4 rand;
// using vectorized data load/store approach when N % 4 == 0
// since this is typical case for input shape size
using LoadT = aligned_vector<T, UNROLL>;
using MaskLoadT = aligned_vector<bool, UNROLL>;
using ResidualLoadT = aligned_vector<T, UNROLL>;
for (CUDA_LONG id = idx * UNROLL; id < N; id += step_size) {
rand = curand_uniform4(&state);
// vectorized load into storage
T src[UNROLL];
LoadT *value1 = reinterpret_cast<LoadT*>(&src);
*value1 = *reinterpret_cast<const LoadT*>(&X_data[id]);
T residual[UNROLL];
if (has_residual) {
ResidualLoadT *value2 = reinterpret_cast<ResidualLoadT*>(&residual);
*value2 = *reinterpret_cast<const ResidualLoadT*>(&residual_data[id]);
}
T r[UNROLL];
bool mask[UNROLL];
// actual computation
#pragma unroll
for (int ii = 0; ii < UNROLL; ii++) {
int offset = fdm_dim.mod(id + ii);
float bias = float(bias_data[offset]);
mask[ii] = (&rand.x)[ii] < p;
float output_data = (float(src[ii]) + bias) * mask[ii] * scale;
if (has_residual) {
output_data += float(residual[ii]);
}
r[ii] = T(output_data);
}
// Vectorized writes for mask_data & Y_data
*(reinterpret_cast<LoadT*>(&Y_data[id])) = *reinterpret_cast<LoadT*>(&r[0]);
*(reinterpret_cast<MaskLoadT*>(&mask_data[id])) = *reinterpret_cast<MaskLoadT*>(&mask[0]);
__syncthreads();
}
}
template <typename T>
@ -100,10 +171,18 @@ void BiasDropoutKernelImpl(
const uint64_t counter_offset = static_cast<uint64_t>(((N - 1) / (block_size * grid_size * UNROLL) + 1) * UNROLL);
auto seeds = generator.NextPhiloxSeeds(counter_offset);
if (residual_data == nullptr) {
BiasDropoutKernel<T, false><<<grid_size, block_size, 0, stream>>>(N, fdm_dim, ratio, seeds, X_data, bias_data, residual_data, Y_data, mask_data);
if (N % UNROLL != 0) {
if (residual_data == nullptr) {
BiasDropoutKernel<T, false><<<grid_size, block_size, 0, stream>>>(N, fdm_dim, ratio, seeds, X_data, bias_data, residual_data, Y_data, mask_data);
} else {
BiasDropoutKernel<T, true><<<grid_size, block_size, 0, stream>>>(N, fdm_dim, ratio, seeds, X_data, bias_data, residual_data, Y_data, mask_data);
}
} else {
BiasDropoutKernel<T, true><<<grid_size, block_size, 0, stream>>>(N, fdm_dim, ratio, seeds, X_data, bias_data, residual_data, Y_data, mask_data);
if (residual_data == nullptr) {
BiasDropoutVectorizedKernel<T, false><<<grid_size, block_size, 0, stream>>>(N, fdm_dim, ratio, seeds, X_data, bias_data, residual_data, Y_data, mask_data);
} else {
BiasDropoutVectorizedKernel<T, true><<<grid_size, block_size, 0, stream>>>(N, fdm_dim, ratio, seeds, X_data, bias_data, residual_data, Y_data, mask_data);
}
}
}

View file

@ -312,6 +312,12 @@ struct GridDim {
};
};
// aligned vector generates vectorized load/store on CUDA
template<typename T, int vec_size>
struct alignas(sizeof(T) * vec_size) aligned_vector {
T val[vec_size];
};
#define CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N) \
CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x; \
if (id >= N) \

View file

@ -39,23 +39,25 @@ __global__ void DropoutKernel(
CUDA_LONG idx = blockDim.x * blockIdx.x + threadIdx.x;
CUDA_LONG step_size = gridDim.x * blockDim.x * UNROLL;
CUDA_LONG rounded_size = ((N - 1) / step_size + 1) * step_size;
curandStatePhilox4_32_10_t state;
curand_init(seeds.first, idx, seeds.second, &state);
float4 rand;
// We ensure every thread generates the same number of random numbers (by rounding
// up the size) and at the same timestep (by syncing threads).
// From CUDA curand documentation:
// The Philox_4x32_10 algorithm is closely tied to the thread and block count.
// Each thread computes 4 random numbers in the same time thus the most efficient
// use of Philox_4x32_10 is to generate a multiple of 4 times number of threads.
for (CUDA_LONG id = idx; id < rounded_size; id += step_size) {
float4 rand = curand_uniform4(&state);
#pragma unroll
for (CUDA_LONG i = 0; i < UNROLL; i++) {
CUDA_LONG li = id + gridDim.x * blockDim.x * i;
for (CUDA_LONG id = idx * UNROLL; id < N; id += step_size) {
rand = curand_uniform4(&state);
// actual computation
#pragma unroll
for (int i = 0; i < UNROLL; i++) {
CUDA_LONG li = id + i;
if (li < N) {
mask_data[li] = (&rand.x)[i] < p;
Y_data[li] = T(float(X_data[li]) * mask_data[li] * scale);
@ -64,6 +66,57 @@ __global__ void DropoutKernel(
__syncthreads();
}
}
template <typename T>
__global__ void DropoutVectorizedKernel(
const int64_t N,
const float ratio,
const std::pair<uint64_t, uint64_t> seeds,
const T* X_data,
T* Y_data,
bool* mask_data) {
const float p = 1.0f - ratio;
const float scale = 1.0f / p;
CUDA_LONG idx = blockDim.x * blockIdx.x + threadIdx.x;
CUDA_LONG step_size = gridDim.x * blockDim.x * UNROLL;
curandStatePhilox4_32_10_t state;
curand_init(seeds.first, idx, seeds.second, &state);
float4 rand;
// using vectorized data load/store approach when N % 4 == 0 since this is
// typical case for input shape size
using LoadT = aligned_vector<T, UNROLL>;
using MaskLoadT = aligned_vector<bool, UNROLL>;
for (CUDA_LONG id = idx * UNROLL; id < N; id += step_size) {
rand = curand_uniform4(&state);
// vectorized load into storage
T src[UNROLL];
LoadT *value = reinterpret_cast<LoadT*>(&src);
*value = *reinterpret_cast<const LoadT*>(&X_data[id]);
T r[UNROLL];
bool mask[UNROLL];
// actual computation
#pragma unroll
for (int ii = 0; ii < UNROLL; ii++) {
mask[ii] = (&rand.x)[ii] < p;
r[ii] = T(float(src[ii]) * mask[ii] * scale);
}
// Vectorized writes for mask_data & Y_data
*(reinterpret_cast<LoadT*>(&Y_data[id])) = *reinterpret_cast<LoadT*>(&r[0]);
*(reinterpret_cast<MaskLoadT*>(&mask_data[id])) = *reinterpret_cast<MaskLoadT*>(&mask[0]);
__syncthreads();
}
}
template <typename T>
@ -84,7 +137,11 @@ void DropoutKernelImpl(
const uint64_t counter_offset = static_cast<uint64_t>(((N - 1) / (block_size * grid_size * UNROLL) + 1) * UNROLL);
auto seeds = generator.NextPhiloxSeeds(counter_offset);
DropoutKernel<T><<<grid_size, block_size, 0, stream>>>(N, ratio, seeds, X_data, Y_data, mask_data);
if ( N % UNROLL != 0) {
DropoutKernel<T><<<grid_size, block_size, 0, stream>>>(N, ratio, seeds, X_data, Y_data, mask_data);
} else {
DropoutVectorizedKernel<T><<<grid_size, block_size, 0, stream>>>(N, ratio, seeds, X_data, Y_data, mask_data);
}
}
#define SPECIALIZED_DROPOUT_IMPL(T) \

View file

@ -205,6 +205,11 @@ struct GridDim {
};
};
// aligned vector generates vectorized load/store on CUDA
template<typename T, int vec_size>
struct alignas(sizeof(T) * vec_size) aligned_vector {
T val[vec_size];
};
#define CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N) \
HIP_LONG id = blockDim.x * blockIdx.x + threadIdx.x; \

View file

@ -134,6 +134,19 @@ void RunBiasDropoutTest(const bool use_mask, const std::vector<int64_t>& input_s
}
} // namespace
// N % 4 != 0
TEST(BiasDropoutTest, BasicAndNotVectorized) {
RunBiasDropoutTest(false, {10, 5, 5}, 0.75f);
}
TEST(BiasDropoutTest, BasicWithoutResidualAndNotVectorized) {
RunBiasDropoutTest(false, {10, 5, 5}, 0.75f, TrainingTrue, false, false);
}
TEST(BiasDropoutTest, MaskAndNotVectorized) {
RunBiasDropoutTest(true, {3, 5, 100}, 0.25f);
}
// N % 4 == 0
TEST(BiasDropoutTest, Basic) {
RunBiasDropoutTest(false, {10, 10, 10}, 0.75f);
}

View file

@ -179,8 +179,8 @@ def testToyBERTModelBasicTraining(dynamic_shape):
@pytest.mark.parametrize("expected_losses", [
([10.991958, 10.975625, 11.032847, 11.034771, 10.987653,
11.039469, 10.971498, 11.101391, 11.047601, 11.077588])
([11.041123, 10.986166, 11.101636, 11.013366, 11.03775 ,
11.041175, 10.957118, 11.069563, 11.040824, 11.16437])
])
def testToyBERTDeterministicCheck(expected_losses):
# Common setup
@ -294,12 +294,12 @@ def testToyBERTModelLRScheduler(initial_lr, lr_scheduler, expected_learning_rate
@pytest.mark.parametrize("loss_scaler, expected_losses", [
(None, [10.992018, 10.975699, 11.032809, 11.034765, 10.987625,
11.039452, 10.971539, 11.10148, 11.047551, 11.077468]),
(amp.DynamicLossScaler(), [10.992018, 10.975699, 11.032809, 11.034765,
10.987625, 11.039452, 10.971539, 11.10148, 11.047551, 11.077468]),
(CustomLossScaler(), [10.992018, 10.975699, 11.032791, 11.034729,
10.987614, 11.039479, 10.971532, 11.101475, 11.04761, 11.077413])
(None, [11.041126, 10.986309, 11.101673, 11.013394, 11.037781,
11.041253, 10.957072, 11.069506, 11.040807, 11.164349]),
(amp.DynamicLossScaler(), [11.041126, 10.986309, 11.101673, 11.013394,
11.037781, 11.041253, 10.957072, 11.069506, 11.040807, 11.164349]),
(CustomLossScaler(), [11.041126, 10.986309, 11.101645, 11.013412,
11.037757, 11.041273, 10.957077, 11.069525, 11.040765, 11.164298])
])
def testToyBERTModelMixedPrecisionLossScaler(loss_scaler, expected_losses):
# Common setup
@ -339,12 +339,12 @@ def testToyBERTModelMixedPrecisionLossScaler(loss_scaler, expected_losses):
@pytest.mark.parametrize("gradient_accumulation_steps, expected_losses", [
(1, [10.991958, 10.975625, 11.032847, 11.034771, 10.987653,
11.039469, 10.971498, 11.101391, 11.047601, 11.077588]),
(4, [10.991958, 10.97373, 11.033534, 11.028931, 10.988836,
11.04126, 10.969865, 11.085526, 11.036701, 11.0628]),
(7, [10.991958, 10.97373, 11.033534, 11.028931, 10.994967,
11.043544, 10.974638, 11.085087, 11.034944, 11.059022])
(1, [11.041123, 10.986166, 11.101636, 11.013366, 11.03775,
11.041175, 10.957118, 11.069563, 11.040824, 11.16437]),
(4, [11.041123, 10.982856, 11.105512, 11.006721, 11.03358,
11.05058, 10.955864, 11.059035, 11.037753, 11.162649]),
(7, [11.041123, 10.982856, 11.105512, 11.006721, 11.036314,
11.055109, 10.960751, 11.05809 , 11.038856, 11.159635])
])
def testToyBERTModelGradientAccumulation(gradient_accumulation_steps, expected_losses):
# Common setup

View file

@ -741,7 +741,7 @@ def testORTTrainerMixedPrecisionLossScaler(seed, device, expected_loss, fetches)
def _recompute_data():
device_capability_major = torch.cuda.get_device_capability()[0]
if device_capability_major == 7: # V100 for Dev machine
expected_loss = [10.5732, 10.4407, 10.3701, 10.2778, 10.1824]
expected_loss = [10.5598 , 10.4591, 10.3477, 10.2726, 10.1945]
return [
(False, False, False, 0, expected_loss), # no recompute
(True, False, False, 0, expected_loss), # attn_dropout recompute
@ -1499,14 +1499,14 @@ def _adam_max_norm_clip_data():
device_capability_major = torch.cuda.get_device_capability()[0]
if device_capability_major == 7: # V100 for Dev machine
return [
(0, 'cuda', 1.0, 1, 12, [10.596329, 10.087329, 9.625324, 9.254117, 8.914067,\
8.557245, 8.296672, 8.040311, 7.780754, 7.499548, 7.229341, 7.036769]),
(0, 'cuda', 0.1, 1, 12, [10.596329, 10.088068, 9.626670, 9.256137, 8.916809,\
8.560838, 8.301097, 8.045413, 7.786527, 7.505644, 7.236132, 7.043610]),
(42, 'cuda', 1.0, 1, 12, [10.659752, 10.149531, 9.646378, 9.273719, 8.938648,\
8.595006, 8.344718, 8.100259, 7.828771, 7.541266, 7.269467, 7.083140]),
(42, 'cuda', 0.1, 1, 12, [10.659752, 10.150211, 9.647715, 9.275835, 8.941610,\
8.598876, 8.349401, 8.105709, 7.834774, 7.547812, 7.276530, 7.090215]),
(0, 'cuda', 1.0, 1, 12, [10.592951, 10.067989, 9.619152, 9.245731, 8.881137,\
8.578644, 8.280573, 8.063023, 7.797933, 7.486215, 7.233806, 7.011791]),
(0, 'cuda', 0.1, 1, 12, [10.592951, 10.068722, 9.620503, 9.247791, 8.883972,\
8.582286, 8.285027, 8.068308, 7.803638, 7.492318, 7.240352, 7.018665]),
(42, 'cuda', 1.0, 1, 12, [10.647908, 10.144501, 9.672352, 9.306980, 8.956026,\
8.602655, 8.351079, 8.088144, 7.867220, 7.564082, 7.289846, 7.073726]),
(42, 'cuda', 0.1, 1, 12, [10.647908, 10.145191, 9.673690, 9.309031, 8.959020,\
8.606632, 8.355836, 8.093478, 7.873327, 7.570731, 7.296772, 7.0809422]),
]
elif device_capability_major == 5: # M60 for CI machines (Python Packaging Pipeline)
return [
@ -1548,14 +1548,14 @@ def _lamb_max_norm_clip_data():
device_capability_major = torch.cuda.get_device_capability()[0]
if device_capability_major == 7: # V100 for Dev machine
return [
(0, 'cuda', 1.0, 1, 12, [10.596329, 10.509530, 10.422451, 10.359101, 10.285673, 10.200603,\
10.152860, 10.106999, 10.033828, 9.965749, 9.895924, 9.854723]),
(0, 'cuda', 0.1, 1, 12, [10.596329, 10.474221, 10.350412, 10.253196, 10.148172, 10.032470,\
9.958271, 9.885362, 9.788476, 9.696474, 9.601951, 9.542482]),
(42, 'cuda', 1.0, 1, 12, [10.659752, 10.565927, 10.437677, 10.387601, 10.302234, 10.217105,\
10.170007, 10.143104, 10.093051, 10.002419, 9.960327, 9.895797]),
(42, 'cuda', 0.1, 1, 12, [10.659752, 10.531717, 10.367162, 10.284177, 10.168813, 10.053536,\
9.980052, 9.926860, 9.852230, 9.738342, 9.673130, 9.590945]),
(0, 'cuda', 1.0, 1, 12, [10.592951, 10.487728, 10.422251, 10.350913, 10.244248, 10.213003,\
10.129222, 10.095112, 10.035983, 9.974586, 9.909771, 9.874278]),
(0, 'cuda', 0.1, 1, 12, [10.592951, 10.452503, 10.349832, 10.245314, 10.106587, 10.046009,\
9.934781, 9.875164, 9.792067, 9.704592, 9.617104, 9.563070]),
(42, 'cuda', 1.0, 1, 12, [10.647908, 10.566276, 10.476154, 10.406275, 10.311079, 10.240053,\
10.196469, 10.113955, 10.117376, 10.013077, 9.930301, 9.893368]),
(42, 'cuda', 0.1, 1, 12, [10.647908, 10.531957, 10.405246, 10.302971, 10.176583, 10.075583,\
10.005772, 9.897825, 9.875748, 9.748932, 9.642885, 9.586762]),
]
elif device_capability_major == 5: # M60 for CI machines (Python Packaging Pipeline)
return [

View file

@ -125,6 +125,15 @@ void RunDropoutTest(const bool use_mask, const std::vector<int64_t>& input_shape
// Dropout
// N % 4 != 0
TEST(DropoutTest, BasicAndNotVectorized) {
RunDropoutTest(false, {10, 5, 5}, 0.75f);
}
TEST(DropoutTest, MaskAndNotVectorized) {
RunDropoutTest(true, {250}, 0.25f);
}
// N % 4 == 0
TEST(DropoutTest, Basic) {
RunDropoutTest(false, {10, 10, 10}, 0.75f);
}
@ -197,6 +206,11 @@ void RunDropoutGradTest(float ratio, const std::vector<int64_t>& input_dims, boo
// DropoutGrad
TEST(DropoutGradTest, Basic) {
// N % 4 != 0
//Ratio 0.3, 2D
RunDropoutGradTest(0.3f, {5, 6}, false);
// N %4 == 0
//Ratio 0.2, 1D
RunDropoutGradTest(0.2f, {16}, false);

View file

@ -24,6 +24,8 @@
namespace onnxruntime {
namespace cuda {
constexpr int UNROLL = 4;
template <typename T, int NumThreadsPerBlock, int NumElementsPerThread>
__global__ void DropoutGradientKernel(
const int64_t N,
@ -31,16 +33,58 @@ __global__ void DropoutGradientKernel(
const bool* mask_data,
const float scale,
T* dX_data) {
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
#pragma unroll
for (int i = 0; i < NumElementsPerThread; i++) {
if (id < N) {
dX_data[id] = T(float(dY_data[id]) * mask_data[id] * scale);
id += NumThreadsPerBlock;
CUDA_LONG idx = blockDim.x * blockIdx.x + threadIdx.x;
CUDA_LONG id = idx * UNROLL;
#pragma unroll
for (int i = 0; i < UNROLL; i++) {
CUDA_LONG li = id + i;
if (li < N) {
dX_data[li] = T(float(dY_data[li]) * mask_data[li] * scale);
}
}
}
template <typename T, int NumThreadsPerBlock, int NumElementsPerThread>
__global__ void DropoutGradientVectorizedKernel(
const int64_t N,
const T* dY_data,
const bool* mask_data,
const float scale,
T* dX_data) {
// using vectorized data load/store approach when N % 4 == 0
// since this is typical case for input shape size
using LoadT = aligned_vector<T, UNROLL>;
using MaskLoadT = aligned_vector<bool, UNROLL>;
CUDA_LONG idx = blockDim.x * blockIdx.x + threadIdx.x;
CUDA_LONG id = idx * UNROLL;
if (id < N) {
// vectorized load into storage
T src[UNROLL];
LoadT *value1 = reinterpret_cast<LoadT*>(&src);
*value1 = *reinterpret_cast<const LoadT*>(&dY_data[id]);
bool mask[UNROLL];
MaskLoadT *value2 = reinterpret_cast<MaskLoadT*>(&mask);
*value2 = *reinterpret_cast<const MaskLoadT*>(&mask_data[id]);
T r[UNROLL];
// actual computation
#pragma unroll
for (int ii = 0; ii < UNROLL; ii++) {
r[ii] = T(float(src[ii]) * mask[ii] * scale);
}
// Vectorized writes for dX_data
*(reinterpret_cast<LoadT*>(&dX_data[id])) = *reinterpret_cast<LoadT*>(&r[0]);
}
}
template <typename T>
void DropoutGradientKernelImpl(
cudaStream_t stream,
@ -56,8 +100,14 @@ void DropoutGradientKernelImpl(
} else {
const float scale = 1.f / (1.f - ratio);
const int blocksPerGrid = static_cast<int>(CeilDiv(N, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
DropoutGradientKernel<T, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(N, dY_data, mask_data, scale, dX_data);
if (N % UNROLL != 0) {
DropoutGradientKernel<T, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(N, dY_data, mask_data, scale, dX_data);
} else {
DropoutGradientVectorizedKernel<T, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(N, dY_data, mask_data, scale, dX_data);
}
}
}

View file

@ -1,11 +1,11 @@
step,total_loss,mlm_loss,nsp_loss
0,11.2171,10.5178,0.699279
5,9.6935,7.51946,2.17404
10,8.72874,7.60452,1.12422
15,8.25456,7.54113,0.713431
20,8.17125,7.47469,0.696562
25,8.21603,7.52277,0.693259
30,8.08864,7.39777,0.69087
35,7.9672,7.25153,0.715668
40,7.94141,7.25788,0.683527
45,7.94186,7.27316,0.668707
0,11.2032,10.501,0.702181
5,9.53939,7.52411,2.01528
10,8.2614,7.564,0.697406
15,8.28412,7.55601,0.728112
20,8.17273,7.45947,0.71326
25,8.228,7.53251,0.695496
30,8.07991,7.38456,0.695344
35,7.96173,7.25046,0.711262
40,7.9463,7.25667,0.689625
45,7.92987,7.26442,0.665449

1 step total_loss mlm_loss nsp_loss
2 0 11.2171 11.2032 10.5178 10.501 0.699279 0.702181
3 5 9.6935 9.53939 7.51946 7.52411 2.17404 2.01528
4 10 8.72874 8.2614 7.60452 7.564 1.12422 0.697406
5 15 8.25456 8.28412 7.54113 7.55601 0.713431 0.728112
6 20 8.17125 8.17273 7.47469 7.45947 0.696562 0.71326
7 25 8.21603 8.228 7.52277 7.53251 0.693259 0.695496
8 30 8.08864 8.07991 7.39777 7.38456 0.69087 0.695344
9 35 7.9672 7.96173 7.25153 7.25046 0.715668 0.711262
10 40 7.94141 7.9463 7.25788 7.25667 0.683527 0.689625
11 45 7.94186 7.92987 7.27316 7.26442 0.668707 0.665449